Skip to content
Closed
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -723,8 +723,6 @@ case class RegExpReplace(subject: Expression, regexp: Expression, rep: Expressio
// last replacement string, we don't want to convert a UTF8String => java.langString every time.
@transient private var lastReplacement: String = _
@transient private var lastReplacementInUTF8: UTF8String = _
// result buffer write by Matcher
@transient private lazy val result: JStringBuilder = new JStringBuilder
final override val nodePatterns: Seq[TreePattern] = Seq(REGEXP_REPLACE)

override def nullSafeEval(s: Any, p: Any, r: Any, i: Any): Any = {
Expand All @@ -738,26 +736,8 @@ case class RegExpReplace(subject: Expression, regexp: Expression, rep: Expressio
lastReplacementInUTF8 = r.asInstanceOf[UTF8String].clone()
lastReplacement = lastReplacementInUTF8.toString
}
val source = s.toString()
val position = i.asInstanceOf[Int] - 1
if (position == 0 || position < source.length) {
val m = pattern.matcher(source)
m.region(position, source.length)
result.delete(0, result.length())
while (m.find) {
try {
m.appendReplacement(result, lastReplacement)
} catch {
case NonFatal(e) =>
throw QueryExecutionErrors.invalidRegexpReplaceError(s.toString,
p.toString, r.toString, i.asInstanceOf[Int], e)
}
}
m.appendTail(result)
UTF8String.fromString(result.toString)
} else {
s
}
RegExpUtils.replace(pattern, s.asInstanceOf[UTF8String], lastReplacement,
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would prefer using Java String as the method parameter here. So that we can avoid all these .asInstanceOf[UTF8String]

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done in 8c24f32RegExpUtils.replace now takes source and regexp as String, so the nullSafeEval call site no longer needs the .asInstanceOf[UTF8String] casts. I also dropped the now-redundant rep parameter and reuse replacement for the error message, since they're the same value (lastReplacement is built from the rep argument). Thanks for the review!

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Follow-up on this, plus one open question for a joint call:

Pushed edcd5c6: I also dropped the regexp parameter entirely and now use pattern.pattern() for the error message. Since compileRegexPattern compiles the raw regexp without escaping (collation is carried in the int flags, not the pattern text), pattern.pattern() round-trips the original regexp exactly, and it's only evaluated on the rare error path -- this removes the eager per-row regexp.toString() that the String-param version would otherwise have added in the generated code.

Open question on the remaining subject argument: to make the eval site fully cast-free I pass it as String (s.toString). One consequence is that the out-of-range no-op branch now returns UTF8String.fromString(source) instead of the original subject bytes. For valid UTF-8 this is byte-identical, and it's consistent with the match path (which already builds its result via UTF8String.fromString(...)). The only divergence is for malformed UTF-8 subjects (e.g. cast(binary as string) with invalid bytes), where the String round-trip substitutes U+FFFD; passing it as String also drops the checked cast, so a hypothetical type mismatch would no longer fail fast.

I can instead keep subject: UTF8String (returning it unchanged on the out-of-range path -> exact bytes + checked cast) at the cost of one remaining .asInstanceOf[UTF8String] at the call site. Do you prefer (a) the fully cast-free String version as-is, or (b) keeping subject: UTF8String for byte-exact passthrough? Happy to go either way.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Either a or b seems fine. Thanks

p.asInstanceOf[UTF8String], r.asInstanceOf[UTF8String], i.asInstanceOf[Int])
}

override def dataType: DataType = subject.dataType
Expand All @@ -768,14 +748,6 @@ case class RegExpReplace(subject: Expression, regexp: Expression, rep: Expressio
override def prettyName: String = "regexp_replace"

override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
val termResult = ctx.freshName("termResult")

val classNameStringBuilder = classOf[JStringBuilder].getCanonicalName

val matcher = ctx.freshName("matcher")
val source = ctx.freshName("source")
val position = ctx.freshName("position")

val termLastReplacement = ctx.addMutableState("String", "lastReplacement")
val termLastReplacementInUTF8 = ctx.addMutableState("UTF8String", "lastReplacementInUTF8")

Expand All @@ -784,41 +756,22 @@ case class RegExpReplace(subject: Expression, regexp: Expression, rep: Expressio
} else {
""
}
val regExpUtils = RegExpUtils.getClass.getName.stripSuffix("$")

nullSafeCodeGen(ctx, ev, (subject, regexp, rep, pos) => {
s"""
${RegExpUtils.initLastMatcherCode(ctx, subject, regexp, matcher, prettyName, collationId)}
if (!$rep.equals($termLastReplacementInUTF8)) {
// replacement string changed
$termLastReplacementInUTF8 = $rep.clone();
$termLastReplacement = $termLastReplacementInUTF8.toString();
}
String $source = $subject.toString();
int $position = $pos - 1;
if ($position == 0 || $position < $source.length()) {
$classNameStringBuilder $termResult = new $classNameStringBuilder();
$matcher.region($position, $source.length());

while ($matcher.find()) {
try {
$matcher.appendReplacement($termResult, $termLastReplacement);
} catch (Throwable e) {
if (scala.util.control.NonFatal.apply(e)) {
throw QueryExecutionErrors.invalidRegexpReplaceError($source, $regexp.toString(),
$rep.toString(), $pos, e);
} else {
throw e;
}
}
val (patternCode, termPattern) =
RegExpUtils.initLastPatternCode(ctx, regexp, prettyName, collationId)
s"""
$patternCode
if (!$rep.equals($termLastReplacementInUTF8)) {
// replacement string changed
$termLastReplacementInUTF8 = $rep.clone();
$termLastReplacement = $termLastReplacementInUTF8.toString();
}
$matcher.appendTail($termResult);
${ev.value} = UTF8String.fromString($termResult.toString());
$termResult = null;
} else {
${ev.value} = $subject;
}
$setEvNotNull
"""
${ev.value} = $regExpUtils.replace(
$termPattern, $subject, $termLastReplacement, $regexp, $rep, $pos);
$setEvNotNull
"""
})
}

Expand Down Expand Up @@ -1242,27 +1195,43 @@ case class RegExpInStr(subject: Expression, regexp: Expression, idx: Expression)
}

object RegExpUtils {
def initLastMatcherCode(
// Emits the regex-pattern caching block (recompile only when the regexp value changes) and
// returns (code, patternTermName). The caller can build a Matcher from the returned term, or
// pass it to `replace`.
def initLastPatternCode(
ctx: CodegenContext,
subject: String,
regexp: String,
matcher: String,
prettyName: String,
collationId: Int): String = {
collationId: Int): (String, String) = {
val classNamePattern = classOf[Pattern].getCanonicalName
val termLastRegex = ctx.addMutableState("UTF8String", "lastRegex")
val termPattern = ctx.addMutableState(classNamePattern, "pattern")
val collationRegexFlags = CollationSupport.collationAwareRegexFlags(collationId)
val utils = classOf[ExpressionImplUtils].getName

val code =
s"""
|if (!$regexp.equals($termLastRegex)) {
| // regex value changed
| UTF8String r = $regexp.clone();
| $termPattern =
| $utils.compileRegexPattern(r.toString(), $collationRegexFlags, "$prettyName");
| $termLastRegex = r;
|}
|""".stripMargin
(code, termPattern)
}

def initLastMatcherCode(
ctx: CodegenContext,
subject: String,
regexp: String,
matcher: String,
prettyName: String,
collationId: Int): String = {
val (patternCode, termPattern) = initLastPatternCode(ctx, regexp, prettyName, collationId)
s"""
|if (!$regexp.equals($termLastRegex)) {
| // regex value changed
| UTF8String r = $regexp.clone();
| $termPattern =
| $utils.compileRegexPattern(r.toString(), $collationRegexFlags, "$prettyName");
| $termLastRegex = r;
|}
|$patternCode
|java.util.regex.Matcher $matcher = $termPattern.matcher($subject.toString());
|""".stripMargin
}
Expand All @@ -1274,4 +1243,40 @@ object RegExpUtils {
r.toString, CollationSupport.collationAwareRegexFlags(collationId), prettyName)
(pattern, r)
}

/**
* Runs the regexp_replace loop shared by RegExpReplace's eval and codegen, so the generated
* Java is a single call rather than an inline matcher build + match/replace loop + error
* construction. The matcher is built here from `pattern` so `subject.toString` is computed once.
* `subject` is returned unchanged when the start position is out of range; `regexp`/`rep`/`pos`
* are only used to build the error message on a failed replacement.
*/
def replace(
pattern: Pattern,
subject: UTF8String,
replacement: String,
regexp: UTF8String,
rep: UTF8String,
pos: Int): UTF8String = {
val source = subject.toString
val position = pos - 1
if (position == 0 || position < source.length) {
val matcher = pattern.matcher(source)
matcher.region(position, source.length)
val result = new JStringBuilder
while (matcher.find()) {
try {
matcher.appendReplacement(result, replacement)
} catch {
case NonFatal(e) =>
throw QueryExecutionErrors.invalidRegexpReplaceError(
source, regexp.toString, rep.toString, pos, e)
}
}
matcher.appendTail(result)
UTF8String.fromString(result.toString)
} else {
subject
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -361,6 +361,17 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
val nonNullExpr = RegExpReplace(Literal("100-200"), Literal("(\\d+)"), Literal("num"))
checkEvaluation(nonNullExpr, "num-num", row1)

// A replacement that references a non-existent group makes the replace fail.
checkErrorInExpression[SparkRuntimeException](
expr,
create_row("100-200", "(\\d+)", "$2"),
"INVALID_REGEXP_REPLACE",
Map(
"source" -> "100-200",
"pattern" -> "(\\d+)",
"replacement" -> "$2",
"position" -> "1"))

// Test escaping of arguments
GenerateUnsafeProjection.generate(
RegExpReplace(Literal("\"quote"), Literal("\"quote"), Literal("\"quote")) :: Nil)
Expand Down