[SPARK-16514][SQL] Fix various regex codegen bugs

## What changes were proposed in this pull request?

RegExpExtract and RegExpReplace currently crash on non-nullable input due to the use of a hard-coded local variable name in the generated code (e.g. compilation fails with `java.lang.Exception: failed to compile: org.codehaus.commons.compiler.CompileException: File 'generated.java', Line 85, Column 26: Redefinition of local variable "m"`).

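This changes those variables to use fresh names obtained from `ctx.freshName`, and applies the same fix in a few other places that used hard-coded names (e.g. `rightStr` in `Like` and `RLike`).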

## How was this patch tested?

Unit tests. rxin

Author: Eric Liang <ekl@databricks.com>

Closes #14168 from ericl/sc-3906.
This commit is contained in:
Eric Liang 2016-07-12 23:09:02 -07:00 committed by Reynold Xin
parent 56bd399a86
commit 1c58fa905b
2 changed files with 39 additions and 15 deletions

View file

@ -108,10 +108,11 @@ case class Like(left: Expression, right: Expression)
""")
}
} else {
val rightStr = ctx.freshName("rightStr")
nullSafeCodeGen(ctx, ev, (eval1, eval2) => {
s"""
String rightStr = ${eval2}.toString();
${patternClass} $pattern = ${patternClass}.compile($escapeFunc(rightStr));
String $rightStr = ${eval2}.toString();
${patternClass} $pattern = ${patternClass}.compile($escapeFunc($rightStr));
${ev.value} = $pattern.matcher(${eval1}.toString()).matches();
"""
})
@ -157,10 +158,11 @@ case class RLike(left: Expression, right: Expression)
""")
}
} else {
val rightStr = ctx.freshName("rightStr")
nullSafeCodeGen(ctx, ev, (eval1, eval2) => {
s"""
String rightStr = ${eval2}.toString();
${patternClass} $pattern = ${patternClass}.compile(rightStr);
String $rightStr = ${eval2}.toString();
${patternClass} $pattern = ${patternClass}.compile($rightStr);
${ev.value} = $pattern.matcher(${eval1}.toString()).find(0);
"""
})
@ -259,6 +261,8 @@ case class RegExpReplace(subject: Expression, regexp: Expression, rep: Expressio
val classNamePattern = classOf[Pattern].getCanonicalName
val classNameStringBuffer = classOf[java.lang.StringBuffer].getCanonicalName
val matcher = ctx.freshName("matcher")
ctx.addMutableState("UTF8String", termLastRegex, s"${termLastRegex} = null;")
ctx.addMutableState(classNamePattern, termPattern, s"${termPattern} = null;")
ctx.addMutableState("String", termLastReplacement, s"${termLastReplacement} = null;")
@ -267,6 +271,12 @@ case class RegExpReplace(subject: Expression, regexp: Expression, rep: Expressio
ctx.addMutableState(classNameStringBuffer,
termResult, s"${termResult} = new $classNameStringBuffer();")
val setEvNotNull = if (nullable) {
s"${ev.isNull} = false;"
} else {
""
}
nullSafeCodeGen(ctx, ev, (subject, regexp, rep) => {
s"""
if (!$regexp.equals(${termLastRegex})) {
@ -280,14 +290,14 @@ case class RegExpReplace(subject: Expression, regexp: Expression, rep: Expressio
${termLastReplacement} = ${termLastReplacementInUTF8}.toString();
}
${termResult}.delete(0, ${termResult}.length());
java.util.regex.Matcher m = ${termPattern}.matcher($subject.toString());
java.util.regex.Matcher ${matcher} = ${termPattern}.matcher($subject.toString());
while (m.find()) {
m.appendReplacement(${termResult}, ${termLastReplacement});
while (${matcher}.find()) {
${matcher}.appendReplacement(${termResult}, ${termLastReplacement});
}
m.appendTail(${termResult});
${matcher}.appendTail(${termResult});
${ev.value} = UTF8String.fromString(${termResult}.toString());
${ev.isNull} = false;
$setEvNotNull
"""
})
}
@ -334,10 +344,18 @@ case class RegExpExtract(subject: Expression, regexp: Expression, idx: Expressio
val termLastRegex = ctx.freshName("lastRegex")
val termPattern = ctx.freshName("pattern")
val classNamePattern = classOf[Pattern].getCanonicalName
val matcher = ctx.freshName("matcher")
val matchResult = ctx.freshName("matchResult")
ctx.addMutableState("UTF8String", termLastRegex, s"${termLastRegex} = null;")
ctx.addMutableState(classNamePattern, termPattern, s"${termPattern} = null;")
val setEvNotNull = if (nullable) {
s"${ev.isNull} = false;"
} else {
""
}
nullSafeCodeGen(ctx, ev, (subject, regexp, idx) => {
s"""
if (!$regexp.equals(${termLastRegex})) {
@ -345,15 +363,15 @@ case class RegExpExtract(subject: Expression, regexp: Expression, idx: Expressio
${termLastRegex} = $regexp.clone();
${termPattern} = ${classNamePattern}.compile(${termLastRegex}.toString());
}
java.util.regex.Matcher m =
java.util.regex.Matcher ${matcher} =
${termPattern}.matcher($subject.toString());
if (m.find()) {
java.util.regex.MatchResult mr = m.toMatchResult();
${ev.value} = UTF8String.fromString(mr.group($idx));
${ev.isNull} = false;
if (${matcher}.find()) {
java.util.regex.MatchResult ${matchResult} = ${matcher}.toMatchResult();
${ev.value} = UTF8String.fromString(${matchResult}.group($idx));
$setEvNotNull
} else {
${ev.value} = UTF8String.EMPTY_UTF8;
${ev.isNull} = false;
$setEvNotNull
}"""
})
}

View file

@ -631,6 +631,9 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
checkEvaluation(expr, null, row4)
checkEvaluation(expr, null, row5)
checkEvaluation(expr, null, row6)
val nonNullExpr = RegExpReplace(Literal("100-200"), Literal("(\\d+)"), Literal("num"))
checkEvaluation(nonNullExpr, "num-num", row1)
}
test("RegexExtract") {
@ -657,6 +660,9 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
val expr1 = new RegExpExtract(s, p)
checkEvaluation(expr1, "100", row1)
val nonNullExpr = RegExpExtract(Literal("100-200"), Literal("(\\d+)-(\\d+)"), Literal(1))
checkEvaluation(nonNullExpr, "100", row1)
}
test("SPLIT") {