[SPARK-8443][SQL] Split GenerateMutableProjection Codegen due to JVM Code Size Limits

By grouping projection calls into multiple apply function, we are able to push the number of projections codegen can handle from ~1k to ~60k. I have set the unit test to test against 5k as 60k took 15s for the unit test to complete.

Author: Forest Fang <forest.fang@outlook.com>

Closes #7076 from saurfang/codegen_size_limit and squashes the following commits:

b7a7635 [Forest Fang] [SPARK-8443][SQL] Execute and verify split projections in test
adef95a [Forest Fang] [SPARK-8443][SQL] Use safer factor and rewrite splitting code
1b5aa7e [Forest Fang] [SPARK-8443][SQL] inline execution if one block only
9405680 [Forest Fang] [SPARK-8443][SQL] split projection code by size limit
This commit is contained in:
Forest Fang 2015-07-18 21:05:44 -07:00 committed by Reynold Xin
parent 45d798c323
commit 6cb6096c01
2 changed files with 50 additions and 3 deletions

View file

@ -19,6 +19,8 @@ package org.apache.spark.sql.catalyst.expressions.codegen
import org.apache.spark.sql.catalyst.expressions._
import scala.collection.mutable.ArrayBuffer
// MutableProjection is not accessible in Java
abstract class BaseMutableProjection extends MutableProjection
@ -45,10 +47,41 @@ object GenerateMutableProjection extends CodeGenerator[Seq[Expression], () => Mu
else
${ctx.setColumn("mutableRow", e.dataType, i, evaluationCode.primitive)};
"""
}.mkString("\n")
}
// collect projections into blocks as function has 64kb codesize limit in JVM
val projectionBlocks = new ArrayBuffer[String]()
val blockBuilder = new StringBuilder()
for (projection <- projectionCode) {
if (blockBuilder.length > 16 * 1000) {
projectionBlocks.append(blockBuilder.toString())
blockBuilder.clear()
}
blockBuilder.append(projection)
}
projectionBlocks.append(blockBuilder.toString())
val (projectionFuns, projectionCalls) = {
// inline execution if codesize limit was not broken
if (projectionBlocks.length == 1) {
("", projectionBlocks.head)
} else {
(
projectionBlocks.zipWithIndex.map { case (body, i) =>
s"""
|private void apply$i(InternalRow i) {
| $body
|}
""".stripMargin
}.mkString,
projectionBlocks.indices.map(i => s"apply$i(i);").mkString("\n")
)
}
}
val mutableStates = ctx.mutableStates.map { case (javaType, variableName, initialValue) =>
s"private $javaType $variableName = $initialValue;"
}.mkString("\n ")
val code = s"""
public Object generate($exprType[] expr) {
return new SpecificProjection(expr);
@ -75,9 +108,11 @@ object GenerateMutableProjection extends CodeGenerator[Seq[Expression], () => Mu
return (InternalRow) mutableRow;
}
$projectionFuns
public Object apply(Object _i) {
InternalRow i = (InternalRow) _i;
$projectionCode
$projectionCalls
return mutableRow;
}

View file

@ -24,7 +24,7 @@ import org.apache.spark.sql.catalyst.expressions.codegen._
/**
* Additional tests for code generation.
*/
class CodeGenerationSuite extends SparkFunSuite {
class CodeGenerationSuite extends SparkFunSuite with ExpressionEvalHelper {
test("multithreaded eval") {
import scala.concurrent._
@ -42,4 +42,16 @@ class CodeGenerationSuite extends SparkFunSuite {
futures.foreach(Await.result(_, 10.seconds))
}
test("SPARK-8443: split wide projections into blocks due to JVM code size limit") {
val length = 5000
val expressions = List.fill(length)(EqualTo(Literal(1), Literal(1)))
val plan = GenerateMutableProjection.generate(expressions)()
val actual = plan(new GenericMutableRow(length)).toSeq
val expected = Seq.fill(length)(true)
if (!checkResult(actual, expected)) {
fail(s"Incorrect Evaluation: expressions: $expressions, actual: $actual, expected: $expected")
}
}
}