spark-instrumented-optimizer/sql/core/benchmarks/JoinBenchmark-results.txt

83 lines
6.7 KiB
Plaintext
Raw Normal View History

================================================================================================
Join Benchmark
================================================================================================
[SPARK-34620][SQL] Code-gen broadcast nested loop join (inner/cross) ### What changes were proposed in this pull request? `BroadcastNestedLoopJoinExec` does not have code-gen, and we can potentially boost the CPU performance for this operator if we add code-gen for it. https://databricks.com/blog/2017/02/16/processing-trillion-rows-per-second-single-machine-can-nested-loop-joins-fast.html also showed the evidence in one fork. The codegen for `BroadcastNestedLoopJoinExec` shared some code with `HashJoin`, and the interface `JoinCodegenSupport` is created to hold those common logic. This PR is only supporting inner and cross join. Other join types will be added later in followup PRs. Example query and generated code: ``` val df1 = spark.range(4).select($"id".as("k1")) val df2 = spark.range(3).select($"id".as("k2")) df1.join(df2, $"k1" + 1 =!= $"k2").explain("codegen") ``` ``` == Subtree 2 / 2 (maxMethodCodeSize:282; maxConstantPoolSize:203(0.31% used); numInnerClasses:0) == *(2) BroadcastNestedLoopJoin BuildRight, Inner, NOT ((k1#2L + 1) = k2#6L) :- *(2) Project [id#0L AS k1#2L] : +- *(2) Range (0, 4, step=1, splits=2) +- BroadcastExchange IdentityBroadcastMode, [id=#22] +- *(1) Project [id#4L AS k2#6L] +- *(1) Range (0, 3, step=1, splits=2) Generated code: /* 001 */ public Object generate(Object[] references) { /* 002 */ return new GeneratedIteratorForCodegenStage2(references); /* 003 */ } /* 004 */ /* 005 */ // codegenStageId=2 /* 006 */ final class GeneratedIteratorForCodegenStage2 extends org.apache.spark.sql.execution.BufferedRowIterator { /* 007 */ private Object[] references; /* 008 */ private scala.collection.Iterator[] inputs; /* 009 */ private boolean range_initRange_0; /* 010 */ private long range_nextIndex_0; /* 011 */ private TaskContext range_taskContext_0; /* 012 */ private InputMetrics range_inputMetrics_0; /* 013 */ private long range_batchEnd_0; /* 014 */ private long range_numElementsTodo_0; /* 015 */ private InternalRow[] bnlj_buildRowArray_0; /* 016 */ private org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[] range_mutableStateArray_0 = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[4]; /* 017 */ /* 018 */ public GeneratedIteratorForCodegenStage2(Object[] references) { /* 019 */ this.references = references; /* 020 */ } /* 021 */ /* 022 */ public void init(int index, scala.collection.Iterator[] inputs) { /* 023 */ partitionIndex = index; /* 024 */ this.inputs = inputs; /* 025 */ /* 026 */ range_taskContext_0 = TaskContext.get(); /* 027 */ range_inputMetrics_0 = range_taskContext_0.taskMetrics().inputMetrics(); /* 028 */ range_mutableStateArray_0[0] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 0); /* 029 */ range_mutableStateArray_0[1] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 0); /* 030 */ range_mutableStateArray_0[2] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 0); /* 031 */ bnlj_buildRowArray_0 = (InternalRow[]) ((org.apache.spark.broadcast.TorrentBroadcast) references[1] /* broadcastTerm */).value(); /* 032 */ range_mutableStateArray_0[3] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(2, 0); /* 033 */ /* 034 */ } /* 035 */ /* 036 */ private void bnlj_doConsume_0(long bnlj_expr_0_0) throws java.io.IOException { /* 037 */ for (int bnlj_arrayIndex_0 = 0; bnlj_arrayIndex_0 < bnlj_buildRowArray_0.length; bnlj_arrayIndex_0++) { /* 038 */ UnsafeRow bnlj_buildRow_0 = (UnsafeRow) bnlj_buildRowArray_0[bnlj_arrayIndex_0]; /* 039 */ /* 040 */ long bnlj_value_1 = bnlj_buildRow_0.getLong(0); /* 041 */ /* 042 */ long bnlj_value_4 = -1L; /* 043 */ /* 044 */ bnlj_value_4 = bnlj_expr_0_0 + 1L; /* 045 */ /* 046 */ boolean bnlj_value_3 = false; /* 047 */ bnlj_value_3 = bnlj_value_4 == bnlj_value_1; /* 048 */ boolean bnlj_value_2 = false; /* 049 */ bnlj_value_2 = !(bnlj_value_3); /* 050 */ if (!(false || !bnlj_value_2)) /* 051 */ { /* 052 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[2] /* numOutputRows */).add(1); /* 053 */ /* 054 */ range_mutableStateArray_0[3].reset(); /* 055 */ /* 056 */ range_mutableStateArray_0[3].write(0, bnlj_expr_0_0); /* 057 */ /* 058 */ range_mutableStateArray_0[3].write(1, bnlj_value_1); /* 059 */ append((range_mutableStateArray_0[3].getRow()).copy()); /* 060 */ /* 061 */ } /* 062 */ } /* 063 */ /* 064 */ } /* 065 */ /* 066 */ private void initRange(int idx) { /* 067 */ java.math.BigInteger index = java.math.BigInteger.valueOf(idx); /* 068 */ java.math.BigInteger numSlice = java.math.BigInteger.valueOf(2L); /* 069 */ java.math.BigInteger numElement = java.math.BigInteger.valueOf(4L); /* 070 */ java.math.BigInteger step = java.math.BigInteger.valueOf(1L); /* 071 */ java.math.BigInteger start = java.math.BigInteger.valueOf(0L); /* 072 */ long partitionEnd; /* 073 */ /* 074 */ java.math.BigInteger st = index.multiply(numElement).divide(numSlice).multiply(step).add(start); /* 075 */ if (st.compareTo(java.math.BigInteger.valueOf(Long.MAX_VALUE)) > 0) { /* 076 */ range_nextIndex_0 = Long.MAX_VALUE; /* 077 */ } else if (st.compareTo(java.math.BigInteger.valueOf(Long.MIN_VALUE)) < 0) { /* 078 */ range_nextIndex_0 = Long.MIN_VALUE; /* 079 */ } else { /* 080 */ range_nextIndex_0 = st.longValue(); /* 081 */ } /* 082 */ range_batchEnd_0 = range_nextIndex_0; /* 083 */ /* 084 */ java.math.BigInteger end = index.add(java.math.BigInteger.ONE).multiply(numElement).divide(numSlice) /* 085 */ .multiply(step).add(start); /* 086 */ if (end.compareTo(java.math.BigInteger.valueOf(Long.MAX_VALUE)) > 0) { /* 087 */ partitionEnd = Long.MAX_VALUE; /* 088 */ } else if (end.compareTo(java.math.BigInteger.valueOf(Long.MIN_VALUE)) < 0) { /* 089 */ partitionEnd = Long.MIN_VALUE; /* 090 */ } else { /* 091 */ partitionEnd = end.longValue(); /* 092 */ } /* 093 */ /* 094 */ java.math.BigInteger startToEnd = java.math.BigInteger.valueOf(partitionEnd).subtract( /* 095 */ java.math.BigInteger.valueOf(range_nextIndex_0)); /* 096 */ range_numElementsTodo_0 = startToEnd.divide(step).longValue(); /* 097 */ if (range_numElementsTodo_0 < 0) { /* 098 */ range_numElementsTodo_0 = 0; /* 099 */ } else if (startToEnd.remainder(step).compareTo(java.math.BigInteger.valueOf(0L)) != 0) { /* 100 */ range_numElementsTodo_0++; /* 101 */ } /* 102 */ } /* 103 */ /* 104 */ protected void processNext() throws java.io.IOException { /* 105 */ // initialize Range /* 106 */ if (!range_initRange_0) { /* 107 */ range_initRange_0 = true; /* 108 */ initRange(partitionIndex); /* 109 */ } /* 110 */ /* 111 */ while (true) { /* 112 */ if (range_nextIndex_0 == range_batchEnd_0) { /* 113 */ long range_nextBatchTodo_0; /* 114 */ if (range_numElementsTodo_0 > 1000L) { /* 115 */ range_nextBatchTodo_0 = 1000L; /* 116 */ range_numElementsTodo_0 -= 1000L; /* 117 */ } else { /* 118 */ range_nextBatchTodo_0 = range_numElementsTodo_0; /* 119 */ range_numElementsTodo_0 = 0; /* 120 */ if (range_nextBatchTodo_0 == 0) break; /* 121 */ } /* 122 */ range_batchEnd_0 += range_nextBatchTodo_0 * 1L; /* 123 */ } /* 124 */ /* 125 */ int range_localEnd_0 = (int)((range_batchEnd_0 - range_nextIndex_0) / 1L); /* 126 */ for (int range_localIdx_0 = 0; range_localIdx_0 < range_localEnd_0; range_localIdx_0++) { /* 127 */ long range_value_0 = ((long)range_localIdx_0 * 1L) + range_nextIndex_0; /* 128 */ /* 129 */ // common sub-expressions /* 130 */ /* 131 */ bnlj_doConsume_0(range_value_0); /* 132 */ /* 133 */ if (shouldStop()) { /* 134 */ range_nextIndex_0 = range_value_0 + 1L; /* 135 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[0] /* numOutputRows */).add(range_localIdx_0 + 1); /* 136 */ range_inputMetrics_0.incRecordsRead(range_localIdx_0 + 1); /* 137 */ return; /* 138 */ } /* 139 */ /* 140 */ } /* 141 */ range_nextIndex_0 = range_batchEnd_0; /* 142 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[0] /* numOutputRows */).add(range_localEnd_0); /* 143 */ range_inputMetrics_0.incRecordsRead(range_localEnd_0); /* 144 */ range_taskContext_0.killTaskIfInterrupted(); /* 145 */ } /* 146 */ } /* 147 */ /* 148 */ } ``` ### Why are the changes needed? Improve query CPU performance. Added a micro benchmark query in `JoinBenchmark.scala`. Saw 1x of run time improvement: ``` OpenJDK 64-Bit Server VM 11.0.9+11-LTS on Linux 4.14.219-161.340.amzn2.x86_64 Intel(R) Xeon(R) CPU E5-2670 v2 2.50GHz broadcast nested loop join: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- broadcast nested loop join wholestage off 62922 63052 184 0.3 3000.3 1.0X broadcast nested loop join wholestage on 30946 30972 26 0.7 1475.6 2.0X ``` ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? * Added unit test in `WholeStageCodegenSuite.scala`, and existing unit tests for `BroadcastNestedLoopJoinExec`. * Updated golden files for several TCPDS query plans, as whole stage code-gen for `BroadcastNestedLoopJoinExec` is triggered. * Updated `JoinBenchmark-jdk11-results.txt ` and `JoinBenchmark-results.txt` with new benchmark result. Followed previous benchmark PRs - https://github.com/apache/spark/pull/27078 and https://github.com/apache/spark/pull/26003 to use same type of machine: ``` Amazon AWS EC2 type: r3.xlarge region: us-west-2 (Oregon) OS: Linux ``` Closes #31736 from c21/nested-join-exec. Authored-by: Cheng Su <chengsu@fb.com> Signed-off-by: Wenchen Fan <wenchen@databricks.com>
2021-03-09 06:45:43 -05:00
OpenJDK 64-Bit Server VM 1.8.0_272-b10 on Linux 4.14.219-161.340.amzn2.x86_64
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
[SPARK-29320][TESTS] Compare `sql/core` module in JDK8/11 (Part 1) ### What changes were proposed in this pull request? This PR regenerates the `sql/core` benchmarks in JDK8/11 to compare the result. In general, we compare the ratio instead of the time. However, in this PR, the average time is compared. This PR should be considered as a rough comparison. **A. EXPECTED CASES(JDK11 is faster in general)** - [x] BloomFilterBenchmark (JDK11 is faster except one case) - [x] BuiltInDataSourceWriteBenchmark (JDK11 is faster at CSV/ORC) - [x] CSVBenchmark (JDK11 is faster except five cases) - [x] ColumnarBatchBenchmark (JDK11 is faster at `boolean`/`string` and some cases in `int`/`array`) - [x] DatasetBenchmark (JDK11 is faster with `string`, but is slower for `long` type) - [x] ExternalAppendOnlyUnsafeRowArrayBenchmark (JDK11 is faster except two cases) - [x] ExtractBenchmark (JDK11 is faster except HOUR/MINUTE/SECOND/MILLISECONDS/MICROSECONDS) - [x] HashedRelationMetricsBenchmark (JDK11 is faster) - [x] JSONBenchmark (JDK11 is much faster except eight cases) - [x] JoinBenchmark (JDK11 is faster except five cases) - [x] OrcNestedSchemaPruningBenchmark (JDK11 is faster in nine cases) - [x] PrimitiveArrayBenchmark (JDK11 is faster) - [x] SortBenchmark (JDK11 is faster except `Arrays.sort` case) - [x] UDFBenchmark (N/A, values are too small) - [x] UnsafeArrayDataBenchmark (JDK11 is faster except one case) - [x] WideTableBenchmark (JDK11 is faster except two cases) **B. CASES WE NEED TO INVESTIGATE MORE LATER** - [x] AggregateBenchmark (JDK11 is slower in general) - [x] CompressionSchemeBenchmark (JDK11 is slower in general except `string`) - [x] DataSourceReadBenchmark (JDK11 is slower in general) - [x] DateTimeBenchmark (JDK11 is slightly slower in general except `parsing`) - [x] MakeDateTimeBenchmark (JDK11 is slower except two cases) - [x] MiscBenchmark (JDK11 is slower except ten cases) - [x] OrcV2NestedSchemaPruningBenchmark (JDK11 is slower) - [x] ParquetNestedSchemaPruningBenchmark (JDK11 is slower except six cases) - [x] RangeBenchmark (JDK11 is slower except one case) `FilterPushdownBenchmark/InExpressionBenchmark/WideSchemaBenchmark` will be compared later because it took long timer. ### Why are the changes needed? According to the result, there are some difference between JDK8/JDK11. This will be a baseline for the future improvement and comparison. Also, as a reproducible environment, the following environment is used. - Instance: `r3.xlarge` - OS: `CentOS Linux release 7.5.1804 (Core)` - JDK: - `OpenJDK Runtime Environment (build 1.8.0_222-b10)` - `OpenJDK Runtime Environment 18.9 (build 11.0.4+11-LTS)` ### Does this PR introduce any user-facing change? No. ### How was this patch tested? This is a test-only PR. We need to run benchmark. Closes #26003 from dongjoon-hyun/SPARK-29320. Authored-by: Dongjoon Hyun <dhyun@apple.com> Signed-off-by: Dongjoon Hyun <dhyun@apple.com>
2019-10-03 11:58:25 -04:00
Join w long: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
[SPARK-34620][SQL] Code-gen broadcast nested loop join (inner/cross) ### What changes were proposed in this pull request? `BroadcastNestedLoopJoinExec` does not have code-gen, and we can potentially boost the CPU performance for this operator if we add code-gen for it. https://databricks.com/blog/2017/02/16/processing-trillion-rows-per-second-single-machine-can-nested-loop-joins-fast.html also showed the evidence in one fork. The codegen for `BroadcastNestedLoopJoinExec` shared some code with `HashJoin`, and the interface `JoinCodegenSupport` is created to hold those common logic. This PR is only supporting inner and cross join. Other join types will be added later in followup PRs. Example query and generated code: ``` val df1 = spark.range(4).select($"id".as("k1")) val df2 = spark.range(3).select($"id".as("k2")) df1.join(df2, $"k1" + 1 =!= $"k2").explain("codegen") ``` ``` == Subtree 2 / 2 (maxMethodCodeSize:282; maxConstantPoolSize:203(0.31% used); numInnerClasses:0) == *(2) BroadcastNestedLoopJoin BuildRight, Inner, NOT ((k1#2L + 1) = k2#6L) :- *(2) Project [id#0L AS k1#2L] : +- *(2) Range (0, 4, step=1, splits=2) +- BroadcastExchange IdentityBroadcastMode, [id=#22] +- *(1) Project [id#4L AS k2#6L] +- *(1) Range (0, 3, step=1, splits=2) Generated code: /* 001 */ public Object generate(Object[] references) { /* 002 */ return new GeneratedIteratorForCodegenStage2(references); /* 003 */ } /* 004 */ /* 005 */ // codegenStageId=2 /* 006 */ final class GeneratedIteratorForCodegenStage2 extends org.apache.spark.sql.execution.BufferedRowIterator { /* 007 */ private Object[] references; /* 008 */ private scala.collection.Iterator[] inputs; /* 009 */ private boolean range_initRange_0; /* 010 */ private long range_nextIndex_0; /* 011 */ private TaskContext range_taskContext_0; /* 012 */ private InputMetrics range_inputMetrics_0; /* 013 */ private long range_batchEnd_0; /* 014 */ private long range_numElementsTodo_0; /* 015 */ private InternalRow[] bnlj_buildRowArray_0; /* 016 */ private org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[] range_mutableStateArray_0 = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[4]; /* 017 */ /* 018 */ public GeneratedIteratorForCodegenStage2(Object[] references) { /* 019 */ this.references = references; /* 020 */ } /* 021 */ /* 022 */ public void init(int index, scala.collection.Iterator[] inputs) { /* 023 */ partitionIndex = index; /* 024 */ this.inputs = inputs; /* 025 */ /* 026 */ range_taskContext_0 = TaskContext.get(); /* 027 */ range_inputMetrics_0 = range_taskContext_0.taskMetrics().inputMetrics(); /* 028 */ range_mutableStateArray_0[0] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 0); /* 029 */ range_mutableStateArray_0[1] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 0); /* 030 */ range_mutableStateArray_0[2] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 0); /* 031 */ bnlj_buildRowArray_0 = (InternalRow[]) ((org.apache.spark.broadcast.TorrentBroadcast) references[1] /* broadcastTerm */).value(); /* 032 */ range_mutableStateArray_0[3] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(2, 0); /* 033 */ /* 034 */ } /* 035 */ /* 036 */ private void bnlj_doConsume_0(long bnlj_expr_0_0) throws java.io.IOException { /* 037 */ for (int bnlj_arrayIndex_0 = 0; bnlj_arrayIndex_0 < bnlj_buildRowArray_0.length; bnlj_arrayIndex_0++) { /* 038 */ UnsafeRow bnlj_buildRow_0 = (UnsafeRow) bnlj_buildRowArray_0[bnlj_arrayIndex_0]; /* 039 */ /* 040 */ long bnlj_value_1 = bnlj_buildRow_0.getLong(0); /* 041 */ /* 042 */ long bnlj_value_4 = -1L; /* 043 */ /* 044 */ bnlj_value_4 = bnlj_expr_0_0 + 1L; /* 045 */ /* 046 */ boolean bnlj_value_3 = false; /* 047 */ bnlj_value_3 = bnlj_value_4 == bnlj_value_1; /* 048 */ boolean bnlj_value_2 = false; /* 049 */ bnlj_value_2 = !(bnlj_value_3); /* 050 */ if (!(false || !bnlj_value_2)) /* 051 */ { /* 052 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[2] /* numOutputRows */).add(1); /* 053 */ /* 054 */ range_mutableStateArray_0[3].reset(); /* 055 */ /* 056 */ range_mutableStateArray_0[3].write(0, bnlj_expr_0_0); /* 057 */ /* 058 */ range_mutableStateArray_0[3].write(1, bnlj_value_1); /* 059 */ append((range_mutableStateArray_0[3].getRow()).copy()); /* 060 */ /* 061 */ } /* 062 */ } /* 063 */ /* 064 */ } /* 065 */ /* 066 */ private void initRange(int idx) { /* 067 */ java.math.BigInteger index = java.math.BigInteger.valueOf(idx); /* 068 */ java.math.BigInteger numSlice = java.math.BigInteger.valueOf(2L); /* 069 */ java.math.BigInteger numElement = java.math.BigInteger.valueOf(4L); /* 070 */ java.math.BigInteger step = java.math.BigInteger.valueOf(1L); /* 071 */ java.math.BigInteger start = java.math.BigInteger.valueOf(0L); /* 072 */ long partitionEnd; /* 073 */ /* 074 */ java.math.BigInteger st = index.multiply(numElement).divide(numSlice).multiply(step).add(start); /* 075 */ if (st.compareTo(java.math.BigInteger.valueOf(Long.MAX_VALUE)) > 0) { /* 076 */ range_nextIndex_0 = Long.MAX_VALUE; /* 077 */ } else if (st.compareTo(java.math.BigInteger.valueOf(Long.MIN_VALUE)) < 0) { /* 078 */ range_nextIndex_0 = Long.MIN_VALUE; /* 079 */ } else { /* 080 */ range_nextIndex_0 = st.longValue(); /* 081 */ } /* 082 */ range_batchEnd_0 = range_nextIndex_0; /* 083 */ /* 084 */ java.math.BigInteger end = index.add(java.math.BigInteger.ONE).multiply(numElement).divide(numSlice) /* 085 */ .multiply(step).add(start); /* 086 */ if (end.compareTo(java.math.BigInteger.valueOf(Long.MAX_VALUE)) > 0) { /* 087 */ partitionEnd = Long.MAX_VALUE; /* 088 */ } else if (end.compareTo(java.math.BigInteger.valueOf(Long.MIN_VALUE)) < 0) { /* 089 */ partitionEnd = Long.MIN_VALUE; /* 090 */ } else { /* 091 */ partitionEnd = end.longValue(); /* 092 */ } /* 093 */ /* 094 */ java.math.BigInteger startToEnd = java.math.BigInteger.valueOf(partitionEnd).subtract( /* 095 */ java.math.BigInteger.valueOf(range_nextIndex_0)); /* 096 */ range_numElementsTodo_0 = startToEnd.divide(step).longValue(); /* 097 */ if (range_numElementsTodo_0 < 0) { /* 098 */ range_numElementsTodo_0 = 0; /* 099 */ } else if (startToEnd.remainder(step).compareTo(java.math.BigInteger.valueOf(0L)) != 0) { /* 100 */ range_numElementsTodo_0++; /* 101 */ } /* 102 */ } /* 103 */ /* 104 */ protected void processNext() throws java.io.IOException { /* 105 */ // initialize Range /* 106 */ if (!range_initRange_0) { /* 107 */ range_initRange_0 = true; /* 108 */ initRange(partitionIndex); /* 109 */ } /* 110 */ /* 111 */ while (true) { /* 112 */ if (range_nextIndex_0 == range_batchEnd_0) { /* 113 */ long range_nextBatchTodo_0; /* 114 */ if (range_numElementsTodo_0 > 1000L) { /* 115 */ range_nextBatchTodo_0 = 1000L; /* 116 */ range_numElementsTodo_0 -= 1000L; /* 117 */ } else { /* 118 */ range_nextBatchTodo_0 = range_numElementsTodo_0; /* 119 */ range_numElementsTodo_0 = 0; /* 120 */ if (range_nextBatchTodo_0 == 0) break; /* 121 */ } /* 122 */ range_batchEnd_0 += range_nextBatchTodo_0 * 1L; /* 123 */ } /* 124 */ /* 125 */ int range_localEnd_0 = (int)((range_batchEnd_0 - range_nextIndex_0) / 1L); /* 126 */ for (int range_localIdx_0 = 0; range_localIdx_0 < range_localEnd_0; range_localIdx_0++) { /* 127 */ long range_value_0 = ((long)range_localIdx_0 * 1L) + range_nextIndex_0; /* 128 */ /* 129 */ // common sub-expressions /* 130 */ /* 131 */ bnlj_doConsume_0(range_value_0); /* 132 */ /* 133 */ if (shouldStop()) { /* 134 */ range_nextIndex_0 = range_value_0 + 1L; /* 135 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[0] /* numOutputRows */).add(range_localIdx_0 + 1); /* 136 */ range_inputMetrics_0.incRecordsRead(range_localIdx_0 + 1); /* 137 */ return; /* 138 */ } /* 139 */ /* 140 */ } /* 141 */ range_nextIndex_0 = range_batchEnd_0; /* 142 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[0] /* numOutputRows */).add(range_localEnd_0); /* 143 */ range_inputMetrics_0.incRecordsRead(range_localEnd_0); /* 144 */ range_taskContext_0.killTaskIfInterrupted(); /* 145 */ } /* 146 */ } /* 147 */ /* 148 */ } ``` ### Why are the changes needed? Improve query CPU performance. Added a micro benchmark query in `JoinBenchmark.scala`. Saw 1x of run time improvement: ``` OpenJDK 64-Bit Server VM 11.0.9+11-LTS on Linux 4.14.219-161.340.amzn2.x86_64 Intel(R) Xeon(R) CPU E5-2670 v2 2.50GHz broadcast nested loop join: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- broadcast nested loop join wholestage off 62922 63052 184 0.3 3000.3 1.0X broadcast nested loop join wholestage on 30946 30972 26 0.7 1475.6 2.0X ``` ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? * Added unit test in `WholeStageCodegenSuite.scala`, and existing unit tests for `BroadcastNestedLoopJoinExec`. * Updated golden files for several TCPDS query plans, as whole stage code-gen for `BroadcastNestedLoopJoinExec` is triggered. * Updated `JoinBenchmark-jdk11-results.txt ` and `JoinBenchmark-results.txt` with new benchmark result. Followed previous benchmark PRs - https://github.com/apache/spark/pull/27078 and https://github.com/apache/spark/pull/26003 to use same type of machine: ``` Amazon AWS EC2 type: r3.xlarge region: us-west-2 (Oregon) OS: Linux ``` Closes #31736 from c21/nested-join-exec. Authored-by: Cheng Su <chengsu@fb.com> Signed-off-by: Wenchen Fan <wenchen@databricks.com>
2021-03-09 06:45:43 -05:00
Join w long wholestage off 3717 3779 88 5.6 177.2 1.0X
Join w long wholestage on 1283 1458 109 16.3 61.2 2.9X
[SPARK-34620][SQL] Code-gen broadcast nested loop join (inner/cross) ### What changes were proposed in this pull request? `BroadcastNestedLoopJoinExec` does not have code-gen, and we can potentially boost the CPU performance for this operator if we add code-gen for it. https://databricks.com/blog/2017/02/16/processing-trillion-rows-per-second-single-machine-can-nested-loop-joins-fast.html also showed the evidence in one fork. The codegen for `BroadcastNestedLoopJoinExec` shared some code with `HashJoin`, and the interface `JoinCodegenSupport` is created to hold those common logic. This PR is only supporting inner and cross join. Other join types will be added later in followup PRs. Example query and generated code: ``` val df1 = spark.range(4).select($"id".as("k1")) val df2 = spark.range(3).select($"id".as("k2")) df1.join(df2, $"k1" + 1 =!= $"k2").explain("codegen") ``` ``` == Subtree 2 / 2 (maxMethodCodeSize:282; maxConstantPoolSize:203(0.31% used); numInnerClasses:0) == *(2) BroadcastNestedLoopJoin BuildRight, Inner, NOT ((k1#2L + 1) = k2#6L) :- *(2) Project [id#0L AS k1#2L] : +- *(2) Range (0, 4, step=1, splits=2) +- BroadcastExchange IdentityBroadcastMode, [id=#22] +- *(1) Project [id#4L AS k2#6L] +- *(1) Range (0, 3, step=1, splits=2) Generated code: /* 001 */ public Object generate(Object[] references) { /* 002 */ return new GeneratedIteratorForCodegenStage2(references); /* 003 */ } /* 004 */ /* 005 */ // codegenStageId=2 /* 006 */ final class GeneratedIteratorForCodegenStage2 extends org.apache.spark.sql.execution.BufferedRowIterator { /* 007 */ private Object[] references; /* 008 */ private scala.collection.Iterator[] inputs; /* 009 */ private boolean range_initRange_0; /* 010 */ private long range_nextIndex_0; /* 011 */ private TaskContext range_taskContext_0; /* 012 */ private InputMetrics range_inputMetrics_0; /* 013 */ private long range_batchEnd_0; /* 014 */ private long range_numElementsTodo_0; /* 015 */ private InternalRow[] bnlj_buildRowArray_0; /* 016 */ private org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[] range_mutableStateArray_0 = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[4]; /* 017 */ /* 018 */ public GeneratedIteratorForCodegenStage2(Object[] references) { /* 019 */ this.references = references; /* 020 */ } /* 021 */ /* 022 */ public void init(int index, scala.collection.Iterator[] inputs) { /* 023 */ partitionIndex = index; /* 024 */ this.inputs = inputs; /* 025 */ /* 026 */ range_taskContext_0 = TaskContext.get(); /* 027 */ range_inputMetrics_0 = range_taskContext_0.taskMetrics().inputMetrics(); /* 028 */ range_mutableStateArray_0[0] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 0); /* 029 */ range_mutableStateArray_0[1] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 0); /* 030 */ range_mutableStateArray_0[2] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 0); /* 031 */ bnlj_buildRowArray_0 = (InternalRow[]) ((org.apache.spark.broadcast.TorrentBroadcast) references[1] /* broadcastTerm */).value(); /* 032 */ range_mutableStateArray_0[3] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(2, 0); /* 033 */ /* 034 */ } /* 035 */ /* 036 */ private void bnlj_doConsume_0(long bnlj_expr_0_0) throws java.io.IOException { /* 037 */ for (int bnlj_arrayIndex_0 = 0; bnlj_arrayIndex_0 < bnlj_buildRowArray_0.length; bnlj_arrayIndex_0++) { /* 038 */ UnsafeRow bnlj_buildRow_0 = (UnsafeRow) bnlj_buildRowArray_0[bnlj_arrayIndex_0]; /* 039 */ /* 040 */ long bnlj_value_1 = bnlj_buildRow_0.getLong(0); /* 041 */ /* 042 */ long bnlj_value_4 = -1L; /* 043 */ /* 044 */ bnlj_value_4 = bnlj_expr_0_0 + 1L; /* 045 */ /* 046 */ boolean bnlj_value_3 = false; /* 047 */ bnlj_value_3 = bnlj_value_4 == bnlj_value_1; /* 048 */ boolean bnlj_value_2 = false; /* 049 */ bnlj_value_2 = !(bnlj_value_3); /* 050 */ if (!(false || !bnlj_value_2)) /* 051 */ { /* 052 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[2] /* numOutputRows */).add(1); /* 053 */ /* 054 */ range_mutableStateArray_0[3].reset(); /* 055 */ /* 056 */ range_mutableStateArray_0[3].write(0, bnlj_expr_0_0); /* 057 */ /* 058 */ range_mutableStateArray_0[3].write(1, bnlj_value_1); /* 059 */ append((range_mutableStateArray_0[3].getRow()).copy()); /* 060 */ /* 061 */ } /* 062 */ } /* 063 */ /* 064 */ } /* 065 */ /* 066 */ private void initRange(int idx) { /* 067 */ java.math.BigInteger index = java.math.BigInteger.valueOf(idx); /* 068 */ java.math.BigInteger numSlice = java.math.BigInteger.valueOf(2L); /* 069 */ java.math.BigInteger numElement = java.math.BigInteger.valueOf(4L); /* 070 */ java.math.BigInteger step = java.math.BigInteger.valueOf(1L); /* 071 */ java.math.BigInteger start = java.math.BigInteger.valueOf(0L); /* 072 */ long partitionEnd; /* 073 */ /* 074 */ java.math.BigInteger st = index.multiply(numElement).divide(numSlice).multiply(step).add(start); /* 075 */ if (st.compareTo(java.math.BigInteger.valueOf(Long.MAX_VALUE)) > 0) { /* 076 */ range_nextIndex_0 = Long.MAX_VALUE; /* 077 */ } else if (st.compareTo(java.math.BigInteger.valueOf(Long.MIN_VALUE)) < 0) { /* 078 */ range_nextIndex_0 = Long.MIN_VALUE; /* 079 */ } else { /* 080 */ range_nextIndex_0 = st.longValue(); /* 081 */ } /* 082 */ range_batchEnd_0 = range_nextIndex_0; /* 083 */ /* 084 */ java.math.BigInteger end = index.add(java.math.BigInteger.ONE).multiply(numElement).divide(numSlice) /* 085 */ .multiply(step).add(start); /* 086 */ if (end.compareTo(java.math.BigInteger.valueOf(Long.MAX_VALUE)) > 0) { /* 087 */ partitionEnd = Long.MAX_VALUE; /* 088 */ } else if (end.compareTo(java.math.BigInteger.valueOf(Long.MIN_VALUE)) < 0) { /* 089 */ partitionEnd = Long.MIN_VALUE; /* 090 */ } else { /* 091 */ partitionEnd = end.longValue(); /* 092 */ } /* 093 */ /* 094 */ java.math.BigInteger startToEnd = java.math.BigInteger.valueOf(partitionEnd).subtract( /* 095 */ java.math.BigInteger.valueOf(range_nextIndex_0)); /* 096 */ range_numElementsTodo_0 = startToEnd.divide(step).longValue(); /* 097 */ if (range_numElementsTodo_0 < 0) { /* 098 */ range_numElementsTodo_0 = 0; /* 099 */ } else if (startToEnd.remainder(step).compareTo(java.math.BigInteger.valueOf(0L)) != 0) { /* 100 */ range_numElementsTodo_0++; /* 101 */ } /* 102 */ } /* 103 */ /* 104 */ protected void processNext() throws java.io.IOException { /* 105 */ // initialize Range /* 106 */ if (!range_initRange_0) { /* 107 */ range_initRange_0 = true; /* 108 */ initRange(partitionIndex); /* 109 */ } /* 110 */ /* 111 */ while (true) { /* 112 */ if (range_nextIndex_0 == range_batchEnd_0) { /* 113 */ long range_nextBatchTodo_0; /* 114 */ if (range_numElementsTodo_0 > 1000L) { /* 115 */ range_nextBatchTodo_0 = 1000L; /* 116 */ range_numElementsTodo_0 -= 1000L; /* 117 */ } else { /* 118 */ range_nextBatchTodo_0 = range_numElementsTodo_0; /* 119 */ range_numElementsTodo_0 = 0; /* 120 */ if (range_nextBatchTodo_0 == 0) break; /* 121 */ } /* 122 */ range_batchEnd_0 += range_nextBatchTodo_0 * 1L; /* 123 */ } /* 124 */ /* 125 */ int range_localEnd_0 = (int)((range_batchEnd_0 - range_nextIndex_0) / 1L); /* 126 */ for (int range_localIdx_0 = 0; range_localIdx_0 < range_localEnd_0; range_localIdx_0++) { /* 127 */ long range_value_0 = ((long)range_localIdx_0 * 1L) + range_nextIndex_0; /* 128 */ /* 129 */ // common sub-expressions /* 130 */ /* 131 */ bnlj_doConsume_0(range_value_0); /* 132 */ /* 133 */ if (shouldStop()) { /* 134 */ range_nextIndex_0 = range_value_0 + 1L; /* 135 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[0] /* numOutputRows */).add(range_localIdx_0 + 1); /* 136 */ range_inputMetrics_0.incRecordsRead(range_localIdx_0 + 1); /* 137 */ return; /* 138 */ } /* 139 */ /* 140 */ } /* 141 */ range_nextIndex_0 = range_batchEnd_0; /* 142 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[0] /* numOutputRows */).add(range_localEnd_0); /* 143 */ range_inputMetrics_0.incRecordsRead(range_localEnd_0); /* 144 */ range_taskContext_0.killTaskIfInterrupted(); /* 145 */ } /* 146 */ } /* 147 */ /* 148 */ } ``` ### Why are the changes needed? Improve query CPU performance. Added a micro benchmark query in `JoinBenchmark.scala`. Saw 1x of run time improvement: ``` OpenJDK 64-Bit Server VM 11.0.9+11-LTS on Linux 4.14.219-161.340.amzn2.x86_64 Intel(R) Xeon(R) CPU E5-2670 v2 2.50GHz broadcast nested loop join: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- broadcast nested loop join wholestage off 62922 63052 184 0.3 3000.3 1.0X broadcast nested loop join wholestage on 30946 30972 26 0.7 1475.6 2.0X ``` ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? * Added unit test in `WholeStageCodegenSuite.scala`, and existing unit tests for `BroadcastNestedLoopJoinExec`. * Updated golden files for several TCPDS query plans, as whole stage code-gen for `BroadcastNestedLoopJoinExec` is triggered. * Updated `JoinBenchmark-jdk11-results.txt ` and `JoinBenchmark-results.txt` with new benchmark result. Followed previous benchmark PRs - https://github.com/apache/spark/pull/27078 and https://github.com/apache/spark/pull/26003 to use same type of machine: ``` Amazon AWS EC2 type: r3.xlarge region: us-west-2 (Oregon) OS: Linux ``` Closes #31736 from c21/nested-join-exec. Authored-by: Cheng Su <chengsu@fb.com> Signed-off-by: Wenchen Fan <wenchen@databricks.com>
2021-03-09 06:45:43 -05:00
OpenJDK 64-Bit Server VM 1.8.0_272-b10 on Linux 4.14.219-161.340.amzn2.x86_64
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
[SPARK-29320][TESTS] Compare `sql/core` module in JDK8/11 (Part 1) ### What changes were proposed in this pull request? This PR regenerates the `sql/core` benchmarks in JDK8/11 to compare the result. In general, we compare the ratio instead of the time. However, in this PR, the average time is compared. This PR should be considered as a rough comparison. **A. EXPECTED CASES(JDK11 is faster in general)** - [x] BloomFilterBenchmark (JDK11 is faster except one case) - [x] BuiltInDataSourceWriteBenchmark (JDK11 is faster at CSV/ORC) - [x] CSVBenchmark (JDK11 is faster except five cases) - [x] ColumnarBatchBenchmark (JDK11 is faster at `boolean`/`string` and some cases in `int`/`array`) - [x] DatasetBenchmark (JDK11 is faster with `string`, but is slower for `long` type) - [x] ExternalAppendOnlyUnsafeRowArrayBenchmark (JDK11 is faster except two cases) - [x] ExtractBenchmark (JDK11 is faster except HOUR/MINUTE/SECOND/MILLISECONDS/MICROSECONDS) - [x] HashedRelationMetricsBenchmark (JDK11 is faster) - [x] JSONBenchmark (JDK11 is much faster except eight cases) - [x] JoinBenchmark (JDK11 is faster except five cases) - [x] OrcNestedSchemaPruningBenchmark (JDK11 is faster in nine cases) - [x] PrimitiveArrayBenchmark (JDK11 is faster) - [x] SortBenchmark (JDK11 is faster except `Arrays.sort` case) - [x] UDFBenchmark (N/A, values are too small) - [x] UnsafeArrayDataBenchmark (JDK11 is faster except one case) - [x] WideTableBenchmark (JDK11 is faster except two cases) **B. CASES WE NEED TO INVESTIGATE MORE LATER** - [x] AggregateBenchmark (JDK11 is slower in general) - [x] CompressionSchemeBenchmark (JDK11 is slower in general except `string`) - [x] DataSourceReadBenchmark (JDK11 is slower in general) - [x] DateTimeBenchmark (JDK11 is slightly slower in general except `parsing`) - [x] MakeDateTimeBenchmark (JDK11 is slower except two cases) - [x] MiscBenchmark (JDK11 is slower except ten cases) - [x] OrcV2NestedSchemaPruningBenchmark (JDK11 is slower) - [x] ParquetNestedSchemaPruningBenchmark (JDK11 is slower except six cases) - [x] RangeBenchmark (JDK11 is slower except one case) `FilterPushdownBenchmark/InExpressionBenchmark/WideSchemaBenchmark` will be compared later because it took long timer. ### Why are the changes needed? According to the result, there are some difference between JDK8/JDK11. This will be a baseline for the future improvement and comparison. Also, as a reproducible environment, the following environment is used. - Instance: `r3.xlarge` - OS: `CentOS Linux release 7.5.1804 (Core)` - JDK: - `OpenJDK Runtime Environment (build 1.8.0_222-b10)` - `OpenJDK Runtime Environment 18.9 (build 11.0.4+11-LTS)` ### Does this PR introduce any user-facing change? No. ### How was this patch tested? This is a test-only PR. We need to run benchmark. Closes #26003 from dongjoon-hyun/SPARK-29320. Authored-by: Dongjoon Hyun <dhyun@apple.com> Signed-off-by: Dongjoon Hyun <dhyun@apple.com>
2019-10-03 11:58:25 -04:00
Join w long duplicated: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
[SPARK-34620][SQL] Code-gen broadcast nested loop join (inner/cross) ### What changes were proposed in this pull request? `BroadcastNestedLoopJoinExec` does not have code-gen, and we can potentially boost the CPU performance for this operator if we add code-gen for it. https://databricks.com/blog/2017/02/16/processing-trillion-rows-per-second-single-machine-can-nested-loop-joins-fast.html also showed the evidence in one fork. The codegen for `BroadcastNestedLoopJoinExec` shared some code with `HashJoin`, and the interface `JoinCodegenSupport` is created to hold those common logic. This PR is only supporting inner and cross join. Other join types will be added later in followup PRs. Example query and generated code: ``` val df1 = spark.range(4).select($"id".as("k1")) val df2 = spark.range(3).select($"id".as("k2")) df1.join(df2, $"k1" + 1 =!= $"k2").explain("codegen") ``` ``` == Subtree 2 / 2 (maxMethodCodeSize:282; maxConstantPoolSize:203(0.31% used); numInnerClasses:0) == *(2) BroadcastNestedLoopJoin BuildRight, Inner, NOT ((k1#2L + 1) = k2#6L) :- *(2) Project [id#0L AS k1#2L] : +- *(2) Range (0, 4, step=1, splits=2) +- BroadcastExchange IdentityBroadcastMode, [id=#22] +- *(1) Project [id#4L AS k2#6L] +- *(1) Range (0, 3, step=1, splits=2) Generated code: /* 001 */ public Object generate(Object[] references) { /* 002 */ return new GeneratedIteratorForCodegenStage2(references); /* 003 */ } /* 004 */ /* 005 */ // codegenStageId=2 /* 006 */ final class GeneratedIteratorForCodegenStage2 extends org.apache.spark.sql.execution.BufferedRowIterator { /* 007 */ private Object[] references; /* 008 */ private scala.collection.Iterator[] inputs; /* 009 */ private boolean range_initRange_0; /* 010 */ private long range_nextIndex_0; /* 011 */ private TaskContext range_taskContext_0; /* 012 */ private InputMetrics range_inputMetrics_0; /* 013 */ private long range_batchEnd_0; /* 014 */ private long range_numElementsTodo_0; /* 015 */ private InternalRow[] bnlj_buildRowArray_0; /* 016 */ private org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[] range_mutableStateArray_0 = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[4]; /* 017 */ /* 018 */ public GeneratedIteratorForCodegenStage2(Object[] references) { /* 019 */ this.references = references; /* 020 */ } /* 021 */ /* 022 */ public void init(int index, scala.collection.Iterator[] inputs) { /* 023 */ partitionIndex = index; /* 024 */ this.inputs = inputs; /* 025 */ /* 026 */ range_taskContext_0 = TaskContext.get(); /* 027 */ range_inputMetrics_0 = range_taskContext_0.taskMetrics().inputMetrics(); /* 028 */ range_mutableStateArray_0[0] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 0); /* 029 */ range_mutableStateArray_0[1] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 0); /* 030 */ range_mutableStateArray_0[2] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 0); /* 031 */ bnlj_buildRowArray_0 = (InternalRow[]) ((org.apache.spark.broadcast.TorrentBroadcast) references[1] /* broadcastTerm */).value(); /* 032 */ range_mutableStateArray_0[3] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(2, 0); /* 033 */ /* 034 */ } /* 035 */ /* 036 */ private void bnlj_doConsume_0(long bnlj_expr_0_0) throws java.io.IOException { /* 037 */ for (int bnlj_arrayIndex_0 = 0; bnlj_arrayIndex_0 < bnlj_buildRowArray_0.length; bnlj_arrayIndex_0++) { /* 038 */ UnsafeRow bnlj_buildRow_0 = (UnsafeRow) bnlj_buildRowArray_0[bnlj_arrayIndex_0]; /* 039 */ /* 040 */ long bnlj_value_1 = bnlj_buildRow_0.getLong(0); /* 041 */ /* 042 */ long bnlj_value_4 = -1L; /* 043 */ /* 044 */ bnlj_value_4 = bnlj_expr_0_0 + 1L; /* 045 */ /* 046 */ boolean bnlj_value_3 = false; /* 047 */ bnlj_value_3 = bnlj_value_4 == bnlj_value_1; /* 048 */ boolean bnlj_value_2 = false; /* 049 */ bnlj_value_2 = !(bnlj_value_3); /* 050 */ if (!(false || !bnlj_value_2)) /* 051 */ { /* 052 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[2] /* numOutputRows */).add(1); /* 053 */ /* 054 */ range_mutableStateArray_0[3].reset(); /* 055 */ /* 056 */ range_mutableStateArray_0[3].write(0, bnlj_expr_0_0); /* 057 */ /* 058 */ range_mutableStateArray_0[3].write(1, bnlj_value_1); /* 059 */ append((range_mutableStateArray_0[3].getRow()).copy()); /* 060 */ /* 061 */ } /* 062 */ } /* 063 */ /* 064 */ } /* 065 */ /* 066 */ private void initRange(int idx) { /* 067 */ java.math.BigInteger index = java.math.BigInteger.valueOf(idx); /* 068 */ java.math.BigInteger numSlice = java.math.BigInteger.valueOf(2L); /* 069 */ java.math.BigInteger numElement = java.math.BigInteger.valueOf(4L); /* 070 */ java.math.BigInteger step = java.math.BigInteger.valueOf(1L); /* 071 */ java.math.BigInteger start = java.math.BigInteger.valueOf(0L); /* 072 */ long partitionEnd; /* 073 */ /* 074 */ java.math.BigInteger st = index.multiply(numElement).divide(numSlice).multiply(step).add(start); /* 075 */ if (st.compareTo(java.math.BigInteger.valueOf(Long.MAX_VALUE)) > 0) { /* 076 */ range_nextIndex_0 = Long.MAX_VALUE; /* 077 */ } else if (st.compareTo(java.math.BigInteger.valueOf(Long.MIN_VALUE)) < 0) { /* 078 */ range_nextIndex_0 = Long.MIN_VALUE; /* 079 */ } else { /* 080 */ range_nextIndex_0 = st.longValue(); /* 081 */ } /* 082 */ range_batchEnd_0 = range_nextIndex_0; /* 083 */ /* 084 */ java.math.BigInteger end = index.add(java.math.BigInteger.ONE).multiply(numElement).divide(numSlice) /* 085 */ .multiply(step).add(start); /* 086 */ if (end.compareTo(java.math.BigInteger.valueOf(Long.MAX_VALUE)) > 0) { /* 087 */ partitionEnd = Long.MAX_VALUE; /* 088 */ } else if (end.compareTo(java.math.BigInteger.valueOf(Long.MIN_VALUE)) < 0) { /* 089 */ partitionEnd = Long.MIN_VALUE; /* 090 */ } else { /* 091 */ partitionEnd = end.longValue(); /* 092 */ } /* 093 */ /* 094 */ java.math.BigInteger startToEnd = java.math.BigInteger.valueOf(partitionEnd).subtract( /* 095 */ java.math.BigInteger.valueOf(range_nextIndex_0)); /* 096 */ range_numElementsTodo_0 = startToEnd.divide(step).longValue(); /* 097 */ if (range_numElementsTodo_0 < 0) { /* 098 */ range_numElementsTodo_0 = 0; /* 099 */ } else if (startToEnd.remainder(step).compareTo(java.math.BigInteger.valueOf(0L)) != 0) { /* 100 */ range_numElementsTodo_0++; /* 101 */ } /* 102 */ } /* 103 */ /* 104 */ protected void processNext() throws java.io.IOException { /* 105 */ // initialize Range /* 106 */ if (!range_initRange_0) { /* 107 */ range_initRange_0 = true; /* 108 */ initRange(partitionIndex); /* 109 */ } /* 110 */ /* 111 */ while (true) { /* 112 */ if (range_nextIndex_0 == range_batchEnd_0) { /* 113 */ long range_nextBatchTodo_0; /* 114 */ if (range_numElementsTodo_0 > 1000L) { /* 115 */ range_nextBatchTodo_0 = 1000L; /* 116 */ range_numElementsTodo_0 -= 1000L; /* 117 */ } else { /* 118 */ range_nextBatchTodo_0 = range_numElementsTodo_0; /* 119 */ range_numElementsTodo_0 = 0; /* 120 */ if (range_nextBatchTodo_0 == 0) break; /* 121 */ } /* 122 */ range_batchEnd_0 += range_nextBatchTodo_0 * 1L; /* 123 */ } /* 124 */ /* 125 */ int range_localEnd_0 = (int)((range_batchEnd_0 - range_nextIndex_0) / 1L); /* 126 */ for (int range_localIdx_0 = 0; range_localIdx_0 < range_localEnd_0; range_localIdx_0++) { /* 127 */ long range_value_0 = ((long)range_localIdx_0 * 1L) + range_nextIndex_0; /* 128 */ /* 129 */ // common sub-expressions /* 130 */ /* 131 */ bnlj_doConsume_0(range_value_0); /* 132 */ /* 133 */ if (shouldStop()) { /* 134 */ range_nextIndex_0 = range_value_0 + 1L; /* 135 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[0] /* numOutputRows */).add(range_localIdx_0 + 1); /* 136 */ range_inputMetrics_0.incRecordsRead(range_localIdx_0 + 1); /* 137 */ return; /* 138 */ } /* 139 */ /* 140 */ } /* 141 */ range_nextIndex_0 = range_batchEnd_0; /* 142 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[0] /* numOutputRows */).add(range_localEnd_0); /* 143 */ range_inputMetrics_0.incRecordsRead(range_localEnd_0); /* 144 */ range_taskContext_0.killTaskIfInterrupted(); /* 145 */ } /* 146 */ } /* 147 */ /* 148 */ } ``` ### Why are the changes needed? Improve query CPU performance. Added a micro benchmark query in `JoinBenchmark.scala`. Saw 1x of run time improvement: ``` OpenJDK 64-Bit Server VM 11.0.9+11-LTS on Linux 4.14.219-161.340.amzn2.x86_64 Intel(R) Xeon(R) CPU E5-2670 v2 2.50GHz broadcast nested loop join: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- broadcast nested loop join wholestage off 62922 63052 184 0.3 3000.3 1.0X broadcast nested loop join wholestage on 30946 30972 26 0.7 1475.6 2.0X ``` ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? * Added unit test in `WholeStageCodegenSuite.scala`, and existing unit tests for `BroadcastNestedLoopJoinExec`. * Updated golden files for several TCPDS query plans, as whole stage code-gen for `BroadcastNestedLoopJoinExec` is triggered. * Updated `JoinBenchmark-jdk11-results.txt ` and `JoinBenchmark-results.txt` with new benchmark result. Followed previous benchmark PRs - https://github.com/apache/spark/pull/27078 and https://github.com/apache/spark/pull/26003 to use same type of machine: ``` Amazon AWS EC2 type: r3.xlarge region: us-west-2 (Oregon) OS: Linux ``` Closes #31736 from c21/nested-join-exec. Authored-by: Cheng Su <chengsu@fb.com> Signed-off-by: Wenchen Fan <wenchen@databricks.com>
2021-03-09 06:45:43 -05:00
Join w long duplicated wholestage off 5342 5378 50 3.9 254.7 1.0X
Join w long duplicated wholestage on 1559 1594 24 13.5 74.3 3.4X
[SPARK-34620][SQL] Code-gen broadcast nested loop join (inner/cross) ### What changes were proposed in this pull request? `BroadcastNestedLoopJoinExec` does not have code-gen, and we can potentially boost the CPU performance for this operator if we add code-gen for it. https://databricks.com/blog/2017/02/16/processing-trillion-rows-per-second-single-machine-can-nested-loop-joins-fast.html also showed the evidence in one fork. The codegen for `BroadcastNestedLoopJoinExec` shared some code with `HashJoin`, and the interface `JoinCodegenSupport` is created to hold those common logic. This PR is only supporting inner and cross join. Other join types will be added later in followup PRs. Example query and generated code: ``` val df1 = spark.range(4).select($"id".as("k1")) val df2 = spark.range(3).select($"id".as("k2")) df1.join(df2, $"k1" + 1 =!= $"k2").explain("codegen") ``` ``` == Subtree 2 / 2 (maxMethodCodeSize:282; maxConstantPoolSize:203(0.31% used); numInnerClasses:0) == *(2) BroadcastNestedLoopJoin BuildRight, Inner, NOT ((k1#2L + 1) = k2#6L) :- *(2) Project [id#0L AS k1#2L] : +- *(2) Range (0, 4, step=1, splits=2) +- BroadcastExchange IdentityBroadcastMode, [id=#22] +- *(1) Project [id#4L AS k2#6L] +- *(1) Range (0, 3, step=1, splits=2) Generated code: /* 001 */ public Object generate(Object[] references) { /* 002 */ return new GeneratedIteratorForCodegenStage2(references); /* 003 */ } /* 004 */ /* 005 */ // codegenStageId=2 /* 006 */ final class GeneratedIteratorForCodegenStage2 extends org.apache.spark.sql.execution.BufferedRowIterator { /* 007 */ private Object[] references; /* 008 */ private scala.collection.Iterator[] inputs; /* 009 */ private boolean range_initRange_0; /* 010 */ private long range_nextIndex_0; /* 011 */ private TaskContext range_taskContext_0; /* 012 */ private InputMetrics range_inputMetrics_0; /* 013 */ private long range_batchEnd_0; /* 014 */ private long range_numElementsTodo_0; /* 015 */ private InternalRow[] bnlj_buildRowArray_0; /* 016 */ private org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[] range_mutableStateArray_0 = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[4]; /* 017 */ /* 018 */ public GeneratedIteratorForCodegenStage2(Object[] references) { /* 019 */ this.references = references; /* 020 */ } /* 021 */ /* 022 */ public void init(int index, scala.collection.Iterator[] inputs) { /* 023 */ partitionIndex = index; /* 024 */ this.inputs = inputs; /* 025 */ /* 026 */ range_taskContext_0 = TaskContext.get(); /* 027 */ range_inputMetrics_0 = range_taskContext_0.taskMetrics().inputMetrics(); /* 028 */ range_mutableStateArray_0[0] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 0); /* 029 */ range_mutableStateArray_0[1] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 0); /* 030 */ range_mutableStateArray_0[2] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 0); /* 031 */ bnlj_buildRowArray_0 = (InternalRow[]) ((org.apache.spark.broadcast.TorrentBroadcast) references[1] /* broadcastTerm */).value(); /* 032 */ range_mutableStateArray_0[3] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(2, 0); /* 033 */ /* 034 */ } /* 035 */ /* 036 */ private void bnlj_doConsume_0(long bnlj_expr_0_0) throws java.io.IOException { /* 037 */ for (int bnlj_arrayIndex_0 = 0; bnlj_arrayIndex_0 < bnlj_buildRowArray_0.length; bnlj_arrayIndex_0++) { /* 038 */ UnsafeRow bnlj_buildRow_0 = (UnsafeRow) bnlj_buildRowArray_0[bnlj_arrayIndex_0]; /* 039 */ /* 040 */ long bnlj_value_1 = bnlj_buildRow_0.getLong(0); /* 041 */ /* 042 */ long bnlj_value_4 = -1L; /* 043 */ /* 044 */ bnlj_value_4 = bnlj_expr_0_0 + 1L; /* 045 */ /* 046 */ boolean bnlj_value_3 = false; /* 047 */ bnlj_value_3 = bnlj_value_4 == bnlj_value_1; /* 048 */ boolean bnlj_value_2 = false; /* 049 */ bnlj_value_2 = !(bnlj_value_3); /* 050 */ if (!(false || !bnlj_value_2)) /* 051 */ { /* 052 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[2] /* numOutputRows */).add(1); /* 053 */ /* 054 */ range_mutableStateArray_0[3].reset(); /* 055 */ /* 056 */ range_mutableStateArray_0[3].write(0, bnlj_expr_0_0); /* 057 */ /* 058 */ range_mutableStateArray_0[3].write(1, bnlj_value_1); /* 059 */ append((range_mutableStateArray_0[3].getRow()).copy()); /* 060 */ /* 061 */ } /* 062 */ } /* 063 */ /* 064 */ } /* 065 */ /* 066 */ private void initRange(int idx) { /* 067 */ java.math.BigInteger index = java.math.BigInteger.valueOf(idx); /* 068 */ java.math.BigInteger numSlice = java.math.BigInteger.valueOf(2L); /* 069 */ java.math.BigInteger numElement = java.math.BigInteger.valueOf(4L); /* 070 */ java.math.BigInteger step = java.math.BigInteger.valueOf(1L); /* 071 */ java.math.BigInteger start = java.math.BigInteger.valueOf(0L); /* 072 */ long partitionEnd; /* 073 */ /* 074 */ java.math.BigInteger st = index.multiply(numElement).divide(numSlice).multiply(step).add(start); /* 075 */ if (st.compareTo(java.math.BigInteger.valueOf(Long.MAX_VALUE)) > 0) { /* 076 */ range_nextIndex_0 = Long.MAX_VALUE; /* 077 */ } else if (st.compareTo(java.math.BigInteger.valueOf(Long.MIN_VALUE)) < 0) { /* 078 */ range_nextIndex_0 = Long.MIN_VALUE; /* 079 */ } else { /* 080 */ range_nextIndex_0 = st.longValue(); /* 081 */ } /* 082 */ range_batchEnd_0 = range_nextIndex_0; /* 083 */ /* 084 */ java.math.BigInteger end = index.add(java.math.BigInteger.ONE).multiply(numElement).divide(numSlice) /* 085 */ .multiply(step).add(start); /* 086 */ if (end.compareTo(java.math.BigInteger.valueOf(Long.MAX_VALUE)) > 0) { /* 087 */ partitionEnd = Long.MAX_VALUE; /* 088 */ } else if (end.compareTo(java.math.BigInteger.valueOf(Long.MIN_VALUE)) < 0) { /* 089 */ partitionEnd = Long.MIN_VALUE; /* 090 */ } else { /* 091 */ partitionEnd = end.longValue(); /* 092 */ } /* 093 */ /* 094 */ java.math.BigInteger startToEnd = java.math.BigInteger.valueOf(partitionEnd).subtract( /* 095 */ java.math.BigInteger.valueOf(range_nextIndex_0)); /* 096 */ range_numElementsTodo_0 = startToEnd.divide(step).longValue(); /* 097 */ if (range_numElementsTodo_0 < 0) { /* 098 */ range_numElementsTodo_0 = 0; /* 099 */ } else if (startToEnd.remainder(step).compareTo(java.math.BigInteger.valueOf(0L)) != 0) { /* 100 */ range_numElementsTodo_0++; /* 101 */ } /* 102 */ } /* 103 */ /* 104 */ protected void processNext() throws java.io.IOException { /* 105 */ // initialize Range /* 106 */ if (!range_initRange_0) { /* 107 */ range_initRange_0 = true; /* 108 */ initRange(partitionIndex); /* 109 */ } /* 110 */ /* 111 */ while (true) { /* 112 */ if (range_nextIndex_0 == range_batchEnd_0) { /* 113 */ long range_nextBatchTodo_0; /* 114 */ if (range_numElementsTodo_0 > 1000L) { /* 115 */ range_nextBatchTodo_0 = 1000L; /* 116 */ range_numElementsTodo_0 -= 1000L; /* 117 */ } else { /* 118 */ range_nextBatchTodo_0 = range_numElementsTodo_0; /* 119 */ range_numElementsTodo_0 = 0; /* 120 */ if (range_nextBatchTodo_0 == 0) break; /* 121 */ } /* 122 */ range_batchEnd_0 += range_nextBatchTodo_0 * 1L; /* 123 */ } /* 124 */ /* 125 */ int range_localEnd_0 = (int)((range_batchEnd_0 - range_nextIndex_0) / 1L); /* 126 */ for (int range_localIdx_0 = 0; range_localIdx_0 < range_localEnd_0; range_localIdx_0++) { /* 127 */ long range_value_0 = ((long)range_localIdx_0 * 1L) + range_nextIndex_0; /* 128 */ /* 129 */ // common sub-expressions /* 130 */ /* 131 */ bnlj_doConsume_0(range_value_0); /* 132 */ /* 133 */ if (shouldStop()) { /* 134 */ range_nextIndex_0 = range_value_0 + 1L; /* 135 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[0] /* numOutputRows */).add(range_localIdx_0 + 1); /* 136 */ range_inputMetrics_0.incRecordsRead(range_localIdx_0 + 1); /* 137 */ return; /* 138 */ } /* 139 */ /* 140 */ } /* 141 */ range_nextIndex_0 = range_batchEnd_0; /* 142 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[0] /* numOutputRows */).add(range_localEnd_0); /* 143 */ range_inputMetrics_0.incRecordsRead(range_localEnd_0); /* 144 */ range_taskContext_0.killTaskIfInterrupted(); /* 145 */ } /* 146 */ } /* 147 */ /* 148 */ } ``` ### Why are the changes needed? Improve query CPU performance. Added a micro benchmark query in `JoinBenchmark.scala`. Saw 1x of run time improvement: ``` OpenJDK 64-Bit Server VM 11.0.9+11-LTS on Linux 4.14.219-161.340.amzn2.x86_64 Intel(R) Xeon(R) CPU E5-2670 v2 2.50GHz broadcast nested loop join: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- broadcast nested loop join wholestage off 62922 63052 184 0.3 3000.3 1.0X broadcast nested loop join wholestage on 30946 30972 26 0.7 1475.6 2.0X ``` ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? * Added unit test in `WholeStageCodegenSuite.scala`, and existing unit tests for `BroadcastNestedLoopJoinExec`. * Updated golden files for several TCPDS query plans, as whole stage code-gen for `BroadcastNestedLoopJoinExec` is triggered. * Updated `JoinBenchmark-jdk11-results.txt ` and `JoinBenchmark-results.txt` with new benchmark result. Followed previous benchmark PRs - https://github.com/apache/spark/pull/27078 and https://github.com/apache/spark/pull/26003 to use same type of machine: ``` Amazon AWS EC2 type: r3.xlarge region: us-west-2 (Oregon) OS: Linux ``` Closes #31736 from c21/nested-join-exec. Authored-by: Cheng Su <chengsu@fb.com> Signed-off-by: Wenchen Fan <wenchen@databricks.com>
2021-03-09 06:45:43 -05:00
OpenJDK 64-Bit Server VM 1.8.0_272-b10 on Linux 4.14.219-161.340.amzn2.x86_64
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
[SPARK-29320][TESTS] Compare `sql/core` module in JDK8/11 (Part 1) ### What changes were proposed in this pull request? This PR regenerates the `sql/core` benchmarks in JDK8/11 to compare the result. In general, we compare the ratio instead of the time. However, in this PR, the average time is compared. This PR should be considered as a rough comparison. **A. EXPECTED CASES(JDK11 is faster in general)** - [x] BloomFilterBenchmark (JDK11 is faster except one case) - [x] BuiltInDataSourceWriteBenchmark (JDK11 is faster at CSV/ORC) - [x] CSVBenchmark (JDK11 is faster except five cases) - [x] ColumnarBatchBenchmark (JDK11 is faster at `boolean`/`string` and some cases in `int`/`array`) - [x] DatasetBenchmark (JDK11 is faster with `string`, but is slower for `long` type) - [x] ExternalAppendOnlyUnsafeRowArrayBenchmark (JDK11 is faster except two cases) - [x] ExtractBenchmark (JDK11 is faster except HOUR/MINUTE/SECOND/MILLISECONDS/MICROSECONDS) - [x] HashedRelationMetricsBenchmark (JDK11 is faster) - [x] JSONBenchmark (JDK11 is much faster except eight cases) - [x] JoinBenchmark (JDK11 is faster except five cases) - [x] OrcNestedSchemaPruningBenchmark (JDK11 is faster in nine cases) - [x] PrimitiveArrayBenchmark (JDK11 is faster) - [x] SortBenchmark (JDK11 is faster except `Arrays.sort` case) - [x] UDFBenchmark (N/A, values are too small) - [x] UnsafeArrayDataBenchmark (JDK11 is faster except one case) - [x] WideTableBenchmark (JDK11 is faster except two cases) **B. CASES WE NEED TO INVESTIGATE MORE LATER** - [x] AggregateBenchmark (JDK11 is slower in general) - [x] CompressionSchemeBenchmark (JDK11 is slower in general except `string`) - [x] DataSourceReadBenchmark (JDK11 is slower in general) - [x] DateTimeBenchmark (JDK11 is slightly slower in general except `parsing`) - [x] MakeDateTimeBenchmark (JDK11 is slower except two cases) - [x] MiscBenchmark (JDK11 is slower except ten cases) - [x] OrcV2NestedSchemaPruningBenchmark (JDK11 is slower) - [x] ParquetNestedSchemaPruningBenchmark (JDK11 is slower except six cases) - [x] RangeBenchmark (JDK11 is slower except one case) `FilterPushdownBenchmark/InExpressionBenchmark/WideSchemaBenchmark` will be compared later because it took long timer. ### Why are the changes needed? According to the result, there are some difference between JDK8/JDK11. This will be a baseline for the future improvement and comparison. Also, as a reproducible environment, the following environment is used. - Instance: `r3.xlarge` - OS: `CentOS Linux release 7.5.1804 (Core)` - JDK: - `OpenJDK Runtime Environment (build 1.8.0_222-b10)` - `OpenJDK Runtime Environment 18.9 (build 11.0.4+11-LTS)` ### Does this PR introduce any user-facing change? No. ### How was this patch tested? This is a test-only PR. We need to run benchmark. Closes #26003 from dongjoon-hyun/SPARK-29320. Authored-by: Dongjoon Hyun <dhyun@apple.com> Signed-off-by: Dongjoon Hyun <dhyun@apple.com>
2019-10-03 11:58:25 -04:00
Join w 2 ints: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
[SPARK-34620][SQL] Code-gen broadcast nested loop join (inner/cross) ### What changes were proposed in this pull request? `BroadcastNestedLoopJoinExec` does not have code-gen, and we can potentially boost the CPU performance for this operator if we add code-gen for it. https://databricks.com/blog/2017/02/16/processing-trillion-rows-per-second-single-machine-can-nested-loop-joins-fast.html also showed the evidence in one fork. The codegen for `BroadcastNestedLoopJoinExec` shared some code with `HashJoin`, and the interface `JoinCodegenSupport` is created to hold those common logic. This PR is only supporting inner and cross join. Other join types will be added later in followup PRs. Example query and generated code: ``` val df1 = spark.range(4).select($"id".as("k1")) val df2 = spark.range(3).select($"id".as("k2")) df1.join(df2, $"k1" + 1 =!= $"k2").explain("codegen") ``` ``` == Subtree 2 / 2 (maxMethodCodeSize:282; maxConstantPoolSize:203(0.31% used); numInnerClasses:0) == *(2) BroadcastNestedLoopJoin BuildRight, Inner, NOT ((k1#2L + 1) = k2#6L) :- *(2) Project [id#0L AS k1#2L] : +- *(2) Range (0, 4, step=1, splits=2) +- BroadcastExchange IdentityBroadcastMode, [id=#22] +- *(1) Project [id#4L AS k2#6L] +- *(1) Range (0, 3, step=1, splits=2) Generated code: /* 001 */ public Object generate(Object[] references) { /* 002 */ return new GeneratedIteratorForCodegenStage2(references); /* 003 */ } /* 004 */ /* 005 */ // codegenStageId=2 /* 006 */ final class GeneratedIteratorForCodegenStage2 extends org.apache.spark.sql.execution.BufferedRowIterator { /* 007 */ private Object[] references; /* 008 */ private scala.collection.Iterator[] inputs; /* 009 */ private boolean range_initRange_0; /* 010 */ private long range_nextIndex_0; /* 011 */ private TaskContext range_taskContext_0; /* 012 */ private InputMetrics range_inputMetrics_0; /* 013 */ private long range_batchEnd_0; /* 014 */ private long range_numElementsTodo_0; /* 015 */ private InternalRow[] bnlj_buildRowArray_0; /* 016 */ private org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[] range_mutableStateArray_0 = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[4]; /* 017 */ /* 018 */ public GeneratedIteratorForCodegenStage2(Object[] references) { /* 019 */ this.references = references; /* 020 */ } /* 021 */ /* 022 */ public void init(int index, scala.collection.Iterator[] inputs) { /* 023 */ partitionIndex = index; /* 024 */ this.inputs = inputs; /* 025 */ /* 026 */ range_taskContext_0 = TaskContext.get(); /* 027 */ range_inputMetrics_0 = range_taskContext_0.taskMetrics().inputMetrics(); /* 028 */ range_mutableStateArray_0[0] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 0); /* 029 */ range_mutableStateArray_0[1] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 0); /* 030 */ range_mutableStateArray_0[2] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 0); /* 031 */ bnlj_buildRowArray_0 = (InternalRow[]) ((org.apache.spark.broadcast.TorrentBroadcast) references[1] /* broadcastTerm */).value(); /* 032 */ range_mutableStateArray_0[3] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(2, 0); /* 033 */ /* 034 */ } /* 035 */ /* 036 */ private void bnlj_doConsume_0(long bnlj_expr_0_0) throws java.io.IOException { /* 037 */ for (int bnlj_arrayIndex_0 = 0; bnlj_arrayIndex_0 < bnlj_buildRowArray_0.length; bnlj_arrayIndex_0++) { /* 038 */ UnsafeRow bnlj_buildRow_0 = (UnsafeRow) bnlj_buildRowArray_0[bnlj_arrayIndex_0]; /* 039 */ /* 040 */ long bnlj_value_1 = bnlj_buildRow_0.getLong(0); /* 041 */ /* 042 */ long bnlj_value_4 = -1L; /* 043 */ /* 044 */ bnlj_value_4 = bnlj_expr_0_0 + 1L; /* 045 */ /* 046 */ boolean bnlj_value_3 = false; /* 047 */ bnlj_value_3 = bnlj_value_4 == bnlj_value_1; /* 048 */ boolean bnlj_value_2 = false; /* 049 */ bnlj_value_2 = !(bnlj_value_3); /* 050 */ if (!(false || !bnlj_value_2)) /* 051 */ { /* 052 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[2] /* numOutputRows */).add(1); /* 053 */ /* 054 */ range_mutableStateArray_0[3].reset(); /* 055 */ /* 056 */ range_mutableStateArray_0[3].write(0, bnlj_expr_0_0); /* 057 */ /* 058 */ range_mutableStateArray_0[3].write(1, bnlj_value_1); /* 059 */ append((range_mutableStateArray_0[3].getRow()).copy()); /* 060 */ /* 061 */ } /* 062 */ } /* 063 */ /* 064 */ } /* 065 */ /* 066 */ private void initRange(int idx) { /* 067 */ java.math.BigInteger index = java.math.BigInteger.valueOf(idx); /* 068 */ java.math.BigInteger numSlice = java.math.BigInteger.valueOf(2L); /* 069 */ java.math.BigInteger numElement = java.math.BigInteger.valueOf(4L); /* 070 */ java.math.BigInteger step = java.math.BigInteger.valueOf(1L); /* 071 */ java.math.BigInteger start = java.math.BigInteger.valueOf(0L); /* 072 */ long partitionEnd; /* 073 */ /* 074 */ java.math.BigInteger st = index.multiply(numElement).divide(numSlice).multiply(step).add(start); /* 075 */ if (st.compareTo(java.math.BigInteger.valueOf(Long.MAX_VALUE)) > 0) { /* 076 */ range_nextIndex_0 = Long.MAX_VALUE; /* 077 */ } else if (st.compareTo(java.math.BigInteger.valueOf(Long.MIN_VALUE)) < 0) { /* 078 */ range_nextIndex_0 = Long.MIN_VALUE; /* 079 */ } else { /* 080 */ range_nextIndex_0 = st.longValue(); /* 081 */ } /* 082 */ range_batchEnd_0 = range_nextIndex_0; /* 083 */ /* 084 */ java.math.BigInteger end = index.add(java.math.BigInteger.ONE).multiply(numElement).divide(numSlice) /* 085 */ .multiply(step).add(start); /* 086 */ if (end.compareTo(java.math.BigInteger.valueOf(Long.MAX_VALUE)) > 0) { /* 087 */ partitionEnd = Long.MAX_VALUE; /* 088 */ } else if (end.compareTo(java.math.BigInteger.valueOf(Long.MIN_VALUE)) < 0) { /* 089 */ partitionEnd = Long.MIN_VALUE; /* 090 */ } else { /* 091 */ partitionEnd = end.longValue(); /* 092 */ } /* 093 */ /* 094 */ java.math.BigInteger startToEnd = java.math.BigInteger.valueOf(partitionEnd).subtract( /* 095 */ java.math.BigInteger.valueOf(range_nextIndex_0)); /* 096 */ range_numElementsTodo_0 = startToEnd.divide(step).longValue(); /* 097 */ if (range_numElementsTodo_0 < 0) { /* 098 */ range_numElementsTodo_0 = 0; /* 099 */ } else if (startToEnd.remainder(step).compareTo(java.math.BigInteger.valueOf(0L)) != 0) { /* 100 */ range_numElementsTodo_0++; /* 101 */ } /* 102 */ } /* 103 */ /* 104 */ protected void processNext() throws java.io.IOException { /* 105 */ // initialize Range /* 106 */ if (!range_initRange_0) { /* 107 */ range_initRange_0 = true; /* 108 */ initRange(partitionIndex); /* 109 */ } /* 110 */ /* 111 */ while (true) { /* 112 */ if (range_nextIndex_0 == range_batchEnd_0) { /* 113 */ long range_nextBatchTodo_0; /* 114 */ if (range_numElementsTodo_0 > 1000L) { /* 115 */ range_nextBatchTodo_0 = 1000L; /* 116 */ range_numElementsTodo_0 -= 1000L; /* 117 */ } else { /* 118 */ range_nextBatchTodo_0 = range_numElementsTodo_0; /* 119 */ range_numElementsTodo_0 = 0; /* 120 */ if (range_nextBatchTodo_0 == 0) break; /* 121 */ } /* 122 */ range_batchEnd_0 += range_nextBatchTodo_0 * 1L; /* 123 */ } /* 124 */ /* 125 */ int range_localEnd_0 = (int)((range_batchEnd_0 - range_nextIndex_0) / 1L); /* 126 */ for (int range_localIdx_0 = 0; range_localIdx_0 < range_localEnd_0; range_localIdx_0++) { /* 127 */ long range_value_0 = ((long)range_localIdx_0 * 1L) + range_nextIndex_0; /* 128 */ /* 129 */ // common sub-expressions /* 130 */ /* 131 */ bnlj_doConsume_0(range_value_0); /* 132 */ /* 133 */ if (shouldStop()) { /* 134 */ range_nextIndex_0 = range_value_0 + 1L; /* 135 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[0] /* numOutputRows */).add(range_localIdx_0 + 1); /* 136 */ range_inputMetrics_0.incRecordsRead(range_localIdx_0 + 1); /* 137 */ return; /* 138 */ } /* 139 */ /* 140 */ } /* 141 */ range_nextIndex_0 = range_batchEnd_0; /* 142 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[0] /* numOutputRows */).add(range_localEnd_0); /* 143 */ range_inputMetrics_0.incRecordsRead(range_localEnd_0); /* 144 */ range_taskContext_0.killTaskIfInterrupted(); /* 145 */ } /* 146 */ } /* 147 */ /* 148 */ } ``` ### Why are the changes needed? Improve query CPU performance. Added a micro benchmark query in `JoinBenchmark.scala`. Saw 1x of run time improvement: ``` OpenJDK 64-Bit Server VM 11.0.9+11-LTS on Linux 4.14.219-161.340.amzn2.x86_64 Intel(R) Xeon(R) CPU E5-2670 v2 2.50GHz broadcast nested loop join: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- broadcast nested loop join wholestage off 62922 63052 184 0.3 3000.3 1.0X broadcast nested loop join wholestage on 30946 30972 26 0.7 1475.6 2.0X ``` ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? * Added unit test in `WholeStageCodegenSuite.scala`, and existing unit tests for `BroadcastNestedLoopJoinExec`. * Updated golden files for several TCPDS query plans, as whole stage code-gen for `BroadcastNestedLoopJoinExec` is triggered. * Updated `JoinBenchmark-jdk11-results.txt ` and `JoinBenchmark-results.txt` with new benchmark result. Followed previous benchmark PRs - https://github.com/apache/spark/pull/27078 and https://github.com/apache/spark/pull/26003 to use same type of machine: ``` Amazon AWS EC2 type: r3.xlarge region: us-west-2 (Oregon) OS: Linux ``` Closes #31736 from c21/nested-join-exec. Authored-by: Cheng Su <chengsu@fb.com> Signed-off-by: Wenchen Fan <wenchen@databricks.com>
2021-03-09 06:45:43 -05:00
Join w 2 ints wholestage off 169393 169454 86 0.1 8077.3 1.0X
Join w 2 ints wholestage on 165629 165648 19 0.1 7897.8 1.0X
[SPARK-34620][SQL] Code-gen broadcast nested loop join (inner/cross) ### What changes were proposed in this pull request? `BroadcastNestedLoopJoinExec` does not have code-gen, and we can potentially boost the CPU performance for this operator if we add code-gen for it. https://databricks.com/blog/2017/02/16/processing-trillion-rows-per-second-single-machine-can-nested-loop-joins-fast.html also showed the evidence in one fork. The codegen for `BroadcastNestedLoopJoinExec` shared some code with `HashJoin`, and the interface `JoinCodegenSupport` is created to hold those common logic. This PR is only supporting inner and cross join. Other join types will be added later in followup PRs. Example query and generated code: ``` val df1 = spark.range(4).select($"id".as("k1")) val df2 = spark.range(3).select($"id".as("k2")) df1.join(df2, $"k1" + 1 =!= $"k2").explain("codegen") ``` ``` == Subtree 2 / 2 (maxMethodCodeSize:282; maxConstantPoolSize:203(0.31% used); numInnerClasses:0) == *(2) BroadcastNestedLoopJoin BuildRight, Inner, NOT ((k1#2L + 1) = k2#6L) :- *(2) Project [id#0L AS k1#2L] : +- *(2) Range (0, 4, step=1, splits=2) +- BroadcastExchange IdentityBroadcastMode, [id=#22] +- *(1) Project [id#4L AS k2#6L] +- *(1) Range (0, 3, step=1, splits=2) Generated code: /* 001 */ public Object generate(Object[] references) { /* 002 */ return new GeneratedIteratorForCodegenStage2(references); /* 003 */ } /* 004 */ /* 005 */ // codegenStageId=2 /* 006 */ final class GeneratedIteratorForCodegenStage2 extends org.apache.spark.sql.execution.BufferedRowIterator { /* 007 */ private Object[] references; /* 008 */ private scala.collection.Iterator[] inputs; /* 009 */ private boolean range_initRange_0; /* 010 */ private long range_nextIndex_0; /* 011 */ private TaskContext range_taskContext_0; /* 012 */ private InputMetrics range_inputMetrics_0; /* 013 */ private long range_batchEnd_0; /* 014 */ private long range_numElementsTodo_0; /* 015 */ private InternalRow[] bnlj_buildRowArray_0; /* 016 */ private org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[] range_mutableStateArray_0 = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[4]; /* 017 */ /* 018 */ public GeneratedIteratorForCodegenStage2(Object[] references) { /* 019 */ this.references = references; /* 020 */ } /* 021 */ /* 022 */ public void init(int index, scala.collection.Iterator[] inputs) { /* 023 */ partitionIndex = index; /* 024 */ this.inputs = inputs; /* 025 */ /* 026 */ range_taskContext_0 = TaskContext.get(); /* 027 */ range_inputMetrics_0 = range_taskContext_0.taskMetrics().inputMetrics(); /* 028 */ range_mutableStateArray_0[0] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 0); /* 029 */ range_mutableStateArray_0[1] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 0); /* 030 */ range_mutableStateArray_0[2] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 0); /* 031 */ bnlj_buildRowArray_0 = (InternalRow[]) ((org.apache.spark.broadcast.TorrentBroadcast) references[1] /* broadcastTerm */).value(); /* 032 */ range_mutableStateArray_0[3] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(2, 0); /* 033 */ /* 034 */ } /* 035 */ /* 036 */ private void bnlj_doConsume_0(long bnlj_expr_0_0) throws java.io.IOException { /* 037 */ for (int bnlj_arrayIndex_0 = 0; bnlj_arrayIndex_0 < bnlj_buildRowArray_0.length; bnlj_arrayIndex_0++) { /* 038 */ UnsafeRow bnlj_buildRow_0 = (UnsafeRow) bnlj_buildRowArray_0[bnlj_arrayIndex_0]; /* 039 */ /* 040 */ long bnlj_value_1 = bnlj_buildRow_0.getLong(0); /* 041 */ /* 042 */ long bnlj_value_4 = -1L; /* 043 */ /* 044 */ bnlj_value_4 = bnlj_expr_0_0 + 1L; /* 045 */ /* 046 */ boolean bnlj_value_3 = false; /* 047 */ bnlj_value_3 = bnlj_value_4 == bnlj_value_1; /* 048 */ boolean bnlj_value_2 = false; /* 049 */ bnlj_value_2 = !(bnlj_value_3); /* 050 */ if (!(false || !bnlj_value_2)) /* 051 */ { /* 052 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[2] /* numOutputRows */).add(1); /* 053 */ /* 054 */ range_mutableStateArray_0[3].reset(); /* 055 */ /* 056 */ range_mutableStateArray_0[3].write(0, bnlj_expr_0_0); /* 057 */ /* 058 */ range_mutableStateArray_0[3].write(1, bnlj_value_1); /* 059 */ append((range_mutableStateArray_0[3].getRow()).copy()); /* 060 */ /* 061 */ } /* 062 */ } /* 063 */ /* 064 */ } /* 065 */ /* 066 */ private void initRange(int idx) { /* 067 */ java.math.BigInteger index = java.math.BigInteger.valueOf(idx); /* 068 */ java.math.BigInteger numSlice = java.math.BigInteger.valueOf(2L); /* 069 */ java.math.BigInteger numElement = java.math.BigInteger.valueOf(4L); /* 070 */ java.math.BigInteger step = java.math.BigInteger.valueOf(1L); /* 071 */ java.math.BigInteger start = java.math.BigInteger.valueOf(0L); /* 072 */ long partitionEnd; /* 073 */ /* 074 */ java.math.BigInteger st = index.multiply(numElement).divide(numSlice).multiply(step).add(start); /* 075 */ if (st.compareTo(java.math.BigInteger.valueOf(Long.MAX_VALUE)) > 0) { /* 076 */ range_nextIndex_0 = Long.MAX_VALUE; /* 077 */ } else if (st.compareTo(java.math.BigInteger.valueOf(Long.MIN_VALUE)) < 0) { /* 078 */ range_nextIndex_0 = Long.MIN_VALUE; /* 079 */ } else { /* 080 */ range_nextIndex_0 = st.longValue(); /* 081 */ } /* 082 */ range_batchEnd_0 = range_nextIndex_0; /* 083 */ /* 084 */ java.math.BigInteger end = index.add(java.math.BigInteger.ONE).multiply(numElement).divide(numSlice) /* 085 */ .multiply(step).add(start); /* 086 */ if (end.compareTo(java.math.BigInteger.valueOf(Long.MAX_VALUE)) > 0) { /* 087 */ partitionEnd = Long.MAX_VALUE; /* 088 */ } else if (end.compareTo(java.math.BigInteger.valueOf(Long.MIN_VALUE)) < 0) { /* 089 */ partitionEnd = Long.MIN_VALUE; /* 090 */ } else { /* 091 */ partitionEnd = end.longValue(); /* 092 */ } /* 093 */ /* 094 */ java.math.BigInteger startToEnd = java.math.BigInteger.valueOf(partitionEnd).subtract( /* 095 */ java.math.BigInteger.valueOf(range_nextIndex_0)); /* 096 */ range_numElementsTodo_0 = startToEnd.divide(step).longValue(); /* 097 */ if (range_numElementsTodo_0 < 0) { /* 098 */ range_numElementsTodo_0 = 0; /* 099 */ } else if (startToEnd.remainder(step).compareTo(java.math.BigInteger.valueOf(0L)) != 0) { /* 100 */ range_numElementsTodo_0++; /* 101 */ } /* 102 */ } /* 103 */ /* 104 */ protected void processNext() throws java.io.IOException { /* 105 */ // initialize Range /* 106 */ if (!range_initRange_0) { /* 107 */ range_initRange_0 = true; /* 108 */ initRange(partitionIndex); /* 109 */ } /* 110 */ /* 111 */ while (true) { /* 112 */ if (range_nextIndex_0 == range_batchEnd_0) { /* 113 */ long range_nextBatchTodo_0; /* 114 */ if (range_numElementsTodo_0 > 1000L) { /* 115 */ range_nextBatchTodo_0 = 1000L; /* 116 */ range_numElementsTodo_0 -= 1000L; /* 117 */ } else { /* 118 */ range_nextBatchTodo_0 = range_numElementsTodo_0; /* 119 */ range_numElementsTodo_0 = 0; /* 120 */ if (range_nextBatchTodo_0 == 0) break; /* 121 */ } /* 122 */ range_batchEnd_0 += range_nextBatchTodo_0 * 1L; /* 123 */ } /* 124 */ /* 125 */ int range_localEnd_0 = (int)((range_batchEnd_0 - range_nextIndex_0) / 1L); /* 126 */ for (int range_localIdx_0 = 0; range_localIdx_0 < range_localEnd_0; range_localIdx_0++) { /* 127 */ long range_value_0 = ((long)range_localIdx_0 * 1L) + range_nextIndex_0; /* 128 */ /* 129 */ // common sub-expressions /* 130 */ /* 131 */ bnlj_doConsume_0(range_value_0); /* 132 */ /* 133 */ if (shouldStop()) { /* 134 */ range_nextIndex_0 = range_value_0 + 1L; /* 135 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[0] /* numOutputRows */).add(range_localIdx_0 + 1); /* 136 */ range_inputMetrics_0.incRecordsRead(range_localIdx_0 + 1); /* 137 */ return; /* 138 */ } /* 139 */ /* 140 */ } /* 141 */ range_nextIndex_0 = range_batchEnd_0; /* 142 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[0] /* numOutputRows */).add(range_localEnd_0); /* 143 */ range_inputMetrics_0.incRecordsRead(range_localEnd_0); /* 144 */ range_taskContext_0.killTaskIfInterrupted(); /* 145 */ } /* 146 */ } /* 147 */ /* 148 */ } ``` ### Why are the changes needed? Improve query CPU performance. Added a micro benchmark query in `JoinBenchmark.scala`. Saw 1x of run time improvement: ``` OpenJDK 64-Bit Server VM 11.0.9+11-LTS on Linux 4.14.219-161.340.amzn2.x86_64 Intel(R) Xeon(R) CPU E5-2670 v2 2.50GHz broadcast nested loop join: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- broadcast nested loop join wholestage off 62922 63052 184 0.3 3000.3 1.0X broadcast nested loop join wholestage on 30946 30972 26 0.7 1475.6 2.0X ``` ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? * Added unit test in `WholeStageCodegenSuite.scala`, and existing unit tests for `BroadcastNestedLoopJoinExec`. * Updated golden files for several TCPDS query plans, as whole stage code-gen for `BroadcastNestedLoopJoinExec` is triggered. * Updated `JoinBenchmark-jdk11-results.txt ` and `JoinBenchmark-results.txt` with new benchmark result. Followed previous benchmark PRs - https://github.com/apache/spark/pull/27078 and https://github.com/apache/spark/pull/26003 to use same type of machine: ``` Amazon AWS EC2 type: r3.xlarge region: us-west-2 (Oregon) OS: Linux ``` Closes #31736 from c21/nested-join-exec. Authored-by: Cheng Su <chengsu@fb.com> Signed-off-by: Wenchen Fan <wenchen@databricks.com>
2021-03-09 06:45:43 -05:00
OpenJDK 64-Bit Server VM 1.8.0_272-b10 on Linux 4.14.219-161.340.amzn2.x86_64
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
[SPARK-29320][TESTS] Compare `sql/core` module in JDK8/11 (Part 1) ### What changes were proposed in this pull request? This PR regenerates the `sql/core` benchmarks in JDK8/11 to compare the result. In general, we compare the ratio instead of the time. However, in this PR, the average time is compared. This PR should be considered as a rough comparison. **A. EXPECTED CASES(JDK11 is faster in general)** - [x] BloomFilterBenchmark (JDK11 is faster except one case) - [x] BuiltInDataSourceWriteBenchmark (JDK11 is faster at CSV/ORC) - [x] CSVBenchmark (JDK11 is faster except five cases) - [x] ColumnarBatchBenchmark (JDK11 is faster at `boolean`/`string` and some cases in `int`/`array`) - [x] DatasetBenchmark (JDK11 is faster with `string`, but is slower for `long` type) - [x] ExternalAppendOnlyUnsafeRowArrayBenchmark (JDK11 is faster except two cases) - [x] ExtractBenchmark (JDK11 is faster except HOUR/MINUTE/SECOND/MILLISECONDS/MICROSECONDS) - [x] HashedRelationMetricsBenchmark (JDK11 is faster) - [x] JSONBenchmark (JDK11 is much faster except eight cases) - [x] JoinBenchmark (JDK11 is faster except five cases) - [x] OrcNestedSchemaPruningBenchmark (JDK11 is faster in nine cases) - [x] PrimitiveArrayBenchmark (JDK11 is faster) - [x] SortBenchmark (JDK11 is faster except `Arrays.sort` case) - [x] UDFBenchmark (N/A, values are too small) - [x] UnsafeArrayDataBenchmark (JDK11 is faster except one case) - [x] WideTableBenchmark (JDK11 is faster except two cases) **B. CASES WE NEED TO INVESTIGATE MORE LATER** - [x] AggregateBenchmark (JDK11 is slower in general) - [x] CompressionSchemeBenchmark (JDK11 is slower in general except `string`) - [x] DataSourceReadBenchmark (JDK11 is slower in general) - [x] DateTimeBenchmark (JDK11 is slightly slower in general except `parsing`) - [x] MakeDateTimeBenchmark (JDK11 is slower except two cases) - [x] MiscBenchmark (JDK11 is slower except ten cases) - [x] OrcV2NestedSchemaPruningBenchmark (JDK11 is slower) - [x] ParquetNestedSchemaPruningBenchmark (JDK11 is slower except six cases) - [x] RangeBenchmark (JDK11 is slower except one case) `FilterPushdownBenchmark/InExpressionBenchmark/WideSchemaBenchmark` will be compared later because it took long timer. ### Why are the changes needed? According to the result, there are some difference between JDK8/JDK11. This will be a baseline for the future improvement and comparison. Also, as a reproducible environment, the following environment is used. - Instance: `r3.xlarge` - OS: `CentOS Linux release 7.5.1804 (Core)` - JDK: - `OpenJDK Runtime Environment (build 1.8.0_222-b10)` - `OpenJDK Runtime Environment 18.9 (build 11.0.4+11-LTS)` ### Does this PR introduce any user-facing change? No. ### How was this patch tested? This is a test-only PR. We need to run benchmark. Closes #26003 from dongjoon-hyun/SPARK-29320. Authored-by: Dongjoon Hyun <dhyun@apple.com> Signed-off-by: Dongjoon Hyun <dhyun@apple.com>
2019-10-03 11:58:25 -04:00
Join w 2 longs: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
[SPARK-34620][SQL] Code-gen broadcast nested loop join (inner/cross) ### What changes were proposed in this pull request? `BroadcastNestedLoopJoinExec` does not have code-gen, and we can potentially boost the CPU performance for this operator if we add code-gen for it. https://databricks.com/blog/2017/02/16/processing-trillion-rows-per-second-single-machine-can-nested-loop-joins-fast.html also showed the evidence in one fork. The codegen for `BroadcastNestedLoopJoinExec` shared some code with `HashJoin`, and the interface `JoinCodegenSupport` is created to hold those common logic. This PR is only supporting inner and cross join. Other join types will be added later in followup PRs. Example query and generated code: ``` val df1 = spark.range(4).select($"id".as("k1")) val df2 = spark.range(3).select($"id".as("k2")) df1.join(df2, $"k1" + 1 =!= $"k2").explain("codegen") ``` ``` == Subtree 2 / 2 (maxMethodCodeSize:282; maxConstantPoolSize:203(0.31% used); numInnerClasses:0) == *(2) BroadcastNestedLoopJoin BuildRight, Inner, NOT ((k1#2L + 1) = k2#6L) :- *(2) Project [id#0L AS k1#2L] : +- *(2) Range (0, 4, step=1, splits=2) +- BroadcastExchange IdentityBroadcastMode, [id=#22] +- *(1) Project [id#4L AS k2#6L] +- *(1) Range (0, 3, step=1, splits=2) Generated code: /* 001 */ public Object generate(Object[] references) { /* 002 */ return new GeneratedIteratorForCodegenStage2(references); /* 003 */ } /* 004 */ /* 005 */ // codegenStageId=2 /* 006 */ final class GeneratedIteratorForCodegenStage2 extends org.apache.spark.sql.execution.BufferedRowIterator { /* 007 */ private Object[] references; /* 008 */ private scala.collection.Iterator[] inputs; /* 009 */ private boolean range_initRange_0; /* 010 */ private long range_nextIndex_0; /* 011 */ private TaskContext range_taskContext_0; /* 012 */ private InputMetrics range_inputMetrics_0; /* 013 */ private long range_batchEnd_0; /* 014 */ private long range_numElementsTodo_0; /* 015 */ private InternalRow[] bnlj_buildRowArray_0; /* 016 */ private org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[] range_mutableStateArray_0 = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[4]; /* 017 */ /* 018 */ public GeneratedIteratorForCodegenStage2(Object[] references) { /* 019 */ this.references = references; /* 020 */ } /* 021 */ /* 022 */ public void init(int index, scala.collection.Iterator[] inputs) { /* 023 */ partitionIndex = index; /* 024 */ this.inputs = inputs; /* 025 */ /* 026 */ range_taskContext_0 = TaskContext.get(); /* 027 */ range_inputMetrics_0 = range_taskContext_0.taskMetrics().inputMetrics(); /* 028 */ range_mutableStateArray_0[0] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 0); /* 029 */ range_mutableStateArray_0[1] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 0); /* 030 */ range_mutableStateArray_0[2] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 0); /* 031 */ bnlj_buildRowArray_0 = (InternalRow[]) ((org.apache.spark.broadcast.TorrentBroadcast) references[1] /* broadcastTerm */).value(); /* 032 */ range_mutableStateArray_0[3] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(2, 0); /* 033 */ /* 034 */ } /* 035 */ /* 036 */ private void bnlj_doConsume_0(long bnlj_expr_0_0) throws java.io.IOException { /* 037 */ for (int bnlj_arrayIndex_0 = 0; bnlj_arrayIndex_0 < bnlj_buildRowArray_0.length; bnlj_arrayIndex_0++) { /* 038 */ UnsafeRow bnlj_buildRow_0 = (UnsafeRow) bnlj_buildRowArray_0[bnlj_arrayIndex_0]; /* 039 */ /* 040 */ long bnlj_value_1 = bnlj_buildRow_0.getLong(0); /* 041 */ /* 042 */ long bnlj_value_4 = -1L; /* 043 */ /* 044 */ bnlj_value_4 = bnlj_expr_0_0 + 1L; /* 045 */ /* 046 */ boolean bnlj_value_3 = false; /* 047 */ bnlj_value_3 = bnlj_value_4 == bnlj_value_1; /* 048 */ boolean bnlj_value_2 = false; /* 049 */ bnlj_value_2 = !(bnlj_value_3); /* 050 */ if (!(false || !bnlj_value_2)) /* 051 */ { /* 052 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[2] /* numOutputRows */).add(1); /* 053 */ /* 054 */ range_mutableStateArray_0[3].reset(); /* 055 */ /* 056 */ range_mutableStateArray_0[3].write(0, bnlj_expr_0_0); /* 057 */ /* 058 */ range_mutableStateArray_0[3].write(1, bnlj_value_1); /* 059 */ append((range_mutableStateArray_0[3].getRow()).copy()); /* 060 */ /* 061 */ } /* 062 */ } /* 063 */ /* 064 */ } /* 065 */ /* 066 */ private void initRange(int idx) { /* 067 */ java.math.BigInteger index = java.math.BigInteger.valueOf(idx); /* 068 */ java.math.BigInteger numSlice = java.math.BigInteger.valueOf(2L); /* 069 */ java.math.BigInteger numElement = java.math.BigInteger.valueOf(4L); /* 070 */ java.math.BigInteger step = java.math.BigInteger.valueOf(1L); /* 071 */ java.math.BigInteger start = java.math.BigInteger.valueOf(0L); /* 072 */ long partitionEnd; /* 073 */ /* 074 */ java.math.BigInteger st = index.multiply(numElement).divide(numSlice).multiply(step).add(start); /* 075 */ if (st.compareTo(java.math.BigInteger.valueOf(Long.MAX_VALUE)) > 0) { /* 076 */ range_nextIndex_0 = Long.MAX_VALUE; /* 077 */ } else if (st.compareTo(java.math.BigInteger.valueOf(Long.MIN_VALUE)) < 0) { /* 078 */ range_nextIndex_0 = Long.MIN_VALUE; /* 079 */ } else { /* 080 */ range_nextIndex_0 = st.longValue(); /* 081 */ } /* 082 */ range_batchEnd_0 = range_nextIndex_0; /* 083 */ /* 084 */ java.math.BigInteger end = index.add(java.math.BigInteger.ONE).multiply(numElement).divide(numSlice) /* 085 */ .multiply(step).add(start); /* 086 */ if (end.compareTo(java.math.BigInteger.valueOf(Long.MAX_VALUE)) > 0) { /* 087 */ partitionEnd = Long.MAX_VALUE; /* 088 */ } else if (end.compareTo(java.math.BigInteger.valueOf(Long.MIN_VALUE)) < 0) { /* 089 */ partitionEnd = Long.MIN_VALUE; /* 090 */ } else { /* 091 */ partitionEnd = end.longValue(); /* 092 */ } /* 093 */ /* 094 */ java.math.BigInteger startToEnd = java.math.BigInteger.valueOf(partitionEnd).subtract( /* 095 */ java.math.BigInteger.valueOf(range_nextIndex_0)); /* 096 */ range_numElementsTodo_0 = startToEnd.divide(step).longValue(); /* 097 */ if (range_numElementsTodo_0 < 0) { /* 098 */ range_numElementsTodo_0 = 0; /* 099 */ } else if (startToEnd.remainder(step).compareTo(java.math.BigInteger.valueOf(0L)) != 0) { /* 100 */ range_numElementsTodo_0++; /* 101 */ } /* 102 */ } /* 103 */ /* 104 */ protected void processNext() throws java.io.IOException { /* 105 */ // initialize Range /* 106 */ if (!range_initRange_0) { /* 107 */ range_initRange_0 = true; /* 108 */ initRange(partitionIndex); /* 109 */ } /* 110 */ /* 111 */ while (true) { /* 112 */ if (range_nextIndex_0 == range_batchEnd_0) { /* 113 */ long range_nextBatchTodo_0; /* 114 */ if (range_numElementsTodo_0 > 1000L) { /* 115 */ range_nextBatchTodo_0 = 1000L; /* 116 */ range_numElementsTodo_0 -= 1000L; /* 117 */ } else { /* 118 */ range_nextBatchTodo_0 = range_numElementsTodo_0; /* 119 */ range_numElementsTodo_0 = 0; /* 120 */ if (range_nextBatchTodo_0 == 0) break; /* 121 */ } /* 122 */ range_batchEnd_0 += range_nextBatchTodo_0 * 1L; /* 123 */ } /* 124 */ /* 125 */ int range_localEnd_0 = (int)((range_batchEnd_0 - range_nextIndex_0) / 1L); /* 126 */ for (int range_localIdx_0 = 0; range_localIdx_0 < range_localEnd_0; range_localIdx_0++) { /* 127 */ long range_value_0 = ((long)range_localIdx_0 * 1L) + range_nextIndex_0; /* 128 */ /* 129 */ // common sub-expressions /* 130 */ /* 131 */ bnlj_doConsume_0(range_value_0); /* 132 */ /* 133 */ if (shouldStop()) { /* 134 */ range_nextIndex_0 = range_value_0 + 1L; /* 135 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[0] /* numOutputRows */).add(range_localIdx_0 + 1); /* 136 */ range_inputMetrics_0.incRecordsRead(range_localIdx_0 + 1); /* 137 */ return; /* 138 */ } /* 139 */ /* 140 */ } /* 141 */ range_nextIndex_0 = range_batchEnd_0; /* 142 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[0] /* numOutputRows */).add(range_localEnd_0); /* 143 */ range_inputMetrics_0.incRecordsRead(range_localEnd_0); /* 144 */ range_taskContext_0.killTaskIfInterrupted(); /* 145 */ } /* 146 */ } /* 147 */ /* 148 */ } ``` ### Why are the changes needed? Improve query CPU performance. Added a micro benchmark query in `JoinBenchmark.scala`. Saw 1x of run time improvement: ``` OpenJDK 64-Bit Server VM 11.0.9+11-LTS on Linux 4.14.219-161.340.amzn2.x86_64 Intel(R) Xeon(R) CPU E5-2670 v2 2.50GHz broadcast nested loop join: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- broadcast nested loop join wholestage off 62922 63052 184 0.3 3000.3 1.0X broadcast nested loop join wholestage on 30946 30972 26 0.7 1475.6 2.0X ``` ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? * Added unit test in `WholeStageCodegenSuite.scala`, and existing unit tests for `BroadcastNestedLoopJoinExec`. * Updated golden files for several TCPDS query plans, as whole stage code-gen for `BroadcastNestedLoopJoinExec` is triggered. * Updated `JoinBenchmark-jdk11-results.txt ` and `JoinBenchmark-results.txt` with new benchmark result. Followed previous benchmark PRs - https://github.com/apache/spark/pull/27078 and https://github.com/apache/spark/pull/26003 to use same type of machine: ``` Amazon AWS EC2 type: r3.xlarge region: us-west-2 (Oregon) OS: Linux ``` Closes #31736 from c21/nested-join-exec. Authored-by: Cheng Su <chengsu@fb.com> Signed-off-by: Wenchen Fan <wenchen@databricks.com>
2021-03-09 06:45:43 -05:00
Join w 2 longs wholestage off 6194 6240 66 3.4 295.3 1.0X
Join w 2 longs wholestage on 3389 3447 55 6.2 161.6 1.8X
[SPARK-34620][SQL] Code-gen broadcast nested loop join (inner/cross) ### What changes were proposed in this pull request? `BroadcastNestedLoopJoinExec` does not have code-gen, and we can potentially boost the CPU performance for this operator if we add code-gen for it. https://databricks.com/blog/2017/02/16/processing-trillion-rows-per-second-single-machine-can-nested-loop-joins-fast.html also showed the evidence in one fork. The codegen for `BroadcastNestedLoopJoinExec` shared some code with `HashJoin`, and the interface `JoinCodegenSupport` is created to hold those common logic. This PR is only supporting inner and cross join. Other join types will be added later in followup PRs. Example query and generated code: ``` val df1 = spark.range(4).select($"id".as("k1")) val df2 = spark.range(3).select($"id".as("k2")) df1.join(df2, $"k1" + 1 =!= $"k2").explain("codegen") ``` ``` == Subtree 2 / 2 (maxMethodCodeSize:282; maxConstantPoolSize:203(0.31% used); numInnerClasses:0) == *(2) BroadcastNestedLoopJoin BuildRight, Inner, NOT ((k1#2L + 1) = k2#6L) :- *(2) Project [id#0L AS k1#2L] : +- *(2) Range (0, 4, step=1, splits=2) +- BroadcastExchange IdentityBroadcastMode, [id=#22] +- *(1) Project [id#4L AS k2#6L] +- *(1) Range (0, 3, step=1, splits=2) Generated code: /* 001 */ public Object generate(Object[] references) { /* 002 */ return new GeneratedIteratorForCodegenStage2(references); /* 003 */ } /* 004 */ /* 005 */ // codegenStageId=2 /* 006 */ final class GeneratedIteratorForCodegenStage2 extends org.apache.spark.sql.execution.BufferedRowIterator { /* 007 */ private Object[] references; /* 008 */ private scala.collection.Iterator[] inputs; /* 009 */ private boolean range_initRange_0; /* 010 */ private long range_nextIndex_0; /* 011 */ private TaskContext range_taskContext_0; /* 012 */ private InputMetrics range_inputMetrics_0; /* 013 */ private long range_batchEnd_0; /* 014 */ private long range_numElementsTodo_0; /* 015 */ private InternalRow[] bnlj_buildRowArray_0; /* 016 */ private org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[] range_mutableStateArray_0 = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[4]; /* 017 */ /* 018 */ public GeneratedIteratorForCodegenStage2(Object[] references) { /* 019 */ this.references = references; /* 020 */ } /* 021 */ /* 022 */ public void init(int index, scala.collection.Iterator[] inputs) { /* 023 */ partitionIndex = index; /* 024 */ this.inputs = inputs; /* 025 */ /* 026 */ range_taskContext_0 = TaskContext.get(); /* 027 */ range_inputMetrics_0 = range_taskContext_0.taskMetrics().inputMetrics(); /* 028 */ range_mutableStateArray_0[0] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 0); /* 029 */ range_mutableStateArray_0[1] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 0); /* 030 */ range_mutableStateArray_0[2] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 0); /* 031 */ bnlj_buildRowArray_0 = (InternalRow[]) ((org.apache.spark.broadcast.TorrentBroadcast) references[1] /* broadcastTerm */).value(); /* 032 */ range_mutableStateArray_0[3] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(2, 0); /* 033 */ /* 034 */ } /* 035 */ /* 036 */ private void bnlj_doConsume_0(long bnlj_expr_0_0) throws java.io.IOException { /* 037 */ for (int bnlj_arrayIndex_0 = 0; bnlj_arrayIndex_0 < bnlj_buildRowArray_0.length; bnlj_arrayIndex_0++) { /* 038 */ UnsafeRow bnlj_buildRow_0 = (UnsafeRow) bnlj_buildRowArray_0[bnlj_arrayIndex_0]; /* 039 */ /* 040 */ long bnlj_value_1 = bnlj_buildRow_0.getLong(0); /* 041 */ /* 042 */ long bnlj_value_4 = -1L; /* 043 */ /* 044 */ bnlj_value_4 = bnlj_expr_0_0 + 1L; /* 045 */ /* 046 */ boolean bnlj_value_3 = false; /* 047 */ bnlj_value_3 = bnlj_value_4 == bnlj_value_1; /* 048 */ boolean bnlj_value_2 = false; /* 049 */ bnlj_value_2 = !(bnlj_value_3); /* 050 */ if (!(false || !bnlj_value_2)) /* 051 */ { /* 052 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[2] /* numOutputRows */).add(1); /* 053 */ /* 054 */ range_mutableStateArray_0[3].reset(); /* 055 */ /* 056 */ range_mutableStateArray_0[3].write(0, bnlj_expr_0_0); /* 057 */ /* 058 */ range_mutableStateArray_0[3].write(1, bnlj_value_1); /* 059 */ append((range_mutableStateArray_0[3].getRow()).copy()); /* 060 */ /* 061 */ } /* 062 */ } /* 063 */ /* 064 */ } /* 065 */ /* 066 */ private void initRange(int idx) { /* 067 */ java.math.BigInteger index = java.math.BigInteger.valueOf(idx); /* 068 */ java.math.BigInteger numSlice = java.math.BigInteger.valueOf(2L); /* 069 */ java.math.BigInteger numElement = java.math.BigInteger.valueOf(4L); /* 070 */ java.math.BigInteger step = java.math.BigInteger.valueOf(1L); /* 071 */ java.math.BigInteger start = java.math.BigInteger.valueOf(0L); /* 072 */ long partitionEnd; /* 073 */ /* 074 */ java.math.BigInteger st = index.multiply(numElement).divide(numSlice).multiply(step).add(start); /* 075 */ if (st.compareTo(java.math.BigInteger.valueOf(Long.MAX_VALUE)) > 0) { /* 076 */ range_nextIndex_0 = Long.MAX_VALUE; /* 077 */ } else if (st.compareTo(java.math.BigInteger.valueOf(Long.MIN_VALUE)) < 0) { /* 078 */ range_nextIndex_0 = Long.MIN_VALUE; /* 079 */ } else { /* 080 */ range_nextIndex_0 = st.longValue(); /* 081 */ } /* 082 */ range_batchEnd_0 = range_nextIndex_0; /* 083 */ /* 084 */ java.math.BigInteger end = index.add(java.math.BigInteger.ONE).multiply(numElement).divide(numSlice) /* 085 */ .multiply(step).add(start); /* 086 */ if (end.compareTo(java.math.BigInteger.valueOf(Long.MAX_VALUE)) > 0) { /* 087 */ partitionEnd = Long.MAX_VALUE; /* 088 */ } else if (end.compareTo(java.math.BigInteger.valueOf(Long.MIN_VALUE)) < 0) { /* 089 */ partitionEnd = Long.MIN_VALUE; /* 090 */ } else { /* 091 */ partitionEnd = end.longValue(); /* 092 */ } /* 093 */ /* 094 */ java.math.BigInteger startToEnd = java.math.BigInteger.valueOf(partitionEnd).subtract( /* 095 */ java.math.BigInteger.valueOf(range_nextIndex_0)); /* 096 */ range_numElementsTodo_0 = startToEnd.divide(step).longValue(); /* 097 */ if (range_numElementsTodo_0 < 0) { /* 098 */ range_numElementsTodo_0 = 0; /* 099 */ } else if (startToEnd.remainder(step).compareTo(java.math.BigInteger.valueOf(0L)) != 0) { /* 100 */ range_numElementsTodo_0++; /* 101 */ } /* 102 */ } /* 103 */ /* 104 */ protected void processNext() throws java.io.IOException { /* 105 */ // initialize Range /* 106 */ if (!range_initRange_0) { /* 107 */ range_initRange_0 = true; /* 108 */ initRange(partitionIndex); /* 109 */ } /* 110 */ /* 111 */ while (true) { /* 112 */ if (range_nextIndex_0 == range_batchEnd_0) { /* 113 */ long range_nextBatchTodo_0; /* 114 */ if (range_numElementsTodo_0 > 1000L) { /* 115 */ range_nextBatchTodo_0 = 1000L; /* 116 */ range_numElementsTodo_0 -= 1000L; /* 117 */ } else { /* 118 */ range_nextBatchTodo_0 = range_numElementsTodo_0; /* 119 */ range_numElementsTodo_0 = 0; /* 120 */ if (range_nextBatchTodo_0 == 0) break; /* 121 */ } /* 122 */ range_batchEnd_0 += range_nextBatchTodo_0 * 1L; /* 123 */ } /* 124 */ /* 125 */ int range_localEnd_0 = (int)((range_batchEnd_0 - range_nextIndex_0) / 1L); /* 126 */ for (int range_localIdx_0 = 0; range_localIdx_0 < range_localEnd_0; range_localIdx_0++) { /* 127 */ long range_value_0 = ((long)range_localIdx_0 * 1L) + range_nextIndex_0; /* 128 */ /* 129 */ // common sub-expressions /* 130 */ /* 131 */ bnlj_doConsume_0(range_value_0); /* 132 */ /* 133 */ if (shouldStop()) { /* 134 */ range_nextIndex_0 = range_value_0 + 1L; /* 135 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[0] /* numOutputRows */).add(range_localIdx_0 + 1); /* 136 */ range_inputMetrics_0.incRecordsRead(range_localIdx_0 + 1); /* 137 */ return; /* 138 */ } /* 139 */ /* 140 */ } /* 141 */ range_nextIndex_0 = range_batchEnd_0; /* 142 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[0] /* numOutputRows */).add(range_localEnd_0); /* 143 */ range_inputMetrics_0.incRecordsRead(range_localEnd_0); /* 144 */ range_taskContext_0.killTaskIfInterrupted(); /* 145 */ } /* 146 */ } /* 147 */ /* 148 */ } ``` ### Why are the changes needed? Improve query CPU performance. Added a micro benchmark query in `JoinBenchmark.scala`. Saw 1x of run time improvement: ``` OpenJDK 64-Bit Server VM 11.0.9+11-LTS on Linux 4.14.219-161.340.amzn2.x86_64 Intel(R) Xeon(R) CPU E5-2670 v2 2.50GHz broadcast nested loop join: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- broadcast nested loop join wholestage off 62922 63052 184 0.3 3000.3 1.0X broadcast nested loop join wholestage on 30946 30972 26 0.7 1475.6 2.0X ``` ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? * Added unit test in `WholeStageCodegenSuite.scala`, and existing unit tests for `BroadcastNestedLoopJoinExec`. * Updated golden files for several TCPDS query plans, as whole stage code-gen for `BroadcastNestedLoopJoinExec` is triggered. * Updated `JoinBenchmark-jdk11-results.txt ` and `JoinBenchmark-results.txt` with new benchmark result. Followed previous benchmark PRs - https://github.com/apache/spark/pull/27078 and https://github.com/apache/spark/pull/26003 to use same type of machine: ``` Amazon AWS EC2 type: r3.xlarge region: us-west-2 (Oregon) OS: Linux ``` Closes #31736 from c21/nested-join-exec. Authored-by: Cheng Su <chengsu@fb.com> Signed-off-by: Wenchen Fan <wenchen@databricks.com>
2021-03-09 06:45:43 -05:00
OpenJDK 64-Bit Server VM 1.8.0_272-b10 on Linux 4.14.219-161.340.amzn2.x86_64
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
[SPARK-29320][TESTS] Compare `sql/core` module in JDK8/11 (Part 1) ### What changes were proposed in this pull request? This PR regenerates the `sql/core` benchmarks in JDK8/11 to compare the result. In general, we compare the ratio instead of the time. However, in this PR, the average time is compared. This PR should be considered as a rough comparison. **A. EXPECTED CASES(JDK11 is faster in general)** - [x] BloomFilterBenchmark (JDK11 is faster except one case) - [x] BuiltInDataSourceWriteBenchmark (JDK11 is faster at CSV/ORC) - [x] CSVBenchmark (JDK11 is faster except five cases) - [x] ColumnarBatchBenchmark (JDK11 is faster at `boolean`/`string` and some cases in `int`/`array`) - [x] DatasetBenchmark (JDK11 is faster with `string`, but is slower for `long` type) - [x] ExternalAppendOnlyUnsafeRowArrayBenchmark (JDK11 is faster except two cases) - [x] ExtractBenchmark (JDK11 is faster except HOUR/MINUTE/SECOND/MILLISECONDS/MICROSECONDS) - [x] HashedRelationMetricsBenchmark (JDK11 is faster) - [x] JSONBenchmark (JDK11 is much faster except eight cases) - [x] JoinBenchmark (JDK11 is faster except five cases) - [x] OrcNestedSchemaPruningBenchmark (JDK11 is faster in nine cases) - [x] PrimitiveArrayBenchmark (JDK11 is faster) - [x] SortBenchmark (JDK11 is faster except `Arrays.sort` case) - [x] UDFBenchmark (N/A, values are too small) - [x] UnsafeArrayDataBenchmark (JDK11 is faster except one case) - [x] WideTableBenchmark (JDK11 is faster except two cases) **B. CASES WE NEED TO INVESTIGATE MORE LATER** - [x] AggregateBenchmark (JDK11 is slower in general) - [x] CompressionSchemeBenchmark (JDK11 is slower in general except `string`) - [x] DataSourceReadBenchmark (JDK11 is slower in general) - [x] DateTimeBenchmark (JDK11 is slightly slower in general except `parsing`) - [x] MakeDateTimeBenchmark (JDK11 is slower except two cases) - [x] MiscBenchmark (JDK11 is slower except ten cases) - [x] OrcV2NestedSchemaPruningBenchmark (JDK11 is slower) - [x] ParquetNestedSchemaPruningBenchmark (JDK11 is slower except six cases) - [x] RangeBenchmark (JDK11 is slower except one case) `FilterPushdownBenchmark/InExpressionBenchmark/WideSchemaBenchmark` will be compared later because it took long timer. ### Why are the changes needed? According to the result, there are some difference between JDK8/JDK11. This will be a baseline for the future improvement and comparison. Also, as a reproducible environment, the following environment is used. - Instance: `r3.xlarge` - OS: `CentOS Linux release 7.5.1804 (Core)` - JDK: - `OpenJDK Runtime Environment (build 1.8.0_222-b10)` - `OpenJDK Runtime Environment 18.9 (build 11.0.4+11-LTS)` ### Does this PR introduce any user-facing change? No. ### How was this patch tested? This is a test-only PR. We need to run benchmark. Closes #26003 from dongjoon-hyun/SPARK-29320. Authored-by: Dongjoon Hyun <dhyun@apple.com> Signed-off-by: Dongjoon Hyun <dhyun@apple.com>
2019-10-03 11:58:25 -04:00
Join w 2 longs duplicated: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
[SPARK-34620][SQL] Code-gen broadcast nested loop join (inner/cross) ### What changes were proposed in this pull request? `BroadcastNestedLoopJoinExec` does not have code-gen, and we can potentially boost the CPU performance for this operator if we add code-gen for it. https://databricks.com/blog/2017/02/16/processing-trillion-rows-per-second-single-machine-can-nested-loop-joins-fast.html also showed the evidence in one fork. The codegen for `BroadcastNestedLoopJoinExec` shared some code with `HashJoin`, and the interface `JoinCodegenSupport` is created to hold those common logic. This PR is only supporting inner and cross join. Other join types will be added later in followup PRs. Example query and generated code: ``` val df1 = spark.range(4).select($"id".as("k1")) val df2 = spark.range(3).select($"id".as("k2")) df1.join(df2, $"k1" + 1 =!= $"k2").explain("codegen") ``` ``` == Subtree 2 / 2 (maxMethodCodeSize:282; maxConstantPoolSize:203(0.31% used); numInnerClasses:0) == *(2) BroadcastNestedLoopJoin BuildRight, Inner, NOT ((k1#2L + 1) = k2#6L) :- *(2) Project [id#0L AS k1#2L] : +- *(2) Range (0, 4, step=1, splits=2) +- BroadcastExchange IdentityBroadcastMode, [id=#22] +- *(1) Project [id#4L AS k2#6L] +- *(1) Range (0, 3, step=1, splits=2) Generated code: /* 001 */ public Object generate(Object[] references) { /* 002 */ return new GeneratedIteratorForCodegenStage2(references); /* 003 */ } /* 004 */ /* 005 */ // codegenStageId=2 /* 006 */ final class GeneratedIteratorForCodegenStage2 extends org.apache.spark.sql.execution.BufferedRowIterator { /* 007 */ private Object[] references; /* 008 */ private scala.collection.Iterator[] inputs; /* 009 */ private boolean range_initRange_0; /* 010 */ private long range_nextIndex_0; /* 011 */ private TaskContext range_taskContext_0; /* 012 */ private InputMetrics range_inputMetrics_0; /* 013 */ private long range_batchEnd_0; /* 014 */ private long range_numElementsTodo_0; /* 015 */ private InternalRow[] bnlj_buildRowArray_0; /* 016 */ private org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[] range_mutableStateArray_0 = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[4]; /* 017 */ /* 018 */ public GeneratedIteratorForCodegenStage2(Object[] references) { /* 019 */ this.references = references; /* 020 */ } /* 021 */ /* 022 */ public void init(int index, scala.collection.Iterator[] inputs) { /* 023 */ partitionIndex = index; /* 024 */ this.inputs = inputs; /* 025 */ /* 026 */ range_taskContext_0 = TaskContext.get(); /* 027 */ range_inputMetrics_0 = range_taskContext_0.taskMetrics().inputMetrics(); /* 028 */ range_mutableStateArray_0[0] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 0); /* 029 */ range_mutableStateArray_0[1] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 0); /* 030 */ range_mutableStateArray_0[2] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 0); /* 031 */ bnlj_buildRowArray_0 = (InternalRow[]) ((org.apache.spark.broadcast.TorrentBroadcast) references[1] /* broadcastTerm */).value(); /* 032 */ range_mutableStateArray_0[3] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(2, 0); /* 033 */ /* 034 */ } /* 035 */ /* 036 */ private void bnlj_doConsume_0(long bnlj_expr_0_0) throws java.io.IOException { /* 037 */ for (int bnlj_arrayIndex_0 = 0; bnlj_arrayIndex_0 < bnlj_buildRowArray_0.length; bnlj_arrayIndex_0++) { /* 038 */ UnsafeRow bnlj_buildRow_0 = (UnsafeRow) bnlj_buildRowArray_0[bnlj_arrayIndex_0]; /* 039 */ /* 040 */ long bnlj_value_1 = bnlj_buildRow_0.getLong(0); /* 041 */ /* 042 */ long bnlj_value_4 = -1L; /* 043 */ /* 044 */ bnlj_value_4 = bnlj_expr_0_0 + 1L; /* 045 */ /* 046 */ boolean bnlj_value_3 = false; /* 047 */ bnlj_value_3 = bnlj_value_4 == bnlj_value_1; /* 048 */ boolean bnlj_value_2 = false; /* 049 */ bnlj_value_2 = !(bnlj_value_3); /* 050 */ if (!(false || !bnlj_value_2)) /* 051 */ { /* 052 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[2] /* numOutputRows */).add(1); /* 053 */ /* 054 */ range_mutableStateArray_0[3].reset(); /* 055 */ /* 056 */ range_mutableStateArray_0[3].write(0, bnlj_expr_0_0); /* 057 */ /* 058 */ range_mutableStateArray_0[3].write(1, bnlj_value_1); /* 059 */ append((range_mutableStateArray_0[3].getRow()).copy()); /* 060 */ /* 061 */ } /* 062 */ } /* 063 */ /* 064 */ } /* 065 */ /* 066 */ private void initRange(int idx) { /* 067 */ java.math.BigInteger index = java.math.BigInteger.valueOf(idx); /* 068 */ java.math.BigInteger numSlice = java.math.BigInteger.valueOf(2L); /* 069 */ java.math.BigInteger numElement = java.math.BigInteger.valueOf(4L); /* 070 */ java.math.BigInteger step = java.math.BigInteger.valueOf(1L); /* 071 */ java.math.BigInteger start = java.math.BigInteger.valueOf(0L); /* 072 */ long partitionEnd; /* 073 */ /* 074 */ java.math.BigInteger st = index.multiply(numElement).divide(numSlice).multiply(step).add(start); /* 075 */ if (st.compareTo(java.math.BigInteger.valueOf(Long.MAX_VALUE)) > 0) { /* 076 */ range_nextIndex_0 = Long.MAX_VALUE; /* 077 */ } else if (st.compareTo(java.math.BigInteger.valueOf(Long.MIN_VALUE)) < 0) { /* 078 */ range_nextIndex_0 = Long.MIN_VALUE; /* 079 */ } else { /* 080 */ range_nextIndex_0 = st.longValue(); /* 081 */ } /* 082 */ range_batchEnd_0 = range_nextIndex_0; /* 083 */ /* 084 */ java.math.BigInteger end = index.add(java.math.BigInteger.ONE).multiply(numElement).divide(numSlice) /* 085 */ .multiply(step).add(start); /* 086 */ if (end.compareTo(java.math.BigInteger.valueOf(Long.MAX_VALUE)) > 0) { /* 087 */ partitionEnd = Long.MAX_VALUE; /* 088 */ } else if (end.compareTo(java.math.BigInteger.valueOf(Long.MIN_VALUE)) < 0) { /* 089 */ partitionEnd = Long.MIN_VALUE; /* 090 */ } else { /* 091 */ partitionEnd = end.longValue(); /* 092 */ } /* 093 */ /* 094 */ java.math.BigInteger startToEnd = java.math.BigInteger.valueOf(partitionEnd).subtract( /* 095 */ java.math.BigInteger.valueOf(range_nextIndex_0)); /* 096 */ range_numElementsTodo_0 = startToEnd.divide(step).longValue(); /* 097 */ if (range_numElementsTodo_0 < 0) { /* 098 */ range_numElementsTodo_0 = 0; /* 099 */ } else if (startToEnd.remainder(step).compareTo(java.math.BigInteger.valueOf(0L)) != 0) { /* 100 */ range_numElementsTodo_0++; /* 101 */ } /* 102 */ } /* 103 */ /* 104 */ protected void processNext() throws java.io.IOException { /* 105 */ // initialize Range /* 106 */ if (!range_initRange_0) { /* 107 */ range_initRange_0 = true; /* 108 */ initRange(partitionIndex); /* 109 */ } /* 110 */ /* 111 */ while (true) { /* 112 */ if (range_nextIndex_0 == range_batchEnd_0) { /* 113 */ long range_nextBatchTodo_0; /* 114 */ if (range_numElementsTodo_0 > 1000L) { /* 115 */ range_nextBatchTodo_0 = 1000L; /* 116 */ range_numElementsTodo_0 -= 1000L; /* 117 */ } else { /* 118 */ range_nextBatchTodo_0 = range_numElementsTodo_0; /* 119 */ range_numElementsTodo_0 = 0; /* 120 */ if (range_nextBatchTodo_0 == 0) break; /* 121 */ } /* 122 */ range_batchEnd_0 += range_nextBatchTodo_0 * 1L; /* 123 */ } /* 124 */ /* 125 */ int range_localEnd_0 = (int)((range_batchEnd_0 - range_nextIndex_0) / 1L); /* 126 */ for (int range_localIdx_0 = 0; range_localIdx_0 < range_localEnd_0; range_localIdx_0++) { /* 127 */ long range_value_0 = ((long)range_localIdx_0 * 1L) + range_nextIndex_0; /* 128 */ /* 129 */ // common sub-expressions /* 130 */ /* 131 */ bnlj_doConsume_0(range_value_0); /* 132 */ /* 133 */ if (shouldStop()) { /* 134 */ range_nextIndex_0 = range_value_0 + 1L; /* 135 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[0] /* numOutputRows */).add(range_localIdx_0 + 1); /* 136 */ range_inputMetrics_0.incRecordsRead(range_localIdx_0 + 1); /* 137 */ return; /* 138 */ } /* 139 */ /* 140 */ } /* 141 */ range_nextIndex_0 = range_batchEnd_0; /* 142 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[0] /* numOutputRows */).add(range_localEnd_0); /* 143 */ range_inputMetrics_0.incRecordsRead(range_localEnd_0); /* 144 */ range_taskContext_0.killTaskIfInterrupted(); /* 145 */ } /* 146 */ } /* 147 */ /* 148 */ } ``` ### Why are the changes needed? Improve query CPU performance. Added a micro benchmark query in `JoinBenchmark.scala`. Saw 1x of run time improvement: ``` OpenJDK 64-Bit Server VM 11.0.9+11-LTS on Linux 4.14.219-161.340.amzn2.x86_64 Intel(R) Xeon(R) CPU E5-2670 v2 2.50GHz broadcast nested loop join: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- broadcast nested loop join wholestage off 62922 63052 184 0.3 3000.3 1.0X broadcast nested loop join wholestage on 30946 30972 26 0.7 1475.6 2.0X ``` ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? * Added unit test in `WholeStageCodegenSuite.scala`, and existing unit tests for `BroadcastNestedLoopJoinExec`. * Updated golden files for several TCPDS query plans, as whole stage code-gen for `BroadcastNestedLoopJoinExec` is triggered. * Updated `JoinBenchmark-jdk11-results.txt ` and `JoinBenchmark-results.txt` with new benchmark result. Followed previous benchmark PRs - https://github.com/apache/spark/pull/27078 and https://github.com/apache/spark/pull/26003 to use same type of machine: ``` Amazon AWS EC2 type: r3.xlarge region: us-west-2 (Oregon) OS: Linux ``` Closes #31736 from c21/nested-join-exec. Authored-by: Cheng Su <chengsu@fb.com> Signed-off-by: Wenchen Fan <wenchen@databricks.com>
2021-03-09 06:45:43 -05:00
Join w 2 longs duplicated wholestage off 14445 14511 93 1.5 688.8 1.0X
Join w 2 longs duplicated wholestage on 8465 8554 120 2.5 403.6 1.7X
[SPARK-34620][SQL] Code-gen broadcast nested loop join (inner/cross) ### What changes were proposed in this pull request? `BroadcastNestedLoopJoinExec` does not have code-gen, and we can potentially boost the CPU performance for this operator if we add code-gen for it. https://databricks.com/blog/2017/02/16/processing-trillion-rows-per-second-single-machine-can-nested-loop-joins-fast.html also showed the evidence in one fork. The codegen for `BroadcastNestedLoopJoinExec` shared some code with `HashJoin`, and the interface `JoinCodegenSupport` is created to hold those common logic. This PR is only supporting inner and cross join. Other join types will be added later in followup PRs. Example query and generated code: ``` val df1 = spark.range(4).select($"id".as("k1")) val df2 = spark.range(3).select($"id".as("k2")) df1.join(df2, $"k1" + 1 =!= $"k2").explain("codegen") ``` ``` == Subtree 2 / 2 (maxMethodCodeSize:282; maxConstantPoolSize:203(0.31% used); numInnerClasses:0) == *(2) BroadcastNestedLoopJoin BuildRight, Inner, NOT ((k1#2L + 1) = k2#6L) :- *(2) Project [id#0L AS k1#2L] : +- *(2) Range (0, 4, step=1, splits=2) +- BroadcastExchange IdentityBroadcastMode, [id=#22] +- *(1) Project [id#4L AS k2#6L] +- *(1) Range (0, 3, step=1, splits=2) Generated code: /* 001 */ public Object generate(Object[] references) { /* 002 */ return new GeneratedIteratorForCodegenStage2(references); /* 003 */ } /* 004 */ /* 005 */ // codegenStageId=2 /* 006 */ final class GeneratedIteratorForCodegenStage2 extends org.apache.spark.sql.execution.BufferedRowIterator { /* 007 */ private Object[] references; /* 008 */ private scala.collection.Iterator[] inputs; /* 009 */ private boolean range_initRange_0; /* 010 */ private long range_nextIndex_0; /* 011 */ private TaskContext range_taskContext_0; /* 012 */ private InputMetrics range_inputMetrics_0; /* 013 */ private long range_batchEnd_0; /* 014 */ private long range_numElementsTodo_0; /* 015 */ private InternalRow[] bnlj_buildRowArray_0; /* 016 */ private org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[] range_mutableStateArray_0 = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[4]; /* 017 */ /* 018 */ public GeneratedIteratorForCodegenStage2(Object[] references) { /* 019 */ this.references = references; /* 020 */ } /* 021 */ /* 022 */ public void init(int index, scala.collection.Iterator[] inputs) { /* 023 */ partitionIndex = index; /* 024 */ this.inputs = inputs; /* 025 */ /* 026 */ range_taskContext_0 = TaskContext.get(); /* 027 */ range_inputMetrics_0 = range_taskContext_0.taskMetrics().inputMetrics(); /* 028 */ range_mutableStateArray_0[0] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 0); /* 029 */ range_mutableStateArray_0[1] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 0); /* 030 */ range_mutableStateArray_0[2] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 0); /* 031 */ bnlj_buildRowArray_0 = (InternalRow[]) ((org.apache.spark.broadcast.TorrentBroadcast) references[1] /* broadcastTerm */).value(); /* 032 */ range_mutableStateArray_0[3] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(2, 0); /* 033 */ /* 034 */ } /* 035 */ /* 036 */ private void bnlj_doConsume_0(long bnlj_expr_0_0) throws java.io.IOException { /* 037 */ for (int bnlj_arrayIndex_0 = 0; bnlj_arrayIndex_0 < bnlj_buildRowArray_0.length; bnlj_arrayIndex_0++) { /* 038 */ UnsafeRow bnlj_buildRow_0 = (UnsafeRow) bnlj_buildRowArray_0[bnlj_arrayIndex_0]; /* 039 */ /* 040 */ long bnlj_value_1 = bnlj_buildRow_0.getLong(0); /* 041 */ /* 042 */ long bnlj_value_4 = -1L; /* 043 */ /* 044 */ bnlj_value_4 = bnlj_expr_0_0 + 1L; /* 045 */ /* 046 */ boolean bnlj_value_3 = false; /* 047 */ bnlj_value_3 = bnlj_value_4 == bnlj_value_1; /* 048 */ boolean bnlj_value_2 = false; /* 049 */ bnlj_value_2 = !(bnlj_value_3); /* 050 */ if (!(false || !bnlj_value_2)) /* 051 */ { /* 052 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[2] /* numOutputRows */).add(1); /* 053 */ /* 054 */ range_mutableStateArray_0[3].reset(); /* 055 */ /* 056 */ range_mutableStateArray_0[3].write(0, bnlj_expr_0_0); /* 057 */ /* 058 */ range_mutableStateArray_0[3].write(1, bnlj_value_1); /* 059 */ append((range_mutableStateArray_0[3].getRow()).copy()); /* 060 */ /* 061 */ } /* 062 */ } /* 063 */ /* 064 */ } /* 065 */ /* 066 */ private void initRange(int idx) { /* 067 */ java.math.BigInteger index = java.math.BigInteger.valueOf(idx); /* 068 */ java.math.BigInteger numSlice = java.math.BigInteger.valueOf(2L); /* 069 */ java.math.BigInteger numElement = java.math.BigInteger.valueOf(4L); /* 070 */ java.math.BigInteger step = java.math.BigInteger.valueOf(1L); /* 071 */ java.math.BigInteger start = java.math.BigInteger.valueOf(0L); /* 072 */ long partitionEnd; /* 073 */ /* 074 */ java.math.BigInteger st = index.multiply(numElement).divide(numSlice).multiply(step).add(start); /* 075 */ if (st.compareTo(java.math.BigInteger.valueOf(Long.MAX_VALUE)) > 0) { /* 076 */ range_nextIndex_0 = Long.MAX_VALUE; /* 077 */ } else if (st.compareTo(java.math.BigInteger.valueOf(Long.MIN_VALUE)) < 0) { /* 078 */ range_nextIndex_0 = Long.MIN_VALUE; /* 079 */ } else { /* 080 */ range_nextIndex_0 = st.longValue(); /* 081 */ } /* 082 */ range_batchEnd_0 = range_nextIndex_0; /* 083 */ /* 084 */ java.math.BigInteger end = index.add(java.math.BigInteger.ONE).multiply(numElement).divide(numSlice) /* 085 */ .multiply(step).add(start); /* 086 */ if (end.compareTo(java.math.BigInteger.valueOf(Long.MAX_VALUE)) > 0) { /* 087 */ partitionEnd = Long.MAX_VALUE; /* 088 */ } else if (end.compareTo(java.math.BigInteger.valueOf(Long.MIN_VALUE)) < 0) { /* 089 */ partitionEnd = Long.MIN_VALUE; /* 090 */ } else { /* 091 */ partitionEnd = end.longValue(); /* 092 */ } /* 093 */ /* 094 */ java.math.BigInteger startToEnd = java.math.BigInteger.valueOf(partitionEnd).subtract( /* 095 */ java.math.BigInteger.valueOf(range_nextIndex_0)); /* 096 */ range_numElementsTodo_0 = startToEnd.divide(step).longValue(); /* 097 */ if (range_numElementsTodo_0 < 0) { /* 098 */ range_numElementsTodo_0 = 0; /* 099 */ } else if (startToEnd.remainder(step).compareTo(java.math.BigInteger.valueOf(0L)) != 0) { /* 100 */ range_numElementsTodo_0++; /* 101 */ } /* 102 */ } /* 103 */ /* 104 */ protected void processNext() throws java.io.IOException { /* 105 */ // initialize Range /* 106 */ if (!range_initRange_0) { /* 107 */ range_initRange_0 = true; /* 108 */ initRange(partitionIndex); /* 109 */ } /* 110 */ /* 111 */ while (true) { /* 112 */ if (range_nextIndex_0 == range_batchEnd_0) { /* 113 */ long range_nextBatchTodo_0; /* 114 */ if (range_numElementsTodo_0 > 1000L) { /* 115 */ range_nextBatchTodo_0 = 1000L; /* 116 */ range_numElementsTodo_0 -= 1000L; /* 117 */ } else { /* 118 */ range_nextBatchTodo_0 = range_numElementsTodo_0; /* 119 */ range_numElementsTodo_0 = 0; /* 120 */ if (range_nextBatchTodo_0 == 0) break; /* 121 */ } /* 122 */ range_batchEnd_0 += range_nextBatchTodo_0 * 1L; /* 123 */ } /* 124 */ /* 125 */ int range_localEnd_0 = (int)((range_batchEnd_0 - range_nextIndex_0) / 1L); /* 126 */ for (int range_localIdx_0 = 0; range_localIdx_0 < range_localEnd_0; range_localIdx_0++) { /* 127 */ long range_value_0 = ((long)range_localIdx_0 * 1L) + range_nextIndex_0; /* 128 */ /* 129 */ // common sub-expressions /* 130 */ /* 131 */ bnlj_doConsume_0(range_value_0); /* 132 */ /* 133 */ if (shouldStop()) { /* 134 */ range_nextIndex_0 = range_value_0 + 1L; /* 135 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[0] /* numOutputRows */).add(range_localIdx_0 + 1); /* 136 */ range_inputMetrics_0.incRecordsRead(range_localIdx_0 + 1); /* 137 */ return; /* 138 */ } /* 139 */ /* 140 */ } /* 141 */ range_nextIndex_0 = range_batchEnd_0; /* 142 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[0] /* numOutputRows */).add(range_localEnd_0); /* 143 */ range_inputMetrics_0.incRecordsRead(range_localEnd_0); /* 144 */ range_taskContext_0.killTaskIfInterrupted(); /* 145 */ } /* 146 */ } /* 147 */ /* 148 */ } ``` ### Why are the changes needed? Improve query CPU performance. Added a micro benchmark query in `JoinBenchmark.scala`. Saw 1x of run time improvement: ``` OpenJDK 64-Bit Server VM 11.0.9+11-LTS on Linux 4.14.219-161.340.amzn2.x86_64 Intel(R) Xeon(R) CPU E5-2670 v2 2.50GHz broadcast nested loop join: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- broadcast nested loop join wholestage off 62922 63052 184 0.3 3000.3 1.0X broadcast nested loop join wholestage on 30946 30972 26 0.7 1475.6 2.0X ``` ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? * Added unit test in `WholeStageCodegenSuite.scala`, and existing unit tests for `BroadcastNestedLoopJoinExec`. * Updated golden files for several TCPDS query plans, as whole stage code-gen for `BroadcastNestedLoopJoinExec` is triggered. * Updated `JoinBenchmark-jdk11-results.txt ` and `JoinBenchmark-results.txt` with new benchmark result. Followed previous benchmark PRs - https://github.com/apache/spark/pull/27078 and https://github.com/apache/spark/pull/26003 to use same type of machine: ``` Amazon AWS EC2 type: r3.xlarge region: us-west-2 (Oregon) OS: Linux ``` Closes #31736 from c21/nested-join-exec. Authored-by: Cheng Su <chengsu@fb.com> Signed-off-by: Wenchen Fan <wenchen@databricks.com>
2021-03-09 06:45:43 -05:00
OpenJDK 64-Bit Server VM 1.8.0_272-b10 on Linux 4.14.219-161.340.amzn2.x86_64
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
[SPARK-29320][TESTS] Compare `sql/core` module in JDK8/11 (Part 1) ### What changes were proposed in this pull request? This PR regenerates the `sql/core` benchmarks in JDK8/11 to compare the result. In general, we compare the ratio instead of the time. However, in this PR, the average time is compared. This PR should be considered as a rough comparison. **A. EXPECTED CASES(JDK11 is faster in general)** - [x] BloomFilterBenchmark (JDK11 is faster except one case) - [x] BuiltInDataSourceWriteBenchmark (JDK11 is faster at CSV/ORC) - [x] CSVBenchmark (JDK11 is faster except five cases) - [x] ColumnarBatchBenchmark (JDK11 is faster at `boolean`/`string` and some cases in `int`/`array`) - [x] DatasetBenchmark (JDK11 is faster with `string`, but is slower for `long` type) - [x] ExternalAppendOnlyUnsafeRowArrayBenchmark (JDK11 is faster except two cases) - [x] ExtractBenchmark (JDK11 is faster except HOUR/MINUTE/SECOND/MILLISECONDS/MICROSECONDS) - [x] HashedRelationMetricsBenchmark (JDK11 is faster) - [x] JSONBenchmark (JDK11 is much faster except eight cases) - [x] JoinBenchmark (JDK11 is faster except five cases) - [x] OrcNestedSchemaPruningBenchmark (JDK11 is faster in nine cases) - [x] PrimitiveArrayBenchmark (JDK11 is faster) - [x] SortBenchmark (JDK11 is faster except `Arrays.sort` case) - [x] UDFBenchmark (N/A, values are too small) - [x] UnsafeArrayDataBenchmark (JDK11 is faster except one case) - [x] WideTableBenchmark (JDK11 is faster except two cases) **B. CASES WE NEED TO INVESTIGATE MORE LATER** - [x] AggregateBenchmark (JDK11 is slower in general) - [x] CompressionSchemeBenchmark (JDK11 is slower in general except `string`) - [x] DataSourceReadBenchmark (JDK11 is slower in general) - [x] DateTimeBenchmark (JDK11 is slightly slower in general except `parsing`) - [x] MakeDateTimeBenchmark (JDK11 is slower except two cases) - [x] MiscBenchmark (JDK11 is slower except ten cases) - [x] OrcV2NestedSchemaPruningBenchmark (JDK11 is slower) - [x] ParquetNestedSchemaPruningBenchmark (JDK11 is slower except six cases) - [x] RangeBenchmark (JDK11 is slower except one case) `FilterPushdownBenchmark/InExpressionBenchmark/WideSchemaBenchmark` will be compared later because it took long timer. ### Why are the changes needed? According to the result, there are some difference between JDK8/JDK11. This will be a baseline for the future improvement and comparison. Also, as a reproducible environment, the following environment is used. - Instance: `r3.xlarge` - OS: `CentOS Linux release 7.5.1804 (Core)` - JDK: - `OpenJDK Runtime Environment (build 1.8.0_222-b10)` - `OpenJDK Runtime Environment 18.9 (build 11.0.4+11-LTS)` ### Does this PR introduce any user-facing change? No. ### How was this patch tested? This is a test-only PR. We need to run benchmark. Closes #26003 from dongjoon-hyun/SPARK-29320. Authored-by: Dongjoon Hyun <dhyun@apple.com> Signed-off-by: Dongjoon Hyun <dhyun@apple.com>
2019-10-03 11:58:25 -04:00
outer join w long: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
[SPARK-34620][SQL] Code-gen broadcast nested loop join (inner/cross) ### What changes were proposed in this pull request? `BroadcastNestedLoopJoinExec` does not have code-gen, and we can potentially boost the CPU performance for this operator if we add code-gen for it. https://databricks.com/blog/2017/02/16/processing-trillion-rows-per-second-single-machine-can-nested-loop-joins-fast.html also showed the evidence in one fork. The codegen for `BroadcastNestedLoopJoinExec` shared some code with `HashJoin`, and the interface `JoinCodegenSupport` is created to hold those common logic. This PR is only supporting inner and cross join. Other join types will be added later in followup PRs. Example query and generated code: ``` val df1 = spark.range(4).select($"id".as("k1")) val df2 = spark.range(3).select($"id".as("k2")) df1.join(df2, $"k1" + 1 =!= $"k2").explain("codegen") ``` ``` == Subtree 2 / 2 (maxMethodCodeSize:282; maxConstantPoolSize:203(0.31% used); numInnerClasses:0) == *(2) BroadcastNestedLoopJoin BuildRight, Inner, NOT ((k1#2L + 1) = k2#6L) :- *(2) Project [id#0L AS k1#2L] : +- *(2) Range (0, 4, step=1, splits=2) +- BroadcastExchange IdentityBroadcastMode, [id=#22] +- *(1) Project [id#4L AS k2#6L] +- *(1) Range (0, 3, step=1, splits=2) Generated code: /* 001 */ public Object generate(Object[] references) { /* 002 */ return new GeneratedIteratorForCodegenStage2(references); /* 003 */ } /* 004 */ /* 005 */ // codegenStageId=2 /* 006 */ final class GeneratedIteratorForCodegenStage2 extends org.apache.spark.sql.execution.BufferedRowIterator { /* 007 */ private Object[] references; /* 008 */ private scala.collection.Iterator[] inputs; /* 009 */ private boolean range_initRange_0; /* 010 */ private long range_nextIndex_0; /* 011 */ private TaskContext range_taskContext_0; /* 012 */ private InputMetrics range_inputMetrics_0; /* 013 */ private long range_batchEnd_0; /* 014 */ private long range_numElementsTodo_0; /* 015 */ private InternalRow[] bnlj_buildRowArray_0; /* 016 */ private org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[] range_mutableStateArray_0 = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[4]; /* 017 */ /* 018 */ public GeneratedIteratorForCodegenStage2(Object[] references) { /* 019 */ this.references = references; /* 020 */ } /* 021 */ /* 022 */ public void init(int index, scala.collection.Iterator[] inputs) { /* 023 */ partitionIndex = index; /* 024 */ this.inputs = inputs; /* 025 */ /* 026 */ range_taskContext_0 = TaskContext.get(); /* 027 */ range_inputMetrics_0 = range_taskContext_0.taskMetrics().inputMetrics(); /* 028 */ range_mutableStateArray_0[0] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 0); /* 029 */ range_mutableStateArray_0[1] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 0); /* 030 */ range_mutableStateArray_0[2] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 0); /* 031 */ bnlj_buildRowArray_0 = (InternalRow[]) ((org.apache.spark.broadcast.TorrentBroadcast) references[1] /* broadcastTerm */).value(); /* 032 */ range_mutableStateArray_0[3] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(2, 0); /* 033 */ /* 034 */ } /* 035 */ /* 036 */ private void bnlj_doConsume_0(long bnlj_expr_0_0) throws java.io.IOException { /* 037 */ for (int bnlj_arrayIndex_0 = 0; bnlj_arrayIndex_0 < bnlj_buildRowArray_0.length; bnlj_arrayIndex_0++) { /* 038 */ UnsafeRow bnlj_buildRow_0 = (UnsafeRow) bnlj_buildRowArray_0[bnlj_arrayIndex_0]; /* 039 */ /* 040 */ long bnlj_value_1 = bnlj_buildRow_0.getLong(0); /* 041 */ /* 042 */ long bnlj_value_4 = -1L; /* 043 */ /* 044 */ bnlj_value_4 = bnlj_expr_0_0 + 1L; /* 045 */ /* 046 */ boolean bnlj_value_3 = false; /* 047 */ bnlj_value_3 = bnlj_value_4 == bnlj_value_1; /* 048 */ boolean bnlj_value_2 = false; /* 049 */ bnlj_value_2 = !(bnlj_value_3); /* 050 */ if (!(false || !bnlj_value_2)) /* 051 */ { /* 052 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[2] /* numOutputRows */).add(1); /* 053 */ /* 054 */ range_mutableStateArray_0[3].reset(); /* 055 */ /* 056 */ range_mutableStateArray_0[3].write(0, bnlj_expr_0_0); /* 057 */ /* 058 */ range_mutableStateArray_0[3].write(1, bnlj_value_1); /* 059 */ append((range_mutableStateArray_0[3].getRow()).copy()); /* 060 */ /* 061 */ } /* 062 */ } /* 063 */ /* 064 */ } /* 065 */ /* 066 */ private void initRange(int idx) { /* 067 */ java.math.BigInteger index = java.math.BigInteger.valueOf(idx); /* 068 */ java.math.BigInteger numSlice = java.math.BigInteger.valueOf(2L); /* 069 */ java.math.BigInteger numElement = java.math.BigInteger.valueOf(4L); /* 070 */ java.math.BigInteger step = java.math.BigInteger.valueOf(1L); /* 071 */ java.math.BigInteger start = java.math.BigInteger.valueOf(0L); /* 072 */ long partitionEnd; /* 073 */ /* 074 */ java.math.BigInteger st = index.multiply(numElement).divide(numSlice).multiply(step).add(start); /* 075 */ if (st.compareTo(java.math.BigInteger.valueOf(Long.MAX_VALUE)) > 0) { /* 076 */ range_nextIndex_0 = Long.MAX_VALUE; /* 077 */ } else if (st.compareTo(java.math.BigInteger.valueOf(Long.MIN_VALUE)) < 0) { /* 078 */ range_nextIndex_0 = Long.MIN_VALUE; /* 079 */ } else { /* 080 */ range_nextIndex_0 = st.longValue(); /* 081 */ } /* 082 */ range_batchEnd_0 = range_nextIndex_0; /* 083 */ /* 084 */ java.math.BigInteger end = index.add(java.math.BigInteger.ONE).multiply(numElement).divide(numSlice) /* 085 */ .multiply(step).add(start); /* 086 */ if (end.compareTo(java.math.BigInteger.valueOf(Long.MAX_VALUE)) > 0) { /* 087 */ partitionEnd = Long.MAX_VALUE; /* 088 */ } else if (end.compareTo(java.math.BigInteger.valueOf(Long.MIN_VALUE)) < 0) { /* 089 */ partitionEnd = Long.MIN_VALUE; /* 090 */ } else { /* 091 */ partitionEnd = end.longValue(); /* 092 */ } /* 093 */ /* 094 */ java.math.BigInteger startToEnd = java.math.BigInteger.valueOf(partitionEnd).subtract( /* 095 */ java.math.BigInteger.valueOf(range_nextIndex_0)); /* 096 */ range_numElementsTodo_0 = startToEnd.divide(step).longValue(); /* 097 */ if (range_numElementsTodo_0 < 0) { /* 098 */ range_numElementsTodo_0 = 0; /* 099 */ } else if (startToEnd.remainder(step).compareTo(java.math.BigInteger.valueOf(0L)) != 0) { /* 100 */ range_numElementsTodo_0++; /* 101 */ } /* 102 */ } /* 103 */ /* 104 */ protected void processNext() throws java.io.IOException { /* 105 */ // initialize Range /* 106 */ if (!range_initRange_0) { /* 107 */ range_initRange_0 = true; /* 108 */ initRange(partitionIndex); /* 109 */ } /* 110 */ /* 111 */ while (true) { /* 112 */ if (range_nextIndex_0 == range_batchEnd_0) { /* 113 */ long range_nextBatchTodo_0; /* 114 */ if (range_numElementsTodo_0 > 1000L) { /* 115 */ range_nextBatchTodo_0 = 1000L; /* 116 */ range_numElementsTodo_0 -= 1000L; /* 117 */ } else { /* 118 */ range_nextBatchTodo_0 = range_numElementsTodo_0; /* 119 */ range_numElementsTodo_0 = 0; /* 120 */ if (range_nextBatchTodo_0 == 0) break; /* 121 */ } /* 122 */ range_batchEnd_0 += range_nextBatchTodo_0 * 1L; /* 123 */ } /* 124 */ /* 125 */ int range_localEnd_0 = (int)((range_batchEnd_0 - range_nextIndex_0) / 1L); /* 126 */ for (int range_localIdx_0 = 0; range_localIdx_0 < range_localEnd_0; range_localIdx_0++) { /* 127 */ long range_value_0 = ((long)range_localIdx_0 * 1L) + range_nextIndex_0; /* 128 */ /* 129 */ // common sub-expressions /* 130 */ /* 131 */ bnlj_doConsume_0(range_value_0); /* 132 */ /* 133 */ if (shouldStop()) { /* 134 */ range_nextIndex_0 = range_value_0 + 1L; /* 135 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[0] /* numOutputRows */).add(range_localIdx_0 + 1); /* 136 */ range_inputMetrics_0.incRecordsRead(range_localIdx_0 + 1); /* 137 */ return; /* 138 */ } /* 139 */ /* 140 */ } /* 141 */ range_nextIndex_0 = range_batchEnd_0; /* 142 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[0] /* numOutputRows */).add(range_localEnd_0); /* 143 */ range_inputMetrics_0.incRecordsRead(range_localEnd_0); /* 144 */ range_taskContext_0.killTaskIfInterrupted(); /* 145 */ } /* 146 */ } /* 147 */ /* 148 */ } ``` ### Why are the changes needed? Improve query CPU performance. Added a micro benchmark query in `JoinBenchmark.scala`. Saw 1x of run time improvement: ``` OpenJDK 64-Bit Server VM 11.0.9+11-LTS on Linux 4.14.219-161.340.amzn2.x86_64 Intel(R) Xeon(R) CPU E5-2670 v2 2.50GHz broadcast nested loop join: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- broadcast nested loop join wholestage off 62922 63052 184 0.3 3000.3 1.0X broadcast nested loop join wholestage on 30946 30972 26 0.7 1475.6 2.0X ``` ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? * Added unit test in `WholeStageCodegenSuite.scala`, and existing unit tests for `BroadcastNestedLoopJoinExec`. * Updated golden files for several TCPDS query plans, as whole stage code-gen for `BroadcastNestedLoopJoinExec` is triggered. * Updated `JoinBenchmark-jdk11-results.txt ` and `JoinBenchmark-results.txt` with new benchmark result. Followed previous benchmark PRs - https://github.com/apache/spark/pull/27078 and https://github.com/apache/spark/pull/26003 to use same type of machine: ``` Amazon AWS EC2 type: r3.xlarge region: us-west-2 (Oregon) OS: Linux ``` Closes #31736 from c21/nested-join-exec. Authored-by: Cheng Su <chengsu@fb.com> Signed-off-by: Wenchen Fan <wenchen@databricks.com>
2021-03-09 06:45:43 -05:00
outer join w long wholestage off 2565 2582 23 8.2 122.3 1.0X
outer join w long wholestage on 1323 1339 19 15.8 63.1 1.9X
[SPARK-34620][SQL] Code-gen broadcast nested loop join (inner/cross) ### What changes were proposed in this pull request? `BroadcastNestedLoopJoinExec` does not have code-gen, and we can potentially boost the CPU performance for this operator if we add code-gen for it. https://databricks.com/blog/2017/02/16/processing-trillion-rows-per-second-single-machine-can-nested-loop-joins-fast.html also showed the evidence in one fork. The codegen for `BroadcastNestedLoopJoinExec` shared some code with `HashJoin`, and the interface `JoinCodegenSupport` is created to hold those common logic. This PR is only supporting inner and cross join. Other join types will be added later in followup PRs. Example query and generated code: ``` val df1 = spark.range(4).select($"id".as("k1")) val df2 = spark.range(3).select($"id".as("k2")) df1.join(df2, $"k1" + 1 =!= $"k2").explain("codegen") ``` ``` == Subtree 2 / 2 (maxMethodCodeSize:282; maxConstantPoolSize:203(0.31% used); numInnerClasses:0) == *(2) BroadcastNestedLoopJoin BuildRight, Inner, NOT ((k1#2L + 1) = k2#6L) :- *(2) Project [id#0L AS k1#2L] : +- *(2) Range (0, 4, step=1, splits=2) +- BroadcastExchange IdentityBroadcastMode, [id=#22] +- *(1) Project [id#4L AS k2#6L] +- *(1) Range (0, 3, step=1, splits=2) Generated code: /* 001 */ public Object generate(Object[] references) { /* 002 */ return new GeneratedIteratorForCodegenStage2(references); /* 003 */ } /* 004 */ /* 005 */ // codegenStageId=2 /* 006 */ final class GeneratedIteratorForCodegenStage2 extends org.apache.spark.sql.execution.BufferedRowIterator { /* 007 */ private Object[] references; /* 008 */ private scala.collection.Iterator[] inputs; /* 009 */ private boolean range_initRange_0; /* 010 */ private long range_nextIndex_0; /* 011 */ private TaskContext range_taskContext_0; /* 012 */ private InputMetrics range_inputMetrics_0; /* 013 */ private long range_batchEnd_0; /* 014 */ private long range_numElementsTodo_0; /* 015 */ private InternalRow[] bnlj_buildRowArray_0; /* 016 */ private org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[] range_mutableStateArray_0 = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[4]; /* 017 */ /* 018 */ public GeneratedIteratorForCodegenStage2(Object[] references) { /* 019 */ this.references = references; /* 020 */ } /* 021 */ /* 022 */ public void init(int index, scala.collection.Iterator[] inputs) { /* 023 */ partitionIndex = index; /* 024 */ this.inputs = inputs; /* 025 */ /* 026 */ range_taskContext_0 = TaskContext.get(); /* 027 */ range_inputMetrics_0 = range_taskContext_0.taskMetrics().inputMetrics(); /* 028 */ range_mutableStateArray_0[0] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 0); /* 029 */ range_mutableStateArray_0[1] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 0); /* 030 */ range_mutableStateArray_0[2] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 0); /* 031 */ bnlj_buildRowArray_0 = (InternalRow[]) ((org.apache.spark.broadcast.TorrentBroadcast) references[1] /* broadcastTerm */).value(); /* 032 */ range_mutableStateArray_0[3] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(2, 0); /* 033 */ /* 034 */ } /* 035 */ /* 036 */ private void bnlj_doConsume_0(long bnlj_expr_0_0) throws java.io.IOException { /* 037 */ for (int bnlj_arrayIndex_0 = 0; bnlj_arrayIndex_0 < bnlj_buildRowArray_0.length; bnlj_arrayIndex_0++) { /* 038 */ UnsafeRow bnlj_buildRow_0 = (UnsafeRow) bnlj_buildRowArray_0[bnlj_arrayIndex_0]; /* 039 */ /* 040 */ long bnlj_value_1 = bnlj_buildRow_0.getLong(0); /* 041 */ /* 042 */ long bnlj_value_4 = -1L; /* 043 */ /* 044 */ bnlj_value_4 = bnlj_expr_0_0 + 1L; /* 045 */ /* 046 */ boolean bnlj_value_3 = false; /* 047 */ bnlj_value_3 = bnlj_value_4 == bnlj_value_1; /* 048 */ boolean bnlj_value_2 = false; /* 049 */ bnlj_value_2 = !(bnlj_value_3); /* 050 */ if (!(false || !bnlj_value_2)) /* 051 */ { /* 052 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[2] /* numOutputRows */).add(1); /* 053 */ /* 054 */ range_mutableStateArray_0[3].reset(); /* 055 */ /* 056 */ range_mutableStateArray_0[3].write(0, bnlj_expr_0_0); /* 057 */ /* 058 */ range_mutableStateArray_0[3].write(1, bnlj_value_1); /* 059 */ append((range_mutableStateArray_0[3].getRow()).copy()); /* 060 */ /* 061 */ } /* 062 */ } /* 063 */ /* 064 */ } /* 065 */ /* 066 */ private void initRange(int idx) { /* 067 */ java.math.BigInteger index = java.math.BigInteger.valueOf(idx); /* 068 */ java.math.BigInteger numSlice = java.math.BigInteger.valueOf(2L); /* 069 */ java.math.BigInteger numElement = java.math.BigInteger.valueOf(4L); /* 070 */ java.math.BigInteger step = java.math.BigInteger.valueOf(1L); /* 071 */ java.math.BigInteger start = java.math.BigInteger.valueOf(0L); /* 072 */ long partitionEnd; /* 073 */ /* 074 */ java.math.BigInteger st = index.multiply(numElement).divide(numSlice).multiply(step).add(start); /* 075 */ if (st.compareTo(java.math.BigInteger.valueOf(Long.MAX_VALUE)) > 0) { /* 076 */ range_nextIndex_0 = Long.MAX_VALUE; /* 077 */ } else if (st.compareTo(java.math.BigInteger.valueOf(Long.MIN_VALUE)) < 0) { /* 078 */ range_nextIndex_0 = Long.MIN_VALUE; /* 079 */ } else { /* 080 */ range_nextIndex_0 = st.longValue(); /* 081 */ } /* 082 */ range_batchEnd_0 = range_nextIndex_0; /* 083 */ /* 084 */ java.math.BigInteger end = index.add(java.math.BigInteger.ONE).multiply(numElement).divide(numSlice) /* 085 */ .multiply(step).add(start); /* 086 */ if (end.compareTo(java.math.BigInteger.valueOf(Long.MAX_VALUE)) > 0) { /* 087 */ partitionEnd = Long.MAX_VALUE; /* 088 */ } else if (end.compareTo(java.math.BigInteger.valueOf(Long.MIN_VALUE)) < 0) { /* 089 */ partitionEnd = Long.MIN_VALUE; /* 090 */ } else { /* 091 */ partitionEnd = end.longValue(); /* 092 */ } /* 093 */ /* 094 */ java.math.BigInteger startToEnd = java.math.BigInteger.valueOf(partitionEnd).subtract( /* 095 */ java.math.BigInteger.valueOf(range_nextIndex_0)); /* 096 */ range_numElementsTodo_0 = startToEnd.divide(step).longValue(); /* 097 */ if (range_numElementsTodo_0 < 0) { /* 098 */ range_numElementsTodo_0 = 0; /* 099 */ } else if (startToEnd.remainder(step).compareTo(java.math.BigInteger.valueOf(0L)) != 0) { /* 100 */ range_numElementsTodo_0++; /* 101 */ } /* 102 */ } /* 103 */ /* 104 */ protected void processNext() throws java.io.IOException { /* 105 */ // initialize Range /* 106 */ if (!range_initRange_0) { /* 107 */ range_initRange_0 = true; /* 108 */ initRange(partitionIndex); /* 109 */ } /* 110 */ /* 111 */ while (true) { /* 112 */ if (range_nextIndex_0 == range_batchEnd_0) { /* 113 */ long range_nextBatchTodo_0; /* 114 */ if (range_numElementsTodo_0 > 1000L) { /* 115 */ range_nextBatchTodo_0 = 1000L; /* 116 */ range_numElementsTodo_0 -= 1000L; /* 117 */ } else { /* 118 */ range_nextBatchTodo_0 = range_numElementsTodo_0; /* 119 */ range_numElementsTodo_0 = 0; /* 120 */ if (range_nextBatchTodo_0 == 0) break; /* 121 */ } /* 122 */ range_batchEnd_0 += range_nextBatchTodo_0 * 1L; /* 123 */ } /* 124 */ /* 125 */ int range_localEnd_0 = (int)((range_batchEnd_0 - range_nextIndex_0) / 1L); /* 126 */ for (int range_localIdx_0 = 0; range_localIdx_0 < range_localEnd_0; range_localIdx_0++) { /* 127 */ long range_value_0 = ((long)range_localIdx_0 * 1L) + range_nextIndex_0; /* 128 */ /* 129 */ // common sub-expressions /* 130 */ /* 131 */ bnlj_doConsume_0(range_value_0); /* 132 */ /* 133 */ if (shouldStop()) { /* 134 */ range_nextIndex_0 = range_value_0 + 1L; /* 135 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[0] /* numOutputRows */).add(range_localIdx_0 + 1); /* 136 */ range_inputMetrics_0.incRecordsRead(range_localIdx_0 + 1); /* 137 */ return; /* 138 */ } /* 139 */ /* 140 */ } /* 141 */ range_nextIndex_0 = range_batchEnd_0; /* 142 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[0] /* numOutputRows */).add(range_localEnd_0); /* 143 */ range_inputMetrics_0.incRecordsRead(range_localEnd_0); /* 144 */ range_taskContext_0.killTaskIfInterrupted(); /* 145 */ } /* 146 */ } /* 147 */ /* 148 */ } ``` ### Why are the changes needed? Improve query CPU performance. Added a micro benchmark query in `JoinBenchmark.scala`. Saw 1x of run time improvement: ``` OpenJDK 64-Bit Server VM 11.0.9+11-LTS on Linux 4.14.219-161.340.amzn2.x86_64 Intel(R) Xeon(R) CPU E5-2670 v2 2.50GHz broadcast nested loop join: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- broadcast nested loop join wholestage off 62922 63052 184 0.3 3000.3 1.0X broadcast nested loop join wholestage on 30946 30972 26 0.7 1475.6 2.0X ``` ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? * Added unit test in `WholeStageCodegenSuite.scala`, and existing unit tests for `BroadcastNestedLoopJoinExec`. * Updated golden files for several TCPDS query plans, as whole stage code-gen for `BroadcastNestedLoopJoinExec` is triggered. * Updated `JoinBenchmark-jdk11-results.txt ` and `JoinBenchmark-results.txt` with new benchmark result. Followed previous benchmark PRs - https://github.com/apache/spark/pull/27078 and https://github.com/apache/spark/pull/26003 to use same type of machine: ``` Amazon AWS EC2 type: r3.xlarge region: us-west-2 (Oregon) OS: Linux ``` Closes #31736 from c21/nested-join-exec. Authored-by: Cheng Su <chengsu@fb.com> Signed-off-by: Wenchen Fan <wenchen@databricks.com>
2021-03-09 06:45:43 -05:00
OpenJDK 64-Bit Server VM 1.8.0_272-b10 on Linux 4.14.219-161.340.amzn2.x86_64
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
[SPARK-29320][TESTS] Compare `sql/core` module in JDK8/11 (Part 1) ### What changes were proposed in this pull request? This PR regenerates the `sql/core` benchmarks in JDK8/11 to compare the result. In general, we compare the ratio instead of the time. However, in this PR, the average time is compared. This PR should be considered as a rough comparison. **A. EXPECTED CASES(JDK11 is faster in general)** - [x] BloomFilterBenchmark (JDK11 is faster except one case) - [x] BuiltInDataSourceWriteBenchmark (JDK11 is faster at CSV/ORC) - [x] CSVBenchmark (JDK11 is faster except five cases) - [x] ColumnarBatchBenchmark (JDK11 is faster at `boolean`/`string` and some cases in `int`/`array`) - [x] DatasetBenchmark (JDK11 is faster with `string`, but is slower for `long` type) - [x] ExternalAppendOnlyUnsafeRowArrayBenchmark (JDK11 is faster except two cases) - [x] ExtractBenchmark (JDK11 is faster except HOUR/MINUTE/SECOND/MILLISECONDS/MICROSECONDS) - [x] HashedRelationMetricsBenchmark (JDK11 is faster) - [x] JSONBenchmark (JDK11 is much faster except eight cases) - [x] JoinBenchmark (JDK11 is faster except five cases) - [x] OrcNestedSchemaPruningBenchmark (JDK11 is faster in nine cases) - [x] PrimitiveArrayBenchmark (JDK11 is faster) - [x] SortBenchmark (JDK11 is faster except `Arrays.sort` case) - [x] UDFBenchmark (N/A, values are too small) - [x] UnsafeArrayDataBenchmark (JDK11 is faster except one case) - [x] WideTableBenchmark (JDK11 is faster except two cases) **B. CASES WE NEED TO INVESTIGATE MORE LATER** - [x] AggregateBenchmark (JDK11 is slower in general) - [x] CompressionSchemeBenchmark (JDK11 is slower in general except `string`) - [x] DataSourceReadBenchmark (JDK11 is slower in general) - [x] DateTimeBenchmark (JDK11 is slightly slower in general except `parsing`) - [x] MakeDateTimeBenchmark (JDK11 is slower except two cases) - [x] MiscBenchmark (JDK11 is slower except ten cases) - [x] OrcV2NestedSchemaPruningBenchmark (JDK11 is slower) - [x] ParquetNestedSchemaPruningBenchmark (JDK11 is slower except six cases) - [x] RangeBenchmark (JDK11 is slower except one case) `FilterPushdownBenchmark/InExpressionBenchmark/WideSchemaBenchmark` will be compared later because it took long timer. ### Why are the changes needed? According to the result, there are some difference between JDK8/JDK11. This will be a baseline for the future improvement and comparison. Also, as a reproducible environment, the following environment is used. - Instance: `r3.xlarge` - OS: `CentOS Linux release 7.5.1804 (Core)` - JDK: - `OpenJDK Runtime Environment (build 1.8.0_222-b10)` - `OpenJDK Runtime Environment 18.9 (build 11.0.4+11-LTS)` ### Does this PR introduce any user-facing change? No. ### How was this patch tested? This is a test-only PR. We need to run benchmark. Closes #26003 from dongjoon-hyun/SPARK-29320. Authored-by: Dongjoon Hyun <dhyun@apple.com> Signed-off-by: Dongjoon Hyun <dhyun@apple.com>
2019-10-03 11:58:25 -04:00
semi join w long: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
[SPARK-34620][SQL] Code-gen broadcast nested loop join (inner/cross) ### What changes were proposed in this pull request? `BroadcastNestedLoopJoinExec` does not have code-gen, and we can potentially boost the CPU performance for this operator if we add code-gen for it. https://databricks.com/blog/2017/02/16/processing-trillion-rows-per-second-single-machine-can-nested-loop-joins-fast.html also showed the evidence in one fork. The codegen for `BroadcastNestedLoopJoinExec` shared some code with `HashJoin`, and the interface `JoinCodegenSupport` is created to hold those common logic. This PR is only supporting inner and cross join. Other join types will be added later in followup PRs. Example query and generated code: ``` val df1 = spark.range(4).select($"id".as("k1")) val df2 = spark.range(3).select($"id".as("k2")) df1.join(df2, $"k1" + 1 =!= $"k2").explain("codegen") ``` ``` == Subtree 2 / 2 (maxMethodCodeSize:282; maxConstantPoolSize:203(0.31% used); numInnerClasses:0) == *(2) BroadcastNestedLoopJoin BuildRight, Inner, NOT ((k1#2L + 1) = k2#6L) :- *(2) Project [id#0L AS k1#2L] : +- *(2) Range (0, 4, step=1, splits=2) +- BroadcastExchange IdentityBroadcastMode, [id=#22] +- *(1) Project [id#4L AS k2#6L] +- *(1) Range (0, 3, step=1, splits=2) Generated code: /* 001 */ public Object generate(Object[] references) { /* 002 */ return new GeneratedIteratorForCodegenStage2(references); /* 003 */ } /* 004 */ /* 005 */ // codegenStageId=2 /* 006 */ final class GeneratedIteratorForCodegenStage2 extends org.apache.spark.sql.execution.BufferedRowIterator { /* 007 */ private Object[] references; /* 008 */ private scala.collection.Iterator[] inputs; /* 009 */ private boolean range_initRange_0; /* 010 */ private long range_nextIndex_0; /* 011 */ private TaskContext range_taskContext_0; /* 012 */ private InputMetrics range_inputMetrics_0; /* 013 */ private long range_batchEnd_0; /* 014 */ private long range_numElementsTodo_0; /* 015 */ private InternalRow[] bnlj_buildRowArray_0; /* 016 */ private org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[] range_mutableStateArray_0 = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[4]; /* 017 */ /* 018 */ public GeneratedIteratorForCodegenStage2(Object[] references) { /* 019 */ this.references = references; /* 020 */ } /* 021 */ /* 022 */ public void init(int index, scala.collection.Iterator[] inputs) { /* 023 */ partitionIndex = index; /* 024 */ this.inputs = inputs; /* 025 */ /* 026 */ range_taskContext_0 = TaskContext.get(); /* 027 */ range_inputMetrics_0 = range_taskContext_0.taskMetrics().inputMetrics(); /* 028 */ range_mutableStateArray_0[0] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 0); /* 029 */ range_mutableStateArray_0[1] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 0); /* 030 */ range_mutableStateArray_0[2] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 0); /* 031 */ bnlj_buildRowArray_0 = (InternalRow[]) ((org.apache.spark.broadcast.TorrentBroadcast) references[1] /* broadcastTerm */).value(); /* 032 */ range_mutableStateArray_0[3] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(2, 0); /* 033 */ /* 034 */ } /* 035 */ /* 036 */ private void bnlj_doConsume_0(long bnlj_expr_0_0) throws java.io.IOException { /* 037 */ for (int bnlj_arrayIndex_0 = 0; bnlj_arrayIndex_0 < bnlj_buildRowArray_0.length; bnlj_arrayIndex_0++) { /* 038 */ UnsafeRow bnlj_buildRow_0 = (UnsafeRow) bnlj_buildRowArray_0[bnlj_arrayIndex_0]; /* 039 */ /* 040 */ long bnlj_value_1 = bnlj_buildRow_0.getLong(0); /* 041 */ /* 042 */ long bnlj_value_4 = -1L; /* 043 */ /* 044 */ bnlj_value_4 = bnlj_expr_0_0 + 1L; /* 045 */ /* 046 */ boolean bnlj_value_3 = false; /* 047 */ bnlj_value_3 = bnlj_value_4 == bnlj_value_1; /* 048 */ boolean bnlj_value_2 = false; /* 049 */ bnlj_value_2 = !(bnlj_value_3); /* 050 */ if (!(false || !bnlj_value_2)) /* 051 */ { /* 052 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[2] /* numOutputRows */).add(1); /* 053 */ /* 054 */ range_mutableStateArray_0[3].reset(); /* 055 */ /* 056 */ range_mutableStateArray_0[3].write(0, bnlj_expr_0_0); /* 057 */ /* 058 */ range_mutableStateArray_0[3].write(1, bnlj_value_1); /* 059 */ append((range_mutableStateArray_0[3].getRow()).copy()); /* 060 */ /* 061 */ } /* 062 */ } /* 063 */ /* 064 */ } /* 065 */ /* 066 */ private void initRange(int idx) { /* 067 */ java.math.BigInteger index = java.math.BigInteger.valueOf(idx); /* 068 */ java.math.BigInteger numSlice = java.math.BigInteger.valueOf(2L); /* 069 */ java.math.BigInteger numElement = java.math.BigInteger.valueOf(4L); /* 070 */ java.math.BigInteger step = java.math.BigInteger.valueOf(1L); /* 071 */ java.math.BigInteger start = java.math.BigInteger.valueOf(0L); /* 072 */ long partitionEnd; /* 073 */ /* 074 */ java.math.BigInteger st = index.multiply(numElement).divide(numSlice).multiply(step).add(start); /* 075 */ if (st.compareTo(java.math.BigInteger.valueOf(Long.MAX_VALUE)) > 0) { /* 076 */ range_nextIndex_0 = Long.MAX_VALUE; /* 077 */ } else if (st.compareTo(java.math.BigInteger.valueOf(Long.MIN_VALUE)) < 0) { /* 078 */ range_nextIndex_0 = Long.MIN_VALUE; /* 079 */ } else { /* 080 */ range_nextIndex_0 = st.longValue(); /* 081 */ } /* 082 */ range_batchEnd_0 = range_nextIndex_0; /* 083 */ /* 084 */ java.math.BigInteger end = index.add(java.math.BigInteger.ONE).multiply(numElement).divide(numSlice) /* 085 */ .multiply(step).add(start); /* 086 */ if (end.compareTo(java.math.BigInteger.valueOf(Long.MAX_VALUE)) > 0) { /* 087 */ partitionEnd = Long.MAX_VALUE; /* 088 */ } else if (end.compareTo(java.math.BigInteger.valueOf(Long.MIN_VALUE)) < 0) { /* 089 */ partitionEnd = Long.MIN_VALUE; /* 090 */ } else { /* 091 */ partitionEnd = end.longValue(); /* 092 */ } /* 093 */ /* 094 */ java.math.BigInteger startToEnd = java.math.BigInteger.valueOf(partitionEnd).subtract( /* 095 */ java.math.BigInteger.valueOf(range_nextIndex_0)); /* 096 */ range_numElementsTodo_0 = startToEnd.divide(step).longValue(); /* 097 */ if (range_numElementsTodo_0 < 0) { /* 098 */ range_numElementsTodo_0 = 0; /* 099 */ } else if (startToEnd.remainder(step).compareTo(java.math.BigInteger.valueOf(0L)) != 0) { /* 100 */ range_numElementsTodo_0++; /* 101 */ } /* 102 */ } /* 103 */ /* 104 */ protected void processNext() throws java.io.IOException { /* 105 */ // initialize Range /* 106 */ if (!range_initRange_0) { /* 107 */ range_initRange_0 = true; /* 108 */ initRange(partitionIndex); /* 109 */ } /* 110 */ /* 111 */ while (true) { /* 112 */ if (range_nextIndex_0 == range_batchEnd_0) { /* 113 */ long range_nextBatchTodo_0; /* 114 */ if (range_numElementsTodo_0 > 1000L) { /* 115 */ range_nextBatchTodo_0 = 1000L; /* 116 */ range_numElementsTodo_0 -= 1000L; /* 117 */ } else { /* 118 */ range_nextBatchTodo_0 = range_numElementsTodo_0; /* 119 */ range_numElementsTodo_0 = 0; /* 120 */ if (range_nextBatchTodo_0 == 0) break; /* 121 */ } /* 122 */ range_batchEnd_0 += range_nextBatchTodo_0 * 1L; /* 123 */ } /* 124 */ /* 125 */ int range_localEnd_0 = (int)((range_batchEnd_0 - range_nextIndex_0) / 1L); /* 126 */ for (int range_localIdx_0 = 0; range_localIdx_0 < range_localEnd_0; range_localIdx_0++) { /* 127 */ long range_value_0 = ((long)range_localIdx_0 * 1L) + range_nextIndex_0; /* 128 */ /* 129 */ // common sub-expressions /* 130 */ /* 131 */ bnlj_doConsume_0(range_value_0); /* 132 */ /* 133 */ if (shouldStop()) { /* 134 */ range_nextIndex_0 = range_value_0 + 1L; /* 135 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[0] /* numOutputRows */).add(range_localIdx_0 + 1); /* 136 */ range_inputMetrics_0.incRecordsRead(range_localIdx_0 + 1); /* 137 */ return; /* 138 */ } /* 139 */ /* 140 */ } /* 141 */ range_nextIndex_0 = range_batchEnd_0; /* 142 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[0] /* numOutputRows */).add(range_localEnd_0); /* 143 */ range_inputMetrics_0.incRecordsRead(range_localEnd_0); /* 144 */ range_taskContext_0.killTaskIfInterrupted(); /* 145 */ } /* 146 */ } /* 147 */ /* 148 */ } ``` ### Why are the changes needed? Improve query CPU performance. Added a micro benchmark query in `JoinBenchmark.scala`. Saw 1x of run time improvement: ``` OpenJDK 64-Bit Server VM 11.0.9+11-LTS on Linux 4.14.219-161.340.amzn2.x86_64 Intel(R) Xeon(R) CPU E5-2670 v2 2.50GHz broadcast nested loop join: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- broadcast nested loop join wholestage off 62922 63052 184 0.3 3000.3 1.0X broadcast nested loop join wholestage on 30946 30972 26 0.7 1475.6 2.0X ``` ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? * Added unit test in `WholeStageCodegenSuite.scala`, and existing unit tests for `BroadcastNestedLoopJoinExec`. * Updated golden files for several TCPDS query plans, as whole stage code-gen for `BroadcastNestedLoopJoinExec` is triggered. * Updated `JoinBenchmark-jdk11-results.txt ` and `JoinBenchmark-results.txt` with new benchmark result. Followed previous benchmark PRs - https://github.com/apache/spark/pull/27078 and https://github.com/apache/spark/pull/26003 to use same type of machine: ``` Amazon AWS EC2 type: r3.xlarge region: us-west-2 (Oregon) OS: Linux ``` Closes #31736 from c21/nested-join-exec. Authored-by: Cheng Su <chengsu@fb.com> Signed-off-by: Wenchen Fan <wenchen@databricks.com>
2021-03-09 06:45:43 -05:00
semi join w long wholestage off 1754 1761 11 12.0 83.6 1.0X
semi join w long wholestage on 846 859 21 24.8 40.3 2.1X
[SPARK-34620][SQL] Code-gen broadcast nested loop join (inner/cross) ### What changes were proposed in this pull request? `BroadcastNestedLoopJoinExec` does not have code-gen, and we can potentially boost the CPU performance for this operator if we add code-gen for it. https://databricks.com/blog/2017/02/16/processing-trillion-rows-per-second-single-machine-can-nested-loop-joins-fast.html also showed the evidence in one fork. The codegen for `BroadcastNestedLoopJoinExec` shared some code with `HashJoin`, and the interface `JoinCodegenSupport` is created to hold those common logic. This PR is only supporting inner and cross join. Other join types will be added later in followup PRs. Example query and generated code: ``` val df1 = spark.range(4).select($"id".as("k1")) val df2 = spark.range(3).select($"id".as("k2")) df1.join(df2, $"k1" + 1 =!= $"k2").explain("codegen") ``` ``` == Subtree 2 / 2 (maxMethodCodeSize:282; maxConstantPoolSize:203(0.31% used); numInnerClasses:0) == *(2) BroadcastNestedLoopJoin BuildRight, Inner, NOT ((k1#2L + 1) = k2#6L) :- *(2) Project [id#0L AS k1#2L] : +- *(2) Range (0, 4, step=1, splits=2) +- BroadcastExchange IdentityBroadcastMode, [id=#22] +- *(1) Project [id#4L AS k2#6L] +- *(1) Range (0, 3, step=1, splits=2) Generated code: /* 001 */ public Object generate(Object[] references) { /* 002 */ return new GeneratedIteratorForCodegenStage2(references); /* 003 */ } /* 004 */ /* 005 */ // codegenStageId=2 /* 006 */ final class GeneratedIteratorForCodegenStage2 extends org.apache.spark.sql.execution.BufferedRowIterator { /* 007 */ private Object[] references; /* 008 */ private scala.collection.Iterator[] inputs; /* 009 */ private boolean range_initRange_0; /* 010 */ private long range_nextIndex_0; /* 011 */ private TaskContext range_taskContext_0; /* 012 */ private InputMetrics range_inputMetrics_0; /* 013 */ private long range_batchEnd_0; /* 014 */ private long range_numElementsTodo_0; /* 015 */ private InternalRow[] bnlj_buildRowArray_0; /* 016 */ private org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[] range_mutableStateArray_0 = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[4]; /* 017 */ /* 018 */ public GeneratedIteratorForCodegenStage2(Object[] references) { /* 019 */ this.references = references; /* 020 */ } /* 021 */ /* 022 */ public void init(int index, scala.collection.Iterator[] inputs) { /* 023 */ partitionIndex = index; /* 024 */ this.inputs = inputs; /* 025 */ /* 026 */ range_taskContext_0 = TaskContext.get(); /* 027 */ range_inputMetrics_0 = range_taskContext_0.taskMetrics().inputMetrics(); /* 028 */ range_mutableStateArray_0[0] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 0); /* 029 */ range_mutableStateArray_0[1] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 0); /* 030 */ range_mutableStateArray_0[2] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 0); /* 031 */ bnlj_buildRowArray_0 = (InternalRow[]) ((org.apache.spark.broadcast.TorrentBroadcast) references[1] /* broadcastTerm */).value(); /* 032 */ range_mutableStateArray_0[3] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(2, 0); /* 033 */ /* 034 */ } /* 035 */ /* 036 */ private void bnlj_doConsume_0(long bnlj_expr_0_0) throws java.io.IOException { /* 037 */ for (int bnlj_arrayIndex_0 = 0; bnlj_arrayIndex_0 < bnlj_buildRowArray_0.length; bnlj_arrayIndex_0++) { /* 038 */ UnsafeRow bnlj_buildRow_0 = (UnsafeRow) bnlj_buildRowArray_0[bnlj_arrayIndex_0]; /* 039 */ /* 040 */ long bnlj_value_1 = bnlj_buildRow_0.getLong(0); /* 041 */ /* 042 */ long bnlj_value_4 = -1L; /* 043 */ /* 044 */ bnlj_value_4 = bnlj_expr_0_0 + 1L; /* 045 */ /* 046 */ boolean bnlj_value_3 = false; /* 047 */ bnlj_value_3 = bnlj_value_4 == bnlj_value_1; /* 048 */ boolean bnlj_value_2 = false; /* 049 */ bnlj_value_2 = !(bnlj_value_3); /* 050 */ if (!(false || !bnlj_value_2)) /* 051 */ { /* 052 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[2] /* numOutputRows */).add(1); /* 053 */ /* 054 */ range_mutableStateArray_0[3].reset(); /* 055 */ /* 056 */ range_mutableStateArray_0[3].write(0, bnlj_expr_0_0); /* 057 */ /* 058 */ range_mutableStateArray_0[3].write(1, bnlj_value_1); /* 059 */ append((range_mutableStateArray_0[3].getRow()).copy()); /* 060 */ /* 061 */ } /* 062 */ } /* 063 */ /* 064 */ } /* 065 */ /* 066 */ private void initRange(int idx) { /* 067 */ java.math.BigInteger index = java.math.BigInteger.valueOf(idx); /* 068 */ java.math.BigInteger numSlice = java.math.BigInteger.valueOf(2L); /* 069 */ java.math.BigInteger numElement = java.math.BigInteger.valueOf(4L); /* 070 */ java.math.BigInteger step = java.math.BigInteger.valueOf(1L); /* 071 */ java.math.BigInteger start = java.math.BigInteger.valueOf(0L); /* 072 */ long partitionEnd; /* 073 */ /* 074 */ java.math.BigInteger st = index.multiply(numElement).divide(numSlice).multiply(step).add(start); /* 075 */ if (st.compareTo(java.math.BigInteger.valueOf(Long.MAX_VALUE)) > 0) { /* 076 */ range_nextIndex_0 = Long.MAX_VALUE; /* 077 */ } else if (st.compareTo(java.math.BigInteger.valueOf(Long.MIN_VALUE)) < 0) { /* 078 */ range_nextIndex_0 = Long.MIN_VALUE; /* 079 */ } else { /* 080 */ range_nextIndex_0 = st.longValue(); /* 081 */ } /* 082 */ range_batchEnd_0 = range_nextIndex_0; /* 083 */ /* 084 */ java.math.BigInteger end = index.add(java.math.BigInteger.ONE).multiply(numElement).divide(numSlice) /* 085 */ .multiply(step).add(start); /* 086 */ if (end.compareTo(java.math.BigInteger.valueOf(Long.MAX_VALUE)) > 0) { /* 087 */ partitionEnd = Long.MAX_VALUE; /* 088 */ } else if (end.compareTo(java.math.BigInteger.valueOf(Long.MIN_VALUE)) < 0) { /* 089 */ partitionEnd = Long.MIN_VALUE; /* 090 */ } else { /* 091 */ partitionEnd = end.longValue(); /* 092 */ } /* 093 */ /* 094 */ java.math.BigInteger startToEnd = java.math.BigInteger.valueOf(partitionEnd).subtract( /* 095 */ java.math.BigInteger.valueOf(range_nextIndex_0)); /* 096 */ range_numElementsTodo_0 = startToEnd.divide(step).longValue(); /* 097 */ if (range_numElementsTodo_0 < 0) { /* 098 */ range_numElementsTodo_0 = 0; /* 099 */ } else if (startToEnd.remainder(step).compareTo(java.math.BigInteger.valueOf(0L)) != 0) { /* 100 */ range_numElementsTodo_0++; /* 101 */ } /* 102 */ } /* 103 */ /* 104 */ protected void processNext() throws java.io.IOException { /* 105 */ // initialize Range /* 106 */ if (!range_initRange_0) { /* 107 */ range_initRange_0 = true; /* 108 */ initRange(partitionIndex); /* 109 */ } /* 110 */ /* 111 */ while (true) { /* 112 */ if (range_nextIndex_0 == range_batchEnd_0) { /* 113 */ long range_nextBatchTodo_0; /* 114 */ if (range_numElementsTodo_0 > 1000L) { /* 115 */ range_nextBatchTodo_0 = 1000L; /* 116 */ range_numElementsTodo_0 -= 1000L; /* 117 */ } else { /* 118 */ range_nextBatchTodo_0 = range_numElementsTodo_0; /* 119 */ range_numElementsTodo_0 = 0; /* 120 */ if (range_nextBatchTodo_0 == 0) break; /* 121 */ } /* 122 */ range_batchEnd_0 += range_nextBatchTodo_0 * 1L; /* 123 */ } /* 124 */ /* 125 */ int range_localEnd_0 = (int)((range_batchEnd_0 - range_nextIndex_0) / 1L); /* 126 */ for (int range_localIdx_0 = 0; range_localIdx_0 < range_localEnd_0; range_localIdx_0++) { /* 127 */ long range_value_0 = ((long)range_localIdx_0 * 1L) + range_nextIndex_0; /* 128 */ /* 129 */ // common sub-expressions /* 130 */ /* 131 */ bnlj_doConsume_0(range_value_0); /* 132 */ /* 133 */ if (shouldStop()) { /* 134 */ range_nextIndex_0 = range_value_0 + 1L; /* 135 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[0] /* numOutputRows */).add(range_localIdx_0 + 1); /* 136 */ range_inputMetrics_0.incRecordsRead(range_localIdx_0 + 1); /* 137 */ return; /* 138 */ } /* 139 */ /* 140 */ } /* 141 */ range_nextIndex_0 = range_batchEnd_0; /* 142 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[0] /* numOutputRows */).add(range_localEnd_0); /* 143 */ range_inputMetrics_0.incRecordsRead(range_localEnd_0); /* 144 */ range_taskContext_0.killTaskIfInterrupted(); /* 145 */ } /* 146 */ } /* 147 */ /* 148 */ } ``` ### Why are the changes needed? Improve query CPU performance. Added a micro benchmark query in `JoinBenchmark.scala`. Saw 1x of run time improvement: ``` OpenJDK 64-Bit Server VM 11.0.9+11-LTS on Linux 4.14.219-161.340.amzn2.x86_64 Intel(R) Xeon(R) CPU E5-2670 v2 2.50GHz broadcast nested loop join: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- broadcast nested loop join wholestage off 62922 63052 184 0.3 3000.3 1.0X broadcast nested loop join wholestage on 30946 30972 26 0.7 1475.6 2.0X ``` ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? * Added unit test in `WholeStageCodegenSuite.scala`, and existing unit tests for `BroadcastNestedLoopJoinExec`. * Updated golden files for several TCPDS query plans, as whole stage code-gen for `BroadcastNestedLoopJoinExec` is triggered. * Updated `JoinBenchmark-jdk11-results.txt ` and `JoinBenchmark-results.txt` with new benchmark result. Followed previous benchmark PRs - https://github.com/apache/spark/pull/27078 and https://github.com/apache/spark/pull/26003 to use same type of machine: ``` Amazon AWS EC2 type: r3.xlarge region: us-west-2 (Oregon) OS: Linux ``` Closes #31736 from c21/nested-join-exec. Authored-by: Cheng Su <chengsu@fb.com> Signed-off-by: Wenchen Fan <wenchen@databricks.com>
2021-03-09 06:45:43 -05:00
OpenJDK 64-Bit Server VM 1.8.0_272-b10 on Linux 4.14.219-161.340.amzn2.x86_64
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
[SPARK-29320][TESTS] Compare `sql/core` module in JDK8/11 (Part 1) ### What changes were proposed in this pull request? This PR regenerates the `sql/core` benchmarks in JDK8/11 to compare the result. In general, we compare the ratio instead of the time. However, in this PR, the average time is compared. This PR should be considered as a rough comparison. **A. EXPECTED CASES(JDK11 is faster in general)** - [x] BloomFilterBenchmark (JDK11 is faster except one case) - [x] BuiltInDataSourceWriteBenchmark (JDK11 is faster at CSV/ORC) - [x] CSVBenchmark (JDK11 is faster except five cases) - [x] ColumnarBatchBenchmark (JDK11 is faster at `boolean`/`string` and some cases in `int`/`array`) - [x] DatasetBenchmark (JDK11 is faster with `string`, but is slower for `long` type) - [x] ExternalAppendOnlyUnsafeRowArrayBenchmark (JDK11 is faster except two cases) - [x] ExtractBenchmark (JDK11 is faster except HOUR/MINUTE/SECOND/MILLISECONDS/MICROSECONDS) - [x] HashedRelationMetricsBenchmark (JDK11 is faster) - [x] JSONBenchmark (JDK11 is much faster except eight cases) - [x] JoinBenchmark (JDK11 is faster except five cases) - [x] OrcNestedSchemaPruningBenchmark (JDK11 is faster in nine cases) - [x] PrimitiveArrayBenchmark (JDK11 is faster) - [x] SortBenchmark (JDK11 is faster except `Arrays.sort` case) - [x] UDFBenchmark (N/A, values are too small) - [x] UnsafeArrayDataBenchmark (JDK11 is faster except one case) - [x] WideTableBenchmark (JDK11 is faster except two cases) **B. CASES WE NEED TO INVESTIGATE MORE LATER** - [x] AggregateBenchmark (JDK11 is slower in general) - [x] CompressionSchemeBenchmark (JDK11 is slower in general except `string`) - [x] DataSourceReadBenchmark (JDK11 is slower in general) - [x] DateTimeBenchmark (JDK11 is slightly slower in general except `parsing`) - [x] MakeDateTimeBenchmark (JDK11 is slower except two cases) - [x] MiscBenchmark (JDK11 is slower except ten cases) - [x] OrcV2NestedSchemaPruningBenchmark (JDK11 is slower) - [x] ParquetNestedSchemaPruningBenchmark (JDK11 is slower except six cases) - [x] RangeBenchmark (JDK11 is slower except one case) `FilterPushdownBenchmark/InExpressionBenchmark/WideSchemaBenchmark` will be compared later because it took long timer. ### Why are the changes needed? According to the result, there are some difference between JDK8/JDK11. This will be a baseline for the future improvement and comparison. Also, as a reproducible environment, the following environment is used. - Instance: `r3.xlarge` - OS: `CentOS Linux release 7.5.1804 (Core)` - JDK: - `OpenJDK Runtime Environment (build 1.8.0_222-b10)` - `OpenJDK Runtime Environment 18.9 (build 11.0.4+11-LTS)` ### Does this PR introduce any user-facing change? No. ### How was this patch tested? This is a test-only PR. We need to run benchmark. Closes #26003 from dongjoon-hyun/SPARK-29320. Authored-by: Dongjoon Hyun <dhyun@apple.com> Signed-off-by: Dongjoon Hyun <dhyun@apple.com>
2019-10-03 11:58:25 -04:00
sort merge join: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
[SPARK-34620][SQL] Code-gen broadcast nested loop join (inner/cross) ### What changes were proposed in this pull request? `BroadcastNestedLoopJoinExec` does not have code-gen, and we can potentially boost the CPU performance for this operator if we add code-gen for it. https://databricks.com/blog/2017/02/16/processing-trillion-rows-per-second-single-machine-can-nested-loop-joins-fast.html also showed the evidence in one fork. The codegen for `BroadcastNestedLoopJoinExec` shared some code with `HashJoin`, and the interface `JoinCodegenSupport` is created to hold those common logic. This PR is only supporting inner and cross join. Other join types will be added later in followup PRs. Example query and generated code: ``` val df1 = spark.range(4).select($"id".as("k1")) val df2 = spark.range(3).select($"id".as("k2")) df1.join(df2, $"k1" + 1 =!= $"k2").explain("codegen") ``` ``` == Subtree 2 / 2 (maxMethodCodeSize:282; maxConstantPoolSize:203(0.31% used); numInnerClasses:0) == *(2) BroadcastNestedLoopJoin BuildRight, Inner, NOT ((k1#2L + 1) = k2#6L) :- *(2) Project [id#0L AS k1#2L] : +- *(2) Range (0, 4, step=1, splits=2) +- BroadcastExchange IdentityBroadcastMode, [id=#22] +- *(1) Project [id#4L AS k2#6L] +- *(1) Range (0, 3, step=1, splits=2) Generated code: /* 001 */ public Object generate(Object[] references) { /* 002 */ return new GeneratedIteratorForCodegenStage2(references); /* 003 */ } /* 004 */ /* 005 */ // codegenStageId=2 /* 006 */ final class GeneratedIteratorForCodegenStage2 extends org.apache.spark.sql.execution.BufferedRowIterator { /* 007 */ private Object[] references; /* 008 */ private scala.collection.Iterator[] inputs; /* 009 */ private boolean range_initRange_0; /* 010 */ private long range_nextIndex_0; /* 011 */ private TaskContext range_taskContext_0; /* 012 */ private InputMetrics range_inputMetrics_0; /* 013 */ private long range_batchEnd_0; /* 014 */ private long range_numElementsTodo_0; /* 015 */ private InternalRow[] bnlj_buildRowArray_0; /* 016 */ private org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[] range_mutableStateArray_0 = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[4]; /* 017 */ /* 018 */ public GeneratedIteratorForCodegenStage2(Object[] references) { /* 019 */ this.references = references; /* 020 */ } /* 021 */ /* 022 */ public void init(int index, scala.collection.Iterator[] inputs) { /* 023 */ partitionIndex = index; /* 024 */ this.inputs = inputs; /* 025 */ /* 026 */ range_taskContext_0 = TaskContext.get(); /* 027 */ range_inputMetrics_0 = range_taskContext_0.taskMetrics().inputMetrics(); /* 028 */ range_mutableStateArray_0[0] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 0); /* 029 */ range_mutableStateArray_0[1] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 0); /* 030 */ range_mutableStateArray_0[2] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 0); /* 031 */ bnlj_buildRowArray_0 = (InternalRow[]) ((org.apache.spark.broadcast.TorrentBroadcast) references[1] /* broadcastTerm */).value(); /* 032 */ range_mutableStateArray_0[3] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(2, 0); /* 033 */ /* 034 */ } /* 035 */ /* 036 */ private void bnlj_doConsume_0(long bnlj_expr_0_0) throws java.io.IOException { /* 037 */ for (int bnlj_arrayIndex_0 = 0; bnlj_arrayIndex_0 < bnlj_buildRowArray_0.length; bnlj_arrayIndex_0++) { /* 038 */ UnsafeRow bnlj_buildRow_0 = (UnsafeRow) bnlj_buildRowArray_0[bnlj_arrayIndex_0]; /* 039 */ /* 040 */ long bnlj_value_1 = bnlj_buildRow_0.getLong(0); /* 041 */ /* 042 */ long bnlj_value_4 = -1L; /* 043 */ /* 044 */ bnlj_value_4 = bnlj_expr_0_0 + 1L; /* 045 */ /* 046 */ boolean bnlj_value_3 = false; /* 047 */ bnlj_value_3 = bnlj_value_4 == bnlj_value_1; /* 048 */ boolean bnlj_value_2 = false; /* 049 */ bnlj_value_2 = !(bnlj_value_3); /* 050 */ if (!(false || !bnlj_value_2)) /* 051 */ { /* 052 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[2] /* numOutputRows */).add(1); /* 053 */ /* 054 */ range_mutableStateArray_0[3].reset(); /* 055 */ /* 056 */ range_mutableStateArray_0[3].write(0, bnlj_expr_0_0); /* 057 */ /* 058 */ range_mutableStateArray_0[3].write(1, bnlj_value_1); /* 059 */ append((range_mutableStateArray_0[3].getRow()).copy()); /* 060 */ /* 061 */ } /* 062 */ } /* 063 */ /* 064 */ } /* 065 */ /* 066 */ private void initRange(int idx) { /* 067 */ java.math.BigInteger index = java.math.BigInteger.valueOf(idx); /* 068 */ java.math.BigInteger numSlice = java.math.BigInteger.valueOf(2L); /* 069 */ java.math.BigInteger numElement = java.math.BigInteger.valueOf(4L); /* 070 */ java.math.BigInteger step = java.math.BigInteger.valueOf(1L); /* 071 */ java.math.BigInteger start = java.math.BigInteger.valueOf(0L); /* 072 */ long partitionEnd; /* 073 */ /* 074 */ java.math.BigInteger st = index.multiply(numElement).divide(numSlice).multiply(step).add(start); /* 075 */ if (st.compareTo(java.math.BigInteger.valueOf(Long.MAX_VALUE)) > 0) { /* 076 */ range_nextIndex_0 = Long.MAX_VALUE; /* 077 */ } else if (st.compareTo(java.math.BigInteger.valueOf(Long.MIN_VALUE)) < 0) { /* 078 */ range_nextIndex_0 = Long.MIN_VALUE; /* 079 */ } else { /* 080 */ range_nextIndex_0 = st.longValue(); /* 081 */ } /* 082 */ range_batchEnd_0 = range_nextIndex_0; /* 083 */ /* 084 */ java.math.BigInteger end = index.add(java.math.BigInteger.ONE).multiply(numElement).divide(numSlice) /* 085 */ .multiply(step).add(start); /* 086 */ if (end.compareTo(java.math.BigInteger.valueOf(Long.MAX_VALUE)) > 0) { /* 087 */ partitionEnd = Long.MAX_VALUE; /* 088 */ } else if (end.compareTo(java.math.BigInteger.valueOf(Long.MIN_VALUE)) < 0) { /* 089 */ partitionEnd = Long.MIN_VALUE; /* 090 */ } else { /* 091 */ partitionEnd = end.longValue(); /* 092 */ } /* 093 */ /* 094 */ java.math.BigInteger startToEnd = java.math.BigInteger.valueOf(partitionEnd).subtract( /* 095 */ java.math.BigInteger.valueOf(range_nextIndex_0)); /* 096 */ range_numElementsTodo_0 = startToEnd.divide(step).longValue(); /* 097 */ if (range_numElementsTodo_0 < 0) { /* 098 */ range_numElementsTodo_0 = 0; /* 099 */ } else if (startToEnd.remainder(step).compareTo(java.math.BigInteger.valueOf(0L)) != 0) { /* 100 */ range_numElementsTodo_0++; /* 101 */ } /* 102 */ } /* 103 */ /* 104 */ protected void processNext() throws java.io.IOException { /* 105 */ // initialize Range /* 106 */ if (!range_initRange_0) { /* 107 */ range_initRange_0 = true; /* 108 */ initRange(partitionIndex); /* 109 */ } /* 110 */ /* 111 */ while (true) { /* 112 */ if (range_nextIndex_0 == range_batchEnd_0) { /* 113 */ long range_nextBatchTodo_0; /* 114 */ if (range_numElementsTodo_0 > 1000L) { /* 115 */ range_nextBatchTodo_0 = 1000L; /* 116 */ range_numElementsTodo_0 -= 1000L; /* 117 */ } else { /* 118 */ range_nextBatchTodo_0 = range_numElementsTodo_0; /* 119 */ range_numElementsTodo_0 = 0; /* 120 */ if (range_nextBatchTodo_0 == 0) break; /* 121 */ } /* 122 */ range_batchEnd_0 += range_nextBatchTodo_0 * 1L; /* 123 */ } /* 124 */ /* 125 */ int range_localEnd_0 = (int)((range_batchEnd_0 - range_nextIndex_0) / 1L); /* 126 */ for (int range_localIdx_0 = 0; range_localIdx_0 < range_localEnd_0; range_localIdx_0++) { /* 127 */ long range_value_0 = ((long)range_localIdx_0 * 1L) + range_nextIndex_0; /* 128 */ /* 129 */ // common sub-expressions /* 130 */ /* 131 */ bnlj_doConsume_0(range_value_0); /* 132 */ /* 133 */ if (shouldStop()) { /* 134 */ range_nextIndex_0 = range_value_0 + 1L; /* 135 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[0] /* numOutputRows */).add(range_localIdx_0 + 1); /* 136 */ range_inputMetrics_0.incRecordsRead(range_localIdx_0 + 1); /* 137 */ return; /* 138 */ } /* 139 */ /* 140 */ } /* 141 */ range_nextIndex_0 = range_batchEnd_0; /* 142 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[0] /* numOutputRows */).add(range_localEnd_0); /* 143 */ range_inputMetrics_0.incRecordsRead(range_localEnd_0); /* 144 */ range_taskContext_0.killTaskIfInterrupted(); /* 145 */ } /* 146 */ } /* 147 */ /* 148 */ } ``` ### Why are the changes needed? Improve query CPU performance. Added a micro benchmark query in `JoinBenchmark.scala`. Saw 1x of run time improvement: ``` OpenJDK 64-Bit Server VM 11.0.9+11-LTS on Linux 4.14.219-161.340.amzn2.x86_64 Intel(R) Xeon(R) CPU E5-2670 v2 2.50GHz broadcast nested loop join: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- broadcast nested loop join wholestage off 62922 63052 184 0.3 3000.3 1.0X broadcast nested loop join wholestage on 30946 30972 26 0.7 1475.6 2.0X ``` ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? * Added unit test in `WholeStageCodegenSuite.scala`, and existing unit tests for `BroadcastNestedLoopJoinExec`. * Updated golden files for several TCPDS query plans, as whole stage code-gen for `BroadcastNestedLoopJoinExec` is triggered. * Updated `JoinBenchmark-jdk11-results.txt ` and `JoinBenchmark-results.txt` with new benchmark result. Followed previous benchmark PRs - https://github.com/apache/spark/pull/27078 and https://github.com/apache/spark/pull/26003 to use same type of machine: ``` Amazon AWS EC2 type: r3.xlarge region: us-west-2 (Oregon) OS: Linux ``` Closes #31736 from c21/nested-join-exec. Authored-by: Cheng Su <chengsu@fb.com> Signed-off-by: Wenchen Fan <wenchen@databricks.com>
2021-03-09 06:45:43 -05:00
sort merge join wholestage off 1283 1294 16 1.6 611.8 1.0X
sort merge join wholestage on 1185 1210 26 1.8 565.0 1.1X
[SPARK-34620][SQL] Code-gen broadcast nested loop join (inner/cross) ### What changes were proposed in this pull request? `BroadcastNestedLoopJoinExec` does not have code-gen, and we can potentially boost the CPU performance for this operator if we add code-gen for it. https://databricks.com/blog/2017/02/16/processing-trillion-rows-per-second-single-machine-can-nested-loop-joins-fast.html also showed the evidence in one fork. The codegen for `BroadcastNestedLoopJoinExec` shared some code with `HashJoin`, and the interface `JoinCodegenSupport` is created to hold those common logic. This PR is only supporting inner and cross join. Other join types will be added later in followup PRs. Example query and generated code: ``` val df1 = spark.range(4).select($"id".as("k1")) val df2 = spark.range(3).select($"id".as("k2")) df1.join(df2, $"k1" + 1 =!= $"k2").explain("codegen") ``` ``` == Subtree 2 / 2 (maxMethodCodeSize:282; maxConstantPoolSize:203(0.31% used); numInnerClasses:0) == *(2) BroadcastNestedLoopJoin BuildRight, Inner, NOT ((k1#2L + 1) = k2#6L) :- *(2) Project [id#0L AS k1#2L] : +- *(2) Range (0, 4, step=1, splits=2) +- BroadcastExchange IdentityBroadcastMode, [id=#22] +- *(1) Project [id#4L AS k2#6L] +- *(1) Range (0, 3, step=1, splits=2) Generated code: /* 001 */ public Object generate(Object[] references) { /* 002 */ return new GeneratedIteratorForCodegenStage2(references); /* 003 */ } /* 004 */ /* 005 */ // codegenStageId=2 /* 006 */ final class GeneratedIteratorForCodegenStage2 extends org.apache.spark.sql.execution.BufferedRowIterator { /* 007 */ private Object[] references; /* 008 */ private scala.collection.Iterator[] inputs; /* 009 */ private boolean range_initRange_0; /* 010 */ private long range_nextIndex_0; /* 011 */ private TaskContext range_taskContext_0; /* 012 */ private InputMetrics range_inputMetrics_0; /* 013 */ private long range_batchEnd_0; /* 014 */ private long range_numElementsTodo_0; /* 015 */ private InternalRow[] bnlj_buildRowArray_0; /* 016 */ private org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[] range_mutableStateArray_0 = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[4]; /* 017 */ /* 018 */ public GeneratedIteratorForCodegenStage2(Object[] references) { /* 019 */ this.references = references; /* 020 */ } /* 021 */ /* 022 */ public void init(int index, scala.collection.Iterator[] inputs) { /* 023 */ partitionIndex = index; /* 024 */ this.inputs = inputs; /* 025 */ /* 026 */ range_taskContext_0 = TaskContext.get(); /* 027 */ range_inputMetrics_0 = range_taskContext_0.taskMetrics().inputMetrics(); /* 028 */ range_mutableStateArray_0[0] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 0); /* 029 */ range_mutableStateArray_0[1] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 0); /* 030 */ range_mutableStateArray_0[2] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 0); /* 031 */ bnlj_buildRowArray_0 = (InternalRow[]) ((org.apache.spark.broadcast.TorrentBroadcast) references[1] /* broadcastTerm */).value(); /* 032 */ range_mutableStateArray_0[3] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(2, 0); /* 033 */ /* 034 */ } /* 035 */ /* 036 */ private void bnlj_doConsume_0(long bnlj_expr_0_0) throws java.io.IOException { /* 037 */ for (int bnlj_arrayIndex_0 = 0; bnlj_arrayIndex_0 < bnlj_buildRowArray_0.length; bnlj_arrayIndex_0++) { /* 038 */ UnsafeRow bnlj_buildRow_0 = (UnsafeRow) bnlj_buildRowArray_0[bnlj_arrayIndex_0]; /* 039 */ /* 040 */ long bnlj_value_1 = bnlj_buildRow_0.getLong(0); /* 041 */ /* 042 */ long bnlj_value_4 = -1L; /* 043 */ /* 044 */ bnlj_value_4 = bnlj_expr_0_0 + 1L; /* 045 */ /* 046 */ boolean bnlj_value_3 = false; /* 047 */ bnlj_value_3 = bnlj_value_4 == bnlj_value_1; /* 048 */ boolean bnlj_value_2 = false; /* 049 */ bnlj_value_2 = !(bnlj_value_3); /* 050 */ if (!(false || !bnlj_value_2)) /* 051 */ { /* 052 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[2] /* numOutputRows */).add(1); /* 053 */ /* 054 */ range_mutableStateArray_0[3].reset(); /* 055 */ /* 056 */ range_mutableStateArray_0[3].write(0, bnlj_expr_0_0); /* 057 */ /* 058 */ range_mutableStateArray_0[3].write(1, bnlj_value_1); /* 059 */ append((range_mutableStateArray_0[3].getRow()).copy()); /* 060 */ /* 061 */ } /* 062 */ } /* 063 */ /* 064 */ } /* 065 */ /* 066 */ private void initRange(int idx) { /* 067 */ java.math.BigInteger index = java.math.BigInteger.valueOf(idx); /* 068 */ java.math.BigInteger numSlice = java.math.BigInteger.valueOf(2L); /* 069 */ java.math.BigInteger numElement = java.math.BigInteger.valueOf(4L); /* 070 */ java.math.BigInteger step = java.math.BigInteger.valueOf(1L); /* 071 */ java.math.BigInteger start = java.math.BigInteger.valueOf(0L); /* 072 */ long partitionEnd; /* 073 */ /* 074 */ java.math.BigInteger st = index.multiply(numElement).divide(numSlice).multiply(step).add(start); /* 075 */ if (st.compareTo(java.math.BigInteger.valueOf(Long.MAX_VALUE)) > 0) { /* 076 */ range_nextIndex_0 = Long.MAX_VALUE; /* 077 */ } else if (st.compareTo(java.math.BigInteger.valueOf(Long.MIN_VALUE)) < 0) { /* 078 */ range_nextIndex_0 = Long.MIN_VALUE; /* 079 */ } else { /* 080 */ range_nextIndex_0 = st.longValue(); /* 081 */ } /* 082 */ range_batchEnd_0 = range_nextIndex_0; /* 083 */ /* 084 */ java.math.BigInteger end = index.add(java.math.BigInteger.ONE).multiply(numElement).divide(numSlice) /* 085 */ .multiply(step).add(start); /* 086 */ if (end.compareTo(java.math.BigInteger.valueOf(Long.MAX_VALUE)) > 0) { /* 087 */ partitionEnd = Long.MAX_VALUE; /* 088 */ } else if (end.compareTo(java.math.BigInteger.valueOf(Long.MIN_VALUE)) < 0) { /* 089 */ partitionEnd = Long.MIN_VALUE; /* 090 */ } else { /* 091 */ partitionEnd = end.longValue(); /* 092 */ } /* 093 */ /* 094 */ java.math.BigInteger startToEnd = java.math.BigInteger.valueOf(partitionEnd).subtract( /* 095 */ java.math.BigInteger.valueOf(range_nextIndex_0)); /* 096 */ range_numElementsTodo_0 = startToEnd.divide(step).longValue(); /* 097 */ if (range_numElementsTodo_0 < 0) { /* 098 */ range_numElementsTodo_0 = 0; /* 099 */ } else if (startToEnd.remainder(step).compareTo(java.math.BigInteger.valueOf(0L)) != 0) { /* 100 */ range_numElementsTodo_0++; /* 101 */ } /* 102 */ } /* 103 */ /* 104 */ protected void processNext() throws java.io.IOException { /* 105 */ // initialize Range /* 106 */ if (!range_initRange_0) { /* 107 */ range_initRange_0 = true; /* 108 */ initRange(partitionIndex); /* 109 */ } /* 110 */ /* 111 */ while (true) { /* 112 */ if (range_nextIndex_0 == range_batchEnd_0) { /* 113 */ long range_nextBatchTodo_0; /* 114 */ if (range_numElementsTodo_0 > 1000L) { /* 115 */ range_nextBatchTodo_0 = 1000L; /* 116 */ range_numElementsTodo_0 -= 1000L; /* 117 */ } else { /* 118 */ range_nextBatchTodo_0 = range_numElementsTodo_0; /* 119 */ range_numElementsTodo_0 = 0; /* 120 */ if (range_nextBatchTodo_0 == 0) break; /* 121 */ } /* 122 */ range_batchEnd_0 += range_nextBatchTodo_0 * 1L; /* 123 */ } /* 124 */ /* 125 */ int range_localEnd_0 = (int)((range_batchEnd_0 - range_nextIndex_0) / 1L); /* 126 */ for (int range_localIdx_0 = 0; range_localIdx_0 < range_localEnd_0; range_localIdx_0++) { /* 127 */ long range_value_0 = ((long)range_localIdx_0 * 1L) + range_nextIndex_0; /* 128 */ /* 129 */ // common sub-expressions /* 130 */ /* 131 */ bnlj_doConsume_0(range_value_0); /* 132 */ /* 133 */ if (shouldStop()) { /* 134 */ range_nextIndex_0 = range_value_0 + 1L; /* 135 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[0] /* numOutputRows */).add(range_localIdx_0 + 1); /* 136 */ range_inputMetrics_0.incRecordsRead(range_localIdx_0 + 1); /* 137 */ return; /* 138 */ } /* 139 */ /* 140 */ } /* 141 */ range_nextIndex_0 = range_batchEnd_0; /* 142 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[0] /* numOutputRows */).add(range_localEnd_0); /* 143 */ range_inputMetrics_0.incRecordsRead(range_localEnd_0); /* 144 */ range_taskContext_0.killTaskIfInterrupted(); /* 145 */ } /* 146 */ } /* 147 */ /* 148 */ } ``` ### Why are the changes needed? Improve query CPU performance. Added a micro benchmark query in `JoinBenchmark.scala`. Saw 1x of run time improvement: ``` OpenJDK 64-Bit Server VM 11.0.9+11-LTS on Linux 4.14.219-161.340.amzn2.x86_64 Intel(R) Xeon(R) CPU E5-2670 v2 2.50GHz broadcast nested loop join: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- broadcast nested loop join wholestage off 62922 63052 184 0.3 3000.3 1.0X broadcast nested loop join wholestage on 30946 30972 26 0.7 1475.6 2.0X ``` ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? * Added unit test in `WholeStageCodegenSuite.scala`, and existing unit tests for `BroadcastNestedLoopJoinExec`. * Updated golden files for several TCPDS query plans, as whole stage code-gen for `BroadcastNestedLoopJoinExec` is triggered. * Updated `JoinBenchmark-jdk11-results.txt ` and `JoinBenchmark-results.txt` with new benchmark result. Followed previous benchmark PRs - https://github.com/apache/spark/pull/27078 and https://github.com/apache/spark/pull/26003 to use same type of machine: ``` Amazon AWS EC2 type: r3.xlarge region: us-west-2 (Oregon) OS: Linux ``` Closes #31736 from c21/nested-join-exec. Authored-by: Cheng Su <chengsu@fb.com> Signed-off-by: Wenchen Fan <wenchen@databricks.com>
2021-03-09 06:45:43 -05:00
OpenJDK 64-Bit Server VM 1.8.0_272-b10 on Linux 4.14.219-161.340.amzn2.x86_64
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
[SPARK-34620][SQL] Code-gen broadcast nested loop join (inner/cross) ### What changes were proposed in this pull request? `BroadcastNestedLoopJoinExec` does not have code-gen, and we can potentially boost the CPU performance for this operator if we add code-gen for it. https://databricks.com/blog/2017/02/16/processing-trillion-rows-per-second-single-machine-can-nested-loop-joins-fast.html also showed the evidence in one fork. The codegen for `BroadcastNestedLoopJoinExec` shared some code with `HashJoin`, and the interface `JoinCodegenSupport` is created to hold those common logic. This PR is only supporting inner and cross join. Other join types will be added later in followup PRs. Example query and generated code: ``` val df1 = spark.range(4).select($"id".as("k1")) val df2 = spark.range(3).select($"id".as("k2")) df1.join(df2, $"k1" + 1 =!= $"k2").explain("codegen") ``` ``` == Subtree 2 / 2 (maxMethodCodeSize:282; maxConstantPoolSize:203(0.31% used); numInnerClasses:0) == *(2) BroadcastNestedLoopJoin BuildRight, Inner, NOT ((k1#2L + 1) = k2#6L) :- *(2) Project [id#0L AS k1#2L] : +- *(2) Range (0, 4, step=1, splits=2) +- BroadcastExchange IdentityBroadcastMode, [id=#22] +- *(1) Project [id#4L AS k2#6L] +- *(1) Range (0, 3, step=1, splits=2) Generated code: /* 001 */ public Object generate(Object[] references) { /* 002 */ return new GeneratedIteratorForCodegenStage2(references); /* 003 */ } /* 004 */ /* 005 */ // codegenStageId=2 /* 006 */ final class GeneratedIteratorForCodegenStage2 extends org.apache.spark.sql.execution.BufferedRowIterator { /* 007 */ private Object[] references; /* 008 */ private scala.collection.Iterator[] inputs; /* 009 */ private boolean range_initRange_0; /* 010 */ private long range_nextIndex_0; /* 011 */ private TaskContext range_taskContext_0; /* 012 */ private InputMetrics range_inputMetrics_0; /* 013 */ private long range_batchEnd_0; /* 014 */ private long range_numElementsTodo_0; /* 015 */ private InternalRow[] bnlj_buildRowArray_0; /* 016 */ private org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[] range_mutableStateArray_0 = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[4]; /* 017 */ /* 018 */ public GeneratedIteratorForCodegenStage2(Object[] references) { /* 019 */ this.references = references; /* 020 */ } /* 021 */ /* 022 */ public void init(int index, scala.collection.Iterator[] inputs) { /* 023 */ partitionIndex = index; /* 024 */ this.inputs = inputs; /* 025 */ /* 026 */ range_taskContext_0 = TaskContext.get(); /* 027 */ range_inputMetrics_0 = range_taskContext_0.taskMetrics().inputMetrics(); /* 028 */ range_mutableStateArray_0[0] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 0); /* 029 */ range_mutableStateArray_0[1] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 0); /* 030 */ range_mutableStateArray_0[2] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 0); /* 031 */ bnlj_buildRowArray_0 = (InternalRow[]) ((org.apache.spark.broadcast.TorrentBroadcast) references[1] /* broadcastTerm */).value(); /* 032 */ range_mutableStateArray_0[3] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(2, 0); /* 033 */ /* 034 */ } /* 035 */ /* 036 */ private void bnlj_doConsume_0(long bnlj_expr_0_0) throws java.io.IOException { /* 037 */ for (int bnlj_arrayIndex_0 = 0; bnlj_arrayIndex_0 < bnlj_buildRowArray_0.length; bnlj_arrayIndex_0++) { /* 038 */ UnsafeRow bnlj_buildRow_0 = (UnsafeRow) bnlj_buildRowArray_0[bnlj_arrayIndex_0]; /* 039 */ /* 040 */ long bnlj_value_1 = bnlj_buildRow_0.getLong(0); /* 041 */ /* 042 */ long bnlj_value_4 = -1L; /* 043 */ /* 044 */ bnlj_value_4 = bnlj_expr_0_0 + 1L; /* 045 */ /* 046 */ boolean bnlj_value_3 = false; /* 047 */ bnlj_value_3 = bnlj_value_4 == bnlj_value_1; /* 048 */ boolean bnlj_value_2 = false; /* 049 */ bnlj_value_2 = !(bnlj_value_3); /* 050 */ if (!(false || !bnlj_value_2)) /* 051 */ { /* 052 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[2] /* numOutputRows */).add(1); /* 053 */ /* 054 */ range_mutableStateArray_0[3].reset(); /* 055 */ /* 056 */ range_mutableStateArray_0[3].write(0, bnlj_expr_0_0); /* 057 */ /* 058 */ range_mutableStateArray_0[3].write(1, bnlj_value_1); /* 059 */ append((range_mutableStateArray_0[3].getRow()).copy()); /* 060 */ /* 061 */ } /* 062 */ } /* 063 */ /* 064 */ } /* 065 */ /* 066 */ private void initRange(int idx) { /* 067 */ java.math.BigInteger index = java.math.BigInteger.valueOf(idx); /* 068 */ java.math.BigInteger numSlice = java.math.BigInteger.valueOf(2L); /* 069 */ java.math.BigInteger numElement = java.math.BigInteger.valueOf(4L); /* 070 */ java.math.BigInteger step = java.math.BigInteger.valueOf(1L); /* 071 */ java.math.BigInteger start = java.math.BigInteger.valueOf(0L); /* 072 */ long partitionEnd; /* 073 */ /* 074 */ java.math.BigInteger st = index.multiply(numElement).divide(numSlice).multiply(step).add(start); /* 075 */ if (st.compareTo(java.math.BigInteger.valueOf(Long.MAX_VALUE)) > 0) { /* 076 */ range_nextIndex_0 = Long.MAX_VALUE; /* 077 */ } else if (st.compareTo(java.math.BigInteger.valueOf(Long.MIN_VALUE)) < 0) { /* 078 */ range_nextIndex_0 = Long.MIN_VALUE; /* 079 */ } else { /* 080 */ range_nextIndex_0 = st.longValue(); /* 081 */ } /* 082 */ range_batchEnd_0 = range_nextIndex_0; /* 083 */ /* 084 */ java.math.BigInteger end = index.add(java.math.BigInteger.ONE).multiply(numElement).divide(numSlice) /* 085 */ .multiply(step).add(start); /* 086 */ if (end.compareTo(java.math.BigInteger.valueOf(Long.MAX_VALUE)) > 0) { /* 087 */ partitionEnd = Long.MAX_VALUE; /* 088 */ } else if (end.compareTo(java.math.BigInteger.valueOf(Long.MIN_VALUE)) < 0) { /* 089 */ partitionEnd = Long.MIN_VALUE; /* 090 */ } else { /* 091 */ partitionEnd = end.longValue(); /* 092 */ } /* 093 */ /* 094 */ java.math.BigInteger startToEnd = java.math.BigInteger.valueOf(partitionEnd).subtract( /* 095 */ java.math.BigInteger.valueOf(range_nextIndex_0)); /* 096 */ range_numElementsTodo_0 = startToEnd.divide(step).longValue(); /* 097 */ if (range_numElementsTodo_0 < 0) { /* 098 */ range_numElementsTodo_0 = 0; /* 099 */ } else if (startToEnd.remainder(step).compareTo(java.math.BigInteger.valueOf(0L)) != 0) { /* 100 */ range_numElementsTodo_0++; /* 101 */ } /* 102 */ } /* 103 */ /* 104 */ protected void processNext() throws java.io.IOException { /* 105 */ // initialize Range /* 106 */ if (!range_initRange_0) { /* 107 */ range_initRange_0 = true; /* 108 */ initRange(partitionIndex); /* 109 */ } /* 110 */ /* 111 */ while (true) { /* 112 */ if (range_nextIndex_0 == range_batchEnd_0) { /* 113 */ long range_nextBatchTodo_0; /* 114 */ if (range_numElementsTodo_0 > 1000L) { /* 115 */ range_nextBatchTodo_0 = 1000L; /* 116 */ range_numElementsTodo_0 -= 1000L; /* 117 */ } else { /* 118 */ range_nextBatchTodo_0 = range_numElementsTodo_0; /* 119 */ range_numElementsTodo_0 = 0; /* 120 */ if (range_nextBatchTodo_0 == 0) break; /* 121 */ } /* 122 */ range_batchEnd_0 += range_nextBatchTodo_0 * 1L; /* 123 */ } /* 124 */ /* 125 */ int range_localEnd_0 = (int)((range_batchEnd_0 - range_nextIndex_0) / 1L); /* 126 */ for (int range_localIdx_0 = 0; range_localIdx_0 < range_localEnd_0; range_localIdx_0++) { /* 127 */ long range_value_0 = ((long)range_localIdx_0 * 1L) + range_nextIndex_0; /* 128 */ /* 129 */ // common sub-expressions /* 130 */ /* 131 */ bnlj_doConsume_0(range_value_0); /* 132 */ /* 133 */ if (shouldStop()) { /* 134 */ range_nextIndex_0 = range_value_0 + 1L; /* 135 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[0] /* numOutputRows */).add(range_localIdx_0 + 1); /* 136 */ range_inputMetrics_0.incRecordsRead(range_localIdx_0 + 1); /* 137 */ return; /* 138 */ } /* 139 */ /* 140 */ } /* 141 */ range_nextIndex_0 = range_batchEnd_0; /* 142 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[0] /* numOutputRows */).add(range_localEnd_0); /* 143 */ range_inputMetrics_0.incRecordsRead(range_localEnd_0); /* 144 */ range_taskContext_0.killTaskIfInterrupted(); /* 145 */ } /* 146 */ } /* 147 */ /* 148 */ } ``` ### Why are the changes needed? Improve query CPU performance. Added a micro benchmark query in `JoinBenchmark.scala`. Saw 1x of run time improvement: ``` OpenJDK 64-Bit Server VM 11.0.9+11-LTS on Linux 4.14.219-161.340.amzn2.x86_64 Intel(R) Xeon(R) CPU E5-2670 v2 2.50GHz broadcast nested loop join: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- broadcast nested loop join wholestage off 62922 63052 184 0.3 3000.3 1.0X broadcast nested loop join wholestage on 30946 30972 26 0.7 1475.6 2.0X ``` ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? * Added unit test in `WholeStageCodegenSuite.scala`, and existing unit tests for `BroadcastNestedLoopJoinExec`. * Updated golden files for several TCPDS query plans, as whole stage code-gen for `BroadcastNestedLoopJoinExec` is triggered. * Updated `JoinBenchmark-jdk11-results.txt ` and `JoinBenchmark-results.txt` with new benchmark result. Followed previous benchmark PRs - https://github.com/apache/spark/pull/27078 and https://github.com/apache/spark/pull/26003 to use same type of machine: ``` Amazon AWS EC2 type: r3.xlarge region: us-west-2 (Oregon) OS: Linux ``` Closes #31736 from c21/nested-join-exec. Authored-by: Cheng Su <chengsu@fb.com> Signed-off-by: Wenchen Fan <wenchen@databricks.com>
2021-03-09 06:45:43 -05:00
sort merge join with duplicates: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------------
sort merge join with duplicates wholestage off 1915 1934 27 1.1 913.1 1.0X
sort merge join with duplicates wholestage on 1742 1778 36 1.2 830.7 1.1X
[SPARK-34620][SQL] Code-gen broadcast nested loop join (inner/cross) ### What changes were proposed in this pull request? `BroadcastNestedLoopJoinExec` does not have code-gen, and we can potentially boost the CPU performance for this operator if we add code-gen for it. https://databricks.com/blog/2017/02/16/processing-trillion-rows-per-second-single-machine-can-nested-loop-joins-fast.html also showed the evidence in one fork. The codegen for `BroadcastNestedLoopJoinExec` shared some code with `HashJoin`, and the interface `JoinCodegenSupport` is created to hold those common logic. This PR is only supporting inner and cross join. Other join types will be added later in followup PRs. Example query and generated code: ``` val df1 = spark.range(4).select($"id".as("k1")) val df2 = spark.range(3).select($"id".as("k2")) df1.join(df2, $"k1" + 1 =!= $"k2").explain("codegen") ``` ``` == Subtree 2 / 2 (maxMethodCodeSize:282; maxConstantPoolSize:203(0.31% used); numInnerClasses:0) == *(2) BroadcastNestedLoopJoin BuildRight, Inner, NOT ((k1#2L + 1) = k2#6L) :- *(2) Project [id#0L AS k1#2L] : +- *(2) Range (0, 4, step=1, splits=2) +- BroadcastExchange IdentityBroadcastMode, [id=#22] +- *(1) Project [id#4L AS k2#6L] +- *(1) Range (0, 3, step=1, splits=2) Generated code: /* 001 */ public Object generate(Object[] references) { /* 002 */ return new GeneratedIteratorForCodegenStage2(references); /* 003 */ } /* 004 */ /* 005 */ // codegenStageId=2 /* 006 */ final class GeneratedIteratorForCodegenStage2 extends org.apache.spark.sql.execution.BufferedRowIterator { /* 007 */ private Object[] references; /* 008 */ private scala.collection.Iterator[] inputs; /* 009 */ private boolean range_initRange_0; /* 010 */ private long range_nextIndex_0; /* 011 */ private TaskContext range_taskContext_0; /* 012 */ private InputMetrics range_inputMetrics_0; /* 013 */ private long range_batchEnd_0; /* 014 */ private long range_numElementsTodo_0; /* 015 */ private InternalRow[] bnlj_buildRowArray_0; /* 016 */ private org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[] range_mutableStateArray_0 = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[4]; /* 017 */ /* 018 */ public GeneratedIteratorForCodegenStage2(Object[] references) { /* 019 */ this.references = references; /* 020 */ } /* 021 */ /* 022 */ public void init(int index, scala.collection.Iterator[] inputs) { /* 023 */ partitionIndex = index; /* 024 */ this.inputs = inputs; /* 025 */ /* 026 */ range_taskContext_0 = TaskContext.get(); /* 027 */ range_inputMetrics_0 = range_taskContext_0.taskMetrics().inputMetrics(); /* 028 */ range_mutableStateArray_0[0] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 0); /* 029 */ range_mutableStateArray_0[1] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 0); /* 030 */ range_mutableStateArray_0[2] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 0); /* 031 */ bnlj_buildRowArray_0 = (InternalRow[]) ((org.apache.spark.broadcast.TorrentBroadcast) references[1] /* broadcastTerm */).value(); /* 032 */ range_mutableStateArray_0[3] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(2, 0); /* 033 */ /* 034 */ } /* 035 */ /* 036 */ private void bnlj_doConsume_0(long bnlj_expr_0_0) throws java.io.IOException { /* 037 */ for (int bnlj_arrayIndex_0 = 0; bnlj_arrayIndex_0 < bnlj_buildRowArray_0.length; bnlj_arrayIndex_0++) { /* 038 */ UnsafeRow bnlj_buildRow_0 = (UnsafeRow) bnlj_buildRowArray_0[bnlj_arrayIndex_0]; /* 039 */ /* 040 */ long bnlj_value_1 = bnlj_buildRow_0.getLong(0); /* 041 */ /* 042 */ long bnlj_value_4 = -1L; /* 043 */ /* 044 */ bnlj_value_4 = bnlj_expr_0_0 + 1L; /* 045 */ /* 046 */ boolean bnlj_value_3 = false; /* 047 */ bnlj_value_3 = bnlj_value_4 == bnlj_value_1; /* 048 */ boolean bnlj_value_2 = false; /* 049 */ bnlj_value_2 = !(bnlj_value_3); /* 050 */ if (!(false || !bnlj_value_2)) /* 051 */ { /* 052 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[2] /* numOutputRows */).add(1); /* 053 */ /* 054 */ range_mutableStateArray_0[3].reset(); /* 055 */ /* 056 */ range_mutableStateArray_0[3].write(0, bnlj_expr_0_0); /* 057 */ /* 058 */ range_mutableStateArray_0[3].write(1, bnlj_value_1); /* 059 */ append((range_mutableStateArray_0[3].getRow()).copy()); /* 060 */ /* 061 */ } /* 062 */ } /* 063 */ /* 064 */ } /* 065 */ /* 066 */ private void initRange(int idx) { /* 067 */ java.math.BigInteger index = java.math.BigInteger.valueOf(idx); /* 068 */ java.math.BigInteger numSlice = java.math.BigInteger.valueOf(2L); /* 069 */ java.math.BigInteger numElement = java.math.BigInteger.valueOf(4L); /* 070 */ java.math.BigInteger step = java.math.BigInteger.valueOf(1L); /* 071 */ java.math.BigInteger start = java.math.BigInteger.valueOf(0L); /* 072 */ long partitionEnd; /* 073 */ /* 074 */ java.math.BigInteger st = index.multiply(numElement).divide(numSlice).multiply(step).add(start); /* 075 */ if (st.compareTo(java.math.BigInteger.valueOf(Long.MAX_VALUE)) > 0) { /* 076 */ range_nextIndex_0 = Long.MAX_VALUE; /* 077 */ } else if (st.compareTo(java.math.BigInteger.valueOf(Long.MIN_VALUE)) < 0) { /* 078 */ range_nextIndex_0 = Long.MIN_VALUE; /* 079 */ } else { /* 080 */ range_nextIndex_0 = st.longValue(); /* 081 */ } /* 082 */ range_batchEnd_0 = range_nextIndex_0; /* 083 */ /* 084 */ java.math.BigInteger end = index.add(java.math.BigInteger.ONE).multiply(numElement).divide(numSlice) /* 085 */ .multiply(step).add(start); /* 086 */ if (end.compareTo(java.math.BigInteger.valueOf(Long.MAX_VALUE)) > 0) { /* 087 */ partitionEnd = Long.MAX_VALUE; /* 088 */ } else if (end.compareTo(java.math.BigInteger.valueOf(Long.MIN_VALUE)) < 0) { /* 089 */ partitionEnd = Long.MIN_VALUE; /* 090 */ } else { /* 091 */ partitionEnd = end.longValue(); /* 092 */ } /* 093 */ /* 094 */ java.math.BigInteger startToEnd = java.math.BigInteger.valueOf(partitionEnd).subtract( /* 095 */ java.math.BigInteger.valueOf(range_nextIndex_0)); /* 096 */ range_numElementsTodo_0 = startToEnd.divide(step).longValue(); /* 097 */ if (range_numElementsTodo_0 < 0) { /* 098 */ range_numElementsTodo_0 = 0; /* 099 */ } else if (startToEnd.remainder(step).compareTo(java.math.BigInteger.valueOf(0L)) != 0) { /* 100 */ range_numElementsTodo_0++; /* 101 */ } /* 102 */ } /* 103 */ /* 104 */ protected void processNext() throws java.io.IOException { /* 105 */ // initialize Range /* 106 */ if (!range_initRange_0) { /* 107 */ range_initRange_0 = true; /* 108 */ initRange(partitionIndex); /* 109 */ } /* 110 */ /* 111 */ while (true) { /* 112 */ if (range_nextIndex_0 == range_batchEnd_0) { /* 113 */ long range_nextBatchTodo_0; /* 114 */ if (range_numElementsTodo_0 > 1000L) { /* 115 */ range_nextBatchTodo_0 = 1000L; /* 116 */ range_numElementsTodo_0 -= 1000L; /* 117 */ } else { /* 118 */ range_nextBatchTodo_0 = range_numElementsTodo_0; /* 119 */ range_numElementsTodo_0 = 0; /* 120 */ if (range_nextBatchTodo_0 == 0) break; /* 121 */ } /* 122 */ range_batchEnd_0 += range_nextBatchTodo_0 * 1L; /* 123 */ } /* 124 */ /* 125 */ int range_localEnd_0 = (int)((range_batchEnd_0 - range_nextIndex_0) / 1L); /* 126 */ for (int range_localIdx_0 = 0; range_localIdx_0 < range_localEnd_0; range_localIdx_0++) { /* 127 */ long range_value_0 = ((long)range_localIdx_0 * 1L) + range_nextIndex_0; /* 128 */ /* 129 */ // common sub-expressions /* 130 */ /* 131 */ bnlj_doConsume_0(range_value_0); /* 132 */ /* 133 */ if (shouldStop()) { /* 134 */ range_nextIndex_0 = range_value_0 + 1L; /* 135 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[0] /* numOutputRows */).add(range_localIdx_0 + 1); /* 136 */ range_inputMetrics_0.incRecordsRead(range_localIdx_0 + 1); /* 137 */ return; /* 138 */ } /* 139 */ /* 140 */ } /* 141 */ range_nextIndex_0 = range_batchEnd_0; /* 142 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[0] /* numOutputRows */).add(range_localEnd_0); /* 143 */ range_inputMetrics_0.incRecordsRead(range_localEnd_0); /* 144 */ range_taskContext_0.killTaskIfInterrupted(); /* 145 */ } /* 146 */ } /* 147 */ /* 148 */ } ``` ### Why are the changes needed? Improve query CPU performance. Added a micro benchmark query in `JoinBenchmark.scala`. Saw 1x of run time improvement: ``` OpenJDK 64-Bit Server VM 11.0.9+11-LTS on Linux 4.14.219-161.340.amzn2.x86_64 Intel(R) Xeon(R) CPU E5-2670 v2 2.50GHz broadcast nested loop join: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- broadcast nested loop join wholestage off 62922 63052 184 0.3 3000.3 1.0X broadcast nested loop join wholestage on 30946 30972 26 0.7 1475.6 2.0X ``` ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? * Added unit test in `WholeStageCodegenSuite.scala`, and existing unit tests for `BroadcastNestedLoopJoinExec`. * Updated golden files for several TCPDS query plans, as whole stage code-gen for `BroadcastNestedLoopJoinExec` is triggered. * Updated `JoinBenchmark-jdk11-results.txt ` and `JoinBenchmark-results.txt` with new benchmark result. Followed previous benchmark PRs - https://github.com/apache/spark/pull/27078 and https://github.com/apache/spark/pull/26003 to use same type of machine: ``` Amazon AWS EC2 type: r3.xlarge region: us-west-2 (Oregon) OS: Linux ``` Closes #31736 from c21/nested-join-exec. Authored-by: Cheng Su <chengsu@fb.com> Signed-off-by: Wenchen Fan <wenchen@databricks.com>
2021-03-09 06:45:43 -05:00
OpenJDK 64-Bit Server VM 1.8.0_272-b10 on Linux 4.14.219-161.340.amzn2.x86_64
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
[SPARK-29320][TESTS] Compare `sql/core` module in JDK8/11 (Part 1) ### What changes were proposed in this pull request? This PR regenerates the `sql/core` benchmarks in JDK8/11 to compare the result. In general, we compare the ratio instead of the time. However, in this PR, the average time is compared. This PR should be considered as a rough comparison. **A. EXPECTED CASES(JDK11 is faster in general)** - [x] BloomFilterBenchmark (JDK11 is faster except one case) - [x] BuiltInDataSourceWriteBenchmark (JDK11 is faster at CSV/ORC) - [x] CSVBenchmark (JDK11 is faster except five cases) - [x] ColumnarBatchBenchmark (JDK11 is faster at `boolean`/`string` and some cases in `int`/`array`) - [x] DatasetBenchmark (JDK11 is faster with `string`, but is slower for `long` type) - [x] ExternalAppendOnlyUnsafeRowArrayBenchmark (JDK11 is faster except two cases) - [x] ExtractBenchmark (JDK11 is faster except HOUR/MINUTE/SECOND/MILLISECONDS/MICROSECONDS) - [x] HashedRelationMetricsBenchmark (JDK11 is faster) - [x] JSONBenchmark (JDK11 is much faster except eight cases) - [x] JoinBenchmark (JDK11 is faster except five cases) - [x] OrcNestedSchemaPruningBenchmark (JDK11 is faster in nine cases) - [x] PrimitiveArrayBenchmark (JDK11 is faster) - [x] SortBenchmark (JDK11 is faster except `Arrays.sort` case) - [x] UDFBenchmark (N/A, values are too small) - [x] UnsafeArrayDataBenchmark (JDK11 is faster except one case) - [x] WideTableBenchmark (JDK11 is faster except two cases) **B. CASES WE NEED TO INVESTIGATE MORE LATER** - [x] AggregateBenchmark (JDK11 is slower in general) - [x] CompressionSchemeBenchmark (JDK11 is slower in general except `string`) - [x] DataSourceReadBenchmark (JDK11 is slower in general) - [x] DateTimeBenchmark (JDK11 is slightly slower in general except `parsing`) - [x] MakeDateTimeBenchmark (JDK11 is slower except two cases) - [x] MiscBenchmark (JDK11 is slower except ten cases) - [x] OrcV2NestedSchemaPruningBenchmark (JDK11 is slower) - [x] ParquetNestedSchemaPruningBenchmark (JDK11 is slower except six cases) - [x] RangeBenchmark (JDK11 is slower except one case) `FilterPushdownBenchmark/InExpressionBenchmark/WideSchemaBenchmark` will be compared later because it took long timer. ### Why are the changes needed? According to the result, there are some difference between JDK8/JDK11. This will be a baseline for the future improvement and comparison. Also, as a reproducible environment, the following environment is used. - Instance: `r3.xlarge` - OS: `CentOS Linux release 7.5.1804 (Core)` - JDK: - `OpenJDK Runtime Environment (build 1.8.0_222-b10)` - `OpenJDK Runtime Environment 18.9 (build 11.0.4+11-LTS)` ### Does this PR introduce any user-facing change? No. ### How was this patch tested? This is a test-only PR. We need to run benchmark. Closes #26003 from dongjoon-hyun/SPARK-29320. Authored-by: Dongjoon Hyun <dhyun@apple.com> Signed-off-by: Dongjoon Hyun <dhyun@apple.com>
2019-10-03 11:58:25 -04:00
shuffle hash join: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
[SPARK-34620][SQL] Code-gen broadcast nested loop join (inner/cross) ### What changes were proposed in this pull request? `BroadcastNestedLoopJoinExec` does not have code-gen, and we can potentially boost the CPU performance for this operator if we add code-gen for it. https://databricks.com/blog/2017/02/16/processing-trillion-rows-per-second-single-machine-can-nested-loop-joins-fast.html also showed the evidence in one fork. The codegen for `BroadcastNestedLoopJoinExec` shared some code with `HashJoin`, and the interface `JoinCodegenSupport` is created to hold those common logic. This PR is only supporting inner and cross join. Other join types will be added later in followup PRs. Example query and generated code: ``` val df1 = spark.range(4).select($"id".as("k1")) val df2 = spark.range(3).select($"id".as("k2")) df1.join(df2, $"k1" + 1 =!= $"k2").explain("codegen") ``` ``` == Subtree 2 / 2 (maxMethodCodeSize:282; maxConstantPoolSize:203(0.31% used); numInnerClasses:0) == *(2) BroadcastNestedLoopJoin BuildRight, Inner, NOT ((k1#2L + 1) = k2#6L) :- *(2) Project [id#0L AS k1#2L] : +- *(2) Range (0, 4, step=1, splits=2) +- BroadcastExchange IdentityBroadcastMode, [id=#22] +- *(1) Project [id#4L AS k2#6L] +- *(1) Range (0, 3, step=1, splits=2) Generated code: /* 001 */ public Object generate(Object[] references) { /* 002 */ return new GeneratedIteratorForCodegenStage2(references); /* 003 */ } /* 004 */ /* 005 */ // codegenStageId=2 /* 006 */ final class GeneratedIteratorForCodegenStage2 extends org.apache.spark.sql.execution.BufferedRowIterator { /* 007 */ private Object[] references; /* 008 */ private scala.collection.Iterator[] inputs; /* 009 */ private boolean range_initRange_0; /* 010 */ private long range_nextIndex_0; /* 011 */ private TaskContext range_taskContext_0; /* 012 */ private InputMetrics range_inputMetrics_0; /* 013 */ private long range_batchEnd_0; /* 014 */ private long range_numElementsTodo_0; /* 015 */ private InternalRow[] bnlj_buildRowArray_0; /* 016 */ private org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[] range_mutableStateArray_0 = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[4]; /* 017 */ /* 018 */ public GeneratedIteratorForCodegenStage2(Object[] references) { /* 019 */ this.references = references; /* 020 */ } /* 021 */ /* 022 */ public void init(int index, scala.collection.Iterator[] inputs) { /* 023 */ partitionIndex = index; /* 024 */ this.inputs = inputs; /* 025 */ /* 026 */ range_taskContext_0 = TaskContext.get(); /* 027 */ range_inputMetrics_0 = range_taskContext_0.taskMetrics().inputMetrics(); /* 028 */ range_mutableStateArray_0[0] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 0); /* 029 */ range_mutableStateArray_0[1] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 0); /* 030 */ range_mutableStateArray_0[2] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 0); /* 031 */ bnlj_buildRowArray_0 = (InternalRow[]) ((org.apache.spark.broadcast.TorrentBroadcast) references[1] /* broadcastTerm */).value(); /* 032 */ range_mutableStateArray_0[3] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(2, 0); /* 033 */ /* 034 */ } /* 035 */ /* 036 */ private void bnlj_doConsume_0(long bnlj_expr_0_0) throws java.io.IOException { /* 037 */ for (int bnlj_arrayIndex_0 = 0; bnlj_arrayIndex_0 < bnlj_buildRowArray_0.length; bnlj_arrayIndex_0++) { /* 038 */ UnsafeRow bnlj_buildRow_0 = (UnsafeRow) bnlj_buildRowArray_0[bnlj_arrayIndex_0]; /* 039 */ /* 040 */ long bnlj_value_1 = bnlj_buildRow_0.getLong(0); /* 041 */ /* 042 */ long bnlj_value_4 = -1L; /* 043 */ /* 044 */ bnlj_value_4 = bnlj_expr_0_0 + 1L; /* 045 */ /* 046 */ boolean bnlj_value_3 = false; /* 047 */ bnlj_value_3 = bnlj_value_4 == bnlj_value_1; /* 048 */ boolean bnlj_value_2 = false; /* 049 */ bnlj_value_2 = !(bnlj_value_3); /* 050 */ if (!(false || !bnlj_value_2)) /* 051 */ { /* 052 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[2] /* numOutputRows */).add(1); /* 053 */ /* 054 */ range_mutableStateArray_0[3].reset(); /* 055 */ /* 056 */ range_mutableStateArray_0[3].write(0, bnlj_expr_0_0); /* 057 */ /* 058 */ range_mutableStateArray_0[3].write(1, bnlj_value_1); /* 059 */ append((range_mutableStateArray_0[3].getRow()).copy()); /* 060 */ /* 061 */ } /* 062 */ } /* 063 */ /* 064 */ } /* 065 */ /* 066 */ private void initRange(int idx) { /* 067 */ java.math.BigInteger index = java.math.BigInteger.valueOf(idx); /* 068 */ java.math.BigInteger numSlice = java.math.BigInteger.valueOf(2L); /* 069 */ java.math.BigInteger numElement = java.math.BigInteger.valueOf(4L); /* 070 */ java.math.BigInteger step = java.math.BigInteger.valueOf(1L); /* 071 */ java.math.BigInteger start = java.math.BigInteger.valueOf(0L); /* 072 */ long partitionEnd; /* 073 */ /* 074 */ java.math.BigInteger st = index.multiply(numElement).divide(numSlice).multiply(step).add(start); /* 075 */ if (st.compareTo(java.math.BigInteger.valueOf(Long.MAX_VALUE)) > 0) { /* 076 */ range_nextIndex_0 = Long.MAX_VALUE; /* 077 */ } else if (st.compareTo(java.math.BigInteger.valueOf(Long.MIN_VALUE)) < 0) { /* 078 */ range_nextIndex_0 = Long.MIN_VALUE; /* 079 */ } else { /* 080 */ range_nextIndex_0 = st.longValue(); /* 081 */ } /* 082 */ range_batchEnd_0 = range_nextIndex_0; /* 083 */ /* 084 */ java.math.BigInteger end = index.add(java.math.BigInteger.ONE).multiply(numElement).divide(numSlice) /* 085 */ .multiply(step).add(start); /* 086 */ if (end.compareTo(java.math.BigInteger.valueOf(Long.MAX_VALUE)) > 0) { /* 087 */ partitionEnd = Long.MAX_VALUE; /* 088 */ } else if (end.compareTo(java.math.BigInteger.valueOf(Long.MIN_VALUE)) < 0) { /* 089 */ partitionEnd = Long.MIN_VALUE; /* 090 */ } else { /* 091 */ partitionEnd = end.longValue(); /* 092 */ } /* 093 */ /* 094 */ java.math.BigInteger startToEnd = java.math.BigInteger.valueOf(partitionEnd).subtract( /* 095 */ java.math.BigInteger.valueOf(range_nextIndex_0)); /* 096 */ range_numElementsTodo_0 = startToEnd.divide(step).longValue(); /* 097 */ if (range_numElementsTodo_0 < 0) { /* 098 */ range_numElementsTodo_0 = 0; /* 099 */ } else if (startToEnd.remainder(step).compareTo(java.math.BigInteger.valueOf(0L)) != 0) { /* 100 */ range_numElementsTodo_0++; /* 101 */ } /* 102 */ } /* 103 */ /* 104 */ protected void processNext() throws java.io.IOException { /* 105 */ // initialize Range /* 106 */ if (!range_initRange_0) { /* 107 */ range_initRange_0 = true; /* 108 */ initRange(partitionIndex); /* 109 */ } /* 110 */ /* 111 */ while (true) { /* 112 */ if (range_nextIndex_0 == range_batchEnd_0) { /* 113 */ long range_nextBatchTodo_0; /* 114 */ if (range_numElementsTodo_0 > 1000L) { /* 115 */ range_nextBatchTodo_0 = 1000L; /* 116 */ range_numElementsTodo_0 -= 1000L; /* 117 */ } else { /* 118 */ range_nextBatchTodo_0 = range_numElementsTodo_0; /* 119 */ range_numElementsTodo_0 = 0; /* 120 */ if (range_nextBatchTodo_0 == 0) break; /* 121 */ } /* 122 */ range_batchEnd_0 += range_nextBatchTodo_0 * 1L; /* 123 */ } /* 124 */ /* 125 */ int range_localEnd_0 = (int)((range_batchEnd_0 - range_nextIndex_0) / 1L); /* 126 */ for (int range_localIdx_0 = 0; range_localIdx_0 < range_localEnd_0; range_localIdx_0++) { /* 127 */ long range_value_0 = ((long)range_localIdx_0 * 1L) + range_nextIndex_0; /* 128 */ /* 129 */ // common sub-expressions /* 130 */ /* 131 */ bnlj_doConsume_0(range_value_0); /* 132 */ /* 133 */ if (shouldStop()) { /* 134 */ range_nextIndex_0 = range_value_0 + 1L; /* 135 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[0] /* numOutputRows */).add(range_localIdx_0 + 1); /* 136 */ range_inputMetrics_0.incRecordsRead(range_localIdx_0 + 1); /* 137 */ return; /* 138 */ } /* 139 */ /* 140 */ } /* 141 */ range_nextIndex_0 = range_batchEnd_0; /* 142 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[0] /* numOutputRows */).add(range_localEnd_0); /* 143 */ range_inputMetrics_0.incRecordsRead(range_localEnd_0); /* 144 */ range_taskContext_0.killTaskIfInterrupted(); /* 145 */ } /* 146 */ } /* 147 */ /* 148 */ } ``` ### Why are the changes needed? Improve query CPU performance. Added a micro benchmark query in `JoinBenchmark.scala`. Saw 1x of run time improvement: ``` OpenJDK 64-Bit Server VM 11.0.9+11-LTS on Linux 4.14.219-161.340.amzn2.x86_64 Intel(R) Xeon(R) CPU E5-2670 v2 2.50GHz broadcast nested loop join: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- broadcast nested loop join wholestage off 62922 63052 184 0.3 3000.3 1.0X broadcast nested loop join wholestage on 30946 30972 26 0.7 1475.6 2.0X ``` ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? * Added unit test in `WholeStageCodegenSuite.scala`, and existing unit tests for `BroadcastNestedLoopJoinExec`. * Updated golden files for several TCPDS query plans, as whole stage code-gen for `BroadcastNestedLoopJoinExec` is triggered. * Updated `JoinBenchmark-jdk11-results.txt ` and `JoinBenchmark-results.txt` with new benchmark result. Followed previous benchmark PRs - https://github.com/apache/spark/pull/27078 and https://github.com/apache/spark/pull/26003 to use same type of machine: ``` Amazon AWS EC2 type: r3.xlarge region: us-west-2 (Oregon) OS: Linux ``` Closes #31736 from c21/nested-join-exec. Authored-by: Cheng Su <chengsu@fb.com> Signed-off-by: Wenchen Fan <wenchen@databricks.com>
2021-03-09 06:45:43 -05:00
shuffle hash join wholestage off 1037 1047 14 4.0 247.3 1.0X
shuffle hash join wholestage on 652 668 15 6.4 155.4 1.6X
OpenJDK 64-Bit Server VM 1.8.0_272-b10 on Linux 4.14.219-161.340.amzn2.x86_64
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
broadcast nested loop join: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
-------------------------------------------------------------------------------------------------------------------------
broadcast nested loop join wholestage off 65666 66140 671 0.3 3131.2 1.0X
broadcast nested loop join wholestage on 29429 29523 64 0.7 1403.3 2.2X