Cheng Su c1e995ac95 [SPARK-35350][SQL] Add code-gen for left semi sort merge join
### What changes were proposed in this pull request?

As title. This PR adds code-gen support for LEFT SEMI sort merge join. The main change is to add a `semiJoin` code path in `SortMergeJoinExec.doProduce()` and introduce `onlyBufferFirstMatchedRow` in `SortMergeJoinExec.genScanner()`. The latter is for left semi sort merge join without a join condition: for that kind of query we don't need to buffer all matched rows, only the first one (the same as the non-code-gen code path).
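As a rough illustration of why `onlyBufferFirstMatchedRow` helps, here is a minimal sketch in plain Scala (the helper name and signature are made up for illustration, this is not code from `SortMergeJoinExec`): when there is no bound join condition, a single buffered match already decides the left semi output, so buffering can stop after the first match.

```
// Hypothetical helper sketching the onlyBufferFirstMatchedRow idea:
// for LEFT SEMI without a join condition a single matched buffered row is
// enough, so the match buffer is capped at one row.
def bufferMatches[T](matchesForKey: Iterator[T], onlyBufferFirstMatchedRow: Boolean): Seq[T] = {
  if (onlyBufferFirstMatchedRow) {
    matchesForKey.take(1).toSeq   // existence is all we need to know
  } else {
    matchesForKey.toSeq           // a bound condition must see every match
  }
}
```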

Example query:

```
val df1 = spark.range(10).select($"id".as("k1"))
val df2 = spark.range(4).select($"id".as("k2"))
val oneJoinDF = df1.join(df2.hint("SHUFFLE_MERGE"), $"k1" === $"k2", "left_semi")
```
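The generated code shown below comes from the whole-stage codegen output; a sketch of one way to print it yourself, assuming a running `SparkSession` and the `oneJoinDF` from the snippet above:

```
// Print the whole-stage generated code for the query, using the debug helpers
// that ship with Spark SQL.
import org.apache.spark.sql.execution.debug._
oneJoinDF.debugCodegen()
// equivalently: oneJoinDF.queryExecution.debug.codegen()
```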

Example of generated code for the query:

```
== Subtree 5 / 5 (maxMethodCodeSize:302; maxConstantPoolSize:156(0.24% used); numInnerClasses:0) ==
*(5) Project [id#0L AS k1#2L]
+- *(5) SortMergeJoin [id#0L], [k2#6L], LeftSemi
   :- *(2) Sort [id#0L ASC NULLS FIRST], false, 0
   :  +- Exchange hashpartitioning(id#0L, 5), ENSURE_REQUIREMENTS, [id=#27]
   :     +- *(1) Range (0, 10, step=1, splits=2)
   +- *(4) Sort [k2#6L ASC NULLS FIRST], false, 0
      +- Exchange hashpartitioning(k2#6L, 5), ENSURE_REQUIREMENTS, [id=#33]
         +- *(3) Project [id#4L AS k2#6L]
            +- *(3) Range (0, 4, step=1, splits=2)

Generated code:
/* 001 */ public Object generate(Object[] references) {
/* 002 */   return new GeneratedIteratorForCodegenStage5(references);
/* 003 */ }
/* 004 */
/* 005 */ // codegenStageId=5
/* 006 */ final class GeneratedIteratorForCodegenStage5 extends org.apache.spark.sql.execution.BufferedRowIterator {
/* 007 */   private Object[] references;
/* 008 */   private scala.collection.Iterator[] inputs;
/* 009 */   private scala.collection.Iterator smj_streamedInput_0;
/* 010 */   private scala.collection.Iterator smj_bufferedInput_0;
/* 011 */   private InternalRow smj_streamedRow_0;
/* 012 */   private InternalRow smj_bufferedRow_0;
/* 013 */   private long smj_value_2;
/* 014 */   private org.apache.spark.sql.execution.ExternalAppendOnlyUnsafeRowArray smj_matches_0;
/* 015 */   private long smj_value_3;
/* 016 */   private org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[] smj_mutableStateArray_0 = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[2];
/* 017 */
/* 018 */   public GeneratedIteratorForCodegenStage5(Object[] references) {
/* 019 */     this.references = references;
/* 020 */   }
/* 021 */
/* 022 */   public void init(int index, scala.collection.Iterator[] inputs) {
/* 023 */     partitionIndex = index;
/* 024 */     this.inputs = inputs;
/* 025 */     smj_streamedInput_0 = inputs[0];
/* 026 */     smj_bufferedInput_0 = inputs[1];
/* 027 */
/* 028 */     smj_matches_0 = new org.apache.spark.sql.execution.ExternalAppendOnlyUnsafeRowArray(1, 2147483647);
/* 029 */     smj_mutableStateArray_0[0] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 0);
/* 030 */     smj_mutableStateArray_0[1] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 0);
/* 031 */
/* 032 */   }
/* 033 */
/* 034 */   private boolean findNextJoinRows(
/* 035 */     scala.collection.Iterator streamedIter,
/* 036 */     scala.collection.Iterator bufferedIter) {
/* 037 */     smj_streamedRow_0 = null;
/* 038 */     int comp = 0;
/* 039 */     while (smj_streamedRow_0 == null) {
/* 040 */       if (!streamedIter.hasNext()) return false;
/* 041 */       smj_streamedRow_0 = (InternalRow) streamedIter.next();
/* 042 */       long smj_value_0 = smj_streamedRow_0.getLong(0);
/* 043 */       if (false) {
/* 044 */         smj_streamedRow_0 = null;
/* 045 */         continue;
/* 046 */
/* 047 */       }
/* 048 */       if (!smj_matches_0.isEmpty()) {
/* 049 */         comp = 0;
/* 050 */         if (comp == 0) {
/* 051 */           comp = (smj_value_0 > smj_value_3 ? 1 : smj_value_0 < smj_value_3 ? -1 : 0);
/* 052 */         }
/* 053 */
/* 054 */         if (comp == 0) {
/* 055 */           return true;
/* 056 */         }
/* 057 */         smj_matches_0.clear();
/* 058 */       }
/* 059 */
/* 060 */       do {
/* 061 */         if (smj_bufferedRow_0 == null) {
/* 062 */           if (!bufferedIter.hasNext()) {
/* 063 */             smj_value_3 = smj_value_0;
/* 064 */             return !smj_matches_0.isEmpty();
/* 065 */           }
/* 066 */           smj_bufferedRow_0 = (InternalRow) bufferedIter.next();
/* 067 */           long smj_value_1 = smj_bufferedRow_0.getLong(0);
/* 068 */           if (false) {
/* 069 */             smj_bufferedRow_0 = null;
/* 070 */             continue;
/* 071 */           }
/* 072 */           smj_value_2 = smj_value_1;
/* 073 */         }
/* 074 */
/* 075 */         comp = 0;
/* 076 */         if (comp == 0) {
/* 077 */           comp = (smj_value_0 > smj_value_2 ? 1 : smj_value_0 < smj_value_2 ? -1 : 0);
/* 078 */         }
/* 079 */
/* 080 */         if (comp > 0) {
/* 081 */           smj_bufferedRow_0 = null;
/* 082 */         } else if (comp < 0) {
/* 083 */           if (!smj_matches_0.isEmpty()) {
/* 084 */             smj_value_3 = smj_value_0;
/* 085 */             return true;
/* 086 */           } else {
/* 087 */             smj_streamedRow_0 = null;
/* 088 */           }
/* 089 */         } else {
/* 090 */           if (smj_matches_0.isEmpty()) {
/* 091 */             smj_matches_0.add((UnsafeRow) smj_bufferedRow_0);
/* 092 */           }
/* 093 */
/* 094 */           smj_bufferedRow_0 = null;
/* 095 */         }
/* 096 */       } while (smj_streamedRow_0 != null);
/* 097 */     }
/* 098 */     return false; // unreachable
/* 099 */   }
/* 100 */
/* 101 */   protected void processNext() throws java.io.IOException {
/* 102 */     while (findNextJoinRows(smj_streamedInput_0, smj_bufferedInput_0)) {
/* 103 */       long smj_value_4 = -1L;
/* 104 */       smj_value_4 = smj_streamedRow_0.getLong(0);
/* 105 */       scala.collection.Iterator<UnsafeRow> smj_iterator_0 = smj_matches_0.generateIterator();
/* 106 */       boolean smj_hasOutputRow_0 = false;
/* 107 */
/* 108 */       while (!smj_hasOutputRow_0 && smj_iterator_0.hasNext()) {
/* 109 */         InternalRow smj_bufferedRow_1 = (InternalRow) smj_iterator_0.next();
/* 110 */
/* 111 */         smj_hasOutputRow_0 = true;
/* 112 */         ((org.apache.spark.sql.execution.metric.SQLMetric) references[0] /* numOutputRows */).add(1);
/* 113 */
/* 114 */         // common sub-expressions
/* 115 */
/* 116 */         smj_mutableStateArray_0[1].reset();
/* 117 */
/* 118 */         smj_mutableStateArray_0[1].write(0, smj_value_4);
/* 119 */         append((smj_mutableStateArray_0[1].getRow()).copy());
/* 120 */
/* 121 */       }
/* 122 */       if (shouldStop()) return;
/* 123 */     }
/* 124 */     ((org.apache.spark.sql.execution.joins.SortMergeJoinExec) references[1] /* plan */).cleanupResources();
/* 125 */   }
/* 126 */
/* 127 */ }
```
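The key semi-join detail in `processNext()` above is the `smj_hasOutputRow_0` flag: the streamed row is emitted at most once, no matter how many buffered rows matched it. A minimal sketch of that consume loop in plain Scala (names are made up, this is not the generated code):

```
// Sketch of the semi-join consume loop: emit the streamed row at most once.
def consumeSemiJoin[T](streamedRow: T, matches: Iterator[Any])(emit: T => Unit): Unit = {
  var hasOutputRow = false
  while (!hasOutputRow && matches.hasNext) {
    matches.next()
    hasOutputRow = true  // LEFT SEMI: one match is enough
    emit(streamedRow)    // only streamed-side columns are output
  }
}
```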

### Why are the changes needed?

Improves query CPU performance. Tested with one query:

```
 def sortMergeJoin(): Unit = {
    val N = 2 << 20
    codegenBenchmark("left semi sort merge join", N) {
      val df1 = spark.range(N).selectExpr(s"id * 2 as k1")
      val df2 = spark.range(N).selectExpr(s"id * 3 as k2")
      val df = df1.join(df2, col("k1") === col("k2"), "left_semi")
      assert(df.queryExecution.sparkPlan.find(_.isInstanceOf[SortMergeJoinExec]).isDefined)
      df.noop()
    }
  }
```
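Assuming this case lives in the existing `JoinBenchmark` suite, the numbers below can be reproduced with something along these lines (the exact sbt invocation is an assumption):

```
build/sbt "sql/test:runMain org.apache.spark.sql.execution.benchmark.JoinBenchmark"
```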

Seeing roughly a 30% run-time improvement:

```
Running benchmark: left semi sort merge join
  Running case: left semi sort merge join code-gen off
  Stopped after 2 iterations, 1369 ms
  Running case: left semi sort merge join code-gen on
  Stopped after 5 iterations, 2743 ms

Java HotSpot(TM) 64-Bit Server VM 1.8.0_181-b13 on Mac OS X 10.16
Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
left semi sort merge join:                Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
------------------------------------------------------------------------------------------------------------------------
left semi sort merge join code-gen off              676            685          13          3.1         322.2       1.0X
left semi sort merge join code-gen on               524            549          32          4.0         249.7       1.3X
```

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Added unit tests in `WholeStageCodegenSuite.scala` and `ExistenceJoinSuite.scala`.
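A minimal sketch of the kind of `WholeStageCodegenSuite`-style check (assumed shape and names, not the exact tests added by this PR; it relies on that suite's `SharedSparkSession`/`testImplicits` setup):

```
// assumes: import org.apache.spark.sql.Row
//          import org.apache.spark.sql.execution.WholeStageCodegenExec
//          import org.apache.spark.sql.execution.joins.SortMergeJoinExec
test("left semi sort merge join should be included in WholeStageCodegen") {
  val df1 = spark.range(10).select($"id".as("k1"))
  val df2 = spark.range(4).select($"id".as("k2"))
  val joinDF = df1.join(df2.hint("SHUFFLE_MERGE"), $"k1" === $"k2", "left_semi")
  // The left semi SortMergeJoinExec should sit inside a WholeStageCodegenExec node.
  val codegenSmj = joinDF.queryExecution.executedPlan.collect {
    case w: WholeStageCodegenExec if w.collect { case smj: SortMergeJoinExec => smj }.nonEmpty => w
  }
  assert(codegenSmj.nonEmpty)
  checkAnswer(joinDF, (0 until 4).map(i => Row(i.toLong)))
}
```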

Closes #32528 from c21/smj-left-semi.

Authored-by: Cheng Su <chengsu@fb.com>
Signed-off-by: Wenchen Fan <wenchen@databricks.com>
2021-05-13 12:52:26 +00:00

Spark SQL

This module provides support for executing relational queries expressed in either SQL or the DataFrame/Dataset API.
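A small illustration of those two query styles (not part of the original README; it assumes a running `SparkSession` named `spark`):

```
import spark.implicits._

spark.range(10).createOrReplaceTempView("t")

// SQL
spark.sql("SELECT id FROM t WHERE id % 2 = 0").show()

// DataFrame/Dataset API
spark.range(10).filter($"id" % 2 === 0).show()
```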

Spark SQL is broken up into four subprojects:

- Catalyst (`sql/catalyst`) - An implementation-agnostic framework for manipulating trees of relational operators and expressions.
- Execution (`sql/core`) - A query planner / execution engine for translating Catalyst's logical query plans into Spark RDDs. This component also includes a new public interface, `SQLContext`, that allows users to execute SQL or LINQ statements against existing RDDs and Parquet files.
- Hive Support (`sql/hive`) - Includes extensions that allow users to write queries using a subset of HiveQL and access data from a Hive Metastore using Hive SerDes. There are also wrappers that allow users to run queries that include Hive UDFs, UDAFs, and UDTFs.
- HiveServer and CLI support (`sql/hive-thriftserver`) - Includes support for the SQL CLI (`bin/spark-sql`) and a HiveServer2 (for JDBC/ODBC) compatible server.

Running `./sql/create-docs.sh` generates SQL documentation for built-in functions under `sql/site`, and SQL configuration documentation that gets included as part of `configuration.md` in the main `docs` directory.