4b865104b3
## What changes were proposed in this pull request? This PR adds some tests converted from pivot.sql to test UDFs following the combination guide in [SPARK-27921](https://issues.apache.org/jira/browse/SPARK-27921). <details><summary>Diff comparing to 'pivot.sql'</summary> <p> ```diff diff --git a/sql/core/src/test/resources/sql-tests/results/pivot.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/udf-pivot.sql.out index 9a8f783da4..cb9e4d736c 100644 --- a/sql/core/src/test/resources/sql-tests/results/pivot.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/udf/udf-pivot.sql.out -1,5 +1,5 -- Automatically generated by SQLQueryTestSuite --- Number of queries: 32 +-- Number of queries: 30 -- !query 0 -40,14 +40,14 struct<> -- !query 3 SELECT * FROM ( - SELECT year, course, earnings FROM courseSales + SELECT udf(year), course, earnings FROM courseSales ) PIVOT ( - sum(earnings) + udf(sum(earnings)) FOR course IN ('dotNET', 'Java') ) -- !query 3 schema -struct<year:int,dotNET:bigint,Java:bigint> +struct<CAST(udf(cast(year as string)) AS INT):int,dotNET:bigint,Java:bigint> -- !query 3 output 2012 15000 20000 2013 48000 30000 -56,7 +56,7 struct<year:int,dotNET:bigint,Java:bigint> -- !query 4 SELECT * FROM courseSales PIVOT ( - sum(earnings) + udf(sum(earnings)) FOR year IN (2012, 2013) ) -- !query 4 schema -71,11 +71,11 SELECT * FROM ( SELECT year, course, earnings FROM courseSales ) PIVOT ( - sum(earnings), avg(earnings) + udf(sum(earnings)), udf(avg(earnings)) FOR course IN ('dotNET', 'Java') ) -- !query 5 schema -struct<year:int,dotNET_sum(CAST(earnings AS BIGINT)):bigint,dotNET_avg(CAST(earnings AS BIGINT)):double,Java_sum(CAST(earnings AS BIGINT)):bigint,Java_avg(CAST(earnings AS BIGINT)):double> +struct<year:int,dotNET_CAST(udf(cast(sum(cast(earnings as bigint)) as string)) AS BIGINT):bigint,dotNET_CAST(udf(cast(avg(cast(earnings as bigint)) as string)) AS DOUBLE):double,Java_CAST(udf(cast(sum(cast(earnings as bigint)) as string)) AS 
BIGINT):bigint,Java_CAST(udf(cast(avg(cast(earnings as bigint)) as string)) AS DOUBLE):double> -- !query 5 output 2012 15000 7500.0 20000 20000.0 2013 48000 48000.0 30000 30000.0 -83,10 +83,10 struct<year:int,dotNET_sum(CAST(earnings AS BIGINT)):bigint,dotNET_avg(CAST(earn -- !query 6 SELECT * FROM ( - SELECT course, earnings FROM courseSales + SELECT udf(course) as course, earnings FROM courseSales ) PIVOT ( - sum(earnings) + udf(sum(earnings)) FOR course IN ('dotNET', 'Java') ) -- !query 6 schema -100,23 +100,23 SELECT * FROM ( SELECT year, course, earnings FROM courseSales ) PIVOT ( - sum(earnings), min(year) + udf(sum(udf(earnings))), udf(min(year)) FOR course IN ('dotNET', 'Java') ) -- !query 7 schema -struct<dotNET_sum(CAST(earnings AS BIGINT)):bigint,dotNET_min(year):int,Java_sum(CAST(earnings AS BIGINT)):bigint,Java_min(year):int> +struct<dotNET_CAST(udf(cast(sum(cast(cast(udf(cast(earnings as string)) as int) as bigint)) as string)) AS BIGINT):bigint,dotNET_CAST(udf(cast(min(year) as string)) AS INT):int,Java_CAST(udf(cast(sum(cast(cast(udf(cast(earnings as string)) as int) as bigint)) as string)) AS BIGINT):bigint,Java_CAST(udf(cast(min(year) as string)) AS INT):int> -- !query 7 output 63000 2012 50000 2012 -- !query 8 SELECT * FROM ( - SELECT course, year, earnings, s + SELECT course, year, earnings, udf(s) as s FROM courseSales JOIN years ON year = y ) PIVOT ( - sum(earnings) + udf(sum(earnings)) FOR s IN (1, 2) ) -- !query 8 schema -135,11 +135,11 SELECT * FROM ( JOIN years ON year = y ) PIVOT ( - sum(earnings), min(s) + udf(sum(earnings)), udf(min(s)) FOR course IN ('dotNET', 'Java') ) -- !query 9 schema -struct<year:int,dotNET_sum(CAST(earnings AS BIGINT)):bigint,dotNET_min(s):int,Java_sum(CAST(earnings AS BIGINT)):bigint,Java_min(s):int> +struct<year:int,dotNET_CAST(udf(cast(sum(cast(earnings as bigint)) as string)) AS BIGINT):bigint,dotNET_CAST(udf(cast(min(s) as string)) AS INT):int,Java_CAST(udf(cast(sum(cast(earnings as bigint)) as string)) AS 
BIGINT):bigint,Java_CAST(udf(cast(min(s) as string)) AS INT):int> -- !query 9 output 2012 15000 1 20000 1 2013 48000 2 30000 2 -152,7 +152,7 SELECT * FROM ( JOIN years ON year = y ) PIVOT ( - sum(earnings * s) + udf(sum(earnings * s)) FOR course IN ('dotNET', 'Java') ) -- !query 10 schema -167,7 +167,7 SELECT 2012_s, 2013_s, 2012_a, 2013_a, c FROM ( SELECT year y, course c, earnings e FROM courseSales ) PIVOT ( - sum(e) s, avg(e) a + udf(sum(e)) s, udf(avg(e)) a FOR y IN (2012, 2013) ) -- !query 11 schema -182,7 +182,7 SELECT firstYear_s, secondYear_s, firstYear_a, secondYear_a, c FROM ( SELECT year y, course c, earnings e FROM courseSales ) PIVOT ( - sum(e) s, avg(e) a + udf(sum(e)) s, udf(avg(e)) a FOR y IN (2012 as firstYear, 2013 secondYear) ) -- !query 12 schema -195,7 +195,7 struct<firstYear_s:bigint,secondYear_s:bigint,firstYear_a:double,secondYear_a:do -- !query 13 SELECT * FROM courseSales PIVOT ( - abs(earnings) + udf(abs(earnings)) FOR year IN (2012, 2013) ) -- !query 13 schema -210,7 +210,7 SELECT * FROM ( SELECT year, course, earnings FROM courseSales ) PIVOT ( - sum(earnings), year + udf(sum(earnings)), year FOR course IN ('dotNET', 'Java') ) -- !query 14 schema -225,7 +225,7 SELECT * FROM ( SELECT course, earnings FROM courseSales ) PIVOT ( - sum(earnings) + udf(sum(earnings)) FOR year IN (2012, 2013) ) -- !query 15 schema -240,11 +240,11 SELECT * FROM ( SELECT year, course, earnings FROM courseSales ) PIVOT ( - ceil(sum(earnings)), avg(earnings) + 1 as a1 + udf(ceil(udf(sum(earnings)))), avg(earnings) + 1 as a1 FOR course IN ('dotNET', 'Java') ) -- !query 16 schema -struct<year:int,dotNET_CEIL(sum(CAST(earnings AS BIGINT))):bigint,dotNET_a1:double,Java_CEIL(sum(CAST(earnings AS BIGINT))):bigint,Java_a1:double> +struct<year:int,dotNET_CAST(udf(cast(CEIL(cast(udf(cast(sum(cast(earnings as bigint)) as string)) as bigint)) as string)) AS BIGINT):bigint,dotNET_a1:double,Java_CAST(udf(cast(CEIL(cast(udf(cast(sum(cast(earnings as bigint)) as string)) as 
bigint)) as string)) AS BIGINT):bigint,Java_a1:double> -- !query 16 output 2012 15000 7501.0 20000 20001.0 2013 48000 48001.0 30000 30001.0 -255,7 +255,7 SELECT * FROM ( SELECT year, course, earnings FROM courseSales ) PIVOT ( - sum(avg(earnings)) + sum(udf(avg(earnings))) FOR course IN ('dotNET', 'Java') ) -- !query 17 schema -272,7 +272,7 SELECT * FROM ( JOIN years ON year = y ) PIVOT ( - sum(earnings) + udf(sum(earnings)) FOR (course, year) IN (('dotNET', 2012), ('Java', 2013)) ) -- !query 18 schema -289,7 +289,7 SELECT * FROM ( JOIN years ON year = y ) PIVOT ( - sum(earnings) + udf(sum(earnings)) FOR (course, s) IN (('dotNET', 2) as c1, ('Java', 1) as c2) ) -- !query 19 schema -306,7 +306,7 SELECT * FROM ( JOIN years ON year = y ) PIVOT ( - sum(earnings) + udf(sum(earnings)) FOR (course, year) IN ('dotNET', 'Java') ) -- !query 20 schema -319,7 +319,7 Invalid pivot value 'dotNET': value data type string does not match pivot column -- !query 21 SELECT * FROM courseSales PIVOT ( - sum(earnings) + udf(sum(earnings)) FOR year IN (s, 2013) ) -- !query 21 schema -332,7 +332,7 cannot resolve '`s`' given input columns: [coursesales.course, coursesales.earni -- !query 22 SELECT * FROM courseSales PIVOT ( - sum(earnings) + udf(sum(earnings)) FOR year IN (course, 2013) ) -- !query 22 schema -343,151 +343,118 Literal expressions required for pivot values, found 'course#x'; -- !query 23 -SELECT * FROM ( - SELECT course, year, a - FROM courseSales - JOIN yearsWithComplexTypes ON year = y -) -PIVOT ( - min(a) - FOR course IN ('dotNET', 'Java') -) --- !query 23 schema -struct<year:int,dotNET:array<int>,Java:array<int>> --- !query 23 output -2012 [1,1] [1,1] -2013 [2,2] [2,2] - - --- !query 24 -SELECT * FROM ( - SELECT course, year, y, a - FROM courseSales - JOIN yearsWithComplexTypes ON year = y -) -PIVOT ( - max(a) - FOR (y, course) IN ((2012, 'dotNET'), (2013, 'Java')) -) --- !query 24 schema -struct<year:int,[2012, dotNET]:array<int>,[2013, Java]:array<int>> --- !query 24 
output -2012 [1,1] NULL -2013 NULL [2,2] - - --- !query 25 SELECT * FROM ( SELECT earnings, year, a FROM courseSales JOIN yearsWithComplexTypes ON year = y ) PIVOT ( - sum(earnings) + udf(sum(earnings)) FOR a IN (array(1, 1), array(2, 2)) ) --- !query 25 schema +-- !query 23 schema struct<year:int,[1, 1]:bigint,[2, 2]:bigint> --- !query 25 output +-- !query 23 output 2012 35000 NULL 2013 NULL 78000 --- !query 26 +-- !query 24 SELECT * FROM ( - SELECT course, earnings, year, a + SELECT course, earnings, udf(year) as year, a FROM courseSales JOIN yearsWithComplexTypes ON year = y ) PIVOT ( - sum(earnings) + udf(sum(earnings)) FOR (course, a) IN (('dotNET', array(1, 1)), ('Java', array(2, 2))) ) --- !query 26 schema +-- !query 24 schema struct<year:int,[dotNET, [1, 1]]:bigint,[Java, [2, 2]]:bigint> --- !query 26 output +-- !query 24 output 2012 15000 NULL 2013 NULL 30000 --- !query 27 +-- !query 25 SELECT * FROM ( SELECT earnings, year, s FROM courseSales JOIN yearsWithComplexTypes ON year = y ) PIVOT ( - sum(earnings) + udf(sum(earnings)) FOR s IN ((1, 'a'), (2, 'b')) ) --- !query 27 schema +-- !query 25 schema struct<year:int,[1, a]:bigint,[2, b]:bigint> --- !query 27 output +-- !query 25 output 2012 35000 NULL 2013 NULL 78000 --- !query 28 +-- !query 26 SELECT * FROM ( SELECT course, earnings, year, s FROM courseSales JOIN yearsWithComplexTypes ON year = y ) PIVOT ( - sum(earnings) + udf(sum(earnings)) FOR (course, s) IN (('dotNET', (1, 'a')), ('Java', (2, 'b'))) ) --- !query 28 schema +-- !query 26 schema struct<year:int,[dotNET, [1, a]]:bigint,[Java, [2, b]]:bigint> --- !query 28 output +-- !query 26 output 2012 15000 NULL 2013 NULL 30000 --- !query 29 +-- !query 27 SELECT * FROM ( SELECT earnings, year, m FROM courseSales JOIN yearsWithComplexTypes ON year = y ) PIVOT ( - sum(earnings) + udf(sum(earnings)) FOR m IN (map('1', 1), map('2', 2)) ) --- !query 29 schema +-- !query 27 schema struct<> --- !query 29 output +-- !query 27 output 
org.apache.spark.sql.AnalysisException Invalid pivot column 'm#x'. Pivot columns must be comparable.; --- !query 30 +-- !query 28 SELECT * FROM ( SELECT course, earnings, year, m FROM courseSales JOIN yearsWithComplexTypes ON year = y ) PIVOT ( - sum(earnings) + udf(sum(earnings)) FOR (course, m) IN (('dotNET', map('1', 1)), ('Java', map('2', 2))) ) --- !query 30 schema +-- !query 28 schema struct<> --- !query 30 output +-- !query 28 output org.apache.spark.sql.AnalysisException Invalid pivot column 'named_struct(course, course#x, m, m#x)'. Pivot columns must be comparable.; --- !query 31 +-- !query 29 SELECT * FROM ( - SELECT course, earnings, "a" as a, "z" as z, "b" as b, "y" as y, "c" as c, "x" as x, "d" as d, "w" as w + SELECT course, earnings, udf("a") as a, udf("z") as z, udf("b") as b, udf("y") as y, + udf("c") as c, udf("x") as x, udf("d") as d, udf("w") as w FROM courseSales ) PIVOT ( - sum(Earnings) + udf(sum(Earnings)) FOR Course IN ('dotNET', 'Java') ) --- !query 31 schema +-- !query 29 schema struct<a:string,z:string,b:string,y:string,c:string,x:string,d:string,w:string,dotNET:bigint,Java:bigint> --- !query 31 output +-- !query 29 output a z b y c x d w 63000 50000 ``` </p> </details> ## How was this patch tested? Tested as guided in [SPARK-27921](https://issues.apache.org/jira/browse/SPARK-27921). Closes #25122 from chitralverma/SPARK-28286. Authored-by: chitralverma <chitralverma@gmail.com> Signed-off-by: HyukjinKwon <gurwls223@apache.org> |
| | | |
|---|---|---|
.github | ||
assembly | ||
bin | ||
build | ||
common | ||
conf | ||
core | ||
data | ||
dev | ||
docs | ||
examples | ||
external | ||
graph | ||
graphx | ||
hadoop-cloud | ||
launcher | ||
licenses | ||
licenses-binary | ||
mllib | ||
mllib-local | ||
project | ||
python | ||
R | ||
repl | ||
resource-managers | ||
sbin | ||
sql | ||
streaming | ||
tools | ||
.gitattributes | ||
.gitignore | ||
appveyor.yml | ||
CONTRIBUTING.md | ||
LICENSE | ||
LICENSE-binary | ||
NOTICE | ||
NOTICE-binary | ||
pom.xml | ||
README.md | ||
scalastyle-config.xml |
Apache Spark
Spark is a unified analytics engine for large-scale data processing. It provides high-level APIs in Scala, Java, Python, and R, and an optimized engine that supports general computation graphs for data analysis. It also supports a rich set of higher-level tools including Spark SQL for SQL and DataFrames, MLlib for machine learning, GraphX for graph processing, and Structured Streaming for stream processing.
Online Documentation
You can find the latest Spark documentation, including a programming guide, on the project web page. This README file only contains basic setup instructions.
Building Spark
Spark is built using Apache Maven. To build Spark and its example programs, run:
build/mvn -DskipTests clean package
(You do not need to do this if you downloaded a pre-built package.)
You can build Spark using more than one thread by using the -T option with Maven, see "Parallel builds in Maven 3". More detailed documentation is available from the project site, at "Building Spark".
For general development tips, including info on developing Spark using an IDE, see "Useful Developer Tools".
Interactive Scala Shell
The easiest way to start using Spark is through the Scala shell:
./bin/spark-shell
Try the following command, which should return 1,000,000,000:
scala> spark.range(1000 * 1000 * 1000).count()
Interactive Python Shell
Alternatively, if you prefer Python, you can use the Python shell:
./bin/pyspark
And run the following command, which should also return 1,000,000,000:
>>> spark.range(1000 * 1000 * 1000).count()
Example Programs
Spark also comes with several sample programs in the examples directory.
To run one of them, use ./bin/run-example <class> [params]. For example:
./bin/run-example SparkPi
will run the Pi example locally.
You can set the MASTER environment variable when running examples to submit
examples to a cluster. This can be a mesos:// or spark:// URL,
"yarn" to run on YARN, and "local" to run
locally with one thread, or "local[N]" to run locally with N threads. You
can also use an abbreviated class name if the class is in the examples
package. For instance:
MASTER=spark://host:7077 ./bin/run-example SparkPi
Many of the example programs print usage help if no params are given.
Running Tests
Testing first requires building Spark. Once Spark is built, tests can be run using:
./dev/run-tests
Please see the guidance on how to run tests for a module, or individual tests.
There is also a Kubernetes integration test, see resource-managers/kubernetes/integration-tests/README.md
A Note About Hadoop Versions
Spark uses the Hadoop core library to talk to HDFS and other Hadoop-supported storage systems. Because the protocols have changed in different versions of Hadoop, you must build Spark against the same version that your cluster runs.
Please refer to the build documentation at "Specifying the Hadoop Version and Enabling YARN" for detailed guidance on building for a particular distribution of Hadoop, including building for particular Hive and Hive Thriftserver distributions.
Configuration
Please refer to the Configuration Guide in the online documentation for an overview on how to configure Spark.
Contributing
Please review the Contribution to Spark guide for information on how to get started contributing to the project.