spark-instrumented-optimizer

History

Stavros Kontopoulos ec84415358 [SPARK-28280][PYTHON][SQL][TESTS][FOLLOW-UP] Add UDF cases into group by clause in 'udf-group-by.sql' ## What changes were proposed in this pull request? This PR is a followup of a fix as described in here: https://github.com/apache/spark/pull/25215#issuecomment-517659981 <details><summary>Diff comparing to 'group-by.sql'</summary> <p> ```diff diff --git a/sql/core/src/test/resources/sql-tests/results/udf/udf-group-by.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/udf-group-by.sql.out index 3a5df254f2..febe47b5ba 100644 --- a/sql/core/src/test/resources/sql-tests/results/udf/udf-group-by.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/udf/udf-group-by.sql.out -13,26 +13,26 struct<> -- !query 1 -SELECT a, COUNT(b) FROM testData +SELECT udf(a), udf(COUNT(b)) FROM testData -- !query 1 schema struct<> -- !query 1 output org.apache.spark.sql.AnalysisException -grouping expressions sequence is empty, and 'testdata.`a`' is not an aggregate function. Wrap '(count(testdata.`b`) AS `count(b)`)' in windowing function(s) or wrap 'testdata.`a`' in first() (or first_value) if you don't care which value you get.; +grouping expressions sequence is empty, and 'testdata.`a`' is not an aggregate function. Wrap '(CAST(udf(cast(count(b) as string)) AS BIGINT) AS `CAST(udf(cast(count(b) as string)) AS BIGINT)`)' in windowing function(s) or wrap 'testdata.`a`' in first() (or first_value) if you don't care which value you get.; -- !query 2 -SELECT COUNT(a), COUNT(b) FROM testData +SELECT COUNT(udf(a)), udf(COUNT(b)) FROM testData -- !query 2 schema -struct<count(a):bigint,count(b):bigint> +struct<count(CAST(udf(cast(a as string)) AS INT)):bigint,CAST(udf(cast(count(b) as string)) AS BIGINT):bigint> -- !query 2 output 7 7 -- !query 3 -SELECT a, COUNT(b) FROM testData GROUP BY a +SELECT udf(a), COUNT(udf(b)) FROM testData GROUP BY a -- !query 3 schema -struct<a:int,count(b):bigint> +struct<CAST(udf(cast(a as string)) AS INT):int,count(CAST(udf(cast(b as string)) AS INT)):bigint> -- !query 3 output 1 2 2 2 -41,7 +41,7 NULL 1 -- !query 4 -SELECT a, COUNT(b) FROM testData GROUP BY b +SELECT udf(a), udf(COUNT(udf(b))) FROM testData GROUP BY b -- !query 4 schema struct<> -- !query 4 output -50,9 +50,9 expression 'testdata.`a`' is neither present in the group by, nor is it an aggre -- !query 5 -SELECT COUNT(a), COUNT(b) FROM testData GROUP BY a +SELECT COUNT(udf(a)), COUNT(udf(b)) FROM testData GROUP BY udf(a) -- !query 5 schema -struct<count(a):bigint,count(b):bigint> +struct<count(CAST(udf(cast(a as string)) AS INT)):bigint,count(CAST(udf(cast(b as string)) AS INT)):bigint> -- !query 5 output 0 1 2 2 -61,15 +61,15 struct<count(a):bigint,count(b):bigint> -- !query 6 -SELECT 'foo', COUNT(a) FROM testData GROUP BY 1 +SELECT 'foo', COUNT(udf(a)) FROM testData GROUP BY 1 -- !query 6 schema -struct<foo:string,count(a):bigint> +struct<foo:string,count(CAST(udf(cast(a as string)) AS INT)):bigint> -- !query 6 output foo 7 -- !query 7 -SELECT 'foo' FROM testData WHERE a = 0 GROUP BY 1 +SELECT 'foo' FROM testData WHERE a = 0 GROUP BY udf(1) -- !query 7 schema struct<foo:string> -- !query 7 output -77,25 +77,25 struct<foo:string> -- !query 8 -SELECT 'foo', APPROX_COUNT_DISTINCT(a) FROM testData WHERE a = 0 GROUP BY 1 +SELECT 'foo', udf(APPROX_COUNT_DISTINCT(udf(a))) FROM testData WHERE a = 0 GROUP BY udf(1) -- !query 8 schema -struct<foo:string,approx_count_distinct(a):bigint> +struct<foo:string,CAST(udf(cast(approx_count_distinct(cast(udf(cast(a as string)) as int), 0.05, 0, 0) as string)) AS BIGINT):bigint> -- !query 8 output -- !query 9 -SELECT 'foo', MAX(STRUCT(a)) FROM testData WHERE a = 0 GROUP BY 1 +SELECT 'foo', MAX(STRUCT(udf(a))) FROM testData WHERE a = 0 GROUP BY udf(1) -- !query 9 schema -struct<foo:string,max(named_struct(a, a)):struct<a:int>> +struct<foo:string,max(named_struct(col1, CAST(udf(cast(a as string)) AS INT))):struct<col1:int>> -- !query 9 output -- !query 10 -SELECT a + b, COUNT(b) FROM testData GROUP BY a + b +SELECT udf(a + b), udf(COUNT(b)) FROM testData GROUP BY a + b -- !query 10 schema -struct<(a + b):int,count(b):bigint> +struct<CAST(udf(cast((a + b) as string)) AS INT):int,CAST(udf(cast(count(b) as string)) AS BIGINT):bigint> -- !query 10 output 2 1 3 2 -105,7 +105,7 NULL 1 -- !query 11 -SELECT a + 2, COUNT(b) FROM testData GROUP BY a + 1 +SELECT udf(a + 2), udf(COUNT(b)) FROM testData GROUP BY a + 1 -- !query 11 schema struct<> -- !query 11 output -114,9 +114,9 expression 'testdata.`a`' is neither present in the group by, nor is it an aggre -- !query 12 -SELECT a + 1 + 1, COUNT(b) FROM testData GROUP BY a + 1 +SELECT udf(a + 1) + 1, udf(COUNT(b)) FROM testData GROUP BY udf(a + 1) -- !query 12 schema -struct<((a + 1) + 1):int,count(b):bigint> +struct<(CAST(udf(cast((a + 1) as string)) AS INT) + 1):int,CAST(udf(cast(count(b) as string)) AS BIGINT):bigint> -- !query 12 output 3 2 4 2 -125,26 +125,26 NULL 1 -- !query 13 -SELECT SKEWNESS(a), KURTOSIS(a), MIN(a), MAX(a), AVG(a), VARIANCE(a), STDDEV(a), SUM(a), COUNT(a) +SELECT SKEWNESS(udf(a)), udf(KURTOSIS(a)), udf(MIN(a)), MAX(udf(a)), udf(AVG(udf(a))), udf(VARIANCE(a)), STDDEV(udf(a)), udf(SUM(a)), udf(COUNT(a)) FROM testData -- !query 13 schema -struct<skewness(CAST(a AS DOUBLE)):double,kurtosis(CAST(a AS DOUBLE)):double,min(a):int,max(a):int,avg(a):double,var_samp(CAST(a AS DOUBLE)):double,stddev_samp(CAST(a AS DOUBLE)):double,sum(a):bigint,count(a):bigint> +struct<skewness(CAST(CAST(udf(cast(a as string)) AS INT) AS DOUBLE)):double,CAST(udf(cast(kurtosis(cast(a as double)) as string)) AS DOUBLE):double,CAST(udf(cast(min(a) as string)) AS INT):int,max(CAST(udf(cast(a as string)) AS INT)):int,CAST(udf(cast(avg(cast(cast(udf(cast(a as string)) as int) as bigint)) as string)) AS DOUBLE):double,CAST(udf(cast(var_samp(cast(a as double)) as string)) AS DOUBLE):double,stddev_samp(CAST(CAST(udf(cast(a as string)) AS INT) AS DOUBLE)):double,CAST(udf(cast(sum(cast(a as bigint)) as string)) AS BIGINT):bigint,CAST(udf(cast(count(a) as string)) AS BIGINT):bigint> -- !query 13 output -0.2723801058145729 -1.5069204152249134 1 3 2.142857142857143 0.8095238095238094 0.8997354108424372 15 7 -- !query 14 -SELECT COUNT(DISTINCT b), COUNT(DISTINCT b, c) FROM (SELECT 1 AS a, 2 AS b, 3 AS c) GROUP BY a +SELECT COUNT(DISTINCT udf(b)), udf(COUNT(DISTINCT b, c)) FROM (SELECT 1 AS a, 2 AS b, 3 AS c) GROUP BY udf(a) -- !query 14 schema -struct<count(DISTINCT b):bigint,count(DISTINCT b, c):bigint> +struct<count(DISTINCT CAST(udf(cast(b as string)) AS INT)):bigint,CAST(udf(cast(count(distinct b, c) as string)) AS BIGINT):bigint> -- !query 14 output 1 1 -- !query 15 -SELECT a AS k, COUNT(b) FROM testData GROUP BY k +SELECT udf(a) AS k, COUNT(udf(b)) FROM testData GROUP BY k -- !query 15 schema -struct<k:int,count(b):bigint> +struct<k:int,count(CAST(udf(cast(b as string)) AS INT)):bigint> -- !query 15 output 1 2 2 2 -153,21 +153,21 NULL 1 -- !query 16 -SELECT a AS k, COUNT(b) FROM testData GROUP BY k HAVING k > 1 +SELECT a AS k, udf(COUNT(b)) FROM testData GROUP BY k HAVING k > 1 -- !query 16 schema -struct<k:int,count(b):bigint> +struct<k:int,CAST(udf(cast(count(b) as string)) AS BIGINT):bigint> -- !query 16 output 2 2 3 2 -- !query 17 -SELECT COUNT(b) AS k FROM testData GROUP BY k +SELECT udf(COUNT(b)) AS k FROM testData GROUP BY k -- !query 17 schema struct<> -- !query 17 output org.apache.spark.sql.AnalysisException -aggregate functions are not allowed in GROUP BY, but found count(testdata.`b`); +aggregate functions are not allowed in GROUP BY, but found CAST(udf(cast(count(b) as string)) AS BIGINT); -- !query 18 -180,7 +180,7 struct<> -- !query 19 -SELECT k AS a, COUNT(v) FROM testDataHasSameNameWithAlias GROUP BY a +SELECT k AS a, udf(COUNT(udf(v))) FROM testDataHasSameNameWithAlias GROUP BY udf(a) -- !query 19 schema struct<> -- !query 19 output -197,32 +197,32 spark.sql.groupByAliases false -- !query 21 -SELECT a AS k, COUNT(b) FROM testData GROUP BY k +SELECT a AS k, udf(COUNT(udf(b))) FROM testData GROUP BY k -- !query 21 schema struct<> -- !query 21 output org.apache.spark.sql.AnalysisException -cannot resolve '`k`' given input columns: [testdata.a, testdata.b]; line 1 pos 47 +cannot resolve '`k`' given input columns: [testdata.a, testdata.b]; line 1 pos 57 -- !query 22 -SELECT a, COUNT(1) FROM testData WHERE false GROUP BY a +SELECT udf(a), COUNT(udf(1)) FROM testData WHERE false GROUP BY udf(a) -- !query 22 schema -struct<a:int,count(1):bigint> +struct<CAST(udf(cast(a as string)) AS INT):int,count(CAST(udf(cast(1 as string)) AS INT)):bigint> -- !query 22 output -- !query 23 -SELECT COUNT(1) FROM testData WHERE false +SELECT udf(COUNT(1)) FROM testData WHERE false -- !query 23 schema -struct<count(1):bigint> +struct<CAST(udf(cast(count(1) as string)) AS BIGINT):bigint> -- !query 23 output 0 -- !query 24 -SELECT 1 FROM (SELECT COUNT(1) FROM testData WHERE false) t +SELECT 1 FROM (SELECT udf(COUNT(1)) FROM testData WHERE false) t -- !query 24 schema struct<1:int> -- !query 24 output -232,7 +232,7 struct<1:int> -- !query 25 SELECT 1 from ( SELECT 1 AS z, - MIN(a.x) + udf(MIN(a.x)) FROM (select 1 as x) a WHERE false ) b -244,32 +244,32 struct<1:int> -- !query 26 -SELECT corr(DISTINCT x, y), corr(DISTINCT y, x), count() +SELECT corr(DISTINCT x, y), udf(corr(DISTINCT y, x)), count() FROM (VALUES (1, 1), (2, 2), (2, 2)) t(x, y) -- !query 26 schema -struct<corr(DISTINCT CAST(x AS DOUBLE), CAST(y AS DOUBLE)):double,corr(DISTINCT CAST(y AS DOUBLE), CAST(x AS DOUBLE)):double,count(1):bigint> +struct<corr(DISTINCT CAST(x AS DOUBLE), CAST(y AS DOUBLE)):double,CAST(udf(cast(corr(distinct cast(y as double), cast(x as double)) as string)) AS DOUBLE):double,count(1):bigint> -- !query 26 output 1.0 1.0 3 -- !query 27 -SELECT 1 FROM range(10) HAVING true +SELECT udf(1) FROM range(10) HAVING true -- !query 27 schema -struct<1:int> +struct<CAST(udf(cast(1 as string)) AS INT):int> -- !query 27 output 1 -- !query 28 -SELECT 1 FROM range(10) HAVING MAX(id) > 0 +SELECT udf(udf(1)) FROM range(10) HAVING MAX(id) > 0 -- !query 28 schema -struct<1:int> +struct<CAST(udf(cast(cast(udf(cast(1 as string)) as int) as string)) AS INT):int> -- !query 28 output 1 -- !query 29 -SELECT id FROM range(10) HAVING id > 0 +SELECT udf(id) FROM range(10) HAVING id > 0 -- !query 29 schema struct<> -- !query 29 output -291,33 +291,33 struct<> -- !query 31 -SELECT every(v), some(v), any(v) FROM test_agg WHERE 1 = 0 +SELECT udf(every(v)), udf(some(v)), any(v) FROM test_agg WHERE 1 = 0 -- !query 31 schema -struct<every(v):boolean,some(v):boolean,any(v):boolean> +struct<CAST(udf(cast(every(v) as string)) AS BOOLEAN):boolean,CAST(udf(cast(some(v) as string)) AS BOOLEAN):boolean,any(v):boolean> -- !query 31 output NULL NULL NULL -- !query 32 -SELECT every(v), some(v), any(v) FROM test_agg WHERE k = 4 +SELECT udf(every(udf(v))), some(v), any(v) FROM test_agg WHERE k = 4 -- !query 32 schema -struct<every(v):boolean,some(v):boolean,any(v):boolean> +struct<CAST(udf(cast(every(cast(udf(cast(v as string)) as boolean)) as string)) AS BOOLEAN):boolean,some(v):boolean,any(v):boolean> -- !query 32 output NULL NULL NULL -- !query 33 -SELECT every(v), some(v), any(v) FROM test_agg WHERE k = 5 +SELECT every(v), udf(some(v)), any(v) FROM test_agg WHERE k = 5 -- !query 33 schema -struct<every(v):boolean,some(v):boolean,any(v):boolean> +struct<every(v):boolean,CAST(udf(cast(some(v) as string)) AS BOOLEAN):boolean,any(v):boolean> -- !query 33 output false true true -- !query 34 -SELECT k, every(v), some(v), any(v) FROM test_agg GROUP BY k +SELECT udf(k), every(v), udf(some(v)), any(v) FROM test_agg GROUP BY udf(k) -- !query 34 schema -struct<k:int,every(v):boolean,some(v):boolean,any(v):boolean> +struct<CAST(udf(cast(k as string)) AS INT):int,every(v):boolean,CAST(udf(cast(some(v) as string)) AS BOOLEAN):boolean,any(v):boolean> -- !query 34 output 1 false true true 2 true true true -327,9 +327,9 struct<k:int,every(v):boolean,some(v):boolean,any(v):boolean> -- !query 35 -SELECT k, every(v) FROM test_agg GROUP BY k HAVING every(v) = false +SELECT udf(k), every(v) FROM test_agg GROUP BY k HAVING every(v) = false -- !query 35 schema -struct<k:int,every(v):boolean> +struct<CAST(udf(cast(k as string)) AS INT):int,every(v):boolean> -- !query 35 output 1 false 3 false -337,77 +337,77 struct<k:int,every(v):boolean> -- !query 36 -SELECT k, every(v) FROM test_agg GROUP BY k HAVING every(v) IS NULL +SELECT udf(k), udf(every(v)) FROM test_agg GROUP BY udf(k) HAVING every(v) IS NULL -- !query 36 schema -struct<k:int,every(v):boolean> +struct<CAST(udf(cast(k as string)) AS INT):int,CAST(udf(cast(every(v) as string)) AS BOOLEAN):boolean> -- !query 36 output 4 NULL -- !query 37 -SELECT k, - Every(v) AS every +SELECT udf(k), + udf(Every(v)) AS every FROM test_agg WHERE k = 2 AND v IN (SELECT Any(v) FROM test_agg WHERE k = 1) -GROUP BY k +GROUP BY udf(k) -- !query 37 schema -struct<k:int,every:boolean> +struct<CAST(udf(cast(k as string)) AS INT):int,every:boolean> -- !query 37 output 2 true -- !query 38 -SELECT k, +SELECT udf(udf(k)), Every(v) AS every FROM test_agg WHERE k = 2 AND v IN (SELECT Every(v) FROM test_agg WHERE k = 1) -GROUP BY k +GROUP BY udf(udf(k)) -- !query 38 schema -struct<k:int,every:boolean> +struct<CAST(udf(cast(cast(udf(cast(k as string)) as int) as string)) AS INT):int,every:boolean> -- !query 38 output -- !query 39 -SELECT every(1) +SELECT every(udf(1)) -- !query 39 schema struct<> -- !query 39 output org.apache.spark.sql.AnalysisException -cannot resolve 'every(1)' due to data type mismatch: Input to function 'every' should have been boolean, but it's [int].; line 1 pos 7 +cannot resolve 'every(CAST(udf(cast(1 as string)) AS INT))' due to data type mismatch: Input to function 'every' should have been boolean, but it's [int].; line 1 pos 7 -- !query 40 -SELECT some(1S) +SELECT some(udf(1S)) -- !query 40 schema struct<> -- !query 40 output org.apache.spark.sql.AnalysisException -cannot resolve 'some(1S)' due to data type mismatch: Input to function 'some' should have been boolean, but it's [smallint].; line 1 pos 7 +cannot resolve 'some(CAST(udf(cast(1 as string)) AS SMALLINT))' due to data type mismatch: Input to function 'some' should have been boolean, but it's [smallint].; line 1 pos 7 -- !query 41 -SELECT any(1L) +SELECT any(udf(1L)) -- !query 41 schema struct<> -- !query 41 output org.apache.spark.sql.AnalysisException -cannot resolve 'any(1L)' due to data type mismatch: Input to function 'any' should have been boolean, but it's [bigint].; line 1 pos 7 +cannot resolve 'any(CAST(udf(cast(1 as string)) AS BIGINT))' due to data type mismatch: Input to function 'any' should have been boolean, but it's [bigint].; line 1 pos 7 -- !query 42 -SELECT every("true") +SELECT udf(every("true")) -- !query 42 schema struct<> -- !query 42 output org.apache.spark.sql.AnalysisException -cannot resolve 'every('true')' due to data type mismatch: Input to function 'every' should have been boolean, but it's [string].; line 1 pos 7 +cannot resolve 'every('true')' due to data type mismatch: Input to function 'every' should have been boolean, but it's [string].; line 1 pos 11 -- !query 43 -428,9 +428,9 struct<k:int,v:boolean,every(v) OVER (PARTITION BY k ORDER BY v ASC NULLS FIRST -- !query 44 -SELECT k, v, some(v) OVER (PARTITION BY k ORDER BY v) FROM test_agg +SELECT k, udf(udf(v)), some(v) OVER (PARTITION BY k ORDER BY v) FROM test_agg -- !query 44 schema -struct<k:int,v:boolean,some(v) OVER (PARTITION BY k ORDER BY v ASC NULLS FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW):boolean> +struct<k:int,CAST(udf(cast(cast(udf(cast(v as string)) as boolean) as string)) AS BOOLEAN):boolean,some(v) OVER (PARTITION BY k ORDER BY v ASC NULLS FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW):boolean> -- !query 44 output 1 false false 1 true true -445,9 +445,9 struct<k:int,v:boolean,some(v) OVER (PARTITION BY k ORDER BY v ASC NULLS FIRST R -- !query 45 -SELECT k, v, any(v) OVER (PARTITION BY k ORDER BY v) FROM test_agg +SELECT udf(udf(k)), v, any(v) OVER (PARTITION BY k ORDER BY v) FROM test_agg -- !query 45 schema -struct<k:int,v:boolean,any(v) OVER (PARTITION BY k ORDER BY v ASC NULLS FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW):boolean> +struct<CAST(udf(cast(cast(udf(cast(k as string)) as int) as string)) AS INT):int,v:boolean,any(v) OVER (PARTITION BY k ORDER BY v ASC NULLS FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW):boolean> -- !query 45 output 1 false false 1 true true -462,17 +462,17 struct<k:int,v:boolean,any(v) OVER (PARTITION BY k ORDER BY v ASC NULLS FIRST RA -- !query 46 -SELECT count() FROM test_agg HAVING count() > 1L +SELECT udf(count()) FROM test_agg HAVING count() > 1L -- !query 46 schema -struct<count(1):bigint> +struct<CAST(udf(cast(count(1) as string)) AS BIGINT):bigint> -- !query 46 output 10 -- !query 47 -SELECT k, max(v) FROM test_agg GROUP BY k HAVING max(v) = true +SELECT k, udf(max(v)) FROM test_agg GROUP BY k HAVING max(v) = true -- !query 47 schema -struct<k:int,max(v):boolean> +struct<k:int,CAST(udf(cast(max(v) as string)) AS BOOLEAN):boolean> -- !query 47 output 1 true 2 true -480,7 +480,7 struct<k:int,max(v):boolean> -- !query 48 -SELECT * FROM (SELECT COUNT() AS cnt FROM test_agg) WHERE cnt > 1L +SELECT FROM (SELECT udf(COUNT()) AS cnt FROM test_agg) WHERE cnt > 1L -- !query 48 schema struct<cnt:bigint> -- !query 48 output -488,7 +488,7 struct<cnt:bigint> -- !query 49 -SELECT count() FROM test_agg WHERE count() > 1L +SELECT udf(count()) FROM test_agg WHERE count() > 1L -- !query 49 schema struct<> -- !query 49 output -500,7 +500,7 Invalid expressions: [count(1)]; -- !query 50 -SELECT count() FROM test_agg WHERE count() + 1L > 1L +SELECT udf(count()) FROM test_agg WHERE count() + 1L > 1L -- !query 50 schema struct<> -- !query 50 output -512,7 +512,7 Invalid expressions: [count(1)]; -- !query 51 -SELECT count() FROM test_agg WHERE k = 1 or k = 2 or count() + 1L > 1L or max(k) > 1 +SELECT udf(count()) FROM test_agg WHERE k = 1 or k = 2 or count(*) + 1L > 1L or max(k) > 1 -- !query 51 schema struct<> -- !query 51 output ``` </p> </details> ## How was this patch tested? Tested as instructed in SPARK-27921. Closes #25360 from skonto/group-by-followup. Authored-by: Stavros Kontopoulos <st.kontopoulos@gmail.com> Signed-off-by: HyukjinKwon <gurwls223@apache.org>		2019-08-13 10:06:32 +09:00
..
benchmarks	[SPARK-27707][SQL] Prune unnecessary nested fields from Generate	2019-07-18 23:32:07 -07:00
src	[SPARK-28280][PYTHON][SQL][TESTS][FOLLOW-UP] Add UDF cases into group by clause in 'udf-group-by.sql'	2019-08-13 10:06:32 +09:00
v1.2.1/src	[SPARK-28108][SQL][test-hadoop3.2] Simplify OrcFilters	2019-06-24 12:23:52 +08:00
v2.3.5/src	[SPARK-28108][SQL][test-hadoop3.2] Simplify OrcFilters	2019-06-24 12:23:52 +08:00
pom.xml	[SPARK-27521][SQL] Move data source v2 to catalyst module	2019-06-05 09:55:55 -07:00