# [SPARK-23821][SQL] Collection function: flatten
## What changes were proposed in this pull request?

This PR adds a new collection function, `flatten`, that transforms an array of arrays into a single array. The PR comprises:
- An expression for flattening nested array structures
- A `flatten` function for the DataFrame API
- A wrapper for PySpark (see the usage sketch below)

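As a quick illustration of the new function, here is a minimal usage sketch (assuming a spark-shell session so the implicits are in scope; the output in the comments is illustrative):

```
import org.apache.spark.sql.functions.flatten

val df = Seq(
  Seq(Seq(1, 2), Seq(3, 4)),
  Seq(Seq(5), null)
).toDF("data")

df.select(flatten($"data")).show()
// Inner arrays are concatenated in order; a null sub-array
// makes the whole result null.
// +-------------+
// |flatten(data)|
// +-------------+
// | [1, 2, 3, 4]|
// |         null|
// +-------------+
```
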
## How was this patch tested?

New tests were added to:
- CollectionExpressionsSuite
- DataFrameFunctionsSuite

## Codegen examples
### Primitive type
```
import org.apache.spark.sql.execution.debug._
import org.apache.spark.sql.functions.flatten

val df = Seq(
  Seq(Seq(1, 2), Seq(4, 5)),
  Seq(null, Seq(1))
).toDF("i")
// The tautological filter keeps a Filter stage in the plan, so the
// generated code below shows the explicit null handling for "i".
df.filter($"i".isNotNull || $"i".isNull).select(flatten($"i")).debugCodegen
```
Result:
```
/* 033 */         boolean inputadapter_isNull = inputadapter_row.isNullAt(0);
/* 034 */         ArrayData inputadapter_value = inputadapter_isNull ?
/* 035 */         null : (inputadapter_row.getArray(0));
/* 036 */
/* 037 */         boolean filter_value = true;
/* 038 */
/* 039 */         if (!(!inputadapter_isNull)) {
/* 040 */           filter_value = inputadapter_isNull;
/* 041 */         }
/* 042 */         if (!filter_value) continue;
/* 043 */
/* 044 */         ((org.apache.spark.sql.execution.metric.SQLMetric) references[0] /* numOutputRows */).add(1);
/* 045 */
/* 046 */         boolean project_isNull = inputadapter_isNull;
/* 047 */         ArrayData project_value = null;
/* 048 */
/* 049 */         if (!inputadapter_isNull) {
/* 050 */           for (int z = 0; !project_isNull && z < inputadapter_value.numElements(); z++) {
/* 051 */             project_isNull |= inputadapter_value.isNullAt(z);
/* 052 */           }
/* 053 */           if (!project_isNull) {
/* 054 */             long project_numElements = 0;
/* 055 */             for (int z = 0; z < inputadapter_value.numElements(); z++) {
/* 056 */               project_numElements += inputadapter_value.getArray(z).numElements();
/* 057 */             }
/* 058 */             if (project_numElements > 2147483632) {
/* 059 */               throw new RuntimeException("Unsuccessful try to flatten an array of arrays with " +
/* 060 */                 project_numElements + " elements due to exceeding the array size limit 2147483632.");
/* 061 */             }
/* 062 */
/* 063 */             long project_size = UnsafeArrayData.calculateSizeOfUnderlyingByteArray(
/* 064 */               project_numElements,
/* 065 */               4);
/* 066 */             if (project_size > 2147483632) {
/* 067 */               throw new RuntimeException("Unsuccessful try to flatten an array of arrays with " +
/* 068 */                 project_size + " bytes of data due to exceeding the limit 2147483632" +
/* 069 */                 " bytes for UnsafeArrayData.");
/* 070 */             }
/* 071 */
/* 072 */             byte[] project_array = new byte[(int)project_size];
/* 073 */             UnsafeArrayData project_tempArrayData = new UnsafeArrayData();
/* 074 */             Platform.putLong(project_array, 16, project_numElements);
/* 075 */             project_tempArrayData.pointTo(project_array, 16, (int)project_size);
/* 076 */             int project_counter = 0;
/* 077 */             for (int k = 0; k < inputadapter_value.numElements(); k++) {
/* 078 */               ArrayData arr = inputadapter_value.getArray(k);
/* 079 */               for (int l = 0; l < arr.numElements(); l++) {
/* 080 */                 if (arr.isNullAt(l)) {
/* 081 */                   project_tempArrayData.setNullAt(project_counter);
/* 082 */                 } else {
/* 083 */                   project_tempArrayData.setInt(
/* 084 */                     project_counter,
/* 085 */                     arr.getInt(l)
/* 086 */                   );
/* 087 */                 }
/* 088 */                 project_counter++;
/* 089 */               }
/* 090 */             }
/* 091 */             project_value = project_tempArrayData;
/* 092 */
/* 093 */           }
/* 094 */
/* 095 */         }
```
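The generated code takes a two-pass approach: it first scans for null sub-arrays (a null sub-array makes the whole result null) and counts the total number of elements, then allocates a single `UnsafeArrayData` buffer and copies every element across. The limit 2147483632 is `Integer.MAX_VALUE - 15` (2147483647 - 15), presumably chosen so that the buffer size can still be rounded up to a word boundary without overflowing an int. A plain-Scala sketch of the same two-pass strategy, with the null pre-scan and the byte-size check omitted for brevity (all names here are illustrative, not taken from the generated code):

```
def flattenInts(input: Array[Array[Int]]): Array[Int] = {
  // Pass 1: count the total number of elements and enforce the
  // element-count limit (2147483632 = Integer.MAX_VALUE - 15) up front.
  val numElements = input.map(_.length.toLong).sum
  require(numElements <= 2147483632L,
    s"Cannot flatten an array of arrays with $numElements elements")
  // Pass 2: allocate the result once and copy every element across,
  // mirroring the nested k/l loops in the generated code.
  val result = new Array[Int](numElements.toInt)
  var counter = 0
  for (arr <- input; v <- arr) {
    result(counter) = v
    counter += 1
  }
  result
}
```
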
### Non-primitive type
```
import org.apache.spark.sql.execution.debug._
import org.apache.spark.sql.functions.flatten

val df = Seq(
  Seq(Seq("a", "b"), Seq(null, "d")),
  Seq(null, Seq("a"))
).toDF("s")
df.filter($"s".isNotNull || $"s".isNull).select(flatten($"s")).debugCodegen
```
Result:
```
/* 033 */         boolean inputadapter_isNull = inputadapter_row.isNullAt(0);
/* 034 */         ArrayData inputadapter_value = inputadapter_isNull ?
/* 035 */         null : (inputadapter_row.getArray(0));
/* 036 */
/* 037 */         boolean filter_value = true;
/* 038 */
/* 039 */         if (!(!inputadapter_isNull)) {
/* 040 */           filter_value = inputadapter_isNull;
/* 041 */         }
/* 042 */         if (!filter_value) continue;
/* 043 */
/* 044 */         ((org.apache.spark.sql.execution.metric.SQLMetric) references[0] /* numOutputRows */).add(1);
/* 045 */
/* 046 */         boolean project_isNull = inputadapter_isNull;
/* 047 */         ArrayData project_value = null;
/* 048 */
/* 049 */         if (!inputadapter_isNull) {
/* 050 */           for (int z = 0; !project_isNull && z < inputadapter_value.numElements(); z++) {
/* 051 */             project_isNull |= inputadapter_value.isNullAt(z);
/* 052 */           }
/* 053 */           if (!project_isNull) {
/* 054 */             long project_numElements = 0;
/* 055 */             for (int z = 0; z < inputadapter_value.numElements(); z++) {
/* 056 */               project_numElements += inputadapter_value.getArray(z).numElements();
/* 057 */             }
/* 058 */             if (project_numElements > 2147483632) {
/* 059 */               throw new RuntimeException("Unsuccessful try to flatten an array of arrays with " +
/* 060 */                 project_numElements + " elements due to exceeding the array size limit 2147483632.");
/* 061 */             }
/* 062 */
/* 063 */             Object[] project_arrayObject = new Object[(int)project_numElements];
/* 064 */             int project_counter = 0;
/* 065 */             for (int k = 0; k < inputadapter_value.numElements(); k++) {
/* 066 */               ArrayData arr = inputadapter_value.getArray(k);
/* 067 */               for (int l = 0; l < arr.numElements(); l++) {
/* 068 */                 project_arrayObject[project_counter] = arr.getUTF8String(l);
/* 069 */                 project_counter++;
/* 070 */               }
/* 071 */             }
/* 072 */             project_value = new org.apache.spark.sql.catalyst.util.GenericArrayData(project_arrayObject);
/* 073 */
/* 074 */           }
/* 075 */
/* 076 */         }
```
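The non-primitive path is noticeably simpler: element references are copied into an `Object[]` and wrapped in a `GenericArrayData`, so only the element-count limit is checked and null entries are plain null references rather than bits in an `UnsafeArrayData` null bitmap. For reference, the expected result of the query above (the output shown is illustrative):

```
df.filter($"s".isNotNull || $"s".isNull).select(flatten($"s")).show(false)
// +---------------+
// |flatten(s)     |
// +---------------+
// |[a, b, null, d]|
// |null           |
// +---------------+
```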

Author: mn-mikke <mrkAha12346github>

Closes #20938 from mn-mikke/feature/array-api-flatten-to-master.