spark-instrumented-optimizer

History

HyukjinKwon 113f8c8d13 [SPARK-28132][PYTHON] Update document type conversion for Pandas UDFs (pyarrow 0.13.0, pandas 0.24.2, Python 3.7)

## What changes were proposed in this pull request?

This PR updates the chart generated at SPARK-25666. We deprecated Python 2. It's better to use Python 3.

We don't have to test `unicode` and `long` anymore in Python 3. So it was removed.

Use this code to generate the chart:

```python
from pyspark.sql.types import *
from pyspark.sql.functions import pandas_udf

columns = [
    ('none', 'object(NoneType)'),
    ('bool', 'bool'),
    ('int8', 'int8'),
    ('int16', 'int16'),
    ('int32', 'int32'),
    ('int64', 'int64'),
    ('uint8', 'uint8'),
    ('uint16', 'uint16'),
    ('uint32', 'uint32'),
    ('uint64', 'uint64'),
    ('float64', 'float16'),
    ('float64', 'float32'),
    ('float64', 'float64'),
    ('date', 'datetime64[ns]'),
    ('tz_aware_dates', 'datetime64[ns, US/Eastern]'),
    ('string', 'object(string)'),
    ('decimal', 'object(Decimal)'),
    ('array', 'object(array[int32])'),
    ('float128', 'float128'),
    ('complex64', 'complex64'),
    ('complex128', 'complex128'),
    ('category', 'category'),
    ('tdeltas', 'timedelta64[ns]'),
]

def create_dataframe():
    import pandas as pd
    import numpy as np
    import decimal
    pdf = pd.DataFrame({
        'none': [None, None],
        'bool': [True, False],
        'int8': np.arange(1, 3).astype('int8'),
        'int16': np.arange(1, 3).astype('int16'),
        'int32': np.arange(1, 3).astype('int32'),
        'int64': np.arange(1, 3).astype('int64'),
        'uint8': np.arange(1, 3).astype('uint8'),
        'uint16': np.arange(1, 3).astype('uint16'),
        'uint32': np.arange(1, 3).astype('uint32'),
        'uint64': np.arange(1, 3).astype('uint64'),
        'float16': np.arange(1, 3).astype('float16'),
        'float32': np.arange(1, 3).astype('float32'),
        'float64': np.arange(1, 3).astype('float64'),
        'float128': np.arange(1, 3).astype('float128'),
        'complex64': np.arange(1, 3).astype('complex64'),
        'complex128': np.arange(1, 3).astype('complex128'),
        'string': list('ab'),
        'array': pd.Series([np.array([1, 2, 3], dtype=np.int32), np.array([1, 2, 3], dtype=np.int32)]),
        'decimal': pd.Series([decimal.Decimal('1'), decimal.Decimal('2')]),
        'date': pd.date_range('19700101', periods=2).values,
        'category': pd.Series(list("AB")).astype('category')})
    pdf['tdeltas'] = [pdf.date.diff()[1], pdf.date.diff()[0]]
    pdf['tz_aware_dates'] = pd.date_range('19700101', periods=2, tz='US/Eastern')
    return pdf

types = [
    BooleanType(),
    ByteType(),
    ShortType(),
    IntegerType(),
    LongType(),
    FloatType(),
    DoubleType(),
    DateType(),
    TimestampType(),
    StringType(),
    DecimalType(10, 0),
    ArrayType(IntegerType()),
    MapType(StringType(), IntegerType()),
    StructType([StructField("_1", IntegerType())]),
    BinaryType(),
]

df = spark.range(2).repartition(1)
results = []
count = 0
total = len(types) * len(columns)
values = []
spark.sparkContext.setLogLevel("FATAL")
for t in types:
    result = []
    for column, pandas_t in columns:
        v = create_dataframe()[column][0]
        values.append(v)
        try:
            row = df.select(pandas_udf(lambda _: create_dataframe()[column], t)(df.id)).first()
            ret_str = repr(row[0])
        except Exception:
            ret_str = "X"
        result.append(ret_str)
        progress = "SQL Type: [%s]\n Pandas Value(Type): %s(%s)]\n Result Python Value: [%s]" % (
            t.simpleString(), v, pandas_t, ret_str)
        count += 1
        print("%s/%s:\n %s" % (count, total, progress))
    results.append([t.simpleString()] + list(map(str, result)))

schema = ["SQL Type \\ Pandas Value(Type)"] + list(map(lambda values_column: "%s(%s)" % (values_column[0], values_column[1][1]), zip(values, columns)))
strings = spark.createDataFrame(results, schema=schema)._jdf.showString(20, 20, False)
print("\n".join(map(lambda line: " # %s # noqa" % line, strings.strip().split("\n"))))
```

## How was this patch tested?

Manually.

Closes #24930 from HyukjinKwon/SPARK-28132.

Authored-by: HyukjinKwon <gurwls223@apache.org>
Signed-off-by: Bryan Cutler <cutlerb@gmail.com>

2019-06-21 10:47:54 -07:00
..
avro	[SPARK-26856][PYSPARK][FOLLOWUP] Fix UT failure due to wrong patterns for Kinesis assembly	2019-04-02 14:52:56 +09:00
tests	[SPARK-28041][PYTHON] Increase minimum supported Pandas to 0.23.2	2019-06-18 09:10:58 +09:00
__init__.py	[SPARK-22369][PYTHON][DOCS] Exposes catalog API documentation in PySpark	2017-11-02 15:22:52 +01:00
catalog.py	[SPARK-24665][PYSPARK][FOLLOWUP] Use SQLConf in PySpark to manage all sql configs	2018-08-17 10:18:08 +08:00
column.py	[SPARK-28031][PYSPARK][TEST] Improve doctest on over function of Column	2019-06-13 11:04:41 +09:00
conf.py	[SPARK-23698][PYTHON] Resolve undefined names in Python 3	2018-08-22 10:06:59 -07:00
context.py	[SPARK-26640][CORE][ML][SQL][STREAMING][PYSPARK] Code cleanup from lgtm.com analysis	2019-01-17 19:40:39 -06:00
dataframe.py	[SPARK-27834][SQL][R][PYTHON] Make separate PySpark/SparkR vectorization configurations	2019-06-03 10:01:37 +09:00
functions.py	[SPARK-28132][PYTHON] Update document type conversion for Pandas UDFs (pyarrow 0.13.0, pandas 0.24.2, Python 3.7)	2019-06-21 10:47:54 -07:00
group.py	[SPARK-24722][SQL] pivot() with Column type argument	2018-08-04 14:17:32 +08:00
readwriter.py	[SPARK-28058][DOC] Add a note to doc of mode of CSV for column pruning	2019-06-18 13:48:32 +09:00
session.py	[SPARK-27995][PYTHON] Note the difference between str of Python 2 and 3 at Arrow optimized	2019-06-11 18:43:59 +09:00
streaming.py	[SPARK-27627][SQL] Make option "pathGlobFilter" as a general option for all file sources	2019-05-09 08:41:43 +09:00
types.py	[SPARK-23299][SQL][PYSPARK] Fix __repr__ behaviour for Rows	2019-05-06 10:00:49 -07:00
udf.py	[SPARK-26412][PYSPARK][SQL] Allow Pandas UDF to take an iterator of pd.Series or an iterator of tuple of pd.Series	2019-06-15 08:29:20 -07:00
utils.py	[SPARK-28041][PYTHON] Increase minimum supported Pandas to 0.23.2	2019-06-18 09:10:58 +09:00
window.py	[MINOR][PYSPARK][SQL][DOC] Fix rowsBetween doc in Window	2019-06-14 09:56:37 +09:00