bda5b51576
## What changes were proposed in this pull request?

Add missing validation for `LongType` in `pyspark.sql.types._make_type_verifier`.

## How was this patch tested?

Doctests / unittests / manual tests.

Unpatched version:

```
In [23]: s.createDataFrame([{'x': 1 << 64}], StructType([StructField('x', LongType())])).collect()
Out[23]: [Row(x=None)]
```

Patched:

```
In [5]: s.createDataFrame([{'x': 1 << 64}], StructType([StructField('x', LongType())])).collect()
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-5-c1740fcadbf9> in <module>
----> 1 s.createDataFrame([{'x': 1 << 64}], StructType([StructField('x', LongType())])).collect()

/usr/local/lib/python3.5/site-packages/pyspark/sql/session.py in createDataFrame(self, data, schema, samplingRatio, verifySchema)
    689     rdd, schema = self._createFromRDD(data.map(prepare), schema, samplingRatio)
    690 else:
--> 691     rdd, schema = self._createFromLocal(map(prepare, data), schema)
    692 jrdd = self._jvm.SerDeUtil.toJavaArray(rdd._to_java_object_rdd())
    693 jdf = self._jsparkSession.applySchemaToPythonRDD(jrdd.rdd(), schema.json())

/usr/local/lib/python3.5/site-packages/pyspark/sql/session.py in _createFromLocal(self, data, schema)
    405 # make sure data could consumed multiple times
    406 if not isinstance(data, list):
--> 407     data = list(data)
    408
    409 if schema is None or isinstance(schema, (list, tuple)):

/usr/local/lib/python3.5/site-packages/pyspark/sql/session.py in prepare(obj)
    671
    672     def prepare(obj):
--> 673         verify_func(obj)
    674         return obj
    675 elif isinstance(schema, DataType):

/usr/local/lib/python3.5/site-packages/pyspark/sql/types.py in verify(obj)
   1427 def verify(obj):
   1428     if not verify_nullability(obj):
-> 1429         verify_value(obj)
   1430
   1431 return verify

/usr/local/lib/python3.5/site-packages/pyspark/sql/types.py in verify_struct(obj)
   1397 if isinstance(obj, dict):
   1398     for f, verifier in verifiers:
-> 1399         verifier(obj.get(f))
   1400 elif isinstance(obj, Row) and getattr(obj, "__from_dict__", False):
   1401     # the order in obj could be different than dataType.fields

/usr/local/lib/python3.5/site-packages/pyspark/sql/types.py in verify(obj)
   1427 def verify(obj):
   1428     if not verify_nullability(obj):
-> 1429         verify_value(obj)
   1430
   1431 return verify

/usr/local/lib/python3.5/site-packages/pyspark/sql/types.py in verify_long(obj)
   1356     if obj < -9223372036854775808 or obj > 9223372036854775807:
   1357         raise ValueError(
-> 1358             new_msg("object of LongType out of range, got: %s" % obj))
   1359
   1360 verify_value = verify_long

ValueError: field x: object of LongType out of range, got: 18446744073709551616
```

Closes #25117 from simplylizz/master.

Authored-by: Anton Yanchenko <simplylizz@gmail.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org> |
||
---|---|---|
.. | ||
ml | ||
mllib | ||
sql | ||
streaming | ||
testing | ||
tests | ||
__init__.py | ||
_globals.py | ||
accumulators.py | ||
broadcast.py | ||
cloudpickle.py | ||
conf.py | ||
context.py | ||
daemon.py | ||
files.py | ||
find_spark_home.py | ||
heapq3.py | ||
java_gateway.py | ||
join.py | ||
profiler.py | ||
rdd.py | ||
rddsampler.py | ||
resourceinformation.py | ||
resultiterable.py | ||
serializers.py | ||
shell.py | ||
shuffle.py | ||
statcounter.py | ||
status.py | ||
storagelevel.py | ||
taskcontext.py | ||
traceback_utils.py | ||
util.py | ||
version.py | ||
worker.py |