[SPARK-32686][PYTHON] Un-deprecate inferring DataFrame schema from list of dict
### What changes were proposed in this pull request?

As discussed in https://github.com/apache/spark/pull/29491#discussion_r474451282 and in SPARK-32686, this PR un-deprecates Spark's ability to infer a DataFrame schema from a list of dictionaries. The ability is Pythonic and matches functionality offered by Pandas.

### Why are the changes needed?

This change clarifies to users that this behavior is supported and is not going away in the near future.

### Does this PR introduce _any_ user-facing change?

Yes. There used to be a `UserWarning` for this, but now there isn't.

### How was this patch tested?

I tested this manually.

Before:

```python
>>> spark.createDataFrame(spark.sparkContext.parallelize([{'a': 5}]))
/Users/nchamm/Documents/GitHub/nchammas/spark/python/pyspark/sql/session.py:388: UserWarning: Using RDD of dict to inferSchema is deprecated. Use pyspark.sql.Row instead
  warnings.warn("Using RDD of dict to inferSchema is deprecated. "
DataFrame[a: bigint]
>>> spark.createDataFrame([{'a': 5}])
.../python/pyspark/sql/session.py:378: UserWarning: inferring schema from dict is deprecated,please use pyspark.sql.Row instead
  warnings.warn("inferring schema from dict is deprecated,"
DataFrame[a: bigint]
```

After:

```python
>>> spark.createDataFrame(spark.sparkContext.parallelize([{'a': 5}]))
DataFrame[a: bigint]
>>> spark.createDataFrame([{'a': 5}])
DataFrame[a: bigint]
```

Closes #29510 from nchammas/SPARK-32686-df-dict-infer-schema.

Authored-by: Nicholas Chammas <nicholas.chammas@liveramp.com>
Signed-off-by: Bryan Cutler <cutlerb@gmail.com>
This commit is contained in:
parent
e3a88a9767
commit
41cf1d093f
|
@ -359,18 +359,14 @@ class SparkSession(SparkConversionMixin):
|
||||||
|
|
||||||
def _inferSchemaFromList(self, data, names=None):
|
def _inferSchemaFromList(self, data, names=None):
|
||||||
"""
|
"""
|
||||||
Infer schema from list of Row or tuple.
|
Infer schema from list of Row, dict, or tuple.
|
||||||
|
|
||||||
:param data: list of Row or tuple
|
:param data: list of Row, dict, or tuple
|
||||||
:param names: list of column names
|
:param names: list of column names
|
||||||
:return: :class:`pyspark.sql.types.StructType`
|
:return: :class:`pyspark.sql.types.StructType`
|
||||||
"""
|
"""
|
||||||
if not data:
|
if not data:
|
||||||
raise ValueError("can not infer schema from empty dataset")
|
raise ValueError("can not infer schema from empty dataset")
|
||||||
first = data[0]
|
|
||||||
if type(first) is dict:
|
|
||||||
warnings.warn("inferring schema from dict is deprecated,"
|
|
||||||
"please use pyspark.sql.Row instead")
|
|
||||||
schema = reduce(_merge_type, (_infer_schema(row, names) for row in data))
|
schema = reduce(_merge_type, (_infer_schema(row, names) for row in data))
|
||||||
if _has_nulltype(schema):
|
if _has_nulltype(schema):
|
||||||
raise ValueError("Some of types cannot be determined after inferring")
|
raise ValueError("Some of types cannot be determined after inferring")
|
||||||
|
@ -378,9 +374,9 @@ class SparkSession(SparkConversionMixin):
|
||||||
|
|
||||||
def _inferSchema(self, rdd, samplingRatio=None, names=None):
|
def _inferSchema(self, rdd, samplingRatio=None, names=None):
|
||||||
"""
|
"""
|
||||||
Infer schema from an RDD of Row or tuple.
|
Infer schema from an RDD of Row, dict, or tuple.
|
||||||
|
|
||||||
:param rdd: an RDD of Row or tuple
|
:param rdd: an RDD of Row, dict, or tuple
|
||||||
:param samplingRatio: sampling ratio, or no sampling (default)
|
:param samplingRatio: sampling ratio, or no sampling (default)
|
||||||
:return: :class:`pyspark.sql.types.StructType`
|
:return: :class:`pyspark.sql.types.StructType`
|
||||||
"""
|
"""
|
||||||
|
@ -388,9 +384,6 @@ class SparkSession(SparkConversionMixin):
|
||||||
if not first:
|
if not first:
|
||||||
raise ValueError("The first row in RDD is empty, "
|
raise ValueError("The first row in RDD is empty, "
|
||||||
"can not infer schema")
|
"can not infer schema")
|
||||||
if type(first) is dict:
|
|
||||||
warnings.warn("Using RDD of dict to inferSchema is deprecated. "
|
|
||||||
"Use pyspark.sql.Row instead")
|
|
||||||
|
|
||||||
if samplingRatio is None:
|
if samplingRatio is None:
|
||||||
schema = _infer_schema(first, names=names)
|
schema = _infer_schema(first, names=names)
|
||||||
|
|
Loading…
Reference in a new issue