[SPARK-32686][PYTHON] Un-deprecate inferring DataFrame schema from list of dict

### What changes were proposed in this pull request?

As discussed in https://github.com/apache/spark/pull/29491#discussion_r474451282 and in SPARK-32686, this PR un-deprecates Spark's ability to infer a DataFrame schema from a list (or RDD) of dictionaries. This ability is Pythonic and matches functionality offered by pandas.

### Why are the changes needed?

This change clarifies to users that this behavior is supported and is not going away in the near future.

### Does this PR introduce _any_ user-facing change?

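Yes. Creating a DataFrame from a list or RDD of dictionaries used to emit a `UserWarning` saying that schema inference from dicts was deprecated; that warning is no longer emitted.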

### How was this patch tested?

I tested this manually.

Before:

```python
>>> spark.createDataFrame(spark.sparkContext.parallelize([{'a': 5}]))
/Users/nchamm/Documents/GitHub/nchammas/spark/python/pyspark/sql/session.py:388: UserWarning: Using RDD of dict to inferSchema is deprecated. Use pyspark.sql.Row instead
  warnings.warn("Using RDD of dict to inferSchema is deprecated. "
DataFrame[a: bigint]

>>> spark.createDataFrame([{'a': 5}])
.../python/pyspark/sql/session.py:378: UserWarning: inferring schema from dict is deprecated,please use pyspark.sql.Row instead
  warnings.warn("inferring schema from dict is deprecated,"
DataFrame[a: bigint]
```

After:

```python
>>> spark.createDataFrame(spark.sparkContext.parallelize([{'a': 5}]))
DataFrame[a: bigint]

>>> spark.createDataFrame([{'a': 5}])
DataFrame[a: bigint]
```

Closes #29510 from nchammas/SPARK-32686-df-dict-infer-schema.

Authored-by: Nicholas Chammas <nicholas.chammas@liveramp.com>
Signed-off-by: Bryan Cutler <cutlerb@gmail.com>
This commit is contained in:
Nicholas Chammas 2020-08-24 14:55:11 -07:00 committed by Bryan Cutler
parent e3a88a9767
commit 41cf1d093f

View file

@@ -359,18 +359,14 @@ class SparkSession(SparkConversionMixin):
def _inferSchemaFromList(self, data, names=None): def _inferSchemaFromList(self, data, names=None):
""" """
Infer schema from list of Row or tuple. Infer schema from list of Row, dict, or tuple.
:param data: list of Row or tuple :param data: list of Row, dict, or tuple
:param names: list of column names :param names: list of column names
:return: :class:`pyspark.sql.types.StructType` :return: :class:`pyspark.sql.types.StructType`
""" """
if not data: if not data:
raise ValueError("can not infer schema from empty dataset") raise ValueError("can not infer schema from empty dataset")
first = data[0]
if type(first) is dict:
warnings.warn("inferring schema from dict is deprecated,"
"please use pyspark.sql.Row instead")
schema = reduce(_merge_type, (_infer_schema(row, names) for row in data)) schema = reduce(_merge_type, (_infer_schema(row, names) for row in data))
if _has_nulltype(schema): if _has_nulltype(schema):
raise ValueError("Some of types cannot be determined after inferring") raise ValueError("Some of types cannot be determined after inferring")
@@ -378,9 +374,9 @@ class SparkSession(SparkConversionMixin):
def _inferSchema(self, rdd, samplingRatio=None, names=None): def _inferSchema(self, rdd, samplingRatio=None, names=None):
""" """
Infer schema from an RDD of Row or tuple. Infer schema from an RDD of Row, dict, or tuple.
:param rdd: an RDD of Row or tuple :param rdd: an RDD of Row, dict, or tuple
:param samplingRatio: sampling ratio, or no sampling (default) :param samplingRatio: sampling ratio, or no sampling (default)
:return: :class:`pyspark.sql.types.StructType` :return: :class:`pyspark.sql.types.StructType`
""" """
@@ -388,9 +384,6 @@ class SparkSession(SparkConversionMixin):
if not first: if not first:
raise ValueError("The first row in RDD is empty, " raise ValueError("The first row in RDD is empty, "
"can not infer schema") "can not infer schema")
if type(first) is dict:
warnings.warn("Using RDD of dict to inferSchema is deprecated. "
"Use pyspark.sql.Row instead")
if samplingRatio is None: if samplingRatio is None:
schema = _infer_schema(first, names=names) schema = _infer_schema(first, names=names)