[SPARK-6623][SQL] Alias DataFrame.na.drop and DataFrame.na.fill in Python.

To maintain consistency with the Scala API.

Author: Reynold Xin <rxin@databricks.com>

Closes #5284 from rxin/df-na-alias and squashes the following commits:

19f46b7 [Reynold Xin] Show DataFrameNaFunctions in docs.
6618118 [Reynold Xin] [SPARK-6623][SQL] Alias DataFrame.na.drop and DataFrame.na.fill in Python.
Reynold Xin committed on 2015-03-31 00:25:23 -07:00
commit b80a030e90, parent f07e714062
2 changed files with 45 additions and 6 deletions
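
In use, the Python API now mirrors Scala's df.na.drop() / df.na.fill(). A minimal sketch, assuming a DataFrame `df4` with columns (age, height, name) containing nulls, as in the doctests below:

    # Equivalent spellings after this change:
    df4.dropna(how='any', subset=['age'])        # existing method
    df4.na.drop(how='any', subset=['age'])       # new alias, matching Scala

    df4.fillna({'age': 50, 'name': 'unknown'})   # existing method
    df4.na.fill({'age': 50, 'name': 'unknown'})  # new alias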

python/pyspark/sql/__init__.py

@@ -22,22 +22,24 @@ public classes of Spark SQL:
       Main entry point for :class:`DataFrame` and SQL functionality.
     - L{DataFrame}
       A distributed collection of data grouped into named columns.
-    - L{GroupedData}
-      Aggregation methods, returned by :func:`DataFrame.groupBy`.
     - L{Column}
       A column expression in a :class:`DataFrame`.
     - L{Row}
       A row of data in a :class:`DataFrame`.
     - L{HiveContext}
       Main entry point for accessing data stored in Apache Hive.
+    - L{GroupedData}
+      Aggregation methods, returned by :func:`DataFrame.groupBy`.
+    - L{DataFrameNaFunctions}
+      Methods for handling missing data (null values).
     - L{functions}
       List of built-in functions available for :class:`DataFrame`.
 """

 from pyspark.sql.context import SQLContext, HiveContext
 from pyspark.sql.types import Row
-from pyspark.sql.dataframe import DataFrame, GroupedData, Column, SchemaRDD
+from pyspark.sql.dataframe import DataFrame, GroupedData, Column, SchemaRDD, DataFrameNaFunctions

 __all__ = [
-    'SQLContext', 'HiveContext', 'DataFrame', 'GroupedData', 'Column', 'Row',
+    'SQLContext', 'HiveContext', 'DataFrame', 'GroupedData', 'Column', 'Row', 'DataFrameNaFunctions'
 ]
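
Since `DataFrameNaFunctions` is now re-exported from the package root, it can be imported directly; a trivial illustration:

    from pyspark.sql import DataFrameNaFunctions  # re-exported by this change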

python/pyspark/sql/dataframe.py

@@ -31,7 +31,7 @@ from pyspark.sql.types import *
 from pyspark.sql.types import _create_cls, _parse_datatype_json_string

-__all__ = ["DataFrame", "GroupedData", "Column", "SchemaRDD"]
+__all__ = ["DataFrame", "GroupedData", "Column", "SchemaRDD", "DataFrameNaFunctions"]


 class DataFrame(object):
@@ -86,6 +86,12 @@ class DataFrame(object):
         return self._lazy_rdd

+    @property
+    def na(self):
+        """Returns a :class:`DataFrameNaFunctions` for handling missing values.
+        """
+        return DataFrameNaFunctions(self)
+
     def toJSON(self, use_unicode=False):
         """Convert a :class:`DataFrame` into a MappedRDD of JSON documents; one document per row.
@@ -693,6 +699,8 @@ class DataFrame(object):
     def dropna(self, how='any', thresh=None, subset=None):
         """Returns a new :class:`DataFrame` omitting rows with null values.

+        This is an alias for `na.drop`.
+
         :param how: 'any' or 'all'.
             If 'any', drop a row if it contains any nulls.
             If 'all', drop a row only if all its values are null.
@@ -704,6 +712,10 @@ class DataFrame(object):
         >>> df4.dropna().show()
         age height name
         10  80     Alice

+        >>> df4.na.drop().show()
+        age height name
+        10  80     Alice
+
         """
         if how is not None and how not in ['any', 'all']:
             raise ValueError("how ('" + how + "') should be 'any' or 'all'")
@@ -723,7 +735,7 @@ class DataFrame(object):
         return DataFrame(self._jdf.na().drop(thresh, cols), self.sql_ctx)

     def fillna(self, value, subset=None):
-        """Replace null values.
+        """Replace null values, alias for `na.fill`.

         :param value: int, long, float, string, or dict.
             Value to replace null values with.
@@ -748,6 +760,13 @@ class DataFrame(object):
         5   null   Bob
         50  null   Tom
         50  null   unknown

+        >>> df4.na.fill({'age': 50, 'name': 'unknown'}).show()
+        age height name
+        10  80     Alice
+        5   null   Bob
+        50  null   Tom
+        50  null   unknown
+
         """
         if not isinstance(value, (float, int, long, basestring, dict)):
             raise ValueError("value should be a float, int, long, string, or dict")
@@ -1134,6 +1153,24 @@ class Column(object):
         return 'Column<%s>' % self._jc.toString().encode('utf8')


+class DataFrameNaFunctions(object):
+    """Functionality for working with missing data in :class:`DataFrame`.
+    """
+
+    def __init__(self, df):
+        self.df = df
+
+    def drop(self, how='any', thresh=None, subset=None):
+        return self.df.dropna(how=how, thresh=thresh, subset=subset)
+
+    drop.__doc__ = DataFrame.dropna.__doc__
+
+    def fill(self, value, subset=None):
+        return self.df.fillna(value=value, subset=subset)
+
+    fill.__doc__ = DataFrame.fillna.__doc__
+
+
 def _test():
     import doctest
     from pyspark.context import SparkContext
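
The alias class is pure delegation plus `__doc__` sharing, so `help(df.na.drop)` prints the same text as `help(df.dropna)`. A minimal standalone sketch of that pattern (the generic `Widget` names are hypothetical, not Spark code):

    class Widget(object):
        def dropna(self, how='any'):
            """Drop rows containing nulls; 'any' drops on any null."""
            return self  # real logic elided

    class WidgetNaFunctions(object):
        """Thin proxy exposing Widget.dropna as .drop."""
        def __init__(self, w):
            self.w = w

        def drop(self, how='any'):
            return self.w.dropna(how=how)
        # Reuse the docstring so pydoc shows identical help for both names.
        drop.__doc__ = Widget.dropna.__doc__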