[SPARK-20442][PYTHON][DOCS] Fill up documentations for functions in Column API in PySpark

## What changes were proposed in this pull request?

This PR proposes to fill up the documentation with examples for `bitwiseOR`, `bitwiseAND`, `bitwiseXOR`, `contains`, `asc` and `desc` in the `Column` API.
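
For reference, here is roughly what the newly documented methods look like in use, mirroring the doctests added below (a minimal sketch that assumes an active `SparkSession` bound to `spark`, as in the doctest globals):

```python
from pyspark.sql import Row

df = spark.createDataFrame([Row(a=170, b=75)])

df.select(df.a.bitwiseOR(df.b)).collect()   # [Row((a | b)=235)]
df.select(df.a.bitwiseAND(df.b)).collect()  # [Row((a & b)=10)]
df.select(df.a.bitwiseXOR(df.b)).collect()  # [Row((a ^ b)=225)]

df2 = spark.createDataFrame([Row(name=u'Tom'), Row(name=u'Alice')])
df2.orderBy(df2.name.asc()).collect()         # [Row(name=u'Alice'), Row(name=u'Tom')]
df2.filter(df2.name.contains('o')).collect()  # [Row(name=u'Tom')]
```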

Also, this PR fixes minor typos in the documentation and aligns some of the content between the Scala and Python docs.

Lastly, this PR proposes using `spark` rather than `sc` in the doctests in `Column` for the Python documentation.
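
Concretely, the doctests move from building DataFrames through the `SparkContext` to the `SparkSession` API; for example, in `getItem`:

```python
# Before: construct the DataFrame through an RDD via the SparkContext
df = sc.parallelize([([1, 2], {"key": "value"})]).toDF(["l", "d"])

# After: create the DataFrame directly from the SparkSession
df = spark.createDataFrame([([1, 2], {"key": "value"})], ["l", "d"])
```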

## How was this patch tested?

Doc tests were added and manually tested with the commands below:

`./python/run-tests.py --module pyspark-sql`
`./python/run-tests.py --module pyspark-sql --python-executable python3`
`./dev/lint-python`

The rendered HTML output was checked via `make html` under `./python/docs`; snapshots will be left as comments on the relevant code in the PR.

Author: hyukjinkwon <gurwls223@gmail.com>

Closes #17737 from HyukjinKwon/SPARK-20442.
hyukjinkwon 2017-04-29 13:46:40 -07:00 committed by Holden Karau
parent 70f1bcd7bc
commit d228cd0b02
3 changed files with 101 additions and 40 deletions

python/pyspark/sql/column.py

@@ -185,9 +185,43 @@ class Column(object):
                          "in a string column or 'array_contains' function for an array column.")

     # bitwise operators
-    bitwiseOR = _bin_op("bitwiseOR")
-    bitwiseAND = _bin_op("bitwiseAND")
-    bitwiseXOR = _bin_op("bitwiseXOR")
+    _bitwiseOR_doc = """
+    Compute bitwise OR of this expression with another expression.
+
+    :param other: a value or :class:`Column` to calculate bitwise or(|) against
+                  this :class:`Column`.
+
+    >>> from pyspark.sql import Row
+    >>> df = spark.createDataFrame([Row(a=170, b=75)])
+    >>> df.select(df.a.bitwiseOR(df.b)).collect()
+    [Row((a | b)=235)]
+    """
+    _bitwiseAND_doc = """
+    Compute bitwise AND of this expression with another expression.
+
+    :param other: a value or :class:`Column` to calculate bitwise and(&) against
+                  this :class:`Column`.
+
+    >>> from pyspark.sql import Row
+    >>> df = spark.createDataFrame([Row(a=170, b=75)])
+    >>> df.select(df.a.bitwiseAND(df.b)).collect()
+    [Row((a & b)=10)]
+    """
+    _bitwiseXOR_doc = """
+    Compute bitwise XOR of this expression with another expression.
+
+    :param other: a value or :class:`Column` to calculate bitwise xor(^) against
+                  this :class:`Column`.
+
+    >>> from pyspark.sql import Row
+    >>> df = spark.createDataFrame([Row(a=170, b=75)])
+    >>> df.select(df.a.bitwiseXOR(df.b)).collect()
+    [Row((a ^ b)=225)]
+    """
+
+    bitwiseOR = _bin_op("bitwiseOR", _bitwiseOR_doc)
+    bitwiseAND = _bin_op("bitwiseAND", _bitwiseAND_doc)
+    bitwiseXOR = _bin_op("bitwiseXOR", _bitwiseXOR_doc)

     @since(1.3)
     def getItem(self, key):
@@ -195,7 +229,7 @@ class Column(object):
         An expression that gets an item at position ``ordinal`` out of a list,
         or gets an item by key out of a dict.

-        >>> df = sc.parallelize([([1, 2], {"key": "value"})]).toDF(["l", "d"])
+        >>> df = spark.createDataFrame([([1, 2], {"key": "value"})], ["l", "d"])
         >>> df.select(df.l.getItem(0), df.d.getItem("key")).show()
         +----+------+
         |l[0]|d[key]|
@@ -217,7 +251,7 @@ class Column(object):
         An expression that gets a field by name in a StructField.

         >>> from pyspark.sql import Row
-        >>> df = sc.parallelize([Row(r=Row(a=1, b="b"))]).toDF()
+        >>> df = spark.createDataFrame([Row(r=Row(a=1, b="b"))])
         >>> df.select(df.r.getField("b")).show()
         +---+
         |r.b|
@@ -250,8 +284,17 @@ class Column(object):
         raise TypeError("Column is not iterable")

     # string methods
+    _contains_doc = """
+    Contains the other element. Returns a boolean :class:`Column` based on a string match.
+
+    :param other: string in line
+
+    >>> df.filter(df.name.contains('o')).collect()
+    [Row(age=5, name=u'Bob')]
+    """
     _rlike_doc = """
-    Return a Boolean :class:`Column` based on a regex match.
+    SQL RLIKE expression (LIKE with Regex). Returns a boolean :class:`Column` based on a regex
+    match.

     :param other: an extended regex expression
@@ -259,7 +302,7 @@ class Column(object):
     [Row(age=2, name=u'Alice')]
     """
     _like_doc = """
-    Return a Boolean :class:`Column` based on a SQL LIKE match.
+    SQL like expression. Returns a boolean :class:`Column` based on a SQL LIKE match.

     :param other: a SQL LIKE pattern
@@ -269,9 +312,9 @@ class Column(object):
     [Row(age=2, name=u'Alice')]
     """
     _startswith_doc = """
-    Return a Boolean :class:`Column` based on a string match.
+    String starts with. Returns a boolean :class:`Column` based on a string match.

-    :param other: string at end of line (do not use a regex `^`)
+    :param other: string at start of line (do not use a regex `^`)

     >>> df.filter(df.name.startswith('Al')).collect()
     [Row(age=2, name=u'Alice')]
@@ -279,7 +322,7 @@ class Column(object):
     []
     """
     _endswith_doc = """
-    Return a Boolean :class:`Column` based on matching end of string.
+    String ends with. Returns a boolean :class:`Column` based on a string match.

     :param other: string at end of line (do not use a regex `$`)
@@ -289,7 +332,7 @@ class Column(object):
     []
     """
-    contains = _bin_op("contains")
+    contains = ignore_unicode_prefix(_bin_op("contains", _contains_doc))
     rlike = ignore_unicode_prefix(_bin_op("rlike", _rlike_doc))
     like = ignore_unicode_prefix(_bin_op("like", _like_doc))
     startswith = ignore_unicode_prefix(_bin_op("startsWith", _startswith_doc))
@@ -337,27 +380,40 @@ class Column(object):
         return Column(jc)

     # order
-    asc = _unary_op("asc", "Returns a sort expression based on the"
-                           " ascending order of the given column name.")
-    desc = _unary_op("desc", "Returns a sort expression based on the"
-                             " descending order of the given column name.")
+    _asc_doc = """
+    Returns a sort expression based on the ascending order of the given column name
+
+    >>> from pyspark.sql import Row
+    >>> df = spark.createDataFrame([Row(name=u'Tom', height=80), Row(name=u'Alice', height=None)])
+    >>> df.select(df.name).orderBy(df.name.asc()).collect()
+    [Row(name=u'Alice'), Row(name=u'Tom')]
+    """
+    _desc_doc = """
+    Returns a sort expression based on the descending order of the given column name.
+
+    >>> from pyspark.sql import Row
+    >>> df = spark.createDataFrame([Row(name=u'Tom', height=80), Row(name=u'Alice', height=None)])
+    >>> df.select(df.name).orderBy(df.name.desc()).collect()
+    [Row(name=u'Tom'), Row(name=u'Alice')]
+    """
+
+    asc = ignore_unicode_prefix(_unary_op("asc", _asc_doc))
+    desc = ignore_unicode_prefix(_unary_op("desc", _desc_doc))

     _isNull_doc = """
-    True if the current expression is null. Often combined with
-    :func:`DataFrame.filter` to select rows with null values.
+    True if the current expression is null.

     >>> from pyspark.sql import Row
-    >>> df2 = sc.parallelize([Row(name=u'Tom', height=80), Row(name=u'Alice', height=None)]).toDF()
-    >>> df2.filter(df2.height.isNull()).collect()
+    >>> df = spark.createDataFrame([Row(name=u'Tom', height=80), Row(name=u'Alice', height=None)])
+    >>> df.filter(df.height.isNull()).collect()
     [Row(height=None, name=u'Alice')]
     """
     _isNotNull_doc = """
-    True if the current expression is null. Often combined with
-    :func:`DataFrame.filter` to select rows with non-null values.
+    True if the current expression is NOT null.

     >>> from pyspark.sql import Row
-    >>> df2 = sc.parallelize([Row(name=u'Tom', height=80), Row(name=u'Alice', height=None)]).toDF()
-    >>> df2.filter(df2.height.isNotNull()).collect()
+    >>> df = spark.createDataFrame([Row(name=u'Tom', height=80), Row(name=u'Alice', height=None)])
+    >>> df.filter(df.height.isNotNull()).collect()
     [Row(height=80, name=u'Tom')]
     """
@@ -527,7 +583,7 @@ def _test():
         .appName("sql.column tests")\
         .getOrCreate()
     sc = spark.sparkContext
-    globs['sc'] = sc
+    globs['spark'] = spark
     globs['df'] = sc.parallelize([(2, 'Alice'), (5, 'Bob')]) \
         .toDF(StructType([StructField('age', IntegerType()),
                           StructField('name', StringType())]))

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala

@@ -86,7 +86,7 @@ case class BitwiseOr(left: Expression, right: Expression) extends BinaryArithmet
 }

 /**
- * A function that calculates bitwise xor of two numbers.
+ * A function that calculates bitwise xor({@literal ^}) of two numbers.
  *
  * Code generation inherited from BinaryArithmetic.
  */

sql/core/src/main/scala/org/apache/spark/sql/Column.scala

@@ -779,7 +779,7 @@ class Column(val expr: Expression) extends Logging {
   def isin(list: Any*): Column = withExpr { In(expr, list.map(lit(_).expr)) }

   /**
-   * SQL like expression.
+   * SQL like expression. Returns a boolean column based on a SQL LIKE match.
    *
    * @group expr_ops
    * @since 1.3.0
@@ -787,7 +787,8 @@ class Column(val expr: Expression) extends Logging {
   def like(literal: String): Column = withExpr { Like(expr, lit(literal).expr) }

   /**
-   * SQL RLIKE expression (LIKE with Regex).
+   * SQL RLIKE expression (LIKE with Regex). Returns a boolean column based on a regex
+   * match.
    *
    * @group expr_ops
    * @since 1.3.0
@@ -838,7 +839,7 @@ class Column(val expr: Expression) extends Logging {
   }

   /**
-   * Contains the other element.
+   * Contains the other element. Returns a boolean column based on a string match.
    *
    * @group expr_ops
    * @since 1.3.0
@@ -846,7 +847,7 @@ class Column(val expr: Expression) extends Logging {
   def contains(other: Any): Column = withExpr { Contains(expr, lit(other).expr) }

   /**
-   * String starts with.
+   * String starts with. Returns a boolean column based on a string match.
    *
    * @group expr_ops
    * @since 1.3.0
@@ -854,7 +855,7 @@ class Column(val expr: Expression) extends Logging {
   def startsWith(other: Column): Column = withExpr { StartsWith(expr, lit(other).expr) }

   /**
-   * String starts with another string literal.
+   * String starts with another string literal. Returns a boolean column based on a string match.
    *
    * @group expr_ops
    * @since 1.3.0
@@ -862,7 +863,7 @@ class Column(val expr: Expression) extends Logging {
   def startsWith(literal: String): Column = this.startsWith(lit(literal))

   /**
-   * String ends with.
+   * String ends with. Returns a boolean column based on a string match.
    *
    * @group expr_ops
    * @since 1.3.0
@@ -870,7 +871,7 @@ class Column(val expr: Expression) extends Logging {
   def endsWith(other: Column): Column = withExpr { EndsWith(expr, lit(other).expr) }

   /**
-   * String ends with another string literal.
+   * String ends with another string literal. Returns a boolean column based on a string match.
    *
    * @group expr_ops
    * @since 1.3.0
@@ -1008,7 +1009,7 @@ class Column(val expr: Expression) extends Logging {
   def cast(to: String): Column = cast(CatalystSqlParser.parseDataType(to))

   /**
-   * Returns an ordering used in sorting.
+   * Returns a sort expression based on the descending order of the column.
    * {{{
    *   // Scala
    *   df.sort(df("age").desc)
@@ -1023,7 +1024,8 @@ class Column(val expr: Expression) extends Logging {
   def desc: Column = withExpr { SortOrder(expr, Descending) }

   /**
-   * Returns a descending ordering used in sorting, where null values appear before non-null values.
+   * Returns a sort expression based on the descending order of the column,
+   * and null values appear before non-null values.
    * {{{
    *   // Scala: sort a DataFrame by age column in descending order and null values appearing first.
    *   df.sort(df("age").desc_nulls_first)
@@ -1038,7 +1040,8 @@ class Column(val expr: Expression) extends Logging {
   def desc_nulls_first: Column = withExpr { SortOrder(expr, Descending, NullsFirst, Set.empty) }

   /**
-   * Returns a descending ordering used in sorting, where null values appear after non-null values.
+   * Returns a sort expression based on the descending order of the column,
+   * and null values appear after non-null values.
    * {{{
    *   // Scala: sort a DataFrame by age column in descending order and null values appearing last.
    *   df.sort(df("age").desc_nulls_last)
@@ -1053,7 +1056,7 @@ class Column(val expr: Expression) extends Logging {
   def desc_nulls_last: Column = withExpr { SortOrder(expr, Descending, NullsLast, Set.empty) }

   /**
-   * Returns an ascending ordering used in sorting.
+   * Returns a sort expression based on ascending order of the column.
    * {{{
    *   // Scala: sort a DataFrame by age column in ascending order.
    *   df.sort(df("age").asc)
@@ -1068,7 +1071,8 @@ class Column(val expr: Expression) extends Logging {
   def asc: Column = withExpr { SortOrder(expr, Ascending) }

   /**
-   * Returns an ascending ordering used in sorting, where null values appear before non-null values.
+   * Returns a sort expression based on ascending order of the column,
+   * and null values return before non-null values.
    * {{{
    *   // Scala: sort a DataFrame by age column in ascending order and null values appearing first.
    *   df.sort(df("age").asc_nulls_last)
@@ -1083,7 +1087,8 @@ class Column(val expr: Expression) extends Logging {
   def asc_nulls_first: Column = withExpr { SortOrder(expr, Ascending, NullsFirst, Set.empty) }

   /**
-   * Returns an ordering used in sorting, where null values appear after non-null values.
+   * Returns a sort expression based on ascending order of the column,
+   * and null values appear after non-null values.
    * {{{
    *   // Scala: sort a DataFrame by age column in ascending order and null values appearing last.
    *   df.sort(df("age").asc_nulls_last)