#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""
A collection of builtin functions
"""
import sys
import functools
import warnings

if sys.version < "3":
    from itertools import imap as map

from pyspark import since, SparkContext
from pyspark.rdd import ignore_unicode_prefix, PythonEvalType
from pyspark.sql.column import Column, _to_java_column, _to_seq
from pyspark.sql.dataframe import DataFrame
from pyspark.sql.types import StringType, DataType
# Keep UserDefinedFunction import for backwards compatible import; moved in SPARK-22409
from pyspark.sql.udf import UserDefinedFunction, _create_udf


def _create_function(name, doc=""):
    """Create a function by name which calls the corresponding JVM-side column/aggregator function."""
    def _(col):
        sc = SparkContext._active_spark_context
        jc = getattr(sc._jvm.functions, name)(col._jc if isinstance(col, Column) else col)
        return Column(jc)
    _.__name__ = name
    _.__doc__ = doc
    return _
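

# Illustrative only: a minimal sketch of what the factory above produces, assuming
# an active SparkContext and the sample ``df`` used in this module's doctests.
# The generated callable simply forwards to the JVM function of the same name and
# wraps the result in a Column; ``upper`` is created this way by the loops below.
#
# >>> upper = _create_function('upper', 'Converts a string expression to upper case.')
# >>> df.select(upper(df.name)).collect()
# [Row(upper(name)=u'ALICE'), Row(upper(name)=u'BOB')]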


def _wrap_deprecated_function(func, message):
    """Wrap the deprecated function to print out deprecation warnings."""
    def _(col):
        warnings.warn(message, DeprecationWarning)
        return func(col)
    return functools.wraps(func)(_)
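

# Illustrative only: how the deprecation wrapper behaves when DeprecationWarning is
# not filtered out (it is ignored by default in most Python configurations).  The
# wrapped names, e.g. ``toDegrees``, are created by the loops below.
#
# >>> import warnings
# >>> warnings.simplefilter('always', DeprecationWarning)
# >>> toDegrees(lit(3.14))  # prints "Deprecated in 2.1, use degrees instead." and returns a Column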


def _create_binary_mathfunction(name, doc=""):
    """Create a binary math function by name."""
    def _(col1, col2):
        sc = SparkContext._active_spark_context
        # Users might pass plain ints for simplicity; these would throw an error on
        # the JVM side, so coerce non-Column arguments to float.
        jc = getattr(sc._jvm.functions, name)(col1._jc if isinstance(col1, Column) else float(col1),
                                              col2._jc if isinstance(col2, Column) else float(col2))
        return Column(jc)
    _.__name__ = name
    _.__doc__ = doc
    return _
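

# Illustrative only: because of the float coercion above, the generated binary math
# functions accept either Columns or plain numbers for each argument, so mixed calls
# such as the following work (assuming the sample ``df`` from this module's doctests).
#
# >>> df.select(atan2(df.age, 2.0), atan2(1.0, df.age)).collect()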


def _create_window_function(name, doc=''):
    """Create a window function by name."""
    def _():
        sc = SparkContext._active_spark_context
        jc = getattr(sc._jvm.functions, name)()
        return Column(jc)
    _.__name__ = name
    _.__doc__ = 'Window function: ' + doc
    return _
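

# Illustrative only: window functions created by this factory take no arguments and
# must be combined with an ``over`` clause, e.g. (assuming the sample ``df``):
#
# >>> from pyspark.sql.window import Window
# >>> w = Window.partitionBy('name').orderBy('age')
# >>> df.select('name', row_number().over(w).alias('rn')).collect()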


_lit_doc = """
    Creates a :class:`Column` of literal value.

    >>> df.select(lit(5).alias('height')).withColumn('spark_user', lit(True)).take(1)
    [Row(height=5, spark_user=True)]
    """
_functions = {
    'lit': _lit_doc,
    'col': 'Returns a :class:`Column` based on the given column name.',
    'column': 'Returns a :class:`Column` based on the given column name.',
    'asc': 'Returns a sort expression based on the ascending order of the given column name.',
    'desc': 'Returns a sort expression based on the descending order of the given column name.',

    'upper': 'Converts a string expression to upper case.',
    'lower': 'Converts a string expression to lower case.',
    'sqrt': 'Computes the square root of the specified float value.',
    'abs': 'Computes the absolute value.',

    'max': 'Aggregate function: returns the maximum value of the expression in a group.',
    'min': 'Aggregate function: returns the minimum value of the expression in a group.',
    'count': 'Aggregate function: returns the number of items in a group.',
    'sum': 'Aggregate function: returns the sum of all values in the expression.',
    'avg': 'Aggregate function: returns the average of the values in a group.',
    'mean': 'Aggregate function: returns the average of the values in a group.',
    'sumDistinct': 'Aggregate function: returns the sum of distinct values in the expression.',
}

_functions_1_4 = {
    # unary math functions
    'acos': ':return: inverse cosine of `col`, as if computed by `java.lang.Math.acos()`',
    'asin': ':return: inverse sine of `col`, as if computed by `java.lang.Math.asin()`',
    'atan': ':return: inverse tangent of `col`, as if computed by `java.lang.Math.atan()`',
    'cbrt': 'Computes the cube-root of the given value.',
    'ceil': 'Computes the ceiling of the given value.',
    'cos': """:param col: angle in radians
           :return: cosine of the angle, as if computed by `java.lang.Math.cos()`.""",
    'cosh': """:param col: hyperbolic angle
            :return: hyperbolic cosine of the angle, as if computed by `java.lang.Math.cosh()`""",
    'exp': 'Computes the exponential of the given value.',
    'expm1': 'Computes the exponential of the given value minus one.',
    'floor': 'Computes the floor of the given value.',
    'log': 'Computes the natural logarithm of the given value.',
    'log10': 'Computes the logarithm of the given value in Base 10.',
    'log1p': 'Computes the natural logarithm of the given value plus one.',
    'rint': 'Returns the double value that is closest in value to the argument and' +
            ' is equal to a mathematical integer.',
    'signum': 'Computes the signum of the given value.',
    'sin': """:param col: angle in radians
           :return: sine of the angle, as if computed by `java.lang.Math.sin()`""",
    'sinh': """:param col: hyperbolic angle
            :return: hyperbolic sine of the given value,
                     as if computed by `java.lang.Math.sinh()`""",
    'tan': """:param col: angle in radians
           :return: tangent of the given value, as if computed by `java.lang.Math.tan()`""",
    'tanh': """:param col: hyperbolic angle
            :return: hyperbolic tangent of the given value,
                     as if computed by `java.lang.Math.tanh()`""",
    'toDegrees': '.. note:: Deprecated in 2.1, use :func:`degrees` instead.',
    'toRadians': '.. note:: Deprecated in 2.1, use :func:`radians` instead.',
    'bitwiseNOT': 'Computes bitwise not.',
}

_functions_2_4 = {
    'asc_nulls_first': 'Returns a sort expression based on the ascending order of the given' +
                       ' column name, and null values appear before non-null values.',
    'asc_nulls_last': 'Returns a sort expression based on the ascending order of the given' +
                      ' column name, and null values appear after non-null values.',
    'desc_nulls_first': 'Returns a sort expression based on the descending order of the given' +
                        ' column name, and null values appear before non-null values.',
    'desc_nulls_last': 'Returns a sort expression based on the descending order of the given' +
                       ' column name, and null values appear after non-null values.',
}

_collect_list_doc = """
    Aggregate function: returns a list of objects with duplicates.

    >>> df2 = spark.createDataFrame([(2,), (5,), (5,)], ('age',))
    >>> df2.agg(collect_list('age')).collect()
    [Row(collect_list(age)=[2, 5, 5])]
    """
_collect_set_doc = """
    Aggregate function: returns a set of objects with duplicate elements eliminated.

    >>> df2 = spark.createDataFrame([(2,), (5,), (5,)], ('age',))
    >>> df2.agg(collect_set('age')).collect()
    [Row(collect_set(age)=[5, 2])]
    """
_functions_1_6 = {
    # statistical and aggregate functions
    'stddev': 'Aggregate function: returns the unbiased sample standard deviation of' +
              ' the expression in a group.',
    'stddev_samp': 'Aggregate function: returns the unbiased sample standard deviation of' +
                   ' the expression in a group.',
    'stddev_pop': 'Aggregate function: returns population standard deviation of' +
                  ' the expression in a group.',
    'variance': 'Aggregate function: returns the population variance of the values in a group.',
    'var_samp': 'Aggregate function: returns the unbiased variance of the values in a group.',
    'var_pop': 'Aggregate function: returns the population variance of the values in a group.',
    'skewness': 'Aggregate function: returns the skewness of the values in a group.',
    'kurtosis': 'Aggregate function: returns the kurtosis of the values in a group.',
    'collect_list': _collect_list_doc,
    'collect_set': _collect_set_doc
}

_functions_2_1 = {
    # unary math functions
    'degrees': """
               Converts an angle measured in radians to an approximately equivalent angle
               measured in degrees.

               :param col: angle in radians
               :return: angle in degrees, as if computed by `java.lang.Math.toDegrees()`
               """,
    'radians': """
               Converts an angle measured in degrees to an approximately equivalent angle
               measured in radians.

               :param col: angle in degrees
               :return: angle in radians, as if computed by `java.lang.Math.toRadians()`
               """,
}

# math functions that take two arguments as input
_binary_mathfunctions = {
    'atan2': """
             :param col1: coordinate on y-axis
             :param col2: coordinate on x-axis
             :return: the `theta` component of the point
                      (`r`, `theta`)
                      in polar coordinates that corresponds to the point
                      (`x`, `y`) in Cartesian coordinates,
                      as if computed by `java.lang.Math.atan2()`
             """,
    'hypot': 'Computes ``sqrt(a^2 + b^2)`` without intermediate overflow or underflow.',
    'pow': 'Returns the value of the first argument raised to the power of the second argument.',
}

_window_functions = {
    'row_number':
        """returns a sequential number starting at 1 within a window partition.""",
    'dense_rank':
        """returns the rank of rows within a window partition, without any gaps.

        The difference between rank and dense_rank is that dense_rank leaves no gaps in ranking
        sequence when there are ties. That is, if you were ranking a competition using dense_rank
        and had three people tie for second place, you would say that all three were in second
        place and that the next person came in third. Rank would give sequential numbers, so
        the person that came in third place (after the ties) would register as coming in fifth.

        This is equivalent to the DENSE_RANK function in SQL.""",
    'rank':
        """returns the rank of rows within a window partition.

        The difference between rank and dense_rank is that dense_rank leaves no gaps in ranking
        sequence when there are ties. That is, if you were ranking a competition using dense_rank
        and had three people tie for second place, you would say that all three were in second
        place and that the next person came in third. Rank would give sequential numbers, so
        the person that came in third place (after the ties) would register as coming in fifth.

        This is equivalent to the RANK function in SQL.""",
    'cume_dist':
        """returns the cumulative distribution of values within a window partition,
        i.e. the fraction of rows that are below the current row.""",
    'percent_rank':
        """returns the relative rank (i.e. percentile) of rows within a window partition.""",
}

# Wraps deprecated functions (keys) with the messages (values).
_functions_deprecated = {
    'toDegrees': 'Deprecated in 2.1, use degrees instead.',
    'toRadians': 'Deprecated in 2.1, use radians instead.',
}

for _name, _doc in _functions.items():
    globals()[_name] = since(1.3)(_create_function(_name, _doc))
for _name, _doc in _functions_1_4.items():
    globals()[_name] = since(1.4)(_create_function(_name, _doc))
for _name, _doc in _binary_mathfunctions.items():
    globals()[_name] = since(1.4)(_create_binary_mathfunction(_name, _doc))
for _name, _doc in _window_functions.items():
    globals()[_name] = since(1.6)(_create_window_function(_name, _doc))
for _name, _doc in _functions_1_6.items():
    globals()[_name] = since(1.6)(_create_function(_name, _doc))
for _name, _doc in _functions_2_1.items():
    globals()[_name] = since(2.1)(_create_function(_name, _doc))
for _name, _message in _functions_deprecated.items():
    globals()[_name] = _wrap_deprecated_function(globals()[_name], _message)
for _name, _doc in _functions_2_4.items():
    globals()[_name] = since(2.4)(_create_function(_name, _doc))
del _name, _doc
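

# Illustrative only: after the loops above, each dictionary key is exposed as a
# module-level function carrying the mapped docstring.  For example (assuming an
# active SparkSession ``spark``):
#
# >>> spark.range(1).select(degrees(lit(3.141592653589793)).alias('d')).collect()
# [Row(d=180.0)]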


@since(1.3)
def approxCountDistinct(col, rsd=None):
    """
    .. note:: Deprecated in 2.1, use :func:`approx_count_distinct` instead.
    """
    warnings.warn("Deprecated in 2.1, use approx_count_distinct instead.", DeprecationWarning)
    return approx_count_distinct(col, rsd)


@since(2.1)
def approx_count_distinct(col, rsd=None):
    """Aggregate function: returns a new :class:`Column` for approximate distinct count of
    column `col`.

    :param rsd: maximum estimation error allowed (default = 0.05). For rsd < 0.01, it is more
        efficient to use :func:`countDistinct`

    >>> df.agg(approx_count_distinct(df.age).alias('distinct_ages')).collect()
    [Row(distinct_ages=2)]
    """
    sc = SparkContext._active_spark_context
    if rsd is None:
        jc = sc._jvm.functions.approx_count_distinct(_to_java_column(col))
    else:
        jc = sc._jvm.functions.approx_count_distinct(_to_java_column(col), rsd)
    return Column(jc)


@since(1.6)
def broadcast(df):
    """Marks a DataFrame as small enough for use in broadcast joins."""

    sc = SparkContext._active_spark_context
    return DataFrame(sc._jvm.functions.broadcast(df._jdf), df.sql_ctx)
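

# Illustrative only: a typical use of broadcast(), hinting that the smaller side of
# a join fits in memory on every executor.  ``people`` and ``countries`` are
# hypothetical DataFrames, not part of this module's doctest fixtures.
#
# >>> people.join(broadcast(countries), people.country_code == countries.code).count()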


@since(1.4)
def coalesce(*cols):
    """Returns the first column that is not null.

    >>> cDf = spark.createDataFrame([(None, None), (1, None), (None, 2)], ("a", "b"))
    >>> cDf.show()
    +----+----+
    |   a|   b|
    +----+----+
    |null|null|
    |   1|null|
    |null|   2|
    +----+----+

    >>> cDf.select(coalesce(cDf["a"], cDf["b"])).show()
    +--------------+
    |coalesce(a, b)|
    +--------------+
    |          null|
    |             1|
    |             2|
    +--------------+

    >>> cDf.select('*', coalesce(cDf["a"], lit(0.0))).show()
    +----+----+----------------+
    |   a|   b|coalesce(a, 0.0)|
    +----+----+----------------+
    |null|null|             0.0|
    |   1|null|             1.0|
    |null|   2|             0.0|
    +----+----+----------------+
    """
    sc = SparkContext._active_spark_context
    jc = sc._jvm.functions.coalesce(_to_seq(sc, cols, _to_java_column))
    return Column(jc)


@since(1.6)
def corr(col1, col2):
    """Returns a new :class:`Column` for the Pearson Correlation Coefficient for ``col1``
    and ``col2``.

    >>> a = range(20)
    >>> b = [2 * x for x in range(20)]
    >>> df = spark.createDataFrame(zip(a, b), ["a", "b"])
    >>> df.agg(corr("a", "b").alias('c')).collect()
    [Row(c=1.0)]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.corr(_to_java_column(col1), _to_java_column(col2)))


@since(2.0)
def covar_pop(col1, col2):
    """Returns a new :class:`Column` for the population covariance of ``col1`` and ``col2``.

    >>> a = [1] * 10
    >>> b = [1] * 10
    >>> df = spark.createDataFrame(zip(a, b), ["a", "b"])
    >>> df.agg(covar_pop("a", "b").alias('c')).collect()
    [Row(c=0.0)]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.covar_pop(_to_java_column(col1), _to_java_column(col2)))


@since(2.0)
def covar_samp(col1, col2):
    """Returns a new :class:`Column` for the sample covariance of ``col1`` and ``col2``.

    >>> a = [1] * 10
    >>> b = [1] * 10
    >>> df = spark.createDataFrame(zip(a, b), ["a", "b"])
    >>> df.agg(covar_samp("a", "b").alias('c')).collect()
    [Row(c=0.0)]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.covar_samp(_to_java_column(col1), _to_java_column(col2)))


@since(1.3)
def countDistinct(col, *cols):
    """Returns a new :class:`Column` for distinct count of ``col`` or ``cols``.

    >>> df.agg(countDistinct(df.age, df.name).alias('c')).collect()
    [Row(c=2)]

    >>> df.agg(countDistinct("age", "name").alias('c')).collect()
    [Row(c=2)]
    """
    sc = SparkContext._active_spark_context
    jc = sc._jvm.functions.countDistinct(_to_java_column(col), _to_seq(sc, cols, _to_java_column))
    return Column(jc)


@since(1.3)
def first(col, ignorenulls=False):
    """Aggregate function: returns the first value in a group.

    The function by default returns the first values it sees. It will return the first non-null
    value it sees when ignoreNulls is set to true. If all values are null, then null is returned.
    """
    sc = SparkContext._active_spark_context
    jc = sc._jvm.functions.first(_to_java_column(col), ignorenulls)
    return Column(jc)
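

# Illustrative only: the effect of ``ignorenulls`` (assuming an active SparkSession
# ``spark``).  With the default, a leading null in the group is returned as-is,
# while ignorenulls=True skips it and returns the first non-null value.
#
# >>> df_nulls = spark.createDataFrame([('a', None), ('a', 1)], ('k', 'v'))
# >>> df_nulls.groupBy('k').agg(first('v'), first('v', ignorenulls=True)).collect()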


@since(2.0)
def grouping(col):
    """
    Aggregate function: indicates whether a specified column in a GROUP BY list is aggregated
    or not, returns 1 for aggregated or 0 for not aggregated in the result set.

    >>> df.cube("name").agg(grouping("name"), sum("age")).orderBy("name").show()
    +-----+--------------+--------+
    | name|grouping(name)|sum(age)|
    +-----+--------------+--------+
    | null|             1|       7|
    |Alice|             0|       2|
    |  Bob|             0|       5|
    +-----+--------------+--------+
    """
    sc = SparkContext._active_spark_context
    jc = sc._jvm.functions.grouping(_to_java_column(col))
    return Column(jc)


@since(2.0)
def grouping_id(*cols):
    """
    Aggregate function: returns the level of grouping, equals to

       (grouping(c1) << (n-1)) + (grouping(c2) << (n-2)) + ... + grouping(cn)

    .. note:: The list of columns should match with grouping columns exactly, or empty (means all
        the grouping columns).

    >>> df.cube("name").agg(grouping_id(), sum("age")).orderBy("name").show()
    +-----+-------------+--------+
    | name|grouping_id()|sum(age)|
    +-----+-------------+--------+
    | null|            1|       7|
    |Alice|            0|       2|
    |  Bob|            0|       5|
    +-----+-------------+--------+
    """
    sc = SparkContext._active_spark_context
    jc = sc._jvm.functions.grouping_id(_to_seq(sc, cols, _to_java_column))
    return Column(jc)


@since(1.6)
def input_file_name():
    """Creates a string column for the file name of the current Spark task.
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.input_file_name())
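

# Illustrative only: input_file_name() is typically used with file-based sources to
# record data provenance.  ``/tmp/some/path`` is a placeholder path, and ``spark``
# an assumed active SparkSession.
#
# >>> spark.read.text("/tmp/some/path").select(input_file_name().alias('src')).first()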


@since(1.6)
def isnan(col):
    """An expression that returns true iff the column is NaN.

    >>> df = spark.createDataFrame([(1.0, float('nan')), (float('nan'), 2.0)], ("a", "b"))
    >>> df.select(isnan("a").alias("r1"), isnan(df.a).alias("r2")).collect()
    [Row(r1=False, r2=False), Row(r1=True, r2=True)]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.isnan(_to_java_column(col)))


@since(1.6)
def isnull(col):
    """An expression that returns true iff the column is null.

    >>> df = spark.createDataFrame([(1, None), (None, 2)], ("a", "b"))
    >>> df.select(isnull("a").alias("r1"), isnull(df.a).alias("r2")).collect()
    [Row(r1=False, r2=False), Row(r1=True, r2=True)]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.isnull(_to_java_column(col)))


@since(1.3)
def last(col, ignorenulls=False):
    """Aggregate function: returns the last value in a group.

    The function by default returns the last values it sees. It will return the last non-null
    value it sees when ignoreNulls is set to true. If all values are null, then null is returned.
    """
    sc = SparkContext._active_spark_context
    jc = sc._jvm.functions.last(_to_java_column(col), ignorenulls)
    return Column(jc)


@since(1.6)
def monotonically_increasing_id():
    """A column that generates monotonically increasing 64-bit integers.

    The generated ID is guaranteed to be monotonically increasing and unique, but not consecutive.
    The current implementation puts the partition ID in the upper 31 bits, and the record number
    within each partition in the lower 33 bits. The assumption is that the data frame has
    less than 1 billion partitions, and each partition has less than 8 billion records.

    As an example, consider a :class:`DataFrame` with two partitions, each with 3 records.
    This expression would return the following IDs:
    0, 1, 2, 8589934592 (1L << 33), 8589934593, 8589934594.

    >>> df0 = sc.parallelize(range(2), 2).mapPartitions(lambda x: [(1,), (2,), (3,)]).toDF(['col1'])
    >>> df0.select(monotonically_increasing_id().alias('id')).collect()
    [Row(id=0), Row(id=1), Row(id=2), Row(id=8589934592), Row(id=8589934593), Row(id=8589934594)]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.monotonically_increasing_id())


@since(1.6)
def nanvl(col1, col2):
    """Returns col1 if it is not NaN, or col2 if col1 is NaN.

    Both inputs should be floating point columns (:class:`DoubleType` or :class:`FloatType`).

    >>> df = spark.createDataFrame([(1.0, float('nan')), (float('nan'), 2.0)], ("a", "b"))
    >>> df.select(nanvl("a", "b").alias("r1"), nanvl(df.a, df.b).alias("r2")).collect()
    [Row(r1=1.0, r2=1.0), Row(r1=2.0, r2=2.0)]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.nanvl(_to_java_column(col1), _to_java_column(col2)))


@ignore_unicode_prefix
@since(1.4)
def rand(seed=None):
    """Generates a random column with independent and identically distributed (i.i.d.) samples
    from U[0.0, 1.0].

    >>> df.withColumn('rand', rand(seed=42) * 3).collect()
    [Row(age=2, name=u'Alice', rand=1.1568609015300986),
     Row(age=5, name=u'Bob', rand=1.403379671529166)]
    """
    sc = SparkContext._active_spark_context
    if seed is not None:
        jc = sc._jvm.functions.rand(seed)
    else:
        jc = sc._jvm.functions.rand()
    return Column(jc)


@ignore_unicode_prefix
@since(1.4)
def randn(seed=None):
    """Generates a column with independent and identically distributed (i.i.d.) samples from
    the standard normal distribution.

    >>> df.withColumn('randn', randn(seed=42)).collect()
    [Row(age=2, name=u'Alice', randn=-0.7556247885860078),
     Row(age=5, name=u'Bob', randn=-0.0861619008451133)]
    """
    sc = SparkContext._active_spark_context
    if seed is not None:
        jc = sc._jvm.functions.randn(seed)
    else:
        jc = sc._jvm.functions.randn()
    return Column(jc)


@since(1.5)
def round(col, scale=0):
    """
    Round the given value to `scale` decimal places using HALF_UP rounding mode if `scale` >= 0
    or at integral part when `scale` < 0.

    >>> spark.createDataFrame([(2.5,)], ['a']).select(round('a', 0).alias('r')).collect()
    [Row(r=3.0)]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.round(_to_java_column(col), scale))


@since(2.0)
def bround(col, scale=0):
    """
    Round the given value to `scale` decimal places using HALF_EVEN rounding mode if `scale` >= 0
    or at integral part when `scale` < 0.

    >>> spark.createDataFrame([(2.5,)], ['a']).select(bround('a', 0).alias('r')).collect()
    [Row(r=2.0)]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.bround(_to_java_column(col), scale))


@since(1.5)
def shiftLeft(col, numBits):
    """Shift the given value numBits left.

    >>> spark.createDataFrame([(21,)], ['a']).select(shiftLeft('a', 1).alias('r')).collect()
    [Row(r=42)]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.shiftLeft(_to_java_column(col), numBits))


@since(1.5)
def shiftRight(col, numBits):
    """(Signed) shift the given value numBits right.

    >>> spark.createDataFrame([(42,)], ['a']).select(shiftRight('a', 1).alias('r')).collect()
    [Row(r=21)]
    """
    sc = SparkContext._active_spark_context
    jc = sc._jvm.functions.shiftRight(_to_java_column(col), numBits)
    return Column(jc)


@since(1.5)
def shiftRightUnsigned(col, numBits):
    """Unsigned shift the given value numBits right.

    >>> df = spark.createDataFrame([(-42,)], ['a'])
    >>> df.select(shiftRightUnsigned('a', 1).alias('r')).collect()
    [Row(r=9223372036854775787)]
    """
    sc = SparkContext._active_spark_context
    jc = sc._jvm.functions.shiftRightUnsigned(_to_java_column(col), numBits)
    return Column(jc)
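

# Hedged usage sketch (not part of the module API): contrasts the signed and unsigned right
# shifts above. For a negative input the signed shift keeps the sign, while the unsigned
# variant reinterprets the 64-bit pattern, as in the doctest above; the expected values in
# the comments are assumptions based on those doctests.
def _shift_right_example(spark):
    """Hypothetical helper: compare shiftRight and shiftRightUnsigned on -42."""
    df = spark.createDataFrame([(-42,)], ['a'])
    return df.select(shiftRight('a', 1).alias('signed'),            # -21
                     shiftRightUnsigned('a', 1).alias('unsigned'))  # 9223372036854775787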


@since(1.6)
def spark_partition_id():
    """A column for partition ID.

    .. note:: This is non-deterministic because it depends on data partitioning and task scheduling.

    >>> df.repartition(1).select(spark_partition_id().alias("pid")).collect()
    [Row(pid=0), Row(pid=0)]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.spark_partition_id())


@since(1.5)
def expr(str):
    """Parses the expression string into the column that it represents

    >>> df.select(expr("length(name)")).collect()
    [Row(length(name)=5), Row(length(name)=3)]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.expr(str))


@ignore_unicode_prefix
@since(1.4)
def struct(*cols):
    """Creates a new struct column.

    :param cols: list of column names (string) or list of :class:`Column` expressions

    >>> df.select(struct('age', 'name').alias("struct")).collect()
    [Row(struct=Row(age=2, name=u'Alice')), Row(struct=Row(age=5, name=u'Bob'))]
    >>> df.select(struct([df.age, df.name]).alias("struct")).collect()
    [Row(struct=Row(age=2, name=u'Alice')), Row(struct=Row(age=5, name=u'Bob'))]
    """
    sc = SparkContext._active_spark_context
    if len(cols) == 1 and isinstance(cols[0], (list, set)):
        cols = cols[0]
    jc = sc._jvm.functions.struct(_to_seq(sc, cols, _to_java_column))
    return Column(jc)


@since(1.5)
def greatest(*cols):
    """
    Returns the greatest value of the list of column names, skipping null values.
    This function takes at least 2 parameters. It will return null iff all parameters are null.

    >>> df = spark.createDataFrame([(1, 4, 3)], ['a', 'b', 'c'])
    >>> df.select(greatest(df.a, df.b, df.c).alias("greatest")).collect()
    [Row(greatest=4)]
    """
    if len(cols) < 2:
        raise ValueError("greatest should take at least two columns")
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.greatest(_to_seq(sc, cols, _to_java_column)))


@since(1.5)
def least(*cols):
    """
    Returns the least value of the list of column names, skipping null values.
    This function takes at least 2 parameters. It will return null iff all parameters are null.

    >>> df = spark.createDataFrame([(1, 4, 3)], ['a', 'b', 'c'])
    >>> df.select(least(df.a, df.b, df.c).alias("least")).collect()
    [Row(least=1)]
    """
    if len(cols) < 2:
        raise ValueError("least should take at least two columns")
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.least(_to_seq(sc, cols, _to_java_column)))
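

# Hedged usage sketch (not part of the module API): illustrates the null-skipping behaviour
# described in the docstrings above. Column names and the expected values in the comments
# are assumptions based on that description: nulls are ignored unless every input is null.
def _greatest_least_null_example(spark):
    """Hypothetical helper: greatest/least ignore nulls among the inputs."""
    df = spark.createDataFrame([(1, 4, 3), (10, None, 3)], ['a', 'b', 'c'])
    return df.select(greatest('a', 'b', 'c').alias('g'),  # expected 4, then 10
                     least('a', 'b', 'c').alias('l'))     # expected 1, then 3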


@since(1.4)
def when(condition, value):
    """Evaluates a list of conditions and returns one of multiple possible result expressions.
    If :func:`Column.otherwise` is not invoked, None is returned for unmatched conditions.

    :param condition: a boolean :class:`Column` expression.
    :param value: a literal value, or a :class:`Column` expression.

    >>> df.select(when(df['age'] == 2, 3).otherwise(4).alias("age")).collect()
    [Row(age=3), Row(age=4)]

    >>> df.select(when(df.age == 2, df.age + 1).alias("age")).collect()
    [Row(age=3), Row(age=None)]
    """
    sc = SparkContext._active_spark_context
    if not isinstance(condition, Column):
        raise TypeError("condition should be a Column")
    v = value._jc if isinstance(value, Column) else value
    jc = sc._jvm.functions.when(condition._jc, v)
    return Column(jc)


@since(1.5)
def log(arg1, arg2=None):
    """Returns the first argument-based logarithm of the second argument.

    If there is only one argument, then this takes the natural logarithm of the argument.

    >>> df.select(log(10.0, df.age).alias('ten')).rdd.map(lambda l: str(l.ten)[:7]).collect()
    ['0.30102', '0.69897']

    >>> df.select(log(df.age).alias('e')).rdd.map(lambda l: str(l.e)[:7]).collect()
    ['0.69314', '1.60943']
    """
    sc = SparkContext._active_spark_context
    if arg2 is None:
        jc = sc._jvm.functions.log(_to_java_column(arg1))
    else:
        jc = sc._jvm.functions.log(arg1, _to_java_column(arg2))
    return Column(jc)


@since(1.5)
def log2(col):
    """Returns the base-2 logarithm of the argument.

    >>> spark.createDataFrame([(4,)], ['a']).select(log2('a').alias('log2')).collect()
    [Row(log2=2.0)]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.log2(_to_java_column(col)))


@since(1.5)
@ignore_unicode_prefix
def conv(col, fromBase, toBase):
    """
    Convert a number in a string column from one base to another.

    >>> df = spark.createDataFrame([("010101",)], ['n'])
    >>> df.select(conv(df.n, 2, 16).alias('hex')).collect()
    [Row(hex=u'15')]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.conv(_to_java_column(col), fromBase, toBase))


@since(1.5)
def factorial(col):
    """
    Computes the factorial of the given value.

    >>> df = spark.createDataFrame([(5,)], ['n'])
    >>> df.select(factorial(df.n).alias('f')).collect()
    [Row(f=120)]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.factorial(_to_java_column(col)))


# --------------- Window functions ------------------------


@since(1.4)
def lag(col, count=1, default=None):
    """
    Window function: returns the value that is `offset` rows before the current row, and
    `defaultValue` if there are fewer than `offset` rows before the current row. For example,
    an `offset` of one will return the previous row at any given point in the window partition.

    This is equivalent to the LAG function in SQL.

    :param col: name of column or expression
    :param count: number of rows to extend
    :param default: default value
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.lag(_to_java_column(col), count, default))


@since(1.4)
def lead(col, count=1, default=None):
    """
    Window function: returns the value that is `offset` rows after the current row, and
    `defaultValue` if there are fewer than `offset` rows after the current row. For example,
    an `offset` of one will return the next row at any given point in the window partition.

    This is equivalent to the LEAD function in SQL.

    :param col: name of column or expression
    :param count: number of rows to extend
    :param default: default value
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.lead(_to_java_column(col), count, default))
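

# Hedged usage sketch (not part of the module API): `lag` and `lead` are window functions,
# so they must be applied with ``.over(...)`` on a window specification. The DataFrame and
# column names below ('grp', 'ts', 'value') are hypothetical.
def _lag_lead_example(df):
    """Hypothetical helper: previous and next 'value' within each 'grp', ordered by 'ts'."""
    from pyspark.sql.window import Window
    w = Window.partitionBy("grp").orderBy("ts")
    return df.select("grp", "ts", "value",
                     lag("value", 1).over(w).alias("prev_value"),
                     lead("value", 1).over(w).alias("next_value"))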


@since(1.4)
def ntile(n):
    """
    Window function: returns the ntile group id (from 1 to `n` inclusive)
    in an ordered window partition. For example, if `n` is 4, the first
    quarter of the rows will get value 1, the second quarter will get 2,
    the third quarter will get 3, and the last quarter will get 4.

    This is equivalent to the NTILE function in SQL.

    :param n: an integer
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.ntile(int(n)))
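

# Hedged usage sketch (not part of the module API): `ntile` also requires a window
# specification; here rows in each hypothetical 'grp' are split into 4 buckets by 'score'.
def _ntile_example(df):
    """Hypothetical helper: assign quartile ids within each 'grp' ordered by 'score'."""
    from pyspark.sql.window import Window
    w = Window.partitionBy("grp").orderBy("score")
    return df.withColumn("quartile", ntile(4).over(w))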


@since(2.4)
def unboundedPreceding():
    """
    Window function: returns the special frame boundary that represents the first row
    in the window partition.
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.unboundedPreceding())


@since(2.4)
def unboundedFollowing():
    """
    Window function: returns the special frame boundary that represents the last row
    in the window partition.
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.unboundedFollowing())


@since(2.4)
def currentRow():
    """
    Window function: returns the special frame boundary that represents the current row
    in the window partition.
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.currentRow())
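

# Hedged usage sketch (not part of the module API): the three boundary helpers above mark
# window-frame endpoints. The sketch below expresses the common "running total" frame with
# the long-standing ``Window.unboundedPreceding`` / ``Window.currentRow`` constants; the
# column names ('grp', 'ts', 'value') are hypothetical.
def _running_total_example(df):
    """Hypothetical helper: cumulative sum of 'value' per 'grp', ordered by 'ts'."""
    from pyspark.sql.window import Window
    w = (Window.partitionBy("grp").orderBy("ts")
         .rowsBetween(Window.unboundedPreceding, Window.currentRow))
    # ``sum`` here is this module's Spark aggregate function, not the Python builtin.
    return df.withColumn("running_total", sum("value").over(w))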


# ---------------------- Date/Timestamp functions ------------------------------

@since(1.5)
def current_date():
    """
    Returns the current date as a :class:`DateType` column.
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.current_date())


def current_timestamp():
    """
    Returns the current timestamp as a :class:`TimestampType` column.
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.current_timestamp())
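

# Hedged usage sketch (not part of the module API): both helpers are evaluated by Spark at
# query time and take no input column; the alias names below are arbitrary.
def _current_date_time_example(df):
    """Hypothetical helper: attach the evaluation date and timestamp to every row."""
    return df.withColumn("today", current_date()) \
             .withColumn("now", current_timestamp())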


@ignore_unicode_prefix
@since(1.5)
def date_format(date, format):
    """
    Converts a date/timestamp/string to a string formatted with the date format given by
    the second argument.

    A pattern could be for instance `dd.MM.yyyy` and could return a string like '18.03.1993'. All
    pattern letters of the Java class `java.text.SimpleDateFormat` can be used.

    .. note:: Whenever possible, use specialized functions like `year`, since they benefit from a
        specialized implementation.

    >>> df = spark.createDataFrame([('2015-04-08',)], ['dt'])
    >>> df.select(date_format('dt', 'MM/dd/yyy').alias('date')).collect()
    [Row(date=u'04/08/2015')]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.date_format(_to_java_column(date), format))


@since(1.5)
def year(col):
    """
    Extract the year of a given date as integer.

    >>> df = spark.createDataFrame([('2015-04-08',)], ['dt'])
    >>> df.select(year('dt').alias('year')).collect()
    [Row(year=2015)]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.year(_to_java_column(col)))


@since(1.5)
def quarter(col):
    """
    Extract the quarter of a given date as integer.

    >>> df = spark.createDataFrame([('2015-04-08',)], ['dt'])
    >>> df.select(quarter('dt').alias('quarter')).collect()
    [Row(quarter=2)]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.quarter(_to_java_column(col)))


@since(1.5)
def month(col):
    """
    Extract the month of a given date as integer.

    >>> df = spark.createDataFrame([('2015-04-08',)], ['dt'])
    >>> df.select(month('dt').alias('month')).collect()
    [Row(month=4)]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.month(_to_java_column(col)))


@since(2.3)
def dayofweek(col):
    """
    Extract the day of the week of a given date as integer.

    >>> df = spark.createDataFrame([('2015-04-08',)], ['dt'])
    >>> df.select(dayofweek('dt').alias('day')).collect()
    [Row(day=4)]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.dayofweek(_to_java_column(col)))


@since(1.5)
def dayofmonth(col):
    """
    Extract the day of the month of a given date as integer.

    >>> df = spark.createDataFrame([('2015-04-08',)], ['dt'])
    >>> df.select(dayofmonth('dt').alias('day')).collect()
    [Row(day=8)]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.dayofmonth(_to_java_column(col)))


@since(1.5)
def dayofyear(col):
    """
    Extract the day of the year of a given date as integer.

    >>> df = spark.createDataFrame([('2015-04-08',)], ['dt'])
    >>> df.select(dayofyear('dt').alias('day')).collect()
    [Row(day=98)]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.dayofyear(_to_java_column(col)))


@since(1.5)
def hour(col):
    """
    Extract the hours of a given date as integer.

    >>> df = spark.createDataFrame([('2015-04-08 13:08:15',)], ['ts'])
    >>> df.select(hour('ts').alias('hour')).collect()
    [Row(hour=13)]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.hour(_to_java_column(col)))


@since(1.5)
def minute(col):
    """
    Extract the minutes of a given date as integer.

    >>> df = spark.createDataFrame([('2015-04-08 13:08:15',)], ['ts'])
    >>> df.select(minute('ts').alias('minute')).collect()
    [Row(minute=8)]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.minute(_to_java_column(col)))


@since(1.5)
def second(col):
    """
    Extract the seconds of a given date as integer.

    >>> df = spark.createDataFrame([('2015-04-08 13:08:15',)], ['ts'])
    >>> df.select(second('ts').alias('second')).collect()
    [Row(second=15)]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.second(_to_java_column(col)))


@since(1.5)
def weekofyear(col):
    """
    Extract the week number of a given date as integer.

    >>> df = spark.createDataFrame([('2015-04-08',)], ['dt'])
    >>> df.select(weekofyear(df.dt).alias('week')).collect()
    [Row(week=15)]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.weekofyear(_to_java_column(col)))


@since(1.5)
def date_add(start, days):
    """
    Returns the date that is `days` days after `start`

    >>> df = spark.createDataFrame([('2015-04-08',)], ['dt'])
    >>> df.select(date_add(df.dt, 1).alias('next_date')).collect()
    [Row(next_date=datetime.date(2015, 4, 9))]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.date_add(_to_java_column(start), days))


@since(1.5)
def date_sub(start, days):
    """
    Returns the date that is `days` days before `start`

    >>> df = spark.createDataFrame([('2015-04-08',)], ['dt'])
    >>> df.select(date_sub(df.dt, 1).alias('prev_date')).collect()
    [Row(prev_date=datetime.date(2015, 4, 7))]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.date_sub(_to_java_column(start), days))


@since(1.5)
def datediff(end, start):
    """
    Returns the number of days from `start` to `end`.

    >>> df = spark.createDataFrame([('2015-04-08','2015-05-10')], ['d1', 'd2'])
    >>> df.select(datediff(df.d2, df.d1).alias('diff')).collect()
    [Row(diff=32)]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.datediff(_to_java_column(end), _to_java_column(start)))


@since(1.5)
def add_months(start, months):
    """
    Returns the date that is `months` months after `start`

    >>> df = spark.createDataFrame([('2015-04-08',)], ['dt'])
    >>> df.select(add_months(df.dt, 1).alias('next_month')).collect()
    [Row(next_month=datetime.date(2015, 5, 8))]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.add_months(_to_java_column(start), months))


@since(1.5)
def months_between(date1, date2):
    """
    Returns the number of months between date1 and date2.

    >>> df = spark.createDataFrame([('1997-02-28 10:30:00', '1996-10-30')], ['date1', 'date2'])
    >>> df.select(months_between(df.date1, df.date2).alias('months')).collect()
    [Row(months=3.9495967...)]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.months_between(_to_java_column(date1), _to_java_column(date2)))


@since(2.2)
def to_date(col, format=None):
    """Converts a :class:`Column` of :class:`pyspark.sql.types.StringType` or
    :class:`pyspark.sql.types.TimestampType` into :class:`pyspark.sql.types.DateType`
    using the optionally specified format. Specify formats according to
    `SimpleDateFormats <http://docs.oracle.com/javase/tutorial/i18n/format/simpleDateFormat.html>`_.
    By default, it follows casting rules to :class:`pyspark.sql.types.DateType` if the format
    is omitted (equivalent to ``col.cast("date")``).

    >>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t'])
    >>> df.select(to_date(df.t).alias('date')).collect()
    [Row(date=datetime.date(1997, 2, 28))]

    >>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t'])
    >>> df.select(to_date(df.t, 'yyyy-MM-dd HH:mm:ss').alias('date')).collect()
    [Row(date=datetime.date(1997, 2, 28))]
    """
    sc = SparkContext._active_spark_context
    if format is None:
        jc = sc._jvm.functions.to_date(_to_java_column(col))
    else:
        jc = sc._jvm.functions.to_date(_to_java_column(col), format)
    return Column(jc)


@since(2.2)
def to_timestamp(col, format=None):
    """Converts a :class:`Column` of :class:`pyspark.sql.types.StringType` or
    :class:`pyspark.sql.types.DateType` into :class:`pyspark.sql.types.TimestampType`
    using the optionally specified format. Specify formats according to
    `SimpleDateFormats <http://docs.oracle.com/javase/tutorial/i18n/format/simpleDateFormat.html>`_.
    By default, it follows casting rules to :class:`pyspark.sql.types.TimestampType` if the format
    is omitted (equivalent to ``col.cast("timestamp")``).

    >>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t'])
    >>> df.select(to_timestamp(df.t).alias('dt')).collect()
    [Row(dt=datetime.datetime(1997, 2, 28, 10, 30))]

    >>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t'])
    >>> df.select(to_timestamp(df.t, 'yyyy-MM-dd HH:mm:ss').alias('dt')).collect()
    [Row(dt=datetime.datetime(1997, 2, 28, 10, 30))]
    """
    sc = SparkContext._active_spark_context
    if format is None:
        jc = sc._jvm.functions.to_timestamp(_to_java_column(col))
    else:
        jc = sc._jvm.functions.to_timestamp(_to_java_column(col), format)
    return Column(jc)


@since(1.5)
def trunc(date, format):
    """
    Returns date truncated to the unit specified by the format.

    :param format: 'year', 'yyyy', 'yy' or 'month', 'mon', 'mm'

    >>> df = spark.createDataFrame([('1997-02-28',)], ['d'])
    >>> df.select(trunc(df.d, 'year').alias('year')).collect()
    [Row(year=datetime.date(1997, 1, 1))]
    >>> df.select(trunc(df.d, 'mon').alias('month')).collect()
    [Row(month=datetime.date(1997, 2, 1))]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.trunc(_to_java_column(date), format))


@since(2.3)
def date_trunc(format, timestamp):
    """
    Returns timestamp truncated to the unit specified by the format.

    :param format: 'year', 'yyyy', 'yy', 'month', 'mon', 'mm',
        'day', 'dd', 'hour', 'minute', 'second', 'week', 'quarter'

    >>> df = spark.createDataFrame([('1997-02-28 05:02:11',)], ['t'])
    >>> df.select(date_trunc('year', df.t).alias('year')).collect()
    [Row(year=datetime.datetime(1997, 1, 1, 0, 0))]
    >>> df.select(date_trunc('mon', df.t).alias('month')).collect()
    [Row(month=datetime.datetime(1997, 2, 1, 0, 0))]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.date_trunc(format, _to_java_column(timestamp)))


@since(1.5)
def next_day(date, dayOfWeek):
    """
    Returns the first date which is later than the value of the date column.

    Day of the week parameter is case insensitive, and accepts:
        "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun".

    >>> df = spark.createDataFrame([('2015-07-27',)], ['d'])
    >>> df.select(next_day(df.d, 'Sun').alias('date')).collect()
    [Row(date=datetime.date(2015, 8, 2))]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.next_day(_to_java_column(date), dayOfWeek))


@since(1.5)
def last_day(date):
    """
    Returns the last day of the month which the given date belongs to.

    >>> df = spark.createDataFrame([('1997-02-10',)], ['d'])
    >>> df.select(last_day(df.d).alias('date')).collect()
    [Row(date=datetime.date(1997, 2, 28))]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.last_day(_to_java_column(date)))


@ignore_unicode_prefix
@since(1.5)
def from_unixtime(timestamp, format="yyyy-MM-dd HH:mm:ss"):
    """
    Converts the number of seconds from unix epoch (1970-01-01 00:00:00 UTC) to a string
    representing the timestamp of that moment in the current system time zone in the given
    format.

    >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles")
    >>> time_df = spark.createDataFrame([(1428476400,)], ['unix_time'])
    >>> time_df.select(from_unixtime('unix_time').alias('ts')).collect()
    [Row(ts=u'2015-04-08 00:00:00')]
    >>> spark.conf.unset("spark.sql.session.timeZone")
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.from_unixtime(_to_java_column(timestamp), format))


@since(1.5)
def unix_timestamp(timestamp=None, format='yyyy-MM-dd HH:mm:ss'):
    """
    Convert time string with given pattern ('yyyy-MM-dd HH:mm:ss', by default)
    to Unix time stamp (in seconds), using the default timezone and the default
    locale, and return null if the conversion fails.

    If `timestamp` is None, then it returns current timestamp.

    >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles")
    >>> time_df = spark.createDataFrame([('2015-04-08',)], ['dt'])
    >>> time_df.select(unix_timestamp('dt', 'yyyy-MM-dd').alias('unix_time')).collect()
    [Row(unix_time=1428476400)]
    >>> spark.conf.unset("spark.sql.session.timeZone")
    """
    sc = SparkContext._active_spark_context
    if timestamp is None:
        return Column(sc._jvm.functions.unix_timestamp())
    return Column(sc._jvm.functions.unix_timestamp(_to_java_column(timestamp), format))
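
# Illustrative usage sketch (added note, not part of the original doctests): `unix_timestamp`
# and `from_unixtime` can be chained to round-trip a date string through epoch seconds.
# The column name 'dt' is hypothetical and the rendered string depends on the session
# time zone and the default output format.
# >>> time_df = spark.createDataFrame([('2015-04-08',)], ['dt'])
# >>> time_df.select(from_unixtime(unix_timestamp('dt', 'yyyy-MM-dd')).alias('ts')).collect()
# [Row(ts=u'2015-04-08 00:00:00')]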


@since(1.5)
def from_utc_timestamp(timestamp, tz):
    """
    Given a timestamp like '2017-07-14 02:40:00.0', interprets it as a time in UTC, and renders
    that time as a timestamp in the given time zone. For example, 'GMT+1' would yield
    '2017-07-14 03:40:00.0'.

    >>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t'])
    >>> df.select(from_utc_timestamp(df.t, "PST").alias('local_time')).collect()
    [Row(local_time=datetime.datetime(1997, 2, 28, 2, 30))]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.from_utc_timestamp(_to_java_column(timestamp), tz))


@since(1.5)
def to_utc_timestamp(timestamp, tz):
    """
    Given a timestamp like '2017-07-14 02:40:00.0', interprets it as a time in the given time
    zone, and renders that time as a timestamp in UTC. For example, 'GMT+1' would yield
    '2017-07-14 01:40:00.0'.

    >>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['ts'])
    >>> df.select(to_utc_timestamp(df.ts, "PST").alias('utc_time')).collect()
    [Row(utc_time=datetime.datetime(1997, 2, 28, 18, 30))]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.to_utc_timestamp(_to_java_column(timestamp), tz))
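
# Illustrative usage sketch (added note, not part of the original doctests): for a fixed
# time zone string, `to_utc_timestamp` and `from_utc_timestamp` undo each other, so the
# composition below returns the original timestamp.
# >>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['ts'])
# >>> df.select(from_utc_timestamp(to_utc_timestamp(df.ts, "PST"), "PST").alias('t')).collect()
# [Row(t=datetime.datetime(1997, 2, 28, 10, 30))]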


@since(2.0)
@ignore_unicode_prefix
def window(timeColumn, windowDuration, slideDuration=None, startTime=None):
    """Bucketize rows into one or more time windows given a timestamp specifying column. Window
    starts are inclusive but the window ends are exclusive, e.g. 12:05 will be in the window
    [12:05,12:10) but not in [12:00,12:05). Windows can support microsecond precision. Windows in
    the order of months are not supported.

    The time column must be of :class:`pyspark.sql.types.TimestampType`.

    Durations are provided as strings, e.g. '1 second', '1 day 12 hours', '2 minutes'. Valid
    interval strings are 'week', 'day', 'hour', 'minute', 'second', 'millisecond', 'microsecond'.
    If the ``slideDuration`` is not provided, the windows will be tumbling windows.

    The startTime is the offset with respect to 1970-01-01 00:00:00 UTC with which to start
    window intervals. For example, in order to have hourly tumbling windows that start 15 minutes
    past the hour, e.g. 12:15-13:15, 13:15-14:15... provide `startTime` as `15 minutes`.

    The output column will be a struct called 'window' by default with the nested columns 'start'
    and 'end', where 'start' and 'end' will be of :class:`pyspark.sql.types.TimestampType`.

    >>> df = spark.createDataFrame([("2016-03-11 09:00:07", 1)]).toDF("date", "val")
    >>> w = df.groupBy(window("date", "5 seconds")).agg(sum("val").alias("sum"))
    >>> w.select(w.window.start.cast("string").alias("start"),
    ...          w.window.end.cast("string").alias("end"), "sum").collect()
    [Row(start=u'2016-03-11 09:00:05', end=u'2016-03-11 09:00:10', sum=1)]
    """
    def check_string_field(field, fieldName):
        if not field or type(field) is not str:
            raise TypeError("%s should be provided as a string" % fieldName)

    sc = SparkContext._active_spark_context
    time_col = _to_java_column(timeColumn)
    check_string_field(windowDuration, "windowDuration")
    if slideDuration and startTime:
        check_string_field(slideDuration, "slideDuration")
        check_string_field(startTime, "startTime")
        res = sc._jvm.functions.window(time_col, windowDuration, slideDuration, startTime)
    elif slideDuration:
        check_string_field(slideDuration, "slideDuration")
        res = sc._jvm.functions.window(time_col, windowDuration, slideDuration)
    elif startTime:
        check_string_field(startTime, "startTime")
        res = sc._jvm.functions.window(time_col, windowDuration, windowDuration, startTime)
    else:
        res = sc._jvm.functions.window(time_col, windowDuration)
    return Column(res)
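
# Illustrative usage sketch (added note, not part of the original doctests): passing a
# `slideDuration` shorter than `windowDuration` produces overlapping (sliding) windows,
# so the single row above falls into two 10-second buckets. Output is indicative only.
# >>> w2 = df.groupBy(window("date", "10 seconds", "5 seconds")).agg(sum("val").alias("sum"))
# >>> sorted(r.start for r in w2.select(w2.window.start.cast("string").alias("start")).collect())
# [u'2016-03-11 09:00:00', u'2016-03-11 09:00:05']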


# ---------------------------- misc functions ----------------------------------

@since(1.5)
@ignore_unicode_prefix
def crc32(col):
    """
    Calculates the cyclic redundancy check value (CRC32) of a binary column and
    returns the value as a bigint.

    >>> spark.createDataFrame([('ABC',)], ['a']).select(crc32('a').alias('crc32')).collect()
    [Row(crc32=2743272264)]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.crc32(_to_java_column(col)))


@ignore_unicode_prefix
@since(1.5)
def md5(col):
    """Calculates the MD5 digest and returns the value as a 32 character hex string.

    >>> spark.createDataFrame([('ABC',)], ['a']).select(md5('a').alias('hash')).collect()
    [Row(hash=u'902fbdd2b1df0c4f70b4a5d23525e932')]
    """
    sc = SparkContext._active_spark_context
    jc = sc._jvm.functions.md5(_to_java_column(col))
    return Column(jc)


@ignore_unicode_prefix
@since(1.5)
def sha1(col):
    """Returns the hex string result of SHA-1.

    >>> spark.createDataFrame([('ABC',)], ['a']).select(sha1('a').alias('hash')).collect()
    [Row(hash=u'3c01bdbb26f358bab27f267924aa2c9a03fcfdb8')]
    """
    sc = SparkContext._active_spark_context
    jc = sc._jvm.functions.sha1(_to_java_column(col))
    return Column(jc)


@ignore_unicode_prefix
@since(1.5)
def sha2(col, numBits):
    """Returns the hex string result of SHA-2 family of hash functions (SHA-224, SHA-256, SHA-384,
    and SHA-512). The numBits indicates the desired bit length of the result, which must have a
    value of 224, 256, 384, 512, or 0 (which is equivalent to 256).

    >>> digests = df.select(sha2(df.name, 256).alias('s')).collect()
    >>> digests[0]
    Row(s=u'3bc51062973c458d5a6f2d8d64a023246354ad7e064b1e4e009ec8a0699a3043')
    >>> digests[1]
    Row(s=u'cd9fb1e148ccd8442e5aa74904cc73bf6fb54d1d54d333bd596aa9bb4bb4e961')
    """
    sc = SparkContext._active_spark_context
    jc = sc._jvm.functions.sha2(_to_java_column(col), numBits)
    return Column(jc)
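
# Illustrative usage sketch (added note, not part of the original doctests): a `numBits`
# of 0 is documented above as equivalent to 256, so the two calls below should agree.
# >>> df.select((sha2(df.name, 0) == sha2(df.name, 256)).alias('same')).collect()
# [Row(same=True), Row(same=True)]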


@since(2.0)
def hash(*cols):
    """Calculates the hash code of given columns, and returns the result as an int column.

    >>> spark.createDataFrame([('ABC',)], ['a']).select(hash('a').alias('hash')).collect()
    [Row(hash=-757602832)]
    """
    sc = SparkContext._active_spark_context
    jc = sc._jvm.functions.hash(_to_seq(sc, cols, _to_java_column))
    return Column(jc)
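
# Illustrative usage sketch (added note, not part of the original doctests): `hash` also
# accepts several columns and combines them into a single 32-bit hash value; the concrete
# integer returned is implementation-defined, hence no literal output is shown here.
# >>> spark.createDataFrame([('ABC', 3)], ['a', 'b']).select(hash('a', 'b').alias('hash')).collect()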


# ---------------------- String/Binary functions ------------------------------

_string_functions = {
    'ascii': 'Computes the numeric value of the first character of the string column.',
    'base64': 'Computes the BASE64 encoding of a binary column and returns it as a string column.',
    'unbase64': 'Decodes a BASE64 encoded string column and returns it as a binary column.',
    'initcap': 'Returns a new string column by converting the first letter of each word to ' +
               'uppercase. Words are delimited by whitespace.',
    'lower': 'Converts a string column to lower case.',
    'upper': 'Converts a string column to upper case.',
    'ltrim': 'Trim the spaces from left end for the specified string value.',
    'rtrim': 'Trim the spaces from right end for the specified string value.',
    'trim': 'Trim the spaces from both ends for the specified string column.',
}


for _name, _doc in _string_functions.items():
    globals()[_name] = since(1.5)(_create_function(_name, _doc))
del _name, _doc


@since(1.5)
@ignore_unicode_prefix
def concat_ws(sep, *cols):
    """
    Concatenates multiple input string columns together into a single string column,
    using the given separator.

    >>> df = spark.createDataFrame([('abcd','123')], ['s', 'd'])
    >>> df.select(concat_ws('-', df.s, df.d).alias('s')).collect()
    [Row(s=u'abcd-123')]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.concat_ws(sep, _to_seq(sc, cols, _to_java_column)))


@since(1.5)
def decode(col, charset):
    """
    Computes the first argument into a string from a binary using the provided character set
    (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16').
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.decode(_to_java_column(col), charset))


@since(1.5)
def encode(col, charset):
    """
    Computes the first argument into a binary from a string using the provided character set
    (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16').
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.encode(_to_java_column(col), charset))
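
# Illustrative usage sketch (added note, not part of the original doctests): `encode` and
# `decode` round-trip a string column through a binary column for any supported charset.
# >>> df = spark.createDataFrame([('abcd',)], ['s'])
# >>> df.select(decode(encode(df.s, 'UTF-8'), 'UTF-8').alias('s')).collect()
# [Row(s=u'abcd')]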


@ignore_unicode_prefix
@since(1.5)
def format_number(col, d):
    """
    Formats the number X to a format like '#,###,###.##', rounded to d decimal places
    with HALF_EVEN round mode, and returns the result as a string.

    :param col: the column name of the numeric value to be formatted
    :param d: the N decimal places

    >>> spark.createDataFrame([(5,)], ['a']).select(format_number('a', 4).alias('v')).collect()
    [Row(v=u'5.0000')]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.format_number(_to_java_column(col), d))


@ignore_unicode_prefix
@since(1.5)
def format_string(format, *cols):
    """
    Formats the arguments in printf-style and returns the result as a string column.

    :param format: string that can contain embedded format tags and used as result column's value
    :param cols: list of column names (string) or list of :class:`Column` expressions to
        be used in formatting

    >>> df = spark.createDataFrame([(5, "hello")], ['a', 'b'])
    >>> df.select(format_string('%d %s', df.a, df.b).alias('v')).collect()
    [Row(v=u'5 hello')]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.format_string(format, _to_seq(sc, cols, _to_java_column)))


@since(1.5)
def instr(str, substr):
    """
    Locate the position of the first occurrence of substr column in the given string.
    Returns null if either of the arguments are null.

    .. note:: The position is not zero based, but 1 based index. Returns 0 if substr
        could not be found in str.

    >>> df = spark.createDataFrame([('abcd',)], ['s',])
    >>> df.select(instr(df.s, 'b').alias('s')).collect()
    [Row(s=2)]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.instr(_to_java_column(str), substr))


@since(1.5)
@ignore_unicode_prefix
def substring(str, pos, len):
    """
    Substring starts at `pos` and is of length `len` when str is String type or
    returns the slice of byte array that starts at `pos` in byte and is of length `len`
    when str is Binary type.

    .. note:: The position is not zero based, but 1 based index.

    >>> df = spark.createDataFrame([('abcd',)], ['s',])
    >>> df.select(substring(df.s, 1, 2).alias('s')).collect()
    [Row(s=u'ab')]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.substring(_to_java_column(str), pos, len))


@since(1.5)
@ignore_unicode_prefix
def substring_index(str, delim, count):
    """
    Returns the substring from string str before count occurrences of the delimiter delim.
    If count is positive, everything to the left of the final delimiter (counting from left) is
    returned. If count is negative, everything to the right of the final delimiter (counting
    from the right) is returned. substring_index performs a case-sensitive match when searching
    for delim.

    >>> df = spark.createDataFrame([('a.b.c.d',)], ['s'])
    >>> df.select(substring_index(df.s, '.', 2).alias('s')).collect()
    [Row(s=u'a.b')]
    >>> df.select(substring_index(df.s, '.', -3).alias('s')).collect()
    [Row(s=u'b.c.d')]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.substring_index(_to_java_column(str), delim, count))


@ignore_unicode_prefix
@since(1.5)
def levenshtein(left, right):
    """Computes the Levenshtein distance of the two given strings.

    >>> df0 = spark.createDataFrame([('kitten', 'sitting',)], ['l', 'r'])
    >>> df0.select(levenshtein('l', 'r').alias('d')).collect()
    [Row(d=3)]
    """
    sc = SparkContext._active_spark_context
    jc = sc._jvm.functions.levenshtein(_to_java_column(left), _to_java_column(right))
    return Column(jc)


@since(1.5)
def locate(substr, str, pos=1):
    """
    Locate the position of the first occurrence of substr in a string column, after position pos.

    .. note:: The position is not zero based, but 1 based index. Returns 0 if substr
        could not be found in str.

    :param substr: a string
    :param str: a Column of :class:`pyspark.sql.types.StringType`
    :param pos: start position (1 based index)

    >>> df = spark.createDataFrame([('abcd',)], ['s',])
    >>> df.select(locate('b', df.s, 1).alias('s')).collect()
    [Row(s=2)]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.locate(substr, _to_java_column(str), pos))
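
# Illustrative usage sketch (added note, not part of the original doctests): starting the
# search past the only match returns 0, mirroring the note above.
# >>> df.select(locate('b', df.s, 3).alias('s')).collect()
# [Row(s=0)]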


@since(1.5)
@ignore_unicode_prefix
def lpad(col, len, pad):
    """
    Left-pad the string column to width `len` with `pad`.

    >>> df = spark.createDataFrame([('abcd',)], ['s',])
    >>> df.select(lpad(df.s, 6, '#').alias('s')).collect()
    [Row(s=u'##abcd')]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.lpad(_to_java_column(col), len, pad))


@since(1.5)
@ignore_unicode_prefix
def rpad(col, len, pad):
    """
    Right-pad the string column to width `len` with `pad`.

    >>> df = spark.createDataFrame([('abcd',)], ['s',])
    >>> df.select(rpad(df.s, 6, '#').alias('s')).collect()
    [Row(s=u'abcd##')]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.rpad(_to_java_column(col), len, pad))


@since(1.5)
@ignore_unicode_prefix
def repeat(col, n):
    """
    Repeats a string column n times, and returns it as a new string column.

    >>> df = spark.createDataFrame([('ab',)], ['s',])
    >>> df.select(repeat(df.s, 3).alias('s')).collect()
    [Row(s=u'ababab')]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.repeat(_to_java_column(col), n))


@since(1.5)
@ignore_unicode_prefix
def split(str, pattern):
    """
    Splits str around pattern (pattern is a regular expression).

    .. note:: pattern is a string representing the regular expression.

    >>> df = spark.createDataFrame([('ab12cd',)], ['s',])
    >>> df.select(split(df.s, '[0-9]+').alias('s')).collect()
    [Row(s=[u'ab', u'cd'])]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.split(_to_java_column(str), pattern))


@ignore_unicode_prefix
@since(1.5)
def regexp_extract(str, pattern, idx):
    """Extract a specific group matched by a Java regex, from the specified string column.
    If the regex did not match, or the specified group did not match, an empty string is returned.

    >>> df = spark.createDataFrame([('100-200',)], ['str'])
    >>> df.select(regexp_extract('str', '(\d+)-(\d+)', 1).alias('d')).collect()
    [Row(d=u'100')]
    >>> df = spark.createDataFrame([('foo',)], ['str'])
    >>> df.select(regexp_extract('str', '(\d+)', 1).alias('d')).collect()
    [Row(d=u'')]
    >>> df = spark.createDataFrame([('aaaac',)], ['str'])
    >>> df.select(regexp_extract('str', '(a+)(b)?(c)', 2).alias('d')).collect()
    [Row(d=u'')]
    """
    sc = SparkContext._active_spark_context
    jc = sc._jvm.functions.regexp_extract(_to_java_column(str), pattern, idx)
    return Column(jc)


@ignore_unicode_prefix
@since(1.5)
def regexp_replace(str, pattern, replacement):
    """Replace all substrings of the specified string value that match regexp with rep.

    >>> df = spark.createDataFrame([('100-200',)], ['str'])
    >>> df.select(regexp_replace('str', '(\\d+)', '--').alias('d')).collect()
    [Row(d=u'-----')]
    """
    sc = SparkContext._active_spark_context
    jc = sc._jvm.functions.regexp_replace(_to_java_column(str), pattern, replacement)
    return Column(jc)
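
# Illustrative usage sketch (assumption, not part of the original doctests): the replacement
# string follows Java regex semantics, so group references such as '$1' are expected to
# work; treat the exact output below as indicative only.
# >>> df.select(regexp_replace('str', '(\\d+)-(\\d+)', '$2-$1').alias('d')).collect()
# [Row(d=u'200-100')]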


@ignore_unicode_prefix
@since(1.5)
def initcap(col):
    """Translate the first letter of each word to upper case in the sentence.

    >>> spark.createDataFrame([('ab cd',)], ['a']).select(initcap("a").alias('v')).collect()
    [Row(v=u'Ab Cd')]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.initcap(_to_java_column(col)))


@since(1.5)
@ignore_unicode_prefix
def soundex(col):
    """
    Returns the SoundEx encoding for a string

    >>> df = spark.createDataFrame([("Peters",),("Uhrbach",)], ['name'])
    >>> df.select(soundex(df.name).alias("soundex")).collect()
    [Row(soundex=u'P362'), Row(soundex=u'U612')]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.soundex(_to_java_column(col)))


@ignore_unicode_prefix
@since(1.5)
def bin(col):
    """Returns the string representation of the binary value of the given column.

    >>> df.select(bin(df.age).alias('c')).collect()
    [Row(c=u'10'), Row(c=u'101')]
    """
    sc = SparkContext._active_spark_context
    jc = sc._jvm.functions.bin(_to_java_column(col))
    return Column(jc)


@ignore_unicode_prefix
@since(1.5)
def hex(col):
    """Computes hex value of the given column, which could be :class:`pyspark.sql.types.StringType`,
    :class:`pyspark.sql.types.BinaryType`, :class:`pyspark.sql.types.IntegerType` or
    :class:`pyspark.sql.types.LongType`.

    >>> spark.createDataFrame([('ABC', 3)], ['a', 'b']).select(hex('a'), hex('b')).collect()
    [Row(hex(a)=u'414243', hex(b)=u'3')]
    """
    sc = SparkContext._active_spark_context
    jc = sc._jvm.functions.hex(_to_java_column(col))
    return Column(jc)


@ignore_unicode_prefix
@since(1.5)
def unhex(col):
    """Inverse of hex. Interprets each pair of characters as a hexadecimal number
    and converts to the byte representation of number.

    >>> spark.createDataFrame([('414243',)], ['a']).select(unhex('a')).collect()
    [Row(unhex(a)=bytearray(b'ABC'))]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.unhex(_to_java_column(col)))


@ignore_unicode_prefix
@since(1.5)
def length(col):
    """Computes the character length of string data or number of bytes of binary data.
    The length of character data includes the trailing spaces. The length of binary data
    includes binary zeros.

    >>> spark.createDataFrame([('ABC ',)], ['a']).select(length('a').alias('length')).collect()
    [Row(length=4)]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.length(_to_java_column(col)))


@ignore_unicode_prefix
@since(1.5)
def translate(srcCol, matching, replace):
    """A function that translates every character in `srcCol` that appears in `matching`
    into the character at the same position in `replace`. Characters in `matching` that
    have no counterpart in `replace` (because `replace` is shorter) are removed from the
    string, as the example below shows for the character 't'.

    >>> spark.createDataFrame([('translate',)], ['a']).select(translate('a', "rnlt", "123") \\
    ...     .alias('r')).collect()
    [Row(r=u'1a2s3ae')]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.translate(_to_java_column(srcCol), matching, replace))


# ---------------------- Collection functions ------------------------------

@ignore_unicode_prefix
@since(2.0)
def create_map(*cols):
    """Creates a new map column.

    :param cols: list of column names (string) or list of :class:`Column` expressions that are
        grouped as key-value pairs, e.g. (key1, value1, key2, value2, ...).

    >>> df.select(create_map('name', 'age').alias("map")).collect()
    [Row(map={u'Alice': 2}), Row(map={u'Bob': 5})]
    >>> df.select(create_map([df.name, df.age]).alias("map")).collect()
    [Row(map={u'Alice': 2}), Row(map={u'Bob': 5})]
    """
    sc = SparkContext._active_spark_context
    if len(cols) == 1 and isinstance(cols[0], (list, set)):
        cols = cols[0]
    jc = sc._jvm.functions.map(_to_seq(sc, cols, _to_java_column))
    return Column(jc)


@since(1.4)
def array(*cols):
    """Creates a new array column.

    :param cols: list of column names (string) or list of :class:`Column` expressions that have
        the same data type.

    >>> df.select(array('age', 'age').alias("arr")).collect()
    [Row(arr=[2, 2]), Row(arr=[5, 5])]
    >>> df.select(array([df.age, df.age]).alias("arr")).collect()
    [Row(arr=[2, 2]), Row(arr=[5, 5])]
    """
    sc = SparkContext._active_spark_context
    if len(cols) == 1 and isinstance(cols[0], (list, set)):
        cols = cols[0]
    jc = sc._jvm.functions.array(_to_seq(sc, cols, _to_java_column))
    return Column(jc)


@since(1.5)
def array_contains(col, value):
    """
    Collection function: returns null if the array is null, true if the array contains the
    given value, and false otherwise.

    :param col: name of column containing array
    :param value: value to check for in array

    >>> df = spark.createDataFrame([(["a", "b", "c"],), ([],)], ['data'])
    >>> df.select(array_contains(df.data, "a")).collect()
    [Row(array_contains(data, a)=True), Row(array_contains(data, a)=False)]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.array_contains(_to_java_column(col), value))


@since(1.5)
@ignore_unicode_prefix
def concat(*cols):
    """
    Concatenates multiple input columns together into a single column.
    The function works with strings, binary and compatible array columns.

    >>> df = spark.createDataFrame([('abcd','123')], ['s', 'd'])
    >>> df.select(concat(df.s, df.d).alias('s')).collect()
    [Row(s=u'abcd123')]

    >>> df = spark.createDataFrame([([1, 2], [3, 4], [5]), ([1, 2], None, [3])], ['a', 'b', 'c'])
    >>> df.select(concat(df.a, df.b, df.c).alias("arr")).collect()
    [Row(arr=[1, 2, 3, 4, 5]), Row(arr=None)]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.concat(_to_seq(sc, cols, _to_java_column)))


@since(2.4)
def array_position(col, value):
    """
    Collection function: Locates the position of the first occurrence of the given value
    in the given array. Returns null if either of the arguments are null.

    .. note:: The position is not zero based, but 1 based index. Returns 0 if the given
        value could not be found in the array.

    >>> df = spark.createDataFrame([(["c", "b", "a"],), ([],)], ['data'])
    >>> df.select(array_position(df.data, "a")).collect()
    [Row(array_position(data, a)=3), Row(array_position(data, a)=0)]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.array_position(_to_java_column(col), value))


@ignore_unicode_prefix
@since(2.4)
def element_at(col, extraction):
    """
    Collection function: Returns element of array at given index in extraction if col is array.
    Returns value for the given key in extraction if col is map.

    :param col: name of column containing array or map
    :param extraction: index to check for in array or key to check for in map

    .. note:: The position is not zero based, but 1 based index.

    >>> df = spark.createDataFrame([(["a", "b", "c"],), ([],)], ['data'])
    >>> df.select(element_at(df.data, 1)).collect()
    [Row(element_at(data, 1)=u'a'), Row(element_at(data, 1)=None)]

    >>> df = spark.createDataFrame([({"a": 1.0, "b": 2.0},), ({},)], ['data'])
    >>> df.select(element_at(df.data, "a")).collect()
    [Row(element_at(data, a)=1.0), Row(element_at(data, a)=None)]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.element_at(_to_java_column(col), extraction))
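
# Illustrative usage sketch (assumption, not part of the original doctests): for array
# columns a negative `extraction` is expected to index from the end of the array.
# >>> df = spark.createDataFrame([(["a", "b", "c"],)], ['data'])
# >>> df.select(element_at(df.data, -1)).collect()
# [Row(element_at(data, -1)=u'c')]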


@since(1.4)
def explode(col):
    """Returns a new row for each element in the given array or map.

    >>> from pyspark.sql import Row
    >>> eDF = spark.createDataFrame([Row(a=1, intlist=[1,2,3], mapfield={"a": "b"})])
    >>> eDF.select(explode(eDF.intlist).alias("anInt")).collect()
    [Row(anInt=1), Row(anInt=2), Row(anInt=3)]

    >>> eDF.select(explode(eDF.mapfield).alias("key", "value")).show()
    +---+-----+
    |key|value|
    +---+-----+
    |  a|    b|
    +---+-----+
    """
    sc = SparkContext._active_spark_context
    jc = sc._jvm.functions.explode(_to_java_column(col))
    return Column(jc)


@since(2.1)
def posexplode(col):
    """Returns a new row for each element with position in the given array or map.

    >>> from pyspark.sql import Row
    >>> eDF = spark.createDataFrame([Row(a=1, intlist=[1,2,3], mapfield={"a": "b"})])
    >>> eDF.select(posexplode(eDF.intlist)).collect()
    [Row(pos=0, col=1), Row(pos=1, col=2), Row(pos=2, col=3)]

    >>> eDF.select(posexplode(eDF.mapfield)).show()
    +---+---+-----+
    |pos|key|value|
    +---+---+-----+
    |  0|  a|    b|
    +---+---+-----+
    """
    sc = SparkContext._active_spark_context
    jc = sc._jvm.functions.posexplode(_to_java_column(col))
    return Column(jc)


@since(2.3)
def explode_outer(col):
    """Returns a new row for each element in the given array or map.
    Unlike explode, if the array/map is null or empty then null is produced.

    >>> df = spark.createDataFrame(
    ...     [(1, ["foo", "bar"], {"x": 1.0}), (2, [], {}), (3, None, None)],
    ...     ("id", "an_array", "a_map")
    ... )
    >>> df.select("id", "an_array", explode_outer("a_map")).show()
    +---+----------+----+-----+
    | id|  an_array| key|value|
    +---+----------+----+-----+
    |  1|[foo, bar]|   x|  1.0|
    |  2|        []|null| null|
    |  3|      null|null| null|
    +---+----------+----+-----+

    >>> df.select("id", "a_map", explode_outer("an_array")).show()
    +---+----------+----+
    | id|     a_map| col|
    +---+----------+----+
    |  1|[x -> 1.0]| foo|
    |  1|[x -> 1.0]| bar|
    |  2|        []|null|
    |  3|      null|null|
    +---+----------+----+
    """
    sc = SparkContext._active_spark_context
    jc = sc._jvm.functions.explode_outer(_to_java_column(col))
    return Column(jc)


@since(2.3)
def posexplode_outer(col):
    """Returns a new row for each element with position in the given array or map.
    Unlike posexplode, if the array/map is null or empty then the row (null, null) is produced.

    >>> df = spark.createDataFrame(
    ...     [(1, ["foo", "bar"], {"x": 1.0}), (2, [], {}), (3, None, None)],
    ...     ("id", "an_array", "a_map")
    ... )
    >>> df.select("id", "an_array", posexplode_outer("a_map")).show()
    +---+----------+----+----+-----+
    | id|  an_array| pos| key|value|
    +---+----------+----+----+-----+
    |  1|[foo, bar]|   0|   x|  1.0|
    |  2|        []|null|null| null|
    |  3|      null|null|null| null|
    +---+----------+----+----+-----+
    >>> df.select("id", "a_map", posexplode_outer("an_array")).show()
    +---+----------+----+----+
    | id|     a_map| pos| col|
    +---+----------+----+----+
    |  1|[x -> 1.0]|   0| foo|
    |  1|[x -> 1.0]|   1| bar|
    |  2|        []|null|null|
    |  3|      null|null|null|
    +---+----------+----+----+
    """
    sc = SparkContext._active_spark_context
    jc = sc._jvm.functions.posexplode_outer(_to_java_column(col))
    return Column(jc)


@ignore_unicode_prefix
@since(1.6)
def get_json_object(col, path):
    """
    Extracts json object from a json string based on json path specified, and returns json string
    of the extracted json object. It will return null if the input json string is invalid.

    :param col: string column in json format
    :param path: path to the json object to extract

    >>> data = [("1", '''{"f1": "value1", "f2": "value2"}'''), ("2", '''{"f1": "value12"}''')]
    >>> df = spark.createDataFrame(data, ("key", "jstring"))
    >>> df.select(df.key, get_json_object(df.jstring, '$.f1').alias("c0"), \\
    ...                   get_json_object(df.jstring, '$.f2').alias("c1") ).collect()
    [Row(key=u'1', c0=u'value1', c1=u'value2'), Row(key=u'2', c0=u'value12', c1=None)]
    """
    sc = SparkContext._active_spark_context
    jc = sc._jvm.functions.get_json_object(_to_java_column(col), path)
    return Column(jc)


@ignore_unicode_prefix
@since(1.6)
def json_tuple(col, *fields):
    """Creates a new row for a json column according to the given field names.

    :param col: string column in json format
    :param fields: list of fields to extract

    >>> data = [("1", '''{"f1": "value1", "f2": "value2"}'''), ("2", '''{"f1": "value12"}''')]
    >>> df = spark.createDataFrame(data, ("key", "jstring"))
    >>> df.select(df.key, json_tuple(df.jstring, 'f1', 'f2')).collect()
    [Row(key=u'1', c0=u'value1', c1=u'value2'), Row(key=u'2', c0=u'value12', c1=None)]
    """
    sc = SparkContext._active_spark_context
    jc = sc._jvm.functions.json_tuple(_to_java_column(col), _to_seq(sc, fields))
    return Column(jc)
|
|
|
|
|
|
|
|
|
2016-09-29 16:01:10 -04:00
|
|
|
@since(2.1)
|
|
|
|
def from_json(col, schema, options={}):
|
|
|
|
"""
|
[SPARK-21513][SQL][FOLLOWUP] Allow UDF to_json support converting MapType to json for PySpark and SparkR
## What changes were proposed in this pull request?
In previous work SPARK-21513, we has allowed `MapType` and `ArrayType` of `MapType`s convert to a json string but only for Scala API. In this follow-up PR, we will make SparkSQL support it for PySpark and SparkR, too. We also fix some little bugs and comments of the previous work in this follow-up PR.
### For PySpark
```
>>> data = [(1, {"name": "Alice"})]
>>> df = spark.createDataFrame(data, ("key", "value"))
>>> df.select(to_json(df.value).alias("json")).collect()
[Row(json=u'{"name":"Alice")']
>>> data = [(1, [{"name": "Alice"}, {"name": "Bob"}])]
>>> df = spark.createDataFrame(data, ("key", "value"))
>>> df.select(to_json(df.value).alias("json")).collect()
[Row(json=u'[{"name":"Alice"},{"name":"Bob"}]')]
```
### For SparkR
```
# Converts a map into a JSON object
df2 <- sql("SELECT map('name', 'Bob')) as people")
df2 <- mutate(df2, people_json = to_json(df2$people))
# Converts an array of maps into a JSON array
df2 <- sql("SELECT array(map('name', 'Bob'), map('name', 'Alice')) as people")
df2 <- mutate(df2, people_json = to_json(df2$people))
```
## How was this patch tested?
Add unit test cases.
cc viirya HyukjinKwon
Author: goldmedal <liugs963@gmail.com>
Closes #19223 from goldmedal/SPARK-21513-fp-PySaprkAndSparkR.
2017-09-14 22:53:10 -04:00
|
|
|
Parses a column containing a JSON string into a :class:`StructType` or :class:`ArrayType`
|
|
|
|
of :class:`StructType`\\s with the specified schema. Returns `null`, in the case of an
|
|
|
|
unparseable string.
|
2016-09-29 16:01:10 -04:00
|
|
|
|
|
|
|
:param col: string column in json format
|
[SPARK-21266][R][PYTHON] Support schema a DDL-formatted string in dapply/gapply/from_json
## What changes were proposed in this pull request?
This PR supports schema in a DDL formatted string for `from_json` in R/Python and `dapply` and `gapply` in R, which are commonly used and/or consistent with Scala APIs.
Additionally, this PR exposes `structType` in R to allow working around in other possible corner cases.
**Python**
`from_json`
```python
from pyspark.sql.functions import from_json
data = [(1, '''{"a": 1}''')]
df = spark.createDataFrame(data, ("key", "value"))
df.select(from_json(df.value, "a INT").alias("json")).show()
```
**R**
`from_json`
```R
df <- sql("SELECT named_struct('name', 'Bob') as people")
df <- mutate(df, people_json = to_json(df$people))
head(select(df, from_json(df$people_json, "name STRING")))
```
`structType.character`
```R
structType("a STRING, b INT")
```
`dapply`
```R
dapply(createDataFrame(list(list(1.0)), "a"), function(x) {x}, "a DOUBLE")
```
`gapply`
```R
gapply(createDataFrame(list(list(1.0)), "a"), "a", function(key, x) { x }, "a DOUBLE")
```
## How was this patch tested?
Doc tests for `from_json` in Python and unit tests `test_sparkSQL.R` in R.
Author: hyukjinkwon <gurwls223@gmail.com>
Closes #18498 from HyukjinKwon/SPARK-21266.
2017-07-10 13:40:03 -04:00
|
|
|
:param schema: a StructType or ArrayType of StructType to use when parsing the json column.
|
2016-09-29 16:01:10 -04:00
|
|
|
:param options: options to control parsing. accepts the same options as the json datasource
|
|
|
|
|
[SPARK-21266][R][PYTHON] Support schema a DDL-formatted string in dapply/gapply/from_json
## What changes were proposed in this pull request?
This PR supports schema in a DDL formatted string for `from_json` in R/Python and `dapply` and `gapply` in R, which are commonly used and/or consistent with Scala APIs.
Additionally, this PR exposes `structType` in R to allow working around in other possible corner cases.
**Python**
`from_json`
```python
from pyspark.sql.functions import from_json
data = [(1, '''{"a": 1}''')]
df = spark.createDataFrame(data, ("key", "value"))
df.select(from_json(df.value, "a INT").alias("json")).show()
```
**R**
`from_json`
```R
df <- sql("SELECT named_struct('name', 'Bob') as people")
df <- mutate(df, people_json = to_json(df$people))
head(select(df, from_json(df$people_json, "name STRING")))
```
`structType.character`
```R
structType("a STRING, b INT")
```
`dapply`
```R
dapply(createDataFrame(list(list(1.0)), "a"), function(x) {x}, "a DOUBLE")
```
`gapply`
```R
gapply(createDataFrame(list(list(1.0)), "a"), "a", function(key, x) { x }, "a DOUBLE")
```
## How was this patch tested?
Doc tests for `from_json` in Python and unit tests `test_sparkSQL.R` in R.
Author: hyukjinkwon <gurwls223@gmail.com>
Closes #18498 from HyukjinKwon/SPARK-21266.
2017-07-10 13:40:03 -04:00
|
|
|
.. note:: Since Spark 2.3, the DDL-formatted string or a JSON format string is also
|
|
|
|
supported for ``schema``.
|
|
|
|
|
2016-09-29 16:01:10 -04:00
|
|
|
>>> from pyspark.sql.types import *
|
|
|
|
>>> data = [(1, '''{"a": 1}''')]
|
|
|
|
>>> schema = StructType([StructField("a", IntegerType())])
|
|
|
|
>>> df = spark.createDataFrame(data, ("key", "value"))
|
|
|
|
>>> df.select(from_json(df.value, schema).alias("json")).collect()
|
|
|
|
[Row(json=Row(a=1))]
|
2017-07-10 13:40:03 -04:00
|
|
|
>>> df.select(from_json(df.value, "a INT").alias("json")).collect()
|
|
|
|
[Row(json=Row(a=1))]
|
[SPARK-19595][SQL] Support json array in from_json
## What changes were proposed in this pull request?
This PR proposes to both,
**Do not allow json arrays with multiple elements and return null in `from_json` with `StructType` as the schema.**
Currently, it reads only a single row when the input is a JSON array. So, the code below:
```scala
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
val schema = StructType(StructField("a", IntegerType) :: Nil)
Seq(("""[{"a": 1}, {"a": 2}]""")).toDF("struct").select(from_json(col("struct"), schema)).show()
```
prints
```
+--------------------+
|jsontostruct(struct)|
+--------------------+
| [1]|
+--------------------+
```
This PR simply suggests printing this as `null` if the schema is `StructType` and the input is a JSON array with multiple elements:
```
+--------------------+
|jsontostruct(struct)|
+--------------------+
| null|
+--------------------+
```
**Support json arrays in `from_json` with `ArrayType` as the schema.**
```scala
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
val schema = ArrayType(StructType(StructField("a", IntegerType) :: Nil))
Seq(("""[{"a": 1}, {"a": 2}]""")).toDF("array").select(from_json(col("array"), schema)).show()
```
prints
```
+-------------------+
|jsontostruct(array)|
+-------------------+
| [[1], [2]]|
+-------------------+
```
## How was this patch tested?
Unit test in `JsonExpressionsSuite`, `JsonFunctionsSuite`, Python doctests and manual test.
Author: hyukjinkwon <gurwls223@gmail.com>
Closes #16929 from HyukjinKwon/disallow-array.
2017-03-05 17:35:06 -05:00
|
|
|
>>> data = [(1, '''[{"a": 1}]''')]
|
|
|
|
>>> schema = ArrayType(StructType([StructField("a", IntegerType())]))
|
|
|
|
>>> df = spark.createDataFrame(data, ("key", "value"))
|
|
|
|
>>> df.select(from_json(df.value, schema).alias("json")).collect()
|
|
|
|
[Row(json=[Row(a=1)])]
|
2016-09-29 16:01:10 -04:00
|
|
|
"""
|
|
|
|
|
|
|
|
sc = SparkContext._active_spark_context
|
2017-07-10 13:40:03 -04:00
|
|
|
if isinstance(schema, DataType):
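# A DataType schema is serialized to its JSON representation before being sent to
# the JVM; DDL-formatted or JSON-format string schemas are passed through as-is.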
|
|
|
|
schema = schema.json()
|
|
|
|
jc = sc._jvm.functions.from_json(_to_java_column(col), schema, options)
|
2016-09-29 16:01:10 -04:00
|
|
|
return Column(jc)
|
|
|
|
|
|
|
|
|
2016-11-01 15:46:41 -04:00
|
|
|
@ignore_unicode_prefix
|
|
|
|
@since(2.1)
|
|
|
|
def to_json(col, options={}):
|
|
|
|
"""
|
[SPARK-21513][SQL][FOLLOWUP] Allow UDF to_json support converting MapType to json for PySpark and SparkR
## What changes were proposed in this pull request?
In the previous work SPARK-21513, we allowed `MapType` and `ArrayType` of `MapType`s to be converted to a JSON string, but only for the Scala API. In this follow-up PR, we make Spark SQL support it for PySpark and SparkR, too. We also fix some small bugs and comments from the previous work.
### For PySpark
```
>>> data = [(1, {"name": "Alice"})]
>>> df = spark.createDataFrame(data, ("key", "value"))
>>> df.select(to_json(df.value).alias("json")).collect()
[Row(json=u'{"name":"Alice"}')]
>>> data = [(1, [{"name": "Alice"}, {"name": "Bob"}])]
>>> df = spark.createDataFrame(data, ("key", "value"))
>>> df.select(to_json(df.value).alias("json")).collect()
[Row(json=u'[{"name":"Alice"},{"name":"Bob"}]')]
```
### For SparkR
```
# Converts a map into a JSON object
df2 <- sql("SELECT map('name', 'Bob') as people")
df2 <- mutate(df2, people_json = to_json(df2$people))
# Converts an array of maps into a JSON array
df2 <- sql("SELECT array(map('name', 'Bob'), map('name', 'Alice')) as people")
df2 <- mutate(df2, people_json = to_json(df2$people))
```
## How was this patch tested?
Add unit test cases.
cc viirya HyukjinKwon
Author: goldmedal <liugs963@gmail.com>
Closes #19223 from goldmedal/SPARK-21513-fp-PySaprkAndSparkR.
2017-09-14 22:53:10 -04:00
|
|
|
Converts a column containing a :class:`StructType`, :class:`ArrayType` of
|
|
|
|
:class:`StructType`\\s, a :class:`MapType` or :class:`ArrayType` of :class:`MapType`\\s
|
|
|
|
into a JSON string. Throws an exception in the case of an unsupported type.
|
2016-11-01 15:46:41 -04:00
|
|
|
|
2017-09-14 22:53:10 -04:00
|
|
|
:param col: name of column containing the struct, array of the structs, the map or
|
|
|
|
array of the maps.
|
2016-11-01 15:46:41 -04:00
|
|
|
:param options: options to control converting. Accepts the same options as the JSON datasource.
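A minimal sketch of passing writer options (it assumes a DataFrame whose struct column contains timestamp fields; the option shown is one of the JSON datasource write options):
```python
# Sketch: options are forwarded to the JSON writer, e.g. a custom timestampFormat
# used for timestamp fields inside the struct being converted.
df.select(to_json(df.value, {"timestampFormat": "yyyy/MM/dd HH:mm"}).alias("json"))
```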
|
|
|
|
|
|
|
|
>>> from pyspark.sql import Row
|
|
|
|
>>> from pyspark.sql.types import *
|
|
|
|
>>> data = [(1, Row(name='Alice', age=2))]
|
|
|
|
>>> df = spark.createDataFrame(data, ("key", "value"))
|
|
|
|
>>> df.select(to_json(df.value).alias("json")).collect()
|
|
|
|
[Row(json=u'{"age":2,"name":"Alice"}')]
|
2017-03-20 01:33:01 -04:00
|
|
|
>>> data = [(1, [Row(name='Alice', age=2), Row(name='Bob', age=3)])]
|
|
|
|
>>> df = spark.createDataFrame(data, ("key", "value"))
|
|
|
|
>>> df.select(to_json(df.value).alias("json")).collect()
|
|
|
|
[Row(json=u'[{"age":2,"name":"Alice"},{"age":3,"name":"Bob"}]')]
|
2017-09-14 22:53:10 -04:00
|
|
|
>>> data = [(1, {"name": "Alice"})]
|
|
|
|
>>> df = spark.createDataFrame(data, ("key", "value"))
|
|
|
|
>>> df.select(to_json(df.value).alias("json")).collect()
|
|
|
|
[Row(json=u'{"name":"Alice"}')]
|
|
|
|
>>> data = [(1, [{"name": "Alice"}, {"name": "Bob"}])]
|
|
|
|
>>> df = spark.createDataFrame(data, ("key", "value"))
|
|
|
|
>>> df.select(to_json(df.value).alias("json")).collect()
|
|
|
|
[Row(json=u'[{"name":"Alice"},{"name":"Bob"}]')]
|
2016-11-01 15:46:41 -04:00
|
|
|
"""
|
|
|
|
|
|
|
|
sc = SparkContext._active_spark_context
|
|
|
|
jc = sc._jvm.functions.to_json(_to_java_column(col), options)
|
|
|
|
return Column(jc)
|
|
|
|
|
|
|
|
|
2015-07-21 03:53:20 -04:00
|
|
|
@since(1.5)
|
|
|
|
def size(col):
|
|
|
|
"""
|
|
|
|
Collection function: returns the length of the array or map stored in the column.
|
2015-07-31 19:05:26 -04:00
|
|
|
|
2015-07-21 03:53:20 -04:00
|
|
|
:param col: name of column or expression
|
|
|
|
|
2016-05-23 21:14:48 -04:00
|
|
|
>>> df = spark.createDataFrame([([1, 2, 3],),([1],),([],)], ['data'])
|
2015-07-21 03:53:20 -04:00
|
|
|
>>> df.select(size(df.data)).collect()
|
|
|
|
[Row(size(data)=3), Row(size(data)=1), Row(size(data)=0)]
|
|
|
|
"""
|
|
|
|
sc = SparkContext._active_spark_context
|
|
|
|
return Column(sc._jvm.functions.size(_to_java_column(col)))
|
|
|
|
|
|
|
|
|
2018-04-17 04:55:35 -04:00
|
|
|
@since(2.4)
|
|
|
|
def array_min(col):
|
|
|
|
"""
|
|
|
|
Collection function: returns the minimum value of the array.
|
|
|
|
|
|
|
|
:param col: name of column or expression
|
|
|
|
|
|
|
|
>>> df = spark.createDataFrame([([2, 1, 3],), ([None, 10, -1],)], ['data'])
|
|
|
|
>>> df.select(array_min(df.data).alias('min')).collect()
|
|
|
|
[Row(min=1), Row(min=-1)]
|
|
|
|
"""
|
|
|
|
sc = SparkContext._active_spark_context
|
|
|
|
return Column(sc._jvm.functions.array_min(_to_java_column(col)))
|
|
|
|
|
|
|
|
|
2018-04-16 00:45:55 -04:00
|
|
|
@since(2.4)
|
|
|
|
def array_max(col):
|
|
|
|
"""
|
|
|
|
Collection function: returns the maximum value of the array.
|
|
|
|
|
|
|
|
:param col: name of column or expression
|
|
|
|
|
|
|
|
>>> df = spark.createDataFrame([([2, 1, 3],), ([None, 10, -1],)], ['data'])
|
|
|
|
>>> df.select(array_max(df.data).alias('max')).collect()
|
|
|
|
[Row(max=3), Row(max=10)]
|
|
|
|
"""
|
|
|
|
sc = SparkContext._active_spark_context
|
|
|
|
return Column(sc._jvm.functions.array_max(_to_java_column(col)))
|
|
|
|
|
|
|
|
|
2015-08-01 11:32:29 -04:00
|
|
|
@since(1.5)
|
|
|
|
def sort_array(col, asc=True):
|
|
|
|
"""
|
2016-11-06 00:47:33 -04:00
|
|
|
Collection function: sorts the input array in ascending or descending order according
|
|
|
|
to the natural ordering of the array elements.
|
2015-08-01 11:32:29 -04:00
|
|
|
|
|
|
|
:param col: name of column or expression
|
|
|
|
|
2016-05-23 21:14:48 -04:00
|
|
|
>>> df = spark.createDataFrame([([2, 1, 3],),([1],),([],)], ['data'])
|
2015-08-01 11:32:29 -04:00
|
|
|
>>> df.select(sort_array(df.data).alias('r')).collect()
|
|
|
|
[Row(r=[1, 2, 3]), Row(r=[1]), Row(r=[])]
|
|
|
|
>>> df.select(sort_array(df.data, asc=False).alias('r')).collect()
|
|
|
|
[Row(r=[3, 2, 1]), Row(r=[1]), Row(r=[])]
|
2018-04-17 04:55:35 -04:00
|
|
|
"""
|
2015-08-01 11:32:29 -04:00
|
|
|
sc = SparkContext._active_spark_context
|
|
|
|
return Column(sc._jvm.functions.sort_array(_to_java_column(col), asc))
|
|
|
|
|
|
|
|
|
[SPARK-23926][SQL] Extending reverse function to support ArrayType arguments
## What changes were proposed in this pull request?
This PR extends `reverse` functions to be able to operate over array columns and covers:
- Introduction of `Reverse` expression that represents logic for reversing arrays and also strings
- Removal of `StringReverse` expression
- A wrapper for PySpark
## How was this patch tested?
New tests added into:
- CollectionExpressionsSuite
- DataFrameFunctionsSuite
## Codegen examples
### Primitive type
```
val df = Seq(
Seq(1, 3, 4, 2),
null
).toDF("i")
df.filter($"i".isNotNull || $"i".isNull).select(reverse($"i")).debugCodegen
```
Result:
```
/* 032 */ boolean inputadapter_isNull = inputadapter_row.isNullAt(0);
/* 033 */ ArrayData inputadapter_value = inputadapter_isNull ?
/* 034 */ null : (inputadapter_row.getArray(0));
/* 035 */
/* 036 */ boolean filter_value = true;
/* 037 */
/* 038 */ if (!(!inputadapter_isNull)) {
/* 039 */ filter_value = inputadapter_isNull;
/* 040 */ }
/* 041 */ if (!filter_value) continue;
/* 042 */
/* 043 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[0] /* numOutputRows */).add(1);
/* 044 */
/* 045 */ boolean project_isNull = inputadapter_isNull;
/* 046 */ ArrayData project_value = null;
/* 047 */
/* 048 */ if (!inputadapter_isNull) {
/* 049 */ final int project_length = inputadapter_value.numElements();
/* 050 */ project_value = inputadapter_value.copy();
/* 051 */ for(int k = 0; k < project_length / 2; k++) {
/* 052 */ int l = project_length - k - 1;
/* 053 */ boolean isNullAtK = project_value.isNullAt(k);
/* 054 */ boolean isNullAtL = project_value.isNullAt(l);
/* 055 */ if(!isNullAtK) {
/* 056 */ int el = project_value.getInt(k);
/* 057 */ if(!isNullAtL) {
/* 058 */ project_value.setInt(k, project_value.getInt(l));
/* 059 */ } else {
/* 060 */ project_value.setNullAt(k);
/* 061 */ }
/* 062 */ project_value.setInt(l, el);
/* 063 */ } else if (!isNullAtL) {
/* 064 */ project_value.setInt(k, project_value.getInt(l));
/* 065 */ project_value.setNullAt(l);
/* 066 */ }
/* 067 */ }
/* 068 */
/* 069 */ }
```
### Non-primitive type
```
val df = Seq(
Seq("a", "c", "d", "b"),
null
).toDF("s")
df.filter($"s".isNotNull || $"s".isNull).select(reverse($"s")).debugCodegen
```
Result:
```
/* 032 */ boolean inputadapter_isNull = inputadapter_row.isNullAt(0);
/* 033 */ ArrayData inputadapter_value = inputadapter_isNull ?
/* 034 */ null : (inputadapter_row.getArray(0));
/* 035 */
/* 036 */ boolean filter_value = true;
/* 037 */
/* 038 */ if (!(!inputadapter_isNull)) {
/* 039 */ filter_value = inputadapter_isNull;
/* 040 */ }
/* 041 */ if (!filter_value) continue;
/* 042 */
/* 043 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[0] /* numOutputRows */).add(1);
/* 044 */
/* 045 */ boolean project_isNull = inputadapter_isNull;
/* 046 */ ArrayData project_value = null;
/* 047 */
/* 048 */ if (!inputadapter_isNull) {
/* 049 */ final int project_length = inputadapter_value.numElements();
/* 050 */ project_value = new org.apache.spark.sql.catalyst.util.GenericArrayData(new Object[project_length]);
/* 051 */ for(int k = 0; k < project_length; k++) {
/* 052 */ int l = project_length - k - 1;
/* 053 */ project_value.update(k, inputadapter_value.getUTF8String(l));
/* 054 */ }
/* 055 */
/* 056 */ }
```
Author: mn-mikke <mrkAha12346github>
Closes #21034 from mn-mikke/feature/array-api-reverse-to-master.
2018-04-18 05:41:55 -04:00
|
|
|
@since(1.5)
|
|
|
|
@ignore_unicode_prefix
|
|
|
|
def reverse(col):
|
|
|
|
"""
|
|
|
|
Collection function: returns a reversed string or an array with its elements in reverse order.
|
|
|
|
|
|
|
|
:param col: name of column or expression
|
|
|
|
|
|
|
|
>>> df = spark.createDataFrame([('Spark SQL',)], ['data'])
|
|
|
|
>>> df.select(reverse(df.data).alias('s')).collect()
|
|
|
|
[Row(s=u'LQS krapS')]
|
|
|
|
>>> df = spark.createDataFrame([([2, 1, 3],) ,([1],) ,([],)], ['data'])
|
|
|
|
>>> df.select(reverse(df.data).alias('r')).collect()
|
|
|
|
[Row(r=[3, 1, 2]), Row(r=[1]), Row(r=[])]
|
|
|
|
"""
|
|
|
|
sc = SparkContext._active_spark_context
|
|
|
|
return Column(sc._jvm.functions.reverse(_to_java_column(col)))
|
|
|
|
|
|
|
|
|
[SPARK-23821][SQL] Collection function: flatten
## What changes were proposed in this pull request?
This PR adds a new collection function that transforms an array of arrays into a single array. The PR comprises:
- An expression for flattening array structure
- Flatten function
- A wrapper for PySpark
## How was this patch tested?
New tests added into:
- CollectionExpressionsSuite
- DataFrameFunctionsSuite
## Codegen examples
### Primitive type
```
val df = Seq(
Seq(Seq(1, 2), Seq(4, 5)),
Seq(null, Seq(1))
).toDF("i")
df.filter($"i".isNotNull || $"i".isNull).select(flatten($"i")).debugCodegen
```
Result:
```
/* 033 */ boolean inputadapter_isNull = inputadapter_row.isNullAt(0);
/* 034 */ ArrayData inputadapter_value = inputadapter_isNull ?
/* 035 */ null : (inputadapter_row.getArray(0));
/* 036 */
/* 037 */ boolean filter_value = true;
/* 038 */
/* 039 */ if (!(!inputadapter_isNull)) {
/* 040 */ filter_value = inputadapter_isNull;
/* 041 */ }
/* 042 */ if (!filter_value) continue;
/* 043 */
/* 044 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[0] /* numOutputRows */).add(1);
/* 045 */
/* 046 */ boolean project_isNull = inputadapter_isNull;
/* 047 */ ArrayData project_value = null;
/* 048 */
/* 049 */ if (!inputadapter_isNull) {
/* 050 */ for (int z = 0; !project_isNull && z < inputadapter_value.numElements(); z++) {
/* 051 */ project_isNull |= inputadapter_value.isNullAt(z);
/* 052 */ }
/* 053 */ if (!project_isNull) {
/* 054 */ long project_numElements = 0;
/* 055 */ for (int z = 0; z < inputadapter_value.numElements(); z++) {
/* 056 */ project_numElements += inputadapter_value.getArray(z).numElements();
/* 057 */ }
/* 058 */ if (project_numElements > 2147483632) {
/* 059 */ throw new RuntimeException("Unsuccessful try to flatten an array of arrays with " +
/* 060 */ project_numElements + " elements due to exceeding the array size limit 2147483632.");
/* 061 */ }
/* 062 */
/* 063 */ long project_size = UnsafeArrayData.calculateSizeOfUnderlyingByteArray(
/* 064 */ project_numElements,
/* 065 */ 4);
/* 066 */ if (project_size > 2147483632) {
/* 067 */ throw new RuntimeException("Unsuccessful try to flatten an array of arrays with " +
/* 068 */ project_size + " bytes of data due to exceeding the limit 2147483632" +
/* 069 */ " bytes for UnsafeArrayData.");
/* 070 */ }
/* 071 */
/* 072 */ byte[] project_array = new byte[(int)project_size];
/* 073 */ UnsafeArrayData project_tempArrayData = new UnsafeArrayData();
/* 074 */ Platform.putLong(project_array, 16, project_numElements);
/* 075 */ project_tempArrayData.pointTo(project_array, 16, (int)project_size);
/* 076 */ int project_counter = 0;
/* 077 */ for (int k = 0; k < inputadapter_value.numElements(); k++) {
/* 078 */ ArrayData arr = inputadapter_value.getArray(k);
/* 079 */ for (int l = 0; l < arr.numElements(); l++) {
/* 080 */ if (arr.isNullAt(l)) {
/* 081 */ project_tempArrayData.setNullAt(project_counter);
/* 082 */ } else {
/* 083 */ project_tempArrayData.setInt(
/* 084 */ project_counter,
/* 085 */ arr.getInt(l)
/* 086 */ );
/* 087 */ }
/* 088 */ project_counter++;
/* 089 */ }
/* 090 */ }
/* 091 */ project_value = project_tempArrayData;
/* 092 */
/* 093 */ }
/* 094 */
/* 095 */ }
```
### Non-primitive type
```
val df = Seq(
Seq(Seq("a", "b"), Seq(null, "d")),
Seq(null, Seq("a"))
).toDF("s")
df.filter($"s".isNotNull || $"s".isNull).select(flatten($"s")).debugCodegen
```
Result:
```
/* 033 */ boolean inputadapter_isNull = inputadapter_row.isNullAt(0);
/* 034 */ ArrayData inputadapter_value = inputadapter_isNull ?
/* 035 */ null : (inputadapter_row.getArray(0));
/* 036 */
/* 037 */ boolean filter_value = true;
/* 038 */
/* 039 */ if (!(!inputadapter_isNull)) {
/* 040 */ filter_value = inputadapter_isNull;
/* 041 */ }
/* 042 */ if (!filter_value) continue;
/* 043 */
/* 044 */ ((org.apache.spark.sql.execution.metric.SQLMetric) references[0] /* numOutputRows */).add(1);
/* 045 */
/* 046 */ boolean project_isNull = inputadapter_isNull;
/* 047 */ ArrayData project_value = null;
/* 048 */
/* 049 */ if (!inputadapter_isNull) {
/* 050 */ for (int z = 0; !project_isNull && z < inputadapter_value.numElements(); z++) {
/* 051 */ project_isNull |= inputadapter_value.isNullAt(z);
/* 052 */ }
/* 053 */ if (!project_isNull) {
/* 054 */ long project_numElements = 0;
/* 055 */ for (int z = 0; z < inputadapter_value.numElements(); z++) {
/* 056 */ project_numElements += inputadapter_value.getArray(z).numElements();
/* 057 */ }
/* 058 */ if (project_numElements > 2147483632) {
/* 059 */ throw new RuntimeException("Unsuccessful try to flatten an array of arrays with " +
/* 060 */ project_numElements + " elements due to exceeding the array size limit 2147483632.");
/* 061 */ }
/* 062 */
/* 063 */ Object[] project_arrayObject = new Object[(int)project_numElements];
/* 064 */ int project_counter = 0;
/* 065 */ for (int k = 0; k < inputadapter_value.numElements(); k++) {
/* 066 */ ArrayData arr = inputadapter_value.getArray(k);
/* 067 */ for (int l = 0; l < arr.numElements(); l++) {
/* 068 */ project_arrayObject[project_counter] = arr.getUTF8String(l);
/* 069 */ project_counter++;
/* 070 */ }
/* 071 */ }
/* 072 */ project_value = new org.apache.spark.sql.catalyst.util.GenericArrayData(project_arrayObject);
/* 073 */
/* 074 */ }
/* 075 */
/* 076 */ }
```
Author: mn-mikke <mrkAha12346github>
Closes #20938 from mn-mikke/feature/array-api-flatten-to-master.
2018-04-24 22:19:08 -04:00
|
|
|
@since(2.4)
|
|
|
|
def flatten(col):
|
|
|
|
"""
|
|
|
|
Collection function: creates a single array from an array of arrays.
|
|
|
|
If a structure of nested arrays is deeper than two levels,
|
|
|
|
only one level of nesting is removed.
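A small sketch of that behaviour (the deeper-nesting example below is hypothetical and not part of the doctests):
```python
# With three levels of nesting, only the outermost level is removed:
# [[[1, 2], [3]]] flattens to [[1, 2], [3]], not to [1, 2, 3].
df3 = spark.createDataFrame([([[[1, 2], [3]]],)], ['data'])
df3.select(flatten(df3.data).alias('r')).collect()
# expected: [Row(r=[[1, 2], [3]])]
```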
|
|
|
|
|
|
|
|
:param col: name of column or expression
|
|
|
|
|
|
|
|
>>> df = spark.createDataFrame([([[1, 2, 3], [4, 5], [6]],), ([None, [4, 5]],)], ['data'])
|
|
|
|
>>> df.select(flatten(df.data).alias('r')).collect()
|
|
|
|
[Row(r=[1, 2, 3, 4, 5, 6]), Row(r=None)]
|
|
|
|
"""
|
|
|
|
sc = SparkContext._active_spark_context
|
|
|
|
return Column(sc._jvm.functions.flatten(_to_java_column(col)))
|
|
|
|
|
|
|
|
|
2017-06-19 14:40:07 -04:00
|
|
|
@since(2.3)
|
|
|
|
def map_keys(col):
|
|
|
|
"""
|
|
|
|
Collection function: Returns an unordered array containing the keys of the map.
|
|
|
|
|
|
|
|
:param col: name of column or expression
|
|
|
|
|
|
|
|
>>> from pyspark.sql.functions import map_keys
|
|
|
|
>>> df = spark.sql("SELECT map(1, 'a', 2, 'b') as data")
|
|
|
|
>>> df.select(map_keys("data").alias("keys")).show()
|
|
|
|
+------+
|
|
|
|
| keys|
|
|
|
|
+------+
|
|
|
|
|[1, 2]|
|
|
|
|
+------+
|
|
|
|
"""
|
|
|
|
sc = SparkContext._active_spark_context
|
|
|
|
return Column(sc._jvm.functions.map_keys(_to_java_column(col)))
|
|
|
|
|
|
|
|
|
|
|
|
@since(2.3)
|
|
|
|
def map_values(col):
|
|
|
|
"""
|
|
|
|
Collection function: Returns an unordered array containing the values of the map.
|
|
|
|
|
|
|
|
:param col: name of column or expression
|
|
|
|
|
|
|
|
>>> from pyspark.sql.functions import map_values
|
|
|
|
>>> df = spark.sql("SELECT map(1, 'a', 2, 'b') as data")
|
|
|
|
>>> df.select(map_values("data").alias("values")).show()
|
|
|
|
+------+
|
|
|
|
|values|
|
|
|
|
+------+
|
|
|
|
|[a, b]|
|
|
|
|
+------+
|
|
|
|
"""
|
|
|
|
sc = SparkContext._active_spark_context
|
|
|
|
return Column(sc._jvm.functions.map_values(_to_java_column(col)))
|
|
|
|
|
|
|
|
|
2015-08-04 22:25:24 -04:00
|
|
|
# ---------------------------- User Defined Function ----------------------------------
|
2015-07-31 19:05:26 -04:00
|
|
|
|
2017-11-17 10:43:08 -05:00
|
|
|
class PandasUDFType(object):
|
|
|
|
"""Pandas UDF Types. See :meth:`pyspark.sql.functions.pandas_udf`.
|
|
|
|
"""
|
2018-01-30 07:55:55 -05:00
|
|
|
SCALAR = PythonEvalType.SQL_SCALAR_PANDAS_UDF
|
2017-11-17 10:43:08 -05:00
|
|
|
|
2018-01-30 07:55:55 -05:00
|
|
|
GROUPED_MAP = PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF
|
2017-09-22 04:17:41 -04:00
|
|
|
|
2018-01-30 07:55:55 -05:00
|
|
|
GROUPED_AGG = PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF
|
2018-01-23 00:11:30 -05:00
|
|
|
|
2017-09-22 04:17:41 -04:00
|
|
|
|
2015-05-21 02:05:54 -04:00
|
|
|
@since(1.3)
|
2017-02-15 13:16:34 -05:00
|
|
|
def udf(f=None, returnType=StringType()):
|
2017-10-10 18:32:01 -04:00
|
|
|
"""Creates a user defined function (UDF).
|
2016-11-22 06:40:18 -05:00
|
|
|
|
2017-12-26 09:39:40 -05:00
|
|
|
.. note:: The user-defined functions are considered deterministic by default. Due to
|
|
|
|
optimization, duplicate invocations may be eliminated or the function may even be invoked
|
|
|
|
more times than it is present in the query. If your function is not deterministic, call
|
|
|
|
`asNondeterministic` on the user defined function. E.g.:
|
|
|
|
|
|
|
|
>>> from pyspark.sql.types import IntegerType
|
|
|
|
>>> import random
|
|
|
|
>>> random_udf = udf(lambda: int(random.random() * 100), IntegerType()).asNondeterministic()
|
2015-02-14 02:03:22 -05:00
|
|
|
|
2018-01-18 00:51:05 -05:00
|
|
|
.. note:: The user-defined functions do not support conditional expressions or short circuiting
|
2017-11-21 03:36:37 -05:00
|
|
|
in boolean expressions, so they end up being evaluated internally for all rows. If the functions
|
|
|
|
can fail on special rows, the workaround is to incorporate the condition into the functions.
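As an illustration of that workaround (a minimal sketch; the function and column names are hypothetical):
```python
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

# Instead of guarding the call with when(df.b != 0, ...), the check is
# incorporated into the function itself, so it is safe on every row.
@udf(returnType=DoubleType())
def safe_div(a, b):
    if a is None or b is None or b == 0:
        return None
    return float(a) / float(b)

# df.select(safe_div(df.a, df.b))  # usage, assuming numeric columns `a` and `b`
```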
|
2017-11-01 08:09:35 -04:00
|
|
|
|
[SPARK-23645][MINOR][DOCS][PYTHON] Add docs RE `pandas_udf` with keyword args
## What changes were proposed in this pull request?
Add documentation about the limitations of `pandas_udf` with keyword arguments and related concepts, like `functools.partial` fn objects.
NOTE: intermediate commits on this PR show some of the steps that can be taken to fix some (but not all) of these pain points.
### Survey of problems we face today:
(Initialize) Note: python 3.6 and spark 2.4snapshot.
```
from pyspark.sql import SparkSession
import inspect, functools
from pyspark.sql.functions import pandas_udf, PandasUDFType, col, lit, udf
spark = SparkSession.builder.getOrCreate()
print(spark.version)
df = spark.range(1,6).withColumn('b', col('id') * 2)
def ok(a,b): return a+b
```
Using a keyword argument at the call site `b=...` (and yes, *full* stack trace below, haha):
```
---> 14 df.withColumn('ok', pandas_udf(f=ok, returnType='bigint')('id', b='id')).show() # no kwargs
TypeError: wrapper() got an unexpected keyword argument 'b'
```
Using partial with a keyword argument where the kw-arg is the first argument of the fn:
*(Aside: kind of interesting that lines 15,16 work great and then 17 explodes)*
```
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-9-e9f31b8799c1> in <module>()
15 df.withColumn('ok', pandas_udf(f=functools.partial(ok, 7), returnType='bigint')('id')).show()
16 df.withColumn('ok', pandas_udf(f=functools.partial(ok, b=7), returnType='bigint')('id')).show()
---> 17 df.withColumn('ok', pandas_udf(f=functools.partial(ok, a=7), returnType='bigint')('id')).show()
/Users/stu/ZZ/spark/python/pyspark/sql/functions.py in pandas_udf(f, returnType, functionType)
2378 return functools.partial(_create_udf, returnType=return_type, evalType=eval_type)
2379 else:
-> 2380 return _create_udf(f=f, returnType=return_type, evalType=eval_type)
2381
2382
/Users/stu/ZZ/spark/python/pyspark/sql/udf.py in _create_udf(f, returnType, evalType)
54 argspec.varargs is None:
55 raise ValueError(
---> 56 "Invalid function: 0-arg pandas_udfs are not supported. "
57 "Instead, create a 1-arg pandas_udf and ignore the arg in your function."
58 )
ValueError: Invalid function: 0-arg pandas_udfs are not supported. Instead, create a 1-arg pandas_udf and ignore the arg in your function.
```
Author: Michael (Stu) Stewart <mstewart141@gmail.com>
Closes #20900 from mstewart141/udfkw2.
2018-03-25 23:45:45 -04:00
|
|
|
.. note:: The user-defined functions do not take keyword arguments on the calling side.
|
|
|
|
|
2017-02-15 13:16:34 -05:00
|
|
|
:param f: python function if used as a standalone function
|
2018-01-18 08:33:04 -05:00
|
|
|
:param returnType: the return type of the user-defined function. The value can be either a
|
|
|
|
:class:`pyspark.sql.types.DataType` object or a DDL-formatted type string.
|
2016-07-28 17:57:15 -04:00
|
|
|
|
2015-02-17 13:22:48 -05:00
|
|
|
>>> from pyspark.sql.types import IntegerType
|
2015-02-14 02:03:22 -05:00
|
|
|
>>> slen = udf(lambda s: len(s), IntegerType())
|
2017-02-15 13:16:34 -05:00
|
|
|
>>> @udf
|
|
|
|
... def to_upper(s):
|
|
|
|
... if s is not None:
|
|
|
|
... return s.upper()
|
|
|
|
...
|
|
|
|
>>> @udf(returnType=IntegerType())
|
|
|
|
... def add_one(x):
|
|
|
|
... if x is not None:
|
|
|
|
... return x + 1
|
|
|
|
...
|
|
|
|
>>> df = spark.createDataFrame([(1, "John Doe", 21)], ("id", "name", "age"))
|
|
|
|
>>> df.select(slen("name").alias("slen(name)"), to_upper("name"), add_one("age")).show()
|
|
|
|
+----------+--------------+------------+
|
|
|
|
|slen(name)|to_upper(name)|add_one(age)|
|
|
|
|
+----------+--------------+------------+
|
|
|
|
| 8| JOHN DOE| 22|
|
|
|
|
+----------+--------------+------------+
|
|
|
|
"""
|
2017-11-17 10:43:08 -05:00
|
|
|
# decorator @udf, @udf(), @udf(dataType())
|
|
|
|
if f is None or isinstance(f, (str, DataType)):
|
|
|
|
# If DataType has been passed as a positional argument
|
|
|
|
# for decorator use it as a returnType
|
|
|
|
return_type = f or returnType
|
|
|
|
return functools.partial(_create_udf, returnType=return_type,
|
|
|
|
evalType=PythonEvalType.SQL_BATCHED_UDF)
|
|
|
|
else:
|
|
|
|
return _create_udf(f=f, returnType=returnType,
|
|
|
|
evalType=PythonEvalType.SQL_BATCHED_UDF)
|
2017-02-15 13:16:34 -05:00
|
|
|
|
2017-09-22 04:17:41 -04:00
|
|
|
|
|
|
|
@since(2.3)
|
2017-11-17 10:43:08 -05:00
|
|
|
def pandas_udf(f=None, returnType=None, functionType=None):
|
2017-09-22 04:17:41 -04:00
|
|
|
"""
|
2017-10-10 18:32:01 -04:00
|
|
|
Creates a vectorized user defined function (UDF).
|
2017-09-22 04:17:41 -04:00
|
|
|
|
2017-10-10 18:32:01 -04:00
|
|
|
:param f: user-defined function. A python function if used as a standalone function
|
2018-01-18 08:33:04 -05:00
|
|
|
:param returnType: the return type of the user-defined function. The value can be either a
|
|
|
|
:class:`pyspark.sql.types.DataType` object or a DDL-formatted type string.
|
2017-11-17 10:43:08 -05:00
|
|
|
:param functionType: an enum value in :class:`pyspark.sql.functions.PandasUDFType`.
|
|
|
|
Default: SCALAR.
|
2017-09-22 04:17:41 -04:00
|
|
|
|
2017-11-17 10:43:08 -05:00
|
|
|
The function type of the UDF can be one of the following:
|
2017-10-10 18:32:01 -04:00
|
|
|
|
2017-11-17 10:43:08 -05:00
|
|
|
1. SCALAR
|
2017-10-10 18:32:01 -04:00
|
|
|
|
2017-11-17 10:43:08 -05:00
|
|
|
A scalar UDF defines a transformation: One or more `pandas.Series` -> A `pandas.Series`.
|
2018-01-23 00:11:30 -05:00
|
|
|
The returnType should be a primitive data type, e.g., :class:`DoubleType`.
|
2017-10-10 18:32:01 -04:00
|
|
|
The length of the returned `pandas.Series` must be the same as that of the input `pandas.Series`.
|
|
|
|
|
2017-11-17 10:43:08 -05:00
|
|
|
Scalar UDFs are used with :meth:`pyspark.sql.DataFrame.withColumn` and
|
|
|
|
:meth:`pyspark.sql.DataFrame.select`.
|
|
|
|
|
|
|
|
>>> from pyspark.sql.functions import pandas_udf, PandasUDFType
|
2017-10-10 18:32:01 -04:00
|
|
|
>>> from pyspark.sql.types import IntegerType, StringType
|
2017-12-21 06:43:56 -05:00
|
|
|
>>> slen = pandas_udf(lambda s: s.str.len(), IntegerType()) # doctest: +SKIP
|
|
|
|
>>> @pandas_udf(StringType()) # doctest: +SKIP
|
2017-10-10 18:32:01 -04:00
|
|
|
... def to_upper(s):
|
|
|
|
... return s.str.upper()
|
|
|
|
...
|
2017-12-21 06:43:56 -05:00
|
|
|
>>> @pandas_udf("integer", PandasUDFType.SCALAR) # doctest: +SKIP
|
2017-10-10 18:32:01 -04:00
|
|
|
... def add_one(x):
|
|
|
|
... return x + 1
|
|
|
|
...
|
2017-12-21 06:43:56 -05:00
|
|
|
>>> df = spark.createDataFrame([(1, "John Doe", 21)],
|
|
|
|
... ("id", "name", "age")) # doctest: +SKIP
|
2017-10-10 18:32:01 -04:00
|
|
|
>>> df.select(slen("name").alias("slen(name)"), to_upper("name"), add_one("age")) \\
|
|
|
|
... .show() # doctest: +SKIP
|
|
|
|
+----------+--------------+------------+
|
|
|
|
|slen(name)|to_upper(name)|add_one(age)|
|
|
|
|
+----------+--------------+------------+
|
|
|
|
| 8| JOHN DOE| 22|
|
|
|
|
+----------+--------------+------------+
|
|
|
|
|
[SPARK-22980][PYTHON][SQL] Clarify the length of each series is of each batch within scalar Pandas UDF
## What changes were proposed in this pull request?
This PR proposes to add a note that saying the length of a scalar Pandas UDF's `Series` is not of the whole input column but of the batch.
We are fine for a group map UDF because the usage is different from our typical UDF but scalar UDFs might cause confusion with the normal UDF.
For example, please consider this example:
```python
from pyspark.sql.functions import pandas_udf, col, lit
df = spark.range(1)
f = pandas_udf(lambda x, y: len(x) + y, LongType())
df.select(f(lit('text'), col('id'))).show()
```
```
+------------------+
|<lambda>(text, id)|
+------------------+
| 1|
+------------------+
```
```python
from pyspark.sql.functions import udf, col, lit
df = spark.range(1)
f = udf(lambda x, y: len(x) + y, "long")
df.select(f(lit('text'), col('id'))).show()
```
```
+------------------+
|<lambda>(text, id)|
+------------------+
| 4|
+------------------+
```
## How was this patch tested?
Manually built the doc and checked the output.
Author: hyukjinkwon <gurwls223@gmail.com>
Closes #20237 from HyukjinKwon/SPARK-22980.
2018-01-13 02:13:44 -05:00
|
|
|
.. note:: The length of `pandas.Series` within a scalar UDF is not that of the whole input
|
|
|
|
column, but is the length of an internal batch used for each call to the function.
|
|
|
|
Therefore, this can be used, for example, to ensure the length of each returned
|
|
|
|
`pandas.Series`, but it cannot be used as the length of the whole column.
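A minimal sketch of the point above (the column names and resulting values are illustrative only, since they depend on the internal batch size):
```python
import pandas as pd
from pyspark.sql.functions import pandas_udf, col, lit
from pyspark.sql.types import LongType

# len(x) is the size of one internal batch, not of the whole column, so it is
# only reliable for sizing the returned pandas.Series.
@pandas_udf(LongType())
def batch_len_plus(x, y):
    return pd.Series([len(x)] * len(x)) + y

# spark.range(4).select(batch_len_plus(lit('text'), col('id')))  # values depend on batching
```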
|
|
|
|
|
2018-01-30 07:55:55 -05:00
|
|
|
2. GROUPED_MAP
|
2017-10-10 18:32:01 -04:00
|
|
|
|
2018-01-30 07:55:55 -05:00
|
|
|
A grouped map UDF defines a transformation: A `pandas.DataFrame` -> A `pandas.DataFrame`
|
2017-10-10 18:32:01 -04:00
|
|
|
The returnType should be a :class:`StructType` describing the schema of the returned
|
|
|
|
`pandas.DataFrame`.
|
2017-11-17 10:43:08 -05:00
|
|
|
The length of the returned `pandas.DataFrame` can be arbitrary.
|
|
|
|
|
2018-01-30 07:55:55 -05:00
|
|
|
Grouped map UDFs are used with :meth:`pyspark.sql.GroupedData.apply`.
|
2017-10-10 18:32:01 -04:00
|
|
|
|
2017-11-17 10:43:08 -05:00
|
|
|
>>> from pyspark.sql.functions import pandas_udf, PandasUDFType
|
2017-10-10 18:32:01 -04:00
|
|
|
>>> df = spark.createDataFrame(
|
|
|
|
... [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)],
|
2017-12-21 06:43:56 -05:00
|
|
|
... ("id", "v")) # doctest: +SKIP
|
2018-01-30 07:55:55 -05:00
|
|
|
>>> @pandas_udf("id long, v double", PandasUDFType.GROUPED_MAP) # doctest: +SKIP
|
2017-10-10 18:32:01 -04:00
|
|
|
... def normalize(pdf):
|
|
|
|
... v = pdf.v
|
|
|
|
... return pdf.assign(v=(v - v.mean()) / v.std())
|
2017-11-17 10:43:08 -05:00
|
|
|
>>> df.groupby("id").apply(normalize).show() # doctest: +SKIP
|
2017-10-10 18:32:01 -04:00
|
|
|
+---+-------------------+
|
|
|
|
| id| v|
|
|
|
|
+---+-------------------+
|
|
|
|
| 1|-0.7071067811865475|
|
|
|
|
| 1| 0.7071067811865475|
|
|
|
|
| 2|-0.8320502943378437|
|
|
|
|
| 2|-0.2773500981126146|
|
|
|
|
| 2| 1.1094003924504583|
|
|
|
|
+---+-------------------+
|
|
|
|
|
2018-03-08 06:29:07 -05:00
|
|
|
Alternatively, the user can define a function that takes two arguments.
|
|
|
|
In this case, the grouping key will be passed as the first argument and the data will
|
|
|
|
be passed as the second argument. The grouping key will be passed as a tuple of numpy
|
|
|
|
data types, e.g., `numpy.int32` and `numpy.float64`. The data will still be passed in
|
|
|
|
as a `pandas.DataFrame` containing all columns from the original Spark DataFrame.
|
|
|
|
This is useful when the user does not want to hardcode grouping key in the function.
|
|
|
|
|
|
|
|
>>> from pyspark.sql.functions import pandas_udf, PandasUDFType
|
|
|
|
>>> import pandas as pd # doctest: +SKIP
|
|
|
|
>>> df = spark.createDataFrame(
|
|
|
|
... [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)],
|
|
|
|
... ("id", "v")) # doctest: +SKIP
|
|
|
|
>>> @pandas_udf("id long, v double", PandasUDFType.GROUPED_MAP) # doctest: +SKIP
|
|
|
|
... def mean_udf(key, pdf):
|
|
|
|
... # key is a tuple of one numpy.int64, which is the value
|
|
|
|
... # of 'id' for the current group
|
|
|
|
... return pd.DataFrame([key + (pdf.v.mean(),)])
|
|
|
|
>>> df.groupby('id').apply(mean_udf).show() # doctest: +SKIP
|
|
|
|
+---+---+
|
|
|
|
| id| v|
|
|
|
|
+---+---+
|
|
|
|
| 1|1.5|
|
|
|
|
| 2|6.0|
|
|
|
|
+---+---+
|
|
|
|
|
2017-10-10 18:32:01 -04:00
|
|
|
.. seealso:: :meth:`pyspark.sql.GroupedData.apply`
|
|
|
|
|
2018-01-30 07:55:55 -05:00
|
|
|
3. GROUPED_AGG
|
2018-01-23 00:11:30 -05:00
|
|
|
|
2018-01-30 07:55:55 -05:00
|
|
|
A grouped aggregate UDF defines a transformation: One or more `pandas.Series` -> A scalar
|
2018-01-23 00:11:30 -05:00
|
|
|
The `returnType` should be a primitive data type, e.g., :class:`DoubleType`.
|
|
|
|
The returned scalar can be either a python primitive type, e.g., `int` or `float`
|
|
|
|
or a numpy data type, e.g., `numpy.int64` or `numpy.float64`.
|
|
|
|
|
|
|
|
:class:`ArrayType`, :class:`MapType` and :class:`StructType` are currently not supported as
|
|
|
|
output types.
|
|
|
|
|
|
|
|
Grouped aggregate UDFs are used with :meth:`pyspark.sql.GroupedData.agg`.
|
|
|
|
|
|
|
|
>>> from pyspark.sql.functions import pandas_udf, PandasUDFType
|
|
|
|
>>> df = spark.createDataFrame(
|
|
|
|
... [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)],
|
|
|
|
... ("id", "v"))
|
2018-01-30 07:55:55 -05:00
|
|
|
>>> @pandas_udf("double", PandasUDFType.GROUPED_AGG) # doctest: +SKIP
|
2018-01-23 00:11:30 -05:00
|
|
|
... def mean_udf(v):
|
|
|
|
... return v.mean()
|
|
|
|
>>> df.groupby("id").agg(mean_udf(df['v'])).show() # doctest: +SKIP
|
|
|
|
+---+-----------+
|
|
|
|
| id|mean_udf(v)|
|
|
|
|
+---+-----------+
|
|
|
|
| 1| 1.5|
|
|
|
|
| 2| 6.0|
|
|
|
|
+---+-----------+
|
|
|
|
|
|
|
|
.. seealso:: :meth:`pyspark.sql.GroupedData.agg`
|
|
|
|
|
2018-01-06 03:11:20 -05:00
|
|
|
.. note:: The user-defined functions are considered deterministic by default. Due to
|
|
|
|
optimization, duplicate invocations may be eliminated or the function may even be invoked
|
|
|
|
more times than it is present in the query. If your function is not deterministic, call
|
|
|
|
`asNondeterministic` on the user defined function. E.g.:
|
|
|
|
|
|
|
|
>>> @pandas_udf('double', PandasUDFType.SCALAR) # doctest: +SKIP
|
|
|
|
... def random(v):
|
|
|
|
... import numpy as np
|
|
|
|
... import pandas as pd
|
|
|
|
...     return pd.Series(np.random.randn(len(v)))
|
|
|
|
>>> random = random.asNondeterministic() # doctest: +SKIP
|
2017-11-01 08:09:35 -04:00
|
|
|
|
2018-01-18 00:51:05 -05:00
|
|
|
.. note:: The user-defined functions do not support conditional expressions or short circuiting
|
2017-11-21 03:36:37 -05:00
|
|
|
in boolean expressions, so they end up being evaluated internally for all rows. If the functions
|
|
|
|
can fail on special rows, the workaround is to incorporate the condition into the functions.
|
2018-03-25 23:45:45 -04:00
|
|
|
|
|
|
|
.. note:: The user-defined functions do not take keyword arguments on the calling side.
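A short sketch of that limitation (the column names are hypothetical): columns must be passed positionally when calling the UDF.
```python
from pyspark.sql.functions import pandas_udf

@pandas_udf("long")  # DDL-formatted return type
def add(a, b):
    return a + b

# df.select(add(df.x, df.y))      # OK: positional arguments
# df.select(add(a=df.x, b=df.y))  # fails: keyword arguments are not supported at the call site
```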
|
2017-09-22 04:17:41 -04:00
|
|
|
"""
|
2017-11-17 10:43:08 -05:00
|
|
|
# decorator @pandas_udf(returnType, functionType)
|
|
|
|
is_decorator = f is None or isinstance(f, (str, DataType))
|
|
|
|
|
|
|
|
if is_decorator:
|
|
|
|
# If DataType has been passed as a positional argument
|
|
|
|
# for decorator use it as a returnType
|
|
|
|
return_type = f or returnType
|
|
|
|
|
|
|
|
if functionType is not None:
|
|
|
|
# @pandas_udf(dataType, functionType=functionType)
|
|
|
|
# @pandas_udf(returnType=dataType, functionType=functionType)
|
|
|
|
eval_type = functionType
|
|
|
|
elif returnType is not None and isinstance(returnType, int):
|
|
|
|
# @pandas_udf(dataType, functionType)
|
|
|
|
eval_type = returnType
|
|
|
|
else:
|
|
|
|
# @pandas_udf(dataType) or @pandas_udf(returnType=dataType)
|
2018-01-30 07:55:55 -05:00
|
|
|
eval_type = PythonEvalType.SQL_SCALAR_PANDAS_UDF
|
2017-11-17 10:43:08 -05:00
|
|
|
else:
|
|
|
|
return_type = returnType
|
|
|
|
|
|
|
|
if functionType is not None:
|
|
|
|
eval_type = functionType
|
|
|
|
else:
|
2018-01-30 07:55:55 -05:00
|
|
|
eval_type = PythonEvalType.SQL_SCALAR_PANDAS_UDF
|
2017-11-17 10:43:08 -05:00
|
|
|
|
|
|
|
if return_type is None:
|
|
|
|
raise ValueError("Invalid returnType: returnType can not be None")
|
|
|
|
|
2018-01-30 07:55:55 -05:00
|
|
|
if eval_type not in [PythonEvalType.SQL_SCALAR_PANDAS_UDF,
|
|
|
|
PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF,
|
|
|
|
PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF]:
|
2017-11-17 10:43:08 -05:00
|
|
|
raise ValueError("Invalid functionType: "
|
|
|
|
"functionType must be one the values from PandasUDFType")
|
|
|
|
|
|
|
|
if is_decorator:
|
|
|
|
return functools.partial(_create_udf, returnType=return_type, evalType=eval_type)
|
|
|
|
else:
|
|
|
|
return _create_udf(f=f, returnType=return_type, evalType=eval_type)
|
2017-02-15 13:16:34 -05:00
|
|
|
|
2015-02-14 02:03:22 -05:00
|
|
|
|
2015-08-04 22:25:24 -04:00
|
|
|
blacklist = ['map', 'since', 'ignore_unicode_prefix']
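# Build __all__ from every public, lower-cased callable in this module's namespace,
# excluding the helpers blacklisted above.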
|
|
|
|
__all__ = [k for k, v in globals().items()
|
|
|
|
if not k.startswith('_') and k[0].islower() and callable(v) and k not in blacklist]
|
|
|
|
__all__.sort()
|
|
|
|
|
2015-02-14 02:03:22 -05:00
|
|
|
|
|
|
|
def _test():
|
|
|
|
import doctest
|
2016-05-23 21:14:48 -04:00
|
|
|
from pyspark.sql import Row, SparkSession
|
2015-02-17 13:22:48 -05:00
|
|
|
import pyspark.sql.functions
|
|
|
|
globs = pyspark.sql.functions.__dict__.copy()
|
2016-05-23 21:14:48 -04:00
|
|
|
spark = SparkSession.builder\
|
|
|
|
.master("local[4]")\
|
|
|
|
.appName("sql.functions tests")\
|
|
|
|
.getOrCreate()
|
|
|
|
sc = spark.sparkContext
|
2015-02-14 02:03:22 -05:00
|
|
|
globs['sc'] = sc
|
2016-05-23 21:14:48 -04:00
|
|
|
globs['spark'] = spark
|
2017-07-08 02:59:34 -04:00
|
|
|
globs['df'] = spark.createDataFrame([Row(name='Alice', age=2), Row(name='Bob', age=5)])
|
2015-02-14 02:03:22 -05:00
|
|
|
(failure_count, test_count) = doctest.testmod(
|
2015-02-17 13:22:48 -05:00
|
|
|
pyspark.sql.functions, globs=globs,
|
2015-02-14 02:03:22 -05:00
|
|
|
optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE)
|
2016-05-23 21:14:48 -04:00
|
|
|
spark.stop()
|
2015-02-14 02:03:22 -05:00
|
|
|
if failure_count:
|
2018-03-08 06:38:34 -05:00
|
|
|
sys.exit(-1)
|
2015-02-14 02:03:22 -05:00
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
|
|
|
|
_test()
|