spark-instrumented-optimizer/python/pyspark/sql/pandas/map_ops.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import sys

from pyspark import since
from pyspark.rdd import PythonEvalType


class PandasMapOpsMixin(object):
    """
    Min-in for pandas map operations. Currently, only :class:`DataFrame`
    can use this class.
    """

    @since(3.0)
    def mapInPandas(self, udf):
        """
        Maps an iterator of batches in the current :class:`DataFrame` using a Pandas user-defined
        function and returns the result as a :class:`DataFrame`.

        The user-defined function should take an iterator of `pandas.DataFrame`\\s and return
        another iterator of `pandas.DataFrame`\\s. All columns are passed
        together as an iterator of `pandas.DataFrame`\\s to the user-defined function and the
        returned iterator of `pandas.DataFrame`\\s are combined as a :class:`DataFrame`.
        Each `pandas.DataFrame` size can be controlled by
        `spark.sql.execution.arrow.maxRecordsPerBatch`.
        Its schema must match the returnType of the Pandas user-defined function.

        :param udf: A function object returned by :meth:`pyspark.sql.functions.pandas_udf`

        >>> from pyspark.sql.functions import pandas_udf, PandasUDFType
        >>> df = spark.createDataFrame([(1, 21), (2, 30)],
        ...                            ("id", "age"))  # doctest: +SKIP
        >>> @pandas_udf(df.schema, PandasUDFType.MAP_ITER)  # doctest: +SKIP
        ... def filter_func(batch_iter):
        ...     for pdf in batch_iter:
        ...         yield pdf[pdf.id == 1]
        >>> df.mapInPandas(filter_func).show()  # doctest: +SKIP
        +---+---+
        | id|age|
        +---+---+
        |  1| 21|
        +---+---+

        .. seealso:: :meth:`pyspark.sql.functions.pandas_udf`

        """
        from pyspark.sql import Column, DataFrame

        assert isinstance(self, DataFrame)

        # Columns are special because hasattr always return True
        if isinstance(udf, Column) or not hasattr(udf, 'func') \
                or udf.evalType != PythonEvalType.SQL_MAP_PANDAS_ITER_UDF:
            raise ValueError("Invalid udf: the udf argument must be a pandas_udf of type "
                             "MAP_ITER.")

        udf_column = udf(*[self[col] for col in self.columns])
        jdf = self._jdf.mapInPandas(udf_column._jc.expr())
        return DataFrame(jdf, self.sql_ctx)


def _test():
    import doctest
    from pyspark.sql import SparkSession
    import pyspark.sql.pandas.map_ops
    globs = pyspark.sql.pandas.map_ops.__dict__.copy()
    spark = SparkSession.builder\
        .master("local[4]")\
        .appName("sql.pandas.map_ops tests")\
        .getOrCreate()
    globs['spark'] = spark
    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.pandas.map_ops, globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF)
    spark.stop()
    if failure_count:
        sys.exit(-1)


if __name__ == "__main__":
    _test()
[SPARK-30434][PYTHON][SQL] Move pandas related functionalities into 'pandas' sub-package ### What changes were proposed in this pull request? This PR proposes to move pandas related functionalities into pandas package. Namely: ```bash pyspark/sql/pandas ├── __init__.py ├── conversion.py # Conversion between pandas <> PySpark DataFrames ├── functions.py # pandas_udf ├── group_ops.py # Grouped UDF / Cogrouped UDF + groupby.apply, groupby.cogroup.apply ├── map_ops.py # Map Iter UDF + mapInPandas ├── serializers.py # pandas <> PyArrow serializers ├── types.py # Type utils between pandas <> PyArrow └── utils.py # Version requirement checks ``` In order to separately locate `groupby.apply`, `groupby.cogroup.apply`, `mapInPandas`, `toPandas`, and `createDataFrame(pdf)` under `pandas` sub-package, I had to use a mix-in approach which Scala side uses often by `trait`, and also pandas itself uses this approach (see `IndexOpsMixin` as an example) to group related functionalities. Currently, you can think it's like Scala's self typed trait. See the structure below: ```python class PandasMapOpsMixin(object): def mapInPandas(self, ...): ... return ... # other Pandas <> PySpark APIs ``` ```python class DataFrame(PandasMapOpsMixin): # other DataFrame APIs equivalent to Scala side. ``` Yes, This is a big PR but they are mostly just moving around except one case `createDataFrame` which I had to split the methods. ### Why are the changes needed? There are pandas functionalities here and there and I myself gets lost where it was. Also, when you have to make a change commonly for all of pandas related features, it's almost impossible now. Also, after this change, `DataFrame` and `SparkSession` become more consistent with Scala side since pandas is specific to Python, and this change separates pandas-specific APIs away from `DataFrame` or `SparkSession`. ### Does this PR introduce any user-facing change? No. ### How was this patch tested? Existing tests should cover. Also, I manually built the PySpark API documentation and checked. Closes #27109 from HyukjinKwon/pandas-refactoring. Authored-by: HyukjinKwon <gurwls223@apache.org> Signed-off-by: HyukjinKwon <gurwls223@apache.org> 2020-01-08 20:22:50 -05:00			`#`
			`# Licensed to the Apache Software Foundation (ASF) under one or more`
			`# contributor license agreements. See the NOTICE file distributed with`
			`# this work for additional information regarding copyright ownership.`
			`# The ASF licenses this file to You under the Apache License, Version 2.0`
			`# (the "License"); you may not use this file except in compliance with`
			`# the License. You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
			`#`
			`import sys`

			`from pyspark import since`
			`from pyspark.rdd import PythonEvalType`


			`class PandasMapOpsMixin(object):`
			`"""`
			Min-in for pandas map operations. Currently, only :class:`DataFrame`
			`can use this class.`
			`"""`

			`@since(3.0)`
			`def mapInPandas(self, udf):`
			`"""`
			Maps an iterator of batches in the current :class:`DataFrame` using a Pandas user-defined
			function and returns the result as a :class:`DataFrame`.

			The user-defined function should take an iterator of `pandas.DataFrame`\\s and return
			another iterator of `pandas.DataFrame`\\s. All columns are passed
			together as an iterator of `pandas.DataFrame`\\s to the user-defined function and the
			returned iterator of `pandas.DataFrame`\\s are combined as a :class:`DataFrame`.
			Each `pandas.DataFrame` size can be controlled by
			`spark.sql.execution.arrow.maxRecordsPerBatch`.
			`Its schema must match the returnType of the Pandas user-defined function.`

			:param udf: A function object returned by :meth:`pyspark.sql.functions.pandas_udf`

			`>>> from pyspark.sql.functions import pandas_udf, PandasUDFType`
			`>>> df = spark.createDataFrame([(1, 21), (2, 30)],`
			`... ("id", "age")) # doctest: +SKIP`
			`>>> @pandas_udf(df.schema, PandasUDFType.MAP_ITER) # doctest: +SKIP`
			`... def filter_func(batch_iter):`
			`... for pdf in batch_iter:`
			`... yield pdf[pdf.id == 1]`
			`>>> df.mapInPandas(filter_func).show() # doctest: +SKIP`
			`+---+---+`
			`\| id\|age\|`
			`+---+---+`
			`\| 1\| 21\|`
			`+---+---+`

			.. seealso:: :meth:`pyspark.sql.functions.pandas_udf`

			`"""`
			`from pyspark.sql import Column, DataFrame`

			`assert isinstance(self, DataFrame)`

			`# Columns are special because hasattr always return True`
			`if isinstance(udf, Column) or not hasattr(udf, 'func') \`
			`or udf.evalType != PythonEvalType.SQL_MAP_PANDAS_ITER_UDF:`
			`raise ValueError("Invalid udf: the udf argument must be a pandas_udf of type "`
			`"MAP_ITER.")`

			`udf_column = udf(*[self[col] for col in self.columns])`
			`jdf = self._jdf.mapInPandas(udf_column._jc.expr())`
			`return DataFrame(jdf, self.sql_ctx)`


			`def _test():`
			`import doctest`
			`from pyspark.sql import SparkSession`
			`import pyspark.sql.pandas.map_ops`
			`globs = pyspark.sql.pandas.map_ops.__dict__.copy()`
			`spark = SparkSession.builder\`
			`.master("local[4]")\`
			`.appName("sql.pandas.map_ops tests")\`
			`.getOrCreate()`
			`globs['spark'] = spark`
			`(failure_count, test_count) = doctest.testmod(`
			`pyspark.sql.pandas.map_ops, globs=globs,`
			`optionflags=doctest.ELLIPSIS \| doctest.NORMALIZE_WHITESPACE \| doctest.REPORT_NDIFF)`
			`spark.stop()`
			`if failure_count:`
			`sys.exit(-1)`


			`if __name__ == "__main__":`
			`_test()`