spark-instrumented-optimizer/python/pyspark/sql/tests/test_pandas_udf_iter.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
import sys
import time
import unittest

# Python 3 has no separate ``unicode`` type; alias it to ``str`` so the name
# resolves on both major versions. Compare the numeric ``sys.version_info``
# rather than the ``sys.version`` string, whose lexicographic ordering is not
# a reliable version comparison.
if sys.version_info[0] >= 3:
    unicode = str

from pyspark.sql.functions import pandas_udf, PandasUDFType
from pyspark.testing.sqlutils import ReusedSQLTestCase, have_pandas, have_pyarrow, \
    pandas_requirement_message, pyarrow_requirement_message

if have_pandas:
    import pandas as pd


@unittest.skipIf(
    not have_pandas or not have_pyarrow,
    pandas_requirement_message or pyarrow_requirement_message)
class ScalarPandasIterUDFTests(ReusedSQLTestCase):
    """Tests for ``DataFrame.mapPartitionsInPandas`` used with ``SCALAR_ITER`` pandas UDFs.

    The whole class is skipped when pandas or PyArrow is not available.
    """

    @classmethod
    def setUpClass(cls):
        """Run the base-class setup, then pin the timezone to America/Los_Angeles."""
        ReusedSQLTestCase.setUpClass()

        # Synchronize default timezone between Python and Java
        cls.tz_prev = os.environ.get("TZ", None)  # save current tz if set
        tz = "America/Los_Angeles"
        os.environ["TZ"] = tz
        time.tzset()

        cls.sc.environment["TZ"] = tz
        cls.spark.conf.set("spark.sql.session.timeZone", tz)

    @classmethod
    def tearDownClass(cls):
        """Restore the ``TZ`` value saved by ``setUpClass``, then run the base-class teardown."""
        # pop() instead of del: if TZ was removed in the meantime, a KeyError here
        # would prevent the base-class teardown below from running.
        os.environ.pop("TZ", None)
        if cls.tz_prev is not None:
            os.environ["TZ"] = cls.tz_prev
        time.tzset()
        ReusedSQLTestCase.tearDownClass()

    def test_map_partitions_in_pandas(self):
        """An identity UDF over ``range(10)`` returns the input rows unchanged."""
        @pandas_udf('id long', PandasUDFType.SCALAR_ITER)
        def func(iterator):
            for pdf in iterator:
                assert isinstance(pdf, pd.DataFrame)
                # Compare as a plain list: ``Index == list`` is element-wise and
                # does not produce a single boolean.
                assert list(pdf.columns) == ['id']
                yield pdf

        df = self.spark.range(10)
        actual = df.mapPartitionsInPandas(func).collect()
        expected = df.collect()
        self.assertEqual(actual, expected)

    def test_multiple_columns(self):
        """An identity UDF over an (int, string) DataFrame keeps rows and sees the expected dtypes."""
        data = [(1, "foo"), (2, None), (3, "bar"), (4, "bar")]
        df = self.spark.createDataFrame(data, "a int, b string")

        @pandas_udf(df.schema, PandasUDFType.SCALAR_ITER)
        def func(iterator):
            for pdf in iterator:
                assert isinstance(pdf, pd.DataFrame)
                assert [d.name for d in list(pdf.dtypes)] == ['int32', 'object']
                yield pdf

        actual = df.mapPartitionsInPandas(func).collect()
        expected = df.collect()
        self.assertEqual(actual, expected)

    def test_different_output_length(self):
        """The UDF may yield a batch whose length differs from the input batch."""
        @pandas_udf('a long', PandasUDFType.SCALAR_ITER)
        def func(iterator):
            # Ignore the input batch and emit 100 rows for each one received.
            for _ in iterator:
                yield pd.DataFrame({'a': list(range(100))})

        df = self.spark.range(10)
        actual = df.repartition(1).mapPartitionsInPandas(func).collect()
        self.assertEqual(set(r.a for r in actual), set(range(100)))

    def test_empty_iterator(self):
        """A UDF that returns an empty iterator produces zero rows."""
        @pandas_udf('a int, b string', PandasUDFType.SCALAR_ITER)
        def empty_iter(_):
            return iter([])

        self.assertEqual(
            self.spark.range(10).mapPartitionsInPandas(empty_iter).count(), 0)

    def test_empty_rows(self):
        """A UDF that yields a single zero-row DataFrame produces zero rows."""
        @pandas_udf('a int', PandasUDFType.SCALAR_ITER)
        def empty_rows(_):
            return iter([pd.DataFrame({'a': []})])

        self.assertEqual(
            self.spark.range(10).mapPartitionsInPandas(empty_rows).count(), 0)

    def test_chain_map_partitions_in_pandas(self):
        """Applying the identity UDF twice in a chain still returns the input rows."""
        @pandas_udf('id long', PandasUDFType.SCALAR_ITER)
        def func(iterator):
            for pdf in iterator:
                assert isinstance(pdf, pd.DataFrame)
                # Compare as a plain list: ``Index == list`` is element-wise and
                # does not produce a single boolean.
                assert list(pdf.columns) == ['id']
                yield pdf

        df = self.spark.range(10)
        actual = df.mapPartitionsInPandas(func).mapPartitionsInPandas(func).collect()
        expected = df.collect()
        self.assertEqual(actual, expected)


if __name__ == "__main__":
    # Pull in the test classes under their importable module path before
    # handing control to unittest.
    from pyspark.sql.tests.test_pandas_udf_iter import *

    # Default to unittest's own runner; switch to the XML report runner only
    # when the optional ``xmlrunner`` package can be imported.
    testRunner = None
    try:
        import xmlrunner
        testRunner = xmlrunner.XMLTestRunner(output='target/test-reports', verbosity=2)
    except ImportError:
        pass
    unittest.main(testRunner=testRunner, verbosity=2)
[SPARK-28198][PYTHON] Add mapPartitionsInPandas to allow an iterator of DataFrames ## What changes were proposed in this pull request? This PR proposes to add a `mapPartitionsInPandas` API to DataFrame by using the existing `SCALAR_ITER` as below: 1. Filtering via setting the column ```python from pyspark.sql.functions import pandas_udf, PandasUDFType df = spark.createDataFrame([(1, 21), (2, 30)], ("id", "age")) @pandas_udf(df.schema, PandasUDFType.SCALAR_ITER) def filter_func(iterator): for pdf in iterator: yield pdf[pdf.id == 1] df.mapPartitionsInPandas(filter_func).show() ``` ``` +---+---+ \| id\|age\| +---+---+ \| 1\| 21\| +---+---+ ``` 2. `DataFrame.loc` ```python from pyspark.sql.functions import pandas_udf, PandasUDFType import pandas as pd df = spark.createDataFrame([['aa'], ['bb'], ['cc'], ['aa'], ['aa'], ['aa']], ["value"]) @pandas_udf(df.schema, PandasUDFType.SCALAR_ITER) def filter_func(iterator): for pdf in iterator: yield pdf.loc[pdf.value.str.contains('^a'), :] df.mapPartitionsInPandas(filter_func).show() ``` ``` +-----+ \|value\| +-----+ \| aa\| \| aa\| \| aa\| \| aa\| +-----+ ``` 3. `pandas.melt` ```python from pyspark.sql.functions import pandas_udf, PandasUDFType import pandas as pd df = spark.createDataFrame( pd.DataFrame({'A': {0: 'a', 1: 'b', 2: 'c'}, 'B': {0: 1, 1: 3, 2: 5}, 'C': {0: 2, 1: 4, 2: 6}})) @pandas_udf("A string, variable string, value long", PandasUDFType.SCALAR_ITER) def filter_func(iterator): for pdf in iterator: import pandas as pd yield pd.melt(pdf, id_vars=['A'], value_vars=['B', 'C']) df.mapPartitionsInPandas(filter_func).show() ``` ``` +---+--------+-----+ \| A\|variable\|value\| +---+--------+-----+ \| a\| B\| 1\| \| a\| C\| 2\| \| b\| B\| 3\| \| b\| C\| 4\| \| c\| B\| 5\| \| c\| C\| 6\| +---+--------+-----+ ``` The current limitation of `SCALAR_ITER` is that it doesn't allow a result of a different length, which is pretty critical in practice - for instance, we cannot simply filter by using Pandas APIs but can merely map N to N. 
This PR allows mapping N to M, like flatMap. This API mimics the way `mapPartitions` works but keeps the API shape of `SCALAR_ITER` by allowing results of different lengths. ### How does this PR implement it? This PR mimics both `dapply` with Arrow optimization and Grouped Map Pandas UDF. On the Python execution side, it reuses the existing `SCALAR_ITER` code path. Therefore, externally, we don't introduce any new type of Pandas UDF, but internally we use another evaluation type code `205` (`SQL_MAP_PANDAS_ITER_UDF`). This approach is similar to the Pandas window function implementation with Grouped Aggregation Pandas UDF functions - internally we have `203` (`SQL_WINDOW_AGG_PANDAS_UDF`) but externally we just share the same `GROUPED_AGG`. ## How was this patch tested? Manually tested and unittests were added. Closes #24997 from HyukjinKwon/scalar-udf-iter. Authored-by: HyukjinKwon <gurwls223@apache.org> Signed-off-by: HyukjinKwon <gurwls223@apache.org> 2019-07-01 21:54:16 -04:00			`#`
			`# Licensed to the Apache Software Foundation (ASF) under one or more`
			`# contributor license agreements. See the NOTICE file distributed with`
			`# this work for additional information regarding copyright ownership.`
			`# The ASF licenses this file to You under the Apache License, Version 2.0`
			`# (the "License"); you may not use this file except in compliance with`
			`# the License. You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
			`#`
			`import os`
			`import sys`
			`import time`
			`import unittest`

			`if sys.version >= '3':`
			`unicode = str`

			`from pyspark.sql.functions import pandas_udf, PandasUDFType`
			`from pyspark.testing.sqlutils import ReusedSQLTestCase, have_pandas, have_pyarrow, \`
			`pandas_requirement_message, pyarrow_requirement_message`

			`if have_pandas:`
			`import pandas as pd`


			`@unittest.skipIf(`
			`not have_pandas or not have_pyarrow,`
			`pandas_requirement_message or pyarrow_requirement_message)`
			`class ScalarPandasIterUDFTests(ReusedSQLTestCase):`

			`@classmethod`
			`def setUpClass(cls):`
			`ReusedSQLTestCase.setUpClass()`

			`# Synchronize default timezone between Python and Java`
			`cls.tz_prev = os.environ.get("TZ", None) # save current tz if set`
			`tz = "America/Los_Angeles"`
			`os.environ["TZ"] = tz`
			`time.tzset()`

			`cls.sc.environment["TZ"] = tz`
			`cls.spark.conf.set("spark.sql.session.timeZone", tz)`

			`@classmethod`
			`def tearDownClass(cls):`
			`del os.environ["TZ"]`
			`if cls.tz_prev is not None:`
			`os.environ["TZ"] = cls.tz_prev`
			`time.tzset()`
			`ReusedSQLTestCase.tearDownClass()`

			`def test_map_partitions_in_pandas(self):`
			`@pandas_udf('id long', PandasUDFType.SCALAR_ITER)`
			`def func(iterator):`
			`for pdf in iterator:`
			`assert isinstance(pdf, pd.DataFrame)`
			`assert pdf.columns == ['id']`
			`yield pdf`

			`df = self.spark.range(10)`
			`actual = df.mapPartitionsInPandas(func).collect()`
			`expected = df.collect()`
			`self.assertEquals(actual, expected)`

			`def test_multiple_columns(self):`
			`data = [(1, "foo"), (2, None), (3, "bar"), (4, "bar")]`
			`df = self.spark.createDataFrame(data, "a int, b string")`

			`@pandas_udf(df.schema, PandasUDFType.SCALAR_ITER)`
			`def func(iterator):`
			`for pdf in iterator:`
			`assert isinstance(pdf, pd.DataFrame)`
			`assert [d.name for d in list(pdf.dtypes)] == ['int32', 'object']`
			`yield pdf`

			`actual = df.mapPartitionsInPandas(func).collect()`
			`expected = df.collect()`
			`self.assertEquals(actual, expected)`

			`def test_different_output_length(self):`
			`@pandas_udf('a long', PandasUDFType.SCALAR_ITER)`
			`def func(iterator):`
			`for _ in iterator:`
			`yield pd.DataFrame({'a': list(range(100))})`

			`df = self.spark.range(10)`
			`actual = df.repartition(1).mapPartitionsInPandas(func).collect()`
			`self.assertEquals(set((r.a for r in actual)), set(range(100)))`

			`def test_empty_iterator(self):`
			`@pandas_udf('a int, b string', PandasUDFType.SCALAR_ITER)`
			`def empty_iter(_):`
			`return iter([])`

			`self.assertEqual(`
			`self.spark.range(10).mapPartitionsInPandas(empty_iter).count(), 0)`

			`def test_empty_rows(self):`
			`@pandas_udf('a int', PandasUDFType.SCALAR_ITER)`
			`def empty_rows(_):`
			`return iter([pd.DataFrame({'a': []})])`

			`self.assertEqual(`
			`self.spark.range(10).mapPartitionsInPandas(empty_rows).count(), 0)`

			`def test_chain_map_partitions_in_pandas(self):`
			`@pandas_udf('id long', PandasUDFType.SCALAR_ITER)`
			`def func(iterator):`
			`for pdf in iterator:`
			`assert isinstance(pdf, pd.DataFrame)`
			`assert pdf.columns == ['id']`
			`yield pdf`

			`df = self.spark.range(10)`
			`actual = df.mapPartitionsInPandas(func).mapPartitionsInPandas(func).collect()`
			`expected = df.collect()`
			`self.assertEquals(actual, expected)`


			`if __name__ == "__main__":`
			`from pyspark.sql.tests.test_pandas_udf_iter import *`

			`try:`
			`import xmlrunner`
			`testRunner = xmlrunner.XMLTestRunner(output='target/test-reports', verbosity=2)`
			`except ImportError:`
			`testRunner = None`
			`unittest.main(testRunner=testRunner, verbosity=2)`