spark-instrumented-optimizer/python/pyspark/pandas/tests/test_internal.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import pandas as pd

from pyspark.pandas.internal import (
    InternalFrame,
    SPARK_DEFAULT_INDEX_NAME,
    SPARK_INDEX_NAME_FORMAT,
)
from pyspark.testing.pandasutils import PandasOnSparkTestCase
from pyspark.testing.sqlutils import SQLTestUtils


class InternalFrameTest(PandasOnSparkTestCase, SQLTestUtils):
    def test_from_pandas(self):
        pdf = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})

        internal = InternalFrame.from_pandas(pdf)
        sdf = internal.spark_frame

        self.assert_eq(internal.index_spark_column_names, [SPARK_DEFAULT_INDEX_NAME])
        self.assert_eq(internal.index_names, [None])
        self.assert_eq(internal.column_labels, [("a",), ("b",)])
        self.assert_eq(internal.data_spark_column_names, ["a", "b"])
        self.assertTrue(internal.spark_column_for(("a",))._jc.equals(sdf["a"]._jc))
        self.assertTrue(internal.spark_column_for(("b",))._jc.equals(sdf["b"]._jc))

        self.assert_eq(internal.to_pandas_frame, pdf)

        # non-string column name
        pdf1 = pd.DataFrame({0: [1, 2, 3], 1: [4, 5, 6]})

        internal = InternalFrame.from_pandas(pdf1)
        sdf = internal.spark_frame

        self.assert_eq(internal.index_spark_column_names, [SPARK_DEFAULT_INDEX_NAME])
        self.assert_eq(internal.index_names, [None])
        self.assert_eq(internal.column_labels, [(0,), (1,)])
        self.assert_eq(internal.data_spark_column_names, ["0", "1"])
        self.assertTrue(internal.spark_column_for((0,))._jc.equals(sdf["0"]._jc))
        self.assertTrue(internal.spark_column_for((1,))._jc.equals(sdf["1"]._jc))

        self.assert_eq(internal.to_pandas_frame, pdf1)

        # multi-index
        pdf.set_index("a", append=True, inplace=True)

        internal = InternalFrame.from_pandas(pdf)
        sdf = internal.spark_frame

        self.assert_eq(
            internal.index_spark_column_names,
            [SPARK_INDEX_NAME_FORMAT(0), SPARK_INDEX_NAME_FORMAT(1)],
        )
        self.assert_eq(internal.index_names, [None, ("a",)])
        self.assert_eq(internal.column_labels, [("b",)])
        self.assert_eq(internal.data_spark_column_names, ["b"])
        self.assertTrue(internal.spark_column_for(("b",))._jc.equals(sdf["b"]._jc))

        self.assert_eq(internal.to_pandas_frame, pdf)

        # multi-index columns
        pdf.columns = pd.MultiIndex.from_tuples([("x", "b")])

        internal = InternalFrame.from_pandas(pdf)
        sdf = internal.spark_frame

        self.assert_eq(
            internal.index_spark_column_names,
            [SPARK_INDEX_NAME_FORMAT(0), SPARK_INDEX_NAME_FORMAT(1)],
        )
        self.assert_eq(internal.index_names, [None, ("a",)])
        self.assert_eq(internal.column_labels, [("x", "b")])
        self.assert_eq(internal.data_spark_column_names, ["(x, b)"])
        self.assertTrue(internal.spark_column_for(("x", "b"))._jc.equals(sdf["(x, b)"]._jc))

        self.assert_eq(internal.to_pandas_frame, pdf)


if __name__ == "__main__":
    import unittest
    from pyspark.pandas.tests.test_internal import *  # noqa: F401

    try:
        import xmlrunner  # type: ignore[import]

        testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
    except ImportError:
        testRunner = None
    unittest.main(testRunner=testRunner, verbosity=2)
[SPARK-35035][PYTHON] Port Koalas internal implementation unit tests into PySpark ### What changes were proposed in this pull request? Now that we merged the Koalas main code into the PySpark code base (#32036), we should port the Koalas internal implementation unit tests to PySpark. ### Why are the changes needed? Currently, the pandas-on-Spark modules are not tested fully. We should enable the internal implementation unit tests. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Enable internal implementation unit tests. Closes #32137 from xinrong-databricks/port.test_internal_impl. Lead-authored-by: Xinrong Meng <xinrong.meng@databricks.com> Co-authored-by: xinrong-databricks <47337188+xinrong-databricks@users.noreply.github.com> Signed-off-by: HyukjinKwon <gurwls223@apache.org> 2021-04-14 00:59:33 -04:00			`#`
			`# Licensed to the Apache Software Foundation (ASF) under one or more`
			`# contributor license agreements. See the NOTICE file distributed with`
			`# this work for additional information regarding copyright ownership.`
			`# The ASF licenses this file to You under the Apache License, Version 2.0`
			`# (the "License"); you may not use this file except in compliance with`
			`# the License. You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
			`#`

			`import pandas as pd`

			`from pyspark.pandas.internal import (`
			`InternalFrame,`
			`SPARK_DEFAULT_INDEX_NAME,`
			`SPARK_INDEX_NAME_FORMAT,`
			`)`
[SPARK-34999][PYTHON] Consolidate PySpark testing utils ### What changes were proposed in this pull request? Consolidate PySpark testing utils by removing `python/pyspark/pandas/testing`, and then creating a file `pandasutils` under `python/pyspark/testing` for test utilities used in `pyspark/pandas`. ### Why are the changes needed? `python/pyspark/pandas/testing` hold test utilites for pandas-on-spark, and `python/pyspark/testing` contain test utilities for pyspark. Consolidating them makes code cleaner and easier to maintain. Updated import statements are as shown below: - from pyspark.testing.sqlutils import SQLTestUtils - from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils (PandasOnSparkTestCase is the original ReusedSQLTestCase in `python/pyspark/pandas/testing/utils.py`) Minor improvements include: - Usage of missing library's requirement_message - `except ImportError` rather than `except` - import pyspark.pandas alias as `ps` rather than `pp` ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Unit tests under python/pyspark/pandas/tests. Closes #32177 from xinrong-databricks/port.merge_utils. Authored-by: Xinrong Meng <xinrong.meng@databricks.com> Signed-off-by: Takuya UESHIN <ueshin@databricks.com> 2021-04-22 16:07:35 -04:00			`from pyspark.testing.pandasutils import PandasOnSparkTestCase`
			`from pyspark.testing.sqlutils import SQLTestUtils`
[SPARK-35035][PYTHON] Port Koalas internal implementation unit tests into PySpark ### What changes were proposed in this pull request? Now that we merged the Koalas main code into the PySpark code base (#32036), we should port the Koalas internal implementation unit tests to PySpark. ### Why are the changes needed? Currently, the pandas-on-Spark modules are not tested fully. We should enable the internal implementation unit tests. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Enable internal implementation unit tests. Closes #32137 from xinrong-databricks/port.test_internal_impl. Lead-authored-by: Xinrong Meng <xinrong.meng@databricks.com> Co-authored-by: xinrong-databricks <47337188+xinrong-databricks@users.noreply.github.com> Signed-off-by: HyukjinKwon <gurwls223@apache.org> 2021-04-14 00:59:33 -04:00

[SPARK-34999][PYTHON] Consolidate PySpark testing utils ### What changes were proposed in this pull request? Consolidate PySpark testing utils by removing `python/pyspark/pandas/testing`, and then creating a file `pandasutils` under `python/pyspark/testing` for test utilities used in `pyspark/pandas`. ### Why are the changes needed? `python/pyspark/pandas/testing` hold test utilites for pandas-on-spark, and `python/pyspark/testing` contain test utilities for pyspark. Consolidating them makes code cleaner and easier to maintain. Updated import statements are as shown below: - from pyspark.testing.sqlutils import SQLTestUtils - from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils (PandasOnSparkTestCase is the original ReusedSQLTestCase in `python/pyspark/pandas/testing/utils.py`) Minor improvements include: - Usage of missing library's requirement_message - `except ImportError` rather than `except` - import pyspark.pandas alias as `ps` rather than `pp` ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Unit tests under python/pyspark/pandas/tests. Closes #32177 from xinrong-databricks/port.merge_utils. Authored-by: Xinrong Meng <xinrong.meng@databricks.com> Signed-off-by: Takuya UESHIN <ueshin@databricks.com> 2021-04-22 16:07:35 -04:00			`class InternalFrameTest(PandasOnSparkTestCase, SQLTestUtils):`
[SPARK-35035][PYTHON] Port Koalas internal implementation unit tests into PySpark ### What changes were proposed in this pull request? Now that we merged the Koalas main code into the PySpark code base (#32036), we should port the Koalas internal implementation unit tests to PySpark. ### Why are the changes needed? Currently, the pandas-on-Spark modules are not tested fully. We should enable the internal implementation unit tests. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Enable internal implementation unit tests. Closes #32137 from xinrong-databricks/port.test_internal_impl. Lead-authored-by: Xinrong Meng <xinrong.meng@databricks.com> Co-authored-by: xinrong-databricks <47337188+xinrong-databricks@users.noreply.github.com> Signed-off-by: HyukjinKwon <gurwls223@apache.org> 2021-04-14 00:59:33 -04:00			`def test_from_pandas(self):`
			`pdf = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})`

			`internal = InternalFrame.from_pandas(pdf)`
			`sdf = internal.spark_frame`

			`self.assert_eq(internal.index_spark_column_names, [SPARK_DEFAULT_INDEX_NAME])`
			`self.assert_eq(internal.index_names, [None])`
			`self.assert_eq(internal.column_labels, [("a",), ("b",)])`
			`self.assert_eq(internal.data_spark_column_names, ["a", "b"])`
			`self.assertTrue(internal.spark_column_for(("a",))._jc.equals(sdf["a"]._jc))`
			`self.assertTrue(internal.spark_column_for(("b",))._jc.equals(sdf["b"]._jc))`

			`self.assert_eq(internal.to_pandas_frame, pdf)`

			`# non-string column name`
			`pdf1 = pd.DataFrame({0: [1, 2, 3], 1: [4, 5, 6]})`

			`internal = InternalFrame.from_pandas(pdf1)`
			`sdf = internal.spark_frame`

			`self.assert_eq(internal.index_spark_column_names, [SPARK_DEFAULT_INDEX_NAME])`
			`self.assert_eq(internal.index_names, [None])`
			`self.assert_eq(internal.column_labels, [(0,), (1,)])`
			`self.assert_eq(internal.data_spark_column_names, ["0", "1"])`
			`self.assertTrue(internal.spark_column_for((0,))._jc.equals(sdf["0"]._jc))`
			`self.assertTrue(internal.spark_column_for((1,))._jc.equals(sdf["1"]._jc))`

			`self.assert_eq(internal.to_pandas_frame, pdf1)`

			`# multi-index`
			`pdf.set_index("a", append=True, inplace=True)`

			`internal = InternalFrame.from_pandas(pdf)`
			`sdf = internal.spark_frame`

			`self.assert_eq(`
			`internal.index_spark_column_names,`
			`[SPARK_INDEX_NAME_FORMAT(0), SPARK_INDEX_NAME_FORMAT(1)],`
			`)`
			`self.assert_eq(internal.index_names, [None, ("a",)])`
			`self.assert_eq(internal.column_labels, [("b",)])`
			`self.assert_eq(internal.data_spark_column_names, ["b"])`
			`self.assertTrue(internal.spark_column_for(("b",))._jc.equals(sdf["b"]._jc))`

			`self.assert_eq(internal.to_pandas_frame, pdf)`

			`# multi-index columns`
			`pdf.columns = pd.MultiIndex.from_tuples([("x", "b")])`

			`internal = InternalFrame.from_pandas(pdf)`
			`sdf = internal.spark_frame`

			`self.assert_eq(`
			`internal.index_spark_column_names,`
			`[SPARK_INDEX_NAME_FORMAT(0), SPARK_INDEX_NAME_FORMAT(1)],`
			`)`
			`self.assert_eq(internal.index_names, [None, ("a",)])`
			`self.assert_eq(internal.column_labels, [("x", "b")])`
			`self.assert_eq(internal.data_spark_column_names, ["(x, b)"])`
			`self.assertTrue(internal.spark_column_for(("x", "b"))._jc.equals(sdf["(x, b)"]._jc))`

			`self.assert_eq(internal.to_pandas_frame, pdf)`


			`if __name__ == "__main__":`
			`import unittest`
			`from pyspark.pandas.tests.test_internal import * # noqa: F401`

			`try:`
			`import xmlrunner # type: ignore[import]`
[SPARK-35364][PYTHON] Renaming the existing Koalas related codes ### What changes were proposed in this pull request? There are still naming related to Koalas in test and function name. This PR addressed them to fit pandas-on-spark. - kdf -> psdf - kser -> psser - kidx -> psidx - kmidx -> psmidx - to_koalas() -> to_pandas_on_spark() ### Why are the changes needed? This is because the name Koalas is no longer used in PySpark. ### Does this PR introduce _any_ user-facing change? `to_koalas()` function is renamed to `to_pandas_on_spark()` ### How was this patch tested? Tested in local manually. After changing the related naming, I checked them one by one. Closes #32516 from itholic/SPARK-35364. Authored-by: itholic <haejoon.lee@databricks.com> Signed-off-by: Takuya UESHIN <ueshin@databricks.com> 2021-05-20 18:08:30 -04:00
			`testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)`
[SPARK-35035][PYTHON] Port Koalas internal implementation unit tests into PySpark ### What changes were proposed in this pull request? Now that we merged the Koalas main code into the PySpark code base (#32036), we should port the Koalas internal implementation unit tests to PySpark. ### Why are the changes needed? Currently, the pandas-on-Spark modules are not tested fully. We should enable the internal implementation unit tests. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Enable internal implementation unit tests. Closes #32137 from xinrong-databricks/port.test_internal_impl. Lead-authored-by: Xinrong Meng <xinrong.meng@databricks.com> Co-authored-by: xinrong-databricks <47337188+xinrong-databricks@users.noreply.github.com> Signed-off-by: HyukjinKwon <gurwls223@apache.org> 2021-04-14 00:59:33 -04:00			`except ImportError:`
			`testRunner = None`
			`unittest.main(testRunner=testRunner, verbosity=2)`