[SPARK-35032][PYTHON] Port Koalas Index unit tests into PySpark

### What changes were proposed in this pull request?
Now that the Koalas main code has been merged into the PySpark code base (#32036), this PR ports the Koalas Index unit tests to PySpark.

### Why are the changes needed?
Currently, the pandas-on-Spark modules are not fully tested. Enabling the Index unit tests closes part of that gap.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
Enabled the ported Index unit tests by adding them to the pyspark_pandas test module list (see the diff below), so they run as part of the regular PySpark test suite.
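
A single ported suite can also be run on its own with the PySpark test runner. A minimal sketch, assuming it is invoked from the Spark repository root (any other module registered in the diff below can be substituted):

    python/run-tests --testnames pyspark.pandas.tests.indexes.test_category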

Closes #32139 from xinrong-databricks/port.indexes_tests.

Authored-by: Xinrong Meng <xinrong.meng@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>
Committed 2021-04-16 08:53:30 +09:00
parent ba92de0ae5
commit 4aee19efb4
6 changed files with 2789 additions and 31 deletions

dev/sparktestsupport/modules.py

@@ -611,43 +611,47 @@ pyspark_pandas = Module(
         "pyspark.pandas.spark.utils",
         "pyspark.pandas.typedef.typehints",
         # unittests
-        "pyspark.pandas.tests.test_dataframe",
-        "pyspark.pandas.tests.test_config",
-        "pyspark.pandas.tests.test_default_index",
-        "pyspark.pandas.tests.test_extension",
-        "pyspark.pandas.tests.test_internal",
-        "pyspark.pandas.tests.test_numpy_compat",
-        "pyspark.pandas.tests.test_typedef",
-        "pyspark.pandas.tests.test_utils",
-        "pyspark.pandas.tests.test_dataframe_conversion",
-        "pyspark.pandas.tests.test_dataframe_spark_io",
-        "pyspark.pandas.tests.test_frame_spark",
-        "pyspark.pandas.tests.test_ops_on_diff_frames",
-        "pyspark.pandas.tests.test_ops_on_diff_frames_groupby",
-        "pyspark.pandas.tests.test_ops_on_diff_frames_groupby_expanding",
-        "pyspark.pandas.tests.test_ops_on_diff_frames_groupby_rolling",
-        "pyspark.pandas.tests.test_series",
-        "pyspark.pandas.tests.test_series_conversion",
-        "pyspark.pandas.tests.test_series_datetime",
-        "pyspark.pandas.tests.test_series_string",
-        "pyspark.pandas.tests.test_categorical",
-        "pyspark.pandas.tests.test_csv",
-        "pyspark.pandas.tests.test_groupby",
-        "pyspark.pandas.tests.test_expanding",
-        "pyspark.pandas.tests.test_indexing",
-        "pyspark.pandas.tests.test_namespace",
-        "pyspark.pandas.tests.test_repr",
-        "pyspark.pandas.tests.test_reshape",
-        "pyspark.pandas.tests.test_rolling",
-        "pyspark.pandas.tests.test_sql",
-        "pyspark.pandas.tests.test_stats",
-        "pyspark.pandas.tests.test_window",
+        "pyspark.pandas.tests.indexes.test_base",
+        "pyspark.pandas.tests.indexes.test_category",
+        "pyspark.pandas.tests.indexes.test_datetime",
         "pyspark.pandas.tests.plot.test_frame_plot",
         "pyspark.pandas.tests.plot.test_frame_plot_matplotlib",
         "pyspark.pandas.tests.plot.test_frame_plot_plotly",
         "pyspark.pandas.tests.plot.test_series_plot",
         "pyspark.pandas.tests.plot.test_series_plot_matplotlib",
         "pyspark.pandas.tests.plot.test_series_plot_plotly",
+        "pyspark.pandas.tests.test_categorical",
+        "pyspark.pandas.tests.test_config",
+        "pyspark.pandas.tests.test_csv",
+        "pyspark.pandas.tests.test_dataframe",
+        "pyspark.pandas.tests.test_dataframe_conversion",
+        "pyspark.pandas.tests.test_dataframe_spark_io",
+        "pyspark.pandas.tests.test_default_index",
+        "pyspark.pandas.tests.test_expanding",
+        "pyspark.pandas.tests.test_extension",
+        "pyspark.pandas.tests.test_frame_spark",
+        "pyspark.pandas.tests.test_groupby",
+        "pyspark.pandas.tests.test_indexing",
+        "pyspark.pandas.tests.test_indexops_spark",
+        "pyspark.pandas.tests.test_internal",
+        "pyspark.pandas.tests.test_namespace",
+        "pyspark.pandas.tests.test_numpy_compat",
+        "pyspark.pandas.tests.test_ops_on_diff_frames",
+        "pyspark.pandas.tests.test_ops_on_diff_frames_groupby",
+        "pyspark.pandas.tests.test_ops_on_diff_frames_groupby_expanding",
+        "pyspark.pandas.tests.test_ops_on_diff_frames_groupby_rolling",
+        "pyspark.pandas.tests.test_repr",
+        "pyspark.pandas.tests.test_reshape",
+        "pyspark.pandas.tests.test_rolling",
+        "pyspark.pandas.tests.test_series",
+        "pyspark.pandas.tests.test_series_conversion",
+        "pyspark.pandas.tests.test_series_datetime",
+        "pyspark.pandas.tests.test_series_string",
+        "pyspark.pandas.tests.test_sql",
+        "pyspark.pandas.tests.test_stats",
+        "pyspark.pandas.tests.test_typedef",
+        "pyspark.pandas.tests.test_utils",
+        "pyspark.pandas.tests.test_window",
     ],
     excluded_python_implementations=[
         "PyPy"  # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and

python/pyspark/pandas/tests/indexes/__init__.py (new file)

@@ -0,0 +1,16 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

python/pyspark/pandas/tests/indexes/test_base.py (new file; diff suppressed because it is too large)

python/pyspark/pandas/tests/indexes/test_category.py (new file)

@@ -0,0 +1,124 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from distutils.version import LooseVersion

import pandas as pd
from pandas.api.types import CategoricalDtype

import pyspark.pandas as ps
from pyspark.pandas.testing.utils import ReusedSQLTestCase, TestUtils


class CategoricalIndexTest(ReusedSQLTestCase, TestUtils):
    def test_categorical_index(self):
        pidx = pd.CategoricalIndex([1, 2, 3])
        kidx = ps.CategoricalIndex([1, 2, 3])

        self.assert_eq(kidx, pidx)
        self.assert_eq(kidx.categories, pidx.categories)
        self.assert_eq(kidx.codes, pd.Index(pidx.codes))
        self.assert_eq(kidx.ordered, pidx.ordered)

        pidx = pd.Index([1, 2, 3], dtype="category")
        kidx = ps.Index([1, 2, 3], dtype="category")

        self.assert_eq(kidx, pidx)
        self.assert_eq(kidx.categories, pidx.categories)
        self.assert_eq(kidx.codes, pd.Index(pidx.codes))
        self.assert_eq(kidx.ordered, pidx.ordered)

        pdf = pd.DataFrame(
            {
                "a": pd.Categorical([1, 2, 3, 1, 2, 3]),
                "b": pd.Categorical(["a", "b", "c", "a", "b", "c"], categories=["c", "b", "a"]),
            },
            index=pd.Categorical([10, 20, 30, 20, 30, 10], categories=[30, 10, 20], ordered=True),
        )
        kdf = ps.from_pandas(pdf)

        pidx = pdf.set_index("b").index
        kidx = kdf.set_index("b").index

        self.assert_eq(kidx, pidx)
        self.assert_eq(kidx.categories, pidx.categories)
        self.assert_eq(kidx.codes, pd.Index(pidx.codes))
        self.assert_eq(kidx.ordered, pidx.ordered)

        pidx = pdf.set_index(["a", "b"]).index.get_level_values(0)
        kidx = kdf.set_index(["a", "b"]).index.get_level_values(0)

        self.assert_eq(kidx, pidx)
        self.assert_eq(kidx.categories, pidx.categories)
        self.assert_eq(kidx.codes, pd.Index(pidx.codes))
        self.assert_eq(kidx.ordered, pidx.ordered)

    def test_astype(self):
        pidx = pd.Index(["a", "b", "c"])
        kidx = ps.from_pandas(pidx)

        self.assert_eq(kidx.astype("category"), pidx.astype("category"))
        self.assert_eq(
            kidx.astype(CategoricalDtype(["c", "a", "b"])),
            pidx.astype(CategoricalDtype(["c", "a", "b"])),
        )

        pcidx = pidx.astype(CategoricalDtype(["c", "a", "b"]))
        kcidx = kidx.astype(CategoricalDtype(["c", "a", "b"]))

        self.assert_eq(kcidx.astype("category"), pcidx.astype("category"))

        if LooseVersion(pd.__version__) >= LooseVersion("1.2"):
            self.assert_eq(
                kcidx.astype(CategoricalDtype(["b", "c", "a"])),
                pcidx.astype(CategoricalDtype(["b", "c", "a"])),
            )
        else:
            self.assert_eq(
                kcidx.astype(CategoricalDtype(["b", "c", "a"])),
                pidx.astype(CategoricalDtype(["b", "c", "a"])),
            )

        self.assert_eq(kcidx.astype(str), pcidx.astype(str))

    def test_factorize(self):
        pidx = pd.CategoricalIndex([1, 2, 3, None])
        kidx = ps.from_pandas(pidx)

        pcodes, puniques = pidx.factorize()
        kcodes, kuniques = kidx.factorize()

        self.assert_eq(kcodes.tolist(), pcodes.tolist())
        self.assert_eq(kuniques, puniques)

        pcodes, puniques = pidx.factorize(na_sentinel=-2)
        kcodes, kuniques = kidx.factorize(na_sentinel=-2)

        self.assert_eq(kcodes.tolist(), pcodes.tolist())
        self.assert_eq(kuniques, puniques)


if __name__ == "__main__":
    import unittest

    from pyspark.pandas.tests.indexes.test_category import *  # noqa: F401

    try:
        import xmlrunner  # type: ignore[import]

        testRunner = xmlrunner.XMLTestRunner(output='target/test-reports', verbosity=2)
    except ImportError:
        testRunner = None
    unittest.main(testRunner=testRunner, verbosity=2)

python/pyspark/pandas/tests/indexes/test_datetime.py (new file)

@@ -0,0 +1,232 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import datetime
from distutils.version import LooseVersion

import pandas as pd

import pyspark.pandas as ps
from pyspark.pandas.testing.utils import ReusedSQLTestCase, TestUtils


class DatetimeIndexTest(ReusedSQLTestCase, TestUtils):
    @property
    def fixed_freqs(self):
        return [
            "D",
            "H",
            "T",  # min
            "S",
            "L",  # ms
            "U",  # us
            # 'N' not supported
        ]

    @property
    def non_fixed_freqs(self):
        return ["W", "Q"]

    @property
    def pidxs(self):
        return [
            pd.DatetimeIndex([0]),
            pd.DatetimeIndex(["2004-01-01", "2002-12-31", "2000-04-01"]),
        ] + [
            pd.date_range("2000-01-01", periods=3, freq=freq)
            for freq in (self.fixed_freqs + self.non_fixed_freqs)
        ]

    @property
    def kidxs(self):
        return [ps.from_pandas(pidx) for pidx in self.pidxs]

    @property
    def idx_pairs(self):
        return list(zip(self.kidxs, self.pidxs))

    def _disallow_nanoseconds(self, f):
        self.assertRaises(ValueError, lambda: f(freq="ns"))
        self.assertRaises(ValueError, lambda: f(freq="N"))

    def test_properties(self):
        for kidx, pidx in self.idx_pairs:
            self.assert_eq(kidx.year, pidx.year)
            self.assert_eq(kidx.month, pidx.month)
            self.assert_eq(kidx.day, pidx.day)
            self.assert_eq(kidx.hour, pidx.hour)
            self.assert_eq(kidx.minute, pidx.minute)
            self.assert_eq(kidx.second, pidx.second)
            self.assert_eq(kidx.microsecond, pidx.microsecond)
            self.assert_eq(kidx.week, pidx.week)
            self.assert_eq(kidx.weekofyear, pidx.weekofyear)
            self.assert_eq(kidx.dayofweek, pidx.dayofweek)
            self.assert_eq(kidx.weekday, pidx.weekday)
            self.assert_eq(kidx.dayofyear, pidx.dayofyear)
            self.assert_eq(kidx.quarter, pidx.quarter)
            self.assert_eq(kidx.daysinmonth, pidx.daysinmonth)
            self.assert_eq(kidx.days_in_month, pidx.days_in_month)
            self.assert_eq(kidx.is_month_start, pd.Index(pidx.is_month_start))
            self.assert_eq(kidx.is_month_end, pd.Index(pidx.is_month_end))
            self.assert_eq(kidx.is_quarter_start, pd.Index(pidx.is_quarter_start))
            self.assert_eq(kidx.is_quarter_end, pd.Index(pidx.is_quarter_end))
            self.assert_eq(kidx.is_year_start, pd.Index(pidx.is_year_start))
            self.assert_eq(kidx.is_year_end, pd.Index(pidx.is_year_end))
            self.assert_eq(kidx.is_leap_year, pd.Index(pidx.is_leap_year))

            if LooseVersion(pd.__version__) >= LooseVersion("1.2.0"):
                self.assert_eq(kidx.day_of_year, pidx.day_of_year)
                self.assert_eq(kidx.day_of_week, pidx.day_of_week)

    def test_ceil(self):
        for kidx, pidx in self.idx_pairs:
            for freq in self.fixed_freqs:
                self.assert_eq(kidx.ceil(freq), pidx.ceil(freq))

        self._disallow_nanoseconds(self.kidxs[0].ceil)

    def test_floor(self):
        for kidx, pidx in self.idx_pairs:
            for freq in self.fixed_freqs:
                self.assert_eq(kidx.floor(freq), pidx.floor(freq))

        self._disallow_nanoseconds(self.kidxs[0].floor)

    def test_round(self):
        for kidx, pidx in self.idx_pairs:
            for freq in self.fixed_freqs:
                self.assert_eq(kidx.round(freq), pidx.round(freq))

        self._disallow_nanoseconds(self.kidxs[0].round)

    def test_day_name(self):
        for kidx, pidx in self.idx_pairs:
            self.assert_eq(kidx.day_name(), pidx.day_name())

    def test_month_name(self):
        for kidx, pidx in self.idx_pairs:
            self.assert_eq(kidx.month_name(), pidx.month_name())

    def test_normalize(self):
        for kidx, pidx in self.idx_pairs:
            self.assert_eq(kidx.normalize(), pidx.normalize())

    def test_strftime(self):
        for kidx, pidx in self.idx_pairs:
            self.assert_eq(
                kidx.strftime(date_format="%B %d, %Y"), pidx.strftime(date_format="%B %d, %Y")
            )

    def test_indexer_between_time(self):
        for kidx, pidx in self.idx_pairs:
            self.assert_eq(
                kidx.indexer_between_time("00:00:00", "00:01:00").sort_values(),
                pd.Index(pidx.indexer_between_time("00:00:00", "00:01:00")),
            )

            self.assert_eq(
                kidx.indexer_between_time(
                    datetime.time(0, 0, 0), datetime.time(0, 1, 0)
                ).sort_values(),
                pd.Index(pidx.indexer_between_time(datetime.time(0, 0, 0), datetime.time(0, 1, 0))),
            )

            self.assert_eq(
                kidx.indexer_between_time("00:00:00", "00:01:00", True, False).sort_values(),
                pd.Index(pidx.indexer_between_time("00:00:00", "00:01:00", True, False)),
            )

            self.assert_eq(
                kidx.indexer_between_time("00:00:00", "00:01:00", False, True).sort_values(),
                pd.Index(pidx.indexer_between_time("00:00:00", "00:01:00", False, True)),
            )

            self.assert_eq(
                kidx.indexer_between_time("00:00:00", "00:01:00", False, False).sort_values(),
                pd.Index(pidx.indexer_between_time("00:00:00", "00:01:00", False, False)),
            )

            self.assert_eq(
                kidx.indexer_between_time("00:00:00", "00:01:00", True, True).sort_values(),
                pd.Index(pidx.indexer_between_time("00:00:00", "00:01:00", True, True)),
            )

    def test_indexer_at_time(self):
        for kidx, pidx in self.idx_pairs:
            self.assert_eq(
                kidx.indexer_at_time("00:00:00").sort_values(),
                pd.Index(pidx.indexer_at_time("00:00:00")),
            )

            self.assert_eq(
                kidx.indexer_at_time(datetime.time(0, 1, 0)).sort_values(),
                pd.Index(pidx.indexer_at_time(datetime.time(0, 1, 0))),
            )

            self.assert_eq(
                kidx.indexer_at_time("00:00:01").sort_values(),
                pd.Index(pidx.indexer_at_time("00:00:01")),
            )

        self.assertRaises(
            NotImplementedError,
            lambda: ps.DatetimeIndex([0]).indexer_at_time("00:00:00", asof=True),
        )

    def test_arithmetic_op_exceptions(self):
        for kidx, pidx in self.idx_pairs:
            py_datetime = pidx.to_pydatetime()
            for other in [1, 0.1, kidx, kidx.to_series().reset_index(drop=True), py_datetime]:
                expected_err_msg = "addition can not be applied to date times."
                self.assertRaisesRegex(TypeError, expected_err_msg, lambda: kidx + other)
                self.assertRaisesRegex(TypeError, expected_err_msg, lambda: other + kidx)

                expected_err_msg = "multiplication can not be applied to date times."
                self.assertRaisesRegex(TypeError, expected_err_msg, lambda: kidx * other)
                self.assertRaisesRegex(TypeError, expected_err_msg, lambda: other * kidx)

                expected_err_msg = "division can not be applied to date times."
                self.assertRaisesRegex(TypeError, expected_err_msg, lambda: kidx / other)
                self.assertRaisesRegex(TypeError, expected_err_msg, lambda: other / kidx)
                self.assertRaisesRegex(TypeError, expected_err_msg, lambda: kidx // other)
                self.assertRaisesRegex(TypeError, expected_err_msg, lambda: other // kidx)

                expected_err_msg = "modulo can not be applied to date times."
                self.assertRaisesRegex(TypeError, expected_err_msg, lambda: kidx % other)
                self.assertRaisesRegex(TypeError, expected_err_msg, lambda: other % kidx)

            expected_err_msg = "datetime subtraction can only be applied to datetime series."

            for other in [1, 0.1]:
                self.assertRaisesRegex(TypeError, expected_err_msg, lambda: kidx - other)
                self.assertRaisesRegex(TypeError, expected_err_msg, lambda: other - kidx)

            self.assertRaisesRegex(TypeError, expected_err_msg, lambda: kidx - other)
            self.assertRaises(NotImplementedError, lambda: py_datetime - kidx)


if __name__ == "__main__":
    import unittest

    from pyspark.pandas.tests.indexes.test_datetime import *  # noqa: F401

    try:
        import xmlrunner  # type: ignore[import]

        testRunner = xmlrunner.XMLTestRunner(output='target/test-reports', verbosity=2)
    except ImportError:
        testRunner = None
    unittest.main(testRunner=testRunner, verbosity=2)

python/pyspark/pandas/tests/test_indexops_spark.py (new file)

@@ -0,0 +1,74 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import pandas as pd

from pyspark.sql.utils import AnalysisException
from pyspark.sql import functions as F

from pyspark import pandas as ps
from pyspark.pandas.testing.utils import ReusedSQLTestCase, SQLTestUtils


class SparkIndexOpsMethodsTest(ReusedSQLTestCase, SQLTestUtils):
    @property
    def pser(self):
        return pd.Series([1, 2, 3, 4, 5, 6, 7], name="x")

    @property
    def kser(self):
        return ps.from_pandas(self.pser)

    def test_series_transform_negative(self):
        with self.assertRaisesRegex(
            ValueError, "The output of the function.* pyspark.sql.Column.*int"
        ):
            self.kser.spark.transform(lambda scol: 1)

        with self.assertRaisesRegex(AnalysisException, "cannot resolve.*non-existent.*"):
            self.kser.spark.transform(lambda scol: F.col("non-existent"))

    def test_multiindex_transform_negative(self):
        with self.assertRaisesRegex(
            NotImplementedError, "MultiIndex does not support spark.transform yet"
        ):
            midx = pd.MultiIndex(
                [["lama", "cow", "falcon"], ["speed", "weight", "length"]],
                [[0, 0, 0, 1, 1, 1, 2, 2, 2], [1, 1, 1, 1, 1, 2, 1, 2, 2]],
            )
            s = ps.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], index=midx)
            s.index.spark.transform(lambda scol: scol)

    def test_series_apply_negative(self):
        with self.assertRaisesRegex(
            ValueError, "The output of the function.* pyspark.sql.Column.*int"
        ):
            self.kser.spark.apply(lambda scol: 1)

        with self.assertRaisesRegex(AnalysisException, "cannot resolve.*non-existent.*"):
            self.kser.spark.transform(lambda scol: F.col("non-existent"))


if __name__ == "__main__":
    import unittest

    from pyspark.pandas.tests.test_indexops_spark import *  # noqa: F401

    try:
        import xmlrunner  # type: ignore[import]

        testRunner = xmlrunner.XMLTestRunner(output='target/test-reports', verbosity=2)
    except ImportError:
        testRunner = None
    unittest.main(testRunner=testRunner, verbosity=2)