a392633566
### What changes were proposed in this pull request? Now that we merged the Koalas main code into the PySpark code base (#32036), we should port the Koalas Series related unit tests to PySpark. ### Why are the changes needed? Currently, the pandas-on-Spark modules are not fully tested. We should enable the Series related unit tests first. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Enable Series-related unit tests. Closes #32117 from xinrong-databricks/port.test_series_related. Lead-authored-by: Xinrong Meng <xinrong.meng@databricks.com> Co-authored-by: xinrong-databricks <47337188+xinrong-databricks@users.noreply.github.com> Signed-off-by: HyukjinKwon <gurwls223@apache.org>
290 lines
10 KiB
Python
290 lines
10 KiB
Python
#
|
|
# Licensed to the Apache Software Foundation (ASF) under one or more
|
|
# contributor license agreements. See the NOTICE file distributed with
|
|
# this work for additional information regarding copyright ownership.
|
|
# The ASF licenses this file to You under the Apache License, Version 2.0
|
|
# (the "License"); you may not use this file except in compliance with
|
|
# the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
#
|
|
|
|
import datetime
|
|
import unittest
|
|
|
|
import numpy as np
|
|
import pandas as pd
|
|
|
|
from pyspark import pandas as ps
|
|
from pyspark.pandas.testing.utils import ReusedSQLTestCase, SQLTestUtils
|
|
|
|
|
|
class SeriesDateTimeTest(ReusedSQLTestCase, SQLTestUtils):
    """Checks datetime arithmetic and the ``.dt`` accessor of pandas-on-Spark Series.

    Most tests apply the same operation to a pandas Series and to its
    pandas-on-Spark counterpart and compare the two results with ``assert_eq``.
    """

    @property
    def pdf1(self):
        """pandas DataFrame with two datetime columns, ``start_date`` and ``end_date``.

        ``start_date`` holds three timestamps generated with ``freq="M"`` and
        ``end_date`` three timestamps generated with ``freq="W"``.
        """
        date1 = pd.Series(pd.date_range("2012-1-1 12:45:31", periods=3, freq="M"))
        date2 = pd.Series(pd.date_range("2013-3-11 21:45:00", periods=3, freq="W"))
        return pd.DataFrame(dict(start_date=date1, end_date=date2))

    @property
    def pd_start_date(self):
        """The ``start_date`` column of ``pdf1`` as a pandas Series."""
        return self.pdf1["start_date"]

    @property
    def ks_start_date(self):
        """The ``start_date`` column converted to a pandas-on-Spark Series."""
        return ps.from_pandas(self.pd_start_date)

    def check_func(self, func):
        """Apply ``func`` to both flavours of ``start_date`` and assert equal results.

        Parameters
        ----------
        func : callable
            Takes a Series (pandas or pandas-on-Spark) and returns the value to compare.
        """
        self.assert_eq(func(self.ks_start_date), func(self.pd_start_date))

    def test_timestamp_subtraction(self):
        pdf = self.pdf1
        kdf = ps.from_pandas(pdf)

        # The equality assertions below are intentionally disabled: they fail in
        # certain OSs presumably due to different timezone behaviours inherited
        # from C library. The expressions are still evaluated, so an unexpected
        # exception in the subtraction itself would still fail the test.

        actual = (kdf["end_date"] - kdf["start_date"] - 1).to_pandas()
        expected = (pdf["end_date"] - pdf["start_date"]) // np.timedelta64(1, "s") - 1
        # self.assert_eq(actual, expected)

        actual = (kdf["end_date"] - pd.Timestamp("2012-1-1 12:45:31") - 1).to_pandas()
        expected = (pdf["end_date"] - pd.Timestamp("2012-1-1 12:45:31")) // np.timedelta64(
            1, "s"
        ) - 1
        # self.assert_eq(actual, expected)

        actual = (pd.Timestamp("2013-3-11 21:45:00") - kdf["start_date"] - 1).to_pandas()
        expected = (pd.Timestamp("2013-3-11 21:45:00") - pdf["start_date"]) // np.timedelta64(
            1, "s"
        ) - 1
        # self.assert_eq(actual, expected)

        # Subtracting a non-datetime operand from a datetime series must be rejected.
        kdf = ps.DataFrame(
            {"a": pd.date_range("2016-12-31", "2017-01-08", freq="D"), "b": pd.Series(range(9))}
        )
        expected_error_message = "datetime subtraction can only be applied to datetime series."
        with self.assertRaisesRegex(TypeError, expected_error_message):
            kdf["a"] - kdf["b"]
        with self.assertRaisesRegex(TypeError, expected_error_message):
            kdf["a"] - 1
        with self.assertRaisesRegex(TypeError, expected_error_message):
            1 - kdf["a"]

    def test_arithmetic_op_exceptions(self):
        kser = self.ks_start_date
        py_datetime = self.pd_start_date.dt.to_pydatetime()
        datetime_index = ps.Index(self.pd_start_date)

        # Each lambda is invoked immediately by assertRaisesRegex, so capturing the
        # loop variable ``other`` by reference is safe here.
        for other in [1, 0.1, kser, datetime_index, py_datetime]:
            expected_err_msg = "addition can not be applied to date times."
            self.assertRaisesRegex(TypeError, expected_err_msg, lambda: kser + other)
            self.assertRaisesRegex(TypeError, expected_err_msg, lambda: other + kser)

            expected_err_msg = "multiplication can not be applied to date times."
            self.assertRaisesRegex(TypeError, expected_err_msg, lambda: kser * other)
            self.assertRaisesRegex(TypeError, expected_err_msg, lambda: other * kser)

            expected_err_msg = "division can not be applied to date times."
            self.assertRaisesRegex(TypeError, expected_err_msg, lambda: kser / other)
            self.assertRaisesRegex(TypeError, expected_err_msg, lambda: other / kser)
            self.assertRaisesRegex(TypeError, expected_err_msg, lambda: kser // other)
            self.assertRaisesRegex(TypeError, expected_err_msg, lambda: other // kser)

            expected_err_msg = "modulo can not be applied to date times."
            self.assertRaisesRegex(TypeError, expected_err_msg, lambda: kser % other)
            self.assertRaisesRegex(TypeError, expected_err_msg, lambda: other % kser)

        expected_err_msg = "datetime subtraction can only be applied to datetime series."

        for other in [1, 0.1]:
            self.assertRaisesRegex(TypeError, expected_err_msg, lambda: kser - other)
            self.assertRaisesRegex(TypeError, expected_err_msg, lambda: other - kser)

        # NOTE(review): ``other`` is the value leaked from the loop above (0.1), so this
        # repeats the loop's final ``kser - other`` check. A different operand may have
        # been intended -- confirm before changing.
        self.assertRaisesRegex(TypeError, expected_err_msg, lambda: kser - other)
        self.assertRaises(NotImplementedError, lambda: py_datetime - kser)

    def test_date_subtraction(self):
        pdf = self.pdf1
        kdf = ps.from_pandas(pdf)

        # The pandas side yields timedeltas; ``.dt.days`` converts them to the day
        # counts that the pandas-on-Spark result is compared against.
        self.assert_eq(
            kdf["end_date"].dt.date - kdf["start_date"].dt.date,
            (pdf["end_date"].dt.date - pdf["start_date"].dt.date).dt.days,
        )

        self.assert_eq(
            kdf["end_date"].dt.date - datetime.date(2012, 1, 1),
            (pdf["end_date"].dt.date - datetime.date(2012, 1, 1)).dt.days,
        )

        self.assert_eq(
            datetime.date(2013, 3, 11) - kdf["start_date"].dt.date,
            (datetime.date(2013, 3, 11) - pdf["start_date"].dt.date).dt.days,
        )

        # Subtracting a non-date operand from a date series must be rejected.
        kdf = ps.DataFrame(
            {"a": pd.date_range("2016-12-31", "2017-01-08", freq="D"), "b": pd.Series(range(9))}
        )
        expected_error_message = "date subtraction can only be applied to date series."
        with self.assertRaisesRegex(TypeError, expected_error_message):
            kdf["a"].dt.date - kdf["b"]
        with self.assertRaisesRegex(TypeError, expected_error_message):
            kdf["a"].dt.date - 1
        with self.assertRaisesRegex(TypeError, expected_error_message):
            1 - kdf["a"].dt.date

    @unittest.skip(
        "It fails in certain OSs presumably due to different "
        "timezone behaviours inherited from C library."
    )
    def test_div(self):
        pdf = self.pdf1
        kdf = ps.from_pandas(pdf)
        for u in "D", "s", "ms":
            duration = np.timedelta64(1, u)
            self.assert_eq(
                (kdf["end_date"] - kdf["start_date"]) / duration,
                (pdf["end_date"] - pdf["start_date"]) / duration,
            )

    # The skip reason names the test that actually exists in this class
    # (``test_timestamp_subtraction``).
    @unittest.skip(
        "It is currently failed probably for the same reason in 'test_timestamp_subtraction'"
    )
    def test_date(self):
        self.check_func(lambda x: x.dt.date)

    def test_time(self):
        with self.assertRaises(NotImplementedError):
            self.check_func(lambda x: x.dt.time)

    def test_timetz(self):
        with self.assertRaises(NotImplementedError):
            self.check_func(lambda x: x.dt.timetz)

    def test_year(self):
        self.check_func(lambda x: x.dt.year)

    def test_month(self):
        self.check_func(lambda x: x.dt.month)

    def test_day(self):
        self.check_func(lambda x: x.dt.day)

    def test_hour(self):
        self.check_func(lambda x: x.dt.hour)

    def test_minute(self):
        self.check_func(lambda x: x.dt.minute)

    def test_second(self):
        self.check_func(lambda x: x.dt.second)

    def test_microsecond(self):
        self.check_func(lambda x: x.dt.microsecond)

    def test_nanosecond(self):
        with self.assertRaises(NotImplementedError):
            self.check_func(lambda x: x.dt.nanosecond)

    def test_week(self):
        self.check_func(lambda x: x.dt.week)

    def test_weekofyear(self):
        self.check_func(lambda x: x.dt.weekofyear)

    def test_dayofweek(self):
        self.check_func(lambda x: x.dt.dayofweek)

    def test_weekday(self):
        self.check_func(lambda x: x.dt.weekday)

    def test_dayofyear(self):
        self.check_func(lambda x: x.dt.dayofyear)

    def test_quarter(self):
        # Exercise ``dt.quarter`` itself; ``dt.dayofyear`` is covered by test_dayofyear.
        self.check_func(lambda x: x.dt.quarter)

    def test_is_month_start(self):
        self.check_func(lambda x: x.dt.is_month_start)

    def test_is_month_end(self):
        self.check_func(lambda x: x.dt.is_month_end)

    def test_is_quarter_start(self):
        self.check_func(lambda x: x.dt.is_quarter_start)

    def test_is_quarter_end(self):
        self.check_func(lambda x: x.dt.is_quarter_end)

    def test_is_year_start(self):
        self.check_func(lambda x: x.dt.is_year_start)

    def test_is_year_end(self):
        self.check_func(lambda x: x.dt.is_year_end)

    def test_is_leap_year(self):
        self.check_func(lambda x: x.dt.is_leap_year)

    def test_daysinmonth(self):
        self.check_func(lambda x: x.dt.daysinmonth)

    def test_days_in_month(self):
        self.check_func(lambda x: x.dt.days_in_month)

    @unittest.expectedFailure
    def test_tz_localize(self):
        self.check_func(lambda x: x.dt.tz_localize("America/New_York"))

    @unittest.expectedFailure
    def test_tz_convert(self):
        self.check_func(lambda x: x.dt.tz_convert("America/New_York"))

    def test_normalize(self):
        self.check_func(lambda x: x.dt.normalize())

    def test_strftime(self):
        self.check_func(lambda x: x.dt.strftime("%Y-%m-%d"))

    def test_round(self):
        self.check_func(lambda x: x.dt.round(freq="min"))
        self.check_func(lambda x: x.dt.round(freq="H"))

    def test_floor(self):
        self.check_func(lambda x: x.dt.floor(freq="min"))
        self.check_func(lambda x: x.dt.floor(freq="H"))

    def test_ceil(self):
        # Exercise ``dt.ceil`` itself; ``dt.floor`` is covered by test_floor.
        self.check_func(lambda x: x.dt.ceil(freq="min"))
        self.check_func(lambda x: x.dt.ceil(freq="H"))

    @unittest.skip("Unsupported locale setting")
    def test_month_name(self):
        self.check_func(lambda x: x.dt.month_name())
        self.check_func(lambda x: x.dt.month_name(locale="en_US.UTF-8"))

    @unittest.skip("Unsupported locale setting")
    def test_day_name(self):
        self.check_func(lambda x: x.dt.day_name())
        self.check_func(lambda x: x.dt.day_name(locale="en_US.UTF-8"))

    def test_unsupported_type(self):
        # Accessing ``.dt`` on a non-datetime (integer) series must raise.
        self.assertRaisesRegex(
            ValueError, "Cannot call DatetimeMethods on type LongType", lambda: ps.Series([0]).dt
        )
|
|
|
|
|
|
if __name__ == "__main__":
    from pyspark.pandas.tests.test_series_datetime import *  # noqa: F401

    # Default to None, which lets unittest.main pick its standard text runner;
    # switch to the XML-reporting runner only when xmlrunner is importable.
    runner = None
    try:
        import xmlrunner  # type: ignore[import]

        runner = xmlrunner.XMLTestRunner(output='target/test-reports', verbosity=2)
    except ImportError:
        pass
    unittest.main(testRunner=runner, verbosity=2)
|