diff --git a/python/pyspark/pandas/tests/data_type_ops/test_binary_ops.py b/python/pyspark/pandas/tests/data_type_ops/test_binary_ops.py index 139ad117f8..a68459a750 100644 --- a/python/pyspark/pandas/tests/data_type_ops/test_binary_ops.py +++ b/python/pyspark/pandas/tests/data_type_ops/test_binary_ops.py @@ -19,7 +19,6 @@ import pandas as pd from pandas.api.types import CategoricalDtype from pyspark import pandas as ps -from pyspark.pandas.config import option_context from pyspark.pandas.tests.data_type_ops.testing_utils import TestCasesUtils from pyspark.testing.pandasutils import PandasOnSparkTestCase @@ -34,74 +33,75 @@ class BinaryOpsTest(PandasOnSparkTestCase, TestCasesUtils): return ps.from_pandas(self.pser) @property - def other_pser(self): - return pd.Series([b"2", b"3", b"4"]) + def byte_pdf(self): + psers = { + "this": self.pser, + "that": pd.Series([b"2", b"3", b"4"]), + } + return pd.concat(psers, axis=1) @property - def other_psser(self): - return ps.from_pandas(self.other_pser) + def byte_psdf(self): + return ps.from_pandas(self.byte_pdf) def test_add(self): - psser = self.psser - pser = self.pser + byte_pdf, byte_psdf = self.byte_pdf, self.byte_psdf + pser, psser = byte_pdf["this"], byte_psdf["this"] + other_pser, other_psser = byte_pdf["that"], byte_psdf["that"] + self.assert_eq(psser + b"1", pser + b"1") self.assert_eq(psser + psser, pser + pser) self.assert_eq(psser + psser.astype("bytes"), pser + pser.astype("bytes")) self.assertRaises(TypeError, lambda: psser + "x") self.assertRaises(TypeError, lambda: psser + 1) - with option_context("compute.ops_on_diff_frames", True): - for psser in self.pssers: - self.assertRaises(TypeError, lambda: self.psser + psser) - self.assert_eq(self.pser + self.pser, (self.psser + self.psser).sort_index()) + self.assert_eq(pser + pser, psser + psser) + self.assert_eq(pser + other_pser, psser + other_psser) + + for psser in self.pssers: + self.assertRaises(TypeError, lambda: self.psser + psser) def test_sub(self): 
self.assertRaises(TypeError, lambda: self.psser - "x") self.assertRaises(TypeError, lambda: self.psser - 1) - with option_context("compute.ops_on_diff_frames", True): - for psser in self.pssers: - self.assertRaises(TypeError, lambda: self.psser - psser) + for psser in self.pssers: + self.assertRaises(TypeError, lambda: self.psser - psser) def test_mul(self): self.assertRaises(TypeError, lambda: self.psser * "x") self.assertRaises(TypeError, lambda: self.psser * 1) - with option_context("compute.ops_on_diff_frames", True): - for psser in self.pssers: - self.assertRaises(TypeError, lambda: self.psser * psser) + for psser in self.pssers: + self.assertRaises(TypeError, lambda: self.psser * psser) def test_truediv(self): self.assertRaises(TypeError, lambda: self.psser / "x") self.assertRaises(TypeError, lambda: self.psser / 1) - with option_context("compute.ops_on_diff_frames", True): - for psser in self.pssers: - self.assertRaises(TypeError, lambda: self.psser / psser) + for psser in self.pssers: + self.assertRaises(TypeError, lambda: self.psser / psser) def test_floordiv(self): self.assertRaises(TypeError, lambda: self.psser // "x") self.assertRaises(TypeError, lambda: self.psser // 1) - with option_context("compute.ops_on_diff_frames", True): - for psser in self.pssers: - self.assertRaises(TypeError, lambda: self.psser // psser) + for psser in self.pssers: + self.assertRaises(TypeError, lambda: self.psser // psser) def test_mod(self): self.assertRaises(TypeError, lambda: self.psser % "x") self.assertRaises(TypeError, lambda: self.psser % 1) - with option_context("compute.ops_on_diff_frames", True): - for psser in self.pssers: - self.assertRaises(TypeError, lambda: self.psser % psser) + for psser in self.pssers: + self.assertRaises(TypeError, lambda: self.psser % psser) def test_pow(self): self.assertRaises(TypeError, lambda: self.psser ** "x") self.assertRaises(TypeError, lambda: self.psser ** 1) - with option_context("compute.ops_on_diff_frames", True): - for psser 
in self.pssers: - self.assertRaises(TypeError, lambda: self.psser ** psser) + for psser in self.pssers: + self.assertRaises(TypeError, lambda: self.psser ** psser) def test_radd(self): self.assert_eq(b"1" + self.psser, b"1" + self.pser) @@ -177,46 +177,34 @@ class BinaryOpsTest(PandasOnSparkTestCase, TestCasesUtils): self.assertRaises(TypeError, lambda: ~self.psser) def test_eq(self): - with option_context("compute.ops_on_diff_frames", True): - self.assert_eq( - self.pser == self.other_pser, (self.psser == self.other_psser).sort_index() - ) - self.assert_eq(self.pser == self.pser, (self.psser == self.psser).sort_index()) + byte_pdf, byte_psdf = self.byte_pdf, self.byte_psdf + self.assert_eq(byte_pdf["this"] == byte_pdf["that"], byte_psdf["this"] == byte_psdf["that"]) + self.assert_eq(byte_pdf["this"] == byte_pdf["this"], byte_psdf["this"] == byte_psdf["this"]) def test_ne(self): - with option_context("compute.ops_on_diff_frames", True): - self.assert_eq( - self.pser != self.other_pser, (self.psser != self.other_psser).sort_index() - ) - self.assert_eq(self.pser != self.pser, (self.psser != self.psser).sort_index()) + byte_pdf, byte_psdf = self.byte_pdf, self.byte_psdf + self.assert_eq(byte_pdf["this"] != byte_pdf["that"], byte_psdf["this"] != byte_psdf["that"]) + self.assert_eq(byte_pdf["this"] != byte_pdf["this"], byte_psdf["this"] != byte_psdf["this"]) def test_lt(self): - with option_context("compute.ops_on_diff_frames", True): - self.assert_eq( - self.pser < self.other_pser, (self.psser < self.other_psser).sort_index() - ) - self.assert_eq(self.pser < self.pser, (self.psser < self.psser).sort_index()) + byte_pdf, byte_psdf = self.byte_pdf, self.byte_psdf + self.assert_eq(byte_pdf["this"] < byte_pdf["that"], byte_psdf["this"] < byte_psdf["that"]) + self.assert_eq(byte_pdf["this"] < byte_pdf["this"], byte_psdf["this"] < byte_psdf["this"]) def test_le(self): - with option_context("compute.ops_on_diff_frames", True): - self.assert_eq( - self.pser <= 
self.other_pser, (self.psser <= self.other_psser).sort_index() - ) - self.assert_eq(self.pser <= self.pser, (self.psser <= self.psser).sort_index()) + byte_pdf, byte_psdf = self.byte_pdf, self.byte_psdf + self.assert_eq(byte_pdf["this"] <= byte_pdf["that"], byte_psdf["this"] <= byte_psdf["that"]) + self.assert_eq(byte_pdf["this"] <= byte_pdf["this"], byte_psdf["this"] <= byte_psdf["this"]) def test_gt(self): - with option_context("compute.ops_on_diff_frames", True): - self.assert_eq( - self.pser > self.other_pser, (self.psser > self.other_psser).sort_index() - ) - self.assert_eq(self.pser > self.pser, (self.psser > self.psser).sort_index()) + byte_pdf, byte_psdf = self.byte_pdf, self.byte_psdf + self.assert_eq(byte_pdf["this"] > byte_pdf["that"], byte_psdf["this"] > byte_psdf["that"]) + self.assert_eq(byte_pdf["this"] > byte_pdf["this"], byte_psdf["this"] > byte_psdf["this"]) def test_ge(self): - with option_context("compute.ops_on_diff_frames", True): - self.assert_eq( - self.pser >= self.other_pser, (self.psser >= self.other_psser).sort_index() - ) - self.assert_eq(self.pser >= self.pser, (self.psser >= self.psser).sort_index()) + byte_pdf, byte_psdf = self.byte_pdf, self.byte_psdf + self.assert_eq(byte_pdf["this"] >= byte_pdf["that"], byte_psdf["this"] >= byte_psdf["that"]) + self.assert_eq(byte_pdf["this"] >= byte_pdf["this"], byte_psdf["this"] >= byte_psdf["this"]) if __name__ == "__main__": diff --git a/python/pyspark/pandas/tests/data_type_ops/test_complex_ops.py b/python/pyspark/pandas/tests/data_type_ops/test_complex_ops.py index 0480285566..91a92badf8 100644 --- a/python/pyspark/pandas/tests/data_type_ops/test_complex_ops.py +++ b/python/pyspark/pandas/tests/data_type_ops/test_complex_ops.py @@ -21,50 +21,11 @@ import datetime import pandas as pd from pyspark import pandas as ps -from pyspark.pandas.config import option_context from pyspark.pandas.tests.data_type_ops.testing_utils import TestCasesUtils from pyspark.testing.pandasutils import 
PandasOnSparkTestCase class ComplexOpsTest(PandasOnSparkTestCase, TestCasesUtils): - @property - def numeric_array_psers(self): - return [ - pd.Series([[1, 2, 3]]), - pd.Series([[0.1, 0.2, 0.3]]), - pd.Series([[decimal.Decimal(1), decimal.Decimal(2), decimal.Decimal(3)]]), - ] - - @property - def non_numeric_array_psers(self): - return { - "string": pd.Series([["x", "y", "z"]]), - "date": pd.Series( - [[datetime.date(1994, 1, 1), datetime.date(1994, 1, 2), datetime.date(1994, 1, 3)]] - ), - "bool": pd.Series([[True, True, False]]), - } - - @property - def numeric_array_pssers(self): - return [ps.from_pandas(pser) for pser in self.numeric_array_psers] - - @property - def non_numeric_array_pssers(self): - pssers = {} - - for k, v in self.non_numeric_array_psers.items(): - pssers[k] = ps.from_pandas(v) - return pssers - - @property - def psers(self): - return self.numeric_array_psers + list(self.non_numeric_array_psers.values()) - - @property - def pssers(self): - return self.numeric_array_pssers + list(self.non_numeric_array_pssers.values()) - @property def pser(self): return pd.Series([[1, 2, 3]]) @@ -74,116 +35,161 @@ class ComplexOpsTest(PandasOnSparkTestCase, TestCasesUtils): return ps.from_pandas(self.pser) @property - def other_pser(self): - return pd.Series([[2, 3, 4]]) + def numeric_array_pdf(self): + psers = { + "int": pd.Series([[1, 2, 3]]), + "float": pd.Series([[0.1, 0.2, 0.3]]), + "decimal": pd.Series([[decimal.Decimal(1), decimal.Decimal(2), decimal.Decimal(3)]]), + } + return pd.concat(psers, axis=1) @property - def other_psser(self): - return ps.from_pandas(self.other_pser) + def numeric_array_psdf(self): + return ps.from_pandas(self.numeric_array_pdf) @property - def struct_pser(self): - return pd.Series([("x", 1)]) + def numeric_array_df_cols(self): + return self.numeric_array_pdf.columns @property - def struct_psser(self): - return ps.Index([("x", 1)]).to_series().reset_index(drop=True) + def non_numeric_array_pdf(self): + psers = { + "string": 
pd.Series([["x", "y", "z"]]), + "date": pd.Series( + [[datetime.date(1994, 1, 1), datetime.date(1994, 1, 2), datetime.date(1994, 1, 3)]] + ), + "bool": pd.Series([[True, True, False]]), + } + return pd.concat(psers, axis=1) + + @property + def non_numeric_array_psdf(self): + return ps.from_pandas(self.non_numeric_array_pdf) + + @property + def non_numeric_array_df_cols(self): + return self.non_numeric_array_pdf.columns + + @property + def array_pdf(self): + return pd.concat([self.numeric_array_pdf, self.non_numeric_array_pdf], axis=1) + + @property + def array_psdf(self): + return ps.from_pandas(self.array_pdf) + + @property + def array_df_cols(self): + return self.array_pdf.columns + + @property + def complex_pdf(self): + psers = { + "this_array": self.pser, + "that_array": pd.Series([[2, 3, 4]]), + "this_struct": pd.Series([("x", 1)]), + "that_struct": pd.Series([("a", 2)]), + } + return pd.concat(psers, axis=1) + + @property + def complex_psdf(self): + pssers = { + "this_array": self.psser, + "that_array": ps.Series([[2, 3, 4]]), + "this_struct": ps.Index([("x", 1)]).to_series().reset_index(drop=True), + "that_struct": ps.Index([("a", 2)]).to_series().reset_index(drop=True), + } + return ps.concat(pssers, axis=1) def test_add(self): - for pser, psser in zip(self.psers, self.pssers): + pdf, psdf = self.array_pdf, self.array_psdf + for col in self.array_df_cols: + self.assert_eq(pdf[col] + pdf[col], psdf[col] + psdf[col]) + + # Numeric array + Numeric array + for col in self.numeric_array_df_cols: + pser1, psser1 = pdf[col], psdf[col] + for other_col in self.numeric_array_df_cols: + pser2, psser2 = pdf[other_col], psdf[other_col] + self.assert_eq((pser1 + pser2).sort_values(), (psser1 + psser2).sort_values()) + + # Non-numeric array + Non-numeric array + self.assertRaises( + TypeError, + lambda: psdf["string"] + psdf["bool"], + ) + self.assertRaises( + TypeError, + lambda: psdf["string"] + psdf["date"], + ) + self.assertRaises( + TypeError, + lambda: psdf["bool"] 
+ psdf["date"], + ) + + for col in self.non_numeric_array_df_cols: + pser, psser = pdf[col], psdf[col] self.assert_eq(pser + pser, psser + psser) - with option_context("compute.ops_on_diff_frames", True): - # Numeric array + Numeric array - for pser1, psser1 in zip(self.numeric_array_psers, self.numeric_array_pssers): - for pser2, psser2 in zip(self.numeric_array_psers, self.numeric_array_pssers): - self.assert_eq((pser1 + pser2).sort_values(), (psser1 + psser2).sort_values()) - - # Non-numeric array + Non-numeric array - self.assertRaises( - TypeError, - lambda: self.non_numeric_array_pssers["string"] - + self.non_numeric_array_pssers["bool"], - ) - self.assertRaises( - TypeError, - lambda: self.non_numeric_array_pssers["string"] - + self.non_numeric_array_pssers["date"], - ) - self.assertRaises( - TypeError, - lambda: self.non_numeric_array_pssers["bool"] - + self.non_numeric_array_pssers["date"], - ) - - for data_type in self.non_numeric_array_psers.keys(): - self.assert_eq( - self.non_numeric_array_psers.get(data_type) - + self.non_numeric_array_psers.get(data_type), - ( - self.non_numeric_array_pssers.get(data_type) - + self.non_numeric_array_pssers.get(data_type) - ).sort_index(), - ) - - # Numeric array + Non-numeric array - for numeric_ppser in self.numeric_array_pssers: - for non_numeric_ppser in self.non_numeric_array_pssers.values(): - self.assertRaises(TypeError, lambda: numeric_ppser + non_numeric_ppser) + # Numeric array + Non-numeric array + for numeric_col in self.numeric_array_df_cols: + for non_numeric_col in self.non_numeric_array_df_cols: + self.assertRaises(TypeError, lambda: psdf[numeric_col] + psdf[non_numeric_col]) def test_sub(self): self.assertRaises(TypeError, lambda: self.psser - "x") self.assertRaises(TypeError, lambda: self.psser - 1) - with option_context("compute.ops_on_diff_frames", True): - for psser1 in self.pssers: - for psser2 in self.pssers: - self.assertRaises(TypeError, lambda: psser1 - psser2) + psdf = self.array_psdf + for 
col in self.array_df_cols: + for other_col in self.array_df_cols: + self.assertRaises(TypeError, lambda: psdf[col] - psdf[other_col]) def test_mul(self): self.assertRaises(TypeError, lambda: self.psser * "x") self.assertRaises(TypeError, lambda: self.psser * 1) - with option_context("compute.ops_on_diff_frames", True): - for psser1 in self.pssers: - for psser2 in self.pssers: - self.assertRaises(TypeError, lambda: psser1 * psser2) + psdf = self.array_psdf + for col in self.array_df_cols: + for other_col in self.array_df_cols: + self.assertRaises(TypeError, lambda: psdf[col] * psdf[other_col]) def test_truediv(self): self.assertRaises(TypeError, lambda: self.psser / "x") self.assertRaises(TypeError, lambda: self.psser / 1) - with option_context("compute.ops_on_diff_frames", True): - for psser1 in self.pssers: - for psser2 in self.pssers: - self.assertRaises(TypeError, lambda: psser1 / psser2) + psdf = self.array_psdf + for col in self.array_df_cols: + for other_col in self.array_df_cols: + self.assertRaises(TypeError, lambda: psdf[col] / psdf[other_col]) def test_floordiv(self): self.assertRaises(TypeError, lambda: self.psser // "x") self.assertRaises(TypeError, lambda: self.psser // 1) - with option_context("compute.ops_on_diff_frames", True): - for psser1 in self.pssers: - for psser2 in self.pssers: - self.assertRaises(TypeError, lambda: psser1 // psser2) + psdf = self.array_psdf + for col in self.array_df_cols: + for other_col in self.array_df_cols: + self.assertRaises(TypeError, lambda: psdf[col] // psdf[other_col]) def test_mod(self): self.assertRaises(TypeError, lambda: self.psser % "x") self.assertRaises(TypeError, lambda: self.psser % 1) - with option_context("compute.ops_on_diff_frames", True): - for psser1 in self.pssers: - for psser2 in self.pssers: - self.assertRaises(TypeError, lambda: psser1 % psser2) + psdf = self.array_psdf + for col in self.array_df_cols: + for other_col in self.array_df_cols: + self.assertRaises(TypeError, lambda: psdf[col] % 
psdf[other_col]) def test_pow(self): self.assertRaises(TypeError, lambda: self.psser ** "x") self.assertRaises(TypeError, lambda: self.psser ** 1) - with option_context("compute.ops_on_diff_frames", True): - for psser1 in self.pssers: - for psser2 in self.pssers: - self.assertRaises(TypeError, lambda: psser1 ** psser2) + psdf = self.array_psdf + for col in self.array_df_cols: + for other_col in self.array_df_cols: + self.assertRaises(TypeError, lambda: psdf[col] ** psdf[other_col]) def test_radd(self): self.assertRaises(TypeError, lambda: "x" + self.psser) @@ -231,12 +237,16 @@ class ComplexOpsTest(PandasOnSparkTestCase, TestCasesUtils): self.assertRaises(TypeError, lambda: False | self.psser) def test_from_to_pandas(self): - for pser, psser in zip(self.psers, self.pssers): + pdf, psdf = self.array_pdf, self.array_psdf + for col in self.array_df_cols: + pser, psser = pdf[col], psdf[col] self.assert_eq(pser, psser.to_pandas()) self.assert_eq(ps.from_pandas(pser), psser) def test_isnull(self): - for pser, psser in zip(self.psers, self.pssers): + pdf, psdf = self.array_pdf, self.array_psdf + for col in self.array_df_cols: + pser, psser = pdf[col], psdf[col] self.assert_eq(pser.isnull(), psser.isnull()) def test_astype(self): @@ -252,70 +262,94 @@ class ComplexOpsTest(PandasOnSparkTestCase, TestCasesUtils): self.assertRaises(TypeError, lambda: ~self.psser) def test_eq(self): - with option_context("compute.ops_on_diff_frames", True): - self.assert_eq( - self.pser == self.other_pser, (self.psser == self.other_psser).sort_index() - ) - self.assert_eq(self.pser == self.pser, (self.psser == self.psser).sort_index()) - self.assert_eq( - self.struct_pser == self.struct_pser, - (self.struct_psser == self.struct_psser).sort_index(), - ) + pdf, psdf = self.complex_pdf, self.complex_psdf + self.assert_eq( + pdf["this_array"] == pdf["that_array"], psdf["this_array"] == psdf["that_array"] + ) + self.assert_eq( + pdf["this_struct"] == pdf["that_struct"], psdf["this_struct"] == 
psdf["that_struct"] + ) + self.assert_eq( + pdf["this_array"] == pdf["this_array"], psdf["this_array"] == psdf["this_array"] + ) + self.assert_eq( + pdf["this_struct"] == pdf["this_struct"], psdf["this_struct"] == psdf["this_struct"] + ) def test_ne(self): - with option_context("compute.ops_on_diff_frames", True): - self.assert_eq( - self.pser != self.other_pser, (self.psser != self.other_psser).sort_index() - ) - self.assert_eq(self.pser != self.pser, (self.psser != self.psser).sort_index()) - self.assert_eq( - self.struct_pser != self.struct_pser, - (self.struct_psser != self.struct_psser).sort_index(), - ) + pdf, psdf = self.complex_pdf, self.complex_psdf + self.assert_eq( + pdf["this_array"] != pdf["that_array"], psdf["this_array"] != psdf["that_array"] + ) + self.assert_eq( + pdf["this_struct"] != pdf["that_struct"], psdf["this_struct"] != psdf["that_struct"] + ) + self.assert_eq( + pdf["this_array"] != pdf["this_array"], psdf["this_array"] != psdf["this_array"] + ) + self.assert_eq( + pdf["this_struct"] != pdf["this_struct"], psdf["this_struct"] != psdf["this_struct"] + ) def test_lt(self): - with option_context("compute.ops_on_diff_frames", True): - self.assert_eq( - self.pser < self.other_pser, (self.psser < self.other_psser).sort_index() - ) - self.assert_eq(self.pser < self.pser, (self.psser < self.psser).sort_index()) - self.assert_eq( - self.struct_pser < self.struct_pser, - (self.struct_psser < self.struct_psser).sort_index(), - ) + pdf, psdf = self.complex_pdf, self.complex_psdf + self.assert_eq( + pdf["this_array"] < pdf["that_array"], psdf["this_array"] < psdf["that_array"] + ) + self.assert_eq( + pdf["this_struct"] < pdf["that_struct"], psdf["this_struct"] < psdf["that_struct"] + ) + self.assert_eq( + pdf["this_array"] < pdf["this_array"], psdf["this_array"] < psdf["this_array"] + ) + self.assert_eq( + pdf["this_struct"] < pdf["this_struct"], psdf["this_struct"] < psdf["this_struct"] + ) def test_le(self): - with 
option_context("compute.ops_on_diff_frames", True): - self.assert_eq( - self.pser <= self.other_pser, (self.psser <= self.other_psser).sort_index() - ) - self.assert_eq(self.pser <= self.pser, (self.psser <= self.psser).sort_index()) - self.assert_eq( - self.struct_pser <= self.struct_pser, - (self.struct_psser <= self.struct_psser).sort_index(), - ) + pdf, psdf = self.complex_pdf, self.complex_psdf + self.assert_eq( + pdf["this_array"] <= pdf["that_array"], psdf["this_array"] <= psdf["that_array"] + ) + self.assert_eq( + pdf["this_struct"] <= pdf["that_struct"], psdf["this_struct"] <= psdf["that_struct"] + ) + self.assert_eq( + pdf["this_array"] <= pdf["this_array"], psdf["this_array"] <= psdf["this_array"] + ) + self.assert_eq( + pdf["this_struct"] <= pdf["this_struct"], psdf["this_struct"] <= psdf["this_struct"] + ) def test_gt(self): - with option_context("compute.ops_on_diff_frames", True): - self.assert_eq( - self.pser > self.other_pser, (self.psser > self.other_psser).sort_index() - ) - self.assert_eq(self.pser > self.pser, (self.psser > self.psser).sort_index()) - self.assert_eq( - self.struct_pser > self.struct_pser, - (self.struct_psser > self.struct_psser).sort_index(), - ) + pdf, psdf = self.complex_pdf, self.complex_psdf + self.assert_eq( + pdf["this_array"] > pdf["that_array"], psdf["this_array"] > psdf["that_array"] + ) + self.assert_eq( + pdf["this_struct"] > pdf["that_struct"], psdf["this_struct"] > psdf["that_struct"] + ) + self.assert_eq( + pdf["this_array"] > pdf["this_array"], psdf["this_array"] > psdf["this_array"] + ) + self.assert_eq( + pdf["this_struct"] > pdf["this_struct"], psdf["this_struct"] > psdf["this_struct"] + ) def test_ge(self): - with option_context("compute.ops_on_diff_frames", True): - self.assert_eq( - self.pser >= self.other_pser, (self.psser >= self.other_psser).sort_index() - ) - self.assert_eq(self.pser >= self.pser, (self.psser >= self.psser).sort_index()) - self.assert_eq( - self.struct_pser >= self.struct_pser, - 
(self.struct_psser >= self.struct_psser).sort_index(), - ) + pdf, psdf = self.complex_pdf, self.complex_psdf + self.assert_eq( + pdf["this_array"] >= pdf["that_array"], psdf["this_array"] >= psdf["that_array"] + ) + self.assert_eq( + pdf["this_struct"] >= pdf["that_struct"], psdf["this_struct"] >= psdf["that_struct"] + ) + self.assert_eq( + pdf["this_array"] >= pdf["this_array"], psdf["this_array"] >= psdf["this_array"] + ) + self.assert_eq( + pdf["this_struct"] >= pdf["this_struct"], psdf["this_struct"] >= psdf["this_struct"] + ) if __name__ == "__main__": diff --git a/python/pyspark/pandas/tests/data_type_ops/test_date_ops.py b/python/pyspark/pandas/tests/data_type_ops/test_date_ops.py index 1574ebff95..0f1d76855e 100644 --- a/python/pyspark/pandas/tests/data_type_ops/test_date_ops.py +++ b/python/pyspark/pandas/tests/data_type_ops/test_date_ops.py @@ -20,10 +20,7 @@ import datetime import pandas as pd from pandas.api.types import CategoricalDtype -from pyspark.sql.types import DateType - from pyspark import pandas as ps -from pyspark.pandas.config import option_context from pyspark.pandas.tests.data_type_ops.testing_utils import TestCasesUtils from pyspark.testing.pandasutils import PandasOnSparkTestCase @@ -40,14 +37,18 @@ class DateOpsTest(PandasOnSparkTestCase, TestCasesUtils): return ps.from_pandas(self.pser) @property - def other_pser(self): - return pd.Series( - [datetime.date(2000, 1, 31), datetime.date(1994, 3, 1), datetime.date(1990, 2, 2)] - ) + def date_pdf(self): + psers = { + "this": self.pser, + "that": pd.Series( + [datetime.date(2000, 1, 31), datetime.date(1994, 3, 1), datetime.date(1990, 2, 2)] + ), + } + return pd.concat(psers, axis=1) @property - def other_psser(self): - return ps.from_pandas(self.other_pser) + def date_psdf(self): + return ps.from_pandas(self.date_pdf) @property def some_date(self): @@ -58,9 +59,8 @@ class DateOpsTest(PandasOnSparkTestCase, TestCasesUtils): self.assertRaises(TypeError, lambda: self.psser + 1) 
self.assertRaises(TypeError, lambda: self.psser + self.some_date) - with option_context("compute.ops_on_diff_frames", True): - for psser in self.pssers: - self.assertRaises(TypeError, lambda: self.psser + psser) + for psser in self.pssers: + self.assertRaises(TypeError, lambda: self.psser + psser) def test_sub(self): self.assertRaises(TypeError, lambda: self.psser - "x") @@ -69,57 +69,54 @@ class DateOpsTest(PandasOnSparkTestCase, TestCasesUtils): (self.pser - self.some_date).dt.days, self.psser - self.some_date, ) - with option_context("compute.ops_on_diff_frames", True): - for pser, psser in self.pser_psser_pairs: - if isinstance(psser.spark.data_type, DateType): - self.assert_eq((self.pser - pser).dt.days, (self.psser - psser).sort_index()) - else: - self.assertRaises(TypeError, lambda: self.psser - psser) + pdf, psdf = self.pdf, self.psdf + for col in self.df_cols: + if col == "date": + self.assert_eq((pdf["date"] - pdf[col]).dt.days, psdf["date"] - psdf[col]) + else: + self.assertRaises(TypeError, lambda: psdf["date"] - psdf[col]) + pdf, psdf = self.date_pdf, self.date_psdf + self.assert_eq((pdf["this"] - pdf["that"]).dt.days, psdf["this"] - psdf["that"]) def test_mul(self): self.assertRaises(TypeError, lambda: self.psser * "x") self.assertRaises(TypeError, lambda: self.psser * 1) self.assertRaises(TypeError, lambda: self.psser * self.some_date) - with option_context("compute.ops_on_diff_frames", True): - for psser in self.pssers: - self.assertRaises(TypeError, lambda: self.psser * psser) + for psser in self.pssers: + self.assertRaises(TypeError, lambda: self.psser * psser) def test_truediv(self): self.assertRaises(TypeError, lambda: self.psser / "x") self.assertRaises(TypeError, lambda: self.psser / 1) self.assertRaises(TypeError, lambda: self.psser / self.some_date) - with option_context("compute.ops_on_diff_frames", True): - for psser in self.pssers: - self.assertRaises(TypeError, lambda: self.psser / psser) + for psser in self.pssers: + 
self.assertRaises(TypeError, lambda: self.psser / psser) def test_floordiv(self): self.assertRaises(TypeError, lambda: self.psser // "x") self.assertRaises(TypeError, lambda: self.psser // 1) self.assertRaises(TypeError, lambda: self.psser // self.some_date) - with option_context("compute.ops_on_diff_frames", True): - for psser in self.pssers: - self.assertRaises(TypeError, lambda: self.psser // psser) + for psser in self.pssers: + self.assertRaises(TypeError, lambda: self.psser // psser) def test_mod(self): self.assertRaises(TypeError, lambda: self.psser % "x") self.assertRaises(TypeError, lambda: self.psser % 1) self.assertRaises(TypeError, lambda: self.psser % self.some_date) - with option_context("compute.ops_on_diff_frames", True): - for psser in self.pssers: - self.assertRaises(TypeError, lambda: self.psser % psser) + for psser in self.pssers: + self.assertRaises(TypeError, lambda: self.psser % psser) def test_pow(self): self.assertRaises(TypeError, lambda: self.psser ** "x") self.assertRaises(TypeError, lambda: self.psser ** 1) self.assertRaises(TypeError, lambda: self.psser ** self.some_date) - with option_context("compute.ops_on_diff_frames", True): - for psser in self.pssers: - self.assertRaises(TypeError, lambda: self.psser ** psser) + for psser in self.pssers: + self.assertRaises(TypeError, lambda: self.psser ** psser) def test_radd(self): self.assertRaises(TypeError, lambda: "x" + self.psser) @@ -204,46 +201,34 @@ class DateOpsTest(PandasOnSparkTestCase, TestCasesUtils): self.assertRaises(TypeError, lambda: ~self.psser) def test_eq(self): - with option_context("compute.ops_on_diff_frames", True): - self.assert_eq( - self.pser == self.other_pser, (self.psser == self.other_psser).sort_index() - ) - self.assert_eq(self.pser == self.pser, (self.psser == self.psser).sort_index()) + pdf, psdf = self.date_pdf, self.date_psdf + self.assert_eq(pdf["this"] == pdf["that"], psdf["this"] == psdf["that"]) + self.assert_eq(pdf["this"] == pdf["this"], psdf["this"] == 
psdf["this"]) def test_ne(self): - with option_context("compute.ops_on_diff_frames", True): - self.assert_eq( - self.pser != self.other_pser, (self.psser != self.other_psser).sort_index() - ) - self.assert_eq(self.pser != self.pser, (self.psser != self.psser).sort_index()) + pdf, psdf = self.date_pdf, self.date_psdf + self.assert_eq(pdf["this"] != pdf["that"], psdf["this"] != psdf["that"]) + self.assert_eq(pdf["this"] != pdf["this"], psdf["this"] != psdf["this"]) def test_lt(self): - with option_context("compute.ops_on_diff_frames", True): - self.assert_eq( - self.pser < self.other_pser, (self.psser < self.other_psser).sort_index() - ) - self.assert_eq(self.pser < self.pser, (self.psser < self.psser).sort_index()) + pdf, psdf = self.date_pdf, self.date_psdf + self.assert_eq(pdf["this"] < pdf["that"], psdf["this"] < psdf["that"]) + self.assert_eq(pdf["this"] < pdf["this"], psdf["this"] < psdf["this"]) def test_le(self): - with option_context("compute.ops_on_diff_frames", True): - self.assert_eq( - self.pser <= self.other_pser, (self.psser <= self.other_psser).sort_index() - ) - self.assert_eq(self.pser <= self.pser, (self.psser <= self.psser).sort_index()) + pdf, psdf = self.date_pdf, self.date_psdf + self.assert_eq(pdf["this"] <= pdf["that"], psdf["this"] <= psdf["that"]) + self.assert_eq(pdf["this"] <= pdf["this"], psdf["this"] <= psdf["this"]) def test_gt(self): - with option_context("compute.ops_on_diff_frames", True): - self.assert_eq( - self.pser > self.other_pser, (self.psser > self.other_psser).sort_index() - ) - self.assert_eq(self.pser > self.pser, (self.psser > self.psser).sort_index()) + pdf, psdf = self.date_pdf, self.date_psdf + self.assert_eq(pdf["this"] > pdf["that"], psdf["this"] > psdf["that"]) + self.assert_eq(pdf["this"] > pdf["this"], psdf["this"] > psdf["this"]) def test_ge(self): - with option_context("compute.ops_on_diff_frames", True): - self.assert_eq( - self.pser >= self.other_pser, (self.psser >= self.other_psser).sort_index() - ) - 
self.assert_eq(self.pser >= self.pser, (self.psser >= self.psser).sort_index()) + pdf, psdf = self.date_pdf, self.date_psdf + self.assert_eq(pdf["this"] >= pdf["that"], psdf["this"] >= psdf["that"]) + self.assert_eq(pdf["this"] >= pdf["this"], psdf["this"] >= psdf["this"]) if __name__ == "__main__": diff --git a/python/pyspark/pandas/tests/data_type_ops/test_datetime_ops.py b/python/pyspark/pandas/tests/data_type_ops/test_datetime_ops.py index 8b2a0f9d5a..d3e59b3ae0 100644 --- a/python/pyspark/pandas/tests/data_type_ops/test_datetime_ops.py +++ b/python/pyspark/pandas/tests/data_type_ops/test_datetime_ops.py @@ -17,12 +17,10 @@ import datetime -import numpy as np import pandas as pd from pandas.api.types import CategoricalDtype from pyspark import pandas as ps -from pyspark.pandas.config import option_context from pyspark.pandas.tests.data_type_ops.testing_utils import TestCasesUtils from pyspark.testing.pandasutils import PandasOnSparkTestCase @@ -30,19 +28,23 @@ from pyspark.testing.pandasutils import PandasOnSparkTestCase class DatetimeOpsTest(PandasOnSparkTestCase, TestCasesUtils): @property def pser(self): - return pd.Series(pd.date_range("1994-1-31 10:30:15", periods=3, freq="M")) + return pd.Series(pd.date_range("1994-1-31 10:30:15", periods=3, freq="D")) @property def psser(self): return ps.from_pandas(self.pser) @property - def other_pser(self): - return pd.Series(pd.date_range("1994-4-30 10:30:15", periods=3, freq="M")) + def datetime_pdf(self): + psers = { + "this": self.pser, + "that": pd.Series(pd.date_range("1994-2-1 10:30:15", periods=3, freq="D")), + } + return pd.concat(psers, axis=1) @property - def other_psser(self): - return ps.from_pandas(self.other_pser) + def datetime_psdf(self): + return ps.from_pandas(self.datetime_pdf) @property def some_datetime(self): @@ -53,9 +55,8 @@ class DatetimeOpsTest(PandasOnSparkTestCase, TestCasesUtils): self.assertRaises(TypeError, lambda: self.psser + 1) self.assertRaises(TypeError, lambda: self.psser + 
self.some_datetime) - with option_context("compute.ops_on_diff_frames", True): - for psser in self.pssers: - self.assertRaises(TypeError, lambda: self.psser + psser) + for psser in self.pssers: + self.assertRaises(TypeError, lambda: self.psser + psser) def test_sub(self): self.assertRaises(TypeError, lambda: self.psser - "x") @@ -64,60 +65,62 @@ class DatetimeOpsTest(PandasOnSparkTestCase, TestCasesUtils): (self.pser - self.some_datetime).dt.total_seconds().astype("int"), self.psser - self.some_datetime, ) - with option_context("compute.ops_on_diff_frames", True): - for pser, psser in self.pser_psser_pairs: - if pser.dtype == np.dtype(" self.other_pser, (self.psser > self.other_psser).sort_index() - ) - self.assert_eq(self.pser > self.pser, (self.psser > self.psser).sort_index()) + pdf, psdf = self.datetime_pdf, self.datetime_psdf + self.assert_eq(pdf["this"] > pdf["that"], psdf["this"] > psdf["that"]) + self.assert_eq(pdf["this"] > pdf["this"], psdf["this"] > psdf["this"]) def test_ge(self): - with option_context("compute.ops_on_diff_frames", True): - self.assert_eq( - self.pser >= self.other_pser, (self.psser >= self.other_psser).sort_index() - ) - self.assert_eq(self.pser >= self.pser, (self.psser >= self.psser).sort_index()) + pdf, psdf = self.datetime_pdf, self.datetime_psdf + self.assert_eq(pdf["this"] >= pdf["that"], psdf["this"] >= psdf["that"]) + self.assert_eq(pdf["this"] >= pdf["this"], psdf["this"] >= psdf["this"]) if __name__ == "__main__": diff --git a/python/pyspark/pandas/tests/data_type_ops/test_null_ops.py b/python/pyspark/pandas/tests/data_type_ops/test_null_ops.py index a7f0b6ccfe..c2b6be2903 100644 --- a/python/pyspark/pandas/tests/data_type_ops/test_null_ops.py +++ b/python/pyspark/pandas/tests/data_type_ops/test_null_ops.py @@ -19,7 +19,6 @@ import pandas as pd from pandas.api.types import CategoricalDtype import pyspark.pandas as ps -from pyspark.pandas.config import option_context from pyspark.pandas.tests.data_type_ops.testing_utils import 
TestCasesUtils from pyspark.testing.pandasutils import PandasOnSparkTestCase @@ -37,57 +36,50 @@ class NullOpsTest(PandasOnSparkTestCase, TestCasesUtils): self.assertRaises(TypeError, lambda: self.psser + "x") self.assertRaises(TypeError, lambda: self.psser + 1) - with option_context("compute.ops_on_diff_frames", True): - for psser in self.pssers: - self.assertRaises(TypeError, lambda: self.psser + psser) + for psser in self.pssers: + self.assertRaises(TypeError, lambda: self.psser + psser) def test_sub(self): self.assertRaises(TypeError, lambda: self.psser - "x") self.assertRaises(TypeError, lambda: self.psser - 1) - with option_context("compute.ops_on_diff_frames", True): - for psser in self.pssers: - self.assertRaises(TypeError, lambda: self.psser - psser) + for psser in self.pssers: + self.assertRaises(TypeError, lambda: self.psser - psser) def test_mul(self): self.assertRaises(TypeError, lambda: self.psser * "x") self.assertRaises(TypeError, lambda: self.psser * 1) - with option_context("compute.ops_on_diff_frames", True): - for psser in self.pssers: - self.assertRaises(TypeError, lambda: self.psser * psser) + for psser in self.pssers: + self.assertRaises(TypeError, lambda: self.psser * psser) def test_truediv(self): self.assertRaises(TypeError, lambda: self.psser / "x") self.assertRaises(TypeError, lambda: self.psser / 1) - with option_context("compute.ops_on_diff_frames", True): - for psser in self.pssers: - self.assertRaises(TypeError, lambda: self.psser / psser) + for psser in self.pssers: + self.assertRaises(TypeError, lambda: self.psser / psser) def test_floordiv(self): self.assertRaises(TypeError, lambda: self.psser // "x") self.assertRaises(TypeError, lambda: self.psser // 1) - with option_context("compute.ops_on_diff_frames", True): - for psser in self.pssers: - self.assertRaises(TypeError, lambda: self.psser // psser) + for psser in self.pssers: + self.assertRaises(TypeError, lambda: self.psser // psser) def test_mod(self): 
self.assertRaises(TypeError, lambda: self.psser % "x") self.assertRaises(TypeError, lambda: self.psser % 1) - with option_context("compute.ops_on_diff_frames", True): - for psser in self.pssers: - self.assertRaises(TypeError, lambda: self.psser % psser) + for psser in self.pssers: + self.assertRaises(TypeError, lambda: self.psser % psser) def test_pow(self): self.assertRaises(TypeError, lambda: self.psser ** "x") self.assertRaises(TypeError, lambda: self.psser ** 1) - with option_context("compute.ops_on_diff_frames", True): - for psser in self.pssers: - self.assertRaises(TypeError, lambda: self.psser ** psser) + for psser in self.pssers: + self.assertRaises(TypeError, lambda: self.psser ** psser) def test_radd(self): self.assertRaises(TypeError, lambda: "x" + self.psser) @@ -145,28 +137,28 @@ class NullOpsTest(PandasOnSparkTestCase, TestCasesUtils): self.assertRaises(TypeError, lambda: ~self.psser) def test_eq(self): - with option_context("compute.ops_on_diff_frames", True): - self.assert_eq(self.pser == self.pser, (self.psser == self.psser).sort_index()) + pser, psser = self.pser, self.psser + self.assert_eq(pser == pser, psser == psser) def test_ne(self): - with option_context("compute.ops_on_diff_frames", True): - self.assert_eq(self.pser != self.pser, (self.psser != self.psser).sort_index()) + pser, psser = self.pser, self.psser + self.assert_eq(pser != pser, psser != psser) def test_lt(self): - with option_context("compute.ops_on_diff_frames", True): - self.assert_eq(self.pser < self.pser, (self.psser < self.psser).sort_index()) + pser, psser = self.pser, self.psser + self.assert_eq(pser < pser, psser < psser) def test_le(self): - with option_context("compute.ops_on_diff_frames", True): - self.assert_eq(self.pser <= self.pser, (self.psser <= self.psser).sort_index()) + pser, psser = self.pser, self.psser + self.assert_eq(pser <= pser, psser <= psser) def test_gt(self): - with option_context("compute.ops_on_diff_frames", True): - self.assert_eq(self.pser > 
self.pser, (self.psser > self.psser).sort_index()) + pser, psser = self.pser, self.psser + self.assert_eq(pser > pser, psser > psser) def test_ge(self): - with option_context("compute.ops_on_diff_frames", True): - self.assert_eq(self.pser >= self.pser, (self.psser >= self.psser).sort_index()) + pser, psser = self.pser, self.psser + self.assert_eq(pser >= pser, psser >= psser) if __name__ == "__main__": diff --git a/python/pyspark/pandas/tests/data_type_ops/test_udt_ops.py b/python/pyspark/pandas/tests/data_type_ops/test_udt_ops.py index 232fec1fd5..70175c4a97 100644 --- a/python/pyspark/pandas/tests/data_type_ops/test_udt_ops.py +++ b/python/pyspark/pandas/tests/data_type_ops/test_udt_ops.py @@ -19,7 +19,6 @@ import pandas as pd import pyspark.pandas as ps from pyspark.ml.linalg import SparseVector -from pyspark.pandas.config import option_context from pyspark.pandas.tests.data_type_ops.testing_utils import TestCasesUtils from pyspark.testing.pandasutils import PandasOnSparkTestCase @@ -34,61 +33,67 @@ class UDTOpsTest(PandasOnSparkTestCase, TestCasesUtils): def psser(self): return ps.from_pandas(self.pser) + @property + def udt_pdf(self): + sparse_values = {0: 0.2, 1: 1.0} + psers = { + "this": self.pser, + "that": pd.Series([SparseVector(len(sparse_values), sparse_values)]), + } + return pd.concat(psers, axis=1) + + @property + def udt_psdf(self): + return ps.from_pandas(self.udt_pdf) + def test_add(self): self.assertRaises(TypeError, lambda: self.psser + "x") self.assertRaises(TypeError, lambda: self.psser + 1) - with option_context("compute.ops_on_diff_frames", True): - for psser in self.pssers: - self.assertRaises(TypeError, lambda: self.psser + psser) + for psser in self.pssers: + self.assertRaises(TypeError, lambda: self.psser + psser) def test_sub(self): self.assertRaises(TypeError, lambda: self.psser - "x") self.assertRaises(TypeError, lambda: self.psser - 1) - with option_context("compute.ops_on_diff_frames", True): - for psser in self.pssers: - 
self.assertRaises(TypeError, lambda: self.psser - psser) + for psser in self.pssers: + self.assertRaises(TypeError, lambda: self.psser - psser) def test_mul(self): self.assertRaises(TypeError, lambda: self.psser * "x") self.assertRaises(TypeError, lambda: self.psser * 1) - with option_context("compute.ops_on_diff_frames", True): - for psser in self.pssers: - self.assertRaises(TypeError, lambda: self.psser * psser) + for psser in self.pssers: + self.assertRaises(TypeError, lambda: self.psser * psser) def test_truediv(self): self.assertRaises(TypeError, lambda: self.psser / "x") self.assertRaises(TypeError, lambda: self.psser / 1) - with option_context("compute.ops_on_diff_frames", True): - for psser in self.pssers: - self.assertRaises(TypeError, lambda: self.psser / psser) + for psser in self.pssers: + self.assertRaises(TypeError, lambda: self.psser / psser) def test_floordiv(self): self.assertRaises(TypeError, lambda: self.psser // "x") self.assertRaises(TypeError, lambda: self.psser // 1) - with option_context("compute.ops_on_diff_frames", True): - for psser in self.pssers: - self.assertRaises(TypeError, lambda: self.psser // psser) + for psser in self.pssers: + self.assertRaises(TypeError, lambda: self.psser // psser) def test_mod(self): self.assertRaises(TypeError, lambda: self.psser % "x") self.assertRaises(TypeError, lambda: self.psser % 1) - with option_context("compute.ops_on_diff_frames", True): - for psser in self.pssers: - self.assertRaises(TypeError, lambda: self.psser % psser) + for psser in self.pssers: + self.assertRaises(TypeError, lambda: self.psser % psser) def test_pow(self): self.assertRaises(TypeError, lambda: self.psser ** "x") self.assertRaises(TypeError, lambda: self.psser ** 1) - with option_context("compute.ops_on_diff_frames", True): - for psser in self.pssers: - self.assertRaises(TypeError, lambda: self.psser ** psser) + for psser in self.pssers: + self.assertRaises(TypeError, lambda: self.psser ** psser) def test_radd(self): 
self.assertRaises(TypeError, lambda: "x" + self.psser) @@ -141,12 +146,14 @@ class UDTOpsTest(PandasOnSparkTestCase, TestCasesUtils): self.assertRaises(TypeError, lambda: ~self.psser) def test_eq(self): - with option_context("compute.ops_on_diff_frames", True): - self.assert_eq(self.pser == self.pser, (self.psser == self.psser).sort_index()) + pdf, psdf = self.udt_pdf, self.udt_psdf + self.assert_eq(pdf["this"] == pdf["this"], psdf["this"] == psdf["this"]) + self.assert_eq(pdf["this"] == pdf["that"], psdf["this"] == psdf["that"]) def test_ne(self): - with option_context("compute.ops_on_diff_frames", True): - self.assert_eq(self.pser != self.pser, (self.psser != self.psser).sort_index()) + pdf, psdf = self.udt_pdf, self.udt_psdf + self.assert_eq(pdf["this"] != pdf["this"], psdf["this"] != psdf["this"]) + self.assert_eq(pdf["this"] != pdf["that"], psdf["this"] != psdf["that"]) def test_lt(self): self.assertRaisesRegex(