[SPARK-36190][PYTHON] Improve the rest of DataTypeOps tests by avoiding joins

### What changes were proposed in this pull request?
Improve the rest of DataTypeOps tests by avoiding joins.

### Why are the changes needed?
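The bool, string, and numeric DataTypeOps tests have already been improved by avoiding joins: the operands are placed in the same DataFrame as columns, so the tests no longer need to enable `compute.ops_on_diff_frames` to combine Series that come from different frames.
We should improve the rest of the DataTypeOps tests (binary, complex, date, and the remaining types) in the same way.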

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
Unit tests.

Closes #33546 from xinrong-databricks/test_no_join.

Authored-by: Xinrong Meng <xinrong.meng@databricks.com>
Signed-off-by: Takuya UESHIN <ueshin@databricks.com>
(cherry picked from commit 9c5cb99d6e)
Signed-off-by: Takuya UESHIN <ueshin@databricks.com>
This commit is contained in:
Xinrong Meng 2021-07-28 15:53:38 -07:00 committed by Takuya UESHIN
parent c236101d4c
commit 999cf81653
6 changed files with 408 additions and 411 deletions

View file

@ -19,7 +19,6 @@ import pandas as pd
from pandas.api.types import CategoricalDtype from pandas.api.types import CategoricalDtype
from pyspark import pandas as ps from pyspark import pandas as ps
from pyspark.pandas.config import option_context
from pyspark.pandas.tests.data_type_ops.testing_utils import TestCasesUtils from pyspark.pandas.tests.data_type_ops.testing_utils import TestCasesUtils
from pyspark.testing.pandasutils import PandasOnSparkTestCase from pyspark.testing.pandasutils import PandasOnSparkTestCase
@ -34,74 +33,75 @@ class BinaryOpsTest(PandasOnSparkTestCase, TestCasesUtils):
return ps.from_pandas(self.pser) return ps.from_pandas(self.pser)
@property @property
def other_pser(self): def byte_pdf(self):
return pd.Series([b"2", b"3", b"4"]) psers = {
"this": self.pser,
"that": pd.Series([b"2", b"3", b"4"]),
}
return pd.concat(psers, axis=1)
@property @property
def other_psser(self): def byte_psdf(self):
return ps.from_pandas(self.other_pser) return ps.from_pandas(self.byte_pdf)
def test_add(self): def test_add(self):
psser = self.psser byte_pdf, byte_psdf = self.byte_pdf, self.byte_psdf
pser = self.pser pser, psser = byte_pdf["this"], byte_psdf["this"]
other_pser, other_psser = byte_pdf["that"], byte_psdf["that"]
self.assert_eq(psser + b"1", pser + b"1") self.assert_eq(psser + b"1", pser + b"1")
self.assert_eq(psser + psser, pser + pser) self.assert_eq(psser + psser, pser + pser)
self.assert_eq(psser + psser.astype("bytes"), pser + pser.astype("bytes")) self.assert_eq(psser + psser.astype("bytes"), pser + pser.astype("bytes"))
self.assertRaises(TypeError, lambda: psser + "x") self.assertRaises(TypeError, lambda: psser + "x")
self.assertRaises(TypeError, lambda: psser + 1) self.assertRaises(TypeError, lambda: psser + 1)
with option_context("compute.ops_on_diff_frames", True): self.assert_eq(pser + pser, psser + psser)
for psser in self.pssers: self.assert_eq(pser + other_pser, psser + other_psser)
self.assertRaises(TypeError, lambda: self.psser + psser)
self.assert_eq(self.pser + self.pser, (self.psser + self.psser).sort_index()) for psser in self.pssers:
self.assertRaises(TypeError, lambda: self.psser + psser)
def test_sub(self): def test_sub(self):
self.assertRaises(TypeError, lambda: self.psser - "x") self.assertRaises(TypeError, lambda: self.psser - "x")
self.assertRaises(TypeError, lambda: self.psser - 1) self.assertRaises(TypeError, lambda: self.psser - 1)
with option_context("compute.ops_on_diff_frames", True): for psser in self.pssers:
for psser in self.pssers: self.assertRaises(TypeError, lambda: self.psser - psser)
self.assertRaises(TypeError, lambda: self.psser - psser)
def test_mul(self): def test_mul(self):
self.assertRaises(TypeError, lambda: self.psser * "x") self.assertRaises(TypeError, lambda: self.psser * "x")
self.assertRaises(TypeError, lambda: self.psser * 1) self.assertRaises(TypeError, lambda: self.psser * 1)
with option_context("compute.ops_on_diff_frames", True): for psser in self.pssers:
for psser in self.pssers: self.assertRaises(TypeError, lambda: self.psser * psser)
self.assertRaises(TypeError, lambda: self.psser * psser)
def test_truediv(self): def test_truediv(self):
self.assertRaises(TypeError, lambda: self.psser / "x") self.assertRaises(TypeError, lambda: self.psser / "x")
self.assertRaises(TypeError, lambda: self.psser / 1) self.assertRaises(TypeError, lambda: self.psser / 1)
with option_context("compute.ops_on_diff_frames", True): for psser in self.pssers:
for psser in self.pssers: self.assertRaises(TypeError, lambda: self.psser / psser)
self.assertRaises(TypeError, lambda: self.psser / psser)
def test_floordiv(self): def test_floordiv(self):
self.assertRaises(TypeError, lambda: self.psser // "x") self.assertRaises(TypeError, lambda: self.psser // "x")
self.assertRaises(TypeError, lambda: self.psser // 1) self.assertRaises(TypeError, lambda: self.psser // 1)
with option_context("compute.ops_on_diff_frames", True): for psser in self.pssers:
for psser in self.pssers: self.assertRaises(TypeError, lambda: self.psser // psser)
self.assertRaises(TypeError, lambda: self.psser // psser)
def test_mod(self): def test_mod(self):
self.assertRaises(TypeError, lambda: self.psser % "x") self.assertRaises(TypeError, lambda: self.psser % "x")
self.assertRaises(TypeError, lambda: self.psser % 1) self.assertRaises(TypeError, lambda: self.psser % 1)
with option_context("compute.ops_on_diff_frames", True): for psser in self.pssers:
for psser in self.pssers: self.assertRaises(TypeError, lambda: self.psser % psser)
self.assertRaises(TypeError, lambda: self.psser % psser)
def test_pow(self): def test_pow(self):
self.assertRaises(TypeError, lambda: self.psser ** "x") self.assertRaises(TypeError, lambda: self.psser ** "x")
self.assertRaises(TypeError, lambda: self.psser ** 1) self.assertRaises(TypeError, lambda: self.psser ** 1)
with option_context("compute.ops_on_diff_frames", True): for psser in self.pssers:
for psser in self.pssers: self.assertRaises(TypeError, lambda: self.psser ** psser)
self.assertRaises(TypeError, lambda: self.psser ** psser)
def test_radd(self): def test_radd(self):
self.assert_eq(b"1" + self.psser, b"1" + self.pser) self.assert_eq(b"1" + self.psser, b"1" + self.pser)
@ -177,46 +177,34 @@ class BinaryOpsTest(PandasOnSparkTestCase, TestCasesUtils):
self.assertRaises(TypeError, lambda: ~self.psser) self.assertRaises(TypeError, lambda: ~self.psser)
def test_eq(self): def test_eq(self):
with option_context("compute.ops_on_diff_frames", True): byte_pdf, byte_psdf = self.byte_pdf, self.byte_psdf
self.assert_eq( self.assert_eq(byte_pdf["this"] == byte_pdf["that"], byte_psdf["this"] == byte_psdf["that"])
self.pser == self.other_pser, (self.psser == self.other_psser).sort_index() self.assert_eq(byte_pdf["this"] == byte_pdf["this"], byte_psdf["this"] == byte_psdf["this"])
)
self.assert_eq(self.pser == self.pser, (self.psser == self.psser).sort_index())
def test_ne(self): def test_ne(self):
with option_context("compute.ops_on_diff_frames", True): byte_pdf, byte_psdf = self.byte_pdf, self.byte_psdf
self.assert_eq( self.assert_eq(byte_pdf["this"] != byte_pdf["that"], byte_psdf["this"] != byte_psdf["that"])
self.pser != self.other_pser, (self.psser != self.other_psser).sort_index() self.assert_eq(byte_pdf["this"] != byte_pdf["this"], byte_psdf["this"] != byte_psdf["this"])
)
self.assert_eq(self.pser != self.pser, (self.psser != self.psser).sort_index())
def test_lt(self): def test_lt(self):
with option_context("compute.ops_on_diff_frames", True): byte_pdf, byte_psdf = self.byte_pdf, self.byte_psdf
self.assert_eq( self.assert_eq(byte_pdf["this"] < byte_pdf["that"], byte_psdf["this"] < byte_psdf["that"])
self.pser < self.other_pser, (self.psser < self.other_psser).sort_index() self.assert_eq(byte_pdf["this"] < byte_pdf["this"], byte_psdf["this"] < byte_psdf["this"])
)
self.assert_eq(self.pser < self.pser, (self.psser < self.psser).sort_index())
def test_le(self): def test_le(self):
with option_context("compute.ops_on_diff_frames", True): byte_pdf, byte_psdf = self.byte_pdf, self.byte_psdf
self.assert_eq( self.assert_eq(byte_pdf["this"] <= byte_pdf["that"], byte_psdf["this"] <= byte_psdf["that"])
self.pser <= self.other_pser, (self.psser <= self.other_psser).sort_index() self.assert_eq(byte_pdf["this"] <= byte_pdf["this"], byte_psdf["this"] <= byte_psdf["this"])
)
self.assert_eq(self.pser <= self.pser, (self.psser <= self.psser).sort_index())
def test_gt(self): def test_gt(self):
with option_context("compute.ops_on_diff_frames", True): byte_pdf, byte_psdf = self.byte_pdf, self.byte_psdf
self.assert_eq( self.assert_eq(byte_pdf["this"] > byte_pdf["that"], byte_psdf["this"] > byte_psdf["that"])
self.pser > self.other_pser, (self.psser > self.other_psser).sort_index() self.assert_eq(byte_pdf["this"] > byte_pdf["this"], byte_psdf["this"] > byte_psdf["this"])
)
self.assert_eq(self.pser > self.pser, (self.psser > self.psser).sort_index())
def test_ge(self): def test_ge(self):
with option_context("compute.ops_on_diff_frames", True): byte_pdf, byte_psdf = self.byte_pdf, self.byte_psdf
self.assert_eq( self.assert_eq(byte_pdf["this"] >= byte_pdf["that"], byte_psdf["this"] >= byte_psdf["that"])
self.pser >= self.other_pser, (self.psser >= self.other_psser).sort_index() self.assert_eq(byte_pdf["this"] >= byte_pdf["this"], byte_psdf["this"] >= byte_psdf["this"])
)
self.assert_eq(self.pser >= self.pser, (self.psser >= self.psser).sort_index())
if __name__ == "__main__": if __name__ == "__main__":

View file

@ -21,50 +21,11 @@ import datetime
import pandas as pd import pandas as pd
from pyspark import pandas as ps from pyspark import pandas as ps
from pyspark.pandas.config import option_context
from pyspark.pandas.tests.data_type_ops.testing_utils import TestCasesUtils from pyspark.pandas.tests.data_type_ops.testing_utils import TestCasesUtils
from pyspark.testing.pandasutils import PandasOnSparkTestCase from pyspark.testing.pandasutils import PandasOnSparkTestCase
class ComplexOpsTest(PandasOnSparkTestCase, TestCasesUtils): class ComplexOpsTest(PandasOnSparkTestCase, TestCasesUtils):
@property
def numeric_array_psers(self):
return [
pd.Series([[1, 2, 3]]),
pd.Series([[0.1, 0.2, 0.3]]),
pd.Series([[decimal.Decimal(1), decimal.Decimal(2), decimal.Decimal(3)]]),
]
@property
def non_numeric_array_psers(self):
return {
"string": pd.Series([["x", "y", "z"]]),
"date": pd.Series(
[[datetime.date(1994, 1, 1), datetime.date(1994, 1, 2), datetime.date(1994, 1, 3)]]
),
"bool": pd.Series([[True, True, False]]),
}
@property
def numeric_array_pssers(self):
return [ps.from_pandas(pser) for pser in self.numeric_array_psers]
@property
def non_numeric_array_pssers(self):
pssers = {}
for k, v in self.non_numeric_array_psers.items():
pssers[k] = ps.from_pandas(v)
return pssers
@property
def psers(self):
return self.numeric_array_psers + list(self.non_numeric_array_psers.values())
@property
def pssers(self):
return self.numeric_array_pssers + list(self.non_numeric_array_pssers.values())
@property @property
def pser(self): def pser(self):
return pd.Series([[1, 2, 3]]) return pd.Series([[1, 2, 3]])
@ -74,116 +35,161 @@ class ComplexOpsTest(PandasOnSparkTestCase, TestCasesUtils):
return ps.from_pandas(self.pser) return ps.from_pandas(self.pser)
@property @property
def other_pser(self): def numeric_array_pdf(self):
return pd.Series([[2, 3, 4]]) psers = {
"int": pd.Series([[1, 2, 3]]),
"float": pd.Series([[0.1, 0.2, 0.3]]),
"decimal": pd.Series([[decimal.Decimal(1), decimal.Decimal(2), decimal.Decimal(3)]]),
}
return pd.concat(psers, axis=1)
@property @property
def other_psser(self): def numeric_array_psdf(self):
return ps.from_pandas(self.other_pser) return ps.from_pandas(self.numeric_array_pdf)
@property @property
def struct_pser(self): def numeric_array_df_cols(self):
return pd.Series([("x", 1)]) return self.numeric_array_pdf.columns
@property @property
def struct_psser(self): def non_numeric_array_pdf(self):
return ps.Index([("x", 1)]).to_series().reset_index(drop=True) psers = {
"string": pd.Series([["x", "y", "z"]]),
"date": pd.Series(
[[datetime.date(1994, 1, 1), datetime.date(1994, 1, 2), datetime.date(1994, 1, 3)]]
),
"bool": pd.Series([[True, True, False]]),
}
return pd.concat(psers, axis=1)
@property
def non_numeric_array_psdf(self):
return ps.from_pandas(self.non_numeric_array_pdf)
@property
def non_numeric_array_df_cols(self):
return self.non_numeric_array_pdf.columns
@property
def array_pdf(self):
return pd.concat([self.numeric_array_pdf, self.non_numeric_array_pdf], axis=1)
@property
def array_psdf(self):
return ps.from_pandas(self.array_pdf)
@property
def array_df_cols(self):
return self.array_pdf.columns
@property
def complex_pdf(self):
psers = {
"this_array": self.pser,
"that_array": pd.Series([[2, 3, 4]]),
"this_struct": pd.Series([("x", 1)]),
"that_struct": pd.Series([("a", 2)]),
}
return pd.concat(psers, axis=1)
@property
def complex_psdf(self):
    """Return the pandas-on-Spark counterpart of ``complex_pdf``.

    The frame has the same four columns as ``complex_pdf``:
    ``this_array``, ``that_array``, ``this_struct`` and ``that_struct``.
    """
    # The module-level import of option_context is removed by this change,
    # so bring it into scope locally for this one fixture.
    from pyspark.pandas.config import option_context

    # ps.concat takes a sequence of Series/DataFrame objects. Iterating a dict
    # yields only its string keys, so the column names are attached to each
    # Series instead and the Series are passed as a list.
    pssers = [
        self.psser.rename("this_array"),
        ps.Series([[2, 3, 4]], name="that_array"),
        # Struct columns are built via an Index of tuples, as in the original.
        ps.Index([("x", 1)]).to_series().reset_index(drop=True).rename("this_struct"),
        ps.Index([("a", 2)]).to_series().reset_index(drop=True).rename("that_struct"),
    ]
    # NOTE(review): the pieces are created independently, so combining them
    # column-wise presumably requires compute.ops_on_diff_frames - confirm.
    with option_context("compute.ops_on_diff_frames", True):
        return ps.concat(pssers, axis=1)
def test_add(self): def test_add(self):
for pser, psser in zip(self.psers, self.pssers): pdf, psdf = self.array_pdf, self.array_psdf
for col in self.array_df_cols:
self.assert_eq(pdf[col] + pdf[col], psdf[col] + psdf[col])
# Numeric array + Numeric array
for col in self.numeric_array_df_cols:
pser1, psser1 = pdf[col], psdf[col]
for other_col in self.numeric_array_df_cols:
pser2, psser2 = pdf[other_col], psdf[other_col]
self.assert_eq((pser1 + pser2).sort_values(), (psser1 + psser2).sort_values())
# Non-numeric array + Non-numeric array
self.assertRaises(
TypeError,
lambda: psdf["string"] + psdf["bool"],
)
self.assertRaises(
TypeError,
lambda: psdf["string"] + psdf["date"],
)
self.assertRaises(
TypeError,
lambda: psdf["bool"] + psdf["date"],
)
for col in self.non_numeric_array_df_cols:
pser, psser = pdf[col], psdf[col]
self.assert_eq(pser + pser, psser + psser) self.assert_eq(pser + pser, psser + psser)
with option_context("compute.ops_on_diff_frames", True): # Numeric array + Non-numeric array
# Numeric array + Numeric array for numeric_col in self.numeric_array_df_cols:
for pser1, psser1 in zip(self.numeric_array_psers, self.numeric_array_pssers): for non_numeric_col in self.non_numeric_array_df_cols:
for pser2, psser2 in zip(self.numeric_array_psers, self.numeric_array_pssers): self.assertRaises(TypeError, lambda: psdf[numeric_col] + psdf[non_numeric_col])
self.assert_eq((pser1 + pser2).sort_values(), (psser1 + psser2).sort_values())
# Non-numeric array + Non-numeric array
self.assertRaises(
TypeError,
lambda: self.non_numeric_array_pssers["string"]
+ self.non_numeric_array_pssers["bool"],
)
self.assertRaises(
TypeError,
lambda: self.non_numeric_array_pssers["string"]
+ self.non_numeric_array_pssers["date"],
)
self.assertRaises(
TypeError,
lambda: self.non_numeric_array_pssers["bool"]
+ self.non_numeric_array_pssers["date"],
)
for data_type in self.non_numeric_array_psers.keys():
self.assert_eq(
self.non_numeric_array_psers.get(data_type)
+ self.non_numeric_array_psers.get(data_type),
(
self.non_numeric_array_pssers.get(data_type)
+ self.non_numeric_array_pssers.get(data_type)
).sort_index(),
)
# Numeric array + Non-numeric array
for numeric_ppser in self.numeric_array_pssers:
for non_numeric_ppser in self.non_numeric_array_pssers.values():
self.assertRaises(TypeError, lambda: numeric_ppser + non_numeric_ppser)
def test_sub(self): def test_sub(self):
self.assertRaises(TypeError, lambda: self.psser - "x") self.assertRaises(TypeError, lambda: self.psser - "x")
self.assertRaises(TypeError, lambda: self.psser - 1) self.assertRaises(TypeError, lambda: self.psser - 1)
with option_context("compute.ops_on_diff_frames", True): psdf = self.array_psdf
for psser1 in self.pssers: for col in self.array_df_cols:
for psser2 in self.pssers: for other_col in self.array_df_cols:
self.assertRaises(TypeError, lambda: psser1 - psser2) self.assertRaises(TypeError, lambda: psdf[col] - psdf[other_col])
def test_mul(self): def test_mul(self):
self.assertRaises(TypeError, lambda: self.psser * "x") self.assertRaises(TypeError, lambda: self.psser * "x")
self.assertRaises(TypeError, lambda: self.psser * 1) self.assertRaises(TypeError, lambda: self.psser * 1)
with option_context("compute.ops_on_diff_frames", True): psdf = self.array_psdf
for psser1 in self.pssers: for col in self.array_df_cols:
for psser2 in self.pssers: for other_col in self.array_df_cols:
self.assertRaises(TypeError, lambda: psser1 * psser2) self.assertRaises(TypeError, lambda: psdf[col] * psdf[other_col])
def test_truediv(self): def test_truediv(self):
self.assertRaises(TypeError, lambda: self.psser / "x") self.assertRaises(TypeError, lambda: self.psser / "x")
self.assertRaises(TypeError, lambda: self.psser / 1) self.assertRaises(TypeError, lambda: self.psser / 1)
with option_context("compute.ops_on_diff_frames", True): psdf = self.array_psdf
for psser1 in self.pssers: for col in self.array_df_cols:
for psser2 in self.pssers: for other_col in self.array_df_cols:
self.assertRaises(TypeError, lambda: psser1 / psser2) self.assertRaises(TypeError, lambda: psdf[col] / psdf[other_col])
def test_floordiv(self): def test_floordiv(self):
self.assertRaises(TypeError, lambda: self.psser // "x") self.assertRaises(TypeError, lambda: self.psser // "x")
self.assertRaises(TypeError, lambda: self.psser // 1) self.assertRaises(TypeError, lambda: self.psser // 1)
with option_context("compute.ops_on_diff_frames", True): psdf = self.array_psdf
for psser1 in self.pssers: for col in self.array_df_cols:
for psser2 in self.pssers: for other_col in self.array_df_cols:
self.assertRaises(TypeError, lambda: psser1 // psser2) self.assertRaises(TypeError, lambda: psdf[col] // psdf[other_col])
def test_mod(self): def test_mod(self):
self.assertRaises(TypeError, lambda: self.psser % "x") self.assertRaises(TypeError, lambda: self.psser % "x")
self.assertRaises(TypeError, lambda: self.psser % 1) self.assertRaises(TypeError, lambda: self.psser % 1)
with option_context("compute.ops_on_diff_frames", True): psdf = self.array_psdf
for psser1 in self.pssers: for col in self.array_df_cols:
for psser2 in self.pssers: for other_col in self.array_df_cols:
self.assertRaises(TypeError, lambda: psser1 % psser2) self.assertRaises(TypeError, lambda: psdf[col] % psdf[other_col])
def test_pow(self): def test_pow(self):
self.assertRaises(TypeError, lambda: self.psser ** "x") self.assertRaises(TypeError, lambda: self.psser ** "x")
self.assertRaises(TypeError, lambda: self.psser ** 1) self.assertRaises(TypeError, lambda: self.psser ** 1)
with option_context("compute.ops_on_diff_frames", True): psdf = self.array_psdf
for psser1 in self.pssers: for col in self.array_df_cols:
for psser2 in self.pssers: for other_col in self.array_df_cols:
self.assertRaises(TypeError, lambda: psser1 ** psser2) self.assertRaises(TypeError, lambda: psdf[col] ** psdf[other_col])
def test_radd(self): def test_radd(self):
self.assertRaises(TypeError, lambda: "x" + self.psser) self.assertRaises(TypeError, lambda: "x" + self.psser)
@ -231,12 +237,16 @@ class ComplexOpsTest(PandasOnSparkTestCase, TestCasesUtils):
self.assertRaises(TypeError, lambda: False | self.psser) self.assertRaises(TypeError, lambda: False | self.psser)
def test_from_to_pandas(self): def test_from_to_pandas(self):
for pser, psser in zip(self.psers, self.pssers): pdf, psdf = self.array_pdf, self.array_psdf
for col in self.array_df_cols:
pser, psser = pdf[col], psdf[col]
self.assert_eq(pser, psser.to_pandas()) self.assert_eq(pser, psser.to_pandas())
self.assert_eq(ps.from_pandas(pser), psser) self.assert_eq(ps.from_pandas(pser), psser)
def test_isnull(self): def test_isnull(self):
for pser, psser in zip(self.psers, self.pssers): pdf, psdf = self.array_pdf, self.array_psdf
for col in self.array_df_cols:
pser, psser = pdf[col], psdf[col]
self.assert_eq(pser.isnull(), psser.isnull()) self.assert_eq(pser.isnull(), psser.isnull())
def test_astype(self): def test_astype(self):
@ -252,70 +262,94 @@ class ComplexOpsTest(PandasOnSparkTestCase, TestCasesUtils):
self.assertRaises(TypeError, lambda: ~self.psser) self.assertRaises(TypeError, lambda: ~self.psser)
def test_eq(self):
    """Check ``==`` on array and struct columns against pandas."""
    # psdf must be the pandas-on-Spark frame; binding it to complex_pdf would
    # compare pandas with itself and never exercise pandas-on-Spark.
    pdf, psdf = self.complex_pdf, self.complex_psdf
    column_pairs = [
        ("this_array", "that_array"),
        ("this_struct", "that_struct"),
        ("this_array", "this_array"),
        ("this_struct", "this_struct"),
    ]
    for left, right in column_pairs:
        self.assert_eq(pdf[left] == pdf[right], psdf[left] == psdf[right])
def test_ne(self):
    """Check ``!=`` on array and struct columns against pandas."""
    # psdf must be the pandas-on-Spark frame; binding it to complex_pdf would
    # compare pandas with itself and never exercise pandas-on-Spark.
    pdf, psdf = self.complex_pdf, self.complex_psdf
    column_pairs = [
        ("this_array", "that_array"),
        ("this_struct", "that_struct"),
        ("this_array", "this_array"),
        ("this_struct", "this_struct"),
    ]
    for left, right in column_pairs:
        self.assert_eq(pdf[left] != pdf[right], psdf[left] != psdf[right])
def test_lt(self):
    """Check ``<`` on array and struct columns against pandas."""
    # psdf must be the pandas-on-Spark frame; binding it to complex_pdf would
    # compare pandas with itself and never exercise pandas-on-Spark.
    pdf, psdf = self.complex_pdf, self.complex_psdf
    column_pairs = [
        ("this_array", "that_array"),
        ("this_struct", "that_struct"),
        ("this_array", "this_array"),
        ("this_struct", "this_struct"),
    ]
    for left, right in column_pairs:
        self.assert_eq(pdf[left] < pdf[right], psdf[left] < psdf[right])
def test_le(self):
    """Check ``<=`` on array and struct columns against pandas."""
    # psdf must be the pandas-on-Spark frame; binding it to complex_pdf would
    # compare pandas with itself and never exercise pandas-on-Spark.
    pdf, psdf = self.complex_pdf, self.complex_psdf
    column_pairs = [
        ("this_array", "that_array"),
        ("this_struct", "that_struct"),
        ("this_array", "this_array"),
        ("this_struct", "this_struct"),
    ]
    for left, right in column_pairs:
        self.assert_eq(pdf[left] <= pdf[right], psdf[left] <= psdf[right])
def test_gt(self):
    """Check ``>`` on array and struct columns against pandas."""
    # psdf must be the pandas-on-Spark frame; binding it to complex_pdf would
    # compare pandas with itself and never exercise pandas-on-Spark.
    pdf, psdf = self.complex_pdf, self.complex_psdf
    column_pairs = [
        ("this_array", "that_array"),
        ("this_struct", "that_struct"),
        ("this_array", "this_array"),
        ("this_struct", "this_struct"),
    ]
    for left, right in column_pairs:
        self.assert_eq(pdf[left] > pdf[right], psdf[left] > psdf[right])
def test_ge(self):
    """Check ``>=`` on array and struct columns against pandas."""
    # psdf must be the pandas-on-Spark frame; binding it to complex_pdf would
    # compare pandas with itself and never exercise pandas-on-Spark.
    pdf, psdf = self.complex_pdf, self.complex_psdf
    column_pairs = [
        ("this_array", "that_array"),
        ("this_struct", "that_struct"),
        ("this_array", "this_array"),
        ("this_struct", "this_struct"),
    ]
    for left, right in column_pairs:
        self.assert_eq(pdf[left] >= pdf[right], psdf[left] >= psdf[right])
if __name__ == "__main__": if __name__ == "__main__":

View file

@ -20,10 +20,7 @@ import datetime
import pandas as pd import pandas as pd
from pandas.api.types import CategoricalDtype from pandas.api.types import CategoricalDtype
from pyspark.sql.types import DateType
from pyspark import pandas as ps from pyspark import pandas as ps
from pyspark.pandas.config import option_context
from pyspark.pandas.tests.data_type_ops.testing_utils import TestCasesUtils from pyspark.pandas.tests.data_type_ops.testing_utils import TestCasesUtils
from pyspark.testing.pandasutils import PandasOnSparkTestCase from pyspark.testing.pandasutils import PandasOnSparkTestCase
@ -40,14 +37,18 @@ class DateOpsTest(PandasOnSparkTestCase, TestCasesUtils):
return ps.from_pandas(self.pser) return ps.from_pandas(self.pser)
@property @property
def other_pser(self): def date_pdf(self):
return pd.Series( psers = {
[datetime.date(2000, 1, 31), datetime.date(1994, 3, 1), datetime.date(1990, 2, 2)] "this": self.pser,
) "that": pd.Series(
[datetime.date(2000, 1, 31), datetime.date(1994, 3, 1), datetime.date(1990, 2, 2)]
),
}
return pd.concat(psers, axis=1)
@property @property
def other_psser(self): def date_psdf(self):
return ps.from_pandas(self.other_pser) return ps.from_pandas(self.date_pdf)
@property @property
def some_date(self): def some_date(self):
@ -58,9 +59,8 @@ class DateOpsTest(PandasOnSparkTestCase, TestCasesUtils):
self.assertRaises(TypeError, lambda: self.psser + 1) self.assertRaises(TypeError, lambda: self.psser + 1)
self.assertRaises(TypeError, lambda: self.psser + self.some_date) self.assertRaises(TypeError, lambda: self.psser + self.some_date)
with option_context("compute.ops_on_diff_frames", True): for psser in self.pssers:
for psser in self.pssers: self.assertRaises(TypeError, lambda: self.psser + psser)
self.assertRaises(TypeError, lambda: self.psser + psser)
def test_sub(self): def test_sub(self):
self.assertRaises(TypeError, lambda: self.psser - "x") self.assertRaises(TypeError, lambda: self.psser - "x")
@ -69,57 +69,54 @@ class DateOpsTest(PandasOnSparkTestCase, TestCasesUtils):
(self.pser - self.some_date).dt.days, (self.pser - self.some_date).dt.days,
self.psser - self.some_date, self.psser - self.some_date,
) )
with option_context("compute.ops_on_diff_frames", True): pdf, psdf = self.pdf, self.psdf
for pser, psser in self.pser_psser_pairs: for col in self.df_cols:
if isinstance(psser.spark.data_type, DateType): if col == "date":
self.assert_eq((self.pser - pser).dt.days, (self.psser - psser).sort_index()) self.assert_eq((pdf["date"] - pdf[col]).dt.days, psdf["date"] - psdf[col])
else: else:
self.assertRaises(TypeError, lambda: self.psser - psser) self.assertRaises(TypeError, lambda: psdf["date"] - psdf[col])
pdf, psdf = self.date_pdf, self.date_psdf
self.assert_eq((pdf["this"] - pdf["that"]).dt.days, psdf["this"] - psdf["that"])
def test_mul(self): def test_mul(self):
self.assertRaises(TypeError, lambda: self.psser * "x") self.assertRaises(TypeError, lambda: self.psser * "x")
self.assertRaises(TypeError, lambda: self.psser * 1) self.assertRaises(TypeError, lambda: self.psser * 1)
self.assertRaises(TypeError, lambda: self.psser * self.some_date) self.assertRaises(TypeError, lambda: self.psser * self.some_date)
with option_context("compute.ops_on_diff_frames", True): for psser in self.pssers:
for psser in self.pssers: self.assertRaises(TypeError, lambda: self.psser * psser)
self.assertRaises(TypeError, lambda: self.psser * psser)
def test_truediv(self): def test_truediv(self):
self.assertRaises(TypeError, lambda: self.psser / "x") self.assertRaises(TypeError, lambda: self.psser / "x")
self.assertRaises(TypeError, lambda: self.psser / 1) self.assertRaises(TypeError, lambda: self.psser / 1)
self.assertRaises(TypeError, lambda: self.psser / self.some_date) self.assertRaises(TypeError, lambda: self.psser / self.some_date)
with option_context("compute.ops_on_diff_frames", True): for psser in self.pssers:
for psser in self.pssers: self.assertRaises(TypeError, lambda: self.psser / psser)
self.assertRaises(TypeError, lambda: self.psser / psser)
def test_floordiv(self): def test_floordiv(self):
self.assertRaises(TypeError, lambda: self.psser // "x") self.assertRaises(TypeError, lambda: self.psser // "x")
self.assertRaises(TypeError, lambda: self.psser // 1) self.assertRaises(TypeError, lambda: self.psser // 1)
self.assertRaises(TypeError, lambda: self.psser // self.some_date) self.assertRaises(TypeError, lambda: self.psser // self.some_date)
with option_context("compute.ops_on_diff_frames", True): for psser in self.pssers:
for psser in self.pssers: self.assertRaises(TypeError, lambda: self.psser // psser)
self.assertRaises(TypeError, lambda: self.psser // psser)
def test_mod(self): def test_mod(self):
self.assertRaises(TypeError, lambda: self.psser % "x") self.assertRaises(TypeError, lambda: self.psser % "x")
self.assertRaises(TypeError, lambda: self.psser % 1) self.assertRaises(TypeError, lambda: self.psser % 1)
self.assertRaises(TypeError, lambda: self.psser % self.some_date) self.assertRaises(TypeError, lambda: self.psser % self.some_date)
with option_context("compute.ops_on_diff_frames", True): for psser in self.pssers:
for psser in self.pssers: self.assertRaises(TypeError, lambda: self.psser % psser)
self.assertRaises(TypeError, lambda: self.psser % psser)
def test_pow(self): def test_pow(self):
self.assertRaises(TypeError, lambda: self.psser ** "x") self.assertRaises(TypeError, lambda: self.psser ** "x")
self.assertRaises(TypeError, lambda: self.psser ** 1) self.assertRaises(TypeError, lambda: self.psser ** 1)
self.assertRaises(TypeError, lambda: self.psser ** self.some_date) self.assertRaises(TypeError, lambda: self.psser ** self.some_date)
with option_context("compute.ops_on_diff_frames", True): for psser in self.pssers:
for psser in self.pssers: self.assertRaises(TypeError, lambda: self.psser ** psser)
self.assertRaises(TypeError, lambda: self.psser ** psser)
def test_radd(self): def test_radd(self):
self.assertRaises(TypeError, lambda: "x" + self.psser) self.assertRaises(TypeError, lambda: "x" + self.psser)
@ -204,46 +201,34 @@ class DateOpsTest(PandasOnSparkTestCase, TestCasesUtils):
self.assertRaises(TypeError, lambda: ~self.psser) self.assertRaises(TypeError, lambda: ~self.psser)
def test_eq(self): def test_eq(self):
with option_context("compute.ops_on_diff_frames", True): pdf, psdf = self.date_pdf, self.date_psdf
self.assert_eq( self.assert_eq(pdf["this"] == pdf["that"], psdf["this"] == psdf["that"])
self.pser == self.other_pser, (self.psser == self.other_psser).sort_index() self.assert_eq(pdf["this"] == pdf["this"], psdf["this"] == psdf["this"])
)
self.assert_eq(self.pser == self.pser, (self.psser == self.psser).sort_index())
def test_ne(self): def test_ne(self):
with option_context("compute.ops_on_diff_frames", True): pdf, psdf = self.date_pdf, self.date_psdf
self.assert_eq( self.assert_eq(pdf["this"] != pdf["that"], psdf["this"] != psdf["that"])
self.pser != self.other_pser, (self.psser != self.other_psser).sort_index() self.assert_eq(pdf["this"] != pdf["this"], psdf["this"] != psdf["this"])
)
self.assert_eq(self.pser != self.pser, (self.psser != self.psser).sort_index())
def test_lt(self): def test_lt(self):
with option_context("compute.ops_on_diff_frames", True): pdf, psdf = self.date_pdf, self.date_psdf
self.assert_eq( self.assert_eq(pdf["this"] < pdf["that"], psdf["this"] < psdf["that"])
self.pser < self.other_pser, (self.psser < self.other_psser).sort_index() self.assert_eq(pdf["this"] < pdf["this"], psdf["this"] < psdf["this"])
)
self.assert_eq(self.pser < self.pser, (self.psser < self.psser).sort_index())
def test_le(self): def test_le(self):
with option_context("compute.ops_on_diff_frames", True): pdf, psdf = self.date_pdf, self.date_psdf
self.assert_eq( self.assert_eq(pdf["this"] <= pdf["that"], psdf["this"] <= psdf["that"])
self.pser <= self.other_pser, (self.psser <= self.other_psser).sort_index() self.assert_eq(pdf["this"] <= pdf["this"], psdf["this"] <= psdf["this"])
)
self.assert_eq(self.pser <= self.pser, (self.psser <= self.psser).sort_index())
def test_gt(self): def test_gt(self):
with option_context("compute.ops_on_diff_frames", True): pdf, psdf = self.date_pdf, self.date_psdf
self.assert_eq( self.assert_eq(pdf["this"] > pdf["that"], psdf["this"] > psdf["that"])
self.pser > self.other_pser, (self.psser > self.other_psser).sort_index() self.assert_eq(pdf["this"] > pdf["this"], psdf["this"] > psdf["this"])
)
self.assert_eq(self.pser > self.pser, (self.psser > self.psser).sort_index())
def test_ge(self): def test_ge(self):
with option_context("compute.ops_on_diff_frames", True): pdf, psdf = self.date_pdf, self.date_psdf
self.assert_eq( self.assert_eq(pdf["this"] >= pdf["that"], psdf["this"] >= psdf["that"])
self.pser >= self.other_pser, (self.psser >= self.other_psser).sort_index() self.assert_eq(pdf["this"] >= pdf["this"], psdf["this"] >= psdf["this"])
)
self.assert_eq(self.pser >= self.pser, (self.psser >= self.psser).sort_index())
if __name__ == "__main__": if __name__ == "__main__":

View file

@ -17,12 +17,10 @@
import datetime import datetime
import numpy as np
import pandas as pd import pandas as pd
from pandas.api.types import CategoricalDtype from pandas.api.types import CategoricalDtype
from pyspark import pandas as ps from pyspark import pandas as ps
from pyspark.pandas.config import option_context
from pyspark.pandas.tests.data_type_ops.testing_utils import TestCasesUtils from pyspark.pandas.tests.data_type_ops.testing_utils import TestCasesUtils
from pyspark.testing.pandasutils import PandasOnSparkTestCase from pyspark.testing.pandasutils import PandasOnSparkTestCase
@ -30,19 +28,23 @@ from pyspark.testing.pandasutils import PandasOnSparkTestCase
class DatetimeOpsTest(PandasOnSparkTestCase, TestCasesUtils): class DatetimeOpsTest(PandasOnSparkTestCase, TestCasesUtils):
@property @property
def pser(self): def pser(self):
return pd.Series(pd.date_range("1994-1-31 10:30:15", periods=3, freq="M")) return pd.Series(pd.date_range("1994-1-31 10:30:15", periods=3, freq="D"))
@property @property
def psser(self): def psser(self):
return ps.from_pandas(self.pser) return ps.from_pandas(self.pser)
@property @property
def other_pser(self): def datetime_pdf(self):
return pd.Series(pd.date_range("1994-4-30 10:30:15", periods=3, freq="M")) psers = {
"this": self.pser,
"that": pd.Series(pd.date_range("1994-2-1 10:30:15", periods=3, freq="D")),
}
return pd.concat(psers, axis=1)
@property @property
def other_psser(self): def datetime_psdf(self):
return ps.from_pandas(self.other_pser) return ps.from_pandas(self.datetime_pdf)
@property @property
def some_datetime(self): def some_datetime(self):
@ -53,9 +55,8 @@ class DatetimeOpsTest(PandasOnSparkTestCase, TestCasesUtils):
self.assertRaises(TypeError, lambda: self.psser + 1) self.assertRaises(TypeError, lambda: self.psser + 1)
self.assertRaises(TypeError, lambda: self.psser + self.some_datetime) self.assertRaises(TypeError, lambda: self.psser + self.some_datetime)
with option_context("compute.ops_on_diff_frames", True): for psser in self.pssers:
for psser in self.pssers: self.assertRaises(TypeError, lambda: self.psser + psser)
self.assertRaises(TypeError, lambda: self.psser + psser)
def test_sub(self): def test_sub(self):
self.assertRaises(TypeError, lambda: self.psser - "x") self.assertRaises(TypeError, lambda: self.psser - "x")
@ -64,60 +65,62 @@ class DatetimeOpsTest(PandasOnSparkTestCase, TestCasesUtils):
(self.pser - self.some_datetime).dt.total_seconds().astype("int"), (self.pser - self.some_datetime).dt.total_seconds().astype("int"),
self.psser - self.some_datetime, self.psser - self.some_datetime,
) )
with option_context("compute.ops_on_diff_frames", True):
for pser, psser in self.pser_psser_pairs: pdf, psdf = self.pdf, self.psdf
if pser.dtype == np.dtype("<M8[ns]"): for col in self.df_cols:
self.assert_eq( if col == "datetime":
(self.pser - pser).dt.total_seconds().astype("int"), self.assert_eq(
(self.psser - psser).sort_index(), (pdf["datetime"] - pdf[col]).dt.total_seconds().astype("int"),
) psdf["datetime"] - psdf[col],
else: )
self.assertRaises(TypeError, lambda: self.psser - psser) else:
self.assertRaises(TypeError, lambda: psdf["datetime"] - psdf[col])
pdf, psdf = self.datetime_pdf, self.datetime_psdf
self.assert_eq(
(pdf["that"] - pdf["this"]).dt.total_seconds().astype("int"),
psdf["that"] - psdf["this"],
)
def test_mul(self): def test_mul(self):
self.assertRaises(TypeError, lambda: self.psser * "x") self.assertRaises(TypeError, lambda: self.psser * "x")
self.assertRaises(TypeError, lambda: self.psser * 1) self.assertRaises(TypeError, lambda: self.psser * 1)
self.assertRaises(TypeError, lambda: self.psser * self.some_datetime) self.assertRaises(TypeError, lambda: self.psser * self.some_datetime)
with option_context("compute.ops_on_diff_frames", True): for psser in self.pssers:
for psser in self.pssers: self.assertRaises(TypeError, lambda: self.psser * psser)
self.assertRaises(TypeError, lambda: self.psser * psser)
def test_truediv(self): def test_truediv(self):
self.assertRaises(TypeError, lambda: self.psser / "x") self.assertRaises(TypeError, lambda: self.psser / "x")
self.assertRaises(TypeError, lambda: self.psser / 1) self.assertRaises(TypeError, lambda: self.psser / 1)
self.assertRaises(TypeError, lambda: self.psser / self.some_datetime) self.assertRaises(TypeError, lambda: self.psser / self.some_datetime)
with option_context("compute.ops_on_diff_frames", True): for psser in self.pssers:
for psser in self.pssers: self.assertRaises(TypeError, lambda: self.psser / psser)
self.assertRaises(TypeError, lambda: self.psser / psser)
def test_floordiv(self): def test_floordiv(self):
self.assertRaises(TypeError, lambda: self.psser // "x") self.assertRaises(TypeError, lambda: self.psser // "x")
self.assertRaises(TypeError, lambda: self.psser // 1) self.assertRaises(TypeError, lambda: self.psser // 1)
self.assertRaises(TypeError, lambda: self.psser // self.some_datetime) self.assertRaises(TypeError, lambda: self.psser // self.some_datetime)
with option_context("compute.ops_on_diff_frames", True): for psser in self.pssers:
for psser in self.pssers: self.assertRaises(TypeError, lambda: self.psser // psser)
self.assertRaises(TypeError, lambda: self.psser // psser)
def test_mod(self): def test_mod(self):
self.assertRaises(TypeError, lambda: self.psser % "x") self.assertRaises(TypeError, lambda: self.psser % "x")
self.assertRaises(TypeError, lambda: self.psser % 1) self.assertRaises(TypeError, lambda: self.psser % 1)
self.assertRaises(TypeError, lambda: self.psser % self.some_datetime) self.assertRaises(TypeError, lambda: self.psser % self.some_datetime)
with option_context("compute.ops_on_diff_frames", True): for psser in self.pssers:
for psser in self.pssers: self.assertRaises(TypeError, lambda: self.psser % psser)
self.assertRaises(TypeError, lambda: self.psser % psser)
def test_pow(self): def test_pow(self):
self.assertRaises(TypeError, lambda: self.psser ** "x") self.assertRaises(TypeError, lambda: self.psser ** "x")
self.assertRaises(TypeError, lambda: self.psser ** 1) self.assertRaises(TypeError, lambda: self.psser ** 1)
self.assertRaises(TypeError, lambda: self.psser ** self.some_datetime) self.assertRaises(TypeError, lambda: self.psser ** self.some_datetime)
with option_context("compute.ops_on_diff_frames", True): for psser in self.pssers:
for psser in self.pssers: self.assertRaises(TypeError, lambda: self.psser ** psser)
self.assertRaises(TypeError, lambda: self.psser ** psser)
def test_radd(self): def test_radd(self):
self.assertRaises(TypeError, lambda: "x" + self.psser) self.assertRaises(TypeError, lambda: "x" + self.psser)
@ -202,46 +205,34 @@ class DatetimeOpsTest(PandasOnSparkTestCase, TestCasesUtils):
self.assertRaises(TypeError, lambda: ~self.psser) self.assertRaises(TypeError, lambda: ~self.psser)
def test_eq(self): def test_eq(self):
with option_context("compute.ops_on_diff_frames", True): pdf, psdf = self.datetime_pdf, self.datetime_psdf
self.assert_eq( self.assert_eq(pdf["this"] == pdf["that"], psdf["this"] == psdf["that"])
self.pser == self.other_pser, (self.psser == self.other_psser).sort_index() self.assert_eq(pdf["this"] == pdf["this"], psdf["this"] == psdf["this"])
)
self.assert_eq(self.pser == self.pser, (self.psser == self.psser).sort_index())
def test_ne(self): def test_ne(self):
with option_context("compute.ops_on_diff_frames", True): pdf, psdf = self.datetime_pdf, self.datetime_psdf
self.assert_eq( self.assert_eq(pdf["this"] != pdf["that"], psdf["this"] != psdf["that"])
self.pser != self.other_pser, (self.psser != self.other_psser).sort_index() self.assert_eq(pdf["this"] != pdf["this"], psdf["this"] != psdf["this"])
)
self.assert_eq(self.pser != self.pser, (self.psser != self.psser).sort_index())
def test_lt(self): def test_lt(self):
with option_context("compute.ops_on_diff_frames", True): pdf, psdf = self.datetime_pdf, self.datetime_psdf
self.assert_eq( self.assert_eq(pdf["this"] < pdf["that"], psdf["this"] < psdf["that"])
self.pser < self.other_pser, (self.psser < self.other_psser).sort_index() self.assert_eq(pdf["this"] < pdf["this"], psdf["this"] < psdf["this"])
)
self.assert_eq(self.pser < self.pser, (self.psser < self.psser).sort_index())
def test_le(self): def test_le(self):
with option_context("compute.ops_on_diff_frames", True): pdf, psdf = self.datetime_pdf, self.datetime_psdf
self.assert_eq( self.assert_eq(pdf["this"] <= pdf["that"], psdf["this"] <= psdf["that"])
self.pser <= self.other_pser, (self.psser <= self.other_psser).sort_index() self.assert_eq(pdf["this"] <= pdf["this"], psdf["this"] <= psdf["this"])
)
self.assert_eq(self.pser <= self.pser, (self.psser <= self.psser).sort_index())
def test_gt(self): def test_gt(self):
with option_context("compute.ops_on_diff_frames", True): pdf, psdf = self.datetime_pdf, self.datetime_psdf
self.assert_eq( self.assert_eq(pdf["this"] > pdf["that"], psdf["this"] > psdf["that"])
self.pser > self.other_pser, (self.psser > self.other_psser).sort_index() self.assert_eq(pdf["this"] > pdf["this"], psdf["this"] > psdf["this"])
)
self.assert_eq(self.pser > self.pser, (self.psser > self.psser).sort_index())
def test_ge(self): def test_ge(self):
with option_context("compute.ops_on_diff_frames", True): pdf, psdf = self.datetime_pdf, self.datetime_psdf
self.assert_eq( self.assert_eq(pdf["this"] >= pdf["that"], psdf["this"] >= psdf["that"])
self.pser >= self.other_pser, (self.psser >= self.other_psser).sort_index() self.assert_eq(pdf["this"] >= pdf["this"], psdf["this"] >= psdf["this"])
)
self.assert_eq(self.pser >= self.pser, (self.psser >= self.psser).sort_index())
if __name__ == "__main__": if __name__ == "__main__":

View file

@ -19,7 +19,6 @@ import pandas as pd
from pandas.api.types import CategoricalDtype from pandas.api.types import CategoricalDtype
import pyspark.pandas as ps import pyspark.pandas as ps
from pyspark.pandas.config import option_context
from pyspark.pandas.tests.data_type_ops.testing_utils import TestCasesUtils from pyspark.pandas.tests.data_type_ops.testing_utils import TestCasesUtils
from pyspark.testing.pandasutils import PandasOnSparkTestCase from pyspark.testing.pandasutils import PandasOnSparkTestCase
@ -37,57 +36,50 @@ class NullOpsTest(PandasOnSparkTestCase, TestCasesUtils):
self.assertRaises(TypeError, lambda: self.psser + "x") self.assertRaises(TypeError, lambda: self.psser + "x")
self.assertRaises(TypeError, lambda: self.psser + 1) self.assertRaises(TypeError, lambda: self.psser + 1)
with option_context("compute.ops_on_diff_frames", True): for psser in self.pssers:
for psser in self.pssers: self.assertRaises(TypeError, lambda: self.psser + psser)
self.assertRaises(TypeError, lambda: self.psser + psser)
def test_sub(self): def test_sub(self):
self.assertRaises(TypeError, lambda: self.psser - "x") self.assertRaises(TypeError, lambda: self.psser - "x")
self.assertRaises(TypeError, lambda: self.psser - 1) self.assertRaises(TypeError, lambda: self.psser - 1)
with option_context("compute.ops_on_diff_frames", True): for psser in self.pssers:
for psser in self.pssers: self.assertRaises(TypeError, lambda: self.psser - psser)
self.assertRaises(TypeError, lambda: self.psser - psser)
def test_mul(self): def test_mul(self):
self.assertRaises(TypeError, lambda: self.psser * "x") self.assertRaises(TypeError, lambda: self.psser * "x")
self.assertRaises(TypeError, lambda: self.psser * 1) self.assertRaises(TypeError, lambda: self.psser * 1)
with option_context("compute.ops_on_diff_frames", True): for psser in self.pssers:
for psser in self.pssers: self.assertRaises(TypeError, lambda: self.psser * psser)
self.assertRaises(TypeError, lambda: self.psser * psser)
def test_truediv(self): def test_truediv(self):
self.assertRaises(TypeError, lambda: self.psser / "x") self.assertRaises(TypeError, lambda: self.psser / "x")
self.assertRaises(TypeError, lambda: self.psser / 1) self.assertRaises(TypeError, lambda: self.psser / 1)
with option_context("compute.ops_on_diff_frames", True): for psser in self.pssers:
for psser in self.pssers: self.assertRaises(TypeError, lambda: self.psser / psser)
self.assertRaises(TypeError, lambda: self.psser / psser)
def test_floordiv(self): def test_floordiv(self):
self.assertRaises(TypeError, lambda: self.psser // "x") self.assertRaises(TypeError, lambda: self.psser // "x")
self.assertRaises(TypeError, lambda: self.psser // 1) self.assertRaises(TypeError, lambda: self.psser // 1)
with option_context("compute.ops_on_diff_frames", True): for psser in self.pssers:
for psser in self.pssers: self.assertRaises(TypeError, lambda: self.psser // psser)
self.assertRaises(TypeError, lambda: self.psser // psser)
def test_mod(self): def test_mod(self):
self.assertRaises(TypeError, lambda: self.psser % "x") self.assertRaises(TypeError, lambda: self.psser % "x")
self.assertRaises(TypeError, lambda: self.psser % 1) self.assertRaises(TypeError, lambda: self.psser % 1)
with option_context("compute.ops_on_diff_frames", True): for psser in self.pssers:
for psser in self.pssers: self.assertRaises(TypeError, lambda: self.psser % psser)
self.assertRaises(TypeError, lambda: self.psser % psser)
def test_pow(self): def test_pow(self):
self.assertRaises(TypeError, lambda: self.psser ** "x") self.assertRaises(TypeError, lambda: self.psser ** "x")
self.assertRaises(TypeError, lambda: self.psser ** 1) self.assertRaises(TypeError, lambda: self.psser ** 1)
with option_context("compute.ops_on_diff_frames", True): for psser in self.pssers:
for psser in self.pssers: self.assertRaises(TypeError, lambda: self.psser ** psser)
self.assertRaises(TypeError, lambda: self.psser ** psser)
def test_radd(self): def test_radd(self):
self.assertRaises(TypeError, lambda: "x" + self.psser) self.assertRaises(TypeError, lambda: "x" + self.psser)
@ -145,28 +137,28 @@ class NullOpsTest(PandasOnSparkTestCase, TestCasesUtils):
self.assertRaises(TypeError, lambda: ~self.psser) self.assertRaises(TypeError, lambda: ~self.psser)
def test_eq(self): def test_eq(self):
with option_context("compute.ops_on_diff_frames", True): pser, psser = self.pser, self.psser
self.assert_eq(self.pser == self.pser, (self.psser == self.psser).sort_index()) self.assert_eq(pser == pser, psser == psser)
def test_ne(self): def test_ne(self):
with option_context("compute.ops_on_diff_frames", True): pser, psser = self.pser, self.psser
self.assert_eq(self.pser != self.pser, (self.psser != self.psser).sort_index()) self.assert_eq(pser != pser, psser != psser)
def test_lt(self): def test_lt(self):
with option_context("compute.ops_on_diff_frames", True): pser, psser = self.pser, self.psser
self.assert_eq(self.pser < self.pser, (self.psser < self.psser).sort_index()) self.assert_eq(pser < pser, psser < psser)
def test_le(self): def test_le(self):
with option_context("compute.ops_on_diff_frames", True): pser, psser = self.pser, self.psser
self.assert_eq(self.pser <= self.pser, (self.psser <= self.psser).sort_index()) self.assert_eq(pser <= pser, psser <= psser)
def test_gt(self): def test_gt(self):
with option_context("compute.ops_on_diff_frames", True): pser, psser = self.pser, self.psser
self.assert_eq(self.pser > self.pser, (self.psser > self.psser).sort_index()) self.assert_eq(pser > pser, psser > psser)
def test_ge(self): def test_ge(self):
with option_context("compute.ops_on_diff_frames", True): pser, psser = self.pser, self.psser
self.assert_eq(self.pser >= self.pser, (self.psser >= self.psser).sort_index()) self.assert_eq(pser >= pser, psser >= psser)
if __name__ == "__main__": if __name__ == "__main__":

View file

@ -19,7 +19,6 @@ import pandas as pd
import pyspark.pandas as ps import pyspark.pandas as ps
from pyspark.ml.linalg import SparseVector from pyspark.ml.linalg import SparseVector
from pyspark.pandas.config import option_context
from pyspark.pandas.tests.data_type_ops.testing_utils import TestCasesUtils from pyspark.pandas.tests.data_type_ops.testing_utils import TestCasesUtils
from pyspark.testing.pandasutils import PandasOnSparkTestCase from pyspark.testing.pandasutils import PandasOnSparkTestCase
@ -34,61 +33,67 @@ class UDTOpsTest(PandasOnSparkTestCase, TestCasesUtils):
def psser(self): def psser(self):
return ps.from_pandas(self.pser) return ps.from_pandas(self.pser)
@property
def udt_pdf(self):
sparse_values = {0: 0.2, 1: 1.0}
psers = {
"this": self.pser,
"that": pd.Series([SparseVector(len(sparse_values), sparse_values)]),
}
return pd.concat(psers, axis=1)
@property
def udt_psdf(self):
return ps.from_pandas(self.udt_pdf)
def test_add(self): def test_add(self):
self.assertRaises(TypeError, lambda: self.psser + "x") self.assertRaises(TypeError, lambda: self.psser + "x")
self.assertRaises(TypeError, lambda: self.psser + 1) self.assertRaises(TypeError, lambda: self.psser + 1)
with option_context("compute.ops_on_diff_frames", True): for psser in self.pssers:
for psser in self.pssers: self.assertRaises(TypeError, lambda: self.psser + psser)
self.assertRaises(TypeError, lambda: self.psser + psser)
def test_sub(self): def test_sub(self):
self.assertRaises(TypeError, lambda: self.psser - "x") self.assertRaises(TypeError, lambda: self.psser - "x")
self.assertRaises(TypeError, lambda: self.psser - 1) self.assertRaises(TypeError, lambda: self.psser - 1)
with option_context("compute.ops_on_diff_frames", True): for psser in self.pssers:
for psser in self.pssers: self.assertRaises(TypeError, lambda: self.psser - psser)
self.assertRaises(TypeError, lambda: self.psser - psser)
def test_mul(self): def test_mul(self):
self.assertRaises(TypeError, lambda: self.psser * "x") self.assertRaises(TypeError, lambda: self.psser * "x")
self.assertRaises(TypeError, lambda: self.psser * 1) self.assertRaises(TypeError, lambda: self.psser * 1)
with option_context("compute.ops_on_diff_frames", True): for psser in self.pssers:
for psser in self.pssers: self.assertRaises(TypeError, lambda: self.psser * psser)
self.assertRaises(TypeError, lambda: self.psser * psser)
def test_truediv(self): def test_truediv(self):
self.assertRaises(TypeError, lambda: self.psser / "x") self.assertRaises(TypeError, lambda: self.psser / "x")
self.assertRaises(TypeError, lambda: self.psser / 1) self.assertRaises(TypeError, lambda: self.psser / 1)
with option_context("compute.ops_on_diff_frames", True): for psser in self.pssers:
for psser in self.pssers: self.assertRaises(TypeError, lambda: self.psser / psser)
self.assertRaises(TypeError, lambda: self.psser / psser)
def test_floordiv(self): def test_floordiv(self):
self.assertRaises(TypeError, lambda: self.psser // "x") self.assertRaises(TypeError, lambda: self.psser // "x")
self.assertRaises(TypeError, lambda: self.psser // 1) self.assertRaises(TypeError, lambda: self.psser // 1)
with option_context("compute.ops_on_diff_frames", True): for psser in self.pssers:
for psser in self.pssers: self.assertRaises(TypeError, lambda: self.psser // psser)
self.assertRaises(TypeError, lambda: self.psser // psser)
def test_mod(self): def test_mod(self):
self.assertRaises(TypeError, lambda: self.psser % "x") self.assertRaises(TypeError, lambda: self.psser % "x")
self.assertRaises(TypeError, lambda: self.psser % 1) self.assertRaises(TypeError, lambda: self.psser % 1)
with option_context("compute.ops_on_diff_frames", True): for psser in self.pssers:
for psser in self.pssers: self.assertRaises(TypeError, lambda: self.psser % psser)
self.assertRaises(TypeError, lambda: self.psser % psser)
def test_pow(self): def test_pow(self):
self.assertRaises(TypeError, lambda: self.psser ** "x") self.assertRaises(TypeError, lambda: self.psser ** "x")
self.assertRaises(TypeError, lambda: self.psser ** 1) self.assertRaises(TypeError, lambda: self.psser ** 1)
with option_context("compute.ops_on_diff_frames", True): for psser in self.pssers:
for psser in self.pssers: self.assertRaises(TypeError, lambda: self.psser ** psser)
self.assertRaises(TypeError, lambda: self.psser ** psser)
def test_radd(self): def test_radd(self):
self.assertRaises(TypeError, lambda: "x" + self.psser) self.assertRaises(TypeError, lambda: "x" + self.psser)
@ -141,12 +146,14 @@ class UDTOpsTest(PandasOnSparkTestCase, TestCasesUtils):
self.assertRaises(TypeError, lambda: ~self.psser) self.assertRaises(TypeError, lambda: ~self.psser)
def test_eq(self): def test_eq(self):
with option_context("compute.ops_on_diff_frames", True): pdf, psdf = self.udt_pdf, self.udt_psdf
self.assert_eq(self.pser == self.pser, (self.psser == self.psser).sort_index()) self.assert_eq(pdf["this"] == pdf["this"], psdf["this"] == psdf["this"])
self.assert_eq(pdf["this"] == pdf["that"], psdf["this"] == psdf["that"])
def test_ne(self): def test_ne(self):
with option_context("compute.ops_on_diff_frames", True): pdf, psdf = self.udt_pdf, self.udt_psdf
self.assert_eq(self.pser != self.pser, (self.psser != self.psser).sort_index()) self.assert_eq(pdf["this"] != pdf["this"], psdf["this"] != psdf["this"])
self.assert_eq(pdf["this"] != pdf["that"], psdf["this"] != psdf["that"])
def test_lt(self): def test_lt(self):
self.assertRaisesRegex( self.assertRaisesRegex(