[SPARK-36818][PYTHON] Fix filtering a Series by a boolean Series

### What changes were proposed in this pull request? Fix filtering a Series (without a name) by a boolean Series. ### Why are the changes needed? A bugfix. The issue is raised as https://github.com/databricks/koalas/issues/2199. ### Does this PR introduce _any_ user-facing change? Yes. #### From ```py >>> psser = ps.Series([0, 1, 2, 3, 4]) >>> ps.set_option('compute.ops_on_diff_frames', True) >>> psser.loc[ps.Series([True, True, True, False, False])] Traceback (most recent call last): ... KeyError: 'none key' ``` #### To ```py >>> psser = ps.Series([0, 1, 2, 3, 4]) >>> ps.set_option('compute.ops_on_diff_frames', True) >>> psser.loc[ps.Series([True, True, True, False, False])] 0 0 1 1 2 2 dtype: int64 ``` ### How was this patch tested? Unit test. Closes #34061 from xinrong-databricks/filter_series. Authored-by: Xinrong Meng <xinrong.meng@databricks.com> Signed-off-by: Takuya UESHIN <ueshin@databricks.com>
2021-09-22 12:52:56 -07:00 · 2021-09-22 12:52:56 -07:00 · 6a5ee0283c
parent a7cbe69986
commit 6a5ee0283c
3 changed files with 19 additions and 2 deletions
--- a/python/pyspark/pandas/indexing.py
+++ b/python/pyspark/pandas/indexing.py
@ -33,6 +33,7 @@ import numpy as np
 from pyspark import pandas as ps  # noqa: F401
 from pyspark.pandas._typing import Label, Name, Scalar
 from pyspark.pandas.internal import (
+    DEFAULT_SERIES_NAME,
    InternalField,
    InternalFrame,
    NATURAL_ORDER_COLUMN_NAME,
@ -435,11 +436,12 @@ class LocIndexerLike(IndexerLike, metaclass=ABCMeta):

        if self._is_series:
            if isinstance(key, Series) and not same_anchor(key, self._psdf_or_psser):
-                psdf = self._psdf_or_psser.to_frame()
+                name = self._psdf_or_psser.name or DEFAULT_SERIES_NAME
+                psdf = self._psdf_or_psser.to_frame(name)
                temp_col = verify_temp_column_name(psdf, "__temp_col__")

                psdf[temp_col] = key
-                return type(self)(psdf[self._psdf_or_psser.name])[psdf[temp_col]]
+                return type(self)(psdf[name].rename(self._psdf_or_psser.name))[psdf[temp_col]]

            cond, limit, remaining_index = self._select_rows(key)
            if cond is None and limit is None:
--- a/python/pyspark/pandas/tests/test_indexing.py
+++ b/python/pyspark/pandas/tests/test_indexing.py
@ -417,6 +417,15 @@ class IndexingTest(PandasOnSparkTestCase):
        self.assertRaises(KeyError, lambda: psdf.loc[0:30])
        self.assertRaises(KeyError, lambda: psdf.loc[10:100])

+    def test_loc_getitem_boolean_series(self):
+        pdf = pd.DataFrame(
+            {"A": [0, 1, 2, 3, 4], "B": [100, 200, 300, 400, 500]}, index=[20, 10, 30, 0, 50]
+        )
+        psdf = ps.from_pandas(pdf)
+        self.assert_eq(pdf.A.loc[pdf.B > 200], psdf.A.loc[psdf.B > 200])
+        self.assert_eq(pdf.B.loc[pdf.B > 200], psdf.B.loc[psdf.B > 200])
+        self.assert_eq(pdf.loc[pdf.B > 200], psdf.loc[psdf.B > 200])
+
    def test_loc_non_informative_index(self):
        pdf = pd.DataFrame({"x": [1, 2, 3, 4]}, index=[10, 20, 30, 40])
        psdf = ps.from_pandas(pdf)
--- a/python/pyspark/pandas/tests/test_ops_on_diff_frames.py
+++ b/python/pyspark/pandas/tests/test_ops_on_diff_frames.py
@ -503,6 +503,12 @@ class OpsOnDiffFramesEnabledTest(PandasOnSparkTestCase, SQLTestUtils):
            (pdf1.A + 1).loc[pdf2.A > -3].sort_index(), (psdf1.A + 1).loc[psdf2.A > -3].sort_index()
        )

+        pser = pd.Series([0, 1, 2, 3, 4], index=[20, 10, 30, 0, 50])
+        psser = ps.from_pandas(pser)
+        self.assert_eq(pser.loc[pdf2.A > -3].sort_index(), psser.loc[psdf2.A > -3].sort_index())
+        pser.name = psser.name = "B"
+        self.assert_eq(pser.loc[pdf2.A > -3].sort_index(), psser.loc[psdf2.A > -3].sort_index())
+
    def test_bitwise(self):
        pser1 = pd.Series([True, False, True, False, np.nan, np.nan, True, False, np.nan])
        pser2 = pd.Series([True, False, False, True, True, False, np.nan, np.nan, np.nan])