[SPARK-36261][PYTHON] Add remove_unused_categories to CategoricalAccessor and CategoricalIndex

### What changes were proposed in this pull request?

Add `remove_unused_categories` to `CategoricalAccessor` and `CategoricalIndex`.

### Why are the changes needed?

We should implement `remove_unused_categories` in `CategoricalAccessor` and `CategoricalIndex`.

### Does this PR introduce _any_ user-facing change?

Yes, users will be able to use `remove_unused_categories`.

### How was this patch tested?

Added some tests.

Closes #33485 from ueshin/issues/SPARK-36261/remove_unused_categories.

Authored-by: Takuya UESHIN <ueshin@databricks.com>
Signed-off-by: Hyukjin Kwon <gurwls223@apache.org>
(cherry picked from commit 2fe12a7520)
Signed-off-by: Hyukjin Kwon <gurwls223@apache.org>
2021-07-23 14:04:59 +09:00 · 2021-07-23 14:04:59 +09:00 · 4abc1d389e
parent f169f056b4
commit 4abc1d389e
7 changed files with 102 additions and 5 deletions
--- a/python/docs/source/reference/pyspark.pandas/indexing.rst
+++ b/python/docs/source/reference/pyspark.pandas/indexing.rst
@@ -176,6 +176,7 @@ Categorical components
   CategoricalIndex.ordered
   CategoricalIndex.add_categories
   CategoricalIndex.remove_categories
+   CategoricalIndex.remove_unused_categories
   CategoricalIndex.as_ordered
   CategoricalIndex.as_unordered
   CategoricalIndex.rename_categories
--- a/python/docs/source/reference/pyspark.pandas/series.rst
+++ b/python/docs/source/reference/pyspark.pandas/series.rst
@@ -401,6 +401,7 @@ the ``Series.cat`` accessor.
   Series.cat.codes
   Series.cat.add_categories
   Series.cat.remove_categories
+   Series.cat.remove_unused_categories
   Series.cat.as_ordered
   Series.cat.as_unordered
   Series.cat.rename_categories
--- a/python/pyspark/pandas/categorical.py
+++ b/python/pyspark/pandas/categorical.py
@@ -439,8 +439,47 @@ class CategoricalAccessor(object):
            else:
                return psser

-    def remove_unused_categories(self) -> "ps.Series":
-        raise NotImplementedError()
+    def remove_unused_categories(self, inplace: bool = False) -> Optional["ps.Series"]:
+        """
+        Remove categories which are not used.
+
+        Parameters
+        ----------
+        inplace : bool, default False
+           Whether or not to drop unused categories inplace or return a copy of
+           this categorical with unused categories dropped.
+
+        Returns
+        -------
+        cat : Series or None
+            Categorical with unused categories dropped or None if ``inplace=True``.
+
+        Examples
+        --------
+        >>> s = ps.Series(pd.Categorical(list("abbccc"), categories=['a', 'b', 'c', 'd']))
+        >>> s  # doctest: +SKIP
+        0    a
+        1    b
+        2    b
+        3    c
+        4    c
+        5    c
+        dtype: category
+        Categories (4, object): ['a', 'b', 'c', 'd']
+
+        >>> s.cat.remove_unused_categories()  # doctest: +SKIP
+        0    a
+        1    b
+        2    b
+        3    c
+        4    c
+        5    c
+        dtype: category
+        Categories (3, object): ['a', 'b', 'c']
+        """
+        categories = set(self._data.drop_duplicates().to_pandas())
+        removals = [cat for cat in self.categories if cat not in categories]
+        return self.remove_categories(removals=removals, inplace=inplace)

    def rename_categories(
        self, new_categories: Union[list, dict, Callable], inplace: bool = False
--- a/python/pyspark/pandas/indexes/category.py
+++ b/python/pyspark/pandas/indexes/category.py
@@ -355,6 +355,37 @@ class CategoricalIndex(Index):

        return CategoricalIndex(self.to_series().cat.remove_categories(removals)).rename(self.name)

+    def remove_unused_categories(self, inplace: bool = False) -> Optional["CategoricalIndex"]:
+        """
+        Remove categories which are not used.
+
+        Parameters
+        ----------
+        inplace : bool, default False
+           Whether or not to drop unused categories inplace or return a copy of
+           this categorical with unused categories dropped.
+
+        Returns
+        -------
+        cat : CategoricalIndex or None
+            Categorical with unused categories dropped or None if ``inplace=True``.
+
+        Examples
+        --------
+        >>> idx = ps.CategoricalIndex(list("abbccc"), categories=['a', 'b', 'c', 'd'])
+        >>> idx  # doctest: +NORMALIZE_WHITESPACE
+        CategoricalIndex(['a', 'b', 'b', 'c', 'c', 'c'],
+                         categories=['a', 'b', 'c', 'd'], ordered=False, dtype='category')
+
+        >>> idx.remove_unused_categories()  # doctest: +NORMALIZE_WHITESPACE
+        CategoricalIndex(['a', 'b', 'b', 'c', 'c', 'c'],
+                         categories=['a', 'b', 'c'], ordered=False, dtype='category')
+        """
+        if inplace:
+            raise ValueError("cannot use inplace with CategoricalIndex")
+
+        return CategoricalIndex(self.to_series().cat.remove_unused_categories()).rename(self.name)
+
    def __getattr__(self, item: str) -> Any:
        if hasattr(MissingPandasLikeCategoricalIndex, item):
            property_or_func = getattr(MissingPandasLikeCategoricalIndex, item)
--- a/python/pyspark/pandas/missing/indexes.py
+++ b/python/pyspark/pandas/missing/indexes.py
@@ -124,9 +124,6 @@ class MissingPandasLikeCategoricalIndex(MissingPandasLikeIndex):

    # Functions
    reorder_categories = _unsupported_function("reorder_categories", cls="CategoricalIndex")
-    remove_unused_categories = _unsupported_function(
-        "remove_unused_categories", cls="CategoricalIndex"
-    )
    set_categories = _unsupported_function("set_categories", cls="CategoricalIndex")
    map = _unsupported_function("map", cls="CategoricalIndex")

--- a/python/pyspark/pandas/tests/indexes/test_category.py
+++ b/python/pyspark/pandas/tests/indexes/test_category.py
@@ -122,6 +122,14 @@ class CategoricalIndexTest(PandasOnSparkTestCase, TestUtils):
        self.assertRaises(ValueError, lambda: psidx.remove_categories(4))
        self.assertRaises(ValueError, lambda: psidx.remove_categories([4, None]))

+    def test_remove_unused_categories(self):
+        pidx = pd.CategoricalIndex([1, 4, 5, 3], categories=[4, 3, 2, 1])
+        psidx = ps.from_pandas(pidx)
+
+        self.assert_eq(pidx.remove_unused_categories(), psidx.remove_unused_categories())
+
+        self.assertRaises(ValueError, lambda: psidx.remove_unused_categories(inplace=True))
+
    def test_as_ordered_unordered(self):
        pidx = pd.CategoricalIndex(["x", "y", "z"], categories=["z", "y", "x"])
        psidx = ps.from_pandas(pidx)
--- a/python/pyspark/pandas/tests/test_categorical.py
+++ b/python/pyspark/pandas/tests/test_categorical.py
@@ -121,6 +121,26 @@ class CategoricalTest(PandasOnSparkTestCase, TestUtils):
        self.assertRaises(ValueError, lambda: psser.cat.remove_categories(4))
        self.assertRaises(ValueError, lambda: psser.cat.remove_categories([4, None]))

+    def test_remove_unused_categories(self):
+        pdf, psdf = self.df_pair
+
+        pser = pdf.a
+        psser = psdf.a
+
+        self.assert_eq(pser.cat.remove_unused_categories(), psser.cat.remove_unused_categories())
+
+        pser.cat.add_categories(4, inplace=True)
+        pser.cat.remove_categories(2, inplace=True)
+        psser.cat.add_categories(4, inplace=True)
+        psser.cat.remove_categories(2, inplace=True)
+
+        self.assert_eq(pser.cat.remove_unused_categories(), psser.cat.remove_unused_categories())
+
+        pser.cat.remove_unused_categories(inplace=True)
+        psser.cat.remove_unused_categories(inplace=True)
+        self.assert_eq(pser, psser)
+        self.assert_eq(pdf, psdf)
+
    def test_as_ordered_unordered(self):
        pdf, psdf = self.df_pair