[SPARK-36261][PYTHON] Add remove_unused_categories to CategoricalAccessor and CategoricalIndex

### What changes were proposed in this pull request?

Add `remove_unused_categories` to `CategoricalAccessor` and `CategoricalIndex`.

### Why are the changes needed?

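Previously, `CategoricalAccessor.remove_unused_categories` only raised `NotImplementedError`, and `CategoricalIndex.remove_unused_categories` was registered as an unsupported function. We should implement `remove_unused_categories` in both `CategoricalAccessor` and `CategoricalIndex`.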

### Does this PR introduce _any_ user-facing change?

Yes, users will be able to use `remove_unused_categories`.

### How was this patch tested?

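Added unit tests for `remove_unused_categories` on both `Series.cat` (`CategoricalAccessor`) and `CategoricalIndex`, including the `inplace=True` cases.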

Closes #33485 from ueshin/issues/SPARK-36261/remove_unused_categories.

Authored-by: Takuya UESHIN <ueshin@databricks.com>
Signed-off-by: Hyukjin Kwon <gurwls223@apache.org>
(cherry picked from commit 2fe12a7520)
Signed-off-by: Hyukjin Kwon <gurwls223@apache.org>
This commit is contained in:
Takuya UESHIN 2021-07-23 14:04:59 +09:00 committed by Hyukjin Kwon
parent f169f056b4
commit 4abc1d389e
7 changed files with 102 additions and 5 deletions

View file

@ -176,6 +176,7 @@ Categorical components
CategoricalIndex.ordered
CategoricalIndex.add_categories
CategoricalIndex.remove_categories
CategoricalIndex.remove_unused_categories
CategoricalIndex.as_ordered
CategoricalIndex.as_unordered
CategoricalIndex.rename_categories

View file

@ -401,6 +401,7 @@ the ``Series.cat`` accessor.
Series.cat.codes
Series.cat.add_categories
Series.cat.remove_categories
Series.cat.remove_unused_categories
Series.cat.as_ordered
Series.cat.as_unordered
Series.cat.rename_categories

View file

@ -439,8 +439,47 @@ class CategoricalAccessor(object):
else:
return psser
# NOTE(review): placeholder that unconditionally raises NotImplementedError; it appears to be
# the pre-change version of this method shown as the removed side of the diff hunk — confirm.
def remove_unused_categories(self) -> "ps.Series":
raise NotImplementedError()
def remove_unused_categories(self, inplace: bool = False) -> Optional["ps.Series"]:
    """
    Drop every category that none of the values in the data uses.

    The distinct values of the underlying data are collected, every declared
    category that is not among them is gathered, and the actual removal is
    delegated to ``remove_categories``.

    Parameters
    ----------
    inplace : bool, default False
        Forwarded unchanged to ``remove_categories``. When True the unused
        categories are dropped in place; otherwise a copy of this categorical
        with the unused categories dropped is returned.

    Returns
    -------
    cat : Series or None
        Categorical with unused categories dropped or None if ``inplace=True``.

    Examples
    --------
    >>> s = ps.Series(pd.Categorical(list("abbccc"), categories=['a', 'b', 'c', 'd']))
    >>> s  # doctest: +SKIP
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (4, object): ['a', 'b', 'c', 'd']

    >>> s.cat.remove_unused_categories()  # doctest: +SKIP
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (3, object): ['a', 'b', 'c']
    """
    # Distinct values that actually occur, materialised through to_pandas().
    distinct_values = self._data.drop_duplicates().to_pandas()
    used = set(distinct_values)

    # Walk the declared categories in order so the removal list keeps their ordering.
    unused = []
    for category in self.categories:
        if category not in used:
            unused.append(category)

    return self.remove_categories(removals=unused, inplace=inplace)
def rename_categories(
self, new_categories: Union[list, dict, Callable], inplace: bool = False

View file

@ -355,6 +355,37 @@ class CategoricalIndex(Index):
return CategoricalIndex(self.to_series().cat.remove_categories(removals)).rename(self.name)
def remove_unused_categories(self, inplace: bool = False) -> Optional["CategoricalIndex"]:
    """
    Drop every category that none of the index values uses.

    The index is converted to a Series, the unused categories are removed
    through its ``cat`` accessor, and the result is wrapped in a new
    ``CategoricalIndex`` that carries this index's name.

    Parameters
    ----------
    inplace : bool, default False
        Whether or not to drop unused categories inplace or return a copy of
        this categorical with unused categories dropped. Any truthy value is
        rejected for ``CategoricalIndex``.

    Returns
    -------
    cat : CategoricalIndex or None
        Categorical with unused categories dropped or None if ``inplace=True``.

    Raises
    ------
    ValueError
        If ``inplace`` is truthy.

    Examples
    --------
    >>> idx = ps.CategoricalIndex(list("abbccc"), categories=['a', 'b', 'c', 'd'])
    >>> idx  # doctest: +NORMALIZE_WHITESPACE
    CategoricalIndex(['a', 'b', 'b', 'c', 'c', 'c'],
                     categories=['a', 'b', 'c', 'd'], ordered=False, dtype='category')

    >>> idx.remove_unused_categories()  # doctest: +NORMALIZE_WHITESPACE
    CategoricalIndex(['a', 'b', 'b', 'c', 'c', 'c'],
                     categories=['a', 'b', 'c'], ordered=False, dtype='category')
    """
    if inplace:
        raise ValueError("cannot use inplace with CategoricalIndex")

    # Delegate the category pruning to the Series accessor, then rebuild the index.
    trimmed_series = self.to_series().cat.remove_unused_categories()
    rebuilt = CategoricalIndex(trimmed_series)
    return rebuilt.rename(self.name)
def __getattr__(self, item: str) -> Any:
if hasattr(MissingPandasLikeCategoricalIndex, item):
property_or_func = getattr(MissingPandasLikeCategoricalIndex, item)

View file

@ -124,9 +124,6 @@ class MissingPandasLikeCategoricalIndex(MissingPandasLikeIndex):
# Functions
# NOTE(review): each name below is bound to the result of _unsupported_function(<name>,
# cls="CategoricalIndex"); presumably this marks the method as not implemented for
# CategoricalIndex — confirm against the helper's definition, which is outside this excerpt.
reorder_categories = _unsupported_function("reorder_categories", cls="CategoricalIndex")
remove_unused_categories = _unsupported_function(
"remove_unused_categories", cls="CategoricalIndex"
)
set_categories = _unsupported_function("set_categories", cls="CategoricalIndex")
map = _unsupported_function("map", cls="CategoricalIndex")

View file

@ -122,6 +122,14 @@ class CategoricalIndexTest(PandasOnSparkTestCase, TestUtils):
self.assertRaises(ValueError, lambda: psidx.remove_categories(4))
self.assertRaises(ValueError, lambda: psidx.remove_categories([4, None]))
def test_remove_unused_categories(self):
    """Compare CategoricalIndex.remove_unused_categories with pandas and check inplace is rejected."""
    pandas_index = pd.CategoricalIndex([1, 4, 5, 3], categories=[4, 3, 2, 1])
    spark_index = ps.from_pandas(pandas_index)

    # The pandas result is the reference for the pandas-on-Spark result.
    self.assert_eq(
        pandas_index.remove_unused_categories(),
        spark_index.remove_unused_categories(),
    )

    # inplace=True must raise ValueError on the pandas-on-Spark index.
    with self.assertRaises(ValueError):
        spark_index.remove_unused_categories(inplace=True)
def test_as_ordered_unordered(self):
pidx = pd.CategoricalIndex(["x", "y", "z"], categories=["z", "y", "x"])
psidx = ps.from_pandas(pidx)

View file

@ -121,6 +121,26 @@ class CategoricalTest(PandasOnSparkTestCase, TestUtils):
self.assertRaises(ValueError, lambda: psser.cat.remove_categories(4))
self.assertRaises(ValueError, lambda: psser.cat.remove_categories([4, None]))
def test_remove_unused_categories(self):
    """Compare Series.cat.remove_unused_categories with pandas, both as a copy and in place."""
    pandas_frame, spark_frame = self.df_pair
    expected = pandas_frame.a
    actual = spark_frame.a

    # Baseline comparison on the untouched column.
    self.assert_eq(
        expected.cat.remove_unused_categories(), actual.cat.remove_unused_categories()
    )

    # Apply the same in-place category changes to the pandas series first, then
    # to the pandas-on-Spark one: add category 4, remove category 2.
    for series in (expected, actual):
        series.cat.add_categories(4, inplace=True)
        series.cat.remove_categories(2, inplace=True)

    self.assert_eq(
        expected.cat.remove_unused_categories(), actual.cat.remove_unused_categories()
    )

    # In-place variant: compare the mutated series and their parent frames.
    for series in (expected, actual):
        series.cat.remove_unused_categories(inplace=True)

    self.assert_eq(expected, actual)
    self.assert_eq(pandas_frame, spark_frame)
def test_as_ordered_unordered(self):
pdf, psdf = self.df_pair