[SPARK-36261][PYTHON] Add remove_unused_categories to CategoricalAccessor and CategoricalIndex
### What changes were proposed in this pull request?
Add `remove_unused_categories` to `CategoricalAccessor` and `CategoricalIndex`.
### Why are the changes needed?
We should implement `remove_unused_categories` in `CategoricalAccessor` and `CategoricalIndex`.
### Does this PR introduce _any_ user-facing change?
Yes, users will be able to use `remove_unused_categories`.
### How was this patch tested?
Added some tests.
Closes #33485 from ueshin/issues/SPARK-36261/remove_unused_categories.
Authored-by: Takuya UESHIN <ueshin@databricks.com>
Signed-off-by: Hyukjin Kwon <gurwls223@apache.org>
(cherry picked from commit 2fe12a7520)
Signed-off-by: Hyukjin Kwon <gurwls223@apache.org>
This commit is contained in:
parent
f169f056b4
commit
4abc1d389e
|
@ -176,6 +176,7 @@ Categorical components
|
|||
CategoricalIndex.ordered
|
||||
CategoricalIndex.add_categories
|
||||
CategoricalIndex.remove_categories
|
||||
CategoricalIndex.remove_unused_categories
|
||||
CategoricalIndex.as_ordered
|
||||
CategoricalIndex.as_unordered
|
||||
CategoricalIndex.rename_categories
|
||||
|
|
|
@ -401,6 +401,7 @@ the ``Series.cat`` accessor.
|
|||
Series.cat.codes
|
||||
Series.cat.add_categories
|
||||
Series.cat.remove_categories
|
||||
Series.cat.remove_unused_categories
|
||||
Series.cat.as_ordered
|
||||
Series.cat.as_unordered
|
||||
Series.cat.rename_categories
|
||||
|
|
|
@ -439,8 +439,47 @@ class CategoricalAccessor(object):
|
|||
else:
|
||||
return psser
|
||||
|
||||
def remove_unused_categories(self) -> "ps.Series":
    """
    Placeholder for removing unused categories.

    Raises
    ------
    NotImplementedError
        Always; this variant carries no implementation.
    """
    raise NotImplementedError()
|
||||
def remove_unused_categories(self, inplace: bool = False) -> Optional["ps.Series"]:
    """
    Drop every category that does not occur among the values.

    Parameters
    ----------
    inplace : bool, default False
        Whether to drop the unused categories on this categorical itself
        instead of returning a copy with the unused categories dropped.

    Returns
    -------
    cat : Series or None
        The categorical without its unused categories, or None if
        ``inplace=True``.

    Examples
    --------
    >>> s = ps.Series(pd.Categorical(list("abbccc"), categories=['a', 'b', 'c', 'd']))
    >>> s  # doctest: +SKIP
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (4, object): ['a', 'b', 'c', 'd']

    >>> s.cat.remove_unused_categories()  # doctest: +SKIP
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (3, object): ['a', 'b', 'c']
    """
    # Only the de-duplicated values are converted to pandas; a set gives
    # cheap membership checks while scanning the declared categories.
    distinct_values = self._data.drop_duplicates().to_pandas()
    used = set(distinct_values)

    # Declared categories that never show up in the data, in declaration order.
    unused = [category for category in self.categories if category not in used]

    # Delegate the actual removal (and the handling of ``inplace``).
    return self.remove_categories(removals=unused, inplace=inplace)
|
||||
|
||||
def rename_categories(
|
||||
self, new_categories: Union[list, dict, Callable], inplace: bool = False
|
||||
|
|
|
@ -355,6 +355,37 @@ class CategoricalIndex(Index):
|
|||
|
||||
return CategoricalIndex(self.to_series().cat.remove_categories(removals)).rename(self.name)
|
||||
|
||||
def remove_unused_categories(self, inplace: bool = False) -> Optional["CategoricalIndex"]:
    """
    Drop every category that does not occur among the index values.

    Parameters
    ----------
    inplace : bool, default False
        Whether to drop the unused categories on this categorical itself
        instead of returning a copy with the unused categories dropped.
        Passing True is rejected for CategoricalIndex.

    Returns
    -------
    cat : CategoricalIndex or None
        The categorical without its unused categories, or None if
        ``inplace=True``.

    Raises
    ------
    ValueError
        If ``inplace`` is true.

    Examples
    --------
    >>> idx = ps.CategoricalIndex(list("abbccc"), categories=['a', 'b', 'c', 'd'])
    >>> idx  # doctest: +NORMALIZE_WHITESPACE
    CategoricalIndex(['a', 'b', 'b', 'c', 'c', 'c'],
                     categories=['a', 'b', 'c', 'd'], ordered=False, dtype='category')

    >>> idx.remove_unused_categories()  # doctest: +NORMALIZE_WHITESPACE
    CategoricalIndex(['a', 'b', 'b', 'c', 'c', 'c'],
                     categories=['a', 'b', 'c'], ordered=False, dtype='category')
    """
    if inplace:
        raise ValueError("cannot use inplace with CategoricalIndex")

    # Reuse the Series accessor implementation, then wrap the result back
    # into an index that keeps the original name.
    pruned = self.to_series().cat.remove_unused_categories()
    return CategoricalIndex(pruned).rename(self.name)
|
||||
|
||||
def __getattr__(self, item: str) -> Any:
|
||||
if hasattr(MissingPandasLikeCategoricalIndex, item):
|
||||
property_or_func = getattr(MissingPandasLikeCategoricalIndex, item)
|
||||
|
|
|
@ -124,9 +124,6 @@ class MissingPandasLikeCategoricalIndex(MissingPandasLikeIndex):
|
|||
|
||||
# Functions
|
||||
reorder_categories = _unsupported_function("reorder_categories", cls="CategoricalIndex")
|
||||
remove_unused_categories = _unsupported_function(
|
||||
"remove_unused_categories", cls="CategoricalIndex"
|
||||
)
|
||||
set_categories = _unsupported_function("set_categories", cls="CategoricalIndex")
|
||||
map = _unsupported_function("map", cls="CategoricalIndex")
|
||||
|
||||
|
|
|
@ -122,6 +122,14 @@ class CategoricalIndexTest(PandasOnSparkTestCase, TestUtils):
|
|||
self.assertRaises(ValueError, lambda: psidx.remove_categories(4))
|
||||
self.assertRaises(ValueError, lambda: psidx.remove_categories([4, None]))
|
||||
|
||||
def test_remove_unused_categories(self):
    """Compare CategoricalIndex.remove_unused_categories with pandas.

    The fixture declares category 2 without using it, and contains the
    value 5, which is not among the declared categories.
    """
    pandas_index = pd.CategoricalIndex([1, 4, 5, 3], categories=[4, 3, 2, 1])
    spark_index = ps.from_pandas(pandas_index)

    expected = pandas_index.remove_unused_categories()
    self.assert_eq(expected, spark_index.remove_unused_categories())

    # In-place modification is rejected for CategoricalIndex.
    with self.assertRaises(ValueError):
        spark_index.remove_unused_categories(inplace=True)
|
||||
|
||||
def test_as_ordered_unordered(self):
|
||||
pidx = pd.CategoricalIndex(["x", "y", "z"], categories=["z", "y", "x"])
|
||||
psidx = ps.from_pandas(pidx)
|
||||
|
|
|
@ -121,6 +121,26 @@ class CategoricalTest(PandasOnSparkTestCase, TestUtils):
|
|||
self.assertRaises(ValueError, lambda: psser.cat.remove_categories(4))
|
||||
self.assertRaises(ValueError, lambda: psser.cat.remove_categories([4, None]))
|
||||
|
||||
def test_remove_unused_categories(self):
    """Compare Series.cat.remove_unused_categories with pandas.

    Covers the untouched fixture, the fixture after category 4 is added and
    category 2 removed, and finally the ``inplace=True`` variant, which must
    also be reflected in the parent DataFrames.
    """
    pdf, psdf = self.df_pair
    pandas_ser, spark_ser = pdf.a, psdf.a

    self.assert_eq(
        pandas_ser.cat.remove_unused_categories(), spark_ser.cat.remove_unused_categories()
    )

    # Apply the same category edits to both sides, pandas first.
    for ser in (pandas_ser, spark_ser):
        ser.cat.add_categories(4, inplace=True)
        ser.cat.remove_categories(2, inplace=True)

    self.assert_eq(
        pandas_ser.cat.remove_unused_categories(), spark_ser.cat.remove_unused_categories()
    )

    # The in-place variant must update the series and their DataFrames.
    pandas_ser.cat.remove_unused_categories(inplace=True)
    spark_ser.cat.remove_unused_categories(inplace=True)
    self.assert_eq(pandas_ser, spark_ser)
    self.assert_eq(pdf, psdf)
|
||||
|
||||
def test_as_ordered_unordered(self):
|
||||
pdf, psdf = self.df_pair
|
||||
|
||||
|
|
Loading…
Reference in a new issue