From dcc0aaa3efb2d441b2dfadb0c64dbc28ee197de5 Mon Sep 17 00:00:00 2001 From: Takuya UESHIN Date: Wed, 21 Jul 2021 22:34:04 -0700 Subject: [PATCH] [SPARK-36214][PYTHON] Add add_categories to CategoricalAccessor and CategoricalIndex ### What changes were proposed in this pull request? Add `add_categories` to `CategoricalAccessor` and `CategoricalIndex`. ### Why are the changes needed? We should implement `add_categories` in `CategoricalAccessor` and `CategoricalIndex`. ### Does this PR introduce _any_ user-facing change? Yes, users will be able to use `add_categories`. ### How was this patch tested? Added some tests. Closes #33470 from ueshin/issues/SPARK-36214/add_categories. Authored-by: Takuya UESHIN Signed-off-by: Takuya UESHIN --- .../reference/pyspark.pandas/indexing.rst | 1 + .../reference/pyspark.pandas/series.rst | 1 + python/pyspark/pandas/categorical.py | 84 ++++++++++++++++++- python/pyspark/pandas/indexes/category.py | 46 ++++++++++ python/pyspark/pandas/missing/indexes.py | 1 - .../pandas/tests/indexes/test_category.py | 12 +++ .../pyspark/pandas/tests/test_categorical.py | 18 ++++ 7 files changed, 158 insertions(+), 5 deletions(-) diff --git a/python/docs/source/reference/pyspark.pandas/indexing.rst b/python/docs/source/reference/pyspark.pandas/indexing.rst index 4f84d91061..b0b4cdd221 100644 --- a/python/docs/source/reference/pyspark.pandas/indexing.rst +++ b/python/docs/source/reference/pyspark.pandas/indexing.rst @@ -175,6 +175,7 @@ Categorical components CategoricalIndex.codes CategoricalIndex.categories CategoricalIndex.ordered + CategoricalIndex.add_categories CategoricalIndex.as_ordered CategoricalIndex.as_unordered diff --git a/python/docs/source/reference/pyspark.pandas/series.rst b/python/docs/source/reference/pyspark.pandas/series.rst index b718d79e81..6243a22454 100644 --- a/python/docs/source/reference/pyspark.pandas/series.rst +++ b/python/docs/source/reference/pyspark.pandas/series.rst @@ -401,6 +401,7 @@ the ``Series.cat`` accessor. Series.cat.categories Series.cat.ordered Series.cat.codes + Series.cat.add_categories Series.cat.as_ordered Series.cat.as_unordered diff --git a/python/pyspark/pandas/categorical.py b/python/pyspark/pandas/categorical.py index aeba20d510..a83c3c741c 100644 --- a/python/pyspark/pandas/categorical.py +++ b/python/pyspark/pandas/categorical.py @@ -14,10 +14,10 @@ # See the License for the specific language governing permissions and # limitations under the License. # -from typing import List, Optional, Union, TYPE_CHECKING, cast +from typing import Any, List, Optional, Union, TYPE_CHECKING, cast import pandas as pd -from pandas.api.types import CategoricalDtype +from pandas.api.types import CategoricalDtype, is_list_like from pyspark.pandas.internal import InternalField from pyspark.sql.types import StructField @@ -165,8 +165,84 @@ class CategoricalAccessor(object): ), ).rename() - def add_categories(self, new_categories: pd.Index, inplace: bool = False) -> "ps.Series": - raise NotImplementedError() + def add_categories( + self, new_categories: Union[pd.Index, Any, List], inplace: bool = False + ) -> Optional["ps.Series"]: + """ + Add new categories. + + `new_categories` will be included at the last/highest place in the + categories and will be unused directly after this call. + + Parameters + ---------- + new_categories : category or list-like of category + The new categories to be included. + inplace : bool, default False + Whether or not to add the categories inplace or return a copy of + this categorical with added categories. + + Returns + ------- + Series or None + Categorical with new categories added or None if ``inplace=True``. + + Raises + ------ + ValueError + If the new categories include old categories or do not validate as + categories + + Examples + -------- + >>> s = ps.Series(list("abbccc"), dtype="category") + >>> s # doctest: +SKIP + 0 a + 1 b + 2 b + 3 c + 4 c + 5 c + dtype: category + Categories (3, object): ['a', 'b', 'c'] + + >>> s.cat.add_categories('x') # doctest: +SKIP + 0 a + 1 b + 2 b + 3 c + 4 c + 5 c + dtype: category + Categories (4, object): ['a', 'b', 'c', 'x'] + """ + from pyspark.pandas.frame import DataFrame + + if is_list_like(new_categories): + categories = list(new_categories) # type: List + else: + categories = [new_categories] + + if any(cat in self.categories for cat in categories): + raise ValueError( + "new categories must not include old categories: {{{cats}}}".format( + cats=", ".join(set(str(cat) for cat in categories if cat in self.categories)) + ) + ) + + internal = self._data._psdf._internal.with_new_spark_column( + self._data._column_label, + self._data.spark.column, + field=self._data._internal.data_fields[0].copy( + dtype=CategoricalDtype(list(self.categories) + categories, ordered=self.ordered) + ), + ) + if inplace: + self._data._psdf._update_internal_frame(internal) + return None + else: + psser = DataFrame(internal)._psser_for(self._data._column_label) + return psser._with_new_scol(psser.spark.column, field=psser._internal.data_fields[0]) def _set_ordered(self, *, ordered: bool, inplace: bool) -> Optional["ps.Series"]: from pyspark.pandas.frame import DataFrame diff --git a/python/pyspark/pandas/indexes/category.py b/python/pyspark/pandas/indexes/category.py index 1b6588646f..308043e353 100644 --- a/python/pyspark/pandas/indexes/category.py +++ b/python/pyspark/pandas/indexes/category.py @@ -204,6 +204,52 @@ class CategoricalIndex(Index): """ return self.dtype.ordered + def add_categories( + self, new_categories: Union[pd.Index, Any, List], inplace: bool = False + ) -> Optional["CategoricalIndex"]: + """ + Add new categories. + + `new_categories` will be included at the last/highest place in the + categories and will be unused directly after this call. + + Parameters + ---------- + new_categories : category or list-like of category + The new categories to be included. + inplace : bool, default False + Whether or not to add the categories inplace or return a copy of + this categorical with added categories. + + Returns + ------- + CategoricalIndex or None + Categorical with new categories added or None if ``inplace=True``. + + Raises + ------ + ValueError + If the new categories include old categories or do not validate as + categories + + Examples + -------- + >>> idx = ps.CategoricalIndex(list("abbccc")) + >>> idx # doctest: +NORMALIZE_WHITESPACE + CategoricalIndex(['a', 'b', 'b', 'c', 'c', 'c'], + categories=['a', 'b', 'c'], ordered=False, dtype='category') + + >>> idx.add_categories('x') # doctest: +NORMALIZE_WHITESPACE + CategoricalIndex(['a', 'b', 'b', 'c', 'c', 'c'], + categories=['a', 'b', 'c', 'x'], ordered=False, dtype='category') + """ + if inplace: + raise ValueError("cannot use inplace with CategoricalIndex") + + return CategoricalIndex( + self.to_series().cat.add_categories(new_categories=new_categories) + ).rename(self.name) + def as_ordered(self, inplace: bool = False) -> Optional["CategoricalIndex"]: """ Set the Categorical to be ordered. diff --git a/python/pyspark/pandas/missing/indexes.py b/python/pyspark/pandas/missing/indexes.py index 6ca564f925..2a5a4c9087 100644 --- a/python/pyspark/pandas/missing/indexes.py +++ b/python/pyspark/pandas/missing/indexes.py @@ -125,7 +125,6 @@ class MissingPandasLikeCategoricalIndex(MissingPandasLikeIndex): # Functions rename_categories = _unsupported_function("rename_categories", cls="CategoricalIndex") reorder_categories = _unsupported_function("reorder_categories", cls="CategoricalIndex") - add_categories = _unsupported_function("add_categories", cls="CategoricalIndex") remove_categories = _unsupported_function("remove_categories", cls="CategoricalIndex") remove_unused_categories = _unsupported_function( "remove_unused_categories", cls="CategoricalIndex" diff --git a/python/pyspark/pandas/tests/indexes/test_category.py b/python/pyspark/pandas/tests/indexes/test_category.py index d04f89684e..44e270348e 100644 --- a/python/pyspark/pandas/tests/indexes/test_category.py +++ b/python/pyspark/pandas/tests/indexes/test_category.py @@ -94,6 +94,18 @@ class CategoricalIndexTest(PandasOnSparkTestCase, TestUtils): with self.assertRaises(ValueError): psidx.categories = [1, 2, 3, 4] + def test_add_categories(self): + pidx = pd.CategoricalIndex([1, 2, 3], categories=[3, 2, 1]) + psidx = ps.from_pandas(pidx) + + self.assert_eq(pidx.add_categories(4), psidx.add_categories(4)) + self.assert_eq(pidx.add_categories([4, 5]), psidx.add_categories([4, 5])) + self.assert_eq(pidx.add_categories([]), psidx.add_categories([])) + + self.assertRaises(ValueError, lambda: psidx.add_categories(4, inplace=True)) + self.assertRaises(ValueError, lambda: psidx.add_categories(3)) + self.assertRaises(ValueError, lambda: psidx.add_categories([4, 4])) + def test_as_ordered_unordered(self): pidx = pd.CategoricalIndex(["x", "y", "z"], categories=["z", "y", "x"]) psidx = ps.from_pandas(pidx) diff --git a/python/pyspark/pandas/tests/test_categorical.py b/python/pyspark/pandas/tests/test_categorical.py index fb0561d560..1af03d69df 100644 --- a/python/pyspark/pandas/tests/test_categorical.py +++ b/python/pyspark/pandas/tests/test_categorical.py @@ -79,6 +79,24 @@ class CategoricalTest(PandasOnSparkTestCase, TestUtils): with self.assertRaises(ValueError): psser.cat.categories = [1, 2, 3, 4] + def test_add_categories(self): + pdf, psdf = self.df_pair + + pser = pdf.a + psser = psdf.a + + self.assert_eq(pser.cat.add_categories(4), psser.cat.add_categories(4)) + self.assert_eq(pser.cat.add_categories([4, 5]), psser.cat.add_categories([4, 5])) + self.assert_eq(pser.cat.add_categories([]), psser.cat.add_categories([])) + + pser.cat.add_categories(4, inplace=True) + psser.cat.add_categories(4, inplace=True) + self.assert_eq(pser, psser) + self.assert_eq(pdf, psdf) + + self.assertRaises(ValueError, lambda: psser.cat.add_categories(4)) + self.assertRaises(ValueError, lambda: psser.cat.add_categories([5, 5])) + def test_as_ordered_unordered(self): pdf, psdf = self.df_pair