#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import TYPE_CHECKING

import pandas as pd
from pandas.api.types import CategoricalDtype

if TYPE_CHECKING:
    import pyspark.pandas as ps  # noqa: F401 (SPARK-34943)


class CategoricalAccessor(object):
    """
    Accessor object for categorical properties of the Series values.

    Examples
    --------
    >>> s = ps.Series(list("abbccc"), dtype="category")
    >>> s  # doctest: +SKIP
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (3, object): ['a', 'b', 'c']

    >>> s.cat.categories
    Index(['a', 'b', 'c'], dtype='object')

    >>> s.cat.codes
    0    0
    1    1
    2    1
    3    2
    4    2
    5    2
    dtype: int8
    """

    def __init__(self, series: "ps.Series"):
        if not isinstance(series.dtype, CategoricalDtype):
            raise ValueError("Cannot call CategoricalAccessor on type {}".format(series.dtype))
        self._data = series

    @property
    def categories(self) -> pd.Index:
        """
        The categories of this categorical.

        Examples
        --------
        >>> s = ps.Series(list("abbccc"), dtype="category")
        >>> s  # doctest: +SKIP
        0    a
        1    b
        2    b
        3    c
        4    c
        5    c
        dtype: category
        Categories (3, object): ['a', 'b', 'c']

        >>> s.cat.categories
        Index(['a', 'b', 'c'], dtype='object')
        """
        return self._data.dtype.categories

    @categories.setter
    def categories(self, categories: pd.Index) -> None:
        raise NotImplementedError()

    @property
    def ordered(self) -> bool:
        """
        Whether the categories have an ordered relationship.

        Examples
        --------
        >>> s = ps.Series(list("abbccc"), dtype="category")
        >>> s  # doctest: +SKIP
        0    a
        1    b
        2    b
        3    c
        4    c
        5    c
        dtype: category
        Categories (3, object): ['a', 'b', 'c']

        >>> s.cat.ordered
        False
        """
        return self._data.dtype.ordered

    @property
    def codes(self) -> "ps.Series":
        """
        Return Series of codes as well as the index.
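
        pandas-on-Spark represents a categorical Series internally by its
        integer codes, so this property re-exposes the underlying Spark
        column as a plain integer Series without the categorical dtype.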

        Examples
        --------
        >>> s = ps.Series(list("abbccc"), dtype="category")
        >>> s  # doctest: +SKIP
        0    a
        1    b
        2    b
        3    c
        4    c
        5    c
        dtype: category
        Categories (3, object): ['a', 'b', 'c']

        >>> s.cat.codes
        0    0
        1    1
        2    1
        3    2
        4    2
        5    2
        dtype: int8
        """
        return self._data._with_new_scol(self._data.spark.column).rename()

    def add_categories(self, new_categories: pd.Index, inplace: bool = False) -> "ps.Series":
        raise NotImplementedError()

    def as_ordered(self, inplace: bool = False) -> "ps.Series":
        raise NotImplementedError()

    def as_unordered(self, inplace: bool = False) -> "ps.Series":
        raise NotImplementedError()

    def remove_categories(self, removals: pd.Index, inplace: bool = False) -> "ps.Series":
        raise NotImplementedError()

    def remove_unused_categories(self) -> "ps.Series":
        raise NotImplementedError()

    def rename_categories(self, new_categories: pd.Index, inplace: bool = False) -> "ps.Series":
        raise NotImplementedError()

    def reorder_categories(
        self, new_categories: pd.Index, ordered: bool = None, inplace: bool = False
    ) -> "ps.Series":
        raise NotImplementedError()

    def set_categories(
        self,
        new_categories: pd.Index,
        ordered: bool = None,
        rename: bool = False,
        inplace: bool = False,
    ) -> "ps.Series":
        raise NotImplementedError()


def _test() -> None:
    import os
    import doctest
    import sys
    from pyspark.sql import SparkSession
    import pyspark.pandas.categorical

    os.chdir(os.environ["SPARK_HOME"])

    globs = pyspark.pandas.categorical.__dict__.copy()
    globs["ps"] = pyspark.pandas
    spark = (
        SparkSession.builder.master("local[4]")
        .appName("pyspark.pandas.categorical tests")
        .getOrCreate()
    )
    (failure_count, test_count) = doctest.testmod(
        pyspark.pandas.categorical,
        globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE,
    )
    spark.stop()
    if failure_count:
        sys.exit(-1)


if __name__ == "__main__":
    _test()
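
# A minimal usage sketch (not part of the module; assumes a running
# SparkSession and that `pyspark.pandas` is importable; values shown in
# the trailing comments mirror the doctests above):
#
#   import pyspark.pandas as ps
#
#   s = ps.Series(list("abbccc"), dtype="category")
#   s.cat.categories   # Index(['a', 'b', 'c'], dtype='object')
#   s.cat.ordered      # False
#   s.cat.codes        # pandas-on-Spark Series of integer codes (int8)
#
# Note that mutating methods such as `add_categories` and `set_categories`
# are declared above but raise NotImplementedError in this version of the
# accessor.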