[SPARK-36930][PYTHON] Support ps.MultiIndex.dtypes

### What changes were proposed in this pull request?
Add dtypes for MultiIndex

### Why are the changes needed?
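`ps.MultiIndex` does not implement `dtypes`, so accessing it raises `AttributeError` (for example, when building a `ps.DataFrame[...]` type hint from `psdf.index.dtypes`).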

Before this PR:

```python
>>> idx = pd.MultiIndex.from_arrays([[0, 1, 2, 3, 4, 5, 6, 7, 8], [1, 2, 3, 4, 5, 6, 7, 8, 9]], names=("zero", "one"))
>>> pdf = pd.DataFrame(
...     {"a": [1, 2, 3, 4, 5, 6, 7, 8, 9], "b": [4, 5, 6, 3, 2, 1, 0, 0, 0]},
...     index=idx,
... )
>>> psdf = ps.from_pandas(pdf)
>>>
>>> ps.DataFrame[psdf.index.dtypes, psdf.dtypes]
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/u02/spark/python/pyspark/pandas/indexes/multi.py", line 917, in __getattr__
    raise AttributeError("'MultiIndex' object has no attribute '{}'".format(item))
AttributeError: 'MultiIndex' object has no attribute 'dtypes'
>>>
```

### Does this PR introduce _any_ user-facing change?
Yes. After this PR, users can use `MultiIndex.dtypes`, for example:

``` python
>>> ps.DataFrame[psdf.index.dtypes, psdf.dtypes]
typing.Tuple[pyspark.pandas.typedef.typehints.IndexNameType, pyspark.pandas.typedef.typehints.IndexNameType, pyspark.pandas.typedef.typehints.NameType, pyspark.pandas.typedef.typehints.NameType]
```

### How was this patch tested?
unit tests.

Closes #34179 from dchvn/add_multiindex_dtypes.

Lead-authored-by: dchvn nguyen <dgd_contributor@viettel.com.vn>
Co-authored-by: dch nguyen <dgd_contributor@viettel.com.vn>
Signed-off-by: Hyukjin Kwon <gurwls223@apache.org>
This commit is contained in:
dchvn nguyen 2021-10-06 15:35:32 +09:00 committed by Hyukjin Kwon
parent 6a2452fb5c
commit 5c6f0b9263
3 changed files with 44 additions and 0 deletions

View file

@ -240,6 +240,7 @@ MultiIndex Properties
MultiIndex.nlevels
MultiIndex.levshape
MultiIndex.values
MultiIndex.dtypes
MultiIndex components
~~~~~~~~~~~~~~~~~~~~~

View file

@ -375,6 +375,35 @@ class MultiIndex(Index):
def name(self, name: Name) -> None:
raise PandasNotImplementedError(class_name="pd.MultiIndex", property_name="name")
@property
def dtypes(self) -> pd.Series:
    """Return the dtypes as a Series for the underlying MultiIndex.

    .. versionadded:: 3.3.0

    Returns
    -------
    pd.Series
        The data type of each level, labelled by level name. A level whose
        name is a one-element tuple is labelled by that element, a level
        whose name has several elements is labelled by the whole tuple, and
        a level without a name is labelled ``level_<position>``.

    Examples
    --------
    >>> psmidx = ps.MultiIndex.from_arrays(
    ...     [[0, 1, 2, 3, 4, 5, 6, 7, 8], [1, 2, 3, 4, 5, 6, 7, 8, 9]],
    ...     names=("zero", "one"),
    ... )
    >>> psmidx.dtypes
    zero    int64
    one     int64
    dtype: object
    """
    internal = self._internal

    labels = []
    for position, name in enumerate(internal.index_names):
        if name is None:
            # NOTE(review): unnamed levels appear to be stored as None in
            # index_names; calling len() on None would raise TypeError, so
            # fall back to a positional "level_<n>" label instead.
            labels.append("level_{}".format(position))
        elif len(name) > 1:
            # Multi-element names are kept as the full tuple.
            labels.append(name)
        else:
            # Single-element names are unwrapped to the bare element.
            labels.append(name[0])

    return pd.Series(
        [field.dtype for field in internal.index_fields],
        index=pd.Index(labels),
    )
def _verify_for_rename(self, name: List[Name]) -> List[Label]: # type: ignore[override]
if is_list_like(name):
if self._internal.index_level != len(name):

View file

@ -6000,6 +6000,20 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils):
expected_pdf = pd.DataFrame({"A": [None, 0], "B": [4.0, 1.0], "C": [3, 3]})
self.assert_eq(expected_pdf, psdf1.combine_first(psdf2))
def test_multi_index_dtypes(self):
    """Check that ps.MultiIndex.dtypes matches pandas' MultiIndex.dtypes."""
    # SPARK-36930: Support ps.MultiIndex.dtypes
    arrays = [[1, 1, 2, 2], ["red", "blue", "red", "blue"]]
    name_specs = [
        # plain string level names
        ("number", "color"),
        # multiple labels
        [("zero", "first"), ("one", "second")],
    ]
    for level_names in name_specs:
        pmidx = pd.MultiIndex.from_arrays(arrays, names=level_names)
        psmidx = ps.from_pandas(pmidx)
        self.assert_eq(psmidx.dtypes, pmidx.dtypes)
if __name__ == "__main__":
from pyspark.pandas.tests.test_dataframe import * # noqa: F401