[SPARK-36930][PYTHON] Support ps.MultiIndex.dtypes

### What changes were proposed in this pull request?
Add dtypes for MultiIndex

### Why are the changes needed?
Add dtypes for MultiIndex

Before this PR:

```python
>>> idx = pd.MultiIndex.from_arrays([[0, 1, 2, 3, 4, 5, 6, 7, 8], [1, 2, 3, 4, 5, 6, 7, 8, 9]], names=("zero", "one"))
>>> pdf = pd.DataFrame(
...     {"a": [1, 2, 3, 4, 5, 6, 7, 8, 9], "b": [4, 5, 6, 3, 2, 1, 0, 0, 0]},
...     index=idx,
... )
>>> psdf = ps.from_pandas(pdf)
>>>
>>> ps.DataFrame[psdf.index.dtypes, psdf.dtypes]
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/u02/spark/python/pyspark/pandas/indexes/multi.py", line 917, in __getattr__
    raise AttributeError("'MultiIndex' object has no attribute '{}'".format(item))
AttributeError: 'MultiIndex' object has no attribute 'dtypes'
>>>
```

### Does this PR introduce _any_ user-facing change?
After this PR, users can use ```MultiIndex.dtypes``` for:

``` python
>>> ps.DataFrame[psdf.index.dtypes, psdf.dtypes]
typing.Tuple[pyspark.pandas.typedef.typehints.IndexNameType, pyspark.pandas.typedef.typehints.IndexNameType, pyspark.pandas.typedef.typehints.NameType, pyspark.pandas.typedef.typehints.NameType]
```

### How was this patch tested?
unit tests.

Closes #34179 from dchvn/add_multiindex_dtypes.

Lead-authored-by: dchvn nguyen <dgd_contributor@viettel.com.vn>
Co-authored-by: dch nguyen <dgd_contributor@viettel.com.vn>
Signed-off-by: Hyukjin Kwon <gurwls223@apache.org>
2021-10-06 15:35:32 +09:00 · 2021-10-06 15:35:32 +09:00 · 5c6f0b9263
parent 6a2452fb5c
commit 5c6f0b9263
3 changed files with 44 additions and 0 deletions
--- a/python/docs/source/reference/pyspark.pandas/indexing.rst
+++ b/python/docs/source/reference/pyspark.pandas/indexing.rst
@@ -240,6 +240,7 @@ MultiIndex Properties
   MultiIndex.nlevels
   MultiIndex.levshape
   MultiIndex.values
+   MultiIndex.dtypes

 MultiIndex components
 ~~~~~~~~~~~~~~~~~~~~~
--- a/python/pyspark/pandas/indexes/multi.py
+++ b/python/pyspark/pandas/indexes/multi.py
@@ -375,6 +375,35 @@ class MultiIndex(Index):
    def name(self, name: Name) -> None:
        raise PandasNotImplementedError(class_name="pd.MultiIndex", property_name="name")

+    @property
+    def dtypes(self) -> pd.Series:
+        """Return the dtypes as a Series for the underlying MultiIndex.
+
+        .. versionadded:: 3.3.0
+
+        Returns
+        -------
+        pd.Series
+            The data type of each level; an unnamed level is labelled ``level_<position>``.
+
+        Examples
+        --------
+        >>> psmidx = ps.MultiIndex.from_arrays(
+        ...     [[0, 1, 2, 3, 4, 5, 6, 7, 8], [1, 2, 3, 4, 5, 6, 7, 8, 9]],
+        ...     names=("zero", "one"),
+        ... )
+        >>> psmidx.dtypes
+        zero    int64
+        one     int64
+        dtype: object
+        """
+        # Assumes an unnamed level appears as None in index_names; len(None) would raise TypeError.
+        labels = [
+            "level_{}".format(i) if name is None else (name if len(name) > 1 else name[0])
+            for i, name in enumerate(self._internal.index_names)
+        ]
+        return pd.Series([f.dtype for f in self._internal.index_fields], index=pd.Index(labels))
+
    def _verify_for_rename(self, name: List[Name]) -> List[Label]:  # type: ignore[override]
        if is_list_like(name):
            if self._internal.index_level != len(name):
--- a/python/pyspark/pandas/tests/test_dataframe.py
+++ b/python/pyspark/pandas/tests/test_dataframe.py
@@ -6000,6 +6000,20 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils):
            expected_pdf = pd.DataFrame({"A": [None, 0], "B": [4.0, 1.0], "C": [3, 3]})
            self.assert_eq(expected_pdf, psdf1.combine_first(psdf2))

+    def test_multi_index_dtypes(self):
+        # SPARK-36930: ps.MultiIndex.dtypes is compared against pd.MultiIndex.dtypes
+        level_values = [[1, 1, 2, 2], ["red", "blue", "red", "blue"]]
+        name_sets = [
+            # one plain string name per level
+            ("number", "color"),
+            # multiple labels: each level name is a tuple
+            [("zero", "first"), ("one", "second")],
+        ]
+        for level_names in name_sets:
+            expected = pd.MultiIndex.from_arrays(level_values, names=level_names)
+            actual = ps.from_pandas(expected)
+            self.assert_eq(actual.dtypes, expected.dtypes)
+

 if __name__ == "__main__":
    from pyspark.pandas.tests.test_dataframe import *  # noqa: F401