[SPARK-36930][PYTHON] Support ps.MultiIndex.dtypes
### What changes were proposed in this pull request? Add a `dtypes` property to `ps.MultiIndex`. ### Why are the changes needed? `ps.MultiIndex` had no `dtypes` attribute, so accessing it raised an `AttributeError`. Before this PR: ```python >>> idx = pd.MultiIndex.from_arrays([[0, 1, 2, 3, 4, 5, 6, 7, 8], [1, 2, 3, 4, 5, 6, 7, 8, 9]], names=("zero", "one")) >>> pdf = pd.DataFrame( ... {"a": [1, 2, 3, 4, 5, 6, 7, 8, 9], "b": [4, 5, 6, 3, 2, 1, 0, 0, 0]}, ... index=idx, ... ) >>> psdf = ps.from_pandas(pdf) >>> >>> ps.DataFrame[psdf.index.dtypes, psdf.dtypes] Traceback (most recent call last): File "<stdin>", line 1, in <module> File "/u02/spark/python/pyspark/pandas/indexes/multi.py", line 917, in __getattr__ raise AttributeError("'MultiIndex' object has no attribute '{}'".format(item)) AttributeError: 'MultiIndex' object has no attribute 'dtypes' >>> ``` ### Does this PR introduce _any_ user-facing change? Yes. After this PR, users can access `MultiIndex.dtypes`, for example: ```python >>> ps.DataFrame[psdf.index.dtypes, psdf.dtypes] typing.Tuple[pyspark.pandas.typedef.typehints.IndexNameType, pyspark.pandas.typedef.typehints.IndexNameType, pyspark.pandas.typedef.typehints.NameType, pyspark.pandas.typedef.typehints.NameType] ``` ### How was this patch tested? Unit tests. Closes #34179 from dchvn/add_multiindex_dtypes. Lead-authored-by: dchvn nguyen <dgd_contributor@viettel.com.vn> Co-authored-by: dch nguyen <dgd_contributor@viettel.com.vn> Signed-off-by: Hyukjin Kwon <gurwls223@apache.org>
This commit is contained in:
parent
6a2452fb5c
commit
5c6f0b9263
|
@ -240,6 +240,7 @@ MultiIndex Properties
|
|||
MultiIndex.nlevels
|
||||
MultiIndex.levshape
|
||||
MultiIndex.values
|
||||
MultiIndex.dtypes
|
||||
|
||||
MultiIndex components
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
|
|
@ -375,6 +375,35 @@ class MultiIndex(Index):
|
|||
def name(self, name: Name) -> None:
|
||||
raise PandasNotImplementedError(class_name="pd.MultiIndex", property_name="name")
|
||||
|
||||
@property
def dtypes(self) -> pd.Series:
    """Return the dtypes as a Series for the underlying MultiIndex.

    .. versionadded:: 3.3.0

    Returns
    -------
    pd.Series
        The data type of each level. The Series is indexed by the level
        names: a single-label name is unwrapped to its only element, a
        multi-label name is kept as a tuple, and a level without a name
        is labelled ``level_<position>``.

    Examples
    --------
    >>> psmidx = ps.MultiIndex.from_arrays(
    ...     [[0, 1, 2, 3, 4, 5, 6, 7, 8], [1, 2, 3, 4, 5, 6, 7, 8, 9]],
    ...     names=("zero", "one"),
    ... )
    >>> psmidx.dtypes
    zero    int64
    one     int64
    dtype: object
    """
    level_dtypes = [field.dtype for field in self._internal.index_fields]

    level_labels = []
    for position, label in enumerate(self._internal.index_names):
        if label is None:
            # An unnamed level has no label tuple; calling len() on it would
            # raise TypeError. Fall back to a positional label instead
            # (this is the same "level_<i>" convention pandas uses).
            level_labels.append("level_{}".format(position))
        elif len(label) > 1:
            # Multi-label names stay as tuples.
            level_labels.append(label)
        else:
            # Single-label names are stored as 1-tuples; expose the bare label.
            level_labels.append(label[0])

    return pd.Series(level_dtypes, index=pd.Index(level_labels))
|
||||
|
||||
def _verify_for_rename(self, name: List[Name]) -> List[Label]: # type: ignore[override]
|
||||
if is_list_like(name):
|
||||
if self._internal.index_level != len(name):
|
||||
|
|
|
@ -6000,6 +6000,20 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils):
|
|||
expected_pdf = pd.DataFrame({"A": [None, 0], "B": [4.0, 1.0], "C": [3, 3]})
|
||||
self.assert_eq(expected_pdf, psdf1.combine_first(psdf2))
|
||||
|
||||
def test_multi_index_dtypes(self):
    # SPARK-36930: Support ps.MultiIndex.dtypes
    # The same level values are checked under two naming schemes: plain
    # string names first, then multiple labels (tuple names) per level.
    level_values = [[1, 1, 2, 2], ["red", "blue", "red", "blue"]]
    name_variants = (
        ("number", "color"),
        [("zero", "first"), ("one", "second")],
    )

    for level_names in name_variants:
        pandas_midx = pd.MultiIndex.from_arrays(level_values, names=level_names)
        spark_midx = ps.from_pandas(pandas_midx)

        # pandas-on-Spark must report the same per-level dtypes as pandas.
        self.assert_eq(spark_midx.dtypes, pandas_midx.dtypes)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from pyspark.pandas.tests.test_dataframe import * # noqa: F401
|
||||
|
|
Loading…
Reference in a new issue