2616d5cc1d
### What changes were proposed in this pull request? Sets up the `mypy` configuration to enable `disallow_untyped_defs` check for pandas APIs on Spark module. ### Why are the changes needed? Currently many functions in the main codes in pandas APIs on Spark module are still missing type annotations and disabled `mypy` check `disallow_untyped_defs`. We should add more type annotations and enable the mypy check. ### Does this PR introduce _any_ user-facing change? Yes. This PR adds more type annotations in pandas APIs on Spark module, which can impact interaction with development tools for users. ### How was this patch tested? The mypy check with a new configuration and existing tests should pass. Closes #32614 from ueshin/issues/SPARK-35465/disallow_untyped_defs. Authored-by: Takuya UESHIN <ueshin@databricks.com> Signed-off-by: Takuya UESHIN <ueshin@databricks.com>
198 lines
5.4 KiB
Python
198 lines
5.4 KiB
Python
#
|
|
# Licensed to the Apache Software Foundation (ASF) under one or more
|
|
# contributor license agreements. See the NOTICE file distributed with
|
|
# this work for additional information regarding copyright ownership.
|
|
# The ASF licenses this file to You under the Apache License, Version 2.0
|
|
# (the "License"); you may not use this file except in compliance with
|
|
# the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
#
|
|
from typing import Any, Optional, Tuple, Union, cast
|
|
|
|
import pandas as pd
|
|
from pandas.api.types import is_hashable
|
|
|
|
from pyspark import pandas as ps
|
|
from pyspark.pandas.indexes.base import Index
|
|
from pyspark.pandas.series import Series
|
|
from pyspark.pandas.typedef.typehints import Dtype
|
|
|
|
|
|
class NumericIndex(Index):
    """
    Abstract base class providing numeric type operations.

    Not instantiated directly; concrete subclasses such as
    :class:`Int64Index` and :class:`Float64Index` derive from it.
    """
|
|
|
|
|
|
class IntegerIndex(NumericIndex):
    """
    Abstract base class for integer-typed indexes such as :class:`Int64Index`.
    """
|
|
|
|
|
|
class Int64Index(IntegerIndex):
    """
    Immutable sequence used for indexing and alignment. The basic object
    storing axis labels for all pandas objects. Int64Index is a special case
    of `Index` with purely integer labels.

    Parameters
    ----------
    data : array-like (1-dimensional)
    dtype : NumPy dtype (default: int64)
    copy : bool
        Make a copy of input ndarray.
    name : object
        Name to be stored in the index.

    See Also
    --------
    Index : The base pandas-on-Spark Index type.
    Float64Index : A special case of :class:`Index` with purely float labels.

    Notes
    -----
    An Index instance can **only** contain hashable objects.

    Examples
    --------
    >>> ps.Int64Index([1, 2, 3])
    Int64Index([1, 2, 3], dtype='int64')

    From a Series:

    >>> s = ps.Series([1, 2, 3], index=[10, 20, 30])
    >>> ps.Int64Index(s)
    Int64Index([1, 2, 3], dtype='int64')

    From an Index:

    >>> idx = ps.Index([1, 2, 3])
    >>> ps.Int64Index(idx)
    Int64Index([1, 2, 3], dtype='int64')
    """

    def __new__(
        cls,
        data: Optional[Any] = None,
        dtype: Optional[Union[str, Dtype]] = None,
        copy: bool = False,
        name: Optional[Union[Any, Tuple]] = None,
    ) -> "Int64Index":
        # Index names must be hashable, matching pandas' contract.
        if not is_hashable(name):
            raise TypeError("Index.name must be a hashable type")

        # Already-distributed data: re-wrap via Index, defaulting to int64.
        if isinstance(data, (Series, Index)):
            return cast(
                Int64Index,
                Index(data, dtype="int64" if dtype is None else dtype, copy=copy, name=name),
            )

        # Otherwise build a local pandas Int64Index and distribute it.
        pidx = pd.Int64Index(data=data, dtype=dtype, copy=copy, name=name)
        return cast(Int64Index, ps.from_pandas(pidx))
|
|
|
|
|
|
class Float64Index(NumericIndex):
    """
    Immutable sequence used for indexing and alignment. The basic object
    storing axis labels for all pandas objects. Float64Index is a special case
    of `Index` with purely float labels.

    Parameters
    ----------
    data : array-like (1-dimensional)
    dtype : NumPy dtype (default: float64)
    copy : bool
        Make a copy of input ndarray.
    name : object
        Name to be stored in the index.

    See Also
    --------
    Index : The base pandas-on-Spark Index type.
    Int64Index : A special case of :class:`Index` with purely integer labels.

    Notes
    -----
    An Index instance can **only** contain hashable objects.

    Examples
    --------
    >>> ps.Float64Index([1.0, 2.0, 3.0])
    Float64Index([1.0, 2.0, 3.0], dtype='float64')

    From a Series:

    >>> s = ps.Series([1, 2, 3], index=[10, 20, 30])
    >>> ps.Float64Index(s)
    Float64Index([1.0, 2.0, 3.0], dtype='float64')

    From an Index:

    >>> idx = ps.Index([1, 2, 3])
    >>> ps.Float64Index(idx)
    Float64Index([1.0, 2.0, 3.0], dtype='float64')
    """

    def __new__(
        cls,
        data: Optional[Any] = None,
        dtype: Optional[Union[str, Dtype]] = None,
        copy: bool = False,
        name: Optional[Union[Any, Tuple]] = None,
    ) -> "Float64Index":
        # Index names must be hashable, matching pandas' contract.
        if not is_hashable(name):
            raise TypeError("Index.name must be a hashable type")

        # Already-distributed data: re-wrap via Index, defaulting to float64.
        if isinstance(data, (Series, Index)):
            return cast(
                Float64Index,
                Index(data, dtype="float64" if dtype is None else dtype, copy=copy, name=name),
            )

        # Otherwise build a local pandas Float64Index and distribute it.
        pidx = pd.Float64Index(data=data, dtype=dtype, copy=copy, name=name)
        return cast(Float64Index, ps.from_pandas(pidx))
|
|
|
|
|
|
def _test() -> None:
    """Run this module's doctests against a local SparkSession."""
    import doctest
    import os
    import sys

    import pyspark.pandas.indexes.numeric
    from pyspark.sql import SparkSession

    # Doctests assume SPARK_HOME as the working directory.
    os.chdir(os.environ["SPARK_HOME"])

    globs = dict(pyspark.pandas.indexes.numeric.__dict__)
    globs["ps"] = pyspark.pandas

    spark = (
        SparkSession.builder.master("local[4]")
        .appName("pyspark.pandas.indexes.numeric tests")
        .getOrCreate()
    )
    failures, _ = doctest.testmod(
        pyspark.pandas.indexes.numeric,
        globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE,
    )
    spark.stop()

    if failures:
        sys.exit(-1)
|
|
|
|
|
|
# Run this module's doctest suite when executed as a script.
if __name__ == "__main__":
    _test()
|