2616d5cc1d
### What changes were proposed in this pull request? Sets up the `mypy` configuration to enable `disallow_untyped_defs` check for pandas APIs on Spark module. ### Why are the changes needed? Currently many functions in the main codes in pandas APIs on Spark module are still missing type annotations and disabled `mypy` check `disallow_untyped_defs`. We should add more type annotations and enable the mypy check. ### Does this PR introduce _any_ user-facing change? Yes. This PR adds more type annotations in pandas APIs on Spark module, which can impact interaction with development tools for users. ### How was this patch tested? The mypy check with a new configuration and existing tests should pass. Closes #32614 from ueshin/issues/SPARK-35465/disallow_untyped_defs. Authored-by: Takuya UESHIN <ueshin@databricks.com> Signed-off-by: Takuya UESHIN <ueshin@databricks.com>
198 lines
5.4 KiB
Python
198 lines
5.4 KiB
Python
#
|
|
# Licensed to the Apache Software Foundation (ASF) under one or more
|
|
# contributor license agreements. See the NOTICE file distributed with
|
|
# this work for additional information regarding copyright ownership.
|
|
# The ASF licenses this file to You under the Apache License, Version 2.0
|
|
# (the "License"); you may not use this file except in compliance with
|
|
# the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
#
|
|
from typing import Any, Optional, Tuple, Union, cast
|
|
|
|
import pandas as pd
|
|
from pandas.api.types import is_hashable
|
|
|
|
from pyspark import pandas as ps
|
|
from pyspark.pandas.indexes.base import Index
|
|
from pyspark.pandas.series import Series
|
|
from pyspark.pandas.typedef.typehints import Dtype
|
|
|
|
|
|
class NumericIndex(Index):
    """
    Abstract base class providing numeric type operations.

    Not instantiated directly; concrete subclasses such as
    :class:`Int64Index` and :class:`Float64Index` derive from it.
    """
|
|
|
|
|
|
class IntegerIndex(NumericIndex):
    """
    Abstract base class for integer-typed indexes such as :class:`Int64Index`.
    """
|
|
|
|
|
|
class Int64Index(IntegerIndex):
    """
    Immutable sequence used for indexing and alignment. The basic object
    storing axis labels for all pandas objects. Int64Index is a special case
    of `Index` with purely integer labels.

    Parameters
    ----------
    data : array-like (1-dimensional)
    dtype : NumPy dtype (default: int64)
    copy : bool
        Make a copy of input ndarray.
    name : object
        Name to be stored in the index.

    See Also
    --------
    Index : The base pandas-on-Spark Index type.
    Float64Index : A special case of :class:`Index` with purely float labels.

    Notes
    -----
    An Index instance can **only** contain hashable objects.

    Examples
    --------
    >>> ps.Int64Index([1, 2, 3])
    Int64Index([1, 2, 3], dtype='int64')

    From a Series:

    >>> s = ps.Series([1, 2, 3], index=[10, 20, 30])
    >>> ps.Int64Index(s)
    Int64Index([1, 2, 3], dtype='int64')

    From an Index:

    >>> idx = ps.Index([1, 2, 3])
    >>> ps.Int64Index(idx)
    Int64Index([1, 2, 3], dtype='int64')
    """

    def __new__(
        cls,
        data: Optional[Any] = None,
        dtype: Optional[Union[str, Dtype]] = None,
        copy: bool = False,
        name: Optional[Union[Any, Tuple]] = None,
    ) -> "Int64Index":
        # Index names must be hashable, matching pandas' contract.
        if not is_hashable(name):
            raise TypeError("Index.name must be a hashable type")

        # Already-distributed data: re-wrap via Index, defaulting to int64.
        if isinstance(data, (Series, Index)):
            return cast(
                Int64Index,
                Index(data, dtype="int64" if dtype is None else dtype, copy=copy, name=name),
            )

        # Otherwise build a local pandas Int64Index and distribute it.
        pidx = pd.Int64Index(data=data, dtype=dtype, copy=copy, name=name)
        return cast(Int64Index, ps.from_pandas(pidx))
|
|
|
|
|
|
class Float64Index(NumericIndex):
    """
    Immutable sequence used for indexing and alignment. The basic object
    storing axis labels for all pandas objects. Float64Index is a special case
    of `Index` with purely float labels.

    Parameters
    ----------
    data : array-like (1-dimensional)
    dtype : NumPy dtype (default: float64)
    copy : bool
        Make a copy of input ndarray.
    name : object
        Name to be stored in the index.

    See Also
    --------
    Index : The base pandas-on-Spark Index type.
    Int64Index : A special case of :class:`Index` with purely integer labels.

    Notes
    -----
    An Index instance can **only** contain hashable objects.

    Examples
    --------
    >>> ps.Float64Index([1.0, 2.0, 3.0])
    Float64Index([1.0, 2.0, 3.0], dtype='float64')

    From a Series:

    >>> s = ps.Series([1, 2, 3], index=[10, 20, 30])
    >>> ps.Float64Index(s)
    Float64Index([1.0, 2.0, 3.0], dtype='float64')

    From an Index:

    >>> idx = ps.Index([1, 2, 3])
    >>> ps.Float64Index(idx)
    Float64Index([1.0, 2.0, 3.0], dtype='float64')
    """

    def __new__(
        cls,
        data: Optional[Any] = None,
        dtype: Optional[Union[str, Dtype]] = None,
        copy: bool = False,
        name: Optional[Union[Any, Tuple]] = None,
    ) -> "Float64Index":
        # Index names must be hashable, matching pandas' contract.
        if not is_hashable(name):
            raise TypeError("Index.name must be a hashable type")

        # Already-distributed data: re-wrap via Index, defaulting to float64.
        if isinstance(data, (Series, Index)):
            return cast(
                Float64Index,
                Index(data, dtype="float64" if dtype is None else dtype, copy=copy, name=name),
            )

        # Otherwise build a local pandas Float64Index and distribute it.
        pidx = pd.Float64Index(data=data, dtype=dtype, copy=copy, name=name)
        return cast(Float64Index, ps.from_pandas(pidx))
|
|
|
|
|
|
def _test() -> None:
    """Run this module's doctests against a local SparkSession."""
    import doctest
    import os
    import sys

    import pyspark.pandas.indexes.numeric
    from pyspark.sql import SparkSession

    # Doctests assume SPARK_HOME as the working directory.
    os.chdir(os.environ["SPARK_HOME"])

    globs = dict(pyspark.pandas.indexes.numeric.__dict__)
    globs["ps"] = pyspark.pandas

    spark = (
        SparkSession.builder.master("local[4]")
        .appName("pyspark.pandas.indexes.numeric tests")
        .getOrCreate()
    )
    failures, _ = doctest.testmod(
        pyspark.pandas.indexes.numeric,
        globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE,
    )
    spark.stop()

    if failures:
        sys.exit(-1)
|
|
|
|
|
|
# Run this module's doctest suite when executed as a script.
if __name__ == "__main__":
    _test()
|