spark-instrumented-optimizer/python/pyspark/pandas/indexes/multi.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from distutils.version import LooseVersion
from functools import partial
from typing import Any, Optional, Tuple, Union, cast
import warnings

import pandas as pd
from pandas.api.types import is_list_like
from pandas.api.types import is_hashable

from pyspark import sql as spark
from pyspark.sql import functions as F, Window

# For running doctests and reference resolution in PyCharm.
from pyspark import pandas as ps  # noqa: F401
from pyspark.pandas.exceptions import PandasNotImplementedError
from pyspark.pandas.frame import DataFrame
from pyspark.pandas.indexes.base import Index
from pyspark.pandas.missing.indexes import MissingPandasLikeMultiIndex
from pyspark.pandas.series import Series, first_series
from pyspark.pandas.utils import (
    compare_disallow_null,
    is_name_like_tuple,
    name_like_string,
    scol_for,
    verify_temp_column_name,
)
from pyspark.pandas.internal import (
    InternalFrame,
    NATURAL_ORDER_COLUMN_NAME,
    SPARK_INDEX_NAME_FORMAT,
)
from pyspark.pandas.typedef import Scalar


class MultiIndex(Index):
    """
    Koalas MultiIndex that corresponds to pandas MultiIndex logically. This might hold Spark Column
    internally.

    Parameters
    ----------
    levels : sequence of arrays
        The unique labels for each level.
    codes : sequence of arrays
        Integers for each level designating which label at each location.
    sortorder : optional int
        Level of sortedness (must be lexicographically sorted by that
        level).
    names : optional sequence of objects
        Names for each of the index levels. (name is accepted for compat).
    copy : bool, default False
        Copy the meta-data.
    verify_integrity : bool, default True
        Check that the levels/codes are consistent and valid.

    See Also
    --------
    MultiIndex.from_arrays  : Convert list of arrays to MultiIndex.
    MultiIndex.from_product : Create a MultiIndex from the cartesian product
                              of iterables.
    MultiIndex.from_tuples  : Convert list of tuples to a MultiIndex.
    MultiIndex.from_frame   : Make a MultiIndex from a DataFrame.
    Index : A single-level Index.

    Examples
    --------
    >>> ps.DataFrame({'a': ['a', 'b', 'c']}, index=[[1, 2, 3], [4, 5, 6]]).index  # doctest: +SKIP
    MultiIndex([(1, 4),
                (2, 5),
                (3, 6)],
               )

    >>> ps.DataFrame({'a': [1, 2, 3]}, index=[list('abc'), list('def')]).index  # doctest: +SKIP
    MultiIndex([('a', 'd'),
                ('b', 'e'),
                ('c', 'f')],
               )
    """

    def __new__(
        cls,
        levels=None,
        codes=None,
        sortorder=None,
        names=None,
        dtype=None,
        copy=False,
        name=None,
        verify_integrity: bool = True,
    ):
        if LooseVersion(pd.__version__) < LooseVersion("0.24"):
            if levels is None or codes is None:
                raise TypeError("Must pass both levels and codes")

            pidx = pd.MultiIndex(
                levels=levels,
                labels=codes,
                sortorder=sortorder,
                names=names,
                dtype=dtype,
                copy=copy,
                name=name,
                verify_integrity=verify_integrity,
            )
        else:
            pidx = pd.MultiIndex(
                levels=levels,
                codes=codes,
                sortorder=sortorder,
                names=names,
                dtype=dtype,
                copy=copy,
                name=name,
                verify_integrity=verify_integrity,
            )
        return ps.from_pandas(pidx)

    @property
    def _internal(self):
        internal = self._kdf._internal
        scol = F.struct(internal.index_spark_columns)
        return internal.copy(
            column_labels=[None],
            data_spark_columns=[scol],
            data_dtypes=[None],
            column_label_names=None,
        )

    @property
    def _column_label(self):
        return None

    def __abs__(self):
        raise TypeError("TypeError: cannot perform __abs__ with this index type: MultiIndex")

    def _with_new_scol(self, scol: spark.Column, *, dtype=None):
        raise NotImplementedError("Not supported for type MultiIndex")

    def _align_and_column_op(self, f, *args) -> Index:
        raise NotImplementedError("Not supported for type MultiIndex")

    def any(self, *args, **kwargs) -> None:
        raise TypeError("cannot perform any with this index type: MultiIndex")

    def all(self, *args, **kwargs) -> None:
        raise TypeError("cannot perform all with this index type: MultiIndex")

    @staticmethod
    def from_tuples(tuples, sortorder=None, names=None) -> "MultiIndex":
        """
        Convert list of tuples to MultiIndex.

        Parameters
        ----------
        tuples : list / sequence of tuple-likes
            Each tuple is the index of one row/column.
        sortorder : int or None
            Level of sortedness (must be lexicographically sorted by that level).
        names : list / sequence of str, optional
            Names for the levels in the index.

        Returns
        -------
        index : MultiIndex

        Examples
        --------

        >>> tuples = [(1, 'red'), (1, 'blue'),
        ...           (2, 'red'), (2, 'blue')]
        >>> ps.MultiIndex.from_tuples(tuples, names=('number', 'color'))  # doctest: +SKIP
        MultiIndex([(1,  'red'),
                    (1, 'blue'),
                    (2,  'red'),
                    (2, 'blue')],
                   names=['number', 'color'])
        """
        return cast(
            MultiIndex,
            ps.from_pandas(
                pd.MultiIndex.from_tuples(tuples=tuples, sortorder=sortorder, names=names)
            ),
        )

    @staticmethod
    def from_arrays(arrays, sortorder=None, names=None) -> "MultiIndex":
        """
        Convert arrays to MultiIndex.

        Parameters
        ----------
        arrays: list / sequence of array-likes
            Each array-like gives one level’s value for each data point. len(arrays)
            is the number of levels.
        sortorder: int or None
            Level of sortedness (must be lexicographically sorted by that level).
        names: list / sequence of str, optional
            Names for the levels in the index.

        Returns
        -------
        index: MultiIndex

        Examples
        --------

        >>> arrays = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']]
        >>> ps.MultiIndex.from_arrays(arrays, names=('number', 'color'))  # doctest: +SKIP
        MultiIndex([(1,  'red'),
                    (1, 'blue'),
                    (2,  'red'),
                    (2, 'blue')],
                   names=['number', 'color'])
        """
        return cast(
            MultiIndex,
            ps.from_pandas(
                pd.MultiIndex.from_arrays(arrays=arrays, sortorder=sortorder, names=names)
            ),
        )

    @staticmethod
    def from_product(iterables, sortorder=None, names=None) -> "MultiIndex":
        """
        Make a MultiIndex from the cartesian product of multiple iterables.

        Parameters
        ----------
        iterables : list / sequence of iterables
            Each iterable has unique labels for each level of the index.
        sortorder : int or None
            Level of sortedness (must be lexicographically sorted by that
            level).
        names : list / sequence of str, optional
            Names for the levels in the index.

        Returns
        -------
        index : MultiIndex

        See Also
        --------
        MultiIndex.from_arrays : Convert list of arrays to MultiIndex.
        MultiIndex.from_tuples : Convert list of tuples to MultiIndex.

        Examples
        --------
        >>> numbers = [0, 1, 2]
        >>> colors = ['green', 'purple']
        >>> ps.MultiIndex.from_product([numbers, colors],
        ...                            names=['number', 'color'])  # doctest: +SKIP
        MultiIndex([(0,  'green'),
                    (0, 'purple'),
                    (1,  'green'),
                    (1, 'purple'),
                    (2,  'green'),
                    (2, 'purple')],
                   names=['number', 'color'])
        """
        return cast(
            MultiIndex,
            ps.from_pandas(
                pd.MultiIndex.from_product(iterables=iterables, sortorder=sortorder, names=names)
            ),
        )

    @staticmethod
    def from_frame(df, names=None) -> "MultiIndex":
        """
        Make a MultiIndex from a DataFrame.

        Parameters
        ----------
        df : DataFrame
            DataFrame to be converted to MultiIndex.
        names : list-like, optional
            If no names are provided, use the column names, or tuple of column
            names if the columns is a MultiIndex. If a sequence, overwrite
            names with the given sequence.

        Returns
        -------
        MultiIndex
            The MultiIndex representation of the given DataFrame.

        See Also
        --------
        MultiIndex.from_arrays : Convert list of arrays to MultiIndex.
        MultiIndex.from_tuples : Convert list of tuples to MultiIndex.
        MultiIndex.from_product : Make a MultiIndex from cartesian product
                                  of iterables.

        Examples
        --------
        >>> df = ps.DataFrame([['HI', 'Temp'], ['HI', 'Precip'],
        ...                    ['NJ', 'Temp'], ['NJ', 'Precip']],
        ...                   columns=['a', 'b'])
        >>> df  # doctest: +SKIP
              a       b
        0    HI    Temp
        1    HI  Precip
        2    NJ    Temp
        3    NJ  Precip

        >>> ps.MultiIndex.from_frame(df)  # doctest: +SKIP
        MultiIndex([('HI',   'Temp'),
                    ('HI', 'Precip'),
                    ('NJ',   'Temp'),
                    ('NJ', 'Precip')],
                   names=['a', 'b'])

        Using explicit names, instead of the column names

        >>> ps.MultiIndex.from_frame(df, names=['state', 'observation'])  # doctest: +SKIP
        MultiIndex([('HI',   'Temp'),
                    ('HI', 'Precip'),
                    ('NJ',   'Temp'),
                    ('NJ', 'Precip')],
                   names=['state', 'observation'])
        """
        if not isinstance(df, DataFrame):
            raise TypeError("Input must be a DataFrame")
        sdf = df.to_spark()

        if names is None:
            names = df._internal.column_labels
        elif not is_list_like(names):
            raise ValueError("Names should be list-like for a MultiIndex")
        else:
            names = [name if is_name_like_tuple(name) else (name,) for name in names]

        internal = InternalFrame(
            spark_frame=sdf,
            index_spark_columns=[scol_for(sdf, col) for col in sdf.columns],
            index_names=names,
        )
        return cast(MultiIndex, DataFrame(internal).index)

    @property
    def name(self) -> str:
        raise PandasNotImplementedError(class_name="pd.MultiIndex", property_name="name")

    @name.setter
    def name(self, name: str) -> None:
        raise PandasNotImplementedError(class_name="pd.MultiIndex", property_name="name")

    def _verify_for_rename(self, name):
        if is_list_like(name):
            if self._internal.index_level != len(name):
                raise ValueError(
                    "Length of new names must be {}, got {}".format(
                        self._internal.index_level, len(name)
                    )
                )
            if any(not is_hashable(n) for n in name):
                raise TypeError("MultiIndex.name must be a hashable type")
            return [n if is_name_like_tuple(n) else (n,) for n in name]
        else:
            raise TypeError("Must pass list-like as `names`.")

    def swaplevel(self, i=-2, j=-1) -> "MultiIndex":
        """
        Swap level i with level j.
        Calling this method does not change the ordering of the values.

        Parameters
        ----------
        i : int, str, default -2
            First level of index to be swapped. Can pass level name as string.
            Type of parameters can be mixed.
        j : int, str, default -1
            Second level of index to be swapped. Can pass level name as string.
            Type of parameters can be mixed.

        Returns
        -------
        MultiIndex
            A new MultiIndex.

        Examples
        --------
        >>> midx = ps.MultiIndex.from_arrays([['a', 'b'], [1, 2]], names = ['word', 'number'])
        >>> midx  # doctest: +SKIP
        MultiIndex([('a', 1),
                    ('b', 2)],
                   names=['word', 'number'])

        >>> midx.swaplevel(0, 1)  # doctest: +SKIP
        MultiIndex([(1, 'a'),
                    (2, 'b')],
                   names=['number', 'word'])

        >>> midx.swaplevel('number', 'word')  # doctest: +SKIP
        MultiIndex([(1, 'a'),
                    (2, 'b')],
                   names=['number', 'word'])
        """
        for index in (i, j):
            if not isinstance(index, int) and index not in self.names:
                raise KeyError("Level %s not found" % index)

        i = i if isinstance(i, int) else self.names.index(i)
        j = j if isinstance(j, int) else self.names.index(j)

        for index in (i, j):
            if index >= len(self.names) or index < -len(self.names):
                raise IndexError(
                    "Too many levels: Index has only %s levels, "
                    "%s is not a valid level number" % (len(self.names), index)
                )

        index_map = list(
            zip(
                self._internal.index_spark_columns,
                self._internal.index_names,
                self._internal.index_dtypes,
            )
        )
        index_map[i], index_map[j], = index_map[j], index_map[i]
        index_spark_columns, index_names, index_dtypes = zip(*index_map)
        internal = self._internal.copy(
            index_spark_columns=list(index_spark_columns),
            index_names=list(index_names),
            index_dtypes=list(index_dtypes),
            column_labels=[],
            data_spark_columns=[],
            data_dtypes=[],
        )
        return cast(MultiIndex, DataFrame(internal).index)

    @property
    def levshape(self) -> Tuple[int, ...]:
        """
        A tuple with the length of each level.

        Examples
        --------
        >>> midx = ps.MultiIndex.from_tuples([('a', 'x'), ('b', 'y'), ('c', 'z')])
        >>> midx  # doctest: +SKIP
        MultiIndex([('a', 'x'),
                    ('b', 'y'),
                    ('c', 'z')],
                   )

        >>> midx.levshape
        (3, 3)
        """
        result = self._internal.spark_frame.agg(
            *(F.countDistinct(c) for c in self._internal.index_spark_columns)
        ).collect()[0]
        return tuple(result)

    @staticmethod
    def _comparator_for_monotonic_increasing(data_type):
        return compare_disallow_null

    def _is_monotonic(self, order):
        if order == "increasing":
            return self._is_monotonic_increasing().all()
        else:
            return self._is_monotonic_decreasing().all()

    def _is_monotonic_increasing(self):
        window = Window.orderBy(NATURAL_ORDER_COLUMN_NAME).rowsBetween(-1, -1)

        cond = F.lit(True)
        has_not_null = F.lit(True)
        for scol in self._internal.index_spark_columns[::-1]:
            data_type = self._internal.spark_type_for(scol)
            prev = F.lag(scol, 1).over(window)
            compare = MultiIndex._comparator_for_monotonic_increasing(data_type)
            # Since pandas 1.1.4, null value is not allowed at any levels of MultiIndex.
            # Therefore, we should check `has_not_null` over the all levels.
            has_not_null = has_not_null & scol.isNotNull()
            cond = F.when(scol.eqNullSafe(prev), cond).otherwise(
                compare(scol, prev, spark.Column.__gt__)
            )

        cond = has_not_null & (prev.isNull() | cond)

        cond_name = verify_temp_column_name(
            self._internal.spark_frame.select(self._internal.index_spark_columns),
            "__is_monotonic_increasing_cond__",
        )

        sdf = self._internal.spark_frame.select(
            self._internal.index_spark_columns + [cond.alias(cond_name)]
        )

        internal = InternalFrame(
            spark_frame=sdf,
            index_spark_columns=[
                scol_for(sdf, col) for col in self._internal.index_spark_column_names
            ],
            index_names=self._internal.index_names,
            index_dtypes=self._internal.index_dtypes,
        )

        return first_series(DataFrame(internal))

    @staticmethod
    def _comparator_for_monotonic_decreasing(data_type):
        return compare_disallow_null

    def _is_monotonic_decreasing(self):
        window = Window.orderBy(NATURAL_ORDER_COLUMN_NAME).rowsBetween(-1, -1)

        cond = F.lit(True)
        has_not_null = F.lit(True)
        for scol in self._internal.index_spark_columns[::-1]:
            data_type = self._internal.spark_type_for(scol)
            prev = F.lag(scol, 1).over(window)
            compare = MultiIndex._comparator_for_monotonic_increasing(data_type)
            # Since pandas 1.1.4, null value is not allowed at any levels of MultiIndex.
            # Therefore, we should check `has_not_null` over the all levels.
            has_not_null = has_not_null & scol.isNotNull()
            cond = F.when(scol.eqNullSafe(prev), cond).otherwise(
                compare(scol, prev, spark.Column.__lt__)
            )

        cond = has_not_null & (prev.isNull() | cond)

        cond_name = verify_temp_column_name(
            self._internal.spark_frame.select(self._internal.index_spark_columns),
            "__is_monotonic_decreasing_cond__",
        )

        sdf = self._internal.spark_frame.select(
            self._internal.index_spark_columns + [cond.alias(cond_name)]
        )

        internal = InternalFrame(
            spark_frame=sdf,
            index_spark_columns=[
                scol_for(sdf, col) for col in self._internal.index_spark_column_names
            ],
            index_names=self._internal.index_names,
            index_dtypes=self._internal.index_dtypes,
        )

        return first_series(DataFrame(internal))

    def to_frame(self, index=True, name=None) -> DataFrame:
        """
        Create a DataFrame with the levels of the MultiIndex as columns.
        Column ordering is determined by the DataFrame constructor with data as
        a dict.

        Parameters
        ----------
        index : boolean, default True
            Set the index of the returned DataFrame as the original MultiIndex.
        name : list / sequence of strings, optional
            The passed names should substitute index level names.

        Returns
        -------
        DataFrame : a DataFrame containing the original MultiIndex data.

        See Also
        --------
        DataFrame

        Examples
        --------
        >>> tuples = [(1, 'red'), (1, 'blue'),
        ...           (2, 'red'), (2, 'blue')]
        >>> idx = ps.MultiIndex.from_tuples(tuples, names=('number', 'color'))
        >>> idx  # doctest: +SKIP
        MultiIndex([(1,  'red'),
                    (1, 'blue'),
                    (2,  'red'),
                    (2, 'blue')],
                   names=['number', 'color'])
        >>> idx.to_frame()  # doctest: +NORMALIZE_WHITESPACE
                      number color
        number color
        1      red         1   red
               blue        1  blue
        2      red         2   red
               blue        2  blue

        By default, the original Index is reused. To enforce a new Index:

        >>> idx.to_frame(index=False)
           number color
        0       1   red
        1       1  blue
        2       2   red
        3       2  blue

        To override the name of the resulting column, specify `name`:

        >>> idx.to_frame(name=['n', 'c'])  # doctest: +NORMALIZE_WHITESPACE
                      n     c
        number color
        1      red    1   red
               blue   1  blue
        2      red    2   red
               blue   2  blue
        """
        if name is None:
            name = [
                name if name is not None else (i,)
                for i, name in enumerate(self._internal.index_names)
            ]
        elif is_list_like(name):
            if len(name) != self._internal.index_level:
                raise ValueError("'name' should have same length as number of levels on index.")
            name = [n if is_name_like_tuple(n) else (n,) for n in name]
        else:
            raise TypeError("'name' must be a list / sequence of column names.")

        return self._to_frame(index=index, names=name)

    def to_pandas(self) -> pd.MultiIndex:
        """
        Return a pandas MultiIndex.

        .. note:: This method should only be used if the resulting pandas object is expected
                  to be small, as all the data is loaded into the driver's memory.

        Examples
        --------
        >>> df = ps.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)],
        ...                   columns=['dogs', 'cats'],
        ...                   index=[list('abcd'), list('efgh')])
        >>> df['dogs'].index.to_pandas()  # doctest: +SKIP
        MultiIndex([('a', 'e'),
                    ('b', 'f'),
                    ('c', 'g'),
                    ('d', 'h')],
                   )
        """
        # TODO: We might need to handle internal state change.
        # So far, we don't have any functions to change the internal state of MultiIndex except for
        # series-like operations. In that case, it creates new Index object instead of MultiIndex.
        return super().to_pandas()

    def toPandas(self) -> pd.MultiIndex:
        warnings.warn(
            "MultiIndex.toPandas is deprecated as of MultiIndex.to_pandas. "
            "Please use the API instead.",
            FutureWarning,
        )
        return self.to_pandas()

    toPandas.__doc__ = to_pandas.__doc__

    def nunique(self, dropna=True) -> None:  # type: ignore
        raise NotImplementedError("nunique is not defined for MultiIndex")

    # TODO: add 'name' parameter after pd.MultiIndex.name is implemented
    def copy(self, deep=None) -> "MultiIndex":  # type: ignore
        """
        Make a copy of this object.

        Parameters
        ----------
        deep : None
            this parameter is not supported but just dummy parameter to match pandas.

        Examples
        --------
        >>> df = ps.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)],
        ...                   columns=['dogs', 'cats'],
        ...                   index=[list('abcd'), list('efgh')])
        >>> df['dogs'].index  # doctest: +SKIP
        MultiIndex([('a', 'e'),
                    ('b', 'f'),
                    ('c', 'g'),
                    ('d', 'h')],
                   )

        Copy index

        >>> df.index.copy()  # doctest: +SKIP
        MultiIndex([('a', 'e'),
                    ('b', 'f'),
                    ('c', 'g'),
                    ('d', 'h')],
                   )
        """
        return super().copy(deep=deep)  # type: ignore

    def symmetric_difference(self, other, result_name=None, sort=None) -> "MultiIndex":
        """
        Compute the symmetric difference of two MultiIndex objects.

        Parameters
        ----------
        other : Index or array-like
        result_name : list
        sort : True or None, default None
            Whether to sort the resulting index.
            * True : Attempt to sort the result.
            * None : Do not sort the result.

        Returns
        -------
        symmetric_difference : MiltiIndex

        Notes
        -----
        ``symmetric_difference`` contains elements that appear in either
        ``idx1`` or ``idx2`` but not both. Equivalent to the Index created by
        ``idx1.difference(idx2) | idx2.difference(idx1)`` with duplicates
        dropped.

        Examples
        --------
        >>> midx1 = pd.MultiIndex([['lama', 'cow', 'falcon'],
        ...                        ['speed', 'weight', 'length']],
        ...                       [[0, 0, 0, 1, 1, 1, 2, 2, 2],
        ...                        [0, 0, 0, 0, 1, 2, 0, 1, 2]])
        >>> midx2 = pd.MultiIndex([['koalas', 'cow', 'falcon'],
        ...                        ['speed', 'weight', 'length']],
        ...                       [[0, 0, 0, 1, 1, 1, 2, 2, 2],
        ...                        [0, 0, 0, 0, 1, 2, 0, 1, 2]])
        >>> s1 = ps.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3],
        ...                index=midx1)
        >>> s2 = ps.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3],
        ...              index=midx2)

        >>> s1.index.symmetric_difference(s2.index)  # doctest: +SKIP
        MultiIndex([('koalas', 'speed'),
                    (  'lama', 'speed')],
                   )

        You can set names of result Index.

        >>> s1.index.symmetric_difference(s2.index, result_name=['a', 'b'])  # doctest: +SKIP
        MultiIndex([('koalas', 'speed'),
                    (  'lama', 'speed')],
                   names=['a', 'b'])

        You can set sort to `True`, if you want to sort the resulting index.

        >>> s1.index.symmetric_difference(s2.index, sort=True)  # doctest: +SKIP
        MultiIndex([('koalas', 'speed'),
                    (  'lama', 'speed')],
                   )

        You can also use the ``^`` operator:

        >>> s1.index ^ s2.index  # doctest: +SKIP
        MultiIndex([('koalas', 'speed'),
                    (  'lama', 'speed')],
                   )
        """
        if type(self) != type(other):
            raise NotImplementedError(
                "Doesn't support symmetric_difference between Index & MultiIndex for now"
            )

        sdf_self = self._kdf._internal.spark_frame.select(self._internal.index_spark_columns)
        sdf_other = other._kdf._internal.spark_frame.select(other._internal.index_spark_columns)

        sdf_symdiff = sdf_self.union(sdf_other).subtract(sdf_self.intersect(sdf_other))

        if sort:
            sdf_symdiff = sdf_symdiff.sort(self._internal.index_spark_columns)

        internal = InternalFrame(  # TODO: dtypes?
            spark_frame=sdf_symdiff,
            index_spark_columns=[
                scol_for(sdf_symdiff, col) for col in self._internal.index_spark_column_names
            ],
            index_names=self._internal.index_names,
        )
        result = cast(MultiIndex, DataFrame(internal).index)

        if result_name:
            result.names = result_name

        return result

    # TODO: ADD error parameter
    def drop(self, codes, level=None) -> "MultiIndex":
        """
        Make new MultiIndex with passed list of labels deleted

        Parameters
        ----------
        codes : array-like
            Must be a list of tuples
        level : int or level name, default None

        Returns
        -------
        dropped : MultiIndex

        Examples
        --------
        >>> index = ps.MultiIndex.from_tuples([('a', 'x'), ('b', 'y'), ('c', 'z')])
        >>> index # doctest: +SKIP
        MultiIndex([('a', 'x'),
                    ('b', 'y'),
                    ('c', 'z')],
                   )

        >>> index.drop(['a']) # doctest: +SKIP
        MultiIndex([('b', 'y'),
                    ('c', 'z')],
                   )

        >>> index.drop(['x', 'y'], level=1) # doctest: +SKIP
        MultiIndex([('c', 'z')],
                   )
        """
        internal = self._internal.resolved_copy
        sdf = internal.spark_frame
        index_scols = internal.index_spark_columns
        if level is None:
            scol = index_scols[0]
        elif isinstance(level, int):
            scol = index_scols[level]
        else:
            scol = None
            for index_spark_column, index_name in zip(
                internal.index_spark_columns, internal.index_names
            ):
                if not isinstance(level, tuple):
                    level = (level,)
                if level == index_name:
                    if scol is not None:
                        raise ValueError(
                            "The name {} occurs multiple times, use a level number".format(
                                name_like_string(level)
                            )
                        )
                    scol = index_spark_column
            if scol is None:
                raise KeyError("Level {} not found".format(name_like_string(level)))
        sdf = sdf[~scol.isin(codes)]

        internal = InternalFrame(
            spark_frame=sdf,
            index_spark_columns=[scol_for(sdf, col) for col in internal.index_spark_column_names],
            index_names=internal.index_names,
            index_dtypes=internal.index_dtypes,
            column_labels=[],
            data_spark_columns=[],
            data_dtypes=[],
        )
        return cast(MultiIndex, DataFrame(internal).index)

    def argmax(self) -> None:
        raise TypeError("reduction operation 'argmax' not allowed for this dtype")

    def argmin(self) -> None:
        raise TypeError("reduction operation 'argmin' not allowed for this dtype")

    def asof(self, label) -> None:
        raise NotImplementedError(
            "only the default get_loc method is currently supported for MultiIndex"
        )

    @property
    def is_all_dates(self) -> bool:
        """
        is_all_dates always returns False for MultiIndex

        Examples
        --------
        >>> from datetime import datetime

        >>> idx = ps.MultiIndex.from_tuples(
        ...     [(datetime(2019, 1, 1, 0, 0, 0), datetime(2019, 1, 1, 0, 0, 0)),
        ...      (datetime(2019, 1, 1, 0, 0, 0), datetime(2019, 1, 1, 0, 0, 0))])
        >>> idx  # doctest: +SKIP
        MultiIndex([('2019-01-01', '2019-01-01'),
                    ('2019-01-01', '2019-01-01')],
                   )

        >>> idx.is_all_dates
        False
        """
        return False

    def __getattr__(self, item: str) -> Any:
        if hasattr(MissingPandasLikeMultiIndex, item):
            property_or_func = getattr(MissingPandasLikeMultiIndex, item)
            if isinstance(property_or_func, property):
                return property_or_func.fget(self)  # type: ignore
            else:
                return partial(property_or_func, self)
        raise AttributeError("'MultiIndex' object has no attribute '{}'".format(item))

    def _get_level_number(self, level) -> Optional[int]:
        """
        Return the level number if a valid level is given.
        """
        count = self.names.count(level)
        if (count > 1) and not isinstance(level, int):
            raise ValueError("The name %s occurs multiple times, use a level number" % level)
        if level in self.names:
            level = self.names.index(level)
        elif isinstance(level, int):
            nlevels = self.nlevels
            if level >= nlevels:
                raise IndexError(
                    "Too many levels: Index has only %d "
                    "levels, %d is not a valid level number" % (nlevels, level)
                )
            if level < 0:
                if (level + nlevels) < 0:
                    raise IndexError(
                        "Too many levels: Index has only %d levels, "
                        "not %d" % (nlevels, level + 1)
                    )
                level = level + nlevels
        else:
            raise KeyError("Level %s not found" % str(level))
            return None

        return level

    def get_level_values(self, level) -> Index:
        """
        Return vector of label values for requested level,
        equal to the length of the index.

        Parameters
        ----------
        level : int or str
            ``level`` is either the integer position of the level in the
            MultiIndex, or the name of the level.

        Returns
        -------
        values : Index
            Values is a level of this MultiIndex converted to
            a single :class:`Index` (or subclass thereof).

        Examples
        --------

        Create a MultiIndex:

        >>> mi = ps.MultiIndex.from_tuples([('x', 'a'), ('x', 'b'), ('y', 'a')])
        >>> mi.names = ['level_1', 'level_2']

        Get level values by supplying level as either integer or name:

        >>> mi.get_level_values(0)
        Index(['x', 'x', 'y'], dtype='object', name='level_1')

        >>> mi.get_level_values('level_2')
        Index(['a', 'b', 'a'], dtype='object', name='level_2')
        """
        level = self._get_level_number(level)
        index_scol = self._internal.index_spark_columns[level]
        index_name = self._internal.index_names[level]
        index_dtype = self._internal.index_dtypes[level]
        internal = self._internal.copy(
            index_spark_columns=[index_scol],
            index_names=[index_name],
            index_dtypes=[index_dtype],
            column_labels=[],
            data_spark_columns=[],
            data_dtypes=[],
        )
        return DataFrame(internal).index

    def insert(self, loc: int, item) -> Index:
        """
        Make new MultiIndex inserting new item at location.

        Follows Python list.append semantics for negative values.

        Parameters
        ----------
        loc : int
        item : object

        Returns
        -------
        new_index : MultiIndex

        Examples
        --------
        >>> kmidx = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])
        >>> kmidx.insert(3, ("h", "j"))  # doctest: +SKIP
        MultiIndex([('a', 'x'),
                    ('b', 'y'),
                    ('c', 'z'),
                    ('h', 'j')],
                   )

        For negative values

        >>> kmidx.insert(-2, ("h", "j"))  # doctest: +SKIP
        MultiIndex([('a', 'x'),
                    ('h', 'j'),
                    ('b', 'y'),
                    ('c', 'z')],
                   )
        """
        length = len(self)
        if loc < 0:
            loc = loc + length
            if loc < 0:
                raise IndexError(
                    "index {} is out of bounds for axis 0 with size {}".format(
                        (loc - length), length
                    )
                )
        else:
            if loc > length:
                raise IndexError(
                    "index {} is out of bounds for axis 0 with size {}".format(loc, length)
                )

        index_name = self._internal.index_spark_column_names
        sdf_before = self.to_frame(name=index_name)[:loc].to_spark()
        sdf_middle = Index([item]).to_frame(name=index_name).to_spark()
        sdf_after = self.to_frame(name=index_name)[loc:].to_spark()
        sdf = sdf_before.union(sdf_middle).union(sdf_after)

        internal = InternalFrame(  # TODO: dtypes?
            spark_frame=sdf,
            index_spark_columns=[
                scol_for(sdf, col) for col in self._internal.index_spark_column_names
            ],
            index_names=self._internal.index_names,
        )
        return DataFrame(internal).index

    def item(self) -> Tuple[Scalar, ...]:
        """
        Return the first element of the underlying data as a python tuple.

        Returns
        -------
        tuple
            The first element of MultiIndex.

        Raises
        ------
        ValueError
            If the data is not length-1.

        Examples
        --------
        >>> kmidx = ps.MultiIndex.from_tuples([('a', 'x')])
        >>> kmidx.item()
        ('a', 'x')
        """
        return self._kdf.head(2)._to_internal_pandas().index.item()

    def intersection(self, other) -> "MultiIndex":
        """
        Form the intersection of two Index objects.

        This returns a new Index with elements common to the index and `other`.

        Parameters
        ----------
        other : Index or array-like

        Returns
        -------
        intersection : MultiIndex

        Examples
        --------
        >>> midx1 = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])
        >>> midx2 = ps.MultiIndex.from_tuples([("c", "z"), ("d", "w")])
        >>> midx1.intersection(midx2).sort_values()  # doctest: +SKIP
        MultiIndex([('c', 'z')],
                   )
        """
        if isinstance(other, Series) or not is_list_like(other):
            raise TypeError("other must be a MultiIndex or a list of tuples")
        elif isinstance(other, DataFrame):
            raise ValueError("Index data must be 1-dimensional")
        elif isinstance(other, MultiIndex):
            spark_frame_other = other.to_frame().to_spark()
            keep_name = self.names == other.names
        elif isinstance(other, Index):
            # Always returns an empty MultiIndex if `other` is Index.
            return self.to_frame().head(0).index  # type: ignore
        elif not all(isinstance(item, tuple) for item in other):
            raise TypeError("other must be a MultiIndex or a list of tuples")
        else:
            other = MultiIndex.from_tuples(list(other))
            spark_frame_other = other.to_frame().to_spark()
            keep_name = True

        default_name = [SPARK_INDEX_NAME_FORMAT(i) for i in range(self.nlevels)]
        spark_frame_self = self.to_frame(name=default_name).to_spark()
        spark_frame_intersected = spark_frame_self.intersect(spark_frame_other)
        if keep_name:
            index_names = self._internal.index_names
        else:
            index_names = None
        internal = InternalFrame(  # TODO: dtypes?
            spark_frame=spark_frame_intersected,
            index_spark_columns=[scol_for(spark_frame_intersected, col) for col in default_name],
            index_names=index_names,
        )
        return cast(MultiIndex, DataFrame(internal).index)

    @property
    def hasnans(self):
        raise NotImplementedError("hasnans is not defined for MultiIndex")

    @property
    def inferred_type(self) -> str:
        """
        Return a string of the type inferred from the values.
        """
        # Always returns "mixed" for MultiIndex
        return "mixed"

    @property
    def asi8(self) -> None:
        """
        Integer representation of the values.
        """
        # Always returns None for MultiIndex
        return None

    def factorize(
        self, sort: bool = True, na_sentinel: Optional[int] = -1
    ) -> Tuple[Union["Series", "Index"], pd.Index]:
        return MissingPandasLikeMultiIndex.factorize(self, sort=sort, na_sentinel=na_sentinel)

    def __iter__(self):
        return MissingPandasLikeMultiIndex.__iter__(self)


def _test():
    import os
    import doctest
    import sys
    import numpy
    from pyspark.sql import SparkSession
    import pyspark.pandas.indexes.multi

    os.chdir(os.environ["SPARK_HOME"])

    globs = pyspark.pandas.indexes.multi.__dict__.copy()
    globs["np"] = numpy
    globs["ps"] = pyspark.pandas
    spark = (
        SparkSession.builder.master("local[4]")
        .appName("pyspark.pandas.indexes.multi tests")
        .getOrCreate()
    )
    (failure_count, test_count) = doctest.testmod(
        pyspark.pandas.indexes.multi,
        globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE,
    )
    spark.stop()
    if failure_count:
        sys.exit(-1)


if __name__ == "__main__":
    _test()
-												[SPARK-34890][PYTHON] Port/integrate Koalas main codes into PySpark

### What changes were proposed in this pull request?

As a first step of [SPARK-34849](https://issues.apache.org/jira/browse/SPARK-34849), this PR proposes porting the Koalas main code into PySpark.

This PR contains minimal changes to the existing Koalas code as follows:
1. `databricks.koalas` -> `pyspark.pandas`
2. `from databricks import koalas as ks` -> `from pyspark import pandas as pp`
3. `ks.xxx -> pp.xxx`

Other than them:
1. Added a line to `python/mypy.ini` in order to ignore the mypy test. See related issue at [SPARK-34941](https://issues.apache.org/jira/browse/SPARK-34941).
2. Added a comment to several lines in several files to ignore the flake8 F401. See related issue at [SPARK-34943](https://issues.apache.org/jira/browse/SPARK-34943).

When this PR is merged, all the features that were previously used in [Koalas](https://github.com/databricks/koalas) will be available in PySpark as well.

Users can access to the pandas API in PySpark as below:

```python
>>> from pyspark import pandas as pp
>>> ppdf = pp.DataFrame({"A": [1, 2, 3], "B": [15, 20, 25]})
>>> ppdf
   A   B
0  1  15
1  2  20
2  3  25
```

The existing "options and settings" in Koalas are also available in the same way:

```python
>>> from pyspark.pandas.config import set_option, reset_option, get_option
>>> ppser1 = pp.Series([1, 2, 3])
>>> ppser2 = pp.Series([3, 4, 5])
>>> ppser1 + ppser2
Traceback (most recent call last):
...
ValueError: Cannot combine the series or dataframe because it comes from a different dataframe. In order to allow this operation, enable 'compute.ops_on_diff_frames' option.

>>> set_option("compute.ops_on_diff_frames", True)
>>> ppser1 + ppser2
0    4
1    6
2    8
dtype: int64
```

Please also refer to the [API Reference](https://koalas.readthedocs.io/en/latest/reference/index.html) and [Options and Settings](https://koalas.readthedocs.io/en/latest/user_guide/options.html) for more detail.

**NOTE** that this PR intentionally ports the main codes of Koalas first almost as are with minimal changes because:
- Koalas project is fairly large. Making some changes together for PySpark will make it difficult to review the individual change.
    Koalas dev includes multiple Spark committers who will review. By doing this, the committers will be able to more easily and effectively review and drive the development.
- Koalas tests and documentation require major changes to make it look great together with PySpark whereas main codes do not require.
- We lately froze the Koalas codebase, and plan to work together on the initial porting. By porting the main codes first as are, it unblocks the Koalas dev to work on other items in parallel.

I promise and will make sure on:
- Rename Koalas to PySpark pandas APIs and/or pandas-on-Spark accordingly in documentation, and the docstrings and comments in the main codes.
- Triage APIs to remove that don’t make sense when Koalas is in PySpark

The documentation changes will be tracked in [SPARK-34885](https://issues.apache.org/jira/browse/SPARK-34885), the test code changes will be tracked in [SPARK-34886](https://issues.apache.org/jira/browse/SPARK-34886).

### Why are the changes needed?

Please refer to:
- [[DISCUSS] Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/DISCUSS-Support-pandas-API-layer-on-PySpark-td30945.html)
- [[VOTE] SPIP: Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/VOTE-SPIP-Support-pandas-API-layer-on-PySpark-td30996.html)

### Does this PR introduce _any_ user-facing change?

Yes, now users can use the pandas APIs on Spark

### How was this patch tested?

Manually tested for exposed major APIs and options as described above.

### Koalas contributors

Koalas would not have been possible without the following contributors:

ueshin
HyukjinKwon
rxin
xinrong-databricks
RainFung
charlesdong1991
harupy
floscha
beobest2
thunterdb
garawalid
LucasG0
shril
deepyaman
gioa
fwani
90jam
thoo
AbdealiJK
abishekganesh72
gliptak
DumbMachine
dvgodoy
stbof
nitlev
hjoo
gatorsmile
tomspur
icexelloss
awdavidson
guyao
akhilputhiry
scook12
patryk-oleniuk
tracek
dennyglee
athena15
gstaubli
WeichenXu123
hsubbaraj
lfdversluis
ktksq
shengjh
margaret-databricks
LSturtew
sllynn
manuzhang
jijosg
sadikovi

Closes #32036 from itholic/SPARK-34890.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-05 23:42:39 -04:00
+								#
 								# Licensed to the Apache Software Foundation (ASF) under one or more
 								# contributor license agreements.  See the NOTICE file distributed with
 								# this work for additional information regarding copyright ownership.
 								# The ASF licenses this file to You under the Apache License, Version 2.0
 								# (the "License"); you may not use this file except in compliance with
 								# the License.  You may obtain a copy of the License at
 								#
 								#    http://www.apache.org/licenses/LICENSE-2.0
 								#
 								# Unless required by applicable law or agreed to in writing, software
 								# distributed under the License is distributed on an "AS IS" BASIS,
 								# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 								# See the License for the specific language governing permissions and
 								# limitations under the License.
 								#
 								from distutils.version import LooseVersion
 								from functools import partial
 								from typing import Any, Optional, Tuple, Union, cast
 								import warnings
 								import pandas as pd
 								from pandas.api.types import is_list_like
 								from pandas.api.types import is_hashable
 								from pyspark import sql as spark
 								from pyspark.sql import functions as F, Window
 								# For running doctests and reference resolution in PyCharm.
-												[SPARK-34983][PYTHON] Renaming the package alias from pp to ps

### What changes were proposed in this pull request?

This PR proposes to fix:

```python
import pyspark.pandas as pp
```

to

```python
import pyspark.pandas as ps
```

### Why are the changes needed?

`pp` might sound offensive in some contexts.

### Does this PR introduce _any_ user-facing change?

The change is in master only. We'll use `ps` as the short name instead of `pp`.

### How was this patch tested?

The CI in this PR will test it out.

Closes #32108 from LSturtew/renaming_pyspark.pandas.

Authored-by: Luka Sturtewagen <luka.sturtewagen@linkit.nl>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-11 22:18:08 -04:00
+								from pyspark import pandas as ps  # noqa: F401
-												[SPARK-34890][PYTHON] Port/integrate Koalas main codes into PySpark

### What changes were proposed in this pull request?

As a first step of [SPARK-34849](https://issues.apache.org/jira/browse/SPARK-34849), this PR proposes porting the Koalas main code into PySpark.

This PR contains minimal changes to the existing Koalas code as follows:
1. `databricks.koalas` -> `pyspark.pandas`
2. `from databricks import koalas as ks` -> `from pyspark import pandas as pp`
3. `ks.xxx -> pp.xxx`

Other than them:
1. Added a line to `python/mypy.ini` in order to ignore the mypy test. See related issue at [SPARK-34941](https://issues.apache.org/jira/browse/SPARK-34941).
2. Added a comment to several lines in several files to ignore the flake8 F401. See related issue at [SPARK-34943](https://issues.apache.org/jira/browse/SPARK-34943).

When this PR is merged, all the features that were previously used in [Koalas](https://github.com/databricks/koalas) will be available in PySpark as well.

Users can access to the pandas API in PySpark as below:

```python
>>> from pyspark import pandas as pp
>>> ppdf = pp.DataFrame({"A": [1, 2, 3], "B": [15, 20, 25]})
>>> ppdf
   A   B
0  1  15
1  2  20
2  3  25
```

The existing "options and settings" in Koalas are also available in the same way:

```python
>>> from pyspark.pandas.config import set_option, reset_option, get_option
>>> ppser1 = pp.Series([1, 2, 3])
>>> ppser2 = pp.Series([3, 4, 5])
>>> ppser1 + ppser2
Traceback (most recent call last):
...
ValueError: Cannot combine the series or dataframe because it comes from a different dataframe. In order to allow this operation, enable 'compute.ops_on_diff_frames' option.

>>> set_option("compute.ops_on_diff_frames", True)
>>> ppser1 + ppser2
0    4
1    6
2    8
dtype: int64
```

Please also refer to the [API Reference](https://koalas.readthedocs.io/en/latest/reference/index.html) and [Options and Settings](https://koalas.readthedocs.io/en/latest/user_guide/options.html) for more detail.

**NOTE** that this PR intentionally ports the main codes of Koalas first almost as are with minimal changes because:
- Koalas project is fairly large. Making some changes together for PySpark will make it difficult to review the individual change.
    Koalas dev includes multiple Spark committers who will review. By doing this, the committers will be able to more easily and effectively review and drive the development.
- Koalas tests and documentation require major changes to make it look great together with PySpark whereas main codes do not require.
- We lately froze the Koalas codebase, and plan to work together on the initial porting. By porting the main codes first as are, it unblocks the Koalas dev to work on other items in parallel.

I promise and will make sure on:
- Rename Koalas to PySpark pandas APIs and/or pandas-on-Spark accordingly in documentation, and the docstrings and comments in the main codes.
- Triage APIs to remove that don’t make sense when Koalas is in PySpark

The documentation changes will be tracked in [SPARK-34885](https://issues.apache.org/jira/browse/SPARK-34885), the test code changes will be tracked in [SPARK-34886](https://issues.apache.org/jira/browse/SPARK-34886).

### Why are the changes needed?

Please refer to:
- [[DISCUSS] Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/DISCUSS-Support-pandas-API-layer-on-PySpark-td30945.html)
- [[VOTE] SPIP: Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/VOTE-SPIP-Support-pandas-API-layer-on-PySpark-td30996.html)

### Does this PR introduce _any_ user-facing change?

Yes, now users can use the pandas APIs on Spark

### How was this patch tested?

Manually tested for exposed major APIs and options as described above.

### Koalas contributors

Koalas would not have been possible without the following contributors:

ueshin
HyukjinKwon
rxin
xinrong-databricks
RainFung
charlesdong1991
harupy
floscha
beobest2
thunterdb
garawalid
LucasG0
shril
deepyaman
gioa
fwani
90jam
thoo
AbdealiJK
abishekganesh72
gliptak
DumbMachine
dvgodoy
stbof
nitlev
hjoo
gatorsmile
tomspur
icexelloss
awdavidson
guyao
akhilputhiry
scook12
patryk-oleniuk
tracek
dennyglee
athena15
gstaubli
WeichenXu123
hsubbaraj
lfdversluis
ktksq
shengjh
margaret-databricks
LSturtew
sllynn
manuzhang
jijosg
sadikovi

Closes #32036 from itholic/SPARK-34890.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-05 23:42:39 -04:00
+								from pyspark.pandas.exceptions import PandasNotImplementedError
 								from pyspark.pandas.frame import DataFrame
 								from pyspark.pandas.indexes.base import Index
 								from pyspark.pandas.missing.indexes import MissingPandasLikeMultiIndex
 								from pyspark.pandas.series import Series, first_series
 								from pyspark.pandas.utils import (
 								    compare_disallow_null,
 								    is_name_like_tuple,
 								    name_like_string,
 								    scol_for,
 								    verify_temp_column_name,
 								)
 								from pyspark.pandas.internal import (
 								    InternalFrame,
 								    NATURAL_ORDER_COLUMN_NAME,
 								    SPARK_INDEX_NAME_FORMAT,
 								)
 								from pyspark.pandas.typedef import Scalar
 								class MultiIndex(Index):
 								    """
 								    Koalas MultiIndex that corresponds to pandas MultiIndex logically. This might hold Spark Column
 								    internally.
 								    Parameters
 								    ----------
 								    levels : sequence of arrays
 								        The unique labels for each level.
 								    codes : sequence of arrays
 								        Integers for each level designating which label at each location.
 								    sortorder : optional int
 								        Level of sortedness (must be lexicographically sorted by that
 								        level).
 								    names : optional sequence of objects
 								        Names for each of the index levels. (name is accepted for compat).
 								    copy : bool, default False
 								        Copy the meta-data.
 								    verify_integrity : bool, default True
 								        Check that the levels/codes are consistent and valid.
 								    See Also
 								    --------
 								    MultiIndex.from_arrays  : Convert list of arrays to MultiIndex.
 								    MultiIndex.from_product : Create a MultiIndex from the cartesian product
 								                              of iterables.
 								    MultiIndex.from_tuples  : Convert list of tuples to a MultiIndex.
 								    MultiIndex.from_frame   : Make a MultiIndex from a DataFrame.
 								    Index : A single-level Index.
 								    Examples
 								    --------
-												[SPARK-34983][PYTHON] Renaming the package alias from pp to ps

### What changes were proposed in this pull request?

This PR proposes to fix:

```python
import pyspark.pandas as pp
```

to

```python
import pyspark.pandas as ps
```

### Why are the changes needed?

`pp` might sound offensive in some contexts.

### Does this PR introduce _any_ user-facing change?

The change is in master only. We'll use `ps` as the short name instead of `pp`.

### How was this patch tested?

The CI in this PR will test it out.

Closes #32108 from LSturtew/renaming_pyspark.pandas.

Authored-by: Luka Sturtewagen <luka.sturtewagen@linkit.nl>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-11 22:18:08 -04:00
+								    >>> ps.DataFrame({'a': ['a', 'b', 'c']}, index=[[1, 2, 3], [4, 5, 6]]).index  # doctest: +SKIP
-												[SPARK-34890][PYTHON] Port/integrate Koalas main codes into PySpark

### What changes were proposed in this pull request?

As a first step of [SPARK-34849](https://issues.apache.org/jira/browse/SPARK-34849), this PR proposes porting the Koalas main code into PySpark.

This PR contains minimal changes to the existing Koalas code as follows:
1. `databricks.koalas` -> `pyspark.pandas`
2. `from databricks import koalas as ks` -> `from pyspark import pandas as pp`
3. `ks.xxx -> pp.xxx`

Other than them:
1. Added a line to `python/mypy.ini` in order to ignore the mypy test. See related issue at [SPARK-34941](https://issues.apache.org/jira/browse/SPARK-34941).
2. Added a comment to several lines in several files to ignore the flake8 F401. See related issue at [SPARK-34943](https://issues.apache.org/jira/browse/SPARK-34943).

When this PR is merged, all the features that were previously used in [Koalas](https://github.com/databricks/koalas) will be available in PySpark as well.

Users can access to the pandas API in PySpark as below:

```python
>>> from pyspark import pandas as pp
>>> ppdf = pp.DataFrame({"A": [1, 2, 3], "B": [15, 20, 25]})
>>> ppdf
   A   B
0  1  15
1  2  20
2  3  25
```

The existing "options and settings" in Koalas are also available in the same way:

```python
>>> from pyspark.pandas.config import set_option, reset_option, get_option
>>> ppser1 = pp.Series([1, 2, 3])
>>> ppser2 = pp.Series([3, 4, 5])
>>> ppser1 + ppser2
Traceback (most recent call last):
...
ValueError: Cannot combine the series or dataframe because it comes from a different dataframe. In order to allow this operation, enable 'compute.ops_on_diff_frames' option.

>>> set_option("compute.ops_on_diff_frames", True)
>>> ppser1 + ppser2
0    4
1    6
2    8
dtype: int64
```

Please also refer to the [API Reference](https://koalas.readthedocs.io/en/latest/reference/index.html) and [Options and Settings](https://koalas.readthedocs.io/en/latest/user_guide/options.html) for more detail.

**NOTE** that this PR intentionally ports the main codes of Koalas first almost as are with minimal changes because:
- Koalas project is fairly large. Making some changes together for PySpark will make it difficult to review the individual change.
    Koalas dev includes multiple Spark committers who will review. By doing this, the committers will be able to more easily and effectively review and drive the development.
- Koalas tests and documentation require major changes to make it look great together with PySpark whereas main codes do not require.
- We lately froze the Koalas codebase, and plan to work together on the initial porting. By porting the main codes first as are, it unblocks the Koalas dev to work on other items in parallel.

I promise and will make sure on:
- Rename Koalas to PySpark pandas APIs and/or pandas-on-Spark accordingly in documentation, and the docstrings and comments in the main codes.
- Triage APIs to remove that don’t make sense when Koalas is in PySpark

The documentation changes will be tracked in [SPARK-34885](https://issues.apache.org/jira/browse/SPARK-34885), the test code changes will be tracked in [SPARK-34886](https://issues.apache.org/jira/browse/SPARK-34886).

### Why are the changes needed?

Please refer to:
- [[DISCUSS] Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/DISCUSS-Support-pandas-API-layer-on-PySpark-td30945.html)
- [[VOTE] SPIP: Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/VOTE-SPIP-Support-pandas-API-layer-on-PySpark-td30996.html)

### Does this PR introduce _any_ user-facing change?

Yes, now users can use the pandas APIs on Spark

### How was this patch tested?

Manually tested for exposed major APIs and options as described above.

### Koalas contributors

Koalas would not have been possible without the following contributors:

ueshin
HyukjinKwon
rxin
xinrong-databricks
RainFung
charlesdong1991
harupy
floscha
beobest2
thunterdb
garawalid
LucasG0
shril
deepyaman
gioa
fwani
90jam
thoo
AbdealiJK
abishekganesh72
gliptak
DumbMachine
dvgodoy
stbof
nitlev
hjoo
gatorsmile
tomspur
icexelloss
awdavidson
guyao
akhilputhiry
scook12
patryk-oleniuk
tracek
dennyglee
athena15
gstaubli
WeichenXu123
hsubbaraj
lfdversluis
ktksq
shengjh
margaret-databricks
LSturtew
sllynn
manuzhang
jijosg
sadikovi

Closes #32036 from itholic/SPARK-34890.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-05 23:42:39 -04:00
+								    MultiIndex([(1, 4),
 								                (2, 5),
 								                (3, 6)],
 								               )
-												[SPARK-34983][PYTHON] Renaming the package alias from pp to ps

### What changes were proposed in this pull request?

This PR proposes to fix:

```python
import pyspark.pandas as pp
```

to

```python
import pyspark.pandas as ps
```

### Why are the changes needed?

`pp` might sound offensive in some contexts.

### Does this PR introduce _any_ user-facing change?

The change is in master only. We'll use `ps` as the short name instead of `pp`.

### How was this patch tested?

The CI in this PR will test it out.

Closes #32108 from LSturtew/renaming_pyspark.pandas.

Authored-by: Luka Sturtewagen <luka.sturtewagen@linkit.nl>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-11 22:18:08 -04:00
+								    >>> ps.DataFrame({'a': [1, 2, 3]}, index=[list('abc'), list('def')]).index  # doctest: +SKIP
-												[SPARK-34890][PYTHON] Port/integrate Koalas main codes into PySpark

### What changes were proposed in this pull request?

As a first step of [SPARK-34849](https://issues.apache.org/jira/browse/SPARK-34849), this PR proposes porting the Koalas main code into PySpark.

This PR contains minimal changes to the existing Koalas code as follows:
1. `databricks.koalas` -> `pyspark.pandas`
2. `from databricks import koalas as ks` -> `from pyspark import pandas as pp`
3. `ks.xxx -> pp.xxx`

Other than them:
1. Added a line to `python/mypy.ini` in order to ignore the mypy test. See related issue at [SPARK-34941](https://issues.apache.org/jira/browse/SPARK-34941).
2. Added a comment to several lines in several files to ignore the flake8 F401. See related issue at [SPARK-34943](https://issues.apache.org/jira/browse/SPARK-34943).

When this PR is merged, all the features that were previously used in [Koalas](https://github.com/databricks/koalas) will be available in PySpark as well.

Users can access to the pandas API in PySpark as below:

```python
>>> from pyspark import pandas as pp
>>> ppdf = pp.DataFrame({"A": [1, 2, 3], "B": [15, 20, 25]})
>>> ppdf
   A   B
0  1  15
1  2  20
2  3  25
```

The existing "options and settings" in Koalas are also available in the same way:

```python
>>> from pyspark.pandas.config import set_option, reset_option, get_option
>>> ppser1 = pp.Series([1, 2, 3])
>>> ppser2 = pp.Series([3, 4, 5])
>>> ppser1 + ppser2
Traceback (most recent call last):
...
ValueError: Cannot combine the series or dataframe because it comes from a different dataframe. In order to allow this operation, enable 'compute.ops_on_diff_frames' option.

>>> set_option("compute.ops_on_diff_frames", True)
>>> ppser1 + ppser2
0    4
1    6
2    8
dtype: int64
```

Please also refer to the [API Reference](https://koalas.readthedocs.io/en/latest/reference/index.html) and [Options and Settings](https://koalas.readthedocs.io/en/latest/user_guide/options.html) for more detail.

**NOTE** that this PR intentionally ports the main codes of Koalas first almost as are with minimal changes because:
- Koalas project is fairly large. Making some changes together for PySpark will make it difficult to review the individual change.
    Koalas dev includes multiple Spark committers who will review. By doing this, the committers will be able to more easily and effectively review and drive the development.
- Koalas tests and documentation require major changes to make it look great together with PySpark whereas main codes do not require.
- We lately froze the Koalas codebase, and plan to work together on the initial porting. By porting the main codes first as are, it unblocks the Koalas dev to work on other items in parallel.

I promise and will make sure on:
- Rename Koalas to PySpark pandas APIs and/or pandas-on-Spark accordingly in documentation, and the docstrings and comments in the main codes.
- Triage APIs to remove that don’t make sense when Koalas is in PySpark

The documentation changes will be tracked in [SPARK-34885](https://issues.apache.org/jira/browse/SPARK-34885), the test code changes will be tracked in [SPARK-34886](https://issues.apache.org/jira/browse/SPARK-34886).

### Why are the changes needed?

Please refer to:
- [[DISCUSS] Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/DISCUSS-Support-pandas-API-layer-on-PySpark-td30945.html)
- [[VOTE] SPIP: Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/VOTE-SPIP-Support-pandas-API-layer-on-PySpark-td30996.html)

### Does this PR introduce _any_ user-facing change?

Yes, now users can use the pandas APIs on Spark

### How was this patch tested?

Manually tested for exposed major APIs and options as described above.

### Koalas contributors

Koalas would not have been possible without the following contributors:

ueshin
HyukjinKwon
rxin
xinrong-databricks
RainFung
charlesdong1991
harupy
floscha
beobest2
thunterdb
garawalid
LucasG0
shril
deepyaman
gioa
fwani
90jam
thoo
AbdealiJK
abishekganesh72
gliptak
DumbMachine
dvgodoy
stbof
nitlev
hjoo
gatorsmile
tomspur
icexelloss
awdavidson
guyao
akhilputhiry
scook12
patryk-oleniuk
tracek
dennyglee
athena15
gstaubli
WeichenXu123
hsubbaraj
lfdversluis
ktksq
shengjh
margaret-databricks
LSturtew
sllynn
manuzhang
jijosg
sadikovi

Closes #32036 from itholic/SPARK-34890.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-05 23:42:39 -04:00
+								    MultiIndex([('a', 'd'),
 								                ('b', 'e'),
 								                ('c', 'f')],
 								               )
 								    """
 								    def __new__(
 								        cls,
 								        levels=None,
 								        codes=None,
 								        sortorder=None,
 								        names=None,
 								        dtype=None,
 								        copy=False,
 								        name=None,
 								        verify_integrity: bool = True,
 								    ):
 								        if LooseVersion(pd.__version__) < LooseVersion("0.24"):
 								            if levels is None or codes is None:
 								                raise TypeError("Must pass both levels and codes")
 								            pidx = pd.MultiIndex(
 								                levels=levels,
 								                labels=codes,
 								                sortorder=sortorder,
 								                names=names,
 								                dtype=dtype,
 								                copy=copy,
 								                name=name,
 								                verify_integrity=verify_integrity,
 								            )
 								        else:
 								            pidx = pd.MultiIndex(
 								                levels=levels,
 								                codes=codes,
 								                sortorder=sortorder,
 								                names=names,
 								                dtype=dtype,
 								                copy=copy,
 								                name=name,
 								                verify_integrity=verify_integrity,
 								            )
-												[SPARK-34983][PYTHON] Renaming the package alias from pp to ps

### What changes were proposed in this pull request?

This PR proposes to fix:

```python
import pyspark.pandas as pp
```

to

```python
import pyspark.pandas as ps
```

### Why are the changes needed?

`pp` might sound offensive in some contexts.

### Does this PR introduce _any_ user-facing change?

The change is in master only. We'll use `ps` as the short name instead of `pp`.

### How was this patch tested?

The CI in this PR will test it out.

Closes #32108 from LSturtew/renaming_pyspark.pandas.

Authored-by: Luka Sturtewagen <luka.sturtewagen@linkit.nl>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-11 22:18:08 -04:00
+								        return ps.from_pandas(pidx)
-												[SPARK-34890][PYTHON] Port/integrate Koalas main codes into PySpark

### What changes were proposed in this pull request?

As a first step of [SPARK-34849](https://issues.apache.org/jira/browse/SPARK-34849), this PR proposes porting the Koalas main code into PySpark.

This PR contains minimal changes to the existing Koalas code as follows:
1. `databricks.koalas` -> `pyspark.pandas`
2. `from databricks import koalas as ks` -> `from pyspark import pandas as pp`
3. `ks.xxx -> pp.xxx`

Other than them:
1. Added a line to `python/mypy.ini` in order to ignore the mypy test. See related issue at [SPARK-34941](https://issues.apache.org/jira/browse/SPARK-34941).
2. Added a comment to several lines in several files to ignore the flake8 F401. See related issue at [SPARK-34943](https://issues.apache.org/jira/browse/SPARK-34943).

When this PR is merged, all the features that were previously used in [Koalas](https://github.com/databricks/koalas) will be available in PySpark as well.

Users can access to the pandas API in PySpark as below:

```python
>>> from pyspark import pandas as pp
>>> ppdf = pp.DataFrame({"A": [1, 2, 3], "B": [15, 20, 25]})
>>> ppdf
   A   B
0  1  15
1  2  20
2  3  25
```

The existing "options and settings" in Koalas are also available in the same way:

```python
>>> from pyspark.pandas.config import set_option, reset_option, get_option
>>> ppser1 = pp.Series([1, 2, 3])
>>> ppser2 = pp.Series([3, 4, 5])
>>> ppser1 + ppser2
Traceback (most recent call last):
...
ValueError: Cannot combine the series or dataframe because it comes from a different dataframe. In order to allow this operation, enable 'compute.ops_on_diff_frames' option.

>>> set_option("compute.ops_on_diff_frames", True)
>>> ppser1 + ppser2
0    4
1    6
2    8
dtype: int64
```

Please also refer to the [API Reference](https://koalas.readthedocs.io/en/latest/reference/index.html) and [Options and Settings](https://koalas.readthedocs.io/en/latest/user_guide/options.html) for more detail.

**NOTE** that this PR intentionally ports the main codes of Koalas first almost as are with minimal changes because:
- Koalas project is fairly large. Making some changes together for PySpark will make it difficult to review the individual change.
    Koalas dev includes multiple Spark committers who will review. By doing this, the committers will be able to more easily and effectively review and drive the development.
- Koalas tests and documentation require major changes to make it look great together with PySpark whereas main codes do not require.
- We lately froze the Koalas codebase, and plan to work together on the initial porting. By porting the main codes first as are, it unblocks the Koalas dev to work on other items in parallel.

I promise and will make sure on:
- Rename Koalas to PySpark pandas APIs and/or pandas-on-Spark accordingly in documentation, and the docstrings and comments in the main codes.
- Triage APIs to remove that don’t make sense when Koalas is in PySpark

The documentation changes will be tracked in [SPARK-34885](https://issues.apache.org/jira/browse/SPARK-34885), the test code changes will be tracked in [SPARK-34886](https://issues.apache.org/jira/browse/SPARK-34886).

### Why are the changes needed?

Please refer to:
- [[DISCUSS] Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/DISCUSS-Support-pandas-API-layer-on-PySpark-td30945.html)
- [[VOTE] SPIP: Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/VOTE-SPIP-Support-pandas-API-layer-on-PySpark-td30996.html)

### Does this PR introduce _any_ user-facing change?

Yes, now users can use the pandas APIs on Spark

### How was this patch tested?

Manually tested for exposed major APIs and options as described above.

### Koalas contributors

Koalas would not have been possible without the following contributors:

ueshin
HyukjinKwon
rxin
xinrong-databricks
RainFung
charlesdong1991
harupy
floscha
beobest2
thunterdb
garawalid
LucasG0
shril
deepyaman
gioa
fwani
90jam
thoo
AbdealiJK
abishekganesh72
gliptak
DumbMachine
dvgodoy
stbof
nitlev
hjoo
gatorsmile
tomspur
icexelloss
awdavidson
guyao
akhilputhiry
scook12
patryk-oleniuk
tracek
dennyglee
athena15
gstaubli
WeichenXu123
hsubbaraj
lfdversluis
ktksq
shengjh
margaret-databricks
LSturtew
sllynn
manuzhang
jijosg
sadikovi

Closes #32036 from itholic/SPARK-34890.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-05 23:42:39 -04:00
 								    @property
 								    def _internal(self):
 								        internal = self._kdf._internal
 								        scol = F.struct(internal.index_spark_columns)
 								        return internal.copy(
 								            column_labels=[None],
 								            data_spark_columns=[scol],
 								            data_dtypes=[None],
 								            column_label_names=None,
 								        )
 								    @property
 								    def _column_label(self):
 								        return None
 								    def __abs__(self):
 								        raise TypeError("TypeError: cannot perform __abs__ with this index type: MultiIndex")
 								    def _with_new_scol(self, scol: spark.Column, *, dtype=None):
 								        raise NotImplementedError("Not supported for type MultiIndex")
 								    def _align_and_column_op(self, f, *args) -> Index:
 								        raise NotImplementedError("Not supported for type MultiIndex")
 								    def any(self, *args, **kwargs) -> None:
 								        raise TypeError("cannot perform any with this index type: MultiIndex")
 								    def all(self, *args, **kwargs) -> None:
 								        raise TypeError("cannot perform all with this index type: MultiIndex")
 								    @staticmethod
 								    def from_tuples(tuples, sortorder=None, names=None) -> "MultiIndex":
 								        """
 								        Convert list of tuples to MultiIndex.
 								        Parameters
 								        ----------
 								        tuples : list / sequence of tuple-likes
 								            Each tuple is the index of one row/column.
 								        sortorder : int or None
 								            Level of sortedness (must be lexicographically sorted by that level).
 								        names : list / sequence of str, optional
 								            Names for the levels in the index.
 								        Returns
 								        -------
 								        index : MultiIndex
 								        Examples
 								        --------
 								        >>> tuples = [(1, 'red'), (1, 'blue'),
 								        ...           (2, 'red'), (2, 'blue')]
-												[SPARK-34983][PYTHON] Renaming the package alias from pp to ps

### What changes were proposed in this pull request?

This PR proposes to fix:

```python
import pyspark.pandas as pp
```

to

```python
import pyspark.pandas as ps
```

### Why are the changes needed?

`pp` might sound offensive in some contexts.

### Does this PR introduce _any_ user-facing change?

The change is in master only. We'll use `ps` as the short name instead of `pp`.

### How was this patch tested?

The CI in this PR will test it out.

Closes #32108 from LSturtew/renaming_pyspark.pandas.

Authored-by: Luka Sturtewagen <luka.sturtewagen@linkit.nl>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-11 22:18:08 -04:00
+								        >>> ps.MultiIndex.from_tuples(tuples, names=('number', 'color'))  # doctest: +SKIP
-												[SPARK-34890][PYTHON] Port/integrate Koalas main codes into PySpark

### What changes were proposed in this pull request?

As a first step of [SPARK-34849](https://issues.apache.org/jira/browse/SPARK-34849), this PR proposes porting the Koalas main code into PySpark.

This PR contains minimal changes to the existing Koalas code as follows:
1. `databricks.koalas` -> `pyspark.pandas`
2. `from databricks import koalas as ks` -> `from pyspark import pandas as pp`
3. `ks.xxx -> pp.xxx`

Other than them:
1. Added a line to `python/mypy.ini` in order to ignore the mypy test. See related issue at [SPARK-34941](https://issues.apache.org/jira/browse/SPARK-34941).
2. Added a comment to several lines in several files to ignore the flake8 F401. See related issue at [SPARK-34943](https://issues.apache.org/jira/browse/SPARK-34943).

When this PR is merged, all the features that were previously used in [Koalas](https://github.com/databricks/koalas) will be available in PySpark as well.

Users can access to the pandas API in PySpark as below:

```python
>>> from pyspark import pandas as pp
>>> ppdf = pp.DataFrame({"A": [1, 2, 3], "B": [15, 20, 25]})
>>> ppdf
   A   B
0  1  15
1  2  20
2  3  25
```

The existing "options and settings" in Koalas are also available in the same way:

```python
>>> from pyspark.pandas.config import set_option, reset_option, get_option
>>> ppser1 = pp.Series([1, 2, 3])
>>> ppser2 = pp.Series([3, 4, 5])
>>> ppser1 + ppser2
Traceback (most recent call last):
...
ValueError: Cannot combine the series or dataframe because it comes from a different dataframe. In order to allow this operation, enable 'compute.ops_on_diff_frames' option.

>>> set_option("compute.ops_on_diff_frames", True)
>>> ppser1 + ppser2
0    4
1    6
2    8
dtype: int64
```

Please also refer to the [API Reference](https://koalas.readthedocs.io/en/latest/reference/index.html) and [Options and Settings](https://koalas.readthedocs.io/en/latest/user_guide/options.html) for more detail.

**NOTE** that this PR intentionally ports the main codes of Koalas first almost as are with minimal changes because:
- Koalas project is fairly large. Making some changes together for PySpark will make it difficult to review the individual change.
    Koalas dev includes multiple Spark committers who will review. By doing this, the committers will be able to more easily and effectively review and drive the development.
- Koalas tests and documentation require major changes to make it look great together with PySpark whereas main codes do not require.
- We lately froze the Koalas codebase, and plan to work together on the initial porting. By porting the main codes first as are, it unblocks the Koalas dev to work on other items in parallel.

I promise and will make sure on:
- Rename Koalas to PySpark pandas APIs and/or pandas-on-Spark accordingly in documentation, and the docstrings and comments in the main codes.
- Triage APIs to remove that don’t make sense when Koalas is in PySpark

The documentation changes will be tracked in [SPARK-34885](https://issues.apache.org/jira/browse/SPARK-34885), the test code changes will be tracked in [SPARK-34886](https://issues.apache.org/jira/browse/SPARK-34886).

### Why are the changes needed?

Please refer to:
- [[DISCUSS] Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/DISCUSS-Support-pandas-API-layer-on-PySpark-td30945.html)
- [[VOTE] SPIP: Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/VOTE-SPIP-Support-pandas-API-layer-on-PySpark-td30996.html)

### Does this PR introduce _any_ user-facing change?

Yes, now users can use the pandas APIs on Spark

### How was this patch tested?

Manually tested for exposed major APIs and options as described above.

### Koalas contributors

Koalas would not have been possible without the following contributors:

ueshin
HyukjinKwon
rxin
xinrong-databricks
RainFung
charlesdong1991
harupy
floscha
beobest2
thunterdb
garawalid
LucasG0
shril
deepyaman
gioa
fwani
90jam
thoo
AbdealiJK
abishekganesh72
gliptak
DumbMachine
dvgodoy
stbof
nitlev
hjoo
gatorsmile
tomspur
icexelloss
awdavidson
guyao
akhilputhiry
scook12
patryk-oleniuk
tracek
dennyglee
athena15
gstaubli
WeichenXu123
hsubbaraj
lfdversluis
ktksq
shengjh
margaret-databricks
LSturtew
sllynn
manuzhang
jijosg
sadikovi

Closes #32036 from itholic/SPARK-34890.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-05 23:42:39 -04:00
+								        MultiIndex([(1,  'red'),
 								                    (1, 'blue'),
 								                    (2,  'red'),
 								                    (2, 'blue')],
 								                   names=['number', 'color'])
 								        """
 								        return cast(
 								            MultiIndex,
-												[SPARK-34983][PYTHON] Renaming the package alias from pp to ps

### What changes were proposed in this pull request?

This PR proposes to fix:

```python
import pyspark.pandas as pp
```

to

```python
import pyspark.pandas as ps
```

### Why are the changes needed?

`pp` might sound offensive in some contexts.

### Does this PR introduce _any_ user-facing change?

The change is in master only. We'll use `ps` as the short name instead of `pp`.

### How was this patch tested?

The CI in this PR will test it out.

Closes #32108 from LSturtew/renaming_pyspark.pandas.

Authored-by: Luka Sturtewagen <luka.sturtewagen@linkit.nl>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-11 22:18:08 -04:00
+								            ps.from_pandas(
-												[SPARK-34890][PYTHON] Port/integrate Koalas main codes into PySpark

### What changes were proposed in this pull request?

As a first step of [SPARK-34849](https://issues.apache.org/jira/browse/SPARK-34849), this PR proposes porting the Koalas main code into PySpark.

This PR contains minimal changes to the existing Koalas code as follows:
1. `databricks.koalas` -> `pyspark.pandas`
2. `from databricks import koalas as ks` -> `from pyspark import pandas as pp`
3. `ks.xxx -> pp.xxx`

Other than them:
1. Added a line to `python/mypy.ini` in order to ignore the mypy test. See related issue at [SPARK-34941](https://issues.apache.org/jira/browse/SPARK-34941).
2. Added a comment to several lines in several files to ignore the flake8 F401. See related issue at [SPARK-34943](https://issues.apache.org/jira/browse/SPARK-34943).

When this PR is merged, all the features that were previously used in [Koalas](https://github.com/databricks/koalas) will be available in PySpark as well.

Users can access to the pandas API in PySpark as below:

```python
>>> from pyspark import pandas as pp
>>> ppdf = pp.DataFrame({"A": [1, 2, 3], "B": [15, 20, 25]})
>>> ppdf
   A   B
0  1  15
1  2  20
2  3  25
```

The existing "options and settings" in Koalas are also available in the same way:

```python
>>> from pyspark.pandas.config import set_option, reset_option, get_option
>>> ppser1 = pp.Series([1, 2, 3])
>>> ppser2 = pp.Series([3, 4, 5])
>>> ppser1 + ppser2
Traceback (most recent call last):
...
ValueError: Cannot combine the series or dataframe because it comes from a different dataframe. In order to allow this operation, enable 'compute.ops_on_diff_frames' option.

>>> set_option("compute.ops_on_diff_frames", True)
>>> ppser1 + ppser2
0    4
1    6
2    8
dtype: int64
```

Please also refer to the [API Reference](https://koalas.readthedocs.io/en/latest/reference/index.html) and [Options and Settings](https://koalas.readthedocs.io/en/latest/user_guide/options.html) for more detail.

**NOTE** that this PR intentionally ports the main codes of Koalas first almost as are with minimal changes because:
- Koalas project is fairly large. Making some changes together for PySpark will make it difficult to review the individual change.
    Koalas dev includes multiple Spark committers who will review. By doing this, the committers will be able to more easily and effectively review and drive the development.
- Koalas tests and documentation require major changes to make it look great together with PySpark whereas main codes do not require.
- We lately froze the Koalas codebase, and plan to work together on the initial porting. By porting the main codes first as are, it unblocks the Koalas dev to work on other items in parallel.

I promise and will make sure on:
- Rename Koalas to PySpark pandas APIs and/or pandas-on-Spark accordingly in documentation, and the docstrings and comments in the main codes.
- Triage APIs to remove that don’t make sense when Koalas is in PySpark

The documentation changes will be tracked in [SPARK-34885](https://issues.apache.org/jira/browse/SPARK-34885), the test code changes will be tracked in [SPARK-34886](https://issues.apache.org/jira/browse/SPARK-34886).

### Why are the changes needed?

Please refer to:
- [[DISCUSS] Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/DISCUSS-Support-pandas-API-layer-on-PySpark-td30945.html)
- [[VOTE] SPIP: Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/VOTE-SPIP-Support-pandas-API-layer-on-PySpark-td30996.html)

### Does this PR introduce _any_ user-facing change?

Yes, now users can use the pandas APIs on Spark

### How was this patch tested?

Manually tested for exposed major APIs and options as described above.

### Koalas contributors

Koalas would not have been possible without the following contributors:

ueshin
HyukjinKwon
rxin
xinrong-databricks
RainFung
charlesdong1991
harupy
floscha
beobest2
thunterdb
garawalid
LucasG0
shril
deepyaman
gioa
fwani
90jam
thoo
AbdealiJK
abishekganesh72
gliptak
DumbMachine
dvgodoy
stbof
nitlev
hjoo
gatorsmile
tomspur
icexelloss
awdavidson
guyao
akhilputhiry
scook12
patryk-oleniuk
tracek
dennyglee
athena15
gstaubli
WeichenXu123
hsubbaraj
lfdversluis
ktksq
shengjh
margaret-databricks
LSturtew
sllynn
manuzhang
jijosg
sadikovi

Closes #32036 from itholic/SPARK-34890.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-05 23:42:39 -04:00
+								                pd.MultiIndex.from_tuples(tuples=tuples, sortorder=sortorder, names=names)
 								            ),
 								        )
 								    @staticmethod
 								    def from_arrays(arrays, sortorder=None, names=None) -> "MultiIndex":
 								        """
 								        Convert arrays to MultiIndex.
 								        Parameters
 								        ----------
 								        arrays: list / sequence of array-likes
 								            Each array-like gives one level’s value for each data point. len(arrays)
 								            is the number of levels.
 								        sortorder: int or None
 								            Level of sortedness (must be lexicographically sorted by that level).
 								        names: list / sequence of str, optional
 								            Names for the levels in the index.
 								        Returns
 								        -------
 								        index: MultiIndex
 								        Examples
 								        --------
 								        >>> arrays = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']]
-												[SPARK-34983][PYTHON] Renaming the package alias from pp to ps

### What changes were proposed in this pull request?

This PR proposes to fix:

```python
import pyspark.pandas as pp
```

to

```python
import pyspark.pandas as ps
```

### Why are the changes needed?

`pp` might sound offensive in some contexts.

### Does this PR introduce _any_ user-facing change?

The change is in master only. We'll use `ps` as the short name instead of `pp`.

### How was this patch tested?

The CI in this PR will test it out.

Closes #32108 from LSturtew/renaming_pyspark.pandas.

Authored-by: Luka Sturtewagen <luka.sturtewagen@linkit.nl>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-11 22:18:08 -04:00
+								        >>> ps.MultiIndex.from_arrays(arrays, names=('number', 'color'))  # doctest: +SKIP
-												[SPARK-34890][PYTHON] Port/integrate Koalas main codes into PySpark

### What changes were proposed in this pull request?

As a first step of [SPARK-34849](https://issues.apache.org/jira/browse/SPARK-34849), this PR proposes porting the Koalas main code into PySpark.

This PR contains minimal changes to the existing Koalas code as follows:
1. `databricks.koalas` -> `pyspark.pandas`
2. `from databricks import koalas as ks` -> `from pyspark import pandas as pp`
3. `ks.xxx -> pp.xxx`

Other than them:
1. Added a line to `python/mypy.ini` in order to ignore the mypy test. See related issue at [SPARK-34941](https://issues.apache.org/jira/browse/SPARK-34941).
2. Added a comment to several lines in several files to ignore the flake8 F401. See related issue at [SPARK-34943](https://issues.apache.org/jira/browse/SPARK-34943).

When this PR is merged, all the features that were previously used in [Koalas](https://github.com/databricks/koalas) will be available in PySpark as well.

Users can access to the pandas API in PySpark as below:

```python
>>> from pyspark import pandas as pp
>>> ppdf = pp.DataFrame({"A": [1, 2, 3], "B": [15, 20, 25]})
>>> ppdf
   A   B
0  1  15
1  2  20
2  3  25
```

The existing "options and settings" in Koalas are also available in the same way:

```python
>>> from pyspark.pandas.config import set_option, reset_option, get_option
>>> ppser1 = pp.Series([1, 2, 3])
>>> ppser2 = pp.Series([3, 4, 5])
>>> ppser1 + ppser2
Traceback (most recent call last):
...
ValueError: Cannot combine the series or dataframe because it comes from a different dataframe. In order to allow this operation, enable 'compute.ops_on_diff_frames' option.

>>> set_option("compute.ops_on_diff_frames", True)
>>> ppser1 + ppser2
0    4
1    6
2    8
dtype: int64
```

Please also refer to the [API Reference](https://koalas.readthedocs.io/en/latest/reference/index.html) and [Options and Settings](https://koalas.readthedocs.io/en/latest/user_guide/options.html) for more detail.

**NOTE** that this PR intentionally ports the main codes of Koalas first almost as are with minimal changes because:
- Koalas project is fairly large. Making some changes together for PySpark will make it difficult to review the individual change.
    Koalas dev includes multiple Spark committers who will review. By doing this, the committers will be able to more easily and effectively review and drive the development.
- Koalas tests and documentation require major changes to make it look great together with PySpark whereas main codes do not require.
- We lately froze the Koalas codebase, and plan to work together on the initial porting. By porting the main codes first as are, it unblocks the Koalas dev to work on other items in parallel.

I promise and will make sure on:
- Rename Koalas to PySpark pandas APIs and/or pandas-on-Spark accordingly in documentation, and the docstrings and comments in the main codes.
- Triage APIs to remove that don’t make sense when Koalas is in PySpark

The documentation changes will be tracked in [SPARK-34885](https://issues.apache.org/jira/browse/SPARK-34885), the test code changes will be tracked in [SPARK-34886](https://issues.apache.org/jira/browse/SPARK-34886).

### Why are the changes needed?

Please refer to:
- [[DISCUSS] Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/DISCUSS-Support-pandas-API-layer-on-PySpark-td30945.html)
- [[VOTE] SPIP: Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/VOTE-SPIP-Support-pandas-API-layer-on-PySpark-td30996.html)

### Does this PR introduce _any_ user-facing change?

Yes, now users can use the pandas APIs on Spark

### How was this patch tested?

Manually tested for exposed major APIs and options as described above.

### Koalas contributors

Koalas would not have been possible without the following contributors:

ueshin
HyukjinKwon
rxin
xinrong-databricks
RainFung
charlesdong1991
harupy
floscha
beobest2
thunterdb
garawalid
LucasG0
shril
deepyaman
gioa
fwani
90jam
thoo
AbdealiJK
abishekganesh72
gliptak
DumbMachine
dvgodoy
stbof
nitlev
hjoo
gatorsmile
tomspur
icexelloss
awdavidson
guyao
akhilputhiry
scook12
patryk-oleniuk
tracek
dennyglee
athena15
gstaubli
WeichenXu123
hsubbaraj
lfdversluis
ktksq
shengjh
margaret-databricks
LSturtew
sllynn
manuzhang
jijosg
sadikovi

Closes #32036 from itholic/SPARK-34890.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-05 23:42:39 -04:00
+								        MultiIndex([(1,  'red'),
 								                    (1, 'blue'),
 								                    (2,  'red'),
 								                    (2, 'blue')],
 								                   names=['number', 'color'])
 								        """
 								        return cast(
 								            MultiIndex,
-												[SPARK-34983][PYTHON] Renaming the package alias from pp to ps

### What changes were proposed in this pull request?

This PR proposes to fix:

```python
import pyspark.pandas as pp
```

to

```python
import pyspark.pandas as ps
```

### Why are the changes needed?

`pp` might sound offensive in some contexts.

### Does this PR introduce _any_ user-facing change?

The change is in master only. We'll use `ps` as the short name instead of `pp`.

### How was this patch tested?

The CI in this PR will test it out.

Closes #32108 from LSturtew/renaming_pyspark.pandas.

Authored-by: Luka Sturtewagen <luka.sturtewagen@linkit.nl>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-11 22:18:08 -04:00
+								            ps.from_pandas(
-												[SPARK-34890][PYTHON] Port/integrate Koalas main codes into PySpark

### What changes were proposed in this pull request?

As a first step of [SPARK-34849](https://issues.apache.org/jira/browse/SPARK-34849), this PR proposes porting the Koalas main code into PySpark.

This PR contains minimal changes to the existing Koalas code as follows:
1. `databricks.koalas` -> `pyspark.pandas`
2. `from databricks import koalas as ks` -> `from pyspark import pandas as pp`
3. `ks.xxx -> pp.xxx`

Other than them:
1. Added a line to `python/mypy.ini` in order to ignore the mypy test. See related issue at [SPARK-34941](https://issues.apache.org/jira/browse/SPARK-34941).
2. Added a comment to several lines in several files to ignore the flake8 F401. See related issue at [SPARK-34943](https://issues.apache.org/jira/browse/SPARK-34943).

When this PR is merged, all the features that were previously used in [Koalas](https://github.com/databricks/koalas) will be available in PySpark as well.

Users can access to the pandas API in PySpark as below:

```python
>>> from pyspark import pandas as pp
>>> ppdf = pp.DataFrame({"A": [1, 2, 3], "B": [15, 20, 25]})
>>> ppdf
   A   B
0  1  15
1  2  20
2  3  25
```

The existing "options and settings" in Koalas are also available in the same way:

```python
>>> from pyspark.pandas.config import set_option, reset_option, get_option
>>> ppser1 = pp.Series([1, 2, 3])
>>> ppser2 = pp.Series([3, 4, 5])
>>> ppser1 + ppser2
Traceback (most recent call last):
...
ValueError: Cannot combine the series or dataframe because it comes from a different dataframe. In order to allow this operation, enable 'compute.ops_on_diff_frames' option.

>>> set_option("compute.ops_on_diff_frames", True)
>>> ppser1 + ppser2
0    4
1    6
2    8
dtype: int64
```

Please also refer to the [API Reference](https://koalas.readthedocs.io/en/latest/reference/index.html) and [Options and Settings](https://koalas.readthedocs.io/en/latest/user_guide/options.html) for more detail.

**NOTE** that this PR intentionally ports the main codes of Koalas first almost as are with minimal changes because:
- Koalas project is fairly large. Making some changes together for PySpark will make it difficult to review the individual change.
    Koalas dev includes multiple Spark committers who will review. By doing this, the committers will be able to more easily and effectively review and drive the development.
- Koalas tests and documentation require major changes to make it look great together with PySpark whereas main codes do not require.
- We lately froze the Koalas codebase, and plan to work together on the initial porting. By porting the main codes first as are, it unblocks the Koalas dev to work on other items in parallel.

I promise and will make sure on:
- Rename Koalas to PySpark pandas APIs and/or pandas-on-Spark accordingly in documentation, and the docstrings and comments in the main codes.
- Triage APIs to remove that don’t make sense when Koalas is in PySpark

The documentation changes will be tracked in [SPARK-34885](https://issues.apache.org/jira/browse/SPARK-34885), the test code changes will be tracked in [SPARK-34886](https://issues.apache.org/jira/browse/SPARK-34886).

### Why are the changes needed?

Please refer to:
- [[DISCUSS] Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/DISCUSS-Support-pandas-API-layer-on-PySpark-td30945.html)
- [[VOTE] SPIP: Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/VOTE-SPIP-Support-pandas-API-layer-on-PySpark-td30996.html)

### Does this PR introduce _any_ user-facing change?

Yes, now users can use the pandas APIs on Spark

### How was this patch tested?

Manually tested for exposed major APIs and options as described above.

### Koalas contributors

Koalas would not have been possible without the following contributors:

ueshin
HyukjinKwon
rxin
xinrong-databricks
RainFung
charlesdong1991
harupy
floscha
beobest2
thunterdb
garawalid
LucasG0
shril
deepyaman
gioa
fwani
90jam
thoo
AbdealiJK
abishekganesh72
gliptak
DumbMachine
dvgodoy
stbof
nitlev
hjoo
gatorsmile
tomspur
icexelloss
awdavidson
guyao
akhilputhiry
scook12
patryk-oleniuk
tracek
dennyglee
athena15
gstaubli
WeichenXu123
hsubbaraj
lfdversluis
ktksq
shengjh
margaret-databricks
LSturtew
sllynn
manuzhang
jijosg
sadikovi

Closes #32036 from itholic/SPARK-34890.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-05 23:42:39 -04:00
+								                pd.MultiIndex.from_arrays(arrays=arrays, sortorder=sortorder, names=names)
 								            ),
 								        )
 								    @staticmethod
 								    def from_product(iterables, sortorder=None, names=None) -> "MultiIndex":
 								        """
 								        Make a MultiIndex from the cartesian product of multiple iterables.
 								        Parameters
 								        ----------
 								        iterables : list / sequence of iterables
 								            Each iterable has unique labels for each level of the index.
 								        sortorder : int or None
 								            Level of sortedness (must be lexicographically sorted by that
 								            level).
 								        names : list / sequence of str, optional
 								            Names for the levels in the index.
 								        Returns
 								        -------
 								        index : MultiIndex
 								        See Also
 								        --------
 								        MultiIndex.from_arrays : Convert list of arrays to MultiIndex.
 								        MultiIndex.from_tuples : Convert list of tuples to MultiIndex.
 								        Examples
 								        --------
 								        >>> numbers = [0, 1, 2]
 								        >>> colors = ['green', 'purple']
-												[SPARK-34983][PYTHON] Renaming the package alias from pp to ps

### What changes were proposed in this pull request?

This PR proposes to fix:

```python
import pyspark.pandas as pp
```

to

```python
import pyspark.pandas as ps
```

### Why are the changes needed?

`pp` might sound offensive in some contexts.

### Does this PR introduce _any_ user-facing change?

The change is in master only. We'll use `ps` as the short name instead of `pp`.

### How was this patch tested?

The CI in this PR will test it out.

Closes #32108 from LSturtew/renaming_pyspark.pandas.

Authored-by: Luka Sturtewagen <luka.sturtewagen@linkit.nl>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-11 22:18:08 -04:00
+								        >>> ps.MultiIndex.from_product([numbers, colors],
-												[SPARK-34890][PYTHON] Port/integrate Koalas main codes into PySpark

### What changes were proposed in this pull request?

As a first step of [SPARK-34849](https://issues.apache.org/jira/browse/SPARK-34849), this PR proposes porting the Koalas main code into PySpark.

This PR contains minimal changes to the existing Koalas code as follows:
1. `databricks.koalas` -> `pyspark.pandas`
2. `from databricks import koalas as ks` -> `from pyspark import pandas as pp`
3. `ks.xxx -> pp.xxx`

Other than them:
1. Added a line to `python/mypy.ini` in order to ignore the mypy test. See related issue at [SPARK-34941](https://issues.apache.org/jira/browse/SPARK-34941).
2. Added a comment to several lines in several files to ignore the flake8 F401. See related issue at [SPARK-34943](https://issues.apache.org/jira/browse/SPARK-34943).

When this PR is merged, all the features that were previously used in [Koalas](https://github.com/databricks/koalas) will be available in PySpark as well.

Users can access to the pandas API in PySpark as below:

```python
>>> from pyspark import pandas as pp
>>> ppdf = pp.DataFrame({"A": [1, 2, 3], "B": [15, 20, 25]})
>>> ppdf
   A   B
0  1  15
1  2  20
2  3  25
```

The existing "options and settings" in Koalas are also available in the same way:

```python
>>> from pyspark.pandas.config import set_option, reset_option, get_option
>>> ppser1 = pp.Series([1, 2, 3])
>>> ppser2 = pp.Series([3, 4, 5])
>>> ppser1 + ppser2
Traceback (most recent call last):
...
ValueError: Cannot combine the series or dataframe because it comes from a different dataframe. In order to allow this operation, enable 'compute.ops_on_diff_frames' option.

>>> set_option("compute.ops_on_diff_frames", True)
>>> ppser1 + ppser2
0    4
1    6
2    8
dtype: int64
```

Please also refer to the [API Reference](https://koalas.readthedocs.io/en/latest/reference/index.html) and [Options and Settings](https://koalas.readthedocs.io/en/latest/user_guide/options.html) for more detail.

**NOTE** that this PR intentionally ports the main codes of Koalas first almost as are with minimal changes because:
- Koalas project is fairly large. Making some changes together for PySpark will make it difficult to review the individual change.
    Koalas dev includes multiple Spark committers who will review. By doing this, the committers will be able to more easily and effectively review and drive the development.
- Koalas tests and documentation require major changes to make it look great together with PySpark whereas main codes do not require.
- We lately froze the Koalas codebase, and plan to work together on the initial porting. By porting the main codes first as are, it unblocks the Koalas dev to work on other items in parallel.

I promise and will make sure on:
- Rename Koalas to PySpark pandas APIs and/or pandas-on-Spark accordingly in documentation, and the docstrings and comments in the main codes.
- Triage APIs to remove that don’t make sense when Koalas is in PySpark

The documentation changes will be tracked in [SPARK-34885](https://issues.apache.org/jira/browse/SPARK-34885), the test code changes will be tracked in [SPARK-34886](https://issues.apache.org/jira/browse/SPARK-34886).

### Why are the changes needed?

Please refer to:
- [[DISCUSS] Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/DISCUSS-Support-pandas-API-layer-on-PySpark-td30945.html)
- [[VOTE] SPIP: Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/VOTE-SPIP-Support-pandas-API-layer-on-PySpark-td30996.html)

### Does this PR introduce _any_ user-facing change?

Yes, now users can use the pandas APIs on Spark

### How was this patch tested?

Manually tested for exposed major APIs and options as described above.

### Koalas contributors

Koalas would not have been possible without the following contributors:

ueshin
HyukjinKwon
rxin
xinrong-databricks
RainFung
charlesdong1991
harupy
floscha
beobest2
thunterdb
garawalid
LucasG0
shril
deepyaman
gioa
fwani
90jam
thoo
AbdealiJK
abishekganesh72
gliptak
DumbMachine
dvgodoy
stbof
nitlev
hjoo
gatorsmile
tomspur
icexelloss
awdavidson
guyao
akhilputhiry
scook12
patryk-oleniuk
tracek
dennyglee
athena15
gstaubli
WeichenXu123
hsubbaraj
lfdversluis
ktksq
shengjh
margaret-databricks
LSturtew
sllynn
manuzhang
jijosg
sadikovi

Closes #32036 from itholic/SPARK-34890.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-05 23:42:39 -04:00
+								        ...                            names=['number', 'color'])  # doctest: +SKIP
 								        MultiIndex([(0,  'green'),
 								                    (0, 'purple'),
 								                    (1,  'green'),
 								                    (1, 'purple'),
 								                    (2,  'green'),
 								                    (2, 'purple')],
 								                   names=['number', 'color'])
 								        """
 								        return cast(
 								            MultiIndex,
-												[SPARK-34983][PYTHON] Renaming the package alias from pp to ps

### What changes were proposed in this pull request?

This PR proposes to fix:

```python
import pyspark.pandas as pp
```

to

```python
import pyspark.pandas as ps
```

### Why are the changes needed?

`pp` might sound offensive in some contexts.

### Does this PR introduce _any_ user-facing change?

The change is in master only. We'll use `ps` as the short name instead of `pp`.

### How was this patch tested?

The CI in this PR will test it out.

Closes #32108 from LSturtew/renaming_pyspark.pandas.

Authored-by: Luka Sturtewagen <luka.sturtewagen@linkit.nl>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-11 22:18:08 -04:00
+								            ps.from_pandas(
-												[SPARK-34890][PYTHON] Port/integrate Koalas main codes into PySpark

### What changes were proposed in this pull request?

As a first step of [SPARK-34849](https://issues.apache.org/jira/browse/SPARK-34849), this PR proposes porting the Koalas main code into PySpark.

This PR contains minimal changes to the existing Koalas code as follows:
1. `databricks.koalas` -> `pyspark.pandas`
2. `from databricks import koalas as ks` -> `from pyspark import pandas as pp`
3. `ks.xxx -> pp.xxx`

Other than them:
1. Added a line to `python/mypy.ini` in order to ignore the mypy test. See related issue at [SPARK-34941](https://issues.apache.org/jira/browse/SPARK-34941).
2. Added a comment to several lines in several files to ignore the flake8 F401. See related issue at [SPARK-34943](https://issues.apache.org/jira/browse/SPARK-34943).

When this PR is merged, all the features that were previously used in [Koalas](https://github.com/databricks/koalas) will be available in PySpark as well.

Users can access to the pandas API in PySpark as below:

```python
>>> from pyspark import pandas as pp
>>> ppdf = pp.DataFrame({"A": [1, 2, 3], "B": [15, 20, 25]})
>>> ppdf
   A   B
0  1  15
1  2  20
2  3  25
```

The existing "options and settings" in Koalas are also available in the same way:

```python
>>> from pyspark.pandas.config import set_option, reset_option, get_option
>>> ppser1 = pp.Series([1, 2, 3])
>>> ppser2 = pp.Series([3, 4, 5])
>>> ppser1 + ppser2
Traceback (most recent call last):
...
ValueError: Cannot combine the series or dataframe because it comes from a different dataframe. In order to allow this operation, enable 'compute.ops_on_diff_frames' option.

>>> set_option("compute.ops_on_diff_frames", True)
>>> ppser1 + ppser2
0    4
1    6
2    8
dtype: int64
```

Please also refer to the [API Reference](https://koalas.readthedocs.io/en/latest/reference/index.html) and [Options and Settings](https://koalas.readthedocs.io/en/latest/user_guide/options.html) for more detail.

**NOTE** that this PR intentionally ports the main codes of Koalas first almost as are with minimal changes because:
- Koalas project is fairly large. Making some changes together for PySpark will make it difficult to review the individual change.
    Koalas dev includes multiple Spark committers who will review. By doing this, the committers will be able to more easily and effectively review and drive the development.
- Koalas tests and documentation require major changes to make it look great together with PySpark whereas main codes do not require.
- We lately froze the Koalas codebase, and plan to work together on the initial porting. By porting the main codes first as are, it unblocks the Koalas dev to work on other items in parallel.

I promise and will make sure on:
- Rename Koalas to PySpark pandas APIs and/or pandas-on-Spark accordingly in documentation, and the docstrings and comments in the main codes.
- Triage APIs to remove that don’t make sense when Koalas is in PySpark

The documentation changes will be tracked in [SPARK-34885](https://issues.apache.org/jira/browse/SPARK-34885), the test code changes will be tracked in [SPARK-34886](https://issues.apache.org/jira/browse/SPARK-34886).

### Why are the changes needed?

Please refer to:
- [[DISCUSS] Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/DISCUSS-Support-pandas-API-layer-on-PySpark-td30945.html)
- [[VOTE] SPIP: Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/VOTE-SPIP-Support-pandas-API-layer-on-PySpark-td30996.html)

### Does this PR introduce _any_ user-facing change?

Yes, now users can use the pandas APIs on Spark

### How was this patch tested?

Manually tested for exposed major APIs and options as described above.

### Koalas contributors

Koalas would not have been possible without the following contributors:

ueshin
HyukjinKwon
rxin
xinrong-databricks
RainFung
charlesdong1991
harupy
floscha
beobest2
thunterdb
garawalid
LucasG0
shril
deepyaman
gioa
fwani
90jam
thoo
AbdealiJK
abishekganesh72
gliptak
DumbMachine
dvgodoy
stbof
nitlev
hjoo
gatorsmile
tomspur
icexelloss
awdavidson
guyao
akhilputhiry
scook12
patryk-oleniuk
tracek
dennyglee
athena15
gstaubli
WeichenXu123
hsubbaraj
lfdversluis
ktksq
shengjh
margaret-databricks
LSturtew
sllynn
manuzhang
jijosg
sadikovi

Closes #32036 from itholic/SPARK-34890.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-05 23:42:39 -04:00
+								                pd.MultiIndex.from_product(iterables=iterables, sortorder=sortorder, names=names)
 								            ),
 								        )
 								    @staticmethod
 								    def from_frame(df, names=None) -> "MultiIndex":
 								        """
 								        Make a MultiIndex from a DataFrame.
 								        Parameters
 								        ----------
 								        df : DataFrame
 								            DataFrame to be converted to MultiIndex.
 								        names : list-like, optional
 								            If no names are provided, use the column names, or tuple of column
 								            names if the columns is a MultiIndex. If a sequence, overwrite
 								            names with the given sequence.
 								        Returns
 								        -------
 								        MultiIndex
 								            The MultiIndex representation of the given DataFrame.
 								        See Also
 								        --------
 								        MultiIndex.from_arrays : Convert list of arrays to MultiIndex.
 								        MultiIndex.from_tuples : Convert list of tuples to MultiIndex.
 								        MultiIndex.from_product : Make a MultiIndex from cartesian product
 								                                  of iterables.
 								        Examples
 								        --------
-												[SPARK-34983][PYTHON] Renaming the package alias from pp to ps

### What changes were proposed in this pull request?

This PR proposes to fix:

```python
import pyspark.pandas as pp
```

to

```python
import pyspark.pandas as ps
```

### Why are the changes needed?

`pp` might sound offensive in some contexts.

### Does this PR introduce _any_ user-facing change?

The change is in master only. We'll use `ps` as the short name instead of `pp`.

### How was this patch tested?

The CI in this PR will test it out.

Closes #32108 from LSturtew/renaming_pyspark.pandas.

Authored-by: Luka Sturtewagen <luka.sturtewagen@linkit.nl>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-11 22:18:08 -04:00
+								        >>> df = ps.DataFrame([['HI', 'Temp'], ['HI', 'Precip'],
-												[SPARK-34890][PYTHON] Port/integrate Koalas main codes into PySpark

### What changes were proposed in this pull request?

As a first step of [SPARK-34849](https://issues.apache.org/jira/browse/SPARK-34849), this PR proposes porting the Koalas main code into PySpark.

This PR contains minimal changes to the existing Koalas code as follows:
1. `databricks.koalas` -> `pyspark.pandas`
2. `from databricks import koalas as ks` -> `from pyspark import pandas as pp`
3. `ks.xxx -> pp.xxx`

Other than them:
1. Added a line to `python/mypy.ini` in order to ignore the mypy test. See related issue at [SPARK-34941](https://issues.apache.org/jira/browse/SPARK-34941).
2. Added a comment to several lines in several files to ignore the flake8 F401. See related issue at [SPARK-34943](https://issues.apache.org/jira/browse/SPARK-34943).

When this PR is merged, all the features that were previously used in [Koalas](https://github.com/databricks/koalas) will be available in PySpark as well.

Users can access to the pandas API in PySpark as below:

```python
>>> from pyspark import pandas as pp
>>> ppdf = pp.DataFrame({"A": [1, 2, 3], "B": [15, 20, 25]})
>>> ppdf
   A   B
0  1  15
1  2  20
2  3  25
```

The existing "options and settings" in Koalas are also available in the same way:

```python
>>> from pyspark.pandas.config import set_option, reset_option, get_option
>>> ppser1 = pp.Series([1, 2, 3])
>>> ppser2 = pp.Series([3, 4, 5])
>>> ppser1 + ppser2
Traceback (most recent call last):
...
ValueError: Cannot combine the series or dataframe because it comes from a different dataframe. In order to allow this operation, enable 'compute.ops_on_diff_frames' option.

>>> set_option("compute.ops_on_diff_frames", True)
>>> ppser1 + ppser2
0    4
1    6
2    8
dtype: int64
```

Please also refer to the [API Reference](https://koalas.readthedocs.io/en/latest/reference/index.html) and [Options and Settings](https://koalas.readthedocs.io/en/latest/user_guide/options.html) for more detail.

**NOTE** that this PR intentionally ports the main codes of Koalas first almost as are with minimal changes because:
- Koalas project is fairly large. Making some changes together for PySpark will make it difficult to review the individual change.
    Koalas dev includes multiple Spark committers who will review. By doing this, the committers will be able to more easily and effectively review and drive the development.
- Koalas tests and documentation require major changes to make it look great together with PySpark whereas main codes do not require.
- We lately froze the Koalas codebase, and plan to work together on the initial porting. By porting the main codes first as are, it unblocks the Koalas dev to work on other items in parallel.

I promise and will make sure on:
- Rename Koalas to PySpark pandas APIs and/or pandas-on-Spark accordingly in documentation, and the docstrings and comments in the main codes.
- Triage APIs to remove that don’t make sense when Koalas is in PySpark

The documentation changes will be tracked in [SPARK-34885](https://issues.apache.org/jira/browse/SPARK-34885), the test code changes will be tracked in [SPARK-34886](https://issues.apache.org/jira/browse/SPARK-34886).

### Why are the changes needed?

Please refer to:
- [[DISCUSS] Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/DISCUSS-Support-pandas-API-layer-on-PySpark-td30945.html)
- [[VOTE] SPIP: Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/VOTE-SPIP-Support-pandas-API-layer-on-PySpark-td30996.html)

### Does this PR introduce _any_ user-facing change?

Yes, now users can use the pandas APIs on Spark

### How was this patch tested?

Manually tested for exposed major APIs and options as described above.

### Koalas contributors

Koalas would not have been possible without the following contributors:

ueshin
HyukjinKwon
rxin
xinrong-databricks
RainFung
charlesdong1991
harupy
floscha
beobest2
thunterdb
garawalid
LucasG0
shril
deepyaman
gioa
fwani
90jam
thoo
AbdealiJK
abishekganesh72
gliptak
DumbMachine
dvgodoy
stbof
nitlev
hjoo
gatorsmile
tomspur
icexelloss
awdavidson
guyao
akhilputhiry
scook12
patryk-oleniuk
tracek
dennyglee
athena15
gstaubli
WeichenXu123
hsubbaraj
lfdversluis
ktksq
shengjh
margaret-databricks
LSturtew
sllynn
manuzhang
jijosg
sadikovi

Closes #32036 from itholic/SPARK-34890.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-05 23:42:39 -04:00
+								        ...                    ['NJ', 'Temp'], ['NJ', 'Precip']],
 								        ...                   columns=['a', 'b'])
 								        >>> df  # doctest: +SKIP
 								              a       b
 HI    Temp
 HI  Precip
 NJ    Temp
 NJ  Precip
-												[SPARK-34983][PYTHON] Renaming the package alias from pp to ps

### What changes were proposed in this pull request?

This PR proposes to fix:

```python
import pyspark.pandas as pp
```

to

```python
import pyspark.pandas as ps
```

### Why are the changes needed?

`pp` might sound offensive in some contexts.

### Does this PR introduce _any_ user-facing change?

The change is in master only. We'll use `ps` as the short name instead of `pp`.

### How was this patch tested?

The CI in this PR will test it out.

Closes #32108 from LSturtew/renaming_pyspark.pandas.

Authored-by: Luka Sturtewagen <luka.sturtewagen@linkit.nl>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-11 22:18:08 -04:00
+								        >>> ps.MultiIndex.from_frame(df)  # doctest: +SKIP
-												[SPARK-34890][PYTHON] Port/integrate Koalas main codes into PySpark

### What changes were proposed in this pull request?

As a first step of [SPARK-34849](https://issues.apache.org/jira/browse/SPARK-34849), this PR proposes porting the Koalas main code into PySpark.

This PR contains minimal changes to the existing Koalas code as follows:
1. `databricks.koalas` -> `pyspark.pandas`
2. `from databricks import koalas as ks` -> `from pyspark import pandas as pp`
3. `ks.xxx -> pp.xxx`

Other than them:
1. Added a line to `python/mypy.ini` in order to ignore the mypy test. See related issue at [SPARK-34941](https://issues.apache.org/jira/browse/SPARK-34941).
2. Added a comment to several lines in several files to ignore the flake8 F401. See related issue at [SPARK-34943](https://issues.apache.org/jira/browse/SPARK-34943).

When this PR is merged, all the features that were previously used in [Koalas](https://github.com/databricks/koalas) will be available in PySpark as well.

Users can access to the pandas API in PySpark as below:

```python
>>> from pyspark import pandas as pp
>>> ppdf = pp.DataFrame({"A": [1, 2, 3], "B": [15, 20, 25]})
>>> ppdf
   A   B
0  1  15
1  2  20
2  3  25
```

The existing "options and settings" in Koalas are also available in the same way:

```python
>>> from pyspark.pandas.config import set_option, reset_option, get_option
>>> ppser1 = pp.Series([1, 2, 3])
>>> ppser2 = pp.Series([3, 4, 5])
>>> ppser1 + ppser2
Traceback (most recent call last):
...
ValueError: Cannot combine the series or dataframe because it comes from a different dataframe. In order to allow this operation, enable 'compute.ops_on_diff_frames' option.

>>> set_option("compute.ops_on_diff_frames", True)
>>> ppser1 + ppser2
0    4
1    6
2    8
dtype: int64
```

Please also refer to the [API Reference](https://koalas.readthedocs.io/en/latest/reference/index.html) and [Options and Settings](https://koalas.readthedocs.io/en/latest/user_guide/options.html) for more detail.

**NOTE** that this PR intentionally ports the main codes of Koalas first almost as are with minimal changes because:
- Koalas project is fairly large. Making some changes together for PySpark will make it difficult to review the individual change.
    Koalas dev includes multiple Spark committers who will review. By doing this, the committers will be able to more easily and effectively review and drive the development.
- Koalas tests and documentation require major changes to make it look great together with PySpark whereas main codes do not require.
- We lately froze the Koalas codebase, and plan to work together on the initial porting. By porting the main codes first as are, it unblocks the Koalas dev to work on other items in parallel.

I promise and will make sure on:
- Rename Koalas to PySpark pandas APIs and/or pandas-on-Spark accordingly in documentation, and the docstrings and comments in the main codes.
- Triage APIs to remove that don’t make sense when Koalas is in PySpark

The documentation changes will be tracked in [SPARK-34885](https://issues.apache.org/jira/browse/SPARK-34885), the test code changes will be tracked in [SPARK-34886](https://issues.apache.org/jira/browse/SPARK-34886).

### Why are the changes needed?

Please refer to:
- [[DISCUSS] Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/DISCUSS-Support-pandas-API-layer-on-PySpark-td30945.html)
- [[VOTE] SPIP: Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/VOTE-SPIP-Support-pandas-API-layer-on-PySpark-td30996.html)

### Does this PR introduce _any_ user-facing change?

Yes, now users can use the pandas APIs on Spark

### How was this patch tested?

Manually tested for exposed major APIs and options as described above.

### Koalas contributors

Koalas would not have been possible without the following contributors:

ueshin
HyukjinKwon
rxin
xinrong-databricks
RainFung
charlesdong1991
harupy
floscha
beobest2
thunterdb
garawalid
LucasG0
shril
deepyaman
gioa
fwani
90jam
thoo
AbdealiJK
abishekganesh72
gliptak
DumbMachine
dvgodoy
stbof
nitlev
hjoo
gatorsmile
tomspur
icexelloss
awdavidson
guyao
akhilputhiry
scook12
patryk-oleniuk
tracek
dennyglee
athena15
gstaubli
WeichenXu123
hsubbaraj
lfdversluis
ktksq
shengjh
margaret-databricks
LSturtew
sllynn
manuzhang
jijosg
sadikovi

Closes #32036 from itholic/SPARK-34890.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-05 23:42:39 -04:00
+								        MultiIndex([('HI',   'Temp'),
 								                    ('HI', 'Precip'),
 								                    ('NJ',   'Temp'),
 								                    ('NJ', 'Precip')],
 								                   names=['a', 'b'])
 								        Using explicit names, instead of the column names
-												[SPARK-34983][PYTHON] Renaming the package alias from pp to ps

### What changes were proposed in this pull request?

This PR proposes to fix:

```python
import pyspark.pandas as pp
```

to

```python
import pyspark.pandas as ps
```

### Why are the changes needed?

`pp` might sound offensive in some contexts.

### Does this PR introduce _any_ user-facing change?

The change is in master only. We'll use `ps` as the short name instead of `pp`.

### How was this patch tested?

The CI in this PR will test it out.

Closes #32108 from LSturtew/renaming_pyspark.pandas.

Authored-by: Luka Sturtewagen <luka.sturtewagen@linkit.nl>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-11 22:18:08 -04:00
+								        >>> ps.MultiIndex.from_frame(df, names=['state', 'observation'])  # doctest: +SKIP
-												[SPARK-34890][PYTHON] Port/integrate Koalas main codes into PySpark

### What changes were proposed in this pull request?

As a first step of [SPARK-34849](https://issues.apache.org/jira/browse/SPARK-34849), this PR proposes porting the Koalas main code into PySpark.

This PR contains minimal changes to the existing Koalas code as follows:
1. `databricks.koalas` -> `pyspark.pandas`
2. `from databricks import koalas as ks` -> `from pyspark import pandas as pp`
3. `ks.xxx -> pp.xxx`

Other than them:
1. Added a line to `python/mypy.ini` in order to ignore the mypy test. See related issue at [SPARK-34941](https://issues.apache.org/jira/browse/SPARK-34941).
2. Added a comment to several lines in several files to ignore the flake8 F401. See related issue at [SPARK-34943](https://issues.apache.org/jira/browse/SPARK-34943).

When this PR is merged, all the features that were previously used in [Koalas](https://github.com/databricks/koalas) will be available in PySpark as well.

Users can access to the pandas API in PySpark as below:

```python
>>> from pyspark import pandas as pp
>>> ppdf = pp.DataFrame({"A": [1, 2, 3], "B": [15, 20, 25]})
>>> ppdf
   A   B
0  1  15
1  2  20
2  3  25
```

The existing "options and settings" in Koalas are also available in the same way:

```python
>>> from pyspark.pandas.config import set_option, reset_option, get_option
>>> ppser1 = pp.Series([1, 2, 3])
>>> ppser2 = pp.Series([3, 4, 5])
>>> ppser1 + ppser2
Traceback (most recent call last):
...
ValueError: Cannot combine the series or dataframe because it comes from a different dataframe. In order to allow this operation, enable 'compute.ops_on_diff_frames' option.

>>> set_option("compute.ops_on_diff_frames", True)
>>> ppser1 + ppser2
0    4
1    6
2    8
dtype: int64
```

Please also refer to the [API Reference](https://koalas.readthedocs.io/en/latest/reference/index.html) and [Options and Settings](https://koalas.readthedocs.io/en/latest/user_guide/options.html) for more detail.

**NOTE** that this PR intentionally ports the main codes of Koalas first almost as are with minimal changes because:
- Koalas project is fairly large. Making some changes together for PySpark will make it difficult to review the individual change.
    Koalas dev includes multiple Spark committers who will review. By doing this, the committers will be able to more easily and effectively review and drive the development.
- Koalas tests and documentation require major changes to make it look great together with PySpark whereas main codes do not require.
- We lately froze the Koalas codebase, and plan to work together on the initial porting. By porting the main codes first as are, it unblocks the Koalas dev to work on other items in parallel.

I promise and will make sure on:
- Rename Koalas to PySpark pandas APIs and/or pandas-on-Spark accordingly in documentation, and the docstrings and comments in the main codes.
- Triage APIs to remove that don’t make sense when Koalas is in PySpark

The documentation changes will be tracked in [SPARK-34885](https://issues.apache.org/jira/browse/SPARK-34885), the test code changes will be tracked in [SPARK-34886](https://issues.apache.org/jira/browse/SPARK-34886).

### Why are the changes needed?

Please refer to:
- [[DISCUSS] Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/DISCUSS-Support-pandas-API-layer-on-PySpark-td30945.html)
- [[VOTE] SPIP: Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/VOTE-SPIP-Support-pandas-API-layer-on-PySpark-td30996.html)

### Does this PR introduce _any_ user-facing change?

Yes, now users can use the pandas APIs on Spark

### How was this patch tested?

Manually tested for exposed major APIs and options as described above.

### Koalas contributors

Koalas would not have been possible without the following contributors:

ueshin
HyukjinKwon
rxin
xinrong-databricks
RainFung
charlesdong1991
harupy
floscha
beobest2
thunterdb
garawalid
LucasG0
shril
deepyaman
gioa
fwani
90jam
thoo
AbdealiJK
abishekganesh72
gliptak
DumbMachine
dvgodoy
stbof
nitlev
hjoo
gatorsmile
tomspur
icexelloss
awdavidson
guyao
akhilputhiry
scook12
patryk-oleniuk
tracek
dennyglee
athena15
gstaubli
WeichenXu123
hsubbaraj
lfdversluis
ktksq
shengjh
margaret-databricks
LSturtew
sllynn
manuzhang
jijosg
sadikovi

Closes #32036 from itholic/SPARK-34890.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-05 23:42:39 -04:00
+								        MultiIndex([('HI',   'Temp'),
 								                    ('HI', 'Precip'),
 								                    ('NJ',   'Temp'),
 								                    ('NJ', 'Precip')],
 								                   names=['state', 'observation'])
 								        """
 								        if not isinstance(df, DataFrame):
 								            raise TypeError("Input must be a DataFrame")
 								        sdf = df.to_spark()
 								        if names is None:
 								            names = df._internal.column_labels
 								        elif not is_list_like(names):
 								            raise ValueError("Names should be list-like for a MultiIndex")
 								        else:
 								            names = [name if is_name_like_tuple(name) else (name,) for name in names]
 								        internal = InternalFrame(
 								            spark_frame=sdf,
 								            index_spark_columns=[scol_for(sdf, col) for col in sdf.columns],
 								            index_names=names,
 								        )
 								        return cast(MultiIndex, DataFrame(internal).index)
 								    @property
 								    def name(self) -> str:
 								        raise PandasNotImplementedError(class_name="pd.MultiIndex", property_name="name")
 								    @name.setter
 								    def name(self, name: str) -> None:
 								        raise PandasNotImplementedError(class_name="pd.MultiIndex", property_name="name")
 								    def _verify_for_rename(self, name):
 								        if is_list_like(name):
 								            if self._internal.index_level != len(name):
 								                raise ValueError(
 								                    "Length of new names must be {}, got {}".format(
 								                        self._internal.index_level, len(name)
 								                    )
 								                )
 								            if any(not is_hashable(n) for n in name):
 								                raise TypeError("MultiIndex.name must be a hashable type")
 								            return [n if is_name_like_tuple(n) else (n,) for n in name]
 								        else:
 								            raise TypeError("Must pass list-like as `names`.")
 								    def swaplevel(self, i=-2, j=-1) -> "MultiIndex":
 								        """
 								        Swap level i with level j.
 								        Calling this method does not change the ordering of the values.
 								        Parameters
 								        ----------
 								        i : int, str, default -2
 								            First level of index to be swapped. Can pass level name as string.
 								            Type of parameters can be mixed.
 								        j : int, str, default -1
 								            Second level of index to be swapped. Can pass level name as string.
 								            Type of parameters can be mixed.
 								        Returns
 								        -------
 								        MultiIndex
 								            A new MultiIndex.
 								        Examples
 								        --------
-												[SPARK-34983][PYTHON] Renaming the package alias from pp to ps

### What changes were proposed in this pull request?

This PR proposes to fix:

```python
import pyspark.pandas as pp
```

to

```python
import pyspark.pandas as ps
```

### Why are the changes needed?

`pp` might sound offensive in some contexts.

### Does this PR introduce _any_ user-facing change?

The change is in master only. We'll use `ps` as the short name instead of `pp`.

### How was this patch tested?

The CI in this PR will test it out.

Closes #32108 from LSturtew/renaming_pyspark.pandas.

Authored-by: Luka Sturtewagen <luka.sturtewagen@linkit.nl>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-11 22:18:08 -04:00
+								        >>> midx = ps.MultiIndex.from_arrays([['a', 'b'], [1, 2]], names = ['word', 'number'])
-												[SPARK-34890][PYTHON] Port/integrate Koalas main codes into PySpark

### What changes were proposed in this pull request?

As a first step of [SPARK-34849](https://issues.apache.org/jira/browse/SPARK-34849), this PR proposes porting the Koalas main code into PySpark.

This PR contains minimal changes to the existing Koalas code as follows:
1. `databricks.koalas` -> `pyspark.pandas`
2. `from databricks import koalas as ks` -> `from pyspark import pandas as pp`
3. `ks.xxx -> pp.xxx`

Other than them:
1. Added a line to `python/mypy.ini` in order to ignore the mypy test. See related issue at [SPARK-34941](https://issues.apache.org/jira/browse/SPARK-34941).
2. Added a comment to several lines in several files to ignore the flake8 F401. See related issue at [SPARK-34943](https://issues.apache.org/jira/browse/SPARK-34943).

When this PR is merged, all the features that were previously used in [Koalas](https://github.com/databricks/koalas) will be available in PySpark as well.

Users can access to the pandas API in PySpark as below:

```python
>>> from pyspark import pandas as pp
>>> ppdf = pp.DataFrame({"A": [1, 2, 3], "B": [15, 20, 25]})
>>> ppdf
   A   B
0  1  15
1  2  20
2  3  25
```

The existing "options and settings" in Koalas are also available in the same way:

```python
>>> from pyspark.pandas.config import set_option, reset_option, get_option
>>> ppser1 = pp.Series([1, 2, 3])
>>> ppser2 = pp.Series([3, 4, 5])
>>> ppser1 + ppser2
Traceback (most recent call last):
...
ValueError: Cannot combine the series or dataframe because it comes from a different dataframe. In order to allow this operation, enable 'compute.ops_on_diff_frames' option.

>>> set_option("compute.ops_on_diff_frames", True)
>>> ppser1 + ppser2
0    4
1    6
2    8
dtype: int64
```

Please also refer to the [API Reference](https://koalas.readthedocs.io/en/latest/reference/index.html) and [Options and Settings](https://koalas.readthedocs.io/en/latest/user_guide/options.html) for more detail.

**NOTE** that this PR intentionally ports the main codes of Koalas first almost as are with minimal changes because:
- Koalas project is fairly large. Making some changes together for PySpark will make it difficult to review the individual change.
    Koalas dev includes multiple Spark committers who will review. By doing this, the committers will be able to more easily and effectively review and drive the development.
- Koalas tests and documentation require major changes to make it look great together with PySpark whereas main codes do not require.
- We lately froze the Koalas codebase, and plan to work together on the initial porting. By porting the main codes first as are, it unblocks the Koalas dev to work on other items in parallel.

I promise and will make sure on:
- Rename Koalas to PySpark pandas APIs and/or pandas-on-Spark accordingly in documentation, and the docstrings and comments in the main codes.
- Triage APIs to remove that don’t make sense when Koalas is in PySpark

The documentation changes will be tracked in [SPARK-34885](https://issues.apache.org/jira/browse/SPARK-34885), the test code changes will be tracked in [SPARK-34886](https://issues.apache.org/jira/browse/SPARK-34886).

### Why are the changes needed?

Please refer to:
- [[DISCUSS] Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/DISCUSS-Support-pandas-API-layer-on-PySpark-td30945.html)
- [[VOTE] SPIP: Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/VOTE-SPIP-Support-pandas-API-layer-on-PySpark-td30996.html)

### Does this PR introduce _any_ user-facing change?

Yes, now users can use the pandas APIs on Spark

### How was this patch tested?

Manually tested for exposed major APIs and options as described above.

### Koalas contributors

Koalas would not have been possible without the following contributors:

ueshin
HyukjinKwon
rxin
xinrong-databricks
RainFung
charlesdong1991
harupy
floscha
beobest2
thunterdb
garawalid
LucasG0
shril
deepyaman
gioa
fwani
90jam
thoo
AbdealiJK
abishekganesh72
gliptak
DumbMachine
dvgodoy
stbof
nitlev
hjoo
gatorsmile
tomspur
icexelloss
awdavidson
guyao
akhilputhiry
scook12
patryk-oleniuk
tracek
dennyglee
athena15
gstaubli
WeichenXu123
hsubbaraj
lfdversluis
ktksq
shengjh
margaret-databricks
LSturtew
sllynn
manuzhang
jijosg
sadikovi

Closes #32036 from itholic/SPARK-34890.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-05 23:42:39 -04:00
+								        >>> midx  # doctest: +SKIP
 								        MultiIndex([('a', 1),
 								                    ('b', 2)],
 								                   names=['word', 'number'])
 								        >>> midx.swaplevel(0, 1)  # doctest: +SKIP
 								        MultiIndex([(1, 'a'),
 								                    (2, 'b')],
 								                   names=['number', 'word'])
 								        >>> midx.swaplevel('number', 'word')  # doctest: +SKIP
 								        MultiIndex([(1, 'a'),
 								                    (2, 'b')],
 								                   names=['number', 'word'])
 								        """
 								        for index in (i, j):
 								            if not isinstance(index, int) and index not in self.names:
 								                raise KeyError("Level %s not found" % index)
 								        i = i if isinstance(i, int) else self.names.index(i)
 								        j = j if isinstance(j, int) else self.names.index(j)
 								        for index in (i, j):
 								            if index >= len(self.names) or index < -len(self.names):
 								                raise IndexError(
 								                    "Too many levels: Index has only %s levels, "
 								                    "%s is not a valid level number" % (len(self.names), index)
 								                )
 								        index_map = list(
 								            zip(
 								                self._internal.index_spark_columns,
 								                self._internal.index_names,
 								                self._internal.index_dtypes,
 								            )
 								        )
 								        index_map[i], index_map[j], = index_map[j], index_map[i]
 								        index_spark_columns, index_names, index_dtypes = zip(*index_map)
 								        internal = self._internal.copy(
 								            index_spark_columns=list(index_spark_columns),
 								            index_names=list(index_names),
 								            index_dtypes=list(index_dtypes),
 								            column_labels=[],
 								            data_spark_columns=[],
 								            data_dtypes=[],
 								        )
 								        return cast(MultiIndex, DataFrame(internal).index)
 								    @property
 								    def levshape(self) -> Tuple[int, ...]:
 								        """
 								        A tuple with the length of each level.
 								        Examples
 								        --------
-												[SPARK-34983][PYTHON] Renaming the package alias from pp to ps

### What changes were proposed in this pull request?

This PR proposes to fix:

```python
import pyspark.pandas as pp
```

to

```python
import pyspark.pandas as ps
```

### Why are the changes needed?

`pp` might sound offensive in some contexts.

### Does this PR introduce _any_ user-facing change?

The change is in master only. We'll use `ps` as the short name instead of `pp`.

### How was this patch tested?

The CI in this PR will test it out.

Closes #32108 from LSturtew/renaming_pyspark.pandas.

Authored-by: Luka Sturtewagen <luka.sturtewagen@linkit.nl>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-11 22:18:08 -04:00
+								        >>> midx = ps.MultiIndex.from_tuples([('a', 'x'), ('b', 'y'), ('c', 'z')])
-												[SPARK-34890][PYTHON] Port/integrate Koalas main codes into PySpark

### What changes were proposed in this pull request?

As a first step of [SPARK-34849](https://issues.apache.org/jira/browse/SPARK-34849), this PR proposes porting the Koalas main code into PySpark.

This PR contains minimal changes to the existing Koalas code as follows:
1. `databricks.koalas` -> `pyspark.pandas`
2. `from databricks import koalas as ks` -> `from pyspark import pandas as pp`
3. `ks.xxx -> pp.xxx`

Other than them:
1. Added a line to `python/mypy.ini` in order to ignore the mypy test. See related issue at [SPARK-34941](https://issues.apache.org/jira/browse/SPARK-34941).
2. Added a comment to several lines in several files to ignore the flake8 F401. See related issue at [SPARK-34943](https://issues.apache.org/jira/browse/SPARK-34943).

When this PR is merged, all the features that were previously used in [Koalas](https://github.com/databricks/koalas) will be available in PySpark as well.

Users can access to the pandas API in PySpark as below:

```python
>>> from pyspark import pandas as pp
>>> ppdf = pp.DataFrame({"A": [1, 2, 3], "B": [15, 20, 25]})
>>> ppdf
   A   B
0  1  15
1  2  20
2  3  25
```

The existing "options and settings" in Koalas are also available in the same way:

```python
>>> from pyspark.pandas.config import set_option, reset_option, get_option
>>> ppser1 = pp.Series([1, 2, 3])
>>> ppser2 = pp.Series([3, 4, 5])
>>> ppser1 + ppser2
Traceback (most recent call last):
...
ValueError: Cannot combine the series or dataframe because it comes from a different dataframe. In order to allow this operation, enable 'compute.ops_on_diff_frames' option.

>>> set_option("compute.ops_on_diff_frames", True)
>>> ppser1 + ppser2
0    4
1    6
2    8
dtype: int64
```

Please also refer to the [API Reference](https://koalas.readthedocs.io/en/latest/reference/index.html) and [Options and Settings](https://koalas.readthedocs.io/en/latest/user_guide/options.html) for more detail.

**NOTE** that this PR intentionally ports the main codes of Koalas first almost as are with minimal changes because:
- Koalas project is fairly large. Making some changes together for PySpark will make it difficult to review the individual change.
    Koalas dev includes multiple Spark committers who will review. By doing this, the committers will be able to more easily and effectively review and drive the development.
- Koalas tests and documentation require major changes to make it look great together with PySpark whereas main codes do not require.
- We lately froze the Koalas codebase, and plan to work together on the initial porting. By porting the main codes first as are, it unblocks the Koalas dev to work on other items in parallel.

I promise and will make sure on:
- Rename Koalas to PySpark pandas APIs and/or pandas-on-Spark accordingly in documentation, and the docstrings and comments in the main codes.
- Triage APIs to remove that don’t make sense when Koalas is in PySpark

The documentation changes will be tracked in [SPARK-34885](https://issues.apache.org/jira/browse/SPARK-34885), the test code changes will be tracked in [SPARK-34886](https://issues.apache.org/jira/browse/SPARK-34886).

### Why are the changes needed?

Please refer to:
- [[DISCUSS] Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/DISCUSS-Support-pandas-API-layer-on-PySpark-td30945.html)
- [[VOTE] SPIP: Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/VOTE-SPIP-Support-pandas-API-layer-on-PySpark-td30996.html)

### Does this PR introduce _any_ user-facing change?

Yes, now users can use the pandas APIs on Spark

### How was this patch tested?

Manually tested for exposed major APIs and options as described above.

### Koalas contributors

Koalas would not have been possible without the following contributors:

ueshin
HyukjinKwon
rxin
xinrong-databricks
RainFung
charlesdong1991
harupy
floscha
beobest2
thunterdb
garawalid
LucasG0
shril
deepyaman
gioa
fwani
90jam
thoo
AbdealiJK
abishekganesh72
gliptak
DumbMachine
dvgodoy
stbof
nitlev
hjoo
gatorsmile
tomspur
icexelloss
awdavidson
guyao
akhilputhiry
scook12
patryk-oleniuk
tracek
dennyglee
athena15
gstaubli
WeichenXu123
hsubbaraj
lfdversluis
ktksq
shengjh
margaret-databricks
LSturtew
sllynn
manuzhang
jijosg
sadikovi

Closes #32036 from itholic/SPARK-34890.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-05 23:42:39 -04:00
+								        >>> midx  # doctest: +SKIP
 								        MultiIndex([('a', 'x'),
 								                    ('b', 'y'),
 								                    ('c', 'z')],
 								                   )
 								        >>> midx.levshape
 								        (3, 3)
 								        """
 								        result = self._internal.spark_frame.agg(
 								            *(F.countDistinct(c) for c in self._internal.index_spark_columns)
 								        ).collect()[0]
 								        return tuple(result)
 								    @staticmethod
 								    def _comparator_for_monotonic_increasing(data_type):
 								        return compare_disallow_null
 								    def _is_monotonic(self, order):
 								        if order == "increasing":
 								            return self._is_monotonic_increasing().all()
 								        else:
 								            return self._is_monotonic_decreasing().all()
 								    def _is_monotonic_increasing(self):
 								        window = Window.orderBy(NATURAL_ORDER_COLUMN_NAME).rowsBetween(-1, -1)
 								        cond = F.lit(True)
 								        has_not_null = F.lit(True)
 								        for scol in self._internal.index_spark_columns[::-1]:
 								            data_type = self._internal.spark_type_for(scol)
 								            prev = F.lag(scol, 1).over(window)
 								            compare = MultiIndex._comparator_for_monotonic_increasing(data_type)
 								            # Since pandas 1.1.4, null value is not allowed at any levels of MultiIndex.
 								            # Therefore, we should check `has_not_null` over the all levels.
 								            has_not_null = has_not_null & scol.isNotNull()
 								            cond = F.when(scol.eqNullSafe(prev), cond).otherwise(
 								                compare(scol, prev, spark.Column.__gt__)
 								            )
 								        cond = has_not_null & (prev.isNull() | cond)
 								        cond_name = verify_temp_column_name(
 								            self._internal.spark_frame.select(self._internal.index_spark_columns),
 								            "__is_monotonic_increasing_cond__",
 								        )
 								        sdf = self._internal.spark_frame.select(
 								            self._internal.index_spark_columns + [cond.alias(cond_name)]
 								        )
 								        internal = InternalFrame(
 								            spark_frame=sdf,
 								            index_spark_columns=[
 								                scol_for(sdf, col) for col in self._internal.index_spark_column_names
 								            ],
 								            index_names=self._internal.index_names,
 								            index_dtypes=self._internal.index_dtypes,
 								        )
 								        return first_series(DataFrame(internal))
 								    @staticmethod
 								    def _comparator_for_monotonic_decreasing(data_type):
 								        return compare_disallow_null
 								    def _is_monotonic_decreasing(self):
 								        window = Window.orderBy(NATURAL_ORDER_COLUMN_NAME).rowsBetween(-1, -1)
 								        cond = F.lit(True)
 								        has_not_null = F.lit(True)
 								        for scol in self._internal.index_spark_columns[::-1]:
 								            data_type = self._internal.spark_type_for(scol)
 								            prev = F.lag(scol, 1).over(window)
 								            compare = MultiIndex._comparator_for_monotonic_increasing(data_type)
 								            # Since pandas 1.1.4, null value is not allowed at any levels of MultiIndex.
 								            # Therefore, we should check `has_not_null` over the all levels.
 								            has_not_null = has_not_null & scol.isNotNull()
 								            cond = F.when(scol.eqNullSafe(prev), cond).otherwise(
 								                compare(scol, prev, spark.Column.__lt__)
 								            )
 								        cond = has_not_null & (prev.isNull() | cond)
 								        cond_name = verify_temp_column_name(
 								            self._internal.spark_frame.select(self._internal.index_spark_columns),
 								            "__is_monotonic_decreasing_cond__",
 								        )
 								        sdf = self._internal.spark_frame.select(
 								            self._internal.index_spark_columns + [cond.alias(cond_name)]
 								        )
 								        internal = InternalFrame(
 								            spark_frame=sdf,
 								            index_spark_columns=[
 								                scol_for(sdf, col) for col in self._internal.index_spark_column_names
 								            ],
 								            index_names=self._internal.index_names,
 								            index_dtypes=self._internal.index_dtypes,
 								        )
 								        return first_series(DataFrame(internal))
 								    def to_frame(self, index=True, name=None) -> DataFrame:
 								        """
 								        Create a DataFrame with the levels of the MultiIndex as columns.
 								        Column ordering is determined by the DataFrame constructor with data as
 								        a dict.
 								        Parameters
 								        ----------
 								        index : boolean, default True
 								            Set the index of the returned DataFrame as the original MultiIndex.
 								        name : list / sequence of strings, optional
 								            The passed names should substitute index level names.
 								        Returns
 								        -------
 								        DataFrame : a DataFrame containing the original MultiIndex data.
 								        See Also
 								        --------
 								        DataFrame
 								        Examples
 								        --------
 								        >>> tuples = [(1, 'red'), (1, 'blue'),
 								        ...           (2, 'red'), (2, 'blue')]
-												[SPARK-34983][PYTHON] Renaming the package alias from pp to ps

### What changes were proposed in this pull request?

This PR proposes to fix:

```python
import pyspark.pandas as pp
```

to

```python
import pyspark.pandas as ps
```

### Why are the changes needed?

`pp` might sound offensive in some contexts.

### Does this PR introduce _any_ user-facing change?

The change is in master only. We'll use `ps` as the short name instead of `pp`.

### How was this patch tested?

The CI in this PR will test it out.

Closes #32108 from LSturtew/renaming_pyspark.pandas.

Authored-by: Luka Sturtewagen <luka.sturtewagen@linkit.nl>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-11 22:18:08 -04:00
+								        >>> idx = ps.MultiIndex.from_tuples(tuples, names=('number', 'color'))
-												[SPARK-34890][PYTHON] Port/integrate Koalas main codes into PySpark

### What changes were proposed in this pull request?

As a first step of [SPARK-34849](https://issues.apache.org/jira/browse/SPARK-34849), this PR proposes porting the Koalas main code into PySpark.

This PR contains minimal changes to the existing Koalas code as follows:
1. `databricks.koalas` -> `pyspark.pandas`
2. `from databricks import koalas as ks` -> `from pyspark import pandas as pp`
3. `ks.xxx -> pp.xxx`

Other than them:
1. Added a line to `python/mypy.ini` in order to ignore the mypy test. See related issue at [SPARK-34941](https://issues.apache.org/jira/browse/SPARK-34941).
2. Added a comment to several lines in several files to ignore the flake8 F401. See related issue at [SPARK-34943](https://issues.apache.org/jira/browse/SPARK-34943).

When this PR is merged, all the features that were previously used in [Koalas](https://github.com/databricks/koalas) will be available in PySpark as well.

Users can access to the pandas API in PySpark as below:

```python
>>> from pyspark import pandas as pp
>>> ppdf = pp.DataFrame({"A": [1, 2, 3], "B": [15, 20, 25]})
>>> ppdf
   A   B
0  1  15
1  2  20
2  3  25
```

The existing "options and settings" in Koalas are also available in the same way:

```python
>>> from pyspark.pandas.config import set_option, reset_option, get_option
>>> ppser1 = pp.Series([1, 2, 3])
>>> ppser2 = pp.Series([3, 4, 5])
>>> ppser1 + ppser2
Traceback (most recent call last):
...
ValueError: Cannot combine the series or dataframe because it comes from a different dataframe. In order to allow this operation, enable 'compute.ops_on_diff_frames' option.

>>> set_option("compute.ops_on_diff_frames", True)
>>> ppser1 + ppser2
0    4
1    6
2    8
dtype: int64
```

Please also refer to the [API Reference](https://koalas.readthedocs.io/en/latest/reference/index.html) and [Options and Settings](https://koalas.readthedocs.io/en/latest/user_guide/options.html) for more detail.

**NOTE** that this PR intentionally ports the main codes of Koalas first almost as are with minimal changes because:
- Koalas project is fairly large. Making some changes together for PySpark will make it difficult to review the individual change.
    Koalas dev includes multiple Spark committers who will review. By doing this, the committers will be able to more easily and effectively review and drive the development.
- Koalas tests and documentation require major changes to make it look great together with PySpark whereas main codes do not require.
- We lately froze the Koalas codebase, and plan to work together on the initial porting. By porting the main codes first as are, it unblocks the Koalas dev to work on other items in parallel.

I promise and will make sure on:
- Rename Koalas to PySpark pandas APIs and/or pandas-on-Spark accordingly in documentation, and the docstrings and comments in the main codes.
- Triage APIs to remove that don’t make sense when Koalas is in PySpark

The documentation changes will be tracked in [SPARK-34885](https://issues.apache.org/jira/browse/SPARK-34885), the test code changes will be tracked in [SPARK-34886](https://issues.apache.org/jira/browse/SPARK-34886).

### Why are the changes needed?

Please refer to:
- [[DISCUSS] Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/DISCUSS-Support-pandas-API-layer-on-PySpark-td30945.html)
- [[VOTE] SPIP: Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/VOTE-SPIP-Support-pandas-API-layer-on-PySpark-td30996.html)

### Does this PR introduce _any_ user-facing change?

Yes, now users can use the pandas APIs on Spark

### How was this patch tested?

Manually tested for exposed major APIs and options as described above.

### Koalas contributors

Koalas would not have been possible without the following contributors:

ueshin
HyukjinKwon
rxin
xinrong-databricks
RainFung
charlesdong1991
harupy
floscha
beobest2
thunterdb
garawalid
LucasG0
shril
deepyaman
gioa
fwani
90jam
thoo
AbdealiJK
abishekganesh72
gliptak
DumbMachine
dvgodoy
stbof
nitlev
hjoo
gatorsmile
tomspur
icexelloss
awdavidson
guyao
akhilputhiry
scook12
patryk-oleniuk
tracek
dennyglee
athena15
gstaubli
WeichenXu123
hsubbaraj
lfdversluis
ktksq
shengjh
margaret-databricks
LSturtew
sllynn
manuzhang
jijosg
sadikovi

Closes #32036 from itholic/SPARK-34890.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-05 23:42:39 -04:00
+								        >>> idx  # doctest: +SKIP
 								        MultiIndex([(1,  'red'),
 								                    (1, 'blue'),
 								                    (2,  'red'),
 								                    (2, 'blue')],
 								                   names=['number', 'color'])
 								        >>> idx.to_frame()  # doctest: +NORMALIZE_WHITESPACE
 								                      number color
 								        number color
 red         1   red
 								               blue        1  blue
 red         2   red
 								               blue        2  blue
 								        By default, the original Index is reused. To enforce a new Index:
 								        >>> idx.to_frame(index=False)
 								           number color
 1   red
 1  blue
 2   red
 2  blue
 								        To override the name of the resulting column, specify `name`:
 								        >>> idx.to_frame(name=['n', 'c'])  # doctest: +NORMALIZE_WHITESPACE
 								                      n     c
 								        number color
 red    1   red
 								               blue   1  blue
 red    2   red
 								               blue   2  blue
 								        """
 								        if name is None:
 								            name = [
 								                name if name is not None else (i,)
 								                for i, name in enumerate(self._internal.index_names)
 								            ]
 								        elif is_list_like(name):
 								            if len(name) != self._internal.index_level:
 								                raise ValueError("'name' should have same length as number of levels on index.")
 								            name = [n if is_name_like_tuple(n) else (n,) for n in name]
 								        else:
 								            raise TypeError("'name' must be a list / sequence of column names.")
 								        return self._to_frame(index=index, names=name)
 								    def to_pandas(self) -> pd.MultiIndex:
 								        """
 								        Return a pandas MultiIndex.
 								        .. note:: This method should only be used if the resulting pandas object is expected
 								                  to be small, as all the data is loaded into the driver's memory.
 								        Examples
 								        --------
-												[SPARK-34983][PYTHON] Renaming the package alias from pp to ps

### What changes were proposed in this pull request?

This PR proposes to fix:

```python
import pyspark.pandas as pp
```

to

```python
import pyspark.pandas as ps
```

### Why are the changes needed?

`pp` might sound offensive in some contexts.

### Does this PR introduce _any_ user-facing change?

The change is in master only. We'll use `ps` as the short name instead of `pp`.

### How was this patch tested?

The CI in this PR will test it out.

Closes #32108 from LSturtew/renaming_pyspark.pandas.

Authored-by: Luka Sturtewagen <luka.sturtewagen@linkit.nl>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-11 22:18:08 -04:00
+								        >>> df = ps.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)],
-												[SPARK-34890][PYTHON] Port/integrate Koalas main codes into PySpark

### What changes were proposed in this pull request?

As a first step of [SPARK-34849](https://issues.apache.org/jira/browse/SPARK-34849), this PR proposes porting the Koalas main code into PySpark.

This PR contains minimal changes to the existing Koalas code as follows:
1. `databricks.koalas` -> `pyspark.pandas`
2. `from databricks import koalas as ks` -> `from pyspark import pandas as pp`
3. `ks.xxx -> pp.xxx`

Other than them:
1. Added a line to `python/mypy.ini` in order to ignore the mypy test. See related issue at [SPARK-34941](https://issues.apache.org/jira/browse/SPARK-34941).
2. Added a comment to several lines in several files to ignore the flake8 F401. See related issue at [SPARK-34943](https://issues.apache.org/jira/browse/SPARK-34943).

When this PR is merged, all the features that were previously used in [Koalas](https://github.com/databricks/koalas) will be available in PySpark as well.

Users can access to the pandas API in PySpark as below:

```python
>>> from pyspark import pandas as pp
>>> ppdf = pp.DataFrame({"A": [1, 2, 3], "B": [15, 20, 25]})
>>> ppdf
   A   B
0  1  15
1  2  20
2  3  25
```

The existing "options and settings" in Koalas are also available in the same way:

```python
>>> from pyspark.pandas.config import set_option, reset_option, get_option
>>> ppser1 = pp.Series([1, 2, 3])
>>> ppser2 = pp.Series([3, 4, 5])
>>> ppser1 + ppser2
Traceback (most recent call last):
...
ValueError: Cannot combine the series or dataframe because it comes from a different dataframe. In order to allow this operation, enable 'compute.ops_on_diff_frames' option.

>>> set_option("compute.ops_on_diff_frames", True)
>>> ppser1 + ppser2
0    4
1    6
2    8
dtype: int64
```

Please also refer to the [API Reference](https://koalas.readthedocs.io/en/latest/reference/index.html) and [Options and Settings](https://koalas.readthedocs.io/en/latest/user_guide/options.html) for more detail.

**NOTE** that this PR intentionally ports the main codes of Koalas first almost as are with minimal changes because:
- Koalas project is fairly large. Making some changes together for PySpark will make it difficult to review the individual change.
    Koalas dev includes multiple Spark committers who will review. By doing this, the committers will be able to more easily and effectively review and drive the development.
- Koalas tests and documentation require major changes to make it look great together with PySpark whereas main codes do not require.
- We lately froze the Koalas codebase, and plan to work together on the initial porting. By porting the main codes first as are, it unblocks the Koalas dev to work on other items in parallel.

I promise and will make sure on:
- Rename Koalas to PySpark pandas APIs and/or pandas-on-Spark accordingly in documentation, and the docstrings and comments in the main codes.
- Triage APIs to remove that don’t make sense when Koalas is in PySpark

The documentation changes will be tracked in [SPARK-34885](https://issues.apache.org/jira/browse/SPARK-34885), the test code changes will be tracked in [SPARK-34886](https://issues.apache.org/jira/browse/SPARK-34886).

### Why are the changes needed?

Please refer to:
- [[DISCUSS] Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/DISCUSS-Support-pandas-API-layer-on-PySpark-td30945.html)
- [[VOTE] SPIP: Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/VOTE-SPIP-Support-pandas-API-layer-on-PySpark-td30996.html)

### Does this PR introduce _any_ user-facing change?

Yes, now users can use the pandas APIs on Spark

### How was this patch tested?

Manually tested for exposed major APIs and options as described above.

### Koalas contributors

Koalas would not have been possible without the following contributors:

ueshin
HyukjinKwon
rxin
xinrong-databricks
RainFung
charlesdong1991
harupy
floscha
beobest2
thunterdb
garawalid
LucasG0
shril
deepyaman
gioa
fwani
90jam
thoo
AbdealiJK
abishekganesh72
gliptak
DumbMachine
dvgodoy
stbof
nitlev
hjoo
gatorsmile
tomspur
icexelloss
awdavidson
guyao
akhilputhiry
scook12
patryk-oleniuk
tracek
dennyglee
athena15
gstaubli
WeichenXu123
hsubbaraj
lfdversluis
ktksq
shengjh
margaret-databricks
LSturtew
sllynn
manuzhang
jijosg
sadikovi

Closes #32036 from itholic/SPARK-34890.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-05 23:42:39 -04:00
+								        ...                   columns=['dogs', 'cats'],
 								        ...                   index=[list('abcd'), list('efgh')])
 								        >>> df['dogs'].index.to_pandas()  # doctest: +SKIP
 								        MultiIndex([('a', 'e'),
 								                    ('b', 'f'),
 								                    ('c', 'g'),
 								                    ('d', 'h')],
 								                   )
 								        """
 								        # TODO: We might need to handle internal state change.
 								        # So far, we don't have any functions to change the internal state of MultiIndex except for
 								        # series-like operations. In that case, it creates new Index object instead of MultiIndex.
 								        return super().to_pandas()
 								    def toPandas(self) -> pd.MultiIndex:
 								        warnings.warn(
 								            "MultiIndex.toPandas is deprecated as of MultiIndex.to_pandas. "
 								            "Please use the API instead.",
 								            FutureWarning,
 								        )
 								        return self.to_pandas()
 								    toPandas.__doc__ = to_pandas.__doc__
 								    def nunique(self, dropna=True) -> None:  # type: ignore
 								        raise NotImplementedError("nunique is not defined for MultiIndex")
 								    # TODO: add 'name' parameter after pd.MultiIndex.name is implemented
 								    def copy(self, deep=None) -> "MultiIndex":  # type: ignore
 								        """
 								        Make a copy of this object.
 								        Parameters
 								        ----------
 								        deep : None
 								            this parameter is not supported but just dummy parameter to match pandas.
 								        Examples
 								        --------
-												[SPARK-34983][PYTHON] Renaming the package alias from pp to ps

### What changes were proposed in this pull request?

This PR proposes to fix:

```python
import pyspark.pandas as pp
```

to

```python
import pyspark.pandas as ps
```

### Why are the changes needed?

`pp` might sound offensive in some contexts.

### Does this PR introduce _any_ user-facing change?

The change is in master only. We'll use `ps` as the short name instead of `pp`.

### How was this patch tested?

The CI in this PR will test it out.

Closes #32108 from LSturtew/renaming_pyspark.pandas.

Authored-by: Luka Sturtewagen <luka.sturtewagen@linkit.nl>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-11 22:18:08 -04:00
+								        >>> df = ps.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)],
-												[SPARK-34890][PYTHON] Port/integrate Koalas main codes into PySpark

### What changes were proposed in this pull request?

As a first step of [SPARK-34849](https://issues.apache.org/jira/browse/SPARK-34849), this PR proposes porting the Koalas main code into PySpark.

This PR contains minimal changes to the existing Koalas code as follows:
1. `databricks.koalas` -> `pyspark.pandas`
2. `from databricks import koalas as ks` -> `from pyspark import pandas as pp`
3. `ks.xxx -> pp.xxx`

Other than them:
1. Added a line to `python/mypy.ini` in order to ignore the mypy test. See related issue at [SPARK-34941](https://issues.apache.org/jira/browse/SPARK-34941).
2. Added a comment to several lines in several files to ignore the flake8 F401. See related issue at [SPARK-34943](https://issues.apache.org/jira/browse/SPARK-34943).

When this PR is merged, all the features that were previously used in [Koalas](https://github.com/databricks/koalas) will be available in PySpark as well.

Users can access to the pandas API in PySpark as below:

```python
>>> from pyspark import pandas as pp
>>> ppdf = pp.DataFrame({"A": [1, 2, 3], "B": [15, 20, 25]})
>>> ppdf
   A   B
0  1  15
1  2  20
2  3  25
```

The existing "options and settings" in Koalas are also available in the same way:

```python
>>> from pyspark.pandas.config import set_option, reset_option, get_option
>>> ppser1 = pp.Series([1, 2, 3])
>>> ppser2 = pp.Series([3, 4, 5])
>>> ppser1 + ppser2
Traceback (most recent call last):
...
ValueError: Cannot combine the series or dataframe because it comes from a different dataframe. In order to allow this operation, enable 'compute.ops_on_diff_frames' option.

>>> set_option("compute.ops_on_diff_frames", True)
>>> ppser1 + ppser2
0    4
1    6
2    8
dtype: int64
```

Please also refer to the [API Reference](https://koalas.readthedocs.io/en/latest/reference/index.html) and [Options and Settings](https://koalas.readthedocs.io/en/latest/user_guide/options.html) for more detail.

**NOTE** that this PR intentionally ports the main codes of Koalas first almost as are with minimal changes because:
- Koalas project is fairly large. Making some changes together for PySpark will make it difficult to review the individual change.
    Koalas dev includes multiple Spark committers who will review. By doing this, the committers will be able to more easily and effectively review and drive the development.
- Koalas tests and documentation require major changes to make it look great together with PySpark whereas main codes do not require.
- We lately froze the Koalas codebase, and plan to work together on the initial porting. By porting the main codes first as are, it unblocks the Koalas dev to work on other items in parallel.

I promise and will make sure on:
- Rename Koalas to PySpark pandas APIs and/or pandas-on-Spark accordingly in documentation, and the docstrings and comments in the main codes.
- Triage APIs to remove that don’t make sense when Koalas is in PySpark

The documentation changes will be tracked in [SPARK-34885](https://issues.apache.org/jira/browse/SPARK-34885), the test code changes will be tracked in [SPARK-34886](https://issues.apache.org/jira/browse/SPARK-34886).

### Why are the changes needed?

Please refer to:
- [[DISCUSS] Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/DISCUSS-Support-pandas-API-layer-on-PySpark-td30945.html)
- [[VOTE] SPIP: Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/VOTE-SPIP-Support-pandas-API-layer-on-PySpark-td30996.html)

### Does this PR introduce _any_ user-facing change?

Yes, now users can use the pandas APIs on Spark

### How was this patch tested?

Manually tested for exposed major APIs and options as described above.

### Koalas contributors

Koalas would not have been possible without the following contributors:

ueshin
HyukjinKwon
rxin
xinrong-databricks
RainFung
charlesdong1991
harupy
floscha
beobest2
thunterdb
garawalid
LucasG0
shril
deepyaman
gioa
fwani
90jam
thoo
AbdealiJK
abishekganesh72
gliptak
DumbMachine
dvgodoy
stbof
nitlev
hjoo
gatorsmile
tomspur
icexelloss
awdavidson
guyao
akhilputhiry
scook12
patryk-oleniuk
tracek
dennyglee
athena15
gstaubli
WeichenXu123
hsubbaraj
lfdversluis
ktksq
shengjh
margaret-databricks
LSturtew
sllynn
manuzhang
jijosg
sadikovi

Closes #32036 from itholic/SPARK-34890.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-05 23:42:39 -04:00
+								        ...                   columns=['dogs', 'cats'],
 								        ...                   index=[list('abcd'), list('efgh')])
 								        >>> df['dogs'].index  # doctest: +SKIP
 								        MultiIndex([('a', 'e'),
 								                    ('b', 'f'),
 								                    ('c', 'g'),
 								                    ('d', 'h')],
 								                   )
 								        Copy index
 								        >>> df.index.copy()  # doctest: +SKIP
 								        MultiIndex([('a', 'e'),
 								                    ('b', 'f'),
 								                    ('c', 'g'),
 								                    ('d', 'h')],
 								                   )
 								        """
 								        return super().copy(deep=deep)  # type: ignore
 								    def symmetric_difference(self, other, result_name=None, sort=None) -> "MultiIndex":
 								        """
 								        Compute the symmetric difference of two MultiIndex objects.
 								        Parameters
 								        ----------
 								        other : Index or array-like
 								        result_name : list
 								        sort : True or None, default None
 								            Whether to sort the resulting index.
 								            * True : Attempt to sort the result.
 								            * None : Do not sort the result.
 								        Returns
 								        -------
 								        symmetric_difference : MiltiIndex
 								        Notes
 								        -----
 								        ``symmetric_difference`` contains elements that appear in either
 								        ``idx1`` or ``idx2`` but not both. Equivalent to the Index created by
 								        ``idx1.difference(idx2) | idx2.difference(idx1)`` with duplicates
 								        dropped.
 								        Examples
 								        --------
 								        >>> midx1 = pd.MultiIndex([['lama', 'cow', 'falcon'],
 								        ...                        ['speed', 'weight', 'length']],
 								        ...                       [[0, 0, 0, 1, 1, 1, 2, 2, 2],
 								        ...                        [0, 0, 0, 0, 1, 2, 0, 1, 2]])
 								        >>> midx2 = pd.MultiIndex([['koalas', 'cow', 'falcon'],
 								        ...                        ['speed', 'weight', 'length']],
 								        ...                       [[0, 0, 0, 1, 1, 1, 2, 2, 2],
 								        ...                        [0, 0, 0, 0, 1, 2, 0, 1, 2]])
-												[SPARK-34983][PYTHON] Renaming the package alias from pp to ps

### What changes were proposed in this pull request?

This PR proposes to fix:

```python
import pyspark.pandas as pp
```

to

```python
import pyspark.pandas as ps
```

### Why are the changes needed?

`pp` might sound offensive in some contexts.

### Does this PR introduce _any_ user-facing change?

The change is in master only. We'll use `ps` as the short name instead of `pp`.

### How was this patch tested?

The CI in this PR will test it out.

Closes #32108 from LSturtew/renaming_pyspark.pandas.

Authored-by: Luka Sturtewagen <luka.sturtewagen@linkit.nl>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-11 22:18:08 -04:00
+								        >>> s1 = ps.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3],
-												[SPARK-34890][PYTHON] Port/integrate Koalas main codes into PySpark

### What changes were proposed in this pull request?

As a first step of [SPARK-34849](https://issues.apache.org/jira/browse/SPARK-34849), this PR proposes porting the Koalas main code into PySpark.

This PR contains minimal changes to the existing Koalas code as follows:
1. `databricks.koalas` -> `pyspark.pandas`
2. `from databricks import koalas as ks` -> `from pyspark import pandas as pp`
3. `ks.xxx -> pp.xxx`

Other than them:
1. Added a line to `python/mypy.ini` in order to ignore the mypy test. See related issue at [SPARK-34941](https://issues.apache.org/jira/browse/SPARK-34941).
2. Added a comment to several lines in several files to ignore the flake8 F401. See related issue at [SPARK-34943](https://issues.apache.org/jira/browse/SPARK-34943).

When this PR is merged, all the features that were previously used in [Koalas](https://github.com/databricks/koalas) will be available in PySpark as well.

Users can access to the pandas API in PySpark as below:

```python
>>> from pyspark import pandas as pp
>>> ppdf = pp.DataFrame({"A": [1, 2, 3], "B": [15, 20, 25]})
>>> ppdf
   A   B
0  1  15
1  2  20
2  3  25
```

The existing "options and settings" in Koalas are also available in the same way:

```python
>>> from pyspark.pandas.config import set_option, reset_option, get_option
>>> ppser1 = pp.Series([1, 2, 3])
>>> ppser2 = pp.Series([3, 4, 5])
>>> ppser1 + ppser2
Traceback (most recent call last):
...
ValueError: Cannot combine the series or dataframe because it comes from a different dataframe. In order to allow this operation, enable 'compute.ops_on_diff_frames' option.

>>> set_option("compute.ops_on_diff_frames", True)
>>> ppser1 + ppser2
0    4
1    6
2    8
dtype: int64
```

Please also refer to the [API Reference](https://koalas.readthedocs.io/en/latest/reference/index.html) and [Options and Settings](https://koalas.readthedocs.io/en/latest/user_guide/options.html) for more detail.

**NOTE** that this PR intentionally ports the main codes of Koalas first almost as are with minimal changes because:
- Koalas project is fairly large. Making some changes together for PySpark will make it difficult to review the individual change.
    Koalas dev includes multiple Spark committers who will review. By doing this, the committers will be able to more easily and effectively review and drive the development.
- Koalas tests and documentation require major changes to make it look great together with PySpark whereas main codes do not require.
- We lately froze the Koalas codebase, and plan to work together on the initial porting. By porting the main codes first as are, it unblocks the Koalas dev to work on other items in parallel.

I promise and will make sure on:
- Rename Koalas to PySpark pandas APIs and/or pandas-on-Spark accordingly in documentation, and the docstrings and comments in the main codes.
- Triage APIs to remove that don’t make sense when Koalas is in PySpark

The documentation changes will be tracked in [SPARK-34885](https://issues.apache.org/jira/browse/SPARK-34885), the test code changes will be tracked in [SPARK-34886](https://issues.apache.org/jira/browse/SPARK-34886).

### Why are the changes needed?

Please refer to:
- [[DISCUSS] Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/DISCUSS-Support-pandas-API-layer-on-PySpark-td30945.html)
- [[VOTE] SPIP: Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/VOTE-SPIP-Support-pandas-API-layer-on-PySpark-td30996.html)

### Does this PR introduce _any_ user-facing change?

Yes, now users can use the pandas APIs on Spark

### How was this patch tested?

Manually tested for exposed major APIs and options as described above.

### Koalas contributors

Koalas would not have been possible without the following contributors:

ueshin
HyukjinKwon
rxin
xinrong-databricks
RainFung
charlesdong1991
harupy
floscha
beobest2
thunterdb
garawalid
LucasG0
shril
deepyaman
gioa
fwani
90jam
thoo
AbdealiJK
abishekganesh72
gliptak
DumbMachine
dvgodoy
stbof
nitlev
hjoo
gatorsmile
tomspur
icexelloss
awdavidson
guyao
akhilputhiry
scook12
patryk-oleniuk
tracek
dennyglee
athena15
gstaubli
WeichenXu123
hsubbaraj
lfdversluis
ktksq
shengjh
margaret-databricks
LSturtew
sllynn
manuzhang
jijosg
sadikovi

Closes #32036 from itholic/SPARK-34890.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-05 23:42:39 -04:00
+								        ...                index=midx1)
-												[SPARK-34983][PYTHON] Renaming the package alias from pp to ps

### What changes were proposed in this pull request?

This PR proposes to fix:

```python
import pyspark.pandas as pp
```

to

```python
import pyspark.pandas as ps
```

### Why are the changes needed?

`pp` might sound offensive in some contexts.

### Does this PR introduce _any_ user-facing change?

The change is in master only. We'll use `ps` as the short name instead of `pp`.

### How was this patch tested?

The CI in this PR will test it out.

Closes #32108 from LSturtew/renaming_pyspark.pandas.

Authored-by: Luka Sturtewagen <luka.sturtewagen@linkit.nl>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-11 22:18:08 -04:00
+								        >>> s2 = ps.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3],
-												[SPARK-34890][PYTHON] Port/integrate Koalas main codes into PySpark

### What changes were proposed in this pull request?

As a first step of [SPARK-34849](https://issues.apache.org/jira/browse/SPARK-34849), this PR proposes porting the Koalas main code into PySpark.

This PR contains minimal changes to the existing Koalas code as follows:
1. `databricks.koalas` -> `pyspark.pandas`
2. `from databricks import koalas as ks` -> `from pyspark import pandas as pp`
3. `ks.xxx -> pp.xxx`

Other than them:
1. Added a line to `python/mypy.ini` in order to ignore the mypy test. See related issue at [SPARK-34941](https://issues.apache.org/jira/browse/SPARK-34941).
2. Added a comment to several lines in several files to ignore the flake8 F401. See related issue at [SPARK-34943](https://issues.apache.org/jira/browse/SPARK-34943).

When this PR is merged, all the features that were previously used in [Koalas](https://github.com/databricks/koalas) will be available in PySpark as well.

Users can access to the pandas API in PySpark as below:

```python
>>> from pyspark import pandas as pp
>>> ppdf = pp.DataFrame({"A": [1, 2, 3], "B": [15, 20, 25]})
>>> ppdf
   A   B
0  1  15
1  2  20
2  3  25
```

The existing "options and settings" in Koalas are also available in the same way:

```python
>>> from pyspark.pandas.config import set_option, reset_option, get_option
>>> ppser1 = pp.Series([1, 2, 3])
>>> ppser2 = pp.Series([3, 4, 5])
>>> ppser1 + ppser2
Traceback (most recent call last):
...
ValueError: Cannot combine the series or dataframe because it comes from a different dataframe. In order to allow this operation, enable 'compute.ops_on_diff_frames' option.

>>> set_option("compute.ops_on_diff_frames", True)
>>> ppser1 + ppser2
0    4
1    6
2    8
dtype: int64
```

Please also refer to the [API Reference](https://koalas.readthedocs.io/en/latest/reference/index.html) and [Options and Settings](https://koalas.readthedocs.io/en/latest/user_guide/options.html) for more detail.

**NOTE** that this PR intentionally ports the main codes of Koalas first almost as are with minimal changes because:
- Koalas project is fairly large. Making some changes together for PySpark will make it difficult to review the individual change.
    Koalas dev includes multiple Spark committers who will review. By doing this, the committers will be able to more easily and effectively review and drive the development.
- Koalas tests and documentation require major changes to make it look great together with PySpark whereas main codes do not require.
- We lately froze the Koalas codebase, and plan to work together on the initial porting. By porting the main codes first as are, it unblocks the Koalas dev to work on other items in parallel.

I promise and will make sure on:
- Rename Koalas to PySpark pandas APIs and/or pandas-on-Spark accordingly in documentation, and the docstrings and comments in the main codes.
- Triage APIs to remove that don’t make sense when Koalas is in PySpark

The documentation changes will be tracked in [SPARK-34885](https://issues.apache.org/jira/browse/SPARK-34885), the test code changes will be tracked in [SPARK-34886](https://issues.apache.org/jira/browse/SPARK-34886).

### Why are the changes needed?

Please refer to:
- [[DISCUSS] Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/DISCUSS-Support-pandas-API-layer-on-PySpark-td30945.html)
- [[VOTE] SPIP: Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/VOTE-SPIP-Support-pandas-API-layer-on-PySpark-td30996.html)

### Does this PR introduce _any_ user-facing change?

Yes, now users can use the pandas APIs on Spark

### How was this patch tested?

Manually tested for exposed major APIs and options as described above.

### Koalas contributors

Koalas would not have been possible without the following contributors:

ueshin
HyukjinKwon
rxin
xinrong-databricks
RainFung
charlesdong1991
harupy
floscha
beobest2
thunterdb
garawalid
LucasG0
shril
deepyaman
gioa
fwani
90jam
thoo
AbdealiJK
abishekganesh72
gliptak
DumbMachine
dvgodoy
stbof
nitlev
hjoo
gatorsmile
tomspur
icexelloss
awdavidson
guyao
akhilputhiry
scook12
patryk-oleniuk
tracek
dennyglee
athena15
gstaubli
WeichenXu123
hsubbaraj
lfdversluis
ktksq
shengjh
margaret-databricks
LSturtew
sllynn
manuzhang
jijosg
sadikovi

Closes #32036 from itholic/SPARK-34890.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-05 23:42:39 -04:00
+								        ...              index=midx2)
 								        >>> s1.index.symmetric_difference(s2.index)  # doctest: +SKIP
 								        MultiIndex([('koalas', 'speed'),
 								                    (  'lama', 'speed')],
 								                   )
 								        You can set names of result Index.
 								        >>> s1.index.symmetric_difference(s2.index, result_name=['a', 'b'])  # doctest: +SKIP
 								        MultiIndex([('koalas', 'speed'),
 								                    (  'lama', 'speed')],
 								                   names=['a', 'b'])
 								        You can set sort to `True`, if you want to sort the resulting index.
 								        >>> s1.index.symmetric_difference(s2.index, sort=True)  # doctest: +SKIP
 								        MultiIndex([('koalas', 'speed'),
 								                    (  'lama', 'speed')],
 								                   )
 								        You can also use the ``^`` operator:
 								        >>> s1.index ^ s2.index  # doctest: +SKIP
 								        MultiIndex([('koalas', 'speed'),
 								                    (  'lama', 'speed')],
 								                   )
 								        """
 								        if type(self) != type(other):
 								            raise NotImplementedError(
 								                "Doesn't support symmetric_difference between Index & MultiIndex for now"
 								            )
 								        sdf_self = self._kdf._internal.spark_frame.select(self._internal.index_spark_columns)
 								        sdf_other = other._kdf._internal.spark_frame.select(other._internal.index_spark_columns)
 								        sdf_symdiff = sdf_self.union(sdf_other).subtract(sdf_self.intersect(sdf_other))
 								        if sort:
 								            sdf_symdiff = sdf_symdiff.sort(self._internal.index_spark_columns)
 								        internal = InternalFrame(  # TODO: dtypes?
 								            spark_frame=sdf_symdiff,
 								            index_spark_columns=[
 								                scol_for(sdf_symdiff, col) for col in self._internal.index_spark_column_names
 								            ],
 								            index_names=self._internal.index_names,
 								        )
 								        result = cast(MultiIndex, DataFrame(internal).index)
 								        if result_name:
 								            result.names = result_name
 								        return result
 								    # TODO: ADD error parameter
 								    def drop(self, codes, level=None) -> "MultiIndex":
 								        """
 								        Make new MultiIndex with passed list of labels deleted
 								        Parameters
 								        ----------
 								        codes : array-like
 								            Must be a list of tuples
 								        level : int or level name, default None
 								        Returns
 								        -------
 								        dropped : MultiIndex
 								        Examples
 								        --------
-												[SPARK-34983][PYTHON] Renaming the package alias from pp to ps

### What changes were proposed in this pull request?

This PR proposes to fix:

```python
import pyspark.pandas as pp
```

to

```python
import pyspark.pandas as ps
```

### Why are the changes needed?

`pp` might sound offensive in some contexts.

### Does this PR introduce _any_ user-facing change?

The change is in master only. We'll use `ps` as the short name instead of `pp`.

### How was this patch tested?

The CI in this PR will test it out.

Closes #32108 from LSturtew/renaming_pyspark.pandas.

Authored-by: Luka Sturtewagen <luka.sturtewagen@linkit.nl>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-11 22:18:08 -04:00
+								        >>> index = ps.MultiIndex.from_tuples([('a', 'x'), ('b', 'y'), ('c', 'z')])
-												[SPARK-34890][PYTHON] Port/integrate Koalas main codes into PySpark

### What changes were proposed in this pull request?

As a first step of [SPARK-34849](https://issues.apache.org/jira/browse/SPARK-34849), this PR proposes porting the Koalas main code into PySpark.

This PR contains minimal changes to the existing Koalas code as follows:
1. `databricks.koalas` -> `pyspark.pandas`
2. `from databricks import koalas as ks` -> `from pyspark import pandas as pp`
3. `ks.xxx -> pp.xxx`

Other than them:
1. Added a line to `python/mypy.ini` in order to ignore the mypy test. See related issue at [SPARK-34941](https://issues.apache.org/jira/browse/SPARK-34941).
2. Added a comment to several lines in several files to ignore the flake8 F401. See related issue at [SPARK-34943](https://issues.apache.org/jira/browse/SPARK-34943).

When this PR is merged, all the features that were previously used in [Koalas](https://github.com/databricks/koalas) will be available in PySpark as well.

Users can access to the pandas API in PySpark as below:

```python
>>> from pyspark import pandas as pp
>>> ppdf = pp.DataFrame({"A": [1, 2, 3], "B": [15, 20, 25]})
>>> ppdf
   A   B
0  1  15
1  2  20
2  3  25
```

The existing "options and settings" in Koalas are also available in the same way:

```python
>>> from pyspark.pandas.config import set_option, reset_option, get_option
>>> ppser1 = pp.Series([1, 2, 3])
>>> ppser2 = pp.Series([3, 4, 5])
>>> ppser1 + ppser2
Traceback (most recent call last):
...
ValueError: Cannot combine the series or dataframe because it comes from a different dataframe. In order to allow this operation, enable 'compute.ops_on_diff_frames' option.

>>> set_option("compute.ops_on_diff_frames", True)
>>> ppser1 + ppser2
0    4
1    6
2    8
dtype: int64
```

Please also refer to the [API Reference](https://koalas.readthedocs.io/en/latest/reference/index.html) and [Options and Settings](https://koalas.readthedocs.io/en/latest/user_guide/options.html) for more detail.

**NOTE** that this PR intentionally ports the main codes of Koalas first almost as are with minimal changes because:
- Koalas project is fairly large. Making some changes together for PySpark will make it difficult to review the individual change.
    Koalas dev includes multiple Spark committers who will review. By doing this, the committers will be able to more easily and effectively review and drive the development.
- Koalas tests and documentation require major changes to make it look great together with PySpark whereas main codes do not require.
- We lately froze the Koalas codebase, and plan to work together on the initial porting. By porting the main codes first as are, it unblocks the Koalas dev to work on other items in parallel.

I promise and will make sure on:
- Rename Koalas to PySpark pandas APIs and/or pandas-on-Spark accordingly in documentation, and the docstrings and comments in the main codes.
- Triage APIs to remove that don’t make sense when Koalas is in PySpark

The documentation changes will be tracked in [SPARK-34885](https://issues.apache.org/jira/browse/SPARK-34885), the test code changes will be tracked in [SPARK-34886](https://issues.apache.org/jira/browse/SPARK-34886).

### Why are the changes needed?

Please refer to:
- [[DISCUSS] Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/DISCUSS-Support-pandas-API-layer-on-PySpark-td30945.html)
- [[VOTE] SPIP: Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/VOTE-SPIP-Support-pandas-API-layer-on-PySpark-td30996.html)

### Does this PR introduce _any_ user-facing change?

Yes, now users can use the pandas APIs on Spark

### How was this patch tested?

Manually tested for exposed major APIs and options as described above.

### Koalas contributors

Koalas would not have been possible without the following contributors:

ueshin
HyukjinKwon
rxin
xinrong-databricks
RainFung
charlesdong1991
harupy
floscha
beobest2
thunterdb
garawalid
LucasG0
shril
deepyaman
gioa
fwani
90jam
thoo
AbdealiJK
abishekganesh72
gliptak
DumbMachine
dvgodoy
stbof
nitlev
hjoo
gatorsmile
tomspur
icexelloss
awdavidson
guyao
akhilputhiry
scook12
patryk-oleniuk
tracek
dennyglee
athena15
gstaubli
WeichenXu123
hsubbaraj
lfdversluis
ktksq
shengjh
margaret-databricks
LSturtew
sllynn
manuzhang
jijosg
sadikovi

Closes #32036 from itholic/SPARK-34890.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-05 23:42:39 -04:00
+								        >>> index # doctest: +SKIP
 								        MultiIndex([('a', 'x'),
 								                    ('b', 'y'),
 								                    ('c', 'z')],
 								                   )
 								        >>> index.drop(['a']) # doctest: +SKIP
 								        MultiIndex([('b', 'y'),
 								                    ('c', 'z')],
 								                   )
 								        >>> index.drop(['x', 'y'], level=1) # doctest: +SKIP
 								        MultiIndex([('c', 'z')],
 								                   )
 								        """
 								        internal = self._internal.resolved_copy
 								        sdf = internal.spark_frame
 								        index_scols = internal.index_spark_columns
 								        if level is None:
 								            scol = index_scols[0]
 								        elif isinstance(level, int):
 								            scol = index_scols[level]
 								        else:
 								            scol = None
 								            for index_spark_column, index_name in zip(
 								                internal.index_spark_columns, internal.index_names
 								            ):
 								                if not isinstance(level, tuple):
 								                    level = (level,)
 								                if level == index_name:
 								                    if scol is not None:
 								                        raise ValueError(
 								                            "The name {} occurs multiple times, use a level number".format(
 								                                name_like_string(level)
 								                            )
 								                        )
 								                    scol = index_spark_column
 								            if scol is None:
 								                raise KeyError("Level {} not found".format(name_like_string(level)))
 								        sdf = sdf[~scol.isin(codes)]
 								        internal = InternalFrame(
 								            spark_frame=sdf,
 								            index_spark_columns=[scol_for(sdf, col) for col in internal.index_spark_column_names],
 								            index_names=internal.index_names,
 								            index_dtypes=internal.index_dtypes,
 								            column_labels=[],
 								            data_spark_columns=[],
 								            data_dtypes=[],
 								        )
 								        return cast(MultiIndex, DataFrame(internal).index)
 								    def argmax(self) -> None:
 								        raise TypeError("reduction operation 'argmax' not allowed for this dtype")
 								    def argmin(self) -> None:
 								        raise TypeError("reduction operation 'argmin' not allowed for this dtype")
 								    def asof(self, label) -> None:
 								        raise NotImplementedError(
 								            "only the default get_loc method is currently supported for MultiIndex"
 								        )
 								    @property
 								    def is_all_dates(self) -> bool:
 								        """
 								        is_all_dates always returns False for MultiIndex
 								        Examples
 								        --------
 								        >>> from datetime import datetime
-												[SPARK-34983][PYTHON] Renaming the package alias from pp to ps

### What changes were proposed in this pull request?

This PR proposes to fix:

```python
import pyspark.pandas as pp
```

to

```python
import pyspark.pandas as ps
```

### Why are the changes needed?

`pp` might sound offensive in some contexts.

### Does this PR introduce _any_ user-facing change?

The change is in master only. We'll use `ps` as the short name instead of `pp`.

### How was this patch tested?

The CI in this PR will test it out.

Closes #32108 from LSturtew/renaming_pyspark.pandas.

Authored-by: Luka Sturtewagen <luka.sturtewagen@linkit.nl>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-11 22:18:08 -04:00
+								        >>> idx = ps.MultiIndex.from_tuples(
-												[SPARK-34890][PYTHON] Port/integrate Koalas main codes into PySpark

### What changes were proposed in this pull request?

As a first step of [SPARK-34849](https://issues.apache.org/jira/browse/SPARK-34849), this PR proposes porting the Koalas main code into PySpark.

This PR contains minimal changes to the existing Koalas code as follows:
1. `databricks.koalas` -> `pyspark.pandas`
2. `from databricks import koalas as ks` -> `from pyspark import pandas as pp`
3. `ks.xxx -> pp.xxx`

Other than them:
1. Added a line to `python/mypy.ini` in order to ignore the mypy test. See related issue at [SPARK-34941](https://issues.apache.org/jira/browse/SPARK-34941).
2. Added a comment to several lines in several files to ignore the flake8 F401. See related issue at [SPARK-34943](https://issues.apache.org/jira/browse/SPARK-34943).

When this PR is merged, all the features that were previously used in [Koalas](https://github.com/databricks/koalas) will be available in PySpark as well.

Users can access to the pandas API in PySpark as below:

```python
>>> from pyspark import pandas as pp
>>> ppdf = pp.DataFrame({"A": [1, 2, 3], "B": [15, 20, 25]})
>>> ppdf
   A   B
0  1  15
1  2  20
2  3  25
```

The existing "options and settings" in Koalas are also available in the same way:

```python
>>> from pyspark.pandas.config import set_option, reset_option, get_option
>>> ppser1 = pp.Series([1, 2, 3])
>>> ppser2 = pp.Series([3, 4, 5])
>>> ppser1 + ppser2
Traceback (most recent call last):
...
ValueError: Cannot combine the series or dataframe because it comes from a different dataframe. In order to allow this operation, enable 'compute.ops_on_diff_frames' option.

>>> set_option("compute.ops_on_diff_frames", True)
>>> ppser1 + ppser2
0    4
1    6
2    8
dtype: int64
```

Please also refer to the [API Reference](https://koalas.readthedocs.io/en/latest/reference/index.html) and [Options and Settings](https://koalas.readthedocs.io/en/latest/user_guide/options.html) for more detail.

**NOTE** that this PR intentionally ports the main codes of Koalas first almost as are with minimal changes because:
- Koalas project is fairly large. Making some changes together for PySpark will make it difficult to review the individual change.
    Koalas dev includes multiple Spark committers who will review. By doing this, the committers will be able to more easily and effectively review and drive the development.
- Koalas tests and documentation require major changes to make it look great together with PySpark whereas main codes do not require.
- We lately froze the Koalas codebase, and plan to work together on the initial porting. By porting the main codes first as are, it unblocks the Koalas dev to work on other items in parallel.

I promise and will make sure on:
- Rename Koalas to PySpark pandas APIs and/or pandas-on-Spark accordingly in documentation, and the docstrings and comments in the main codes.
- Triage APIs to remove that don’t make sense when Koalas is in PySpark

The documentation changes will be tracked in [SPARK-34885](https://issues.apache.org/jira/browse/SPARK-34885), the test code changes will be tracked in [SPARK-34886](https://issues.apache.org/jira/browse/SPARK-34886).

### Why are the changes needed?

Please refer to:
- [[DISCUSS] Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/DISCUSS-Support-pandas-API-layer-on-PySpark-td30945.html)
- [[VOTE] SPIP: Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/VOTE-SPIP-Support-pandas-API-layer-on-PySpark-td30996.html)

### Does this PR introduce _any_ user-facing change?

Yes, now users can use the pandas APIs on Spark

### How was this patch tested?

Manually tested for exposed major APIs and options as described above.

### Koalas contributors

Koalas would not have been possible without the following contributors:

ueshin
HyukjinKwon
rxin
xinrong-databricks
RainFung
charlesdong1991
harupy
floscha
beobest2
thunterdb
garawalid
LucasG0
shril
deepyaman
gioa
fwani
90jam
thoo
AbdealiJK
abishekganesh72
gliptak
DumbMachine
dvgodoy
stbof
nitlev
hjoo
gatorsmile
tomspur
icexelloss
awdavidson
guyao
akhilputhiry
scook12
patryk-oleniuk
tracek
dennyglee
athena15
gstaubli
WeichenXu123
hsubbaraj
lfdversluis
ktksq
shengjh
margaret-databricks
LSturtew
sllynn
manuzhang
jijosg
sadikovi

Closes #32036 from itholic/SPARK-34890.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-05 23:42:39 -04:00
+								        ...     [(datetime(2019, 1, 1, 0, 0, 0), datetime(2019, 1, 1, 0, 0, 0)),
 								        ...      (datetime(2019, 1, 1, 0, 0, 0), datetime(2019, 1, 1, 0, 0, 0))])
 								        >>> idx  # doctest: +SKIP
 								        MultiIndex([('2019-01-01', '2019-01-01'),
 								                    ('2019-01-01', '2019-01-01')],
 								                   )
 								        >>> idx.is_all_dates
 								        False
 								        """
 								        return False
 								    def __getattr__(self, item: str) -> Any:
 								        if hasattr(MissingPandasLikeMultiIndex, item):
 								            property_or_func = getattr(MissingPandasLikeMultiIndex, item)
 								            if isinstance(property_or_func, property):
 								                return property_or_func.fget(self)  # type: ignore
 								            else:
 								                return partial(property_or_func, self)
 								        raise AttributeError("'MultiIndex' object has no attribute '{}'".format(item))
 								    def _get_level_number(self, level) -> Optional[int]:
 								        """
 								        Return the level number if a valid level is given.
 								        """
 								        count = self.names.count(level)
 								        if (count > 1) and not isinstance(level, int):
 								            raise ValueError("The name %s occurs multiple times, use a level number" % level)
 								        if level in self.names:
 								            level = self.names.index(level)
 								        elif isinstance(level, int):
 								            nlevels = self.nlevels
 								            if level >= nlevels:
 								                raise IndexError(
 								                    "Too many levels: Index has only %d "
 								                    "levels, %d is not a valid level number" % (nlevels, level)
 								                )
 								            if level < 0:
 								                if (level + nlevels) < 0:
 								                    raise IndexError(
 								                        "Too many levels: Index has only %d levels, "
 								                        "not %d" % (nlevels, level + 1)
 								                    )
 								                level = level + nlevels
 								        else:
 								            raise KeyError("Level %s not found" % str(level))
 								            return None
 								        return level
 								    def get_level_values(self, level) -> Index:
 								        """
 								        Return vector of label values for requested level,
 								        equal to the length of the index.
 								        Parameters
 								        ----------
 								        level : int or str
 								            ``level`` is either the integer position of the level in the
 								            MultiIndex, or the name of the level.
 								        Returns
 								        -------
 								        values : Index
 								            Values is a level of this MultiIndex converted to
 								            a single :class:`Index` (or subclass thereof).
 								        Examples
 								        --------
 								        Create a MultiIndex:
-												[SPARK-34983][PYTHON] Renaming the package alias from pp to ps

### What changes were proposed in this pull request?

This PR proposes to fix:

```python
import pyspark.pandas as pp
```

to

```python
import pyspark.pandas as ps
```

### Why are the changes needed?

`pp` might sound offensive in some contexts.

### Does this PR introduce _any_ user-facing change?

The change is in master only. We'll use `ps` as the short name instead of `pp`.

### How was this patch tested?

The CI in this PR will test it out.

Closes #32108 from LSturtew/renaming_pyspark.pandas.

Authored-by: Luka Sturtewagen <luka.sturtewagen@linkit.nl>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-11 22:18:08 -04:00
+								        >>> mi = ps.MultiIndex.from_tuples([('x', 'a'), ('x', 'b'), ('y', 'a')])
-												[SPARK-34890][PYTHON] Port/integrate Koalas main codes into PySpark

### What changes were proposed in this pull request?

As a first step of [SPARK-34849](https://issues.apache.org/jira/browse/SPARK-34849), this PR proposes porting the Koalas main code into PySpark.

This PR contains minimal changes to the existing Koalas code as follows:
1. `databricks.koalas` -> `pyspark.pandas`
2. `from databricks import koalas as ks` -> `from pyspark import pandas as pp`
3. `ks.xxx -> pp.xxx`

Other than them:
1. Added a line to `python/mypy.ini` in order to ignore the mypy test. See related issue at [SPARK-34941](https://issues.apache.org/jira/browse/SPARK-34941).
2. Added a comment to several lines in several files to ignore the flake8 F401. See related issue at [SPARK-34943](https://issues.apache.org/jira/browse/SPARK-34943).

When this PR is merged, all the features that were previously used in [Koalas](https://github.com/databricks/koalas) will be available in PySpark as well.

Users can access to the pandas API in PySpark as below:

```python
>>> from pyspark import pandas as pp
>>> ppdf = pp.DataFrame({"A": [1, 2, 3], "B": [15, 20, 25]})
>>> ppdf
   A   B
0  1  15
1  2  20
2  3  25
```

The existing "options and settings" in Koalas are also available in the same way:

```python
>>> from pyspark.pandas.config import set_option, reset_option, get_option
>>> ppser1 = pp.Series([1, 2, 3])
>>> ppser2 = pp.Series([3, 4, 5])
>>> ppser1 + ppser2
Traceback (most recent call last):
...
ValueError: Cannot combine the series or dataframe because it comes from a different dataframe. In order to allow this operation, enable 'compute.ops_on_diff_frames' option.

>>> set_option("compute.ops_on_diff_frames", True)
>>> ppser1 + ppser2
0    4
1    6
2    8
dtype: int64
```

Please also refer to the [API Reference](https://koalas.readthedocs.io/en/latest/reference/index.html) and [Options and Settings](https://koalas.readthedocs.io/en/latest/user_guide/options.html) for more detail.

**NOTE** that this PR intentionally ports the main codes of Koalas first almost as are with minimal changes because:
- Koalas project is fairly large. Making some changes together for PySpark will make it difficult to review the individual change.
    Koalas dev includes multiple Spark committers who will review. By doing this, the committers will be able to more easily and effectively review and drive the development.
- Koalas tests and documentation require major changes to make it look great together with PySpark whereas main codes do not require.
- We lately froze the Koalas codebase, and plan to work together on the initial porting. By porting the main codes first as are, it unblocks the Koalas dev to work on other items in parallel.

I promise and will make sure on:
- Rename Koalas to PySpark pandas APIs and/or pandas-on-Spark accordingly in documentation, and the docstrings and comments in the main codes.
- Triage APIs to remove that don’t make sense when Koalas is in PySpark

The documentation changes will be tracked in [SPARK-34885](https://issues.apache.org/jira/browse/SPARK-34885), the test code changes will be tracked in [SPARK-34886](https://issues.apache.org/jira/browse/SPARK-34886).

### Why are the changes needed?

Please refer to:
- [[DISCUSS] Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/DISCUSS-Support-pandas-API-layer-on-PySpark-td30945.html)
- [[VOTE] SPIP: Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/VOTE-SPIP-Support-pandas-API-layer-on-PySpark-td30996.html)

### Does this PR introduce _any_ user-facing change?

Yes, now users can use the pandas APIs on Spark

### How was this patch tested?

Manually tested for exposed major APIs and options as described above.

### Koalas contributors

Koalas would not have been possible without the following contributors:

ueshin
HyukjinKwon
rxin
xinrong-databricks
RainFung
charlesdong1991
harupy
floscha
beobest2
thunterdb
garawalid
LucasG0
shril
deepyaman
gioa
fwani
90jam
thoo
AbdealiJK
abishekganesh72
gliptak
DumbMachine
dvgodoy
stbof
nitlev
hjoo
gatorsmile
tomspur
icexelloss
awdavidson
guyao
akhilputhiry
scook12
patryk-oleniuk
tracek
dennyglee
athena15
gstaubli
WeichenXu123
hsubbaraj
lfdversluis
ktksq
shengjh
margaret-databricks
LSturtew
sllynn
manuzhang
jijosg
sadikovi

Closes #32036 from itholic/SPARK-34890.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-05 23:42:39 -04:00
+								        >>> mi.names = ['level_1', 'level_2']
 								        Get level values by supplying level as either integer or name:
 								        >>> mi.get_level_values(0)
 								        Index(['x', 'x', 'y'], dtype='object', name='level_1')
 								        >>> mi.get_level_values('level_2')
 								        Index(['a', 'b', 'a'], dtype='object', name='level_2')
 								        """
 								        level = self._get_level_number(level)
 								        index_scol = self._internal.index_spark_columns[level]
 								        index_name = self._internal.index_names[level]
 								        index_dtype = self._internal.index_dtypes[level]
 								        internal = self._internal.copy(
 								            index_spark_columns=[index_scol],
 								            index_names=[index_name],
 								            index_dtypes=[index_dtype],
 								            column_labels=[],
 								            data_spark_columns=[],
 								            data_dtypes=[],
 								        )
 								        return DataFrame(internal).index
 								    def insert(self, loc: int, item) -> Index:
 								        """
 								        Make new MultiIndex inserting new item at location.
 								        Follows Python list.append semantics for negative values.
 								        Parameters
 								        ----------
 								        loc : int
 								        item : object
 								        Returns
 								        -------
 								        new_index : MultiIndex
 								        Examples
 								        --------
-												[SPARK-34983][PYTHON] Renaming the package alias from pp to ps

### What changes were proposed in this pull request?

This PR proposes to fix:

```python
import pyspark.pandas as pp
```

to

```python
import pyspark.pandas as ps
```

### Why are the changes needed?

`pp` might sound offensive in some contexts.

### Does this PR introduce _any_ user-facing change?

The change is in master only. We'll use `ps` as the short name instead of `pp`.

### How was this patch tested?

The CI in this PR will test it out.

Closes #32108 from LSturtew/renaming_pyspark.pandas.

Authored-by: Luka Sturtewagen <luka.sturtewagen@linkit.nl>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-11 22:18:08 -04:00
+								        >>> kmidx = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])
-												[SPARK-34890][PYTHON] Port/integrate Koalas main codes into PySpark

### What changes were proposed in this pull request?

As a first step of [SPARK-34849](https://issues.apache.org/jira/browse/SPARK-34849), this PR proposes porting the Koalas main code into PySpark.

This PR contains minimal changes to the existing Koalas code as follows:
1. `databricks.koalas` -> `pyspark.pandas`
2. `from databricks import koalas as ks` -> `from pyspark import pandas as pp`
3. `ks.xxx -> pp.xxx`

Other than them:
1. Added a line to `python/mypy.ini` in order to ignore the mypy test. See related issue at [SPARK-34941](https://issues.apache.org/jira/browse/SPARK-34941).
2. Added a comment to several lines in several files to ignore the flake8 F401. See related issue at [SPARK-34943](https://issues.apache.org/jira/browse/SPARK-34943).

When this PR is merged, all the features that were previously used in [Koalas](https://github.com/databricks/koalas) will be available in PySpark as well.

Users can access to the pandas API in PySpark as below:

```python
>>> from pyspark import pandas as pp
>>> ppdf = pp.DataFrame({"A": [1, 2, 3], "B": [15, 20, 25]})
>>> ppdf
   A   B
0  1  15
1  2  20
2  3  25
```

The existing "options and settings" in Koalas are also available in the same way:

```python
>>> from pyspark.pandas.config import set_option, reset_option, get_option
>>> ppser1 = pp.Series([1, 2, 3])
>>> ppser2 = pp.Series([3, 4, 5])
>>> ppser1 + ppser2
Traceback (most recent call last):
...
ValueError: Cannot combine the series or dataframe because it comes from a different dataframe. In order to allow this operation, enable 'compute.ops_on_diff_frames' option.

>>> set_option("compute.ops_on_diff_frames", True)
>>> ppser1 + ppser2
0    4
1    6
2    8
dtype: int64
```

Please also refer to the [API Reference](https://koalas.readthedocs.io/en/latest/reference/index.html) and [Options and Settings](https://koalas.readthedocs.io/en/latest/user_guide/options.html) for more detail.

**NOTE** that this PR intentionally ports the main codes of Koalas first almost as are with minimal changes because:
- Koalas project is fairly large. Making some changes together for PySpark will make it difficult to review the individual change.
    Koalas dev includes multiple Spark committers who will review. By doing this, the committers will be able to more easily and effectively review and drive the development.
- Koalas tests and documentation require major changes to make it look great together with PySpark whereas main codes do not require.
- We lately froze the Koalas codebase, and plan to work together on the initial porting. By porting the main codes first as are, it unblocks the Koalas dev to work on other items in parallel.

I promise and will make sure on:
- Rename Koalas to PySpark pandas APIs and/or pandas-on-Spark accordingly in documentation, and the docstrings and comments in the main codes.
- Triage APIs to remove that don’t make sense when Koalas is in PySpark

The documentation changes will be tracked in [SPARK-34885](https://issues.apache.org/jira/browse/SPARK-34885), the test code changes will be tracked in [SPARK-34886](https://issues.apache.org/jira/browse/SPARK-34886).

### Why are the changes needed?

Please refer to:
- [[DISCUSS] Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/DISCUSS-Support-pandas-API-layer-on-PySpark-td30945.html)
- [[VOTE] SPIP: Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/VOTE-SPIP-Support-pandas-API-layer-on-PySpark-td30996.html)

### Does this PR introduce _any_ user-facing change?

Yes, now users can use the pandas APIs on Spark

### How was this patch tested?

Manually tested for exposed major APIs and options as described above.

### Koalas contributors

Koalas would not have been possible without the following contributors:

ueshin
HyukjinKwon
rxin
xinrong-databricks
RainFung
charlesdong1991
harupy
floscha
beobest2
thunterdb
garawalid
LucasG0
shril
deepyaman
gioa
fwani
90jam
thoo
AbdealiJK
abishekganesh72
gliptak
DumbMachine
dvgodoy
stbof
nitlev
hjoo
gatorsmile
tomspur
icexelloss
awdavidson
guyao
akhilputhiry
scook12
patryk-oleniuk
tracek
dennyglee
athena15
gstaubli
WeichenXu123
hsubbaraj
lfdversluis
ktksq
shengjh
margaret-databricks
LSturtew
sllynn
manuzhang
jijosg
sadikovi

Closes #32036 from itholic/SPARK-34890.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-05 23:42:39 -04:00
+								        >>> kmidx.insert(3, ("h", "j"))  # doctest: +SKIP
 								        MultiIndex([('a', 'x'),
 								                    ('b', 'y'),
 								                    ('c', 'z'),
 								                    ('h', 'j')],
 								                   )
 								        For negative values
 								        >>> kmidx.insert(-2, ("h", "j"))  # doctest: +SKIP
 								        MultiIndex([('a', 'x'),
 								                    ('h', 'j'),
 								                    ('b', 'y'),
 								                    ('c', 'z')],
 								                   )
 								        """
 								        length = len(self)
 								        if loc < 0:
 								            loc = loc + length
 								            if loc < 0:
 								                raise IndexError(
 								                    "index {} is out of bounds for axis 0 with size {}".format(
 								                        (loc - length), length
 								                    )
 								                )
 								        else:
 								            if loc > length:
 								                raise IndexError(
 								                    "index {} is out of bounds for axis 0 with size {}".format(loc, length)
 								                )
 								        index_name = self._internal.index_spark_column_names
 								        sdf_before = self.to_frame(name=index_name)[:loc].to_spark()
 								        sdf_middle = Index([item]).to_frame(name=index_name).to_spark()
 								        sdf_after = self.to_frame(name=index_name)[loc:].to_spark()
 								        sdf = sdf_before.union(sdf_middle).union(sdf_after)
 								        internal = InternalFrame(  # TODO: dtypes?
 								            spark_frame=sdf,
 								            index_spark_columns=[
 								                scol_for(sdf, col) for col in self._internal.index_spark_column_names
 								            ],
 								            index_names=self._internal.index_names,
 								        )
 								        return DataFrame(internal).index
 								    def item(self) -> Tuple[Scalar, ...]:
 								        """
 								        Return the first element of the underlying data as a python tuple.
 								        Returns
 								        -------
 								        tuple
 								            The first element of MultiIndex.
 								        Raises
 								        ------
 								        ValueError
 								            If the data is not length-1.
 								        Examples
 								        --------
-												[SPARK-34983][PYTHON] Renaming the package alias from pp to ps

### What changes were proposed in this pull request?

This PR proposes to fix:

```python
import pyspark.pandas as pp
```

to

```python
import pyspark.pandas as ps
```

### Why are the changes needed?

`pp` might sound offensive in some contexts.

### Does this PR introduce _any_ user-facing change?

The change is in master only. We'll use `ps` as the short name instead of `pp`.

### How was this patch tested?

The CI in this PR will test it out.

Closes #32108 from LSturtew/renaming_pyspark.pandas.

Authored-by: Luka Sturtewagen <luka.sturtewagen@linkit.nl>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-11 22:18:08 -04:00
+								        >>> kmidx = ps.MultiIndex.from_tuples([('a', 'x')])
-												[SPARK-34890][PYTHON] Port/integrate Koalas main codes into PySpark

### What changes were proposed in this pull request?

As a first step of [SPARK-34849](https://issues.apache.org/jira/browse/SPARK-34849), this PR proposes porting the Koalas main code into PySpark.

This PR contains minimal changes to the existing Koalas code as follows:
1. `databricks.koalas` -> `pyspark.pandas`
2. `from databricks import koalas as ks` -> `from pyspark import pandas as pp`
3. `ks.xxx -> pp.xxx`

Other than them:
1. Added a line to `python/mypy.ini` in order to ignore the mypy test. See related issue at [SPARK-34941](https://issues.apache.org/jira/browse/SPARK-34941).
2. Added a comment to several lines in several files to ignore the flake8 F401. See related issue at [SPARK-34943](https://issues.apache.org/jira/browse/SPARK-34943).

When this PR is merged, all the features that were previously used in [Koalas](https://github.com/databricks/koalas) will be available in PySpark as well.

Users can access to the pandas API in PySpark as below:

```python
>>> from pyspark import pandas as pp
>>> ppdf = pp.DataFrame({"A": [1, 2, 3], "B": [15, 20, 25]})
>>> ppdf
   A   B
0  1  15
1  2  20
2  3  25
```

The existing "options and settings" in Koalas are also available in the same way:

```python
>>> from pyspark.pandas.config import set_option, reset_option, get_option
>>> ppser1 = pp.Series([1, 2, 3])
>>> ppser2 = pp.Series([3, 4, 5])
>>> ppser1 + ppser2
Traceback (most recent call last):
...
ValueError: Cannot combine the series or dataframe because it comes from a different dataframe. In order to allow this operation, enable 'compute.ops_on_diff_frames' option.

>>> set_option("compute.ops_on_diff_frames", True)
>>> ppser1 + ppser2
0    4
1    6
2    8
dtype: int64
```

Please also refer to the [API Reference](https://koalas.readthedocs.io/en/latest/reference/index.html) and [Options and Settings](https://koalas.readthedocs.io/en/latest/user_guide/options.html) for more detail.

**NOTE** that this PR intentionally ports the main codes of Koalas first almost as are with minimal changes because:
- Koalas project is fairly large. Making some changes together for PySpark will make it difficult to review the individual change.
    Koalas dev includes multiple Spark committers who will review. By doing this, the committers will be able to more easily and effectively review and drive the development.
- Koalas tests and documentation require major changes to make it look great together with PySpark whereas main codes do not require.
- We lately froze the Koalas codebase, and plan to work together on the initial porting. By porting the main codes first as are, it unblocks the Koalas dev to work on other items in parallel.

I promise and will make sure on:
- Rename Koalas to PySpark pandas APIs and/or pandas-on-Spark accordingly in documentation, and the docstrings and comments in the main codes.
- Triage APIs to remove that don’t make sense when Koalas is in PySpark

The documentation changes will be tracked in [SPARK-34885](https://issues.apache.org/jira/browse/SPARK-34885), the test code changes will be tracked in [SPARK-34886](https://issues.apache.org/jira/browse/SPARK-34886).

### Why are the changes needed?

Please refer to:
- [[DISCUSS] Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/DISCUSS-Support-pandas-API-layer-on-PySpark-td30945.html)
- [[VOTE] SPIP: Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/VOTE-SPIP-Support-pandas-API-layer-on-PySpark-td30996.html)

### Does this PR introduce _any_ user-facing change?

Yes, now users can use the pandas APIs on Spark

### How was this patch tested?

Manually tested for exposed major APIs and options as described above.

### Koalas contributors

Koalas would not have been possible without the following contributors:

ueshin
HyukjinKwon
rxin
xinrong-databricks
RainFung
charlesdong1991
harupy
floscha
beobest2
thunterdb
garawalid
LucasG0
shril
deepyaman
gioa
fwani
90jam
thoo
AbdealiJK
abishekganesh72
gliptak
DumbMachine
dvgodoy
stbof
nitlev
hjoo
gatorsmile
tomspur
icexelloss
awdavidson
guyao
akhilputhiry
scook12
patryk-oleniuk
tracek
dennyglee
athena15
gstaubli
WeichenXu123
hsubbaraj
lfdversluis
ktksq
shengjh
margaret-databricks
LSturtew
sllynn
manuzhang
jijosg
sadikovi

Closes #32036 from itholic/SPARK-34890.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-05 23:42:39 -04:00
+								        >>> kmidx.item()
 								        ('a', 'x')
 								        """
 								        return self._kdf.head(2)._to_internal_pandas().index.item()
 								    def intersection(self, other) -> "MultiIndex":
 								        """
 								        Form the intersection of two Index objects.
 								        This returns a new Index with elements common to the index and `other`.
 								        Parameters
 								        ----------
 								        other : Index or array-like
 								        Returns
 								        -------
 								        intersection : MultiIndex
 								        Examples
 								        --------
-												[SPARK-34983][PYTHON] Renaming the package alias from pp to ps

### What changes were proposed in this pull request?

This PR proposes to fix:

```python
import pyspark.pandas as pp
```

to

```python
import pyspark.pandas as ps
```

### Why are the changes needed?

`pp` might sound offensive in some contexts.

### Does this PR introduce _any_ user-facing change?

The change is in master only. We'll use `ps` as the short name instead of `pp`.

### How was this patch tested?

The CI in this PR will test it out.

Closes #32108 from LSturtew/renaming_pyspark.pandas.

Authored-by: Luka Sturtewagen <luka.sturtewagen@linkit.nl>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-11 22:18:08 -04:00
+								        >>> midx1 = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])
 								        >>> midx2 = ps.MultiIndex.from_tuples([("c", "z"), ("d", "w")])
-												[SPARK-34890][PYTHON] Port/integrate Koalas main codes into PySpark

### What changes were proposed in this pull request?

As a first step of [SPARK-34849](https://issues.apache.org/jira/browse/SPARK-34849), this PR proposes porting the Koalas main code into PySpark.

This PR contains minimal changes to the existing Koalas code as follows:
1. `databricks.koalas` -> `pyspark.pandas`
2. `from databricks import koalas as ks` -> `from pyspark import pandas as pp`
3. `ks.xxx -> pp.xxx`

Other than them:
1. Added a line to `python/mypy.ini` in order to ignore the mypy test. See related issue at [SPARK-34941](https://issues.apache.org/jira/browse/SPARK-34941).
2. Added a comment to several lines in several files to ignore the flake8 F401. See related issue at [SPARK-34943](https://issues.apache.org/jira/browse/SPARK-34943).

When this PR is merged, all the features that were previously used in [Koalas](https://github.com/databricks/koalas) will be available in PySpark as well.

Users can access to the pandas API in PySpark as below:

```python
>>> from pyspark import pandas as pp
>>> ppdf = pp.DataFrame({"A": [1, 2, 3], "B": [15, 20, 25]})
>>> ppdf
   A   B
0  1  15
1  2  20
2  3  25
```

The existing "options and settings" in Koalas are also available in the same way:

```python
>>> from pyspark.pandas.config import set_option, reset_option, get_option
>>> ppser1 = pp.Series([1, 2, 3])
>>> ppser2 = pp.Series([3, 4, 5])
>>> ppser1 + ppser2
Traceback (most recent call last):
...
ValueError: Cannot combine the series or dataframe because it comes from a different dataframe. In order to allow this operation, enable 'compute.ops_on_diff_frames' option.

>>> set_option("compute.ops_on_diff_frames", True)
>>> ppser1 + ppser2
0    4
1    6
2    8
dtype: int64
```

Please also refer to the [API Reference](https://koalas.readthedocs.io/en/latest/reference/index.html) and [Options and Settings](https://koalas.readthedocs.io/en/latest/user_guide/options.html) for more detail.

**NOTE** that this PR intentionally ports the main codes of Koalas first almost as are with minimal changes because:
- Koalas project is fairly large. Making some changes together for PySpark will make it difficult to review the individual change.
    Koalas dev includes multiple Spark committers who will review. By doing this, the committers will be able to more easily and effectively review and drive the development.
- Koalas tests and documentation require major changes to make it look great together with PySpark whereas main codes do not require.
- We lately froze the Koalas codebase, and plan to work together on the initial porting. By porting the main codes first as are, it unblocks the Koalas dev to work on other items in parallel.

I promise and will make sure on:
- Rename Koalas to PySpark pandas APIs and/or pandas-on-Spark accordingly in documentation, and the docstrings and comments in the main codes.
- Triage APIs to remove that don’t make sense when Koalas is in PySpark

The documentation changes will be tracked in [SPARK-34885](https://issues.apache.org/jira/browse/SPARK-34885), the test code changes will be tracked in [SPARK-34886](https://issues.apache.org/jira/browse/SPARK-34886).

### Why are the changes needed?

Please refer to:
- [[DISCUSS] Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/DISCUSS-Support-pandas-API-layer-on-PySpark-td30945.html)
- [[VOTE] SPIP: Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/VOTE-SPIP-Support-pandas-API-layer-on-PySpark-td30996.html)

### Does this PR introduce _any_ user-facing change?

Yes, now users can use the pandas APIs on Spark

### How was this patch tested?

Manually tested for exposed major APIs and options as described above.

### Koalas contributors

Koalas would not have been possible without the following contributors:

ueshin
HyukjinKwon
rxin
xinrong-databricks
RainFung
charlesdong1991
harupy
floscha
beobest2
thunterdb
garawalid
LucasG0
shril
deepyaman
gioa
fwani
90jam
thoo
AbdealiJK
abishekganesh72
gliptak
DumbMachine
dvgodoy
stbof
nitlev
hjoo
gatorsmile
tomspur
icexelloss
awdavidson
guyao
akhilputhiry
scook12
patryk-oleniuk
tracek
dennyglee
athena15
gstaubli
WeichenXu123
hsubbaraj
lfdversluis
ktksq
shengjh
margaret-databricks
LSturtew
sllynn
manuzhang
jijosg
sadikovi

Closes #32036 from itholic/SPARK-34890.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-05 23:42:39 -04:00
+								        >>> midx1.intersection(midx2).sort_values()  # doctest: +SKIP
 								        MultiIndex([('c', 'z')],
 								                   )
 								        """
 								        if isinstance(other, Series) or not is_list_like(other):
 								            raise TypeError("other must be a MultiIndex or a list of tuples")
 								        elif isinstance(other, DataFrame):
 								            raise ValueError("Index data must be 1-dimensional")
 								        elif isinstance(other, MultiIndex):
 								            spark_frame_other = other.to_frame().to_spark()
 								            keep_name = self.names == other.names
 								        elif isinstance(other, Index):
 								            # Always returns an empty MultiIndex if `other` is Index.
 								            return self.to_frame().head(0).index  # type: ignore
 								        elif not all(isinstance(item, tuple) for item in other):
 								            raise TypeError("other must be a MultiIndex or a list of tuples")
 								        else:
 								            other = MultiIndex.from_tuples(list(other))
 								            spark_frame_other = other.to_frame().to_spark()
 								            keep_name = True
 								        default_name = [SPARK_INDEX_NAME_FORMAT(i) for i in range(self.nlevels)]
 								        spark_frame_self = self.to_frame(name=default_name).to_spark()
 								        spark_frame_intersected = spark_frame_self.intersect(spark_frame_other)
 								        if keep_name:
 								            index_names = self._internal.index_names
 								        else:
 								            index_names = None
 								        internal = InternalFrame(  # TODO: dtypes?
 								            spark_frame=spark_frame_intersected,
 								            index_spark_columns=[scol_for(spark_frame_intersected, col) for col in default_name],
 								            index_names=index_names,
 								        )
 								        return cast(MultiIndex, DataFrame(internal).index)
 								    @property
 								    def hasnans(self):
 								        raise NotImplementedError("hasnans is not defined for MultiIndex")
 								    @property
 								    def inferred_type(self) -> str:
 								        """
 								        Return a string of the type inferred from the values.
 								        """
 								        # Always returns "mixed" for MultiIndex
 								        return "mixed"
 								    @property
 								    def asi8(self) -> None:
 								        """
 								        Integer representation of the values.
 								        """
 								        # Always returns None for MultiIndex
 								        return None
 								    def factorize(
 								        self, sort: bool = True, na_sentinel: Optional[int] = -1
 								    ) -> Tuple[Union["Series", "Index"], pd.Index]:
 								        return MissingPandasLikeMultiIndex.factorize(self, sort=sort, na_sentinel=na_sentinel)
 								    def __iter__(self):
 								        return MissingPandasLikeMultiIndex.__iter__(self)
-												[SPARK-34972][PYTHON] Make pandas-on-Spark doctests work

### What changes were proposed in this pull request?

Now that we merged the Koalas main code into PySpark code base (#32036), we should enable doctests on the Spark's infrastructure.

### Why are the changes needed?

Currently the pandas-on-Spark modules are not tested at all.
We should enable doctests first, and we will port other unit tests separately later.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Enabled the whole doctests.

Closes #32069 from ueshin/issues/SPARK-34972/pyspark-pandas_doctests.

Authored-by: Takuya UESHIN <ueshin@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-07 07:50:41 -04:00
 								def _test():
 								    import os
 								    import doctest
 								    import sys
 								    import numpy
 								    from pyspark.sql import SparkSession
 								    import pyspark.pandas.indexes.multi
 								    os.chdir(os.environ["SPARK_HOME"])
 								    globs = pyspark.pandas.indexes.multi.__dict__.copy()
 								    globs["np"] = numpy
-												[SPARK-34983][PYTHON] Renaming the package alias from pp to ps

### What changes were proposed in this pull request?

This PR proposes to fix:

```python
import pyspark.pandas as pp
```

to

```python
import pyspark.pandas as ps
```

### Why are the changes needed?

`pp` might sound offensive in some contexts.

### Does this PR introduce _any_ user-facing change?

The change is in master only. We'll use `ps` as the short name instead of `pp`.

### How was this patch tested?

The CI in this PR will test it out.

Closes #32108 from LSturtew/renaming_pyspark.pandas.

Authored-by: Luka Sturtewagen <luka.sturtewagen@linkit.nl>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-11 22:18:08 -04:00
+								    globs["ps"] = pyspark.pandas
-												[SPARK-34972][PYTHON] Make pandas-on-Spark doctests work

### What changes were proposed in this pull request?

Now that we merged the Koalas main code into PySpark code base (#32036), we should enable doctests on the Spark's infrastructure.

### Why are the changes needed?

Currently the pandas-on-Spark modules are not tested at all.
We should enable doctests first, and we will port other unit tests separately later.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Enabled the whole doctests.

Closes #32069 from ueshin/issues/SPARK-34972/pyspark-pandas_doctests.

Authored-by: Takuya UESHIN <ueshin@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-07 07:50:41 -04:00
+								    spark = (
 								        SparkSession.builder.master("local[4]")
 								        .appName("pyspark.pandas.indexes.multi tests")
 								        .getOrCreate()
 								    )
 								    (failure_count, test_count) = doctest.testmod(
 								        pyspark.pandas.indexes.multi,
 								        globs=globs,
 								        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE,
 								    )
 								    spark.stop()
 								    if failure_count:
 								        sys.exit(-1)
 								if __name__ == "__main__":
 								    _test()