spark-instrumented-optimizer/python/pyspark/pandas/plot/core.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import importlib

import pandas as pd
import numpy as np
from pyspark.ml.feature import Bucketizer
from pyspark.mllib.stat import KernelDensity
from pyspark.sql import functions as F
from pandas.core.base import PandasObject
from pandas.core.dtypes.inference import is_integer

from pyspark.pandas.missing import unsupported_function
from pyspark.pandas.config import get_option
from pyspark.pandas.utils import name_like_string


class TopNPlotBase:
    def get_top_n(self, data):
        from pyspark.pandas import DataFrame, Series

        max_rows = get_option("plotting.max_rows")
        # Simply use the first 1k elements and make it into a pandas dataframe
        # For categorical variables, it is likely called from df.x.value_counts().plot.xxx().
        if isinstance(data, (Series, DataFrame)):
            data = data.head(max_rows + 1).to_pandas()
        else:
            raise ValueError("Only DataFrame and Series are supported for plotting.")

        self.partial = False
        if len(data) > max_rows:
            self.partial = True
            data = data.iloc[:max_rows]
        return data

    def set_result_text(self, ax):
        max_rows = get_option("plotting.max_rows")
        assert hasattr(self, "partial")

        if self.partial:
            ax.text(
                1,
                1,
                "showing top {} elements only".format(max_rows),
                size=6,
                ha="right",
                va="bottom",
                transform=ax.transAxes,
            )


class SampledPlotBase:
    def get_sampled(self, data):
        from pyspark.pandas import DataFrame, Series

        fraction = get_option("plotting.sample_ratio")
        if fraction is None:
            fraction = 1 / (len(data) / get_option("plotting.max_rows"))
            fraction = min(1.0, fraction)
        self.fraction = fraction

        if isinstance(data, (DataFrame, Series)):
            if isinstance(data, Series):
                data = data.to_frame()
            sampled = data._internal.resolved_copy.spark_frame.sample(fraction=self.fraction)
            return DataFrame(data._internal.with_new_sdf(sampled)).to_pandas()
        else:
            raise ValueError("Only DataFrame and Series are supported for plotting.")

    def set_result_text(self, ax):
        assert hasattr(self, "fraction")

        if self.fraction < 1:
            ax.text(
                1,
                1,
                "showing the sampled result by fraction %s" % self.fraction,
                size=6,
                ha="right",
                va="bottom",
                transform=ax.transAxes,
            )


class HistogramPlotBase:
    @staticmethod
    def prepare_hist_data(data, bins):
        # TODO: this logic is similar with KdePlotBase. Might have to deduplicate it.
        from pyspark.pandas.series import Series

        if isinstance(data, Series):
            data = data.to_frame()

        numeric_data = data.select_dtypes(
            include=["byte", "decimal", "integer", "float", "long", "double", np.datetime64]
        )

        # no empty frames or series allowed
        if len(numeric_data.columns) == 0:
            raise TypeError(
                "Empty {0!r}: no numeric data to " "plot".format(numeric_data.__class__.__name__)
            )

        if is_integer(bins):
            # computes boundaries for the column
            bins = HistogramPlotBase.get_bins(data.to_spark(), bins)

        return numeric_data, bins

    @staticmethod
    def get_bins(sdf, bins):
        # 'data' is a Spark DataFrame that selects all columns.
        if len(sdf.columns) > 1:
            min_col = F.least(*map(F.min, sdf))
            max_col = F.greatest(*map(F.max, sdf))
        else:
            min_col = F.min(sdf.columns[-1])
            max_col = F.max(sdf.columns[-1])
        boundaries = sdf.select(min_col, max_col).first()

        # divides the boundaries into bins
        if boundaries[0] == boundaries[1]:
            boundaries = (boundaries[0] - 0.5, boundaries[1] + 0.5)

        return np.linspace(boundaries[0], boundaries[1], bins + 1)

    @staticmethod
    def compute_hist(kdf, bins):
        # 'data' is a Spark DataFrame that selects one column.
        assert isinstance(bins, (np.ndarray, np.generic))

        sdf = kdf._internal.spark_frame
        scols = []
        input_column_names = []
        for label in kdf._internal.column_labels:
            input_column_name = name_like_string(label)
            input_column_names.append(input_column_name)
            scols.append(kdf._internal.spark_column_for(label).alias(input_column_name))
        sdf = sdf.select(*scols)

        # 1. Make the bucket output flat to:
        #     +----------+-------+
        #     |__group_id|buckets|
        #     +----------+-------+
        #     |0         |0.0    |
        #     |0         |0.0    |
        #     |0         |1.0    |
        #     |0         |2.0    |
        #     |0         |3.0    |
        #     |0         |3.0    |
        #     |1         |0.0    |
        #     |1         |1.0    |
        #     |1         |1.0    |
        #     |1         |2.0    |
        #     |1         |1.0    |
        #     |1         |0.0    |
        #     +----------+-------+
        colnames = sdf.columns
        bucket_names = ["__{}_bucket".format(colname) for colname in colnames]

        output_df = None
        for group_id, (colname, bucket_name) in enumerate(zip(colnames, bucket_names)):
            # creates a Bucketizer to get corresponding bin of each value
            bucketizer = Bucketizer(
                splits=bins, inputCol=colname, outputCol=bucket_name, handleInvalid="skip"
            )

            bucket_df = bucketizer.transform(sdf)

            if output_df is None:
                output_df = bucket_df.select(
                    F.lit(group_id).alias("__group_id"), F.col(bucket_name).alias("__bucket")
                )
            else:
                output_df = output_df.union(
                    bucket_df.select(
                        F.lit(group_id).alias("__group_id"), F.col(bucket_name).alias("__bucket")
                    )
                )

        # 2. Calculate the count based on each group and bucket.
        #     +----------+-------+------+
        #     |__group_id|buckets| count|
        #     +----------+-------+------+
        #     |0         |0.0    |2     |
        #     |0         |1.0    |1     |
        #     |0         |2.0    |1     |
        #     |0         |3.0    |2     |
        #     |1         |0.0    |2     |
        #     |1         |1.0    |3     |
        #     |1         |2.0    |1     |
        #     +----------+-------+------+
        result = (
            output_df.groupby("__group_id", "__bucket")
            .agg(F.count("*").alias("count"))
            .toPandas()
            .sort_values(by=["__group_id", "__bucket"])
        )

        # 3. Fill empty bins and calculate based on each group id. From:
        #     +----------+--------+------+
        #     |__group_id|__bucket| count|
        #     +----------+--------+------+
        #     |0         |0.0     |2     |
        #     |0         |1.0     |1     |
        #     |0         |2.0     |1     |
        #     |0         |3.0     |2     |
        #     +----------+--------+------+
        #     +----------+--------+------+
        #     |__group_id|__bucket| count|
        #     +----------+--------+------+
        #     |1         |0.0     |2     |
        #     |1         |1.0     |3     |
        #     |1         |2.0     |1     |
        #     +----------+--------+------+
        #
        # to:
        #     +-----------------+
        #     |__values1__bucket|
        #     +-----------------+
        #     |2                |
        #     |1                |
        #     |1                |
        #     |2                |
        #     |0                |
        #     +-----------------+
        #     +-----------------+
        #     |__values2__bucket|
        #     +-----------------+
        #     |2                |
        #     |3                |
        #     |1                |
        #     |0                |
        #     |0                |
        #     +-----------------+
        output_series = []
        for i, (input_column_name, bucket_name) in enumerate(zip(input_column_names, bucket_names)):
            current_bucket_result = result[result["__group_id"] == i]
            # generates a pandas DF with one row for each bin
            # we need this as some of the bins may be empty
            indexes = pd.DataFrame({"__bucket": np.arange(0, len(bins) - 1)})
            # merges the bins with counts on it and fills remaining ones with zeros
            pdf = indexes.merge(current_bucket_result, how="left", on=["__bucket"]).fillna(0)[
                ["count"]
            ]
            pdf.columns = [input_column_name]
            output_series.append(pdf[input_column_name])

        return output_series


class BoxPlotBase:
    @staticmethod
    def compute_stats(data, colname, whis, precision):
        # Computes mean, median, Q1 and Q3 with approx_percentile and precision
        pdf = data._kdf._internal.resolved_copy.spark_frame.agg(
            *[
                F.expr(
                    "approx_percentile(`{}`, {}, {})".format(colname, q, int(1.0 / precision))
                ).alias("{}_{}%".format(colname, int(q * 100)))
                for q in [0.25, 0.50, 0.75]
            ],
            F.mean("`%s`" % colname).alias("{}_mean".format(colname)),
        ).toPandas()

        # Computes IQR and Tukey's fences
        iqr = "{}_iqr".format(colname)
        p75 = "{}_75%".format(colname)
        p25 = "{}_25%".format(colname)
        pdf.loc[:, iqr] = pdf.loc[:, p75] - pdf.loc[:, p25]
        pdf.loc[:, "{}_lfence".format(colname)] = pdf.loc[:, p25] - whis * pdf.loc[:, iqr]
        pdf.loc[:, "{}_ufence".format(colname)] = pdf.loc[:, p75] + whis * pdf.loc[:, iqr]

        qnames = ["25%", "50%", "75%", "mean", "lfence", "ufence"]
        col_summ = pdf[["{}_{}".format(colname, q) for q in qnames]]
        col_summ.columns = qnames
        lfence, ufence = col_summ["lfence"], col_summ["ufence"]

        stats = {
            "mean": col_summ["mean"].values[0],
            "med": col_summ["50%"].values[0],
            "q1": col_summ["25%"].values[0],
            "q3": col_summ["75%"].values[0],
        }

        return stats, (lfence.values[0], ufence.values[0])

    @staticmethod
    def outliers(data, colname, lfence, ufence):
        # Builds expression to identify outliers
        expression = F.col("`%s`" % colname).between(lfence, ufence)
        # Creates a column to flag rows as outliers or not
        return data._kdf._internal.resolved_copy.spark_frame.withColumn(
            "__{}_outlier".format(colname), ~expression
        )

    @staticmethod
    def calc_whiskers(colname, outliers):
        # Computes min and max values of non-outliers - the whiskers
        minmax = (
            outliers.filter("not `__{}_outlier`".format(colname))
            .agg(F.min("`%s`" % colname).alias("min"), F.max(colname).alias("max"))
            .toPandas()
        )
        return minmax.iloc[0][["min", "max"]].values

    @staticmethod
    def get_fliers(colname, outliers, min_val):
        # Filters only the outliers, should "showfliers" be True
        fliers_df = outliers.filter("`__{}_outlier`".format(colname))

        # If shows fliers, takes the top 1k with highest absolute values
        # Here we normalize the values by subtracting the minimum value from
        # each, and use absolute values.
        order_col = F.abs(F.col("`{}`".format(colname)) - min_val.item())
        fliers = (
            fliers_df.select(F.col("`{}`".format(colname)))
            .orderBy(order_col)
            .limit(1001)
            .toPandas()[colname]
            .values
        )

        return fliers


class KdePlotBase:
    @staticmethod
    def prepare_kde_data(data):
        # TODO: this logic is similar with HistogramPlotBase. Might have to deduplicate it.
        from pyspark.pandas.series import Series

        if isinstance(data, Series):
            data = data.to_frame()

        numeric_data = data.select_dtypes(
            include=["byte", "decimal", "integer", "float", "long", "double", np.datetime64]
        )

        # no empty frames or series allowed
        if len(numeric_data.columns) == 0:
            raise TypeError(
                "Empty {0!r}: no numeric data to " "plot".format(numeric_data.__class__.__name__)
            )

        return numeric_data

    @staticmethod
    def get_ind(sdf, ind):
        def calc_min_max():
            if len(sdf.columns) > 1:
                min_col = F.least(*map(F.min, sdf))
                max_col = F.greatest(*map(F.max, sdf))
            else:
                min_col = F.min(sdf.columns[-1])
                max_col = F.max(sdf.columns[-1])
            return sdf.select(min_col, max_col).first()

        if ind is None:
            min_val, max_val = calc_min_max()
            sample_range = max_val - min_val
            ind = np.linspace(min_val - 0.5 * sample_range, max_val + 0.5 * sample_range, 1000,)
        elif is_integer(ind):
            min_val, max_val = calc_min_max()
            sample_range = max_val - min_val
            ind = np.linspace(min_val - 0.5 * sample_range, max_val + 0.5 * sample_range, ind,)
        return ind

    @staticmethod
    def compute_kde(sdf, bw_method=None, ind=None):
        # 'sdf' is a Spark DataFrame that selects one column.

        # Using RDD is slow so we might have to change it to Dataset based implementation
        # once Spark has that implementation.
        sample = sdf.rdd.map(lambda x: float(x[0]))
        kd = KernelDensity()
        kd.setSample(sample)

        assert isinstance(bw_method, (int, float)), "'bw_method' must be set as a scalar number."

        if bw_method is not None:
            # Match the bandwidth with Spark.
            kd.setBandwidth(float(bw_method))
        return kd.estimate(list(map(float, ind)))


class KoalasPlotAccessor(PandasObject):
    """
    Series/Frames plotting accessor and method.

    Uses the backend specified by the
    option ``plotting.backend``. By default, plotly is used.

    Plotting methods can also be accessed by calling the accessor as a method
    with the ``kind`` argument:
    ``s.plot(kind='hist')`` is equivalent to ``s.plot.hist()``
    """

    pandas_plot_data_map = {
        "pie": TopNPlotBase().get_top_n,
        "bar": TopNPlotBase().get_top_n,
        "barh": TopNPlotBase().get_top_n,
        "scatter": TopNPlotBase().get_top_n,
        "area": SampledPlotBase().get_sampled,
        "line": SampledPlotBase().get_sampled,
    }
    _backends = {}  # type: ignore

    def __init__(self, data):
        self.data = data

    @staticmethod
    def _find_backend(backend):
        """
        Find a Koalas plotting backend
        """
        try:
            return KoalasPlotAccessor._backends[backend]
        except KeyError:
            try:
                module = importlib.import_module(backend)
            except ImportError:
                # We re-raise later on.
                pass
            else:
                if hasattr(module, "plot") or hasattr(module, "plot_koalas"):
                    # Validate that the interface is implemented when the option
                    # is set, rather than at plot time.
                    KoalasPlotAccessor._backends[backend] = module
                    return module

        raise ValueError(
            "Could not find plotting backend '{backend}'. Ensure that you've installed "
            "the package providing the '{backend}' entrypoint, or that the package has a "
            "top-level `.plot` method.".format(backend=backend)
        )

    @staticmethod
    def _get_plot_backend(backend=None):
        backend = backend or get_option("plotting.backend")
        # Shortcut
        if backend in KoalasPlotAccessor._backends:
            return KoalasPlotAccessor._backends[backend]

        if backend == "matplotlib":
            # Because matplotlib is an optional dependency,
            # we need to attempt an import here to raise an ImportError if needed.
            try:
                # test if matplotlib can be imported
                import matplotlib  # noqa: F401
                from pyspark.pandas.plot import matplotlib as module
            except ImportError:
                raise ImportError(
                    "matplotlib is required for plotting when the "
                    "default backend 'matplotlib' is selected."
                ) from None

            KoalasPlotAccessor._backends["matplotlib"] = module
        elif backend == "plotly":
            try:
                # test if plotly can be imported
                import plotly  # noqa: F401
                from pyspark.pandas.plot import plotly as module
            except ImportError:
                raise ImportError(
                    "plotly is required for plotting when the "
                    "default backend 'plotly' is selected."
                ) from None

            KoalasPlotAccessor._backends["plotly"] = module
        else:
            module = KoalasPlotAccessor._find_backend(backend)
            KoalasPlotAccessor._backends[backend] = module
        return module

    def __call__(self, kind="line", backend=None, **kwargs):
        plot_backend = KoalasPlotAccessor._get_plot_backend(backend)
        plot_data = self.data

        kind = {"density": "kde"}.get(kind, kind)
        if hasattr(plot_backend, "plot_koalas"):
            # use if there's koalas specific method.
            return plot_backend.plot_koalas(plot_data, kind=kind, **kwargs)
        else:
            # fallback to use pandas'
            if not KoalasPlotAccessor.pandas_plot_data_map[kind]:
                raise NotImplementedError(
                    "'%s' plot is not supported with '%s' plot "
                    "backend yet." % (kind, plot_backend.__name__)
                )
            plot_data = KoalasPlotAccessor.pandas_plot_data_map[kind](plot_data)
            return plot_backend.plot(plot_data, kind=kind, **kwargs)

    def line(self, x=None, y=None, **kwargs):
        """
        Plot DataFrame/Series as lines.

        This function is useful to plot lines using Series's values
        as coordinates.

        Parameters
        ----------
        x : int or str, optional
            Columns to use for the horizontal axis.
            Either the location or the label of the columns to be used.
            By default, it will use the DataFrame indices.
        y : int, str, or list of them, optional
            The values to be plotted.
            Either the location or the label of the columns to be used.
            By default, it will use the remaining DataFrame numeric columns.
        **kwds
            Keyword arguments to pass on to :meth:`Series.plot` or :meth:`DataFrame.plot`.

        Returns
        -------
        :class:`plotly.graph_objs.Figure`
            Return an custom object when ``backend!=plotly``.
            Return an ndarray when ``subplots=True`` (matplotlib-only).

        See Also
        --------
        plotly.express.line : Plot y versus x as lines and/or markers (plotly).
        matplotlib.pyplot.plot : Plot y versus x as lines and/or markers (matplotlib).

        Examples
        --------
        Basic plot.

        For Series:

        .. plotly::

            >>> s = ps.Series([1, 3, 2])
            >>> s.plot.line()  # doctest: +SKIP

        For DataFrame:

        .. plotly::

            The following example shows the populations for some animals
            over the years.

            >>> df = ps.DataFrame({'pig': [20, 18, 489, 675, 1776],
            ...                    'horse': [4, 25, 281, 600, 1900]},
            ...                   index=[1990, 1997, 2003, 2009, 2014])
            >>> df.plot.line()  # doctest: +SKIP

        .. plotly::

            The following example shows the relationship between both
            populations.

            >>> df = ps.DataFrame({'pig': [20, 18, 489, 675, 1776],
            ...                    'horse': [4, 25, 281, 600, 1900]},
            ...                   index=[1990, 1997, 2003, 2009, 2014])
            >>> df.plot.line(x='pig', y='horse')  # doctest: +SKIP
        """
        return self(kind="line", x=x, y=y, **kwargs)

    def bar(self, x=None, y=None, **kwds):
        """
        Vertical bar plot.

        Parameters
        ----------
        x : label or position, optional
            Allows plotting of one column versus another.
            If not specified, the index of the DataFrame is used.
        y : label or position, optional
            Allows plotting of one column versus another.
            If not specified, all numerical columns are used.
        **kwds : optional
            Additional keyword arguments are documented in
            :meth:`Koalas.Series.plot` or :meth:`Koalas.DataFrame.plot`.

        Returns
        -------
        :class:`plotly.graph_objs.Figure`
            Return an custom object when ``backend!=plotly``.
            Return an ndarray when ``subplots=True`` (matplotlib-only).

        Examples
        --------
        Basic plot.

        For Series:

        .. plotly::

            >>> s = ps.Series([1, 3, 2])
            >>> s.plot.bar()  # doctest: +SKIP

        For DataFrame:

        .. plotly::

            >>> df = ps.DataFrame({'lab': ['A', 'B', 'C'], 'val': [10, 30, 20]})
            >>> df.plot.bar(x='lab', y='val')  # doctest: +SKIP

        Plot a whole dataframe to a bar plot. Each column is stacked with a
        distinct color along the horizontal axis.

        .. plotly::

            >>> speed = [0.1, 17.5, 40, 48, 52, 69, 88]
            >>> lifespan = [2, 8, 70, 1.5, 25, 12, 28]
            >>> index = ['snail', 'pig', 'elephant',
            ...          'rabbit', 'giraffe', 'coyote', 'horse']
            >>> df = ps.DataFrame({'speed': speed,
            ...                    'lifespan': lifespan}, index=index)
            >>> df.plot.bar()  # doctest: +SKIP

        Instead of stacking, the figure can be split by column with plotly
        APIs.

        .. plotly::

            >>> from plotly.subplots import make_subplots
            >>> speed = [0.1, 17.5, 40, 48, 52, 69, 88]
            >>> lifespan = [2, 8, 70, 1.5, 25, 12, 28]
            >>> index = ['snail', 'pig', 'elephant',
            ...          'rabbit', 'giraffe', 'coyote', 'horse']
            >>> df = ps.DataFrame({'speed': speed,
            ...                    'lifespan': lifespan}, index=index)
            >>> fig = (make_subplots(rows=2, cols=1)
            ...        .add_trace(df.plot.bar(y='speed').data[0], row=1, col=1)
            ...        .add_trace(df.plot.bar(y='speed').data[0], row=1, col=1)
            ...        .add_trace(df.plot.bar(y='lifespan').data[0], row=2, col=1))
            >>> fig  # doctest: +SKIP

        Plot a single column.

        .. plotly::

            >>> speed = [0.1, 17.5, 40, 48, 52, 69, 88]
            >>> lifespan = [2, 8, 70, 1.5, 25, 12, 28]
            >>> index = ['snail', 'pig', 'elephant',
            ...          'rabbit', 'giraffe', 'coyote', 'horse']
            >>> df = ps.DataFrame({'speed': speed,
            ...                    'lifespan': lifespan}, index=index)
            >>> df.plot.bar(y='speed')  # doctest: +SKIP

        Plot only selected categories for the DataFrame.

        .. plotly::

            >>> speed = [0.1, 17.5, 40, 48, 52, 69, 88]
            >>> lifespan = [2, 8, 70, 1.5, 25, 12, 28]
            >>> index = ['snail', 'pig', 'elephant',
            ...          'rabbit', 'giraffe', 'coyote', 'horse']
            >>> df = ps.DataFrame({'speed': speed,
            ...                    'lifespan': lifespan}, index=index)
            >>> df.plot.bar(x='lifespan')  # doctest: +SKIP
        """
        from pyspark.pandas import DataFrame, Series

        if isinstance(self.data, Series):
            return self(kind="bar", **kwds)
        elif isinstance(self.data, DataFrame):
            return self(kind="bar", x=x, y=y, **kwds)

    def barh(self, x=None, y=None, **kwargs):
        """
        Make a horizontal bar plot.

        A horizontal bar plot is a plot that presents quantitative data with
        rectangular bars with lengths proportional to the values that they
        represent. A bar plot shows comparisons among discrete categories. One
        axis of the plot shows the specific categories being compared, and the
        other axis represents a measured value.

        Parameters
        ----------
        x : label or position, default DataFrame.index
            Column to be used for categories.
        y : label or position, default All numeric columns in dataframe
            Columns to be plotted from the DataFrame.
        **kwds
            Keyword arguments to pass on to
            :meth:`pyspark.pandas.DataFrame.plot` or :meth:`pyspark.pandas.Series.plot`.

        Returns
        -------
        :class:`plotly.graph_objs.Figure`
            Return an custom object when ``backend!=plotly``.
            Return an ndarray when ``subplots=True`` (matplotlib-only).

        See Also
        --------
        plotly.express.bar : Plot a vertical bar plot using plotly.
        matplotlib.axes.Axes.bar : Plot a vertical bar plot using matplotlib.

        Examples
        --------
        For Series:

        .. plotly::

            >>> df = ps.DataFrame({'lab': ['A', 'B', 'C'], 'val': [10, 30, 20]})
            >>> df.val.plot.barh()  # doctest: +SKIP

        For DataFrame:

        .. plotly::

            >>> df = ps.DataFrame({'lab': ['A', 'B', 'C'], 'val': [10, 30, 20]})
            >>> df.plot.barh(x='lab', y='val')  # doctest: +SKIP

        Plot a whole DataFrame to a horizontal bar plot

        .. plotly::

            >>> speed = [0.1, 17.5, 40, 48, 52, 69, 88]
            >>> lifespan = [2, 8, 70, 1.5, 25, 12, 28]
            >>> index = ['snail', 'pig', 'elephant',
            ...          'rabbit', 'giraffe', 'coyote', 'horse']
            >>> df = ps.DataFrame({'speed': speed,
            ...                    'lifespan': lifespan}, index=index)
            >>> df.plot.barh()  # doctest: +SKIP

        Plot a column of the DataFrame to a horizontal bar plot

        .. plotly::

            >>> speed = [0.1, 17.5, 40, 48, 52, 69, 88]
            >>> lifespan = [2, 8, 70, 1.5, 25, 12, 28]
            >>> index = ['snail', 'pig', 'elephant',
            ...          'rabbit', 'giraffe', 'coyote', 'horse']
            >>> df = ps.DataFrame({'speed': speed,
            ...                    'lifespan': lifespan}, index=index)
            >>> df.plot.barh(y='speed')  # doctest: +SKIP

        Plot DataFrame versus the desired column

        .. plotly::

            >>> speed = [0.1, 17.5, 40, 48, 52, 69, 88]
            >>> lifespan = [2, 8, 70, 1.5, 25, 12, 28]
            >>> index = ['snail', 'pig', 'elephant',
            ...          'rabbit', 'giraffe', 'coyote', 'horse']
            >>> df = ps.DataFrame({'speed': speed,
            ...                    'lifespan': lifespan}, index=index)
            >>> df.plot.barh(x='lifespan')  # doctest: +SKIP
        """
        from pyspark.pandas import DataFrame, Series

        if isinstance(self.data, Series):
            return self(kind="barh", **kwargs)
        elif isinstance(self.data, DataFrame):
            return self(kind="barh", x=x, y=y, **kwargs)

    def box(self, **kwds):
        """
        Make a box plot of the Series columns.

        Parameters
        ----------
        **kwds : optional
            Additional keyword arguments are documented in
            :meth:`Koalas.Series.plot`.

        precision: scalar, default = 0.01
            This argument is used by Koalas to compute approximate statistics
            for building a boxplot. Use *smaller* values to get more precise
            statistics (matplotlib-only).

        Returns
        -------
        :class:`plotly.graph_objs.Figure`
            Return an custom object when ``backend!=plotly``.
            Return an ndarray when ``subplots=True`` (matplotlib-only).

        Notes
        -----
        There are behavior differences between Koalas and pandas.

        * Koalas computes approximate statistics - expect differences between
          pandas and Koalas boxplots, especially regarding 1st and 3rd quartiles.
        * The `whis` argument is only supported as a single number.
        * Koalas doesn't support the following argument(s) (matplotlib-only).

          * `bootstrap` argument is not supported
          * `autorange` argument is not supported

        Examples
        --------
        Draw a box plot from a DataFrame with four columns of randomly
        generated data.

        For Series:

        .. plotly::

            >>> data = np.random.randn(25, 4)
            >>> df = ps.DataFrame(data, columns=list('ABCD'))
            >>> df['A'].plot.box()  # doctest: +SKIP

        This is an unsupported function for DataFrame type
        """
        from pyspark.pandas import DataFrame, Series

        if isinstance(self.data, Series):
            return self(kind="box", **kwds)
        elif isinstance(self.data, DataFrame):
            return unsupported_function(class_name="pd.DataFrame", method_name="box")()

    def hist(self, bins=10, **kwds):
        """
        Draw one histogram of the DataFrame’s columns.
        A `histogram`_ is a representation of the distribution of data.
        This function calls :meth:`plotting.backend.plot`,
        on each series in the DataFrame, resulting in one histogram per column.

        .. _histogram: https://en.wikipedia.org/wiki/Histogram

        Parameters
        ----------
        bins : integer or sequence, default 10
            Number of histogram bins to be used. If an integer is given, bins + 1
            bin edges are calculated and returned. If bins is a sequence, gives
            bin edges, including left edge of first bin and right edge of last
            bin. In this case, bins is returned unmodified.
        **kwds
            All other plotting keyword arguments to be passed to
            plotting backend.

        Returns
        -------
        :class:`plotly.graph_objs.Figure`
            Return an custom object when ``backend!=plotly``.
            Return an ndarray when ``subplots=True`` (matplotlib-only).

        Examples
        --------
        Basic plot.

        For Series:

        .. plotly::

            >>> s = ps.Series([1, 3, 2])
            >>> s.plot.hist()  # doctest: +SKIP

        For DataFrame:

        .. plotly::

            >>> df = pd.DataFrame(
            ...     np.random.randint(1, 7, 6000),
            ...     columns=['one'])
            >>> df['two'] = df['one'] + np.random.randint(1, 7, 6000)
            >>> df = ps.from_pandas(df)
            >>> df.plot.hist(bins=12, alpha=0.5)  # doctest: +SKIP
        """
        return self(kind="hist", bins=bins, **kwds)

    def kde(self, bw_method=None, ind=None, **kwargs):
        """
        Generate Kernel Density Estimate plot using Gaussian kernels.

        Parameters
        ----------
        bw_method : scalar
            The method used to calculate the estimator bandwidth.
            See KernelDensity in PySpark for more information.
        ind : NumPy array or integer, optional
            Evaluation points for the estimated PDF. If None (default),
            1000 equally spaced points are used. If `ind` is a NumPy array, the
            KDE is evaluated at the points passed. If `ind` is an integer,
            `ind` number of equally spaced points are used.
        **kwargs : optional
            Keyword arguments to pass on to :meth:`Koalas.Series.plot`.

        Returns
        -------
        :class:`plotly.graph_objs.Figure`
            Return an custom object when ``backend!=plotly``.
            Return an ndarray when ``subplots=True`` (matplotlib-only).

        Examples
        --------
        A scalar bandwidth should be specified. Using a small bandwidth value can
        lead to over-fitting, while using a large bandwidth value may result
        in under-fitting:

        .. plotly::

            >>> s = ps.Series([1, 2, 2.5, 3, 3.5, 4, 5])
            >>> s.plot.kde(bw_method=0.3)  # doctest: +SKIP

        .. plotly::

            >>> s = ps.Series([1, 2, 2.5, 3, 3.5, 4, 5])
            >>> s.plot.kde(bw_method=3)  # doctest: +SKIP

        The `ind` parameter determines the evaluation points for the
        plot of the estimated KDF:

        .. plotly::

            >>> s = ps.Series([1, 2, 2.5, 3, 3.5, 4, 5])
            >>> s.plot.kde(ind=[1, 2, 3, 4, 5], bw_method=0.3)  # doctest: +SKIP

        For DataFrame, it works in the same way as Series:

        .. plotly::

            >>> df = ps.DataFrame({
            ...     'x': [1, 2, 2.5, 3, 3.5, 4, 5],
            ...     'y': [4, 4, 4.5, 5, 5.5, 6, 6],
            ... })
            >>> df.plot.kde(bw_method=0.3)  # doctest: +SKIP

        .. plotly::

            >>> df = ps.DataFrame({
            ...     'x': [1, 2, 2.5, 3, 3.5, 4, 5],
            ...     'y': [4, 4, 4.5, 5, 5.5, 6, 6],
            ... })
            >>> df.plot.kde(bw_method=3)  # doctest: +SKIP

        .. plotly::

            >>> df = ps.DataFrame({
            ...     'x': [1, 2, 2.5, 3, 3.5, 4, 5],
            ...     'y': [4, 4, 4.5, 5, 5.5, 6, 6],
            ... })
            >>> df.plot.kde(ind=[1, 2, 3, 4, 5, 6], bw_method=0.3)  # doctest: +SKIP
        """
        return self(kind="kde", bw_method=bw_method, ind=ind, **kwargs)

    density = kde

    def area(self, x=None, y=None, **kwds):
        """
        Draw a stacked area plot.

        An area plot displays quantitative data visually.
        This function wraps the plotly area function.

        Parameters
        ----------
        x : label or position, optional
            Coordinates for the X axis. By default uses the index.
        y : label or position, optional
            Column to plot. By default uses all columns.
        stacked : bool, default True
            Area plots are stacked by default. Set to False to create a
            unstacked plot (matplotlib-only).
        **kwds : optional
            Additional keyword arguments are documented in
            :meth:`DataFrame.plot`.

        Returns
        -------
        :class:`plotly.graph_objs.Figure`
            Return an custom object when ``backend!=plotly``.
            Return an ndarray when ``subplots=True`` (matplotlib-only).

        Examples
        --------

        For Series

        .. plotly::

            >>> df = ps.DataFrame({
            ...     'sales': [3, 2, 3, 9, 10, 6],
            ...     'signups': [5, 5, 6, 12, 14, 13],
            ...     'visits': [20, 42, 28, 62, 81, 50],
            ... }, index=pd.date_range(start='2018/01/01', end='2018/07/01',
            ...                        freq='M'))
            >>> df.sales.plot.area()  # doctest: +SKIP

        For DataFrame

        .. plotly::

            >>> df = ps.DataFrame({
            ...     'sales': [3, 2, 3, 9, 10, 6],
            ...     'signups': [5, 5, 6, 12, 14, 13],
            ...     'visits': [20, 42, 28, 62, 81, 50],
            ... }, index=pd.date_range(start='2018/01/01', end='2018/07/01',
            ...                        freq='M'))
            >>> df.plot.area()  # doctest: +SKIP
        """
        from pyspark.pandas import DataFrame, Series

        if isinstance(self.data, Series):
            return self(kind="area", **kwds)
        elif isinstance(self.data, DataFrame):
            return self(kind="area", x=x, y=y, **kwds)

    def pie(self, **kwds):
        """
        Generate a pie plot.

        A pie plot is a proportional representation of the numerical data in a
        column. This function wraps :meth:`plotly.express.pie` for the
        specified column.

        Parameters
        ----------
        y : int or label, optional
            Label or position of the column to plot.
            If not provided, ``subplots=True`` argument must be passed (matplotlib-only).
        **kwds
            Keyword arguments to pass on to :meth:`Koalas.Series.plot`.

        Returns
        -------
        :class:`plotly.graph_objs.Figure`
            Return an custom object when ``backend!=plotly``.
            Return an ndarray when ``subplots=True`` (matplotlib-only).

        Examples
        --------

        For Series:

        .. plotly::

            >>> df = ps.DataFrame({'mass': [0.330, 4.87, 5.97],
            ...                    'radius': [2439.7, 6051.8, 6378.1]},
            ...                   index=['Mercury', 'Venus', 'Earth'])
            >>> df.mass.plot.pie()  # doctest: +SKIP


        For DataFrame:

        .. plotly::

            >>> df = ps.DataFrame({'mass': [0.330, 4.87, 5.97],
            ...                    'radius': [2439.7, 6051.8, 6378.1]},
            ...                   index=['Mercury', 'Venus', 'Earth'])
            >>> df.plot.pie(y='mass')  # doctest: +SKIP
        """
        from pyspark.pandas import DataFrame, Series

        if isinstance(self.data, Series):
            return self(kind="pie", **kwds)
        else:
            # pandas will raise an error if y is None and subplots if not True
            if (
                isinstance(self.data, DataFrame)
                and kwds.get("y", None) is None
                and not kwds.get("subplots", False)
            ):
                raise ValueError(
                    "pie requires either y column or 'subplots=True' (matplotlib-only)"
                )
            return self(kind="pie", **kwds)

    def scatter(self, x, y, **kwds):
        """
        Create a scatter plot with varying marker point size and color.

        The coordinates of each point are defined by two dataframe columns and
        filled circles are used to represent each point. This kind of plot is
        useful to see complex correlations between two variables. Points could
        be for instance natural 2D coordinates like longitude and latitude in
        a map or, in general, any pair of metrics that can be plotted against
        each other.

        Parameters
        ----------
        x : int or str
            The column name or column position to be used as horizontal
            coordinates for each point.
        y : int or str
            The column name or column position to be used as vertical
            coordinates for each point.
        s : scalar or array_like, optional
            (matplotlib-only).
        c : str, int or array_like, optional
            (matplotlib-only).

        **kwds: Optional
            Keyword arguments to pass on to :meth:`pyspark.pandas.DataFrame.plot`.

        Returns
        -------
        :class:`plotly.graph_objs.Figure`
            Return an custom object when ``backend!=plotly``.
            Return an ndarray when ``subplots=True`` (matplotlib-only).

        See Also
        --------
        plotly.express.scatter : Scatter plot using multiple input data
            formats (plotly).
        matplotlib.pyplot.scatter : Scatter plot using multiple input data
            formats (matplotlib).

        Examples
        --------
        Let's see how to draw a scatter plot using coordinates from the values
        in a DataFrame's columns.

        .. plotly::

            >>> df = ps.DataFrame([[5.1, 3.5, 0], [4.9, 3.0, 0], [7.0, 3.2, 1],
            ...                    [6.4, 3.2, 1], [5.9, 3.0, 2]],
            ...                   columns=['length', 'width', 'species'])
            >>> df.plot.scatter(x='length', y='width')  # doctest: +SKIP

        And now with dark scheme:

        .. plotly::

            >>> df = ps.DataFrame([[5.1, 3.5, 0], [4.9, 3.0, 0], [7.0, 3.2, 1],
            ...                    [6.4, 3.2, 1], [5.9, 3.0, 2]],
            ...                   columns=['length', 'width', 'species'])
            >>> fig = df.plot.scatter(x='length', y='width')
            >>> fig.update_layout(template="plotly_dark")  # doctest: +SKIP
        """
        return self(kind="scatter", x=x, y=y, **kwds)

    def hexbin(self, **kwds):
        return unsupported_function(class_name="pd.DataFrame", method_name="hexbin")()
-												[SPARK-34890][PYTHON] Port/integrate Koalas main codes into PySpark

### What changes were proposed in this pull request?

As a first step of [SPARK-34849](https://issues.apache.org/jira/browse/SPARK-34849), this PR proposes porting the Koalas main code into PySpark.

This PR contains minimal changes to the existing Koalas code as follows:
1. `databricks.koalas` -> `pyspark.pandas`
2. `from databricks import koalas as ks` -> `from pyspark import pandas as pp`
3. `ks.xxx -> pp.xxx`

Other than them:
1. Added a line to `python/mypy.ini` in order to ignore the mypy test. See related issue at [SPARK-34941](https://issues.apache.org/jira/browse/SPARK-34941).
2. Added a comment to several lines in several files to ignore the flake8 F401. See related issue at [SPARK-34943](https://issues.apache.org/jira/browse/SPARK-34943).

When this PR is merged, all the features that were previously used in [Koalas](https://github.com/databricks/koalas) will be available in PySpark as well.

Users can access to the pandas API in PySpark as below:

```python
>>> from pyspark import pandas as pp
>>> ppdf = pp.DataFrame({"A": [1, 2, 3], "B": [15, 20, 25]})
>>> ppdf
   A   B
0  1  15
1  2  20
2  3  25
```

The existing "options and settings" in Koalas are also available in the same way:

```python
>>> from pyspark.pandas.config import set_option, reset_option, get_option
>>> ppser1 = pp.Series([1, 2, 3])
>>> ppser2 = pp.Series([3, 4, 5])
>>> ppser1 + ppser2
Traceback (most recent call last):
...
ValueError: Cannot combine the series or dataframe because it comes from a different dataframe. In order to allow this operation, enable 'compute.ops_on_diff_frames' option.

>>> set_option("compute.ops_on_diff_frames", True)
>>> ppser1 + ppser2
0    4
1    6
2    8
dtype: int64
```

Please also refer to the [API Reference](https://koalas.readthedocs.io/en/latest/reference/index.html) and [Options and Settings](https://koalas.readthedocs.io/en/latest/user_guide/options.html) for more detail.

**NOTE** that this PR intentionally ports the main codes of Koalas first almost as are with minimal changes because:
- Koalas project is fairly large. Making some changes together for PySpark will make it difficult to review the individual change.
    Koalas dev includes multiple Spark committers who will review. By doing this, the committers will be able to more easily and effectively review and drive the development.
- Koalas tests and documentation require major changes to make it look great together with PySpark whereas main codes do not require.
- We lately froze the Koalas codebase, and plan to work together on the initial porting. By porting the main codes first as are, it unblocks the Koalas dev to work on other items in parallel.

I promise and will make sure on:
- Rename Koalas to PySpark pandas APIs and/or pandas-on-Spark accordingly in documentation, and the docstrings and comments in the main codes.
- Triage APIs to remove that don’t make sense when Koalas is in PySpark

The documentation changes will be tracked in [SPARK-34885](https://issues.apache.org/jira/browse/SPARK-34885), the test code changes will be tracked in [SPARK-34886](https://issues.apache.org/jira/browse/SPARK-34886).

### Why are the changes needed?

Please refer to:
- [[DISCUSS] Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/DISCUSS-Support-pandas-API-layer-on-PySpark-td30945.html)
- [[VOTE] SPIP: Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/VOTE-SPIP-Support-pandas-API-layer-on-PySpark-td30996.html)

### Does this PR introduce _any_ user-facing change?

Yes, now users can use the pandas APIs on Spark

### How was this patch tested?

Manually tested for exposed major APIs and options as described above.

### Koalas contributors

Koalas would not have been possible without the following contributors:

ueshin
HyukjinKwon
rxin
xinrong-databricks
RainFung
charlesdong1991
harupy
floscha
beobest2
thunterdb
garawalid
LucasG0
shril
deepyaman
gioa
fwani
90jam
thoo
AbdealiJK
abishekganesh72
gliptak
DumbMachine
dvgodoy
stbof
nitlev
hjoo
gatorsmile
tomspur
icexelloss
awdavidson
guyao
akhilputhiry
scook12
patryk-oleniuk
tracek
dennyglee
athena15
gstaubli
WeichenXu123
hsubbaraj
lfdversluis
ktksq
shengjh
margaret-databricks
LSturtew
sllynn
manuzhang
jijosg
sadikovi

Closes #32036 from itholic/SPARK-34890.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-05 23:42:39 -04:00
+								#
 								# Licensed to the Apache Software Foundation (ASF) under one or more
 								# contributor license agreements.  See the NOTICE file distributed with
 								# this work for additional information regarding copyright ownership.
 								# The ASF licenses this file to You under the Apache License, Version 2.0
 								# (the "License"); you may not use this file except in compliance with
 								# the License.  You may obtain a copy of the License at
 								#
 								#    http://www.apache.org/licenses/LICENSE-2.0
 								#
 								# Unless required by applicable law or agreed to in writing, software
 								# distributed under the License is distributed on an "AS IS" BASIS,
 								# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 								# See the License for the specific language governing permissions and
 								# limitations under the License.
 								#
 								import importlib
 								import pandas as pd
 								import numpy as np
 								from pyspark.ml.feature import Bucketizer
 								from pyspark.mllib.stat import KernelDensity
 								from pyspark.sql import functions as F
 								from pandas.core.base import PandasObject
 								from pandas.core.dtypes.inference import is_integer
 								from pyspark.pandas.missing import unsupported_function
 								from pyspark.pandas.config import get_option
 								from pyspark.pandas.utils import name_like_string
 								class TopNPlotBase:
 								    def get_top_n(self, data):
 								        from pyspark.pandas import DataFrame, Series
 								        max_rows = get_option("plotting.max_rows")
 								        # Simply use the first 1k elements and make it into a pandas dataframe
 								        # For categorical variables, it is likely called from df.x.value_counts().plot.xxx().
 								        if isinstance(data, (Series, DataFrame)):
 								            data = data.head(max_rows + 1).to_pandas()
 								        else:
 								            raise ValueError("Only DataFrame and Series are supported for plotting.")
 								        self.partial = False
 								        if len(data) > max_rows:
 								            self.partial = True
 								            data = data.iloc[:max_rows]
 								        return data
 								    def set_result_text(self, ax):
 								        max_rows = get_option("plotting.max_rows")
 								        assert hasattr(self, "partial")
 								        if self.partial:
 								            ax.text(
 ,
 ,
 								                "showing top {} elements only".format(max_rows),
 								                size=6,
 								                ha="right",
 								                va="bottom",
 								                transform=ax.transAxes,
 								            )
 								class SampledPlotBase:
 								    def get_sampled(self, data):
 								        from pyspark.pandas import DataFrame, Series
 								        fraction = get_option("plotting.sample_ratio")
 								        if fraction is None:
 								            fraction = 1 / (len(data) / get_option("plotting.max_rows"))
 								            fraction = min(1.0, fraction)
 								        self.fraction = fraction
 								        if isinstance(data, (DataFrame, Series)):
 								            if isinstance(data, Series):
 								                data = data.to_frame()
 								            sampled = data._internal.resolved_copy.spark_frame.sample(fraction=self.fraction)
 								            return DataFrame(data._internal.with_new_sdf(sampled)).to_pandas()
 								        else:
 								            raise ValueError("Only DataFrame and Series are supported for plotting.")
 								    def set_result_text(self, ax):
 								        assert hasattr(self, "fraction")
 								        if self.fraction < 1:
 								            ax.text(
 ,
 ,
 								                "showing the sampled result by fraction %s" % self.fraction,
 								                size=6,
 								                ha="right",
 								                va="bottom",
 								                transform=ax.transAxes,
 								            )
 								class HistogramPlotBase:
 								    @staticmethod
 								    def prepare_hist_data(data, bins):
 								        # TODO: this logic is similar with KdePlotBase. Might have to deduplicate it.
 								        from pyspark.pandas.series import Series
 								        if isinstance(data, Series):
 								            data = data.to_frame()
 								        numeric_data = data.select_dtypes(
 								            include=["byte", "decimal", "integer", "float", "long", "double", np.datetime64]
 								        )
 								        # no empty frames or series allowed
 								        if len(numeric_data.columns) == 0:
 								            raise TypeError(
 								                "Empty {0!r}: no numeric data to " "plot".format(numeric_data.__class__.__name__)
 								            )
 								        if is_integer(bins):
 								            # computes boundaries for the column
 								            bins = HistogramPlotBase.get_bins(data.to_spark(), bins)
 								        return numeric_data, bins
 								    @staticmethod
 								    def get_bins(sdf, bins):
 								        # 'data' is a Spark DataFrame that selects all columns.
 								        if len(sdf.columns) > 1:
 								            min_col = F.least(*map(F.min, sdf))
 								            max_col = F.greatest(*map(F.max, sdf))
 								        else:
 								            min_col = F.min(sdf.columns[-1])
 								            max_col = F.max(sdf.columns[-1])
 								        boundaries = sdf.select(min_col, max_col).first()
 								        # divides the boundaries into bins
 								        if boundaries[0] == boundaries[1]:
 								            boundaries = (boundaries[0] - 0.5, boundaries[1] + 0.5)
 								        return np.linspace(boundaries[0], boundaries[1], bins + 1)
 								    @staticmethod
 								    def compute_hist(kdf, bins):
 								        # 'data' is a Spark DataFrame that selects one column.
 								        assert isinstance(bins, (np.ndarray, np.generic))
 								        sdf = kdf._internal.spark_frame
 								        scols = []
 								        input_column_names = []
 								        for label in kdf._internal.column_labels:
 								            input_column_name = name_like_string(label)
 								            input_column_names.append(input_column_name)
 								            scols.append(kdf._internal.spark_column_for(label).alias(input_column_name))
 								        sdf = sdf.select(*scols)
 								        # 1. Make the bucket output flat to:
 								        #     +----------+-------+
 								        #     |__group_id|buckets|
 								        #     +----------+-------+
 								        #     |0         |0.0    |
 								        #     |0         |0.0    |
 								        #     |0         |1.0    |
 								        #     |0         |2.0    |
 								        #     |0         |3.0    |
 								        #     |0         |3.0    |
 								        #     |1         |0.0    |
 								        #     |1         |1.0    |
 								        #     |1         |1.0    |
 								        #     |1         |2.0    |
 								        #     |1         |1.0    |
 								        #     |1         |0.0    |
 								        #     +----------+-------+
 								        colnames = sdf.columns
 								        bucket_names = ["__{}_bucket".format(colname) for colname in colnames]
 								        output_df = None
 								        for group_id, (colname, bucket_name) in enumerate(zip(colnames, bucket_names)):
 								            # creates a Bucketizer to get corresponding bin of each value
 								            bucketizer = Bucketizer(
 								                splits=bins, inputCol=colname, outputCol=bucket_name, handleInvalid="skip"
 								            )
 								            bucket_df = bucketizer.transform(sdf)
 								            if output_df is None:
 								                output_df = bucket_df.select(
 								                    F.lit(group_id).alias("__group_id"), F.col(bucket_name).alias("__bucket")
 								                )
 								            else:
 								                output_df = output_df.union(
 								                    bucket_df.select(
 								                        F.lit(group_id).alias("__group_id"), F.col(bucket_name).alias("__bucket")
 								                    )
 								                )
 								        # 2. Calculate the count based on each group and bucket.
 								        #     +----------+-------+------+
 								        #     |__group_id|buckets| count|
 								        #     +----------+-------+------+
 								        #     |0         |0.0    |2     |
 								        #     |0         |1.0    |1     |
 								        #     |0         |2.0    |1     |
 								        #     |0         |3.0    |2     |
 								        #     |1         |0.0    |2     |
 								        #     |1         |1.0    |3     |
 								        #     |1         |2.0    |1     |
 								        #     +----------+-------+------+
 								        result = (
 								            output_df.groupby("__group_id", "__bucket")
 								            .agg(F.count("*").alias("count"))
 								            .toPandas()
 								            .sort_values(by=["__group_id", "__bucket"])
 								        )
 								        # 3. Fill empty bins and calculate based on each group id. From:
 								        #     +----------+--------+------+
 								        #     |__group_id|__bucket| count|
 								        #     +----------+--------+------+
 								        #     |0         |0.0     |2     |
 								        #     |0         |1.0     |1     |
 								        #     |0         |2.0     |1     |
 								        #     |0         |3.0     |2     |
 								        #     +----------+--------+------+
 								        #     +----------+--------+------+
 								        #     |__group_id|__bucket| count|
 								        #     +----------+--------+------+
 								        #     |1         |0.0     |2     |
 								        #     |1         |1.0     |3     |
 								        #     |1         |2.0     |1     |
 								        #     +----------+--------+------+
 								        #
 								        # to:
 								        #     +-----------------+
 								        #     |__values1__bucket|
 								        #     +-----------------+
 								        #     |2                |
 								        #     |1                |
 								        #     |1                |
 								        #     |2                |
 								        #     |0                |
 								        #     +-----------------+
 								        #     +-----------------+
 								        #     |__values2__bucket|
 								        #     +-----------------+
 								        #     |2                |
 								        #     |3                |
 								        #     |1                |
 								        #     |0                |
 								        #     |0                |
 								        #     +-----------------+
 								        output_series = []
 								        for i, (input_column_name, bucket_name) in enumerate(zip(input_column_names, bucket_names)):
 								            current_bucket_result = result[result["__group_id"] == i]
 								            # generates a pandas DF with one row for each bin
 								            # we need this as some of the bins may be empty
 								            indexes = pd.DataFrame({"__bucket": np.arange(0, len(bins) - 1)})
 								            # merges the bins with counts on it and fills remaining ones with zeros
 								            pdf = indexes.merge(current_bucket_result, how="left", on=["__bucket"]).fillna(0)[
 								                ["count"]
 								            ]
 								            pdf.columns = [input_column_name]
 								            output_series.append(pdf[input_column_name])
 								        return output_series
 								class BoxPlotBase:
 								    @staticmethod
 								    def compute_stats(data, colname, whis, precision):
 								        # Computes mean, median, Q1 and Q3 with approx_percentile and precision
 								        pdf = data._kdf._internal.resolved_copy.spark_frame.agg(
 								            *[
 								                F.expr(
 								                    "approx_percentile(`{}`, {}, {})".format(colname, q, int(1.0 / precision))
 								                ).alias("{}_{}%".format(colname, int(q * 100)))
 								                for q in [0.25, 0.50, 0.75]
 								            ],
 								            F.mean("`%s`" % colname).alias("{}_mean".format(colname)),
 								        ).toPandas()
 								        # Computes IQR and Tukey's fences
 								        iqr = "{}_iqr".format(colname)
 								        p75 = "{}_75%".format(colname)
 								        p25 = "{}_25%".format(colname)
 								        pdf.loc[:, iqr] = pdf.loc[:, p75] - pdf.loc[:, p25]
 								        pdf.loc[:, "{}_lfence".format(colname)] = pdf.loc[:, p25] - whis * pdf.loc[:, iqr]
 								        pdf.loc[:, "{}_ufence".format(colname)] = pdf.loc[:, p75] + whis * pdf.loc[:, iqr]
 								        qnames = ["25%", "50%", "75%", "mean", "lfence", "ufence"]
 								        col_summ = pdf[["{}_{}".format(colname, q) for q in qnames]]
 								        col_summ.columns = qnames
 								        lfence, ufence = col_summ["lfence"], col_summ["ufence"]
 								        stats = {
 								            "mean": col_summ["mean"].values[0],
 								            "med": col_summ["50%"].values[0],
 								            "q1": col_summ["25%"].values[0],
 								            "q3": col_summ["75%"].values[0],
 								        }
 								        return stats, (lfence.values[0], ufence.values[0])
 								    @staticmethod
 								    def outliers(data, colname, lfence, ufence):
 								        # Builds expression to identify outliers
 								        expression = F.col("`%s`" % colname).between(lfence, ufence)
 								        # Creates a column to flag rows as outliers or not
 								        return data._kdf._internal.resolved_copy.spark_frame.withColumn(
 								            "__{}_outlier".format(colname), ~expression
 								        )
 								    @staticmethod
 								    def calc_whiskers(colname, outliers):
 								        # Computes min and max values of non-outliers - the whiskers
 								        minmax = (
 								            outliers.filter("not `__{}_outlier`".format(colname))
 								            .agg(F.min("`%s`" % colname).alias("min"), F.max(colname).alias("max"))
 								            .toPandas()
 								        )
 								        return minmax.iloc[0][["min", "max"]].values
 								    @staticmethod
 								    def get_fliers(colname, outliers, min_val):
 								        # Filters only the outliers, should "showfliers" be True
 								        fliers_df = outliers.filter("`__{}_outlier`".format(colname))
 								        # If shows fliers, takes the top 1k with highest absolute values
 								        # Here we normalize the values by subtracting the minimum value from
 								        # each, and use absolute values.
 								        order_col = F.abs(F.col("`{}`".format(colname)) - min_val.item())
 								        fliers = (
 								            fliers_df.select(F.col("`{}`".format(colname)))
 								            .orderBy(order_col)
 								            .limit(1001)
 								            .toPandas()[colname]
 								            .values
 								        )
 								        return fliers
 								class KdePlotBase:
 								    @staticmethod
 								    def prepare_kde_data(data):
 								        # TODO: this logic is similar with HistogramPlotBase. Might have to deduplicate it.
 								        from pyspark.pandas.series import Series
 								        if isinstance(data, Series):
 								            data = data.to_frame()
 								        numeric_data = data.select_dtypes(
 								            include=["byte", "decimal", "integer", "float", "long", "double", np.datetime64]
 								        )
 								        # no empty frames or series allowed
 								        if len(numeric_data.columns) == 0:
 								            raise TypeError(
 								                "Empty {0!r}: no numeric data to " "plot".format(numeric_data.__class__.__name__)
 								            )
 								        return numeric_data
 								    @staticmethod
 								    def get_ind(sdf, ind):
 								        def calc_min_max():
 								            if len(sdf.columns) > 1:
 								                min_col = F.least(*map(F.min, sdf))
 								                max_col = F.greatest(*map(F.max, sdf))
 								            else:
 								                min_col = F.min(sdf.columns[-1])
 								                max_col = F.max(sdf.columns[-1])
 								            return sdf.select(min_col, max_col).first()
 								        if ind is None:
 								            min_val, max_val = calc_min_max()
 								            sample_range = max_val - min_val
 								            ind = np.linspace(min_val - 0.5 * sample_range, max_val + 0.5 * sample_range, 1000,)
 								        elif is_integer(ind):
 								            min_val, max_val = calc_min_max()
 								            sample_range = max_val - min_val
 								            ind = np.linspace(min_val - 0.5 * sample_range, max_val + 0.5 * sample_range, ind,)
 								        return ind
 								    @staticmethod
 								    def compute_kde(sdf, bw_method=None, ind=None):
 								        # 'sdf' is a Spark DataFrame that selects one column.
 								        # Using RDD is slow so we might have to change it to Dataset based implementation
 								        # once Spark has that implementation.
 								        sample = sdf.rdd.map(lambda x: float(x[0]))
 								        kd = KernelDensity()
 								        kd.setSample(sample)
 								        assert isinstance(bw_method, (int, float)), "'bw_method' must be set as a scalar number."
 								        if bw_method is not None:
 								            # Match the bandwidth with Spark.
 								            kd.setBandwidth(float(bw_method))
 								        return kd.estimate(list(map(float, ind)))
 								class KoalasPlotAccessor(PandasObject):
 								    """
 								    Series/Frames plotting accessor and method.
 								    Uses the backend specified by the
 								    option ``plotting.backend``. By default, plotly is used.
 								    Plotting methods can also be accessed by calling the accessor as a method
 								    with the ``kind`` argument:
 								    ``s.plot(kind='hist')`` is equivalent to ``s.plot.hist()``
 								    """
 								    pandas_plot_data_map = {
 								        "pie": TopNPlotBase().get_top_n,
 								        "bar": TopNPlotBase().get_top_n,
 								        "barh": TopNPlotBase().get_top_n,
 								        "scatter": TopNPlotBase().get_top_n,
 								        "area": SampledPlotBase().get_sampled,
 								        "line": SampledPlotBase().get_sampled,
 								    }
 								    _backends = {}  # type: ignore
 								    def __init__(self, data):
 								        self.data = data
 								    @staticmethod
 								    def _find_backend(backend):
 								        """
 								        Find a Koalas plotting backend
 								        """
 								        try:
 								            return KoalasPlotAccessor._backends[backend]
 								        except KeyError:
 								            try:
 								                module = importlib.import_module(backend)
 								            except ImportError:
 								                # We re-raise later on.
 								                pass
 								            else:
 								                if hasattr(module, "plot") or hasattr(module, "plot_koalas"):
 								                    # Validate that the interface is implemented when the option
 								                    # is set, rather than at plot time.
 								                    KoalasPlotAccessor._backends[backend] = module
 								                    return module
 								        raise ValueError(
 								            "Could not find plotting backend '{backend}'. Ensure that you've installed "
 								            "the package providing the '{backend}' entrypoint, or that the package has a "
 								            "top-level `.plot` method.".format(backend=backend)
 								        )
 								    @staticmethod
 								    def _get_plot_backend(backend=None):
 								        backend = backend or get_option("plotting.backend")
 								        # Shortcut
 								        if backend in KoalasPlotAccessor._backends:
 								            return KoalasPlotAccessor._backends[backend]
 								        if backend == "matplotlib":
 								            # Because matplotlib is an optional dependency,
 								            # we need to attempt an import here to raise an ImportError if needed.
 								            try:
 								                # test if matplotlib can be imported
 								                import matplotlib  # noqa: F401
 								                from pyspark.pandas.plot import matplotlib as module
 								            except ImportError:
 								                raise ImportError(
 								                    "matplotlib is required for plotting when the "
 								                    "default backend 'matplotlib' is selected."
 								                ) from None
 								            KoalasPlotAccessor._backends["matplotlib"] = module
 								        elif backend == "plotly":
 								            try:
 								                # test if plotly can be imported
 								                import plotly  # noqa: F401
 								                from pyspark.pandas.plot import plotly as module
 								            except ImportError:
 								                raise ImportError(
 								                    "plotly is required for plotting when the "
 								                    "default backend 'plotly' is selected."
 								                ) from None
 								            KoalasPlotAccessor._backends["plotly"] = module
 								        else:
 								            module = KoalasPlotAccessor._find_backend(backend)
 								            KoalasPlotAccessor._backends[backend] = module
 								        return module
 								    def __call__(self, kind="line", backend=None, **kwargs):
 								        plot_backend = KoalasPlotAccessor._get_plot_backend(backend)
 								        plot_data = self.data
 								        kind = {"density": "kde"}.get(kind, kind)
 								        if hasattr(plot_backend, "plot_koalas"):
 								            # use if there's koalas specific method.
 								            return plot_backend.plot_koalas(plot_data, kind=kind, **kwargs)
 								        else:
 								            # fallback to use pandas'
 								            if not KoalasPlotAccessor.pandas_plot_data_map[kind]:
 								                raise NotImplementedError(
 								                    "'%s' plot is not supported with '%s' plot "
 								                    "backend yet." % (kind, plot_backend.__name__)
 								                )
 								            plot_data = KoalasPlotAccessor.pandas_plot_data_map[kind](plot_data)
 								            return plot_backend.plot(plot_data, kind=kind, **kwargs)
 								    def line(self, x=None, y=None, **kwargs):
 								        """
 								        Plot DataFrame/Series as lines.
 								        This function is useful to plot lines using Series's values
 								        as coordinates.
 								        Parameters
 								        ----------
 								        x : int or str, optional
 								            Columns to use for the horizontal axis.
 								            Either the location or the label of the columns to be used.
 								            By default, it will use the DataFrame indices.
 								        y : int, str, or list of them, optional
 								            The values to be plotted.
 								            Either the location or the label of the columns to be used.
 								            By default, it will use the remaining DataFrame numeric columns.
 								        **kwds
 								            Keyword arguments to pass on to :meth:`Series.plot` or :meth:`DataFrame.plot`.
 								        Returns
 								        -------
 								        :class:`plotly.graph_objs.Figure`
 								            Return an custom object when ``backend!=plotly``.
 								            Return an ndarray when ``subplots=True`` (matplotlib-only).
 								        See Also
 								        --------
 								        plotly.express.line : Plot y versus x as lines and/or markers (plotly).
 								        matplotlib.pyplot.plot : Plot y versus x as lines and/or markers (matplotlib).
 								        Examples
 								        --------
 								        Basic plot.
 								        For Series:
 								        .. plotly::
-												[SPARK-34983][PYTHON] Renaming the package alias from pp to ps

### What changes were proposed in this pull request?

This PR proposes to fix:

```python
import pyspark.pandas as pp
```

to

```python
import pyspark.pandas as ps
```

### Why are the changes needed?

`pp` might sound offensive in some contexts.

### Does this PR introduce _any_ user-facing change?

The change is in master only. We'll use `ps` as the short name instead of `pp`.

### How was this patch tested?

The CI in this PR will test it out.

Closes #32108 from LSturtew/renaming_pyspark.pandas.

Authored-by: Luka Sturtewagen <luka.sturtewagen@linkit.nl>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-11 22:18:08 -04:00
+								            >>> s = ps.Series([1, 3, 2])
-												[SPARK-34890][PYTHON] Port/integrate Koalas main codes into PySpark

### What changes were proposed in this pull request?

As a first step of [SPARK-34849](https://issues.apache.org/jira/browse/SPARK-34849), this PR proposes porting the Koalas main code into PySpark.

This PR contains minimal changes to the existing Koalas code as follows:
1. `databricks.koalas` -> `pyspark.pandas`
2. `from databricks import koalas as ks` -> `from pyspark import pandas as pp`
3. `ks.xxx -> pp.xxx`

Other than them:
1. Added a line to `python/mypy.ini` in order to ignore the mypy test. See related issue at [SPARK-34941](https://issues.apache.org/jira/browse/SPARK-34941).
2. Added a comment to several lines in several files to ignore the flake8 F401. See related issue at [SPARK-34943](https://issues.apache.org/jira/browse/SPARK-34943).

When this PR is merged, all the features that were previously used in [Koalas](https://github.com/databricks/koalas) will be available in PySpark as well.

Users can access to the pandas API in PySpark as below:

```python
>>> from pyspark import pandas as pp
>>> ppdf = pp.DataFrame({"A": [1, 2, 3], "B": [15, 20, 25]})
>>> ppdf
   A   B
0  1  15
1  2  20
2  3  25
```

The existing "options and settings" in Koalas are also available in the same way:

```python
>>> from pyspark.pandas.config import set_option, reset_option, get_option
>>> ppser1 = pp.Series([1, 2, 3])
>>> ppser2 = pp.Series([3, 4, 5])
>>> ppser1 + ppser2
Traceback (most recent call last):
...
ValueError: Cannot combine the series or dataframe because it comes from a different dataframe. In order to allow this operation, enable 'compute.ops_on_diff_frames' option.

>>> set_option("compute.ops_on_diff_frames", True)
>>> ppser1 + ppser2
0    4
1    6
2    8
dtype: int64
```

Please also refer to the [API Reference](https://koalas.readthedocs.io/en/latest/reference/index.html) and [Options and Settings](https://koalas.readthedocs.io/en/latest/user_guide/options.html) for more detail.

**NOTE** that this PR intentionally ports the main codes of Koalas first almost as are with minimal changes because:
- Koalas project is fairly large. Making some changes together for PySpark will make it difficult to review the individual change.
    Koalas dev includes multiple Spark committers who will review. By doing this, the committers will be able to more easily and effectively review and drive the development.
- Koalas tests and documentation require major changes to make it look great together with PySpark whereas main codes do not require.
- We lately froze the Koalas codebase, and plan to work together on the initial porting. By porting the main codes first as are, it unblocks the Koalas dev to work on other items in parallel.

I promise and will make sure on:
- Rename Koalas to PySpark pandas APIs and/or pandas-on-Spark accordingly in documentation, and the docstrings and comments in the main codes.
- Triage APIs to remove that don’t make sense when Koalas is in PySpark

The documentation changes will be tracked in [SPARK-34885](https://issues.apache.org/jira/browse/SPARK-34885), the test code changes will be tracked in [SPARK-34886](https://issues.apache.org/jira/browse/SPARK-34886).

### Why are the changes needed?

Please refer to:
- [[DISCUSS] Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/DISCUSS-Support-pandas-API-layer-on-PySpark-td30945.html)
- [[VOTE] SPIP: Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/VOTE-SPIP-Support-pandas-API-layer-on-PySpark-td30996.html)

### Does this PR introduce _any_ user-facing change?

Yes, now users can use the pandas APIs on Spark

### How was this patch tested?

Manually tested for exposed major APIs and options as described above.

### Koalas contributors

Koalas would not have been possible without the following contributors:

ueshin
HyukjinKwon
rxin
xinrong-databricks
RainFung
charlesdong1991
harupy
floscha
beobest2
thunterdb
garawalid
LucasG0
shril
deepyaman
gioa
fwani
90jam
thoo
AbdealiJK
abishekganesh72
gliptak
DumbMachine
dvgodoy
stbof
nitlev
hjoo
gatorsmile
tomspur
icexelloss
awdavidson
guyao
akhilputhiry
scook12
patryk-oleniuk
tracek
dennyglee
athena15
gstaubli
WeichenXu123
hsubbaraj
lfdversluis
ktksq
shengjh
margaret-databricks
LSturtew
sllynn
manuzhang
jijosg
sadikovi

Closes #32036 from itholic/SPARK-34890.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-05 23:42:39 -04:00
+								            >>> s.plot.line()  # doctest: +SKIP
 								        For DataFrame:
 								        .. plotly::
 								            The following example shows the populations for some animals
 								            over the years.
-												[SPARK-34983][PYTHON] Renaming the package alias from pp to ps

### What changes were proposed in this pull request?

This PR proposes to fix:

```python
import pyspark.pandas as pp
```

to

```python
import pyspark.pandas as ps
```

### Why are the changes needed?

`pp` might sound offensive in some contexts.

### Does this PR introduce _any_ user-facing change?

The change is in master only. We'll use `ps` as the short name instead of `pp`.

### How was this patch tested?

The CI in this PR will test it out.

Closes #32108 from LSturtew/renaming_pyspark.pandas.

Authored-by: Luka Sturtewagen <luka.sturtewagen@linkit.nl>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-11 22:18:08 -04:00
+								            >>> df = ps.DataFrame({'pig': [20, 18, 489, 675, 1776],
-												[SPARK-34890][PYTHON] Port/integrate Koalas main codes into PySpark

### What changes were proposed in this pull request?

As a first step of [SPARK-34849](https://issues.apache.org/jira/browse/SPARK-34849), this PR proposes porting the Koalas main code into PySpark.

This PR contains minimal changes to the existing Koalas code as follows:
1. `databricks.koalas` -> `pyspark.pandas`
2. `from databricks import koalas as ks` -> `from pyspark import pandas as pp`
3. `ks.xxx -> pp.xxx`

Other than them:
1. Added a line to `python/mypy.ini` in order to ignore the mypy test. See related issue at [SPARK-34941](https://issues.apache.org/jira/browse/SPARK-34941).
2. Added a comment to several lines in several files to ignore the flake8 F401. See related issue at [SPARK-34943](https://issues.apache.org/jira/browse/SPARK-34943).

When this PR is merged, all the features that were previously used in [Koalas](https://github.com/databricks/koalas) will be available in PySpark as well.

Users can access to the pandas API in PySpark as below:

```python
>>> from pyspark import pandas as pp
>>> ppdf = pp.DataFrame({"A": [1, 2, 3], "B": [15, 20, 25]})
>>> ppdf
   A   B
0  1  15
1  2  20
2  3  25
```

The existing "options and settings" in Koalas are also available in the same way:

```python
>>> from pyspark.pandas.config import set_option, reset_option, get_option
>>> ppser1 = pp.Series([1, 2, 3])
>>> ppser2 = pp.Series([3, 4, 5])
>>> ppser1 + ppser2
Traceback (most recent call last):
...
ValueError: Cannot combine the series or dataframe because it comes from a different dataframe. In order to allow this operation, enable 'compute.ops_on_diff_frames' option.

>>> set_option("compute.ops_on_diff_frames", True)
>>> ppser1 + ppser2
0    4
1    6
2    8
dtype: int64
```

Please also refer to the [API Reference](https://koalas.readthedocs.io/en/latest/reference/index.html) and [Options and Settings](https://koalas.readthedocs.io/en/latest/user_guide/options.html) for more detail.

**NOTE** that this PR intentionally ports the main codes of Koalas first almost as are with minimal changes because:
- Koalas project is fairly large. Making some changes together for PySpark will make it difficult to review the individual change.
    Koalas dev includes multiple Spark committers who will review. By doing this, the committers will be able to more easily and effectively review and drive the development.
- Koalas tests and documentation require major changes to make it look great together with PySpark whereas main codes do not require.
- We lately froze the Koalas codebase, and plan to work together on the initial porting. By porting the main codes first as are, it unblocks the Koalas dev to work on other items in parallel.

I promise and will make sure on:
- Rename Koalas to PySpark pandas APIs and/or pandas-on-Spark accordingly in documentation, and the docstrings and comments in the main codes.
- Triage APIs to remove that don’t make sense when Koalas is in PySpark

The documentation changes will be tracked in [SPARK-34885](https://issues.apache.org/jira/browse/SPARK-34885), the test code changes will be tracked in [SPARK-34886](https://issues.apache.org/jira/browse/SPARK-34886).

### Why are the changes needed?

Please refer to:
- [[DISCUSS] Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/DISCUSS-Support-pandas-API-layer-on-PySpark-td30945.html)
- [[VOTE] SPIP: Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/VOTE-SPIP-Support-pandas-API-layer-on-PySpark-td30996.html)

### Does this PR introduce _any_ user-facing change?

Yes, now users can use the pandas APIs on Spark

### How was this patch tested?

Manually tested for exposed major APIs and options as described above.

### Koalas contributors

Koalas would not have been possible without the following contributors:

ueshin
HyukjinKwon
rxin
xinrong-databricks
RainFung
charlesdong1991
harupy
floscha
beobest2
thunterdb
garawalid
LucasG0
shril
deepyaman
gioa
fwani
90jam
thoo
AbdealiJK
abishekganesh72
gliptak
DumbMachine
dvgodoy
stbof
nitlev
hjoo
gatorsmile
tomspur
icexelloss
awdavidson
guyao
akhilputhiry
scook12
patryk-oleniuk
tracek
dennyglee
athena15
gstaubli
WeichenXu123
hsubbaraj
lfdversluis
ktksq
shengjh
margaret-databricks
LSturtew
sllynn
manuzhang
jijosg
sadikovi

Closes #32036 from itholic/SPARK-34890.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-05 23:42:39 -04:00
+								            ...                    'horse': [4, 25, 281, 600, 1900]},
 								            ...                   index=[1990, 1997, 2003, 2009, 2014])
 								            >>> df.plot.line()  # doctest: +SKIP
 								        .. plotly::
 								            The following example shows the relationship between both
 								            populations.
-												[SPARK-34983][PYTHON] Renaming the package alias from pp to ps

### What changes were proposed in this pull request?

This PR proposes to fix:

```python
import pyspark.pandas as pp
```

to

```python
import pyspark.pandas as ps
```

### Why are the changes needed?

`pp` might sound offensive in some contexts.

### Does this PR introduce _any_ user-facing change?

The change is in master only. We'll use `ps` as the short name instead of `pp`.

### How was this patch tested?

The CI in this PR will test it out.

Closes #32108 from LSturtew/renaming_pyspark.pandas.

Authored-by: Luka Sturtewagen <luka.sturtewagen@linkit.nl>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-11 22:18:08 -04:00
+								            >>> df = ps.DataFrame({'pig': [20, 18, 489, 675, 1776],
-												[SPARK-34890][PYTHON] Port/integrate Koalas main codes into PySpark

### What changes were proposed in this pull request?

As a first step of [SPARK-34849](https://issues.apache.org/jira/browse/SPARK-34849), this PR proposes porting the Koalas main code into PySpark.

This PR contains minimal changes to the existing Koalas code as follows:
1. `databricks.koalas` -> `pyspark.pandas`
2. `from databricks import koalas as ks` -> `from pyspark import pandas as pp`
3. `ks.xxx -> pp.xxx`

Other than them:
1. Added a line to `python/mypy.ini` in order to ignore the mypy test. See related issue at [SPARK-34941](https://issues.apache.org/jira/browse/SPARK-34941).
2. Added a comment to several lines in several files to ignore the flake8 F401. See related issue at [SPARK-34943](https://issues.apache.org/jira/browse/SPARK-34943).

When this PR is merged, all the features that were previously used in [Koalas](https://github.com/databricks/koalas) will be available in PySpark as well.

Users can access to the pandas API in PySpark as below:

```python
>>> from pyspark import pandas as pp
>>> ppdf = pp.DataFrame({"A": [1, 2, 3], "B": [15, 20, 25]})
>>> ppdf
   A   B
0  1  15
1  2  20
2  3  25
```

The existing "options and settings" in Koalas are also available in the same way:

```python
>>> from pyspark.pandas.config import set_option, reset_option, get_option
>>> ppser1 = pp.Series([1, 2, 3])
>>> ppser2 = pp.Series([3, 4, 5])
>>> ppser1 + ppser2
Traceback (most recent call last):
...
ValueError: Cannot combine the series or dataframe because it comes from a different dataframe. In order to allow this operation, enable 'compute.ops_on_diff_frames' option.

>>> set_option("compute.ops_on_diff_frames", True)
>>> ppser1 + ppser2
0    4
1    6
2    8
dtype: int64
```

Please also refer to the [API Reference](https://koalas.readthedocs.io/en/latest/reference/index.html) and [Options and Settings](https://koalas.readthedocs.io/en/latest/user_guide/options.html) for more detail.

**NOTE** that this PR intentionally ports the main codes of Koalas first almost as are with minimal changes because:
- Koalas project is fairly large. Making some changes together for PySpark will make it difficult to review the individual change.
    Koalas dev includes multiple Spark committers who will review. By doing this, the committers will be able to more easily and effectively review and drive the development.
- Koalas tests and documentation require major changes to make it look great together with PySpark whereas main codes do not require.
- We lately froze the Koalas codebase, and plan to work together on the initial porting. By porting the main codes first as are, it unblocks the Koalas dev to work on other items in parallel.

I promise and will make sure on:
- Rename Koalas to PySpark pandas APIs and/or pandas-on-Spark accordingly in documentation, and the docstrings and comments in the main codes.
- Triage APIs to remove that don’t make sense when Koalas is in PySpark

The documentation changes will be tracked in [SPARK-34885](https://issues.apache.org/jira/browse/SPARK-34885), the test code changes will be tracked in [SPARK-34886](https://issues.apache.org/jira/browse/SPARK-34886).

### Why are the changes needed?

Please refer to:
- [[DISCUSS] Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/DISCUSS-Support-pandas-API-layer-on-PySpark-td30945.html)
- [[VOTE] SPIP: Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/VOTE-SPIP-Support-pandas-API-layer-on-PySpark-td30996.html)

### Does this PR introduce _any_ user-facing change?

Yes, now users can use the pandas APIs on Spark

### How was this patch tested?

Manually tested for exposed major APIs and options as described above.

### Koalas contributors

Koalas would not have been possible without the following contributors:

ueshin
HyukjinKwon
rxin
xinrong-databricks
RainFung
charlesdong1991
harupy
floscha
beobest2
thunterdb
garawalid
LucasG0
shril
deepyaman
gioa
fwani
90jam
thoo
AbdealiJK
abishekganesh72
gliptak
DumbMachine
dvgodoy
stbof
nitlev
hjoo
gatorsmile
tomspur
icexelloss
awdavidson
guyao
akhilputhiry
scook12
patryk-oleniuk
tracek
dennyglee
athena15
gstaubli
WeichenXu123
hsubbaraj
lfdversluis
ktksq
shengjh
margaret-databricks
LSturtew
sllynn
manuzhang
jijosg
sadikovi

Closes #32036 from itholic/SPARK-34890.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-05 23:42:39 -04:00
+								            ...                    'horse': [4, 25, 281, 600, 1900]},
 								            ...                   index=[1990, 1997, 2003, 2009, 2014])
 								            >>> df.plot.line(x='pig', y='horse')  # doctest: +SKIP
 								        """
 								        return self(kind="line", x=x, y=y, **kwargs)
 								    def bar(self, x=None, y=None, **kwds):
 								        """
 								        Vertical bar plot.
 								        Parameters
 								        ----------
 								        x : label or position, optional
 								            Allows plotting of one column versus another.
 								            If not specified, the index of the DataFrame is used.
 								        y : label or position, optional
 								            Allows plotting of one column versus another.
 								            If not specified, all numerical columns are used.
 								        **kwds : optional
 								            Additional keyword arguments are documented in
 								            :meth:`Koalas.Series.plot` or :meth:`Koalas.DataFrame.plot`.
 								        Returns
 								        -------
 								        :class:`plotly.graph_objs.Figure`
 								            Return an custom object when ``backend!=plotly``.
 								            Return an ndarray when ``subplots=True`` (matplotlib-only).
 								        Examples
 								        --------
 								        Basic plot.
 								        For Series:
 								        .. plotly::
-												[SPARK-34983][PYTHON] Renaming the package alias from pp to ps

### What changes were proposed in this pull request?

This PR proposes to fix:

```python
import pyspark.pandas as pp
```

to

```python
import pyspark.pandas as ps
```

### Why are the changes needed?

`pp` might sound offensive in some contexts.

### Does this PR introduce _any_ user-facing change?

The change is in master only. We'll use `ps` as the short name instead of `pp`.

### How was this patch tested?

The CI in this PR will test it out.

Closes #32108 from LSturtew/renaming_pyspark.pandas.

Authored-by: Luka Sturtewagen <luka.sturtewagen@linkit.nl>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-11 22:18:08 -04:00
+								            >>> s = ps.Series([1, 3, 2])
-												[SPARK-34890][PYTHON] Port/integrate Koalas main codes into PySpark

### What changes were proposed in this pull request?

As a first step of [SPARK-34849](https://issues.apache.org/jira/browse/SPARK-34849), this PR proposes porting the Koalas main code into PySpark.

This PR contains minimal changes to the existing Koalas code as follows:
1. `databricks.koalas` -> `pyspark.pandas`
2. `from databricks import koalas as ks` -> `from pyspark import pandas as pp`
3. `ks.xxx -> pp.xxx`

Other than them:
1. Added a line to `python/mypy.ini` in order to ignore the mypy test. See related issue at [SPARK-34941](https://issues.apache.org/jira/browse/SPARK-34941).
2. Added a comment to several lines in several files to ignore the flake8 F401. See related issue at [SPARK-34943](https://issues.apache.org/jira/browse/SPARK-34943).

When this PR is merged, all the features that were previously used in [Koalas](https://github.com/databricks/koalas) will be available in PySpark as well.

Users can access to the pandas API in PySpark as below:

```python
>>> from pyspark import pandas as pp
>>> ppdf = pp.DataFrame({"A": [1, 2, 3], "B": [15, 20, 25]})
>>> ppdf
   A   B
0  1  15
1  2  20
2  3  25
```

The existing "options and settings" in Koalas are also available in the same way:

```python
>>> from pyspark.pandas.config import set_option, reset_option, get_option
>>> ppser1 = pp.Series([1, 2, 3])
>>> ppser2 = pp.Series([3, 4, 5])
>>> ppser1 + ppser2
Traceback (most recent call last):
...
ValueError: Cannot combine the series or dataframe because it comes from a different dataframe. In order to allow this operation, enable 'compute.ops_on_diff_frames' option.

>>> set_option("compute.ops_on_diff_frames", True)
>>> ppser1 + ppser2
0    4
1    6
2    8
dtype: int64
```

Please also refer to the [API Reference](https://koalas.readthedocs.io/en/latest/reference/index.html) and [Options and Settings](https://koalas.readthedocs.io/en/latest/user_guide/options.html) for more detail.

**NOTE** that this PR intentionally ports the main codes of Koalas first almost as are with minimal changes because:
- Koalas project is fairly large. Making some changes together for PySpark will make it difficult to review the individual change.
    Koalas dev includes multiple Spark committers who will review. By doing this, the committers will be able to more easily and effectively review and drive the development.
- Koalas tests and documentation require major changes to make it look great together with PySpark whereas main codes do not require.
- We lately froze the Koalas codebase, and plan to work together on the initial porting. By porting the main codes first as are, it unblocks the Koalas dev to work on other items in parallel.

I promise and will make sure on:
- Rename Koalas to PySpark pandas APIs and/or pandas-on-Spark accordingly in documentation, and the docstrings and comments in the main codes.
- Triage APIs to remove that don’t make sense when Koalas is in PySpark

The documentation changes will be tracked in [SPARK-34885](https://issues.apache.org/jira/browse/SPARK-34885), the test code changes will be tracked in [SPARK-34886](https://issues.apache.org/jira/browse/SPARK-34886).

### Why are the changes needed?

Please refer to:
- [[DISCUSS] Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/DISCUSS-Support-pandas-API-layer-on-PySpark-td30945.html)
- [[VOTE] SPIP: Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/VOTE-SPIP-Support-pandas-API-layer-on-PySpark-td30996.html)

### Does this PR introduce _any_ user-facing change?

Yes, now users can use the pandas APIs on Spark

### How was this patch tested?

Manually tested for exposed major APIs and options as described above.

### Koalas contributors

Koalas would not have been possible without the following contributors:

ueshin
HyukjinKwon
rxin
xinrong-databricks
RainFung
charlesdong1991
harupy
floscha
beobest2
thunterdb
garawalid
LucasG0
shril
deepyaman
gioa
fwani
90jam
thoo
AbdealiJK
abishekganesh72
gliptak
DumbMachine
dvgodoy
stbof
nitlev
hjoo
gatorsmile
tomspur
icexelloss
awdavidson
guyao
akhilputhiry
scook12
patryk-oleniuk
tracek
dennyglee
athena15
gstaubli
WeichenXu123
hsubbaraj
lfdversluis
ktksq
shengjh
margaret-databricks
LSturtew
sllynn
manuzhang
jijosg
sadikovi

Closes #32036 from itholic/SPARK-34890.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-05 23:42:39 -04:00
+								            >>> s.plot.bar()  # doctest: +SKIP
 								        For DataFrame:
 								        .. plotly::
-												[SPARK-34983][PYTHON] Renaming the package alias from pp to ps

### What changes were proposed in this pull request?

This PR proposes to fix:

```python
import pyspark.pandas as pp
```

to

```python
import pyspark.pandas as ps
```

### Why are the changes needed?

`pp` might sound offensive in some contexts.

### Does this PR introduce _any_ user-facing change?

The change is in master only. We'll use `ps` as the short name instead of `pp`.

### How was this patch tested?

The CI in this PR will test it out.

Closes #32108 from LSturtew/renaming_pyspark.pandas.

Authored-by: Luka Sturtewagen <luka.sturtewagen@linkit.nl>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-11 22:18:08 -04:00
+								            >>> df = ps.DataFrame({'lab': ['A', 'B', 'C'], 'val': [10, 30, 20]})
-												[SPARK-34890][PYTHON] Port/integrate Koalas main codes into PySpark

### What changes were proposed in this pull request?

As a first step of [SPARK-34849](https://issues.apache.org/jira/browse/SPARK-34849), this PR proposes porting the Koalas main code into PySpark.

This PR contains minimal changes to the existing Koalas code as follows:
1. `databricks.koalas` -> `pyspark.pandas`
2. `from databricks import koalas as ks` -> `from pyspark import pandas as pp`
3. `ks.xxx -> pp.xxx`

Other than them:
1. Added a line to `python/mypy.ini` in order to ignore the mypy test. See related issue at [SPARK-34941](https://issues.apache.org/jira/browse/SPARK-34941).
2. Added a comment to several lines in several files to ignore the flake8 F401. See related issue at [SPARK-34943](https://issues.apache.org/jira/browse/SPARK-34943).

When this PR is merged, all the features that were previously used in [Koalas](https://github.com/databricks/koalas) will be available in PySpark as well.

Users can access to the pandas API in PySpark as below:

```python
>>> from pyspark import pandas as pp
>>> ppdf = pp.DataFrame({"A": [1, 2, 3], "B": [15, 20, 25]})
>>> ppdf
   A   B
0  1  15
1  2  20
2  3  25
```

The existing "options and settings" in Koalas are also available in the same way:

```python
>>> from pyspark.pandas.config import set_option, reset_option, get_option
>>> ppser1 = pp.Series([1, 2, 3])
>>> ppser2 = pp.Series([3, 4, 5])
>>> ppser1 + ppser2
Traceback (most recent call last):
...
ValueError: Cannot combine the series or dataframe because it comes from a different dataframe. In order to allow this operation, enable 'compute.ops_on_diff_frames' option.

>>> set_option("compute.ops_on_diff_frames", True)
>>> ppser1 + ppser2
0    4
1    6
2    8
dtype: int64
```

Please also refer to the [API Reference](https://koalas.readthedocs.io/en/latest/reference/index.html) and [Options and Settings](https://koalas.readthedocs.io/en/latest/user_guide/options.html) for more detail.

**NOTE** that this PR intentionally ports the main codes of Koalas first almost as are with minimal changes because:
- Koalas project is fairly large. Making some changes together for PySpark will make it difficult to review the individual change.
    Koalas dev includes multiple Spark committers who will review. By doing this, the committers will be able to more easily and effectively review and drive the development.
- Koalas tests and documentation require major changes to make it look great together with PySpark whereas main codes do not require.
- We lately froze the Koalas codebase, and plan to work together on the initial porting. By porting the main codes first as are, it unblocks the Koalas dev to work on other items in parallel.

I promise and will make sure on:
- Rename Koalas to PySpark pandas APIs and/or pandas-on-Spark accordingly in documentation, and the docstrings and comments in the main codes.
- Triage APIs to remove that don’t make sense when Koalas is in PySpark

The documentation changes will be tracked in [SPARK-34885](https://issues.apache.org/jira/browse/SPARK-34885), the test code changes will be tracked in [SPARK-34886](https://issues.apache.org/jira/browse/SPARK-34886).

### Why are the changes needed?

Please refer to:
- [[DISCUSS] Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/DISCUSS-Support-pandas-API-layer-on-PySpark-td30945.html)
- [[VOTE] SPIP: Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/VOTE-SPIP-Support-pandas-API-layer-on-PySpark-td30996.html)

### Does this PR introduce _any_ user-facing change?

Yes, now users can use the pandas APIs on Spark

### How was this patch tested?

Manually tested for exposed major APIs and options as described above.

### Koalas contributors

Koalas would not have been possible without the following contributors:

ueshin
HyukjinKwon
rxin
xinrong-databricks
RainFung
charlesdong1991
harupy
floscha
beobest2
thunterdb
garawalid
LucasG0
shril
deepyaman
gioa
fwani
90jam
thoo
AbdealiJK
abishekganesh72
gliptak
DumbMachine
dvgodoy
stbof
nitlev
hjoo
gatorsmile
tomspur
icexelloss
awdavidson
guyao
akhilputhiry
scook12
patryk-oleniuk
tracek
dennyglee
athena15
gstaubli
WeichenXu123
hsubbaraj
lfdversluis
ktksq
shengjh
margaret-databricks
LSturtew
sllynn
manuzhang
jijosg
sadikovi

Closes #32036 from itholic/SPARK-34890.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-05 23:42:39 -04:00
+								            >>> df.plot.bar(x='lab', y='val')  # doctest: +SKIP
 								        Plot a whole dataframe to a bar plot. Each column is stacked with a
 								        distinct color along the horizontal axis.
 								        .. plotly::
 								            >>> speed = [0.1, 17.5, 40, 48, 52, 69, 88]
 								            >>> lifespan = [2, 8, 70, 1.5, 25, 12, 28]
 								            >>> index = ['snail', 'pig', 'elephant',
 								            ...          'rabbit', 'giraffe', 'coyote', 'horse']
-												[SPARK-34983][PYTHON] Renaming the package alias from pp to ps

### What changes were proposed in this pull request?

This PR proposes to fix:

```python
import pyspark.pandas as pp
```

to

```python
import pyspark.pandas as ps
```

### Why are the changes needed?

`pp` might sound offensive in some contexts.

### Does this PR introduce _any_ user-facing change?

The change is in master only. We'll use `ps` as the short name instead of `pp`.

### How was this patch tested?

The CI in this PR will test it out.

Closes #32108 from LSturtew/renaming_pyspark.pandas.

Authored-by: Luka Sturtewagen <luka.sturtewagen@linkit.nl>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-11 22:18:08 -04:00
+								            >>> df = ps.DataFrame({'speed': speed,
-												[SPARK-34890][PYTHON] Port/integrate Koalas main codes into PySpark

### What changes were proposed in this pull request?

As a first step of [SPARK-34849](https://issues.apache.org/jira/browse/SPARK-34849), this PR proposes porting the Koalas main code into PySpark.

This PR contains minimal changes to the existing Koalas code as follows:
1. `databricks.koalas` -> `pyspark.pandas`
2. `from databricks import koalas as ks` -> `from pyspark import pandas as pp`
3. `ks.xxx -> pp.xxx`

Other than them:
1. Added a line to `python/mypy.ini` in order to ignore the mypy test. See related issue at [SPARK-34941](https://issues.apache.org/jira/browse/SPARK-34941).
2. Added a comment to several lines in several files to ignore the flake8 F401. See related issue at [SPARK-34943](https://issues.apache.org/jira/browse/SPARK-34943).

When this PR is merged, all the features that were previously used in [Koalas](https://github.com/databricks/koalas) will be available in PySpark as well.

Users can access to the pandas API in PySpark as below:

```python
>>> from pyspark import pandas as pp
>>> ppdf = pp.DataFrame({"A": [1, 2, 3], "B": [15, 20, 25]})
>>> ppdf
   A   B
0  1  15
1  2  20
2  3  25
```

The existing "options and settings" in Koalas are also available in the same way:

```python
>>> from pyspark.pandas.config import set_option, reset_option, get_option
>>> ppser1 = pp.Series([1, 2, 3])
>>> ppser2 = pp.Series([3, 4, 5])
>>> ppser1 + ppser2
Traceback (most recent call last):
...
ValueError: Cannot combine the series or dataframe because it comes from a different dataframe. In order to allow this operation, enable 'compute.ops_on_diff_frames' option.

>>> set_option("compute.ops_on_diff_frames", True)
>>> ppser1 + ppser2
0    4
1    6
2    8
dtype: int64
```

Please also refer to the [API Reference](https://koalas.readthedocs.io/en/latest/reference/index.html) and [Options and Settings](https://koalas.readthedocs.io/en/latest/user_guide/options.html) for more detail.

**NOTE** that this PR intentionally ports the main codes of Koalas first almost as are with minimal changes because:
- Koalas project is fairly large. Making some changes together for PySpark will make it difficult to review the individual change.
    Koalas dev includes multiple Spark committers who will review. By doing this, the committers will be able to more easily and effectively review and drive the development.
- Koalas tests and documentation require major changes to make it look great together with PySpark whereas main codes do not require.
- We lately froze the Koalas codebase, and plan to work together on the initial porting. By porting the main codes first as are, it unblocks the Koalas dev to work on other items in parallel.

I promise and will make sure on:
- Rename Koalas to PySpark pandas APIs and/or pandas-on-Spark accordingly in documentation, and the docstrings and comments in the main codes.
- Triage APIs to remove that don’t make sense when Koalas is in PySpark

The documentation changes will be tracked in [SPARK-34885](https://issues.apache.org/jira/browse/SPARK-34885), the test code changes will be tracked in [SPARK-34886](https://issues.apache.org/jira/browse/SPARK-34886).

### Why are the changes needed?

Please refer to:
- [[DISCUSS] Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/DISCUSS-Support-pandas-API-layer-on-PySpark-td30945.html)
- [[VOTE] SPIP: Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/VOTE-SPIP-Support-pandas-API-layer-on-PySpark-td30996.html)

### Does this PR introduce _any_ user-facing change?

Yes, now users can use the pandas APIs on Spark

### How was this patch tested?

Manually tested for exposed major APIs and options as described above.

### Koalas contributors

Koalas would not have been possible without the following contributors:

ueshin
HyukjinKwon
rxin
xinrong-databricks
RainFung
charlesdong1991
harupy
floscha
beobest2
thunterdb
garawalid
LucasG0
shril
deepyaman
gioa
fwani
90jam
thoo
AbdealiJK
abishekganesh72
gliptak
DumbMachine
dvgodoy
stbof
nitlev
hjoo
gatorsmile
tomspur
icexelloss
awdavidson
guyao
akhilputhiry
scook12
patryk-oleniuk
tracek
dennyglee
athena15
gstaubli
WeichenXu123
hsubbaraj
lfdversluis
ktksq
shengjh
margaret-databricks
LSturtew
sllynn
manuzhang
jijosg
sadikovi

Closes #32036 from itholic/SPARK-34890.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-05 23:42:39 -04:00
+								            ...                    'lifespan': lifespan}, index=index)
 								            >>> df.plot.bar()  # doctest: +SKIP
 								        Instead of stacking, the figure can be split by column with plotly
 								        APIs.
 								        .. plotly::
 								            >>> from plotly.subplots import make_subplots
 								            >>> speed = [0.1, 17.5, 40, 48, 52, 69, 88]
 								            >>> lifespan = [2, 8, 70, 1.5, 25, 12, 28]
 								            >>> index = ['snail', 'pig', 'elephant',
 								            ...          'rabbit', 'giraffe', 'coyote', 'horse']
-												[SPARK-34983][PYTHON] Renaming the package alias from pp to ps

### What changes were proposed in this pull request?

This PR proposes to fix:

```python
import pyspark.pandas as pp
```

to

```python
import pyspark.pandas as ps
```

### Why are the changes needed?

`pp` might sound offensive in some contexts.

### Does this PR introduce _any_ user-facing change?

The change is in master only. We'll use `ps` as the short name instead of `pp`.

### How was this patch tested?

The CI in this PR will test it out.

Closes #32108 from LSturtew/renaming_pyspark.pandas.

Authored-by: Luka Sturtewagen <luka.sturtewagen@linkit.nl>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-11 22:18:08 -04:00
+								            >>> df = ps.DataFrame({'speed': speed,
-												[SPARK-34890][PYTHON] Port/integrate Koalas main codes into PySpark

### What changes were proposed in this pull request?

As a first step of [SPARK-34849](https://issues.apache.org/jira/browse/SPARK-34849), this PR proposes porting the Koalas main code into PySpark.

This PR contains minimal changes to the existing Koalas code as follows:
1. `databricks.koalas` -> `pyspark.pandas`
2. `from databricks import koalas as ks` -> `from pyspark import pandas as pp`
3. `ks.xxx -> pp.xxx`

Other than them:
1. Added a line to `python/mypy.ini` in order to ignore the mypy test. See related issue at [SPARK-34941](https://issues.apache.org/jira/browse/SPARK-34941).
2. Added a comment to several lines in several files to ignore the flake8 F401. See related issue at [SPARK-34943](https://issues.apache.org/jira/browse/SPARK-34943).

When this PR is merged, all the features that were previously used in [Koalas](https://github.com/databricks/koalas) will be available in PySpark as well.

Users can access to the pandas API in PySpark as below:

```python
>>> from pyspark import pandas as pp
>>> ppdf = pp.DataFrame({"A": [1, 2, 3], "B": [15, 20, 25]})
>>> ppdf
   A   B
0  1  15
1  2  20
2  3  25
```

The existing "options and settings" in Koalas are also available in the same way:

```python
>>> from pyspark.pandas.config import set_option, reset_option, get_option
>>> ppser1 = pp.Series([1, 2, 3])
>>> ppser2 = pp.Series([3, 4, 5])
>>> ppser1 + ppser2
Traceback (most recent call last):
...
ValueError: Cannot combine the series or dataframe because it comes from a different dataframe. In order to allow this operation, enable 'compute.ops_on_diff_frames' option.

>>> set_option("compute.ops_on_diff_frames", True)
>>> ppser1 + ppser2
0    4
1    6
2    8
dtype: int64
```

Please also refer to the [API Reference](https://koalas.readthedocs.io/en/latest/reference/index.html) and [Options and Settings](https://koalas.readthedocs.io/en/latest/user_guide/options.html) for more detail.

**NOTE** that this PR intentionally ports the main codes of Koalas first almost as are with minimal changes because:
- Koalas project is fairly large. Making some changes together for PySpark will make it difficult to review the individual change.
    Koalas dev includes multiple Spark committers who will review. By doing this, the committers will be able to more easily and effectively review and drive the development.
- Koalas tests and documentation require major changes to make it look great together with PySpark whereas main codes do not require.
- We lately froze the Koalas codebase, and plan to work together on the initial porting. By porting the main codes first as are, it unblocks the Koalas dev to work on other items in parallel.

I promise and will make sure on:
- Rename Koalas to PySpark pandas APIs and/or pandas-on-Spark accordingly in documentation, and the docstrings and comments in the main codes.
- Triage APIs to remove that don’t make sense when Koalas is in PySpark

The documentation changes will be tracked in [SPARK-34885](https://issues.apache.org/jira/browse/SPARK-34885), the test code changes will be tracked in [SPARK-34886](https://issues.apache.org/jira/browse/SPARK-34886).

### Why are the changes needed?

Please refer to:
- [[DISCUSS] Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/DISCUSS-Support-pandas-API-layer-on-PySpark-td30945.html)
- [[VOTE] SPIP: Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/VOTE-SPIP-Support-pandas-API-layer-on-PySpark-td30996.html)

### Does this PR introduce _any_ user-facing change?

Yes, now users can use the pandas APIs on Spark

### How was this patch tested?

Manually tested for exposed major APIs and options as described above.

### Koalas contributors

Koalas would not have been possible without the following contributors:

ueshin
HyukjinKwon
rxin
xinrong-databricks
RainFung
charlesdong1991
harupy
floscha
beobest2
thunterdb
garawalid
LucasG0
shril
deepyaman
gioa
fwani
90jam
thoo
AbdealiJK
abishekganesh72
gliptak
DumbMachine
dvgodoy
stbof
nitlev
hjoo
gatorsmile
tomspur
icexelloss
awdavidson
guyao
akhilputhiry
scook12
patryk-oleniuk
tracek
dennyglee
athena15
gstaubli
WeichenXu123
hsubbaraj
lfdversluis
ktksq
shengjh
margaret-databricks
LSturtew
sllynn
manuzhang
jijosg
sadikovi

Closes #32036 from itholic/SPARK-34890.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-05 23:42:39 -04:00
+								            ...                    'lifespan': lifespan}, index=index)
 								            >>> fig = (make_subplots(rows=2, cols=1)
 								            ...        .add_trace(df.plot.bar(y='speed').data[0], row=1, col=1)
 								            ...        .add_trace(df.plot.bar(y='speed').data[0], row=1, col=1)
 								            ...        .add_trace(df.plot.bar(y='lifespan').data[0], row=2, col=1))
 								            >>> fig  # doctest: +SKIP
 								        Plot a single column.
 								        .. plotly::
 								            >>> speed = [0.1, 17.5, 40, 48, 52, 69, 88]
 								            >>> lifespan = [2, 8, 70, 1.5, 25, 12, 28]
 								            >>> index = ['snail', 'pig', 'elephant',
 								            ...          'rabbit', 'giraffe', 'coyote', 'horse']
-												[SPARK-34983][PYTHON] Renaming the package alias from pp to ps

### What changes were proposed in this pull request?

This PR proposes to fix:

```python
import pyspark.pandas as pp
```

to

```python
import pyspark.pandas as ps
```

### Why are the changes needed?

`pp` might sound offensive in some contexts.

### Does this PR introduce _any_ user-facing change?

The change is in master only. We'll use `ps` as the short name instead of `pp`.

### How was this patch tested?

The CI in this PR will test it out.

Closes #32108 from LSturtew/renaming_pyspark.pandas.

Authored-by: Luka Sturtewagen <luka.sturtewagen@linkit.nl>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-11 22:18:08 -04:00
+								            >>> df = ps.DataFrame({'speed': speed,
-												[SPARK-34890][PYTHON] Port/integrate Koalas main codes into PySpark

### What changes were proposed in this pull request?

As a first step of [SPARK-34849](https://issues.apache.org/jira/browse/SPARK-34849), this PR proposes porting the Koalas main code into PySpark.

This PR contains minimal changes to the existing Koalas code as follows:
1. `databricks.koalas` -> `pyspark.pandas`
2. `from databricks import koalas as ks` -> `from pyspark import pandas as pp`
3. `ks.xxx -> pp.xxx`

Other than them:
1. Added a line to `python/mypy.ini` in order to ignore the mypy test. See related issue at [SPARK-34941](https://issues.apache.org/jira/browse/SPARK-34941).
2. Added a comment to several lines in several files to ignore the flake8 F401. See related issue at [SPARK-34943](https://issues.apache.org/jira/browse/SPARK-34943).

When this PR is merged, all the features that were previously used in [Koalas](https://github.com/databricks/koalas) will be available in PySpark as well.

Users can access to the pandas API in PySpark as below:

```python
>>> from pyspark import pandas as pp
>>> ppdf = pp.DataFrame({"A": [1, 2, 3], "B": [15, 20, 25]})
>>> ppdf
   A   B
0  1  15
1  2  20
2  3  25
```

The existing "options and settings" in Koalas are also available in the same way:

```python
>>> from pyspark.pandas.config import set_option, reset_option, get_option
>>> ppser1 = pp.Series([1, 2, 3])
>>> ppser2 = pp.Series([3, 4, 5])
>>> ppser1 + ppser2
Traceback (most recent call last):
...
ValueError: Cannot combine the series or dataframe because it comes from a different dataframe. In order to allow this operation, enable 'compute.ops_on_diff_frames' option.

>>> set_option("compute.ops_on_diff_frames", True)
>>> ppser1 + ppser2
0    4
1    6
2    8
dtype: int64
```

Please also refer to the [API Reference](https://koalas.readthedocs.io/en/latest/reference/index.html) and [Options and Settings](https://koalas.readthedocs.io/en/latest/user_guide/options.html) for more detail.

**NOTE** that this PR intentionally ports the main codes of Koalas first almost as are with minimal changes because:
- Koalas project is fairly large. Making some changes together for PySpark will make it difficult to review the individual change.
    Koalas dev includes multiple Spark committers who will review. By doing this, the committers will be able to more easily and effectively review and drive the development.
- Koalas tests and documentation require major changes to make it look great together with PySpark whereas main codes do not require.
- We lately froze the Koalas codebase, and plan to work together on the initial porting. By porting the main codes first as are, it unblocks the Koalas dev to work on other items in parallel.

I promise and will make sure on:
- Rename Koalas to PySpark pandas APIs and/or pandas-on-Spark accordingly in documentation, and the docstrings and comments in the main codes.
- Triage APIs to remove that don’t make sense when Koalas is in PySpark

The documentation changes will be tracked in [SPARK-34885](https://issues.apache.org/jira/browse/SPARK-34885), the test code changes will be tracked in [SPARK-34886](https://issues.apache.org/jira/browse/SPARK-34886).

### Why are the changes needed?

Please refer to:
- [[DISCUSS] Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/DISCUSS-Support-pandas-API-layer-on-PySpark-td30945.html)
- [[VOTE] SPIP: Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/VOTE-SPIP-Support-pandas-API-layer-on-PySpark-td30996.html)

### Does this PR introduce _any_ user-facing change?

Yes, now users can use the pandas APIs on Spark

### How was this patch tested?

Manually tested for exposed major APIs and options as described above.

### Koalas contributors

Koalas would not have been possible without the following contributors:

ueshin
HyukjinKwon
rxin
xinrong-databricks
RainFung
charlesdong1991
harupy
floscha
beobest2
thunterdb
garawalid
LucasG0
shril
deepyaman
gioa
fwani
90jam
thoo
AbdealiJK
abishekganesh72
gliptak
DumbMachine
dvgodoy
stbof
nitlev
hjoo
gatorsmile
tomspur
icexelloss
awdavidson
guyao
akhilputhiry
scook12
patryk-oleniuk
tracek
dennyglee
athena15
gstaubli
WeichenXu123
hsubbaraj
lfdversluis
ktksq
shengjh
margaret-databricks
LSturtew
sllynn
manuzhang
jijosg
sadikovi

Closes #32036 from itholic/SPARK-34890.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-05 23:42:39 -04:00
+								            ...                    'lifespan': lifespan}, index=index)
 								            >>> df.plot.bar(y='speed')  # doctest: +SKIP
 								        Plot only selected categories for the DataFrame.
 								        .. plotly::
 								            >>> speed = [0.1, 17.5, 40, 48, 52, 69, 88]
 								            >>> lifespan = [2, 8, 70, 1.5, 25, 12, 28]
 								            >>> index = ['snail', 'pig', 'elephant',
 								            ...          'rabbit', 'giraffe', 'coyote', 'horse']
-												[SPARK-34983][PYTHON] Renaming the package alias from pp to ps

### What changes were proposed in this pull request?

This PR proposes to fix:

```python
import pyspark.pandas as pp
```

to

```python
import pyspark.pandas as ps
```

### Why are the changes needed?

`pp` might sound offensive in some contexts.

### Does this PR introduce _any_ user-facing change?

The change is in master only. We'll use `ps` as the short name instead of `pp`.

### How was this patch tested?

The CI in this PR will test it out.

Closes #32108 from LSturtew/renaming_pyspark.pandas.

Authored-by: Luka Sturtewagen <luka.sturtewagen@linkit.nl>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-11 22:18:08 -04:00
+								            >>> df = ps.DataFrame({'speed': speed,
-												[SPARK-34890][PYTHON] Port/integrate Koalas main codes into PySpark

### What changes were proposed in this pull request?

As a first step of [SPARK-34849](https://issues.apache.org/jira/browse/SPARK-34849), this PR proposes porting the Koalas main code into PySpark.

This PR contains minimal changes to the existing Koalas code as follows:
1. `databricks.koalas` -> `pyspark.pandas`
2. `from databricks import koalas as ks` -> `from pyspark import pandas as pp`
3. `ks.xxx -> pp.xxx`

Other than them:
1. Added a line to `python/mypy.ini` in order to ignore the mypy test. See related issue at [SPARK-34941](https://issues.apache.org/jira/browse/SPARK-34941).
2. Added a comment to several lines in several files to ignore the flake8 F401. See related issue at [SPARK-34943](https://issues.apache.org/jira/browse/SPARK-34943).

When this PR is merged, all the features that were previously used in [Koalas](https://github.com/databricks/koalas) will be available in PySpark as well.

Users can access to the pandas API in PySpark as below:

```python
>>> from pyspark import pandas as pp
>>> ppdf = pp.DataFrame({"A": [1, 2, 3], "B": [15, 20, 25]})
>>> ppdf
   A   B
0  1  15
1  2  20
2  3  25
```

The existing "options and settings" in Koalas are also available in the same way:

```python
>>> from pyspark.pandas.config import set_option, reset_option, get_option
>>> ppser1 = pp.Series([1, 2, 3])
>>> ppser2 = pp.Series([3, 4, 5])
>>> ppser1 + ppser2
Traceback (most recent call last):
...
ValueError: Cannot combine the series or dataframe because it comes from a different dataframe. In order to allow this operation, enable 'compute.ops_on_diff_frames' option.

>>> set_option("compute.ops_on_diff_frames", True)
>>> ppser1 + ppser2
0    4
1    6
2    8
dtype: int64
```

Please also refer to the [API Reference](https://koalas.readthedocs.io/en/latest/reference/index.html) and [Options and Settings](https://koalas.readthedocs.io/en/latest/user_guide/options.html) for more detail.

**NOTE** that this PR intentionally ports the main codes of Koalas first almost as are with minimal changes because:
- Koalas project is fairly large. Making some changes together for PySpark will make it difficult to review the individual change.
    Koalas dev includes multiple Spark committers who will review. By doing this, the committers will be able to more easily and effectively review and drive the development.
- Koalas tests and documentation require major changes to make it look great together with PySpark whereas main codes do not require.
- We lately froze the Koalas codebase, and plan to work together on the initial porting. By porting the main codes first as are, it unblocks the Koalas dev to work on other items in parallel.

I promise and will make sure on:
- Rename Koalas to PySpark pandas APIs and/or pandas-on-Spark accordingly in documentation, and the docstrings and comments in the main codes.
- Triage APIs to remove that don’t make sense when Koalas is in PySpark

The documentation changes will be tracked in [SPARK-34885](https://issues.apache.org/jira/browse/SPARK-34885), the test code changes will be tracked in [SPARK-34886](https://issues.apache.org/jira/browse/SPARK-34886).

### Why are the changes needed?

Please refer to:
- [[DISCUSS] Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/DISCUSS-Support-pandas-API-layer-on-PySpark-td30945.html)
- [[VOTE] SPIP: Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/VOTE-SPIP-Support-pandas-API-layer-on-PySpark-td30996.html)

### Does this PR introduce _any_ user-facing change?

Yes, now users can use the pandas APIs on Spark

### How was this patch tested?

Manually tested for exposed major APIs and options as described above.

### Koalas contributors

Koalas would not have been possible without the following contributors:

ueshin
HyukjinKwon
rxin
xinrong-databricks
RainFung
charlesdong1991
harupy
floscha
beobest2
thunterdb
garawalid
LucasG0
shril
deepyaman
gioa
fwani
90jam
thoo
AbdealiJK
abishekganesh72
gliptak
DumbMachine
dvgodoy
stbof
nitlev
hjoo
gatorsmile
tomspur
icexelloss
awdavidson
guyao
akhilputhiry
scook12
patryk-oleniuk
tracek
dennyglee
athena15
gstaubli
WeichenXu123
hsubbaraj
lfdversluis
ktksq
shengjh
margaret-databricks
LSturtew
sllynn
manuzhang
jijosg
sadikovi

Closes #32036 from itholic/SPARK-34890.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-05 23:42:39 -04:00
+								            ...                    'lifespan': lifespan}, index=index)
 								            >>> df.plot.bar(x='lifespan')  # doctest: +SKIP
 								        """
 								        from pyspark.pandas import DataFrame, Series
 								        if isinstance(self.data, Series):
 								            return self(kind="bar", **kwds)
 								        elif isinstance(self.data, DataFrame):
 								            return self(kind="bar", x=x, y=y, **kwds)
 								    def barh(self, x=None, y=None, **kwargs):
 								        """
 								        Make a horizontal bar plot.
 								        A horizontal bar plot is a plot that presents quantitative data with
 								        rectangular bars with lengths proportional to the values that they
 								        represent. A bar plot shows comparisons among discrete categories. One
 								        axis of the plot shows the specific categories being compared, and the
 								        other axis represents a measured value.
 								        Parameters
 								        ----------
 								        x : label or position, default DataFrame.index
 								            Column to be used for categories.
 								        y : label or position, default All numeric columns in dataframe
 								            Columns to be plotted from the DataFrame.
 								        **kwds
 								            Keyword arguments to pass on to
 								            :meth:`pyspark.pandas.DataFrame.plot` or :meth:`pyspark.pandas.Series.plot`.
 								        Returns
 								        -------
 								        :class:`plotly.graph_objs.Figure`
 								            Return an custom object when ``backend!=plotly``.
 								            Return an ndarray when ``subplots=True`` (matplotlib-only).
 								        See Also
 								        --------
 								        plotly.express.bar : Plot a vertical bar plot using plotly.
 								        matplotlib.axes.Axes.bar : Plot a vertical bar plot using matplotlib.
 								        Examples
 								        --------
 								        For Series:
 								        .. plotly::
-												[SPARK-34983][PYTHON] Renaming the package alias from pp to ps

### What changes were proposed in this pull request?

This PR proposes to fix:

```python
import pyspark.pandas as pp
```

to

```python
import pyspark.pandas as ps
```

### Why are the changes needed?

`pp` might sound offensive in some contexts.

### Does this PR introduce _any_ user-facing change?

The change is in master only. We'll use `ps` as the short name instead of `pp`.

### How was this patch tested?

The CI in this PR will test it out.

Closes #32108 from LSturtew/renaming_pyspark.pandas.

Authored-by: Luka Sturtewagen <luka.sturtewagen@linkit.nl>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-11 22:18:08 -04:00
+								            >>> df = ps.DataFrame({'lab': ['A', 'B', 'C'], 'val': [10, 30, 20]})
-												[SPARK-34890][PYTHON] Port/integrate Koalas main codes into PySpark

### What changes were proposed in this pull request?

As a first step of [SPARK-34849](https://issues.apache.org/jira/browse/SPARK-34849), this PR proposes porting the Koalas main code into PySpark.

This PR contains minimal changes to the existing Koalas code as follows:
1. `databricks.koalas` -> `pyspark.pandas`
2. `from databricks import koalas as ks` -> `from pyspark import pandas as pp`
3. `ks.xxx -> pp.xxx`

Other than them:
1. Added a line to `python/mypy.ini` in order to ignore the mypy test. See related issue at [SPARK-34941](https://issues.apache.org/jira/browse/SPARK-34941).
2. Added a comment to several lines in several files to ignore the flake8 F401. See related issue at [SPARK-34943](https://issues.apache.org/jira/browse/SPARK-34943).

When this PR is merged, all the features that were previously used in [Koalas](https://github.com/databricks/koalas) will be available in PySpark as well.

Users can access to the pandas API in PySpark as below:

```python
>>> from pyspark import pandas as pp
>>> ppdf = pp.DataFrame({"A": [1, 2, 3], "B": [15, 20, 25]})
>>> ppdf
   A   B
0  1  15
1  2  20
2  3  25
```

The existing "options and settings" in Koalas are also available in the same way:

```python
>>> from pyspark.pandas.config import set_option, reset_option, get_option
>>> ppser1 = pp.Series([1, 2, 3])
>>> ppser2 = pp.Series([3, 4, 5])
>>> ppser1 + ppser2
Traceback (most recent call last):
...
ValueError: Cannot combine the series or dataframe because it comes from a different dataframe. In order to allow this operation, enable 'compute.ops_on_diff_frames' option.

>>> set_option("compute.ops_on_diff_frames", True)
>>> ppser1 + ppser2
0    4
1    6
2    8
dtype: int64
```

Please also refer to the [API Reference](https://koalas.readthedocs.io/en/latest/reference/index.html) and [Options and Settings](https://koalas.readthedocs.io/en/latest/user_guide/options.html) for more detail.

**NOTE** that this PR intentionally ports the main codes of Koalas first almost as are with minimal changes because:
- Koalas project is fairly large. Making some changes together for PySpark will make it difficult to review the individual change.
    Koalas dev includes multiple Spark committers who will review. By doing this, the committers will be able to more easily and effectively review and drive the development.
- Koalas tests and documentation require major changes to make it look great together with PySpark whereas main codes do not require.
- We lately froze the Koalas codebase, and plan to work together on the initial porting. By porting the main codes first as are, it unblocks the Koalas dev to work on other items in parallel.

I promise and will make sure on:
- Rename Koalas to PySpark pandas APIs and/or pandas-on-Spark accordingly in documentation, and the docstrings and comments in the main codes.
- Triage APIs to remove that don’t make sense when Koalas is in PySpark

The documentation changes will be tracked in [SPARK-34885](https://issues.apache.org/jira/browse/SPARK-34885), the test code changes will be tracked in [SPARK-34886](https://issues.apache.org/jira/browse/SPARK-34886).

### Why are the changes needed?

Please refer to:
- [[DISCUSS] Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/DISCUSS-Support-pandas-API-layer-on-PySpark-td30945.html)
- [[VOTE] SPIP: Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/VOTE-SPIP-Support-pandas-API-layer-on-PySpark-td30996.html)

### Does this PR introduce _any_ user-facing change?

Yes, now users can use the pandas APIs on Spark

### How was this patch tested?

Manually tested for exposed major APIs and options as described above.

### Koalas contributors

Koalas would not have been possible without the following contributors:

ueshin
HyukjinKwon
rxin
xinrong-databricks
RainFung
charlesdong1991
harupy
floscha
beobest2
thunterdb
garawalid
LucasG0
shril
deepyaman
gioa
fwani
90jam
thoo
AbdealiJK
abishekganesh72
gliptak
DumbMachine
dvgodoy
stbof
nitlev
hjoo
gatorsmile
tomspur
icexelloss
awdavidson
guyao
akhilputhiry
scook12
patryk-oleniuk
tracek
dennyglee
athena15
gstaubli
WeichenXu123
hsubbaraj
lfdversluis
ktksq
shengjh
margaret-databricks
LSturtew
sllynn
manuzhang
jijosg
sadikovi

Closes #32036 from itholic/SPARK-34890.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-05 23:42:39 -04:00
+								            >>> df.val.plot.barh()  # doctest: +SKIP
 								        For DataFrame:
 								        .. plotly::
-												[SPARK-34983][PYTHON] Renaming the package alias from pp to ps

### What changes were proposed in this pull request?

This PR proposes to fix:

```python
import pyspark.pandas as pp
```

to

```python
import pyspark.pandas as ps
```

### Why are the changes needed?

`pp` might sound offensive in some contexts.

### Does this PR introduce _any_ user-facing change?

The change is in master only. We'll use `ps` as the short name instead of `pp`.

### How was this patch tested?

The CI in this PR will test it out.

Closes #32108 from LSturtew/renaming_pyspark.pandas.

Authored-by: Luka Sturtewagen <luka.sturtewagen@linkit.nl>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-11 22:18:08 -04:00
+								            >>> df = ps.DataFrame({'lab': ['A', 'B', 'C'], 'val': [10, 30, 20]})
-												[SPARK-34890][PYTHON] Port/integrate Koalas main codes into PySpark

### What changes were proposed in this pull request?

As a first step of [SPARK-34849](https://issues.apache.org/jira/browse/SPARK-34849), this PR proposes porting the Koalas main code into PySpark.

This PR contains minimal changes to the existing Koalas code as follows:
1. `databricks.koalas` -> `pyspark.pandas`
2. `from databricks import koalas as ks` -> `from pyspark import pandas as pp`
3. `ks.xxx -> pp.xxx`

Other than them:
1. Added a line to `python/mypy.ini` in order to ignore the mypy test. See related issue at [SPARK-34941](https://issues.apache.org/jira/browse/SPARK-34941).
2. Added a comment to several lines in several files to ignore the flake8 F401. See related issue at [SPARK-34943](https://issues.apache.org/jira/browse/SPARK-34943).

When this PR is merged, all the features that were previously used in [Koalas](https://github.com/databricks/koalas) will be available in PySpark as well.

Users can access to the pandas API in PySpark as below:

```python
>>> from pyspark import pandas as pp
>>> ppdf = pp.DataFrame({"A": [1, 2, 3], "B": [15, 20, 25]})
>>> ppdf
   A   B
0  1  15
1  2  20
2  3  25
```

The existing "options and settings" in Koalas are also available in the same way:

```python
>>> from pyspark.pandas.config import set_option, reset_option, get_option
>>> ppser1 = pp.Series([1, 2, 3])
>>> ppser2 = pp.Series([3, 4, 5])
>>> ppser1 + ppser2
Traceback (most recent call last):
...
ValueError: Cannot combine the series or dataframe because it comes from a different dataframe. In order to allow this operation, enable 'compute.ops_on_diff_frames' option.

>>> set_option("compute.ops_on_diff_frames", True)
>>> ppser1 + ppser2
0    4
1    6
2    8
dtype: int64
```

Please also refer to the [API Reference](https://koalas.readthedocs.io/en/latest/reference/index.html) and [Options and Settings](https://koalas.readthedocs.io/en/latest/user_guide/options.html) for more detail.

**NOTE** that this PR intentionally ports the main codes of Koalas first almost as are with minimal changes because:
- Koalas project is fairly large. Making some changes together for PySpark will make it difficult to review the individual change.
    Koalas dev includes multiple Spark committers who will review. By doing this, the committers will be able to more easily and effectively review and drive the development.
- Koalas tests and documentation require major changes to make it look great together with PySpark whereas main codes do not require.
- We lately froze the Koalas codebase, and plan to work together on the initial porting. By porting the main codes first as are, it unblocks the Koalas dev to work on other items in parallel.

I promise and will make sure on:
- Rename Koalas to PySpark pandas APIs and/or pandas-on-Spark accordingly in documentation, and the docstrings and comments in the main codes.
- Triage APIs to remove that don’t make sense when Koalas is in PySpark

The documentation changes will be tracked in [SPARK-34885](https://issues.apache.org/jira/browse/SPARK-34885), the test code changes will be tracked in [SPARK-34886](https://issues.apache.org/jira/browse/SPARK-34886).

### Why are the changes needed?

Please refer to:
- [[DISCUSS] Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/DISCUSS-Support-pandas-API-layer-on-PySpark-td30945.html)
- [[VOTE] SPIP: Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/VOTE-SPIP-Support-pandas-API-layer-on-PySpark-td30996.html)

### Does this PR introduce _any_ user-facing change?

Yes, now users can use the pandas APIs on Spark

### How was this patch tested?

Manually tested for exposed major APIs and options as described above.

### Koalas contributors

Koalas would not have been possible without the following contributors:

ueshin
HyukjinKwon
rxin
xinrong-databricks
RainFung
charlesdong1991
harupy
floscha
beobest2
thunterdb
garawalid
LucasG0
shril
deepyaman
gioa
fwani
90jam
thoo
AbdealiJK
abishekganesh72
gliptak
DumbMachine
dvgodoy
stbof
nitlev
hjoo
gatorsmile
tomspur
icexelloss
awdavidson
guyao
akhilputhiry
scook12
patryk-oleniuk
tracek
dennyglee
athena15
gstaubli
WeichenXu123
hsubbaraj
lfdversluis
ktksq
shengjh
margaret-databricks
LSturtew
sllynn
manuzhang
jijosg
sadikovi

Closes #32036 from itholic/SPARK-34890.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-05 23:42:39 -04:00
+								            >>> df.plot.barh(x='lab', y='val')  # doctest: +SKIP
 								        Plot a whole DataFrame to a horizontal bar plot
 								        .. plotly::
 								            >>> speed = [0.1, 17.5, 40, 48, 52, 69, 88]
 								            >>> lifespan = [2, 8, 70, 1.5, 25, 12, 28]
 								            >>> index = ['snail', 'pig', 'elephant',
 								            ...          'rabbit', 'giraffe', 'coyote', 'horse']
-												[SPARK-34983][PYTHON] Renaming the package alias from pp to ps

### What changes were proposed in this pull request?

This PR proposes to fix:

```python
import pyspark.pandas as pp
```

to

```python
import pyspark.pandas as ps
```

### Why are the changes needed?

`pp` might sound offensive in some contexts.

### Does this PR introduce _any_ user-facing change?

The change is in master only. We'll use `ps` as the short name instead of `pp`.

### How was this patch tested?

The CI in this PR will test it out.

Closes #32108 from LSturtew/renaming_pyspark.pandas.

Authored-by: Luka Sturtewagen <luka.sturtewagen@linkit.nl>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-11 22:18:08 -04:00
+								            >>> df = ps.DataFrame({'speed': speed,
-												[SPARK-34890][PYTHON] Port/integrate Koalas main codes into PySpark

### What changes were proposed in this pull request?

As a first step of [SPARK-34849](https://issues.apache.org/jira/browse/SPARK-34849), this PR proposes porting the Koalas main code into PySpark.

This PR contains minimal changes to the existing Koalas code as follows:
1. `databricks.koalas` -> `pyspark.pandas`
2. `from databricks import koalas as ks` -> `from pyspark import pandas as pp`
3. `ks.xxx -> pp.xxx`

Other than them:
1. Added a line to `python/mypy.ini` in order to ignore the mypy test. See related issue at [SPARK-34941](https://issues.apache.org/jira/browse/SPARK-34941).
2. Added a comment to several lines in several files to ignore the flake8 F401. See related issue at [SPARK-34943](https://issues.apache.org/jira/browse/SPARK-34943).

When this PR is merged, all the features that were previously used in [Koalas](https://github.com/databricks/koalas) will be available in PySpark as well.

Users can access to the pandas API in PySpark as below:

```python
>>> from pyspark import pandas as pp
>>> ppdf = pp.DataFrame({"A": [1, 2, 3], "B": [15, 20, 25]})
>>> ppdf
   A   B
0  1  15
1  2  20
2  3  25
```

The existing "options and settings" in Koalas are also available in the same way:

```python
>>> from pyspark.pandas.config import set_option, reset_option, get_option
>>> ppser1 = pp.Series([1, 2, 3])
>>> ppser2 = pp.Series([3, 4, 5])
>>> ppser1 + ppser2
Traceback (most recent call last):
...
ValueError: Cannot combine the series or dataframe because it comes from a different dataframe. In order to allow this operation, enable 'compute.ops_on_diff_frames' option.

>>> set_option("compute.ops_on_diff_frames", True)
>>> ppser1 + ppser2
0    4
1    6
2    8
dtype: int64
```

Please also refer to the [API Reference](https://koalas.readthedocs.io/en/latest/reference/index.html) and [Options and Settings](https://koalas.readthedocs.io/en/latest/user_guide/options.html) for more detail.

**NOTE** that this PR intentionally ports the main codes of Koalas first almost as are with minimal changes because:
- Koalas project is fairly large. Making some changes together for PySpark will make it difficult to review the individual change.
    Koalas dev includes multiple Spark committers who will review. By doing this, the committers will be able to more easily and effectively review and drive the development.
- Koalas tests and documentation require major changes to make it look great together with PySpark whereas main codes do not require.
- We lately froze the Koalas codebase, and plan to work together on the initial porting. By porting the main codes first as are, it unblocks the Koalas dev to work on other items in parallel.

I promise and will make sure on:
- Rename Koalas to PySpark pandas APIs and/or pandas-on-Spark accordingly in documentation, and the docstrings and comments in the main codes.
- Triage APIs to remove that don’t make sense when Koalas is in PySpark

The documentation changes will be tracked in [SPARK-34885](https://issues.apache.org/jira/browse/SPARK-34885), the test code changes will be tracked in [SPARK-34886](https://issues.apache.org/jira/browse/SPARK-34886).

### Why are the changes needed?

Please refer to:
- [[DISCUSS] Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/DISCUSS-Support-pandas-API-layer-on-PySpark-td30945.html)
- [[VOTE] SPIP: Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/VOTE-SPIP-Support-pandas-API-layer-on-PySpark-td30996.html)

### Does this PR introduce _any_ user-facing change?

Yes, now users can use the pandas APIs on Spark

### How was this patch tested?

Manually tested for exposed major APIs and options as described above.

### Koalas contributors

Koalas would not have been possible without the following contributors:

ueshin
HyukjinKwon
rxin
xinrong-databricks
RainFung
charlesdong1991
harupy
floscha
beobest2
thunterdb
garawalid
LucasG0
shril
deepyaman
gioa
fwani
90jam
thoo
AbdealiJK
abishekganesh72
gliptak
DumbMachine
dvgodoy
stbof
nitlev
hjoo
gatorsmile
tomspur
icexelloss
awdavidson
guyao
akhilputhiry
scook12
patryk-oleniuk
tracek
dennyglee
athena15
gstaubli
WeichenXu123
hsubbaraj
lfdversluis
ktksq
shengjh
margaret-databricks
LSturtew
sllynn
manuzhang
jijosg
sadikovi

Closes #32036 from itholic/SPARK-34890.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-05 23:42:39 -04:00
+								            ...                    'lifespan': lifespan}, index=index)
 								            >>> df.plot.barh()  # doctest: +SKIP
 								        Plot a column of the DataFrame to a horizontal bar plot
 								        .. plotly::
 								            >>> speed = [0.1, 17.5, 40, 48, 52, 69, 88]
 								            >>> lifespan = [2, 8, 70, 1.5, 25, 12, 28]
 								            >>> index = ['snail', 'pig', 'elephant',
 								            ...          'rabbit', 'giraffe', 'coyote', 'horse']
-												[SPARK-34983][PYTHON] Renaming the package alias from pp to ps

### What changes were proposed in this pull request?

This PR proposes to fix:

```python
import pyspark.pandas as pp
```

to

```python
import pyspark.pandas as ps
```

### Why are the changes needed?

`pp` might sound offensive in some contexts.

### Does this PR introduce _any_ user-facing change?

The change is in master only. We'll use `ps` as the short name instead of `pp`.

### How was this patch tested?

The CI in this PR will test it out.

Closes #32108 from LSturtew/renaming_pyspark.pandas.

Authored-by: Luka Sturtewagen <luka.sturtewagen@linkit.nl>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-11 22:18:08 -04:00
+								            >>> df = ps.DataFrame({'speed': speed,
-												[SPARK-34890][PYTHON] Port/integrate Koalas main codes into PySpark

### What changes were proposed in this pull request?

As a first step of [SPARK-34849](https://issues.apache.org/jira/browse/SPARK-34849), this PR proposes porting the Koalas main code into PySpark.

This PR contains minimal changes to the existing Koalas code as follows:
1. `databricks.koalas` -> `pyspark.pandas`
2. `from databricks import koalas as ks` -> `from pyspark import pandas as pp`
3. `ks.xxx -> pp.xxx`

Other than them:
1. Added a line to `python/mypy.ini` in order to ignore the mypy test. See related issue at [SPARK-34941](https://issues.apache.org/jira/browse/SPARK-34941).
2. Added a comment to several lines in several files to ignore the flake8 F401. See related issue at [SPARK-34943](https://issues.apache.org/jira/browse/SPARK-34943).

When this PR is merged, all the features that were previously used in [Koalas](https://github.com/databricks/koalas) will be available in PySpark as well.

Users can access to the pandas API in PySpark as below:

```python
>>> from pyspark import pandas as pp
>>> ppdf = pp.DataFrame({"A": [1, 2, 3], "B": [15, 20, 25]})
>>> ppdf
   A   B
0  1  15
1  2  20
2  3  25
```

The existing "options and settings" in Koalas are also available in the same way:

```python
>>> from pyspark.pandas.config import set_option, reset_option, get_option
>>> ppser1 = pp.Series([1, 2, 3])
>>> ppser2 = pp.Series([3, 4, 5])
>>> ppser1 + ppser2
Traceback (most recent call last):
...
ValueError: Cannot combine the series or dataframe because it comes from a different dataframe. In order to allow this operation, enable 'compute.ops_on_diff_frames' option.

>>> set_option("compute.ops_on_diff_frames", True)
>>> ppser1 + ppser2
0    4
1    6
2    8
dtype: int64
```

Please also refer to the [API Reference](https://koalas.readthedocs.io/en/latest/reference/index.html) and [Options and Settings](https://koalas.readthedocs.io/en/latest/user_guide/options.html) for more detail.

**NOTE** that this PR intentionally ports the main codes of Koalas first almost as are with minimal changes because:
- Koalas project is fairly large. Making some changes together for PySpark will make it difficult to review the individual change.
    Koalas dev includes multiple Spark committers who will review. By doing this, the committers will be able to more easily and effectively review and drive the development.
- Koalas tests and documentation require major changes to make it look great together with PySpark whereas main codes do not require.
- We lately froze the Koalas codebase, and plan to work together on the initial porting. By porting the main codes first as are, it unblocks the Koalas dev to work on other items in parallel.

I promise and will make sure on:
- Rename Koalas to PySpark pandas APIs and/or pandas-on-Spark accordingly in documentation, and the docstrings and comments in the main codes.
- Triage APIs to remove that don’t make sense when Koalas is in PySpark

The documentation changes will be tracked in [SPARK-34885](https://issues.apache.org/jira/browse/SPARK-34885), the test code changes will be tracked in [SPARK-34886](https://issues.apache.org/jira/browse/SPARK-34886).

### Why are the changes needed?

Please refer to:
- [[DISCUSS] Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/DISCUSS-Support-pandas-API-layer-on-PySpark-td30945.html)
- [[VOTE] SPIP: Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/VOTE-SPIP-Support-pandas-API-layer-on-PySpark-td30996.html)

### Does this PR introduce _any_ user-facing change?

Yes, now users can use the pandas APIs on Spark

### How was this patch tested?

Manually tested for exposed major APIs and options as described above.

### Koalas contributors

Koalas would not have been possible without the following contributors:

ueshin
HyukjinKwon
rxin
xinrong-databricks
RainFung
charlesdong1991
harupy
floscha
beobest2
thunterdb
garawalid
LucasG0
shril
deepyaman
gioa
fwani
90jam
thoo
AbdealiJK
abishekganesh72
gliptak
DumbMachine
dvgodoy
stbof
nitlev
hjoo
gatorsmile
tomspur
icexelloss
awdavidson
guyao
akhilputhiry
scook12
patryk-oleniuk
tracek
dennyglee
athena15
gstaubli
WeichenXu123
hsubbaraj
lfdversluis
ktksq
shengjh
margaret-databricks
LSturtew
sllynn
manuzhang
jijosg
sadikovi

Closes #32036 from itholic/SPARK-34890.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-05 23:42:39 -04:00
+								            ...                    'lifespan': lifespan}, index=index)
 								            >>> df.plot.barh(y='speed')  # doctest: +SKIP
 								        Plot DataFrame versus the desired column
 								        .. plotly::
 								            >>> speed = [0.1, 17.5, 40, 48, 52, 69, 88]
 								            >>> lifespan = [2, 8, 70, 1.5, 25, 12, 28]
 								            >>> index = ['snail', 'pig', 'elephant',
 								            ...          'rabbit', 'giraffe', 'coyote', 'horse']
-												[SPARK-34983][PYTHON] Renaming the package alias from pp to ps

### What changes were proposed in this pull request?

This PR proposes to fix:

```python
import pyspark.pandas as pp
```

to

```python
import pyspark.pandas as ps
```

### Why are the changes needed?

`pp` might sound offensive in some contexts.

### Does this PR introduce _any_ user-facing change?

The change is in master only. We'll use `ps` as the short name instead of `pp`.

### How was this patch tested?

The CI in this PR will test it out.

Closes #32108 from LSturtew/renaming_pyspark.pandas.

Authored-by: Luka Sturtewagen <luka.sturtewagen@linkit.nl>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-11 22:18:08 -04:00
+								            >>> df = ps.DataFrame({'speed': speed,
-												[SPARK-34890][PYTHON] Port/integrate Koalas main codes into PySpark

### What changes were proposed in this pull request?

As a first step of [SPARK-34849](https://issues.apache.org/jira/browse/SPARK-34849), this PR proposes porting the Koalas main code into PySpark.

This PR contains minimal changes to the existing Koalas code as follows:
1. `databricks.koalas` -> `pyspark.pandas`
2. `from databricks import koalas as ks` -> `from pyspark import pandas as pp`
3. `ks.xxx -> pp.xxx`

Other than them:
1. Added a line to `python/mypy.ini` in order to ignore the mypy test. See related issue at [SPARK-34941](https://issues.apache.org/jira/browse/SPARK-34941).
2. Added a comment to several lines in several files to ignore the flake8 F401. See related issue at [SPARK-34943](https://issues.apache.org/jira/browse/SPARK-34943).

When this PR is merged, all the features that were previously used in [Koalas](https://github.com/databricks/koalas) will be available in PySpark as well.

Users can access to the pandas API in PySpark as below:

```python
>>> from pyspark import pandas as pp
>>> ppdf = pp.DataFrame({"A": [1, 2, 3], "B": [15, 20, 25]})
>>> ppdf
   A   B
0  1  15
1  2  20
2  3  25
```

The existing "options and settings" in Koalas are also available in the same way:

```python
>>> from pyspark.pandas.config import set_option, reset_option, get_option
>>> ppser1 = pp.Series([1, 2, 3])
>>> ppser2 = pp.Series([3, 4, 5])
>>> ppser1 + ppser2
Traceback (most recent call last):
...
ValueError: Cannot combine the series or dataframe because it comes from a different dataframe. In order to allow this operation, enable 'compute.ops_on_diff_frames' option.

>>> set_option("compute.ops_on_diff_frames", True)
>>> ppser1 + ppser2
0    4
1    6
2    8
dtype: int64
```

Please also refer to the [API Reference](https://koalas.readthedocs.io/en/latest/reference/index.html) and [Options and Settings](https://koalas.readthedocs.io/en/latest/user_guide/options.html) for more detail.

**NOTE** that this PR intentionally ports the main codes of Koalas first almost as are with minimal changes because:
- Koalas project is fairly large. Making some changes together for PySpark will make it difficult to review the individual change.
    Koalas dev includes multiple Spark committers who will review. By doing this, the committers will be able to more easily and effectively review and drive the development.
- Koalas tests and documentation require major changes to make it look great together with PySpark whereas main codes do not require.
- We lately froze the Koalas codebase, and plan to work together on the initial porting. By porting the main codes first as are, it unblocks the Koalas dev to work on other items in parallel.

I promise and will make sure on:
- Rename Koalas to PySpark pandas APIs and/or pandas-on-Spark accordingly in documentation, and the docstrings and comments in the main codes.
- Triage APIs to remove that don’t make sense when Koalas is in PySpark

The documentation changes will be tracked in [SPARK-34885](https://issues.apache.org/jira/browse/SPARK-34885), the test code changes will be tracked in [SPARK-34886](https://issues.apache.org/jira/browse/SPARK-34886).

### Why are the changes needed?

Please refer to:
- [[DISCUSS] Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/DISCUSS-Support-pandas-API-layer-on-PySpark-td30945.html)
- [[VOTE] SPIP: Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/VOTE-SPIP-Support-pandas-API-layer-on-PySpark-td30996.html)

### Does this PR introduce _any_ user-facing change?

Yes, now users can use the pandas APIs on Spark

### How was this patch tested?

Manually tested for exposed major APIs and options as described above.

### Koalas contributors

Koalas would not have been possible without the following contributors:

ueshin
HyukjinKwon
rxin
xinrong-databricks
RainFung
charlesdong1991
harupy
floscha
beobest2
thunterdb
garawalid
LucasG0
shril
deepyaman
gioa
fwani
90jam
thoo
AbdealiJK
abishekganesh72
gliptak
DumbMachine
dvgodoy
stbof
nitlev
hjoo
gatorsmile
tomspur
icexelloss
awdavidson
guyao
akhilputhiry
scook12
patryk-oleniuk
tracek
dennyglee
athena15
gstaubli
WeichenXu123
hsubbaraj
lfdversluis
ktksq
shengjh
margaret-databricks
LSturtew
sllynn
manuzhang
jijosg
sadikovi

Closes #32036 from itholic/SPARK-34890.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-05 23:42:39 -04:00
+								            ...                    'lifespan': lifespan}, index=index)
 								            >>> df.plot.barh(x='lifespan')  # doctest: +SKIP
 								        """
 								        from pyspark.pandas import DataFrame, Series
 								        if isinstance(self.data, Series):
 								            return self(kind="barh", **kwargs)
 								        elif isinstance(self.data, DataFrame):
 								            return self(kind="barh", x=x, y=y, **kwargs)
 								    def box(self, **kwds):
 								        """
 								        Make a box plot of the Series columns.
 								        Parameters
 								        ----------
 								        **kwds : optional
 								            Additional keyword arguments are documented in
 								            :meth:`Koalas.Series.plot`.
 								        precision: scalar, default = 0.01
 								            This argument is used by Koalas to compute approximate statistics
 								            for building a boxplot. Use *smaller* values to get more precise
 								            statistics (matplotlib-only).
 								        Returns
 								        -------
 								        :class:`plotly.graph_objs.Figure`
 								            Return an custom object when ``backend!=plotly``.
 								            Return an ndarray when ``subplots=True`` (matplotlib-only).
 								        Notes
 								        -----
 								        There are behavior differences between Koalas and pandas.
 								        * Koalas computes approximate statistics - expect differences between
 								          pandas and Koalas boxplots, especially regarding 1st and 3rd quartiles.
 								        * The `whis` argument is only supported as a single number.
 								        * Koalas doesn't support the following argument(s) (matplotlib-only).
 								          * `bootstrap` argument is not supported
 								          * `autorange` argument is not supported
 								        Examples
 								        --------
 								        Draw a box plot from a DataFrame with four columns of randomly
 								        generated data.
 								        For Series:
 								        .. plotly::
 								            >>> data = np.random.randn(25, 4)
-												[SPARK-34983][PYTHON] Renaming the package alias from pp to ps

### What changes were proposed in this pull request?

This PR proposes to fix:

```python
import pyspark.pandas as pp
```

to

```python
import pyspark.pandas as ps
```

### Why are the changes needed?

`pp` might sound offensive in some contexts.

### Does this PR introduce _any_ user-facing change?

The change is in master only. We'll use `ps` as the short name instead of `pp`.

### How was this patch tested?

The CI in this PR will test it out.

Closes #32108 from LSturtew/renaming_pyspark.pandas.

Authored-by: Luka Sturtewagen <luka.sturtewagen@linkit.nl>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-11 22:18:08 -04:00
+								            >>> df = ps.DataFrame(data, columns=list('ABCD'))
-												[SPARK-34890][PYTHON] Port/integrate Koalas main codes into PySpark

### What changes were proposed in this pull request?

As a first step of [SPARK-34849](https://issues.apache.org/jira/browse/SPARK-34849), this PR proposes porting the Koalas main code into PySpark.

This PR contains minimal changes to the existing Koalas code as follows:
1. `databricks.koalas` -> `pyspark.pandas`
2. `from databricks import koalas as ks` -> `from pyspark import pandas as pp`
3. `ks.xxx -> pp.xxx`

Other than them:
1. Added a line to `python/mypy.ini` in order to ignore the mypy test. See related issue at [SPARK-34941](https://issues.apache.org/jira/browse/SPARK-34941).
2. Added a comment to several lines in several files to ignore the flake8 F401. See related issue at [SPARK-34943](https://issues.apache.org/jira/browse/SPARK-34943).

When this PR is merged, all the features that were previously used in [Koalas](https://github.com/databricks/koalas) will be available in PySpark as well.

Users can access to the pandas API in PySpark as below:

```python
>>> from pyspark import pandas as pp
>>> ppdf = pp.DataFrame({"A": [1, 2, 3], "B": [15, 20, 25]})
>>> ppdf
   A   B
0  1  15
1  2  20
2  3  25
```

The existing "options and settings" in Koalas are also available in the same way:

```python
>>> from pyspark.pandas.config import set_option, reset_option, get_option
>>> ppser1 = pp.Series([1, 2, 3])
>>> ppser2 = pp.Series([3, 4, 5])
>>> ppser1 + ppser2
Traceback (most recent call last):
...
ValueError: Cannot combine the series or dataframe because it comes from a different dataframe. In order to allow this operation, enable 'compute.ops_on_diff_frames' option.

>>> set_option("compute.ops_on_diff_frames", True)
>>> ppser1 + ppser2
0    4
1    6
2    8
dtype: int64
```

Please also refer to the [API Reference](https://koalas.readthedocs.io/en/latest/reference/index.html) and [Options and Settings](https://koalas.readthedocs.io/en/latest/user_guide/options.html) for more detail.

**NOTE** that this PR intentionally ports the main codes of Koalas first almost as are with minimal changes because:
- Koalas project is fairly large. Making some changes together for PySpark will make it difficult to review the individual change.
    Koalas dev includes multiple Spark committers who will review. By doing this, the committers will be able to more easily and effectively review and drive the development.
- Koalas tests and documentation require major changes to make it look great together with PySpark whereas main codes do not require.
- We lately froze the Koalas codebase, and plan to work together on the initial porting. By porting the main codes first as are, it unblocks the Koalas dev to work on other items in parallel.

I promise and will make sure on:
- Rename Koalas to PySpark pandas APIs and/or pandas-on-Spark accordingly in documentation, and the docstrings and comments in the main codes.
- Triage APIs to remove that don’t make sense when Koalas is in PySpark

The documentation changes will be tracked in [SPARK-34885](https://issues.apache.org/jira/browse/SPARK-34885), the test code changes will be tracked in [SPARK-34886](https://issues.apache.org/jira/browse/SPARK-34886).

### Why are the changes needed?

Please refer to:
- [[DISCUSS] Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/DISCUSS-Support-pandas-API-layer-on-PySpark-td30945.html)
- [[VOTE] SPIP: Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/VOTE-SPIP-Support-pandas-API-layer-on-PySpark-td30996.html)

### Does this PR introduce _any_ user-facing change?

Yes, now users can use the pandas APIs on Spark

### How was this patch tested?

Manually tested for exposed major APIs and options as described above.

### Koalas contributors

Koalas would not have been possible without the following contributors:

ueshin
HyukjinKwon
rxin
xinrong-databricks
RainFung
charlesdong1991
harupy
floscha
beobest2
thunterdb
garawalid
LucasG0
shril
deepyaman
gioa
fwani
90jam
thoo
AbdealiJK
abishekganesh72
gliptak
DumbMachine
dvgodoy
stbof
nitlev
hjoo
gatorsmile
tomspur
icexelloss
awdavidson
guyao
akhilputhiry
scook12
patryk-oleniuk
tracek
dennyglee
athena15
gstaubli
WeichenXu123
hsubbaraj
lfdversluis
ktksq
shengjh
margaret-databricks
LSturtew
sllynn
manuzhang
jijosg
sadikovi

Closes #32036 from itholic/SPARK-34890.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-05 23:42:39 -04:00
+								            >>> df['A'].plot.box()  # doctest: +SKIP
 								        This is an unsupported function for DataFrame type
 								        """
 								        from pyspark.pandas import DataFrame, Series
 								        if isinstance(self.data, Series):
 								            return self(kind="box", **kwds)
 								        elif isinstance(self.data, DataFrame):
 								            return unsupported_function(class_name="pd.DataFrame", method_name="box")()
 								    def hist(self, bins=10, **kwds):
 								        """
 								        Draw one histogram of the DataFrame’s columns.
 								        A `histogram`_ is a representation of the distribution of data.
 								        This function calls :meth:`plotting.backend.plot`,
 								        on each series in the DataFrame, resulting in one histogram per column.
 								        .. _histogram: https://en.wikipedia.org/wiki/Histogram
 								        Parameters
 								        ----------
 								        bins : integer or sequence, default 10
 								            Number of histogram bins to be used. If an integer is given, bins + 1
 								            bin edges are calculated and returned. If bins is a sequence, gives
 								            bin edges, including left edge of first bin and right edge of last
 								            bin. In this case, bins is returned unmodified.
 								        **kwds
 								            All other plotting keyword arguments to be passed to
 								            plotting backend.
 								        Returns
 								        -------
 								        :class:`plotly.graph_objs.Figure`
 								            Return an custom object when ``backend!=plotly``.
 								            Return an ndarray when ``subplots=True`` (matplotlib-only).
 								        Examples
 								        --------
 								        Basic plot.
 								        For Series:
 								        .. plotly::
-												[SPARK-34983][PYTHON] Renaming the package alias from pp to ps

### What changes were proposed in this pull request?

This PR proposes to fix:

```python
import pyspark.pandas as pp
```

to

```python
import pyspark.pandas as ps
```

### Why are the changes needed?

`pp` might sound offensive in some contexts.

### Does this PR introduce _any_ user-facing change?

The change is in master only. We'll use `ps` as the short name instead of `pp`.

### How was this patch tested?

The CI in this PR will test it out.

Closes #32108 from LSturtew/renaming_pyspark.pandas.

Authored-by: Luka Sturtewagen <luka.sturtewagen@linkit.nl>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-11 22:18:08 -04:00
+								            >>> s = ps.Series([1, 3, 2])
-												[SPARK-34890][PYTHON] Port/integrate Koalas main codes into PySpark

### What changes were proposed in this pull request?

As a first step of [SPARK-34849](https://issues.apache.org/jira/browse/SPARK-34849), this PR proposes porting the Koalas main code into PySpark.

This PR contains minimal changes to the existing Koalas code as follows:
1. `databricks.koalas` -> `pyspark.pandas`
2. `from databricks import koalas as ks` -> `from pyspark import pandas as pp`
3. `ks.xxx -> pp.xxx`

Other than them:
1. Added a line to `python/mypy.ini` in order to ignore the mypy test. See related issue at [SPARK-34941](https://issues.apache.org/jira/browse/SPARK-34941).
2. Added a comment to several lines in several files to ignore the flake8 F401. See related issue at [SPARK-34943](https://issues.apache.org/jira/browse/SPARK-34943).

When this PR is merged, all the features that were previously used in [Koalas](https://github.com/databricks/koalas) will be available in PySpark as well.

Users can access to the pandas API in PySpark as below:

```python
>>> from pyspark import pandas as pp
>>> ppdf = pp.DataFrame({"A": [1, 2, 3], "B": [15, 20, 25]})
>>> ppdf
   A   B
0  1  15
1  2  20
2  3  25
```

The existing "options and settings" in Koalas are also available in the same way:

```python
>>> from pyspark.pandas.config import set_option, reset_option, get_option
>>> ppser1 = pp.Series([1, 2, 3])
>>> ppser2 = pp.Series([3, 4, 5])
>>> ppser1 + ppser2
Traceback (most recent call last):
...
ValueError: Cannot combine the series or dataframe because it comes from a different dataframe. In order to allow this operation, enable 'compute.ops_on_diff_frames' option.

>>> set_option("compute.ops_on_diff_frames", True)
>>> ppser1 + ppser2
0    4
1    6
2    8
dtype: int64
```

Please also refer to the [API Reference](https://koalas.readthedocs.io/en/latest/reference/index.html) and [Options and Settings](https://koalas.readthedocs.io/en/latest/user_guide/options.html) for more detail.

**NOTE** that this PR intentionally ports the main codes of Koalas first almost as are with minimal changes because:
- Koalas project is fairly large. Making some changes together for PySpark will make it difficult to review the individual change.
    Koalas dev includes multiple Spark committers who will review. By doing this, the committers will be able to more easily and effectively review and drive the development.
- Koalas tests and documentation require major changes to make it look great together with PySpark whereas main codes do not require.
- We lately froze the Koalas codebase, and plan to work together on the initial porting. By porting the main codes first as are, it unblocks the Koalas dev to work on other items in parallel.

I promise and will make sure on:
- Rename Koalas to PySpark pandas APIs and/or pandas-on-Spark accordingly in documentation, and the docstrings and comments in the main codes.
- Triage APIs to remove that don’t make sense when Koalas is in PySpark

The documentation changes will be tracked in [SPARK-34885](https://issues.apache.org/jira/browse/SPARK-34885), the test code changes will be tracked in [SPARK-34886](https://issues.apache.org/jira/browse/SPARK-34886).

### Why are the changes needed?

Please refer to:
- [[DISCUSS] Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/DISCUSS-Support-pandas-API-layer-on-PySpark-td30945.html)
- [[VOTE] SPIP: Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/VOTE-SPIP-Support-pandas-API-layer-on-PySpark-td30996.html)

### Does this PR introduce _any_ user-facing change?

Yes, now users can use the pandas APIs on Spark

### How was this patch tested?

Manually tested for exposed major APIs and options as described above.

### Koalas contributors

Koalas would not have been possible without the following contributors:

ueshin
HyukjinKwon
rxin
xinrong-databricks
RainFung
charlesdong1991
harupy
floscha
beobest2
thunterdb
garawalid
LucasG0
shril
deepyaman
gioa
fwani
90jam
thoo
AbdealiJK
abishekganesh72
gliptak
DumbMachine
dvgodoy
stbof
nitlev
hjoo
gatorsmile
tomspur
icexelloss
awdavidson
guyao
akhilputhiry
scook12
patryk-oleniuk
tracek
dennyglee
athena15
gstaubli
WeichenXu123
hsubbaraj
lfdversluis
ktksq
shengjh
margaret-databricks
LSturtew
sllynn
manuzhang
jijosg
sadikovi

Closes #32036 from itholic/SPARK-34890.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-05 23:42:39 -04:00
+								            >>> s.plot.hist()  # doctest: +SKIP
 								        For DataFrame:
 								        .. plotly::
 								            >>> df = pd.DataFrame(
 								            ...     np.random.randint(1, 7, 6000),
 								            ...     columns=['one'])
 								            >>> df['two'] = df['one'] + np.random.randint(1, 7, 6000)
-												[SPARK-34983][PYTHON] Renaming the package alias from pp to ps

### What changes were proposed in this pull request?

This PR proposes to fix:

```python
import pyspark.pandas as pp
```

to

```python
import pyspark.pandas as ps
```

### Why are the changes needed?

`pp` might sound offensive in some contexts.

### Does this PR introduce _any_ user-facing change?

The change is in master only. We'll use `ps` as the short name instead of `pp`.

### How was this patch tested?

The CI in this PR will test it out.

Closes #32108 from LSturtew/renaming_pyspark.pandas.

Authored-by: Luka Sturtewagen <luka.sturtewagen@linkit.nl>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-11 22:18:08 -04:00
+								            >>> df = ps.from_pandas(df)
-												[SPARK-34890][PYTHON] Port/integrate Koalas main codes into PySpark

### What changes were proposed in this pull request?

As a first step of [SPARK-34849](https://issues.apache.org/jira/browse/SPARK-34849), this PR proposes porting the Koalas main code into PySpark.

This PR contains minimal changes to the existing Koalas code as follows:
1. `databricks.koalas` -> `pyspark.pandas`
2. `from databricks import koalas as ks` -> `from pyspark import pandas as pp`
3. `ks.xxx -> pp.xxx`

Other than them:
1. Added a line to `python/mypy.ini` in order to ignore the mypy test. See related issue at [SPARK-34941](https://issues.apache.org/jira/browse/SPARK-34941).
2. Added a comment to several lines in several files to ignore the flake8 F401. See related issue at [SPARK-34943](https://issues.apache.org/jira/browse/SPARK-34943).

When this PR is merged, all the features that were previously used in [Koalas](https://github.com/databricks/koalas) will be available in PySpark as well.

Users can access to the pandas API in PySpark as below:

```python
>>> from pyspark import pandas as pp
>>> ppdf = pp.DataFrame({"A": [1, 2, 3], "B": [15, 20, 25]})
>>> ppdf
   A   B
0  1  15
1  2  20
2  3  25
```

The existing "options and settings" in Koalas are also available in the same way:

```python
>>> from pyspark.pandas.config import set_option, reset_option, get_option
>>> ppser1 = pp.Series([1, 2, 3])
>>> ppser2 = pp.Series([3, 4, 5])
>>> ppser1 + ppser2
Traceback (most recent call last):
...
ValueError: Cannot combine the series or dataframe because it comes from a different dataframe. In order to allow this operation, enable 'compute.ops_on_diff_frames' option.

>>> set_option("compute.ops_on_diff_frames", True)
>>> ppser1 + ppser2
0    4
1    6
2    8
dtype: int64
```

Please also refer to the [API Reference](https://koalas.readthedocs.io/en/latest/reference/index.html) and [Options and Settings](https://koalas.readthedocs.io/en/latest/user_guide/options.html) for more detail.

**NOTE** that this PR intentionally ports the main codes of Koalas first almost as are with minimal changes because:
- Koalas project is fairly large. Making some changes together for PySpark will make it difficult to review the individual change.
    Koalas dev includes multiple Spark committers who will review. By doing this, the committers will be able to more easily and effectively review and drive the development.
- Koalas tests and documentation require major changes to make it look great together with PySpark whereas main codes do not require.
- We lately froze the Koalas codebase, and plan to work together on the initial porting. By porting the main codes first as are, it unblocks the Koalas dev to work on other items in parallel.

I promise and will make sure on:
- Rename Koalas to PySpark pandas APIs and/or pandas-on-Spark accordingly in documentation, and the docstrings and comments in the main codes.
- Triage APIs to remove that don’t make sense when Koalas is in PySpark

The documentation changes will be tracked in [SPARK-34885](https://issues.apache.org/jira/browse/SPARK-34885), the test code changes will be tracked in [SPARK-34886](https://issues.apache.org/jira/browse/SPARK-34886).

### Why are the changes needed?

Please refer to:
- [[DISCUSS] Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/DISCUSS-Support-pandas-API-layer-on-PySpark-td30945.html)
- [[VOTE] SPIP: Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/VOTE-SPIP-Support-pandas-API-layer-on-PySpark-td30996.html)

### Does this PR introduce _any_ user-facing change?

Yes, now users can use the pandas APIs on Spark

### How was this patch tested?

Manually tested for exposed major APIs and options as described above.

### Koalas contributors

Koalas would not have been possible without the following contributors:

ueshin
HyukjinKwon
rxin
xinrong-databricks
RainFung
charlesdong1991
harupy
floscha
beobest2
thunterdb
garawalid
LucasG0
shril
deepyaman
gioa
fwani
90jam
thoo
AbdealiJK
abishekganesh72
gliptak
DumbMachine
dvgodoy
stbof
nitlev
hjoo
gatorsmile
tomspur
icexelloss
awdavidson
guyao
akhilputhiry
scook12
patryk-oleniuk
tracek
dennyglee
athena15
gstaubli
WeichenXu123
hsubbaraj
lfdversluis
ktksq
shengjh
margaret-databricks
LSturtew
sllynn
manuzhang
jijosg
sadikovi

Closes #32036 from itholic/SPARK-34890.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-05 23:42:39 -04:00
+								            >>> df.plot.hist(bins=12, alpha=0.5)  # doctest: +SKIP
 								        """
 								        return self(kind="hist", bins=bins, **kwds)
 								    def kde(self, bw_method=None, ind=None, **kwargs):
 								        """
 								        Generate Kernel Density Estimate plot using Gaussian kernels.
 								        Parameters
 								        ----------
 								        bw_method : scalar
 								            The method used to calculate the estimator bandwidth.
 								            See KernelDensity in PySpark for more information.
 								        ind : NumPy array or integer, optional
 								            Evaluation points for the estimated PDF. If None (default),
 equally spaced points are used. If `ind` is a NumPy array, the
 								            KDE is evaluated at the points passed. If `ind` is an integer,
 								            `ind` number of equally spaced points are used.
 								        **kwargs : optional
 								            Keyword arguments to pass on to :meth:`Koalas.Series.plot`.
 								        Returns
 								        -------
 								        :class:`plotly.graph_objs.Figure`
 								            Return an custom object when ``backend!=plotly``.
 								            Return an ndarray when ``subplots=True`` (matplotlib-only).
 								        Examples
 								        --------
 								        A scalar bandwidth should be specified. Using a small bandwidth value can
 								        lead to over-fitting, while using a large bandwidth value may result
 								        in under-fitting:
 								        .. plotly::
-												[SPARK-34983][PYTHON] Renaming the package alias from pp to ps

### What changes were proposed in this pull request?

This PR proposes to fix:

```python
import pyspark.pandas as pp
```

to

```python
import pyspark.pandas as ps
```

### Why are the changes needed?

`pp` might sound offensive in some contexts.

### Does this PR introduce _any_ user-facing change?

The change is in master only. We'll use `ps` as the short name instead of `pp`.

### How was this patch tested?

The CI in this PR will test it out.

Closes #32108 from LSturtew/renaming_pyspark.pandas.

Authored-by: Luka Sturtewagen <luka.sturtewagen@linkit.nl>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-11 22:18:08 -04:00
+								            >>> s = ps.Series([1, 2, 2.5, 3, 3.5, 4, 5])
-												[SPARK-34890][PYTHON] Port/integrate Koalas main codes into PySpark

### What changes were proposed in this pull request?

As a first step of [SPARK-34849](https://issues.apache.org/jira/browse/SPARK-34849), this PR proposes porting the Koalas main code into PySpark.

This PR contains minimal changes to the existing Koalas code as follows:
1. `databricks.koalas` -> `pyspark.pandas`
2. `from databricks import koalas as ks` -> `from pyspark import pandas as pp`
3. `ks.xxx -> pp.xxx`

Other than them:
1. Added a line to `python/mypy.ini` in order to ignore the mypy test. See related issue at [SPARK-34941](https://issues.apache.org/jira/browse/SPARK-34941).
2. Added a comment to several lines in several files to ignore the flake8 F401. See related issue at [SPARK-34943](https://issues.apache.org/jira/browse/SPARK-34943).

When this PR is merged, all the features that were previously used in [Koalas](https://github.com/databricks/koalas) will be available in PySpark as well.

Users can access to the pandas API in PySpark as below:

```python
>>> from pyspark import pandas as pp
>>> ppdf = pp.DataFrame({"A": [1, 2, 3], "B": [15, 20, 25]})
>>> ppdf
   A   B
0  1  15
1  2  20
2  3  25
```

The existing "options and settings" in Koalas are also available in the same way:

```python
>>> from pyspark.pandas.config import set_option, reset_option, get_option
>>> ppser1 = pp.Series([1, 2, 3])
>>> ppser2 = pp.Series([3, 4, 5])
>>> ppser1 + ppser2
Traceback (most recent call last):
...
ValueError: Cannot combine the series or dataframe because it comes from a different dataframe. In order to allow this operation, enable 'compute.ops_on_diff_frames' option.

>>> set_option("compute.ops_on_diff_frames", True)
>>> ppser1 + ppser2
0    4
1    6
2    8
dtype: int64
```

Please also refer to the [API Reference](https://koalas.readthedocs.io/en/latest/reference/index.html) and [Options and Settings](https://koalas.readthedocs.io/en/latest/user_guide/options.html) for more detail.

**NOTE** that this PR intentionally ports the main codes of Koalas first almost as are with minimal changes because:
- Koalas project is fairly large. Making some changes together for PySpark will make it difficult to review the individual change.
    Koalas dev includes multiple Spark committers who will review. By doing this, the committers will be able to more easily and effectively review and drive the development.
- Koalas tests and documentation require major changes to make it look great together with PySpark whereas main codes do not require.
- We lately froze the Koalas codebase, and plan to work together on the initial porting. By porting the main codes first as are, it unblocks the Koalas dev to work on other items in parallel.

I promise and will make sure on:
- Rename Koalas to PySpark pandas APIs and/or pandas-on-Spark accordingly in documentation, and the docstrings and comments in the main codes.
- Triage APIs to remove that don’t make sense when Koalas is in PySpark

The documentation changes will be tracked in [SPARK-34885](https://issues.apache.org/jira/browse/SPARK-34885), the test code changes will be tracked in [SPARK-34886](https://issues.apache.org/jira/browse/SPARK-34886).

### Why are the changes needed?

Please refer to:
- [[DISCUSS] Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/DISCUSS-Support-pandas-API-layer-on-PySpark-td30945.html)
- [[VOTE] SPIP: Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/VOTE-SPIP-Support-pandas-API-layer-on-PySpark-td30996.html)

### Does this PR introduce _any_ user-facing change?

Yes, now users can use the pandas APIs on Spark

### How was this patch tested?

Manually tested for exposed major APIs and options as described above.

### Koalas contributors

Koalas would not have been possible without the following contributors:

ueshin
HyukjinKwon
rxin
xinrong-databricks
RainFung
charlesdong1991
harupy
floscha
beobest2
thunterdb
garawalid
LucasG0
shril
deepyaman
gioa
fwani
90jam
thoo
AbdealiJK
abishekganesh72
gliptak
DumbMachine
dvgodoy
stbof
nitlev
hjoo
gatorsmile
tomspur
icexelloss
awdavidson
guyao
akhilputhiry
scook12
patryk-oleniuk
tracek
dennyglee
athena15
gstaubli
WeichenXu123
hsubbaraj
lfdversluis
ktksq
shengjh
margaret-databricks
LSturtew
sllynn
manuzhang
jijosg
sadikovi

Closes #32036 from itholic/SPARK-34890.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-05 23:42:39 -04:00
+								            >>> s.plot.kde(bw_method=0.3)  # doctest: +SKIP
 								        .. plotly::
-												[SPARK-34983][PYTHON] Renaming the package alias from pp to ps

### What changes were proposed in this pull request?

This PR proposes to fix:

```python
import pyspark.pandas as pp
```

to

```python
import pyspark.pandas as ps
```

### Why are the changes needed?

`pp` might sound offensive in some contexts.

### Does this PR introduce _any_ user-facing change?

The change is in master only. We'll use `ps` as the short name instead of `pp`.

### How was this patch tested?

The CI in this PR will test it out.

Closes #32108 from LSturtew/renaming_pyspark.pandas.

Authored-by: Luka Sturtewagen <luka.sturtewagen@linkit.nl>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-11 22:18:08 -04:00
+								            >>> s = ps.Series([1, 2, 2.5, 3, 3.5, 4, 5])
-												[SPARK-34890][PYTHON] Port/integrate Koalas main codes into PySpark

### What changes were proposed in this pull request?

As a first step of [SPARK-34849](https://issues.apache.org/jira/browse/SPARK-34849), this PR proposes porting the Koalas main code into PySpark.

This PR contains minimal changes to the existing Koalas code as follows:
1. `databricks.koalas` -> `pyspark.pandas`
2. `from databricks import koalas as ks` -> `from pyspark import pandas as pp`
3. `ks.xxx -> pp.xxx`

Other than them:
1. Added a line to `python/mypy.ini` in order to ignore the mypy test. See related issue at [SPARK-34941](https://issues.apache.org/jira/browse/SPARK-34941).
2. Added a comment to several lines in several files to ignore the flake8 F401. See related issue at [SPARK-34943](https://issues.apache.org/jira/browse/SPARK-34943).

When this PR is merged, all the features that were previously used in [Koalas](https://github.com/databricks/koalas) will be available in PySpark as well.

Users can access to the pandas API in PySpark as below:

```python
>>> from pyspark import pandas as pp
>>> ppdf = pp.DataFrame({"A": [1, 2, 3], "B": [15, 20, 25]})
>>> ppdf
   A   B
0  1  15
1  2  20
2  3  25
```

The existing "options and settings" in Koalas are also available in the same way:

```python
>>> from pyspark.pandas.config import set_option, reset_option, get_option
>>> ppser1 = pp.Series([1, 2, 3])
>>> ppser2 = pp.Series([3, 4, 5])
>>> ppser1 + ppser2
Traceback (most recent call last):
...
ValueError: Cannot combine the series or dataframe because it comes from a different dataframe. In order to allow this operation, enable 'compute.ops_on_diff_frames' option.

>>> set_option("compute.ops_on_diff_frames", True)
>>> ppser1 + ppser2
0    4
1    6
2    8
dtype: int64
```

Please also refer to the [API Reference](https://koalas.readthedocs.io/en/latest/reference/index.html) and [Options and Settings](https://koalas.readthedocs.io/en/latest/user_guide/options.html) for more detail.

**NOTE** that this PR intentionally ports the main codes of Koalas first almost as are with minimal changes because:
- Koalas project is fairly large. Making some changes together for PySpark will make it difficult to review the individual change.
    Koalas dev includes multiple Spark committers who will review. By doing this, the committers will be able to more easily and effectively review and drive the development.
- Koalas tests and documentation require major changes to make it look great together with PySpark whereas main codes do not require.
- We lately froze the Koalas codebase, and plan to work together on the initial porting. By porting the main codes first as are, it unblocks the Koalas dev to work on other items in parallel.

I promise and will make sure on:
- Rename Koalas to PySpark pandas APIs and/or pandas-on-Spark accordingly in documentation, and the docstrings and comments in the main codes.
- Triage APIs to remove that don’t make sense when Koalas is in PySpark

The documentation changes will be tracked in [SPARK-34885](https://issues.apache.org/jira/browse/SPARK-34885), the test code changes will be tracked in [SPARK-34886](https://issues.apache.org/jira/browse/SPARK-34886).

### Why are the changes needed?

Please refer to:
- [[DISCUSS] Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/DISCUSS-Support-pandas-API-layer-on-PySpark-td30945.html)
- [[VOTE] SPIP: Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/VOTE-SPIP-Support-pandas-API-layer-on-PySpark-td30996.html)

### Does this PR introduce _any_ user-facing change?

Yes, now users can use the pandas APIs on Spark

### How was this patch tested?

Manually tested for exposed major APIs and options as described above.

### Koalas contributors

Koalas would not have been possible without the following contributors:

ueshin
HyukjinKwon
rxin
xinrong-databricks
RainFung
charlesdong1991
harupy
floscha
beobest2
thunterdb
garawalid
LucasG0
shril
deepyaman
gioa
fwani
90jam
thoo
AbdealiJK
abishekganesh72
gliptak
DumbMachine
dvgodoy
stbof
nitlev
hjoo
gatorsmile
tomspur
icexelloss
awdavidson
guyao
akhilputhiry
scook12
patryk-oleniuk
tracek
dennyglee
athena15
gstaubli
WeichenXu123
hsubbaraj
lfdversluis
ktksq
shengjh
margaret-databricks
LSturtew
sllynn
manuzhang
jijosg
sadikovi

Closes #32036 from itholic/SPARK-34890.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-05 23:42:39 -04:00
+								            >>> s.plot.kde(bw_method=3)  # doctest: +SKIP
 								        The `ind` parameter determines the evaluation points for the
 								        plot of the estimated KDF:
 								        .. plotly::
-												[SPARK-34983][PYTHON] Renaming the package alias from pp to ps

### What changes were proposed in this pull request?

This PR proposes to fix:

```python
import pyspark.pandas as pp
```

to

```python
import pyspark.pandas as ps
```

### Why are the changes needed?

`pp` might sound offensive in some contexts.

### Does this PR introduce _any_ user-facing change?

The change is in master only. We'll use `ps` as the short name instead of `pp`.

### How was this patch tested?

The CI in this PR will test it out.

Closes #32108 from LSturtew/renaming_pyspark.pandas.

Authored-by: Luka Sturtewagen <luka.sturtewagen@linkit.nl>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-11 22:18:08 -04:00
+								            >>> s = ps.Series([1, 2, 2.5, 3, 3.5, 4, 5])
-												[SPARK-34890][PYTHON] Port/integrate Koalas main codes into PySpark

### What changes were proposed in this pull request?

As a first step of [SPARK-34849](https://issues.apache.org/jira/browse/SPARK-34849), this PR proposes porting the Koalas main code into PySpark.

This PR contains minimal changes to the existing Koalas code as follows:
1. `databricks.koalas` -> `pyspark.pandas`
2. `from databricks import koalas as ks` -> `from pyspark import pandas as pp`
3. `ks.xxx -> pp.xxx`

Other than them:
1. Added a line to `python/mypy.ini` in order to ignore the mypy test. See related issue at [SPARK-34941](https://issues.apache.org/jira/browse/SPARK-34941).
2. Added a comment to several lines in several files to ignore the flake8 F401. See related issue at [SPARK-34943](https://issues.apache.org/jira/browse/SPARK-34943).

When this PR is merged, all the features that were previously used in [Koalas](https://github.com/databricks/koalas) will be available in PySpark as well.

Users can access to the pandas API in PySpark as below:

```python
>>> from pyspark import pandas as pp
>>> ppdf = pp.DataFrame({"A": [1, 2, 3], "B": [15, 20, 25]})
>>> ppdf
   A   B
0  1  15
1  2  20
2  3  25
```

The existing "options and settings" in Koalas are also available in the same way:

```python
>>> from pyspark.pandas.config import set_option, reset_option, get_option
>>> ppser1 = pp.Series([1, 2, 3])
>>> ppser2 = pp.Series([3, 4, 5])
>>> ppser1 + ppser2
Traceback (most recent call last):
...
ValueError: Cannot combine the series or dataframe because it comes from a different dataframe. In order to allow this operation, enable 'compute.ops_on_diff_frames' option.

>>> set_option("compute.ops_on_diff_frames", True)
>>> ppser1 + ppser2
0    4
1    6
2    8
dtype: int64
```

Please also refer to the [API Reference](https://koalas.readthedocs.io/en/latest/reference/index.html) and [Options and Settings](https://koalas.readthedocs.io/en/latest/user_guide/options.html) for more detail.

**NOTE** that this PR intentionally ports the main codes of Koalas first almost as are with minimal changes because:
- Koalas project is fairly large. Making some changes together for PySpark will make it difficult to review the individual change.
    Koalas dev includes multiple Spark committers who will review. By doing this, the committers will be able to more easily and effectively review and drive the development.
- Koalas tests and documentation require major changes to make it look great together with PySpark whereas main codes do not require.
- We lately froze the Koalas codebase, and plan to work together on the initial porting. By porting the main codes first as are, it unblocks the Koalas dev to work on other items in parallel.

I promise and will make sure on:
- Rename Koalas to PySpark pandas APIs and/or pandas-on-Spark accordingly in documentation, and the docstrings and comments in the main codes.
- Triage APIs to remove that don’t make sense when Koalas is in PySpark

The documentation changes will be tracked in [SPARK-34885](https://issues.apache.org/jira/browse/SPARK-34885), the test code changes will be tracked in [SPARK-34886](https://issues.apache.org/jira/browse/SPARK-34886).

### Why are the changes needed?

Please refer to:
- [[DISCUSS] Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/DISCUSS-Support-pandas-API-layer-on-PySpark-td30945.html)
- [[VOTE] SPIP: Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/VOTE-SPIP-Support-pandas-API-layer-on-PySpark-td30996.html)

### Does this PR introduce _any_ user-facing change?

Yes, now users can use the pandas APIs on Spark

### How was this patch tested?

Manually tested for exposed major APIs and options as described above.

### Koalas contributors

Koalas would not have been possible without the following contributors:

ueshin
HyukjinKwon
rxin
xinrong-databricks
RainFung
charlesdong1991
harupy
floscha
beobest2
thunterdb
garawalid
LucasG0
shril
deepyaman
gioa
fwani
90jam
thoo
AbdealiJK
abishekganesh72
gliptak
DumbMachine
dvgodoy
stbof
nitlev
hjoo
gatorsmile
tomspur
icexelloss
awdavidson
guyao
akhilputhiry
scook12
patryk-oleniuk
tracek
dennyglee
athena15
gstaubli
WeichenXu123
hsubbaraj
lfdversluis
ktksq
shengjh
margaret-databricks
LSturtew
sllynn
manuzhang
jijosg
sadikovi

Closes #32036 from itholic/SPARK-34890.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-05 23:42:39 -04:00
+								            >>> s.plot.kde(ind=[1, 2, 3, 4, 5], bw_method=0.3)  # doctest: +SKIP
 								        For DataFrame, it works in the same way as Series:
 								        .. plotly::
-												[SPARK-34983][PYTHON] Renaming the package alias from pp to ps

### What changes were proposed in this pull request?

This PR proposes to fix:

```python
import pyspark.pandas as pp
```

to

```python
import pyspark.pandas as ps
```

### Why are the changes needed?

`pp` might sound offensive in some contexts.

### Does this PR introduce _any_ user-facing change?

The change is in master only. We'll use `ps` as the short name instead of `pp`.

### How was this patch tested?

The CI in this PR will test it out.

Closes #32108 from LSturtew/renaming_pyspark.pandas.

Authored-by: Luka Sturtewagen <luka.sturtewagen@linkit.nl>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-11 22:18:08 -04:00
+								            >>> df = ps.DataFrame({
-												[SPARK-34890][PYTHON] Port/integrate Koalas main codes into PySpark

### What changes were proposed in this pull request?

As a first step of [SPARK-34849](https://issues.apache.org/jira/browse/SPARK-34849), this PR proposes porting the Koalas main code into PySpark.

This PR contains minimal changes to the existing Koalas code as follows:
1. `databricks.koalas` -> `pyspark.pandas`
2. `from databricks import koalas as ks` -> `from pyspark import pandas as pp`
3. `ks.xxx -> pp.xxx`

Other than them:
1. Added a line to `python/mypy.ini` in order to ignore the mypy test. See related issue at [SPARK-34941](https://issues.apache.org/jira/browse/SPARK-34941).
2. Added a comment to several lines in several files to ignore the flake8 F401. See related issue at [SPARK-34943](https://issues.apache.org/jira/browse/SPARK-34943).

When this PR is merged, all the features that were previously used in [Koalas](https://github.com/databricks/koalas) will be available in PySpark as well.

Users can access to the pandas API in PySpark as below:

```python
>>> from pyspark import pandas as pp
>>> ppdf = pp.DataFrame({"A": [1, 2, 3], "B": [15, 20, 25]})
>>> ppdf
   A   B
0  1  15
1  2  20
2  3  25
```

The existing "options and settings" in Koalas are also available in the same way:

```python
>>> from pyspark.pandas.config import set_option, reset_option, get_option
>>> ppser1 = pp.Series([1, 2, 3])
>>> ppser2 = pp.Series([3, 4, 5])
>>> ppser1 + ppser2
Traceback (most recent call last):
...
ValueError: Cannot combine the series or dataframe because it comes from a different dataframe. In order to allow this operation, enable 'compute.ops_on_diff_frames' option.

>>> set_option("compute.ops_on_diff_frames", True)
>>> ppser1 + ppser2
0    4
1    6
2    8
dtype: int64
```

Please also refer to the [API Reference](https://koalas.readthedocs.io/en/latest/reference/index.html) and [Options and Settings](https://koalas.readthedocs.io/en/latest/user_guide/options.html) for more detail.

**NOTE** that this PR intentionally ports the main codes of Koalas first almost as are with minimal changes because:
- Koalas project is fairly large. Making some changes together for PySpark will make it difficult to review the individual change.
    Koalas dev includes multiple Spark committers who will review. By doing this, the committers will be able to more easily and effectively review and drive the development.
- Koalas tests and documentation require major changes to make it look great together with PySpark whereas main codes do not require.
- We lately froze the Koalas codebase, and plan to work together on the initial porting. By porting the main codes first as are, it unblocks the Koalas dev to work on other items in parallel.

I promise and will make sure on:
- Rename Koalas to PySpark pandas APIs and/or pandas-on-Spark accordingly in documentation, and the docstrings and comments in the main codes.
- Triage APIs to remove that don’t make sense when Koalas is in PySpark

The documentation changes will be tracked in [SPARK-34885](https://issues.apache.org/jira/browse/SPARK-34885), the test code changes will be tracked in [SPARK-34886](https://issues.apache.org/jira/browse/SPARK-34886).

### Why are the changes needed?

Please refer to:
- [[DISCUSS] Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/DISCUSS-Support-pandas-API-layer-on-PySpark-td30945.html)
- [[VOTE] SPIP: Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/VOTE-SPIP-Support-pandas-API-layer-on-PySpark-td30996.html)

### Does this PR introduce _any_ user-facing change?

Yes, now users can use the pandas APIs on Spark

### How was this patch tested?

Manually tested for exposed major APIs and options as described above.

### Koalas contributors

Koalas would not have been possible without the following contributors:

ueshin
HyukjinKwon
rxin
xinrong-databricks
RainFung
charlesdong1991
harupy
floscha
beobest2
thunterdb
garawalid
LucasG0
shril
deepyaman
gioa
fwani
90jam
thoo
AbdealiJK
abishekganesh72
gliptak
DumbMachine
dvgodoy
stbof
nitlev
hjoo
gatorsmile
tomspur
icexelloss
awdavidson
guyao
akhilputhiry
scook12
patryk-oleniuk
tracek
dennyglee
athena15
gstaubli
WeichenXu123
hsubbaraj
lfdversluis
ktksq
shengjh
margaret-databricks
LSturtew
sllynn
manuzhang
jijosg
sadikovi

Closes #32036 from itholic/SPARK-34890.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-05 23:42:39 -04:00
+								            ...     'x': [1, 2, 2.5, 3, 3.5, 4, 5],
 								            ...     'y': [4, 4, 4.5, 5, 5.5, 6, 6],
 								            ... })
 								            >>> df.plot.kde(bw_method=0.3)  # doctest: +SKIP
 								        .. plotly::
-												[SPARK-34983][PYTHON] Renaming the package alias from pp to ps

### What changes were proposed in this pull request?

This PR proposes to fix:

```python
import pyspark.pandas as pp
```

to

```python
import pyspark.pandas as ps
```

### Why are the changes needed?

`pp` might sound offensive in some contexts.

### Does this PR introduce _any_ user-facing change?

The change is in master only. We'll use `ps` as the short name instead of `pp`.

### How was this patch tested?

The CI in this PR will test it out.

Closes #32108 from LSturtew/renaming_pyspark.pandas.

Authored-by: Luka Sturtewagen <luka.sturtewagen@linkit.nl>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-11 22:18:08 -04:00
+								            >>> df = ps.DataFrame({
-												[SPARK-34890][PYTHON] Port/integrate Koalas main codes into PySpark

### What changes were proposed in this pull request?

As a first step of [SPARK-34849](https://issues.apache.org/jira/browse/SPARK-34849), this PR proposes porting the Koalas main code into PySpark.

This PR contains minimal changes to the existing Koalas code as follows:
1. `databricks.koalas` -> `pyspark.pandas`
2. `from databricks import koalas as ks` -> `from pyspark import pandas as pp`
3. `ks.xxx -> pp.xxx`

Other than them:
1. Added a line to `python/mypy.ini` in order to ignore the mypy test. See related issue at [SPARK-34941](https://issues.apache.org/jira/browse/SPARK-34941).
2. Added a comment to several lines in several files to ignore the flake8 F401. See related issue at [SPARK-34943](https://issues.apache.org/jira/browse/SPARK-34943).

When this PR is merged, all the features that were previously used in [Koalas](https://github.com/databricks/koalas) will be available in PySpark as well.

Users can access to the pandas API in PySpark as below:

```python
>>> from pyspark import pandas as pp
>>> ppdf = pp.DataFrame({"A": [1, 2, 3], "B": [15, 20, 25]})
>>> ppdf
   A   B
0  1  15
1  2  20
2  3  25
```

The existing "options and settings" in Koalas are also available in the same way:

```python
>>> from pyspark.pandas.config import set_option, reset_option, get_option
>>> ppser1 = pp.Series([1, 2, 3])
>>> ppser2 = pp.Series([3, 4, 5])
>>> ppser1 + ppser2
Traceback (most recent call last):
...
ValueError: Cannot combine the series or dataframe because it comes from a different dataframe. In order to allow this operation, enable 'compute.ops_on_diff_frames' option.

>>> set_option("compute.ops_on_diff_frames", True)
>>> ppser1 + ppser2
0    4
1    6
2    8
dtype: int64
```

Please also refer to the [API Reference](https://koalas.readthedocs.io/en/latest/reference/index.html) and [Options and Settings](https://koalas.readthedocs.io/en/latest/user_guide/options.html) for more detail.

**NOTE** that this PR intentionally ports the main codes of Koalas first almost as are with minimal changes because:
- Koalas project is fairly large. Making some changes together for PySpark will make it difficult to review the individual change.
    Koalas dev includes multiple Spark committers who will review. By doing this, the committers will be able to more easily and effectively review and drive the development.
- Koalas tests and documentation require major changes to make it look great together with PySpark whereas main codes do not require.
- We lately froze the Koalas codebase, and plan to work together on the initial porting. By porting the main codes first as are, it unblocks the Koalas dev to work on other items in parallel.

I promise and will make sure on:
- Rename Koalas to PySpark pandas APIs and/or pandas-on-Spark accordingly in documentation, and the docstrings and comments in the main codes.
- Triage APIs to remove that don’t make sense when Koalas is in PySpark

The documentation changes will be tracked in [SPARK-34885](https://issues.apache.org/jira/browse/SPARK-34885), the test code changes will be tracked in [SPARK-34886](https://issues.apache.org/jira/browse/SPARK-34886).

### Why are the changes needed?

Please refer to:
- [[DISCUSS] Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/DISCUSS-Support-pandas-API-layer-on-PySpark-td30945.html)
- [[VOTE] SPIP: Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/VOTE-SPIP-Support-pandas-API-layer-on-PySpark-td30996.html)

### Does this PR introduce _any_ user-facing change?

Yes, now users can use the pandas APIs on Spark

### How was this patch tested?

Manually tested for exposed major APIs and options as described above.

### Koalas contributors

Koalas would not have been possible without the following contributors:

ueshin
HyukjinKwon
rxin
xinrong-databricks
RainFung
charlesdong1991
harupy
floscha
beobest2
thunterdb
garawalid
LucasG0
shril
deepyaman
gioa
fwani
90jam
thoo
AbdealiJK
abishekganesh72
gliptak
DumbMachine
dvgodoy
stbof
nitlev
hjoo
gatorsmile
tomspur
icexelloss
awdavidson
guyao
akhilputhiry
scook12
patryk-oleniuk
tracek
dennyglee
athena15
gstaubli
WeichenXu123
hsubbaraj
lfdversluis
ktksq
shengjh
margaret-databricks
LSturtew
sllynn
manuzhang
jijosg
sadikovi

Closes #32036 from itholic/SPARK-34890.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-05 23:42:39 -04:00
+								            ...     'x': [1, 2, 2.5, 3, 3.5, 4, 5],
 								            ...     'y': [4, 4, 4.5, 5, 5.5, 6, 6],
 								            ... })
 								            >>> df.plot.kde(bw_method=3)  # doctest: +SKIP
 								        .. plotly::
-												[SPARK-34983][PYTHON] Renaming the package alias from pp to ps

### What changes were proposed in this pull request?

This PR proposes to fix:

```python
import pyspark.pandas as pp
```

to

```python
import pyspark.pandas as ps
```

### Why are the changes needed?

`pp` might sound offensive in some contexts.

### Does this PR introduce _any_ user-facing change?

The change is in master only. We'll use `ps` as the short name instead of `pp`.

### How was this patch tested?

The CI in this PR will test it out.

Closes #32108 from LSturtew/renaming_pyspark.pandas.

Authored-by: Luka Sturtewagen <luka.sturtewagen@linkit.nl>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-11 22:18:08 -04:00
+								            >>> df = ps.DataFrame({
-												[SPARK-34890][PYTHON] Port/integrate Koalas main codes into PySpark

### What changes were proposed in this pull request?

As a first step of [SPARK-34849](https://issues.apache.org/jira/browse/SPARK-34849), this PR proposes porting the Koalas main code into PySpark.

This PR contains minimal changes to the existing Koalas code as follows:
1. `databricks.koalas` -> `pyspark.pandas`
2. `from databricks import koalas as ks` -> `from pyspark import pandas as pp`
3. `ks.xxx -> pp.xxx`

Other than them:
1. Added a line to `python/mypy.ini` in order to ignore the mypy test. See related issue at [SPARK-34941](https://issues.apache.org/jira/browse/SPARK-34941).
2. Added a comment to several lines in several files to ignore the flake8 F401. See related issue at [SPARK-34943](https://issues.apache.org/jira/browse/SPARK-34943).

When this PR is merged, all the features that were previously used in [Koalas](https://github.com/databricks/koalas) will be available in PySpark as well.

Users can access to the pandas API in PySpark as below:

```python
>>> from pyspark import pandas as pp
>>> ppdf = pp.DataFrame({"A": [1, 2, 3], "B": [15, 20, 25]})
>>> ppdf
   A   B
0  1  15
1  2  20
2  3  25
```

The existing "options and settings" in Koalas are also available in the same way:

```python
>>> from pyspark.pandas.config import set_option, reset_option, get_option
>>> ppser1 = pp.Series([1, 2, 3])
>>> ppser2 = pp.Series([3, 4, 5])
>>> ppser1 + ppser2
Traceback (most recent call last):
...
ValueError: Cannot combine the series or dataframe because it comes from a different dataframe. In order to allow this operation, enable 'compute.ops_on_diff_frames' option.

>>> set_option("compute.ops_on_diff_frames", True)
>>> ppser1 + ppser2
0    4
1    6
2    8
dtype: int64
```

Please also refer to the [API Reference](https://koalas.readthedocs.io/en/latest/reference/index.html) and [Options and Settings](https://koalas.readthedocs.io/en/latest/user_guide/options.html) for more detail.

**NOTE** that this PR intentionally ports the main codes of Koalas first almost as are with minimal changes because:
- Koalas project is fairly large. Making some changes together for PySpark will make it difficult to review the individual change.
    Koalas dev includes multiple Spark committers who will review. By doing this, the committers will be able to more easily and effectively review and drive the development.
- Koalas tests and documentation require major changes to make it look great together with PySpark whereas main codes do not require.
- We lately froze the Koalas codebase, and plan to work together on the initial porting. By porting the main codes first as are, it unblocks the Koalas dev to work on other items in parallel.

I promise and will make sure on:
- Rename Koalas to PySpark pandas APIs and/or pandas-on-Spark accordingly in documentation, and the docstrings and comments in the main codes.
- Triage APIs to remove that don’t make sense when Koalas is in PySpark

The documentation changes will be tracked in [SPARK-34885](https://issues.apache.org/jira/browse/SPARK-34885), the test code changes will be tracked in [SPARK-34886](https://issues.apache.org/jira/browse/SPARK-34886).

### Why are the changes needed?

Please refer to:
- [[DISCUSS] Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/DISCUSS-Support-pandas-API-layer-on-PySpark-td30945.html)
- [[VOTE] SPIP: Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/VOTE-SPIP-Support-pandas-API-layer-on-PySpark-td30996.html)

### Does this PR introduce _any_ user-facing change?

Yes, now users can use the pandas APIs on Spark

### How was this patch tested?

Manually tested for exposed major APIs and options as described above.

### Koalas contributors

Koalas would not have been possible without the following contributors:

ueshin
HyukjinKwon
rxin
xinrong-databricks
RainFung
charlesdong1991
harupy
floscha
beobest2
thunterdb
garawalid
LucasG0
shril
deepyaman
gioa
fwani
90jam
thoo
AbdealiJK
abishekganesh72
gliptak
DumbMachine
dvgodoy
stbof
nitlev
hjoo
gatorsmile
tomspur
icexelloss
awdavidson
guyao
akhilputhiry
scook12
patryk-oleniuk
tracek
dennyglee
athena15
gstaubli
WeichenXu123
hsubbaraj
lfdversluis
ktksq
shengjh
margaret-databricks
LSturtew
sllynn
manuzhang
jijosg
sadikovi

Closes #32036 from itholic/SPARK-34890.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-05 23:42:39 -04:00
+								            ...     'x': [1, 2, 2.5, 3, 3.5, 4, 5],
 								            ...     'y': [4, 4, 4.5, 5, 5.5, 6, 6],
 								            ... })
 								            >>> df.plot.kde(ind=[1, 2, 3, 4, 5, 6], bw_method=0.3)  # doctest: +SKIP
 								        """
 								        return self(kind="kde", bw_method=bw_method, ind=ind, **kwargs)
 								    density = kde
 								    def area(self, x=None, y=None, **kwds):
 								        """
 								        Draw a stacked area plot.
 								        An area plot displays quantitative data visually.
 								        This function wraps the plotly area function.
 								        Parameters
 								        ----------
 								        x : label or position, optional
 								            Coordinates for the X axis. By default uses the index.
 								        y : label or position, optional
 								            Column to plot. By default uses all columns.
 								        stacked : bool, default True
 								            Area plots are stacked by default. Set to False to create a
 								            unstacked plot (matplotlib-only).
 								        **kwds : optional
 								            Additional keyword arguments are documented in
 								            :meth:`DataFrame.plot`.
 								        Returns
 								        -------
 								        :class:`plotly.graph_objs.Figure`
 								            Return an custom object when ``backend!=plotly``.
 								            Return an ndarray when ``subplots=True`` (matplotlib-only).
 								        Examples
 								        --------
 								        For Series
 								        .. plotly::
-												[SPARK-34983][PYTHON] Renaming the package alias from pp to ps

### What changes were proposed in this pull request?

This PR proposes to fix:

```python
import pyspark.pandas as pp
```

to

```python
import pyspark.pandas as ps
```

### Why are the changes needed?

`pp` might sound offensive in some contexts.

### Does this PR introduce _any_ user-facing change?

The change is in master only. We'll use `ps` as the short name instead of `pp`.

### How was this patch tested?

The CI in this PR will test it out.

Closes #32108 from LSturtew/renaming_pyspark.pandas.

Authored-by: Luka Sturtewagen <luka.sturtewagen@linkit.nl>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-11 22:18:08 -04:00
+								            >>> df = ps.DataFrame({
-												[SPARK-34890][PYTHON] Port/integrate Koalas main codes into PySpark

### What changes were proposed in this pull request?

As a first step of [SPARK-34849](https://issues.apache.org/jira/browse/SPARK-34849), this PR proposes porting the Koalas main code into PySpark.

This PR contains minimal changes to the existing Koalas code as follows:
1. `databricks.koalas` -> `pyspark.pandas`
2. `from databricks import koalas as ks` -> `from pyspark import pandas as pp`
3. `ks.xxx -> pp.xxx`

Other than them:
1. Added a line to `python/mypy.ini` in order to ignore the mypy test. See related issue at [SPARK-34941](https://issues.apache.org/jira/browse/SPARK-34941).
2. Added a comment to several lines in several files to ignore the flake8 F401. See related issue at [SPARK-34943](https://issues.apache.org/jira/browse/SPARK-34943).

When this PR is merged, all the features that were previously used in [Koalas](https://github.com/databricks/koalas) will be available in PySpark as well.

Users can access to the pandas API in PySpark as below:

```python
>>> from pyspark import pandas as pp
>>> ppdf = pp.DataFrame({"A": [1, 2, 3], "B": [15, 20, 25]})
>>> ppdf
   A   B
0  1  15
1  2  20
2  3  25
```

The existing "options and settings" in Koalas are also available in the same way:

```python
>>> from pyspark.pandas.config import set_option, reset_option, get_option
>>> ppser1 = pp.Series([1, 2, 3])
>>> ppser2 = pp.Series([3, 4, 5])
>>> ppser1 + ppser2
Traceback (most recent call last):
...
ValueError: Cannot combine the series or dataframe because it comes from a different dataframe. In order to allow this operation, enable 'compute.ops_on_diff_frames' option.

>>> set_option("compute.ops_on_diff_frames", True)
>>> ppser1 + ppser2
0    4
1    6
2    8
dtype: int64
```

Please also refer to the [API Reference](https://koalas.readthedocs.io/en/latest/reference/index.html) and [Options and Settings](https://koalas.readthedocs.io/en/latest/user_guide/options.html) for more detail.

**NOTE** that this PR intentionally ports the main codes of Koalas first almost as are with minimal changes because:
- Koalas project is fairly large. Making some changes together for PySpark will make it difficult to review the individual change.
    Koalas dev includes multiple Spark committers who will review. By doing this, the committers will be able to more easily and effectively review and drive the development.
- Koalas tests and documentation require major changes to make it look great together with PySpark whereas main codes do not require.
- We lately froze the Koalas codebase, and plan to work together on the initial porting. By porting the main codes first as are, it unblocks the Koalas dev to work on other items in parallel.

I promise and will make sure on:
- Rename Koalas to PySpark pandas APIs and/or pandas-on-Spark accordingly in documentation, and the docstrings and comments in the main codes.
- Triage APIs to remove that don’t make sense when Koalas is in PySpark

The documentation changes will be tracked in [SPARK-34885](https://issues.apache.org/jira/browse/SPARK-34885), the test code changes will be tracked in [SPARK-34886](https://issues.apache.org/jira/browse/SPARK-34886).

### Why are the changes needed?

Please refer to:
- [[DISCUSS] Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/DISCUSS-Support-pandas-API-layer-on-PySpark-td30945.html)
- [[VOTE] SPIP: Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/VOTE-SPIP-Support-pandas-API-layer-on-PySpark-td30996.html)

### Does this PR introduce _any_ user-facing change?

Yes, now users can use the pandas APIs on Spark

### How was this patch tested?

Manually tested for exposed major APIs and options as described above.

### Koalas contributors

Koalas would not have been possible without the following contributors:

ueshin
HyukjinKwon
rxin
xinrong-databricks
RainFung
charlesdong1991
harupy
floscha
beobest2
thunterdb
garawalid
LucasG0
shril
deepyaman
gioa
fwani
90jam
thoo
AbdealiJK
abishekganesh72
gliptak
DumbMachine
dvgodoy
stbof
nitlev
hjoo
gatorsmile
tomspur
icexelloss
awdavidson
guyao
akhilputhiry
scook12
patryk-oleniuk
tracek
dennyglee
athena15
gstaubli
WeichenXu123
hsubbaraj
lfdversluis
ktksq
shengjh
margaret-databricks
LSturtew
sllynn
manuzhang
jijosg
sadikovi

Closes #32036 from itholic/SPARK-34890.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-05 23:42:39 -04:00
+								            ...     'sales': [3, 2, 3, 9, 10, 6],
 								            ...     'signups': [5, 5, 6, 12, 14, 13],
 								            ...     'visits': [20, 42, 28, 62, 81, 50],
 								            ... }, index=pd.date_range(start='2018/01/01', end='2018/07/01',
 								            ...                        freq='M'))
 								            >>> df.sales.plot.area()  # doctest: +SKIP
 								        For DataFrame
 								        .. plotly::
-												[SPARK-34983][PYTHON] Renaming the package alias from pp to ps

### What changes were proposed in this pull request?

This PR proposes to fix:

```python
import pyspark.pandas as pp
```

to

```python
import pyspark.pandas as ps
```

### Why are the changes needed?

`pp` might sound offensive in some contexts.

### Does this PR introduce _any_ user-facing change?

The change is in master only. We'll use `ps` as the short name instead of `pp`.

### How was this patch tested?

The CI in this PR will test it out.

Closes #32108 from LSturtew/renaming_pyspark.pandas.

Authored-by: Luka Sturtewagen <luka.sturtewagen@linkit.nl>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-11 22:18:08 -04:00
+								            >>> df = ps.DataFrame({
-												[SPARK-34890][PYTHON] Port/integrate Koalas main codes into PySpark

### What changes were proposed in this pull request?

As a first step of [SPARK-34849](https://issues.apache.org/jira/browse/SPARK-34849), this PR proposes porting the Koalas main code into PySpark.

This PR contains minimal changes to the existing Koalas code as follows:
1. `databricks.koalas` -> `pyspark.pandas`
2. `from databricks import koalas as ks` -> `from pyspark import pandas as pp`
3. `ks.xxx -> pp.xxx`

Other than them:
1. Added a line to `python/mypy.ini` in order to ignore the mypy test. See related issue at [SPARK-34941](https://issues.apache.org/jira/browse/SPARK-34941).
2. Added a comment to several lines in several files to ignore the flake8 F401. See related issue at [SPARK-34943](https://issues.apache.org/jira/browse/SPARK-34943).

When this PR is merged, all the features that were previously used in [Koalas](https://github.com/databricks/koalas) will be available in PySpark as well.

Users can access to the pandas API in PySpark as below:

```python
>>> from pyspark import pandas as pp
>>> ppdf = pp.DataFrame({"A": [1, 2, 3], "B": [15, 20, 25]})
>>> ppdf
   A   B
0  1  15
1  2  20
2  3  25
```

The existing "options and settings" in Koalas are also available in the same way:

```python
>>> from pyspark.pandas.config import set_option, reset_option, get_option
>>> ppser1 = pp.Series([1, 2, 3])
>>> ppser2 = pp.Series([3, 4, 5])
>>> ppser1 + ppser2
Traceback (most recent call last):
...
ValueError: Cannot combine the series or dataframe because it comes from a different dataframe. In order to allow this operation, enable 'compute.ops_on_diff_frames' option.

>>> set_option("compute.ops_on_diff_frames", True)
>>> ppser1 + ppser2
0    4
1    6
2    8
dtype: int64
```

Please also refer to the [API Reference](https://koalas.readthedocs.io/en/latest/reference/index.html) and [Options and Settings](https://koalas.readthedocs.io/en/latest/user_guide/options.html) for more detail.

**NOTE** that this PR intentionally ports the main codes of Koalas first almost as are with minimal changes because:
- Koalas project is fairly large. Making some changes together for PySpark will make it difficult to review the individual change.
    Koalas dev includes multiple Spark committers who will review. By doing this, the committers will be able to more easily and effectively review and drive the development.
- Koalas tests and documentation require major changes to make it look great together with PySpark whereas main codes do not require.
- We lately froze the Koalas codebase, and plan to work together on the initial porting. By porting the main codes first as are, it unblocks the Koalas dev to work on other items in parallel.

I promise and will make sure on:
- Rename Koalas to PySpark pandas APIs and/or pandas-on-Spark accordingly in documentation, and the docstrings and comments in the main codes.
- Triage APIs to remove that don’t make sense when Koalas is in PySpark

The documentation changes will be tracked in [SPARK-34885](https://issues.apache.org/jira/browse/SPARK-34885), the test code changes will be tracked in [SPARK-34886](https://issues.apache.org/jira/browse/SPARK-34886).

### Why are the changes needed?

Please refer to:
- [[DISCUSS] Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/DISCUSS-Support-pandas-API-layer-on-PySpark-td30945.html)
- [[VOTE] SPIP: Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/VOTE-SPIP-Support-pandas-API-layer-on-PySpark-td30996.html)

### Does this PR introduce _any_ user-facing change?

Yes, now users can use the pandas APIs on Spark

### How was this patch tested?

Manually tested for exposed major APIs and options as described above.

### Koalas contributors

Koalas would not have been possible without the following contributors:

ueshin
HyukjinKwon
rxin
xinrong-databricks
RainFung
charlesdong1991
harupy
floscha
beobest2
thunterdb
garawalid
LucasG0
shril
deepyaman
gioa
fwani
90jam
thoo
AbdealiJK
abishekganesh72
gliptak
DumbMachine
dvgodoy
stbof
nitlev
hjoo
gatorsmile
tomspur
icexelloss
awdavidson
guyao
akhilputhiry
scook12
patryk-oleniuk
tracek
dennyglee
athena15
gstaubli
WeichenXu123
hsubbaraj
lfdversluis
ktksq
shengjh
margaret-databricks
LSturtew
sllynn
manuzhang
jijosg
sadikovi

Closes #32036 from itholic/SPARK-34890.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-05 23:42:39 -04:00
+								            ...     'sales': [3, 2, 3, 9, 10, 6],
 								            ...     'signups': [5, 5, 6, 12, 14, 13],
 								            ...     'visits': [20, 42, 28, 62, 81, 50],
 								            ... }, index=pd.date_range(start='2018/01/01', end='2018/07/01',
 								            ...                        freq='M'))
 								            >>> df.plot.area()  # doctest: +SKIP
 								        """
 								        from pyspark.pandas import DataFrame, Series
 								        if isinstance(self.data, Series):
 								            return self(kind="area", **kwds)
 								        elif isinstance(self.data, DataFrame):
 								            return self(kind="area", x=x, y=y, **kwds)
 								    def pie(self, **kwds):
 								        """
 								        Generate a pie plot.
 								        A pie plot is a proportional representation of the numerical data in a
 								        column. This function wraps :meth:`plotly.express.pie` for the
 								        specified column.
 								        Parameters
 								        ----------
 								        y : int or label, optional
 								            Label or position of the column to plot.
 								            If not provided, ``subplots=True`` argument must be passed (matplotlib-only).
 								        **kwds
 								            Keyword arguments to pass on to :meth:`Koalas.Series.plot`.
 								        Returns
 								        -------
 								        :class:`plotly.graph_objs.Figure`
 								            Return an custom object when ``backend!=plotly``.
 								            Return an ndarray when ``subplots=True`` (matplotlib-only).
 								        Examples
 								        --------
 								        For Series:
 								        .. plotly::
-												[SPARK-34983][PYTHON] Renaming the package alias from pp to ps

### What changes were proposed in this pull request?

This PR proposes to fix:

```python
import pyspark.pandas as pp
```

to

```python
import pyspark.pandas as ps
```

### Why are the changes needed?

`pp` might sound offensive in some contexts.

### Does this PR introduce _any_ user-facing change?

The change is in master only. We'll use `ps` as the short name instead of `pp`.

### How was this patch tested?

The CI in this PR will test it out.

Closes #32108 from LSturtew/renaming_pyspark.pandas.

Authored-by: Luka Sturtewagen <luka.sturtewagen@linkit.nl>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-11 22:18:08 -04:00
+								            >>> df = ps.DataFrame({'mass': [0.330, 4.87, 5.97],
-												[SPARK-34890][PYTHON] Port/integrate Koalas main codes into PySpark

### What changes were proposed in this pull request?

As a first step of [SPARK-34849](https://issues.apache.org/jira/browse/SPARK-34849), this PR proposes porting the Koalas main code into PySpark.

This PR contains minimal changes to the existing Koalas code as follows:
1. `databricks.koalas` -> `pyspark.pandas`
2. `from databricks import koalas as ks` -> `from pyspark import pandas as pp`
3. `ks.xxx -> pp.xxx`

Other than them:
1. Added a line to `python/mypy.ini` in order to ignore the mypy test. See related issue at [SPARK-34941](https://issues.apache.org/jira/browse/SPARK-34941).
2. Added a comment to several lines in several files to ignore the flake8 F401. See related issue at [SPARK-34943](https://issues.apache.org/jira/browse/SPARK-34943).

When this PR is merged, all the features that were previously used in [Koalas](https://github.com/databricks/koalas) will be available in PySpark as well.

Users can access to the pandas API in PySpark as below:

```python
>>> from pyspark import pandas as pp
>>> ppdf = pp.DataFrame({"A": [1, 2, 3], "B": [15, 20, 25]})
>>> ppdf
   A   B
0  1  15
1  2  20
2  3  25
```

The existing "options and settings" in Koalas are also available in the same way:

```python
>>> from pyspark.pandas.config import set_option, reset_option, get_option
>>> ppser1 = pp.Series([1, 2, 3])
>>> ppser2 = pp.Series([3, 4, 5])
>>> ppser1 + ppser2
Traceback (most recent call last):
...
ValueError: Cannot combine the series or dataframe because it comes from a different dataframe. In order to allow this operation, enable 'compute.ops_on_diff_frames' option.

>>> set_option("compute.ops_on_diff_frames", True)
>>> ppser1 + ppser2
0    4
1    6
2    8
dtype: int64
```

Please also refer to the [API Reference](https://koalas.readthedocs.io/en/latest/reference/index.html) and [Options and Settings](https://koalas.readthedocs.io/en/latest/user_guide/options.html) for more detail.

**NOTE** that this PR intentionally ports the main codes of Koalas first almost as are with minimal changes because:
- Koalas project is fairly large. Making some changes together for PySpark will make it difficult to review the individual change.
    Koalas dev includes multiple Spark committers who will review. By doing this, the committers will be able to more easily and effectively review and drive the development.
- Koalas tests and documentation require major changes to make it look great together with PySpark whereas main codes do not require.
- We lately froze the Koalas codebase, and plan to work together on the initial porting. By porting the main codes first as are, it unblocks the Koalas dev to work on other items in parallel.

I promise and will make sure on:
- Rename Koalas to PySpark pandas APIs and/or pandas-on-Spark accordingly in documentation, and the docstrings and comments in the main codes.
- Triage APIs to remove that don’t make sense when Koalas is in PySpark

The documentation changes will be tracked in [SPARK-34885](https://issues.apache.org/jira/browse/SPARK-34885), the test code changes will be tracked in [SPARK-34886](https://issues.apache.org/jira/browse/SPARK-34886).

### Why are the changes needed?

Please refer to:
- [[DISCUSS] Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/DISCUSS-Support-pandas-API-layer-on-PySpark-td30945.html)
- [[VOTE] SPIP: Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/VOTE-SPIP-Support-pandas-API-layer-on-PySpark-td30996.html)

### Does this PR introduce _any_ user-facing change?

Yes, now users can use the pandas APIs on Spark

### How was this patch tested?

Manually tested for exposed major APIs and options as described above.

### Koalas contributors

Koalas would not have been possible without the following contributors:

ueshin
HyukjinKwon
rxin
xinrong-databricks
RainFung
charlesdong1991
harupy
floscha
beobest2
thunterdb
garawalid
LucasG0
shril
deepyaman
gioa
fwani
90jam
thoo
AbdealiJK
abishekganesh72
gliptak
DumbMachine
dvgodoy
stbof
nitlev
hjoo
gatorsmile
tomspur
icexelloss
awdavidson
guyao
akhilputhiry
scook12
patryk-oleniuk
tracek
dennyglee
athena15
gstaubli
WeichenXu123
hsubbaraj
lfdversluis
ktksq
shengjh
margaret-databricks
LSturtew
sllynn
manuzhang
jijosg
sadikovi

Closes #32036 from itholic/SPARK-34890.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-05 23:42:39 -04:00
+								            ...                    'radius': [2439.7, 6051.8, 6378.1]},
 								            ...                   index=['Mercury', 'Venus', 'Earth'])
 								            >>> df.mass.plot.pie()  # doctest: +SKIP
 								        For DataFrame:
 								        .. plotly::
-												[SPARK-34983][PYTHON] Renaming the package alias from pp to ps

### What changes were proposed in this pull request?

This PR proposes to fix:

```python
import pyspark.pandas as pp
```

to

```python
import pyspark.pandas as ps
```

### Why are the changes needed?

`pp` might sound offensive in some contexts.

### Does this PR introduce _any_ user-facing change?

The change is in master only. We'll use `ps` as the short name instead of `pp`.

### How was this patch tested?

The CI in this PR will test it out.

Closes #32108 from LSturtew/renaming_pyspark.pandas.

Authored-by: Luka Sturtewagen <luka.sturtewagen@linkit.nl>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-11 22:18:08 -04:00
+								            >>> df = ps.DataFrame({'mass': [0.330, 4.87, 5.97],
-												[SPARK-34890][PYTHON] Port/integrate Koalas main codes into PySpark

### What changes were proposed in this pull request?

As a first step of [SPARK-34849](https://issues.apache.org/jira/browse/SPARK-34849), this PR proposes porting the Koalas main code into PySpark.

This PR contains minimal changes to the existing Koalas code as follows:
1. `databricks.koalas` -> `pyspark.pandas`
2. `from databricks import koalas as ks` -> `from pyspark import pandas as pp`
3. `ks.xxx -> pp.xxx`

Other than them:
1. Added a line to `python/mypy.ini` in order to ignore the mypy test. See related issue at [SPARK-34941](https://issues.apache.org/jira/browse/SPARK-34941).
2. Added a comment to several lines in several files to ignore the flake8 F401. See related issue at [SPARK-34943](https://issues.apache.org/jira/browse/SPARK-34943).

When this PR is merged, all the features that were previously used in [Koalas](https://github.com/databricks/koalas) will be available in PySpark as well.

Users can access to the pandas API in PySpark as below:

```python
>>> from pyspark import pandas as pp
>>> ppdf = pp.DataFrame({"A": [1, 2, 3], "B": [15, 20, 25]})
>>> ppdf
   A   B
0  1  15
1  2  20
2  3  25
```

The existing "options and settings" in Koalas are also available in the same way:

```python
>>> from pyspark.pandas.config import set_option, reset_option, get_option
>>> ppser1 = pp.Series([1, 2, 3])
>>> ppser2 = pp.Series([3, 4, 5])
>>> ppser1 + ppser2
Traceback (most recent call last):
...
ValueError: Cannot combine the series or dataframe because it comes from a different dataframe. In order to allow this operation, enable 'compute.ops_on_diff_frames' option.

>>> set_option("compute.ops_on_diff_frames", True)
>>> ppser1 + ppser2
0    4
1    6
2    8
dtype: int64
```

Please also refer to the [API Reference](https://koalas.readthedocs.io/en/latest/reference/index.html) and [Options and Settings](https://koalas.readthedocs.io/en/latest/user_guide/options.html) for more detail.

**NOTE** that this PR intentionally ports the main codes of Koalas first almost as are with minimal changes because:
- Koalas project is fairly large. Making some changes together for PySpark will make it difficult to review the individual change.
    Koalas dev includes multiple Spark committers who will review. By doing this, the committers will be able to more easily and effectively review and drive the development.
- Koalas tests and documentation require major changes to make it look great together with PySpark whereas main codes do not require.
- We lately froze the Koalas codebase, and plan to work together on the initial porting. By porting the main codes first as are, it unblocks the Koalas dev to work on other items in parallel.

I promise and will make sure on:
- Rename Koalas to PySpark pandas APIs and/or pandas-on-Spark accordingly in documentation, and the docstrings and comments in the main codes.
- Triage APIs to remove that don’t make sense when Koalas is in PySpark

The documentation changes will be tracked in [SPARK-34885](https://issues.apache.org/jira/browse/SPARK-34885), the test code changes will be tracked in [SPARK-34886](https://issues.apache.org/jira/browse/SPARK-34886).

### Why are the changes needed?

Please refer to:
- [[DISCUSS] Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/DISCUSS-Support-pandas-API-layer-on-PySpark-td30945.html)
- [[VOTE] SPIP: Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/VOTE-SPIP-Support-pandas-API-layer-on-PySpark-td30996.html)

### Does this PR introduce _any_ user-facing change?

Yes, now users can use the pandas APIs on Spark

### How was this patch tested?

Manually tested for exposed major APIs and options as described above.

### Koalas contributors

Koalas would not have been possible without the following contributors:

ueshin
HyukjinKwon
rxin
xinrong-databricks
RainFung
charlesdong1991
harupy
floscha
beobest2
thunterdb
garawalid
LucasG0
shril
deepyaman
gioa
fwani
90jam
thoo
AbdealiJK
abishekganesh72
gliptak
DumbMachine
dvgodoy
stbof
nitlev
hjoo
gatorsmile
tomspur
icexelloss
awdavidson
guyao
akhilputhiry
scook12
patryk-oleniuk
tracek
dennyglee
athena15
gstaubli
WeichenXu123
hsubbaraj
lfdversluis
ktksq
shengjh
margaret-databricks
LSturtew
sllynn
manuzhang
jijosg
sadikovi

Closes #32036 from itholic/SPARK-34890.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-05 23:42:39 -04:00
+								            ...                    'radius': [2439.7, 6051.8, 6378.1]},
 								            ...                   index=['Mercury', 'Venus', 'Earth'])
 								            >>> df.plot.pie(y='mass')  # doctest: +SKIP
 								        """
 								        from pyspark.pandas import DataFrame, Series
 								        if isinstance(self.data, Series):
 								            return self(kind="pie", **kwds)
 								        else:
 								            # pandas will raise an error if y is None and subplots if not True
 								            if (
 								                isinstance(self.data, DataFrame)
 								                and kwds.get("y", None) is None
 								                and not kwds.get("subplots", False)
 								            ):
 								                raise ValueError(
 								                    "pie requires either y column or 'subplots=True' (matplotlib-only)"
 								                )
 								            return self(kind="pie", **kwds)
 								    def scatter(self, x, y, **kwds):
 								        """
 								        Create a scatter plot with varying marker point size and color.
 								        The coordinates of each point are defined by two dataframe columns and
 								        filled circles are used to represent each point. This kind of plot is
 								        useful to see complex correlations between two variables. Points could
 								        be for instance natural 2D coordinates like longitude and latitude in
 								        a map or, in general, any pair of metrics that can be plotted against
 								        each other.
 								        Parameters
 								        ----------
 								        x : int or str
 								            The column name or column position to be used as horizontal
 								            coordinates for each point.
 								        y : int or str
 								            The column name or column position to be used as vertical
 								            coordinates for each point.
 								        s : scalar or array_like, optional
 								            (matplotlib-only).
 								        c : str, int or array_like, optional
 								            (matplotlib-only).
 								        **kwds: Optional
 								            Keyword arguments to pass on to :meth:`pyspark.pandas.DataFrame.plot`.
 								        Returns
 								        -------
 								        :class:`plotly.graph_objs.Figure`
 								            Return an custom object when ``backend!=plotly``.
 								            Return an ndarray when ``subplots=True`` (matplotlib-only).
 								        See Also
 								        --------
 								        plotly.express.scatter : Scatter plot using multiple input data
 								            formats (plotly).
 								        matplotlib.pyplot.scatter : Scatter plot using multiple input data
 								            formats (matplotlib).
 								        Examples
 								        --------
 								        Let's see how to draw a scatter plot using coordinates from the values
 								        in a DataFrame's columns.
 								        .. plotly::
-												[SPARK-34983][PYTHON] Renaming the package alias from pp to ps

### What changes were proposed in this pull request?

This PR proposes to fix:

```python
import pyspark.pandas as pp
```

to

```python
import pyspark.pandas as ps
```

### Why are the changes needed?

`pp` might sound offensive in some contexts.

### Does this PR introduce _any_ user-facing change?

The change is in master only. We'll use `ps` as the short name instead of `pp`.

### How was this patch tested?

The CI in this PR will test it out.

Closes #32108 from LSturtew/renaming_pyspark.pandas.

Authored-by: Luka Sturtewagen <luka.sturtewagen@linkit.nl>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-11 22:18:08 -04:00
+								            >>> df = ps.DataFrame([[5.1, 3.5, 0], [4.9, 3.0, 0], [7.0, 3.2, 1],
-												[SPARK-34890][PYTHON] Port/integrate Koalas main codes into PySpark

### What changes were proposed in this pull request?

As a first step of [SPARK-34849](https://issues.apache.org/jira/browse/SPARK-34849), this PR proposes porting the Koalas main code into PySpark.

This PR contains minimal changes to the existing Koalas code as follows:
1. `databricks.koalas` -> `pyspark.pandas`
2. `from databricks import koalas as ks` -> `from pyspark import pandas as pp`
3. `ks.xxx -> pp.xxx`

Other than them:
1. Added a line to `python/mypy.ini` in order to ignore the mypy test. See related issue at [SPARK-34941](https://issues.apache.org/jira/browse/SPARK-34941).
2. Added a comment to several lines in several files to ignore the flake8 F401. See related issue at [SPARK-34943](https://issues.apache.org/jira/browse/SPARK-34943).

When this PR is merged, all the features that were previously used in [Koalas](https://github.com/databricks/koalas) will be available in PySpark as well.

Users can access to the pandas API in PySpark as below:

```python
>>> from pyspark import pandas as pp
>>> ppdf = pp.DataFrame({"A": [1, 2, 3], "B": [15, 20, 25]})
>>> ppdf
   A   B
0  1  15
1  2  20
2  3  25
```

The existing "options and settings" in Koalas are also available in the same way:

```python
>>> from pyspark.pandas.config import set_option, reset_option, get_option
>>> ppser1 = pp.Series([1, 2, 3])
>>> ppser2 = pp.Series([3, 4, 5])
>>> ppser1 + ppser2
Traceback (most recent call last):
...
ValueError: Cannot combine the series or dataframe because it comes from a different dataframe. In order to allow this operation, enable 'compute.ops_on_diff_frames' option.

>>> set_option("compute.ops_on_diff_frames", True)
>>> ppser1 + ppser2
0    4
1    6
2    8
dtype: int64
```

Please also refer to the [API Reference](https://koalas.readthedocs.io/en/latest/reference/index.html) and [Options and Settings](https://koalas.readthedocs.io/en/latest/user_guide/options.html) for more detail.

**NOTE** that this PR intentionally ports the main codes of Koalas first almost as are with minimal changes because:
- Koalas project is fairly large. Making some changes together for PySpark will make it difficult to review the individual change.
    Koalas dev includes multiple Spark committers who will review. By doing this, the committers will be able to more easily and effectively review and drive the development.
- Koalas tests and documentation require major changes to make it look great together with PySpark whereas main codes do not require.
- We lately froze the Koalas codebase, and plan to work together on the initial porting. By porting the main codes first as are, it unblocks the Koalas dev to work on other items in parallel.

I promise and will make sure on:
- Rename Koalas to PySpark pandas APIs and/or pandas-on-Spark accordingly in documentation, and the docstrings and comments in the main codes.
- Triage APIs to remove that don’t make sense when Koalas is in PySpark

The documentation changes will be tracked in [SPARK-34885](https://issues.apache.org/jira/browse/SPARK-34885), the test code changes will be tracked in [SPARK-34886](https://issues.apache.org/jira/browse/SPARK-34886).

### Why are the changes needed?

Please refer to:
- [[DISCUSS] Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/DISCUSS-Support-pandas-API-layer-on-PySpark-td30945.html)
- [[VOTE] SPIP: Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/VOTE-SPIP-Support-pandas-API-layer-on-PySpark-td30996.html)

### Does this PR introduce _any_ user-facing change?

Yes, now users can use the pandas APIs on Spark

### How was this patch tested?

Manually tested for exposed major APIs and options as described above.

### Koalas contributors

Koalas would not have been possible without the following contributors:

ueshin
HyukjinKwon
rxin
xinrong-databricks
RainFung
charlesdong1991
harupy
floscha
beobest2
thunterdb
garawalid
LucasG0
shril
deepyaman
gioa
fwani
90jam
thoo
AbdealiJK
abishekganesh72
gliptak
DumbMachine
dvgodoy
stbof
nitlev
hjoo
gatorsmile
tomspur
icexelloss
awdavidson
guyao
akhilputhiry
scook12
patryk-oleniuk
tracek
dennyglee
athena15
gstaubli
WeichenXu123
hsubbaraj
lfdversluis
ktksq
shengjh
margaret-databricks
LSturtew
sllynn
manuzhang
jijosg
sadikovi

Closes #32036 from itholic/SPARK-34890.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-05 23:42:39 -04:00
+								            ...                    [6.4, 3.2, 1], [5.9, 3.0, 2]],
 								            ...                   columns=['length', 'width', 'species'])
 								            >>> df.plot.scatter(x='length', y='width')  # doctest: +SKIP
 								        And now with dark scheme:
 								        .. plotly::
-												[SPARK-34983][PYTHON] Renaming the package alias from pp to ps

### What changes were proposed in this pull request?

This PR proposes to fix:

```python
import pyspark.pandas as pp
```

to

```python
import pyspark.pandas as ps
```

### Why are the changes needed?

`pp` might sound offensive in some contexts.

### Does this PR introduce _any_ user-facing change?

The change is in master only. We'll use `ps` as the short name instead of `pp`.

### How was this patch tested?

The CI in this PR will test it out.

Closes #32108 from LSturtew/renaming_pyspark.pandas.

Authored-by: Luka Sturtewagen <luka.sturtewagen@linkit.nl>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-11 22:18:08 -04:00
+								            >>> df = ps.DataFrame([[5.1, 3.5, 0], [4.9, 3.0, 0], [7.0, 3.2, 1],
-												[SPARK-34890][PYTHON] Port/integrate Koalas main codes into PySpark

### What changes were proposed in this pull request?

As a first step of [SPARK-34849](https://issues.apache.org/jira/browse/SPARK-34849), this PR proposes porting the Koalas main code into PySpark.

This PR contains minimal changes to the existing Koalas code as follows:
1. `databricks.koalas` -> `pyspark.pandas`
2. `from databricks import koalas as ks` -> `from pyspark import pandas as pp`
3. `ks.xxx -> pp.xxx`

Other than them:
1. Added a line to `python/mypy.ini` in order to ignore the mypy test. See related issue at [SPARK-34941](https://issues.apache.org/jira/browse/SPARK-34941).
2. Added a comment to several lines in several files to ignore the flake8 F401. See related issue at [SPARK-34943](https://issues.apache.org/jira/browse/SPARK-34943).

When this PR is merged, all the features that were previously used in [Koalas](https://github.com/databricks/koalas) will be available in PySpark as well.

Users can access to the pandas API in PySpark as below:

```python
>>> from pyspark import pandas as pp
>>> ppdf = pp.DataFrame({"A": [1, 2, 3], "B": [15, 20, 25]})
>>> ppdf
   A   B
0  1  15
1  2  20
2  3  25
```

The existing "options and settings" in Koalas are also available in the same way:

```python
>>> from pyspark.pandas.config import set_option, reset_option, get_option
>>> ppser1 = pp.Series([1, 2, 3])
>>> ppser2 = pp.Series([3, 4, 5])
>>> ppser1 + ppser2
Traceback (most recent call last):
...
ValueError: Cannot combine the series or dataframe because it comes from a different dataframe. In order to allow this operation, enable 'compute.ops_on_diff_frames' option.

>>> set_option("compute.ops_on_diff_frames", True)
>>> ppser1 + ppser2
0    4
1    6
2    8
dtype: int64
```

Please also refer to the [API Reference](https://koalas.readthedocs.io/en/latest/reference/index.html) and [Options and Settings](https://koalas.readthedocs.io/en/latest/user_guide/options.html) for more detail.

**NOTE** that this PR intentionally ports the main codes of Koalas first almost as are with minimal changes because:
- Koalas project is fairly large. Making some changes together for PySpark will make it difficult to review the individual change.
    Koalas dev includes multiple Spark committers who will review. By doing this, the committers will be able to more easily and effectively review and drive the development.
- Koalas tests and documentation require major changes to make it look great together with PySpark whereas main codes do not require.
- We lately froze the Koalas codebase, and plan to work together on the initial porting. By porting the main codes first as are, it unblocks the Koalas dev to work on other items in parallel.

I promise and will make sure on:
- Rename Koalas to PySpark pandas APIs and/or pandas-on-Spark accordingly in documentation, and the docstrings and comments in the main codes.
- Triage APIs to remove that don’t make sense when Koalas is in PySpark

The documentation changes will be tracked in [SPARK-34885](https://issues.apache.org/jira/browse/SPARK-34885), the test code changes will be tracked in [SPARK-34886](https://issues.apache.org/jira/browse/SPARK-34886).

### Why are the changes needed?

Please refer to:
- [[DISCUSS] Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/DISCUSS-Support-pandas-API-layer-on-PySpark-td30945.html)
- [[VOTE] SPIP: Support pandas API layer on PySpark](http://apache-spark-developers-list.1001551.n3.nabble.com/VOTE-SPIP-Support-pandas-API-layer-on-PySpark-td30996.html)

### Does this PR introduce _any_ user-facing change?

Yes, now users can use the pandas APIs on Spark

### How was this patch tested?

Manually tested for exposed major APIs and options as described above.

### Koalas contributors

Koalas would not have been possible without the following contributors:

ueshin
HyukjinKwon
rxin
xinrong-databricks
RainFung
charlesdong1991
harupy
floscha
beobest2
thunterdb
garawalid
LucasG0
shril
deepyaman
gioa
fwani
90jam
thoo
AbdealiJK
abishekganesh72
gliptak
DumbMachine
dvgodoy
stbof
nitlev
hjoo
gatorsmile
tomspur
icexelloss
awdavidson
guyao
akhilputhiry
scook12
patryk-oleniuk
tracek
dennyglee
athena15
gstaubli
WeichenXu123
hsubbaraj
lfdversluis
ktksq
shengjh
margaret-databricks
LSturtew
sllynn
manuzhang
jijosg
sadikovi

Closes #32036 from itholic/SPARK-34890.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>

											
										
										
											2021-04-05 23:42:39 -04:00
+								            ...                    [6.4, 3.2, 1], [5.9, 3.0, 2]],
 								            ...                   columns=['length', 'width', 'species'])
 								            >>> fig = df.plot.scatter(x='length', y='width')
 								            >>> fig.update_layout(template="plotly_dark")  # doctest: +SKIP
 								        """
 								        return self(kind="scatter", x=x, y=y, **kwds)
 								    def hexbin(self, **kwds):
 								        return unsupported_function(class_name="pd.DataFrame", method_name="hexbin")()