7ff9d2e3ee
### What changes were proposed in this pull request? This PR proposes to rename Koalas to pandas-on-Spark in the main code. ### Why are the changes needed? To have the correct name in PySpark. NOTE that the official name in the main documentation will be pandas APIs on Spark, to be extra clear. pandas-on-Spark is not the official term. ### Does this PR introduce _any_ user-facing change? No, it's a master-only change. It changes the docstrings and class names. ### How was this patch tested? Manually tested via: ```bash ./python/run-tests --python-executable=python3 --modules pyspark-pandas ``` Closes #32166 from HyukjinKwon/rename-koalas. Authored-by: HyukjinKwon <gurwls223@apache.org> Signed-off-by: HyukjinKwon <gurwls223@apache.org>
898 lines
30 KiB
Python
898 lines
30 KiB
Python
#
|
|
# Licensed to the Apache Software Foundation (ASF) under one or more
|
|
# contributor license agreements. See the NOTICE file distributed with
|
|
# this work for additional information regarding copyright ownership.
|
|
# The ASF licenses this file to You under the Apache License, Version 2.0
|
|
# (the "License"); you may not use this file except in compliance with
|
|
# the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
#
|
|
|
|
from distutils.version import LooseVersion
|
|
|
|
import matplotlib as mat
|
|
import numpy as np
|
|
import pandas as pd
|
|
from matplotlib.axes._base import _process_plot_format
|
|
from pandas.core.dtypes.inference import is_list_like
|
|
from pandas.io.formats.printing import pprint_thing
|
|
|
|
from pyspark.pandas.plot import (
|
|
TopNPlotBase,
|
|
SampledPlotBase,
|
|
HistogramPlotBase,
|
|
BoxPlotBase,
|
|
unsupported_function,
|
|
KdePlotBase,
|
|
)
|
|
|
|
|
|
if LooseVersion(pd.__version__) < LooseVersion("0.25"):
|
|
from pandas.plotting._core import (
|
|
_all_kinds,
|
|
BarPlot as PandasBarPlot,
|
|
BoxPlot as PandasBoxPlot,
|
|
HistPlot as PandasHistPlot,
|
|
MPLPlot as PandasMPLPlot,
|
|
PiePlot as PandasPiePlot,
|
|
AreaPlot as PandasAreaPlot,
|
|
LinePlot as PandasLinePlot,
|
|
BarhPlot as PandasBarhPlot,
|
|
ScatterPlot as PandasScatterPlot,
|
|
KdePlot as PandasKdePlot,
|
|
)
|
|
else:
|
|
from pandas.plotting._matplotlib import (
|
|
BarPlot as PandasBarPlot,
|
|
BoxPlot as PandasBoxPlot,
|
|
HistPlot as PandasHistPlot,
|
|
PiePlot as PandasPiePlot,
|
|
AreaPlot as PandasAreaPlot,
|
|
LinePlot as PandasLinePlot,
|
|
BarhPlot as PandasBarhPlot,
|
|
ScatterPlot as PandasScatterPlot,
|
|
KdePlot as PandasKdePlot,
|
|
)
|
|
from pandas.plotting._core import PlotAccessor
|
|
from pandas.plotting._matplotlib.core import MPLPlot as PandasMPLPlot
|
|
|
|
_all_kinds = PlotAccessor._all_kinds
|
|
|
|
|
|
class PandasOnSparkBarPlot(PandasBarPlot, TopNPlotBase):
    """Vertical bar plot combining pandas' ``BarPlot`` with ``TopNPlotBase``.

    The constructor hands the result of ``get_top_n(data)`` to the pandas plot
    class instead of the raw input.
    """

    def __init__(self, data, **kwargs):
        # NOTE(review): ``get_top_n`` is inherited (not defined in this file);
        # presumably it limits ``data`` to its first N rows -- confirm.
        top_rows = self.get_top_n(data)
        super().__init__(top_rows, **kwargs)

    def _plot(self, ax, x, y, w, start=0, log=False, **kwds):
        """Annotate ``ax`` via ``set_result_text`` and draw the bars with ``ax.bar``.

        ``start`` is passed to matplotlib as ``bottom``; the remaining keyword
        arguments are forwarded unchanged. Returns whatever ``ax.bar`` returns.
        """
        self.set_result_text(ax)
        bars = ax.bar(x, y, w, bottom=start, log=log, **kwds)
        return bars
|
|
|
|
|
|
class PandasOnSparkBoxPlot(PandasBoxPlot, BoxPlotBase):
    """Box plot whose statistics are computed through ``BoxPlotBase`` helpers.

    ``_compute_plot_data`` replaces ``self.data`` with a ``{label: [stats]}``
    mapping and ``_make_plot`` renders those statistics through ``Axes.bxp``.
    """

    def boxplot(
        self,
        ax,
        bxpstats,
        notch=None,
        sym=None,
        vert=None,
        whis=None,
        positions=None,
        widths=None,
        patch_artist=None,
        bootstrap=None,
        usermedians=None,
        conf_intervals=None,
        meanline=None,
        showmeans=None,
        showcaps=None,
        showbox=None,
        showfliers=None,
        boxprops=None,
        labels=None,
        flierprops=None,
        medianprops=None,
        meanprops=None,
        capprops=None,
        whiskerprops=None,
        manage_ticks=None,
        # ``manage_xticks`` is the keyword used by matplotlib < 3.1.0; it is
        # accepted here only for compatibility with those versions.
        manage_xticks=None,
        autorange=False,
        zorder=None,
        precision=None,
    ):
        """Draw already-computed box statistics on ``ax`` with ``ax.bxp``.

        Parameters
        ----------
        ax : matplotlib axes object
            Axes whose ``bxp`` method performs the drawing.
        bxpstats : list of dict
            One statistics dict per box. Entries may be updated in place with
            ``"med"``, ``"cilo"`` and ``"cihi"`` values taken from
            ``usermedians`` and ``conf_intervals``.
        sym : str, optional
            Flier symbol. ``""`` hides the fliers; any other string is parsed
            with ``_process_plot_format`` and its marker/color are applied to
            ``flierprops``.
        usermedians, conf_intervals : sequence, optional
            Per-box overrides; ``None`` entries leave the box untouched.
        boxprops, whiskerprops, capprops, medianprops, meanprops, flierprops : dict, optional
            Style dicts. Missing keys are filled in place from
            ``matplotlib.rcParams``; a new dict is created when ``None``.
        manage_ticks, manage_xticks : bool, optional
            ``manage_ticks`` wins over ``manage_xticks``; when both are ``None``
            ticks are managed. The keyword actually sent to ``ax.bxp`` depends
            on the installed matplotlib version.

        ``whis``, ``bootstrap``, ``labels``, ``autorange`` and ``precision`` are
        accepted but not read by this method. All other parameters are
        forwarded to ``ax.bxp``.

        Returns
        -------
        The value returned by ``ax.bxp``.

        Raises
        ------
        ValueError
            If ``usermedians`` or ``conf_intervals`` do not line up with
            ``bxpstats``, or a confidence interval does not have two values.
        """

        def with_rc_defaults(props, rc_group, keys):
            """Fill ``props`` with ``boxplot.<rc_group>.<key>`` rc values that are not set yet."""
            if props is None:
                props = dict()
            for key in keys:
                props.setdefault(key, mat.rcParams["boxplot.{0}.{1}".format(rc_group, key)])
            return props

        # Keys that every property dict shares, and the larger set used by fliers.
        line_keys = ["color", "linewidth", "linestyle"]
        flier_keys = [
            "color",
            "marker",
            "markerfacecolor",
            "markeredgecolor",
            "markersize",
            "linestyle",
            "linewidth",
        ]

        boxprops = with_rc_defaults(boxprops, "boxprops", line_keys)
        whiskerprops = with_rc_defaults(whiskerprops, "whiskerprops", line_keys)
        capprops = with_rc_defaults(capprops, "capprops", line_keys)
        medianprops = with_rc_defaults(medianprops, "medianprops", line_keys)
        meanprops = with_rc_defaults(meanprops, "meanprops", line_keys)
        flierprops = with_rc_defaults(flierprops, "flierprops", flier_keys)

        if patch_artist:
            # Patch boxes take an edge color rather than a line color.
            boxprops["linestyle"] = "solid"
            boxprops["edgecolor"] = boxprops.pop("color")

        # All ``sym`` handling is resolved here so that only ``flierprops`` and
        # ``showfliers`` have to be handed to ``bxp``.
        if sym == "":
            # Backward-compatible "no fliers" request: use invisible markers
            # and also switch the fliers off.
            flierprops = dict(linestyle="none", marker="", color="none")
            showfliers = False
        elif sym is not None:
            # The linestyle part of the format string is discarded.
            _, marker, color = _process_plot_format(sym)
            if marker is not None:
                flierprops["marker"] = marker
            if color is not None:
                # A color in ``sym`` is applied to the line, face and edge alike;
                # finer control is available through ``flierprops``.
                flierprops["color"] = color
                flierprops["markerfacecolor"] = color
                flierprops["markeredgecolor"] = color

        box_count = len(bxpstats)

        # Override the medians where the caller supplied them.
        if usermedians is not None:
            if len(np.ravel(usermedians)) != box_count or np.shape(usermedians)[0] != box_count:
                raise ValueError("usermedians length not compatible with x")
            for stats, med in zip(bxpstats, usermedians):
                if med is not None:
                    stats["med"] = med

        # Override the confidence intervals where the caller supplied them.
        if conf_intervals is not None:
            if np.shape(conf_intervals)[0] != box_count:
                raise ValueError("conf_intervals length not compatible with x")
            for stats, ci in zip(bxpstats, conf_intervals):
                if ci is None:
                    continue
                if len(ci) != 2:
                    raise ValueError("each confidence interval must have two values")
                if ci[0] is not None:
                    stats["cilo"] = ci[0]
                if ci[1] is not None:
                    stats["cihi"] = ci[1]

        # ``manage_ticks`` takes precedence over the legacy ``manage_xticks``.
        if manage_ticks is not None:
            should_manage_ticks = manage_ticks
        elif manage_xticks is not None:
            should_manage_ticks = manage_xticks
        else:
            should_manage_ticks = True

        # Pick the keyword name understood by the installed matplotlib.
        if LooseVersion(mat.__version__) < LooseVersion("3.1.0"):
            tick_keyword = "manage_xticks"
        else:
            tick_keyword = "manage_ticks"
        extra_args = {tick_keyword: should_manage_ticks}

        return ax.bxp(
            bxpstats,
            positions=positions,
            widths=widths,
            vert=vert,
            patch_artist=patch_artist,
            shownotches=notch,
            showmeans=showmeans,
            showcaps=showcaps,
            showbox=showbox,
            boxprops=boxprops,
            flierprops=flierprops,
            medianprops=medianprops,
            meanprops=meanprops,
            meanline=meanline,
            showfliers=showfliers,
            capprops=capprops,
            whiskerprops=whiskerprops,
            zorder=zorder,
            **extra_args,
        )

    def _plot(self, ax, bxpstats, column_num=None, return_type="axes", **kwds):
        """Draw the boxes and return ``(result, artists)``.

        ``result`` is the artists themselves for ``return_type="dict"``, a
        ``self.BP(ax=..., lines=...)`` object for ``"both"``, and ``ax`` for any
        other value. ``column_num`` is accepted but not used.
        """
        artists = self.boxplot(ax, bxpstats, **kwds)

        if return_type == "dict":
            result = artists
        elif return_type == "both":
            result = self.BP(ax=ax, lines=artists)
        else:
            result = ax
        return result, artists

    def _compute_plot_data(self):
        """Compute the statistics of ``self.data`` and store them as ``{label: [stats]}``."""
        series = self.data
        series_name = series.name
        spark_column_name = series._internal.spark_column_name_for(series._column_label)

        # Fill every box-plot option that was not given from matplotlib's rc defaults.
        self.kwds.update(PandasOnSparkBoxPlot.rc_defaults(**self.kwds))

        # Options that drive the computation below.
        showfliers = self.kwds.get("showfliers", False)
        whis = self.kwds.get("whis", 1.5)
        labels = self.kwds.get("labels", [series_name])

        # pandas-on-Spark specific option controlling the precision of approx_percentile.
        precision = self.kwds.get("precision", 0.01)

        # Mean, median, Q1 and Q3 (via approx_percentile and ``precision``) plus the fences.
        col_stats, col_fences = BoxPlotBase.compute_stats(
            series, spark_column_name, whis, precision
        )

        # Rows flagged as outliers or not.
        outliers = BoxPlotBase.outliers(series, spark_column_name, *col_fences)

        # Whiskers: min and max of the non-outlier values.
        whiskers = BoxPlotBase.calc_whiskers(spark_column_name, outliers)

        fliers = (
            BoxPlotBase.get_fliers(spark_column_name, outliers, whiskers[0]) if showfliers else []
        )

        # Single-entry list in the layout consumed by ``boxplot``.
        box_stats = [
            {
                "mean": col_stats["mean"],
                "med": col_stats["med"],
                "q1": col_stats["q1"],
                "q3": col_stats["q3"],
                "whislo": whiskers[0],
                "whishi": whiskers[1],
                "fliers": fliers,
                "label": labels[0],
            }
        ]

        self.data = {labels[0]: box_stats}

    def _make_plot(self):
        """Render the statistics stored by ``_compute_plot_data`` on the first axes."""
        bxpstats = list(self.data.values())[0]
        ax = self._get_ax(0)
        plot_kwds = self.kwds.copy()

        for stats in bxpstats:
            # Cap the number of drawn fliers and tell the reader about it.
            if len(stats["fliers"]) > 1000:
                stats["fliers"] = stats["fliers"][:1000]
                ax.text(
                    1,
                    1,
                    "showing top 1,000 fliers only",
                    size=6,
                    ha="right",
                    va="bottom",
                    transform=ax.transAxes,
                )

        result, artists = self._plot(
            ax, bxpstats, column_num=0, return_type=self.return_type, **plot_kwds
        )
        self.maybe_color_bp(artists)
        self._return_obj = result

        tick_labels = [pprint_thing(name) for name in self.data]
        if not self.use_index:
            # Fall back to positional labels.
            tick_labels = [pprint_thing(position) for position in range(len(tick_labels))]
        self._set_ticklabels(ax, tick_labels)

    @staticmethod
    def rc_defaults(
        notch=None,
        vert=None,
        whis=None,
        patch_artist=None,
        bootstrap=None,
        meanline=None,
        showmeans=None,
        showcaps=None,
        showbox=None,
        showfliers=None,
        **kwargs
    ):
        """Return the box-plot options with ``None`` values replaced by rc defaults.

        Each named argument that is ``None`` is looked up in
        ``matplotlib.rcParams``; any other value is returned as given. Extra
        keyword arguments are accepted and ignored.
        """
        given = dict(
            whis=whis,
            bootstrap=bootstrap,
            notch=notch,
            vert=vert,
            patch_artist=patch_artist,
            meanline=meanline,
            showmeans=showmeans,
            showcaps=showcaps,
            showbox=showbox,
            showfliers=showfliers,
        )
        rc_names = {
            "whis": "boxplot.whiskers",
            "bootstrap": "boxplot.bootstrap",
            "notch": "boxplot.notch",
            "vert": "boxplot.vertical",
            "patch_artist": "boxplot.patchartist",
            "meanline": "boxplot.meanline",
            "showmeans": "boxplot.showmeans",
            "showcaps": "boxplot.showcaps",
            "showbox": "boxplot.showbox",
            "showfliers": "boxplot.showfliers",
        }
        # rcParams is consulted only for the options that were left unset.
        return {
            option: (mat.rcParams[rc_names[option]] if value is None else value)
            for option, value in given.items()
        }
|
|
|
|
|
|
class PandasOnSparkHistPlot(PandasHistPlot, HistogramPlotBase):
    """Histogram plot whose bins and counts come from ``HistogramPlotBase`` helpers."""

    def _args_adjust(self):
        """Convert a list-like ``self.bottom`` into a NumPy array."""
        if not is_list_like(self.bottom):
            return
        self.bottom = np.array(self.bottom)

    def _compute_plot_data(self):
        """Replace ``self.data`` and ``self.bins`` with the output of ``prepare_hist_data``."""
        prepared = HistogramPlotBase.prepare_hist_data(self.data, self.bins)
        self.data, self.bins = prepared

    def _make_plot(self):
        """Draw one histogram per column label from the pre-computed counts."""
        # TODO: this logic is similar with KdePlot. Might have to deduplicate it.
        # Computing the real number of colors would need `shape`, which has to
        # count everything; a single color is requested to save that computation.
        colors = self._get_colors(num_colors=1)
        stacking_id = self._get_stacking_id()
        histograms = HistogramPlotBase.compute_hist(self.data, self.bins)
        column_labels = self.data._internal.column_labels

        for i, (label, y) in enumerate(zip(column_labels, histograms)):
            ax = self._get_ax(i)
            kwds = self.kwds.copy()

            # Single-level labels are displayed by their only element.
            label = pprint_thing(label[0] if len(label) <= 1 else label)
            kwds["label"] = label

            style, kwds = self._apply_style_colors(colors, kwds, i, label)
            if style is not None:
                kwds["style"] = style

            kwds = self._make_plot_keywords(kwds, y)
            artists = self._plot(ax, y, column_num=i, stacking_id=stacking_id, **kwds)
            self._add_legend_handle(artists[0], label, index=i)

    @classmethod
    def _plot(cls, ax, y, style=None, bins=None, bottom=0, column_num=0, stacking_id=None, **kwds):
        """Draw pre-computed counts ``y`` as a histogram and return the patches.

        ``kwds`` must contain ``"label"``. ``style`` is accepted but not used.
        """
        bin_count = len(bins) - 1
        if column_num == 0:
            cls._initialize_stacker(ax, stacking_id, bin_count)

        zeros = np.zeros(bin_count)
        bottom = bottom + cls._get_stacked_values(ax, stacking_id, zeros, kwds["label"])

        # The counts already exist, so each bin gets a single entry (its left
        # edge) weighted by its count.
        counts, _, patches = ax.hist(bins[:-1], bins=bins, bottom=bottom, weights=y, **kwds)

        cls._update_stacker(ax, stacking_id, counts)
        return patches
|
|
|
|
|
|
class PandasOnSparkPiePlot(PandasPiePlot, TopNPlotBase):
    """Pie plot combining pandas' ``PiePlot`` with ``TopNPlotBase``."""

    def __init__(self, data, **kwargs):
        # The pandas plot class receives the output of ``get_top_n``, not the raw data.
        top_rows = self.get_top_n(data)
        super().__init__(top_rows, **kwargs)

    def _make_plot(self):
        """Call ``set_result_text`` on the first axes, then let pandas draw the plot."""
        first_ax = self._get_ax(0)
        self.set_result_text(first_ax)
        super()._make_plot()
|
|
|
|
|
|
class PandasOnSparkAreaPlot(PandasAreaPlot, SampledPlotBase):
    """Area plot combining pandas' ``AreaPlot`` with ``SampledPlotBase``."""

    def __init__(self, data, **kwargs):
        # The pandas plot class receives the output of ``get_sampled``, not the raw data.
        sampled = self.get_sampled(data)
        super().__init__(sampled, **kwargs)

    def _make_plot(self):
        """Call ``set_result_text`` on the first axes, then let pandas draw the plot."""
        first_ax = self._get_ax(0)
        self.set_result_text(first_ax)
        super()._make_plot()
|
|
|
|
|
|
class PandasOnSparkLinePlot(PandasLinePlot, SampledPlotBase):
    """Line plot combining pandas' ``LinePlot`` with ``SampledPlotBase``."""

    def __init__(self, data, **kwargs):
        # The pandas plot class receives the output of ``get_sampled``, not the raw data.
        sampled = self.get_sampled(data)
        super().__init__(sampled, **kwargs)

    def _make_plot(self):
        """Call ``set_result_text`` on the first axes, then let pandas draw the plot."""
        first_ax = self._get_ax(0)
        self.set_result_text(first_ax)
        super()._make_plot()
|
|
|
|
|
|
class PandasOnSparkBarhPlot(PandasBarhPlot, TopNPlotBase):
    """Horizontal bar plot combining pandas' ``BarhPlot`` with ``TopNPlotBase``."""

    def __init__(self, data, **kwargs):
        # The pandas plot class receives the output of ``get_top_n``, not the raw data.
        top_rows = self.get_top_n(data)
        super().__init__(top_rows, **kwargs)

    def _make_plot(self):
        """Call ``set_result_text`` on the first axes, then let pandas draw the plot."""
        first_ax = self._get_ax(0)
        self.set_result_text(first_ax)
        super()._make_plot()
|
|
|
|
|
|
class PandasOnSparkScatterPlot(PandasScatterPlot, TopNPlotBase):
    """Scatter plot combining pandas' ``ScatterPlot`` with ``TopNPlotBase``."""

    def __init__(self, data, x, y, **kwargs):
        # ``x`` and ``y`` are passed through untouched; only ``data`` goes
        # through ``get_top_n`` first.
        top_rows = self.get_top_n(data)
        super().__init__(top_rows, x, y, **kwargs)

    def _make_plot(self):
        """Call ``set_result_text`` on the first axes, then let pandas draw the plot."""
        first_ax = self._get_ax(0)
        self.set_result_text(first_ax)
        super()._make_plot()
|
|
|
|
|
|
class PandasOnSparkKdePlot(PandasKdePlot, KdePlotBase):
    """Kernel density plot whose estimation is delegated to ``KdePlotBase`` helpers."""

    def _compute_plot_data(self):
        """Replace ``self.data`` with the output of ``KdePlotBase.prepare_kde_data``."""
        prepared = KdePlotBase.prepare_kde_data(self.data)
        self.data = prepared

    def _make_plot(self):
        """Draw one density curve per column label."""
        # Computing the real number of colors would need `shape`, which has to
        # count everything; a single color is requested to save that computation.
        colors = self._get_colors(num_colors=1)
        stacking_id = self._get_stacking_id()

        internal = self.data._internal
        sdf = internal.spark_frame

        for i, label in enumerate(internal.column_labels):
            # 'y' is a Spark DataFrame that selects one column.
            y = sdf.select(internal.spark_column_for(label))
            ax = self._get_ax(i)
            kwds = self.kwds.copy()

            # Single-level labels are displayed by their only element.
            label = pprint_thing(label[0] if len(label) <= 1 else label)
            kwds["label"] = label

            style, kwds = self._apply_style_colors(colors, kwds, i, label)
            if style is not None:
                kwds["style"] = style

            kwds = self._make_plot_keywords(kwds, y)
            artists = self._plot(ax, y, column_num=i, stacking_id=stacking_id, **kwds)
            self._add_legend_handle(artists[0], label, index=i)

    def _get_ind(self, y):
        """Return ``KdePlotBase.get_ind`` evaluated for ``y`` and ``self.ind``."""
        evaluation_points = KdePlotBase.get_ind(y, self.ind)
        return evaluation_points

    @classmethod
    def _plot(
        cls, ax, y, style=None, bw_method=None, ind=None, column_num=None, stacking_id=None, **kwds
    ):
        """Compute the density of ``y`` at ``ind`` and draw it with ``PandasMPLPlot._plot``.

        ``column_num`` and ``stacking_id`` are accepted but not used. Returns
        the value of ``PandasMPLPlot._plot``.
        """
        density = KdePlotBase.compute_kde(y, bw_method=bw_method, ind=ind)
        return PandasMPLPlot._plot(ax, ind, density, style=style, **kwds)
|
|
|
|
|
|
# Every plot class implemented in this module, in registration order.
_klasses = [
    PandasOnSparkHistPlot,
    PandasOnSparkBarPlot,
    PandasOnSparkBoxPlot,
    PandasOnSparkPiePlot,
    PandasOnSparkAreaPlot,
    PandasOnSparkLinePlot,
    PandasOnSparkBarhPlot,
    PandasOnSparkScatterPlot,
    PandasOnSparkKdePlot,
]
# Lookup from each class's ``_kind`` attribute to the class itself.
_plot_klass = {klass._kind: klass for klass in _klasses}

# Plot kinds accepted for both Series and DataFrame.
_common_kinds = {"area", "bar", "barh", "box", "hist", "kde", "line", "pie"}
# Series has no kinds beyond the common ones.
_series_kinds = set(_common_kinds)
# NOTE(review): "hexbin" is listed here although ``_klasses`` contains no
# hexbin-specific class -- confirm how that kind is expected to be handled.
_dataframe_kinds = _common_kinds | {"scatter", "hexbin"}
_pandas_on_spark_all_kinds = _common_kinds | _series_kinds | _dataframe_kinds
|
|
|
|
|
|
def plot_pandas_on_spark(data, kind, **kwargs):
    """Validate ``kind`` and dispatch to ``plot_series`` or ``plot_frame``.

    Parameters
    ----------
    data : pandas-on-Spark Series or DataFrame
        The object to plot.
    kind : str
        Plot kind. ``'density'`` is accepted as an alias of ``'kde'``, in line
        with ``_plot`` and the ``plot_series`` / ``plot_frame`` docstrings.
    **kwargs
        Forwarded unchanged to ``plot_series`` or ``plot_frame``.

    Returns
    -------
    The value returned by ``plot_series`` / ``plot_frame``, or by the callable
    built with ``unsupported_function`` when ``kind`` is not available for the
    type of ``data``.

    Raises
    ------
    ValueError
        If ``kind`` is not one of the known plot kinds.
    TypeError
        If ``data`` is neither a pandas-on-Spark Series nor DataFrame.
    """
    # Normalize the documented alias up front; otherwise 'density' would be
    # rejected below even though ``_plot`` itself maps it to 'kde'.
    kind = {"density": "kde"}.get(kind, kind)
    if kind not in _pandas_on_spark_all_kinds:
        raise ValueError("{} is not a valid plot kind".format(kind))

    # Imported lazily, as in the rest of this module.
    from pyspark.pandas import DataFrame, Series

    if isinstance(data, Series):
        if kind not in _series_kinds:
            return unsupported_function(class_name="pd.Series", method_name=kind)()
        return plot_series(data=data, kind=kind, **kwargs)
    elif isinstance(data, DataFrame):
        # A kind can be listed for DataFrames without having a plot class
        # registered in ``_plot_klass`` (no hexbin class is registered in this
        # module). Report it as unsupported instead of letting ``_plot`` fail
        # with a bare KeyError.
        if kind not in _dataframe_kinds or kind not in _plot_klass:
            return unsupported_function(class_name="pd.DataFrame", method_name=kind)()
        return plot_frame(data=data, kind=kind, **kwargs)

    # Previously this fell through and silently returned None.
    raise TypeError(
        "plot_pandas_on_spark expects a pandas-on-Spark Series or DataFrame; got {}".format(
            type(data).__name__
        )
    )
|
|
|
|
|
|
def plot_series(
    data,
    kind="line",
    ax=None,  # Series unique
    figsize=None,
    use_index=True,
    title=None,
    grid=None,
    legend=False,
    style=None,
    logx=False,
    logy=False,
    loglog=False,
    xticks=None,
    yticks=None,
    xlim=None,
    ylim=None,
    rot=None,
    fontsize=None,
    colormap=None,
    table=False,
    yerr=None,
    xerr=None,
    label=None,
    secondary_y=False,  # Series unique
    **kwds
):
    """
    Plot a Series with matplotlib.

    Every plot kind is also available as a method of the ``Series.plot``
    accessor, so ``s.plot(kind='line')`` and ``s.plot.line()`` are equivalent.

    Parameters
    ----------
    data : Series

    kind : str
        - 'line' : line plot (default)
        - 'bar' : vertical bar plot
        - 'barh' : horizontal bar plot
        - 'hist' : histogram
        - 'box' : boxplot
        - 'kde' : Kernel Density Estimation plot
        - 'density' : same as 'kde'
        - 'area' : area plot
        - 'pie' : pie plot

    ax : matplotlib axes object
        When omitted and at least one figure already exists, the current axes
        (``plt.gca()``) are used.
    figsize : a tuple (width, height) in inches
    use_index : boolean, default True
        Use index as ticks for x axis
    title : string or list
        Title of the plot. A string is printed at the top of the figure. With
        a list and `subplots` set to True, each item is printed above the
        corresponding subplot.
    grid : boolean, default None (matlab style default)
        Axis grid lines
    legend : False/True/'reverse'
        Place legend on axis subplots
    style : list or dict
        matplotlib line style per column
    logx : boolean, default False
        Use log scaling on x axis
    logy : boolean, default False
        Use log scaling on y axis
    loglog : boolean, default False
        Use log scaling on both x and y axes
    xticks : sequence
        Values to use for the xticks
    yticks : sequence
        Values to use for the yticks
    xlim : 2-tuple/list
    ylim : 2-tuple/list
    rot : int, default None
        Rotation for ticks (xticks for vertical, yticks for horizontal plots)
    fontsize : int, default None
        Font size for xticks and yticks
    colormap : str or matplotlib colormap object, default None
        Colormap to select colors from. A string is resolved to the matplotlib
        colormap of that name.
    table : boolean, Series or DataFrame, default False
        If True, draw a table using the data in the DataFrame; the data is
        transposed to meet matplotlib's default layout.
        If a Series or DataFrame is passed, that data is used for the table.
    yerr : DataFrame, Series, array-like, dict and str
        See :ref:`Plotting with Error Bars <visualization.errorbars>` for
        detail.
    xerr : same types as yerr.
    label : label argument to provide to plot
    secondary_y : boolean or sequence of ints, default False
        If True then y-axis will be on the right
    **kwds : keywords
        Options to pass to matplotlib plotting method. Options that are not
        named parameters of this function are given this way, for example:

        - colorbar : boolean, plot a colorbar (only relevant for 'scatter' and
          'hexbin' plots)
        - position : float, relative alignment for bar plot layout, from
          0 (left/bottom-end) to 1 (right/top-end); default is 0.5 (center)
        - mark_right : boolean, default True; when using a secondary_y axis,
          mark the column labels with "(right)" in the legend

    Returns
    -------
    axes : :class:`matplotlib.axes.Axes` or numpy.ndarray of them

    Notes
    -----

    - See matplotlib documentation online for more on this subject
    - For `kind` = 'bar' or 'barh', the `position` keyword sets the relative
      alignment of the bars, from 0 (left/bottom-end) to 1 (right/top-end).
      Default is 0.5 (center)
    """

    # Adapted from pandas.plotting._core so that the module-level ``_plot``
    # defined in this file is the one being called.

    import matplotlib.pyplot as plt

    # Reuse the current axes only when a figure already exists.
    reuse_current_axes = ax is None and bool(plt.get_fignums())
    if reuse_current_axes:
        with plt.rc_context():
            ax = plt.gca()
        ax = PandasMPLPlot._get_ax_layer(ax)

    options = dict(
        kind=kind,
        ax=ax,
        figsize=figsize,
        use_index=use_index,
        title=title,
        grid=grid,
        legend=legend,
        style=style,
        logx=logx,
        logy=logy,
        loglog=loglog,
        xticks=xticks,
        yticks=yticks,
        xlim=xlim,
        ylim=ylim,
        rot=rot,
        fontsize=fontsize,
        colormap=colormap,
        table=table,
        yerr=yerr,
        xerr=xerr,
        label=label,
        secondary_y=secondary_y,
    )
    # ``kwds`` cannot repeat any of the names above: they are all named parameters.
    options.update(kwds)
    return _plot(data, **options)
|
|
|
|
|
|
def plot_frame(
    data,
    x=None,
    y=None,
    kind="line",
    ax=None,
    subplots=None,
    sharex=None,
    sharey=False,
    layout=None,
    figsize=None,
    use_index=True,
    title=None,
    grid=None,
    legend=True,
    style=None,
    logx=False,
    logy=False,
    loglog=False,
    xticks=None,
    yticks=None,
    xlim=None,
    ylim=None,
    rot=None,
    fontsize=None,
    colormap=None,
    table=False,
    yerr=None,
    xerr=None,
    secondary_y=False,
    sort_columns=False,
    **kwds
):
    """
    Plot a DataFrame with matplotlib.

    Every plot kind is also available as a method of the ``DataFrame.plot``
    accessor, so ``psdf.plot(kind='line')`` and ``psdf.plot.line()`` are
    equivalent.

    Parameters
    ----------
    data : DataFrame

    kind : str
        - 'line' : line plot (default)
        - 'bar' : vertical bar plot
        - 'barh' : horizontal bar plot
        - 'hist' : histogram
        - 'box' : boxplot
        - 'kde' : Kernel Density Estimation plot
        - 'density' : same as 'kde'
        - 'area' : area plot
        - 'pie' : pie plot
        - 'scatter' : scatter plot
    ax : matplotlib axes object
        If not passed, uses gca()
    x : label or position, default None
    y : label, position or list of label, positions, default None
        Allows plotting of one column versus another.
    subplots : default None
        Forwarded unchanged to the plot class.
    layout : default None
        Forwarded unchanged to the plot class.
    figsize : a tuple (width, height) in inches
    use_index : boolean, default True
        Use index as ticks for x axis
    title : string or list
        Title of the plot. A string is printed at the top of the figure. With
        a list and `subplots` set to True, each item is printed above the
        corresponding subplot.
    grid : boolean, default None (matlab style default)
        Axis grid lines
    legend : False/True/'reverse'
        Place legend on axis subplots
    style : list or dict
        matplotlib line style per column
    logx : boolean, default False
        Use log scaling on x axis
    logy : boolean, default False
        Use log scaling on y axis
    loglog : boolean, default False
        Use log scaling on both x and y axes
    xticks : sequence
        Values to use for the xticks
    yticks : sequence
        Values to use for the yticks
    xlim : 2-tuple/list
    ylim : 2-tuple/list
    sharex: bool or None, default is None
        Whether to share x axis or not.
    sharey: bool, default is False
        Whether to share y axis or not.
    rot : int, default None
        Rotation for ticks (xticks for vertical, yticks for horizontal plots)
    fontsize : int, default None
        Font size for xticks and yticks
    colormap : str or matplotlib colormap object, default None
        Colormap to select colors from. A string is resolved to the matplotlib
        colormap of that name.
    table : boolean, Series or DataFrame, default False
        If True, draw a table using the data in the DataFrame; the data is
        transposed to meet matplotlib's default layout.
        If a Series or DataFrame is passed, that data is used for the table.
    yerr : DataFrame, Series, array-like, dict and str
        See :ref:`Plotting with Error Bars <visualization.errorbars>` for
        detail.
    xerr : same types as yerr.
    secondary_y : boolean or sequence of ints, default False
        If True then y-axis will be on the right
    sort_columns: bool, default is False
        When True, will sort values on plots.
    **kwds : keywords
        Options to pass to matplotlib plotting method. Options that are not
        named parameters of this function are given this way, for example:

        - colorbar : boolean, plot a colorbar (only relevant for 'scatter' and
          'hexbin' plots)
        - position : float, relative alignment for bar plot layout, from
          0 (left/bottom-end) to 1 (right/top-end); default is 0.5 (center)
        - label : label argument to provide to plot
        - mark_right : boolean, default True; when using a secondary_y axis,
          mark the column labels with "(right)" in the legend

    Returns
    -------
    axes : :class:`matplotlib.axes.Axes` or numpy.ndarray of them

    Notes
    -----

    - See matplotlib documentation online for more on this subject
    - For `kind` = 'bar' or 'barh', the `position` keyword sets the relative
      alignment of the bars, from 0 (left/bottom-end) to 1 (right/top-end).
      Default is 0.5 (center)
    """

    options = dict(
        kind=kind,
        x=x,
        y=y,
        ax=ax,
        figsize=figsize,
        use_index=use_index,
        title=title,
        grid=grid,
        legend=legend,
        subplots=subplots,
        style=style,
        logx=logx,
        logy=logy,
        loglog=loglog,
        xticks=xticks,
        yticks=yticks,
        xlim=xlim,
        ylim=ylim,
        rot=rot,
        fontsize=fontsize,
        colormap=colormap,
        table=table,
        yerr=yerr,
        xerr=xerr,
        sharex=sharex,
        sharey=sharey,
        secondary_y=secondary_y,
        layout=layout,
        sort_columns=sort_columns,
    )
    # ``kwds`` cannot repeat any of the names above: they are all named parameters.
    options.update(kwds)
    return _plot(data, **options)
|
|
|
|
|
|
def _plot(data, x=None, y=None, subplots=False, ax=None, kind="line", **kwds):
    """Instantiate the plot class registered for ``kind``, draw it and return its result.

    Copied from ``pandas.plotting._core`` and adapted to handle pandas-on-Spark
    DataFrame and Series.

    Parameters
    ----------
    data : pandas-on-Spark Series or DataFrame
        The object to plot.
    x, y : optional
        For 'scatter' and 'hexbin' they are passed positionally to the plot
        class. For other kinds, and only when ``data`` is a DataFrame, ``x``
        becomes the index (``data.set_index(x)``) and ``y`` selects the
        column(s) (``data[y]``).
    subplots, ax :
        Forwarded to the plot class.
    kind : str
        Plot kind; lower-cased and stripped, with 'density' mapped to 'kde'.
    **kwds
        Forwarded to the plot class.

    Returns
    -------
    The ``result`` attribute of the plot object after ``generate()`` and ``draw()``.

    Raises
    ------
    ValueError
        If ``kind`` is not one of pandas' plot kinds (``_all_kinds``).
    NotImplementedError
        If ``kind`` is a pandas plot kind for which this module registers no
        plot class in ``_plot_klass``.
    """
    from pyspark.pandas import DataFrame

    kind = kind.lower().strip()
    kind = {"density": "kde"}.get(kind, kind)
    if kind not in _all_kinds:
        raise ValueError("%r is not a valid plot kind" % kind)

    # ``_all_kinds`` comes from pandas and can contain kinds without a class in
    # ``_plot_klass``; indexing the dict directly would surface as a bare KeyError.
    klass = _plot_klass.get(kind)
    if klass is None:
        raise NotImplementedError("%r plot kind is not implemented" % kind)

    # scatter and hexbin are inherited from PlanePlot which require x and y
    if kind in ("scatter", "hexbin"):
        plot_obj = klass(data, x, y, subplots=subplots, ax=ax, kind=kind, **kwds)
    else:
        # check data type and do preprocess before applying plot
        if isinstance(data, DataFrame):
            if x is not None:
                data = data.set_index(x)
            # TODO: check if value of y is plottable
            if y is not None:
                data = data[y]

        plot_obj = klass(data, subplots=subplots, ax=ax, kind=kind, **kwds)
    plot_obj.generate()
    plot_obj.draw()
    return plot_obj.result
|