spark-instrumented-optimizer/python/pyspark/pandas/plot/matplotlib.py
HyukjinKwon 7ff9d2e3ee [SPARK-35071][PYTHON] Rename Koalas to pandas-on-Spark in main codes
### What changes were proposed in this pull request?

This PR proposes to rename Koalas to pandas-on-Spark in main codes

### Why are the changes needed?

To have the correct name in PySpark. NOTE that the official name in the main documentation will be pandas APIs on Spark to be extra clear. pandas-on-Spark is not the official term.

### Does this PR introduce _any_ user-facing change?

No, it's master-only change. It changes the docstring and class names.

### How was this patch tested?

Manually tested via:

```bash
./python/run-tests --python-executable=python3 --modules pyspark-pandas
```

Closes #32166 from HyukjinKwon/rename-koalas.

Authored-by: HyukjinKwon <gurwls223@apache.org>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>
2021-04-15 12:48:59 +09:00

898 lines
30 KiB
Python

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from distutils.version import LooseVersion
import matplotlib as mat
import numpy as np
import pandas as pd
from matplotlib.axes._base import _process_plot_format
from pandas.core.dtypes.inference import is_list_like
from pandas.io.formats.printing import pprint_thing
from pyspark.pandas.plot import (
TopNPlotBase,
SampledPlotBase,
HistogramPlotBase,
BoxPlotBase,
unsupported_function,
KdePlotBase,
)
if LooseVersion(pd.__version__) < LooseVersion("0.25"):
from pandas.plotting._core import (
_all_kinds,
BarPlot as PandasBarPlot,
BoxPlot as PandasBoxPlot,
HistPlot as PandasHistPlot,
MPLPlot as PandasMPLPlot,
PiePlot as PandasPiePlot,
AreaPlot as PandasAreaPlot,
LinePlot as PandasLinePlot,
BarhPlot as PandasBarhPlot,
ScatterPlot as PandasScatterPlot,
KdePlot as PandasKdePlot,
)
else:
from pandas.plotting._matplotlib import (
BarPlot as PandasBarPlot,
BoxPlot as PandasBoxPlot,
HistPlot as PandasHistPlot,
PiePlot as PandasPiePlot,
AreaPlot as PandasAreaPlot,
LinePlot as PandasLinePlot,
BarhPlot as PandasBarhPlot,
ScatterPlot as PandasScatterPlot,
KdePlot as PandasKdePlot,
)
from pandas.plotting._core import PlotAccessor
from pandas.plotting._matplotlib.core import MPLPlot as PandasMPLPlot
_all_kinds = PlotAccessor._all_kinds
class PandasOnSparkBarPlot(PandasBarPlot, TopNPlotBase):
def __init__(self, data, **kwargs):
super().__init__(self.get_top_n(data), **kwargs)
def _plot(self, ax, x, y, w, start=0, log=False, **kwds):
self.set_result_text(ax)
return ax.bar(x, y, w, bottom=start, log=log, **kwds)
class PandasOnSparkBoxPlot(PandasBoxPlot, BoxPlotBase):
def boxplot(
self,
ax,
bxpstats,
notch=None,
sym=None,
vert=None,
whis=None,
positions=None,
widths=None,
patch_artist=None,
bootstrap=None,
usermedians=None,
conf_intervals=None,
meanline=None,
showmeans=None,
showcaps=None,
showbox=None,
showfliers=None,
boxprops=None,
labels=None,
flierprops=None,
medianprops=None,
meanprops=None,
capprops=None,
whiskerprops=None,
manage_ticks=None,
# manage_xticks is for compatibility of matplotlib < 3.1.0.
# Remove this when minimum version is 3.0.0
manage_xticks=None,
autorange=False,
zorder=None,
precision=None,
):
def update_dict(dictionary, rc_name, properties):
""" Loads properties in the dictionary from rc file if not already
in the dictionary"""
rc_str = "boxplot.{0}.{1}"
if dictionary is None:
dictionary = dict()
for prop_dict in properties:
dictionary.setdefault(prop_dict, mat.rcParams[rc_str.format(rc_name, prop_dict)])
return dictionary
# Common property dictionaries loading from rc
flier_props = [
"color",
"marker",
"markerfacecolor",
"markeredgecolor",
"markersize",
"linestyle",
"linewidth",
]
default_props = ["color", "linewidth", "linestyle"]
boxprops = update_dict(boxprops, "boxprops", default_props)
whiskerprops = update_dict(whiskerprops, "whiskerprops", default_props)
capprops = update_dict(capprops, "capprops", default_props)
medianprops = update_dict(medianprops, "medianprops", default_props)
meanprops = update_dict(meanprops, "meanprops", default_props)
flierprops = update_dict(flierprops, "flierprops", flier_props)
if patch_artist:
boxprops["linestyle"] = "solid"
boxprops["edgecolor"] = boxprops.pop("color")
# if non-default sym value, put it into the flier dictionary
# the logic for providing the default symbol ('b+') now lives
# in bxp in the initial value of final_flierprops
# handle all of the `sym` related logic here so we only have to pass
# on the flierprops dict.
if sym is not None:
# no-flier case, which should really be done with
# 'showfliers=False' but none-the-less deal with it to keep back
# compatibility
if sym == "":
# blow away existing dict and make one for invisible markers
flierprops = dict(linestyle="none", marker="", color="none")
# turn the fliers off just to be safe
showfliers = False
# now process the symbol string
else:
# process the symbol string
# discarded linestyle
_, marker, color = _process_plot_format(sym)
# if we have a marker, use it
if marker is not None:
flierprops["marker"] = marker
# if we have a color, use it
if color is not None:
# assume that if color is passed in the user want
# filled symbol, if the users want more control use
# flierprops
flierprops["color"] = color
flierprops["markerfacecolor"] = color
flierprops["markeredgecolor"] = color
# replace medians if necessary:
if usermedians is not None:
if len(np.ravel(usermedians)) != len(bxpstats) or np.shape(usermedians)[0] != len(
bxpstats
):
raise ValueError("usermedians length not compatible with x")
else:
# reassign medians as necessary
for stats, med in zip(bxpstats, usermedians):
if med is not None:
stats["med"] = med
if conf_intervals is not None:
if np.shape(conf_intervals)[0] != len(bxpstats):
err_mess = "conf_intervals length not compatible with x"
raise ValueError(err_mess)
else:
for stats, ci in zip(bxpstats, conf_intervals):
if ci is not None:
if len(ci) != 2:
raise ValueError("each confidence interval must " "have two values")
else:
if ci[0] is not None:
stats["cilo"] = ci[0]
if ci[1] is not None:
stats["cihi"] = ci[1]
should_manage_ticks = True
if manage_xticks is not None:
should_manage_ticks = manage_xticks
if manage_ticks is not None:
should_manage_ticks = manage_ticks
if LooseVersion(mat.__version__) < LooseVersion("3.1.0"):
extra_args = {"manage_xticks": should_manage_ticks}
else:
extra_args = {"manage_ticks": should_manage_ticks}
artists = ax.bxp(
bxpstats,
positions=positions,
widths=widths,
vert=vert,
patch_artist=patch_artist,
shownotches=notch,
showmeans=showmeans,
showcaps=showcaps,
showbox=showbox,
boxprops=boxprops,
flierprops=flierprops,
medianprops=medianprops,
meanprops=meanprops,
meanline=meanline,
showfliers=showfliers,
capprops=capprops,
whiskerprops=whiskerprops,
zorder=zorder,
**extra_args,
)
return artists
def _plot(self, ax, bxpstats, column_num=None, return_type="axes", **kwds):
bp = self.boxplot(ax, bxpstats, **kwds)
if return_type == "dict":
return bp, bp
elif return_type == "both":
return self.BP(ax=ax, lines=bp), bp
else:
return ax, bp
def _compute_plot_data(self):
colname = self.data.name
spark_column_name = self.data._internal.spark_column_name_for(self.data._column_label)
data = self.data
# Updates all props with the rc defaults from matplotlib
self.kwds.update(PandasOnSparkBoxPlot.rc_defaults(**self.kwds))
# Gets some important kwds
showfliers = self.kwds.get("showfliers", False)
whis = self.kwds.get("whis", 1.5)
labels = self.kwds.get("labels", [colname])
# This one is pandas-on-Spark specific to control precision for approx_percentile
precision = self.kwds.get("precision", 0.01)
# # Computes mean, median, Q1 and Q3 with approx_percentile and precision
col_stats, col_fences = BoxPlotBase.compute_stats(data, spark_column_name, whis, precision)
# # Creates a column to flag rows as outliers or not
outliers = BoxPlotBase.outliers(data, spark_column_name, *col_fences)
# # Computes min and max values of non-outliers - the whiskers
whiskers = BoxPlotBase.calc_whiskers(spark_column_name, outliers)
if showfliers:
fliers = BoxPlotBase.get_fliers(spark_column_name, outliers, whiskers[0])
else:
fliers = []
# Builds bxpstats dict
stats = []
item = {
"mean": col_stats["mean"],
"med": col_stats["med"],
"q1": col_stats["q1"],
"q3": col_stats["q3"],
"whislo": whiskers[0],
"whishi": whiskers[1],
"fliers": fliers,
"label": labels[0],
}
stats.append(item)
self.data = {labels[0]: stats}
def _make_plot(self):
bxpstats = list(self.data.values())[0]
ax = self._get_ax(0)
kwds = self.kwds.copy()
for stats in bxpstats:
if len(stats["fliers"]) > 1000:
stats["fliers"] = stats["fliers"][:1000]
ax.text(
1,
1,
"showing top 1,000 fliers only",
size=6,
ha="right",
va="bottom",
transform=ax.transAxes,
)
ret, bp = self._plot(ax, bxpstats, column_num=0, return_type=self.return_type, **kwds)
self.maybe_color_bp(bp)
self._return_obj = ret
labels = [l for l, _ in self.data.items()]
labels = [pprint_thing(l) for l in labels]
if not self.use_index:
labels = [pprint_thing(key) for key in range(len(labels))]
self._set_ticklabels(ax, labels)
@staticmethod
def rc_defaults(
notch=None,
vert=None,
whis=None,
patch_artist=None,
bootstrap=None,
meanline=None,
showmeans=None,
showcaps=None,
showbox=None,
showfliers=None,
**kwargs
):
# Missing arguments default to rcParams.
if whis is None:
whis = mat.rcParams["boxplot.whiskers"]
if bootstrap is None:
bootstrap = mat.rcParams["boxplot.bootstrap"]
if notch is None:
notch = mat.rcParams["boxplot.notch"]
if vert is None:
vert = mat.rcParams["boxplot.vertical"]
if patch_artist is None:
patch_artist = mat.rcParams["boxplot.patchartist"]
if meanline is None:
meanline = mat.rcParams["boxplot.meanline"]
if showmeans is None:
showmeans = mat.rcParams["boxplot.showmeans"]
if showcaps is None:
showcaps = mat.rcParams["boxplot.showcaps"]
if showbox is None:
showbox = mat.rcParams["boxplot.showbox"]
if showfliers is None:
showfliers = mat.rcParams["boxplot.showfliers"]
return dict(
whis=whis,
bootstrap=bootstrap,
notch=notch,
vert=vert,
patch_artist=patch_artist,
meanline=meanline,
showmeans=showmeans,
showcaps=showcaps,
showbox=showbox,
showfliers=showfliers,
)
class PandasOnSparkHistPlot(PandasHistPlot, HistogramPlotBase):
def _args_adjust(self):
if is_list_like(self.bottom):
self.bottom = np.array(self.bottom)
def _compute_plot_data(self):
self.data, self.bins = HistogramPlotBase.prepare_hist_data(self.data, self.bins)
def _make_plot(self):
# TODO: this logic is similar with KdePlot. Might have to deduplicate it.
# 'num_colors' requires to calculate `shape` which has to count all.
# Use 1 for now to save the computation.
colors = self._get_colors(num_colors=1)
stacking_id = self._get_stacking_id()
output_series = HistogramPlotBase.compute_hist(self.data, self.bins)
for (i, label), y in zip(enumerate(self.data._internal.column_labels), output_series):
ax = self._get_ax(i)
kwds = self.kwds.copy()
label = pprint_thing(label if len(label) > 1 else label[0])
kwds["label"] = label
style, kwds = self._apply_style_colors(colors, kwds, i, label)
if style is not None:
kwds["style"] = style
kwds = self._make_plot_keywords(kwds, y)
artists = self._plot(ax, y, column_num=i, stacking_id=stacking_id, **kwds)
self._add_legend_handle(artists[0], label, index=i)
@classmethod
def _plot(cls, ax, y, style=None, bins=None, bottom=0, column_num=0, stacking_id=None, **kwds):
if column_num == 0:
cls._initialize_stacker(ax, stacking_id, len(bins) - 1)
base = np.zeros(len(bins) - 1)
bottom = bottom + cls._get_stacked_values(ax, stacking_id, base, kwds["label"])
# Since the counts were computed already, we use them as weights and just generate
# one entry for each bin
n, bins, patches = ax.hist(bins[:-1], bins=bins, bottom=bottom, weights=y, **kwds)
cls._update_stacker(ax, stacking_id, n)
return patches
class PandasOnSparkPiePlot(PandasPiePlot, TopNPlotBase):
def __init__(self, data, **kwargs):
super().__init__(self.get_top_n(data), **kwargs)
def _make_plot(self):
self.set_result_text(self._get_ax(0))
super()._make_plot()
class PandasOnSparkAreaPlot(PandasAreaPlot, SampledPlotBase):
def __init__(self, data, **kwargs):
super().__init__(self.get_sampled(data), **kwargs)
def _make_plot(self):
self.set_result_text(self._get_ax(0))
super()._make_plot()
class PandasOnSparkLinePlot(PandasLinePlot, SampledPlotBase):
def __init__(self, data, **kwargs):
super().__init__(self.get_sampled(data), **kwargs)
def _make_plot(self):
self.set_result_text(self._get_ax(0))
super()._make_plot()
class PandasOnSparkBarhPlot(PandasBarhPlot, TopNPlotBase):
def __init__(self, data, **kwargs):
super().__init__(self.get_top_n(data), **kwargs)
def _make_plot(self):
self.set_result_text(self._get_ax(0))
super()._make_plot()
class PandasOnSparkScatterPlot(PandasScatterPlot, TopNPlotBase):
def __init__(self, data, x, y, **kwargs):
super().__init__(self.get_top_n(data), x, y, **kwargs)
def _make_plot(self):
self.set_result_text(self._get_ax(0))
super()._make_plot()
class PandasOnSparkKdePlot(PandasKdePlot, KdePlotBase):
def _compute_plot_data(self):
self.data = KdePlotBase.prepare_kde_data(self.data)
def _make_plot(self):
# 'num_colors' requires to calculate `shape` which has to count all.
# Use 1 for now to save the computation.
colors = self._get_colors(num_colors=1)
stacking_id = self._get_stacking_id()
sdf = self.data._internal.spark_frame
for i, label in enumerate(self.data._internal.column_labels):
# 'y' is a Spark DataFrame that selects one column.
y = sdf.select(self.data._internal.spark_column_for(label))
ax = self._get_ax(i)
kwds = self.kwds.copy()
label = pprint_thing(label if len(label) > 1 else label[0])
kwds["label"] = label
style, kwds = self._apply_style_colors(colors, kwds, i, label)
if style is not None:
kwds["style"] = style
kwds = self._make_plot_keywords(kwds, y)
artists = self._plot(ax, y, column_num=i, stacking_id=stacking_id, **kwds)
self._add_legend_handle(artists[0], label, index=i)
def _get_ind(self, y):
return KdePlotBase.get_ind(y, self.ind)
@classmethod
def _plot(
cls, ax, y, style=None, bw_method=None, ind=None, column_num=None, stacking_id=None, **kwds
):
y = KdePlotBase.compute_kde(y, bw_method=bw_method, ind=ind)
lines = PandasMPLPlot._plot(ax, ind, y, style=style, **kwds)
return lines
_klasses = [
PandasOnSparkHistPlot,
PandasOnSparkBarPlot,
PandasOnSparkBoxPlot,
PandasOnSparkPiePlot,
PandasOnSparkAreaPlot,
PandasOnSparkLinePlot,
PandasOnSparkBarhPlot,
PandasOnSparkScatterPlot,
PandasOnSparkKdePlot,
]
_plot_klass = {getattr(klass, "_kind"): klass for klass in _klasses}
_common_kinds = {"area", "bar", "barh", "box", "hist", "kde", "line", "pie"}
_series_kinds = _common_kinds.union(set())
_dataframe_kinds = _common_kinds.union({"scatter", "hexbin"})
_pandas_on_spark_all_kinds = _common_kinds.union(_series_kinds).union(_dataframe_kinds)
def plot_pandas_on_spark(data, kind, **kwargs):
if kind not in _pandas_on_spark_all_kinds:
raise ValueError("{} is not a valid plot kind".format(kind))
from pyspark.pandas import DataFrame, Series
if isinstance(data, Series):
if kind not in _series_kinds:
return unsupported_function(class_name="pd.Series", method_name=kind)()
return plot_series(data=data, kind=kind, **kwargs)
elif isinstance(data, DataFrame):
if kind not in _dataframe_kinds:
return unsupported_function(class_name="pd.DataFrame", method_name=kind)()
return plot_frame(data=data, kind=kind, **kwargs)
def plot_series(
data,
kind="line",
ax=None, # Series unique
figsize=None,
use_index=True,
title=None,
grid=None,
legend=False,
style=None,
logx=False,
logy=False,
loglog=False,
xticks=None,
yticks=None,
xlim=None,
ylim=None,
rot=None,
fontsize=None,
colormap=None,
table=False,
yerr=None,
xerr=None,
label=None,
secondary_y=False, # Series unique
**kwds
):
"""
Make plots of Series using matplotlib / pylab.
Each plot kind has a corresponding method on the
``Series.plot`` accessor:
``s.plot(kind='line')`` is equivalent to
``s.plot.line()``.
Parameters
----------
data : Series
kind : str
- 'line' : line plot (default)
- 'bar' : vertical bar plot
- 'barh' : horizontal bar plot
- 'hist' : histogram
- 'box' : boxplot
- 'kde' : Kernel Density Estimation plot
- 'density' : same as 'kde'
- 'area' : area plot
- 'pie' : pie plot
ax : matplotlib axes object
If not passed, uses gca()
figsize : a tuple (width, height) in inches
use_index : boolean, default True
Use index as ticks for x axis
title : string or list
Title to use for the plot. If a string is passed, print the string at
the top of the figure. If a list is passed and `subplots` is True,
print each item in the list above the corresponding subplot.
grid : boolean, default None (matlab style default)
Axis grid lines
legend : False/True/'reverse'
Place legend on axis subplots
style : list or dict
matplotlib line style per column
logx : boolean, default False
Use log scaling on x axis
logy : boolean, default False
Use log scaling on y axis
loglog : boolean, default False
Use log scaling on both x and y axes
xticks : sequence
Values to use for the xticks
yticks : sequence
Values to use for the yticks
xlim : 2-tuple/list
ylim : 2-tuple/list
rot : int, default None
Rotation for ticks (xticks for vertical, yticks for horizontal plots)
fontsize : int, default None
Font size for xticks and yticks
colormap : str or matplotlib colormap object, default None
Colormap to select colors from. If string, load colormap with that name
from matplotlib.
colorbar : boolean, optional
If True, plot colorbar (only relevant for 'scatter' and 'hexbin' plots)
position : float
Specify relative alignments for bar plot layout.
From 0 (left/bottom-end) to 1 (right/top-end). Default is 0.5 (center)
table : boolean, Series or DataFrame, default False
If True, draw a table using the data in the DataFrame and the data will
be transposed to meet matplotlib's default layout.
If a Series or DataFrame is passed, use passed data to draw a table.
yerr : DataFrame, Series, array-like, dict and str
See :ref:`Plotting with Error Bars <visualization.errorbars>` for
detail.
xerr : same types as yerr.
label : label argument to provide to plot
secondary_y : boolean or sequence of ints, default False
If True then y-axis will be on the right
mark_right : boolean, default True
When using a secondary_y axis, automatically mark the column
labels with "(right)" in the legend
**kwds : keywords
Options to pass to matplotlib plotting method
Returns
-------
axes : :class:`matplotlib.axes.Axes` or numpy.ndarray of them
Notes
-----
- See matplotlib documentation online for more on this subject
- If `kind` = 'bar' or 'barh', you can specify relative alignments
for bar plot layout by `position` keyword.
From 0 (left/bottom-end) to 1 (right/top-end). Default is 0.5 (center)
"""
# function copied from pandas.plotting._core
# so it calls modified _plot below
import matplotlib.pyplot as plt
if ax is None and len(plt.get_fignums()) > 0:
with plt.rc_context():
ax = plt.gca()
ax = PandasMPLPlot._get_ax_layer(ax)
return _plot(
data,
kind=kind,
ax=ax,
figsize=figsize,
use_index=use_index,
title=title,
grid=grid,
legend=legend,
style=style,
logx=logx,
logy=logy,
loglog=loglog,
xticks=xticks,
yticks=yticks,
xlim=xlim,
ylim=ylim,
rot=rot,
fontsize=fontsize,
colormap=colormap,
table=table,
yerr=yerr,
xerr=xerr,
label=label,
secondary_y=secondary_y,
**kwds,
)
def plot_frame(
data,
x=None,
y=None,
kind="line",
ax=None,
subplots=None,
sharex=None,
sharey=False,
layout=None,
figsize=None,
use_index=True,
title=None,
grid=None,
legend=True,
style=None,
logx=False,
logy=False,
loglog=False,
xticks=None,
yticks=None,
xlim=None,
ylim=None,
rot=None,
fontsize=None,
colormap=None,
table=False,
yerr=None,
xerr=None,
secondary_y=False,
sort_columns=False,
**kwds
):
"""
Make plots of DataFrames using matplotlib / pylab.
Each plot kind has a corresponding method on the
``DataFrame.plot`` accessor:
``kdf.plot(kind='line')`` is equivalent to
``kdf.plot.line()``.
Parameters
----------
data : DataFrame
kind : str
- 'line' : line plot (default)
- 'bar' : vertical bar plot
- 'barh' : horizontal bar plot
- 'hist' : histogram
- 'box' : boxplot
- 'kde' : Kernel Density Estimation plot
- 'density' : same as 'kde'
- 'area' : area plot
- 'pie' : pie plot
- 'scatter' : scatter plot
ax : matplotlib axes object
If not passed, uses gca()
x : label or position, default None
y : label, position or list of label, positions, default None
Allows plotting of one column versus another.
figsize : a tuple (width, height) in inches
use_index : boolean, default True
Use index as ticks for x axis
title : string or list
Title to use for the plot. If a string is passed, print the string at
the top of the figure. If a list is passed and `subplots` is True,
print each item in the list above the corresponding subplot.
grid : boolean, default None (matlab style default)
Axis grid lines
legend : False/True/'reverse'
Place legend on axis subplots
style : list or dict
matplotlib line style per column
logx : boolean, default False
Use log scaling on x axis
logy : boolean, default False
Use log scaling on y axis
loglog : boolean, default False
Use log scaling on both x and y axes
xticks : sequence
Values to use for the xticks
yticks : sequence
Values to use for the yticks
xlim : 2-tuple/list
ylim : 2-tuple/list
sharex: bool or None, default is None
Whether to share x axis or not.
sharey: bool, default is False
Whether to share y axis or not.
rot : int, default None
Rotation for ticks (xticks for vertical, yticks for horizontal plots)
fontsize : int, default None
Font size for xticks and yticks
colormap : str or matplotlib colormap object, default None
Colormap to select colors from. If string, load colormap with that name
from matplotlib.
colorbar : boolean, optional
If True, plot colorbar (only relevant for 'scatter' and 'hexbin' plots)
position : float
Specify relative alignments for bar plot layout.
From 0 (left/bottom-end) to 1 (right/top-end). Default is 0.5 (center)
table : boolean, Series or DataFrame, default False
If True, draw a table using the data in the DataFrame and the data will
be transposed to meet matplotlib's default layout.
If a Series or DataFrame is passed, use passed data to draw a table.
yerr : DataFrame, Series, array-like, dict and str
See :ref:`Plotting with Error Bars <visualization.errorbars>` for
detail.
xerr : same types as yerr.
label : label argument to provide to plot
secondary_y : boolean or sequence of ints, default False
If True then y-axis will be on the right
mark_right : boolean, default True
When using a secondary_y axis, automatically mark the column
labels with "(right)" in the legend
sort_columns: bool, default is False
When True, will sort values on plots.
**kwds : keywords
Options to pass to matplotlib plotting method
Returns
-------
axes : :class:`matplotlib.axes.Axes` or numpy.ndarray of them
Notes
-----
- See matplotlib documentation online for more on this subject
- If `kind` = 'bar' or 'barh', you can specify relative alignments
for bar plot layout by `position` keyword.
From 0 (left/bottom-end) to 1 (right/top-end). Default is 0.5 (center)
"""
return _plot(
data,
kind=kind,
x=x,
y=y,
ax=ax,
figsize=figsize,
use_index=use_index,
title=title,
grid=grid,
legend=legend,
subplots=subplots,
style=style,
logx=logx,
logy=logy,
loglog=loglog,
xticks=xticks,
yticks=yticks,
xlim=xlim,
ylim=ylim,
rot=rot,
fontsize=fontsize,
colormap=colormap,
table=table,
yerr=yerr,
xerr=xerr,
sharex=sharex,
sharey=sharey,
secondary_y=secondary_y,
layout=layout,
sort_columns=sort_columns,
**kwds,
)
def _plot(data, x=None, y=None, subplots=False, ax=None, kind="line", **kwds):
from pyspark.pandas import DataFrame
# function copied from pandas.plotting._core
# and adapted to handle pandas-on-Spark DataFrame and Series
kind = kind.lower().strip()
kind = {"density": "kde"}.get(kind, kind)
if kind in _all_kinds:
klass = _plot_klass[kind]
else:
raise ValueError("%r is not a valid plot kind" % kind)
# scatter and hexbin are inherited from PlanePlot which require x and y
if kind in ("scatter", "hexbin"):
plot_obj = klass(data, x, y, subplots=subplots, ax=ax, kind=kind, **kwds)
else:
# check data type and do preprocess before applying plot
if isinstance(data, DataFrame):
if x is not None:
data = data.set_index(x)
# TODO: check if value of y is plottable
if y is not None:
data = data[y]
plot_obj = klass(data, subplots=subplots, ax=ax, kind=kind, **kwds)
plot_obj.generate()
plot_obj.draw()
return plot_obj.result