spark-instrumented-optimizer/python/pyspark/pandas/tests/plot/test_frame_plot_matplotlib.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import base64
from distutils.version import LooseVersion
from io import BytesIO
import unittest

import pandas as pd
import numpy as np

from pyspark import pandas as ps
from pyspark.pandas.config import set_option, reset_option
from pyspark.pandas.testing.utils import have_matplotlib, ReusedSQLTestCase, TestUtils

if have_matplotlib:
    import matplotlib
    from matplotlib import pyplot as plt

    matplotlib.use("agg")


@unittest.skipIf(not have_matplotlib, "matplotlib is not installed.")
class DataFramePlotMatplotlibTest(ReusedSQLTestCase, TestUtils):
    sample_ratio_default = None

    @classmethod
    def setUpClass(cls):
        super().setUpClass()
        if LooseVersion(pd.__version__) >= LooseVersion("0.25"):
            pd.set_option("plotting.backend", "matplotlib")
        set_option("plotting.backend", "matplotlib")
        set_option("plotting.max_rows", 2000)
        set_option("plotting.sample_ratio", None)

    @classmethod
    def tearDownClass(cls):
        if LooseVersion(pd.__version__) >= LooseVersion("0.25"):
            pd.reset_option("plotting.backend")
        reset_option("plotting.backend")
        reset_option("plotting.max_rows")
        reset_option("plotting.sample_ratio")
        super().tearDownClass()

    @property
    def pdf1(self):
        return pd.DataFrame(
            {"a": [1, 2, 3, 4, 5, 6, 7, 8, 9, 15, 50], "b": [2, 3, 4, 5, 7, 9, 10, 15, 34, 45, 49]},
            index=[0, 1, 3, 5, 6, 8, 9, 9, 9, 10, 10],
        )

    @property
    def kdf1(self):
        return ps.from_pandas(self.pdf1)

    @staticmethod
    def plot_to_base64(ax):
        bytes_data = BytesIO()
        ax.figure.savefig(bytes_data, format="png")
        bytes_data.seek(0)
        b64_data = base64.b64encode(bytes_data.read())
        plt.close(ax.figure)
        return b64_data

    def test_line_plot(self):
        def check_line_plot(pdf, kdf):
            ax1 = pdf.plot(kind="line", colormap="Paired")
            bin1 = self.plot_to_base64(ax1)
            ax2 = kdf.plot(kind="line", colormap="Paired")
            bin2 = self.plot_to_base64(ax2)
            self.assertEqual(bin1, bin2)

            ax3 = pdf.plot.line(colormap="Paired")
            bin3 = self.plot_to_base64(ax3)
            ax4 = kdf.plot.line(colormap="Paired")
            bin4 = self.plot_to_base64(ax4)
            self.assertEqual(bin3, bin4)

        pdf1 = self.pdf1
        kdf1 = self.kdf1
        check_line_plot(pdf1, kdf1)

        # multi-index columns
        columns = pd.MultiIndex.from_tuples([("x", "a"), ("y", "b")])
        pdf1.columns = columns
        kdf1.columns = columns
        check_line_plot(pdf1, kdf1)

    def test_area_plot(self):
        def check_area_plot(pdf, kdf):
            ax1 = pdf.plot(kind="area", colormap="Paired")
            bin1 = self.plot_to_base64(ax1)
            ax2 = kdf.plot(kind="area", colormap="Paired")
            bin2 = self.plot_to_base64(ax2)
            self.assertEqual(bin1, bin2)

            ax3 = pdf.plot.area(colormap="Paired")
            bin3 = self.plot_to_base64(ax3)
            ax4 = kdf.plot.area(colormap="Paired")
            bin4 = self.plot_to_base64(ax4)
            self.assertEqual(bin3, bin4)

        pdf = self.pdf1
        kdf = self.kdf1
        check_area_plot(pdf, kdf)

        # multi-index columns
        columns = pd.MultiIndex.from_tuples([("x", "a"), ("y", "b")])
        pdf.columns = columns
        kdf.columns = columns
        check_area_plot(pdf, kdf)

    def test_area_plot_stacked_false(self):
        def check_area_plot_stacked_false(pdf, kdf):
            ax1 = pdf.plot.area(stacked=False)
            bin1 = self.plot_to_base64(ax1)
            ax2 = kdf.plot.area(stacked=False)
            bin2 = self.plot_to_base64(ax2)
            self.assertEqual(bin1, bin2)

            # test if frame area plot is correct when stacked=False because default is True

        pdf = pd.DataFrame(
            {
                "sales": [3, 2, 3, 9, 10, 6],
                "signups": [5, 5, 6, 12, 14, 13],
                "visits": [20, 42, 28, 62, 81, 50],
            },
            index=pd.date_range(start="2018/01/01", end="2018/07/01", freq="M"),
        )
        kdf = ps.from_pandas(pdf)
        check_area_plot_stacked_false(pdf, kdf)

        # multi-index columns
        columns = pd.MultiIndex.from_tuples([("x", "sales"), ("x", "signups"), ("y", "visits")])
        pdf.columns = columns
        kdf.columns = columns
        check_area_plot_stacked_false(pdf, kdf)

    def test_area_plot_y(self):
        def check_area_plot_y(pdf, kdf, y):
            ax1 = pdf.plot.area(y=y)
            bin1 = self.plot_to_base64(ax1)
            ax2 = kdf.plot.area(y=y)
            bin2 = self.plot_to_base64(ax2)
            self.assertEqual(bin1, bin2)

        # test if frame area plot is correct when y is specified
        pdf = pd.DataFrame(
            {
                "sales": [3, 2, 3, 9, 10, 6],
                "signups": [5, 5, 6, 12, 14, 13],
                "visits": [20, 42, 28, 62, 81, 50],
            },
            index=pd.date_range(start="2018/01/01", end="2018/07/01", freq="M"),
        )
        kdf = ps.from_pandas(pdf)
        check_area_plot_y(pdf, kdf, y="sales")

        # multi-index columns
        columns = pd.MultiIndex.from_tuples([("x", "sales"), ("x", "signups"), ("y", "visits")])
        pdf.columns = columns
        kdf.columns = columns
        check_area_plot_y(pdf, kdf, y=("x", "sales"))

    def test_barh_plot_with_x_y(self):
        def check_barh_plot_with_x_y(pdf, kdf, x, y):
            ax1 = pdf.plot(kind="barh", x=x, y=y, colormap="Paired")
            bin1 = self.plot_to_base64(ax1)
            ax2 = kdf.plot(kind="barh", x=x, y=y, colormap="Paired")
            bin2 = self.plot_to_base64(ax2)
            self.assertEqual(bin1, bin2)

            ax3 = pdf.plot.barh(x=x, y=y, colormap="Paired")
            bin3 = self.plot_to_base64(ax3)
            ax4 = kdf.plot.barh(x=x, y=y, colormap="Paired")
            bin4 = self.plot_to_base64(ax4)
            self.assertEqual(bin3, bin4)

        # this is testing plot with specified x and y
        pdf1 = pd.DataFrame({"lab": ["A", "B", "C"], "val": [10, 30, 20]})
        kdf1 = ps.from_pandas(pdf1)
        check_barh_plot_with_x_y(pdf1, kdf1, x="lab", y="val")

        # multi-index columns
        columns = pd.MultiIndex.from_tuples([("x", "lab"), ("y", "val")])
        pdf1.columns = columns
        kdf1.columns = columns
        check_barh_plot_with_x_y(pdf1, kdf1, x=("x", "lab"), y=("y", "val"))

    def test_barh_plot(self):
        def check_barh_plot(pdf, kdf):
            ax1 = pdf.plot(kind="barh", colormap="Paired")
            bin1 = self.plot_to_base64(ax1)
            ax2 = kdf.plot(kind="barh", colormap="Paired")
            bin2 = self.plot_to_base64(ax2)
            self.assertEqual(bin1, bin2)

            ax3 = pdf.plot.barh(colormap="Paired")
            bin3 = self.plot_to_base64(ax3)
            ax4 = kdf.plot.barh(colormap="Paired")
            bin4 = self.plot_to_base64(ax4)
            self.assertEqual(bin3, bin4)

        # this is testing when x or y is not assigned
        pdf1 = pd.DataFrame({"lab": ["A", "B", "C"], "val": [10, 30, 20]})
        kdf1 = ps.from_pandas(pdf1)
        check_barh_plot(pdf1, kdf1)

        # multi-index columns
        columns = pd.MultiIndex.from_tuples([("x", "lab"), ("y", "val")])
        pdf1.columns = columns
        kdf1.columns = columns
        check_barh_plot(pdf1, kdf1)

    def test_bar_plot(self):
        def check_bar_plot(pdf, kdf):
            ax1 = pdf.plot(kind="bar", colormap="Paired")
            bin1 = self.plot_to_base64(ax1)
            ax2 = kdf.plot(kind="bar", colormap="Paired")
            bin2 = self.plot_to_base64(ax2)
            self.assertEqual(bin1, bin2)

            ax3 = pdf.plot.bar(colormap="Paired")
            bin3 = self.plot_to_base64(ax3)
            ax4 = kdf.plot.bar(colormap="Paired")
            bin4 = self.plot_to_base64(ax4)
            self.assertEqual(bin3, bin4)

        pdf1 = self.pdf1
        kdf1 = self.kdf1
        check_bar_plot(pdf1, kdf1)

        # multi-index columns
        columns = pd.MultiIndex.from_tuples([("x", "lab"), ("y", "val")])
        pdf1.columns = columns
        kdf1.columns = columns
        check_bar_plot(pdf1, kdf1)

    def test_bar_with_x_y(self):
        # this is testing plot with specified x and y
        pdf = pd.DataFrame({"lab": ["A", "B", "C"], "val": [10, 30, 20]})
        kdf = ps.from_pandas(pdf)

        ax1 = pdf.plot(kind="bar", x="lab", y="val", colormap="Paired")
        bin1 = self.plot_to_base64(ax1)
        ax2 = kdf.plot(kind="bar", x="lab", y="val", colormap="Paired")
        bin2 = self.plot_to_base64(ax2)
        self.assertEqual(bin1, bin2)

        ax3 = pdf.plot.bar(x="lab", y="val", colormap="Paired")
        bin3 = self.plot_to_base64(ax3)
        ax4 = kdf.plot.bar(x="lab", y="val", colormap="Paired")
        bin4 = self.plot_to_base64(ax4)
        self.assertEqual(bin3, bin4)

        # multi-index columns
        columns = pd.MultiIndex.from_tuples([("x", "lab"), ("y", "val")])
        pdf.columns = columns
        kdf.columns = columns

        ax5 = pdf.plot(kind="bar", x=("x", "lab"), y=("y", "val"), colormap="Paired")
        bin5 = self.plot_to_base64(ax5)
        ax6 = kdf.plot(kind="bar", x=("x", "lab"), y=("y", "val"), colormap="Paired")
        bin6 = self.plot_to_base64(ax6)
        self.assertEqual(bin5, bin6)

        ax7 = pdf.plot.bar(x=("x", "lab"), y=("y", "val"), colormap="Paired")
        bin7 = self.plot_to_base64(ax7)
        ax8 = kdf.plot.bar(x=("x", "lab"), y=("y", "val"), colormap="Paired")
        bin8 = self.plot_to_base64(ax8)
        self.assertEqual(bin7, bin8)

    def test_pie_plot(self):
        def check_pie_plot(pdf, kdf, y):
            ax1 = pdf.plot.pie(y=y, figsize=(5, 5), colormap="Paired")
            bin1 = self.plot_to_base64(ax1)
            ax2 = kdf.plot.pie(y=y, figsize=(5, 5), colormap="Paired")
            bin2 = self.plot_to_base64(ax2)
            self.assertEqual(bin1, bin2)

            ax1 = pdf.plot(kind="pie", y=y, figsize=(5, 5), colormap="Paired")
            bin1 = self.plot_to_base64(ax1)
            ax2 = kdf.plot(kind="pie", y=y, figsize=(5, 5), colormap="Paired")
            bin2 = self.plot_to_base64(ax2)
            self.assertEqual(bin1, bin2)

            ax11, ax12 = pdf.plot.pie(figsize=(5, 5), subplots=True, colormap="Paired")
            bin11 = self.plot_to_base64(ax11)
            bin12 = self.plot_to_base64(ax12)
            self.assertEqual(bin11, bin12)

            ax21, ax22 = kdf.plot.pie(figsize=(5, 5), subplots=True, colormap="Paired")
            bin21 = self.plot_to_base64(ax21)
            bin22 = self.plot_to_base64(ax22)
            self.assertEqual(bin21, bin22)

            ax11, ax12 = pdf.plot(kind="pie", figsize=(5, 5), subplots=True, colormap="Paired")
            bin11 = self.plot_to_base64(ax11)
            bin12 = self.plot_to_base64(ax12)
            self.assertEqual(bin11, bin12)

            ax21, ax22 = kdf.plot(kind="pie", figsize=(5, 5), subplots=True, colormap="Paired")
            bin21 = self.plot_to_base64(ax21)
            bin22 = self.plot_to_base64(ax22)
            self.assertEqual(bin21, bin22)

        pdf1 = pd.DataFrame(
            {"mass": [0.330, 4.87, 5.97], "radius": [2439.7, 6051.8, 6378.1]},
            index=["Mercury", "Venus", "Earth"],
        )
        kdf1 = ps.from_pandas(pdf1)
        check_pie_plot(pdf1, kdf1, y="mass")

        # multi-index columns
        columns = pd.MultiIndex.from_tuples([("x", "mass"), ("y", "radius")])
        pdf1.columns = columns
        kdf1.columns = columns
        check_pie_plot(pdf1, kdf1, y=("x", "mass"))

    def test_pie_plot_error_message(self):
        # this is to test if error is correctly raising when y is not specified
        # and subplots is not set to True
        pdf = pd.DataFrame(
            {"mass": [0.330, 4.87, 5.97], "radius": [2439.7, 6051.8, 6378.1]},
            index=["Mercury", "Venus", "Earth"],
        )
        kdf = ps.from_pandas(pdf)

        with self.assertRaises(ValueError) as context:
            kdf.plot.pie(figsize=(5, 5), colormap="Paired")
        error_message = "pie requires either y column or 'subplots=True'"
        self.assertTrue(error_message in str(context.exception))

    def test_scatter_plot(self):
        def check_scatter_plot(pdf, kdf, x, y, c):
            ax1 = pdf.plot.scatter(x=x, y=y)
            bin1 = self.plot_to_base64(ax1)
            ax2 = kdf.plot.scatter(x=x, y=y)
            bin2 = self.plot_to_base64(ax2)
            self.assertEqual(bin1, bin2)

            ax1 = pdf.plot(kind="scatter", x=x, y=y)
            bin1 = self.plot_to_base64(ax1)
            ax2 = kdf.plot(kind="scatter", x=x, y=y)
            bin2 = self.plot_to_base64(ax2)
            self.assertEqual(bin1, bin2)

            # check when keyword c is given as name of a column
            ax1 = pdf.plot.scatter(x=x, y=y, c=c, s=50)
            bin1 = self.plot_to_base64(ax1)
            ax2 = kdf.plot.scatter(x=x, y=y, c=c, s=50)
            bin2 = self.plot_to_base64(ax2)
            self.assertEqual(bin1, bin2)

        # Use pandas scatter plot example
        pdf1 = pd.DataFrame(np.random.rand(50, 4), columns=["a", "b", "c", "d"])
        kdf1 = ps.from_pandas(pdf1)
        check_scatter_plot(pdf1, kdf1, x="a", y="b", c="c")

        # multi-index columns
        columns = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "c"), ("z", "d")])
        pdf1.columns = columns
        kdf1.columns = columns
        check_scatter_plot(pdf1, kdf1, x=("x", "a"), y=("x", "b"), c=("y", "c"))

    def test_hist_plot(self):
        def check_hist_plot(pdf, kdf):
            _, ax1 = plt.subplots(1, 1)
            ax1 = pdf.plot.hist()
            bin1 = self.plot_to_base64(ax1)
            _, ax2 = plt.subplots(1, 1)
            ax2 = kdf.plot.hist()
            bin2 = self.plot_to_base64(ax2)
            self.assertEqual(bin1, bin2)

            ax1 = pdf.plot.hist(bins=15)
            bin1 = self.plot_to_base64(ax1)
            ax2 = kdf.plot.hist(bins=15)
            bin2 = self.plot_to_base64(ax2)
            self.assertEqual(bin1, bin2)

            ax1 = pdf.plot(kind="hist", bins=15)
            bin1 = self.plot_to_base64(ax1)
            ax2 = kdf.plot(kind="hist", bins=15)
            bin2 = self.plot_to_base64(ax2)
            self.assertEqual(bin1, bin2)

            ax1 = pdf.plot.hist(bins=3, bottom=[2, 1, 3])
            bin1 = self.plot_to_base64(ax1)
            ax2 = kdf.plot.hist(bins=3, bottom=[2, 1, 3])
            bin2 = self.plot_to_base64(ax2)
            self.assertEqual(bin1, bin2)

        pdf1 = self.pdf1
        kdf1 = self.kdf1
        check_hist_plot(pdf1, kdf1)

        # multi-index columns
        columns = pd.MultiIndex.from_tuples([("x", "a"), ("y", "b")])
        pdf1.columns = columns
        kdf1.columns = columns
        check_hist_plot(pdf1, kdf1)

    def test_kde_plot(self):
        def moving_average(a, n=10):
            ret = np.cumsum(a, dtype=float)
            ret[n:] = ret[n:] - ret[:-n]
            return ret[n - 1:] / n

        def check_kde_plot(pdf, kdf, *args, **kwargs):
            _, ax1 = plt.subplots(1, 1)
            ax1 = pdf.plot.kde(*args, **kwargs)
            _, ax2 = plt.subplots(1, 1)
            ax2 = kdf.plot.kde(*args, **kwargs)

            try:
                for i, (line1, line2) in enumerate(zip(ax1.get_lines(), ax2.get_lines())):
                    expected = line1.get_xydata().ravel()
                    actual = line2.get_xydata().ravel()
                    # TODO: Due to implementation difference, the output is different comparing
                    # to pandas'. We should identify the root cause of difference, and reduce
                    # the diff.

                    # Note: Data is from 1 to 50. So, it smooths them by moving average and compares
                    # both.
                    self.assertTrue(
                        np.allclose(moving_average(actual), moving_average(expected), rtol=3.0)
                    )
            finally:
                ax1.cla()
                ax2.cla()

        pdf1 = self.pdf1
        kdf1 = self.kdf1
        check_kde_plot(pdf1, kdf1, bw_method=0.3)
        check_kde_plot(pdf1, kdf1, ind=[1, 2, 3], bw_method=3.0)

        # multi-index columns
        columns = pd.MultiIndex.from_tuples([("x", "a"), ("y", "b")])
        pdf1.columns = columns
        pdf1.columns = columns
        check_kde_plot(pdf1, kdf1, bw_method=0.3)
        check_kde_plot(pdf1, kdf1, ind=[1, 2, 3], bw_method=3.0)


if __name__ == "__main__":
    from pyspark.pandas.tests.plot.test_frame_plot_matplotlib import *  # noqa: F401

    try:
        import xmlrunner  # type: ignore[import]
        testRunner = xmlrunner.XMLTestRunner(output='target/test-reports', verbosity=2)
    except ImportError:
        testRunner = None
    unittest.main(testRunner=testRunner, verbosity=2)