From e15daa31b36669a7e29367e385f28b6ba25acf09 Mon Sep 17 00:00:00 2001 From: Cedric-Magnan Date: Tue, 17 Aug 2021 10:46:49 -0700 Subject: [PATCH] [SPARK-36370][PYTHON] _builtin_table directly imported from pandas instead of being redefined ### What changes were proposed in this pull request? Refactor the way `_builtin_table` is defined in the `python/pyspark/pandas/groupby.py` module so that it is imported from pandas instead of being redefined. Pandas has recently moved `_builtin_table`: as of pandas 1.3.0 it is part of the `pandas.core.common` module instead of being an attribute of the `pandas.core.base.SelectionMixin` class. ### Why are the changes needed? This change is not strictly required, but the current implementation redefines this table within pyspark, so any change to this table in the pandas library would need to be mirrored in the pyspark repository as well. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Ran the following command successfully: ```sh python/run-tests --testnames 'pyspark.pandas.tests.test_groupby' ``` Tests passed in 327 seconds Closes #33687 from Cedric-Magnan/_builtin_table_from_pandas. Authored-by: Cedric-Magnan Signed-off-by: Takuya UESHIN (cherry picked from commit 964dfe254ff8ebf9d7f5c7115ff8f79da3f28261) Signed-off-by: Takuya UESHIN --- python/pyspark/pandas/groupby.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/python/pyspark/pandas/groupby.py b/python/pyspark/pandas/groupby.py index 376592dee3..2daf80f5b0 100644 --- a/python/pyspark/pandas/groupby.py +++ b/python/pyspark/pandas/groupby.py @@ -20,13 +20,13 @@ A wrapper for GroupedData to behave similar to pandas GroupBy. 
""" from abc import ABCMeta, abstractmethod -import builtins import sys import inspect from collections import OrderedDict, namedtuple from distutils.version import LooseVersion from functools import partial from itertools import product +from pkg_resources import parse_version # type: ignore from typing import ( Any, Callable, @@ -44,10 +44,16 @@ from typing import ( TYPE_CHECKING, ) -import numpy as np import pandas as pd from pandas.api.types import is_hashable, is_list_like +if parse_version(pd.__version__) >= parse_version("1.3.0"): + from pandas.core.common import _builtin_table +else: + from pandas.core.base import SelectionMixin + + _builtin_table = SelectionMixin._builtin_table + from pyspark.sql import Column, DataFrame as SparkDataFrame, Window, functions as F from pyspark.sql.types import ( # noqa: F401 DataType, @@ -97,12 +103,6 @@ if TYPE_CHECKING: # to keep it the same as pandas NamedAgg = namedtuple("NamedAgg", ["column", "aggfunc"]) -_builtin_table = { - builtins.sum: np.sum, - builtins.max: np.max, - builtins.min: np.min, -} # type: Dict[Callable, Callable] - class GroupBy(Generic[FrameLike], metaclass=ABCMeta): """