[SPARK-36370][PYTHON] _builtin_table directly imported from pandas instead of being redefined
### What changes were proposed in this pull request? This PR refactors how `_builtin_table` is defined in the `python/pyspark/pandas/groupby.py` module. pandas recently moved `_builtin_table`: as of pandas 1.3.0 it is part of the `pandas.core.common` module instead of being an attribute of the `pandas.core.base.SelectionMixin` class. The table is now imported directly from pandas (from whichever location matches the installed version) rather than being redefined in pyspark. ### Why are the changes needed? This change is not strictly required, but the current implementation redefines this table within pyspark, so any change made to the table in the pandas library would also have to be mirrored in the pyspark repository. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Ran the following command successfully: ```sh python/run-tests --testnames 'pyspark.pandas.tests.test_groupby' ``` Tests passed in 327 seconds. Closes #33687 from Cedric-Magnan/_builtin_table_from_pandas. Authored-by: Cedric-Magnan <cedric.magnan@artefact.com> Signed-off-by: Takuya UESHIN <ueshin@databricks.com>
This commit is contained in:
parent
c0441bb7e8
commit
964dfe254f
|
@ -20,13 +20,13 @@ A wrapper for GroupedData to behave similar to pandas GroupBy.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from abc import ABCMeta, abstractmethod
|
from abc import ABCMeta, abstractmethod
|
||||||
import builtins
|
|
||||||
import sys
|
import sys
|
||||||
import inspect
|
import inspect
|
||||||
from collections import OrderedDict, namedtuple
|
from collections import OrderedDict, namedtuple
|
||||||
from distutils.version import LooseVersion
|
from distutils.version import LooseVersion
|
||||||
from functools import partial
|
from functools import partial
|
||||||
from itertools import product
|
from itertools import product
|
||||||
|
from pkg_resources import parse_version # type: ignore
|
||||||
from typing import (
|
from typing import (
|
||||||
Any,
|
Any,
|
||||||
Callable,
|
Callable,
|
||||||
|
@ -44,10 +44,16 @@ from typing import (
|
||||||
TYPE_CHECKING,
|
TYPE_CHECKING,
|
||||||
)
|
)
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from pandas.api.types import is_hashable, is_list_like
|
from pandas.api.types import is_hashable, is_list_like
|
||||||
|
|
||||||
|
if parse_version(pd.__version__) >= parse_version("1.3.0"):
|
||||||
|
from pandas.core.common import _builtin_table
|
||||||
|
else:
|
||||||
|
from pandas.core.base import SelectionMixin
|
||||||
|
|
||||||
|
_builtin_table = SelectionMixin._builtin_table
|
||||||
|
|
||||||
from pyspark.sql import Column, DataFrame as SparkDataFrame, Window, functions as F
|
from pyspark.sql import Column, DataFrame as SparkDataFrame, Window, functions as F
|
||||||
from pyspark.sql.types import ( # noqa: F401
|
from pyspark.sql.types import ( # noqa: F401
|
||||||
DataType,
|
DataType,
|
||||||
|
@ -97,12 +103,6 @@ if TYPE_CHECKING:
|
||||||
# to keep it the same as pandas
|
# to keep it the same as pandas
|
||||||
NamedAgg = namedtuple("NamedAgg", ["column", "aggfunc"])
|
NamedAgg = namedtuple("NamedAgg", ["column", "aggfunc"])
|
||||||
|
|
||||||
_builtin_table = {
|
|
||||||
builtins.sum: np.sum,
|
|
||||||
builtins.max: np.max,
|
|
||||||
builtins.min: np.min,
|
|
||||||
} # type: Dict[Callable, Callable]
|
|
||||||
|
|
||||||
|
|
||||||
class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
|
class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
|
||||||
"""
|
"""
|
||||||
|
|
Loading…
Reference in a new issue