Revert "[SPARK-35721][PYTHON] Path level discover for python unittests"

This reverts commit 5db51efa1a.
Takuya UESHIN 2021-06-29 12:08:09 -07:00
parent a6088e5036
commit 1f6e2f55d7
10 changed files with 140 additions and 117 deletions

dev/sparktestsupport/modules.py

@@ -15,72 +15,14 @@
 # limitations under the License.
 #
-from collections.abc import Iterable
 from functools import total_ordering
 import itertools
 import os
 import re
-import unittest
-import sys
-from sparktestsupport import SPARK_HOME

 all_modules = []

-def _get_module_from_name(name):
-    __import__(name)
-    return sys.modules[name]
-
-def _discover_python_unittests(*paths, discover_slow=False):
-    """Discover the python module which contains unittests under paths.
-
-    Such as:
-    ['pyspark/tests'], it will return the set of module name under the path of pyspark/tests, like
-    {'pyspark.tests.test_appsubmit', 'pyspark.tests.test_broadcast', ...}
-
-    Parameters
-    ----------
-    paths : str
-        Paths of modules to be discovered.
-    discover_slow : bool
-        If True, will only discover slow tests
-        If False, will discover all tests except slow tests
-
-    Returns
-    -------
-    A set of complete test module name discovered under specified paths
-    """
-    def add_test_module(testcases, modules, slow):
-        """Append the testcases module names to modules set"""
-        if isinstance(testcases, Iterable):
-            for test_case in testcases:
-                add_test_module(test_case, modules, slow)
-        else:
-            name = testcases.__module__
-            module = _get_module_from_name(name)
-            if slow and hasattr(module, 'is_slow_test'):
-                modules.add(name)
-            if not slow and not hasattr(module, 'is_slow_test'):
-                modules.add(name)
-
-    if not paths:
-        return []
-
-    modules = set()
-    pyspark_path = os.path.join(SPARK_HOME, "python")
-    for path in paths:
-        # Discover the unittest in every path
-        testcases = unittest.defaultTestLoader.discover(
-            os.path.join(pyspark_path, path),
-            top_level_dir=pyspark_path
-        )
-        add_test_module(testcases, modules, discover_slow)
-
-    return sorted(list(modules))

 @total_ordering
 class Module(object):
     """
@@ -446,7 +388,24 @@ pyspark_core = Module(
         "pyspark.profiler",
         "pyspark.shuffle",
         "pyspark.util",
-    ] + _discover_python_unittests("pyspark/tests"),
+        # unittests
+        "pyspark.tests.test_appsubmit",
+        "pyspark.tests.test_broadcast",
+        "pyspark.tests.test_conf",
+        "pyspark.tests.test_context",
+        "pyspark.tests.test_daemon",
+        "pyspark.tests.test_install_spark",
+        "pyspark.tests.test_join",
+        "pyspark.tests.test_profiler",
+        "pyspark.tests.test_rdd",
+        "pyspark.tests.test_rddbarrier",
+        "pyspark.tests.test_readwrite",
+        "pyspark.tests.test_serializers",
+        "pyspark.tests.test_shuffle",
+        "pyspark.tests.test_taskcontext",
+        "pyspark.tests.test_util",
+        "pyspark.tests.test_worker",
+    ]
 )
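
Note: with discovery reverted, these python_test_goals lists are maintained by hand again, so a new file under python/pyspark/tests with no matching entry is silently skipped by run-tests. A hedged sketch of a local sanity check one could run against a list like the one above (function and paths are illustrative):

import os

def missing_goals(tests_dir, package, goals):
    """List test files under tests_dir that have no entry in goals."""
    missing = []
    for fname in sorted(os.listdir(tests_dir)):
        if fname.startswith("test_") and fname.endswith(".py"):
            goal = package + "." + fname[:-3]
            if goal not in goals:
                missing.append(goal)
    return missing

# Example: compare the on-disk files with the hard-coded list above.
# print(missing_goals("python/pyspark/tests", "pyspark.tests", pyspark_core.python_test_goals))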
 pyspark_sql = Module(
@@ -478,7 +437,32 @@ pyspark_sql = Module(
         "pyspark.sql.pandas.serializers",
         "pyspark.sql.pandas.typehints",
         "pyspark.sql.pandas.utils",
-    ] + _discover_python_unittests("pyspark/sql/tests"),
+        # unittests
+        "pyspark.sql.tests.test_arrow",
+        "pyspark.sql.tests.test_catalog",
+        "pyspark.sql.tests.test_column",
+        "pyspark.sql.tests.test_conf",
+        "pyspark.sql.tests.test_context",
+        "pyspark.sql.tests.test_dataframe",
+        "pyspark.sql.tests.test_datasources",
+        "pyspark.sql.tests.test_functions",
+        "pyspark.sql.tests.test_group",
+        "pyspark.sql.tests.test_pandas_cogrouped_map",
+        "pyspark.sql.tests.test_pandas_grouped_map",
+        "pyspark.sql.tests.test_pandas_map",
+        "pyspark.sql.tests.test_pandas_udf",
+        "pyspark.sql.tests.test_pandas_udf_grouped_agg",
+        "pyspark.sql.tests.test_pandas_udf_scalar",
+        "pyspark.sql.tests.test_pandas_udf_typehints",
+        "pyspark.sql.tests.test_pandas_udf_window",
+        "pyspark.sql.tests.test_readwriter",
+        "pyspark.sql.tests.test_serde",
+        "pyspark.sql.tests.test_session",
+        "pyspark.sql.tests.test_streaming",
+        "pyspark.sql.tests.test_types",
+        "pyspark.sql.tests.test_udf",
+        "pyspark.sql.tests.test_utils",
+    ]
 )
@@ -490,7 +474,10 @@ pyspark_resource = Module(
     source_file_regexes=[
         "python/pyspark/resource"
     ],
-    python_test_goals=_discover_python_unittests("pyspark/resource/tests"),
+    python_test_goals=[
+        # unittests
+        "pyspark.resource.tests.test_resources",
+    ]
 )
@@ -507,7 +494,12 @@ pyspark_streaming = Module(
     python_test_goals=[
         # doctests
         "pyspark.streaming.util",
-    ] + _discover_python_unittests("pyspark/streaming/tests"),
+        # unittests
+        "pyspark.streaming.tests.test_context",
+        "pyspark.streaming.tests.test_dstream",
+        "pyspark.streaming.tests.test_kinesis",
+        "pyspark.streaming.tests.test_listener",
+    ]
 )
@@ -533,10 +525,17 @@ pyspark_mllib = Module(
         "pyspark.mllib.stat.KernelDensity",
         "pyspark.mllib.tree",
         "pyspark.mllib.util",
-    ] + _discover_python_unittests("pyspark/mllib/tests"),
+        # unittests
+        "pyspark.mllib.tests.test_algorithms",
+        "pyspark.mllib.tests.test_feature",
+        "pyspark.mllib.tests.test_linalg",
+        "pyspark.mllib.tests.test_stat",
+        "pyspark.mllib.tests.test_streaming_algorithms",
+        "pyspark.mllib.tests.test_util",
+    ],
     excluded_python_implementations=[
         "PyPy"  # Skip these tests under PyPy since they require numpy and it isn't available there
-    ],
+    ]
 )
@@ -560,13 +559,27 @@ pyspark_ml = Module(
         "pyspark.ml.regression",
         "pyspark.ml.stat",
         "pyspark.ml.tuning",
-    ] + _discover_python_unittests("pyspark/ml/tests"),
+        # unittests
+        "pyspark.ml.tests.test_algorithms",
+        "pyspark.ml.tests.test_base",
+        "pyspark.ml.tests.test_evaluation",
+        "pyspark.ml.tests.test_feature",
+        "pyspark.ml.tests.test_image",
+        "pyspark.ml.tests.test_linalg",
+        "pyspark.ml.tests.test_param",
+        "pyspark.ml.tests.test_persistence",
+        "pyspark.ml.tests.test_pipeline",
+        "pyspark.ml.tests.test_stat",
+        "pyspark.ml.tests.test_training_summary",
+        "pyspark.ml.tests.test_tuning",
+        "pyspark.ml.tests.test_util",
+        "pyspark.ml.tests.test_wrapper",
+    ],
     excluded_python_implementations=[
         "PyPy"  # Skip these tests under PyPy since they require numpy and it isn't available there
-    ],
+    ]
 )

 pyspark_pandas = Module(
     name="pyspark-pandas",
     dependencies=[pyspark_core, pyspark_sql],
@@ -601,14 +614,59 @@ pyspark_pandas = Module(
         "pyspark.pandas.spark.accessors",
         "pyspark.pandas.spark.utils",
         "pyspark.pandas.typedef.typehints",
-    ] + _discover_python_unittests("pyspark/pandas/tests"),
+        # unittests
+        "pyspark.pandas.tests.data_type_ops.test_base",
+        "pyspark.pandas.tests.data_type_ops.test_binary_ops",
+        "pyspark.pandas.tests.data_type_ops.test_boolean_ops",
+        "pyspark.pandas.tests.data_type_ops.test_categorical_ops",
+        "pyspark.pandas.tests.data_type_ops.test_complex_ops",
+        "pyspark.pandas.tests.data_type_ops.test_date_ops",
+        "pyspark.pandas.tests.data_type_ops.test_datetime_ops",
+        "pyspark.pandas.tests.data_type_ops.test_decimal_ops",
+        "pyspark.pandas.tests.data_type_ops.test_null_ops",
+        "pyspark.pandas.tests.data_type_ops.test_num_ops",
+        "pyspark.pandas.tests.data_type_ops.test_string_ops",
+        "pyspark.pandas.tests.data_type_ops.test_udt_ops",
+        "pyspark.pandas.tests.indexes.test_category",
+        "pyspark.pandas.tests.plot.test_frame_plot",
+        "pyspark.pandas.tests.plot.test_frame_plot_matplotlib",
+        "pyspark.pandas.tests.plot.test_frame_plot_plotly",
+        "pyspark.pandas.tests.plot.test_series_plot",
+        "pyspark.pandas.tests.plot.test_series_plot_matplotlib",
+        "pyspark.pandas.tests.plot.test_series_plot_plotly",
+        "pyspark.pandas.tests.test_categorical",
+        "pyspark.pandas.tests.test_config",
+        "pyspark.pandas.tests.test_csv",
+        "pyspark.pandas.tests.test_dataframe_conversion",
+        "pyspark.pandas.tests.test_dataframe_spark_io",
+        "pyspark.pandas.tests.test_default_index",
+        "pyspark.pandas.tests.test_expanding",
+        "pyspark.pandas.tests.test_extension",
+        "pyspark.pandas.tests.test_frame_spark",
+        "pyspark.pandas.tests.test_indexops_spark",
+        "pyspark.pandas.tests.test_internal",
+        "pyspark.pandas.tests.test_namespace",
+        "pyspark.pandas.tests.test_numpy_compat",
+        "pyspark.pandas.tests.test_ops_on_diff_frames_groupby_expanding",
+        "pyspark.pandas.tests.test_ops_on_diff_frames_groupby_rolling",
+        "pyspark.pandas.tests.test_repr",
+        "pyspark.pandas.tests.test_reshape",
+        "pyspark.pandas.tests.test_rolling",
+        "pyspark.pandas.tests.test_series_conversion",
+        "pyspark.pandas.tests.test_series_datetime",
+        "pyspark.pandas.tests.test_series_string",
+        "pyspark.pandas.tests.test_spark_functions",
+        "pyspark.pandas.tests.test_sql",
+        "pyspark.pandas.tests.test_typedef",
+        "pyspark.pandas.tests.test_utils",
+        "pyspark.pandas.tests.test_window",
+    ],
     excluded_python_implementations=[
         "PyPy"  # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and
         # they aren't available there
-    ],
+    ]
 )

 pyspark_pandas_slow = Module(
     name="pyspark-pandas-slow",
     dependencies=[pyspark_core, pyspark_sql],
@@ -620,7 +678,17 @@ pyspark_pandas_slow = Module(
         "pyspark.pandas.frame",
         "pyspark.pandas.generic",
         "pyspark.pandas.series",
-    ] + _discover_python_unittests("pyspark/pandas/tests", discover_slow=True),
+        # unittests
+        "pyspark.pandas.tests.indexes.test_base",
+        "pyspark.pandas.tests.indexes.test_datetime",
+        "pyspark.pandas.tests.test_dataframe",
+        "pyspark.pandas.tests.test_groupby",
+        "pyspark.pandas.tests.test_indexing",
+        "pyspark.pandas.tests.test_ops_on_diff_frames",
+        "pyspark.pandas.tests.test_ops_on_diff_frames_groupby",
+        "pyspark.pandas.tests.test_series",
+        "pyspark.pandas.tests.test_stats",
+    ],
     excluded_python_implementations=[
         "PyPy"  # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and
         # they aren't available there

python/pyspark/pandas/tests/indexes/test_base.py

@@ -34,11 +34,6 @@ from pyspark.pandas.missing.indexes import (
 from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils, SPARK_CONF_ARROW_ENABLED
-# This is used in run-tests.py to discover the slow test. See more in the doc of
-# _discover_python_unittests of dev/sparktestsupport/modules.py
-is_slow_test = True
 class IndexesTest(PandasOnSparkTestCase, TestUtils):
     @property
     def pdf(self):
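
Note: the comment block deleted here (and in each file below) described the entire slow-test protocol: a bare module-level attribute whose mere presence, not value, is checked at discovery time, which is why the removed helper used hasattr() rather than a truthiness test. A minimal sketch of that check (function name illustrative):

import importlib

def is_marked_slow(module_name):
    # Only the attribute's presence matters; its value is never read.
    module = importlib.import_module(module_name)
    return hasattr(module, "is_slow_test")

# e.g. is_marked_slow("pyspark.pandas.tests.test_stats") held before this revert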

python/pyspark/pandas/tests/indexes/test_datetime.py

@@ -25,11 +25,6 @@ import pyspark.pandas as ps
 from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils
-# This is used in run-tests.py to discover the slow test. See more in the doc of
-# _discover_python_unittests of dev/sparktestsupport/modules.py
-is_slow_test = True
 class DatetimeIndexTest(PandasOnSparkTestCase, TestUtils):
     @property
     def fixed_freqs(self):

python/pyspark/pandas/tests/test_dataframe.py

@@ -50,11 +50,6 @@ from pyspark.testing.sqlutils import SQLTestUtils
 from pyspark.pandas.utils import name_like_string
-# This is used in run-tests.py to discover the slow test. See more in the doc of
-# _discover_python_unittests of dev/sparktestsupport/modules.py
-is_slow_test = True
 class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils):
     @property
     def pdf(self):

python/pyspark/pandas/tests/test_groupby.py

@@ -34,11 +34,6 @@ from pyspark.pandas.groupby import is_multi_agg_with_relabel
 from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils
-# This is used in run-tests.py to discover the slow test. See more in the doc of
-# _discover_python_unittests of dev/sparktestsupport/modules.py
-is_slow_test = True
 class GroupByTest(PandasOnSparkTestCase, TestUtils):
     def test_groupby_simple(self):
         pdf = pd.DataFrame(

python/pyspark/pandas/tests/test_indexing.py

@@ -27,11 +27,6 @@ from pyspark.pandas.exceptions import SparkPandasIndexingError
 from pyspark.testing.pandasutils import ComparisonTestBase, PandasOnSparkTestCase, compare_both
-# This is used in run-tests.py to discover the slow test. See more in the doc of
-# _discover_python_unittests of dev/sparktestsupport/modules.py
-is_slow_test = True
 class BasicIndexingTest(ComparisonTestBase):
     @property
     def pdf(self):

python/pyspark/pandas/tests/test_ops_on_diff_frames.py

@@ -35,11 +35,6 @@ from pyspark.pandas.typedef.typehints import (
 )
-# This is used in run-tests.py to discover the slow test. See more in the doc of
-# _discover_python_unittests of dev/sparktestsupport/modules.py
-is_slow_test = True
 class OpsOnDiffFramesEnabledTest(PandasOnSparkTestCase, SQLTestUtils):
     @classmethod
     def setUpClass(cls):

python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py

@@ -25,11 +25,6 @@ from pyspark.testing.pandasutils import PandasOnSparkTestCase
 from pyspark.testing.sqlutils import SQLTestUtils
-# This is used in run-tests.py to discover the slow test. See more in the doc of
-# _discover_python_unittests of dev/sparktestsupport/modules.py
-is_slow_test = True
 class OpsOnDiffFramesGroupByTest(PandasOnSparkTestCase, SQLTestUtils):
     @classmethod
     def setUpClass(cls):

python/pyspark/pandas/tests/test_series.py

@@ -44,11 +44,6 @@ from pyspark.pandas.typedef.typehints import (
 )
-# This is used in run-tests.py to discover the slow test. See more in the doc of
-# _discover_python_unittests of dev/sparktestsupport/modules.py
-is_slow_test = True
 class SeriesTest(PandasOnSparkTestCase, SQLTestUtils):
     @property
     def pser(self):

python/pyspark/pandas/tests/test_stats.py

@@ -31,11 +31,6 @@ from pyspark.testing.pandasutils import PandasOnSparkTestCase, SPARK_CONF_ARROW_ENABLED
 from pyspark.testing.sqlutils import SQLTestUtils
-# This is used in run-tests.py to discover the slow test. See more in the doc of
-# _discover_python_unittests of dev/sparktestsupport/modules.py
-is_slow_test = True
 class StatsTest(PandasOnSparkTestCase, SQLTestUtils):
     def _test_stat_functions(self, pdf_or_pser, psdf_or_psser):
         functions = ["max", "min", "mean", "sum", "count"]