Revert "[SPARK-35721][PYTHON] Path level discover for python unittests"

This reverts commit 5db51efa1a.
2021-06-29 12:08:09 -07:00 · 2021-06-29 12:08:09 -07:00 · 1f6e2f55d7
parent a6088e5036
commit 1f6e2f55d7
10 changed files with 140 additions and 117 deletions
--- a/dev/sparktestsupport/modules.py
+++ b/dev/sparktestsupport/modules.py
@@ -15,72 +15,14 @@
 # limitations under the License.
 #

-from collections.abc import Iterable
 from functools import total_ordering
 import itertools
 import os
 import re
-import unittest
-import sys
-
-from sparktestsupport import SPARK_HOME

 all_modules = []


-def _get_module_from_name(name):
-    __import__(name)
-    return sys.modules[name]
-
-
-def _discover_python_unittests(*paths, discover_slow=False):
-    """Discover the python module which contains unittests under paths.
-
-    Such as:
-    ['pyspark/tests'], it will return the set of module name under the path of pyspark/tests, like
-    {'pyspark.tests.test_appsubmit', 'pyspark.tests.test_broadcast', ...}
-
-    Parameters
-    ----------
-    paths : str
-        Paths of modules to be discovered.
-    discover_slow : bool
-        If True, will only discover slow tests
-        If False, will discover all tests except slow tests
-
-    Returns
-    -------
-    A set of complete test module name discovered under specified paths
-    """
-
-    def add_test_module(testcases, modules, slow):
-        """Append the testcases module names to modules set"""
-        if isinstance(testcases, Iterable):
-            for test_case in testcases:
-                add_test_module(test_case, modules, slow)
-        else:
-            name = testcases.__module__
-            module = _get_module_from_name(name)
-            if slow and hasattr(module, 'is_slow_test'):
-                modules.add(name)
-            if not slow and not hasattr(module, 'is_slow_test'):
-                modules.add(name)
-
-    if not paths:
-        return []
-    modules = set()
-    pyspark_path = os.path.join(SPARK_HOME, "python")
-    for path in paths:
-        # Discover the unittest in every path
-        testcases = unittest.defaultTestLoader.discover(
-            os.path.join(pyspark_path, path),
-            top_level_dir=pyspark_path
-        )
-        add_test_module(testcases, modules, discover_slow)
-
-    return sorted(list(modules))
-
-
@total_ordering
 class Module(object):
    """
@@ -446,7 +388,24 @@ pyspark_core = Module(
        "pyspark.profiler",
        "pyspark.shuffle",
        "pyspark.util",
-    ] + _discover_python_unittests("pyspark/tests"),
+        # unittests
+        "pyspark.tests.test_appsubmit",
+        "pyspark.tests.test_broadcast",
+        "pyspark.tests.test_conf",
+        "pyspark.tests.test_context",
+        "pyspark.tests.test_daemon",
+        "pyspark.tests.test_install_spark",
+        "pyspark.tests.test_join",
+        "pyspark.tests.test_profiler",
+        "pyspark.tests.test_rdd",
+        "pyspark.tests.test_rddbarrier",
+        "pyspark.tests.test_readwrite",
+        "pyspark.tests.test_serializers",
+        "pyspark.tests.test_shuffle",
+        "pyspark.tests.test_taskcontext",
+        "pyspark.tests.test_util",
+        "pyspark.tests.test_worker",
+    ]
 )

 pyspark_sql = Module(
@@ -478,7 +437,32 @@ pyspark_sql = Module(
        "pyspark.sql.pandas.serializers",
        "pyspark.sql.pandas.typehints",
        "pyspark.sql.pandas.utils",
-    ] + _discover_python_unittests("pyspark/sql/tests"),
+        # unittests
+        "pyspark.sql.tests.test_arrow",
+        "pyspark.sql.tests.test_catalog",
+        "pyspark.sql.tests.test_column",
+        "pyspark.sql.tests.test_conf",
+        "pyspark.sql.tests.test_context",
+        "pyspark.sql.tests.test_dataframe",
+        "pyspark.sql.tests.test_datasources",
+        "pyspark.sql.tests.test_functions",
+        "pyspark.sql.tests.test_group",
+        "pyspark.sql.tests.test_pandas_cogrouped_map",
+        "pyspark.sql.tests.test_pandas_grouped_map",
+        "pyspark.sql.tests.test_pandas_map",
+        "pyspark.sql.tests.test_pandas_udf",
+        "pyspark.sql.tests.test_pandas_udf_grouped_agg",
+        "pyspark.sql.tests.test_pandas_udf_scalar",
+        "pyspark.sql.tests.test_pandas_udf_typehints",
+        "pyspark.sql.tests.test_pandas_udf_window",
+        "pyspark.sql.tests.test_readwriter",
+        "pyspark.sql.tests.test_serde",
+        "pyspark.sql.tests.test_session",
+        "pyspark.sql.tests.test_streaming",
+        "pyspark.sql.tests.test_types",
+        "pyspark.sql.tests.test_udf",
+        "pyspark.sql.tests.test_utils",
+    ]
 )


@@ -490,7 +474,10 @@ pyspark_resource = Module(
    source_file_regexes=[
        "python/pyspark/resource"
    ],
-    python_test_goals=_discover_python_unittests("pyspark/resource/tests"),
+    python_test_goals=[
+        # unittests
+        "pyspark.resource.tests.test_resources",
+    ]
 )


@@ -507,7 +494,12 @@ pyspark_streaming = Module(
    python_test_goals=[
        # doctests
        "pyspark.streaming.util",
-    ] + _discover_python_unittests("pyspark/streaming/tests"),
+        # unittests
+        "pyspark.streaming.tests.test_context",
+        "pyspark.streaming.tests.test_dstream",
+        "pyspark.streaming.tests.test_kinesis",
+        "pyspark.streaming.tests.test_listener",
+    ]
 )


@@ -533,10 +525,17 @@ pyspark_mllib = Module(
        "pyspark.mllib.stat.KernelDensity",
        "pyspark.mllib.tree",
        "pyspark.mllib.util",
-    ] + _discover_python_unittests("pyspark/mllib/tests"),
+        # unittests
+        "pyspark.mllib.tests.test_algorithms",
+        "pyspark.mllib.tests.test_feature",
+        "pyspark.mllib.tests.test_linalg",
+        "pyspark.mllib.tests.test_stat",
+        "pyspark.mllib.tests.test_streaming_algorithms",
+        "pyspark.mllib.tests.test_util",
+    ],
    excluded_python_implementations=[
        "PyPy"  # Skip these tests under PyPy since they require numpy and it isn't available there
-    ],
+    ]
 )


@@ -560,13 +559,27 @@ pyspark_ml = Module(
        "pyspark.ml.regression",
        "pyspark.ml.stat",
        "pyspark.ml.tuning",
-    ] + _discover_python_unittests("pyspark/ml/tests"),
+        # unittests
+        "pyspark.ml.tests.test_algorithms",
+        "pyspark.ml.tests.test_base",
+        "pyspark.ml.tests.test_evaluation",
+        "pyspark.ml.tests.test_feature",
+        "pyspark.ml.tests.test_image",
+        "pyspark.ml.tests.test_linalg",
+        "pyspark.ml.tests.test_param",
+        "pyspark.ml.tests.test_persistence",
+        "pyspark.ml.tests.test_pipeline",
+        "pyspark.ml.tests.test_stat",
+        "pyspark.ml.tests.test_training_summary",
+        "pyspark.ml.tests.test_tuning",
+        "pyspark.ml.tests.test_util",
+        "pyspark.ml.tests.test_wrapper",
+    ],
    excluded_python_implementations=[
        "PyPy"  # Skip these tests under PyPy since they require numpy and it isn't available there
-    ],
+    ]
 )

-
 pyspark_pandas = Module(
    name="pyspark-pandas",
    dependencies=[pyspark_core, pyspark_sql],
@@ -601,14 +614,59 @@ pyspark_pandas = Module(
        "pyspark.pandas.spark.accessors",
        "pyspark.pandas.spark.utils",
        "pyspark.pandas.typedef.typehints",
-    ] + _discover_python_unittests("pyspark/pandas/tests"),
+        # unittests
+        "pyspark.pandas.tests.data_type_ops.test_base",
+        "pyspark.pandas.tests.data_type_ops.test_binary_ops",
+        "pyspark.pandas.tests.data_type_ops.test_boolean_ops",
+        "pyspark.pandas.tests.data_type_ops.test_categorical_ops",
+        "pyspark.pandas.tests.data_type_ops.test_complex_ops",
+        "pyspark.pandas.tests.data_type_ops.test_date_ops",
+        "pyspark.pandas.tests.data_type_ops.test_datetime_ops",
+        "pyspark.pandas.tests.data_type_ops.test_decimal_ops",
+        "pyspark.pandas.tests.data_type_ops.test_null_ops",
+        "pyspark.pandas.tests.data_type_ops.test_num_ops",
+        "pyspark.pandas.tests.data_type_ops.test_string_ops",
+        "pyspark.pandas.tests.data_type_ops.test_udt_ops",
+        "pyspark.pandas.tests.indexes.test_category",
+        "pyspark.pandas.tests.plot.test_frame_plot",
+        "pyspark.pandas.tests.plot.test_frame_plot_matplotlib",
+        "pyspark.pandas.tests.plot.test_frame_plot_plotly",
+        "pyspark.pandas.tests.plot.test_series_plot",
+        "pyspark.pandas.tests.plot.test_series_plot_matplotlib",
+        "pyspark.pandas.tests.plot.test_series_plot_plotly",
+        "pyspark.pandas.tests.test_categorical",
+        "pyspark.pandas.tests.test_config",
+        "pyspark.pandas.tests.test_csv",
+        "pyspark.pandas.tests.test_dataframe_conversion",
+        "pyspark.pandas.tests.test_dataframe_spark_io",
+        "pyspark.pandas.tests.test_default_index",
+        "pyspark.pandas.tests.test_expanding",
+        "pyspark.pandas.tests.test_extension",
+        "pyspark.pandas.tests.test_frame_spark",
+        "pyspark.pandas.tests.test_indexops_spark",
+        "pyspark.pandas.tests.test_internal",
+        "pyspark.pandas.tests.test_namespace",
+        "pyspark.pandas.tests.test_numpy_compat",
+        "pyspark.pandas.tests.test_ops_on_diff_frames_groupby_expanding",
+        "pyspark.pandas.tests.test_ops_on_diff_frames_groupby_rolling",
+        "pyspark.pandas.tests.test_repr",
+        "pyspark.pandas.tests.test_reshape",
+        "pyspark.pandas.tests.test_rolling",
+        "pyspark.pandas.tests.test_series_conversion",
+        "pyspark.pandas.tests.test_series_datetime",
+        "pyspark.pandas.tests.test_series_string",
+        "pyspark.pandas.tests.test_spark_functions",
+        "pyspark.pandas.tests.test_sql",
+        "pyspark.pandas.tests.test_typedef",
+        "pyspark.pandas.tests.test_utils",
+        "pyspark.pandas.tests.test_window",
+    ],
    excluded_python_implementations=[
        "PyPy"  # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and
                # they aren't available there
-    ],
+    ]
 )

-
 pyspark_pandas_slow = Module(
    name="pyspark-pandas-slow",
    dependencies=[pyspark_core, pyspark_sql],
@@ -620,7 +678,17 @@ pyspark_pandas_slow = Module(
        "pyspark.pandas.frame",
        "pyspark.pandas.generic",
        "pyspark.pandas.series",
-    ] + _discover_python_unittests("pyspark/pandas/tests", discover_slow=True),
+        # unittests
+        "pyspark.pandas.tests.indexes.test_base",
+        "pyspark.pandas.tests.indexes.test_datetime",
+        "pyspark.pandas.tests.test_dataframe",
+        "pyspark.pandas.tests.test_groupby",
+        "pyspark.pandas.tests.test_indexing",
+        "pyspark.pandas.tests.test_ops_on_diff_frames",
+        "pyspark.pandas.tests.test_ops_on_diff_frames_groupby",
+        "pyspark.pandas.tests.test_series",
+        "pyspark.pandas.tests.test_stats",
+    ],
    excluded_python_implementations=[
        "PyPy"  # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and
        # they aren't available there
--- a/python/pyspark/pandas/tests/indexes/test_base.py
+++ b/python/pyspark/pandas/tests/indexes/test_base.py
@@ -34,11 +34,6 @@ from pyspark.pandas.missing.indexes import (
 from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils, SPARK_CONF_ARROW_ENABLED


-# This is used in run-tests.py to discover the slow test. See more in the doc of
-# _discover_python_unittests of dev/sparktestsupport/modules.py
-is_slow_test = True
-
-
 class IndexesTest(PandasOnSparkTestCase, TestUtils):
    @property
    def pdf(self):
--- a/python/pyspark/pandas/tests/indexes/test_datetime.py
+++ b/python/pyspark/pandas/tests/indexes/test_datetime.py
@@ -25,11 +25,6 @@ import pyspark.pandas as ps
 from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils


-# This is used in run-tests.py to discover the slow test. See more in the doc of
-# _discover_python_unittests of dev/sparktestsupport/modules.py
-is_slow_test = True
-
-
 class DatetimeIndexTest(PandasOnSparkTestCase, TestUtils):
    @property
    def fixed_freqs(self):
--- a/python/pyspark/pandas/tests/test_dataframe.py
+++ b/python/pyspark/pandas/tests/test_dataframe.py
@@ -50,11 +50,6 @@ from pyspark.testing.sqlutils import SQLTestUtils
 from pyspark.pandas.utils import name_like_string


-# This is used in run-tests.py to discover the slow test. See more in the doc of
-# _discover_python_unittests of dev/sparktestsupport/modules.py
-is_slow_test = True
-
-
 class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils):
    @property
    def pdf(self):
--- a/python/pyspark/pandas/tests/test_groupby.py
+++ b/python/pyspark/pandas/tests/test_groupby.py
@@ -34,11 +34,6 @@ from pyspark.pandas.groupby import is_multi_agg_with_relabel
 from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils


-# This is used in run-tests.py to discover the slow test. See more in the doc of
-# _discover_python_unittests of dev/sparktestsupport/modules.py
-is_slow_test = True
-
-
 class GroupByTest(PandasOnSparkTestCase, TestUtils):
    def test_groupby_simple(self):
        pdf = pd.DataFrame(
--- a/python/pyspark/pandas/tests/test_indexing.py
+++ b/python/pyspark/pandas/tests/test_indexing.py
@@ -27,11 +27,6 @@ from pyspark.pandas.exceptions import SparkPandasIndexingError
 from pyspark.testing.pandasutils import ComparisonTestBase, PandasOnSparkTestCase, compare_both


-# This is used in run-tests.py to discover the slow test. See more in the doc of
-# _discover_python_unittests of dev/sparktestsupport/modules.py
-is_slow_test = True
-
-
 class BasicIndexingTest(ComparisonTestBase):
    @property
    def pdf(self):
--- a/python/pyspark/pandas/tests/test_ops_on_diff_frames.py
+++ b/python/pyspark/pandas/tests/test_ops_on_diff_frames.py
@@ -35,11 +35,6 @@ from pyspark.pandas.typedef.typehints import (
 )


-# This is used in run-tests.py to discover the slow test. See more in the doc of
-# _discover_python_unittests of dev/sparktestsupport/modules.py
-is_slow_test = True
-
-
 class OpsOnDiffFramesEnabledTest(PandasOnSparkTestCase, SQLTestUtils):
    @classmethod
    def setUpClass(cls):
--- a/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py
+++ b/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py
@@ -25,11 +25,6 @@ from pyspark.testing.pandasutils import PandasOnSparkTestCase
 from pyspark.testing.sqlutils import SQLTestUtils


-# This is used in run-tests.py to discover the slow test. See more in the doc of
-# _discover_python_unittests of dev/sparktestsupport/modules.py
-is_slow_test = True
-
-
 class OpsOnDiffFramesGroupByTest(PandasOnSparkTestCase, SQLTestUtils):
    @classmethod
    def setUpClass(cls):
--- a/python/pyspark/pandas/tests/test_series.py
+++ b/python/pyspark/pandas/tests/test_series.py
@@ -44,11 +44,6 @@ from pyspark.pandas.typedef.typehints import (
 )


-# This is used in run-tests.py to discover the slow test. See more in the doc of
-# _discover_python_unittests of dev/sparktestsupport/modules.py
-is_slow_test = True
-
-
 class SeriesTest(PandasOnSparkTestCase, SQLTestUtils):
    @property
    def pser(self):
--- a/python/pyspark/pandas/tests/test_stats.py
+++ b/python/pyspark/pandas/tests/test_stats.py
@@ -31,11 +31,6 @@ from pyspark.testing.pandasutils import PandasOnSparkTestCase, SPARK_CONF_ARROW_
 from pyspark.testing.sqlutils import SQLTestUtils


-# This is used in run-tests.py to discover the slow test. See more in the doc of
-# _discover_python_unittests of dev/sparktestsupport/modules.py
-is_slow_test = True
-
-
 class StatsTest(PandasOnSparkTestCase, SQLTestUtils):
    def _test_stat_functions(self, pdf_or_pser, psdf_or_psser):
        functions = ["max", "min", "mean", "sum", "count"]