[SPARK-32319][PYSPARK] Disallow the use of unused imports
Disallow the use of unused imports:

- An unused import unnecessarily increases the memory footprint of the application.
- Imports that are only required by the examples in a docstring are moved from file scope into the example itself. This keeps the files clean, and makes each example more complete, since it now includes its own imports. :)

Before:

```
fokkodriesprongFan spark % flake8 python | grep -i "imported but unused"
python/pyspark/cloudpickle.py:46:1: F401 'functools.partial' imported but unused
python/pyspark/cloudpickle.py:55:1: F401 'traceback' imported but unused
python/pyspark/heapq3.py:868:5: F401 '_heapq.*' imported but unused
python/pyspark/__init__.py:61:1: F401 'pyspark.version.__version__' imported but unused
python/pyspark/__init__.py:62:1: F401 'pyspark._globals._NoValue' imported but unused
python/pyspark/__init__.py:115:1: F401 'pyspark.sql.SQLContext' imported but unused
python/pyspark/__init__.py:115:1: F401 'pyspark.sql.HiveContext' imported but unused
python/pyspark/__init__.py:115:1: F401 'pyspark.sql.Row' imported but unused
python/pyspark/rdd.py:21:1: F401 're' imported but unused
python/pyspark/rdd.py:29:1: F401 'tempfile.NamedTemporaryFile' imported but unused
python/pyspark/mllib/regression.py:26:1: F401 'pyspark.mllib.linalg.SparseVector' imported but unused
python/pyspark/mllib/clustering.py:28:1: F401 'pyspark.mllib.linalg.SparseVector' imported but unused
python/pyspark/mllib/clustering.py:28:1: F401 'pyspark.mllib.linalg.DenseVector' imported but unused
python/pyspark/mllib/classification.py:26:1: F401 'pyspark.mllib.linalg.SparseVector' imported but unused
python/pyspark/mllib/feature.py:28:1: F401 'pyspark.mllib.linalg.DenseVector' imported but unused
python/pyspark/mllib/feature.py:28:1: F401 'pyspark.mllib.linalg.SparseVector' imported but unused
python/pyspark/mllib/feature.py:30:1: F401 'pyspark.mllib.regression.LabeledPoint' imported but unused
python/pyspark/mllib/tests/test_linalg.py:18:1: F401 'sys' imported but unused
python/pyspark/mllib/tests/test_linalg.py:642:5: F401 'pyspark.mllib.tests.test_linalg.*' imported but unused
python/pyspark/mllib/tests/test_feature.py:21:1: F401 'numpy.random' imported but unused
python/pyspark/mllib/tests/test_feature.py:21:1: F401 'numpy.exp' imported but unused
python/pyspark/mllib/tests/test_feature.py:23:1: F401 'pyspark.mllib.linalg.Vector' imported but unused
python/pyspark/mllib/tests/test_feature.py:23:1: F401 'pyspark.mllib.linalg.VectorUDT' imported but unused
python/pyspark/mllib/tests/test_feature.py:185:5: F401 'pyspark.mllib.tests.test_feature.*' imported but unused
python/pyspark/mllib/tests/test_util.py:97:5: F401 'pyspark.mllib.tests.test_util.*' imported but unused
python/pyspark/mllib/tests/test_stat.py:23:1: F401 'pyspark.mllib.linalg.Vector' imported but unused
python/pyspark/mllib/tests/test_stat.py:23:1: F401 'pyspark.mllib.linalg.SparseVector' imported but unused
python/pyspark/mllib/tests/test_stat.py:23:1: F401 'pyspark.mllib.linalg.DenseVector' imported but unused
python/pyspark/mllib/tests/test_stat.py:23:1: F401 'pyspark.mllib.linalg.VectorUDT' imported but unused
python/pyspark/mllib/tests/test_stat.py:23:1: F401 'pyspark.mllib.linalg._convert_to_vector' imported but unused
python/pyspark/mllib/tests/test_stat.py:23:1: F401 'pyspark.mllib.linalg.DenseMatrix' imported but unused
python/pyspark/mllib/tests/test_stat.py:23:1: F401 'pyspark.mllib.linalg.SparseMatrix' imported but unused
python/pyspark/mllib/tests/test_stat.py:23:1: F401 'pyspark.mllib.linalg.MatrixUDT' imported but unused
python/pyspark/mllib/tests/test_stat.py:181:5: F401 'pyspark.mllib.tests.test_stat.*' imported but unused
python/pyspark/mllib/tests/test_streaming_algorithms.py:18:1: F401 'time.time' imported but unused
python/pyspark/mllib/tests/test_streaming_algorithms.py:18:1: F401 'time.sleep' imported but unused
python/pyspark/mllib/tests/test_streaming_algorithms.py:470:5: F401 'pyspark.mllib.tests.test_streaming_algorithms.*' imported but unused
python/pyspark/mllib/tests/test_algorithms.py:295:5: F401 'pyspark.mllib.tests.test_algorithms.*' imported but unused
python/pyspark/tests/test_serializers.py:90:13: F401 'xmlrunner' imported but unused
python/pyspark/tests/test_rdd.py:21:1: F401 'sys' imported but unused
python/pyspark/tests/test_rdd.py:29:1: F401 'pyspark.resource.ResourceProfile' imported but unused
python/pyspark/tests/test_rdd.py:885:5: F401 'pyspark.tests.test_rdd.*' imported but unused
python/pyspark/tests/test_readwrite.py:19:1: F401 'sys' imported but unused
python/pyspark/tests/test_readwrite.py:22:1: F401 'array.array' imported but unused
python/pyspark/tests/test_readwrite.py:309:5: F401 'pyspark.tests.test_readwrite.*' imported but unused
python/pyspark/tests/test_join.py:62:5: F401 'pyspark.tests.test_join.*' imported but unused
python/pyspark/tests/test_taskcontext.py:19:1: F401 'shutil' imported but unused
python/pyspark/tests/test_taskcontext.py:325:5: F401 'pyspark.tests.test_taskcontext.*' imported but unused
python/pyspark/tests/test_conf.py:36:5: F401 'pyspark.tests.test_conf.*' imported but unused
python/pyspark/tests/test_broadcast.py:148:5: F401 'pyspark.tests.test_broadcast.*' imported but unused
python/pyspark/tests/test_daemon.py:76:5: F401 'pyspark.tests.test_daemon.*' imported but unused
python/pyspark/tests/test_util.py:77:5: F401 'pyspark.tests.test_util.*' imported but unused
python/pyspark/tests/test_pin_thread.py:19:1: F401 'random' imported but unused
python/pyspark/tests/test_pin_thread.py:149:5: F401 'pyspark.tests.test_pin_thread.*' imported but unused
python/pyspark/tests/test_worker.py:19:1: F401 'sys' imported but unused
python/pyspark/tests/test_worker.py:26:5: F401 'resource' imported but unused
python/pyspark/tests/test_worker.py:203:5: F401 'pyspark.tests.test_worker.*' imported but unused
python/pyspark/tests/test_profiler.py:101:5: F401 'pyspark.tests.test_profiler.*' imported but unused
python/pyspark/tests/test_shuffle.py:18:1: F401 'sys' imported but unused
python/pyspark/tests/test_shuffle.py:171:5: F401 'pyspark.tests.test_shuffle.*' imported but unused
python/pyspark/tests/test_rddbarrier.py:43:5: F401 'pyspark.tests.test_rddbarrier.*' imported but unused
python/pyspark/tests/test_context.py:129:13: F401 'userlibrary.UserClass' imported but unused
python/pyspark/tests/test_context.py:140:13: F401 'userlib.UserClass' imported but unused
python/pyspark/tests/test_context.py:310:5: F401 'pyspark.tests.test_context.*' imported but unused
python/pyspark/tests/test_appsubmit.py:241:5: F401 'pyspark.tests.test_appsubmit.*' imported but unused
python/pyspark/streaming/dstream.py:18:1: F401 'sys' imported but unused
python/pyspark/streaming/tests/test_dstream.py:27:1: F401 'pyspark.RDD' imported but unused
python/pyspark/streaming/tests/test_dstream.py:647:5: F401 'pyspark.streaming.tests.test_dstream.*' imported but unused
python/pyspark/streaming/tests/test_kinesis.py:83:5: F401 'pyspark.streaming.tests.test_kinesis.*' imported but unused
python/pyspark/streaming/tests/test_listener.py:152:5: F401 'pyspark.streaming.tests.test_listener.*' imported but unused
python/pyspark/streaming/tests/test_context.py:178:5: F401 'pyspark.streaming.tests.test_context.*' imported but unused
python/pyspark/testing/utils.py:30:5: F401 'scipy.sparse' imported but unused
python/pyspark/testing/utils.py:36:5: F401 'numpy as np' imported but unused
python/pyspark/ml/regression.py:25:1: F401 'pyspark.ml.tree._TreeEnsembleParams' imported but unused
python/pyspark/ml/regression.py:25:1: F401 'pyspark.ml.tree._HasVarianceImpurity' imported but unused
python/pyspark/ml/regression.py:29:1: F401 'pyspark.ml.wrapper.JavaParams' imported but unused
python/pyspark/ml/util.py:19:1: F401 'sys' imported but unused
python/pyspark/ml/__init__.py:25:1: F401 'pyspark.ml.pipeline' imported but unused
python/pyspark/ml/pipeline.py:18:1: F401 'sys' imported but unused
python/pyspark/ml/stat.py:22:1: F401 'pyspark.ml.linalg.DenseMatrix' imported but unused
python/pyspark/ml/stat.py:22:1: F401 'pyspark.ml.linalg.Vectors' imported but unused
python/pyspark/ml/tests/test_training_summary.py:18:1: F401 'sys' imported but unused
python/pyspark/ml/tests/test_training_summary.py:364:5: F401 'pyspark.ml.tests.test_training_summary.*' imported but unused
python/pyspark/ml/tests/test_linalg.py:381:5: F401 'pyspark.ml.tests.test_linalg.*' imported but unused
python/pyspark/ml/tests/test_tuning.py:427:9: F401 'pyspark.sql.functions as F' imported but unused
python/pyspark/ml/tests/test_tuning.py:757:5: F401 'pyspark.ml.tests.test_tuning.*' imported but unused
python/pyspark/ml/tests/test_wrapper.py:120:5: F401 'pyspark.ml.tests.test_wrapper.*' imported but unused
python/pyspark/ml/tests/test_feature.py:19:1: F401 'sys' imported but unused
python/pyspark/ml/tests/test_feature.py:304:5: F401 'pyspark.ml.tests.test_feature.*' imported but unused
python/pyspark/ml/tests/test_image.py:19:1: F401 'py4j' imported but unused
python/pyspark/ml/tests/test_image.py:22:1: F401 'pyspark.testing.mlutils.PySparkTestCase' imported but unused
python/pyspark/ml/tests/test_image.py:71:5: F401 'pyspark.ml.tests.test_image.*' imported but unused
python/pyspark/ml/tests/test_persistence.py:456:5: F401 'pyspark.ml.tests.test_persistence.*' imported but unused
python/pyspark/ml/tests/test_evaluation.py:56:5: F401 'pyspark.ml.tests.test_evaluation.*' imported but unused
python/pyspark/ml/tests/test_stat.py:43:5: F401 'pyspark.ml.tests.test_stat.*' imported but unused
python/pyspark/ml/tests/test_base.py:70:5: F401 'pyspark.ml.tests.test_base.*' imported but unused
python/pyspark/ml/tests/test_param.py:20:1: F401 'sys' imported but unused
python/pyspark/ml/tests/test_param.py:375:5: F401 'pyspark.ml.tests.test_param.*' imported but unused
python/pyspark/ml/tests/test_pipeline.py:62:5: F401 'pyspark.ml.tests.test_pipeline.*' imported but unused
python/pyspark/ml/tests/test_algorithms.py:333:5: F401 'pyspark.ml.tests.test_algorithms.*' imported but unused
python/pyspark/ml/param/__init__.py:18:1: F401 'sys' imported but unused
python/pyspark/resource/tests/test_resources.py:17:1: F401 'random' imported but unused
python/pyspark/resource/tests/test_resources.py:20:1: F401 'pyspark.resource.ResourceProfile' imported but unused
python/pyspark/resource/tests/test_resources.py:75:5: F401 'pyspark.resource.tests.test_resources.*' imported but unused
python/pyspark/sql/functions.py:32:1: F401 'pyspark.sql.udf.UserDefinedFunction' imported but unused
python/pyspark/sql/functions.py:34:1: F401 'pyspark.sql.pandas.functions.pandas_udf' imported but unused
python/pyspark/sql/session.py:30:1: F401 'pyspark.sql.types.Row' imported but unused
python/pyspark/sql/session.py:30:1: F401 'pyspark.sql.types.StringType' imported but unused
python/pyspark/sql/readwriter.py:1084:5: F401 'pyspark.sql.Row' imported but unused
python/pyspark/sql/context.py:26:1: F401 'pyspark.sql.types.IntegerType' imported but unused
python/pyspark/sql/context.py:26:1: F401 'pyspark.sql.types.Row' imported but unused
python/pyspark/sql/context.py:26:1: F401 'pyspark.sql.types.StringType' imported but unused
python/pyspark/sql/context.py:27:1: F401 'pyspark.sql.udf.UDFRegistration' imported but unused
python/pyspark/sql/streaming.py:1212:5: F401 'pyspark.sql.Row' imported but unused
python/pyspark/sql/tests/test_utils.py:55:5: F401 'pyspark.sql.tests.test_utils.*' imported but unused
python/pyspark/sql/tests/test_pandas_map.py:18:1: F401 'sys' imported but unused
python/pyspark/sql/tests/test_pandas_map.py:22:1: F401 'pyspark.sql.functions.pandas_udf' imported but unused
python/pyspark/sql/tests/test_pandas_map.py:22:1: F401 'pyspark.sql.functions.PandasUDFType' imported but unused
python/pyspark/sql/tests/test_pandas_map.py:119:5: F401 'pyspark.sql.tests.test_pandas_map.*' imported but unused
python/pyspark/sql/tests/test_catalog.py:193:5: F401 'pyspark.sql.tests.test_catalog.*' imported but unused
python/pyspark/sql/tests/test_group.py:39:5: F401 'pyspark.sql.tests.test_group.*' imported but unused
python/pyspark/sql/tests/test_session.py:361:5: F401 'pyspark.sql.tests.test_session.*' imported but unused
python/pyspark/sql/tests/test_conf.py:49:5: F401 'pyspark.sql.tests.test_conf.*' imported but unused
python/pyspark/sql/tests/test_pandas_cogrouped_map.py:19:1: F401 'sys' imported but unused
python/pyspark/sql/tests/test_pandas_cogrouped_map.py:21:1: F401 'pyspark.sql.functions.sum' imported but unused
python/pyspark/sql/tests/test_pandas_cogrouped_map.py:21:1: F401 'pyspark.sql.functions.PandasUDFType' imported but unused
python/pyspark/sql/tests/test_pandas_cogrouped_map.py:29:5: F401 'pandas.util.testing.assert_series_equal' imported but unused
python/pyspark/sql/tests/test_pandas_cogrouped_map.py:32:5: F401 'pyarrow as pa' imported but unused
python/pyspark/sql/tests/test_pandas_cogrouped_map.py:248:5: F401 'pyspark.sql.tests.test_pandas_cogrouped_map.*' imported but unused
python/pyspark/sql/tests/test_udf.py:24:1: F401 'py4j' imported but unused
python/pyspark/sql/tests/test_pandas_udf_typehints.py:246:5: F401 'pyspark.sql.tests.test_pandas_udf_typehints.*' imported but unused
python/pyspark/sql/tests/test_functions.py:19:1: F401 'sys' imported but unused
python/pyspark/sql/tests/test_functions.py:362:9: F401 'pyspark.sql.functions.exists' imported but unused
python/pyspark/sql/tests/test_functions.py:387:5: F401 'pyspark.sql.tests.test_functions.*' imported but unused
python/pyspark/sql/tests/test_pandas_udf_scalar.py:21:1: F401 'sys' imported but unused
python/pyspark/sql/tests/test_pandas_udf_scalar.py:45:5: F401 'pyarrow as pa' imported but unused
python/pyspark/sql/tests/test_pandas_udf_window.py:355:5: F401 'pyspark.sql.tests.test_pandas_udf_window.*' imported but unused
python/pyspark/sql/tests/test_arrow.py:38:5: F401 'pyarrow as pa' imported but unused
python/pyspark/sql/tests/test_pandas_grouped_map.py:20:1: F401 'sys' imported but unused
python/pyspark/sql/tests/test_pandas_grouped_map.py:38:5: F401 'pyarrow as pa' imported but unused
python/pyspark/sql/tests/test_dataframe.py:382:9: F401 'pyspark.sql.DataFrame' imported but unused
python/pyspark/sql/avro/functions.py:125:5: F401 'pyspark.sql.Row' imported but unused
python/pyspark/sql/pandas/functions.py:19:1: F401 'sys' imported but unused
```

After:

```
fokkodriesprongFan spark % flake8 python | grep -i "imported but unused"
fokkodriesprongFan spark %
```

### What changes were proposed in this pull request?

Removing unused imports from the Python files to keep everything nice and tidy.

### Why are the changes needed?

Cleaning up imports that aren't used, and suppressing the F401 finding for imports that exist only as re-exports for other modules, preserving backward compatibility.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Adding the rule to the existing Flake8 checks.

Closes #29121 from Fokko/SPARK-32319.

Authored-by: Fokko Driesprong <fokko@apache.org>
Signed-off-by: Dongjoon Hyun <dongjoon@apache.org>
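For readers unfamiliar with the suppression used throughout this change: F401 flags imports that are never referenced in the importing module, but several of PySpark's imports exist purely as backward-compatible re-exports. The commit keeps those and silences the finding per line. A minimal sketch of the pattern, using a hypothetical `mypkg` package rather than code from this commit:

```python
# mypkg/__init__.py
# This symbol moved to mypkg._impl, but users may still write
# `from mypkg import LegacyThing`, so re-export it here. Nothing in
# this file references the name, which would trip flake8's F401 check;
# the per-line `noqa` suppresses only this finding, keeping the rule
# enabled everywhere else.
from mypkg._impl import LegacyThing  # noqa: F401
```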
parent eb74d55fb5
commit 9fcf0ea718
@@ -23,7 +23,7 @@ import sys
 from subprocess import Popen, PIPE
 
 try:
-    from jira.client import JIRA
+    from jira.client import JIRA  # noqa: F401
     # Old versions have JIRAError in exceptions package, new (0.5+) in utils.
     try:
         from jira.exceptions import JIRAError
@@ -35,7 +35,7 @@ except ImportError:
     sys.exit(-1)
 
 try:
-    from github import Github
+    from github import Github  # noqa: F401
     from github import GithubException
 except ImportError:
     print("This tool requires the PyGithub library")

@@ -147,8 +147,7 @@ flake8 checks failed."
 fi
 
 echo "starting $FLAKE8_BUILD test..."
-FLAKE8_REPORT=$( ($FLAKE8_BUILD . --count --select=E901,E999,F821,F822,F823 \
-    --max-line-length=100 --show-source --statistics) 2>&1)
+FLAKE8_REPORT=$( ($FLAKE8_BUILD --append-config dev/tox.ini --count --show-source --statistics .) 2>&1)
 FLAKE8_STATUS=$?
 
 if [ "$FLAKE8_STATUS" -ne 0 ]; then
@@ -16,7 +16,6 @@
 #
 
 from pyspark.sql import SparkSession
-from pyspark.mllib.linalg import *
 import sys
 
 if __name__ == "__main__":
@@ -26,7 +26,6 @@ import sys
 import subprocess
 import glob
 import shutil
-from collections import namedtuple
 
 from sparktestsupport import SPARK_HOME, USER_HOME, ERROR_CODES
 from sparktestsupport.shellutils import exit_from_command_with_retcode, run_cmd, rm_r, which
@@ -17,3 +17,8 @@
 ignore=E226,E241,E305,E402,E722,E731,E741,W503,W504
 max-line-length=100
 exclude=python/pyspark/cloudpickle/*.py,shared.py,python/docs/source/conf.py,work/*/*.py,python/.eggs/*,dist/*,.git/*
+
+[flake8]
+select = E901,E999,F821,F822,F823,F401
+exclude = python/pyspark/cloudpickle/*.py,shared.py,python/docs/source/conf.py,work/*/*.py,python/.eggs/*,dist/*,.git/*
+max-line-length = 100
@@ -21,7 +21,7 @@ Run with:
     ./bin/spark-submit examples/src/main/python/sql/hive.py
 """
 # $example on:spark_hive$
-from os.path import join, abspath
+from os.path import abspath
 
 from pyspark.sql import SparkSession
 from pyspark.sql import Row
@@ -17,7 +17,6 @@
 
 import time
 import threading
-import sys
 import queue as Queue
 
 from pyspark import SparkConf, SparkContext
@@ -61,8 +61,8 @@ from pyspark.serializers import MarshalSerializer, PickleSerializer
 from pyspark.status import *
 from pyspark.taskcontext import TaskContext, BarrierTaskContext, BarrierTaskInfo
 from pyspark.profiler import Profiler, BasicProfiler
-from pyspark.version import __version__
-from pyspark._globals import _NoValue
+from pyspark.version import __version__  # noqa: F401
+from pyspark._globals import _NoValue  # noqa: F401
 
 
 def since(version):
@@ -115,7 +115,7 @@ def keyword_only(func):
 
 
 # for back compatibility
-from pyspark.sql import SQLContext, HiveContext, Row
+from pyspark.sql import SQLContext, HiveContext, Row  # noqa: F401
 
 __all__ = [
     "SparkConf", "SparkContext", "SparkFiles", "RDD", "StorageLevel", "Broadcast",

@@ -23,7 +23,7 @@ from pyspark.ml.base import Estimator, Model, Predictor, PredictionModel, \
     Transformer, UnaryTransformer
 from pyspark.ml.pipeline import Pipeline, PipelineModel
 from pyspark.ml import classification, clustering, evaluation, feature, fpm, \
-    image, pipeline, recommendation, regression, stat, tuning, util, linalg, param
+    image, recommendation, regression, stat, tuning, util, linalg, param
 
 __all__ = [
     "Transformer", "UnaryTransformer", "Estimator", "Model",

@@ -15,7 +15,6 @@
 # limitations under the License.
 #
 import array
-import sys
 from abc import ABCMeta
 import copy
 

@@ -15,8 +15,6 @@
 # limitations under the License.
 #
 
-import sys
-
 from pyspark import keyword_only
 from pyspark.ml.base import Estimator, Model, Transformer
 from pyspark.ml.param import Param, Params

@@ -15,18 +15,16 @@
 # limitations under the License.
 #
 
-import sys
 from abc import ABCMeta
 
-from pyspark import since, keyword_only
+from pyspark import keyword_only
 from pyspark.ml import Predictor, PredictionModel
 from pyspark.ml.base import _PredictorParams
 from pyspark.ml.param.shared import *
 from pyspark.ml.tree import _DecisionTreeModel, _DecisionTreeParams, \
-    _TreeEnsembleModel, _TreeEnsembleParams, _RandomForestParams, _GBTParams, \
-    _HasVarianceImpurity, _TreeRegressorParams
+    _TreeEnsembleModel, _RandomForestParams, _GBTParams, _TreeRegressorParams
 from pyspark.ml.util import *
-from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaParams, \
+from pyspark.ml.wrapper import JavaEstimator, JavaModel, \
     JavaPredictor, JavaPredictionModel, JavaWrapper
 from pyspark.ml.common import inherit_doc
 from pyspark.sql import DataFrame

@@ -19,7 +19,6 @@ import sys
 
 from pyspark import since, SparkContext
 from pyspark.ml.common import _java2py, _py2java
-from pyspark.ml.linalg import DenseMatrix, Vectors
 from pyspark.ml.wrapper import JavaWrapper, _jvm
 from pyspark.sql.column import Column, _to_seq
 from pyspark.sql.functions import lit
@@ -121,7 +120,7 @@ class Correlation(object):
         DataFrame contains a single row and a single column of name
         '$METHODNAME($COLUMN)'.
 
-        >>> from pyspark.ml.linalg import Vectors
+        >>> from pyspark.ml.linalg import DenseMatrix, Vectors
         >>> from pyspark.ml.stat import Correlation
         >>> dataset = [[Vectors.dense([1, 0, 0, -2])],
         ...            [Vectors.dense([4, 5, 0, 3])],
@@ -412,6 +411,7 @@ class SummaryBuilder(JavaWrapper):
 class MultivariateGaussian(object):
     """Represents a (mean, cov) tuple
 
+    >>> from pyspark.ml.linalg import DenseMatrix, Vectors
    >>> m = MultivariateGaussian(Vectors.dense([11,12]), DenseMatrix(2, 2, (1.0, 3.0, 5.0, 2.0)))
    >>> (m.mean, m.cov.toArray())
    (DenseVector([11.0, 12.0]), array([[ 1., 5.],

@@ -330,7 +330,7 @@ class LinearRegressionTest(SparkSessionTestCase):
 
 
 if __name__ == "__main__":
-    from pyspark.ml.tests.test_algorithms import *
+    from pyspark.ml.tests.test_algorithms import *  # noqa: F401
 
     try:
        import xmlrunner
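The most common change in this diff deserves a note: many test files end with a `from <package path> import *` inside the `__main__` block. flake8 reports F401 because nothing references the imported names textually, but the import is load-bearing at runtime (it re-imports the file under its canonical package name so the test classes the runner picks up belong to that module rather than `__main__`), so the commit suppresses the finding instead of deleting the line. A rough sketch of the shape, with a hypothetical module name and a simplified runner:

```python
# mypkg/tests/test_example.py
import unittest


class ExampleTest(unittest.TestCase):
    def test_truth(self):
        self.assertTrue(True)


if __name__ == "__main__":
    # Re-import this file under its canonical package name so the test
    # classes used below are module-level names, not __main__ ones.
    # flake8 sees no textual use of the names, hence the per-line noqa.
    from mypkg.tests.test_example import *  # noqa: F401

    unittest.main(verbosity=2)
```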
@@ -67,7 +67,7 @@ class EstimatorTest(unittest.TestCase):
 
 
 if __name__ == "__main__":
-    from pyspark.ml.tests.test_base import *
+    from pyspark.ml.tests.test_base import *  # noqa: F401
 
     try:
         import xmlrunner

@@ -53,7 +53,7 @@ class EvaluatorTests(SparkSessionTestCase):
 
 
 if __name__ == "__main__":
-    from pyspark.ml.tests.test_evaluation import *
+    from pyspark.ml.tests.test_evaluation import *  # noqa: F401
 
     try:
         import xmlrunner

@@ -16,7 +16,6 @@
 # limitations under the License.
 #
 
-import sys
 import unittest
 
 from pyspark.ml.feature import Binarizer, CountVectorizer, CountVectorizerModel, HashingTF, IDF, \
@@ -301,7 +300,7 @@ class HashingTFTest(SparkSessionTestCase):
 
 
 if __name__ == "__main__":
-    from pyspark.ml.tests.test_feature import *
+    from pyspark.ml.tests.test_feature import *  # noqa: F401
 
     try:
         import xmlrunner

@@ -16,10 +16,8 @@
 #
 import unittest
 
-import py4j
-
 from pyspark.ml.image import ImageSchema
-from pyspark.testing.mlutils import PySparkTestCase, SparkSessionTestCase
+from pyspark.testing.mlutils import SparkSessionTestCase
 from pyspark.sql import Row
 from pyspark.testing.utils import QuietTest
 
@@ -68,7 +66,7 @@ class ImageFileFormatTest(SparkSessionTestCase):
 
 
 if __name__ == "__main__":
-    from pyspark.ml.tests.test_image import *
+    from pyspark.ml.tests.test_image import *  # noqa: F401
 
     try:
         import xmlrunner

@@ -378,7 +378,7 @@ class MatrixUDTTests(MLlibTestCase):
 
 
 if __name__ == "__main__":
-    from pyspark.ml.tests.test_linalg import *
+    from pyspark.ml.tests.test_linalg import *  # noqa: F401
 
     try:
         import xmlrunner

@@ -17,7 +17,6 @@
 #
 
 import inspect
-import sys
 import array as pyarray
 import unittest
 
@@ -370,7 +369,7 @@ class DefaultValuesTests(PySparkTestCase):
 
 
 if __name__ == "__main__":
-    from pyspark.ml.tests.test_param import *
+    from pyspark.ml.tests.test_param import *  # noqa: F401
 
     try:
         import xmlrunner

@@ -453,7 +453,7 @@ class PersistenceTest(SparkSessionTestCase):
 
 
 if __name__ == "__main__":
-    from pyspark.ml.tests.test_persistence import *
+    from pyspark.ml.tests.test_persistence import *  # noqa: F401
 
     try:
         import xmlrunner

@@ -59,7 +59,7 @@ class PipelineTests(PySparkTestCase):
 
 
 if __name__ == "__main__":
-    from pyspark.ml.tests.test_pipeline import *
+    from pyspark.ml.tests.test_pipeline import *  # noqa: F401
 
     try:
         import xmlrunner

@@ -40,7 +40,7 @@ class ChiSquareTestTests(SparkSessionTestCase):
 
 
 if __name__ == "__main__":
-    from pyspark.ml.tests.test_stat import *
+    from pyspark.ml.tests.test_stat import *  # noqa: F401
 
     try:
         import xmlrunner

@@ -15,7 +15,6 @@
 # limitations under the License.
 #
 
-import sys
 import unittest
 
 from pyspark.ml.classification import BinaryLogisticRegressionSummary, \
@@ -443,7 +442,7 @@ class TrainingSummaryTest(SparkSessionTestCase):
 
 
 if __name__ == "__main__":
-    from pyspark.ml.tests.test_training_summary import *
+    from pyspark.ml.tests.test_training_summary import *  # noqa: F401
 
     try:
         import xmlrunner

@@ -424,8 +424,6 @@ class CrossValidatorTests(SparkSessionTestCase):
         self.assertEqual(loadedCV.getFoldCol(), cv_with_user_folds.getFoldCol())
 
     def test_invalid_user_specified_folds(self):
-        from pyspark.sql import functions as F
-
         dataset_with_folds = self.spark.createDataFrame(
             [(Vectors.dense([0.0]), 0.0, 0),
              (Vectors.dense([0.4]), 1.0, 1),
@@ -754,7 +752,7 @@ class TrainValidationSplitTests(SparkSessionTestCase):
 
 
 if __name__ == "__main__":
-    from pyspark.ml.tests.test_tuning import *
+    from pyspark.ml.tests.test_tuning import *  # noqa: F401
 
     try:
         import xmlrunner

@@ -117,7 +117,7 @@ class WrapperTests(MLlibTestCase):
         self.assertEqual(_java2py(self.sc, java_array), expected_str_list)
 
 if __name__ == "__main__":
-    from pyspark.ml.tests.test_wrapper import *
+    from pyspark.ml.tests.test_wrapper import *  # noqa: F401
 
     try:
         import xmlrunner

@@ -16,7 +16,6 @@
 #
 
 import json
-import sys
 import os
 import time
 import uuid

@@ -23,7 +23,7 @@ import numpy
 
 from pyspark import RDD, since
 from pyspark.mllib.common import callMLlibFunc, _py2java, _java2py
-from pyspark.mllib.linalg import SparseVector, _convert_to_vector
+from pyspark.mllib.linalg import _convert_to_vector
 from pyspark.mllib.regression import (
     LabeledPoint, LinearModel, _regression_train_wrapper,
     StreamingLinearAlgorithm)
@@ -102,6 +102,7 @@ class LogisticRegressionModel(LinearClassificationModel):
     in Multinomial Logistic Regression. By default, it is binary
     logistic regression so numClasses will be set to 2.
 
+    >>> from pyspark.mllib.linalg import SparseVector
    >>> data = [
    ...     LabeledPoint(0.0, [0.0, 1.0]),
    ...     LabeledPoint(1.0, [1.0, 0.0]),
@@ -410,6 +411,7 @@ class SVMModel(LinearClassificationModel):
     :param intercept:
       Intercept computed for this model.
 
+    >>> from pyspark.mllib.linalg import SparseVector
    >>> data = [
    ...     LabeledPoint(0.0, [0.0]),
    ...     LabeledPoint(1.0, [1.0]),
@@ -569,6 +571,7 @@ class NaiveBayesModel(Saveable, Loader):
     Log of class conditional probabilities, whose dimension is C-by-D,
     where D is number of features.
 
+    >>> from pyspark.mllib.linalg import SparseVector
    >>> data = [
    ...     LabeledPoint(0.0, [0.0, 0.0]),
    ...     LabeledPoint(0.0, [0.0, 1.0]),
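These docstring hunks show the second half of the strategy: instead of keeping `SparseVector` at file scope just so doctests can use it, each doctest now imports what it needs, which also makes the example copy-pasteable on its own. A minimal illustration of a doctest carrying its own import (a hypothetical function, not code from this commit):

```python
def scale(vector, factor):
    """Multiply a vector by a scalar.

    The import lives inside the doctest, so readers who copy the
    example get something that runs, and the enclosing module does
    not need a file-scope import it otherwise never uses.

    >>> from pyspark.mllib.linalg import DenseVector
    >>> scale(DenseVector([1.0, 2.0]), 2.0)
    DenseVector([2.0, 4.0])
    """
    return vector * factor
```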
@@ -25,7 +25,7 @@ from numpy import array, random, tile
 from pyspark import SparkContext, since
 from pyspark.rdd import RDD
 from pyspark.mllib.common import JavaModelWrapper, callMLlibFunc, callJavaFunc, _py2java, _java2py
-from pyspark.mllib.linalg import SparseVector, _convert_to_vector, DenseVector
+from pyspark.mllib.linalg import SparseVector, _convert_to_vector, DenseVector  # noqa: F401
 from pyspark.mllib.stat.distribution import MultivariateGaussian
 from pyspark.mllib.util import Saveable, Loader, inherit_doc, JavaLoader, JavaSaveable
 from pyspark.streaming import DStream

@@ -25,9 +25,7 @@ from py4j.protocol import Py4JJavaError
 from pyspark import since
 from pyspark.rdd import RDD
 from pyspark.mllib.common import callMLlibFunc, JavaModelWrapper
-from pyspark.mllib.linalg import (
-    Vectors, DenseVector, SparseVector, _convert_to_vector)
-from pyspark.mllib.regression import LabeledPoint
+from pyspark.mllib.linalg import Vectors, _convert_to_vector
 from pyspark.mllib.util import JavaLoader, JavaSaveable
 
 __all__ = ['Normalizer', 'StandardScalerModel', 'StandardScaler',
@@ -60,6 +58,7 @@ class Normalizer(VectorTransformer):
 
     :param p: Normalization in L^p^ space, p = 2 by default.
 
+    >>> from pyspark.mllib.linalg import Vectors
    >>> v = Vectors.dense(range(3))
    >>> nor = Normalizer(1)
    >>> nor.transform(v)
@@ -285,6 +284,8 @@ class ChiSqSelector(object):
     By default, the selection method is `numTopFeatures`, with the default number of top features
     set to 50.
 
+    >>> from pyspark.mllib.linalg import SparseVector, DenseVector
+    >>> from pyspark.mllib.regression import LabeledPoint
    >>> data = sc.parallelize([
    ...     LabeledPoint(0.0, SparseVector(3, {0: 8.0, 1: 7.0})),
    ...     LabeledPoint(1.0, SparseVector(3, {1: 9.0, 2: 6.0})),

@@ -23,7 +23,7 @@ import numpy as np
 from pyspark import RDD, since
 from pyspark.streaming.dstream import DStream
 from pyspark.mllib.common import callMLlibFunc, _py2java, _java2py, inherit_doc
-from pyspark.mllib.linalg import SparseVector, _convert_to_vector
+from pyspark.mllib.linalg import _convert_to_vector
 from pyspark.mllib.util import Saveable, Loader
 
 __all__ = ['LabeledPoint', 'LinearModel',
@@ -102,6 +102,7 @@ class LinearRegressionModelBase(LinearModel):
 
     """A linear regression model.
 
+    >>> from pyspark.mllib.linalg import SparseVector
    >>> lrmb = LinearRegressionModelBase(np.array([1.0, 2.0]), 0.1)
    >>> abs(lrmb.predict(np.array([-1.03, 7.777])) - 14.624) < 1e-6
    True
@@ -128,6 +129,7 @@ class LinearRegressionModel(LinearRegressionModelBase):
 
     """A linear regression model derived from a least-squares fit.
 
+    >>> from pyspark.mllib.linalg import SparseVector
    >>> from pyspark.mllib.regression import LabeledPoint
    >>> data = [
    ...     LabeledPoint(0.0, [0.0]),
@@ -297,6 +299,7 @@ class LassoModel(LinearRegressionModelBase):
     """A linear regression model derived from a least-squares fit with
     an l_1 penalty term.
 
+    >>> from pyspark.mllib.linalg import SparseVector
    >>> from pyspark.mllib.regression import LabeledPoint
    >>> data = [
    ...     LabeledPoint(0.0, [0.0]),
@@ -441,6 +444,7 @@ class RidgeRegressionModel(LinearRegressionModelBase):
     """A linear regression model derived from a least-squares fit with
     an l_2 penalty term.
 
+    >>> from pyspark.mllib.linalg import SparseVector
    >>> from pyspark.mllib.regression import LabeledPoint
    >>> data = [
    ...     LabeledPoint(0.0, [0.0]),

@@ -292,7 +292,7 @@ class FPGrowthTest(MLlibTestCase):
 
 
 if __name__ == "__main__":
-    from pyspark.mllib.tests.test_algorithms import *
+    from pyspark.mllib.tests.test_algorithms import *  # noqa: F401
 
     try:
         import xmlrunner

@@ -18,9 +18,9 @@
 from math import sqrt
 import unittest
 
-from numpy import array, random, exp, abs, tile
+from numpy import array, abs, tile
 
-from pyspark.mllib.linalg import Vector, SparseVector, DenseVector, VectorUDT, Vectors
+from pyspark.mllib.linalg import SparseVector, DenseVector, Vectors
 from pyspark.mllib.linalg.distributed import RowMatrix
 from pyspark.mllib.feature import HashingTF, IDF, StandardScaler, ElementwiseProduct, Word2Vec
 from pyspark.testing.mllibutils import MLlibTestCase
@@ -182,7 +182,7 @@ class DimensionalityReductionTests(MLlibTestCase):
 
 
 if __name__ == "__main__":
-    from pyspark.mllib.tests.test_feature import *
+    from pyspark.mllib.tests.test_feature import *  # noqa: F401
 
     try:
         import xmlrunner

@@ -15,7 +15,6 @@
 # limitations under the License.
 #
 
-import sys
 import array as pyarray
 import unittest
 
@@ -639,7 +638,7 @@ class SciPyTests(MLlibTestCase):
 
 
 if __name__ == "__main__":
-    from pyspark.mllib.tests.test_linalg import *
+    from pyspark.mllib.tests.test_linalg import *  # noqa: F401
 
     try:
         import xmlrunner

@@ -20,8 +20,7 @@ import unittest
 
 from numpy import array
 
-from pyspark.mllib.linalg import Vector, SparseVector, DenseVector, VectorUDT, _convert_to_vector, \
-    DenseMatrix, SparseMatrix, Vectors, Matrices, MatrixUDT
+from pyspark.mllib.linalg import Vectors, Matrices
 from pyspark.mllib.random import RandomRDDs
 from pyspark.mllib.regression import LabeledPoint
 from pyspark.mllib.stat import Statistics
@@ -178,7 +177,7 @@ class KolmogorovSmirnovTest(MLlibTestCase):
 
 
 if __name__ == "__main__":
-    from pyspark.mllib.tests.test_stat import *
+    from pyspark.mllib.tests.test_stat import *  # noqa: F401
 
     try:
         import xmlrunner

@@ -15,7 +15,6 @@
 # limitations under the License.
 #
 
-from time import time, sleep
 import unittest
 
 from numpy import array, random, exp, dot, all, mean, abs
@@ -467,7 +466,7 @@ class StreamingLinearRegressionWithTests(MLLibStreamingTestCase):
 
 
 if __name__ == "__main__":
-    from pyspark.mllib.tests.test_streaming_algorithms import *
+    from pyspark.mllib.tests.test_streaming_algorithms import *  # noqa: F401
 
     try:
         import xmlrunner

@@ -94,7 +94,7 @@ class SerDeTest(MLlibTestCase):
 
 
 if __name__ == "__main__":
-    from pyspark.mllib.tests.test_util import *
+    from pyspark.mllib.tests.test_util import *  # noqa: F401
 
     try:
         import xmlrunner

@@ -18,7 +18,6 @@
 import copy
 import sys
 import os
-import re
 import operator
 import shlex
 import warnings
@@ -26,7 +25,6 @@ import heapq
 import bisect
 import random
 from subprocess import Popen, PIPE
-from tempfile import NamedTemporaryFile
 from threading import Thread
 from collections import defaultdict
 from itertools import chain
@@ -1566,6 +1564,7 @@ class RDD(object):
         used is :class:`pyspark.serializers.PickleSerializer`, default batch size
         is 10.
 
+        >>> from tempfile import NamedTemporaryFile
        >>> tmpFile = NamedTemporaryFile(delete=True)
        >>> tmpFile.close()
        >>> sc.parallelize([1, 2, 'spark', 'rdd']).saveAsPickleFile(tmpFile.name, 3)
@@ -1586,6 +1585,7 @@ class RDD(object):
         :param compressionCodecClass: (None by default) string i.e.
             "org.apache.hadoop.io.compress.GzipCodec"
 
+        >>> from tempfile import NamedTemporaryFile
        >>> tempFile = NamedTemporaryFile(delete=True)
        >>> tempFile.close()
        >>> sc.parallelize(range(10)).saveAsTextFile(tempFile.name)
@@ -1596,6 +1596,7 @@ class RDD(object):
 
         Empty lines are tolerated when saving to text files.
 
+        >>> from tempfile import NamedTemporaryFile
        >>> tempFile2 = NamedTemporaryFile(delete=True)
        >>> tempFile2.close()
        >>> sc.parallelize(['', 'foo', '', 'bar', '']).saveAsTextFile(tempFile2.name)
@@ -1604,6 +1605,7 @@ class RDD(object):
 
         Using compressionCodecClass
 
+        >>> from tempfile import NamedTemporaryFile
        >>> tempFile3 = NamedTemporaryFile(delete=True)
        >>> tempFile3.close()
        >>> codec = "org.apache.hadoop.io.compress.GzipCodec"
@@ -14,10 +14,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-import random
 import unittest
 
-from pyspark.resource import ExecutorResourceRequests, ResourceProfile, ResourceProfileBuilder,\
+from pyspark.resource import ExecutorResourceRequests, ResourceProfileBuilder,\
     TaskResourceRequests
 
 
@@ -73,7 +72,7 @@ class ResourceProfileTests(unittest.TestCase):
 
 
 if __name__ == "__main__":
-    from pyspark.resource.tests.test_resources import *
+    from pyspark.resource.tests.test_resources import *  # noqa: F401
 
     try:
         import xmlrunner

@@ -122,7 +122,7 @@ def _test():
     os.environ["PYSPARK_SUBMIT_ARGS"] = " ".join([jars_args, existing_args])
 
     import doctest
-    from pyspark.sql import Row, SparkSession
+    from pyspark.sql import SparkSession
     import pyspark.sql.avro.functions
     globs = pyspark.sql.avro.functions.__dict__.copy()
     spark = SparkSession.builder\

@@ -23,8 +23,7 @@ from pyspark.sql.session import _monkey_patch_RDD, SparkSession
 from pyspark.sql.dataframe import DataFrame
 from pyspark.sql.readwriter import DataFrameReader
 from pyspark.sql.streaming import DataStreamReader
-from pyspark.sql.types import IntegerType, Row, StringType
-from pyspark.sql.udf import UDFRegistration
+from pyspark.sql.udf import UDFRegistration  # noqa: F401
 from pyspark.sql.utils import install_exception_handler
 
 __all__ = ["SQLContext", "HiveContext"]
@@ -53,6 +52,7 @@ class SQLContext(object):
     .. note:: Deprecated in 3.0.0. Use :func:`SparkSession.builder.getOrCreate()` instead.
 
     >>> from datetime import datetime
+    >>> from pyspark.sql import Row
    >>> sqlContext = SQLContext(sc)
    >>> allTypes = sc.parallelize([Row(i=1, s="string", d=1.0, l=1,
    ...     b=True, list=[1, 2, 3], dict={"s": 0}, row=Row(a=1),

@@ -29,9 +29,10 @@ from pyspark.sql.column import Column, _to_java_column, _to_seq, _create_column_
 from pyspark.sql.dataframe import DataFrame
 from pyspark.sql.types import StringType, DataType
 # Keep UserDefinedFunction import for backwards compatible import; moved in SPARK-22409
-from pyspark.sql.udf import UserDefinedFunction, _create_udf
+from pyspark.sql.udf import UserDefinedFunction, _create_udf  # noqa: F401
+from pyspark.sql.udf import _create_udf
 # Keep pandas_udf and PandasUDFType import for backwards compatible import; moved in SPARK-28264
-from pyspark.sql.pandas.functions import pandas_udf, PandasUDFType
+from pyspark.sql.pandas.functions import pandas_udf, PandasUDFType  # noqa: F401
 from pyspark.sql.utils import to_str
 
 # Note to developers: all of PySpark functions here take string as column names whenever possible.

@@ -16,7 +16,6 @@
 #
 
 import functools
-import sys
 import warnings
 from inspect import getfullargspec
 

@@ -1246,7 +1246,7 @@ def _test():
     import tempfile
     import py4j
     from pyspark.context import SparkContext
-    from pyspark.sql import SparkSession, Row
+    from pyspark.sql import SparkSession
     import pyspark.sql.readwriter
 
     os.chdir(os.environ["SPARK_HOME"])

@@ -27,7 +27,7 @@ from pyspark.sql.dataframe import DataFrame
 from pyspark.sql.pandas.conversion import SparkConversionMixin
 from pyspark.sql.readwriter import DataFrameReader
 from pyspark.sql.streaming import DataStreamReader
-from pyspark.sql.types import Row, DataType, StringType, StructType, \
+from pyspark.sql.types import DataType, StructType, \
     _make_type_verifier, _infer_schema, _has_nulltype, _merge_type, _create_converter, \
     _parse_datatype_string
 from pyspark.sql.utils import install_exception_handler
@@ -192,6 +192,7 @@ class SparkSession(SparkConversionMixin):
         """Creates a new SparkSession.
 
         >>> from datetime import datetime
+        >>> from pyspark.sql import Row
        >>> spark = SparkSession(sc)
        >>> allTypes = sc.parallelize([Row(i=1, s="string", d=1.0, l=1,
        ...     b=True, list=[1, 2, 3], dict={"s": 0}, row=Row(a=1),

@@ -1231,7 +1231,7 @@ def _test():
     import doctest
     import os
     import tempfile
-    from pyspark.sql import Row, SparkSession, SQLContext
+    from pyspark.sql import SparkSession, SQLContext
     import pyspark.sql.streaming
 
     os.chdir(os.environ["SPARK_HOME"])

@@ -35,7 +35,7 @@ if have_pandas:
     from pandas.util.testing import assert_frame_equal
 
 if have_pyarrow:
-    import pyarrow as pa
+    import pyarrow as pa  # noqa: F401
 
 
 @unittest.skipIf(

@@ -190,7 +190,7 @@ class CatalogTests(ReusedSQLTestCase):
 
 if __name__ == "__main__":
     import unittest
-    from pyspark.sql.tests.test_catalog import *
+    from pyspark.sql.tests.test_catalog import *  # noqa: F401
 
     try:
         import xmlrunner

@@ -142,7 +142,7 @@ class ColumnTests(ReusedSQLTestCase):
 
 if __name__ == "__main__":
     import unittest
-    from pyspark.sql.tests.test_column import *
+    from pyspark.sql.tests.test_column import *  # noqa: F401
 
     try:
         import xmlrunner

@@ -46,7 +46,7 @@ class ConfTests(ReusedSQLTestCase):
 
 if __name__ == "__main__":
     import unittest
-    from pyspark.sql.tests.test_conf import *
+    from pyspark.sql.tests.test_conf import *  # noqa: F401
 
     try:
         import xmlrunner

@@ -273,7 +273,7 @@ class SQLContextTests(unittest.TestCase):
 
 
 if __name__ == "__main__":
-    from pyspark.sql.tests.test_context import *
+    from pyspark.sql.tests.test_context import *  # noqa: F401
 
     try:
         import xmlrunner

@@ -379,8 +379,6 @@ class DataFrameTests(ReusedSQLTestCase):
 
     # add tests for SPARK-23647 (test more types for hint)
     def test_extended_hint_types(self):
-        from pyspark.sql import DataFrame
-
         df = self.spark.range(10e10).toDF("id")
         such_a_nice_list = ["itworks1", "itworks2", "itworks3"]
         hinted_df = df.hint("my awesome hint", 1.2345, "what", such_a_nice_list)

@@ -161,7 +161,7 @@ class DataSourcesTests(ReusedSQLTestCase):
 
 if __name__ == "__main__":
     import unittest
-    from pyspark.sql.tests.test_datasources import *
+    from pyspark.sql.tests.test_datasources import *  # noqa: F401
 
     try:
         import xmlrunner

@@ -16,7 +16,6 @@
 #
 
 import datetime
-import sys
 from itertools import chain
 import re
 
@@ -369,7 +368,7 @@ class FunctionsTests(ReusedSQLTestCase):
         self.assertListEqual(actual, expected)
 
     def test_higher_order_function_failures(self):
-        from pyspark.sql.functions import col, exists, transform
+        from pyspark.sql.functions import col, transform
 
         # Should fail with varargs
         with self.assertRaises(ValueError):
@@ -394,7 +393,7 @@ class FunctionsTests(ReusedSQLTestCase):
 
 if __name__ == "__main__":
     import unittest
-    from pyspark.sql.tests.test_functions import *
+    from pyspark.sql.tests.test_functions import *  # noqa: F401
 
     try:
         import xmlrunner

@@ -36,7 +36,7 @@ class GroupTests(ReusedSQLTestCase):
 
 if __name__ == "__main__":
     import unittest
-    from pyspark.sql.tests.test_group import *
+    from pyspark.sql.tests.test_group import *  # noqa: F401
 
     try:
         import xmlrunner

@@ -16,9 +16,8 @@
 #
 
 import unittest
-import sys
 
-from pyspark.sql.functions import array, explode, col, lit, udf, sum, pandas_udf, PandasUDFType
+from pyspark.sql.functions import array, explode, col, lit, udf, pandas_udf
 from pyspark.sql.types import DoubleType, StructType, StructField, Row
 from pyspark.testing.sqlutils import ReusedSQLTestCase, have_pandas, have_pyarrow, \
     pandas_requirement_message, pyarrow_requirement_message
@@ -26,10 +25,10 @@ from pyspark.testing.utils import QuietTest
 
 if have_pandas:
     import pandas as pd
-    from pandas.util.testing import assert_frame_equal, assert_series_equal
+    from pandas.util.testing import assert_frame_equal
 
 if have_pyarrow:
-    import pyarrow as pa
+    import pyarrow as pa  # noqa: F401
 
 
 @unittest.skipIf(
@@ -245,7 +244,7 @@ class CogroupedMapInPandasTests(ReusedSQLTestCase):
 
 
 if __name__ == "__main__":
-    from pyspark.sql.tests.test_pandas_cogrouped_map import *
+    from pyspark.sql.tests.test_pandas_cogrouped_map import *  # noqa: F401
 
     try:
         import xmlrunner

@@ -17,7 +17,6 @@
 
 import datetime
 import unittest
-import sys
 
 from collections import OrderedDict
 from decimal import Decimal
@@ -35,7 +34,7 @@ if have_pandas:
     from pandas.util.testing import assert_frame_equal
 
 if have_pyarrow:
-    import pyarrow as pa
+    import pyarrow as pa  # noqa: F401
 
 
 @unittest.skipIf(

@@ -15,11 +15,9 @@
 # limitations under the License.
 #
 import os
-import sys
 import time
 import unittest
 
-from pyspark.sql.functions import pandas_udf, PandasUDFType
 from pyspark.testing.sqlutils import ReusedSQLTestCase, have_pandas, have_pyarrow, \
     pandas_requirement_message, pyarrow_requirement_message
 
@@ -116,7 +114,7 @@ class MapInPandasTests(ReusedSQLTestCase):
 
 
 if __name__ == "__main__":
-    from pyspark.sql.tests.test_pandas_map import *
+    from pyspark.sql.tests.test_pandas_map import *  # noqa: F401
 
     try:
         import xmlrunner

@@ -241,7 +241,7 @@ class PandasUDFTests(ReusedSQLTestCase):
 
 
 if __name__ == "__main__":
-    from pyspark.sql.tests.test_pandas_udf import *
+    from pyspark.sql.tests.test_pandas_udf import *  # noqa: F401
 
     try:
         import xmlrunner

@@ -510,7 +510,7 @@ class GroupedAggPandasUDFTests(ReusedSQLTestCase):
 
 
 if __name__ == "__main__":
-    from pyspark.sql.tests.test_pandas_udf_grouped_agg import *
+    from pyspark.sql.tests.test_pandas_udf_grouped_agg import *  # noqa: F401
 
     try:
         import xmlrunner

@@ -17,7 +17,6 @@
 import os
 import random
 import shutil
-import sys
 import tempfile
 import time
 import unittest
@@ -41,7 +40,7 @@ if have_pandas:
     import pandas as pd
 
 if have_pyarrow:
-    import pyarrow as pa
+    import pyarrow as pa  # noqa: F401
 
 
 @unittest.skipIf(

@@ -243,7 +243,7 @@ class PandasUDFTypeHintsTests(ReusedSQLTestCase):
 
 
 if __name__ == "__main__":
-    from pyspark.sql.tests.test_pandas_udf_typehints import *
+    from pyspark.sql.tests.test_pandas_udf_typehints import *  # noqa: #401
 
     try:
         import xmlrunner

@@ -352,7 +352,7 @@ class WindowPandasUDFTests(ReusedSQLTestCase):
 
 
 if __name__ == "__main__":
-    from pyspark.sql.tests.test_pandas_udf_window import *
+    from pyspark.sql.tests.test_pandas_udf_window import *  # noqa: F401
 
     try:
         import xmlrunner

@@ -201,7 +201,7 @@ class ReadwriterV2Tests(ReusedSQLTestCase):
 
 if __name__ == "__main__":
     import unittest
-    from pyspark.sql.tests.test_readwriter import *
+    from pyspark.sql.tests.test_readwriter import *  # noqa: F401
 
     try:
         import xmlrunner

@@ -139,7 +139,7 @@ class SerdeTests(ReusedSQLTestCase):
 
 if __name__ == "__main__":
     import unittest
-    from pyspark.sql.tests.test_serde import *
+    from pyspark.sql.tests.test_serde import *  # noqa: F401
 
     try:
         import xmlrunner

@@ -358,7 +358,7 @@ class SparkExtensionsTest(unittest.TestCase):
 
 
 if __name__ == "__main__":
-    from pyspark.sql.tests.test_session import *
+    from pyspark.sql.tests.test_session import *  # noqa: F401
 
     try:
         import xmlrunner

@@ -567,7 +567,7 @@ class StreamingTests(ReusedSQLTestCase):
 
 if __name__ == "__main__":
     import unittest
-    from pyspark.sql.tests.test_streaming import *
+    from pyspark.sql.tests.test_streaming import *  # noqa: F401
 
     try:
         import xmlrunner

@@ -969,7 +969,7 @@ class DataTypeVerificationTests(unittest.TestCase):
 
 
 if __name__ == "__main__":
-    from pyspark.sql.tests.test_types import *
+    from pyspark.sql.tests.test_types import *  # noqa: F401
 
     try:
         import xmlrunner

@@ -21,8 +21,6 @@ import shutil
 import tempfile
 import unittest
 
-import py4j
-
 from pyspark import SparkContext
 from pyspark.sql import SparkSession, Column, Row
 from pyspark.sql.functions import UserDefinedFunction, udf

@@ -52,7 +52,7 @@ class UtilsTests(ReusedSQLTestCase):
 
 if __name__ == "__main__":
     import unittest
-    from pyspark.sql.tests.test_utils import *
+    from pyspark.sql.tests.test_utils import *  # noqa: F401
 
     try:
         import xmlrunner

@@ -15,7 +15,6 @@
 # limitations under the License.
 #
 
-import sys
 import operator
 import time
 from itertools import chain

@@ -175,7 +175,7 @@ class StreamingContextTests(PySparkStreamingTestCase):
 
 if __name__ == "__main__":
     import unittest
-    from pyspark.streaming.tests.test_context import *
+    from pyspark.streaming.tests.test_context import *  # noqa: F401
 
     try:
         import xmlrunner

@@ -24,7 +24,7 @@ from functools import reduce
 from itertools import chain
 import platform
 
-from pyspark import SparkConf, SparkContext, RDD
+from pyspark import SparkConf, SparkContext
 from pyspark.streaming import StreamingContext
 from pyspark.testing.streamingutils import PySparkStreamingTestCase
 
@@ -644,7 +644,7 @@ class CheckpointTests(unittest.TestCase):
 
 
 if __name__ == "__main__":
-    from pyspark.streaming.tests.test_dstream import *
+    from pyspark.streaming.tests.test_dstream import *  # noqa: F401
 
     try:
         import xmlrunner

@@ -80,7 +80,7 @@ class KinesisStreamTests(PySparkStreamingTestCase):
 
 
 if __name__ == "__main__":
-    from pyspark.streaming.tests.test_kinesis import *
+    from pyspark.streaming.tests.test_kinesis import *  # noqa: F401
 
     try:
         import xmlrunner

@@ -149,7 +149,7 @@ class StreamingListenerTests(PySparkStreamingTestCase):
 
 if __name__ == "__main__":
     import unittest
-    from pyspark.streaming.tests.test_listener import *
+    from pyspark.streaming.tests.test_listener import *  # noqa: F401
 
     try:
         import xmlrunner

@@ -27,13 +27,13 @@ from pyspark import SparkContext, SparkConf
 have_scipy = False
 have_numpy = False
 try:
-    import scipy.sparse
+    import scipy.sparse  # noqa: F401
     have_scipy = True
 except:
     # No SciPy, but that's okay, we'll skip those tests
     pass
 try:
-    import numpy as np
+    import numpy as np  # noqa: F401
     have_numpy = True
 except:
     # No NumPy, but that's okay, we'll skip those tests
@@ -238,7 +238,7 @@ class SparkSubmitTests(unittest.TestCase):
 
 
 if __name__ == "__main__":
-    from pyspark.tests.test_appsubmit import *
+    from pyspark.tests.test_appsubmit import *  # noqa: F401
 
     try:
         import xmlrunner

@@ -145,7 +145,7 @@ class BroadcastFrameProtocolTest(unittest.TestCase):
 
 
 if __name__ == '__main__':
-    from pyspark.tests.test_broadcast import *
+    from pyspark.tests.test_broadcast import *  # noqa: F401
 
     try:
         import xmlrunner

@@ -33,7 +33,7 @@ class ConfTests(unittest.TestCase):
 
 
 if __name__ == "__main__":
-    from pyspark.tests.test_conf import *
+    from pyspark.tests.test_conf import *  # noqa: F401
 
     try:
         import xmlrunner

@@ -126,7 +126,7 @@ class AddFileTests(PySparkTestCase):
         # To ensure that we're actually testing addPyFile's effects, check that
         # this fails due to `userlibrary` not being on the Python path:
         def func():
-            from userlibrary import UserClass
+            from userlibrary import UserClass  # noqa: F401
         self.assertRaises(ImportError, func)
         path = os.path.join(SPARK_HOME, "python/test_support/userlibrary.py")
         self.sc.addPyFile(path)
@@ -137,7 +137,7 @@ class AddFileTests(PySparkTestCase):
         # To ensure that we're actually testing addPyFile's effects, check that
         # this fails due to `userlibrary` not being on the Python path:
         def func():
-            from userlib import UserClass
+            from userlib import UserClass  # noqa: F401
         self.assertRaises(ImportError, func)
         path = os.path.join(SPARK_HOME, "python/test_support/userlib-0.1.zip")
         self.sc.addPyFile(path)
@@ -318,7 +318,7 @@ class ContextTestsWithResources(unittest.TestCase):
 
 
 if __name__ == "__main__":
-    from pyspark.tests.test_context import *
+    from pyspark.tests.test_context import *  # noqa: F401
 
     try:
         import xmlrunner

@@ -73,7 +73,7 @@ class DaemonTests(unittest.TestCase):
 
 
 if __name__ == "__main__":
-    from pyspark.tests.test_daemon import *
+    from pyspark.tests.test_daemon import *  # noqa: F401
 
     try:
         import xmlrunner

@@ -59,7 +59,7 @@ class JoinTests(ReusedPySparkTestCase):
 
 if __name__ == "__main__":
     import unittest
-    from pyspark.tests.test_join import *
+    from pyspark.tests.test_join import *  # noqa: F401
 
     try:
         import xmlrunner

@@ -16,7 +16,6 @@
 #
 import os
 import time
-import random
 import threading
 import unittest
 
@@ -167,7 +166,7 @@ class PinThreadTests(unittest.TestCase):
 
 if __name__ == "__main__":
     import unittest
-    from pyspark.tests.test_pin_thread import *
+    from pyspark.tests.test_pin_thread import *  # noqa: F401
 
     try:
         import xmlrunner

@@ -98,7 +98,7 @@ class ProfilerTests2(unittest.TestCase):
 
 
 if __name__ == "__main__":
-    from pyspark.tests.test_profiler import *
+    from pyspark.tests.test_profiler import *  # noqa: F401
 
     try:
         import xmlrunner

@@ -18,7 +18,6 @@ from datetime import datetime, timedelta
 import hashlib
 import os
 import random
-import sys
 import tempfile
 import time
 from glob import glob
@@ -26,7 +25,7 @@ from glob import glob
 from py4j.protocol import Py4JJavaError
 
 from pyspark import shuffle, RDD
-from pyspark.resource import ExecutorResourceRequests, ResourceProfile, ResourceProfileBuilder,\
+from pyspark.resource import ExecutorResourceRequests, ResourceProfileBuilder,\
     TaskResourceRequests
 from pyspark.serializers import CloudPickleSerializer, BatchedSerializer, PickleSerializer,\
     MarshalSerializer, UTF8Deserializer, NoOpSerializer
@@ -882,7 +881,7 @@ class RDDTests(ReusedPySparkTestCase):
 
 if __name__ == "__main__":
     import unittest
-    from pyspark.tests.test_rdd import *
+    from pyspark.tests.test_rdd import *  # noqa: F401
 
     try:
         import xmlrunner

@@ -40,7 +40,7 @@ class RDDBarrierTests(ReusedPySparkTestCase):
 
 if __name__ == "__main__":
     import unittest
-    from pyspark.tests.test_rddbarrier import *
+    from pyspark.tests.test_rddbarrier import *  # noqa: F401
 
     try:
         import xmlrunner

@@ -16,10 +16,8 @@
 #
 import os
 import shutil
-import sys
 import tempfile
 import unittest
-from array import array
 
 from pyspark.testing.utils import ReusedPySparkTestCase, SPARK_HOME
 
@@ -306,7 +304,7 @@ class OutputFormatTests(ReusedPySparkTestCase):
 
 
 if __name__ == "__main__":
-    from pyspark.tests.test_readwrite import *
+    from pyspark.tests.test_readwrite import *  # noqa: F401
 
     try:
         import xmlrunner

@@ -87,7 +87,7 @@ class SerializationTestCase(unittest.TestCase):
     def test_pickling_file_handles(self):
         # to be corrected with SPARK-11160
         try:
-            import xmlrunner
+            import xmlrunner  # noqa: F401
         except ImportError:
             ser = CloudPickleSerializer()
             out1 = sys.stderr

@@ -15,7 +15,6 @@
 # limitations under the License.
 #
 import random
-import sys
 import unittest
 
 from py4j.protocol import Py4JJavaError
@@ -168,7 +167,7 @@ class SorterTests(unittest.TestCase):
 
 
 if __name__ == "__main__":
-    from pyspark.tests.test_shuffle import *
+    from pyspark.tests.test_shuffle import *  # noqa: F401
 
     try:
         import xmlrunner

@@ -16,7 +16,6 @@
 #
 import os
 import random
-import shutil
 import stat
 import sys
 import tempfile
@@ -322,7 +321,7 @@ class TaskContextTestsWithResources(unittest.TestCase):
 
 if __name__ == "__main__":
     import unittest
-    from pyspark.tests.test_taskcontext import *
+    from pyspark.tests.test_taskcontext import *  # noqa: F401
 
     try:
         import xmlrunner

@@ -74,7 +74,7 @@ class UtilTests(PySparkTestCase):
 
 
 if __name__ == "__main__":
-    from pyspark.tests.test_util import *
+    from pyspark.tests.test_util import *  # noqa: F401
 
     try:
         import xmlrunner

@@ -16,14 +16,13 @@
 # limitations under the License.
 #
 import os
-import sys
 import tempfile
 import threading
 import time
 import unittest
 has_resource_module = True
 try:
-    import resource
+    import resource  # noqa: F401
 except ImportError:
     has_resource_module = False
 
@@ -200,7 +199,7 @@ class WorkerMemoryTest(unittest.TestCase):
 
 if __name__ == "__main__":
     import unittest
-    from pyspark.tests.test_worker import *
+    from pyspark.tests.test_worker import *  # noqa: F401
 
     try:
         import xmlrunner