From 9fcf0ea71820f7331504073045c38820e50141c7 Mon Sep 17 00:00:00 2001
From: Fokko Driesprong
Date: Sat, 8 Aug 2020 08:51:57 -0700
Subject: [PATCH] [SPARK-32319][PYSPARK] Disallow the use of unused imports

Disallow the use of unused imports:
- Unused imports unnecessarily increase the memory footprint of the application.
- Imports that are needed only for the examples in a docstring are moved from file scope into the example itself. This keeps the files themselves clean and makes the examples more complete, since they now include their imports :)

```
fokkodriesprongFan spark % flake8 python | grep -i "imported but unused"
python/pyspark/cloudpickle.py:46:1: F401 'functools.partial' imported but unused
python/pyspark/cloudpickle.py:55:1: F401 'traceback' imported but unused
python/pyspark/heapq3.py:868:5: F401 '_heapq.*' imported but unused
python/pyspark/__init__.py:61:1: F401 'pyspark.version.__version__' imported but unused
python/pyspark/__init__.py:62:1: F401 'pyspark._globals._NoValue' imported but unused
python/pyspark/__init__.py:115:1: F401 'pyspark.sql.SQLContext' imported but unused
python/pyspark/__init__.py:115:1: F401 'pyspark.sql.HiveContext' imported but unused
python/pyspark/__init__.py:115:1: F401 'pyspark.sql.Row' imported but unused
python/pyspark/rdd.py:21:1: F401 're' imported but unused
python/pyspark/rdd.py:29:1: F401 'tempfile.NamedTemporaryFile' imported but unused
python/pyspark/mllib/regression.py:26:1: F401 'pyspark.mllib.linalg.SparseVector' imported but unused
python/pyspark/mllib/clustering.py:28:1: F401 'pyspark.mllib.linalg.SparseVector' imported but unused
python/pyspark/mllib/clustering.py:28:1: F401 'pyspark.mllib.linalg.DenseVector' imported but unused
python/pyspark/mllib/classification.py:26:1: F401 'pyspark.mllib.linalg.SparseVector' imported but unused
python/pyspark/mllib/feature.py:28:1: F401 'pyspark.mllib.linalg.DenseVector' imported but unused
python/pyspark/mllib/feature.py:28:1: F401 'pyspark.mllib.linalg.SparseVector' imported but unused
python/pyspark/mllib/feature.py:30:1: F401 'pyspark.mllib.regression.LabeledPoint' imported but unused
python/pyspark/mllib/tests/test_linalg.py:18:1: F401 'sys' imported but unused
python/pyspark/mllib/tests/test_linalg.py:642:5: F401 'pyspark.mllib.tests.test_linalg.*' imported but unused
python/pyspark/mllib/tests/test_feature.py:21:1: F401 'numpy.random' imported but unused
python/pyspark/mllib/tests/test_feature.py:21:1: F401 'numpy.exp' imported but unused
python/pyspark/mllib/tests/test_feature.py:23:1: F401 'pyspark.mllib.linalg.Vector' imported but unused
python/pyspark/mllib/tests/test_feature.py:23:1: F401 'pyspark.mllib.linalg.VectorUDT' imported but unused
python/pyspark/mllib/tests/test_feature.py:185:5: F401 'pyspark.mllib.tests.test_feature.*' imported but unused
python/pyspark/mllib/tests/test_util.py:97:5: F401 'pyspark.mllib.tests.test_util.*' imported but unused
python/pyspark/mllib/tests/test_stat.py:23:1: F401 'pyspark.mllib.linalg.Vector' imported but unused
python/pyspark/mllib/tests/test_stat.py:23:1: F401 'pyspark.mllib.linalg.SparseVector' imported but unused
python/pyspark/mllib/tests/test_stat.py:23:1: F401 'pyspark.mllib.linalg.DenseVector' imported but unused
python/pyspark/mllib/tests/test_stat.py:23:1: F401 'pyspark.mllib.linalg.VectorUDT' imported but unused
python/pyspark/mllib/tests/test_stat.py:23:1: F401 'pyspark.mllib.linalg._convert_to_vector' imported but unused
python/pyspark/mllib/tests/test_stat.py:23:1: F401 'pyspark.mllib.linalg.DenseMatrix' imported but unused
python/pyspark/mllib/tests/test_stat.py:23:1: F401 'pyspark.mllib.linalg.SparseMatrix' imported but unused
python/pyspark/mllib/tests/test_stat.py:23:1: F401 'pyspark.mllib.linalg.MatrixUDT' imported but unused
python/pyspark/mllib/tests/test_stat.py:181:5: F401 'pyspark.mllib.tests.test_stat.*' imported but unused
python/pyspark/mllib/tests/test_streaming_algorithms.py:18:1: F401 'time.time' imported but unused
python/pyspark/mllib/tests/test_streaming_algorithms.py:18:1: F401 'time.sleep' imported but unused
python/pyspark/mllib/tests/test_streaming_algorithms.py:470:5: F401 'pyspark.mllib.tests.test_streaming_algorithms.*' imported but unused
python/pyspark/mllib/tests/test_algorithms.py:295:5: F401 'pyspark.mllib.tests.test_algorithms.*' imported but unused
python/pyspark/tests/test_serializers.py:90:13: F401 'xmlrunner' imported but unused
python/pyspark/tests/test_rdd.py:21:1: F401 'sys' imported but unused
python/pyspark/tests/test_rdd.py:29:1: F401 'pyspark.resource.ResourceProfile' imported but unused
python/pyspark/tests/test_rdd.py:885:5: F401 'pyspark.tests.test_rdd.*' imported but unused
python/pyspark/tests/test_readwrite.py:19:1: F401 'sys' imported but unused
python/pyspark/tests/test_readwrite.py:22:1: F401 'array.array' imported but unused
python/pyspark/tests/test_readwrite.py:309:5: F401 'pyspark.tests.test_readwrite.*' imported but unused
python/pyspark/tests/test_join.py:62:5: F401 'pyspark.tests.test_join.*' imported but unused
python/pyspark/tests/test_taskcontext.py:19:1: F401 'shutil' imported but unused
python/pyspark/tests/test_taskcontext.py:325:5: F401 'pyspark.tests.test_taskcontext.*' imported but unused
python/pyspark/tests/test_conf.py:36:5: F401 'pyspark.tests.test_conf.*' imported but unused
python/pyspark/tests/test_broadcast.py:148:5: F401 'pyspark.tests.test_broadcast.*' imported but unused
python/pyspark/tests/test_daemon.py:76:5: F401 'pyspark.tests.test_daemon.*' imported but unused
python/pyspark/tests/test_util.py:77:5: F401 'pyspark.tests.test_util.*' imported but unused
python/pyspark/tests/test_pin_thread.py:19:1: F401 'random' imported but unused
python/pyspark/tests/test_pin_thread.py:149:5: F401 'pyspark.tests.test_pin_thread.*' imported but unused
python/pyspark/tests/test_worker.py:19:1: F401 'sys' imported but unused
python/pyspark/tests/test_worker.py:26:5: F401 'resource' imported but unused
python/pyspark/tests/test_worker.py:203:5: F401 'pyspark.tests.test_worker.*' imported but unused
python/pyspark/tests/test_profiler.py:101:5: F401 'pyspark.tests.test_profiler.*' imported but unused
python/pyspark/tests/test_shuffle.py:18:1: F401 'sys' imported but unused
python/pyspark/tests/test_shuffle.py:171:5: F401 'pyspark.tests.test_shuffle.*' imported but unused
python/pyspark/tests/test_rddbarrier.py:43:5: F401 'pyspark.tests.test_rddbarrier.*' imported but unused
python/pyspark/tests/test_context.py:129:13: F401 'userlibrary.UserClass' imported but unused
python/pyspark/tests/test_context.py:140:13: F401 'userlib.UserClass' imported but unused
python/pyspark/tests/test_context.py:310:5: F401 'pyspark.tests.test_context.*' imported but unused
python/pyspark/tests/test_appsubmit.py:241:5: F401 'pyspark.tests.test_appsubmit.*' imported but unused
python/pyspark/streaming/dstream.py:18:1: F401 'sys' imported but unused
python/pyspark/streaming/tests/test_dstream.py:27:1: F401 'pyspark.RDD' imported but unused
python/pyspark/streaming/tests/test_dstream.py:647:5: F401 'pyspark.streaming.tests.test_dstream.*' imported but unused
python/pyspark/streaming/tests/test_kinesis.py:83:5: F401 'pyspark.streaming.tests.test_kinesis.*' imported but unused
python/pyspark/streaming/tests/test_listener.py:152:5: F401 'pyspark.streaming.tests.test_listener.*' imported but unused
python/pyspark/streaming/tests/test_context.py:178:5: F401 'pyspark.streaming.tests.test_context.*' imported but unused
python/pyspark/testing/utils.py:30:5: F401 'scipy.sparse' imported but unused
python/pyspark/testing/utils.py:36:5: F401 'numpy as np' imported but unused
python/pyspark/ml/regression.py:25:1: F401 'pyspark.ml.tree._TreeEnsembleParams' imported but unused
python/pyspark/ml/regression.py:25:1: F401 'pyspark.ml.tree._HasVarianceImpurity' imported but unused
python/pyspark/ml/regression.py:29:1: F401 'pyspark.ml.wrapper.JavaParams' imported but unused
python/pyspark/ml/util.py:19:1: F401 'sys' imported but unused
python/pyspark/ml/__init__.py:25:1: F401 'pyspark.ml.pipeline' imported but unused
python/pyspark/ml/pipeline.py:18:1: F401 'sys' imported but unused
python/pyspark/ml/stat.py:22:1: F401 'pyspark.ml.linalg.DenseMatrix' imported but unused
python/pyspark/ml/stat.py:22:1: F401 'pyspark.ml.linalg.Vectors' imported but unused
python/pyspark/ml/tests/test_training_summary.py:18:1: F401 'sys' imported but unused
python/pyspark/ml/tests/test_training_summary.py:364:5: F401 'pyspark.ml.tests.test_training_summary.*' imported but unused
python/pyspark/ml/tests/test_linalg.py:381:5: F401 'pyspark.ml.tests.test_linalg.*' imported but unused
python/pyspark/ml/tests/test_tuning.py:427:9: F401 'pyspark.sql.functions as F' imported but unused
python/pyspark/ml/tests/test_tuning.py:757:5: F401 'pyspark.ml.tests.test_tuning.*' imported but unused
python/pyspark/ml/tests/test_wrapper.py:120:5: F401 'pyspark.ml.tests.test_wrapper.*' imported but unused
python/pyspark/ml/tests/test_feature.py:19:1: F401 'sys' imported but unused
python/pyspark/ml/tests/test_feature.py:304:5: F401 'pyspark.ml.tests.test_feature.*' imported but unused
python/pyspark/ml/tests/test_image.py:19:1: F401 'py4j' imported but unused
python/pyspark/ml/tests/test_image.py:22:1: F401 'pyspark.testing.mlutils.PySparkTestCase' imported but unused
python/pyspark/ml/tests/test_image.py:71:5: F401 'pyspark.ml.tests.test_image.*' imported but unused
python/pyspark/ml/tests/test_persistence.py:456:5: F401 'pyspark.ml.tests.test_persistence.*' imported but unused
python/pyspark/ml/tests/test_evaluation.py:56:5: F401 'pyspark.ml.tests.test_evaluation.*' imported but unused
python/pyspark/ml/tests/test_stat.py:43:5: F401 'pyspark.ml.tests.test_stat.*' imported but unused
python/pyspark/ml/tests/test_base.py:70:5: F401 'pyspark.ml.tests.test_base.*' imported but unused
python/pyspark/ml/tests/test_param.py:20:1: F401 'sys' imported but unused
python/pyspark/ml/tests/test_param.py:375:5: F401 'pyspark.ml.tests.test_param.*' imported but unused
python/pyspark/ml/tests/test_pipeline.py:62:5: F401 'pyspark.ml.tests.test_pipeline.*' imported but unused
python/pyspark/ml/tests/test_algorithms.py:333:5: F401 'pyspark.ml.tests.test_algorithms.*' imported but unused
python/pyspark/ml/param/__init__.py:18:1: F401 'sys' imported but unused
python/pyspark/resource/tests/test_resources.py:17:1: F401 'random' imported but unused
python/pyspark/resource/tests/test_resources.py:20:1: F401 'pyspark.resource.ResourceProfile' imported but unused
python/pyspark/resource/tests/test_resources.py:75:5: F401 'pyspark.resource.tests.test_resources.*' imported but unused
python/pyspark/sql/functions.py:32:1: F401 'pyspark.sql.udf.UserDefinedFunction' imported but unused
python/pyspark/sql/functions.py:34:1: F401 'pyspark.sql.pandas.functions.pandas_udf' imported but unused
python/pyspark/sql/session.py:30:1: F401 'pyspark.sql.types.Row' imported but unused
python/pyspark/sql/session.py:30:1: F401 'pyspark.sql.types.StringType' imported but unused
python/pyspark/sql/readwriter.py:1084:5: F401 'pyspark.sql.Row' imported but unused
python/pyspark/sql/context.py:26:1: F401 'pyspark.sql.types.IntegerType' imported but unused
python/pyspark/sql/context.py:26:1: F401 'pyspark.sql.types.Row' imported but unused
python/pyspark/sql/context.py:26:1: F401 'pyspark.sql.types.StringType' imported but unused
python/pyspark/sql/context.py:27:1: F401 'pyspark.sql.udf.UDFRegistration' imported but unused
python/pyspark/sql/streaming.py:1212:5: F401 'pyspark.sql.Row' imported but unused
python/pyspark/sql/tests/test_utils.py:55:5: F401 'pyspark.sql.tests.test_utils.*' imported but unused
python/pyspark/sql/tests/test_pandas_map.py:18:1: F401 'sys' imported but unused
python/pyspark/sql/tests/test_pandas_map.py:22:1: F401 'pyspark.sql.functions.pandas_udf' imported but unused
python/pyspark/sql/tests/test_pandas_map.py:22:1: F401 'pyspark.sql.functions.PandasUDFType' imported but unused
python/pyspark/sql/tests/test_pandas_map.py:119:5: F401 'pyspark.sql.tests.test_pandas_map.*' imported but unused
python/pyspark/sql/tests/test_catalog.py:193:5: F401 'pyspark.sql.tests.test_catalog.*' imported but unused
python/pyspark/sql/tests/test_group.py:39:5: F401 'pyspark.sql.tests.test_group.*' imported but unused
python/pyspark/sql/tests/test_session.py:361:5: F401 'pyspark.sql.tests.test_session.*' imported but unused
python/pyspark/sql/tests/test_conf.py:49:5: F401 'pyspark.sql.tests.test_conf.*' imported but unused
python/pyspark/sql/tests/test_pandas_cogrouped_map.py:19:1: F401 'sys' imported but unused
python/pyspark/sql/tests/test_pandas_cogrouped_map.py:21:1: F401 'pyspark.sql.functions.sum' imported but unused
python/pyspark/sql/tests/test_pandas_cogrouped_map.py:21:1: F401 'pyspark.sql.functions.PandasUDFType' imported but unused
python/pyspark/sql/tests/test_pandas_cogrouped_map.py:29:5: F401 'pandas.util.testing.assert_series_equal' imported but unused
python/pyspark/sql/tests/test_pandas_cogrouped_map.py:32:5: F401 'pyarrow as pa' imported but unused
python/pyspark/sql/tests/test_pandas_cogrouped_map.py:248:5: F401 'pyspark.sql.tests.test_pandas_cogrouped_map.*' imported but unused
python/pyspark/sql/tests/test_udf.py:24:1: F401 'py4j' imported but unused
python/pyspark/sql/tests/test_pandas_udf_typehints.py:246:5: F401 'pyspark.sql.tests.test_pandas_udf_typehints.*' imported but unused
python/pyspark/sql/tests/test_functions.py:19:1: F401 'sys' imported but unused
python/pyspark/sql/tests/test_functions.py:362:9: F401 'pyspark.sql.functions.exists' imported but unused
python/pyspark/sql/tests/test_functions.py:387:5: F401 'pyspark.sql.tests.test_functions.*' imported but unused
python/pyspark/sql/tests/test_pandas_udf_scalar.py:21:1: F401 'sys' imported but unused
python/pyspark/sql/tests/test_pandas_udf_scalar.py:45:5: F401 'pyarrow as pa' imported but unused
python/pyspark/sql/tests/test_pandas_udf_window.py:355:5: F401 'pyspark.sql.tests.test_pandas_udf_window.*' imported but unused
python/pyspark/sql/tests/test_arrow.py:38:5: F401 'pyarrow as pa' imported but unused
python/pyspark/sql/tests/test_pandas_grouped_map.py:20:1: F401 'sys' imported but unused
python/pyspark/sql/tests/test_pandas_grouped_map.py:38:5: F401 'pyarrow as pa' imported but unused
python/pyspark/sql/tests/test_dataframe.py:382:9: F401 'pyspark.sql.DataFrame' imported but unused
python/pyspark/sql/avro/functions.py:125:5: F401 'pyspark.sql.Row' imported but unused
python/pyspark/sql/pandas/functions.py:19:1: F401 'sys' imported but unused
```

After:

```
fokkodriesprongFan spark % flake8 python | grep -i "imported but unused"
fokkodriesprongFan spark %
```

### What changes were proposed in this pull request?

Removing unused imports from the Python files to keep everything nice and tidy.

### Why are the changes needed?

Cleaning up the imports that aren't used, and suppressing the F401 warning on imports that are kept only as re-exports for other modules, preserving backward compatibility.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Adding the rule to the existing Flake8 checks.

Closes #29121 from Fokko/SPARK-32319.

Authored-by: Fokko Driesprong
Signed-off-by: Dongjoon Hyun
---
 dev/create-release/releaseutils.py | 4 ++--
 dev/lint-python | 3 +--
 dev/pip-sanity-check.py | 1 -
 dev/run-tests.py | 1 -
 dev/tox.ini | 5 +++++
 examples/src/main/python/sql/hive.py | 2 +-
 examples/src/main/python/status_api_demo.py | 1 -
 python/pyspark/__init__.py | 6 +++---
 python/pyspark/ml/__init__.py | 2 +-
 python/pyspark/ml/param/__init__.py | 1 -
 python/pyspark/ml/pipeline.py | 2 --
 python/pyspark/ml/regression.py | 8 +++-----
 python/pyspark/ml/stat.py | 4 ++--
 python/pyspark/ml/tests/test_algorithms.py | 2 +-
 python/pyspark/ml/tests/test_base.py | 2 +-
 python/pyspark/ml/tests/test_evaluation.py | 2 +-
 python/pyspark/ml/tests/test_feature.py | 3 +--
 python/pyspark/ml/tests/test_image.py | 6 ++----
 python/pyspark/ml/tests/test_linalg.py | 2 +-
 python/pyspark/ml/tests/test_param.py | 3 +--
 python/pyspark/ml/tests/test_persistence.py | 2 +-
 python/pyspark/ml/tests/test_pipeline.py | 2 +-
 python/pyspark/ml/tests/test_stat.py | 2 +-
 python/pyspark/ml/tests/test_training_summary.py | 3 +--
 python/pyspark/ml/tests/test_tuning.py | 4 +---
 python/pyspark/ml/tests/test_wrapper.py | 2 +-
 python/pyspark/ml/util.py | 1 -
 python/pyspark/mllib/classification.py | 5 ++++-
 python/pyspark/mllib/clustering.py | 2 +-
 python/pyspark/mllib/feature.py | 7 ++++---
 python/pyspark/mllib/regression.py | 6 +++++-
 python/pyspark/mllib/tests/test_algorithms.py | 2 +-
 python/pyspark/mllib/tests/test_feature.py | 6 +++---
 python/pyspark/mllib/tests/test_linalg.py | 3 +--
 python/pyspark/mllib/tests/test_stat.py | 5 ++---
 python/pyspark/mllib/tests/test_streaming_algorithms.py | 3 +--
 python/pyspark/mllib/tests/test_util.py | 2 +-
 python/pyspark/rdd.py | 6 ++++--
 python/pyspark/resource/tests/test_resources.py | 5 ++---
 python/pyspark/sql/avro/functions.py | 2 +-
 python/pyspark/sql/context.py | 4 ++--
 python/pyspark/sql/functions.py | 5 +++--
 python/pyspark/sql/pandas/functions.py | 1 -
 python/pyspark/sql/readwriter.py | 2 +-
 python/pyspark/sql/session.py | 3 ++-
 python/pyspark/sql/streaming.py | 2 +-
 python/pyspark/sql/tests/test_arrow.py | 2 +-
 python/pyspark/sql/tests/test_catalog.py | 2 +-
 python/pyspark/sql/tests/test_column.py | 2 +-
 python/pyspark/sql/tests/test_conf.py | 2 +-
 python/pyspark/sql/tests/test_context.py | 2 +-
 python/pyspark/sql/tests/test_dataframe.py | 2 --
 python/pyspark/sql/tests/test_datasources.py | 2 +-
 python/pyspark/sql/tests/test_functions.py | 5 ++---
 python/pyspark/sql/tests/test_group.py | 2 +-
 python/pyspark/sql/tests/test_pandas_cogrouped_map.py | 9 ++++-----
 python/pyspark/sql/tests/test_pandas_grouped_map.py | 3 +--
 python/pyspark/sql/tests/test_pandas_map.py | 4 +---
 python/pyspark/sql/tests/test_pandas_udf.py | 2 +-
 python/pyspark/sql/tests/test_pandas_udf_grouped_agg.py | 2 +-
 python/pyspark/sql/tests/test_pandas_udf_scalar.py | 3 +--
 python/pyspark/sql/tests/test_pandas_udf_typehints.py | 2 +-
 python/pyspark/sql/tests/test_pandas_udf_window.py | 2 +-
 python/pyspark/sql/tests/test_readwriter.py | 2 +-
 python/pyspark/sql/tests/test_serde.py | 2 +-
 python/pyspark/sql/tests/test_session.py | 2 +-
 python/pyspark/sql/tests/test_streaming.py | 2 +-
 python/pyspark/sql/tests/test_types.py | 2 +-
 python/pyspark/sql/tests/test_udf.py | 2 --
 python/pyspark/sql/tests/test_utils.py | 2 +-
 python/pyspark/streaming/dstream.py | 1 -
 python/pyspark/streaming/tests/test_context.py | 2 +-
 python/pyspark/streaming/tests/test_dstream.py | 4 ++--
 python/pyspark/streaming/tests/test_kinesis.py | 2 +-
 python/pyspark/streaming/tests/test_listener.py | 2 +-
 python/pyspark/testing/utils.py | 4 ++--
 python/pyspark/tests/test_appsubmit.py | 2 +-
 python/pyspark/tests/test_broadcast.py | 2 +-
 python/pyspark/tests/test_conf.py | 2 +-
 python/pyspark/tests/test_context.py | 6 +++---
 python/pyspark/tests/test_daemon.py | 2 +-
 python/pyspark/tests/test_join.py | 2 +-
 python/pyspark/tests/test_pin_thread.py | 3 +--
 python/pyspark/tests/test_profiler.py | 2 +-
 python/pyspark/tests/test_rdd.py | 5 ++---
 python/pyspark/tests/test_rddbarrier.py | 2 +-
 python/pyspark/tests/test_readwrite.py | 4 +---
 python/pyspark/tests/test_serializers.py | 2 +-
 python/pyspark/tests/test_shuffle.py | 3 +--
 python/pyspark/tests/test_taskcontext.py | 3 +--
 python/pyspark/tests/test_util.py | 2 +-
 python/pyspark/tests/test_worker.py | 5 ++---
 92 files changed, 124 insertions(+), 147 deletions(-)

diff --git a/dev/create-release/releaseutils.py b/dev/create-release/releaseutils.py
index 9f1dffbd09..c6b3176a21 100755
--- a/dev/create-release/releaseutils.py
+++ b/dev/create-release/releaseutils.py
@@ -23,7 +23,7 @@ import sys
 from subprocess import Popen, PIPE

 try:
-    from jira.client import JIRA
+    from jira.client import JIRA  # noqa: F401
     # Old versions have JIRAError in exceptions package, new (0.5+) in utils.
     try:
         from jira.exceptions import JIRAError
@@ -35,7 +35,7 @@ except ImportError:
     sys.exit(-1)

 try:
-    from github import Github
+    from github import Github  # noqa: F401
     from github import GithubException
 except ImportError:
     print("This tool requires the PyGithub library")

diff --git a/dev/lint-python b/dev/lint-python
index 41da41bfda..07897eb499 100755
--- a/dev/lint-python
+++ b/dev/lint-python
@@ -147,8 +147,7 @@ flake8 checks failed."
     fi

     echo "starting $FLAKE8_BUILD test..."
-    FLAKE8_REPORT=$( ($FLAKE8_BUILD . --count --select=E901,E999,F821,F822,F823 \
-        --max-line-length=100 --show-source --statistics) 2>&1)
+    FLAKE8_REPORT=$( ($FLAKE8_BUILD --append-config dev/tox.ini --count --show-source --statistics .) 2>&1)
     FLAKE8_STATUS=$?
if [ "$FLAKE8_STATUS" -ne 0 ]; then diff --git a/dev/pip-sanity-check.py b/dev/pip-sanity-check.py index e9f10233b1..469e27b78b 100644 --- a/dev/pip-sanity-check.py +++ b/dev/pip-sanity-check.py @@ -16,7 +16,6 @@ # from pyspark.sql import SparkSession -from pyspark.mllib.linalg import * import sys if __name__ == "__main__": diff --git a/dev/run-tests.py b/dev/run-tests.py index 6aae3bdaef..93023d41e2 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -26,7 +26,6 @@ import sys import subprocess import glob import shutil -from collections import namedtuple from sparktestsupport import SPARK_HOME, USER_HOME, ERROR_CODES from sparktestsupport.shellutils import exit_from_command_with_retcode, run_cmd, rm_r, which diff --git a/dev/tox.ini b/dev/tox.ini index 5bf27d1abd..e8e44803bd 100644 --- a/dev/tox.ini +++ b/dev/tox.ini @@ -17,3 +17,8 @@ ignore=E226,E241,E305,E402,E722,E731,E741,W503,W504 max-line-length=100 exclude=python/pyspark/cloudpickle/*.py,shared.py,python/docs/source/conf.py,work/*/*.py,python/.eggs/*,dist/*,.git/* + +[flake8] +select = E901,E999,F821,F822,F823,F401 +exclude = python/pyspark/cloudpickle/*.py,shared.py,python/docs/source/conf.py,work/*/*.py,python/.eggs/*,dist/*,.git/* +max-line-length = 100 diff --git a/examples/src/main/python/sql/hive.py b/examples/src/main/python/sql/hive.py index bc23dcd9bd..fa1b975e2b 100644 --- a/examples/src/main/python/sql/hive.py +++ b/examples/src/main/python/sql/hive.py @@ -21,7 +21,7 @@ Run with: ./bin/spark-submit examples/src/main/python/sql/hive.py """ # $example on:spark_hive$ -from os.path import join, abspath +from os.path import abspath from pyspark.sql import SparkSession from pyspark.sql import Row diff --git a/examples/src/main/python/status_api_demo.py b/examples/src/main/python/status_api_demo.py index 7b408c8726..ae39cef6eb 100644 --- a/examples/src/main/python/status_api_demo.py +++ b/examples/src/main/python/status_api_demo.py @@ -17,7 +17,6 @@ import time import threading -import sys import queue as Queue from pyspark import SparkConf, SparkContext diff --git a/python/pyspark/__init__.py b/python/pyspark/__init__.py index 61e38fdb2a..f84c01e505 100644 --- a/python/pyspark/__init__.py +++ b/python/pyspark/__init__.py @@ -61,8 +61,8 @@ from pyspark.serializers import MarshalSerializer, PickleSerializer from pyspark.status import * from pyspark.taskcontext import TaskContext, BarrierTaskContext, BarrierTaskInfo from pyspark.profiler import Profiler, BasicProfiler -from pyspark.version import __version__ -from pyspark._globals import _NoValue +from pyspark.version import __version__ # noqa: F401 +from pyspark._globals import _NoValue # noqa: F401 def since(version): @@ -115,7 +115,7 @@ def keyword_only(func): # for back compatibility -from pyspark.sql import SQLContext, HiveContext, Row +from pyspark.sql import SQLContext, HiveContext, Row # noqa: F401 __all__ = [ "SparkConf", "SparkContext", "SparkFiles", "RDD", "StorageLevel", "Broadcast", diff --git a/python/pyspark/ml/__init__.py b/python/pyspark/ml/__init__.py index 47fc78e83f..7d0e55a922 100644 --- a/python/pyspark/ml/__init__.py +++ b/python/pyspark/ml/__init__.py @@ -23,7 +23,7 @@ from pyspark.ml.base import Estimator, Model, Predictor, PredictionModel, \ Transformer, UnaryTransformer from pyspark.ml.pipeline import Pipeline, PipelineModel from pyspark.ml import classification, clustering, evaluation, feature, fpm, \ - image, pipeline, recommendation, regression, stat, tuning, util, linalg, param + image, recommendation, regression, stat, tuning, util, linalg, 
param __all__ = [ "Transformer", "UnaryTransformer", "Estimator", "Model", diff --git a/python/pyspark/ml/param/__init__.py b/python/pyspark/ml/param/__init__.py index 96b07bfa5f..95f3c32b8b 100644 --- a/python/pyspark/ml/param/__init__.py +++ b/python/pyspark/ml/param/__init__.py @@ -15,7 +15,6 @@ # limitations under the License. # import array -import sys from abc import ABCMeta import copy diff --git a/python/pyspark/ml/pipeline.py b/python/pyspark/ml/pipeline.py index eacb8b82b5..937237fb97 100644 --- a/python/pyspark/ml/pipeline.py +++ b/python/pyspark/ml/pipeline.py @@ -15,8 +15,6 @@ # limitations under the License. # -import sys - from pyspark import keyword_only from pyspark.ml.base import Estimator, Model, Transformer from pyspark.ml.param import Param, Params diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index 4a8d1530b8..6d88b97e8f 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -15,18 +15,16 @@ # limitations under the License. # -import sys from abc import ABCMeta -from pyspark import since, keyword_only +from pyspark import keyword_only from pyspark.ml import Predictor, PredictionModel from pyspark.ml.base import _PredictorParams from pyspark.ml.param.shared import * from pyspark.ml.tree import _DecisionTreeModel, _DecisionTreeParams, \ - _TreeEnsembleModel, _TreeEnsembleParams, _RandomForestParams, _GBTParams, \ - _HasVarianceImpurity, _TreeRegressorParams + _TreeEnsembleModel, _RandomForestParams, _GBTParams, _TreeRegressorParams from pyspark.ml.util import * -from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaParams, \ +from pyspark.ml.wrapper import JavaEstimator, JavaModel, \ JavaPredictor, JavaPredictionModel, JavaWrapper from pyspark.ml.common import inherit_doc from pyspark.sql import DataFrame diff --git a/python/pyspark/ml/stat.py b/python/pyspark/ml/stat.py index 517c984252..caa847a6a4 100644 --- a/python/pyspark/ml/stat.py +++ b/python/pyspark/ml/stat.py @@ -19,7 +19,6 @@ import sys from pyspark import since, SparkContext from pyspark.ml.common import _java2py, _py2java -from pyspark.ml.linalg import DenseMatrix, Vectors from pyspark.ml.wrapper import JavaWrapper, _jvm from pyspark.sql.column import Column, _to_seq from pyspark.sql.functions import lit @@ -121,7 +120,7 @@ class Correlation(object): DataFrame contains a single row and a single column of name '$METHODNAME($COLUMN)'. - >>> from pyspark.ml.linalg import Vectors + >>> from pyspark.ml.linalg import DenseMatrix, Vectors >>> from pyspark.ml.stat import Correlation >>> dataset = [[Vectors.dense([1, 0, 0, -2])], ... 
[Vectors.dense([4, 5, 0, 3])], @@ -412,6 +411,7 @@ class SummaryBuilder(JavaWrapper): class MultivariateGaussian(object): """Represents a (mean, cov) tuple + >>> from pyspark.ml.linalg import DenseMatrix, Vectors >>> m = MultivariateGaussian(Vectors.dense([11,12]), DenseMatrix(2, 2, (1.0, 3.0, 5.0, 2.0))) >>> (m.mean, m.cov.toArray()) (DenseVector([11.0, 12.0]), array([[ 1., 5.], diff --git a/python/pyspark/ml/tests/test_algorithms.py b/python/pyspark/ml/tests/test_algorithms.py index c948bd0c64..492e849658 100644 --- a/python/pyspark/ml/tests/test_algorithms.py +++ b/python/pyspark/ml/tests/test_algorithms.py @@ -330,7 +330,7 @@ class LinearRegressionTest(SparkSessionTestCase): if __name__ == "__main__": - from pyspark.ml.tests.test_algorithms import * + from pyspark.ml.tests.test_algorithms import * # noqa: F401 try: import xmlrunner diff --git a/python/pyspark/ml/tests/test_base.py b/python/pyspark/ml/tests/test_base.py index 1b7d1c7585..cba5369ca2 100644 --- a/python/pyspark/ml/tests/test_base.py +++ b/python/pyspark/ml/tests/test_base.py @@ -67,7 +67,7 @@ class EstimatorTest(unittest.TestCase): if __name__ == "__main__": - from pyspark.ml.tests.test_base import * + from pyspark.ml.tests.test_base import * # noqa: F401 try: import xmlrunner diff --git a/python/pyspark/ml/tests/test_evaluation.py b/python/pyspark/ml/tests/test_evaluation.py index fdd6ee7a53..7883df7882 100644 --- a/python/pyspark/ml/tests/test_evaluation.py +++ b/python/pyspark/ml/tests/test_evaluation.py @@ -53,7 +53,7 @@ class EvaluatorTests(SparkSessionTestCase): if __name__ == "__main__": - from pyspark.ml.tests.test_evaluation import * + from pyspark.ml.tests.test_evaluation import * # noqa: F401 try: import xmlrunner diff --git a/python/pyspark/ml/tests/test_feature.py b/python/pyspark/ml/tests/test_feature.py index 7856a317c2..7fd8c0b669 100644 --- a/python/pyspark/ml/tests/test_feature.py +++ b/python/pyspark/ml/tests/test_feature.py @@ -16,7 +16,6 @@ # limitations under the License. 
 #
-import sys
 import unittest

 from pyspark.ml.feature import Binarizer, CountVectorizer, CountVectorizerModel, HashingTF, IDF, \
@@ -301,7 +300,7 @@ class HashingTFTest(SparkSessionTestCase):

 if __name__ == "__main__":
-    from pyspark.ml.tests.test_feature import *
+    from pyspark.ml.tests.test_feature import *  # noqa: F401

     try:
         import xmlrunner

diff --git a/python/pyspark/ml/tests/test_image.py b/python/pyspark/ml/tests/test_image.py
index 5cc2a815ea..069ffceb50 100644
--- a/python/pyspark/ml/tests/test_image.py
+++ b/python/pyspark/ml/tests/test_image.py
@@ -16,10 +16,8 @@
 #
 import unittest

-import py4j
-
 from pyspark.ml.image import ImageSchema
-from pyspark.testing.mlutils import PySparkTestCase, SparkSessionTestCase
+from pyspark.testing.mlutils import SparkSessionTestCase
 from pyspark.sql import Row
 from pyspark.testing.utils import QuietTest

@@ -68,7 +66,7 @@ class ImageFileFormatTest(SparkSessionTestCase):

 if __name__ == "__main__":
-    from pyspark.ml.tests.test_image import *
+    from pyspark.ml.tests.test_image import *  # noqa: F401

     try:
         import xmlrunner

diff --git a/python/pyspark/ml/tests/test_linalg.py b/python/pyspark/ml/tests/test_linalg.py
index 2cba5396f2..60dda82fe0 100644
--- a/python/pyspark/ml/tests/test_linalg.py
+++ b/python/pyspark/ml/tests/test_linalg.py
@@ -378,7 +378,7 @@ class MatrixUDTTests(MLlibTestCase):

 if __name__ == "__main__":
-    from pyspark.ml.tests.test_linalg import *
+    from pyspark.ml.tests.test_linalg import *  # noqa: F401

     try:
         import xmlrunner

diff --git a/python/pyspark/ml/tests/test_param.py b/python/pyspark/ml/tests/test_param.py
index 44731568b6..abee6d1be5 100644
--- a/python/pyspark/ml/tests/test_param.py
+++ b/python/pyspark/ml/tests/test_param.py
@@ -17,7 +17,6 @@
 #
 import inspect
-import sys
 import array as pyarray
 import unittest

@@ -370,7 +369,7 @@ class DefaultValuesTests(PySparkTestCase):

 if __name__ == "__main__":
-    from pyspark.ml.tests.test_param import *
+    from pyspark.ml.tests.test_param import *  # noqa: F401

     try:
         import xmlrunner

diff --git a/python/pyspark/ml/tests/test_persistence.py b/python/pyspark/ml/tests/test_persistence.py
index 2f6d451851..4acf58da21 100644
--- a/python/pyspark/ml/tests/test_persistence.py
+++ b/python/pyspark/ml/tests/test_persistence.py
@@ -453,7 +453,7 @@ class PersistenceTest(SparkSessionTestCase):

 if __name__ == "__main__":
-    from pyspark.ml.tests.test_persistence import *
+    from pyspark.ml.tests.test_persistence import *  # noqa: F401

     try:
         import xmlrunner

diff --git a/python/pyspark/ml/tests/test_pipeline.py b/python/pyspark/ml/tests/test_pipeline.py
index 18a8b7d3b0..011e6537a8 100644
--- a/python/pyspark/ml/tests/test_pipeline.py
+++ b/python/pyspark/ml/tests/test_pipeline.py
@@ -59,7 +59,7 @@ class PipelineTests(PySparkTestCase):

 if __name__ == "__main__":
-    from pyspark.ml.tests.test_pipeline import *
+    from pyspark.ml.tests.test_pipeline import *  # noqa: F401

     try:
         import xmlrunner

diff --git a/python/pyspark/ml/tests/test_stat.py b/python/pyspark/ml/tests/test_stat.py
index d583da2e8a..666d0aec58 100644
--- a/python/pyspark/ml/tests/test_stat.py
+++ b/python/pyspark/ml/tests/test_stat.py
@@ -40,7 +40,7 @@ class ChiSquareTestTests(SparkSessionTestCase):

 if __name__ == "__main__":
-    from pyspark.ml.tests.test_stat import *
+    from pyspark.ml.tests.test_stat import *  # noqa: F401

     try:
         import xmlrunner

diff --git a/python/pyspark/ml/tests/test_training_summary.py b/python/pyspark/ml/tests/test_training_summary.py
index 6b05ffaa7d..cb0effbe2b 100644
--- a/python/pyspark/ml/tests/test_training_summary.py
+++ b/python/pyspark/ml/tests/test_training_summary.py
@@ -15,7 +15,6 @@
 # limitations under the License.
 #
-import sys
 import unittest

 from pyspark.ml.classification import BinaryLogisticRegressionSummary, \
@@ -443,7 +442,7 @@ class TrainingSummaryTest(SparkSessionTestCase):

 if __name__ == "__main__":
-    from pyspark.ml.tests.test_training_summary import *
+    from pyspark.ml.tests.test_training_summary import *  # noqa: F401

     try:
         import xmlrunner

diff --git a/python/pyspark/ml/tests/test_tuning.py b/python/pyspark/ml/tests/test_tuning.py
index 0aa5d47ca6..66f1ea20a4 100644
--- a/python/pyspark/ml/tests/test_tuning.py
+++ b/python/pyspark/ml/tests/test_tuning.py
@@ -424,8 +424,6 @@ class CrossValidatorTests(SparkSessionTestCase):
         self.assertEqual(loadedCV.getFoldCol(), cv_with_user_folds.getFoldCol())

     def test_invalid_user_specified_folds(self):
-        from pyspark.sql import functions as F
-
         dataset_with_folds = self.spark.createDataFrame(
             [(Vectors.dense([0.0]), 0.0, 0),
              (Vectors.dense([0.4]), 1.0, 1),
@@ -754,7 +752,7 @@ class TrainValidationSplitTests(SparkSessionTestCase):

 if __name__ == "__main__":
-    from pyspark.ml.tests.test_tuning import *
+    from pyspark.ml.tests.test_tuning import *  # noqa: F401

     try:
         import xmlrunner

diff --git a/python/pyspark/ml/tests/test_wrapper.py b/python/pyspark/ml/tests/test_wrapper.py
index c0747155cb..e6eef8a7de 100644
--- a/python/pyspark/ml/tests/test_wrapper.py
+++ b/python/pyspark/ml/tests/test_wrapper.py
@@ -117,7 +117,7 @@ class WrapperTests(MLlibTestCase):
         self.assertEqual(_java2py(self.sc, java_array), expected_str_list)

 if __name__ == "__main__":
-    from pyspark.ml.tests.test_wrapper import *
+    from pyspark.ml.tests.test_wrapper import *  # noqa: F401

     try:
         import xmlrunner

diff --git a/python/pyspark/ml/util.py b/python/pyspark/ml/util.py
index 9ab6bfa9ba..f5f4584231 100644
--- a/python/pyspark/ml/util.py
+++ b/python/pyspark/ml/util.py
@@ -16,7 +16,6 @@
 #
 import json
-import sys
 import os
 import time
 import uuid

diff --git a/python/pyspark/mllib/classification.py b/python/pyspark/mllib/classification.py
index c52da2ad63..bbca216cce 100644
--- a/python/pyspark/mllib/classification.py
+++ b/python/pyspark/mllib/classification.py
@@ -23,7 +23,7 @@ import numpy

 from pyspark import RDD, since
 from pyspark.mllib.common import callMLlibFunc, _py2java, _java2py
-from pyspark.mllib.linalg import SparseVector, _convert_to_vector
+from pyspark.mllib.linalg import _convert_to_vector
 from pyspark.mllib.regression import (
     LabeledPoint, LinearModel, _regression_train_wrapper,
     StreamingLinearAlgorithm)
@@ -102,6 +102,7 @@ class LogisticRegressionModel(LinearClassificationModel):
     in Multinomial Logistic Regression. By default, it is binary
     logistic regression so numClasses will be set to 2.

+    >>> from pyspark.mllib.linalg import SparseVector
     >>> data = [
     ...     LabeledPoint(0.0, [0.0, 1.0]),
     ...     LabeledPoint(1.0, [1.0, 0.0]),
@@ -410,6 +411,7 @@ class SVMModel(LinearClassificationModel):
     :param intercept:
       Intercept computed for this model.

+    >>> from pyspark.mllib.linalg import SparseVector
     >>> data = [
     ...     LabeledPoint(0.0, [0.0]),
     ...     LabeledPoint(1.0, [1.0]),
@@ -569,6 +571,7 @@ class NaiveBayesModel(Saveable, Loader):
       Log of class conditional probabilities, whose dimension is C-by-D,
       where D is number of features.

+    >>> from pyspark.mllib.linalg import SparseVector
     >>> data = [
     ...     LabeledPoint(0.0, [0.0, 0.0]),
     ...     LabeledPoint(0.0, [0.0, 1.0]),

diff --git a/python/pyspark/mllib/clustering.py b/python/pyspark/mllib/clustering.py
index 85cfe583fd..b99a4150c3 100644
--- a/python/pyspark/mllib/clustering.py
+++ b/python/pyspark/mllib/clustering.py
@@ -25,7 +25,7 @@ from numpy import array, random, tile

 from pyspark import SparkContext, since
 from pyspark.rdd import RDD
 from pyspark.mllib.common import JavaModelWrapper, callMLlibFunc, callJavaFunc, _py2java, _java2py
-from pyspark.mllib.linalg import SparseVector, _convert_to_vector, DenseVector
+from pyspark.mllib.linalg import SparseVector, _convert_to_vector, DenseVector  # noqa: F401
 from pyspark.mllib.stat.distribution import MultivariateGaussian
 from pyspark.mllib.util import Saveable, Loader, inherit_doc, JavaLoader, JavaSaveable
 from pyspark.streaming import DStream

diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py
index 80a197eaa7..d95f9197ea 100644
--- a/python/pyspark/mllib/feature.py
+++ b/python/pyspark/mllib/feature.py
@@ -25,9 +25,7 @@ from py4j.protocol import Py4JJavaError
 from pyspark import since
 from pyspark.rdd import RDD
 from pyspark.mllib.common import callMLlibFunc, JavaModelWrapper
-from pyspark.mllib.linalg import (
-    Vectors, DenseVector, SparseVector, _convert_to_vector)
-from pyspark.mllib.regression import LabeledPoint
+from pyspark.mllib.linalg import Vectors, _convert_to_vector
 from pyspark.mllib.util import JavaLoader, JavaSaveable

 __all__ = ['Normalizer', 'StandardScalerModel', 'StandardScaler',
@@ -60,6 +58,7 @@ class Normalizer(VectorTransformer):
     :param p: Normalization in L^p^ space, p = 2 by default.

+    >>> from pyspark.mllib.linalg import Vectors
     >>> v = Vectors.dense(range(3))
     >>> nor = Normalizer(1)
     >>> nor.transform(v)
@@ -285,6 +284,8 @@ class ChiSqSelector(object):
     By default, the selection method is `numTopFeatures`, with the default number of top features
     set to 50.

+    >>> from pyspark.mllib.linalg import SparseVector, DenseVector
+    >>> from pyspark.mllib.regression import LabeledPoint
     >>> data = sc.parallelize([
     ...     LabeledPoint(0.0, SparseVector(3, {0: 8.0, 1: 7.0})),
     ...     LabeledPoint(1.0, SparseVector(3, {1: 9.0, 2: 6.0})),

diff --git a/python/pyspark/mllib/regression.py b/python/pyspark/mllib/regression.py
index 56ee0083ab..77bca86ac1 100644
--- a/python/pyspark/mllib/regression.py
+++ b/python/pyspark/mllib/regression.py
@@ -23,7 +23,7 @@ import numpy as np

 from pyspark import RDD, since
 from pyspark.streaming.dstream import DStream
 from pyspark.mllib.common import callMLlibFunc, _py2java, _java2py, inherit_doc
-from pyspark.mllib.linalg import SparseVector, _convert_to_vector
+from pyspark.mllib.linalg import _convert_to_vector
 from pyspark.mllib.util import Saveable, Loader

 __all__ = ['LabeledPoint', 'LinearModel',
@@ -102,6 +102,7 @@ class LinearRegressionModelBase(LinearModel):

     """A linear regression model.

+    >>> from pyspark.mllib.linalg import SparseVector
     >>> lrmb = LinearRegressionModelBase(np.array([1.0, 2.0]), 0.1)
     >>> abs(lrmb.predict(np.array([-1.03, 7.777])) - 14.624) < 1e-6
     True
@@ -128,6 +129,7 @@ class LinearRegressionModel(LinearRegressionModelBase):

     """A linear regression model derived from a least-squares fit.

+    >>> from pyspark.mllib.linalg import SparseVector
     >>> from pyspark.mllib.regression import LabeledPoint
     >>> data = [
     ...     LabeledPoint(0.0, [0.0]),
@@ -297,6 +299,7 @@ class LassoModel(LinearRegressionModelBase):

     """A linear regression model derived from a least-squares fit with
     an l_1 penalty term.

+    >>> from pyspark.mllib.linalg import SparseVector
     >>> from pyspark.mllib.regression import LabeledPoint
     >>> data = [
     ...     LabeledPoint(0.0, [0.0]),
@@ -441,6 +444,7 @@ class RidgeRegressionModel(LinearRegressionModelBase):

     """A linear regression model derived from a least-squares fit with
     an l_2 penalty term.

+    >>> from pyspark.mllib.linalg import SparseVector
     >>> from pyspark.mllib.regression import LabeledPoint
     >>> data = [
     ...     LabeledPoint(0.0, [0.0]),

diff --git a/python/pyspark/mllib/tests/test_algorithms.py b/python/pyspark/mllib/tests/test_algorithms.py
index 4e9dd6b3ba..27a340068a 100644
--- a/python/pyspark/mllib/tests/test_algorithms.py
+++ b/python/pyspark/mllib/tests/test_algorithms.py
@@ -292,7 +292,7 @@ class FPGrowthTest(MLlibTestCase):

 if __name__ == "__main__":
-    from pyspark.mllib.tests.test_algorithms import *
+    from pyspark.mllib.tests.test_algorithms import *  # noqa: F401

     try:
         import xmlrunner

diff --git a/python/pyspark/mllib/tests/test_feature.py b/python/pyspark/mllib/tests/test_feature.py
index 9e1da0f860..165c1466dd 100644
--- a/python/pyspark/mllib/tests/test_feature.py
+++ b/python/pyspark/mllib/tests/test_feature.py
@@ -18,9 +18,9 @@
 from math import sqrt
 import unittest

-from numpy import array, random, exp, abs, tile
+from numpy import array, abs, tile

-from pyspark.mllib.linalg import Vector, SparseVector, DenseVector, VectorUDT, Vectors
+from pyspark.mllib.linalg import SparseVector, DenseVector, Vectors
 from pyspark.mllib.linalg.distributed import RowMatrix
 from pyspark.mllib.feature import HashingTF, IDF, StandardScaler, ElementwiseProduct, Word2Vec
 from pyspark.testing.mllibutils import MLlibTestCase
@@ -182,7 +182,7 @@ class DimensionalityReductionTests(MLlibTestCase):

 if __name__ == "__main__":
-    from pyspark.mllib.tests.test_feature import *
+    from pyspark.mllib.tests.test_feature import *  # noqa: F401

     try:
         import xmlrunner

diff --git a/python/pyspark/mllib/tests/test_linalg.py b/python/pyspark/mllib/tests/test_linalg.py
index 21c2bb422a..0e25836599 100644
--- a/python/pyspark/mllib/tests/test_linalg.py
+++ b/python/pyspark/mllib/tests/test_linalg.py
@@ -15,7 +15,6 @@
 # limitations under the License.
 #
-import sys
 import array as pyarray
 import unittest

@@ -639,7 +638,7 @@ class SciPyTests(MLlibTestCase):

 if __name__ == "__main__":
-    from pyspark.mllib.tests.test_linalg import *
+    from pyspark.mllib.tests.test_linalg import *  # noqa: F401

     try:
         import xmlrunner

diff --git a/python/pyspark/mllib/tests/test_stat.py b/python/pyspark/mllib/tests/test_stat.py
index c222a33efb..6ed0589387 100644
--- a/python/pyspark/mllib/tests/test_stat.py
+++ b/python/pyspark/mllib/tests/test_stat.py
@@ -20,8 +20,7 @@ import unittest

 from numpy import array

-from pyspark.mllib.linalg import Vector, SparseVector, DenseVector, VectorUDT, _convert_to_vector, \
-    DenseMatrix, SparseMatrix, Vectors, Matrices, MatrixUDT
+from pyspark.mllib.linalg import Vectors, Matrices
 from pyspark.mllib.random import RandomRDDs
 from pyspark.mllib.regression import LabeledPoint
 from pyspark.mllib.stat import Statistics
@@ -178,7 +177,7 @@ class KolmogorovSmirnovTest(MLlibTestCase):

 if __name__ == "__main__":
-    from pyspark.mllib.tests.test_stat import *
+    from pyspark.mllib.tests.test_stat import *  # noqa: F401

     try:
         import xmlrunner

diff --git a/python/pyspark/mllib/tests/test_streaming_algorithms.py b/python/pyspark/mllib/tests/test_streaming_algorithms.py
index 5818a7c088..666f6f4d86 100644
--- a/python/pyspark/mllib/tests/test_streaming_algorithms.py
+++ b/python/pyspark/mllib/tests/test_streaming_algorithms.py
@@ -15,7 +15,6 @@
 # limitations under the License.
 #
-from time import time, sleep
 import unittest

 from numpy import array, random, exp, dot, all, mean, abs
@@ -467,7 +466,7 @@ class StreamingLinearRegressionWithTests(MLLibStreamingTestCase):

 if __name__ == "__main__":
-    from pyspark.mllib.tests.test_streaming_algorithms import *
+    from pyspark.mllib.tests.test_streaming_algorithms import *  # noqa: F401

     try:
         import xmlrunner

diff --git a/python/pyspark/mllib/tests/test_util.py b/python/pyspark/mllib/tests/test_util.py
index 76bac6c5c0..12578e417b 100644
--- a/python/pyspark/mllib/tests/test_util.py
+++ b/python/pyspark/mllib/tests/test_util.py
@@ -94,7 +94,7 @@ class SerDeTest(MLlibTestCase):

 if __name__ == "__main__":
-    from pyspark.mllib.tests.test_util import *
+    from pyspark.mllib.tests.test_util import *  # noqa: F401

     try:
         import xmlrunner

diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
index 4ee486800f..ed4e387d1b 100644
--- a/python/pyspark/rdd.py
+++ b/python/pyspark/rdd.py
@@ -18,7 +18,6 @@
 import copy
 import sys
 import os
-import re
 import operator
 import shlex
 import warnings
@@ -26,7 +25,6 @@ import heapq
 import bisect
 import random
 from subprocess import Popen, PIPE
-from tempfile import NamedTemporaryFile
 from threading import Thread
 from collections import defaultdict
 from itertools import chain
@@ -1566,6 +1564,7 @@ class RDD(object):
         used is :class:`pyspark.serializers.PickleSerializer`, default batch size
         is 10.

+        >>> from tempfile import NamedTemporaryFile
         >>> tmpFile = NamedTemporaryFile(delete=True)
         >>> tmpFile.close()
         >>> sc.parallelize([1, 2, 'spark', 'rdd']).saveAsPickleFile(tmpFile.name, 3)
@@ -1586,6 +1585,7 @@ class RDD(object):
         :param compressionCodecClass: (None by default) string i.e.
             "org.apache.hadoop.io.compress.GzipCodec"

+        >>> from tempfile import NamedTemporaryFile
        >>> tempFile = NamedTemporaryFile(delete=True)
         >>> tempFile.close()
         >>> sc.parallelize(range(10)).saveAsTextFile(tempFile.name)
@@ -1596,6 +1596,7 @@ class RDD(object):

         Empty lines are tolerated when saving to text files.
+ >>> from tempfile import NamedTemporaryFile >>> tempFile2 = NamedTemporaryFile(delete=True) >>> tempFile2.close() >>> sc.parallelize(['', 'foo', '', 'bar', '']).saveAsTextFile(tempFile2.name) @@ -1604,6 +1605,7 @@ class RDD(object): Using compressionCodecClass + >>> from tempfile import NamedTemporaryFile >>> tempFile3 = NamedTemporaryFile(delete=True) >>> tempFile3.close() >>> codec = "org.apache.hadoop.io.compress.GzipCodec" diff --git a/python/pyspark/resource/tests/test_resources.py b/python/pyspark/resource/tests/test_resources.py index 09c0d3ca20..c2b574c61a 100644 --- a/python/pyspark/resource/tests/test_resources.py +++ b/python/pyspark/resource/tests/test_resources.py @@ -14,10 +14,9 @@ # See the License for the specific language governing permissions and # limitations under the License. # -import random import unittest -from pyspark.resource import ExecutorResourceRequests, ResourceProfile, ResourceProfileBuilder,\ +from pyspark.resource import ExecutorResourceRequests, ResourceProfileBuilder,\ TaskResourceRequests @@ -73,7 +72,7 @@ class ResourceProfileTests(unittest.TestCase): if __name__ == "__main__": - from pyspark.resource.tests.test_resources import * + from pyspark.resource.tests.test_resources import * # noqa: F401 try: import xmlrunner diff --git a/python/pyspark/sql/avro/functions.py b/python/pyspark/sql/avro/functions.py index 974412ee4e..75fe4eaa07 100644 --- a/python/pyspark/sql/avro/functions.py +++ b/python/pyspark/sql/avro/functions.py @@ -122,7 +122,7 @@ def _test(): os.environ["PYSPARK_SUBMIT_ARGS"] = " ".join([jars_args, existing_args]) import doctest - from pyspark.sql import Row, SparkSession + from pyspark.sql import SparkSession import pyspark.sql.avro.functions globs = pyspark.sql.avro.functions.__dict__.copy() spark = SparkSession.builder\ diff --git a/python/pyspark/sql/context.py b/python/pyspark/sql/context.py index 7fbcf85cb1..a9c5b3ba0c 100644 --- a/python/pyspark/sql/context.py +++ b/python/pyspark/sql/context.py @@ -23,8 +23,7 @@ from pyspark.sql.session import _monkey_patch_RDD, SparkSession from pyspark.sql.dataframe import DataFrame from pyspark.sql.readwriter import DataFrameReader from pyspark.sql.streaming import DataStreamReader -from pyspark.sql.types import IntegerType, Row, StringType -from pyspark.sql.udf import UDFRegistration +from pyspark.sql.udf import UDFRegistration # noqa: F401 from pyspark.sql.utils import install_exception_handler __all__ = ["SQLContext", "HiveContext"] @@ -53,6 +52,7 @@ class SQLContext(object): .. note:: Deprecated in 3.0.0. Use :func:`SparkSession.builder.getOrCreate()` instead. >>> from datetime import datetime + >>> from pyspark.sql import Row >>> sqlContext = SQLContext(sc) >>> allTypes = sc.parallelize([Row(i=1, s="string", d=1.0, l=1, ... 
b=True, list=[1, 2, 3], dict={"s": 0}, row=Row(a=1), diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 308642b136..cf63bfbdc3 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -29,9 +29,10 @@ from pyspark.sql.column import Column, _to_java_column, _to_seq, _create_column_ from pyspark.sql.dataframe import DataFrame from pyspark.sql.types import StringType, DataType # Keep UserDefinedFunction import for backwards compatible import; moved in SPARK-22409 -from pyspark.sql.udf import UserDefinedFunction, _create_udf +from pyspark.sql.udf import UserDefinedFunction, _create_udf # noqa: F401 +from pyspark.sql.udf import _create_udf # Keep pandas_udf and PandasUDFType import for backwards compatible import; moved in SPARK-28264 -from pyspark.sql.pandas.functions import pandas_udf, PandasUDFType +from pyspark.sql.pandas.functions import pandas_udf, PandasUDFType # noqa: F401 from pyspark.sql.utils import to_str # Note to developers: all of PySpark functions here take string as column names whenever possible. diff --git a/python/pyspark/sql/pandas/functions.py b/python/pyspark/sql/pandas/functions.py index ba4dec82d4..08665a72b0 100644 --- a/python/pyspark/sql/pandas/functions.py +++ b/python/pyspark/sql/pandas/functions.py @@ -16,7 +16,6 @@ # import functools -import sys import warnings from inspect import getfullargspec diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py index 8560ef78c1..a8a067875d 100644 --- a/python/pyspark/sql/readwriter.py +++ b/python/pyspark/sql/readwriter.py @@ -1246,7 +1246,7 @@ def _test(): import tempfile import py4j from pyspark.context import SparkContext - from pyspark.sql import SparkSession, Row + from pyspark.sql import SparkSession import pyspark.sql.readwriter os.chdir(os.environ["SPARK_HOME"]) diff --git a/python/pyspark/sql/session.py b/python/pyspark/sql/session.py index a5d102712d..c86078c1b2 100644 --- a/python/pyspark/sql/session.py +++ b/python/pyspark/sql/session.py @@ -27,7 +27,7 @@ from pyspark.sql.dataframe import DataFrame from pyspark.sql.pandas.conversion import SparkConversionMixin from pyspark.sql.readwriter import DataFrameReader from pyspark.sql.streaming import DataStreamReader -from pyspark.sql.types import Row, DataType, StringType, StructType, \ +from pyspark.sql.types import DataType, StructType, \ _make_type_verifier, _infer_schema, _has_nulltype, _merge_type, _create_converter, \ _parse_datatype_string from pyspark.sql.utils import install_exception_handler @@ -192,6 +192,7 @@ class SparkSession(SparkConversionMixin): """Creates a new SparkSession. >>> from datetime import datetime + >>> from pyspark.sql import Row >>> spark = SparkSession(sc) >>> allTypes = sc.parallelize([Row(i=1, s="string", d=1.0, l=1, ... 
b=True, list=[1, 2, 3], dict={"s": 0}, row=Row(a=1), diff --git a/python/pyspark/sql/streaming.py b/python/pyspark/sql/streaming.py index b1832f420e..07413ff48a 100644 --- a/python/pyspark/sql/streaming.py +++ b/python/pyspark/sql/streaming.py @@ -1231,7 +1231,7 @@ def _test(): import doctest import os import tempfile - from pyspark.sql import Row, SparkSession, SQLContext + from pyspark.sql import SparkSession, SQLContext import pyspark.sql.streaming os.chdir(os.environ["SPARK_HOME"]) diff --git a/python/pyspark/sql/tests/test_arrow.py b/python/pyspark/sql/tests/test_arrow.py index 148df9b7d4..620033fbce 100644 --- a/python/pyspark/sql/tests/test_arrow.py +++ b/python/pyspark/sql/tests/test_arrow.py @@ -35,7 +35,7 @@ if have_pandas: from pandas.util.testing import assert_frame_equal if have_pyarrow: - import pyarrow as pa + import pyarrow as pa # noqa: F401 @unittest.skipIf( diff --git a/python/pyspark/sql/tests/test_catalog.py b/python/pyspark/sql/tests/test_catalog.py index ebe81d0325..106426eb55 100644 --- a/python/pyspark/sql/tests/test_catalog.py +++ b/python/pyspark/sql/tests/test_catalog.py @@ -190,7 +190,7 @@ class CatalogTests(ReusedSQLTestCase): if __name__ == "__main__": import unittest - from pyspark.sql.tests.test_catalog import * + from pyspark.sql.tests.test_catalog import * # noqa: F401 try: import xmlrunner diff --git a/python/pyspark/sql/tests/test_column.py b/python/pyspark/sql/tests/test_column.py index e0b8bf45a2..99100c7a76 100644 --- a/python/pyspark/sql/tests/test_column.py +++ b/python/pyspark/sql/tests/test_column.py @@ -142,7 +142,7 @@ class ColumnTests(ReusedSQLTestCase): if __name__ == "__main__": import unittest - from pyspark.sql.tests.test_column import * + from pyspark.sql.tests.test_column import * # noqa: F401 try: import xmlrunner diff --git a/python/pyspark/sql/tests/test_conf.py b/python/pyspark/sql/tests/test_conf.py index 9ec10c4cb6..dd2e0be85d 100644 --- a/python/pyspark/sql/tests/test_conf.py +++ b/python/pyspark/sql/tests/test_conf.py @@ -46,7 +46,7 @@ class ConfTests(ReusedSQLTestCase): if __name__ == "__main__": import unittest - from pyspark.sql.tests.test_conf import * + from pyspark.sql.tests.test_conf import * # noqa: F401 try: import xmlrunner diff --git a/python/pyspark/sql/tests/test_context.py b/python/pyspark/sql/tests/test_context.py index ff953ba4b4..85920eef9a 100644 --- a/python/pyspark/sql/tests/test_context.py +++ b/python/pyspark/sql/tests/test_context.py @@ -273,7 +273,7 @@ class SQLContextTests(unittest.TestCase): if __name__ == "__main__": - from pyspark.sql.tests.test_context import * + from pyspark.sql.tests.test_context import * # noqa: F401 try: import xmlrunner diff --git a/python/pyspark/sql/tests/test_dataframe.py b/python/pyspark/sql/tests/test_dataframe.py index 30c3fd4c8d..747abdec67 100644 --- a/python/pyspark/sql/tests/test_dataframe.py +++ b/python/pyspark/sql/tests/test_dataframe.py @@ -379,8 +379,6 @@ class DataFrameTests(ReusedSQLTestCase): # add tests for SPARK-23647 (test more types for hint) def test_extended_hint_types(self): - from pyspark.sql import DataFrame - df = self.spark.range(10e10).toDF("id") such_a_nice_list = ["itworks1", "itworks2", "itworks3"] hinted_df = df.hint("my awesome hint", 1.2345, "what", such_a_nice_list) diff --git a/python/pyspark/sql/tests/test_datasources.py b/python/pyspark/sql/tests/test_datasources.py index a2e73ca610..1b466e294a 100644 --- a/python/pyspark/sql/tests/test_datasources.py +++ b/python/pyspark/sql/tests/test_datasources.py @@ -161,7 +161,7 @@ class 
DataSourcesTests(ReusedSQLTestCase): if __name__ == "__main__": import unittest - from pyspark.sql.tests.test_datasources import * + from pyspark.sql.tests.test_datasources import * # noqa: F401 try: import xmlrunner diff --git a/python/pyspark/sql/tests/test_functions.py b/python/pyspark/sql/tests/test_functions.py index 02180daf08..09f5960c6f 100644 --- a/python/pyspark/sql/tests/test_functions.py +++ b/python/pyspark/sql/tests/test_functions.py @@ -16,7 +16,6 @@ # import datetime -import sys from itertools import chain import re @@ -369,7 +368,7 @@ class FunctionsTests(ReusedSQLTestCase): self.assertListEqual(actual, expected) def test_higher_order_function_failures(self): - from pyspark.sql.functions import col, exists, transform + from pyspark.sql.functions import col, transform # Should fail with varargs with self.assertRaises(ValueError): @@ -394,7 +393,7 @@ class FunctionsTests(ReusedSQLTestCase): if __name__ == "__main__": import unittest - from pyspark.sql.tests.test_functions import * + from pyspark.sql.tests.test_functions import * # noqa: F401 try: import xmlrunner diff --git a/python/pyspark/sql/tests/test_group.py b/python/pyspark/sql/tests/test_group.py index 3261fa1836..2fab7a08da 100644 --- a/python/pyspark/sql/tests/test_group.py +++ b/python/pyspark/sql/tests/test_group.py @@ -36,7 +36,7 @@ class GroupTests(ReusedSQLTestCase): if __name__ == "__main__": import unittest - from pyspark.sql.tests.test_group import * + from pyspark.sql.tests.test_group import * # noqa: F401 try: import xmlrunner diff --git a/python/pyspark/sql/tests/test_pandas_cogrouped_map.py b/python/pyspark/sql/tests/test_pandas_cogrouped_map.py index 24a73918d8..5013e2d4d6 100644 --- a/python/pyspark/sql/tests/test_pandas_cogrouped_map.py +++ b/python/pyspark/sql/tests/test_pandas_cogrouped_map.py @@ -16,9 +16,8 @@ # import unittest -import sys -from pyspark.sql.functions import array, explode, col, lit, udf, sum, pandas_udf, PandasUDFType +from pyspark.sql.functions import array, explode, col, lit, udf, pandas_udf from pyspark.sql.types import DoubleType, StructType, StructField, Row from pyspark.testing.sqlutils import ReusedSQLTestCase, have_pandas, have_pyarrow, \ pandas_requirement_message, pyarrow_requirement_message @@ -26,10 +25,10 @@ from pyspark.testing.utils import QuietTest if have_pandas: import pandas as pd - from pandas.util.testing import assert_frame_equal, assert_series_equal + from pandas.util.testing import assert_frame_equal if have_pyarrow: - import pyarrow as pa + import pyarrow as pa # noqa: F401 @unittest.skipIf( @@ -245,7 +244,7 @@ class CogroupedMapInPandasTests(ReusedSQLTestCase): if __name__ == "__main__": - from pyspark.sql.tests.test_pandas_cogrouped_map import * + from pyspark.sql.tests.test_pandas_cogrouped_map import * # noqa: F401 try: import xmlrunner diff --git a/python/pyspark/sql/tests/test_pandas_grouped_map.py b/python/pyspark/sql/tests/test_pandas_grouped_map.py index 00cc9b3a64..d1e841f7d6 100644 --- a/python/pyspark/sql/tests/test_pandas_grouped_map.py +++ b/python/pyspark/sql/tests/test_pandas_grouped_map.py @@ -17,7 +17,6 @@ import datetime import unittest -import sys from collections import OrderedDict from decimal import Decimal @@ -35,7 +34,7 @@ if have_pandas: from pandas.util.testing import assert_frame_equal if have_pyarrow: - import pyarrow as pa + import pyarrow as pa # noqa: F401 @unittest.skipIf( diff --git a/python/pyspark/sql/tests/test_pandas_map.py b/python/pyspark/sql/tests/test_pandas_map.py index 02ae6a86f9..bda370dffb 100644 --- 
+++ b/python/pyspark/sql/tests/test_pandas_map.py
@@ -15,11 +15,9 @@
 # limitations under the License.
 #
 import os
-import sys
 import time
 import unittest
 
-from pyspark.sql.functions import pandas_udf, PandasUDFType
 from pyspark.testing.sqlutils import ReusedSQLTestCase, have_pandas, have_pyarrow, \
     pandas_requirement_message, pyarrow_requirement_message
@@ -116,7 +114,7 @@ class MapInPandasTests(ReusedSQLTestCase):
 
 
 if __name__ == "__main__":
-    from pyspark.sql.tests.test_pandas_map import *
+    from pyspark.sql.tests.test_pandas_map import *  # noqa: F401
 
     try:
         import xmlrunner
diff --git a/python/pyspark/sql/tests/test_pandas_udf.py b/python/pyspark/sql/tests/test_pandas_udf.py
index 7fa65f0e79..97b4de3aec 100644
--- a/python/pyspark/sql/tests/test_pandas_udf.py
+++ b/python/pyspark/sql/tests/test_pandas_udf.py
@@ -241,7 +241,7 @@ class PandasUDFTests(ReusedSQLTestCase):
 
 
 if __name__ == "__main__":
-    from pyspark.sql.tests.test_pandas_udf import *
+    from pyspark.sql.tests.test_pandas_udf import *  # noqa: F401
 
     try:
         import xmlrunner
diff --git a/python/pyspark/sql/tests/test_pandas_udf_grouped_agg.py b/python/pyspark/sql/tests/test_pandas_udf_grouped_agg.py
index 224c8ce97f..4014a70df9 100644
--- a/python/pyspark/sql/tests/test_pandas_udf_grouped_agg.py
+++ b/python/pyspark/sql/tests/test_pandas_udf_grouped_agg.py
@@ -510,7 +510,7 @@ class GroupedAggPandasUDFTests(ReusedSQLTestCase):
 
 
 if __name__ == "__main__":
-    from pyspark.sql.tests.test_pandas_udf_grouped_agg import *
+    from pyspark.sql.tests.test_pandas_udf_grouped_agg import *  # noqa: F401
 
     try:
         import xmlrunner
diff --git a/python/pyspark/sql/tests/test_pandas_udf_scalar.py b/python/pyspark/sql/tests/test_pandas_udf_scalar.py
index 448e409b0c..951cef7f9e 100644
--- a/python/pyspark/sql/tests/test_pandas_udf_scalar.py
+++ b/python/pyspark/sql/tests/test_pandas_udf_scalar.py
@@ -17,7 +17,6 @@
 import os
 import random
 import shutil
-import sys
 import tempfile
 import time
 import unittest
@@ -41,7 +40,7 @@ if have_pandas:
     import pandas as pd
 
 if have_pyarrow:
-    import pyarrow as pa
+    import pyarrow as pa  # noqa: F401
 
 
 @unittest.skipIf(
diff --git a/python/pyspark/sql/tests/test_pandas_udf_typehints.py b/python/pyspark/sql/tests/test_pandas_udf_typehints.py
index 618164fa84..7be81f8280 100644
--- a/python/pyspark/sql/tests/test_pandas_udf_typehints.py
+++ b/python/pyspark/sql/tests/test_pandas_udf_typehints.py
@@ -243,7 +243,7 @@ class PandasUDFTypeHintsTests(ReusedSQLTestCase):
 
 
 if __name__ == "__main__":
-    from pyspark.sql.tests.test_pandas_udf_typehints import *
+    from pyspark.sql.tests.test_pandas_udf_typehints import *  # noqa: F401
 
     try:
         import xmlrunner
diff --git a/python/pyspark/sql/tests/test_pandas_udf_window.py b/python/pyspark/sql/tests/test_pandas_udf_window.py
index 4c969abef4..6e59255da1 100644
--- a/python/pyspark/sql/tests/test_pandas_udf_window.py
+++ b/python/pyspark/sql/tests/test_pandas_udf_window.py
@@ -352,7 +352,7 @@ class WindowPandasUDFTests(ReusedSQLTestCase):
 
 
 if __name__ == "__main__":
-    from pyspark.sql.tests.test_pandas_udf_window import *
+    from pyspark.sql.tests.test_pandas_udf_window import *  # noqa: F401
 
     try:
         import xmlrunner
diff --git a/python/pyspark/sql/tests/test_readwriter.py b/python/pyspark/sql/tests/test_readwriter.py
index 8e34d3865c..44b37ac1ac 100644
--- a/python/pyspark/sql/tests/test_readwriter.py
+++ b/python/pyspark/sql/tests/test_readwriter.py
@@ -201,7 +201,7 @@ class ReadwriterV2Tests(ReusedSQLTestCase):
 
 if __name__ == "__main__":
     import unittest
-    from pyspark.sql.tests.test_readwriter import *
+    from pyspark.sql.tests.test_readwriter import *  # noqa: F401
 
     try:
         import xmlrunner
diff --git a/python/pyspark/sql/tests/test_serde.py b/python/pyspark/sql/tests/test_serde.py
index 052a5b2835..6508a0f09f 100644
--- a/python/pyspark/sql/tests/test_serde.py
+++ b/python/pyspark/sql/tests/test_serde.py
@@ -139,7 +139,7 @@ class SerdeTests(ReusedSQLTestCase):
 
 if __name__ == "__main__":
     import unittest
-    from pyspark.sql.tests.test_serde import *
+    from pyspark.sql.tests.test_serde import *  # noqa: F401
 
     try:
         import xmlrunner
diff --git a/python/pyspark/sql/tests/test_session.py b/python/pyspark/sql/tests/test_session.py
index 5e4166e6f8..d10f7bf906 100644
--- a/python/pyspark/sql/tests/test_session.py
+++ b/python/pyspark/sql/tests/test_session.py
@@ -358,7 +358,7 @@ class SparkExtensionsTest(unittest.TestCase):
 
 
 if __name__ == "__main__":
-    from pyspark.sql.tests.test_session import *
+    from pyspark.sql.tests.test_session import *  # noqa: F401
 
     try:
         import xmlrunner
diff --git a/python/pyspark/sql/tests/test_streaming.py b/python/pyspark/sql/tests/test_streaming.py
index d19ca075f5..caac67d7ef 100644
--- a/python/pyspark/sql/tests/test_streaming.py
+++ b/python/pyspark/sql/tests/test_streaming.py
@@ -567,7 +567,7 @@ class StreamingTests(ReusedSQLTestCase):
 
 if __name__ == "__main__":
     import unittest
-    from pyspark.sql.tests.test_streaming import *
+    from pyspark.sql.tests.test_streaming import *  # noqa: F401
 
     try:
         import xmlrunner
diff --git a/python/pyspark/sql/tests/test_types.py b/python/pyspark/sql/tests/test_types.py
index 05e2717fda..68e4de8382 100644
--- a/python/pyspark/sql/tests/test_types.py
+++ b/python/pyspark/sql/tests/test_types.py
@@ -969,7 +969,7 @@ class DataTypeVerificationTests(unittest.TestCase):
 
 
 if __name__ == "__main__":
-    from pyspark.sql.tests.test_types import *
+    from pyspark.sql.tests.test_types import *  # noqa: F401
 
     try:
         import xmlrunner
diff --git a/python/pyspark/sql/tests/test_udf.py b/python/pyspark/sql/tests/test_udf.py
index d673f7c159..ff92e1e97e 100644
--- a/python/pyspark/sql/tests/test_udf.py
+++ b/python/pyspark/sql/tests/test_udf.py
@@ -21,8 +21,6 @@
 import shutil
 import tempfile
 import unittest
 
-import py4j
-
 from pyspark import SparkContext
 from pyspark.sql import SparkSession, Column, Row
 from pyspark.sql.functions import UserDefinedFunction, udf
diff --git a/python/pyspark/sql/tests/test_utils.py b/python/pyspark/sql/tests/test_utils.py
index 072ea08085..c6e7fcd8ec 100644
--- a/python/pyspark/sql/tests/test_utils.py
+++ b/python/pyspark/sql/tests/test_utils.py
@@ -52,7 +52,7 @@ class UtilsTests(ReusedSQLTestCase):
 
 if __name__ == "__main__":
     import unittest
-    from pyspark.sql.tests.test_utils import *
+    from pyspark.sql.tests.test_utils import *  # noqa: F401
 
     try:
         import xmlrunner
diff --git a/python/pyspark/streaming/dstream.py b/python/pyspark/streaming/dstream.py
index 000318588e..f0067026ff 100644
--- a/python/pyspark/streaming/dstream.py
+++ b/python/pyspark/streaming/dstream.py
@@ -15,7 +15,6 @@
 # limitations under the License.
 #
-import sys
 import operator
 import time
 from itertools import chain
diff --git a/python/pyspark/streaming/tests/test_context.py b/python/pyspark/streaming/tests/test_context.py
index 69a209ad87..26f1d24f64 100644
--- a/python/pyspark/streaming/tests/test_context.py
+++ b/python/pyspark/streaming/tests/test_context.py
@@ -175,7 +175,7 @@ class StreamingContextTests(PySparkStreamingTestCase):
 
 if __name__ == "__main__":
     import unittest
-    from pyspark.streaming.tests.test_context import *
+    from pyspark.streaming.tests.test_context import *  # noqa: F401
 
     try:
         import xmlrunner
diff --git a/python/pyspark/streaming/tests/test_dstream.py b/python/pyspark/streaming/tests/test_dstream.py
index 89edb23070..00d00b50c9 100644
--- a/python/pyspark/streaming/tests/test_dstream.py
+++ b/python/pyspark/streaming/tests/test_dstream.py
@@ -24,7 +24,7 @@
 from functools import reduce
 from itertools import chain
 import platform
 
-from pyspark import SparkConf, SparkContext, RDD
+from pyspark import SparkConf, SparkContext
 from pyspark.streaming import StreamingContext
 from pyspark.testing.streamingutils import PySparkStreamingTestCase
@@ -644,7 +644,7 @@ class CheckpointTests(unittest.TestCase):
 
 
 if __name__ == "__main__":
-    from pyspark.streaming.tests.test_dstream import *
+    from pyspark.streaming.tests.test_dstream import *  # noqa: F401
 
     try:
         import xmlrunner
diff --git a/python/pyspark/streaming/tests/test_kinesis.py b/python/pyspark/streaming/tests/test_kinesis.py
index a2da230821..b39809e2f6 100644
--- a/python/pyspark/streaming/tests/test_kinesis.py
+++ b/python/pyspark/streaming/tests/test_kinesis.py
@@ -80,7 +80,7 @@ class KinesisStreamTests(PySparkStreamingTestCase):
 
 
 if __name__ == "__main__":
-    from pyspark.streaming.tests.test_kinesis import *
+    from pyspark.streaming.tests.test_kinesis import *  # noqa: F401
 
     try:
         import xmlrunner
diff --git a/python/pyspark/streaming/tests/test_listener.py b/python/pyspark/streaming/tests/test_listener.py
index 48c5783bf8..3970cf6589 100644
--- a/python/pyspark/streaming/tests/test_listener.py
+++ b/python/pyspark/streaming/tests/test_listener.py
@@ -149,7 +149,7 @@ class StreamingListenerTests(PySparkStreamingTestCase):
 
 if __name__ == "__main__":
     import unittest
-    from pyspark.streaming.tests.test_listener import *
+    from pyspark.streaming.tests.test_listener import *  # noqa: F401
 
     try:
         import xmlrunner
diff --git a/python/pyspark/testing/utils.py b/python/pyspark/testing/utils.py
index cda902b6f4..ca5761e8cd 100644
--- a/python/pyspark/testing/utils.py
+++ b/python/pyspark/testing/utils.py
@@ -27,13 +27,13 @@ from pyspark import SparkContext, SparkConf
 have_scipy = False
 have_numpy = False
 try:
-    import scipy.sparse
+    import scipy.sparse  # noqa: F401
     have_scipy = True
 except:
     # No SciPy, but that's okay, we'll skip those tests
     pass
 try:
-    import numpy as np
+    import numpy as np  # noqa: F401
     have_numpy = True
 except:
     # No NumPy, but that's okay, we'll skip those tests
diff --git a/python/pyspark/tests/test_appsubmit.py b/python/pyspark/tests/test_appsubmit.py
index 0eff514829..15170b878e 100644
--- a/python/pyspark/tests/test_appsubmit.py
+++ b/python/pyspark/tests/test_appsubmit.py
@@ -238,7 +238,7 @@ class SparkSubmitTests(unittest.TestCase):
 
 
 if __name__ == "__main__":
-    from pyspark.tests.test_appsubmit import *
+    from pyspark.tests.test_appsubmit import *  # noqa: F401
 
     try:
         import xmlrunner
diff --git a/python/pyspark/tests/test_broadcast.py b/python/pyspark/tests/test_broadcast.py
index 02b0d799bd..543dc98660 100644
--- a/python/pyspark/tests/test_broadcast.py
+++ b/python/pyspark/tests/test_broadcast.py
@@ -145,7 +145,7 @@ class BroadcastFrameProtocolTest(unittest.TestCase):
 
 
 if __name__ == '__main__':
-    from pyspark.tests.test_broadcast import *
+    from pyspark.tests.test_broadcast import *  # noqa: F401
 
     try:
         import xmlrunner
diff --git a/python/pyspark/tests/test_conf.py b/python/pyspark/tests/test_conf.py
index d51fd3d1f4..3e80c17f49 100644
--- a/python/pyspark/tests/test_conf.py
+++ b/python/pyspark/tests/test_conf.py
@@ -33,7 +33,7 @@ class ConfTests(unittest.TestCase):
 
 
 if __name__ == "__main__":
-    from pyspark.tests.test_conf import *
+    from pyspark.tests.test_conf import *  # noqa: F401
 
     try:
         import xmlrunner
diff --git a/python/pyspark/tests/test_context.py b/python/pyspark/tests/test_context.py
index f398cec344..9f159f7703 100644
--- a/python/pyspark/tests/test_context.py
+++ b/python/pyspark/tests/test_context.py
@@ -126,7 +126,7 @@ class AddFileTests(PySparkTestCase):
         # To ensure that we're actually testing addPyFile's effects, check that
         # this fails due to `userlibrary` not being on the Python path:
         def func():
-            from userlibrary import UserClass
+            from userlibrary import UserClass  # noqa: F401
         self.assertRaises(ImportError, func)
         path = os.path.join(SPARK_HOME, "python/test_support/userlibrary.py")
         self.sc.addPyFile(path)
@@ -137,7 +137,7 @@ class AddFileTests(PySparkTestCase):
         # To ensure that we're actually testing addPyFile's effects, check that
         # this fails due to `userlibrary` not being on the Python path:
         def func():
-            from userlib import UserClass
+            from userlib import UserClass  # noqa: F401
         self.assertRaises(ImportError, func)
         path = os.path.join(SPARK_HOME, "python/test_support/userlib-0.1.zip")
         self.sc.addPyFile(path)
@@ -318,7 +318,7 @@ class ContextTestsWithResources(unittest.TestCase):
 
 
 if __name__ == "__main__":
-    from pyspark.tests.test_context import *
+    from pyspark.tests.test_context import *  # noqa: F401
 
     try:
         import xmlrunner
diff --git a/python/pyspark/tests/test_daemon.py b/python/pyspark/tests/test_daemon.py
index 898fb39d9e..b1f8c71c77 100644
--- a/python/pyspark/tests/test_daemon.py
+++ b/python/pyspark/tests/test_daemon.py
@@ -73,7 +73,7 @@ class DaemonTests(unittest.TestCase):
 
 
 if __name__ == "__main__":
-    from pyspark.tests.test_daemon import *
+    from pyspark.tests.test_daemon import *  # noqa: F401
 
     try:
         import xmlrunner
diff --git a/python/pyspark/tests/test_join.py b/python/pyspark/tests/test_join.py
index 138d062e72..815c78ef9a 100644
--- a/python/pyspark/tests/test_join.py
+++ b/python/pyspark/tests/test_join.py
@@ -59,7 +59,7 @@ class JoinTests(ReusedPySparkTestCase):
 
 if __name__ == "__main__":
     import unittest
-    from pyspark.tests.test_join import *
+    from pyspark.tests.test_join import *  # noqa: F401
 
     try:
         import xmlrunner
diff --git a/python/pyspark/tests/test_pin_thread.py b/python/pyspark/tests/test_pin_thread.py
index 50eb8e0ec8..efe7d7f663 100644
--- a/python/pyspark/tests/test_pin_thread.py
+++ b/python/pyspark/tests/test_pin_thread.py
@@ -16,7 +16,6 @@
 #
 import os
 import time
-import random
 import threading
 import unittest
@@ -167,7 +166,7 @@ class PinThreadTests(unittest.TestCase):
 
 if __name__ == "__main__":
     import unittest
-    from pyspark.tests.test_pin_thread import *
+    from pyspark.tests.test_pin_thread import *  # noqa: F401
 
     try:
         import xmlrunner
diff --git a/python/pyspark/tests/test_profiler.py b/python/pyspark/tests/test_profiler.py
index dbce72a0d3..ca144cc6e1 100644
--- a/python/pyspark/tests/test_profiler.py
+++ b/python/pyspark/tests/test_profiler.py
@@ -98,7 +98,7 @@ class ProfilerTests2(unittest.TestCase):
 
 
 if __name__ == "__main__":
-    from pyspark.tests.test_profiler import *
+    from pyspark.tests.test_profiler import *  # noqa: F401
 
     try:
         import xmlrunner
diff --git a/python/pyspark/tests/test_rdd.py b/python/pyspark/tests/test_rdd.py
index 1a580e27ea..c154bda00d 100644
--- a/python/pyspark/tests/test_rdd.py
+++ b/python/pyspark/tests/test_rdd.py
@@ -18,7 +18,6 @@ from datetime import datetime, timedelta
 import hashlib
 import os
 import random
-import sys
 import tempfile
 import time
 from glob import glob
@@ -26,7 +25,7 @@ from glob import glob
 from py4j.protocol import Py4JJavaError
 
 from pyspark import shuffle, RDD
-from pyspark.resource import ExecutorResourceRequests, ResourceProfile, ResourceProfileBuilder,\
+from pyspark.resource import ExecutorResourceRequests, ResourceProfileBuilder,\
     TaskResourceRequests
 from pyspark.serializers import CloudPickleSerializer, BatchedSerializer, PickleSerializer,\
     MarshalSerializer, UTF8Deserializer, NoOpSerializer
@@ -882,7 +881,7 @@ class RDDTests(ReusedPySparkTestCase):
 
 if __name__ == "__main__":
     import unittest
-    from pyspark.tests.test_rdd import *
+    from pyspark.tests.test_rdd import *  # noqa: F401
 
     try:
         import xmlrunner
diff --git a/python/pyspark/tests/test_rddbarrier.py b/python/pyspark/tests/test_rddbarrier.py
index 8534fb4abb..f0a05a23cc 100644
--- a/python/pyspark/tests/test_rddbarrier.py
+++ b/python/pyspark/tests/test_rddbarrier.py
@@ -40,7 +40,7 @@ class RDDBarrierTests(ReusedPySparkTestCase):
 
 if __name__ == "__main__":
     import unittest
-    from pyspark.tests.test_rddbarrier import *
+    from pyspark.tests.test_rddbarrier import *  # noqa: F401
 
     try:
         import xmlrunner
diff --git a/python/pyspark/tests/test_readwrite.py b/python/pyspark/tests/test_readwrite.py
index faa006c7d8..adbc343c65 100644
--- a/python/pyspark/tests/test_readwrite.py
+++ b/python/pyspark/tests/test_readwrite.py
@@ -16,10 +16,8 @@
 #
 import os
 import shutil
-import sys
 import tempfile
 import unittest
-from array import array
 
 from pyspark.testing.utils import ReusedPySparkTestCase, SPARK_HOME
@@ -306,7 +304,7 @@ class OutputFormatTests(ReusedPySparkTestCase):
 
 
 if __name__ == "__main__":
-    from pyspark.tests.test_readwrite import *
+    from pyspark.tests.test_readwrite import *  # noqa: F401
 
     try:
         import xmlrunner
diff --git a/python/pyspark/tests/test_serializers.py b/python/pyspark/tests/test_serializers.py
index 8caf9da85a..bffd78a501 100644
--- a/python/pyspark/tests/test_serializers.py
+++ b/python/pyspark/tests/test_serializers.py
@@ -87,7 +87,7 @@ class SerializationTestCase(unittest.TestCase):
     def test_pickling_file_handles(self):
         # to be corrected with SPARK-11160
         try:
-            import xmlrunner
+            import xmlrunner  # noqa: F401
         except ImportError:
             ser = CloudPickleSerializer()
             out1 = sys.stderr
diff --git a/python/pyspark/tests/test_shuffle.py b/python/pyspark/tests/test_shuffle.py
index 434414618e..061b93f32c 100644
--- a/python/pyspark/tests/test_shuffle.py
+++ b/python/pyspark/tests/test_shuffle.py
@@ -15,7 +15,6 @@
 # limitations under the License.
 #
 import random
-import sys
 import unittest
 
 from py4j.protocol import Py4JJavaError
@@ -168,7 +167,7 @@ class SorterTests(unittest.TestCase):
 
 
 if __name__ == "__main__":
-    from pyspark.tests.test_shuffle import *
+    from pyspark.tests.test_shuffle import *  # noqa: F401
 
     try:
         import xmlrunner
diff --git a/python/pyspark/tests/test_taskcontext.py b/python/pyspark/tests/test_taskcontext.py
index 8c2bedbe4e..f5be685643 100644
--- a/python/pyspark/tests/test_taskcontext.py
+++ b/python/pyspark/tests/test_taskcontext.py
@@ -16,7 +16,6 @@
 #
 import os
 import random
-import shutil
 import stat
 import sys
 import tempfile
@@ -322,7 +321,7 @@ class TaskContextTestsWithResources(unittest.TestCase):
 
 if __name__ == "__main__":
     import unittest
-    from pyspark.tests.test_taskcontext import *
+    from pyspark.tests.test_taskcontext import *  # noqa: F401
 
     try:
         import xmlrunner
diff --git a/python/pyspark/tests/test_util.py b/python/pyspark/tests/test_util.py
index 511d62a51f..e853bc322c 100644
--- a/python/pyspark/tests/test_util.py
+++ b/python/pyspark/tests/test_util.py
@@ -74,7 +74,7 @@ class UtilTests(PySparkTestCase):
 
 
 if __name__ == "__main__":
-    from pyspark.tests.test_util import *
+    from pyspark.tests.test_util import *  # noqa: F401
 
     try:
         import xmlrunner
diff --git a/python/pyspark/tests/test_worker.py b/python/pyspark/tests/test_worker.py
index 3b1848dcfd..a855eaafc1 100644
--- a/python/pyspark/tests/test_worker.py
+++ b/python/pyspark/tests/test_worker.py
@@ -16,14 +16,13 @@
 # limitations under the License.
 #
 import os
-import sys
 import tempfile
 import threading
 import time
 import unittest
 
 has_resource_module = True
 try:
-    import resource
+    import resource  # noqa: F401
 except ImportError:
     has_resource_module = False
@@ -200,7 +199,7 @@ class WorkerMemoryTest(unittest.TestCase):
 
 if __name__ == "__main__":
     import unittest
-    from pyspark.tests.test_worker import *
+    from pyspark.tests.test_worker import *  # noqa: F401
 
     try:
         import xmlrunner