Fokko Driesprong 9fcf0ea718 [SPARK-32319][PYSPARK] Disallow the use of unused imports
Disallow the use of unused imports:

- Unused imports unnecessarily increase the memory footprint of the application
- Moves the imports that are only required for the examples in the docstrings from file scope into the examples themselves. This keeps the files themselves clean, and gives more complete examples, as they also include the imports :)

fokkodriesprongFan spark % flake8 python | grep -i "imported but unused"
python/pyspark/ F401 'functools.partial' imported but unused
python/pyspark/ F401 'traceback' imported but unused
python/pyspark/ F401 '_heapq.*' imported but unused
python/pyspark/ F401 'pyspark.version.__version__' imported but unused
python/pyspark/ F401 'pyspark._globals._NoValue' imported but unused
python/pyspark/ F401 'pyspark.sql.SQLContext' imported but unused
python/pyspark/ F401 'pyspark.sql.HiveContext' imported but unused
python/pyspark/ F401 'pyspark.sql.Row' imported but unused
python/pyspark/ F401 're' imported but unused
python/pyspark/ F401 'tempfile.NamedTemporaryFile' imported but unused
python/pyspark/mllib/ F401 'pyspark.mllib.linalg.SparseVector' imported but unused
python/pyspark/mllib/ F401 'pyspark.mllib.linalg.SparseVector' imported but unused
python/pyspark/mllib/ F401 'pyspark.mllib.linalg.DenseVector' imported but unused
python/pyspark/mllib/ F401 'pyspark.mllib.linalg.SparseVector' imported but unused
python/pyspark/mllib/ F401 'pyspark.mllib.linalg.DenseVector' imported but unused
python/pyspark/mllib/ F401 'pyspark.mllib.linalg.SparseVector' imported but unused
python/pyspark/mllib/ F401 'pyspark.mllib.regression.LabeledPoint' imported but unused
python/pyspark/mllib/tests/ F401 'sys' imported but unused
python/pyspark/mllib/tests/ F401 'pyspark.mllib.tests.test_linalg.*' imported but unused
python/pyspark/mllib/tests/ F401 'numpy.random' imported but unused
python/pyspark/mllib/tests/ F401 'numpy.exp' imported but unused
python/pyspark/mllib/tests/ F401 'pyspark.mllib.linalg.Vector' imported but unused
python/pyspark/mllib/tests/ F401 'pyspark.mllib.linalg.VectorUDT' imported but unused
python/pyspark/mllib/tests/ F401 'pyspark.mllib.tests.test_feature.*' imported but unused
python/pyspark/mllib/tests/ F401 'pyspark.mllib.tests.test_util.*' imported but unused
python/pyspark/mllib/tests/ F401 'pyspark.mllib.linalg.Vector' imported but unused
python/pyspark/mllib/tests/ F401 'pyspark.mllib.linalg.SparseVector' imported but unused
python/pyspark/mllib/tests/ F401 'pyspark.mllib.linalg.DenseVector' imported but unused
python/pyspark/mllib/tests/ F401 'pyspark.mllib.linalg.VectorUDT' imported but unused
python/pyspark/mllib/tests/ F401 'pyspark.mllib.linalg._convert_to_vector' imported but unused
python/pyspark/mllib/tests/ F401 'pyspark.mllib.linalg.DenseMatrix' imported but unused
python/pyspark/mllib/tests/ F401 'pyspark.mllib.linalg.SparseMatrix' imported but unused
python/pyspark/mllib/tests/ F401 'pyspark.mllib.linalg.MatrixUDT' imported but unused
python/pyspark/mllib/tests/ F401 'pyspark.mllib.tests.test_stat.*' imported but unused
python/pyspark/mllib/tests/ F401 'time.time' imported but unused
python/pyspark/mllib/tests/ F401 'time.sleep' imported but unused
python/pyspark/mllib/tests/ F401 'pyspark.mllib.tests.test_streaming_algorithms.*' imported but unused
python/pyspark/mllib/tests/ F401 'pyspark.mllib.tests.test_algorithms.*' imported but unused
python/pyspark/tests/ F401 'xmlrunner' imported but unused
python/pyspark/tests/ F401 'sys' imported but unused
python/pyspark/tests/ F401 'pyspark.resource.ResourceProfile' imported but unused
python/pyspark/tests/ F401 'pyspark.tests.test_rdd.*' imported but unused
python/pyspark/tests/ F401 'sys' imported but unused
python/pyspark/tests/ F401 'array.array' imported but unused
python/pyspark/tests/ F401 'pyspark.tests.test_readwrite.*' imported but unused
python/pyspark/tests/ F401 'pyspark.tests.test_join.*' imported but unused
python/pyspark/tests/ F401 'shutil' imported but unused
python/pyspark/tests/ F401 'pyspark.tests.test_taskcontext.*' imported but unused
python/pyspark/tests/ F401 'pyspark.tests.test_conf.*' imported but unused
python/pyspark/tests/ F401 'pyspark.tests.test_broadcast.*' imported but unused
python/pyspark/tests/ F401 'pyspark.tests.test_daemon.*' imported but unused
python/pyspark/tests/ F401 'pyspark.tests.test_util.*' imported but unused
python/pyspark/tests/ F401 'random' imported but unused
python/pyspark/tests/ F401 'pyspark.tests.test_pin_thread.*' imported but unused
python/pyspark/tests/ F401 'sys' imported but unused
python/pyspark/tests/ F401 'resource' imported but unused
python/pyspark/tests/ F401 'pyspark.tests.test_worker.*' imported but unused
python/pyspark/tests/ F401 'pyspark.tests.test_profiler.*' imported but unused
python/pyspark/tests/ F401 'sys' imported but unused
python/pyspark/tests/ F401 'pyspark.tests.test_shuffle.*' imported but unused
python/pyspark/tests/ F401 'pyspark.tests.test_rddbarrier.*' imported but unused
python/pyspark/tests/ F401 'userlibrary.UserClass' imported but unused
python/pyspark/tests/ F401 'userlib.UserClass' imported but unused
python/pyspark/tests/ F401 'pyspark.tests.test_context.*' imported but unused
python/pyspark/tests/ F401 'pyspark.tests.test_appsubmit.*' imported but unused
python/pyspark/streaming/ F401 'sys' imported but unused
python/pyspark/streaming/tests/ F401 'pyspark.RDD' imported but unused
python/pyspark/streaming/tests/ F401 'pyspark.streaming.tests.test_dstream.*' imported but unused
python/pyspark/streaming/tests/ F401 'pyspark.streaming.tests.test_kinesis.*' imported but unused
python/pyspark/streaming/tests/ F401 'pyspark.streaming.tests.test_listener.*' imported but unused
python/pyspark/streaming/tests/ F401 'pyspark.streaming.tests.test_context.*' imported but unused
python/pyspark/testing/ F401 'scipy.sparse' imported but unused
python/pyspark/testing/ F401 'numpy as np' imported but unused
python/pyspark/ml/ F401 '' imported but unused
python/pyspark/ml/ F401 '' imported but unused
python/pyspark/ml/ F401 '' imported but unused
python/pyspark/ml/ F401 'sys' imported but unused
python/pyspark/ml/ F401 '' imported but unused
python/pyspark/ml/ F401 'sys' imported but unused
python/pyspark/ml/ F401 '' imported but unused
python/pyspark/ml/ F401 '' imported but unused
python/pyspark/ml/tests/ F401 'sys' imported but unused
python/pyspark/ml/tests/ F401 '*' imported but unused
python/pyspark/ml/tests/ F401 '*' imported but unused
python/pyspark/ml/tests/ F401 'pyspark.sql.functions as F' imported but unused
python/pyspark/ml/tests/ F401 '*' imported but unused
python/pyspark/ml/tests/ F401 '*' imported but unused
python/pyspark/ml/tests/ F401 'sys' imported but unused
python/pyspark/ml/tests/ F401 '*' imported but unused
python/pyspark/ml/tests/ F401 'py4j' imported but unused
python/pyspark/ml/tests/ F401 'pyspark.testing.mlutils.PySparkTestCase' imported but unused
python/pyspark/ml/tests/ F401 '*' imported but unused
python/pyspark/ml/tests/ F401 '*' imported but unused
python/pyspark/ml/tests/ F401 '*' imported but unused
python/pyspark/ml/tests/ F401 '*' imported but unused
python/pyspark/ml/tests/ F401 '*' imported but unused
python/pyspark/ml/tests/ F401 'sys' imported but unused
python/pyspark/ml/tests/ F401 '*' imported but unused
python/pyspark/ml/tests/ F401 '*' imported but unused
python/pyspark/ml/tests/ F401 '*' imported but unused
python/pyspark/ml/param/ F401 'sys' imported but unused
python/pyspark/resource/tests/ F401 'random' imported but unused
python/pyspark/resource/tests/ F401 'pyspark.resource.ResourceProfile' imported but unused
python/pyspark/resource/tests/ F401 'pyspark.resource.tests.test_resources.*' imported but unused
python/pyspark/sql/ F401 'pyspark.sql.udf.UserDefinedFunction' imported but unused
python/pyspark/sql/ F401 'pyspark.sql.pandas.functions.pandas_udf' imported but unused
python/pyspark/sql/ F401 'pyspark.sql.types.Row' imported but unused
python/pyspark/sql/ F401 'pyspark.sql.types.StringType' imported but unused
python/pyspark/sql/ F401 'pyspark.sql.Row' imported but unused
python/pyspark/sql/ F401 'pyspark.sql.types.IntegerType' imported but unused
python/pyspark/sql/ F401 'pyspark.sql.types.Row' imported but unused
python/pyspark/sql/ F401 'pyspark.sql.types.StringType' imported but unused
python/pyspark/sql/ F401 'pyspark.sql.udf.UDFRegistration' imported but unused
python/pyspark/sql/ F401 'pyspark.sql.Row' imported but unused
python/pyspark/sql/tests/ F401 'pyspark.sql.tests.test_utils.*' imported but unused
python/pyspark/sql/tests/ F401 'sys' imported but unused
python/pyspark/sql/tests/ F401 'pyspark.sql.functions.pandas_udf' imported but unused
python/pyspark/sql/tests/ F401 'pyspark.sql.functions.PandasUDFType' imported but unused
python/pyspark/sql/tests/ F401 'pyspark.sql.tests.test_pandas_map.*' imported but unused
python/pyspark/sql/tests/ F401 'pyspark.sql.tests.test_catalog.*' imported but unused
python/pyspark/sql/tests/ F401 'pyspark.sql.tests.test_group.*' imported but unused
python/pyspark/sql/tests/ F401 'pyspark.sql.tests.test_session.*' imported but unused
python/pyspark/sql/tests/ F401 'pyspark.sql.tests.test_conf.*' imported but unused
python/pyspark/sql/tests/ F401 'sys' imported but unused
python/pyspark/sql/tests/ F401 'pyspark.sql.functions.sum' imported but unused
python/pyspark/sql/tests/ F401 'pyspark.sql.functions.PandasUDFType' imported but unused
python/pyspark/sql/tests/ F401 'pandas.util.testing.assert_series_equal' imported but unused
python/pyspark/sql/tests/ F401 'pyarrow as pa' imported but unused
python/pyspark/sql/tests/ F401 'pyspark.sql.tests.test_pandas_cogrouped_map.*' imported but unused
python/pyspark/sql/tests/ F401 'py4j' imported but unused
python/pyspark/sql/tests/ F401 'pyspark.sql.tests.test_pandas_udf_typehints.*' imported but unused
python/pyspark/sql/tests/ F401 'sys' imported but unused
python/pyspark/sql/tests/ F401 'pyspark.sql.functions.exists' imported but unused
python/pyspark/sql/tests/ F401 'pyspark.sql.tests.test_functions.*' imported but unused
python/pyspark/sql/tests/ F401 'sys' imported but unused
python/pyspark/sql/tests/ F401 'pyarrow as pa' imported but unused
python/pyspark/sql/tests/ F401 'pyspark.sql.tests.test_pandas_udf_window.*' imported but unused
python/pyspark/sql/tests/ F401 'pyarrow as pa' imported but unused
python/pyspark/sql/tests/ F401 'sys' imported but unused
python/pyspark/sql/tests/ F401 'pyarrow as pa' imported but unused
python/pyspark/sql/tests/ F401 'pyspark.sql.DataFrame' imported but unused
python/pyspark/sql/avro/ F401 'pyspark.sql.Row' imported but unused
python/pyspark/sql/pandas/ F401 'sys' imported but unused

fokkodriesprongFan spark % flake8 python | grep -i "imported but unused"
fokkodriesprongFan spark %

### What changes were proposed in this pull request?

Removing unused imports from the Python files to keep everything nice and tidy.

### Why are the changes needed?

Cleaning up the imports that aren't used, and suppressing the unused-import warning for the imports that are kept as references for other modules, preserving backward compatibility.

### Does this PR introduce _any_ user-facing change?


### How was this patch tested?

Adding the rule to the existing Flake8 checks.

Closes #29121 from Fokko/SPARK-32319.

Authored-by: Fokko Driesprong <>
Signed-off-by: Dongjoon Hyun <>
2020-08-08 08:51:57 -07:00

332 lines
13 KiB

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
import random
import stat
import sys
import tempfile
import time
import unittest
from pyspark import SparkConf, SparkContext, TaskContext, BarrierTaskContext
from pyspark.testing.utils import PySparkTestCase, SPARK_HOME
class TaskContextTests(PySparkTestCase):
    """Checks of TaskContext / BarrierTaskContext as seen from inside running tasks.

    Every test runs against a ``local[4, 2]`` SparkContext: four worker
    threads and up to two attempts per task, so a failed task is retried once.
    """

    def setUp(self):
        """Create the SparkContext used by each test."""
        self._old_sys_path = list(sys.path)
        class_name = self.__class__.__name__
        # Allow retries even though they are normally disabled in local mode
        # NOTE(review): assumes PySparkTestCase.tearDown stops `self.sc` and restores
        # sys.path from `self._old_sys_path` -- confirm against pyspark.testing.utils.
        self.sc = SparkContext('local[4, 2]', class_name)

    def test_stage_id(self):
        """Test the stage ids are available and incrementing as expected."""
        rdd = self.sc.parallelize(range(10))
        stage1 = rdd.map(lambda x: TaskContext.get().stageId()).take(1)[0]
        stage2 = rdd.map(lambda x: TaskContext.get().stageId()).take(1)[0]
        # Test using the constructor directly rather than the get()
        stage3 = rdd.map(lambda x: TaskContext().stageId()).take(1)[0]
        self.assertEqual(stage1 + 1, stage2)
        self.assertEqual(stage1 + 2, stage3)
        self.assertEqual(stage2 + 1, stage3)

    def test_resources(self):
        """Test the resources are empty by default."""
        rdd = self.sc.parallelize(range(10))
        resources1 = rdd.map(lambda x: TaskContext.get().resources()).take(1)[0]
        # Test using the constructor directly rather than the get()
        resources2 = rdd.map(lambda x: TaskContext().resources()).take(1)[0]
        self.assertEqual(len(resources1), 0)
        self.assertEqual(len(resources2), 0)

    def test_partition_id(self):
        """Test the partition id."""
        rdd1 = self.sc.parallelize(range(10), 1)
        rdd2 = self.sc.parallelize(range(10), 2)
        pids1 = rdd1.map(lambda x: TaskContext.get().partitionId()).collect()
        pids2 = rdd2.map(lambda x: TaskContext.get().partitionId()).collect()
        self.assertEqual(0, pids1[0])
        self.assertEqual(0, pids1[9])
        self.assertEqual(0, pids2[0])
        self.assertEqual(1, pids2[9])

    def test_attempt_number(self):
        """Verify the attempt numbers are correctly reported."""
        rdd = self.sc.parallelize(range(10))
        # Verify a simple job with no failures
        attempt_numbers = rdd.map(lambda x: TaskContext.get().attemptNumber()).collect()
        # Explicit loop: a bare map() is lazy in Python 3, so assertions wrapped in it
        # would never execute.
        for attempt in attempt_numbers:
            self.assertEqual(0, attempt)

        def fail_on_first(x):
            """Fail on the first attempt so we get a positive attempt number"""
            tc = TaskContext.get()
            attempt_number = tc.attemptNumber()
            partition_id = tc.partitionId()
            attempt_id = tc.taskAttemptId()
            if attempt_number == 0 and partition_id == 0:
                raise Exception("Failing on first attempt")
            return [x, partition_id, attempt_number, attempt_id]

        result = rdd.map(fail_on_first).collect()
        # The first partition is re-submitted (attempt 1); all other partitions
        # should still be on attempt 0
        self.assertEqual([0, 0, 1], result[0][0:3])
        self.assertEqual([9, 3, 0], result[9][0:3])
        for row in result:
            expected_attempt = 1 if row[1] == 0 else 0
            self.assertEqual(expected_attempt, row[2])
        # The task attempt id should be different
        self.assertNotEqual(result[0][3], result[9][3])

    def test_tc_on_driver(self):
        """Verify that getting the TaskContext on the driver returns None."""
        tc = TaskContext.get()
        self.assertIsNone(tc)

    def test_get_local_property(self):
        """Verify that local properties set on the driver are available in TaskContext."""
        key = "testkey"
        value = "testvalue"
        self.sc.setLocalProperty(key, value)
        try:
            rdd = self.sc.parallelize(range(1), 1)
            prop1 = rdd.map(lambda _: TaskContext.get().getLocalProperty(key)).collect()[0]
            self.assertEqual(prop1, value)
            prop2 = rdd.map(lambda _: TaskContext.get().getLocalProperty("otherkey")).collect()[0]
            self.assertIsNone(prop2)
        finally:
            # Clear the property again so it cannot leak into later jobs.
            self.sc.setLocalProperty(key, None)

    def test_barrier(self):
        """
        Verify that BarrierTaskContext.barrier() performs global sync among all barrier tasks
        within a stage.
        """
        rdd = self.sc.parallelize(range(10), 4)

        def f(iterator):
            yield sum(iterator)

        def context_barrier(x):
            tc = BarrierTaskContext.get()
            # Tasks sleep for different durations; the barrier must line them up again.
            time.sleep(random.randint(1, 10))
            tc.barrier()
            return time.time()

        times = rdd.barrier().mapPartitions(f).map(context_barrier).collect()
        self.assertLess(max(times) - min(times), 1)

    def test_all_gather(self):
        """
        Verify that BarrierTaskContext.allGather() performs global sync among all barrier tasks
        within a stage and passes messages properly.
        """
        rdd = self.sc.parallelize(range(10), 4)

        def f(iterator):
            yield sum(iterator)

        def context_barrier(x):
            tc = BarrierTaskContext.get()
            time.sleep(random.randint(1, 10))
            out = tc.allGather(str(tc.partitionId()))
            return [int(e) for e in out]

        pids = rdd.barrier().mapPartitions(f).map(context_barrier).collect()[0]
        self.assertEqual(pids, [0, 1, 2, 3])

    def test_barrier_infos(self):
        """
        Verify that BarrierTaskContext.getTaskInfos() returns a list of all task infos in the
        barrier stage.
        """
        rdd = self.sc.parallelize(range(10), 4)

        def f(iterator):
            yield sum(iterator)

        taskInfos = (
            rdd.barrier()
            .mapPartitions(f)
            .map(lambda x: BarrierTaskContext.get().getTaskInfos())
            .collect()
        )
        self.assertEqual(len(taskInfos), 4)
        self.assertEqual(len(taskInfos[0]), 4)

    def test_context_get(self):
        """
        Verify that TaskContext.get() works both in or not in a barrier stage.
        """
        rdd = self.sc.parallelize(range(10), 4)

        def f(iterator):
            taskContext = TaskContext.get()
            # Check the barrier type first: the two branches are told apart only by
            # the +1 / +2 offset asserted below.
            if isinstance(taskContext, BarrierTaskContext):
                yield taskContext.partitionId() + 1
            elif isinstance(taskContext, TaskContext):
                yield taskContext.partitionId() + 2
            else:
                yield -1

        # for normal stage
        result1 = rdd.mapPartitions(f).collect()
        self.assertEqual(result1, [2, 3, 4, 5])
        # for barrier stage
        result2 = rdd.barrier().mapPartitions(f).collect()
        self.assertEqual(result2, [1, 2, 3, 4])

    def test_barrier_context_get(self):
        """
        Verify that BarrierTaskContext.get() should only works in a barrier stage.
        """
        rdd = self.sc.parallelize(range(10), 4)

        def f(iterator):
            try:
                taskContext = BarrierTaskContext.get()
            except Exception:
                # Outside a barrier stage the lookup is expected to fail.
                yield -1
            else:
                yield taskContext.partitionId()

        # for normal stage
        result1 = rdd.mapPartitions(f).collect()
        self.assertEqual(result1, [-1, -1, -1, -1])
        # for barrier stage
        result2 = rdd.barrier().mapPartitions(f).collect()
        self.assertEqual(result2, [0, 1, 2, 3])
class TaskContextTestsWithWorkerReuse(unittest.TestCase):
    """Task-context checks with ``spark.python.worker.reuse`` enabled on ``local[2]``."""

    def setUp(self):
        """Start a two-thread local SparkContext that reuses Python workers."""
        class_name = self.__class__.__name__
        conf = SparkConf().set("spark.python.worker.reuse", "true")
        self.sc = SparkContext('local[2]', class_name, conf=conf)

    def test_barrier_with_python_worker_reuse(self):
        """
        Regression test for SPARK-25921: verify that BarrierTaskContext.barrier() with
        reused python worker.
        """
        # start a normal job first to start all workers and get all worker pids
        worker_pids = self.sc.parallelize(range(2), 2).map(lambda x: os.getpid()).collect()
        # the worker will reuse in this barrier job
        rdd = self.sc.parallelize(range(10), 2)

        def f(iterator):
            yield sum(iterator)

        def context_barrier(x):
            tc = BarrierTaskContext.get()
            time.sleep(random.randint(1, 10))
            tc.barrier()
            return (time.time(), os.getpid())

        result = rdd.barrier().mapPartitions(f).map(context_barrier).collect()
        times = [finished_at for finished_at, _ in result]
        pids = [pid for _, pid in result]
        # check both barrier and worker reuse effect
        self.assertLess(max(times) - min(times), 1)
        for pid in pids:
            self.assertIn(pid, worker_pids)

    def test_task_context_correct_with_python_worker_reuse(self):
        """Verify the task context correct when reused python worker"""
        # start a normal job first to start all workers and get all worker pids
        worker_pids = self.sc.parallelize(range(2), 2).map(lambda x: os.getpid()).collect()
        # the worker will reuse in this barrier job
        rdd = self.sc.parallelize(range(10), 2)

        def context(iterator):
            tp = TaskContext.get().partitionId()
            try:
                bp = BarrierTaskContext.get().partitionId()
            except Exception:
                # Not in a barrier stage: report a sentinel instead of failing the task.
                bp = -1
            yield (tp, bp, os.getpid())

        def check(stage_result, expected_barrier_ids):
            """Assert partition ids, barrier partition ids and worker reuse for one stage."""
            tps, bps, pids = zip(*stage_result)
            self.assertEqual(tps, (0, 1))
            self.assertEqual(bps, expected_barrier_ids)
            for pid in pids:
                self.assertIn(pid, worker_pids)

        # normal stage after normal stage
        check(rdd.mapPartitions(context).collect(), (-1, -1))
        # barrier stage after normal stage
        check(rdd.barrier().mapPartitions(context).collect(), (0, 1))
        # normal stage after barrier stage
        check(rdd.mapPartitions(context).collect(), (-1, -1))

    def tearDown(self):
        """Stop the SparkContext created in setUp."""
        self.sc.stop()
class TaskContextTestsWithResources(unittest.TestCase):
    """Checks that a GPU resource found by a discovery script shows up in TaskContext."""

    def setUp(self):
        """Write a GPU discovery script and start a local-cluster SparkContext using it."""
        class_name = self.__class__.__name__
        self.tempFile = tempfile.NamedTemporaryFile(delete=False)
        self.tempFile.write(b'echo {\\"name\\": \\"gpu\\", \\"addresses\\": [\\"0\\"]}')
        # Close so the script content is flushed to disk before a worker executes it.
        self.tempFile.close()
        # create temporary directory for Worker resources coordination
        # NOTE(review): nothing visible in this class reads `self.tempdir`; only a unique
        # path is reserved here (the file is removed right away so it is not leaked) --
        # confirm whether it is still needed.
        self.tempdir = tempfile.NamedTemporaryFile(delete=False)
        self.tempdir.close()
        os.unlink(self.tempdir.name)
        # The discovery script must be executable.
        os.chmod(self.tempFile.name, stat.S_IRWXU | stat.S_IXGRP | stat.S_IRGRP |
                 stat.S_IROTH | stat.S_IXOTH)
        conf = SparkConf().set("spark.test.home", SPARK_HOME)
        conf = conf.set("spark.worker.resource.gpu.discoveryScript", self.tempFile.name)
        conf = conf.set("spark.worker.resource.gpu.amount", 1)
        conf = conf.set("spark.task.resource.gpu.amount", "1")
        conf = conf.set("spark.executor.resource.gpu.amount", "1")
        self.sc = SparkContext('local-cluster[2,1,1024]', class_name, conf=conf)

    def test_resources(self):
        """Test the resources are available."""
        rdd = self.sc.parallelize(range(10))
        resources = rdd.map(lambda x: TaskContext.get().resources()).take(1)[0]
        self.assertEqual(len(resources), 1)
        self.assertIn('gpu', resources)
        self.assertEqual(resources['gpu'].name, 'gpu')
        self.assertEqual(resources['gpu'].addresses, ['0'])

    def tearDown(self):
        """Stop the SparkContext and delete the discovery script."""
        try:
            self.sc.stop()
        finally:
            # Remove the script even if stopping the context raises.
            os.unlink(self.tempFile.name)
if __name__ == "__main__":
    import unittest
    from pyspark.tests.test_taskcontext import *  # noqa: F401

    # Use the XML report runner when xmlrunner is installed; otherwise let
    # unittest fall back to its default text runner.
    try:
        import xmlrunner
        testRunner = xmlrunner.XMLTestRunner(output='target/test-reports', verbosity=2)
    except ImportError:
        testRunner = None
    unittest.main(testRunner=testRunner, verbosity=2)