89d9b7cc64
### What changes were proposed in this pull request? This PR proposes: 1. To introduce `InheritableThread` class, that works identically with `threading.Thread` but it can inherit the inheritable attributes of a JVM thread such as `InheritableThreadLocal`. This was a problem from the pinned thread mode, see also https://github.com/apache/spark/pull/24898. Now it works as below: ```python import pyspark spark.sparkContext.setLocalProperty("a", "hi") def print_prop(): print(spark.sparkContext.getLocalProperty("a")) pyspark.InheritableThread(target=print_prop).start() ``` ``` hi ``` 2. Also, it adds the resource leak fix into `InheritableThread`. Py4J leaks the thread and does not close the connection from Python to JVM. In `InheritableThread`, it manually closes the connections when PVM garbage collection happens. So, JVM threads finish safely. I manually verified by profiling but there's also another easy way to verify: ```bash PYSPARK_PIN_THREAD=true ./bin/pyspark ``` ```python >>> from threading import Thread >>> Thread(target=lambda: spark.range(1000).collect()).start() >>> Thread(target=lambda: spark.range(1000).collect()).start() >>> Thread(target=lambda: spark.range(1000).collect()).start() >>> spark._jvm._gateway_client.deque deque([<py4j.clientserver.ClientServerConnection object at 0x119f7aba8>, <py4j.clientserver.ClientServerConnection object at 0x119fc9b70>, <py4j.clientserver.ClientServerConnection object at 0x119fc9e10>, <py4j.clientserver.ClientServerConnection object at 0x11a015358>, <py4j.clientserver.ClientServerConnection object at 0x119fc00f0>]) >>> Thread(target=lambda: spark.range(1000).collect()).start() >>> spark._jvm._gateway_client.deque deque([<py4j.clientserver.ClientServerConnection object at 0x119f7aba8>, <py4j.clientserver.ClientServerConnection object at 0x119fc9b70>, <py4j.clientserver.ClientServerConnection object at 0x119fc9e10>, <py4j.clientserver.ClientServerConnection object at 0x11a015358>, 
<py4j.clientserver.ClientServerConnection object at 0x119fc08d0>, <py4j.clientserver.ClientServerConnection object at 0x119fc00f0>]) ``` This issue is fixed now. 3. Because now we have a fix for the issue here, it also proposes to deprecate `collectWithJobGroup` which was a temporary workaround added to avoid this leak issue. ### Why are the changes needed? To support pinned thread mode properly without a resource leak, and a proper inheritable local properties. ### Does this PR introduce _any_ user-facing change? Yes, it adds an API `InheritableThread` class for pinned thread mode. ### How was this patch tested? Manually tested as described above, and unit test was added as well. Closes #28968 from HyukjinKwon/SPARK-32010. Authored-by: HyukjinKwon <gurwls223@apache.org> Signed-off-by: HyukjinKwon <gurwls223@apache.org>
178 lines
6.1 KiB
Python
178 lines
6.1 KiB
Python
#
|
|
# Licensed to the Apache Software Foundation (ASF) under one or more
|
|
# contributor license agreements. See the NOTICE file distributed with
|
|
# this work for additional information regarding copyright ownership.
|
|
# The ASF licenses this file to You under the Apache License, Version 2.0
|
|
# (the "License"); you may not use this file except in compliance with
|
|
# the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
#
|
|
import os
|
|
import time
|
|
import random
|
|
import threading
|
|
import unittest
|
|
|
|
from pyspark import SparkContext, SparkConf, InheritableThread
|
|
|
|
|
|
class PinThreadTests(unittest.TestCase):
    """
    Tests for PySpark's pinned thread mode (SPARK-22340), in which each Python
    thread is pinned to its own JVM thread so that per-thread JVM state (local
    properties, job groups) behaves consistently across the Py4J boundary.

    These tests live in a separate class because the 'PYSPARK_PIN_THREAD'
    environment variable must be set *before* the SparkContext is created.
    """

    @classmethod
    def setUpClass(cls):
        # Remember any pre-existing value so tearDownClass can restore it.
        cls.old_pin_thread = os.environ.get("PYSPARK_PIN_THREAD")
        os.environ["PYSPARK_PIN_THREAD"] = "true"
        cls.sc = SparkContext('local[4]', cls.__name__, conf=SparkConf())

    @classmethod
    def tearDownClass(cls):
        cls.sc.stop()
        # Restore the environment exactly as we found it so other test
        # classes in the same process are unaffected.
        if cls.old_pin_thread is not None:
            os.environ["PYSPARK_PIN_THREAD"] = cls.old_pin_thread
        else:
            del os.environ["PYSPARK_PIN_THREAD"]

    def test_pinned_thread(self):
        threads = []
        exceptions = []
        property_name = "test_property_%s" % PinThreadTests.__name__
        jvm_thread_ids = []

        for i in range(10):
            # NOTE: ``i`` is bound as a default argument. Without this, the
            # closure would late-bind the loop variable and — since all
            # threads start only after the loop completes — every thread
            # would observe the final value (9), so all threads would sleep
            # and set identical property values, defeating the test's intent.
            def test_local_property(i=i):
                jvm_thread_id = self.sc._jvm.java.lang.Thread.currentThread().getId()
                jvm_thread_ids.append(jvm_thread_id)

                # If a property is set in this thread, later it should get
                # the same property within this thread.
                self.sc.setLocalProperty(property_name, str(i))

                # 5 threads, 1 second sleep. 5 threads without a sleep.
                time.sleep(i % 2)

                try:
                    assert self.sc.getLocalProperty(property_name) == str(i)

                    # Each command might create a thread in multi-threading
                    # mode in Py4J. This assert makes sure that the created
                    # thread is being reused.
                    assert jvm_thread_id == self.sc._jvm.java.lang.Thread.currentThread().getId()
                except Exception as e:
                    # Collect failures instead of raising: exceptions raised
                    # inside worker threads would otherwise be lost.
                    exceptions.append(e)

            threads.append(threading.Thread(target=test_local_property))

        for t in threads:
            t.start()

        for t in threads:
            t.join()

        for e in exceptions:
            raise e

        # Created JVM threads should be 10 because Python threads are 10.
        assert len(set(jvm_thread_ids)) == 10

    def test_multiple_group_jobs(self):
        # SPARK-22340 Add a mode to pin Python thread into JVM's

        group_a = "job_ids_to_cancel"
        group_b = "job_ids_to_run"

        threads = []
        thread_ids = range(4)
        thread_ids_to_cancel = [i for i in thread_ids if i % 2 == 0]
        thread_ids_to_run = [i for i in thread_ids if i % 2 != 0]

        # A list which records whether job is cancelled.
        # The index of the array is the thread index which job run in.
        is_job_cancelled = [False for _ in thread_ids]

        def run_job(job_group, index):
            """
            Executes a job with the group ``job_group``. Each job sleeps for
            15 seconds (long enough to still be running when cancelled) and
            then exits, recording in ``is_job_cancelled[index]`` whether it
            was cancelled.
            """
            try:
                self.sc.setJobGroup(job_group, "test rdd collect with setting job group")
                self.sc.parallelize([15]).map(lambda x: time.sleep(x)).collect()
                is_job_cancelled[index] = False
            except Exception:
                # Assume that exception means job cancellation.
                is_job_cancelled[index] = True

        # Test if job succeeded when not cancelled.
        run_job(group_a, 0)
        self.assertFalse(is_job_cancelled[0])

        # Run jobs
        for i in thread_ids_to_cancel:
            t = threading.Thread(target=run_job, args=(group_a, i))
            t.start()
            threads.append(t)

        for i in thread_ids_to_run:
            t = threading.Thread(target=run_job, args=(group_b, i))
            t.start()
            threads.append(t)

        # Wait to make sure all jobs are executed.
        time.sleep(3)
        # And then, cancel one job group.
        self.sc.cancelJobGroup(group_a)

        # Wait until all threads launching jobs are finished.
        for t in threads:
            t.join()

        for i in thread_ids_to_cancel:
            self.assertTrue(
                is_job_cancelled[i],
                "Thread {i}: Job in group A was not cancelled.".format(i=i))

        for i in thread_ids_to_run:
            self.assertFalse(
                is_job_cancelled[i],
                "Thread {i}: Job in group B did not succeed.".format(i=i))

    def test_inheritable_local_property(self):
        # SPARK-32010: InheritableThread should inherit the parent thread's
        # JVM local properties (InheritableThreadLocal) in pinned thread mode.
        self.sc.setLocalProperty("a", "hi")
        expected = []

        def get_inner_local_prop():
            expected.append(self.sc.getLocalProperty("b"))

        def get_outer_local_prop():
            expected.append(self.sc.getLocalProperty("a"))
            self.sc.setLocalProperty("b", "hello")
            t2 = InheritableThread(target=get_inner_local_prop)
            t2.start()
            t2.join()

        t1 = InheritableThread(target=get_outer_local_prop)
        t1.start()
        t1.join()

        # "b" was set inside a child thread; it must not leak back to this
        # (parent) thread's local properties.
        self.assertEqual(self.sc.getLocalProperty("b"), None)
        self.assertEqual(expected, ["hi", "hello"])
|
|
|
|
|
|
if __name__ == "__main__":
    # Re-export the test classes at module scope so unittest.main discovers
    # them when this file is executed directly.
    from pyspark.tests.test_pin_thread import *

    try:
        # Use the XML runner when available so CI can collect JUnit reports.
        import xmlrunner
        testRunner = xmlrunner.XMLTestRunner(output='target/test-reports', verbosity=2)
    except ImportError:
        testRunner = None
    # NOTE: 'unittest' is already imported at the top of the file; the
    # original redundant re-import here has been removed.
    unittest.main(testRunner=testRunner, verbosity=2)
|