[SPARK-7810] [PYSPARK] solve python rdd socket connection problem
Method "_load_from_socket" in rdd.py cannot load data from jvm socket when ipv6 is used. The current method only works well with ipv4. New modification should work around both two protocols.
Author: Ai He <ai.he@ussuning.com>
Author: AiHe <ai.he@ussuning.com>
Closes #6338 from AiHe/pyspark-networking-issue and squashes the following commits:
d4fc9c4 [Ai He] handle code review 2
e75c5c8 [Ai He] handle code review
5644953 [AiHe] solve python rdd socket connection problem to jvm
(cherry picked from commit ecd3aacf28
)
Signed-off-by: Davies Liu <davies@databricks.com>
This commit is contained in:
parent
457d07eaa0
commit
187015f671
|
@ -121,10 +121,22 @@ def _parse_memory(s):
|
||||||
|
|
||||||
|
|
||||||
def _load_from_socket(port, serializer):
|
def _load_from_socket(port, serializer):
|
||||||
sock = socket.socket()
|
sock = None
|
||||||
sock.settimeout(3)
|
# Support for both IPv4 and IPv6.
|
||||||
|
# On most of IPv6-ready systems, IPv6 will take precedence.
|
||||||
|
for res in socket.getaddrinfo("localhost", port, socket.AF_UNSPEC, socket.SOCK_STREAM):
|
||||||
|
af, socktype, proto, canonname, sa = res
|
||||||
|
try:
|
||||||
|
sock = socket.socket(af, socktype, proto)
|
||||||
|
sock.settimeout(3)
|
||||||
|
sock.connect(sa)
|
||||||
|
except socket.error:
|
||||||
|
sock = None
|
||||||
|
continue
|
||||||
|
break
|
||||||
|
if not sock:
|
||||||
|
raise Exception("could not open socket")
|
||||||
try:
|
try:
|
||||||
sock.connect(("localhost", port))
|
|
||||||
rf = sock.makefile("rb", 65536)
|
rf = sock.makefile("rb", 65536)
|
||||||
for item in serializer.load_stream(rf):
|
for item in serializer.load_stream(rf):
|
||||||
yield item
|
yield item
|
||||||
|
|
Loading…
Reference in a new issue