[SPARK-1690] Tolerating empty elements when saving Python RDD to text files
Tolerate empty strings in PythonRDD

Author: Kan Zhang <kzhang@apache.org>

Closes #644 from kanzhang/SPARK-1690 and squashes the following commits:

c62ad33 [Kan Zhang] Adding Python doctest
473ec4b [Kan Zhang] [SPARK-1690] Tolerating empty elements when saving Python RDD to text files
This commit is contained in:
parent
3776f2f283
commit
6c2691d0a0
|
@ -94,6 +94,7 @@ private[spark] class PythonRDD[T: ClassTag](
|
|||
val obj = new Array[Byte](length)
|
||||
stream.readFully(obj)
|
||||
obj
|
||||
case 0 => Array.empty[Byte]
|
||||
case SpecialLengths.TIMING_DATA =>
|
||||
// Timing data from worker
|
||||
val bootTime = stream.readLong()
|
||||
|
@ -123,7 +124,7 @@ private[spark] class PythonRDD[T: ClassTag](
|
|||
stream.readFully(update)
|
||||
accumulator += Collections.singletonList(update)
|
||||
}
|
||||
Array.empty[Byte]
|
||||
null
|
||||
}
|
||||
} catch {
|
||||
|
||||
|
@ -143,7 +144,7 @@ private[spark] class PythonRDD[T: ClassTag](
|
|||
|
||||
var _nextObj = read()
|
||||
|
||||
def hasNext = _nextObj.length != 0
|
||||
def hasNext = _nextObj != null
|
||||
}
|
||||
new InterruptibleIterator(context, stdoutIterator)
|
||||
}
|
||||
|
|
|
@ -891,6 +891,14 @@ class RDD(object):
|
|||
>>> from glob import glob
|
||||
>>> ''.join(sorted(input(glob(tempFile.name + "/part-0000*"))))
|
||||
'0\\n1\\n2\\n3\\n4\\n5\\n6\\n7\\n8\\n9\\n'
|
||||
|
||||
Empty lines are tolerated when saving to text files.
|
||||
|
||||
>>> tempFile2 = NamedTemporaryFile(delete=True)
|
||||
>>> tempFile2.close()
|
||||
>>> sc.parallelize(['', 'foo', '', 'bar', '']).saveAsTextFile(tempFile2.name)
|
||||
>>> ''.join(sorted(input(glob(tempFile2.name + "/part-0000*"))))
|
||||
'\\n\\n\\nbar\\nfoo\\n'
|
||||
"""
|
||||
def func(split, iterator):
|
||||
for x in iterator:
|
||||
|
|
Loading…
Reference in a new issue