[SPARK-8373] [PYSPARK] Add emptyRDD to pyspark and fix the issue when calling sum on an empty RDD
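This PR fixes `RDD.sum()` failing on an empty RDD (it now uses `fold(0, operator.add)` instead of `reduce(operator.add)`) and also adds `emptyRDD` to PySpark so that the fix is easy to cover with a test case.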
Author: zsxwing <zsxwing@gmail.com>
Closes #6826 from zsxwing/python-emptyRDD and squashes the following commits:
b36993f [zsxwing] Update the return type to JavaRDD[T]
71df047 [zsxwing] Add emptyRDD to pyspark and fix the issue when calling sum on an empty RDD
(cherry picked from commit 0fc4b96f3e)
Signed-off-by: Andrew Or <andrew@databricks.com>
This commit is contained in:
parent
f0513733d4
commit
5e7973df0e
|
@ -425,6 +425,11 @@ private[spark] object PythonRDD extends Logging {
|
|||
iter.foreach(write)
|
||||
}
|
||||
|
||||
/** Create an RDD that has no partitions or elements. */
|
||||
def emptyRDD[T](sc: JavaSparkContext): JavaRDD[T] = {
|
||||
sc.emptyRDD[T]
|
||||
}
|
||||
|
||||
/**
|
||||
* Create an RDD from a path using [[org.apache.hadoop.mapred.SequenceFileInputFormat]],
|
||||
* key and value class.
|
||||
|
|
|
@ -324,6 +324,12 @@ class SparkContext(object):
|
|||
with SparkContext._lock:
|
||||
SparkContext._active_spark_context = None
|
||||
|
||||
def emptyRDD(self):
|
||||
"""
|
||||
Create an RDD that has no partitions or elements.
|
||||
"""
|
||||
return RDD(self._jsc.emptyRDD(), self, NoOpSerializer())
|
||||
|
||||
def range(self, start, end=None, step=1, numSlices=None):
|
||||
"""
|
||||
Create a new RDD of int containing elements from `start` to `end`
|
||||
|
|
|
@ -960,7 +960,7 @@ class RDD(object):
|
|||
>>> sc.parallelize([1.0, 2.0, 3.0]).sum()
|
||||
6.0
|
||||
"""
|
||||
return self.mapPartitions(lambda x: [sum(x)]).reduce(operator.add)
|
||||
return self.mapPartitions(lambda x: [sum(x)]).fold(0, operator.add)
|
||||
|
||||
def count(self):
|
||||
"""
|
||||
|
|
|
@ -458,6 +458,14 @@ class RDDTests(ReusedPySparkTestCase):
|
|||
self.assertEqual(id + 1, id2)
|
||||
self.assertEqual(id2, rdd2.id())
|
||||
|
||||
def test_empty_rdd(self):
|
||||
rdd = self.sc.emptyRDD()
|
||||
self.assertTrue(rdd.isEmpty())
|
||||
|
||||
def test_sum(self):
|
||||
self.assertEqual(0, self.sc.emptyRDD().sum())
|
||||
self.assertEqual(6, self.sc.parallelize([1, 2, 3]).sum())
|
||||
|
||||
def test_save_as_textfile_with_unicode(self):
|
||||
# Regression test for SPARK-970
|
||||
x = u"\u00A1Hola, mundo!"
|
||||
|
|
Loading…
Reference in a new issue