Fix PySpark docs and an overly long line of code after fdbae41e
parent b4fa11f6c9
commit 478b2b7edc
@@ -16,7 +16,7 @@ This guide will show how to use the Spark features described there in Python.
There are a few key differences between the Python and Scala APIs:

* Python is dynamically typed, so RDDs can hold objects of multiple types.
* PySpark does not yet support a few API calls, such as `lookup`, `sort`, and non-text input files, though these will be added in future releases.
* PySpark does not yet support a few API calls, such as `lookup` and non-text input files, though these will be added in future releases.

In PySpark, RDDs support the same methods as their Scala counterparts but take Python functions and return Python collection types.
Short functions can be passed to RDD methods using Python's [`lambda`](http://www.diveintopython.net/power_of_introspection/lambda_functions.html) syntax:
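The guide line above ends with a colon because a code sample follows it in the original document; that sample is not reproduced in this hunk. As a hedged stand-in, here is a minimal sketch in the same spirit. The SparkContext `sc`, the log-file path, and the variable names are assumptions, not taken from this commit.

```python
# Illustrative sketch only: pass short lambdas to RDD methods.
# Assumes an existing SparkContext `sc` and a hypothetical input file.
logData = sc.textFile("logs/app.log").cache()
errors = logData.filter(lambda line: "ERROR" in line)
print(errors.count())

# Dynamic typing (first bullet above): one RDD can hold several Python types.
mixed = sc.parallelize([1, "two", 3.0, (4, "four")])
print(mixed.map(lambda x: type(x).__name__).collect())  # ['int', 'str', 'float', 'tuple']
```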
@@ -117,8 +117,6 @@ class RDD(object):
else:
return None

# TODO persist(self, storageLevel)

def map(self, f, preservesPartitioning=False):
"""
Return a new RDD containing the distinct elements in this RDD.
@@ -227,7 +225,7 @@ class RDD(object):
total = num

samples = self.sample(withReplacement, fraction, seed).collect()

# If the first sample didn't turn out large enough, keep trying to take samples;
# this shouldn't happen often because we use a big multiplier for their initial size.
# See: scala/spark/RDD.scala
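The two comments above describe a retry loop in takeSample. Here is a minimal sketch of that loop, written as a hypothetical standalone helper; the helper name and the re-seeding choice are assumptions, while total, fraction, and seed follow the hunk.

```python
import random

def take_sample_with_retry(rdd, withReplacement, total, fraction, seed):
    # First draw, as in the hunk above.
    samples = rdd.sample(withReplacement, fraction, seed).collect()
    # If the first sample didn't turn out large enough, keep drawing with fresh
    # seeds; this is rare because `fraction` is computed with a big multiplier.
    while len(samples) < total:
        samples = rdd.sample(withReplacement, fraction,
                             random.randint(0, 2 ** 31 - 1)).collect()
    # Return exactly `total` elements from the (over-)sample.
    random.shuffle(samples)
    return samples[:total]
```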
@@ -288,7 +286,7 @@ class RDD(object):
maxSampleSize = numPartitions * 20.0 # constant from Spark's RangePartitioner
fraction = min(maxSampleSize / max(rddSize, 1), 1.0)

samples = self.sample(False, fraction, 1).map(lambda (k, v): k).collect()
samples = self.sample(False, fraction, 1).map(lambda (k, v): k).collect()
samples = sorted(samples, reverse=(not ascending), key=keyfunc)

# we have numPartitions many parts but one of the them has
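The sampled and sorted keys above are used to derive range-partition boundaries; the explanatory comment is cut off by the hunk. A rough sketch of how such a rangePartitionFunc could be built from the sorted sample follows; this is a hypothetical helper under assumed semantics (ascending order), not the exact code from this file.

```python
def make_range_partition_func(samples, numPartitions, keyfunc):
    # Pick numPartitions - 1 boundary keys, evenly spaced through the sorted
    # sample (assumes ascending order for simplicity).
    step = max(len(samples) // numPartitions, 1)
    bounds = [samples[i * step] for i in range(1, numPartitions)
              if i * step < len(samples)]

    def rangePartitionFunc(key):
        # Send a key to the first partition whose upper bound it does not exceed;
        # keys beyond every bound fall into the last partition.
        for i, bound in enumerate(bounds):
            if keyfunc(key) <= keyfunc(bound):
                return i
        return len(bounds)

    return rangePartitionFunc
```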
@@ -309,7 +307,9 @@ class RDD(object):
def mapFunc(iterator):
yield sorted(iterator, reverse=(not ascending), key=lambda (k, v): keyfunc(k))

return self.partitionBy(numPartitions, partitionFunc=rangePartitionFunc).mapPartitions(mapFunc,preservesPartitioning=True).flatMap(lambda x: x, preservesPartitioning=True)
return (self.partitionBy(numPartitions, partitionFunc=rangePartitionFunc)
.mapPartitions(mapFunc,preservesPartitioning=True)
.flatMap(lambda x: x, preservesPartitioning=True))

def glom(self):
"""
@@ -471,7 +471,7 @@ class RDD(object):
3
"""
return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum()

def stats(self):
"""
Return a L{StatCounter} object that captures the mean, variance
@@ -508,7 +508,7 @@ class RDD(object):
0.816...
"""
return self.stats().stdev()

def sampleStdev(self):
"""
Compute the sample standard deviation of this RDD's elements (which corrects for bias in
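The truncated docstring above refers to the usual bias correction: sampleStdev() divides by N - 1 where stdev() divides by N. A tiny plain-Python check (not code from this commit) that also reproduces the 0.816... doctest value shown earlier:

```python
import math

data = [1, 2, 3]
n = len(data)
mean = sum(data) / float(n)
ss = sum((x - mean) ** 2 for x in data)  # sum of squared deviations = 2.0

population_stdev = math.sqrt(ss / n)        # what stdev() returns here: ~0.8165
sample_stdev = math.sqrt(ss / (n - 1))      # what sampleStdev() returns here: 1.0
```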
@@ -878,7 +878,7 @@ class RDD(object):
>>> y = sc.parallelize([("a", 3), ("c", None)])
>>> sorted(x.subtractByKey(y).collect())
[('b', 4), ('b', 5)]
"""
"""
filter_func = lambda (key, vals): len(vals[0]) > 0 and len(vals[1]) == 0
map_func = lambda (key, vals): [(key, val) for val in vals[0]]
return self.cogroup(other, numPartitions).filter(filter_func).flatMap(map_func)
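The filter_func/map_func pair above implements subtractByKey on top of cogroup. A rough plain-Python sketch of the idea, using made-up cogrouped data (hypothetical values, not PySpark API calls):

```python
# After cogroup, each key maps to (values from self, values from other).
cogrouped = {"a": ([1], [3]), "b": ([4, 5], []), "c": ([], [None])}

# Keep a key only if it has values in self and none in other,
# then re-emit one (key, value) pair per kept value.
kept = [(key, val)
        for key, (self_vals, other_vals) in cogrouped.items()
        if self_vals and not other_vals
        for val in self_vals]
print(sorted(kept))  # [('b', 4), ('b', 5)]
```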