spark-instrumented-optimizer/python/pyspark/join.py

"""
Copyright (c) 2011, Douban Inc. <http://www.douban.com/>
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

    * Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.

    * Neither the name of the Douban Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""

from pyspark.resultiterable import ResultIterable

def _do_python_join(rdd, other, numPartitions, dispatch):
    vs = rdd.map(lambda (k, v): (k, (1, v)))
    ws = other.map(lambda (k, v): (k, (2, v)))
    return vs.union(ws).groupByKey(numPartitions).flatMapValues(lambda x : dispatch(x.__iter__()))


def python_join(rdd, other, numPartitions):
    """Inner join of two datasets of ``(key, value)`` pairs.

    Delegates to ``_do_python_join`` with a per-key ``dispatch`` that pairs
    every value tagged ``1`` with every value tagged ``2``.  A key that has
    no values under one of the two tags therefore produces no pairs.

    :param rdd: first dataset, passed through to ``_do_python_join``.
    :param other: second dataset, passed through to ``_do_python_join``.
    :param numPartitions: passed through to ``_do_python_join``.
    :return: whatever ``_do_python_join`` returns.
    """
    def dispatch(seq):
        lefts = []
        rights = []
        for tag, value in seq:
            if tag == 1:
                lefts.append(value)
            elif tag == 2:
                rights.append(value)
        # Cross product of the two sides, first-side-major order.
        pairs = []
        for lv in lefts:
            for rv in rights:
                pairs.append((lv, rv))
        return pairs

    return _do_python_join(rdd, other, numPartitions, dispatch)


def python_right_outer_join(rdd, other, numPartitions):
    """Right outer join of two datasets of ``(key, value)`` pairs.

    Delegates to ``_do_python_join`` with a per-key ``dispatch`` that pairs
    every value tagged ``1`` with every value tagged ``2``.  When a key has no
    value tagged ``1``, ``None`` stands in for it, so every value tagged ``2``
    still appears in the output as ``(None, w)``.

    :param rdd: first dataset, passed through to ``_do_python_join``.
    :param other: second dataset, passed through to ``_do_python_join``.
    :param numPartitions: passed through to ``_do_python_join``.
    :return: whatever ``_do_python_join`` returns.
    """
    def dispatch(seq):
        first_side, second_side = [], []
        for tag, value in seq:
            if tag == 1:
                first_side.append(value)
            elif tag == 2:
                second_side.append(value)
        # No match on the first side: pad with a single None placeholder.
        if len(first_side) == 0:
            first_side.append(None)
        joined = []
        for a in first_side:
            joined.extend((a, b) for b in second_side)
        return joined

    return _do_python_join(rdd, other, numPartitions, dispatch)


def python_left_outer_join(rdd, other, numPartitions):
    """Left outer join of two datasets of ``(key, value)`` pairs.

    Delegates to ``_do_python_join`` with a per-key ``dispatch`` that pairs
    every value tagged ``1`` with every value tagged ``2``.  When a key has no
    value tagged ``2``, ``None`` stands in for it, so every value tagged ``1``
    still appears in the output as ``(v, None)``.

    :param rdd: first dataset, passed through to ``_do_python_join``.
    :param other: second dataset, passed through to ``_do_python_join``.
    :param numPartitions: passed through to ``_do_python_join``.
    :return: whatever ``_do_python_join`` returns.
    """
    def dispatch(seq):
        kept = []
        matched = []
        for tag, value in seq:
            if tag == 1:
                kept.append(value)
            elif tag == 2:
                matched.append(value)
        # No match on the second side: pad with a single None placeholder.
        if len(matched) == 0:
            matched.append(None)
        output = []
        for left_value in kept:
            for right_value in matched:
                output.append((left_value, right_value))
        return output

    return _do_python_join(rdd, other, numPartitions, dispatch)


def python_cogroup(rdds, numPartitions):
    """Group the values of several ``(key, value)`` datasets by key.

    Each record of the ``i``-th dataset is re-tagged as ``(k, (i, v))``, all
    tagged datasets are unioned, and the union is grouped by key via
    ``groupByKey(numPartitions)``.  For every key the grouped values are split
    back out by tag, giving a tuple with one ``ResultIterable`` per input
    dataset, in input order; position ``i`` holds the values that came from
    ``rdds[i]`` (empty when that dataset had none for the key).

    :param rdds: iterable of datasets of ``(key, value)`` pairs.  It must
        contain at least one dataset: ``reduce`` is called without an initial
        value and raises ``TypeError`` on an empty sequence.
    :param numPartitions: forwarded unchanged to ``groupByKey``.
    :return: the result of ``mapValues`` on the grouped, tagged union.
    """
    # ``reduce`` is a builtin only on Python 2; the functools spelling works
    # on Python 2.6+ and Python 3 alike.  Imported at function scope so this
    # block does not depend on the module's import section.
    from functools import reduce

    def make_mapper(i):
        # A factory gives each tagger its own ``i``.  The pair is unpacked in
        # the body because tuple parameters (``lambda (k, v): ...``) were
        # removed by PEP 3113 and are a SyntaxError on Python 3.
        def tag(record):
            k, v = record
            return (k, (i, v))
        return tag

    vrdds = [rdd.map(make_mapper(i)) for i, rdd in enumerate(rdds)]
    union_vrdds = reduce(lambda acc, other: acc.union(other), vrdds)
    rdd_len = len(vrdds)

    def dispatch(seq):
        # One bucket per input dataset, indexed by the tag attached above.
        bufs = [[] for _ in range(rdd_len)]
        for n, v in seq:
            bufs[n].append(v)
        return tuple(ResultIterable(buf) for buf in bufs)

    return union_vrdds.groupByKey(numPartitions).mapValues(dispatch)
Add Python API. 2012-08-10 04:10:02 -04:00			`"""`
			`Copyright (c) 2011, Douban Inc. <http://www.douban.com/>`
			`All rights reserved.`

			`Redistribution and use in source and binary forms, with or without`
			`modification, are permitted provided that the following conditions are`
			`met:`

			`* Redistributions of source code must retain the above copyright`
			`notice, this list of conditions and the following disclaimer.`

			`* Redistributions in binary form must reproduce the above`
			`copyright notice, this list of conditions and the following disclaimer`
			`in the documentation and/or other materials provided with the`
			`distribution.`

			`* Neither the name of the Douban Inc. nor the names of its`
			`contributors may be used to endorse or promote products derived from`
			`this software without specific prior written permission.`

			`THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS`
			`"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT`
			`LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR`
			`A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT`
			`OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,`
			`SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT`
			`LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,`
			`DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY`
			`THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT`
			`(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE`
			`OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.`
			`"""`

Spark 1271: Co-Group and Group-By should pass Iterable[X] Author: Holden Karau <holden@pigscanfly.ca> Closes #242 from holdenk/spark-1320-cogroupandgroupshouldpassiterator and squashes the following commits: f289536 [Holden Karau] Fix bad merge, should have been Iterable rather than Iterator 77048f8 [Holden Karau] Fix merge up to master d3fe909 [Holden Karau] use toSeq instead 7a092a3 [Holden Karau] switch resultitr to resultiterable eb06216 [Holden Karau] maybe I should have had a coffee first. use correct import for guava iterables c5075aa [Holden Karau] If guava 14 had iterables 2d06e10 [Holden Karau] Fix Java 8 cogroup tests for the new API 11e730c [Holden Karau] Fix streaming tests 66b583d [Holden Karau] Fix the core test suite to compile 4ed579b [Holden Karau] Refactor from iterator to iterable d052c07 [Holden Karau] Python tests now pass with iterator pandas 3bcd81d [Holden Karau] Revert "Try and make pickling list iterators work" cd1e81c [Holden Karau] Try and make pickling list iterators work c60233a [Holden Karau] Start investigating moving to iterators for python API like the Java/Scala one. tl;dr: We will have to write our own iterator since the default one doesn't pickle well 88a5cef [Holden Karau] Fix cogroup test in JavaAPISuite for streaming a5ee714 [Holden Karau] oops, was checking wrong iterator e687f21 [Holden Karau] Fix groupbykey test in JavaAPISuite of streaming ec8cc3e [Holden Karau] Fix test issues\! 
4b0eeb9 [Holden Karau] Switch cast in PairDStreamFunctions fa395c9 [Holden Karau] Revert "Add a join based on the problem in SVD" ec99e32 [Holden Karau] Revert "Revert this but for now put things in list pandas" b692868 [Holden Karau] Revert 7e533f7 [Holden Karau] Fix the bug 8a5153a [Holden Karau] Revert me, but we have some stuff to debug b4e86a9 [Holden Karau] Add a join based on the problem in SVD c4510e2 [Holden Karau] Revert this but for now put things in list pandas b4e0b1d [Holden Karau] Fix style issues 71e8b9f [Holden Karau] I really need to stop calling size on iterators, it is the path of sadness. b1ae51a [Holden Karau] Fix some of the types in the streaming JavaAPI suite. Probably still needs more work 37888ec [Holden Karau] core/tests now pass 249abde [Holden Karau] org.apache.spark.rdd.PairRDDFunctionsSuite passes 6698186 [Holden Karau] Revert "I think this might be a bad rabbit hole. Started work to make CoGroupedRDD use iterator and then went crazy" fe992fe [Holden Karau] hmmm try and fix up basic operation suite 172705c [Holden Karau] Fix Java API suite caafa63 [Holden Karau] I think this might be a bad rabbit hole. Started work to make CoGroupedRDD use iterator and then went crazy 88b3329 [Holden Karau] Fix groupbykey to actually give back an iterator 4991af6 [Holden Karau] Fix some tests be50246 [Holden Karau] Calling size on an iterator is not so good if we want to use it after 687ffbc [Holden Karau] This is the it compiles point of replacing Seq with Iterator and JList with JIterator in the groupby and cogroup signatures 2014-04-08 21:15:52 -04:00			`from pyspark.resultiterable import ResultIterable`
Add Python API. 2012-08-10 04:10:02 -04:00
Change numSplits to numPartitions in PySpark. 2013-02-24 16:25:09 -05:00			`def _do_python_join(rdd, other, numPartitions, dispatch):`
Use only cPickle for serialization in Python API. Objects serialized with JSON can be compared for equality, but JSON can be slow to serialize and only supports a limited range of data types. 2012-08-18 19:07:10 -04:00			`vs = rdd.map(lambda (k, v): (k, (1, v)))`
			`ws = other.map(lambda (k, v): (k, (2, v)))`
Spark 1271: Co-Group and Group-By should pass Iterable[X] Author: Holden Karau <holden@pigscanfly.ca> Closes #242 from holdenk/spark-1320-cogroupandgroupshouldpassiterator and squashes the following commits: f289536 [Holden Karau] Fix bad merge, should have been Iterable rather than Iterator 77048f8 [Holden Karau] Fix merge up to master d3fe909 [Holden Karau] use toSeq instead 7a092a3 [Holden Karau] switch resultitr to resultiterable eb06216 [Holden Karau] maybe I should have had a coffee first. use correct import for guava iterables c5075aa [Holden Karau] If guava 14 had iterables 2d06e10 [Holden Karau] Fix Java 8 cogroup tests for the new API 11e730c [Holden Karau] Fix streaming tests 66b583d [Holden Karau] Fix the core test suite to compile 4ed579b [Holden Karau] Refactor from iterator to iterable d052c07 [Holden Karau] Python tests now pass with iterator pandas 3bcd81d [Holden Karau] Revert "Try and make pickling list iterators work" cd1e81c [Holden Karau] Try and make pickling list iterators work c60233a [Holden Karau] Start investigating moving to iterators for python API like the Java/Scala one. tl;dr: We will have to write our own iterator since the default one doesn't pickle well 88a5cef [Holden Karau] Fix cogroup test in JavaAPISuite for streaming a5ee714 [Holden Karau] oops, was checking wrong iterator e687f21 [Holden Karau] Fix groupbykey test in JavaAPISuite of streaming ec8cc3e [Holden Karau] Fix test issues\! 
4b0eeb9 [Holden Karau] Switch cast in PairDStreamFunctions fa395c9 [Holden Karau] Revert "Add a join based on the problem in SVD" ec99e32 [Holden Karau] Revert "Revert this but for now put things in list pandas" b692868 [Holden Karau] Revert 7e533f7 [Holden Karau] Fix the bug 8a5153a [Holden Karau] Revert me, but we have some stuff to debug b4e86a9 [Holden Karau] Add a join based on the problem in SVD c4510e2 [Holden Karau] Revert this but for now put things in list pandas b4e0b1d [Holden Karau] Fix style issues 71e8b9f [Holden Karau] I really need to stop calling size on iterators, it is the path of sadness. b1ae51a [Holden Karau] Fix some of the types in the streaming JavaAPI suite. Probably still needs more work 37888ec [Holden Karau] core/tests now pass 249abde [Holden Karau] org.apache.spark.rdd.PairRDDFunctionsSuite passes 6698186 [Holden Karau] Revert "I think this might be a bad rabbit hole. Started work to make CoGroupedRDD use iterator and then went crazy" fe992fe [Holden Karau] hmmm try and fix up basic operation suite 172705c [Holden Karau] Fix Java API suite caafa63 [Holden Karau] I think this might be a bad rabbit hole. Started work to make CoGroupedRDD use iterator and then went crazy 88b3329 [Holden Karau] Fix groupbykey to actually give back an iterator 4991af6 [Holden Karau] Fix some tests be50246 [Holden Karau] Calling size on an iterator is not so good if we want to use it after 687ffbc [Holden Karau] This is the it compiles point of replacing Seq with Iterator and JList with JIterator in the groupby and cogroup signatures 2014-04-08 21:15:52 -04:00			`return vs.union(ws).groupByKey(numPartitions).flatMapValues(lambda x : dispatch(x.__iter__()))`
Add Python API. 2012-08-10 04:10:02 -04:00

Change numSplits to numPartitions in PySpark. 2013-02-24 16:25:09 -05:00			`def python_join(rdd, other, numPartitions):`
Add Python API. 2012-08-10 04:10:02 -04:00			`def dispatch(seq):`
			`vbuf, wbuf = [], []`
			`for (n, v) in seq:`
			`if n == 1:`
			`vbuf.append(v)`
			`elif n == 2:`
			`wbuf.append(v)`
			`return [(v, w) for v in vbuf for w in wbuf]`
Change numSplits to numPartitions in PySpark. 2013-02-24 16:25:09 -05:00			`return _do_python_join(rdd, other, numPartitions, dispatch)`
Add Python API. 2012-08-10 04:10:02 -04:00

Change numSplits to numPartitions in PySpark. 2013-02-24 16:25:09 -05:00			`def python_right_outer_join(rdd, other, numPartitions):`
Add Python API. 2012-08-10 04:10:02 -04:00			`def dispatch(seq):`
			`vbuf, wbuf = [], []`
			`for (n, v) in seq:`
			`if n == 1:`
			`vbuf.append(v)`
			`elif n == 2:`
			`wbuf.append(v)`
			`if not vbuf:`
			`vbuf.append(None)`
			`return [(v, w) for v in vbuf for w in wbuf]`
Change numSplits to numPartitions in PySpark. 2013-02-24 16:25:09 -05:00			`return _do_python_join(rdd, other, numPartitions, dispatch)`
Add Python API. 2012-08-10 04:10:02 -04:00

Change numSplits to numPartitions in PySpark. 2013-02-24 16:25:09 -05:00			`def python_left_outer_join(rdd, other, numPartitions):`
Add Python API. 2012-08-10 04:10:02 -04:00			`def dispatch(seq):`
			`vbuf, wbuf = [], []`
			`for (n, v) in seq:`
			`if n == 1:`
			`vbuf.append(v)`
			`elif n == 2:`
			`wbuf.append(v)`
			`if not wbuf:`
			`wbuf.append(None)`
			`return [(v, w) for v in vbuf for w in wbuf]`
Change numSplits to numPartitions in PySpark. 2013-02-24 16:25:09 -05:00			`return _do_python_join(rdd, other, numPartitions, dispatch)`
Add Python API. 2012-08-10 04:10:02 -04:00

SPARK-1868: Users should be allowed to cogroup at least 4 RDDs Adds cogroup for 4 RDDs. Author: Allan Douglas R. de Oliveira <allandouglas@gmail.com> Closes #813 from douglaz/more_cogroups and squashes the following commits: f8d6273 [Allan Douglas R. de Oliveira] Test python groupWith for one more case 0e9009c [Allan Douglas R. de Oliveira] Added scala tests c3ffcdd [Allan Douglas R. de Oliveira] Added java tests 517a67f [Allan Douglas R. de Oliveira] Added tests for python groupWith 2f402d5 [Allan Douglas R. de Oliveira] Removed TODO 17474f4 [Allan Douglas R. de Oliveira] Use new cogroup function 7877a2a [Allan Douglas R. de Oliveira] Fixed code ba02414 [Allan Douglas R. de Oliveira] Added varargs cogroup to pyspark c4a8a51 [Allan Douglas R. de Oliveira] Added java cogroup 4 e94963c [Allan Douglas R. de Oliveira] Fixed spacing f1ee57b [Allan Douglas R. de Oliveira] Fixed scala style issues d7196f1 [Allan Douglas R. de Oliveira] Allow the cogroup of 4 RDDs 2014-06-20 14:03:03 -04:00			`def python_cogroup(rdds, numPartitions):`
			`def make_mapper(i):`
			`return lambda (k, v): (k, (i, v))`
			`vrdds = [rdd.map(make_mapper(i)) for i, rdd in enumerate(rdds)]`
			`union_vrdds = reduce(lambda acc, other: acc.union(other), vrdds)`
			`rdd_len = len(vrdds)`
Add Python API. 2012-08-10 04:10:02 -04:00			`def dispatch(seq):`
SPARK-1868: Users should be allowed to cogroup at least 4 RDDs Adds cogroup for 4 RDDs. Author: Allan Douglas R. de Oliveira <allandouglas@gmail.com> Closes #813 from douglaz/more_cogroups and squashes the following commits: f8d6273 [Allan Douglas R. de Oliveira] Test python groupWith for one more case 0e9009c [Allan Douglas R. de Oliveira] Added scala tests c3ffcdd [Allan Douglas R. de Oliveira] Added java tests 517a67f [Allan Douglas R. de Oliveira] Added tests for python groupWith 2f402d5 [Allan Douglas R. de Oliveira] Removed TODO 17474f4 [Allan Douglas R. de Oliveira] Use new cogroup function 7877a2a [Allan Douglas R. de Oliveira] Fixed code ba02414 [Allan Douglas R. de Oliveira] Added varargs cogroup to pyspark c4a8a51 [Allan Douglas R. de Oliveira] Added java cogroup 4 e94963c [Allan Douglas R. de Oliveira] Fixed spacing f1ee57b [Allan Douglas R. de Oliveira] Fixed scala style issues d7196f1 [Allan Douglas R. de Oliveira] Allow the cogroup of 4 RDDs 2014-06-20 14:03:03 -04:00			`bufs = [[] for i in range(rdd_len)]`
Add Python API. 2012-08-10 04:10:02 -04:00			`for (n, v) in seq:`
SPARK-1868: Users should be allowed to cogroup at least 4 RDDs Adds cogroup for 4 RDDs. Author: Allan Douglas R. de Oliveira <allandouglas@gmail.com> Closes #813 from douglaz/more_cogroups and squashes the following commits: f8d6273 [Allan Douglas R. de Oliveira] Test python groupWith for one more case 0e9009c [Allan Douglas R. de Oliveira] Added scala tests c3ffcdd [Allan Douglas R. de Oliveira] Added java tests 517a67f [Allan Douglas R. de Oliveira] Added tests for python groupWith 2f402d5 [Allan Douglas R. de Oliveira] Removed TODO 17474f4 [Allan Douglas R. de Oliveira] Use new cogroup function 7877a2a [Allan Douglas R. de Oliveira] Fixed code ba02414 [Allan Douglas R. de Oliveira] Added varargs cogroup to pyspark c4a8a51 [Allan Douglas R. de Oliveira] Added java cogroup 4 e94963c [Allan Douglas R. de Oliveira] Fixed spacing f1ee57b [Allan Douglas R. de Oliveira] Fixed scala style issues d7196f1 [Allan Douglas R. de Oliveira] Allow the cogroup of 4 RDDs 2014-06-20 14:03:03 -04:00			`bufs[n].append(v)`
			`return tuple(map(ResultIterable, bufs))`
			`return union_vrdds.groupByKey(numPartitions).mapValues(dispatch)`