# spark-instrumented-optimizer/python/pyspark/mllib/recommendation.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from pyspark import SparkContext
from pyspark.mllib._common import \
    _get_unmangled_rdd, _get_unmangled_double_vector_rdd, \
    _serialize_double_matrix, _deserialize_double_matrix, \
    _serialize_double_vector, _deserialize_double_vector, \
    _get_initial_weights, _serialize_rating, _regression_train_wrapper, \
    _serialize_tuple, RatingDeserializer
from pyspark.rdd import RDD


class MatrixFactorizationModel(object):

    """Python-side handle to a matrix factorisation model trained by
    regularized alternating least-squares.

    An instance pairs a JVM model object with the SparkContext it was
    created from; every prediction is delegated to that JVM object.

    >>> r1 = (1, 1, 1.0)
    >>> r2 = (1, 2, 2.0)
    >>> r3 = (2, 1, 2.0)
    >>> ratings = sc.parallelize([r1, r2, r3])
    >>> model = ALS.trainImplicit(ratings, 1)
    >>> model.predict(2,2) is not None
    True
    >>> testset = sc.parallelize([(1, 2), (1, 1)])
    >>> model.predictAll(testset).count() == 2
    True
    """

    def __init__(self, sc, java_model):
        """Store the SparkContext ``sc`` and the wrapped ``java_model``."""
        self._java_model = java_model
        self._context = sc

    def __del__(self):
        """Detach the wrapped JVM model from the context's gateway."""
        gateway = self._context._gateway
        gateway.detach(self._java_model)

    def predict(self, user, product):
        """Return the JVM model's prediction for one (user, product) pair."""
        java_model = self._java_model
        return java_model.predict(user, product)

    def predictAll(self, usersProducts):
        """Return predictions for every entry of the ``usersProducts`` RDD.

        The input is serialized with ``_serialize_tuple`` before being
        handed to the JVM model; the JVM result is wrapped in an ``RDD``
        that reads its elements through ``RatingDeserializer``.
        """
        serialized_pairs = _get_unmangled_rdd(usersProducts, _serialize_tuple)
        java_predictions = self._java_model.predict(serialized_pairs._jrdd)
        return RDD(java_predictions, self._context, RatingDeserializer())


class ALS(object):

    """Training entry points that build a ``MatrixFactorizationModel``
    by calling the JVM-side ``PythonMLLibAPI``."""

    @classmethod
    def train(cls, ratings, rank, iterations=5, lambda_=0.01, blocks=-1):
        """Train a model from ``ratings`` via ``trainALSModel``.

        ``ratings`` is serialized with ``_serialize_rating``; ``rank``,
        ``iterations``, ``lambda_`` and ``blocks`` are forwarded to the JVM
        trainer unchanged. Returns a ``MatrixFactorizationModel`` bound to
        the SparkContext of ``ratings``.
        """
        context = ratings.context
        serialized = _get_unmangled_rdd(ratings, _serialize_rating)
        api = context._jvm.PythonMLLibAPI()
        java_model = api.trainALSModel(
            serialized._jrdd, rank, iterations, lambda_, blocks)
        return MatrixFactorizationModel(context, java_model)

    @classmethod
    def trainImplicit(cls, ratings, rank, iterations=5, lambda_=0.01, blocks=-1, alpha=0.01):
        """Train a model from ``ratings`` via ``trainImplicitALSModel``.

        Same flow as ``train`` but calls the implicit-feedback JVM trainer
        and additionally forwards ``alpha`` to it. Returns a
        ``MatrixFactorizationModel`` bound to the SparkContext of
        ``ratings``.
        """
        context = ratings.context
        serialized = _get_unmangled_rdd(ratings, _serialize_rating)
        api = context._jvm.PythonMLLibAPI()
        java_model = api.trainImplicitALSModel(
            serialized._jrdd, rank, iterations, lambda_, blocks, alpha)
        return MatrixFactorizationModel(context, java_model)


def _test():
    """Run this module's doctests against a local SparkContext.

    A ``SparkContext`` on ``local[4]`` is created and exposed to the
    doctests under the name ``sc``. The context is always stopped
    afterwards, and the process exits with status -1 if any doctest failed.
    """
    import doctest
    import sys
    globs = globals().copy()
    globs['sc'] = SparkContext('local[4]', 'PythonTest', batchSize=2)
    try:
        failure_count, _ = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
    finally:
        # Stop the context even when the doctest run itself raises, so the
        # local Spark instance is not left running.
        globs['sc'].stop()
    if failure_count:
        # sys.exit instead of the ``exit`` helper: the latter is installed
        # by the ``site`` module and is not guaranteed to exist.
        sys.exit(-1)


# Run the doctests only when this file is executed as a script.
if __name__ == "__main__":
    _test()
Split the mllib bindings into a whole bunch of modules and rename some things. 2013-12-25 00:08:05 -05:00			`#`
			`# Licensed to the Apache Software Foundation (ASF) under one or more`
			`# contributor license agreements. See the NOTICE file distributed with`
			`# this work for additional information regarding copyright ownership.`
			`# The ASF licenses this file to You under the Apache License, Version 2.0`
			`# (the "License"); you may not use this file except in compliance with`
			`# the License. You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
			`#`

			`from pyspark import SparkContext`
			`from pyspark.mllib._common import \`
			`_get_unmangled_rdd, _get_unmangled_double_vector_rdd, \`
			`_serialize_double_matrix, _deserialize_double_matrix, \`
			`_serialize_double_vector, _deserialize_double_vector, \`
Added python binding for bulk recommendation 2014-01-04 19:23:17 -05:00			`_get_initial_weights, _serialize_rating, _regression_train_wrapper, \`
Added predictAll python function to MatrixFactorizationModel 2014-01-06 15:19:43 -05:00			`_serialize_tuple, RatingDeserializer`
Added python binding for bulk recommendation 2014-01-04 19:23:17 -05:00			`from pyspark.rdd import RDD`
Split the mllib bindings into a whole bunch of modules and rename some things. 2013-12-25 00:08:05 -05:00
Fix PEP8 violations in Python mllib. Author: Reynold Xin <rxin@apache.org> Closes #871 from rxin/mllib-pep8 and squashes the following commits: 848416f [Reynold Xin] Fixed a typo in the previous cleanup (c -> sc). a8db4cd [Reynold Xin] Fix PEP8 violations in Python mllib. 2014-05-25 20:15:01 -04:00
Split the mllib bindings into a whole bunch of modules and rename some things. 2013-12-25 00:08:05 -05:00			`class MatrixFactorizationModel(object):`
[SPARK-2627] [PySpark] have the build enforce PEP 8 automatically As described in [SPARK-2627](https://issues.apache.org/jira/browse/SPARK-2627), we'd like Python code to automatically be checked for PEP 8 compliance by Jenkins. This pull request aims to do that. Notes: * We may need to install [`pep8`](https://pypi.python.org/pypi/pep8) on the build server. * I'm expecting tests to fail now that PEP 8 compliance is being checked as part of the build. I'm fine with cleaning up any remaining PEP 8 violations as part of this pull request. * I did not understand why the RAT and scalastyle reports are saved to text files. I did the same for the PEP 8 check, but only so that the console output style can match those for the RAT and scalastyle checks. The PEP 8 report is removed right after the check is complete. * Updates to the ["Contributing to Spark"](https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark) guide will be submitted elsewhere, as I don't believe that text is part of the Spark repo. 
Author: Nicholas Chammas <nicholas.chammas@gmail.com> Author: nchammas <nicholas.chammas@gmail.com> Closes #1744 from nchammas/master and squashes the following commits: 274b238 [Nicholas Chammas] [SPARK-2627] [PySpark] minor indentation changes 983d963 [nchammas] Merge pull request #5 from apache/master 1db5314 [nchammas] Merge pull request #4 from apache/master 0e0245f [Nicholas Chammas] [SPARK-2627] undo erroneous whitespace fixes bf30942 [Nicholas Chammas] [SPARK-2627] PEP8: comment spacing 6db9a44 [nchammas] Merge pull request #3 from apache/master 7b4750e [Nicholas Chammas] merge upstream changes 91b7584 [Nicholas Chammas] [SPARK-2627] undo unnecessary line breaks 44e3e56 [Nicholas Chammas] [SPARK-2627] use tox.ini to exclude files b09fae2 [Nicholas Chammas] don't wrap comments unnecessarily bfb9f9f [Nicholas Chammas] [SPARK-2627] keep up with the PEP 8 fixes 9da347f [nchammas] Merge pull request #2 from apache/master aa5b4b5 [Nicholas Chammas] [SPARK-2627] follow Spark bash style for if blocks d0a83b9 [Nicholas Chammas] [SPARK-2627] check that pep8 downloaded fine dffb5dd [Nicholas Chammas] [SPARK-2627] download pep8 at runtime a1ce7ae [Nicholas Chammas] [SPARK-2627] space out test report sections 21da538 [Nicholas Chammas] [SPARK-2627] it's PEP 8, not PEP8 6f4900b [Nicholas Chammas] [SPARK-2627] more misc PEP 8 fixes fe57ed0 [Nicholas Chammas] removing merge conflict backups 9c01d4c [nchammas] Merge pull request #1 from apache/master 9a66cb0 [Nicholas Chammas] resolving merge conflicts a31ccc4 [Nicholas Chammas] [SPARK-2627] miscellaneous PEP 8 fixes beaa9ac [Nicholas Chammas] [SPARK-2627] fail check on non-zero status 723ed39 [Nicholas Chammas] always delete the report file 0541ebb [Nicholas Chammas] [SPARK-2627] call Python linter from run-tests 12440fa [Nicholas Chammas] [SPARK-2627] add Scala linter 61c07b9 [Nicholas Chammas] [SPARK-2627] add Python linter 75ad552 [Nicholas Chammas] make check output style consistent 2014-08-06 15:58:24 -04:00
Split the mllib bindings into a whole bunch of modules and rename some things. 2013-12-25 00:08:05 -05:00			`"""A matrix factorisation model trained by regularized alternating`
			`least-squares.`

			`>>> r1 = (1, 1, 1.0)`
			`>>> r2 = (1, 2, 2.0)`
			`>>> r3 = (2, 1, 2.0)`
			`>>> ratings = sc.parallelize([r1, r2, r3])`
Add Naive Bayes to Python MLlib, and some API fixes - Added a Python wrapper for Naive Bayes - Updated the Scala Naive Bayes to match the style of our other algorithms better and in particular make it easier to call from Java (added builder pattern, removed default value in train method) - Updated Python MLlib functions to not require a SparkContext; we can get that from the RDD the user gives - Added a toString method in LabeledPoint - Made the Python MLlib tests run as part of run-tests as well (before they could only be run individually through each file) 2014-01-10 02:55:06 -05:00			`>>> model = ALS.trainImplicit(ratings, 1)`
Split the mllib bindings into a whole bunch of modules and rename some things. 2013-12-25 00:08:05 -05:00			`>>> model.predict(2,2) is not None`
			`True`
Added predictAll python function to MatrixFactorizationModel 2014-01-06 15:19:43 -05:00			`>>> testset = sc.parallelize([(1, 2), (1, 1)])`
Add Naive Bayes to Python MLlib, and some API fixes - Added a Python wrapper for Naive Bayes - Updated the Scala Naive Bayes to match the style of our other algorithms better and in particular make it easier to call from Java (added builder pattern, removed default value in train method) - Updated Python MLlib functions to not require a SparkContext; we can get that from the RDD the user gives - Added a toString method in LabeledPoint - Made the Python MLlib tests run as part of run-tests as well (before they could only be run individually through each file) 2014-01-10 02:55:06 -05:00			`>>> model.predictAll(testset).count() == 2`
Added predictAll python function to MatrixFactorizationModel 2014-01-06 15:19:43 -05:00			`True`
Split the mllib bindings into a whole bunch of modules and rename some things. 2013-12-25 00:08:05 -05:00			`"""`

			`def __init__(self, sc, java_model):`
			`self._context = sc`
			`self._java_model = java_model`

			`def __del__(self):`
			`self._context._gateway.detach(self._java_model)`

			`def predict(self, user, product):`
			`return self._java_model.predict(user, product)`

Added python binding for bulk recommendation 2014-01-04 19:23:17 -05:00			`def predictAll(self, usersProducts):`
			`usersProductsJRDD = _get_unmangled_rdd(usersProducts, _serialize_tuple)`
Added predictAll python function to MatrixFactorizationModel 2014-01-06 15:19:43 -05:00			`return RDD(self._java_model.predict(usersProductsJRDD._jrdd),`
			`self._context, RatingDeserializer())`
Added python binding for bulk recommendation 2014-01-04 19:23:17 -05:00
Fix PEP8 violations in Python mllib. Author: Reynold Xin <rxin@apache.org> Closes #871 from rxin/mllib-pep8 and squashes the following commits: 848416f [Reynold Xin] Fixed a typo in the previous cleanup (c -> sc). a8db4cd [Reynold Xin] Fix PEP8 violations in Python mllib. 2014-05-25 20:15:01 -04:00
Split the mllib bindings into a whole bunch of modules and rename some things. 2013-12-25 00:08:05 -05:00			`class ALS(object):`
[SPARK-2627] [PySpark] have the build enforce PEP 8 automatically As described in [SPARK-2627](https://issues.apache.org/jira/browse/SPARK-2627), we'd like Python code to automatically be checked for PEP 8 compliance by Jenkins. This pull request aims to do that. Notes: * We may need to install [`pep8`](https://pypi.python.org/pypi/pep8) on the build server. * I'm expecting tests to fail now that PEP 8 compliance is being checked as part of the build. I'm fine with cleaning up any remaining PEP 8 violations as part of this pull request. * I did not understand why the RAT and scalastyle reports are saved to text files. I did the same for the PEP 8 check, but only so that the console output style can match those for the RAT and scalastyle checks. The PEP 8 report is removed right after the check is complete. * Updates to the ["Contributing to Spark"](https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark) guide will be submitted elsewhere, as I don't believe that text is part of the Spark repo. 
Author: Nicholas Chammas <nicholas.chammas@gmail.com> Author: nchammas <nicholas.chammas@gmail.com> Closes #1744 from nchammas/master and squashes the following commits: 274b238 [Nicholas Chammas] [SPARK-2627] [PySpark] minor indentation changes 983d963 [nchammas] Merge pull request #5 from apache/master 1db5314 [nchammas] Merge pull request #4 from apache/master 0e0245f [Nicholas Chammas] [SPARK-2627] undo erroneous whitespace fixes bf30942 [Nicholas Chammas] [SPARK-2627] PEP8: comment spacing 6db9a44 [nchammas] Merge pull request #3 from apache/master 7b4750e [Nicholas Chammas] merge upstream changes 91b7584 [Nicholas Chammas] [SPARK-2627] undo unnecessary line breaks 44e3e56 [Nicholas Chammas] [SPARK-2627] use tox.ini to exclude files b09fae2 [Nicholas Chammas] don't wrap comments unnecessarily bfb9f9f [Nicholas Chammas] [SPARK-2627] keep up with the PEP 8 fixes 9da347f [nchammas] Merge pull request #2 from apache/master aa5b4b5 [Nicholas Chammas] [SPARK-2627] follow Spark bash style for if blocks d0a83b9 [Nicholas Chammas] [SPARK-2627] check that pep8 downloaded fine dffb5dd [Nicholas Chammas] [SPARK-2627] download pep8 at runtime a1ce7ae [Nicholas Chammas] [SPARK-2627] space out test report sections 21da538 [Nicholas Chammas] [SPARK-2627] it's PEP 8, not PEP8 6f4900b [Nicholas Chammas] [SPARK-2627] more misc PEP 8 fixes fe57ed0 [Nicholas Chammas] removing merge conflict backups 9c01d4c [nchammas] Merge pull request #1 from apache/master 9a66cb0 [Nicholas Chammas] resolving merge conflicts a31ccc4 [Nicholas Chammas] [SPARK-2627] miscellaneous PEP 8 fixes beaa9ac [Nicholas Chammas] [SPARK-2627] fail check on non-zero status 723ed39 [Nicholas Chammas] always delete the report file 0541ebb [Nicholas Chammas] [SPARK-2627] call Python linter from run-tests 12440fa [Nicholas Chammas] [SPARK-2627] add Scala linter 61c07b9 [Nicholas Chammas] [SPARK-2627] add Python linter 75ad552 [Nicholas Chammas] make check output style consistent 2014-08-06 15:58:24 -04:00
Split the mllib bindings into a whole bunch of modules and rename some things. 2013-12-25 00:08:05 -05:00			`@classmethod`
Add Naive Bayes to Python MLlib, and some API fixes - Added a Python wrapper for Naive Bayes - Updated the Scala Naive Bayes to match the style of our other algorithms better and in particular make it easier to call from Java (added builder pattern, removed default value in train method) - Updated Python MLlib functions to not require a SparkContext; we can get that from the RDD the user gives - Added a toString method in LabeledPoint - Made the Python MLlib tests run as part of run-tests as well (before they could only be run individually through each file) 2014-01-10 02:55:06 -05:00			`def train(cls, ratings, rank, iterations=5, lambda_=0.01, blocks=-1):`
			`sc = ratings.context`
Split the mllib bindings into a whole bunch of modules and rename some things. 2013-12-25 00:08:05 -05:00			`ratingBytes = _get_unmangled_rdd(ratings, _serialize_rating)`
Fix PEP8 violations in Python mllib. Author: Reynold Xin <rxin@apache.org> Closes #871 from rxin/mllib-pep8 and squashes the following commits: 848416f [Reynold Xin] Fixed a typo in the previous cleanup (c -> sc). a8db4cd [Reynold Xin] Fix PEP8 violations in Python mllib. 2014-05-25 20:15:01 -04:00			`mod = sc._jvm.PythonMLLibAPI().trainALSModel(`
			`ratingBytes._jrdd, rank, iterations, lambda_, blocks)`
Split the mllib bindings into a whole bunch of modules and rename some things. 2013-12-25 00:08:05 -05:00			`return MatrixFactorizationModel(sc, mod)`

			`@classmethod`
Add Naive Bayes to Python MLlib, and some API fixes - Added a Python wrapper for Naive Bayes - Updated the Scala Naive Bayes to match the style of our other algorithms better and in particular make it easier to call from Java (added builder pattern, removed default value in train method) - Updated Python MLlib functions to not require a SparkContext; we can get that from the RDD the user gives - Added a toString method in LabeledPoint - Made the Python MLlib tests run as part of run-tests as well (before they could only be run individually through each file) 2014-01-10 02:55:06 -05:00			`def trainImplicit(cls, ratings, rank, iterations=5, lambda_=0.01, blocks=-1, alpha=0.01):`
			`sc = ratings.context`
Split the mllib bindings into a whole bunch of modules and rename some things. 2013-12-25 00:08:05 -05:00			`ratingBytes = _get_unmangled_rdd(ratings, _serialize_rating)`
Fix PEP8 violations in Python mllib. Author: Reynold Xin <rxin@apache.org> Closes #871 from rxin/mllib-pep8 and squashes the following commits: 848416f [Reynold Xin] Fixed a typo in the previous cleanup (c -> sc). a8db4cd [Reynold Xin] Fix PEP8 violations in Python mllib. 2014-05-25 20:15:01 -04:00			`mod = sc._jvm.PythonMLLibAPI().trainImplicitALSModel(`
			`ratingBytes._jrdd, rank, iterations, lambda_, blocks, alpha)`
Split the mllib bindings into a whole bunch of modules and rename some things. 2013-12-25 00:08:05 -05:00			`return MatrixFactorizationModel(sc, mod)`

Fix PEP8 violations in Python mllib. Author: Reynold Xin <rxin@apache.org> Closes #871 from rxin/mllib-pep8 and squashes the following commits: 848416f [Reynold Xin] Fixed a typo in the previous cleanup (c -> sc). a8db4cd [Reynold Xin] Fix PEP8 violations in Python mllib. 2014-05-25 20:15:01 -04:00
Split the mllib bindings into a whole bunch of modules and rename some things. 2013-12-25 00:08:05 -05:00			`def _test():`
			`import doctest`
			`globs = globals().copy()`
			`globs['sc'] = SparkContext('local[4]', 'PythonTest', batchSize=2)`
Fix PEP8 violations in Python mllib. Author: Reynold Xin <rxin@apache.org> Closes #871 from rxin/mllib-pep8 and squashes the following commits: 848416f [Reynold Xin] Fixed a typo in the previous cleanup (c -> sc). a8db4cd [Reynold Xin] Fix PEP8 violations in Python mllib. 2014-05-25 20:15:01 -04:00			`(failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)`
Split the mllib bindings into a whole bunch of modules and rename some things. 2013-12-25 00:08:05 -05:00			`globs['sc'].stop()`
			`if failure_count:`
			`exit(-1)`

Fix PEP8 violations in Python mllib. Author: Reynold Xin <rxin@apache.org> Closes #871 from rxin/mllib-pep8 and squashes the following commits: 848416f [Reynold Xin] Fixed a typo in the previous cleanup (c -> sc). a8db4cd [Reynold Xin] Fix PEP8 violations in Python mllib. 2014-05-25 20:15:01 -04:00
Split the mllib bindings into a whole bunch of modules and rename some things. 2013-12-25 00:08:05 -05:00			`if __name__ == "__main__":`
			`_test()`