[SPARK-11548][DOCS] Replaced example code in mllib-collaborative-filtering.md using include_example

Kindly review the changes. Author: Rishabh Bhardwaj <rbnext29@gmail.com> Closes #9519 from rishabhbhardwaj/SPARK-11337.
2015-11-09 14:27:36 -08:00 · 2015-11-09 14:27:36 -08:00 · b7720fa455
parent 51d41e4b1a
commit b7720fa455
4 changed files with 221 additions and 135 deletions
--- a/docs/mllib-collaborative-filtering.md
+++ b/docs/mllib-collaborative-filtering.md
@ -66,43 +66,7 @@ recommendation model by measuring the Mean Squared Error of rating prediction.

 Refer to the [`ALS` Scala docs](api/scala/index.html#org.apache.spark.mllib.recommendation.ALS) for details on the API.

-{% highlight scala %}
-import org.apache.spark.mllib.recommendation.ALS
-import org.apache.spark.mllib.recommendation.MatrixFactorizationModel
-import org.apache.spark.mllib.recommendation.Rating
-
-// Load and parse the data
-val data = sc.textFile("data/mllib/als/test.data")
-val ratings = data.map(_.split(',') match { case Array(user, item, rate) =>
-    Rating(user.toInt, item.toInt, rate.toDouble)
-  })
-
-// Build the recommendation model using ALS
-val rank = 10
-val numIterations = 10
-val model = ALS.train(ratings, rank, numIterations, 0.01)
-
-// Evaluate the model on rating data
-val usersProducts = ratings.map { case Rating(user, product, rate) =>
-  (user, product)
-}
-val predictions = 
-  model.predict(usersProducts).map { case Rating(user, product, rate) => 
-    ((user, product), rate)
-  }
-val ratesAndPreds = ratings.map { case Rating(user, product, rate) => 
-  ((user, product), rate)
-}.join(predictions)
-val MSE = ratesAndPreds.map { case ((user, product), (r1, r2)) => 
-  val err = (r1 - r2)
-  err * err
-}.mean()
-println("Mean Squared Error = " + MSE)
-
-// Save and load model
-model.save(sc, "myModelPath")
-val sameModel = MatrixFactorizationModel.load(sc, "myModelPath")
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/mllib/RecommendationExample.scala %}

 If the rating matrix is derived from another source of information (e.g., it is inferred from
 other signals), you can use the `trainImplicit` method to get better results.
@ -123,81 +87,7 @@ that is equivalent to the provided example in Scala is given below:

 Refer to the [`ALS` Java docs](api/java/org/apache/spark/mllib/recommendation/ALS.html) for details on the API.

-{% highlight java %}
-import scala.Tuple2;
-
-import org.apache.spark.api.java.*;
-import org.apache.spark.api.java.function.Function;
-import org.apache.spark.mllib.recommendation.ALS;
-import org.apache.spark.mllib.recommendation.MatrixFactorizationModel;
-import org.apache.spark.mllib.recommendation.Rating;
-import org.apache.spark.SparkConf;
-
-public class CollaborativeFiltering {
-  public static void main(String[] args) {
-    SparkConf conf = new SparkConf().setAppName("Collaborative Filtering Example");
-    JavaSparkContext sc = new JavaSparkContext(conf);
-
-    // Load and parse the data
-    String path = "data/mllib/als/test.data";
-    JavaRDD<String> data = sc.textFile(path);
-    JavaRDD<Rating> ratings = data.map(
-      new Function<String, Rating>() {
-        public Rating call(String s) {
-          String[] sarray = s.split(",");
-          return new Rating(Integer.parseInt(sarray[0]), Integer.parseInt(sarray[1]), 
-                            Double.parseDouble(sarray[2]));
-        }
-      }
-    );
-
-    // Build the recommendation model using ALS
-    int rank = 10;
-    int numIterations = 10;
-    MatrixFactorizationModel model = ALS.train(JavaRDD.toRDD(ratings), rank, numIterations, 0.01); 
-
-    // Evaluate the model on rating data
-    JavaRDD<Tuple2<Object, Object>> userProducts = ratings.map(
-      new Function<Rating, Tuple2<Object, Object>>() {
-        public Tuple2<Object, Object> call(Rating r) {
-          return new Tuple2<Object, Object>(r.user(), r.product());
-        }
-      }
-    );
-    JavaPairRDD<Tuple2<Integer, Integer>, Double> predictions = JavaPairRDD.fromJavaRDD(
-      model.predict(JavaRDD.toRDD(userProducts)).toJavaRDD().map(
-        new Function<Rating, Tuple2<Tuple2<Integer, Integer>, Double>>() {
-          public Tuple2<Tuple2<Integer, Integer>, Double> call(Rating r){
-            return new Tuple2<Tuple2<Integer, Integer>, Double>(
-              new Tuple2<Integer, Integer>(r.user(), r.product()), r.rating());
-          }
-        }
-    ));
-    JavaRDD<Tuple2<Double, Double>> ratesAndPreds = 
-      JavaPairRDD.fromJavaRDD(ratings.map(
-        new Function<Rating, Tuple2<Tuple2<Integer, Integer>, Double>>() {
-          public Tuple2<Tuple2<Integer, Integer>, Double> call(Rating r){
-            return new Tuple2<Tuple2<Integer, Integer>, Double>(
-              new Tuple2<Integer, Integer>(r.user(), r.product()), r.rating());
-          }
-        }
-    )).join(predictions).values();
-    double MSE = JavaDoubleRDD.fromRDD(ratesAndPreds.map(
-      new Function<Tuple2<Double, Double>, Object>() {
-        public Object call(Tuple2<Double, Double> pair) {
-          Double err = pair._1() - pair._2();
-          return err * err;
-        }
-      }
-    ).rdd()).mean();
-    System.out.println("Mean Squared Error = " + MSE);
-
-    // Save and load model
-    model.save(sc.sc(), "myModelPath");
-    MatrixFactorizationModel sameModel = MatrixFactorizationModel.load(sc.sc(), "myModelPath");
-  }
-}
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/mllib/JavaRecommendationExample.java %}
 </div>

 <div data-lang="python" markdown="1">
@ -207,29 +97,7 @@ recommendation by measuring the Mean Squared Error of rating prediction.

 Refer to the [`ALS` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.recommendation.ALS) for more details on the API.

-{% highlight python %}
-from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
-
-# Load and parse the data
-data = sc.textFile("data/mllib/als/test.data")
-ratings = data.map(lambda l: l.split(',')).map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))
-
-# Build the recommendation model using Alternating Least Squares
-rank = 10
-numIterations = 10
-model = ALS.train(ratings, rank, numIterations)
-
-# Evaluate the model on training data
-testdata = ratings.map(lambda p: (p[0], p[1]))
-predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))
-ratesAndPreds = ratings.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
-MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
-print("Mean Squared Error = " + str(MSE))
-
-# Save and load model
-model.save(sc, "myModelPath")
-sameModel = MatrixFactorizationModel.load(sc, "myModelPath")
-{% endhighlight %}
+{% include_example python/mllib/recommendation_example.py %}

 If the rating matrix is derived from other source of information (i.e., it is inferred from other
 signals), you can use the trainImplicit method to get better results.
--- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaRecommendationExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRecommendationExample.java
@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.mllib;
+
+// $example on$
+import scala.Tuple2;
+
+import org.apache.spark.api.java.*;
+import org.apache.spark.api.java.function.Function;
+import org.apache.spark.mllib.recommendation.ALS;
+import org.apache.spark.mllib.recommendation.MatrixFactorizationModel;
+import org.apache.spark.mllib.recommendation.Rating;
+import org.apache.spark.SparkConf;
+// $example off$
+
+public class JavaRecommendationExample {
+  public static void main(String args[]) {
+    // $example on$
+    SparkConf conf = new SparkConf().setAppName("Java Collaborative Filtering Example");
+    JavaSparkContext jsc = new JavaSparkContext(conf);
+
+    // Load and parse the data
+    String path = "data/mllib/als/test.data";
+    JavaRDD<String> data = jsc.textFile(path);
+    JavaRDD<Rating> ratings = data.map(
+      new Function<String, Rating>() {
+        public Rating call(String s) {
+          String[] sarray = s.split(",");
+          return new Rating(Integer.parseInt(sarray[0]), Integer.parseInt(sarray[1]),
+            Double.parseDouble(sarray[2]));
+        }
+      }
+    );
+
+    // Build the recommendation model using ALS
+    int rank = 10;
+    int numIterations = 10;
+    MatrixFactorizationModel model = ALS.train(JavaRDD.toRDD(ratings), rank, numIterations, 0.01);
+
+    // Evaluate the model on rating data
+    JavaRDD<Tuple2<Object, Object>> userProducts = ratings.map(
+      new Function<Rating, Tuple2<Object, Object>>() {
+        public Tuple2<Object, Object> call(Rating r) {
+          return new Tuple2<Object, Object>(r.user(), r.product());
+        }
+      }
+    );
+    JavaPairRDD<Tuple2<Integer, Integer>, Double> predictions = JavaPairRDD.fromJavaRDD(
+      model.predict(JavaRDD.toRDD(userProducts)).toJavaRDD().map(
+        new Function<Rating, Tuple2<Tuple2<Integer, Integer>, Double>>() {
+          public Tuple2<Tuple2<Integer, Integer>, Double> call(Rating r){
+            return new Tuple2<Tuple2<Integer, Integer>, Double>(
+              new Tuple2<Integer, Integer>(r.user(), r.product()), r.rating());
+          }
+        }
+      ));
+    JavaRDD<Tuple2<Double, Double>> ratesAndPreds =
+      JavaPairRDD.fromJavaRDD(ratings.map(
+        new Function<Rating, Tuple2<Tuple2<Integer, Integer>, Double>>() {
+          public Tuple2<Tuple2<Integer, Integer>, Double> call(Rating r){
+            return new Tuple2<Tuple2<Integer, Integer>, Double>(
+              new Tuple2<Integer, Integer>(r.user(), r.product()), r.rating());
+          }
+        }
+      )).join(predictions).values();
+    double MSE = JavaDoubleRDD.fromRDD(ratesAndPreds.map(
+      new Function<Tuple2<Double, Double>, Object>() {
+        public Object call(Tuple2<Double, Double> pair) {
+          Double err = pair._1() - pair._2();
+          return err * err;
+        }
+      }
+    ).rdd()).mean();
+    System.out.println("Mean Squared Error = " + MSE);
+
+    // Save and load model
+    model.save(jsc.sc(), "target/tmp/myCollaborativeFilter");
+    MatrixFactorizationModel sameModel = MatrixFactorizationModel.load(jsc.sc(),
+      "target/tmp/myCollaborativeFilter");
+    // $example off$
+  }
+}
--- a/examples/src/main/python/mllib/recommendation_example.py
+++ b/examples/src/main/python/mllib/recommendation_example.py
@ -0,0 +1,54 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""
+Collaborative Filtering Classification Example.
+"""
+from __future__ import print_function
+
+import sys
+
+from pyspark import SparkContext
+
+# $example on$
+from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
+# $example off$
+
+if __name__ == "__main__":
+    sc = SparkContext(appName="PythonCollaborativeFilteringExample")
+    # $example on$
+    # Load and parse the data
+    data = sc.textFile("data/mllib/als/test.data")
+    ratings = data.map(lambda l: l.split(','))\
+        .map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))
+
+    # Build the recommendation model using Alternating Least Squares
+    rank = 10
+    numIterations = 10
+    model = ALS.train(ratings, rank, numIterations)
+
+    # Evaluate the model on training data
+    testdata = ratings.map(lambda p: (p[0], p[1]))
+    predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))
+    ratesAndPreds = ratings.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
+    MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
+    print("Mean Squared Error = " + str(MSE))
+
+    # Save and load model
+    model.save(sc, "target/tmp/myCollaborativeFilter")
+    sameModel = MatrixFactorizationModel.load(sc, "target/tmp/myCollaborativeFilter")
+    # $example off$
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/RecommendationExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/RecommendationExample.scala
@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.mllib
+
+import org.apache.spark.{SparkContext, SparkConf}
+// $example on$
+import org.apache.spark.mllib.recommendation.ALS
+import org.apache.spark.mllib.recommendation.MatrixFactorizationModel
+import org.apache.spark.mllib.recommendation.Rating
+// $example off$
+
+object RecommendationExample {
+  def main(args: Array[String]): Unit = {
+    val conf = new SparkConf().setAppName("CollaborativeFilteringExample")
+    val sc = new SparkContext(conf)
+    // $example on$
+    // Load and parse the data
+    val data = sc.textFile("data/mllib/als/test.data")
+    val ratings = data.map(_.split(',') match { case Array(user, item, rate) =>
+      Rating(user.toInt, item.toInt, rate.toDouble)
+    })
+
+    // Build the recommendation model using ALS
+    val rank = 10
+    val numIterations = 10
+    val model = ALS.train(ratings, rank, numIterations, 0.01)
+
+    // Evaluate the model on rating data
+    val usersProducts = ratings.map { case Rating(user, product, rate) =>
+      (user, product)
+    }
+    val predictions =
+      model.predict(usersProducts).map { case Rating(user, product, rate) =>
+        ((user, product), rate)
+      }
+    val ratesAndPreds = ratings.map { case Rating(user, product, rate) =>
+      ((user, product), rate)
+    }.join(predictions)
+    val MSE = ratesAndPreds.map { case ((user, product), (r1, r2)) =>
+      val err = (r1 - r2)
+      err * err
+    }.mean()
+    println("Mean Squared Error = " + MSE)
+
+    // Save and load model
+    model.save(sc, "target/tmp/myCollaborativeFilter")
+    val sameModel = MatrixFactorizationModel.load(sc, "target/tmp/myCollaborativeFilter")
+    // $example off$
+  }
+}
+// scalastyle:on println