diff --git a/docs/ml-features.md b/docs/ml-features.md
index 44a9882939..142afac2f3 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -37,23 +37,7 @@ In the following code segment, we start with a set of sentences. We split each
Refer to the [HashingTF Scala docs](api/scala/index.html#org.apache.spark.ml.feature.HashingTF) and
the [IDF Scala docs](api/scala/index.html#org.apache.spark.ml.feature.IDF) for more details on the API.
-{% highlight scala %}
-import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer}
-
-val sentenceData = sqlContext.createDataFrame(Seq(
- (0, "Hi I heard about Spark"),
- (0, "I wish Java could use case classes"),
- (1, "Logistic regression models are neat")
-)).toDF("label", "sentence")
-val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
-val wordsData = tokenizer.transform(sentenceData)
-val hashingTF = new HashingTF().setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(20)
-val featurizedData = hashingTF.transform(wordsData)
-val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
-val idfModel = idf.fit(featurizedData)
-val rescaledData = idfModel.transform(featurizedData)
-rescaledData.select("features", "label").take(3).foreach(println)
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/ml/TfIdfExample.scala %}
@@ -61,49 +45,7 @@ rescaledData.select("features", "label").take(3).foreach(println)
Refer to the [HashingTF Java docs](api/java/org/apache/spark/ml/feature/HashingTF.html) and the
[IDF Java docs](api/java/org/apache/spark/ml/feature/IDF.html) for more details on the API.
-{% highlight java %}
-import java.util.Arrays;
-
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.ml.feature.HashingTF;
-import org.apache.spark.ml.feature.IDF;
-import org.apache.spark.ml.feature.Tokenizer;
-import org.apache.spark.mllib.linalg.Vector;
-import org.apache.spark.sql.DataFrame;
-import org.apache.spark.sql.Row;
-import org.apache.spark.sql.RowFactory;
-import org.apache.spark.sql.types.DataTypes;
-import org.apache.spark.sql.types.Metadata;
-import org.apache.spark.sql.types.StructField;
-import org.apache.spark.sql.types.StructType;
-
-JavaRDD jrdd = jsc.parallelize(Arrays.asList(
- RowFactory.create(0, "Hi I heard about Spark"),
- RowFactory.create(0, "I wish Java could use case classes"),
- RowFactory.create(1, "Logistic regression models are neat")
-));
-StructType schema = new StructType(new StructField[]{
- new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
- new StructField("sentence", DataTypes.StringType, false, Metadata.empty())
-});
-DataFrame sentenceData = sqlContext.createDataFrame(jrdd, schema);
-Tokenizer tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words");
-DataFrame wordsData = tokenizer.transform(sentenceData);
-int numFeatures = 20;
-HashingTF hashingTF = new HashingTF()
- .setInputCol("words")
- .setOutputCol("rawFeatures")
- .setNumFeatures(numFeatures);
-DataFrame featurizedData = hashingTF.transform(wordsData);
-IDF idf = new IDF().setInputCol("rawFeatures").setOutputCol("features");
-IDFModel idfModel = idf.fit(featurizedData);
-DataFrame rescaledData = idfModel.transform(featurizedData);
-for (Row r : rescaledData.select("features", "label").take(3)) {
- Vector features = r.getAs(0);
- Double label = r.getDouble(1);
- System.out.println(features);
-}
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/ml/JavaTfIdfExample.java %}
@@ -111,24 +53,7 @@ for (Row r : rescaledData.select("features", "label").take(3)) {
Refer to the [HashingTF Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.HashingTF) and
the [IDF Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.IDF) for more details on the API.
-{% highlight python %}
-from pyspark.ml.feature import HashingTF, IDF, Tokenizer
-
-sentenceData = sqlContext.createDataFrame([
- (0, "Hi I heard about Spark"),
- (0, "I wish Java could use case classes"),
- (1, "Logistic regression models are neat")
-], ["label", "sentence"])
-tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
-wordsData = tokenizer.transform(sentenceData)
-hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
-featurizedData = hashingTF.transform(wordsData)
-idf = IDF(inputCol="rawFeatures", outputCol="features")
-idfModel = idf.fit(featurizedData)
-rescaledData = idfModel.transform(featurizedData)
-for features_label in rescaledData.select("features", "label").take(3):
- print(features_label)
-{% endhighlight %}
+{% include_example python/ml/tf_idf_example.py %}
@@ -149,26 +74,7 @@ In the following code segment, we start with a set of documents, each of which i
Refer to the [Word2Vec Scala docs](api/scala/index.html#org.apache.spark.ml.feature.Word2Vec)
for more details on the API.
-{% highlight scala %}
-import org.apache.spark.ml.feature.Word2Vec
-
-// Input data: Each row is a bag of words from a sentence or document.
-val documentDF = sqlContext.createDataFrame(Seq(
- "Hi I heard about Spark".split(" "),
- "I wish Java could use case classes".split(" "),
- "Logistic regression models are neat".split(" ")
-).map(Tuple1.apply)).toDF("text")
-
-// Learn a mapping from words to Vectors.
-val word2Vec = new Word2Vec()
- .setInputCol("text")
- .setOutputCol("result")
- .setVectorSize(3)
- .setMinCount(0)
-val model = word2Vec.fit(documentDF)
-val result = model.transform(documentDF)
-result.select("result").take(3).foreach(println)
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/ml/Word2VecExample.scala %}
@@ -176,43 +82,7 @@ result.select("result").take(3).foreach(println)
Refer to the [Word2Vec Java docs](api/java/org/apache/spark/ml/feature/Word2Vec.html)
for more details on the API.
-{% highlight java %}
-import java.util.Arrays;
-
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.sql.DataFrame;
-import org.apache.spark.sql.Row;
-import org.apache.spark.sql.RowFactory;
-import org.apache.spark.sql.SQLContext;
-import org.apache.spark.sql.types.*;
-
-JavaSparkContext jsc = ...
-SQLContext sqlContext = ...
-
-// Input data: Each row is a bag of words from a sentence or document.
-JavaRDD jrdd = jsc.parallelize(Arrays.asList(
- RowFactory.create(Arrays.asList("Hi I heard about Spark".split(" "))),
- RowFactory.create(Arrays.asList("I wish Java could use case classes".split(" "))),
- RowFactory.create(Arrays.asList("Logistic regression models are neat".split(" ")))
-));
-StructType schema = new StructType(new StructField[]{
- new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty())
-});
-DataFrame documentDF = sqlContext.createDataFrame(jrdd, schema);
-
-// Learn a mapping from words to Vectors.
-Word2Vec word2Vec = new Word2Vec()
- .setInputCol("text")
- .setOutputCol("result")
- .setVectorSize(3)
- .setMinCount(0);
-Word2VecModel model = word2Vec.fit(documentDF);
-DataFrame result = model.transform(documentDF);
-for (Row r: result.select("result").take(3)) {
- System.out.println(r);
-}
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/ml/JavaWord2VecExample.java %}
@@ -220,22 +90,7 @@ for (Row r: result.select("result").take(3)) {
Refer to the [Word2Vec Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.Word2Vec)
for more details on the API.
-{% highlight python %}
-from pyspark.ml.feature import Word2Vec
-
-# Input data: Each row is a bag of words from a sentence or document.
-documentDF = sqlContext.createDataFrame([
- ("Hi I heard about Spark".split(" "), ),
- ("I wish Java could use case classes".split(" "), ),
- ("Logistic regression models are neat".split(" "), )
-], ["text"])
-# Learn a mapping from words to Vectors.
-word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="text", outputCol="result")
-model = word2Vec.fit(documentDF)
-result = model.transform(documentDF)
-for feature in result.select("result").take(3):
- print(feature)
-{% endhighlight %}
+{% include_example python/ml/word2vec_example.py %}
@@ -283,30 +138,7 @@ Refer to the [CountVectorizer Scala docs](api/scala/index.html#org.apache.spark.
and the [CountVectorizerModel Scala docs](api/scala/index.html#org.apache.spark.ml.feature.CountVectorizerModel)
for more details on the API.
-{% highlight scala %}
-import org.apache.spark.ml.feature.CountVectorizer
-import org.apache.spark.mllib.util.CountVectorizerModel
-
-val df = sqlContext.createDataFrame(Seq(
- (0, Array("a", "b", "c")),
- (1, Array("a", "b", "b", "c", "a"))
-)).toDF("id", "words")
-
-// fit a CountVectorizerModel from the corpus
-val cvModel: CountVectorizerModel = new CountVectorizer()
- .setInputCol("words")
- .setOutputCol("features")
- .setVocabSize(3)
- .setMinDF(2) // a term must appear in more or equal to 2 documents to be included in the vocabulary
- .fit(df)
-
-// alternatively, define CountVectorizerModel with a-priori vocabulary
-val cvm = new CountVectorizerModel(Array("a", "b", "c"))
- .setInputCol("words")
- .setOutputCol("features")
-
-cvModel.transform(df).select("features").show()
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/ml/CountVectorizerExample.scala %}
@@ -315,40 +147,7 @@ Refer to the [CountVectorizer Java docs](api/java/org/apache/spark/ml/feature/Co
and the [CountVectorizerModel Java docs](api/java/org/apache/spark/ml/feature/CountVectorizerModel.html)
for more details on the API.
-{% highlight java %}
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.ml.feature.CountVectorizer;
-import org.apache.spark.ml.feature.CountVectorizerModel;
-import org.apache.spark.sql.DataFrame;
-import org.apache.spark.sql.Row;
-import org.apache.spark.sql.RowFactory;
-import org.apache.spark.sql.types.*;
-
-// Input data: Each row is a bag of words from a sentence or document.
-JavaRDD jrdd = jsc.parallelize(Arrays.asList(
- RowFactory.create(Arrays.asList("a", "b", "c")),
- RowFactory.create(Arrays.asList("a", "b", "b", "c", "a"))
-));
-StructType schema = new StructType(new StructField [] {
- new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty())
-});
-DataFrame df = sqlContext.createDataFrame(jrdd, schema);
-
-// fit a CountVectorizerModel from the corpus
-CountVectorizerModel cvModel = new CountVectorizer()
- .setInputCol("text")
- .setOutputCol("feature")
- .setVocabSize(3)
- .setMinDF(2) // a term must appear in more or equal to 2 documents to be included in the vocabulary
- .fit(df);
-
-// alternatively, define CountVectorizerModel with a-priori vocabulary
-CountVectorizerModel cvm = new CountVectorizerModel(new String[]{"a", "b", "c"})
- .setInputCol("text")
- .setOutputCol("feature");
-
-cvModel.transform(df).show();
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/ml/JavaCountVectorizerExample.java %}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaCountVectorizerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaCountVectorizerExample.java
new file mode 100644
index 0000000000..ac33adb652
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaCountVectorizerExample.java
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+// $example on$
+import java.util.Arrays;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.ml.feature.CountVectorizer;
+import org.apache.spark.ml.feature.CountVectorizerModel;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.types.*;
+// $example off$
+
+public class JavaCountVectorizerExample {
+ public static void main(String[] args) {
+
+ SparkConf conf = new SparkConf().setAppName("JavaCountVectorizerExample");
+ JavaSparkContext jsc = new JavaSparkContext(conf);
+ SQLContext sqlContext = new SQLContext(jsc);
+
+ // $example on$
+ // Input data: Each row is a bag of words from a sentence or document.
+ JavaRDD jrdd = jsc.parallelize(Arrays.asList(
+ RowFactory.create(Arrays.asList("a", "b", "c")),
+ RowFactory.create(Arrays.asList("a", "b", "b", "c", "a"))
+ ));
+ StructType schema = new StructType(new StructField [] {
+ new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty())
+ });
+ DataFrame df = sqlContext.createDataFrame(jrdd, schema);
+
+ // fit a CountVectorizerModel from the corpus
+ CountVectorizerModel cvModel = new CountVectorizer()
+ .setInputCol("text")
+ .setOutputCol("feature")
+ .setVocabSize(3)
+ .setMinDF(2)
+ .fit(df);
+
+ // alternatively, define CountVectorizerModel with a-priori vocabulary
+ CountVectorizerModel cvm = new CountVectorizerModel(new String[]{"a", "b", "c"})
+ .setInputCol("text")
+ .setOutputCol("feature");
+
+ cvModel.transform(df).show();
+ // $example off$
+ }
+}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaTfIdfExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaTfIdfExample.java
new file mode 100644
index 0000000000..a41a5ec9bf
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaTfIdfExample.java
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+// $example on$
+import java.util.Arrays;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.ml.feature.HashingTF;
+import org.apache.spark.ml.feature.IDF;
+import org.apache.spark.ml.feature.IDFModel;
+import org.apache.spark.ml.feature.Tokenizer;
+import org.apache.spark.mllib.linalg.Vector;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+// $example off$
+
+public class JavaTfIdfExample {
+ public static void main(String[] args) {
+ SparkConf conf = new SparkConf().setAppName("JavaTfIdfExample");
+ JavaSparkContext jsc = new JavaSparkContext(conf);
+ SQLContext sqlContext = new SQLContext(jsc);
+
+ // $example on$
+ JavaRDD jrdd = jsc.parallelize(Arrays.asList(
+ RowFactory.create(0, "Hi I heard about Spark"),
+ RowFactory.create(0, "I wish Java could use case classes"),
+ RowFactory.create(1, "Logistic regression models are neat")
+ ));
+ StructType schema = new StructType(new StructField[]{
+ new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
+ new StructField("sentence", DataTypes.StringType, false, Metadata.empty())
+ });
+ DataFrame sentenceData = sqlContext.createDataFrame(jrdd, schema);
+ Tokenizer tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words");
+ DataFrame wordsData = tokenizer.transform(sentenceData);
+ int numFeatures = 20;
+ HashingTF hashingTF = new HashingTF()
+ .setInputCol("words")
+ .setOutputCol("rawFeatures")
+ .setNumFeatures(numFeatures);
+ DataFrame featurizedData = hashingTF.transform(wordsData);
+ IDF idf = new IDF().setInputCol("rawFeatures").setOutputCol("features");
+ IDFModel idfModel = idf.fit(featurizedData);
+ DataFrame rescaledData = idfModel.transform(featurizedData);
+ for (Row r : rescaledData.select("features", "label").take(3)) {
+ Vector features = r.getAs(0);
+ Double label = r.getDouble(1);
+ System.out.println(features);
+ System.out.println(label);
+ }
+ // $example off$
+
+ jsc.stop();
+ }
+}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaWord2VecExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaWord2VecExample.java
new file mode 100644
index 0000000000..d472375ca9
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaWord2VecExample.java
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+// $example on$
+import java.util.Arrays;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.ml.feature.Word2Vec;
+import org.apache.spark.ml.feature.Word2VecModel;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.types.*;
+// $example off$
+
+public class JavaWord2VecExample {
+ public static void main(String[] args) {
+
+ SparkConf conf = new SparkConf().setAppName("JavaWord2VecExample");
+ JavaSparkContext jsc = new JavaSparkContext(conf);
+ SQLContext sqlContext = new SQLContext(jsc);
+
+ // $example on$
+ // Input data: Each row is a bag of words from a sentence or document.
+ JavaRDD jrdd = jsc.parallelize(Arrays.asList(
+ RowFactory.create(Arrays.asList("Hi I heard about Spark".split(" "))),
+ RowFactory.create(Arrays.asList("I wish Java could use case classes".split(" "))),
+ RowFactory.create(Arrays.asList("Logistic regression models are neat".split(" ")))
+ ));
+ StructType schema = new StructType(new StructField[]{
+ new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty())
+ });
+ DataFrame documentDF = sqlContext.createDataFrame(jrdd, schema);
+
+ // Learn a mapping from words to Vectors.
+ Word2Vec word2Vec = new Word2Vec()
+ .setInputCol("text")
+ .setOutputCol("result")
+ .setVectorSize(3)
+ .setMinCount(0);
+ Word2VecModel model = word2Vec.fit(documentDF);
+ DataFrame result = model.transform(documentDF);
+ for (Row r : result.select("result").take(3)) {
+ System.out.println(r);
+ }
+ // $example off$
+ }
+}
diff --git a/examples/src/main/python/ml/tf_idf_example.py b/examples/src/main/python/ml/tf_idf_example.py
new file mode 100644
index 0000000000..c92313378e
--- /dev/null
+++ b/examples/src/main/python/ml/tf_idf_example.py
@@ -0,0 +1,47 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+# $example on$
+from pyspark.ml.feature import HashingTF, IDF, Tokenizer
+# $example off$
+from pyspark.sql import SQLContext
+
+if __name__ == "__main__":
+ sc = SparkContext(appName="TfIdfExample")
+ sqlContext = SQLContext(sc)
+
+ # $example on$
+ sentenceData = sqlContext.createDataFrame([
+ (0, "Hi I heard about Spark"),
+ (0, "I wish Java could use case classes"),
+ (1, "Logistic regression models are neat")
+ ], ["label", "sentence"])
+ tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
+ wordsData = tokenizer.transform(sentenceData)
+ hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
+ featurizedData = hashingTF.transform(wordsData)
+ idf = IDF(inputCol="rawFeatures", outputCol="features")
+ idfModel = idf.fit(featurizedData)
+ rescaledData = idfModel.transform(featurizedData)
+ for features_label in rescaledData.select("features", "label").take(3):
+ print(features_label)
+ # $example off$
+
+ sc.stop()
diff --git a/examples/src/main/python/ml/word2vec_example.py b/examples/src/main/python/ml/word2vec_example.py
new file mode 100644
index 0000000000..53c77feb10
--- /dev/null
+++ b/examples/src/main/python/ml/word2vec_example.py
@@ -0,0 +1,45 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+from pyspark.sql import SQLContext
+# $example on$
+from pyspark.ml.feature import Word2Vec
+# $example off$
+
+if __name__ == "__main__":
+ sc = SparkContext(appName="Word2VecExample")
+ sqlContext = SQLContext(sc)
+
+ # $example on$
+ # Input data: Each row is a bag of words from a sentence or document.
+ documentDF = sqlContext.createDataFrame([
+ ("Hi I heard about Spark".split(" "), ),
+ ("I wish Java could use case classes".split(" "), ),
+ ("Logistic regression models are neat".split(" "), )
+ ], ["text"])
+ # Learn a mapping from words to Vectors.
+ word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="text", outputCol="result")
+ model = word2Vec.fit(documentDF)
+ result = model.transform(documentDF)
+ for feature in result.select("result").take(3):
+ print(feature)
+ # $example off$
+
+ sc.stop()
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/CountVectorizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/CountVectorizerExample.scala
new file mode 100644
index 0000000000..ba916f66c4
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/CountVectorizerExample.scala
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel}
+// $example off$
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.{SparkConf, SparkContext}
+
+
+object CountVectorizerExample {
+ def main(args: Array[String]) {
+ val conf = new SparkConf().setAppName("CounterVectorizerExample")
+ val sc = new SparkContext(conf)
+ val sqlContext = new SQLContext(sc)
+
+ // $example on$
+ val df = sqlContext.createDataFrame(Seq(
+ (0, Array("a", "b", "c")),
+ (1, Array("a", "b", "b", "c", "a"))
+ )).toDF("id", "words")
+
+ // fit a CountVectorizerModel from the corpus
+ val cvModel: CountVectorizerModel = new CountVectorizer()
+ .setInputCol("words")
+ .setOutputCol("features")
+ .setVocabSize(3)
+ .setMinDF(2)
+ .fit(df)
+
+ // alternatively, define CountVectorizerModel with a-priori vocabulary
+ val cvm = new CountVectorizerModel(Array("a", "b", "c"))
+ .setInputCol("words")
+ .setOutputCol("features")
+
+ cvModel.transform(df).select("features").show()
+ // $example off$
+ }
+}
+// scalastyle:on println
+
+
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/TfIdfExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/TfIdfExample.scala
new file mode 100644
index 0000000000..40c33e4e7d
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/TfIdfExample.scala
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer}
+// $example off$
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.{SparkConf, SparkContext}
+
+object TfIdfExample {
+
+ def main(args: Array[String]) {
+ val conf = new SparkConf().setAppName("TfIdfExample")
+ val sc = new SparkContext(conf)
+ val sqlContext = new SQLContext(sc)
+
+ // $example on$
+ val sentenceData = sqlContext.createDataFrame(Seq(
+ (0, "Hi I heard about Spark"),
+ (0, "I wish Java could use case classes"),
+ (1, "Logistic regression models are neat")
+ )).toDF("label", "sentence")
+
+ val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
+ val wordsData = tokenizer.transform(sentenceData)
+ val hashingTF = new HashingTF()
+ .setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(20)
+ val featurizedData = hashingTF.transform(wordsData)
+ val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
+ val idfModel = idf.fit(featurizedData)
+ val rescaledData = idfModel.transform(featurizedData)
+ rescaledData.select("features", "label").take(3).foreach(println)
+ // $example off$
+ }
+}
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/Word2VecExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/Word2VecExample.scala
new file mode 100644
index 0000000000..631ab4c8ef
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/Word2VecExample.scala
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.Word2Vec
+// $example off$
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.{SparkConf, SparkContext}
+
+object Word2VecExample {
+ def main(args: Array[String]) {
+ val conf = new SparkConf().setAppName("Word2Vec example")
+ val sc = new SparkContext(conf)
+ val sqlContext = new SQLContext(sc)
+
+ // $example on$
+ // Input data: Each row is a bag of words from a sentence or document.
+ val documentDF = sqlContext.createDataFrame(Seq(
+ "Hi I heard about Spark".split(" "),
+ "I wish Java could use case classes".split(" "),
+ "Logistic regression models are neat".split(" ")
+ ).map(Tuple1.apply)).toDF("text")
+
+ // Learn a mapping from words to Vectors.
+ val word2Vec = new Word2Vec()
+ .setInputCol("text")
+ .setOutputCol("result")
+ .setVectorSize(3)
+ .setMinCount(0)
+ val model = word2Vec.fit(documentDF)
+ val result = model.transform(documentDF)
+ result.select("result").take(3).foreach(println)
+ // $example off$
+ }
+}
+// scalastyle:on println