[SPARK-11289][DOC] Substitute code examples in ML features extractors with include_example
mengxr https://issues.apache.org/jira/browse/SPARK-11289 I made some changes to the ML feature extractors, i.e. TF-IDF, Word2Vec, and CountVectorizer. I added new example code under spark/examples; I hope that is the right place for those examples. Author: Xusen Yin <yinxusen@gmail.com> Closes #9266 from yinxusen/SPARK-11289.
This commit is contained in:
parent
a150e6c1b0
commit
943d4fa204
|
@ -37,23 +37,7 @@ In the following code segment, we start with a set of sentences. We split each
|
|||
Refer to the [HashingTF Scala docs](api/scala/index.html#org.apache.spark.ml.feature.HashingTF) and
|
||||
the [IDF Scala docs](api/scala/index.html#org.apache.spark.ml.feature.IDF) for more details on the API.
|
||||
|
||||
{% highlight scala %}
|
||||
import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer}
|
||||
|
||||
val sentenceData = sqlContext.createDataFrame(Seq(
|
||||
(0, "Hi I heard about Spark"),
|
||||
(0, "I wish Java could use case classes"),
|
||||
(1, "Logistic regression models are neat")
|
||||
)).toDF("label", "sentence")
|
||||
val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
|
||||
val wordsData = tokenizer.transform(sentenceData)
|
||||
val hashingTF = new HashingTF().setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(20)
|
||||
val featurizedData = hashingTF.transform(wordsData)
|
||||
val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
|
||||
val idfModel = idf.fit(featurizedData)
|
||||
val rescaledData = idfModel.transform(featurizedData)
|
||||
rescaledData.select("features", "label").take(3).foreach(println)
|
||||
{% endhighlight %}
|
||||
{% include_example scala/org/apache/spark/examples/ml/TfIdfExample.scala %}
|
||||
</div>
|
||||
|
||||
<div data-lang="java" markdown="1">
|
||||
|
@ -61,49 +45,7 @@ rescaledData.select("features", "label").take(3).foreach(println)
|
|||
Refer to the [HashingTF Java docs](api/java/org/apache/spark/ml/feature/HashingTF.html) and the
|
||||
[IDF Java docs](api/java/org/apache/spark/ml/feature/IDF.html) for more details on the API.
|
||||
|
||||
{% highlight java %}
|
||||
import java.util.Arrays;
|
||||
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.ml.feature.HashingTF;
|
||||
import org.apache.spark.ml.feature.IDF;
|
||||
import org.apache.spark.ml.feature.Tokenizer;
|
||||
import org.apache.spark.mllib.linalg.Vector;
|
||||
import org.apache.spark.sql.DataFrame;
|
||||
import org.apache.spark.sql.Row;
|
||||
import org.apache.spark.sql.RowFactory;
|
||||
import org.apache.spark.sql.types.DataTypes;
|
||||
import org.apache.spark.sql.types.Metadata;
|
||||
import org.apache.spark.sql.types.StructField;
|
||||
import org.apache.spark.sql.types.StructType;
|
||||
|
||||
JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
|
||||
RowFactory.create(0, "Hi I heard about Spark"),
|
||||
RowFactory.create(0, "I wish Java could use case classes"),
|
||||
RowFactory.create(1, "Logistic regression models are neat")
|
||||
));
|
||||
StructType schema = new StructType(new StructField[]{
|
||||
new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
|
||||
new StructField("sentence", DataTypes.StringType, false, Metadata.empty())
|
||||
});
|
||||
DataFrame sentenceData = sqlContext.createDataFrame(jrdd, schema);
|
||||
Tokenizer tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words");
|
||||
DataFrame wordsData = tokenizer.transform(sentenceData);
|
||||
int numFeatures = 20;
|
||||
HashingTF hashingTF = new HashingTF()
|
||||
.setInputCol("words")
|
||||
.setOutputCol("rawFeatures")
|
||||
.setNumFeatures(numFeatures);
|
||||
DataFrame featurizedData = hashingTF.transform(wordsData);
|
||||
IDF idf = new IDF().setInputCol("rawFeatures").setOutputCol("features");
|
||||
IDFModel idfModel = idf.fit(featurizedData);
|
||||
DataFrame rescaledData = idfModel.transform(featurizedData);
|
||||
for (Row r : rescaledData.select("features", "label").take(3)) {
|
||||
Vector features = r.getAs(0);
|
||||
Double label = r.getDouble(1);
|
||||
System.out.println(features);
|
||||
}
|
||||
{% endhighlight %}
|
||||
{% include_example java/org/apache/spark/examples/ml/JavaTfIdfExample.java %}
|
||||
</div>
|
||||
|
||||
<div data-lang="python" markdown="1">
|
||||
|
@ -111,24 +53,7 @@ for (Row r : rescaledData.select("features", "label").take(3)) {
|
|||
Refer to the [HashingTF Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.HashingTF) and
|
||||
the [IDF Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.IDF) for more details on the API.
|
||||
|
||||
{% highlight python %}
|
||||
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
|
||||
|
||||
sentenceData = sqlContext.createDataFrame([
|
||||
(0, "Hi I heard about Spark"),
|
||||
(0, "I wish Java could use case classes"),
|
||||
(1, "Logistic regression models are neat")
|
||||
], ["label", "sentence"])
|
||||
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
|
||||
wordsData = tokenizer.transform(sentenceData)
|
||||
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
|
||||
featurizedData = hashingTF.transform(wordsData)
|
||||
idf = IDF(inputCol="rawFeatures", outputCol="features")
|
||||
idfModel = idf.fit(featurizedData)
|
||||
rescaledData = idfModel.transform(featurizedData)
|
||||
for features_label in rescaledData.select("features", "label").take(3):
|
||||
print(features_label)
|
||||
{% endhighlight %}
|
||||
{% include_example python/ml/tf_idf_example.py %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
@ -149,26 +74,7 @@ In the following code segment, we start with a set of documents, each of which i
|
|||
Refer to the [Word2Vec Scala docs](api/scala/index.html#org.apache.spark.ml.feature.Word2Vec)
|
||||
for more details on the API.
|
||||
|
||||
{% highlight scala %}
|
||||
import org.apache.spark.ml.feature.Word2Vec
|
||||
|
||||
// Input data: Each row is a bag of words from a sentence or document.
|
||||
val documentDF = sqlContext.createDataFrame(Seq(
|
||||
"Hi I heard about Spark".split(" "),
|
||||
"I wish Java could use case classes".split(" "),
|
||||
"Logistic regression models are neat".split(" ")
|
||||
).map(Tuple1.apply)).toDF("text")
|
||||
|
||||
// Learn a mapping from words to Vectors.
|
||||
val word2Vec = new Word2Vec()
|
||||
.setInputCol("text")
|
||||
.setOutputCol("result")
|
||||
.setVectorSize(3)
|
||||
.setMinCount(0)
|
||||
val model = word2Vec.fit(documentDF)
|
||||
val result = model.transform(documentDF)
|
||||
result.select("result").take(3).foreach(println)
|
||||
{% endhighlight %}
|
||||
{% include_example scala/org/apache/spark/examples/ml/Word2VecExample.scala %}
|
||||
</div>
|
||||
|
||||
<div data-lang="java" markdown="1">
|
||||
|
@ -176,43 +82,7 @@ result.select("result").take(3).foreach(println)
|
|||
Refer to the [Word2Vec Java docs](api/java/org/apache/spark/ml/feature/Word2Vec.html)
|
||||
for more details on the API.
|
||||
|
||||
{% highlight java %}
|
||||
import java.util.Arrays;
|
||||
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.sql.DataFrame;
|
||||
import org.apache.spark.sql.Row;
|
||||
import org.apache.spark.sql.RowFactory;
|
||||
import org.apache.spark.sql.SQLContext;
|
||||
import org.apache.spark.sql.types.*;
|
||||
|
||||
JavaSparkContext jsc = ...
|
||||
SQLContext sqlContext = ...
|
||||
|
||||
// Input data: Each row is a bag of words from a sentence or document.
|
||||
JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
|
||||
RowFactory.create(Arrays.asList("Hi I heard about Spark".split(" "))),
|
||||
RowFactory.create(Arrays.asList("I wish Java could use case classes".split(" "))),
|
||||
RowFactory.create(Arrays.asList("Logistic regression models are neat".split(" ")))
|
||||
));
|
||||
StructType schema = new StructType(new StructField[]{
|
||||
new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty())
|
||||
});
|
||||
DataFrame documentDF = sqlContext.createDataFrame(jrdd, schema);
|
||||
|
||||
// Learn a mapping from words to Vectors.
|
||||
Word2Vec word2Vec = new Word2Vec()
|
||||
.setInputCol("text")
|
||||
.setOutputCol("result")
|
||||
.setVectorSize(3)
|
||||
.setMinCount(0);
|
||||
Word2VecModel model = word2Vec.fit(documentDF);
|
||||
DataFrame result = model.transform(documentDF);
|
||||
for (Row r: result.select("result").take(3)) {
|
||||
System.out.println(r);
|
||||
}
|
||||
{% endhighlight %}
|
||||
{% include_example java/org/apache/spark/examples/ml/JavaWord2VecExample.java %}
|
||||
</div>
|
||||
|
||||
<div data-lang="python" markdown="1">
|
||||
|
@ -220,22 +90,7 @@ for (Row r: result.select("result").take(3)) {
|
|||
Refer to the [Word2Vec Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.Word2Vec)
|
||||
for more details on the API.
|
||||
|
||||
{% highlight python %}
|
||||
from pyspark.ml.feature import Word2Vec
|
||||
|
||||
# Input data: Each row is a bag of words from a sentence or document.
|
||||
documentDF = sqlContext.createDataFrame([
|
||||
("Hi I heard about Spark".split(" "), ),
|
||||
("I wish Java could use case classes".split(" "), ),
|
||||
("Logistic regression models are neat".split(" "), )
|
||||
], ["text"])
|
||||
# Learn a mapping from words to Vectors.
|
||||
word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="text", outputCol="result")
|
||||
model = word2Vec.fit(documentDF)
|
||||
result = model.transform(documentDF)
|
||||
for feature in result.select("result").take(3):
|
||||
print(feature)
|
||||
{% endhighlight %}
|
||||
{% include_example python/ml/word2vec_example.py %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
@ -283,30 +138,7 @@ Refer to the [CountVectorizer Scala docs](api/scala/index.html#org.apache.spark.
|
|||
and the [CountVectorizerModel Scala docs](api/scala/index.html#org.apache.spark.ml.feature.CountVectorizerModel)
|
||||
for more details on the API.
|
||||
|
||||
{% highlight scala %}
|
||||
import org.apache.spark.ml.feature.CountVectorizer
|
||||
import org.apache.spark.mllib.util.CountVectorizerModel
|
||||
|
||||
val df = sqlContext.createDataFrame(Seq(
|
||||
(0, Array("a", "b", "c")),
|
||||
(1, Array("a", "b", "b", "c", "a"))
|
||||
)).toDF("id", "words")
|
||||
|
||||
// fit a CountVectorizerModel from the corpus
|
||||
val cvModel: CountVectorizerModel = new CountVectorizer()
|
||||
.setInputCol("words")
|
||||
.setOutputCol("features")
|
||||
.setVocabSize(3)
|
||||
.setMinDF(2) // a term must appear in more or equal to 2 documents to be included in the vocabulary
|
||||
.fit(df)
|
||||
|
||||
// alternatively, define CountVectorizerModel with a-priori vocabulary
|
||||
val cvm = new CountVectorizerModel(Array("a", "b", "c"))
|
||||
.setInputCol("words")
|
||||
.setOutputCol("features")
|
||||
|
||||
cvModel.transform(df).select("features").show()
|
||||
{% endhighlight %}
|
||||
{% include_example scala/org/apache/spark/examples/ml/CountVectorizerExample.scala %}
|
||||
</div>
|
||||
|
||||
<div data-lang="java" markdown="1">
|
||||
|
@ -315,40 +147,7 @@ Refer to the [CountVectorizer Java docs](api/java/org/apache/spark/ml/feature/Co
|
|||
and the [CountVectorizerModel Java docs](api/java/org/apache/spark/ml/feature/CountVectorizerModel.html)
|
||||
for more details on the API.
|
||||
|
||||
{% highlight java %}
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.ml.feature.CountVectorizer;
|
||||
import org.apache.spark.ml.feature.CountVectorizerModel;
|
||||
import org.apache.spark.sql.DataFrame;
|
||||
import org.apache.spark.sql.Row;
|
||||
import org.apache.spark.sql.RowFactory;
|
||||
import org.apache.spark.sql.types.*;
|
||||
|
||||
// Input data: Each row is a bag of words from a sentence or document.
|
||||
JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
|
||||
RowFactory.create(Arrays.asList("a", "b", "c")),
|
||||
RowFactory.create(Arrays.asList("a", "b", "b", "c", "a"))
|
||||
));
|
||||
StructType schema = new StructType(new StructField [] {
|
||||
new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty())
|
||||
});
|
||||
DataFrame df = sqlContext.createDataFrame(jrdd, schema);
|
||||
|
||||
// fit a CountVectorizerModel from the corpus
|
||||
CountVectorizerModel cvModel = new CountVectorizer()
|
||||
.setInputCol("text")
|
||||
.setOutputCol("feature")
|
||||
.setVocabSize(3)
|
||||
.setMinDF(2) // a term must appear in more or equal to 2 documents to be included in the vocabulary
|
||||
.fit(df);
|
||||
|
||||
// alternatively, define CountVectorizerModel with a-priori vocabulary
|
||||
CountVectorizerModel cvm = new CountVectorizerModel(new String[]{"a", "b", "c"})
|
||||
.setInputCol("text")
|
||||
.setOutputCol("feature");
|
||||
|
||||
cvModel.transform(df).show();
|
||||
{% endhighlight %}
|
||||
{% include_example java/org/apache/spark/examples/ml/JavaCountVectorizerExample.java %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
|
|
@ -0,0 +1,69 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.spark.examples.ml;
|
||||
|
||||
// $example on$
|
||||
import java.util.Arrays;
|
||||
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.ml.feature.CountVectorizer;
|
||||
import org.apache.spark.ml.feature.CountVectorizerModel;
|
||||
import org.apache.spark.sql.DataFrame;
|
||||
import org.apache.spark.sql.Row;
|
||||
import org.apache.spark.sql.RowFactory;
|
||||
import org.apache.spark.sql.SQLContext;
|
||||
import org.apache.spark.sql.types.*;
|
||||
// $example off$
|
||||
|
||||
public class JavaCountVectorizerExample {
|
||||
public static void main(String[] args) {
|
||||
|
||||
SparkConf conf = new SparkConf().setAppName("JavaCountVectorizerExample");
|
||||
JavaSparkContext jsc = new JavaSparkContext(conf);
|
||||
SQLContext sqlContext = new SQLContext(jsc);
|
||||
|
||||
// $example on$
|
||||
// Input data: Each row is a bag of words from a sentence or document.
|
||||
JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
|
||||
RowFactory.create(Arrays.asList("a", "b", "c")),
|
||||
RowFactory.create(Arrays.asList("a", "b", "b", "c", "a"))
|
||||
));
|
||||
StructType schema = new StructType(new StructField [] {
|
||||
new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty())
|
||||
});
|
||||
DataFrame df = sqlContext.createDataFrame(jrdd, schema);
|
||||
|
||||
// fit a CountVectorizerModel from the corpus
|
||||
CountVectorizerModel cvModel = new CountVectorizer()
|
||||
.setInputCol("text")
|
||||
.setOutputCol("feature")
|
||||
.setVocabSize(3)
|
||||
.setMinDF(2)
|
||||
.fit(df);
|
||||
|
||||
// alternatively, define CountVectorizerModel with a-priori vocabulary
|
||||
CountVectorizerModel cvm = new CountVectorizerModel(new String[]{"a", "b", "c"})
|
||||
.setInputCol("text")
|
||||
.setOutputCol("feature");
|
||||
|
||||
cvModel.transform(df).show();
|
||||
// $example off$
|
||||
}
|
||||
}
|
|
@ -0,0 +1,79 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.spark.examples.ml;
|
||||
|
||||
// $example on$
|
||||
import java.util.Arrays;
|
||||
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.ml.feature.HashingTF;
|
||||
import org.apache.spark.ml.feature.IDF;
|
||||
import org.apache.spark.ml.feature.IDFModel;
|
||||
import org.apache.spark.ml.feature.Tokenizer;
|
||||
import org.apache.spark.mllib.linalg.Vector;
|
||||
import org.apache.spark.sql.DataFrame;
|
||||
import org.apache.spark.sql.Row;
|
||||
import org.apache.spark.sql.RowFactory;
|
||||
import org.apache.spark.sql.SQLContext;
|
||||
import org.apache.spark.sql.types.DataTypes;
|
||||
import org.apache.spark.sql.types.Metadata;
|
||||
import org.apache.spark.sql.types.StructField;
|
||||
import org.apache.spark.sql.types.StructType;
|
||||
// $example off$
|
||||
|
||||
public class JavaTfIdfExample {
|
||||
public static void main(String[] args) {
|
||||
SparkConf conf = new SparkConf().setAppName("JavaTfIdfExample");
|
||||
JavaSparkContext jsc = new JavaSparkContext(conf);
|
||||
SQLContext sqlContext = new SQLContext(jsc);
|
||||
|
||||
// $example on$
|
||||
JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
|
||||
RowFactory.create(0, "Hi I heard about Spark"),
|
||||
RowFactory.create(0, "I wish Java could use case classes"),
|
||||
RowFactory.create(1, "Logistic regression models are neat")
|
||||
));
|
||||
StructType schema = new StructType(new StructField[]{
|
||||
new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
|
||||
new StructField("sentence", DataTypes.StringType, false, Metadata.empty())
|
||||
});
|
||||
DataFrame sentenceData = sqlContext.createDataFrame(jrdd, schema);
|
||||
Tokenizer tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words");
|
||||
DataFrame wordsData = tokenizer.transform(sentenceData);
|
||||
int numFeatures = 20;
|
||||
HashingTF hashingTF = new HashingTF()
|
||||
.setInputCol("words")
|
||||
.setOutputCol("rawFeatures")
|
||||
.setNumFeatures(numFeatures);
|
||||
DataFrame featurizedData = hashingTF.transform(wordsData);
|
||||
IDF idf = new IDF().setInputCol("rawFeatures").setOutputCol("features");
|
||||
IDFModel idfModel = idf.fit(featurizedData);
|
||||
DataFrame rescaledData = idfModel.transform(featurizedData);
|
||||
for (Row r : rescaledData.select("features", "label").take(3)) {
|
||||
Vector features = r.getAs(0);
|
||||
Double label = r.getDouble(1);
|
||||
System.out.println(features);
|
||||
System.out.println(label);
|
||||
}
|
||||
// $example off$
|
||||
|
||||
jsc.stop();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,67 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.spark.examples.ml;
|
||||
|
||||
// $example on$
|
||||
import java.util.Arrays;
|
||||
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.ml.feature.Word2Vec;
|
||||
import org.apache.spark.ml.feature.Word2VecModel;
|
||||
import org.apache.spark.sql.DataFrame;
|
||||
import org.apache.spark.sql.Row;
|
||||
import org.apache.spark.sql.RowFactory;
|
||||
import org.apache.spark.sql.SQLContext;
|
||||
import org.apache.spark.sql.types.*;
|
||||
// $example off$
|
||||
|
||||
public class JavaWord2VecExample {
|
||||
public static void main(String[] args) {
|
||||
|
||||
SparkConf conf = new SparkConf().setAppName("JavaWord2VecExample");
|
||||
JavaSparkContext jsc = new JavaSparkContext(conf);
|
||||
SQLContext sqlContext = new SQLContext(jsc);
|
||||
|
||||
// $example on$
|
||||
// Input data: Each row is a bag of words from a sentence or document.
|
||||
JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
|
||||
RowFactory.create(Arrays.asList("Hi I heard about Spark".split(" "))),
|
||||
RowFactory.create(Arrays.asList("I wish Java could use case classes".split(" "))),
|
||||
RowFactory.create(Arrays.asList("Logistic regression models are neat".split(" ")))
|
||||
));
|
||||
StructType schema = new StructType(new StructField[]{
|
||||
new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty())
|
||||
});
|
||||
DataFrame documentDF = sqlContext.createDataFrame(jrdd, schema);
|
||||
|
||||
// Learn a mapping from words to Vectors.
|
||||
Word2Vec word2Vec = new Word2Vec()
|
||||
.setInputCol("text")
|
||||
.setOutputCol("result")
|
||||
.setVectorSize(3)
|
||||
.setMinCount(0);
|
||||
Word2VecModel model = word2Vec.fit(documentDF);
|
||||
DataFrame result = model.transform(documentDF);
|
||||
for (Row r : result.select("result").take(3)) {
|
||||
System.out.println(r);
|
||||
}
|
||||
// $example off$
|
||||
}
|
||||
}
|
47
examples/src/main/python/ml/tf_idf_example.py
Normal file
47
examples/src/main/python/ml/tf_idf_example.py
Normal file
|
@ -0,0 +1,47 @@
|
|||
#
|
||||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
from pyspark import SparkContext
|
||||
# $example on$
|
||||
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
|
||||
# $example off$
|
||||
from pyspark.sql import SQLContext
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: set up the Spark and SQL contexts used by the example.
    sc = SparkContext(appName="TfIdfExample")
    sqlContext = SQLContext(sc)

    # $example on$
    # Three (label, sentence) rows.
    sentenceData = sqlContext.createDataFrame(
        [
            (0, "Hi I heard about Spark"),
            (0, "I wish Java could use case classes"),
            (1, "Logistic regression models are neat"),
        ],
        ["label", "sentence"])

    # sentence -> words
    tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
    wordsData = tokenizer.transform(sentenceData)

    # words -> rawFeatures
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)

    # rawFeatures -> features, using an IDF model fitted on the same data
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    for row in rescaledData.select("features", "label").take(3):
        print(row)
    # $example off$

    sc.stop()
|
45
examples/src/main/python/ml/word2vec_example.py
Normal file
45
examples/src/main/python/ml/word2vec_example.py
Normal file
|
@ -0,0 +1,45 @@
|
|||
#
|
||||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
from pyspark import SparkContext
|
||||
from pyspark.sql import SQLContext
|
||||
# $example on$
|
||||
from pyspark.ml.feature import Word2Vec
|
||||
# $example off$
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: set up the Spark and SQL contexts used by the example.
    sc = SparkContext(appName="Word2VecExample")
    sqlContext = SQLContext(sc)

    # $example on$
    # Input data: Each row is a bag of words from a sentence or document.
    documentDF = sqlContext.createDataFrame(
        [
            ("Hi I heard about Spark".split(" "), ),
            ("I wish Java could use case classes".split(" "), ),
            ("Logistic regression models are neat".split(" "), ),
        ],
        ["text"])

    # Learn a mapping from words to Vectors.
    word2Vec = Word2Vec(
        vectorSize=3,
        minCount=0,
        inputCol="text",
        outputCol="result")
    model = word2Vec.fit(documentDF)

    # Apply the fitted model to the same documents and print the first three results.
    result = model.transform(documentDF)
    for row in result.select("result").take(3):
        print(row)
    # $example off$

    sc.stop()
|
|
@ -0,0 +1,59 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
// scalastyle:off println
|
||||
package org.apache.spark.examples.ml
|
||||
|
||||
// $example on$
|
||||
import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel}
|
||||
// $example off$
|
||||
import org.apache.spark.sql.SQLContext
|
||||
import org.apache.spark.{SparkConf, SparkContext}
|
||||
|
||||
|
||||
/**
 * Example: fits a CountVectorizerModel on a two-row DataFrame of token arrays and shows the
 * resulting "features" column.
 */
object CountVectorizerExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("CountVectorizerExample")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // $example on$
    val df = sqlContext.createDataFrame(Seq(
      (0, Array("a", "b", "c")),
      (1, Array("a", "b", "b", "c", "a"))
    )).toDF("id", "words")

    // fit a CountVectorizerModel from the corpus
    val cvModel: CountVectorizerModel = new CountVectorizer()
      .setInputCol("words")
      .setOutputCol("features")
      .setVocabSize(3)
      // a term must appear in at least 2 documents to be included in the vocabulary
      .setMinDF(2)
      .fit(df)

    // alternatively, define CountVectorizerModel with a-priori vocabulary
    // (shown for illustration only; the fitted cvModel is the one applied below)
    val cvm = new CountVectorizerModel(Array("a", "b", "c"))
      .setInputCol("words")
      .setOutputCol("features")

    cvModel.transform(df).select("features").show()
    // $example off$

    // Release the context once the example has run.
    sc.stop()
  }
}
|
||||
// scalastyle:on println
|
||||
|
||||
|
|
@ -0,0 +1,53 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
// scalastyle:off println
|
||||
package org.apache.spark.examples.ml
|
||||
|
||||
// $example on$
|
||||
import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer}
|
||||
// $example off$
|
||||
import org.apache.spark.sql.SQLContext
|
||||
import org.apache.spark.{SparkConf, SparkContext}
|
||||
|
||||
/**
 * Example: tokenizes three labelled sentences, hashes the tokens into term-frequency vectors,
 * rescales them with a fitted IDF model and prints the resulting (features, label) rows.
 */
object TfIdfExample {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("TfIdfExample")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // $example on$
    val sentenceData = sqlContext.createDataFrame(Seq(
      (0, "Hi I heard about Spark"),
      (0, "I wish Java could use case classes"),
      (1, "Logistic regression models are neat")
    )).toDF("label", "sentence")

    // sentence -> words
    val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
    val wordsData = tokenizer.transform(sentenceData)

    // words -> rawFeatures
    val hashingTF = new HashingTF()
      .setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(20)
    val featurizedData = hashingTF.transform(wordsData)

    // rawFeatures -> features, using an IDF model fitted on the same data
    val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
    val idfModel = idf.fit(featurizedData)
    val rescaledData = idfModel.transform(featurizedData)
    rescaledData.select("features", "label").take(3).foreach(println)
    // $example off$

    // Release the context once the example has run.
    sc.stop()
  }
}
|
||||
// scalastyle:on println
|
|
@ -0,0 +1,53 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
// scalastyle:off println
|
||||
package org.apache.spark.examples.ml
|
||||
|
||||
// $example on$
|
||||
import org.apache.spark.ml.feature.Word2Vec
|
||||
// $example off$
|
||||
import org.apache.spark.sql.SQLContext
|
||||
import org.apache.spark.{SparkConf, SparkContext}
|
||||
|
||||
/**
 * Example: fits a Word2Vec model on three tokenized sentences and prints the "result" column
 * produced by transforming the same documents.
 */
object Word2VecExample {
  def main(args: Array[String]): Unit = {
    // App name follows the "<Name>Example" convention used by the other examples.
    val conf = new SparkConf().setAppName("Word2VecExample")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // $example on$
    // Input data: Each row is a bag of words from a sentence or document.
    val documentDF = sqlContext.createDataFrame(Seq(
      "Hi I heard about Spark".split(" "),
      "I wish Java could use case classes".split(" "),
      "Logistic regression models are neat".split(" ")
    ).map(Tuple1.apply)).toDF("text")

    // Learn a mapping from words to Vectors.
    val word2Vec = new Word2Vec()
      .setInputCol("text")
      .setOutputCol("result")
      .setVectorSize(3)
      .setMinCount(0)
    val model = word2Vec.fit(documentDF)
    val result = model.transform(documentDF)
    result.select("result").take(3).foreach(println)
    // $example off$

    // Release the context once the example has run.
    sc.stop()
  }
}
|
||||
// scalastyle:on println
|
Loading…
Reference in a new issue