[SPARK-11689][ML] Add user guide and example code for LDA under spark.ml
jira: https://issues.apache.org/jira/browse/SPARK-11689 Add simple user guide for LDA under spark.ml and example code under examples/. Use include_example to include example code in the user guide markdown. Check SPARK-11606 for instructions. Original PR is reverted due to document build error. https://github.com/apache/spark/pull/9722 mengxr feynmanliang yinxusen Sorry for the troubling. Author: Yuhao Yang <hhbyyh@gmail.com> Closes #9974 from hhbyyh/ldaMLExample.
This commit is contained in:
parent
a8ceec5e8c
commit
e232720a65
31
docs/ml-clustering.md
Normal file
31
docs/ml-clustering.md
Normal file
|
@ -0,0 +1,31 @@
|
|||
---
|
||||
layout: global
|
||||
title: Clustering - ML
|
||||
displayTitle: <a href="ml-guide.html">ML</a> - Clustering
|
||||
---
|
||||
|
||||
In this section, we introduce the pipeline API for [clustering in mllib](mllib-clustering.html).
|
||||
|
||||
## Latent Dirichlet allocation (LDA)
|
||||
|
||||
`LDA` is implemented as an `Estimator` that supports both `EMLDAOptimizer` and `OnlineLDAOptimizer`,
|
||||
and generates a `LDAModel` as the base model. Expert users may cast a `LDAModel` generated by
|
||||
`EMLDAOptimizer` to a `DistributedLDAModel` if needed.
|
||||
|
||||
<div class="codetabs">
|
||||
|
||||
<div data-lang="scala" markdown="1">
|
||||
|
||||
Refer to the [Scala API docs](api/scala/index.html#org.apache.spark.ml.clustering.LDA) for more details.
|
||||
|
||||
{% include_example scala/org/apache/spark/examples/ml/LDAExample.scala %}
|
||||
</div>
|
||||
|
||||
<div data-lang="java" markdown="1">
|
||||
|
||||
Refer to the [Java API docs](api/java/org/apache/spark/ml/clustering/LDA.html) for more details.
|
||||
|
||||
{% include_example java/org/apache/spark/examples/ml/JavaLDAExample.java %}
|
||||
</div>
|
||||
|
||||
</div>
|
|
@ -40,6 +40,7 @@ Also, some algorithms have additional capabilities in the `spark.ml` API; e.g.,
|
|||
provide class probabilities, and linear models provide model summaries.
|
||||
|
||||
* [Feature extraction, transformation, and selection](ml-features.html)
|
||||
* [Clustering](ml-clustering.html)
|
||||
* [Decision Trees for classification and regression](ml-decision-tree.html)
|
||||
* [Ensembles](ml-ensembles.html)
|
||||
* [Linear methods with elastic net regularization](ml-linear-methods.html)
|
||||
|
@ -950,4 +951,4 @@ model.transform(test)
|
|||
{% endhighlight %}
|
||||
</div>
|
||||
|
||||
</div>
|
||||
</div>
|
|
@ -69,6 +69,7 @@ We list major functionality from both below, with links to detailed guides.
|
|||
concepts. It also contains sections on using algorithms within the Pipelines API, for example:
|
||||
|
||||
* [Feature extraction, transformation, and selection](ml-features.html)
|
||||
* [Clustering](ml-clustering.html)
|
||||
* [Decision trees for classification and regression](ml-decision-tree.html)
|
||||
* [Ensembles](ml-ensembles.html)
|
||||
* [Linear methods with elastic net regularization](ml-linear-methods.html)
|
||||
|
|
|
@ -0,0 +1,97 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.spark.examples.ml;
|
||||
// $example on$
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.api.java.function.Function;
|
||||
import org.apache.spark.ml.clustering.LDA;
|
||||
import org.apache.spark.ml.clustering.LDAModel;
|
||||
import org.apache.spark.mllib.linalg.Vector;
|
||||
import org.apache.spark.mllib.linalg.VectorUDT;
|
||||
import org.apache.spark.mllib.linalg.Vectors;
|
||||
import org.apache.spark.sql.DataFrame;
|
||||
import org.apache.spark.sql.Row;
|
||||
import org.apache.spark.sql.SQLContext;
|
||||
import org.apache.spark.sql.catalyst.expressions.GenericRow;
|
||||
import org.apache.spark.sql.types.Metadata;
|
||||
import org.apache.spark.sql.types.StructField;
|
||||
import org.apache.spark.sql.types.StructType;
|
||||
// $example off$
|
||||
|
||||
/**
|
||||
* An example demonstrating LDA.
|
||||
* Run with
|
||||
* <pre>
|
||||
* bin/run-example ml.JavaLDAExample
|
||||
* </pre>
|
||||
*/
|
||||
public class JavaLDAExample {
|
||||
|
||||
// $example on$
|
||||
private static class ParseVector implements Function<String, Row> {
|
||||
private static final Pattern separator = Pattern.compile(" ");
|
||||
|
||||
@Override
|
||||
public Row call(String line) {
|
||||
String[] tok = separator.split(line);
|
||||
double[] point = new double[tok.length];
|
||||
for (int i = 0; i < tok.length; ++i) {
|
||||
point[i] = Double.parseDouble(tok[i]);
|
||||
}
|
||||
Vector[] points = {Vectors.dense(point)};
|
||||
return new GenericRow(points);
|
||||
}
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
|
||||
String inputFile = "data/mllib/sample_lda_data.txt";
|
||||
|
||||
// Parses the arguments
|
||||
SparkConf conf = new SparkConf().setAppName("JavaLDAExample");
|
||||
JavaSparkContext jsc = new JavaSparkContext(conf);
|
||||
SQLContext sqlContext = new SQLContext(jsc);
|
||||
|
||||
// Loads data
|
||||
JavaRDD<Row> points = jsc.textFile(inputFile).map(new ParseVector());
|
||||
StructField[] fields = {new StructField("features", new VectorUDT(), false, Metadata.empty())};
|
||||
StructType schema = new StructType(fields);
|
||||
DataFrame dataset = sqlContext.createDataFrame(points, schema);
|
||||
|
||||
// Trains a LDA model
|
||||
LDA lda = new LDA()
|
||||
.setK(10)
|
||||
.setMaxIter(10);
|
||||
LDAModel model = lda.fit(dataset);
|
||||
|
||||
System.out.println(model.logLikelihood(dataset));
|
||||
System.out.println(model.logPerplexity(dataset));
|
||||
|
||||
// Shows the result
|
||||
DataFrame topics = model.describeTopics(3);
|
||||
topics.show(false);
|
||||
model.transform(dataset).show(false);
|
||||
|
||||
jsc.stop();
|
||||
}
|
||||
// $example off$
|
||||
}
|
|
@ -0,0 +1,77 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.spark.examples.ml
|
||||
|
||||
// scalastyle:off println
|
||||
import org.apache.spark.{SparkContext, SparkConf}
|
||||
import org.apache.spark.mllib.linalg.{VectorUDT, Vectors}
|
||||
// $example on$
|
||||
import org.apache.spark.ml.clustering.LDA
|
||||
import org.apache.spark.sql.{Row, SQLContext}
|
||||
import org.apache.spark.sql.types.{StructField, StructType}
|
||||
// $example off$
|
||||
|
||||
/**
|
||||
* An example demonstrating LDA using the ML pipeline API.
|
||||
* Run with
|
||||
* {{{
|
||||
* bin/run-example ml.LDAExample
|
||||
* }}}
|
||||
*/
|
||||
object LDAExample {

  /** Name of the DataFrame column that holds the per-document feature vectors. */
  final val FEATURES_COL = "features"

  /**
   * Loads the sample LDA data set, fits an LDA model on it, and prints the model's
   * log likelihood, log perplexity, topic descriptions and the transformed data set.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {

    val input = "data/mllib/sample_lda_data.txt"
    // Creates a Spark context and a SQL context
    val conf = new SparkConf().setAppName(this.getClass.getSimpleName)
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // $example on$
    // Loads data: one dense vector per non-empty input line
    val rowRDD = sc.textFile(input).filter(_.nonEmpty)
      .map(_.split(" ").map(_.toDouble)).map(Vectors.dense).map(Row(_))
    val schema = StructType(Array(StructField(FEATURES_COL, new VectorUDT, false)))
    val dataset = sqlContext.createDataFrame(rowRDD, schema)

    // Trains a LDA model
    val lda = new LDA()
      .setK(10)
      .setMaxIter(10)
      .setFeaturesCol(FEATURES_COL)
    val model = lda.fit(dataset)
    val transformed = model.transform(dataset)

    // Reports the evaluation metrics instead of computing and discarding them
    val ll = model.logLikelihood(dataset)
    val lp = model.logPerplexity(dataset)
    println(s"The lower bound on the log likelihood of the entire corpus: $ll")
    println(s"The upper bound on perplexity: $lp")

    // describeTopics
    val topics = model.describeTopics(3)

    // Shows the result
    topics.show(false)
    transformed.show(false)

    // $example off$
    sc.stop()
  }
}
|
||||
// scalastyle:on println
|
Loading…
Reference in a new issue