[SPARK-14300][DOCS][MLLIB] Scala MLlib examples code merge and clean up
## What changes were proposed in this pull request? https://issues.apache.org/jira/browse/SPARK-14300 Duplicated code found in scala/examples/mllib, below all deleted in this PR: - DenseGaussianMixture.scala - StreamingLinearRegression.scala ## delete reasons: #### delete: mllib/DenseGaussianMixture.scala - duplicate of mllib/GaussianMixtureExample #### delete: mllib/StreamingLinearRegression.scala - duplicate of mllib/StreamingLinearRegressionExample When merging and cleaning up that code, be sure not to disturb the existing `$example on$` and `$example off$` blocks. ## How was this patch tested? Tested with `SKIP_API=1 jekyll` manually to make sure that it works well. Author: Xin Ren <iamshrek@126.com> Closes #12195 from keypointt/SPARK-14300.
This commit is contained in:
parent
fb0a8a8dd7
commit
dcdda19785
|
@ -1,75 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
// scalastyle:off println
|
||||
package org.apache.spark.examples.mllib
|
||||
|
||||
import org.apache.spark.{SparkConf, SparkContext}
|
||||
import org.apache.spark.mllib.clustering.GaussianMixture
|
||||
import org.apache.spark.mllib.linalg.Vectors
|
||||
|
||||
/**
 * An example Gaussian Mixture Model EM app. Run with
 * {{{
 * ./bin/run-example mllib.DenseGaussianMixture <input> <k> <convergenceTol>
 * }}}
 * If you use it as a template to create your own app, please use `spark-submit` to submit your app.
 */
object DenseGaussianMixture {

  def main(args: Array[String]): Unit = {
    if (args.length < 3) {
      println("usage: DenseGmmEM <input file> <k> <convergenceTol> [maxIterations]")
    } else {
      // Optional fourth argument caps the number of EM iterations; default is 100.
      val maxIterations = if (args.length > 3) args(3).toInt else 100
      run(args(0), args(1).toInt, args(2).toDouble, maxIterations)
    }
  }

  /**
   * Fits a Gaussian mixture model to the dense vectors read from `inputFile` and
   * prints the learned component parameters plus soft and hard cluster assignments.
   *
   * @param inputFile      path to a text file of whitespace-separated doubles, one point per line
   * @param k              number of Gaussian mixture components
   * @param convergenceTol log-likelihood change below which EM is considered converged
   * @param maxIterations  maximum number of EM iterations to run
   */
  private def run(inputFile: String, k: Int, convergenceTol: Double, maxIterations: Int): Unit = {
    val conf = new SparkConf().setAppName("Gaussian Mixture Model EM example")
    val ctx = new SparkContext(conf)

    // Parse each line into a dense vector; cached because EM makes multiple passes.
    val data = ctx.textFile(inputFile).map { line =>
      Vectors.dense(line.trim.split(' ').map(_.toDouble))
    }.cache()

    val clusters = new GaussianMixture()
      .setK(k)
      .setConvergenceTol(convergenceTol)
      .setMaxIterations(maxIterations)
      .run(data)

    // Print the learned weight, mean, and covariance of each component.
    for (i <- 0 until clusters.k) {
      println("weight=%f\nmu=%s\nsigma=\n%s\n" format
        (clusters.weights(i), clusters.gaussians(i).mu, clusters.gaussians(i).sigma))
    }

    println("The membership value of each vector to all mixture components (first <= 100):")
    val membership = clusters.predictSoft(data)
    membership.take(100).foreach { x =>
      print(" " + x.mkString(","))
    }
    println()
    println("Cluster labels (first <= 100):")
    val clusterLabels = clusters.predict(data)
    clusterLabels.take(100).foreach { x =>
      print(" " + x)
    }
    println()

    // Release cluster resources before exiting.
    ctx.stop()
  }
}
|
||||
// scalastyle:on println
|
|
@ -1,73 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
// scalastyle:off println
|
||||
package org.apache.spark.examples.mllib
|
||||
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.mllib.linalg.Vectors
|
||||
import org.apache.spark.mllib.regression.{LabeledPoint, StreamingLinearRegressionWithSGD}
|
||||
import org.apache.spark.streaming.{Seconds, StreamingContext}
|
||||
|
||||
/**
 * Train a linear regression model on one stream of data and make predictions
 * on another stream, where the data streams arrive as text files
 * into two different directories.
 *
 * The rows of the text files must be labeled data points in the form
 * `(y,[x1,x2,x3,...,xn])`
 * Where n is the number of features. n must be the same for train and test.
 *
 * Usage: StreamingLinearRegression <trainingDir> <testDir> <batchDuration> <numFeatures>
 *
 * To run on your local machine using the two directories `trainingDir` and `testDir`,
 * with updates every 5 seconds, and 2 features per data point, call:
 *    $ bin/run-example mllib.StreamingLinearRegression trainingDir testDir 5 2
 *
 * As you add text files to `trainingDir` the model will continuously update.
 * Anytime you add text files to `testDir`, you'll see predictions from the current model.
 */
object StreamingLinearRegression {

  def main(args: Array[String]): Unit = {
    if (args.length != 4) {
      System.err.println(
        "Usage: StreamingLinearRegression <trainingDir> <testDir> <batchDuration> <numFeatures>")
      System.exit(1)
    }

    val conf = new SparkConf().setMaster("local").setAppName("StreamingLinearRegression")
    // Batch interval in seconds comes from the third CLI argument.
    val ssc = new StreamingContext(conf, Seconds(args(2).toLong))

    // Each line is parsed as "(label,[f1,f2,...])" into a LabeledPoint.
    val trainingData = ssc.textFileStream(args(0)).map(LabeledPoint.parse)
    val testData = ssc.textFileStream(args(1)).map(LabeledPoint.parse)

    // Start from zero weights; the dimensionality comes from the fourth CLI argument.
    val model = new StreamingLinearRegressionWithSGD()
      .setInitialWeights(Vectors.zeros(args(3).toInt))

    // Continuously update the model on the training stream and print
    // (label, prediction) pairs for each batch of the test stream.
    model.trainOn(trainingData)
    model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print()

    ssc.start()
    ssc.awaitTermination()
  }
}
|
||||
// scalastyle:on println
|
|
@ -26,6 +26,25 @@ import org.apache.spark.mllib.regression.StreamingLinearRegressionWithSGD
|
|||
// $example off$
|
||||
import org.apache.spark.streaming._
|
||||
|
||||
/**
|
||||
* Train a linear regression model on one stream of data and make predictions
|
||||
* on another stream, where the data streams arrive as text files
|
||||
* into two different directories.
|
||||
*
|
||||
* The rows of the text files must be labeled data points in the form
|
||||
* `(y,[x1,x2,x3,...,xn])`
|
||||
* Where n is the number of features. n must be the same for train and test.
|
||||
*
|
||||
* Usage: StreamingLinearRegressionExample <trainingDir> <testDir>
|
||||
*
|
||||
* To run on your local machine using the two directories `trainingDir` and `testDir`,
|
||||
* with updates every 5 seconds, and 2 features per data point, call:
|
||||
* $ bin/run-example mllib.StreamingLinearRegressionExample trainingDir testDir
|
||||
*
|
||||
* As you add text files to `trainingDir` the model will continuously update.
|
||||
* Anytime you add text files to `testDir`, you'll see predictions from the current model.
|
||||
*
|
||||
*/
|
||||
object StreamingLinearRegressionExample {
|
||||
|
||||
def main(args: Array[String]): Unit = {
|
||||
|
|
Loading…
Reference in a new issue