[SPARK-16260][ML][EXAMPLE] PySpark ML Example Improvements and Cleanup
## What changes were proposed in this pull request?

1. Removed an unused import from a Scala example.
2. Moved a SparkSession import outside the `// $example off$` marker.
3. Changed parameter settings in the Python examples to match their Scala counterparts.
4. Made comments consistent between the Scala and Python examples.
5. Ensured the Scala and Python examples use the same data sets.

I did one pass and fixed the issues above. Some examples are still missing from Python and might be added later.

TODO: some examples carry comments on how to run them, but many do not; these can be added later.

## How was this patch tested?

Tested manually.

Author: wm624@hotmail.com <wm624@hotmail.com>

Closes #14021 from wangmiao1981/ann.
Commit a539b724c1 (parent 2628333978)
ElementwiseProduct example (Python):

@@ -30,10 +30,12 @@ if __name__ == "__main__":
         .getOrCreate()
 
     # $example on$
+    # Create some vector data; also works for sparse vectors
     data = [(Vectors.dense([1.0, 2.0, 3.0]),), (Vectors.dense([4.0, 5.0, 6.0]),)]
     df = spark.createDataFrame(data, ["vector"])
     transformer = ElementwiseProduct(scalingVec=Vectors.dense([0.0, 1.0, 2.0]),
                                      inputCol="vector", outputCol="transformedVector")
+    # Batch transform the vectors to create new column:
     transformer.transform(df).show()
     # $example off$
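For context, here is a self-contained version of this example as it reads after the change; a minimal sketch, assuming a local PySpark installation (the app name is arbitrary). ElementwiseProduct multiplies every input vector by the scaling vector component-wise:

```python
from pyspark.ml.feature import ElementwiseProduct
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("ElementwiseProductSketch").getOrCreate()

# Create some vector data; also works for sparse vectors.
df = spark.createDataFrame(
    [(Vectors.dense([1.0, 2.0, 3.0]),), (Vectors.dense([4.0, 5.0, 6.0]),)],
    ["vector"])

# Component-wise (Hadamard) product: [1, 2, 3] * [0, 1, 2] -> [0, 2, 6].
transformer = ElementwiseProduct(scalingVec=Vectors.dense([0.0, 1.0, 2.0]),
                                 inputCol="vector", outputCol="transformedVector")
transformer.transform(df).show(truncate=False)

spark.stop()
```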
PolynomialExpansion example (Python):

@@ -35,7 +35,7 @@ if __name__ == "__main__":
         (Vectors.dense([0.0, 0.0]),),
         (Vectors.dense([0.6, -1.1]),)],
         ["features"])
-    px = PolynomialExpansion(degree=2, inputCol="features", outputCol="polyFeatures")
+    px = PolynomialExpansion(degree=3, inputCol="features", outputCol="polyFeatures")
     polyDF = px.transform(df)
     for expanded in polyDF.select("polyFeatures").take(3):
         print(expanded)
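Raising the degree from 2 to 3 brings the Python example in line with the Scala one and widens the output: for a 2-dimensional input, the expansion emits every monomial of degree 1 through 3, i.e. C(5, 2) - 1 = 9 features. A minimal sketch under the same assumptions as above:

```python
from pyspark.ml.feature import PolynomialExpansion
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("PolynomialExpansionSketch").getOrCreate()

df = spark.createDataFrame([(Vectors.dense([2.0, 1.0]),)], ["features"])

# degree=3 expands (x, y) into the 9 monomials of degree 1..3
# (x, y, x^2, xy, y^2, x^3, x^2*y, x*y^2, y^3); no constant term is emitted.
px = PolynomialExpansion(degree=3, inputCol="features", outputCol="polyFeatures")
px.transform(df).show(truncate=False)

spark.stop()
```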
QuantileDiscretizer example (Python):

@@ -24,7 +24,7 @@ from pyspark.sql import SparkSession
 
 
 if __name__ == "__main__":
-    spark = SparkSession.builder.appName("PythonQuantileDiscretizerExample").getOrCreate()
+    spark = SparkSession.builder.appName("QuantileDiscretizerExample").getOrCreate()
 
     # $example on$
     data = [(0, 18.0,), (1, 19.0,), (2, 8.0,), (3, 5.0,), (4, 2.2,)]
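The hunk only shows the app-name change and the input data, so for completeness here is a sketch of how QuantileDiscretizer is typically applied to that data (`numBuckets=3` and the column names are assumptions, not taken from this diff):

```python
from pyspark.ml.feature import QuantileDiscretizer
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("QuantileDiscretizerSketch").getOrCreate()

df = spark.createDataFrame(
    [(0, 18.0), (1, 19.0), (2, 8.0), (3, 5.0), (4, 2.2)], ["id", "hour"])

# Fit quantile boundaries on "hour" and bin each value into one of 3 buckets.
discretizer = QuantileDiscretizer(numBuckets=3, inputCol="hour", outputCol="result")
discretizer.fit(df).transform(df).show()

spark.stop()
```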
RandomForestClassifier example (Python):

@@ -50,7 +50,7 @@ if __name__ == "__main__":
     (trainingData, testData) = data.randomSplit([0.7, 0.3])
 
     # Train a RandomForest model.
-    rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
+    rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", numTrees=10)
 
     # Chain indexers and forest in a Pipeline
     pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf])
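`numTrees=10` makes the forest size explicit, matching the Scala example's `setNumTrees(10)`. In isolation the estimator looks like this (the indexed columns are produced by the `labelIndexer` and `featureIndexer` stages visible above):

```python
from pyspark.ml.classification import RandomForestClassifier

# 10 trees; labelCol/featuresCol point at the outputs of the indexer stages.
rf = RandomForestClassifier(labelCol="indexedLabel",
                            featuresCol="indexedFeatures",
                            numTrees=10)
```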
Pipeline example (Python):

@@ -48,7 +48,7 @@ if __name__ == "__main__":
 
     # Configure an ML pipeline, which consists of tree stages: tokenizer, hashingTF, and lr.
     tokenizer = Tokenizer(inputCol="text", outputCol="words")
-    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
+    hashingTF = HashingTF(numFeatures=1000, inputCol=tokenizer.getOutputCol(), outputCol="features")
     lr = LogisticRegression(maxIter=10, regParam=0.001)
     pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
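`numFeatures=1000` fixes the hashed feature dimensionality, presumably to match the Scala example's `setNumFeatures(1000)`; a smaller feature space means more hash collisions between terms. A standalone sketch with hypothetical toy data:

```python
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("HashingTFSketch").getOrCreate()

df = spark.createDataFrame([(0, "a b c b"), (1, "c d e")], ["id", "text"])

tokenizer = Tokenizer(inputCol="text", outputCol="words")
words = tokenizer.transform(df)

# Hash each term into one of 1000 slots and count occurrences per document.
hashingTF = HashingTF(numFeatures=1000, inputCol="words", outputCol="features")
hashingTF.transform(words).show(truncate=False)

spark.stop()
```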
Scala example (unused import removed):

@@ -20,7 +20,6 @@ package org.apache.spark.examples.ml
 
 import java.io.File
 
-import com.google.common.io.Files
 import scopt.OptionParser
 
 import org.apache.spark.examples.mllib.AbstractParams
GaussianMixture example (Scala):

@@ -21,8 +21,8 @@ package org.apache.spark.examples.ml
 
 // $example on$
 import org.apache.spark.ml.clustering.GaussianMixture
-import org.apache.spark.sql.SparkSession
 // $example off$
+import org.apache.spark.sql.SparkSession
 
 /**
  * An example demonstrating Gaussian Mixture Model (GMM).
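Moving the SparkSession import below `// $example off$` keeps it out of the published snippet: as far as I know, the `$example on$`/`$example off$` markers delimit the region that the documentation build extracts (via its `include_example` mechanism) into the ML guide, so only code between the markers appears there.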
NaiveBayesExample (Scala):

@@ -35,7 +35,7 @@ object NaiveBayesExample {
     val data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
 
     // Split the data into training and test sets (30% held out for testing)
-    val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3))
+    val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3), seed = 1234L)
 
     // Train a NaiveBayes model.
     val model = new NaiveBayes()
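The explicit seed makes the 70/30 split reproducible across runs. A hypothetical PySpark analogue (not part of this patch; the data path is the one the Scala example loads):

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("SeededSplitSketch").getOrCreate()

data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

# Fixing the seed makes the random 70/30 split deterministic run to run.
trainingData, testData = data.randomSplit([0.7, 0.3], seed=1234)
print(trainingData.count(), testData.count())

spark.stop()
```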