[SPARK-16260][ML][EXAMPLE] PySpark ML Example Improvements and Cleanup
## What changes were proposed in this pull request? 1). Remove unused imports in the Scala examples; 2). Move the SparkSession import outside the `$example on$` / `$example off$` block; 3). Change parameter settings to match the Scala examples; 4). Change comments to be consistent; 5). Make sure that the Scala and Python examples use the same data set. I did one pass and fixed the above issues. Some examples are still missing in Python and might be added later. TODO: Some examples have comments on how to run them, but many do not; we can add them later. ## How was this patch tested? (Please explain how this patch was tested. E.g. unit tests, integration tests, manual tests) Manually tested. Author: wm624@hotmail.com <wm624@hotmail.com> Closes #14021 from wangmiao1981/ann.
This commit is contained in:
parent
2628333978
commit
a539b724c1
|
@ -30,10 +30,12 @@ if __name__ == "__main__":
|
|||
.getOrCreate()
|
||||
|
||||
# $example on$
|
||||
# Create some vector data; also works for sparse vectors
|
||||
data = [(Vectors.dense([1.0, 2.0, 3.0]),), (Vectors.dense([4.0, 5.0, 6.0]),)]
|
||||
df = spark.createDataFrame(data, ["vector"])
|
||||
transformer = ElementwiseProduct(scalingVec=Vectors.dense([0.0, 1.0, 2.0]),
|
||||
inputCol="vector", outputCol="transformedVector")
|
||||
# Batch transform the vectors to create new column:
|
||||
transformer.transform(df).show()
|
||||
# $example off$
|
||||
|
||||
|
|
|
@ -35,7 +35,7 @@ if __name__ == "__main__":
|
|||
(Vectors.dense([0.0, 0.0]),),
|
||||
(Vectors.dense([0.6, -1.1]),)],
|
||||
["features"])
|
||||
px = PolynomialExpansion(degree=2, inputCol="features", outputCol="polyFeatures")
|
||||
px = PolynomialExpansion(degree=3, inputCol="features", outputCol="polyFeatures")
|
||||
polyDF = px.transform(df)
|
||||
for expanded in polyDF.select("polyFeatures").take(3):
|
||||
print(expanded)
|
||||
|
|
|
@ -24,7 +24,7 @@ from pyspark.sql import SparkSession
|
|||
|
||||
|
||||
if __name__ == "__main__":
|
||||
spark = SparkSession.builder.appName("PythonQuantileDiscretizerExample").getOrCreate()
|
||||
spark = SparkSession.builder.appName("QuantileDiscretizerExample").getOrCreate()
|
||||
|
||||
# $example on$
|
||||
data = [(0, 18.0,), (1, 19.0,), (2, 8.0,), (3, 5.0,), (4, 2.2,)]
|
||||
|
|
|
@ -50,7 +50,7 @@ if __name__ == "__main__":
|
|||
(trainingData, testData) = data.randomSplit([0.7, 0.3])
|
||||
|
||||
# Train a RandomForest model.
|
||||
rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
|
||||
rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", numTrees=10)
|
||||
|
||||
# Chain indexers and forest in a Pipeline
|
||||
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf])
|
||||
|
|
|
@ -48,7 +48,7 @@ if __name__ == "__main__":
|
|||
|
||||
# Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
|
||||
tokenizer = Tokenizer(inputCol="text", outputCol="words")
|
||||
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
|
||||
hashingTF = HashingTF(numFeatures=1000, inputCol=tokenizer.getOutputCol(), outputCol="features")
|
||||
lr = LogisticRegression(maxIter=10, regParam=0.001)
|
||||
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
|
||||
|
||||
|
|
|
@ -20,7 +20,6 @@ package org.apache.spark.examples.ml
|
|||
|
||||
import java.io.File
|
||||
|
||||
import com.google.common.io.Files
|
||||
import scopt.OptionParser
|
||||
|
||||
import org.apache.spark.examples.mllib.AbstractParams
|
||||
|
|
|
@ -21,8 +21,8 @@ package org.apache.spark.examples.ml
|
|||
|
||||
// $example on$
|
||||
import org.apache.spark.ml.clustering.GaussianMixture
|
||||
import org.apache.spark.sql.SparkSession
|
||||
// $example off$
|
||||
import org.apache.spark.sql.SparkSession
|
||||
|
||||
/**
|
||||
* An example demonstrating Gaussian Mixture Model (GMM).
|
||||
|
|
|
@ -35,7 +35,7 @@ object NaiveBayesExample {
|
|||
val data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
|
||||
|
||||
// Split the data into training and test sets (30% held out for testing)
|
||||
val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3))
|
||||
val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3), seed = 1234L)
|
||||
|
||||
// Train a NaiveBayes model.
|
||||
val model = new NaiveBayes()
|
||||
|
|
Loading…
Reference in a new issue