[SPARK-16260][ML][EXAMPLE] PySpark ML Example Improvements and Cleanup

## What changes were proposed in this pull request?
1). Remove an unused import in a Scala example;

2). Move the SparkSession import outside the `$example on$` / `$example off$` block, so it is not pulled into the generated docs snippet;

3). Change parameter settings to match the corresponding Scala examples;

4). Make comments consistent between the Scala and Python examples;

5). Make sure the Scala and Python examples use the same data sets;

I did one pass over the examples and fixed the issues above. Some examples are still missing in Python; they might be added later.

TODO: Some examples carry comments on how to run them, but many are still missing these. We can add them later.

## How was this patch tested?


Manually tested them.

Author: wm624@hotmail.com <wm624@hotmail.com>

Closes #14021 from wangmiao1981/ann.
8 changed files with 8 additions and 7 deletions

examples/src/main/python/ml/elementwise_product_example.py
@@ -30,10 +30,12 @@ if __name__ == "__main__":
         .getOrCreate()

     # $example on$
+    # Create some vector data; also works for sparse vectors
     data = [(Vectors.dense([1.0, 2.0, 3.0]),), (Vectors.dense([4.0, 5.0, 6.0]),)]
     df = spark.createDataFrame(data, ["vector"])
     transformer = ElementwiseProduct(scalingVec=Vectors.dense([0.0, 1.0, 2.0]),
                                      inputCol="vector", outputCol="transformedVector")
+    # Batch transform the vectors to create new column:
     transformer.transform(df).show()
     # $example off$
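For context, the two new comments mirror the Scala ElementwiseProductExample. A minimal standalone sketch of what this snippet computes; the app name and `show()` arguments are illustrative, not from the diff:

```python
from pyspark.ml.feature import ElementwiseProduct
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("ElementwiseProductCheck").getOrCreate()  # hypothetical app name

# Same data as the example: two dense vectors
df = spark.createDataFrame(
    [(Vectors.dense([1.0, 2.0, 3.0]),), (Vectors.dense([4.0, 5.0, 6.0]),)],
    ["vector"])

# Hadamard (element-wise) product with the scaling vector [0.0, 1.0, 2.0]:
# [1, 2, 3] -> [0.0, 2.0, 6.0] and [4, 5, 6] -> [0.0, 5.0, 12.0]
transformer = ElementwiseProduct(scalingVec=Vectors.dense([0.0, 1.0, 2.0]),
                                 inputCol="vector", outputCol="transformedVector")
transformer.transform(df).show(truncate=False)
spark.stop()
```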

examples/src/main/python/ml/polynomial_expansion_example.py
@@ -35,7 +35,7 @@ if __name__ == "__main__":
                                 (Vectors.dense([0.0, 0.0]),),
                                 (Vectors.dense([0.6, -1.1]),)],
                                ["features"])
-    px = PolynomialExpansion(degree=2, inputCol="features", outputCol="polyFeatures")
+    px = PolynomialExpansion(degree=3, inputCol="features", outputCol="polyFeatures")
     polyDF = px.transform(df)
     for expanded in polyDF.select("polyFeatures").take(3):
         print(expanded)
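The degree change keeps the Python example in line with `setDegree(3)` in the Scala PolynomialExpansionExample. A hedged sketch of the effect; the app name and single-row data here are illustrative:

```python
from pyspark.ml.feature import PolynomialExpansion
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("PolynomialExpansionCheck").getOrCreate()  # hypothetical app name

df = spark.createDataFrame([(Vectors.dense([2.0, 1.0]),)], ["features"])

# degree=3 expands a 2-dimensional input (x, y) into all 9 monomials of
# degree <= 3 (the constant term is excluded); degree=2 would give only 5.
px = PolynomialExpansion(degree=3, inputCol="features", outputCol="polyFeatures")
px.transform(df).select("polyFeatures").show(truncate=False)
spark.stop()
```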

examples/src/main/python/ml/quantile_discretizer_example.py
@@ -24,7 +24,7 @@ from pyspark.sql import SparkSession

 if __name__ == "__main__":
-    spark = SparkSession.builder.appName("PythonQuantileDiscretizerExample").getOrCreate()
+    spark = SparkSession.builder.appName("QuantileDiscretizerExample").getOrCreate()

     # $example on$
     data = [(0, 18.0,), (1, 19.0,), (2, 8.0,), (3, 5.0,), (4, 2.2,)]
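Aside from the appName rename, the context line shows the example's input data. A sketch of how that data feeds QuantileDiscretizer; the column names and `numBuckets` value are assumptions, since the hunk does not show the rest of the file:

```python
from pyspark.ml.feature import QuantileDiscretizer
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("QuantileDiscretizerExample").getOrCreate()

# Same data as the example; "id"/"hour" and numBuckets=3 are illustrative
df = spark.createDataFrame([(0, 18.0), (1, 19.0), (2, 8.0), (3, 5.0), (4, 2.2)],
                           ["id", "hour"])
discretizer = QuantileDiscretizer(numBuckets=3, inputCol="hour", outputCol="result")
# fit() estimates the quantile-based bucket boundaries; transform() applies them
discretizer.fit(df).transform(df).show()
spark.stop()
```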

examples/src/main/python/ml/random_forest_classifier_example.py
@@ -50,7 +50,7 @@ if __name__ == "__main__":
     (trainingData, testData) = data.randomSplit([0.7, 0.3])

     # Train a RandomForest model.
-    rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
+    rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", numTrees=10)

     # Chain indexers and forest in a Pipeline
     pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf])
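Setting `numTrees=10` explicitly matches `setNumTrees(10)` in the Scala example instead of silently relying on the estimator's default ensemble size. A minimal sketch; the app name is illustrative:

```python
from pyspark.ml.classification import RandomForestClassifier
from pyspark.sql import SparkSession

# A SparkSession is needed even to construct the estimator in PySpark
spark = SparkSession.builder.appName("RandomForestNumTreesCheck").getOrCreate()  # hypothetical app name

rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures",
                            numTrees=10)
print(rf.getNumTrees())  # -> 10
spark.stop()
```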

examples/src/main/python/ml/pipeline_example.py
@@ -48,7 +48,7 @@ if __name__ == "__main__":
     # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
     tokenizer = Tokenizer(inputCol="text", outputCol="words")
-    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
+    hashingTF = HashingTF(numFeatures=1000, inputCol=tokenizer.getOutputCol(), outputCol="features")
     lr = LogisticRegression(maxIter=10, regParam=0.001)
     pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
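`numFeatures=1000` pins the hashed feature dimension to 1000, matching `setNumFeatures(1000)` in the Scala PipelineExample; without it, HashingTF uses its much larger default. A short sketch; the app name and sample text are illustrative:

```python
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("HashingTFNumFeaturesCheck").getOrCreate()  # hypothetical app name

df = spark.createDataFrame([(0, "spark hashing trick example")], ["id", "text"])
words = Tokenizer(inputCol="text", outputCol="words").transform(df)

# Every output vector has exactly 1000 slots; terms are hashed into them
hashingTF = HashingTF(numFeatures=1000, inputCol="words", outputCol="features")
hashingTF.transform(words).select("features").show(truncate=False)
spark.stop()
```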

examples/src/main/scala/org/apache/spark/examples/ml/DataFrameExample.scala
@@ -20,7 +20,6 @@ package org.apache.spark.examples.ml

 import java.io.File

-import com.google.common.io.Files
 import scopt.OptionParser

 import org.apache.spark.examples.mllib.AbstractParams

examples/src/main/scala/org/apache/spark/examples/ml/GaussianMixtureExample.scala
@@ -21,8 +21,8 @@ package org.apache.spark.examples.ml

 // $example on$
 import org.apache.spark.ml.clustering.GaussianMixture
-import org.apache.spark.sql.SparkSession
 // $example off$
+import org.apache.spark.sql.SparkSession

 /**
  * An example demonstrating Gaussian Mixture Model (GMM).

examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala
@@ -35,7 +35,7 @@ object NaiveBayesExample {
     val data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

     // Split the data into training and test sets (30% held out for testing)
-    val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3))
+    val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3), seed = 1234L)

     // Train a NaiveBayes model.
     val model = new NaiveBayes()
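Passing `seed = 1234L` makes the 70/30 split reproducible across runs. A quick PySpark sketch of the same idea; the app name and data are illustrative:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("SeededSplitCheck").getOrCreate()  # hypothetical app name

df = spark.range(100)
# With a fixed seed, repeating the split on the same DataFrame yields the
# same partitioning, so downstream training output is comparable run to run.
train1, test1 = df.randomSplit([0.7, 0.3], seed=1234)
train2, test2 = df.randomSplit([0.7, 0.3], seed=1234)
print(train1.count() == train2.count())  # True
spark.stop()
```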