[SPARK-15031][SPARK-15134][EXAMPLE][DOC] Use SparkSession and update indent in examples
## What changes were proposed in this pull request? 1, Use `SparkSession` according to [SPARK-15031](https://issues.apache.org/jira/browse/SPARK-15031) 2, Update indent for `SparkContext` according to [SPARK-15134](https://issues.apache.org/jira/browse/SPARK-15134) 3, BTW, remove some duplicate space and add missing '.' ## How was this patch tested? manual tests Author: Zheng RuiFeng <ruifengz@foxmail.com> Closes #13050 from zhengruifeng/use_sparksession.
This commit is contained in:
parent
ba5487c061
commit
9e266d07a4
|
@ -32,7 +32,9 @@ import org.apache.spark.sql.SparkSession;
|
|||
public class JavaDecisionTreeClassificationExample {
|
||||
public static void main(String[] args) {
|
||||
SparkSession spark = SparkSession
|
||||
.builder().appName("JavaDecisionTreeClassificationExample").getOrCreate();
|
||||
.builder()
|
||||
.appName("JavaDecisionTreeClassificationExample")
|
||||
.getOrCreate();
|
||||
|
||||
// $example on$
|
||||
// Load the data stored in LIBSVM format as a DataFrame.
|
||||
|
@ -52,10 +54,10 @@ public class JavaDecisionTreeClassificationExample {
|
|||
VectorIndexerModel featureIndexer = new VectorIndexer()
|
||||
.setInputCol("features")
|
||||
.setOutputCol("indexedFeatures")
|
||||
.setMaxCategories(4) // features with > 4 distinct values are treated as continuous
|
||||
.setMaxCategories(4) // features with > 4 distinct values are treated as continuous.
|
||||
.fit(data);
|
||||
|
||||
// Split the data into training and test sets (30% held out for testing)
|
||||
// Split the data into training and test sets (30% held out for testing).
|
||||
Dataset<Row>[] splits = data.randomSplit(new double[]{0.7, 0.3});
|
||||
Dataset<Row> trainingData = splits[0];
|
||||
Dataset<Row> testData = splits[1];
|
||||
|
@ -71,11 +73,11 @@ public class JavaDecisionTreeClassificationExample {
|
|||
.setOutputCol("predictedLabel")
|
||||
.setLabels(labelIndexer.labels());
|
||||
|
||||
// Chain indexers and tree in a Pipeline
|
||||
// Chain indexers and tree in a Pipeline.
|
||||
Pipeline pipeline = new Pipeline()
|
||||
.setStages(new PipelineStage[]{labelIndexer, featureIndexer, dt, labelConverter});
|
||||
|
||||
// Train model. This also runs the indexers.
|
||||
// Train model. This also runs the indexers.
|
||||
PipelineModel model = pipeline.fit(trainingData);
|
||||
|
||||
// Make predictions.
|
||||
|
@ -84,7 +86,7 @@ public class JavaDecisionTreeClassificationExample {
|
|||
// Select example rows to display.
|
||||
predictions.select("predictedLabel", "label", "features").show(5);
|
||||
|
||||
// Select (prediction, true label) and compute test error
|
||||
// Select (prediction, true label) and compute test error.
|
||||
MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator()
|
||||
.setLabelCol("indexedLabel")
|
||||
.setPredictionCol("prediction")
|
||||
|
|
|
@ -33,7 +33,9 @@ import org.apache.spark.sql.SparkSession;
|
|||
public class JavaDecisionTreeRegressionExample {
|
||||
public static void main(String[] args) {
|
||||
SparkSession spark = SparkSession
|
||||
.builder().appName("JavaDecisionTreeRegressionExample").getOrCreate();
|
||||
.builder()
|
||||
.appName("JavaDecisionTreeRegressionExample")
|
||||
.getOrCreate();
|
||||
// $example on$
|
||||
// Load the data stored in LIBSVM format as a DataFrame.
|
||||
Dataset<Row> data = spark.read().format("libsvm")
|
||||
|
@ -47,7 +49,7 @@ public class JavaDecisionTreeRegressionExample {
|
|||
.setMaxCategories(4)
|
||||
.fit(data);
|
||||
|
||||
// Split the data into training and test sets (30% held out for testing)
|
||||
// Split the data into training and test sets (30% held out for testing).
|
||||
Dataset<Row>[] splits = data.randomSplit(new double[]{0.7, 0.3});
|
||||
Dataset<Row> trainingData = splits[0];
|
||||
Dataset<Row> testData = splits[1];
|
||||
|
@ -56,11 +58,11 @@ public class JavaDecisionTreeRegressionExample {
|
|||
DecisionTreeRegressor dt = new DecisionTreeRegressor()
|
||||
.setFeaturesCol("indexedFeatures");
|
||||
|
||||
// Chain indexer and tree in a Pipeline
|
||||
// Chain indexer and tree in a Pipeline.
|
||||
Pipeline pipeline = new Pipeline()
|
||||
.setStages(new PipelineStage[]{featureIndexer, dt});
|
||||
|
||||
// Train model. This also runs the indexer.
|
||||
// Train model. This also runs the indexer.
|
||||
PipelineModel model = pipeline.fit(trainingData);
|
||||
|
||||
// Make predictions.
|
||||
|
@ -69,7 +71,7 @@ public class JavaDecisionTreeRegressionExample {
|
|||
// Select example rows to display.
|
||||
predictions.select("label", "features").show(5);
|
||||
|
||||
// Select (prediction, true label) and compute test error
|
||||
// Select (prediction, true label) and compute test error.
|
||||
RegressionEvaluator evaluator = new RegressionEvaluator()
|
||||
.setLabelCol("label")
|
||||
.setPredictionCol("prediction")
|
||||
|
|
|
@ -62,7 +62,7 @@ public class JavaDeveloperApiExample {
|
|||
new LabeledPoint(1.0, Vectors.dense(0.0, 1.2, -0.5)));
|
||||
Dataset<Row> training = spark.createDataFrame(localTraining, LabeledPoint.class);
|
||||
|
||||
// Create a LogisticRegression instance. This instance is an Estimator.
|
||||
// Create a LogisticRegression instance. This instance is an Estimator.
|
||||
MyJavaLogisticRegression lr = new MyJavaLogisticRegression();
|
||||
// Print out the parameters, documentation, and any default values.
|
||||
System.out.println("MyJavaLogisticRegression parameters:\n" + lr.explainParams() + "\n");
|
||||
|
@ -70,7 +70,7 @@ public class JavaDeveloperApiExample {
|
|||
// We may set parameters using setter methods.
|
||||
lr.setMaxIter(10);
|
||||
|
||||
// Learn a LogisticRegression model. This uses the parameters stored in lr.
|
||||
// Learn a LogisticRegression model. This uses the parameters stored in lr.
|
||||
MyJavaLogisticRegressionModel model = lr.fit(training);
|
||||
|
||||
// Prepare test data.
|
||||
|
@ -214,7 +214,7 @@ class MyJavaLogisticRegressionModel
|
|||
}
|
||||
|
||||
/**
|
||||
* Number of classes the label can take. 2 indicates binary classification.
|
||||
* Number of classes the label can take. 2 indicates binary classification.
|
||||
*/
|
||||
public int numClasses() { return 2; }
|
||||
|
||||
|
|
|
@ -38,7 +38,9 @@ import org.apache.spark.sql.SparkSession;
|
|||
public class JavaEstimatorTransformerParamExample {
|
||||
public static void main(String[] args) {
|
||||
SparkSession spark = SparkSession
|
||||
.builder().appName("JavaEstimatorTransformerParamExample").getOrCreate();
|
||||
.builder()
|
||||
.appName("JavaEstimatorTransformerParamExample")
|
||||
.getOrCreate();
|
||||
|
||||
// $example on$
|
||||
// Prepare training data.
|
||||
|
|
|
@ -75,11 +75,11 @@ public class JavaGradientBoostedTreeClassifierExample {
|
|||
.setOutputCol("predictedLabel")
|
||||
.setLabels(labelIndexer.labels());
|
||||
|
||||
// Chain indexers and GBT in a Pipeline
|
||||
// Chain indexers and GBT in a Pipeline.
|
||||
Pipeline pipeline = new Pipeline()
|
||||
.setStages(new PipelineStage[] {labelIndexer, featureIndexer, gbt, labelConverter});
|
||||
|
||||
// Train model. This also runs the indexers.
|
||||
// Train model. This also runs the indexers.
|
||||
PipelineModel model = pipeline.fit(trainingData);
|
||||
|
||||
// Make predictions.
|
||||
|
@ -88,7 +88,7 @@ public class JavaGradientBoostedTreeClassifierExample {
|
|||
// Select example rows to display.
|
||||
predictions.select("predictedLabel", "label", "features").show(5);
|
||||
|
||||
// Select (prediction, true label) and compute test error
|
||||
// Select (prediction, true label) and compute test error.
|
||||
MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator()
|
||||
.setLabelCol("indexedLabel")
|
||||
.setPredictionCol("prediction")
|
||||
|
|
|
@ -34,7 +34,9 @@ import org.apache.spark.sql.SparkSession;
|
|||
public class JavaGradientBoostedTreeRegressorExample {
|
||||
public static void main(String[] args) {
|
||||
SparkSession spark = SparkSession
|
||||
.builder().appName("JavaGradientBoostedTreeRegressorExample").getOrCreate();
|
||||
.builder()
|
||||
.appName("JavaGradientBoostedTreeRegressorExample")
|
||||
.getOrCreate();
|
||||
|
||||
// $example on$
|
||||
// Load and parse the data file, converting it to a DataFrame.
|
||||
|
@ -48,7 +50,7 @@ public class JavaGradientBoostedTreeRegressorExample {
|
|||
.setMaxCategories(4)
|
||||
.fit(data);
|
||||
|
||||
// Split the data into training and test sets (30% held out for testing)
|
||||
// Split the data into training and test sets (30% held out for testing).
|
||||
Dataset<Row>[] splits = data.randomSplit(new double[] {0.7, 0.3});
|
||||
Dataset<Row> trainingData = splits[0];
|
||||
Dataset<Row> testData = splits[1];
|
||||
|
@ -59,10 +61,10 @@ public class JavaGradientBoostedTreeRegressorExample {
|
|||
.setFeaturesCol("indexedFeatures")
|
||||
.setMaxIter(10);
|
||||
|
||||
// Chain indexer and GBT in a Pipeline
|
||||
// Chain indexer and GBT in a Pipeline.
|
||||
Pipeline pipeline = new Pipeline().setStages(new PipelineStage[] {featureIndexer, gbt});
|
||||
|
||||
// Train model. This also runs the indexer.
|
||||
// Train model. This also runs the indexer.
|
||||
PipelineModel model = pipeline.fit(trainingData);
|
||||
|
||||
// Make predictions.
|
||||
|
@ -71,7 +73,7 @@ public class JavaGradientBoostedTreeRegressorExample {
|
|||
// Select example rows to display.
|
||||
predictions.select("prediction", "label", "features").show(5);
|
||||
|
||||
// Select (prediction, true label) and compute test error
|
||||
// Select (prediction, true label) and compute test error.
|
||||
RegressionEvaluator evaluator = new RegressionEvaluator()
|
||||
.setLabelCol("label")
|
||||
.setPredictionCol("prediction")
|
||||
|
|
|
@ -30,10 +30,12 @@ import org.apache.spark.sql.SparkSession;
|
|||
public class JavaLinearRegressionWithElasticNetExample {
|
||||
public static void main(String[] args) {
|
||||
SparkSession spark = SparkSession
|
||||
.builder().appName("JavaLinearRegressionWithElasticNetExample").getOrCreate();
|
||||
.builder()
|
||||
.appName("JavaLinearRegressionWithElasticNetExample")
|
||||
.getOrCreate();
|
||||
|
||||
// $example on$
|
||||
// Load training data
|
||||
// Load training data.
|
||||
Dataset<Row> training = spark.read().format("libsvm")
|
||||
.load("data/mllib/sample_linear_regression_data.txt");
|
||||
|
||||
|
@ -42,14 +44,14 @@ public class JavaLinearRegressionWithElasticNetExample {
|
|||
.setRegParam(0.3)
|
||||
.setElasticNetParam(0.8);
|
||||
|
||||
// Fit the model
|
||||
// Fit the model.
|
||||
LinearRegressionModel lrModel = lr.fit(training);
|
||||
|
||||
// Print the coefficients and intercept for linear regression
|
||||
// Print the coefficients and intercept for linear regression.
|
||||
System.out.println("Coefficients: "
|
||||
+ lrModel.coefficients() + " Intercept: " + lrModel.intercept());
|
||||
|
||||
// Summarize the model over the training set and print out some metrics
|
||||
// Summarize the model over the training set and print out some metrics.
|
||||
LinearRegressionTrainingSummary trainingSummary = lrModel.summary();
|
||||
System.out.println("numIterations: " + trainingSummary.totalIterations());
|
||||
System.out.println("objectiveHistory: " + Vectors.dense(trainingSummary.objectiveHistory()));
|
||||
|
|
|
@ -31,7 +31,9 @@ import org.apache.spark.sql.functions;
|
|||
public class JavaLogisticRegressionSummaryExample {
|
||||
public static void main(String[] args) {
|
||||
SparkSession spark = SparkSession
|
||||
.builder().appName("JavaLogisticRegressionSummaryExample").getOrCreate();
|
||||
.builder()
|
||||
.appName("JavaLogisticRegressionSummaryExample")
|
||||
.getOrCreate();
|
||||
|
||||
// Load training data
|
||||
Dataset<Row> training = spark.read().format("libsvm")
|
||||
|
|
|
@ -28,7 +28,9 @@ import org.apache.spark.sql.SparkSession;
|
|||
public class JavaLogisticRegressionWithElasticNetExample {
|
||||
public static void main(String[] args) {
|
||||
SparkSession spark = SparkSession
|
||||
.builder().appName("JavaLogisticRegressionWithElasticNetExample").getOrCreate();
|
||||
.builder()
|
||||
.appName("JavaLogisticRegressionWithElasticNetExample")
|
||||
.getOrCreate();
|
||||
|
||||
// $example on$
|
||||
// Load training data
|
||||
|
|
|
@ -43,7 +43,9 @@ import org.apache.spark.sql.SparkSession;
|
|||
public class JavaModelSelectionViaCrossValidationExample {
|
||||
public static void main(String[] args) {
|
||||
SparkSession spark = SparkSession
|
||||
.builder().appName("JavaModelSelectionViaCrossValidationExample").getOrCreate();
|
||||
.builder()
|
||||
.appName("JavaModelSelectionViaCrossValidationExample")
|
||||
.getOrCreate();
|
||||
|
||||
// $example on$
|
||||
// Prepare training documents, which are labeled.
|
||||
|
|
|
@ -43,7 +43,9 @@ import org.apache.spark.sql.SparkSession;
|
|||
public class JavaModelSelectionViaTrainValidationSplitExample {
|
||||
public static void main(String[] args) {
|
||||
SparkSession spark = SparkSession
|
||||
.builder().appName("JavaModelSelectionViaTrainValidationSplitExample").getOrCreate();
|
||||
.builder()
|
||||
.appName("JavaModelSelectionViaTrainValidationSplitExample")
|
||||
.getOrCreate();
|
||||
|
||||
// $example on$
|
||||
Dataset<Row> data = spark.read().format("libsvm")
|
||||
|
|
|
@ -33,7 +33,9 @@ public class JavaMultilayerPerceptronClassifierExample {
|
|||
|
||||
public static void main(String[] args) {
|
||||
SparkSession spark = SparkSession
|
||||
.builder().appName("JavaMultilayerPerceptronClassifierExample").getOrCreate();
|
||||
.builder()
|
||||
.appName("JavaMultilayerPerceptronClassifierExample")
|
||||
.getOrCreate();
|
||||
|
||||
// $example on$
|
||||
// Load training data
|
||||
|
|
|
@ -35,7 +35,9 @@ import org.apache.spark.sql.types.StructType;
|
|||
public class JavaQuantileDiscretizerExample {
|
||||
public static void main(String[] args) {
|
||||
SparkSession spark = SparkSession
|
||||
.builder().appName("JavaQuantileDiscretizerExample").getOrCreate();
|
||||
.builder()
|
||||
.appName("JavaQuantileDiscretizerExample")
|
||||
.getOrCreate();
|
||||
|
||||
// $example on$
|
||||
List<Row> data = Arrays.asList(
|
||||
|
|
|
@ -33,7 +33,9 @@ import org.apache.spark.sql.SparkSession;
|
|||
public class JavaRandomForestClassifierExample {
|
||||
public static void main(String[] args) {
|
||||
SparkSession spark = SparkSession
|
||||
.builder().appName("JavaRandomForestClassifierExample").getOrCreate();
|
||||
.builder()
|
||||
.appName("JavaRandomForestClassifierExample")
|
||||
.getOrCreate();
|
||||
|
||||
// $example on$
|
||||
// Load and parse the data file, converting it to a DataFrame.
|
||||
|
|
|
@ -34,7 +34,9 @@ import org.apache.spark.sql.SparkSession;
|
|||
public class JavaRandomForestRegressorExample {
|
||||
public static void main(String[] args) {
|
||||
SparkSession spark = SparkSession
|
||||
.builder().appName("JavaRandomForestRegressorExample").getOrCreate();
|
||||
.builder()
|
||||
.appName("JavaRandomForestRegressorExample")
|
||||
.getOrCreate();
|
||||
|
||||
// $example on$
|
||||
// Load and parse the data file, converting it to a DataFrame.
|
||||
|
@ -62,7 +64,7 @@ public class JavaRandomForestRegressorExample {
|
|||
Pipeline pipeline = new Pipeline()
|
||||
.setStages(new PipelineStage[] {featureIndexer, rf});
|
||||
|
||||
// Train model. This also runs the indexer.
|
||||
// Train model. This also runs the indexer.
|
||||
PipelineModel model = pipeline.fit(trainingData);
|
||||
|
||||
// Make predictions.
|
||||
|
|
|
@ -46,7 +46,7 @@ public class JavaSimpleParamsExample {
|
|||
.getOrCreate();
|
||||
|
||||
// Prepare training data.
|
||||
// We use LabeledPoint, which is a JavaBean. Spark SQL can convert RDDs of JavaBeans
|
||||
// We use LabeledPoint, which is a JavaBean. Spark SQL can convert RDDs of JavaBeans
|
||||
// into DataFrames, where it uses the bean metadata to infer the schema.
|
||||
List<LabeledPoint> localTraining = Lists.newArrayList(
|
||||
new LabeledPoint(1.0, Vectors.dense(0.0, 1.1, 0.1)),
|
||||
|
@ -56,7 +56,7 @@ public class JavaSimpleParamsExample {
|
|||
Dataset<Row> training =
|
||||
spark.createDataFrame(localTraining, LabeledPoint.class);
|
||||
|
||||
// Create a LogisticRegression instance. This instance is an Estimator.
|
||||
// Create a LogisticRegression instance. This instance is an Estimator.
|
||||
LogisticRegression lr = new LogisticRegression();
|
||||
// Print out the parameters, documentation, and any default values.
|
||||
System.out.println("LogisticRegression parameters:\n" + lr.explainParams() + "\n");
|
||||
|
@ -65,7 +65,7 @@ public class JavaSimpleParamsExample {
|
|||
lr.setMaxIter(10)
|
||||
.setRegParam(0.01);
|
||||
|
||||
// Learn a LogisticRegression model. This uses the parameters stored in lr.
|
||||
// Learn a LogisticRegression model. This uses the parameters stored in lr.
|
||||
LogisticRegressionModel model1 = lr.fit(training);
|
||||
// Since model1 is a Model (i.e., a Transformer produced by an Estimator),
|
||||
// we can view the parameters it used during fit().
|
||||
|
@ -82,7 +82,7 @@ public class JavaSimpleParamsExample {
|
|||
|
||||
// One can also combine ParamMaps.
|
||||
ParamMap paramMap2 = new ParamMap();
|
||||
paramMap2.put(lr.probabilityCol().w("myProbability")); // Change output column name
|
||||
paramMap2.put(lr.probabilityCol().w("myProbability")); // Change output column name.
|
||||
ParamMap paramMapCombined = paramMap.$plus$plus(paramMap2);
|
||||
|
||||
// Now learn a new model using the paramMapCombined parameters.
|
||||
|
|
|
@ -43,7 +43,9 @@ public class JavaSimpleTextClassificationPipeline {
|
|||
|
||||
public static void main(String[] args) {
|
||||
SparkSession spark = SparkSession
|
||||
.builder().appName("JavaSimpleTextClassificationPipeline").getOrCreate();
|
||||
.builder()
|
||||
.appName("JavaSimpleTextClassificationPipeline")
|
||||
.getOrCreate();
|
||||
|
||||
// Prepare training documents, which are labeled.
|
||||
List<LabeledDocument> localTraining = Lists.newArrayList(
|
||||
|
|
|
@ -47,10 +47,10 @@ object DecisionTreeClassificationExample {
|
|||
val featureIndexer = new VectorIndexer()
|
||||
.setInputCol("features")
|
||||
.setOutputCol("indexedFeatures")
|
||||
.setMaxCategories(4) // features with > 4 distinct values are treated as continuous
|
||||
.setMaxCategories(4) // features with > 4 distinct values are treated as continuous.
|
||||
.fit(data)
|
||||
|
||||
// Split the data into training and test sets (30% held out for testing)
|
||||
// Split the data into training and test sets (30% held out for testing).
|
||||
val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3))
|
||||
|
||||
// Train a DecisionTree model.
|
||||
|
@ -64,11 +64,11 @@ object DecisionTreeClassificationExample {
|
|||
.setOutputCol("predictedLabel")
|
||||
.setLabels(labelIndexer.labels)
|
||||
|
||||
// Chain indexers and tree in a Pipeline
|
||||
// Chain indexers and tree in a Pipeline.
|
||||
val pipeline = new Pipeline()
|
||||
.setStages(Array(labelIndexer, featureIndexer, dt, labelConverter))
|
||||
|
||||
// Train model. This also runs the indexers.
|
||||
// Train model. This also runs the indexers.
|
||||
val model = pipeline.fit(trainingData)
|
||||
|
||||
// Make predictions.
|
||||
|
@ -77,7 +77,7 @@ object DecisionTreeClassificationExample {
|
|||
// Select example rows to display.
|
||||
predictions.select("predictedLabel", "label", "features").show(5)
|
||||
|
||||
// Select (prediction, true label) and compute test error
|
||||
// Select (prediction, true label) and compute test error.
|
||||
val evaluator = new MulticlassClassificationEvaluator()
|
||||
.setLabelCol("indexedLabel")
|
||||
.setPredictionCol("prediction")
|
||||
|
|
|
@ -23,7 +23,6 @@ import scala.language.reflectiveCalls
|
|||
|
||||
import scopt.OptionParser
|
||||
|
||||
import org.apache.spark.{SparkConf, SparkContext}
|
||||
import org.apache.spark.examples.mllib.AbstractParams
|
||||
import org.apache.spark.ml.{Pipeline, PipelineStage, Transformer}
|
||||
import org.apache.spark.ml.classification.{DecisionTreeClassificationModel, DecisionTreeClassifier}
|
||||
|
@ -40,7 +39,7 @@ import org.apache.spark.sql.{DataFrame, SparkSession}
|
|||
* {{{
|
||||
* ./bin/run-example ml.DecisionTreeExample [options]
|
||||
* }}}
|
||||
* Note that Decision Trees can take a large amount of memory. If the run-example command above
|
||||
* Note that Decision Trees can take a large amount of memory. If the run-example command above
|
||||
* fails, try running via spark-submit and specifying the amount of memory as at least 1g.
|
||||
* For local mode, run
|
||||
* {{{
|
||||
|
@ -87,7 +86,7 @@ object DecisionTreeExample {
|
|||
.text(s"min info gain required to create a split, default: ${defaultParams.minInfoGain}")
|
||||
.action((x, c) => c.copy(minInfoGain = x))
|
||||
opt[Double]("fracTest")
|
||||
.text(s"fraction of data to hold out for testing. If given option testInput, " +
|
||||
.text(s"fraction of data to hold out for testing. If given option testInput, " +
|
||||
s"this option is ignored. default: ${defaultParams.fracTest}")
|
||||
.action((x, c) => c.copy(fracTest = x))
|
||||
opt[Boolean]("cacheNodeIds")
|
||||
|
@ -106,7 +105,7 @@ object DecisionTreeExample {
|
|||
s"default: ${defaultParams.checkpointInterval}")
|
||||
.action((x, c) => c.copy(checkpointInterval = x))
|
||||
opt[String]("testInput")
|
||||
.text(s"input path to test dataset. If given, option fracTest is ignored." +
|
||||
.text(s"input path to test dataset. If given, option fracTest is ignored." +
|
||||
s" default: ${defaultParams.testInput}")
|
||||
.action((x, c) => c.copy(testInput = x))
|
||||
opt[String]("dataFormat")
|
||||
|
@ -157,11 +156,10 @@ object DecisionTreeExample {
|
|||
* @param dataFormat "libsvm" or "dense"
|
||||
* @param testInput Path to test dataset.
|
||||
* @param algo Classification or Regression
|
||||
* @param fracTest Fraction of input data to hold out for testing. Ignored if testInput given.
|
||||
* @param fracTest Fraction of input data to hold out for testing. Ignored if testInput given.
|
||||
* @return (training dataset, test dataset)
|
||||
*/
|
||||
private[ml] def loadDatasets(
|
||||
sc: SparkContext,
|
||||
input: String,
|
||||
dataFormat: String,
|
||||
testInput: String,
|
||||
|
@ -200,18 +198,21 @@ object DecisionTreeExample {
|
|||
}
|
||||
|
||||
def run(params: Params) {
|
||||
val conf = new SparkConf().setAppName(s"DecisionTreeExample with $params")
|
||||
val sc = new SparkContext(conf)
|
||||
params.checkpointDir.foreach(sc.setCheckpointDir)
|
||||
val spark = SparkSession
|
||||
.builder
|
||||
.appName(s"DecisionTreeExample with $params")
|
||||
.getOrCreate()
|
||||
|
||||
params.checkpointDir.foreach(spark.sparkContext.setCheckpointDir)
|
||||
val algo = params.algo.toLowerCase
|
||||
|
||||
println(s"DecisionTreeExample with parameters:\n$params")
|
||||
|
||||
// Load training and test data and cache it.
|
||||
val (training: DataFrame, test: DataFrame) =
|
||||
loadDatasets(sc, params.input, params.dataFormat, params.testInput, algo, params.fracTest)
|
||||
loadDatasets(params.input, params.dataFormat, params.testInput, algo, params.fracTest)
|
||||
|
||||
// Set up Pipeline
|
||||
// Set up Pipeline.
|
||||
val stages = new mutable.ArrayBuffer[PipelineStage]()
|
||||
// (1) For classification, re-index classes.
|
||||
val labelColName = if (algo == "classification") "indexedLabel" else "label"
|
||||
|
@ -228,7 +229,7 @@ object DecisionTreeExample {
|
|||
.setOutputCol("indexedFeatures")
|
||||
.setMaxCategories(10)
|
||||
stages += featuresIndexer
|
||||
// (3) Learn Decision Tree
|
||||
// (3) Learn Decision Tree.
|
||||
val dt = algo match {
|
||||
case "classification" =>
|
||||
new DecisionTreeClassifier()
|
||||
|
@ -255,13 +256,13 @@ object DecisionTreeExample {
|
|||
stages += dt
|
||||
val pipeline = new Pipeline().setStages(stages.toArray)
|
||||
|
||||
// Fit the Pipeline
|
||||
// Fit the Pipeline.
|
||||
val startTime = System.nanoTime()
|
||||
val pipelineModel = pipeline.fit(training)
|
||||
val elapsedTime = (System.nanoTime() - startTime) / 1e9
|
||||
println(s"Training time: $elapsedTime seconds")
|
||||
|
||||
// Get the trained Decision Tree from the fitted PipelineModel
|
||||
// Get the trained Decision Tree from the fitted PipelineModel.
|
||||
algo match {
|
||||
case "classification" =>
|
||||
val treeModel = pipelineModel.stages.last.asInstanceOf[DecisionTreeClassificationModel]
|
||||
|
@ -280,7 +281,7 @@ object DecisionTreeExample {
|
|||
case _ => throw new IllegalArgumentException("Algo ${params.algo} not supported.")
|
||||
}
|
||||
|
||||
// Evaluate model on training, test data
|
||||
// Evaluate model on training, test data.
|
||||
algo match {
|
||||
case "classification" =>
|
||||
println("Training data results:")
|
||||
|
@ -296,11 +297,11 @@ object DecisionTreeExample {
|
|||
throw new IllegalArgumentException("Algo ${params.algo} not supported.")
|
||||
}
|
||||
|
||||
sc.stop()
|
||||
spark.stop()
|
||||
}
|
||||
|
||||
/**
|
||||
* Evaluate the given ClassificationModel on data. Print the results.
|
||||
* Evaluate the given ClassificationModel on data. Print the results.
|
||||
* @param model Must fit ClassificationModel abstraction
|
||||
* @param data DataFrame with "prediction" and labelColName columns
|
||||
* @param labelColName Name of the labelCol parameter for the model
|
||||
|
@ -314,7 +315,7 @@ object DecisionTreeExample {
|
|||
val fullPredictions = model.transform(data).cache()
|
||||
val predictions = fullPredictions.select("prediction").rdd.map(_.getDouble(0))
|
||||
val labels = fullPredictions.select(labelColName).rdd.map(_.getDouble(0))
|
||||
// Print number of classes for reference
|
||||
// Print number of classes for reference.
|
||||
val numClasses = MetadataUtils.getNumClasses(fullPredictions.schema(labelColName)) match {
|
||||
case Some(n) => n
|
||||
case None => throw new RuntimeException(
|
||||
|
@ -325,7 +326,7 @@ object DecisionTreeExample {
|
|||
}
|
||||
|
||||
/**
|
||||
* Evaluate the given RegressionModel on data. Print the results.
|
||||
* Evaluate the given RegressionModel on data. Print the results.
|
||||
* @param model Must fit RegressionModel abstraction
|
||||
* @param data DataFrame with "prediction" and labelColName columns
|
||||
* @param labelColName Name of the labelCol parameter for the model
|
||||
|
|
|
@ -46,7 +46,7 @@ object DecisionTreeRegressionExample {
|
|||
.setMaxCategories(4)
|
||||
.fit(data)
|
||||
|
||||
// Split the data into training and test sets (30% held out for testing)
|
||||
// Split the data into training and test sets (30% held out for testing).
|
||||
val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3))
|
||||
|
||||
// Train a DecisionTree model.
|
||||
|
@ -54,11 +54,11 @@ object DecisionTreeRegressionExample {
|
|||
.setLabelCol("label")
|
||||
.setFeaturesCol("indexedFeatures")
|
||||
|
||||
// Chain indexer and tree in a Pipeline
|
||||
// Chain indexer and tree in a Pipeline.
|
||||
val pipeline = new Pipeline()
|
||||
.setStages(Array(featureIndexer, dt))
|
||||
|
||||
// Train model. This also runs the indexer.
|
||||
// Train model. This also runs the indexer.
|
||||
val model = pipeline.fit(trainingData)
|
||||
|
||||
// Make predictions.
|
||||
|
@ -67,7 +67,7 @@ object DecisionTreeRegressionExample {
|
|||
// Select example rows to display.
|
||||
predictions.select("prediction", "label", "features").show(5)
|
||||
|
||||
// Select (prediction, true label) and compute test error
|
||||
// Select (prediction, true label) and compute test error.
|
||||
val evaluator = new RegressionEvaluator()
|
||||
.setLabelCol("label")
|
||||
.setPredictionCol("prediction")
|
||||
|
|
|
@ -50,7 +50,7 @@ object DeveloperApiExample {
|
|||
LabeledPoint(0.0, Vectors.dense(2.0, 1.3, 1.0)),
|
||||
LabeledPoint(1.0, Vectors.dense(0.0, 1.2, -0.5))))
|
||||
|
||||
// Create a LogisticRegression instance. This instance is an Estimator.
|
||||
// Create a LogisticRegression instance. This instance is an Estimator.
|
||||
val lr = new MyLogisticRegression()
|
||||
// Print out the parameters, documentation, and any default values.
|
||||
println("MyLogisticRegression parameters:\n" + lr.explainParams() + "\n")
|
||||
|
@ -58,7 +58,7 @@ object DeveloperApiExample {
|
|||
// We may set parameters using setter methods.
|
||||
lr.setMaxIter(10)
|
||||
|
||||
// Learn a LogisticRegression model. This uses the parameters stored in lr.
|
||||
// Learn a LogisticRegression model. This uses the parameters stored in lr.
|
||||
val model = lr.fit(training.toDF())
|
||||
|
||||
// Prepare test data.
|
||||
|
@ -84,7 +84,7 @@ object DeveloperApiExample {
|
|||
/**
|
||||
* Example of defining a parameter trait for a user-defined type of [[Classifier]].
|
||||
*
|
||||
* NOTE: This is private since it is an example. In practice, you may not want it to be private.
|
||||
* NOTE: This is private since it is an example. In practice, you may not want it to be private.
|
||||
*/
|
||||
private trait MyLogisticRegressionParams extends ClassifierParams {
|
||||
|
||||
|
@ -96,7 +96,7 @@ private trait MyLogisticRegressionParams extends ClassifierParams {
|
|||
* - def getMyParamName
|
||||
* - def setMyParamName
|
||||
* Here, we have a trait to be mixed in with the Estimator and Model (MyLogisticRegression
|
||||
* and MyLogisticRegressionModel). We place the setter (setMaxIter) method in the Estimator
|
||||
* and MyLogisticRegressionModel). We place the setter (setMaxIter) method in the Estimator
|
||||
* class since the maxIter parameter is only used during training (not in the Model).
|
||||
*/
|
||||
val maxIter: IntParam = new IntParam(this, "maxIter", "max number of iterations")
|
||||
|
@ -106,7 +106,7 @@ private trait MyLogisticRegressionParams extends ClassifierParams {
|
|||
/**
|
||||
* Example of defining a type of [[Classifier]].
|
||||
*
|
||||
* NOTE: This is private since it is an example. In practice, you may not want it to be private.
|
||||
* NOTE: This is private since it is an example. In practice, you may not want it to be private.
|
||||
*/
|
||||
private class MyLogisticRegression(override val uid: String)
|
||||
extends Classifier[Vector, MyLogisticRegression, MyLogisticRegressionModel]
|
||||
|
@ -138,7 +138,7 @@ private class MyLogisticRegression(override val uid: String)
|
|||
/**
|
||||
* Example of defining a type of [[ClassificationModel]].
|
||||
*
|
||||
* NOTE: This is private since it is an example. In practice, you may not want it to be private.
|
||||
* NOTE: This is private since it is an example. In practice, you may not want it to be private.
|
||||
*/
|
||||
private class MyLogisticRegressionModel(
|
||||
override val uid: String,
|
||||
|
@ -169,7 +169,7 @@ private class MyLogisticRegressionModel(
|
|||
Vectors.dense(-margin, margin)
|
||||
}
|
||||
|
||||
/** Number of classes the label can take. 2 indicates binary classification. */
|
||||
/** Number of classes the label can take. 2 indicates binary classification. */
|
||||
override val numClasses: Int = 2
|
||||
|
||||
/** Number of features the model was trained on. */
|
||||
|
|
|
@ -43,7 +43,7 @@ object EstimatorTransformerParamExample {
|
|||
(1.0, Vectors.dense(0.0, 1.2, -0.5))
|
||||
)).toDF("label", "features")
|
||||
|
||||
// Create a LogisticRegression instance. This instance is an Estimator.
|
||||
// Create a LogisticRegression instance. This instance is an Estimator.
|
||||
val lr = new LogisticRegression()
|
||||
// Print out the parameters, documentation, and any default values.
|
||||
println("LogisticRegression parameters:\n" + lr.explainParams() + "\n")
|
||||
|
@ -52,7 +52,7 @@ object EstimatorTransformerParamExample {
|
|||
lr.setMaxIter(10)
|
||||
.setRegParam(0.01)
|
||||
|
||||
// Learn a LogisticRegression model. This uses the parameters stored in lr.
|
||||
// Learn a LogisticRegression model. This uses the parameters stored in lr.
|
||||
val model1 = lr.fit(training)
|
||||
// Since model1 is a Model (i.e., a Transformer produced by an Estimator),
|
||||
// we can view the parameters it used during fit().
|
||||
|
@ -63,11 +63,11 @@ object EstimatorTransformerParamExample {
|
|||
// We may alternatively specify parameters using a ParamMap,
|
||||
// which supports several methods for specifying parameters.
|
||||
val paramMap = ParamMap(lr.maxIter -> 20)
|
||||
.put(lr.maxIter, 30) // Specify 1 Param. This overwrites the original maxIter.
|
||||
.put(lr.maxIter, 30) // Specify 1 Param. This overwrites the original maxIter.
|
||||
.put(lr.regParam -> 0.1, lr.threshold -> 0.55) // Specify multiple Params.
|
||||
|
||||
// One can also combine ParamMaps.
|
||||
val paramMap2 = ParamMap(lr.probabilityCol -> "myProbability") // Change output column name
|
||||
val paramMap2 = ParamMap(lr.probabilityCol -> "myProbability") // Change output column name.
|
||||
val paramMapCombined = paramMap ++ paramMap2
|
||||
|
||||
// Now learn a new model using the paramMapCombined parameters.
|
||||
|
|
|
@ -23,13 +23,12 @@ import scala.language.reflectiveCalls
|
|||
|
||||
import scopt.OptionParser
|
||||
|
||||
import org.apache.spark.{SparkConf, SparkContext}
|
||||
import org.apache.spark.examples.mllib.AbstractParams
|
||||
import org.apache.spark.ml.{Pipeline, PipelineStage}
|
||||
import org.apache.spark.ml.classification.{GBTClassificationModel, GBTClassifier}
|
||||
import org.apache.spark.ml.feature.{StringIndexer, VectorIndexer}
|
||||
import org.apache.spark.ml.regression.{GBTRegressionModel, GBTRegressor}
|
||||
import org.apache.spark.sql.DataFrame
|
||||
import org.apache.spark.sql.{DataFrame, SparkSession}
|
||||
|
||||
|
||||
/**
|
||||
|
@ -37,7 +36,7 @@ import org.apache.spark.sql.DataFrame
|
|||
* {{{
|
||||
* ./bin/run-example ml.GBTExample [options]
|
||||
* }}}
|
||||
* Decision Trees and ensembles can take a large amount of memory. If the run-example command
|
||||
* Decision Trees and ensembles can take a large amount of memory. If the run-example command
|
||||
* above fails, try running via spark-submit and specifying the amount of memory as at least 1g.
|
||||
* For local mode, run
|
||||
* {{{
|
||||
|
@ -88,7 +87,7 @@ object GBTExample {
|
|||
.text(s"number of trees in ensemble, default: ${defaultParams.maxIter}")
|
||||
.action((x, c) => c.copy(maxIter = x))
|
||||
opt[Double]("fracTest")
|
||||
.text(s"fraction of data to hold out for testing. If given option testInput, " +
|
||||
.text(s"fraction of data to hold out for testing. If given option testInput, " +
|
||||
s"this option is ignored. default: ${defaultParams.fracTest}")
|
||||
.action((x, c) => c.copy(fracTest = x))
|
||||
opt[Boolean]("cacheNodeIds")
|
||||
|
@ -109,7 +108,7 @@ object GBTExample {
|
|||
s"default: ${defaultParams.checkpointInterval}")
|
||||
.action((x, c) => c.copy(checkpointInterval = x))
|
||||
opt[String]("testInput")
|
||||
.text(s"input path to test dataset. If given, option fracTest is ignored." +
|
||||
.text(s"input path to test dataset. If given, option fracTest is ignored." +
|
||||
s" default: ${defaultParams.testInput}")
|
||||
.action((x, c) => c.copy(testInput = x))
|
||||
opt[String]("dataFormat")
|
||||
|
@ -136,15 +135,18 @@ object GBTExample {
|
|||
}
|
||||
|
||||
def run(params: Params) {
|
||||
val conf = new SparkConf().setAppName(s"GBTExample with $params")
|
||||
val sc = new SparkContext(conf)
|
||||
params.checkpointDir.foreach(sc.setCheckpointDir)
|
||||
val spark = SparkSession
|
||||
.builder
|
||||
.appName(s"GBTExample with $params")
|
||||
.getOrCreate()
|
||||
|
||||
params.checkpointDir.foreach(spark.sparkContext.setCheckpointDir)
|
||||
val algo = params.algo.toLowerCase
|
||||
|
||||
println(s"GBTExample with parameters:\n$params")
|
||||
|
||||
// Load training and test data and cache it.
|
||||
val (training: DataFrame, test: DataFrame) = DecisionTreeExample.loadDatasets(sc, params.input,
|
||||
val (training: DataFrame, test: DataFrame) = DecisionTreeExample.loadDatasets(params.input,
|
||||
params.dataFormat, params.testInput, algo, params.fracTest)
|
||||
|
||||
// Set up Pipeline
|
||||
|
@ -164,7 +166,7 @@ object GBTExample {
|
|||
.setOutputCol("indexedFeatures")
|
||||
.setMaxCategories(10)
|
||||
stages += featuresIndexer
|
||||
// (3) Learn GBT
|
||||
// (3) Learn GBT.
|
||||
val dt = algo match {
|
||||
case "classification" =>
|
||||
new GBTClassifier()
|
||||
|
@ -193,13 +195,13 @@ object GBTExample {
|
|||
stages += dt
|
||||
val pipeline = new Pipeline().setStages(stages.toArray)
|
||||
|
||||
// Fit the Pipeline
|
||||
// Fit the Pipeline.
|
||||
val startTime = System.nanoTime()
|
||||
val pipelineModel = pipeline.fit(training)
|
||||
val elapsedTime = (System.nanoTime() - startTime) / 1e9
|
||||
println(s"Training time: $elapsedTime seconds")
|
||||
|
||||
// Get the trained GBT from the fitted PipelineModel
|
||||
// Get the trained GBT from the fitted PipelineModel.
|
||||
algo match {
|
||||
case "classification" =>
|
||||
val rfModel = pipelineModel.stages.last.asInstanceOf[GBTClassificationModel]
|
||||
|
@ -218,7 +220,7 @@ object GBTExample {
|
|||
case _ => throw new IllegalArgumentException("Algo ${params.algo} not supported.")
|
||||
}
|
||||
|
||||
// Evaluate model on training, test data
|
||||
// Evaluate model on training, test data.
|
||||
algo match {
|
||||
case "classification" =>
|
||||
println("Training data results:")
|
||||
|
@ -234,7 +236,7 @@ object GBTExample {
|
|||
throw new IllegalArgumentException("Algo ${params.algo} not supported.")
|
||||
}
|
||||
|
||||
sc.stop()
|
||||
spark.stop()
|
||||
}
|
||||
}
|
||||
// scalastyle:on println
|
||||
|
|
|
@ -51,7 +51,7 @@ object GradientBoostedTreeClassifierExample {
|
|||
.setMaxCategories(4)
|
||||
.fit(data)
|
||||
|
||||
// Split the data into training and test sets (30% held out for testing)
|
||||
// Split the data into training and test sets (30% held out for testing).
|
||||
val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3))
|
||||
|
||||
// Train a GBT model.
|
||||
|
@ -66,11 +66,11 @@ object GradientBoostedTreeClassifierExample {
|
|||
.setOutputCol("predictedLabel")
|
||||
.setLabels(labelIndexer.labels)
|
||||
|
||||
// Chain indexers and GBT in a Pipeline
|
||||
// Chain indexers and GBT in a Pipeline.
|
||||
val pipeline = new Pipeline()
|
||||
.setStages(Array(labelIndexer, featureIndexer, gbt, labelConverter))
|
||||
|
||||
// Train model. This also runs the indexers.
|
||||
// Train model. This also runs the indexers.
|
||||
val model = pipeline.fit(trainingData)
|
||||
|
||||
// Make predictions.
|
||||
|
@ -79,7 +79,7 @@ object GradientBoostedTreeClassifierExample {
|
|||
// Select example rows to display.
|
||||
predictions.select("predictedLabel", "label", "features").show(5)
|
||||
|
||||
// Select (prediction, true label) and compute test error
|
||||
// Select (prediction, true label) and compute test error.
|
||||
val evaluator = new MulticlassClassificationEvaluator()
|
||||
.setLabelCol("indexedLabel")
|
||||
.setPredictionCol("prediction")
|
||||
|
|
|
@ -45,7 +45,7 @@ object GradientBoostedTreeRegressorExample {
|
|||
.setMaxCategories(4)
|
||||
.fit(data)
|
||||
|
||||
// Split the data into training and test sets (30% held out for testing)
|
||||
// Split the data into training and test sets (30% held out for testing).
|
||||
val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3))
|
||||
|
||||
// Train a GBT model.
|
||||
|
@ -54,11 +54,11 @@ object GradientBoostedTreeRegressorExample {
|
|||
.setFeaturesCol("indexedFeatures")
|
||||
.setMaxIter(10)
|
||||
|
||||
// Chain indexer and GBT in a Pipeline
|
||||
// Chain indexer and GBT in a Pipeline.
|
||||
val pipeline = new Pipeline()
|
||||
.setStages(Array(featureIndexer, gbt))
|
||||
|
||||
// Train model. This also runs the indexer.
|
||||
// Train model. This also runs the indexer.
|
||||
val model = pipeline.fit(trainingData)
|
||||
|
||||
// Make predictions.
|
||||
|
@ -67,7 +67,7 @@ object GradientBoostedTreeRegressorExample {
|
|||
// Select example rows to display.
|
||||
predictions.select("prediction", "label", "features").show(5)
|
||||
|
||||
// Select (prediction, true label) and compute test error
|
||||
// Select (prediction, true label) and compute test error.
|
||||
val evaluator = new RegressionEvaluator()
|
||||
.setLabelCol("label")
|
||||
.setPredictionCol("prediction")
|
||||
|
|
|
@ -22,10 +22,9 @@ import scala.language.reflectiveCalls
|
|||
|
||||
import scopt.OptionParser
|
||||
|
||||
import org.apache.spark.{SparkConf, SparkContext}
|
||||
import org.apache.spark.examples.mllib.AbstractParams
|
||||
import org.apache.spark.ml.regression.LinearRegression
|
||||
import org.apache.spark.sql.DataFrame
|
||||
import org.apache.spark.sql.{DataFrame, SparkSession}
|
||||
|
||||
/**
|
||||
* An example runner for linear regression with elastic-net (mixing L1/L2) regularization.
|
||||
|
@ -74,11 +73,11 @@ object LinearRegressionExample {
|
|||
s"to higher accuracy with the cost of more iterations, default: ${defaultParams.tol}")
|
||||
.action((x, c) => c.copy(tol = x))
|
||||
opt[Double]("fracTest")
|
||||
.text(s"fraction of data to hold out for testing. If given option testInput, " +
|
||||
.text(s"fraction of data to hold out for testing. If given option testInput, " +
|
||||
s"this option is ignored. default: ${defaultParams.fracTest}")
|
||||
.action((x, c) => c.copy(fracTest = x))
|
||||
opt[String]("testInput")
|
||||
.text(s"input path to test dataset. If given, option fracTest is ignored." +
|
||||
.text(s"input path to test dataset. If given, option fracTest is ignored." +
|
||||
s" default: ${defaultParams.testInput}")
|
||||
.action((x, c) => c.copy(testInput = x))
|
||||
opt[String]("dataFormat")
|
||||
|
@ -105,13 +104,15 @@ object LinearRegressionExample {
|
|||
}
|
||||
|
||||
def run(params: Params) {
|
||||
val conf = new SparkConf().setAppName(s"LinearRegressionExample with $params")
|
||||
val sc = new SparkContext(conf)
|
||||
val spark = SparkSession
|
||||
.builder
|
||||
.appName(s"LinearRegressionExample with $params")
|
||||
.getOrCreate()
|
||||
|
||||
println(s"LinearRegressionExample with parameters:\n$params")
|
||||
|
||||
// Load training and test data and cache it.
|
||||
val (training: DataFrame, test: DataFrame) = DecisionTreeExample.loadDatasets(sc, params.input,
|
||||
val (training: DataFrame, test: DataFrame) = DecisionTreeExample.loadDatasets(params.input,
|
||||
params.dataFormat, params.testInput, "regression", params.fracTest)
|
||||
|
||||
val lir = new LinearRegression()
|
||||
|
@ -136,7 +137,7 @@ object LinearRegressionExample {
|
|||
println("Test data results:")
|
||||
DecisionTreeExample.evaluateRegressionModel(lirModel, test, "label")
|
||||
|
||||
sc.stop()
|
||||
spark.stop()
|
||||
}
|
||||
}
|
||||
// scalastyle:on println
|
||||
|
|
|
@ -23,12 +23,11 @@ import scala.language.reflectiveCalls
|
|||
|
||||
import scopt.OptionParser
|
||||
|
||||
import org.apache.spark.{SparkConf, SparkContext}
|
||||
import org.apache.spark.examples.mllib.AbstractParams
|
||||
import org.apache.spark.ml.{Pipeline, PipelineStage}
|
||||
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
|
||||
import org.apache.spark.ml.feature.StringIndexer
|
||||
import org.apache.spark.sql.DataFrame
|
||||
import org.apache.spark.sql.{DataFrame, SparkSession}
|
||||
|
||||
/**
|
||||
* An example runner for logistic regression with elastic-net (mixing L1/L2) regularization.
|
||||
|
@ -81,11 +80,11 @@ object LogisticRegressionExample {
|
|||
s"to higher accuracy with the cost of more iterations, default: ${defaultParams.tol}")
|
||||
.action((x, c) => c.copy(tol = x))
|
||||
opt[Double]("fracTest")
|
||||
.text(s"fraction of data to hold out for testing. If given option testInput, " +
|
||||
.text(s"fraction of data to hold out for testing. If given option testInput, " +
|
||||
s"this option is ignored. default: ${defaultParams.fracTest}")
|
||||
.action((x, c) => c.copy(fracTest = x))
|
||||
opt[String]("testInput")
|
||||
.text(s"input path to test dataset. If given, option fracTest is ignored." +
|
||||
.text(s"input path to test dataset. If given, option fracTest is ignored." +
|
||||
s" default: ${defaultParams.testInput}")
|
||||
.action((x, c) => c.copy(testInput = x))
|
||||
opt[String]("dataFormat")
|
||||
|
@ -112,16 +111,18 @@ object LogisticRegressionExample {
|
|||
}
|
||||
|
||||
def run(params: Params) {
|
||||
val conf = new SparkConf().setAppName(s"LogisticRegressionExample with $params")
|
||||
val sc = new SparkContext(conf)
|
||||
val spark = SparkSession
|
||||
.builder
|
||||
.appName(s"LogisticRegressionExample with $params")
|
||||
.getOrCreate()
|
||||
|
||||
println(s"LogisticRegressionExample with parameters:\n$params")
|
||||
|
||||
// Load training and test data and cache it.
|
||||
val (training: DataFrame, test: DataFrame) = DecisionTreeExample.loadDatasets(sc, params.input,
|
||||
val (training: DataFrame, test: DataFrame) = DecisionTreeExample.loadDatasets(params.input,
|
||||
params.dataFormat, params.testInput, "classification", params.fracTest)
|
||||
|
||||
// Set up Pipeline
|
||||
// Set up Pipeline.
|
||||
val stages = new mutable.ArrayBuffer[PipelineStage]()
|
||||
|
||||
val labelIndexer = new StringIndexer()
|
||||
|
@ -141,7 +142,7 @@ object LogisticRegressionExample {
|
|||
stages += lor
|
||||
val pipeline = new Pipeline().setStages(stages.toArray)
|
||||
|
||||
// Fit the Pipeline
|
||||
// Fit the Pipeline.
|
||||
val startTime = System.nanoTime()
|
||||
val pipelineModel = pipeline.fit(training)
|
||||
val elapsedTime = (System.nanoTime() - startTime) / 1e9
|
||||
|
@ -156,7 +157,7 @@ object LogisticRegressionExample {
|
|||
println("Test data results:")
|
||||
DecisionTreeExample.evaluateClassificationModel(pipelineModel, test, "indexedLabel")
|
||||
|
||||
sc.stop()
|
||||
spark.stop()
|
||||
}
|
||||
}
|
||||
// scalastyle:on println
|
||||
|
|
|
@ -27,7 +27,9 @@ object LogisticRegressionWithElasticNetExample {
|
|||
|
||||
def main(args: Array[String]): Unit = {
|
||||
val spark = SparkSession
|
||||
.builder.appName("LogisticRegressionWithElasticNetExample").getOrCreate()
|
||||
.builder
|
||||
.appName("LogisticRegressionWithElasticNetExample")
|
||||
.getOrCreate()
|
||||
|
||||
// $example on$
|
||||
// Load training data
|
||||
|
|
|
@ -42,7 +42,9 @@ object ModelSelectionViaCrossValidationExample {
|
|||
|
||||
def main(args: Array[String]): Unit = {
|
||||
val spark = SparkSession
|
||||
.builder.appName("ModelSelectionViaCrossValidationExample").getOrCreate()
|
||||
.builder
|
||||
.appName("ModelSelectionViaCrossValidationExample")
|
||||
.getOrCreate()
|
||||
|
||||
// $example on$
|
||||
// Prepare training data from a list of (id, text, label) tuples.
|
||||
|
|
|
@ -36,7 +36,9 @@ object ModelSelectionViaTrainValidationSplitExample {
|
|||
|
||||
def main(args: Array[String]): Unit = {
|
||||
val spark = SparkSession
|
||||
.builder.appName("ModelSelectionViaTrainValidationSplitExample").getOrCreate()
|
||||
.builder
|
||||
.appName("ModelSelectionViaTrainValidationSplitExample")
|
||||
.getOrCreate()
|
||||
|
||||
// $example on$
|
||||
// Prepare training and test data.
|
||||
|
|
|
@ -51,7 +51,7 @@ object RandomForestClassifierExample {
|
|||
.setMaxCategories(4)
|
||||
.fit(data)
|
||||
|
||||
// Split the data into training and test sets (30% held out for testing)
|
||||
// Split the data into training and test sets (30% held out for testing).
|
||||
val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3))
|
||||
|
||||
// Train a RandomForest model.
|
||||
|
@ -66,11 +66,11 @@ object RandomForestClassifierExample {
|
|||
.setOutputCol("predictedLabel")
|
||||
.setLabels(labelIndexer.labels)
|
||||
|
||||
// Chain indexers and forest in a Pipeline
|
||||
// Chain indexers and forest in a Pipeline.
|
||||
val pipeline = new Pipeline()
|
||||
.setStages(Array(labelIndexer, featureIndexer, rf, labelConverter))
|
||||
|
||||
// Train model. This also runs the indexers.
|
||||
// Train model. This also runs the indexers.
|
||||
val model = pipeline.fit(trainingData)
|
||||
|
||||
// Make predictions.
|
||||
|
@ -79,7 +79,7 @@ object RandomForestClassifierExample {
|
|||
// Select example rows to display.
|
||||
predictions.select("predictedLabel", "label", "features").show(5)
|
||||
|
||||
// Select (prediction, true label) and compute test error
|
||||
// Select (prediction, true label) and compute test error.
|
||||
val evaluator = new MulticlassClassificationEvaluator()
|
||||
.setLabelCol("indexedLabel")
|
||||
.setPredictionCol("prediction")
|
||||
|
|
|
@ -23,13 +23,12 @@ import scala.language.reflectiveCalls
|
|||
|
||||
import scopt.OptionParser
|
||||
|
||||
import org.apache.spark.{SparkConf, SparkContext}
|
||||
import org.apache.spark.examples.mllib.AbstractParams
|
||||
import org.apache.spark.ml.{Pipeline, PipelineStage}
|
||||
import org.apache.spark.ml.classification.{RandomForestClassificationModel, RandomForestClassifier}
|
||||
import org.apache.spark.ml.feature.{StringIndexer, VectorIndexer}
|
||||
import org.apache.spark.ml.regression.{RandomForestRegressionModel, RandomForestRegressor}
|
||||
import org.apache.spark.sql.DataFrame
|
||||
import org.apache.spark.sql.{DataFrame, SparkSession}
|
||||
|
||||
|
||||
/**
|
||||
|
@ -37,7 +36,7 @@ import org.apache.spark.sql.DataFrame
|
|||
* {{{
|
||||
* ./bin/run-example ml.RandomForestExample [options]
|
||||
* }}}
|
||||
* Decision Trees and ensembles can take a large amount of memory. If the run-example command
|
||||
* Decision Trees and ensembles can take a large amount of memory. If the run-example command
|
||||
* above fails, try running via spark-submit and specifying the amount of memory as at least 1g.
|
||||
* For local mode, run
|
||||
* {{{
|
||||
|
@ -94,7 +93,7 @@ object RandomForestExample {
|
|||
s" default: ${defaultParams.numTrees}")
|
||||
.action((x, c) => c.copy(featureSubsetStrategy = x))
|
||||
opt[Double]("fracTest")
|
||||
.text(s"fraction of data to hold out for testing. If given option testInput, " +
|
||||
.text(s"fraction of data to hold out for testing. If given option testInput, " +
|
||||
s"this option is ignored. default: ${defaultParams.fracTest}")
|
||||
.action((x, c) => c.copy(fracTest = x))
|
||||
opt[Boolean]("cacheNodeIds")
|
||||
|
@ -115,7 +114,7 @@ object RandomForestExample {
|
|||
s"default: ${defaultParams.checkpointInterval}")
|
||||
.action((x, c) => c.copy(checkpointInterval = x))
|
||||
opt[String]("testInput")
|
||||
.text(s"input path to test dataset. If given, option fracTest is ignored." +
|
||||
.text(s"input path to test dataset. If given, option fracTest is ignored." +
|
||||
s" default: ${defaultParams.testInput}")
|
||||
.action((x, c) => c.copy(testInput = x))
|
||||
opt[String]("dataFormat")
|
||||
|
@ -142,18 +141,21 @@ object RandomForestExample {
|
|||
}
|
||||
|
||||
def run(params: Params) {
|
||||
val conf = new SparkConf().setAppName(s"RandomForestExample with $params")
|
||||
val sc = new SparkContext(conf)
|
||||
params.checkpointDir.foreach(sc.setCheckpointDir)
|
||||
val spark = SparkSession
|
||||
.builder
|
||||
.appName(s"RandomForestExample with $params")
|
||||
.getOrCreate()
|
||||
|
||||
params.checkpointDir.foreach(spark.sparkContext.setCheckpointDir)
|
||||
val algo = params.algo.toLowerCase
|
||||
|
||||
println(s"RandomForestExample with parameters:\n$params")
|
||||
|
||||
// Load training and test data and cache it.
|
||||
val (training: DataFrame, test: DataFrame) = DecisionTreeExample.loadDatasets(sc, params.input,
|
||||
val (training: DataFrame, test: DataFrame) = DecisionTreeExample.loadDatasets(params.input,
|
||||
params.dataFormat, params.testInput, algo, params.fracTest)
|
||||
|
||||
// Set up Pipeline
|
||||
// Set up Pipeline.
|
||||
val stages = new mutable.ArrayBuffer[PipelineStage]()
|
||||
// (1) For classification, re-index classes.
|
||||
val labelColName = if (algo == "classification") "indexedLabel" else "label"
|
||||
|
@ -170,7 +172,7 @@ object RandomForestExample {
|
|||
.setOutputCol("indexedFeatures")
|
||||
.setMaxCategories(10)
|
||||
stages += featuresIndexer
|
||||
// (3) Learn Random Forest
|
||||
// (3) Learn Random Forest.
|
||||
val dt = algo match {
|
||||
case "classification" =>
|
||||
new RandomForestClassifier()
|
||||
|
@ -201,13 +203,13 @@ object RandomForestExample {
|
|||
stages += dt
|
||||
val pipeline = new Pipeline().setStages(stages.toArray)
|
||||
|
||||
// Fit the Pipeline
|
||||
// Fit the Pipeline.
|
||||
val startTime = System.nanoTime()
|
||||
val pipelineModel = pipeline.fit(training)
|
||||
val elapsedTime = (System.nanoTime() - startTime) / 1e9
|
||||
println(s"Training time: $elapsedTime seconds")
|
||||
|
||||
// Get the trained Random Forest from the fitted PipelineModel
|
||||
// Get the trained Random Forest from the fitted PipelineModel.
|
||||
algo match {
|
||||
case "classification" =>
|
||||
val rfModel = pipelineModel.stages.last.asInstanceOf[RandomForestClassificationModel]
|
||||
|
@ -226,7 +228,7 @@ object RandomForestExample {
|
|||
case _ => throw new IllegalArgumentException("Algo ${params.algo} not supported.")
|
||||
}
|
||||
|
||||
// Evaluate model on training, test data
|
||||
// Evaluate model on training, test data.
|
||||
algo match {
|
||||
case "classification" =>
|
||||
println("Training data results:")
|
||||
|
@ -242,7 +244,7 @@ object RandomForestExample {
|
|||
throw new IllegalArgumentException("Algo ${params.algo} not supported.")
|
||||
}
|
||||
|
||||
sc.stop()
|
||||
spark.stop()
|
||||
}
|
||||
}
|
||||
// scalastyle:on println
|
||||
|
|
|
@ -45,7 +45,7 @@ object RandomForestRegressorExample {
|
|||
.setMaxCategories(4)
|
||||
.fit(data)
|
||||
|
||||
// Split the data into training and test sets (30% held out for testing)
|
||||
// Split the data into training and test sets (30% held out for testing).
|
||||
val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3))
|
||||
|
||||
// Train a RandomForest model.
|
||||
|
@ -53,11 +53,11 @@ object RandomForestRegressorExample {
|
|||
.setLabelCol("label")
|
||||
.setFeaturesCol("indexedFeatures")
|
||||
|
||||
// Chain indexer and forest in a Pipeline
|
||||
// Chain indexer and forest in a Pipeline.
|
||||
val pipeline = new Pipeline()
|
||||
.setStages(Array(featureIndexer, rf))
|
||||
|
||||
// Train model. This also runs the indexer.
|
||||
// Train model. This also runs the indexer.
|
||||
val model = pipeline.fit(trainingData)
|
||||
|
||||
// Make predictions.
|
||||
|
@ -66,7 +66,7 @@ object RandomForestRegressorExample {
|
|||
// Select example rows to display.
|
||||
predictions.select("prediction", "label", "features").show(5)
|
||||
|
||||
// Select (prediction, true label) and compute test error
|
||||
// Select (prediction, true label) and compute test error.
|
||||
val evaluator = new RegressionEvaluator()
|
||||
.setLabelCol("label")
|
||||
.setPredictionCol("prediction")
|
||||
|
|
|
@ -41,7 +41,7 @@ object SimpleParamsExample {
|
|||
import spark.implicits._
|
||||
|
||||
// Prepare training data.
|
||||
// We use LabeledPoint, which is a case class. Spark SQL can convert RDDs of case classes
|
||||
// We use LabeledPoint, which is a case class. Spark SQL can convert RDDs of case classes
|
||||
// into DataFrames, where it uses the case class metadata to infer the schema.
|
||||
val training = spark.createDataFrame(Seq(
|
||||
LabeledPoint(1.0, Vectors.dense(0.0, 1.1, 0.1)),
|
||||
|
@ -49,7 +49,7 @@ object SimpleParamsExample {
|
|||
LabeledPoint(0.0, Vectors.dense(2.0, 1.3, 1.0)),
|
||||
LabeledPoint(1.0, Vectors.dense(0.0, 1.2, -0.5))))
|
||||
|
||||
// Create a LogisticRegression instance. This instance is an Estimator.
|
||||
// Create a LogisticRegression instance. This instance is an Estimator.
|
||||
val lr = new LogisticRegression()
|
||||
// Print out the parameters, documentation, and any default values.
|
||||
println("LogisticRegression parameters:\n" + lr.explainParams() + "\n")
|
||||
|
@ -58,7 +58,7 @@ object SimpleParamsExample {
|
|||
lr.setMaxIter(10)
|
||||
.setRegParam(0.01)
|
||||
|
||||
// Learn a LogisticRegression model. This uses the parameters stored in lr.
|
||||
// Learn a LogisticRegression model. This uses the parameters stored in lr.
|
||||
val model1 = lr.fit(training)
|
||||
// Since model1 is a Model (i.e., a Transformer produced by an Estimator),
|
||||
// we can view the parameters it used during fit().
|
||||
|
@ -69,7 +69,7 @@ object SimpleParamsExample {
|
|||
// We may alternatively specify parameters using a ParamMap,
|
||||
// which supports several methods for specifying parameters.
|
||||
val paramMap = ParamMap(lr.maxIter -> 20)
|
||||
paramMap.put(lr.maxIter, 30) // Specify 1 Param. This overwrites the original maxIter.
|
||||
paramMap.put(lr.maxIter, 30) // Specify 1 Param. This overwrites the original maxIter.
|
||||
paramMap.put(lr.regParam -> 0.1, lr.thresholds -> Array(0.45, 0.55)) // Specify multiple Params.
|
||||
|
||||
// One can also combine ParamMaps.
|
||||
|
|
Loading…
Reference in a new issue