[SPARK-10518] [DOCS] Update code examples in spark.ml user guide to use LIBSVM data source instead of MLUtils

I updated the example code in the spark.ml user guide to use the LIBSVM data source instead of MLUtils.

Author: y-shimizu <y.shimizu0429@gmail.com>

Closes #8697 from y-shimizu/SPARK-10518.
Authored by y-shimizu on 2015-09-11 08:27:30 -07:00; committed by Xiangrui Meng
parent 9bbe33f318
commit c268ca4ddd
3 changed files with 47 additions and 104 deletions
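For context, here is a minimal before/after sketch (not part of the commit) of the pattern this change applies throughout the guide, written in Scala and assuming a Spark 1.x shell where sc is the SparkContext and sqlContext is the SQLContext:

// Old style: load an RDD[LabeledPoint] with MLUtils, then convert it to a DataFrame.
import org.apache.spark.mllib.util.MLUtils
val oldData = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()

// New style: read the same file directly through the "libsvm" data source,
// which yields a DataFrame with "label" and "features" columns.
val newData = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

The diff below rewrites each example from the first form to the second and drops the now-unused MLUtils, LabeledPoint, and RDD imports.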


@@ -121,10 +121,9 @@ import org.apache.spark.ml.classification.RandomForestClassifier
 import org.apache.spark.ml.classification.RandomForestClassificationModel
 import org.apache.spark.ml.feature.{StringIndexer, IndexToString, VectorIndexer}
 import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
-import org.apache.spark.mllib.util.MLUtils

 // Load and parse the data file, converting it to a DataFrame.
-val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

 // Index labels, adding metadata to the label column.
 // Fit on whole dataset to include all labels in index.
@@ -193,14 +192,11 @@ import org.apache.spark.ml.classification.RandomForestClassifier;
 import org.apache.spark.ml.classification.RandomForestClassificationModel;
 import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator;
 import org.apache.spark.ml.feature.*;
-import org.apache.spark.mllib.regression.LabeledPoint;
-import org.apache.spark.mllib.util.MLUtils;
-import org.apache.spark.rdd.RDD;
 import org.apache.spark.sql.DataFrame;

 // Load and parse the data file, converting it to a DataFrame.
-RDD<LabeledPoint> rdd = MLUtils.loadLibSVMFile(sc.sc(), "data/mllib/sample_libsvm_data.txt");
-DataFrame data = jsql.createDataFrame(rdd, LabeledPoint.class);
+DataFrame data = sqlContext.read.format("libsvm")
+  .load("data/mllib/sample_libsvm_data.txt");

 // Index labels, adding metadata to the label column.
 // Fit on whole dataset to include all labels in index.
@@ -268,10 +264,9 @@ from pyspark.ml import Pipeline
 from pyspark.ml.classification import RandomForestClassifier
 from pyspark.ml.feature import StringIndexer, VectorIndexer
 from pyspark.ml.evaluation import MulticlassClassificationEvaluator
-from pyspark.mllib.util import MLUtils

 # Load and parse the data file, converting it to a DataFrame.
-data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

 # Index labels, adding metadata to the label column.
 # Fit on whole dataset to include all labels in index.
@@ -327,10 +322,9 @@ import org.apache.spark.ml.regression.RandomForestRegressor
 import org.apache.spark.ml.regression.RandomForestRegressionModel
 import org.apache.spark.ml.feature.VectorIndexer
 import org.apache.spark.ml.evaluation.RegressionEvaluator
-import org.apache.spark.mllib.util.MLUtils

 // Load and parse the data file, converting it to a DataFrame.
-val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

 // Automatically identify categorical features, and index them.
 // Set maxCategories so features with > 4 distinct values are treated as continuous.
@@ -387,14 +381,11 @@ import org.apache.spark.ml.feature.VectorIndexer;
 import org.apache.spark.ml.feature.VectorIndexerModel;
 import org.apache.spark.ml.regression.RandomForestRegressionModel;
 import org.apache.spark.ml.regression.RandomForestRegressor;
-import org.apache.spark.mllib.regression.LabeledPoint;
-import org.apache.spark.mllib.util.MLUtils;
-import org.apache.spark.rdd.RDD;
 import org.apache.spark.sql.DataFrame;

 // Load and parse the data file, converting it to a DataFrame.
-RDD<LabeledPoint> rdd = MLUtils.loadLibSVMFile(sc.sc(), "data/mllib/sample_libsvm_data.txt");
-DataFrame data = jsql.createDataFrame(rdd, LabeledPoint.class);
+DataFrame data = sqlContext.read.format("libsvm")
+  .load("data/mllib/sample_libsvm_data.txt");

 // Automatically identify categorical features, and index them.
 // Set maxCategories so features with > 4 distinct values are treated as continuous.
@@ -450,10 +441,9 @@ from pyspark.ml import Pipeline
 from pyspark.ml.regression import RandomForestRegressor
 from pyspark.ml.feature import VectorIndexer
 from pyspark.ml.evaluation import RegressionEvaluator
-from pyspark.mllib.util import MLUtils

 # Load and parse the data file, converting it to a DataFrame.
-data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

 # Automatically identify categorical features, and index them.
 # Set maxCategories so features with > 4 distinct values are treated as continuous.
@@ -576,10 +566,9 @@ import org.apache.spark.ml.classification.GBTClassifier
 import org.apache.spark.ml.classification.GBTClassificationModel
 import org.apache.spark.ml.feature.{StringIndexer, IndexToString, VectorIndexer}
 import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
-import org.apache.spark.mllib.util.MLUtils

 // Load and parse the data file, converting it to a DataFrame.
-val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

 // Index labels, adding metadata to the label column.
 // Fit on whole dataset to include all labels in index.
@@ -648,14 +637,10 @@ import org.apache.spark.ml.classification.GBTClassifier;
 import org.apache.spark.ml.classification.GBTClassificationModel;
 import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator;
 import org.apache.spark.ml.feature.*;
-import org.apache.spark.mllib.regression.LabeledPoint;
-import org.apache.spark.mllib.util.MLUtils;
-import org.apache.spark.rdd.RDD;
 import org.apache.spark.sql.DataFrame;

 // Load and parse the data file, converting it to a DataFrame.
-RDD<LabeledPoint> rdd = MLUtils.loadLibSVMFile(sc.sc(), "data/mllib/sample_libsvm_data.txt");
-DataFrame data = jsql.createDataFrame(rdd, LabeledPoint.class);
+DataFrame data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt");

 // Index labels, adding metadata to the label column.
 // Fit on whole dataset to include all labels in index.
@@ -724,10 +709,9 @@ from pyspark.ml import Pipeline
 from pyspark.ml.classification import GBTClassifier
 from pyspark.ml.feature import StringIndexer, VectorIndexer
 from pyspark.ml.evaluation import MulticlassClassificationEvaluator
-from pyspark.mllib.util import MLUtils

 # Load and parse the data file, converting it to a DataFrame.
-data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

 # Index labels, adding metadata to the label column.
 # Fit on whole dataset to include all labels in index.
@@ -783,10 +767,9 @@ import org.apache.spark.ml.regression.GBTRegressor
 import org.apache.spark.ml.regression.GBTRegressionModel
 import org.apache.spark.ml.feature.VectorIndexer
 import org.apache.spark.ml.evaluation.RegressionEvaluator
-import org.apache.spark.mllib.util.MLUtils

 // Load and parse the data file, converting it to a DataFrame.
-val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

 // Automatically identify categorical features, and index them.
 // Set maxCategories so features with > 4 distinct values are treated as continuous.
@@ -844,14 +827,10 @@ import org.apache.spark.ml.feature.VectorIndexer;
 import org.apache.spark.ml.feature.VectorIndexerModel;
 import org.apache.spark.ml.regression.GBTRegressionModel;
 import org.apache.spark.ml.regression.GBTRegressor;
-import org.apache.spark.mllib.regression.LabeledPoint;
-import org.apache.spark.mllib.util.MLUtils;
-import org.apache.spark.rdd.RDD;
 import org.apache.spark.sql.DataFrame;

 // Load and parse the data file, converting it to a DataFrame.
-RDD<LabeledPoint> rdd = MLUtils.loadLibSVMFile(sc.sc(), "data/mllib/sample_libsvm_data.txt");
-DataFrame data = jsql.createDataFrame(rdd, LabeledPoint.class);
+DataFrame data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt");

 // Automatically identify categorical features, and index them.
 // Set maxCategories so features with > 4 distinct values are treated as continuous.
@@ -908,10 +887,9 @@ from pyspark.ml import Pipeline
 from pyspark.ml.regression import GBTRegressor
 from pyspark.ml.feature import VectorIndexer
 from pyspark.ml.evaluation import RegressionEvaluator
-from pyspark.mllib.util import MLUtils

 # Load and parse the data file, converting it to a DataFrame.
-data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

 # Automatically identify categorical features, and index them.
 # Set maxCategories so features with > 4 distinct values are treated as continuous.
@@ -970,15 +948,14 @@ Refer to the [Scala API docs](api/scala/index.html#org.apache.spark.ml.classifie
 {% highlight scala %}
 import org.apache.spark.ml.classification.{LogisticRegression, OneVsRest}
 import org.apache.spark.mllib.evaluation.MulticlassMetrics
-import org.apache.spark.mllib.util.MLUtils
 import org.apache.spark.sql.{Row, SQLContext}

 val sqlContext = new SQLContext(sc)

 // parse data into dataframe
-val data = MLUtils.loadLibSVMFile(sc,
-  "data/mllib/sample_multiclass_classification_data.txt")
-val Array(train, test) = data.toDF().randomSplit(Array(0.7, 0.3))
+val data = sqlContext.read.format("libsvm")
+  .load("data/mllib/sample_multiclass_classification_data.txt")
+val Array(train, test) = data.randomSplit(Array(0.7, 0.3))

 // instantiate multiclass learner and train
 val ovr = new OneVsRest().setClassifier(new LogisticRegression)
@@ -1016,9 +993,6 @@ import org.apache.spark.ml.classification.OneVsRest;
 import org.apache.spark.ml.classification.OneVsRestModel;
 import org.apache.spark.mllib.evaluation.MulticlassMetrics;
 import org.apache.spark.mllib.linalg.Matrix;
-import org.apache.spark.mllib.regression.LabeledPoint;
-import org.apache.spark.mllib.util.MLUtils;
-import org.apache.spark.rdd.RDD;
 import org.apache.spark.sql.DataFrame;
 import org.apache.spark.sql.SQLContext;

@@ -1026,10 +1000,9 @@ SparkConf conf = new SparkConf().setAppName("JavaOneVsRestExample");
 JavaSparkContext jsc = new JavaSparkContext(conf);
 SQLContext jsql = new SQLContext(jsc);

-RDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(),
-  "data/mllib/sample_multiclass_classification_data.txt");
-DataFrame dataFrame = jsql.createDataFrame(data, LabeledPoint.class);
+DataFrame dataFrame = sqlContext.read.format("libsvm")
+  .load("data/mllib/sample_multiclass_classification_data.txt");

 DataFrame[] splits = dataFrame.randomSplit(new double[] {0.7, 0.3}, 12345);
 DataFrame train = splits[0];
 DataFrame test = splits[1];


@@ -1179,9 +1179,9 @@ In the example below, we read in a dataset of labeled points and then use `Vecto
 <div data-lang="scala" markdown="1">
 {% highlight scala %}
 import org.apache.spark.ml.feature.VectorIndexer
-import org.apache.spark.mllib.util.MLUtils

-val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+val data = sqlContext.read.format("libsvm")
+  .load("data/mllib/sample_libsvm_data.txt")
 val indexer = new VectorIndexer()
   .setInputCol("features")
   .setOutputCol("indexed")
@@ -1200,16 +1200,12 @@ val indexedData = indexerModel.transform(data)
 {% highlight java %}
 import java.util.Map;

-import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.ml.feature.VectorIndexer;
 import org.apache.spark.ml.feature.VectorIndexerModel;
-import org.apache.spark.mllib.regression.LabeledPoint;
-import org.apache.spark.mllib.util.MLUtils;
 import org.apache.spark.sql.DataFrame;

-JavaRDD<LabeledPoint> rdd = MLUtils.loadLibSVMFile(sc.sc(),
-  "data/mllib/sample_libsvm_data.txt").toJavaRDD();
-DataFrame data = sqlContext.createDataFrame(rdd, LabeledPoint.class);
+DataFrame data = sqlContext.read.format("libsvm")
+  .load("data/mllib/sample_libsvm_data.txt");
 VectorIndexer indexer = new VectorIndexer()
   .setInputCol("features")
   .setOutputCol("indexed")
@@ -1230,9 +1226,9 @@ DataFrame indexedData = indexerModel.transform(data);
 <div data-lang="python" markdown="1">
 {% highlight python %}
 from pyspark.ml.feature import VectorIndexer
-from pyspark.mllib.util import MLUtils

-data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+data = sqlContext.read.format("libsvm")
+    .load("data/mllib/sample_libsvm_data.txt")

 indexer = VectorIndexer(inputCol="features", outputCol="indexed", maxCategories=10)
 indexerModel = indexer.fit(data)
@@ -1253,10 +1249,9 @@ The following example demonstrates how to load a dataset in libsvm format and th
 <div data-lang="scala">
 {% highlight scala %}
 import org.apache.spark.ml.feature.Normalizer
-import org.apache.spark.mllib.util.MLUtils

-val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
-val dataFrame = sqlContext.createDataFrame(data)
+val dataFrame = sqlContext.read.format("libsvm")
+  .load("data/mllib/sample_libsvm_data.txt")

 // Normalize each Vector using $L^1$ norm.
 val normalizer = new Normalizer()
@@ -1272,15 +1267,11 @@ val lInfNormData = normalizer.transform(dataFrame, normalizer.p -> Double.Positi
 <div data-lang="java">
 {% highlight java %}

-import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.ml.feature.Normalizer;
-import org.apache.spark.mllib.regression.LabeledPoint;
-import org.apache.spark.mllib.util.MLUtils;
 import org.apache.spark.sql.DataFrame;

-JavaRDD<LabeledPoint> data =
-  MLUtils.loadLibSVMFile(jsc.sc(), "data/mllib/sample_libsvm_data.txt").toJavaRDD();
-DataFrame dataFrame = jsql.createDataFrame(data, LabeledPoint.class);
+DataFrame dataFrame = sqlContext.read.format("libsvm")
+  .load("data/mllib/sample_libsvm_data.txt");

 // Normalize each Vector using $L^1$ norm.
 Normalizer normalizer = new Normalizer()
@@ -1297,11 +1288,10 @@ DataFrame lInfNormData =
 <div data-lang="python">
 {% highlight python %}

-from pyspark.mllib.util import MLUtils
 from pyspark.ml.feature import Normalizer

-data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
-dataFrame = sqlContext.createDataFrame(data)
+dataFrame = sqlContext.read.format("libsvm")
+    .load("data/mllib/sample_libsvm_data.txt")

 # Normalize each Vector using $L^1$ norm.
 normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=1.0)
@@ -1335,10 +1325,9 @@ The following example demonstrates how to load a dataset in libsvm format and th
 <div data-lang="scala">
 {% highlight scala %}
 import org.apache.spark.ml.feature.StandardScaler
-import org.apache.spark.mllib.util.MLUtils

-val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
-val dataFrame = sqlContext.createDataFrame(data)
+val dataFrame = sqlContext.read.format("libsvm")
+  .load("data/mllib/sample_libsvm_data.txt")
 val scaler = new StandardScaler()
   .setInputCol("features")
   .setOutputCol("scaledFeatures")
@@ -1355,16 +1344,12 @@ val scaledData = scalerModel.transform(dataFrame)
 <div data-lang="java">
 {% highlight java %}
-import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.ml.feature.StandardScaler;
 import org.apache.spark.ml.feature.StandardScalerModel;
-import org.apache.spark.mllib.regression.LabeledPoint;
-import org.apache.spark.mllib.util.MLUtils;
 import org.apache.spark.sql.DataFrame;

-JavaRDD<LabeledPoint> data =
-  MLUtils.loadLibSVMFile(jsc.sc(), "data/mllib/sample_libsvm_data.txt").toJavaRDD();
-DataFrame dataFrame = jsql.createDataFrame(data, LabeledPoint.class);
+DataFrame dataFrame = sqlContext.read.format("libsvm")
+  .load("data/mllib/sample_libsvm_data.txt");

 StandardScaler scaler = new StandardScaler()
   .setInputCol("features")
   .setOutputCol("scaledFeatures")
@@ -1381,11 +1366,10 @@ DataFrame scaledData = scalerModel.transform(dataFrame);
 <div data-lang="python">
 {% highlight python %}

-from pyspark.mllib.util import MLUtils
 from pyspark.ml.feature import StandardScaler

-data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
-dataFrame = sqlContext.createDataFrame(data)
+dataFrame = sqlContext.read.format("libsvm")
+    .load("data/mllib/sample_libsvm_data.txt")

 scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
                         withStd=True, withMean=False)
@@ -1424,10 +1408,9 @@ More details can be found in the API docs for
 [MinMaxScalerModel](api/scala/index.html#org.apache.spark.ml.feature.MinMaxScalerModel).
 {% highlight scala %}
 import org.apache.spark.ml.feature.MinMaxScaler
-import org.apache.spark.mllib.util.MLUtils

-val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
-val dataFrame = sqlContext.createDataFrame(data)
+val dataFrame = sqlContext.read.format("libsvm")
+  .load("data/mllib/sample_libsvm_data.txt")
 val scaler = new MinMaxScaler()
   .setInputCol("features")
   .setOutputCol("scaledFeatures")
@@ -1448,13 +1431,10 @@ More details can be found in the API docs for
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.ml.feature.MinMaxScaler;
 import org.apache.spark.ml.feature.MinMaxScalerModel;
-import org.apache.spark.mllib.regression.LabeledPoint;
-import org.apache.spark.mllib.util.MLUtils;
 import org.apache.spark.sql.DataFrame;

-JavaRDD<LabeledPoint> data =
-  MLUtils.loadLibSVMFile(jsc.sc(), "data/mllib/sample_libsvm_data.txt").toJavaRDD();
-DataFrame dataFrame = jsql.createDataFrame(data, LabeledPoint.class);
+DataFrame dataFrame = sqlContext.read.format("libsvm")
+  .load("data/mllib/sample_libsvm_data.txt");
 MinMaxScaler scaler = new MinMaxScaler()
   .setInputCol("features")
   .setOutputCol("scaledFeatures");


@@ -59,10 +59,9 @@ $\alpha$ and `regParam` corresponds to $\lambda$.
 <div data-lang="scala" markdown="1">
 {% highlight scala %}
 import org.apache.spark.ml.classification.LogisticRegression
-import org.apache.spark.mllib.util.MLUtils

 // Load training data
-val training = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+val training = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

 val lr = new LogisticRegression()
   .setMaxIter(10)
@@ -81,8 +80,6 @@ println(s"Weights: ${lrModel.weights} Intercept: ${lrModel.intercept}")
 {% highlight java %}
 import org.apache.spark.ml.classification.LogisticRegression;
 import org.apache.spark.ml.classification.LogisticRegressionModel;
-import org.apache.spark.mllib.regression.LabeledPoint;
-import org.apache.spark.mllib.util.MLUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.SparkContext;
 import org.apache.spark.sql.DataFrame;
@@ -98,7 +95,7 @@ public class LogisticRegressionWithElasticNetExample {
 String path = "data/mllib/sample_libsvm_data.txt";

 // Load training data
-DataFrame training = sql.createDataFrame(MLUtils.loadLibSVMFile(sc, path).toJavaRDD(), LabeledPoint.class);
+DataFrame training = sqlContext.read.format("libsvm").load(path);

 LogisticRegression lr = new LogisticRegression()
   .setMaxIter(10)
@@ -118,11 +115,9 @@ public class LogisticRegressionWithElasticNetExample {
 <div data-lang="python" markdown="1">
 {% highlight python %}

 from pyspark.ml.classification import LogisticRegression
-from pyspark.mllib.regression import LabeledPoint
-from pyspark.mllib.util import MLUtils

 # Load training data
-training = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+training = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

 lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
@@ -251,10 +246,9 @@ regression model and extracting model summary statistics.
 <div data-lang="scala" markdown="1">
 {% highlight scala %}
 import org.apache.spark.ml.regression.LinearRegression
-import org.apache.spark.mllib.util.MLUtils

 // Load training data
-val training = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+val training = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

 val lr = new LinearRegression()
   .setMaxIter(10)
@@ -283,8 +277,6 @@ import org.apache.spark.ml.regression.LinearRegression;
 import org.apache.spark.ml.regression.LinearRegressionModel;
 import org.apache.spark.ml.regression.LinearRegressionTrainingSummary;
 import org.apache.spark.mllib.linalg.Vectors;
-import org.apache.spark.mllib.regression.LabeledPoint;
-import org.apache.spark.mllib.util.MLUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.SparkContext;
 import org.apache.spark.sql.DataFrame;
@@ -300,7 +292,7 @@ public class LinearRegressionWithElasticNetExample {
 String path = "data/mllib/sample_libsvm_data.txt";

 // Load training data
-DataFrame training = sql.createDataFrame(MLUtils.loadLibSVMFile(sc, path).toJavaRDD(), LabeledPoint.class);
+DataFrame training = sqlContext.read.format("libsvm").load(path);

 LinearRegression lr = new LinearRegression()
   .setMaxIter(10)
@@ -329,11 +321,9 @@ public class LinearRegressionWithElasticNetExample {
 <!--- TODO: Add python model summaries once implemented -->
 {% highlight python %}

 from pyspark.ml.regression import LinearRegression
-from pyspark.mllib.regression import LabeledPoint
-from pyspark.mllib.util import MLUtils

 # Load training data
-training = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+training = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

 lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)