[SPARK-15449][MLLIB][EXAMPLE] Wrong Data Format - Documentation Issue
## What changes were proposed in this pull request?

(Please fill in changes proposed in this fix)

In the MLlib naive Bayes example, the Scala and Python examples don't use LIBSVM-format data, but the Java example does. I changed the Scala and Python examples to load the same LIBSVM data file as the Java example.

## How was this patch tested?

Manual tests

Author: wm624@hotmail.com <wm624@hotmail.com>

Closes #13301 from wangmiao1981/example.
This commit is contained in:
parent
4a2fb8b87c
commit
5d4dafe8fd
|
@ -1,12 +0,0 @@
|
|||
0,1 0 0
|
||||
0,2 0 0
|
||||
0,3 0 0
|
||||
0,4 0 0
|
||||
1,0 1 0
|
||||
1,0 2 0
|
||||
1,0 3 0
|
||||
1,0 4 0
|
||||
2,0 0 1
|
||||
2,0 0 2
|
||||
2,0 0 3
|
||||
2,0 0 4
|
|
@ -36,9 +36,9 @@ public class JavaNaiveBayesExample {
|
|||
SparkConf sparkConf = new SparkConf().setAppName("JavaNaiveBayesExample");
|
||||
JavaSparkContext jsc = new JavaSparkContext(sparkConf);
|
||||
// $example on$
|
||||
String path = "data/mllib/sample_naive_bayes_data.txt";
|
||||
String path = "data/mllib/sample_libsvm_data.txt";
|
||||
JavaRDD<LabeledPoint> inputData = MLUtils.loadLibSVMFile(jsc.sc(), path).toJavaRDD();
|
||||
JavaRDD<LabeledPoint>[] tmp = inputData.randomSplit(new double[]{0.6, 0.4}, 12345);
|
||||
JavaRDD<LabeledPoint>[] tmp = inputData.randomSplit(new double[]{0.6, 0.4});
|
||||
JavaRDD<LabeledPoint> training = tmp[0]; // training set
|
||||
JavaRDD<LabeledPoint> test = tmp[1]; // test set
|
||||
final NaiveBayesModel model = NaiveBayes.train(training.rdd(), 1.0);
|
||||
|
|
|
@ -29,15 +29,9 @@ import shutil
|
|||
from pyspark import SparkContext
|
||||
# $example on$
|
||||
from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel
|
||||
from pyspark.mllib.linalg import Vectors
|
||||
from pyspark.mllib.regression import LabeledPoint
|
||||
from pyspark.mllib.util import MLUtils
|
||||
|
||||
|
||||
def parseLine(line):
|
||||
parts = line.split(',')
|
||||
label = float(parts[0])
|
||||
features = Vectors.dense([float(x) for x in parts[1].split(' ')])
|
||||
return LabeledPoint(label, features)
|
||||
# $example off$
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
@ -45,10 +39,11 @@ if __name__ == "__main__":
|
|||
sc = SparkContext(appName="PythonNaiveBayesExample")
|
||||
|
||||
# $example on$
|
||||
data = sc.textFile('data/mllib/sample_naive_bayes_data.txt').map(parseLine)
|
||||
# Load and parse the data file.
|
||||
data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
|
||||
|
||||
# Split data approximately into training (60%) and test (40%)
|
||||
training, test = data.randomSplit([0.6, 0.4], seed=0)
|
||||
training, test = data.randomSplit([0.6, 0.4])
|
||||
|
||||
# Train a naive Bayes model.
|
||||
model = NaiveBayes.train(training, 1.0)
|
||||
|
|
|
@ -21,8 +21,7 @@ package org.apache.spark.examples.mllib
|
|||
import org.apache.spark.{SparkConf, SparkContext}
|
||||
// $example on$
|
||||
import org.apache.spark.mllib.classification.{NaiveBayes, NaiveBayesModel}
|
||||
import org.apache.spark.mllib.linalg.Vectors
|
||||
import org.apache.spark.mllib.regression.LabeledPoint
|
||||
import org.apache.spark.mllib.util.MLUtils
|
||||
// $example off$
|
||||
|
||||
object NaiveBayesExample {
|
||||
|
@ -31,16 +30,11 @@ object NaiveBayesExample {
|
|||
val conf = new SparkConf().setAppName("NaiveBayesExample")
|
||||
val sc = new SparkContext(conf)
|
||||
// $example on$
|
||||
val data = sc.textFile("data/mllib/sample_naive_bayes_data.txt")
|
||||
val parsedData = data.map { line =>
|
||||
val parts = line.split(',')
|
||||
LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble)))
|
||||
}
|
||||
// Load and parse the data file.
|
||||
val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
|
||||
|
||||
// Split data into training (60%) and test (40%).
|
||||
val splits = parsedData.randomSplit(Array(0.6, 0.4), seed = 11L)
|
||||
val training = splits(0)
|
||||
val test = splits(1)
|
||||
val Array(training, test) = data.randomSplit(Array(0.6, 0.4))
|
||||
|
||||
val model = NaiveBayes.train(training, lambda = 1.0, modelType = "multinomial")
|
||||
|
||||
|
|
Loading…
Reference in a new issue