[SPARK-15134][EXAMPLE] Indent SparkSession builder patterns and update binary_classification_metrics_example.py

## What changes were proposed in this pull request?

This PR addresses the review comments from SPARK-15031 and also fixes Java linter errors.
- Use a multiline format for SparkSession builder patterns (see the sketch after this list).
- Update `binary_classification_metrics_example.py` to use `SparkSession`.
- Fix Java linter errors (introduced in SPARK-13745, SPARK-15031, and other changes so far).
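For example, a single-line builder chain is reflowed so each call sits on its own line. A minimal before/after sketch in Python (the Java and Scala hunks below follow the same pattern, using `builder()` and `builder` respectively):

```python
from pyspark.sql import SparkSession

# Before: the whole builder chain on one line.
# spark = SparkSession.builder.appName("BinarizerExample").getOrCreate()

# After: one builder call per line, continued with backslashes.
spark = SparkSession\
    .builder\
    .appName("BinarizerExample")\
    .getOrCreate()

spark.stop()
```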

## How was this patch tested?

Passed the Jenkins tests and ran `dev/lint-java` manually.

Author: Dongjoon Hyun <dongjoon@apache.org>

Closes #12911 from dongjoon-hyun/SPARK-15134.
Authored by Dongjoon Hyun on 2016-05-05 14:37:50 -07:00; committed by Andrew Or.
Parent: bb9991dec5
Commit: 2c170dd3d7
142 changed files with 585 additions and 178 deletions

@@ -87,8 +87,11 @@ public class ExternalShuffleBlockHandler extends RpcHandler {
blocks.add(blockManager.getBlockData(msg.appId, msg.execId, blockId));
}
long streamId = streamManager.registerStream(client.getClientId(), blocks.iterator());
logger.trace("Registered streamId {} with {} buffers for client {} from host {}", streamId,
msg.blockIds.length, client.getClientId(), NettyUtils.getRemoteAddress(client.getChannel()));
logger.trace("Registered streamId {} with {} buffers for client {} from host {}",
streamId,
msg.blockIds.length,
client.getClientId(),
NettyUtils.getRemoteAddress(client.getChannel()));
callback.onSuccess(new StreamHandle(streamId, msg.blockIds.length).toByteBuffer());
} else if (msgObj instanceof RegisterExecutor) {

@@ -33,7 +33,10 @@ import org.apache.spark.sql.types.*;
public class JavaAFTSurvivalRegressionExample {
public static void main(String[] args) {
SparkSession spark = SparkSession.builder().appName("JavaAFTSurvivalRegressionExample").getOrCreate();
SparkSession spark = SparkSession
.builder()
.appName("JavaAFTSurvivalRegressionExample")
.getOrCreate();
// $example on$
List<Row> data = Arrays.asList(

@@ -81,7 +81,10 @@ public class JavaALSExample {
// $example off$
public static void main(String[] args) {
SparkSession spark = SparkSession.builder().appName("JavaALSExample").getOrCreate();
SparkSession spark = SparkSession
.builder()
.appName("JavaALSExample")
.getOrCreate();
// $example on$
JavaRDD<Rating> ratingsRDD = spark

@@ -17,8 +17,6 @@
package org.apache.spark.examples.ml;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.SparkSession;
@@ -26,7 +24,6 @@ import org.apache.spark.sql.SparkSession;
import java.util.Arrays;
import java.util.List;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.ml.feature.Binarizer;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
@@ -38,7 +35,10 @@ import org.apache.spark.sql.types.StructType;
public class JavaBinarizerExample {
public static void main(String[] args) {
SparkSession spark = SparkSession.builder().appName("JavaBinarizerExample").getOrCreate();
SparkSession spark = SparkSession
.builder()
.appName("JavaBinarizerExample")
.getOrCreate();
// $example on$
List<Row> data = Arrays.asList(

@@ -42,7 +42,10 @@ import org.apache.spark.sql.types.StructType;
public class JavaBisectingKMeansExample {
public static void main(String[] args) {
SparkSession spark = SparkSession.builder().appName("JavaBisectingKMeansExample").getOrCreate();
SparkSession spark = SparkSession
.builder()
.appName("JavaBisectingKMeansExample")
.getOrCreate();
// $example on$
List<Row> data = Arrays.asList(

@@ -35,7 +35,10 @@ import org.apache.spark.sql.types.StructType;
public class JavaBucketizerExample {
public static void main(String[] args) {
SparkSession spark = SparkSession.builder().appName("JavaBucketizerExample").getOrCreate();
SparkSession spark = SparkSession
.builder()
.appName("JavaBucketizerExample")
.getOrCreate();
// $example on$
double[] splits = {Double.NEGATIVE_INFINITY, -0.5, 0.0, 0.5, Double.POSITIVE_INFINITY};

@@ -17,9 +17,6 @@
package org.apache.spark.examples.ml;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.SparkSession;
@@ -40,7 +37,10 @@ import org.apache.spark.sql.types.StructType;
public class JavaChiSqSelectorExample {
public static void main(String[] args) {
SparkSession spark = SparkSession.builder().appName("JavaChiSqSelectorExample").getOrCreate();
SparkSession spark = SparkSession
.builder()
.appName("JavaChiSqSelectorExample")
.getOrCreate();
// $example on$
List<Row> data = Arrays.asList(

@@ -32,7 +32,10 @@ import org.apache.spark.sql.types.*;
public class JavaCountVectorizerExample {
public static void main(String[] args) {
SparkSession spark = SparkSession.builder().appName("JavaCountVectorizerExample").getOrCreate();
SparkSession spark = SparkSession
.builder()
.appName("JavaCountVectorizerExample")
.getOrCreate();
// $example on$
// Input data: Each row is a bag of words from a sentence or document.

@@ -17,8 +17,6 @@
package org.apache.spark.examples.ml;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.SparkSession;
@@ -26,7 +24,6 @@ import org.apache.spark.sql.SparkSession;
import java.util.Arrays;
import java.util.List;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.ml.feature.DCT;
import org.apache.spark.mllib.linalg.VectorUDT;
import org.apache.spark.mllib.linalg.Vectors;
@@ -39,7 +36,10 @@ import org.apache.spark.sql.types.StructType;
public class JavaDCTExample {
public static void main(String[] args) {
SparkSession spark = SparkSession.builder().appName("JavaDCTExample").getOrCreate();
SparkSession spark = SparkSession
.builder()
.appName("JavaDCTExample")
.getOrCreate();
// $example on$
List<Row> data = Arrays.asList(

@@ -49,7 +49,10 @@ import org.apache.spark.sql.SparkSession;
public class JavaDeveloperApiExample {
public static void main(String[] args) throws Exception {
SparkSession spark = SparkSession.builder().appName("JavaDeveloperApiExample").getOrCreate();
SparkSession spark = SparkSession
.builder()
.appName("JavaDeveloperApiExample")
.getOrCreate();
// Prepare training data.
List<LabeledPoint> localTraining = Lists.newArrayList(

@@ -17,8 +17,6 @@
package org.apache.spark.examples.ml;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.SparkSession;
@@ -27,7 +25,6 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.ml.feature.ElementwiseProduct;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.VectorUDT;
@@ -42,7 +39,9 @@ import org.apache.spark.sql.types.StructType;
public class JavaElementwiseProductExample {
public static void main(String[] args) {
SparkSession spark = SparkSession
.builder().appName("JavaElementwiseProductExample").getOrCreate();
.builder()
.appName("JavaElementwiseProductExample")
.getOrCreate();
// $example on$
// Create some vector data; also works for sparse vectors

@@ -17,8 +17,6 @@
package org.apache.spark.examples.ml;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
// $example on$
import org.apache.spark.ml.Pipeline;
import org.apache.spark.ml.PipelineModel;
@@ -35,11 +33,15 @@ import org.apache.spark.sql.SparkSession;
public class JavaGradientBoostedTreeClassifierExample {
public static void main(String[] args) {
SparkSession spark = SparkSession
.builder().appName("JavaGradientBoostedTreeClassifierExample").getOrCreate();
.builder()
.appName("JavaGradientBoostedTreeClassifierExample")
.getOrCreate();
// $example on$
// Load and parse the data file, converting it to a DataFrame.
Dataset<Row> data = spark.read().format("libsvm")
Dataset<Row> data = spark
.read()
.format("libsvm")
.load("data/mllib/sample_libsvm_data.txt");
// Index labels, adding metadata to the label column.

@@ -37,7 +37,10 @@ import org.apache.spark.sql.types.StructType;
public class JavaIndexToStringExample {
public static void main(String[] args) {
SparkSession spark = SparkSession.builder().appName("JavaIndexToStringExample").getOrCreate();
SparkSession spark = SparkSession
.builder()
.appName("JavaIndexToStringExample")
.getOrCreate();
// $example on$
List<Row> data = Arrays.asList(

@@ -70,7 +70,10 @@ public class JavaKMeansExample {
int k = Integer.parseInt(args[1]);
// Parses the arguments
SparkSession spark = SparkSession.builder().appName("JavaKMeansExample").getOrCreate();
SparkSession spark = SparkSession
.builder()
.appName("JavaKMeansExample")
.getOrCreate();
// $example on$
// Loads data

@@ -65,7 +65,10 @@ public class JavaLDAExample {
String inputFile = "data/mllib/sample_lda_data.txt";
// Parses the arguments
SparkSession spark = SparkSession.builder().appName("JavaLDAExample").getOrCreate();
SparkSession spark = SparkSession
.builder()
.appName("JavaLDAExample")
.getOrCreate();
// Loads data
JavaRDD<Row> points = spark.read().text(inputFile).javaRDD().map(new ParseVector());

@@ -28,13 +28,19 @@ import org.apache.spark.sql.SparkSession;
public class JavaMaxAbsScalerExample {
public static void main(String[] args) {
SparkSession spark = SparkSession.builder().appName("JavaMaxAbsScalerExample").getOrCreate();
SparkSession spark = SparkSession
.builder()
.appName("JavaMaxAbsScalerExample")
.getOrCreate();
// $example on$
Dataset<Row> dataFrame = spark.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");
Dataset<Row> dataFrame = spark
.read()
.format("libsvm")
.load("data/mllib/sample_libsvm_data.txt");
MaxAbsScaler scaler = new MaxAbsScaler()
.setInputCol("features")
.setOutputCol("scaledFeatures");
.setInputCol("features")
.setOutputCol("scaledFeatures");
// Compute summary statistics and generate MaxAbsScalerModel
MaxAbsScalerModel scalerModel = scaler.fit(dataFrame);

@@ -28,10 +28,16 @@ import org.apache.spark.sql.Row;
public class JavaMinMaxScalerExample {
public static void main(String[] args) {
SparkSession spark = SparkSession.builder().appName("JavaMinMaxScalerExample").getOrCreate();
SparkSession spark = SparkSession
.builder()
.appName("JavaMinMaxScalerExample")
.getOrCreate();
// $example on$
Dataset<Row> dataFrame = spark.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");
Dataset<Row> dataFrame = spark
.read()
.format("libsvm")
.load("data/mllib/sample_libsvm_data.txt");
MinMaxScaler scaler = new MinMaxScaler()
.setInputCol("features")
.setOutputCol("scaledFeatures");

@@ -35,7 +35,10 @@ import org.apache.spark.sql.types.StructType;
public class JavaNGramExample {
public static void main(String[] args) {
SparkSession spark = SparkSession.builder().appName("JavaNGramExample").getOrCreate();
SparkSession spark = SparkSession
.builder()
.appName("JavaNGramExample")
.getOrCreate();
// $example on$
List<Row> data = Arrays.asList(

@@ -32,7 +32,10 @@ import org.apache.spark.sql.SparkSession;
public class JavaNaiveBayesExample {
public static void main(String[] args) {
SparkSession spark = SparkSession.builder().appName("JavaNaiveBayesExample").getOrCreate();
SparkSession spark = SparkSession
.builder()
.appName("JavaNaiveBayesExample")
.getOrCreate();
// $example on$
// Load training data

@@ -27,7 +27,10 @@ import org.apache.spark.sql.Row;
public class JavaNormalizerExample {
public static void main(String[] args) {
SparkSession spark = SparkSession.builder().appName("JavaNormalizerExample").getOrCreate();
SparkSession spark = SparkSession
.builder()
.appName("JavaNormalizerExample")
.getOrCreate();
// $example on$
Dataset<Row> dataFrame =

@@ -37,7 +37,10 @@ import org.apache.spark.sql.types.StructType;
public class JavaOneHotEncoderExample {
public static void main(String[] args) {
SparkSession spark = SparkSession.builder().appName("JavaOneHotEncoderExample").getOrCreate();
SparkSession spark = SparkSession
.builder()
.appName("JavaOneHotEncoderExample")
.getOrCreate();
// $example on$
List<Row> data = Arrays.asList(

@@ -58,7 +58,10 @@ public class JavaOneVsRestExample {
public static void main(String[] args) {
// parse the arguments
Params params = parse(args);
SparkSession spark = SparkSession.builder().appName("JavaOneVsRestExample").getOrCreate();
SparkSession spark = SparkSession
.builder()
.appName("JavaOneVsRestExample")
.getOrCreate();
// $example on$
// configure the base classifier

@@ -37,7 +37,10 @@ import org.apache.spark.sql.types.StructType;
public class JavaPCAExample {
public static void main(String[] args) {
SparkSession spark = SparkSession.builder().appName("JavaPCAExample").getOrCreate();
SparkSession spark = SparkSession
.builder()
.appName("JavaPCAExample")
.getOrCreate();
// $example on$
List<Row> data = Arrays.asList(

@@ -36,7 +36,10 @@ import org.apache.spark.sql.SparkSession;
*/
public class JavaPipelineExample {
public static void main(String[] args) {
SparkSession spark = SparkSession.builder().appName("JavaPipelineExample").getOrCreate();
SparkSession spark = SparkSession
.builder()
.appName("JavaPipelineExample")
.getOrCreate();
// $example on$
// Prepare training documents, which are labeled.

@@ -36,7 +36,10 @@ import org.apache.spark.sql.types.StructType;
public class JavaPolynomialExpansionExample {
public static void main(String[] args) {
SparkSession spark = SparkSession.builder().appName("JavaPolynomialExpansionExample").getOrCreate();
SparkSession spark = SparkSession
.builder()
.appName("JavaPolynomialExpansionExample")
.getOrCreate();
// $example on$
PolynomialExpansion polyExpansion = new PolynomialExpansion()

@@ -35,7 +35,10 @@ import static org.apache.spark.sql.types.DataTypes.*;
public class JavaRFormulaExample {
public static void main(String[] args) {
SparkSession spark = SparkSession.builder().appName("JavaRFormulaExample").getOrCreate();
SparkSession spark = SparkSession
.builder()
.appName("JavaRFormulaExample")
.getOrCreate();
// $example on$
StructType schema = createStructType(new StructField[]{

@@ -31,7 +31,10 @@ import org.apache.spark.sql.types.*;
public class JavaSQLTransformerExample {
public static void main(String[] args) {
SparkSession spark = SparkSession.builder().appName("JavaSQLTransformerExample").getOrCreate();
SparkSession spark = SparkSession
.builder()
.appName("JavaSQLTransformerExample")
.getOrCreate();
// $example on$
List<Row> data = Arrays.asList(

@@ -40,7 +40,10 @@ import org.apache.spark.sql.SparkSession;
public class JavaSimpleParamsExample {
public static void main(String[] args) {
SparkSession spark = SparkSession.builder().appName("JavaSimpleParamsExample").getOrCreate();
SparkSession spark = SparkSession
.builder()
.appName("JavaSimpleParamsExample")
.getOrCreate();
// Prepare training data.
// We use LabeledPoint, which is a JavaBean. Spark SQL can convert RDDs of JavaBeans

@@ -28,7 +28,10 @@ import org.apache.spark.sql.Row;
public class JavaStandardScalerExample {
public static void main(String[] args) {
SparkSession spark = SparkSession.builder().appName("JavaStandardScalerExample").getOrCreate();
SparkSession spark = SparkSession
.builder()
.appName("JavaStandardScalerExample")
.getOrCreate();
// $example on$
Dataset<Row> dataFrame =

@@ -36,7 +36,10 @@ import org.apache.spark.sql.types.StructType;
public class JavaStopWordsRemoverExample {
public static void main(String[] args) {
SparkSession spark = SparkSession.builder().appName("JavaStopWordsRemoverExample").getOrCreate();
SparkSession spark = SparkSession
.builder()
.appName("JavaStopWordsRemoverExample")
.getOrCreate();
// $example on$
StopWordsRemover remover = new StopWordsRemover()

@@ -35,7 +35,10 @@ import static org.apache.spark.sql.types.DataTypes.*;
public class JavaStringIndexerExample {
public static void main(String[] args) {
SparkSession spark = SparkSession.builder().appName("JavaStringIndexerExample").getOrCreate();
SparkSession spark = SparkSession
.builder()
.appName("JavaStringIndexerExample")
.getOrCreate();
// $example on$
List<Row> data = Arrays.asList(

@@ -38,7 +38,10 @@ import org.apache.spark.sql.types.StructType;
public class JavaTfIdfExample {
public static void main(String[] args) {
SparkSession spark = SparkSession.builder().appName("JavaTfIdfExample").getOrCreate();
SparkSession spark = SparkSession
.builder()
.appName("JavaTfIdfExample")
.getOrCreate();
// $example on$
List<Row> data = Arrays.asList(

@@ -36,7 +36,10 @@ import org.apache.spark.sql.types.StructType;
public class JavaTokenizerExample {
public static void main(String[] args) {
SparkSession spark = SparkSession.builder().appName("JavaTokenizerExample").getOrCreate();
SparkSession spark = SparkSession
.builder()
.appName("JavaTokenizerExample")
.getOrCreate();
// $example on$
List<Row> data = Arrays.asList(

@@ -35,7 +35,10 @@ import static org.apache.spark.sql.types.DataTypes.*;
public class JavaVectorAssemblerExample {
public static void main(String[] args) {
SparkSession spark = SparkSession.builder().appName("JavaVectorAssemblerExample").getOrCreate();
SparkSession spark = SparkSession
.builder()
.appName("JavaVectorAssemblerExample")
.getOrCreate();
// $example on$
StructType schema = createStructType(new StructField[]{

@@ -30,7 +30,10 @@ import org.apache.spark.sql.Row;
public class JavaVectorIndexerExample {
public static void main(String[] args) {
SparkSession spark = SparkSession.builder().appName("JavaVectorIndexerExample").getOrCreate();
SparkSession spark = SparkSession
.builder()
.appName("JavaVectorIndexerExample")
.getOrCreate();
// $example on$
Dataset<Row> data = spark.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");

@@ -37,7 +37,10 @@ import org.apache.spark.sql.types.*;
public class JavaVectorSlicerExample {
public static void main(String[] args) {
SparkSession spark = SparkSession.builder().appName("JavaVectorSlicerExample").getOrCreate();
SparkSession spark = SparkSession
.builder()
.appName("JavaVectorSlicerExample")
.getOrCreate();
// $example on$
Attribute[] attrs = new Attribute[]{

@@ -32,7 +32,10 @@ import org.apache.spark.sql.types.*;
public class JavaWord2VecExample {
public static void main(String[] args) {
SparkSession spark = SparkSession.builder().appName("JavaWord2VecExample").getOrCreate();
SparkSession spark = SparkSession
.builder()
.appName("JavaWord2VecExample")
.getOrCreate();
// $example on$
// Input data: Each row is a bag of words from a sentence or document.

@@ -51,7 +51,10 @@ public class JavaSparkSQL {
}
public static void main(String[] args) throws Exception {
SparkSession spark = SparkSession.builder().appName("JavaSparkSQL").getOrCreate();
SparkSession spark = SparkSession
.builder()
.appName("JavaSparkSQL")
.getOrCreate();
System.out.println("=== Data source: RDD ===");
// Load a text file and convert each line to a Java Bean.
@@ -147,7 +150,8 @@ public class JavaSparkSQL {
// a RDD[String] storing one JSON object per string.
List<String> jsonData = Arrays.asList(
"{\"name\":\"Yin\",\"address\":{\"city\":\"Columbus\",\"state\":\"Ohio\"}}");
JavaRDD<String> anotherPeopleRDD = spark.createDataFrame(jsonData, String.class).toJSON().javaRDD();
JavaRDD<String> anotherPeopleRDD = spark
.createDataFrame(jsonData, String.class).toJSON().javaRDD();
Dataset<Row> peopleFromJsonRDD = spark.read().json(anotherPeopleRDD);
// Take a look at the schema of this new DataFrame.

@@ -115,7 +115,10 @@ class JavaSparkSessionSingleton {
private static transient SparkSession instance = null;
public static SparkSession getInstance(SparkConf sparkConf) {
if (instance == null) {
instance = SparkSession.builder().config(sparkConf).getOrCreate();
instance = SparkSession
.builder()
.config(sparkConf)
.getOrCreate();
}
return instance;
}

@@ -30,7 +30,10 @@ from pyspark.sql import Row
# $example off$
if __name__ == "__main__":
spark = SparkSession.builder.appName("ALSExample").getOrCreate()
spark = SparkSession\
.builder\
.appName("ALSExample")\
.getOrCreate()
# $example on$
lines = spark.read.text("data/mllib/als/sample_movielens_ratings.txt").rdd

@@ -23,7 +23,10 @@ from pyspark.ml.feature import Binarizer
# $example off$
if __name__ == "__main__":
spark = SparkSession.builder.appName("BinarizerExample").getOrCreate()
spark = SparkSession\
.builder\
.appName("BinarizerExample")\
.getOrCreate()
# $example on$
continuousDataFrame = spark.createDataFrame([

@@ -30,7 +30,10 @@ A simple example demonstrating a bisecting k-means clustering.
"""
if __name__ == "__main__":
spark = SparkSession.builder.appName("PythonBisectingKMeansExample").getOrCreate()
spark = SparkSession\
.builder\
.appName("PythonBisectingKMeansExample")\
.getOrCreate()
# $example on$
data = spark.read.text("data/mllib/kmeans_data.txt").rdd

@@ -23,7 +23,10 @@ from pyspark.ml.feature import Bucketizer
# $example off$
if __name__ == "__main__":
spark = SparkSession.builder.appName("BucketizerExample").getOrCreate()
spark = SparkSession\
.builder\
.appName("BucketizerExample")\
.getOrCreate()
# $example on$
splits = [-float("inf"), -0.5, 0.0, 0.5, float("inf")]

@@ -24,7 +24,10 @@ from pyspark.mllib.linalg import Vectors
# $example off$
if __name__ == "__main__":
spark = SparkSession.builder.appName("ChiSqSelectorExample").getOrCreate()
spark = SparkSession\
.builder\
.appName("ChiSqSelectorExample")\
.getOrCreate()
# $example on$
df = spark.createDataFrame([

@@ -23,7 +23,10 @@ from pyspark.ml.feature import CountVectorizer
# $example off$
if __name__ == "__main__":
spark = SparkSession.builder.appName("CountVectorizerExample").getOrCreate()
spark = SparkSession\
.builder\
.appName("CountVectorizerExample")\
.getOrCreate()
# $example on$
# Input data: Each row is a bag of words with a ID.

@@ -35,7 +35,10 @@ Run with:
"""
if __name__ == "__main__":
spark = SparkSession.builder.appName("CrossValidatorExample").getOrCreate()
spark = SparkSession\
.builder\
.appName("CrossValidatorExample")\
.getOrCreate()
# $example on$
# Prepare training documents, which are labeled.
training = spark.createDataFrame([

@@ -33,7 +33,10 @@ if __name__ == "__main__":
if len(sys.argv) > 2:
print("Usage: dataframe_example.py <libsvm file>", file=sys.stderr)
exit(-1)
spark = SparkSession.builder.appName("DataFrameExample").getOrCreate()
spark = SparkSession\
.builder\
.appName("DataFrameExample")\
.getOrCreate()
if len(sys.argv) == 2:
input = sys.argv[1]
else:

@@ -24,7 +24,10 @@ from pyspark.mllib.linalg import Vectors
from pyspark.sql import SparkSession
if __name__ == "__main__":
spark = SparkSession.builder.appName("DCTExample").getOrCreate()
spark = SparkSession\
.builder\
.appName("DCTExample")\
.getOrCreate()
# $example on$
df = spark.createDataFrame([

@@ -29,7 +29,10 @@ from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql import SparkSession
if __name__ == "__main__":
spark = SparkSession.builder.appName("decision_tree_classification_example").getOrCreate()
spark = SparkSession\
.builder\
.appName("decision_tree_classification_example")\
.getOrCreate()
# $example on$
# Load the data stored in LIBSVM format as a DataFrame.

@@ -29,7 +29,10 @@ from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import SparkSession
if __name__ == "__main__":
spark = SparkSession.builder.appName("decision_tree_classification_example").getOrCreate()
spark = SparkSession\
.builder\
.appName("decision_tree_classification_example")\
.getOrCreate()
# $example on$
# Load the data stored in LIBSVM format as a DataFrame.

@@ -24,7 +24,10 @@ from pyspark.mllib.linalg import Vectors
from pyspark.sql import SparkSession
if __name__ == "__main__":
spark = SparkSession.builder.appName("ElementwiseProductExample").getOrCreate()
spark = SparkSession\
.builder\
.appName("ElementwiseProductExample")\
.getOrCreate()
# $example on$
data = [(Vectors.dense([1.0, 2.0, 3.0]),), (Vectors.dense([4.0, 5.0, 6.0]),)]

@@ -26,7 +26,10 @@ from pyspark.ml.classification import LogisticRegression
from pyspark.sql import SparkSession
if __name__ == "__main__":
spark = SparkSession.builder.appName("EstimatorTransformerParamExample").getOrCreate()
spark = SparkSession\
.builder\
.appName("EstimatorTransformerParamExample")\
.getOrCreate()
# $example on$
# Prepare training data from a list of (label, features) tuples.

@@ -29,7 +29,10 @@ from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql import SparkSession
if __name__ == "__main__":
spark = SparkSession.builder.appName("gradient_boosted_tree_classifier_example").getOrCreate()
spark = SparkSession\
.builder\
.appName("gradient_boosted_tree_classifier_example")\
.getOrCreate()
# $example on$
# Load and parse the data file, converting it to a DataFrame.

@@ -29,7 +29,10 @@ from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import SparkSession
if __name__ == "__main__":
spark = SparkSession.builder.appName("gradient_boosted_tree_regressor_example").getOrCreate()
spark = SparkSession\
.builder\
.appName("gradient_boosted_tree_regressor_example")\
.getOrCreate()
# $example on$
# Load and parse the data file, converting it to a DataFrame.

@@ -23,7 +23,10 @@ from pyspark.ml.feature import IndexToString, StringIndexer
from pyspark.sql import SparkSession
if __name__ == "__main__":
spark = SparkSession.builder.appName("IndexToStringExample").getOrCreate()
spark = SparkSession\
.builder\
.appName("IndexToStringExample")\
.getOrCreate()
# $example on$
df = spark.createDataFrame(

@@ -49,7 +49,10 @@ if __name__ == "__main__":
path = sys.argv[1]
k = sys.argv[2]
spark = SparkSession.builder.appName("PythonKMeansExample").getOrCreate()
spark = SparkSession\
.builder\
.appName("PythonKMeansExample")\
.getOrCreate()
lines = spark.read.text(path).rdd
data = lines.map(parseVector)

@@ -23,7 +23,10 @@ from pyspark.ml.regression import LinearRegression
from pyspark.sql import SparkSession
if __name__ == "__main__":
spark = SparkSession.builder.appName("LinearRegressionWithElasticNet").getOrCreate()
spark = SparkSession\
.builder\
.appName("LinearRegressionWithElasticNet")\
.getOrCreate()
# $example on$
# Load training data

@@ -23,7 +23,10 @@ from pyspark.ml.classification import LogisticRegression
from pyspark.sql import SparkSession
if __name__ == "__main__":
spark = SparkSession.builder.appName("LogisticRegressionWithElasticNet").getOrCreate()
spark = SparkSession\
.builder\
.appName("LogisticRegressionWithElasticNet")\
.getOrCreate()
# $example on$
# Load training data

@@ -23,7 +23,10 @@ from pyspark.ml.feature import MaxAbsScaler
from pyspark.sql import SparkSession
if __name__ == "__main__":
spark = SparkSession.builder.appName("MaxAbsScalerExample").getOrCreate()
spark = SparkSession\
.builder\
.appName("MaxAbsScalerExample")\
.getOrCreate()
# $example on$
dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

@@ -23,7 +23,10 @@ from pyspark.ml.feature import MinMaxScaler
from pyspark.sql import SparkSession
if __name__ == "__main__":
spark = SparkSession.builder.appName("MinMaxScalerExample").getOrCreate()
spark = SparkSession\
.builder\
.appName("MinMaxScalerExample")\
.getOrCreate()
# $example on$
dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

@@ -23,7 +23,10 @@ from pyspark.ml.feature import NGram
from pyspark.sql import SparkSession
if __name__ == "__main__":
spark = SparkSession.builder.appName("NGramExample").getOrCreate()
spark = SparkSession\
.builder\
.appName("NGramExample")\
.getOrCreate()
# $example on$
wordDataFrame = spark.createDataFrame([

@@ -24,7 +24,10 @@ from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql import SparkSession
if __name__ == "__main__":
spark = SparkSession.builder.appName("naive_bayes_example").getOrCreate()
spark = SparkSession\
.builder\
.appName("naive_bayes_example")\
.getOrCreate()
# $example on$
# Load training data

@@ -23,7 +23,10 @@ from pyspark.ml.feature import Normalizer
from pyspark.sql import SparkSession
if __name__ == "__main__":
spark = SparkSession.builder.appName("NormalizerExample").getOrCreate()
spark = SparkSession\
.builder\
.appName("NormalizerExample")\
.getOrCreate()
# $example on$
dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

@@ -23,7 +23,10 @@ from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.sql import SparkSession
if __name__ == "__main__":
spark = SparkSession.builder.appName("OneHotEncoderExample").getOrCreate()
spark = SparkSession\
.builder\
.appName("OneHotEncoderExample")\
.getOrCreate()
# $example on$
df = spark.createDataFrame([

@@ -24,7 +24,10 @@ from pyspark.mllib.linalg import Vectors
from pyspark.sql import SparkSession
if __name__ == "__main__":
spark = SparkSession.builder.appName("PCAExample").getOrCreate()
spark = SparkSession\
.builder\
.appName("PCAExample")\
.getOrCreate()
# $example on$
data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),),

@@ -27,7 +27,10 @@ from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.sql import SparkSession
if __name__ == "__main__":
spark = SparkSession.builder.appName("PipelineExample").getOrCreate()
spark = SparkSession\
.builder\
.appName("PipelineExample")\
.getOrCreate()
# $example on$
# Prepare training documents from a list of (id, text, label) tuples.

@@ -24,7 +24,10 @@ from pyspark.mllib.linalg import Vectors
from pyspark.sql import SparkSession
if __name__ == "__main__":
spark = SparkSession.builder.appName("PolynomialExpansionExample").getOrCreate()
spark = SparkSession\
.builder\
.appName("PolynomialExpansionExample")\
.getOrCreate()
# $example on$
df = spark\

@@ -29,7 +29,10 @@ from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql import SparkSession
if __name__ == "__main__":
spark = SparkSession.builder.appName("random_forest_classifier_example").getOrCreate()
spark = SparkSession\
.builder\
.appName("random_forest_classifier_example")\
.getOrCreate()
# $example on$
# Load and parse the data file, converting it to a DataFrame.

@@ -29,7 +29,10 @@ from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import SparkSession
if __name__ == "__main__":
spark = SparkSession.builder.appName("random_forest_regressor_example").getOrCreate()
spark = SparkSession\
.builder\
.appName("random_forest_regressor_example")\
.getOrCreate()
# $example on$
# Load and parse the data file, converting it to a DataFrame.

@@ -23,7 +23,10 @@ from pyspark.ml.feature import RFormula
from pyspark.sql import SparkSession
if __name__ == "__main__":
spark = SparkSession.builder.appName("RFormulaExample").getOrCreate()
spark = SparkSession\
.builder\
.appName("RFormulaExample")\
.getOrCreate()
# $example on$
dataset = spark.createDataFrame(

@@ -33,7 +33,10 @@ pipeline in Python. Run with:
if __name__ == "__main__":
spark = SparkSession.builder.appName("SimpleTextClassificationPipeline").getOrCreate()
spark = SparkSession\
.builder\
.appName("SimpleTextClassificationPipeline")\
.getOrCreate()
# Prepare training documents, which are labeled.
training = spark.createDataFrame([

@@ -23,7 +23,10 @@ from pyspark.ml.feature import SQLTransformer
from pyspark.sql import SparkSession
if __name__ == "__main__":
spark = SparkSession.builder.appName("SQLTransformerExample").getOrCreate()
spark = SparkSession\
.builder\
.appName("SQLTransformerExample")\
.getOrCreate()
# $example on$
df = spark.createDataFrame([

@@ -23,7 +23,10 @@ from pyspark.ml.feature import StandardScaler
from pyspark.sql import SparkSession
if __name__ == "__main__":
spark = SparkSession.builder.appName("StandardScalerExample").getOrCreate()
spark = SparkSession\
.builder\
.appName("StandardScalerExample")\
.getOrCreate()
# $example on$
dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

@@ -23,7 +23,10 @@ from pyspark.ml.feature import StopWordsRemover
from pyspark.sql import SparkSession
if __name__ == "__main__":
spark = SparkSession.builder.appName("StopWordsRemoverExample").getOrCreate()
spark = SparkSession\
.builder\
.appName("StopWordsRemoverExample")\
.getOrCreate()
# $example on$
sentenceData = spark.createDataFrame([

@@ -23,7 +23,10 @@ from pyspark.ml.feature import StringIndexer
from pyspark.sql import SparkSession
if __name__ == "__main__":
spark = SparkSession.builder.appName("StringIndexerExample").getOrCreate()
spark = SparkSession\
.builder\
.appName("StringIndexerExample")\
.getOrCreate()
# $example on$
df = spark.createDataFrame(

@@ -23,7 +23,10 @@ from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.sql import SparkSession
if __name__ == "__main__":
spark = SparkSession.builder.appName("TfIdfExample").getOrCreate()
spark = SparkSession\
.builder\
.appName("TfIdfExample")\
.getOrCreate()
# $example on$
sentenceData = spark.createDataFrame([

@@ -23,7 +23,10 @@ from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql import SparkSession
if __name__ == "__main__":
spark = SparkSession.builder.appName("TokenizerExample").getOrCreate()
spark = SparkSession\
.builder\
.appName("TokenizerExample")\
.getOrCreate()
# $example on$
sentenceDataFrame = spark.createDataFrame([

@@ -31,7 +31,10 @@ Run with:
"""
if __name__ == "__main__":
spark = SparkSession.builder.appName("TrainValidationSplit").getOrCreate()
spark = SparkSession\
.builder\
.appName("TrainValidationSplit")\
.getOrCreate()
# $example on$
# Prepare training and test data.
data = spark.read.format("libsvm")\

@@ -24,7 +24,10 @@ from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession
if __name__ == "__main__":
spark = SparkSession.builder.appName("VectorAssemblerExample").getOrCreate()
spark = SparkSession\
.builder\
.appName("VectorAssemblerExample")\
.getOrCreate()
# $example on$
dataset = spark.createDataFrame(

@@ -23,7 +23,10 @@ from pyspark.ml.feature import VectorIndexer
from pyspark.sql import SparkSession
if __name__ == "__main__":
spark = SparkSession.builder.appName("VectorIndexerExample").getOrCreate()
spark = SparkSession\
.builder\
.appName("VectorIndexerExample")\
.getOrCreate()
# $example on$
data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

@@ -25,7 +25,10 @@ from pyspark.sql.types import Row
from pyspark.sql import SparkSession
if __name__ == "__main__":
spark = SparkSession.builder.appName("VectorSlicerExample").getOrCreate()
spark = SparkSession\
.builder\
.appName("VectorSlicerExample")\
.getOrCreate()
# $example on$
df = spark.createDataFrame([

@@ -23,7 +23,10 @@ from pyspark.ml.feature import Word2Vec
from pyspark.sql import SparkSession
if __name__ == "__main__":
spark = SparkSession.builder.appName("Word2VecExample").getOrCreate()
spark = SparkSession\
.builder\
.appName("Word2VecExample")\
.getOrCreate()
# $example on$
# Input data: Each row is a bag of words from a sentence or document.

@@ -18,20 +18,25 @@
Binary Classification Metrics Example.
"""
from __future__ import print_function
from pyspark import SparkContext
from pyspark.sql import SparkSession
# $example on$
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.mllib.util import MLUtils
from pyspark.mllib.regression import LabeledPoint
# $example off$
if __name__ == "__main__":
sc = SparkContext(appName="BinaryClassificationMetricsExample")
spark = SparkSession\
.builder\
.appName("BinaryClassificationMetricsExample")\
.getOrCreate()
# $example on$
# Several of the methods available in scala are currently missing from pyspark
# Load training data in LIBSVM format
data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_binary_classification_data.txt")
data = spark\
.read.format("libsvm").load("data/mllib/sample_binary_classification_data.txt")\
.rdd.map(lambda row: LabeledPoint(row[0], row[1]))
# Split data into training (60%) and test (40%)
training, test = data.randomSplit([0.6, 0.4], seed=11L)
@@ -53,4 +58,4 @@ if __name__ == "__main__":
print("Area under ROC = %s" % metrics.areaUnderROC)
# $example off$
sc.stop()
spark.stop()
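
Pieced together, the migrated loading pattern above reads as follows. This is a sketch only (the LogisticRegressionWithLBFGS training and metrics code from the example is omitted, and `seed=11` replaces the Python 2 literal `11L`): the `libsvm` data source yields rows of `(label, features)`, which are mapped back to an RDD of `LabeledPoint` so the existing `pyspark.mllib` code keeps working.

```python
from pyspark.mllib.regression import LabeledPoint
from pyspark.sql import SparkSession

spark = SparkSession\
    .builder\
    .appName("BinaryClassificationMetricsExample")\
    .getOrCreate()

# row[0] is the label column and row[1] the features column of the libsvm source.
data = spark\
    .read.format("libsvm").load("data/mllib/sample_binary_classification_data.txt")\
    .rdd.map(lambda row: LabeledPoint(row[0], row[1]))

# Split data into training (60%) and test (40%), as in the example.
training, test = data.randomSplit([0.6, 0.4], seed=11)

spark.stop()
```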

@@ -25,7 +25,10 @@ from pyspark.sql.types import Row, StructField, StructType, StringType, IntegerT
if __name__ == "__main__":
spark = SparkSession.builder.appName("PythonSQL").getOrCreate()
spark = SparkSession\
.builder\
.appName("PythonSQL")\
.getOrCreate()
# A list of Rows. Infer schema from the first row, create a DataFrame and print the schema
rows = [Row(name="John", age=19), Row(name="Smith", age=23), Row(name="Sarah", age=18)]

@@ -38,8 +38,10 @@ from pyspark.sql import Row, SparkSession
def getSparkSessionInstance(sparkConf):
if ('sparkSessionSingletonInstance' not in globals()):
globals()['sparkSessionSingletonInstance'] =\
SparkSession.builder.config(conf=sparkConf).getOrCreate()
globals()['sparkSessionSingletonInstance'] = SparkSession\
.builder\
.config(conf=sparkConf)\
.getOrCreate()
return globals()['sparkSessionSingletonInstance']

@@ -30,7 +30,10 @@ import org.apache.spark.sql.SparkSession
object AFTSurvivalRegressionExample {
def main(args: Array[String]): Unit = {
val spark = SparkSession.builder.appName("AFTSurvivalRegressionExample").getOrCreate()
val spark = SparkSession
.builder
.appName("AFTSurvivalRegressionExample")
.getOrCreate()
// $example on$
val training = spark.createDataFrame(Seq(

@@ -42,7 +42,10 @@ object ALSExample {
// $example off$
def main(args: Array[String]) {
val spark = SparkSession.builder.appName("ALSExample").getOrCreate()
val spark = SparkSession
.builder
.appName("ALSExample")
.getOrCreate()
import spark.implicits._
// $example on$

@@ -25,7 +25,10 @@ import org.apache.spark.sql.{DataFrame, SparkSession}
object BinarizerExample {
def main(args: Array[String]): Unit = {
val spark = SparkSession.builder.appName("BinarizerExample").getOrCreate()
val spark = SparkSession
.builder
.appName("BinarizerExample")
.getOrCreate()
// $example on$
val data = Array((0, 0.1), (1, 0.8), (2, 0.2))
val dataFrame: DataFrame = spark.createDataFrame(data).toDF("label", "feature")

@@ -25,7 +25,10 @@ import org.apache.spark.sql.SparkSession
object BucketizerExample {
def main(args: Array[String]): Unit = {
val spark = SparkSession.builder.appName("BucketizerExample").getOrCreate()
val spark = SparkSession
.builder
.appName("BucketizerExample")
.getOrCreate()
// $example on$
val splits = Array(Double.NegativeInfinity, -0.5, 0.0, 0.5, Double.PositiveInfinity)

@@ -26,7 +26,10 @@ import org.apache.spark.sql.SparkSession
object ChiSqSelectorExample {
def main(args: Array[String]) {
val spark = SparkSession.builder.appName("ChiSqSelectorExample").getOrCreate()
val spark = SparkSession
.builder
.appName("ChiSqSelectorExample")
.getOrCreate()
import spark.implicits._
// $example on$

@@ -25,7 +25,10 @@ import org.apache.spark.sql.SparkSession
object CountVectorizerExample {
def main(args: Array[String]) {
val spark = SparkSession.builder.appName("CounterVectorizerExample").getOrCreate()
val spark = SparkSession
.builder
.appName("CounterVectorizerExample")
.getOrCreate()
// $example on$
val df = spark.createDataFrame(Seq(

@@ -26,7 +26,10 @@ import org.apache.spark.sql.SparkSession
object DCTExample {
def main(args: Array[String]): Unit = {
val spark = SparkSession.builder.appName("DCTExample").getOrCreate()
val spark = SparkSession
.builder
.appName("DCTExample")
.getOrCreate()
// $example on$
val data = Seq(

@@ -61,7 +61,10 @@ object DataFrameExample {
}
def run(params: Params) {
val spark = SparkSession.builder.appName(s"DataFrameExample with $params").getOrCreate()
val spark = SparkSession
.builder
.appName(s"DataFrameExample with $params")
.getOrCreate()
// Load input data
println(s"Loading LIBSVM file with UDT from ${params.input}.")

@@ -29,7 +29,10 @@ import org.apache.spark.sql.SparkSession
object DecisionTreeClassificationExample {
def main(args: Array[String]): Unit = {
val spark = SparkSession.builder.appName("DecisionTreeClassificationExample").getOrCreate()
val spark = SparkSession
.builder
.appName("DecisionTreeClassificationExample")
.getOrCreate()
// $example on$
// Load the data stored in LIBSVM format as a DataFrame.
val data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

@@ -167,7 +167,9 @@ object DecisionTreeExample {
testInput: String,
algo: String,
fracTest: Double): (DataFrame, DataFrame) = {
val spark = SparkSession.builder.getOrCreate()
val spark = SparkSession
.builder
.getOrCreate()
// Load training data
val origExamples: DataFrame = loadData(spark, input, dataFormat)

@@ -29,7 +29,10 @@ import org.apache.spark.sql.SparkSession
object DecisionTreeRegressionExample {
def main(args: Array[String]): Unit = {
val spark = SparkSession.builder.appName("DecisionTreeRegressionExample").getOrCreate()
val spark = SparkSession
.builder
.appName("DecisionTreeRegressionExample")
.getOrCreate()
// $example on$
// Load the data stored in LIBSVM format as a DataFrame.

@@ -37,7 +37,10 @@ import org.apache.spark.sql.{Dataset, Row, SparkSession}
object DeveloperApiExample {
def main(args: Array[String]) {
val spark = SparkSession.builder.appName("DeveloperApiExample").getOrCreate()
val spark = SparkSession
.builder
.appName("DeveloperApiExample")
.getOrCreate()
import spark.implicits._
// Prepare training data.

@@ -26,7 +26,10 @@ import org.apache.spark.sql.SparkSession
object ElementwiseProductExample {
def main(args: Array[String]): Unit = {
val spark = SparkSession.builder.appName("ElementwiseProductExample").getOrCreate()
val spark = SparkSession
.builder
.appName("ElementwiseProductExample")
.getOrCreate()
// $example on$
// Create some vector data; also works for sparse vectors

@@ -29,7 +29,10 @@ import org.apache.spark.sql.SparkSession
object EstimatorTransformerParamExample {
def main(args: Array[String]): Unit = {
val spark = SparkSession.builder.appName("EstimatorTransformerParamExample").getOrCreate()
val spark = SparkSession
.builder
.appName("EstimatorTransformerParamExample")
.getOrCreate()
// $example on$
// Prepare training data from a list of (label, features) tuples.

@@ -28,7 +28,10 @@ import org.apache.spark.sql.SparkSession
object GradientBoostedTreeClassifierExample {
def main(args: Array[String]): Unit = {
val spark = SparkSession.builder.appName("GradientBoostedTreeClassifierExample").getOrCreate()
val spark = SparkSession
.builder
.appName("GradientBoostedTreeClassifierExample")
.getOrCreate()
// $example on$
// Load and parse the data file, converting it to a DataFrame.

Some files were not shown because too many files have changed in this diff.