2013-07-30 17:03:15 -04:00
|
|
|
<?xml version="1.0" encoding="UTF-8"?>
|
|
|
|
<!--
|
|
|
|
~ Licensed to the Apache Software Foundation (ASF) under one or more
|
|
|
|
~ contributor license agreements. See the NOTICE file distributed with
|
|
|
|
~ this work for additional information regarding copyright ownership.
|
|
|
|
~ The ASF licenses this file to You under the Apache License, Version 2.0
|
|
|
|
~ (the "License"); you may not use this file except in compliance with
|
|
|
|
~ the License. You may obtain a copy of the License at
|
|
|
|
~
|
|
|
|
~ http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
~
|
|
|
|
~ Unless required by applicable law or agreed to in writing, software
|
|
|
|
~ distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
~ See the License for the specific language governing permissions and
|
|
|
|
~ limitations under the License.
|
|
|
|
-->
|
|
|
|
|
|
|
|
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
|
|
|
<modelVersion>4.0.0</modelVersion>
|
|
|
|
<!-- Parent POM: org.apache.spark:spark-parent_2.12 at 3.2.1-SNAPSHOT, located via the relative path ../pom.xml. -->
<parent>
|
2013-08-31 22:27:07 -04:00
|
|
|
<groupId>org.apache.spark</groupId>
|
2018-11-14 19:22:23 -05:00
|
|
|
<artifactId>spark-parent_2.12</artifactId>
|
2021-08-20 08:40:47 -04:00
|
|
|
<version>3.2.1-SNAPSHOT</version>
|
2013-07-30 17:03:15 -04:00
|
|
|
<relativePath>../pom.xml</relativePath>
|
|
|
|
</parent>
|
|
|
|
|
2018-11-14 19:22:23 -05:00
|
|
|
<artifactId>spark-mllib_2.12</artifactId>
|
2014-07-10 14:03:37 -04:00
|
|
|
<!-- Module properties: sets sbt.project.name to "mllib". NOTE(review): presumably read by the SBT build to identify this module; confirm. -->
<properties>
|
2014-07-28 15:07:30 -04:00
|
|
|
<sbt.project.name>mllib</sbt.project.name>
|
2015-01-08 20:15:13 -05:00
|
|
|
</properties>
|
2013-07-30 17:03:15 -04:00
|
|
|
<packaging>jar</packaging>
|
|
|
|
<name>Spark Project ML Library</name>
|
2014-03-02 04:00:16 -05:00
|
|
|
<url>http://spark.apache.org/</url>
|
2013-07-30 17:03:15 -04:00
|
|
|
|
|
|
|
<dependencies>
|
2017-09-01 14:21:21 -04:00
|
|
|
<!-- Scala parser combinators, suffixed with the Scala binary version. No version or scope is declared here. NOTE(review): the version is presumably supplied by the parent POM; confirm. -->
<dependency>
|
|
|
|
<groupId>org.scala-lang.modules</groupId>
|
|
|
|
<artifactId>scala-parser-combinators_${scala.binary.version}</artifactId>
|
|
|
|
</dependency>
|
2013-08-15 15:10:31 -04:00
|
|
|
<!-- Spark core at this module's own version (${project.version}); no scope is declared. -->
<dependency>
|
2013-08-31 22:27:07 -04:00
|
|
|
<groupId>org.apache.spark</groupId>
|
2013-12-15 15:39:58 -05:00
|
|
|
<artifactId>spark-core_${scala.binary.version}</artifactId>
|
2013-08-15 15:10:31 -04:00
|
|
|
<version>${project.version}</version>
|
|
|
|
</dependency>
|
2015-05-29 20:19:46 -04:00
|
|
|
<!-- Test-jar of Spark core at ${project.version}, limited to test scope. -->
<dependency>
|
|
|
|
<groupId>org.apache.spark</groupId>
|
|
|
|
<artifactId>spark-core_${scala.binary.version}</artifactId>
|
|
|
|
<version>${project.version}</version>
|
|
|
|
<type>test-jar</type>
|
|
|
|
<scope>test</scope>
|
|
|
|
</dependency>
|
Streaming mllib [SPARK-2438][MLLIB]
This PR implements a streaming linear regression analysis, in which a linear regression model is trained online as new data arrive. The design is based on discussions with tdas and mengxr, in which we determined how to add this functionality in a general way, with minimal changes to existing libraries.
__Summary of additions:__
_StreamingLinearAlgorithm_
- An abstract class for fitting generalized linear models online to streaming data, including training on (and updating) a model, and making predictions.
_StreamingLinearRegressionWithSGD_
- Class and companion object for running streaming linear regression
_StreamingLinearRegressionTestSuite_
- Unit tests
_StreamingLinearRegression_
- Example use case: fitting a model online to data from one stream, and making predictions on other data
__Notes__
- If this looks good, I can use the StreamingLinearAlgorithm class to easily implement other analyses that follow the same logic (Ridge, Lasso, Logistic, SVM).
Author: Jeremy Freeman <the.freeman.lab@gmail.com>
Author: freeman <the.freeman.lab@gmail.com>
Closes #1361 from freeman-lab/streaming-mllib and squashes the following commits:
775ea29 [Jeremy Freeman] Throw error if user doesn't initialize weights
4086fee [Jeremy Freeman] Fixed current weight formatting
8b95b27 [Jeremy Freeman] Restored broadcasting
29f27ec [Jeremy Freeman] Formatting
8711c41 [Jeremy Freeman] Used return to avoid indentation
777b596 [Jeremy Freeman] Restored treeAggregate
74cf440 [Jeremy Freeman] Removed static methods
d28cf9a [Jeremy Freeman] Added usage notes
c3326e7 [Jeremy Freeman] Improved documentation
9541a41 [Jeremy Freeman] Merge remote-tracking branch 'upstream/master' into streaming-mllib
66eba5e [Jeremy Freeman] Fixed line lengths
2fe0720 [Jeremy Freeman] Minor cleanup
7d51378 [Jeremy Freeman] Moved streaming loader to MLUtils
b9b69f6 [Jeremy Freeman] Added setter methods
c3f8b5a [Jeremy Freeman] Modified logging
00aafdc [Jeremy Freeman] Add modifiers
14b801e [Jeremy Freeman] Name changes
c7d38a3 [Jeremy Freeman] Move check for empty data to GradientDescent
4b0a5d3 [Jeremy Freeman] Cleaned up tests
74188d6 [Jeremy Freeman] Eliminate dependency on commons
50dd237 [Jeremy Freeman] Removed experimental tag
6bfe1e6 [Jeremy Freeman] Fixed imports
a2a63ad [freeman] Makes convergence test more robust
86220bc [freeman] Streaming linear regression unit tests
fb4683a [freeman] Minor changes for scalastyle consistency
fd31e03 [freeman] Changed logging behavior
453974e [freeman] Fixed indentation
c4b1143 [freeman] Streaming linear regression
604f4d7 [freeman] Expanded private class to include mllib
d99aa85 [freeman] Helper methods for streaming MLlib apps
0898add [freeman] Added dependency on streaming
2014-08-01 23:10:26 -04:00
|
|
|
<!-- Spark Streaming at this module's own version (${project.version}); no scope is declared. -->
<dependency>
|
|
|
|
<groupId>org.apache.spark</groupId>
|
|
|
|
<artifactId>spark-streaming_${scala.binary.version}</artifactId>
|
|
|
|
<version>${project.version}</version>
|
|
|
|
</dependency>
|
2014-11-04 01:29:48 -05:00
|
|
|
<!-- Spark SQL at this module's own version (${project.version}); no scope is declared. -->
<dependency>
|
|
|
|
<groupId>org.apache.spark</groupId>
|
|
|
|
<artifactId>spark-sql_${scala.binary.version}</artifactId>
|
|
|
|
<version>${project.version}</version>
|
|
|
|
</dependency>
|
2017-12-13 00:28:24 -05:00
|
|
|
<!-- Test-jar of Spark Catalyst at ${project.version}, limited to test scope. -->
<dependency>
|
|
|
|
<groupId>org.apache.spark</groupId>
|
|
|
|
<artifactId>spark-catalyst_${scala.binary.version}</artifactId>
|
|
|
|
<version>${project.version}</version>
|
|
|
|
<type>test-jar</type>
|
|
|
|
<scope>test</scope>
|
|
|
|
</dependency>
|
|
|
|
<!-- Test-jar of Spark SQL at ${project.version}, limited to test scope. -->
<dependency>
|
|
|
|
<groupId>org.apache.spark</groupId>
|
|
|
|
<artifactId>spark-sql_${scala.binary.version}</artifactId>
|
|
|
|
<version>${project.version}</version>
|
|
|
|
<type>test-jar</type>
|
|
|
|
<scope>test</scope>
|
|
|
|
</dependency>
|
2015-01-30 17:09:49 -05:00
|
|
|
<!-- Spark GraphX at this module's own version (${project.version}); no scope is declared. -->
<dependency>
|
|
|
|
<groupId>org.apache.spark</groupId>
|
|
|
|
<artifactId>spark-graphx_${scala.binary.version}</artifactId>
|
|
|
|
<version>${project.version}</version>
|
|
|
|
</dependency>
|
2016-04-11 12:35:47 -04:00
|
|
|
<!-- spark-mllib-local at this module's own version (${project.version}); no scope is declared. -->
<dependency>
|
|
|
|
<groupId>org.apache.spark</groupId>
|
|
|
|
<artifactId>spark-mllib-local_${scala.binary.version}</artifactId>
|
|
|
|
<version>${project.version}</version>
|
|
|
|
</dependency>
|
|
|
|
<!-- Test-jar of spark-mllib-local at ${project.version}, limited to test scope. -->
<dependency>
|
|
|
|
<groupId>org.apache.spark</groupId>
|
|
|
|
<artifactId>spark-mllib-local_${scala.binary.version}</artifactId>
|
|
|
|
<version>${project.version}</version>
|
|
|
|
<type>test-jar</type>
|
|
|
|
<scope>test</scope>
|
|
|
|
</dependency>
|
2014-03-23 20:34:02 -04:00
|
|
|
<!-- Breeze (org.scalanlp), suffixed with the Scala binary version. No version or scope is declared here. NOTE(review): the version is presumably supplied by the parent POM; confirm. -->
<dependency>
|
|
|
|
<groupId>org.scalanlp</groupId>
|
|
|
|
<artifactId>breeze_${scala.binary.version}</artifactId>
|
|
|
|
</dependency>
|
2014-10-27 13:53:15 -04:00
|
|
|
<!-- Apache Commons Math 3. No version or scope is declared here. NOTE(review): the version is presumably supplied by the parent POM; confirm. -->
<dependency>
|
|
|
|
<groupId>org.apache.commons</groupId>
|
|
|
|
<artifactId>commons-math3</artifactId>
|
|
|
|
</dependency>
|
2013-07-30 17:03:15 -04:00
|
|
|
<!-- ScalaCheck, suffixed with the Scala binary version and limited to test scope. No version is declared here. NOTE(review): presumably supplied by the parent POM; confirm. -->
<dependency>
|
|
|
|
<groupId>org.scalacheck</groupId>
|
2013-12-15 15:39:58 -05:00
|
|
|
<artifactId>scalacheck_${scala.binary.version}</artifactId>
|
2013-07-30 17:03:15 -04:00
|
|
|
<scope>test</scope>
|
|
|
|
</dependency>
|
[SPARK-3530][MLLIB] pipeline and parameters with examples
This PR adds the package "org.apache.spark.ml" with pipelines and parameters, as discussed on the JIRA. This is joint work by jkbradley, etrain, shivaram, and many others who helped with the design, with additional help from marmbrus and liancheng on the Spark SQL side. The design doc can be found at:
https://docs.google.com/document/d/1rVwXRjWKfIb-7PI6b86ipytwbUH7irSNLF1_6dLmh8o/edit?usp=sharing
**org.apache.spark.ml**
This is a new package with a new set of ML APIs that address practical machine learning pipelines. (Sorry for taking so long!) It will be an alpha component, so this is definitely not something set in stone. The new set of APIs, inspired by the MLI project from AMPLab and scikit-learn, leverages Spark SQL's schema support and execution plan optimization. It introduces the following components that help build a practical pipeline:
1. Transformer, which transforms a dataset into another
2. Estimator, which fits models to data, where models are transformers
3. Evaluator, which evaluates model output and returns a scalar metric
4. Pipeline, a simple pipeline that consists of transformers and estimators
Parameters could be supplied at fit/transform or embedded with components.
1. Param: a strong-typed parameter key with self-contained doc
2. ParamMap: a param -> value map
3. Params: trait for components with parameters
For any component that implements `Params`, users can easily check the doc by calling `explainParams`:
~~~
> val lr = new LogisticRegression
> lr.explainParams
maxIter: max number of iterations (default: 100)
regParam: regularization constant (default: 0.1)
labelCol: label column name (default: label)
featuresCol: features column name (default: features)
~~~
or users can check an individual param:
~~~
> lr.maxIter
maxIter: max number of iterations (default: 100)
~~~
**Please start with the example code in test suites and under `org.apache.spark.examples.ml`, where I put several examples:**
1. run a simple logistic regression job
~~~
val lr = new LogisticRegression()
.setMaxIter(10)
.setRegParam(1.0)
val model = lr.fit(dataset)
model.transform(dataset, model.threshold -> 0.8) // overwrite threshold
.select('label, 'score, 'prediction).collect()
.foreach(println)
~~~
2. run logistic regression with cross-validation and grid search using areaUnderROC (default) as the metric
~~~
val lr = new LogisticRegression
val lrParamMaps = new ParamGridBuilder()
.addGrid(lr.regParam, Array(0.1, 100.0))
.addGrid(lr.maxIter, Array(0, 5))
.build()
val eval = new BinaryClassificationEvaluator
val cv = new CrossValidator()
.setEstimator(lr)
.setEstimatorParamMaps(lrParamMaps)
.setEvaluator(eval)
.setNumFolds(3)
val bestModel = cv.fit(dataset)
~~~
3. run a pipeline that consists of a standard scaler and a logistic regression component
~~~
val scaler = new StandardScaler()
.setInputCol("features")
.setOutputCol("scaledFeatures")
val lr = new LogisticRegression()
.setFeaturesCol(scaler.getOutputCol)
val pipeline = new Pipeline()
.setStages(Array(scaler, lr))
val model = pipeline.fit(dataset)
val predictions = model.transform(dataset)
.select('label, 'score, 'prediction)
.collect()
.foreach(println)
~~~
4. a simple text classification pipeline, which recognizes "spark":
~~~
val training = sparkContext.parallelize(Seq(
LabeledDocument(0L, "a b c d e spark", 1.0),
LabeledDocument(1L, "b d", 0.0),
LabeledDocument(2L, "spark f g h", 1.0),
LabeledDocument(3L, "hadoop mapreduce", 0.0)))
val tokenizer = new Tokenizer()
.setInputCol("text")
.setOutputCol("words")
val hashingTF = new HashingTF()
.setInputCol(tokenizer.getOutputCol)
.setOutputCol("features")
val lr = new LogisticRegression()
.setMaxIter(10)
val pipeline = new Pipeline()
.setStages(Array(tokenizer, hashingTF, lr))
val model = pipeline.fit(training)
val test = sparkContext.parallelize(Seq(
Document(4L, "spark i j k"),
Document(5L, "l m"),
Document(6L, "mapreduce spark"),
Document(7L, "apache hadoop")))
model.transform(test)
.select('id, 'text, 'prediction, 'score)
.collect()
.foreach(println)
~~~
Java examples are very similar. I put example code that creates a simple text classification pipeline in Scala and Java, where a simple tokenizer is defined as a transformer outside `org.apache.spark.ml`.
**What is missing now and will be added soon:**
1. ~~Runtime check of schemas. So before we touch the data, we will go through the schema and make sure column names and types match the input parameters.~~
2. ~~Java examples.~~
3. ~~Store training parameters in trained models.~~
4. (later) Serialization and Python API.
Author: Xiangrui Meng <meng@databricks.com>
Closes #3099 from mengxr/SPARK-3530 and squashes the following commits:
2cc93fd [Xiangrui Meng] hide APIs as much as I can
34319ba [Xiangrui Meng] use local instead local[2] for unit tests
2524251 [Xiangrui Meng] rename PipelineStage.transform to transformSchema
c9daab4 [Xiangrui Meng] remove mockito version
1397ab5 [Xiangrui Meng] use sqlContext from LocalSparkContext instead of TestSQLContext
6ffc389 [Xiangrui Meng] try to fix unit test
a59d8b7 [Xiangrui Meng] doc updates
977fd9d [Xiangrui Meng] add scala ml package object
6d97fe6 [Xiangrui Meng] add AlphaComponent annotation
731f0e4 [Xiangrui Meng] update package doc
0435076 [Xiangrui Meng] remove ;this from setters
fa21d9b [Xiangrui Meng] update extends indentation
f1091b3 [Xiangrui Meng] typo
228a9f4 [Xiangrui Meng] do not persist before calling binary classification metrics
f51cd27 [Xiangrui Meng] rename default to defaultValue
b3be094 [Xiangrui Meng] refactor schema transform in lr
8791e8e [Xiangrui Meng] rename copyValues to inheritValues and make it do the right thing
51f1c06 [Xiangrui Meng] remove leftover code in Transformer
494b632 [Xiangrui Meng] compure score once
ad678e9 [Xiangrui Meng] more doc for Transformer
4306ed4 [Xiangrui Meng] org imports in text pipeline
6e7c1c7 [Xiangrui Meng] update pipeline
4f9e34f [Xiangrui Meng] more doc for pipeline
aa5dbd4 [Xiangrui Meng] fix typo
11be383 [Xiangrui Meng] fix unit tests
3df7952 [Xiangrui Meng] clean up
986593e [Xiangrui Meng] re-org java test suites
2b11211 [Xiangrui Meng] remove external data deps
9fd4933 [Xiangrui Meng] add unit test for pipeline
2a0df46 [Xiangrui Meng] update tests
2d52e4d [Xiangrui Meng] add @AlphaComponent to package-info
27582a4 [Xiangrui Meng] doc changes
73a000b [Xiangrui Meng] add schema transformation layer
6736e87 [Xiangrui Meng] more doc / remove HasMetricName trait
80a8b5e [Xiangrui Meng] rename SimpleTransformer to UnaryTransformer
62ca2bb [Xiangrui Meng] check param parent in set/get
1622349 [Xiangrui Meng] add getModel to PipelineModel
a0e0054 [Xiangrui Meng] update StandardScaler to use SimpleTransformer
d0faa04 [Xiangrui Meng] remove implicit mapping from ParamMap
c7f6921 [Xiangrui Meng] move ParamGridBuilder test to ParamGridBuilderSuite
e246f29 [Xiangrui Meng] re-org:
7772430 [Xiangrui Meng] remove modelParams add a simple text classification pipeline
b95c408 [Xiangrui Meng] remove implicits add unit tests to params
bab3e5b [Xiangrui Meng] update params
fe0ee92 [Xiangrui Meng] Merge remote-tracking branch 'apache/master' into SPARK-3530
6e86d98 [Xiangrui Meng] some code clean-up
2d040b3 [Xiangrui Meng] implement setters inside each class, add Params.copyValues [ci skip]
fd751fc [Xiangrui Meng] add java-friendly versions of fit and tranform
3f810cd [Xiangrui Meng] use multi-model training api in cv
5b8f413 [Xiangrui Meng] rename model to modelParams
9d2d35d [Xiangrui Meng] test varargs and chain model params
f46e927 [Xiangrui Meng] Merge remote-tracking branch 'apache/master' into SPARK-3530
1ef26e0 [Xiangrui Meng] specialize methods/types for Java
df293ed [Xiangrui Meng] switch to setter/getter
376db0a [Xiangrui Meng] pipeline and parameters
2014-11-12 13:38:57 -05:00
|
|
|
<dependency>
|
|
|
|
<groupId>org.mockito</groupId>
|
2015-06-28 02:27:52 -04:00
|
|
|
<artifactId>mockito-core</artifactId>
|
[SPARK-3530][MLLIB] pipeline and parameters with examples
This PR adds the package "org.apache.spark.ml" with pipelines and parameters, as discussed on the JIRA. This is joint work by jkbradley, etrain, shivaram, and many others who helped with the design, with additional help from marmbrus and liancheng on the Spark SQL side. The design doc can be found at:
https://docs.google.com/document/d/1rVwXRjWKfIb-7PI6b86ipytwbUH7irSNLF1_6dLmh8o/edit?usp=sharing
**org.apache.spark.ml**
This is a new package with a new set of ML APIs that address practical machine learning pipelines. (Sorry for taking so long!) It will be an alpha component, so this is definitely not something set in stone. The new set of APIs, inspired by the MLI project from AMPLab and scikit-learn, leverages Spark SQL's schema support and execution plan optimization. It introduces the following components that help build a practical pipeline:
1. Transformer, which transforms a dataset into another
2. Estimator, which fits models to data, where models are transformers
3. Evaluator, which evaluates model output and returns a scalar metric
4. Pipeline, a simple pipeline that consists of transformers and estimators
Parameters could be supplied at fit/transform or embedded with components.
1. Param: a strong-typed parameter key with self-contained doc
2. ParamMap: a param -> value map
3. Params: trait for components with parameters
For any component that implements `Params`, users can easily check the doc by calling `explainParams`:
~~~
> val lr = new LogisticRegression
> lr.explainParams
maxIter: max number of iterations (default: 100)
regParam: regularization constant (default: 0.1)
labelCol: label column name (default: label)
featuresCol: features column name (default: features)
~~~
or users can check an individual param:
~~~
> lr.maxIter
maxIter: max number of iterations (default: 100)
~~~
**Please start with the example code in test suites and under `org.apache.spark.examples.ml`, where I put several examples:**
1. run a simple logistic regression job
~~~
val lr = new LogisticRegression()
.setMaxIter(10)
.setRegParam(1.0)
val model = lr.fit(dataset)
model.transform(dataset, model.threshold -> 0.8) // overwrite threshold
.select('label, 'score, 'prediction).collect()
.foreach(println)
~~~
2. run logistic regression with cross-validation and grid search using areaUnderROC (default) as the metric
~~~
val lr = new LogisticRegression
val lrParamMaps = new ParamGridBuilder()
.addGrid(lr.regParam, Array(0.1, 100.0))
.addGrid(lr.maxIter, Array(0, 5))
.build()
val eval = new BinaryClassificationEvaluator
val cv = new CrossValidator()
.setEstimator(lr)
.setEstimatorParamMaps(lrParamMaps)
.setEvaluator(eval)
.setNumFolds(3)
val bestModel = cv.fit(dataset)
~~~
3. run a pipeline that consists of a standard scaler and a logistic regression component
~~~
val scaler = new StandardScaler()
.setInputCol("features")
.setOutputCol("scaledFeatures")
val lr = new LogisticRegression()
.setFeaturesCol(scaler.getOutputCol)
val pipeline = new Pipeline()
.setStages(Array(scaler, lr))
val model = pipeline.fit(dataset)
val predictions = model.transform(dataset)
.select('label, 'score, 'prediction)
.collect()
.foreach(println)
~~~
4. a simple text classification pipeline, which recognizes "spark":
~~~
val training = sparkContext.parallelize(Seq(
LabeledDocument(0L, "a b c d e spark", 1.0),
LabeledDocument(1L, "b d", 0.0),
LabeledDocument(2L, "spark f g h", 1.0),
LabeledDocument(3L, "hadoop mapreduce", 0.0)))
val tokenizer = new Tokenizer()
.setInputCol("text")
.setOutputCol("words")
val hashingTF = new HashingTF()
.setInputCol(tokenizer.getOutputCol)
.setOutputCol("features")
val lr = new LogisticRegression()
.setMaxIter(10)
val pipeline = new Pipeline()
.setStages(Array(tokenizer, hashingTF, lr))
val model = pipeline.fit(training)
val test = sparkContext.parallelize(Seq(
Document(4L, "spark i j k"),
Document(5L, "l m"),
Document(6L, "mapreduce spark"),
Document(7L, "apache hadoop")))
model.transform(test)
.select('id, 'text, 'prediction, 'score)
.collect()
.foreach(println)
~~~
Java examples are very similar. I put example code that creates a simple text classification pipeline in Scala and Java, where a simple tokenizer is defined as a transformer outside `org.apache.spark.ml`.
**What is missing now and will be added soon:**
1. ~~Runtime check of schemas. So before we touch the data, we will go through the schema and make sure column names and types match the input parameters.~~
2. ~~Java examples.~~
3. ~~Store training parameters in trained models.~~
4. (later) Serialization and Python API.
Author: Xiangrui Meng <meng@databricks.com>
Closes #3099 from mengxr/SPARK-3530 and squashes the following commits:
2cc93fd [Xiangrui Meng] hide APIs as much as I can
34319ba [Xiangrui Meng] use local instead local[2] for unit tests
2524251 [Xiangrui Meng] rename PipelineStage.transform to transformSchema
c9daab4 [Xiangrui Meng] remove mockito version
1397ab5 [Xiangrui Meng] use sqlContext from LocalSparkContext instead of TestSQLContext
6ffc389 [Xiangrui Meng] try to fix unit test
a59d8b7 [Xiangrui Meng] doc updates
977fd9d [Xiangrui Meng] add scala ml package object
6d97fe6 [Xiangrui Meng] add AlphaComponent annotation
731f0e4 [Xiangrui Meng] update package doc
0435076 [Xiangrui Meng] remove ;this from setters
fa21d9b [Xiangrui Meng] update extends indentation
f1091b3 [Xiangrui Meng] typo
228a9f4 [Xiangrui Meng] do not persist before calling binary classification metrics
f51cd27 [Xiangrui Meng] rename default to defaultValue
b3be094 [Xiangrui Meng] refactor schema transform in lr
8791e8e [Xiangrui Meng] rename copyValues to inheritValues and make it do the right thing
51f1c06 [Xiangrui Meng] remove leftover code in Transformer
494b632 [Xiangrui Meng] compure score once
ad678e9 [Xiangrui Meng] more doc for Transformer
4306ed4 [Xiangrui Meng] org imports in text pipeline
6e7c1c7 [Xiangrui Meng] update pipeline
4f9e34f [Xiangrui Meng] more doc for pipeline
aa5dbd4 [Xiangrui Meng] fix typo
11be383 [Xiangrui Meng] fix unit tests
3df7952 [Xiangrui Meng] clean up
986593e [Xiangrui Meng] re-org java test suites
2b11211 [Xiangrui Meng] remove external data deps
9fd4933 [Xiangrui Meng] add unit test for pipeline
2a0df46 [Xiangrui Meng] update tests
2d52e4d [Xiangrui Meng] add @AlphaComponent to package-info
27582a4 [Xiangrui Meng] doc changes
73a000b [Xiangrui Meng] add schema transformation layer
6736e87 [Xiangrui Meng] more doc / remove HasMetricName trait
80a8b5e [Xiangrui Meng] rename SimpleTransformer to UnaryTransformer
62ca2bb [Xiangrui Meng] check param parent in set/get
1622349 [Xiangrui Meng] add getModel to PipelineModel
a0e0054 [Xiangrui Meng] update StandardScaler to use SimpleTransformer
d0faa04 [Xiangrui Meng] remove implicit mapping from ParamMap
c7f6921 [Xiangrui Meng] move ParamGridBuilder test to ParamGridBuilderSuite
e246f29 [Xiangrui Meng] re-org:
7772430 [Xiangrui Meng] remove modelParams add a simple text classification pipeline
b95c408 [Xiangrui Meng] remove implicits add unit tests to params
bab3e5b [Xiangrui Meng] update params
fe0ee92 [Xiangrui Meng] Merge remote-tracking branch 'apache/master' into SPARK-3530
6e86d98 [Xiangrui Meng] some code clean-up
2d040b3 [Xiangrui Meng] implement setters inside each class, add Params.copyValues [ci skip]
fd751fc [Xiangrui Meng] add java-friendly versions of fit and tranform
3f810cd [Xiangrui Meng] use multi-model training api in cv
5b8f413 [Xiangrui Meng] rename model to modelParams
9d2d35d [Xiangrui Meng] test varargs and chain model params
f46e927 [Xiangrui Meng] Merge remote-tracking branch 'apache/master' into SPARK-3530
1ef26e0 [Xiangrui Meng] specialize methods/types for Java
df293ed [Xiangrui Meng] switch to setter/getter
376db0a [Xiangrui Meng] pipeline and parameters
2014-11-12 13:38:57 -05:00
|
|
|
<scope>test</scope>
|
|
|
|
</dependency>
|
2014-08-19 16:28:57 -04:00
|
|
|
<!-- Test-jar of Spark Streaming at ${project.version}, limited to test scope. -->
<dependency>
|
|
|
|
<groupId>org.apache.spark</groupId>
|
|
|
|
<artifactId>spark-streaming_${scala.binary.version}</artifactId>
|
|
|
|
<version>${project.version}</version>
|
|
|
|
<type>test-jar</type>
|
|
|
|
<scope>test</scope>
|
|
|
|
</dependency>
|
2015-04-30 02:21:21 -04:00
|
|
|
<!-- JPMML pmml-model. The scope is set to compile explicitly and no version is declared here. NOTE(review): the explicit scope presumably overrides one configured in the parent POM; confirm. -->
<dependency>
|
|
|
|
<groupId>org.jpmml</groupId>
|
|
|
|
<artifactId>pmml-model</artifactId>
|
2017-07-18 12:53:49 -04:00
|
|
|
<scope>compile</scope>
|
2015-04-30 02:21:21 -04:00
|
|
|
</dependency>
|
2019-02-26 19:26:49 -05:00
|
|
|
<!-- JPMML seems to be the piece that needs JAXB right now: -->
|
|
|
|
<!-- JAXB runtime (org.glassfish.jaxb:jaxb-runtime). No version or scope is declared here. NOTE(review): the version is presumably supplied by the parent POM; confirm. -->
<dependency>
|
|
|
|
<groupId>org.glassfish.jaxb</groupId>
|
|
|
|
<artifactId>jaxb-runtime</artifactId>
|
|
|
|
</dependency>
|
2015-10-07 17:11:21 -04:00
|
|
|
<!-- Spark tags module, suffixed with the Scala binary version. No version or scope is declared here. NOTE(review): the version is presumably supplied by the parent POM; confirm. -->
<dependency>
|
|
|
|
<groupId>org.apache.spark</groupId>
|
2016-05-17 04:55:53 -04:00
|
|
|
<artifactId>spark-tags_${scala.binary.version}</artifactId>
|
2015-10-07 17:11:21 -04:00
|
|
|
</dependency>
|
2016-12-21 19:37:20 -05:00
|
|
|
|
|
|
|
<!--
|
|
|
|
This spark-tags test-dep is needed even though it isn't used in this module, otherwise testing-cmds that exclude
|
|
|
|
them will yield errors.
|
|
|
|
-->
|
|
|
|
<!-- Test-jar of the Spark tags module, limited to test scope. No version is declared here. NOTE(review): presumably supplied by the parent POM; confirm. -->
<dependency>
|
|
|
|
<groupId>org.apache.spark</groupId>
|
|
|
|
<artifactId>spark-tags_${scala.binary.version}</artifactId>
|
|
|
|
<type>test-jar</type>
|
|
|
|
<scope>test</scope>
|
|
|
|
</dependency>
|
|
|
|
|
[SPARK-35150][ML] Accelerate fallback BLAS with dev.ludovic.netlib
### What changes were proposed in this pull request?
Following https://github.com/apache/spark/pull/30810, I've continued looking for ways to accelerate the usage of BLAS in Spark. With this PR, I integrate work done in the [`dev.ludovic.netlib`](https://github.com/luhenry/netlib/) Maven package.
The `dev.ludovic.netlib` library wraps the original `com.github.fommil.netlib` library and focuses on accelerating the linear algebra routines used in Spark. When running the `org.apache.spark.ml.linalg.BLASBenchmark` benchmarking suite, I get the results at [1] on an Intel machine. Moreover, this library is thoroughly tested to return the exact same results as the reference implementation.
Under the hood, it reimplements the necessary algorithms in pure, autovectorization-friendly Java 8, and takes advantage of the Vector API and Foreign Linker API introduced in JDK 16 when available.
A table summarising which version gets loaded in which case:
```
| | BLAS.nativeBLAS | BLAS.javaBLAS |
| --------------------- | -------------------------------------------------- | -------------------------------------------------- |
| with -Pnetlib-lgpl | 1. dev.ludovic.netlib.blas.NetlibNativeBLAS, a | 1. dev.ludovic.netlib.blas.VectorizedBLAS |
| | wrapper for com.github.fommil:all | (JDK16+, relies on the Vector API, requires |
| | 2. dev.ludovic.netlib.blas.ForeignBLAS (JDK16+, | `--add-modules=jdk.incubator.vector` on JDK16) |
| | relies on the Foreign Linker API, requires | 2. dev.ludovic.netlib.blas.Java11BLAS (JDK11+) |
| | `--add-modules=jdk.incubator.foreign | 3. dev.ludovic.netlib.blas.JavaBLAS |
| | -Dforeign.restricted=warn`) | 4. dev.ludovic.netlib.blas.NetlibF2jBLAS, a |
| | 3. fails to load, falls back to BLAS.javaBLAS in | wrapper for com.github.fommil:core |
| | org.apache.spark.ml.linalg.BLAS | |
| --------------------- | -------------------------------------------------- | -------------------------------------------------- |
| without -Pnetlib-lgpl | 1. dev.ludovic.netlib.blas.ForeignBLAS (JDK16+, | 1. dev.ludovic.netlib.blas.VectorizedBLAS |
| | relies on the Foreign Linker API, requires | (JDK16+, relies on the Vector API, requires |
| | `--add-modules=jdk.incubator.foreign | `--add-modules=jdk.incubator.vector` on JDK16) |
| | -Dforeign.restricted=warn`) | 2. dev.ludovic.netlib.blas.Java11BLAS (JDK11+) |
| | 2. fails to load, falls back to BLAS.javaBLAS in | 3. dev.ludovic.netlib.blas.JavaBLAS |
| | org.apache.spark.ml.linalg.BLAS | 4. dev.ludovic.netlib.blas.NetlibF2jBLAS, a |
| | | wrapper for com.github.fommil:core |
| --------------------- | -------------------------------------------------- | -------------------------------------------------- |
```
### Why are the changes needed?
Accelerates linear algebra operations when the pure-java fallback method is in use. Transparently falls back to native implementation (OpenBLAS, MKL) when available.
### Does this PR introduce _any_ user-facing change?
No, all changes are transparent to the user.
### How was this patch tested?
The `dev.ludovic.netlib` library has its own test suite [2]. It has also been validated by running the Spark test suite and benchmarking suite.
[1] Results for `org.apache.spark.ml.linalg.BLASBenchmark`:
#### JDK8:
```
[info] OpenJDK 64-Bit Server VM 1.8.0_292-b10 on Linux 5.8.0-50-generic
[info] Intel(R) Xeon(R) E-2276G CPU 3.80GHz
[info]
[info] f2jBLAS = dev.ludovic.netlib.blas.NetlibF2jBLAS
[info] javaBLAS = dev.ludovic.netlib.blas.Java8BLAS
[info] nativeBLAS = dev.ludovic.netlib.blas.Java8BLAS
[info]
[info] daxpy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] f2j 223 232 8 448.0 2.2 1.0X
[info] java 221 228 7 453.0 2.2 1.0X
[info]
[info] saxpy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] f2j 122 128 4 821.2 1.2 1.0X
[info] java 122 128 4 822.3 1.2 1.0X
[info]
[info] ddot: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] f2j 109 112 2 921.4 1.1 1.0X
[info] java 70 74 3 1423.5 0.7 1.5X
[info]
[info] sdot: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] f2j 96 98 2 1046.1 1.0 1.0X
[info] java 47 49 2 2121.7 0.5 2.0X
[info]
[info] dscal: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] f2j 184 195 8 544.3 1.8 1.0X
[info] java 185 196 7 539.5 1.9 1.0X
[info]
[info] sscal: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] f2j 99 104 4 1011.9 1.0 1.0X
[info] java 99 104 4 1010.4 1.0 1.0X
[info]
[info] dspmv[U]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] f2j 1 1 0 947.2 1.1 1.0X
[info] java 0 0 0 1584.8 0.6 1.7X
[info]
[info] dspr[U]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] f2j 1 1 0 867.4 1.2 1.0X
[info] java 1 1 0 865.0 1.2 1.0X
[info]
[info] dsyr[U]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] f2j 1 1 0 485.9 2.1 1.0X
[info] java 1 1 0 486.8 2.1 1.0X
[info]
[info] dgemv[N]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] f2j 1 1 0 1843.0 0.5 1.0X
[info] java 0 0 0 2690.6 0.4 1.5X
[info]
[info] dgemv[T]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] f2j 1 1 0 1214.7 0.8 1.0X
[info] java 0 0 0 2536.8 0.4 2.1X
[info]
[info] sgemv[N]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] f2j 1 1 0 1895.9 0.5 1.0X
[info] java 0 0 0 2961.1 0.3 1.6X
[info]
[info] sgemv[T]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] f2j 1 1 0 1223.4 0.8 1.0X
[info] java 0 0 0 3091.4 0.3 2.5X
[info]
[info] dgemm[N,N]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] f2j 560 575 20 1787.1 0.6 1.0X
[info] java 226 232 5 4432.4 0.2 2.5X
[info]
[info] dgemm[N,T]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] f2j 570 586 23 1755.2 0.6 1.0X
[info] java 227 232 4 4410.1 0.2 2.5X
[info]
[info] dgemm[T,N]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] f2j 863 879 17 1158.4 0.9 1.0X
[info] java 227 231 3 4407.9 0.2 3.8X
[info]
[info] dgemm[T,T]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] f2j 1282 1305 23 780.0 1.3 1.0X
[info] java 227 232 4 4413.4 0.2 5.7X
[info]
[info] sgemm[N,N]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] f2j 538 548 8 1858.6 0.5 1.0X
[info] java 221 226 3 4521.1 0.2 2.4X
[info]
[info] sgemm[N,T]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] f2j 549 558 10 1819.9 0.5 1.0X
[info] java 222 229 7 4503.5 0.2 2.5X
[info]
[info] sgemm[T,N]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] f2j 838 852 12 1193.0 0.8 1.0X
[info] java 222 229 5 4500.5 0.2 3.8X
[info]
[info] sgemm[T,T]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] f2j 905 919 18 1104.8 0.9 1.0X
[info] java 221 228 5 4521.3 0.2 4.1X
```
#### JDK11:
```
[info] OpenJDK 64-Bit Server VM 11.0.11+9-LTS on Linux 5.8.0-50-generic
[info] Intel(R) Xeon(R) E-2276G CPU 3.80GHz
[info]
[info] f2jBLAS = dev.ludovic.netlib.blas.NetlibF2jBLAS
[info] javaBLAS = dev.ludovic.netlib.blas.Java11BLAS
[info] nativeBLAS = dev.ludovic.netlib.blas.Java11BLAS
[info]
[info] daxpy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] f2j 195 204 10 512.7 2.0 1.0X
[info] java 195 202 7 512.4 2.0 1.0X
[info]
[info] saxpy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] f2j 108 113 4 923.3 1.1 1.0X
[info] java 102 107 4 984.4 1.0 1.1X
[info]
[info] ddot: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] f2j 107 110 3 938.1 1.1 1.0X
[info] java 69 72 3 1447.1 0.7 1.5X
[info]
[info] sdot: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] f2j 96 98 2 1046.5 1.0 1.0X
[info] java 43 45 2 2317.1 0.4 2.2X
[info]
[info] dscal: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] f2j 155 168 8 644.2 1.6 1.0X
[info] java 158 169 8 632.8 1.6 1.0X
[info]
[info] sscal: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] f2j 85 90 4 1178.1 0.8 1.0X
[info] java 86 90 4 1167.7 0.9 1.0X
[info]
[info] dspmv[U]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] f2j 0 0 0 1182.1 0.8 1.0X
[info] java 0 0 0 1432.1 0.7 1.2X
[info]
[info] dspr[U]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] f2j 1 1 0 898.7 1.1 1.0X
[info] java 1 1 0 891.5 1.1 1.0X
[info]
[info] dsyr[U]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] f2j 1 1 0 495.4 2.0 1.0X
[info] java 1 1 0 495.7 2.0 1.0X
[info]
[info] dgemv[N]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] f2j 0 0 0 2271.6 0.4 1.0X
[info] java 0 0 0 3648.1 0.3 1.6X
[info]
[info] dgemv[T]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] f2j 1 1 0 1229.3 0.8 1.0X
[info] java 0 0 0 2711.3 0.4 2.2X
[info]
[info] sgemv[N]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] f2j 0 0 0 2677.5 0.4 1.0X
[info] java 0 0 0 3288.2 0.3 1.2X
[info]
[info] sgemv[T]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] f2j 1 1 0 1233.0 0.8 1.0X
[info] java 0 0 0 2766.3 0.4 2.2X
[info]
[info] dgemm[N,N]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] f2j 520 536 16 1923.6 0.5 1.0X
[info] java 214 221 7 4669.5 0.2 2.4X
[info]
[info] dgemm[N,T]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] f2j 593 612 17 1686.5 0.6 1.0X
[info] java 215 219 3 4643.3 0.2 2.8X
[info]
[info] dgemm[T,N]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] f2j 853 870 16 1172.8 0.9 1.0X
[info] java 215 218 3 4659.7 0.2 4.0X
[info]
[info] dgemm[T,T]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] f2j 1350 1370 23 740.8 1.3 1.0X
[info] java 215 219 4 4656.6 0.2 6.3X
[info]
[info] sgemm[N,N]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] f2j 460 468 6 2173.2 0.5 1.0X
[info] java 210 213 2 4752.7 0.2 2.2X
[info]
[info] sgemm[N,T]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] f2j 535 544 8 1869.3 0.5 1.0X
[info] java 210 215 5 4761.8 0.2 2.5X
[info]
[info] sgemm[T,N]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] f2j 843 853 11 1186.8 0.8 1.0X
[info] java 209 214 4 4793.4 0.2 4.0X
[info]
[info] sgemm[T,T]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] f2j 891 904 15 1122.0 0.9 1.0X
[info] java 209 214 4 4777.2 0.2 4.3X
```
#### JDK16:
```
[info] OpenJDK 64-Bit Server VM 16+36 on Linux 5.8.0-50-generic
[info] Intel(R) Xeon(R) E-2276G CPU 3.80GHz
[info]
[info] f2jBLAS = dev.ludovic.netlib.blas.NetlibF2jBLAS
[info] javaBLAS = dev.ludovic.netlib.blas.VectorizedBLAS
[info] nativeBLAS = dev.ludovic.netlib.blas.VectorizedBLAS
[info]
[info] daxpy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] f2j 194 199 7 515.7 1.9 1.0X
[info] java 181 186 3 551.1 1.8 1.1X
[info]
[info] saxpy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] f2j 109 115 4 915.0 1.1 1.0X
[info] java 88 92 3 1138.8 0.9 1.2X
[info]
[info] ddot: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] f2j 108 110 2 922.6 1.1 1.0X
[info] java 54 56 2 1839.2 0.5 2.0X
[info]
[info] sdot: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] f2j 96 97 2 1046.1 1.0 1.0X
[info] java 29 30 1 3393.4 0.3 3.2X
[info]
[info] dscal: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] f2j 156 165 5 643.0 1.6 1.0X
[info] java 150 159 5 667.1 1.5 1.0X
[info]
[info] sscal: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] f2j 85 91 6 1171.0 0.9 1.0X
[info] java 75 79 3 1340.6 0.7 1.1X
[info]
[info] dspmv[U]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] f2j 1 1 0 917.0 1.1 1.0X
[info] java 0 0 0 8147.2 0.1 8.9X
[info]
[info] dspr[U]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] f2j 1 1 0 859.3 1.2 1.0X
[info] java 1 1 0 859.3 1.2 1.0X
[info]
[info] dsyr[U]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] f2j 1 1 0 482.1 2.1 1.0X
[info] java 1 1 0 482.6 2.1 1.0X
[info]
[info] dgemv[N]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] f2j 0 0 0 2214.2 0.5 1.0X
[info] java 0 0 0 7975.8 0.1 3.6X
[info]
[info] dgemv[T]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] f2j 1 1 0 1231.4 0.8 1.0X
[info] java 0 0 0 8680.9 0.1 7.0X
[info]
[info] sgemv[N]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] f2j 0 0 0 2684.3 0.4 1.0X
[info] java 0 0 0 18527.1 0.1 6.9X
[info]
[info] sgemv[T]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] f2j 1 1 0 1235.4 0.8 1.0X
[info] java 0 0 0 17347.9 0.1 14.0X
[info]
[info] dgemm[N,N]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] f2j 530 552 18 1887.5 0.5 1.0X
[info] java 58 64 3 17143.9 0.1 9.1X
[info]
[info] dgemm[N,T]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] f2j 598 620 17 1671.1 0.6 1.0X
[info] java 58 64 3 17196.6 0.1 10.3X
[info]
[info] dgemm[T,N]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] f2j 834 847 14 1199.4 0.8 1.0X
[info] java 57 63 4 17486.9 0.1 14.6X
[info]
[info] dgemm[T,T]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] f2j 1338 1366 22 747.3 1.3 1.0X
[info] java 58 63 3 17356.6 0.1 23.2X
[info]
[info] sgemm[N,N]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] f2j 489 501 9 2045.5 0.5 1.0X
[info] java 36 38 2 27721.9 0.0 13.6X
[info]
[info] sgemm[N,T]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] f2j 478 488 9 2094.0 0.5 1.0X
[info] java 36 38 2 27813.2 0.0 13.3X
[info]
[info] sgemm[T,N]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] f2j 825 837 10 1211.6 0.8 1.0X
[info] java 35 38 2 28433.1 0.0 23.5X
[info]
[info] sgemm[T,T]: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] f2j 900 918 15 1111.6 0.9 1.0X
[info] java 36 38 2 28073.0 0.0 25.3X
```
[2] https://github.com/luhenry/netlib/tree/master/blas/src/test/java/dev/ludovic/netlib/blas
Closes #32253 from luhenry/master.
Authored-by: Ludovic Henry <git@ludovic.dev>
Signed-off-by: Sean Owen <srowen@gmail.com>
2021-04-27 15:00:59 -04:00
|
|
|
<!-- dev.ludovic.netlib linear-algebra libraries: BLAS, LAPACK and ARPACK.
     No version is declared on these entries; presumably it is managed by the
     parent POM (TODO confirm). -->
<dependency>
|
|
|
|
<groupId>dev.ludovic.netlib</groupId>
|
|
|
|
<artifactId>blas</artifactId>
|
|
|
|
</dependency>
|
|
|
|
<dependency>
|
|
|
|
<groupId>dev.ludovic.netlib</groupId>
|
|
|
|
<artifactId>lapack</artifactId>
|
|
|
|
</dependency>
|
|
|
|
<dependency>
|
|
|
|
<groupId>dev.ludovic.netlib</groupId>
|
|
|
|
<artifactId>arpack</artifactId>
|
|
|
|
</dependency>
|
|
|
|
|
2013-07-30 17:03:15 -04:00
|
|
|
</dependencies>
|
2017-07-18 12:53:49 -04:00
|
|
|
|
[SPARK-35532][TESTS] Ensure mllib and kafka-0-10 module can be maven test independently in Scala 2.13
### What changes were proposed in this pull request?
Before this PR, when we ran the Maven test command to test the `mllib` and `kafka-0-10` modules independently, some Java UTs failed. The key error messages are as follows:
```
java.lang.NoClassDefFoundError: scala/collection/parallel/TaskSupport
```
and
```
java.lang.NoClassDefFoundError: scala/collection/parallel/immutable/ParVector
```
The UTs need `scala-parallel-collections_2.13`, but it is not on the classpath when we run `mvn test -pl mllib -Pscala-2.13` and `mvn test -pl external/kafka-0-10 -Pscala-2.13`.
So the main change of this PR is to add a `scala-2.13` profile to `mllib/pom.xml` and `external/kafka-0-10/pom.xml`. The `scala-2.13` profile includes a dependency on `scala-parallel-collections_2.13`, so that these two modules can be tested with Maven independently.
### Why are the changes needed?
Ensure that the mllib and kafka-0-10 modules can each be tested with Maven independently in Scala 2.13.
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
- Pass the GitHub Action Scala 2.13 job
- Manual test:
1. Execute
```
dev/change-scala-version.sh 2.13
mvn clean install -DskipTests -Phadoop-3.2 -Phive-2.3 -Phadoop-cloud -Pmesos -Pyarn -Pkinesis-asl -Phive-thriftserver -Pspark-ganglia-lgpl -Pkubernetes -Phive -Pscala-2.13
```
2. Execute
```
mvn test -pl mllib -Phadoop-3.2 -Phive-2.3 -Phadoop-cloud -Pmesos -Pyarn -Pkinesis-asl -Phive-thriftserver -Pspark-ganglia-lgpl -Pkubernetes -Phive -Pscala-2.13
```
**Before**
6 Java UTs failed:
```
[ERROR] Errors:
[ERROR] JavaStreamingLogisticRegressionSuite.javaAPI:78 » TestFailed 20005 was not les...
[ERROR] JavaStreamingKMeansSuite.javaAPI:78 » TestFailed 20040 was not less than 20000...
[ERROR] JavaPrefixSpanSuite.runPrefixSpan:45 » NoClassDefFound scala/collection/parall...
[ERROR] JavaPrefixSpanSuite.runPrefixSpanSaveLoad:67 » NoClassDefFound scala/collectio...
[ERROR] JavaStreamingLinearRegressionSuite.javaAPI:77 » TestFailed 20014 was not less ...
[ERROR] JavaStatisticsSuite.streamingTest:112 » TestFailed 20043 was not less than 200...
[INFO]
[ERROR] Tests run: 122, Failures: 0, Errors: 6, Skipped: 0
```
**After**
```
[INFO] Tests run: 122, Failures: 0, Errors: 0, Skipped: 0
Run completed in 28 minutes, 32 seconds.
Total number of tests run: 1654
Suites: completed 208, aborted 0
Tests: succeeded 1654, failed 0, canceled 0, ignored 7, pending 0
All tests passed.
```
3. Execute
```
mvn test -pl external/kafka-0-10 -Phadoop-3.2 -Phive-2.3 -Phadoop-cloud -Pmesos -Pyarn -Pkinesis-asl -Phive-thriftserver -Pspark-ganglia-lgpl -Pkubernetes -Phive -Pscala-2.13
```
**Before**
2 Java UTs failed:
```
[ERROR] Errors:
[ERROR] org.apache.spark.streaming.kafka010.JavaDirectKafkaStreamSuite.testKafkaStream
[ERROR] Run 1: JavaDirectKafkaStreamSuite.testKafkaStream:170 expected:<[topic1-1, topic1-2, topic2-1, topic1-3, topic2-2, topic2-3]> but was:<[]>
[ERROR] Run 2: JavaDirectKafkaStreamSuite.tearDown:57 » NoClassDefFound scala/collection/para...
[ERROR] Tests run: 4, Failures: 0, Errors: 1, Skipped: 0
```
**After**
```
[INFO] Tests run: 4, Failures: 0, Errors: 0, Skipped: 0
Run completed in 1 minute, 3 seconds.
Total number of tests run: 21
Suites: completed 4, aborted 0
Tests: succeeded 21, failed 0, canceled 0, ignored 0, pending 0
All tests passed.
```
Closes #32676 from LuciferYang/mllib-kafka-mvn-test.
Authored-by: yangjie01 <yangjie01@baidu.com>
Signed-off-by: Dongjoon Hyun <dhyun@apple.com>
2021-05-30 19:36:17 -04:00
|
|
|
<profiles>
<!-- scala-2.13: adds scala-parallel-collections to this module's dependencies
     so that its tests can be run on their own with -Pscala-2.13 (SPARK-35532). -->
|
|
|
|
<profile>
|
|
|
|
<id>scala-2.13</id>
|
|
|
|
<dependencies>
|
|
|
|
<dependency>
|
|
|
|
<groupId>org.scala-lang.modules</groupId>
|
|
|
|
<artifactId>scala-parallel-collections_${scala.binary.version}</artifactId>
|
|
|
|
</dependency>
|
|
|
|
</dependencies>
|
|
|
|
</profile>
|
|
|
|
</profiles>
|
|
|
|
|
2013-07-30 17:03:15 -04:00
|
|
|
<build>
<!-- Main and test classes are written to directories under target/ that are
     named after the Scala binary version. -->
|
2013-12-15 15:39:58 -05:00
|
|
|
<outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
|
|
|
|
<testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory>
|
2017-07-18 12:53:49 -04:00
|
|
|
<plugins>
|
|
|
|
<plugin>
|
|
|
|
<groupId>org.apache.maven.plugins</groupId>
|
|
|
|
<artifactId>maven-dependency-plugin</artifactId>
|
|
|
|
<executions>
|
|
|
|
<!-- When using SPARK_PREPEND_CLASSES Spark classes compiled locally don't use
|
|
|
|
shaded deps. So here we store jars in their original form which are added
|
|
|
|
when the classpath is computed. -->
|
|
|
|
<!-- See similar execution in core/pom.xml -->
|
|
|
|
<execution>
|
|
|
|
<id>copy-dependencies</id>
|
|
|
|
<phase>package</phase>
|
|
|
|
<goals>
|
|
|
|
<goal>copy-dependencies</goal>
|
|
|
|
</goals>
|
|
|
|
<configuration>
<!-- Only dependencies in the org.jpmml group are copied. They are placed in
     the project build directory, with one subdirectory per artifact type.
     Release and snapshot overwriting is turned off, while overWriteIfNewer
     is turned on. -->
|
|
|
|
<outputDirectory>${project.build.directory}</outputDirectory>
|
|
|
|
<overWriteReleases>false</overWriteReleases>
|
|
|
|
<overWriteSnapshots>false</overWriteSnapshots>
|
|
|
|
<overWriteIfNewer>true</overWriteIfNewer>
|
|
|
|
<useSubDirectoryPerType>true</useSubDirectoryPerType>
|
|
|
|
<includeGroupIds>org.jpmml</includeGroupIds>
|
|
|
|
<silent>true</silent>
|
|
|
|
</configuration>
|
|
|
|
</execution>
|
|
|
|
</executions>
|
|
|
|
</plugin>
|
|
|
|
</plugins>
|
2013-07-30 17:03:15 -04:00
|
|
|
</build>
|
2017-07-18 12:53:49 -04:00
|
|
|
|
2013-07-30 17:03:15 -04:00
|
|
|
</project>
|