<?xml version="1.0" encoding="UTF-8"?>
<!--
  ~ Licensed to the Apache Software Foundation (ASF) under one or more
  ~ contributor license agreements.  See the NOTICE file distributed with
  ~ this work for additional information regarding copyright ownership.
  ~ The ASF licenses this file to You under the Apache License, Version 2.0
  ~ (the "License"); you may not use this file except in compliance with
  ~ the License.  You may obtain a copy of the License at
  ~
  ~    http://www.apache.org/licenses/LICENSE-2.0
  ~
  ~ Unless required by applicable law or agreed to in writing, software
  ~ distributed under the License is distributed on an "AS IS" BASIS,
  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  ~ See the License for the specific language governing permissions and
  ~ limitations under the License.
  -->

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <parent>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-parent_2.10</artifactId>
    <version>1.4.0-SNAPSHOT</version>
    <relativePath>../pom.xml</relativePath>
  </parent>

  <groupId>org.apache.spark</groupId>
  <artifactId>spark-mllib_2.10</artifactId>
  <properties>
    <sbt.project.name>mllib</sbt.project.name>
  </properties>
  <packaging>jar</packaging>
  <name>Spark Project ML Library</name>
  <url>http://spark.apache.org/</url>

  <dependencies>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-core_${scala.binary.version}</artifactId>
      <version>${project.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-streaming_${scala.binary.version}</artifactId>
      <version>${project.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-sql_${scala.binary.version}</artifactId>
      <version>${project.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-graphx_${scala.binary.version}</artifactId>
      <version>${project.version}</version>
    </dependency>
    <dependency>
      <groupId>org.jblas</groupId>
      <artifactId>jblas</artifactId>
      <version>${jblas.version}</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.scalanlp</groupId>
      <artifactId>breeze_${scala.binary.version}</artifactId>
      <version>0.11.2</version>
      <exclusions>
        <!-- This is included as a compile-scoped dependency by jtransforms, which is
             a dependency of breeze. -->
        <exclusion>
          <groupId>junit</groupId>
          <artifactId>junit</artifactId>
        </exclusion>
        <exclusion>
          <groupId>org.apache.commons</groupId>
          <artifactId>commons-math3</artifactId>
        </exclusion>
      </exclusions>
    </dependency>
    <dependency>
      <groupId>org.apache.commons</groupId>
      <artifactId>commons-math3</artifactId>
    </dependency>
    <dependency>
      <groupId>org.scalacheck</groupId>
      <artifactId>scalacheck_${scala.binary.version}</artifactId>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>com.novocode</groupId>
      <artifactId>junit-interface</artifactId>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.mockito</groupId>
      <artifactId>mockito-all</artifactId>
      <scope>test</scope>
    </dependency>
    <!-- Distinct from the compile-scoped spark-streaming dependency above:
         this pulls in the streaming test utilities (test-jar) for unit tests. -->
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-streaming_${scala.binary.version}</artifactId>
      <version>${project.version}</version>
      <type>test-jar</type>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.jpmml</groupId>
      <artifactId>pmml-model</artifactId>
      <version>1.1.15</version>
      <exclusions>
        <exclusion>
          <groupId>com.sun.xml.fastinfoset</groupId>
          <artifactId>FastInfoset</artifactId>
        </exclusion>
        <exclusion>
          <groupId>com.sun.istack</groupId>
          <artifactId>istack-commons-runtime</artifactId>
        </exclusion>
      </exclusions>
    </dependency>
  </dependencies>
  <profiles>
    <profile>
      <id>netlib-lgpl</id>
      <dependencies>
        <dependency>
          <groupId>com.github.fommil.netlib</groupId>
          <artifactId>all</artifactId>
          <version>${netlib.java.version}</version>
          <type>pom</type>
        </dependency>
      </dependencies>
    </profile>
  </profiles>
  <build>
    <outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
    <testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory>
  </build>
</project>