[SPARK-7581] [ML] [DOC] User guide for spark.ml PolynomialExpansion
JIRA [here](https://issues.apache.org/jira/browse/SPARK-7581). CC jkbradley

Author: Xusen Yin <yinxusen@gmail.com>

Closes #6113 from yinxusen/SPARK-7581 and squashes the following commits:

1a7d80d [Xusen Yin] merge with master
892a8e9 [Xusen Yin] fix python 3 compatibility
ec935bf [Xusen Yin] small fix
3e9fa1d [Xusen Yin] delete note
69fcf85 [Xusen Yin] simplify and add python example
81d21dc [Xusen Yin] add programming guide for Polynomial Expansion
40babfb [Xusen Yin] add java test suite for PolynomialExpansion
Commit 6008ec14ed (parent 23cf897112)
@@ -268,5 +268,88 @@ for binarized_feature, in binarizedFeatures.collect():
</div>
</div>
## PolynomialExpansion
[Polynomial expansion](http://en.wikipedia.org/wiki/Polynomial_expansion) is the process of expanding your features into a polynomial space, which is formulated by an n-degree combination of the original dimensions. The [PolynomialExpansion](api/scala/index.html#org.apache.spark.ml.feature.PolynomialExpansion) class provides this functionality. The example below shows how to expand your features into a 3-degree polynomial space.
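For example, expanding the 2-dimensional vector `(x, y)` into a 3-degree polynomial space produces the nine features `(x, x^2, x^3, y, x*y, x^2*y, y^2, x*y^2, y^3)`; the first input vector below, `(-2.0, 2.3)`, therefore expands to approximately `(-2.0, 4.0, -8.0, 2.3, -4.6, 9.2, 5.29, -10.58, 12.17)`, which is the expected output used in the accompanying Java test suite.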
<div class="codetabs">
<div data-lang="scala" markdown="1">
{% highlight scala %}
import org.apache.spark.ml.feature.PolynomialExpansion
import org.apache.spark.mllib.linalg.Vectors

val data = Array(
  Vectors.dense(-2.0, 2.3),
  Vectors.dense(0.0, 0.0),
  Vectors.dense(0.6, -1.1)
)
val df = sqlContext.createDataFrame(data.map(Tuple1.apply)).toDF("features")
val polynomialExpansion = new PolynomialExpansion()
  .setInputCol("features")
  .setOutputCol("polyFeatures")
  .setDegree(3)
val polyDF = polynomialExpansion.transform(df)
polyDF.select("polyFeatures").take(3).foreach(println)
{% endhighlight %}
</div>

<div data-lang="java" markdown="1">
{% highlight java %}
import com.google.common.collect.Lists;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.ml.feature.PolynomialExpansion;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.VectorUDT;
import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

JavaSparkContext jsc = ... // an existing JavaSparkContext
SQLContext jsql = ...      // an existing SQLContext
PolynomialExpansion polyExpansion = new PolynomialExpansion()
  .setInputCol("features")
  .setOutputCol("polyFeatures")
  .setDegree(3);
JavaRDD<Row> data = jsc.parallelize(Lists.newArrayList(
  RowFactory.create(Vectors.dense(-2.0, 2.3)),
  RowFactory.create(Vectors.dense(0.0, 0.0)),
  RowFactory.create(Vectors.dense(0.6, -1.1))
));
StructType schema = new StructType(new StructField[] {
  new StructField("features", new VectorUDT(), false, Metadata.empty())
});
DataFrame df = jsql.createDataFrame(data, schema);
DataFrame polyDF = polyExpansion.transform(df);
Row[] row = polyDF.select("polyFeatures").take(3);
for (Row r : row) {
  System.out.println(r.get(0));
}
{% endhighlight %}
</div>

<div data-lang="python" markdown="1">
{% highlight python %}
from pyspark.ml.feature import PolynomialExpansion
from pyspark.mllib.linalg import Vectors

df = sqlContext.createDataFrame(
  [(Vectors.dense([-2.0, 2.3]), ),
   (Vectors.dense([0.0, 0.0]), ),
   (Vectors.dense([0.6, -1.1]), )],
  ["features"])
px = PolynomialExpansion(degree=3, inputCol="features", outputCol="polyFeatures")
polyDF = px.transform(df)
for expanded in polyDF.select("polyFeatures").take(3):
  print(expanded)
{% endhighlight %}
</div>
</div>

# Feature Selectors
@@ -0,0 +1,91 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.ml.feature;

import com.google.common.collect.Lists;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.VectorUDT;
import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class JavaPolynomialExpansionSuite {
  private transient JavaSparkContext jsc;
  private transient SQLContext jsql;

  @Before
  public void setUp() {
    jsc = new JavaSparkContext("local", "JavaPolynomialExpansionSuite");
    jsql = new SQLContext(jsc);
  }

  @After
  public void tearDown() {
    jsc.stop();
    jsc = null;
  }

  @Test
  public void polynomialExpansionTest() {
    PolynomialExpansion polyExpansion = new PolynomialExpansion()
      .setInputCol("features")
      .setOutputCol("polyFeatures")
      .setDegree(3);

    // Each row pairs an input vector with its expected degree-3 expansion.
    JavaRDD<Row> data = jsc.parallelize(Lists.newArrayList(
      RowFactory.create(
        Vectors.dense(-2.0, 2.3),
        Vectors.dense(-2.0, 4.0, -8.0, 2.3, -4.6, 9.2, 5.29, -10.58, 12.17)
      ),
      RowFactory.create(Vectors.dense(0.0, 0.0), Vectors.dense(new double[9])),
      RowFactory.create(
        Vectors.dense(0.6, -1.1),
        Vectors.dense(0.6, 0.36, 0.216, -1.1, -0.66, -0.396, 1.21, 0.726, -1.331)
      )
    ));

    StructType schema = new StructType(new StructField[] {
      new StructField("features", new VectorUDT(), false, Metadata.empty()),
      new StructField("expected", new VectorUDT(), false, Metadata.empty())
    });

    DataFrame dataset = jsql.createDataFrame(data, schema);

    Row[] pairs = polyExpansion.transform(dataset)
      .select("polyFeatures", "expected")
      .collect();

    for (Row r : pairs) {
      double[] polyFeatures = ((Vector) r.get(0)).toArray();
      double[] expected = ((Vector) r.get(1)).toArray();
      Assert.assertArrayEquals(polyFeatures, expected, 1e-1);
    }
  }
}