[SPARK-5726] [MLLIB] Elementwise (Hadamard) Vector Product Transformer
See https://issues.apache.org/jira/browse/SPARK-5726
Author: Octavian Geagla <ogeagla@gmail.com>
Author: Joseph K. Bradley <joseph@databricks.com>
Closes #4580 from ogeagla/spark-mllib-weighting and squashes the following commits:
fac12ad [Octavian Geagla] [SPARK-5726] [MLLIB] Use new createTransformFunc.
90f7e39 [Joseph K. Bradley] small cleanups
4595165 [Octavian Geagla] [SPARK-5726] [MLLIB] Remove erroneous test case.
ded3ac6 [Octavian Geagla] [SPARK-5726] [MLLIB] Pass style checks.
37d4705 [Octavian Geagla] [SPARK-5726] [MLLIB] Incorporated feedback.
1dffeee [Octavian Geagla] [SPARK-5726] [MLLIB] Pass style checks.
e436896 [Octavian Geagla] [SPARK-5726] [MLLIB] Remove 'TF' from 'ElementwiseProductTF'
cb520e6 [Octavian Geagla] [SPARK-5726] [MLLIB] Rename HadamardProduct to ElementwiseProduct
4922722 [Octavian Geagla] [SPARK-5726] [MLLIB] Hadamard Vector Product Transformer
(cherry picked from commit 658a478d3f
)
Signed-off-by: Joseph K. Bradley <joseph@databricks.com>
This commit is contained in:
parent
4436e26e43
commit
76e58b5d88
|
@ -477,3 +477,57 @@ sc.stop();
|
|||
</div>
|
||||
</div>
|
||||
|
||||
## ElementwiseProduct
|
||||
|
||||
`ElementwiseProduct` multiplies each input vector by a provided "weight" vector, using element-wise multiplication. In other words, it scales each column of the dataset by a scalar multiplier. This represents the [Hadamard product](https://en.wikipedia.org/wiki/Hadamard_product_%28matrices%29) between the input vector, `v`, and the transforming vector, `w`, to yield a result vector.
|
||||
|
||||
`\[ \begin{pmatrix}
v_1 \\
\vdots \\
v_N
\end{pmatrix} \circ \begin{pmatrix}
w_1 \\
\vdots \\
w_N
\end{pmatrix}
= \begin{pmatrix}
v_1 w_1 \\
\vdots \\
v_N w_N
\end{pmatrix}
\]`
|
||||
|
||||
[`ElementwiseProduct`](api/scala/index.html#org.apache.spark.mllib.feature.ElementwiseProduct) has the following parameter in the constructor:
|
||||
|
||||
* `w`: the transforming vector.
|
||||
|
||||
`ElementwiseProduct` implements [`VectorTransformer`](api/scala/index.html#org.apache.spark.mllib.feature.VectorTransformer) which can apply the weighting on a `Vector` to produce a transformed `Vector` or on an `RDD[Vector]` to produce a transformed `RDD[Vector]`.
|
||||
|
||||
### Example
|
||||
|
||||
The example below demonstrates how to load a simple vectors file, extract a set of vectors, and then transform those vectors using a transforming vector value.
|
||||
|
||||
|
||||
<div class="codetabs">
|
||||
<div data-lang="scala">
|
||||
{% highlight scala %}
|
||||
import org.apache.spark.SparkContext._
|
||||
import org.apache.spark.mllib.feature.ElementwiseProduct
|
||||
import org.apache.spark.mllib.linalg.Vectors
|
||||
|
||||
// Load and parse the data:
|
||||
val data = sc.textFile("data/mllib/kmeans_data.txt")
|
||||
val parsedData = data.map(s => Vectors.dense(s.split(' ').map(_.toDouble)))
|
||||
|
||||
val transformingVector = Vectors.dense(0.0, 1.0, 2.0)
|
||||
val transformer = new ElementwiseProduct(transformingVector)
|
||||
|
||||
// Batch transform and per-row transform give the same results:
|
||||
val transformedData = transformer.transform(parsedData)
|
||||
val transformedData2 = parsedData.map(x => transformer.transform(x))
|
||||
|
||||
{% endhighlight %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
|
|
|
@ -0,0 +1,55 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.spark.ml.feature
|
||||
|
||||
import org.apache.spark.annotation.AlphaComponent
|
||||
import org.apache.spark.ml.UnaryTransformer
|
||||
import org.apache.spark.ml.param.Param
|
||||
import org.apache.spark.mllib.feature
|
||||
import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
|
||||
import org.apache.spark.sql.types.DataType
|
||||
|
||||
/**
 * :: AlphaComponent ::
 * Outputs the Hadamard product (i.e., the element-wise product) of each input vector with a
 * provided "weight" vector.  In other words, it scales each column of the dataset by a scalar
 * multiplier.
 *
 * This pipeline stage delegates the actual arithmetic to
 * [[org.apache.spark.mllib.feature.ElementwiseProduct]].
 */
@AlphaComponent
class ElementwiseProduct extends UnaryTransformer[Vector, Vector, ElementwiseProduct] {

  /**
   * The vector to multiply with input vectors.
   *
   * NOTE(review): the param is registered under the name "scalingVector" while the field is
   * called `scalingVec`; confirm whether any name-based lookup relies on either spelling
   * before aligning them.
   * @group param
   */
  val scalingVec: Param[Vector] = new Param(this, "scalingVector", "vector for hadamard product")

  /** @group setParam */
  def setScalingVec(value: Vector): this.type = set(scalingVec, value)

  /** @group getParam */
  def getScalingVec: Vector = getOrDefault(scalingVec)

  /**
   * Builds the per-row function that multiplies an input vector by the configured scaling
   * vector.
   *
   * @return a function applying the element-wise product to one vector.
   * @throws IllegalArgumentException if no scaling vector has been set (and none is defaulted).
   */
  override protected def createTransformFunc: Vector => Vector = {
    // `params` enumerates every Param declared on this class, so `params.contains(scalingVec)`
    // is always true and never guarded anything. Check that a value is actually available.
    require(isDefined(scalingVec), "transformation requires a weight vector")
    val elemScaler = new feature.ElementwiseProduct($(scalingVec))
    elemScaler.transform
  }

  /** The transformed column holds vectors, so its SQL type is the vector UDT. */
  override protected def outputDataType: DataType = new VectorUDT()
}
|
|
@ -0,0 +1,64 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.spark.mllib.feature
|
||||
|
||||
import org.apache.spark.annotation.Experimental
|
||||
import org.apache.spark.mllib.linalg._
|
||||
|
||||
/**
 * :: Experimental ::
 * Computes the Hadamard (element-wise) product of every input vector with a fixed "weight"
 * vector, i.e. each column of the dataset is rescaled by its own scalar factor.
 *
 * @param scalingVector The values used to scale the reference vector's individual components.
 */
@Experimental
class ElementwiseProduct(val scalingVector: Vector) extends VectorTransformer {

  /**
   * Applies the Hadamard product to a single vector.
   *
   * @param vector vector to be transformed; its size must equal the size of `scalingVector`.
   * @return a new vector of the same kind (dense or sparse) holding the scaled values.
   */
  override def transform(vector: Vector): Vector = {
    require(vector.size == scalingVector.size,
      s"vector sizes do not match: Expected ${scalingVector.size} but found ${vector.size}")
    vector match {
      case dense: DenseVector =>
        Vectors.dense(scaleDense(dense.values))
      case sparse: SparseVector =>
        // The index array is passed through as-is; only the stored values are recomputed.
        Vectors.sparse(sparse.size, sparse.indices, scaleSparse(sparse.indices, sparse.values))
      case other =>
        throw new IllegalArgumentException("Does not support vector type " + other.getClass)
    }
  }

  /** Returns a copy of `source` with entry `k` multiplied by `scalingVector(k)`. */
  private def scaleDense(source: Array[Double]): Array[Double] = {
    val scaled = source.clone()
    val limit = scalingVector.size
    var pos = 0
    while (pos < limit) {
      scaled(pos) *= scalingVector(pos)
      pos += 1
    }
    scaled
  }

  /**
   * Returns a copy of `storedValues` where the k-th entry is multiplied by the scaling factor
   * found at position `storedIndices(k)`.
   */
  private def scaleSparse(
      storedIndices: Array[Int],
      storedValues: Array[Double]): Array[Double] = {
    val scaled = storedValues.clone()
    val count = scaled.length
    var pos = 0
    while (pos < count) {
      scaled(pos) *= scalingVector(storedIndices(pos))
      pos += 1
    }
    scaled
  }
}
|
|
@ -0,0 +1,61 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.spark.mllib.feature
|
||||
|
||||
import org.scalatest.FunSuite
|
||||
|
||||
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors}
|
||||
import org.apache.spark.mllib.util.MLlibTestSparkContext
|
||||
import org.apache.spark.mllib.util.TestingUtils._
|
||||
|
||||
/** Checks the element-wise product transformer on dense and on sparse input vectors. */
class ElementwiseProductSuite extends FunSuite with MLlibTestSparkContext {

  test("elementwise (hadamard) product should properly apply vector to dense data set") {
    val weights = Vectors.dense(2.0, 0.5, 0.0, 0.25)
    val input = Array(Vectors.dense(1.0, 4.0, 1.9, -9.0))
    val expectedVec = Vectors.dense(2.0, 2.0, 0.0, -2.25)

    // Run the RDD variant of the transformer and look at the single resulting row.
    val scaledRows = new ElementwiseProduct(weights).transform(sc.makeRDD(input)).collect()
    val transformedVec = scaledRows(0)

    assert(transformedVec ~== expectedVec absTol 1E-5,
      s"Expected transformed vector $expectedVec but found $transformedVec")
  }

  test("elementwise (hadamard) product should properly apply vector to sparse data set") {
    val input = Array(Vectors.sparse(3, Seq((1, -1.0), (2, -3.0))))
    val inputRDD = sc.parallelize(input, 3)
    val transformer = new ElementwiseProduct(Vectors.dense(1.0, 0.0, 0.5))

    // Scale the same rows once locally (vector by vector) and once through the RDD API.
    val locallyScaled = input.map(transformer.transform)
    val rddScaled = transformer.transform(inputRDD)

    val kindPreserved = (input, locallyScaled, rddScaled.collect()).zipped.forall {
      case (_: DenseVector, _: DenseVector, _: DenseVector) => true
      case (_: SparseVector, _: SparseVector, _: SparseVector) => true
      case _ => false
    }
    assert(kindPreserved, "The vector type should be preserved after hadamard product")

    // Both code paths must agree with each other and with the hand-computed result.
    assert((locallyScaled, rddScaled.collect()).zipped.forall { (local, distributed) =>
      local ~== distributed absTol 1E-5
    })
    assert(locallyScaled(0) ~== Vectors.sparse(3, Seq((1, 0.0), (2, -1.5))) absTol 1E-5)
  }
}
|
Loading…
Reference in a new issue