[SPARK-5890] [ML] Add feature discretizer

JIRA issue [here](https://issues.apache.org/jira/browse/SPARK-5890).

I borrowed the code of `findSplits` from `RandomForest`, since I don't think it would be good to call it on `RandomForest` directly.

Author: Xusen Yin <yinxusen@gmail.com>

Closes #5779 from yinxusen/SPARK-5890.
Xusen Yin 2015-10-02 10:19:18 -07:00 committed by Joseph K. Bradley
parent 2a717821bb
commit 23a9448c04
2 changed files with 274 additions and 0 deletions

@@ -0,0 +1,176 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.ml.feature

import scala.collection.mutable
import org.apache.spark.Logging
import org.apache.spark.annotation.Experimental
import org.apache.spark.ml._
import org.apache.spark.ml.attribute.NominalAttribute
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.param.{IntParam, _}
import org.apache.spark.ml.util._
import org.apache.spark.sql.types.{DoubleType, StructType}
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.util.random.XORShiftRandom
/**
* Params for [[QuantileDiscretizer]].
*/
private[feature] trait QuantileDiscretizerBase extends Params with HasInputCol with HasOutputCol {
/**
* Maximum number of buckets (quantiles, or categories) into which data points are grouped. Must
* be >= 2.
* default: 2
* @group param
*/
val numBuckets = new IntParam(this, "numBuckets", "Maximum number of buckets (quantiles, or " +
"categories) into which data points are grouped. Must be >= 2.",
ParamValidators.gtEq(2))
setDefault(numBuckets -> 2)
/** @group getParam */
def getNumBuckets: Int = getOrDefault(numBuckets)
}
/**
* :: Experimental ::
* `QuantileDiscretizer` takes a column with continuous features and outputs a column with binned
* categorical features. The bin ranges are chosen by taking a sample of the data and dividing it
* into roughly equal parts. The lower and upper bin bounds will be -Infinity and +Infinity,
* covering all real values. This attempts to find numBuckets partitions based on a sample of data,
* but it may find fewer depending on the data sample values.
*/
@Experimental
final class QuantileDiscretizer(override val uid: String)
extends Estimator[Bucketizer] with QuantileDiscretizerBase {
def this() = this(Identifiable.randomUID("quantileDiscretizer"))
/** @group setParam */
def setNumBuckets(value: Int): this.type = set(numBuckets, value)
/** @group setParam */
def setInputCol(value: String): this.type = set(inputCol, value)
/** @group setParam */
def setOutputCol(value: String): this.type = set(outputCol, value)
override def transformSchema(schema: StructType): StructType = {
SchemaUtils.checkColumnType(schema, $(inputCol), DoubleType)
val inputFields = schema.fields
require(inputFields.forall(_.name != $(outputCol)),
s"Output column ${$(outputCol)} already exists.")
val attr = NominalAttribute.defaultAttr.withName($(outputCol))
val outputFields = inputFields :+ attr.toStructField()
StructType(outputFields)
}
override def fit(dataset: DataFrame): Bucketizer = {
val samples = QuantileDiscretizer.getSampledInput(dataset.select($(inputCol)), $(numBuckets))
.map { case Row(feature: Double) => feature }
val candidates = QuantileDiscretizer.findSplitCandidates(samples, $(numBuckets) - 1)
val splits = QuantileDiscretizer.getSplits(candidates)
val bucketizer = new Bucketizer(uid).setSplits(splits)
copyValues(bucketizer)
}
override def copy(extra: ParamMap): QuantileDiscretizer = defaultCopy(extra)
}
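For reference, a minimal usage sketch of the estimator (the column names and example data here are illustrative, not part of the patch; it assumes an existing SparkContext `sc`):

```scala
import org.apache.spark.ml.feature.QuantileDiscretizer
import org.apache.spark.sql.SQLContext

val sqlContext = SQLContext.getOrCreate(sc)
import sqlContext.implicits._

val df = sc.parallelize(Seq((0, 18.0), (1, 19.0), (2, 8.0), (3, 5.0), (4, 2.2)))
  .toDF("id", "hour")

val discretizer = new QuantileDiscretizer()
  .setInputCol("hour")
  .setOutputCol("result")
  .setNumBuckets(3)

// fit() estimates the splits from a sample of df and returns a Bucketizer;
// transform() then maps each value to its bucket index.
val result = discretizer.fit(df).transform(df)
result.show()
```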
private[feature] object QuantileDiscretizer extends Logging {
/**
* Samples the given dataset to collect quantile statistics.
*/
def getSampledInput(dataset: DataFrame, numBins: Int): Array[Row] = {
val totalSamples = dataset.count()
require(totalSamples > 0,
"QuantileDiscretizer requires non-empty input dataset but was given an empty input.")
val requiredSamples = math.max(numBins * numBins, 10000)
val fraction = math.min(requiredSamples.toDouble / totalSamples, 1.0)
dataset.sample(withReplacement = false, fraction, new XORShiftRandom().nextInt()).collect()
}
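Note that the sampling fraction must be computed in floating point: `requiredSamples` is an `Int` and the row count a `Long`, so a plain division truncates to zero for any dataset larger than `requiredSamples`. A small illustration with made-up sizes:

```scala
// Hypothetical sizes, not from the patch: 1,000,000 rows, numBins = 10.
val totalSamples = 1000000L
val requiredSamples = math.max(10 * 10, 10000) // = 10000

// Int/Long division truncates: 10000 / 1000000 == 0, which would sample nothing.
assert(requiredSamples / totalSamples == 0L)

// Promoting to Double gives the intended 1% sampling fraction.
val fraction = math.min(requiredSamples.toDouble / totalSamples, 1.0) // = 0.01
```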
/**
* Computes split candidates with respect to the sample distribution.
*/
def findSplitCandidates(samples: Array[Double], numSplits: Int): Array[Double] = {
val valueCountMap = samples.foldLeft(Map.empty[Double, Int]) { (m, x) =>
m + ((x, m.getOrElse(x, 0) + 1))
}
val valueCounts = valueCountMap.toSeq.sortBy(_._1).toArray ++ Array((Double.MaxValue, 1))
val possibleSplits = valueCounts.length - 1
if (possibleSplits <= numSplits) {
valueCounts.dropRight(1).map(_._1)
} else {
val stride: Double = math.ceil(samples.length.toDouble / (numSplits + 1))
val splitsBuilder = mutable.ArrayBuilder.make[Double]
var index = 1
// currentCount: sum of counts of values that have been visited
var currentCount = valueCounts(0)._2
// targetCount: target value for `currentCount`. If `currentCount` is the closest it can get
// to `targetCount`, the current value is a split threshold. After a split threshold is found,
// `targetCount` is increased by `stride`.
var targetCount = stride
while (index < valueCounts.length) {
val previousCount = currentCount
currentCount += valueCounts(index)._2
val previousGap = math.abs(previousCount - targetCount)
val currentGap = math.abs(currentCount - targetCount)
// If adding the count of the current value to `currentCount` would only widen the gap to
// `targetCount`, the previous value is a split threshold.
if (previousGap < currentGap) {
splitsBuilder += valueCounts(index - 1)._1
targetCount += stride
}
index += 1
}
splitsBuilder.result()
}
}
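To make the stride-based selection concrete, here is a trace on the data used in the test suite below (a sketch; since `findSplitCandidates` is `private[feature]`, this only compiles from inside `org.apache.spark.ml.feature`):

```scala
// Nine samples: 1.0 and 2.0 once each, 3.0 seven times; ask for 2 splits
// (numBuckets = 3), so stride = ceil(9 / 3) = 3.
val samples = Array[Double](1, 2, 3, 3, 3, 3, 3, 3, 3)

// valueCounts = [(1.0, 1), (2.0, 1), (3.0, 7), (Double.MaxValue, 1)].
// Visiting 3.0:          previousGap = |2 - 3| = 1 < currentGap = |9 - 3| = 6,
//                        so the previous value 2.0 becomes a split; targetCount -> 6.
// Visiting the sentinel: previousGap = |9 - 6| = 3 < currentGap = |10 - 6| = 4,
//                        so 3.0 becomes a split as well.
val candidates = QuantileDiscretizer.findSplitCandidates(samples, numSplits = 2)
assert(candidates.sameElements(Array(2.0, 3.0)))
```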
/**
* Adjusts the split candidates into proper splits: adds negative/positive infinity on either
* side as needed, and falls back to a default split value of 0 if no good candidates are found.
*/
def getSplits(candidates: Array[Double]): Array[Double] = {
val effectiveValues = if (candidates.nonEmpty) {
if (candidates.head == Double.NegativeInfinity
&& candidates.last == Double.PositiveInfinity) {
candidates.drop(1).dropRight(1)
} else if (candidates.head == Double.NegativeInfinity) {
candidates.drop(1)
} else if (candidates.last == Double.PositiveInfinity) {
candidates.dropRight(1)
} else {
candidates
}
} else {
candidates
}
if (effectiveValues.isEmpty) {
Array(Double.NegativeInfinity, 0, Double.PositiveInfinity)
} else {
Array(Double.NegativeInfinity) ++ effectiveValues ++ Array(Double.PositiveInfinity)
}
}
}

@@ -0,0 +1,98 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.ml.feature

import org.apache.spark.ml.attribute.{Attribute, NominalAttribute}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.{SparkContext, SparkFunSuite}
class QuantileDiscretizerSuite extends SparkFunSuite with MLlibTestSparkContext {
import org.apache.spark.ml.feature.QuantileDiscretizerSuite._
test("Test quantile discretizer") {
checkDiscretizedData(sc,
Array[Double](1, 2, 3, 3, 3, 3, 3, 3, 3),
10,
Array[Double](1, 2, 3, 3, 3, 3, 3, 3, 3),
Array("-Infinity, 1.0", "1.0, 2.0", "2.0, 3.0", "3.0, Infinity"))
checkDiscretizedData(sc,
Array[Double](1, 2, 3, 3, 3, 3, 3, 3, 3),
4,
Array[Double](1, 2, 3, 3, 3, 3, 3, 3, 3),
Array("-Infinity, 1.0", "1.0, 2.0", "2.0, 3.0", "3.0, Infinity"))
checkDiscretizedData(sc,
Array[Double](1, 2, 3, 3, 3, 3, 3, 3, 3),
3,
Array[Double](0, 1, 2, 2, 2, 2, 2, 2, 2),
Array("-Infinity, 2.0", "2.0, 3.0", "3.0, Infinity"))
checkDiscretizedData(sc,
Array[Double](1, 2, 3, 3, 3, 3, 3, 3, 3),
2,
Array[Double](0, 1, 1, 1, 1, 1, 1, 1, 1),
Array("-Infinity, 2.0", "2.0, Infinity"))
}
test("Test getting splits") {
val splitTestPoints = Array(
Array[Double]() -> Array(Double.NegativeInfinity, 0, Double.PositiveInfinity),
Array(Double.NegativeInfinity) -> Array(Double.NegativeInfinity, 0, Double.PositiveInfinity),
Array(Double.PositiveInfinity) -> Array(Double.NegativeInfinity, 0, Double.PositiveInfinity),
Array(Double.NegativeInfinity, Double.PositiveInfinity)
-> Array(Double.NegativeInfinity, 0, Double.PositiveInfinity),
Array(0.0) -> Array(Double.NegativeInfinity, 0, Double.PositiveInfinity),
Array(1.0) -> Array(Double.NegativeInfinity, 1, Double.PositiveInfinity),
Array(0.0, 1.0) -> Array(Double.NegativeInfinity, 0, 1, Double.PositiveInfinity)
)
for ((ori, res) <- splitTestPoints) {
assert(QuantileDiscretizer.getSplits(ori) === res, "Returned splits are invalid.")
}
}
}
private object QuantileDiscretizerSuite extends SparkFunSuite {
def checkDiscretizedData(
sc: SparkContext,
data: Array[Double],
numBucket: Int,
expectedResult: Array[Double],
expectedAttrs: Array[String]): Unit = {
val sqlCtx = SQLContext.getOrCreate(sc)
import sqlCtx.implicits._
val df = sc.parallelize(data.map(Tuple1.apply)).toDF("input")
val discretizer = new QuantileDiscretizer().setInputCol("input").setOutputCol("result")
.setNumBuckets(numBucket)
val result = discretizer.fit(df).transform(df)
val transformedFeatures = result.select("result").collect()
.map { case Row(transformedFeature: Double) => transformedFeature }
val transformedAttrs = Attribute.fromStructField(result.schema("result"))
.asInstanceOf[NominalAttribute].values.get
assert(transformedFeatures === expectedResult,
"Transformed features do not equal expected features.")
assert(transformedAttrs === expectedAttrs,
"Transformed attributes do not equal expected attributes.")
}
}