general fixes to Distribution, plus some tests
This commit is contained in:
parent
379564c7e0
commit
04e828f7c1
|
@ -6,8 +6,11 @@ import java.io.PrintStream
|
||||||
* util for getting some stats from a small sample of numeric values, with some handy summary functions
|
* util for getting some stats from a small sample of numeric values, with some handy summary functions
|
||||||
*
|
*
|
||||||
* Entirely in memory, not intended as a good way to compute stats over large data sets.
|
* Entirely in memory, not intended as a good way to compute stats over large data sets.
|
||||||
|
*
|
||||||
|
* Assumes it is given a non-empty set of data; construction fails the `require(startIdx < endIdx)` check otherwise.
|
||||||
*/
|
*/
|
||||||
class Distribution(val data: Array[Double], val startIdx: Int, val endIdx: Int) {
|
class Distribution(val data: Array[Double], val startIdx: Int, val endIdx: Int) {
|
||||||
|
require(startIdx < endIdx)
|
||||||
def this(data: Traversable[Double]) = this(data.toArray, 0, data.size)
|
def this(data: Traversable[Double]) = this(data.toArray, 0, data.size)
|
||||||
java.util.Arrays.sort(data, startIdx, endIdx)
|
java.util.Arrays.sort(data, startIdx, endIdx)
|
||||||
val length = endIdx - startIdx
|
val length = endIdx - startIdx
|
||||||
|
@ -19,34 +22,43 @@ class Distribution(val data: Array[Double], val startIdx: Int, val endIdx: Int)
|
||||||
* given from 0 to 1
|
* given from 0 to 1
|
||||||
* @param probabilities
|
* @param probabilities
|
||||||
*/
|
*/
|
||||||
def getQuantiles(probabilities: Traversable[Double]) = {
|
def getQuantiles(probabilities: Traversable[Double] = defaultProbabilities) = {
|
||||||
probabilities.map{q =>data((q * length).toInt + startIdx)}
|
probabilities.toIndexedSeq.map{p:Double => data(closestIndex(p))}
|
||||||
}
|
}
|
||||||
|
|
||||||
def showQuantiles(out: PrintStream = System.out, probabilities: Traversable[Double] = defaultProbabilities) = {
|
private def closestIndex(p: Double) = {
|
||||||
out.println("min\t25%\t50%\t75%max")
|
math.min((p * length).toInt + startIdx, endIdx - 1)
|
||||||
probabilities.foreach{q => out.print(q + "\t")}
|
}
|
||||||
|
|
||||||
|
def showQuantiles(out: PrintStream = System.out) = {
|
||||||
|
out.println("min\t25%\t50%\t75%\tmax")
|
||||||
|
getQuantiles(defaultProbabilities).foreach{q => out.print(q + "\t")}
|
||||||
out.println
|
out.println
|
||||||
}
|
}
|
||||||
|
|
||||||
def summary : (StatCounter, Traversable[Double]) = {
|
def statCounter = StatCounter(data.slice(startIdx, endIdx))
|
||||||
(StatCounter(data), getQuantiles(defaultProbabilities))
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* print a summary of this distribution to the given PrintStream.
|
* print a summary of this distribution to the given PrintStream.
|
||||||
* @param out
|
* @param out
|
||||||
*/
|
*/
|
||||||
def summary(out: PrintStream = System.out) {
|
def summary(out: PrintStream = System.out) {
|
||||||
val (statCounter, quantiles) = summary
|
|
||||||
out.println(statCounter)
|
out.println(statCounter)
|
||||||
Distribution.showQuantiles(out, quantiles)
|
showQuantiles(out)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
object Distribution {
|
object Distribution {
|
||||||
|
|
||||||
|
def apply(data: Traversable[Double]): Option[Distribution] = {
|
||||||
|
if (data.size > 0)
|
||||||
|
Some(new Distribution(data))
|
||||||
|
else
|
||||||
|
None
|
||||||
|
}
|
||||||
|
|
||||||
def showQuantiles(out: PrintStream = System.out, quantiles: Traversable[Double]) {
|
def showQuantiles(out: PrintStream = System.out, quantiles: Traversable[Double]) {
|
||||||
out.println("min\t25%\t50%\t75%max")
|
out.println("min\t25%\t50%\t75%\tmax")
|
||||||
quantiles.foreach{q => out.print(q + "\t")}
|
quantiles.foreach{q => out.print(q + "\t")}
|
||||||
out.println
|
out.println
|
||||||
}
|
}
|
||||||
|
|
25
core/src/test/scala/spark/util/DistributionSuite.scala
Normal file
25
core/src/test/scala/spark/util/DistributionSuite.scala
Normal file
|
@ -0,0 +1,25 @@
|
||||||
|
package spark.util
|
||||||
|
|
||||||
|
import org.scalatest.FunSuite
|
||||||
|
import org.scalatest.matchers.ShouldMatchers
|
||||||
|
|
||||||
|
/**
|
||||||
|
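* Tests for Distribution: checks the StatCounter summary and the default quantiles over the values 1 to 100.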
|
||||||
|
*/
|
||||||
|
|
||||||
|
class DistributionSuite extends FunSuite with ShouldMatchers {
|
||||||
|
test("summary") {
|
||||||
|
val d = new Distribution((1 to 100).toArray.map{_.toDouble})
|
||||||
|
val stats = d.statCounter
|
||||||
|
stats.count should be (100)
|
||||||
|
stats.mean should be (50.5)
|
||||||
|
stats.sum should be (50 * 101)
|
||||||
|
|
||||||
|
val quantiles = d.getQuantiles()
|
||||||
|
quantiles(0) should be (1)
|
||||||
|
quantiles(1) should be (26)
|
||||||
|
quantiles(2) should be (51)
|
||||||
|
quantiles(3) should be (76)
|
||||||
|
quantiles(4) should be (100)
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in a new issue