general fixes to Distribution, plus some tests

This commit is contained in:
Imran Rashid 2013-02-08 19:07:36 -08:00
parent 379564c7e0
commit 04e828f7c1
2 changed files with 48 additions and 11 deletions

View file

@ -6,8 +6,11 @@ import java.io.PrintStream
* util for getting some stats from a small sample of numeric values, with some handy summary functions * util for getting some stats from a small sample of numeric values, with some handy summary functions
* *
* Entirely in memory, not intended as a good way to compute stats over large data sets. * Entirely in memory, not intended as a good way to compute stats over large data sets.
*
* assumes you are giving it a non-empty set of data
*/ */
class Distribution(val data: Array[Double], val startIdx: Int, val endIdx: Int) { class Distribution(val data: Array[Double], val startIdx: Int, val endIdx: Int) {
require(startIdx < endIdx)
def this(data: Traversable[Double]) = this(data.toArray, 0, data.size) def this(data: Traversable[Double]) = this(data.toArray, 0, data.size)
java.util.Arrays.sort(data, startIdx, endIdx) java.util.Arrays.sort(data, startIdx, endIdx)
val length = endIdx - startIdx val length = endIdx - startIdx
@ -19,34 +22,43 @@ class Distribution(val data: Array[Double], val startIdx: Int, val endIdx: Int)
* given from 0 to 1 * given from 0 to 1
* @param probabilities * @param probabilities
*/ */
def getQuantiles(probabilities: Traversable[Double]) = { def getQuantiles(probabilities: Traversable[Double] = defaultProbabilities) = {
probabilities.map{q =>data((q * length).toInt + startIdx)} probabilities.toIndexedSeq.map{p:Double => data(closestIndex(p))}
} }
def showQuantiles(out: PrintStream = System.out, probabilities: Traversable[Double] = defaultProbabilities) = { private def closestIndex(p: Double) = {
out.println("min\t25%\t50%\t75%max") math.min((p * length).toInt + startIdx, endIdx - 1)
probabilities.foreach{q => out.print(q + "\t")} }
def showQuantiles(out: PrintStream = System.out) = {
out.println("min\t25%\t50%\t75%\tmax")
getQuantiles(defaultProbabilities).foreach{q => out.print(q + "\t")}
out.println out.println
} }
def summary : (StatCounter, Traversable[Double]) = { def statCounter = StatCounter(data.slice(startIdx, endIdx))
(StatCounter(data), getQuantiles(defaultProbabilities))
}
/** /**
* print a summary of this distribution to the given PrintStream. * print a summary of this distribution to the given PrintStream.
* @param out * @param out
*/ */
def summary(out: PrintStream = System.out) { def summary(out: PrintStream = System.out) {
val (statCounter, quantiles) = summary
out.println(statCounter) out.println(statCounter)
Distribution.showQuantiles(out, quantiles) showQuantiles(out)
} }
} }
object Distribution { object Distribution {
def apply(data: Traversable[Double]): Option[Distribution] = {
if (data.size > 0)
Some(new Distribution(data))
else
None
}
def showQuantiles(out: PrintStream = System.out, quantiles: Traversable[Double]) { def showQuantiles(out: PrintStream = System.out, quantiles: Traversable[Double]) {
out.println("min\t25%\t50%\t75%max") out.println("min\t25%\t50%\t75%\tmax")
quantiles.foreach{q => out.print(q + "\t")} quantiles.foreach{q => out.print(q + "\t")}
out.println out.println
} }

View file

@ -0,0 +1,25 @@
package spark.util
import org.scalatest.FunSuite
import org.scalatest.matchers.ShouldMatchers
/**
 * Tests for Distribution: builds one over the values 1.0 to 100.0 and checks
 * the figures reported by statCounter and by getQuantiles() called with no
 * arguments.
 */
class DistributionSuite extends FunSuite with ShouldMatchers {

  test("summary") {
    val samples = (1 to 100).toArray.map { i => i.toDouble }
    val dist = new Distribution(samples)

    // Count, mean and sum over the 100 samples.
    val counter = dist.statCounter
    counter.count should be (100)
    counter.mean should be (50.5)
    counter.sum should be (50 * 101)

    // Entries 0 through 4 of the no-argument quantiles, checked in order.
    val observed = dist.getQuantiles()
    val expected = Seq(1, 26, 51, 76, 100)
    expected.zipWithIndex.foreach { case (want, idx) =>
      observed(idx) should be (want)
    }
  }
}