mimir-pip/lib/src/org/mimirdb/pip/distribution/numerical/Discretized.scala

197 lines
5.6 KiB
Scala

package org.mimirdb.pip.distribution.numerical
import scala.util.Random
import java.io.ObjectOutputStream
import java.io.ObjectInputStream
import org.mimirdb.pip.SampleParams
import org.mimirdb.pip.udt.UnivariateDistribution
import org.apache.spark.sql.functions
import scala.collection.Searching._
import org.mimirdb.pip.udt.UnivariateDistributionConstructor
import org.apache.spark.sql.catalyst.expressions.Expression
object Discretized
extends NumericalDistributionFamily
with CDFSupported
{
/** Tolerance used by `check` when comparing the total bin probability against 1.0. */
val ACCURACY = 0.0001
/** One bin of the distribution: the interval from `low` to `high`, carrying probability mass `p`. */
case class Bin(low: Double, high: Double, p: Double)
/** Parameter representation of this family: the sequence of bins (`check` asserts that each bin starts where the previous one ends). */
type Params = Seq[Bin]
/**
 * Validate a sequence of bins and hand it back unchanged.
 *
 * Asserts that:
 *  - there is at least one bin,
 *  - the bin probabilities sum to 1.0 within [[ACCURACY]],
 *  - every bin (including the first one) satisfies `low < high`,
 *  - every bin starts exactly where the previous bin ends.
 *
 * @param bins the bins to validate
 * @return `bins` itself, so that the call can wrap an expression
 * @throws AssertionError if any of the conditions above is violated
 */
def check(bins: Params): Params =
{
  assert(!bins.isEmpty)
  val total = bins.map { _.p }.sum
  assert(
    Math.abs(total - 1.0) < ACCURACY,
    s"Unexpected bin boundaries: ${total} = ${bins.map { _.p }.mkString(" + ")}"
  )
  // Width check applies to every bin; the first bin must not be skipped.
  for(bin <- bins)
  {
    assert(bin.low < bin.high, s"Empty or inverted bin: [${bin.low}, ${bin.high}]")
  }
  // Adjacent bins must share a boundary exactly (no gaps, no overlaps).
  for((prev, next) <- bins.zip(bins.tail))
  {
    assert(
      next.low == prev.high,
      s"Non-contiguous bins: [${prev.low}, ${prev.high}] followed by [${next.low}, ${next.high}]"
    )
  }
  bins
}
/**
 * Draw one value from the discretized distribution.
 *
 * A single uniform number is drawn from `random`; the bins are walked in order,
 * subtracting each bin's mass until the remainder falls inside a bin (or the last
 * bin is reached). The result is placed inside that bin by linear interpolation.
 *
 * @param params the distribution parameters; must be a [[Params]]
 * @param random source of randomness (exactly one `nextDouble()` is consumed)
 * @return a value between the chosen bin's `low` and `high`
 * @throws NoSuchElementException if `params` contains no bins
 */
def sample(params: Any, random: scala.util.Random): Double =
{
  // A single forward pass over an iterator avoids repeated `tail`/`size` calls,
  // which are linear-time on array- and list-backed sequences.
  val remainingBins = params.asInstanceOf[Params].iterator
  var remaining = random.nextDouble()
  var bin = remainingBins.next()
  while(remaining > bin.p && remainingBins.hasNext){
    remaining -= bin.p
    bin = remainingBins.next()
  }
  // The masses may sum to slightly less than 1 (see `check`), so the leftover can
  // exceed the last bin's mass; clamp so the result never leaves the bin. A bin
  // with zero mass would otherwise divide by zero, so it maps to its lower bound.
  val fraction =
    if(bin.p > 0){ Math.min(remaining / bin.p, 1.0) }
    else { 0.0 }
  fraction * (bin.high - bin.low) + bin.low
}
/**
 * Cumulative probability of the distribution at `value`.
 *
 * Each bin contributes its full mass when `value` is at or beyond its upper
 * bound, a linearly interpolated share of its mass when `value` lies inside
 * the bin, and nothing otherwise.
 *
 * @param value  the point at which to evaluate the CDF
 * @param params the distribution parameters; must be a [[Params]]
 * @return the summed contribution of all bins
 */
def cdf(value: Double, params: Any): Double =
{
  // Mass a single bin places at or below `value`.
  def massBelow(bin: Bin): Double =
    bin match {
      case Bin(_, high, p) if value >= high => p
      case Bin(low, high, p) if value >= low => p * ((value - low)/(high - low))
      case _ => 0.0
    }
  params.asInstanceOf[Params].map(massBelow).sum
}
/**
 * Write the bins to `out`: first the number of bins as an Int, then for each
 * bin its `low`, `high` and `p` as Doubles, in that order.
 *
 * @param out    the stream to write to
 * @param params the distribution parameters; must be a [[Params]]
 */
def serialize(out: ObjectOutputStream, params: Any): Unit =
{
  val toWrite = params.asInstanceOf[Params]
  out.writeInt(toWrite.size)
  toWrite.foreach { bin =>
    // Field order here must mirror the read order in `deserialize`.
    Seq(bin.low, bin.high, bin.p).foreach { out.writeDouble(_) }
  }
}
/**
 * Read bins from `in` in the layout produced by `serialize`: an Int count
 * followed by `low`, `high`, `p` Doubles for each bin. The decoded bins are
 * passed through `check` before being returned.
 *
 * @param in the stream to read from
 * @return the validated bins
 */
def deserialize(in: ObjectInputStream): Params =
{
  val count = in.readInt()
  val decoded = Vector.fill(count) {
    // Arguments are evaluated left to right, so this reads low, high, p in order.
    Bin(in.readDouble(), in.readDouble(), in.readDouble())
  }
  check(decoded)
}
/**
 * Entropy of the bin probabilities, using the natural logarithm:
 * the sum of `-ln(p) * p` over all bins with `p > 0`. Bin widths are not
 * taken into account.
 *
 * @param params the distribution parameters; must be a [[Params]]
 */
def entropy(params: Any): Double =
{
  val masses = params.asInstanceOf[Params].map { _.p }
  masses.map { mass =>
    // Bins without mass contribute nothing (and would otherwise hit ln(0)).
    if(mass > 0){ -Math.log(mass) * mass }
    else { 0.0 }
  }.sum
}
/**
 * Kullback-Leibler divergence of `target` from `base`, computed bin by bin as
 * the sum of `t.p * ln(t.p / b.p)`.
 *
 * Bins where the target has no mass contribute nothing; a bin where the target
 * has mass but the base has none makes the result positive infinity.
 *
 * @param target parameters of the target distribution; must be a [[Params]]
 * @param base   parameters of the base distribution; must be a [[Params]]
 * @throws AssertionError if the two distributions do not share bin boundaries
 */
def klDivergence(target: Any, base: Any): Double =
{
  assert(sameBins(target, base))
  val targetBins = target.asInstanceOf[Params]
  val baseBins = base.asInstanceOf[Params]
  val perBin = targetBins.zip(baseBins).map {
    case (t:Bin, b:Bin) if t.p > 0 && b.p > 0 => t.p * Math.log(t.p / b.p)
    case (t:Bin, _:Bin) if t.p > 0 => Double.PositiveInfinity
    case (_:Bin, _:Bin) => 0.0
  }
  perBin.sum
}
/** Lower bound of the distribution: the `low` edge of the first bin. */
def min(params: Any): Double =
{
  val firstBin = params.asInstanceOf[Params].head
  firstBin.low
}
/** Upper bound of the distribution: the `high` edge of the last bin. */
def max(params: Any): Double =
{
  val lastBin = params.asInstanceOf[Params].last
  lastBin.high
}
/**
 * Human-readable rendering of the distribution, listing every bin as
 * `[low, high] -> p` inside `Discretized(...)`.
 *
 * @param params the distribution parameters; must be a [[Params]]
 */
def describe(params: Any): String =
{
  val renderedBins =
    params.asInstanceOf[Params].map { bin => s"[${bin.low}, ${bin.high}] -> ${bin.p}" }
  renderedBins.mkString("Discretized(", ", ", ")")
}
/**
 * Bin boundaries of the distribution: the `low` edge of the first bin followed
 * by the `high` edge of every bin, so the result has one more entry than there
 * are bins.
 *
 * @param params the distribution parameters; must be a [[Params]]
 */
def bins(params: Any): Array[Double] =
{
  val binSeq = params.asInstanceOf[Params]
  val boundaries = Array.newBuilder[Double]
  boundaries += binSeq.head.low
  binSeq.foreach { bin => boundaries += bin.high }
  boundaries.result()
}
/**
 * Whether two distributions are defined over identical bins: the same number
 * of bins, with pairwise equal `low` and `high` edges. Probabilities are not
 * compared.
 *
 * @param paramsA parameters of the first distribution; must be a [[Params]]
 * @param paramsB parameters of the second distribution; must be a [[Params]]
 */
def sameBins(paramsA: Any, paramsB: Any): Boolean =
{
  val left = paramsA.asInstanceOf[Params]
  val right = paramsB.asInstanceOf[Params]
  // `corresponds` is true only when both sequences have the same length and
  // the predicate holds for every aligned pair.
  left.corresponds(right) { (a, b) => a.low == b.low && a.high == b.high }
}
/**
 * Approximate `base` by a discretized distribution over the given boundaries.
 *
 * Consecutive entries of `bins` delimit the bins, so `bins.size - 1` bins are
 * produced. Probability mass outside `[bins.head, bins.last]` is discarded and
 * the remaining mass is renormalized.
 *
 * When the base family reports that its approximate CDF is fast, bin masses are
 * CDF differences between boundaries. Otherwise `samples` values are drawn from
 * the base distribution and bin masses are the fraction of in-range samples
 * landing in each bin.
 *
 * @param base    the distribution to discretize; its family must be a
 *                [[NumericalDistributionFamily]]
 * @param bins    bin boundaries; at least two entries, expected in ascending
 *                order (the sampling path binary-searches this array)
 * @param samples number of samples to draw on the sampling path
 * @return the validated bins
 * @throws AssertionError if fewer than two boundaries are given, if no sample
 *                        falls within the boundaries, or if `check` rejects
 *                        the resulting bins
 */
def apply(base: UnivariateDistribution, bins: Array[Double], samples: Int): Params =
{
  assert(bins.size >= 2)
  val baseFamily = base.family.asInstanceOf[NumericalDistributionFamily]
  val binCount = bins.size - 1
  val params:Params =
    if(baseFamily.approximateCDFIsFast(base.params)){
      // Evaluate the CDF exactly once per boundary; bin masses are differences
      // of neighbouring values, normalized by the mass covered by all bins.
      val boundaryCDFs = bins.map { boundary =>
        baseFamily.approximateCDF(boundary, base.params, 1000)
      }
      val adjustCDF = boundaryCDFs.last - boundaryCDFs.head
      (0 until binCount).map { i =>
        Bin(bins(i), bins(i+1), (boundaryCDFs(i+1) - boundaryCDFs(i)) / adjustCDF)
      }
    } else {
      val counts = Array.fill(binCount)(0)
      var missed = 0
      for(_ <- 0 until samples)
      {
        // The family samples from the distribution's parameters, matching the
        // `base.params` argument used by the CDF path above.
        val drawn = base.family.sample(base.params, scala.util.Random).asInstanceOf[Double]
        val binIdx = bins.search(drawn) match {
          // A sample exactly on the first boundary belongs to the first bin.
          case Found(0) => 0
          // A sample on any other boundary is assigned to the bin below it.
          case Found(idx) => idx - 1
          // Strictly between boundaries idx-1 and idx; idx == 0 is below the
          // range and idx == bins.size is above it.
          case InsertionPoint(idx) => idx - 1
        }
        if(binIdx < 0 || binIdx >= binCount){
          missed += 1
        } else {
          counts(binIdx) += 1
        }
      }
      val hits = samples - missed
      assert(
        hits > 0,
        s"None of the ${samples} samples fell within [${bins.head}, ${bins.last}]"
      )
      (0 until binCount).map { i =>
        Bin(bins(i), bins(i+1), counts(i).toDouble / hits)
      }
    }
  check(params)
}
/**
 * Expression-level constructor for [[Discretized]] distributions.
 *
 * `params` expects five evaluated arguments, in order: an encoded base
 * distribution, the first bin boundary, the upper limit of the boundaries,
 * the step between boundaries, and the number of samples.
 */
case class Constructor(args: Seq[Expression])
  extends UnivariateDistributionConstructor
{
  def family = Discretized

  def params(values: Seq[Any]) =
  {
    val baseDistribution = UnivariateDistribution.decode(values(0))
    val start = values(1).asInstanceOf[Double]
    val end = values(2).asInstanceOf[Double]
    val step = values(3).asInstanceOf[Double]
    // `until` excludes `end` itself from the generated boundaries.
    val boundaries = start.until(end, step).toArray
    val sampleCount = values(4).asInstanceOf[Int]
    Discretized(
      base = baseDistribution,
      bins = boundaries,
      samples = sampleCount
    )
  }

  def withNewChildrenInternal(newChildren: IndexedSeq[Expression]) =
    copy(args = newChildren)
}
}