197 lines
5.6 KiB
Scala
197 lines
5.6 KiB
Scala
package org.mimirdb.pip.distribution.numerical
|
|
|
|
import scala.util.Random
|
|
import java.io.ObjectOutputStream
|
|
import java.io.ObjectInputStream
|
|
import org.mimirdb.pip.SampleParams
|
|
import org.mimirdb.pip.udt.UnivariateDistribution
|
|
import org.apache.spark.sql.functions
|
|
import scala.collection.Searching._
|
|
import org.mimirdb.pip.udt.UnivariateDistributionConstructor
|
|
import org.apache.spark.sql.catalyst.expressions.Expression
|
|
|
|
/**
 * A piecewise-uniform ("histogram") distribution family.
 *
 * The parameters of a distribution in this family are a sequence of
 * contiguous [[Discretized.Bin]]s, each carrying the probability mass `p`
 * assigned to the interval from `low` to `high`.  Mass is spread uniformly
 * inside a bin: [[sample]] interpolates linearly within the selected bin and
 * [[cdf]] grows linearly across each bin.
 */
object Discretized
  extends NumericalDistributionFamily
  with CDFSupported
{
  /** Tolerance used by [[check]] when testing that bin probabilities sum to 1. */
  val ACCURACY = 0.0001

  /** One histogram bucket: the interval from `low` to `high` with probability mass `p`. */
  case class Bin(low: Double, high: Double, p: Double)

  /** Parameters of this family: bins in ascending order, each starting where the previous one ends. */
  type Params = Seq[Bin]

  /**
   * Validate a bin sequence and return it unchanged.
   *
   * Asserts that the sequence is non-empty, that the probabilities sum to 1
   * (within [[ACCURACY]]), that every bin -- including the first -- satisfies
   * `low < high`, and that each bin starts exactly where its predecessor ends.
   *
   * @param bins the bins to validate
   * @return `bins`, if every assertion holds
   */
  def check(bins: Params): Params =
  {
    assert(bins.nonEmpty, "A Discretized distribution needs at least one bin")

    val probabilities = bins.map { _.p }
    val total = probabilities.sum
    val rendered = probabilities.mkString(" + ")
    assert(
      Math.abs(total - 1.0) < ACCURACY,
      s"Bin probabilities do not sum to 1: $total = $rendered"
    )

    // Every bin must be a proper interval; this includes the head bin.
    for(bin <- bins)
    {
      assert(bin.low < bin.high, s"Empty or inverted bin: [${bin.low}, ${bin.high}]")
    }

    // Adjacent bins must share a boundary (no gaps, no overlaps).
    for((prev, next) <- bins.zip(bins.tail))
    {
      assert(
        prev.high == next.low,
        s"Bins are not contiguous: [${prev.low}, ${prev.high}] is followed by [${next.low}, ${next.high}]"
      )
    }

    bins
  }

  /**
   * Draw one value: pick a bin with probability proportional to its mass,
   * then pick a point uniformly inside that bin.
   *
   * @param params the distribution's bins (must be a [[Params]])
   * @param random the source of randomness
   * @return a value between the selected bin's `low` and `high`
   */
  def sample(params: Any, random: scala.util.Random): Double =
  {
    // Walk the bins with an iterator: a single pass regardless of the
    // concrete Seq implementation behind Params.
    val remainingBins = params.asInstanceOf[Params].iterator
    var bin = remainingBins.next()
    var x = random.nextDouble()

    while(x > bin.p && remainingBins.hasNext){
      x -= bin.p
      bin = remainingBins.next()
    }

    // x is now the leftover mass inside `bin`.  Guard the two degenerate
    // cases: a zero-mass bin (division by zero) and leftover mass exceeding
    // the last bin's p because the probabilities only sum to 1 within
    // ACCURACY (which would otherwise place the sample beyond `high`).
    val fraction =
      if(bin.p > 0){ Math.min(x / bin.p, 1.0) }
      else { 0.0 }

    fraction * (bin.high - bin.low) + bin.low
  }

  /**
   * Cumulative probability at `value`.
   *
   * Bins entirely at or below `value` contribute their full mass, the bin
   * containing `value` contributes the linearly interpolated share of its
   * mass, and bins above `value` contribute nothing.
   *
   * @param value  the point at which to evaluate the CDF
   * @param params the distribution's bins (must be a [[Params]])
   */
  def cdf(value: Double, params: Any): Double =
  {
    params.asInstanceOf[Params].map { bin =>
      if(value >= bin.high){ bin.p }
      else if(value >= bin.low){
        bin.p * ((value - bin.low) / (bin.high - bin.low))
      }
      else { 0.0 }
    }.sum
  }

  /**
   * Write the bins to `out`: the bin count as an Int, followed by
   * `low`, `high` and `p` of every bin as Doubles.
   */
  def serialize(out: ObjectOutputStream, params: Any): Unit =
  {
    val bins = params.asInstanceOf[Params]
    out.writeInt(bins.size)
    for(bin <- bins)
    {
      out.writeDouble(bin.low)
      out.writeDouble(bin.high)
      out.writeDouble(bin.p)
    }
  }

  /**
   * Read bins in the layout produced by [[serialize]] and validate them
   * with [[check]].
   */
  def deserialize(in: ObjectInputStream): Params =
  {
    val len = in.readInt()
    check {
      (0 until len).map { _ =>
        // Read order must mirror the write order in serialize.
        val low = in.readDouble()
        val high = in.readDouble()
        val p = in.readDouble()
        Bin(low = low, high = high, p = p)
      }.toSeq
    }
  }

  /**
   * Entropy of the bin probabilities, `-sum(p * ln p)`, using the natural
   * logarithm.  Zero-mass bins contribute 0.
   */
  def entropy(params: Any): Double =
    params.asInstanceOf[Params].map { bin =>
      if(bin.p > 0){ - Math.log(bin.p) * bin.p }
      else { 0.0 }
    }.sum

  /**
   * Kullback-Leibler divergence of `target` from `base`,
   * `sum(t.p * ln(t.p / b.p))` over corresponding bins.
   *
   * Both distributions must use identical bin boundaries (see [[sameBins]]).
   * A bin where `target` has mass but `base` has none makes the result
   * positive infinity; bins where `target` has no mass contribute 0.
   */
  def klDivergence(target: Any, base: Any): Double =
  {
    assert(sameBins(target, base), "klDivergence requires identical bin boundaries")
    target.asInstanceOf[Params]
      .zip(base.asInstanceOf[Params])
      .map { case (t: Bin, b: Bin) =>
        if(t.p <= 0){ 0.0 }
        else if(b.p > 0){ t.p * Math.log(t.p / b.p) }
        else { Double.PositiveInfinity }
      }.sum
  }

  /** Lower boundary of the first bin. */
  def min(params: Any): Double = params.asInstanceOf[Params].head.low

  /** Upper boundary of the last bin. */
  def max(params: Any): Double = params.asInstanceOf[Params].last.high

  /** Human-readable rendering listing every bin's boundaries and mass. */
  def describe(params: Any): String =
  {
    val rendered =
      params.asInstanceOf[Params]
        .map { b => s"[${b.low}, ${b.high}] -> ${b.p}" }
        .mkString(", ")
    s"Discretized($rendered)"
  }

  /**
   * The bin edges of a distribution: the first bin's `low` followed by
   * every bin's `high` (one more element than there are bins).
   */
  def bins(params: Any): Array[Double] =
  {
    val all = params.asInstanceOf[Params]
    (all.head.low +: all.map { _.high }).toArray
  }

  /** True iff both parameter sets have the same number of bins with identical boundaries. */
  def sameBins(paramsA: Any, paramsB: Any): Boolean =
  {
    val a = paramsA.asInstanceOf[Params]
    val b = paramsB.asInstanceOf[Params]
    (a.size == b.size) &&
      a.zip(b).forall { case (x, y) => x.low == y.low && x.high == y.high }
  }

  /**
   * Discretize `base` onto the given bin edges.
   *
   * When the base family reports a fast approximate CDF, each bin's mass is
   * the CDF difference across the bin, renormalized by the CDF span of the
   * whole edge range.  Otherwise `samples` values are drawn from `base` and
   * each bin's mass is the fraction of in-range samples that landed in it.
   *
   * @param base    the distribution to discretize
   * @param bins    ascending bin edges; `n` edges produce `n - 1` bins
   * @param samples number of draws used when the CDF is not fast
   * @return the validated bins (see [[check]])
   */
  def apply(base: UnivariateDistribution, bins: Array[Double], samples: Int): Params =
  {
    assert(bins.size >= 2, "At least two bin edges are required")
    val baseFamily = base.family.asInstanceOf[NumericalDistributionFamily]

    val params: Params =
      if(baseFamily.approximateCDFIsFast(base.params)){
        // Evaluate the CDF exactly once per edge so that the per-bin masses
        // and the normalizing span are derived from the same values.
        val cdfs = bins.map { edge => baseFamily.approximateCDF(edge, base.params, 1000) }
        val adjustCDF = cdfs.last - cdfs.head
        assert(
          adjustCDF > 0,
          s"Base distribution has no mass between ${bins.head} and ${bins.last}"
        )
        (1 until bins.size).map { i =>
          Bin(bins(i - 1), bins(i), (cdfs(i) - cdfs(i - 1)) / adjustCDF)
        }
      } else {
        val counts = Array.fill(bins.size - 1)(0)
        var missed = 0
        for(_ <- 0 until samples)
        {
          // The family-level sample contract takes the distribution's
          // parameters, matching the approximateCDF calls above.
          val sample = base.family.sample(base.params, scala.util.Random).asInstanceOf[Double]
          bins.search(sample) match {
            // Exactly on an edge: interior and top edges count toward the
            // bin below; the lowest edge belongs to the first bin.
            case Found(edge) =>
              counts(Math.max(edge - 1, 0)) += 1
            // Strictly between two edges.
            case InsertionPoint(edge) if edge > 0 && edge < bins.size =>
              counts(edge - 1) += 1
            // Below the first edge or above the last one.
            case InsertionPoint(_) =>
              missed += 1
          }
        }
        val hits = samples - missed
        assert(
          hits > 0,
          s"None of the $samples samples fell between ${bins.head} and ${bins.last}"
        )
        counts.indices.map { i =>
          Bin(bins(i), bins(i + 1), counts(i).toDouble / hits)
        }
      }

    check(params)
  }

  /**
   * Expression-level constructor for [[Discretized]] distributions.
   *
   * The evaluated arguments are read positionally: the encoded base
   * distribution, then the start, end and step of the bin-edge range (as
   * Doubles), then the sample count (as an Int).
   */
  case class Constructor(args: Seq[Expression])
    extends UnivariateDistributionConstructor
  {
    def family = Discretized

    def params(values: Seq[Any]) =
      Discretized(
        base = UnivariateDistribution.decode(values(0)),
        bins =
          // `until` is end-exclusive: values(2) itself is not emitted as an edge.
          values(1).asInstanceOf[Double].until(
            values(2).asInstanceOf[Double],
            values(3).asInstanceOf[Double]
          ).toArray,
        samples = values(4).asInstanceOf[Int]
      )

    def withNewChildrenInternal(newChildren: IndexedSeq[Expression]) =
      copy(args = newChildren)
  }
}