mimir-pip/lib/src/org/mimirdb/pip/udf/Entropy.scala

28 lines
773 B
Scala

package org.mimirdb.pip.udf
import org.mimirdb.pip.udt.UnivariateDistribution
import org.mimirdb.pip.distribution.numerical.Discretized
import org.mimirdb.pip.distribution.numerical.NumericalDistributionFamily
import org.apache.spark.sql.functions
/**
 * Computes the entropy of a univariate distribution.
 *
 * A distribution whose family is `Discretized` is passed straight to
 * `Discretized.entropy`.  A distribution from any other
 * `NumericalDistributionFamily` is first discretized over the range
 * reported by its family's `min` and `max`, and the entropy of that
 * discretized form is returned.
 */
object Entropy
{
  /** Number of evenly spaced bin boundaries generated when discretizing. */
  val BUCKETS = 1000

  /**
   * Entropy of `dist`.
   *
   * @param dist the distribution to measure
   * @return the value produced by `Discretized.entropy`
   * @throws IllegalArgumentException if the family's `[min, max)` range
   *         cannot be split into `BUCKETS` steps (zero width, NaN, or
   *         unbounded)
   * @throws scala.MatchError if `dist.family` is neither `Discretized` nor a
   *         `NumericalDistributionFamily`
   */
  def apply(dist: UnivariateDistribution): Double =
  {
    dist.family match {
      case Discretized =>
        // Already discretized: its parameters can be measured as they are.
        Discretized.entropy(dist.params)

      case family: NumericalDistributionFamily =>
        val min = family.min(dist.params)
        val max = family.max(dist.params)
        val step = (max - min) / BUCKETS

        // Fail up front with a readable message rather than feeding
        // degenerate boundaries into the discretizer.
        require(
          step != 0 && !step.isNaN && !step.isInfinite,
          s"Cannot discretize the range [$min, $max) into $BUCKETS buckets"
        )

        // Build the boundaries by index so there are always exactly BUCKETS
        // of them.  A fractional range (`min.until(max, step)`) derives its
        // length from floating-point division and may yield one extra
        // element depending on how `step` was rounded.
        val bins = Array.tabulate(BUCKETS) { i => min + step * i }

        // NOTE(review): the trailing 1000 is kept as a literal because the
        // meaning of this argument is not visible here; it may or may not be
        // intended to track BUCKETS -- confirm against Discretized.apply.
        val params = Discretized(dist, bins, 1000)
        Discretized.entropy(params)
    }
  }

  /** Spark SQL user-defined function wrapping [[apply]]. */
  def udf = functions.udf(apply(_))
}