28 lines
773 B
Scala
28 lines
773 B
Scala
package org.mimirdb.pip.udf
|
|
|
|
import org.mimirdb.pip.udt.UnivariateDistribution
|
|
import org.mimirdb.pip.distribution.numerical.Discretized
|
|
import org.mimirdb.pip.distribution.numerical.NumericalDistributionFamily
|
|
import org.apache.spark.sql.functions
|
|
|
|
/**
 * Entropy of a univariate distribution.
 *
 * A distribution whose family is [[Discretized]] is handed straight to
 * `Discretized.entropy`. Any other [[NumericalDistributionFamily]] is first
 * discretized over its `[min, max)` range using `BUCKETS` equal-width steps,
 * and the entropy of that discretized form is returned.
 */
object Entropy
{
  /** Number of equal-width steps used when discretizing a numerical distribution. */
  val BUCKETS = 1000

  /**
   * Compute the entropy of `dist`.
   *
   * @param dist the distribution to measure
   * @return the value produced by `Discretized.entropy`
   * @throws IllegalArgumentException if the distribution's family is neither
   *         `Discretized` nor a `NumericalDistributionFamily`, or if the family
   *         reports bounds that are not finite with `min < max`
   */
  def apply(dist: UnivariateDistribution): Double =
  {
    dist.family match {
      case Discretized =>
        // Already discretized: its parameters can be used as they are.
        Discretized.entropy(dist.params)

      case family: NumericalDistributionFamily =>
        val min = family.min(dist.params)
        val max = family.max(dist.params)
        val step = (max - min) / BUCKETS

        // A zero, negative, infinite or NaN step means the reported bounds are
        // degenerate, inverted or unbounded; fail early with a readable message
        // instead of an obscure error from deeper in the computation.
        require(
          step > 0.0 && step < Double.PositiveInfinity,
          s"Cannot discretize $family: bounds [$min, $max] must be finite with min < max"
        )

        // Build the boundaries by index rather than with the deprecated
        // `Double.until(end, step)`, whose element count could vary by one
        // because of floating-point rounding. This always yields exactly
        // BUCKETS values: min, min + step, ..., max - step.
        // NOTE(review): `max` itself is excluded, as in the original
        // `min.until(max, step)` -- confirm Discretized does not need it.
        val boundaries = Array.tabulate(BUCKETS) { i => min + i * step }

        // NOTE(review): the meaning of the literal 1000 is not visible here
        // (it may or may not be intended to track BUCKETS) -- confirm against
        // the signature of Discretized.apply.
        val params = Discretized(dist, boundaries, 1000)
        Discretized.entropy(params)

      case other =>
        // Previously this fell through to an uninformative scala.MatchError.
        throw new IllegalArgumentException(
          s"Entropy is only defined for numerical distribution families, not $other"
        )
    }
  }

  /** Spark user-defined function wrapping [[apply]]. */
  def udf = functions.udf(apply(_))
}