Adding support for constant and uniform distributions

main
Oliver Kennedy 2024-02-15 21:50:54 -05:00
parent b2799bbac3
commit bfd5224fb9
Signed by: okennedy
GPG Key ID: 3E5F9B3ABD3FDB60
3 changed files with 148 additions and 0 deletions

View File

@ -29,6 +29,8 @@ object Pip {
.createOrReplaceTempFunction(name, fn, "scala_udf")
registerFunction("gaussian", distribution.Gaussian.Constructor(_))
registerFunction("uniform", distribution.Uniform.Constructor(_))
registerFunction("num_const", distribution.ConstantNumber.Constructor(_))
registerFunction("clamp", distribution.Clamp.Constructor)
registerFunction("discretize", distribution.Discretized.Constructor)
spark.udf.register("entropy", udf.Entropy.udf)

View File

@ -0,0 +1,63 @@
package org.mimirdb.pip.distribution
import scala.util.Random
import java.io.Serializable
import java.io.ObjectOutputStream
import java.io.ObjectInputStream
import org.apache.commons.math3.special.Erf
import org.apache.spark.sql.Column
import org.apache.spark.sql.types.DoubleType
import org.mimirdb.pip.SampleParams
import org.mimirdb.pip.udt.UnivariateDistribution
import org.mimirdb.pip.udt.UnivariateDistributionType
import org.mimirdb.pip.udt.UnivariateDistributionConstructor
import org.apache.spark.sql.catalyst.expressions.Expression
/**
* The Uniform distribution
*
*/
object ConstantNumber
extends NumericalDistributionFamily
with CDFSupported
with ICDFSupported
{
def sample(params: Any, random: scala.util.Random): Double =
params.asInstanceOf[Double]
def serialize(in: ObjectOutputStream, params: Any): Unit =
{
in.writeDouble(params.asInstanceOf[Double])
}
def deserialize(in: ObjectInputStream): Double =
in.readDouble()
def min(params: Any) = params.asInstanceOf[Double]
def max(params: Any) = params.asInstanceOf[Double]
def cdf(value: Double, params: Any): Double =
{
val p = params.asInstanceOf[Double]
if(value < p) { 0.0 }
else { 1.0 }
}
def icdf(value: Double, params: Any): Double =
value.asInstanceOf[Double]
def describe(params: Any): String =
s"{ $params }"
case class Constructor(args: Seq[Expression])
extends UnivariateDistributionConstructor
{
def family = ConstantNumber
def params(values: Seq[Any]) =
values(0).asInstanceOf[Double]
def withNewChildrenInternal(newChildren: IndexedSeq[Expression]) =
copy(args = newChildren)
}
}

View File

@ -0,0 +1,83 @@
package org.mimirdb.pip.distribution
import scala.util.Random
import java.io.Serializable
import java.io.ObjectOutputStream
import java.io.ObjectInputStream
import org.apache.commons.math3.special.Erf
import org.apache.spark.sql.Column
import org.apache.spark.sql.types.DoubleType
import org.mimirdb.pip.SampleParams
import org.mimirdb.pip.udt.UnivariateDistribution
import org.mimirdb.pip.udt.UnivariateDistributionType
import org.mimirdb.pip.udt.UnivariateDistributionConstructor
import org.apache.spark.sql.catalyst.expressions.Expression
/**
* The Uniform distribution
*
*/
object Uniform
extends NumericalDistributionFamily
with CDFSupported
with ICDFSupported
{
case class Params(min: Double, max: Double)
{
assert(min <= max)
def width = max - min
}
def sample(params: Any, random: scala.util.Random): Double =
{
random.nextDouble() * params.asInstanceOf[Params].width
+ params.asInstanceOf[Params].min
}
def serialize(in: ObjectOutputStream, params: Any): Unit =
{
in.writeDouble(params.asInstanceOf[Params].min)
in.writeDouble(params.asInstanceOf[Params].max)
}
def deserialize(in: ObjectInputStream): Params =
{
return Params(
min = in.readDouble(),
max = in.readDouble()
)
}
def min(params: Any) = params.asInstanceOf[Params].min
def max(params: Any) = params.asInstanceOf[Params].max
def cdf(value: Double, params: Any): Double =
{
val p = params.asInstanceOf[Params]
if(value < p.min) { 0.0 }
else if(value >= p.max) { 1.0 }
else { (value - p.min) / p.width }
}
def icdf(value: Double, params: Any): Double =
{
val p = params.asInstanceOf[Params]
value * p.width + p.min
}
def describe(params: Any): String =
s"Uniform(min: ${params.asInstanceOf[Params].min}, max: ${params.asInstanceOf[Params].max})"
case class Constructor(args: Seq[Expression])
extends UnivariateDistributionConstructor
{
def family = Uniform
def params(values: Seq[Any]) =
Params(min = values(0).asInstanceOf[Double],
max = values(1).asInstanceOf[Double])
def withNewChildrenInternal(newChildren: IndexedSeq[Expression]) =
copy(args = newChildren)
}
}