Adding support for constant and uniform distributions
parent
b2799bbac3
commit
bfd5224fb9
|
@ -29,6 +29,8 @@ object Pip {
|
|||
.createOrReplaceTempFunction(name, fn, "scala_udf")
|
||||
|
||||
registerFunction("gaussian", distribution.Gaussian.Constructor(_))
|
||||
registerFunction("uniform", distribution.Uniform.Constructor(_))
|
||||
registerFunction("num_const", distribution.ConstantNumber.Constructor(_))
|
||||
registerFunction("clamp", distribution.Clamp.Constructor)
|
||||
registerFunction("discretize", distribution.Discretized.Constructor)
|
||||
spark.udf.register("entropy", udf.Entropy.udf)
|
||||
|
|
|
@ -0,0 +1,63 @@
|
|||
package org.mimirdb.pip.distribution
|
||||
|
||||
import scala.util.Random
|
||||
import java.io.Serializable
|
||||
import java.io.ObjectOutputStream
|
||||
import java.io.ObjectInputStream
|
||||
import org.apache.commons.math3.special.Erf
|
||||
import org.apache.spark.sql.Column
|
||||
import org.apache.spark.sql.types.DoubleType
|
||||
import org.mimirdb.pip.SampleParams
|
||||
import org.mimirdb.pip.udt.UnivariateDistribution
|
||||
import org.mimirdb.pip.udt.UnivariateDistributionType
|
||||
import org.mimirdb.pip.udt.UnivariateDistributionConstructor
|
||||
import org.apache.spark.sql.catalyst.expressions.Expression
|
||||
|
||||
/**
 * A degenerate (point-mass) distribution: every sample is the same constant.
 *
 * `params` is the constant itself, stored as a raw `Double`.
 */
object ConstantNumber
  extends NumericalDistributionFamily
  with CDFSupported
  with ICDFSupported
{
  /** Sampling a point mass ignores the RNG and returns the constant. */
  def sample(params: Any, random: scala.util.Random): Double =
    params.asInstanceOf[Double]

  /** Write the constant to the stream.
   *  NOTE(review): the parameter is an *output* stream despite being named
   *  `in`; the name is kept for interface compatibility with callers. */
  def serialize(in: ObjectOutputStream, params: Any): Unit =
  {
    in.writeDouble(params.asInstanceOf[Double])
  }

  /** Read the constant back from the stream. */
  def deserialize(in: ObjectInputStream): Double =
    in.readDouble()

  // A point mass has zero width: min == max == the constant.
  def min(params: Any) = params.asInstanceOf[Double]
  def max(params: Any) = params.asInstanceOf[Double]

  /** CDF of a point mass at `p`: 0 strictly below `p`, 1 at or above it. */
  def cdf(value: Double, params: Any): Double =
  {
    val p = params.asInstanceOf[Double]
    if(value < p) { 0.0 }
    else { 1.0 }
  }

  /** Inverse CDF: every quantile of a point mass is the constant itself.
   *  Bug fix: previously returned `value` (the probability argument),
   *  which made `icdf` the identity function instead of the quantile. */
  def icdf(value: Double, params: Any): Double =
    params.asInstanceOf[Double]

  /** Human-readable summary of the distribution's parameter. */
  def describe(params: Any): String =
    s"{ $params }"

  /**
   * Catalyst expression that builds a ConstantNumber from one argument:
   * the constant value (a Double).
   */
  case class Constructor(args: Seq[Expression])
    extends UnivariateDistributionConstructor
  {
    def family = ConstantNumber

    /** The distribution's params object is just the first argument. */
    def params(values: Seq[Any]) =
      values(0).asInstanceOf[Double]

    def withNewChildrenInternal(newChildren: IndexedSeq[Expression]) =
      copy(args = newChildren)
  }

}
|
|
@ -0,0 +1,83 @@
|
|||
package org.mimirdb.pip.distribution
|
||||
|
||||
import scala.util.Random
|
||||
import java.io.Serializable
|
||||
import java.io.ObjectOutputStream
|
||||
import java.io.ObjectInputStream
|
||||
import org.apache.commons.math3.special.Erf
|
||||
import org.apache.spark.sql.Column
|
||||
import org.apache.spark.sql.types.DoubleType
|
||||
import org.mimirdb.pip.SampleParams
|
||||
import org.mimirdb.pip.udt.UnivariateDistribution
|
||||
import org.mimirdb.pip.udt.UnivariateDistributionType
|
||||
import org.mimirdb.pip.udt.UnivariateDistributionConstructor
|
||||
import org.apache.spark.sql.catalyst.expressions.Expression
|
||||
|
||||
/**
 * The continuous Uniform distribution over [min, max).
 */
object Uniform
  extends NumericalDistributionFamily
  with CDFSupported
  with ICDFSupported
{
  /** Parameters of the distribution; invariant: min <= max. */
  case class Params(min: Double, max: Double)
  {
    assert(min <= max)
    def width = max - min
  }

  /** Draw a value uniformly at random from [min, max).
   *  Bug fix: the original split the expression across two lines, so Scala
   *  parsed `+ params...min` as a separate unary-plus statement — the
   *  product on the previous line was discarded and the method always
   *  returned `min`. The expression is now a single statement. */
  def sample(params: Any, random: scala.util.Random): Double =
  {
    val p = params.asInstanceOf[Params]
    random.nextDouble() * p.width + p.min
  }

  /** Write min then max to the stream.
   *  NOTE(review): the parameter is an *output* stream despite being named
   *  `in`; the name is kept for interface compatibility with callers. */
  def serialize(in: ObjectOutputStream, params: Any): Unit =
  {
    in.writeDouble(params.asInstanceOf[Params].min)
    in.writeDouble(params.asInstanceOf[Params].max)
  }

  /** Read back the parameters in the order serialize wrote them (min, max). */
  def deserialize(in: ObjectInputStream): Params =
  {
    Params(
      min = in.readDouble(),
      max = in.readDouble()
    )
  }

  def min(params: Any) = params.asInstanceOf[Params].min
  def max(params: Any) = params.asInstanceOf[Params].max

  /** CDF: 0 below min, 1 at or above max, linear in between. */
  def cdf(value: Double, params: Any): Double =
  {
    val p = params.asInstanceOf[Params]
    if(value < p.min) { 0.0 }
    else if(value >= p.max) { 1.0 }
    else { (value - p.min) / p.width }
  }

  /** Inverse CDF for `value` in [0, 1]: linear interpolation over [min, max]. */
  def icdf(value: Double, params: Any): Double =
  {
    val p = params.asInstanceOf[Params]
    value * p.width + p.min
  }

  /** Human-readable summary of the distribution's parameters. */
  def describe(params: Any): String =
    s"Uniform(min: ${params.asInstanceOf[Params].min}, max: ${params.asInstanceOf[Params].max})"

  /**
   * Catalyst expression that builds a Uniform from two arguments:
   * the lower bound (min) and the upper bound (max), both Doubles.
   */
  case class Constructor(args: Seq[Expression])
    extends UnivariateDistributionConstructor
  {
    def family = Uniform

    /** Arguments arrive positionally: values(0) = min, values(1) = max. */
    def params(values: Seq[Any]) =
      Params(min = values(0).asInstanceOf[Double],
             max = values(1).asInstanceOf[Double])

    def withNewChildrenInternal(newChildren: IndexedSeq[Expression]) =
      copy(args = newChildren)
  }

}
|
Loading…
Reference in New Issue