WIP: Adding support for boolean ops
parent
5479e9578c
commit
443e34651d
|
@ -28,16 +28,16 @@ object Pip {
|
|||
.functionRegistry
|
||||
.createOrReplaceTempFunction(name, fn, "scala_udf")
|
||||
|
||||
registerFunction("gaussian", distribution.Gaussian.Constructor(_))
|
||||
registerFunction("uniform", distribution.Uniform.Constructor(_))
|
||||
registerFunction("num_const", distribution.ConstantNumber.Constructor(_))
|
||||
registerFunction("clamp", distribution.Clamp.Constructor)
|
||||
registerFunction("discretize", distribution.Discretized.Constructor)
|
||||
registerFunction("gaussian", distribution.numerical.Gaussian.Constructor(_))
|
||||
registerFunction("uniform", distribution.numerical.Uniform.Constructor(_))
|
||||
registerFunction("num_const", distribution.numerical.ConstantNumber.Constructor(_))
|
||||
registerFunction("clamp", distribution.numerical.Clamp.Constructor)
|
||||
registerFunction("discretize", distribution.numerical.Discretized.Constructor)
|
||||
spark.udf.register("entropy", udf.Entropy.udf)
|
||||
spark.udf.register("kl_divergence", udf.KLDivergence.udf)
|
||||
|
||||
// Aggregates
|
||||
spark.udf.register("uniform_mixture", distribution.NumericalMixture.uniform)
|
||||
spark.udf.register("uniform_mixture", distribution.numerical.NumericalMixture.uniform)
|
||||
spark.udf.register("histogram", udaf(udf.Histogram))
|
||||
}
|
||||
|
||||
|
|
|
@ -0,0 +1,35 @@
|
|||
package org.mimirdb.pip.distribution.boolean
|
||||
|
||||
import org.apache.spark.sql.catalyst.expressions.Expression
|
||||
import org.mimirdb.pip.udt.UnivariateDistributionConstructor
|
||||
|
||||
/**
 * A coin-flip distribution over booleans: samples `true` with probability p.
 *
 * The family's parameter object is the success probability itself, stored
 * as a raw `Double`.
 */
object Bernoulli
  extends BooleanDistributionFamily
  with ProbabilitySupported
{
  // The parameter blob for this family is just the success probability p.
  private def successProbability(params: Any): Double =
    params.asInstanceOf[Double]

  /** Exact probability of sampling `true` (this is just p itself). */
  def probability(params: Any): Double =
    successProbability(params)

  /** Human-readable rendering, e.g. `Bernoulli(0.5)`. */
  def describe(params: Any): String =
    "Bernoulli(" + params + ")"

  /** Draw one boolean: `true` iff a uniform [0,1) draw falls below p. */
  def sample(params: Any, random: scala.util.Random): Boolean =
  {
    val threshold = successProbability(params)
    random.nextDouble() < threshold
  }

  /** Inverse of [[serialize]]: the blob is a single double. */
  def deserialize(in: java.io.ObjectInputStream): Any =
    in.readDouble()

  /** Write the success probability as a single double. */
  def serialize(out: java.io.ObjectOutputStream, params: Any): Unit =
    out.writeDouble(successProbability(params))

  /**
   * Spark expression constructor for the family; the first argument
   * evaluates to the success probability p.
   */
  case class Constructor(args: Seq[Expression])
    extends UnivariateDistributionConstructor
  {
    def family = Bernoulli
    def params(values: Seq[Any]) = values.head.asInstanceOf[Double]

    def withNewChildrenInternal(newChildren: IndexedSeq[Expression]) =
      copy(args = newChildren)
  }
}
|
|
@ -0,0 +1,70 @@
|
|||
package org.mimirdb.pip.distribution.boolean
|
||||
|
||||
import org.mimirdb.pip.distribution.DistributionFamily
|
||||
import org.mimirdb.pip.distribution.numerical.NumericalDistributionFamily
|
||||
import org.mimirdb.pip.distribution.numerical.CDFSupported
|
||||
|
||||
/**
 * Boolean distribution: `true` iff a sample drawn from a base numerical
 * distribution falls strictly between `lower` and `upper`.
 */
object Between
  extends BooleanDistributionFamily
{
  /**
   * Parameters for [[Between]]: the interval bounds, plus the base
   * distribution (identified by its registered family label) and the base
   * family's own parameter blob.
   */
  case class Params(lower: Double, upper: Double, baseDist: String, baseParams: Any)
  {
    /** Resolve the base family from the global registry. */
    def dist =
      DistributionFamily(baseDist).asInstanceOf[NumericalDistributionFamily]

    /** Apply `op` to the base family together with its parameter blob. */
    def apply[A](op: (NumericalDistributionFamily, Any) => A): A =
    {
      op(dist, baseParams)
    }
  }

  /**
   * P(lower < X < upper).  Computed exactly as CDF(upper) - CDF(lower) when
   * the base family supports CDFs; otherwise falls back to the
   * sampling-based approximation inherited from [[BooleanDistributionFamily]].
   */
  override def approximateProbability(params: Any, samples: Int): Double =
  {
    val config = params.asInstanceOf[Params]

    config.dist match {
      case dist: CDFSupported =>
        // BUGFIX: this subtraction must stay in one expression.  Previously the
        // trailing `- dist.cdf(lower, ...)` sat on its own line; Scala's
        // semicolon inference parsed it as a separate unary-minus statement,
        // making the arm silently return -CDF(lower) instead of the difference.
        dist.cdf(config.upper, config.baseParams) - dist.cdf(config.lower, config.baseParams)
      case _ => // binding was unused; match any non-CDF-supporting family
        super.approximateProbability(params, samples)
    }
  }

  /** Fast exactly when the base family exposes an exact CDF. */
  override def approximateProbabilityIsFast(params: Any): Boolean =
    params.asInstanceOf[Params].dist.isInstanceOf[CDFSupported]

  /** Human-readable rendering, delegating the interior to the base family. */
  def describe(params: Any): String =
  {
    val config = params.asInstanceOf[Params]
    s"Between(${config.lower} < ${config { _.describe(_) }} < ${config.upper})"
  }

  /** Sample the base distribution and test strict containment in (lower, upper). */
  def sample(params: Any, random: scala.util.Random): Boolean =
  {
    val config = params.asInstanceOf[Params]
    val v: Double = config { _.sample(_, random).asInstanceOf[Double] }

    // `return` keyword dropped: the last expression is the result.
    (v > config.lower) && (v < config.upper)
  }

  /** Inverse of [[serialize]]: bounds, family label, then the base blob. */
  def deserialize(in: java.io.ObjectInputStream): Any =
  {
    val lower = in.readDouble()
    val upper = in.readDouble()
    val baseDist = in.readUTF()
    val dist = DistributionFamily(baseDist)
    Params(
      lower = lower,
      upper = upper,
      baseDist = baseDist,
      baseParams = dist.deserialize(in)
    )
  }

  /** Write bounds and family label, then delegate the base blob to its family. */
  def serialize(out: java.io.ObjectOutputStream, params: Any): Unit =
  {
    val config = params.asInstanceOf[Params]
    out.writeDouble(config.lower)
    out.writeDouble(config.upper)
    out.writeUTF(config.baseDist)
    config { _.serialize(out, _) }
  }
}
|
|
@ -0,0 +1,38 @@
|
|||
package org.mimirdb.pip.distribution.boolean
|
||||
|
||||
import org.apache.spark.sql.types.{ DataType, BooleanType }
|
||||
import org.mimirdb.pip.distribution.DistributionFamily
|
||||
|
||||
/**
 * A [Distribution] that specifically samples boolean values
 * (the original comment said "numbers" — copy-paste from the numerical trait)
 */
trait BooleanDistributionFamily extends DistributionFamily
{
  val baseType = BooleanType

  /**
   * Estimate P(true) for this distribution.
   *
   * Uses the exact probability when the concrete family mixes in
   * [[ProbabilitySupported]]; otherwise falls back to a Monte-Carlo
   * estimate: the fraction of `samples` draws that come up true.
   */
  def approximateProbability(params: Any, samples: Int): Double =
    this match {
      case c:ProbabilitySupported => c.probability(params)
      case _ =>
      {
        // Monte-Carlo fallback; unseeded Random, so repeated calls vary.
        val rand = new scala.util.Random()
        (0 until samples).count { _ =>
          sample(params, rand).asInstanceOf[Boolean]
        }.toDouble / samples
      }
    }

  // Fast iff the exact-probability mixin is present (no sampling needed).
  def approximateProbabilityIsFast(params: Any): Boolean = this.isInstanceOf[ProbabilitySupported]
}
|
||||
|
||||
/**
 * An add-on to BooleanDistributionFamily that indicates an exact probability
 * of `true` can be computed (the original comment referenced
 * NumericalDistributionFamily/CDF — copy-paste from the numerical trait)
 */
trait ProbabilitySupported
{
  val baseType: DataType

  // Guard against mixing this into a non-boolean family.
  assert(baseType == BooleanType, "Non-boolean distributions can not support probabilities")

  /** Exact P(true) for the given parameter blob. */
  def probability(params: Any): Double
}
|
|
@ -57,57 +57,6 @@ trait DistributionFamily
|
|||
def label = this.getClass.getSimpleName.toLowerCase
|
||||
}
|
||||
|
||||
/**
|
||||
* A [Distribution] that specifically samples numbers
|
||||
*/
|
||||
trait NumericalDistributionFamily extends DistributionFamily
|
||||
{
|
||||
val baseType = DoubleType
|
||||
|
||||
/**
|
||||
* Compute the CDF
|
||||
*/
|
||||
def approximateCDF(value: Double, params: Any, samples: Int): Double =
|
||||
this match {
|
||||
case c:CDFSupported => c.cdf(value, params)
|
||||
case _ =>
|
||||
{
|
||||
val rand = new scala.util.Random()
|
||||
(0 until samples).count { _ =>
|
||||
sample(params, rand).asInstanceOf[Double] <= value
|
||||
}.toDouble / samples
|
||||
}
|
||||
}
|
||||
def approximateCDFIsFast(params: Any): Boolean = this.isInstanceOf[CDFSupported]
|
||||
|
||||
def min(params: Any): Double
|
||||
def max(params: Any): Double
|
||||
}
|
||||
|
||||
/**
|
||||
* An add-on to NumericalDistributionFamily that indicates an exact CDF can be computed
|
||||
*/
|
||||
trait CDFSupported
|
||||
{
|
||||
val baseType: DataType
|
||||
|
||||
assert(baseType == DoubleType, "Non-numerical distributions can not support CDFs")
|
||||
|
||||
def cdf(value: Double, params: Any): Double
|
||||
}
|
||||
|
||||
/**
|
||||
* An add-on to NumericalDistributionFamily that indicates an exact Inverse CDF can be computed
|
||||
*/
|
||||
trait ICDFSupported
|
||||
{
|
||||
val baseType: DataType
|
||||
|
||||
assert(baseType == DoubleType, "Non-numerical distributions can not support ICDFs")
|
||||
|
||||
def icdf(value: Double, params: Any): Double
|
||||
}
|
||||
|
||||
/**
|
||||
* Companion object for distributions: Keeps a registry of all known distributions
|
||||
*/
|
||||
|
@ -128,9 +77,9 @@ object DistributionFamily
|
|||
|
||||
|
||||
/// Pre-defined distributions
|
||||
register(Gaussian)
|
||||
register(NumericalMixture)
|
||||
register(Clamp)
|
||||
register(Discretized)
|
||||
register(Uniform)
|
||||
register(numerical.Gaussian)
|
||||
register(numerical.NumericalMixture)
|
||||
register(numerical.Clamp)
|
||||
register(numerical.Discretized)
|
||||
register(numerical.Uniform)
|
||||
}
|
|
@ -1,4 +1,4 @@
|
|||
package org.mimirdb.pip.distribution
|
||||
package org.mimirdb.pip.distribution.numerical
|
||||
|
||||
import scala.util.Random
|
||||
import java.io.ObjectOutputStream
|
||||
|
@ -7,6 +7,7 @@ import org.mimirdb.pip.udt.UnivariateDistribution
|
|||
import org.apache.spark.sql.functions
|
||||
import org.mimirdb.pip.udt.UnivariateDistributionConstructor
|
||||
import org.apache.spark.sql.catalyst.expressions.Expression
|
||||
import org.mimirdb.pip.distribution.DistributionFamily
|
||||
|
||||
object Clamp
|
||||
extends NumericalDistributionFamily
|
|
@ -1,4 +1,4 @@
|
|||
package org.mimirdb.pip.distribution
|
||||
package org.mimirdb.pip.distribution.numerical
|
||||
|
||||
import scala.util.Random
|
||||
import java.io.Serializable
|
|
@ -1,4 +1,4 @@
|
|||
package org.mimirdb.pip.distribution
|
||||
package org.mimirdb.pip.distribution.numerical
|
||||
|
||||
import scala.util.Random
|
||||
import java.io.ObjectOutputStream
|
|
@ -1,4 +1,4 @@
|
|||
package org.mimirdb.pip.distribution
|
||||
package org.mimirdb.pip.distribution.numerical
|
||||
|
||||
import scala.util.Random
|
||||
import java.io.Serializable
|
|
@ -1,4 +1,4 @@
|
|||
package org.mimirdb.pip.distribution
|
||||
package org.mimirdb.pip.distribution.numerical
|
||||
|
||||
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
|
||||
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
|
||||
|
@ -7,6 +7,7 @@ import org.apache.spark.sql.functions.udaf
|
|||
import org.mimirdb.pip.udt.UnivariateDistribution
|
||||
import java.util.UUID
|
||||
import org.mimirdb.pip.SampleParams
|
||||
import org.mimirdb.pip.distribution.DistributionFamily
|
||||
|
||||
object NumericalMixture
|
||||
extends NumericalDistributionFamily
|
|
@ -1,4 +1,4 @@
|
|||
package org.mimirdb.pip.distribution
|
||||
package org.mimirdb.pip.distribution.numerical
|
||||
|
||||
import scala.util.Random
|
||||
import java.io.Serializable
|
|
@ -0,0 +1,55 @@
|
|||
package org.mimirdb.pip.distribution.numerical
|
||||
|
||||
import org.apache.spark.sql.types.{ DataType, DoubleType }
|
||||
import org.mimirdb.pip.distribution.DistributionFamily
|
||||
|
||||
/**
 * A [Distribution] that specifically samples numbers
 */
trait NumericalDistributionFamily extends DistributionFamily
{
  val baseType = DoubleType

  /**
   * Compute the CDF: P(X <= value).  Exact when the concrete family mixes
   * in [[CDFSupported]]; otherwise a Monte-Carlo estimate over `samples`
   * draws (unseeded Random, so repeated calls vary).
   */
  def approximateCDF(value: Double, params: Any, samples: Int): Double =
    this match {
      case c:CDFSupported => c.cdf(value, params)
      case _ =>
      {
        val rand = new scala.util.Random()
        (0 until samples).count { _ =>
          sample(params, rand).asInstanceOf[Double] <= value
        }.toDouble / samples
      }
    }
  // Fast iff the exact-CDF mixin is present (no sampling needed).
  def approximateCDFIsFast(params: Any): Boolean = this.isInstanceOf[CDFSupported]

  /** Lower bound of the distribution's support for the given parameters. */
  def min(params: Any): Double
  /** Upper bound of the distribution's support for the given parameters. */
  def max(params: Any): Double
}
|
||||
|
||||
/**
 * An add-on to NumericalDistributionFamily that indicates an exact CDF can be computed
 */
trait CDFSupported
{
  val baseType: DataType

  // Guard against mixing this into a non-numerical family.
  assert(baseType == DoubleType, "Non-numerical distributions can not support CDFs")

  /** Exact P(X <= value) for the given parameter blob. */
  def cdf(value: Double, params: Any): Double
}
|
||||
|
||||
/**
 * An add-on to NumericalDistributionFamily that indicates an exact Inverse CDF can be computed
 */
trait ICDFSupported
{
  val baseType: DataType

  // Guard against mixing this into a non-numerical family.
  assert(baseType == DoubleType, "Non-numerical distributions can not support ICDFs")

  /** Exact quantile: the x with CDF(x) = value, for value in [0, 1]. */
  def icdf(value: Double, params: Any): Double
}
|
|
@ -1,8 +1,8 @@
|
|||
package org.mimirdb.pip.udf
|
||||
|
||||
import org.mimirdb.pip.udt.UnivariateDistribution
|
||||
import org.mimirdb.pip.distribution.Discretized
|
||||
import org.mimirdb.pip.distribution.NumericalDistributionFamily
|
||||
import org.mimirdb.pip.distribution.numerical.Discretized
|
||||
import org.mimirdb.pip.distribution.numerical.NumericalDistributionFamily
|
||||
import org.apache.spark.sql.functions
|
||||
|
||||
object Entropy
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
package org.mimirdb.pip.udf
|
||||
|
||||
import org.mimirdb.pip.udt.UnivariateDistribution
|
||||
import org.mimirdb.pip.distribution.Discretized
|
||||
import org.mimirdb.pip.distribution.NumericalDistributionFamily
|
||||
import org.mimirdb.pip.distribution.numerical.Discretized
|
||||
import org.mimirdb.pip.distribution.numerical.NumericalDistributionFamily
|
||||
import org.apache.spark.sql.functions
|
||||
|
||||
object KLDivergence
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
package org.mimirdb.pip.lib
|
||||
import org.mimirdb.pip.distribution.Discretized
|
||||
import org.mimirdb.pip.distribution.numerical.Discretized
|
||||
|
||||
import scala.util.Random
|
||||
|
||||
|
|
Loading…
Reference in New Issue