WIP: Adding support for boolean ops
parent
5479e9578c
commit
443e34651d
|
@ -28,16 +28,16 @@ object Pip {
|
||||||
.functionRegistry
|
.functionRegistry
|
||||||
.createOrReplaceTempFunction(name, fn, "scala_udf")
|
.createOrReplaceTempFunction(name, fn, "scala_udf")
|
||||||
|
|
||||||
registerFunction("gaussian", distribution.Gaussian.Constructor(_))
|
registerFunction("gaussian", distribution.numerical.Gaussian.Constructor(_))
|
||||||
registerFunction("uniform", distribution.Uniform.Constructor(_))
|
registerFunction("uniform", distribution.numerical.Uniform.Constructor(_))
|
||||||
registerFunction("num_const", distribution.ConstantNumber.Constructor(_))
|
registerFunction("num_const", distribution.numerical.ConstantNumber.Constructor(_))
|
||||||
registerFunction("clamp", distribution.Clamp.Constructor)
|
registerFunction("clamp", distribution.numerical.Clamp.Constructor)
|
||||||
registerFunction("discretize", distribution.Discretized.Constructor)
|
registerFunction("discretize", distribution.numerical.Discretized.Constructor)
|
||||||
spark.udf.register("entropy", udf.Entropy.udf)
|
spark.udf.register("entropy", udf.Entropy.udf)
|
||||||
spark.udf.register("kl_divergence", udf.KLDivergence.udf)
|
spark.udf.register("kl_divergence", udf.KLDivergence.udf)
|
||||||
|
|
||||||
// Aggregates
|
// Aggregates
|
||||||
spark.udf.register("uniform_mixture", distribution.NumericalMixture.uniform)
|
spark.udf.register("uniform_mixture", distribution.numerical.NumericalMixture.uniform)
|
||||||
spark.udf.register("histogram", udaf(udf.Histogram))
|
spark.udf.register("histogram", udaf(udf.Histogram))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
package org.mimirdb.pip.distribution.boolean

import org.apache.spark.sql.catalyst.expressions.Expression
import org.mimirdb.pip.udt.UnivariateDistributionConstructor

/**
 * The Bernoulli distribution: a boolean draw that comes up true with
 * a fixed success probability p.  The family's parameter object is the
 * success probability itself, stored as a raw Double.
 */
object Bernoulli
  extends BooleanDistributionFamily
  with ProbabilitySupported
{
  /** Exact probability of sampling true (i.e., p itself). */
  def probability(params: Any): Double =
    params.asInstanceOf[Double]

  /** Human-readable rendering of this distribution instance. */
  def describe(params: Any): String =
    s"Bernoulli($params)"

  /** Draw one sample: true with probability p. */
  def sample(params: Any, random: scala.util.Random): Boolean =
    random.nextDouble() < params.asInstanceOf[Double]

  /** Read the success probability back from a serialized stream. */
  def deserialize(in: java.io.ObjectInputStream): Any =
    in.readDouble()

  /** Write the success probability (a single Double) to the stream. */
  def serialize(out: java.io.ObjectOutputStream, params: Any): Unit =
    out.writeDouble(params.asInstanceOf[Double])

  /**
   * Spark expression constructor for Bernoulli; expects exactly one
   * argument, the success probability, evaluating to a Double.
   */
  case class Constructor(args: Seq[Expression])
    extends UnivariateDistributionConstructor
  {
    def family = Bernoulli
    def params(values: Seq[Any]) = values(0).asInstanceOf[Double]

    def withNewChildrenInternal(newChildren: IndexedSeq[Expression]) =
      copy(args = newChildren)
  }
}
|
package org.mimirdb.pip.distribution.boolean

import org.mimirdb.pip.distribution.DistributionFamily
import org.mimirdb.pip.distribution.numerical.NumericalDistributionFamily
import org.mimirdb.pip.distribution.numerical.CDFSupported

/**
 * A boolean distribution representing the event that a sample drawn
 * from a base numerical distribution falls between two bounds.
 */
object Between
  extends BooleanDistributionFamily
{
  /**
   * Parameters: the bounds, the registry label of the base
   * distribution family, and the base family's own parameter object.
   */
  case class Params(lower: Double, upper: Double, baseDist: String, baseParams: Any)
  {
    /** Resolve the base family by label; it must be numerical. */
    def dist =
      DistributionFamily(baseDist).asInstanceOf[NumericalDistributionFamily]

    /** Apply an operation to the base family and its parameters. */
    def apply[A](op: (NumericalDistributionFamily, Any) => A): A =
    {
      op(dist, baseParams)
    }
  }

  /**
   * P(lower < X < upper): computed exactly as CDF(upper) - CDF(lower)
   * when the base family supports exact CDFs; otherwise falls back to
   * the trait's sampling-based approximation.
   *
   * NOTE(review): the CDF form counts mass at the bounds slightly
   * differently than [[sample]]'s strict inequalities; identical for
   * continuous base distributions — confirm for discretized ones.
   */
  override def approximateProbability(params: Any, samples: Int): Double =
  {
    val config = params.asInstanceOf[Params]

    config.dist match {
      case dist: CDFSupported =>
        dist.cdf(config.upper, config.baseParams) -
          dist.cdf(config.lower, config.baseParams)
      // Binding was unused in the fallback; use a wildcard.
      case _ => super.approximateProbability(params, samples)
    }
  }

  /** Fast iff the base family can compute CDFs exactly. */
  override def approximateProbabilityIsFast(params: Any): Boolean =
    params.asInstanceOf[Params].dist.isInstanceOf[CDFSupported]

  /** Human-readable rendering, delegating to the base family. */
  def describe(params: Any): String =
  {
    val config = params.asInstanceOf[Params]
    s"Between(${config.lower} < ${config { _.describe(_) }} < ${config.upper})"
  }

  /** Sample the base distribution and test strict containment. */
  def sample(params: Any, random: scala.util.Random): Boolean =
  {
    val config = params.asInstanceOf[Params]
    val v: Double = config { _.sample(_, random).asInstanceOf[Double] }
    // Last expression is the result; no `return` needed.
    (v > config.lower) && (v < config.upper)
  }

  /** Read bounds, base-family label, then base params (see serialize). */
  def deserialize(in: java.io.ObjectInputStream): Any =
  {
    val lower    = in.readDouble()
    val upper    = in.readDouble()
    val baseDist = in.readUTF()
    val dist     = DistributionFamily(baseDist)
    Params(
      lower      = lower,
      upper      = upper,
      baseDist   = baseDist,
      baseParams = dist.deserialize(in)
    )
  }

  /** Field order must mirror [[deserialize]] exactly. */
  def serialize(out: java.io.ObjectOutputStream, params: Any): Unit =
  {
    val config = params.asInstanceOf[Params]
    out.writeDouble(config.lower)
    out.writeDouble(config.upper)
    out.writeUTF(config.baseDist)
    config { _.serialize(out, _) }
  }
}
|
package org.mimirdb.pip.distribution.boolean

import org.apache.spark.sql.types.{ DataType, BooleanType }
import org.mimirdb.pip.distribution.DistributionFamily

/**
 * A [Distribution] that specifically samples boolean values
 * (scaladoc previously copy-pasted from the numerical counterpart)
 */
trait BooleanDistributionFamily extends DistributionFamily
{
  val baseType = BooleanType

  /**
   * Estimate P(true).  Uses the exact probability when this family
   * mixes in [[ProbabilitySupported]]; otherwise approximates it by
   * Monte-Carlo sampling with the given sample count.
   */
  def approximateProbability(params: Any, samples: Int): Double =
    this match {
      case c: ProbabilitySupported => c.probability(params)
      case _ =>
      {
        val rand = new scala.util.Random()
        (0 until samples).count { _ =>
          sample(params, rand).asInstanceOf[Boolean]
        }.toDouble / samples
      }
    }

  /** True iff no sampling is needed to compute the probability. */
  def approximateProbabilityIsFast(params: Any): Boolean = this.isInstanceOf[ProbabilitySupported]
}

/**
 * An add-on to BooleanDistributionFamily that indicates an exact
 * probability can be computed
 * (scaladoc previously referred to NumericalDistributionFamily/CDFs)
 */
trait ProbabilitySupported
{
  val baseType: DataType

  assert(baseType == BooleanType, "Non-boolean distributions can not support probabilities")

  /** The exact probability that a sample from this family is true. */
  def probability(params: Any): Double
}
|
@ -57,57 +57,6 @@ trait DistributionFamily
|
||||||
def label = this.getClass.getSimpleName.toLowerCase
|
def label = this.getClass.getSimpleName.toLowerCase
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* A [Distribution] that specifically samples numbers
|
|
||||||
*/
|
|
||||||
trait NumericalDistributionFamily extends DistributionFamily
|
|
||||||
{
|
|
||||||
val baseType = DoubleType
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Compute the CDF
|
|
||||||
*/
|
|
||||||
def approximateCDF(value: Double, params: Any, samples: Int): Double =
|
|
||||||
this match {
|
|
||||||
case c:CDFSupported => c.cdf(value, params)
|
|
||||||
case _ =>
|
|
||||||
{
|
|
||||||
val rand = new scala.util.Random()
|
|
||||||
(0 until samples).count { _ =>
|
|
||||||
sample(params, rand).asInstanceOf[Double] <= value
|
|
||||||
}.toDouble / samples
|
|
||||||
}
|
|
||||||
}
|
|
||||||
def approximateCDFIsFast(params: Any): Boolean = this.isInstanceOf[CDFSupported]
|
|
||||||
|
|
||||||
def min(params: Any): Double
|
|
||||||
def max(params: Any): Double
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* An add-on to NumericalDistributionFamily that indicates an exact CDF can be computed
|
|
||||||
*/
|
|
||||||
trait CDFSupported
|
|
||||||
{
|
|
||||||
val baseType: DataType
|
|
||||||
|
|
||||||
assert(baseType == DoubleType, "Non-numerical distributions can not support CDFs")
|
|
||||||
|
|
||||||
def cdf(value: Double, params: Any): Double
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* An add-on to NumericalDistributionFamily that indicates an exact Inverse CDF can be computed
|
|
||||||
*/
|
|
||||||
trait ICDFSupported
|
|
||||||
{
|
|
||||||
val baseType: DataType
|
|
||||||
|
|
||||||
assert(baseType == DoubleType, "Non-numerical distributions can not support ICDFs")
|
|
||||||
|
|
||||||
def icdf(value: Double, params: Any): Double
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Companion object for distributions: Keeps a registry of all known distributions
|
* Companion object for distributions: Keeps a registry of all known distributions
|
||||||
*/
|
*/
|
||||||
|
@ -128,9 +77,9 @@ object DistributionFamily
|
||||||
|
|
||||||
|
|
||||||
/// Pre-defined distributions
|
/// Pre-defined distributions
|
||||||
register(Gaussian)
|
register(numerical.Gaussian)
|
||||||
register(NumericalMixture)
|
register(numerical.NumericalMixture)
|
||||||
register(Clamp)
|
register(numerical.Clamp)
|
||||||
register(Discretized)
|
register(numerical.Discretized)
|
||||||
register(Uniform)
|
register(numerical.Uniform)
|
||||||
}
|
}
|
|
@ -1,4 +1,4 @@
|
||||||
package org.mimirdb.pip.distribution
|
package org.mimirdb.pip.distribution.numerical
|
||||||
|
|
||||||
import scala.util.Random
|
import scala.util.Random
|
||||||
import java.io.ObjectOutputStream
|
import java.io.ObjectOutputStream
|
||||||
|
@ -7,6 +7,7 @@ import org.mimirdb.pip.udt.UnivariateDistribution
|
||||||
import org.apache.spark.sql.functions
|
import org.apache.spark.sql.functions
|
||||||
import org.mimirdb.pip.udt.UnivariateDistributionConstructor
|
import org.mimirdb.pip.udt.UnivariateDistributionConstructor
|
||||||
import org.apache.spark.sql.catalyst.expressions.Expression
|
import org.apache.spark.sql.catalyst.expressions.Expression
|
||||||
|
import org.mimirdb.pip.distribution.DistributionFamily
|
||||||
|
|
||||||
object Clamp
|
object Clamp
|
||||||
extends NumericalDistributionFamily
|
extends NumericalDistributionFamily
|
|
@ -1,4 +1,4 @@
|
||||||
package org.mimirdb.pip.distribution
|
package org.mimirdb.pip.distribution.numerical
|
||||||
|
|
||||||
import scala.util.Random
|
import scala.util.Random
|
||||||
import java.io.Serializable
|
import java.io.Serializable
|
|
@ -1,4 +1,4 @@
|
||||||
package org.mimirdb.pip.distribution
|
package org.mimirdb.pip.distribution.numerical
|
||||||
|
|
||||||
import scala.util.Random
|
import scala.util.Random
|
||||||
import java.io.ObjectOutputStream
|
import java.io.ObjectOutputStream
|
|
@ -1,4 +1,4 @@
|
||||||
package org.mimirdb.pip.distribution
|
package org.mimirdb.pip.distribution.numerical
|
||||||
|
|
||||||
import scala.util.Random
|
import scala.util.Random
|
||||||
import java.io.Serializable
|
import java.io.Serializable
|
|
@ -1,4 +1,4 @@
|
||||||
package org.mimirdb.pip.distribution
|
package org.mimirdb.pip.distribution.numerical
|
||||||
|
|
||||||
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
|
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
|
||||||
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
|
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
|
||||||
|
@ -7,6 +7,7 @@ import org.apache.spark.sql.functions.udaf
|
||||||
import org.mimirdb.pip.udt.UnivariateDistribution
|
import org.mimirdb.pip.udt.UnivariateDistribution
|
||||||
import java.util.UUID
|
import java.util.UUID
|
||||||
import org.mimirdb.pip.SampleParams
|
import org.mimirdb.pip.SampleParams
|
||||||
|
import org.mimirdb.pip.distribution.DistributionFamily
|
||||||
|
|
||||||
object NumericalMixture
|
object NumericalMixture
|
||||||
extends NumericalDistributionFamily
|
extends NumericalDistributionFamily
|
|
@ -1,4 +1,4 @@
|
||||||
package org.mimirdb.pip.distribution
|
package org.mimirdb.pip.distribution.numerical
|
||||||
|
|
||||||
import scala.util.Random
|
import scala.util.Random
|
||||||
import java.io.Serializable
|
import java.io.Serializable
|
|
package org.mimirdb.pip.distribution.numerical

import org.apache.spark.sql.types.{ DataType, DoubleType }
import org.mimirdb.pip.distribution.DistributionFamily

/**
 * A [Distribution] that specifically samples numbers
 */
trait NumericalDistributionFamily extends DistributionFamily
{
  val baseType = DoubleType

  /**
   * Compute the CDF at `value`: exactly when this family mixes in
   * [[CDFSupported]], otherwise by Monte-Carlo sampling with the
   * given sample count.
   */
  def approximateCDF(value: Double, params: Any, samples: Int): Double =
    this match {
      case c: CDFSupported => c.cdf(value, params)
      case _ =>
      {
        val rand = new scala.util.Random()
        (0 until samples).count { _ =>
          sample(params, rand).asInstanceOf[Double] <= value
        }.toDouble / samples
      }
    }

  /** True iff no sampling is needed to compute the CDF. */
  def approximateCDFIsFast(params: Any): Boolean = this.isInstanceOf[CDFSupported]

  /** Lower bound of this distribution's support. */
  def min(params: Any): Double
  /** Upper bound of this distribution's support. */
  def max(params: Any): Double
}

/**
 * An add-on to NumericalDistributionFamily that indicates an exact CDF can be computed
 */
trait CDFSupported
{
  val baseType: DataType

  assert(baseType == DoubleType, "Non-numerical distributions can not support CDFs")

  /** The exact cumulative probability P(X <= value). */
  def cdf(value: Double, params: Any): Double
}

/**
 * An add-on to NumericalDistributionFamily that indicates an exact Inverse CDF can be computed
 */
trait ICDFSupported
{
  val baseType: DataType

  assert(baseType == DoubleType, "Non-numerical distributions can not support ICDFs")

  /** The exact quantile function: the x with P(X <= x) = value. */
  def icdf(value: Double, params: Any): Double
}
|
@ -1,8 +1,8 @@
|
||||||
package org.mimirdb.pip.udf
|
package org.mimirdb.pip.udf
|
||||||
|
|
||||||
import org.mimirdb.pip.udt.UnivariateDistribution
|
import org.mimirdb.pip.udt.UnivariateDistribution
|
||||||
import org.mimirdb.pip.distribution.Discretized
|
import org.mimirdb.pip.distribution.numerical.Discretized
|
||||||
import org.mimirdb.pip.distribution.NumericalDistributionFamily
|
import org.mimirdb.pip.distribution.numerical.NumericalDistributionFamily
|
||||||
import org.apache.spark.sql.functions
|
import org.apache.spark.sql.functions
|
||||||
|
|
||||||
object Entropy
|
object Entropy
|
||||||
|
|
|
@ -1,8 +1,8 @@
|
||||||
package org.mimirdb.pip.udf
|
package org.mimirdb.pip.udf
|
||||||
|
|
||||||
import org.mimirdb.pip.udt.UnivariateDistribution
|
import org.mimirdb.pip.udt.UnivariateDistribution
|
||||||
import org.mimirdb.pip.distribution.Discretized
|
import org.mimirdb.pip.distribution.numerical.Discretized
|
||||||
import org.mimirdb.pip.distribution.NumericalDistributionFamily
|
import org.mimirdb.pip.distribution.numerical.NumericalDistributionFamily
|
||||||
import org.apache.spark.sql.functions
|
import org.apache.spark.sql.functions
|
||||||
|
|
||||||
object KLDivergence
|
object KLDivergence
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
package org.mimirdb.pip.lib
|
package org.mimirdb.pip.lib
|
||||||
import org.mimirdb.pip.distribution.Discretized
|
import org.mimirdb.pip.distribution.numerical.Discretized
|
||||||
|
|
||||||
import scala.util.Random
|
import scala.util.Random
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue