WIP: Adding support for boolean ops

spatial-index
Oliver Kennedy 2024-04-19 00:03:07 -04:00
parent 5479e9578c
commit 443e34651d
Signed by: okennedy
GPG Key ID: 3E5F9B3ABD3FDB60
16 changed files with 222 additions and 73 deletions

View File

@ -28,16 +28,16 @@ object Pip {
.functionRegistry
.createOrReplaceTempFunction(name, fn, "scala_udf")
registerFunction("gaussian", distribution.Gaussian.Constructor(_))
registerFunction("uniform", distribution.Uniform.Constructor(_))
registerFunction("num_const", distribution.ConstantNumber.Constructor(_))
registerFunction("clamp", distribution.Clamp.Constructor)
registerFunction("discretize", distribution.Discretized.Constructor)
registerFunction("gaussian", distribution.numerical.Gaussian.Constructor(_))
registerFunction("uniform", distribution.numerical.Uniform.Constructor(_))
registerFunction("num_const", distribution.numerical.ConstantNumber.Constructor(_))
registerFunction("clamp", distribution.numerical.Clamp.Constructor)
registerFunction("discretize", distribution.numerical.Discretized.Constructor)
spark.udf.register("entropy", udf.Entropy.udf)
spark.udf.register("kl_divergence", udf.KLDivergence.udf)
// Aggregates
spark.udf.register("uniform_mixture", distribution.NumericalMixture.uniform)
spark.udf.register("uniform_mixture", distribution.numerical.NumericalMixture.uniform)
spark.udf.register("histogram", udaf(udf.Histogram))
}

View File

@ -0,0 +1,35 @@
package org.mimirdb.pip.distribution.boolean
import org.apache.spark.sql.catalyst.expressions.Expression
import org.mimirdb.pip.udt.UnivariateDistributionConstructor
/**
 * Bernoulli distribution over booleans.  The parameter object is a single
 * Double: the probability that a draw comes up true.
 */
object Bernoulli
  extends BooleanDistributionFamily
  with ProbabilitySupported
{
  // The family's parameter object is just the success probability.
  private def successProbability(params: Any): Double =
    params.asInstanceOf[Double]

  /** Exact P[X = true]; this is the parameter itself. */
  def probability(params: Any): Double =
    successProbability(params)

  /** Human-readable label for this distribution instance. */
  def describe(params: Any): String =
    s"Bernoulli($params)"

  /** Draw a boolean that is true with the configured probability. */
  def sample(params: Any, random: scala.util.Random): Boolean =
  {
    val roll = random.nextDouble()
    roll < successProbability(params)
  }

  /** Inverse of [[serialize]]: the parameter is one Double. */
  def deserialize(in: java.io.ObjectInputStream): Any =
    in.readDouble()

  /** Persist the parameter object (a single Double). */
  def serialize(out: java.io.ObjectOutputStream, params: Any): Unit =
    out.writeDouble(successProbability(params))

  /**
   * Spark expression constructor; expects exactly one argument, the
   * success probability.
   */
  case class Constructor(args: Seq[Expression])
    extends UnivariateDistributionConstructor
  {
    def family = Bernoulli
    def params(values: Seq[Any]) = values.head.asInstanceOf[Double]
    def withNewChildrenInternal(newChildren: IndexedSeq[Expression]) =
      copy(args = newChildren)
  }
}

View File

@ -0,0 +1,70 @@
package org.mimirdb.pip.distribution.boolean
import org.mimirdb.pip.distribution.DistributionFamily
import org.mimirdb.pip.distribution.numerical.NumericalDistributionFamily
import org.mimirdb.pip.distribution.numerical.CDFSupported
/**
 * Boolean-valued distribution for the event `lower < X < upper`, where X is
 * drawn from an underlying numerical distribution family.
 */
object Between
  extends BooleanDistributionFamily
{
  /**
   * @param lower      Lower bound of the interval (exclusive; see [[sample]])
   * @param upper      Upper bound of the interval (exclusive; see [[sample]])
   * @param baseDist   Registry label used to look up the underlying family
   * @param baseParams Parameter object of the underlying family
   */
  case class Params(lower: Double, upper: Double, baseDist: String, baseParams: Any)
  {
    /** Resolve the underlying family; must be a numerical family. */
    def dist =
      DistributionFamily(baseDist).asInstanceOf[NumericalDistributionFamily]

    /** Invoke `op` with the underlying family and its parameter object. */
    def apply[A](op: (NumericalDistributionFamily, Any) => A): A =
    {
      op(dist, baseParams)
    }
  }

  /**
   * P[lower < X < upper]: computed exactly as cdf(upper) - cdf(lower) when
   * the underlying family has a closed-form CDF; otherwise falls back to the
   * inherited monte carlo estimate.
   */
  override def approximateProbability(params: Any, samples: Int): Double =
  {
    val config = params.asInstanceOf[Params]
    config.dist match {
      case dist: CDFSupported =>
        dist.cdf(config.upper, config.baseParams) -
          dist.cdf(config.lower, config.baseParams)
      case _ =>
        // No closed form available: estimate by sampling.
        super.approximateProbability(params, samples)
    }
  }

  /** Fast exactly when the underlying family has a closed-form CDF. */
  override def approximateProbabilityIsFast(params: Any): Boolean =
    params.asInstanceOf[Params].dist.isInstanceOf[CDFSupported]

  def describe(params: Any): String =
  {
    val config = params.asInstanceOf[Params]
    s"Between(${config.lower} < ${config { _.describe(_) }} < ${config.upper})"
  }

  /**
   * Draw one value from the underlying distribution and test membership in
   * the open interval (lower, upper).
   */
  def sample(params: Any, random: scala.util.Random): Boolean =
  {
    val config = params.asInstanceOf[Params]
    val v: Double = config { _.sample(_, random).asInstanceOf[Double] }
    (v > config.lower) && (v < config.upper)
  }

  /** Inverse of [[serialize]]: field order must match exactly. */
  def deserialize(in: java.io.ObjectInputStream): Any =
  {
    val lower = in.readDouble()
    val upper = in.readDouble()
    val baseDist = in.readUTF()
    val dist = DistributionFamily(baseDist)
    Params(
      lower = lower,
      upper = upper,
      baseDist = baseDist,
      // The remainder of the stream belongs to the underlying family.
      baseParams = dist.deserialize(in)
    )
  }

  /** Write bounds, then the underlying family's label and parameters. */
  def serialize(out: java.io.ObjectOutputStream, params: Any): Unit =
  {
    val config = params.asInstanceOf[Params]
    out.writeDouble(config.lower)
    out.writeDouble(config.upper)
    out.writeUTF(config.baseDist)
    config { _.serialize(out, _) }
  }
}

View File

@ -0,0 +1,38 @@
package org.mimirdb.pip.distribution.boolean
import org.apache.spark.sql.types.{ DataType, BooleanType }
import org.mimirdb.pip.distribution.DistributionFamily
/**
 * A [Distribution] that specifically samples booleans.
 */
trait BooleanDistributionFamily extends DistributionFamily
{
  val baseType = BooleanType

  /**
   * P[X = true]: exact when this family mixes in [[ProbabilitySupported]];
   * otherwise estimated as the fraction of `samples` monte carlo draws that
   * come up true.
   * NOTE(review): samples == 0 on the sampling path yields 0.0/0 = NaN —
   * callers are presumed to pass samples > 0; confirm.
   */
  def approximateProbability(params: Any, samples: Int): Double =
    this match {
      case c:ProbabilitySupported => c.probability(params)
      case _ =>
      {
        val rand = new scala.util.Random()
        (0 until samples).count { _ =>
          sample(params, rand).asInstanceOf[Boolean]
        }.toDouble / samples
      }
    }

  /** True exactly when the exact-probability path above is available. */
  def approximateProbabilityIsFast(params: Any): Boolean = this.isInstanceOf[ProbabilitySupported]
}
/**
 * An add-on to BooleanDistributionFamily that indicates an exact probability
 * can be computed (no sampling required).
 */
trait ProbabilitySupported
{
  val baseType: DataType
  // Sanity check: only boolean-valued families may mix this in.
  // NOTE(review): this assert runs during trait initialization — confirm that
  // baseType is already initialized at that point for all implementers.
  assert(baseType == BooleanType, "Non-boolean distributions can not support probabilities")

  /** Exact P[X = true] for the given parameter object. */
  def probability(params: Any): Double
}

View File

@ -57,57 +57,6 @@ trait DistributionFamily
def label = this.getClass.getSimpleName.toLowerCase
}
/**
* A [Distribution] that specifically samples numbers
*/
trait NumericalDistributionFamily extends DistributionFamily
{
val baseType = DoubleType
/**
* Compute the CDF
*/
def approximateCDF(value: Double, params: Any, samples: Int): Double =
this match {
case c:CDFSupported => c.cdf(value, params)
case _ =>
{
val rand = new scala.util.Random()
(0 until samples).count { _ =>
sample(params, rand).asInstanceOf[Double] <= value
}.toDouble / samples
}
}
def approximateCDFIsFast(params: Any): Boolean = this.isInstanceOf[CDFSupported]
def min(params: Any): Double
def max(params: Any): Double
}
/**
* An add-on to NumericalDistributionFamily that indicates an exact CDF can be computed
*/
trait CDFSupported
{
val baseType: DataType
assert(baseType == DoubleType, "Non-numerical distributions can not support CDFs")
def cdf(value: Double, params: Any): Double
}
/**
* An add-on to NumericalDistributionFamily that indicates an exact Inverse CDF can be computed
*/
trait ICDFSupported
{
val baseType: DataType
assert(baseType == DoubleType, "Non-numerical distributions can not support ICDFs")
def icdf(value: Double, params: Any): Double
}
/**
* Companion object for distributions: Keeps a registry of all known distributions
*/
@ -128,9 +77,9 @@ object DistributionFamily
/// Pre-defined distributions
register(Gaussian)
register(NumericalMixture)
register(Clamp)
register(Discretized)
register(Uniform)
register(numerical.Gaussian)
register(numerical.NumericalMixture)
register(numerical.Clamp)
register(numerical.Discretized)
register(numerical.Uniform)
}

View File

@ -1,4 +1,4 @@
package org.mimirdb.pip.distribution
package org.mimirdb.pip.distribution.numerical
import scala.util.Random
import java.io.ObjectOutputStream
@ -7,6 +7,7 @@ import org.mimirdb.pip.udt.UnivariateDistribution
import org.apache.spark.sql.functions
import org.mimirdb.pip.udt.UnivariateDistributionConstructor
import org.apache.spark.sql.catalyst.expressions.Expression
import org.mimirdb.pip.distribution.DistributionFamily
object Clamp
extends NumericalDistributionFamily

View File

@ -1,4 +1,4 @@
package org.mimirdb.pip.distribution
package org.mimirdb.pip.distribution.numerical
import scala.util.Random
import java.io.Serializable

View File

@ -1,4 +1,4 @@
package org.mimirdb.pip.distribution
package org.mimirdb.pip.distribution.numerical
import scala.util.Random
import java.io.ObjectOutputStream

View File

@ -1,4 +1,4 @@
package org.mimirdb.pip.distribution
package org.mimirdb.pip.distribution.numerical
import scala.util.Random
import java.io.Serializable

View File

@ -1,4 +1,4 @@
package org.mimirdb.pip.distribution
package org.mimirdb.pip.distribution.numerical
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
@ -7,6 +7,7 @@ import org.apache.spark.sql.functions.udaf
import org.mimirdb.pip.udt.UnivariateDistribution
import java.util.UUID
import org.mimirdb.pip.SampleParams
import org.mimirdb.pip.distribution.DistributionFamily
object NumericalMixture
extends NumericalDistributionFamily

View File

@ -1,4 +1,4 @@
package org.mimirdb.pip.distribution
package org.mimirdb.pip.distribution.numerical
import scala.util.Random
import java.io.Serializable

View File

@ -0,0 +1,55 @@
package org.mimirdb.pip.distribution.numerical
import org.apache.spark.sql.types.{ DataType, DoubleType }
import org.mimirdb.pip.distribution.DistributionFamily
/**
 * A [Distribution] that specifically samples numbers.
 */
trait NumericalDistributionFamily extends DistributionFamily
{
  val baseType = DoubleType

  /**
   * P[X <= value]: exact when this family mixes in [[CDFSupported]];
   * otherwise a monte carlo estimate over `samples` draws.
   */
  def approximateCDF(value: Double, params: Any, samples: Int): Double =
    this match {
      case exact: CDFSupported =>
        exact.cdf(value, params)
      case _ =>
        val rand = new scala.util.Random()
        // Fraction of draws at or below the requested value.
        val hits =
          Iterator.range(0, samples)
                  .map { _ => sample(params, rand).asInstanceOf[Double] }
                  .count { _ <= value }
        hits.toDouble / samples
    }

  /** True exactly when the exact-CDF path above is available. */
  def approximateCDFIsFast(params: Any): Boolean =
    this.isInstanceOf[CDFSupported]

  // Bounds of the distribution for the given parameter object.
  // NOTE(review): presumably the support's lower/upper bound — confirm
  // against implementing families.
  def min(params: Any): Double
  def max(params: Any): Double
}
/**
 * An add-on to NumericalDistributionFamily that indicates an exact CDF can be computed
 */
trait CDFSupported
{
  val baseType: DataType
  // Sanity check: only numerical families may mix this in.
  // NOTE(review): this assert runs during trait initialization — confirm that
  // baseType is already initialized at that point for all implementers.
  assert(baseType == DoubleType, "Non-numerical distributions can not support CDFs")

  /** Exact P[X <= value] for the given parameter object. */
  def cdf(value: Double, params: Any): Double
}
/**
 * An add-on to NumericalDistributionFamily that indicates an exact Inverse CDF can be computed
 */
trait ICDFSupported
{
  val baseType: DataType
  // Sanity check: only numerical families may mix this in.
  // NOTE(review): this assert runs during trait initialization — confirm that
  // baseType is already initialized at that point for all implementers.
  assert(baseType == DoubleType, "Non-numerical distributions can not support ICDFs")

  /** Exact quantile: the x such that cdf(x) == value. */
  def icdf(value: Double, params: Any): Double
}

View File

@ -1,8 +1,8 @@
package org.mimirdb.pip.udf
import org.mimirdb.pip.udt.UnivariateDistribution
import org.mimirdb.pip.distribution.Discretized
import org.mimirdb.pip.distribution.NumericalDistributionFamily
import org.mimirdb.pip.distribution.numerical.Discretized
import org.mimirdb.pip.distribution.numerical.NumericalDistributionFamily
import org.apache.spark.sql.functions
object Entropy

View File

@ -1,8 +1,8 @@
package org.mimirdb.pip.udf
import org.mimirdb.pip.udt.UnivariateDistribution
import org.mimirdb.pip.distribution.Discretized
import org.mimirdb.pip.distribution.NumericalDistributionFamily
import org.mimirdb.pip.distribution.numerical.Discretized
import org.mimirdb.pip.distribution.numerical.NumericalDistributionFamily
import org.apache.spark.sql.functions
object KLDivergence

View File

@ -1,5 +1,5 @@
package org.mimirdb.pip.lib
import org.mimirdb.pip.distribution.Discretized
import org.mimirdb.pip.distribution.numerical.Discretized
import scala.util.Random