Build issues with scalatest.
parent
cefac0f6ad
commit
b0b515f5a4
|
@ -0,0 +1,23 @@
|
|||
{
|
||||
"javaSemanticDBVersion": "0.9.6",
|
||||
"semanticDBVersion": "4.8.12",
|
||||
"supportedScalaVersions": [
|
||||
"2.13.12",
|
||||
"2.12.18",
|
||||
"2.12.17",
|
||||
"2.12.16",
|
||||
"2.12.15",
|
||||
"2.13.9",
|
||||
"2.13.10",
|
||||
"2.13.11",
|
||||
"2.11.12",
|
||||
"2.12.11",
|
||||
"2.12.12",
|
||||
"2.12.13",
|
||||
"2.12.14",
|
||||
"2.13.5",
|
||||
"2.13.6",
|
||||
"2.13.7",
|
||||
"2.13.8"
|
||||
]
|
||||
}
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,5 @@
|
|||
{
|
||||
"files.watcherExclude": {
|
||||
"**/target": true
|
||||
}
|
||||
}
|
5
build.sc
5
build.sc
|
@ -17,4 +17,9 @@ object mimir_pip extends RootModule with ScalaModule {
|
|||
ivy"org.apache.spark::spark-sql:3.3.1",
|
||||
)
|
||||
|
||||
object test extends ScalaTests {
|
||||
def ivyDeps = Agg(ivy"org.scalatest::scalatest-flatspec:3.2.17")
|
||||
def testFramework = "Flatspec"
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,5 @@
|
|||
log4j.rootCategory=WARN, console
|
||||
log4j.appender.console=org.apache.log4j.ConsoleAppender
|
||||
log4j.appender.console.target=System.err
|
||||
log4j.appender.console.layout=org.apache.log4j.PatternLayout
|
||||
log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
|
|
@ -0,0 +1,41 @@
|
|||
package org.mimirdb
|
||||
import scala.util.Random
|
||||
|
||||
/* Gaussian random variable, represented by its mean and standard deviation.
 * Scala convention note kept from the original: UpperCamelCase type name, one
 * class per concern. Case classes are Serializable by default, which Spark's
 * Java-serialization-based UDT (GaussType) relies on.
 */
final case class Gauss(mean: Float, sd: Float) {

  /** Draws one sample from N(mean, sd^2).
    *
    * A standard-normal draw scaled by `sd` and shifted by `mean` is the exact
    * (not approximate) way to sample N(mean, sd^2) — resolves the original
    * author's open question in the comment here.
    */
  def generate_value(): Float =
    Random.nextGaussian().toFloat * sd + mean

  /** Sum of two independent Gaussians: means add, variances add. */
  def gaussPlus(operand: Gauss): Gauss = {
    val newMean     = mean + operand.mean
    val newVariance = math.pow(sd, 2) + math.pow(operand.sd, 2)
    Gauss(newMean, math.sqrt(newVariance).toFloat)
  }

  /** Difference of two independent Gaussians: means subtract, variances still ADD.
    *
    * Implemented by negating only the operand's mean (the original also negated
    * the standard deviation, which is meaningless for a dispersion parameter;
    * squaring hid the sign, so observable behavior is unchanged).
    */
  def gaussMinus(operand: Gauss): Gauss =
    gaussPlus(operand.copy(mean = -operand.mean))

  /** Product of two Gaussian PDFs, per
    * https://ccrma.stanford.edu/~jos/sasp/Product_Two_Gaussian_PDFs.html
    *
    * BUG FIX: the formula at that link gives the product's VARIANCE as
    * s1^2 * s2^2 / (s1^2 + s2^2); the original stored that variance directly in
    * the `sd` field, inconsistent with gaussPlus (which stores a true standard
    * deviation). We now take the square root before constructing the result.
    */
  def gaussMult(operand: Gauss): Gauss = {
    val v1 = math.pow(sd, 2)
    val v2 = math.pow(operand.sd, 2)
    val newMean     = (mean * v2 + operand.mean * v1) / (v1 + v2)
    val newVariance = (v1 * v2) / (v1 + v2)
    Gauss(newMean.toFloat, math.sqrt(newVariance).toFloat)
  }

  override def toString(): String = {
    s"Gauss(mean: ${mean}, stdiv: ${sd})"
  }

}
|
|
@ -0,0 +1,53 @@
|
|||
package org.mimirdb
|
||||
import org.apache.spark.sql.types._
|
||||
import org.apache.spark.unsafe.types.UTF8String
|
||||
import org.apache.hadoop.shaded.com.nimbusds.jose.util.StandardCharset
|
||||
|
||||
class GaussType extends UserDefinedType[Gauss] {
  /*
   * Gauss is the user-facing object being represented. sqlType is the internal
   * type Spark already knows how to deal with — the datatype Spark uses for
   * serialized instances. Instances round-trip through plain Java serialization
   * into a byte array.
   */
  override def sqlType: DataType = BinaryType

  /** Serializes a Gauss to a byte array via Java object serialization. */
  override def serialize(obj: Gauss): Array[Byte] = {
    val byteBuffer = new java.io.ByteArrayOutputStream()
    val out = new java.io.ObjectOutputStream(byteBuffer)
    try {
      out.writeObject(obj)
      // BUG FIX: ObjectOutputStream buffers block data internally; without a
      // flush, toByteArray() can return a truncated (undeserializable) stream.
      out.flush()
    } finally {
      out.close()
    }
    byteBuffer.toByteArray()
  }

  /*
   * When Spark itself is compiled it doesn't know types until the query runs,
   * so `datum` arrives as Any. Spark maintains the invariant that it never
   * calls deserialize on something it did not previously serialize, so the
   * casts to Array[Byte] and to Gauss are safe.
   */
  override def deserialize(datum: Any): Gauss = {
    val bis = new java.io.ByteArrayInputStream(datum.asInstanceOf[Array[Byte]])
    val in = new java.io.ObjectInputStream(bis)
    try in.readObject().asInstanceOf[Gauss]
    finally in.close() // close even though the backing buffer is in-memory
  }

  override def userClass: Class[Gauss] = classOf[Gauss]
}
|
|
@ -0,0 +1,16 @@
|
|||
package org.mimirdb
|
||||
|
||||
|
||||
import org.apache.spark.sql.types.UDTRegistration
|
||||
|
||||
|
||||
/** Registers Mimir's user-defined types with Spark.
  * Modeled after the Sedona registrator; uppercase first letter by convention.
  */
object MimirUDTRegistrator {

  /** Registers every Mimir UDT (currently just Gauss -> GaussType) with
    * Spark's UDTRegistration.
    */
  def registerAll(): Unit = {
    val registrations = Seq(
      classOf[Gauss].getName() -> classOf[GaussType].getName()
    )
    registrations.foreach { case (userClassName, udtClassName) =>
      UDTRegistration.register(userClassName, udtClassName)
    }
  }

}
|
||||
|
|
@ -1,10 +1,14 @@
|
|||
package org.mimirdb
|
||||
|
||||
import org.apache.spark.sql.SparkSession
|
||||
import org.apache.spark.sql.functions.udf
|
||||
import org.apache.log4j.{ Logger, Level }
|
||||
import org.apache.spark.sql.functions._
|
||||
import scala.util.Random
|
||||
|
||||
object Pip
|
||||
{
|
||||
|
||||
|
||||
def main(args: Array[String]): Unit =
|
||||
{
|
||||
|
@ -15,9 +19,84 @@ object Pip
|
|||
|
||||
spark.sparkContext.setLogLevel("WARN")
|
||||
|
||||
MimirUDTRegistrator.registerAll()
|
||||
//you could alias spark.range: import spark.range; then line 21 becomes RHS range(10)
|
||||
//This yields a dataframe that has one column with 10 rows, the set of natural numbers [1:10]
|
||||
val df = spark.range(10)
|
||||
|
||||
df.show()
|
||||
//val col_n_dist = randn()
|
||||
/*
|
||||
val df_rand = df.select(rand().as("Random Data"))
|
||||
val df_rand1 = df.select(rand() as "More Random Data")
|
||||
*/
|
||||
val sd_window = 3
|
||||
val mean_window = 9
|
||||
|
||||
/*
|
||||
val df_rand_sd = df.select(rand() * sd_window as "Standard Deviation")
|
||||
val df_rand_mean = df.select((rand() * 9) + 1 as "Mean")
|
||||
*/
|
||||
|
||||
/* Create df of sd/mean measurements */
|
||||
val df_mean_sd = df.select(expr("id"), rand() * sd_window as "SD", (rand() * mean_window) + 1 as "Mean")
|
||||
df_mean_sd.show()
|
||||
|
||||
/*
|
||||
Old stuff to be deleted
|
||||
|
||||
var gauss_vals = List[Gauss]()
|
||||
|
||||
df_mean_sd.take(df_mean_sd.count.toInt).foreach(t => gauss_vals = gauss_vals :+ Gauss(t(1).asInstanceOf[Double].asInstanceOf[Float], t(2).asInstanceOf[Double].asInstanceOf[Float]))
|
||||
df_mean_sd.take(df_mean_sd.count.toInt).foreach(t => println(s"SD=${t(1)},\tmean=${t(2)}"))
|
||||
|
||||
gauss_vals.foreach(o => println(o))
|
||||
println(gauss_vals)
|
||||
|
||||
val test_gauss = Gauss(2, 3)
|
||||
print(s"Testing: ${test_gauss}")
|
||||
gauss_vals = gauss_vals :+ test_gauss
|
||||
print(s"Testing list: ${gauss_vals}")
|
||||
val rand_gauss = Random.nextGaussian().asInstanceOf[Float]
|
||||
println(s"\nrandom value geneerated from gaussian distribution with mean 0 standard deviation 1: ${rand_gauss}")
|
||||
|
||||
//val class_name = "guassian"
|
||||
//val class_type = "random"
|
||||
|
||||
*/
|
||||
|
||||
|
||||
|
||||
/* Create spark udf */
|
||||
val create_gauss = udf((mu, sigma) => Gauss(mu, sigma))//this is a function that takes 2 Column objects and returns a Column object
|
||||
//you can verify this by printing the type signature of create_gauss
|
||||
|
||||
|
||||
//test .gaussPlus functionality
|
||||
|
||||
//think compositionally; break things down into composable units, i.e. use Guass object parameters
|
||||
val add_gauss = udf((mu1: Float, sigma1: Float, mu2:Float, sigma2: Float) => Gauss(mu1, sigma1).gaussPlus(Gauss(mu2, sigma2)))
|
||||
|
||||
//test .gaussMult functionality
|
||||
val prod_gauss = udf((mu1: Float, sigma1: Float, mu2: Float, sigma2: Float) => Gauss(mu1, sigma1).gaussMult(Gauss(mu2, sigma2)))
|
||||
spark.udf.register("create_gauss", create_gauss)
|
||||
//if we are going to keep add_gauss around, we need to register it
|
||||
//spark.udf.register("add_gauss", add_gauss)
|
||||
|
||||
df_mean_sd.select(df_mean_sd("id"), df_mean_sd("Mean"), df_mean_sd("SD"), create_gauss(df_mean_sd("Mean"), df_mean_sd("SD"))).show()
|
||||
df_mean_sd.select(df_mean_sd("*"), create_gauss(df_mean_sd("Mean"), df_mean_sd("SD"))).show(false)
|
||||
|
||||
df_mean_sd.select(df_mean_sd("*"), create_gauss(df_mean_sd("Mean"), df_mean_sd("SD")), add_gauss(df_mean_sd("Mean"), df_mean_sd("SD"), df_mean_sd("Mean"), df_mean_sd("SD")) as("Add Gauss to itself")).show(false)
|
||||
df_mean_sd.select(df_mean_sd("*"), create_gauss(df_mean_sd("Mean"), df_mean_sd("SD")), prod_gauss(df_mean_sd("Mean"), df_mean_sd("SD"), df_mean_sd("Mean"), df_mean_sd("SD")) as("(Reflexive) Gauss Product")).show(false)
|
||||
//false argument tells spark not to truncate data
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,5 @@
|
|||
package org.mimirdb
|
||||
|
||||
/** A random variable represented by its sampling thunk `i`: each call to `i()`
  * yields one Float draw from the underlying distribution.
  */
final case class RandomVariable(i: () => Float)
|
|
@ -0,0 +1,21 @@
|
|||
{
|
||||
"workerCache": {
|
||||
|
||||
},
|
||||
"evalWatched": [
|
||||
|
||||
],
|
||||
"moduleWatched": [
|
||||
|
||||
],
|
||||
"scriptImportGraph": [
|
||||
|
||||
],
|
||||
"classLoaderIdentity": [
|
||||
|
||||
],
|
||||
"runClasspath": [
|
||||
|
||||
],
|
||||
"runClasspathHash": 473519988
|
||||
}
|
|
@ -0,0 +1 @@
|
|||
1
|
Binary file not shown.
|
@ -0,0 +1,7 @@
|
|||
package mimir
|
||||
|
||||
import org.scalatest.flatspec.AnyFlatSpec
|
||||
|
||||
// Placeholder spec for Gauss.gaussPlus; no tests yet.
class gaussPlusSuite extends AnyFlatSpec {
  // BUG FIX: the superclass was misspelled "AnyFlatspec" (lowercase 's'),
  // which does not exist — the import above is org.scalatest.flatspec.AnyFlatSpec —
  // so this file failed to compile ("Build issues with scalatest").
}
|
|
@ -0,0 +1,9 @@
|
|||
package mimirdb
|
||||
|
||||
import org.scalatest.flatspec.AnyFlatSpec
|
||||
|
||||
// Sanity-check suite confirming the ScalaTest/mill wiring works end to end.
class gaussPlusSuite extends AnyFlatSpec {
  // FIX: corrected the typo "empyt" in the spec description (this text is the
  // human-readable test name reported by ScalaTest).
  "An empty set" should "have size 0" in {
    assert(Set.empty.size == 0)
  }
}
|
Loading…
Reference in New Issue