Build issues with scalatest.

main
ahuber 2023-11-21 08:35:03 -05:00
parent cefac0f6ad
commit b0b515f5a4
60 changed files with 6355 additions and 1 deletion


@@ -0,0 +1,23 @@
{
"javaSemanticDBVersion": "0.9.6",
"semanticDBVersion": "4.8.12",
"supportedScalaVersions": [
"2.13.12",
"2.12.18",
"2.12.17",
"2.12.16",
"2.12.15",
"2.13.9",
"2.13.10",
"2.13.11",
"2.11.12",
"2.12.11",
"2.12.12",
"2.12.13",
"2.12.14",
"2.13.5",
"2.13.6",
"2.13.7",
"2.13.8"
]
}

Binary file not shown.

.bloop/root-module.json (2999 lines): file diff suppressed because it is too large.

.bloop/test.json (3085 lines): file diff suppressed because it is too large.

.vscode/settings.json (vendored, 5 lines):

@@ -0,0 +1,5 @@
{
"files.watcherExclude": {
"**/target": true
}
}


@@ -17,4 +17,9 @@ object mimir_pip extends RootModule with ScalaModule {
    ivy"org.apache.spark::spark-sql:3.3.1",
  )
  object test extends ScalaTests {
    def ivyDeps = Agg(ivy"org.scalatest::scalatest-flatspec:3.2.17")
    // fully qualified sbt test-interface framework class used by ScalaTest
    def testFramework = "org.scalatest.tools.Framework"
  }
}
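
An alternative sketch for the test module, assuming a recent Mill (0.11.x): mixing in TestModule.ScalaTest supplies the ScalaTest framework automatically, so the explicit testFramework override is unnecessary.

  object test extends ScalaTests with TestModule.ScalaTest {
    def ivyDeps = Agg(ivy"org.scalatest::scalatest-flatspec:3.2.17")
  }

Tests would then run with ./mill test (the test module's default test command).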

conf/log4j2.properties:

@@ -0,0 +1,8 @@
# log4j2 properties syntax (Spark 3.3+ reads log4j2; the log4j 1.x keys are ignored)
rootLogger.level = warn
rootLogger.appenderRef.console.ref = console
appender.console.type = Console
appender.console.name = console
appender.console.target = SYSTEM_ERR
appender.console.layout.type = PatternLayout
appender.console.layout.pattern = %d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n


@@ -0,0 +1,41 @@
package org.mimirdb

import scala.util.Random

/* Gaussian random variable, parameterized by mean and standard deviation.
   (Scala convention is CamelCase; the UDT and its associated methods live in
   their own file, GaussType.scala.) */
final case class Gauss(mean: Float, sd: Float) {

  /* Draw one sample: if Z ~ N(0, 1), then mean + sd * Z ~ N(mean, sd^2),
     so this is exact, not an approximation. RandomVariable.scala is the start
     of a wrapper class for random variables built on draws like this one. */
  def generate_value(): Float =
    Random.nextGaussian().toFloat * sd + mean

  /* Sum of independent Gaussians: means add, variances add. */
  def gaussPlus(operand: Gauss): Gauss = {
    val newMean = this.mean + operand.mean
    val newSd = math.sqrt(math.pow(this.sd, 2) + math.pow(operand.sd, 2))
    Gauss(newMean, newSd.toFloat)
  }

  /* Difference of independent Gaussians: negate the mean only; the standard
     deviation stays positive (variances still add). */
  def gaussMinus(operand: Gauss): Gauss =
    this.gaussPlus(Gauss(-operand.mean, operand.sd))

  /* Product of two Gaussian PDFs, following
     https://ccrma.stanford.edu/~jos/sasp/Product_Two_Gaussian_PDFs.html
     (the product of two densities, not the distribution of the product of two
     Gaussian random variables). The formula gives the new variance, so take
     its square root to get the standard deviation. */
  def gaussMult(operand: Gauss): Gauss = {
    val thisVar = math.pow(this.sd, 2)
    val otherVar = math.pow(operand.sd, 2)
    val newMean = (this.mean * otherVar + operand.mean * thisVar) / (thisVar + otherVar)
    val newSd = math.sqrt((thisVar * otherVar) / (thisVar + otherVar))
    Gauss(newMean.toFloat, newSd.toFloat)
  }

  override def toString(): String =
    s"Gauss(mean: ${mean}, stddev: ${sd})"
}
// Case classes are Serializable by default, which GaussType's
// ObjectOutputStream-based serialize relies on.
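
A quick usage sketch of these combinators (a hypothetical standalone App in the org.mimirdb package, not part of this commit):

object GaussDemo extends App {
  val a = Gauss(2.0f, 3.0f)
  val b = Gauss(5.0f, 4.0f)
  println(a.gaussPlus(b))     // mean 7.0, sd sqrt(3^2 + 4^2) = 5.0
  println(a.gaussMinus(b))    // mean -3.0, sd 5.0 (variances still add)
  println(a.gaussMult(b))     // PDF-product formula from the link above
  println(a.generate_value()) // one random draw from N(2, 3^2)
}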


@@ -0,0 +1,53 @@
package org.mimirdb

import org.apache.spark.sql.types._

class GaussType extends UserDefinedType[Gauss] {
  /*
    Gauss is the user-facing object being represented (it is customary to also
    keep a singleton instance of the UDT around). sqlType is the internal type
    Spark already knows how to handle: the datatype Spark uses for serialized
    instances.
  */
  override def sqlType: DataType = BinaryType

  override def serialize(obj: Gauss): Array[Byte] = {
    val byteBuffer = new java.io.ByteArrayOutputStream()
    val out = new java.io.ObjectOutputStream(byteBuffer)
    out.writeObject(obj)
    out.flush()                // make sure everything reaches the byte buffer
    byteBuffer.toByteArray()
  }

  /* Earlier string-based encoding, kept for reference:
  override def serialize(obj: Gauss): Array[Byte] = obj match {
    case Gauss(mean, sd) => s"Gauss(${mean}, ${sd})".getBytes("UTF-8")
    case _               => "ERROR".getBytes("UTF-8")
  }
  */

  /*
    Spark does not know concrete types until the query runs, so deserialize
    receives Any rather than Array[Byte]. The cast is safe because Spark
    maintains the invariant that it only calls deserialize on values it
    previously produced with serialize.
  */
  override def deserialize(datum: Any): Gauss = {
    /* String-based counterpart of the encoding above, kept for reference:
    val s_datum = new String(datum.asInstanceOf[Array[Byte]], java.nio.charset.StandardCharsets.UTF_8)
    val first_split = s_datum.split("\\(")
    val mean = first_split(1).split(",")(0).toFloat
    val sd = first_split(1).split(",")(1).trim.toFloat
    Gauss(mean, sd)
    */
    val bis = new java.io.ByteArrayInputStream(datum.asInstanceOf[Array[Byte]])
    val in = new java.io.ObjectInputStream(bis)
    in.readObject().asInstanceOf[Gauss]
  }

  override def userClass: Class[Gauss] = classOf[Gauss]
}
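
A small round-trip sketch of the Java-serialization encoding above (names as defined in this commit, assumed to sit in the org.mimirdb package; illustrative only):

object GaussTypeRoundTrip extends App {
  val udt = new GaussType
  val original = Gauss(1.5f, 0.25f)
  val bytes = udt.serialize(original)      // Gauss -> Array[Byte] via ObjectOutputStream
  val restored = udt.deserialize(bytes)    // Array[Byte] -> Gauss via ObjectInputStream
  assert(restored == original)             // case-class equality survives the round trip
  println(restored)
}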


@@ -0,0 +1,16 @@
package org.mimirdb

import org.apache.spark.sql.types.UDTRegistration

// Modeled after the Sedona registrator: register every custom UDT in one place.
// (By convention, class/object names start with an uppercase letter.)
object MimirUDTRegistrator {
  def registerAll(): Unit = {
    UDTRegistration.register(classOf[Gauss].getName(), classOf[GaussType].getName())
  }
}
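
A minimal call-order sketch (Pip.scala below makes the real call): registerAll() must run once per JVM before any query produces Gauss values, and UDTRegistration.exists can confirm the mapping took effect (assumes that developer API is available on the classpath):

MimirUDTRegistrator.registerAll()
assert(UDTRegistration.exists(classOf[Gauss].getName()))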


@@ -1,10 +1,14 @@
package org.mimirdb

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.udf
import org.apache.log4j.{ Logger, Level }
import org.apache.spark.sql.functions._
import scala.util.Random

object Pip
{
  def main(args: Array[String]): Unit =
  {
@@ -15,9 +19,84 @@ object Pip
    spark.sparkContext.setLogLevel("WARN")
    MimirUDTRegistrator.registerAll()

    // You could import spark.range so the call below becomes just range(10).
    // This yields a DataFrame with a single column of 10 rows: the ids 0 through 9.
    val df = spark.range(10)
    df.show()

    //val col_n_dist = randn()
    /*
    val df_rand = df.select(rand().as("Random Data"))
    val df_rand1 = df.select(rand() as "More Random Data")
    */
    val sd_window = 3
    val mean_window = 9
    /*
    val df_rand_sd = df.select(rand() * sd_window as "Standard Deviation")
    val df_rand_mean = df.select((rand() * 9) + 1 as "Mean")
    */

    /* Create a DataFrame of (sd, mean) measurements */
    val df_mean_sd = df.select(expr("id"), rand() * sd_window as "SD", (rand() * mean_window) + 1 as "Mean")
    df_mean_sd.show()

    /*
    Old non-UDF approach, to be deleted:
    var gauss_vals = List[Gauss]()
    df_mean_sd.take(df_mean_sd.count.toInt).foreach(t => gauss_vals = gauss_vals :+ Gauss(t(1).asInstanceOf[Double].toFloat, t(2).asInstanceOf[Double].toFloat))
    df_mean_sd.take(df_mean_sd.count.toInt).foreach(t => println(s"SD=${t(1)},\tmean=${t(2)}"))
    gauss_vals.foreach(o => println(o))
    println(gauss_vals)
    val test_gauss = Gauss(2, 3)
    print(s"Testing: ${test_gauss}")
    gauss_vals = gauss_vals :+ test_gauss
    print(s"Testing list: ${gauss_vals}")
    val rand_gauss = Random.nextGaussian().toFloat
    println(s"\nrandom value generated from a Gaussian distribution with mean 0, standard deviation 1: ${rand_gauss}")
    */

    /* Create Spark UDFs. The lambda parameter types are required for the udf call to compile. */
    // Applying create_gauss takes two Column arguments and yields a Column; print its type signature to verify.
    val create_gauss = udf((mu: Float, sigma: Float) => Gauss(mu, sigma))
    // Test .gaussPlus functionality. Think compositionally: break things down into composable units, i.e. use Gauss object parameters.
    val add_gauss = udf((mu1: Float, sigma1: Float, mu2: Float, sigma2: Float) => Gauss(mu1, sigma1).gaussPlus(Gauss(mu2, sigma2)))
    // Test .gaussMult functionality.
    val prod_gauss = udf((mu1: Float, sigma1: Float, mu2: Float, sigma2: Float) => Gauss(mu1, sigma1).gaussMult(Gauss(mu2, sigma2)))
    spark.udf.register("create_gauss", create_gauss)
    // If we keep add_gauss around, it needs to be registered too:
    //spark.udf.register("add_gauss", add_gauss)

    df_mean_sd.select(df_mean_sd("id"), df_mean_sd("Mean"), df_mean_sd("SD"), create_gauss(df_mean_sd("Mean"), df_mean_sd("SD"))).show()
    df_mean_sd.select(df_mean_sd("*"), create_gauss(df_mean_sd("Mean"), df_mean_sd("SD"))).show(false)
    df_mean_sd.select(df_mean_sd("*"), create_gauss(df_mean_sd("Mean"), df_mean_sd("SD")), add_gauss(df_mean_sd("Mean"), df_mean_sd("SD"), df_mean_sd("Mean"), df_mean_sd("SD")) as("Add Gauss to itself")).show(false)
    df_mean_sd.select(df_mean_sd("*"), create_gauss(df_mean_sd("Mean"), df_mean_sd("SD")), prod_gauss(df_mean_sd("Mean"), df_mean_sd("SD"), df_mean_sd("Mean"), df_mean_sd("SD")) as("(Reflexive) Gauss Product")).show(false)
    // The false argument tells Spark not to truncate the displayed values.
  }
}
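
A follow-on sketch, assuming the df_mean_sd DataFrame and the registered create_gauss UDF above (the temp view name is illustrative): once registered, the same construction works from Spark SQL, relying on the same implicit double-to-float input casts as the DataFrame calls.

    df_mean_sd.createOrReplaceTempView("measurements")
    spark.sql("SELECT id, Mean, SD, create_gauss(Mean, SD) AS gauss FROM measurements").show(false)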


@@ -0,0 +1,5 @@
package org.mimirdb

// Stub: wraps a sampling function (see the random-variable note in Gauss.scala).
final case class RandomVariable(i: () => Float)
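
A tiny wiring sketch under that reading (hypothetical, not part of this commit): a RandomVariable can wrap a Gauss sampler.

    val standardNormal = RandomVariable(() => Gauss(0.0f, 1.0f).generate_value())
    println(standardNormal.i()) // one draw from N(0, 1)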


@@ -0,0 +1,21 @@
{
"workerCache": {
},
"evalWatched": [
],
"moduleWatched": [
],
"scriptImportGraph": [
],
"classLoaderIdentity": [
],
"runClasspath": [
],
"runClasspathHash": 473519988
}


@@ -0,0 +1 @@
1

Binary file not shown.


@@ -0,0 +1,7 @@
package mimir

import org.scalatest.flatspec.AnyFlatSpec

// Placeholder suite; test cases still to be added.
class gaussPlusSuite extends AnyFlatSpec {
}


@@ -0,0 +1,9 @@
package mimirdb

import org.scalatest.flatspec.AnyFlatSpec

class gaussPlusSuite extends AnyFlatSpec {
  "An empty set" should "have size 0" in {
    assert(Set.empty.size == 0)
  }
}
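
A possible next test (a sketch with a hypothetical suite name, assuming org.mimirdb.Gauss is on the test classpath): exercise gaussPlus directly, since means add and standard deviations combine in quadrature.

import org.scalatest.flatspec.AnyFlatSpec
import org.mimirdb.Gauss

class GaussPlusArithmeticSpec extends AnyFlatSpec {
  "gaussPlus" should "add means and combine standard deviations in quadrature" in {
    val sum = Gauss(2.0f, 3.0f).gaussPlus(Gauss(5.0f, 4.0f))
    assert(sum.mean == 7.0f)
    assert(sum.sd == 5.0f) // sqrt(3^2 + 4^2)
  }
}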