mimir-pip/src/org/mimirdb/pip/Main.scala

package org.mimirdb.pip
import org.apache.spark.sql.SparkSession
import org.apache.log4j.{ Logger, Level }
import org.apache.spark.sql.functions._
import scala.util.Random
/* To read file from URL */
import sys.process._
import java.net.URL
import java.io.File
/* To check for existence */
import java.nio.file.{Paths, Files}
/* For SedonaContext */
import org.apache.sedona.spark.SedonaContext
import org.mimirdb.pip.distribution._
import org.mimirdb.pip.lib._
object Main
{
  /* I/O source: https://stackoverflow.com/questions/24162478/how-to-download-and-save-a-file-from-the-internet-using-scala#26422540 */
  def fileDownload(url: String, filename: String): Unit = {
    (new URL(url) #> new File(filename)).!!
  }
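
  // Example: the (currently commented-out) workflow below uses this helper to fetch
  // the M20 waypoints JSON, i.e.
  //   fileDownload("https://mars.nasa.gov/mmgis-maps/M20/Layers/json/M20_waypoints.json", "marsRoverData.json")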

  def main(args: Array[String]): Unit =
  {
    val size =
      args.headOption.map { _.toInt }.getOrElse { 4000 }
    println(s"Size: $size")
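
    // Build `size` test data points (tagged with their index) and time bottom-up
    // hierarchical clustering under the Euclidean distance metric from TestData.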
    val data = TestData.makeData(size).zipWithIndex
    Time(s"Hierarchical@$size"){
      HierarchicalClustering.bottomUp(data, TestData.EuclideanDistance)
    }
// val spark = SparkSession.builder
// .appName("pip")
// .master("local[*]")
// .getOrCreate()
// spark.sparkContext.setLogLevel("WARN")
// spark.sparkContext.setCheckpointDir("spark-warehouse")
// Pip.init(spark)
// /*
// Reproducing Vizier mars_rover workflow
// NOTE:
// this will probably be migrated into its own files later down the road
// */
// import org.apache.spark.sql.DataFrameReader
// /* It appears hadoop doesn't like urls, so need to
// i) download the file locally
// ii) then read it in
// */
// val webData = "https://mars.nasa.gov/mmgis-maps/M20/Layers/json/M20_waypoints.json"
// val fileData: String = "marsRoverData.json"
// /* We don't need to download if we already have the file.
// Source: http://stackoverflow.com/questions/21177107/ddg#21178667
// */
// if (!Files.exists(Paths.get(fileData))) {
// println("We didn't find the file. Now downloading...")
// fileDownload(webData, fileData)
// }
// println("We did find the file, now reading.")
// assert(SedonaContext.create(spark) eq spark)
// var df = spark.read.option("multiLine", true).json(fileData)
// /* Create temporary Spark view to query */
// df.createOrReplaceTempView("trips")
// ////////////////////////////////////////////////////////
// // Extract GeoJSON and properties field from the data
// ////////////////////////////////////////////////////////
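// // The query below explodes the GeoJSON feature array into one row per waypoint,
// // flattens each feature's properties, and parses its geometry into a Sedona geometry.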
// df = spark.sql("""
// SELECT features.type,
// features.properties.*,
// ST_GeomFromGeoJSON(to_json(features.geometry)) as geo
// FROM (
// SELECT explode(features) AS features FROM trips
// )
// """).coalesce(1)
// // sqlDF.printSchema()
// // df.show(false)
// df.createOrReplaceTempView("traverse_data")
// ////////////////////////////////////////////////////////
// // Trip Times
// ////////////////////////////////////////////////////////
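// // km_traveled / sols_traveled are per-leg deltas: each waypoint's distance and
// // sol count relative to the previous waypoint, computed with lag() ordered by sol.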
// df = spark.sql("""
// SELECT *,
// dist_km - lag(dist_km, 1, 0) OVER (PARTITION BY 1 ORDER BY sol) AS km_traveled,
// sol - lag(sol, 1, 0) OVER (PARTITION BY 1 ORDER BY sol) AS sols_traveled
// FROM traverse_data
// WHERE dist_km > 0
// """)
// // df.show(false)
// df.createOrReplaceTempView("traverse_data")
// // spark.sql("""
// // SELECT max(km_traveled * 1000 / sols_traveled) as m_per_sol
// // FROM traverse_data
// // WHERE sols_traveled > 0
// // """).show()
// // return
// ////////////////////////////////////////////////////////
// // Trip Distances
// ////////////////////////////////////////////////////////
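// // Models each leg's speed as a gaussian centered on the observed meters per sol
// // (the second argument is presumably its spread), clamped to the range [0, 800].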
// df = spark.sql("""
// SELECT ST_Point(
// CAST(lon as decimal(24,20)),
// CAST(lat as decimal(24,20))
// ) as geometry,
// clamp(gaussian(cast(km_traveled as double) * 1000 / sols_traveled, 3.0), 0.0, 800) as m_per_sol
// FROM traverse_data
// WHERE sols_traveled > 0
// """)//.checkpoint()
// // tripDist.printSchema()
// // df.show(false)
// df.createOrReplaceTempView("trip_points")
// ////////////////////////////////////////////////////////
// // Bounding Box
// ////////////////////////////////////////////////////////
// df = spark.sql("""
// SELECT min(lat) as min_lat,
// max(lat) as max_lat,
// min(lon) as min_lon,
// max(lon) as max_lon
// FROM traverse_data
// """)
// // df.show(false)
// df.createOrReplaceTempView("mission_region")
// ////////////////////////////////////////////////////////
// // Example Histogram Regions
// ////////////////////////////////////////////////////////
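// // Splits the mission bounding box into a 10x10 lon/lat grid of envelope polygons,
// // with cell id = 10 * lon_idx + lat_idx.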
// df = spark.sql("""
// SELECT id, ST_PolygonFromEnvelope(lon_low, lat_low, lon_high, lat_high) as geometry
// FROM (
// SELECT
// 10 * lon_idx + lat_idx AS id,
// (max_lat - min_lat)/10 * lat_idx + min_lat AS lat_low,
// (max_lat - min_lat)/10 * (lat_idx+1) + min_lat AS lat_high,
// (max_lon - min_lon)/10 * lon_idx + min_lon AS lon_low,
// (max_lon - min_lon)/10 * (lon_idx+1) + min_lon AS lon_high
// FROM (SELECT id AS lon_idx from range(0,10)) AS lon_split,
// (SELECT id AS lat_idx from range(0,10)) AS lat_split,
// mission_region
// )
// """)
// // df.show(false)
// df.createOrReplaceTempView("bounding_boxes")
// ////////////////////////////////////////////////////////
// // Per-region distributions
// ////////////////////////////////////////////////////////
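// // For each grid cell: collect the m_per_sol distributions of the waypoints it
// // contains, mix them uniformly, and discretize the mixture onto 40 m/sol-wide
// // bins spanning 0 to 400.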
// df = spark.sql("""
// SELECT box.id,
// array_agg(m_per_sol) as components,
// discretize(
// uniform_mixture(m_per_sol),
// array(0.0, 40.0, 80.0, 120.0, 160.0, 200.0, 240.0, 280.0, 320.0, 360.0, 400.0),
// 1000
// ) as m_per_sol
// FROM trip_points point,
// bounding_boxes box
// WHERE ST_Contains(box.geometry, point.geometry)
// GROUP BY box.id
// """)
// // df.show(false)
// df.createOrReplaceTempView("grid_squares")
// ////////////////////////////////////////////////////////
// // Per-region metrics
// ////////////////////////////////////////////////////////
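// // Per-cell summary: entropy of the discretized mixture, plus the largest
// // KL divergence of any single component distribution from that mixture.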
// df = spark.sql("""
// SELECT id,
// m_per_sol,
// entropy(m_per_sol) as entropy,
// array_max(
// transform(
// components,
// x -> kl_divergence(x, m_per_sol)
// )
// ) as max_kl_div
// -- components
// FROM grid_squares
// -- LIMIT 1
// """)
// df.show(false)
  }
}