// (file listing metadata: 210 lines, 7.0 KiB, Scala)
package org.mimirdb.pip
|
|
|
|
import org.apache.spark.sql.SparkSession
|
|
import org.apache.log4j.{ Logger, Level }
|
|
import org.apache.spark.sql.functions._
|
|
import scala.util.Random
|
|
|
|
/* To read file from URL */
|
|
import sys.process._
|
|
import java.net.URL
|
|
import java.io.File
|
|
|
|
/*To check for existence */
|
|
import java.nio.file.{Paths, Files}
|
|
|
|
/* For SedonaContext */
|
|
import org.apache.sedona.spark.SedonaContext
|
|
import org.apache.logging.log4j.core.tools.picocli.CommandLine.Help.Column
|
|
import org.mimirdb.pip.distribution._
|
|
|
|
import scala.util.Random
|
|
import org.mimirdb.pip.lib._
|
|
|
|
/**
 * Command-line entry point for Pip experiments.
 *
 * Currently runs a timed bottom-up hierarchical clustering benchmark over a
 * synthetic dataset.  The (commented-out) remainder of `main` reproduces the
 * Vizier mars_rover Spark/Sedona workflow and is kept for future migration
 * into its own files.
 */
object Main
{

  /**
   * Download the resource at `url` into a local file named `filename`.
   *
   * Blocks until the download completes and overwrites any existing file.
   * I/O source: https://stackoverflow.com/questions/24162478/how-to-download-and-save-a-file-from-the-internet-using-scala#26422540
   *
   * @param url      remote resource to fetch
   * @param filename path of the local file to write
   */
  def fileDownload(url: String, filename: String): Unit = {
    // `#>` pipes the URL's content into the file; `!!` runs the pipeline
    // and throws an exception if it exits with a non-zero status.
    new URL(url) #> new File(filename) !!
  }

  /**
   * Entry point.
   *
   * @param args optional first argument: dataset size (defaults to 4000;
   *             a non-integer argument is warned about and ignored rather
   *             than crashing with a NumberFormatException as before)
   */
  def main(args: Array[String]): Unit =
  {
    val defaultSize = 4000

    // Dataset size from the first CLI argument, falling back to the default
    // when the argument is absent or not a valid integer.
    val size =
      args.headOption match {
        case None      => defaultSize
        case Some(arg) =>
          scala.util.Try(arg.toInt).getOrElse {
            System.err.println(s"Ignoring non-integer size argument '$arg'; using $defaultSize")
            defaultSize
          }
      }

    println(s"Size: $size")

    // Build the synthetic dataset (each point tagged with its index) and
    // time bottom-up hierarchical clustering under Euclidean distance.
    val data = TestData.makeData(size).zipWithIndex
    Time(s"Hierarchical@$size"){
      HierarchicalClustering.bottomUp(data, TestData.EuclideanDistance)
    }

    // val spark = SparkSession.builder
    //   .appName("pip")
    //   .master("local[*]")
    //   .getOrCreate()

    // spark.sparkContext.setLogLevel("WARN")
    // spark.sparkContext.setCheckpointDir("spark-warehouse")

    // Pip.init(spark)

    // /*
    //   Reproducing Vizier mars_rover workflow
    //   NOTE:
    //   this will probably be migrated into its own files later down the road
    // */

    // import org.apache.spark.sql.DataFrameReader
    // /* It appears hadoop doesn't like urls, so need to
    //    i) download the file locally
    //    ii) then read it in
    // */
    // val webData = "https://mars.nasa.gov/mmgis-maps/M20/Layers/json/M20_waypoints.json"
    // val fileData: String = "marsRoverData.json"

    // /* We don't need to download if we already have the file.
    //    Source: http://stackoverflow.com/questions/21177107/ddg#21178667
    // */
    // if (!Files.exists(Paths.get(fileData))) {
    //   println("We didn't find the file. Now downloading...")
    //   fileDownload(webData, fileData)
    // }

    // println("We did find the file, now reading.")
    // assert(SedonaContext.create(spark) eq spark)
    // var df = spark.read.option("multiLine", true).json(fileData)

    // /* Create temporary Spark view to query */
    // df.createOrReplaceTempView("trips")

    // ////////////////////////////////////////////////////////
    // // Extract GeoJSON and properties field from the data
    // ////////////////////////////////////////////////////////
    // df = spark.sql("""
    //   SELECT features.type,
    //          features.properties.*,
    //          ST_GeomFromGeoJSON(to_json(features.geometry)) as geo
    //   FROM (
    //     SELECT explode(features) AS features FROM trips
    //   )
    // """).coalesce(1)
    // // sqlDF.printSchema()
    // // df.show(false)
    // df.createOrReplaceTempView("traverse_data")

    // ////////////////////////////////////////////////////////
    // // Trip Times
    // ////////////////////////////////////////////////////////
    // df = spark.sql("""
    //   SELECT *,
    //          dist_km - lag(dist_km, 1, 0) OVER (PARTITION BY 1 ORDER BY sol) AS km_traveled,
    //          sol - lag(sol, 1, 0) OVER (PARTITION BY 1 ORDER BY sol) AS sols_traveled
    //   FROM traverse_data
    //   WHERE dist_km > 0
    // """)
    // // df.show(false)
    // df.createOrReplaceTempView("traverse_data")
    // // spark.sql("""
    // //   SELECT max(km_traveled * 1000 / sols_traveled) as m_per_sol
    // //   FROM traverse_data
    // //   WHERE sols_traveled > 0
    // // """).show()
    // // return

    // ////////////////////////////////////////////////////////
    // // Trip Distances
    // ////////////////////////////////////////////////////////
    // df = spark.sql("""
    //   SELECT ST_Point(
    //            CAST(lon as decimal(24,20)),
    //            CAST(lat as decimal(24,20))
    //          ) as geometry,
    //          clamp(gaussian(cast(km_traveled as double) * 1000 / sols_traveled, 3.0), 0.0, 800) as m_per_sol
    //   FROM traverse_data
    //   WHERE sols_traveled > 0
    // """)//.checkpoint()
    // // tripDist.printSchema()
    // // df.show(false)
    // df.createOrReplaceTempView("trip_points")

    // ////////////////////////////////////////////////////////
    // // Bounding Box
    // ////////////////////////////////////////////////////////
    // df = spark.sql("""
    //   SELECT min(lat) as min_lat,
    //          max(lat) as max_lat,
    //          min(lon) as min_lon,
    //          max(lon) as max_lon
    //   FROM traverse_data
    // """)
    // // df.show(false)
    // df.createOrReplaceTempView("mission_region")

    // ////////////////////////////////////////////////////////
    // // Example Histogram Regions
    // ////////////////////////////////////////////////////////
    // df = spark.sql("""
    //   SELECT id, ST_PolygonFromEnvelope(lon_low, lat_low, lon_high, lat_high) as geometry
    //   FROM (
    //     SELECT
    //       10 * lon_idx + lat_idx AS id,
    //       (max_lat - min_lat)/10 * lat_idx + min_lat AS lat_low,
    //       (max_lat - min_lat)/10 * (lat_idx+1) + min_lat AS lat_high,
    //       (max_lon - min_lon)/10 * lon_idx + min_lon AS lon_low,
    //       (max_lon - min_lon)/10 * (lon_idx+1) + min_lon AS lon_high
    //     FROM (SELECT id AS lon_idx from range(0,10)) AS lon_split,
    //          (SELECT id AS lat_idx from range(0,10)) AS lat_split,
    //          mission_region
    //   )
    // """)
    // // df.show(false)
    // df.createOrReplaceTempView("bounding_boxes")

    // ////////////////////////////////////////////////////////
    // // Per-region distributions
    // ////////////////////////////////////////////////////////
    // df = spark.sql("""
    //   SELECT box.id,
    //          array_agg(m_per_sol) as components,
    //          discretize(
    //            uniform_mixture(m_per_sol),
    //            array(0.0, 40.0, 80.0, 120.0, 160.0, 200.0, 240.0, 280.0, 320.0, 360.0, 400.0),
    //            1000
    //          ) as m_per_sol
    //   FROM trip_points point,
    //        bounding_boxes box
    //   WHERE ST_Contains(box.geometry, point.geometry)
    //   GROUP BY box.id
    // """)
    // // df.show(false)
    // df.createOrReplaceTempView("grid_squares")

    // ////////////////////////////////////////////////////////
    // // Per-region metrics
    // ////////////////////////////////////////////////////////
    // df = spark.sql("""
    //   SELECT id,
    //          m_per_sol,
    //          entropy(m_per_sol) as entropy,
    //          array_max(
    //            transform(
    //              components,
    //              x -> kl_divergence(x, m_per_sol)
    //            )
    //          ) as max_kl_div
    //   -- components
    //   FROM grid_squares
    //   -- LIMIT 1
    // """)
    // df.show(false)
  }

}
|
|
|