Merge branch 'master' of gitlab.odin.cse.buffalo.edu:odin-lab/Website

# Conflicts:
#	src/teaching/cse-562/2017sp/index.erb
#	src/teaching/cse-562/2017sp/slides/2017-02-09-Indexes.pdf
This commit is contained in:
Oliver Kennedy 2017-02-20 11:07:24 -05:00
commit 5aca95c223
16 changed files with 779 additions and 56 deletions

View file

@ -1,3 +1,4 @@
require "rubygems"
$:.push "lib"
require "gemsmith.rb"
require "jdb.rb"
@ -5,6 +6,7 @@ require "lab_metadata.rb"
require "util.rb"
require "cv.rb"
require "nsfcp.rb"
require "nsfconflicts.rb"
require "bootstrap_markdown.rb"
include GemSmith
@ -179,4 +181,10 @@ end
file "okennedy_short.pdf" => "okennedy_short.tex" do
system("pdflatex okennedy_short.tex")
end
file "artifacts/okennedy_collaborators.tsv" => $db.files do
NSFConflicts.new("Oliver Kennedy", $db).
render("artifacts/okennedy_collaborators.tsv")
end
task :collab => "artifacts/okennedy_collaborators.tsv"

View file

@ -1,4 +1,32 @@
[
{ "title" : "INgestion, Transformation, Alignment, Association and Curation Technologies (INTACT)",
"agency" : "DARPA",
"role" : "PI",
"amount" : 523700,
"effort" : "50%",
"status" : "submitted",
"start" : "08/2017", "end" : "07/2021",
"type" : "grant",
"copis" : ["Barry Smith"],
"commitment" : { "summer" : 1 },
"collaborative" : [
{ "institution" : "CUBRC",
"pis" : ["Justin Del Vecchio", "Moises Sudit", "Greg Tauer", "Ron Rudnicki"],
"amount" : 3460467
}
]
},
{ "title" : "Strategic Planning for Waste Management in Smart and Sustainable Cities: The Promise of Using Product Lifecycle Data",
"agency" : "NSF: S&CC",
"role" : "CoPI",
"amount" : 98214,
"effort" : "1%",
"status" : "submitted",
"start" : "08/2017", "end" : "08/2018",
"type" : "grant",
"copis" : ["Kemper Lewis", "Debabrata Talukdar", "Sara Behdad"],
"commitment" : { "academic" : 0 }
},
{ "title" : "CIF21 DIBBs: EI: Vizier, Streamlined Data Curation",
"agency" : "NSF: ACI: DIBBS",
"role" : "PI",

View file

@ -45,8 +45,10 @@
"ubit" : "wmspoth"
},
"Jon Logan" : {
"status" : "PhD",
"projects" : ["mimir"]
"status" : "PhD",
"projects" : ["mimir"],
"email" : "jonathan.logan@cubrc.org",
"ubit" : "jmlogan"
},
"Aaron Huber" : {
"status" : "PhD",
@ -63,7 +65,8 @@
"status" : "PhD",
"projects" : ["astral"],
"advisor" : ["Luke Ziarek"],
"joint_advisor": true
"joint_advisor": true,
"ubit" : "sauravsi"
},
"Grant Wrazen" : {
"status" : "BS",

View file

@ -1,19 +1,196 @@
{
"Barry Smith" : {
"institution" : "University at Buffalo",
"updated" : "2017",
"email" : "phismith@buffalo.edu"
},
"Debabrata Talukdar" : {
"institution" : "University at Buffalo",
"updated" : "2017",
"email" : "dtalukda@buffalo.edu"
},
"Greg Tauer" : {
"institution" : "CUBRC",
"updated" : "2017",
"email" : "tauer@cubrc.org"
},
"Justin Del Vecchio" : {
"institution" : "CUBRC",
"updated" : "2017",
"email" : "delvecchio@cubrc.org"
},
"Kemper Lewis" : {
"institution" : "University at Buffalo",
"updated" : "2017",
"email" : "kelewis@buffalo.edu"
},
"Ron Rudnicki" : {
"institution" : "CUBRC",
"updated" : "2017",
"email" : "rudnicki@cubrc.org"
},
"Sara Behdad" : {
"institution" : "University at Buffalo",
"updated" : "2017",
"email" : "sarabehd@buffalo.edu"
},
"Beda Hammerschmidt" : {
"institution" : "Oracle",
"updated" : "2017",
"email" : "beda.hammerschmidt@oracle.com"
},
"Carmela Troncoso" : {
"institution" : "IMDEA Software Institute",
"updated" : "2017",
"email" : "carmela.troncoso@imdea.org"
},
"D. Richard Hipp" : {
"institution" : "Hipp, Wyrick & Company, Inc.",
"updated" : "2017",
"email" : "drh@sqlite.org"
},
"Daniel Williams" : {
"institution" : "Unknown Employer",
"updated" : "2017"
},
"Eduardo Lage-Otero" : {
"institution" : "Yale-NUS College",
"updated" : "2017"
},
"Emin Gun Sirer" : {
"institution" : "Cornell University",
"updated" : "2017"
},
"Eric S. Chan" : {
"institution" : "Oracle",
"updated" : "2017",
"email" : "eric.s.chan@oracle.com"
},
"Eugene Wu" : {
"institution" : "Columbia University",
"updated" : "2017",
"email" : "ew2493@columbia.edu"
},
"Fred Schneider" : {
"institution" : "Cornell University",
"updated" : "2017"
},
"Gary Shawver" : {
"institution" : "New York University",
"updated" : "2017"
},
"Heiko Mueller" : {
"institution" : "New York University",
"updated" : "2017",
"email" : "heiko.mueller@nyu.edu"
},
"Jianchang Mao" : {
"institution" : "Unknown Employer",
"updated" : "2017"
},
"Juliana Freire" : {
"institution" : "New York University",
"updated" : "2017",
"email" : "juliana.freire@nyu.edu"
},
"Kevin Walsh" : {
"institution" : "Unknown Employer",
"updated" : "2017"
},
"Moises Sudit" : {
"institution" : "University at Buffalo / CUBRC",
"updated" : "2017",
"email" : "sudit@cubrc.org"
},
"Patrick Coonan" : {
"institution" : "University at Buffalo",
"updated" : "2017",
"email" : "pcoonan@buffalo.edu"
},
"Patrick Reynolds" : {
"institution" : "LinkedIn",
"updated" : "2017"
},
"Said Achmiz" : {
"institution" : "Self-Employed",
"updated" : "2017",
"email" : "achmizs@gmail.com"
},
"Seokki Lee" : {
"institution" : "Illinois Institute of Technology",
"updated" : "2017",
"email" : "slee195@hawk.iit.edu"
},
"Stratos Idreos" : {
"institution" : "Harvard University",
"updated" : "2017",
"email" : "stratos@seas.harvard.edu"
},
"Vasudha Krishnaswamy" : {
"institution" : "Oracle",
"updated" : "2017",
"email" : "vasudha.krishnaswamy@oracle.com"
},
"Wolfgang Gatterbauer" : {
"institution" : "Carnegie Mellon University",
"updated" : "2017",
"email" : "gatt@cmu.edu"
},
"Xing Niu" : {
"institution" : "Illinois Institute of Technology",
"updated" : "2017",
"email" : "xniu7@hawk.iit.edu"
},
"Zhen Hua Liu" : {
"canonical" : "Zhen Hua-Liu"
},
"Arnab Nandi" : {
"institution" : "Ohio State University",
"updated" : 2017,
"email" : "arnab@arnab.org"
},
"Amélie Marian" : {
"institution" : "Rutgers University",
"updated" : 2017,
"email" : "amelie@cs.rutgers.edu"
},
"Alan Shieh" : {
"institution" : "VMware",
"updated" : 2011
},
"Adel Ghoneimy" : {
"institution" : "Oracle",
"updated" : 2017,
"email" : "adel.ghoneimy@oracle.com"
},
"Bahareh Arab" : {
"canonical" : "Bahareh Sadat Arab",
"updated" : 2017
},
"Bahareh Sadat Arab" : {
"institution" : "Illinois Institute of Technology",
"updated" : 2017,
"email" : "barab@hawk.iit.edu"
},
"Xuanlong Nguyen" : {
"institution" : "University of Michigan",
"updated" : 2016
"updated" : 2016,
"email" : "xuanlong@umich.edu"
},
"Hung Ngo" : {
"institution" : "University at Buffalo",
"updated" : 2016
"institution" : "LogicBlox",
"updated" : 2017,
"email" : "hung.q.ngo@gmail.com"
},
"Shambhu Upadhyaya" : {
"institution" : "University at Buffalo",
"updated" : 2016
"updated" : 2016,
"email" : "shambhu@buffalo.edu"
},
"Varun Chandola" : {
"institution" : "University at Buffalo",
"updated" : 2016
"updated" : 2016,
"email" : "chandola@buffalo.edu"
},
"Vinayak Karuppasamy" : {
"institution" : "Bloomberg",
@ -25,7 +202,8 @@
},
"Boris Glavic" : {
"institution" : "Illinois Inst. Tech.",
"updated" : 2016
"updated" : 2016,
"email" : "bglavic@iit.edu"
},
"Oliver Kennedy" : {
"institution" : "University at Buffalo",
@ -41,7 +219,8 @@
},
"Anandatirtha Nandugudi" : {
"institution" : "University at Buffalo",
"updated" : 2014
"updated" : 2014,
"email" : "ans25@buffalo.edu"
},
"Andres Nötzli" : {
"institution" : "Stanford",
@ -53,7 +232,8 @@
},
"Anudipa Maiti" : {
"institution" : "University at Buffalo",
"updated" : 2014
"updated" : 2014,
"email" : "anudipam@buffalo.edu"
},
"Charles Loboz" : {
"institution" : "Microsoft Corp.",
@ -61,6 +241,7 @@
},
"Christoph Koch" : {
"institution" : "EPFL",
"email" : "christoph.koch@epfl.ch",
"updated" : 2014
},
"Daniel Bellinger" : {
@ -73,7 +254,8 @@
},
"Dieter Gawlick" : {
"institution" : "Oracle",
"updated" : 2014
"updated" : 2014,
"email" : "dieter.gawlick@oracle.com"
},
"Eric Vee" : {
"institution" : "Yahoo!",
@ -85,15 +267,18 @@
},
"Geoffrey Challen" : {
"institution" : "University at Buffalo",
"updated" : 2014
"updated" : 2014,
"email" : "challen@buffalo.edu"
},
"Guru Prasad Srinivasa" : {
"institution" : "University at Buffalo",
"updated" : 2014
"updated" : 2014,
"email" : "gurupras@buffalo.edu"
},
"Jan Chomicki" : {
"institution" : "University at Buffalo",
"updated" : 2014
"updated" : 2014,
"email" : "chomicki@buffalo.edu"
},
"Jayavel Shanmugasundaram" : {
"institution" : "Google",
@ -101,7 +286,8 @@
},
"Jerry Antony Ajay" : {
"institution" : "University at Buffalo",
"updated" : 2014
"updated" : 2014,
"email" : "jerryant@buffalo.edu"
},
"Jian Yang" : {
"institution" : "Yahoo!",
@ -109,7 +295,8 @@
},
"Jinghao Shi" : {
"institution" : "University at Buffalo",
"updated" : 2014
"updated" : 2014,
"email" : "jinghaos@buffalo.edu"
},
"John Tomlin" : {
"institution" : "Yahoo!",
@ -121,19 +308,22 @@
},
"Lukasz Ziarek" : {
"institution" : "University at Buffalo",
"updated" : 2014
"updated" : 2014,
"email" : "lziarek@buffalo.edu"
},
"Milos Nicolic" : {
"institution" : "EPFL",
"updated" : 2014
},
"Nick DiRienzo" : {
"institution" : "University at Buffalo",
"updated" : 2014
"institution" : "Unknown Employer",
"updated" : 2014,
"email" : "nvdirienzo@gmail.com"
},
"Ronny Fehling" : {
"institution" : "Oracle",
"updated" : 2014
"institution" : "Airbus",
"updated" : 2017,
"email" : "ronny.fehling@airbus.com"
},
"Sergei Vassilvitskii" : {
"institution" : "Google",
@ -153,7 +343,8 @@
},
"Sriram Shantharam" : {
"institution" : "University at Buffalo",
"updated" : 2014
"updated" : 2014,
"email" : "sriramsh@buffalo.edu"
},
"Steve Lee" : {
"institution" : "Microsoft Corp.",
@ -181,6 +372,7 @@
},
"Zhen Hua-Liu" : {
"institution" : "Oracle",
"updated" : 2014
"updated" : 2014,
"email" : "zhen.liu@oracle.com"
}
}

View file

@ -4,7 +4,7 @@
"authors" : [
"Oliver Kennedy",
"D. Richard Hipp",
"Stratos Idreosh",
"Stratos Idreos",
"Amélie Marian",
"Arnab Nandi",
"Carmela Troncoso",

View file

@ -56,4 +56,8 @@ class JDB
end
[data, files]
end
# Top-level keys of the loaded database (one entry per registered
# data source).
# NOTE(review): assumes @data is a Hash populated elsewhere in JDB.
def keys
@data.keys
end
end

156
lib/nsfconflicts.rb Normal file
View file

@ -0,0 +1,156 @@
require "rubygems"
require "prawn"
require "prawn/measurement_extensions"
require "util.rb"
# Generates the NSF "Collaborators and Other Affiliations" (COA) tables
# for one person, drawing on the lab's JSON database (JDB).
#
# NOTE(review): this class relies on project-local Array extensions from
# util.rb (`where`, and apparently a block-less `reduce` that groups
# [key, value] pairs, plus `unzip`) -- confirm those helpers exist before
# refactoring the pipeline below.
class NSFConflicts
  include Text

  ## The NSF considers collaborators from within the past 48 months
  ## Be conservative and add a few extra days in
  ## Track this in seconds (because time deltas are given in seconds)
  ## (was *2 == 24 months, which contradicted the 48-month rule above)
  @@time_to_consider_collabs = 60*60*24*365.25*4

  # me::   canonical name of the person the report is for
  # data:: the JDB database instance
  def initialize(me, data)
    @me = me
    @data = data
    # Merge the generic people directory with lab members and alumni,
    # synthesizing UB institution/email defaults where records lack them.
    @meta =
      @data["people"].
      merge(
        @data["lab/members"].
          map { |k,m|
            [k, {
              "institution" => "University at Buffalo",
              "email" => "#{m["ubit"]}@buffalo.edu"
            }.merge(m)]
          }.to_h
      ).merge(
        @data["lab/alumni"].
          map { |k,m|
            [k, {
              "name" => k,
              "institution" => m["company"],
              "email" => "#{m["ubit"]}@buffalo.edu"
            }.merge(m)]
          }.to_h
      )
    @shortname = full_details(@me)["ubit"]
  end

  # Co-PIs on submitted/accepted/completed grants still within the
  # consideration window, as [name, last_active_year] pairs.
  def grant_co_pis
    # Use @shortname rather than a hard-coded "okennedy" path, matching
    # the lookup convention used by #advisors below.
    @data["cv/#{@shortname}/grants"].
      where { |grant|
        case grant["status"]
        when "accepted", "submitted", "completed" then true
        when "rejected", "retired" then false
        else raise "Unknown grant status #{grant["status"]} for #{grant["title"]}"
        end
      }.
      where { |grant| Time.now - to_date(grant["end"]) < @@time_to_consider_collabs }.
      map do |grant|
        co_pis = grant.fetch("copis", [])
        grant.fetch("collaborative", []).each do |inst|
          co_pis = co_pis + inst["pis"]
        end
        # Grants that end in the future count as active "now".
        last_active = to_date(grant["end"])
        last_active = Time.now if last_active > Time.now
        co_pis.flatten.map { |name| [name, last_active.year] }
      end.
      flatten(1)
  end

  # Co-authors on publications within the window, as [name, year] pairs.
  def paper_coauthors
    start_year = (Time.now - @@time_to_consider_collabs).year
    @data["publications"].
      where { |paper| paper["authors"].include? @me }.
      where { |paper| paper["year"].to_i >= start_year }.
      map { |paper|
        paper["authors"].
          where { |name| name != @me }.
          map { |name| [name, paper["year"].to_i] }
      }.
      flatten(1)
  end

  # PhD advisors (lifetime conflicts under NSF rules).
  def advisors
    @data["cv/#{@shortname}/education"].
      where { |inst| inst["degree"] == "PhD" }.
      map { |inst| inst["Advisor"] }
  end

  # Current PhD advisees (lifetime conflicts).
  def phd_students
    @data["lab/members"].
      values.
      where { |person| person["status"] == "PhD" }.
      map { |person| person["name"] }
  end

  # Graduated PhD advisees (lifetime conflicts).
  def phd_alumni
    @data["lab/alumni"].
      values.
      where { |person| person["degree"] == "PhD" }.
      map { |person| person["name"] }
  end

  # Resolve name aliases (e.g. "Zhen Hua Liu" -> "Zhen Hua-Liu").
  def canonicalize(person)
    @meta[person].fetch("canonical", person)
  end

  # Metadata record for a person (or each person in an Array).
  # Raises if the person is missing from /db/people.json and lab rosters.
  def full_details(person)
    return person.map { |x| full_details(x) } if person.is_a? Array
    meta = @meta[person] or raise "Unknown Person: #{person}"
    {
      "name" => person,
      "name_parts" => split_name(person)
    }.merge(meta)
  end

  # "Last, First" rendering for the TSV output.
  def render_name(person)
    person["name_parts"].reverse.join(", ")
  end

  # Write the COA tables (B: advisors/advisees, C: collaborators) as
  # tab-separated text to +target+. Prints a fill-in template to stdout
  # for any collaborator missing from the metadata.
  def render(target)
    all_collabs = (
      (grant_co_pis + paper_coauthors).map { |x| x[0] } +
      (phd_students + phd_alumni + advisors)
    ).uniq.sort
    missing_collabs =
      all_collabs.select { |person| not (@meta.has_key? person) }
    unless missing_collabs.empty?
      puts("You are missing collaborator details. Here's a template to add to /db/people.json:")
      missing_collabs.each do |collab|
        puts "  \"#{collab}\" : {"
        puts "    \"institution\" : \"\","
        puts "    \"updated\" : \"#{Time.now.year}\""
        puts "  },"
      end
    end
    File.open(target, "w+") do |f|
      f.puts("==== TABLE B ====")
      full_details(advisors).each do |adv|
        f.puts "G:\t#{render_name(adv)}\t#{adv["institution"]}\t#{adv["email"]}"
      end
      full_details((phd_students + phd_alumni).uniq.sort).each do |stud|
        f.puts "T:\t#{render_name(stud)}\t#{stud["institution"]}\t#{stud["email"]}"
      end
      f.puts("==== TABLE C ====")
      all_collabs = (
        grant_co_pis.map { |name, y| [canonicalize(name), [:grant, y]] } +
        paper_coauthors.map { |name, y| [canonicalize(name), [:pub, y]] }
      )
      # NOTE(review): block-less `reduce` here must be the util.rb
      # group-by-key extension (stdlib Enumerable#reduce would raise);
      # `unzip` likewise appears to be a util.rb helper -- verify.
      all_collabs.reduce.to_a.
        each { |name, collabs|
          categories, years = collabs.unzip
          # "A:" marks co-authors; grant-only collaborators get "C:".
          category = "C:"
          category = "A:" if(categories.include? :pub)
          details = full_details(name)
          f.puts("#{category}\t#{render_name(details)}\t#{details["institution"]}\t#{details["email"]}\t#{years.max}")
        }
    end
  end
end

View file

@ -1,26 +1,7 @@
require "rubygems"
require "prawn"
require "prawn/measurement_extensions"
class Array
def symbolize_json
map { |v| if v.respond_to? :symbolize_json then v.symbolize_json else v end }
end
end
class Hash
def symbolize_json
self.map do |k,v|
v = v.symbolize_json if v.respond_to? :symbolize_json
case k
when String then [k.to_sym, v]
else [k, v]
end
end.to_h
end
end
require "util.rb"
class NSFCnP
@@box_heights = [ 8.66.in, 7.085.in, 5.52.in, 3.97.in, 2.42.in ]
@ -30,7 +11,7 @@ class NSFCnP
@me = me
@grants = grants.symbolize_json
end
def render(target)
grants = @grants
pi_name = @me

View file

@ -140,4 +140,9 @@ module Text
raise "Unknown Country '#{addr["country"]}' in Address: #{addr}"
end
end
# Split a full name into a two-element array: [first name, rest of name].
# Multiple middle/last components are re-joined with single spaces;
# a single-token name yields ["Name", ""].
def split_name(name)
  parts = name.split(/ +/)
  [parts.first, parts.drop(1).join(" ")]
end
end

View file

@ -316,7 +316,7 @@ class Array
end.flatten(1)
end
def merge(other, args = {})
def my_join(other, args = {})
if args.has_key?(:eq)
args[:eq] = [args[:eq], args[:eq]] unless args[:eq].is_a? Array
a, b = args[:eq]
@ -335,6 +335,11 @@ class Array
# SQL-flavored filter: keeps elements for which the block is truthy.
# NOTE(review): the map/compact implementation also silently drops
# elements that are themselves nil, even when the block accepts them --
# a subtle difference from Enumerable#select. Callers may rely on this.
def where
map {|i| i if yield i }.compact
end
# Recursively convert String keys to Symbols inside any nested JSON-style
# structures held by this array; elements that don't support conversion
# pass through unchanged.
def symbolize_json
  map { |element| element.respond_to?(:symbolize_json) ? element.symbolize_json : element }
end
end
class Hash
@ -409,6 +414,17 @@ class Hash
end
}.flatten(1).to_h
end
# Rebuild this hash with String keys converted to Symbols; values that
# support symbolize_json (nested hashes/arrays) are converted recursively.
# Non-String keys are preserved as-is.
def symbolize_json
  self.map do |key, value|
    value = value.symbolize_json if value.respond_to?(:symbolize_json)
    if key.is_a? String
      [key.to_sym, value]
    else
      [key, value]
    end
  end.to_h
end
end
class Float

View file

@ -0,0 +1,61 @@
---
title: CIDR Recap
projects:
- mimir
author: William Spoth
---
How big is BIG and how fast is FAST? This seemed to be a recurring theme of
the CIDR 2017 conference. A general consensus and major point of many
presentations is that RDBMS used to be the king of scaling to large data twenty
years ago but for some inexplicable reason has become lost to the ever changing
scope of BIG and FAST. Multiple papers attempted to address this problem in
different ways and added to multiple different tools on the market for data
stream processing and large calculations such as SPARK but there seemed to be
no silver bullet. To add to the theme that big data is too big, there were
keynote talks given by Emily Galt and Sam Madden that drove this point home and
gave different real work scenarios and outlooks on this problem.
To break this theme apart I'll split the papers into groups and explain the
different outlooks the authors took and how they addressed this common problem.
The papers, Prioritizing Attention in Analytic Monitoring, The Myria Big Data
Management and Analytics System and Cloud Services, Weld: A Common Runtime for
High Performance Data Analysis, A Database System with Amnesia, and Releasing
Cloud Databases for the Chains of Performance Prediction Models, were focused
on the theme that databases are not keeping pace with the rate that data is
growing. Sam Madden brought up an interesting point that the hardware
components like the bus are not the bottle neck in this system. With advances
in big data computing like apache spark, it feels like RDBMS are the end of the
line where data goes to die. These papers looked at different ways of
addressing this, A Database System with Amnesia looked at throwing out unused
data since most data in RDBMS gets put in and never used again and with the
increasing use of data streams the problem of not being able to process and
store this data fast enough becomes amplified.
The second common ground problem is even if you can efficiently store and
perform queries over your data lakes, humans often lack the ability to
efficiently create queries or have the necessary insight into how the data is
formatted. The papers, The Data Civilizer System, Establishing Common Ground
with Data Context, Adaptive Schema Databases, Combining Design, and Performance
in a Data Visualization Management System, all try to address this problem but
from slightly different angles. The data civilizer system and adaptive
databases look at aiding an analyst in schema and table exploration and to help
an analyst discover unknown or desired qualities about their data source. These
papers approach user insight in a way that would otherwise exist as internal
middleware in large companies, the problem is that big data and messy data
lakes are becoming more and more prevalent for other users. Medium sized
businesses can be buried in data following user surges or new product upgrades,
government agencies can have large amounts of uncleaned sensor and user
submitted data that they do not have the abilities or tools to manage.
To me a large take away from this conference was databases need a better way to
handle big data. Databases are the hero big data needs AND the one it deserves.
To achieve these goals databases are going to need to relax the constraints on
rigid schemas and perfect data, which opens up a large amount of research
opportunities and the realization that there might not currently be a right
answer to this problem. Either way it should be interesting to see what
sacrifices RDBMS make to compete with the growing amount of data and if they
are able to apply decades worth of research to this hot field that is looking
for an answer.

View file

@ -30,6 +30,23 @@ schedule:
other domains. Prior to Duke, she did a postdoc at the University of
Washington, and obtained her Ph.D. from the University of Pennsylvania.
She is a recipient of the NSF CAREER award and a Google PhD Fellowship.
- when: March 9; Time TBD
what: Software Synthesis for Networks
who: Hossein Hojjat
where: Location TBD
details:
abstract: |
Software synthesis is a powerful technique that can dramatically increase the productivity of programmers by automating the construction of complex programs.
One area where synthesis seems particularly promising is in computer networks.
Although Software-Defined Networking (SDN) makes it possible to build rich applications in software, programmers today are forced to deal with numerous low-level details such as encoding high-level policies using low-level hardware primitives, processing asynchronous events and dealing with unexpected failures.
This talk will present highlights from our recent works using synthesis to generate correct-by-construction network programs. In the first part of the talk, I will describe an approach for generating configuration updates that are guaranteed to preserve specified invariants. In the second part of the talk, I will present an extension that supports finer-grained updates triggered by data-plane events. In the last part of the talk, I will describe a repair technique that uses Horn clause models to repair buggy network configurations.
(Joint work with Nate Foster (Cornell University), Pavol Cerny (University of Colorado at Boulder), Jedidiah McClurg (University of Colorado at Boulder), Philipp Ruemmer (Uppsala University))
bio: |
Hossein Hojjat is an assistant professor in the Computer Science department at the Rochester Institute of Technology (RIT). Before joining RIT, he was a postdoctoral researcher at Cornell University. He earned a PhD in Computer Science from EPFL in 2013. His research interests center on program synthesis and computer-aided verification.
- when: April 6; Time TBD
what: Title TBD
who: Amol Deshpande
where: Location TBD
---
<p>The UBDB seminar meets on Mondays at TBD, typically in TBD. Subscribe to <a href="https://listserv.buffalo.edu/cgi-bin/wa?A0=cse-database-list">cse-database-list</a> for more details.</p>

View file

@ -2,15 +2,263 @@
title: CSE-562; Project 1
---
<ul>
<li><strong>Overview</strong>: Answer Select/Project/Aggregate Queries Efficiently
<li><strong>Deadline</strong>: TBD</li>
<li><strong>Overview</strong>: Answer Select/Project/Aggregate Queries
<li><strong>Deadline</strong>: Friday, March 10</li>
<li><strong>Grade</strong>: 15% of Project Component
<ul>
<li>5% Correctness</li>
<li>5% Efficiency</li>
<li>7% Correctness</li>
<li>3% Efficiency</li>
<li>5% Code Review</li>
</ul>
</li>
</ul>
<div style="color: red">In Progress</div>
<p>In this project, you'll be asked to evaluate a few single-table SQL queries. That said, there's a few ways to pull this off, some better than others. In short, this project exists to lay the foundation for the later two projects. In this writeup, we'll be going through a few design decisions, pointing out tradeoffs, and explaining why a strategy that might seem easier in the short term turns out to be significantly harder later.</p>
<p>Specifically, you'll be given a number of queries in one of the following patterns:
<ol>
<li><tt>SELECT A, B, ... FROM R</tt></li>
<li><tt>SELECT A, B, ... FROM R WHERE ...</tt></li>
<li><tt>SELECT A+B AS C, ... FROM R</tt></li>
<li><tt>SELECT A+B AS C, ... FROM R WHERE ...</tt></li>
<li><tt>SELECT SUM(A+B) AS C, ... FROM R</tt></li>
<li><tt>SELECT SUM(A+B) AS C, ... FROM R WHERE ...</tt></li>
</ol>
Your task is to answer these queries as they arrive.
</p>
<h2>Volcano-Style Computation (Iterators)</h2>
Let's take a look at the script we've used as an example in class.
<div style="text-align:left;color:#000000; background-color:#ffffff; border:solid black 1px; padding:0.5em 1em 0.5em 1em; overflow:auto;font-size:small; font-family:monospace; ">with <span style="color:#5b2a96;">open</span>(<span style="color:#f4181b;">'data.csv'</span>, <span style="color:#f4181b;">'r'</span>) <span style="color:#a71790;"><strong>as</strong></span> f:<br />
&nbsp;&nbsp;<span style="color:#a71790;"><strong>for</strong></span> line <span style="color:#a71790;"><strong>in</strong></span> f:<br />
&nbsp;&nbsp;&nbsp;&nbsp;fields = split(<span style="color:#f4181b;">&quot;,&quot;</span>, line)<br />
&nbsp;&nbsp;&nbsp;&nbsp;<span style="color:#a71790;"><strong>if</strong></span>(fields[<span style="color:#0000ff;">2</span>] != <span style="color:#f4181b;">&quot;Ensign&quot;</span> <span style="color:#a71790;"><strong>and</strong></span> <span style="color:#5b2a96;">int</span>(fields[<span style="color:#0000ff;">3</span>]) &gt; <span style="color:#0000ff;">25</span>):<br />
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<span style="color:#a71790;">print</span>(fields[<span style="color:#0000ff;">1</span>])<br />
</div>
<p>This script is basically a form of pattern 2 above
<pre><code class="sql">SELECT fields[1] FROM 'data.csv'
WHERE fields[2] != "Ensign" AND CAST(fields[3] AS int) > 25
</code></pre>
</p>
<p>Or in other words, any query that follows the pattern...
<pre><code class="sql">SELECT /*targets*/ FROM /*file*/ WHERE /*condition*/</code></pre>
...becomes a script of the form...</p>
<div style="text-align:left;color:#000000; background-color:#ffffff; border:solid black 1px; padding:0.5em 1em 0.5em 1em; overflow:auto;font-size:small; font-family:monospace; ">with <span style="color:#5b2a96;">open</span>(<span style="color:#4444ff;font-weight:bold;">file</span>, <span style="color:#f4181b;">'r'</span>) <span style="color:#a71790;"><strong>as</strong></span> f:<br />
&nbsp;&nbsp;<span style="color:#a71790;"><strong>for</strong></span> line <span style="color:#a71790;"><strong>in</strong></span> f:<br />
&nbsp;&nbsp;&nbsp;&nbsp;fields = split(<span style="color:#f4181b;">&quot;,&quot;</span>, line)<br />
&nbsp;&nbsp;&nbsp;&nbsp;<span style="color:#a71790;"><strong>if</strong></span> <span style="color:#4444ff;font-weight:bold;">condition</span><br />
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<span style="color:#a71790;">print</span>(<span style="color:#4444ff;font-weight:bold;">targets</span>)<br />
</div>
<p>This is nice and simple, but the code is very specific to pattern 2. That's something that will lead us into trouble. To see a simple example of the sort of problems we're going to run into, let's come up with an example of pattern 6:</p>
<pre><code class="sql">SELECT SUM(age) FROM 'data.csv' WHERE rank != 'Ensign'</code></pre>
<p>That is, we're asking for the total age of non-ensigns in our example table. An equivalent script would be...</p>
<div style="text-align:left;color:#000000; background-color:#ffffff; border:solid black 1px; padding:0.5em 1em 0.5em 1em; overflow:auto;font-size:small; font-family:monospace; ">total = <span style="color:#0000ff;">0</span><br />
<br />
with <span style="color:#5b2a96;">open</span>(<span style="color:#f4181b;">'data.csv'</span>, <span style="color:#f4181b;">'r'</span>) <span style="color:#a71790;"><strong>as</strong></span> f:<br />
&nbsp;&nbsp;<span style="color:#a71790;"><strong>for</strong></span> line <span style="color:#a71790;"><strong>in</strong></span> f:<br />
&nbsp;&nbsp;&nbsp;&nbsp;fields = split(<span style="color:#f4181b;">&quot;,&quot;</span>, line)<br />
&nbsp;&nbsp;&nbsp;&nbsp;<span style="color:#a71790;"><strong>if</strong></span> fields[<span style="color:#0000ff;">2</span>] != <span style="color:#f4181b;">'Ensign'</span>:<br />
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;total += <span style="color:#5b2a96;">int</span>(fields[<span style="color:#0000ff;">3</span>])<br />
<br />
<span style="color:#a71790;">print</span>(total) &nbsp;&nbsp;&nbsp;<br />
</div>
<p>There's a pretty significant difference in the flow of the code in this version of the script. For one, there's a new global variable with the <tt>total</tt> age. For another, the print statement is now outside of the for loop. Now let's say we wanted to support both patterns 2 and 6. We'd need to make the code quite a bit more complex:</p>
<div style="text-align:left;color:#000000; background-color:#ffffff; border:solid black 1px; padding:0.5em 1em 0.5em 1em; overflow:auto;font-size:small; font-family:monospace; "><span style="color:#a71790;"><strong>if</strong></span> query_pattern == <span style="color:#0000ff;">2</span>:<br />
&nbsp;&nbsp;with <span style="color:#5b2a96;">open</span>(<span style="color:#5b2a96;">file</span>, <span style="color:#f4181b;">'r'</span>) <span style="color:#a71790;"><strong>as</strong></span> f:<br />
&nbsp;&nbsp;&nbsp;&nbsp;<span style="color:#a71790;"><strong>for</strong></span> line <span style="color:#a71790;"><strong>in</strong></span> f:<br />
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;fields = split(<span style="color:#f4181b;">&quot;,&quot;</span>, line)<br />
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<span style="color:#a71790;"><strong>if</strong></span> condition:<br />
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<span style="color:#a71790;">print</span> target<br />
<span style="color:#a71790;"><strong>elif</strong></span> query_pattern == <span style="color:#0000ff;">6</span>:<br />
&nbsp;&nbsp;total = <span style="color:#0000ff;">0</span><br />
&nbsp;&nbsp;with <span style="color:#5b2a96;">open</span>(<span style="color:#5b2a96;">file</span>, <span style="color:#f4181b;">'r'</span>) <span style="color:#a71790;"><strong>as</strong></span> f:<br />
&nbsp;&nbsp;&nbsp;&nbsp;<span style="color:#a71790;"><strong>for</strong></span> line <span style="color:#a71790;"><strong>in</strong></span> f:<br />
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;fields = split(<span style="color:#f4181b;">&quot;,&quot;</span>, line)<br />
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<span style="color:#a71790;"><strong>if</strong></span> condition:<br />
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;total += expression<br />
&nbsp;&nbsp;<span style="color:#a71790;">print</span>(total) &nbsp;&nbsp;&nbsp;<br />
</div>
<p>As you can see, this code is already quite a bit more complex and we haven't even looked at patterns 1, 3, 4, or 5 yet or the even more complex queries that will show up in Checkpoint 2 and 3.</p>
<p>There are a number of workflow steps that appear in more than one pattern. For example:
<ol>
<li>Loading the CSV file in as data</li>
<li>Filtering rows out of data</li>
<li>Transforming (mapping) data into a new structure</li>
<li>Summarizing (aggregating) data</li>
<li>Printing output data</li>
</ol>
Most of these steps do something with <b>data</b>, so let's be a little more precise with respect to what we mean there. (1) When a CSV file is loaded, it's a sequence of rows and attributes. (2) Filtering doesn't change the structure: it's still rows and attributes. (3) Transforming (picking out specific columns) does change the structure, but at the end of the day we're still working with rows and attributes (or in the case of this script, just one attribute). (4) Aggregation does change the structure... it produces one row with one attribute, but it's not hard to pretend that it still fits into the same model of rows and attributes. </p>
<p>In short, this idea of rows and attributes is pretty fundamental, so let's use it. We're going to work with data expressed in terms of <tt>table</tt>s: or collections of rows and attributes. This allows us to abstract out each of those workflow steps from before into a set of functions:
<ol style="font-family: Courier;font-size: 10pt;">
<li>read_table(filename) -> table</li>
<li>filter_table(table, condition) -> table</li>
<li>map_table(table, rules) -> table</li>
<li>aggregate_table(table, aggregates) -> table</li>
<li>print_table(table)</li>
</ol>
Now the script gets quite a bit simpler (if not shorter), and a lot easier to factor further:</p>
<div style="text-align:left;color:#000000; background-color:#ffffff; border:solid black 1px; padding:0.5em 1em 0.5em 1em; overflow:auto;font-size:small; font-family:monospace; "><span style="color:#a71790;"><strong>if</strong></span> query_pattern == <span style="color:#0000ff;">2</span>:<br />
&nbsp;&nbsp;table = read_table(<span style="color:#f4181b;">'data.csv'</span>)<br />
&nbsp;&nbsp;table = filter_table(table, condition)<br />
&nbsp;&nbsp;table = transform_table(table, target)<br />
&nbsp;&nbsp;print_table(table)<br />
<span style="color:#a71790;"><strong>elif</strong></span> query_pattern == <span style="color:#0000ff;">6</span>:<br />
&nbsp;&nbsp;table = read_table(<span style="color:#f4181b;">'data.csv'</span>)<br />
&nbsp;&nbsp;table = filter_table(table, condition)<br />
&nbsp;&nbsp;table = aggregate_table(table, target)<br />
&nbsp;&nbsp;print_table(table)<br />
</div>
<p><b>But we still have a problem.</b> These <tt>table</tt> objects are going to be as big as the data they represent... they can get super large. That's a massive drawback compared to our initial script design, which has constant-space usage. So what else can we do?</p>
<p>Let's look at why the original script uses constant-space. We load one record in upfront (that's constant space). We decide whether the record is useful to us (still constant space). Whether or not we print it, by the time we get to the next record, we're done with the current row and can safely discard it. Can we recover the same sort of property?</p>
<p>For this checkpoint, it turns out that we can. If you've used java, you're probably familiar with the <a href="http://docs.oracle.com/javase/8/docs/api/java/util/Iterator.html">Iterator</a> interface. An iterator lets you access elements of a collection without needing to have all of those elements available at once. That is, you define two methods:
<dl>
<dt style="font-family: Courier">hasNext()</dt>
<dd>Returns true if there are any more rows to read</dd>
<dt style="font-family: Courier">next()</dt>
<dd>Returns exactly one row. (the next row in the list)</dd>
</dl>
Because the iterator eventually returns each row of the table, it behaves sort of like a <tt>table</tt> object, but because it only returns one row at a time it doesn't strictly need all of the data to be in memory at once. Moreover, you can define one iterator in terms of another. For example, you might define a filtering iterator that takes a source iterator as part of its constructor, and every time you call <tt>next()</tt>, keeps calling <tt>source.next()</tt> until it finds a row that satisfies the where clause.</p>
<p>In short, iterators give you <i>composability</i> and <i>low memory use</i>. The first property is important for your sanity, while the latter property is important for your performance.</p>
<h2>Data Representation</h2>
<p>When it comes to figuring out how to represent one row of data, you have two questions to answer: (1) How do I represent a single primitive value, and (2) How do I represent an entire row of primitive values. </p>
<p>For the first question, there are two practical choices: Either as raw strings (taken directly from the CSV file) or parsed into <a href="http://doc.odin.cse.buffalo.edu/jsqlparser/net/sf/jsqlparser/expression/PrimitiveValue.html">PrimitiveValue</a> objects. PrimitiveValue is an interface implemented by several classes that represent specific types of values, for example <a href="http://doc.odin.cse.buffalo.edu/jsqlparser/net/sf/jsqlparser/expression/LongValue.html">longs</a>, <a href="http://doc.odin.cse.buffalo.edu/jsqlparser/net/sf/jsqlparser/expression/DateValue.html">dates</a>, and others. Because EvalLib (a library that I'll describe shortly) uses PrimitiveValues internally, most students find that it is easier to write code that performs well if you use PrimitiveValue.</p>
<p>For the second question, I strongly encourage the use of Java arrays. There are a few options, including ArrayLists, Vectors, Maps, and other structures. Java arrays outperform them all pretty drastically.</p>
<h2>EvalLib</h2>
<p>The JSqlParser <a href="http://doc.odin.cse.buffalo.edu/jsqlparser/net/sf/jsqlparser/expression/Expression.html">Expression</a> type can represent a whole mess of different arithmetic, boolean, and other primitive-valued expressions. For this project, you'll have a library to help you in evaluating these expressions: <b>EvalLib</b>. Before we get into it, you should note a distinction between two types of expression:
<dl>
<dt><a href="http://doc.odin.cse.buffalo.edu/jsqlparser/net/sf/jsqlparser/expression/Expression.html">Expression</a></dt>
<dd>A generic expression. Can be anything: a comparison, a string, a multiplication, a regular expression match.</dd>
<dt><a href="http://doc.odin.cse.buffalo.edu/jsqlparser/net/sf/jsqlparser/expression/PrimitiveValue.html">PrimitiveValue</a></dt>
<dd>The basic unit of data. Can be a: <a href="http://doc.odin.cse.buffalo.edu/jsqlparser/net/sf/jsqlparser/expression/BooleanValue.html">Boolean</a>, <a href="http://doc.odin.cse.buffalo.edu/jsqlparser/net/sf/jsqlparser/expression/DateValue.html">Date</a>, <a href="http://doc.odin.cse.buffalo.edu/jsqlparser/net/sf/jsqlparser/expression/DoubleValue.html">Double</a>, <a href="http://doc.odin.cse.buffalo.edu/jsqlparser/net/sf/jsqlparser/expression/LongValue.html">Long</a>, <a href="http://doc.odin.cse.buffalo.edu/jsqlparser/net/sf/jsqlparser/expression/NullValue.html">Null</a>, <a href="http://doc.odin.cse.buffalo.edu/jsqlparser/net/sf/jsqlparser/expression/StringValue.html">String</a>, <a href="http://doc.odin.cse.buffalo.edu/jsqlparser/net/sf/jsqlparser/expression/TimestampValue.html">Timestamp</a> or a <a href="http://doc.odin.cse.buffalo.edu/jsqlparser/net/sf/jsqlparser/expression/TimeValue.html">Time</a>. Note that PrimitiveValues are also perfectly legitimate (if somewhat boring) Expressions.</dd>
</dl>
EvalLib includes a single class called <a href="https://github.com/UBOdin/evallib/blob/master/src/main/java/net/sf/jsqlparser/eval/Eval.java">Eval</a> that helps you to resolve Expression objects into PrimitiveValues. Eval is an abstract class, which means you'll need to subclass it to make use of it, but we'll get back to that in a moment. First, let's see a quick example.</p>
<pre><code class="java">
Eval eval = new Eval(){ /* we'll get what goes here shortly */ };
// Evaluate "1 + 2.0"
PrimitiveValue result;
result =
eval.eval(
new Addition(
      new LongValue(1),
      new DoubleValue(2.0)
)
);
System.out.println("Result: "+result); // "Result: 3.0"
// Evaluate "1 > (3.0 * 2)"
result =
eval.eval(
new GreaterThan(
      new LongValue(1),
      new Multiplication(
        new DoubleValue(3.0),
        new LongValue(2)
)
)
);
System.out.println("Result: "+result); // "Result: false"
</code></pre>
<p>In short, eval helps you evaluate the Expression objects that JSQLParser gives you. However, there's one thing it can't do: It has no idea how to convert attribute names to values. That is, there's one type of Expression object that Eval has no clue how to evaluate: <a href="http://doc.odin.cse.buffalo.edu/jsqlparser/net/sf/jsqlparser/schema/Column.html">Column</a>. That is, let's take the following example:</p>
<pre><code class="java">
// Evaluate "R.A >= 5"
result =
eval.eval(
new GreaterThanEquals(
new Column(new Table(null, "R"), "A"),
      new LongValue(5)
)
);
</code></pre>
<p>What value should Eval give to R.A? This depends on the data. Because EvalLib has no way of knowing how you represent your data, you need to tell it:</p>
<pre><code>
Eval eval = new Eval(){
public PrimitiveValue eval(Column c){
/* Figure out what value 'c' has */
}
};
</code></pre>
<h2>Deliverable</h2>
<p>For this checkpoint, you'll be running multiple queries in sequence. This means a few changes to your code. First, before calling <tt>parser.Statement()</tt>, you will need to print a prompt to <tt>System.out</tt>. Use the string
'<tt>$&gt; </tt>' (without quotes), and make sure that it's the very first thing on its own line. This is so that the testing framework knows when your code is ready for the next query.</p>
<h4>Source Data</h4>
<p>Because you are implementing a query evaluator and not a full database engine, there will not be any tables -- at least not in the traditional sense of persistent objects that can be updated and modified. Instead, you will be given a <strong>Table Schema</strong> and a <strong>CSV File</strong> with the instance in it. To keep things simple, we will use the <tt>CREATE TABLE</tt> statement to define a relation's schema. To reiterate, <tt>CREATE TABLE</tt> statements <strong>only appear to give you a schema</strong>. You do not need to allocate any resources for the table in reaction to a <tt>CREATE TABLE</tt> statement -- Simply save the schema that you are given for later use. SQL types (and their corresponding java types) that will be used in this project are as follows:</p>
<table>
<tbody>
<tr>
<th>SQL Type</th>
<th>Java Equivalent</th>
</tr>
<tr>
<td>string</td>
<td>StringValue</td>
</tr>
<tr>
<td>varchar</td>
<td>StringValue</td>
</tr>
<tr>
<td>char</td>
<td>StringValue</td>
</tr>
<tr>
<td>int</td>
<td>LongValue</td>
</tr>
<tr>
<td>decimal</td>
<td>DoubleValue</td>
</tr>
<tr>
<td>date</td>
<td>DateValue</td>
</tr>
</tbody>
</table>
<p>In addition to the schema, you will find a corresponding <tt>[tablename].dat</tt> file in the <tt>data</tt> directory (just like in checkpoint 0). The name of the table corresponds to the table names given in the <tt>CREATE TABLE</tt> statements your code receives. For example, let's say that you see the following statement in your query file:</p>
<pre>CREATE TABLE R(A int, B int, C int);</pre>
<p>That means that the data directory contains a data file called 'R.dat' that might look like this:</p>
<pre>1|1|5
1|2|6
2|3|7</pre>
<p>Each line of text (see <a href="http://docs.oracle.com/javase/8/docs/api/java/io/BufferedReader.html">BufferedReader.readLine()</a>) corresponds to one row of data. Each record is delimited by a vertical pipe '|' character.  Integers and floats are stored in a form recognized by Java's Long.parseLong() and Double.parseDouble() methods. Dates are stored in YYYY-MM-DD form, where YYYY is the 4-digit year, MM is the 2-digit month number, and DD is the 2-digit date. Strings are stored unescaped and unquoted and are guaranteed to contain no vertical pipe characters.</p>
<h4>Grading Workflow</h4>
<p>As before, all .java files in the src directory at the root of your repository will be compiled (and linked against JSQLParser). Also as before, the class <tt>dubstep.Main</tt> will be invoked with no arguments, and a stream of <b>semicolon-delimited</b> queries will be printed to System.in (after you print out a prompt)</p>
<p>For example (<span style="color: red">red</span> text is entered by the user/grader):</p>
<pre>bash&gt; <span style="color: red">ls data</span>
R.dat
S.dat
T.dat
bash&gt; <span style="color: red">cat data/R.dat</span>
1|1|5
1|2|6
2|3|7
bash&gt; <span style="color: red">java -cp build:jsqlparser.jar dubstep.Main -</span>
$> <span style="color: red">CREATE TABLE R(A int, B int, C int);</span>
$> <span style="color: red">SELECT B, C FROM R WHERE A = 1;</span>
1|5
2|6
$> <span style="color: red">SELECT SUM(B), SUM(C) FROM R;</span>
6|18
</pre>
<p>For this project, your code will not be timed, but you will need to answer some queries with a cap on available memory. You will receive up to 7 points for answering queries successfully, up to 3 additional points for remaining within memory usage bounds, and 5 points awarded as part of a code review after the project deadline.</p>

View file

@ -12,16 +12,19 @@ classContent:
notes: slides/2017-02-02-PhysicalLayout.pdf
board: slides/2017-02-02-Board.pdf
csv-script: slides/2017-02-02-CSVExample.zip
video: https://youtu.be/RqooMEr39sM
- date: Feb. 7
topic: Checkpoint 0 Overview
topic: Checkpoint 0 + Indexing
meta:
checkpoint0: slides/2017-02-07-Checkpoint0.html
slides-checkpoint0: slides/2017-02-07-Checkpoint 0.html
notes: slides/2017-02-07-Indexing.pdf
board: slides/2017-02-07-Board.pdf
video: https://youtu.be/APTMjlVQVwc
- date: Feb. 9
topic: Indexes (Overview, Tree Indexes)
meta:
notes: slides/2017-02-09-Indexes.pdf
board: slides/2017-02-09-Board.pdf
video: https://youtu.be/0a622-T4obk
- date: Feb. 14
topic: Indexes (Primary/Secondary, LSM Trees, B+Trees)
meta:
@ -107,7 +110,8 @@ In this course, you will learn...
<li>"<a href="http://www.redbook.io/">The Red Book: Readings in Databases</a>"<br/> ed. Bailis, Hellerstein, and Stonebraker</li>
</ul></li>
<li><strong>Software</strong>: <ul>
<li>JSQLParser (<a href="http://maven.mimirdb.info/info/mimirdb/jsqlparser/1.0.0/jsqlparser-1.0.0.jar">Jar</a> | <a href="http://doc.odin.cse.buffalo.edu/jsqlparser/">JavaDoc</a> | <a href="https://github.com/UBOdin/jsqlparser">Source</a>)
<li>JSQLParser-UB (<a href="http://maven.mimirdb.info/info/mimirdb/jsqlparser/1.0.0/jsqlparser-1.0.0.jar">Jar</a> | <a href="http://doc.odin.cse.buffalo.edu/jsqlparser/">JavaDoc</a> | <a href="https://github.com/UBOdin/jsqlparser">Source</a>)
<li>EvalLib (<a href="http://maven.mimirdb.info/info/mimirdb/evallib/1.0/evallib-1.0.jar">Jar</a> | <a href="https://github.com/UBOdin/evallib">Source</a>)
</ul></li>
<li><strong>Project Submission</strong>:<a href="http://dubstep.odin.cse.buffalo.edu"> http://dubstep.odin.cse.buffalo.edu</a></li>
<li><strong>Project Groups</strong>: 1-3 people</li>