<!doctype html>
<html lang="en">
<head>
  <!-- Document metadata, reveal.js core/theme stylesheets, and the print stylesheet loader. -->
  <meta charset="utf-8">
  <title>Embracing Uncertainty</title>
  <meta name="description" content="Mimir">
  <meta name="author" content="Oliver Kennedy">
  <meta name="apple-mobile-web-app-capable" content="yes">
  <meta name="apple-mobile-web-app-status-bar-style" content="black-translucent">
  <meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no, minimal-ui">
  <link rel="stylesheet" href="../reveal.js-3.5.0/css/reveal.css">
  <link rel="stylesheet" href="ubodin.css" id="theme">
  <!-- Code syntax highlighting -->
  <link rel="stylesheet" href="../reveal.js-3.5.0/lib/css/zenburn.css">
  <!-- Printing and PDF exports -->
  <script>
    // Pick the print stylesheet from the query string: "print-pdf" anywhere in
    // the search part selects pdf.css, anything else falls back to paper.css.
    var link = document.createElement( 'link' );
    link.rel = 'stylesheet';
    link.type = 'text/css';
    link.href = window.location.search.match( /print-pdf/gi ) ? '../reveal.js-3.5.0/css/print/pdf.css' : '../reveal.js-3.5.0/css/print/paper.css';
    document.getElementsByTagName( 'head' )[0].appendChild( link );
  </script>
  <!--[if lt IE 9]>
  <script src="../reveal.js-3.5.0/lib/js/html5shiv.js"></script>
  <![endif]-->
</head>
<body>
<div class="reveal">
<!-- Any section element inside of this container is displayed as a slide -->
<div class="header">
  <!-- Any Talk-Specific Header Content Goes Here -->
  Don't Wrangle, Guess
</div>
<div class="footer">
  <!-- Any Talk-Specific Footer Content Goes Here -->
  <div style="float: left; margin-top: 15px;">
    Exploring <u><b>O</b></u>nline <u><b>D</b></u>ata <u><b>In</b></u>teractions
  </div>
  <!-- NOTE(review): presumably a logo; confirm and replace the empty alt if it carries meaning. -->
  <img src="graphics/FullText-white.png" height="40" style="float: right;" alt="">
</div>
<div class="slides">
<section>
  <!-- Title slide -->
  <h2>Don't Wrangle, Guess Instead</h2>
  <h4>with</h4>
  <img src="graphics/mimir_logo_final.png" alt="Mimir">
</section>
<section>
  <!-- Vertical stack: "A Big Data Fairy Tale" -->
  <section>
    <h3>A Big Data Fairy Tale</h3>
  </section>
  <section>
    <img src="graphics/dagobert83-female-user-icon-800px.png" height="300" alt="Alice">
    <h4>Meet Alice</h4>
    <imagecredits>(OpenClipArt.org)</imagecredits>
  </section>
  <section>
    <img src="graphics/dagobert83-female-user-icon-800px.png" height="300" alt="Alice">
    <img src="graphics/littlestorefront-800px.png" height="300" alt="Storefront">
    <h4>Alice has a Store</h4>
    <imagecredits>(OpenClipArt.org)</imagecredits>
  </section>
  <section>
    <img src="graphics/littlestorefront-800px.png" height="300" style="vertical-align: middle;" alt="Storefront">
    <span style="font-size: 3em; vertical-align: middle;">→</span>
    <img src="graphics/matt-icons_text-x-log-300px.png" height="300" style="vertical-align: middle;" alt="Log file">
    <h4>Alice's store collects sales data</h4>
    <imagecredits>(OpenClipArt.org)</imagecredits>
  </section>
  <section>
    <img src="graphics/dagobert83-female-user-icon-800px.png" height="300" style="vertical-align: middle;" alt="Alice">
    <span style="font-size: 3em; vertical-align: middle;">+</span>
    <img src="graphics/matt-icons_text-x-log-300px.png" height="300" style="vertical-align: middle;" alt="Log file">
    <span style="font-size: 3em; vertical-align: middle;">=</span>
    <!-- NOTE(review): image content not known from here; add descriptive alt text. -->
    <img src="graphics/saco-800px.png" height="300" style="vertical-align: middle;" alt="">
    <h4>Alice wants to use her sales data to run a promotion</h4>
    <imagecredits>(OpenClipArt.org)</imagecredits>
  </section>
  <section>
    <img src="graphics/matt-icons_text-x-log-300px.png" height="300" style="vertical-align: middle;" alt="Log file">
    <span style="font-size: 3em; vertical-align: middle;">→</span>
    <img src="graphics/database-server-800px.png" height="300" style="vertical-align: middle;" alt="Database server">
    <h4>So Alice loads up her sales data in her trusty database/hadoop/spark/etc... server.</h4>
    <imagecredits>(OpenClipArt.org)</imagecredits>
  </section>
  <section>
    <img src="graphics/database-server-800px.png" height="300" style="vertical-align: middle;" alt="Database server">
    <span style="font-size: 3em; vertical-align: middle;">+ ?</span>
    <h4>... asks her question ...</h4>
    <imagecredits>(OpenClipArt.org)</imagecredits>
  </section>
  <section>
    <img src="graphics/database-server-800px.png" height="300" style="vertical-align: middle;" alt="Database server">
    <span style="font-size: 3em; vertical-align: middle;">+ ? →</span>
    <img src="graphics/crystalball-800px.png" height="300" style="vertical-align: middle;" alt="Crystal ball">
    <h4>... and basks in the limitless possibilities of big data.</h4>
    <imagecredits>(OpenClipArt.org)</imagecredits>
  </section>
</section>
<section>
  <!-- Vertical stack: why the fairy tale does not hold -->
  <section>
    <h2>Why is this a fairy tale?</h2>
  </section>
  <section>
    <img src="graphics/matt-icons_text-x-log-300px.png" height="300" style="vertical-align: middle;" alt="Log file">
    <span style="font-size: 3em; vertical-align: middle;">→</span>
    <img src="graphics/database-server-800px.png" height="300" style="vertical-align: middle;" alt="Database server">
    <h4>It's never this easy...</h4>
  </section>
</section>
<section>
  <!-- Vertical stack: three data-loading tasks and their pitfalls -->
  <section>
    <h2>CSV Import</h2>
    <h4>Run a <code>SELECT</code> on a raw CSV File</h4>
    <ul>
      <li>File may not have column headers</li>
      <li>CSV does not provide "types"</li>
      <li>Lines may be missing fields</li>
      <li>Fields may be mistyped (typo, missing comma)</li>
      <li>Comment text can be inlined into the file</li>
    </ul>
    <p>
      <b>State of the art</b>: External Table Defn <span>+ "Manually" edit CSV</span>
    </p>
  </section>
  <section>
    <h2>Merge Data From Two Sources</h2>
    <h4><code>UNION</code> two data sources</h4>
    <ul>
      <li>Schema matching</li>
      <li>Deduplication</li>
      <li>Format alignment (GIS coordinates, $ vs €)</li>
      <li>Precision alignment (State vs County)</li>
    </ul>
    <p>
      <b>State of the art</b>: Manually map schema
    </p>
  </section>
  <section>
    <h2>JSON Shredding</h2>
    <h4>Run a <code>SELECT</code> on JSON or a Doc Store</h4>
    <ul>
      <li>Separating fields and record sets:<br>(e.g., <code>{ A: "Bob", B: "Alice" }</code>)</li>
      <li>Missing fields (Records with no 'address')</li>
      <li>Type alignment (Records with 'address' as an array)</li>
      <li>Schema matching$^2$</li>
    </ul>
    <p>
      <b>State of the art</b>: DataGuide, Wrangler, etc...
    </p>
  </section>
</section>
<section>
  <!-- Vertical stack: curation heuristics exist, but they can be wrong -->
  <section>
    <img src="graphics/sad_alice.svg" width="200" height="200" alt="Sad Alice">
  </section>
  <section>
    <!-- Inline copy of heuristics.svg; each paper image is a reveal.js fragment. -->
    <!-- Created with Inkscape (http://www.inkscape.org/) -->
    <svg xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:cc="http://creativecommons.org/ns#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:svg="http://www.w3.org/2000/svg" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" width="600" height="400" viewBox="0 0 600 400" version="1.1" class="stretch" id="svg27" inkscape:version="0.92.2 5c3e80d, 2017-08-06" sodipodi:docname="heuristics.svg" role="img" aria-label="Collage of research papers">
      <g inkscape:label="Layer 1" inkscape:groupmode="layer" id="layer1" transform="translate(0,103)">
        <image xlink:href="papers/p37-aggarwal.png" class="fragment" width="215.89999" height="279.39999" preserveAspectRatio="none" id="image227" x="310.09167" y="-89.140472" />
        <image xlink:href="papers/p517-shepard.png" class="fragment" width="215.89999" height="279.39999" preserveAspectRatio="none" id="image238" x="22.073807" y="-98.211899" />
        <image xlink:href="papers/p1-bhattacharya.png" class="fragment" width="215.89999" height="279.39999" preserveAspectRatio="none" id="image216" x="114.3" y="-52.854759" />
        <image xlink:href="papers/p1483-wang.png" class="fragment" width="215.89999" height="279.39999" preserveAspectRatio="none" id="image249" x="230.71667" y="-23.372616" />
        <image xlink:href="papers/p2018-getoor.png" class="fragment" width="215.89999" height="279.39999" preserveAspectRatio="none" id="image260" x="114.3" y="-52.854759" />
        <image xlink:href="papers/hodge.png" class="fragment" width="209.804" height="297.01068" preserveAspectRatio="none" id="image205" x="62.919422" y="64.583939" />
        <image xlink:href="papers/btr597.png" class="fragment" width="215.73067" height="279.06134" preserveAspectRatio="none" id="image194" x="371.72296" y="-14.482616" />
        <image xlink:href="papers/778_2008_Article_98.png" class="fragment" width="209.97333" height="278.892" preserveAspectRatio="none" id="image183" x="182.44458" y="55.179905" />
        <image xlink:href="papers/1-s2.0-S030439750400725X-main.png" class="fragment" width="191.85466" height="261.62" preserveAspectRatio="none" id="image172" x="354.6203" y="52.79715" />
      </g>
    </svg>
    <p class="fragment">Tons of Curation Heuristics Available!</p>
  </section>
  <section>
    <img src="graphics/happy_alice.svg" width="200" height="200" alt="Happy Alice">
    <imagecredits>(OpenClipArt.org)</imagecredits>
  </section>
  <section>
    <!-- NOTE(review): image content not known from here; add descriptive alt text. -->
    <img src="graphics/StateStreet.png" alt="">
    <p class="fragment">... that can be wrong</p>
    <imagecredits>(google.com)</imagecredits>
  </section>
  <section>
    <!-- NOTE(review): image content not known from here; add descriptive alt text. -->
    <img src="graphics/Mickey12.png" height="400" alt="">
    <p>... very wrong</p>
    <imagecredits>(nytimes.com)</imagecredits>
  </section>
  <section>
    <h3>
      In the name of Codd,<br><span class="fragment grow highlight-current-blue" data-fragment-index="2">thou shalt not give the user a wrong answer.</span>
    </h3>
  </section>
  <section>
    <p>... but when combined with heuristics</p>
    <!-- NOTE(review): image content not known from here; add descriptive alt text. -->
    <img src="graphics/obamacare_stats_fail.jpg" height="400" alt="">
    <imagecredits>(Fox News)</imagecredits>
  </section>
</section>
<section>
  <!-- Vertical stack: incomplete/probabilistic databases and their drawbacks -->
  <section>
    <h3>On representing incomplete information in a relational data base</h3>
    <h4>T. Imielinski &amp; W. Lipski Jr.<span style="margin-left: 40px">(<i>VLDB 1981</i>)</span></h4>
    <p class="fragment" style="margin-top: 60px">
      Incomplete and Probabilistic Databases<br>have existed since the 1980s...
    </p>
  </section>
  <section>
    <h1>But...</h1>
  </section>
  <section>
    <img src="graphics/blackbox.svg" height="400" alt="Black box">
    <p>(Typical Heuristics)</p>
  </section>
  <section>
    <!--
      Bar chart: the HTML comment inside the canvas holds JSON chart options and
      the remaining text holds comma-separated rows (first row = column labels).
      Presumably consumed by the chart plugin loaded at the end of the file.
      The payload is kept flush-left and free of any other text so it can be
      parsed as-is.
    -->
    <canvas data-chart="bar">
<!--
{"options": {
"title" : {
"display" : true,
"text" : ["Query Performance on PDBench/TPCH","(1 GB; 5 min timeout)"]
},
"scales": {
"yAxes": [{
"scaleLabel": {
"display": true,
"labelString": "Query Runtime (s)"
}
}]
}
}}
-->
Label , PDB-1, PDB-2, PDB-3, TPCH-1, TPCH-3, TPCH-5, TPCH-9
SQLite , 9.521, 7.59, 31.22, 19.561, 22.835, 33.308, 51.125
MayBMS-SQLite , 22.1345477, 7.291376699999999, 29.1511957
MayBMS-PGSql , 23.439012999999996, 13.000651999999999, 20.2954832
Sampling (x10), 300, 242.5666234549135, 300, 119.61607021316885, 162.00108394436538, 258.74168805666267, 300
    </canvas>
  </section>
  <section>
    <img src="graphics/Normal_Distribution_PDF.svg" height="500" alt="Normal distribution probability density function">
    <p>(Probabilistic Query Outputs)</p>
  </section>
  <section>
    <h2>Probabilistic Databases...</h2>
    <ol style="margin-top: 50px;">
      <li class="fragment">... require probabilities as inputs</li>
      <li class="fragment">... are slow</li>
      <li class="fragment">... produce probabilities as outputs</li>
    </ol>
  </section>
</section>
<section>
  <!-- Vertical stack: introducing Mimir and the talk outline -->
  <section>
    <p>The <img src="graphics/mimir_logo_final.png" height="150" style="vertical-align: middle" alt="Mimir"> Uncertainty-Aware Database</p>
    <p><a href="http://mimirdb.info">http://mimirdb.info</a></p>
  </section>
  <section>
    <p>Mimir is a vehicle for research on...</p>
    <ol>
      <li class="fragment" data-fragment-index="1">... uncertainty capture</li>
      <li class="fragment" data-fragment-index="2"><span class="fragment highlight-current-red" data-fragment-index="5">... query processing over uncertain data</span></li>
      <li class="fragment" data-fragment-index="3"><span class="fragment highlight-current-red" data-fragment-index="5">... intuitive and qualitative presentation of uncertainty</span></li>
      <li class="fragment" data-fragment-index="4">... other things that we can do to make Alice's life easier</li>
    </ol>
  </section>
  <section>
    <!-- Outline: first item greyed out, second item in bold -->
    <ul style="font-size: larger">
      <li style="color: lightgrey;">Why should you care about uncertain data?</li>
      <li style="font-weight: bold">Background: K-Relations and Possible Worlds</li>
      <li>Uncertainty-Annotated Databases<div style="font-size: smaller; font-weight: normal; font-style: italic;">(Joint work with Boris Glavic, Su Feng, Aaron Huber)</div></li>
      <li>Other Mimir Projects</li>
    </ul>
  </section>
</section>
<section>
  <!-- Topic list: semirings, K-relations, possible worlds -->
  <section>
    <ul>
      <li>Semirings</li>
      <li>K-Relations</li>
      <li>Possible Worlds</li>
      <li>Certain, Possible Tuples</li>
    </ul>
  </section>
</section>
<section>
  <!-- Topic list: $...$ spans are TeX, presumably typeset by the math plugin -->
  <section>
    <ul>
      <li>$K^W$-Relations</li>
      <li>$PW_i$, Certain, Possible</li>
      <li>Performance</li>
    </ul>
  </section>
</section>
<section>
  <!-- Acknowledgements. Photos use empty alt text because each name is printed directly below. -->
  <h5>Thanks...</h5>
  <table>
    <tr>
      <th colspan="5" style="font-size: 12pt">Students</th>
    </tr>
    <tr height="80px">
      <td width="100px">
        <img src="people/poonam.jpg" width="70" height="80" style="margin-bottom: 0px" alt="">
        <p style="margin-top: 0px; font-size: 10pt;">Poonam<br>(PhD-3Y)</p>
      </td>
      <td width="100px">
        <img src="people/will.png" width="61" height="80" style="margin-bottom: 0px" alt="">
        <p style="margin-top: 0px; font-size: 10pt;">Will<br>(PhD-2Y)</p>
      </td>
      <td width="100px">
        <img src="people/aaron.jpg" width="64" height="80" style="margin-bottom: 0px" alt="">
        <p style="margin-top: 0px; font-size: 10pt;">Aaron<br>(PhD-3Y)</p>
      </td>
      <td width="100px">
        <img src="people/lisa.jpg" width="71" height="80" style="margin-bottom: 0px" alt="">
        <p style="margin-top: 0px; font-size: 10pt;">Lisa<br>(PhD-0Y)</p>
      </td>
      <td width="100px">
        <img src="people/olivia.png" width="50" height="80" style="margin-bottom: 0px" alt="">
        <p style="margin-top: 0px; font-size: 10pt;">Olivia<br>(BS-Sr)</p>
      </td>
    </tr>
  </table>
  <table style="display: inline-block;">
    <tr>
      <th colspan="4" style="font-size: 12pt">Alumni</th>
    </tr>
    <tr height="80px">
      <td width="100px">
        <img src="people/ying.jpg" width="60" height="80" style="margin-bottom: 0px" alt="">
        <p style="margin-top: 0px; font-size: 10pt;">Ying<br>(PhD 2017)</p>
      </td>
      <td width="100px">
        <img src="people/niccolo.png" width="50" height="80" style="margin-bottom: 0px" alt="">
        <p style="margin-top: 0px; font-size: 10pt;">Niccolò<br>(PhD 2016)</p>
      </td>
      <td width="100px">
        <img src="people/arindam.jpg" width="80" height="80" style="margin-bottom: 0px" alt="">
        <p style="margin-top: 0px; font-size: 10pt;">Arindam<br>(MS 2016)</p>
      </td>
      <td width="100px">
        <img src="people/shivang.jpg" width="55" height="80" style="margin-bottom: 0px" alt="">
        <p style="margin-top: 0px; font-size: 10pt;">Shivang<br>(MS-2Y)</p>
      </td>
    </tr>
  </table>
  <table style="display: inline-block; margin-left: 100px">
    <tr>
      <th colspan="1" style="font-size: 12pt">Dev</th>
    </tr>
    <tr>
      <td width="100px">
        <img src="people/mike.jpg" width="80" height="80" style="margin-bottom: 0px" alt="">
        <p style="margin-top: 0px; font-size: 10pt;">Mike<br>(Sr. Rsrch. Dev.)</p>
      </td>
    </tr>
  </table>
  <table>
    <tr>
      <th colspan="4" style="font-size: 12pt">External Collaborators</th>
    </tr>
    <tr>
      <td width="130px" style="font-size: 10pt;">
        Dieter Gawlick<br>(Oracle)
      </td>
      <td width="130px" style="font-size: 10pt;">
        Zhen Hua Liu<br>(Oracle)
      </td>
      <td width="130px" style="font-size: 10pt;">
        Ronny Fehling<br>(Airbus)
      </td>
      <td width="130px" style="font-size: 10pt;">
        Beda Hammerschmidt<br>(Oracle)
      </td>
    </tr>
  </table>
  <table style="margin-top: 5px">
    <tr>
      <td width="140px" style="font-size: 10pt;">
        Boris Glavic<br>(IIT)
      </td>
      <td width="140px" style="font-size: 10pt;">
        Su Feng<br>(IIT)
      </td>
      <td width="140px" style="font-size: 10pt;">
        Juliana Freire<br>(NYU)
      </td>
      <td width="140px" style="font-size: 10pt;">
        Wolfgang Gatterbauer<br>(NEU)
      </td>
      <td width="140px" style="font-size: 10pt;">
        Heiko Mueller<br>(NYU)
      </td>
      <td width="140px" style="font-size: 10pt;">
        Remi Rampin<br>(NYU)
      </td>
    </tr>
  </table>
  <p style="font-size: 10pt; font-weight: bold;">Mimir is supported by NSF Award ACI-1640864, NPS Award N00244-16-1-0022, and gifts from Oracle</p>
</section>
<section>
  <!-- Closing slide: project link and take-away points -->
  <p style="font-size: x-large;"><img src="graphics/mimir_logo_final.png" height="150" alt="Mimir"><br><a href="http://mimirdb.info">http://mimirdb.info</a></p>
  <ul style="font-size: smaller;">
    <li>It's not the data that's uncertain, it's the interpretation.</li>
    <li>Tagged best-guess evaluation is faster and easier to understand.</li>
    <li>Not committing to one representation allows faster query processing.</li>
  </ul>
  <p><b>Thanks!</b></p>
</section>
</div></div>
<script src="../reveal.js-3.5.0/lib/js/head.min.js"></script>
<script src="../reveal.js-3.5.0/js/reveal.js"></script>
<script>
  // Full list of configuration options available at:
  // https://github.com/hakimel/reveal.js#configuration
  Reveal.initialize({
    controls: false,
    progress: true,
    history: true,
    center: true,
    slideNumber: true,
    transition: 'fade', // none/fade/slide/convex/concave/zoom

    // Chart styling defaults; presumably read by the chart plugin
    // (plugin/chart/csv2chart.js) listed under `dependencies`.
    chart: {
      defaults: {
        global: {
          title: { fontColor: "#333", fontSize: 24 },
          legend: {
            labels: { fontColor: "#333", fontSize: 20 },
          },
          // NOTE(review): Chart.js names this option `responsive`; confirm
          // whether `responsiveness` is honored by anything.
          responsiveness: true
        },
        scale: {
          scaleLabel: { fontColor: "#333", fontSize: 20 },
          gridLines: { color: "#333", zeroLineColor: "#333" },
          ticks: { fontColor: "#333", fontSize: 16 },
        }
      },
      line: { borderColor: [ "rgba(20,220,220,.8)" , "rgba(220,120,120,.8)", "rgba(20,120,220,.8)" ], "borderDash": [ [5,10], [0,0] ]},
      bar: { backgroundColor: [
          "rgba(220,220,220,0.8)",
          "rgba(151,187,205,0.8)",
          "rgba(205,151,187,0.8)",
          "rgba(187,205,151,0.8)"
        ]
      },
      pie: { backgroundColor: [ ["rgba(0,0,0,.8)" , "rgba(220,20,20,.8)", "rgba(20,220,20,.8)", "rgba(220,220,20,.8)", "rgba(20,20,220,.8)"] ]},
      radar: { borderColor: [ "rgba(20,220,220,.8)" , "rgba(220,120,120,.8)", "rgba(20,120,220,.8)" ]},
    },

    // Optional reveal.js plugins
    dependencies: [
      { src: '../reveal.js-3.5.0/lib/js/classList.js', condition: function() { return !document.body.classList; } },
      // NOTE(review): `mathjax` is set on this dependency entry; the reveal.js
      // README documents the MathJax path under a top-level `math` option.
      // Confirm this local path is actually picked up.
      { src: '../reveal.js-3.5.0/plugin/math/math.js',
        condition: function() { return true; },
        mathjax: '../reveal.js-3.5.0/js/MathJax.js'
      },
      { src: '../reveal.js-3.5.0/plugin/markdown/marked.js', condition: function() { return !!document.querySelector( '[data-markdown]' ); } },
      { src: '../reveal.js-3.5.0/plugin/markdown/markdown.js', condition: function() { return !!document.querySelector( '[data-markdown]' ); } },
      { src: '../reveal.js-3.5.0/plugin/highlight/highlight.js', async: true, condition: function() { return !!document.querySelector( 'pre code' ); }, callback: function() { hljs.initHighlightingOnLoad(); } },
      { src: '../reveal.js-3.5.0/plugin/zoom-js/zoom.js', async: true },
      { src: '../reveal.js-3.5.0/plugin/notes/notes.js', async: true },
      // Chart.min.js
      { src: '../reveal.js-3.5.0/plugin/chart/Chart.min.js'},
      // the plugin
      { src: '../reveal.js-3.5.0/plugin/chart/csv2chart.js'},
      { src: '../reveal.js-3.5.0/plugin/svginline/es6-promise.auto.js', async: false },
      { src: '../reveal.js-3.5.0/plugin/svginline/data-src-svg.js', async: false }
    ]
  });
</script>
</body>
</html>