Website/slides/talks/2018-1-Tour-Mimir/index.html

543 lines
21 KiB
HTML
Raw Normal View History

2018-01-15 23:43:28 -05:00
<!doctype html>
<html lang="en">
<head>
  <!-- Document metadata -->
  <meta charset="utf-8">
  <title>Embracing Uncertainty</title>
  <meta name="description" content="Mimir">
  <meta name="author" content="Oliver Kennedy">

  <!-- Hints for mobile / full-screen presentation -->
  <meta name="apple-mobile-web-app-capable" content="yes">
  <meta name="apple-mobile-web-app-status-bar-style" content="black-translucent">
  <meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no, minimal-ui">

  <!-- reveal.js base stylesheet followed by the talk theme -->
  <link rel="stylesheet" href="../reveal.js-3.5.0/css/reveal.css">
  <link rel="stylesheet" href="ubodin.css" id="theme">

  <!-- Code syntax highlighting -->
  <link rel="stylesheet" href="../reveal.js-3.5.0/lib/css/zenburn.css">

  <!--
    Printing and PDF exports: append the PDF stylesheet when the query string
    contains "print-pdf", otherwise append the paper stylesheet.
  -->
  <script>
    var link = document.createElement( 'link' );
    link.rel = 'stylesheet';
    link.type = 'text/css';
    if ( window.location.search.match( /print-pdf/gi ) ) {
      link.href = '../reveal.js-3.5.0/css/print/pdf.css';
    } else {
      link.href = '../reveal.js-3.5.0/css/print/paper.css';
    }
    document.getElementsByTagName( 'head' )[0].appendChild( link );
  </script>

  <!-- html5shiv is only loaded by Internet Explorer versions below 9 -->
  <!--[if lt IE 9]>
  <script src="../reveal.js-3.5.0/lib/js/html5shiv.js"></script>
  <![endif]-->
</head>
<body>
<div class="reveal">
<!-- Any section element inside the "slides" container further down is displayed as a slide -->

<!-- Talk-wide header shown above the slides -->
<div class="header">
  <!-- Any Talk-Specific Header Content Goes Here -->
  Don't Wrangle, Guess
</div>

<!-- Talk-wide footer: lab tagline on the left, logo image on the right -->
<div class="footer">
  <!-- Any Talk-Specific Footer Content Goes Here -->
  <div style="float: left; margin-top: 15px; ">
    Exploring <u><b>O</b></u>nline <u><b>D</b></u>ata <u><b>In</b></u>teractions
  </div>
  <!--
    NOTE(review): alt is left empty so assistive technology does not read the
    file name aloud; if this logo carries wording that the tagline does not,
    put that wording in the alt attribute instead.
  -->
  <img src="graphics/FullText-white.png" height="40" style="float: right;" alt="">
</div>
<div class="slides">
<!-- Title slide -->
<section>
  <h2>Don't Wrangle, Guess Instead</h2>
  <h4>with</h4>
  <!-- The logo completes the sentence "... with Mimir", so the alt text is the name -->
  <img src="graphics/mimir_logo_final.png" alt="Mimir">
</section>
<!-- Vertical slide stack: "A Big Data Fairy Tale" -->
<section>
  <section>
    <h3>A Big Data Fairy Tale</h3>
  </section>

  <section>
    <img src="graphics/dagobert83-female-user-icon-800px.png" height="300" alt="Alice">
    <h4>Meet Alice</h4>
    <imagecredits>(OpenClipArt.org)</imagecredits>
  </section>

  <section>
    <img src="graphics/dagobert83-female-user-icon-800px.png" height="300" alt="Alice">
    <img src="graphics/littlestorefront-800px.png" height="300" alt="Storefront">
    <h4>Alice has a Store</h4>
    <imagecredits>(OpenClipArt.org)</imagecredits>
  </section>

  <section>
    <img src="graphics/littlestorefront-800px.png" height="300" style="vertical-align: middle;" alt="Storefront">
    <!-- NOTE(review): this connector span is empty, unlike the "+" / "=" spans on the following slides; confirm whether a glyph is missing -->
    <span style="font-size: 3em; vertical-align: middle;"></span>
    <img src="graphics/matt-icons_text-x-log-300px.png" height="300" style="vertical-align: middle;" alt="Log file">
    <h4>Alice's store collects sales data</h4>
    <imagecredits>(OpenClipArt.org)</imagecredits>
  </section>

  <section>
    <img src="graphics/dagobert83-female-user-icon-800px.png" height="300" style="vertical-align: middle;" alt="Alice">
    <span style="font-size: 3em; vertical-align: middle;">+</span>
    <img src="graphics/matt-icons_text-x-log-300px.png" height="300" style="vertical-align: middle;" alt="Log file">
    <span style="font-size: 3em; vertical-align: middle;">=</span>
    <img src="graphics/saco-800px.png" height="300" style="vertical-align: middle;" alt="Sack">
    <h4>Alice wants to use her sales data to run a promotion</h4>
    <imagecredits>(OpenClipArt.org)</imagecredits>
  </section>

  <section>
    <img src="graphics/matt-icons_text-x-log-300px.png" height="300" style="vertical-align: middle;" alt="Log file">
    <!-- NOTE(review): empty connector span; confirm whether a glyph is missing -->
    <span style="font-size: 3em; vertical-align: middle;"></span>
    <img src="graphics/database-server-800px.png" height="300" style="vertical-align: middle;" alt="Database server">
    <h4>So Alice loads up her sales data in her trusty database/hadoop/spark/etc... server.</h4>
    <imagecredits>(OpenClipArt.org)</imagecredits>
  </section>

  <section>
    <img src="graphics/database-server-800px.png" height="300" style="vertical-align: middle;" alt="Database server">
    <span style="font-size: 3em; vertical-align: middle;">+&nbsp;?</span>
    <h4>... asks her question ...</h4>
    <imagecredits>(OpenClipArt.org)</imagecredits>
  </section>

  <section>
    <img src="graphics/database-server-800px.png" height="300" style="vertical-align: middle;" alt="Database server">
    <span style="font-size: 3em; vertical-align: middle;">+&nbsp;?&nbsp;</span>
    <img src="graphics/crystalball-800px.png" height="300" style="vertical-align: middle;" alt="Crystal ball">
    <h4>... and basks in the limitless possibilities of big data.</h4>
    <imagecredits>(OpenClipArt.org)</imagecredits>
  </section>
</section>
<!-- Vertical slide stack: "Why is this a fairy tale?" -->
<section>
  <section>
    <h2>Why is this a fairy tale?</h2>
  </section>

  <section>
    <img src="graphics/matt-icons_text-x-log-300px.png" height="300" style="vertical-align: middle;" alt="Log file">
    <!-- NOTE(review): empty connector span; confirm whether a glyph is missing -->
    <span style="font-size: 3em; vertical-align: middle;"></span>
    <img src="graphics/database-server-800px.png" height="300" style="vertical-align: middle;" alt="Database server">
    <h4>It's never this easy...</h4>
  </section>
</section>
<!-- Vertical slide stack: three data-ingestion tasks and their current state of the art -->
<section>
  <section>
    <h2>CSV Import</h2>
    <h4>Run a <code>SELECT</code> on a raw CSV File</h4>
    <ul>
      <li>File may not have column headers</li>
      <li>CSV does not provide "types"</li>
      <li>Lines may be missing fields</li>
      <li>Fields may be mistyped (typo, missing comma)</li>
      <li>Comment text can be inlined into the file</li>
    </ul>
    <p>
      <b>State of the art</b>: External Table Defn <span>+ "Manually" edit CSV</span>
    </p>
  </section>

  <section>
    <h2>Merge Data From Two Sources</h2>
    <h4><code>UNION</code> two data sources</h4>
    <ul>
      <li>Schema matching</li>
      <li>Deduplication</li>
      <li>Format alignment (GIS coordinates, $ vs €)</li>
      <li>Precision alignment (State vs County)</li>
    </ul>
    <p>
      <b>State of the art</b>: Manually map schema
    </p>
  </section>

  <section>
    <h2>JSON Shredding</h2>
    <h4>Run a <code>SELECT</code> on JSON or a Doc Store</h4>
    <ul>
      <li>Separating fields and record sets:<br/>(e.g., <code>{ A: "Bob", B: "Alice" }</code>)</li>
      <li>Missing fields (Records with no 'address')</li>
      <li>Type alignment (Records with 'address' as an array)</li>
      <li>Schema matching$^2$</li>
    </ul>
    <p>
      <b>State of the art</b>: DataGuide, Wrangler, etc...
    </p>
  </section>
</section>
<!-- Vertical slide stack: curation heuristics exist, but they can be wrong -->
<section>
  <section>
    <img src="graphics/sad_alice.svg" width="200" height="200" alt="Sad Alice">
  </section>

  <section>
    <!--
      Inline SVG collage of paper images; each <image> is a reveal.js fragment.
      The XML declaration from the exported file is omitted because it is not
      valid inside an HTML document.
    -->
    <!-- Created with Inkscape (http://www.inkscape.org/) -->
    <svg xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:cc="http://creativecommons.org/ns#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:svg="http://www.w3.org/2000/svg" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" width="600" height="400" viewBox="0 0 600 400" version="1.1" class="stretch" id="svg27" inkscape:version="0.92.2 5c3e80d, 2017-08-06" sodipodi:docname="heuristics.svg" role="img" aria-label="Collage of research papers">
      <g inkscape:label="Layer 1" inkscape:groupmode="layer" id="layer1" transform="translate(0,103)">
        <image xlink:href="papers/p37-aggarwal.png" class="fragment" width="215.89999" height="279.39999" preserveAspectRatio="none" id="image227" x="310.09167" y="-89.140472" />
        <image xlink:href="papers/p517-shepard.png" class="fragment" width="215.89999" height="279.39999" preserveAspectRatio="none" id="image238" x="22.073807" y="-98.211899" />
        <image xlink:href="papers/p1-bhattacharya.png" class="fragment" width="215.89999" height="279.39999" preserveAspectRatio="none" id="image216" x="114.3" y="-52.854759" />
        <image xlink:href="papers/p1483-wang.png" class="fragment" width="215.89999" height="279.39999" preserveAspectRatio="none" id="image249" x="230.71667" y="-23.372616" />
        <image xlink:href="papers/p2018-getoor.png" class="fragment" width="215.89999" height="279.39999" preserveAspectRatio="none" id="image260" x="114.3" y="-52.854759" />
        <image xlink:href="papers/hodge.png" class="fragment" width="209.804" height="297.01068" preserveAspectRatio="none" id="image205" x="62.919422" y="64.583939" />
        <image xlink:href="papers/btr597.png" class="fragment" width="215.73067" height="279.06134" preserveAspectRatio="none" id="image194" x="371.72296" y="-14.482616" />
        <image xlink:href="papers/778_2008_Article_98.png" class="fragment" width="209.97333" height="278.892" preserveAspectRatio="none" id="image183" x="182.44458" y="55.179905" />
        <image xlink:href="papers/1-s2.0-S030439750400725X-main.png" class="fragment" width="191.85466" height="261.62" preserveAspectRatio="none" id="image172" x="354.6203" y="52.79715" />
      </g>
    </svg>
    <p class="fragment">Tons of Curation Heuristics Available!</p>
  </section>

  <section>
    <img src="graphics/happy_alice.svg" width="200" height="200" alt="Happy Alice">
    <imagecredits>(OpenClipArt.org)</imagecredits>
  </section>

  <section>
    <img src="graphics/StateStreet.png" alt="State Street">
    <p class="fragment">... that can be wrong</p>
    <imagecredits>(google.com)</imagecredits>
  </section>

  <section>
    <img src="graphics/Mickey12.png" height="400" alt="Mickey">
    <p>... very wrong</p>
    <imagecredits>(nytimes.com)</imagecredits>
  </section>

  <section>
    <h3>
      In the name of Codd,<br/><span class="fragment grow highlight-current-blue" data-fragment-index="2">thou shalt not give the user a wrong answer.</span>
    </h3>
  </section>

  <section>
    <p>... but when combined with heuristics</p>
    <img src="graphics/obamacare_stats_fail.jpg" height="400" alt="Obamacare statistics chart">
    <imagecredits>(Fox News)</imagecredits>
  </section>
</section>
<!-- Vertical slide stack: incomplete / probabilistic databases and their drawbacks -->
<section>
  <section>
    <h3>On representing incomplete information in a relational data base</h3>
    <h4>T. Imielinski &amp; W. Lipski Jr.<span style="margin-left: 40px">(<i>VLDB 1981</i>)</span></h4>
    <p class="fragment" style="margin-top: 60px">
      Incomplete and Probabilistic Databases<br/>have existed since the 1980s...
    </p>
  </section>

  <section>
    <h1>But...</h1>
  </section>

  <section>
    <img src="graphics/blackbox.svg" height="400" alt="Black box">
    <p>(Typical Heuristics)</p>
  </section>

  <section>
    <!--
      Bar chart: the canvas body holds one comment with JSON chart options
      followed by comma-separated rows (first row = column labels, first cell
      of each later row = series label). The payload is kept flush-left and
      free of any other text so that nothing extra ends up in the options
      JSON or in the data rows.
    -->
    <canvas data-chart="bar">
<!--
{"options": {
"title" : {
"display" : true,
"text" : ["Query Performance on PDBench/TPCH","(1 GB; 5 min timeout)"]
},
"scales": {
"yAxes": [{
"scaleLabel": {
"display": true,
"labelString": "Query Runtime (s)"
}
}]
}
}}
-->
Label , PDB-1, PDB-2, PDB-3, TPCH-1, TPCH-3, TPCH-5, TPCH-9
SQLite , 9.521, 7.59, 31.22, 19.561, 22.835, 33.308, 51.125
MayBMS-SQLite , 22.1345477, 7.291376699999999, 29.1511957
MayBMS-PGSql , 23.439012999999996, 13.000651999999999, 20.2954832
Sampling (x10), 300, 242.5666234549135, 300, 119.61607021316885, 162.00108394436538, 258.74168805666267, 300
</canvas>
  </section>

  <section>
    <img src="graphics/Normal_Distribution_PDF.svg" height="500" alt="Normal distribution probability density function">
    <p>(Probabilistic Query Outputs)</p>
  </section>

  <section>
    <h2>Probabilistic Databases...</h2>
    <ol style="margin-top: 50px;">
      <li class="fragment">... require probabilities as inputs</li>
      <li class="fragment">... are slow</li>
      <li class="fragment">... produce probabilities as outputs</li>
    </ol>
  </section>
</section>
<!-- Vertical slide stack: introducing Mimir and the talk agenda -->
<section>
  <section>
    <p>The <img src="graphics/mimir_logo_final.png" height="150" style="vertical-align: middle" alt="Mimir">Uncertainty-Aware Database</p>
    <p><a href="http://mimirdb.info">http://mimirdb.info</a></p>
  </section>

  <section>
    <p>Mimir is a vehicle for research on...</p>
    <!-- Items appear one at a time (fragment indices 1-4); at index 5 items 2 and 3 are highlighted in red -->
    <ol>
      <li class="fragment" data-fragment-index="1">... uncertainty capture</li>
      <li class="fragment" data-fragment-index="2"><span class="fragment highlight-current-red" data-fragment-index="5">... query processing over uncertain data</span></li>
      <li class="fragment" data-fragment-index="3"><span class="fragment highlight-current-red" data-fragment-index="5">... intuitive and qualitative presentation of uncertainty</span></li>
      <li class="fragment" data-fragment-index="4">... other things that we can do to make Alice's life easier</li>
    </ol>
  </section>

  <section>
    <!-- Agenda: first item greyed out, second item in bold -->
    <ul style="font-size: larger">
      <li style="color: lightgrey;">Why should you care about uncertain data?</li>
      <li style="font-weight: bold">Background: K-Relations and Possible Worlds</li>
      <li>Uncertainty-Annotated Databases<div style="font-size: smaller; font-weight: normal; font-style: italic;">(Joint work with Boris Glavic, Su Feng, Aaron Huber)</div></li>
      <li>Other Mimir Projects</li>
    </ul>
  </section>
</section>
<!-- Outline slide listing semirings, K-relations, possible worlds, and certain/possible tuples. -->
<!-- NOTE(review): presumably a placeholder for the "Background" agenda item; confirm. -->
<section>
<section>
<ul>
<li>Semirings</li>
<li>K-Relations</li>
<li>Possible Worlds</li>
<li>Certain, Possible Tuples</li>
</ul>
</section>
</section>
<!-- Outline slide listing K^W-relations, PW_i / certain / possible, and performance; the $...$ spans are math markup. -->
<!-- NOTE(review): presumably a placeholder for the "Uncertainty-Annotated Databases" agenda item; confirm. -->
<section>
<section>
<ul>
<li>$K^W$-Relations</li>
<li>$PW_i$, Certain, Possible</li>
<li>Performance</li>
</ul>
</section>
</section>
<!--
  Acknowledgements slide: students, alumni, developer, external collaborators,
  and funding. Portrait images use an empty alt because each person's name is
  given in the caption directly below the image.
-->
<section>
  <h5>Thanks...</h5>

  <table>
    <tr>
      <th colspan="5" style="font-size: 12pt">Students</th>
    </tr>
    <tr height="80px">
      <td width="100px">
        <img src="people/poonam.jpg" width="70" height="80" style="margin-bottom: 0px" alt="">
        <p style="margin-top: 0px; font-size: 10pt;">Poonam<br/>(PhD-3Y)</p>
      </td>
      <td width="100px">
        <img src="people/will.png" width="61" height="80" style="margin-bottom: 0px" alt="">
        <p style="margin-top: 0px; font-size: 10pt;">Will<br/>(PhD-2Y)</p>
      </td>
      <td width="100px">
        <img src="people/aaron.jpg" width="64" height="80" style="margin-bottom: 0px" alt="">
        <p style="margin-top: 0px; font-size: 10pt;">Aaron<br/>(PhD-3Y)</p>
      </td>
      <td width="100px">
        <img src="people/lisa.jpg" width="71" height="80" style="margin-bottom: 0px" alt="">
        <p style="margin-top: 0px; font-size: 10pt;">Lisa<br/>(PhD-0Y)</p>
      </td>
      <td width="100px">
        <img src="people/olivia.png" width="50" height="80" style="margin-bottom: 0px" alt="">
        <p style="margin-top: 0px; font-size: 10pt;">Olivia<br/>(BS-Sr)</p>
      </td>
    </tr>
  </table>

  <!-- The Alumni and Dev tables are inline-block so they sit side by side -->
  <table style="display: inline-block;">
    <tr>
      <th colspan="4" style="font-size: 12pt">Alumni</th>
    </tr>
    <tr height="80px">
      <td width="100px">
        <img src="people/ying.jpg" width="60" height="80" style="margin-bottom: 0px" alt="">
        <p style="margin-top: 0px; font-size: 10pt;">Ying<br/>(PhD 2017)</p>
      </td>
      <td width="100px">
        <img src="people/niccolo.png" width="50" height="80" style="margin-bottom: 0px" alt="">
        <p style="margin-top: 0px; font-size: 10pt;">Niccolò<br/>(PhD 2016)</p>
      </td>
      <td width="100px">
        <img src="people/arindam.jpg" width="80" height="80" style="margin-bottom: 0px" alt="">
        <p style="margin-top: 0px; font-size: 10pt;">Arindam<br/>(MS 2016)</p>
      </td>
      <td width="100px">
        <img src="people/shivang.jpg" width="55" height="80" style="margin-bottom: 0px" alt="">
        <p style="margin-top: 0px; font-size: 10pt;">Shivang<br/>(MS-2Y)</p>
      </td>
    </tr>
  </table>
  <table style="display: inline-block; margin-left: 100px">
    <tr>
      <th colspan="1" style="font-size: 12pt">Dev</th>
    </tr>
    <tr>
      <td width="100px">
        <img src="people/mike.jpg" width="80" height="80" style="margin-bottom: 0px" alt="">
        <p style="margin-top: 0px; font-size: 10pt;">Mike<br/>(Sr. Rsrch. Dev.)</p>
      </td>
    </tr>
  </table>

  <table>
    <tr>
      <th colspan="4" style="font-size: 12pt">External Collaborators</th>
    </tr>
    <tr>
      <td width="130px" style="font-size: 10pt;">
        Dieter Gawlick<br/>(Oracle)
      </td>
      <td width="130px" style="font-size: 10pt;">
        Zhen Hua Liu<br/>(Oracle)
      </td>
      <td width="130px" style="font-size: 10pt;">
        Ronny Fehling<br/>(Airbus)
      </td>
      <td width="130px" style="font-size: 10pt;">
        Beda Hammerschmidt<br/>(Oracle)
      </td>
    </tr>
  </table>

  <!-- Second row of external collaborators, laid out as its own table -->
  <table style="margin-top: 5px">
    <tr>
      <td width="140px" style="font-size: 10pt;">
        Boris Glavic<br/>(IIT)
      </td>
      <td width="140px" style="font-size: 10pt;">
        Su Feng<br/>(IIT)
      </td>
      <td width="140px" style="font-size: 10pt;">
        Juliana Freire<br/>(NYU)
      </td>
      <td width="140px" style="font-size: 10pt;">
        Wolfgang Gatterbauer<br/>(NEU)
      </td>
      <td width="140px" style="font-size: 10pt;">
        Heiko Mueller<br/>(NYU)
      </td>
      <td width="140px" style="font-size: 10pt;">
        Remi Rampin<br/>(NYU)
      </td>
    </tr>
  </table>

  <p style="font-size: 10pt; font-weight: bold;">Mimir is supported by NSF Award ACI-1640864, NPS Award N00244-16-1-0022, and gifts from Oracle</p>
</section>
<!-- Closing slide: project link and take-away points -->
<section>
  <p style="font-size: x-large;"><img src="graphics/mimir_logo_final.png" height="150" alt="Mimir"><br/><a href="http://mimirdb.info">http://mimirdb.info</a></p>
  <ul style="font-size: smaller;">
    <li>It's not the data that's uncertain, it's the interpretation.</li>
    <li>Tagged best-guess evaluation is faster and easier to understand.</li>
    <li>Not committing to one representation allows faster query processing.</li>
  </ul>
  <p><b>Thanks!</b></p>
</section>
</div></div>
<!-- reveal.js runtime: head.js is loaded first, then reveal.js, then the deck is configured -->
<script src="../reveal.js-3.5.0/lib/js/head.min.js"></script>
<script src="../reveal.js-3.5.0/js/reveal.js"></script>
<script>
  // Full list of configuration options available at:
  // https://github.com/hakimel/reveal.js#configuration
  Reveal.initialize({
    controls: false,
    progress: true,
    history: true,
    center: true,
    slideNumber: true,
    transition: 'fade', // none/fade/slide/convex/concave/zoom

    // Chart defaults and per-chart-type colours; presumably consumed by the
    // chart plugin listed under `dependencies` below.
    chart: {
      defaults: {
        global: {
          title: { fontColor: "#333", fontSize: 24 },
          legend: {
            labels: { fontColor: "#333", fontSize: 20 },
          },
          // Chart.js names this option `responsive`; the earlier spelling
          // `responsiveness` is not a documented Chart.js option.
          responsive: true
        },
        scale: {
          scaleLabel: { fontColor: "#333", fontSize: 20 },
          gridLines: { color: "#333", zeroLineColor: "#333" },
          ticks: { fontColor: "#333", fontSize: 16 },
        }
      },
      line: { borderColor: [ "rgba(20,220,220,.8)" , "rgba(220,120,120,.8)", "rgba(20,120,220,.8)" ], "borderDash": [ [5,10], [0,0] ]},
      bar: { backgroundColor: [
          "rgba(220,220,220,0.8)",
          "rgba(151,187,205,0.8)",
          "rgba(205,151,187,0.8)",
          "rgba(187,205,151,0.8)"
        ]
      },
      pie: { backgroundColor: [ ["rgba(0,0,0,.8)" , "rgba(220,20,20,.8)", "rgba(20,220,20,.8)", "rgba(220,220,20,.8)", "rgba(20,20,220,.8)"] ]},
      radar: { borderColor: [ "rgba(20,220,220,.8)" , "rgba(220,120,120,.8)", "rgba(20,120,220,.8)" ]},
    },

    // Optional reveal.js plugins
    dependencies: [
      // classList shim, loaded only when document.body.classList is missing
      { src: '../reveal.js-3.5.0/lib/js/classList.js', condition: function() { return !document.body.classList; } },

      // Math rendering.
      // NOTE(review): reveal.js documents the MathJax location as a top-level
      // `math: { mathjax: ... }` option; confirm that this `mathjax` key on
      // the dependency entry is actually read before relying on it.
      { src: '../reveal.js-3.5.0/plugin/math/math.js',
        condition: function() { return true; },
        mathjax: '../reveal.js-3.5.0/js/MathJax.js'
      },

      // Markdown support, loaded only when a [data-markdown] element exists
      { src: '../reveal.js-3.5.0/plugin/markdown/marked.js', condition: function() { return !!document.querySelector( '[data-markdown]' ); } },
      { src: '../reveal.js-3.5.0/plugin/markdown/markdown.js', condition: function() { return !!document.querySelector( '[data-markdown]' ); } },

      // Syntax highlighting, loaded only when a <pre><code> block exists
      { src: '../reveal.js-3.5.0/plugin/highlight/highlight.js', async: true, condition: function() { return !!document.querySelector( 'pre code' ); }, callback: function() { hljs.initHighlightingOnLoad(); } },

      { src: '../reveal.js-3.5.0/plugin/zoom-js/zoom.js', async: true },
      { src: '../reveal.js-3.5.0/plugin/notes/notes.js', async: true },

      // Chart.min.js
      { src: '../reveal.js-3.5.0/plugin/chart/Chart.min.js'},
      // the plugin
      { src: '../reveal.js-3.5.0/plugin/chart/csv2chart.js'}
    ]
  });
</script>
</body>
</html>