Website/slides/cse501/2018/index.html

1156 lines
43 KiB
HTML

<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Embracing Uncertainty</title>
<meta name="description" content="Mimir">
<meta name="author" content="Oliver Kennedy">
<meta name="apple-mobile-web-app-status-bar-style" content="black-translucent" />
<meta name="apple-mobile-web-app-capable" content="yes" />
<meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no, minimal-ui">
<link rel="stylesheet" href="../../reveal.js-3.7.0/css/reveal.css">
<link rel="stylesheet" href="ubodin.css" id="theme">
<!-- Code syntax highlighting -->
<link rel="stylesheet" href="../../reveal.js-3.7.0/lib/css/zenburn.css">
<!-- Printing and PDF exports -->
<script>
// Attach a print stylesheet at load time. When the URL query string contains
// "print-pdf" (in any letter case) the PDF-export sheet is used; otherwise the
// regular paper-print sheet is used.
var link = document.createElement( 'link' );
if ( /print-pdf/i.test( window.location.search ) ) {
  link.href = '../../reveal.js-3.7.0/css/print/pdf.css';
} else {
  link.href = '../../reveal.js-3.7.0/css/print/paper.css';
}
link.type = 'text/css';
link.rel = 'stylesheet';
// Append to the document's first <head> element.
document.querySelector( 'head' ).appendChild( link );
</script>
<!--[if lt IE 9]>
<script src="../../reveal.js-3.7.0/lib/js/html5shiv.js"></script>
<![endif]-->
</head>
<body>
<div class="reveal">
<!-- Any section element inside of this container is displayed as a slide -->
<div class="header">
<!-- Any Talk-Specific Header Content Goes Here -->
Don't Wrangle, Guess
</div>
<div class="footer">
<!-- Any Talk-Specific Footer Content Goes Here -->
<div style="float: left; margin-top: 15px; ">
Exploring <u><b>O</b></u>nline <u><b>D</b></u>ata <u><b>In</b></u>teractions
</div>
<img src="graphics/FullText-white.png" height="40" style="float: right;"/>
</div>
<div class="slides">
<section>
<section>
<h2>Don't Wrangle, Guess Instead</h2>
<h4>with</h4>
<img src="graphics/mimir_logo_final.png" alt="Mimir" />
</section>
</section>
<section>
<section>
<h3>A Big Data Fairy Tale</h3>
</section>
<section>
<img src="graphics/dagobert83-female-user-icon-800px.png" height="300" />
<h4>Meet Alice</h4>
<imagecredits>(OpenClipArt.org)</imagecredits>
</section>
<section>
<img src="graphics/dagobert83-female-user-icon-800px.png" height="300" />
<img src="graphics/littlestorefront-800px.png" height="300" />
<h4>Alice has a Store</h4>
<imagecredits>(OpenClipArt.org)</imagecredits>
</section>
<section>
<img src="graphics/littlestorefront-800px.png" height="300" style=" vertical-align: middle;"/>
<span style="font-size: 3em; vertical-align: middle;"></span>
<img src="graphics/matt-icons_text-x-log-300px.png" height="300" style=" vertical-align: middle;" />
<h4>Alice's store collects sales data</h4>
<imagecredits>(OpenClipArt.org)</imagecredits>
</section>
<section>
<img src="graphics/dagobert83-female-user-icon-800px.png" height="300" style=" vertical-align: middle;"/>
<span style="font-size: 3em; vertical-align: middle;">+</span>
<img src="graphics/matt-icons_text-x-log-300px.png" height="300" style=" vertical-align: middle;" />
<span style="font-size: 3em; vertical-align: middle;">=</span>
<img src="graphics/saco-800px.png" height="300" style=" vertical-align: middle;" />
<h4>Alice wants to use her sales data to run a promotion</h4>
<imagecredits>(OpenClipArt.org)</imagecredits>
</section>
<section>
<img src="graphics/matt-icons_text-x-log-300px.png" height="300" style=" vertical-align: middle;"/>
<span style="font-size: 3em; vertical-align: middle;"></span>
<img src="graphics/database-server-800px.png" height="300" style=" vertical-align: middle;" />
<h4>So Alice loads up her sales data in her trusty database/hadoop/spark/etc... server.</h4>
<imagecredits>(OpenClipArt.org)</imagecredits>
</section>
<section>
<img src="graphics/database-server-800px.png" height="300" style=" vertical-align: middle;" />
<span style="font-size: 3em; vertical-align: middle;">+&nbsp;?</span>
<h4>... asks her question ...</h4>
<imagecredits>(OpenClipArt.org)</imagecredits>
</section>
<section>
<img src="graphics/database-server-800px.png" height="300" style=" vertical-align: middle;" />
<span style="font-size: 3em; vertical-align: middle;">+&nbsp;?&nbsp;</span>
<img src="graphics/crystalball-800px.png" height="300" style=" vertical-align: middle;" />
<h4>... and basks in the limitless possibilities of big data.</h4>
<imagecredits>(OpenClipArt.org)</imagecredits>
</section>
</section>
<section>
<section>
<h2>Why is this a fairy tale?</h2>
</section>
<section>
<img src="graphics/matt-icons_text-x-log-300px.png" height="300" style=" vertical-align: middle;"/>
<span style="font-size: 3em; vertical-align: middle;"></span>
<img src="graphics/database-server-800px.png" height="300" style=" vertical-align: middle;" />
<h4>It's never this easy...</h4>
</section>
</section>
<section>
<section>
<h2>CSV Import</h2>
<h4>Run a <code>SELECT</code> on a raw CSV File</h4>
<ul>
<li>File may not have column headers</li>
<li>CSV does not provide "types"</li>
<li>Lines may be missing fields</li>
<li>Fields may be mistyped (typo, missing comma)</li>
<li>Comment text can be inlined into the file</li>
</ul>
<!-- <p>
<b>State of the art</b>: External Table Defn <span>+ "Manually" edit CSV</span>
</p>
--> </section>
<section>
<h2>Merge Data From Two Sources</h2>
<h4><code>UNION</code> two data sources</h4>
<ul>
<li>Schema matching</li>
<li>Deduplication</li>
<li>Format alignment (GIS coordinates, $ vs €)</li>
<li>Precision alignment (State vs County)</li>
</ul>
<!-- <p>
<b>State of the art</b>: Manually map schema
</p>
--> </section>
<section>
<h2>JSON Shredding</h2>
<h4>Run a <code>SELECT</code> on JSON or a Doc Store</h4>
<ul>
<li>Separating fields and record sets:<br/>(e.g., <code>{ A: "Bob", B: "Alice" }</code>)</li>
<li>Missing fields (Records with no 'address')</li>
<li>Type alignment (Records with 'address' as an array)</li>
<li>Schema matching$^2$</li>
</ul>
<!-- <p>
<b>State of the art</b>: DataGuide, Wrangler, etc...
</p>
--> </section>
</section>
<section>
<section>
<img src="graphics/sad_alice.svg" width="200px" height="200px" />
</section>
<section>
<svg data-src="papers/heuristics.svg" height="600px" width="900px"/>
</section>
<section>
<p>We have tools that can solve these problems!</p>
<img src="graphics/happy_alice.svg" width="200px" height="200px" class="fragment" />
</section>
<section>
<img src="graphics/StateStreet.png"/>
<p>... most of the time</p>
<imagecredits>(google.com)</imagecredits>
</section>
<section>
<p>
<b>Problem:</b> It's hard to trust tools that can be wrong!
</p>
</section>
<section>
Your Pass Phrase for today is
<h2>Mary Wheeler</h2>
</section>
</section>
<section>
<section>
<h2>Options</h2>
<ol style="margin-top: 50px">
<li class="fragment">Ignore the Problem</li>
<li style="color: white;"></li>
<li style="color: white;"></li>
</ol>
</section>
<section>
<svg data-src="graphics/dataquality-normal.svg"/>
</section>
<section>
<p>
<b>In the name of Codd</b><br/>Thou shalt not give the user a wrong answer.
</p>
<p style="margin-top: 50px;" class="fragment">
... but this assumes that we start with perfect data.
</p>
</section>
<section>
<svg data-src="graphics/blackbox.svg" class="stretch" />
</section>
<section>
<img src="graphics/dataquality-poop.svg"/>
</section>
<section>
<img src="graphics/obamacare_stats_fail.jpg" height="400" />
<imagecredits>(Fox News)</imagecredits>
</section>
</section>
<section>
<section>
<h2>Options</h2>
<ol style="margin-top: 50px">
<li style="text-decoration: line-through; color: grey;">Ignore the Problem</li>
<li class="fragment">Heresy</li>
<li style="color: white;"></li>
</ol>
</section>
<section>
<h3>On representing incomplete information in a relational data base</h3>
<p style="font-size: smaller">T. Imielinski &amp; W. Lipski Jr.<span style="margin-left: 40px">(<i>VLDB 1981</i>)</span></p>
</section>
<section>
<h1>But...</h1>
</section>
<section>
<img src="graphics/probdb-query.svg" class="stretch"/>
<p><b>1.</b> ProbDBs Produce Probability Distributions as Outputs</p>
</section>
<section>
<img src="graphics/probdb-init.svg" class="stretch"/>
<p><b>2.</b> ProbDBs Require Probability Distributions as Inputs</p>
</section>
<section>
<canvas data-chart="bar">
<!--
{"options": {
"title" : {
"display" : true,
"text" : ["Query Performance on PDBench/TPCH","(1 GB; 5 min timeout)"]
},
"scales": {
"yAxes": [{
"scaleLabel": {
"display": true,
"labelString": "Query Runtime (s)"
}
}]
}
}}
-->
Label , PDB-1, PDB-2, PDB-3, TPCH-1, TPCH-3, TPCH-5, TPCH-9
SQLite , 9.521, 7.59, 31.22, 19.561, 22.835, 33.308, 51.125
MayBMS-SQLite , 22.1345477, 7.291376699999999, 29.1511957
MayBMS-PGSql , 23.439012999999996, 13.000651999999999, 20.2954832
Sampling (x10), 300, 242.5666234549135, 300, 119.61607021316885, 162.00108394436538, 258.74168805666267, 300
</canvas>
</section>
<section>
<h2>Probabilistic Databases...</h2>
<ol style="margin-top: 50px;">
<li>... require probabilities as inputs</li>
<li>... produce probabilities as outputs</li>
<li>... are slow</li>
</ol>
</section>
<section>
<h2>Options</h2>
<ol style="margin-top: 50px">
<li style="text-decoration: line-through; color: grey;">Ignore the Problem</li>
<li style="text-decoration: line-through; color: grey;">Heresy</li>
<li>?</li>
</ol>
</section>
</section>
<section>
<section>
<h2>Declarative Uncertainty</h2>
</section>
<section>
<img src="graphics/mimir_logo_final.png" alt="Mimir" height="150px" style="vertical-align: middle"/>
<p>The Uncertainty Management System</p>
<p><a href="http://mimirdb.info">http://mimirdb.info</a></p>
</section>
<section>
<svg data-src="graphics/mimir-workflow.svg" />
</section>
<section>
<p>At each step, Mimir tracks ambiguity and potential errors.</p>
<ul>
<li>A row that may or may not exist.</li>
<li>An attribute value that is missing or ambiguous.</li>
<li>A table with multiple possible schemas.</li>
<li style="color: #999">A violated constraint.</li>
</ul>
</section>
<section>
<p>Declarative uncertainty requires...</p>
<ol>
<li class="fragment" data-fragment-index="1">... uncertainty capture</li>
<li class="fragment" data-fragment-index="2"><span class="fragment highlight-current-red" data-fragment-index="4">... query processing over uncertain data</span></li>
<li class="fragment" data-fragment-index="3"><span class="fragment highlight-current-red" data-fragment-index="4">... intuitive and qualitative presentation of uncertainty</span></li>
</ol>
</section>
<section>
<h3>Uncertainty-Annotated Databases</h3>
<p style="font-size: smaller">(Joint work with Boris Glavic, Su Feng, Aaron Huber)</p>
<div style="margin-top: 50px" class="fragment">
<h3>Other Projects</h3>
<ul>
<li>Adaptive Schemas</li>
<li>Probabilistic Query Compilers</li>
</ul>
</div>
</section>
</section>
<section>
<section>
<h2>Background</h2>
<ol>
<li>Possible Worlds</li>
<li>$K$-Relations</li>
<li>$K^W$-Relations</li>
</ol>
</section>
<section>
<svg data-src="graphics/possibleworlds.svg" />
</section>
<section>
<h2>$K$-Relations</h2>
</section>
<section>
<h3>Provenance Semirings</h3>
<p style="font-size: smaller">T.J. Green &amp; G. Karvounarakis &amp; V. Tannen<span style="margin-left: 40px">(<i>PODS 2007</i>)</span></p>
</section>
<section>
<table><tr>
<td><table>
<tr><th style="border-right: solid 1px">R</th><th>A</th><th>B</th></tr>
<tr><td style="border-right: solid 1px"> </td><td>1</td><td>2</td></tr>
<tr><td style="border-right: solid 1px"> </td><td>1</td><td>3</td></tr>
<tr><td style="border-right: solid 1px"> </td><td>4</td><td>3</td></tr>
</table></td>
<td style="width: 50px"></td>
<td><table>
<tr><th style="border-right: solid 1px">S</th><th>B</th><th>C</th></tr>
<tr><td style="border-right: solid 1px"> </td><td>2</td><td>5</td></tr>
<tr><td style="border-right: solid 1px"> </td><td>3</td><td>6</td></tr>
<tr><td style="border-right: solid 1px"> </td><td>3</td><td>6</td></tr>
</table></td>
</tr></table>
<p class="fragment">The relational view</p>
</section>
<section>
<p>The functional view</p>
<p>
$$R(1, 2) \mapsto 1$$
$$R(1, 3) \mapsto 1$$
$$R(4, 3) \mapsto 1$$
$$S(2, 5) \mapsto 1$$
$$S(3, 6) \mapsto 2$$
</p>
<p class="fragment">
$$R(4, 5) \mapsto 0$$
</p>
</section>
<section>
$$[R_1 \cup R_2](\vec X) \equiv R_1(\vec X) + R_2(\vec X)$$
<div style="margin-top: 100px; width: 500px; margin-left: auto; margin-right: auto; text-align: left;" class="fragment">
$[S \cup S](3, 6)$
<p class="fragment" style="margin-left: 100px;">$= S(3, 6) + S(3, 6)$</p>
<p class="fragment" style="margin-left: 100px;">$= 2 + 2 = 4$</p>
</div>
</section>
<section>
$$[R_1 \bowtie R_2](\vec X) \equiv R_1(\vec X) \times R_2(\vec X)$$
<div style="margin-top: 100px; width: 500px; margin-left: auto; margin-right: auto; text-align: left;" class="fragment">
$[R \bowtie S](4, 3, 6)$
<p class="fragment" style="margin-left: 100px;">$= R(4, 3) \times S(3, 6)$</p>
<p class="fragment" style="margin-left: 100px;">$= 1 \times 2 = 2$</p>
</div>
</section>
<section>
$$[\pi_{\vec A} R](\vec X) \equiv \sum_{\vec Y} R(\vec X \vec Y)$$
<!-- \in adom(sch(R)-A) -->
<div style="margin-top: 100px; width: 500px; margin-left: auto; margin-right: auto; text-align: left;" class="fragment">
$[\pi_{B} R](3)$
<p class="fragment" style="margin-left: 100px;">$= \sum_{Y} R(Y, 3)$</p>
<p class="fragment" style="margin-left: 100px;">$ = R(1, 3) + R(4, 3) + \ldots$</p>
<p class="fragment" style="margin-left: 100px;">$= 1 + 1 + 0 = 2$</p>
</div>
</section>
<section>
<table>
<tr><td>$\cup$ </td><td style="width: 100px; font-size: smaller;">$\approx$</td><td>$+$</td></tr>
<tr><td>$\bowtie$</td><td style="width: 100px; font-size: smaller;">$\approx$</td><td>$\times$</td></tr>
<tr><td>$\pi$ </td><td style="width: 100px; font-size: smaller;">$\approx$</td><td>$+$</td></tr>
</table>
</section>
<section>
$$\left\langle\;\mathcal K,\;\oplus,\;\otimes,\;\mathbb 0,\;\mathbb 1\;\right\rangle$$
<table style="margin-top: 50px;" class="fragment">
<tr>
<th>Semiring</th><th>Equivalent Query Semantics</th>
</tr>
<tr>
<td>$\left\langle\mathbb N, +, \times, 0, 1\right\rangle$</td>
<td>Bag Semantics</td>
</tr>
<tr class="fragment">
<td>$\left\langle\mathbb B, \vee, \wedge, \bot, \top\right\rangle$</td>
<td>Set Semantics</td>
</tr>
<tr class="fragment">
<td>$\left\langle\mathcal K^W, \vec \oplus, \vec \otimes, \mathbb{\vec 0}, \mathbb{\vec 1}\right\rangle$</td>
<td>Possible Worlds Semantics</td>
</tr>
</table>
</section>
</section>
<section>
<section>
<h2>$K^W$-Relations</h2>
</section>
<section>
<table>
<tr><th style="border-right: solid 1px">R</th><th>A</th><th>B</th></tr>
<tr><td style="border-right: solid 1px"> </td><td>1</td><td>2</td></tr>
<tr><td style="border-right: solid 1px"> </td><td>1</td><td>3</td></tr>
<tr><td style="border-right: solid 1px"> </td><td><img src="graphics/nine_or_four.svg" style="height: 30px; margin: 0px;"></td><td>3</td></tr>
</table>
</section>
<section>
<table><tr>
<td><table>
<tr><th style="border-right: solid 1px">$R_1$</th><th>A</th><th>B</th></tr>
<tr><td style="border-right: solid 1px"> </td><td>1</td><td>2</td></tr>
<tr><td style="border-right: solid 1px"> </td><td>1</td><td>3</td></tr>
<tr><td style="border-right: solid 1px"> </td><td style="color: red">4</td><td>3</td></tr>
</table></td>
<td style="width: 50px"></td>
<td><table>
<tr><th style="border-right: solid 1px">$R_2$</th><th>A</th><th>B</th></tr>
<tr><td style="border-right: solid 1px"> </td><td>1</td><td>2</td></tr>
<tr><td style="border-right: solid 1px"> </td><td>1</td><td>3</td></tr>
<tr><td style="border-right: solid 1px"> </td><td style="color: red">9</td><td>3</td></tr>
</table></td>
</tr></table>
</section>
<section>
<table>
<tr><th style="border-right: solid 1px">R</th><th>A</th><th>B</th><td></td></tr>
<tr><td style="border-right: solid 1px"> </td><td>1</td><td>2</td><td>$\mapsto [1,1]$</td></tr>
<tr><td style="border-right: solid 1px"> </td><td>1</td><td>3</td><td>$\mapsto [1,1]$</td></tr>
<tr><td style="border-right: solid 1px"> </td><td>4</td><td>3</td><td>$\mapsto [1,0]$</td></tr>
<tr><td style="border-right: solid 1px"> </td><td>9</td><td>3</td><td>$\mapsto [0,1]$</td></tr>
</table>
</section>
<section>
<h3>Summarizing Possible Worlds</h3>
<p>
$$\mathcal K^W \rightarrow \mathcal K$$
(plug in any $K$-Relation-compatible $\mathcal K$)
</p>
<dl style="font-size: 80%">
<dt class="fragment" data-fragment-index="1" style="margin-top: 50px;">Annotation in World $i$</dt>
<dd class="fragment" data-fragment-index="1">$\texttt{PW}_i(\vec k) \equiv \vec k_i$</dd>
<dt class="fragment" data-fragment-index="2" style="margin-top: 20px;">Certain Annotation</dt>
<dd class="fragment" data-fragment-index="2">$\mathcal C(\vec k) \equiv min(\vec k)$</dd>
<dt class="fragment" data-fragment-index="3" style="margin-top: 20px;">Possible Annotation</dt>
<dd class="fragment" data-fragment-index="3">$\mathcal P(\vec k) \equiv max(\vec k)$</dd>
</dl>
<p style="font-size: 50%" class="fragment">Certain/Possible mirrors "Correctness of SQL Queries on Databases with Nulls" [Guagliardo, Libkin 2017]</p>
</section>
<section>
<table>
<tr>
<td>
<table>
<tr><th style="border-right: solid 1px">R</th><th>A</th><th>B</th><td></td></tr>
<tr><td style="border-right: solid 1px"> </td><td>1</td><td>2</td><td>$\mapsto [1,1]$</td></tr>
<tr><td style="border-right: solid 1px"> </td><td>1</td><td>3</td><td>$\mapsto [1,1]$</td></tr>
<tr><td style="border-right: solid 1px"> </td><td>4</td><td>3</td><td>$\mapsto [1,0]$</td></tr>
<tr><td style="border-right: solid 1px"> </td><td>9</td><td>3</td><td>$\mapsto [0,1]$</td></tr>
</table>
</td>
<td style="padding-left: 50px;">
<p class="fragment" data-fragment-index="1">$$\texttt{PW}_0(R(1, 2)) = 1$$</p>
<p class="fragment" data-fragment-index="2">$$\texttt{PW}_0(R(4, 3)) = 1$$</p>
<p class="fragment" data-fragment-index="3">$$\texttt{PW}_1(R(4, 3)) = 0$$</p>
<p class="fragment" data-fragment-index="4">$$\mathcal C(R(4, 3)) = 0$$</p>
<p class="fragment" data-fragment-index="5">$$\mathcal P(R(4, 3)) = 1$$</p>
</td>
</tr>
</table>
</section>
</section>
<section>
<section>
<p>A quick step back into reality...</p>
<img src="graphics/dagobert83-female-user-icon-800px.png" height="100px" class="fragment">
</section>
<section>
<img src="graphics/probdb-query.svg" height="200px"/>
<img src="graphics/sad_alice.svg" width="200px" height="200px" style="margin-left: 50px" class="fragment" />
</section>
<section>
<table style="display: inline-block;">
<tr><th style="border-right: solid 1px">R</th><th>A</th><th>B</th></tr>
<tr><td style="border-right: solid 1px"> </td><td>1</td><td>2</td></tr>
<tr><td style="border-right: solid 1px"> </td><td>1</td><td>3</td></tr>
<tr><td style="border-right: solid 1px"> </td><td>4 or 9</td><td>3</td></tr>
</table>
<p>&nbsp;</p>
</section>
<section>
<table style="display: inline-block;">
<tr><th style="border-right: solid 1px">R</th><th>A</th><th>B</th></tr>
<tr><td style="border-right: solid 1px"> </td><td>1</td><td>2</td></tr>
<tr><td style="border-right: solid 1px"> </td><td>1</td><td>3</td></tr>
<tr><td style="border-right: solid 1px"> </td><td>4 <span style="color: lightgrey; text-decoration: line-through red;">or 9</span></td><td>3</td></tr>
</table>
<p><b>Standard practice:</b> "Just use the best option."</p>
</section>
<section>
<img src="graphics/Mickey12.png">
</section>
<section>
<p>What's in between these extremes?</p>
</section>
<section>
<table style="display: inline-block;">
<tr><th style="border-right: solid 1px">R</th><th>A</th><th>B</th><td></td></tr>
<tr><td style="border-right: solid 1px"> </td><td>1</td><td>2</td><td></td></tr>
<tr><td style="border-right: solid 1px"> </td><td>1</td><td>3</td><td></td></tr>
<tr class="fragment highlight-current-red" data-fragment-index="2"><td style="border-right: solid 1px"> </td><td>4</td><td>3</td><td style="font-weight: bold;" class="fragment" data-fragment-index="1">*</td></tr>
</table>
<img src="graphics/happy_alice.svg" width="200px" height="200px" style="margin-left: 150px" class="fragment" data-fragment-index="4" />
<p class="fragment" data-fragment-index="3">Use the best option, but mark potential errors.</p>
</section>
<section>
To answer $Q(\mathcal D)$ we want...
<table style="margin-top: 70px;">
<tr class="fragment">
<td>$\texttt{PW}_i(Q(\mathcal D))$</td>
<td>The results Alice would have "just used".</td>
</tr>
<tr style="height: 50px;"><td></td><td></td></tr>
<tr class="fragment">
<td>$\mathcal C(Q(\mathcal D))$</td>
<td>Which of those results are trustworthy.</td>
</tr>
</table>
</section>
</section>
<section>
<section>
$$\texttt{PW}_i(Q(\mathcal D)) \equiv Q(\texttt{PW}_i(\mathcal D))$$
<p class="fragment">(Computing $\texttt{PW}_i(Q(\mathcal D))$ is cheap!)</p>
</section>
<section>
<p>Can we do the same thing for $\mathcal C(Q(\mathcal D))$?</p>
<p class="fragment">$$\mathcal C(Q(\mathcal D)) \stackrel{?}{=} Q(\mathcal C(\mathcal D))$$</p>
</section>
<section>
<h1>No.</h1>
</section>
<section>
<table>
<tr><th style="border-right: solid 1px">R</th><th>A</th><th>B</th><td></td><th>$K^W$</th><th>$\mathcal C$</th></tr>
<tr><td style="border-right: solid 1px"> </td><td>1</td><td>2</td><td>$\mapsto$</td><td>$[1,1]$</td><td>1</td></tr>
<tr><td style="border-right: solid 1px"> </td><td>1</td><td>3</td><td>$\mapsto$</td><td>$[1,1]$</td><td>1</td></tr>
<tr><td style="border-right: solid 1px"> </td><td>4</td><td>3</td><td>$\mapsto$</td><td>$[1,0]$</td><td>0</td></tr>
<tr><td style="border-right: solid 1px"> </td><td>9</td><td>3</td><td>$\mapsto$</td><td>$[0,1]$</td><td>0</td></tr>
</table>
<p class="fragment">Compute $\pi_B(R)$</p>
</section>
<section>
<table>
<tr><th style="border-right: solid 1px">$\pi_B$R</th><th>B</th><td></td><th>$K^W$</th><th>$\mathcal C$</th></tr>
<tr><td style="border-right: solid 1px"> </td><td>2</td><td>$\mapsto$</td><td>$[1,1]$</td><td class="fragment">$\mathcal C([1,1]) = 1$</td></tr>
<tr><td style="border-right: solid 1px"> </td><td>3</td><td>$\mapsto$</td><td>$[2,2]$</td><td class="fragment">$\mathcal C([1,1])+\mathcal C([1,0])+\mathcal C([0,1])$</td></tr>
</table>
<p class="fragment">$=1+0+0=1$ <span class="fragment">$\neq \mathcal C([2,2])$</span></p>
</section>
<section>
<h2>So what <u>can</u> we do with $\mathcal C$?</h2>
</section>
<section>
<h3>We can Approximate</h3>
<dl>
<dt>(C-) Soundness</dt>
<dd class="fragment" data-fragment-index="1">$Q(\mathcal C(\mathcal D)) \leq \mathcal C(Q(\mathcal D))$</dd>
<dd class="fragment" data-fragment-index="2">We can efficiently compute a conservative approximation of $\mathcal C$.</dd>
<dt>Completeness (for some queries)</dt>
<dd class="fragment" data-fragment-index="3">$Q(\mathcal C(\mathcal D)) = \mathcal C(Q(\mathcal D))$ ...if $Q$ is <i>???</i> (work in progress)</dd>
</dl>
</section>
<section>
<p>... also attribute level uncertainty</p>
<p class="fragment">(but not today)</p>
</section>
</section>
<section>
<section>
<h3>We implemented it...</h3>
</section>
<section>
<!-- <img src="graphics/system.png"> -->
<svg data-src="graphics/system.svg" class="stretch"/>
<imagecredits>Su Feng</imagecredits>
</section>
<section>
<h3>Defining Possible Worlds</h3>
<p>Mimir allows users to define special UDFs called <i>Models</i>.</p>
<pre><code class="sql">
CREATE MODEL TYPE Geocoder AS mimir.models.GeocodingModel;
CREATE MODEL INSTANCE Text_To_Loc USING Geocoder('Google');
SELECT C.name, C.id, Text_To_Loc(C.address) AS address
FROM Customer C;
</code></pre>
<p style="font-size: small;" class="fragment">(Not actual Mimir-SQL. Language adapted for your viewing pleasure.)</p>
<p style="text-align: left; width: 600px; margin-left: auto; margin-right: auto;" class="fragment">Models...<br/>... return one <b>best guess</b><br/>... define the space of alternatives</p>
</section>
<section>
<h3>Example Models</h3>
<ul>
<li>Geocoding Addresses</li>
<li>Imputation using a SparkML classifier</li>
<li>Heuristic detection of order-by columns for interpolation</li>
<li>Schema matching based on edit-distance</li>
<li>MayBMS-style probabilistic repair-key</li>
<li>And more...</li>
</ul>
</section>
<section>
<h3>Convenience Operators: Lenses</h3>
<p>Lenses instantiate/train a model and wrap a query</p>
<ul style="font-size: 16pt">
<li>Domain Constraint Repair / Missing Value Imputation</li>
<li>Schema Matching</li>
<li>Sequence Repair</li>
<li>Key Repair</li>
<li>Arbitrary Choice</li>
<li>Type Detection *</li>
<li>Header Detection *</li>
<li>JSON Shredder *</li>
</ul>
</section>
<section>
<p>Evaluation handled by a DBMS or Spark<br/>via query rewriting using GProM.</p>
<pre><code class="sql">
SELECT C.name, C.id, Text_To_Loc(C.address) AS address
FROM Customer C;
</code></pre>
<p>becomes...</p>
<pre><code class="sql">
SELECT C.name, C.id, Text_To_Loc(C.address) AS address,
1 AS name_certain, 1 AS id_certain,
0 AS address_certain, 1 AS row_certain
FROM Customer C;
</code></pre>
</section>
<section>
<canvas data-chart="bar">
<!--
{"options": {
"title" : {
"display" : true,
"text" : ["Query Performance on PDBench"," (1 GB; 10% Uncertainty)"]
},
"scales": {
"yAxes": [{
"ticks" : {
"beginAtZero" : true
},
"scaleLabel": {
"display": true,
"labelString": "Query Runtime (s)"
}
}]
}
}}
-->
Label, PDB-1, PDB-2, PDB-3
Deterministic, 4.714, 4.073, 5.238
Mimir+GProM+SQLite, 4.962, 4.257, 6.989
MayBMS, 21.814, 9.171, 18.137
</canvas>
</section>
</section>
<section>
<p>A few more things we're doing with Mimir...</p>
</section>
<section>
<section>
<h3>Adaptive Schemas</h3>
<ul style="font-size: 16pt" class="fragment">
<li>Domain Constraint Repair / Missing Value Imputation</li>
<li>Schema Matching</li>
<li>Sequence Repair</li>
<li>Key Repair</li>
<li>Arbitrary Choice</li>
<li>Type Detection *</li>
<li>Header Detection *</li>
<li>JSON Shredder *</li>
</ul>
</section>
<section>
<h3>Adaptive Schemas</h3>
<ul style="font-size: 16pt">
<li style="color: lightgrey;">Domain Constraint Repair / Missing Value Imputation</li>
<li style="color: lightgrey;">Schema Matching</li>
<li style="color: lightgrey;">Sequence Repair</li>
<li style="color: lightgrey;">Key Repair</li>
<li style="color: lightgrey;">Arbitrary Choice</li>
<li>Type Detection *</li>
<li>Header Detection *</li>
<li>JSON Shredder *</li>
</ul>
</section>
<section>
<pre><code class="sql">
LOAD 'customers.csv';
SELECT name FROM customers WHERE last_purchase &lt; LAST_WEEK();
</code></pre>
</section>
<section>
<h3>How does the system know...</h3>
<dl>
<dt style="margin-top: 50px">... which column is 'name'?</dt>
<dd class="fragment">Guess that row 1 is headers.</dd>
<dt style="margin-top: 50px">... that 'last_purchase' is a <span style="font-family: Courier">date</span>?</dt>
<dd class="fragment">All rows look like <span style="font-family: Courier">YYYY-MM-DD</span></dd>
</dl>
<p class="fragment">This is all guesswork!</p>
</section>
<section>
<p><b>Idea:</b> Make the System Catalog a Probabilistic Table</p>
</section>
</section>
<section>
<section>
<h3>Probabilistic Query Compilers</h3>
<p style="margin-top: 100px">Sampling from ProbDBs is Sloooow</p>
</section>
<section>
<h3>Trivial Sampling</h3>
<p>Evaluate the query $N$ times.<br/>Plug in samples instead of best guesses.</p>
<div class="fragment" style="margin-top: 60px;">
<h3>Better Solutions</h3>
<p>Merge evaluation to mitigate redundancy.</p>
</div>
</section>
<section>
<h3>Sparse Encoding</h3>
<table>
<tr><td>
<table>
<tr><th style="border-right: 1px solid;">$R_1$</th><th>A</th><th>B</th></tr>
<tr><td style="border-right: 1px solid;"></td><td>1</td><td>2</td></tr>
<tr><td style="border-right: 1px solid;"></td><td>3</td><td>4</td></tr>
<tr><td></td><td></td><td></td></tr>
<tr><th style="border-right: 1px solid;">$R_2$</th><th>A</th><th>B</th></tr>
<tr><td style="border-right: 1px solid;"></td><td>1</td><td>5</td></tr>
</table>
</td><td style="vertical-align: middle;">
</td><td style="vertical-align: middle;">
<table>
<tr><th style="border-right: 1px solid;">$R_{sparse}$</th><th>A</th><th>B</th><th>S#</th></tr>
<tr><td style="border-right: 1px solid;"></td><td>1</td><td>2</td><td>1</td></tr>
<tr><td style="border-right: 1px solid;"></td><td>3</td><td>4</td><td>1</td></tr>
<tr><td style="border-right: 1px solid;"></td><td>1</td><td>5</td><td>2</td></tr>
</table>
</td></tr>
</table>
</section>
<section>
<h3>Tuple Bundles</h3>
<table>
<tr><td>
<table>
<tr><th style="border-right: 1px solid;">$R_1$</th><th>A</th><th>B</th></tr>
<tr><td style="border-right: 1px solid;"></td><td>1</td><td>2</td></tr>
<tr><td style="border-right: 1px solid;"></td><td>3</td><td>4</td></tr>
<tr><td></td><td></td><td></td></tr>
<tr><th style="border-right: 1px solid;">$R_2$</th><th>A</th><th>B</th></tr>
<tr><td style="border-right: 1px solid;"></td><td>1</td><td>5</td></tr>
</table>
</td><td style="vertical-align: middle;">
</td><td style="vertical-align: middle;">
<table>
<tr><th style="border-right: 1px solid;">$R_{bundle}$</th><th>A</th><th>B</th><th>$\phi$</th></tr>
<tr><td style="border-right: 1px solid;"></td><td>1</td><td>[2,5]</td><td>[T,T]</td></tr>
<tr><td style="border-right: 1px solid;"></td><td>3</td><td>4</td><td>[T,F]</td></tr>
</table>
</td></tr>
</table>
</section>
<section>
<canvas data-chart="bar">
<!--
{"options": {
"title" : {
"display" : true,
"text" : ["Query Performance on TPC-H"," (1 GB; 5 min timeout)"]
},
"scales": {
"yAxes": [{
"ticks" : {
"beginAtZero" : true
},
"scaleLabel": {
"display": true,
"labelString": "Query Runtime (s)"
}
}]
}
}}
-->
Label, TPCH-1, TPCH-3, TPCH-5
Sparse Tables, 119.6160702, 162.0010839, 258.7416881
Tuple Bundles, 14.65919489, 300, 300
</canvas>
</section>
<section>
<p><b>Idea:</b> Let the compiler pick the right representation<br/>(or combination)</p>
</section>
</section>
<section>
<img src="graphics/mimir_logo_final.png" alt="Mimir" height="150px" style="vertical-align: middle"/>
<p><a href="http://mimirdb.info">http://mimirdb.info</a></p>
</section>
<section>
<table style="display: inline-block;">
<tr>
<th colspan="3" style="font-size: 12pt">Students</th>
</tr>
<tr height="80px">
<td width="100px">
<img src="people/poonam.jpg" width="70px" height="80px" style="margin-bottom: 0px"/>
<p style="margin-top: 0px; font-size: 10pt;">Poonam<br/>(PhD-3Y)</p>
</td>
<td width="100px">
<img src="people/will.png" width="61px" height="80px" style="margin-bottom: 0px"/>
<p style="margin-top: 0px; font-size: 10pt;">Will<br/>(PhD-2Y)</p>
</td>
<td width="100px">
<img src="people/aaron.jpg" width="64px" height="80px" style="margin-bottom: 0px"/>
<p style="margin-top: 0px; font-size: 10pt; font-weight: bold;">Aaron<br/>(PhD-3Y)</p>
</td>
</tr>
</table>
<table style="display: inline-block; margin-left: 100px">
<tr>
<th colspan="1" style="font-size: 12pt">Dev</th>
</tr>
<tr>
<td width="100px">
<img src="people/mike.jpg" width="80px" height="80px" style="margin-bottom: 0px"/>
<p style="margin-top: 0px; font-size: 10pt;">Mike<br/>(Sr. Rsrch. Dev.)</p>
</td>
</tr>
</table>
<table>
<tr>
<th colspan="7" style="font-size: 12pt">Alumni</th>
</tr>
<tr height="80px">
<td width="100px">
<img src="people/ying.jpg" width="60px" height="80px" style="margin-bottom: 0px"/>
<p style="margin-top: 0px; font-size: 10pt;">Ying<br/>(PhD 2017)</p>
</td>
<td width="100px">
<img src="people/niccolo.png" width="50px" height="80px" style="margin-bottom: 0px"/>
<p style="margin-top: 0px; font-size: 10pt;">Niccolò<br/>(PhD 2016)</p>
</td>
<td width="100px">
<img src="people/arindam.jpg" width="80px" height="80px" style="margin-bottom: 0px"/>
<p style="margin-top: 0px; font-size: 10pt;">Arindam<br/>(MS 2016)</p>
</td>
<td width="100px">
<img src="people/shivang.jpg" width="55px" height="80px" style="margin-bottom: 0px"/>
<p style="margin-top: 0px; font-size: 10pt;">Shivang<br/>(MS 2018)</p>
</td>
<td width="100px">
<img src="people/olivia.png" width="50px" height="80px" style="margin-bottom: 0px"/>
<p style="margin-top: 0px; font-size: 10pt;">Olivia<br/>(BS 2017)</p>
</td>
<td width="100px">
<img src="people/gourab.jpg" width="80px" height="80px" style="margin-bottom: 0px"/>
<p style="margin-top: 0px; font-size: 10pt;">Gourab<br/>(MS 2018)</p>
</td>
</tr>
</table>
<table>
<tr>
<th colspan="4" style="font-size: 12pt">External Collaborators</th>
</tr>
<tr>
<td width="130px" style="font-size: 10pt;">
Dieter Gawlick<br/>(Oracle)
</td>
<td width="130px" style="font-size: 10pt;">
Zhen Hua Liu<br/>(Oracle)
</td>
<td width="130px" style="font-size: 10pt;">
Ronny Fehling<br/>(Airbus)
</td>
<td width="130px" style="font-size: 10pt;">
Beda Hammerschmidt<br/>(Oracle)
</td>
</tr>
</table>
<table style="margin-top: 5px">
<tr>
<td width="140px" style="font-size: 10pt; font-weight: bold;">
Boris Glavic<br/>(IIT)
</td>
<td width="140px" style="font-size: 10pt; font-weight: bold;">
Su Feng<br/>(IIT)
</td>
<td width="140px" style="font-size: 10pt;">
Juliana Freire<br/>(NYU)
</td>
<td width="140px" style="font-size: 10pt;">
Wolfgang Gatterbauer<br/>(NEU)
</td>
<td width="140px" style="font-size: 10pt;">
Heiko Mueller<br/>(NYU)
</td>
<td width="140px" style="font-size: 10pt;">
Remi Rampin<br/>(NYU)
</td>
</tr>
</table>
<p style="font-size: 10pt; text-decoration: underline;">Mimir is supported by NSF Award ACI-1640864, NPS Award N00244-16-1-0022, and gifts from Oracle</p>
</section>
</div></div>
<script src="../../reveal.js-3.7.0/lib/js/head.min.js"></script>
<script src="../../reveal.js-3.7.0/js/reveal.js"></script>
<script>
// Start the slide deck: presentation settings, chart styling defaults, and
// the list of plugin scripts to load.
// Full list of configuration options available at:
// https://github.com/hakimel/reveal.js#configuration
Reveal.initialize({
controls: false,
progress: true,
history: true,
center: true,
slideNumber: true,
transition: 'fade', // none/fade/slide/convex/concave/zoom
// Chart styling. Presumably consumed by the chart plugin (csv2chart.js) listed
// under `dependencies` below to render the canvas[data-chart] slides — verify
// against the plugin.
chart: {
// Shared defaults: dark grey (#333) text and grid lines with explicit font sizes.
defaults: {
global: {
title: { fontColor: "#333", fontSize: 24 },
legend: {
labels: { fontColor: "#333", fontSize: 20 },
},
// NOTE(review): Chart.js documents this option as `responsive`; confirm that
// anything actually reads a key named `responsiveness`.
responsiveness: true
},
scale: {
scaleLabel: { fontColor: "#333", fontSize: 20 },
gridLines: { color: "#333", zeroLineColor: "#333" },
ticks: { fontColor: "#333", fontSize: 16 },
}
},
// Colour lists keyed by chart type (line, bar, pie, radar).
line: { borderColor: [ "rgba(20,220,220,.8)" , "rgba(220,120,120,.8)", "rgba(20,120,220,.8)" ], "borderDash": [ [5,10], [0,0] ]},
// Four bar colours — presumably one per data series; the bar charts in this
// deck have at most four series.
bar: { backgroundColor: [
"rgba(220,220,220,0.8)",
"rgba(151,187,205,0.8)",
"rgba(205,151,187,0.8)",
"rgba(187,205,151,0.8)"
]
},
pie: { backgroundColor: [ ["rgba(0,0,0,.8)" , "rgba(220,20,20,.8)", "rgba(20,220,20,.8)", "rgba(220,220,20,.8)", "rgba(20,20,220,.8)"] ]},
radar: { borderColor: [ "rgba(20,220,220,.8)" , "rgba(220,120,120,.8)", "rgba(20,120,220,.8)" ]},
},
// Optional reveal.js plugins
dependencies: [
// classList polyfill: only requested when document.body.classList is missing.
{ src: '../../reveal.js-3.7.0/lib/js/classList.js', condition: function() { return !document.body.classList; } },
// Math plugin: the condition always returns true, so it is always requested.
// NOTE(review): the stock reveal.js math plugin takes the MathJax location from
// a top-level `math: { mathjax: ... }` option. Here the `mathjax` key sits on
// the dependency entry instead — confirm that it is honoured.
{ src: '../../reveal.js-3.7.0/plugin/math/math.js',
condition: function() { return true; },
mathjax: '../../reveal.js-3.7.0/js/MathJax.js'
},
// Markdown support: both scripts are requested only if the page contains a
// [data-markdown] element.
{ src: '../../reveal.js-3.7.0/plugin/markdown/marked.js', condition: function() { return !!document.querySelector( '[data-markdown]' ); } },
{ src: '../../reveal.js-3.7.0/plugin/markdown/markdown.js', condition: function() { return !!document.querySelector( '[data-markdown]' ); } },
// Syntax highlighting: requested only if a `pre code` element exists; the
// callback then calls hljs.initHighlightingOnLoad().
{ src: '../../reveal.js-3.7.0/plugin/highlight/highlight.js', async: true, condition: function() { return !!document.querySelector( 'pre code' ); }, callback: function() { hljs.initHighlightingOnLoad(); } },
{ src: '../../reveal.js-3.7.0/plugin/zoom-js/zoom.js', async: true },
{ src: '../../reveal.js-3.7.0/plugin/notes/notes.js', async: true },
// Chart.js library (Chart.min.js), listed ahead of the plugin that
// presumably depends on it.
{ src: '../../reveal.js-3.7.0/plugin/chart/Chart.min.js'},
// The chart plugin itself (csv2chart.js).
{ src: '../../reveal.js-3.7.0/plugin/chart/csv2chart.js'},
// SVG inlining scripts, both marked async: false. Presumably es6-promise is a
// Promise polyfill needed by data-src-svg.js, which in turn handles the
// svg[data-src] placeholders used on the slides — verify.
{ src: '../../reveal.js-3.7.0/plugin/svginline/es6-promise.auto.js', async: false },
{ src: '../../reveal.js-3.7.0/plugin/svginline/data-src-svg.js', async: false }
]
});
</script>
</body>
</html>