1270 lines
44 KiB
HTML
1270 lines
44 KiB
HTML
|
<!doctype html>
|
|||
|
<html lang="en">
|
|||
|
|
|||
|
<head>
|
|||
|
<meta charset="utf-8">
|
|||
|
|
|||
|
<title>Embracing Uncertainty</title>
|
|||
|
|
|||
|
<meta name="description" content="Mimir">
|
|||
|
<meta name="author" content="Oliver Kennedy">
|
|||
|
|
|||
|
<meta name="apple-mobile-web-app-capable" content="yes" />
|
|||
|
<meta name="apple-mobile-web-app-status-bar-style" content="black-translucent" />
|
|||
|
|
|||
|
<meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no, minimal-ui">
|
|||
|
|
|||
|
<link rel="stylesheet" href="../reveal.js-3.1.0/css/reveal.css">
|
|||
|
<link rel="stylesheet" href="ubodin.css" id="theme">
|
|||
|
|
|||
|
<!-- Code syntax highlighting -->
|
|||
|
<link rel="stylesheet" href="../reveal.js-3.1.0/lib/css/zenburn.css">
|
|||
|
|
|||
|
<!-- Printing and PDF exports -->
|
|||
|
<script>
|
|||
|
// Load a print stylesheet at runtime: pdf.css is chosen when the page URL's
// query string contains "print-pdf" (matched case-insensitively); otherwise
// paper.css is used. The resulting <link> is appended to the document <head>.
var link = document.createElement( 'link' );
|
|||
|
link.rel = 'stylesheet';
|
|||
|
link.type = 'text/css';
|
|||
|
link.href = window.location.search.match( /print-pdf/gi ) ? '../reveal.js-3.1.0/css/print/pdf.css' : '../reveal.js-3.1.0/css/print/paper.css';
|
|||
|
document.getElementsByTagName( 'head' )[0].appendChild( link );
|
|||
|
</script>
|
|||
|
|
|||
|
<!--[if lt IE 9]>
|
|||
|
<script src="../reveal.js-3.1.0/lib/js/html5shiv.js"></script>
|
|||
|
<![endif]-->
|
|||
|
</head>
|
|||
|
|
|||
|
<body>
|
|||
|
|
|||
|
<div class="reveal">
|
|||
|
<!-- Any section element inside of this container is displayed as a slide -->
|
|||
|
|
|||
|
<div class="header">
|
|||
|
<!-- Any Talk-Specific Header Content Goes Here -->
|
|||
|
Embracing Uncertainty
|
|||
|
</div>
|
|||
|
<div class="footer">
|
|||
|
<!-- Any Talk-Specific Footer Content Goes Here -->
|
|||
|
<div style="float: left; margin-top: 15px; ">
|
|||
|
Exploring <u><b>O</b></u>nline <u><b>D</b></u>ata <u><b>In</b></u>teractions
|
|||
|
</div>
|
|||
|
<img src="graphics/FullText-white.png" height="40" style="float: right;"/>
|
|||
|
</div>
|
|||
|
|
|||
|
<div class="slides">
|
|||
|
|
|||
|
<section>
|
|||
|
<h4>Embracing uncertainty with</h4>
|
|||
|
<img src="graphics/mimir_logo_final.png" alt="Mimir">
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<h4>Joint work with:</h4>
|
|||
|
<p style="text-align:left;"><small>
|
|||
|
<b>PhD Students</b>: Ying Yang, Will Spoth, Aaron Huber, Poonam Kumari, Jon Logan<br/>
|
|||
|
<b>BS Students</b>: Lisa Lu, Jacob P. Verghese<br/>
|
|||
|
<b>Alums</b>: Arindam Nandi, Niccolò Meneghetti (HPE/Vertica), Vinayak Karuppasamy (Bloomberg)<br/>
|
|||
|
<b>Collabs</b>: Ronny Fehling (Airbus), Zhen-Hua Liu (Oracle), Dieter Gawlick (Oracle), Beda Hammerschmidt (Oracle),
|
|||
|
Boris Glavic (IIT), Wolfgang Gatterbauer (CMU), Juliana Freire (NYU), Heiko Mueller (NYU), Moises Sudit (UB-ISE)
|
|||
|
</small></p>
|
|||
|
</section>
|
|||
|
|
|||
|
|
|||
|
<section>
|
|||
|
<section>
|
|||
|
<h3>A Big Data Fairy Tale</h3>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<img src="graphics/dagobert83-female-user-icon-800px.png" height="300" />
|
|||
|
<h4>Meet Alice</h4>
|
|||
|
|
|||
|
<attribution>(OpenClipArt.org)</attribution>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<img src="graphics/dagobert83-female-user-icon-800px.png" height="300" />
|
|||
|
<img src="graphics/littlestorefront-800px.png" height="300" />
|
|||
|
<h4>Alice has a Store</h4>
|
|||
|
|
|||
|
<attribution>(OpenClipArt.org)</attribution>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<img src="graphics/littlestorefront-800px.png" height="300" style=" vertical-align: middle;"/>
|
|||
|
<span style="font-size: 3em; vertical-align: middle;">→</span>
|
|||
|
<img src="graphics/matt-icons_text-x-log-300px.png" height="300" style=" vertical-align: middle;" />
|
|||
|
<h4>Alice's store collects sales data</h4>
|
|||
|
|
|||
|
<attribution>(OpenClipArt.org)</attribution>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<img src="graphics/dagobert83-female-user-icon-800px.png" height="300" style=" vertical-align: middle;"/>
|
|||
|
<span style="font-size: 3em; vertical-align: middle;">+</span>
|
|||
|
<img src="graphics/matt-icons_text-x-log-300px.png" height="300" style=" vertical-align: middle;" />
|
|||
|
<span style="font-size: 3em; vertical-align: middle;">=</span>
|
|||
|
<img src="graphics/saco-800px.png" height="300" style=" vertical-align: middle;" />
|
|||
|
<h4>Alice wants to use her sales data to run a promotion</h4>
|
|||
|
|
|||
|
<attribution>(OpenClipArt.org)</attribution>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<img src="graphics/matt-icons_text-x-log-300px.png" height="300" style=" vertical-align: middle;"/>
|
|||
|
<span style="font-size: 3em; vertical-align: middle;">→</span>
|
|||
|
<img src="graphics/database-server-800px.png" height="300" style=" vertical-align: middle;" />
|
|||
|
<h4>So Alice loads up her sales data in her trusty database/hadoop/spark/etc... server.</h4>
|
|||
|
|
|||
|
<attribution>(OpenClipArt.org)</attribution>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<img src="graphics/database-server-800px.png" height="300" style=" vertical-align: middle;" />
|
|||
|
<span style="font-size: 3em; vertical-align: middle;">+ ?</span>
|
|||
|
<h4>... asks her question ...</h4>
|
|||
|
|
|||
|
<attribution>(OpenClipArt.org)</attribution>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<img src="graphics/database-server-800px.png" height="300" style=" vertical-align: middle;" />
|
|||
|
<span style="font-size: 3em; vertical-align: middle;">+ ? →</span>
|
|||
|
<img src="graphics/crystalball-800px.png" height="300" style=" vertical-align: middle;" />
|
|||
|
<h4>... and basks in the limitless possibilities of big data.</h4>
|
|||
|
|
|||
|
<attribution>(OpenClipArt.org)</attribution>
|
|||
|
</section>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<section>
|
|||
|
<h2>Why is this a fairy tale?</h2>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<img src="graphics/matt-icons_text-x-log-300px.png" height="300" style=" vertical-align: middle;"/>
|
|||
|
<span style="font-size: 3em; vertical-align: middle;">→</span>
|
|||
|
<img src="graphics/database-server-800px.png" height="300" style=" vertical-align: middle;" />
|
|||
|
<h4>It's never this easy...</h4>
|
|||
|
</section>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<section>
|
|||
|
<h2>CSV Import</h2>
|
|||
|
<h4>Run a <code>SELECT</code> on a raw CSV File</h4>
|
|||
|
<ul class="fragment">
|
|||
|
<li>File may not have column headers</li>
|
|||
|
<li>CSV does not provide "types"</li>
|
|||
|
<li>Lines may be missing fields</li>
|
|||
|
<li>Fields may be mistyped (typo, missing comma)</li>
|
|||
|
<li>Comment text can be inlined into the file</li>
|
|||
|
</ul>
|
|||
|
<p class="fragment">
|
|||
|
<b>State of the art</b>: External Table Defn <span class="fragment">+ "Manually" edit CSV</span>
|
|||
|
</p>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<h2>Merge Two Datasets</h2>
|
|||
|
<h4><code>UNION</code> two data sources</h4>
|
|||
|
<ul class="fragment">
|
|||
|
<li>Schema matching</li>
|
|||
|
<li>Deduplication</li>
|
|||
|
<li>Format alignment (GIS coordinates, $ vs €)</li>
|
|||
|
<li>Precision alignment (State vs County)</li>
|
|||
|
</ul>
|
|||
|
<p class="fragment">
|
|||
|
<b>State of the art</b>: Manually map schema
|
|||
|
</p>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<h2>JSON Shredding</h2>
|
|||
|
<h4>Run a <code>SELECT</code> on JSON or a Doc Store</h4>
|
|||
|
<ul class="fragment">
|
|||
|
<li>Separating fields and record sets:<br/>(e.g., <code>{ A: "Bob", B: "Alice" }</code>)</li>
|
|||
|
<li>Missing fields (Records with no 'address')</li>
|
|||
|
<li>Type alignment (Records with 'address' as an array)</li>
|
|||
|
<li>Schema matching$^2$</li>
|
|||
|
</ul>
|
|||
|
<p class="fragment">
|
|||
|
<b>State of the art</b>: DataGuide, Wrangler, etc...
|
|||
|
</p>
|
|||
|
</section>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<section>
|
|||
|
<h2>Data Cleaning is Hard!</h2>
|
|||
|
<h2>Lots of potential data errors!</h2>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<h2>Running Example</h2>
|
|||
|
<pre><code>
|
|||
|
PID, RATING, REVIEW_CT
|
|||
|
P123, 4.5, 50.0
|
|||
|
P2345, A3, 245.0
|
|||
|
P124, 4.0, 100.0
|
|||
|
P325, 6.4, 30
|
|||
|
|
|||
|
</code></pre>
|
|||
|
<p class="fragment">'A3' is not a valid rating</p>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<h2>Enter the <tt>NULL</tt> Value</h2>
|
|||
|
|
|||
|
<table>
|
|||
|
<tr><th>PID</th><th>RATING</th><th>REVIEW_CT</th></tr>
|
|||
|
<tr><td>P123</td><td>4.5</td><td>50.0 </td></tr>
|
|||
|
<tr><td>P2345</td><td style="color: red">NULL</td><td>245.0</td></tr>
|
|||
|
<tr><td>P124</td><td>4.0</td><td>100.0</td></tr>
|
|||
|
<tr><td>P325</td><td>6.4</td><td>30</td></tr>
|
|||
|
</table>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<h2>3-Valued Logic</h2>
|
|||
|
|
|||
|
<p>$NULL > 4$ is <b>Unknown</b></p>
|
|||
|
<p class="fragment">(Same for $&lt;$, $\leq$, $\geq$, $\neq$, $=$)</p>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<h2>3-Valued Logic - AND</h2>
|
|||
|
|
|||
|
<table>
|
|||
|
<tr><th></th><th>True</th><th>False</th><th>Unknown</th></tr>
|
|||
|
<tr><th>True</th><td>True</td><td>False</td><td>Unknown</td></tr>
|
|||
|
<tr><th>False</th><td>False</td><td>False</td><td>False</td></tr>
|
|||
|
<tr><th>Unknown</th><td>Unknown</td><td>False</td><td>Unknown</td></tr>
|
|||
|
</table>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<h2>3-Valued Logic - OR</h2>
|
|||
|
|
|||
|
<table>
|
|||
|
<tr><th></th><th>True</th><th>False</th><th>Unknown</th></tr>
|
|||
|
<tr><th>True</th><td>True</td><td>True</td><td>True</td></tr>
|
|||
|
<tr><th>False</th><td>True</td><td>False</td><td>Unknown</td></tr>
|
|||
|
<tr><th>Unknown</th><td>True</td><td>Unknown</td><td>Unknown</td></tr>
|
|||
|
</table>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<h2>3-Valued Logic - NOT</h2>
|
|||
|
|
|||
|
<table>
|
|||
|
<tr><th>True</th><th>False</th><th>Unknown</th></tr>
|
|||
|
<tr><td>False</td><td>True</td><td>Unknown</td></tr>
|
|||
|
</table>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<h2>Query Evaluation</h2>
|
|||
|
<ul>
|
|||
|
<li><b>NULL</b> contaminates everything.</li>
|
|||
|
<li>Return only deterministically true rows.</li>
|
|||
|
</ul>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<h2>Limitations</h2>
|
|||
|
|
|||
|
<ul>
|
|||
|
<li>Unintuitive semantics: $$Unknown \equiv \neg Unknown$$</li>
|
|||
|
<li>Silently discarding unknown values: $$(Ratings_1 \bowtie Ratings_1) \subset Ratings_1$$</li>
|
|||
|
<li>Null represents a complete lack of information</li>
|
|||
|
</ul>
|
|||
|
</section>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<section>
|
|||
|
<h2>V-Tables</h2>
|
|||
|
|
|||
|
<table>
|
|||
|
<tr><th>PID</th><th>RATING</th><th>REVIEW_CT</th></tr>
|
|||
|
<tr><td>P123</td><td>4.5</td><td>50.0 </td></tr>
|
|||
|
<tr><td>P2345</td><td style="color: red">$NULL_1$</td><td>245.0</td></tr>
|
|||
|
<tr><td>P124</td><td>4.0</td><td>100.0</td></tr>
|
|||
|
<tr><td>P325</td><td>6.4</td><td>30</td></tr>
|
|||
|
</table>
|
|||
|
<p class="fragment">Label nulls with a Subscript</p>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<h2>Labeled Nulls</h2>
|
|||
|
|
|||
|
<table><tr><th>Expression</th><th>Truth Value</th></tr>
|
|||
|
<tr class="fragment"><td>$NULL_i = 4$</td><td><b>Unknown</b></td></tr>
|
|||
|
<tr class="fragment"><td>$NULL_i = NULL_j$ ($i \neq j$)</td><td><b>Unknown</b></td></tr>
|
|||
|
<tr class="fragment"><td>$NULL_i = NULL_j$ ($i = j$)</td><td><b>True</b></td></tr>
|
|||
|
</table>
|
|||
|
|
|||
|
<p class="fragment">Solves some erroneously discarded values:
|
|||
|
$$(Ratings_1 \bowtie Ratings_1) = Ratings_1$$
|
|||
|
(but not all)
|
|||
|
</p>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<h2>C-Tables</h2>
|
|||
|
|
|||
|
<table>
|
|||
|
<tr><th>PID</th><th>RATING</th><th>REVIEW_CT</th><th>$\phi$</th></tr>
|
|||
|
<tr><td>P123</td><td>4.5</td><td>50.0 </td><td>$\top$</td></tr>
|
|||
|
<tr><td>P2345</td><td style="color: red">$NULL_1$</td><td>245.0</td><td>$\top$</td></tr>
|
|||
|
<tr><td>P124</td><td>4.0</td><td>100.0</td><td>$\top$</td></tr>
|
|||
|
<tr><td>P325</td><td>6.4</td><td>30</td><td>$\top$</td></tr>
|
|||
|
</table>
|
|||
|
<p class="fragment">Add a 'local condition' column</p>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<h2>Selection</h2>
|
|||
|
|
|||
|
<p>$$\sigma_{RATING > 4} RATINGS_1$$</p>
|
|||
|
|
|||
|
<table class="fragment">
|
|||
|
<tr><th>PID</th><th>RATING</th><th>REVIEW_CT</th><th>$\phi$</th></tr>
|
|||
|
<tr><td>P123</td><td>4.5</td><td>50.0 </td><td>$\top$</td></tr>
|
|||
|
<tr class="fragment"><td>P2345</td><td style="color: red">$NULL_1$</td><td>245.0</td><td>$NULL_1 > 4$</td></tr>
|
|||
|
<tr><td>P325</td><td>6.4</td><td>30</td><td>$\top$</td></tr>
|
|||
|
</table>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<h2>(Bag-) Evaluation Semantics</h2>
|
|||
|
<small>
|
|||
|
<table>
|
|||
|
<tr><th>Expression</th><th>Evaluates To</th></tr>
|
|||
|
<tr>
|
|||
|
<td>$[[\pi_{a_i \leftarrow e_i}(R)]]_{CT}$ </td>
|
|||
|
<td>$\{\left< a_i:[[e_i(t)]]_{lazy}, \phi: t.\phi\right>\;|\;t \in [[R]]_{CT}\}$</td>
|
|||
|
</tr><tr>
|
|||
|
<td>$[[\sigma_\psi(R)]]_{CT}$ </td>
|
|||
|
<td>$\{\left< a_i: t.a_i, \phi: [[t.\phi \wedge \psi(t)]]_{lazy}\right>\;|\;t \in [[R]]_{CT} \wedge \left([[t.\phi \wedge \psi(t)]]_{lazy} \not \equiv \bot \right) \}$</td>
|
|||
|
</tr><tr>
|
|||
|
<td>$[[R\times S]]_{CT}$</td>
|
|||
|
<td>$\{\left< a_i: t_1.a_i, a_j: t_2.a_j, \phi: t_1.\phi \wedge t_2.\phi \right>\;|\;t_1 \in [[R]]_{CT} \wedge t_2 \in [[S]]_{CT}\}$</td>
|
|||
|
</tr><tr>
|
|||
|
<td>$[[R\uplus S]]_{CT}$</td>
|
|||
|
<td>$\{\left< a_i: t.a_i, \phi: t.\phi\right>\;|\;t \in ([[R]]_{CT} \uplus [[S]]_{CT})\}$</td>
|
|||
|
</tr>
|
|||
|
</table>
|
|||
|
</small>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<h2>C-Tables</h2>
|
|||
|
|
|||
|
<p>'Plug-in' a valuation to get a specific result.</p>
|
|||
|
|
|||
|
<p>For example, with $\{NULL_1 \rightarrow 5\}$</p>
|
|||
|
<table>
|
|||
|
<tr><th>PID</th><th>RATING</th><th>REVIEW_CT</th></tr>
|
|||
|
<tr><td>P123</td><td>4.5</td><td>50.0 </td></tr>
|
|||
|
<tr><td>P2345</td><td style="color: red">5</td><td>245.0</td></tr>
|
|||
|
<tr><td>P325</td><td>6.4</td><td>30</td></tr>
|
|||
|
</table>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<h2>C-Tables</h2>
|
|||
|
<ul>
|
|||
|
<li><b>Pro:</b> 'Hidden' correlations created by the query preserved</li>
|
|||
|
<li><b>Pro:</b> 'Model' for values decoupled from representation</li>
|
|||
|
<li><b>Con:</b> Expensive (e.g., Joins may degrade to cross-products)</li>
|
|||
|
<li><b>Con:</b> Databases don't support labeled nulls</li>
|
|||
|
<li><b>Con:</b> Generalized projection has side effects</li>
|
|||
|
</ul>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
$$\pi_{PID,\; A \leftarrow RATING,\; B \leftarrow \frac{RATING}{2}} (RATINGS_1)$$
|
|||
|
<table>
|
|||
|
<tr><th>PID</th><th>A</th><th>B</th><th>$\phi$</th></tr>
|
|||
|
<tr><td>P123</td><td>4.5</td><td>2.25</td><td>$\top$</td></tr>
|
|||
|
<tr><td>P2345</td><td style="color: red">$NULL_1$</td><td style="color: red">$NULL_2$</td><td>$\top$</td></tr>
|
|||
|
<tr><td>P124</td><td>4.0</td><td>2.0</td><td>$\top$</td></tr>
|
|||
|
<tr><td>P325</td><td>6.4</td><td>3.2</td><td>$\top$</td></tr>
|
|||
|
</table>
|
|||
|
<p class="fragment">(Must also record that $NULL_2 = \frac{NULL_1}{2}$)</p>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<h2>Generalized C-Tables</h2>
|
|||
|
<h4>aka PIP-Tables</h4>
|
|||
|
|
|||
|
<table>
|
|||
|
<tr><th>PID</th><th>A</th><th>B</th><th>$\phi$</th></tr>
|
|||
|
<tr><td>P123</td><td>4.5</td><td>2.25</td><td>$\top$</td></tr>
|
|||
|
<tr><td>P2345</td><td style="color: red">$NULL_1$</td><td style="color: red">$\frac{NULL_1}{2}$</td><td>$\top$</td></tr>
|
|||
|
<tr><td>P124</td><td>4.0</td><td>2.0</td><td>$\top$</td></tr>
|
|||
|
<tr><td>P325</td><td>6.4</td><td>3.2</td><td>$\top$</td></tr>
|
|||
|
</table>
|
|||
|
<p class="fragment">(Labeled Nulls + Lazy Arithmetic Expressions)</p>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<h2>C-Tables</h2>
|
|||
|
<ul>
|
|||
|
<li><b>Pro:</b> 'Hidden' correlations created by the query preserved</li>
|
|||
|
<li><b>Pro:</b> 'Model' for values decoupled from representation</li>
|
|||
|
<li><b>Con:</b> Expensive (e.g., Joins may degrade to cross-products)</li>
|
|||
|
<li style="color: red"><b>Con:</b> Databases don't support labeled nulls</li>
|
|||
|
<li style="text-decoration: line-through"><b>Con:</b> Generalized projection has side effects</li>
|
|||
|
</ul>
|
|||
|
</section>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
|
|||
|
<section>
|
|||
|
<h4>Back to the beginning</h4>
|
|||
|
<h2>How did the uncertainty arise?</h2>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<h3>VGTerm</h3>
|
|||
|
<p>$VGTerm(\ldots)$ constructs new variables<br/>(it's a Skolem function)</p>
|
|||
|
<ul>
|
|||
|
<li class="fragment">$VGTerm('X')$ constructs a new variable $X$</li>
|
|||
|
<li class="fragment">$VGTerm('X', 1)$ constructs a new variable $X_{1}$</li>
|
|||
|
<li class="fragment">$VGTerm('X', ROWID)$ evaluates $ROWID$ and then constructs a new variable $X_{ROWID}$</li>
|
|||
|
</ul>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<p>Mimir SQL allows the $VGTerm()$ operator to be inlined</p>
|
|||
|
<pre><code>
|
|||
|
SELECT A, VGTerm('X', B)+2 AS C FROM R;
|
|||
|
</code></pre>
|
|||
|
|
|||
|
<center><div style="width: 600px" class="fragment">
|
|||
|
<table style="float: left">
|
|||
|
<thead>
|
|||
|
<tr><th>A</th><th>B</th></tr>
|
|||
|
</thead><tbody>
|
|||
|
<tr><td>1</td><td>2</td></tr>
|
|||
|
<tr><td>3</td><td>4</td></tr>
|
|||
|
<tr><td>5</td><td>6</td></tr>
|
|||
|
</tbody>
|
|||
|
</table>
|
|||
|
<table style="float: right" class="fragment">
|
|||
|
<tr><th>A</th><th>C</th></tr>
|
|||
|
<tr><td>1</td><td>$X_2+2$</td></tr>
|
|||
|
<tr><td>3</td><td>$X_4+2$</td></tr>
|
|||
|
<tr><td>5</td><td>$X_6+2$</td></tr>
|
|||
|
</table>
|
|||
|
</div></center>
|
|||
|
<div style="clear: both;"> </div>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<pre><code>
|
|||
|
|
|||
|
SELECT PID,
|
|||
|
CASE WHEN CAN_CAST(RATING AS FLOAT)
|
|||
|
THEN CAST(RATING AS FLOAT)
|
|||
|
ELSE VGTerm('RATING', ROWID)
|
|||
|
END AS RATING,
|
|||
|
REVIEW_CT
|
|||
|
FROM RATINGS1;
|
|||
|
|
|||
|
|
|||
|
</code></pre>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<pre><code>
|
|||
|
CREATE VIEW FIXED_RATINGS AS
|
|||
|
SELECT PID,
|
|||
|
CASE WHEN CAN_CAST(RATING AS FLOAT)
|
|||
|
THEN CAST(RATING AS FLOAT)
|
|||
|
ELSE VGTerm('RATING', ROWID)
|
|||
|
END AS RATING,
|
|||
|
REVIEW_CT
|
|||
|
FROM RATINGS1;
|
|||
|
|
|||
|
|
|||
|
</code></pre>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<pre><code class="sql">
|
|||
|
CREATE VIEW FIXED_RATINGS AS SELECT ...
|
|||
|
</code></pre>
|
|||
|
|
|||
|
<div>
|
|||
|
<p>Behind the scenes, we also create a model...</p>
|
|||
|
<pre class="fragment"><code>
|
|||
|
SELECT * FROM RATINGS1;
|
|||
|
</code></pre>
|
|||
|
</div>
|
|||
|
<div class="fragment">
|
|||
|
<div style="font-size: 1em; vertical-align: middle;">↓</div>
|
|||
|
<div>
|
|||
|
<img src="graphics/weka.png" />
|
|||
|
</div>
|
|||
|
</div>
|
|||
|
<div class="fragment">
|
|||
|
<div style="font-size: 1em; vertical-align: middle;">↓</div>
|
|||
|
<div><p>A distribution for <small style="vertical-align: baseline;">$RATING_{ROWID}$</small></p></div>
|
|||
|
</div>
|
|||
|
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<pre><code>
|
|||
|
CREATE VIEW FIXED_RATINGS AS
|
|||
|
SELECT PID,
|
|||
|
CASE WHEN CAN_CAST(RATING AS FLOAT)
|
|||
|
THEN CAST(RATING AS FLOAT)
|
|||
|
ELSE VGTerm('RATING', ROWID)
|
|||
|
END AS RATING,
|
|||
|
REVIEW_CT
|
|||
|
FROM RATINGS1;
|
|||
|
|
|||
|
SELECT PID FROM FIXED_RATINGS WHERE RATING > 4
|
|||
|
</code></pre>
|
|||
|
<p style="font-size: 70%;">The query+view+model completely describe the distribution of possible results...</p>
|
|||
|
</section>
|
|||
|
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<section>
|
|||
|
<h2>... but how much detail do we actually need?</h2>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<h3>Uncertainty Representation User Study</h3>
|
|||
|
<p>Participants were shown a table of 3 products with 3 ratings (e.g., Amazon, Best Buy, Walmart) each</p>
|
|||
|
<p><b>Part 1</b>: The randomly generated ratings were biased to encourage a predictable, but mildly ambiguous ordering of the three products.</p>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<p><b>Part 2</b>: We used the same randomization, but this time we marked several of the values as uncertain:</p>
|
|||
|
<table>
|
|||
|
<tr><td>Red Text</td><td><span style="color: red">value</span></td></tr>
|
|||
|
<tr><td>Red Background</td><td><span style="background-color: red">value</span></td></tr>
|
|||
|
<tr><td>Asterisk</td><td>$value*$</td></tr>
|
|||
|
<tr><td>Tolerance</td><td>$value \pm tolerance$</td></tr>
|
|||
|
<tr><td>Range</td><td>$low – high$</td></tr>
|
|||
|
</table>
|
|||
|
</section>
|
|||
|
|
|||
|
|
|||
|
<section>
|
|||
|
<h3>Probability of Agreement With Elicited Order</h3>
|
|||
|
<img src="graphics/interfaces.png" />
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<h3>Time Taken</h3>
|
|||
|
<img src="graphics/PresentationTime.png" />
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<h2>Takeaway...</h2>
|
|||
|
<ol>
|
|||
|
<li>Small, '1-bit' representations can be sufficient</li>
|
|||
|
<li>Small, '1-bit' representations help users make faster decisions</li>
|
|||
|
</ol>
|
|||
|
</section>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<section>
|
|||
|
<h2>1-Bit Representations</h2>
|
|||
|
<ol>
|
|||
|
<li>Make a best guess (Maximize prior probability)</li>
|
|||
|
<li class="fragment">Plug in the best guess</li>
|
|||
|
<li class="fragment">Profit?</li>
|
|||
|
</ol>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<p>UDFs allow guesses to be plugged in</p>
|
|||
|
<pre><code>
|
|||
|
SELECT PID,
|
|||
|
CASE WHEN CAN_CAST(RATING AS FLOAT)
|
|||
|
THEN CAST(RATING AS FLOAT)
|
|||
|
ELSE MIMIR_VG_BESTGUESS('RATING', ROWID)
|
|||
|
END AS RATING,
|
|||
|
REVIEW_CT
|
|||
|
FROM RATINGS1;
|
|||
|
</code></pre>
|
|||
|
<p class="fragment">... but we lose track of which outputs are uncertain</p>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<h3>Provenance Recovers Explanations</h3>
|
|||
|
<pre><code>
|
|||
|
SELECT PID,
|
|||
|
CASE WHEN CAN_CAST(RATING AS FLOAT)
|
|||
|
THEN CAST(RATING AS FLOAT)
|
|||
|
ELSE MIMIR_VG_BESTGUESS('RATING', ROWID)
|
|||
|
END AS RATING,
|
|||
|
REVIEW_CT,
|
|||
|
TRUE AS MIMIR_IS_DETERMINISTIC_PID,
|
|||
|
CAN_CAST(RATING AS FLOAT) AS MIMIR_IS_DETERMINISTIC_RATING,
|
|||
|
TRUE AS MIMIR_IS_DETERMINISTIC_REVIEW_CT,
|
|||
|
TRUE AS MIMIR_ROW_IS_DETERMINISTIC
|
|||
|
FROM RATINGS1;
|
|||
|
</code></pre>
|
|||
|
</section>
|
|||
|
</section>
|
|||
|
|
|||
|
|
|||
|
<section>
|
|||
|
<section>
|
|||
|
<h3>Performance</h3>
|
|||
|
<p>PDBench: TPC-H Data, but add random FK violations.</p>
|
|||
|
<ul>
|
|||
|
<li><b>Query 1:</b> ~TPC-H Q3; 3-way FK Join with Predicates</li>
|
|||
|
<li><b>Query 2:</b> ~TPC-H Q6; Table Scan with Predicates.</li>
|
|||
|
<li><b>Query 3:</b> ~TPC-H Q7; 5-way Star Join with Predicates.</li>
|
|||
|
</ul>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<dl>
|
|||
|
<dt>Partition:</dt>
|
|||
|
<dd>Separate query fragments compute 'certain' results and one or more classes of uncertain results.</dd>
|
|||
|
<dt>TupleBundle:</dt>
|
|||
|
<dd>Compute and summarize 10 sampled results in parallel</dd>
|
|||
|
<dt>Inline:</dt>
|
|||
|
<dd>UDFs dynamically inject best guess values into the query.</dd>
|
|||
|
</dl>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<table>
|
|||
|
<tr><th>Strategy</th><th>Q1</th><th>Q2</th><th>Q3</th></tr>
|
|||
|
<tr><td>Inline</td><td>85.5s</td><td>676.6s</td><td>103.3s</td></tr>
|
|||
|
<tr><td>TupleBundle</td><td>8.2s</td><td>55.2s</td><td>9.8s</td></tr>
|
|||
|
<tr><td>Partition</td><td>>1hr</td><td>739.7s</td><td>>1hr</td></tr>
|
|||
|
</table>
|
|||
|
</section>
|
|||
|
</section>
|
|||
|
|
|||
|
|
|||
|
|
|||
|
|
|||
|
<section>
|
|||
|
<section>
|
|||
|
<img src="graphics/mimir_logo_final.png" height="200" alt="Mimir">
|
|||
|
<ul>
|
|||
|
<li>On-Demand Data Curation makes data exploration easier.</li>
|
|||
|
<li>"Best-Guess" results streamline analytics.
|
|||
|
<div> ... if the DB communicates the resulting uncertainty.</div></li>
|
|||
|
</ul>
|
|||
|
<p class="fragment"><b>Questions?</b></p>
|
|||
|
</section>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<section>
|
|||
|
<h1>Backup Slides</h1>
|
|||
|
</section>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<section>
|
|||
|
<h2>Industry says...</h2>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<img src="graphics/maybe-screen.png" height="500px" />
|
|||
|
|
|||
|
<img src="graphics/maybe-detail.png" height="500px" class="fragment" /><br/>
|
|||
|
<p class="fragment">My phone is guessing, but is letting me know that it did</p>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<img src="graphics/Calendar_Base.png" height="500px" />
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<img src="graphics/Calendar_Explain.png" height="500px" />
|
|||
|
<p>Easy interactions to <i>accept</i>, <i>reject</i>, or <i>explain</i> uncertainty</p>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<img src="graphics/BingTranslate.png" height="400px" />
|
|||
|
<p>Easy access to: Provenance, Alternatives, and Confidence</p>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<h2>Communication</h2>
|
|||
|
<ul>
|
|||
|
<li>Why is my data uncertain?</li>
|
|||
|
<li>How bad is it?</li>
|
|||
|
<li>What can I do about it?</li>
|
|||
|
</ul>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<h2>What if a database did the same?</h2>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<ul style="width:35%; font-size: 24pt; margin-top: 50px; margin-bottom: 100px; margin-right: 10px">
|
|||
|
<li class="fragment"><b>A:</b> Standard SQL.</li>
|
|||
|
<li class="fragment"><b>B:</b> Annotated Output.</li>
|
|||
|
<li class="fragment"><b>C:</b> Subway Diagram.</li>
|
|||
|
<li class="fragment"><b>D:</b> Result Explanations.</li>
|
|||
|
</ul>
|
|||
|
<a href="http://localhost:9000">
|
|||
|
<img src="graphics/UIExample.png" style="width:60%; float:right"/>
|
|||
|
</a>
|
|||
|
<b><a href="http://localhost:9000" class="fragment">Demo</a></b>
|
|||
|
</section>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<section>
|
|||
|
<h2>Mimir is a DB <u>Overlay</u></h2>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<svg width="500px" height="400px">
|
|||
|
<g>
|
|||
|
<g>
|
|||
|
<image xlink:href="graphics/db.svg" x="10" y="5" height="50px" width="50px"/>
|
|||
|
<text x="0" y="80" style="font-size:50%">(Any DB)</text>
|
|||
|
</g>
|
|||
|
<polygon
|
|||
|
points="0,0 120,0 105,-5 105,5 120,0"
|
|||
|
transform="translate(80,40)"
|
|||
|
style="
|
|||
|
stroke: black;
|
|||
|
fill: black;
|
|||
|
stroke-width: 2;
|
|||
|
"
|
|||
|
/>
|
|||
|
<g transform="translate(220,0)">
|
|||
|
<image xlink:href="graphics/primary-queries.svg" x="0" y="5" height="50px" width="50px"/>
|
|||
|
<text x="0" y="80" style="font-size:50%">(Lens)</text>
|
|||
|
</g>
|
|||
|
<polygon
|
|||
|
points="0,0 100,0 85,-5 85,5 100,0"
|
|||
|
transform="translate(290,40)"
|
|||
|
style="
|
|||
|
stroke: black;
|
|||
|
fill: black;
|
|||
|
stroke-width: 2;
|
|||
|
"
|
|||
|
/>
|
|||
|
<g transform="translate(400,0)">
|
|||
|
<image xlink:href="graphics/jean-victor-balin-icon-table.svg" x="0" y="10" height="50px" width="50px"/>
|
|||
|
<image xlink:href="graphics/jean-victor-balin-icon-table.svg" x="5" y="15" height="50px" width="50px"/>
|
|||
|
<image xlink:href="graphics/jean-victor-balin-icon-table.svg" x="10" y="20" height="50px" width="50px"/>
|
|||
|
<image xlink:href="graphics/jean-victor-balin-icon-table.svg" x="15" y="25" height="50px" width="50px"/>
|
|||
|
</g>
|
|||
|
</g>
|
|||
|
<g class="fragment">
|
|||
|
<polygon
|
|||
|
points="0,0 0,110 -5,95 5,95 0,110"
|
|||
|
transform="translate(200,100)"
|
|||
|
style="
|
|||
|
stroke: red;
|
|||
|
fill: red;
|
|||
|
stroke-width: 4;
|
|||
|
"
|
|||
|
/>
|
|||
|
<polygon
|
|||
|
points="0,0 0,110 -5,95 5,95 0,110"
|
|||
|
transform="translate(245,100)"
|
|||
|
style="
|
|||
|
stroke: red;
|
|||
|
fill: red;
|
|||
|
stroke-width: 4;
|
|||
|
"
|
|||
|
/>
|
|||
|
<polygon
|
|||
|
points="0,0 0,110 -5,95 5,95 0,110"
|
|||
|
transform="translate(290,100)"
|
|||
|
style="
|
|||
|
stroke: red;
|
|||
|
fill: red;
|
|||
|
stroke-width: 4;
|
|||
|
"
|
|||
|
/>
|
|||
|
<g transform="translate(0,200)">
|
|||
|
<g>
|
|||
|
<image xlink:href="graphics/db.svg" x="10" y="5" height="50px" width="50px"/>
|
|||
|
<text x="0" y="80" style="font-size:50%">(Any DB)</text>
|
|||
|
</g>
|
|||
|
<polygon
|
|||
|
points="0,0 120,0 105,-5 105,5 120,0"
|
|||
|
transform="translate(80,40)"
|
|||
|
style="
|
|||
|
stroke: black;
|
|||
|
fill: black;
|
|||
|
stroke-width: 2;
|
|||
|
"
|
|||
|
/>
|
|||
|
<polygon
|
|||
|
points="0,0 120,65 105,50 105,62 120,65"
|
|||
|
transform="translate(80,40)"
|
|||
|
style="
|
|||
|
stroke: black;
|
|||
|
fill: black;
|
|||
|
stroke-width: 2;
|
|||
|
"
|
|||
|
/>
|
|||
|
<polygon
|
|||
|
points="0,0 120,130 105,105 105,122 120,130"
|
|||
|
transform="translate(80,40)"
|
|||
|
style="
|
|||
|
stroke: black;
|
|||
|
fill: black;
|
|||
|
stroke-width: 2;
|
|||
|
"
|
|||
|
/>
|
|||
|
<g transform="translate(210,0)">
|
|||
|
<text x="0" y="45" style="font-size:50%; font-family: courier">SELECT</text>
|
|||
|
<polygon
|
|||
|
points="0,0 110,0 95,-5 95,5 110,0"
|
|||
|
transform="translate(80,40)"
|
|||
|
style="
|
|||
|
stroke: black;
|
|||
|
fill: black;
|
|||
|
stroke-width: 2;
|
|||
|
"
|
|||
|
/>
|
|||
|
<image xlink:href="graphics/jean-victor-balin-icon-table.svg" x="200" y="15" height="50px" width="50px"/>
|
|||
|
</g>
|
|||
|
<g transform="translate(210,65)">
|
|||
|
<text x="0" y="45" style="font-size:50%; font-family: courier">SELECT</text>
|
|||
|
<polygon
|
|||
|
points="0,0 110,0 95,-5 95,5 110,0"
|
|||
|
transform="translate(80,40)"
|
|||
|
style="
|
|||
|
stroke: black;
|
|||
|
fill: black;
|
|||
|
stroke-width: 2;
|
|||
|
"
|
|||
|
/>
|
|||
|
<image xlink:href="graphics/jean-victor-balin-icon-table.svg" x="200" y="15" height="50px" width="50px"/>
|
|||
|
</g>
|
|||
|
<g transform="translate(210,130)">
|
|||
|
<text x="0" y="45" style="font-size:50%; font-family: courier">SELECT</text>
|
|||
|
<polygon
|
|||
|
points="0,0 110,0 95,-5 95,5 110,0"
|
|||
|
transform="translate(80,40)"
|
|||
|
style="
|
|||
|
stroke: black;
|
|||
|
fill: black;
|
|||
|
stroke-width: 2;
|
|||
|
"
|
|||
|
/>
|
|||
|
<image xlink:href="graphics/jean-victor-balin-icon-table.svg" x="200" y="15" height="50px" width="50px"/>
|
|||
|
</g>
|
|||
|
</g>
|
|||
|
</g>
|
|||
|
<g transform="translate(220,230)" class="fragment">
|
|||
|
<text x="0" y="48" style="font-family: courier; font-size:40%">UNION</text>
|
|||
|
<text x="0" y="113" style="font-family: courier; font-size:40%">UNION</text>
|
|||
|
</g>
|
|||
|
</svg>
|
|||
|
<p class="fragment">Mimir <i>virtualizes</i> uncertainty</p>
|
|||
|
<attribution>(OpenClipArt.org)</attribution>
|
|||
|
</section>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<section>
|
|||
|
<h2>How?</h2>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<h3>Labeled Nulls</h3>
|
|||
|
<p>$Var(\ldots)$ constructs new variables</p>
|
|||
|
<ul>
|
|||
|
<li class="fragment">$Var('X')$ constructs a new variable $X$</li>
|
|||
|
<li class="fragment">$Var('X', 1)$ constructs a new variable $X_{1}$</li>
|
|||
|
<li class="fragment">$Var('X', ROWID)$ evaluates $ROWID$ and then constructs a new variable $X_{ROWID}$</li>
|
|||
|
</ul>
|
|||
|
</section>
|
|||
|
|
|||
|
|
|||
|
<section>
|
|||
|
<h3>Lazy Evaluation</h3>
|
|||
|
<p>Variables can't be evaluated until they are bound.<br/>So, we allow arbitrary expressions to represent data.</p>
|
|||
|
<ul>
|
|||
|
<li class="fragment">$X$ is a legitimate data value.</li>
|
|||
|
<li class="fragment">$X+1$ is a legitimate data value.</li>
|
|||
|
<li class="fragment">$1+1$ is a legitimate data value<span class="fragment">, but can be reduced to $2$.</span></li>
|
|||
|
</ul>
|
|||
|
<p class="fragment">A lazy value without variables is <b>deterministic</b></p>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<p>Mimir SQL allows the $Var()$ operator to be inlined</p>
|
|||
|
<pre><code>
|
|||
|
SELECT A, VAR('X', B)+2 AS C FROM R;
|
|||
|
</code></pre>
|
|||
|
|
|||
|
<center><div style="width: 600px" class="fragment">
|
|||
|
<table style="float: left">
|
|||
|
<thead>
|
|||
|
<tr><th>A</th><th>B</th></tr>
|
|||
|
</thead><tbody>
|
|||
|
<tr><td>1</td><td>2</td></tr>
|
|||
|
<tr><td>3</td><td>4</td></tr>
|
|||
|
<tr><td>5</td><td>6</td></tr>
|
|||
|
</tbody>
|
|||
|
</table>
|
|||
|
<table style="float: right" class="fragment">
|
|||
|
<tr><th>A</th><th>C</th></tr>
|
|||
|
<tr><td>1</td><td>$X_2+2$</td></tr>
|
|||
|
<tr><td>3</td><td>$X_4+2$</td></tr>
|
|||
|
<tr><td>5</td><td>$X_6+2$</td></tr>
|
|||
|
</table>
|
|||
|
</div></center>
|
|||
|
<div style="clear: both;"> </div>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<p>Selects on $Var()$ need to be deferred too...</p>
|
|||
|
<pre><code>
|
|||
|
SELECT A FROM R WHERE VAR('X', B) > 2;
|
|||
|
</code></pre>
|
|||
|
|
|||
|
<center><div style="width: 600px">
|
|||
|
<table style="float: left">
|
|||
|
<thead>
|
|||
|
<tr><th>A</th><th>B</th></tr>
|
|||
|
</thead><tbody>
|
|||
|
<tr><td>1</td><td>2</td></tr>
|
|||
|
<tr><td>3</td><td>4</td></tr>
|
|||
|
<tr><td>5</td><td>6</td></tr>
|
|||
|
</tbody>
|
|||
|
</table>
|
|||
|
<table style="float: right" class="fragment">
|
|||
|
<tr><th>A</th><th>$\phi$</th></tr>
|
|||
|
<tr><td>1</td><td>$X_2>2$</td></tr>
|
|||
|
<tr><td>3</td><td>$X_4>2$</td></tr>
|
|||
|
<tr><td>5</td><td>$X_6>2$</td></tr>
|
|||
|
</table>
|
|||
|
</div></center>
|
|||
|
<div style="clear: both;"> </div>
|
|||
|
<p class="fragment">When evaluating the table, rows where $\phi = \bot$ are dropped.</p>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<h3>C-Tables</h3>
|
|||
|
<ul>
|
|||
|
<li>Original Formulation <small>[Imielinski, Lipski 1981]</small></li>
|
|||
|
<li class="fragment">PC-Tables <small>[Green, Tannen 2006]</small></li>
|
|||
|
<li class="fragment">Systems<ul>
|
|||
|
<li>Orchestra <small>[Green, Karvounarakis, Taylor, Biton, Ives, Tannen 2007]</small></li>
|
|||
|
<li>MayBMS <small>[Huang, Antova, Koch, Olteanu 2009]</small></li>
|
|||
|
<li>Pip <small>[Kennedy, Koch 2009]</small></li>
|
|||
|
<li>Sprout <small>[Fink, Hogue, Olteanu, Rath 2011]</small></li>
|
|||
|
</ul></li>
|
|||
|
<li class="fragment">Generalized PC-Tables <small>[Kennedy, Koch 2009]</small></li>
|
|||
|
</ul>
|
|||
|
</section>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<section>
|
|||
|
<h2>Labeled nulls capture a lens' uncertainty</h2>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<pre><code>
|
|||
|
CREATE LENS PRODUCTS
|
|||
|
AS SELECT * FROM PRODUCTS_RAW
|
|||
|
USING DOMAIN_REPAIR(DEPARTMENT NOT NULL);
|
|||
|
</code></pre>
|
|||
|
|
|||
|
<div class="fragment">
|
|||
|
<p>is (almost) the same as the query...</p>
|
|||
|
<pre><code>
|
|||
|
CREATE VIEW PRODUCTS
|
|||
|
AS SELECT ID, NAME, ...,
|
|||
|
CASE WHEN DEPARTMENT IS NOT NULL THEN DEPARTMENT
|
|||
|
ELSE VAR('PRODUCTS.DEPARTMENT', ROWID)
|
|||
|
END AS DEPARTMENT
|
|||
|
FROM PRODUCTS_RAW;
|
|||
|
</code></pre>
|
|||
|
</div>
|
|||
|
|
|||
|
<small class="fragment">
|
|||
|
<table>
|
|||
|
<tr><th>ID</th><th>Name</th><th>...</th><th>Department</th></tr>
|
|||
|
<tr><td>123</td><td>Apple 6s, White</td><td>...</td><td>Phone</td></tr>
|
|||
|
<tr><td>34234</td><td>Dell, Intel 4 core</td><td>...</td><td>Computer</td></tr>
|
|||
|
<tr><td>34235</td><td>HP, AMD 2 core</td><td>...</td><td class="fragment">$Prod.Dept_3$</td></tr>
|
|||
|
<tr><td>...</td><td>...</td><td>...</td><td>...</td></tr>
|
|||
|
</table>
|
|||
|
</small>
|
|||
|
</section>
|
|||
|
<section>
|
|||
|
<pre><code>
|
|||
|
CREATE LENS PRODUCTS
|
|||
|
AS SELECT * FROM PRODUCTS_RAW
|
|||
|
USING DOMAIN_REPAIR(DEPARTMENT NOT NULL);
|
|||
|
</code></pre>
|
|||
|
|
|||
|
<div>
|
|||
|
<p>Behind the scenes, a lens also creates a model...</p>
|
|||
|
<pre class="fragment"><code>
|
|||
|
SELECT * FROM PRODUCTS_RAW;
|
|||
|
</code></pre>
|
|||
|
</div>
|
|||
|
<div class="fragment">
|
|||
|
<div style="font-size: 1em; vertical-align: middle;">↓</div>
|
|||
|
<div>
|
|||
|
<img src="graphics/weka.png" />
|
|||
|
</div>
|
|||
|
</div>
|
|||
|
<div class="fragment">
|
|||
|
<div style="font-size: 1em; vertical-align: middle;">↓</div>
|
|||
|
<div><p>An estimator for <small style="vertical-align: baseline;">$PRODUCTS.DEPARTMENT_{ROWID}$</small></p></div>
|
|||
|
</div>
|
|||
|
|
|||
|
</section>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<section>
|
|||
|
<h3>... but databases don't support labeled nulls</h3>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<h3>Labeled Nulls Percolate Up</h3>
|
|||
|
<pre><code>
|
|||
|
SELECT A, VAR('X', B)+2 AS C FROM R;
|
|||
|
</code></pre>
|
|||
|
<div class="fragment">
|
|||
|
<p>Mimir dispatches this query to the DB:</p>
|
|||
|
<pre><code>
|
|||
|
SELECT A, B FROM R;
|
|||
|
</code></pre>
|
|||
|
</div>
|
|||
|
<div class="fragment">
|
|||
|
<p>And for each row of the result, evaluates:</p>
|
|||
|
<pre><code>
|
|||
|
SELECT A, VAR('X', B)+2 AS C FROM RESULT;
|
|||
|
</code></pre>
|
|||
|
</div>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<h3>Generating Explanations</h3>
|
|||
|
<p>All uncertainty comes from labeled nulls in the expressions that Mimir evaluates for each row of the output.</p>
|
|||
|
<dl>
|
|||
|
<dt>Why is the data uncertain?</dt>
|
|||
|
<dd>All relevant lenses referenced in <code>VAR('X', B)+2</code>.</dd>
|
|||
|
|
|||
|
<dt>How uncertain?</dt>
|
|||
|
<dd>Estimate by sampling from <code>VAR('X', B)</code>.</dd>
|
|||
|
|
|||
|
<dt>How do I fix it?</dt>
|
|||
|
<dd>Each lens fixes one well-defined type of error.</dd>
|
|||
|
</dl>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<h3>Lazy evaluation can cause problems</h3>
|
|||
|
<pre><code>
|
|||
|
SELECT R.A, S.C FROM R, S WHERE VAR('X', R.B) = S.B;
|
|||
|
</code></pre>
|
|||
|
<div class="fragment">
|
|||
|
<p>Mimir dispatches this query to the DB:</p>
|
|||
|
<pre><code>
|
|||
|
SELECT R.A, S.C, R.B AS TEMP_1, S.B AS TEMP_2 FROM R, S;
|
|||
|
</code></pre>
|
|||
|
</div>
|
|||
|
<div class="fragment">
|
|||
|
<p>And for each row of the result, evaluates:</p>
|
|||
|
<pre><code>
|
|||
|
SELECT A, C FROM RESULT WHERE VAR('X', TEMP_1) = TEMP_2;
|
|||
|
</code></pre>
|
|||
|
</div>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<p>UDFs allow the DB to interpret labeled nulls</p>
|
|||
|
<pre><code>
|
|||
|
SELECT R.A, S.C FROM R, S
|
|||
|
WHERE S.B = MIMIR_VG_BESTGUESS('VARIABLE_X', R.B);
|
|||
|
</code></pre>
|
|||
|
<p class="fragment">... but we lose the ability to <i>explain</i> outputs</p>
|
|||
|
</section>
|
|||
|
</section>
|
|||
|
|
|||
|
|
|||
|
|
|||
|
<section>
|
|||
|
<section>
|
|||
|
<h2>Selection (Filtering)</h2>
|
|||
|
<pre><code>
|
|||
|
SELECT NAME FROM PRODUCTS
|
|||
|
WHERE DEPARTMENT='PHONE'
|
|||
|
AND ( VENDOR='APPLE'
|
|||
|
OR PLATFORM='ANDROID' )
|
|||
|
</code></pre>
|
|||
|
<p class="fragment">Row-level uncertainty is a boolean formula $\phi$.</p>
|
|||
|
<p class="fragment">
|
|||
|
For this query, $\phi$ can be as complex as:
|
|||
|
<small>$$DEPT_{ROWID}='P\ldots' \wedge \left( VEND_{ROWID}='Ap\ldots' \vee PLAT_{ROWID} = 'An\ldots' \right)$$</small></p>
|
|||
|
<p class="fragment"><b>Too many variables! Which is the most important?</b></p>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<h2>What is important?</h2>
|
|||
|
<p class="fragment">Data Cleaning</p>
|
|||
|
<h2 class="fragment">Which variables are important?</h2>
|
|||
|
<p class="fragment">The ones that keep us from knowing everything</p>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<p><small>$$D_{ROWID}='P' \wedge \left( V_{ROWID}='Ap' \vee PLAT_{ROWID} = 'An' \right)$$</small></p>
|
|||
|
<div style="font-size: 2em">⬍</div>
|
|||
|
<p>$$A \wedge (B \vee C)$$</p>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<h3>Naive Approach</h3>
|
|||
|
|
|||
|
<p>Consider a game between a database and an impartial oracle.</p>
|
|||
|
<ul>
|
|||
|
<li>The DB picks a variable $v$ in $\phi$ and pays a cost $c_v$.</li>
|
|||
|
<li>The Oracle reveals the truth value of $v$.</li>
|
|||
|
<li>The DB updates $\phi$ accordingly and repeats until $\phi$ is deterministic.</li>
|
|||
|
</ul>
|
|||
|
<p class="fragment"><b>Naive Algorithm: </b> Pick all variables!</p>
|
|||
|
<p class="fragment"><b>Less Naive Algorithm: </b> Minimize $E\left[\sum c_v\right]$.</p>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<h2>Exponential Time Bad!</h2>
|
|||
|
</section>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<section>
|
|||
|
<h3>The Value of What We Don't Know</h3>
|
|||
|
<p>$$\phi = A \wedge (B \vee C)$$</p>
|
|||
|
<ol>
|
|||
|
<li class="fragment" data-fragment-index="1">Generate Samples for $A$, $B$, $C$</li>
|
|||
|
<li class="fragment" data-fragment-index="2">Estimate $p(\phi)$</li>
|
|||
|
<li class="fragment" data-fragment-index="3">Compute $H[\phi] = -p(\phi)\log_2 p(\phi) - (1-p(\phi))\log_2\left(1-p(\phi)\right)$</li>
|
|||
|
</ol>
|
|||
|
<p class="fragment" data-fragment-index="4"><b>Entropy is intuitive: </b><br/> $H = 1$ means we know nothing, <br/>$H = 0$ means we know everything.</p>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<h3>Information Gain</h3>
|
|||
|
<p>$$\mathcal I_{A \leftarrow \top} (\phi) = H\left[\phi\right] - H\left[\phi(A \leftarrow \top)\right]$$</p>
|
|||
|
<p><b>Information gain of</b> $v$: The reduction in entropy from knowing the truth value of a variable $v$.</p>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<h3>Expected Information Gain</h3>
|
|||
|
<p>$$\mathcal I_{A} (\phi) = \left(p(A)\cdot \mathcal I_{A\leftarrow \top}(\phi)\right) + \left(p(\neg A)\cdot \mathcal I_{A\leftarrow \bot}(\phi)\right)$$</p>
|
|||
|
<p><b>Expected information gain of</b> $v$: The probability-weighted average of the information gain for $v$ and $\neg v$.</p>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<h3>The Cost of Perfect Information</h3>
|
|||
|
<p>Combine Information Gain and Cost</p>
|
|||
|
<p>$$f(\mathcal I_{A}(\phi), c_A)$$</p>
|
|||
|
<p class="fragment"><b>For example: </b>$EG2(\mathcal I_{A}(\phi), c_A) = \frac{2^{\mathcal I_{A}(\phi)} - 1}{c_A}$</p>
|
|||
|
<p class="fragment"><b>Greedy Algorithm: </b> Maximize $f(\mathcal I_{A}(\phi), c_A)$ at each step</p>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<h3>Experimental Data</h3>
|
|||
|
|
|||
|
<ul>
|
|||
|
<li>Start with a large dataset.</li>
|
|||
|
<li>Delete random fields (~50%).</li>
|
|||
|
</ul>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<h3>Experimental Queries</h3>
|
|||
|
|
|||
|
<p>Simulate an analyst trying to manually explore correlations.</p>
|
|||
|
<ul>
|
|||
|
<li>Train a tree-classifier on the base data.</li>
|
|||
|
<li>Convert the decision tree to a query for all rows where the tree predicts a specific value.</li>
|
|||
|
</ul>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<h3>Cost vs Entropy: Credit Data</h3>
|
|||
|
<img src="graphics/credit_entropy.png" height="400" alt="Cost versus entropy on the credit data for the EG2, NMETC and Random orderings" />
|
|||
|
<p><small>
|
|||
|
<b>EG2:</b> Greedy Cost/Value Ordering<br/>
|
|||
|
<b>NMETC:</b> Naive Minimal Expected Total Cost<br/>
|
|||
|
<b>Random:</b> Completely Random Order
|
|||
|
</small></p>
|
|||
|
</section>
|
|||
|
|
|||
|
<section>
|
|||
|
<h3>Cost vs Entropy: Product Data</h3>
|
|||
|
<img src="graphics/product_entropy.png" height="400" alt="Cost versus entropy on the product data for the EG2, NMETC and Random orderings" />
|
|||
|
<p><small>
|
|||
|
<b>EG2:</b> Greedy Cost/Value Ordering<br/>
|
|||
|
<b>NMETC:</b> Naive Minimal Expected Total Cost<br/>
|
|||
|
<b>Random:</b> Completely Random Order
|
|||
|
</small></p>
|
|||
|
</section>
|
|||
|
|
|||
|
</section>
|
|||
|
|
|||
|
</div></div>
|
|||
|
|
|||
|
<script src="../reveal.js-3.1.0/lib/js/head.min.js"></script>
|
|||
|
<script src="../reveal.js-3.1.0/js/reveal.js"></script>
|
|||
|
|
|||
|
<script>
|
|||
|
|
|||
|
// Full list of configuration options available at:
|
|||
|
// https://github.com/hakimel/reveal.js#configuration
|
|||
|
// Configure and start the reveal.js presentation for this deck.
// The top-level keys are presentation options; `dependencies` lists the
// plugin scripts to load, each optionally guarded by a `condition` function.
Reveal.initialize({
|
|||
|
controls: false,
|
|||
|
progress: true,
|
|||
|
history: true,
|
|||
|
center: true,
|
|||
|
slideNumber: true,
|
|||
|
|
|||
|
transition: 'fade', // none/fade/slide/convex/concave/zoom
|
|||
|
|
|||
|
// Optional reveal.js plugins
|
|||
|
dependencies: [
|
|||
|
// classList script: requested only when document.body.classList is missing.
{ src: '../reveal.js-3.1.0/lib/js/classList.js', condition: function() { return !document.body.classList; } },
|
|||
|
// Math plugin: always loaded (its condition returns true) and pointed at
// the MathJax copy under ../reveal.js-3.1.0/js/.
{ src: '../reveal.js-3.1.0/plugin/math/math.js',
|
|||
|
condition: function() { return true; },
|
|||
|
mathjax: '../reveal.js-3.1.0/js/MathJax.js'
|
|||
|
},
|
|||
|
// Markdown scripts: loaded only if some element carries a data-markdown attribute.
{ src: '../reveal.js-3.1.0/plugin/markdown/marked.js', condition: function() { return !!document.querySelector( '[data-markdown]' ); } },
|
|||
|
{ src: '../reveal.js-3.1.0/plugin/markdown/markdown.js', condition: function() { return !!document.querySelector( '[data-markdown]' ); } },
|
|||
|
// Highlight plugin: loaded asynchronously, and only if the page contains a
// `pre code` element; the callback then calls hljs.initHighlightingOnLoad().
{ src: '../reveal.js-3.1.0/plugin/highlight/highlight.js', async: true, condition: function() { return !!document.querySelector( 'pre code' ); }, callback: function() { hljs.initHighlightingOnLoad(); } },
|
|||
|
// Zoom and notes plugins: loaded asynchronously with no condition.
{ src: '../reveal.js-3.1.0/plugin/zoom-js/zoom.js', async: true },
|
|||
|
{ src: '../reveal.js-3.1.0/plugin/notes/notes.js', async: true }
|
|||
|
]
|
|||
|
});
|
|||
|
|
|||
|
</script>
|
|||
|
|
|||
|
</body>
|
|||
|
</html>
|