Website/src/talks/2020-11-16-Vizier+Relationa...

719 lines
20 KiB
Plaintext

---
template: templates/talk_slides_v1.erb
title: Vizier - A Workflow System Disguised as a Notebook
---
<%
# Titles of the talk's major sections, in presentation order.
# render_overview lists these on every "Overview" slide.
$sections = [
  "Workflow Provenance",
  "Fine-Grained Provenance",
  "Caveat Provenance",
]
# Builds the HTML for an "Overview" slide: a heading followed by a bulleted
# list of every title in $sections.
#
# current - the section title to emphasise; the matching <li> receives an
#           inline bold style, every other <li> gets an empty attribute slot.
#
# Returns the markup as a String.
def render_overview(current)
  items = $sections.map do |title|
    attrs = title == current ? "style='font-weight: bold'" : ""
    "<li #{attrs}>#{title}</li>"
  end
  "\n<h3>Overview</h3>\n<ul>\n#{items.join("")}\n</ul>"
end
# Emits an <svg> placeholder tag whose data-src attribute names the graphic.
#
# path   - file name (or full path) of the SVG.
# prefix - when truthy (the default) the path is resolved relative to this
#          talk's graphics directory; pass false to use +path+ verbatim.
#
# Returns the tag as a String.
def animated_svg(path, prefix=true)
  base_dir = prefix ? "graphics/2020-11-16/" : ""
  "<svg data-src='#{base_dir}#{path}' class='stretch' />"
end
%>
<section>
<h2>
<img src="graphics/logos/vizier-blue.svg" height="100px" style="vertical-align: middle; margin-right: 20px;" />
<span style="vertical-align: middle;" >VizierDB</span>
</h2>
<h4>
A Workflow System Disguised as a Notebook
</h4>
</section>
<section>
<section>
<img src="graphics/2020-11-16/i_dont_like_notebooks.png">
<attribution>Joel Grus, JupyterCon 2018</attribution>
</section>
<section>
<h3>Jupyter (&amp; all other notebooks)</h3>
<ul>
<li>Hidden, hard-to-reason about state</li>
<li class="fragment highlight-grey">I can't use my editor</li>
<li>Not good for reusability</li>
</ul>
</section>
<section>
<h3>Hidden, hard-to-reason about state</h3>
<img src="graphics/2020-11-16/out_of_order.png">
<attribution>Joel Grus, JupyterCon 2018</attribution>
</section>
<section>
<h3>The Problem: What People Expect</h3>
<img src="graphics/2020-11-16/intuition_for_code.png">
<attribution>Joel Grus, JupyterCon 2018</attribution>
</section>
<section>
<h3>The Problem: What Notebooks Deliver</h3>
<img src="graphics/2020-11-16/pile_of_code.svg" height="400px">
</section>
<section>
<h3>Hidden, hard-to-reason about state</h3>
<img src="graphics/2020-11-16/provenance_notebook.svg" height="400px">
</section>
<section>
<h3>Not good for reusability</h3>
<img src="graphics/2020-11-16/factorizing.png">
<attribution>Joel Grus, JupyterCon 2018</attribution>
</section>
<section>
<h3>Not good for reusability</h3>
<img src="graphics/2020-11-16/pimentel_study.svg" height="400px">
<attribution>João Felipe Pimentel, Leonardo Murta, Vanessa Braganholo, Juliana Freire</attribution>
</section>
</section>
<section>
<section>
<img src="graphics/logos/vizier-blue.svg" height="200px" />
<h2><a href="http://localhost:5000/">Demo</a></h2>
</section>
</section>
<section>
<section><%=render_overview("Workflow Provenance")%></section>
<section>
<h2>Notebook → Workflow Graph</h2>
<!--
<ul>
<li>Extracting Dependencies</li>
<li>Extracting Dependencies... from Python</li>
<li>Scheduling Execution</li>
<li>Versioning</li>
</ul>
-->
</section>
<section>
<h3>Notebook → Workflow Graph</h3>
<img style="vertical-align: middle" src="graphics/2020-11-16/provenance_notebook.svg" height="400px">
<span style="font-size: 200%; vertical-align: middle; padding: 50px;">vs</span>
<img style="vertical-align: middle" src="graphics/2020-11-16/provenance_workflow.svg" height="400px">
</section>
<section>
<%=animated_svg("provenance_vizier_naive.svg")%>
</section>
<section>
<pre><code>
LOAD DATASET 'nyc_taxi.csv' AS taxi
</code></pre>
<pre><code>
SAMPLE 0.1 FROM taxi INTO taxi
</code></pre>
<pre><code class="python">
model = wave_hands_mysteriously()
</code></pre>
<pre><code class="python">
for row in ds.rows:
row["prediction"] = model.predict(row.values)
</code></pre>
</section>
<section>
<img style="vertical-align: middle" src="graphics/2020-11-16/provenance_vizier_ideal.svg" height="300px">
</section>
<section>
<h3>The Shared Kernel</h3>
<dl>
<div class="fragment">
<dt>Dependency Analysis is Hard</dt>
<dd>Need to re-execute all subsequent cells</dd>
</div>
<div class="fragment">
<dt>Checkpointing Python is Hard</dt>
<dd>Need to re-execute all preceding cells</dd>
</div>
</dl>
<p class="fragment"><b>Solution:</b> Get rid of shared kernels</p>
<p class="fragment"><b>Solution:</b> Explicit API for inter-cell sharing</p>
</section>
<section>
<pre><code class="python">
model = wave_hands_mysteriously()
vizierdb.export_pickle(model)
</code></pre>
<pre><code class="python">
ds = vizierdb.get_dataset("taxi")
model = vizierdb.get_pickle("model")
for row in ds.rows:
row["prediction"] = model.predict(row.values)
ds.save()
</code></pre>
</section>
<section>
<p>
Inputs : { name → artifact_version }
</p>
<p>
Outputs : { name → artifact_version }
</p>
</section>
<section>
<h3>Cell State</h3>
<dl>
<dt>READY</dt>
<dd>Cell value up-to-date</dd>
<dt>STALE</dt>
<dd>Cell needs to be re-executed</dd>
<div class="fragment">
<dt>WAITING</dt>
<dd>Cell <i>may</i> need to be re-executed</dd>
</div>
</dl>
</section>
<%
# Renders one slide per step of the cell-state walkthrough. Every slide is a
# table with one row per notebook cell and, for each artifact, the version the
# cell read (In[...]) and the version it wrote (Out[...]).

# Artifacts that get an In[...] / Out[...] column pair.
artifacts = [:taxi, :model]

# One entry per slide. Each entry holds, for the cells LOAD, SAMPLE, Python1 and
# Python2 (in that order), a triple of
#   [state label, {artifact => version read}, {artifact => version written}].
[
[ ["READY", {}, {taxi: 1}],
["READY", {taxi: 1}, {taxi: 2}],
["READY", {}, {model: 3}],
["READY", {taxi: 2, model: 3}, {taxi: 4}],
],
[ ["READY", {}, {taxi: 1}],
["STALE", {taxi: 1}, {taxi: 2}],
["WAIT", {}, {model: 3}],
["WAIT", {taxi: 2, model: 3}, {taxi: 4}],
],
[ ["READY", {}, {taxi: 1}],
["READY", {taxi: 1}, {taxi: 5}],
["WAIT", {}, {model: 3}],
["WAIT", {taxi: 2, model: 3}, {taxi: 4}],
],
[ ["READY", {}, {taxi: 1}],
["READY", {taxi: 1}, {taxi: 5}],
["READY", {}, {model: 3}],
["WAIT", {taxi: 2, model: 3}, {taxi: 4}],
],
[ ["READY", {}, {taxi: 1}],
["READY", {taxi: 1}, {taxi: 5}],
["READY", {}, {model: 3}],
["STALE", {taxi: 2, model: 3}, {taxi: 4}],
],
# Final step: SAMPLE still reports the taxi v5 it wrote in the preceding
# steps, which is the version Python2 reads below.
[ ["READY", {}, {taxi: 1}],
["READY", {taxi: 1}, {taxi: 5}],
["READY", {}, {model: 3}],
["READY", {taxi: 5, model: 3}, {taxi: 6}],
],
].each do |provenance| %>
<section>
<table style="font-size: 90%;">
<tr><th>Cell</th><th>State</th>
<%=artifacts.map { |a| "<th>In[#{a}]</th>" }.join %>
<%=artifacts.map { |a| "<th>Out[#{a}]</th>" }.join%>
</tr>
<%
# Pair each cell name with its [state, inputs, outputs] triple for this step.
["LOAD", "SAMPLE", "Python1", "Python2"]
.zip(provenance)
.each do |cell, cell_provenance|
state, cell_inputs, cell_outputs = cell_provenance
%>
<tr>
<td><b><%=cell%></b></td>
<td style='font-family: monospace;'><%=state%></td>
<%artifacts.each { |a| %> <td><%=if cell_inputs.include?(a) then "v#{cell_inputs[a]}" else "" end%></td><%}%>
<%artifacts.each { |a| %> <td><%=if cell_outputs.include?(a) then "v#{cell_outputs[a]}" else "" end%></td><%}%>
</tr>
<% end %>
</table>
</section>
<% end %>
<section>
<img src="graphics/2020-11-16/artifacts_vizier_workflow.png">
</section>
<section>
<h3>I don't like notebooks (revisited)</h3>
<dl>
<dt>State Model</dt>
<dd>Enforced in-order execution</dd>
<dt>Reproducibility</dt>
<dd>Artifacts explicitly part of the workflow</dd>
<dt>Reusability</dt>
<dd>Clearer interfaces for changing things</dd>
</dl>
</section>
</section>
<section>
<section><%=render_overview("Fine-Grained Provenance")%></section>
<section>
<img src="graphics/2020-11-16/typed_artifacts.svg">
</section>
<section>
<dl>
<dt>Dataframe</dt>
<dd>Python ↔ SQL ↔ UI</dd>
<dd class="fragment highlight-blue">Provenance Analysis</dd>
<dt>Function</dt>
<dd>Python → SQL</dd>
<dt>Directory</dt>
<dd>Enumerable Contents</dd>
<dt>Chart</dt>
<dd>Interactivity in UI</dd>
</dl>
</section>
<section>
<h3>Fine-Grained Provenance</h3>
<p class="fragment">Why is this value here?</p>
<p class="fragment">Caveats</p>
<p class="fragment">Computation → Data</p>
</section>
<section>
<h3>Fine-Grained Provenance</h3>
<p>Cells are compiled to Spark Operators (≅ RA)</p>
<img src="graphics/2020-11-16/provenance_dataflow.png">
<p class="fragment">... but that doesn't always work</p>
</section>
<section>
<img src="graphics/2020-11-16/provenance_data_and_workflow.png">
<ul>
<li>Introspectable "dataflow" cells compile down to Spark.</li>
<li>Opaque "workflow" cells create dependencies from every input row/value to every output.</li>
</ul>
</section>
<section>
<table>
<tr><th>Category</th><th>Examples</th><th>API</th></tr>
<tr><td>Script</td><td>Python, Scala, R</td><td>Workflow</td></tr>
<tr><td>SQL</td><td>SQL</td><td>Dataflow</td></tr>
<tr><td>Point/Click</td><td>Load, Plot, Export</td><td>Workflow</td></tr>
<tr><td>Cleaning Wizard</td><td>Repair Key, Impute</td><td>Dataflow</td></tr>
<tr class="fragment highlight-blue"><td>Spreadsheet</td><td>Add Col, Edit Value</td><td>Dataflow</td></tr>
</table>
</section>
<section>
<h2>Supporting One-Off Edits</h2>
<p>How do you preserve fine-grained provenance through spreadsheet operations?</p>
</section>
<section>
<h3>Vizual</h3>
<p>Spreadsheet Operations → SQL DDL / SQL DML</p>
<dl>
<div class="fragment">
<dt>Edit Cell A3 to 'foo'</dt>
<dd style="font-family: monospace;">UPDATE R SET A = 'foo' WHERE ROWID = 3;</dd>
</div>
<div class="fragment">
<dt>Insert Row</dt>
<dd style="font-family: monospace;">INSERT INTO R() VALUES ();</dd>
</div>
<div class="fragment">
<dt>Insert Column `bar`</dt>
<dd style="font-family: monospace;">ALTER TABLE R ADD COLUMN `bar`;</dd>
</div>
</dl>
<attribution>"The Exception That Improves The Rule" (Freire, Glavic, Kennedy, Mueller)</attribution>
</section>
<section>
<p>This gives us an edit history in DDL/DML.</p>
</section>
<section>
<h3>Fine-Grained Provenance on DDL/DML</h3>
<div class="fragment">
<h4 style="margin-top: 50px;">DML → SQL</h4>
<p style="font-size: 70%">
<b>Using Reenactment to Retroactively Capture Provenance for Transactions</b><br/>
Bahareh Sadat Arab, Dieter Gawlick, Vasudha Krishnaswamy, Venkatesh Radhakrishnan, Boris Glavic
</p>
</div>
<div class="fragment">
<h4 style="margin-top: 50px;">DDL → SQL</h4>
<p style="font-size: 70%">
<b>Graceful database schema evolution: the PRISM workbench</b><br/>
Carlo Curino, Hyun Jin Moon, Carlo Zaniolo
</p>
</div>
</section>
<section>
<pre><code class="sql">
UPDATE R SET A = 'foo' WHERE ROWID = 3;
</code></pre>
becomes
<pre><code class="sql">
SELECT CASE ROWID
WHEN 3 THEN 'foo'
ELSE A END AS A,
B, C, /* ... */
FROM R
</code></pre>
</section>
</section>
<section>
<section><%=render_overview("Caveat Provenance")%></section>
<section>
<p>
<img src="graphics/clipart/female-computer-user.svg" height="70px" style="vertical-align: middle;"/>
<span style="vertical-align: middle; padding-left: 70px; padding-right: 70px">→</span>
<img src="graphics/clipart/db.svg" height="70px" style="vertical-align: middle;"/>
<span style="vertical-align: middle; padding-left: 70px; padding-right: 70px">→</span>
<img src="graphics/clipart/male-computer-user.png" height="70px" style="vertical-align: middle;"/>
</p>
<p class="fragment">
<span style="margin-right: 250px; vertical-align: middle;">↓</span>
<span style="margin-left: 250px; vertical-align: middle;">↓</span>
<br/>
<span style="margin-right: 100px; vertical-align: middle;">Assumption</span>
<span style="font-size: 300%; vertical-align: middle;" class="fragment">≠</span>
<span style="margin-left: 100px; vertical-align: middle;">Assumption</span>
</p>
<attribution>freesvg.org</attribution>
</section>
<section>
<h3>Assumptions?</h3>
<ol style="font-size: 70%">
<li>"This outlier is actually a data error"</li>
<li>"There will always be six values in this column"</li>
<li>"The correct fix is to delete erroneous records"</li>
<li>"Unparseable values should be treated as NULL"</li>
<li>"Nobody will analyze this portion of the dataset"</li>
<li>"These subjective field observations are correct"</li>
</ol>
<p class="fragment">Alice needs to document each and every assumption.</p>
<p class="fragment">Bob needs to understand the implications<br/>on every part of his analysis.</p>
</section>
<section>
<img src="graphics/2020-11-16/montoya.jpeg" height="400px" />
<attribution>&copy; 20th Century Fox</attribution>
</section>
<section>
<h3>What is a Caveat?</h3>
<div style="margin-top: 70px;">
<p class="fragment">An assumption tied to a fragment of the dataset.</p>
<p class="fragment">If the assumption is wrong, so is the fragment.</p>
</div>
</section>
<section>
<pre><code class="sql">
caveat(race_ethnicity,
'Unexpected race_ethnicity: ' & race_ethnicity)
</code></pre>
</section>
<section>
<pre><code class="sql">
CASE WHEN race_ethnicity NOT IN ('Black Non-Hispanic', /* ... */)
THEN caveat(race_ethnicity,
'Unexpected race_ethnicity: ' & race_ethnicity)
ELSE race_ethnicity
</code></pre>
</section>
<section>
<pre><code class="sql">
SELECT
CASE WHEN race_ethnicity NOT IN ('Black Non-Hispanic', /* ... */)
THEN caveat(race_ethnicity,
'Unexpected race_ethnicity: ' & race_ethnicity)
ELSE race_ethnicity
END, /* ... */
FROM R
</code></pre>
</section>
<section>
<h3>Propagation</h3>
<p>Can twiddling the caveatted value change the output?</p>
<p class="fragment" data-fragment-index="1" style="margin-top: 50px;">$C \leftarrow (5 \times X) + Y$</p>
<p class="fragment" data-fragment-index="1">Caveats on $X$ and $Y$ propagate to $C$<span class="fragment" data-fragment-index="2">*</span></p>
<p class="fragment" data-fragment-index="2" style="font-size:30%">Some conditions may apply</p>
</section>
</section>
<section>
<section>
<h2>Sloooow!</h2>
</section>
<section>
<img src="graphics/2020-11-16/caveat-spreadsheet.png" height="150px" />
<p style="font-size: 200%">+</p>
<img src="graphics/2020-11-16/caveat-list.png" height="150px" />
</section>
<section>
<p>Is a value caveatted?</p>
<p class="fragment">≡ Certain answers in incomplete databases</p>
<p class="fragment">(coNP-complete)</p>
</section>
<section>
<h3>Conservative Approximation</h3>
<div>
<p style="margin-top: 20px; font-size: 60%;">
<b>Correctness of SQL Queries on Databases with Nulls.</b><br/>
Paolo Guagliardo, Leonid Libkin
</p>
<p style="margin-top: 20px; font-size: 60%;">
<b>Uncertainty Annotated Databases - A Lightweight Approach for Approximating Certain Answers</b><br/>
Su Feng, Aaron Huber, Boris Glavic, Oliver Kennedy
</p>
</div>
<ul>
<li class="fragment" data-fragment-index="1">Unmarked rows are guaranteed to be caveat-free.</li>
<li class="fragment" data-fragment-index="2">Marked rows might not be caveatted.</li>
</ul>
</section>
</section>
<section>
<!--
Enumerating Caveats
-->
<section>
<h3>Enumerating Caveats</h3>
<dl style="margin-bottom: 50px;">
<dt>Static Analysis</dt>
<dd>Which caveats could possibly affect the element?</dd>
<dt>Dynamic Analysis</dt>
<dd>Which specific caveats affect the element?</dd>
</dl>
<attribution>"Your notebook is not crumby enough, REPLace it" (Brachmann, Spoth, Kennedy, Glavic, Mueller, Castelo, Bautista, Freire)</attribution>
</section>
<section>
<h3>Static Analysis</h3>
<p>What calls to <span style="font-family: monospace;">caveat()</span> appear in the derivation of the specified element?</p>
<p class="fragment">Analogous to <i>program slicing</i>.</p>
</section>
<section>
<h3>Program Slicing</h3>
<p>Find lines of code needed to compute a value.</p>
<p class="fragment">Turing Complete: {Variable}</p>
<p class="fragment">RA: { (Feature, Predicate) }</p>
<ul class="fragment">
<li>Row</li>
<li>Column</li>
<li>Sort Order</li>
</ul>
</section>
<section>
<h3>Dynamic Analysis</h3>
<ol>
<li>For each operator containing a <span style="font-family: monospace;">caveat()</span>, generate a query for the slice.</li>
<li>Generate every message output by that <span style="font-family: monospace;">caveat()</span></li>
<li>Union the message query results together.</li>
</ol>
</section>
<section>
<pre><code class="sql">
WITH data_source AS
SELECT caveat(A, 'valid if '& B &' is within tolerances.') AS A,
C, D, E FROM R
SELECT A FROM data_source WHERE ROWID = i
</code></pre>
<div class="fragment">
<p>↓</p>
$$\{ \texttt{R} \mapsto (\texttt{A}, \texttt{ROWID = i}) \}$$
</div>
<div class="fragment">
<p>↓</p>
<pre><code class="sql">
SELECT 'valid if '& B &' is within tolerances.'
AS caveat_message
FROM R WHERE ROWID = i
</code></pre>
</div>
</section>
</section>
<section>
<section>
<h3>
<img src="graphics/logos/vizier-blue.svg" height="100px" style="vertical-align: middle; margin-right: 20px;" />
<span style="vertical-align: middle;" ><a href="https://vizierdb.info/">https://vizierdb.info</a></span>
</h3>
<pre style="margin-top: 50px;"><code class="sql">
$> pip3 install --user vizier-webapi
$> vizier
</code></pre>
</section>
<section>
<h3>
<img src="graphics/logos/vizier-blue.svg" height="100px" style="vertical-align: middle; margin-right: 50px;" />
<span style="vertical-align: middle;" >
<span style="font-size: 50%;">[https://]</span>VizierDB<span style="font-size: 50%;">[.info]</span>
</span>
<img src="graphics/2020-11-16/qr.png" height="150px" style="vertical-align: middle; margin-left: 50px">
</h3>
<hr/>
<h4 style="font-size: 70%">
Michael&nbsp;Brachmann,
Munaf&nbsp;Arshad&nbsp;Qazi,
William&nbsp;Spoth,
Poonam&nbsp;Kumari,
Oliver&nbsp;Kennedy,
Boris&nbsp;Glavic,
Heiko&nbsp;Mueller,
Sonia&nbsp;Castelo,
Juliana&nbsp;Freire,
</h4>
<hr/>
<h4 style="font-size: 70%">
Ying&nbsp;Yang,
Su&nbsp;Feng,
Aaron&nbsp;Huber,
Niccolò&nbsp;Meneghetti,
Arindam&nbsp;Nandi,
Shivang&nbsp;Agarwal,
Olivia&nbsp;Alphonse,
Lisa&nbsp;Lu,
Gourab&nbsp;Malhotra,
Remi&nbsp;Rampin,
Carlos&nbsp;Bautista
</h4>
<hr/>
<img src="graphics/logos/nsf-small.png" height="100px">
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
<img src="graphics/logos/ub_odin_standalone_black.png" height="90px">
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
<img src="graphics/logos/iit_dbgroup.png" height="100px">
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
<img src="graphics/logos/nyu.png" height="100px">
<p style="font-size: 16pt">Vizier is supported by NSF Awards ACI-1640864, IIS-1750460, IIS-1956149 and gifts from Oracle</p>
</section>
</section>