770 lines
20 KiB
Plaintext
770 lines
20 KiB
Plaintext
---
|
|
template: templates/talk_slides_v1.erb
|
|
title: Principled management of notebook state in Vizier
|
|
---
|
|
|
|
<style type="text/css">
|
|
.notebook {
|
|
width: 100%;
|
|
}
|
|
.notebook .nbcell .nblabel {
|
|
font-family: Courier;
|
|
vertical-align: middle;
|
|
margin-right: 20px;
|
|
color: blue;
|
|
font-weight: bold;
|
|
}
|
|
.notebook .nbcell pre {
|
|
width: calc(100% - 100px);
|
|
display: inline-block;
|
|
vertical-align: middle;
|
|
}
|
|
.notebook .nbcell pre code {
|
|
padding: 20px;
|
|
}
|
|
.notebook .nbcell pre.nbresult {
|
|
width: calc(100% - 110px);
|
|
padding-left: 20px;
|
|
margin-left: 90px;
|
|
padding-top: 10px;
|
|
padding-bottom: 10px;
|
|
}
|
|
.notebook .nbcell.fragment.highlight-blue.current-fragment
|
|
{
|
|
border: solid 4px blue;
|
|
}
|
|
|
|
</style>
|
|
|
|
<%
|
|
$cells = []
|
|
# Renders one notebook as an HTML string.
#
# Resets the global $cells accumulator, runs the given block (which is
# expected to append rendered fragments to $cells, e.g. through nbcell or
# nbnote), and returns the accumulated fragments wrapped in
# <div class='notebook'>.
#
# When no block is given an empty notebook <div> is returned rather than
# raising LocalJumpError from a bare yield.
def notebook
  $cells = []
  yield if block_given?
  "<div class='notebook'>#{$cells.join("")}</div>"
end
|
|
|
|
# Wraps +body+ in a <div> and returns the markup as a String.
#
# body    - inner HTML, inserted verbatim (not escaped).
# varargs - options hash; the keys read here are:
#   :css       - base value of the class attribute (default "nbcell")
#   :show      - fragment index; adds the class "fragment"
#   :hide      - fragment index; adds the classes "fragment fade-out"
#   :highlight - fragment index; adds the classes "fragment highlight-blue"
#   Any other keys are ignored. An option counts as set when it is not nil.
#
# When several of :show / :hide / :highlight are set, each class token is
# emitted once and only one data-fragment-index attribute is written: the
# first one in the order show, hide, highlight. Repeating the attribute is
# invalid HTML, and the first occurrence is the one a browser keeps.
def nbdiv(body, varargs={})
  base_class = varargs.fetch(:css, "nbcell")
  fragment_classes = []
  fragment_index = nil

  [
    [:show, []],
    [:hide, ["fade-out"]],
    [:highlight, ["highlight-blue"]]
  ].each do |key, extra_classes|
    index = varargs[key]
    next if index.nil?

    # Array union keeps "fragment" from being listed more than once.
    fragment_classes |= ["fragment", *extra_classes]
    fragment_index = index if fragment_index.nil?
  end

  css_class = [base_class, *fragment_classes].join(" ")
  extra_attrs = fragment_index.nil? ? "" : " data-fragment-index='#{fragment_index}'"
  "<div class='#{css_class}'#{extra_attrs}>#{body}</div>"
end
|
|
|
|
# Appends one labelled code cell to the global $cells list.
#
# text    - cell source, placed verbatim inside <pre><code>.
# varargs - options hash:
#   :lang   - class put on the <code> element (default "python")
#   :idx    - label shown in square brackets; when nil it falls back to
#             $cells.size + 1
#   :output - optional result text, rendered in <pre class='nbresult'>
#   The complete hash is also handed to nbdiv, which builds the wrapping <div>.
def nbcell(text, varargs={})
  code_class = varargs.fetch(:lang, "python")
  result = varargs.fetch(:output, nil)
  label = varargs.fetch(:idx, nil)
  label = $cells.size + 1 if label.nil?

  markup = "<span class='nblabel'>[#{label}]</span><pre><code class='#{code_class}'>#{text}</code></pre>"
  markup += "<br/><pre class='nbresult'>#{result}</pre>" unless result.nil?

  $cells += [nbdiv(markup, varargs)]
end
|
|
|
|
# Appends a free-form note (arbitrary HTML) to the global $cells list.
#
# note    - inner HTML of the note, inserted verbatim.
# varargs - options hash forwarded to nbdiv (e.g. :show, :hide, :highlight).
#
# The class is passed under :css, which is the key nbdiv reads, so the note
# is rendered as <div class='nbnote ...'>. A merged copy is used so the
# caller's hash is left untouched.
def nbnote(note, varargs={})
  $cells += [nbdiv(note, varargs.merge(css: "nbnote"))]
end
|
|
%>
|
|
|
|
<section>
|
|
<h2><%= title %></h2>
|
|
|
|
<h4>Oliver Kennedy</h4>
|
|
<h5>University at Buffalo</h5>
|
|
</section>
|
|
|
|
<section>
|
|
<svg data-src="graphics/2022-06-20/NotebookOverview.svg" height="400px" style="margin-left: -100px"/>
|
|
</section>
|
|
|
|
<section>
|
|
|
|
<div style="display: inline-block; width: 45%;">
|
|
<img src="graphics/2022-06-20/Pimentel.png" height="400px">
|
|
<p style="font-size: 70%;"><a href="https://ieeexplore.ieee.org/document/8816763">Pimentel et al</a>: "4.03% of notebooks on github are reproducible"</p>
|
|
</div>
|
|
|
|
<div style="display: inline-block; width: 45%;" class="fragment">
|
|
<img src="graphics/2022-06-20/Grus.png">
|
|
<p style="font-size: 70%;"><a href="https://www.youtube.com/watch?v=7jiPeIFXb6U">Joel Grus</a>: "For beginners, with dozens of cells and more complex code [the ability to run code snippets out of order] is utterly confusing."</p>
|
|
</div>
|
|
</section>
|
|
|
|
<section>
|
|
<h3>High-Level Challenges</h3>
|
|
<ul>
|
|
<li>Not clear from context where a variable was written.</li>
|
|
<li>A cell that runs may still be wrong (for the program).</li>
|
|
<li>A state that was computed may be inconsistent.</li>
|
|
</ul>
|
|
|
|
<p class="fragment takeaway">
|
|
So why does everyone use this confusing state model?
|
|
</p>
|
|
</section>
|
|
|
|
<section>
|
|
<h3>Notebooks: The Good</h3>
|
|
<ul>
|
|
<li>They're interactive</li>
|
|
<li>Less intimidating than the command line</li>
|
|
<li class="fragment highlight-blue">In principle, a record of what you did.</li>
|
|
<li>Everyone's using them</li>
|
|
</ul>
|
|
</section>
|
|
|
|
<section>
|
|
<h3>Notebook</h3>
|
|
<svg data-src="graphics/2024-04-12/NotebookExtensions.svg" height="400px"/>
|
|
</section>
|
|
|
|
<section>
|
|
<h3>High-Level Challenges</h3>
|
|
<svg data-src="graphics/2024-04-12/Dependencies.svg" height="400px"/>
|
|
</section>
|
|
|
|
<section>
|
|
<%=
|
|
notebook() do
|
|
nbcell("x = 3", idx: 1)
|
|
nbcell("y = x + 2", idx: 2)
|
|
nbcell("x = 4", idx: 3)
|
|
nbcell("print(y)", idx: 4, output: "5")
|
|
end
|
|
%>
|
|
</section>
|
|
<section>
|
|
<%=
|
|
notebook() do
|
|
nbcell("x = 3", idx: 1, highlight: 1)
|
|
nbcell("y = x + 102", idx: 5)
|
|
nbcell("x = 4", idx: 3, highlight: 1)
|
|
nbcell("print(y)", idx: 4, output: "5", highlight: 2)
|
|
end
|
|
%>
|
|
</section>
|
|
|
|
<section>
|
|
<h3>Dependencies</h3>
|
|
|
|
<p class="fragment"><b>Reads: </b> <tt>x</tt></p>
|
|
|
|
<%=
|
|
notebook() do
|
|
nbcell("y = x + 2", idx: 2)
|
|
end
|
|
%>
|
|
|
|
<p class="fragment"><b>Writes: </b> <tt>y</tt></p>
|
|
|
|
<p class="fragment takeaway">
|
|
<b>Question:</b> Which variables does the cell read/write?
|
|
</p>
|
|
|
|
</section>
|
|
|
|
<section>
|
|
<%=
|
|
notebook() do
|
|
nbnote("$$\\{\\;\\;\\}$$", show: 1)
|
|
nbcell("x = 3", idx: 1)
|
|
nbnote("$$\\{\\;x \\rightarrow \\textbf{@1}\\;\\}$$", show: 2)
|
|
nbcell("y = x + 2", idx: 2)
|
|
nbnote("$$\\{\\;x \\rightarrow \\textbf{@1},\\;y \\rightarrow \\textbf{@2}\\;\\}$$", show: 3)
|
|
nbcell("x = 4", idx: 3)
|
|
nbnote("$$\\{\\;x \\rightarrow \\textbf{@3},\\;y \\rightarrow \\textbf{@2}\\;\\}$$", show: 4)
|
|
nbcell("print(y)", idx: 4)
|
|
end
|
|
%>
|
|
</section>
|
|
|
|
<section>
|
|
<p>Interpreter State: $\{\;x \rightarrow \textbf{@3},\;y \rightarrow \textbf{@2}\;\}$</p>
|
|
|
|
<p>... but Cell 2 read $x \rightarrow \textbf{@1}$</p>
|
|
|
|
<p class="fragment takeaway">
|
|
<b>Question:</b> How do we get the interpreter back to a known state?
|
|
</p>
|
|
</section>
|
|
|
|
<section>
|
|
<%=
|
|
notebook() do
|
|
nbcell("y = x + 102", idx: 5)
|
|
nbnote("$\\{\\;\\;\\}$ vs $\\{\\;x \\rightarrow \\textbf{@1},\\;y \\rightarrow \\color{blue}{\\textbf{@4}}\\;\\}$", show:1)
|
|
nbcell("x = 4", idx: 3, highlight: 2)
|
|
nbnote("$\\{\\;x \\rightarrow \\textbf{@2}\\;\\}$ vs $\\{\\;x \\rightarrow \\textbf{@3},\\;y \\rightarrow \\color{blue}{\\textbf{@4}}\\;\\}$", show:3)
|
|
nbcell("print(y)", idx: 4, highlight: 4)
|
|
end
|
|
%>
|
|
|
|
<p class="fragment takeaway">
|
|
<b><strike>Question</strike></b> A cell is stale if a value it read last time changed.
|
|
</p>
|
|
</section>
|
|
|
|
<section>
|
|
<h3>Overview: Workflow-Style Notebooks</h3>
|
|
|
|
<ol>
|
|
<li>Static Analysis</li>
|
|
<li>Microkernel Notebooks</li>
|
|
<li>Inter-Kernel Interop <span style="color: grey;">[Work In Progress]</span></li>
|
|
</ol>
|
|
</section>
|
|
|
|
<!------------------------- Static Analysis -------------------------->
|
|
<section>
|
|
<h3>Obtaining Cell Dependencies</h3>
|
|
|
|
<ul>
|
|
<li class="fragment" data-fragment-index="1">What could the cell read/write? <span class="fragment" data-fragment-index="5">[Static]</span></li>
|
|
<li class="fragment" data-fragment-index="2"><span class="fragment strike" data-fragment-index="4">What will the cell read/write?</span></li>
|
|
<li class="fragment" data-fragment-index="3">What did the cell read/write? <span class="fragment" data-fragment-index="5">[Dynamic]</span></li>
|
|
</ul>
|
|
</section>
|
|
|
|
|
|
<section>
|
|
<h3>Dynamic Dependencies</h3>
|
|
|
|
<%=
|
|
notebook() do
|
|
nbcell("if z:\n y = x + 2", idx: 2)
|
|
end
|
|
%>
|
|
|
|
<pre class="fragment">
|
|
|
|
0 LOAD_GLOBAL 0 (z)
|
|
2 POP_JUMP_IF_FALSE 12
|
|
4 LOAD_GLOBAL 1 (x)
|
|
6 LOAD_CONST 1 (2)
|
|
8 BINARY_ADD
|
|
10 STORE_GLOBAL 2 (y)
|
|
12 LOAD_CONST 0 (None)
|
|
14 RETURN_VALUE
|
|
</pre>
|
|
|
|
</section>
|
|
|
|
|
|
<section>
|
|
<h3>Dynamic Dependencies</h3>
|
|
|
|
<%=
|
|
notebook() do
|
|
nbcell("if z:\n y = x + 2", idx: 2)
|
|
end
|
|
%>
|
|
|
|
<pre>
|
|
|
|
0 LOAD_GLOBAL 0 (z)
|
|
2 POP_JUMP_IF_FALSE 12
|
|
|
|
|
|
|
|
|
|
12 LOAD_CONST 0 (None)
|
|
14 RETURN_VALUE
|
|
</pre>
|
|
|
|
</section>
|
|
|
|
|
|
<section>
|
|
<h3>Dynamic Dependencies</h3>
|
|
|
|
<%=
|
|
notebook() do
|
|
nbcell("if z:\n y = x + 2", idx: 2)
|
|
end
|
|
%>
|
|
<p>If <tt>z == False</tt>:</p>
|
|
<p><b>Reads: </b> $\{\;\textbf{z}\;\}$</p>
|
|
<p><b>Writes: </b> $\{\;\;\}$</p>
|
|
|
|
</section>
|
|
|
|
<section>
|
|
<h3>Static Dependencies</h3>
|
|
|
|
<%=
|
|
notebook() do
|
|
nbcell("if z:\n y = x + 2", idx: 2)
|
|
end
|
|
%>
|
|
|
|
<pre>
|
|
|
|
0 LOAD_GLOBAL 0 (z)
|
|
2 POP_JUMP_IF_FALSE 12
|
|
4 LOAD_GLOBAL 1 (x)
|
|
6 LOAD_CONST 1 (2)
|
|
8 BINARY_ADD
|
|
10 STORE_GLOBAL 2 (y)
|
|
12 LOAD_CONST 0 (None)
|
|
14 RETURN_VALUE
|
|
</pre>
|
|
|
|
|
|
</section>
|
|
|
|
<section>
|
|
<h3>Static Dependencies</h3>
|
|
|
|
<%=
|
|
notebook() do
|
|
nbcell("if z:\n y = x + 2", idx: 2)
|
|
end
|
|
%>
|
|
|
|
<pre>
|
|
|
|
0 LOAD_GLOBAL 0 (z) # <---- reads
|
|
2 POP_JUMP_IF_FALSE 12
|
|
4 LOAD_GLOBAL 1 (x) # <---- reads
|
|
6 LOAD_CONST 1 (2)
|
|
8 BINARY_ADD
|
|
10 STORE_GLOBAL 2 (y) # ----> writes
|
|
12 LOAD_CONST 0 (None)
|
|
14 RETURN_VALUE
|
|
</pre>
|
|
|
|
</section>
|
|
|
|
|
|
<section>
|
|
<h3>Static Dependencies</h3>
|
|
|
|
<%=
|
|
notebook() do
|
|
nbcell("if z:\n y = x + 2", idx: 2)
|
|
end
|
|
%>
|
|
|
|
<p><b>Could Read: </b> $\{\;\textbf{x},\;\textbf{z}\;\}$</p>
|
|
<p><b>Could Write: </b> $\{\;\textbf{y}\;\}$</p>
|
|
|
|
</section>
|
|
|
|
|
|
<section>
|
|
<h3>Static Dependencies</h3>
|
|
|
|
<%=
|
|
notebook() do
|
|
nbcell("my_data.filter( items )", idx: 2)
|
|
end
|
|
%>
|
|
|
|
<p><b>Could Read: </b> $\{\;\textbf{my_data},\;\textbf{items}\;\}$</p>
|
|
<p><b>Could Write: </b> $\{\;\;\}$</p>
|
|
|
|
</section>
|
|
|
|
|
|
<section>
|
|
<h3>Static Dependencies</h3>
|
|
|
|
<%=
|
|
notebook() do
|
|
nbcell("my_data.push( items )", idx: 2)
|
|
end
|
|
%>
|
|
|
|
<p><b>Could Read: </b> $\{\;\textbf{my_data},\;\textbf{items}\;\}$</p>
|
|
<p><b>Could Write: </b> $\{\;\textbf{my_data}\;\}$</p>
|
|
|
|
</section>
|
|
|
|
<section>
|
|
|
|
<%=
|
|
notebook() do
|
|
nbcell("my_data.filter( items )", idx: 2)
|
|
end
|
|
%>
|
|
vs
|
|
<%=
|
|
notebook() do
|
|
nbcell("my_data.push( items )", idx: 2)
|
|
end
|
|
%>
|
|
|
|
</section>
|
|
|
|
<section>
|
|
|
|
<img src="graphics/2024-04-12/LibraryStaticAnalysisDSL.png">
|
|
|
|
<attribution>"Bolt-on, Compact, and Rapid Program Slicing for Notebooks" (Shankar et al.; VLDB 2023)</attribution>
|
|
</section>
|
|
|
|
|
|
<section>
|
|
<h3>Dependency Needs</h3>
|
|
|
|
<ul>
|
|
<li class="fragment" data-fragment-index="1">Does a cell need to be re-run based on these changes?
|
|
<div class="fragment" data-fragment-index="2" style="font-weight: bold;">Dynamic sufficient (assuming deterministic cells).</div>
|
|
</li>
|
|
<li class="fragment" data-fragment-index="3">Which cell last wrote to a variable?
|
|
<div class="fragment" data-fragment-index="4" style="font-weight: bold;">Dynamic sufficient.</div>
|
|
</li>
|
|
<li class="fragment" data-fragment-index="5">
|
|
<div class="fragment highlight-blue" data-fragment-index="7">
|
|
What is the minimal set of inputs a cell needs to run?
|
|
<div class="fragment" data-fragment-index="6" style="font-weight: bold;">Static required.</div>
|
|
</div>
|
|
</li>
|
|
</ul>
|
|
</section>
|
|
|
|
<!------------------------- Microkernel Notebooks -------------------------->
|
|
<section>
|
|
<div class="fragment fade-out" data-fragment-index="3">
|
|
$\{\;???\;\}$ <span class="fragment" data-fragment-index="1">$\leftarrow \{\;z \rightarrow \textbf{@1}; x \rightarrow \textbf{@2}\;\}$</span>
|
|
</div>
|
|
<div class="fragment" data-fragment-index="3">
|
|
$\{\;\;\;\;\;\}$ $\leftarrow \{\;z \rightarrow \textbf{@1}; x \rightarrow \textbf{@2}\;\}$
|
|
</div>
|
|
<%=
|
|
notebook() do
|
|
nbcell("if z:\n y = x + 2", idx: 2)
|
|
end
|
|
%>
|
|
|
|
<p class="fragment takeaway" data-fragment-index="2">We need to be able to recover the kernel to <i>any</i> state.</p>
|
|
</section>
|
|
|
|
<section>
|
|
<%=
|
|
notebook() do
|
|
nbcell("x = expensive_initialization()")
|
|
nbcell("y = expensive_cloud_training1(x)")
|
|
nbcell("z = expensive_cloud_training2(x)")
|
|
nbcell("print( compare(y, z) )")
|
|
end
|
|
%>
|
|
</section>
|
|
|
|
<section>
|
|
<svg data-src="graphics/2024-04-12/Parallel.svg" height="400px"/>
|
|
</section>
|
|
|
|
<section>
|
|
<h2>Why have only one kernel?</h2>
|
|
|
|
<p class="fragment">🤷</p>
|
|
</section>
|
|
|
|
<section>
|
|
<h3>Parallelism</h3>
|
|
<ul>
|
|
<li class="fragment">When is parallelism allowed?</li>
|
|
<li class="fragment">When is a cell runnable?</li>
|
|
</ul>
|
|
</section>
|
|
|
|
<section>
|
|
<p><b>Static</b>: What variables <i>could</i> be read/written.</p>
|
|
<p style="font-size: 50%; margin: 50px;">vs</p>
|
|
<p><b>Dynamic</b>: What variables <i>were</i> read/written.</p>
|
|
</section>
|
|
|
|
<section>
|
|
<h3>Actual State</h3>
|
|
|
|
$$\{\;x \rightarrow \textbf{@1}\;\}$$
|
|
|
|
<div class="fragment">
|
|
<h3 style="margin-top: 50px;">Tentative State</h3>
|
|
|
|
$$\{\;x \rightarrow \textbf{@1},\;y \rightarrow \textbf{???}\;\}$$
|
|
<div class="fragment">
|
|
$$\{\;* \rightarrow \textbf{???}\;\}$$
|
|
</div>
|
|
</div>
|
|
</section>
|
|
|
|
<section>
|
|
<h3>Cell Status</h3>
|
|
|
|
<dl style="font-size: 70%">
|
|
<dt>Complete</dt>
|
|
<dd>Active if: $\forall (x \rightarrow \textbf{@i}) \in \texttt{DynamicReads} : \texttt{InState}[x] = \textbf{@i}$</dd>
|
|
<dd>$\texttt{OutState} = \texttt{InState} + \{\;x \rightarrow \textbf{@i}\;|\;\forall (x \rightarrow \textbf{@i}) \in \texttt{DynamicWrites}\;\}$</dd>
|
|
|
|
<dt>Runnable</dt>
|
|
<dd>Active if: $\forall x \in \texttt{StaticReads} : \texttt{InState}[x] \neq \textbf{???}$</dd>
|
|
<dd>$\texttt{OutState} = \texttt{InState} + \{\;x \rightarrow \textbf{???}\;|\;\forall x \in \texttt{StaticWrites}\;\}$</dd>
|
|
|
|
<dt>Stale</dt>
|
|
<dd>Active if: first run or $\exists (x \rightarrow \textbf{@i}) \in \texttt{DynamicReads} : \texttt{InState}[x] \neq \textbf{@i}$</dd>
|
|
<dd>$\texttt{OutState} = \texttt{InState} + \{\;x \rightarrow \textbf{???}\;|\;\forall x \in \texttt{StaticWrites}\;\}$</dd>
|
|
|
|
<dt>Unknown</dt>
|
|
<dd>Active otherwise.</dd>
|
|
<dd>$\texttt{OutState} = \texttt{InState} + \{\;x \rightarrow \textbf{???}\;|\;\forall x \in \texttt{StaticWrites}\;\}$</dd>
|
|
</dl>
|
|
</section>
|
|
|
|
<section>
|
|
<div style="display: inline-block;">
|
|
<h3>Serial</h3>
|
|
<svg data-src="graphics/2024-04-12/gantt_serial.svg" width="450px"/>
|
|
</div>
|
|
<div style="display: inline-block;">
|
|
<h3>Parallel</h3>
|
|
<svg data-src="graphics/2024-04-12/gantt_parallel.svg" width="450px"/>
|
|
</div>
|
|
<attribution>"Runtime Provenance Refinement for Notebooks" (Deo et al.; TaPP 2022)</attribution>
|
|
</section>
|
|
|
|
<section>
|
|
<h3>Microkernel Notebooks</h3>
|
|
<img src="graphics/2022-06-20/MicrokernelCheckpoints.svg" height="400px">
|
|
<attribution>https://openclipart.com</attribution>
|
|
</section>
|
|
|
|
<section>
|
|
<h2>Why have only one python version?</h2>
|
|
|
|
<p class="fragment">🤷</p>
|
|
</section>
|
|
|
|
<section>
|
|
<img src="graphics/2022-06-20/MicrokernelPyV2Checkpoints.svg" height="400px">
|
|
</section>
|
|
|
|
<section>
|
|
<h2>Why have only one language?</h2>
|
|
|
|
<p class="fragment">🤷</p>
|
|
</section>
|
|
|
|
<section>
|
|
<img src="graphics/2022-06-20/MicrokernelPyScalaCheckpoints.svg" height="400px">
|
|
</section>
|
|
|
|
<section>
|
|
<h2>Why require code?</h2>
|
|
|
|
<p class="fragment">🤷</p>
|
|
</section>
|
|
|
|
<section>
|
|
<h3>Vizier Demo</h3>
|
|
</section>
|
|
|
|
<section>
|
|
<svg data-src="graphics/2024-04-12/MultiRunnerBlockDiagram.svg" height="300px"/>
|
|
<attribution>"The Right Tool for the Job: Data-Centric Workflows in Vizier" (Kennedy et al.; IEEE DEB 2022)</attribution>
|
|
</section>
|
|
|
|
<section>
|
|
<h3>Repeatable Spreadsheet Dataframe Editing</h3>
|
|
<img src="graphics/2023-06-18/vizier-spreadsheet.png" height="300px">
|
|
<attribution>"Overlay Spreadsheets" (Kennedy et al.; HILDA 2022)</attribution>
|
|
</section>
|
|
|
|
<section>
|
|
<h3>Data Widgets</h3>
|
|
<img src="graphics/2024-04-12/VizierLoadData.png" height="300px">
|
|
<attribution>"Your notebook is not crumby enough, REPLace it" (Brachmann et al.; CIDR 2020)</attribution>
|
|
</section>
|
|
|
|
<section>
|
|
<h3>Data Vis</h3>
|
|
<img src="graphics/2024-04-12/VizierDataVis.png" height="300px">
|
|
<attribution>"Your notebook is not crumby enough, REPLace it" (Brachmann et al.; CIDR 2020)</attribution>
|
|
</section>
|
|
|
|
<section>
|
|
<h3>Data Curation</h3>
|
|
<img src="graphics/2024-04-12/VizierMimir.png" height="200px">
|
|
<attribution>"Lenses: An On-Demand Approach to ETL" (Yang et al.; VLDB 2015)</attribution>
|
|
</section>
|
|
|
|
<!------------------------- State Management -------------------------->
|
|
|
|
<section>
|
|
<img src="graphics/2024-04-12/14thWarrior-Cartoon-Elephant.svg" height="300px">
|
|
<p class="fragment takeaway">... but this requires migrating state.<span class="fragment">.. across languages.</span></p>
|
|
<attribution>https://openclipart.com</attribution>
|
|
</section>
|
|
|
|
<section>
|
|
<svg data-src="graphics/2024-04-12/Dependencies.svg" height="400px"/>
|
|
</section>
|
|
|
|
<section>
|
|
<h3>Approach 1: Pickle</h3>
|
|
|
|
<p style="font-size: 70%">Python's native serialization support.</p>
|
|
|
|
<dl style="font-size: 90%">
|
|
<div class="fragment" data-fragment-index="1">
|
|
<dt>The Good</dt>
|
|
<dd>Easy</dd>
|
|
</div>
|
|
<div class="fragment" data-fragment-index="2">
|
|
<dt>The Bad</dt>
|
|
<dd><span class="fragment highlight-grey" data-fragment-index="3">Not everything is serializable</span><span class="fragment" data-fragment-index="3" style="font-size: 50%; vertical-align: top;">†</span></dd>
|
|
<dd>Limited compatibility with ¬Python</dd>
|
|
<dd>Expensive for e.g., dataframes</dd>
|
|
</div>
|
|
</dl>
|
|
</section>
|
|
|
|
<section>
|
|
<h3>Approach 2: JSON</h3>
|
|
|
|
<p style="font-size: 70%">Standard data interchange format.</p>
|
|
|
|
<dl style="font-size: 90%">
|
|
<div class="fragment" data-fragment-index="1">
|
|
<dt>The Good</dt>
|
|
<dd>Easy</dd>
|
|
<dd>Near universal platform compatibility</dd>
|
|
</div>
|
|
<div class="fragment" data-fragment-index="2">
|
|
<dt>The Bad</dt>
|
|
<dd>Even less state is supported</dd>
|
|
<dd>Even more expensive for e.g., dataframes</dd>
|
|
<dd>Limited support for nuanced types (e.g., dates)</dd>
|
|
</div>
|
|
</dl>
|
|
</section>
|
|
|
|
<section>
|
|
<h3>Approach 3: Arrow, Shapefile, Parquet, NPY</h3>
|
|
|
|
<p style="font-size: 70%">Specialized formats for specific datatypes.</p>
|
|
<dl style="font-size: 90%">
|
|
<div class="fragment" data-fragment-index="1">
|
|
<dt>The Good</dt>
|
|
<dd>High Performance</dd>
|
|
<dd>Precise, Well Typed</dd>
|
|
</div>
|
|
<div class="fragment" data-fragment-index="2">
|
|
<dt>The Bad</dt>
|
|
<dd>Only one type of state is supported</dd>
|
|
</div>
|
|
</dl>
|
|
</section>
|
|
|
|
<section>
|
|
<h3>Vizier (Now)</h3>
|
|
|
|
<p style="font-size: 70%">Vizier-level Typing.</p>
|
|
|
|
<ul>
|
|
<li class="fragment" data-fragment-index="1"><b>Simple Data:</b> JSON</li>
|
|
<li class="fragment" data-fragment-index="2"><b>Typed Data:</b> Standard JSON Encoding</li>
|
|
<li class="fragment" data-fragment-index="3"><b>Special Data:</b> <span class="fragment highlight-blue" data-fragment-index="5">'Active' Data</span></li>
|
|
<li class="fragment" data-fragment-index="4"><b>Fallback:</b> Pickle</li>
|
|
</ul>
|
|
</section>
|
|
|
|
<section>
|
|
<h3>Active Data</h3>
|
|
|
|
<p style="font-size: 70%">Datasets, Functions/Classes, etc...</p>
|
|
<ul style="font-size: 80%">
|
|
<li class="fragment">One concept, Many physical representations (Arrow, Parquet, CSV).
|
|
<ul>
|
|
<li class="fragment">A cell interpreter may not support a representation.</li>
|
|
<li class="fragment">Generating a standard representation can be expensive.</li>
|
|
</ul>
|
|
</li>
|
|
<li class="fragment">State (e.g., Datasets) can get big.
|
|
<ul>
|
|
<li class="fragment">An interpreter may not want/need to load the entire state.</li>
|
|
<li class="fragment">Versioning all checkpoints becomes infeasible.</li>
|
|
</ul>
|
|
</li>
|
|
</ul>
|
|
</section>
|
|
|
|
<section>
|
|
<h3>Desiderata</h3>
|
|
|
|
<div style="text-align: left; font-size: 80%;">
|
|
<p>An abstraction that...</p>
|
|
<ul>
|
|
<li>... represents the concept.</li>
|
|
<li>... allows on-demand conversion between representations.</li>
|
|
<li>... allows partial in-store interactions.</li>
|
|
<li>... allows incremental changes.</li>
|
|
</ul>
|
|
</div>
|
|
<p class="fragment takeaway">Vizier's artifact store provides a thin wrapper around standards compliant libraries (e.g., Apache Spark).</p>
|
|
</section>
|
|
|
|
<section>
|
|
<h3>"Active" Data</h3>
|
|
<svg data-src="graphics/2024-04-12/DataframeAbstraction.svg" height="400px"/>
|
|
</section>
|
|
|
|
<section>
|
|
<p>... but it's a lot of special case code.</p>
|
|
</section>
|
|
|
|
<section>
|
|
<h3>Generalizing Active Data</h3>
|
|
<p style="font-size: 60%">(future work)</p>
|
|
|
|
<ul>
|
|
<li class="fragment">What's the right abstraction?</li>
|
|
<li class="fragment">Efficient type coercion (without $N^2$)</li>
|
|
<li class="fragment">Microservice RPCs</li>
|
|
<li class="fragment">Caching Strategies</li>
|
|
</ul>
|
|
|
|
<p class="fragment takeaway">Questions?</p>
|
|
</section>
|
|
|
|
|
|
<!------------------------- Closing -------------------------->
|
|
|
|
<section>
|
|
<a href="https://vizierdb.info">
|
|
<img src="graphics/2022-06-20/vizier.svg" height="200px">
|
|
<p style="margin-top: -20px;">https://vizierdb.info</p>
|
|
</a>
|
|
|
|
<p style="font-size: 65%">Mike Brachmann, Boris Glavic, Nachiket Deo, Juliana Freire, Heiko Mueller, Sonia Castello, Munaf Arshad Qazi, William Spoth, Poonam Kumari, Nicholas Brown, Soham Patel, Thomas Slowe, and more...</p>
|
|
|
|
|
|
<div style="width: 100%; text-align: right">
|
|
<span style="font-size: 40%; vertical-align: top;">Supported by:</span>
|
|
<img src="graphics/logos/nsf.png" height="50px">
|
|
<img src="graphics/logos/breadcrumb.png" height="50px">
|
|
</div>
|
|
</section>
|