204 lines
5.4 KiB
TeX
204 lines
5.4 KiB
TeX
\documentclass[sigconf]{acmart}
|
|
%\documentclass{vldb}
|
|
|
|
\settopmatter{printacmref=false}
|
|
\setcopyright{none}
|
|
|
|
\usepackage{hyperref}
|
|
\usepackage[a-1b]{pdfx}
|
|
\usepackage{booktabs} % For formal tables
|
|
\usepackage{xspace}
|
|
\usepackage[utf8]{inputenc}
|
|
\usepackage{stmaryrd}
|
|
\usepackage{balance} % for \balance command ON LAST PAGE (only there!)
|
|
\usepackage{cleveref}
|
|
\usepackage{pifont}
|
|
\usepackage{todonotes}
|
|
\usepackage{setspace}
|
|
\usepackage{balance}
|
|
\usepackage[noend]{algpseudocode}
|
|
\usepackage{algorithm}
|
|
\usepackage{subcaption}
|
|
\usepackage{minted}
|
|
\usepackage{multirow}
|
|
\usepackage{colortbl}
|
|
|
|
|
|
% \trimfigurespacing: inserts 5mm of negative vertical space to pull the
% following material upwards. The starred \vspace* is used so the space is
% not discarded at a page/column break.
% NOTE(review): judging by the name this is meant for tightening space around
% figures -- usage is not visible in this file; confirm in the section files.
\newcommand{\trimfigurespacing}{\vspace*{-5mm}}
|
|
|
|
% Per-author review notes built on todonotes' \todo. Each macro takes the note
% text as its single argument and prefixes it with the author's name in bold.
%   \OK  -- Oliver,   blue!25 background,  \tiny text
%   \BG  -- Boris,    red!25 background,   \tiny text
%   \BGI -- Boris,    red!25 background,   inline variant (todonotes `inline'
%                     option), typeset at the surrounding font size
%   \ND  -- Nachiket, green!25 background, \tiny text
\newcommand{\OK}[1]{\todo[backgroundcolor=blue!25]{\tiny \textbf{Oliver says:} #1}}
|
|
\newcommand{\BG}[1]{\todo[backgroundcolor=red!25]{\tiny \textbf{Boris says:} #1}}
|
|
\newcommand{\BGI}[1]{\todo[backgroundcolor=red!25,inline]{\textbf{Boris says:} #1}}
|
|
\newcommand{\ND}[1]{\todo[backgroundcolor=green!25]{\tiny \textbf{Nachiket says:} #1}}
|
|
|
|
% Named colours declared explicitly from HTML (hex RGB) values, so they are
% available by these names regardless of which xcolor options are in effect.
% Of these, only Black is referenced in this file (by the commented-out
% \reva/\revb/\revc/\revm revision-marking macros below).
\definecolor{PineGreen}{HTML}{007B62}
|
|
\definecolor{Purple} {HTML}{99479B}
|
|
\definecolor{NavyBlue} {HTML}{006EB8}
|
|
\definecolor{BrickRed} {HTML}{B6321C}
|
|
\definecolor{Black} {HTML}{000000}
|
|
|
|
% Declares a minted shortcut environment for Python listings. With no explicit
% name given, minted calls it `pythoncode' (\begin{pythoncode}...\end{pythoncode}).
% Listings get line numbers, a single-line frame, 3pt between the numbers and
% the code, and \footnotesize text.
\newminted{python}{linenos,frame=single,numbersep=3pt,fontsize=\footnotesize}
|
|
|
|
% \newcommand{\reva}[1]{\textcolor{Black}{#1}}
|
|
% \newcommand{\revb}[1]{\textcolor{Black}{#1}}
|
|
% \newcommand{\revc}[1]{\textcolor{Black}{#1}}
|
|
% \newcommand{\revm}[1]{\textcolor{Black}{#1}}
|
|
|
|
|
|
\input{preamble}
|
|
|
|
\newtheorem{example}{Example}
|
|
\newtheorem{definition}{Definition}
|
|
|
|
% Name of the system described in the paper. \xspace (xspace package) adds a
% trailing space only when the macro is followed by ordinary text, so it can
% be written as `\systemname is ...' without a trailing `{}' or `\ '.
\newcommand{\systemname}{Workbook\xspace}
|
|
|
|
\newcommand{\TheTitle}{Runtime Provenance Refinement for Notebooks}
|
|
|
|
\pagestyle{plain}
|
|
|
|
|
|
% Supplies a \BibTeX logo macro. The definition is deferred to
% \begin{document} and uses \providecommand, so it takes effect only if no
% package or class has already defined \BibTeX by then. The \kern values
% tighten the letter spacing of the logo by hand.
\AtBeginDocument{%
|
|
\providecommand\BibTeX{{%
|
|
\normalfont B\kern-0.5em{\scshape i\kern-0.25em b}\kern-0.8em\TeX}}}
|
|
|
|
\begin{CCSXML}
|
|
<ccs2012>
|
|
<concept>
|
|
<concept_id>10011007.10011006.10011041.10011048</concept_id>
|
|
<concept_desc>Software and its engineering~Runtime environments</concept_desc>
|
|
<concept_significance>300</concept_significance>
|
|
</concept>
|
|
<concept>
|
|
<concept_id>10002951.10002952.10002953.10010820.10003623</concept_id>
|
|
<concept_desc>Information systems~Data provenance</concept_desc>
|
|
<concept_significance>500</concept_significance>
|
|
</concept>
|
|
</ccs2012>
|
|
\end{CCSXML}
|
|
|
|
\ccsdesc[300]{Software and its engineering~Runtime environments}
|
|
\ccsdesc[500]{Information systems~Data provenance}
|
|
|
|
\begin{document}
|
|
\fancyhead{}
|
|
\title{\TheTitle}
|
|
|
|
% Author metadata.
% Each author is declared with \author, followed by that author's
% \affiliation and \email. E-mail addresses are given through acmart's
% \email{...} command instead of as free text inside \affiliation{...}, so the
% class formats them itself rather than typesetting them as part of the
% affiliation.
% NOTE(review): recent acmart releases also expect \city and \country inside
% every \affiliation -- confirm against the acmart version used for the
% camera-ready build and add them if it complains.
\author{Nachiket Deo}
\affiliation{%
  \institution{University of Connecticut}
}
\email{nachiket.deo@uconn.edu}

\author{Boris Glavic}
\affiliation{%
  \institution{Illinois Institute of Technology}
}
\email{bglavic@iit.edu}

\author{Oliver Kennedy}
\affiliation{%
  \institution{University at Buffalo}
}
\email{okennedy@buffalo.edu}
|
|
|
|
% The default list of authors is too long for headers.
|
|
% \renewcommand{\shortauthors}{Spoth, Xie et al.}
|
|
|
|
|
|
%
|
|
% The code below should be generated by the tool at
|
|
% http://dl.acm.org/ccs.cfm
|
|
% Please copy and paste the code instead of the example below.
|
|
%
|
|
|
|
% \keywords{JSON Schemas, Independence, Markov Models}
|
|
|
|
|
|
\begin{abstract}
|
|
\input{sections/abstract.tex}
|
|
\end{abstract}
|
|
|
|
\maketitle
|
|
|
|
\section{Introduction}
|
|
\label{sec:introduction}
|
|
\input{sections/introduction.tex}
|
|
|
|
% \subsection{Problem Statement}
|
|
% \label{sec:problem}
|
|
% \input{sections/problem}
|
|
|
|
\section{Runtime Provenance Refinement}
|
|
\label{sec:approx-prov}
|
|
\input{sections/approx-prov}
|
|
|
|
\section{Isolated Cell Execution}
|
|
\label{sec:isolation}
|
|
\input{sections/isolation}
|
|
|
|
\section{Scheduler}
|
|
\label{sec:scheduler}
|
|
\input{sections/scheduler}
|
|
|
|
\section{Jupyter Import}
|
|
\label{sec:import}
|
|
\input{sections/import}
|
|
|
|
\section{Implementation}
|
|
\label{sec:experiments}
|
|
\input{sections/experiments}
|
|
|
|
\section{Related Work}
|
|
\label{sec:related}
|
|
\input{sections/related}
|
|
|
|
\section{Conclusions}
|
|
\label{sec:conclusions}
|
|
\input{sections/conclusions}
|
|
|
|
% \paragraph{Acknowledgements}
|
|
% \label{sec:acknowledgements}
|
|
% \input{sections/acknowledgements.tex}
|
|
|
|
% Reference list: formatted with the `abbrv' BibTeX style, entries taken from
% main.bib. \balance (balance package) is issued immediately before the
% bibliography so that the two columns of the page it is on -- intended to be
% the last page -- are evened out.
\bibliographystyle{abbrv}
|
|
\balance
|
|
\bibliography{main}
|
|
|
|
\end{document}
|
|
--- For Camera-Ready ---
|
|
|
|
- Spend more time emphasizing:
|
|
- The "First run" problem
|
|
- Dynamic provenance changes with each update
|
|
- Static buys you better scheduling decisions
|
|
|
|
- Better explain "hot" vs "cold" cache (e.g., spark loading data into memory)
|
|
|
|
- Explain that the choice of spark is due to Vizier having already chosen it. The main thing we need it for is Arrow and scheduling.
|
|
|
|
- Space permitting, maybe spend a bit more time contrasting microkernel with jupyter "hacks" like Nodebook.
|
|
|
|
- Add some text emphasizing the point that even though Jupyter is not intended for batch ETL processing, that is how a lot of people use it (e.g., cite Netflix, Stitch Fix?). (And yes, we're aware that this is bad practice.)
|
|
|
|
- Around the point where we describe that Vizier involves explicit dependencies, also point out that we describe how to provide a Jupyter-like experience on top of this model later in the paper. "Keep the mental model"
|
|
|
|
- Typos:
|
|
- " Not that Vizier"
|
|
|
|
- Add more future work
|
|
- Direct output to Arrow instead of via parquet.
|
|
|
|
- Add copyright text
|
|
|
|
- Check for and remove Type 3 fonts if any exist.
|
|
|
|
- Make sure fonts are embedded (should be default for LaTeX)
|
|
|
|
|
|
--- For Next Paper ---
|
|
|
|
- Use GIT history to recover the dependency graph
|
|
- e.g., figure out how much dynamic provenance changes for a single cell over a series of edits.
|
|
|
|
- Static vs Dynamic provenance: How different are they?
|
|
- e.g., how often do you need to "repair"
|
|
- How much further away from serial does dynamic get you?
|
|
|