master
Boris Glavic 2019-12-16 20:58:26 -06:00
parent b4395b969a
commit 9c92b54fcd
4 changed files with 16 additions and 47 deletions

View File

@ -66,30 +66,12 @@ Paris, France, April 16-19, 2018},
number = {1},
pages = {51--62},
projects = {GProM; Reenactment},
title = {GProM - A Swiss Army Knife for Your Provenance Needs},
title = {{GProM} - A Swiss Army Knife for Your Provenance Needs},
volume = {41},
year = {2018}
}
@article{DBLP:journals/tkde/ArabGKRG18,
author = {Bahareh Sadat Arab and Dieter Gawlick and Vasudha Krishnaswamy and Venkatesh Radhakrishnan and Boris Glavic},
journal = {IEEE Trans. Knowl. Data Eng.},
number = {3},
pages = {599--612},
title = {Using Reenactment to Retroactively Capture Provenance for Transactions},
volume = {30},
year = {2018}
}
@article{DBLP:journals/debu/ArabFGLNZ17,
author = {Bahareh Sadat Arab and Su Feng and Boris Glavic and Seokki Lee and Xing Niu and Qitian Zeng},
journal = {IEEE Data Eng. Bull.},
number = {1},
pages = {51--62},
title = {GProM - A Swiss Army Knife for Your Provenance Needs},
volume = {41},
year = {2018}
}
@incollection{BC04a,
author = {Bertossi, Leopoldo and Chomicki, Jan},
@ -99,12 +81,7 @@ Paris, France, April 16-19, 2018},
year = {2004}
}
@article{BB14,
author = {Bhardwaj, Anant and Bhattacherjee, Souvik and Chavan, Amit and Deshpande, Amol and Elmore, Aaron J and Madden, Samuel and Parameswaran, Aditya G},
journal = {arXiv preprint arXiv:1409.0798},
title = {DataHub: Collaborative Data Science \& Dataset Version Management at Scale},
year = {2014}
}
@article{BD15,
author = {Bhardwaj, Anant and Deshpande, Amol and Elmore, Aaron J and Karger, David and Madden, Sam and Parameswaran, Aditya and Subramanyam, Harihar and Wu, Eugene and Zhang, Rebecca},
@ -171,7 +148,7 @@ H. V. Jagadish},
@inproceedings{CF06b,
author = {Callahan, Steven P and Freire, Juliana and Santos, Emanuele and Scheidegger, Carlos E and Silva, Claudio T and Vo, Huy T},
booktitle = {Data Engineering Workshops, 2006. Proceedings. 22nd International Conference on},
booktitle = {ICDE Workshops},
pages = {71--71},
title = {Managing the evolution of dataflows with {VisTrails}},
year = {2006}
@ -312,12 +289,6 @@ H. V. Jagadish},
year = {2016}
}
@phdthesis{F07a,
author = {Fuxman, A.D.},
school = {University of Toronto},
title = {Efficient query processing over inconsistent databases},
year = {2007}
}
@inproceedings{FM05,
author = {Fuxman, Ariel D and Miller, Renée J},
@ -475,12 +446,8 @@ century},
}
@inproceedings{koop@tapp2017,
address = {Berkeley, CA, USA},
author = {Koop, David and Patel, Jay},
booktitle = {TaPP},
numpages = {1},
pages = {17--17},
series = {TaPP'17},
title = {Dataflow Notebooks: Encoding and Tracking Dependencies of Cells},
year = {2017}
}
@ -592,8 +559,7 @@ Shankar Pal and
Istvan Cseri and
Gideon Schaller and
Nigel Westbury},
booktitle = {Proceedings of the ACM SIGMOD International Conference on Management
of Data, Paris, France, June 13-18, 2004},
booktitle = {SIGMOD},
pages = {903--908},
title = {ORDPATHs: Insert-Friendly XML Node Labels},
year = {2004}
@ -752,7 +718,10 @@ Xibei Jia},
@inproceedings{XH,
author = {Xu, Liqi and Huang, Silu and Hui, Sili and Elmore, A and Parameswaran, Aditya},
title = {ORPHEUSDB: A lightweight approach to relational dataset versioning}
title = {{OrpheusDB}: {A} Lightweight Approach to Relational Dataset Versioning},
booktitle = {SIGMOD},
pages = {1655--1658},
year = {2017},
}
@article{yang2015lenses,

View File

@ -71,7 +71,7 @@ Vizier facilitates debugging and re-usability of data and workflows by tracking
Vizier can propagate these annotations through operations of a notebook based on a principled, yet lightweight, uncertainty model called UA-DBs~\cite{feng:2019:sigmod:uncertainty}.
While some aspects of Vizier, such as automated dependency tracking for notebooks, versioning, and workflow provenance tracking, are also supported by other approaches, the combination of these features with support for caveats yields a system that is more than the sum of its components and provides capabilities that, to the best of our knowledge, no other approach supports.
Many aspects of Vizier, including parts of its user interface~\cite{freire:2016:hilda:exception,kumari:2016:qdb:communicating}, provenance models~\cite{DBLP:journals/debu/ArabFGLNZ17,DBLP:conf/visualization/BavoilCSVCSF05}, and caveats~\cite{yang2015lenses,feng:2019:sigmod:uncertainty}, were explored independently in prior work.
Many aspects of Vizier, including parts of its user interface~\cite{freire:2016:hilda:exception,kumari:2016:qdb:communicating}, provenance models~\cite{AG17c,DBLP:conf/visualization/BavoilCSVCSF05}, and caveats~\cite{yang2015lenses,feng:2019:sigmod:uncertainty}, were explored independently in prior work.
In this paper, we focus on the challenge of unifying these components into a cohesive, notebook-style system for data exploration and curation, and in particular on the practical implementation of caveats and Vizier's spreadsheet interface.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

View File

@ -17,24 +17,24 @@ Notebook systems like Jupyter and Zeppelin have received praise for their intera
\tinysection{Versioning and Provenance}
Another problem with notebook systems is their lack of versioning capabilities. For reproducibility and collaboration, it is essential to track versions of both the datasets produced and consumed by a notebook and of the notebook itself. Versioning is closely related to data provenance, which records how data was created, including both the dependencies among data items and the processes and actors involved in its creation. The W3C PROV standard~\cite{MB13a} has been proposed as an application-independent way of representing provenance information.
Provenance in workflow systems has been studied intensively in the past~\cite{CW17a,BM08,DF08,SV08,DC07,FS12,CF06b,DBLP:conf/visualization/BavoilCSVCSF05}. So-called retrospective provenance, the data and control dependencies of a workflow execution, can be used to reproduce a result and to understand how it was derived. Koop~\cite{DBLP:conf/ipaw/Koop16} and Callahan et al.~\cite{CF06b} propose tracking the provenance of how a workflow evolves over time in addition to the provenance of its executions. Niu et al.~\cite{DBLP:journals/pvldb/NiuALFZGKLG17} use a similar model to enable ``provenance-aware data workspaces'', which allow analysts to non-destructively change their workflows and update their data.
In the context of dataset versioning, prior work has investigated optimized storage for versioned datasets~\cite{XH,BD15,BB14,MG16a}. Bhattacherjee et al.~\cite{BC15a} study the trade-off between storage versus recreation cost for versioned datasets.
In the context of dataset versioning, prior work has investigated optimized storage for versioned datasets~\cite{XH,BD15,MG16a}. Bhattacherjee et al.~\cite{BC15a} study the trade-off between storage versus recreation cost for versioned datasets.
The version graphs used in these approaches essentially track coarse-grained provenance.
The Nectar system~\cite{GR10} automatically caches intermediate results of distributed dataflow computations, likewise trading storage for computational cost.
Similarly, metadata management systems like Ground and Apache Atlas (\url{https://atlas.apache.org/}) manage coarse-grained provenance for datasets in a data lake.
In contrast to workflow provenance which is often coarse-grained, i.e., at the level of datasets, database provenance is typically more fine-grained, e.g., at the level of rows~\cite{CC09,HD17,DBLP:journals/debu/ArabFGLNZ17,GM13,SJ18,MD18}. Many systems capture database provenance by annotating data and propagating these annotations during query processing.
Vizier's version and provenance management techniques integrate several lines of prior work by the authors including tracking the provenance of workflow versions~\cite{DBLP:journals/concurrency/ScheideggerKSVCFS08,XN16}, provenance tracking for updates and reenactment~\cite{DBLP:journals/tkde/ArabGKRG18,DBLP:journals/pvldb/NiuALFZGKLG17}, and using provenance-based techniques for tracking uncertainty annotations~\cite{yang2015lenses,feng:2019:sigmod:uncertainty}.
In contrast to workflow provenance which is often coarse-grained, i.e., at the level of datasets, database provenance is typically more fine-grained, e.g., at the level of rows~\cite{CC09,HD17,AF18,AG17c,GM13,SJ18,MD18}. Many systems capture database provenance by annotating data and propagating these annotations during query processing.
Vizier's version and provenance management techniques integrate several lines of prior work by the authors including tracking the provenance of workflow versions~\cite{DBLP:journals/concurrency/ScheideggerKSVCFS08,XN16}, provenance tracking for updates and reenactment~\cite{AG17c,DBLP:journals/pvldb/NiuALFZGKLG17}, and using provenance-based techniques for tracking uncertainty annotations~\cite{yang2015lenses,feng:2019:sigmod:uncertainty}.
The result is a system that is more than the sum of its components and, to the best of our knowledge, the first system to support all of these features.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\tinysection{Uncertain Data}
Vizier's caveats are a practical application of uncertain data management. Incomplete~\cite{GM18a,CL03b,M98b,IL84a}, inconsistent~\cite{F07a,FM05,BC04a,CL03b}, and probabilistic databases~\cite{SO11,OH10,WT08,AK07,RD06a} have been studied for several decades. % in the past.
Vizier's caveats are a practical application of uncertain data management. Incomplete~\cite{GM18a,CL03b,M98b,IL84a}, inconsistent~\cite{FM05,BC04a,CL03b}, and probabilistic databases~\cite{SO11,OH10,WT08,AK07,RD06a} have been studied for several decades. % in the past.
However, even simple types of queries become intractable when evaluated over uncertain data. While approximation techniques have been proposed (e.g.,~\cite{GM18a,OH10,GP17}), these techniques are often still not efficient enough, ignore useful (albeit uncertain) data, or do not support complex queries. In~\cite{feng:2019:sigmod:uncertainty} we formalized \emph{uncertainty-annotated databases} (\emph{UA-DBs}), a lightweight model for uncertain data in which rows are annotated as either certain or uncertain.
In~\cite{yang2015lenses} we introduced Lenses, uncertain versions of data curation and cleaning operators that represent the uncertainty inherent in a curation step using an attribute-level version of the UA-DB model. Data caveats in Vizier generalize this idea to support non-relational operations and to enrich such annotations with additional information that records more details about data errors.
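As a rough illustration only (the relational encoding below is a hypothetical sketch, not Vizier's actual implementation), one can think of a UA-DB as carrying a per-row certainty flag that queries propagate:
\begin{lstlisting}[language=SQL]
-- Hypothetical UA-DB-style encoding: each row carries a Boolean flag
-- marking whether it is certain, e.g.:
--   trips(id, fare, is_certain)
--   zones(id, zone, is_certain)
-- A selection keeps the flag of the input row:
SELECT id, fare, is_certain FROM trips WHERE fare > 10;
-- A join result row is certain only if both of its inputs are certain:
SELECT t.id, z.zone, (t.is_certain AND z.is_certain) AS is_certain
FROM trips t JOIN zones z ON t.id = z.id;
\end{lstlisting}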
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\tinysection{Data Spreadsheets}
Approaches like DataSpread and others~\cite{DBLP:conf/icde/BendreVZCP18,DBLP:conf/icde/LiuJ09,DBLP:conf/sigmod/BakkeK16} utilize spreadsheet interfaces as front-ends for databases. Vizier stands out through its seamless integration of spreadsheets and notebooks~\cite{freire:2016:hilda:exception}. Like other approaches that improve the usability of databases~\cite{DBLP:journals/debu/LiJ12}, Vizier provides a simple user interface that can be used effectively by both experts and non-experts and requires no background in relational data processing. Furthermore, we argue in~\cite{freire:2016:hilda:exception} that the spreadsheet and notebook interfaces complement each other well for data curation and exploration tasks. For example, spreadsheets are well suited to handling rare exceptions by manually updating cells and are convenient for certain schema-level operations (e.g., creating or deleting columns), while notebooks are better suited to complex workflows and bulk operations (e.g., automated data repair).
Integrating the spreadsheet paradigm, which heavily emphasizes updates (e.g., a user overwrites the value of a cell), with Vizier's functional, data-flow model of notebook workflows would have been challenging if not for our prior work on \emph{reenactment}~\cite{DBLP:journals/tkde/ArabGKRG18,AF18,AG17c,DBLP:journals/pvldb/NiuALFZGKLG17}. Reenactment enables us to
Integrating the spreadsheet paradigm, which heavily emphasizes updates (e.g., a user overwrites the value of a cell), with Vizier's functional, data-flow model of notebook workflows would have been challenging if not for our prior work on \emph{reenactment}~\cite{AG17c,AF18,DBLP:journals/pvldb/NiuALFZGKLG17}. Reenactment enables us to
translate updates into queries (side-effect-free functions).
%%% Local Variables:

View File

@ -24,7 +24,7 @@ Otherwise, an empty value is treated as \texttt{NULL}.
Through the spreadsheet interface, users can create, rename, reorder, or delete rows and columns, or alter data --- a standard set of DDL and DML operations for spreadsheets.
These operations cannot be applied in place without sacrificing the immutability of versions.
To preserve versioning and avoid unnecessary data copies, Vizier builds on a technique called reenactment~\cite{DBLP:journals/pvldb/NiuALFZGKLG17,DBLP:journals/tkde/ArabGKRG18}, which translates sequences of DML operations into equivalent queries.
To preserve versioning and avoid unnecessary data copies, Vizier builds on a technique called reenactment~\cite{DBLP:journals/pvldb/NiuALFZGKLG17,AF18}, which translates sequences of DML operations into equivalent queries.
We emphasize that the SQL code examples shown in this section are produced automatically as part of the translation of Vizual into SQL queries; users do not need to write SQL queries to express spreadsheet operations. The user's actions in the spreadsheet are automatically added to the notebook as Vizual cells, and these Vizual operations are automatically translated into equivalent SQL DDL/DML expressions~\cite{freire:2016:hilda:exception}.
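For intuition, the following sketch illustrates the kind of query such a translation might produce; the table, column, and row-identifier names are hypothetical, and the actual queries are generated automatically.
\begin{lstlisting}[language=SQL]
-- Hypothetical spreadsheet edit: set column rate to 0.07 in the row
-- with identifier 42. As a DML statement this would be:
--   UPDATE sheet SET rate = 0.07 WHERE rowid = 42;
-- Reenacted as a side-effect free query over the previous dataset
-- version, leaving that version intact:
SELECT rowid,
       CASE WHEN rowid = 42 THEN 0.07 ELSE rate END AS rate
FROM sheet_version_k;
\end{lstlisting}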
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@ -66,7 +66,7 @@ Ideally, we would like to use row identifiers that are stable through such chang
%Column identifiers are already defined by the source table.
%For row identifiers,
For derived data, Vizier uses a row identity model based on GProM's~\cite{DBLP:journals/debu/ArabFGLNZ17} encoding of provenance.
For derived data, Vizier uses a row identity model based on GProM's~\cite{AF18} encoding of provenance.
Derived rows, such as those produced by declaratively specified table updates, are identified as follows:
(1) Rows in the output of a projection or selection use the identifier of the source row that produced them;
(2) Rows in the output of a \lstinline{UNION ALL} are identified by the identifier of the source row and an identifier marking which side of the union the row came from\footnote{To preserve associativity and commutativity during optimization, union-handedness is recorded during parsing};