main
Oliver Kennedy 2023-03-07 15:26:23 -05:00
parent 25c451236b
commit 0a78c87f27
Signed by: okennedy
GPG Key ID: 3E5F9B3ABD3FDB60
6 changed files with 161 additions and 55 deletions

BIN
graphics/overlay.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 174 KiB

BIN
graphics/sketch.xopp Normal file

Binary file not shown.

View File

@ -7,3 +7,99 @@
year = {2016}
}
@inproceedings{DBLP:conf/sigmod/BendreWMCP19,
author = {Mangesh Bendre and
Tana Wattanawaroon and
Kelly Mack and
Kevin Chang and
Aditya G. Parameswaran},
title = {Anti-Freeze for Large and Complex Spreadsheets: Asynchronous Formula
Computation},
booktitle = {{SIGMOD} Conference},
pages = {1277--1294},
publisher = {{ACM}},
year = {2019}
}
@inproceedings{DBLP:conf/sigmod/RahmanMBZKP20,
author = {Sajjadur Rahman and
Kelly Mack and
Mangesh Bendre and
Ruilin Zhang and
Karrie Karahalios and
Aditya G. Parameswaran},
title = {Benchmarking Spreadsheet Systems},
booktitle = {{SIGMOD} Conference},
pages = {1589--1599},
publisher = {{ACM}},
year = {2020}
}
@inproceedings{brachmann:2020:cidr:your,
author = {Brachmann, Michael and Spoth, William and Kennedy, Oliver and Glavic, Boris and Mueller, Heiko and Castelo, Sonia and Bautista, Carlos and Freire, Juliana},
title = {Your notebook is not crumby enough, REPLace it},
booktitle = {CIDR},
year = {2020}
}
@inproceedings{DBLP:conf/chi/KandelPHH11,
author = {Sean Kandel and
Andreas Paepcke and
Joseph M. Hellerstein and
Jeffrey Heer},
title = {Wrangler: interactive visual specification of data transformation
scripts},
booktitle = {{CHI}},
pages = {3363--3372},
publisher = {{ACM}},
year = {2011}
}
@inproceedings{kumari:2021:cidr:datasense,
author = {Kumari, Poonam and Brachmann, Michael and Kennedy, Oliver and Feng, Su and Glavic, Boris},
title = {DataSense: Display Agnostic Data Documentation},
booktitle = {CIDR},
year = {2021}
}
@article{kennedy:2022:ieee-deb:right,
author = {Kennedy, Oliver and Glavic, Boris and Freire, Juliana and Brachmann, Mike},
title = {The Right Tool for the Job: Data-Centric Workflows in Vizier},
journal = {IEEE-DEB},
year = {2022}
}
@inproceedings{brachmann:2019:sigmod:data,
author = {Brachmann, Mike and Bautista, Carlos and Castelo, Sonia and Feng, Su and Freire, Juliana and Glavic, Boris and Kennedy, Oliver and Mueller, Heiko and Rampin, Remi and Spoth, William and Yang, Ying},
title = {Data Debugging and Exploration with Vizier},
booktitle = {SIGMOD-Demo},
year = {2019}
}
@inproceedings{DBLP:conf/cidr/BakkeB11,
author = {Eirik Bakke and
Edward Benson},
title = {The Schema-Independent Database {UI:} {A} Proposed Holy Grail and
Some Suggestions},
booktitle = {{CIDR}},
pages = {219--222},
publisher = {www.cidrdb.org},
year = {2011}
}

View File

@ -1,57 +1,6 @@
%%
%% This is file `sample-sigconf.tex',
%% generated with the docstrip utility.
%%
%% The original source files were:
%%
%% samples.dtx (with options: `sigconf')
%%
%% IMPORTANT NOTICE:
%%
%% For the copyright see the source file.
%%
%% Any modified versions of this file must be renamed
%% with new filenames distinct from sample-sigconf.tex.
%%
%% For distribution of the original source see the terms
%% for copying and modification in the file samples.dtx.
%%
%% This generated file may be distributed as long as the
%% original source files, as listed above, are part of the
%% same distribution. (The sources need not necessarily be
%% in the same archive or directory.)
%%
%%
%% Commands for TeXCount
%TC:macro \cite [option:text,text]
%TC:macro \citep [option:text,text]
%TC:macro \citet [option:text,text]
%TC:envir table 0 1
%TC:envir table* 0 1
%TC:envir tabular [ignore] word
%TC:envir displaymath 0 word
%TC:envir math 0 word
%TC:envir comment 0 0
%%
%%
%% The first command in your LaTeX source must be the \documentclass
%% command.
%%
%% For submission and review of your manuscript please change the
%% command to \documentclass[manuscript, screen, review]{acmart}.
%%
%% When submitting camera ready or to TAPS, please change the command
%% to \documentclass[sigconf]{acmart} or whichever template is required
%% for your publication.
%%
%%
\documentclass[sigconf,draft]{acmart}
\documentclass[sigconf,review]{acmart}
%%
%% \BibTeX command to typeset BibTeX logo in the docs
\AtBeginDocument{%
\providecommand\BibTeX{{%
Bib\TeX}}}
\usepackage{cleveref}
%% Rights management information. This information is sent to you
%% when you complete the rights form. These commands have SAMPLE
@ -117,7 +66,7 @@
%%
%% The "title" command has an optional parameter,
%% allowing the author to define a "short title" to be used in page headers.
\title{Sparking Insights: Overlay Spreadsheets on Apache Spark}
\title{Sparking Insights: Overlaying Spreadsheets onto Apache Spark}
%%
%% The "author" command and its associated commands are used to define
@ -131,6 +80,7 @@
\email{okennedy@buffalo.edu}
\affiliation{%
\institution{University at Buffalo}
\city{Buffalo}
\country{USA}
}
@ -138,12 +88,14 @@
\email{bglavic@iit.edu}
\affiliation{%
\institution{Illinois Institute of Technology}
\city{Chicago}
\country{USA}
}
\author{Michael Brachmann}
\affiliation{%
\institution{Breadcrumb Analytics}
\city{Buffalo}
\country{USA}
}
@ -199,7 +151,8 @@
%% information and builds the first part of the formatted document.
\maketitle
\section{Introduction}
\input{sections/introduction}
\input{sections/overview}

39
sections/introduction.tex Normal file
View File

@ -0,0 +1,39 @@
%!TEX root=../main.tex
\section{Introduction}
\label{sec:introduction}
Spreadsheets are a popular tool for data exploration, as they provide a simple environment for programmatically accessing and manipulating data.
However, spreadsheets have historically had challenges managing ``big data'', with as few as fifty thousand rows of data creating problems for existing spreadsheet engines~\cite{DBLP:conf/sigmod/RahmanMBZKP20}.
Several approaches have been proposed to enable scalable spreadsheets, from translating spreadsheet interactions into database-style relational algebra~\cite{freire:2016:hilda:exception,brachmann:2020:cidr:your,DBLP:conf/chi/KandelPHH11}, to re-architecting spreadsheets using database primitives like indexes and incremental maintenance~\cite{DBLP:conf/sigmod/BendreWMCP19}.
\begin{figure}
\includegraphics[width=0.6\columnwidth]{graphics/overlay.png}
\Description{
Two layers. At the bottom, a classical relational table including column headers and miscellaneous data values. Stacked over it is a set of edits: cells with dependencies marked by arrows.
}
\caption{Architecture of an overlay spreadsheet}
\label{fig:overlay}
\end{figure}
Because classical relational engines are not optimized for low-latency update queries, a re-architected spreadsheet (i.e., the latter approach) can provide significantly better interactive performance.
However, it sacrifices a key benefit of the former approach, the ability to replay a user's interactions on a new, updated dataset.
In this paper, we propose a new approach to spreadsheet design, called \emph{Overlay Spreadsheets}, where the user's edits to a spreadsheet are decoupled from the source data over which the spreadsheet is built.
As illustrated in \Cref{fig:overlay}, an overlay spreadsheet is a collection of \emph{updates} overlaid on top of a static dataset.
Users interact with the resulting structure much like they would an ordinary spreadsheet, inserting or removing rows or columns, overwriting data with formulas or literals, and reorganizing the data.
References to the underlying data are virtualized in the overlay layer, and only materialized when the spreadsheet is opened.
In addition to allowing source data to be seamlessly updated, this approach can also improve scalability.
The memory requirements of a fully in-memory spreadsheet (e.g., DataSpread~\cite{DBLP:conf/sigmod/BendreWMCP19}, Excel, or OpenOffice Calc) are proportional to the total amount of data.
Conversely, under typical usage patterns, an overlay spreadsheet's memory consumption and performance overheads are proportional to the complexity of the user's modifications, rather than the size of the initial dataset.
Because interactive sessions are typically limited by the bandwidth of the user, and not the raw data, this reduction in memory consumption and the resulting performance improvement can be significant.
As a further advantage of the overlay approach, user interactions with the overlay can be translated into other representations~\cite{freire:2016:hilda:exception}.
For example, a user's edits on a spreadsheet can be translated into a series of transformations over a dataframe, allowing seamless integration of existing approaches to provenance management~\cite{brachmann:2020:cidr:your,kumari:2021:cidr:datasense} and workflow execution~\cite{kennedy:2022:ieee-deb:right}.
In this paper, we introduce Overlay Spreadsheets, and present the details of our prototype implementation.
We implement the concept in the Vizier notebook~\cite{kennedy:2022:ieee-deb:right,brachmann:2020:cidr:your,brachmann:2019:sigmod:data}, a workflow-style notebook built over Apache Spark.
We explore the challenges of integrating overlay spreadsheets with Apache Spark dataframes, and discuss preliminary work in translating an overlay spreadsheet to derive a dataframe.

18
sections/overview.tex Normal file
View File

@ -0,0 +1,18 @@
%!TEX root=../main.tex
\section{System Overview}
\label{sec:overview}
A spreadsheet is a regular grid of cells, which are defined by formulas.
A cell's formula may be a literal value, or an expression defining a computation that may be based on the value of other cells.
The value of a cell is the result of evaluating the cell's formula.
This may require obtaining the value of cells on which the formula depends; we refer to such cells as \emph{upstream} cells.
When a cell is modified, the values of downstream (i.e., dependent) cells are updated accordingly.
That is, in contrast to a relational table, which can be updated by a sequence of imperative operations, the formulas of a spreadsheet are evaluated (conceptually) at the same time.
A cycle in the dependency graph (i.e., a cell being upstream of itself) is an error, and any cells participating in the cycle evaluate to a special error value.
In contrast to classical spreadsheets, where each cell is a completely independent entity, we adopt the Relational spreadsheet model~\cite{DBLP:conf/cidr/BakkeB11}, which focuses on so-called `tidy data,' where each row is one record, and each column represents a distinct (strongly typed) variable.
\subsection{Source Data}
An Overlay data source is primarily responsible for defining the initial shape (i.e., schema and number of rows) of the dataset, and providing random access to individual cell values.