main
Boris Glavic 2023-03-18 17:39:00 -05:00
parent 60cc34c3b6
commit 00ee7d9bcd
6 changed files with 90 additions and 5 deletions

View File

@ -145,4 +145,38 @@ Nigel Westbury},
year = {2004}
}
% arXiv preprint, recorded DBLP-style (CoRR as journal plus eprint fields).
% Names are given as "Last, First" so that "De Leon" is kept together as the
% surname; in "First Last" form BibTeX would take only "Leon" as the surname.
% NOTE(review): "De Leon" is assumed to be a two-word surname -- confirm.
@article{tang-23-efcsfg,
  author     = {Tang, Dixin and Chen, Fanchao and De Leon, Christopher and Wattanawaroon, Tana and Yun, Jeaseok and Seshadri, Srinivasan and Parameswaran, Aditya G.},
  title      = {Efficient and Compact Spreadsheet Formula Graphs},
  journal    = {CoRR},
  volume     = {abs/2302.05482},
  year       = {2023},
  eprint     = {2302.05482},
  eprinttype = {arXiv},
}
% The system name is brace-protected so that styles which sentence-case
% titles do not turn "DATASPREAD" into "Dataspread".
@article{bendre-15-d,
  author  = {Bendre, Mangesh and Sun, Bofan and Zhang, Ding and Zhou, Xinyan and Chang, Kevin Chen-Chuan and Parameswaran, Aditya G.},
  title   = {{DATASPREAD}: Unifying Databases and Spreadsheets},
  journal = {PVLDB},
  volume  = {8},
  number  = {12},
  pages   = {2000--2003},
  year    = {2015},
}
% Conference paper (booktitle ICDE); names written in "Last, First" form.
@inproceedings{bendre-19-fhs,
  author    = {Bendre, Mangesh and Wattanawaroon, Tana and Rahman, Sajjadur and Mack, Kelly and Liu, Yuyang and Zhu, Shichu and Lu, Yu and Yang, Ping-Jing and Zhou, Xinyan and Chang, Kevin Chen-Chuan and Karahalios, Karrie and Parameswaran, Aditya G.},
  title     = {Faster, Higher, Stronger: Redesigning Spreadsheets for Scale},
  booktitle = {ICDE},
  pages     = {1972--1975},
  year      = {2019},
}
% NOTE(review): this entry looks like an unedited auto-export. Several authors
% are reduced to bare initials ("Yang, P", "Chang, K", "Karahalios, K",
% "Parameswaran, A") and "Liu, SZ Yuyang" appears garbled (possibly two people
% merged into one name). The entry also has no venue, howpublished or url.
% TODO: verify the full author list and publication details against the
% original source before citing.
@misc{rahman-19-exsssns,
author = {Rahman, Sajjadur and Bendre, Mangesh and Yang, P and Liu, SZ Yuyang and Su, Zhaoyuan and Chang, K and Karahalios, K and Parameswaran, A},
title = {Extending Spreadsheets to Support Seamless Navigation at Scale},
year = {2019}
}

View File

@ -156,6 +156,9 @@ While databases provide extensive functionality and guarantees for working with
\input{sections/introduction}
\input{sections/overview}
\input{sections/data}
\input{sections/relwork}
\input{sections/conclusions}
%%

10
sections/conclusions.tex Normal file
View File

@ -0,0 +1,10 @@
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Conclusions}
\label{sec:conclusions}
In this work, we introduced overlay spreadsheets as a potential solution for implementing scalable, versioned spreadsheets which have the important advantage that a user's edits can (where possible) be reapplied when there are updates to the input data, a common issue in practice. This novel capability is powered by overlays which allow updates to the spreadsheet to be represented declaratively. While representing updates as ``views'' over the original spreadsheet has been applied in the Vizier project to enable provenance-tracking and light-weight versioning for computational notebooks whose datasets can be accessed and edited through a spreadsheet interface, the overlay approach we present in this work significantly improves performance. \BG{How do we fare against DataSpread?}
%%% Local Variables:
%%% mode: latex
%%% TeX-master: "../main"
%%% End:

View File

@ -114,3 +114,8 @@ In a situation like this, when the closed form solution includes an aggregate te
Instead, we can identify a set of cells that form a cut over the dependency graph, where all of the visible cells are on one side of the cut.
If the source data can evaluate SQL (e.g., an Apache Spark dataframe), an approach analogous to~\cite{freire:2016:hilda:exception} can outsource materialization of these cells to the database.
%%% Local Variables:
%%% mode: latex
%%% TeX-master: "../main"
%%% End:

View File

@ -4,10 +4,11 @@
Spreadsheets are a popular tool for data exploration, as they provide a simple environment for programmatically accessing and manipulating data.
However, spreadsheets have historically had challenges managing ``big data'', with as few as fifty thousand rows of data creating problems for existing spreadsheet engines~\cite{DBLP:conf/sigmod/RahmanMBZKP20}.
One approach to scalability, employed by Wrangler~\cite{DBLP:conf/chi/KandelPHH11}, Vizier~\cite{freire:2016:hilda:exception,brachmann:2020:cidr:your}, and others relies on translating spreadsheet interactions into declarative transformations that can be deployed to a classical relational database.
A second, more recent approach, employed by DataSpread~\cite{DBLP:conf/sigmod/BendreWMCP19,DBLP:conf/sigmod/RahmanMBZKP20,DBLP:conf/icde/BendreVZCP18}, instead re-architects the entire spreadsheet runtime around database primitives like indexes and incremental maintenance.
One approach to scalability, employed by \emph{Wrangler}~\cite{DBLP:conf/chi/KandelPHH11}, \emph{Vizier}~\cite{freire:2016:hilda:exception,brachmann:2020:cidr:your}, and others relies on translating spreadsheet interactions into declarative transformations that can be deployed to a classical relational database.
A second, more recent approach, employed by \emph{DataSpread}~\cite{DBLP:conf/sigmod/BendreWMCP19,DBLP:conf/sigmod/RahmanMBZKP20,DBLP:conf/icde/BendreVZCP18}, instead re-architects the entire spreadsheet runtime around database primitives like indexes and incremental maintenance.
We refer to these as the relational and architectural approaches, respectively.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{figure}
\includegraphics[width=0.6\columnwidth]{graphics/overlay.png}
\Description{
@ -16,12 +17,21 @@ We refer to these as the relational and architectural approaches, respectively.
\caption{Architecture of an overlay spreadsheet}
\label{fig:overlay}
\end{figure}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Because classical relational engines are not optimized for low-latency update queries, the architectural approach can provide significantly better interactive performance than the relational approach.
However, the relational redesign has a subtle, but important benefit:
The data access patterns of spreadsheets have several characteristics~\cite{DBLP:conf/icde/BendreVZCP18, DBLP:conf/sigmod/RahmanMBZKP20, DBLP:conf/sigmod/BendreWMCP19} that differ from typical SQL queries and lead to several inefficiencies if not taken into account when implementing a spreadsheet system:
\begin{itemize}
\item \textbf{Positional Order:} Both the rows and columns in a spreadsheet are ordered and cells can be referenced by position. As already observed in \cite{DBLP:conf/icde/BendreVZCP18}, cell positions need to be maintained under operations (e.g., inserting or deleting rows and columns).
\item \textbf{Local Updates:} Editing the content of cells in a spreadsheet results in local updates that typically affect one or a small number of cells. Operations like inserting \& deleting rows or columns affect the whole spreadsheet and require updates to the positions of a large number of cells (\cite{DBLP:conf/icde/BendreVZCP18} presented an index structure that allows fast maintenance of positions under updates and access to a cell at a certain position).
\item \textbf{Limited Visibility:} At each point in time, the user can only observe a small portion of the spreadsheet (that fits on the screen). This enables lazy materialization of cell values and updates to the spreadsheet, as the values of cells which are not currently visible to the user only have to be computed if they affect the value of a currently visible cell (e.g., because the visible cell uses a formula that references the other cell).
\item \textbf{Formulas:} An important feature of spreadsheets is that cells may contain formulas, which are expressions that reference other cells in the spreadsheet. This essentially empowers the user to write highly localized ``views''. Formulas complicate the implementation of spreadsheets because, to compute the value of a cell, the values of all cells it depends on directly or indirectly have to be determined first.
\end{itemize}
Because relational engines are not optimized for low-latency update queries, the architectural approach can provide significantly better interactive performance than the relational approach.\BG{Furthermore, an important feature of spreadsheets is that cells may contain expressions (\emph{formulas}) which reference other cells. This enables users to create ``local'' views which populate some parts of the spreadsheet automatically. The evaluation of formulas is an inherently recursive process (a cell with a formula may refer to another cell that itself contains a formula and so on) which is hard to express efficiently in SQL.}
However, the relational design has an important benefit:
User interactions manipulate a data transformation process, rather than the data itself.
In Wrangler, the resulting data transformation process can be easily upscaled from an interaction-friendly sample of the data to the entire dataset.
In Vizier, the user's manipulations are encoded in the lineage of a Spark's dataframe, facilitating detailed provenance analysis.
In Vizier, the user's manipulations are encoded in the lineage of a Spark dataframe, facilitating detailed provenance analysis and effective versioning.
In this paper, we present a hybrid of the architectural and relational approaches to scalable spreadsheets: \emph{Overlay Spreadsheets}.
An Overlay Spreadsheet keeps source data in-situ, decoupled from the user's edits to a spreadsheet ``overlaid'' on top of the source data, as illustrated in \Cref{fig:overlay}.
@ -57,3 +67,8 @@ We explore the challenges of integrating overlay spreadsheets with Apache Spark
%%% Local Variables:
%%% mode: latex
%%% TeX-master: "../main"
%%% End:

18
sections/relwork.tex Normal file
View File

@ -0,0 +1,18 @@
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Related Work}
\label{sec:related-work}
Scaling up spreadsheets has been identified as an issue in prior work by the database community. One noteworthy project is \emph{DataSpread}~\cite{DBLP:conf/icde/BendreVZCP18, DBLP:conf/sigmod/RahmanMBZKP20, DBLP:conf/sigmod/BendreWMCP19}. \cite{DBLP:conf/icde/BendreVZCP18} introduced several storage layouts for spreadsheet data (e.g., a position to value mapping that is efficient for sparse spreadsheets or encoding the rows / columns as rows of a single relation which is more efficient for dense spreadsheets), introduced a heuristic for self-tuning storage by selecting an appropriate layout for individual parts of a large spreadsheet, and introduced a tree index structure that enables the positions of cells to be maintained under insertions and deletions of rows in time logarithmic in the size of the spreadsheet while also supporting look-ups (retrieving the cell at a certain position) in logarithmic time.
%
\cite{DBLP:conf/sigmod/BendreWMCP19} introduced asynchronous algorithms for updating the values of cells with formulas when a cell is updated. This work compresses the dependency graph of a spreadsheet which stores dependencies between cells (the formula of a cell references another cell) into a table that compactly over-approximates the transitive closure of the inverse dependency relation for cells using a constant number of cell ranges. When a cell is updated, this table is then used to determine a super-set of the cells that depend on the cell directly or indirectly and may need to be refreshed. \cite{tang-23-efcsfg} introduces a different type of compressed dependency graph which is lossless and exploits repetitive patterns in formulas which are common in spreadsheets due to features like auto-fill and the fact that a formula only determines the value of a single cell, e.g., when all cells of a column are computed based on other columns within the same row. While these techniques enable fast re-computation of cell values, they do not enable the input dataset to be updated as they do not track updates. Furthermore, how to efficiently support updates like inserting and deleting rows which potentially affect large parts of the dependency graph has not been addressed in this work.\footnote{\cite{tang-23-efcsfg} may be better equipped to handle such updates, but as this is a lossless data structure, this may still require modifying a large number of, or even all, entries in a compressed formula graph.}
Vizier~\cite{brachmann:2019:sigmod:data, kennedy:2022:ieee-deb:right, kumari:2021:cidr:datasense, brachmann:2020:cidr:your} is a computational notebook system that automatically versions notebooks while they are edited by a user. This is achieved using a light-weight versioning scheme based on workflow evolution provenance, i.e., storing updates to the code (computation) of a notebook rather than the (input, intermediate, and result) data. In Vizier, any dataset used in a computational notebook can be accessed and edited through a spreadsheet interface.
In summary, several efficient algorithms for storing, accessing, and updating spreadsheets have been developed and adapted in the context of DataSpread. The approach developed for Vizier is often less efficient, but has the advantage of supporting light-weight versioning and tracking the provenance of the evolution of a dataset (and the computational notebook containing it) under spreadsheet operations. Importantly, this approach enables replaying a user's updates that were originally applied to a dataset $D_{old}$ when $D_{old}$ is replaced with an updated dataset $D_{new}$ (e.g., the user may have downloaded a new version of an open dataset and wants to keep the manual fixes they have applied to the original version of the dataset). The overlay approach we present in this work has the potential to retain these benefits while enabling performance close to or exceeding that of DataSpread \BG{fix if not true}. Furthermore, overlays with reference frames enable more efficient support for insertion and deletion of rows and columns as this only affects reference frames, but not the formulas of cells. \BG{That right? How much do we gain by that?}
%%% Local Variables:
%%% mode: latex
%%% TeX-master: "../main"
%%% reftex-default-bibliography: ("../main.bib")
%%% End: