paper-Vizier-SpreadsheetOve.../sections/model.tex

323 lines
17 KiB
TeX

%!TEX root=../main.tex
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Spreadsheet Data Model}
\label{sec:spre-datam}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Colors
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\definecolor{tabexprcolor}{rgb}{0,0.5,0}
\newcommand{\comprehension}[2]{\left\{\;#1\;|\;#2\;\right\}}
\newcommand{\ds}{D}
\newcommand{\spreadsheet}{\mathbb S}
\newcommand{\encodedSpreadsheet}{\hat \spreadsheet}
\newcommand{\columnDomain}{\mathcal C}
\newcommand{\columnRange}{C}
\newcommand{\column}{c}
\newcommand{\rowDomain}{\mathcal R}
\newcommand{\rowRange}{R}
\newcommand{\row}{r}
\newcommand{\rowOffset}{\delta}
\newcommand{\cell}{\cellRef{\column}{\row}}
\newcommand{\cellPrime}{\cellRef{\column'}{\row'}}
\newcommand{\exprDomain}{\mathcal E}
\newcommand{\expr}{e}
\newcommand{\patternDomain}{\mathcal P}
\newcommand{\pattern}{P}
\newcommand{\valueDomain}{\mathcal V}
\newcommand{\valat}[3]{#1[#2,#3]}
\newcommand{\rangeOf}[2]{#1[#2]}
\newcommand{\range}{\rangeOf{\columnRange}{\rowRange}}
\newcommand{\cellRef}[2]{#1[#2]}
\newcommand{\evalOf}[2]{\left\llbracket\; #2 \;\right\rrbracket_{#1}}
\newcommand{\patternOf}[2]{\left\llbracket\; #1 \;\right\rrbracket_{#2}}
\newcommand{\depsOf}[1]{\textbf{deps}\left(#1\right)}
\newcommand{\tdepsOf}[1]{\ensuremath{\text{\textbf{deps}\ast\left(#1\right)}\xspace}}
\newcommand{\DG}[1]{\ensuremath{G_{#1}}\xspace}
\newcommand{\TDG}[1]{\ensuremath{G_{#1}^*}\xspace}
\newcommand{\errorval}{\ensuremath{\bot}\xspace}
\newcommand{\se}[1]{{\textcolor{tabexprcolor}{#1}}}
\newcommand{\uv}[1]{{\textcolor{red}{#1}}}
\newcommand{\upd}{U}
\newcommand{\cu}[3]{\cellRef{#1}{#2} = #3}
\newcommand{\cellupdate}{u}
\newcommand{\dom}{\textsc{Dom}}
\newcommand{\acu}{\cu{\column}{\row}{\expr}}
\newcommand{\rframe}{\ensuremath{\mathcal{F}}\xspace}
\newcommand{\rtrans}{\ensuremath{\mathcal{T}}\xspace}
\newcommand{\overlay}{\mathcal{O}}
\newcommand{\ol}[2]{\tuple{#1,#2}}
\newcommand{\oins}{\mathcal{I}}
\newcommand{\oup}{\mathcal{U}}
\newcommand{\aol}{\ol{\rtrans}{\oup}}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Spreadsheets}
\label{sec:spreadsheets}
Let $\columnDomain$ and $\rowDomain$ denote domains of column and row labels. Except where noted, $\rowDomain \subset \mathbb Z$.
Let $\valueDomain$ and $\exprDomain \supset \valueDomain$ denote domains of values and expressions, respectively.
A \emph{spreadsheet} $\spreadsheet : (\columnDomain \times \rowDomain) \rightarrow \exprDomain$ is a partial mapping from \emph{cells} ($\cellRef{\column}{\row} \in (\columnDomain \times \rowDomain)$) to expressions.
We use $\valat{\spreadsheet}{\column}{\row}$ to denote $\spreadsheet(\cellRef{\column}{\row})$.
Let $\errorval \in \valueDomain$ indicate ``undefined'' and define the \emph{domain} $\dom(\spreadsheet)$ to be the set of cells $\cellRef{\column}{\row}$ where $\valat{\spreadsheet}{\column}{\row} \neq \errorval$.
An expression $\expr \in \exprDomain$ is a formula defined over literals from $\valueDomain$, the standard arithmetic operators, and references to other cells in the spreadsheet ($\cellRef{\column}{\row}$).
The expression $\expr$ is evaluated in the context of a spreadsheet ($\evalOf{\spreadsheet}{\cdot} : \exprDomain \rightarrow \valueDomain$) as follows:
(i) Literals and arithmetic are evaluated in the usual way, and (ii) References to the spreadsheet are evaluated recursively
($\evalOf{\spreadsheet}{\cellRef{\column}{\row}} \equiv \evalOf{\spreadsheet}{\spreadsheet(\column, \row)}$).
By convention, cyclic references evaluate to $\errorval$.
%
An expression's dependencies ($\depsOf{\expr}$) are the cells referenced by $\expr$.
Dependencies induce a graph $\DG{\spreadsheet}\tuple{N, E}$ over the spreadsheet, with cells as nodes (i.e., $N = \columnDomain \times \rowDomain$), and dependencies as directed edges:
$$E = \bigcup_{\cell \in \columnDomain \times \rowDomain}
\{\;\cell \rightarrow \cellPrime\;|\;\cellPrime \in \depsOf{\valat{\spreadsheet}{\column}{\row}}\;\} $$
Denote by $\TDG{\spreadsheet}$ the graph $\tuple{V,E^*}$ where $E^*$ is the transitive closure of $E$ (i.e., $\TDG{\spreadsheet}$ captures both direct and indirect dependencies).
%
Note that if all cell expressions are constants (i.e., a spreadsheet without formulas), then $\evalOf{\spreadsheet}{\cellRef{\column}{\row}} = \valat{\spreadsheet}{\column}{\row}$.
\begin{example}
Consider the spreadsheet at the top of \Cref{fig:example-spreadsheet-and-a}.
Columns \emph{A} and \emph{B} hold constant expressions, while column \emph{C} holds reference cells from columns \emph{A} and \emph{B}.
Evaluating this spreadsheet assigns each cell a value, as in the top right.
For example, $\cellRef{C}{1}$ evaluates to $\evalOf{\spreadsheet}{\cellRef{A}{1} + \cellRef{B}{1}} = \evalOf{\spreadsheet}{\cellRef{A}{1}} + \evalOf{\spreadsheet}{\cellRef{B}{1}} = 15 + 50 = 65$.
\end{example}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{figure}[t]
\centering
\newcommand{\tabhead}[1]{\cellcolor{black}{\textcolor{white}{#1}}}
\begin{minipage}{0.48\linewidth}
\centering
\textbf{\small Spreadsheet $\spreadsheet$}\\
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{tabular}{c|c|c|c|}
\cline{2-4}
& \tabhead{A} & \tabhead{B} & \tabhead{C}\\ \hline
\tabhead{1} &\se{15} &\se{50} &\se{A1 + B1}\\ \hline
\tabhead{2} &\se{20} &\se{60} &\se{A2 + B2} \\ \hline
\tabhead{3} &\se{25} &\se{100} &\se{A3 + B3}\\ \hline
\tabhead{4} &\se{50} &\se{0} &\se{A4 + B4}\\ \hline
\end{tabular}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\end{minipage} %
%
\begin{minipage}{0.49\linewidth}
\centering
\textbf{\small Evaluated Spreadsheet $\evalOf{\spreadsheet}{\cdot}$}\\
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{tabular}{c|c|c|c|}
\cline{2-4}
& \tabhead{A} & \tabhead{B} & \tabhead{C}\\ \hline
\tabhead{1} & 15 & 50 & 65\\ \hline
\tabhead{2} & 20 & 60 & 80 \\ \hline
\tabhead{3} & 25 & 100 & 125\\ \hline
\tabhead{4} & 50 & 0 & 50\\ \hline
\end{tabular}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\end{minipage}\\[2mm]
%$,$\\ \vspace{5mm}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{minipage}{1 \linewidth}
\centering
\textbf{Update} $\upd = \{ \cu{A}{1}{20}, \cu{C}{3}{2 \cdot A3 + B3} \}$
\end{minipage}\\[1mm]
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%$,$\\ %\vspace{5mm}
\begin{minipage}{0.46\linewidth}
\centering
\textbf{\small Updated Spreadsheet $\upd(\spreadsheet)$}\\
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{tabular}{c|c|c|c|}
\cline{2-4}
& \tabhead{A} & \tabhead{B} & \tabhead{C}\\ \hline
\tabhead{1} &\uv{20} &\se{50} &\se{A1 + B1}\\ \hline
\tabhead{2} &\se{20} &\se{60} &\se{A2 + B2} \\ \hline
\tabhead{3} &\se{25} &\se{100} &\uv{2 $\cdot$ A3 + B3}\\ \hline
\tabhead{4} &\se{50} &\se{0} &\se{A4 + B4}\\ \hline
\end{tabular}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\end{minipage} %
%
\begin{minipage}{0.49\linewidth}
\centering
\textbf{\small Evaluated Update $\evalOf{\upd(\spreadsheet)}{\cdot}$}\\
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{tabular}{c|c|c|c|}
\cline{2-4}
& \tabhead{A} & \tabhead{B} & \tabhead{C}\\ \hline
\tabhead{1} & \uv{20} & 50 & \uv{70}\\ \hline
\tabhead{2} & 20 & 60 & 80 \\ \hline
\tabhead{3} & 25 & 100 & \uv{150}\\ \hline
\tabhead{4} & 50 & 0 & 50\\ \hline
\end{tabular}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\end{minipage}
\vspace{-3mm}
\caption{Example spreadsheet with expressions shown in \textcolor{tabexprcolor}{dark green}, and an update applied to the spreadsheet with updated expressions and values shown in \uv{red}.}\label{fig:example-spreadsheet-and-a}
\trimfigurespacing
% \vspace{1mm}
\end{figure}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Cell Updates}
\label{sec:updates}
A cell update set $\upd \subseteq \columnDomain \times \rowDomain \times \exprDomain$ is a set of cell updates of the form $\acu$ that assign to cell $\cellRef{\column}{\row}$ the expression $\expr$.
Denote by $\dom(\upd)$ the domain of update $\upd$, containing all cells $\cellRef{\column}{\row}$ defined in $\upd$ (i.e., $\exists \expr : ([\acu] \in \upd)$).
Applying an update $\upd$ to a spreadsheet $\spreadsheet$ returns an updated spreadsheet:
\[
\valat{\upd(\spreadsheet)}{\column}{\row} =
\begin{cases}
\upd(\cellRef{\column}{\row}) &\text{\textbf{if}}\,\, \cellRef{\column}{\row} \in \dom(\upd)\\
\valat{\spreadsheet}{\column}{\row} &\text{\textbf{otherwise}}\\
\end{cases}
\]
An update may affect cells beyond its domain.
For example, the update shown in \Cref{fig:example-spreadsheet-and-a} changes two cells $\cellRef{A}{1}$ and $\cellRef{C}{3}$, but evaluating the updated spreadsheet $\upd(\spreadsheet)$ results in \emph{three} cell changes (in red).
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Spreadsheet Access to Datasets}
\label{sec:spre-access-datas}
To uniformly model source datasets, whether from relational databases or other spreadsheets, we assume an input dataset $\ds$ with designated row and column labels $\columnDomain_{\ds}$ and $\rowDomain_{\ds}$ as appropriate to the source data.
In a relational table, these are the table's columns and the values of a key or rowid attribute, respectively.
For csv data, $\rowDomain_{\ds} \subset \mathbb Z$ is the position of the row in the file.
We write $\valat{\ds}{\row}{\column}$ to denote the value at column $\column \in \columnDomain_\ds$ of row $\row \in \rowDomain_\ds$ in $\ds$.
%
Denote by $\rframe: \rowDomain_{\ds} \to \mathbb{Z}$ a reference frame, an injective map from rows in $\ds$ to rows of the spreadsheet.
A \emph{spreadsheet overlay} for a dataset $\ds$ is a pair $(\ds, \rframe)$ that defines a spreadsheet $\spreadsheet_{\ds, \rframe}$ with domains $\columnDomain = \columnDomain_{\ds}$, $\rowDomain = \dom(\rframe)$ as
$
\valat{\spreadsheet_{\ds, \rframe}}{\column}{\row} = \valat{\ds}{\column}{\rframe^{-1}(\row)}
$
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Overlay Updates}
\label{sec:overlay-updates}
An Overlay Update describes a set of changes to a spreadsheet (or dataset).
As we will discuss in \Cref{sec:system-presentation}, column operations are purely cosmetic in our model, and we focus on cell and row updates exclusively.
Concretely, a spreadsheet overlay $\overlay = \aol$ is a reference frame transformation $\rtrans$ and a set of pattern updates $\oup$, terms we now define.
% We now define these terms, and discuss their semantics.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\partitle{Reference Frame Transformations}
Recall that a reference frame maps the spreadsheet's positional row references to native record identifiers.
Thus, to insert, delete, or move rows in the spreadsheet, it is sufficient to modify the reference frame.
Formally, a reference frame transformation $\rtrans$ is an injective mapping $\mathbb{Z} \to \mathbb{Z} \cup \errorval$ from initial row positions to new row positions, or the value $\errorval$ for a deleted row ($\rtrans$ is allowed to map multiple inputs to $\errorval$).
The new reference frame, after applying $\overlay$ is $\rframe' = \rtrans \circ \mathcal F$, where $\circ$ denotes function composition.
As an example, consider deleting the 2nd row from \Cref{fig:example-spreadsheet-and-a}. The positions of rows $3$ and $4$ are decreased by one, while row $1$ retains its position
$$\rtrans(x) = \begin{cases}
x & \textbf{if } x < 2\\
\errorval & \textbf{if } x = 2\\
x-1 & \textbf{otherwise}
\end{cases}$$
Row insertions and movement are handled analogously.
Note that row insertions, deletions, and movement are expressible in \emph{constant size}, independent of the size of the data.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\partitle{Pattern Updates}
Spreadsheets allow a formula from one cell to be pasted across a range of cells.
%\footnote{``Relative" column and row references are updated to be relative to each cell the formula is pasted into.}
In a classical spreadsheet, bulk interactions like this modify each cell's expression individually.
Overlay spreadsheets avoid the high cost that individual modifications can entail by grouping together the set of pasted cells into a single \emph{pattern}.
A \emph{range} $\rangeOf{\columnRange}{\rowRange}$ is the Cartesian product $\columnRange \times [l,h]$ of a set of columns ($\columnRange \subseteq \columnDomain$) and row positions ($R = [l, h] \subset \mathbb{Z}$).
%
A pattern update $\oup$ is a set of pairs $\{ (\rangeOf{C_i}{R_i}, \pattern_i) \}$ where $\rangeOf{C_i}{R_i}$ is a range and $\pattern_i$ is a \emph{pattern expression}, i.e., an expression that may also contain cell references where rows are relative offsets (written as $+i$ or $-i$).
Ranges in an update $\rangeOf{C_i}{R_i}$ must be pairwise disjoint.
A pattern update $(\rangeOf{C_i}{R_i}, \pattern_i)$ assigns an expression to every cell $\cellRef{\column}{\row}$ in $\rangeOf{C_i}{R_i}$ by replacing any relative references of the form $\cellRef{\column}{+\delta}$ in $\pattern_i$ with $\cellRef{\column}{\row + \delta}$. We use $\pattern_i(\cell)$ to denote instantiation of pattern $\pattern_i$ for cell $\cell$.
For instance, to store a running sum of the values in column \emph{C} into column \emph{D} (for the spreadsheet from \Cref{fig:example-spreadsheet-and-a}):\\[-2mm]
%
\[
\oup_{running} = (\rangeOf{D}{1}, (C,+0)), (\rangeOf{D}{2-4}, (C,+0) + (D,-1))
\]
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{figure}[t]
\centering
\newcommand{\tabhead}[1]{\cellcolor{black}{\textcolor{white}{#1}}}
\begin{minipage}{1\linewidth}
\centering
\textbf{Spreadsheet $\spreadsheet$} \\
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{tabular}{c|c|c|c|c|}
\cline{2-4}
& \tabhead{A} & \tabhead{B} & \tabhead{C} & \tabhead{D} \\ \hline
\tabhead{1} & \se{15} & \se{50} & \se{A1 + B1} & \uv{C1} \\ \hline
\tabhead{2} & \se{20} & \se{60} & \se{A2 + B2} & \uv{C2 + D1} \\ \hline
\tabhead{3} & \se{25} & \se{100} & \se{A3 + B3} & \uv{C3 + D2} \\ \hline
\tabhead{4} & \se{50} & \se{0} & \se{A4 + B4} & \uv{C4 + D3} \\ \hline
\end{tabular}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\end{minipage} %
$\,$\\
\begin{minipage}{1\linewidth}
\centering
\textbf{Evaluated Spreadsheet $\evalOf{\spreadsheet}{\cdot}$} \\
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{tabular}{c|c|c|c|c|}
\cline{2-4}
& \tabhead{A} & \tabhead{B} & \tabhead{C} & \tabhead{D} \\ \hline
\tabhead{1} & 15 & 50 & 65 & \uv{65} \\ \hline
\tabhead{2} & 20 & 60 & 80 & \uv{145} \\ \hline
\tabhead{3} & 25 & 100 & 125 & \uv{270} \\ \hline
\tabhead{4} & 50 & 0 & 50 & \uv{320} \\ \hline
\end{tabular}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\end{minipage}
\caption{Example overlay update and result (updated expressions and values are shown in \uv{red}).}\label{fig:example-overlay-update}
\trimfigurespacing
\end{figure}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\partitle{Semantics for Overlay Updates}
%
An overlay update $\overlay$ applied to a spreadsheet $\spreadsheet$ defines the spreadsheet $\overlay(\spreadsheet)$ computed by applying the reference frame update and then applying all pattern updates (with $\overlay = \tuple{\rtrans, \{ (\columnRange_i, \rowRange_i, \pattern_i)\}})$:
\begin{align*}
\valat{\overlay(\spreadsheet)}{\column}{\row} &=
\begin{cases}
\pattern_i(\cellRef{\column}{\row}) & \text{\textbf{if}}\,\, \exists i: \cellRef{\column}{\row} \in \rangeOf{C_i}{R_i} \\
\valat{\spreadsheet}{\column}{\rtrans^{-1}(\row)} & \text{\textbf{if}}\,\, \exists \row': \rtrans(\row') = \row\\
\errorval &\text{\textbf{otherwise}}\\
\end{cases}
\end{align*}
\begin{example}
\label{ex:recursive-running-sum}
Consider our example update ($\overlay_{running} = (\rtrans_{id},\oup_{running})$ where $\rtrans_{id}(x) = x$).
\Cref{fig:example-overlay-update} shows the result of applying $\overlay_{running}$ to our running example spreadsheet.
\end{example}
Several remarks are in order. First, overlays can be used to encode common spreadsheet update operations in constant space (per update), including bulk updates via copy/paste.
Second, \cite{tang-23-efcsfg} uses similar ideas to compress the dependencies in a spreadsheet using ranges and patterns, but focuses exclusively on the dependency graph rather than expressions.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Replacing Source Data}
\label{sec:updating-datasets}
An overlay designed for source data $(\ds, \rframe)$ may be applied to a dataset $(\ds', \rframe')$ as long as each $\row\in \rowDomain_{\ds}$ there is a corresponding row $\row' \in \rowDomain_{\ds'}$ such that $\rframe'(\rframe^{-1}(\row)) = \row'$.
This is possible if, e.g., $\rowDomain_{\ds}= \rowDomain_{\ds'}$ is a semantic key for the dataset.
%%% Local Variables:
%%% mode: latex
%%% TeX-master: "../main"
%%% reftex-default-bibliography: ("../main.bib")
%%% End: