main
Boris Glavic 2023-03-19 17:23:59 -05:00
parent cd1a2329d8
commit 39c98f7204
4 changed files with 212 additions and 26 deletions

View File

@ -4,6 +4,7 @@
\usepackage{todonotes}
\usepackage{amsmath}
\usepackage{xspace}
\usepackage{colortbl}
\input{macros}
@ -157,6 +158,7 @@
\maketitle
\input{sections/introduction}
\input{sections/model}
\input{sections/overview}
\input{sections/formalism}
\input{sections/system}

View File

@ -2,32 +2,32 @@
\section{Formalism}
\label{sec:formalism}
\newcommand{\comprehension}[2]{\left\{\;#1\;|\;#2\;\right\}}
% \newcommand{\comprehension}[2]{\left\{\;#1\;|\;#2\;\right\}}
\newcommand{\spreadsheet}{\mathbb S}
\newcommand{\encodedSpreadsheet}{\hat \spreadsheet}
\newcommand{\columnDomain}{\mathcal C}
\newcommand{\columnRange}{C}
\newcommand{\column}{c}
\newcommand{\rowDomain}{\mathcal R}
\newcommand{\rowRange}{R}
\newcommand{\row}{r}
\newcommand{\rowOffset}{\delta}
\newcommand{\cell}{a}
\newcommand{\exprDomain}{\mathcal E}
\newcommand{\expr}{e}
\newcommand{\patternDomain}{\mathcal P}
\newcommand{\pattern}{P}
\newcommand{\valueDomain}{\mathcal V}
\newcommand{\rangeOf}[2]{(#1\texttt{:}#2)}
\newcommand{\range}{\rangeOf{\columnRange}{\rowRange}}
\newcommand{\cellRef}[2]{(#1\texttt{:}#2)}
\newcommand{\evalOf}[2]{\left\llbracket\; #2 \;\right\rrbracket_{#1}}
\newcommand{\patternOf}[2]{\left\llbracket\; #1 \;\right\rrbracket_{#2}}
\newcommand{\depsOf}[1]{\textbf{deps}\left(#1\right)}
\newcommand{\tdepsOf}[1]{\ensuremath{\text{\textbf{deps}\ast\left(#1\right)}\xspace}}
\newcommand{\DG}[1]{\ensuremath{G_{#1}}\xspace}
\newcommand{\TDG}[1]{\ensuremath{G_{#1}^*}\xspace}
% \newcommand{\spreadsheet}{\mathbb S}
% \newcommand{\encodedSpreadsheet}{\hat \spreadsheet}
% \newcommand{\columnDomain}{\mathcal C}
% \newcommand{\columnRange}{C}
% \newcommand{\column}{c}
% \newcommand{\rowDomain}{\mathcal R}
% \newcommand{\rowRange}{R}
% \newcommand{\row}{r}
% \newcommand{\rowOffset}{\delta}
% \newcommand{\cell}{a}
% \newcommand{\exprDomain}{\mathcal E}
% \newcommand{\expr}{e}
% \newcommand{\patternDomain}{\mathcal P}
% \newcommand{\pattern}{P}
% \newcommand{\valueDomain}{\mathcal V}
% \newcommand{\rangeOf}[2]{(#1\texttt{:}#2)}
% \newcommand{\range}{\rangeOf{\columnRange}{\rowRange}}
% \newcommand{\cellRef}[2]{(#1\texttt{:}#2)}
% \newcommand{\evalOf}[2]{\left\llbracket\; #2 \;\right\rrbracket_{#1}}
% \newcommand{\patternOf}[2]{\left\llbracket\; #1 \;\right\rrbracket_{#2}}
% \newcommand{\depsOf}[1]{\textbf{deps}\left(#1\right)}
% \newcommand{\tdepsOf}[1]{\ensuremath{\text{\textbf{deps}\ast\left(#1\right)}\xspace}}
% \newcommand{\DG}[1]{\ensuremath{G_{#1}}\xspace}
% \newcommand{\TDG}[1]{\ensuremath{G_{#1}^*}\xspace}
Let $\columnDomain$ and $\rowDomain$ denote a domain of column and row (respectively) labels, and let $\exprDomain$ and $\valueDomain$ denote a domain of expressions and values; We will define $\exprDomain$ in greater detail below.
We define a \emph{spreadsheet} $\spreadsheet : (\columnDomain \times \rowDomain) \rightarrow \exprDomain$ as a mapping from \emph{cells} ($(\column, \row) \in (\columnDomain \times \rowDomain)$) to expressions.

185
sections/model.tex Normal file
View File

@ -0,0 +1,185 @@
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Spreadsheet Datamodel}
\label{sec:spre-datam}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Colors
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\definecolor{tabexprcolor}{rgb}{0,0.5,0}
\newcommand{\comprehension}[2]{\left\{\;#1\;|\;#2\;\right\}}
\newcommand{\ds}{D}
\newcommand{\spreadsheet}{\mathbb S}
\newcommand{\encodedSpreadsheet}{\hat \spreadsheet}
\newcommand{\columnDomain}{\mathcal C}
\newcommand{\columnRange}{C}
\newcommand{\column}{c}
\newcommand{\rowDomain}{\mathcal R}
\newcommand{\rowRange}{R}
\newcommand{\row}{r}
\newcommand{\rowOffset}{\delta}
\newcommand{\cell}{a}
\newcommand{\exprDomain}{\mathcal E}
\newcommand{\expr}{e}
\newcommand{\patternDomain}{\mathcal P}
\newcommand{\pattern}{P}
\newcommand{\valueDomain}{\mathcal V}
\newcommand{\valat}[3]{#1[#2,#3]}
\newcommand{\rangeOf}[2]{(#1\texttt{:}#2)}
\newcommand{\range}{\rangeOf{\columnRange}{\rowRange}}
\newcommand{\cellRef}[2]{(#1,#2)}
\newcommand{\evalOf}[2]{\left\llbracket\; #2 \;\right\rrbracket_{#1}}
\newcommand{\patternOf}[2]{\left\llbracket\; #1 \;\right\rrbracket_{#2}}
\newcommand{\depsOf}[1]{\textbf{deps}\left(#1\right)}
\newcommand{\tdepsOf}[1]{\ensuremath{\text{\textbf{deps}\ast\left(#1\right)}\xspace}}
\newcommand{\DG}[1]{\ensuremath{G_{#1}}\xspace}
\newcommand{\TDG}[1]{\ensuremath{G_{#1}^*}\xspace}
\newcommand{\errorval}{\ensuremath{\bot}\xspace}
\newcommand{\rframe}{\ensuremath{\mathcal{F}}\xspace}
\newcommand{\se}[1]{{\textcolor{tabexprcolor}{#1}}}
\newcommand{\uv}[1]{{\textcolor{red}{#1}}}
\newcommand{\upd}{U}
\newcommand{\cu}[3]{#1#2 = #3}
\newcommand{\cellupdate}{u}
\newcommand{\dom}{\textsc{Dom}}
\newcommand{\acu}{\cu{\column}{\row}{\expr}}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Spreadsheets}
\label{sec:spreadsheets}
Let $\columnDomain$ denote a domain of column labels and $\rowDomain \subset \mathbb{Z}$ be a set of row positions, and let $\exprDomain$ and $\valueDomain$ denote a domain of expressions and values; We will define $\exprDomain$ in greater detail below.
We define a \emph{spreadsheet} $\spreadsheet : (\columnDomain \times \rowDomain) \rightarrow \exprDomain$ as a partial mapping from \emph{cells} ($(\column, \row) \in (\columnDomain \times \mathbb{Z})$) to expressions. We use $\valat{\spreadsheet}{\column}{\row}$ to denote $\spreadsheet((\column, \row))$ and set $\valat{\spreadsheet}{\column}{\row} = \errorval$ if $(\column,\row)$ is not in the domain of $\spreadsheet$ where $\errorval$ is a distinguished error value from $\valueDomain$. For convenience, we will use $\column\row$ as a shorthand for $(\column, \row)$.
An expression $\expr \in \exprDomain$ is a formula defined over literals from $\valueDomain$, the standard arithmetic operators, and references to other cells in the spreadsheet ($\cellRef{\column}{\row}$).
The expression $\expr$ may be evaluated in the context of a spreadsheet ($\evalOf{\spreadsheet}{\cdot} : \exprDomain \rightarrow \valueDomain$) as follows:
(i) Literals evaluate to themselves, (ii) Arithmetic formulas are evaluated in the usual way, and (iii) References to the spreadsheet are evaluated recursively:
$$\evalOf{\spreadsheet}{\cellRef{\column}{\row}} \equiv \evalOf{\spreadsheet}{\spreadsheet(\column, \row)}$$
By convention, cyclic references evaluate to the distinguished error value $\errorval$ in $\valueDomain$.
We define the dependencies of an expression ($\depsOf{\expr}$) to be the set of cells referenced by $\expr$.
Expression dependencies induce a graph $\DG{\spreadsheet}\tuple{V, E}$ over the spreadsheet, where each cell is a node (i.e., $V = \columnDomain \times \rowDomain$), and each dependency is a (directed) edge:
$$E = \bigcup_{\cell \in \columnDomain \times \rowDomain}
\{\;\cell \rightarrow \cell'\;|\;\cell' \in \depsOf{\spreadsheet(\cell)}\;\} $$
We use $\TDG{\spreadsheet}$ to denote the graph $\tuple{V,E^*}$ where $E^*$ is the transitive closure of $E$, i.e., $\TDG{\spreadsheet}$ stores both direct and indirect dependencies among the cells in the spreadsheet.
We define a \emph{range} $\rangeOf{\columnRange}{\rowRange}$ as the Cartesian product of a set of columns ($\columnRange \subseteq \columnDomain$) and range of row positions ($R = [l,h] \in \mathbb{Z} \times \mathbb{Z}$).
$$\rangeOf{\columnRange}{\rowRange} \equiv \{\;(\column, \row)\;|\;\column\in \columnRange, \row \in \rowRange\;\}$$
Note that if all cells expressions of a spreadsheet are constants (a spreadsheet without formulas), then $\evalOf{\spreadsheet}{\cellRef{\column}{\row}} = \valat{\spreadsheet}{\column}{\row}$.
As an example, consider the spreadsheet shown on the top of \Cref{fig:example-spreadsheet-and-a}. The expressions in columns \emph{A} and \emph{B} are constant expressions while the expressions in column \emph{C} are arithmetic expressions that references cells from columns \emph{A} and \emph{B}. The result of evaluating the spreadsheet to assign each cell a concrete value is shown on the top right of this figure, e.g., \emph{C1} evaluates to $\evalOf{\spreadsheet}{A1 + B1} = \evalOf{\spreadsheet}{A1} + \evalOf{\spreadsheet}{B1} = 15 + 50 = 65$.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{figure}[t]
\centering
\newcommand{\tabhead}[1]{\cellcolor{black}{\textcolor{white}{#1}}}
\begin{minipage}{0.48\linewidth}
\centering
\textbf{Spreadsheet $\spreadsheet$}\\
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{tabular}{c|c|c|c|}
\cline{2-4}
& \tabhead{A} & \tabhead{B} & \tabhead{C}\\ \hline
\tabhead{1} &\se{15} &\se{50} &\se{A1 + B1}\\ \hline
\tabhead{2} &\se{20} &\se{60} &\se{A2 + B2} \\ \hline
\tabhead{3} &\se{25} &\se{100} &\se{A3 + B3}\\ \hline
\tabhead{4} &\se{50} &\se{0} &\se{A4 + B4}\\ \hline
\end{tabular}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\end{minipage} %
%
\begin{minipage}{0.49\linewidth}
\centering
\textbf{Evaluated Spreadsheet $\evalOf{\spreadsheet}{\cdot}$}\\
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{tabular}{c|c|c|c|}
\cline{2-4}
& \tabhead{A} & \tabhead{B} & \tabhead{C}\\ \hline
\tabhead{1} & 15 & 50 & 65\\ \hline
\tabhead{2} & 20 & 60 & 80 \\ \hline
\tabhead{3} & 25 & 100 & 125\\ \hline
\tabhead{4} & 50 & 0 & 50\\ \hline
\end{tabular}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\end{minipage}
$,$\\ \vspace{5mm}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{minipage}{1 \linewidth}
\centering
\textbf{Update} $\upd = \{ \cu{A}{1}{20}, \cu{C}{3}{2 \cdot A3 + B3} \}$
\end{minipage}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
$,$\\ %\vspace{5mm}
\begin{minipage}{0.48\linewidth}
\centering
\textbf{Updated Spreadsheet $\upd(\spreadsheet)$}\\
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{tabular}{c|c|c|c|}
\cline{2-4}
& \tabhead{A} & \tabhead{B} & \tabhead{C}\\ \hline
\tabhead{1} &\uv{20} &\se{50} &\se{A1 + B1}\\ \hline
\tabhead{2} &\se{20} &\se{60} &\se{A2 + B2} \\ \hline
\tabhead{3} &\se{25} &\se{100} &\uv{2 $\cdot$ A3 + B3}\\ \hline
\tabhead{4} &\se{50} &\se{0} &\se{A4 + B4}\\ \hline
\end{tabular}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\end{minipage} %
%
\begin{minipage}{0.49\linewidth}
\centering
\textbf{Evaluated Spreadsheet $\evalOf{\upd(\spreadsheet)}{\cdot}$}\\
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{tabular}{c|c|c|c|}
\cline{2-4}
& \tabhead{A} & \tabhead{B} & \tabhead{C}\\ \hline
\tabhead{1} & \uv{20} & 50 & \uv{70}\\ \hline
\tabhead{2} & 20 & 60 & 80 \\ \hline
\tabhead{3} & 25 & 100 & \uv{150}\\ \hline
\tabhead{4} & 50 & 0 & 50\\ \hline
\end{tabular}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\end{minipage}
\caption{Example spreadsheet (expressions are shown in \textcolor{tabexprcolor}{dark green} to distinguish them from values), result of evaluating the spreadsheet, and an update applied to the spreadsheet (updated expressions and values are shown in \uv{red}).}\label{fig:example-spreadsheet-and-a}
\end{figure}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Updates}
\label{sec:updates}
An update $\upd$ to a spreadsheet is a set of cell updates $\cellupdate$ of the form $\acu$ which assigns to cell $\column\row$ the expression $\expr$. For an update $\upd$, $\dom(\upd)$ called the domain of the updates contains all cells $\column\row$ such that there exists $\cellupdate \in \upd$ such that $\cellupdate = (\acu)$, i.e., the update modified the expression of cell $\column\row$. Applying an update $\upd$ to a spreadsheet $\spreadsheet$ return an updated spreadsheet $\upd(\spreadsheet)$ defined as:
\[
\valat{\upd(\spreadsheet)}{\column}{\row} =
\begin{cases}
\upd({\column}{\row}) &\text{\textbf{if}}\,\, \column\row \in \dom(\upd)\\
\valat{\spreadsheet}{\column}{\row} &\text{\textbf{otherwise}}\\
\end{cases}
\]
Note that this way to represent updates can be quite verbose, e.g., if inserting a row at some position in the spreadsheet, the expressions of all following rows have to be updated as they are moved down by one row.
We will discuss the more compact representation of updates as overlays used by our approach in \BG{section}
For example, consider the update shown in \Cref{fig:example-spreadsheet-and-a}. This update changes the constant expression in cell \emph{A1} and the arithmetic expression in cell \emph{C3}. When evaluating the resulting spreadsheet $\upd(\spreadsheet)$, the values of three cells (shown in red), the values of three cells are changed.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Spreadsheet Access to Datasets}
\label{sec:spre-access-datas}
To uniformly model spreadsheet access to relational data as well as to data already represented as spreadsheets, we assume an input dataset $\ds$ and let $\columnDomain$ and $\rowDomain$ denote a domain of column and row (respectively) labels for $\ds$.
For a relational table, $\columnDomain$ are the columns of the table and $\rowDomain$ could be the values of a list of key attributes or be $\mathbb{N}$ representing the order of a row on disk. If $\ds$ is a spreadsheet input dataset, then $\rowDomain$ is $\mathbb{N}$ and encodes the position of the row. For a dataset $\ds$ we use $\valat{\ds}{\row}{\column}$ to denote that value of column $\column$ of row $\row$ in $\ds$.
A \emph{spreadsheet overlay} for a dataset $\ds$ is a pair $(\ds, \rframe)$ where $\rframe: \rowDomain \to \mathbb{Z}$ determines the positions of rows from $\ds$
%%% Local Variables:
%%% mode: latex
%%% TeX-master: "../main"
%%% End:

View File

@ -4,7 +4,6 @@
\BG{define the spreadsheet model first?}
\newcommand{\errorval}{\ensuremath{\bot}\xspace}
A spreadsheet is a regular grid of cells, which are defined by formulas.
A cell's formula may be a literal value, or an expression defining a computation that may be based on the value of other cells.