Finishing out system section

2023-03-29 14:46:56 -04:00 · 2023-03-29 14:46:56 -04:00 · a078dd8525
parent b082e1e8c9
commit a078dd8525
10 changed files with 51 additions and 51 deletions
--- a/main.tex
+++ b/main.tex
@ -115,7 +115,7 @@
 %% other information printed in the page headers. This command allows
 %% the author to define a more concise list
 %% of authors' names for this purpose.
-\renewcommand{\shortauthors}{Dib et al.}
+\renewcommand{\shortauthors}{Kennedy et al.}

 %%
 %% The abstract is a short summary of the work to be presented in the
@ -140,7 +140,7 @@
 %%
 %% Keywords. The author(s) should pick words that accurately describe
 %% the work being presented. Separate the keywords with commas.
-\keywords{Apache Spark,Spreadsheets,Dataframes}
+\keywords{Spreadsheets, Dataframes, Scalable Data Management}
 %% A "teaser" image appears between the author and affiliation
 %% information and the body of the document, and typically spans the
 %% page.
--- a/results/desktop-init_formulas-varystart.pdf
+++ b/results/desktop-init_formulas-varystart.pdf
--- a/results/desktop-init_formulas-varystart.png
+++ b/results/desktop-init_formulas-varystart.png
--- a/results/desktop-init_spreadsheet-varystart.pdf
+++ b/results/desktop-init_spreadsheet-varystart.pdf
--- a/results/desktop-init_spreadsheet-varystart.png
+++ b/results/desktop-init_spreadsheet-varystart.png
--- a/results/desktop-update_one-varystart.pdf
+++ b/results/desktop-update_one-varystart.pdf
--- a/results/desktop-update_one-varystart.png
+++ b/results/desktop-update_one-varystart.png
--- a/results/gen_graph.py
+++ b/results/gen_graph.py
@ -3,7 +3,7 @@ import re
 import matplotlib.pyplot as plt
 #import numpy as np

-def read_dataspread(testbed):
+def read_dataspread(testbed, experiment):
    def extract(line):
        match = re.match("(.+)@(\\d+): ([^:]+): ([0-9.]+)", line)
        if match is None:
@ -14,10 +14,11 @@ def read_dataspread(testbed):
              "dataspread", 
              int(match.group(2)), 
              match.group(3).lower(), 
-              float(match.group(4))
+              float(match.group(4)),
+              experiment,
            )]

-    with open(f"{testbed}-dataspread.log") as f:
+    with open(f"{testbed}-dataspread-{experiment}.log") as f:
        data = [
          match
          for line in f.readlines()
@ -26,7 +27,7 @@ def read_dataspread(testbed):
    return data


-def read_vizier(testbed):
+def read_vizier(testbed, experiment):
    def extract(line):
        match = re.match("(.*)@(\\d+)/(true|false): ([^:]+): ([0-9.]+)", line)
        if match is None:
@ -38,10 +39,11 @@ def read_vizier(testbed):
              "vizier-batch" if match.group(3) == "true" else "vizier",
              int(match.group(2)), 
              match.group(4).lower(), 
-              float(match.group(5))
+              float(match.group(5)),
+              experiment,
            )]

-    with open(f"{testbed}-vizier.log") as f:
+    with open(f"{testbed}-vizier-{experiment}.log") as f:
        data = [
          match
          for line in f.readlines()
@ -65,10 +67,10 @@ def read_vizier(testbed):
 data = [
  record
  for ds in [
-    read_vizier("desktop"),
-    read_vizier("laptop"),
-    read_dataspread("desktop"),
-    read_dataspread("laptop")
+    read_vizier("desktop", "varystart"),
+    # read_vizier("laptop"),
+    read_dataspread("desktop", "varystart"),
+    # read_dataspread("laptop")
  ]
  for record in ds
 ]
@ -79,16 +81,19 @@ print(data)
 print(stages)


-def plot_one(testbed, stage):
+def plot_one(testbed, stage, experiment):
    global data
    fig, ax = plt.subplots(
                figsize=(4, 2),
-                constrained_layout=True
+                constrained_layout=True,
              )

    # ax.set_title(f"{stage} ({testbed})")
    ax.set_ylabel(f"{stage} (s)")
    ax.set_xlabel("Data Size (number of rows)")
+    ax.set_xscale("log")
+    ax.set_yscale("log")
+
    
    for system in ["vizier", "vizier-batch", "dataspread"]:
        points = sorted([
@ -100,16 +105,16 @@ def plot_one(testbed, stage):
        ], key=lambda x: x[0])

        ax.plot(
-          [pt[0] / 1000 for pt in points],
+          [pt[0] for pt in points],
          [pt[1] for pt in points],
          label=system
        )
    ax.legend()
    stage = stage.replace(" ", "_")
-    fig.savefig(f"{testbed}-{stage}.pdf")
-    fig.savefig(f"{testbed}-{stage}.png")
+    fig.savefig(f"{testbed}-{stage}-{experiment}.pdf")
+    fig.savefig(f"{testbed}-{stage}-{experiment}.png")


-plot_one("desktop", "init spreadsheet")
-plot_one("desktop", "init formulas")
-plot_one("desktop", "update one")
+plot_one("desktop", "init spreadsheet", "varystart")
+plot_one("desktop", "init formulas", "varystart")
+plot_one("desktop", "update one", "varystart")
--- a/sections/model.tex
+++ b/sections/model.tex
@ -300,7 +300,11 @@ An overlay update $\overlay$ appleid to a spreadsheet  $\spreadsheet$ defines th
                                                  \end{cases}
 \end{align*}

-For example, the result of applying the overlay update $\overlay_{running} = (\rtrans_{id},\oup_{running})$ where $\rframe_{id}(x) = x$ to our running example spreadsheet is shown in \Cref{fig:example-overlay-update}. The column \emph{D} is filled with new formulas that compute the hollowing sum.
+\begin{example}
+  \label{ex:recursive-running-sum}
+  Consider applying our example update ($\overlay_{running} = (\rtrans_{id},\oup_{running})$ where $\rframe_{id}(x) = x$) to our running example spreadsheet.
+  The result is shown in \Cref{fig:example-overlay-update}. The column $D$ now computes the running sum of column $C$.
+\end{example}

 Several remarks are in order. First, note that overlays can be used to encode common spreadsheet update operations in constant space (per update), including bulk updates via copy/paste. 
 Second, \cite{tang-23-efcsfg} uses similar ideas to compress the dependencies in a spreadsheet using ranges and patterns, but focuses exclusively on the dependency graph and not on compacting the spreadsheet itself.
--- a/sections/system.tex
+++ b/sections/system.tex
@ -74,8 +74,8 @@ If a row with dependent cells is deleted, the dependent cells need to be updated
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \subsection{Update Index}
 \label{sec:system-index}
-The update index stores a sequence of updates ($\overlay = \overlay_k \circ \ldots \circ \overlay_1$) and provides efficient access to the spreadsheet (denoted $\spreadsheet_\overlay$) defined by $\overlay$, with $\errorval$ for all undefined cells.
-Efficient access entails:
+The update index provides efficient positional access to the spreadsheet (denoted $\spreadsheet_\overlay$) defined by a sequence of updates ($\overlay = \overlay_k \circ \ldots \circ \overlay_1$), with $\errorval$ for all undefined cells.
+Specifically, the index is required to support:
 (i) Access to the expressions for individual cells $\spreadsheet_\overlay[\column, \row]$ (for cell evaluation); 
 (ii) Computing the upstream of a range of cells (for topological sort and computing the active set), and
 (iii) Computing the downstream of a range of cells (for cell invalidation after an update).
@ -91,17 +91,16 @@ As noted above, we assume that the number of columns is comparatively small, and

 \partitle{Range Maps}
 The core building block for the update index is a one-dimensional range map, an ordered map with integer keys.
-In addition to the usual operations of an ordered map (e.g., \texttt{put}, \texttt{get}, \texttt{successorOf}), the operation \texttt{bulkPut(low, high, value)} has semantics identical to a \texttt{put} on every element in the range from \texttt{low} to \texttt{high}.  
+In addition to the usual operations of an ordered map (e.g., \texttt{put}, \texttt{get}, \texttt{successorOf}), we define the operation \texttt{bulkPut(low, high, value)}, which is equivalent to a \texttt{put} on every element in the range from \texttt{low} to \texttt{high}.
 Implemented naively through a binary tree over $N$ elements, this operation takes $O((\texttt{high}-\texttt{low})\cdot\log(N))$ time.

 A range map avoids the $(\texttt{high}-\texttt{low})$ factor (and correspondingly reduces $N$) by storing an ordered sequence of disjoint ranges, each mapping one specific value as illustrated in \Cref{fig:rangemap}.
-A binary tree provides efficient access to the ranges.
+A binary tree provides efficient membership lookups over the ranges.
 With a range map, the set of distinct values appearing in a range can be accessed in $O(\log(N)+M)$ time (where $M$ is the number of distinct values), and similar deletion and insertion costs.

 \partitle{Cell Access}
-Efficient access to individual cells is obtained through a two-layered forward index, consisting of an unordered map over a set of range maps.
-The pattern for a specific cell is obtained by looking up the cell's column in the unordered map, and the cell's row in the corresponding range map.
-
+The index layer maintains a ``forward'' index: an unordered map that stores a range map for each column.
+To compute the expression for a cell $\cellRef{\column}{\row}$, the index layer (i) looks up the range map for $\column$ in the unordered map, (ii) looks up $\row$ in the range map to obtain a pattern (and returns $\emptyset$ if the row is undefined), and (iii) computes the expression by applying the pattern to $\cellRef{\column}{\row}$.

 \begin{algorithm}
 \caption{\textbf{upstream}($\columnRange$, $\rowRange$)}
@ -128,15 +127,15 @@ The pattern for a specific cell is obtained by looking up the cell's column in t
 \end{algorithm}

 \partitle{Upstream Reachability}
-In order to developing a materialization plan, the execution layer needs to be able to derive the set of cells on which a specific cell (or range of cells) depends.  
-We refer to this as the set of upstream cells for the specific target.
+The execution layer needs to be able to derive the set of cells on which a specific target cell (or range) depends.  
+We refer to this set as the target's \emph{upstream}.
 \Cref{alg:upstream} illustrates a naive breadth-first search to obtain the full upstream set for a given target range.  
-Each item in the BFS's work queue consists of a column, a row set, and a lineage set that pertains to an optimization we will discuss below.
-For each work item enqueued, we use the forward index range map to obtain the set of patterns appearing in the range specified by the work item (line 4), and iterate over the set of their dependencies (line 5).
+Each item in the BFS's work queue consists of a column, a row set, and a lineage; we will return to the lineage shortly.
+For each work item enqueued, we query the forward index to obtain the set of patterns in the range (line 4), and iterate over the set of their dependencies (line 5).
 If we discover a new dependency (lines 6-7), the newly discovered range is added to the return set and the work queue.
 We will explain line 10 shortly.

-The \textbf{getDeps} operation (Line 5; \Cref{alg:getDeps}) returns the set of dependencies of a pattern applied to a specific range of cells $\rangeOf{\column, \rowRange}$.
+The \textbf{getDeps} operation (Line 5; \Cref{alg:getDeps}) computes the immediate dependencies of a range of cells $\rangeOf{\column}{\rowRange}$ that share a pattern.
 Concretely, it returns a set of cells $\texttt{deps}$ such that for each cell $\cell \in \texttt{deps}$, there exists at least one cell $\cell' \in \rangeOf{\column}{\rowRange}$ such that $\cell$ is in the transitive closure of $\depsOf{\cell'}$.
 The algorithm uses a recursive traversal (lines 6-7) to visit every cell reference (offset or explicit):
 For offset references (lines 2-3), the provided range of rows is offset by the appropriate amount.
@ -162,28 +161,20 @@ For explicit cell references (lines 4-5), the explicit reference is used.


 \partitle{Optimizing Recursive Reachability}
+Consider a running sum, such as the one in \Cref{ex:recursive-running-sum}.
+Observe that the $k$th element will have $O(k)$ upstream dependencies, and so naively following \Cref{alg:upstream} requires $O(k)$ compute.
+However, observe that a single pattern is responsible for all of these dependencies, suggesting that a more efficient option may be available.

-Consider a computation analogous to that of \Cref{fig:overlay}, where one cell computes a running total of a second cell.  Such a cell might be defined by a pattern:
-$$\rangeOf{\texttt{total}}{1-1000} \gets \cellRef{\texttt{value}}{+0} + \cellRef{\texttt{total}}{-1}$$
-Naively implemented, as in \Cref{alg:upstream}, computing reachability for the cell $\cellRef{\texttt{total}}{1000}$ will require visiting every distinct cell in the range 1-999: $\cellRef{\texttt{total}}{1000}$ depends on $\cellRef{\texttt{total}}{999}$, which depends on $\cellRef{\texttt{total}}{998}$, and so forth.
-
-Observe that this dependency chain is defined entirely by a single pattern: Each cell defined by the pattern depends on another cell defined by the pattern.
+Specifically, this dependency chain is defined by recursion over a single pattern; all but the first cell depend on another cell defined by the same pattern.
 We refer to a pattern that references cells defined by the same pattern as \emph{recursive}.
-Note that the pattern's value may be self-referential, even if there is not a dependency cycle between the individual cells that the pattern defines.
+Note that a recursive pattern need not indicate a dependency cycle between individual cells.

-Patterns allow absolute references or offset references, but the former can not trigger a recursive pattern without creating a cycle in the dependency graph.
-Thus, recursive dependencies must be at fixed offsets, and the transitive closure must have a closed form representation.
-For example, consider a cell $\cellRef{\texttt{total}}{500}$ defined by a recursive pattern over rows $[1,1000]$, with a recursive pattern dependency on $\cellRef{\texttt{total}}{-2}$.  
-The transitive closure of the cell's dependency thus includes exactly the set of even rows (given the offset of $-2$) in the range $[1,500]$ (the cell through the start of the pattern's range).  
+Our key insight is that for some (mutually) recursive patterns, the transitive closure of the dependencies will have a closed-form representation.
+In our running example, the upstream of any $\cellRef{D}{k}$ is exactly $\cellRef{D}{1-(k-1)}$ and $\cellRef{C}{1-k}$.

-Unfortunately, the size of the encoding of the range set needed to represent these dependencies scales with the number of rows, due to the gaps.
-However, with the more common offset of $-1$, the entire set of rows can be defined by a single range.
-We address this more common case here, and leave the more general case to future work.
-Specifically, \textbf{getDeps} (\Cref{alg:getDeps}, line 3) tracks the offset of each dependency, while \textbf{upstream} (\Cref{alg:upstream}, lines 10-11) maintains a record of which patterns have been seen at which offsets in a lineage record.
-
-In its naive implementation, \textbf{upstream} attempts to advance the frontier by one hop with each work unit (lines 5-14).
-However, prior to line 5, we can check the lineage object to determine if the pattern defining the cell we are currently examining has previously been encountered along the path being advanced at an offset of $\pm 1$.
-If so, we add the remainder of the range over which the pattern is defined in the direction indicated by the offset to the active range. 
+The \texttt{lineage} field of \Cref{alg:upstream} is used to track the set of patterns visited, and the offset(s) at which they were visited.
+If the pattern being visited already appears in the lineage, then we know it is recursive and that we can extend out the sequence of upstream cells across the remaining cells of the pattern.
+If the offset is $\pm 1$, then the elements of this sequence are efficiently representable as a range of cells, and we can return it in $O(1)$ time.

 \partitle{Downstream Reachability}
 When a cell's expression is updated, cells that depend on it (even transitively) must be recomputed.