Adding graphics, trimming intro

main
Oliver Kennedy 2023-03-26 15:40:57 -04:00
parent 7623914c8b
commit 57f5bb7214
Signed by: okennedy
GPG Key ID: 3E5F9B3ABD3FDB60
8 changed files with 458 additions and 63 deletions

BIN
graphics/rangemap.pdf Normal file

Binary file not shown.

253
graphics/rangemap.svg Normal file
View File

@ -0,0 +1,253 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- Created with Inkscape (http://www.inkscape.org/) -->
<svg
width="72.377388mm"
height="31.083605mm"
viewBox="0 0 72.377389 31.083605"
version="1.1"
id="svg4831"
inkscape:version="1.2.2 (b0a8486541, 2022-12-01)"
sodipodi:docname="rangemap.svg"
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
xmlns="http://www.w3.org/2000/svg"
xmlns:svg="http://www.w3.org/2000/svg">
<sodipodi:namedview
id="namedview4833"
pagecolor="#ffffff"
bordercolor="#666666"
borderopacity="1.0"
inkscape:showpageshadow="2"
inkscape:pageopacity="0.0"
inkscape:pagecheckerboard="0"
inkscape:deskcolor="#d1d1d1"
inkscape:document-units="mm"
showgrid="false"
inkscape:zoom="2.8284271"
inkscape:cx="210.01071"
inkscape:cy="125.86501"
inkscape:window-width="1920"
inkscape:window-height="1011"
inkscape:window-x="0"
inkscape:window-y="0"
inkscape:window-maximized="1"
inkscape:current-layer="layer1" />
<defs
id="defs4828">
<!-- Arrowhead marker. Paths in this drawing attach it through
     marker-end:url(#Arrow1L) in their style attribute.
     orient="auto-start-reverse" lets the renderer rotate the head to follow
     the direction of the path it is attached to. -->
<marker
style="overflow:visible"
id="Arrow1L"
refX="0"
refY="0"
orient="auto-start-reverse"
inkscape:stockid="Arrow1L"
markerWidth="8.75"
markerHeight="5"
viewBox="0 0 8.75 5"
inkscape:isstock="true"
inkscape:collect="always"
preserveAspectRatio="xMidYMid">
<!-- Arrowhead outline. fill:context-stroke paints it with the stroke colour
     of the referencing path; scale(-0.5) halves the outline and turns it by
     180 degrees. Note that this id ("arrow1L") differs from the marker id
     ("Arrow1L") only by the case of the first letter. -->
<path
style="fill:context-stroke;fill-rule:evenodd;stroke:none"
d="M 0,0 5,-5 -12.5,0 5,5 Z"
id="arrow1L"
transform="scale(-0.5)" />
</marker>
</defs>
<g
inkscape:label="Layer 1"
inkscape:groupmode="layer"
id="layer1"
transform="translate(-78.153036,-109.40684)">
<rect
style="fill:#ffffff;stroke:#4d4d4d;stroke-width:1;stroke-linecap:round"
id="rect5078"
width="70.583153"
height="5.8602753"
x="79.331734"
y="126.45924" />
<rect
style="fill:#cccccc;stroke:#4d4d4d;stroke-width:1;stroke-linecap:round"
id="rect5072"
width="13.01953"
height="5.8602762"
x="79.331749"
y="126.45924" />
<rect
style="fill:#cccccc;stroke:#4d4d4d;stroke-width:1;stroke-linecap:round"
id="rect5074"
width="18.311203"
height="5.8602753"
x="99.969238"
y="126.45924" />
<rect
style="fill:#cccccc;stroke:#4d4d4d;stroke-width:1;stroke-linecap:round"
id="rect5076"
width="13.01953"
height="5.8602762"
x="137.01089"
y="126.45924" />
<path
style="fill:none;stroke:#000000;stroke-width:0.264583px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow1L)"
d="m 81.117687,129.38938 h 7.794011"
id="path5134"
sodipodi:nodetypes="cc" />
<path
style="fill:none;stroke:#000000;stroke-width:0.264583px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow1L)"
d="m 101.75518,129.38938 h 13.08568"
id="path5714"
sodipodi:nodetypes="cc" />
<path
style="fill:none;stroke:#000000;stroke-width:0.264583px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow1L)"
d="m 138.79683,129.38938 h 7.79401"
id="path5716"
sodipodi:nodetypes="cc" />
<path
style="fill:none;stroke:#000000;stroke-width:0.264583px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow1L)"
d="m 80.589119,122.2913 v 2.22309"
id="path5718"
sodipodi:nodetypes="cc" />
<path
style="fill:none;stroke:#000000;stroke-width:0.264583px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow1L)"
d="m 101.22659,122.2913 v 2.22309"
id="path5720"
sodipodi:nodetypes="cc" />
<path
style="fill:none;stroke:#000000;stroke-width:0.264583px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow1L)"
d="m 138.26831,122.2913 v 2.22309"
id="path5722"
sodipodi:nodetypes="cc" />
<rect
style="fill:#cccccc;stroke:#4d4d4d;stroke-width:1;stroke-linecap:round"
id="rect5724"
width="7.1987615"
height="5.8602753"
x="121.13589"
y="126.45924" />
<path
style="fill:none;stroke:#000000;stroke-width:0.264583px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow1L)"
d="m 122.92182,129.38938 h 2.50234"
id="path5726"
sodipodi:nodetypes="cc" />
<path
style="fill:none;stroke:#000000;stroke-width:0.264583px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow1L)"
d="m 122.3933,122.2913 v 2.22309"
id="path5728"
sodipodi:nodetypes="cc" />
<path
id="path5004"
style="fill:#ffffff;stroke:#4d4d4d;stroke-linecap:round;stroke-linejoin:round"
inkscape:transform-center-y="-8.4598282"
d="m 139.18198,122.03664 h -29.3057 -29.3057 l 29.3057,-12.1298 z"
sodipodi:nodetypes="ccccc" />
<text
xml:space="preserve"
style="font-size:4.23333px;line-height:1.25;font-family:Ubuntu;-inkscape-font-specification:Ubuntu;letter-spacing:0px;stroke-width:0.264583"
x="84.4533"
y="139.60251"
id="text5820"><tspan
sodipodi:role="line"
id="tspan5818"
style="font-size:4.23333px;stroke-width:0.264583"
x="84.4533"
y="139.60251">V<tspan
style="font-size:65%;baseline-shift:sub"
id="tspan5824">1</tspan></tspan></text>
<text
xml:space="preserve"
style="font-size:4.23333px;line-height:1.25;font-family:Ubuntu;-inkscape-font-specification:Ubuntu;letter-spacing:0px;stroke-width:0.264583"
x="107.744"
y="139.60251"
id="text5830"><tspan
sodipodi:role="line"
id="tspan5828"
style="font-size:4.23333px;stroke-width:0.264583"
x="107.744"
y="139.60251">V<tspan
style="font-size:65%;baseline-shift:sub"
id="tspan5826">2</tspan></tspan></text>
<text
xml:space="preserve"
style="font-size:4.23333px;line-height:1.25;font-family:Ubuntu;-inkscape-font-specification:Ubuntu;letter-spacing:0px;stroke-width:0.264583"
x="123.55289"
y="139.60251"
id="text5836"><tspan
sodipodi:role="line"
id="tspan5834"
style="font-size:4.23333px;stroke-width:0.264583"
x="123.55289"
y="139.60251">V<tspan
style="font-size:65%;baseline-shift:sub"
id="tspan5832">3</tspan></tspan></text>
<text
xml:space="preserve"
style="font-size:4.23333px;line-height:1.25;font-family:Ubuntu;-inkscape-font-specification:Ubuntu;letter-spacing:0px;stroke-width:0.264583"
x="142.13988"
y="139.60251"
id="text5842"><tspan
sodipodi:role="line"
id="tspan5840"
style="font-size:4.23333px;stroke-width:0.264583"
x="142.13988"
y="139.60251">V<tspan
style="font-size:65%;baseline-shift:sub"
id="tspan5838">4</tspan></tspan></text>
<!-- Annotation below the leftmost grey range: a filled, downward-pointing
     triangle plus a short vertical arrow (Arrow1L head). It appears to link
     that range to the "V1" label underneath (confirm visually). The whole
     group is shifted horizontally by translate(-1.0583333). -->
<g
id="g5866"
transform="translate(-1.0583333)">
<!-- Vertical arrow pointing down towards the label. -->
<path
style="fill:none;stroke:#4d4d4d;stroke-width:0.264583px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow1L)"
d="m 86.868962,132.08818 v 3.28142"
id="path5810"
sodipodi:nodetypes="cc" />
<!-- Filled triangle whose top edge runs along the underside of the range. -->
<path
style="fill:#4d4d4d;stroke:#4d4d4d;stroke-width:0.264583px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
d="m 92.759953,132.67315 -5.860825,1.87192 -5.860825,-1.87192 z"
id="path6112"
sodipodi:nodetypes="cccc" />
</g>
<!-- Annotation below the second grey range (the widest one): a filled,
     downward-pointing triangle plus a short vertical arrow (Arrow1L head).
     It appears to link that range to the "V2" label underneath (confirm
     visually). The group is shifted horizontally by translate(20.108334). -->
<g
id="g6120"
transform="translate(20.108334)">
<!-- Vertical arrow pointing down towards the label. -->
<path
style="fill:none;stroke:#4d4d4d;stroke-width:0.264583px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow1L)"
d="m 89.051776,132.08818 v 3.28142"
id="path6116"
sodipodi:nodetypes="cc" />
<!-- Filled triangle whose top edge runs along the underside of the range. -->
<path
style="fill:#4d4d4d;stroke:#4d4d4d;stroke-width:0.264583px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
d="m 97.522453,132.67315 -8.506659,1.87192 -8.506658,-1.87192 z"
id="path6118"
sodipodi:nodetypes="cccc" />
</g>
<!-- Annotation below the third grey range (the narrowest one): a filled,
     downward-pointing triangle plus a short vertical arrow (Arrow1L head).
     It appears to link that range to the "V3" label underneath (confirm
     visually). The group is shifted horizontally by translate(41.275002). -->
<g
id="g6126"
transform="translate(41.275002)">
<!-- Vertical arrow pointing down towards the label. -->
<path
style="fill:none;stroke:#4d4d4d;stroke-width:0.264583px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow1L)"
d="m 83.69396,132.08818 v 3.28142"
id="path6122"
sodipodi:nodetypes="cc" />
<!-- Filled triangle whose top edge runs along the underside of the range. -->
<path
style="fill:#4d4d4d;stroke:#4d4d4d;stroke-width:0.264583px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
d="m 86.939119,132.67315 -3.214991,1.87192 -3.214992,-1.87192 z"
id="path6124"
sodipodi:nodetypes="cccc" />
</g>
<!-- Annotation below the rightmost grey range: a filled, downward-pointing
     triangle plus a short vertical arrow (Arrow1L head). It appears to link
     that range to the "V4" label underneath (confirm visually). The path data
     is the same as in group g5866; only the horizontal shift,
     translate(56.620836), differs. -->
<g
id="g6132"
transform="translate(56.620836)">
<!-- Vertical arrow pointing down towards the label. -->
<path
style="fill:none;stroke:#4d4d4d;stroke-width:0.264583px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow1L)"
d="m 86.868962,132.08818 v 3.28142"
id="path6128"
sodipodi:nodetypes="cc" />
<!-- Filled triangle whose top edge runs along the underside of the range. -->
<path
style="fill:#4d4d4d;stroke:#4d4d4d;stroke-width:0.264583px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
d="m 92.759953,132.67315 -5.860825,1.87192 -5.860825,-1.87192 z"
id="path6130"
sodipodi:nodetypes="cccc" />
</g>
</g>
</svg>

After

Width:  |  Height:  |  Size: 9.5 KiB

BIN
graphics/system-arch.pdf Normal file

Binary file not shown.

159
graphics/system-arch.svg Normal file
View File

@ -0,0 +1,159 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- Created with Inkscape (http://www.inkscape.org/) -->
<svg
width="92.856964mm"
height="52.695004mm"
viewBox="0 0 92.856964 52.695003"
version="1.1"
id="svg5"
sodipodi:docname="system-arch.svg"
inkscape:version="1.2.2 (b0a8486541, 2022-12-01)"
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
xmlns="http://www.w3.org/2000/svg"
xmlns:svg="http://www.w3.org/2000/svg">
<sodipodi:namedview
id="namedview7"
pagecolor="#ffffff"
bordercolor="#666666"
borderopacity="1.0"
inkscape:showpageshadow="2"
inkscape:pageopacity="0.0"
inkscape:pagecheckerboard="0"
inkscape:deskcolor="#d1d1d1"
inkscape:document-units="mm"
showgrid="false"
inkscape:zoom="2"
inkscape:cx="224"
inkscape:cy="115.5"
inkscape:window-width="1920"
inkscape:window-height="1011"
inkscape:window-x="0"
inkscape:window-y="0"
inkscape:window-maximized="1"
inkscape:current-layer="g1116" />
<defs
id="defs2" />
<g
inkscape:label="Layer 1"
inkscape:groupmode="layer"
id="layer1"
transform="translate(-26.026571,-2.7996719)">
<!-- "Presentation" box: the topmost row of the diagram (vertical offset
     1.5875). A wide rectangle with a horizontally centred label. -->
<g
id="g1108"
transform="translate(0,1.5875)">
<rect
style="fill:#b7c8b7;stroke:#4d4d4d;stroke-width:1;stroke-linecap:round;stroke-dasharray:none"
id="rect234"
width="91.856964"
height="12.801257"
x="26.526571"
y="1.7121719"
ry="0" />
<!-- Label. The fill set on the text element (#0000ff) is overridden by the
     tspan's own fill (#1a1a1a); text-anchor:middle centres the string on x. -->
<text
xml:space="preserve"
style="font-size:8.46667px;line-height:1.25;font-family:Ubuntu;-inkscape-font-specification:Ubuntu;letter-spacing:0px;fill:#0000ff;stroke-width:0.264583"
x="72.404251"
y="11.173502"
id="text1103"><tspan
sodipodi:role="line"
id="tspan1101"
style="font-size:8.46667px;text-align:center;text-anchor:middle;fill:#1a1a1a;stroke-width:0.264583"
x="72.404251"
y="11.173502">Presentation</tspan></text>
</g>
<!-- "Executor" box: second row of the diagram (vertical offset 14.552082),
     same rectangle geometry as the "Presentation" box above it. -->
<g
id="g1116"
transform="translate(0,14.552082)">
<rect
style="fill:#dbe3de;stroke:#4d4d4d;stroke-width:1;stroke-linecap:round;stroke-dasharray:none"
id="rect1110"
width="91.856964"
height="12.801257"
x="26.526571"
y="1.7121719"
ry="0" />
<!-- Label. The fill set on the text element (#800000) is overridden by the
     tspan's own fill (#1a1a1a); text-anchor:middle centres the string on x. -->
<text
xml:space="preserve"
style="font-size:8.46667px;line-height:1.25;font-family:Ubuntu;-inkscape-font-specification:Ubuntu;letter-spacing:0px;fill:#800000;stroke-width:0.264583"
x="72.404251"
y="11.173502"
id="text1114"><tspan
sodipodi:role="line"
id="tspan1112"
style="font-size:8.46667px;text-align:center;text-anchor:middle;fill:#1a1a1a;stroke-width:0.264583"
x="72.404251"
y="11.173502">Executor</tspan></text>
</g>
<!-- "Index" box: third row (vertical offset 27.516664), left-hand side.
     Its rectangle is roughly half as wide (45.195683) as the two rows above. -->
<g
id="g1174"
transform="translate(0,27.516664)">
<rect
style="fill:#afc6e9;stroke:#4d4d4d;stroke-width:1;stroke-linecap:round;stroke-dasharray:none"
id="rect1168"
width="45.195683"
height="12.801257"
x="26.526571"
y="1.7121719"
ry="0" />
<!-- Label. The fill set on the text element (#808000) is overridden by the
     tspan's own fill (#1a1a1a); text-anchor:middle centres the string on x. -->
<text
xml:space="preserve"
style="font-size:8.46667px;line-height:1.25;font-family:Ubuntu;-inkscape-font-specification:Ubuntu;letter-spacing:0px;fill:#808000;stroke-width:0.264583"
x="48.76458"
y="11.347069"
id="text1172"><tspan
sodipodi:role="line"
id="tspan1170"
style="font-size:8.46667px;text-align:center;text-anchor:middle;fill:#1a1a1a;stroke-width:0.264583"
x="48.76458"
y="11.347069">Index</tspan></text>
</g>
<!-- "Cache" box: third row (vertical offset 27.516664), right-hand side.
     Same local geometry as the "Index" box, moved right by 46.661282. -->
<g
id="g1248"
transform="translate(46.661282,27.516664)">
<rect
style="fill:#b7b7c8;stroke:#4d4d4d;stroke-width:1;stroke-linecap:round;stroke-dasharray:none"
id="rect1242"
width="45.195683"
height="12.801257"
x="26.526571"
y="1.7121719"
ry="0" />
<!-- Label. The fill set on the text element (#008000) is overridden by the
     tspan's own fill (#1a1a1a); text-anchor:middle centres the string on x. -->
<text
xml:space="preserve"
style="font-size:8.46667px;line-height:1.25;font-family:Ubuntu;-inkscape-font-specification:Ubuntu;letter-spacing:0px;fill:#008000;stroke-width:0.264583"
x="48.76458"
y="11.334369"
id="text1246"><tspan
sodipodi:role="line"
id="tspan1244"
style="font-size:8.46667px;text-align:center;text-anchor:middle;fill:#1a1a1a;stroke-width:0.264583"
x="48.76458"
y="11.334369">Cache</tspan></text>
</g>
<!-- "Spark" box: fourth row (vertical offset 40.481246), right-hand side,
     directly below the "Cache" box (same horizontal offset 46.661282). -->
<g
id="g1256"
transform="translate(46.661282,40.481246)">
<rect
style="fill:#c8b7be;stroke:#4d4d4d;stroke-width:1;stroke-linecap:round;stroke-dasharray:none"
id="rect1250"
width="45.195683"
height="12.801257"
x="26.526571"
y="1.7121719"
ry="0" />
<!-- Label. The fill set on the text element (#008080) is overridden by the
     tspan's own fill (#1a1a1a); text-anchor:middle centres the string on x. -->
<text
xml:space="preserve"
style="font-size:8.46667px;line-height:1.25;font-family:Ubuntu;-inkscape-font-specification:Ubuntu;letter-spacing:0px;fill:#008080;stroke-width:0.264583"
x="48.76458"
y="11.334369"
id="text1254"><tspan
sodipodi:role="line"
id="tspan1252"
style="font-size:8.46667px;text-align:center;text-anchor:middle;fill:#1a1a1a;stroke-width:0.264583"
x="48.76458"
y="11.334369">Spark</tspan></text>
</g>
</g>
</svg>

After

Width:  |  Height:  |  Size: 5.5 KiB

View File

@ -9,6 +9,8 @@
\usepackage{algorithm}
\usepackage{algpseudocode}
\newcommand{\trimfigurespacing}{\vspace*{-5mm}}
\input{macros}
%% Rights management information. This information is sent to you
@ -75,7 +77,7 @@
%%
%% The "title" command has an optional parameter,
%% allowing the author to define a "short title" to be used in page headers.
\title{Sparking Insights: Overlaying Spreadsheets onto Apache Spark}
\title{Overlay Spreadsheets}
%%
%% The "author" command and its associated commands are used to define

View File

@ -2,13 +2,11 @@
\section{Introduction}
\label{sec:introduction}
\BG{In the next pass over the intro we can start to reduce redundancy and compact}
Spreadsheets are a popular tool for data exploration, as they provide a simple environment for programmatically accessing and manipulating data.
However, spreadsheets have historically had challenges managing ``big data'', with as few as fifty thousand rows of data creating problems for existing spreadsheet engines~\cite{DBLP:conf/sigmod/RahmanMBZKP20}.
One approach to scalability, employed by \emph{Wrangler}~\cite{DBLP:conf/chi/KandelPHH11}, \emph{Vizier}~\cite{freire:2016:hilda:exception,brachmann:2020:cidr:your}, and others relies on translating spreadsheet interactions into declarative transformations (dataflows) that can be deployed to a database or dataflow system like Spark. As demonstrated in the context of Vizier, this enables light-weight versioning of spreadsheets as only multiple versions of the computation (which are small) rather than multiple versions of the data (which are large) have to be stored.
Spreadsheets are a popular tool for data exploration, transformation, and visualization, but have historically had challenges managing ``big data'' --- as few as fifty thousand rows of data can create problems for existing spreadsheet engines~\cite{DBLP:conf/sigmod/RahmanMBZKP20}.
One approach to scalability, employed by \emph{Wrangler}~\cite{DBLP:conf/chi/KandelPHH11}, \emph{Vizier}~\cite{freire:2016:hilda:exception,brachmann:2020:cidr:your}, and others relies on translating spreadsheet interactions into declarative transformations (dataflows) that can be deployed to a database or dataflow system like Apache Spark.
In this model, the spreadsheet is a chain of versions, each linked by a lightweight transformation function~\cite{freire:2016:hilda:exception}.
A more recent approach employed by \emph{DataSpread}~\cite{DBLP:conf/sigmod/BendreWMCP19,DBLP:conf/sigmod/RahmanMBZKP20,DBLP:conf/icde/BendreVZCP18}, instead re-architects the entire spreadsheet runtime around database primitives like indexes and incremental maintenance specialized for spreadsheet access patterns.
We refer to these as the virtual and materialized approach, respectively.
We refer to these as the virtual and materialized approach, respectively, and illustrate them in \Cref{fig:overlay}.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{figure}
@ -18,68 +16,46 @@ We refer to these as the virtual and materialized approach, respectively.
}
\caption{Approaches to scalable spreadsheet design}
\label{fig:overlay}
\trimfigurespacing
\end{figure}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
The data access patterns of spreadsheets have several characteristics~\cite{DBLP:conf/icde/BendreVZCP18, DBLP:conf/sigmod/RahmanMBZKP20, DBLP:conf/sigmod/BendreWMCP19} that differ from typical SQL queries and lead to several inefficiencies if not taken into account when implementing a spreadsheet system:
\begin{itemize}
\item \textbf{Positional Order:} Both the rows and columns in a spreadsheet are ordered and cells can be referenced by position. As already observed in \cite{DBLP:conf/icde/BendreVZCP18}, cell positions need to be maintained under operations (e.g., inserting or deleting rows and columns).
\item \textbf{Local Updates:} Editing the content of cells in a spreadsheet results in local updates that typically affect one or a small number of cells. Operations like inserting \& deleting rows or columns affect the whole spreadsheet and require updates to the positions of a large number of cells (\cite{DBLP:conf/icde/BendreVZCP18} presented an index structure that allows fast maintenance of positions under updates and access to a cell at a certain position).
\item \textbf{Small Number of Updates:} Since spreadsheet updates are typically made manually, the number of updates is limited by the speed of a human interacting with the system.
\item \textbf{Limited Visibility:} At each point in time, the user can only observe a small portion of the spreadsheet (that fits on the screen). This enables lazy materialization of cell values and updates to the spreadsheet as the values of cells which are not currently visible to the user only have to be computed if they affect the value of a currently visible cell (e.g., because the visible cell uses a formula that references the other cell).
\item \textbf{Formulas:} An important feature of spreadsheets is that cells may contain formulas, which are expressions that reference other cells in the spreadsheet. This essentially empowers the user to write highly localized ``views''. Formulas complicate the implementation of spreadsheets, because to compute the value of a cell, the values of all cells it depends on directly or indirectly have to be determined first.
\end{itemize}
The materialized approach is optimized for multiple data access patterns common to spreadsheets~\cite{DBLP:conf/icde/BendreVZCP18, DBLP:conf/sigmod/RahmanMBZKP20, DBLP:conf/sigmod/BendreWMCP19}, including
(i) Data structures specialized for the positional referencing scheme commonly used in spreadsheet formulas~\cite{DBLP:conf/icde/BendreVZCP18},
(ii) Execution strategies that prioritize completion of portions of the spreadsheet that the user is viewing~\cite{DBLP:conf/sigmod/BendreWMCP19}, and
(iii) Indexes that leverage patterns in the dependencies of adjacent cells to compress dependency graphs~\cite{tang-23-efcsfg}.
Similar optimizations are considerably harder in the virtual approach, as the result of updates and their effects on cell position are only materialized when data is received.
The implementation of the materialized approach in DataSpread targets such access patterns, by (i) using a specialized index structure to maintain positional order in logarithmic time under typical spreadsheet updates, (ii) by developing specialized compressed representations of dependencies between cells for efficient computation of values of cells with formulas; and (iii) by prioritizing the refresh of the values of cells that are currently visible to the user. Maintaining positional order efficiently and exploiting the fact that the user can only view a small portion of the spreadsheet
at each point in time is much harder to exploit in the virtual approach as the result of updates and their effects on cell position are only materialized when data is received. Thus, the virtual approach is often less efficient. However, the virtual approach also has several important benefits: because we are storing only the updates made by the user (insert a row at position $x$, replace the value of cell $c$ with $v$, \ldots), multiple versions of the spreadsheet can be retained at very low storage cost by storing changes to the sequence of transformations that were applied to the data rather than changes to the data itself (linear in the number of operations independent of the size of the spreadsheet).
In Wrangler, the resulting data transformation process can be easily upscaled from an interaction-friendly sample of the data to the entire dataset.
In Vizier, the user's manipulations are encoded in the lineage of a Spark dataframe, facilitating detailed provenance analysis.
In fact, this is how versioning of datasets (spreadsheets) is implemented in Vizier. Furthermore, spreadsheets are often used for data preparation: a user loads a dataset and then iteratively curates the data. If the original dataset is updated, e.g., the user may have downloaded the dataset from an open data portal and the provider of the portal has uploaded a new version of the dataset, then the user may want to reapply their edits to the new version of the dataset. This translation is often possible for the virtual approach, but is not possible in the materialized approach where there is no log of the update operations that were applied to the spreadsheet.
Although the virtual approach is often less efficient, it does provide capabilities that the materialized approach does not:
(i) Because it stores only the updates applied by the user (e.g., insert a row at position $x$, replace the value of cell $c$ with $v$, \ldots), the spreadsheet's full version history can be stored at negligible cost;
(ii) As in Wrangler, the resulting data transformation process can be easily applied to other data (e.g., by scaling up from an interaction-friendly sample of the data to the entire dataset, or an updated version of the data); and
(iii) As in Vizier, the user's interactions can be translated into a standardized query model (i.e., a Spark dataframe), allowing it to ``plug into'' existing scalable computation platforms (i.e., Spark) and standardized provenance analysis frameworks (e.g., \cite{kumari:2021:cidr:datasense}).
% User interactions manipulate a data transformation process, rather than the data itself.
% In Wrangler, the resulting data transformation process can be easily upscaled from an interaction-friendly sample of the data to the entire dataset.
% In Vizier, the user's manipulations are encoded in the lineage of a Spark's dataframe, facilitating detailed provenance analysis and effective versioning.
\BG{Because relational engines are not optimized for low-latency update queries, the architectural approach can provide significantly better interactive performance than the relational approach.}\BG{Furthermore, an important feature of spreadsheets is that cells may contain expressions (\emph{formulas}) which reference other cells. This enables users to create ``local'' views which populate some parts of the spreadsheet automatically. The evaluation of formulas is an inherently recursive process (a cell with a formula may refer to another cell that itself contains a formula and so on) which is hard to express efficiently in SQL.}
% However, the relational design an important benefit:
% User interactions manipulate a data transformation process, rather than the data itself.
% In Wrangler, the resulting data transformation process can be easily upscaled from an interaction-friendly sample of the data to the entire dataset.
% In Vizier, the user's manipulations are encoded in the lineage of a Spark's dataframe, facilitating detailed provenance analysis and effective versioning.
In this paper, we present an optimized hybrid of the virtual and materialized approaches to scalable spreadsheets: \emph{Overlay Spreadsheets}.
An Overlay Spreadsheet keeps source data in-situ, decoupled from the user's edits to a spreadsheet ``overlaid'' on top of the source data, as illustrated in \Cref{fig:overlay}.
Users interact with the resulting structure much like they would an ordinary spreadsheet, inserting or removing rows or columns, overwriting data with formulas or literals, and reorganizing the data.
Crucially, the overlay virtualizes references to the source dataset, allowing users to replay their actions on a new, updated dataset. In contrast to the purely virtual approach, which expressed updates as relational operations, overlays store more concrete information about updates to the positional order of cells and about which cells were modified. We demonstrate that this different virtual representation of edits enables more efficient exploitation of spreadsheet access patterns including computing the values of cells currently visible to the user.
In this paper, we present an optimized hybrid of the virtual and materialized approaches: \emph{Overlay Spreadsheets}.
In an Overlay Spreadsheet (\Cref{fig:overlay}), the user's edits are stored in a spreadsheet that is ``overlaid'' on top of source data.
Users interact with an Overlay Spreadsheet just like an ordinary spreadsheet, inserting or removing rows or columns, overwriting data with formulas or literals, and reorganizing the data.
However, references to the source dataset are virtualized, allowing users to replay their actions on an updated dataset, translate spreadsheets to run on scalable computation platforms, and facilitate provenance analysis.
We also demonstrate that this different virtual representation of edits enables more efficient exploitation of spreadsheet access patterns, including optimizing computation of cells visible to the user.
We outline a preliminary implementation of Overlay Spreadsheets within Vizier~\cite{brachmann:2019:sigmod:data,brachmann:2020:cidr:your,kennedy:2022:ieee-deb:right}, a multi-modal, reproducibility-oriented, notebook-style workflow system built on Apache Spark.
Users of Vizier define sequences of data transformation steps that may include scripts, templated widgets, or other operations.
A key feature of Vizier is that users can define data transformations (including limited formula support) through a spreadsheet style interface;
Following~\cite{freire:2016:hilda:exception}, user interactions are applied to a dataframe, and the results are updated and displayed.
As mentioned above, in spite of this approach's performance limitations, it remains preferable, as it allows the user actions to be reapplied to new source data (a necessity in Vizier's workflow model), and enables fine-grained provenance analysis and light-weight versioning (another key feature of Vizier).
Existing versions of Vizier provide a spreadsheet-style interface, where each user interaction builds out the data transformation workflow.
In spite of the performance limitations of the virtual approach, it remains preferable for Vizier, where (i) changes to an early step in the workflow may require automatically re-applying the user's edits, and (ii) fine-grained provenance features are implemented primarily over Spark dataframes.
%
Our objective in this paper is to demonstrate that a spreadsheet-style interface can provide \textbf{interactive latencies} (i.e., like the materialized approach), while still supporting \textbf{replay and provenance} (i.e., like the virtual approach).
Concretely, our objective is to demonstrate a spreadsheet-style interface that provides interactive latencies (i.e., like the materialized approach), while simultaneously supporting replay, provenance, and updates to the input data (i.e., like the virtual approach).
The resulting interface can be used for data exploration, data preparation, and preliminary analysis, but also provides a low-friction, visual environment for defining bulk data transformations.
As a secondary goal, we further explore the additional benefits of the overlay approach.
Specifically, we observe that because spreadsheet updates are typically made manually, the number of updates is limited by the speed of a human interacting with the system.
Although a single update may be applied to multiple cells (e.g., by copy/pasting a formula over a range of cells), the number of such updates is likely to be small.
In this paper, we take the first steps towards hybridizing the cell-at-a-time execution strategies of classical spreadsheets, with bulk computation strategies found in relational databases.
This hybrid strategy is akin to optimizations applied in DataSpread~\cite{DBLP:conf/sigmod/BendreWMCP19, tang-23-efcsfg}, but operates over patterns of updates rather than patterns in the dependency graph.
The overlay approach also carries one additional benefit.
As mentioned above, typically the number of interactions that the user performs on the dataset will be small compared to the size of the dataset --- the user is unlikely to need to manually inspect and update each individual row of a million row dataset.
Rather, we expect a common pattern to involve fine-grained manipulation of a small fragment of the dataset to derive new formulas, followed by a bulk application of the formula to the remainder of the dataset.
Under the assumption that the majority of cell updates will be bulk applications of a common formula ``pattern,'' the overlay only needs to record the pattern and the range of cells it was applied to. This is akin to optimizations applied in DataSpread~\cite{DBLP:conf/sigmod/BendreWMCP19, tang-23-efcsfg}, but we are creating patterns of updates to the spreadsheet rather than patterns of dependencies in a particular version of the spreadsheet.
Such patterns reference external cells by offsets to the cell on which the formula is applied, allowing one pattern to define an entire data-parallel computation.
This form of compression can substantially reduce the size of the overlay's encoding, but its use of offset positions becomes problematic if the shape of the dataset changes.
For example, if a new row is inserted, the offset for a given formula changes, an issue that was not addressed in~\cite{DBLP:conf/sigmod/BendreWMCP19, tang-23-efcsfg}.
We explore how the compression can be preserved through versioned ``reference frames'' that record and facilitate low-overhead transformations between different versions of the mapping between positions and cells that defines a spreadsheet.
As a further advantage of the overlay approach, user interactions with the overlay can be translated into other representations~\cite{freire:2016:hilda:exception}.
For example, a user's edits on a spreadsheet can be transformed into a series of transformations over a dataframe, allowing seamless integration of existing approaches to provenance management~\cite{brachmann:2020:cidr:your,kumari:2021:cidr:datasense} and workflow execution~\cite{kennedy:2022:ieee-deb:right}.
In this paper, we introduce Overlay Spreadsheets, and present the details of our prototype implementation.
We implement the concept in the Vizier notebook~\cite{kennedy:2022:ieee-deb:right,brachmann:2020:cidr:your,brachmann:2019:sigmod:data}, a workflow-style notebook built over Apache Spark.
We explore the challenges of integrating overlay spreadsheets with Apache Spark dataframes, and discuss preliminary work in translating an overlay spreadsheet to derive a dataframe.
\BG{Experimental result take-aways}
% March 26 by OK: Trimming the ToC summary for space
%
% In this paper, we introduce Overlay Spreadsheets, and present the details of our prototype implementation.
% We implement the concept in the Vizier notebook~\cite{kennedy:2022:ieee-deb:right,brachmann:2020:cidr:your,brachmann:2019:sigmod:data}, a workflow-style notebook built over Apache Spark.
% We explore the challenges of integrating overlay spreadsheets with Apache Spark dataframes, and discuss preliminary work in translating an overlay spreadsheet to derive a dataframe.
% \BG{Experimental result take-aways}

View File

@ -1,5 +1,6 @@
%!TEX root=../main.tex
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Spreadsheet Datamodel}
\section{Spreadsheet Data Model}
\label{sec:spre-datam}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@ -153,6 +154,7 @@ $,$\\ %\vspace{5mm}
\end{minipage}
\caption{Example spreadsheet (expressions are shown in \textcolor{tabexprcolor}{dark green} to distinguish them from values), result of evaluating the spreadsheet, and an update applied to the spreadsheet (updated expressions and values are shown in \uv{red}).}\label{fig:example-spreadsheet-and-a}
\trimfigurespacing
\end{figure}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@ -261,6 +263,7 @@ $\,$\\
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\end{minipage}
\caption{Example overlay update and result (updated expressions and values are shown in \uv{red}).}\label{fig:example-overlay-update}
\trimfigurespacing
\end{figure}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

View File

@ -3,9 +3,10 @@
\label{sec:system}
\begin{figure}
\includegraphics[width=0.8\columnwidth]{graphics/systemdesign.png}
\includegraphics[width=0.4\columnwidth]{graphics/system-arch}
\caption{Overlay system design.}
\label{fig:systemdesign}
\trimfigurespacing
\end{figure}
We now outline the design of our prototype overlay spreadsheet, implemented as part of the Vizier reproducible notebook platform~\cite{brachmann:2020:cidr:your,brachmann:2019:sigmod:data,kennedy:2022:ieee-deb:right}.
@ -81,9 +82,10 @@ Crucially, the update index avoids materializing the expressions for each cell b
As noted above, we assume that the number of columns is comparatively small, and the number of rows is comparatively large.
\begin{figure}
\includegraphics[width=\columnwidth]{graphics/rangemap.png}
\caption{A range map defines a mapping from disjoint ranges to values.}
\includegraphics[width=0.7\columnwidth]{graphics/rangemap.pdf}
\caption{A range map maps disjoint ranges to values.}
\label{fig:rangemap}
\trimfigurespacing
\end{figure}
\paragraph{Range Maps}