Appendix D cleanup, and starting to think about the intro

2021-09-08 22:32:30 -04:00 · 2021-09-08 22:32:30 -04:00 · bfbf92b0cb
parent 03967356e2
commit bfbf92b0cb
4 changed files with 37 additions and 34 deletions
--- a/appendix.tex
+++ b/appendix.tex
@ -43,7 +43,9 @@ Note that we can construct circuits for \bis in time linear in the time required
 \newcommand{\bagdbof}{\textsc{bag}(\pxdb)}

 We now connect the size of a circuit (where the size of a circuit is the number of vertices in the corresponding DAG) %\footnote{since each node has indegree at most two, this also is the same up to constants to counting the number of edges in the DAG.})
- for a given SPJU query $Q$ and $\semNX$-PDB $\pxdb$ to its $\qruntime{Q,\db}$ where $\db$ is one of the possible worlds of $\pxdb$. We do this formally by showing that the size of the circuit is asymptotically no worse than the corresponding runtime of a large class of deterministic query processing algorithms.
+for a given SPJU query $Q$ and $\semNX$-PDB $\pxdb$ to
+the runtime $\qruntime{Q,\dbbase}$ of $Q$ over the PDB's \dbbaseName $\dbbase$.
+We do this formally by showing that the size of the circuit is asymptotically no worse than the corresponding runtime of a large class of deterministic query processing algorithms.

 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \newcommand{\getpoly}[1]{\textbf{lin}\inparen{#1}}
@ -64,7 +66,7 @@ We define the circuit for a select-union-project-join $Q$ recursively by cases a

 \caseheading{Base Relation}
 Let $Q$ be a base relation $R$.  We define one node for each tuple.  Formally, let $V_{Q,\pxdb} = \comprehension{v_t}{t\in R}$, let $\phi_{Q,\pxdb}(t) = v_t$, let $\ell_{Q,\pxdb}(v_t) = R(t)$, and let $E_{Q,\pxdb} = \emptyset$.
-This circuit has $|R|$ vertices.
+This circuit has $|\dbbase.R|$ vertices.

 \caseheading{Selection}
 Let $Q = \sigma_\theta \inparen{Q_1}$.
@ -120,19 +122,19 @@ There are $|{Q_1} \bowtie \ldots \bowtie {Q_k}|$ such vertices, so the corrected
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \begin{Lemma}\label{lem:circ-model-runtime}
 \label{lem:circuits-model-runtime}
-Given a $\semNX$-PDB $\pxdb$ and query plan $Q$, the runtime of $Q$ over $\pxdb$ has the same or greater complexity as the size of the lineage of $Q(\pxdb)$.  That is, we have $\abs{V_{Q,\pxdb}} \leq (k-1)\qruntime{Q}$, where $k$ is the maximal degree of  any  polynomial in $Q(\pxdb)$.
+Given a $\semNX$-PDB $\pxdb$ with \dbbaseName $\dbbase$, and a query plan $Q$, the runtime of $Q$ over $\dbbase$ is, up to a constant factor, at least as large as the size of the lineage of $Q(\pxdb)$. That is, we have $\abs{V_{Q,\pxdb}} \leq (k-1)\qruntime{Q, \dbbase}$, where $k$ is the maximal degree of any polynomial in $Q(\pxdb)$.
 \end{Lemma}
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 %\noindent The proof is shown in \Cref{app:subsec-lem-lin-vs-qplan}.

 %\subsection{Proof for \Cref{lem:circuits-model-runtime}}\label{app:subsec-lem-lin-vs-qplan}
 \begin{proof}
-Proof by induction.  The base case is a base relation: $Q = R$ and is trivially true since $|V_{R,\pxdb}| = |R|$.
-For the inductive step, we assume that we have circuits for subplans $Q_1, \ldots, Q_n$ such that $|V_{Q_i,\pxdb}| \leq (k_i-1)\qruntime{Q_i,\pxdb}$ where $k_i$ is the degree of $Q_i$.
+Proof by induction.  The base case is a base relation: $Q = R$ and is trivially true since $|V_{R,\pxdb}| = |\dbbase.R|$.
+For the inductive step, we assume that we have circuits for subplans $Q_1, \ldots, Q_n$ such that $|V_{Q_i,\pxdb}| \leq (k_i-1)\qruntime{Q_i,\dbbase}$ where $k_i$ is the degree of $Q_i$.

 \caseheading{Selection}
 Assume that $Q = \sigma_\theta(Q_1)$.
-In the circuit for $Q$, $|V_{Q,\pxdb}| = |V_{Q_1,\pxdb}|$ vertices, so from the inductive assumption and $\qruntime{Q,\pxdb} = \qruntime{Q_1,\pxdb}$ by definition, we have $|V_{Q,\pxdb}| \leq (k-1) \qruntime{Q,\pxdb} $.
+The circuit for $Q$ has $|V_{Q,\pxdb}| = |V_{Q_1,\pxdb}|$ vertices, so from the inductive assumption and the fact that $\qruntime{Q,\dbbase} = \qruntime{Q_1,\dbbase}$ by definition, we have $|V_{Q,\pxdb}| \leq (k-1) \qruntime{Q,\dbbase} $.
 % \AH{Technically, $\kElem$ is the degree of $\poly_1$, but I guess this is a moot point since one can argue that $\kElem$ is also the degree of $\poly$.}
 % OK: Correct

@ -142,24 +144,24 @@ The circuit for $Q$ has at most $|V_{Q_1,\pxdb}|+|{Q_1}|$ vertices.
 % \AH{The combination of terms above doesn't follow the details for projection above.}
 \begin{align*}
 |V_{Q,\pxdb}| & \leq |V_{Q_1,\pxdb}| + |Q_1|\\
-%\intertext{By \Cref{prop:queries-need-to-output-tuples} $\qruntime{Q_1,\pxdb} \geq |Q_1|$}
+%\intertext{By \Cref{prop:queries-need-to-output-tuples} $\qruntime{Q_1,\dbbase} \geq |Q_1|$}
 %& \leq |V_{Q_1,\pxdb}| + 2 \qruntime{Q_1,\pxdb}\\
 \intertext{(From the inductive assumption)}
-& \leq (k-1)\qruntime{Q_1,\pxdb} + \abs{Q_1}\\
-\intertext{(By definition  of $\qruntime{Q,\pxdb}$)}
-& \le (k-1)\qruntime{Q,\pxdb}.
+& \leq (k-1)\qruntime{Q_1,\dbbase} + \abs{Q_1}\\
+\intertext{(By definition  of $\qruntime{Q,\dbbase}$)}
+& \le (k-1)\qruntime{Q,\dbbase}.
 \end{align*}
 \caseheading{Union}
 Assume that $Q = Q_1 \cup Q_2$.
 The circuit for $Q$ has $|V_{Q_1,\pxdb}|+|V_{Q_2,\pxdb}|+|{Q_1} \cap {Q_2}|$ vertices.
 \begin{align*}
 |V_{Q,\pxdb}| & \leq |V_{Q_1,\pxdb}|+|V_{Q_2,\pxdb}|+|{Q_1}|+|{Q_2}|\\
-%\intertext{By \Cref{prop:queries-need-to-output-tuples} $\qruntime{Q_1,\pxdb} \geq |Q_1|$}
-%& \leq |V_{Q_1,\pxdb}|+|V_{Q_2,\pxdb}|+\qruntime{Q_1,\pxdb}+\qruntime{Q_2,\pxdb}|\\
+%\intertext{By \Cref{prop:queries-need-to-output-tuples} $\qruntime{Q_1,\dbbase} \geq |Q_1|$}
+%& \leq |V_{Q_1,\pxdb}|+|V_{Q_2,\pxdb}|+\qruntime{Q_1,\dbbase}+\qruntime{Q_2,\dbbase}\\
 \intertext{(From the inductive assumption)}
-& \leq (k-1)(\qruntime{Q_1,\pxdb} + \qruntime{Q_2,\pxdb}) + (b_1 + b_2)
-\intertext{(By definition of $\qruntime{Q,\pxdb}$)}
-& \leq (k-1)(\qruntime{Q,\pxdb}).
+& \leq (k-1)(\qruntime{Q_1,\dbbase} + \qruntime{Q_2,\dbbase}) + (\abs{Q_1} + \abs{Q_2})\\
+\intertext{(By definition of $\qruntime{Q,\dbbase}$)}
+& \leq (k-1)(\qruntime{Q,\dbbase}).
 \end{align*}

 \caseheading{$k$-ary Join}
@ -168,12 +170,12 @@ The circuit for $Q$ has $|V_{Q_1,\pxdb}|+\ldots+|V_{Q_k,\pxdb}|+(k-1)|{Q_1} \bow
 \begin{align*}
 |V_{Q,\pxdb}| & = |V_{Q_1,\pxdb}|+\ldots+|V_{Q_k,\pxdb}|+(k-1)|{Q_1} \bowtie \ldots \bowtie {Q_k}|\\
 \intertext{From the inductive assumption and noting $\forall i: k_i \leq k-1$}
-& \leq (k-1)\qruntime{Q_1,\pxdb}+\ldots+(k-1)\qruntime{Q_k,\pxdb}+\\
+& \leq (k-1)\qruntime{Q_1,\dbbase}+\ldots+(k-1)\qruntime{Q_k,\dbbase}+\\
 &\;\;\; (k-1)|{Q_1} \bowtie \ldots \bowtie {Q_k}|\\
-& \leq (k-1)(\qruntime{Q_1,\pxdb}+\ldots+\qruntime{Q_k,\pxdb}+\\
+& \leq (k-1)(\qruntime{Q_1,\dbbase}+\ldots+\qruntime{Q_k,\dbbase}+\\
 &\;\;\;|{Q_1} \bowtie \ldots \bowtie {Q_k}|)\\
-\intertext{(By definition of $\qruntime{Q,\pxdb}$)}
-& = (k-1)\qruntime{Q,\pxdb}.
+\intertext{(By definition of $\qruntime{Q,\dbbase}$)}
+& = (k-1)\qruntime{Q,\dbbase}.
 \end{align*}

 The property holds in every case of the inductive step, which completes the proof.
--- a/circuits-model-runtime.tex
+++ b/circuits-model-runtime.tex
@ -18,31 +18,29 @@ Lastly, we generalize our result for expectation to other moments.
 \mypar{The cost model}
 %\label{sec:cost-model}
 So far our analysis of $\approxq$ has been in terms of the size of the lineage circuits.
-We now show that this model corresponds to the behavior of a deterministic database by proving that for any \raPlus query $\query$, we can construct a compressed circuit for $\poly$ and \bi $\pxdb$ of size and runtime linear in that of a general class of query processing algorithms for the same query $\query$ on a deterministic database $\db$.
-We assume a linear relationship between input sizes $|\pxdb|$ and $|\db|$ (i.e., $\exists c, \db \in \pxdb$ s.t. $\abs{\pxdb} \leq c \cdot \abs{\db})$).
-\footnote{This is a reasonable assumption because each block of a \bi represents entities with uncertain attributes.
-In practice there is often a limited number of alternatives for each block (e.g., which of five conflicting data sources to trust). Note that all \tis trivially fulfill this condition (i.e., $c = 1$).}
+We now show that this model corresponds to the behavior of a deterministic database by proving that for any \raPlus query $\query$, we can construct a compressed circuit for $\poly$ and \bi $\pxdb$ of size and runtime linear in that of a general class of query processing algorithms for the same query $\query$ on $\pxdb$'s \dbbaseName $\dbbase$.  
+% Note that by definition, there exists a linear relationship between input sizes $|\pxdb|$ and $|\dbbase|$ (i.e., $\exists c, \db \in \pxdb$ s.t. $\abs{\pxdb} \leq c \cdot \abs{\db})$).
+% \footnote{This is a reasonable assumption because each block of a \bi represents entities with uncertain attributes.
+% In practice there is often a limited number of alternatives for each block (e.g., which of five conflicting data sources to trust). Note that all \tis trivially fulfill this condition (i.e., $c = 1$).}
 %That is for \bis that fulfill this restriction approximating the expectation of results of SPJU queries is only has a constant factor overhead over deterministic query processing (using one of the algorithms for which we prove the claim).
 % with the same complexity as it would take to evaluate the query on a deterministic \emph{bag} database of the same size as the input PDB.
 We adopt a minimalistic compute-bound model of query evaluation drawn from the worst-case optimal join literature~\cite{skew,ngo-survey}.
-
-\newcommand{\qruntime}[1]{\textbf{cost}(#1)}
 %
 \noindent\resizebox{1\linewidth}{!}{
 \begin{minipage}{1.0\linewidth}
  \begin{align*}
-\qruntime{R,D}                               & = |R|                                                        &
-                                                                                                              \qruntime{\sigma Q, D}                       & = \qruntime{Q,D}                                             &
-                                                                                                                                                                                                                            \qruntime{\pi Q, D}                          & = \qruntime{Q,D} + \abs{Q(D)}
+\qruntime{R,\dbbase}                               & = |\dbbase.R|                                                        &
+                                                                                                              \qruntime{\sigma Q, \dbbase}                       & = \qruntime{Q,\dbbase}                                             &
+                                                                                                                                                                                                                            \qruntime{\pi Q, \dbbase}                          & = \qruntime{Q,\dbbase} + \abs{Q(\dbbase)}
  \end{align*}\\[-15mm]
 \begin{align*}
-\qruntime{Q \cup Q', D}                      & = \qruntime{Q, D} + \qruntime{Q', D} +\abs{Q(D)}+\abs{Q'(D)} \\
-\qruntime{Q_1 \bowtie \ldots \bowtie Q_n, D} & = \qruntime{Q_1, D} + \ldots + \qruntime{Q_n,D} + \abs{Q_1(D) \bowtie \ldots \bowtie Q_n(D)}
+\qruntime{Q \cup Q', \dbbase}                      & = \qruntime{Q, \dbbase} + \qruntime{Q', \dbbase} +\abs{Q(\dbbase)}+\abs{Q'(\dbbase)} \\
+\qruntime{Q_1 \bowtie \ldots \bowtie Q_n, \dbbase} & = \qruntime{Q_1, \dbbase} + \ldots + \qruntime{Q_n,\dbbase} + \abs{Q_1(\dbbase) \bowtie \ldots \bowtie Q_n(\dbbase)}
 \end{align*}
 \end{minipage}
 }\\

-Under this model a query $Q$ evaluated over database $D$ has runtime $O(\qruntime{Q,D})$.
+Under this model a query $Q$ evaluated over database $\dbbase$ has runtime $O(\qruntime{Q,\dbbase})$.
 We assume that full table scans are used for every base relation access. We can model index scans by treating an index scan query $\sigma_\theta(R)$ as a base relation.

 It can be verified that worst-case optimal join algorithms~\cite{skew,ngo-survey}, as well as query evaluation via factorized databases~\cite{factorized-db}\AR{See my comment on element on whether we should include this ref or not.} (and work on FAQs~\cite{DBLP:conf/pods/KhamisNR16}) can be modeled as select-union-project-join queries (though the size of these queries is data dependent).\footnote{This claim can be verified by e.g. simply looking at the {\em Generic-Join} algorithm in~\cite{skew} and {\em factorize} algorithm in~\cite{factorized-db}.} It can be verified that the above cost model on the corresponding SPJU join queries correctly captures their runtime.
@ -58,10 +56,10 @@ It can be verified that worst-case optimal join algorithms~\cite{skew,ngo-survey
 We are now ready to formally state our claim from \Cref{sec:intro}:
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \begin{Corollary}
-  Given an SPJU query $Q$ over a \ti $\pxdb$ and let $\db_{max}$ denote the world containing all tuples of $\pxdb$, we can compute a $(1\pm\eps)$-approximation of the expectation for each output tuple in $\query(\pxdb)$ with probability at least $1-\delta$ in time
+  Given an SPJU query $Q$ over a \ti $\pxdb$ with \dbbaseName $\dbbase$, we can compute a $(1\pm\eps)$-approximation of the expectation for each output tuple in $\query(\pxdb)$ with probability at least $1-\delta$ in time
 %
  \[
-    O_k\left(\frac 1{\eps^2}\cdot\qruntime{Q,\db_{max}}\cdot \log{\frac{1}{\conf}}\cdot \log(n)\right)
+    O_k\left(\frac 1{\eps^2}\cdot\qruntime{Q,\dbbase}\cdot \log{\frac{1}{\conf}}\cdot \log(n)\right)
    \]
 \end{Corollary}
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
--- a/intro-rewrite-070921.tex
+++ b/intro-rewrite-070921.tex
@ -212,7 +212,8 @@ These are a natural fit to $\raPlus$ queries, as each operator maps to either a
 \end{figure}

 In other words, we can capture the size of a factorized lineage polynomial by the size of its corresponding arithmetic circuit $\circuit$ (which we denote by $|\circuit|$).
-More importantly, our result in \cref{sec:circuit-runtime} shows that, assuming a standard $\raPlus$ query evaluation algorithm for \abbrStepOne (\termStepOne), given the arithmetic circuit $\circuit$ corresponding to lineage polynomial output at the end of \abbrStepOne, we always have $|\circuit|\le \bigO{\timeOf{\abbrStepOne}(Q,\pdb)}$. Given this, we study the following stronger version of~\Cref{prob:big-o-step-one}:
+More importantly, our results in \cref{sec:circuit-runtime} show that, assuming a standard $\raPlus$ query evaluation algorithm for \abbrStepOne (\termStepOne), given the arithmetic circuit $\circuit$ corresponding to the lineage polynomial output at the end of \abbrStepOne, we always have $|\circuit|\le \bigO{\timeOf{\abbrStepOne}(Q,\pdb)}$. Given this, we study the following stronger version of~\Cref{prob:big-o-step-one}:
+\OK{This is still wrong.  It should be phrased in terms of $\qruntime{Q, \dbbase}$... but I think it's going to require changes to \Cref{prob:big-o-step-one}}

 %Atri: Replaced the text below by the above. I know I had talked about $|\circuit|^k$ but I think the stuff below breaks the flow a bit
 %Re-stating our earlier observation, given a circuit \circuit, if \circuit is in \abbrSMB (i.e. every sink to source path has a prefix of addition nodes and the rest of the internal nodes are multiplication nodes), then we have that $\timeOf{\abbrStepTwo}(Q,\pdb)$ is indeed $\bigO{\timeOf{\abbrStepOne}(Q,\pdb)}$.  We note that \abbrSMB representations are produced by queries with a projection operation on top of a join operation.
--- a/macros.tex
+++ b/macros.tex
@ -114,6 +114,7 @@
 \newcommand{\pdassign}{\mathcal{P}}
 \newcommand{\pdb}{\mathcal{D}}
 \newcommand{\dbbase}{\db_\idb}
+\newcommand{\dbbaseName}{deterministic bounding database\xspace}
 \newcommand{\pxdb}{\pdb_{\semNX}}
 \newcommand{\nxdb}{D(\vct{X})}%\mathbb{N}[\vct{X}] db--Are we currently using this?

@ -265,6 +266,7 @@
 \newcommand{\subgraph}{\vari{S}_{\equivtree(\circuit)}}
 %-----
 \newcommand{\cost}{\func{Cost}}
+\newcommand{\qruntime}[1]{\textbf{cost}(#1)}
 \newcommand{\nullval}{NULL}