Bounding TLC

2021-09-18 21:09:17 -04:00 · 2021-09-18 21:09:17 -04:00 · 0e26c8d736
parent 2893442616
commit 0e26c8d736
3 changed files with 104 additions and 58 deletions
--- a/appendix.tex
+++ b/appendix.tex
@ -66,62 +66,54 @@ We define the circuit for a $\raPlus$ query $\query$ recursively by cases as fol


 \begin{algorithm}
-\caption{\abbrStepOne$(\query, \dbbase)$}
+\caption{\abbrStepOne$(\query, \dbbase, V, E, \ell)$}
 \label{alg:lc}
  \begin{algorithmic}[1]
  \Require $\query$: query
  \Require $\dbbase$: a \dbbaseName
+  \Require $V, E, \ell$: accumulators for the vertex list, edge list, and vertex label list.
  \Ensure $\circuit = \tuple{E, V, \phi, \ell}$: a circuit encoding the lineage of each tuple in $\query(\dbbase)$
-
-  \If{$\query$ is $R$}
-    \State $V \gets \comprehension{v_t}{t \in \dbbase.R}$
-    \State $E \gets \emptyset$
+  \If{$\query$ is $R$} \Comment{\textbf{Case 1}: $\query$ is a relation atom}
    \For{$t \in \dbbase.R$}
-      \State $\phi(t) \gets v_t$ \Comment{$v_t$ as defined above}
-      \State $\ell(v_t) \gets R(t)$
+      \State $V \leftarrow V \cup \{v_t\}$; $\ell \leftarrow \ell \cup \{(v_t, R(t))\}$  \Comment{Allocate a fresh node $v_t$}
+      \State $\phi(t) = v_t$ 
    \EndFor
-  \ElsIf{$\query$ is $\sigma_\theta(\query')$}
-    \State $\tuple{V, E, \phi', \ell} \gets \abbrStepOne(\query', \dbbase)$
+  \ElsIf{$\query$ is $\sigma_\theta(\query')$} \Comment{\textbf{Case 2}: $\query$ is a Selection}
+    \State $\tuple{V, E, \phi', \ell} = \abbrStepOne(\query', \dbbase, V, E, \ell)$
    \For{$t \in \domain(\phi')$}
-      \If{$\theta(t)$}
-        \State $\phi(t) \gets \phi'(t)$
-      \Else
-        \State $\phi(t) \gets v_0$
-      \EndIf
+      \State \textbf{if }$\theta(t)$
+              \textbf{ then } $\phi(t) = \phi'(t)$
+              \textbf{ else } $\phi(t) = v_0$
    \EndFor
-  \ElsIf{$\query$ is $\pi_{\vec{A}}(\query')$}
-    \State $\tuple{V', E', \phi', \ell'} \gets \abbrStepOne(\query', \dbbase)$
-    \State $V \gets V' \cup \comprehension{v_t}{t \in \pi_{\vec{A}}(\domain(\phi'))}$
-    \State $E \gets E' \cup \comprehension{(\phi(t'), v_t)}{t \in \pi_{\vec{A}}t', t' \in \domain(\phi'), t \in \pi_{\vec{A}}(\domain(\phi'))}$
-      \Comment{Nodes with in-degrees above 2 are corrected (with logarithmic overhead) with an equivalent fan-in tree.}
+  \ElsIf{$\query$ is $\pi_{\vec{A}}(\query')$} \Comment{\textbf{Case 3}: $\query$ is a Projection}
+    \State $\tuple{V, E, \phi', \ell} = \abbrStepOne(\query', \dbbase, V, E, \ell)$
    \For{$t \in \pi_{\vec{A}}(\query'(\dbbase))$}
-      \State $\phi(t) \gets v_t$ \Comment{$v_t$ as defined above}
-      \State $\ell(v_t) \gets +$
+      \State $V \leftarrow V \cup \{v_t\}$; $\ell \leftarrow \ell \cup \{(v_t, +)\}$\Comment{Allocate a fresh node $v_t$}
+      \State $\phi(t) \leftarrow v_t$
    \EndFor
-  \ElsIf{$\query$ is $\query_1 \cup \query_2$}
-    \State $\tuple{V_1, E_1, \phi_1, \ell_1} \gets \abbrStepOne(\query_1, \dbbase)$
-    \State $\tuple{V_2, E_2, \phi_2, \ell_2} \gets \abbrStepOne(\query_2, \dbbase)$
-    \State $V \gets V_1 \cup V_2 \cup \comprehension{v_t}{t \in \domain(\phi_1) \cap \domain(\phi_2)}$
-    \State $E \gets E_1 \cup E_2 \cup \comprehension{(\phi_1(t), v_t), (\phi_2(t), v_t)}{t \in \domain(\phi_1) \cap \domain(\phi_2)}$
-    \State $\phi \gets \phi_1 \cup \phi_2$
-    \State $\ell \gets \ell_1 \cup \ell_2$
+    \For{$t \in \query'(\dbbase)$}
+      \State $E \leftarrow E \cup \{(\phi'(t), \phi(\pi_{\vec{A}}t))\}$
+    \EndFor
+    \State Correct nodes with in-degrees $>2$ by appending an equivalent fan-in tree instead
+  \ElsIf{$\query$ is $\query_1 \cup \query_2$} \Comment{\textbf{Case 4}: $\query$ is a Bag Union}
+    \State $\tuple{V, E, \phi_1, \ell} = \abbrStepOne(\query_1, \dbbase, V, E, \ell)$
+    \State $\tuple{V, E, \phi_2, \ell} = \abbrStepOne(\query_2, \dbbase, V, E, \ell)$
+    \State $\phi = \phi_1 \cup \phi_2$
    \For{$t \in \domain(\phi_1) \cap \domain(\phi_2)$}
-      \State $\phi(t) \gets v_t$ \Comment{$v_t$ as defined above}
-      \State $\ell(v_t) \gets +$
+      \State $V \leftarrow V \cup \{v_t\}$; $\ell \leftarrow \ell \cup \{(v_t, +)\}$ \Comment{Allocate a fresh node $v_t$}
+      \State $\phi(t) = v_t$
+      \State $E \leftarrow E \cup \{(\phi_1(t), v_t), (\phi_2(t), v_t)\}$
    \EndFor
-  \ElsIf{$\query$ is $\query_1 \bowtie \ldots \bowtie \query_k$}
-    \For{$i \in [1, k]$}
-      \State $\tuple{V_i, E_i, \phi_i, \ell_i} \gets \abbrStepOne(\query_i, \dbbase)$
+  \ElsIf{$\query$ is $\query_1 \bowtie \ldots \bowtie \query_n$} \Comment{\textbf{Case 5}: $\query$ is an $n$-ary Join}
+    \For{$i \in [n]$}
+      \State $\tuple{V, E, \phi_i, \ell} = \abbrStepOne(\query_i, \dbbase, V, E, \ell)$
    \EndFor
-    \State $V \gets V_1 \cup \ldots \cup V_k \cup \comprehension{v_t}{t \in \domain(\phi_1) \bowtie \ldots \bowtie \domain(\phi_k)}$
-    \State $E \gets E_1 \cup \ldots \cup E_k \cup \bigcup_{i \in [1,k]}
-      \comprehension{(\phi_i(\pi_{sch(\query_i)}(t))}{t \in \domain(\phi_1) \bowtie \ldots \bowtie \domain(\phi_k)}$
-    \State $\phi \gets \phi_1 \cup \ldots \cup \phi_k$
-    \State $\ell \gets \ell_1 \cup \ldots \cup \phi_k$
    \For{$t \in \domain(\phi_1) \bowtie \ldots \bowtie \domain(\phi_n)$}
-      \State $\phi(t) \gets v_t$
-      \State $\ell(v_t) \gets \times$
+      \State $V \leftarrow V \cup \{v_t\}$; $\ell \leftarrow \ell \cup \{(v_t, \times)\}$ \Comment{Allocate a fresh node $v_t$}
+      \State $\phi(t) = v_t$
+      \State $E \leftarrow E \cup \comprehension{(\phi_i(\pi_{sch(\query_i(\dbbase))}(t)), v_t)}{i \in [n]}$
    \EndFor
+    \State Correct nodes with in-degrees $>2$ by appending an equivalent fan-in tree instead

  \EndIf

@ -197,8 +189,8 @@ $\depth(\circuit^*) \leq O(k|\query|\log(n))$

 \begin{proof}
 We show that the bound of \Cref{prop:circuit-depth} holds for the circuit constructed by \Cref{alg:lc}.
-First, observe that \Cref{alg:lc} is invoked exactly once for every relational operator or base relation in $\query$; It thus suffices to show that an invocation \Cref{alg:lc} adds at most $O_k(\log(n))$ to the depth of any circuit produced by a recursive invocation.
-Second, observe that modulo the logarithmic fan-in of the projection and join cases, the depth of the output is at most one greater than the depth of any input.
+First, observe that \Cref{alg:lc} is (recursively) invoked exactly once for every relational operator or base relation in $\query$; it thus suffices to show that a call to \Cref{alg:lc} adds at most $O_k(\log(n))$ to the depth of a circuit produced by any recursive invocation.
+Second, observe that modulo the logarithmic fan-in of the projection and join cases, the depth of the output is at most one greater than the depth of any input (or at most 1 in the base case of relation atoms).
 For the join case, the number of in-edges can be no greater than the join width, which itself is bounded by $k$.  The depth thus increases by at most a constant factor of $\lceil \log(k) \rceil = O_k(1)$.  
 For the projection case, observe that the fan-in is bounded by $|\query'(\dbbase)|$, which is in turn bounded by $n^k$.  The depth increase for any projection node is thus at most $\lceil \log(n^k)\rceil = O(k\log(n))$, as desired. % = O_k(\log(n))$.  
 \qed
@ -278,9 +270,49 @@ The property holds for all recursive queries, and the proof holds.
 \label{sec:lc-runtime}

 We next need to show that we can construct the circuit in time linear in the deterministic runtime.  
-\begin{lemma}\label{lem:tlc-is-the-same-as-det}
+\begin{Lemma}\label{lem:tlc-is-the-same-as-det}
 Given a query $\query$ over a \dbbaseName $\dbbase$, the runtime $\timeOf{\abbrStepOne}(\query,\dbbase,\circuit) \le O(\qruntime{\query, \dbbase})$
-\end{lemma}
+\end{Lemma}
+\begin{proof}
+By analysis of \Cref{alg:lc}, invoked as $\abbrStepOne(\query, \dbbase, \emptyset, \emptyset, \emptyset)$.
+
+We assume that $V$, $E$, and $\ell$ are each stored in a mutable accumulator with $O(1)$ amortized append.
+We assume that $\phi$ is stored in a linked hashmap, with $O(1)$ insertions and retrievals, and $O(n)$ iteration over the domain of keys.
+We assume that the $n$-ary join $\domain(\phi_1) \bowtie \ldots \bowtie \domain(\phi_n)$ can be computed in time $\jointime{\domain(\phi_1), \ldots, \domain(\phi_n)}$ and that an intersection $\domain(\phi_1) \cap \domain(\phi_2)$ can be computed in time $O(|\domain(\phi_1)| + |\domain(\phi_2)|)$ (i.e., with a hash table).
+
+
+Before proving our runtime bound, we first observe that $\qruntime{\query, \db} \geq O(|\query(\db)|)$.
+This is true by construction for the relation, projection, and union cases, by \Cref{def:join-cost} for joins, and by the observation that $|\sigma(R)| \leq |R|$.
+
+We show that $\qruntime{\query, \dbbase}$ is an upper-bound for the runtime of \Cref{alg:lc} by recursion.  
+The base case of a relation atom requires only an $O(|\dbbase.R|)$ iteration over the source tuples.
+For the remaining cases, we make the recursive assumption that for every subquery $\query'$, it holds that $O(\qruntime{\query', \dbbase})$ bounds the runtime of \Cref{alg:lc}.
+
+\caseheading{Selection}
+Selection requires a recursive call to \Cref{alg:lc}, which by the recursive assumption is bounded by $O(\qruntime{\query', \dbbase})$.  
+\Cref{alg:lc} requires a loop over every element of $\query'(\dbbase)$.
+By the observation above that $\qruntime{\query, \db} \geq O(|\query(\db)|)$, this iteration is also bounded by $O(\qruntime{\query', \dbbase})$.
+
+\caseheading{Projection}
+Projection requires a recursive call to \Cref{alg:lc}, which by the recursive assumption is bounded by $O(\qruntime{\query', \dbbase})$, which in turn is a term in $\qruntime{\pi_{\vec{A}}\query', \dbbase}$.
+What remains is an iteration over $\pi_{\vec A}(\query'(\dbbase))$ (lines 13--16), an iteration over $\query'(\dbbase)$ (lines 17--19), and the construction of a fan-in tree (line 20).
+The first iteration is $O(|\query(\dbbase)|) \leq O(\qruntime{\query, \dbbase})$.  
+The second iteration and the construction of the bounded fan-in tree are both $O(|\query'(\dbbase)|) \leq O(\qruntime{\query', \dbbase}) \leq O(\qruntime{\query, \dbbase}) $, by the observation above that $\qruntime{\query, \db} \geq O(|\query(\db)|)$.
+
+\caseheading{Bag Union}
+As above, the recursive calls explicitly correspond to terms in the expansion of $O(\qruntime{\query_1 \cup \query_2, \dbbase})$.  
+Initializing $\phi$ (line 24) can be accomplished in $O(|\domain(\phi_1)| + |\domain(\phi_2)|) = O(|\query_1(\dbbase)| + |\query_2(\dbbase)|) \leq O(\qruntime{\query_1, \dbbase} + \qruntime{\query_2, \dbbase})$.
+The remainder requires computing $\domain(\phi_1) \cap \domain(\phi_2)$ (line 25) and iterating over it (lines 25--29), which is $O(|\query_1(\dbbase)| + |\query_2(\dbbase)|)$ as noted above --- this directly corresponds to terms in $\qruntime{\query_1 \cup \query_2, \dbbase}$.
+
+
+\caseheading{n-ary Join}
+As in the prior cases, recursive calls explicitly correspond to terms in our target runtime.
+The remaining logic consists of computing $\domain(\phi_1) \bowtie \ldots \bowtie \domain(\phi_n)$, iterating over the results, and combining nodes in a fan-in tree.
+Respectively, these are $\jointime{\domain(\phi_1), \ldots, \domain(\phi_n)}$, $O(|\query_1(\dbbase) \bowtie \ldots \bowtie \query_n(\dbbase)|) \leq \jointime{\domain(\phi_1), \ldots, \domain(\phi_n)}$ (\Cref{def:join-cost}), and $O(k|\query_1(\dbbase) \bowtie \ldots \bowtie \query_n(\dbbase)|)$.
+\qed
+\end{proof}
+
+

 With \Cref{lem:circ-model-runtime,lem:tlc-is-the-same-as-det} and our upper bound results on \approxq, we now have all the pieces to argue that using our approximation algorithm,  the expected multiplicities of an $\raPlus$ query can be computed in essentially the same runtime as deterministic query processing for the same query, proving claim (iv) of the Introduction.

--- a/circuits-model-runtime.tex
+++ b/circuits-model-runtime.tex
@ -24,31 +24,44 @@
 % In practice there is often a limited number of alternatives for each block (e.g., which of five conflicting data sources to trust). Note that all \tis trivially fulfill this condition (i.e., $c = 1$).}
 %That is for \bis that fulfill this restriction approximating the expectation of results of SPJU queries is only has a constant factor overhead over deterministic query processing (using one of the algorithms for which we prove the claim).
 % with the same complexity as it would take to evaluate the query on a deterministic \emph{bag} database of the same size as the input PDB.
-We adopt a minimalistic compute-bound model of query evaluation drawn from the worst-case optimal join literature~\cite{skew,ngo-survey} to define $\qruntime{\cdot,\cdot}$.\AR{Recursive definition needs to change based on what Oliver needs. Also I think in the definition betlow would be better to replace all $\dbbase$ with $D$.}

-%
+To decouple our results from specific join algorithms, we first abstract the cost of a join.
+
+\begin{Definition}[Join Cost]
+\label{def:join-cost}
+Denote by $\jointime{R_1, \ldots, R_n}$ the runtime of an algorithm for computing the n-ary join $R_1 \bowtie \ldots \bowtie R_n$.
+We require only that the algorithm must enumerate its output, i.e., that $\jointime{R_1, \ldots, R_n} \geq \Omega(|R_1(\db) \bowtie \ldots \bowtie R_n(\db)|)$.
+\end{Definition}
+
+Worst-case optimal join algorithms~\cite{skew,ngo-survey} and query evaluation via factorized databases~\cite{factorized-db} (as well as work on FAQs~\cite{DBLP:conf/pods/KhamisNR16}) can be modeled as $\raPlus$ queries (though the query size is data dependent).
+For these algorithms, $\jointime{R_1, \ldots, R_n} = |R_1| + \ldots + |R_n| + |R_1(\db) \bowtie \ldots \bowtie R_n(\db)|$.  Our cost model for general query evaluation follows from the join cost:
+
 \noindent\resizebox{1\linewidth}{!}{
 \begin{minipage}{1.0\linewidth}
  \begin{align*}
-\qruntime{R,\dbbase}                               & = |\dbbase.R|                                                        &
-                                                                                                              \qruntime{\sigma Q, \dbbase}                       & = \qruntime{Q,\dbbase}                                             &
-                                                                                                                                                                                                                            \qruntime{\pi Q, \dbbase}                          & = \qruntime{Q,\dbbase} + \abs{Q(D)}
+    \qruntime{R,\db}                    & = |\db.R| &
+    \qruntime{\sigma \query, \db}       & = \qruntime{\query,\db} &
+    \qruntime{\pi \query, \db}          & = \qruntime{\query,\db} + \abs{\query(\db)}
  \end{align*}\\[-15mm]
-\begin{align*}
-\qruntime{Q \cup Q', \dbbase}                      & = \qruntime{Q, \dbbase} + \qruntime{Q', \dbbase} +\abs{Q(D)}+\abs{Q'(D)} \\
-\qruntime{Q_1 \bowtie \ldots \bowtie Q_n, \dbbase} & = \qruntime{Q_1, \dbbase} + \ldots + \qruntime{Q_n,\dbbase} + \abs{Q_1(D) \bowtie \ldots \bowtie Q_n(D)}
+  \begin{align*}
+    \qruntime{\query \cup \query', \db} & = \qruntime{\query, \db} + 
+                                            \qruntime{\query', \db} +
+                                            \abs{\query(\db)}+\abs{\query'(\db)} \\
+    \qruntime{\query_1 \bowtie \ldots \bowtie \query_n, \db} 
+                                        & = \qruntime{\query_1, \db} + \ldots + 
+                                            \qruntime{\query_n,\db} + 
+                                            \jointime{\query_1(\db), \ldots, \query_n(\db)}
 \end{align*}
 \end{minipage}
 }\\

-Under this model a query $Q$ evaluated over database $\dbbase$ has runtime $O(\qruntime{Q,\dbbase})$.
+
+Under this model, a query $Q$ evaluated over database $\db$ has runtime $O(\qruntime{Q,\db})$.
 We assume that full table scans are used for every base relation access. We can model index scans by treating an index scan query $\sigma_\theta(R)$ as a base relation.
+% Observe that
+% () .\footnote{This claim can be verified by e.g. simply looking at the {\em Generic-Join} algorithm in~\cite{skew} and {\em factorize} algorithm in~\cite{factorized-db}.} It can be verified that the above cost model on the corresponding $\raPlus$ join queries correctly captures the runtime of current best known .

-It can be verified that worst-case optimal join algorithms~\cite{skew,ngo-survey}, as well as query evaluation via factorized databases~\cite{factorized-db}
-%\AR{See my comment on element on whether we should include this ref or not.} 
-(and work on FAQs~\cite{DBLP:conf/pods/KhamisNR16}) can be modeled as $\raPlus$ queries (though the size of these queries is data dependent).\footnote{This claim can be verified by e.g. simply looking at the {\em Generic-Join} algorithm in~\cite{skew} and {\em factorize} algorithm in~\cite{factorized-db}.} It can be verified that the above cost model on the corresponding $\raPlus$ join queries correctly captures their runtime.
-
-More specifically \Cref{lem:circ-model-runtime} and \Cref{to-be-decided} show that for any $\raPlus$ query $\query$ and $\dbbase$, there exists a circuit $\circuit$ such that $\timeOf{\abbrStepOne}(Q,\dbbase,\circuit)$ and $|\circuit$ are both $O(\qruntime{Q, \dbbase})$. Recall we assumed these two bounds when we moved from \Cref{prob:big-o-joint-steps} to \Cref{prob:intro-stmt}.
+More specifically, \Cref{lem:circ-model-runtime} and \Cref{lem:tlc-is-the-same-as-det} show that for any $\raPlus$ query $\query$ and $\dbbase$, there exists a circuit $\circuit$ such that $\timeOf{\abbrStepOne}(Q,\dbbase,\circuit)$ and $|\circuit|$ are both $O(\qruntime{Q, \dbbase})$. Recall we assumed these two bounds when we moved from \Cref{prob:big-o-joint-steps} to \Cref{prob:intro-stmt}.
 %
 %We now make a simple observation on the above cost model:
 %\begin{proposition}
--- a/macros.tex
+++ b/macros.tex
@ -312,6 +312,7 @@
 \newcommand{\ptime}{{\sf PTIME}\xspace}
 \newcommand{\timeOf}[1]{T_{#1}}
 \newcommand{\qruntime}[1]{T_{det}(#1)}
+\newcommand{\jointime}[1]{T_{join}(#1)}
 \newcommand{\kmatchtime}{T_{match}\inparen{k, G}}