Bounding TLC

master
Oliver Kennedy 2021-09-18 21:09:17 -04:00
parent 2893442616
commit 0e26c8d736
Signed by: okennedy
GPG Key ID: 3E5F9B3ABD3FDB60
3 changed files with 104 additions and 58 deletions

View File

@ -66,62 +66,54 @@ We define the circuit for a $\raPlus$ query $\query$ recursively by cases as fol
\begin{algorithm}
\caption{\abbrStepOne$(\query, \dbbase)$}
\caption{\abbrStepOne$(\query, \dbbase, V, E, \ell)$}
\label{alg:lc}
\begin{algorithmic}[1]
\Require $\query$: query
\Require $\dbbase$: a \dbbaseName
\Require $V, E, \ell$: accumulators for the vertex list, edge list, and vertex label list.
\Ensure $\circuit = \tuple{V, E, \phi, \ell}$: a circuit encoding the lineage of each tuple in $\query(\dbbase)$
\If{$\query$ is $R$}
\State $V \gets \comprehension{v_t}{t \in \dbbase.R}$
\State $E \gets \emptyset$
\If{$\query$ is $R$} \Comment{\textbf{Case 1}: $\query$ is a relation atom}
\For{$t \in \dbbase.R$}
\State $\phi(t) \gets v_t$ \Comment{$v_t$ as defined above}
\State $\ell(v_t) \gets R(t)$
\State $V \leftarrow V \cup \{v_t\}$; $\ell \leftarrow \ell \cup \{(v_t, R(t))\}$ \Comment{Allocate a fresh node $v_t$}
\State $\phi(t) = v_t$
\EndFor
\ElsIf{$\query$ is $\sigma_\theta(\query')$}
\State $\tuple{V, E, \phi', \ell} \gets \abbrStepOne(\query', \dbbase)$
\ElsIf{$\query$ is $\sigma_\theta(\query')$} \Comment{\textbf{Case 2}: $\query$ is a Selection}
\State $\tuple{V, E, \phi', \ell} = \abbrStepOne(\query', \dbbase, V, E, \ell)$
\For{$t \in \domain(\phi')$}
\If{$\theta(t)$}
\State $\phi(t) \gets \phi'(t)$
\Else
\State $\phi(t) \gets v_0$
\EndIf
\State \textbf{if }$\theta(t)$
\textbf{ then } $\phi(t) = \phi'(t)$
\textbf{ else } $\phi(t) = v_0$
\EndFor
\ElsIf{$\query$ is $\pi_{\vec{A}}(\query')$}
\State $\tuple{V', E', \phi', \ell'} \gets \abbrStepOne(\query', \dbbase)$
\State $V \gets V' \cup \comprehension{v_t}{t \in \pi_{\vec{A}}(\domain(\phi'))}$
\State $E \gets E' \cup \comprehension{(\phi(t'), v_t)}{t \in \pi_{\vec{A}}t', t' \in \domain(\phi'), t \in \pi_{\vec{A}}(\domain(\phi'))}$
\Comment{Nodes with in-degrees above 2 are corrected (with logarithmic overhead) with an equivalent fan-in tree.}
\ElsIf{$\query$ is $\pi_{\vec{A}}(\query')$} \Comment{\textbf{Case 3}: $\query$ is a Projection}
\State $\tuple{V, E, \phi', \ell} = \abbrStepOne(\query', \dbbase, V, E, \ell)$
\For{$t \in \pi_{\vec{A}}(\query'(\dbbase))$}
\State $\phi(t) \gets v_t$ \Comment{$v_t$ as defined above}
\State $\ell(v_t) \gets +$
\State $V \leftarrow V \cup \{v_t\}$; $\ell \leftarrow \ell \cup \{(v_t, +)\}$\Comment{Allocate a fresh node $v_t$}
\State $\phi(t) \leftarrow v_t$
\EndFor
\ElsIf{$\query$ is $\query_1 \cup \query_2$}
\State $\tuple{V_1, E_1, \phi_1, \ell_1} \gets \abbrStepOne(\query_1, \dbbase)$
\State $\tuple{V_2, E_2, \phi_2, \ell_2} \gets \abbrStepOne(\query_2, \dbbase)$
\State $V \gets V_1 \cup V_2 \cup \comprehension{v_t}{t \in \domain(\phi_1) \cap \domain(\phi_2)}$
\State $E \gets E_1 \cup E_2 \cup \comprehension{(\phi_1(t), v_t), (\phi_2(t), v_t)}{t \in \domain(\phi_1) \cap \domain(\phi_2)}$
\State $\phi \gets \phi_1 \cup \phi_2$
\State $\ell \gets \ell_1 \cup \ell_2$
\For{$t \in \query'(\dbbase)$}
\State $E \leftarrow E \cup \{(\phi'(t), \phi(\pi_{\vec{A}}t))\}$
\EndFor
\State Correct nodes with in-degrees $>2$ by appending an equivalent fan-in tree instead
\ElsIf{$\query$ is $\query_1 \cup \query_2$} \Comment{\textbf{Case 4}: $\query$ is a Bag Union}
\State $\tuple{V, E, \phi_1, \ell} = \abbrStepOne(\query_1, \dbbase, V, E, \ell)$
\State $\tuple{V, E, \phi_2, \ell} = \abbrStepOne(\query_2, \dbbase, V, E, \ell)$
\State $\phi = \phi_1 \cup \phi_2$
\For{$t \in \domain(\phi_1) \cap \domain(\phi_2)$}
\State $\phi(t) \gets v_t$ \Comment{$v_t$ as defined above}
\State $\ell(v_t) \gets +$
\State $V \leftarrow V \cup \{v_t\}$; $\ell \leftarrow \ell \cup \{(v_t, +)\}$ \Comment{Allocate a fresh node $v_t$}
\State $\phi(t) = v_t$
\State $E \leftarrow E \cup \{(\phi_1(t), v_t), (\phi_2(t), v_t)\}$
\EndFor
\ElsIf{$\query$ is $\query_1 \bowtie \ldots \bowtie \query_k$}
\For{$i \in [1, k]$}
\State $\tuple{V_i, E_i, \phi_i, \ell_i} \gets \abbrStepOne(\query_i, \dbbase)$
\ElsIf{$\query$ is $\query_1 \bowtie \ldots \bowtie \query_n$} \Comment{\textbf{Case 5}: $\query$ is an $n$-ary Join}
\For{$i \in [n]$}
\State $\tuple{V, E, \phi_i, \ell} = \abbrStepOne(\query_i, \dbbase, V, E, \ell)$
\EndFor
\State $V \gets V_1 \cup \ldots \cup V_k \cup \comprehension{v_t}{t \in \domain(\phi_1) \bowtie \ldots \bowtie \domain(\phi_k)}$
\State $E \gets E_1 \cup \ldots \cup E_k \cup \bigcup_{i \in [1,k]}
\comprehension{(\phi_i(\pi_{sch(\query_i)}(t)), v_t)}{t \in \domain(\phi_1) \bowtie \ldots \bowtie \domain(\phi_k)}$
\State $\phi \gets \phi_1 \cup \ldots \cup \phi_k$
\State $\ell \gets \ell_1 \cup \ldots \cup \ell_k$
\For{$t \in \domain(\phi_1) \bowtie \ldots \bowtie \domain(\phi_k)$}
\State $\phi(t) \gets v_t$
\State $\ell(v_t) \gets \times$
\State $V \leftarrow V \cup \{v_t\}$; $\ell \leftarrow \ell \cup \{(v_t, \times)\}$ \Comment{Allocate a fresh node $v_t$}
\State $\phi(t) = v_t$
\State $E \leftarrow E \cup \comprehension{(\phi_i(\pi_{sch(\query_i(\dbbase))}(t)), v_t)}{i \in [n]}$
\EndFor
\State Correct nodes with in-degrees $>2$ by appending an equivalent fan-in tree instead
\EndIf
@ -197,8 +189,8 @@ $\depth(\circuit^*) \leq O(k|\query|\log(n))$
\begin{proof}
We show that the bound of \Cref{prop:circuit-depth} holds for the circuit constructed by \Cref{alg:lc}.
First, observe that \Cref{alg:lc} is invoked exactly once for every relational operator or base relation in $\query$; It thus suffices to show that an invocation \Cref{alg:lc} adds at most $O_k(\log(n))$ to the depth of any circuit produced by a recursive invocation.
Second, observe that modulo the logarithmic fan-in of the projection and join cases, the depth of the output is at most one greater than the depth of any input.
First, observe that \Cref{alg:lc} is (recursively) invoked exactly once for every relational operator or base relation in $\query$; it thus suffices to show that a call to \Cref{alg:lc} adds at most $O_k(\log(n))$ to the depth of a circuit produced by any recursive invocation.
Second, observe that modulo the logarithmic fan-in of the projection and join cases, the depth of the output is at most one greater than the depth of any input (or at most 1 in the base case of relation atoms).
For the join case, the number of in-edges can be no greater than the join width, which itself is bounded by $k$. The depth thus increases by at most an additive $\lceil \log(k) \rceil = O_k(1)$.
For the projection case, observe that the fan-in is bounded by $|\query'(\dbbase)|$, which is in turn bounded by $n^k$. The depth increase for any projection node is thus at most $\lceil \log(n^k)\rceil = O(k\log(n))$, as desired. % = O_k(\log(n))$.
\qed
@ -278,9 +270,49 @@ The property holds for all recursive queries, and the proof holds.
\label{sec:lc-runtime}
We next need to show that we can construct the circuit in time linear in the deterministic runtime.
\begin{lemma}\label{lem:tlc-is-the-same-as-det}
\begin{Lemma}\label{lem:tlc-is-the-same-as-det}
Given a query $\query$ over a \dbbaseName $\dbbase$, the runtime $\timeOf{\abbrStepOne}(\query,\dbbase,\circuit) \le O(\qruntime{\query, \dbbase})$.
\end{lemma}
\end{Lemma}
\begin{proof}
By analysis of \Cref{alg:lc}, invoked as $\abbrStepOne(\query, \dbbase, \emptyset, \emptyset, \emptyset)$.
We assume that $V$, $E$, and $\ell$ are each stored in a mutable accumulator with $O(1)$ amortized append.
We assume that $\phi$ is stored in a linked hashmap, with $O(1)$ insertions and retrievals, and $O(n)$ iteration over the domain of keys.
We assume that the n-ary join $\domain(\phi_1) \bowtie \ldots \bowtie \domain(\phi_n)$ can be computed in time $\jointime{\domain(\phi_1), \ldots, \domain(\phi_n)}$ and that an intersection $\domain(\phi_1) \cap \domain(\phi_2)$ can be computed in time $O(|\domain(\phi_1)| + |\domain(\phi_2)|)$ (i.e., with a hash table).
Before proving our runtime bound, we first observe that $\qruntime{\query, \db} \geq O(|\query(\db)|)$.
This is true by construction for the relation, projection, and union cases, by \Cref{def:join-cost} for joins, and by the observation that $|\sigma(R)| \leq |R|$.
We show that $\qruntime{\query, \dbbase}$ is an upper-bound for the runtime of \Cref{alg:lc} by recursion.
The base case of a relation atom requires only an $O(|\dbbase.R|)$ iteration over the source tuples.
For the remaining cases, we make the recursive assumption that for every subquery $\query'$, it holds that $O(\qruntime{\query', \dbbase})$ bounds the runtime of \Cref{alg:lc}.
\caseheading{Selection}
Selection requires a recursive call to \Cref{alg:lc}, which by the recursive assumption is bounded by $O(\qruntime{\query', \dbbase})$.
\Cref{alg:lc} requires a loop over every element of $\query'(\dbbase)$.
By the observation above that $\qruntime{\query, \db} \geq O(|\query(\db)|)$, this iteration is also bounded by $O(\qruntime{\query', \dbbase})$.
\caseheading{Projection}
Projection requires a recursive call to \Cref{alg:lc}, which by the recursive assumption is bounded by $O(\qruntime{\query', \dbbase})$, which in turn is a term in $\qruntime{\pi_{\vec{A}}\query', \dbbase}$.
What remains is an iteration over $\pi_{\vec A}(\query'(\dbbase))$ (lines 13--16), an iteration over $\query'(\dbbase)$ (lines 17--19), and the construction of a fan-in tree (line 20).
The first iteration is $O(|\query(\dbbase)|) \leq O(\qruntime{\query, \dbbase})$.
The second iteration and the construction of the bounded fan-in tree are both $O(|\query'(\dbbase)|) \leq O(\qruntime{\query', \dbbase}) \leq O(\qruntime{\query, \dbbase})$, by the observation above that $\qruntime{\query, \db} \geq O(|\query(\db)|)$.
\caseheading{Bag Union}
As above, the recursive calls explicitly correspond to terms in the expansion of $O(\qruntime{\query_1 \cup \query_2, \dbbase})$.
Initializing $\phi$ (line 24) can be accomplished in $O(|\domain(\phi_1)| + |\domain(\phi_2)|) = O(|\query_1(\dbbase)| + |\query_2(\dbbase)|) \leq O(\qruntime{\query_1, \dbbase} + \qruntime{\query_2, \dbbase})$.
The remainder requires computing $\domain(\phi_1) \cap \domain(\phi_2)$ (line 25) and iterating over it (lines 25--29), which is $O(|\query_1(\dbbase)| + |\query_2(\dbbase)|)$ as noted above --- this directly corresponds to terms in $\qruntime{\query_1 \cup \query_2, \dbbase}$.
\caseheading{n-ary Join}
As in the prior cases, recursive calls explicitly correspond to terms in our target runtime.
The remaining logic consists of computing $\domain(\phi_1) \bowtie \ldots \bowtie \domain(\phi_n)$, iterating over the results, and combining nodes in a fan-in tree.
Respectively, these are $\jointime{\domain(\phi_1), \ldots, \domain(\phi_n)}$, $O(|\query_1(\dbbase) \bowtie \ldots \bowtie \query_n(\dbbase)|) \leq \jointime{\domain(\phi_1), \ldots, \domain(\phi_n)}$ (\Cref{def:join-cost}), and $O(k|\query_1(\dbbase) \bowtie \ldots \bowtie \query_n(\dbbase)|)$.
\qed
\end{proof}
With \Cref{lem:circ-model-runtime,lem:tlc-is-the-same-as-det} and our upper bound results on \approxq, we now have all the pieces to argue that using our approximation algorithm, the expected multiplicities of an $\raPlus$ query can be computed in essentially the same runtime as deterministic query processing for the same query, proving claim (iv) of the Introduction.

View File

@ -24,31 +24,44 @@
% In practice there is often a limited number of alternatives for each block (e.g., which of five conflicting data sources to trust). Note that all \tis trivially fulfill this condition (i.e., $c = 1$).}
%That is for \bis that fulfill this restriction approximating the expectation of results of SPJU queries is only has a constant factor overhead over deterministic query processing (using one of the algorithms for which we prove the claim).
% with the same complexity as it would take to evaluate the query on a deterministic \emph{bag} database of the same size as the input PDB.
We adopt a minimalistic compute-bound model of query evaluation drawn from the worst-case optimal join literature~\cite{skew,ngo-survey} to define $\qruntime{\cdot,\cdot}$.\AR{Recursive definition needs to change based on what Oliver needs. Also I think in the definition below it would be better to replace all $\dbbase$ with $D$.}
%
To decouple our results from specific join algorithms, we first abstract the cost of a join.
\begin{Definition}[Join Cost]
\label{def:join-cost}
Denote by $\jointime{R_1, \ldots, R_n}$ the runtime of an algorithm for computing the n-ary join $R_1 \bowtie \ldots \bowtie R_n$.
We require only that the algorithm must enumerate its output, i.e., that $\jointime{R_1, \ldots, R_n} \geq \Omega(|R_1(\db) \bowtie \ldots \bowtie R_n(\db)|)$.
\end{Definition}
Worst-case optimal join algorithms~\cite{skew,ngo-survey} and query evaluation via factorized databases~\cite{factorized-db} (as well as work on FAQs~\cite{DBLP:conf/pods/KhamisNR16}) can be modeled as $\raPlus$ queries (though the query size is data dependent).
For these algorithms, $\jointime{R_1, \ldots, R_n} = |R_1| + \ldots + |R_n| + |R_1(\db) \bowtie \ldots \bowtie R_n(\db)|$. Our cost model for general query evaluation follows from the join cost:
\noindent\resizebox{1\linewidth}{!}{
\begin{minipage}{1.0\linewidth}
\begin{align*}
\qruntime{R,\dbbase} & = |\dbbase.R| &
\qruntime{\sigma Q, \dbbase} & = \qruntime{Q,\dbbase} &
\qruntime{\pi Q, \dbbase} & = \qruntime{Q,\dbbase} + \abs{Q(D)}
\qruntime{R,\db} & = |\db.R| &
\qruntime{\sigma \query, \db} & = \qruntime{\query,\db} &
\qruntime{\pi \query, \db} & = \qruntime{\query,\db} + \abs{\query(\db)}
\end{align*}\\[-15mm]
\begin{align*}
\qruntime{Q \cup Q', \dbbase} & = \qruntime{Q, \dbbase} + \qruntime{Q', \dbbase} +\abs{Q(D)}+\abs{Q'(D)} \\
\qruntime{Q_1 \bowtie \ldots \bowtie Q_n, \dbbase} & = \qruntime{Q_1, \dbbase} + \ldots + \qruntime{Q_n,\dbbase} + \abs{Q_1(D) \bowtie \ldots \bowtie Q_n(D)}
\begin{align*}
\qruntime{\query \cup \query', \db} & = \qruntime{\query, \db} +
\qruntime{\query', \db} +
\abs{\query(\db)}+\abs{\query'(\db)} \\
\qruntime{\query_1 \bowtie \ldots \bowtie \query_n, \db}
& = \qruntime{\query_1, \db} + \ldots +
\qruntime{\query_n,\db} +
\jointime{\query_1(\db), \ldots, \query_n(\db)}
\end{align*}
\end{minipage}
}\\
Under this model a query $Q$ evaluated over database $\dbbase$ has runtime $O(\qruntime{Q,\dbbase})$.
Under this model, a query $Q$ evaluated over database $\db$ has runtime $O(\qruntime{Q,\db})$.
We assume that full table scans are used for every base relation access. We can model index scans by treating an index scan query $\sigma_\theta(R)$ as a base relation.
Observe that
% () .\footnote{This claim can be verified by e.g. simply looking at the {\em Generic-Join} algorithm in~\cite{skew} and {\em factorize} algorithm in~\cite{factorized-db}.} It can be verified that the above cost model on the corresponding $\raPlus$ join queries correctly captures the runtime of current best known .
It can be verified that worst-case optimal join algorithms~\cite{skew,ngo-survey}, as well as query evaluation via factorized databases~\cite{factorized-db}
%\AR{See my comment on element on whether we should include this ref or not.}
(and work on FAQs~\cite{DBLP:conf/pods/KhamisNR16}) can be modeled as $\raPlus$ queries (though the size of these queries is data dependent).\footnote{This claim can be verified by e.g. simply looking at the {\em Generic-Join} algorithm in~\cite{skew} and {\em factorize} algorithm in~\cite{factorized-db}.} It can be verified that the above cost model on the corresponding $\raPlus$ join queries correctly captures their runtime.
More specifically \Cref{lem:circ-model-runtime} and \Cref{to-be-decided} show that for any $\raPlus$ query $\query$ and $\dbbase$, there exists a circuit $\circuit$ such that $\timeOf{\abbrStepOne}(Q,\dbbase,\circuit)$ and $|\circuit|$ are both $O(\qruntime{Q, \dbbase})$. Recall we assumed these two bounds when we moved from \Cref{prob:big-o-joint-steps} to \Cref{prob:intro-stmt}.
More specifically \Cref{lem:circ-model-runtime} and \Cref{lem:tlc-is-the-same-as-det} show that for any $\raPlus$ query $\query$ and $\dbbase$, there exists a circuit $\circuit$ such that $\timeOf{\abbrStepOne}(Q,\dbbase,\circuit)$ and $|\circuit|$ are both $O(\qruntime{Q, \dbbase})$. Recall we assumed these two bounds when we moved from \Cref{prob:big-o-joint-steps} to \Cref{prob:intro-stmt}.
%
%We now make a simple observation on the above cost model:
%\begin{proposition}

View File

@ -312,6 +312,7 @@
\newcommand{\ptime}{{\sf PTIME}\xspace}
\newcommand{\timeOf}[1]{T_{#1}}
\newcommand{\qruntime}[1]{T_{det}(#1)}
\newcommand{\jointime}[1]{T_{join}(#1)}
\newcommand{\kmatchtime}{T_{match}\inparen{k, G}}