An algorithm for LC

This commit is contained in:
Oliver Kennedy 2021-09-18 00:46:00 -04:00
parent f6ba1e8a2a
commit bd13ad5569
Signed by: okennedy
GPG key ID: 3E5F9B3ABD3FDB60

View file

@ -62,74 +62,140 @@ encodes a polynomial, realized as
\end{cases}\]
We define the circuit for a select-union-project-join $Q$ recursively by cases as follows. In each case, let $\tuple{V_{Q_i,\pxdb}, E_{Q_i,\pxdb}, \phi_{Q_{i},\pxdb}, \ell_{Q_i,\pxdb}}$ denote the circuit for subquery $Q_i$.
We define the circuit for a select-union-project-join query $Q$ recursively by cases as follows. In each case, let $\tuple{V_{Q_i,\pxdb}, E_{Q_i,\pxdb}, \phi_{Q_{i},\pxdb}, \ell_{Q_i,\pxdb}}$ denote the circuit for subquery $Q_i$. We implicitly include in all circuits a global zero node $v_0$ such that $\ell_{Q, \pxdb}(v_0) = 0$ for any $Q,\pxdb$.
\begin{algorithm}
\caption{\abbrStepOne$(\query, \dbbase)$}
\label{alg:lc}
\begin{algorithmic}[1]
\Require $\query$: query
\Require $\dbbase$: a \dbbaseName
\Ensure $\circuit = \tuple{V, E, \phi, \ell}$: a circuit encoding the lineage of each tuple in $\query(\dbbase)$
\If{$\query$ is $R$}
\State $V = \comprehension{v_t}{t \in \dbbase.R}$
\State $E = \emptyset$
\For{$t \in \dbbase.R$}
\State $\phi(t) = v_t$ \Comment{$v_t$ as defined above}
\State $\ell(v_t) = R(t)$
\EndFor
\ElsIf{$\query$ is $\sigma_\theta(\query')$}
\State $\tuple{V, E, \phi', \ell} = \abbrStepOne(\query', \dbbase)$
\For{$t \in \query'(\dbbase)$}
\If{$\theta(t)$}
\State $\phi(t) = \phi'(t)$
\Else
\State $\phi(t) = v_0$
\EndIf
\EndFor
\ElsIf{$\query$ is $\pi_{\vec{A}}(\query')$}
\State $\tuple{V', E', \phi', \ell'} = \abbrStepOne(\query', \dbbase)$
\State $V = V' \cup \comprehension{v_t}{t \in \pi_{\vec{A}}(\query')}$
\State $E = E' \cup \comprehension{(\phi'(t'), v_t)}{t = \pi_{\vec{A}}t', t' \in \query', t \in \pi_{\vec{A}}(\query')}$
\Comment{Nodes with in-degrees above 2 are corrected (with logarithmic overhead) with an equivalent fan-in tree.}
\For{$t \in \pi_{\vec{A}}(\query')$}
\State $\phi(t) = v_t$ \Comment{$v_t$ as defined above}
\State $\ell(v_t) = +$
\EndFor
\ElsIf{$\query$ is $\query_1 \cup \query_2$}
\State $\tuple{V_1, E_1, \phi_1, \ell_1} = \abbrStepOne(\query_1, \dbbase)$
\State $\tuple{V_2, E_2, \phi_2, \ell_2} = \abbrStepOne(\query_2, \dbbase)$
\State $V = V_1 \cup V_2 \cup \comprehension{v_t}{t \in \query_1 \cap \query_2}$
\State $E = E_1 \cup E_2 \cup \comprehension{(\phi_1(t), v_t), (\phi_2(t), v_t)}{t \in \query_1 \cap \query_2}$
\State $\phi = \phi_1 \cup \phi_2$
\State $\ell = \ell_1 \cup \ell_2$
\For{$t \in \query_1 \cap \query_2$}
\State $\phi(t) = v_t$ \Comment{$v_t$ as defined above}
\State $\ell(v_t) = +$
\EndFor
\ElsIf{$\query$ is $\query_1 \bowtie \ldots \bowtie \query_k$}
\For{$i \in [1, k]$}
\State $\tuple{V_i, E_i, \phi_i, \ell_i} = \abbrStepOne(\query_i, \dbbase)$
\EndFor
\State $V = V_1 \cup \ldots \cup V_k \cup \comprehension{v_t}{t \in \query_1 \bowtie \ldots \bowtie \query_k}$
\State $E = E_1 \cup \ldots \cup E_k \cup \bigcup_{i \in [1,k]}
\comprehension{(\phi_i(\pi_{\sch(\query_i)}(t)), v_t)}{t \in \query_1 \bowtie \ldots \bowtie \query_k}$\Comment{Nodes with in-degrees above 2 are corrected (with $\log_2(k)$ overhead) with an equivalent fan-in tree.}
\State $\phi = \phi_1 \cup \ldots \cup \phi_k$
\State $\ell = \ell_1 \cup \ldots \cup \ell_k$
\For{$t \in \query_1 \bowtie \ldots \bowtie \query_k$}
\State $\phi(t) = v_t$
\State $\ell(v_t) = \times$
\EndFor
\EndIf
\end{algorithmic}
\end{algorithm}
\Cref{alg:lc} defines how the circuit for a query result is constructed. We quickly review the number of vertices emitted in each case.
\caseheading{Base Relation}
Let $Q$ be a base relation $R$. We define one node for each tuple. Formally, let $V_{Q,\pxdb} = \comprehension{v_t}{t\in R}$, let $\phi_{Q,\pxdb}(t) = v_t$, let $\ell_{Q,\pxdb}(v_t) = R(t)$, and let $E_{Q,\pxdb} = \emptyset$.
% Let $Q$ be a base relation $R$. We define one node for each tuple. Formally, let $V_{Q,\pxdb} = \comprehension{v_t}{t\in R}$, let $\phi_{Q,\pxdb}(t) = v_t$, let $\ell_{Q,\pxdb}(v_t) = R(t)$, and let $E_{Q,\pxdb} = \emptyset$.
This circuit has $|D_\Omega.R|$ vertices.
\caseheading{Selection}
Let $Q = \sigma_\theta \inparen{Q_1}$.
We re-use the circuit for $Q_1$. %, but define a new distinguished node $v_0$ with label $0$ and make it the sink node for all tuples that fail the selection predicate.
Formally, let $V_{Q,\pxdb} = V_{Q_1,\pxdb}$, let $\ell_{Q,\pxdb}(v_0) = 0$, and let $\ell_{Q,\pxdb}(v) = \ell_{Q_1,\pxdb}(v)$ for any $v \in V_{Q_1,\pxdb}$. Let $E_{Q,\pxdb} = E_{Q_1,\pxdb}$, and define
$$\phi_{Q,\pxdb}(t) =
\phi_{Q_{1}, \pxdb}(t) \text{ for } t \text{ s.t.}\; \theta(t).$$
Dead sinks are iteratively removed, and so
%\AH{While not explicit, I assume a reviewer would know that the notation above discards tuples/vertices not satisfying the selection predicate.}
%v_0 & \textbf{otherwise}
%\end{cases}$$
% Let $Q = \sigma_\theta \inparen{Q_1}$.
% We re-use the circuit for $Q_1$. %, but define a new distinguished node $v_0$ with label $0$ and make it the sink node for all tuples that fail the selection predicate.
% Let $V_{Q,\pxdb} = V_{Q_1,\pxdb} \cup \{v_0\}$, and let $\ell_{Q,\pxdb}(v) = \ell_{Q_1,\pxdb}(v)$ for any $v \in V_{Q_1,\pxdb}$. Let $E_{Q,\pxdb} = E_{Q_1,\pxdb}$, and define
% $$\phi_{Q,\pxdb}(t) =
% \phi_{Q_{1}, \pxdb}(t) \text{ for } t \text{ s.t.}\; \theta(t) \text{ and } \phi_{Q,\pxdb}(t) = v_0 \text{ otherwise}.$$
If we assume dead sinks are iteratively garbage collected,
this circuit has at most $|V_{Q_1,\pxdb}|$ vertices.
\caseheading{Projection}
Let $Q = \pi_{\vct A} {Q_1}$.
We extend the circuit for ${Q_1}$ with a new set of sum vertices (i.e., vertices with label $+$) for each tuple in $Q$, and connect them to the corresponding sink nodes of the circuit for ${Q_1}$.
Naively, let $V_{Q,\pxdb} = V_{Q_1,\pxdb} \cup \comprehension{v_t}{t \in \pi_{\vct A} {Q_1}}$, let $\phi_{Q,\pxdb}(t) = v_t$, and let $\ell_{Q,\pxdb}(v_t) = +$. Finally let
$$E_{Q,\pxdb} = E_{Q_1,\pxdb} \cup \comprehension{(\phi_{Q_{1}, \pxdb}(t'), v_t)}{t = \pi_{\vct A} t', t' \in {Q_1}, t \in \pi_{\vct A} {Q_1}}$$
% Let $Q = \pi_{\vct A} {Q_1}$.
% We extend the circuit for ${Q_1}$ with a new set of sum vertices (i.e., vertices with label $+$) for each tuple in $Q$, and connect them to the corresponding sink nodes of the circuit for ${Q_1}$.
% Naively, let $V_{Q,\pxdb} = V_{Q_1,\pxdb} \cup \comprehension{v_t}{t \in \pi_{\vct A} {Q_1}}$, let $\phi_{Q,\pxdb}(t) = v_t$, and let $\ell_{Q,\pxdb}(v_t) = +$. Finally let
% $$E_{Q,\pxdb} = E_{Q_1,\pxdb} \cup \comprehension{(\phi_{Q_{1}, \pxdb}(t'), v_t)}{t = \pi_{\vct A} t', t' \in {Q_1}, t \in \pi_{\vct A} {Q_1}}$$
This formulation will produce vertices with an in-degree greater than two, a problem that we correct by replacing every vertex with an in-degree over two by an equivalent fan-in tree. The resulting structure has at most $|{Q_1}|-1$ new vertices.
% \AH{Is the rightmost operator \emph{supposed} to be a $-$? In the beginning we add $|\pi_{\vct A}{Q_1}|$ vertices.}
The corrected circuit thus has at most $|V_{Q_1,\pxdb}|+|{Q_1}|$ vertices.
\caseheading{Union}
Let $Q = {Q_1} \cup {Q_2}$.
We merge graphs and produce a sum vertex for all tuples in both sides of the union.
Formally, let $V_{Q,\pxdb} = V_{Q_1,\pxdb} \cup V_{Q_2,\pxdb} \cup \comprehension{v_t}{t \in {Q_1} \cap {Q_2}}$, let $\ell_{Q,\pxdb}(v_t) = +$, and let
\[E_{Q,\pxdb} = E_{Q_1,\pxdb} \cup E_{Q_2,\pxdb} \cup \comprehension{(\phi_{Q_{1}, \pxdb}(t), v_t), (\phi_{Q_{2}, \pxdb}(t), v_t)}{t \in {Q_1} \cap {Q_2}}\]
\[
\phi_{Q,\pxdb}(t) = \begin{cases}
v_t & \textbf{if } t \in {Q_1} \cap {Q_2}\\
\phi_{Q_{1}, \pxdb}(t) & \textbf{if } t \not \in {Q_2}\\
\phi_{Q_{2}, \pxdb}(t) & \textbf{if } t \not \in {Q_1}\\
\end{cases}\]
% Let $Q = {Q_1} \cup {Q_2}$.
% We merge graphs and produce a sum vertex for all tuples in both sides of the union.
% Formally, let $V_{Q,\pxdb} = V_{Q_1,\pxdb} \cup V_{Q_2,\pxdb} \cup \comprehension{v_t}{t \in {Q_1} \cap {Q_2}}$, let $\ell_{Q,\pxdb}(v_t) = +$, and let
% \[E_{Q,\pxdb} = E_{Q_1,\pxdb} \cup E_{Q_2,\pxdb} \cup \comprehension{(\phi_{Q_{1}, \pxdb}(t), v_t), (\phi_{Q_{2}, \pxdb}(t), v_t)}{t \in {Q_1} \cap {Q_2}}\]
% \[
% \phi_{Q,\pxdb}(t) = \begin{cases}
% v_t & \textbf{if } t \in {Q_1} \cap {Q_1}\\
% \phi_{Q_{1}, \pxdb}(t) & \textbf{if } t \not \in {Q_2}\\
% \phi_{Q_{2}, \pxdb}(t) & \textbf{if } t \not \in {Q_1}\\
% \end{cases}\]
This circuit has $|V_{Q_1,\pxdb}|+|V_{Q_2,\pxdb}|+|{Q_1} \cap {Q_2}|$ vertices.
\caseheading{$k$-ary Join}
Let $Q = {Q_1} \bowtie \ldots \bowtie {Q_k}$.
We merge graphs and produce a multiplication vertex for all tuples resulting from the join.
Naively, let $V_{Q,\pxdb} = V_{Q_1,\pxdb} \cup \ldots \cup V_{Q_k,\pxdb} \cup \comprehension{v_t}{t \in {Q_1} \bowtie \ldots \bowtie {Q_k}}$, let
{\small
\begin{multline*}
E_{Q,\pxdb} = E_{Q_1,\pxdb} \cup \ldots \cup E_{Q_k,\pxdb} \cup
\left\{\;
(\phi_{Q_{1}, \pxdb}(\pi_{\sch({Q_1})}t), v_t), \right.\\
\ldots, (\phi_{Q_k,\pxdb}(\pi_{\sch({Q_k})}t), v_t)
\;\left|\;t \in {Q_1} \bowtie \ldots \bowtie {Q_k}\;\right\}
\end{multline*}
}
Let $\ell_{Q,\pxdb}(v_t) = \times$, and let $\phi_{Q,\pxdb}(t) = v_t$.
% Let $Q = {Q_1} \bowtie \ldots \bowtie {Q_k}$.
% We merge graphs and produce a multiplication vertex for all tuples resulting from the join
% Naively, let $V_{Q,\pxdb} = V_{Q_1,\pxdb} \cup \ldots \cup V_{Q_k,\pxdb} \cup \comprehension{v_t}{t \in {Q_1} \bowtie \ldots \bowtie {Q_k}}$, let
% {\small
% \begin{multline*}
% E_{Q,\pxdb} = E_{Q_1,\pxdb} \cup \ldots \cup E_{Q_k,\pxdb} \cup
% \left\{\;
% (\phi_{Q_{1}, \pxdb}(\pi_{\sch({Q_1})}t), v_t), \right.\\
% \ldots, (\phi_{Q_k,\pxdb}(\pi_{\sch({Q_k})}t), v_t)
% \;\left|\;t \in {Q_1} \bowtie \ldots \bowtie {Q_k}\;\right\}
% \end{multline*}
% }
% Let $\ell_{Q,\pxdb}(v_t) = \times$, and let $\phi_{Q,\pxdb}(t) = v_t$
As in projection, newly created vertices will have an in-degree of $k$, and a fan-in tree is required.
There are $|{Q_1} \bowtie \ldots \bowtie {Q_k}|$ such vertices, so the corrected circuit has $|V_{Q_1,\pxdb}|+\ldots+|V_{Q_k,\pxdb}|+(k-1)|{Q_1} \bowtie \ldots \bowtie {Q_k}|$ vertices.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Lemma}\label{lem:circ-model-runtime}
\label{lem:circuits-model-runtime}
Given a \abbrNXPDB $\pxdb$ with \dbbaseName $\dbbase$, and query plan $Q$, the runtime of $Q$ over $\dbbase$ has the same or greater complexity as the size of the lineage of $Q(\pxdb)$. That is, we have $\abs{V_{Q,\pxdb}} \leq (k-1)\qruntime{Q, \dbbase}$, where $k$ is the maximal degree of any polynomial in $Q(\pxdb)$.
Given a \abbrNXPDB $\pxdb$ with \dbbaseName $\dbbase$, and query plan $Q$, the runtime of $Q$ over $\dbbase$ has the same or greater complexity as the size of the lineage of $Q(\pxdb)$. That is, we have $\abs{V_{Q,\pxdb}} \leq (k-1)\qruntime{Q, \dbbase}+1$, where $k$ is the maximal degree of any polynomial in $Q(\pxdb)$.
\end{Lemma}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%\noindent The proof is shown in \Cref{app:subsec-lem-lin-vs-qplan}.
%\subsection{Proof for \Cref{lem:circuits-model-runtime}}\label{app:subsec-lem-lin-vs-qplan}
\begin{proof}
Proof by induction. The base case is a base relation: $Q = R$ and is trivially true since $|V_{R,\pxdb}| = |D_\Omega.R|$.
We prove by induction that $\abs{V_{Q,\pxdb} - \{v_0\}} \leq (k-1)\qruntime{Q, \dbbase}$. For clarity, we implicitly exclude $v_0$ in the proof below.
The base case is a base relation: $Q = R$ and is trivially true since $|V_{R,\pxdb}| = |D_\Omega.R|$.
For the inductive step, we assume that we have circuits for subplans $Q_1, \ldots, Q_n$ such that $|V_{Q_i,\pxdb}| \leq (k_i-1)\qruntime{Q_i,\dbbase}$ where $k_i$ is the degree of $Q_i$.
\caseheading{Selection}
@ -182,7 +248,12 @@ The property holds for all recursive queries, and the proof holds.
\qed
\end{proof}
With \cref{lem:circ-model-runtime} and our upper bound results on \approxq, we now have all the pieces to argue that using our approximation algorithm, the expected multiplicities of an $\raPlus$ query can be computed in essentially the same runtime as deterministic query processing for the same query, proving claim (iv) of the Introduction.
We next need to show that we can construct the circuit in time linear in the deterministic runtime.
\begin{Lemma}\label{lem:tlc-is-the-same-as-det}
Given a query $\query$ over a \dbbaseName $\dbbase$, the runtime $\timeOf{\abbrStepOne}(\query,\dbbase,\circuit) \le O(\qruntime{\query, \dbbase})$.
\end{Lemma}
With \Cref{lem:circ-model-runtime,lem:tlc-is-the-same-as-det} and our upper bound results on \approxq, we now have all the pieces to argue that using our approximation algorithm, the expected multiplicities of an $\raPlus$ query can be computed in essentially the same runtime as deterministic query processing for the same query, proving claim (iv) of the Introduction.
\section{Proof of \Cref{cor:cost-model}}
\begin{proof}