From bd13ad5569a64f6043e0de4f3e1213a3b619bbd9 Mon Sep 17 00:00:00 2001 From: Oliver Date: Sat, 18 Sep 2021 00:46:00 -0400 Subject: [PATCH] An algorithm for LC --- appendix.tex | 153 +++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 112 insertions(+), 41 deletions(-) diff --git a/appendix.tex b/appendix.tex index a6e4b68..7895315 100644 --- a/appendix.tex +++ b/appendix.tex @@ -62,74 +62,140 @@ encodes a polynomial, realized as \end{cases}\] -We define the circuit for a select-union-project-join $Q$ recursively by cases as follows. In each case, let $\tuple{V_{Q_i,\pxdb}, E_{Q_i,\pxdb}, \phi_{Q_{i},\pxdb}, \ell_{Q_i,\pxdb}}$ denote the circuit for subquery $Q_i$. +We define the circuit for a select-union-project-join $Q$ recursively by cases as follows. In each case, let $\tuple{V_{Q_i,\pxdb}, E_{Q_i,\pxdb}, \phi_{Q_{i},\pxdb}, \ell_{Q_i,\pxdb}}$ denote the circuit for subquery $Q_i$. We implicitly include in all circuits a global zero node $v_0$ s.t., $\ell_{Q, \pxdb}(v_0) = 0$ for any $Q,\pxdb$. 
+ + +\begin{algorithm} +\caption{\abbrStepOne$(\query, \dbbase)$} +\label{alg:lc} + \begin{algorithmic}[1] + \Require $\query$: query + \Require $\dbbase$: a \dbbaseName + \Ensure $\circuit = \tuple{V, E, \phi, \ell}$: a circuit encoding the lineage of each tuple in $\query(\dbbase)$ + + \If{$\query$ is $R$} + \State $V = \comprehension{v_t}{t \in \dbbase.R}$ + \State $E = \emptyset$ + \For{$t \in \dbbase.R$} + \State $\phi(t) = v_t$ \Comment{$v_t$ as defined above} + \State $\ell(v_t) = R(t)$ + \EndFor + \ElsIf{$\query$ is $\sigma_\theta(\query')$} + \State $\tuple{V, E, \phi', \ell} = \abbrStepOne(\query', \dbbase)$ + \For{$t \in \query'$} + \If{$\theta(t)$} + \State $\phi(t) = \phi'(t)$ + \Else + \State $\phi(t) = v_0$ + \EndIf + \EndFor + \ElsIf{$\query$ is $\pi_{\vec{A}}(\query')$} + \State $\tuple{V', E', \phi', \ell'} = \abbrStepOne(\query', \dbbase)$ + \State $V = V' \cup \comprehension{v_t}{t \in \pi_{\vec{A}}(\query')}$ + \State $E = E' \cup \comprehension{(\phi'(t'), v_t)}{t = \pi_{\vec{A}}t', t' \in \query', t \in \pi_{\vec{A}}(\query')}$ + \Comment{Nodes with in-degrees above 2 are corrected (with logarithmic overhead) with an equivalent fan-in tree.} + \For{$t \in \pi_{\vec{A}}(\query')$} + \State $\phi(t) = v_t$ \Comment{$v_t$ as defined above} + \State $\ell(v_t) = +$ + \EndFor + \ElsIf{$\query$ is $\query_1 \cup \query_2$} + \State $\tuple{V_1, E_1, \phi_1, \ell_1} = \abbrStepOne(\query_1, \dbbase)$ + \State $\tuple{V_2, E_2, \phi_2, \ell_2} = \abbrStepOne(\query_2, \dbbase)$ + \State $V = V_1 \cup V_2 \cup \comprehension{v_t}{t \in \query_1 \cap \query_2}$ + \State $E = E_1 \cup E_2 \cup \comprehension{(\phi_1(t), v_t), (\phi_2(t), v_t)}{t \in \query_1 \cap \query_2}$ + \State $\phi = \phi_1 \cup \phi_2$ + \State $\ell = \ell_1 \cup \ell_2$ + \For{$t \in \query_1 \cap \query_2$} + \State $\phi(t) = v_t$ \Comment{$v_t$ as defined above} + \State $\ell(v_t) = +$ + \EndFor + \ElsIf{$\query$ is $\query_1 \bowtie \ldots \bowtie \query_k$} + \For{$i 
\in [1, k]$} + \State $\tuple{V_i, E_i, \phi_i, \ell_i} = \abbrStepOne(\query_i, \dbbase)$ + \EndFor + \State $V = V_1 \cup \ldots \cup V_k \cup \comprehension{v_t}{t \in \query_1 \bowtie \ldots \bowtie \query_k}$ + \State $E = E_1 \cup \ldots \cup E_k \cup \bigcup_{i \in [1,k]} + \comprehension{(\phi_i(\pi_{\sch(\query_i)}(t)), v_t)}{t \in \query_1 \bowtie \ldots \bowtie \query_k}$\Comment{Nodes with in-degrees above 2 are corrected (with $\log_2(k)$ overhead) with an equivalent fan-in tree.} + \State $\phi = \phi_1 \cup \ldots \cup \phi_k$ + \State $\ell = \ell_1 \cup \ldots \cup \ell_k$ + \For{$t \in \query_1 \bowtie \ldots \bowtie \query_k$} + \State $\phi(t) = v_t$ + \State $\ell(v_t) = \times$ + \EndFor + + \EndIf + + \end{algorithmic} +\end{algorithm} + + +\Cref{alg:lc} defines how the circuit for a query result is constructed. We quickly review the number of vertices emitted in each case. \caseheading{Base Relation} -Let $Q$ be a base relation $R$. We define one node for each tuple. Formally, let $V_{Q,\pxdb} = \comprehension{v_t}{t\in R}$, let $\phi_{Q,\pxdb}(t) = v_t$, let $\ell_{Q,\pxdb}(v_t) = R(t)$, and let $E_{Q,\pxdb} = \emptyset$. +% Let $Q$ be a base relation $R$. We define one node for each tuple. Formally, let $V_{Q,\pxdb} = \comprehension{v_t}{t\in R}$, let $\phi_{Q,\pxdb}(t) = v_t$, let $\ell_{Q,\pxdb}(v_t) = R(t)$, and let $E_{Q,\pxdb} = \emptyset$. This circuit has $|D_\Omega.R|$ vertices. \caseheading{Selection} -Let $Q = \sigma_\theta \inparen{Q_1}$. -We re-use the circuit for $Q_1$. %, but define a new distinguished node $v_0$ with label $0$ and make it the sink node for all tuples that fail the selection predicate. -Formally, let $V_{Q,\pxdb} = V_{Q_1,\pxdb}$, let $\ell_{Q,\pxdb}(v_0) = 0$, and let $\ell_{Q,\pxdb}(v) = \ell_{Q_1,\pxdb}(v)$ for any $v \in V_{Q_1,\pxdb}$. 
Let $E_{Q,\pxdb} = E_{Q_1,\pxdb}$, and define -$$\phi_{Q,\pxdb}(t) = -\phi_{Q_{1}, \pxdb}(t) \text{ for } t \text{ s.t.}\; \theta(t).$$ -Dead sinks are iteratively removed, and so -%\AH{While not explicit, I assume a reviewer would know that the notation above discards tuples/vertices not satisfying the selection predicate.} -%v_0 & \textbf{otherwise} -%\end{cases}$$ +% Let $Q = \sigma_\theta \inparen{Q_1}$. +% We re-use the circuit for $Q_1$. %, but define a new distinguished node $v_0$ with label $0$ and make it the sink node for all tuples that fail the selection predicate. +% Let $V_{Q,\pxdb} = V_{Q_1,\pxdb} \cup \{v_0\}$, and let $\ell_{Q,\pxdb}(v) = \ell_{Q_1,\pxdb}(v)$ for any $v \in V_{Q_1,\pxdb}$. Let $E_{Q,\pxdb} = E_{Q_1,\pxdb}$, and define +% $$\phi_{Q,\pxdb}(t) = +% \phi_{Q_{1}, \pxdb}(t) \text{ for } t \text{ s.t.}\; \theta(t) \text{ and } \phi_{Q,\pxdb}(t) = v_0 \text{ otherwise}.$$ +If we assume dead sinks are iteratively garbage collected, this circuit has at most $|V_{Q_1,\pxdb}|$ vertices. \caseheading{Projection} -Let $Q = \pi_{\vct A} {Q_1}$. -We extend the circuit for ${Q_1}$ with a new set of sum vertices (i.e., vertices with label $+$) for each tuple in $Q$, and connect them to the corresponding sink nodes of the circuit for ${Q_1}$. -Naively, let $V_{Q,\pxdb} = V_{Q_1,\pxdb} \cup \comprehension{v_t}{t \in \pi_{\vct A} {Q_1}}$, let $\phi_{Q,\pxdb}(t) = v_t$, and let $\ell_{Q,\pxdb}(v_t) = +$. Finally let -$$E_{Q,\pxdb} = E_{Q_1,\pxdb} \cup \comprehension{(\phi_{Q_{1}, \pxdb}(t'), v_t)}{t = \pi_{\vct A} t', t' \in {Q_1}, t \in \pi_{\vct A} {Q_1}}$$ +% Let $Q = \pi_{\vct A} {Q_1}$. +% We extend the circuit for ${Q_1}$ with a new set of sum vertices (i.e., vertices with label $+$) for each tuple in $Q$, and connect them to the corresponding sink nodes of the circuit for ${Q_1}$. +% Naively, let $V_{Q,\pxdb} = V_{Q_1,\pxdb} \cup \comprehension{v_t}{t \in \pi_{\vct A} {Q_1}}$, let $\phi_{Q,\pxdb}(t) = v_t$, and let $\ell_{Q,\pxdb}(v_t) = +$. 
Finally let +% $$E_{Q,\pxdb} = E_{Q_1,\pxdb} \cup \comprehension{(\phi_{Q_{1}, \pxdb}(t'), v_t)}{t = \pi_{\vct A} t', t' \in {Q_1}, t \in \pi_{\vct A} {Q_1}}$$ This formulation will produce vertices with an in-degree greater than two, a problem that we correct by replacing every vertex with an in-degree over two by an equivalent fan-in tree. The resulting structure has at most $|{Q_1}|-1$ new vertices. % \AH{Is the rightmost operator \emph{supposed} to be a $-$? In the beginning we add $|\pi_{\vct A}{Q_1}|$ vertices.} The corrected circuit thus has at most $|V_{Q_1,\pxdb}|+|{Q_1}|$ vertices. \caseheading{Union} -Let $Q = {Q_1} \cup {Q_2}$. -We merge graphs and produce a sum vertex for all tuples in both sides of the union. -Formally, let $V_{Q,\pxdb} = V_{Q_1,\pxdb} \cup V_{Q_2,\pxdb} \cup \comprehension{v_t}{t \in {Q_1} \cap {Q_2}}$, let $\ell_{Q,\pxdb}(v_t) = +$, and let -\[E_{Q,\pxdb} = E_{Q_1,\pxdb} \cup E_{Q_2,\pxdb} \cup \comprehension{(\phi_{Q_{1}, \pxdb}(t), v_t), (\phi_{Q_{2}, \pxdb}(t), v_t)}{t \in {Q_1} \cap {Q_2}}\] -\[ - \phi_{Q,\pxdb}(t) = \begin{cases} -v_t & \textbf{if } t \in {Q_1} \cap {Q_1}\\ -\phi_{Q_{1}, \pxdb}(t) & \textbf{if } t \not \in {Q_2}\\ -\phi_{Q_{2}, \pxdb}(t) & \textbf{if } t \not \in {Q_1}\\ -\end{cases}\] +% Let $Q = {Q_1} \cup {Q_2}$. +% We merge graphs and produce a sum vertex for all tuples in both sides of the union. +% Formally, let $V_{Q,\pxdb} = V_{Q_1,\pxdb} \cup V_{Q_2,\pxdb} \cup \comprehension{v_t}{t \in {Q_1} \cap {Q_2}}$, let $\ell_{Q,\pxdb}(v_t) = +$, and let +% \[E_{Q,\pxdb} = E_{Q_1,\pxdb} \cup E_{Q_2,\pxdb} \cup \comprehension{(\phi_{Q_{1}, \pxdb}(t), v_t), (\phi_{Q_{2}, \pxdb}(t), v_t)}{t \in {Q_1} \cap {Q_2}}\] +% \[ +% \phi_{Q,\pxdb}(t) = \begin{cases} +% v_t & \textbf{if } t \in {Q_1} \cap {Q_1}\\ +% \phi_{Q_{1}, \pxdb}(t) & \textbf{if } t \not \in {Q_2}\\ +% \phi_{Q_{2}, \pxdb}(t) & \textbf{if } t \not \in {Q_1}\\ +% \end{cases}\] This circuit has $|V_{Q_1,\pxdb}|+|V_{Q_2,\pxdb}|+|{Q_1} \cap {Q_2}|$ vertices. 
\caseheading{$k$-ary Join} -Let $Q = {Q_1} \bowtie \ldots \bowtie {Q_k}$. -We merge graphs and produce a multiplication vertex for all tuples resulting from the join -Naively, let $V_{Q,\pxdb} = V_{Q_1,\pxdb} \cup \ldots \cup V_{Q_k,\pxdb} \cup \comprehension{v_t}{t \in {Q_1} \bowtie \ldots \bowtie {Q_k}}$, let -{\small -\begin{multline*} -E_{Q,\pxdb} = E_{Q_1,\pxdb} \cup \ldots \cup E_{Q_k,\pxdb} \cup -\left\{\; -(\phi_{Q_{1}, \pxdb}(\pi_{\sch({Q_1})}t), v_t), \right.\\ -\ldots, (\phi_{Q_k,\pxdb}(\pi_{\sch({Q_k})}t), v_t) -\;\left|\;t \in {Q_1} \bowtie \ldots \bowtie {Q_k}\;\right\} -\end{multline*} -} -Let $\ell_{Q,\pxdb}(v_t) = \times$, and let $\phi_{Q,\pxdb}(t) = v_t$ +% Let $Q = {Q_1} \bowtie \ldots \bowtie {Q_k}$. +% We merge graphs and produce a multiplication vertex for all tuples resulting from the join +% Naively, let $V_{Q,\pxdb} = V_{Q_1,\pxdb} \cup \ldots \cup V_{Q_k,\pxdb} \cup \comprehension{v_t}{t \in {Q_1} \bowtie \ldots \bowtie {Q_k}}$, let +% {\small +% \begin{multline*} +% E_{Q,\pxdb} = E_{Q_1,\pxdb} \cup \ldots \cup E_{Q_k,\pxdb} \cup +% \left\{\; +% (\phi_{Q_{1}, \pxdb}(\pi_{\sch({Q_1})}t), v_t), \right.\\ +% \ldots, (\phi_{Q_k,\pxdb}(\pi_{\sch({Q_k})}t), v_t) +% \;\left|\;t \in {Q_1} \bowtie \ldots \bowtie {Q_k}\;\right\} +% \end{multline*} +% } +% Let $\ell_{Q,\pxdb}(v_t) = \times$, and let $\phi_{Q,\pxdb}(t) = v_t$ As in projection, newly created vertices will have an in-degree of $k$, and a fan-in tree is required. There are $|{Q_1} \bowtie \ldots \bowtie {Q_k}|$ such vertices, so the corrected circuit has $|V_{Q_1,\pxdb}|+\ldots+|V_{Q_k,\pxdb}|+(k-1)|{Q_1} \bowtie \ldots \bowtie {Q_k}|$ vertices. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{Lemma}\label{lem:circ-model-runtime} \label{lem:circuits-model-runtime} -Given a \abbrNXPDB $\pxdb$ with \dbbaseName $\dbbase$, and query plan $Q$, the runtime of $Q$ over $\dbbase$ has the same or greater complexity as the size of the lineage of $Q(\pxdb)$. 
That is, we have $\abs{V_{Q,\pxdb}} \leq (k-1)\qruntime{Q, \dbbase}$, where $k$ is the maximal degree of any polynomial in $Q(\pxdb)$. +Given a \abbrNXPDB $\pxdb$ with \dbbaseName $\dbbase$, and query plan $Q$, the runtime of $Q$ over $\dbbase$ has the same or greater complexity as the size of the lineage of $Q(\pxdb)$. That is, we have $\abs{V_{Q,\pxdb}} \leq (k-1)\qruntime{Q, \dbbase}+1$, where $k$ is the maximal degree of any polynomial in $Q(\pxdb)$. \end{Lemma} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %\noindent The proof is shown in \Cref{app:subsec-lem-lin-vs-qplan}. %\subsection{Proof for \Cref{lem:circuits-model-runtime}}\label{app:subsec-lem-lin-vs-qplan} \begin{proof} -Proof by induction. The base case is a base relation: $Q = R$ and is trivially true since $|V_{R,\pxdb}| = |D_\Omega.R|$. +We prove by induction that $\abs{V_{Q,\pxdb} - \{v_0\}} \leq (k-1)\qruntime{Q, \dbbase}$. For clarity, we implicitly exclude $v_0$ in the proof below. + +The base case is a base relation: $Q = R$ and is trivially true since $|V_{R,\pxdb}| = |D_\Omega.R|$. For the inductive step, we assume that we have circuits for subplans $Q_1, \ldots, Q_n$ such that $|V_{Q_i,\pxdb}| \leq (k_i-1)\qruntime{Q_i,\dbbase}$ where $k_i$ is the degree of $Q_i$. \caseheading{Selection} @@ -182,7 +248,12 @@ The property holds for all recursive queries, and the proof holds. \qed \end{proof} -With \cref{lem:circ-model-runtime} and our upper bound results on \approxq, we now have all the pieces to argue that using our approximation algorithm, the expected multiplicities of an $\raPlus$ query can be computed in essentially the same runtime as deterministic query processing for the same query, proving claim (iv) of the Introduction. +We next need to show that we can construct the circuit in time linear in the deterministic runtime. 
+\begin{Lemma}\label{lem:tlc-is-the-same-as-det} +Given a query $\query$ over a \dbbaseName $\dbbase$, the runtime $\timeOf{\abbrStepOne}(\query,\dbbase,\circuit) \le O(\qruntime{\query, \dbbase})$. +\end{Lemma} + +With \Cref{lem:circ-model-runtime,lem:tlc-is-the-same-as-det} and our upper bound results on \approxq, we now have all the pieces to argue that using our approximation algorithm, the expected multiplicities of an $\raPlus$ query can be computed in essentially the same runtime as deterministic query processing for the same query, proving claim (iv) of the Introduction. \section{Proof of \Cref{cor:cost-model}} \begin{proof}