%!TEX root=./main.tex %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \section{Missing details from Section~\ref{sec:background}}\label{sec:proofs-background} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsection{$\semK$-relations and $\semNX$-PDBs}\label{subsec:supp-mat-background}\label{subsec:supp-mat-krelations} \input{app_notation-background} \section{Missing details from Section~\ref{sec:hard}} \label{app:single-mult-p} \input{app_hardness-results} \section{Missing Details from Section~\ref{sec:algo}}\label{sec:proofs-approx-alg} \input{app_approx-alg-defs-and-examples} \input{app_approx-alg-analysis} \input{app_one-pass-analysis} \input{app_samp-monom-analysis} \subsection{Experimental Results}\label{app:subsec:experiment} \input{experiments} \section{Circuits}\label{app:sec-cicuits} \subsection{Representing Polynomials with Circuits}\label{app:subsec-rep-poly-lin-circ} \subsubsection{Circuits for query plans} \label{sec:circuits-formal} We now formalize circuits and the construction of circuits for SPJU queries. As mentioned earlier, we represent lineage polynomials as arithmetic circuits over $\mathbb N$-valued variables with $+$, $\times$. A circuit for query $Q$ and $\semNX$-PDB $\pxdb$ is a directed acyclic graph $\tuple{V_{Q,\pxdb}, E_{Q,\pxdb}, \phi_{Q,\pxdb}, \ell_{Q,\pxdb}}$ with vertices $V_{Q,\pxdb}$ and directed edges $E_{Q,\pxdb} \subset {V_{Q,\pxdb}}^2$. The sink function $\phi_{Q,\pxdb} : \udom^n \rightarrow V_{Q,\pxdb}$ is a partial function that maps the tuples of the $n$-ary relation $Q(\pxdb)$ to vertices. We require that $\phi_{Q,\pxdb}$'s range be limited to sink vertices (i.e., vertices with out-degree 0). %We call a sink vertex not in the range of $\phi_R$ a \emph{dead sink}. 
A function $\ell_{Q,\pxdb} : V_{Q,\pxdb} \rightarrow \{\;+,\times\;\}\cup \mathbb N \cup \vct X$ assigns a label to each node: Source nodes (i.e., vertices with in-degree 0) are labeled with constants or variables (i.e., $\mathbb N \cup \vct X$), while the remaining nodes are labeled with the symbol $+$ or $\times$. We require that vertices have an in-degree of at most two. %For the specifics on how to construct a circuit to encode the polynomials of all result tuples for a query and $\semNX$-PDB see \Cref{app:subsec-rep-poly-lin-circ}. Note that we can construct circuits for \bis in time linear in the time required for deterministic query processing over a possible world of the \bi under the aforementioned assumption that $\abs{\pxdb} \leq c \cdot \abs{\db}$. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsubsection{Circuit size vs. runtime} \label{sec:circuit-runtime} \newcommand{\bagdbof}{\textsc{bag}(\pxdb)} We now connect the size of a circuit (where the size of a circuit is the number of vertices in the corresponding DAG) %\footnote{since each node has indegree at most two, this also is the same up to constants to counting the number of edges in the DAG.}) for a given SPJU query $Q$ and $\semNX$-PDB $\pxdb$ to its $\qruntime{Q,\db}$ where $\db$ is one of the possible worlds of $\pxdb$. We do this formally by showing that the size of the circuit is asymptotically no worse than the corresponding runtime of a large class of deterministic query processing algorithms. 
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \newcommand{\getpoly}[1]{\textbf{lin}\inparen{#1}} Each vertex $v \in V_{Q,\pxdb}$ in the arithmetic circuit \[\tuple{V_{Q,\pxdb}, E_{Q,\pxdb}, \phi_{Q,\pxdb}, \ell_{Q,\pxdb}}\] encodes a polynomial, realized as \[\getpoly{v} = \begin{cases} \sum_{v' : (v',v) \in E_{Q,\pxdb}} \getpoly{v'} & \textbf{if } \ell_{Q,\pxdb}(v) = +\\ \prod_{v' : (v',v) \in E_{Q,\pxdb}} \getpoly{v'} & \textbf{if } \ell_{Q,\pxdb}(v) = \times\\ \ell_{Q,\pxdb}(v) & \textbf{otherwise.} \end{cases}\] We define the circuit for a select-union-project-join query $Q$ recursively by cases as follows. In each case, let $\tuple{V_{Q_i,\pxdb}, E_{Q_i,\pxdb}, \phi_{Q_{i},\pxdb}, \ell_{Q_i,\pxdb}}$ denote the circuit for subquery $Q_i$. \caseheading{Base Relation} Let $Q$ be a base relation $R$. We define one node for each tuple. Formally, let $V_{Q,\pxdb} = \comprehension{v_t}{t\in R}$, let $\phi_{Q,\pxdb}(t) = v_t$, let $\ell_{Q,\pxdb}(v_t) = R(t)$, and let $E_{Q,\pxdb} = \emptyset$. This circuit has $|R|$ vertices. \caseheading{Selection} Let $Q = \sigma_\theta \inparen{Q_1}$. We re-use the circuit for $Q_1$. %, but define a new distinguished node $v_0$ with label $0$ and make it the sink node for all tuples that fail the selection predicate. Formally, let $V_{Q,\pxdb} = V_{Q_1,\pxdb}$, and let $\ell_{Q,\pxdb}(v) = \ell_{Q_1,\pxdb}(v)$ for any $v \in V_{Q_1,\pxdb}$. Let $E_{Q,\pxdb} = E_{Q_1,\pxdb}$, and define \[\phi_{Q,\pxdb}(t) = \phi_{Q_{1}, \pxdb}(t) \text{ for } t \text{ s.t.}\; \theta(t).\] Dead sinks (i.e., sink vertices not in the range of $\phi_{Q,\pxdb}$) are iteratively removed, and so %\AH{While not explicit, I assume a reviewer would know that the notation above discards tuples/vertices not satisfying the selection predicate.} %v_0 & \textbf{otherwise} %\end{cases}$$ this circuit has at most $|V_{Q_1,\pxdb}|$ vertices. \caseheading{Projection} Let $Q = \pi_{\vct A} {Q_1}$. 
We extend the circuit for ${Q_1}$ with a new set of sum vertices (i.e., vertices with label $+$) for each tuple in $Q$, and connect them to the corresponding sink nodes of the circuit for ${Q_1}$. Naively, let $V_{Q,\pxdb} = V_{Q_1,\pxdb} \cup \comprehension{v_t}{t \in \pi_{\vct A} {Q_1}}$, let $\phi_{Q,\pxdb}(t) = v_t$, and let $\ell_{Q,\pxdb}(v_t) = +$. Finally let \[E_{Q,\pxdb} = E_{Q_1,\pxdb} \cup \comprehension{(\phi_{Q_{1}, \pxdb}(t'), v_t)}{t = \pi_{\vct A} t', t' \in {Q_1}, t \in \pi_{\vct A} {Q_1}}.\] This formulation will produce vertices with an in-degree greater than two, a problem that we correct by replacing every vertex with an in-degree over two by an equivalent fan-in tree. The resulting structure has at most $|{Q_1}|-1$ new vertices. % \AH{Is the rightmost operator \emph{supposed} to be a $-$? In the beginning we add $|\pi_{\vct A}{Q_1}|$ vertices.} The corrected circuit thus has at most $|V_{Q_1,\pxdb}|+|{Q_1}|$ vertices. \caseheading{Union} Let $Q = {Q_1} \cup {Q_2}$. We merge graphs and produce a sum vertex for all tuples in both sides of the union. Formally, let $V_{Q,\pxdb} = V_{Q_1,\pxdb} \cup V_{Q_2,\pxdb} \cup \comprehension{v_t}{t \in {Q_1} \cap {Q_2}}$, let $\ell_{Q,\pxdb}(v_t) = +$, and let \[E_{Q,\pxdb} = E_{Q_1,\pxdb} \cup E_{Q_2,\pxdb} \cup \comprehension{(\phi_{Q_{1}, \pxdb}(t), v_t), (\phi_{Q_{2}, \pxdb}(t), v_t)}{t \in {Q_1} \cap {Q_2}}\] \[ \phi_{Q,\pxdb}(t) = \begin{cases} v_t & \textbf{if } t \in {Q_1} \cap {Q_2}\\ \phi_{Q_{1}, \pxdb}(t) & \textbf{if } t \not \in {Q_2}\\ \phi_{Q_{2}, \pxdb}(t) & \textbf{if } t \not \in {Q_1}.\\ \end{cases}\] This circuit has $|V_{Q_1,\pxdb}|+|V_{Q_2,\pxdb}|+|{Q_1} \cap {Q_2}|$ vertices. \caseheading{$k$-ary Join} Let $Q = {Q_1} \bowtie \ldots \bowtie {Q_k}$. 
We merge graphs and produce a multiplication vertex for all tuples resulting from the join. Naively, let $V_{Q,\pxdb} = V_{Q_1,\pxdb} \cup \ldots \cup V_{Q_k,\pxdb} \cup \comprehension{v_t}{t \in {Q_1} \bowtie \ldots \bowtie {Q_k}}$, let {\small \begin{multline*} E_{Q,\pxdb} = E_{Q_1,\pxdb} \cup \ldots \cup E_{Q_k,\pxdb} \cup \left\{\; (\phi_{Q_{1}, \pxdb}(\pi_{\sch({Q_1})}t), v_t), \right.\\ \ldots, (\phi_{Q_k,\pxdb}(\pi_{\sch({Q_k})}t), v_t) \;\left|\;t \in {Q_1} \bowtie \ldots \bowtie {Q_k}\;\right\} \end{multline*} } Let $\ell_{Q,\pxdb}(v_t) = \times$, and let $\phi_{Q,\pxdb}(t) = v_t$. As in projection, newly created vertices will have an in-degree of $k$, and a fan-in tree is required. There are $|{Q_1} \bowtie \ldots \bowtie {Q_k}|$ such vertices, so the corrected circuit has $|V_{Q_1,\pxdb}|+\ldots+|V_{Q_k,\pxdb}|+(k-1)|{Q_1} \bowtie \ldots \bowtie {Q_k}|$ vertices. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{Lemma}\label{lem:circ-model-runtime} \label{lem:circuits-model-runtime} Given a $\semNX$-PDB $\pxdb$ and query plan $Q$, the runtime of $Q$ over $\pxdb$ has complexity at least as large as the size of the lineage of $Q(\pxdb)$. That is, we have $\abs{V_{Q,\pxdb}} \leq (k-1)\qruntime{Q,\pxdb}$, where $k$ is the maximal degree of any polynomial in $Q(\pxdb)$. \end{Lemma} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %\noindent The proof is shown in \Cref{app:subsec-lem-lin-vs-qplan}. %\subsection{Proof for \Cref{lem:circuits-model-runtime}}\label{app:subsec-lem-lin-vs-qplan} \begin{proof} Proof by induction. The base case is a base relation: $Q = R$ and is trivially true since $|V_{R,\pxdb}| = |R|$. For the inductive step, we assume that we have circuits for subplans $Q_1, \ldots, Q_n$ such that $|V_{Q_i,\pxdb}| \leq (k_i-1)\qruntime{Q_i,\pxdb}$ where $k_i$ is the degree of $Q_i$. \caseheading{Selection} Assume that $Q = \sigma_\theta(Q_1)$. 
The circuit for $Q$ has $|V_{Q,\pxdb}| \leq |V_{Q_1,\pxdb}|$ vertices, so from the inductive assumption and $\qruntime{Q,\pxdb} = \qruntime{Q_1,\pxdb}$ (by definition), we have $|V_{Q,\pxdb}| \leq (k-1) \qruntime{Q,\pxdb} $. % \AH{Technically, $\kElem$ is the degree of $\poly_1$, but I guess this is a moot point since one can argue that $\kElem$ is also the degree of $\poly$.} % OK: Correct \caseheading{Projection} Assume that $Q = \pi_{\vct A}(Q_1)$. The circuit for $Q$ has at most $|V_{Q_1,\pxdb}|+|{Q_1}|$ vertices. % \AH{The combination of terms above doesn't follow the details for projection above.} \begin{align*} |V_{Q,\pxdb}| & \leq |V_{Q_1,\pxdb}| + |Q_1|\\ %\intertext{By \Cref{prop:queries-need-to-output-tuples} $\qruntime{Q_1,\pxdb} \geq |Q_1|$} %& \leq |V_{Q_1,\pxdb}| + 2 \qruntime{Q_1,\pxdb}\\ \intertext{(From the inductive assumption)} & \leq (k-1)\qruntime{Q_1,\pxdb} + \abs{Q_1}\\ \intertext{(By definition of $\qruntime{Q,\pxdb}$)} & \le (k-1)\qruntime{Q,\pxdb}. \end{align*} \caseheading{Union} Assume that $Q = Q_1 \cup Q_2$. The circuit for $Q$ has $|V_{Q_1,\pxdb}|+|V_{Q_2,\pxdb}|+|{Q_1} \cap {Q_2}|$ vertices. \begin{align*} |V_{Q,\pxdb}| & \leq |V_{Q_1,\pxdb}|+|V_{Q_2,\pxdb}|+|{Q_1}|+|{Q_2}|\\ %\intertext{By \Cref{prop:queries-need-to-output-tuples} $\qruntime{Q_1,\pxdb} \geq |Q_1|$} %& \leq |V_{Q_1,\pxdb}|+|V_{Q_2,\pxdb}|+\qruntime{Q_1,\pxdb}+\qruntime{Q_2,\pxdb}|\\ \intertext{(From the inductive assumption)} & \leq (k-1)(\qruntime{Q_1,\pxdb} + \qruntime{Q_2,\pxdb}) + (\abs{Q_1} + \abs{Q_2})\\ \intertext{(By definition of $\qruntime{Q,\pxdb}$)} & \leq (k-1)(\qruntime{Q,\pxdb}). \end{align*} \caseheading{$k$-ary Join} Assume that $Q = Q_1 \bowtie \ldots \bowtie Q_k$. The circuit for $Q$ has $|V_{Q_1,\pxdb}|+\ldots+|V_{Q_k,\pxdb}|+(k-1)|{Q_1} \bowtie \ldots \bowtie {Q_k}|$ vertices. 
\begin{align*} |V_{Q,\pxdb}| & = |V_{Q_1,\pxdb}|+\ldots+|V_{Q_k,\pxdb}|+(k-1)|{Q_1} \bowtie \ldots \bowtie {Q_k}|\\ \intertext{(From the inductive assumption and noting $\forall i: k_i \leq k-1$)} & \leq (k-1)\qruntime{Q_1,\pxdb}+\ldots+(k-1)\qruntime{Q_k,\pxdb}+\\ &\;\;\; (k-1)|{Q_1} \bowtie \ldots \bowtie {Q_k}|\\ & \leq (k-1)(\qruntime{Q_1,\pxdb}+\ldots+\qruntime{Q_k,\pxdb}+\\ &\;\;\;|{Q_1} \bowtie \ldots \bowtie {Q_k}|)\\ \intertext{(By definition of $\qruntime{Q,\pxdb}$)} & = (k-1)\qruntime{Q,\pxdb}. \end{align*} The property holds in every case of the recursive definition of $Q$, which completes the induction. \qed \end{proof} With \cref{lem:circ-model-runtime} and our upper bound results on \approxq, we now have all the pieces to argue that using our approximation algorithm, the expected multiplicities of an $\raPlus$ query can be computed in essentially the same runtime as deterministic query processing for the same query, proving claim (iv) of the Introduction. %%% Local Variables: %%% mode: latex %%% TeX-master: "main" %%% End: