%!TEX root=./main.tex
\section{Generalizations} In this section, we consider a couple of generalizations/corollaries of our results so far. In particular, in~\Cref{sec:circuits} we first consider the case when the compressed polynomial is represented by a Directed Acyclic Graph (DAG) instead of the earlier (expression) tree (\Cref{def:express-tree}) and we observe that all of our results carry over to the DAG representation. Then we formalize our claim in~\Cref{sec:intro} that a linear runtime algorithm for our problem would imply that we can process PDBs in the same time as deterministic query processing. Finally, in~\Cref{sec:momemts}, we make some simple observations on how our results can be used to estimate moments beyond the expectation of a lineage polynomial. \subsection{Lineage circuits} \label{sec:circuits} Thus far, our analysis of the runtime of $\onepass$ has been in terms of the size of the compressed lineage polynomial. We now show that this models the behavior of a deterministic database by proving that for any boolean conjunctive query, we can construct a compressed lineage polynomial with the same complexity as it would take to evaluate the query on a deterministic \emph{bag-relational} database. We adopt a minimalistic model of query evaluation focusing on the size of intermediate materialized states. \newcommand{\qruntime}[1]{\textbf{eval}(#1)} \begin{align*} \qruntime{R} & = |R|\\ \qruntime{\sigma Q} & = \qruntime{Q}\\ \qruntime{\pi Q} & = \qruntime{Q}\\ \qruntime{Q \cup Q'} & = \qruntime{Q} + \qruntime{Q'}\\ \qruntime{Q_1 \bowtie \ldots \bowtie Q_n} & = \qruntime{Q_1} + \ldots + \qruntime{Q_n} + |Q_1 \bowtie \ldots \bowtie Q_n| \end{align*} Under this model the query plan $Q(D)$ has runtime $O(\qruntime{Q(D)})$. Base relations assume that a full table scan is required; we model index scans by treating an index scan query $\sigma_\theta(R)$ as a single base relation.
\begin{proposition} \label{prop:queries-need-to-output-tuples} The runtime $\qruntime{Q}$ of any query $Q$ is at least $|Q|$. \end{proposition} \subsection{Circuit Lineage} We represent lineage polynomials with arithmetic circuits over $\mathbb N$ with $+$, $\times$. A circuit for relation $R$ is a directed acyclic graph $\tuple{V_R, E_R, \phi_R, \ell_R}$ with vertices $V_R$ and directed edges $E_R \subset V_R^2$. A sink function $\phi_R : R \rightarrow V_R$ maps the tuples of the relation to vertices in the graph. We require that $\phi_R$'s range be limited to sink vertices (i.e., vertices with out-degree 0). We call a sink vertex not in the range of $\phi_R$ a \emph{dead sink}. A function $\ell_R : V_R \rightarrow \{\;+,\times\;\}\cup \mathbb N \cup \vct X$ assigns a label to each node: Source nodes (i.e., vertices with in-degree 0) are labeled with constants or variables (i.e., $\mathbb N \cup \vct X$), while the remaining nodes are labeled with the symbol $+$ or $\times$. We require that vertices have an in-degree of at most two. \newcommand{\getpoly}[1]{\textbf{poly}(#1)} Each vertex $v \in V_R$ in the arithmetic circuit $\tuple{V_R, E_R, \phi_R, \ell_R}$ encodes a polynomial, realized as $$\getpoly{v} = \begin{cases} \sum_{v' : (v',v) \in E_R} \getpoly{v'} & \textbf{if } \ell_R(v) = +\\ \prod_{v' : (v',v) \in E_R} \getpoly{v'} & \textbf{if } \ell_R(v) = \times\\ \ell_R(v) & \textbf{otherwise} \end{cases}$$ \newcommand{\caseheading}[1]{\smallskip \noindent \textbf{#1}.~} We define the circuit for a query $Q$ recursively by cases as follows. In each case, let $\tuple{V_{Q_i}, E_{Q_i}, \phi_{Q_i}, \ell_{Q_i}}$ denote the circuit for subquery $Q_i$. \caseheading{Base Relation} Let $Q$ be a base relation $R$. We define one node for each tuple. Formally, let $V_Q = \comprehension{v_t}{t\in R}$, let $\phi_Q(t) = v_t$, let $\ell_Q(v_t) = R(t)$, and let $E_Q = \emptyset$. This circuit has $|R|$ vertices. \caseheading{Selection} Let $Q = \sigma_\theta Q_1$.
We re-use the circuit for $Q_1$, but define a new distinguished node $v_0$ with label $0$ and make it the sink node for all tuples that fail the selection predicate. Formally, let $V_Q = V_{Q_1} \cup \{v_0\}$, let $\ell_Q(v_0) = 0$, and let $\ell_Q(v) = \ell_{Q_1}(v)$ for any $v \in V_{Q_1}$. Let $E_Q = E_{Q_1}$, and define $$\phi_Q(t) = \begin{cases} \phi_{Q_1}(t) & \textbf{if } \theta(t)\\ v_0 & \textbf{otherwise} \end{cases}$$ This circuit has $|V_{Q_1}|+1$ vertices. \caseheading{Projection} Let $Q = \pi_{\vct A} {Q_1}$. We extend the circuit for ${Q_1}$ with a new set of sum vertices (i.e., vertices with label $+$) for each tuple in $Q$, and connect them to the corresponding sink nodes of the circuit for ${Q_1}$. Naively, let $V_Q = V_{Q_1} \cup \comprehension{v_t}{t \in \pi_{\vct A} {Q_1}}$, let $\phi_Q(t) = v_t$, and let $\ell_Q(v_t) = +$. Finally let $$E_Q = E_{Q_1} \cup \comprehension{(\phi_{Q_1}(t'), v_t)}{t = \pi_{\vct A} t', t' \in {Q_1}, t \in \pi_{\vct A} {Q_1}}$$ This formulation will produce vertices with an in-degree greater than two, a problem that we correct by replacing every vertex with an in-degree over two by an equivalent fan-in tree. The resulting structure has at most $|{Q_1}|-1$ additional vertices. The corrected circuit thus has at most $|V_{Q_1}|+|\pi_{\vct A} {Q_1}| + |{Q_1}|-1$ vertices. \caseheading{Union} Let $Q = {Q_1} \cup {Q_2}$. We merge graphs and produce a sum vertex for all tuples in both sides of the union. Formally, let $V_Q = V_{Q_1} \cup V_{Q_2} \cup \comprehension{v_t}{t \in {Q_1} \cap {Q_2}}$, let $$E_Q = E_{Q_1} \cup E_{Q_2} \cup \comprehension{(\phi_{Q_1}(t), v_t), (\phi_{Q_2}(t), v_t)}{t \in {Q_1} \cap {Q_2}},$$ let $\ell_Q(v_t) = +$, and let $$\phi_Q(t) = \begin{cases} v_t & \textbf{if } t \in {Q_1} \cap {Q_2}\\ \phi_{Q_1}(t) & \textbf{if } t \not \in {Q_2}\\ \phi_{Q_2}(t) & \textbf{if } t \not \in {Q_1} \end{cases}$$ This circuit has $|V_{Q_1}|+|V_{Q_2}|+|{Q_1} \cap {Q_2}|$ vertices.
\caseheading{N-ary Join} Let $Q = {Q_1} \bowtie \ldots \bowtie {Q_n}$. We merge graphs and produce a multiplication vertex for all tuples resulting from the join. Naively, let $V_Q = V_{Q_1} \cup \ldots \cup V_{Q_n} \cup \comprehension{v_t}{t \in {Q_1} \bowtie \ldots \bowtie {Q_n}}$, let {\small \begin{multline*} E_Q = E_{Q_1} \cup \ldots \cup E_{Q_n} \cup \\ \comprehension{(\phi_{Q_1}(\pi_{\sch({Q_1})}t), v_t), \ldots, (\phi_{Q_n}(\pi_{\sch({Q_n})}t), v_t)}{t \in {Q_1} \bowtie \ldots \bowtie {Q_n}} \end{multline*} } Let $\ell_Q(v_t) = \times$, and let $\phi_Q(t) = v_t$. As in projection, newly created vertices will have an in-degree of $n$, and a fan-in tree is required. There are $|{Q_1} \bowtie \ldots \bowtie {Q_n}|$ such vertices, so the corrected circuit has $|V_{Q_1}|+\ldots+|V_{Q_n}|+(n-1)|{Q_1} \bowtie \ldots \bowtie {Q_n}|$ vertices. \subsection{Runtime vs Lineage} \begin{lemma} \label{lem:circuits-model-runtime} The runtime of any query plan $Q$ has the same or better complexity as the lineage of the corresponding query result for any specific database instance. That is, for any query plan $Q$ there exist constants $a$, $b$ such that $|V_Q| \leq a\qruntime{Q}+b$. \end{lemma} \begin{proof} The proof is by induction on the structure of $Q$. The base case is a base relation: $Q = R$ and is trivially true since $|V_R| = |R|$. For the inductive step, we assume that we have circuits for subplans $Q_1, \ldots, Q_n$ such that $|V_{Q_i}| \leq a_i\qruntime{Q_i} + b_i$. \caseheading{Selection} Assume that $Q = \sigma_\theta(Q_1)$. The circuit for $Q$ has $|V_Q| = |V_{Q_1}|+1$ vertices, so from the inductive assumption and $\qruntime{Q} = \qruntime{Q_1}$ by definition, we have $|V_Q| \leq a_1 \qruntime{Q} + (b_1 + 1)$. \caseheading{Projection} Assume that $Q = \pi_{\vct A}(Q_1)$. The circuit for $Q$ has at most $|V_{Q_1}|+|\pi_{\vct A} {Q_1}| + |{Q_1}|-1$ vertices.
\begin{align*} |V_{Q}| & \leq |V_{Q_1}|+|\pi_{\vct A} {Q_1}| + |{Q_1}|-1\\ & \leq |V_{Q_1}| + 2|Q_1|\\ \intertext{By \Cref{prop:queries-need-to-output-tuples} $\qruntime{Q_1} \geq |Q_1|$} & \leq |V_{Q_1}| + 2 \qruntime{Q_1}\\ \intertext{From the inductive assumption} & \leq a_1\qruntime{Q_1} + b_1 + 2 \qruntime{Q_1}\\ \intertext{By definition, and compacting} & = (a_1+2)\qruntime{Q} + b_1 \end{align*} \caseheading{Union} Assume that $Q = Q_1 \cup Q_2$. The circuit for $Q$ has $|V_{Q_1}|+|V_{Q_2}|+|{Q_1} \cap {Q_2}|$ vertices. \begin{align*} |V_{Q}| & \leq |V_{Q_1}|+|V_{Q_2}|+|{Q_1}|+|{Q_2}|\\ \intertext{By \Cref{prop:queries-need-to-output-tuples} $\qruntime{Q_1} \geq |Q_1|$} & \leq |V_{Q_1}|+|V_{Q_2}|+\qruntime{Q_1}+\qruntime{Q_2}\\ \intertext{From the inductive assumption and compacting} & \leq (a_1+a_2+2)(\qruntime{Q_1} + \qruntime{Q_2}) + (b_1 + b_2)\\ \intertext{By definition} & \leq (a_1+a_2+2)(\qruntime{Q}) + (b_1 + b_2) \end{align*} \caseheading{N-ary Join} Assume that $Q = Q_1 \bowtie \ldots \bowtie Q_n$. The circuit for $Q$ has $|V_{Q_1}|+\ldots+|V_{Q_n}|+(n-1)|{Q_1} \bowtie \ldots \bowtie {Q_n}|$ vertices. \begin{align*} |V_{Q}| & = |V_{Q_1}|+\ldots+|V_{Q_n}|+(n-1)|{Q_1} \bowtie \ldots \bowtie {Q_n}|\\ \intertext{From the inductive assumption} & \leq a_1\qruntime{Q_1}+b_1+\ldots+a_n\qruntime{Q_n}+b_n+\\ &\;\;\; (n-1)|{Q_1} \bowtie \ldots \bowtie {Q_n}|\\ & \leq (a_1+\ldots+a_n+n-1)(\qruntime{Q_1}+\ldots+\qruntime{Q_n}+\\ &\;\;\;|{Q_1} \bowtie \ldots \bowtie {Q_n}|)+b_1+\ldots+b_n\\ \intertext{By definition} & = (a_1+\ldots+a_n+n-1)\qruntime{Q}+(b_1+\ldots+b_n) \end{align*} The property holds in every inductive case, which completes the proof. \end{proof} \subsection{Higher moments} \label{sec:momemts}