% paper-BagRelationalPDBsAreHard/app_approx-alg-corollaries.tex
%
% 126 lines
% 14 KiB
% TeX

%root: main.tex
%\textcolor{red}{Aaron: The stuff below needs to be integrated into this section.}
\subsection{\Cref{lem:ctidb-gamma},~\Cref{lem:val-ub},~\Cref{cor:approx-algo-punchline}, and Proof of~\Cref{cor:approx-algo-punchline-ctidb}} %, recalling from~\Cref{sec:intro} for \abbrCTIDB $\pdb = \inparen{\worlds, \bpd}$, where $\tupset$ is the set of possible tuples across all possible worlds of $\pdb$.
\begin{Lemma}
\label{lem:ctidb-gamma}
Given $\raPlus$ query $\query$ and \abbrCTIDB $\pdb$, let \circuit be the circuit computed by $\query\inparen{\tupset}$. Then, for the reduced \abbrOneBIDB $\pdb'$ there exists an equivalent circuit \circuit' obtained from $\query\inparen{\tupset'}$, such that $\gamma\inparen{\circuit'}\leq 1 - \bound^{-\inparen{k-1}}$ with $\size\inparen{\circuit'} \leq \size\inparen{\circuit} + \bigO{\numvar\bound}$
and $\depth\inparen{\circuit'} = \depth\inparen{\circuit} + \bigO{\log{\bound}}$.
\end{Lemma}
We briefly connect the runtime in \Cref{eq:approx-algo-runtime} to the algorithm outline earlier (where we ignore the dependence on $\multc{\cdot}{\cdot}$, which is needed to handle the cost of arithmetic operations over integers). The $\size(\circuit)$ comes from the time taken to run \onepass once (\onepass essentially computes $\abs{\circuit}(1,\ldots, 1)$ using the natural circuit evaluation algorithm on $\circuit$). We make $\frac{\log{\frac{1}{\conf}}}{\inparen{\error'}^2\cdot(1-\gamma)^2\cdot \prob_0^{2k}}$ many calls to \sampmon (each of which essentially traces $O(k)$ random sink to source paths in $\circuit$ all of which by definition have length at most $\depth(\circuit)$).
Finally, we address the $\multc{\log\left(\abs{\circuit}(1,\ldots, 1)\right)}{\log\left(\size(\circuit)\right)}$ term in the runtime.
\begin{Lemma}
\label{lem:val-ub}
For any \emph{\abbrOneBIDB} circuit $\circuit$ with $\degree(\circuit)=k$, we have
$\abs{\circuit}(1,\ldots, 1)\le 2^{2^k\cdot \depth(\circuit)}.$
Further, if $\circuit$ is a tree, then we have $\abs{\circuit}(1,\ldots, 1)\le \size(\circuit)^{O(k)}.$
\end{Lemma}
Note that the above implies that, under the assumption from \Cref{cor:approx-algo-const-p} that $\prob_0>0$ and $\gamma<1$ are absolute constants, the runtime there simplifies to $O_k\left(\frac 1{\inparen{\error'}^2}\cdot\size(\circuit)^2\cdot \log{\frac{1}{\conf}}\right)$ for general circuits $\circuit$. If $\circuit$ is a tree, then the runtime simplifies to $O_k\left(\frac 1{\inparen{\error'}^2}\cdot\size(\circuit)\cdot \log{\frac{1}{\conf}}\right)$, which then answers \Cref{prob:intro-stmt} in the affirmative for such circuits.
Finally, note that by \Cref{prop:circuit-depth} and \Cref{lem:circ-model-runtime} for any $\raPlus$ query $\query$, there exists a circuit $\circuit^*$ for $\apolyqdt$ such that $\depth(\circuit^*)\le O_{|Q|}(\log{n})$ and $\size(\circuit^*)\le O_k\inparen{\qruntime{\query, \tupset, \bound}}$. Using this along with \Cref{lem:val-ub}, \Cref{cor:approx-algo-const-p} and the fact that $n\le \qruntime{\query, \tupset, \bound}$, we have the following corollary:
\begin{Corollary}
\label{cor:approx-algo-punchline}
Let $\query$ be an $\raPlus$ query and $\pdb$ be a \emph{\abbrOneBIDB} with $p_0>0$ and $\gamma<1$, where $p_0,\gamma$ as in \Cref{cor:approx-algo-const-p}, are absolute constants. Let $\poly(\vct{X})=\apolyqdt$ for any result tuple $\tup$ with $\deg(\poly)=k$. Then one can compute an approximation satisfying \Cref{eq:approx-algo-bound-main} in time $O_{k,|Q|,\error',\conf}\inparen{\qruntime{\optquery{\query}, \tupset, \bound}}$ (given $\query,\tupset$ and $p_i$ for each $i\in [n]$ that defines $\pd$).
\end{Corollary}
\subsection{Proof of~\Cref{lem:ctidb-gamma}}
\begin{proof}
The circuit \circuit' is built from \circuit in the following manner. For each input gate $\gate_i$ with $\gate_i.\val = X_\tup$, replace $\gate_i$ with the circuit \subcircuit encoding the sum $\sum_{j = 1}^\bound j\cdot X_{\tup, j}$. We argue that \circuit' is a valid circuit by the following facts. Let $\pdb = \inparen{\worlds, \bpd}$ be the original \abbrCTIDB \circuit was generated from. Then, by~\Cref{prop:ctidb-reduct} there exists a \abbrOneBIDB $\pdb' = \inparen{\onebidbworlds{\tupset'}, \bpd'}$, with $\tupset' = \inset{\intuple{\tup, j}~|~\tup\in\tupset, j\in\pbox{\bound}}$, from which the conversion from \circuit to \circuit' follows. Both $\polyf\inparen{\circuit}$ and $\polyf\inparen{\circuit'}$ have the same expected multiplicity since (by~\Cref{prop:ctidb-reduct}) the distributions $\bpd$ and $\bpd'$ are equivalent and $\sum_{j=1}^\bound j\cdot\worldvec'_{\tup, j} = \worldvec_\tup$ for $\worldvec'\in\inset{0, 1}^{\bound\numvar}$ and $\worldvec\in\worlds$ such that $\worldvec_\tup\equiv\worldvec'_\tup$. Finally, note that because there exists a (sub) circuit encoding $\sum_{j = 1}^\bound j\cdot X_{\tup, j}$ that is a \emph{balanced} binary tree, the above conversion implies the claimed size and depth bounds of the lemma.
Next we argue the claim on $\gamma\inparen{\circuit'}$. Consider the list of expanded monomials $\expansion{\circuit}$ for \abbrCTIDB circuit \circuit. Let
$\encMon = X_{\tup_1}^{d_1}\cdots X_{\tup_\ell}^{d_\ell}$ be an arbitrary monomial with $\ell$ variables and let (abusing notation) $\encMon' = \inparen{\sum_{j = 1}^{\bound}j\cdot X_{\tup_1, j}}^{d_1}\cdots\inparen{\sum_{j = 1}^{\bound}j\cdot X_{\tup_\ell, j}}^{d_\ell}$. Then, for $f_\ell = \sum_{i = 1}^\ell d_i$, $\encMon$ induces the set of monomials $\inset{\prod_{i = 1}^{f_\ell} j_i\cdot X_{\tup_i, j_i}^{d_i}}_{j_i\in\pbox{\bound}}$ in the pure expansion of $\encMon'$.
%Denote the additional list elements (projecting out coefficient terms) \emph{induced} by $\monom$ as $\vari{E}_\monom\inparen{\circuit'}$. Then $\vari{E}_\monom\inparen{\circuit'}=\inset{\monom'^1~|~\encMon' \in \vari{S}}$%\inset{j_1^{d_1}\cdot X_{\tup, j_1}^{d_1}\times\cdots\times j_\ell^{d_\ell}\cdot X_{\tup, j_\ell}^{d_\ell}}_{j_1,\ldots, j_\ell \in \pbox{\bound}}$ in $\expansion{\circuit'}$.
Recall that a cancellation occurs in $\encMon'$ when there exists $\tup_{i, j}\neq\tup_{i, j'}$ in the same block $\block$ where variables $X_{\tup_i, j}, X_{\tup_i, j'}$ are in the set of variables $\monom_i'$ of $\monom_{\vari{m}_\vari{i}}\in\encMon'$. Observe that cancellations can only occur for each $X_{\tup}^{d_\tup}\in \encMon$, where the expansion $\inparen{\sum_{j = 1}^\bound j\cdot X_{\tup, j}}^{d_\tup}$ represents the monomial $X_\tup^{d_\tup}$ in $\tupset'$. Consider the number of cancellations for $\inparen{\sum_{j = 1}^\bound j\cdot X_{\tup, j}}^{d_\tup}$. Then $\gamma \leq 1 - \bound^{-\inparen{d_\tup - 1}}$, since
among the $\bound^{d_\tup}$ elements in the set of cross products $\inset{\bigtimes_{i\in\pbox{d_\tup}, j_i\in\pbox{\bound}}X_{\tup, j_i}}$ there are \emph{exactly} $\bound$ surviving elements, namely those with $j_1=\cdots=j_{d_\tup}=j$, i.e., $X_{\tup,j}^{d_\tup}$ for each $j\in\pbox{\bound}$. The remaining $\bound^{d_\tup}-\bound$ cross terms cancel. Regarding all of $\encMon'$, the proportions of non-cancellations for the factors $\inparen{\sum_{j = 1}^{\bound}j\cdot X_{\tup_i, j }}^{d_i}\in\encMon'$ multiply, because non-cancelling terms for $\inparen{\sum_{j = 1}^{\bound}j\cdot X_{\tup_i, j}}^{d_i}$ can only be joined with non-cancelling terms of $\inparen{\sum_{j=1}^{\bound}j\cdot X_{\tup_{i'}, j}}^{d_{i'}}\in\encMon'$ for $\tup_i\neq\tup_{i'}$. This then yields the fraction of cancelled monomials $\gamma\le 1 - \prod_{i = 1}^{\ell}\bound^{-\inparen{d_i - 1}} \leq 1 - \bound^{-\inparen{k - 1}}$, where the inequalities take into account the facts that $f_\ell \leq k$ and $\ell\geq 1$.
Since this is true for an arbitrary monomial $\encMon$, the bound follows for $\polyf\inparen{\circuit'}$.
\end{proof}
\qed
\subsection{Proof of \Cref{lem:val-ub}}\label{susec:proof-val-up}
\label{app:proof-lem-val-ub}
We will prove \Cref{lem:val-ub} by considering the two cases separately. We start by considering the case when $\circuit$ is a tree:
\begin{Lemma}
\label{lem:C-ub-tree}
Let $\circuit$ be a tree (i.e. the sub-circuits corresponding to two children of a node in $\circuit$ are completely disjoint). Then we have
\[\abs{\circuit}(1,\dots,1)\le \left(\size(\circuit)\right)^{\degree(\circuit)+1}.\]
\end{Lemma}
\begin{proof}[Proof of \Cref{lem:C-ub-tree}]
For notational simplicity define $N=\size(\circuit)$ and $k=\degree(\circuit)$.
We use induction on $\depth(\circuit)$ to show that $\abs{\circuit}(1,\ldots, 1) \leq N^{k+1 }$.
For the base case, we have that \depth(\circuit) $= 0$, and there can only be one node which must contain a coefficient or constant. In this case, $\abs{\circuit}(1,\ldots, 1) = 1$, and \size(\circuit) $= 1$, and by \Cref{def:degree} it is the case that $0 \leq k = \degree\inparen{\circuit} \leq 1$, and it is true that $\abs{\circuit}(1,\ldots, 1) = 1 \leq N^{k+1} = 1^{k + 1} = 1$ for $k \in \inset{0, 1}$.
Assume that, for some $\ell \geq 0$, every circuit \circuit with $\depth(\circuit) \leq \ell$ satisfies $\abs{\circuit}(1,\ldots, 1) \leq N^{k+1 }$.
For the inductive step we consider a circuit \circuit such that $\depth(\circuit) = \ell + 1$. The sink can only be either a $\circmult$ or $\circplus$ gate. Let $k_\linput, k_\rinput$ denote \degree($\circuit_\linput$) and \degree($\circuit_\rinput$) respectively. Consider when sink node is $\circmult$.
Then note that
\begin{align}
\abs{\circuit}(1,\ldots, 1) &= \abs{\circuit_\linput}(1,\ldots, 1)\cdot \abs{\circuit_\rinput}(1,\ldots, 1) \nonumber\\
&\leq (N-1)^{k_\linput+1} \cdot (N - 1)^{k_\rinput+1}\nonumber\\
&= (N-1)^{k+1}\label{eq:sumcoeff-times-upper}\\
&\leq N^{k + 1}.\nonumber
\end{align}
In the above the first inequality follows from the inductive hypothesis (and the fact that the size of either subtree is at most $N-1$) and \Cref{eq:sumcoeff-times-upper} follows by \cref{def:degree} which states that for $k = \degree(\circuit)$ we have $k=k_\linput+k_\rinput+1$.
For the case when the sink gate is a $\circplus$ gate, then for $N_\linput = \size(\circuit_\linput)$ and $N_\rinput = \size(\circuit_\rinput)$ we have
\begin{align}
\abs{\circuit}(1,\ldots, 1) &= \abs{\circuit_\linput}(1,\ldots, 1) + \abs{\circuit_\rinput}(1,\ldots, 1) \nonumber\\
&\leq
N_\linput^{k+1} + N_\rinput^{k+1}\nonumber\\
&\leq (N-1)^{k+1 } \label{eq:sumcoeff-plus-upper}\\
&\leq N^{k+1}.\nonumber
\end{align}
In the above, the first inequality follows from the inductive hypothesis and \cref{def:degree} (which implies the fact that $k_\linput,k_\rinput\le k$). Note that the RHS of this inequality is maximized when the base and exponent of one of the terms is maximized. The second inequality follows from this fact as well as the fact that since $\circuit$ is a tree we have $N_\linput+N_\rinput=N-1$ and, lastly, the fact that $k\ge 0$. This completes the proof.
%\AH{I don't think that it matters whether or not \circuit is a tree. For $N=\size\inparen{\circuit}$ it must follow that $N_L + N_R + 1 = N$ regardless of whether a gate a allowed to have more than one parent. Not true, consider when $\circuit_R = \circuit_L$.}
\end{proof}
The upper bound in \Cref{lem:val-ub} for the general case is a simple variant of the above proof (but we present a proof sketch of the bound below for completeness):
\begin{Lemma}
\label{lem:C-ub-gen}
Let $\circuit$ be a (general) circuit.
Then we have
\[\abs{\circuit}(1,\dots,1)\le 2^{2^{\degree(\circuit)}\cdot \depth(\circuit)}.\]
\end{Lemma}
\begin{proof}[Proof Sketch of \Cref{lem:C-ub-gen}]
We use the same notation as in the proof of \Cref{lem:C-ub-tree} and further define $d=\depth(\circuit)$. We will prove by induction on $\depth(\circuit)$ that $\abs{\circuit}(1,\ldots, 1) \leq 2^{2^k\cdot d }$. The base case argument is similar to that in the proof of \Cref{lem:C-ub-tree}. In the inductive case we have that $d_\linput,d_\rinput\le d-1$.
For the case when the sink node is $\times$, we get that
\begin{align*}
\abs{\circuit}(1,\ldots, 1) &= \abs{\circuit_\linput}(1,\ldots, 1)\cdot \abs{\circuit_\rinput}(1,\ldots, 1) \\
&\leq {2^{2^{k_\linput}\cdot d_\linput}} \cdot {2^{2^{k_\rinput}\cdot d_\rinput}}\\
&\leq 2^{2\cdot 2^{k-1}\cdot (d-1)}\\
&\leq 2^{2^k d}.
\end{align*}
In the above, the first inequality follows from the inductive hypothesis while the second inequality follows from the fact that $k_\linput,k_\rinput\le k-1$ and $d_\linput, d_\rinput\le d-1$, where we substitute the upper bound into every respective term.
Now consider the case when the sink node is $+$. We get that
\begin{align*}
\abs{\circuit}(1,\ldots, 1) &= \abs{\circuit_\linput}(1,\ldots, 1) + \abs{\circuit_\rinput}(1,\ldots, 1) \\
&\leq 2^{2^{k_\linput}\cdot d_\linput} + 2^{2^{k_\rinput}\cdot d_\rinput}\\
&\leq 2\cdot {2^{2^k(d-1)} } \\
&\leq 2^{2^kd}.
\end{align*}
In the above the first inequality follows from the inductive hypothesis while the second inequality follows from the facts that $k_\linput,k_\rinput\le k$ and $d_\linput,d_\rinput\le d-1$. The final inequality follows from the fact that $k\ge 0$.
\qed
\end{proof}
%\textcolor{red}{The corollary below is a repeat of the corollary on S4}
%Next, we note that the above result along with \Cref{lem:ctidb-gamma}
%answers \Cref{prob:big-o-joint-steps} in the affirmative as follows:
%\begin{Corollary}
%\label{cor:approx-algo-punchline-ctidb}
%Let $\query$ be an $\raPlus$ query and $\pdb$ be a \abbrCTIDB with $p_0>0$, where $p_0$ as in \Cref{cor:approx-algo-const-p}, is an absolute constant. Let $\poly(\vct{X})=\apolyqdt$ for any result tuple $\tup$ with $\deg(\poly)=k$. Then one can compute an approximation satisfying \Cref{eq:approx-algo-bound-main} in time $O_{k,|Q|,\error',\conf,\bound}\inparen{\qruntime{\optquery{\query}, \tupset, \bound}}$ (given $\query,\tupset$ and $\prob_{\tup, j}$ for each $\tup\in\tupset,~j\in\pbox{\bound}$ that defines $\bpd$).
%\end{Corollary}
\begin{proof}[Proof of~\Cref{cor:approx-algo-punchline-ctidb}]
By~\Cref{lem:ctidb-gamma} and~\Cref{cor:approx-algo-punchline}, the proof follows.
\end{proof}
\qed