paper-BagRelationalPDBsAreHard/app_approx-alg-analysis.tex

%root: main.tex

The following results assume input circuit \circuit computed from an arbitrary $\raPlus$ query $\query$ and arbitrary \abbrBIDB $\pdb$.  We refer to \circuit as a \abbrBIDB circuit.
\begin{Theorem}\label{lem:approx-alg}
Let \circuit be an arbitrary \abbrBIDB circuit
and define $\poly(\vct{X})=\polyf(\circuit)$ and let $k=\degree(\circuit)$.
Then an estimate $\mathcal{E}$ of $\rpoly(\prob_1,\ldots, \prob_\numvar)$ can be computed in time
{\small
\[O\left(\left(\size(\circuit) + \frac{\log{\frac{1}{\conf}}\cdot \abs{\circuit}^2(1,\ldots, 1)\cdot  k\cdot \log{k} \cdot \depth(\circuit))}{\inparen{\error}^2\cdot\rpoly^2(\prob_1,\ldots, \prob_\numvar)}\right)\cdot\multc{\log\left(\abs{\circuit}(1,\ldots, 1)\right)}{\log\left(\size(\circuit)\right)}\right)\]
}
such that
\begin{equation}
\label{eq:approx-algo-bound}
\probOf\left(\left|\mathcal{E} - \rpoly(\prob_1,\dots,\prob_\numvar)\right|> \error \cdot \rpoly(\prob_1,\dots,\prob_\numvar)\right) \leq \conf.
\end{equation}
\end{Theorem}
The slight abuse of notation seen in $\abs{\circuit}\inparen{1,\ldots,1}$ is explained after \Cref{def:positive-circuit} and an example is given in \Cref{ex:def-pos-circ}.  The only difference in the use of this notation in \Cref{lem:approx-alg} is that we include an additional exponent to square the quantity.

\subsection{Proof of Theorem \ref{lem:approx-alg}}\label{sec:proof-lem-approx-alg}
\input{app_approx_alg-pseudo-code}
We prove \Cref{lem:approx-alg} constructively by presenting an algorithm \approxq (\Cref{alg:mon-sam}) which has the desired runtime and computes an approximation with the desired approximation guarantee. Algorithm \approxq uses Algorithm \onepass to compute weights on the edges of a circuits. These weights are then used to sample a set of monomials of $\poly(\circuit)$ from the circuit $\circuit$ by traversing the circuit using the weights to ensure that monomials are sampled with an appropriate probability. The correctness of \approxq relies on the correctness (and runtime behavior) of auxiliary algorithms \onepass and \sampmon that we state in the following lemmas (and prove later in this part of the appendix).

\begin{Lemma}\label{lem:one-pass}
The $\onepass$ function completes in time:
$$O\left(\size(\circuit) \cdot \multc{\log\left(\abs{\circuit(1\ldots, 1)}\right)}{\log{\size(\circuit)}}\right)$$
  $\onepass$ guarantees two post-conditions:  First, for each subcircuit $\vari{S}$ of $\circuit$, we have that $\vari{S}.\vari{partial}$ is set to $\abs{\vari{S}}(1,\ldots, 1)$.  Second, when $\vari{S}.\type  = \circplus$, \subcircuit.\lwght $= \frac{\abs{\subcircuit_\linput}(1,\ldots, 1)}{\abs{\subcircuit}(1,\ldots, 1)}$ and likewise for \subcircuit.\rwght.
\end{Lemma}
To prove correctness of \Cref{alg:mon-sam}, we only use the following fact that follows from the above lemma: for the modified circuit ($\circuit_{\vari{mod}}$) output by \onepass, $\circuit_{\vari{mod}}.\vari{partial}=\abs{\circuit}(1,\dots,1)$.

\begin{Lemma}\label{lem:sample}
The function $\sampmon$ completes in time
$$O(\log{k} \cdot k \cdot \depth(\circuit)\cdot\multc{\log\left(\abs{\circuit}(1,\ldots, 1)\right)}{\log{\size(\circuit)}})$$
 where $k = \degree(\circuit)$.  The function returns every $\left(\monom, sign(\coef)\right)$ for $(\monom, \coef)\in \expansion{\circuit}$ with probability $\frac{|\coef|}{\abs{\circuit}(1,\ldots, 1)}$.
\end{Lemma}

With the above two lemmas, we are ready to argue the following result:
\begin{Theorem}\label{lem:mon-samp}
For any $\circuit$ with
$\degree(poly(|\circuit|)) = k$, algorithm \ref{alg:mon-sam} outputs an estimate $\vari{acc}$ of $\rpoly(\prob_1,\ldots, \prob_\numvar)$ such that
\[\probOf\left(\left|\vari{acc} - \rpoly(\prob_1,\ldots, \prob_\numvar)\right|> \error \cdot \abs{\circuit}(1,\ldots, 1)\right) \leq \conf,\]
 in $O\left(\left(\size(\circuit)+\frac{\log{\frac{1}{\conf}}}{\error^2} \cdot k \cdot\log{k} \cdot \depth(\circuit)\right)\cdot \multc{\log\left(\abs{\circuit}(1,\ldots, 1)\right)}{\log{\size(\circuit)}}\right)$ time.
\end{Theorem}


Before proving \Cref{lem:mon-samp}, we use it to argue the claimed runtime of our main result, \Cref{lem:approx-alg}.

\begin{proof}[Proof of \Cref{lem:approx-alg}]
Set $\mathcal{E}=\approxq({\circuit}, (\prob_1,\dots,\prob_\numvar),$ $\conf, \error')$, where
\[\error' = \error \cdot \frac{\rpoly(\prob_1,\ldots, \prob_\numvar)}{\abs{{\circuit}}(1,\ldots, 1)},\]
 which achieves the claimed error bound on $\mathcal{E}$ (\vari{acc}) trivially due to the assignment to $\error'$ and \cref{lem:mon-samp}, since $\error' \cdot \abs{\circuit}(1,\ldots, 1) = \error\cdot\frac{\rpoly(1,\ldots, 1)}{\abs{\circuit}(1,\ldots, 1)} \cdot \abs{\circuit}(1,\ldots, 1) = \error\cdot\rpoly(1,\ldots, 1)$.

The claim on the runtime follows from \Cref{lem:mon-samp} since

\begin{align*}
\frac 1{\inparen{\error'}^2}\cdot \log\inparen{\frac 1\conf}=&\frac{\log{\frac{1}{\conf}}}{\error^2 \left(\frac{\rpoly(\prob_1,\ldots, \prob_N)}{\abs{{\circuit}}(1,\ldots, 1)}\right)^2}\\
= &\frac{\log{\frac{1}{\conf}}\cdot \abs{{\circuit}}^2(1,\ldots, 1)}{\error^2 \cdot \rpoly^2(\prob_1,\ldots, \prob_\numvar)}.
\end{align*}
\qed
\end{proof}

Let us now prove \Cref{lem:mon-samp}:
\subsection{Proof of Theorem \ref{lem:mon-samp}}\label{app:subsec-th-mon-samp}
\begin{proof}
Consider now the random variables $\randvar_1,\dots,\randvar_\numsamp$, where each $\randvar_\vari{i}$ is the value of $\vari{Y}_{\vari{i}}$ in \cref{alg:mon-sam} after \cref{alg:mon-sam-product} is executed. Overloading $\isInd{\cdot}$ to receive monomial input (recall $\encMon$ is the monomial composed of the variables in the set $\monom$), we have
\[\randvar_\vari{i}= \indicator{\inparen{\isInd{\encMon}}}\cdot \prod_{X_i\in \var\inparen{v}} p_i,\]
where the indicator variable handles the check in \Cref{alg:check-duplicate-block}
Then for random variable $\randvar_i$, it is the case that
\begin{align*}
\expct\pbox{\randvar_\vari{i}} &= \sum\limits_{(\monom, \coef) \in \expansion{{\circuit}} }\frac{\indicator{\inparen{\isInd{\encMon}}}\cdot c\cdot\prod_{X_i\in \var\inparen{v}} p_i }{\abs{{\circuit}}(1,\dots,1)} \\
&= \frac{\rpoly(\prob_1,\ldots, \prob_\numvar)}{\abs{{\circuit}}(1,\ldots, 1)},
\end{align*}
where in the first equality we use the fact that $\vari{sgn}_{\vari{i}}\cdot \abs{\coef}=\coef$ and the second equality follows from \Cref{eq:tilde-Q-bi} with $X_i$ substituted by $\prob_i$.

Let $\empmean = \frac{1}{\samplesize}\sum_{i = 1}^{\samplesize}\randvar_\vari{i}$.  It is also true that

\[\expct\pbox{\empmean}
= \frac{1}{\samplesize}\sum_{i = 1}^{\samplesize}\expct\pbox{\randvar_\vari{i}}
= \frac{\rpoly(\prob_1,\ldots, \prob_\numvar)}{\abs{{\circuit}}(1,\ldots, 1)}.\]

Hoeffding's inequality states that if we know that each $\randvar_i$ (which are all independent) always lie in the intervals $[a_i, b_i]$, then it is true that
\begin{equation*}
\probOf\left(\left|\empmean - \expct\pbox{\empmean}\right| \geq \error\right) \leq 2\exp{\left(-\frac{2\samplesize^2\error^2}{\sum_{i = 1}^{\samplesize}(b_i -a_i)^2}\right)}.
\end{equation*}

Line~\ref{alg:mon-sam-sample} shows that $\vari{sgn}_\vari{i}$ has a value in $\{-1, 1\}$ that is multiplied with $O(k)$ $\prob_i\in [0, 1]$, which implies the range for each $\randvar_i$ is $[-1, 1]$.
Using Hoeffding's inequality, we then get:
\begin{equation*}
\probOf\left(~\left| \empmean - \expct\pbox{\empmean} ~\right| \geq \error\right) \leq 2\exp{\left(-\frac{2\samplesize^2\error^2}{2^2 \samplesize}\right)} = 2\exp{\left(-\frac{\samplesize\error^2}{2 }\right)}\leq \conf,
\end{equation*}
where the last inequality dictates our choice of $\samplesize$ in \Cref{alg:mon-sam-global2}.

For the claimed probability bound of $\probOf\left(\left|\vari{acc} - \rpoly(\prob_1,\ldots, \prob_\numvar)\right|> \error \cdot \abs{\circuit}(1,\ldots, 1)\right) \leq \conf$, note that in the algorithm, \vari{acc} is exactly $\empmean \cdot \abs{\circuit}(1,\ldots, 1)$.  Multiplying the rest of the terms by the additional factor $\abs{\circuit}(1,\ldots, 1)$ yields the said bound.

This concludes the proof for the first claim of theorem~\ref{lem:mon-samp}.  Next, we prove the claim on the runtime.

\paragraph*{Run-time Analysis}
The runtime of the algorithm is dominated first by \Cref{alg:mon-sam-onepass} which has $O\left({\size(\circuit)}\cdot \multc{\log\left(\abs{\circuit}(1,\ldots, 1)\right)}{\log\left(\size(\circuit)\right)}\right)$ runtime by \Cref{lem:one-pass}.  There are then $\samplesize$ iterations of the loop in \Cref{alg:sampling-loop}. Each iteration's run time is dominated by the call to \sampmon in \Cref{alg:mon-sam-sample} (which by \Cref{lem:sample} takes $O\left(\log{k} \cdot k \cdot {\depth(\circuit)}\cdot \multc{\log\left(\abs{\circuit}(1,\ldots, 1)\right)}{\log\left(\size(\circuit)\right)}\right)$
) and the check \Cref{alg:check-duplicate-block}, which by the subsequent argument takes $O(k\log{k})$ time. We sort the $O(k)$ variables by their block IDs and then check if there is a duplicate block ID or not. Combining all the times discussed here gives us the desired overall runtime.
\qed
\end{proof}

\subsection{Proof of \Cref{cor:approx-algo-const-p}}
\begin{proof}
The result follows by first noting that by definition of $\gamma$, we have
\[\rpoly(1,\dots,1)= (1-\gamma)\cdot \abs{{\circuit}}(1,\dots,1).\]
Further, since each $\prob_i\ge \prob_0$ and $\poly(\vct{X})$ (and hence $\rpoly(\vct{X})$) has degree at most $k$, we have that
\[ \rpoly(1,\dots,1) \ge \prob_0^k\cdot \rpoly(1,\dots,1).\]
The above two inequalities implies $\rpoly(1,\dots,1) \ge \prob_0^k\cdot (1-\gamma)\cdot \abs{{\circuit}}(1,\dots,1)$.
Applying this bound in the runtime bound in \Cref{lem:approx-alg} gives the first claimed runtime. The final runtime of $O_k\left(\frac 1{\eps^2}\cdot\size(\circuit)\cdot \log{\frac{1}{\conf}}\cdot \multc{\log\left(\abs{\circuit}(1,\ldots, 1)\right)}{\log\left(\size(\circuit)\right)}\right)$ follows by noting that $\depth({\circuit})\le \size({\circuit})$ and absorbing all factors that just depend on $k$.
\qed
\end{proof}

\subsection{Proof of~\Cref{lem:ctidb-gamma}}

\begin{proof}
The circuit \circuit' is built from \circuit in the following manner.  For each input gate $\gate_i$ with $\gate_i.\val = X_\tup$, replace $\gate_i$ with the circuit \subcircuit encoding the sum $\sum_{j = 1}^\bound j\cdot X_{\tup, j}$.  We argue that \circuit' is a valid circuit by the following facts.  Let $\pdb = \inparen{\worlds, \bpd}$ be the original \abbrCTIDB \circuit was generated from.  Then, by~\Cref{prop:ctidb-reduct} there exists a \abbrOneBIDB $\pdb' = \inparen{\onebidbworlds{\tupset'}, \bpd'}$, with $\tupset' = \inset{\intup{\tup, j}~|~\tup\in\tupset, j\in\pbox{\bound}}$, from which the conversion from \circuit to \circuit' follows.  Both $\polyf\inparen{\circuit}$ and $\polyf\inparen{\circuit'}$ have the same expected multiplicity since (by~\Cref{prop:ctidb-reduct}) the distributions $\bpd$ and $\bpd'$ are equivalent and each $j\cdot\worldvec'_{\tup, j} = \worldvec_\tup$ for $\worldvec'\in\inset{0, 1}^{\bound\numvar}$ and $\worldvec\in\worlds$.  Finally, note that because there exists a (sub) circuit encoding $\sum_{j = 1}^\bound j\cdot X_{\tup, j}$ that is a \emph{balanced} binary tree, the above conversion implies the claimed size and depth bounds of the lemma.

Next we argue the claim on $\gamma\inparen{\circuit'}$.  Consider the list of expanded monomials $\expansion{\circuit}$ for \abbrCTIDB circuit \circuit.  Let
 $\encMon = X_{\tup_1}^{d_1},\ldots,X_{\tup_\ell}^{d_\ell}$ be an arbitrary monomial with $\ell$ variables.  Then \monom yields the set of monomials $\vari{E}_\monom\inparen{\circuit'}=\inset{j_1^{d_1}\cdot X_{\tup, j_1}^{d_1}\times\cdots\times j_\ell^{d_\ell}\cdot X_{\tup, j_\ell}^{d_\ell}}_{j_1,\ldots, j_\ell \in \pbox{\bound}}$ in $\expansion{\circuit'}$.  Recall that a cancellation occurs when we have a monomial \monom' such that there exists $\tup\neq\tup'$ in the same block $\block$ where variables $X_\tup, X_{\tup'}$ are in the set of variables $\encMon'$ of \monom'.  Observe that cancellations can only occur for each $X_{\tup}^{d_\tup}\in \encMon$, where the expansion $\inparen{\sum_{j = 1}^\bound j\cdot X_{\tup, j}}^{d_\tup}$ represents the monomial $X_\tup^{d_\tup}$ in $\tupset'$.  Consider the number of cancellations for $\inparen{\sum_{j = 1}^\bound j\cdot X_{\tup, j}}^{d_t}$.  Then $\gamma \leq 1 - \bound^{d_\tup - 1}$, since
 for each element in the set of cross products $\inset{\bigtimes_{i\in\pbox{d_\tup}, j_i\in\pbox{\bound}}X_{\tup, j_i}}$ there are \emph{exactly} $\bound$ surviving elements with $j_1=\cdots=j_{d_\tup}=j$, i.e. $X_{t,j}^{d_\tup}$ for each $j\in\pbox{\bound}$.  The rest of the $\bound^{d_\tup}-c$ cross terms cancel.  Regarding the whole monomial \monom', it is the case that the proportion of non-cancellations across each $X_\tup^{d_\tup}\in\encMon'$ multiply because non-cancelling terms for $X_\tup$ can only be joined with non-cancelling terms of $X_{\tup'}^{d_{\tup'}}\in\encMon'$ for $\tup\neq\tup'$.  This then yields the fraction of cancelled monomials $\gamma\le 1 - \prod_{i = 1}^{\ell}\bound^{d_i - 1} \leq 1 - \bound^{-\inparen{k - 1}}$ where the inequalities take into account the fact that $\sum_{i = 1}^\ell d_i \leq k$.

Since this is true for arbitrary \monom, the bound follows for $\polyf\inparen{\circuit'}$.
\end{proof}
\qed


\subsection{Proof of \Cref{lem:val-ub}}\label{susec:proof-val-up}
\label{app:proof-lem-val-ub}

We will prove \Cref{lem:val-ub} by considering the two cases separately. We start by considering the case when $\circuit$ is a tree:
\begin{Lemma}
\label{lem:C-ub-tree}
Let $\circuit$ be a tree (i.e. the sub-circuits corresponding to two children of a node in $\circuit$ are completely disjoint). Then we have
\[\abs{\circuit}(1,\dots,1)\le \left(\size(\circuit)\right)^{\degree(\circuit)+1}.\]
\end{Lemma}
\begin{proof}[Proof of \Cref{lem:C-ub-tree}]
For notational simplicity define $N=\size(\circuit)$ and $k=\degree(\circuit)$.
We use induction on $\depth(\circuit)$ to show that $\abs{\circuit}(1,\ldots, 1) \leq N^{k+1 }$.
For the base case, we have that \depth(\circuit) $= 0$, and there can only be one node which must contain a coefficient or constant.  In this case, $\abs{\circuit}(1,\ldots, 1) = 1$, and \size(\circuit) $= 1$, and by \Cref{def:degree} it is the case that $0 \leq k = \degree\inparen{\circuit} \leq 1$, and it is true that $\abs{\circuit}(1,\ldots, 1) = 1 \leq N^{k+1} = 1^{k + 1} = 1$ for $k \in \inset{0, 1}$.

Assume for $\ell > 0$ an arbitrary circuit \circuit of $\depth(\circuit) \leq \ell$ that it is true that $\abs{\circuit}(1,\ldots, 1) \leq N^{k+1 }$.

For the inductive step we consider a circuit \circuit such that $\depth(\circuit) = \ell + 1$.  The sink can only be either a $\circmult$ or $\circplus$ gate.  Let $k_\linput, k_\rinput$ denote \degree($\circuit_\linput$) and \degree($\circuit_\rinput$) respectively.  Consider when sink node is $\circmult$.
 Then note that
\begin{align}
\abs{\circuit}(1,\ldots, 1) &= \abs{\circuit_\linput}(1,\ldots, 1)\cdot \abs{\circuit_\rinput}(1,\ldots, 1) \nonumber\\
&\leq (N-1)^{k_\linput+1} \cdot (N - 1)^{k_\rinput+1}\nonumber\\
 &= (N-1)^{k+1}\label{eq:sumcoeff-times-upper}\\
 &\leq N^{k + 1}.\nonumber
\end{align}
In the above the first inequality follows from the inductive hypothesis (and the fact that the size of either subtree is at most $N-1$) and \Cref{eq:sumcoeff-times-upper} follows by \cref{def:degree} which states that for $k = \degree(\circuit)$ we have $k=k_\linput+k_\rinput+1$.

For the case when the sink gate is a $\circplus$ gate, then for $N_\linput = \size(\circuit_\linput)$ and $N_\rinput = \size(\circuit_\rinput)$ we have
\begin{align}
\abs{\circuit}(1,\ldots, 1) &= \abs{\circuit_\linput}(1,\ldots, 1) \circplus \abs{\circuit_\rinput}(1,\ldots, 1) \nonumber\\
&\leq
N_\linput^{k+1} + N_\rinput^{k+1}\nonumber\\
&\leq (N-1)^{k+1 } \label{eq:sumcoeff-plus-upper}\\
&\leq N^{k+1}.\nonumber
\end{align}
In the above, the first inequality follows from the inductive hypothes and \cref{def:degree} (which implies the fact that $k_\linput,k_\rinput\le k$).  Note that the RHS of this inequality is maximized when the base and exponent of one of the terms is maximized.  The second inequality follows from this fact as well as the fact that since $\circuit$ is a tree we have $N_\linput+N_\rinput=N-1$ and, lastly, the fact that $k\ge 0$. This completes the proof.
\end{proof}

The upper bound in \Cref{lem:val-ub} for the general case is a simple variant of the above proof (but we present a proof sketch of the bound below for completeness):
\begin{Lemma}
\label{lem:C-ub-gen}
Let $\circuit$ be a (general) circuit.
Then we have
\[\abs{\circuit}(1,\dots,1)\le 2^{2^{\degree(\circuit)}\cdot \depth(\circuit)}.\]
\end{Lemma}
\begin{proof}[Proof Sketch of \Cref{lem:C-ub-gen}]
We use the same notation as in the proof of \Cref{lem:C-ub-tree} and further define $d=\depth(\circuit)$. We will prove by induction on $\depth(\circuit)$ that $\abs{\circuit}(1,\ldots, 1) \leq 2^{2^k\cdot d }$. The base case argument is similar to that in the proof of \Cref{lem:C-ub-tree}. In the inductive case we have that $d_\linput,d_\rinput\le d-1$.

For the case when the sink node is $\times$, we get that
\begin{align*}
\abs{\circuit}(1,\ldots, 1) &= \abs{\circuit_\linput}(1,\ldots, 1)\circmult \abs{\circuit_\rinput}(1,\ldots, 1) \\
&\leq {2^{2^{k_\linput}\cdot d_\linput}} \circmult {2^{2^{k_\rinput}\cdot d_\rinput}}\\
 &\leq 2^{2\cdot 2^{k-1}\cdot (d-1)}\\
 &\leq 2^{2^k d}.
\end{align*}
In the above the first inequality follows from inductive hypothesis while the second inequality follows from the fact that $k_\linput,k_\rinput\le k-1$ and $d_\linput, d_\rinput\le d-1$, where we substitute the upperbound into every respective term.

Now consider the case when the sink node is $+$, we get that
\begin{align*}
\abs{\circuit}(1,\ldots, 1) &= \abs{\circuit_\linput}(1,\ldots, 1) \circplus \abs{\circuit_\rinput}(1,\ldots, 1) \\
&\leq 2^{2^{k_\linput}\cdot d_\linput} + 2^{2^{k_\rinput}\cdot d_\rinput}\\
&\leq 2\cdot {2^{2^k(d-1)} } \\
&\leq 2^{2^kd}.
\end{align*}
In the above the first inequality follows from the inductive hypothesis while the second inequality follows from the facts that $k_\linput,k_\rinput\le k$ and $d_\linput,d_\rinput\le d-1$. The final inequality follows from the fact that $k\ge 0$.
\qed
\end{proof}

%%% Local Variables:
%%% mode: latex
%%% TeX-master: "main"
%%% End: