paper-BagRelationalPDBsAreHard/app_approx-alg-analysis.tex

%root: main.tex

The following results assume input circuit \circuit computed from an arbitrary $\raPlus$ query $\query$ and arbitrary \abbrBIDB $\pdb$.  We refer to \circuit as a \abbrBIDB circuit.
\begin{Theorem}\label{lem:approx-alg}
Let \circuit be an arbitrary \abbrBIDB circuit
and define $\poly(\vct{X})=\polyf(\circuit)$ and let $k=\degree(\circuit)$.
Then an estimate $\mathcal{E}$ of $\rpoly(\prob_1,\ldots, \prob_\numvar)$ can be computed in time
{\small
\[O\left(\left(\size(\circuit) + \frac{\log{\frac{1}{\conf}}\cdot \abs{\circuit}^2(1,\ldots, 1)\cdot  k\cdot \log{k} \cdot \depth(\circuit))}{\inparen{\error}^2\cdot\rpoly^2(\prob_1,\ldots, \prob_\numvar)}\right)\cdot\multc{\log\left(\abs{\circuit}(1,\ldots, 1)\right)}{\log\left(\size(\circuit)\right)}\right)\]
}
such that
\begin{equation}
\label{eq:approx-algo-bound}
\probOf\left(\left|\mathcal{E} - \rpoly(\prob_1,\dots,\prob_\numvar)\right|> \error \cdot \rpoly(\prob_1,\dots,\prob_\numvar)\right) \leq \conf.
\end{equation}
\end{Theorem}
The slight abuse of notation seen in $\abs{\circuit}\inparen{1,\ldots,1}$ is explained after \Cref{def:positive-circuit} and an example is given in \Cref{ex:def-pos-circ}.  The only difference in the use of this notation in \Cref{lem:approx-alg} is that we include an additional exponent to square the quantity.

\subsection{Proof of Theorem \ref{lem:approx-alg}}\label{sec:proof-lem-approx-alg}
\input{app_approx_alg-pseudo-code}
We prove \Cref{lem:approx-alg} constructively by presenting an algorithm \approxq (\Cref{alg:mon-sam}) which has the desired runtime and computes an approximation with the desired approximation guarantee. Algorithm \approxq uses auxiliary algorithm \onepass to compute weights on the edges of a circuit. These weights are then used to sample a set of monomials of $\poly(\circuit)$ from the circuit $\circuit$ by traversing the circuit using the weights to ensure that monomials are sampled with an appropriate probability. The correctness of \approxq relies on the correctness (and runtime behavior) of auxiliary algorithms \onepass and \sampmon that we state in the following lemmas (and prove later in this part of the appendix).

\begin{Lemma}\label{lem:one-pass}
The $\onepass$ function completes in time:
$$O\left(\size(\circuit) \cdot \multc{\log\left(\abs{\circuit(1\ldots, 1)}\right)}{\log{\size(\circuit)}}\right)$$
  $\onepass$ guarantees two post-conditions:  First, for each subcircuit $\vari{S}$ of $\circuit$, we have that $\vari{S}.\vari{partial}$ is set to $\abs{\vari{S}}(1,\ldots, 1)$.  Second, when $\vari{S}.\type  = \circplus$, \subcircuit.\lwght $= \frac{\abs{\subcircuit_\linput}(1,\ldots, 1)}{\abs{\subcircuit}(1,\ldots, 1)}$ and likewise for \subcircuit.\rwght.
\end{Lemma}
To prove correctness of \Cref{alg:mon-sam}, we use the following fact that follows from the above lemma: for the modified circuit ($\circuit_{\vari{mod}}$) output by \onepass, $\circuit_{\vari{mod}}.\vari{partial}=\abs{\circuit}(1,\dots,1)$.

\AH{I don't think the word \emph{only} is needed.}
\begin{Lemma}\label{lem:sample}
The function $\sampmon$ completes in time
$$O(\log{k} \cdot k \cdot \depth(\circuit)\cdot\multc{\log\left(\abs{\circuit}(1,\ldots, 1)\right)}{\log{\size(\circuit)}})$$
 where $k = \degree(\circuit)$.  The function returns every $\left(\monom, sign(\coef)\right)$ for $(\monom, \coef)\in \expansion{\circuit}$ with probability $\frac{|\coef|}{\abs{\circuit}(1,\ldots, 1)}$.
\end{Lemma}

With the above two lemmas, we are ready to argue the following result:
\begin{Theorem}\label{lem:mon-samp}
For any $\circuit$ with
$\degree(\polyf(|\circuit|)) = k$, algorithm \ref{alg:mon-sam} outputs an estimate $\vari{acc}$ of $\rpoly(\prob_1,\ldots, \prob_\numvar)$ such that
\[\probOf\left(\left|\vari{acc} - \rpoly(\prob_1,\ldots, \prob_\numvar)\right|\geq \error \cdot \abs{\circuit}(1,\ldots, 1)\right) \leq \conf,\]
 in $O\left(\left(\size(\circuit)+\frac{\log{\frac{1}{\conf}}}{\error^2} \cdot k \cdot\log{k} \cdot \depth(\circuit)\right)\cdot \multc{\log\left(\abs{\circuit}(1,\ldots, 1)\right)}{\log{\size(\circuit)}}\right)$ time.
\end{Theorem}


Before proving \Cref{lem:mon-samp}, we use it to argue the claimed runtime of our main result, \Cref{lem:approx-alg}.

\begin{proof}[Proof of \Cref{lem:approx-alg}]
Set $\mathcal{E}=\approxq({\circuit}, (\prob_1,\dots,\prob_\numvar),$ $\conf, \error')$, where
\[\error' = \error \cdot \frac{\rpoly(\prob_1,\ldots, \prob_\numvar)}{\abs{{\circuit}}(1,\ldots, 1)},\]
 which achieves the claimed error bound on $\mathcal{E}$ (\vari{acc}) trivially due to the assignment to $\error'$ and \cref{lem:mon-samp}, since $\error' \cdot \abs{\circuit}(1,\ldots, 1) = \error\cdot\frac{\rpoly(\prob_1,\ldots, \prob_\numvar)}{\abs{\circuit}(1,\ldots, 1)} \cdot \abs{\circuit}(1,\ldots, 1) = \error\cdot\rpoly(\prob_1,\ldots, \prob_\numvar)$.

The claim on the runtime follows from \Cref{lem:mon-samp} since

\begin{align*}
\frac 1{\inparen{\error'}^2}\cdot \log\inparen{\frac 1\conf}=&\frac{\log{\frac{1}{\conf}}}{\error^2 \left(\frac{\rpoly(\prob_1,\ldots, \prob_N)}{\abs{{\circuit}}(1,\ldots, 1)}\right)^2}\\
= &\frac{\log{\frac{1}{\conf}}\cdot \abs{{\circuit}}^2(1,\ldots, 1)}{\error^2 \cdot \rpoly^2(\prob_1,\ldots, \prob_\numvar)}.
\end{align*}
\qed
\end{proof}

Let us now prove \Cref{lem:mon-samp}:
\subsection{Proof of Theorem \ref{lem:mon-samp}}\label{app:subsec-th-mon-samp}
\begin{proof}
Consider now the random variables $\randvar_1,\dots,\randvar_\numsamp$, where each $\randvar_\vari{i}$ is the value of $\vari{Y}_{\vari{i}}$ in \cref{alg:mon-sam} after \cref{alg:mon-sam-product} is executed. Overloading $\isInd{\cdot}$ to receive monomial input (recall $\encMon$ is the monomial composed of the variables in the set $\monom$), we have
\[\randvar_\vari{i}= \indicator{\inparen{\isInd{\encMon}}}\cdot \prod_{X_i\in \var\inparen{v}} p_i,\]
where the indicator variable handles the check in \Cref{alg:check-duplicate-block}
Then for random variable $\randvar_i$, it is the case that
\begin{align*}
\expct\pbox{\randvar_\vari{i}} &= \sum\limits_{(\monom, \coef) \in \expansion{{\circuit}} }\frac{\indicator{\inparen{\isInd{\encMon}}}\cdot c\cdot\prod_{X_i\in \var\inparen{v}} p_i }{\abs{{\circuit}}(1,\dots,1)} \\
&= \frac{\rpoly(\prob_1,\ldots, \prob_\numvar)}{\abs{{\circuit}}(1,\ldots, 1)},
\end{align*}
where in the first equality we use the fact that $\vari{sgn}_{\vari{i}}\cdot \abs{\coef}=\coef$ and the second equality follows from \Cref{eq:tilde-Q-bi} with $X_i$ substituted by $\prob_i$.

Let $\empmean = \frac{1}{\samplesize}\sum_{i = 1}^{\samplesize}\randvar_\vari{i}$.  It is also true that

\[\expct\pbox{\empmean}
= \frac{1}{\samplesize}\sum_{i = 1}^{\samplesize}\expct\pbox{\randvar_\vari{i}}
= \frac{\rpoly(\prob_1,\ldots, \prob_\numvar)}{\abs{{\circuit}}(1,\ldots, 1)}.\]

Hoeffding's inequality states that if we know that each $\randvar_i$ (which are all independent) always lie in the intervals $[a_i, b_i]$, then it is true that
\begin{equation*}
\probOf\left(\left|\empmean - \expct\pbox{\empmean}\right| \geq \error\right) \leq 2\exp{\left(-\frac{2\samplesize^2\error^2}{\sum_{i = 1}^{\samplesize}(b_i -a_i)^2}\right)}.
\end{equation*}

Line~\ref{alg:mon-sam-sample} shows that $\vari{sgn}_\vari{i}$ has a value in $\{-1, 1\}$ that is multiplied with $O(k)$ $\prob_i\in [0, 1]$, which implies the range for each $\randvar_i$ is $[-1, 1]$.
Using Hoeffding's inequality, we then get:
\begin{equation*}
\probOf\left(~\left| \empmean - \expct\pbox{\empmean} ~\right| \geq \error\right) \leq 2\exp{\left(-\frac{2\samplesize^2\error^2}{2^2 \samplesize}\right)} = 2\exp{\left(-\frac{\samplesize\error^2}{2 }\right)}\leq \conf,
\end{equation*}
where the last inequality dictates our choice of $\samplesize$ in \Cref{alg:mon-sam-global2}.
\AH{Why does the $\geq$ sign change to $>$?}
For the claimed probability bound of $\probOf\left(\left|\vari{acc} - \rpoly(\prob_1,\ldots, \prob_\numvar)\right|\geq \error \cdot \abs{\circuit}(1,\ldots, 1)\right) \leq \conf$, note that in the algorithm, \vari{acc} is exactly $\empmean \cdot \abs{\circuit}(1,\ldots, 1)$.  Multiplying the rest of the terms by the additional factor $\abs{\circuit}(1,\ldots, 1)$ yields the said bound.

This concludes the proof for the first claim of theorem~\ref{lem:mon-samp}.  Next, we prove the claim on the runtime.

\paragraph*{Run-time Analysis}
The runtime of the algorithm is dominated first by \Cref{alg:mon-sam-onepass} which has $O\left({\size(\circuit)}\cdot \multc{\log\left(\abs{\circuit}(1,\ldots, 1)\right)}{\log\left(\size(\circuit)\right)}\right)$ runtime by \Cref{lem:one-pass}.  There are then $\samplesize$ iterations of the loop in \Cref{alg:sampling-loop}. Each iteration's run time is dominated by the call to \sampmon in \Cref{alg:mon-sam-sample} (which by \Cref{lem:sample} takes $O\left(\log{k} \cdot k \cdot {\depth(\circuit)}\cdot \multc{\log\left(\abs{\circuit}(1,\ldots, 1)\right)}{\log\left(\size(\circuit)\right)}\right)$
) and the check \Cref{alg:check-duplicate-block}, which by the subsequent argument takes $O(k\log{k})$ time. We sort the $O(k)$ variables by their block IDs and then check if there is a duplicate block ID or not. Combining all the times discussed here gives us the desired overall runtime.
\qed
\end{proof}

\subsection{Proof of \Cref{cor:approx-algo-const-p}}
\begin{proof}
The result follows by first noting that by definition of $\gamma$, we have
\[\rpoly(1,\dots,1)= (1-\gamma)\cdot \abs{{\circuit}}(1,\dots,1).\]
Further, since each $\prob_i\ge \prob_0$ and $\poly(\vct{X})$ (and hence $\rpoly(\vct{X})$) has degree at most $k$, we have that
\[ \rpoly(1,\dots,1) \ge \prob_0^k\cdot \rpoly(1,\dots,1).\]
The above two inequalities implies $\rpoly(1,\dots,1) \ge \prob_0^k\cdot (1-\gamma)\cdot \abs{{\circuit}}(1,\dots,1)$.
Applying this bound in the runtime bound in \Cref{lem:approx-alg} gives the first claimed runtime. The final runtime of $O_k\left(\frac 1{\eps^2}\cdot\size(\circuit)\cdot \log{\frac{1}{\conf}}\cdot \multc{\log\left(\abs{\circuit}(1,\ldots, 1)\right)}{\log\left(\size(\circuit)\right)}\right)$ follows by noting that $\depth({\circuit})\le \size({\circuit})$ and absorbing all factors that just depend on $k$.
\qed
\end{proof}


%%% Local Variables:
%%% mode: latex
%%% TeX-master: "main"
%%% End: