paper-BagRelationalPDBsAreHard/app_approx-alg-analysis.tex

%root: main.tex


Before proving~\Cref{lem:mon-samp}, we use it to argue our main result,~\Cref{lem:approx-alg}:
\subsection{Proof of Theorem \ref{lem:approx-alg}}\label{sec:proof-lem-approx-alg}

Set $\mathcal{E}=\approxq({\circuit}, (\prob_1,\dots,\prob_\numvar),$ $\conf, \error')$, where
\[\error' = \error \cdot \frac{\rpoly(\prob_1,\ldots, \prob_\numvar)\cdot (1 - \gamma)}{\abs{{\circuit}}(1,\ldots, 1)},\]
 which achieves the claimed accuracy bound on $\mathcal{E}$ due to~\Cref{lem:mon-samp}.

The claim on the runtime follows from~\Cref{lem:mon-samp} since
\begin{align*}
\frac 1{\inparen{\error'}^2}\cdot \log\inparen{\frac 1\conf}=&\frac{\log{\frac{1}{\conf}}}{\error^2 \left(\frac{\rpoly(\prob_1,\ldots, \prob_N)}{\abs{{\circuit}}(1,\ldots, 1)}\right)^2}\\
= &\frac{\log{\frac{1}{\conf}}\cdot \abs{{\circuit}}^2(1,\ldots, 1)}{\error^2 \cdot \rpoly^2(\prob_1,\ldots, \prob_\numvar)},
\end{align*}
%and the runtime then follows, thus upholding ~\cref{lem:approx-alg}.
which completes the proof.

We now return to the proof of~\Cref{lem:mon-samp}:
\subsection{Proof of Theorem \ref{lem:mon-samp}}\label{app:subsec-th-mon-samp}
Consider now the random variables $\randvar_1,\dots,\randvar_\numvar$, where each $\randvar_i$ is the value of $\vari{Y}_{\vari{i}}$ after~\Cref{alg:mon-sam-product} is executed. In particular, note that we have
\[Y_i= \onesymbol\inparen{\monom\mod{\mathcal{B}}\not\equiv 0}\cdot \prod_{X_i\in \var\inparen{v}} p_i,\]
where the indicator variable handles the check in~\Cref{alg:check-duplicate-block}
Then for random variable $\randvar_i$, it is the case that
\begin{align*}
\expct\pbox{\randvar_i} &= \sum\limits_{(\monom, \coef) \in \expansion{{\circuit}} }\frac{\onesymbol\inparen{\monom\mod{\mathcal{B}}\not\equiv 0}\cdot c\cdot\prod_{X_i\in \var\inparen{v}} p_i }{\abs{{\circuit}}(1,\dots,1)} \\
&= \frac{\rpoly(\prob_1,\ldots, \prob_\numvar)}{\abs{{\circuit}}(1,\ldots, 1)},
\end{align*}
where in the first equality we use the fact that $\vari{sgn}_{\vari{i}}\cdot \abs{\coef}=\coef$ and the second equality follows from~\cref{eq:tilde-Q-bi} with $X_i$ substituted by $\prob_i$.

Let $\empmean = \frac{1}{\samplesize}\sum_{i = 1}^{\samplesize}\randvar_i$.  It is also true that

\[\expct\pbox{\empmean}
= \frac{1}{\samplesize}\sum_{i = 1}^{\samplesize}\expct\pbox{\randvar_i}
= \frac{\rpoly(\prob_1,\ldots, \prob_\numvar)}{\abs{{\circuit}}(1,\ldots, 1)}.\]

Hoeffding's inequality states that if we know that each $\randvar_i$ (which are all independent) always lie in the intervals $[a_i, b_i]$, then it is true that
\begin{equation*}
\probOf\left(\left|\empmean - \expct\pbox{\empmean}\right| \geq \error\right) \leq 2\exp{\left(-\frac{2\samplesize^2\error^2}{\sum_{i = 1}^{\samplesize}(b_i -a_i)^2}\right)}.
\end{equation*}

Line ~\ref{alg:mon-sam-sample} shows that $\vari{sgn}_\vari{i}$ has a value in $\{-1, 1\}$ that is multiplied with $O(k)$ $\prob_i\in [0, 1]$, which implies the range for each $\randvar_i$ is $[-1, 1]$.
Using Hoeffding's inequality, we then get:
\begin{equation*}
\probOf\pbox{~\left| \empmean - \expct\pbox{\empmean} ~\right| \geq \error} \leq 2\exp{\left(-\frac{2\samplesize^2\error^2}{2^2 \samplesize}\right)} = 2\exp{\left(-\frac{\samplesize\error^2}{2 }\right)}\leq \conf,
\end{equation*}
where the last inequality follows from our choice of $\samplesize$ in~\Cref{alg:mon-sam-global2}.

This concludes the proof for the first claim of theorem ~\ref{lem:mon-samp}. We prove the claim  on the runtime next.

\paragraph*{Run-time Analysis}
The runtime of the algorithm is dominated by~\Cref{alg:mon-sam-onepass} (which by~\Cref{lem:one-pass} takes time $O\left({\size(\circuit)}\cdot \multc{\log\left(\abs{\circuit}^2(1,\ldots, 1)\right)}{\log\left(\size(\circuit)\right)}\right)$) and the $\samplesize$ iterations of the loop in~\Cref{alg:sampling-loop}. Each iteration's run time is dominated by the call to~\Cref{alg:mon-sam-sample} (which by~\Cref{lem:sample} takes $O\left(\log{k} \cdot k \cdot {\depth(\circuit)}\cdot \multc{\log\left(\abs{\circuit}^2(1,\ldots, 1)\right)}{\log\left(\size(\circuit)\right)}\right)$
) and~\Cref{alg:check-duplicate-block}, which by the subsequent argument takes $O(k\log{k})$ time. We sort the $O(k)$ variables by their block IDs and then check if there is a duplicate block ID or not. Adding up all the times discussed here gives us the desired overall runtime.

\subsection{Proof of~\Cref{cor:approx-algo-const-p}}
The result follows by first noting that by definition of $\gamma$, we have
\[\rpoly(1,\dots,1)= (1-\gamma)\cdot \abs{{\circuit}}(1,\dots,1).\]
Further, since each $\prob_i\ge \prob_0$ and $\poly(\vct{X})$ (and hence $\rpoly(\vct{X})$) has degree at most $k$, we have that
\[ \rpoly(1,\dots,1) \ge \prob_0^k\cdot \rpoly(1,\dots,1).\]
The above two inequalities implies $\rpoly(1,\dots,1) \ge \prob_0^k\cdot (1-\gamma)\cdot \abs{{\circuit}}(1,\dots,1)$.
Applying this bound in the runtime bound in~\Cref{lem:approx-alg} gives the first claimed runtime. The final runtime of $O_k\left(\frac 1{\eps^2}\cdot\size(\circuit)\cdot \log{\frac{1}{\conf}}\cdot \multc{\log\left(\abs{\circuit}^2(1,\ldots, 1)\right)}{\log\left(\size(\circuit)\right)}\right)$ follows by noting that $\depth({\circuit})\le \size({\circuit})$ and absorbing all factors that just depend on $k$.

\subsection{Proof of~\Cref{lem:val-ub}}
\label{app:proof-lem-val-ub}

%\paragraph{Sufficient condition for $\abs{\circuit}(1,\ldots, 1)$ to be size $O(N)$}
%For our runtime results to be relevant, it must be the case that the sum of the coefficients computed by \onepass is indeed size $O(N)$ since there are $O(\log{N})$ bits in the RAM model where $N$ is the size of the input.  The size of the input here is \size(\circuit).  We show that when \size$(\circuit_\linput) = N_\linput$, \size$(\circuit_\rinput) = N_\rinput$, where $N_\linput + N_\rinput \leq N$, this is indeed the case.

We will prove~\Cref{lem:val-ub} by considering the three cases separetly. We first being with the case when $\circuit$ is a tree:
\begin{Lemma}
\label{lem:C-ub-tree}
Let $\circuit$ be a tree (i.e. the sub-circuits corresponding to two children of a node in $\circuit$ are completely disjoint). Then we have
\[\abs{\circuit}(1,\dots,1)\le \left(\size(\circuit)\right)^{\degree(\circuit)+1}.\]
\end{Lemma}
\begin{proof}%[Proof of $\abs{\circuit}(1,\ldots, 1)$ is size $O(N)$]
For notational simplcity define $N=\size(\circuit)$ and $k=\degree(\circuit)$.
To prove this result, we by prove by induction on $\depth(\circuit)$ that $\abs{\circuit}(1,\ldots, 1) \leq N^{k+1 }$.
For the base case, we have that \depth(\circuit) $= 0$, and there can only be one node which must contain a coefficient (or constant) of $1$.  In this case, $\abs{\circuit}(1,\ldots, 1) = 1$, and \size(\circuit) $= 1$, and it is true that $\abs{\circuit}(1,\ldots, 1) = 1 \leq N^{k+1} = 1^{1} = 1$.

Assume for $\ell > 0$ an arbitrary circuit \circuit of $\depth(\circuit) \leq \ell$ that it is true that $\abs{\circuit}(1,\ldots, 1) \leq N^{\deg(\circuit)+1 }$.% for $k \geq 1$ when \depth(C) $\geq 1$.

For the inductive step we consider a circuit \circuit such that $\depth(\circuit) = \ell + 1$.  The sink can only be either a $\circmult$ or $\circplus$ gate.  Consider when sink node is $\circmult$.  Let $k_\linput, k_\rinput$ denote \degree($\circuit_\linput$) and \degree($\circuit_\rinput$) respectively.  %Note that this case does not require the constraint on $N_\linput$ or $N_\rinput$.
%In this case we do not use the fact that $\circuit$ is a tree and just assume that $N_\linput,N_\rinput\le N-1$.
 Then note that
\begin{align}
\abs{\circuit}(1,\ldots, 1) &= \abs{\circuit_\linput}(1,\ldots, 1)\circmult \abs{\circuit_\rinput}(1,\ldots, 1) \nonumber\\
&\leq (N-1)^{k_\linput+1} \circmult (N - 1)^{k_\rinput+1}\nonumber\\
 &= (N-1)^{k+1}\label{eq:sumcoeff-times-upper}\\
 &\leq N^{2^k}.\nonumber
\end{align}
%We derive the upperbound of \cref{eq:sumcoeff-times-upper} by noting that the maximum value of the LHS occurs when both the base and exponent are maximized.
In the above the first inequality follows from the inductive hypothesis (and the fact that $N_\linput,N_\rinput\le N-1$) and \cref{eq:sumcoeff-times-upper} follows by nothing that for $\times$ node we have $k=k_\linput+k_\rinput+1$.

For the case when the sink node is a $\circplus$ node, then we have
\begin{align}
\abs{\circuit}(1,\ldots, 1) &= \abs{\circuit_\linput}(1,\ldots, 1) \circplus \abs{\circuit_\rinput}(1,\ldots, 1) \nonumber\\
&\leq
N_\linput^{k+1} + N_\rinput^{k+1}\nonumber\\
&\leq (N-1)^{k+1 } \label{eq:sumcoeff-plus-upper}\\
&\leq N^{k+1}.\nonumber
\end{align}
In the above, the first inequality follows from the inductive hypothesis (and the fact that $k_\linput,k_\rinput\le k$)  while the second inequality follows from the fact that since $\circuit$ is a tree we have $N_\linput+N_\rinput=N-1$ and the fact that $k\ge 0$. This compeletes the proof.
%Similar to the $\circmult$ case, \cref{eq:sumcoeff-plus-upper} upperbounds its LHS by the fact that the maximum base and exponent combination is always greater than or equal to the sum of lower base/exponent combinations.  The final equality is true given the constraint over the inputs.

%Since $\abs{\circuit}(1,\ldots, 1) \leq N^{2^k}$ for all circuits such that all $\circplus$ gates share at most one gate with their sibling (across their respective subcircuits), then $\log{N^{2^k}} = 2^k \cdot \log{N}$ which for fixed $k$ yields the desired $O(\log{N})$ bits for $O(1)$ arithmetic operations.% for the given query class.
\end{proof}

%\revision{\textbf{THE PART BELOW NEEDS WORK. --Atri}}
The upper bound in~\Cref{lem:val-ub} for the general case is a simple variant of the above proof (but we present a proof sketch of the bound below for completeness):
\begin{Lemma}
\label{lem:C-ub-gen}
Let $\circuit$ be a (general) circuit. % tree (i.e. the sub-circuits corresponding to two children of a node in $\circuit$ are completely disjoint).
Then we have
\[\abs{\circuit}(1,\dots,1)\le 2^{2^{\degree(\circuit)}\cdot \size(\circuit)}.\]
\end{Lemma}
\begin{proof}[Proof Sketch]
We use the same notation as in the proof of~\Cref{lem:C-ub-tree}. We will prove by induction on $\depth(\circuit)$ that $\abs{\circuit}(1,\ldots, 1) \leq 2^{2^k\cdot N }$. The base case argument is similar to that in the proof of~\Cref{lem:C-ub-tree}. In the inductive case we have that $N_\linput,N_\rinput\le N-1$.

For the case when the sink node is $\times$, we get that
\begin{align*}
\abs{\circuit}(1,\ldots, 1) &= \abs{\circuit_\linput}(1,\ldots, 1)\circmult \abs{\circuit_\rinput}(1,\ldots, 1) \\
&\leq {2^{2^{k_\linput}\cdot N_\linput}} \circmult {2^{2^{k_\rinput}\cdot N_\rinput}}\\
 &\leq 2^{2\cdot 2^{k-1}\cdot (N-1)}\\
 &\leq 2^{2^k N}.
\end{align*}
In the above the first inequality follows from inductive hypothesis while the second inequality follows from the fact that $k_\linput,k_\rinput\le k-1$ and $N_\linput, N_\rinput\le N-1$.
%$k_\linput+k_\rinput=k$ (and hence $\max(k_\linput,k_\rinput)\le k$) as well as the fact that $k\ge 0$.

Now consider the case when the sink node is $+$, we get that
\begin{align*}
\abs{\circuit}(1,\ldots, 1) &= \abs{\circuit_\linput}(1,\ldots, 1) \circplus \abs{\circuit_\rinput}(1,\ldots, 1) \\
&\leq 2^{2^{k_\linput}\cdot N_\linput} + 2^{2^{k_\rinput}\cdot N_\rinput}\\
&\leq 2\cdot {2^{2^k(N-1)} } \\
&\leq 2^{2^kN}.
\end{align*}
In the above the first inequality follows from the inductive hypothesis while the second inequality follows from the facts that $k_\linput,k_\rinput\le k$ and $N_\linput,N_\rinput\le N-1$. The final inequality follows from the fact that $k\ge 0$.
\end{proof}