paper-BagRelationalPDBsAreHard/app_approx-alg-analysis.tex

138 lines
11 KiB
TeX
Raw Normal View History

2021-04-06 11:43:34 -04:00
%root: main.tex
Before proving~\Cref{lem:mon-samp}, we use it to argue our main result,~\Cref{lem:approx-alg}:
\subsection{Proof of Theorem \ref{lem:approx-alg}}\label{sec:proof-lem-approx-alg}
2021-04-06 16:35:11 -04:00
Set $\mathcal{E}=\approxq({\circuit}, (\prob_1,\dots,\prob_\numvar),$ $\conf, \error')$, where
\[\error' = \error \cdot \frac{\rpoly(\prob_1,\ldots, \prob_\numvar)\cdot (1 - \gamma)}{\abs{{\circuit}}(1,\ldots, 1)},\]
2021-04-06 14:29:47 -04:00
which achieves the claimed accuracy bound on $\mathcal{E}$ due to~\Cref{lem:mon-samp}.
2021-04-06 11:43:34 -04:00
2021-04-06 14:29:47 -04:00
The claim on the runtime follows from~\Cref{lem:mon-samp} since
2021-04-06 11:43:34 -04:00
\begin{align*}
2021-04-06 16:35:11 -04:00
\frac 1{\inparen{\error'}^2}\cdot \log\inparen{\frac 1\conf}=&\frac{\log{\frac{1}{\conf}}}{\error^2 \left(\frac{\rpoly(\prob_1,\ldots, \prob_\numvar)\cdot (1-\gamma)}{\abs{{\circuit}}(1,\ldots, 1)}\right)^2}\\
= &\frac{\log{\frac{1}{\conf}}\cdot \abs{{\circuit}}^2(1,\ldots, 1)}{\error^2 \cdot (1-\gamma)^2\cdot \rpoly^2(\prob_1,\ldots, \prob_\numvar)},
2021-04-06 11:43:34 -04:00
\end{align*}
%and the runtime then follows, thus upholding ~\cref{lem:approx-alg}.
which completes the proof.
We now return to the proof of~\Cref{lem:mon-samp}:
\subsection{Proof of Theorem \ref{lem:mon-samp}}\label{app:subsec-th-mon-samp}
Consider now the random variables $\randvar_1,\dots,\randvar_\samplesize$, where each $\randvar_i$ is the value of $\vari{Y}_{\vari{i}}$ after~\Cref{alg:mon-sam-product} is executed. In particular, note that we have
\[Y_i= \onesymbol\inparen{\monom\mod{\mathcal{B}}\not\equiv 0}\cdot \prod_{X_j\in \var\inparen{\monom}} \prob_j,\]
where the indicator variable handles the check in~\Cref{alg:check-duplicate-block}.
Then for random variable $\randvar_i$, it is the case that
\begin{align*}
2021-04-06 16:35:11 -04:00
\expct\pbox{\randvar_i} &= \sum\limits_{(\monom, \coef) \in \expansion{{\circuit}} }\frac{\onesymbol\inparen{\monom\mod{\mathcal{B}}\not\equiv 0}\cdot \coef\cdot\prod_{X_j\in \var\inparen{\monom}} \prob_j }{\abs{{\circuit}}(1,\dots,1)} \\
&= \frac{\rpoly(\prob_1,\ldots, \prob_\numvar)}{\abs{{\circuit}}(1,\ldots, 1)},
2021-04-06 11:43:34 -04:00
\end{align*}
where in the first equality we use the fact that $\vari{sgn}_{\vari{i}}\cdot \abs{\coef}=\coef$ and the second equality follows from~\cref{eq:tilde-Q-bi} with $X_i$ substituted by $\prob_i$.
Let $\empmean = \frac{1}{\samplesize}\sum_{i = 1}^{\samplesize}\randvar_i$. It is also true that
\[\expct\pbox{\empmean}
= \frac{1}{\samplesize}\sum_{i = 1}^{\samplesize}\expct\pbox{\randvar_i}
2021-04-06 16:35:11 -04:00
= \frac{\rpoly(\prob_1,\ldots, \prob_\numvar)}{\abs{{\circuit}}(1,\ldots, 1)}.\]
2021-04-06 11:43:34 -04:00
Hoeffding's inequality states that if the random variables $\randvar_i$ are independent and each $\randvar_i$ always lies in the interval $[a_i, b_i]$, then it is true that
\begin{equation*}
\probOf\left(\left|\empmean - \expct\pbox{\empmean}\right| \geq \error\right) \leq 2\exp{\left(-\frac{2\samplesize^2\error^2}{\sum_{i = 1}^{\samplesize}(b_i -a_i)^2}\right)}.
\end{equation*}
Line~\ref{alg:mon-sam-sample} shows that $\vari{sgn}_\vari{i}$ has a value in $\{-1, 1\}$ that is multiplied with $O(k)$ values $\prob_i\in [0, 1]$, which implies that the range of each $\randvar_i$ is $[-1, 1]$.
Using Hoeffding's inequality, we then get:
\begin{equation*}
\probOf\pbox{~\left| \empmean - \expct\pbox{\empmean} ~\right| \geq \error} \leq 2\exp{\left(-\frac{2\samplesize^2\error^2}{2^2 \samplesize}\right)} = 2\exp{\left(-\frac{\samplesize\error^2}{2 }\right)}\leq \conf,
\end{equation*}
where the last inequality follows from our choice of $\samplesize$ in~\Cref{alg:mon-sam-global2}.
2021-04-06 14:29:47 -04:00
This concludes the proof of the first claim of Theorem~\ref{lem:mon-samp}. We prove the claim on the runtime next.
2021-04-06 11:43:34 -04:00
2021-04-06 14:29:47 -04:00
\paragraph*{Run-time Analysis}
The runtime of the algorithm is dominated by~\Cref{alg:mon-sam-onepass} (which by~\Cref{lem:one-pass} takes time $O\left({\size(\circuit)}\cdot \multc{\log\left(\abs{\circuit}^2(1,\ldots, 1)\right)}{\log\left(\size(\circuit)\right)}\right)$) and the $\samplesize$ iterations of the loop in~\Cref{alg:sampling-loop}. Each iteration's run time is dominated by the call to~\Cref{alg:mon-sam-sample} (which by~\Cref{lem:sample} takes $O\left(\log{k} \cdot k \cdot {\depth(\circuit)}\cdot \multc{\log\left(\abs{\circuit}^2(1,\ldots, 1)\right)}{\log\left(\size(\circuit)\right)}\right)$
2021-04-06 11:43:34 -04:00
) and~\Cref{alg:check-duplicate-block}, which by the subsequent argument takes $O(k\log{k})$ time. We sort the $O(k)$ variables by their block IDs and then check if there is a duplicate block ID or not. Adding up all the times discussed here gives us the desired overall runtime.
\subsection{Proof of~\Cref{cor:approx-algo-const-p}}
The result follows by first noting that by definition of $\gamma$, we have
2021-04-06 16:35:11 -04:00
\[\rpoly(1,\dots,1)= (1-\gamma)\cdot \abs{{\circuit}}(1,\dots,1).\]
2021-04-06 11:43:34 -04:00
Further, since each $\prob_i\ge \prob_0$ and $\poly(\vct{X})$ (and hence $\rpoly(\vct{X})$) has degree at most $k$, we have that
\[ \rpoly(\prob_1,\dots,\prob_\numvar) \ge \prob_0^k\cdot \rpoly(1,\dots,1).\]
2021-04-06 16:35:11 -04:00
The above equality and inequality together imply $\rpoly(\prob_1,\dots,\prob_\numvar) \ge \prob_0^k\cdot (1-\gamma)\cdot \abs{{\circuit}}(1,\dots,1)$.
Applying this bound in the runtime bound in~\Cref{lem:approx-alg} gives the first claimed runtime. The final runtime of $O_k\left(\frac 1{\eps^2}\cdot\size(\circuit)\cdot \log{\frac{1}{\conf}}\cdot \multc{\log\left(\abs{\circuit}^2(1,\ldots, 1)\right)}{\log\left(\size(\circuit)\right)}\right)$ follows by noting that $\depth({\circuit})\le \size({\circuit})$ and absorbing all factors that just depend on $k$.
\subsection{Proof of~\Cref{lem:val-ub}}
2021-04-06 21:14:29 -04:00
\label{app:proof-lem-val-ub}
2021-04-06 16:35:11 -04:00
%\paragraph{Sufficient condition for $\abs{\circuit}(1,\ldots, 1)$ to be size $O(N)$}
%For our runtime results to be relevant, it must be the case that the sum of the coefficients computed by \onepass is indeed size $O(N)$ since there are $O(\log{N})$ bits in the RAM model where $N$ is the size of the input. The size of the input here is \size(\circuit). We show that when \size$(\circuit_\linput) = N_\linput$, \size$(\circuit_\rinput) = N_\rinput$, where $N_\linput + N_\rinput \leq N$, this is indeed the case.
We will prove~\Cref{lem:val-ub} by considering the three cases separately. We first begin with the case when $\circuit$ is a tree:
\begin{Lemma}
\label{lem:C-ub-tree}
Let $\circuit$ be a tree (i.e., the sub-circuits corresponding to two children of a node in $\circuit$ are completely disjoint). Then we have
2021-04-06 23:17:19 -04:00
\[\abs{\circuit}(1,\dots,1)\le \left(\size(\circuit)\right)^{\degree(\circuit)+1}.\]
2021-04-06 16:35:11 -04:00
\end{Lemma}
\begin{proof}%[Proof of $\abs{\circuit}(1,\ldots, 1)$ is size $O(N)$]
2021-04-06 23:17:19 -04:00
For notational simplicity define $N=\size(\circuit)$ and $k=\degree(\circuit)$.
To prove this result, we prove by induction on $\depth(\circuit)$ that $\abs{\circuit}(1,\ldots, 1) \leq N^{k+1 }$.
For the base case, we have that \depth(\circuit) $= 0$, and there can only be one node which must contain a coefficient (or constant) of $1$. In this case, $\abs{\circuit}(1,\ldots, 1) = 1$, and \size(\circuit) $= 1$, and it is true that $\abs{\circuit}(1,\ldots, 1) = 1 \leq N^{k+1} = 1^{1} = 1$.
2021-04-06 16:35:11 -04:00
2021-04-06 23:17:19 -04:00
Assume for some $\ell \geq 0$ that every circuit $\circuit$ with $\depth(\circuit) \leq \ell$ satisfies $\abs{\circuit}(1,\ldots, 1) \leq \left(\size(\circuit)\right)^{\degree(\circuit)+1 }$.% for $k \geq 1$ when \depth(C) $\geq 1$.
2021-04-06 16:35:11 -04:00
For the inductive step we consider a circuit $\circuit$ such that $\depth(\circuit) = \ell + 1$. The sink can only be either a $\circmult$ or $\circplus$ gate. Consider first the case when the sink node is $\circmult$. Let $k_\linput, k_\rinput$ denote $\degree(\circuit_\linput)$ and $\degree(\circuit_\rinput)$ respectively, and let $N_\linput, N_\rinput$ denote $\size(\circuit_\linput)$ and $\size(\circuit_\rinput)$. %Note that this case does not require the constraint on $N_\linput$ or $N_\rinput$.
2021-04-06 23:17:19 -04:00
%In this case we do not use the fact that $\circuit$ is a tree and just assume that $N_\linput,N_\rinput\le N-1$.
Then note that
2021-04-06 16:35:11 -04:00
\begin{align}
\abs{\circuit}(1,\ldots, 1) &= \abs{\circuit_\linput}(1,\ldots, 1)\circmult \abs{\circuit_\rinput}(1,\ldots, 1) \nonumber\\
2021-04-06 23:17:19 -04:00
&\leq (N-1)^{k_\linput+1} \circmult (N - 1)^{k_\rinput+1}\nonumber\\
&= (N-1)^{k+1}\label{eq:sumcoeff-times-upper}\\
2021-04-06 16:35:11 -04:00
&\leq N^{k+1}.\nonumber
\end{align}
%We derive the upperbound of \cref{eq:sumcoeff-times-upper} by noting that the maximum value of the LHS occurs when both the base and exponent are maximized.
2021-04-06 23:17:19 -04:00
In the above the first inequality follows from the inductive hypothesis (and the fact that $N_\linput,N_\rinput\le N-1$) and \cref{eq:sumcoeff-times-upper} follows by noting that for a $\times$ node we have $k=k_\linput+k_\rinput+1$.
2021-04-06 16:35:11 -04:00
For the case when the sink node is a $\circplus$ node, then we have
\begin{align}
\abs{\circuit}(1,\ldots, 1) &= \abs{\circuit_\linput}(1,\ldots, 1) \circplus \abs{\circuit_\rinput}(1,\ldots, 1) \nonumber\\
&\leq
2021-04-06 23:17:19 -04:00
N_\linput^{k+1} + N_\rinput^{k+1}\nonumber\\
&\leq (N-1)^{k+1 } \label{eq:sumcoeff-plus-upper}\\
&\leq N^{k+1}.\nonumber
2021-04-06 16:35:11 -04:00
\end{align}
2021-04-06 23:17:19 -04:00
In the above, the first inequality follows from the inductive hypothesis (and the fact that $k_\linput,k_\rinput\le k$) while the second inequality follows from the fact that since $\circuit$ is a tree we have $N_\linput+N_\rinput=N-1$ and the fact that $k\ge 0$. This completes the proof.
2021-04-06 16:35:11 -04:00
%Similar to the $\circmult$ case, \cref{eq:sumcoeff-plus-upper} upperbounds its LHS by the fact that the maximum base and exponent combination is always greater than or equal to the sum of lower base/exponent combinations. The final equality is true given the constraint over the inputs.
%Since $\abs{\circuit}(1,\ldots, 1) \leq N^{2^k}$ for all circuits such that all $\circplus$ gates share at most one gate with their sibling (across their respective subcircuits), then $\log{N^{2^k}} = 2^k \cdot \log{N}$ which for fixed $k$ yields the desired $O(\log{N})$ bits for $O(1)$ arithmetic operations.% for the given query class.
\end{proof}
2021-04-06 23:17:19 -04:00
%\revision{\textbf{THE PART BELOW NEEDS WORK. --Atri}}
2021-04-06 16:35:11 -04:00
The upper bound in~\Cref{lem:val-ub} for the general case is a simple variant of the above proof (but we present a proof sketch of the bound below for completeness):
\begin{Lemma}
\label{lem:C-ub-gen}
Let $\circuit$ be a (general) circuit. % tree (i.e. the sub-circuits corresponding to two children of a node in $\circuit$ are completely disjoint).
Then we have
2021-04-06 23:17:19 -04:00
\[\abs{\circuit}(1,\dots,1)\le 2^{2^{\degree(\circuit)}\cdot \size(\circuit)}.\]
2021-04-06 16:35:11 -04:00
\end{Lemma}
\begin{proof}[Proof Sketch]
2021-04-06 23:17:19 -04:00
We use the same notation as in the proof of~\Cref{lem:C-ub-tree}. We will prove by induction on $\depth(\circuit)$ that $\abs{\circuit}(1,\ldots, 1) \leq 2^{2^k\cdot N }$. The base case argument is similar to that in the proof of~\Cref{lem:C-ub-tree}. In the inductive case we have that $N_\linput,N_\rinput\le N-1$.
2021-04-06 16:35:11 -04:00
For the case when the sink node is $\times$, we get that
\begin{align*}
\abs{\circuit}(1,\ldots, 1) &= \abs{\circuit_\linput}(1,\ldots, 1)\circmult \abs{\circuit_\rinput}(1,\ldots, 1) \\
2021-04-06 23:17:19 -04:00
&\leq {2^{2^{k_\linput}\cdot N_\linput}} \circmult {2^{2^{k_\rinput}\cdot N_\rinput}}\\
&\leq 2^{2\cdot 2^{k-1}\cdot (N-1)}\\
&\leq 2^{2^k N}.
2021-04-06 16:35:11 -04:00
\end{align*}
2021-04-06 23:17:19 -04:00
In the above the first inequality follows from the inductive hypothesis while the second inequality follows from the fact that $k_\linput,k_\rinput\le k-1$ and $N_\linput, N_\rinput\le N-1$.
%$k_\linput+k_\rinput=k$ (and hence $\max(k_\linput,k_\rinput)\le k$) as well as the fact that $k\ge 0$.
2021-04-06 16:35:11 -04:00
Now consider the case when the sink node is $+$. We get that
\begin{align*}
\abs{\circuit}(1,\ldots, 1) &= \abs{\circuit_\linput}(1,\ldots, 1) \circplus \abs{\circuit_\rinput}(1,\ldots, 1) \\
2021-04-06 23:17:19 -04:00
&\leq 2^{2^{k_\linput}\cdot N_\linput} + 2^{2^{k_\rinput}\cdot N_\rinput}\\
&\leq 2\cdot {2^{2^k(N-1)} } \\
&\leq 2^{2^kN}.
2021-04-06 16:35:11 -04:00
\end{align*}
2021-04-06 23:17:19 -04:00
In the above the first inequality follows from the inductive hypothesis while the second inequality follows from the facts that $k_\linput,k_\rinput\le k$ and $N_\linput,N_\rinput\le N-1$. The final inequality follows from the fact that $k\ge 0$.
2021-04-06 16:35:11 -04:00
\end{proof}