% paper-BagRelationalPDBsAreHard/est_bounds.tex

% -*- root: main.tex -*-
\section{Bounding the Estimates}
\newcommand{\bMu}{\epsilon\mu_{\sketchCols_{sum}}}
\newcommand{\bBnd}{\sketchCols_{sketch}}
\newcommand{\mBnd}{\sketchRows_{sketch}}
\newcommand{\sBnd}{m_{sketch}}
For a $\sketchCols$ estimate, denoted $\sketchCols_{est}$, and given the following:
\begin{align*}
&\bMu \text{ is } \epsilon \text{ times } \mu_{\sketchCols_{sum}} \text{, the expectation for the sum of estimates.}\\
&X = \sum_{i = 1}^{\sketchRows}X_i \\
&X_i\text{ is an i.i.d. r.v.} \in [0, 1], i \in \sketchRows \\
&X_i = \begin{cases}
0 &\sketchCols_{est} > \bMu\\
1 &\sketchCols_{est} \leq \bMu
\end{cases}\\
&p[X_i = 1] \geq \frac{2}{3}\\
&p[X_i = 0] \leq \frac{1}{3}\\
&\mu = \frac{2}{3}\sketchRows\\
&\epsilon = 0.5
\end{align*}
Because the Chebyshev bound tells us that the probability of a bad row estimate is $\leq \frac{1}{3}$, we set $\epsilon$ to the value that, when multiplied by $\mu$, yields $\frac{1}{3}$. We then derive bounds for $\sketchRows$.
Note, because we are only concerned with the left side of the tail, we can use the generic Chernoff bound for the left tail,
\begin{equation*}
Pr[X \leq (1 - \epsilon)\mu] \leq e^{-\frac{\epsilon^2}{2 + \epsilon}\mu}.
\end{equation*}
Requiring that this failure probability be at most $\delta$ and solving for $\sketchRows$,
\begin{align*}
\delta \geq e^{-\frac{\left(\frac{1}{2}\right)^2}{2 + \frac{1}{2}}\cdot\frac{2}{3}\sketchRows}\\
\delta \geq e^{-\frac{1}{15}\sketchRows}\\
e^{\frac{1}{15}\sketchRows} \geq \frac{1}{\delta}\\
\sketchRows \geq 15\ln\left(\frac{1}{\delta}\right)
\end{align*}
We are now ready to combine the bounds we have derived for both $\sketchCols$ and $\sketchRows$ to which we will refer to as $\bBnd$ and $\mBnd$ respectively.
\begin{align*}
&\mBnd \cdot \bBnd \\
= & 15\ln\left(\frac{1}{\delta}\right) \cdot \frac{3\left(\norm{\genV}_2^2\left(|\pw|\right) + \norm{\genV}_1^2\right)}{\epsilon^2 p^2}\\
= & \frac{45\left(\norm{\genV}_2^2\left(|\pw|\right) + \norm{\genV}_1^2\right)}{\epsilon^2 p^2}\ln\left(\frac{1}{\delta}\right)
\end{align*}
Sampling bounds, $\sBnd$, are obtained via Chernoff Bounds. Given,
\begin{align*}
&X = \sum_{i = 1}^{m}X_i\\
&X_i \text{ is an i.i.d. r.v.} \in [0, 1] \\
&p = \frac{\norm{\genV}_1}{|W|}\\
&\bar{X} = \frac{X}{m}\\
&P[|\bar{X} - p| \geq \epsilon p] \leq 2e^{-\frac{\epsilon^2}{2 + \epsilon}pm} \rightarrow\\
&\delta \geq 2e^{-\frac{\epsilon^2}{2 + \epsilon}pm} \\
&e^{\frac{\epsilon^2}{2 + \epsilon}pm} \geq \frac{2}{\delta} \\
&\frac{\epsilon^2}{2 + \epsilon}pm \geq \ln\left(\frac{2}{\delta}\right)\\
&m \geq \frac{2 + \epsilon}{\epsilon^2 p}\ln\left(\frac{2}{\delta}\right)
\end{align*}
We are particularly interested in the case when the former is a lower bound on the latter. We want to know when the following holds.
\begin{equation*}
\frac{2 + \epsilon}{\epsilon^2 p}\ln\left(\frac{2}{\delta}\right) > \frac{45\left(\norm{\genV}_2^2\left(|\pw|\right) + \norm{\genV}_1^2\right)}{\epsilon^2 p^2}\ln\left(\frac{1}{\delta}\right)
\end{equation*}