paper-BagRelationalPDBsAreHard/var_estj.tex

169 lines
11 KiB
TeX

% -*- root: main.tex -*-
\onecolumn
\section{Bounding $\sigsq$}
\label{sec:var_est}
We wish to prove that
\[
\sigsq \leq \sum_j \sigsq_j.
\]
To this end, we substitute in the definition of variance for complex numbers:
\begin{align}
\sigsq &= \ex{\sum_j est_j \cdot \conj{\sum_{j'} est_{j'}}} - \ex{\sum_j est_j}\cdot\ex{\conj{\sum_{j'} est_{j'}}}\nonumber\\
&= \ex{\sum_j est_j \cdot \sum_{j'} \conj{est_{j'}}} - \ex{\sum_j est_j}\cdot\ex{\sum_{j'} \conj{est_{j'}}}\nonumber\\
&= \sum_{j, j'}\left(\ex{est_j \cdot \overline{est_{j'}}} - \ex{est_j}\ex{\overline{est_{j'}}} = \cvar{j, j'}\right)\nonumber\\
&= \sum_j\left(\ex{est_j \cdot \overline{est_j}} - \ex{est_j}\ex{\overline{est_j}}\right) + \sum_{j \neq j'}\cvar{j, j'}\nonumber\\
&= \sum_j \sigsq_j + \sum_{j \neq j'}\cvar{j, j'} \label{eq:sigsq-jneqj}\\
&\Rightarrow \sum_{j \neq j'}\cvar{j, j'}\leq 0. \nonumber
\end{align}
\subsection{Bounding $\sum_{j \neq j'}\cvar{j, j'}$}
\begin{align*}
\sum_{j \neq j'}\cvar{j, j'} &= \sum_{j \neq j'} \ex{\est_j \cdot \conj{\est_{j'}}} - \ex{\est_j}\cdot\ex{\conj{\est_{j'}}}\\
&=\ex{\prod_{i = 1}^{k}\sum_{w \in W}v_i(w)s(w)\ind{h(w) = j}\cdot \prod_{i = 1}^{k}\sum_{w' \in W}v_i(w')\conj{s(w')}\ind{h(w') = j'}} - \ex{\prod_{i = 1}^{k}\sum_{w \in W}v_i(w)s(w)\ind{h(w) = j}}\cdot \ex{\prod_{i = 1}^{k}\sum_{w' \in W}v_i(w')\conj{s(w')}\ind{h(w') = j'}}\\
&=\ex{\sum_{\substack{w_1,\cdots,w_k,\\w'_1,\cdots,w'_k\\\in W}}\prod_{i = 1}^{k}v_i(w_i)s(w_i)v_i(w'_i)\conj{s(w'_i)} \ind{h(w_i) = j} \ind{h(w'_i) = j'}} - \ex{\sum_{\substack{w_1,\cdots, w_k\\\in W}}\prod_{i = 1}^{k}v_i(w_i)s(w_i) \ind{h(w_i) = j}}\cdot\ex{\sum_{\substack{w'_1,\cdots, w'_k\\\in W}}\prod_{i = 1}^{k}v_i(w'_i)\conj{s(w'_i)} \ind{h(w'_i) = j'}}\\
&=\sum_{\substack{w_1,\cdots,w_k,\\w'_1,\cdots,w'_k\\\in W}}\left(\ex{\prod_{i = 1}^{k}v_i(w_i)s(w_i)v_i(w'_i)\conj{s(w'_i)}\ind{h(w_i) = j}\ind{h(w'_i) = j'}} - \ex{\prod_{i = 1}^{k} v_i(w_i)s(w_i) \ind{h(w_i) = j}} \cdot \ex{\prod_{i = 1}^{k}v_i(w'_i)\conj{s(w'_i)}\ind{h(w'_i) = j'}}\right)\\
&= \sum_{\substack{w_1,\cdots,w_k,\\w'_1,\cdots,w'_k\\\in W}}\left(\prod_{i = 1}^{k}v_i(w_i)v_i(w'_i)\ex{\prod_{i = 1}^{k}s(w_i)\conj{s(w'_i)}\ind{h(w_i) = j}\ind{h(w'_i) = j'}} - \prod_{i = 1}^{k}v_i(w_i)\ex{\prod_{i = 1}^{k}s(w_i)\ind{h(w_i) = j}}\cdot \prod_{i = 1}^{k}v_i(w'_i)\ex{\prod_{i = 1}^{k}\conj{s(w'_i)}\ind{h(w_i') = j'}}\right)\\
&= \sum_{\substack{w_1,\cdots,w_k,\\w'_1,\cdots,w'_k\\\in W}}\prod_{i = 1}^{k}v_i(w_i)v_i(w'_i)\left(\ex{\prod_{i = 1}^{k}s(w_i)\conj{s(w'_i)}\ind{h(w_i) = j}\ind{h(w'_i) = j'}} - \ex{\prod_{i = 1}^{k}s(w_i)\ind{h(w_i) = j}}\cdot\ex{\prod_{i = 1}^{k}\conj{s(w'_i)}\ind{h(w_i') = j'}} \right).
\end{align*}
For $T_1 = \ex{\prod_{i = 1}^{k}s(w_i)\conj{s(w'_i)}\ind{h(w_i) = j}\ind{h(w'_i) = j'}}$, because the hash function $h$ cannot map the same world to two different buckets, the only surviving terms occur when there is no overlap between the $w_i$ and $w'_i$ variables. Given the condition of no overlap, the only terms that survive are those with $\forall i \in [k], w_i = w, w'_i = w', w \neq w'$. Notice, however, that in such a case, the product of the remaining expectations will cancel this out. Looking at the remaining two expectations, each can only survive when $\forall i \in [k], w_i = w, w'_i = w'$. Such constraints leave us with only one surviving case, when all variables are the same world. Thus,
\begin{align}
&\sum_{j \neq j'}\cvar{j, j'} = - \frac{B\left(B - 1\right)}{B^2}\sum_{w \in W}\prod_{i = 1}^{k}v_i^2(w)\label{eq:cvar-bound}.
\end{align}
\subsection{Bounding $\sigsq_j$}
We now seek to bound the remaining term in~\eqref{eq:sigsq-jneqj}. We take a look at the variance of a single bucket estimate.
\begin{align*}
\sigsq_j &= \ex{est_j \cdot \overline{est_j}} - \ex{est_j} \cdot \ex{\overline{est_j}} \\
&= \ex{\prod_{i = 1}^{k}\sum_{w \in W_j}v_i(w)s(w) \cdot \prod_{i = 1}^k\sum_{w' \in W_j}v_i(w')\overline{s(w')}} -
\ex{\prod_{i = 1}^{k}\sum_{w \in W_j}v_i(w)s(w)}\cdot \ex{\prod_{i = 1}^k\sum_{w' \in W_j}v_i(w')\overline{s(w')}}\\
&= \ex{\sum_{\substack{w_1...w_k\\w'_1...w'_k\\ \in W}}\prod_{i = 1}^k v_i(w_i)v_i(w'_i)s(w_i)\overline{s(w'_i)}\ind{h(w_i) = j}\ind{h(w'_i) = j}} -
\ex{\sum_{w_1...w_k \in W} \prod_{i = 1}^k v_i(w_i)s(w_i)\ind{h(w_i) = j}} \cdot
\ex{\sum_{w'_1...w'_k \in W} \prod_{i = 1}^k v_i(w'_i)\overline{s(w'_i)}\ind{h(w'_i) = j}}\\
&= \sum_{\substack{w_1...w_k\\w'_1...w'_k\\ \in W}}\ex{\prod_{i = 1}^k v_i(w_i)v_i(w'_i)s(w_i)\overline{s(w'_i)}\ind{h(w_i) = j}\ind{h(w'_i) = j}} -
\ex{\prod_{i = 1}^kv_i(w_i)s(w_i)\ind{h(w_i) = j}} \cdot \ex{\prod_{i = 1}^k v_i(w'_i)\overline{s(w'_i)}\ind{h(w'_i) = j}}\\
&= \sum_{\substack{w_1...w_k\\w'_1...w'_k\\ \in W}}\prod_{i = 1}^k v_i(w_i)v_i(w'_i)\cdot\left( \ex{\prod_{i = 1}^k s(w_i)\overline{s(w'_i)}\ind{h(w_i) = j}\ind{h(w'_i) = j}} -
\ex{\prod_{i = 1}^ks(w_i)\ind{h(w_i) = j}}\cdot \ex{\prod_{i = 1}^k\overline{s(w'_i)}\ind{h(w'_i) = j}} \right).
\end{align*}
\subsection{Non-generic $k$}
\subsubsection{$k = 2$}
Taking $k = 2$ and looking at $T_1 = \ex{\prod\limits_{i = 1}^k s(w_i)\overline{s(w'_i)}\ind{h(w_i) = j}\ind{h(w'_i) = j}}$, it can be seen that only specific combinations of $w$ can survive. First, when $\forall i \in [k], w_i = w, w'_i = w'$, then we end up with $s(w)^k = 1$ and $s(w')^k = 1$. This translates into:
\begin{align*}
\frac{1}{B}\sum_{w \in W}\prod_{i = 1}^{2}v_i^2(w) +\frac{1}{B^2}\sum_{w\neq w' \in W}\prod_{i = 1}^{2}v_i(w)v_i(w').
\end{align*}
Taking into account that for a root of unity $\omega \in \mathbb{C}$, $\omega \cdot \conj{\omega} = 1$, terms in $T_1$ also survive the expectation when all $w_i$ have a matching counterpart in $w'_i$, yielding
\begin{align*}
\frac{1}{B^2}\sum_{\substack{w_1 \neq w_2\\ \in W}}\prod_{i = 1}^{2}v_i^2(w_i) + \frac{1}{B^2}\sum_{w \neq w' \in W}\prod_{i = 1}^{2}v_i(w)v_i(w').
\end{align*}
Putting all cases together we have that
\begin{align*}
T_1 = \frac{1}{B}\sum_{w \in W}\prod_{i = 1}^{2}v_i^2(w) + \frac{1}{B^2}\left(2\sum_{w\neq w' \in W}\prod_{i = 1}^{2}v_i(w)v_i(w') + \sum_{\substack{w_1 \neq w_2\\ \in W}}\prod_{i = 1}^{2}v_i^2(w_i)\right).
\end{align*}
For $T_2 = \ex{\prod_{i = 1}^ks(w_i)\ind{h(w_i) = j}}$ and $T_3 = \ex{\prod_{i = 1}^k\overline{s(w'_i)}\ind{h(w'_i) = j}}$, we get
\begin{align*}
&T_2 = \frac{1}{B}\sum_{w \in W}\prod_{i = 1}^{2}v_i(w),\\
&T_3 = \frac{1}{B}\sum_{w' \in W}\prod_{i = 1}^{2}v_i(w'),\\
&T_2 \cdot T_3 = \frac{1}{B^2}\sum_{w, w' \in W}\prod_{i = 1}^{2}v_i(w)v_i(w') = \frac{1}{B^2}\left(\sum_{w \in W}\prod_{i = 1}^{2}v_i^2(w) + \sum_{w \neq w' \in W}\prod_{i = 1}^{2}v_i(w)v_i(w')\right).
\end{align*}
Combining all $T_i$,
\begin{align*}
\sigsq_j = T_1 - T_2 \cdot T_3 = \frac{B - 1}{B^2}\sum_{w \in W}\prod_{i = 1}^{2}v_i^2(w) + \frac{1}{B^2}\sum_{\substack{w_1\neq w_2 \\ \in W}}\left(\prod_{i = 1}^{2}v_i(w_1)v_i(w_2) + \prod_{i = 1}^{2}v_i^2(w_i)\right).%+ \sum_{w}\prod_{i = 1}^{2}v_i(w)^2\right)
\end{align*}
Recall from~\eqref{eq:cvar-bound} that, summing over the $B(B - 1)$ ordered pairs $j \neq j'$, $\sum\limits_{j \neq j'}\cvar{j, j'} = -\frac{B\left(B - 1\right)}{B^2}\sum\limits_{w \in W}\prod_{i = 1}^{2}v_i^2(w)$. Thus, for $k = 2$ we can compute~\eqref{eq:sigsq-jneqj}
\begin{align*}
\sigsq &= \sum_{j \in [B]}\sigsq_j + \sum_{j \neq j'}\cvar{j, j'}\\
&=B \cdot \left(\frac{B - 1}{B^2}\sum_{w \in W}\prod_{i = 1}^{2}v_i^2(w) + \frac{1}{B^2}\sum_{\substack{w_1\neq w_2 \\ \in W}}\left(\prod_{i = 1}^{2}v_i(w_1)v_i(w_2) + \prod_{i = 1}^{2}v_i^2(w_i)\right)
\right) - \frac{B\left(B - 1\right)}{B^2}\sum_{w \in W}\prod_{i = 1}^{2}v_i^2(w)\\
&= \frac{1}{B}\sum_{\substack{w_1\neq w_2 \\ \in W}}\left(\prod_{i = 1}^{2}v_i(w_1)v_i(w_2) + \prod_{i = 1}^{2}v_i^2(w_i)\right)\\
&\leq \frac{1}{B}\left(\left(\sum_{w \in W}v_1^2(w)\right)\left(\sum_{w \in W}v_2^2(w)\right) + \left(\sum_{w \in W}v_1(w)v_2(w)\right)^2\right)\\
&= \frac{1}{B}\left(\norm{v_1}_2^2\norm{v_2}_2^2 + \left(\sum_{w \in W}v_1(w)v_2(w)\right)^2\right)\\
&\leq \frac{1}{B}\left(\norm{v_1}_2^2\norm{v_2}_2^2 + \norm{v_1}_2^2\norm{v_2}_2^2\right)\\
&= \frac{2}{B}\left(\norm{v_1}_2^2\norm{v_2}_2^2\right).
\end{align*}
\subsubsection{$k = 3$}
\begin{align*}
\sigsq_j &= \sum_{\substack{w_1...w_3\\w'_1...w'_3\\ \in W}}\prod_{i = 1}^3 v_i(w_i)v_i(w'_i)\cdot\left( \ex{\prod_{i = 1}^3 s(w_i)\overline{s(w'_i)}\ind{h(w_i) = j}\ind{h(w'_i) = j}} -
\ex{\prod_{i = 1}^3s(w_i)\ind{h(w_i) = j}}\cdot \ex{\prod_{i = 1}^3\overline{s(w'_i)}\ind{h(w'_i) = j}} \right)
\end{align*}
In the above expression, we seek to know which combinations of $w_i$ and $w'_i$ variables will survive the expectation calculations. We can divide the possibilities up into several different cases.
First, for roots of unity, we have that $\omega^k = 1$ if $\omega$ is a $k$th root of unity. This gives our first case.
\underline{Case 1:}
\begin{align*}
&w_1 = w_2 = w_3 = w\\
&w'_1 = w'_2 = w'_3 = w'\\
&1.1)~ w = w'\qquad1.2)~ w \neq w'
\end{align*}
The remaining cases take into account the property for roots of unity that $\omega \cdot \conj{\omega} = 1$. Note that we omit the case of all variables being equal because that has already been covered above.
\underline{Case 2:}
\begin{align*}
&w_1 = w'_1 = w\\
&w_2 = w'_2 = w'\\
&w_3 = w'_3 = w''\\
&2.1)~ w = w' \neq w''\qquad2.2)~ w \neq w'= w''\qquad2.3)w = w'' \neq w'\qquad2.4) w \neq w' \neq w''
\end{align*}
\underline{Case 3:}
\begin{align*}
&w_1 = w'_2 = w\\
&w_2 = w'_3 = w'\\
&w_3 = w'_1 = w''\\
&3.1)~ w = w' \neq w''\qquad3.2)~ w \neq w'= w''\qquad3.3)w = w'' \neq w'\qquad3.4) w \neq w' \neq w''
\end{align*}
\underline{Case 4:}
\begin{align*}
&w_1 = w'_3 = w\\
&w_2 = w'_1 = w'\\
&w_3 = w'_2 = w''\\
&4.1)~ w = w' \neq w''\qquad4.2)~ w \neq w'= w''\qquad4.3)w = w'' \neq w'\qquad4.4) w \neq w' \neq w''
\end{align*}
\underline{Case 5:}
\begin{align*}
&w_1 = w'_2 = w\\
&w_2 = w'_1 = w'\\
&w_3 = w'_3 = w''\\
&5.1)~ w = w' \neq w''\qquad5.2)~ w \neq w'= w''\qquad5.3)w = w'' \neq w'\qquad5.4) w \neq w' \neq w''
\end{align*}
\underline{Case 6:}
\begin{align*}
&w_1 = w'_1 = w\\
&w_2 = w'_3 = w'\\
&w_3 = w'_2 = w''\\
&6.1)~ w = w' \neq w''\qquad6.2)~ w \neq w'= w''\qquad6.3)w = w'' \neq w'\qquad6.4) w \neq w' \neq w''
\end{align*}
\underline{Case 7:}
\begin{align*}
&w_1 = w'_3 = w\\
&w_2 = w'_2 = w'\\
&w_3 = w'_1 = w''\\
&7.1)~ w = w' \neq w''\qquad7.2)~ w \neq w'= w''\qquad7.3)w = w'' \neq w'\qquad7.4) w \neq w' \neq w''
\end{align*}
The surviving terms are:
\begin{align*}
&\text{Case 1:}\\
&\frac{B - 1}{B^2}\left(\sum_w v_1^2(w) v_2^2(w) v_3^2(w)\right) + \\
&\text{Case 2:}\\
&\frac{1}{B^2}\left(\sum_{w \neq w'}v_1^2(w)\left(v_2^2(w)v_3^2(w') + v_2^2(w')v_3^2(w') + v_2^2(w')v_3^2(w)\right)\right) + \frac{1}{B^3}\sum_{w \neq w' \neq w''}v_1^2(w)v_2^2(w')v_3^2(w'') +\\
&\text{Case 3 and 4:}\\
&\frac{2}{B^2}\left(\sum_{w \neq w'}v_1(w)v_1(w')v_2^2(w)v_3(w)v_3(w') + v_1(w)v_1(w')v_2(w)v_2(w')v_3^2(w') + v_1^2(w)v_2(w)v_2(w')v_3(w)v_3(w')\right) + \\
&\qquad\qquad \frac{1}{B^3}\left(\sum_{w \neq w' \neq w''}v_1(w)v_1(w'')v_2(w')v_2(w)v_3(w'')v_3(w') + v_1(w)v_1(w')v_2(w')v_2(w'')v_3(w)v_3(w'')\right)\\
&\text{Case 5, 6, 7:}\\
&\frac{1}{B^3}\left(\sum_{w \neq w' \neq w''}v_1(w)v_1(w')v_2(w)v_2(w')v_3^2(w'') + v_1^2(w)v_2(w')v_2(w'')v_3(w')v_3(w'') + v_1(w)v_1(w'')v_2^2(w')v_3(w)v_3(w'')\right)
\end{align*}