paper-BagRelationalPDBsAreHard/sop.tex

275 lines
32 KiB
TeX

%root--main.tex
\section{Sum of Products Analysis}
There are several steps involved in obtaining bounds on the Sum of Products (SOP) query.
We need to compute the variance of the $\prodsize$-way product $\est$. We wish to prove that
\begin{equation}
\sigsq \leq \sum_j \sigsq_j \label{eq:var-to-prove}.
\end{equation}
Therefore, substituting in the definition of variance for complex numbers,
\begin{align}
\sigsq &= \ex{\sum_j \est_j \cdot \conj{\sum_{j'} \est_{j'}}} - \ex{\sum_j \est_j}\cdot\ex{\conj{\sum_{j'} \est_{j'}}}\nonumber\\
&= \ex{\sum_j \est_j \cdot \sum_{j'} \conj{\est_{j'}}} - \ex{\sum_j \est_j}\cdot\ex{\sum_{j'} \conj{\est_{j'}}}\nonumber\\
&= \sum_{j, j'}\left(\ex{\est_j \cdot \overline{\est_{j'}}} - \ex{\est_j}\ex{\overline{\est_{j'}}} = \cvar{j, j'}\right)\nonumber\\
&= \sum_j\left(\ex{\est_j \cdot \overline{\est_j}} - \ex{\est_j}\ex{\overline{\est_j}}\right) + \sum_{j \neq j'}\cvar{j, j'}\nonumber\\
&= \sum_j \sigsq_j + \sum_{j \neq j'}\cvar{j, j'} \label{eq:sigsq-jneqj}
\end{align}
Notice that assuming independence of $\sigsq_j ~\forall j \in \sketchCols$, we can push the variance through the sum and obtain the result
\begin{align*}
&\sigsq - \sum_j \sigsq_j = \sum_{j \neq j'}\cvar{j, j'}\\
&\implies \sum_{j \neq j'}\cvar{j, j'} \leq 0.
\end{align*}
Recall that we started this section out by seeking to prove \cref{eq:var-to-prove}. Should this be true, the use of $\leq$ in the above implication results from the fact that, by \cref{eq:sigsq-jneqj}, $\sigsq \leq \sum_j \sigsq_j \implies \sum_{j \neq j'}\cvar{j, j'} \leq 0$.
\AH{I'm really not so sure about the above results. This was from a conversation we had months ago, but we're basing an implication on something we haven't proved. That doesn't seem right to me.}
One can see that \cref{eq:sigsq-jneqj} is composed of two addends. We now bound each of them separately.
\subsection{Bounding $\sum_{j \neq j'}\cvar{j, j'}$}
\begin{align*}
\sum_{j \neq j'}\cvar{j, j'} &= \sum_{j \neq j'} \ex{\est_j \cdot \conj{\est_{j'}}} - \ex{\est_j}\cdot\ex{\conj{\est_{j'}}}\\
&=\ex{\prod_{i = 1}^{\prodsize}\sum_{\wElem \in W}v_i(\wElem)s(\wElem)\ind{h(\wElem) = j}\cdot \prod_{i = 1}^{\prodsize}\sum_{\wElem' \in W}v_i(\wElem')\conj{s(\wElem')}\ind{h(\wElem') = j'}} - \ex{\prod_{i = 1}^{\prodsize}\sum_{\wElem \in W}v_i(\wElem)s(\wElem)\ind{h(\wElem) = j}}\cdot \ex{\prod_{i = 1}^{\prodsize}\sum_{\wElem' \in W}v_i(\wElem')\conj{s(\wElem')}\ind{h(\wElem') = j'}}\\
&=\ex{\sum_{\substack{\wElem_1,\cdots,\wElem_\prodsize,\\\wElem'_1,\cdots,\wElem'_\prodsize\\\in W}}\prod_{i = 1}^{\prodsize}v_i(\wElem_i)s(\wElem_i)v_i(\wElem'_i)\conj{s(\wElem'_i)} \ind{h(\wElem_i) = j} \ind{h(\wElem'_i) = j'}} - \ex{\sum_{\substack{\wElem_1,\cdots, \wElem_\prodsize\\\in W}}\prod_{i = 1}^{\prodsize}v_i(\wElem_i)s(\wElem_i) \ind{h(\wElem_i) = j}}\cdot\ex{\sum_{\substack{\wElem'_1,\cdots, \wElem'_\prodsize\\\in W}}\prod_{i = 1}^{\prodsize}v_i(\wElem'_i)\conj{s(\wElem'_i)} \ind{h(\wElem'_i) = j'}}\\
&=\sum_{\substack{\wElem_1,\cdots,\wElem_\prodsize,\\\wElem'_1,\cdots,\wElem'_\prodsize\\\in W}}\ex{\prod_{i = 1}^{\prodsize}v_i(\wElem_i)s(\wElem_i)v_i(\wElem'_i)\conj{s(\wElem'_i)}\ind{h(\wElem_i) = j}\ind{h(\wElem'_i) = j'}} - \ex{\prod_{i = 1}^{\prodsize} v_i(\wElem_i)s(\wElem_i) \ind{h(\wElem_i) = j}} \cdot \ex{\prod_{i = 1}^{\prodsize}v_i(\wElem'_i)\conj{s(\wElem'_i)}\ind{h(\wElem'_i) = j'}}\\
&= \sum_{\substack{\wElem_1,\cdots,\wElem_\prodsize,\\\wElem'_1,\cdots,\wElem'_\prodsize\\\in W}}\prod_{i = 1}^{\prodsize}v_i(\wElem_i)v_i(\wElem'_i)\ex{\prod_{i = 1}^{\prodsize}s(\wElem_i)\conj{s(\wElem'_i)}\ind{h(\wElem_i) = j}\ind{h(\wElem'_i) = j'}} - \prod_{i = 1}^{\prodsize}v_i(\wElem_i)\ex{\prod_{i = 1}^{\prodsize}s(\wElem_i)\ind{h(\wElem_i) = j}}\cdot \prod_{i = 1}^{\prodsize}v_i(\wElem'_i)\ex{\prod_{i = 1}^{\prodsize}\conj{s(\wElem'_i)}\ind{h(\wElem_i') = j'}}\\
&= \sum_{\substack{\wElem_1,\cdots,\wElem_\prodsize,\\\wElem'_1,\cdots,\wElem'_\prodsize\\\in W}}\prod_{i = 1}^{\prodsize}v_i(\wElem_i)v_i(\wElem'_i)\left(\ex{\prod_{i = 1}^{\prodsize}s(\wElem_i)\conj{s(\wElem'_i)}\ind{h(\wElem_i) = j}\ind{h(\wElem'_i) = j'}} - \ex{\prod_{i = 1}^{\prodsize}s(\wElem_i)\ind{h(\wElem_i) = j}}\cdot\ex{\prod_{i = 1}^{\prodsize}\conj{s(\wElem'_i)}\ind{h(\wElem_i') = j'}} \right).
\end{align*}
\AH{Perhaps a formal proof is necessary below.}
For $\term_1^{\cvar{j, j'}} = \ex{\prod_{i = 1}^{\prodsize}s(\wElem_i)s(\wElem'_i)\ind{h(\wElem_i) = j}\ind{h(\wElem'_i) = j'}}$, because hash function $h$ cannot bucket the same world to two different buckets, the only surviving terms occur when there is no overlap between the $\wElem_i$ and $\wElem'_i$ variables. Given the condition of no overlap, the only terms that survive are when $\forall i \in [\prodsize], \wElem_i = \wElem, \wElem'_i = \wElem', \wElem \neq \wElem'$. Notice, however, that in such a case, the product of the remaining expectations will cancel this out. Looking at the remaining two expectations, each can only survive when $\forall i \in [\prodsize], \wElem_i = \wElem, \wElem'_i = \wElem'$. Such constraints leave us with only one surviving case, when all variables are the same world. Thus,
\begin{align}
&\sum_{j \neq j'}\cvar{j, j'} = - \frac{1}{B^2}\sum_{\wElem \in W}\prod_{i = 1}^{\prodsize}v_i^2(\wElem)\label{eq:cvar-bound}.
\end{align}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
We now move on to bound the variance of a $\prodsize$-way join.
\begin{align}
\sigsq_j &= \ex{\est_j \cdot \overline{\est_j}} - \ex{\est_j} \cdot \ex{\overline{\est_j}} \nonumber\\
&= \ex{\prod_{i = 1}^{\prodsize}\sum_{w \in W_j}v_i(w)s(w) \cdot \prod_{i = 1}^\prodsize\sum_{w' \in W_j}v_i(w')\overline{s(w')}} -
\ex{\prod_{i = 1}^{\prodsize}\sum_{w \in W_j}v_i(w)s(w)}\cdot \ex{\prod_{i = 1}^\prodsize\sum_{w' \in W_j}v_i(w')\overline{s(w')}}\nonumber\\
&= \ex{\sum_{\substack{w_1...w_\prodsize\\w'_1...w'_\prodsize\\ \in W}}\prod_{i = 1}^\prodsize v_i(w_i)v_i(w'_i)s(w_i)\overline{s(w'_i)}\ind{h(w_i) = j}\ind{h(w'_i) = j}} -
\ex{\sum_{w_1...w_\prodsize \in W} \prod_{i = 1}^\prodsize v_i(w_i)s(w_i)\ind{h(w_i) = j}} \cdot
\ex{\sum_{w'_1...w'_\prodsize \in W} \prod_{i = 1}^\prodsize v_i(w'_i)\overline{s(w'_i)}\ind{h(w'_i) = j}}\nonumber\\
&= \sum_{\substack{w_1...w_\prodsize\\w'_1...w'_\prodsize\\ \in W}}\left(\ex{\prod_{i = 1}^\prodsize v_i(w_i)v_i(w'_i)s(w_i)\overline{s(w'_i)}\ind{h(w_i) = j}\ind{h(w'_i) = j}} -
\ex{\prod_{i = 1}^\prodsize v_i(w_i)s(w_i)\ind{h(w_i) = j}} \cdot \ex{\prod_{i = 1}^\prodsize v_i(w'_i)\overline{s(w'_i)}\ind{h(w'_i) = j}}\right)\nonumber\\
&= \sum_{\substack{w_1...w_\prodsize\\w'_1...w'_\prodsize\\ \in W}}\prod_{i = 1}^\prodsize v_i(w_i)v_i(w'_i)\cdot\left( \ex{\prod_{i = 1}^\prodsize s(w_i)\overline{s(w'_i)}\ind{h(w_i) = j}\ind{h(w'_i) = j}} -
\ex{\prod_{i = 1}^\prodsize s(w_i)\ind{h(w_i) = j}}\cdot \ex{\prod_{i = 1}^\prodsize\overline{s(w'_i)}\ind{h(w'_i) = j}} \right)\label{eq:sig-j-last}.
\end{align}
Before proceeding, we introduce some notation and terminology that will aid in communicating the bounds we are about to establish. We refer to the leftmost expectation of \cref{eq:sig-j-last} in the following way:
\[\term_1\left(\wElem_1,\ldots,\wElem_\prodsize, \wElem_1',\ldots, \wElem_\prodsize'\right) = \ex{\prod_{i = 1}^\prodsize s(w_i)\overline{s(w'_i)}\ind{h(w_i) = j}\ind{h(w'_i) = j}}.%\text{, and}
\]
%\[\term_2\left(\wElem_1,\ldots,\wElem_\prodsize, \wElem_1',\ldots, \wElem_\prodsize'\right) = \ex{\prod_{i = 1}^ks(w_i)\ind{h(w_i) = j}}\cdot \ex{\prod_{i = 1}^\prodsize\overline{s(w'_i)}\ind{h(w'_i) = j}}. \]
We use the word ``term'' to denote $\prod_{i = 1}^{\prodsize}\vect_i(\wElem_i)\vect_i(\wElem_i') \cdot\term_1\left(\wElem_1,\ldots,\wElem_\prodsize, \wElem_1',\ldots,\wElem_\prodsize'\right)$ for a specific assignment of world values. %To say that a term survives \AR{You should not care about whether the $T_1$ term survives or not. See the above comment on why.} the expectation is to mean that $\term_1 - \term_2 \neq 0$. Note, that the only terms that survive the expectation above are mappings of $w_i = w'_j = w$ for $i, j \in [\prodsize]$, such that each $w_i$ has a match, i.e., no $w_i$ or $w'_j$ stands alone without a matching world in its complimentary set. In other words, the set of values in $\wElem_1,\ldots,\wElem_k$ has a bijective mapping to the set of values in $\wElem'_1,\ldots,\wElem'_k$.
We next describe the nonzero terms of \cref{eq:sig-j-last}.
%\subsection{M-tuples}
%\begin{Definition}
%Given a $\prodsize$-way join, define $\dist \in [\prodsize]$. An \dist-tuple then is a set of tuples, each tuple conatining $\dist$ elements, such that the values of each tuple sum up to $\dist$, i.e. $\forall i \in [\dist], \sum_j \dist_{t_{i, j}} = \dist$, where i is the $i^{th}$ tuple in $\dist_t$, and $j$ is the $j^{th}$ index of that tuple $t$. The set consists of each unique sum up to symmetry, meaning a tuple with the same elements only reversed is disallowed.
%\end{Definition}
%For example, when $\prodsize = 4$, $\dist = 2$, the \dist-tuple, denoted, $\dist_2$, would be$\left\{\left(1, 3\right), \left(2, 2\right)\right\}$. Here, $\dist_{2_{1, 1}} = 1$, and while the tuple $\left(3, 1\right)$ sums up to $\prodsize = 4$, we do not include it since we have it's symmetrical term $\left(1, 3\right)$.
%
%\AR{Why is the definition of M-tuples needed? From what I understand you need this to define what kinds of $f$ and $f'$ are allowed but in that case why not state those properties directly in terms of $f$ and $f'$? Actually after reading the next section, I do not see why these properties are needed at all..}
%\AH{I use the \dist-tuples to explain 1) what kind of matchings survive and 2) that $f, f'$ must only cross product from within the matchings of the same tuple. Maybe there is an easier way to do this.}
\subsection{The mappings $f$ and $f'$}
\begin{Definition}
Define and then fix a total ordering of the $\dist$ distinct world elements that follows the total order of the natural numbers in $[\dist]$, such that $\forall i, j \in [\dist]$, $i < j \implies \widetilde{\wElem_i} \prec \widetilde{\wElem_j}$, i.e., $\widetilde{\wElem_1} \prec\ldots\prec\widetilde{\wElem_\dist}$.
%Given a fixed order $\wSet_{\order}: \left(\wSet, \wSet\right)\mapsto \mathbb{B}$ of possible worlds, define the lexographical order of distinct worlds $\wSet_\dist$ to be the ordering which complies to the identity mapping of elements in $[\prodsize]$ to elements in $[\dist]$ up to $\dist$, such that . In other worlds, $\forall \wElem, \wElem' \in \wSet_\dist, \widetilde{\wElem} < \wElem' \leftrightarrow \wSet_{\order}\left(\wElem, \wElem'\right) = T$.
\end{Definition}
To help describe all possible world value matchings we introduce functions $f$ and $f'$.
\begin{Definition}
The functions $f$ and $f'$ are surjective mappings from $[\prodsize]$ onto $[\dist]$ and $[\dist']$, respectively: $f\colon [\prodsize] \rightarrow [\dist]$, $f'\colon [\prodsize] \rightarrow [\dist']$.
\end{Definition}
%\begin{equation*}
%f(i) = \begin{cases}
% \dMap{w_1} &f(i) = 1\\
% \dMap{w_2} &f(i) = 2\\
% \vdots &\vdots\\
% \dMap{w_\dist} &f(i) = \dist.
% \end{cases}
%\end{equation*}
The functions $f, f'$ are used to produce the mappings $w_i \mapsto \dMap{w_{f(i)}}$. In particular, $f$ and $f'$ are machinery for mapping $\prodsize$ $\wElem$-world variables to $\dist$ distinct values.
We rewrite equation \eqref{eq:sig-j-last} in terms of $\dist$ distinct worlds, with $f, f'$ mappings.
\begin{equation}
\sum_{\dist \in [\prodsize]}\sum_{\dist' \in [\prodsize]}\sum_{f, f'}\sum_{\substack{\dMap{\wElem_1}, \ldots,\dMap{\wElem_\dist},\\\dMap{\wElem'_1},\ldots,\dMap{\wElem'_{\dist'}}\\ \in W}}\prod_{i = 1}^{\prodsize}\vect_i(\dMap{\wElem_{f(i)}})\vect_i(\dMap{\wElem'_{f'(i)}})\cdot \term_1\left(\dMap{\wElem_{f(1)}},\ldots,\dMap{\wElem_{f(\prodsize)}}, \dMap{\wElem'_{f'(1)}},\ldots,\dMap{\wElem'_{f'(\prodsize)}}\right)%\left( \ex{\prod_{i = 1}^\prodsize \sine(\dMap{\wElem_{f(i)}}\conj{\sine(\dMap{\wElem'_{f'(i)}})}\ind{h(\dMap{\wElem_{f(i)}}) = j}\ind{h(\dMap{w'_{f'(i)}}) = j}} -
%\ex{\prod_{i = 1}^\prodsize \sine(\dMap{\wElem_{f(i)}})\ind{h(\dMap{\wElem_{f(i)}}) = j}}\cdot \ex{\prod_{i = 1}^\prodsize\conj{\sine(\dMap{\wElem'_{f'(i)}})}\ind{h(\dMap{w'_{f'(i)}}) = j}} \right)
\label{eq:sig-j-distinct}
\end{equation}
Observe that the Cartesian product of world values assigned to $\wElem_1,\ldots,\wElem_\prodsize$ throughout the summation can be partitioned according to the number $\dist \in [\prodsize]$ of distinct values that the variables take. For each $\dist \in [\prodsize]$, all possible combinations of $\dist$ distinct world values can be equivalently modeled by taking the set of surjective functions $f\colon[\prodsize]\to [\dist]$ and computing all world value combinations based on the total ordering $\widetilde{\wElem_{1}}\prec\cdots\prec\widetilde{\wElem_{\dist}}$. For any $\dist$, the surjective mappings $f$ enumerate every assignment of the $\prodsize$ variables to the $\dist$ distinct values, including symmetric counterparts that differ only by a relabeling of the distinct values. Combining that with the total order over $\widetilde{\wElem_{1}},\ldots,\widetilde{\wElem_{\dist}}$ yields exactly the world value combinations containing $\dist$ distinct values which appear in the Cartesian product of the sum, without double counting. In short, \cref{eq:sig-j-distinct} is a rearrangement of the addends in the sum.
%The fact that \cref{eq:sig-j-last} $\equiv$ \cref{eq:sig-j-distinct} follows since \cref{eq:sig-j-distinct} is simply a rearrangement of the addends in the sum.
%The reason \cref{eq:sig-j-last} $\equiv$ \cref{eq:sig-j-distinct} is because the only surviving terms in $\term_1 - \term_2$ are bijective mappings of $\dist < \prodsize$ distinct pairs between $\wElem_1\ldots\wElem_\prodsize$ and $\wElem_1'\ldots\wElem_\prodsize'$. Another way of saying this is that the only surviving terms of $\term_1 - \term_2$ are those for which we have $\dist$ distinct world values such that the same cardianlity of variables in $\wElem_1\ldots\wElem_\prodsize$ that are mapped to distinct world $\wElem _i$ $\left(\forall i \in [\dist]\right)$ is the same as the cardinality of variables mapped from $\wElem_1'\ldots\wElem_\prodsize'$.\newline
%Note that for a given $\dist$, we may have several ways to map $\prodsize$ worlds to $\dist$ distinct values. We need to define what if means for $f$ and $f'$ to be matching.
\begin{Definition}
Functions $f\colon[\prodsize]\to [\dist]$ and $f'\colon[\prodsize]\to [\dist']$ are said to be matching, denoted $\match{f}{f'}$, if and only if
\begin{enumerate}
\item $\dist = \dist'$
\item $\{|f^{-1}(i)| ~|~ \forall i \in [\dist]\} = \{|f'^{-1}(i')| ~|~ \forall i' \in [\dist] \}$, i.e., the set of preimage cardinalities for $f$ equals the set of preimage cardinalities for $f'$.
% \item $\forall i \in [\dist], |f^{-1}(i)| = |f'^{-1}(i)|$, or a symmetrical mapping exists, where $\forall i \in [\dist], \exists i' \in [\dist]$ such that $i'$ is unique, $|f^{-1}(i)| = |f^{-1}(i')|$.
\end{enumerate}
\end{Definition}
%\begin{Definition}
%For every $i, j \in [\dist]~|~ i < j$, the numerical value of the concatenation of the numerically ordered elements of $f^{-1}(i)$ < the numerical value of the concatenation of the numerically ordered elements of $f^{-1}(j)$, where $<$ is the order of the natural numbers.
%\end{Definition}
%
%We illustrate with an example. Consider a join of $k = 3$ tuples, where $\dist = 2$, and we have that $f^{-1}(1) = 1$ and $f^{-1}(2) = 2$. Imposing the above ordering yields the following set of unique functions:
%\begin{align*}
%f_1 = \begin{cases}
% 1 \mapsto 1 &\implies\wElem_1 \mapsto \dMap{\wElem_1}\\
% 2, 3 \mapsto 2 &\implies\wElem_2, \wElem_3 \mapsto \dMap{\wElem_2}
% \end{cases}\\
%f_2 = \begin{cases}
% 2 \mapsto 1 &\implies\wElem_2 \mapsto \dMap{\wElem_1}\\
% 1, 3 \mapsto 2 &\implies\wElem_1, \wElem_3 \mapsto \dMap{\wElem_2}
% \end{cases}\\
%f_3 = \begin{cases}
% 3 \mapsto1 &\implies\wElem_3 \mapsto \dMap{\wElem_1}\\
% 1, 2 \mapsto 2 &\implies\wElem_1, \wElem_2 \mapsto \dMap{\wElem_2}
% \end{cases}
%\end{align*}
%The above mappings satisfy the ordering constraint so that for $f_1$, $1 < 23$, for $f_2$, $2 < 13$, and for $f_3$, $3 < 12$.
%Note that above orderings share no symmetry, while the symmetrical versions of the above, which are the orderings for the case when $f^{-1}(1) = 2$ and $f^{-1}(2) = 1$, break our above ordering requirements, and are therefore disallowed, thus avoiding double counting. Another way of saying this is that the preimage sizes follow the natural order of their respective counterparts in the image. For the case when the two are equal, we need a more defined order, and can distinguish using the same idea as first described.
\begin{Lemma}\label{lem:sig-j-survive}
When $f, f'$ are matching, where $\forall j \in[\dist], \dMap{\wElem_j} = \dMap{\wElem'_j}$, \cref{eq:sig-j-distinct} is exactly
\[
\sum_{\substack{\dMap{\wElem_1}, \ldots,\dMap{\wElem_\dist},\\\dMap{\wElem'_1},\ldots,\dMap{\wElem'_{\dist'}}\\ \in W}}\prod_{i = 1}^{\prodsize}\vect_i(\dMap{\wElem_{f(i)}})\vect_i(\dMap{\wElem'_{f'(i)}})
\]
and $0$ otherwise.
\end{Lemma}
In proving \cref{lem:sig-j-survive}, we introduce another fact.
\begin{Lemma}\label{lem:exp-prod-rand-roots}
Given a $\prodsize^{th}$ root of unity $\rou$, the expectation of the product of $\rou^i \cdot \rou^j$ for $i, j \in [\prodsize - 1]$ is zero.
\end{Lemma}
\AR{The lemma should be stated and proved for both the case of $w=w'$ and $w\ne w'$.}
\AR{You should be using \texttt{proof} environment to put your proofs in: i.e. \texttt{\textbackslash begin\{proof\} Blah \textbackslash end\{proof\}}.}
\begin{proof}
We start with the case when $\wElem \neq \wElem'$.
\begin{align*}
&\ex{\sine(\wElem)^i \conj{\sine(\wElem')}^j}\\
= &\ex{\sine(\wElem)^i}\ex{\conj{\sine(\wElem')}^j}\\
= &0
\end{align*}
\end{proof}
In the above, since we have more than pairwise independence for $\wElem \neq \wElem'$, we can push the expectation into the product. Then by \cref{lem:exp-sine} we get 0 for both expectations.\newline
\AR{Proof environment for Lemma\cref{lem:sig-j-survive} should start here.}
\AR{Um, the proof below is bit of a mess. This needs to be re-written. Below are some suggestions.}
\AR{First some typos/things that are incorrect below-- note this is \textbf{not} an exhaustive list. (1) In the proof below the $w_i$ and $w'_i$ should be $\tilde{w}_i$ and $\tilde{w'}_i$ respectively. (2) The expression for $T_1$ below is incorrect since it seems to assume that all the pre-image sizes are $1$-- the expression for $T_2$ is fine except the $j_i$ terms are not defined. However, ``taking out" one term for $\tilde{w'}_{m'}$ for $T_2$ is incorrect since e.g. we could have the pre-image of $m'$ have size $>1$. (3) The proof below never explicitly argues why the condition $\dMap{\wElem_j} = \dMap{\wElem'_j}$ is needed.}
\AR{Here is how I recommend that you re-write the proof. First as mentioned earlier, you should only consider the $T_1$ terms (as you account for the $T_2$ terms later on. Second you should first start off by re-stating the $T_1$ term like so. Consider the ``generic term"--
\[T_1(\tilde{w}_{f(1)},\dots, \tilde{w}_{f(m)}, \tilde{w'}_{f'(1)},\dots, \tilde{w'}_{f'(m')}).\]
Then re-write the what the above term is based on the exact definition (BTW I'm dropping the $\mathbf{E}$ terms for convenience but they should be all there below.) In particular, the above term by definition is exactly
\[\prod_{i=1}^k s(\tilde{w}_{f(i)})\cdot \overline{s(\tilde{w'}_{f'(i)})}.\]
Now re-write the above in terms of ``powers" of distinct worlds:
\[ (\prod_{i=1}^m s(\tilde{w}_{i})^{|f^{-1}(i)|})\cdot \overline{(\prod_{j=1}^m s(\tilde{w'}_j)^{|f^{-1}(j)|})}\]
Now once you have the above expression, then it will be much easier to argue why if any of the matching conditions are not satisfied then the expression is $0$. I also believe that working with the above expression will also make it more ``obvious" as to why the different conditions are required. Currently the arguments below do not explicitly bring this out...
}
To prove that \cref{lem:sig-j-survive} is true, consider what the expectation looks like when $f, f'$ are not matching. We begin with a violation of the first matching condition, i.e., $\dist \neq \dist'$. Since $\dist \neq \dist'$, one set of variables has at least one more distinct world than the other set of variables. Also, to be explicit, $\wElem_1\ldots\wElem_\dist, \wElem_1'\ldots\wElem_{\dist'}'$ are distinct world values such that $\forall i \neq j \in [\dist], \wElem_i = \wElem_i' \neq \wElem_j = \wElem_j'$. To make things easier, assume that $\dist < \dist'$. The opposite case of $\dist > \dist'$ has a symmetrical proof. Fixing variables $\wElem_1\ldots\wElem_\dist, \wElem_1'\ldots\wElem_\dist'$, in both $\term_1$ and $\term_2$ we have one extra distinct value, $\wElem_{\dist'}'$. This distinct term cancels out all the other values in the expectations.
\begin{align}
\term_1 - \term_2 = &\ex{\sine(\wElem_1)\conj{\sine(\wElem_1)}\cdot\ldots\cdot\sine(\wElem_\dist)\conj{\sine(\wElem_\dist)}\cdot\conj{\sine(\wElem'_{\dist'})}} - \ex{\prod_{i = 1}^{\dist}\sine(\wElem_i)^{j_i}}\ex{\left(\prod_{i = 1}^{\dist}\sine(\wElem_i)^{j_i}\right) \cdot \conj{\sine(\wElem'_{\dist'})}}\\
= &\ex{\sine(\wElem_1)\conj{\sine(\wElem_1)}\cdot\ldots\cdot\sine(\wElem_\dist)\conj{\sine(\wElem_\dist)}}\cdot \ex{\conj{\sine(\wElem'_{\dist'})}} - \ex{\prod_{i = 1}^{\dist}\sine(\wElem_i)^{j_i}}\ex{\prod_{i = 1}^{\dist}\sine(\wElem_i)^{j_i}} \cdot \ex{\conj{\sine(\wElem'_{\dist'})}}\\
= &0.
%&\sum_{\substack{\wElem_1,\ldots,\wElem_{\dist},\\ \wElem_1',\ldots,\wElem_{\dist}'\\\in \wSet}}\prod_{i = 1}^{\prodsize}\vect_i(\wElem_i)\vect_i(\wElem_i')
% \left(\ex{\prod_{i = 1}^{k}\sine(\wElem_i)\conj{\sine(\wElem_i')}\ind{\hfunc(\wElem_i) = \buck}\ind{\hfunc(\wElem_i') = \buck}} -
% \ex{\prod_{i = 1}^{k}\sine(\wElem_i)\ind{\hfunc(\wElem_i) = \buck}}\ex{\prod_{i = 1}\conj{\sine(\wElem'_i)}\ind{\hfunc(\wElem'_i) = \buck}} \right) \\
%
%
%\term_1 = \ex{\sine(\wElem_1)^{\dupSize_1}\cdot,\ldots,\cdot\sine(\wElem_m)^{\dupSize_m}\conj{\sine(\wElem_1')}^{\dupSize_1'}\cdot, \ldots,\cdot \conj{\sine(\wElem_{m}')}^{\dupSize_m'} \cdot, \ldots, \cdot \conj{\sine(\wElem_{m'}')}^{\dupSize_{m'}'}} = 0.
\end{align}
By the same reasoning as in the proof of \cref{lem:exp-prod-rand-roots}, we can push the expectation into the product of two independent random values. \textit{Here at most we assume $2\prodsize$-wise independence, but we really would like less}. Then by \cref{lem:exp-sine}, we get a factor of zero in both products, giving a result of zero.\qed
%Notice that, with $\dist < \dist'$, this means that there will be $\sum\limits_{n \in \{ [\dist'] - [\dist] \}}\dupSize_n'$ world values on either side that do not have a match since the number of $\prodsize$ products is constant. This leaves us with $\prod\limits_{\dupSize_{n^*} \in \{\dupSize_n | \forall n \in [m], \dupSize_n \neq \dupSize_n'\}}\sine(\wElem_i)^{\dupSize_{n^*} - \dupSize_{n'}} \cdot \prod\limits_{\dist_i' \in \{\dist' - \dist\}}\conj{\sine(\wElem_{m_i'}')}^{\dupSize_{m_i'}'}$ pairings, where $\forall \wElem_i \in \{\wElem_i | i \in [m], \dupSize_i \neq \dupSize_i'\}, \forall \dist_i' \in \{\dist_i' | \dist_i' \in \{\dist' - \dist\}\}, \wElem_i \neq \wElem_{\dist_i'}$, which in expectation is 0 by \cref{lem:exp-prod-rand-roots}. The proof is symmetric for the case of $\dist > \dist'$.
%
%From $\term_2$, notice that by the fact that $\dist' > \dist$, it has to be the case that $\dist' > 1$, which by \cref{lem:exp-prod-rand-roots} equals zero, setting the whole product to zero. The proof is symmetric for the case of $\dist > \dist'$.
%\qed\newline
%%\newline Assume $\dist < \dist'$.
%%\begin{align*}
%%\term_2 = &\ex{\prod_{i = 1}^ks(w_i)\ind{h(w_i) = j}}\cdot \ex{\prod_{i = 1}^\prodsize\overline{s(w'_i)}\ind{h(w'_i) = j}}\\
%%= &\ex{\sine(\wElem_1)^{\dupSize_1}\cdot\ldots\cdot\sine(\wElem_\dist)^{\dupSize_\dist}} \cdot
%% \ex{\conj{\sine(\wElem_1')}^{\dupSize_1'}\cdot\ldots\cdot\conj{\sine(\wElem_{\dist}')}^{\dupSize_\dist'}\conj{\sine(\wElem_{\dist + 1}')}^{\dupSize_{\dist + 1}'}\cdot\ldots\cdot\conj{\sine(\wElem_{\dist'}')}^{\dupSize_{\dist'}'}}\\
%%= &\ex{\sine(\wElem_1)^{\dupSize_1}\cdot\ldots\cdot\sine(\wElem_\dist)^{\dupSize_\dist}} \cdot 0 \\
%%= & 0
%%\end{align*}
%
%%Note that the second expectation in $\term_2$ cancels out by the fact that there is at least one distinct worl $\wElem_\dist'$ that does not match any of the other world value in the product, and by \cref{lem:exp-prod-rand-roots} yields zero.
To finish the proof of \cref{lem:sig-j-survive}, we now approach the case where $\dist = \dist'$, but the set of preimage cardinalities for $f, f'$ are unequal. Effectively this condition means that we end up with the same result of unequal pairs as when $\dist \neq \dist'$.
\begin{align}
&\{|f^{-1}(i)| ~|~ \forall i \in [\dist]\} \neq \{|f'^{-1}(i')| ~|~ \forall i' \in [\dist] \}\\
\implies &\exists i, i' \in [\dist] \text{ s.t. } |f^{-1}(i)| \neq |f'^{-1}(i)|,\ |f^{-1}(i')| \neq |f'^{-1}(i')|
%\rightarrow &|\dupSize_s - \dupSize_s'| = |\dupSize_t - \dupSize_t'|\text{ by the fact that $\dist = \dist'$}
\end{align}
The above means that we will have at least two world values that do not match, i.e., $\wElem_i \neq \wElem_{i'}'$ for some $i \neq i'$ with $i, i' \in [\dist]$. Fixing all world values except $\wElem_i$ and $\wElem_{i'}'$,
\begin{align}
\term_1 - \term_2 = &\ex{\left(\prod_{i''= 1}^{\dist}\sine(\wElem_{i''})\conj{\sine(\wElem_{i''}')}\right)\sine(\wElem_i)\conj{\sine(\wElem_{i'}')}} - \ex{\prod_{i'' = 1}^{\dist}\sine(\wElem_{i''})}\cdot\ex{\prod_{i'' = 1}^{\dist}\conj{\sine(\wElem_{i''}')}}\\
= &\ex{\prod_{i''= 1}^{\dist}\sine(\wElem_{i''})\conj{\sine(\wElem_{i''}')}}\ex{\sine(\wElem_i)\conj{\sine(\wElem_{i'}')}} -
\ex{\prod_{\substack{i'' = 1\\ i'' \neq i}}^{\dist}\sine(\wElem_{i''})}\cdot \ex{\sine(\wElem_{i})}\cdot\ex{\prod_{\substack{i'' = 1\\ i'' \neq i'}}^{\dist}\conj{\sine(\wElem_{i''}')}}\cdot \ex{\conj{\sine(\wElem_{i'}')}}\\
= &0.
\end{align}
By the same arguments as before, we have at least one distinct world value in each expectation, which by independence of random variables allows us to push the expectations into the product, and then by \cref{lem:exp-prod-rand-roots} and \cref{lem:exp-sine} produce a zero in each product, yielding a value of zero.\qed
%To be clear, if the set of preimage sizes do not match, then we have for both $f, f'$ at least two preimage mappings to their respective distinct variables whose corresponding sizes are both unequal. In the notation above, we set the unmatching cardinalities to $\dupSize_s$, etc. to reference later on. Note that since $\dist$ is the same for both $f, f'$, the disagreement in cardinalities evens out across variables. For the sake of argument, define $\dupSize_s > \dupSize_s', \dupSize_t' > \dupSize_t$.
%
%Translating this to $\term_1$ and $\term2$ we have
%\begin{align}
%\term_1 = &\ex{\left(\prod_{i \in [\dist] | i \neq s, t}\sine(\wElem_i)^{\dupSize_i}\conj{\sine(\wElem_i')}^{\dupSize_i}\right)
% \sine(\wElem_s)^{\dupSize_s}\conj{\sine(\wElem_s')^{\dupSize_s'}}\sine(\wElem_t)^{\dupSize_t}\conj{\sine(\wElem_t')}^{\dupSize_t'} }\\
%= &\ex{\left(\prod_{i \in [\dist] | i \neq s, t}\sine(\wElem_i)^{\dupSize_i}\conj{\sine(\wElem_i')}^{\dupSize_i}\right)
% \sine(\wElem_s)^{\dupSize_s'}\conj{\sine(\wElem_s')^{\dupSize_s'}} \sine(\wElem_t)^{\dupSize_t}\conj{\sine(\wElem_t')}^{\dupSize_t} \cdot
% \sine(\wElem_s)^{\dupSize_s - \dupSize_s'} \conj{\sine(\wElem_t)}^{\dupSize_t' - \dupSize_t} }\\
%=& 0.
%\end{align}
%Note, that because we have that $\wElem_s \neq \wElem_t$, the expectation is zero by \cref{lem:exp-prod-rand-roots}.
%
%For $\term_2$, notice that since we have $m \geq 2$ for preimage sizes to disagree, both expectations yield zero by \cref{lem:exp-prod-rand-roots}.
%
%For the case where $\dupSize_s < \dupSize_s', \dupSize_t' < \dupSize_t$, the argument is symmetric.\qed\newline
%Since we have at least one extra \textit{distinct} world value, whose conjugate of its sine value is paired with the sine value of another distinct world value, the expectation will equal zero.
%\newline
%The proof is immediate and follows from the fact that the random $\sine$ functions are only guaranteed to produce a product of one under one of two possible conditions:
% \begin{enumerate}
% \item $\sine(\wElem)^\prodsize = 1$,
% \item $\sine(\wElem) \conj{\sine(\wElem)} = 1$.\qed
% \end{enumerate}
%\AH{Here is where I have attempted to use prose to discuss the restrictions on $f$ and $f'$, rather than the use of \dist-tuples. Maybe there is a better, cleaner formal way?}
%E.g., for $\prodsize = 4, \dist = 2$, mappings could be such that one $\wElem_i$ is distinct, while the other three $\wElem_i$ are mapped to the other distinct value. Additionally, we would have the case where two $\wElem_i$ map to a distinct value, while the other two $\wElem_i$ map to a seperate distinct world. The expectations of equation \eqref{eq:sig-j-last} restrict $f$ and $f'$ to belonging to the same class of $\dist$-mapping, meaning, if the mapping $f$ for $\prodsize = 4, \dist = 2$ is in the setting of one distinct world and three equal world values, then $f'$ must be from that same set of mappings, and not from another class of mappings, such as when two $w_i$ map to a distinct world, while the other two $w_i$ map to a separate distinct world.
%\AH{Here is the use of \dist-tuples to explain the same thing.}
% In the example above, $f$ mappings for $\dist_{2_1}$ may only cross product with $f'$ mappings for $\dist_{2_1}$ and not with those for $\dist_{2_2}$. Likewise for $f, f'$ mappings of $\dist_{2_2}$.
We now seek to show that when $f, f'$ are matching, $\term_1 - \term_2$ will always equal 1.
Using the above definitions, we can now present the variance bounds for $\sigsq_j$ based on \eqref{eq:sig-j-distinct}.
By the fact that the expectations cancel when $\forall i, i', j, j'\in [\prodsize], \wElem_i = \wElem_j = \wElem_{i'}' = \wElem_{j'}' = \wElem$, we can rid ourselves of the case when there exists only one distinct world value.\AR{This is where you are subtracting off the $T_2$ terms, which is why you do not need to consider them in the arguments above.} We then need to sum up all the $\dist$ distinct world value possibilities for $\dist \in [2, \prodsize]$. Note that the number of distinct values $\dist$ affects the randomness of the hash function $\hfunc$. E.g. only $\dist = 2$ distinct values will yield $\frac{1}{\sketchCols} \cdot \frac{1}{\sketchCols} = \frac{1}{\sketchCols^2} = \frac{1}{\sketchCols^\dist}$. By lemma \ref{lem:sig-j-survive} and equation \eqref{eq:sig-j-distinct} we get
\AR{Um, no this is not an argument for general situation. You need to explicitly argue how each term in Lemma~\eqref{eq:sig-j-distinct} fares under Lemma~\ref{lem:sig-j-survive} and then how those give you the expression below. In particular, you should explicitly argue what happens to the indicator variable terms. Also doing this for $\lambda(j,j')$ would mean you'll have to deal with those explicitly in any case.}
%
%\begin{equation*}
%\frac{1}{\sketchCols^2}\sum_{\dMap{\wElem_1}, \dMap{\wElem_2}}\prod_{i = 1}^{\prodsize}\vect_i(\dMap{\wElem_{f(i)}})\vect_i(\dMap{\wElem_{f'(i)}}).
%\end{equation*}
%This is because we know that the expectation from \eqref{eq:sig-j-last} will survive when we have mappings that produce pairs of the form $\sine(\wElem)\conj{\sine(\wElem)}$. Second, in consideration of the randomized hashing, with two distinct variables, the indicator variables in the expectation yield $\frac{1}{\sketchCols}\cdot \frac{1}{\sketchCols}$.
%
%We need to sum over all mappings for each case (c) when the number of distinct values is $\dist = 2$, resulting in
%\begin{equation*}
%\frac{1}{\sketchCols^2}\sum_{\dMap{\wElem_1}, \dMap{\wElem_2}}\sum_{c \in \dist = 2}\sum_{f, f'}\prod_{i = 1}^{\prodsize}\vect_i(\dMap{\wElem_{f(i)}})\vect_i(\dMap{\wElem_{f'(i)}}).
%\end{equation*}
%
%Finally, we need to do this for all $\dist$.
\begin{equation*}
\sigsq_j = \sum_{\dist \in [2, \prodsize]} \frac{1}{B^\dist} \sum_{\dMap{w_1},\ldots,\dMap{w_\dist}\in W} \sum_{\substack{f, f',\\\match{f}{f'}}} \prod_{i = 1}^{\prodsize} v_i(\dMap{w_{f(i)}}) v_i(\dMap{w_{f'(i)}}).
\end{equation*}
\AR{Remaining TODOs: (1) Give expression for general $\sigma^2$, i.e. deal with the general $\lambda(j,j')$ term. (2) Show how to use the analysis for general $k$-product to handle generic SoP expressions-- the expectation arguments would just follow from the above and linearity of expectation but the variance bounds might need a bit of extra work.}