paper-BagRelationalPDBsAreHard/sop.tex
2020-04-17 11:24:21 -04:00

297 lines
32 KiB
TeX

%root--main.tex
\section{Analysis of a $\prodsize$-way join}
There are several steps involved in obtaining bounds on the Sum of Products (SOP) query. We start by analyzing a $\prodsize$-way product. Define the $j^{\text{th}}$ bucket of a sketch $\sk$ for a vector $\vect$ as
\[\sk^\vect[j] = \sum_{\substack{\wElem \in \wSet,\\ \hfunc(\wElem) = j}}\vect(\wElem)\sine(\wElem).\]
Define the estimate of the $j^{\text{th}}$ bucket to be
\[\est_j = \prod_{i = 1}^{\prodsize}\sk^{\vect_i}[j].\]
For notational convenience define
\begin{align*}
&\wSet_j = \{\wElem ~|~ \hfunc(\wElem) = j\}\\
&\term_j = \sum_{\wElem \in \wSet_j} \prod_{i = 1}^{\prodsize}\vect_i(\wElem)
\end{align*}
Let us show first that the expectation of the estimate does in fact yield the value we are estimating, $\term_j$.
\begin{Lemma}
The expectation of an estimate $\est_j$ is the sum of its pointwise vector products.
\end{Lemma}
\begin{proof}
\begin{align*}
\ex{\est_j} = &\ex{\prod_{i = 1}^{\prodsize}\sk^{\vect_i}[j]} \\
= &\ex{\prod_{i = 1}^{\prodsize} \sum_{\substack{\wElem \in \wSet_j, \\ \hfunc(\wElem) = j}}\vect_i(\wElem)\sine(\wElem)}\\
= &\ex{\sum_{\substack{\wElem_1,\ldots, \wElem_{\prodsize}\\ \in \wSet_j}} \prod_{i = 1}^{\prodsize}\vect_i(\wElem_i)\prod_{i = 1}^{\prodsize}\sine(\wElem_i)}\\
= &\sum_{\substack{\wElem_1,\ldots, \wElem_{\prodsize}\\ \in \wSet_j}} \prod_{i = 1}^{\prodsize}\vect_i(\wElem_i)\ex{\prod_{i = 1}^{\prodsize}\sine(\wElem_i)}
\end{align*}
Fix the variables $\wElem_1,\ldots, \wElem_{\prodsize}$. Define $\dist$ to be the number of distinct worlds in $\wElem_1,\ldots, \wElem_{\prodsize}$ and $e_{\ell}$ to be the number of repetitions for the $\ell^{\text{th}}$ distinct world value. Focusing on the expectation factor, $\ex{\prod_{i = 1}^{\prodsize} \sine(\wElem_i)}$, we get
\begin{align*}
&\ex{\prod_{i = 1}^{\prodsize}\sine(\wElem_i)}\\
= &\ex{\prod_{\ell = 1}^{\dist} \sine(\wElem_{\ell})^{e_{\ell}}}\\
= & \begin{cases}
0 &1 <\dist \leq \prodsize\\
1 & \dist = 1.
\end{cases}
\end{align*}
We obtain the final equality by \cref{lem:exp-sine}, which states that the expectation of $\sine(\wElem_{\ell})^{e_{\ell}}$ can be something other than $0$ only when $e_{\ell} = \prodsize$. Since $\sum_{\ell = 1}^{\dist} e_{\ell} = \prodsize$, the only way this can happen is when $\dist = 1$.
Notice, that the above leaves us with the only remaining condition that $\forall i, j \in [\prodsize], \wElem_i = \wElem_j$,
\begin{align*}
\ex{\est_j} = &\sum_{\wElem \in \wSet_j}\prod_{i = 1}^{\prodsize} \vect_i(\wElem) \cdot \ex{\prod_{i = 1}^{\prodsize} \sine(\wElem)} = \term_j.
\end{align*}
\end{proof}
The proof for $\est = \sum_j \est_j$ follows by linearity of expectation.\qed\newline
We need to compute the variance of the $\prodsize$-way product $\est$. We wish to prove that
\begin{equation}
\sigsq \leq \sum_j \sigsq_j \label{eq:var-to-prove}.
\end{equation}
For notational convenience let $\cvar{j, j'} = \ex{\est_j \cdot \conj{\est_{j'}}} - \ex{\est_j}\ex{\conj{\est_{j'}}}$.
Substituting in the definition of variance for complex numbers,
\begin{align}
\sigsq &= \ex{\sum_j \est_j \cdot \conj{\sum_{j'} \est_{j'}}} - \ex{\sum_j \est_j}\cdot\ex{\conj{\sum_{j'} \est_{j'}}}\nonumber\\
&= \ex{\sum_j \est_j \cdot \sum_{j'} \conj{\est_{j'}}} - \ex{\sum_j \est_j}\cdot\ex{\sum_{j'} \conj{\est_{j'}}}\nonumber\\
&= \sum_{j, j'}\left(\ex{\est_j \cdot \conj{\est_{j'}}} - \ex{\est_j}\ex{\conj{\est_{j'}}}\right)\nonumber\\
&= \sum_j\left(\ex{\est_j \cdot \conj{\est_j}} - \ex{\est_j}\ex{\conj{\est_j}}\right) + \sum_{j \neq j'}\cvar{j, j'}\nonumber\\
&= \sum_j \sigsq_j + \sum_{j \neq j'}\cvar{j, j'} \label{eq:sigsq}
\end{align}
Notice that assuming independence of $\sigsq_j ~\forall j \in \sketchCols$, we can push the variance through the sum and obtain the result
\begin{align*}
&\sigsq - \sum_j \sigsq_j = \sum_{j \neq j'}\cvar{j, j'}\\
&\implies \sum_{j \neq j'}\cvar{j, j'} \leq 0.
\end{align*}
\AH{The implication above was discussed months ago, but I don't see how it's true. Is it true?}
One can see that \cref{eq:sigsq} is composed of two addends. We now bound each of them separately.
\subsection{Bounding $\cvar{j, j'}$}
\AR{You need to re-write the stuff below. First in the 2nd equality suddenly the sum on $j\ne j'$ has vanished. Also I think you should first analyze $\lambda(j,j')$ for both $j=j'$ and $j\ne j'$ for as long as you can. Only when it is needed should you divide into the two cases-- do not do the division up front.}
Notice we have two cases of $\cvar{j, j'}$, the first is when $j = j'$, i.e. $(\sigsq_j)$, and the second when $j \neq j'$.
\begin{align}
\cvar{j, j'} &= \ex{\est_j \cdot \conj{\est_{j'}}} - \ex{\est_j}\cdot\ex{\conj{\est_{j'}}}\nonumber\\
&=\ex{\prod_{i = 1}^{\prodsize}\sum_{\wElem \in W}v_i(\wElem)s(\wElem)\ind{h(\wElem) = j}\cdot \prod_{i = 1}^{\prodsize}\sum_{\wElem' \in W}v_i(\wElem')\conj{s(\wElem')}\ind{h(\wElem') = j'}} - \ex{\prod_{i = 1}^{\prodsize}\sum_{\wElem \in W}v_i(\wElem)s(\wElem)\ind{h(\wElem) = j}}\cdot \ex{\prod_{i = 1}^{\prodsize}\sum_{\wElem' \in W}v_i(\wElem')\conj{s(\wElem')}\ind{h(\wElem') = j'}}\nonumber\\
&=\ex{\sum_{\substack{\wElem_1,\cdots,\wElem_\prodsize,\\\wElem'_1,\cdots,\wElem'_\prodsize\\\in W}}\prod_{i = 1}^{\prodsize}v_i(\wElem_i)s(\wElem_i)v_i(\wElem'_i)\conj{s(\wElem'_i)} \ind{h(\wElem_i) = j} \ind{h(\wElem'_i) = j'}} - \ex{\sum_{\substack{\wElem_1,\cdots, \wElem_\prodsize\\\in W}}\prod_{i = 1}^{\prodsize}v_i(\wElem_i)s(\wElem_i) \ind{h(\wElem_i) = j}}\cdot\ex{\sum_{\substack{\wElem'_1,\cdots, \wElem'_\prodsize\\\in W}}\prod_{i = 1}^{\prodsize}v_i(\wElem'_i)\conj{s(\wElem'_i)} \ind{h(\wElem'_i) = j'}}\nonumber\\
&=\sum_{\substack{\wElem_1,\cdots,\wElem_\prodsize,\\\wElem'_1,\cdots,\wElem'_\prodsize\\\in W}}\ex{\prod_{i = 1}^{\prodsize}v_i(\wElem_i)s(\wElem_i)v_i(\wElem'_i)\conj{s(\wElem'_i)}\ind{h(\wElem_i) = j}\ind{h(\wElem'_i) = j'}} - \ex{\prod_{i = 1}^{\prodsize} v_i(\wElem_i)s(\wElem_i) \ind{h(\wElem_i) = j}} \cdot \ex{\prod_{i = 1}^{\prodsize}v_i(\wElem'_i)\conj{s(\wElem'_i)}\ind{h(\wElem'_i) = j'}}\\
&= \sum_{\substack{\wElem_1,\cdots,\wElem_\prodsize,\\\wElem'_1,\cdots,\wElem'_\prodsize\\\in W}}\prod_{i = 1}^{\prodsize}v_i(\wElem_i)v_i(\wElem'_i)\ex{\prod_{i = 1}^{\prodsize}s(\wElem_i)\conj{s(\wElem'_i)}\ind{h(\wElem_i) = j}\ind{h(\wElem'_i) = j'}} - \prod_{i = 1}^{\prodsize}v_i(\wElem_i)\ex{\prod_{i = 1}^{\prodsize}s(\wElem_i)\ind{h(\wElem_i) = j}}\cdot \prod_{i = 1}^{\prodsize}v_i(\wElem'_i)\ex{\prod_{i = 1}^{\prodsize}\conj{s(\wElem'_i)}\ind{h(\wElem_i') = j'}}\nonumber\\
&= \sum_{\substack{\wElem_1,\cdots,\wElem_\prodsize,\\\wElem'_1,\cdots,\wElem'_\prodsize\\\in W}}\prod_{i = 1}^{\prodsize}v_i(\wElem_i)v_i(\wElem'_i)\left(\ex{\prod_{i = 1}^{\prodsize}s(\wElem_i)\conj{s(\wElem'_i)}\ind{h(\wElem_i) = j}\ind{h(\wElem'_i) = j'}} - \ex{\prod_{i = 1}^{\prodsize}s(\wElem_i)\ind{h(\wElem_i) = j}}\cdot\ex{\prod_{i = 1}^{\prodsize}\conj{s(\wElem'_i)}\ind{h(\wElem_i') = j'}} \right).\label{eq:var-lambda-j-j'}
\end{align}
\AH{How can I present the derivation of the bounds below in a \textit{better} way?}
Equation~\eqref{eq:var-lambda-j-j'} for $j \neq j'$ corresponds to the rightmost sum of \cref{eq:sigsq}. For $\term_1^{\cvar{j, j'}} = \ex{\prod_{i = 1}^{\prodsize}s(\wElem_i)\conj{s(\wElem'_i)}\ind{h(\wElem_i) = j}\ind{h(\wElem'_i) = j'}}$, because hash function $h$ cannot bucket the same world to two different buckets, the only instance $\term_1^{\cvar{j, j'}} = 1$ occurs when there is no overlap between the $\wElem_i$ and $\wElem'_i$ variables. Given the condition of no overlap, $\term_1^{\cvar{j, j'}} = 1$ only with the further condition that $\forall i \in [\prodsize], \wElem_i = \wElem, \wElem'_i = \wElem', \wElem \neq \wElem'$. Notice, however, that given these conditions, the product of the remaining expectations will cancel this out. Looking at the remaining two expectations $\term_2^{\cvar{j, j'}} = \ex{\prod_{i = 1}^{\prodsize}\sine(\wElem_i) \ind{\hfunc(\wElem_i) = j}} \cdot \ex{\prod_{i = 1}^{\prodsize}\conj{\sine(\wElem'_i)} \ind{\hfunc(\wElem'_i) = j'}}$, we see that $\term_2^{\cvar{j, j'}} = 1$ only when $\forall i \in [\prodsize], \wElem_i = \wElem, \wElem'_i = \wElem'$. Taken together, the constraints leave us with only one possible case for $\term_1^{\cvar{j, j'}} - \term_2^{\cvar{j, j'}} \neq 0$, namely when all variables are the same world. Thus,
\begin{align}
&\sum_{j \neq j'}\cvar{j, j'} = - \frac{1}{B^2}\sum_{\wElem \in W}\prod_{i = 1}^{\prodsize}v_i^2(\wElem)\label{eq:cvar-bound}.
\end{align}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Taking a look at the leftmost term of \cref{eq:sigsq}, we establish bounds on the variance of the $j^{\text{th}}$ bucket of a $\prodsize$-way join. Note that in this case \cref{eq:var-lambda-j-j'} has $j = j'$, and can be written in the following way,
\begin{align}
%&\sigsq_j = \ex{\est_j \cdot \overline{\est_j}} - \ex{\est_j} \cdot \ex{\overline{\est_j}} \nonumber\\
%&= \ex{\prod_{i = 1}^{\prodsize}\sum_{w \in W_j}v_i(w)s(w) \cdot \prod_{i = 1}^\prodsize\sum_{w' \in W_j}v_i(w')\overline{s(w')}} -
%\ex{\prod_{i = 1}^{\prodsize}\sum_{w \in W_j}v_i(w)s(w)}\cdot \ex{\prod_{i = 1}^\prodsize\sum_{w' \in W_j}v_i(w')\overline{s(w')}}\nonumber\\
%&= \ex{\sum_{\substack{w_1...w_\prodsize\\w'_1...w'_\prodsize\\ \in W}}\prod_{i = 1}^\prodsize v_i(w_i)v(w'_i)s(w_i)\overline{s(w'_i)}\ind{h(w_i) = j}\ind{h(w'_i) = j}} -
%\ex{\sum_{w_1...w_\prodsize \in W} \prod_{i = 1}^\prodsize v_i(w_i)s(w_i)\ind{h(w_i) = j}} \cdot
%\ex{\sum_{w'_1...w'_\prodsize \in W} \prod_{i = 1}^\prodsize v_i(w'_i)\overline{s(w'_i)}\ind{h(w'_i) = j}}\nonumber\\
%=&\sum_{\substack{w_1...w_\prodsize\\w'_1...w'_\prodsize\\ \in W}}\ex{\prod_{i = 1}^\prodsize v_i(w_i)v_i(w'_i)s(w_i)\overline{s(w'_i)}\ind{h(w_i) = j}\ind{h(w'_i) = j}} -
%\ex{\prod_{i = 1}^kv_i(w_i)s(w_i)\ind{h(w_i) = j}} \cdot \ex{\prod_{i = 1}^\prodsize v_i(w'_i)\overline{s(w'_i)}\ind{h(w'_i) = j}}\nonumber\\
\sigsq_j &= \sum_{\substack{w_1,\ldots,w_\prodsize\\w'_1,\ldots,w'_\prodsize\\ \in W}}\prod_{i = 1}^\prodsize v_i(w_i)v_i(w'_i)\cdot\left( \ex{\prod_{i = 1}^\prodsize s(w_i)\overline{s(w'_i)}\ind{h(w_i) = j}\ind{h(w'_i) = j}} -
\ex{\prod_{i = 1}^\prodsize s(w_i)\ind{h(w_i) = j}}\cdot \ex{\prod_{i = 1}^\prodsize\overline{s(w'_i)}\ind{h(w'_i) = j}} \right)\label{eq:sig-j-last}.
\end{align}
Before proceeding, we introduce some notation and terminology that will aid in communicating the bounds we are about to establish. We refer to the leftmost expectation of \cref{eq:sig-j-last} in the following way:
\AR{dangling eq ref}\AH{I don't see one}
\[\term_1\left(\wElem_1,\ldots,\wElem_\prodsize, \wElem_1',\ldots, \wElem_\prodsize'\right) = \ex{\prod_{i = 1}^\prodsize s(w_i)\overline{s(w'_i)}\ind{h(w_i) = j}\ind{h(w'_i) = j}}.%\text{, and}
\]
%\[\term_2\left(\wElem_1,\ldots,\wElem_\prodsize, \wElem_1',\ldots, \wElem_\prodsize'\right) = \ex{\prod_{i = 1}^ks(w_i)\ind{h(w_i) = j}}\cdot \ex{\prod_{i = 1}^\prodsize\overline{s(w'_i)}\ind{h(w'_i) = j}}. \]
We will use the vocabulary ``term'' to denote $\prod_{i = 1}^{\prodsize}\vect_i(\wElem_i)\vect_i(\wElem_i') \cdot\term_1\left(\wElem_1,\ldots,\wElem_\prodsize, \wElem_1',\ldots,\wElem_\prodsize'\right)$ given a specific set of world values. %To say that a term survives \AR{You should not care about whether the $T_1$ term survives or not. See the above comment on why.} the expectation is to mean that $\term_1 - \term_2 \neq 0$. Note, that the only terms that survive the expectation above are mappings of $w_i = w'_j = w$ for $i, j \in [\prodsize]$, such that each $w_i$ has a match, i.e., no $w_i$ or $w'_j$ stands alone without a matching world in its complimentary set. In other words, the set of values in $\wElem_1,\ldots,\wElem_k$ has a bijective mapping to the set of values in $\wElem'_1,\ldots,\wElem'_k$.
We next describe the nonzero terms of \cref{eq:sig-j-last}.
%\subsection{M-tuples}
%\begin{Definition}
%Given a $\prodsize$-way join, define $\dist \in [\prodsize]$. An \dist-tuple then is a set of tuples, each tuple conatining $\dist$ elements, such that the values of each tuple sum up to $\dist$, i.e. $\forall i \in [\dist], \sum_j \dist_{t_{i, j}} = \dist$, where i is the $i^{th}$ tuple in $\dist_t$, and $j$ is the $j^{th}$ index of that tuple $t$. The set consists of each unique sum up to symmetry, meaning a tuple with the same elements only reversed is disallowed.
%\end{Definition}
%For example, when $\prodsize = 4$, $\dist = 2$, the \dist-tuple, denoted, $\dist_2$, would be$\left\{\left(1, 3\right), \left(2, 2\right)\right\}$. Here, $\dist_{2_{1, 1}} = 1$, and while the tuple $\left(3, 1\right)$ sums up to $\prodsize = 4$, we do not include it since we have it's symmetrical term $\left(1, 3\right)$.
%
%\AR{Why is the definition of M-tuples needed? From what I understand you need this to define what kinds of $f$ and $f'$ are allowed but in that case why not state those properties directly in terms of $f$ and $f'$? Actually after reading the next section, I do not see why these properties are needed at all..}
%\AH{I use the \dist-tuples to explain 1) what kind of matchings survive and 2) that $f, f'$ must only cross product from within the matchings of the same tuple. Maybe there is an easier way to do this.}
\subsection{The functions $f$ and $f'$}
\begin{Definition}
Define and then fix a total ordering of the $\dist$ distinct world elements to follow the total order of the natural numbers in $[\dist]$, such that $\forall i, j \in [\dist], i < j \implies \dw_i \prec \dw_j$, i.e., $\dw_1 \prec\ldots\prec\dw_\dist$.
%Given a fixed order $\wSet_{\order}: \left(\wSet, \wSet\right)\mapsto \mathbb{B}$ of possible worlds, define the lexographical order of distinct worlds $\wSet_\dist$ to be the ordering which complies to the identity mapping of elements in $[\prodsize]$ to elements in $[\dist]$ up to $\dist$, such that . In other worlds, $\forall \wElem, \wElem' \in \wSet_\dist, \dw < \wElem' \leftrightarrow \wSet_{\order}\left(\wElem, \wElem'\right) = T$.
\end{Definition}
\AR{NO. The ordering $\prec$ has nothing to do with $m$. It is just ordering all the worlds in $W$.}
To help describe all possible world value matchings we introduce functions $f$ and $f'$.
\begin{Definition}
Functions $f$ and $f'$ are surjective mappings from $[\prodsize]$ onto $[\dist]$ and $[\dist']$, respectively: $f\colon [\prodsize] \rightarrow [\dist], f'\colon [\prodsize] \rightarrow [\dist'].$
\end{Definition}
%\begin{equation*}
%f(i) = \begin{cases}
% \dMap{w_1} &f(i) = 1\\
% \dMap{w_2} &f(i) = 2\\
% \vdots &\vdots\\
% \dMap{w_\dist} &f(i) = \dist.
% \end{cases}
%\end{equation*}
The functions $f, f'$ are used to produce the mappings $w_i \mapsto \dMap{w_{f(i)}}$. In particular, $f$ and $f'$ are machinery for mapping $\prodsize$ $\wElem$-world variables to $\dist$ distinct values.
We rewrite equation \eqref{eq:sig-j-last} in terms of $\dist$ distinct worlds, with $f, f'$ mappings.
\begin{equation}
\sum_{\dist = 2}^{\prodsize}\sum_{\dist' = 2}^{\prodsize}\sum_{f, f'}\sum_{\substack{\dw_1, \ldots,\dw_\dist,\\ \dw'_{1},\ldots,\dw'_{\dist'}\\ \in W}}\prod_{i = 1}^{\prodsize}\vect_i(\dw_{_{f(i)}})\vect_i(\dw_{'_{f'(i)}})\cdot \term_1\left(\dw_{f(1)},\ldots,\dw_{f(\prodsize)}, \dw'_{f'(1)},\ldots, \dw'_{f'(\prodsize)}\right)
\label{eq:sig-j-distinct}
\end{equation}
\AR{Three comments on the above: (1) Why do the sums on $m$ and $m'$ start with $2$ and not $1$? (2) Also $\tilde{w}_1,\dots,\tilde{w}_m\in W$ should be replaced by $\tilde{w}_1\prec \cdots\prec \tilde{w}_m \in W$-- similarly for $\tilde{w'}_i$s as well. (3) Use $\widetilde{w_i}$ instead of $\tilde{w}_i$-- I had used the latter in my notes due to laziness.}
Observe that the cartesian product of world values assigned to $\wElem_1,\ldots,\wElem_\prodsize$ throughout the summation can be rearranged into groups of world variables with distinct world values, for each distinct element $\dist$ in the set $[\prodsize]$. For each $\dist \in [\prodsize]$, all possible combinations of $\dist$ world values can be equivalently modeled by taking the set of surjective functions $f:[\prodsize]\mapsto [\dist]$ and computing all world value combinations based on the total ordering of $\dw_{f(1)}\prec\cdots\prec\dw_{f(m)}$.\AR{Again total ordering is on worlds in $W$-- $\dw_{f(1)}\prec\cdots\prec\dw_{f(m)}$ does not make sense since some of these world values could be the same.} For any $\dist$, all surjective mappings $f$ constitute all unique mappings with their symmetrical counterparts \AR{I do not see what the ``symmetrical counterparts" comment adds here. Just remove it}. Combining that with the total order over $\dw_{f(1)},\ldots,\dw_{f(\dist)}$ yields exactly the world value combinations containing $\dist$ distinct values which appear in the cartesian product of the sum, without double counting \AR{Again not sure the ``double counting" comment adds anything here}.
\AR{Overall comments: (1) The main thing missing is explicitly stating that $(w_1,\dots,w_k)\mapsto (\dw_{f(1)},\ldots,\dw_{f(\dist)})$. (2) After stating the map you should argue in words why all distinct tuples with $m$ distinct world values are covered.} What this all boils down to is a rearrangement of addends in the sum.
\begin{Definition}
Functions $f\colon[\prodsize]\to [\dist], f'\colon[\prodsize]\to [\dist']$ are said to be matching, denoted $\match{f}{f'}$, if and only if
\begin{enumerate}
\item $\dist = \dist'$
\item $\forall i \in [\dist], |f^{-1}(i)| = |f'^{-1}(i)|$, i.e., the cardinality of variables mapped to $\dw_i$ equals the cardinality of variables mapped to $\dw_i'$, for all $i \in [\dist]$.
\end{enumerate}
\end{Definition}
\begin{Lemma}\label{lem:sig-j-survive}
When $f, f'$ are matching, where $\forall j \in[\dist], \dw_{_j} = \dw_{'_j}$, \cref{eq:sig-j-distinct} is exactly
\[
\sum_{\substack{\dw_{_1}, \ldots,\dw_{_\dist},\\\dw_{'_1},\ldots,\dw_{'_{\dist'}}\\ \in W}}\prod_{i = 1}^{\prodsize}\vect_i(\dw_{_{f(i)}})\vect_i(\dw_{'_{f'(i)}})
\]
and $0$ otherwise.
\end{Lemma}
In proving \cref{lem:sig-j-survive}, we introduce another fact.
\begin{Lemma}\label{lem:exp-prod-rand-roots}
Given a $\prodsize^{\text{th}}$ root of unity $\rou$, the expectation of the product of $(\rou^i)^l \cdot (\rou^j)^{l'}$ for uniformly random $i, j$, where $i, j, l, l' \in [\prodsize]$, is zero.
\end{Lemma}
\AH{I don't think I need this lemma anymore, but I'll give it a go anyways.}
\AH{A quick heads up, I realized as I was doing this that I had stated things incorrectly in the definition of the lemma. This, being corrected, allows only for one case.}
\AH{Also, since I don't use this in the proof of \cref{lem:sig-j-survive}, it probably makes no sense to have this sitting right in the middle of things. I'm only leaving it since you had said you wanted to see a proof for both cases: $\wElem = \wElem', \wElem \neq \wElem'$.}
\begin{proof}
The proof only needs the case when $\wElem \neq \wElem'$, since $i, j$ are both uniformly random.
\begin{align*}
&\ex{\sine(\wElem)^i \conj{\sine(\wElem')}^j}\\
= &\ex{\sine(\wElem)^i}\ex{\conj{\sine(\wElem')}^j}\\
= &0
\end{align*}
\end{proof}
In the above, since we have more than pairwise independence for $\wElem \neq \wElem'$, we can push the expectation into the product. Then by \cref{lem:exp-sine} we get 0 for both expectations.\newline
\begin{proof}
\AR{First some typos/things that are incorrect below-- note this is \textbf{not} an exhaustive list. (1) In the proof below the $w_i$ and $w'_i$ should be $\tilde{w}_i$ and $\tilde{w'}_i$ respectively. (2) The expression for $T_1$ below is incorrect since it seems to assume that all the pre-image sizes are $1$-- the expression for $T_2$ is fine except the $j_i$ terms are not defined. However, ``taking out" one term for $\tilde{w'}_{m'}$ for $T_2$ is incorrect since e.g. we could have the pre-image of $m'$ have size $>1$. (3) The proof below never explicitly argues why the condition $\dw_{_j} = \dw_{'_j}$ is needed.}
\AR{Here is how I recommend that you re-write the proof. First as mentioned earlier, you should only consider the $T_1$ terms (as you account for the $T_2$ terms later on. Second you should first start off by re-stating the $T_1$ term like so. Consider the ``generic term"--
\[T_1(\tilde{w}_{f(1)},\dots, \tilde{w}_{f(m)}, \tilde{w'}_{f'(1)},\dots, \tilde{w'}_{f'(m')}).\]
Then re-write the what the above term is based on the exact definition (BTW I'm dropping the $\mathbf{E}$ terms for convenience but they should be all there below.) In particular, the above term by definition is exactly
\[\prod_{i=1}^k s(\tilde{w}_{f(i)})\cdot \overline{s(\tilde{w'}_{f'(i)})}.\]
Now re-write the above in terms of ``powers" of distinct worlds:
\[ (\prod_{i=1}^m s(\tilde{w}_{i})^{|f^{-1}(i)|})\cdot \overline{(\prod_{j=1}^m s(\tilde{w'}_j)^{|f^{-1}(j)|})}\]
Now once you have the above expression, then it will be much easier to argue why if any of the matching conditions are not satisfied then the expression is $0$. I also believe that working with the above expression will also make it more ``obvious" as to why the different conditions are required. Currently the arguments below do not explicitly bring this out...
}
Consider the ``generic term''
\[T_1(\tilde{w}_{f(1)},\dots, \tilde{w}_{f(\prodsize)}, \tilde{w'}_{f'(1)},\dots, \tilde{w'}_{f'(\prodsize)}).\]
Let's rewrite the term based on its exact definition:
\begin{align*}
= &\ex{\prod_{i = 1}^{\prodsize}\sine(\dw_{f(i)})\cdot\conj{\sine(\dw'_{f'(i)})}}\\
= &\ex{\left(\prod_{i = 1}^{\dist}\sine(\dw_{i})^{|f^{-1}(i)|}\right) \cdot \left(\prod_{j = 1}^{\dist'}\conj{\sine(\dw'_{j})}^{|f'^{-1}(j)|}\right)}
\end{align*}
Notice that each $i \in [\prodsize]$ has its own mapping to an element in $[\dist]$. We can thus rearrange all the elements of the product such that the preimage of function $f(i)$, i.e., $f^{-1}(i)$ yields the number of terms that will be mapped to a distinct variable $\dw_i$.
Further see how the requirement that $\dw_i = \dw'_i$ gives us the precise combinations we are looking for, where each random $\sine$ output value has its own matching complex conjugate.
To prove that \cref{lem:sig-j-survive} is true, consider what the expectation looks like when $f, f'$ are not matching. The first condition for $f, f'$ to be matching is violated when $\dist \neq \dist'$.
\AH{The following observation isn't necessary to complete the proof. I'll just leave it for now in case it may be valuable down the road.}
Observe that $\forall \dist \in [\prodsize], \sum_{i = 1}^{\dist}|f^{-1}(i)| = \prodsize$ and that this fact implies for $\dist, \dist' \in [\prodsize] ~|~\dist \neq \dist', \exists i \in [\dist] ~|~\forall j \in [\dist'], |f^{-1}(i)| \neq |f'^{-1}(j)|$, meaning that if we have that $\dist \neq \dist'$, then the second matching condition is also violated.
\AH{Moving on...}Note that $\dw_1\ldots\dw_\dist, \dw_1'\ldots\dw_{\dist'}'$ are distinct world values such that $\forall i \neq j \in [\dist], \dw_i = \dw_i' \neq \dw_j = \dw_j'$. To make things easier, assume that $\dist < \dist'$. The opposite case of $\dist > \dist'$ has a symmetrical proof. Fixing variables $\dw_1\ldots\dw_\dist, \dw_1'\ldots\dw_{\dist'}'$, we have at least one $\dw_i$ without a conjugate, and at least one extra distinct value, $\dw_{\dist'}'$, for which no $\dw_{\dist'}$ exists. This (these) distinct term(s) cancel(s) out all the other values in the expectations.
\begin{align}
&\ex{\left(\prod_{i = 1}^{\dist}\sine(\dw_{i})^{|f^{-1}(i)|}\right) \cdot \left(\prod_{j = 1}^{\dist'}\conj{\sine(\dw'_{j})}^{|f'^{-1}(j)|}\right)}\nonumber\\
= &\ex{\left(\prod_{i = 1}^{\dist}\sine(\dw_{i})^{|f^{-1}(i)|}\right) \cdot \left(\prod_{j = 1}^{\dist}\conj{\sine(\dw'_{j})}^{|f'^{-1}(j)|}\right)\left(\prod_{l = \dist + 1}^{\dist'}\conj{\sine(\dw'_l)}^{|f'^{-1}(l)|}\right)}\nonumber\\
= &\ex{\left(\prod_{i = 1}^{\dist}\sine(\dw_{i})^{|f^{-1}(i)|}\right) \cdot \left(\prod_{j = 1}^{\dist}\conj{\sine(\dw'_{j})}^{|f'^{-1}(j)|}\right)} \cdot \prod_{l = \dist + 1}^{\dist'} \ex{\conj{\sine(\dw'_l)}^{|f'^{-1}(l)|}} = 0.\label{eq:lem-fmatch-pt1}
\end{align}
In \cref{eq:lem-fmatch-pt1} the expectation can be pushed through the last product group since we know that all the operands are distinct from any others appearing in the overall product. Then, by \cref{lem:exp-sine} we get $0$ for that rightmost term, and this cancels out the rest of the terms in the overall product.
\textit{Here at most we assume 2k wise independence, but we really would like less}.
\AH{Can we bring it down to k-wise, since we have $<$ k terms which we are pushing the expectation through?}
To complete the proof, we now approach the case where $\dist = \dist'$, but there is a $\dw_i, \dw_i'$ with an unequal number of mappings.
\begin{align*}
&\exists i \in [\dist], |f^{-1}(i)| \neq |f'^{-1}(i)|\\
\implies &\exists j \in [\dist] ~|~i \neq j, |f^{-1}(j)| \neq |f'^{-1}(j)|\\
\implies &\exists i, j \in [\dist], i \neq j ~|~ |\dw_i| \neq |\dw'_i|, |\dw_j| \neq |\dw'_j|.
%\implies &\exists \wElem_i \in \wSet ~|~ \nexists \wElem_i' \in \wSet ~|~ \wElem_i = \wElem_i'
\end{align*}
The above means that we will have at least two world values that don't match. Put another way, after the optimal number of matching world value pairs have been assigned, there will be at least one world value whose matching conjugate product is not the conjugate of the sine of the same world value, i.e. for $i \neq j$, there will exist at least one product of $\sine(\dw_i) \conj{\sine(\dw_{j}')}$.
Such cross terms exist since
\[\left(\sum_{\substack{i \in [\dist],\\|f^{-1}(i)| \neq |f'^{-1}(i)|}}|f^{-1}(i)|\right) = \left(\sum_{\substack{i' \in [\dist],\\|f^{-1}(i')| \neq |f'^{-1}(i')|}}|f'^{-1}(i')|\right)\]
Let $n = \{i ~|~ |f^{-1}(i)| \neq |f'^{-1}(i)|\}$. Further, let $\dist_* = [\dist] \setminus n$ and $|f^{*-1}(i)| = \min\left(|f^{-1}(i)|, |f'^{-1}(i)|\right)$. Then,
\begin{align}
\term_1 = &\ex{\left(\prod_{i \in \dist_*} \sine(\dw_i)^{|f^{-1}(i)|} \conj{\sine(\dw'_i)}^{|f'^{-1}(i)|}\right)
\left(\prod_{j \in n}\sine(\dw_j)^{|f^{*-1}(j)|} \conj{\sine(\dw'_j)}^{|f^{*-1}(j)|}\right)
\left(\prod_{\substack{i' \in n ~|~\\|f^{-1}(i')| > |f'^{-1}(i')|}} \sine(\dw_{i'})^{|f^{-1}(i')| - |f'^{-1}(i')|} \prod_{\substack{j' \in n ~|~\\ |f'^{-1}(j')| > |f^{-1}(j')|}} \conj{\sine(\dw'_{j'})}^{|f'^{-1}(j')| - |f^{-1}(j')|}\right)} \label{eq:lem-match-pt2-line1}\\
= &\ex{\left(\prod_{i \in \dist_*} \sine(\dw_i)^{|f^{-1}(i)|} \conj{\sine(\dw'_i)}^{|f'^{-1}(i)|}\right)
\left(\prod_{j \in n}\sine(\dw_j)^{|f^{*-1}(j)|} \conj{\sine(\dw'_j)}^{|f^{*-1}(j)|}\right)} \cdot
\ex{\left(\prod_{\substack{i' \in n ~|~\\|f^{-1}(i')| > |f'^{-1}(i')|}} \sine(\dw_{i'})^{|f^{-1}(i')| - |f'^{-1}(i')|} \prod_{\substack{j' \in n ~|~\\ |f'^{-1}(j')| > |f^{-1}(j')|}} \conj{\sine(\dw'_{j'})}^{|f'^{-1}(j')| - |f^{-1}(j')|}\right)}\nonumber\\
= &\ex{\left(\prod_{i \in \dist_*} \sine(\dw_i)^{|f^{-1}(i)|} \conj{\sine(\dw'_i)}^{|f'^{-1}(i)|}\right)
\left(\prod_{j \in n}\sine(\dw_j)^{|f^{*-1}(j)|} \conj{\sine(\dw'_j)}^{|f^{*-1}(j)|}\right)} \cdot
\ex{\prod_{\substack{i' \in n ~|~\\|f^{-1}(i')| > |f'^{-1}(i')|}} \sine(\dw_{i'})^{|f^{-1}(i')| - |f'^{-1}(i')|}} \cdot \ex{\prod_{\substack{j' \in n ~|~\\ |f'^{-1}(j')| > |f^{-1}(j')|}} \conj{\sine(\dw'_{j'})}^{|f'^{-1}(j')| - |f^{-1}(j')|}}\nonumber\\
= &\ex{\left(\prod_{i \in \dist_*} \sine(\dw_i)^{|f^{-1}(i)|} \conj{\sine(\dw'_i)}^{|f'^{-1}(i)|}\right)
\left(\prod_{j \in n}\sine(\dw_j)^{|f^{*-1}(j)|} \conj{\sine(\dw'_j)}^{|f^{*-1}(j)|}\right)} \cdot
\prod_{\substack{i' \in n ~|~\\|f^{-1}(i')| > |f'^{-1}(i')|}} \ex{\sine(\dw_{i'})^{|f^{-1}(i')| - |f'^{-1}(i')|}} \cdot \prod_{\substack{j' \in n ~|~\\ |f'^{-1}(j')| > |f^{-1}(j')|}} \ex{\conj{\sine(\dw'_{j'})}^{|f'^{-1}(j')| - |f^{-1}(j')|}}\label{eq:lem-match-pt2-last}\\
& = 0.\nonumber
\end{align}
Looking at \cref{eq:lem-match-pt2-line1}, each $\sine$ function in the first two products has its matching complex conjugate in the product terms. However, the rightmost products of the expectation are all distinct world value inputs, i.e. random $\sine$ values with no matching conjugate counterparts. Since we have distinct, non-matching world value inputs for the rightmost products, we can push the expectation through the products until we arrive at \cref{eq:lem-match-pt2-last}, where finally, by \cref{lem:exp-sine}, each of those inner expectations computes to $0$. This in turn zeroes out the whole product.
We now seek to show that when $f, f'$ are matching, that $\term_1$ will always equal 1. Recall that when $\match{f}{f'}$, that
\begin{enumerate}
\item $\dist = \dist'$, i.e., the output size of both functions is the same,
\item $\forall i \in [\dist],| f^{-1}(i)| = |f'^{-1}(i)|$, i.e. each $\dw_i$ has the same number of variables assigned to it as its $\dw'_i$ counterpart.
\end{enumerate}
This means,
\begin{align}
\term_1 = &\ex{\prod_{i = 1}^{\dist}\sine(\dw_i)^{|f^{-1}(i)|}\conj{\sine(\dw'_i)}^{|f'^{-1}(i)|}}\nonumber\\
= &\ex{\prod_{i = 1}^{\dist}\left(\sine(\dw_i) \cdot \conj{\sine(\dw'_i)}\right)^{|f^{-1}(i)|}}\label{eq:lem-match-pt3-2}\\
= &1\nonumber
\end{align}
We arrive at \cref{eq:lem-match-pt3-2} since $\forall i \in [\dist], |f^{-1}(i)| = |f'^{-1}(i)|$ and we can use the distributive law of exponents
over multiplication. This then implies that each individual $\sine(\dw_i)$ has its own matching conjugate $\conj{\sine(\dw'_i)}$, and by the property of roots of unity in complex numbers, each $\sine(\dw_i)\cdot \conj{\sine(\dw'_i)} = 1$, yielding an overall product of $1$.
\end{proof}
Using the above definitions, we can now present the variance bounds for $\sigsq_j$ based on \eqref{eq:sig-j-distinct}.
By the fact that the expectations cancel when $\forall i, i', j, j'\in [\prodsize], \wElem_i = \wElem_j = \wElem, \wElem_{i'}' = \wElem_{j'}' = \wElem'$, for both $\wElem = \wElem'$ and $\wElem \neq \wElem'$, we can rid ourselves of the case when there exists only one distinct world value. This is precisely why we have not needed to account for the last two expectations in \cref{eq:sig-j-last}. We then need to sum up all the $\dist$ distinct world value possibilities for $\dist \in [2, \prodsize]$. Starting with \cref{eq:sig-j-distinct},
\begin{align}
\sigsq_j = &\sum_{\dist = 2}^{\prodsize}\sum_{\dist' = 2}^{\prodsize}\sum_{f, f'}\sum_{\substack{\dw_{_1}, \ldots,\dw_{_\dist},\\\dw'_{1},\ldots,\dw'_{\dist'}\\ \in W}}\prod_{i = 1}^{\prodsize}\vect_i(\dw_{f(i)})\vect_i(\dw'_{f'(i)})\cdot \term_1\left(\dw_{f(1)},\ldots,\dw_{f(\prodsize)}, \dw'_{f'(1)},\ldots, \dw'_{f'(\prodsize)}\right)\nonumber\\
= &\sum_{\dist = 2}^{\prodsize}\sum_{f, f'}\sum_{\substack{\dw_{_1}, \ldots,\dw_{_\dist},\\\dw'_{1},\ldots,\dw'_{\dist}\\ \in W}}\prod_{i = 1}^{\prodsize}\vect_i(\dw_{f(i)})\vect_i(\dw'_{f'(i)})\cdot \term_1\left(\dw_{f(1)},\ldots,\dw_{f(\prodsize)}, \dw'_{f'(1)},\ldots, \dw'_{f'(\prodsize)}\right)\label{eq:sig-j-bnd-1}\\
= &\sum_{\dist = 2}^{\prodsize}\sum_{\substack{f, f'\\\match{f}{f'}}}\sum_{\substack{\dw_{_1}, \ldots,\dw_{_\dist},\\\dw'_{1},\ldots,\dw'_{\dist}\\ \in W}}\prod_{i = 1}^{\prodsize}\vect_i(\dw_{f(i)})\vect_i(\dw'_{f'(i)})\cdot \prod_{i = 1}^{\dist}\ind{\hfunc(\dw_i) = j}\ind{\hfunc(\dw'_i) = j}\label{eq:sig-j-bnd-2}\\
= &\sum_{\dist = 2}^{\prodsize}\sum_{\substack{f, f'\\\match{f}{f'}}}\sum_{\substack{\dw_{_1}, \ldots,\dw_{_\dist}\\ \in W}}\prod_{i = 1}^{\prodsize}\vect_i(\dw_{f(i)})\vect_i(\dw_{f'(i)})\cdot \prod_{i = 1}^{\dist}\ind{\hfunc(\dw_i) = j}\label{eq:sig-j-bnd-3}\\
= &\sum_{\dist = 2}^{\prodsize}\frac{1}{\sketchCols^{\dist}}\sum_{\substack{f, f'\\\match{f}{f'}}}\sum_{\substack{\dw_{_1}, \ldots,\dw_{_\dist}\\ \in W}}\prod_{i = 1}^{\prodsize}\vect_i(\dw_{f(i)})\vect_i(\dw_{f'(i)})\label{eq:sig-j-bnd-4}
\end{align}
We obtain \cref{eq:sig-j-bnd-1} by the fact that $\dist = \dist'$. Next, we arrive at \cref{eq:sig-j-bnd-2} by \cref{lem:sig-j-survive} as well as bringing out the indicator variables of $\term_1$. \Cref{eq:sig-j-bnd-3} is derived from the fact that $\forall i \in [\dist], \dw_i = \dw'_i$. We arrive at \cref{eq:sig-j-bnd-4}, since with $\dist$ distinct variables, the product of indicator variables results in multiplying the uniform probability $\frac{1}{\sketchCols}$ a total of $\dist$ times.
Using \cref{eq:cvar-bound} and \cref{eq:sig-j-bnd-4}, we state the general bounds for $\sigsq$,
\[\sigsq = \sum_{\dist = 2}^{\prodsize}\frac{1}{\sketchCols^{\dist}}\sum_{\substack{f, f'\\\match{f}{f'}}}\sum_{\substack{\dw_{_1}, \ldots, \dw_{_\dist}\\ \in W}}\prod_{i = 1}^{\prodsize}\vect_i(\dw_{f(i)})\vect_i(\dw_{f'(i)}) -
\frac{1}{B^2}\sum_{\wElem \in W}\prod_{i = 1}^{\prodsize}v_i^2(\wElem).\]
\AH{Next on the agenda, type up the expectation calculations, then start on SOP.}
\AR{Remaining TODOs: (1) Give expression for general $\sigma^2$, i.e. deal with the general $\lambda(j,j')$ term. (2) Show how to use the analysis for general $k$-product to handle generic SoP expressions-- the expectation arguments would just follow from the above and linearity of expectation but the variance bounds might need a bit of extra work.}