paper-BagRelationalPDBsAreHard/sop.tex

285 lines
31 KiB
TeX

%root--main.tex
\section{Analysis of a $\prodsize$-way join}
There are several steps involved in obtaining bounds on the Sum of Products (SOP) query. We start by analyzing a $\prodsize$-way product. Define the $j^{\text{th}}$ bucket of a sketch $\sk$ for a vector $\vect$ as
\[\sk^\vect[j] = \sum_{\substack{\wElem \in \wSet,\\ \hfunc(\wElem) = j}}\vect(\wElem)\sine(\wElem).\]
Define the estimate of the $j^{\text{th}}$ bucket to be
\[\est_j = \prod_{i = 1}^{\prodsize}\sk^{\vect_i}[j].\]
For notational convenience define
\begin{align*}
&\wSet_j = \{\wElem ~|~ \hfunc(\wElem) = j\}\\
&\term_j = \sum_{\wElem \in \wSet_j} \prod_{i = 1}^{\prodsize}\vect_i(\wElem)
\end{align*}
Let us show first that the expectation of the estimate does in fact yield the value we are estimating, $\term_j$.
\AR{You should convert the above statement into a formal lemma. Otherwise it is weird to see a proof without any formal statement of what it is proving.}
\begin{proof}
\begin{align*}
\ex{\est_j} = &\ex{\prod_{i = 1}^{\prodsize}\sk^{\vect_i}[j]} \\
= &\ex{\prod_{i = 1}^{\prodsize} \sum_{\substack{\wElem \in \wSet_j, \\ \hfunc(\wElem) = j}}\vect_i(\wElem)\sine(\wElem)}\\
= &\ex{\sum_{\substack{\wElem_1,\ldots, \wElem_{\prodsize}\\ \in \wSet_j}} \prod_{i = 1}^{\prodsize}\vect_i(\wElem_i)\prod_{i = 1}^{\prodsize}\sine(\wElem_i)}\\
= &\sum_{\substack{\wElem_1,\ldots, \wElem_{\prodsize}\\ \in \wSet_j}} \prod_{i = 1}^{\prodsize}\vect_i(\wElem_i)\ex{\prod_{i = 1}^{\prodsize}\sine(\wElem_i)}
\end{align*}
Fix the variables $\wElem_1,\ldots, \wElem_{\prodsize}$. Define $\dist$ to be the number of distinct worlds in $\wElem_1,\ldots, \wElem_{\prodsize}$ and $e_l$ to be the number of repetitions for the $l^{\text{th}}$ \AR{General typesetting comments. (1) You should always use $\ell$ instead of $l$. (2) Typeset $l_{th}$ as $\ell^{\text{th}}$-- note that ``th'' is in superscript and not in math mode.} distinct world value. For $\term_1^{\est_j} = \ex{\prod_{i = 1}^{\prodsize} \sine(\wElem_i)}$, \AR{Why are you defining the new notation $\term_1^{\est_j}$? You should always be wary of introducing new notation since it makes things hard to read.} we get
\begin{align*}
\term_1^{\est_j} = &\ex{\prod_{i = 1}^{\prodsize}\sine(\wElem_i)}\\
= &\ex{\prod_{l = 1}^{\dist} \sine(\wElem_l)^{e_l}}\\
= & \begin{cases}
0 &1 <\dist \leq \prodsize\\
1 & \dist = 1.
\end{cases}
\end{align*}
\AR{Why is the last equality true? You need to justify it by explicitly showing how you are using Lemma~\ref{lem:exp-sine} to prove it.}
Notice that the above leaves only the terms satisfying $\forall i, i' \in [\prodsize], \wElem_i = \wElem_{i'}$, and hence
\begin{align*}
\ex{\est_j} = &\sum_{\wElem \in \wSet_j}\prod_{i = 1}^{\prodsize} \vect_i(\wElem) \cdot \term_1^{\est_j} = \term_j.
\end{align*}
\end{proof}
The proof for $\est = \sum_j \est_j$ follows by linearity of expectation.\qed\newline
We need to compute the variance of the $\prodsize$-way product $\est$. We wish to prove that
\begin{equation}
\sigsq \leq \sum_j \sigsq_j \label{eq:var-to-prove}.
\end{equation}
Therefore, substituting in the definition of variance for complex numbers,
\begin{align}
\sigsq &= \ex{\sum_j \est_j \cdot \conj{\sum_{j'} \est_{j'}}} - \ex{\sum_j \est_j}\cdot\ex{\conj{\sum_{j'} \est_{j'}}}\nonumber\\
&= \ex{\sum_j \est_j \cdot \sum_{j'} \conj{\est_{j'}}} - \ex{\sum_j \est_j}\cdot\ex{\sum_{j'} \conj{\est_{j'}}}\nonumber\\
&= \sum_{j, j'}\left(\ex{\est_j \cdot \overline{\est_{j'}}} - \ex{\est_j}\ex{\overline{\est_{j'}}} = \cvar{j, j'}\right)\nonumber\\
&= \sum_j\left(\ex{\est_j \cdot \overline{\est_j}} - \ex{\est_j}\ex{\overline{\est_j}}\right) + \sum_{j \neq j'}\cvar{j, j'}\nonumber\\
&= \sum_j \sigsq_j + \sum_{j \neq j'}\cvar{j, j'} \label{eq:sigsq-jneqj}
\end{align}
\AR{The above is a terrible way to define $\lambda(j,j')$. Pretty much any reader will miss the fact that you defined it here. Define $\lambda(j,j')$ ideally outside the align statement especially since this definition will be used later on as well.}
Notice that if the estimates $\est_j$ were independent for all $j \in [\sketchCols]$, we could push the variance through the sum and every $\cvar{j, j'}$ with $j \neq j'$ would vanish. In general, rearranging \cref{eq:sigsq-jneqj} gives
\begin{align*}
&\sigsq - \sum_j \sigsq_j = \sum_{j \neq j'}\cvar{j, j'},\\
&\text{so that}\quad \sigsq \leq \sum_j \sigsq_j \iff \sum_{j \neq j'}\cvar{j, j'} \leq 0.
\end{align*}
Recall that we started this section out by seeking to prove \cref{eq:var-to-prove}. By the equivalence above, this amounts to showing that $\sum_{j \neq j'}\cvar{j, j'} \leq 0$; it is a statement still to be proved, not something we may assume.
\AH{I'm really not so sure about the above results. This was from a conversation we had months ago, but we're basing an implication on something we haven't proved. That doesn't seem right to me.}
\AR{Yeah, the para above does not make sense.}
One can see that \cref{eq:sigsq-jneqj} is composed of two addends. We now bound each of them separately.
\subsection{Bounding $\sum_{j \neq j'}\cvar{j, j'}$}
\begin{align*}
\sum_{j \neq j'}\cvar{j, j'} &= \sum_{j \neq j'} \ex{\est_j \cdot \conj{\est_{j'}}} - \ex{\est_j}\cdot\ex{\conj{\est_{j'}}}\\
&=\ex{\prod_{i = 1}^{\prodsize}\sum_{\wElem \in W}v_i(\wElem)s(\wElem)\ind{h(\wElem) = j}\cdot \prod_{i = 1}^{\prodsize}\sum_{\wElem' \in W}v_i(\wElem')\conj{s(\wElem')}\ind{h(\wElem') = j'}} - \ex{\prod_{i = 1}^{\prodsize}\sum_{\wElem \in W}v_i(\wElem)s(\wElem)\ind{h(\wElem) = j}}\cdot \ex{\prod_{i = 1}^{\prodsize}\sum_{\wElem' \in W}v_i(\wElem')\conj{s(\wElem')}\ind{h(\wElem') = j'}}\\
&=\ex{\sum_{\substack{\wElem_1,\cdots,\wElem_\prodsize,\\\wElem'_1,\cdots,\wElem'_\prodsize\\\in W}}\prod_{i = 1}^{\prodsize}v_i(\wElem_i)s(\wElem_i)v_i(\wElem'_i)\conj{s(\wElem'_i)} \ind{h(\wElem_i) = j} \ind{h(\wElem'_i) = j'}} - \ex{\sum_{\substack{\wElem_1,\cdots, \wElem_\prodsize\\\in W}}\prod_{i = 1}^{\prodsize}v_i(\wElem_i)s(\wElem_i) \ind{h(\wElem_i) = j}}\cdot\ex{\sum_{\substack{\wElem'_1,\cdots, \wElem'_\prodsize\\\in W}}\prod_{i = 1}^{\prodsize}v_i(\wElem'_i)\conj{s(\wElem'_i)} \ind{h(\wElem'_i) = j'}}\\
&=\sum_{\substack{\wElem_1,\cdots,\wElem_\prodsize,\\\wElem'_1,\cdots,\wElem'_\prodsize\\\in W}}\ex{\prod_{i = 1}^{\prodsize}v_i(\wElem_i)s(\wElem_i)v_i(\wElem'_i)\conj{s(\wElem'_i)}\ind{h(\wElem_i) = j}\ind{h(\wElem'_i) = j'}} - \ex{\prod_{i = 1}^{\prodsize} v_i(\wElem_i)s(\wElem_i) \ind{h(\wElem_i) = j}} \cdot \ex{\prod_{i = 1}^{\prodsize}v_i(\wElem'_i)\conj{s(\wElem'_i)}\ind{h(\wElem'_i) = j'}}\\
&= \sum_{\substack{\wElem_1,\cdots,\wElem_\prodsize,\\\wElem'_1,\cdots,\wElem'_\prodsize\\\in W}}\prod_{i = 1}^{\prodsize}v_i(\wElem_i)v_i(\wElem'_i)\ex{\prod_{i = 1}^{\prodsize}s(\wElem_i)\conj{s(\wElem'_i)}\ind{h(\wElem_i) = j}\ind{h(\wElem'_i) = j'}} - \prod_{i = 1}^{\prodsize}v_i(\wElem_i)\ex{\prod_{i = 1}^{\prodsize}s(\wElem_i)\ind{h(\wElem_i) = j}}\cdot \prod_{i = 1}^{\prodsize}v_i(\wElem'_i)\ex{\prod_{i = 1}^{\prodsize}\conj{s(\wElem'_i)}\ind{h(\wElem_i') = j'}}\\
&= \sum_{\substack{\wElem_1,\cdots,\wElem_\prodsize,\\\wElem'_1,\cdots,\wElem'_\prodsize\\\in W}}\prod_{i = 1}^{\prodsize}v_i(\wElem_i)v_i(\wElem'_i)\left(\ex{\prod_{i = 1}^{\prodsize}s(\wElem_i)\conj{s(\wElem'_i)}\ind{h(\wElem_i) = j}\ind{h(\wElem'_i) = j'}} - \ex{\prod_{i = 1}^{\prodsize}s(\wElem_i)\ind{h(\wElem_i) = j}}\cdot\ex{\prod_{i = 1}^{\prodsize}\conj{s(\wElem'_i)}\ind{h(\wElem_i') = j'}} \right).
\end{align*}
\AH{Perhaps a formal proof is necessary below.}
For $\term_1^{\cvar{j, j'}} = \ex{\prod_{i = 1}^{\prodsize}s(\wElem_i)\conj{s(\wElem'_i)}\ind{h(\wElem_i) = j}\ind{h(\wElem'_i) = j'}}$, because the hash function $h$ cannot bucket the same world to two different buckets, the only instance $\term_1^{\cvar{j, j'}} = 1$ occurs when there is no overlap between the $\wElem_i$ and $\wElem'_i$ variables. Given the condition of no overlap, $\term_1^{\cvar{j, j'}} = 1$ only with the further condition that $\forall i \in [\prodsize], \wElem_i = \wElem, \wElem'_i = \wElem', \wElem \neq \wElem'$. Notice, however, that given these conditions, the product of the remaining expectations will cancel this out. Looking at the remaining two expectations $\term_2^{\cvar{j, j'}} = \ex{\prod_{i = 1}^{\prodsize}\sine(\wElem_i) \ind{\hfunc(\wElem_i) = j}} \cdot \ex{\prod_{i = 1}^{\prodsize}\conj{\sine(\wElem'_i)} \ind{\hfunc(\wElem'_i) = j'}}$, we see that $\term_2^{\cvar{j, j'}} = 1$ only when $\forall i \in [\prodsize], \wElem_i = \wElem, \wElem'_i = \wElem'$. Taken together, the constraints leave us with only one possible case for $\term_1^{\cvar{j, j'}} - \term_2^{\cvar{j, j'}} \neq 0$, when all variables are the same world. Thus,
\begin{align}
&\sum_{j \neq j'}\cvar{j, j'} = - \frac{1}{B^2}\sum_{\wElem \in W}\prod_{i = 1}^{\prodsize}v_i^2(\wElem)\label{eq:cvar-bound}.
\end{align}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
We now move on to bound the variance of a $\prodsize$-way join.
\begin{align}
&\sigsq_j = \ex{\est_j \cdot \overline{\est_j}} - \ex{\est_j} \cdot \ex{\overline{\est_j}} \nonumber\\
&= \ex{\prod_{i = 1}^{\prodsize}\sum_{w \in W_j}v_i(w)s(w) \cdot \prod_{i = 1}^\prodsize\sum_{w' \in W_j}v_i(w')\overline{s(w')}} -
\ex{\prod_{i = 1}^{\prodsize}\sum_{w \in W_j}v_i(w)s(w)}\cdot \ex{\prod_{i = 1}^\prodsize\sum_{w' \in W_j}v_i(w')\overline{s(w')}}\nonumber\\
&= \ex{\sum_{\substack{w_1...w_\prodsize\\w'_1...w'_\prodsize\\ \in W}}\prod_{i = 1}^\prodsize v_i(w_i)v_i(w'_i)s(w_i)\overline{s(w'_i)}\ind{h(w_i) = j}\ind{h(w'_i) = j}} -
\ex{\sum_{w_1...w_\prodsize \in W} \prod_{i = 1}^\prodsize v_i(w_i)s(w_i)\ind{h(w_i) = j}} \cdot
\ex{\sum_{w'_1...w'_\prodsize \in W} \prod_{i = 1}^\prodsize v_i(w'_i)\overline{s(w'_i)}\ind{h(w'_i) = j}}\nonumber\\
&= \sum_{\substack{w_1...w_\prodsize\\w'_1...w'_\prodsize\\ \in W}}\ex{\prod_{i = 1}^\prodsize v_i(w_i)v_i(w'_i)s(w_i)\overline{s(w'_i)}\ind{h(w_i) = j}\ind{h(w'_i) = j}} -
\ex{\prod_{i = 1}^\prodsize v_i(w_i)s(w_i)\ind{h(w_i) = j}} \cdot \ex{\prod_{i = 1}^\prodsize v_i(w'_i)\overline{s(w'_i)}\ind{h(w'_i) = j}}\nonumber\\
&= \sum_{\substack{w_1...w_\prodsize\\w'_1...w'_\prodsize\\ \in W}}\prod_{i = 1}^\prodsize v_i(w_i)v_i(w'_i)\cdot\left( \ex{\prod_{i = 1}^\prodsize s(w_i)\overline{s(w'_i)}\ind{h(w_i) = j}\ind{h(w'_i) = j}} -
\ex{\prod_{i = 1}^\prodsize s(w_i)\ind{h(w_i) = j}}\cdot \ex{\prod_{i = 1}^\prodsize\overline{s(w'_i)}\ind{h(w'_i) = j}} \right)\label{eq:sig-j-last}.
\end{align}
Before proceeding, we introduce some notation and terminology that will aid in communicating the bounds we are about to establish. We refer to the leftmost expectation of \cref{eq:sig-j-last} in the following way:
\[\term_1\left(\wElem_1,\ldots,\wElem_\prodsize, \wElem_1',\ldots, \wElem_\prodsize'\right) = \ex{\prod_{i = 1}^\prodsize s(w_i)\overline{s(w'_i)}\ind{h(w_i) = j}\ind{h(w'_i) = j}}.%\text{, and}
\]
%\[\term_2\left(\wElem_1,\ldots,\wElem_\prodsize, \wElem_1',\ldots, \wElem_\prodsize'\right) = \ex{\prod_{i = 1}^ks(w_i)\ind{h(w_i) = j}}\cdot \ex{\prod_{i = 1}^\prodsize\overline{s(w'_i)}\ind{h(w'_i) = j}}. \]
We will use the word ``term'' to denote $\prod_{i = 1}^{\prodsize}\vect_i(\wElem_i)\vect_i(\wElem_i') \cdot\term_1\left(\wElem_1,\ldots,\wElem_\prodsize, \wElem_1',\ldots,\wElem_\prodsize'\right)$ given a specific assignment of world values. %To say that a term survives \AR{You should not care about whether the $T_1$ term survives or not. See the above comment on why.} the expectation is to mean that $\term_1 - \term_2 \neq 0$. Note, that the only terms that survive the expectation above are mappings of $w_i = w'_j = w$ for $i, j \in [\prodsize]$, such that each $w_i$ has a match, i.e., no $w_i$ or $w'_j$ stands alone without a matching world in its complimentary set. In other words, the set of values in $\wElem_1,\ldots,\wElem_k$ has a bijective mapping to the set of values in $\wElem'_1,\ldots,\wElem'_k$.
We next describe the nonzero terms of \cref{eq:sig-j-last}.
%\subsection{M-tuples}
%\begin{Definition}
%Given a $\prodsize$-way join, define $\dist \in [\prodsize]$. An \dist-tuple then is a set of tuples, each tuple conatining $\dist$ elements, such that the values of each tuple sum up to $\dist$, i.e. $\forall i \in [\dist], \sum_j \dist_{t_{i, j}} = \dist$, where i is the $i^{th}$ tuple in $\dist_t$, and $j$ is the $j^{th}$ index of that tuple $t$. The set consists of each unique sum up to symmetry, meaning a tuple with the same elements only reversed is disallowed.
%\end{Definition}
%For example, when $\prodsize = 4$, $\dist = 2$, the \dist-tuple, denoted, $\dist_2$, would be$\left\{\left(1, 3\right), \left(2, 2\right)\right\}$. Here, $\dist_{2_{1, 1}} = 1$, and while the tuple $\left(3, 1\right)$ sums up to $\prodsize = 4$, we do not include it since we have it's symmetrical term $\left(1, 3\right)$.
%
%\AR{Why is the definition of M-tuples needed? From what I understand you need this to define what kinds of $f$ and $f'$ are allowed but in that case why not state those properties directly in terms of $f$ and $f'$? Actually after reading the next section, I do not see why these properties are needed at all..}
%\AH{I use the \dist-tuples to explain 1) what kind of matchings survive and 2) that $f, f'$ must only cross product from within the matchings of the same tuple. Maybe there is an easier way to do this.}
\subsection{$f, f'$}
\begin{Definition}
Define and then fix a total ordering of the $\dist$ distinct world elements to follow the total order of the natural numbers in $[\dist]$, such that $\forall i, j \in [\dist], i < j \implies \dw_i \prec \dw_j$, i.e., $\dw_1 \prec\ldots\prec\dw_\dist$.
%Given a fixed order $\wSet_{\order}: \left(\wSet, \wSet\right)\mapsto \mathbb{B}$ of possible worlds, define the lexographical order of distinct worlds $\wSet_\dist$ to be the ordering which complies to the identity mapping of elements in $[\prodsize]$ to elements in $[\dist]$ up to $\dist$, such that . In other worlds, $\forall \wElem, \wElem' \in \wSet_\dist, \dw < \wElem' \leftrightarrow \wSet_{\order}\left(\wElem, \wElem'\right) = T$.
\end{Definition}
To help describe all possible world value matchings we introduce functions $f$ and $f'$.
\begin{Definition}
Functions $f, f'$ range over the surjective mappings from $\prodsize$ elements to $\dist$ (respectively $\dist'$) elements: $f\colon [\prodsize] \rightarrow [\dist]$, $f'\colon [\prodsize] \rightarrow [\dist']$.
\end{Definition}
%\begin{equation*}
%f(i) = \begin{cases}
% \dMap{w_1} &f(i) = 1\\
% \dMap{w_2} &f(i) = 2\\
% \vdots &\vdots\\
% \dMap{w_\dist} &f(i) = \dist.
% \end{cases}
%\end{equation*}
The functions $f, f'$ are used to produce the mappings $w_i \mapsto \dMap{w_{f(i)}}$. In particular, $f$ and $f'$ are machinery for mapping $\prodsize$ $\wElem$-world variables to $\dist$ distinct values.
We rewrite equation \eqref{eq:sig-j-last} in terms of $\dist$ distinct worlds, with $f, f'$ mappings.
\begin{equation}
\sum_{\dist = 2}^{\prodsize}\sum_{\dist' = 2}^{\prodsize}\sum_{f, f'}\sum_{\substack{\dw_1, \ldots,\dw_\dist,\\ \dw'_{1},\ldots,\dw'_{\dist'}\\ \in W}}\prod_{i = 1}^{\prodsize}\vect_i(\dw_{f(i)})\vect_i(\dw'_{f'(i)})\cdot \term_1\left(\dw_{f(1)},\ldots,\dw_{f(\prodsize)}, \dw'_{f'(1)},\ldots, \dw'_{f'(\prodsize)}\right)
\label{eq:sig-j-distinct}
\end{equation}
Observe that the Cartesian product of world values assigned to $\wElem_1,\ldots,\wElem_\prodsize$ throughout the summation can be rearranged into groups of variables with distinct values, for each number of distinct elements $\dist$ in the set $[\prodsize]$. For each $\dist \in [\prodsize]$, all possible combinations of $\dist$ world values can be equivalently modeled by taking the set of surjective functions $f\colon[\prodsize]\to [\dist]$ and computing all world value combinations based on the total ordering of $\dw_{1}\prec\cdots\prec\dw_{\dist}$. For any $\dist$, all surjective mappings $f$ constitute all unique mappings with their symmetrical counterparts. Combining that with the total order over $\dw_{1},\ldots,\dw_{\dist}$ yields exactly the world value combinations containing $\dist$ distinct values which appear in the Cartesian product of the sum, without double counting. What this all boils down to is a rearrangement of addends in the sum.
\begin{Definition}
Functions $f:[\prodsize]\mapsto [\dist], f':[\prodsize]\mapsto [\dist']$ are said to be matching, denoted $\match{f}{f'}$, if and only if
\begin{enumerate}
\item $\dist = \dist'$
\item $\forall i \in [\dist], |f^{-1}(i)| = |f'^{-1}(i)|$, i.e., the cardinality of variables mapped to $\dw_i$ equals the cardinality of variables mapped to $\dw_i'$, for all $i \in [\dist]$.
\end{enumerate}
\end{Definition}
\begin{Lemma}\label{lem:sig-j-survive}
When $f, f'$ are matching, where $\forall j \in[\dist], \dw_j = \dw'_j$, \cref{eq:sig-j-distinct} is exactly
\[
\sum_{\substack{\dw_1, \ldots,\dw_\dist,\\\dw'_{1},\ldots,\dw'_{\dist'}\\ \in W}}\prod_{i = 1}^{\prodsize}\vect_i(\dw_{f(i)})\vect_i(\dw'_{f'(i)})
\]
and $0$ otherwise.
\end{Lemma}
In proving \cref{lem:sig-j-survive}, we introduce another fact.
\begin{Lemma}\label{lem:exp-prod-rand-roots}
Given a $\prodsize^{\text{th}}$ root of unity $\rou$, the expectation of the product $(\rou^i)^l \cdot (\rou^j)^{l'}$ for uniformly random $i, j$, where $i, j, l, l' \in [\prodsize]$, is zero.
\end{Lemma}
\AH{I don't think I need this lemma anymore, but I'll give it a go anyways.}
\AH{A quick heads up, I realized as I was doing this that I had stated things incorrectly in the definition of the lemma. This, being corrected, allows only for one case.}
\AH{Also, since I don't use this in the proof of \cref{lem:sig-j-survive}, it probably makes no sense to have this sitting right in the middle of things. I'm only leaving it since you had said you wanted to see a proof for both cases: $\wElem = \wElem', \wElem \neq \wElem'$.}
\begin{proof}
The proof only needs the case when $\wElem \neq \wElem'$, since $i, j$ are both uniformly random.
\begin{align*}
&\ex{\sine(\wElem)^i \conj{\sine(\wElem')}^j}\\
= &\ex{\sine(\wElem)^i}\ex{\conj{\sine(\wElem')}^j}\\
= &0
\end{align*}
\end{proof}
In the above, since we have more than pairwise independence for $\wElem \neq \wElem'$, we can push the expectation into the product. Then by \cref{lem:exp-sine} we get 0 for both expectations.\newline
\begin{proof}
\AR{First some typos/things that are incorrect below-- note this is \textbf{not} an exhaustive list. (1) In the proof below the $w_i$ and $w'_i$ should be $\tilde{w}_i$ and $\tilde{w'}_i$ respectively. (2) The expression for $T_1$ below is incorrect since it seems to assume that all the pre-image sizes are $1$-- the expression for $T_2$ is fine except the $j_i$ terms are not defined. However, ``taking out" one term for $\tilde{w'}_{m'}$ for $T_2$ is incorrect since e.g. we could have the pre-image of $m'$ have size $>1$. (3) The proof below never explicitly argues why the condition $\dw_{_j} = \dw_{'_j}$ is needed.}
\AR{Here is how I recommend that you re-write the proof. First as mentioned earlier, you should only consider the $T_1$ terms (as you account for the $T_2$ terms later on. Second you should first start off by re-stating the $T_1$ term like so. Consider the ``generic term"--
\[T_1(\tilde{w}_{f(1)},\dots, \tilde{w}_{f(m)}, \tilde{w'}_{f'(1)},\dots, \tilde{w'}_{f'(m')}).\]
Then re-write the what the above term is based on the exact definition (BTW I'm dropping the $\mathbf{E}$ terms for convenience but they should be all there below.) In particular, the above term by definition is exactly
\[\prod_{i=1}^k s(\tilde{w}_{f(i)})\cdot \overline{s(\tilde{w'}_{f'(i)})}.\]
Now re-write the above in terms of ``powers" of distinct worlds:
\[ (\prod_{i=1}^m s(\tilde{w}_{i})^{|f^{-1}(i)|})\cdot \overline{(\prod_{j=1}^m s(\tilde{w'}_j)^{|f^{-1}(j)|})}\]
Now once you have the above expression, then it will be much easier to argue why if any of the matching conditions are not satisfied then the expression is $0$. I also believe that working with the above expression will also make it more ``obvious" as to why the different conditions are required. Currently the arguments below do not explicitly bring this out...
}
Consider the ``generic term''---
\[T_1(\tilde{w}_{f(1)},\dots, \tilde{w}_{f(\prodsize)}, \tilde{w'}_{f'(1)},\dots, \tilde{w'}_{f'(\prodsize)}).\]
Let's rewrite the term based on its exact definition:
\begin{align*}
= &\ex{\prod_{i = 1}^{\prodsize}\sine(\dw_{f(i)})\cdot\conj{\sine(\dw'_{f'(i)})}}\\
= &\ex{\left(\prod_{i = 1}^{\dist}\sine(\dw_{i})^{|f^{-1}(i)|}\right) \cdot \left(\prod_{j = 1}^{\dist'}\conj{\sine(\dw'_{j})}^{|f'^{-1}(j)|}\right)}
\end{align*}
Notice that each $i \in [\prodsize]$ has its own mapping to an element in $[\dist]$. We can thus rearrange all the elements of the product such that the preimage of function $f(i)$, i.e., $f^{-1}(i)$ yields the number of terms that will be mapped to a distinct variable $\dw_i$.
Further see how the requirement that $\dw_i = \dw'_i$ gives us the precise combinations we are looking for, where each random $\sine$ output value has its own matching complex conjugate.
To prove that \cref{lem:sig-j-survive} is true, consider what the expectation looks like when $f, f'$ are not matching. The first condition for $f, f'$ to be matching is violated when $\dist \neq \dist'$.
\AH{The following observation isn't necessary to complete the proof. I'll just leave it for now in case it may be valuable down the road.}
Observe that $\forall \dist \in [\prodsize], \sum_{i = 1}^{\dist}|f^{-1}(i)| = \prodsize$ and that this fact implies for $\dist, \dist' \in [\prodsize] ~|~\dist \neq \dist', \exists i \in [\dist] ~|~\forall j \in [\dist], |f^{-1}(i)| \neq |f'^{-1}(j)|$, meaning that if we have that $\dist \neq \dist'$, then the second matching condition is also violated.
\AH{Moving on...} Note that $\dw_1,\ldots,\dw_\dist, \dw_1',\ldots,\dw_{\dist'}'$ are distinct world values such that $\forall i \neq j \in [\dist], \dw_i = \dw_i' \neq \dw_j = \dw_j'$. To make things easier, assume that $\dist < \dist'$. The opposite case of $\dist > \dist'$ has a symmetrical proof. Fixing variables $\dw_1,\ldots,\dw_\dist, \dw_1',\ldots,\dw_{\dist'}'$, we have at least one $\dw_i$ without a conjugate, and at least one extra distinct value, $\dw_{\dist'}'$, for which no $\dw_{\dist'}$ exists. This (these) distinct term(s) cancel(s) out all the other values in the expectations.
\begin{align}
&\ex{\left(\prod_{i = 1}^{\dist}\sine(\dw_{i})^{|f^{-1}(i)|}\right) \cdot \left(\prod_{j = 1}^{\dist'}\conj{\sine(\dw'_{j})}^{|f'^{-1}(j)|}\right)}\nonumber\\
= &\ex{\left(\prod_{i = 1}^{\dist}\sine(\dw_{i})^{|f^{-1}(i)|}\right) \cdot \left(\prod_{j = 1}^{\dist}\conj{\sine(\dw'_{j})}^{|f'^{-1}(j)|}\right)\left(\prod_{l = \dist + 1}^{\dist'}\conj{\sine(\dw'_l)}^{|f'^{-1}(l)|}\right)}\nonumber\\
= &\ex{\left(\prod_{i = 1}^{\dist}\sine(\dw_{i})^{|f^{-1}(i)|}\right) \cdot \left(\prod_{j = 1}^{\dist}\conj{\sine(\dw'_{j})}^{|f'^{-1}(j)|}\right)} \cdot \prod_{l = \dist + 1}^{\dist'} \ex{\conj{\sine(\dw'_l)}^{|f'^{-1}(l)|}} = 0.\label{eq:lem-fmatch-pt1}
\end{align}
In \cref{eq:lem-fmatch-pt1} the expectation can be pushed through the last product group since we know that all the operands are distinct from any others appearing in the overall product. Then, by \cref{lem:exp-sine} we get $0$ for that rightmost term, and this cancels out the rest of the terms in the overall product.
\emph{Here at most we assume $2\prodsize$-wise independence, but we really would like less}.
\AH{Can we bring it down to k-wise, since we have $<$ k terms which we are pushing the expectation through?}
To complete the proof, we now approach the case where $\dist = \dist'$, but there is a $\dw_i, \dw_i'$ with an unequal number of mappings.
\begin{align*}
&\exists i \in [\dist], |f^{-1}(i)| \neq |f'^{-1}(i)|\\
\implies &\exists j \in [\dist] ~|~i \neq j, |f^{-1}(j)| \neq |f'^{-1}(j)|\\
\implies &\exists i, j \in [\dist], i \neq j ~|~ |\dw_i| \neq |\dw'_i|, |\dw_j| \neq |\dw'_j|.
%\implies &\exists \wElem_i \in \wSet ~|~ \nexists \wElem_i' \in \wSet ~|~ \wElem_i = \wElem_i'
\end{align*}
The above means that we will have at least two world values that don't match. Put another way, after the optimal number of matching world value pairs have been assigned, there will be at least one world value whose matching conjugate product is not the conjugate of the sine of the same world value, i.e. for $i \neq j$, there will exist at least one product of $\sine(\dw_i) \conj{\sine(\dw_{j}')}$.
Such cross terms exist since
\[\left(\sum_{\substack{i \in [\dist],\\|f^{-1}(i)| \neq |f'^{-1}(i)|}}|f^{-1}(i)|\right) = \left(\sum_{\substack{i' \in [\dist],\\|f^{-1}(i')| \neq |f'^{-1}(i')|}}|f'^{-1}(i')|\right)\]
Let $n = \{i ~|~ |f^{-1}(i)| \neq |f'^{-1}(i)|\}$. Further, let $\dist_* = [\dist] \setminus n$ and $|f^{*-1}(i)| = \min\left(|f^{-1}(i)|, |f'^{-1}(i)|\right)$. Then,
\begin{align}
\term_1 = &\ex{\left(\prod_{i \in \dist_*} \sine(\dw_i)^{|f^{-1}(i)|} \conj{\sine(\dw'_i)}^{|f'^{-1}(i)|}\right)
\left(\prod_{j \in n}\sine(\dw_j)^{|f^{*-1}(j)|} \conj{\sine(\dw'_j)}^{|f^{*-1}(j)|}\right)
\left(\prod_{\substack{i' \in n ~|~\\|f^{-1}(i')| > |f'^{-1}(i')|}} \sine(\dw_{i'})^{|f^{-1}(i')| - |f'^{-1}(i')|} \prod_{\substack{j' \in n ~|~\\ |f'^{-1}(j')| > |f^{-1}(j')|}} \conj{\sine(\dw'_{j'})}^{|f'^{-1}(j')| - |f^{-1}(j')|}\right)} \label{eq:lem-match-pt2-line1}\\
= &\ex{\left(\prod_{i \in \dist_*} \sine(\dw_i)^{|f^{-1}(i)|} \conj{\sine(\dw'_i)}^{|f'^{-1}(i)|}\right)
\left(\prod_{j \in n}\sine(\dw_j)^{|f^{*-1}(j)|} \conj{\sine(\dw'_j)}^{|f^{*-1}(j)|}\right)} \cdot
\ex{\left(\prod_{\substack{i' \in n ~|~\\|f^{-1}(i')| > |f'^{-1}(i')|}} \sine(\dw_{i'})^{|f^{-1}(i')| - |f'^{-1}(i')|} \prod_{\substack{j' \in n ~|~\\ |f'^{-1}(j')| > |f^{-1}(j')|}} \conj{\sine(\dw'_{j'})}^{|f'^{-1}(j')| - |f^{-1}(j')|}\right)}\nonumber\\
= &\ex{\left(\prod_{i \in \dist_*} \sine(\dw_i)^{|f^{-1}(i)|} \conj{\sine(\dw'_i)}^{|f'^{-1}(i)|}\right)
\left(\prod_{j \in n}\sine(\dw_j)^{|f^{*-1}(j)|} \conj{\sine(\dw'_j)}^{|f^{*-1}(j)|}\right)} \cdot
\ex{\prod_{\substack{i' \in n ~|~\\|f^{-1}(i')| > |f'^{-1}(i')|}} \sine(\dw_{i'})^{|f^{-1}(i')| - |f'^{-1}(i')|}} \cdot \ex{\prod_{\substack{j' \in n ~|~\\ |f'^{-1}(j')| > |f^{-1}(j')|}} \conj{\sine(\dw'_{j'})}^{|f'^{-1}(j')| - |f^{-1}(j')|}}\nonumber\\
= &\ex{\left(\prod_{i \in \dist_*} \sine(\dw_i)^{|f^{-1}(i)|} \conj{\sine(\dw'_i)}^{|f'^{-1}(i)|}\right)
\left(\prod_{j \in n}\sine(\dw_j)^{|f^{*-1}(j)|} \conj{\sine(\dw'_j)}^{|f^{*-1}(j)|}\right)} \cdot
\prod_{\substack{i' \in n ~|~\\|f^{-1}(i')| > |f'^{-1}(i')|}} \ex{\sine(\dw_{i'})^{|f^{-1}(i')| - |f'^{-1}(i')|}} \cdot \prod_{\substack{j' \in n ~|~\\ |f'^{-1}(j')| > |f^{-1}(j')|}} \ex{\conj{\sine(\dw'_{j'})}^{|f'^{-1}(j')| - |f^{-1}(j')|}}\label{eq:lem-match-pt2-last}\\
& = 0.\nonumber
\end{align}
Looking at \cref{eq:lem-match-pt2-line1}, each $\sine$ function in the first two products has its matching complex conjugate in the product terms. However, the rightmost products of the expectation are all distinct world value inputs, i.e. random $\sine$ values with no matching conjugate counterparts. Since we have distinct, non-matching world value inputs for the rightmost products, we can push the expectation through the products until we arrive at \cref{eq:lem-match-pt2-last}, where finally, by \cref{lem:exp-sine}, each of those inner expectations computes to $0$. This in turn zeroes out the whole product.
We now seek to show that when $f, f'$ are matching, $\term_1$ will always equal 1. Recall that when $\match{f}{f'}$, we have that
\begin{enumerate}
\item $\dist = \dist'$, i.e., the output size of both functions is the same,
\item $\forall i \in [\dist],| f^{-1}(i)| = |f'^{-1}(i)|$, i.e. each $\dw_i$ has the same number of variables assigned to it as its $\dw'_i$ counterpart.
\end{enumerate}
This means,
\begin{align}
\term_1 = &\ex{\prod_{i = 1}^{\dist}\sine(\dw_i)^{|f^{-1}(i)|}\conj{\sine(\dw'_i)}^{|f'^{-1}(i)|}}\nonumber\\
= &\ex{\prod_{i = 1}^{\dist}\left(\sine(\dw_i) \cdot \conj{\sine(\dw'_i)}\right)^{|f^{-1}(i)|}}\label{eq:lem-match-pt3-2}\\
= &1\nonumber
\end{align}
We arrive at \cref{eq:lem-match-pt3-2} since $\forall i \in [\dist], |f^{-1}(i)| = |f'^{-1}(i)|$ and we can use the distributive law of exponents
over multiplication. This then implies that each individual $\sine(\dw_i)$ has its own matching conjugate $\conj{\sine(\dw'_i)}$, and since $\dw_i = \dw'_i$, by the property of roots of unity in complex numbers, each $\sine(\dw_i)\cdot \conj{\sine(\dw'_i)} = 1$, yielding an overall product of $1$.
\end{proof}
Using the above definitions, we can now present the variance bounds for $\sigsq_j$ based on \eqref{eq:sig-j-distinct}.
By the fact that the expectations cancel when $\forall i, i', j, j'\in [\prodsize], \wElem_i = \wElem_j = \wElem, \wElem_{i'}' = \wElem_{j'}' = \wElem'$, for both $\wElem = \wElem'$ and $\wElem \neq \wElem'$, we can rid ourselves of the case when there exists only one distinct world value. This is precisely why we have not needed to account for the last two expectations in \cref{eq:sig-j-last}. We then need to sum up all the $\dist$ distinct world value possibilities for $\dist \in [2, \prodsize]$. Starting with \cref{eq:sig-j-distinct},
\begin{align}
\sigsq_j = &\sum_{\dist = 2}^{\prodsize}\sum_{\dist' = 2}^{\prodsize}\sum_{f, f'}\sum_{\substack{\dw_{_1}, \ldots,\dw_{_\dist},\\\dw'_{1},\ldots,\dw'_{\dist'}\\ \in W}}\prod_{i = 1}^{\prodsize}\vect_i(\dw_{f(i)})\vect_i(\dw'_{f'(i)})\cdot \term_1\left(\dw_{f(1)},\ldots,\dw_{f(\prodsize)}, \dw'_{f'(1)},\ldots, \dw'_{f'(\prodsize)}\right)\nonumber\\
= &\sum_{\dist = 2}^{\prodsize}\sum_{f, f'}\sum_{\substack{\dw_{_1}, \ldots,\dw_{_\dist},\\\dw'_{1},\ldots,\dw'_{\dist}\\ \in W}}\prod_{i = 1}^{\prodsize}\vect_i(\dw_{f(i)})\vect_i(\dw'_{f'(i)})\cdot \term_1\left(\dw_{f(1)},\ldots,\dw_{f(\prodsize)}, \dw'_{f'(1)},\ldots, \dw'_{f'(\prodsize)}\right)\label{eq:sig-j-bnd-1}\\
= &\sum_{\dist = 2}^{\prodsize}\sum_{\substack{f, f'\\\match{f}{f'}}}\sum_{\substack{\dw_{_1}, \ldots,\dw_{_\dist},\\\dw'_{1},\ldots,\dw'_{\dist}\\ \in W}}\prod_{i = 1}^{\prodsize}\vect_i(\dw_{f(i)})\vect_i(\dw'_{f'(i)})\cdot \prod_{i = 1}^{\dist}\ind{\hfunc(\dw_i) = j}\ind{\hfunc(\dw'_i) = j}\label{eq:sig-j-bnd-2}\\
= &\sum_{\dist = 2}^{\prodsize}\sum_{\substack{f, f'\\\match{f}{f'}}}\sum_{\substack{\dw_{_1}, \ldots,\dw_{_\dist}\\ \in W}}\prod_{i = 1}^{\prodsize}\vect_i(\dw_{f(i)})\vect_i(\dw_{f'(i)})\cdot \prod_{i = 1}^{\dist}\ind{\hfunc(\dw_i) = j}\label{eq:sig-j-bnd-3}\\
= &\sum_{\dist = 2}^{\prodsize}\frac{1}{\sketchCols^{\dist}}\sum_{\substack{f, f'\\\match{f}{f'}}}\sum_{\substack{\dw_{_1}, \ldots,\dw_{_\dist}\\ \in W}}\prod_{i = 1}^{\prodsize}\vect_i(\dw_{f(i)})\vect_i(\dw_{f'(i)})\label{eq:sig-j-bnd-4}
\end{align}
We obtain \cref{eq:sig-j-bnd-1} by the fact that $\dist = \dist'$. Next, we arrive at \cref{eq:sig-j-bnd-2} by \cref{lem:sig-j-survive} as well as bringing out the indicator variables of $\term_1$. \Cref{eq:sig-j-bnd-3} is derived from the fact that $\forall i \in [\dist], \dw_i = \dw'_i$. We arrive at \cref{eq:sig-j-bnd-4}, since with $\dist$ distinct variables, the expectation of the product of indicator variables results in multiplying the uniform probability $\frac{1}{\sketchCols}$ by itself $\dist$ times.
Using \cref{eq:cvar-bound} and \cref{eq:sig-j-bnd-4}, we state the general bounds for $\sigsq$,
\[\sigsq = \sum_{\dist = 2}^{\prodsize}\frac{1}{\sketchCols^{\dist}}\sum_{\substack{f, f'\\\match{f}{f'}}}\sum_{\substack{\dw_{_1}, \ldots, \dw_{_\dist}\\ \in W}}\prod_{i = 1}^{\prodsize}\vect_i(\dw_{f(i)})\vect_i(\dw_{f'(i)}) -
\frac{1}{B^2}\sum_{\wElem \in W}\prod_{i = 1}^{\prodsize}v_i^2(\wElem).\]
\AH{Next on the agenda, type up the expectation calculations, then start on SOP.}
\AR{Remaining TODOs: (1) Give expression for general $\sigma^2$, i.e. deal with the general $\lambda(j,j')$ term. (2) Show how to use the analysis for general $k$-product to handle generic SoP expressions-- the expectation arguments would just follow from the above and linearity of expectation but the variance bounds might need a bit of extra work.}