% paper-BagRelationalPDBsAreHard/sop.tex
%
% 464 lines
% 50 KiB
% TeX

%root--main.tex
\section{Analysis of a $\prodsize$-way join}
There are several steps involved in obtaining bounds on the Sum of Products (SOP) query. We start by analyzing a $\prodsize$-way product. Define the $j^{\text{th}}$ bucket of a sketch $\sk$ for a vector $\vect$ as
\[\sk^\vect[j] = \sum_{\substack{\wElem \in \wSet,\\ \hfunc(\wElem) = j}}\vect(\wElem)\sine(\wElem).\]
Define the estimate of the $j^{\text{th}}$ bucket to be
\[\est_j = \prod_{i = 1}^{\prodsize}\sk^{\vect_i}[j].\]
For notational convenience define
\begin{align*}
&\wSet_j = \{\wElem ~|~ \hfunc(\wElem) = j\}\\
&\term_j = \sum_{\wElem \in \wSet_j} \prod_{i = 1}^{\prodsize}\vect_i(\wElem)
\end{align*}
Let us show first that the expectation of the estimate does in fact yield the value we are estimating, $\term_j$.
\begin{Lemma}
The expectation of an estimate $\est_j$ is the sum of its pointwise vector products.
\end{Lemma}
\begin{proof}
\begin{align*}
\ex{\est_j} = &\ex{\prod_{i = 1}^{\prodsize}\sk^{\vect_i}[j]} \\
= &\ex{\prod_{i = 1}^{\prodsize} \sum_{\wElem \in \wSet_j}\vect_i(\wElem)\sine(\wElem)}\\
= &\ex{\sum_{\substack{\wElem_1,\ldots, \wElem_{\prodsize}\\ \in \wSet_j}} \prod_{i = 1}^{\prodsize}\vect_i(\wElem_i)\prod_{i = 1}^{\prodsize}\sine(\wElem_i)}\\
= &\sum_{\substack{\wElem_1,\ldots, \wElem_{\prodsize}\\ \in \wSet_j}} \prod_{i = 1}^{\prodsize}\vect_i(\wElem_i)\ex{\prod_{i = 1}^{\prodsize}\sine(\wElem_i)}
\end{align*}
Fix the variables $\wElem_1,\ldots, \wElem_{\prodsize}$. Define $\dist$ to be the number of distinct worlds in $\wElem_1,\ldots, \wElem_{\prodsize}$ and $e_{\ell}$ to be the number of repetitions for the $\ell^{\text{th}}$ distinct world value. Focusing on the expectation factor, $\ex{\prod_{i = 1}^{\prodsize} \sine(\wElem_i)}$, we get
\begin{align*}
&\ex{\prod_{i = 1}^{\prodsize}\sine(\wElem_i)}\\
= &\ex{\prod_{\ell = 1}^{\dist} \sine(\wElem_{\ell})^{e_{\ell}}}\\
= & \begin{cases}
0 &1 < \dist \leq \prodsize\\
1 & \dist = 1.
\end{cases}
\end{align*}
\AH{Oliver had suggested that we change the proof to lemma 1...I think he wanted to allow for values of $i$ to be within the set of integers, not just restricted to $0 < i \leq k$.
Even if we don't change the lemma, I think the proof itself is inaccurate and needs to be rewritten.}
We obtain the final equality by \cref{lem:exp-sine}, which states that the only way in expectation that $\sine(\wElem_{\ell})^{e_{\ell}}$ can be something other than $0$ is when $e_{\ell} = \prodsize$. Since $\sum_{\ell = 1}^{\dist} e_{\ell} = \prodsize$, this can only happen when $\dist = 1$.
Notice, that the above leaves us with the only remaining condition that $\forall i, j \in [\prodsize], \wElem_i = \wElem_j$,
\begin{align*}
= &\sum_{\wElem \in \wSet_j}\prod_{i = 1}^{\prodsize} \vect_i(\wElem) = \term_j.
\end{align*}
\end{proof}
The claim for $\est = \sum_j \est_j$ follows by linearity of expectation.\newline
We need to compute the variance of the $\prodsize$-way product $\est$. We wish to prove that
\begin{equation}
\sigsq \leq \sum_j \sigsq_j \label{eq:var-to-prove}.
\end{equation}
For notational convenience let $\cvar(j, j') = \ex{\est_j \cdot \conj{\est_{j'}}} - \ex{\est_j}\ex{\conj{\est_{j'}}}$.
Substituting in the definition of variance for complex numbers,
\begin{align}
\sigsq &= \ex{\sum_j \est_j \cdot \conj{\sum_{j'} \est_{j'}}} - \ex{\sum_j \est_j}\cdot\ex{\conj{\sum_{j'} \est_{j'}}}\nonumber\\
&= \ex{\sum_j \est_j \cdot \sum_{j'} \conj{\est_{j'}}} - \ex{\sum_j \est_j}\cdot\ex{\sum_{j'} \conj{\est_{j'}}}\nonumber\\
&= \sum_{j, j'}\left(\ex{\est_j \cdot \conj{\est_{j'}}} - \ex{\est_j}\ex{\conj{\est_{j'}}}\right)\nonumber\\
&= \sum_j\left(\ex{\est_j \cdot \conj{\est_j}} - \ex{\est_j}\ex{\conj{\est_j}}\right) + \sum_{j \neq j'}\cvar(j, j')\nonumber\\
&= \sum_j \sigsq_j + \sum_{j \neq j'}\cvar(j, j') \label{eq:sigsq}
\end{align}
One can see that \cref{eq:sigsq} is composed of two addends. We now bound each of them separately.
\subsection{Bounding $\cvar(j, j')$}
Notice we have two cases of $\cvar(j, j')$, the first is when $j = j'$, i.e. $(\sigsq_j)$, and the second when $j \neq j'$.
\begin{align}
\cvar(j, j') &= \ex{\est_j \cdot \conj{\est_{j'}}} - \ex{\est_j}\cdot\ex{\conj{\est_{j'}}}\nonumber\\
&=\ex{\prod_{i = 1}^{\prodsize}\sum_{\wElem \in W}v_i(\wElem)s(\wElem)\ind{h(\wElem) = j}\cdot \prod_{i = 1}^{\prodsize}\sum_{\wElem' \in W}v_i(\wElem')\conj{s(\wElem')}\ind{h(\wElem') = j'}} - \ex{\prod_{i = 1}^{\prodsize}\sum_{\wElem \in W}v_i(\wElem)s(\wElem)\ind{h(\wElem) = j}}\cdot \ex{\prod_{i = 1}^{\prodsize}\sum_{\wElem' \in W}v_i(\wElem')\conj{s(\wElem')}\ind{h(\wElem') = j'}}\nonumber\\
&=\ex{\sum_{\substack{\wElem_1,\cdots,\wElem_\prodsize,\\\wElem'_1,\cdots,\wElem'_\prodsize\\\in W}}\prod_{i = 1}^{\prodsize}v_i(\wElem_i)s(\wElem_i)v_i(\wElem'_i)\conj{s(\wElem'_i)} \ind{h(\wElem_i) = j} \ind{h(\wElem'_i) = j'}} - \ex{\sum_{\substack{\wElem_1,\cdots, \wElem_\prodsize\\\in W}}\prod_{i = 1}^{\prodsize}v_i(\wElem_i)s(\wElem_i) \ind{h(\wElem_i) = j}}\cdot\ex{\sum_{\substack{\wElem'_1,\cdots, \wElem'_\prodsize\\\in W}}\prod_{i = 1}^{\prodsize}v_i(\wElem'_i)\conj{s(\wElem'_i)} \ind{h(\wElem'_i) = j'}}\nonumber\\
&=\sum_{\substack{\wElem_1,\cdots,\wElem_\prodsize,\\\wElem'_1,\cdots,\wElem'_\prodsize\\\in W}}\ex{\prod_{i = 1}^{\prodsize}v_i(\wElem_i)s(\wElem_i)v_i(\wElem'_i)\conj{s(\wElem'_i)}\ind{h(\wElem_i) = j}\ind{h(\wElem'_i) = j'}} - \ex{\prod_{i = 1}^{\prodsize} v_i(\wElem_i)s(\wElem_i) \ind{h(\wElem_i) = j}} \cdot \ex{\prod_{i = 1}^{\prodsize}v_i(\wElem'_i)\conj{s(\wElem'_i)}\ind{h(\wElem'_i) = j'}}\\
&= \sum_{\substack{\wElem_1,\cdots,\wElem_\prodsize,\\\wElem'_1,\cdots,\wElem'_\prodsize\\\in W}}\prod_{i = 1}^{\prodsize}v_i(\wElem_i)v_i(\wElem'_i)\ex{\prod_{i = 1}^{\prodsize}s(\wElem_i)\conj{s(\wElem'_i)}\ind{h(\wElem_i) = j}\ind{h(\wElem'_i) = j'}} - \prod_{i = 1}^{\prodsize}v_i(\wElem_i)\ex{\prod_{i = 1}^{\prodsize}s(\wElem_i)\ind{h(\wElem_i) = j}}\cdot \prod_{i = 1}^{\prodsize}v_i(\wElem'_i)\ex{\prod_{i = 1}^{\prodsize}\conj{s(\wElem'_i)}\ind{h(\wElem_i') = j'}}\nonumber\\
&= \sum_{\substack{\wElem_1,\cdots,\wElem_\prodsize,\\\wElem'_1,\cdots,\wElem'_\prodsize\\\in W}}\prod_{i = 1}^{\prodsize}v_i(\wElem_i)v_i(\wElem'_i)\left(\ex{\prod_{i = 1}^{\prodsize}s(\wElem_i)\conj{s(\wElem'_i)}\ind{h(\wElem_i) = j}\ind{h(\wElem'_i) = j'}} - \ex{\prod_{i = 1}^{\prodsize}s(\wElem_i)\ind{h(\wElem_i) = j}}\cdot\ex{\prod_{i = 1}^{\prodsize}\conj{s(\wElem'_i)}\ind{h(\wElem_i') = j'}} \right).\label{eq:var-lambda-j-j'}
\end{align}
\subsection{$\cvar(j, j')~|~j \neq j'$}
For notational convenience set
\begin{align}
\term_1\left(\wElem_1,\ldots, \wElem_{\prodsize}, \wElem'_1,\ldots, \wElem'_{\prodsize}\right) = &\ex{\prod_{i = 1}^{\prodsize}\sine(\wElem_i)\conj{\sine(\wElem'_i)}\ind{\hfunc(\wElem_i) = j} \ind{\hfunc(\wElem'_i) = j'}}\label{eq:term-1}\\
\term_2\left(\wElem_1,\ldots, \wElem_{\prodsize}, \wElem'_1,\ldots, \wElem'_{\prodsize}\right) = &\ex{\prod_{i = 1}^{\prodsize}\sine(\wElem_i)\ind{\hfunc(\wElem_i) = j}} \cdot \ex{\prod_{i = 1}^{\prodsize}\conj{\sine(\wElem'_i)}\ind{\hfunc(\wElem'_i) = j'}} \label{eq:term-2}
\end{align}
Focusing on $\term_1$, observe that $\term_1 = 1$ if and only if all the $\wElem_i$'s are equal, all the $\wElem'_i$'s are equal, and the two groups of variables do not equal each other,
\AH{Need to back up the above statement with lemma 1, indicator vars, 2kwise independence of s, h}
\AH{Both of these terms need to be fixed, we forgot the indicator variables.}
\begin{equation*}
\term_1\left(\wElem_1,\ldots, \wElem_{\prodsize}, \wElem'_1,\ldots, \wElem'_{\prodsize}\right) =
\begin{cases}
1 &\text{if } \forall i, j \in [\prodsize], \wElem_i \neq \wElem'_j, \wElem_i = \wElem_j, \wElem'_i = \wElem'_j\\
0 &\text{otherwise}.
\end{cases}
\end{equation*}
Focusing on $\term_2$, it can be seen that $\term_2 = 1$ when we have that all $\wElem_i$'s are equal, and all $\wElem'_i$'s are equal,
\begin{equation*}
\term_2\left(\wElem_1,\ldots, \wElem_{\prodsize}, \wElem'_1,\ldots, \wElem'_{\prodsize}\right) =
\begin{cases}
1 &\text{if } \forall i, j \in [\prodsize], \wElem_i = \wElem_j, \wElem'_i = \wElem'_j\\
0 &\text{otherwise}.
\end{cases}
\end{equation*}
Using \cref{eq:var-lambda-j-j'} for $j \neq j'$, we establish bounds for the rightmost addend of \cref{eq:sigsq}.\newline\newline
\underline{Case 1:}
Assume that there exists an $i, j \in [\prodsize]$ such that $\wElem_i = \wElem'_j$. Then, since the same value cannot be hashed to two different buckets, $\term_1 = 0$. If we have that all $\wElem_i, \wElem'_i$ are equal, $\term_2 = 1$, and otherwise $\term_2 = 0$. \newline
\underline{Case 2:}
Alternatively, assume that for all $i$ in $[\prodsize]$ there does not exist a $j$ such that $\wElem_i = \wElem'_j$. Then, if for all $i, j$ in $[\prodsize]$ $\wElem_i = \wElem_j, \wElem'_i = \wElem'_j$, $\term_1 = 1$, and $0$ otherwise. Should $\term_1 =1$, then it is certain that $\term_2 = 1$. Should $\term_1 = 0$, then it has to be that $\term_2 = 0$.
Thus, the only time that $\term_1 - \term_2 \neq 0$ is from case 1, when we have that all $\wElem_i = \wElem'_i = \wElem$. \newline
%$\startOld{'Proof'/reasoning}$
%For $\term_1^{\cvar(j, j')} = \ex{\prod_{i = 1}^{\prodsize}s(\wElem_i)s(\wElem'_i)\ind{h(\wElem_i) = j}\ind{h(\wElem'_i) = j'}}$, because hash function $h$ cannot bucket the same world to two different buckets, the only instance $\term_1^{\cvar(j, j')} = 1$ occurs when there is no overlap between the $\wElem_i$ and $\wElem'_i$ variables. Given the condition of no overlap, $\term_1^{\cvar(j, j')} = 1$ only with the further condition that $\forall i \in [\prodsize], \wElem_i = \wElem, \wElem'_i = \wElem', \wElem \neq \wElem'$. Notice, however, given the conditions, the product of the remaining expectations will cancel this out. Looking at the remaining two expectations $\term_2^{\cvar(j, j')} = \ex{\prod_{i = 1}^{\prodsize}\sine(\wElem_i) \ind{\hfunc(\wElem_i) = j}} \cdot \ex{\prod_{i = 1}^{\prodsize}\conj{\sine(\wElem'_i)} \ind{\hfunc(\wElem'_i) = j'}}$, that $\term_2^{\cvar(j, j')} = 1$ only when $\forall i \in [\prodsize], \wElem_i = \wElem, \wElem'_i = \wElem'$. Taken together, the constraints leave us with only one possible case for $\term_1^{\cvar(j, j')} - \term_2^{\cvar(j, j')} \neq 0$, when all variables are the same world.
%$\finOld$
Therefore,
\begin{align}
\underset{j \neq j'}{\cvar(j, j')} = &\sum_{\wElem \in \wSet}\prod_{i = 1}^{\prodsize}\vect_i^2(\wElem)\left(\term_1 - \term_2\right)\nonumber\\
= &\sum_{\wElem \in \wSet}\prod_{i = 1}^{\prodsize} \vect_i^2(\wElem)\left(0 - \frac{1}{\sketchCols^2}\right)\nonumber\\
= &- \frac{1}{\sketchCols^2}\sum_{\wElem \in \wSet}\prod_{i = 1}^{\prodsize}\vect_i^2(\wElem)\label{eq:cvar-bound}.
\end{align}
Based on the results of \cref{eq:sigsq}, we deduce the following,
\begin{align*}
&\sigsq - \sum_j \sigsq_j = \sum_{j \neq j'}\cvar(j, j')\\
&\implies \sigsq \leq \sum_j \sigsq_j.
\end{align*}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{$\cvar(j, j')~|~j = j'$}
Taking a look at the leftmost term of \cref{eq:sigsq}, we establish bounds for the variance of the $j^{\text{th}}$ bucket of a $\prodsize$-way join. Note in this case that \cref{eq:var-lambda-j-j'} has that $j = j'$, and can be written in the following way,
\begin{align}
%&\sigsq_j = \ex{\est_j \cdot \overline{\est_j}} - \ex{\est_j} \cdot \ex{\overline{\est_j}} \nonumber\\
%&= \ex{\prod_{i = 1}^{\prodsize}\sum_{w \in W_j}v_i(w)s(w) \cdot \prod_{i = 1}^\prodsize\sum_{w' \in W_j}v_i(w')\overline{s(w')}} -
%\ex{\prod_{i = 1}^{\prodsize}\sum_{w \in W_j}v_i(w)s(w)}\cdot \ex{\prod_{i = 1}^\prodsize\sum_{w' \in W_j}v_i(w')\overline{s(w')}}\nonumber\\
%&= \ex{\sum_{\substack{w_1...w_\prodsize\\w'_1...w'_\prodsize\\ \in W}}\prod_{i = 1}^\prodsize v_i(w_i)v(w'_i)s(w_i)\overline{s(w'_i)}\ind{h(w_i) = j}\ind{h(w'_i) = j}} -
%\ex{\sum_{w_1...w_\prodsize \in W} \prod_{i = 1}^\prodsize v_i(w_i)s(w_i)\ind{h(w_i) = j}} \cdot
%\ex{\sum_{w'_1...w'_\prodsize \in W} \prod_{i = 1}^\prodsize v_i(w'_i)\overline{s(w'_i)}\ind{h(w'_i) = j}}\nonumber\\
%=&\sum_{\substack{w_1...w_\prodsize\\w'_1...w'_\prodsize\\ \in W}}\ex{\prod_{i = 1}^\prodsize v_i(w_i)v_i(w'_i)s(w_i)\overline{s(w'_i)}\ind{h(w_i) = j}\ind{h(w'_i) = j}} -
%\ex{\prod_{i = 1}^kv_i(w_i)s(w_i)\ind{h(w_i) = j}} \cdot \ex{\prod_{i = 1}^\prodsize v_i(w'_i)\overline{s(w'_i)}\ind{h(w'_i) = j}}\nonumber\\
&= \sum_{\substack{w_1...w_\prodsize\\w'_1...w'_\prodsize\\ \in W}}\prod_{i = 1}^\prodsize v_i(w_i)v_i(w'_i)\cdot\left( \ex{\prod_{i = 1}^\prodsize s(w_i)\overline{s(w'_i)}\ind{h(w_i) = j}\ind{h(w'_i) = j}} -
\ex{\prod_{i = 1}^ks(w_i)\ind{h(w_i) = j}}\cdot \ex{\prod_{i = 1}^\prodsize\overline{s(w'_i)}\ind{h(w'_i) = j}} \right)\label{eq:sig-j-last}.
\end{align}
%Before proceeding, we introduce some notation and terminology that will aid in communicating the bounds we are about to establish. We refer to the leftmost expectation of \cref{eq:sig-j-last} in the following way:
%\begin{equation}
%\term_1\left(\wElem_1,\ldots,\wElem_\prodsize, \wElem_1',\ldots, \wElem_\prodsize'\right) = \ex{\prod_{i = 1}^\prodsize s(w_i)\overline{s(w'_i)}\ind{h(w_i) = j}\ind{h(w'_i) = j}}.%\text{, and}
%\end{equation}
%\[\term_2\left(\wElem_1,\ldots,\wElem_\prodsize, \wElem_1',\ldots, \wElem_\prodsize'\right) = \ex{\prod_{i = 1}^ks(w_i)\ind{h(w_i) = j}}\cdot \ex{\prod_{i = 1}^\prodsize\overline{s(w'_i)}\ind{h(w'_i) = j}}. \]
We will use the vocabulary ``term'' to denote $\prod_{i = 1}^{\prodsize}\vect_i(\wElem_i)\vect_i(\wElem_i') \cdot\term_1\left(\wElem_1,\ldots,\wElem_\prodsize, \wElem'_1,\ldots,\wElem'_\prodsize\right)$ given a specific set of world values. %To say that a term survives \AR{You should not care about whether the $T_1$ term survives or not. See the above comment on why.} the expectation is to mean that $\term_1 - \term_2 \neq 0$. Note, that the only terms that survive the expectation above are mappings of $w_i = w'_j = w$ for $i, j \in [\prodsize]$, such that each $w_i$ has a match, i.e., no $w_i$ or $w'_j$ stands alone without a matching world in its complimentary set. In other words, the set of values in $\wElem_1,\ldots,\wElem_k$ has a bijective mapping to the set of values in $\wElem'_1,\ldots,\wElem'_k$.
We next describe the nonzero terms of \cref{eq:sig-j-last}.
%\subsection{M-tuples}
%\begin{Definition}
%Given a $\prodsize$-way join, define $\dist \in [\prodsize]$. An \dist-tuple then is a set of tuples, each tuple conatining $\dist$ elements, such that the values of each tuple sum up to $\dist$, i.e. $\forall i \in [\dist], \sum_j \dist_{t_{i, j}} = \dist$, where i is the $i^{th}$ tuple in $\dist_t$, and $j$ is the $j^{th}$ index of that tuple $t$. The set consists of each unique sum up to symmetry, meaning a tuple with the same elements only reversed is disallowed.
%\end{Definition}
%For example, when $\prodsize = 4$, $\dist = 2$, the \dist-tuple, denoted, $\dist_2$, would be$\left\{\left(1, 3\right), \left(2, 2\right)\right\}$. Here, $\dist_{2_{1, 1}} = 1$, and while the tuple $\left(3, 1\right)$ sums up to $\prodsize = 4$, we do not include it since we have it's symmetrical term $\left(1, 3\right)$.
%
%\AR{Why is the definition of M-tuples needed? From what I understand you need this to define what kinds of $\surj$ and $\surj'$ are allowed but in that case why not state those properties directly in terms of $\surj$ and $\surj'$? Actually after reading the next section, I do not see why these properties are needed at all..}
%\AH{I use the \dist-tuples to explain 1) what kind of matchings survive and 2) that $\surj, \surj'$ must only cross product from within the matchings of the same tuple. Maybe there is an easier way to do this.}
\subsection{\surj, \surj'}
\begin{Definition}
Define and then fix an arbitrary total ordering of the $|\wSet|$ worlds in $\wSet$. We use the symbol $\prec$ to denote the ordering.
%Given a fixed order $\wSet_{\order}: \left(\wSet, \wSet\right)\mapsto \mathbb{B}$ of possible worlds, define the lexographical order of distinct worlds $\wSet_\dist$ to be the ordering which complies to the identity mapping of elements in $[\prodsize]$ to elements in $[\dist]$ up to $\dist$, such that . In other worlds, $\forall \wElem, \wElem' \in \wSet_\dist, \dw < \wElem' \leftrightarrow \wSet_{\order}\left(\wElem, \wElem'\right) = T$.
\end{Definition}
To help describe all possible world value matchings we introduce functions $\surj$ and $\surj'$.
\begin{Definition}\label{def:f}
Functions \surj, \surj' are the set of surjective mappings from $\prodsize$ to $\dist$ elements: $\surj: [\prodsize] \rightarrow [\dist], \surj': [\prodsize] \rightarrow [\dist'].$
\end{Definition}
%\begin{equation*}
%\surj(i) = \begin{cases}
% \dMap{w_1} &\surj(i) = 1\\
% \dMap{w_2} &\surj(i) = 2\\
% \vdots &\vdots\\
% \dMap{w_\dist} &\surj(i) = \dist.
% \end{cases}
%\end{equation*}
The functions $\surj, \surj'$ are used to produce the mappings $w_i \mapsto \dMap{w_{\surj(i)}}$. In particular, $\surj$ and $\surj'$ are machinery for mapping $\prodsize$ $\wElem$-world variables to $\dist$ distinct values.
We rewrite equation \eqref{eq:sig-j-last} in terms of $\dist$ distinct worlds, with $\surj, \surj'$ mappings.
\AH{A bug here? I think it is necessary to include $\term_2$ for the rewriting to be equivalent.}
\begin{equation}
\sum_{\dist = 1}^{\prodsize}\sum_{\dist' = 1}^{\prodsize}\sum_{\surj, \surj'}\sum_{\substack{\dw_1 \prec \cdots \prec \dw_\dist,\\ \dw'_{1} \prec \cdots \prec \dw'_{\dist'}\\ \in W}}\prod_{i = 1}^{\prodsize}\vect_i(\dw_{\surj(i)})\vect_i(\dw'_{\surj'(i)})\cdot \term_1\left(\dw_{\surj(1)},\ldots,\dw_{\surj(\prodsize)}, \dw'_{\surj'(1)},\ldots, \dw'_{\surj'(\prodsize)}\right)
\label{eq:sig-j-distinct}
\end{equation}
The functions $\surj, \surj'$ are used to produce the mapping $\left(\wElem_1,\ldots, \wElem_{\prodsize}\right) \mapsto \left(\dw_{\surj(1)},\ldots, \dw_{\surj(\prodsize)}\right)$. Observe that the Cartesian product of world values assigned to $\wElem_1,\ldots,\wElem_\prodsize$ throughout the summation can be rearranged into groups of world variables with distinct world values, for each distinct element $\dist$ in the set $[\prodsize]$. For each $\dist \in [\prodsize]$, all possible combinations of $\dist$ world values is equivalently modeled by taking the set of surjective functions $\surj:[\prodsize]\mapsto [\dist]$ and using that set to obtain all respective mappings $\left(\wElem_1,\ldots, \wElem_{\prodsize}\right) \mapsto \left(\dw_1,\ldots, \dw_{\prodsize}\right)$.
For any $\dist$, the set of surjective mappings $\surj$ covers all unique mappings.
Let us define the set
\[S_1 = \{\left(\wElem_1,\ldots, \wElem_{\prodsize}\right)~|~ \wElem_1,\ldots, \wElem_{\prodsize} \in \wSet, |\{\wElem_i ~|~ \forall i \in [\prodsize]\}| = \dist\}.\] Further define
\AH{Note that in the following definition we do not need to state that $\surj$ is surjective. $\surj$ is required to be surjective by definition of $\dist$, in that $\dist$ is the number of \textit{distinct} values, and for $\prodsize \geq \dist$, any function that maps $\prodsize$ elements down to $\dist$ elements will indeed be surjective. $\dist$ is the number of actual distinct values, which implies that each element in the set $\dist$ will have \textit{at least} one element of the \textit{preimage} mapped to it.}
\[S_2 = \left\{\left(\dw_{\surj(1)},\ldots, \dw_{\surj(\prodsize)}\right)~|~ \surj:[\prodsize] \mapsto [\dist], \dw_1 \prec \cdots \prec \dw_{\dist}\in \wSet \right\}.\]
To prove that \cref{eq:sig-j-last} and \cref{eq:sig-j-distinct} are the same, we need to show that the following lemmas are true.
\begin{Lemma}
The set $S_1$ is equal to the set $S_2$.
\end{Lemma}
\begin{proof}
Note that $S_1 = S_2$ is the same as $S_1 \subseteq S_2 \wedge S_2 \subseteq S_1$.
First, we argue that $S_2 \subseteq S_1$. By definition of $S_2$, this must follow, since every tuple in the set has $\dist$ distinct worlds, namely $\dw_1,\ldots, \dw_{\dist}$, while by definition of $S_1$, we have the set of all such possible tuples.
Next, we argue that $S_1 \subseteq S_2$. Pick an arbitrary tuple $\left(\wElem_1,\ldots, \wElem_{\prodsize}\right)$ in $S_1$. Note that it has $\dist$ distinct world values by definition of $S_1$. To show that this tuple exists in $S_2$ we need to show that there exists $\dw_1 \prec \cdots \prec \dw_{\dist}$ and $\surj$ such that $\left(\wElem_1,\ldots, \wElem_{\prodsize}\right) = \left(\dw_{\surj(1)},\ldots, \dw_{\surj(\prodsize)}\right)$. Take the distinct elements of the tuple and name these $\dw_1,\ldots,\dw_{\dist}$ such that they respect $\prec$. Then the corresponding definition of $\surj$ follows from which $\dw_j$ each $\wElem_i$ maps to. Such a mapping must exist since by \cref{def:f} we have the set of all mappings.
\end{proof}
\begin{Lemma}
Given a distinct $\dw_1 \prec \cdots \prec \dw_{\dist}$ over every distinct $\surj:[\prodsize]\mapsto [\dist]$, the tuples $\left(\dw_{\surj(1)},\ldots, \dw_{\surj(\prodsize)}\right)$ are distinct.
\end{Lemma}
\begin{proof}
For a given $\prodsize, \dist$, every function $\surj$ in the set of surjective functions ($\surjSet$), applies its mapping to the set of distinct value combinations that respect the total order ($\mtupSet$). In other words, an alternative view is to take the cartesian product of the world values that can be mapped to the $\dist$ $\dw_i$ variables respecting $\prec$, i.e.,
\[\mtupSet = \left\{\left(\dw_1,\ldots, \dw_{\dist}\right) ~|~ \dw_1,\ldots, \dw_{\dist} \in \wSet, \dw_1 \prec \cdots \prec \dw_{\dist}\right\},\]
where each member of this set of distinct values can be thought of as input to each mapping produced by each $\surj \in \surjSet$. For convenience let $\boldsymbol{\dw} = \left(\dw_1,\ldots, \dw_{\dist}\right)$ and $\kdisttup{\surj} = \left(\dw_{\surj(1)},\ldots, \dw_{\surj(\prodsize)}\right).$ Based on this, we can equivalently say that
\[S_2 = \left\{\kdisttup{\surj} ~|~ \surj \in \surjSet,~~ \boldsymbol{\dw} \in \mtupSet\right\}\]
We can look at $S_2$ at a set of pairs, $\left(\surj, \disttup\right)$, where each pair represents the set of ordered distinct $\dist$ values, and the mapping used to produce $\kdisttup{\surj}$. We need to show that
\[\left(\surj, \disttup\right) \neq \left(\surj', \disttup'\right) \implies \kdisttup{\surj} \neq \kdisttup{\surj'}'.\]
For the first case, assume $\disttup \neq \disttup'$. This, along with the surjectivity of $\surj$ and $\surj'$ implies that there is at least one world in $\kdisttup{f}$ not equal to $\kdisttup{f'}'.$
For the second case, assume that $\disttup = \disttup'$, which then requires that $\surj \neq \surj'.$ But, if $\surj \neq \surj'$, we know that there exists an $i \in [\prodsize]$ such that $\surj(i) \neq \surj'(i)$, and this then implies that $\kdisttup{f} \neq \kdisttup{f'}$.
\end{proof}
%, while additionally including all symmetrical counterparts allows for double counting. This double counting is mitigated by the fact that $\dw_1 \prec \cdots \prec \dw_\dist$ based on the fixed order of the world values, which then \AR{I do not see what the ``symmetrical counterparts" comment adds here. Just remove it}.
%yields exactly the world value combinations containing $\dist$ distinct values which appear in the cartesian product of the sum.
%\AR{Overall comments: (1) The main thing missing if explicitly stating that $(w_1,\dots,w_k)\mapsto (\dw_{\surj(1)},\ldots,\dw_{\surj(\dist)})$. (2) After stating the map you should argue in words why all distinct tuples with $m$ distinct world values are covered.}
\begin{Definition}
Functions $\surj:[\prodsize]\mapsto [\dist], \surj':[\prodsize]\mapsto [\dist']$ are said to be matching, denoted $\match{\surj}{\surj'}$, if and only if
\begin{enumerate}
%\item $\dist = \dist'$
\item $\forall i \in [\dist], |\surj^{-1}(i)| = |\surj'^{-1}(i)|$, i.e., the cardinality of variables mapped to $\dw_i$ equals the cardinality of variables mapped to $\dw_i'$, for all $i \in [\dist]$.
\end{enumerate}
\end{Definition}
\begin{Lemma}\label{lem:sig-j-survive}
When $\surj, \surj'$ are matching, where for every $i \in[\dist], \dw_{i} = \dw'_{i}$, %\cref{eq:sig-j-distinct} is exactly
\[
\term_1(\dw_{\surj(1)},\dots, \dw_{\surj(\prodsize)}, \dw_{\surj'(1)},\dots, \dw_{\surj'(\prodsize')}) = \frac{1}{\sketchCols^\dist}% \sum_{\substack{\dw_{1} \prec \cdots \prec \dw_{_\dist}\\ \in \wSet}}\prod_{i = 1}^{\prodsize}\vect_i(\dw_{\surj(i)})\vect_i(\dw_{\surj'(i)})
\]
and $0$ otherwise.
\end{Lemma}
%In proving \cref{lem:sig-j-survive}, we introduce another fact.
%
%\begin{Lemma}\label{lem:exp-prod-rand-roots}
%Given a $\prodsize^{th}$ root of unity $\rou$, the expectation of the product of $(\rou^i)^l \cdot (\rou^j)^{l'}$ for uniformly random $i, j$, where $i, j, l, l' \in [\prodsize]$, is zero.
%\end{Lemma}
%\AH{I don't think I need this lemma anymore, but I'll give it a go anyways.}
%\AH{A quick heads up, I realized as I was doing this that I had stated things incorrectly in the definition of the lemma. This, being corrected, allows only for one case.}
%\AH{Also, since I don't use this in the proof of \cref{lem:sig-j-survive}, it probably makes no sense to have this sitting right in the middle of things. I'm only leaving it since you had said you wanted to see a proof for both cases: $\wElem = \wElem', \wElem \neq \wElem'$.}
%
%\begin{proof}
%The proof only needs the case when $\wElem \neq \wElem'$, since $i, j$ are both uniformly random.
%\begin{align*}
%&\ex{\sine(\wElem)^i \conj{\sine(w')}^j}\\
%= &\ex{\sine(\wElem^i)}\ex{\conj{\sine(\wElem')}^j}\\
%= &0
%\end{align*}
%\end{proof}
%In the above, since we have more than pairwise independence for $\wElem \neq \wElem'$, we can push the expectation into the product. Then by \cref{lem:exp-sine} we get 0 for both expectations.\newline
%\AR{First some typos/things that are incorrect below-- note this is \textbf{not} an exhaustive list. (1) In the proof below the $w_i$ and $w'_i$ should be $\dw_i$ and $\dw_i$ respectively. (2) The expression for $T_1$ below is incorrect since it seems to assume that all the pre-image sizes are $1$-- the expression for $T_2$ is fine except the $j_i$ terms are not defined. However, ``taking out" one term for $\dw_{m'}$ for $T_2$ is incorrect since e.g. we could have the pre-image of $m'$ have size $>1$. (3) The proof below never explicitly argues why the condition $\dw_{_j} = \dw_{'_j}$ is needed.}
%\AR{Here is how I recommend that you re-write the proof. First as mentioned earlier, you should only consider the $T_1$ terms (as you account for the $T_2$ terms later on. Second you should first start off by re-stating the $T_1$ term like so. Consider the ``generic term"--
%\[T_1(\dw_{\surj(1)},\dots, \dw_{\surj(m)}, \dw_{\surj'(1)},\dots, \dw_{\surj'(m')}).\]
%Then re-write the what the above term is based on the exact definition (BTW I'm dropping the $\mathbf{E}$ terms for convenience but they should be all there below.) In particular, the above term by definition is exactly
%\[\prod_{i=1}^k s(\dw_{\surj(i)})\cdot \overline{s(\dw_{\surj'(i)})}.\]
%Now re-write the above in terms of ``powers" of distinct worlds:
%\[ (\prod_{i=1}^m s(\dw_{i})^{|\surj^{-1}(i)|})\cdot \overline{(\prod_{j=1}^m s(\dw_j)^{|\surj^{-1}(j)|})}\]
%Now once you have the above expression, then it will be much easier to argue why if any of the matching conditions are not satisfied then the expression is $0$. I also believe that working with the above expression will also make it more ``obvious" as to why the different conditions are required. Currently the arguments below do not explicitly bring this out...
%}
\textit{*In the subsequent proofs, we assume at most $2\prodsize$-wise independence for both $\hfunc$ and $\sine$, though we would prefer to require less*}.
\AH{This generic term was introduced earlier in \cref{eq:sig-j-distinct}}
\begin{proof}
Note \cref{eq:term-1}, and consider the ``generic term''--
\[\term_1(\dw_{\surj(1)},\dots, \dw_{\surj(\prodsize)}, \dw_{\surj'(1)},\dots, \dw_{\surj'(\prodsize)}).\]
Let's rewrite the term based on its exact definition:
\begin{align*}
= &\ex{\prod_{i = 1}^{\prodsize}\sine(\dw_{\surj(i)})\cdot\conj{\sine(\dw'_{\surj'(i)})}\ind{\hfunc(\dw_{\surj(i)}) = j}\ind{\hfunc(\dw'_{\surj'(i)}) = j}}\\
= &\ex{\left(\prod_{i = 1}^{\dist}\sine(\dw_{i})^{|\surj^{-1}(i)|}\ind{\hfunc(\dw_i) = j}\right) \cdot \left(\prod_{\ell = 1}^{\dist'}\conj{\sine(\dw'_{\ell})}^{|\surj'^{-1}(\ell)|}\ind{\hfunc(\dw'_\ell) = j}\right)}
\end{align*}
Notice that each $i \in [\prodsize]$ has its own mapping to an element in $[\dist]$. We can thus rearrange all the elements of the product such that the preimage of function $\surj(i)$, i.e., $\surj^{-1}(i)$ yields the number of terms that will be mapped to a distinct variable $\dw_i$.
Further see how the requirement that $\dw_i = \dw'_i$ gives us the precise combinations we are looking for, where each random $\sine$ output value has its own matching complex conjugate. This condition also cancels one of the indicator variables as well.
%To prove that \cref{lem:sig-j-survive} is true, consider what the expectation looks like when $\surj, \surj'$ are not matching. The first condition for $\surj, \surj'$ to be matching is violated when $\dist \neq \dist'$.
%Note that $\dw_1\ldots\dw_\dist, \dw_1'\ldots\dw_{\dist'}'$ are distinct world values such that $\forall i \neq j \in [\dist], \dw_i = \dw_i' \neq \dw_j = \dw_j'$. To make things easier, assume that $\dist < \dist'$. The opposite case of $\dist > \dist'$ has a symmetrical proof. Fixing variables $\dw_1\ldots\dw_\dist, \dw_1'\ldots\dw'_{\dist'}$, we have at least one $\dw_i$ without a conjugate, and at least one extra distinct value, $\dw_{\dist'}'$, for which no $\dw_{\dist'}$ exists. This (these) distinct term(s) cancel(s) out all the other values in the expectations.
%To complete the proof,
To prove that \cref{lem:sig-j-survive} is true, we consider the case where $\surj$ and $\surj'$ are not matching, i.e., there is a $\dw_i$ and $\dw'_i$ with an unequal number of mappings.
\begin{align*}
&\exists i \in [\dist], |\surj^{-1}(i)| \neq |\surj'^{-1}(i)|\\
\implies &\exists j \in [\dist] ~|~ i \neq j, |\surj^{-1}(j)| \neq |\surj'^{-1}(j)|\\
\implies &\exists i, j \in [\dist], i \neq j ~|~ |\surj^{-1}(i)| \neq |\surj'^{-1}(i)|, |\surj^{-1}(j)| \neq |\surj'^{-1}(j)|, \\
%\implies &\exists \wElem_i \in \wSet ~|~ \nexists \wElem_i' \in \wSet ~|~ \wElem_i = \wElem_i'
\end{align*}
The above means that we will have at least two world values that don't match. Put another way, after the optimal number of matching world value pairs have been assigned, there will be at least one world value whose matching conjugate product is not the conjugate of the sine of the same world value, i.e. for $i \neq j$, there will exist at least one product of $\sine(\dw_i) \conj{\sine(\dw_{j}')}$.
Such cross terms exist since
\[\left(\sum_{\substack{i \in [\dist],\\|\surj^{-1}(i)| \neq |\surj'^{-1}(i)|}}|\surj^{-1}(i)|\right) = \left(\sum_{\substack{i' \in [\dist],\\|\surj^{-1}(i')| \neq |\surj'^{-1}(i')|}}|\surj'^{-1}(i')|\right)\]
Let $n = \{i ~|~ |\surj^{-1}(i)| \neq |\surj'^{-1}(i)|\}$. Further, let $\dist_* = [\dist] \setminus n$, $|\surj^{*-1}(i)| = \min\left(|\surj^{-1}(i)|, |\surj'^{-1}(i)|\right)$, and $|\hat{f}^{-1}(i)| = \left||\surj^{-1}(i)| - |\surj'^{-1}(i)|\right|$. Then,
\begin{align}
\term_1 = &\mathbb{E}\left[\left(\prod_{i \in [\dist_*]} \sine(\dw_i)^{|\surj^{-1}(i)|} \conj{\sine(\dw'_i)}^{|\surj'^{-1}(i)|}\ind{\hfunc(\dw_i) = j}\right)
\left(\prod_{\ell \in [n]}\sine(\dw_\ell)^{|\surj^{*-1}(\ell)|} \conj{\sine(\dw'_\ell)}^{|\surj^{*-1}(\ell)|}\ind{\hfunc(\dw_\ell) = j}\right) \right.\nonumber\\
&\qquad\qquad\qquad\qquad\qquad\qquad\left.\left(\prod_{\substack{i' \in [n],\\\surj^{-1}(i') > \surj'^{-1}(i')}} \sine(\dw_{i'})^{|\surj^{-1}(i')| - |\surj'^{-1}(i')|}\ind{\hfunc(\dw_{i'}) = j}\right)
\left(\prod_{\substack{\ell' \in n,\\ \surj'^{-1}(\ell') > \surj^{-1}(\ell')}} \conj{\sine(\dw'_{\ell'})}^{|\surj'^{-1}(\ell')| - |\surj^{-1}(\ell')|}\ind{\hfunc(\dw_{\ell'}) = j}\right)\right] \label{eq:lem-match-pt2-line1}
\end{align}
Notice that the two rightmost factors in the product are distinct values with no matching conjugates, and by the independence of $\sine$, we can push the expectation through the product. If we label the four factors as
\begin{align*}
\term_{1, 1} =&\prod_{i \in [\dist_*]} \sine(\dw_i)^{|\surj^{-1}(i)|} \conj{\sine(\dw'_i)}^{|\surj'^{-1}(i)|}\ind{\hfunc(\dw_i) = j}\\
\term_{1, 2} =& \prod_{\ell \in [n]}\sine(\dw_\ell)^{|\surj^{*-1}(\ell)|} \conj{\sine(\dw'_\ell)}^{|\surj^{*-1}(\ell)|}\ind{\hfunc(\dw_\ell) = j}\\
\term_{1, 3} =& \prod_{\substack{i' \in [n],\\\surj^{-1}(i') > \surj'^{-1}(i')}} \sine(\dw_{i'})^{|\surj^{-1}(i')| - |\surj'^{-1}(i')|}\ind{\hfunc(\dw_{i'}) = j}\\
\term_{1, 4} =&\prod_{\substack{\ell' \in n,\\ \surj'^{-1}(\ell') > \surj^{-1}(\ell')}} \conj{\sine(\dw'_{\ell'})}^{|\surj'^{-1}(\ell')| - |\surj^{-1}(\ell')|}\ind{\hfunc(\dw_{\ell'}) = j}
\end{align*}
we can obtain
\begin{equation*}
\term_1 = \ex{\term_{1, 1} \cdot \term_{1, 2}}\ex{\term_{1, 3}} \ex{\term_{1, 4}}
\end{equation*}
from \cref{eq:lem-match-pt2-line1}, and arbitrarily pick either of the last two factors and derive
\begin{align}
\ex{\term_{1, 3}} =& \ex{\prod_{\substack{i' \in [n],\\\surj^{-1}(i') > \surj'^{-1}(i')}} \sine(\dw_{i'})^{|\surj^{-1}(i')| - |\surj'^{-1}(i')|}\ind{\hfunc(\dw_{i'}) = j}}\nonumber\\
=&\prod_{\substack{i' \in [n],\\\surj^{-1}(i') > \surj'^{-1}(i')}}\ex{\sine(\dw_{i'})^{|\surj^{-1}(i')| - |\surj'^{-1}(i')|}}\ex{\ind{\hfunc(\dw_{i'}) = j}}\label{eq:term-1-3}\\
=&0.\nonumber
\end{align}
We get $0$ from \cref{eq:term-1-3} by \cref{lem:exp-sine}. This $0$ cancels all other products in $\term_1$, arriving at a final answer of $0$.\qed\newline
%\begin{align}
%= &\ex{\left(\prod_{i \in [\dist_*]} \sine(\dw_i)^{|\surj^{-1}(i)|} \conj{\sine(\dw'_i)}^{|\surj'^{-1}(i)|}\right)
% \left(\prod_{j \in [n]}\sine(\dw_i)^{|\surj^{*-1}(j)|} \conj{\sine(\dw'_i)}^{|\surj^{*-1}(j)|}\right)} \cdot
% \ex{\left(\prod_{\substack{i' \in n ~|~\\\surj^{-1}(i') > \surj'^{-1}(i')}} \sine(\dw_{i'})^{|\surj^{-1}(i')| - |\surj'^{-1}(i')|} \prod_{\substack{j' \in n ~|~\\ \surj'^{-1}('j) > \surj^{-1}(j')}} \conj{\sine(\dw'_{j'})}^{|\surj'^{-1}(j')| - |\surj^{-1}(j')|}\right)}\nonumber\\
%= &\ex{\left(\prod_{i \in [\dist_*]} \sine(\dw_i)^{|\surj^{-1}(i)|} \conj{\sine(\dw'_i)}^{|\surj'^{-1}(i)|}\right)
% \left(\prod_{j \in [n]}\sine(\dw_i)^{|\surj^{*-1}(j)|} \conj{\sine(\dw'_i)}^{|\surj^{*-1}(j)|}\right)} \cdot
% \ex{\prod_{\substack{i' \in n ~|~\\\surj^{-1}(i') > \surj'^{-1}(i')}} \sine(\dw_{i'})^{|\surj^{-1}(i')| - |\surj'^{-1}(i')|}} \cdot \ex{\prod_{\substack{j' \in n ~|~\\ \surj'^{-1}('j) > \surj^{-1}(j')}} \conj{\sine(\dw'_{j'})}^{|\surj'^{-1}(j')| - |\surj^{-1}(j')|}}\nonumber\\
%= &\ex{\left(\prod_{i \in [\dist_*]} \sine(\dw_i)^{|\surj^{-1}(i)|} \conj{\sine(\dw'_i)}^{|\surj'^{-1}(i)|}\right)
% \left(\prod_{j \in [n]}\sine(\dw_i)^{|\surj^{*-1}(j)|} \conj{\sine(\dw'_i)}^{|\surj^{*-1}(j)|}\right)} \cdot
% \prod_{\substack{i' \in n ~|~\\\surj^{-1}(i') > \surj'^{-1}(i')}} \ex{\sine(\dw_{i'})^{|\surj^{-1}(i')| - |\surj'^{-1}(i')|}} \cdot \prod_{\substack{j' \in n ~|~\\ \surj'^{-1}('j) > \surj^{-1}(j')}} \ex{\conj{\sine(\dw'_{j'})}^{|\surj'^{-1}(j')| - |\surj^{-1}(j')|}}\label{eq:lem-match-pt2-last}\\
%= &0.\nonumber
%\end{align}
%Looking at \cref{eq:lem-match-pt2-line1}, each $\sine$ function in the first two products has its matching complex conjugate in the product terms. However, the rightmost products of the expectation are all distinct world value inputs, i.e. random $\sine$ values with no matching conjugate counterparts. Since we have distinct, non-matching world value inputs for the rightmost products, by the uniformity of $\sine$ we can push the expectation through the products until we arrive at \cref{eq:lem-match-pt2-last}, where finally, by \cref{lem:exp-sine}, each of those inner expectations computes to $0$. This in turn zeroes out the whole product.
\begin{corollary}
When $\match{\surj}{\surj'}$, it follows that $\dist = \dist'$.
\end{corollary}
\AH{I'm not really sure what is the better argument.}
\AH{arg 1}
\begin{proof}
Note that if for all $i \in [\dist]$, $|\surj^{-1}(i)| = |\surj'^{-1}(i)|$, it is implicit that $\dist = \dist'$, since we have that $\sum_{i \in [\dist]}|\surj^{-1}(i)| = \prodsize.$ Assume the contrary, that $\surj$ has an image of $\dist$ elements and $\surj'$ an image of $\dist'$ elements such that $\dist \neq \dist'$. Then it would have to be that we have either $\sum_{i \in [\dist]}|\surj^{-1}(i)| = \prodsize < \sum_{i' \in [\dist']}|\surj'^{-1}(i')|$, or $ \sum_{i' \in [\dist']}|\surj'^{-1}(i')| = \prodsize < \sum_{i \in [\dist]}|\surj^{-1}(i)|$ since the preimage cardinality for each element in $[\dist]$ for $\surj$ and $\surj'$ are agreeing, and this results in a contradiction. %If $\dist \neq \dist'$, then there would have to be an element
\AH{arg 2}
Observe that when we %have the condition for all $i \in [\dist], |\surj^{-1}(i)| = |\surj'^{-1}(i)|$, it has to be that there are the same number of distinct items mapped to in both $\surj$ and $\surj'$ that this implies that $\sum_{i \in [\dist]} \surj^{-1}(i)|
have the condition $\dist \neq \dist'$, we still have that for any $\dist \in [\prodsize]$, $\sum_{i = 1}^{\dist}|\surj^{-1}(i)| = \prodsize$. This fact implies that for $\dist, \dist' \in [\prodsize]$ such that $\dist \neq \dist'$, there exists $i \in [\max(\dist, \dist')]$ such that $|\surj^{-1}(i)| \neq |\surj'^{-1}(i)|$, meaning that if $\dist \neq \dist'$, then $\surj$ and $\surj'$ do not match; this is logically equivalent (by contrapositive) to the corollary. \qed\newline
\end{proof}
We now seek to show that when $\surj, \surj'$ are matching, $\term_1$ will always equal $\frac{1}{\sketchCols^\dist}$. Recall that when $\match{\surj}{\surj'}$, that
\begin{enumerate}
%\item $\dist = \dist'$, i.e., the output size of both functions is the same,
\item $\forall i \in [\dist],| \surj^{-1}(i)| = |\surj'^{-1}(i)|$, i.e. each $\dw_i$ has the same number of variables assigned to it as its $\dw'_i$ counterpart.
\end{enumerate}
Recalling that we require $\dw_i = \dw'_i$, this means,
\AH{This also covers the case when m = 1, i.e. $|\surj^{-1}(i)| = \prodsize$.}
\begin{align}
\term_1 = &\ex{\prod_{i = 1}^{\dist}\sine(\dw_i)^{|\surj^{-1}(i)|}\conj{\sine(\dw'_i)}^{|\surj'^{-1}(i)|}\ind{\hfunc(\dw_i) = j}}\nonumber\\
= &\ex{\prod_{i = 1}^{\dist}\left(\sine(\dw_i) \cdot \conj{\sine(\dw'_i)}\right)^{|\surj^{-1}(i)|}\ind{\hfunc(\dw_i) = j}}\label{eq:lem-match-pt3-2}\\
= &\frac{1}{\sketchCols^{\dist}}\nonumber
\end{align}
We arrive at \cref{eq:lem-match-pt3-2} since $\forall i \in [\dist], |\surj^{-1}(i)| = |\surj'^{-1}(i)|$ and we can use the distributive law of exponents
over multiplication. Recall also that for all $i \in [\dist]$, $\dw_i = \dw'_i$ by definition of the lemma, allowing us to drop one of the indicator variables. This then implies that each individual $\sine(\dw_i)$ has its own matching conjugate $\conj{\sine(\dw'_i)}$, and since $\sine$ takes values that are complex roots of unity, each $\sine(\dw_i)\cdot \conj{\sine(\dw'_i)} = 1$, yielding an overall product of $\prod_{i = 1}^{\dist}\ex{\ind{\hfunc(\dw_i) = j}} = \frac{1}{\sketchCols^{\dist}}$.
\end{proof}
Using the above definitions, we can now present the variance bounds for $\sigsq_j$ based on \eqref{eq:sig-j-distinct}.
\begin{Lemma}
When $\match{\surj}{\surj'}$, with $\dw_i = \dw'_i$ for all $i \in [\dist]$, \cref{eq:sig-j-distinct} =
\[\sum_{\dist = 2}^{\prodsize}\frac{1}{\sketchCols^{\dist}}\sum_{\substack{\surj, \surj'\\\match{\surj}{\surj'}}}\sum_{\substack{\dw_{_1}, \ldots,\dw_{_\dist}\\ \in W}}\prod_{i = 1}^{\prodsize}\vect_i(\dw_{\surj(i)})\vect_i(\dw_{\surj'(i)}).\]
\end{Lemma}
\AH{A bug below? When we have that for all $i \in [\prodsize]$ $\dw_i = \dw_{i'}$, doesn't $\term_2 = \frac{1}{\sketchCols^2}$ while $\term_1 = \frac{1}{\sketchCols}$, yielding a difference of $\frac{B - 1}{\sketchCols^2}$?}
\AH{Should we mention this weird case for $\dist = \dist' = 1$, where for all $i \in [\prodsize]$ $\dw_i = \wElem$ and for all $i' \in [\prodsize]$ $\dw'_{i'} = \wElem'$ \textit{but} $\wElem \neq \wElem'$? This case makes $\term_1 - \term_2 = 0$, and doesn't change the results.}
By the fact that the expectations cancel when $\forall i, i', j, j'\in [\prodsize], \wElem_i = \wElem_j = \wElem, \wElem_{i'}' = \wElem_{j'}' = \wElem'$, for both $\wElem = \wElem'$ and $\wElem \neq \wElem'$, we can rid ourselves of $\term_2$, (\cref{eq:term-2}), the case when there exists only one distinct world value. This is precisely why we have not needed to account for the last two expectations in \cref{eq:sig-j-last}. We then need to sum up all the $\dist$ distinct world value possibilities for $\dist \in [2, \prodsize]$. Starting with \cref{eq:sig-j-distinct},
\AH{redo bringing in the $\term_2$. Advised to separate both (in the $\sigsq_j$ case) $\term_1, \term_2$, and do separate analysis for each.}
\begin{align}
\sigsq_j = &\sum_{\dist = 1}^{\prodsize}\sum_{\dist' = 1}^{\prodsize}\sum_{\surj, \surj'}\sum_{\substack{\dw_{_1} \prec \cdots \prec \dw_{_\dist},\\\dw'_{1},\ldots,\dw'_{\dist'}\\ \in W}}\prod_{i = 1}^{\prodsize}\vect_i(\dw_{\surj(i)})\vect_i(\dw'_{\surj'(i)})\cdot \left(\term_1\left(\dw_{\surj(1)},\ldots,\dw_{\surj(\prodsize)}, \dw'_{\surj'(1)},\ldots, \dw'_{\surj'(\prodsize)}\right) - \term_2\left(\dw_{\surj(1)},\ldots,\dw_{\surj(\prodsize)}, \dw'_{\surj'(1)},\ldots, \dw'_{\surj'(\prodsize)}\right)\right)\nonumber\\
= &\sum_{\dist = 2}^{\prodsize}\sum_{\surj, \surj'}\sum_{\substack{\dw_{_1} \prec \cdots \prec \dw_{_\dist},\\\dw'_{1} \prec \cdots \prec\dw'_{\dist}\\ \in W}}\prod_{i = 1}^{\prodsize}\vect_i(\dw_{\surj(i)})\vect_i(\dw'_{\surj'(i)})\cdot \term_1\left(\dw_{\surj(1)},\ldots,\dw_{\surj(\prodsize)}, \dw'_{\surj'(1)},\ldots, \dw'_{\surj'(\prodsize)}\right)\label{eq:sig-j-bnd-1}\\
= &\sum_{\dist = 2}^{\prodsize}\frac{1}{\sketchCols^{\dist}}\sum_{\substack{\surj, \surj'\\\match{\surj}{\surj'}}}\sum_{\substack{\dw_{_1} \prec \cdots \prec \dw_{_\dist},\\ \in W}}\prod_{i = 1}^{\prodsize}\vect_i(\dw_{\surj(i)})\vect_i(\dw'_{\surj'(i)})\label{eq:sig-j-bnd-2}
%= &\sum_{\dist = 2}^{\prodsize}\sum_{\substack{\surj, \surj'\\\match{\surj}{\surj'}}}\sum_{\substack{\dw_{_1}, \ldots,\dw_{_\dist}\\ \in W}}\prod_{i = 1}^{\prodsize}\vect_i(\dw_{\surj(i)})\vect_i(\dw_{\surj'(i)})\cdot \prod_{i = 1}^{\dist}\ind{\hfunc(\dw_i) = j}\label{eq:sig-j-bnd-3}\\
%= &\sum_{\dist = 2}^{\prodsize}\frac{1}{\sketchCols^{\dist}}\sum_{\substack{\surj, \surj'\\\match{\surj}{\surj'}}}\sum_{\substack{\dw_{_1}, \ldots,\dw_{_\dist}\\ \in W}}\prod_{i = 1}^{\prodsize}\vect_i(\dw_{\surj(i)})\vect_i(\dw_{\surj'(i)})\label{eq:sig-j-bnd-4}
\end{align}
We obtain \cref{eq:sig-j-bnd-1} by the fact that $\dist = \dist'$ and the removal of $\term_2$. We conclude with \cref{eq:sig-j-bnd-2} by \cref{lem:sig-j-survive}.% as well as bringing out the indicator variables of $\term_1$. Equation \ref{eq:sig-j-bnd-3} is derived from the fact that $\forall i \in [\dist], \dw_i = \dw'_i$. We arrive at \cref{eq:sig-j-bnd-4}, since with $\dist$ distinct variables, the product of indicator variables will result in multiplying the uniform distribution probability distribution $\dist$ times.
Using \cref{eq:cvar-bound} and \cref{eq:sig-j-bnd-2}, we state the general bounds for $\sigsq$,
\AH{Needs to be redone. Missing the outer $\sum_j$ and $\sum_{j \neq j'}$. Use another letter than h for the iteration, h is the hash function.}
\[\sigsq = \sum_{\dist = 2}^{\prodsize}\frac{1}{\sketchCols^{\dist}}\sum_{\substack{\surj, \surj'\\\match{\surj}{\surj'}}}\sum_{\substack{\dw_{_1}, \ldots, \dw_{_\dist}\\ \in W}}\prod_{i = 1}^{\prodsize}\vect_i(\dw_{\surj(i)})\vect_i(\dw_{\surj'(i)}) -
\frac{1}{B^2}\sum_{\wElem \in W}\prod_{i = 1}^{\prodsize}v_i^2(\wElem).\]
\AH{Can start on SOP. Another thing I could work on would be revising lemma 1.}
\AR{Remaining TODOs: (1) Give expression for general $\sigma^2$, i.e. deal with the general $\lambda(j,j')$ term. (2) Show how to use the analysis for general $k$-product to handle generic SoP expressions-- the expectation arguments would just follow from the above and linearity of expectation but the variance bounds might need a bit of extra work.}
\AH{make h a subscript. Avoid superscripts generally unless you already have a subscript; make another file with notation for a quick easy table. Make every product term to have the same k product terms.
Also, the game plan is:
use lemma 6 for when $\ell = \ell'$
write lemma 6' for when $\ell \neq \ell'$.
}
\section{SOP}
\subsection{Notation}
We proceed to evaluating the sum of products ($\sop$) query. As it turns out, the analysis is much the same as in the analysis for $\prodsize$-way products. The output of an $\sop$ query is formally defined as
\[\term_j^{\sop} = \sum_{\sopind = 1}^{\sopsize}\sum_{\wElem \in \wSet_j}\prod_{i = 1}^{\prodsize^{\sopind}}\vect^{\sopind}_i(\wElem)\]
where $\prodsize^{\sopind}$ is the number of vectors that make up the $\sopind^\text{th}$ addend (product term) of the sum. Further define an estimate of a $j^{\text{th}}$ bucket as
\[\sopest{j} = \sum_{\sopind = 1}^{\sopsize}\prod_{i = 1}^{\prodsize^{\sopind}}\sk^{\vect^{\sopind}_i}[j]\]
and the overall estimate
\[\sopest{} = \sum_{j}\sopest{j}.\]
\subsection{Expectation}
\begin{Lemma}
The expectation for an SOP query is exactly $\term_j^{\sop}$.
\end{Lemma}
\begin{proof}
The proof follows immediately by linearity of expectation. Recall that $\term_j = \sum_{\wElem \in \wSet_j}\prod_{i = 1}^{\prodsize}\vect_i(\wElem)$. Since $\sop$ is just the addition of an arbitrary number of $\prodsize$-way products, we end up with the analogue of $\term_j$ for each of the product terms, and summing over all of them yields exactly $\term_j^{\sop}$.\qed
\end{proof}
\subsection{Variance}
It turns out that the variance computations naturally work out in a similar fashion as with $\prodsize$-way products. Let $\cvar_{\sop}(j, j') = \ex{\sop_{\est_j} \cdot \conj{\sop_{\est_{j'}}}} - \ex{\sop_{\est_j}}\ex{\conj{\sop_{\est_{j'}}}}$.
\begin{align}
\sigsq_{\sop} =& \ex{\sop_{\est} \cdot \conj{\sop_{\est}}} - \ex{\sop_{\est}}\ex{\conj{\sop_{\est}}}\nonumber\\
=& \ex{\sum_{j, j'} \sop_{\est_j} \cdot \conj{\sop_{\est_{j'}}}} - \ex{\sum_j \sop_{\est_j}}\ex{\sum_{j'} \conj{\sop_{\est_{j'}}}}\nonumber\\
=& \sum_{j, j'} \ex{\sop_{\est_j} \cdot \conj{\sop_{\est_{j'}}}} - \sum_j \ex{\sop_{\est_j}} \sum_{j'}\ex{\conj{\sop_{\est_{j'}}}}\nonumber\\
=& \sum_{j, j'} \ex{\sop_{\est_j} \cdot \conj{\sop_{\est_{j'}}}} - \ex{\sop_{\est_j}}\ex{\conj{\sop_{\est_{j'}}}}\nonumber\\
=& \left(\sum_j \cvar_{\sop}(j, j)\right) + \left(\sum_{j \neq j'}\cvar_{\sop}(j, j')\right)\label{eq:sop-sigsq}
\end{align}
Likewise we can bound $\cvar_{\sop}(j, j')$,
\begin{align}
\cvar_{\sop}(j, j') =& \ex{\left(\sum_{\sopind = 1}^{\sopsize}\prod_{i = 1}^{\prodsize^\sopind}\sum_{\wElem \in \wSet}\vect_i^\sopind(\wElem)\sine(\wElem)\ind{\hfunc(\wElem) = j}\right)\cdot \left(\conj{\sum_{\sopind' = 1}^{\sopsize}\prod_{i = 1}^{\prodsize^{\sopind'}}\sum_{\wElem' \in \wSet}\vect_i^{\sopind'}(\wElem')\sine(\wElem')\ind{\hfunc(\wElem') = j'}}\right)} -
\ex{\sum_{\sopind = 1}^{\sopsize}\prod_{i = 1}^{\prodsize^\sopind}\sum_{\wElem \in \wSet}\vect_i^\sopind(\wElem)\sine(\wElem)\ind{\hfunc(\wElem) = j}}
\ex{\conj{\sum_{\sopind' = 1}^{\sopsize}\prod_{i = 1}^{\prodsize^{\sopind'}}\sum_{\wElem' \in \wSet}\vect_i^{\sopind'}(\wElem')\sine(\wElem')\ind{\hfunc(\wElem') = j'}}}\nonumber \\
=& \ex{\left(\sum_{\sopind = 1}^{\sopsize}\prod_{i = 1}^{\prodsize^\sopind}\sum_{\wElem \in \wSet}\vect_i^\sopind(\wElem)\sine(\wElem)\ind{\hfunc(\wElem) = j}\right)\cdot \left(\sum_{\sopind' = 1}^{\sopsize}\prod_{i = 1}^{\prodsize^{\sopind'}}\sum_{\wElem' \in \wSet}\vect_i^{\sopind'}(\wElem')\conj{\sine(\wElem')}\ind{\hfunc(\wElem') = j'}\right)} -
\ex{\sum_{\sopind = 1}^{\sopsize}\prod_{i = 1}^{\prodsize^\sopind}\sum_{\wElem \in \wSet}\vect_i^\sopind(\wElem)\sine(\wElem)\ind{\hfunc(\wElem) = j}}
\ex{\sum_{\sopind' = 1}^{\sopsize}\prod_{i = 1}^{\prodsize^{\sopind'}}\sum_{\wElem' \in \wSet}\vect_i^{\sopind'}(\wElem')\conj{\sine(\wElem')}\ind{\hfunc(\wElem') = j'}}\nonumber \\
=& \ex{\left(\sum_{\sopind = 1}^{\sopsize}\sum_{\substack{\wElem_1,\ldots, \wElem_{\prodsize^\sopind}\\ \in \wSet}}\prod_{i = 1}^{\prodsize^\sopind}\vect_i^\sopind(\wElem_i)\sine(\wElem_i)\ind{\hfunc(\wElem_i) = j}\right)\cdot \left(\sum_{\sopind' = 1}^{\sopsize}\sum_{\substack{\wElem'_1,\ldots, \wElem'_{\prodsize^{\sopind'}} \\ \in \wSet}}\prod_{i' = 1}^{\prodsize^{\sopind'}}\vect_i^{\sopind'}(\wElem'_{i'})\conj{\sine(\wElem'_{i'})}\ind{\hfunc(\wElem'_{i'}) = j'}\right)} -
\ex{\sum_{\sopind = 1}^{\sopsize}\sum_{\substack{\wElem_1,\ldots, \wElem_{\prodsize^\sopind}\\ \in \wSet}}\prod_{i = 1}^{\prodsize^\sopind}\vect_i^\sopind(\wElem_i)\sine(\wElem_i)\ind{\hfunc(\wElem_i) = j}}
\ex{\sum_{\sopind' = 1}^{\sopsize}\sum_{\substack{\wElem'_1,\ldots, \wElem'_{\prodsize^\sopind} \\ \in \wSet}}\prod_{i = 1}^{\prodsize^{\sopind'}}\vect_i^{\sopind'}(\wElem'_i)\conj{\sine(\wElem'_i)}\ind{\hfunc(\wElem'_i) = j'}}\nonumber \\
=& \ex{\sum_{\sopind, \sopind'}^{\sopsize}\sum_{\substack{\wElem_1,\ldots, \wElem_{\prodsize^\sopind}\\\wElem'_1,\ldots, \wElem'_{\prodsize^\sopind}\\ \in \wSet}}\left(\prod_{i = 1}^{\prodsize^\sopind}\vect_i^\sopind(\wElem_i)\sine(\wElem_i)\ind{\hfunc(\wElem_i) = j}\right)\cdot \left(\prod_{i = 1}^{\prodsize^{\sopind'}}\vect_i^{\sopind'}(\wElem'_i)\conj{\sine(\wElem'_i)}\ind{\hfunc(\wElem'_i) = j'}\right)} -
\sum_{\sopind = 1}^{\sopsize}\sum_{\substack{\wElem_1,\ldots, \wElem_{\prodsize^\sopind}\\ \in \wSet}} \ex{\prod_{i = 1}^{\prodsize^\sopind}\vect_i^\sopind(\wElem_i)\sine(\wElem_i)\ind{\hfunc(\wElem_i) = j}}
\sum_{\sopind' = 1}^{\sopsize}\sum_{\substack{\wElem'_1,\ldots, \wElem'_{\prodsize^\sopind} \\ \in \wSet}} \ex{\prod_{i = 1}^{\prodsize^{\sopind}}\vect_i^{\sopind'}(\wElem'_i)\conj{\sine(\wElem'_i)}\ind{\hfunc(\wElem'_i) = j'}}\nonumber \\
=& \sum_{\sopind, \sopind'}^{\sopsize}\sum_{\substack{\wElem_1,\ldots, \wElem_{\prodsize^\sopind}\\\wElem'_1,\ldots, \wElem'_{\prodsize^\sopind}\\ \in \wSet}}\prod_{i = 1}^{\prodsize^\sopind}\vect_i^\sopind(\wElem_i)\ \prod_{i = 1}^{\prodsize^{\sopind'}}\vect_i^{\sopind'}(\wElem'_i)\ex{\sine(\wElem_i)\ind{\hfunc(\wElem_i) = j}\conj{\sine(\wElem'_i)}\ind{\hfunc(\wElem'_i) = j'}} -
\sum_{\sopind, \sopind'}\sum_{\substack{\wElem_1,\ldots, \wElem_{\prodsize^\sopind}\\ \wElem'_1,\ldots, \wElem'_{\prodsize^\sopind}\\ \in \wSet}} \ex{\prod_{i = 1}^{\prodsize^\sopind}\vect_i^\sopind(\wElem_i)\sine(\wElem_i)\ind{\hfunc(\wElem_i) = j}}
\ex{\prod_{i = 1}^{\prodsize^{\sopind}}\vect_i^{\sopind'}(\wElem'_i)\conj{\sine(\wElem'_i)}\ind{\hfunc(\wElem'_i) = j'}}\nonumber \\
=& \sum_{\sopind, \sopind'}^{\sopsize}\sum_{\substack{\wElem_1,\ldots, \wElem_{\prodsize^\sopind}\\\wElem'_1,\ldots, \wElem'_{\prodsize^\sopind}\\ \in \wSet}}\prod_{i = 1}^{\prodsize^\sopind}\vect_i^\sopind(\wElem_i)\ \prod_{i = 1}^{\prodsize^{\sopind'}}\vect_i^{\sopind'}(\wElem'_i) \left(\ex{\prod_{i = 1}^{\prodsize^{\sopind}}\sine(\wElem_i)\ind{\hfunc(\wElem_i) = j}\prod_{i = 1}^{\prodsize^{\sopind'}}\conj{\sine(\wElem'_i)}\ind{\hfunc(\wElem'_i) = j'}} -
\ex{\prod_{i = 1}^{\prodsize^\sopind}\vect_i^\sopind(\wElem_i)\sine(\wElem_i)\ind{\hfunc(\wElem_i) = j}}
\ex{\prod_{i = 1}^{\prodsize^{\sopind}}\vect_i^{\sopind'}(\wElem'_i)\conj{\sine(\wElem'_i)}\ind{\hfunc(\wElem'_i) = j'}}\right)\nonumber
\end{align}