paper-BagRelationalPDBsAreHard/sop.tex

162 lines
16 KiB
TeX

%root--main.tex
\section{Sum of Products Analysis}
We now seek to bound the variance of a $\prodsize$-way join.
\begin{align}
&\sigsq_j = \ex{\est_j \cdot \overline{\est_j}} - \ex{\est_j} \cdot \ex{\overline{\est_j}} \nonumber\\
&= \ex{\prod_{i = 1}^{\prodsize}\sum_{w \in W_j}v_i(w)s(w) \cdot \prod_{i = 1}^\prodsize\sum_{w' \in W_j}v_i(w')\overline{s(w')}} -
\ex{\prod_{i = 1}^{\prodsize}\sum_{w \in W_j}v_i(w)s(w)}\cdot \ex{\prod_{i = 1}^\prodsize\sum_{w' \in W_j}v_i(w')\overline{s(w')}}\nonumber\\
&= \ex{\sum_{\substack{w_1...w_\prodsize\\w'_1...w'_\prodsize\\ \in W}}\prod_{i = 1}^\prodsize v_i(w_i)v_i(w'_i)s(w_i)\overline{s(w'_i)}\ind{h(w_i) = j}\ind{h(w'_i) = j}} -
\ex{\sum_{w_1...w_\prodsize \in W} \prod_{i = 1}^\prodsize v_i(w_i)s(w_i)\ind{h(w_i) = j}} \cdot
\ex{\sum_{w'_1...w'_\prodsize \in W} \prod_{i = 1}^\prodsize v_i(w'_i)\overline{s(w'_i)}\ind{h(w'_i) = j}}\nonumber\\
&=\sum_{\substack{w_1...w_\prodsize\\w'_1...w'_\prodsize\\ \in W}}\left(\ex{\prod_{i = 1}^\prodsize v_i(w_i)v_i(w'_i)s(w_i)\overline{s(w'_i)}\ind{h(w_i) = j}\ind{h(w'_i) = j}} -
\ex{\prod_{i = 1}^\prodsize v_i(w_i)s(w_i)\ind{h(w_i) = j}} \cdot \ex{\prod_{i = 1}^\prodsize v_i(w'_i)\overline{s(w'_i)}\ind{h(w'_i) = j}}\right)\nonumber\\
&= \sum_{\substack{w_1...w_\prodsize\\w'_1...w'_\prodsize\\ \in W}}\prod_{i = 1}^\prodsize v_i(w_i)v_i(w'_i)\cdot\left( \ex{\prod_{i = 1}^\prodsize s(w_i)\overline{s(w'_i)}\ind{h(w_i) = j}\ind{h(w'_i) = j}} -
\ex{\prod_{i = 1}^\prodsize s(w_i)\ind{h(w_i) = j}}\cdot \ex{\prod_{i = 1}^\prodsize\overline{s(w'_i)}\ind{h(w'_i) = j}} \right)\label{eq:sig-j-last}.
\end{align}
Before proceeding, we introduce some notation and terminology that will aid in communicating the bounds we are about to establish. First we refer to the expectation computations as
\[\term = \ex{\prod_{i = 1}^\prodsize s(w_i)\overline{s(w'_i)}\ind{h(w_i) = j}\ind{h(w'_i) = j}} -
\ex{\prod_{i = 1}^\prodsize s(w_i)\ind{h(w_i) = j}}\cdot \ex{\prod_{i = 1}^\prodsize\overline{s(w'_i)}\ind{h(w'_i) = j}} \text{,} \]
\[\term_1 = \ex{\prod_{i = 1}^\prodsize s(w_i)\overline{s(w'_i)}\ind{h(w_i) = j}\ind{h(w'_i) = j}} \text{, and}\]
\[\term_2 = \ex{\prod_{i = 1}^\prodsize s(w_i)\ind{h(w_i) = j}}\cdot \ex{\prod_{i = 1}^\prodsize\overline{s(w'_i)}\ind{h(w'_i) = j}}. \]
We use the word ``term'' to denote the value that \eqref{eq:sig-j-last} computes for a specific assignment of world values. To say that a term survives the expectation means that \eqref{eq:sig-j-last} computes a nonzero value for that assignment. Note that the only terms that survive the expectation above are those with $w_i = w'_j = w$ for $i, j \in [\prodsize]$, such that each $w_i$ has a match, i.e., no $w_i$ or $w'_j$ stands alone without a matching world in its complementary set. In other words, the set of values in $\wElem_1,\ldots,\wElem_\prodsize$ has a bijective mapping to the set of values in $\wElem'_1,\ldots,\wElem'_\prodsize$.
%\subsection{M-tuples}
%\begin{Definition}
%Given a $\prodsize$-way join, define $\dist \in [\prodsize]$. An \dist-tuple then is a set of tuples, each tuple conatining $\dist$ elements, such that the values of each tuple sum up to $\dist$, i.e. $\forall i \in [\dist], \sum_j \dist_{t_{i, j}} = \dist$, where i is the $i^{th}$ tuple in $\dist_t$, and $j$ is the $j^{th}$ index of that tuple $t$. The set consists of each unique sum up to symmetry, meaning a tuple with the same elements only reversed is disallowed.
%\end{Definition}
%For example, when $\prodsize = 4$, $\dist = 2$, the \dist-tuple, denoted, $\dist_2$, would be$\left\{\left(1, 3\right), \left(2, 2\right)\right\}$. Here, $\dist_{2_{1, 1}} = 1$, and while the tuple $\left(3, 1\right)$ sums up to $\prodsize = 4$, we do not include it since we have it's symmetrical term $\left(1, 3\right)$.
%
%\AR{Why is the definition of M-tuples needed? From what I understand you need this to define what kinds of $f$ and $f'$ are allowed but in that case why not state those properties directly in terms of $f$ and $f'$? Actually after reading the next section, I do not see why these properties are needed at all..}
%\AH{I use the \dist-tuples to explain 1) what kind of matchings survive and 2) that $f, f'$ must only cross product from within the matchings of the same tuple. Maybe there is an easier way to do this.}
\subsection{$f$, $f'$}
To help describe all possible matchings we introduce functions $f$ and $f'$.
\begin{Definition}
Functions $f$ and $f'$ are surjective mappings from $\prodsize$ elements onto $\dist$ and $\dist'$ elements, respectively: $f\colon [\prodsize] \rightarrow [\dist]$, $f'\colon [\prodsize] \rightarrow [\dist']$.
\end{Definition}
%\begin{equation*}
%f(i) = \begin{cases}
% \widetilde{w_1} &f(i) = 1\\
% \widetilde{w_2} &f(i) = 2\\
% \vdots &\vdots\\
% \widetilde{w_\dist} &f(i) = \dist.
% \end{cases}
%\end{equation*}
The functions $f, f'$ are used to produce the mappings $w_i \mapsto \widetilde{w_{f(i)}}$. In particular, $f$ and $f'$ are machinery for mapping $\prodsize$ $\wElem$-world variables to $\dist$ distinct values.
We rewrite equation \eqref{eq:sig-j-last} in terms of $\dist$ distinct worlds, with $f, f'$ mappings.
\begin{equation}
\sum_{\dist \in [\prodsize]}\sum_{\dist' \in [\prodsize]}\sum_{f, f'}\sum_{\substack{\widetilde{\wElem_1}, \ldots,\widetilde{\wElem_\dist},\\\widetilde{\wElem'_1},\ldots,\widetilde{\wElem'_{\dist'}}\\ \in W}}\prod_{i = 1}^{\prodsize}\vect_i(\widetilde{\wElem_{f(i)}})\vect_i(\widetilde{\wElem'_{f'(i)}})\cdot\left( \ex{\prod_{i = 1}^\prodsize \sine(\widetilde{\wElem_{f(i)}})\conj{\sine(\widetilde{\wElem'_{f'(i)}})}\ind{h(\widetilde{\wElem_{f(i)}}) = j}\ind{h(\widetilde{\wElem'_{f'(i)}}) = j}} -
\ex{\prod_{i = 1}^\prodsize \sine(\widetilde{\wElem_{f(i)}})\ind{h(\widetilde{\wElem_{f(i)}}) = j}}\cdot \ex{\prod_{i = 1}^\prodsize\conj{\sine(\widetilde{\wElem'_{f'(i)}})}\ind{h(\widetilde{\wElem'_{f'(i)}}) = j}} \right)\label{eq:sig-j-distinct}
\end{equation}
Note that for a given $\dist$, we may have several ways to map $\prodsize$ worlds to $\dist$ distinct values. We need to define what it means for $f$ and $f'$ to be matching.
\begin{Definition}
Functions $f\colon[\prodsize]\rightarrow [\dist]$, $f'\colon[\prodsize]\rightarrow [\dist']$ are said to be matching, denoted $\match{f}{f'}$, if and only if
\begin{enumerate}
\item $\dist = \dist'$
\item $\{|f^{-1}(i)| ~|~ i \in [\dist]\} = \{|f'^{-1}(i')| ~|~ i' \in [\dist] \}$ as multisets, i.e., the preimage cardinalities of $f$ equal the preimage cardinalities of $f'$ when counted with multiplicity.
% \item $\forall i \in [\dist], |f^{-1}(i)| = |f'^{-1}(i)|$, or a symmetrical mapping exists, where $\forall i \in [\dist], \exists i' \in [\dist]$ such that $i'$ is unique, $|f^{-1}(i)| = |f^{-1}(i')|$.
\end{enumerate}
\end{Definition}
\begin{Lemma}\label{lem:sig-j-survive}
The only terms surviving the expectation of equation \eqref{eq:sig-j-distinct} are those with $f, f'$ matching, where $\forall j \in[\dist], \widetilde{\wElem_j} = \widetilde{\wElem'_j}$.
\end{Lemma}
In proving \cref{lem:sig-j-survive}, we introduce another fact.
\begin{Lemma}\label{lem:exp-prod-rand-roots}
Given a $\prodsize^{th}$ root of unity $\rou$, the expectation of the product of $\rou^i \cdot \rou^j$ for $i, j \in [\prodsize]$ is zero.
\end{Lemma}
\begin{align*}
&\ex{\sine(\wElem)^i \conj{\sine(\wElem')}^j}\\
= &\left(\rou^0 + \rou^1 + \ldots + \rou^{\prodsize - 1}\right)^i \cdot \left(\rou^0 + \rou^1 + \ldots + \rou^{\prodsize - 1}\right)^j\\
= &0^i \cdot 0^j\\
= &0
\end{align*}
We state what the expectation looks like when $f, f'$ are not matching. From \cref{eq:sig-j-last} it can be seen that if $\term_1 = \term_2 = 0$, then there are no surviving terms. Beginning with \cref{eq:sig-j-last}, we first look into the case when $\dist \neq \dist'$. By the fact that $\dist \neq \dist'$ we know that one set of variables has at least one more distinct world than the other set of variables. Without loss of generality, assume that $\dist < \dist'$. Looking at $\term_1$ having $\dist$ distinct $\wElem$ values,
\begin{equation}
%&\sum_{\substack{\wElem_1,\ldots,\wElem_{\dist},\\ \wElem_1',\ldots,\wElem_{\dist}'\\\in \wSet}}\prod_{i = 1}^{\prodsize}\vect_i(\wElem_i)\vect_i(\wElem_i')
% \left(\ex{\prod_{i = 1}^{k}\sine(\wElem_i)\conj{\sine(\wElem_i')}\ind{\hfunc(\wElem_i) = \buck}\ind{\hfunc(\wElem_i') = \buck}} -
% \ex{\prod_{i = 1}^{k}\sine(\wElem_i)\ind{\hfunc(\wElem_i) = \buck}}\ex{\prod_{i = 1}\conj{\sine(\wElem'_i)}\ind{\hfunc(\wElem'_i) = \buck}} \right) \\
\term_1 = \ex{\sine(\wElem_1)^{\dupSize_1}\cdots\sine(\wElem_{\dist})^{\dupSize_{\dist}}\conj{\sine(\wElem_1')}^{\dupSize_1'}\cdots\conj{\sine(\wElem_{\dist}')}^{\dupSize_{\dist}'}\cdots\conj{\sine(\wElem_{\dist'}')}^{\dupSize_{\dist'}'}} = 0.
\end{equation}
Notice that, with $\dist < \dist'$, this means that there will be $\sum\limits_{n \in \{ [\dist'] - [\dist] \}}\dupSize_n'$ world values on either side that do not have a match since the number of $\prodsize$ products is constant. This leaves us with $\prod\limits_{\dupSize_{n^*} \in \{\dupSize_n | \forall n \in [m], \dupSize_n \neq \dupSize_n'\}}\sine(\wElem_i)^{\dupSize_{n^*} - \dupSize_{n'}} \cdot \prod\limits_{\dist_i' \in \{\dist' - \dist\}}\conj{\sine(\wElem_{m_i'}')}^{\dupSize_{m_i'}'}$ pairings, where $\forall \wElem_i \in \{\wElem_i | i \in [m], \dupSize_i \neq \dupSize_i'\}, \forall \dist_i' \in \{\dist_i' | \dist_i' \in \{\dist' - \dist\}\}, \wElem_i \neq \wElem_{\dist_i'}$, which in expectation is 0 by \cref{lem:exp-prod-rand-roots}. The proof is symmetric for the case of $\dist > \dist'$.
From $\term_2$, notice that by the fact that $\dist' > \dist$, it has to be the case that $\dist' > 1$, so the second expectation of $\term_2$ equals zero by \cref{lem:exp-prod-rand-roots}, setting the whole product to zero. The proof is symmetric for the case of $\dist > \dist'$.
\qed\newline
%\newline Assume $\dist < \dist'$.
%\begin{align*}
%\term_2 = &\ex{\prod_{i = 1}^ks(w_i)\ind{h(w_i) = j}}\cdot \ex{\prod_{i = 1}^\prodsize\overline{s(w'_i)}\ind{h(w'_i) = j}}\\
%= &\ex{\sine(\wElem_1)^{\dupSize_1}\cdot\ldots\cdot\sine(\wElem_\dist)^{\dupSize_\dist}} \cdot
% \ex{\conj{\sine(\wElem_1')}^{\dupSize_1'}\cdot\ldots\cdot\conj{\sine(\wElem_{\dist}')}^{\dupSize_\dist'}\conj{\sine(\wElem_{\dist + 1}')}^{\dupSize_{\dist + 1}'}\cdot\ldots\cdot\conj{\sine(\wElem_{\dist'}')}^{\dupSize_{\dist'}'}}\\
%= &\ex{\sine(\wElem_1)^{\dupSize_1}\cdot\ldots\cdot\sine(\wElem_\dist)^{\dupSize_\dist}} \cdot 0 \\
%= & 0
%\end{align*}
%Note that the second expectation in $\term_2$ cancels out by the fact that there is at least one distinct worl $\wElem_\dist'$ that does not match any of the other world value in the product, and by \cref{lem:exp-prod-rand-roots} yields zero.
To finish the proof of \cref{lem:sig-j-survive}, we now approach the case where $\dist = \dist'$, but the set of preimage cardinalities for $f, f'$ are unequal. Effectively this condition means that we end up with the same result of unequal pairs as when $\dist \neq \dist'$.
\begin{align}
&\{|f^{-1}(i)| ~|~ i \in [\dist]\} \neq \{|f'^{-1}(i')| ~|~ i' \in [\dist] \}\\
\rightarrow &\exists s, t \in [\dist] \text{ s.t. } \dupSize_s = |f^{-1}(s)| \neq |f'^{-1}(s)| = \dupSize_s',~ \dupSize_t = |f^{-1}(t)| \neq |f'^{-1}(t)| = \dupSize_t'\\
\rightarrow &|\dupSize_s - \dupSize_s'| = |\dupSize_t - \dupSize_t'|\text{ by the fact that $\dist = \dist'$}
\end{align}
To be clear, if the sets of preimage sizes do not match, then we have for both $f, f'$ at least two preimage mappings to their respective distinct variables whose corresponding sizes are both unequal. In the notation above, we name the unmatching cardinalities $\dupSize_s$, etc., so that we can refer to them later on. Note that since $\dist$ is the same for both $f, f'$, the disagreement in cardinalities evens out across variables. For the sake of argument, assume $\dupSize_s > \dupSize_s'$ and $\dupSize_t' > \dupSize_t$.
Translating this to $\term_1$ and $\term_2$ we have
\begin{align}
\term_1 = &\ex{\left(\prod_{i \in [\dist] | i \neq s, t}\sine(\wElem_i)^{\dupSize_i}\conj{\sine(\wElem_i')}^{\dupSize_i}\right)
\sine(\wElem_s)^{\dupSize_s}\conj{\sine(\wElem_s')^{\dupSize_s'}}\sine(\wElem_t)^{\dupSize_t}\conj{\sine(\wElem_t')}^{\dupSize_t'} }\\
= &\ex{\left(\prod_{i \in [\dist] | i \neq s, t}\sine(\wElem_i)^{\dupSize_i}\conj{\sine(\wElem_i')}^{\dupSize_i}\right)
\sine(\wElem_s)^{\dupSize_s'}\conj{\sine(\wElem_s')^{\dupSize_s'}} \sine(\wElem_t)^{\dupSize_t}\conj{\sine(\wElem_t')}^{\dupSize_t} \cdot
\sine(\wElem_s)^{\dupSize_s - \dupSize_s'} \conj{\sine(\wElem_t)}^{\dupSize_t' - \dupSize_t} }\\
=& 0.
\end{align}
Note, that because we have that $\wElem_s \neq \wElem_t$, the expectation is zero by \cref{lem:exp-prod-rand-roots}.
For $\term_2$, notice that since we must have $\dist \geq 2$ for the preimage sizes to disagree, both expectations yield zero by \cref{lem:exp-prod-rand-roots}.
For the case where $\dupSize_s < \dupSize_s', \dupSize_t' < \dupSize_t$, the argument is symmetric.\qed\newline
%Since we have at least one extra \textit{distinct} world value, whose conjugate of its sine value is paired with the sine value of another distinct world value, the expectation will equal zero.
%\newline
%The proof is immediate and follows from the fact that the random $\sine$ functions are only guaranteed to produce a product of one under one of two possible conditions:
% \begin{enumerate}
% \item $\sine(\wElem)^\prodsize = 1$,
% \item $\sine(\wElem) \conj{\sine(\wElem)} = 1$.\qed
% \end{enumerate}
%\AH{Here is where I have attempted to use prose to discuss the restrictions on $f$ and $f'$, rather than the use of \dist-tuples. Maybe there is a better, cleaner formal way?}
%E.g., for $\prodsize = 4, \dist = 2$, mappings could be such that one $\wElem_i$ is distinct, while the other three $\wElem_i$ are mapped to the other distinct value. Additionally, we would have the case where two $\wElem_i$ map to a distinct value, while the other two $\wElem_i$ map to a seperate distinct world. The expectations of equation \eqref{eq:sig-j-last} restrict $f$ and $f'$ to belonging to the same class of $\dist$-mapping, meaning, if the mapping $f$ for $\prodsize = 4, \dist = 2$ is in the setting of one distinct world and three equal world values, then $f'$ must be from that same set of mappings, and not from another class of mappings, such as when two $w_i$ map to a distinct world, while the other two $w_i$ map to a separate distinct world.
%\AH{Here is the use of \dist-tuples to explain the same thing.}
% In the example above, $f$ mappings for $\dist_{2_1}$ may only cross product with $f'$ mappings for $\dist_{2_1}$ and not with those for $\dist_{2_2}$. Likewise for $f, f'$ mappings of $\dist_{2_2}$.
Using the above definitions, we can now present the variance bounds for $\sigsq_j$ based on \eqref{eq:sig-j-distinct}.
By the fact that the expectations cancel when $\forall i, i', j, j'\in [\prodsize], \wElem_i = \wElem_j = \wElem_{i'}' = \wElem_{j'}' = \wElem$, we can rid ourselves of the case when there exists only one distinct world value. We then need to sum up all the $\dist$ distinct world value possibilities for $\dist \in [2, \prodsize]$. Note that the number of distinct values $\dist$ affects the randomness of the hash function $\hfunc$. E.g., only $\dist = 2$ distinct values will yield $\frac{1}{\sketchCols} \cdot \frac{1}{\sketchCols} = \frac{1}{\sketchCols^2} = \frac{1}{\sketchCols^\dist}$. By \cref{lem:sig-j-survive} and \eqref{eq:sig-j-distinct} we get
%
%\begin{equation*}
%\frac{1}{\sketchCols^2}\sum_{\widetilde{\wElem_1}, \widetilde{\wElem_2}}\prod_{i = 1}^{\prodsize}\vect_i(\widetilde{\wElem_{f(i)}})\vect_i(\widetilde{\wElem_{f'(i)}}).
%\end{equation*}
%This is because we know that the expectation from \eqref{eq:sig-j-last} will survive when we have mappings that produce pairs of the form $\sine(\wElem)\conj{\sine(\wElem)}$. Second, in consideration of the randomized hashing, with two distinct variables, the indicator variables in the expectation yield $\frac{1}{\sketchCols}\cdot \frac{1}{\sketchCols}$.
%
%We need to sum over all mappings for each case (c) when the number of distinct values is $\dist = 2$, resulting in
%\begin{equation*}
%\frac{1}{\sketchCols^2}\sum_{\widetilde{\wElem_1}, \widetilde{\wElem_2}}\sum_{c \in \dist = 2}\sum_{f, f'}\prod_{i = 1}^{\prodsize}\vect_i(\widetilde{\wElem_{f(i)}})\vect_i(\widetilde{\wElem_{f'(i)}}).
%\end{equation*}
%
%Finally, we need to do this for all $\dist$.
\begin{equation*}
\sigsq_j = \sum_{\dist \in [2, \prodsize]} \frac{1}{\sketchCols^\dist} \sum_{\widetilde{w_1}\ldots\widetilde{w_\dist}\in W} \sum_{\substack{f, f',\\\match{f}{f'}}} \prod_{i = 1}^{\prodsize} v_i(\widetilde{w_{f(i)}}) v_i(\widetilde{w_{f'(i)}}).
\end{equation*}