% paper-BagRelationalPDBsAreHard/sop.tex
%
% 112 lines
% 12 KiB
% TeX

%root--main.tex
\section{Sum of Products Analysis}
We now seek to bound the variance of a $\prodsize$-way join.
\begin{align}
&\sigsq_j = \ex{\est_j \cdot \overline{\est_j}} - \ex{\est_j} \cdot \ex{\overline{\est_j}} \nonumber\\
&= \ex{\prod_{i = 1}^{\prodsize}\sum_{w \in W_j}v_i(w)s(w) \cdot \prod_{i = 1}^\prodsize\sum_{w' \in W_j}v_i(w')\overline{s(w')}} -
\ex{\prod_{i = 1}^{\prodsize}\sum_{w \in W_j}v_i(w)s(w)}\cdot \ex{\prod_{i = 1}^\prodsize\sum_{w' \in W_j}v_i(w')\overline{s(w')}}\nonumber\\
&= \ex{\sum_{\substack{w_1...w_\prodsize\\w'_1...w'_\prodsize\\ \in W}}\prod_{i = 1}^\prodsize v_i(w_i)v_i(w'_i)s(w_i)\overline{s(w'_i)}\ind{h(w_i) = j}\ind{h(w'_i) = j}} -
\ex{\sum_{w_1...w_\prodsize \in W} \prod_{i = 1}^\prodsize v_i(w_i)s(w_i)\ind{h(w_i) = j}} \cdot
\ex{\sum_{w'_1...w'_\prodsize \in W} \prod_{i = 1}^\prodsize v_i(w'_i)\overline{s(w'_i)}\ind{h(w'_i) = j}}\nonumber\\
&= \sum_{\substack{w_1...w_\prodsize\\w'_1...w'_\prodsize\\ \in W}}\left(\ex{\prod_{i = 1}^\prodsize v_i(w_i)v_i(w'_i)s(w_i)\overline{s(w'_i)}\ind{h(w_i) = j}\ind{h(w'_i) = j}} -
\ex{\prod_{i = 1}^\prodsize v_i(w_i)s(w_i)\ind{h(w_i) = j}} \cdot \ex{\prod_{i = 1}^\prodsize v_i(w'_i)\overline{s(w'_i)}\ind{h(w'_i) = j}}\right)\nonumber\\
&= \sum_{\substack{w_1...w_\prodsize\\w'_1...w'_\prodsize\\ \in W}}\prod_{i = 1}^\prodsize v_i(w_i)v_i(w'_i)\cdot\left( \ex{\prod_{i = 1}^\prodsize s(w_i)\overline{s(w'_i)}\ind{h(w_i) = j}\ind{h(w'_i) = j}} -
\ex{\prod_{i = 1}^\prodsize s(w_i)\ind{h(w_i) = j}}\cdot \ex{\prod_{i = 1}^\prodsize\overline{s(w'_i)}\ind{h(w'_i) = j}} \right)\label{eq:sig-j-last}.
\end{align}
Before proceeding, we introduce some notation and terminology that will aid in communicating the bounds we are about to establish. First we refer to the expectation computations as
\[\term = \ex{\prod_{i = 1}^\prodsize s(w_i)\overline{s(w'_i)}\ind{h(w_i) = j}\ind{h(w'_i) = j}} -
\ex{\prod_{i = 1}^\prodsize s(w_i)\ind{h(w_i) = j}}\cdot \ex{\prod_{i = 1}^\prodsize\overline{s(w'_i)}\ind{h(w'_i) = j}} \text{,} \]
\[\term_1 = \ex{\prod_{i = 1}^\prodsize s(w_i)\overline{s(w'_i)}\ind{h(w_i) = j}\ind{h(w'_i) = j}} \text{, and}\]
\[\term_2 = \ex{\prod_{i = 1}^\prodsize s(w_i)\ind{h(w_i) = j}}\cdot \ex{\prod_{i = 1}^\prodsize\overline{s(w'_i)}\ind{h(w'_i) = j}}. \]
We use the word ``term'' to denote the value that \eqref{eq:sig-j-last} computes for a specific assignment of world values. A term \emph{survives} the expectation if \eqref{eq:sig-j-last} evaluates to a nonzero value for that assignment. Note that the only terms that survive the expectation above are those with $w_i = w'_j = w$ for $i, j \in [\prodsize]$, such that each $w_i$ has a match, i.e., no $w_i$ or $w'_j$ stands alone without a matching world in its complementary set. In other words, the set of values in $\wElem_1,\ldots,\wElem_\prodsize$ has a bijective mapping to the set of values in $\wElem'_1,\ldots,\wElem'_\prodsize$.
%\subsection{M-tuples}
%\begin{Definition}
%Given a $\prodsize$-way join, define $\dist \in [\prodsize]$. An \dist-tuple then is a set of tuples, each tuple conatining $\dist$ elements, such that the values of each tuple sum up to $\dist$, i.e. $\forall i \in [\dist], \sum_j \dist_{t_{i, j}} = \dist$, where i is the $i^{th}$ tuple in $\dist_t$, and $j$ is the $j^{th}$ index of that tuple $t$. The set consists of each unique sum up to symmetry, meaning a tuple with the same elements only reversed is disallowed.
%\end{Definition}
%For example, when $\prodsize = 4$, $\dist = 2$, the \dist-tuple, denoted, $\dist_2$, would be$\left\{\left(1, 3\right), \left(2, 2\right)\right\}$. Here, $\dist_{2_{1, 1}} = 1$, and while the tuple $\left(3, 1\right)$ sums up to $\prodsize = 4$, we do not include it since we have it's symmetrical term $\left(1, 3\right)$.
%
%\AR{Why is the definition of M-tuples needed? From what I understand you need this to define what kinds of $f$ and $f'$ are allowed but in that case why not state those properties directly in terms of $f$ and $f'$? Actually after reading the next section, I do not see why these properties are needed at all..}
%\AH{I use the \dist-tuples to explain 1) what kind of matchings survive and 2) that $f, f'$ must only cross product from within the matchings of the same tuple. Maybe there is an easier way to do this.}
\subsection{f, f'}
To help describe all possible matchings we introduce functions $f$ and $f'$.
\begin{Definition}
Functions $f$ and $f'$ are surjective mappings from $\prodsize$ indices to $\dist$ and $\dist'$ elements, respectively: $f: [\prodsize] \rightarrow [\dist]$, $f': [\prodsize] \rightarrow [\dist'].$
\end{Definition}
%\begin{equation*}
%f(i) = \begin{cases}
% \widetilde{w_1} &f(i) = 1\\
% \widetilde{w_2} &f(i) = 2\\
% \vdots &\vdots\\
% \widetilde{w_\dist} &f(i) = \dist.
% \end{cases}
%\end{equation*}
The functions $f, f'$ are used to produce the mappings $w_i \mapsto \widetilde{w_{f(i)}}$. In particular, $f$ and $f'$ are machinery for mapping $\prodsize$ $\wElem$-world variables to $\dist$ distinct values.
We rewrite equation \eqref{eq:sig-j-last} in terms of $\dist$ distinct worlds, with $f, f'$ mappings.
\begin{equation}
\sum_{\dist \in [\prodsize]}\sum_{\dist' \in [\prodsize]}\sum_{f, f'}\sum_{\substack{\widetilde{\wElem_1}, \ldots,\widetilde{\wElem_\dist},\\\widetilde{\wElem'_1},\ldots,\widetilde{\wElem'_{\dist'}}\\ \in W}}\prod_{i = 1}^{\prodsize}\vect_i(\widetilde{\wElem_{f(i)}})\vect_i(\widetilde{\wElem'_{f'(i)}})\cdot\left( \ex{\prod_{i = 1}^\prodsize \sine(\widetilde{\wElem_{f(i)}})\conj{\sine(\widetilde{\wElem'_{f'(i)}})}\ind{h(\widetilde{\wElem_{f(i)}}) = j}\ind{h(\widetilde{\wElem'_{f'(i)}}) = j}} -
\ex{\prod_{i = 1}^\prodsize \sine(\widetilde{\wElem_{f(i)}})\ind{h(\widetilde{\wElem_{f(i)}}) = j}}\cdot \ex{\prod_{i = 1}^\prodsize\conj{\sine(\widetilde{\wElem'_{f'(i)}})}\ind{h(\widetilde{\wElem'_{f'(i)}}) = j}} \right).\label{eq:sig-j-distinct}
\end{equation}
Note that for a given $\dist$, we may have several ways to map $\prodsize$ worlds to $\dist$ distinct values. We need to define what it means for $f$ and $f'$ to be matching.
\begin{Definition}
Functions $f:[\prodsize]\to [\dist]$ and $f':[\prodsize]\to [\dist']$ are said to be matching, denoted $\match{f}{f'}$, if and only if
\begin{enumerate}
\item $\dist = \dist'$
\item $\{|f^{-1}(i)| ~|~ i \in [\dist]\} = \{|f'^{-1}(i')| ~|~ i' \in [\dist] \}$ as multisets, i.e., the multiset of preimage cardinalities for $f$ equals the multiset of preimage cardinalities for $f'$.
% \item $\forall i \in [\dist], |f^{-1}(i)| = |f'^{-1}(i)|$, or a symmetrical mapping exists, where $\forall i \in [\dist], \exists i' \in [\dist]$ such that $i'$ is unique, $|f^{-1}(i)| = |f^{-1}(i')|$.
\end{enumerate}
\end{Definition}
\begin{Lemma}\label{lem:sig-j-survive}
The only terms surviving the expectation of equation \eqref{eq:sig-j-distinct} are those with $f, f'$ matching, where $\forall j \in[\dist], \widetilde{\wElem_j} = \widetilde{\wElem'_j}$.
\end{Lemma}
We state what the expectation looks like when $f, f'$ are not matching. From \cref{eq:sig-j-last} it can be seen that if $\term_1 = \term_2 = 0$, then there are no surviving terms. Beginning with \cref{eq:sig-j-last}, we first look into the case when $\dist \neq \dist'$. By the fact that $\dist \neq \dist'$ we know that one set of variables has at least one more distinct world than the other set of variables. Without loss of generality, assume that $\dist < \dist'$. Looking at $\term_1$,
\begin{equation}
%&\sum_{\substack{\wElem_1,\ldots,\wElem_{\dist},\\ \wElem_1',\ldots,\wElem_{\dist}'\\\in \wSet}}\prod_{i = 1}^{\prodsize}\vect_i(\wElem_i)\vect_i(\wElem_i')
% \left(\ex{\prod_{i = 1}^{k}\sine(\wElem_i)\conj{\sine(\wElem_i')}\ind{\hfunc(\wElem_i) = \buck}\ind{\hfunc(\wElem_i') = \buck}} -
% \ex{\prod_{i = 1}^{k}\sine(\wElem_i)\ind{\hfunc(\wElem_i) = \buck}}\ex{\prod_{i = 1}\conj{\sine(\wElem'_i)}\ind{\hfunc(\wElem'_i) = \buck}} \right) \\
\term_1 = \ex{\sine(\wElem_1)^{\dupSize_1}\cdots\sine(\wElem_\dist)^{\dupSize_\dist}\cdot\conj{\sine(\wElem_1')}^{\dupSize_1'}\cdots\conj{\sine(\wElem_{\dist}')}^{\dupSize_\dist'}\cdots\conj{\sine(\wElem_{\dist'}')}^{\dupSize_{\dist'}'}} = 0.
\end{equation}
Notice that, since $\dist < \dist'$, there will be $\sum\limits_{n \in [\dist'] \setminus [\dist]}\dupSize_n'$ world values on either side that do not have a match, because the number of factors $\prodsize$ in each product is fixed. This leaves us with at least one $\sine(\wElem_i)^{\dupSize_n - \dupSize_{n'}} \cdot \conj{\sine(\wElem_{\dist'}')}^{\dupSize_{\dist'}'}$ pairing whose expectation is $0$. Since we have at least one extra \emph{distinct} world value, whose conjugate sine value is paired with the sine value of another distinct world value, the expectation equals zero.
\newline
The proof is immediate and follows from the fact that the random $\sine$ functions are only guaranteed to produce a product of one under one of two possible conditions:
\begin{enumerate}
\item $\sine(\wElem)^\prodsize = 1$,
\item $\sine(\wElem) \conj{\sine(\wElem)} = 1$.\qed
\end{enumerate}
%\AH{Here is where I have attempted to use prose to discuss the restrictions on $f$ and $f'$, rather than the use of \dist-tuples. Maybe there is a better, cleaner formal way?}
%E.g., for $\prodsize = 4, \dist = 2$, mappings could be such that one $\wElem_i$ is distinct, while the other three $\wElem_i$ are mapped to the other distinct value. Additionally, we would have the case where two $\wElem_i$ map to a distinct value, while the other two $\wElem_i$ map to a seperate distinct world. The expectations of equation \eqref{eq:sig-j-last} restrict $f$ and $f'$ to belonging to the same class of $\dist$-mapping, meaning, if the mapping $f$ for $\prodsize = 4, \dist = 2$ is in the setting of one distinct world and three equal world values, then $f'$ must be from that same set of mappings, and not from another class of mappings, such as when two $w_i$ map to a distinct world, while the other two $w_i$ map to a separate distinct world.
%\AH{Here is the use of \dist-tuples to explain the same thing.}
% In the example above, $f$ mappings for $\dist_{2_1}$ may only cross product with $f'$ mappings for $\dist_{2_1}$ and not with those for $\dist_{2_2}$. Likewise for $f, f'$ mappings of $\dist_{2_2}$.
Using the above definitions, we can now present the variance bounds for $\sigsq_j$ based on \eqref{eq:sig-j-distinct}.
Since the expectations cancel when $\forall i, i', j, j'\in [\prodsize], \wElem_i = \wElem_j = \wElem_{i'}' = \wElem_{j'}' = \wElem$, we can discard the case in which there exists only one distinct world value. We then need to sum over all the possibilities of $\dist$ distinct world values for $\dist \in [2, \prodsize]$. Note that the number of distinct values $\dist$ affects the randomness of the hash function $\hfunc$. For example, exactly $\dist = 2$ distinct values will yield $\frac{1}{\sketchCols} \cdot \frac{1}{\sketchCols} = \frac{1}{\sketchCols^2} = \frac{1}{\sketchCols^\dist}$. By Lemma~\ref{lem:sig-j-survive} and equation~\eqref{eq:sig-j-distinct} we get
%
%\begin{equation*}
%\frac{1}{\sketchCols^2}\sum_{\widetilde{\wElem_1}, \widetilde{\wElem_2}}\prod_{i = 1}^{\prodsize}\vect_i(\widetilde{\wElem_{f(i)}})\vect_i(\widetilde{\wElem_{f'(i)}}).
%\end{equation*}
%This is because we know that the expectation from \eqref{eq:sig-j-last} will survive when we have mappings that produce pairs of the form $\sine(\wElem)\conj{\sine(\wElem)}$. Second, in consideration of the randomized hashing, with two distinct variables, the indicator variables in the expectation yield $\frac{1}{\sketchCols}\cdot \frac{1}{\sketchCols}$.
%
%We need to sum over all mappings for each case (c) when the number of distinct values is $\dist = 2$, resulting in
%\begin{equation*}
%\frac{1}{\sketchCols^2}\sum_{\widetilde{\wElem_1}, \widetilde{\wElem_2}}\sum_{c \in \dist = 2}\sum_{f, f'}\prod_{i = 1}^{\prodsize}\vect_i(\widetilde{\wElem_{f(i)}})\vect_i(\widetilde{\wElem_{f'(i)}}).
%\end{equation*}
%
%Finally, we need to do this for all $\dist$.
\begin{equation*}
\sigsq_j = \sum_{\dist \in [2, \prodsize]} \frac{1}{\sketchCols^\dist} \sum_{\widetilde{w_1}\ldots\widetilde{w_\dist}\in W} \sum_{\substack{f, f',\\\match{f}{f'}}} \prod_{i = 1}^{\prodsize} v_i(\widetilde{w_{f(i)}}) v_i(\widetilde{w_{f'(i)}}).
\end{equation*}