paper-BagRelationalPDBsAreHard/sop.tex

112 lines
12 KiB
TeX
Raw Normal View History

2020-03-26 12:38:07 -04:00
%root--main.tex
\section{Sum of Products Analysis}
We now seek to bound the variance of a $\prodsize$-way join.
2020-03-26 12:38:07 -04:00
\begin{align}
&\sigsq_j = \ex{\est_j \cdot \overline{\est_j}} - \ex{\est_j} \cdot \ex{\overline{\est_j}} \nonumber\\
&= \ex{\prod_{i = 1}^{\prodsize}\sum_{w \in W_j}v_i(w)s(w) \cdot \prod_{i = 1}^\prodsize\sum_{w' \in W_j}v_i(w')\overline{s(w')}} -
\ex{\prod_{i = 1}^{\prodsize}\sum_{w \in W_j}v_i(w)s(w)}\cdot \ex{\prod_{i = 1}^\prodsize\sum_{w' \in W_j}v_i(w')\overline{s(w')}}\nonumber\\
&= \ex{\sum_{\substack{w_1...w_\prodsize\\w'_1...w'_\prodsize\\ \in W}}\prod_{i = 1}^\prodsize v_i(w_i)v_i(w'_i)s(w_i)\overline{s(w'_i)}\ind{h(w_i) = j}\ind{h(w'_i) = j}} -
\ex{\sum_{w_1...w_\prodsize \in W} \prod_{i = 1}^\prodsize v_i(w_i)s(w_i)\ind{h(w_i) = j}} \cdot
\ex{\sum_{w'_1...w'_\prodsize \in W} \prod_{i = 1}^\prodsize v_i(w'_i)\overline{s(w'_i)}\ind{h(w'_i) = j}}\nonumber\\
&= \sum_{\substack{w_1...w_\prodsize\\w'_1...w'_\prodsize\\ \in W}}\left(\ex{\prod_{i = 1}^\prodsize v_i(w_i)v_i(w'_i)s(w_i)\overline{s(w'_i)}\ind{h(w_i) = j}\ind{h(w'_i) = j}} -
\ex{\prod_{i = 1}^\prodsize v_i(w_i)s(w_i)\ind{h(w_i) = j}} \cdot \ex{\prod_{i = 1}^\prodsize v_i(w'_i)\overline{s(w'_i)}\ind{h(w'_i) = j}}\right)\nonumber\\
&= \sum_{\substack{w_1...w_\prodsize\\w'_1...w'_\prodsize\\ \in W}}\prod_{i = 1}^\prodsize v_i(w_i)v_i(w'_i)\cdot\left( \ex{\prod_{i = 1}^\prodsize s(w_i)\overline{s(w'_i)}\ind{h(w_i) = j}\ind{h(w'_i) = j}} -
\ex{\prod_{i = 1}^\prodsize s(w_i)\ind{h(w_i) = j}}\cdot \ex{\prod_{i = 1}^\prodsize\overline{s(w'_i)}\ind{h(w'_i) = j}} \right)\label{eq:sig-j-last}.
2020-03-26 12:38:07 -04:00
\end{align}
Before proceeding, we introduce some notation and terminology that will aid in communicating the bounds we are about to establish. First we refer to the expectation computations as
\[\term = \ex{\prod_{i = 1}^\prodsize s(w_i)\overline{s(w'_i)}\ind{h(w_i) = j}\ind{h(w'_i) = j}} -
\ex{\prod_{i = 1}^\prodsize s(w_i)\ind{h(w_i) = j}}\cdot \ex{\prod_{i = 1}^\prodsize\overline{s(w'_i)}\ind{h(w'_i) = j}} \text{,} \]
\[\term_1 = \ex{\prod_{i = 1}^\prodsize s(w_i)\overline{s(w'_i)}\ind{h(w_i) = j}\ind{h(w'_i) = j}} \text{, and}\]
\[\term_2 = \ex{\prod_{i = 1}^\prodsize s(w_i)\ind{h(w_i) = j}}\cdot \ex{\prod_{i = 1}^\prodsize\overline{s(w'_i)}\ind{h(w'_i) = j}}. \]
We use the word \emph{term} to denote the value that equation~\eqref{eq:sig-j-last} computes for a specific assignment of world values. A term \emph{survives} the expectation if the value computed by~\eqref{eq:sig-j-last} for that assignment is nonzero. Note that the only terms that survive the expectation above are those with $w_i = w'_j = w$ for $i, j \in [\prodsize]$, such that each $w_i$ has a match, i.e., no $w_i$ or $w'_j$ stands alone without a matching world in its complementary set. In other words, the set of values in $\wElem_1,\ldots,\wElem_\prodsize$ has a bijective mapping to the set of values in $\wElem'_1,\ldots,\wElem'_\prodsize$.
2020-03-26 12:38:07 -04:00
2020-03-31 11:52:00 -04:00
%\subsection{M-tuples}
%\begin{Definition}
%Given a $\prodsize$-way join, define $\dist \in [\prodsize]$. An \dist-tuple then is a set of tuples, each tuple conatining $\dist$ elements, such that the values of each tuple sum up to $\dist$, i.e. $\forall i \in [\dist], \sum_j \dist_{t_{i, j}} = \dist$, where i is the $i^{th}$ tuple in $\dist_t$, and $j$ is the $j^{th}$ index of that tuple $t$. The set consists of each unique sum up to symmetry, meaning a tuple with the same elements only reversed is disallowed.
2020-03-31 11:52:00 -04:00
%\end{Definition}
%For example, when $\prodsize = 4$, $\dist = 2$, the \dist-tuple, denoted, $\dist_2$, would be$\left\{\left(1, 3\right), \left(2, 2\right)\right\}$. Here, $\dist_{2_{1, 1}} = 1$, and while the tuple $\left(3, 1\right)$ sums up to $\prodsize = 4$, we do not include it since we have it's symmetrical term $\left(1, 3\right)$.
2020-03-31 11:52:00 -04:00
%
%\AR{Why is the definition of M-tuples needed? From what I understand you need this to define what kinds of $f$ and $f'$ are allowed but in that case why not state those properties directly in terms of $f$ and $f'$? Actually after reading the next section, I do not see why these properties are needed at all..}
2020-04-01 10:57:37 -04:00
%\AH{I use the \dist-tuples to explain 1) what kind of matchings survive and 2) that $f, f'$ must only cross product from within the matchings of the same tuple. Maybe there is an easier way to do this.}
2020-03-26 20:15:00 -04:00
2020-03-26 12:38:07 -04:00
\subsection{$f$, $f'$}
2020-03-31 11:52:00 -04:00
To help describe all possible matchings we introduce functions $f$ and $f'$.
2020-03-26 12:38:07 -04:00
\begin{Definition}
Functions $f$ and $f'$ are surjective mappings from $\prodsize$ elements to $\dist$ and $\dist'$ elements, respectively: $f\colon [\prodsize] \rightarrow [\dist]$ and $f'\colon [\prodsize] \rightarrow [\dist']$.
2020-03-26 12:38:07 -04:00
\end{Definition}
2020-03-27 12:10:41 -04:00
%\begin{equation*}
%f(i) = \begin{cases}
% \widetilde{w_1} &f(i) = 1\\
% \widetilde{w_2} &f(i) = 2\\
% \vdots &\vdots\\
2020-04-01 10:57:37 -04:00
% \widetilde{w_\dist} &f(i) = \dist.
2020-03-27 12:10:41 -04:00
% \end{cases}
%\end{equation*}
The functions $f, f'$ are used to produce the mappings $w_i \mapsto \widetilde{w_{f(i)}}$. In particular, $f$ and $f'$ are machinery for mapping $\prodsize$ $\wElem$-world variables to $\dist$ distinct values.
2020-04-01 10:57:37 -04:00
We rewrite equation \eqref{eq:sig-j-last} in terms of $\dist$ distinct worlds, with $f, f'$ mappings.
\begin{equation}
\sum_{\dist \in [\prodsize]}\sum_{\dist' \in [\prodsize]}\sum_{f, f'}\sum_{\substack{\wElem_1, \ldots,\wElem_\dist,\\\wElem'_1,\ldots,\wElem'_{\dist'}\\ \in W}}\prod_{i = 1}^{\prodsize}\vect_i(\widetilde{\wElem_{f(i)}})\vect_i(\widetilde{\wElem'_{f'(i)}})\cdot\left( \ex{\prod_{i = 1}^\prodsize \sine(\widetilde{\wElem_{f(i)}})\conj{\sine(\widetilde{\wElem'_{f'(i)}})}\ind{h(\widetilde{\wElem_{f(i)}}) = j}\ind{h(\widetilde{\wElem'_{f'(i)}}) = j}} -
\ex{\prod_{i = 1}^\prodsize \sine(\widetilde{\wElem_{f(i)}})\ind{h(\widetilde{\wElem_{f(i)}}) = j}}\cdot \ex{\prod_{i = 1}^\prodsize\conj{\sine(\widetilde{\wElem'_{f'(i)}})}\ind{h(\widetilde{\wElem'_{f'(i)}}) = j}} \right)\label{eq:sig-j-distinct}
2020-04-01 10:57:37 -04:00
\end{equation}
Note that for a given $\dist$, we may have several ways to map $\prodsize$ worlds to $\dist$ distinct values. We need to define what it means for $f$ and $f'$ to be matching.
2020-03-26 20:15:00 -04:00
2020-03-31 11:52:00 -04:00
\begin{Definition}
Functions $f\colon[\prodsize]\to [\dist]$ and $f'\colon[\prodsize]\to [\dist']$ are said to be matching, denoted $\match{f}{f'}$, if and only if
2020-03-31 11:52:00 -04:00
\begin{enumerate}
2020-04-01 10:57:37 -04:00
\item $\dist = \dist'$
\item $\{|f^{-1}(i)| \mid i \in [\dist]\} = \{|f'^{-1}(i')| \mid i' \in [\dist] \}$ as multisets, i.e., the multiset of preimage cardinalities for $f$ equals the multiset of preimage cardinalities for $f'$.
2020-04-01 10:57:37 -04:00
% \item $\forall i \in [\dist], |f^{-1}(i)| = |f'^{-1}(i)|$, or a symmetrical mapping exists, where $\forall i \in [\dist], \exists i' \in [\dist]$ such that $i'$ is unique, $|f^{-1}(i)| = |f^{-1}(i')|$.
2020-03-31 11:52:00 -04:00
\end{enumerate}
\end{Definition}
2020-04-01 10:57:37 -04:00
\begin{Lemma}\label{lem:sig-j-survive}
The only terms surviving the expectation of equation \eqref{eq:sig-j-distinct} are those with $f, f'$ matching, where $\forall j \in[\dist], \widetilde{\wElem_j} = \widetilde{\wElem'_j}$.
2020-03-31 11:52:00 -04:00
\end{Lemma}
We state what the expectation looks like when $f, f'$ are not matching. From \cref{eq:sig-j-last} it can be seen that if $\term_1 = \term_2 = 0$, then there are no surviving terms. Beginning with \cref{eq:sig-j-last}, we first look into the case when $\dist \neq \dist'$. By the fact that $\dist \neq \dist'$ we know that one set of variables has at least one more distinct world than the other set of variables. Without loss of generality, assume that $\dist < \dist'$. Looking at $\term_1$,
\begin{equation}
%&\sum_{\substack{\wElem_1,\ldots,\wElem_{\dist},\\ \wElem_1',\ldots,\wElem_{\dist}'\\\in \wSet}}\prod_{i = 1}^{\prodsize}\vect_i(\wElem_i)\vect_i(\wElem_i')
% \left(\ex{\prod_{i = 1}^{k}\sine(\wElem_i)\conj{\sine(\wElem_i')}\ind{\hfunc(\wElem_i) = \buck}\ind{\hfunc(\wElem_i') = \buck}} -
% \ex{\prod_{i = 1}^{k}\sine(\wElem_i)\ind{\hfunc(\wElem_i) = \buck}}\ex{\prod_{i = 1}\conj{\sine(\wElem'_i)}\ind{\hfunc(\wElem'_i) = \buck}} \right) \\
\term_1 = \ex{\sine(\wElem_1)^{\dupSize_1}\cdots\sine(\wElem_{\dist})^{\dupSize_{\dist}}\cdot\conj{\sine(\wElem_1')}^{\dupSize_1'}\cdots\conj{\sine(\wElem_{\dist}')}^{\dupSize_{\dist}'}\cdots\conj{\sine(\wElem_{\dist'}')}^{\dupSize_{\dist'}'}} = 0.
\end{equation}
Notice that, with $\dist < \dist'$, there will be $\sum\limits_{n \in [\dist'] \setminus [\dist]}\dupSize_n'$ world values on either side that do not have a match, since the number of factors in each product, $\prodsize$, is fixed. This leaves us with at least one pairing of the form $\sine(\wElem_i)^{\dupSize_n - \dupSize_{n'}} \cdot \conj{\sine(\wElem_{\dist'}')}^{\dupSize_{\dist'}'}$, whose expectation is $0$. Since there is at least one extra \emph{distinct} world value, the conjugate of whose sine value is paired with the sine value of a different distinct world value, the expectation equals zero.
\newline
2020-03-31 11:52:00 -04:00
The proof is immediate and follows from the fact that the random $\sine$ functions are only guaranteed to produce a product of one under one of two possible conditions:
\begin{enumerate}
\item $\sine(\wElem)^\prodsize = 1$,
2020-03-31 11:52:00 -04:00
\item $\sine(\wElem) \conj{\sine(\wElem)} = 1$.\qed
\end{enumerate}
2020-03-26 20:15:00 -04:00
2020-03-26 12:38:07 -04:00
2020-04-01 10:57:37 -04:00
%\AH{Here is where I have attempted to use prose to discuss the restrictions on $f$ and $f'$, rather than the use of \dist-tuples. Maybe there is a better, cleaner formal way?}
%E.g., for $\prodsize = 4, \dist = 2$, mappings could be such that one $\wElem_i$ is distinct, while the other three $\wElem_i$ are mapped to the other distinct value. Additionally, we would have the case where two $\wElem_i$ map to a distinct value, while the other two $\wElem_i$ map to a seperate distinct world. The expectations of equation \eqref{eq:sig-j-last} restrict $f$ and $f'$ to belonging to the same class of $\dist$-mapping, meaning, if the mapping $f$ for $\prodsize = 4, \dist = 2$ is in the setting of one distinct world and three equal world values, then $f'$ must be from that same set of mappings, and not from another class of mappings, such as when two $w_i$ map to a distinct world, while the other two $w_i$ map to a separate distinct world.
2020-04-01 10:57:37 -04:00
%\AH{Here is the use of \dist-tuples to explain the same thing.}
% In the example above, $f$ mappings for $\dist_{2_1}$ may only cross product with $f'$ mappings for $\dist_{2_1}$ and not with those for $\dist_{2_2}$. Likewise for $f, f'$ mappings of $\dist_{2_2}$.
2020-03-26 12:38:07 -04:00
2020-04-01 10:57:37 -04:00
Using the above definitions, we can now present the variance bounds for $\sigsq_j$ based on \eqref{eq:sig-j-distinct}.
2020-03-27 12:10:41 -04:00
Because the expectations cancel when $\wElem_i = \wElem_j = \wElem_{i'}' = \wElem_{j'}' = \wElem$ for all $i, i', j, j'\in [\prodsize]$, we can discard the case in which there is only one distinct world value. We then need to sum over all possibilities with $\dist$ distinct world values, for $\dist \in [2, \prodsize]$. Note that the number of distinct values $\dist$ affects the factor contributed by the random hash function $\hfunc$. E.g., exactly $\dist = 2$ distinct values yield $\frac{1}{\sketchCols} \cdot \frac{1}{\sketchCols} = \frac{1}{\sketchCols^2} = \frac{1}{\sketchCols^\dist}$. By Lemma~\ref{lem:sig-j-survive} and equation~\eqref{eq:sig-j-distinct} we get
2020-04-01 10:57:37 -04:00
%
%\begin{equation*}
%\frac{1}{\sketchCols^2}\sum_{\widetilde{\wElem_1}, \widetilde{\wElem_2}}\prod_{i = 1}^{\prodsize}\vect_i(\widetilde{\wElem_{f(i)}})\vect_i(\widetilde{\wElem_{f'(i)}}).
2020-04-01 10:57:37 -04:00
%\end{equation*}
%This is because we know that the expectation from \eqref{eq:sig-j-last} will survive when we have mappings that produce pairs of the form $\sine(\wElem)\conj{\sine(\wElem)}$. Second, in consideration of the randomized hashing, with two distinct variables, the indicator variables in the expectation yield $\frac{1}{\sketchCols}\cdot \frac{1}{\sketchCols}$.
%
%We need to sum over all mappings for each case (c) when the number of distinct values is $\dist = 2$, resulting in
%\begin{equation*}
%\frac{1}{\sketchCols^2}\sum_{\widetilde{\wElem_1}, \widetilde{\wElem_2}}\sum_{c \in \dist = 2}\sum_{f, f'}\prod_{i = 1}^{\prodsize}\vect_i(\widetilde{\wElem_{f(i)}})\vect_i(\widetilde{\wElem_{f'(i)}}).
2020-04-01 10:57:37 -04:00
%\end{equation*}
%
%Finally, we need to do this for all $\dist$.
2020-03-27 12:10:41 -04:00
\begin{equation*}
\sigsq_j = \sum_{\dist \in [2, \prodsize]} \frac{1}{\sketchCols^\dist} \sum_{\widetilde{w_1}\ldots\widetilde{w_\dist}\in W} \sum_{\substack{f, f',\\\match{f}{f'}}} \prod_{i = 1}^{\prodsize} v_i(\widetilde{w_{f(i)}}) v_i(\widetilde{w_{f'(i)}}).
2020-03-26 20:15:00 -04:00
\end{equation*}