paper-BagRelationalPDBsAreHard/sop.tex

94 lines
8.8 KiB
TeX
Raw Normal View History

2020-03-26 12:38:07 -04:00
%root--main.tex
\section{Sum of Products Analysis}
We now seek to bound the variance of a $k$-way join.
\begin{align}
&\sigsq_j = \ex{est_j \cdot \overline{est_j}} - \ex{est_j} \cdot \ex{\overline{est_j}} \nonumber\\
&= \ex{\prod_{i = 1}^{k}\sum_{w \in W_j}v_i(w)s(w) \cdot \prod_{i = 1}^k\sum_{w' \in W_j}v_i(w')\overline{s(w')}} -
\ex{\prod_{i = 1}^{k}\sum_{w \in W_j}v_i(w)s(w)}\cdot \ex{\prod_{i = 1}^k\sum_{w' \in W_j}v_i(w')\overline{s(w')}}\nonumber\\
&= \ex{\sum_{\substack{w_1...w_k\\w'_1...w'_k\\ \in W}}\prod_{i = 1}^k v_i(w_i)v_i(w'_i)s(w_i)\overline{s(w'_i)}\ind{h(w_i) = j}\ind{h(w'_i) = j}} -
\ex{\sum_{w_1...w_k \in W} \prod_{i = 1}^k v_i(w_i)s(w_i)\ind{h(w_i) = j}} \cdot
\ex{\sum_{w'_1...w'_k \in W} \prod_{i = 1}^k v_i(w'_i)\overline{s(w'_i)}\ind{h(w'_i) = j}}\nonumber\\
&= \sum_{\substack{w_1...w_k\\w'_1...w'_k\\ \in W}}\left(\ex{\prod_{i = 1}^k v_i(w_i)v_i(w'_i)s(w_i)\overline{s(w'_i)}\ind{h(w_i) = j}\ind{h(w'_i) = j}} -
\ex{\prod_{i = 1}^kv_i(w_i)s(w_i)\ind{h(w_i) = j}} \cdot \ex{\prod_{i = 1}^k v_i(w'_i)\overline{s(w'_i)}\ind{h(w'_i) = j}}\right)\nonumber\\
&= \sum_{\substack{w_1...w_k\\w'_1...w'_k\\ \in W}}\prod_{i = 1}^k v_i(w_i)v_i(w'_i)\cdot\left( \ex{\prod_{i = 1}^k s(w_i)\overline{s(w'_i)}\ind{h(w_i) = j}\ind{h(w'_i) = j}} -
\ex{\prod_{i = 1}^ks(w_i)\ind{h(w_i) = j}}\cdot \ex{\prod_{i = 1}^k\overline{s(w'_i)}\ind{h(w'_i) = j}} \right)\label{eq:sig-j-last}.
\end{align}
2020-03-31 11:52:00 -04:00
Before proceeding, we introduce some notation that will aid in communicating the bounds we are about to establish. First note that the only terms that survive the expectation above are mappings of $w_i = w'_j = w$ for $i, j \in [k]$, such that each $w_i$ has a match, i.e., no $w_i$ or $w'_j$ stands alone without a matching world in its complementary set.
2020-03-26 12:38:07 -04:00
2020-03-31 11:52:00 -04:00
%\subsection{M-tuples}
%\begin{Definition}
2020-04-01 10:57:37 -04:00
%Given a $k$-way join, define $\dist \in [k]$. An \dist-tuple then is a set of tuples, each tuple conatining $\dist$ elements, such that the values of each tuple sum up to $\dist$, i.e. $\forall i \in [\dist], \sum_j \dist_{t_{i, j}} = \dist$, where i is the $i^{th}$ tuple in $\dist_t$, and $j$ is the $j^{th}$ index of that tuple $t$. The set consists of each unique sum up to symmetry, meaning a tuple with the same elements only reversed is disallowed.
2020-03-31 11:52:00 -04:00
%\end{Definition}
2020-04-01 10:57:37 -04:00
%For example, when $k = 4$, $\dist = 2$, the \dist-tuple, denoted, $\dist_2$, would be$\left\{\left(1, 3\right), \left(2, 2\right)\right\}$. Here, $\dist_{2_{1, 1}} = 1$, and while the tuple $\left(3, 1\right)$ sums up to $k = 4$, we do not include it since we have it's symmetrical term $\left(1, 3\right)$.
2020-03-31 11:52:00 -04:00
%
%\AR{Why is the definition of M-tuples needed? From what I understand you need this to define what kinds of $f$ and $f'$ are allowed but in that case why not state those properties directly in terms of $f$ and $f'$? Actually after reading the next section, I do not see why these properties are needed at all..}
2020-04-01 10:57:37 -04:00
%\AH{I use the \dist-tuples to explain 1) what kind of matchings survive and 2) that $f, f'$ must only cross product from within the matchings of the same tuple. Maybe there is an easier way to do this.}
2020-03-26 20:15:00 -04:00
2020-03-26 12:38:07 -04:00
\subsection{$f, f'$}
2020-03-31 11:52:00 -04:00
To help describe all possible matchings we introduce functions $f$ and $f'$.
2020-03-26 12:38:07 -04:00
\begin{Definition}
The functions $f$ and $f'$ range over the surjective mappings from $[k]$ onto $[\dist]$ and $[\dist']$, respectively: $f: [k] \rightarrow [\dist]$, $f': [k] \rightarrow [\dist']$.
\end{Definition}
2020-03-27 12:10:41 -04:00
%\begin{equation*}
%f(i) = \begin{cases}
% \widetilde{w_1} &f(i) = 1\\
% \widetilde{w_2} &f(i) = 2\\
% \vdots &\vdots\\
2020-04-01 10:57:37 -04:00
% \widetilde{w_\dist} &f(i) = \dist.
2020-03-27 12:10:41 -04:00
% \end{cases}
%\end{equation*}
2020-04-01 10:57:37 -04:00
The functions $f, f'$ are used to produce the mappings $w_i \mapsto \widetilde{w_{f(i)}}$. In particular, $f$ and $f'$ are machinery for mapping $k$ $\wElem$-world variables to $\dist$ distinct values.
We rewrite equation~\eqref{eq:sig-j-last} in terms of $\dist$ distinct worlds, with $f, f'$ mappings.
\begin{equation}
\sum_{\dist \in [k]}\sum_{\dist' \in [k]}\sum_{f, f'}\sum_{\substack{\widetilde{\wElem_1}, \cdots,\widetilde{\wElem_\dist},\\\widetilde{\wElem'_1},\cdots,\widetilde{\wElem'_{\dist'}}\\ \in W}}\prod_{i = 1}^{k}\vect_i(\widetilde{\wElem_{f(i)}})\vect_i(\widetilde{\wElem'_{f'(i)}})\cdot\left( \ex{\prod_{i = 1}^k \sine(\widetilde{\wElem_{f(i)}})\conj{\sine(\widetilde{\wElem'_{f'(i)}})}\ind{h(\widetilde{\wElem_{f(i)}}) = j}\ind{h(\widetilde{\wElem'_{f'(i)}}) = j}} -
\ex{\prod_{i = 1}^k \sine(\widetilde{\wElem_{f(i)}})\ind{h(\widetilde{\wElem_{f(i)}}) = j}}\cdot \ex{\prod_{i = 1}^k\conj{\sine(\widetilde{\wElem'_{f'(i)}})}\ind{h(\widetilde{\wElem'_{f'(i)}}) = j}} \right)\label{eq:sig-j-distinct}
\end{equation}
Note that for a given $\dist$, we may have several ways to map $k$ worlds to $\dist$ distinct values. We need to define what it means for $f$ and $f'$ to be matching.
2020-03-26 20:15:00 -04:00
2020-03-31 11:52:00 -04:00
\begin{Definition}
Functions $f:[k]\rightarrow [\dist]$ and $f':[k]\rightarrow [\dist']$ are said to be matching, denoted $\match{f}{f'}$, if and only if
\begin{enumerate}
\item $\dist = \dist'$, and
\item $\{f^{-1}(i) ~|~ i \in [\dist]\} = \{f'^{-1}(i') ~|~ i' \in [\dist] \}$, i.e., the set of preimages for $f$ equals the set of preimages for $f'$.
% \item $\forall i \in [\dist], |f^{-1}(i)| = |f'^{-1}(i)|$, or a symmetrical mapping exists, where $\forall i \in [\dist], \exists i' \in [\dist]$ such that $i'$ is unique, $|f^{-1}(i)| = |f^{-1}(i')|$.
\end{enumerate}
\end{Definition}
2020-04-01 10:57:37 -04:00
\begin{Lemma}\label{lem:sig-j-survive}
The only terms surviving the expectation of equation~\eqref{eq:sig-j-distinct} are those with $f, f'$ matching, where $\forall \ell \in[\dist], \widetilde{\wElem_\ell} = \widetilde{\wElem'_\ell}$.
2020-03-31 11:52:00 -04:00
\end{Lemma}
The proof is immediate and follows from the fact that the random $\sine$ functions are only guaranteed to produce a product of one under one of two possible conditions:
\begin{enumerate}
\item $\sine(\wElem)^k = 1$,
\item $\sine(\wElem) \conj{\sine(\wElem)} = 1$.\qed
\end{enumerate}
2020-03-26 20:15:00 -04:00
2020-03-26 12:38:07 -04:00
2020-04-01 10:57:37 -04:00
%\AH{Here is where I have attempted to use prose to discuss the restrictions on $f$ and $f'$, rather than the use of \dist-tuples. Maybe there is a better, cleaner formal way?}
%E.g., for $k = 4, \dist = 2$, mappings could be such that one $\wElem_i$ is distinct, while the other three $\wElem_i$ are mapped to the other distinct value. Additionally, we would have the case where two $\wElem_i$ map to a distinct value, while the other two $\wElem_i$ map to a seperate distinct world. The expectations of equation \eqref{eq:sig-j-last} restrict $f$ and $f'$ to belonging to the same class of $\dist$-mapping, meaning, if the mapping $f$ for $k = 4, \dist = 2$ is in the setting of one distinct world and three equal world values, then $f'$ must be from that same set of mappings, and not from another class of mappings, such as when two $w_i$ map to a distinct world, while the other two $w_i$ map to a separate distinct world.
%\AH{Here is the use of \dist-tuples to explain the same thing.}
% In the example above, $f$ mappings for $\dist_{2_1}$ may only cross product with $f'$ mappings for $\dist_{2_1}$ and not with those for $\dist_{2_2}$. Likewise for $f, f'$ mappings of $\dist_{2_2}$.
2020-03-26 12:38:07 -04:00
2020-04-01 10:57:37 -04:00
Using the above definitions, we can now present the variance bounds for $\sigsq_j$ based on \eqref{eq:sig-j-distinct}.
2020-03-27 12:10:41 -04:00
2020-04-01 10:57:37 -04:00
Because the expectations cancel when $\forall i, i', j, j'\in [k], \wElem_i = \wElem_j = \wElem_{i'}' = \wElem_{j'}' = \wElem$, we can discard the case in which there is only one distinct world value. We then need to sum over all possibilities with $\dist$ distinct world values, for $\dist \in [2, k]$. Note that the number of distinct values $\dist$ affects the contribution of the random hash function $\hfunc$. For example, $\dist = 2$ distinct values yield $\frac{1}{\sketchCols} \cdot \frac{1}{\sketchCols} = \frac{1}{\sketchCols^2} = \frac{1}{\sketchCols^\dist}$. By Lemma~\ref{lem:sig-j-survive} and equation~\eqref{eq:sig-j-distinct} we get
%
%\begin{equation*}
%\frac{1}{\sketchCols^2}\sum_{\widetilde{\wElem_1}, \widetilde{\wElem_2}}\prod_{i = 1}^{k}\vect_i(\widetilde{\wElem_{f(i)}})\vect_i(\widetilde{\wElem_{f'(i)}}).
%\end{equation*}
%This is because we know that the expectation from \eqref{eq:sig-j-last} will survive when we have mappings that produce pairs of the form $\sine(\wElem)\conj{\sine(\wElem)}$. Second, in consideration of the randomized hashing, with two distinct variables, the indicator variables in the expectation yield $\frac{1}{\sketchCols}\cdot \frac{1}{\sketchCols}$.
%
%We need to sum over all mappings for each case (c) when the number of distinct values is $\dist = 2$, resulting in
%\begin{equation*}
%\frac{1}{\sketchCols^2}\sum_{\widetilde{\wElem_1}, \widetilde{\wElem_2}}\sum_{c \in \dist = 2}\sum_{f, f'}\prod_{i = 1}^{k}\vect_i(\widetilde{\wElem_{f(i)}})\vect_i(\widetilde{\wElem_{f'(i)}}).
%\end{equation*}
%
%Finally, we need to do this for all $\dist$.
2020-03-27 12:10:41 -04:00
\begin{equation*}
\sigsq_j = \sum_{\dist \in [2, k]} \frac{1}{\sketchCols^\dist} \sum_{\widetilde{w_1}, \cdots, \widetilde{w_\dist}\in W} \sum_{\substack{f, f',\\\match{f}{f'}}} \prod_{i = 1}^{k} v_i(\widetilde{w_{f(i)}}) v_i(\widetilde{w_{f'(i)}}).
\end{equation*}