Added expectation proof for k-way product.

This commit is contained in:
Aaron Huber 2020-04-16 17:26:58 -04:00
parent 6139c8274a
commit 38648e9ecd
2 changed files with 51 additions and 10 deletions

View file

@ -66,6 +66,7 @@
\newcommand{\dist}{m}
\newcommand{\dupSize}{j}
\newcommand{\dMap}[1]{\widetilde{#1}}
\newcommand{\order}{O}
%

60
sop.tex
View file

@ -1,7 +1,42 @@
%root--main.tex
\section{Sum of Products Analysis}
\section{Analysis of a $\prodsize$-way join}
There are several steps involved to obtaining bounds on the Sum of Products (SOP) query.
There are several steps involved in obtaining bounds on the Sum of Products (SOP) query. We start by analyzing a $\prodsize$-way product. Define the $j$-th bucket of a sketch $\sk$ for a vector $\vect$ as
\[\sk^\vect[j] = \sum_{\substack{\wElem \in \wSet,\\ \hfunc(\wElem) = j}}\vect(\wElem)\sine(\wElem).\]
Define the estimate of the $j$-th bucket to be
\[\est_j = \prod_{i = 1}^{\prodsize}\sk^{\vect_i}[j].\]
For notational convenience define
\begin{align*}
&\wSet_j = \{\wElem ~|~ \hfunc(\wElem) = j\}\\
&\term_j = \sum_{\wElem \in \wSet_j} \prod_{i = 1}^{\prodsize}\vect_i(\wElem)
\end{align*}
Let us show first that the expectation of the estimate does in fact yield the value we are estimating, $\term_j$.
\begin{proof}
\begin{align*}
\ex{\est_j} = &\ex{\prod_{i = 1}^{\prodsize}\sk^{\vect_i}[j]} \\
= &\ex{\prod_{i = 1}^{\prodsize} \sum_{\substack{\wElem \in \wSet_j, \\ \hfunc(\wElem) = j}}\vect_i(\wElem)\sine(\wElem)}\\
= &\ex{\sum_{\substack{\wElem_1,\ldots, \wElem_{\prodsize}\\ \in \wSet_j}} \prod_{i = 1}^{\prodsize}\vect_i(\wElem_i)\prod_{i = 1}^{\prodsize}\sine(\wElem_i)}\\
= &\sum_{\substack{\wElem_1,\ldots, \wElem_{\prodsize}\\ \in \wSet_j}} \prod_{i = 1}^{\prodsize}\vect_i(\wElem_i)\ex{\prod_{i = 1}^{\prodsize}\sine(\wElem_i)}
\end{align*}
Fix the variables $\wElem_1,\ldots, \wElem_{\prodsize}$. Define $\dist$ to be the number of distinct worlds in $\wElem_1,\ldots, \wElem_{\prodsize}$ and $e_l$ to be the number of repetitions of the $l$-th distinct world value. For $\term_1^{\est_j} = \ex{\prod_{i = 1}^{\prodsize} \sine(\wElem_i)}$, we get
\begin{align*}
\term_1^{\est_j} = &\ex{\prod_{i = 1}^{\prodsize}\sine(\wElem_i)}\\
= &\ex{\prod_{l = 1}^{\dist} \sine(\wElem_l)^{e_l}}\\
= & \begin{cases}
0 &1 <\dist \leq \prodsize\\
1 & \dist = 1.
\end{cases}
\end{align*}
Notice that the above leaves only the terms satisfying $\forall i, i' \in [\prodsize], \wElem_i = \wElem_{i'}$, so that
\begin{align*}
\ex{\est_j} = &\sum_{\wElem \in \wSet_j}\prod_{i = 1}^{\prodsize} \vect_i(\wElem) \cdot \term_1^{\est_j} = \term_j.
\end{align*}
\end{proof}
The proof for $\est = \sum_j \est_j$ follows by linearity of expectation.\qed\newline
We need to compute the variance of the $\prodsize$-way product $\est$. We wish to prove that
\begin{equation}
@ -25,19 +60,21 @@ Recall that we started this section out by seeking to prove \cref{eq:var-to-prov
One can see that \cref{eq:sigsq-jneqj} is composed of two addends. We now bound each of them separately.
\subsection{Bounding $\sum_{j \neq j'}\cvar{j, j'}$}
\begin{align*}
\sum_{j \neq j'}\cvar{j, j'} &= \sum_{j \neq j'} \left(\ex{\est_j \cdot \conj{\est_{j'}}} - \ex{\est_j}\cdot\ex{\conj{\est_{j'}}}\right)\\
&=\ex{\prod_{i = 1}^{\prodsize}\sum_{\wElem \in W}v_i(\wElem)s(\wElem)\ind{h(\wElem) = j}\cdot \prod_{i = 1}^{\prodsize}\sum_{\wElem' \in W}v_i(\wElem')\conj{s(\wElem')}\ind{h(\wElem') = j'}} - \ex{\prod_{i = 1}^{\prodsize}\sum_{\wElem \in W}v_i(\wElem)s(\wElem)\ind{h(\wElem) = j}}\cdot \ex{\prod_{i = 1}^{\prodsize}\sum_{\wElem' \in W}v_i(\wElem')\conj{s(\wElem')}\ind{h(\wElem') = j'}}\\
&=\ex{\sum_{\substack{\wElem_1,\cdots,\wElem_\prodsize,\\\wElem'_1,\cdots,\wElem'_\prodsize\\\in W}}\prod_{i = 1}^{\prodsize}v_i(\wElem_i)s(\wElem_i)v_i(\wElem'_i)s(\wElem'_i) \ind{h(\wElem_i) = j} \ind{h(\wElem'_i) = j'}} - \ex{\sum_{\substack{\wElem_1,\cdots, \wElem_\prodsize\\\in W}}\prod_{i = 1}^{\prodsize}v_i(\wElem_i)s(\wElem_i) \ind{h(\wElem_i) = j}}\cdot\ex{\sum_{\substack{\wElem'_1,\cdots, \wElem'_\prodsize\\\in W}}\prod_{i = 1}^{\prodsize}v_i(\wElem'_i)s(\wElem'_i) \ind{h(\wElem'_i) = j'}}\\
&=\sum_{\substack{\wElem_1,\cdots,\wElem_\prodsize,\\\wElem'_1,\cdots,\wElem'_\prodsize\\\in W}}\ex{\prod_{i = 1}^{\prodsize}v_i(\wElem_i)s(\wElem_i)v_i(\wElem'_i)s(\wElem'_i)\ind{h(\wElem_i) = j}\ind{h(\wElem'_i) = j'}} - \ex{\prod_{i = 1}^{\prodsize} v_i(\wElem_i)s(\wElem_i) \ind{h(\wElem_i) = j}} \cdot \ex{\prod_{i = 1}^{\prodsize}v_i(\wElem'_i)s(\wElem'_i)\ind{h(\wElem'_i) = j'}}\\
&= \sum_{\substack{\wElem_1,\cdots,\wElem_\prodsize,\\\wElem'_1,\cdots,\wElem'_\prodsize\\\in W}}\prod_{i = 1}^{\prodsize}v_i(\wElem_i)v_i(\wElem'_i)\ex{\prod_{i = 1}^{\prodsize}s(\wElem_i)s(\wElem'_i)\ind{h(\wElem_i) = j}\ind{h(\wElem'_i) = j'}} - \prod_{i = 1}^{\prodsize}v_i(\wElem_i)\ex{\prod_{i = 1}^{\prodsize}s(\wElem_i)\ind{h(\wElem_i) = j}}\cdot \prod_{i = 1}^{\prodsize}v_i(\wElem'_i)\ex{\prod_{i = 1}^{\prodsize}s(\wElem'_i)\ind{h(\wElem_i') = j'}}\\
&= \sum_{\substack{\wElem_1,\cdots,\wElem_\prodsize,\\\wElem'_1,\cdots,\wElem'_\prodsize\\\in W}}\prod_{i = 1}^{\prodsize}v_i(\wElem_i)v_i(\wElem'_i)\left(\ex{\prod_{i = 1}^{\prodsize}s(\wElem_i)s(\wElem'_i)\ind{h(\wElem_i) = j}\ind{h(\wElem'_i) = j'}} - \ex{\prod_{i = 1}^{\prodsize}s(\wElem_i)\ind{h(\wElem_i) = j}}\cdot\ex{\prod_{i = 1}^{\prodsize}s(\wElem'_i)\ind{h(\wElem_i') = j'}} \right).
&=\ex{\sum_{\substack{\wElem_1,\cdots,\wElem_\prodsize,\\\wElem'_1,\cdots,\wElem'_\prodsize\\\in W}}\prod_{i = 1}^{\prodsize}v_i(\wElem_i)s(\wElem_i)v_i(\wElem'_i)\conj{s(\wElem'_i)} \ind{h(\wElem_i) = j} \ind{h(\wElem'_i) = j'}} - \ex{\sum_{\substack{\wElem_1,\cdots, \wElem_\prodsize\\\in W}}\prod_{i = 1}^{\prodsize}v_i(\wElem_i)s(\wElem_i) \ind{h(\wElem_i) = j}}\cdot\ex{\sum_{\substack{\wElem'_1,\cdots, \wElem'_\prodsize\\\in W}}\prod_{i = 1}^{\prodsize}v_i(\wElem'_i)\conj{s(\wElem'_i)} \ind{h(\wElem'_i) = j'}}\\
&=\sum_{\substack{\wElem_1,\cdots,\wElem_\prodsize,\\\wElem'_1,\cdots,\wElem'_\prodsize\\\in W}}\ex{\prod_{i = 1}^{\prodsize}v_i(\wElem_i)s(\wElem_i)v_i(\wElem'_i)\conj{s(\wElem'_i)}\ind{h(\wElem_i) = j}\ind{h(\wElem'_i) = j'}} - \ex{\prod_{i = 1}^{\prodsize} v_i(\wElem_i)s(\wElem_i) \ind{h(\wElem_i) = j}} \cdot \ex{\prod_{i = 1}^{\prodsize}v_i(\wElem'_i)\conj{s(\wElem'_i)}\ind{h(\wElem'_i) = j'}}\\
&= \sum_{\substack{\wElem_1,\cdots,\wElem_\prodsize,\\\wElem'_1,\cdots,\wElem'_\prodsize\\\in W}}\prod_{i = 1}^{\prodsize}v_i(\wElem_i)v_i(\wElem'_i)\ex{\prod_{i = 1}^{\prodsize}s(\wElem_i)\conj{s(\wElem'_i)}\ind{h(\wElem_i) = j}\ind{h(\wElem'_i) = j'}} - \prod_{i = 1}^{\prodsize}v_i(\wElem_i)\ex{\prod_{i = 1}^{\prodsize}s(\wElem_i)\ind{h(\wElem_i) = j}}\cdot \prod_{i = 1}^{\prodsize}v_i(\wElem'_i)\ex{\prod_{i = 1}^{\prodsize}\conj{s(\wElem'_i)}\ind{h(\wElem_i') = j'}}\\
&= \sum_{\substack{\wElem_1,\cdots,\wElem_\prodsize,\\\wElem'_1,\cdots,\wElem'_\prodsize\\\in W}}\prod_{i = 1}^{\prodsize}v_i(\wElem_i)v_i(\wElem'_i)\left(\ex{\prod_{i = 1}^{\prodsize}s(\wElem_i)\conj{s(\wElem'_i)}\ind{h(\wElem_i) = j}\ind{h(\wElem'_i) = j'}} - \ex{\prod_{i = 1}^{\prodsize}s(\wElem_i)\ind{h(\wElem_i) = j}}\cdot\ex{\prod_{i = 1}^{\prodsize}\conj{s(\wElem'_i)}\ind{h(\wElem_i') = j'}} \right).
\end{align*}
\AH{Perhaps a formal proof is necessary below.}
For $\term_1^{\cvar{j, j'}} = \ex{\prod_{i = 1}^{\prodsize}s(\wElem_i)s(\wElem'_i)\ind{h(\wElem_i) = j}\ind{h(\wElem'_i) = j'}}$, because hash function $h$ cannot bucket the same world to two different buckets, the only surviving terms occur when there is no overlap between the $\wElem_i$ and $\wElem'_i$ variables. Given the condition of no overlap, the only terms that survive are when $\forall i \in [\prodsize], \wElem_i = \wElem, \wElem'_i = \wElem', \wElem \neq \wElem'$. Notice, however, that in such a case, the product of the remaining expectations will cancel this out. Looking at the remaining two expectations, each can only survive when $\forall i \in [\prodsize], \wElem_i = \wElem, \wElem'_i = \wElem'$. Such constraints leave us with only one surviving case, when all variables are the same world. Thus,
For $\term_1^{\cvar{j, j'}} = \ex{\prod_{i = 1}^{\prodsize}s(\wElem_i)\conj{s(\wElem'_i)}\ind{h(\wElem_i) = j}\ind{h(\wElem'_i) = j'}}$, because the hash function $h$ cannot map the same world to two different buckets, $\term_1^{\cvar{j, j'}} = 1$ can occur only when there is no overlap between the $\wElem_i$ and $\wElem'_i$ variables. Given the condition of no overlap, $\term_1^{\cvar{j, j'}} = 1$ only under the further condition that $\forall i \in [\prodsize], \wElem_i = \wElem, \wElem'_i = \wElem', \wElem \neq \wElem'$. Notice, however, that under these conditions the product of the remaining expectations cancels this out. Looking at the product of the remaining two expectations, $\term_2^{\cvar{j, j'}} = \ex{\prod_{i = 1}^{\prodsize}\sine(\wElem_i) \ind{\hfunc(\wElem_i) = j}} \cdot \ex{\prod_{i = 1}^{\prodsize}\conj{\sine(\wElem'_i)} \ind{\hfunc(\wElem'_i) = j'}}$, we see that $\term_2^{\cvar{j, j'}} = 1$ only when $\forall i \in [\prodsize], \wElem_i = \wElem, \wElem'_i = \wElem'$. Taken together, the constraints leave us with only one possible case in which $\term_1^{\cvar{j, j'}} - \term_2^{\cvar{j, j'}} \neq 0$, namely when all variables are the same world. Thus,
\begin{align}
&\sum_{j \neq j'}\cvar{j, j'} = - \frac{1}{B^2}\sum_{\wElem \in W}\prod_{i = 1}^{\prodsize}v_i^2(\wElem)\label{eq:cvar-bound}.
\end{align}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@ -224,7 +261,7 @@ over multiplication. This then implies that each individiual $\sine(\dw_i)$ has
Using the above definitions, we can now present the variance bounds for $\sigsq_j$ based on \eqref{eq:sig-j-distinct}.
By the fact that the expectations cancel when $\forall i, i', j, j'\in [\prodsize], \wElem_i = \wElem_j = \wElem_{i'}' = \wElem_{j'}' = \wElem$, we can rid ourselves of the case when there exists only one distinct world value. This is precisely why we have not needed to account for the last two expectations in \cref{eq:sig-j-last}. We then need to sum up all the $\dist$ distinct world value possibilities for $\dist \in [2, \prodsize]$. Note that the number of distinct values $\dist$ affects the randomness of the hash function $\hfunc$. E.g. only $\dist = 2$ distinct values will yield $\frac{1}{\sketchCols} \cdot \frac{1}{\sketchCols} = \frac{1}{\sketchCols^2} = \frac{1}{\sketchCols^\dist}$. By lemma \ref{lem:sig-j-survive} and equation \eqref{eq:sig-j-distinct} we get
By the fact that the expectations cancel when $\forall i, i', j, j'\in [\prodsize], \wElem_i = \wElem_j = \wElem, \wElem_{i'}' = \wElem_{j'}' = \wElem'$, for both $\wElem = \wElem'$ and $\wElem \neq \wElem'$, we can rid ourselves of the case when there exists only one distinct world value. This is precisely why we have not needed to account for the last two expectations in \cref{eq:sig-j-last}. We then need to sum up all the $\dist$ distinct world value possibilities for $\dist \in [2, \prodsize]$. Starting with \cref{eq:sig-j-distinct},
\begin{align}
\sigsq_j = &\sum_{\dist = 2}^{\prodsize}\sum_{\dist' = 2}^{\prodsize}\sum_{f, f'}\sum_{\substack{\dw_{_1}, \ldots,\dw_{_\dist},\\\dw'_{1},\ldots,\dw'_{\dist'}\\ \in W}}\prod_{i = 1}^{\prodsize}\vect_i(\dw_{f(i)})\vect_i(\dw'_{f'(i)})\cdot \term_1\left(\dw_{f(1)},\ldots,\dw_{f(\prodsize)}, \dw'_{f'(1)},\ldots, \dw'_{f'(\prodsize)}\right)\nonumber\\
@ -234,7 +271,10 @@ By the fact that the expectations cancel when $\forall i, i', j, j'\in [\prodsiz
= &\sum_{\dist = 2}^{\prodsize}\frac{1}{\sketchCols^{\dist}}\sum_{\substack{f, f'\\\match{f}{f'}}}\sum_{\substack{\dw_{_1}, \ldots,\dw_{_\dist}\\ \in W}}\prod_{i = 1}^{\prodsize}\vect_i(\dw_{f(i)})\vect_i(\dw_{f'(i)})\label{eq:sig-j-bnd-4}
\end{align}
We obtain \cref{eq:sig-j-bnd-1} by the fact that $\dist = \dist'$. Next, we arrive at \cref{eq:sig-j-bnd-2} by \cref{lem:sig-j-survive} as well as bringing out the indicator variables out $\term_1$. Equation \ref{eq:sig-j-bnd-3} is derived from the fact that $\forall i \in [\dist], \dw_i = \dw'_i$. We arrive at \cref{eq:sig-j-bnd-4}, since with $\dist$ distinct variables, the product of indicator variables will result in multiplying the uniform distribution probability of $\frac{1}{\sketchCols}$ $\dist$ times.
We obtain \cref{eq:sig-j-bnd-1} by the fact that $\dist = \dist'$. Next, we arrive at \cref{eq:sig-j-bnd-2} by \cref{lem:sig-j-survive} as well as bringing out the indicator variables of $\term_1$. \Cref{eq:sig-j-bnd-3} is derived from the fact that $\forall i \in [\dist], \dw_i = \dw'_i$. We arrive at \cref{eq:sig-j-bnd-4}, since with $\dist$ distinct variables, the product of indicator variables results in multiplying the uniform probability $\frac{1}{\sketchCols}$ by itself $\dist$ times, yielding the factor $\frac{1}{\sketchCols^{\dist}}$.
\AH{Next on the agenda, }
Using \cref{eq:cvar-bound} and \cref{eq:sig-j-bnd-4}, we state the general bounds for $\sigsq$,
\[\sigsq = \sum_{\dist = 2}^{\prodsize}\frac{1}{\sketchCols^{\dist}}\sum_{\substack{f, f'\\\match{f}{f'}}}\sum_{\substack{\dw_{_1}, \ldots, \dw_{_\dist}\\ \in W}}\prod_{i = 1}^{\prodsize}\vect_i(\dw_{f(i)})\vect_i(\dw_{f'(i)}) -
\frac{1}{B^2}\sum_{\wElem \in W}\prod_{i = 1}^{\prodsize}v_i^2(\wElem).\]
\AH{Next on the agenda, type up the expectation calculations, then start on SOP.}
\AR{Remaining TODOs: (1) Give expression for general $\sigma^2$, i.e. deal with the general $\lambda(j,j')$ term. (2) Show how to use the analysis for general $k$-product to handle generic SoP expressions---the expectation arguments would just follow from the above and linearity of expectation but the variance bounds might need a bit of extra work.}