paper-BagRelationalPDBsAreHard/combining.tex

99 lines
8.6 KiB
TeX

% -*- root: main.tex -*-
\section{Combining Sketches}
\label{sec:combining}
\subsection{Adding Sketches}
When assuming that the variables are independent, as in the TIDB model, it is a known result that
\[
\varParam{X + Y} = \varParam{X} + \varParam{Y}.
\]
By~\eqref{eq:sub-bounds-final} it immediately follows that adding $n$ base sketches (a \emph{base} sketch being one that has not previously been added to another sketch) results in the following variance:
\[
3n\left(\frac{\sqrt{\norm{\genV}_\infty}\left(|\pw|\right)}{\sqrt{\norm{\genV}_0\norm{\genV}_1} \epsilon^2} + \frac{1}{\epsilon^2}\right).
\]
\subsection{Multiplying Sketches}
There are several ways to define the multiplication of sketches. First, the individual estimates can be multiplied; second, the sketches can be multiplied pointwise and the estimate then taken from the resulting sketch; and third, the estimate can be defined simply as the sum of the products of corresponding buckets. Stated formally, these three estimates are
\begin{align*}
&\est{1} = \sum_{\wVec \in \pw}\sCom{1}{\sketchHashParam{\wVec}}\sketchPolarParam{\wVec} \cdot \sCom{2}{\sketchHashParam{\wVec}}\sketchPolarParam{\wVec}\\
&\est{2} = \sum_{\wVec \in \pw }\left(\sCom{1}{\sketchHashParam{\wVec}} \cdot \sCom{2}{\sketchHashParam{\wVec}}\right)\sketchPolarParam{\wVec}\\
&\est{3} = \sum_{j \in \sketchCols}\sCom{1}{j} \cdot \sCom{2}{j}.
\end{align*}
Calculating the expectation for $\est{1}$ evaluates to
\begin{align*}
&\expect{\sum_{\wVec \in \pw}\sCom{1}{\sketchHashParam{\wVec}}\sketchPolarParam{\wVec} \cdot \sCom{2}{\sketchHashParam{\wVec}}\sketchPolarParam{\wVec}}\\
=& \expect{\sum_{\wVec \in \pw}\sketchPolarParam{\wVec}\sketchPolarParam{\wVec}\sum_{\substack{\wVecPrime \in \pw \st\\ \sketchHashParam{\wVecPrime} = \sketchHashParam{\wVec}}} \genV_1\paramBox{\wVecPrime}\sketchPolarParam{\wVecPrime} \sum_{\substack{\wVecPrime \in \pw \st\\ \sketchHashParam{\wVecPrime} = \sketchHashParam{\wVec}}}\genV_2\paramBox{\wVecPrime}\sketchPolarParam{\wVecPrime}}\\
=& \mathbb{E}\big[\sum_{\wVec \in \pw}\sketchPolarParam{\wVec}\sketchPolarParam{\wVec}\left(\sum_{\substack{\wVecPrime \in \pw \st\\ \sketchHashParam{\wVecPrime} = \sketchHashParam{\wVec}\\
\wVecPrime \neq \wVec}} \genV_1\paramBox{\wVecPrime}\sketchPolarParam{\wVecPrime} + \genV_1\paramBox{\wVec}\sketchPolarParam{\wVec}\right)\\
& \qquad \left(\sum_{\substack{\wVecPrime \in \pw \st\\ \sketchHashParam{\wVecPrime} = \sketchHashParam{\wVec}\\
\wVecPrime \neq \wVec}} \genV_2\paramBox{\wVecPrime}\sketchPolarParam{\wVecPrime} + \genV_2\paramBox{\wVec}\sketchPolarParam{\wVec}\right)\big]\\
=& \expect{\sum_{\wVec \in \pw}\sketchPolarParam{\wVec}\sketchPolarParam{\wVec}\genV_1\paramBox{\wVec}\sketchPolarParam{\wVec}\genV_2\paramBox{\wVec}\sketchPolarParam{\wVec}}\\
=& \sum_{\wVec \in \pw}\genV_1\paramBox{\wVec}\genV_2\paramBox{\wVec}.
\end{align*}
This result is consistent for an arbitrary number of sketches in the product.
In expectation $\est{2}$ results in
\begin{align*}
&\expect{\sum_{\wVec \in \pw }\left(\sCom{1}{\sketchHashParam{\wVec}} \cdot \sCom{2}{\sketchHashParam{\wVec}}\right)\sketchPolarParam{\wVec}}\\
= &\expect{\sum_{\wVec \in \pw}\left(\sum_{\substack{\wVecPrime \in \pw \st\\
\sketchHashParam{\wVecPrime} = \sketchHashParam{\wVec}}} \genV_1\paramBox{\wVecPrime}\sketchPolarParam{\wVecPrime}\sum_{\substack{\wVecPrime \in \pw \st\\
\sketchHashParam{\wVecPrime} = \sketchHashParam{\wVec}}}\genV_2\paramBox{\wVecPrime}\sketchPolarParam{\wVecPrime}\right)\sketchPolarParam{\wVec}}\\
= &\mathbb{E}\big[\sum_{\wVec \in \pw}\sketchPolarParam{\wVec}\left(\sum_{\substack{\wVecPrime \in \pw \st\\ \sketchHashParam{\wVecPrime} = \sketchHashParam{\wVec}\\\wVecPrime \neq \wVec}}\genV_1\paramBox{\wVecPrime}\sketchPolarParam{\wVecPrime} + \genV_1\paramBox{\wVec}\sketchPolarParam{\wVec}\right)\\
&\qquad\left(\sum_{\substack{\wVecPrime \in \pw \st\\ \sketchHashParam{\wVecPrime} = \sketchHashParam{\wVec}\\\wVecPrime \neq \wVec}}\genV_2\paramBox{\wVecPrime}\sketchPolarParam{\wVecPrime} + \genV_2\paramBox{\wVec}\sketchPolarParam{\wVec}\right)\big]\\
= &\expect{\sum_{\wVec \in \pw}\sketchPolarParam{\wVec}\genV_1\paramBox{\wVec}\sketchPolarParam{\wVec}\genV_2\paramBox{\wVec}\sketchPolarParam{\wVec}}\\
= & 0.
\end{align*}
Note that with an odd number of sketches being multiplied, such as 3, we would get an expectation equal to the ground truth
\begin{align*}
= &\expect{\sum_{\wVec \in \pw}\sketchPolarParam{\wVec}\genV_1\paramBox{\wVec}\sketchPolarParam{\wVec}\genV_2\paramBox{\wVec}\sketchPolarParam{\wVec}\genV_3\paramBox{\wVec}\sketchPolarParam{\wVec}}\\
= &\sum_{\wVec \in \pw}\genV_1\paramBox{\wVec}\genV_2\paramBox{\wVec}\genV_3\paramBox{\wVec}.
\end{align*}
For $\est{3}$, multiplying an even number of sketches yields
\begin{align*}
&\expect{\sum_{j \in \sketchCols}\sCom{1}{j} \cdot \sCom{2}{j}}\\
=&\expect{\sum_{j \in \sketchCols}\left(\sum_{\substack{\wVec \in \pw \st\\\sketchHashParam{\wVec} = j}}\gVP{1}{\wVec}\sketchPolarParam{\wVec}\cdot \sum_{\substack{\wVecPrime \in \pw \st\\\sketchHashParam{\wVecPrime} = j}}\gVP{2}{\wVecPrime}\sketchPolarParam{\wVecPrime}\right)}\\
=&\expect{\sum_{j \in \sketchCols}\left(\sum_{\substack{\wVec, \wVecPrime \in \pw \st\\\sketchHashParam{\wVec} = j\\\wVec = \wVecPrime}}\gVP{1}{\wVec}\gVP{2}{\wVec}\sketchPolarParam{\wVec}\sketchPolarParam{\wVec} + \sum_{\substack{\wVec, \wVecPrime \in \pw \st\\\sketchHashParam{\wVec} = j\\\wVec \neq \wVecPrime}}\gVP{1}{\wVec}\gVP{2}{\wVecPrime}\sketchPolarParam{\wVec}\sketchPolarParam{\wVecPrime}\right)}\\
=&\expect{\sum_{\wVec \in \pw}\gVP{1}{\wVec}\gVP{2}{\wVec}}\\
=&\sum_{\wVec \in \pw}\gVP{1}{\wVec}\gVP{2}{\wVec}.
\end{align*}
For the case of multiplication, when assuming independent variables, it is a known result that
\[
\varParam{X \cdot Y} = \expect{X^2}\expect{Y^2} - (\expect{X})^2 (\expect{Y})^2.
\]
It is then necessary to calculate the expectation of the square of the sum of estimates. Assuming discrete variables, the expectation of the square of a random variable is simply the sum of its weighted squares. This yields
\begin{align}
&\expect{\left(\sum_{\wVec \in \pw}\sketchJParam{\sketchHashParam{\wVec}}\cdot \sketchPolarParam{\wVec}\right)^2}\label{eq:rand-sq}\\
=& \sum_{\wVec \in \pw}\expect{\left(\sketchJParam{\sketchHashParam{\wVec}}\cdot\sketchPolarParam{\wVec}\right)^2}\label{eq:rand-sq-ex-push}\\
=& \sum_{\wVec \in \pw}\expect{\left(\sum_{\substack{\wVecPrime \in \pw \st \\
\sketchHashParam{\wVecPrime} = \sketchHashParam{\wVec}}} \genVParam{\wVecPrime}\sketchPolarParam{\wVecPrime}\sketchPolarParam{\wVec}\right)^2}\label{eq:rand-sq-equiv}\\
=& \sum_{\wVec \in \pw}\expect{\left(\genVParam{\wVec}\sketchPolarParam{\wVec}^2 + \sum_{\substack{\wVecPrime \in \pw \st \\
\sketchHashParam{\wVecPrime} = \sketchHashParam{\wVec},
\wVecPrime \neq \wVec}} \genVParam{\wVecPrime}\sketchPolarParam{\wVecPrime}\sketchPolarParam{\wVec}\right)^2}\label{eq:rand-sq-assoc}\\
=& \sum_{\wVec \in \pw}\expect{\genVParam{\wVec}^2}\label{eq:rand-sq-reduce}\\
=& \sum_{\wVec \in \pw}\genVParam{\wVec}^2\label{eq:rand-sq-final}.
\end{align}
\begin{Justification}
\hfill
\begin{itemize}
\item We start with \eqref{eq:rand-sq}, since we need to know the expectation of the square of the sum of estimates.
\item \eqref{eq:rand-sq-ex-push} is the sum of weighted squares, or alternatively, pushes the expectation inside the summation by linearity of expectation.
\item \eqref{eq:rand-sq-equiv} substitutes the definition of a sketch bucket.
\item \eqref{eq:rand-sq-assoc} uses associativity to rearrange the operands of the sum.
\item \eqref{eq:rand-sq-reduce} reduces the second term of \eqref{eq:rand-sq-assoc} to $0$ by the property of uniform distribution of $\sketchPolar$.
\item \eqref{eq:rand-sq-final} is obtained by the fact that the expectation of $\genVParam{\wVec}$ is simply itself.
\end{itemize}
\end{Justification}
\begin{Assumption}
\hfill
\begin{itemize}\item Uniform distribution of both $\sketchHash$ and $\sketchPolar$.\end{itemize}
\end{Assumption}
It then follows that the variance corresponding to the multiplication of two base sketches is
\begin{align}
&\sum_{\wVec \in \pw}\genV_1\paramBox{\wVec}^2\sum_{\wVec \in \pw}\genV_2\paramBox{\wVec}^2 - \left(\sum_{\wVec \in \pw} \genV_1\paramBox{\wVec}\right)^2\left(\sum_{\wVec \in \pw} \genV_2\paramBox{\wVec}\right)^2\\
=&\norm{\genV_1}_2^2\cdot\norm{\genV_2}_2^2 - \norm{\genV_1}_1^2\cdot\norm{\genV_2}_1^2.
\end{align}
\AH{I don't think this equation makes sense. Where am I missing it?}
The subscript notation for $\genV$ is used to denote sketch identity. Substituting the upper bounds obtained for the squared $L_1$ norm from~\eqref{eq:norm1-sq-cauchy} results in
\[
\norm{\genV_1}_2^2\cdot\norm{\genV_2}_2^2 - \left(|\pw|\right)\norm{\genV_1}_2^2 \cdot \left(|\pw|\right)\norm{\genV_2}_2^2.
\]