paper-BagRelationalPDBsAreHard/analysis.tex

% -*- root: main.tex -*-
\section{Analysis}
\label{sec:analysis}
We begin the analysis by showing that with high probability an estimate is approximately $\numWorldsP$, where $p$ is the probability measure for a given TIPD.  Note that $$\numWorldsP = \numWorldsSum.$$

The first step is to show that the expectation of the estimate of a tuple $t$'s membership across all worlds is $\numWorldsSum$.

\begin{align}
&\expect{\estimate}\\
=&\expect{\estExpOne}\\
=&\expect{\sum_{\substack{j \in [B],\\
			 \wVec \in \pw~|~ \sketchHash{i}[\wVec] = j,\\
			 \wVec[w']\in \pw~|~ \sketchHash{i}[\wVec[w']] = j} } v_t[\wVec] \cdot s_i[\wVec] \cdot s_i[\wVec[w']]}\\
=&\multLineExpect\big[\sum_{\substack{j \in [B],\\
				\wVec~|~\sketchHashParam{\wVec}= j,\\
				\wVecPrime~|~\sketchHashParam{\wVecPrime} = j,\\
				\wVec = \wVecPrime}} \kMapParam{\wVec} \cdot \sketchPolarParam{\wVec} \cdot \sketchPolarParam{\wVecPrime} +  \nonumber \\
&\phantom{{}\kMapParam{\wVec}}\sum_{\substack{j \in [B], \\
				\wVec~|~\sketchHashParam{\wVec} = j,\\
				\wVecPrime ~|~ \sketchHashParam{\wVecPrime} = j,\\ \wVec \neq \wVecPrime}} \kMapParam{\wVec} \cdot \sketchPolarParam{\wVec} \cdot\sketchPolarParam{\wVecPrime}\big]\textit{(by linearity of expectation)}\\
=&\expect{\sum_{\substack{j \in [B],\\
				\wVec~|~\sketchHashParam{\wVec}= j,\\
				\wVecPrime~|~\sketchHashParam{\wVecPrime} = j,\\
				\wVec = \wVecPrime}} \kMapParam{\wVec} \cdot \sketchPolarParam{\wVec} \cdot \sketchPolarParam{\wVecPrime}} \nonumber \\
&\phantom{{}\big[}\textit{(by uniform distribution in the second summation)}\\
=& \estExp \sum_{\substack{j \in [B],\\
				\wVec~|~\sketchHashParam{\wVec}= j,\\}} \kMapParam{\wVec} \label{eq:estExpect}
\end{align}

For the next step, we show that the variance of an estimate is small.
\begin{align}
\varParam{\estimate} &=\varParam{\estExpOne}\\
&= \expect{\big(\estTwo\big)^2}\\
&=\expect{\sum_{\substack{
		\wVec_1, \wVec_2,\\
		 \wVecPrime_1, \wVecPrime_2 \in \pw,\\
		 \sketchHashParam{\wVec_1} = \sketchHashParam{\wVecPrime_1},\\
		 \sketchHashParam{\wVec_2} = \sketchHashParam{\wVecPrime_2}
		 }}\kMapParam{\wVec_1} \cdot \kMapParam{\wVec_2}\cdot\sketchPolarParam{\wVec_1}\cdot\sketchPolarParam{\wVec_2}\cdot\sketchPolarParam{\wVecPrime_1}\cdot\sketchPolarParam{\wVecPrime_2} }\label{eq:var-sum-w}
\end{align}

Note that four-wise independence is assumed across all four random variables of \eqref{eq:var-sum-w}.  Zooming in on the inner products of the $\sketchPolar$ functions,
\begin{equation}
\polarProdEq \label{eq:polar-product}
\end{equation}
it can be seen that for $\wOne, \wOneP \in \pw$ and $\wTwo, \wTwoP \in \pw'$, all four random variables in \eqref{eq:polar-product} take their values from $\pw$, although we have two separate sets.  Thus, there are five possible patterns of distribution between the $\wVec$ variables, namely:
\begin{align*}
&\distPattern{1}:&\cOne\\
&\distPattern{2}:&\cTwo \textit{*} \\
&\distPattern{3}:&\cThree \textit{*} \\
&\distPattern{4}:&\cFour \textit{*}\\
&\distPattern{5}:&\cFive
\end{align*}
$$\text{ }^*\textit{(and all variants of the respective pattern)}$$

We are interested in those particular cases whose expectation does not equal zero, since an expectation of zero will not add to the summation of \eqref{eq:var-sum-w}.  In expectation we have that
\begin{align}
&\expect{\sum_{\substack{\elems \\
			\st \cOne}} \polarProdEq} = 1 \label{eq:polar-prod-all}\\
&\expect{\sum_{\substack{\elems \\
			\st \cTwo}} \polarProdEq} = 1 \label{eq:polar-prod-two-and-two}\\
&\expect{\sum_{\substack{\elems \\
			\st \cThree}} \polarProdEq} = 0 \nonumber \\
&\expect{\sum_{\substack{\elems \\
			\st \cFour}} \polarProdEq} = 0 \nonumber \\
&\expect{\sum_{\substack{\elems \\
			\st \cFive}} \polarProdEq} = 0 \nonumber
\end{align}

Only equation \eqref{eq:polar-prod-all} (which maps to $\cOne$) and \eqref{eq:polar-prod-two-and-two} (mapping to $\cTwo$) affect the $\var$ computation.

Thus, when considering $\distPattern{1}$ the variance results in
\begin{equation}
\distPatOne\label{eq:distPatOne}
\end{equation}

For the distribution pattern $\cTwo$, we have three variants to consider.
\begin{align*}
&\vCase{1}:&\cTwo \\
&\vCase{2}:&\cTwoV{\wOne}{\wTwo}{\wOneP}{\wTwoP}\\
&\vCase{3}:&\cTwoV{\wOne}{\wTwoP}{\wOneP}{\wTwo}
\end{align*}
When considered separately, the variants have the following $\var$.
\begin{align}
\cTwo&= \variantOne \label{eq:variantOne}\\
\cTwoV{\wOne}{\wTwo}{\wOneP}{\wTwoP}&=\variantTwo \label{eq:variantTwo}\\
\cTwoV{\wOne}{\wTwoP}{\wOneP}{\wTwo}&=\variantThree\label{eq:variantThree}
\end{align}

Note that at the start of the analysis of $\var$, the second term (expectation \eqref{eq:estExpect} squared) of the $\var$ calculation was not considered.  This is because it is cancelled out by \eqref{eq:distPatOne} and \eqref{eq:variantOne}.
\begin{equation*}
\big(\estExp\big)^2 = \distPatOne + \variantOne
\end{equation*}
With only \eqref{eq:variantTwo} and \eqref{eq:variantThree} remaining, we have

\begin{multline*}
\varParam{\estimate} = \\
\variantTwo ~+ \\
\variantThree
\end{multline*}

Converting terms into their space requirements yields
\begin{align}
&\variantTwo \Rightarrow\numWorldsP \cdot \big(\frac{\numWorlds}{\sketchCols} - 1\big)\label{eq:spaceOne}\\
&\variantThree \Rightarrow \numWorldsP \cdot  \frac{\numWorldsP - 1}{\sketchCols}\label{eq:spaceTwo}
\end{align}
\eqref{eq:spaceOne} and \eqref{eq:spaceTwo} further reduce to
\begin{equation}
\frac{2^{2N}(\prob + \prob^2)}{\sketchCols} - \numWorlds(\frac{\prob}{\sketchCols} + \prob)\label{eq:variance}
\end{equation}
By \eqref{eq:variance} we have then
\begin{align*}
\varSym &< 2^{2N}\big(\frac{2\prob^2}{\sketchCols}\big) \\
\sd &< 2^N\big(\sqrt{\frac{2\prob^2}{\sketchCols}}\big)\\
\sdRel& < \sqrt{\frac{2}{\sketchCols}}.
\end{align*}
Since the sketch has multiple trials, a per-trial probability of exceeding the error bound that is smaller than one half ensures that, with high probability, the median of all trials is within the error bound.  Expressing this in Chebyshev's Inequality yields
\begin{equation*}
\cheby.
\end{equation*}
Substituting $\mu\epsilon$ for $k\sd$ and solving for $\sketchCols$ results in
\begin{align*}
&k\sdRelVar = \mu\epsilon\\
&k = \frac{\mu\epsilon}{\sdRelVar}\\
&k = \frac{\mu\epsilon\sqrt{\sketchCols}}{\sqrt{2}}\\
&\frac{1}{k^2} = \frac{1}{\big(\frac{\mu\epsilon\sqrt{\sketchCols}}{\sqrt{2}}\big)^2}\\
&\frac{1}{k^2} = \frac{2}{\mu^2\epsilon^2\sketchCols}\\
&\chebyK\Rightarrow \sketchCols = \frac{6}{\mu^2\epsilon^2}
\end{align*}