paper-BagRelationalPDBsAreHard/analysis.tex

501 lines
30 KiB
TeX

% -*- root: main.tex -*-
\section{Analysis}
\label{sec:analysis}
%We begin the analysis by showing that with high probability an estimate is approximately $\numWorldsP$, where $p$ is a tuple's probability measure for a given TIPD. Note that
%\begin{equation}
%%\gVt{k\cdot}
%\numWorldsP = \numWorldsSum\label{eq:mu}.
%\end{equation}
%Furthermore, when $\genV$ is generalized to have elements in the range $\left[0, \infty\right]$, we obtain the result
%\begin{equation}
%\norm{\genV}\prob = \numWorldsSum\label{eq:gen-mu}.
%\end{equation}
We start off by making the claim that the expectation of the estimate of annotations across all worlds is $\sum\limits_{\wVec \in \pw}\genVParam{\wVec}$, formally
\begin{equation}
\expect{\sum_{\wVec \in \pw} \sketchJParam{\hashP{\wVec}} \cdot \polP{\wVec}} = \sum_{\wVec \in \pw}\genVParam{\wVec}\label{eq:allWorlds-est}.
\end{equation}
To verify this claim, we argue that for every $\wVec \in \pw$, the expectation of the estimate of an annotation in a single world is that world's annotation $\genVParam{\wVec}$, i.e.,
\begin{equation}
\expect{\sketchJParam{\hashP{\wVec}}\cdot \polP{\wVec}} = \genVParam{\wVec} \label{eq:single-est}.
\end{equation}
For a given $\wVec \in \pw$, substituting definitions we have
\begin{align}
&\expect{\sketchJParam{\hashP{\wVec}} \cdot \polP{\wVec}} = \nonumber\\
&\phantom{{}\sketchJParam{\hashP{\wVec}}}\expect{\big(\sum_{\substack{\wVecPrime \in \pw \st \\
\hashP{\wVecPrime} = \hashP{\wVec}}}\genVParam{\wVecPrime} \cdot \polP{\wVecPrime}\big) \cdot \polP{\wVec} }\label{eq:step-one}\\
%\end{align}
%Since $\wVec \in \pw$, we know that for $\wVecPrime\in \pw, \exists \wVecPrime \st \wVecPrime = \wVec$. This yields
%\[
=&~\expect{\genVParam{\wVec}\polP{\wVec}^2 +
\sum\limits_{\substack{\wVecPrime, \wVec \in \pw \st \\
\hashP{\wVecPrime} = \hashP{\wVec},\\
\wVecPrime \neq \wVec}}\genVParam{\wVecPrime}\polP{\wVecPrime}\polP{\wVec}}\label{eq:step-two}\\
=&~\expect{\genVParam{\wVec}\polP{\wVec}^2} +
\expect{\sum\limits_{\substack{\wVecPrime, \wVec \in \pw \st \\
\hashP{\wVecPrime} = \hashP{\wVec} \\
\wVecPrime \neq \wVec}}\genVParam{\wVecPrime}\polP{\wVecPrime}\polP{\wVec}}\label{eq:step-three}\\
=&~\genVParam{\wVec} \cdot \expect{\polP{\wVec}^2} + \nonumber\\
&\qquad\sum\limits_{\substack{\wVecPrime, \wVec \in \pw \st \\
\hashP{\wVecPrime} = \hashP{\wVec} \\
\wVecPrime \neq \wVec}}\genVParam{\wVecPrime}\cdot\expect{\polP{\wVecPrime}\polP{\wVec}}\label{eq:step-four}\\
=&~\genVParam{\wVec}\label{eq:step-five}
\end{align}
\begin{Justification}
\hfill
\begin{itemize}
\item \eq{\eqref{eq:step-one}} is a substitution of the definition of $\sketch$.
\item \eq{\eqref{eq:step-two}} distributes $\polP{\wVec}$ over the sum and separates the term with $\wVecPrime = \wVec$ from the remaining terms.
\item \eq{\eqref{eq:step-three}} uses linearity of expectation to reduce the large expectation into smaller expectations.
\item \eq{\eqref{eq:step-four}} further pushes expectation into the $\pol$ terms.
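\item \eq{\eqref{eq:step-five}} follows from evaluating the expectations: $\expect{\polP{\wVec}^2} = 1$ since $\polP{\wVec} \in \{-1, 1\}$, and $\expect{\polP{\wVecPrime}\polP{\wVec}} = 0$ for $\wVecPrime \neq \wVec$ since $\pol$ is uniform and pairwise independent.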
\end{itemize}
\end{Justification}
%which in turn
%\begin{multline*}
%\mathbb{E}\big[\genVParam{\wVecPrime_0}\cdot \polP{\wVecPrime_0} + \cdots \\
%+\genVParam{\wVecPrime_j}\cdot \polP{\wVecPrime_j}\cdot \polP{\wVecPrime_j}+ \cdots \\
%+ \genVParam{\wVecPrime_n}\polP{\wVecPrime_n}\big]
%\end{multline*}
%\AH{break it up into w' and w}
%Due to the uniformity of $\pol$, we have
%\begin{equation*}
%= \genVParam{\wVec},
%\end{equation*}
thus verifying \eqref{eq:single-est}.
\begin{Assumption}
\hfill
\begin{itemize}
\item \eq{\eqref{eq:step-four}} assumes that $\pol$ is pairwise independent.
%\item $\hash$ is uniformly distributed.
\end{itemize}
\end{Assumption}
Since \eqref{eq:single-est} holds, by linearity of expectation, \eqref{eq:allWorlds-est} also must hold.
%We can now take \eqref{eq:single-est}, substitute it in for \eqref{eq:allWorlds-est} and show by linearity of expectation that \eqref{eq:allWorlds-est} holds.
%\begin{align}
%&\expect{\sum_{\wVec \in \pw} \sketchJParam{\hashP{\wVec}} \cdot \polP{\wVec}} \nonumber\\
%&= \expect{\sum_{\wVecPrime \in \pw}\genVParam{\wVecPrime} \cdot \polP{\wVecPrime} \cdot \sum_{\substack{\wVec \in \pw \st \\
%\hashP{\wVecPrime} = \hashP{\wVec}}}\polP{\wVec}}\nonumber\\
%&= \sum_{\wVec \in \pw} \expect{\left( \sum_{\substack{\wVecPrime \in \pw \st \\
%\hashP{\wVecPrime} = \hashP{\wVec}}}\genVParam{\wVecPrime}\cdot\polP{\wVecPrime}\right) \cdot \polP{\wVec}}\nonumber\\
%&= \sum_{\wVec \in \pw}\genVParam{\wVec}\label{eq:estExpect}.
%\end{align}
%\begin{align}
%&\expect{\estimate}\\
%=&\expect{\estExpOne}\\
%=&\expect{\sum_{\substack{j \in [B],\\
% \wVec \in \pw~|~ \hash{i}[\wVec] = j,\\
% \wVec[w']\in \pw~|~ \hash{i}[\wVec[w']] = j} } v_t[\wVec] \cdot s_i[\wVec] \cdot s_i[\wVec[w']]}\\
%=&\multLineExpect\big[\sum_{\substack{j \in [B],\\
% \wVec~|~\hashP{\wVec}= j,\\
% \wVecPrime~|~\hashP{\wVecPrime} = j,\\
% \wVec = \wVecPrime}} \genVParam{\wVec} \cdot \polP{\wVec} \cdot \polP{\wVecPrime} + \nonumber \\
%&\phantom{{}\genVParam{\wVec}}\sum_{\substack{j \in [B], \\
% \wVec~|~\hashP{\wVec} = j,\\
% \wVecPrime ~|~ \hashP{\wVecPrime} = j,\\ \wVec \neq \wVecPrime}} \genVParam{\wVec} \cdot \polP{\wVec} \cdot\polP{\wVecPrime}\big]\textit{(by linearity of expectation)}\\
%=&\expect{\sum_{\substack{j \in [B],\\
% \wVec~|~\hashP{\wVec}= j,\\
% \wVecPrime~|~\hashP{\wVecPrime} = j,\\
% \wVec = \wVecPrime}} \genVParam{\wVec} \cdot \polP{\wVec} \cdot \polP{\wVecPrime}} \nonumber \\
%&\phantom{{}\big[}\textit{(by uniform distribution in the second summation)}\\
%=& \estExp \label{eq:estExpect}
%\end{align}
%\AR{A general comment: The last display equation should have a period at the end. The idea is that display equations are considered part of a sentence and every sentence should end with a period.}
%\AH{Thank you for clarifying this, as I have always wondered what the convention was for display equations. Hopefully, I haven't missed any end display equations in this paper, and have them all fixed properly.}
For the next step, we show that the variance of an estimate is small.%$$\varParam{\estimate}$$
\begin{align}
&\varParam{\sum_{\wVec \in \pw}\sketchJParam{\hashP{\wVec}} \cdot \polP{\wVec}}\\%\nonumber\\
=~&\varParam{\sum_{\wVec \in \pw}\genVParam{\wVec} \cdot \polP{\wVec} \sum_{\substack{\wVecPrime \in \pw \st\\ \hashP{\wVec} = \hashP{\wVecPrime}}}\polP{\wVecPrime}}\label{eq:var_step-one}\\%\nonumber\\%\estExpOne}\\
=~& \mathbb{E}\big[\big(\sum_{\substack{ \wVec, \wVecPrime \in \pw \st \\
\hashP{\wVec} = \hashP{\wVecPrime}}} \genVParam{\wVec} \cdot \polP{\wVec} \cdot \polP{\wVecPrime}\nonumber\\
&\qquad - \expect{\sum_{\wVec \in \pw} \sketchJParam{\hashP{\wVec}} \cdot \polP{\wVec}}\big)^2\big]\label{eq:var_step-two}\\%\nonumber\\
=~&\mathbb{E}\big[\sum_{\substack{
\wVec_1, \wVec_2,\\
\wVecPrime_1, \wVecPrime_2 \in \pw,\\
\hashP{\wVec_1} = \hashP{\wVecPrime_1},\\
\hashP{\wVec_2} = \hashP{\wVecPrime_2}
}}\genVParam{\wVec_1} \genVParam{\wVec_2}\polP{\wVec_1}\polP{\wVec_2}\polP{\wVecPrime_1}\polP{\wVecPrime_2}\big]\nonumber\\
&\qquad - \left(\sum_{\wVec \in \pw}\genVParam{\wVec}\right)^2 \label{eq:var-sum-w}.
\end{align}
\begin{Justification}
\hfill
\begin{itemize}
\item \eq{\eqref{eq:var_step-one}} follows from substituting the definition of $\sketch$ and the commutativity of addition. Note the constraint on $\hash$ hashing to the same bucket follows from the definition of $\sketch$. Also, the sum can be rearranged to take each component item in the sum of each bucket and take its sum of products with each of the $\pol$ mapped to it. This can be done as previously stated, using the commutativity of addition.
\item \eq{\eqref{eq:var_step-two}} by substituting the definition of variance.
\item \eq{\eqref{eq:var-sum-w}} results from the further evaluation of \eqref{eq:var_step-two}.
\end{itemize}
\end{Justification}
\begin{Assumption}
\hfill
\begin{itemize}
\item The subsequent evaluations of expectation assume 4-wise independence of $\pol$.
\end{itemize}
\end{Assumption}
Note that four-wise independence is assumed across all four random variables of \eqref{eq:var-sum-w}. Zooming in on the products of the $\pol$ functions,
\begin{equation}
\polP{\wOne}\cdot\polP{\wOneP}\cdot\polP{\wTwo}\cdot\polP{\wTwoP} \label{eq:polar-product}
\end{equation}
we see that %it can be seen that for $\wOne, \wOneP \in \pw$ and $\wTwo, \wTwoP \in \pw'$, all four random variables in \eqref{eq:polar-product} take their values from $\pw$, although we have iteration over two separate sets $\pw$.
there are five possible sets of $\wVec$ variable combinations. The following sets all assume each $\wVec$ to be from the set $\pw$. For pairwise distinct indices $a, b, c, d \in \{1, 1', 2, 2'\}$:
\begin{align*}
&\distPattern{1}:\forNElems{\cOne}\\
&\distPattern{2}:\forElems{\cTwo}\\
&\distPattern{3}:\forElems{\cThree}\\
&\distPattern{4}:\forElems{\cFour}\\
&\distPattern{5}:\forNElems{\cFive}
\end{align*}
With four random variables coming from sets containing the same elements, there exist five possibilities in how they relate to one another. This is true since they come from the same set or separate, yet duplicate sets each containing the same members. Therefore, any $\wVec$ variable can be equal or alternatively not equal to its remaining counterparts. A simple enumeration in equalities (non-equalities) suffices to partition the set of all possible combinations. The variables could all be equal as we see in $\distPattern{1}$, or three of the variables could be equal, with the fourth different. Enumerating to having just two variables sharing an equality generates two cases, because we have two variables left over, which themselves may either be equal or not equal. There is the case of $\distPattern{2}$ where a pair of variables could be the same with the remaining two equal to each other but not equal to the first two. $\distPattern{3}$ is the case when there are two variables the same, with the remaining variables not equal to any of the others. And finally, they could all be different as in $\distPattern{5}$.
The use of variable subscripts in the notation is necessary as different combinations of equal $\wVec$ variables produce different results in the variance computation, as we will see shortly.
Note that each $\wVec$ is the input of the same $\pol$ function, meaning, that equal worlds will produce the same output.
We are interested in those particular cases whose expectation does not equal zero, since an expectation of zero will not add to the summation of \eqref{eq:var-sum-w}. In expectation we have that
\begin{align}
\forAllNW{\distPattern{1}}&\rightarrow\expect{%\sum_{\substack{\elems \\
%\st \cOne}}
\polarProdNEq} = 1 \label{eq:polar-prod-all}
\end{align}
since we have the same element of the image of $\pol$ being multiplied by itself an even number of times. Similarly,
\begin{align}
\forAllW{\distPattern{2}}&\rightarrow\expect{%\sum_{\substack{\elems \\
%\st \cTwo}}
\polarProdEq} = 1 \label{eq:polar-prod-two-and-two}
\end{align}
because the same element of the image of $\pol$ is being multiplied by itself for each equality, producing a polarity of 1 for each equality, and then a final product of 1. For $\distPattern{3}, \distPattern{4}, \distPattern{5}$, we have a final product of two, three or four independent variables $\in \{-1, 1\}$, thus producing the following results:
\begin{align}
\forAllW{\distPattern{3}}&\rightarrow\expect{%\sum_{\substack{\elems \\
%\st \cThree}}
\polarProdEq} = 0 \nonumber
\end{align}
\begin{align}
\forAllW{\distPattern{4}}&\rightarrow\expect{%\sum_{\substack{\elems \\
%\st \cFour}}
\polarProdEq} = 0 \nonumber
\end{align}
\begin{align}
\forAllNW{\distPattern{5}}&\rightarrow\expect{%\sum_{\substack{\elems \\
%\st \cFive}}
\polarProdNEq} = 0. \nonumber
\end{align}
Only equations \eqref{eq:polar-prod-all} and \eqref{eq:polar-prod-two-and-two} influence the $\var$ computation.
Considering $\distPattern{1}$ the variance results in
\begin{equation}
\distPatOne\label{eq:distPatOne}.
\end{equation}
This is the case because we have that
\begin{align*}
&\sum_{\substack{\wOne, \wOneP, \wTwo, \wTwoP \in \pw \st \\
\wOne = \wTwo = \wOneP = \wTwoP = \wVec}}
\expect{\genVParam{\wVec} \cdot \genVParam{\wVec} \cdot \polP{\wVec} \cdot \polP{\wVec} \cdot \polP{\wVec} \cdot \polP{\wVec}}\\
= &\sum_{\wVec \in \pw} \expect{\genVParam{\wVec}\cdot \genVParam{\wVec}}\\
= &\sum_{\wVec \in \pw} \expect{\genVParam{\wVec}^2}.
\end{align*}
For the distribution pattern $\cTwo$, we have three subsets $\distPattern{21}, \distPattern{22}, \distPattern{23} \subseteq \distPattern{2}$ to consider.
\begin{align*}
&\distPattern{21}:&\cTwoV{\wOne}{\wOneP}{\wTwo}{\wTwoP} \\
&\distPattern{22}:&\cTwoV{\wOne}{\wTwo}{\wOneP}{\wTwoP}\\
&\distPattern{23}:&\cTwoV{\wOne}{\wTwoP}{\wOneP}{\wTwo}
\end{align*}
Considered separately, the subsets result in the following $\var$.
\begin{align}
&\wOne = \wOneP \neq \wTwo =\wTwoP \rightarrow\nonumber\\
&\qquad = \sum_{\substack{\wOne, \wOneP, \wTwo, \wTwoP \in \pw \st \\
\wOne = \wOneP = \wVec \neq\\
\wTwo = \wTwoP = \wVecPrime}}\expect{\genVParam{\wVec}\genVParam{\wVecPrime}\polP{\wVec}\polP{\wVec}\polP{\wVecPrime}\polP{\wVecPrime}}\nonumber\\
&\qquad = \sum_{\wVec, \wVecPrime \in \pw \st \wVec \neq \wVecPrime}\expect{\genVParam{\wVec}\genVParam{\wVecPrime}}\label{eq:variantOne}\\
&\wOne = \wTwo \neq \wOneP = \wTwoP \rightarrow\nonumber\\
&\qquad = \sum_{\substack{\wOne, \wOneP, \wTwo, \wTwoP \in \pw \st \\
\wOne = \wTwo = \wVec \neq\\
\wOneP = \wTwoP = \wVecPrime,\\
\hashP{\wVec} = \hashP{\wVecPrime}}} \expect{\genVParam{\wVec}\genVParam{\wVec}\polP{\wVec}\polP{\wVecPrime}\polP{\wVec}\polP{\wVecPrime}}\nonumber \\
&\qquad = \sum_{\wVec \in \pw}\expect{| \{\wVecPrime \st \wVecPrime \neq \wVec, \hashP{\wVec} = \hashP{\wVecPrime}\} | \cdot \genVParam{\wVec}^2}\label{eq:variantTwo} \\
&\wOne = \wTwoP \neq \wOneP =\wTwo \rightarrow \nonumber \\
&\qquad = \sum_{\substack{\wOne, \wOneP, \wTwo, \wTwoP \in \pw \st \\
\wOne = \wTwoP = \wVec \neq \\
\wOneP = \wTwo = \wVecPrime,\\
\hashP{\wVec} = \hashP{\wVecPrime}}}\expect{ \genVParam{\wVec} \genVParam{\wVecPrime}\polP{\wVec}\polP{\wVecPrime}\polP{\wVecPrime}\polP{\wVec}} \nonumber \\
&\qquad = \sum_{\substack{\wVec, \wVecPrime \in \pw \st \\
\wVec \neq \wVecPrime,\\
\hashP{\wVec} = \hashP{\wVecPrime}}}\expect{\genVParam{\wVec}\cdot\genVParam{\wVecPrime}}.\label{eq:variantThree}
\end{align}
Note that for $\distPattern{22}$, we have the cardinality of a bucket as a multiplicative factor for each squared annotation. This is because of the constraint that $\wOne \neq \wOneP$ coupled with the additional constraint that $\hashP{\wOne} = \hashP{\wOneP}$. Since $\wOneP$ must belong to the same bucket as $\wOne$, yet not equal to $\wOne$, we have that each operand of the sum must be the annotation squared for each $\wOneP$ that belongs to the same bucket but is not equal to $\wOne$.
Looking at $\distPattern{23}$, we have a similar case as $\distPattern{22}$, but this time there is no multiplicative factor since $\wOneP$ and $\wTwoP$ are constrained to equal their opposite $\wVec$ counterparts, which are the arguments for both $\genV$ terms.
Notice that the second term (expectation squared) of the $\var$ calculation is cancelled out by \eqref{eq:distPatOne} and \eqref{eq:variantOne}. %
\begin{equation*}
\expect{\big(\sum_{\wVec \in \pw}\genVParam{\wVec}\big)^2} = \expect{\sum_{\wVec \in \pw}\genVParam{\wVec}^2} +
\expect{\sum_{\substack{\wVec, \wVecPrime \in \pw \st\\
\wVec \neq \wVecPrime}}\genVParam{\wVec}\genVParam{\wVecPrime}}.%\distPatOne + \variantOne.
\end{equation*}
\begin{Justification}
\hfill
\begin{itemize}
\item The LHS is the expectation squared. We obtain the RHS by first squaring the sum, and then, using the associative property of addition, rearranging the operands of the summation.
\end{itemize}
\end{Justification}
With only \eqref{eq:variantTwo} and \eqref{eq:variantThree} remaining, we have
\begin{multline*}
\varParam{\estimate} = \\
\expect{\sum_{\wVec \in \pw}| \{\wVecPrime \st \wVecPrime \neq \wVec, \hashP{\wVec} = \hashP{\wVecPrime}\} | \cdot \genVParam{\wVec}^2} ~+ \\
\expect{\sum_{\substack{\wVec, \wVecPrime \in \pw \st \\
\wVec \neq \wVecPrime,\\
\hashP{\wVec} = \hashP{\wVecPrime}}}\genVParam{\wVec}\cdot\genVParam{\wVecPrime}}.
\end{multline*}
%Our current analysis is limited to TIPDBs, where the annotations are in the boolean $\mathbb{B}$ set. Because this is the case, the square of any element is itself.
Computing each term separately gives
\begin{align}
&\expect{\sum_{\wVec \in \pw}\big|~ \{\wVecPrime \st \wVecPrime \neq \wVec, \hashP{\wVec} = \hashP{\wVecPrime}\} ~\big| \cdot \genVParam{\wVec}^2}\nonumber\\
&~=\sum_{\wVec \in \pw}\genVParam{\wVec}^2 \cdot \expect{\big|~ \{\wVecPrime \st \wVecPrime \neq \wVec, \hashP{\wVec} = \hashP{\wVecPrime}\} ~\big|}\label{eq:s22-one}\\%\numWorldsP
&~=\norm{\genV}^2_2\cdot \left(\frac{|\pw|}{\sketchCols} - 1\right)\label{eq:spaceOne}
\end{align}
\begin{Justification}
\hfill
\begin{itemize}
\item \eqref{eq:s22-one} follows from linearity of expectation.
\item \eqref{eq:spaceOne} follows from the uniform distribution of $\hash$.
\end{itemize}
\end{Justification}
\begin{Assumption}
\hfill
\begin{itemize}
\item $\hash$ must be uniformly distributed.
\end{itemize}
\end{Assumption}
\begin{align}
&\expect{ \sum_{\substack{\wVec, \wVecPrime \in \pw \st \\
\wVec \neq \wVecPrime,\\
\hashP{\wVec} = \hashP{\wVecPrime}}}\genVParam{\wVec}\cdot\genVParam{\wVecPrime}}\nonumber \\
&~= \expect{\sum_{\wVec \in \pw}\genVParam{\wVec} \cdot \sum_{\substack{\wVecPrime \in\pw \st\\
\wVecPrime \neq \wVec,\\
\hashP{\wVec} = \hashP{\wVecPrime}}}\genVParam{\wVecPrime}} \label{eq:s23-one} \\
%\numWorldsP \cdot \frac{\numWorldsP - 1}{\sketchCols}\label{eq:spaceTwo}.
&~= \expect{\sum_{\wVec \in \pw}\genVParam{\wVec} \cdot \big((\sum_{\substack{\wVecPrime \in\pw \st\\
\hashP{\wVec} = \hashP{\wVecPrime}} }\genVParam{\wVecPrime } )- \genVParam{\wVec}\big)} \nonumber \\
&~=\expect{\left(\sum_{\wVec \in \pw}\genVParam{\wVec} \cdot \sum_{\substack{\wVecPrime \in \pw \st \\
\hashP{\wVec} = \hashP{\wVecPrime}}}\genVParam{\wVecPrime}\right) - \sum_{\wVec \in \pw}\genVParam{\wVec}^2}\label{eq:s23-two}\\
&~=\expect{\sum_{\wVec \in \pw}\genVParam{\wVec} \cdot \sum_{\substack{\wVecPrime \in \pw \st \\
\hashP{\wVec} = \hashP{\wVecPrime}}}\genVParam{\wVecPrime}} - \expect{\sum_{\wVec \in \pw}\genVParam{\wVec}^2}\label{eq:s23-three} \\
&~\leq\norm{\genV}_1 \cdot \frac{\norm{\genV}_1}{\sketchCols} - \expect{\sum_{\wVec \in \pw}\genVParam{\wVec}^2}\label{eq:s23-four} \\
&~\leq\frac{\norm{\genV}_1^2 - \norm{\genV}_2^2}{\sketchCols} \label{eq:spaceTwo}.
%&\norm{\genV}\prob \cdot \frac{\norm{\genV}\prob - \frac{\norm{\genV}}{\numWorlds}}{\sketchCols}\label{eq:spaceTwo}.
\end{align}
\AH{It doesn't appear intuitive or obvious to the reader as to why the second term in \eqref{eq:spaceTwo} is divided by $\sketchCols$.}
\begin{Justification}
\hfill
\begin{itemize}
\item \eqref{eq:s23-one} is an equivalent representation of the LHS.
\item \eqref{eq:s23-two} follows from the fact that $\wVec \neq \wVecPrime$.
\item \eqref{eq:s23-three} is the result of distributing the multiplication over the terms in the parenthesis.
\item \eqref{eq:s23-four} follows since $\sum\limits_{\wVec \in \pw}\genVParam{\wVec} = \norm{\genV}_1$; the second term also relies on the preceding fact and the assumption of uniform distribution of $\hash$.
\item \eqref{eq:spaceTwo} is the result of the multiplication of the first two terms in \eqref{eq:s23-four} and that $\sum\limits_{\wVec \in \pw}\genVParam{\wVec}^2 = \norm{\genV}_2^2$.
\end{itemize}
\end{Justification}
\AH{Can we not have the looser requirement of uniform distribution?}
\begin{Assumption}
\hfill
\begin{itemize}
\item $\hash$ must be pairwise independent.
\end{itemize}
\end{Assumption}
%In both equations, the sum of $\genVParam{\wVec}$ over all $\wVec \in \pw$ is $\numWorldsP$ since as noted in equation \eqref{eq:mu} we are summing the number of worlds a tuple $t$ appears in, and for a TIPDB, that is exactly 2 to the power of the number of tuples in the TIPDB (due to the independence of tuples) times tuple $t$'s probability.
Note that when $\genV$ is positive, the bound is tight.
In equation \eqref{eq:spaceOne} we have the multiplicative factor which in expectation turns out to be the number of worlds $|\pw|$ divided evenly across the number of buckets $\sketchCols$ minus the one tuple that $\wVecPrime$ cannot be. This factor is multiplied to the sum of squares of each of the world values.
Equation \eqref{eq:spaceTwo} has each of the $|\pw|$ worlds times all the rest of the worlds appearing in the corresponding bucket. The equation is first rearranged, by allowing the duplicating of $\wVec$ in the second summation and subsequently subtracting the product afterwards. The product in the expectation yields two factors. The first factor is simply the sum of vector values. The latter is the same sum divided by bucket size. Finally, we subtract the quantity that shouldn't be there, specifically when $\wVecPrime = \wVec$, which is the sum of squares within a bucket.
\eqref{eq:spaceOne} and \eqref{eq:spaceTwo} together form
\AH{We cannot use L0 'norm' here because \eqref{eq:spaceOne} relies on the cardinality of worlds, independent of whether each world exists.}
\begin{align}
&\norm{\genV}^2_2\left(\frac{|\pw|}{\sketchCols}- 1\right) + \frac{\norm{\genV}_1^2 - \norm{\genV}_2^2}{\sketchCols}\\
& < \frac{\norm{\genV}_2^2\left(|\pw|\right) + \norm{\genV}_1^2}{\sketchCols} \label{eq:variance}
\end{align}
By \eqref{eq:variance} we have
\begin{align*}
%\varSym &< 2^{2N}\big(\frac{2\prob}{\sketchCols}\big) \\
\varSym &< \frac{\norm{\genV}_2^2\left(|\pw|\right) + \norm{\genV}_1^2}{\sketchCols} \\
%\sd &<\sdEq\\
\sd &< \sqrt{\frac{\norm{\genV}_2^2\left(|\pw|\right) + \norm{\genV}_1^2}{\sketchCols}} \\
%\sdRel& < \sqrt{\frac{2}{\sketchCols\prob}}.
\sdRel &< \frac{\sqrt{\norm{\genV}_2^2\left(|\pw|\right) + \norm{\genV}_1^2}}{\sqrt{\sketchCols}\cdot\norm{\genV}_1}
\end{align*}
Recall that $\sdRel = \frac{\sd}{\mu}$.% where $\mu$ is defined as $\numWorldsP$ in \eqref{eq:mu} for TIDB and $\norm{\genV}\prob$ for general $\genV$ in \eqref{eq:gen-mu}.
Since the sketch has multiple trials, a probability of exceeding error bound $\errB$ smaller than one half guarantees an estimate that is less than or equal to the error bound when taking the median of all trials. Expressing the error relative to $\mu$ in Chebyshev's Inequality yields
\begin{equation*}
Pr\left[~|X - \mu|~> \Delta\right] < \frac{1}{3}.
%\cheby.
\end{equation*}
Substituting $\Delta = k\sigma \rightarrow k = \frac{\Delta}{\sigma} \rightarrow k^2 = \frac{\Delta^2}{\sigma^2}$ we have
\begin{equation*}
Pr\left[~|X - \mu|~> \Delta~\right] \leq \frac{\sigma^2}{\Delta^2}.
\end{equation*}
For the case when $\Delta = \mu\epsilon$, taking both Chebyshev bounds, setting them equal to each other, simplifying and solving for $\sketchCols$ results in
\begin{align}
\frac{\sigma^2}{\Delta^2} &= \frac{1}{3}\\
\frac{\norm{\genV}_2^2 \cdot \left(|\pw|\right) + \norm{\genV}_1^2}{\sketchCols \norm{\genV}_1^2 \cdot \epsilon^2} &= \frac{1}{3}\label{eq:b-bnd-no-sub1}\\
\frac{3\left(\norm{\genV}_2^2\left(|\pw|\right) + \norm{\genV}_1^2\right)}{\norm{\genV}_1^2 \cdot \epsilon^2} &= \sketchCols\label{eq:bucket-bounds-no-sub}
\end{align}
\begin{Justification}
\hfill
\begin{itemize}
\item \eqref{eq:b-bnd-no-sub1} is the substitution of the bound on $\sigma^2$ from \eqref{eq:variance} and of $\Delta^2 = \mu^2\epsilon^2 = \norm{\genV}_1^2\epsilon^2$.
\item \eqref{eq:bucket-bounds-no-sub} is derived by rearranging terms through multiplying each side by $3\sketchCols$.
\end{itemize}
\end{Justification}
A brief digression is desirable for the purpose of simplifying the above bounds. Recall the Cauchy--Schwarz inequality, which states:
\[\sum_i a_i \cdot b_i \leq \norm{a}_2 \cdot \norm{b}_2.\]
The L1 norm can be expanded to the following expression,
\begin{equation}
\norm{\genV}_1 = \sum_{\wVec \in \pw} 1 \cdot \genVParam{\wVec}\label{eq:expandL1}.
\end{equation}
Notice that the constant term can be viewed as a vector of $1$'s with size $n$ (the size of $\genV$). Calling this vector $x$ and taking the L2 norm gives
\SR{Tighten the bounds further with L0 'norm', although that makes the simplification more difficult.}
\begin{align}
\norm{x}_2 &= \sqrt{1_1^2 + 1_2^2 + \cdots + 1_n^2}\nonumber\\
&= \sqrt{n \cdot 1} \nonumber\\
&= \sqrt{n}\nonumber\\
&= \sqrt{|\pw|}.\label{eq:w-card}
\end{align}
By \eqref{eq:w-card} and Cauchy--Schwarz, we then have
\begin{equation}
\norm{\genV}_1 \leq \sqrt{|\pw|} \cdot \norm{\genV}_2\label{eq:norm1-cauchy},
\end{equation}
which squared yields
\begin{equation}
\norm{\genV}_1^2 \leq |\pw| \cdot \norm{\genV}_2^2\label{eq:norm1-sq-cauchy}.
\end{equation}
Note that \eqref{eq:expandL1} can be further tightened by using a vector with ones appearing only in places where $\genV_i > 0$. This tightens \eqref{eq:norm1-cauchy} and \eqref{eq:norm1-sq-cauchy} by replacing the $|\pw|$ factor with $\norm{\genV}_0$.
%\begin{equation}
%\norm{\genV}_1^2 \leq \norm{\genV}_0 \cdot \norm{\genV}_2^2
%\end{equation}
\AH{Did not use L0 here because it was easier to reduce terms with the $|\pw|$ factor.}
Substituting the Cauchy--Schwarz bound into the Chebyshev calculations gives
\begin{align}
&\sketchCols \leq \frac{3\left(\norm{\genV}_2^2\left(|\pw|\right) + \norm{\genV}_2^2\left(|\pw|\right)\right)}{\norm{\genV}_1^2\cdot \epsilon^2}\label{eq:cheb-cauch1}\\
&\sketchCols \leq \frac{3\left(2\norm{\genV}_2^2\left(|\pw|\right)\right)}{\norm{\genV}_1^2\cdot \epsilon^2}\label{eq:cheb-cauch2}\\
&\sketchCols \leq \frac{6\norm{\genV}_2^2\left(|\pw|\right)}{\norm{\genV}_1^2\cdot \epsilon^2}.\label{eq:b-cauchy}
\end{align}
\begin{Justification}
\hfill
\begin{itemize}
\item \eqref{eq:cheb-cauch1} substitutes \eqref{eq:norm1-sq-cauchy} for the squared L1 norm in the numerator of \eqref{eq:bucket-bounds-no-sub}. The denominator is left unchanged, since replacing it with an upper bound would not preserve the inequality.
\item \eqref{eq:cheb-cauch2} combines common terms in the numerator.
\item \eqref{eq:b-cauchy} multiplies the constant terms in the numerator.
\end{itemize}
\end{Justification}
To further tighten the bounds calculations above, we can bound the square of the L2 norm.
\begin{align}
\norm{\genV}_2^2 &= \sum_{i = 1}^{n}|\genV_i|^2 \label{eq:l2-bnd1} \\
&\leq \sum_{i = 1}^{n}\left(\max_{j}|\genV_j|\right)\left|\genV_i\right|\label{eq:l2-bnd2}\\
&= \sum_{i = 1}^{n}\norm{\genV}_\infty |\genV_i|\label{eq:l2-bnd3}\\
&= \norm{\genV}_\infty \sum_{i = 1}^{n}|\genV_i|\label{eq:l2-bnd4}\\
&= \norm{\genV}_\infty \cdot \norm{\genV}_1. \label{eq:l2-bounds}
\end{align}
\begin{Justification}
\hfill
\begin{itemize}
\item \eqref{eq:l2-bnd1} is the definition of L2 norm squared.
\item \eqref{eq:l2-bnd2} is an upper bound of the L2 norm, and it is true because the max of a vector $\genV$ is always greater than or equal to all the other elements in $\genV$, which implies that unless the max value is in every element, this is a strict upper bound.
\item \eqref{eq:l2-bnd3} is given by a simple substitution of notation.
\item \eqref{eq:l2-bnd4} is obtained by the equivalence of pushing the summation inside the product.
\item \eqref{eq:l2-bounds} is the result of substituting the definition of L1 norm.
\end{itemize}
\end{Justification}
Going back to equation \eqref{eq:bucket-bounds-no-sub} and substituting in the above bounds obtains the following.
\begin{align}
\sketchCols &= \frac{3\left(\norm{\genV}_2^2\left(|\pw|\right) + \norm{\genV}_1^2\right)}{\norm{\genV}_1^2 \cdot \epsilon^2} \\
&\leq \frac{3\left(\norm{\genV}_\infty\norm{\genV}_1 \left(|\pw|\right) + \norm{\genV}_1^2\right)}{\norm{\genV}_1^2\cdot \epsilon^2}\label{eq:sub-bounds1}\\
&\leq \frac{3\left(\norm{\genV}_\infty\sqrt{\norm{\genV}_0}\norm{\genV}_2\left(|\pw|\right) + \norm{\genV}_0\norm{\genV}_2^2\right)}{\norm{\genV}_0\norm{\genV}_2^2 \cdot \epsilon^2}\label{eq:sub-bounds2}\\
&\leq \frac{3\left(\norm{\genV}_\infty \sqrt{\norm{\genV}_0} \sqrt{\norm{\genV}_\infty\norm{\genV}_1}\left(|\pw|\right) + \norm{\genV}_0\norm{\genV}_\infty\norm{\genV}_1\right)}{\norm{\genV}_0\norm{\genV}_\infty\norm{\genV}_1\epsilon^2}\label{eq:sub-bounds3}\\
&\leq \frac{3\norm{\genV}_\infty \sqrt{\norm{\genV}_0\norm{\genV}_1}\left(\sqrt{\norm{\genV}_\infty}\left(|\pw|\right) + \sqrt{\norm{\genV}_0\norm{\genV}_1}\right)}{\norm{\genV}_0\norm{\genV}_\infty\norm{\genV}_1\epsilon^2}\label{eq:sub-bounds4}\\
&\leq \frac{3\left(\sqrt{\norm{\genV}_\infty}\left(|\pw|\right) + \sqrt{\norm{\genV}_0\norm{\genV}_1}\right)}{\sqrt{\norm{\genV}_0\norm{\genV}_1} \epsilon^2} \label{eq:sub-bounds5}\\
&\leq \frac{3\sqrt{\norm{\genV}_\infty}\left(|\pw|\right)}{\sqrt{\norm{\genV}_0\norm{\genV}_1} \epsilon^2} + \frac{3}{\epsilon^2}\label{eq:sub-bounds-final}
\end{align}
\begin{Justification}
\hfill
\begin{itemize}
\item \eqref{eq:sub-bounds1} results from substituting \eqref{eq:l2-bounds} for the L2 norm.
\item \eqref{eq:sub-bounds2} is obtained from substituting \eqref{eq:norm1-cauchy} for the L1 norm and \eqref{eq:norm1-sq-cauchy} for the L1 norm squared terms in both the numerator and denominator.
\item \eqref{eq:sub-bounds3} is the result of further substituting \eqref{eq:l2-bounds} for the newly introduced L2 norm terms in the numerator.
\item \eqref{eq:sub-bounds4} is the result of factoring out common terms in the numerator.
\item \eqref{eq:sub-bounds5} is the result of cancelling out common terms in the numerator and denominator.
\item \eqref{eq:sub-bounds-final} is simply a rearrangement of the two numerator terms, for the purpose of making things simpler.
\end{itemize}
\end{Justification}
\startOld{Bound calculations}
\begin{align*}
\frac{\sigma^2}{\Delta^2} &= \frac{1}{3}\\
\frac{ 2^{2N}\big(\frac{2\prob}{\sketchCols}\big)}{\mu^2\epsilon^2} &= \frac{1}{3}\\
\frac{2^{2N + 1}\prob}{\mu^2\epsilon^2\sketchCols} &= \frac{1}{3}\\
\frac{6 \cdot 2^{2N}\prob}{\mu^2\epsilon^2} &= \sketchCols \\
\frac{6}{\prob\epsilon^2} &= \sketchCols.
\end{align*}
In the above, recall that $\mu$ or the expectation of an estimate is $\sum\limits_{\wVec \in \pw}\genVParam{\wVec}$ as seen in equation \eqref{eq:allWorlds-est}.
Setting $\Delta = \epsilon\numWorlds$ gives
\begin{align*}
\frac{ 2^{2N}\big(\frac{2\prob}{\sketchCols}\big)}{\epsilon^22^{2N}} &= \frac{1}{3}\\
\frac{2^{2N+ 1}\prob}{\epsilon^22^{2N}\sketchCols} &= \frac{1}{3}\\
\frac{6 \cdot 2^{2N}\prob}{\epsilon^22^{2N}} &= \sketchCols \\
\frac{6\prob}{\epsilon^2} &= \sketchCols.
\end{align*}
Other cases for $\Delta$ can be solved similarly.
Spacing...\newline
you\newline
can\newline
get
rid
of
this
later.
\finOld