paper-BagRelationalPDBsAreHard/hash_const.tex

% -*- root: main.tex -*-
\section{Hash Function Construction}

\AR{Aaron: Please re-write this section more generally. I.e. instead of assuming $h_i$ and $s_i$ are specifically defined as linear functions, define them generally: i.e. keep $h_i:W\to [B]$ and $s_i:W\to \{-1,1\}$ as generic function but abstract out the properties we want from them-- i.e. (1) $h_i$ is pair-wise independent, (2) $s_i$ is $4$-wise independent and (3) given any $\buck\in \{0,1\}^b$, we want to be able to compute the following quantity in $\mathrm{poly}(N)$ time (or an approximation of it):
\[|\{\mathbf{w}\in W| h_i(\mathbf{w})=\buck, s_i(\mathbf{w})=1\}|-|\{\mathbf{w}\in W| h_i(\mathbf{w})=\buck, s_i(\mathbf{w})=-1\}|.\]
From my discussion with the folks here at the workshop the requirement (3) seems to be new for $k$-wise independent hash functions and we should highlight this definition too. Once things have been defined this way, you can state the definition of $h_i$ as you have stated below. But in the next section, it would be good to state the algorithm only in terms of these more general properties of the hash functions. Once you have made this change, I can make a more careful pass over this section and the next.}

\subsection{Requirements}
As mentioned in Section~\ref{sec:notation}, we define our row-wise hash functions (bucket mapping and polarity) as follows:
\begin{align*}
\hash&: \pw \to \sketchCols \\
\pol&: \pw \to \{-1, 1\}.
\end{align*}

We require that $\hash$ be pairwise independent and $\pol$ 4-wise independent.

We now turn to computing the exact value of
\begin{equation}
\sum\limits_{\wVec \in \pw } \sketchJParam{\hashP{\wVec}}\polP{\wVec} =
\sum\limits_{\wVec \in \pw } \kMapParam{\wVec}\polP{\wVec}
\sum_{\substack{\wVecPrime \in \pw\st\\
			\hashP{\wVec} = \hashP{\wVecPrime}}} \polP{\wVecPrime}\label{eq:exact-results} .
\end{equation}
Starting with the latter term $\gIJ = \sum\limits_{\wVecPrime \in \pw}\polP{\wVecPrime}$, by the definition of the image of $\pol$ and the associativity of addition, we can break the sum into
\begin{equation*}
\gIJ = \sum_{\substack{\wVecPrime \in \pw \st\\
				\polP{\wVecPrime} = 0}} 1 + \sum_{\substack{\wVecPrime \in \pw \st\\
													\polP{\wVecPrime} = 1}} -1.
\end{equation*}
Setting the terms to $T_1 =  \sum\limits_{\substack{\wVecPrime \in \pw \st\\
				\polP{\wVecPrime} = 0}} 1$ and $T_2 =  \sum\limits_{\substack{\wVecPrime \in \pw \st\\
													\polP{\wVecPrime} = 1}} -1$ and fixing $\buck \in \{0,1\}^\lenB$ (with $\lenB = \log\sketchCols$) to a specific value gives a system of linear equations for each term.  It is a known result that a consistent system of linear equations has exactly $| \kDom |^{\numTup - \operatorname{rank}(\matrixH')}$ solutions, where $\kDom$ is the set being considered.  For $\kDom = \mathbb{B}$ this gives us an exact calculation for both terms,
\begin{align*}
T_1 &= |\{\wVec \st \matrixH' \cdot \wVec = \buck^{(0)}\}| \implies T_1 \in \{0, 2^{\numTup - \operatorname{rank}(\matrixH')}\},\\
T_2 &= |\{\wVec \st \matrixH' \cdot \wVec = \buck^{(1)}\}| \implies T_2 \in \{0, 2^{\numTup - \operatorname{rank}(\matrixH')}\},
\end{align*}
where the notation $\jpbit{y}$ denotes the polarity bit $\lenB$ value of the $\buck$ bucket identifier, specifically $\buck(b)$, such that $\buck(b)\in \{0, 1\}$. For each bucket $\buck$, we therefore want to compute the following quantity in $\mathrm{poly}(N)$ time, or an approximation thereof:
\[
\big|\{ \wVec \in \pw \st \hashP{\wVec} = \buck, \polP{\wVec} = 1 \}\big| - \big|\{\wVec \in \pw \st \hashP{\wVec} = \buck, \polP{\wVec} = -1\}\big|.
\]
We refer to the above quantity as $\polSum$.


Examining the former term of Equation~\eqref{eq:exact-results}, we fix $\kMap{t}$ to be defined as
\begin{equation*}
\kMapParam{\wVec} = \begin{cases}
					1,&\text{if } w_t = 1\\
					0,		&\text{otherwise}.
				  \end{cases}
\end{equation*}
\gVt{(generalizing)$\cdot$
\begin{equation*}
\kMapParam{\wVec} = \begin{cases}
					k,&\text{if } w_t = 1\\
					0, &\text{otherwise}.
				\end{cases}
\end{equation*}}
%Therefore, by definition we have
%\begin{equation*}
%\sum_{\wVec \in \pw}\sketchJParam{\hashP{\wVec}} = \sum_{\wVec \in \pw}\kMapParam{\wVec}\polP{\wVec},
%\end{equation*}
Using the same argument as for $\gIJ$ yields
\begin{equation*}
\sum_{\wVec \in \pw \st \polP{\wVec} = 0}\kMapParam{\wVec} - \sum_{\wVec \in \pw \st \polP{\wVec} = 1}\kMapParam{\wVec}.
\end{equation*}
Setting $T_3 = \sum\limits_{\wVec \in \pw \st \polP{\wVec} = 0}\kMapParam{\wVec}$, $T_4 = \sum\limits_{\wVec \in \pw \st \polP{\wVec} = 1}\kMapParam{\wVec}$ gives an exact calculation for each term given a fixed $\buck$:
\begin{equation*}
T_3 = \gVt{\sum_{\substack{k \in \{\wVec \st \\
				\matrixH' \cdot \wVec = \buck^{(0)},\\
				\kMapParam{\wVec} = k\}}}k} | \{\wVec \st \matrixH' \cdot \wVec = \buck^{(0)}, \kMapParam{\wVec} =  \gVt{(k) }1\}| \implies T_3 \in [0, 2^{\numTup - \operatorname{rank}(\matrixH')}],
\end{equation*}
\begin{equation*}
T_4 = \gVt{(k \cdot)} | \{\wVec \st \matrixH' \cdot \wVec = \buck^{(1)}, \kMapParam{\wVec} = \gVt{(k) }1\}| \implies T_4 \in [0, 2^{\numTup - \operatorname{rank}(\matrixH')}].
\end{equation*}


\subsection{Implementation}
As with world identification, bucket identification can be viewed as a binary vector.  As detailed above, this vector is of length $\lenB$.  In a similar manner, we can define a set of hash vectors $\matrixH$ as a matrix of $\lenB$ precomputed vectors $\hVec$ where each $\hVec \in \{0, 1\}^\numTup$, formally
\begin{equation*}
	\begin{pmatrix*}[l]
		h_{i, 0, 0} &\cdots &h_{i, 0, \numTup} \\
		\vdots & &\vdots \\
		h_{i, \lenB, 0} &\cdots &h_{i, \lenB, \numTup}
	\end{pmatrix*}.
\end{equation*}
We can then define the row hash function $\hash$, which maps inputs to buckets, as the matrix--vector product $\matrixH \cdot \wVec = \jVec$, that is,
\begin{equation*}
\hVecMatrix \cdot \vecCol{w}{\numTup} = \vecCol{j}{\lenB - 1},
\end{equation*}
or equivalently
\begin{equation*}
\hashP{\wVec} \coloneqq (\forall i \in [\lenB], j_i =  \langle\mathbf{h}_{i, k}, \wVec\rangle) = \buck.
\end{equation*}

The polarity function $\pol$ can be defined analogously as the inner product of a precomputed vector (abusing notation) $\mathbf{\pol}$ and $\wVec$,
\begin{equation*}
\polP{\wVec} \coloneqq \langle\mathbf{\pol}, \wVec\rangle.
\end{equation*}
Augmenting $\matrixH$ to $\matrixH'$ by adding $\mathbf{\pol}$ as an additional row gives
\begin{equation*}
	\matrixH' = \begin{pmatrix*}[l]
		h_{i, 0, 0} &\cdots &h_{i, 0, \numTup} \\
		\vdots & &\vdots \\
		h_{i, \lenB, 0} &\cdots &h_{i, \lenB, \numTup}\\
		s_{i, 0} &\cdots &s_{i, \numTup}
	\end{pmatrix*}.
\end{equation*}
Note that this also turns $\buck$ into a column vector of size $\lenB + 1$, whose last element is the polarity of the hashed world vector.