paper-BagRelationalPDBsAreHard/approx_alg.tex

%root: main.tex
%!TEX root=./main.tex

\section{$1 \pm \epsilon$ Approximation Algorithm}\label{sec:algo}

In~\Cref{sec:hard}, we showed that computing the expected multiplicity of a compressed representation of a bag polynomial for \ti (even just based on project-join queries) is unlikely to be possible in linear time (\Cref{thm:mult-p-hard-result}), even if all tuples have the same probability  (\Cref{th:single-p-hard}).
Given this, we now design an approximation algorithm for our problem that runs in {\em linear time}.
The following approximation algorithm applies to \bi, though our bounds are more meaningful for a non-trivial subclass of \bis that contains both \tis and the PDBench benchmark~\cite{pdbench}.
%it is then desirable to have an algorithm to approximate the multiplicity in linear time, which is what we describe next.

\subsection{Preliminaries and some more notation}

We now introduce useful definitions and notation related to polynomials.  We use the following polynomial as an example:
\begin{equation}
\label{eq:poly-eg}
\poly(X, Y) = 2X^2 + 3XY - 2Y^2.
\end{equation}

\begin{Definition}[Variables in a monomial]\label{def:vars}
 Given a monomial $v$, we use $\var(v)$ to denote the set of variables in $v$.
\end{Definition}
\noindent For example, the monomial $XY$ has $\var(XY)=\inset{X,Y}$.

\revision{
\begin{Definition}[Pure Expansion]
The pure expansion of a polynomial $\poly$ is formed by computing all products of sums occurring in $\poly$, without combining like monomials.  The pure expansion of $\poly$ generalizes~\Cref{def:smb} by allowing monomials $m_i = m_j$ for $i \neq j$.
\end{Definition}

}


\begin{Definition}[Expanded \revision{\circuit}]\label{def:expand-circuit}
%\revision{$\expansion{\circuit}$} is the reduced pure expansion of $\revision{\circuit}$.
The logical view of \revision{$\expansion{\circuit}$} is a list of tuples $(\monom, \coef)$, where $\monom$ is a set of variables and $\coef$ is in $\reals$.
\revision{$\expansion{\circuit}$} has the following recursive definition ($\circ$ is list concatenation).

$\expansion{\circuit} =
\begin{cases}
					\expansion{\circuit_\linput} \circ \expansion{\circuit_\rinput}		&\textbf{ if }\revision{\circuit.\type = \circplus}\\
					\left\{(\monom_\linput \cup \monom_\rinput, \coef_\linput \cdot \coef_\rinput) ~|~(\monom_\linput, \coef_\linput) \in \expansion{\circuit_\linput}, (\monom_\rinput, \coef_\rinput) \in \expansion{\circuit_\rinput}\right\} 		&\textbf{ if }\revision{\circuit.\type = \circmult}\\
					\elist{(\emptyset, \revision{\circuit.\val})}								&\textbf{ if }\revision{\circuit}.\type = \tnum\\
					\elist{(\{\revision{\circuit}.\val\}, 1)}									&\textbf{ if }\revision{\circuit}.\type = \var.\\
\end{cases}
$

\end{Definition}
\revision{
Note that, similar in spirit to~\Cref{def:reduced-bi-poly}, $\expansion{\circuit}$ reduces all variable exponents $e > 1$ to $e = 1$, though~\Cref{def:reduced-bi-poly} is more general.
}

In the following, we abuse notation and write $\monom$ to denote the monomial obtained as the products of the variables in the set.

\begin{Example}\label{example:expr-tree-T}
Consider the factorized representation $(X+ 2Y)(2X - Y)$ of the polynomial in~\Cref{eq:poly-eg}.
Its circuit $\etree$ is illustrated in Figure~\ref{fig:expr-tree-T}.
The pure expansion of the product is $2X^2 - XY + 4XY - 2Y^2$ and the $\expansion{\circuit}$ is $[(X, 2), (XY, -1), (XY, 4), (Y, -2)]$.
\end{Example}
$\expansion{\circuit}$ encodes the \emph{reduced} form of $\polyf\inparen{\circuit}$, decoupling each monomial into a set of variables $\monom$ and a real coefficient $\coef$.
Note, however, that unlike $\rpoly$, $\expansion{\circuit}$ does not need to be in SOP form.

\begin{Definition}[Positive \circuit]\label{def:positive-circuit}
For any circuit $\circuit$, the corresponding
{\em positive circuit}, denoted $\abs{\circuit}$, is obtained from $\circuit$ as follows. For each leaf node $\ell$ of $\circuit$ where $\ell.\type$ is $\tnum$, update $\ell.\val$ to $|\ell.\val|$.
\end{Definition}


Using the same factorization from~\Cref{example:expr-tree-T}, $\polyf(\abs{\circuit}) = (X + 2Y)(2X + Y) = 2X^2 +XY +4XY + 2Y^2 = 2X^2 + 5XY + 2Y^2$.  Note that this \textit{is not} the same as the polynomial from~\Cref{eq:poly-eg}.


\begin{Definition}[Evaluation]\label{def:exp-poly-eval}
Given a circuit $\circuit$ and a valuation $\vct{a} \in \mathbb{R}^\numvar$, we define the evaluation of $\circuit$ on $\vct{a}$ as $\circuit(\vct{a}) = \polyf(\circuit)(\vct{a})$.
\end{Definition}

\begin{Definition}[\size($\cdot$)]
The function \size~ takes a circuit $\circuit$ as input and outputs the number of gates (nodes) in \circuit.
\end{Definition}

\begin{Definition}[\depth($\cdot$)]
The function \depth~ has circuit $\circuit$ as input and outputs the number of levels in \circuit.
\end{Definition}

\begin{Definition}[$\degree(\cdot)$]
The function $\degree(\cdot)$ takes a circuit \circuit as input and outputs the degree of $\polyf(\abs{\circuit})$.
\end{Definition}

\begin{Definition}[Subcircuit]
A subcircuit of a circuit $\circuit$ is a circuit \subcircuit such that \subcircuit is a DAG \textit{subgraph} of the DAG representing \circuit.  Moreover, \subcircuit has exactly one sink gate \gate.
\end{Definition}

\subsection{Our main result}
In the subsequent subsections we will prove the following theorem.

\begin{Theorem}\label{lem:approx-alg}
Let \circuit be a circuit for a UCQ over \bi and define $\poly(\vct{X})=\polyf(\circuit)$ and let $k=\degree(\circuit)$.
Then an estimate $\mathcal{E}$ of $\rpoly(\prob_1,\ldots, \prob_\numvar)$ can be computed in time
\[O\left(\size(\circuit) + \frac{\log{\frac{1}{\conf}}\cdot \abs{\circuit}^2(1,\ldots, 1)\cdot  k\cdot \log{k} \cdot \depth(\circuit)}{\inparen{\error'}^2\cdot\rpoly^2(\prob_1,\ldots, \prob_\numvar)}\right)\]
such that
\begin{equation}
\label{eq:approx-algo-bound}
\probOf\left(\left|\mathcal{E} - \rpoly(\prob_1,\dots,\prob_\numvar)\right|> \error' \cdot \rpoly(\prob_1,\dots,\prob_\numvar)\right) \leq \conf.
\end{equation}
\end{Theorem}

\noindent The proof of~\Cref{lem:approx-alg} (which relies on~\Cref{lem:one-pass} and~\Cref{lem:sample}) can be found in~\Cref{sec:proof-lem-approx-alg}.  The proofs of the referenced lemmas can be found in~\Cref{sec:proof-one-pass} and~\Cref{sec:proof-sample-monom}, respectively.


To get linear runtime results from~\Cref{lem:approx-alg}, we will need to define another parameter modeling the (weighted) number of monomials in $\expansion{\circuit}$ to be `canceled' when it is modded with $\mathcal{B}$ (\Cref{def:mod-set-polys}):
\begin{Definition}[Parameter $\gamma$]\label{def:param-gamma}
Given a circuit $\circuit$, define
\[\gamma(\circuit)=\frac{\sum_{(\monom, \coef)\in \expansion{\circuit}} \abs{\coef}\cdot \indicator{\monom\mod{\mathcal{B}}\equiv 0}}{\abs{\circuit}(1,\ldots, 1)}\]
\end{Definition}

\noindent We next present a couple of corollaries of~\Cref{lem:approx-alg}.
\begin{Corollary}
\label{cor:approx-algo-const-p}
Let $\poly(\vct{X})$ be as in~\Cref{lem:approx-alg} and let $\gamma=\gamma(\circuit)$. Further let it be the case that $\prob_i\ge \prob_0$ for all $i\in[\numvar]$. Then an estimate $\mathcal{E}$  of $\rpoly(\prob_1,\ldots, \prob_\numvar)$ satisfying~\Cref{eq:approx-algo-bound} can be computed in time
\[O\left(\size(\circuit) + \frac{\log{\frac{1}{\conf}}\cdot k\cdot \log{k} \cdot \depth(\circuit)}{\inparen{\error'}^2\cdot(1-\gamma)^2\cdot \prob_0^{2k}}\right).\]
In particular, if $\prob_0>0$ and $\gamma<1$ are absolute constants then the above runtime simplifies to $O_k\left(\frac 1{\inparen{\error'}^2}\cdot\size(\circuit)\cdot \log{\frac{1}{\conf}}\right)$.
\end{Corollary}

The proof for~\Cref{cor:approx-algo-const-p} can be seen in~\Cref{sec:proofs-approx-alg}.
The restriction on $\gamma$ is satisfied by any \ti (where $\gamma=0$) as well as for all three queries of the PDBench \bi benchmark (\Cref{app:subsec:experiment} shows experimentally that $\gamma$ is negligible in practice for these queries).
We also observe that (i) tuple presence is independent across blocks, so the corresponding probabilities (and hence $\prob_0$) are independent of the number of blocks, and (ii) \bis model uncertain attributes, so block size (and hence $\gamma$) is a function of the ``messiness'' of a dataset, rather than its size.
Thus, we expect the corollary to hold in general.

\subsection{Approximating $\rpoly$}

The algorithm to prove~\Cref{lem:approx-alg} follows from the following observation. Given a query polynomial $\poly(\vct{X})=\polyf(\circuit)$ for circuit \circuit over $\bi$, we can exactly represent $\rpoly(\vct{X})$ as follows:
\begin{equation}
\label{eq:tilde-Q-bi}
\rpoly\inparen{X_1,\dots,X_\numvar}=\hspace*{-1mm}\sum_{(\monom,\coef)\in \expansion{\circuit}} \hspace*{-2mm} \indicator{\monom\mod{\mathcal{B}}\not\equiv 0}\cdot \coef\cdot\hspace*{-2mm}\prod_{X_i\in \var\inparen{\monom}}\hspace*{-2mm} X_i
\end{equation}
Given the above, the algorithm is a sampling based algorithm for the above sum: we sample $(\monom,\coef)\in \expansion{\circuit}$ with probability proportional\footnote{We could have also uniformly sampled from $\expansion{\circuit}$ but this gives better parameters.} to $\abs{\coef}$ and compute $Y=\indicator{\monom\mod{\mathcal{B}}\not\equiv 0}\cdot \prod_{X_i\in \var\inparen{\monom}} p_i$. Taking $\numsamp$ samples and computing the average of $Y$ gives us our final estimate.
The number of samples is computed by (see \Cref{app:subsec-th-mon-samp}):
\begin{equation*}
2\exp{\left(-\frac{\samplesize\error^2}{2}\right)}\leq \conf \implies\samplesize \geq \frac{2\log{\frac{2}{\conf}}}{\error^2}.
\end{equation*}

To summarize, \approxq modifies \circuit with a call to \onepass.  It then samples from \circuit $\numsamp$ times and uses that information to approximate $\rpoly$.

\begin{algorithm}[t]
	\caption{$\approxq(\circuit, \vct{p}, \conf, \error)$}
	\label{alg:mon-sam}
	\begin{algorithmic}[1]
		\Require \circuit: Circuit
		\Require $\vct{p} = (\prob_1,\ldots, \prob_\numvar)$ $\in [0, 1]^\numvar$
		\Require $\conf$ $\in [0, 1]$
		\Require $\error$ $\in [0, 1]$
		\Ensure \vari{acc} $\in \mathbb{R}$

		\State $\accum \gets 0$\label{alg:mon-sam-global1}
		\State $\numsamp \gets \ceil{\frac{2 \log{\frac{2}{\conf}}}{\error^2}}$\label{alg:mon-sam-global2}
		\State $(\circuit_\vari{mod}, \vari{size}) \gets $ \onepass($\circuit$)\label{alg:mon-sam-onepass}\Comment{$\onepass$ is ~\Cref{alg:one-pass-iter}}
	
		\For{$\vari{i} \in 1 \text{ to }\numsamp$}\label{alg:sampling-loop}\Comment{Perform the required number of samples}
			\State $(\vari{M}, \vari{sgn}_\vari{i}) \gets  $ \sampmon($\circuit_\vari{mod}$)\label{alg:mon-sam-sample}
			\State\Comment{\sampmon \; is ~\Cref{alg:sample}}
			\If{$\vari{M}$ has at most one variable from each block}\label{alg:check-duplicate-block}
				\State $\vari{Y}_\vari{i} \gets \prod_{X_j\in\var\inparen{\vari{M}}}p_j$\label{alg:mon-sam-assign1}
				\State $\vari{Y}_\vari{i} \gets \vari{Y}_\vari{i} \times\; \vari{sgn}_\vari{i}$\label{alg:mon-sam-product}
			\State $\accum \gets \accum + \vari{Y}_\vari{i}$\Comment{Store the sum over all samples}\label{alg:mon-sam-add}
			\EndIf
		\EndFor

		\State  $\vari{acc} \gets \vari{acc} \times \frac{\vari{size}}{\numsamp}$\label{alg:mon-sam-global3}
		\State \Return \vari{acc}
	\end{algorithmic}
\end{algorithm}

\subsubsection{Correctness}

In order to prove~\Cref{lem:approx-alg}, we will need to argue the correctness of~\Cref{alg:mon-sam}. Before we formally do that,
we first state the lemmas that summarize the relevant properties of $\onepass$ and $\sampmon$, the auxiliary algorithms on which ~\Cref{alg:mon-sam} relies.  


\begin{Lemma}\label{lem:one-pass}
The $\onepass$ function completes in $O(\size(\circuit) \cdot \frac{\log{\abs{\circuit}(1,\ldots, 1)}}{\log{N}})$ time, where $N = \size(\circuit)$.\footnote{In the appendix we give a sufficient condition under which arithmetic computations involving $\abs{\circuit}(1,\ldots, 1)$ indeed take $O(1)$ time.  Most notably, WCOJ and FAQ results are not affected by the general runtime of arithmetic computations, a point which we also address in the appendix.}  $\onepass$ guarantees two post-conditions:  First, for each subcircuit $\vari{S}$ of $\circuit$, we have that $\vari{S}.\vari{partial}$ is set to $\abs{\vari{S}}(1,\ldots, 1)$.  Second, when $\vari{S}.\type  = \circplus$, \subcircuit.\lwght $= \frac{\abs{\subcircuit_\linput}(1,\ldots, 1)}{\abs{\subcircuit}(1,\ldots, 1)}$ and likewise for \subcircuit.\rwght.
\end{Lemma}
To prove correctness of~\Cref{alg:mon-sam}, we only use the following fact that follows from the above lemma: for the modified circuit $\circuit_{\vari{mod}}$, we have $\circuit_{\vari{mod}}.\vari{partial}=\abs{\circuit}(1,\dots,1)$.

\begin{Lemma}\label{lem:sample}
The function $\sampmon$ completes in $O(\log{k} \cdot k \cdot \depth(\circuit)\cdot\frac{\log{\abs{\circuit}(1,\ldots, 1)}}{\log{\size(\circuit)}})$ time\footnote{Note that the same sufficient condition on \circuit to guarantee $O(1)$ arithmetic computations applies here, and when this condition is met, the runtime loses the $\frac{\log{\abs{\circuit}(1,\ldots, 1)}}{\log{\size(\circuit)}}$ factor.}, where $k = \degree(\circuit)$.  Upon completion, every $\left(\monom, sign(\coef)\right)\in \expansion{\abs{\circuit}}$ is returned with probability $\frac{|\coef|}{\abs{\circuit}(1,\ldots, 1)}$.
\end{Lemma}

Armed with the above two lemmas, we are ready to argue the following result (proof in~\Cref{sec:proofs-approx-alg}):
\begin{Theorem}\label{lem:mon-samp}
For any $\circuit$ with $\degree(\circuit) = k$, \Cref{alg:mon-sam} outputs an estimate $\vari{acc}$ of $\rpoly(\prob_1,\ldots, \prob_\numvar)$ such that
\[\probOf\left(\left|\vari{acc} - \rpoly(\prob_1,\ldots, \prob_\numvar)\right|> \error \cdot \abs{\circuit}(1,\ldots, 1)\right) \leq \conf,\]
 in $O\left(\size(\circuit)\right.$ $+$ $\left.\left(\frac{\log{\frac{1}{\conf}}}{\error^2} \cdot k \cdot\log{k} \cdot \depth(\circuit)\right)\right)$ time.
\end{Theorem}


\subsection{\onepass\ Algorithm}
\label{sec:onepass}

The evaluation of $\abs{\circuit}(1,\ldots, 1)$ can be defined recursively, as follows (where $\circuit_\linput$ and $\circuit_\rinput$ are the `left' and `right' inputs of $\circuit$ if they exist):

{\small
\begin{align}
\label{eq:T-all-ones}
\abs{\circuit}(1,\ldots, 1) = \begin{cases}
						\abs{\circuit_\linput}(1,\ldots, 1) \cdot \abs{\circuit_\rinput}(1,\ldots, 1)	&\textbf{if }\circuit.\type = \revision{\circmult}\\
						\abs{\circuit_\linput}(1,\ldots, 1) + \abs{\circuit_\rinput}(1,\ldots, 1)		&\textbf{if }\circuit.\type = \revision{\circplus} \\
						 |\circuit.\val|											&\textbf{if }\circuit.\type = \tnum\\
						1													&\textbf{if }\circuit.\type = \var.
					\end{cases}
\end{align}
}

It turns out that for the proof of~\Cref{lem:sample}, we need to argue that when $\circuit.\type = +$, we indeed have
\begin{align}
\label{eq:T-weights}
\circuit.\lwght &\gets \frac{\abs{\circuit_\linput}(1,\ldots, 1)}{\abs{\circuit_\linput}(1,\ldots, 1) + \abs{\circuit_\rinput}(1,\ldots, 1)};\\
\circuit.\rwght &\gets \frac{\abs{\circuit_\rinput}(1,\ldots, 1)}{\abs{\circuit_\linput}(1,\ldots, 1)+ \abs{\circuit_\rinput}(1,\ldots, 1)}
\end{align}

\noindent \onepass\ (Algorithm~\ref{alg:one-pass-iter} in \Cref{sec:proofs-approx-alg}) iteratively visits each gate one time according to the topological ordering of \circuit, annotating the \lwght, \rwght, and \prt variables of each node according to the definitions above.  Lemma~\ref{lem:one-pass} is also proved in~\Cref{sec:proofs-approx-alg}.

\subsection{\sampmon\ Algorithm}
\label{sec:samplemonomial}

A naive (slow) implementation of \sampmon\ would first compute $\expansion{\circuit}$ and then sample from it.
Instead, \Cref{alg:sample} selects a monomial from $\expansion{\circuit}$ by top-down traversal.
For a parent $+$ gate, the input to be visited is sampled from the weighted distribution precomputed by \onepass.
When a parent $\times$ node is visited, both inputs are visited.
The algorithm computes two properties: the set of all variable leaf nodes visited, and the product of signs of visited coefficient leaf nodes.

We will assume the TreeSet data structure to maintain sets with logarithmic time insertion and linear time traversal of its elements.

$\sampmon$ is given in \Cref{alg:sample}, and a proof of its correctness (via \Cref{lem:sample}) is provided in \Cref{sec:proofs-approx-alg}.

\begin{algorithm}[t]
	\caption{\sampmon(\circuit)}
	\label{alg:sample}
	\begin{algorithmic}[1]
		\revision{\Require \circuit: Circuit}
		\Ensure \vari{vars}: TreeSet
		\Ensure \vari{sgn} $\in \{-1, 1\}$
		\Comment{\Cref{alg:one-pass-iter} should have been run before this one} % algorithm ~\ref{alg:sample}}
		\State $\vari{vars} \gets \emptyset$ \label{alg:sample-global1}
		\If{$\circuit.\type = +$}\Comment{Sample at every $+$ node}
			\State $\circuit_{\vari{samp}} \gets$ Sample from left input ($\circuit_{\linput}$) and right input ($\circuit_{\rinput}$) w.p. $\circuit.\lwght$ and $\circuit.\rwght$. \label{alg:sample-plus-bsamp} \Comment{Each call to \sampmon uses fresh randomness}
			\State $(\vari{v}, \vari{s}) \gets \sampmon(\circuit_{\vari{samp}})$\label{alg:sample-plus-traversal}
			\State $\Return ~(\vari{v}, \vari{s})$
		\ElsIf{$\circuit.\type = \times$}\Comment{Multiply the sampled values of all inputs}
			\State $\vari{sgn} \gets 1$\label{alg:sample-global2}
			\For {$input$ in $\circuit.\vari{input}$}\label{alg:sample-times-for-loop}
				\State $(\vari{v}, \vari{s}) \gets \sampmon(input)$
				\State $\vari{vars} \gets \vari{vars} \cup \vari{v}$\label{alg:sample-times-union}
				\State $\vari{sgn} \gets \vari{sgn} \times \vari{s}$\label{alg:sample-times-product}
			\EndFor
			\State $\Return ~(\vari{vars}, \vari{sgn})$
		\ElsIf{$\circuit.\type = \tnum$}\Comment{The leaf is a coefficient}
			%\State $\vari{sgn} \gets \vari{sgn} \times sign(\circuit.\val)$
			\State $\Return ~\left(\{\}, sign(\circuit.\val)\right)$\label{alg:sample-num-return}
		\ElsIf{$\circuit.\type = \var$}
			%\State $\vari{vars} \gets \vari{vars} \; \cup \; \{\;\circuit.\val\;\}\label{alg:sample-var-union}$\Comment{Add the variable to the set}
			\State $\Return~\left(\{\circuit.\val\}, 1\right)	$\label{alg:sample-var-return}
		\EndIf
	\end{algorithmic}
\end{algorithm}

% \subsection{Experimental results}
% \label{sec:experiments}
% We conducted an experiment running modified TPCH queries over uncertain data generated by pdbench~\cite{pdbench}, both of which (data and queries) represent what is typically encountered in practice.  Queries were run two times, once filtering $\bi$ cancellations, and then second not filtering the cancellations.  The purpose of this was to determine an indication for how many $\bi$ cancellations occur in practice.  Details and results can be found in~.

%\AR{Experimental stuff about \bi should go in here}
%%%%%%%%%%%%%%%%%%%%%%%

%%% Local Variables:
%%% mode: latex
%%% TeX-master: "main"
%%% End: