paper-BagRelationalPDBsAreHard/approx_alg.tex

%root: main.tex
%!TEX root=./main.tex

\section{$1 \pm \epsilon$ Approximation Algorithm}\label{sec:algo}
We showed in~\Cref{sec:hard} that a runtime of $\bigO{\qruntime{\optquery{\query},\tupset,\bound}}$ cannot be acheived for~\Cref{prob:bag-pdb-poly-expected}.  In light of this, we desire to produce an approximation algorithm that runs in time $\bigO{\qruntime{\optquery{\query},\tupset,\bound}}$.  We do this by showing the result via circuits,
such that our $1\pm\epsilon$ approximation algorithm for this problem runs in $\bigO{\abs{\circuit}}$ for a very broad class of circuits, (thus solving~\Cref{prob:intro-stmt});  see the discussion after \Cref{lem:val-ub} for more.
The following approximation algorithm applies to bag query semantics over both
\abbrCTIDB lineage polynomials and general \abbrBIDB lineage polynomials in practice. %, where for the latter we note that a $1$-\abbrTIDB is equivalently a \abbrBIDB (blocks are size $1$).  
Our experimental results (see~\Cref{app:subsec:experiment}), which use queries from the PDBench benchmark~\cite{pdbench} support the notion that our bounds hold for general \abbrBIDB in practice.
%
%
Proofs and pseudocode for all formal statements and algorithms
  are in \Cref{sec:proofs-approx-alg}.

\subsection{Preliminaries and some more notation}

For notational convenience, in this section we will assume that \dbbaseName $\tupset'=[n]$.
We now introduce definitions  related to circuits and polynomials that we will need to state our upper bound results. First, we introduce the expansion $\expansion{\circuit}$ of circuit $\circuit$ which 
is used in our auxiliary algorithm \sampmon for sampling monomials when computing the approximation.  

\begin{Definition}[$\expansion{\circuit}$]\label{def:expand-circuit}
For a circuit $\circuit$, we define $\expansion{\circuit}$ as a list of tuples $(\monom, \coef)$, where $\monom$ is a set of variables and $\coef \in \domN$.
$\expansion{\circuit}$ has the following recursive definition ($\circ$ is list concatenation).\\
$\expansion{\circuit} =
\begin{cases}
					\expansion{\circuit_\linput} \circ \expansion{\circuit_\rinput}		&\textbf{ if }\circuit.\type = \circplus\\
					\left\{(\monom_\linput \cup \monom_\rinput, \coef_\linput \cdot \coef_\rinput) \right.\\\left.~|~(\monom_\linput, \coef_\linput) \in \expansion{\circuit_\linput}, (\monom_\rinput, \coef_\rinput) \in \expansion{\circuit_\rinput}\right\} 		&\textbf{ if }\circuit.\type = \circmult\\
					\elist{(\emptyset, \circuit.\val)}								&\textbf{ if }\circuit.\type = \tnum\\
					\elist{(\circuit.\val, 1)}									&\textbf{ if }\circuit.\type = \var.\\
\end{cases}
$
\end{Definition}
Later on, we will denote the monomial composed of the variables in $\monom$ as $\encMon$.  As an example of $\expansion{\circuit}$, consider $\circuit$ illustrated in \Cref{fig:circuit}.  $\expansion{\circuit}$ is then $[(X, 2), (XY, -1), (XY, 4), (Y, -2)]$. This helps us redefine $\rpoly$ (see \Cref{eq:tilde-Q-bi}) in a way that makes our algorithm more transparent.

Next, we present a sequence of definitions that will be useful for our algorithm and its analysis.
\begin{Definition}[$\abs{\circuit}$]\label{def:positive-circuit}
For any circuit $\circuit$, the corresponding
{\em positive circuit}, denoted $\abs{\circuit}$, is obtained from $\circuit$ as follows. For each leaf node $\ell$ of $\circuit$ where $\ell.\type$ is $\tnum$, update $\ell.\vari{value}$ to $|\ell.\vari{value}|$.
\end{Definition}
We will overload notation and use $\abs{\circuit}\inparen{\vct{X}}$ to mean $\polyf\inparen{\abs{\circuit}}$.
Conveniently, $\abs{\circuit}\inparen{1,\ldots,1}$ gives us $\sum\limits_{\inparen{\monom, \coef} \in \expansion{\circuit}}\abs{\coef}$.

\begin{Definition}[\size($\cdot$), \depth$\inparen{\cdot}$]\label{def:size-depth}
The functions \size and \depth output the number of gates and levels respectively for input \circuit.
\end{Definition}

\begin{Definition}[$\degree(\cdot)$]\label{def:degree}\footnote{Note that the degree of $\polyf(\abs{\circuit})$ is always upper bounded by $\degree(\circuit)$ and the latter can be strictly larger (e.g. consider the case when $\circuit$ multiplies two copies of the constant $1$-- here we have $\deg(\circuit)=1$ but degree of $\polyf(\abs{\circuit})$ is $0$).}
$\degree(\circuit)$ is defined recursively:
\[\degree(\circuit)=
\begin{cases}
\max(\degree(\circuit_\linput),\degree(\circuit_\rinput)) & \text{ if }\circuit.\type=+\\
\degree(\circuit_\linput) + \degree(\circuit_\rinput)+1 &\text{ if }\circuit.\type=\times\\
1 & \text{ if }\circuit.\type = \var\\
0 & \text{otherwise}.
\end{cases}
\]
\end{Definition}

\noindent
We use the following notation for integer multiplication complexity:
\begin{Definition}[$\multc{\cdot}{\cdot}$]\footnote{We note that when doing arithmetic operations on the RAM model for input of size $N$, we have that $\multc{O(\log{N})}{O(\log{N})}=O(1)$.}
%More generally we have $\multc{N}{O(\log{N})}=O(N\log{N}\log\log{N})$.}
In a RAM model of word size of $W$-bits, $\multc{M}{W}$ denotes the complexity of multiplying two integers represented with $M$-bits. (For input of size $N$, we make the standard assumption $W=O(\log{N})$.)
\end{Definition}

Finally, to get linear runtime results, we will need to define another parameter modeling the (weighted) number of monomials in $\expansion{\circuit}$
that need to be `canceled' when monomials with dependent variables are removed (\Cref{subsec:one-bidb}).  
Let $\isInd{\cdot}$ be a boolean function returning true if monomial $\encMon$ is composed of independent variables and false otherwise; further, let $\indicator{\theta}$ also be a boolean function returning true if $\theta$ evaluates to true.
\begin{Definition}[Parameter $\gamma$]\label{def:param-gamma}
Given \abbrOneBIDB circuit $\circuit$, let
\[\gamma(\circuit)=\frac{\sum_{(\monom, \coef)\in \expansion{\circuit}} \abs{\coef}\cdot \indicator{\neg\isInd{\encMon}} }
{\abs{\circuit}(1,\ldots, 1)}.\]
\end{Definition}
\subsection{Our main result}\label{sec:algo:sub:main-result}
 In what follows, we solve~\Cref{prob:intro-stmt} for any fixed $\epsilon > 0$.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\mypar{Algorithm Idea}
Our approximation algorithm (\approxq) % pseudo code in \Cref{sec:proof-lem-approx-alg})
is based on the following observation.
Given a lineage polynomial $\poly(\vct{X})=\polyf(\circuit)$ for circuit \circuit over 
\abbrOneBIDB (recall that all \abbrCTIDB can be reduced to \abbrOneBIDB by~\Cref{prop:ctidb-reduct}), we have: 

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{equation}
\label{eq:tilde-Q-bi}
\rpoly\inparen{p_1,\dots,p_\numvar}=%\hspace*{-1mm}
\sum_{(\monom,\coef)\in \expansion{\circuit}} 
\indicator{\isInd{\encMon}
}\cdot \coef\cdot\prod_{X_i\in \monom} p_i.
\end{equation}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

Given the above, the algorithm is a sampling based algorithm for the above sum: we sample (via \sampmon) $(\monom,\coef)\in \expansion{\circuit}$ with probability proportional
 to $\abs{\coef}$ and compute $\vari{Y}=\indicator{\isInd{\encMon}}
 \cdot \prod_{X_i\in \monom} p_i$. 
Repeating the sampling an appropriate number of times
and computing the average of $\vari{Y}$ gives us our final estimate. 

We illustrate \sampmon  (\Cref{alg:sample}), using the circuit $\circuit$ in \Cref{fig:circuit}. As a pre-processing step, \onepass (\Cref{alg:one-pass-iter}) recursively (for each sub-circuit) computes $\abs{\circuit}\inparen{1,\ldots, 1}$ in the obvious manner.
%for any subcircuit whose sink is gate \circuit.  
The \textcolor{gray}{gray} values in~\Cref{fig:circuit} represent the value $\abs{\circuit'}\inparen{1,\ldots, 1}$ for each sub-circuit $\circuit'$ rooted at the corresponding node. E.g. in the bottom right $\circmult$ (\textcolor{blue}{blue}) gate, the value is $1\times 1=1$ (where the left child has $Y\gets 1$ and the right child has $\abs{-1}=1$).  
%The probability of sampling the left child of the red gate is then the computed sum of its left child divided by the red gate's computed sum, $\frac{1}{3}$.% = \abs{\circuit_\linput}\inparen{1,\ldots, 1} + \abs{\circuit_\rinput}\inparen{1,\ldots, 1}$, the sum of its children's values.
% visits each gate \circuit exactly once, computing $\abs{\circuit}\inparen{1,\ldots, 1}$.  %for each gate \circuit.  
%If we consider the leftmost source gates, \onepass computes $\abs{\circuit}\inparen{1,\ldots,1} = 1$ for $\circuit.\val = X$ and $\abs{\circuit}\inparen{1,\ldots, 1} = 2$ for $\circuit.\val = 2$. For the leftmost $\circmult$ gate, \onepass computes $\abs{\circuit}\inparen{1,\ldots, 1} = 2\circmult 1$, i.e. $\abs{\circuit_\linput}\inparen{1,\ldots, 1} \times \abs{\circuit_\rinput}\inparen{1,\ldots, 1}$ for children $\circuit_\linput$ and $\circuit_\rinput$.  A level higher, the leftmost $\circplus$ gate recursively adds the values of its two children deriving $\abs{\circuit}\inparen{1,\ldots, 1} = 2 \circplus 1$, while using the expression $\frac{\abs{\circuit_i}\inparen{1,\ldots, 1}}{\abs{\circuit}\inparen{1,\ldots, 1}}$ for $i\in\inset{\linput, \rinput}$ to simultaneously compute the weights $\frac{1}{3}$ and $\frac{2}{3}$ for its children.  The final sum value is then computed in similar fashion.  % yielding $\abs{\circuit}\inparen{1,\ldots, 1} = 3 \circmult 3 = 9$.
%Given the computed values of \onepass, \sampmon picks a sampling path by traversing both children of a $\circmult$ gate and randomly choosing a child from a $\circplus$ gate according to the weight $\frac{\abs{\circuit_i}\inparen{1,\ldots, 1}}{\abs{\circuit}\inparen{1,\ldots, 1}}$ for $i\in\inset{\linput, \rinput}$.  
%then uses the weights provided by \onepass to randomly select a monomial from $\expansion{\circuit}$.  

  We now consider a partial run of \sampmon that samples  $\inparen{XY, -1}$ in~\Cref{fig:circuit}.  It recursively traverses \emph{both} children of the sink $\circmult$ gate.  For the  \textcolor{red}{red} and \textcolor{green}{green} children, which are both  $\circplus$ gates, we randomly choose one of their children. Specifically \sampmon then randomly picks the right child of the \textcolor{green}{green} $\circplus$ gate (i.e the \textcolor{blue}{blue} $\circmult$ gate that represents $-Y$ and is computed by recursing on both children of the \textcolor{blue}{blue} $\circmult$ gate) with probability of $\frac{1}{3}$ (where the numerator and denominator are the values computed by \onepass for the \textcolor{blue}{blue} $\circmult$ and \textcolor{green}{green} $\circplus$ gates respectively). Similarly at the left \textcolor{red}{red} $\circplus$ gate we sample the left child (representing $X$ and is computed by recursing to the leaf node $X$) with  probability $\frac{1}{3}$.  Note that the probability for choosing $\inparen{XY, -1}$ overall  is $\frac{1}{3}\cdot \frac{1}{3}=\frac{1}{9}$, which is indeed the ratio of the coefficient of $\inparen{XY, -1}$ to the sum of all coefficients in $\abs{\circuit}$, as needed.  %For the recursive call on the red gate, $\inparen{X, 1}$ is returned, while the purple gate recursively visits both children of the sampled $\circmult$ gate, returning $\inparen{Y, 1}$.  %Multiplying $-1 \circmult XY$, concludes the random sampling of monomial $-XY$.  Suppose \sampmon also randomly samples $X$ and $-Y$ from $\rpoly$ in a call to \approxq.  To estimate $\rpoly\inparen{\vct{\prob}}$, \approxq computes $\prob_X - \prob_X\prob_Y - \prob_Y$ and scales the accumulation accordingly.
%such that a source gate \circuit has $\abs{\circuit}\inparen{1,\ldots, 1} = \circuit.\val$ when \circuit.\type $=$ \num and $\abs{\circuit}\inparen{1,\ldots,1} = 1$ otherwise.  For every gate \circuit, \onepass computes $\abs{\circuit}\inparen{1,\ldots, 1}$ as seen in the lighter font of~\Cref{fig:circuit}.  \onepass further weights each child $\circuit_i$ for $i\in\inset{\linput, \rinput}$, by the expression $\frac{\abs{\circuit_i}\inparen{1,\ldots, 1}}{\abs{\circuit}\inparen{1,\ldots, 1}}$.  These weight are the basis for the sampling performed by \sampmon. 
 All algorithm details, including those for \approxq (\Cref{alg:mon-sam}), are in \Cref{sec:proofs-approx-alg}.
%%%%%%%%%%%%%%%%%%%%%%%


\mypar{Runtime analysis} We can argue the following runtime for the \approxq (which solves \Cref{prob:intro-stmt}):
\begin{Theorem}
\label{cor:approx-algo-const-p}
Let \circuit be an arbitrary \emph{\abbrOneBIDB} circuit, define $\poly(\vct{X})=\polyf(\circuit)$, let $k=\degree(\circuit)$, and let $\gamma=\gamma(\circuit)$. Further let it be the case that $\prob_i\ge \prob_0$ for all $i\in[\numvar]$. Then for all $\epsilon'>0$, an estimate $\mathcal{E}$  of $\rpoly(\prob_1,\ldots, \prob_\numvar)$
satisfying
\begin{equation}
\label{eq:approx-algo-bound-main}
\probOf\left(\left|\mathcal{E} - \rpoly(\prob_1,\dots,\prob_\numvar)\right|> \error' \cdot \rpoly(\prob_1,\dots,\prob_\numvar)\right) \leq \conf
\end{equation}
 can be computed in time
\begin{footnotesize}
\begin{equation}
\label{eq:approx-algo-runtime}
O\left(\left(\size(\circuit) + \frac{\log{\frac{1}{\conf}}\cdot k\cdot \log{k} \cdot \depth(\circuit))}{\inparen{\error'}^2\cdot(1-\gamma)^2\cdot \prob_0^{2k}}\right)\cdot\multc{\log\left(\abs{\circuit}(1,\ldots, 1)\right)}{\log\left(\size(\circuit)\right)}\right).
\end{equation}
\end{footnotesize}
In particular, if $\prob_0>0$ and $\gamma<1$ are absolute constants then the above runtime simplifies to $$O_k\left(\left(\frac 1{\inparen{\error'}^2}\cdot\size(\circuit)\cdot \log{\frac{1}{\conf}}\right)\cdot\multc{\log\left(\abs{\circuit}(1,\ldots, 1)\right)}{\log\left(\size(\circuit)\right)}\right).$$
\end{Theorem}

The restriction on $\gamma$ is satisfied by any
$1$-\abbrTIDB (where $\gamma=0$ in the equivalent $1$-\abbrBIDB of~\Cref{prop:ctidb-reduct})
as well as for all three queries of the PDBench \abbrBIDB benchmark (\Cref{app:subsec:experiment}). 

%We prove \Cref{cor:approx-algo-punchline-ctidb} from \Cref{eq:approx-algo-runtime} via the following sequence of arguments.
Next,  by \Cref{prop:circuit-depth} and \Cref{lem:circ-model-runtime} for any $\raPlus$ query $\query$, there exists a circuit $\circuit^*$ for $\apolyqdt$ such that $\depth(\circuit^*)\le O_{|Q|}(\log{n})$ and $\size(\circuit^*)\le O_k\inparen{\qruntime{\query, \tupset, \bound}}$. Then, we note that \Cref{prop:ctidb-reduct} gives us an equivalent $\circuit$ from $\circuit^*$ with essentially the same size/depth and has $\gamma(\circuit)\le 1-c^{-\Omega(k)}$ (\Cref{lem:ctidb-gamma}). Finally, we argue (using the fact $\circuit$ has low depth) that $\abs{\circuit}(1,\dots,1)\le \size(\circuit)^{O_k(1)}$ (\Cref{lem:val-ub}).
%Next, we note that the above result %along with \Cref{lem:ctidb-gamma}
The above sequence of arguments results in the following result (which answers \Cref{prob:big-o-joint-steps} in the affirmative):
\begin{Corollary}
\label{cor:approx-algo-punchline-ctidb}
Let $\query$ be an $\raPlus$ query and $\pdb$ be a \abbrCTIDB with $p_0>0$, where $p_0$ as in \Cref{cor:approx-algo-const-p}, is an absolute constant. Let $\poly(\vct{X})=\apolyqdt$ for any result tuple $\tup$ with $\deg(\poly)=k$. Then one can compute an approximation satisfying \Cref{eq:approx-algo-bound-main} in time $O_{k,|Q|,\error',\conf,\bound}\inparen{\qruntime{\optquery{\query}, \tupset, \bound}}$ (given $\query,\tupset$ and $\prob_{\tup, j}$ for each $\tup\in\tupset,~j\in\pbox{\bound}$ that defines $\bpd$).
\end{Corollary}

If we want to approximate the expected multiplicities of all $Z=O(n^k)$ result tuples $\tup$ simultaneously, we just need to run the above result with $\conf$ replaced by $\frac \conf Z$, which increases the runtime by a  factor of $O_k(\log{n})$.


%%%% Commenting out he details below-- also copied over into the appendix
\iffalse
Further, we can also argue the following result.%, recalling from~\Cref{sec:intro} for \abbrCTIDB $\pdb = \inparen{\worlds, \bpd}$, where $\tupset$ is the set of possible tuples across all possible worlds of $\pdb$.

\begin{Lemma}
\label{lem:ctidb-gamma}
Given $\raPlus$ query $\query$ and \abbrCTIDB $\pdb$, let \circuit be the circuit computed by $\query\inparen{\tupset}$.  Then, for the reduced \abbrOneBIDB $\pdb'$ there exists an equivalent circuit \circuit' obtained from $\query\inparen{\tupset'}$, such that $\gamma\inparen{\circuit'}\leq 1 - \bound^{-\inparen{k-1}}$ with $\size\inparen{\circuit'} \leq \size\inparen{\circuit} + \bigO{\numvar\bound}$ 
 and $\depth\inparen{\circuit'} = \depth\inparen{\circuit} + \bigO{\log{\bound}}$.
\end{Lemma}

We briefly connect the runtime in \Cref{eq:approx-algo-runtime} to the algorithm outline earlier (where we ignore the dependence on $\multc{\cdot}{\cdot}$, which is needed to handle the cost of arithmetic operations over integers). The $\size(\circuit)$ comes from the time taken to run \onepass once (\onepass essentially computes $\abs{\circuit}(1,\ldots, 1)$ using the natural circuit evaluation algorithm on $\circuit$). We make $\frac{\log{\frac{1}{\conf}}}{\inparen{\error'}^2\cdot(1-\gamma)^2\cdot \prob_0^{2k}}$ many calls to \sampmon (each of which essentially traces $O(k)$ random sink to source paths in $\circuit$ all of which by definition have length at most $\depth(\circuit)$).

Finally, we address the $\multc{\log\left(\abs{\circuit}(1,\ldots, 1)\right)}{\log\left(\size(\circuit)\right)}$ term in the runtime. 
\begin{Lemma}
\label{lem:val-ub}
For any \emph{\abbrOneBIDB} circuit $\circuit$ with $\degree(\circuit)=k$, we have
$\abs{\circuit}(1,\ldots, 1)\le 2^{2^k\cdot \depth(\circuit)}.$
Further, if $\circuit$ is a tree, then we have $\abs{\circuit}(1,\ldots, 1)\le  \size(\circuit)^{O(k)}.$
\end{Lemma}

Note that the above implies that with the assumption $\prob_0>0$ and $\gamma<1$ are absolute constants from \Cref{cor:approx-algo-const-p}, then the runtime there simplifies to $O_k\left(\frac 1{\inparen{\error'}^2}\cdot\size(\circuit)^2\cdot \log{\frac{1}{\conf}}\right)$ for general circuits $\circuit$. If $\circuit$ is a tree, then the runtime simplifies to $O_k\left(\frac 1{\inparen{\error'}^2}\cdot\size(\circuit)\cdot \log{\frac{1}{\conf}}\right)$, which then answers \Cref{prob:intro-stmt} with yes for such circuits.

Finally, note that by \Cref{prop:circuit-depth} and \Cref{lem:circ-model-runtime} for any $\raPlus$ query $\query$, there exists a circuit $\circuit^*$ for $\apolyqdt$\textcolor{red}{CHANGE} such that $\depth(\circuit^*)\le O_{|Q|}(\log{n})$ and $\size(\circuit)\le O_k\inparen{\qruntime{\query, \tupset, \bound}}$. Using this along with \Cref{lem:val-ub}, \Cref{cor:approx-algo-const-p} and the fact that $n\le \qruntime{\query, \tupset, \bound}$, we have the following corollary:
\begin{Corollary}
\label{cor:approx-algo-punchline}
Let $\query$ be an $\raPlus$ query and $\pdb$ be a \emph{\abbrOneBIDB} with $p_0>0$ and $\gamma<1$, where $p_0,\gamma$ as in \Cref{cor:approx-algo-const-p}, are absolute constants. Let $\poly(\vct{X})=\apolyqdt$\textcolor{red}{CHANGE} for any result tuple $\tup$ with $\deg(\poly)=k$. Then one can compute an approximation satisfying \Cref{eq:approx-algo-bound-main} in time $O_{k,|Q|,\error',\conf}\inparen{\qruntime{\optquery{\query}, \tupset, \bound}}$ (given $\query,\tupset$ and $p_i$ for each $i\in [n]$ that defines $\pd$).
\end{Corollary}

Next, we note that the above result along with \Cref{lem:ctidb-gamma}
answers \Cref{prob:big-o-joint-steps} in the affirmative as follows:
\begin{Corollary}
\label{cor:approx-algo-punchline-ctidb}
Let $\query$ be an $\raPlus$ query and $\pdb$ be a \abbrCTIDB with $p_0>0$, where $p_0$ as in \Cref{cor:approx-algo-const-p}, is an absolute constant. Let $\poly(\vct{X})=\apolyqdt$ \textcolor{red}{CHANGE}for any result tuple $\tup$ with $\deg(\poly)=k$. Then one can compute an approximation satisfying \Cref{eq:approx-algo-bound-main} in time $O_{k,|Q|,\error',\conf,\bound}\inparen{\qruntime{\optquery{\query}, \tupset, \bound}}$ (given $\query,\tupset$ and $\prob_{\tup, j}$ for each $\tup\in\tupset,~j\in\pbox{\bound}$ that defines $\bpd$).
\end{Corollary}
\begin{proof}[Proof of~\Cref{cor:approx-algo-punchline-ctidb}]
By~\Cref{lem:ctidb-gamma} and~\Cref{cor:approx-algo-punchline}, the proof follows.
\end{proof}
\qed
\fi


%%% Local Variables:
%%% mode: latex
%%% TeX-master: "main"
%%% End: