paper-BagRelationalPDBsAreHard/approx_alg.tex

440 lines
30 KiB
TeX

%root: main.tex
%!TEX root=./main.tex
\section{$1 \pm \epsilon$ Approximation Algorithm}\label{sec:algo}
In~\Cref{sec:hard}, we showed that computing the expected multiplicity of a compressed representation of a bag polynomial for \ti (even just based on project-join queries) is unlikely to be possible in linear time (\Cref{thm:mult-p-hard-result}), even if all tuples have the same probability (\Cref{th:single-p-hard}).
Given this, we now design an approximation algorithm for our problem that runs in {\em linear time}.
The folowing approximation algorithm applies to \bi, though our bounds are more meaningful for a non-trivial subclass of \bis that contains both \tis, as well as the PDBench benchmark~\cite{pdbench}.
%it is then desirable to have an algorithm to approximate the multiplicity in linear time, which is what we describe next.
\subsection{Preliminaries and some more notation}
We now introduce useful definitions and notation related to polynomials. We use the following polynomial as an example:
\begin{equation}
\label{eq:poly-eg}
\poly(X, Y) = 2X^2 + 3XY - 2Y^2.
\end{equation}
\begin{Definition}[Variables in a monomial]\label{def:vars}
Given a monomial $v$, we use $\var(v)$ to denote the set of variables in $v$.
\end{Definition}
\noindent For example the monomial $XY$ has $\var(XY)=\inset{X,Y}$.
%\begin{Definition}[Expression Tree]\label{def:express-tree}
%An expression tree $\circuit$ is a binary %an ADT logically viewed as an n-ary
%tree, whose internal nodes are from the set $\{+, \times\}$, with leaf nodes being either from the set $\mathbb{R}$ $(\tnum)$ or from the set of monomials $(\var)$. The members of $\circuit$ are \type, \val, \vari{partial}, \vari{children}, and \vari{weight}, where \type is the type of value stored in the node $\circuit$ (i.e. one of $\{+, \times, \var, \tnum\}$, \val is the value stored, and \vari{children} is the list of $\circuit$'s children where $\circuit_\lchild$ is the left child and $\circuit_\rchild$ the right child. Remaining fields hold values whose semantics we will fix later. When $\circuit$ is used as input of ~\Cref{alg:mon-sam} and ~\Cref{alg:one-pass}, the values of \vari{partial} and \vari{weight} will not be set. %SEMANTICS FOR \circuit: \vari{partial} is the sum of $\circuit$'s coefficients , n, and \vari{weight} is the probability of $\circuit$ being sampled.
%\end{Definition}
%Note that $\circuit$ need not encode an expression in the standard monomial basis. For instance, $\circuit$ could represent a compressed form of the polynomial in~\Cref{eq:poly-eg}, such as $(x + 2y)(2x - y)$.
\revision{
\begin{Definition}[Pure Expansion]
The pure expansion of a polynomial $\poly$ is formed by computing all product of sums occurring in $\poly$, without combining like monomials. The pure expansion of $\poly$ generalizes ~\Cref{def:smb} by allowing monomials $m_i = m_j$ for $i \neq j$.
\end{Definition}
}
\begin{Definition}[Expanded \revision{\circuit}]\label{def:expand-circuit}
%\revision{$\expansion{\circuit}$} is the reduced pure expansion of $\revision{\circuit}$.
The logical view of \revision{$\expansion{\circuit}$} is a list of tuples $(\monom, \coef)$, where $\monom$ is a set of variables and $\coef$ is in $\reals$.
\revision{$\expansion{\circuit}$} has the following recursive definition ($\circ$ is list concatenation).
{\small
\begin{multline*}
\expansion{\circuit} =
\begin{cases}
\revision{\expansion{\circuit_\linput} \circ \expansion{\circuit_\rinput}} &\textbf{ if }\revision{\circuit.\type = \circplus}\\
\left\{(\monom_\linput \cup \monom_\rinput, \coef_\linput \cdot \coef_\rinput) ~|~\right.&\\ \quad \left.(\monom_\linput, \coef_\linput) \in \revision{\expansion{\circuit_\linput}}, (\monom_\rinput, \coef_\rinput) \in \revision{\expansion{\circuit_\rinput}}\right\} &\textbf{ if }\revision{\circuit.\type = \circmult}\\
\elist{(\emptyset, \revision{\circuit.\val})} &\textbf{ if }\revision{\circuit}.\type = \tnum\\
\elist{(\{\revision{\circuit}.\val\}, 1)} &\textbf{ if }\revision{\circuit}.\type = \var.\\
\end{cases}
\end{multline*}
}
\end{Definition}
\revision{
Note that similar in spirit to ~\Cref{def:reduced-bi-poly}, $\expansion{\circuit}$ reduces all variable exponents $e > 1$ to $e = 1$, though ~\Cref{def:reduced-bi-poly} is more general.
}
In the following, we abuse notation and write $\monom$ to denote the monomial obtained as the products of the variables in the set.
\begin{Example}\label{example:expr-tree-T}
Consider the factorized representation $(X+ 2Y)(2X - Y)$ of the polynomial in~\Cref{eq:poly-eg}.
Its circuit $\etree$ is illustrated in Figure ~\ref{fig:expr-tree-T}.
The pure expansion of the product is $2X^2 - XY + 4XY - 2Y^2$ and the $\expansion{\circuit}$ is $[(X, 2), (XY, -1), (XY, 4), (Y, -2)]$.
\end{Example}
$\expansion{\circuit}$ encodes the \emph{reduced} form of $\polyf\inparen{\circuit}$, decoupling each monomial into a set of variables $\monom$ and a real coefficient $\coef$.
Note, however, that unlike $\rpoly$, $\expansion{\circuit}$ does not need to be in SOP form.
\revision{
\begin{Definition}[Positive \circuit]\label{def:positive-circuit}
For any circuit $\circuit$, the corresponding
{\em positive circuit}, denoted $\abs{\circuit}$, is obtained from $\circuit$ as follows. For each leaf node $\ell$ of $\circuit$ where $\ell.\type$ is $\tnum$, update $\ell.\vari{value}$ to $|\ell.\vari{value}|$.
\end{Definition}
Using the same factorization from ~\Cref{example:expr-tree-T}, $\polyf(\abs{\circuit}) = (X + 2Y)(2X + Y) = 2X^2 +XY +4XY + 2Y^2 = 2X^2 + 5XY + 2Y^2$. Note that this \textit{is not} the same as the polynomial from~\Cref{eq:poly-eg}.
\begin{Definition}[Evaluation]\label{def:exp-poly-eval}
Given an expression tree $\circuit$ and a valuation $\vct{a} \in \mathbb{R}^\numvar$, we define the evaluation of $\circuit$ on $\vct{a}$ as $\circuit(\vct{a}) = \polyf(\circuit)(\vct{a})$.
\end{Definition}
\begin{Definition}[\size($\cdot$)]
The function \size~ takes a circuit $\circuit$ as input and outputs the number of gates (nodes) in \circuit.
\end{Definition}
\begin{Definition}[\depth($\cdot$)]
The function \depth~ has circuit $\circuit$ as input and outputs the number of levels in \circuit.
\end{Definition}
\begin{Definition}[Subcircuit]
A subcircuit of a circuit $\circuit$ is a circuit \subcircuit such that \subcircuit is a DAG \textit{subgraph} of the DAG representing \circuit. The sink of \subcircuit has exactly one gate \gate.
\end{Definition}
}
\subsection{Our main result}
In the subsequent subsections we will prove the following theorem.
\begin{Theorem}\label{lem:approx-alg}
Let \revision{\circuit be a circuit} for a UCQ over \bi and define $\poly(\vct{X})=\polyf(\circuit)$ and let $k=\degree(\poly)$.
%Let $\poly(\vct{X})$ be a query polynomial corresponding to the output of a UCQ in a \bi.
Then an estimate $\mathcal{E}$ %=\approxq(\circuit, P_1,\dots,p_\numvar), \conf, \error')$
of $\rpoly(\prob_1,\ldots, \prob_\numvar)$ can be computed in time
\[O\left(\revision{\size(\circuit)} + \frac{\log{\frac{1}{\conf}}\cdot \abs{\circuit}^2(1,\ldots, 1)\cdot k\cdot \log{k} \cdot \depth(\circuit))}{\inparen{\error'}^2\cdot\rpoly^2(\prob_1,\ldots, \prob_\numvar)}\right)\]
such that
\begin{equation}
\label{eq:approx-algo-bound}
\probOf\left(\left|\mathcal{E} - \rpoly(\prob_1,\dots,\prob_\numvar)\right|> \error' \cdot \rpoly(\prob_1,\dots,\prob_\numvar)\right) \leq \conf.
\end{equation}
%with multiplicative $(\error,\delta)$-bounds, where $k$ denotes the degree of $\poly$.
\end{Theorem}
\noindent The proof of~\Cref{lem:approx-alg} \revision{
(which relies on ~\Cref{lem:one-pass} and ~\Cref{lem:sample})
}
can be found in~\Cref{sec:proof-lem-approx-alg}. \revision{
The proofs for the referenced lemmas are also found in ~\Cref{sec:proof-one-pass} and ~\Cref{sec:proof-sample-monom}.
}
To get linear runtime results from~\Cref{lem:approx-alg}, we will need to define another parameter modeling the (weighted) number of monomials in $\expansion{\circuit}$ to be `canceled' when it is modded with $\mathcal{B}$ (\Cref{def:mod-set-polys}):
\begin{Definition}[Parameter $\gamma$]\label{def:param-gamma}
Given an expression tree $\circuit$, define
\[\gamma(\circuit)=\frac{\sum_{(\monom, \coef)\in \expansion{\circuit}} \abs{\coef}\cdot \indicator{\monom\mod{\mathcal{B}}\equiv 0}}{\abs{\circuit}(1,\ldots, 1)}\]
\end{Definition}
%\AH{This....combined with \Cref{def:mod-set-polys} is \emph{really} nice notation!}
%\AR{Need to make sure use of indicator variable $\onesymbol$ above is consistent with the rest of the paper.}
%\OK{Done}
\noindent We next present couple of corollaries of~\Cref{lem:approx-alg}.
\begin{Corollary}
\label{cor:approx-algo-const-p}
Let $\poly(\vct{X})$ be as in~\Cref{lem:approx-alg} and let $\gamma=\gamma(\circuit)$. Further let it be the case that $\prob_i\ge \prob_0$ for all $i\in[\numvar]$. Then an estimate $\mathcal{E}$ of $\rpoly(\prob_1,\ldots, \prob_\numvar)$ satisfying~\Cref{eq:approx-algo-bound} can be computed in time
\[O\left(\revision{\size(\circuit)} + \frac{\log{\frac{1}{\conf}}\cdot k\cdot \log{k} \cdot \depth(\circuit))}{\inparen{\error'}^2\cdot(1-\gamma)^2\cdot \prob_0^{2k}}\right)\]
In particular, if $\prob_0>0$ and $\gamma<1$ are absolute constants then the above runtime simplifies to $O_k\left(\frac 1{\inparen{\error'}^2}\cdot\size(\circuit)\cdot \log{\frac{1}{\conf}}\right)$.
\end{Corollary}
The proof for~\Cref{cor:approx-algo-const-p} can be seen in~\Cref{sec:proofs-approx-alg}.
The restriction on $\gamma$ is satisfied by any \ti (where $\gamma=0$) as well as for all three queries of the PDBench \bi benchmark (\Cref{app:subsec:experiment} shows experimentally that $\gamma$ is negligible in practice for these queries).
We also observe that (i) tuple presence is independent across blocks, so the corresponding probabilities (and hence $\prob_0$) are independent of the number of blocks, and (ii) \bis model uncertain attributes, so block size (and hence $\gamma$) is a function of the ``messiness'' of a dataset, rather than its size.
Thus, we expect the corollary to hold in general.
% \AH{I am thinking that perhaps the terminology and presentation of~\Cref{app:subsec:experiment} may need word-smithing to clearly illustrate the $\bi$ benchmarks satisfied--although the substance is already written there.}
% \AR{Yes! E.g. $\gamma$ is not used at all in~\Cref{app:subsec:experiment}}
% \AR{{\bf Boris/Oliver:} Is there a way to claim that all probabilities in practice are actually constants: i.e. they do not increase with the number of tuples?}
% \OK{@Atri: This seems like a reasonable claim. It's too late for me to come up with a reasonable motivation (maybe something will come to me in the morning), but the intuition for me is that each tuple/block is independent... it would be hard for that to be the case if the probability were a function of the number of tuples.}
\subsection{Approximating $\rpoly$}
The algorithm to prove~\Cref{lem:approx-alg} follows from the following observation. Given a query polynomial $\poly(\vct{X})=\polyf(\circuit)$ for \revision{circuit \circuit} over $\bi$, we can exactly represent $\rpoly(\vct{X})$ as follows:
\begin{equation}
\label{eq:tilde-Q-bi}
\rpoly\inparen{X_1,\dots,X_\numvar}=\hspace*{-1mm}\sum_{(\monom,\coef)\in \expansion{\circuit}} \hspace*{-2mm} \indicator{\monom\mod{\mathcal{B}}\not\equiv 0}\cdot \coef\cdot\hspace*{-2mm}\prod_{X_i\in \var\inparen{\monom}}\hspace*{-2mm} X_i
\end{equation}
Given the above, the algorithm is a sampling based algorithm for the above sum: we sample $(\monom,\coef)\in \expansion{\circuit}$ with probability proportional\footnote{We could have also uniformly sampled from $\expansion{\circuit}$ but this gives better parameters.}
%\AH{Regarding the footnote, is there really a difference? I \emph{suppose} technically, but in this case they are \emph{effectively} the same. Just wondering.}
%\AR{Yes, there is! If we used uniform distribution then in our bounds we will have a parameter that depends on the largest $\abs{coef}$, which e.g. could be dependent on $n$. But with the weighted probability distribution, we avoid paying this price. Though I guess perhaps we can say for the kinds of queries we consider thhese coefficients are all constants?}
to $\abs{\coef}$ and compute $Y=\indicator{\monom\mod{\mathcal{B}}\not\equiv 0}\cdot \prod_{X_i\in \var\inparen{\monom}} p_i$. Taking $\numsamp$ samples and computing the average of $Y$ gives us our final estimate.
The number of samples is computed by (see \Cref{app:subsec-th-mon-samp}):
\begin{equation*}
2\exp{\left(-\frac{\samplesize\error^2}{2}\right)}\leq \conf \implies\samplesize \geq \frac{2\log{\frac{2}{\conf}}}{\error^2}.
%\exp{\left(-\frac{\samplesize\error^2}{2}\right)}\leq \frac{\conf}{2}\\
%\frac{\samplesize\error^2}{2}\geq \log{\frac{2}{\conf}}\\
\end{equation*}
To summarize, \approxq modifies \circuit with a call to \onepass. It then samples from \circuit $\numsamp$ times and uses that information to approximate $\rpoly$.
%We state the approximation algorithm in terms of a $\bi$.
%\subsubsection{Description}
%Algorithm ~\ref{alg:mon-sam} approximates $\rpoly$ using the following steps. First, a call to $\onepass$ on its input $\circuit$ produces a non-biased weight distribution over the monomials of $\expansion{\circuit}$ and a correct count of $|\circuit|(1,\ldots, 1)$, i.e., the number of monomials in $\expansion{\circuit}$. Next, ~\Cref{alg:mon-sam} calls $\sampmon$ to sample one monomial and its sign from $\expansion{\circuit}$. The sampling is repeated $\ceil{\frac{2\log{\frac{2}{\delta}}}{\epsilon^2}}$ times, where each of the samples are evaluated with input $\vct{p}$, multiplied by $1 \times sign$, and summed. The final result is scaled accordingly returning an estimate of $\rpoly$ with the claimed $(\error, \conf)$-bound of ~\Cref{lem:mon-samp}.
%\AR{Seems like the notation below belongs to the notation section (if we decide to state this explicitly at all)?}
%\AH{Yes, I only included this per your request a few months ago. Based on @lordpretzel removing my definition of monomial, perhaps we can assume that the reader understands the notation below. I \emph{think} this should be a reasonable assumption.}
%Recall that the notation $[x, y]$ denotes the range of values between $x$ and $y$ inclusive. The notation $\{x, y\}$ denotes the set of values consisting of $x$ and $y$.
%\subsubsection{Psuedo Code}
%Original \ti Algorithm
%\begin{algorithm}[H]
% \caption{$\approxq$($\circuit$, $\vct{p}$, $\conf$, $\error$)}
% \label{alg:mon-sam}
% \begin{algorithmic}[1]
% \Require \circuit: Binary Expression Tree
% \Require $\vct{p} = (\prob_1,\ldots, \prob_\numvar)$ $\in [0, 1]^N$
% \Require $\conf$ $\in [0, 1]$
% \Require $\error$ $\in [0, 1]$
% \Ensure \vari{acc} $\in \mathbb{R}$
% \State $\accum \gets 0$\label{alg:mon-sam-global1}
% \State $\numsamp \gets \ceil{\frac{2 \log{\frac{2}{\conf}}}{\error^2}}$\label{alg:mon-sam-global2}
% \State $(\vari{\circuit}_\vari{mod}, \vari{size}) \gets $ \onepass($\circuit$)\label{alg:mon-sam-onepass}\Comment{$\onepass$ is ~\Cref{alg:one-pass} \;and \sampmon \; is ~\Cref{alg:sample}}\newline
% \For{\vari{i} \text{ in } $1\text{ to }\numsamp$}\Comment{Perform the required number of samples}
% \State $(\vari{M}_\vari{i}, \vari{sgn}_\vari{i}) \gets $ \sampmon($\circuit_\vari{mod}$)\label{alg:mon-sam-sample}
% \State $\vari{Y}_\vari{i} \gets 1$\label{alg:mon-sam-assign1}
% \For{$\vari{x}_{\vari{j}}$ \text{ in } $\vari{M}_{\vari{i}}$}
% \State $\vari{Y}_\vari{i} \gets \vari{Y}_\vari{i} \times \; \vari{\prob}_\vari{j}$\label{alg:mon-sam-product2} \Comment{$\vari{p}_\vari{j}$ is the assignment to $\vari{x}_\vari{j}$ from input $\vct{p}$}
% \EndFor
% \State $\vari{Y}_\vari{i} \gets \vari{Y}_\vari{i} \times\; \vari{sgn}_\vari{i}$\label{alg:mon-sam-product}
% \State $\accum \gets \accum + \vari{Y}_\vari{i}$\Comment{Store the sum over all samples}\label{alg:mon-sam-add}
% \EndFor
%
% \State $\vari{acc} \gets \vari{acc} \times \frac{\vari{size}}{\numsamp}$\label{alg:mon-sam-global3}
% \State \Return \vari{acc}
% \end{algorithmic}
%\end{algorithm}
%\bi Version of Approximation Algorithm
\begin{algorithm}[t]
\caption{$\approxq(\circuit, \vct{p}, \conf, \error)$}
\label{alg:mon-sam}
\begin{algorithmic}[1]
\Require \revision{\circuit: Circuit}
\Require $\vct{p} = (\prob_1,\ldots, \prob_\numvar)$ $\in [0, 1]^N$
\Require $\conf$ $\in [0, 1]$
\Require $\error$ $\in [0, 1]$
%\Require $\abs{\block} \in \mathbb{N}$%\bivec$ $\in [0, 1]^{\abs{\block}}$
\Ensure \vari{acc} $\in \mathbb{R}$
%\State $\vari{sample}_\vari{next} \gets 0$
\State $\accum \gets 0$\label{alg:mon-sam-global1}
\State $\numsamp \gets \ceil{\frac{2 \log{\frac{2}{\conf}}}{\error^2}}$\label{alg:mon-sam-global2}
\State $(\circuit_\vari{mod}, \vari{size}) \gets $ \onepass($\circuit$)\label{alg:mon-sam-onepass}\Comment{$\onepass$ is ~\Cref{alg:one-pass-iter}}
%\newline
%\State $\vari{i} \gets 1$
\For{$\vari{i} \in 1 \text{ to }\numsamp$}\label{alg:sampling-loop}\Comment{Perform the required number of samples}
%\State $\bivec \gets [0]^{\abs{\block}}$\Comment{$\bivec$ is an array whose size is the number of blocks, used to check for cross-terms}\newline
\State $(\vari{M}, \vari{sgn}_\vari{i}) \gets $ \sampmon($\circuit_\vari{mod}$)\label{alg:mon-sam-sample}
\State\Comment{\sampmon \; is ~\Cref{alg:sample}}
%\For{$\vari{x}_\vari{\block,i}$ \text{ in } $\vari{M}$}
% \If{$\bivec[\block] = 1$}\label{alg:mon-sam-check}\Comment{If we have already had a variable from this block, $\rpoly$ drops the sample.}
% \newline
% \State $\vari{sample}_{\vari{next}} \gets 1$
% \State break
% \Else
% \State $\bivec[\block] = 1$
% \State $\vari{sum} = 0$
% \For{$\ell \in [\abs{\block}]$}
% \State $\vari{sum} = \vari{sum} + \bivec[\block][\ell]$
% \EndFor
% \If{$\vari{sum} \geq 2$}
% \State $\vari{sample}_{\vari{next}} \gets 1$
% \State continue\Comment{Not sure for psuedo code the best way to state this, but this is analogous to C language continue statement.}
% \EndIf
% \EndFor
% \If{$\vari{sample}_{\vari{next}} = 1$}\label{alg:mon-sam-drop}
% \State $\vari{sample}_{\vari{next}} \gets 0$\label{alg:mon-sam-resamp}
% \Else
\If{$\vari{M}$ has at most one variable from each block}\label{alg:check-duplicate-block}
\State $\vari{Y}_\vari{i} \gets \prod_{X_j\in\var\inparen{\vari{M}}}p_j$\label{alg:mon-sam-assign1}%\newline
%\For{$\vari{x}_{\vari{j}}$ \text{ in } $\vari{M}$}%_{\vari{i}}$}
% \State $\vari{Y}_\vari{i} \gets \vari{Y}_\vari{i} \times \; \vari{\prob}_\vari{j}$\label{alg:mon-sam-product2} \Comment{$\vari{p}_\vari{j}$ is the assignment to $\vari{x}_\vari{j}$ from input $\vct{p}$}
%\EndFor
\State $\vari{Y}_\vari{i} \gets \vari{Y}_\vari{i} \times\; \vari{sgn}_\vari{i}$\label{alg:mon-sam-product}
\State $\accum \gets \accum + \vari{Y}_\vari{i}$\Comment{Store the sum over all samples}\label{alg:mon-sam-add}
%\State $\vari{i} \gets \vari{i} + 1$
\EndIf
\EndFor
%\State $\gamma \gets $ $\algname{Estimate}$ $\gamma(\circuit, \numsamp, \abs{\block})$
\State $\vari{acc} \gets \vari{acc} \times \frac{\vari{size}}{\numsamp}$\label{alg:mon-sam-global3}
\State \Return \vari{acc}
\end{algorithmic}
\end{algorithm}
%\begin{algorithm}[H]
% \caption{$\algname{Estimate}$ $\gamma(\circuit, \numsamp, \abs{\block})$}
% \label{alg:est-gamma}
% \begin{algorithmic}[1]
% \Require \circuit: Binary Expression Tree
% \Require $\numsamp \in \mathbb{N}$
% \Require $\abs{\block} \in \mathbb{N}$
% \Ensure \vari{cTerms} $]in \mathbb{R}$
%
% \State $\vari{cTerms} \gets 0$
% \State $\vari{isCross} \gets 0$
% \For{$\vari{i} \text{ in } 1 \text{ to } \numsamp$}
% \State $\bivec \gets [0]^{\abs{\block}}$
% \State $(\vari{M}, \vari{sgn}) \gets $ \sampmon($\circuit_\vari{mod}$)
% \For{$\vari{x}_{\vari{b}, \vari{j}} \text{ in } \vari{M}$}
% \If{$\bivec[b] = 1$}
% \State $\vari{isCross} \gets 1$
% \State Break
% \Else
% \State $\bivec[b] \gets 1$
% \EndIf
% \EndFor
% \If{$\vari{isCross} = 1$}
% \State $\vari{cTerms} \gets \vari{cTerms} + 1$
% \State $\vari{isCross} \gets 0$
% \EndIf
% \EndFor
% \State \Return $\frac{\vari{cTerms}}{\numsamp}$
% \end{algorithmic}
%\end{algorithm}
\subsubsection{Correctness}
In order to prove~\Cref{lem:approx-alg}, we will need to argue the correctness of~\Cref{alg:mon-sam}. Before we formally do that,
we first state the lemmas that summarize the relevant properties of $\onepass$ and $\sampmon$, the auxiliary algorithms on which ~\Cref{alg:mon-sam} relies. %Their proofs are given in~\Cref{sec:onepass} and~\Cref{sec:samplemonomial} respectively.
\begin{Lemma}\label{lem:one-pass}
The $\onepass$ function completes in $O(size(\circuit))$ time. $\onepass$ guarantees two post-conditions: First, for each subcircuit $\vari{S}$ of $\circuit$, we have that $\vari{S}.\vari{partial}$ is set to $\abs{\vari{S}}(1,\ldots, 1)$. Second, when $\vari{S}.\type = \circplus$, \subcircuit.\lwght $= \frac{\abs{\subcircuit_\linput}(1,\ldots, 1)}{\abs{\subcircuit}(1,\ldots, 1)}$ and likewise for \subcircuit.\rwght.% is correctly computed for each child of $\vari{S}.$
\end{Lemma}
To prove correctness of~\Cref{alg:mon-sam}, we only use the following fact that follows from the above lemma: for the modified circuit ($\circuit_{\vari{mod}}$), $\circuit_{\vari{mod}}$, $\circuit_{\vari{mod}}.\vari{partial}=\abs{\circuit}(1,\dots,1)$.
%\AH{I'm wondering if there is a better notation to use here. I myself got confused by my own notation of $\circuit_{\vari{mod}}$. \emph{But}, we need to to be referencing the modified $\circuit$ returned by $\onepass$ in the algorithm, so maybe this is the best we can do?}
%\AR{yeah, I think this is fine.}
%At the conclusion of $\onepass$, $\circuit.\vari{partial}$ will hold the sum of all coefficients in $\expansion{\abs{\circuit}}$, i.e., $\sum\limits_{(\monom, \coef) \in \expansion{\abs{\circuit}}}\coef$. $\circuit.\vari{weight}$ will hold the weighted probability that $\circuit$ is sampled from from its parent $+$ node.
\begin{Lemma}\label{lem:sample}
The function $\sampmon$ completes in $O(\log{k} \cdot k \cdot \depth(\circuit))$ time, where $k = \degree(poly(\abs{\circuit})$. Upon completion, every $\left(\monom, sign(\coef)\right)\in \expansion{\abs{\circuit}}$ is returned with probability $\frac{|\coef|}{\abs{\circuit}(1,\ldots, 1)}$. %, $\sampmon$ returns the sampled term $\left(\monom, sign(\coef)\right)$ from $\expansion{\abs{\circuit}}$.
\end{Lemma}
Armed with the above two lemmas, we are ready to argue the following result (proof in~\Cref{sec:proofs-approx-alg}):
\begin{Theorem}\label{lem:mon-samp}
%If the contracts for $\onepass$ and $\sampmon$ hold, then
For any $\circuit$ with $\degree(poly(|\circuit|)) = k$, algorithm \ref{alg:mon-sam} outputs an estimate $\vari{acc}$ of $\rpoly(\prob_1,\ldots, \prob_\numvar)$ such that %$\expct\pbox{\empmean} = \frac{\rpoly(\prob_1,\ldots, \prob_\numvar)\cdot(1 - \gamma)}{\abs{\circuit}(1,\ldots, 1)}$. %within an additive $\error \cdot \abs{\circuit}(1,\ldots, 1)$ error with
%$\empmean$ has bounds
\[\probOf\left(\left|\vari{acc} - \rpoly(\prob_1,\ldots, \prob_\numvar)\right|> \error \cdot \abs{\circuit}(1,\ldots, 1)\right) \leq \conf,\]
in $O\left(\size(\circuit)\right.$ $+$ $\left.\left(\frac{\log{\frac{1}{\conf}}}{\error^2} \cdot k \cdot\log{k} \cdot \depth(\circuit)\right)\right)$ time.
\end{Theorem}
\subsection{\onepass\ Algorithm}
\label{sec:onepass}
%\subsubsection{Description}
%Algorithm ~\ref{alg:one-pass} satisfies the requirements of lemma ~\ref{lem:one-pass}.
The evaluation of $\abs{\circuit}(1,\ldots, 1)$ can be defined recursively, as follows (where $\circuit_\linput$ and $\circuit_\rinput$ are the `left' and `right' inputs of $\circuit$ if they exist):
{\small
\begin{align}
\label{eq:T-all-ones}
\abs{\circuit}(1,\ldots, 1) = \begin{cases}
\abs{\circuit_\linput}(1,\ldots, 1) \cdot \abs{\circuit_\rinput}(1,\ldots, 1) &\textbf{if }\circuit.\type = \revision{\circmult}\\
\abs{\circuit_\linput}(1,\ldots, 1) + \abs{\circuit_\rinput}(1,\ldots, 1) &\textbf{if }\circuit.\type = \revision{\circplus} \\
|\circuit.\val| &\textbf{if }\circuit.\type = \tnum\\
1 &\textbf{if }\circuit.\type = \var.
\end{cases}
\end{align}
}
%\begin{align*}
%&\eval{\circuit ~|~ \circuit.\type = +}_{\abs{\circuit}} =&& \eval{\circuit_\lchild}_{\abs{\circuit}} + \eval{\circuit_\rchild}_{\abs{\circuit}}\\
%&\eval{\circuit ~|~ \circuit.\type = \times}_{\abs{\circuit}} = && \eval{\circuit_\lchild}_{\abs{\circuit}} \cdot \eval{\circuit_\rchild}_{\abs{\circuit}}\\
%&\eval{\circuit ~|~ \circuit.\type = \tnum}_{\abs{\circuit}} = && \circuit.\val\\
%&\eval{\circuit ~|~ \circuit.\val = \var}_{\abs{\circuit}} = && 1
%\end{align*}
%In the same fashion the weighted distribution can be described as above with the following modification for the case when $\circuit.\type = +$:
It turns out that for proof of~\Cref{lem:sample}, we need to argue that when $\circuit.\type = +$, we indeed have
\begin{align}
\label{eq:T-weights}
%&\abs{\circuit_\lchild}(1,\ldots, 1) + \abs{\circuit_\rchild}(1,\ldots, 1); &\textbf{if }\circuit.\type = + \\
\circuit.\lwght &\gets \frac{\abs{\circuit_\linput}(1,\ldots, 1)}{\abs{\circuit_\linput}(1,\ldots, 1) + \abs{\circuit_\rinput}(1,\ldots, 1)};\\
\circuit.\rwght &\gets \frac{\abs{\circuit_\rinput}(1,\ldots, 1)}{\abs{\circuit_\linput}(1,\ldots, 1)+ \abs{\circuit_\rinput}(1,\ldots, 1)}
\end{align}
%\begin{align*}
%&\eval{\circuit~|~\circuit.\type = +}_{\wght} =&&\eval{\circuit_\lchild}_{\abs{\circuit}} + \eval{\circuit_\rchild}_{\abs{\circuit}}; \circuit_\lchild.\wght = \frac{\eval{\circuit_\lchild}_{\abs{\circuit}}}{\eval{\circuit_\lchild}_{\abs{\circuit}} + \eval{\circuit_\rchild}_{\abs{\circuit}}}; \circuit_\rchild.\wght = \frac{\eval{\circuit_\rchild}_{\abs{\circuit}}}{\eval{\circuit_\lchild}_{\abs{\circuit}} + \eval{\circuit_\rchild}_{\abs{\circuit}}}
%\end{align*}
\noindent \onepass\ (Algorithm ~\ref{alg:one-pass-iter} in \Cref{sec:proofs-approx-alg}) iteratively visits each gate one time according to the topological ordering of \circuit annotating the \lwght, \rwght, and \prt variables of each node according to the definitions above. Lemma~\ref{lem:one-pass} is also proved in~\Cref{sec:proofs-approx-alg}.
%\subsubsection{Psuedo Code}
%See algorithm ~\ref{alg:one-pass} for details.
\subsection{\sampmon\ Algorithm}
\label{sec:samplemonomial}
%Algorithm ~\ref{alg:sample} takes $\circuit$ as input, samples an arbitrary $(\monom, \coef)$ from $\expansion{\circuit}$ with probabilities $\stree_\lchild.\wght$ and $\stree_\rchild.\wght$ for each subtree $\stree$ with $\stree.\type = +$, outputting the tuple $(\monom, \sign(\coef))$. While one cannot compute $\expansion{\circuit}$ in time better than $O(N^k)$, the algorithm, similar to \textsc{OnePass}, uses a technique on $\circuit$ which produces a sample from $\expansion{\circuit}$ without ever materializing $\expansion{\circuit}$.
A naive (slow) implementation of \sampmon\ would first compute $\expansion{\circuit}$ and then sample from it.
% However, this would be too time consuming.
%
Instead, \Cref{alg:sample} selects a monomial from $\expansion{\circuit}$ by top-down traversal.
For a parent $+$ gate, the input to be visited is sampled from the weighted distribution precomputed by \onepass.
When a parent $\times$ node is visited, both inputs are visited.
The algorithm computes two properties: the set of all variable leaf nodes visited, and the product of signs of visited coefficient leaf nodes.
We will assume the TreeSet data structure to maintain sets with logarithmic time insertion and linear time traversal of its elements.
$\sampmon$ is given in \Cref{alg:sample}, and a proof of its correctness (via \Cref{lem:sample}) is provided in \Cref{sec:proofs-approx-alg}.
\begin{algorithm}[t]
\caption{\sampmon(\circuit)}
\label{alg:sample}
\begin{algorithmic}[1]
\revision{\Require \circuit: Circuit}
\Ensure \vari{vars}: TreeSet
\Ensure \vari{sgn} $\in \{-1, 1\}$
\Comment{\Cref{alg:one-pass-iter} should have been run before this one} % algorithm ~\ref{alg:sample}}
\State $\vari{vars} \gets \emptyset$ \label{alg:sample-global1}
\If{$\circuit.\type = +$}\Comment{Sample at every $+$ node}
\State $\circuit_{\vari{samp}} \gets$ Sample from left input ($\circuit_{\linput}$) and right input ($\circuit_{\rinput}$) w.p. $\circuit.\vari{Lweight}$ and $\circuit.\vari{Rweight}$. \label{alg:sample-plus-bsamp} \Comment{Each call to \sampmon uses fresh randomness}
\State $(\vari{v}, \vari{s}) \gets \sampmon(\circuit_{\vari{samp}})$\label{alg:sample-plus-traversal}
\State $\Return ~(\vari{v}, \vari{s})$
\ElsIf{$\circuit.\type = \times$}\Comment{Multiply the sampled values of all inputs}
\State $\vari{sgn} \gets 1$\label{alg:sample-global2}
\For {$input$ in $\circuit.\vari{input}$}\label{alg:sample-times-for-loop}
\State $(\vari{v}, \vari{s}) \gets \sampmon(input)$
\State $\vari{vars} \gets \vari{vars} \cup \{\vari{v}\}$\label{alg:sample-times-union}
\State $\vari{sgn} \gets \vari{sgn} \times \vari{s}$\label{alg:sample-times-product}
\EndFor
\State $\Return ~(\vari{vars}, \vari{sgn})$
\ElsIf{$\circuit.\type = numeric$}\Comment{The leaf is a coefficient}
%\State $\vari{sgn} \gets \vari{sgn} \times sign(\circuit.\val)$
\State $\Return ~\left(\{\}, sign(\circuit.\val)\right)$\label{alg:sample-num-return}
\ElsIf{$\circuit.\type = var$}
%\State $\vari{vars} \gets \vari{vars} \; \cup \; \{\;\circuit.\val\;\}\label{alg:sample-var-union}$\Comment{Add the variable to the set}
\State $\Return~\left(\{\circuit.\val\}, 1\right) $\label{alg:sample-var-return}
\EndIf
\end{algorithmic}
\end{algorithm}
% \subsection{Experimental results}
% \label{sec:experiments}
% We conducted an experiment running modified TPCH queries over uncertain data generated by pdbench~\cite{pdbench}, both of which (data and queries) represent what is typically encountered in practice. Queries were run two times, once filtering $\bi$ cancellations, and then second not filtering the cancellations. The purpose of this was to determine an indication for how many $\bi$ cancellations occur in practice. Details and results can be found in~.
%\AR{Experimental stuff about \bi should go in here}
%%%%%%%%%%%%%%%%%%%%%%%
%%% Local Variables:
%%% mode: latex
%%% TeX-master: "main"
%%% End: