%root: main.tex
%!TEX root=./main.tex
\section{$1 \pm \epsilon$ Approximation Algorithm}\label{sec:algo}
2020-12-19 16:44:18 -05:00
In~\Cref{sec:hard}, we showed that computing the expected multiplicity of a compressed representation of a bag polynomial for \ti (even just based on project-join queries) is unlikely to be possible in linear time (\Cref{thm:mult-p-hard-result}), even if all tuples have the same probability (\Cref{th:single-p-hard}).
Given this, we now design an approximation algorithm for our problem that runs in {\em linear time}.
The following approximation algorithm applies to \bi, though our bounds are more meaningful for a non-trivial subclass of \bis that contains both \tis, as well as the PDBench benchmark~\cite{pdbench}.
2020-12-14 11:47:18 -05:00
%it is then desirable to have an algorithm to approximate the multiplicity in linear time, which is what we describe next.
\subsection{Preliminaries and some more notation}
2020-12-19 16:53:17 -05:00
We now introduce useful definitions and notation related to polynomials. We use the following polynomial as an example:
2020-12-14 11:47:18 -05:00
\begin{equation}
\label{eq:poly-eg}
\poly(X, Y) = 2X^2 + 3XY - 2Y^2.
\end{equation}
\begin{Definition}[Variables in a monomial]\label{def:vars}
Given a monomial $v$, we use $\var(v)$ to denote the set of variables in $v$.
\end{Definition}
2020-12-20 17:13:52 -05:00
\noindent For example the monomial $XY$ has $\var(XY)=\inset{X,Y}$.
2020-12-14 11:47:18 -05:00
%\begin{Definition}[Expression Tree]\label{def:express-tree}
%An expression tree $\circuit$ is a binary %an ADT logically viewed as an n-ary
%tree, whose internal nodes are from the set $\{+, \times\}$, with leaf nodes being either from the set $\mathbb{R}$ $(\tnum)$ or from the set of monomials $(\var)$. The members of $\circuit$ are \type, \val, \vari{partial}, \vari{children}, and \vari{weight}, where \type is the type of value stored in the node $\circuit$ (i.e. one of $\{+, \times, \var, \tnum\}$, \val is the value stored, and \vari{children} is the list of $\circuit$'s children where $\circuit_\lchild$ is the left child and $\circuit_\rchild$ the right child. Remaining fields hold values whose semantics we will fix later. When $\circuit$ is used as input of ~\Cref{alg:mon-sam} and ~\Cref{alg:one-pass}, the values of \vari{partial} and \vari{weight} will not be set. %SEMANTICS FOR \circuit: \vari{partial} is the sum of $\circuit$'s coefficients , n, and \vari{weight} is the probability of $\circuit$ being sampled.
%\end{Definition}
2020-08-07 13:04:18 -04:00
%Note that $\circuit$ need not encode an expression in the standard monomial basis. For instance, $\circuit$ could represent a compressed form of the polynomial in~\Cref{eq:poly-eg}, such as $(x + 2y)(2x - y)$.
\revision{
\begin{Definition}[Pure Expansion]
The pure expansion of a polynomial $\poly$ is formed by computing all products of sums occurring in $\poly$, without combining like monomials. The pure expansion of $\poly$ generalizes~\Cref{def:smb} by allowing monomials $m_i = m_j$ for $i \neq j$.
\end{Definition}
}
\begin{Definition}[Expanded \revision{\circuit}]\label{def:expand-circuit}
%\revision{$\expansion{\circuit}$} is the reduced pure expansion of $\revision{\circuit}$.
The logical view of \revision{$\expansion{\circuit}$} is a list of tuples $(\monom, \coef)$, where $\monom$ is a set of variables and $\coef$ is in $\reals$.
\revision{$\expansion{\circuit}$} has the following recursive definition ($\circ$ is list concatenation).
2020-12-20 17:19:07 -05:00
{\small
\begin{multline*}
\expansion{\circuit} =
\begin{cases}
\revision{\expansion{\circuit_\linput} \circ \expansion{\circuit_\rinput}} &\textbf{ if }\revision{\circuit.\type = \circplus}\\
\left\{(\monom_\linput \cup \monom_\rinput, \coef_\linput \cdot \coef_\rinput) ~|~\right.&\\ \quad \left.(\monom_\linput, \coef_\linput) \in \revision{\expansion{\circuit_\linput}}, (\monom_\rinput, \coef_\rinput) \in \revision{\expansion{\circuit_\rinput}}\right\} &\textbf{ if }\revision{\circuit.\type = \circmult}\\
\elist{(\emptyset, \revision{\circuit.\val})} &\textbf{ if }\revision{\circuit}.\type = \tnum\\
\elist{(\{\revision{\circuit}.\val\}, 1)} &\textbf{ if }\revision{\circuit}.\type = \var.\\
\end{cases}
\end{multline*}
}
2020-12-19 01:15:50 -05:00
\end{Definition}
\revision{
Note that similar in spirit to ~\Cref{def:reduced-bi-poly}, $\expansion{\circuit}$ reduces all variable exponents $e > 1$ to $e = 1$, though ~\Cref{def:reduced-bi-poly} is more general.
}
2020-12-20 17:13:52 -05:00
In the following, we abuse notation and write $\monom$ to denote the monomial obtained as the product of the variables in the set.
2020-09-10 22:14:25 -04:00
\begin{Example}\label{example:expr-tree-T}
Consider the factorized representation $(X+ 2Y)(2X - Y)$ of the polynomial in~\Cref{eq:poly-eg}.
Its circuit $\etree$ is illustrated in Figure ~\ref{fig:expr-tree-T}.
The pure expansion of the product is $2X^2 - XY + 4XY - 2Y^2$ and the $\expansion{\circuit}$ is $[(X, 2), (XY, -1), (XY, 4), (Y, -2)]$.
2020-09-10 22:14:25 -04:00
\end{Example}
$\expansion{\circuit}$ encodes the \emph{reduced} form of $\polyf\inparen{\circuit}$, decoupling each monomial into a set of variables $\monom$ and a real coefficient $\coef$.
Note, however, that unlike $\rpoly$, $\expansion{\circuit}$ does not need to be in SOP form.
2020-09-10 22:14:25 -04:00
\revision{
\begin{Definition}[Positive \circuit]\label{def:positive-circuit}
For any circuit $\circuit$, the corresponding
{\em positive circuit}, denoted $\abs{\circuit}$, is obtained from $\circuit$ as follows. For each leaf node $\ell$ of $\circuit$ where $\ell.\type$ is $\tnum$, update $\ell.\vari{value}$ to $|\ell.\vari{value}|$.
\end{Definition}
2020-08-12 17:41:09 -04:00
Using the same factorization from ~\Cref{example:expr-tree-T}, $\polyf(\abs{\circuit}) = (X + 2Y)(2X + Y) = 2X^2 +XY +4XY + 2Y^2 = 2X^2 + 5XY + 2Y^2$. Note that this \textit{is not} the same as the polynomial from~\Cref{eq:poly-eg}.
2020-08-12 17:41:09 -04:00
\begin{Definition}[Evaluation]\label{def:exp-poly-eval}
Given a circuit $\circuit$ and a valuation $\vct{a} \in \mathbb{R}^\numvar$, we define the evaluation of $\circuit$ on $\vct{a}$ as $\circuit(\vct{a}) = \polyf(\circuit)(\vct{a})$.
\end{Definition}
\begin{Definition}[\size($\cdot$)]
The function \size~ takes a circuit $\circuit$ as input and outputs the number of gates (nodes) in \circuit.
\end{Definition}
\begin{Definition}[\depth($\cdot$)]
The function \depth~ has circuit $\circuit$ as input and outputs the number of levels in \circuit.
\end{Definition}
\begin{Definition}[Subcircuit]
A subcircuit of a circuit $\circuit$ is a circuit \subcircuit such that \subcircuit is a DAG \textit{subgraph} of the DAG representing \circuit. The sink of \subcircuit has exactly one gate \gate.
\end{Definition}
}
2020-12-14 11:47:18 -05:00
\subsection{Our main result}
2020-12-14 11:47:18 -05:00
In the subsequent subsections we will prove the following theorem.
2020-09-08 12:05:51 -04:00
2020-08-22 15:47:56 -04:00
\begin{Theorem}\label{lem:approx-alg}
Let \revision{\circuit be a circuit} for a UCQ over \bi and define $\poly(\vct{X})=\polyf(\circuit)$ and let $k=\degree(\poly)$.
2020-12-19 16:44:18 -05:00
%Let $\poly(\vct{X})$ be a query polynomial corresponding to the output of a UCQ in a \bi.
Then an estimate $\mathcal{E}$ %=\approxq(\circuit, P_1,\dots,p_\numvar), \conf, \error')$
2020-12-19 16:44:18 -05:00
of $\rpoly(\prob_1,\ldots, \prob_\numvar)$ can be computed in time
\[O\left(\revision{\size(\circuit)} + \frac{\log{\frac{1}{\conf}}\cdot \abs{\circuit}^2(1,\ldots, 1)\cdot k\cdot \log{k} \cdot \depth(\circuit)}{\inparen{\error'}^2\cdot\rpoly^2(\prob_1,\ldots, \prob_\numvar)}\right)\]
such that
\begin{equation}
\label{eq:approx-algo-bound}
\probOf\left(\left|\mathcal{E} - \rpoly(\prob_1,\dots,\prob_\numvar)\right|> \error' \cdot \rpoly(\prob_1,\dots,\prob_\numvar)\right) \leq \conf.
\end{equation}
2020-12-14 11:47:18 -05:00
%with multiplicative $(\error,\delta)$-bounds, where $k$ denotes the degree of $\poly$.
2020-08-22 15:47:56 -04:00
\end{Theorem}
\noindent The proof of~\Cref{lem:approx-alg} \revision{
(which relies on ~\Cref{lem:one-pass} and ~\Cref{lem:sample})
}
can be found in~\Cref{sec:proof-lem-approx-alg}. \revision{
The proofs for the referenced lemmas are also found in ~\Cref{sec:proof-one-pass} and ~\Cref{sec:proof-sample-monom}.
}
2020-12-17 16:40:48 -05:00
To get linear runtime results from~\Cref{lem:approx-alg}, we will need to define another parameter modeling the (weighted) number of monomials in $\expansion{\circuit}$ to be `canceled' when it is modded with $\mathcal{B}$ (\Cref{def:mod-set-polys}):
2020-12-14 11:47:18 -05:00
\begin{Definition}[Parameter $\gamma$]\label{def:param-gamma}
Given a circuit $\circuit$, define
\[\gamma(\circuit)=\frac{\sum_{(\monom, \coef)\in \expansion{\circuit}} \abs{\coef}\cdot \indicator{\monom\mod{\mathcal{B}}\equiv 0}}{\abs{\circuit}(1,\ldots, 1)}\]
2020-12-14 11:47:18 -05:00
\end{Definition}
2020-12-15 19:26:19 -05:00
%\AH{This....combined with \Cref{def:mod-set-polys} is \emph{really} nice notation!}
2020-12-19 23:20:31 -05:00
%\AR{Need to make sure use of indicator variable $\onesymbol$ above is consistent with the rest of the paper.}
%\OK{Done}
2020-12-14 11:47:18 -05:00
2020-12-20 17:13:52 -05:00
\noindent We next present a couple of corollaries of~\Cref{lem:approx-alg}.
\begin{Corollary}
\label{cor:approx-algo-const-p}
Let $\poly(\vct{X})$ be as in~\Cref{lem:approx-alg} and let $\gamma=\gamma(\circuit)$. Further let it be the case that $\prob_i\ge \prob_0$ for all $i\in[\numvar]$. Then an estimate $\mathcal{E}$ of $\rpoly(\prob_1,\ldots, \prob_\numvar)$ satisfying~\Cref{eq:approx-algo-bound} can be computed in time
\[O\left(\revision{\size(\circuit)} + \frac{\log{\frac{1}{\conf}}\cdot k\cdot \log{k} \cdot \depth(\circuit)}{\inparen{\error'}^2\cdot(1-\gamma)^2\cdot \prob_0^{2k}}\right)\]
2021-02-02 11:42:24 -05:00
In particular, if $\prob_0>0$ and $\gamma<1$ are absolute constants then the above runtime simplifies to $O_k\left(\frac 1{\inparen{\error'}^2}\cdot\size(\circuit)\cdot \log{\frac{1}{\conf}}\right)$.
\end{Corollary}
2020-12-17 16:40:48 -05:00
The proof for~\Cref{cor:approx-algo-const-p} can be seen in~\Cref{sec:proofs-approx-alg}.
2020-12-20 17:13:52 -05:00
The restriction on $\gamma$ is satisfied by any \ti (where $\gamma=0$) as well as for all three queries of the PDBench \bi benchmark (\Cref{app:subsec:experiment} shows experimentally that $\gamma$ is negligible in practice for these queries).
We also observe that (i) tuple presence is independent across blocks, so the corresponding probabilities (and hence $\prob_0$) are independent of the number of blocks, and (ii) \bis model uncertain attributes, so block size (and hence $\gamma$) is a function of the ``messiness'' of a dataset, rather than its size.
2020-12-20 17:19:07 -05:00
Thus, we expect the corollary to hold in general.
2020-12-19 23:20:31 -05:00
% \AH{I am thinking that perhaps the terminology and presentation of~\Cref{app:subsec:experiment} may need word-smithing to clearly illustrate the $\bi$ benchmarks satisfied--although the substance is already written there.}
% \AR{Yes! E.g. $\gamma$ is not used at all in~\Cref{app:subsec:experiment}}
% \AR{{\bf Boris/Oliver:} Is there a way to claim that all probabilities in practice are actually constants: i.e. they do not increase with the number of tuples?}
% \OK{@Atri: This seems like a reasonable claim. It's too late for me to come up with a reasonable motivation (maybe something will come to me in the morning), but the intuition for me is that each tuple/block is independent... it would be hard for that to be the case if the probability were a function of the number of tuples.}
\subsection{Approximating $\rpoly$}
The algorithm to prove~\Cref{lem:approx-alg} follows from the following observation. Given a query polynomial $\poly(\vct{X})=\polyf(\circuit)$ for \revision{circuit \circuit} over $\bi$, we can exactly represent $\rpoly(\vct{X})$ as follows:
\begin{equation}
\label{eq:tilde-Q-bi}
\rpoly\inparen{X_1,\dots,X_\numvar}=\hspace*{-1mm}\sum_{(\monom,\coef)\in \expansion{\circuit}} \hspace*{-2mm} \indicator{\monom\mod{\mathcal{B}}\not\equiv 0}\cdot \coef\cdot\hspace*{-2mm}\prod_{X_i\in \var\inparen{\monom}}\hspace*{-2mm} X_i
\end{equation}
Given the above, the algorithm is a sampling based algorithm for the above sum: we sample $(\monom,\coef)\in \expansion{\circuit}$ with probability proportional\footnote{We could have also uniformly sampled from $\expansion{\circuit}$ but this gives better parameters.}
2020-12-19 16:44:18 -05:00
%\AH{Regarding the footnote, is there really a difference? I \emph{suppose} technically, but in this case they are \emph{effectively} the same. Just wondering.}
2020-12-15 19:26:19 -05:00
%\AR{Yes, there is! If we used uniform distribution then in our bounds we will have a parameter that depends on the largest $\abs{coef}$, which e.g. could be dependent on $n$. But with the weighted probability distribution, we avoid paying this price. Though I guess perhaps we can say for the kinds of queries we consider thhese coefficients are all constants?}
to $\abs{\coef}$ and compute $Y=\indicator{\monom\mod{\mathcal{B}}\not\equiv 0}\cdot \prod_{X_i\in \var\inparen{\monom}} p_i$. Taking $\numsamp$ samples and computing the average of $Y$ gives us our final estimate.
The number of samples is computed by (see \Cref{app:subsec-th-mon-samp}):
2020-12-19 16:13:42 -05:00
\begin{equation*}
2\exp{\left(-\frac{\samplesize\error^2}{2}\right)}\leq \conf \implies\samplesize \geq \frac{2\log{\frac{2}{\conf}}}{\error^2}.
%\exp{\left(-\frac{\samplesize\error^2}{2}\right)}\leq \frac{\conf}{2}\\
%\frac{\samplesize\error^2}{2}\geq \log{\frac{2}{\conf}}\\
\end{equation*}
To summarize, \approxq modifies \circuit with a call to \onepass. It then samples from \circuit $\numsamp$ times and uses that information to approximate $\rpoly$.
%We state the approximation algorithm in terms of a $\bi$.
%\subsubsection{Description}
%Algorithm ~\ref{alg:mon-sam} approximates $\rpoly$ using the following steps. First, a call to $\onepass$ on its input $\circuit$ produces a non-biased weight distribution over the monomials of $\expansion{\circuit}$ and a correct count of $|\circuit|(1,\ldots, 1)$, i.e., the number of monomials in $\expansion{\circuit}$. Next, ~\Cref{alg:mon-sam} calls $\sampmon$ to sample one monomial and its sign from $\expansion{\circuit}$. The sampling is repeated $\ceil{\frac{2\log{\frac{2}{\delta}}}{\epsilon^2}}$ times, where each of the samples are evaluated with input $\vct{p}$, multiplied by $1 \times sign$, and summed. The final result is scaled accordingly returning an estimate of $\rpoly$ with the claimed $(\error, \conf)$-bound of ~\Cref{lem:mon-samp}.
2020-12-15 19:26:19 -05:00
%\AR{Seems like the notation below belongs to the notation section (if we decide to state this explicitly at all)?}
%\AH{Yes, I only included this per your request a few months ago. Based on @lordpretzel removing my definition of monomial, perhaps we can assume that the reader understands the notation below. I \emph{think} this should be a reasonable assumption.}
%Recall that the notation $[x, y]$ denotes the range of values between $x$ and $y$ inclusive. The notation $\{x, y\}$ denotes the set of values consisting of $x$ and $y$.
%\subsubsection{Psuedo Code}
2020-12-19 01:15:50 -05:00
%Original \ti Algorithm
%\begin{algorithm}[H]
% \caption{$\approxq$($\circuit$, $\vct{p}$, $\conf$, $\error$)}
% \label{alg:mon-sam}
% \begin{algorithmic}[1]
% \Require \circuit: Binary Expression Tree
% \Require $\vct{p} = (\prob_1,\ldots, \prob_\numvar)$ $\in [0, 1]^N$
% \Require $\conf$ $\in [0, 1]$
% \Require $\error$ $\in [0, 1]$
% \Ensure \vari{acc} $\in \mathbb{R}$
% \State $\accum \gets 0$\label{alg:mon-sam-global1}
% \State $\numsamp \gets \ceil{\frac{2 \log{\frac{2}{\conf}}}{\error^2}}$\label{alg:mon-sam-global2}
% \State $(\vari{\circuit}_\vari{mod}, \vari{size}) \gets $ \onepass($\circuit$)\label{alg:mon-sam-onepass}\Comment{$\onepass$ is ~\Cref{alg:one-pass} \;and \sampmon \; is ~\Cref{alg:sample}}\newline
% \For{\vari{i} \text{ in } $1\text{ to }\numsamp$}\Comment{Perform the required number of samples}
% \State $(\vari{M}_\vari{i}, \vari{sgn}_\vari{i}) \gets $ \sampmon($\circuit_\vari{mod}$)\label{alg:mon-sam-sample}
% \State $\vari{Y}_\vari{i} \gets 1$\label{alg:mon-sam-assign1}
% \For{$\vari{x}_{\vari{j}}$ \text{ in } $\vari{M}_{\vari{i}}$}
% \State $\vari{Y}_\vari{i} \gets \vari{Y}_\vari{i} \times \; \vari{\prob}_\vari{j}$\label{alg:mon-sam-product2} \Comment{$\vari{p}_\vari{j}$ is the assignment to $\vari{x}_\vari{j}$ from input $\vct{p}$}
% \EndFor
% \State $\vari{Y}_\vari{i} \gets \vari{Y}_\vari{i} \times\; \vari{sgn}_\vari{i}$\label{alg:mon-sam-product}
% \State $\accum \gets \accum + \vari{Y}_\vari{i}$\Comment{Store the sum over all samples}\label{alg:mon-sam-add}
% \EndFor
%
% \State $\vari{acc} \gets \vari{acc} \times \frac{\vari{size}}{\numsamp}$\label{alg:mon-sam-global3}
% \State \Return \vari{acc}
% \end{algorithmic}
%\end{algorithm}
2020-12-19 01:15:50 -05:00
%\bi Version of Approximation Algorithm
2020-12-13 15:51:55 -05:00
2020-12-19 12:59:27 -05:00
\begin{algorithm}[t]
\caption{$\approxq(\circuit, \vct{p}, \conf, \error)$}
\label{alg:mon-sam}
\begin{algorithmic}[1]
\Require \revision{\circuit: Circuit}
\Require $\vct{p} = (\prob_1,\ldots, \prob_\numvar)$ $\in [0, 1]^\numvar$
\Require $\conf$ $\in [0, 1]$
\Require $\error$ $\in [0, 1]$
%\Require $\abs{\block} \in \mathbb{N}$%\bivec$ $\in [0, 1]^{\abs{\block}}$
\Ensure \vari{acc} $\in \mathbb{R}$
%\State $\vari{sample}_\vari{next} \gets 0$
2020-09-01 14:39:50 -04:00
\State $\accum \gets 0$\label{alg:mon-sam-global1}
\State $\numsamp \gets \ceil{\frac{2 \log{\frac{2}{\conf}}}{\error^2}}$\label{alg:mon-sam-global2}
\State $(\circuit_\vari{mod}, \vari{size}) \gets $ \onepass($\circuit$)\label{alg:mon-sam-onepass}\Comment{$\onepass$ is ~\Cref{alg:one-pass-iter}}
%\newline
%\State $\vari{i} \gets 1$
\For{$\vari{i} \in 1 \text{ to }\numsamp$}\label{alg:sampling-loop}\Comment{Perform the required number of samples}
%\State $\bivec \gets [0]^{\abs{\block}}$\Comment{$\bivec$ is an array whose size is the number of blocks, used to check for cross-terms}\newline
\State $(\vari{M}, \vari{sgn}_\vari{i}) \gets $ \sampmon($\circuit_\vari{mod}$)\label{alg:mon-sam-sample}
\State\Comment{\sampmon \; is ~\Cref{alg:sample}}
%\For{$\vari{x}_\vari{\block,i}$ \text{ in } $\vari{M}$}
% \If{$\bivec[\block] = 1$}\label{alg:mon-sam-check}\Comment{If we have already had a variable from this block, $\rpoly$ drops the sample.}
% \newline
% \State $\vari{sample}_{\vari{next}} \gets 1$
% \State break
% \Else
% \State $\bivec[\block] = 1$
% \State $\vari{sum} = 0$
% \For{$\ell \in [\abs{\block}]$}
% \State $\vari{sum} = \vari{sum} + \bivec[\block][\ell]$
% \EndFor
% \If{$\vari{sum} \geq 2$}
% \State $\vari{sample}_{\vari{next}} \gets 1$
% \State continue\Comment{Not sure for psuedo code the best way to state this, but this is analogous to C language continue statement.}
% \EndIf
% \EndFor
% \If{$\vari{sample}_{\vari{next}} = 1$}\label{alg:mon-sam-drop}
% \State $\vari{sample}_{\vari{next}} \gets 0$\label{alg:mon-sam-resamp}
% \Else
\If{$\vari{M}$ has at most one variable from each block}\label{alg:check-duplicate-block}
\State $\vari{Y}_\vari{i} \gets \prod_{X_j\in\var\inparen{\vari{M}}}p_j$\label{alg:mon-sam-assign1}%\newline
%\For{$\vari{x}_{\vari{j}}$ \text{ in } $\vari{M}$}%_{\vari{i}}$}
% \State $\vari{Y}_\vari{i} \gets \vari{Y}_\vari{i} \times \; \vari{\prob}_\vari{j}$\label{alg:mon-sam-product2} \Comment{$\vari{p}_\vari{j}$ is the assignment to $\vari{x}_\vari{j}$ from input $\vct{p}$}
%\EndFor
2020-09-04 21:08:02 -04:00
\State $\vari{Y}_\vari{i} \gets \vari{Y}_\vari{i} \times\; \vari{sgn}_\vari{i}$\label{alg:mon-sam-product}
\State $\accum \gets \accum + \vari{Y}_\vari{i}$\Comment{Store the sum over all samples}\label{alg:mon-sam-add}
%\State $\vari{i} \gets \vari{i} + 1$
\EndIf
\EndFor
2020-12-13 15:51:55 -05:00
%\State $\gamma \gets $ $\algname{Estimate}$ $\gamma(\circuit, \numsamp, \abs{\block})$
\State $\vari{acc} \gets \vari{acc} \times \frac{\vari{size}}{\numsamp}$\label{alg:mon-sam-global3}
2020-08-25 11:18:08 -04:00
\State \Return \vari{acc}
\end{algorithmic}
\end{algorithm}
2020-09-04 18:32:40 -04:00
2020-12-14 11:47:18 -05:00
%\begin{algorithm}[H]
% \caption{$\algname{Estimate}$ $\gamma(\circuit, \numsamp, \abs{\block})$}
2020-12-14 11:47:18 -05:00
% \label{alg:est-gamma}
% \begin{algorithmic}[1]
% \Require \circuit: Binary Expression Tree
2020-12-14 11:47:18 -05:00
% \Require $\numsamp \in \mathbb{N}$
% \Require $\abs{\block} \in \mathbb{N}$
% \Ensure \vari{cTerms} $]in \mathbb{R}$
%
% \State $\vari{cTerms} \gets 0$
% \State $\vari{isCross} \gets 0$
% \For{$\vari{i} \text{ in } 1 \text{ to } \numsamp$}
% \State $\bivec \gets [0]^{\abs{\block}}$
% \State $(\vari{M}, \vari{sgn}) \gets $ \sampmon($\circuit_\vari{mod}$)
2020-12-14 11:47:18 -05:00
% \For{$\vari{x}_{\vari{b}, \vari{j}} \text{ in } \vari{M}$}
% \If{$\bivec[b] = 1$}
% \State $\vari{isCross} \gets 1$
% \State Break
% \Else
% \State $\bivec[b] \gets 1$
% \EndIf
% \EndFor
% \If{$\vari{isCross} = 1$}
% \State $\vari{cTerms} \gets \vari{cTerms} + 1$
% \State $\vari{isCross} \gets 0$
% \EndIf
% \EndFor
% \State \Return $\frac{\vari{cTerms}}{\numsamp}$
% \end{algorithmic}
%\end{algorithm}
\subsubsection{Correctness}
2020-12-19 01:15:50 -05:00
In order to prove~\Cref{lem:approx-alg}, we will need to argue the correctness of~\Cref{alg:mon-sam}. Before we formally do that,
2020-12-19 23:36:11 -05:00
we first state the lemmas that summarize the relevant properties of $\onepass$ and $\sampmon$, the auxiliary algorithms on which ~\Cref{alg:mon-sam} relies. %Their proofs are given in~\Cref{sec:onepass} and~\Cref{sec:samplemonomial} respectively.
2020-09-01 14:39:50 -04:00
\begin{Lemma}\label{lem:one-pass}
The $\onepass$ function completes in $O(\size(\circuit))$ time. $\onepass$ guarantees two post-conditions: First, for each subcircuit $\vari{S}$ of $\circuit$, we have that $\vari{S}.\vari{partial}$ is set to $\abs{\vari{S}}(1,\ldots, 1)$. Second, when $\vari{S}.\type = \circplus$, \subcircuit.\lwght $= \frac{\abs{\subcircuit_\linput}(1,\ldots, 1)}{\abs{\subcircuit}(1,\ldots, 1)}$ and likewise for \subcircuit.\rwght.% is correctly computed for each child of $\vari{S}.$
\end{Lemma}
To prove correctness of~\Cref{alg:mon-sam}, we only use the following fact that follows from the above lemma: for the modified circuit $\circuit_{\vari{mod}}$ returned by $\onepass$, we have $\circuit_{\vari{mod}}.\vari{partial}=\abs{\circuit}(1,\dots,1)$.
%\AH{I'm wondering if there is a better notation to use here. I myself got confused by my own notation of $\circuit_{\vari{mod}}$. \emph{But}, we need to to be referencing the modified $\circuit$ returned by $\onepass$ in the algorithm, so maybe this is the best we can do?}
2020-12-15 19:26:19 -05:00
%\AR{yeah, I think this is fine.}
%At the conclusion of $\onepass$, $\circuit.\vari{partial}$ will hold the sum of all coefficients in $\expansion{\abs{\circuit}}$, i.e., $\sum\limits_{(\monom, \coef) \in \expansion{\abs{\circuit}}}\coef$. $\circuit.\vari{weight}$ will hold the weighted probability that $\circuit$ is sampled from from its parent $+$ node.
2020-09-01 14:39:50 -04:00
\begin{Lemma}\label{lem:sample}
The function $\sampmon$ completes in $O(\log{k} \cdot k \cdot \depth(\circuit))$ time, where $k = \degree(\polyf(\abs{\circuit}))$. Upon completion, every $\left(\monom, sign(\coef)\right)\in \expansion{\abs{\circuit}}$ is returned with probability $\frac{|\coef|}{\abs{\circuit}(1,\ldots, 1)}$. %, $\sampmon$ returns the sampled term $\left(\monom, sign(\coef)\right)$ from $\expansion{\abs{\circuit}}$.
2020-09-01 14:39:50 -04:00
\end{Lemma}
2020-12-17 16:40:48 -05:00
Armed with the above two lemmas, we are ready to argue the following result (proof in~\Cref{sec:proofs-approx-alg}):
\begin{Theorem}\label{lem:mon-samp}
2020-12-19 16:44:18 -05:00
%If the contracts for $\onepass$ and $\sampmon$ hold, then
For any $\circuit$ with $\degree(\polyf(\abs{\circuit})) = k$, \Cref{alg:mon-sam} outputs an estimate $\vari{acc}$ of $\rpoly(\prob_1,\ldots, \prob_\numvar)$ such that %$\expct\pbox{\empmean} = \frac{\rpoly(\prob_1,\ldots, \prob_\numvar)\cdot(1 - \gamma)}{\abs{\circuit}(1,\ldots, 1)}$. %within an additive $\error \cdot \abs{\circuit}(1,\ldots, 1)$ error with
2020-12-19 23:36:11 -05:00
%$\empmean$ has bounds
\[\probOf\left(\left|\vari{acc} - \rpoly(\prob_1,\ldots, \prob_\numvar)\right|> \error \cdot \abs{\circuit}(1,\ldots, 1)\right) \leq \conf,\]
in $O\left(\size(\circuit)\right.$ $+$ $\left.\left(\frac{\log{\frac{1}{\conf}}}{\error^2} \cdot k \cdot\log{k} \cdot \depth(\circuit)\right)\right)$ time.
\end{Theorem}
2020-08-13 20:54:06 -04:00
2020-12-15 01:09:00 -05:00
\subsection{\onepass\ Algorithm}
\label{sec:onepass}
2020-12-15 01:09:00 -05:00
%\subsubsection{Description}
%Algorithm ~\ref{alg:one-pass} satisfies the requirements of lemma ~\ref{lem:one-pass}.
The evaluation of $\abs{\circuit}(1,\ldots, 1)$ can be defined recursively, as follows (where $\circuit_\linput$ and $\circuit_\rinput$ are the `left' and `right' inputs of $\circuit$ if they exist):
2020-12-19 12:59:27 -05:00
{\small
2020-12-17 00:02:07 -05:00
\begin{align}
\label{eq:T-all-ones}
\abs{\circuit}(1,\ldots, 1) = \begin{cases}
\abs{\circuit_\linput}(1,\ldots, 1) \cdot \abs{\circuit_\rinput}(1,\ldots, 1) &\textbf{if }\circuit.\type = \revision{\circmult}\\
\abs{\circuit_\linput}(1,\ldots, 1) + \abs{\circuit_\rinput}(1,\ldots, 1) &\textbf{if }\circuit.\type = \revision{\circplus} \\
|\circuit.\val| &\textbf{if }\circuit.\type = \tnum\\
1 &\textbf{if }\circuit.\type = \var.
\end{cases}
\end{align}
2020-12-19 12:59:27 -05:00
}
%\begin{align*}
%&\eval{\circuit ~|~ \circuit.\type = +}_{\abs{\circuit}} =&& \eval{\circuit_\lchild}_{\abs{\circuit}} + \eval{\circuit_\rchild}_{\abs{\circuit}}\\
%&\eval{\circuit ~|~ \circuit.\type = \times}_{\abs{\circuit}} = && \eval{\circuit_\lchild}_{\abs{\circuit}} \cdot \eval{\circuit_\rchild}_{\abs{\circuit}}\\
%&\eval{\circuit ~|~ \circuit.\type = \tnum}_{\abs{\circuit}} = && \circuit.\val\\
%&\eval{\circuit ~|~ \circuit.\val = \var}_{\abs{\circuit}} = && 1
%\end{align*}
%In the same fashion the weighted distribution can be described as above with the following modification for the case when $\circuit.\type = +$:
It turns out that for the proof of~\Cref{lem:sample}, we need to argue that when $\circuit.\type = \circplus$, we indeed have
2020-12-17 00:02:07 -05:00
\begin{align}
\label{eq:T-weights}
%&\abs{\circuit_\lchild}(1,\ldots, 1) + \abs{\circuit_\rchild}(1,\ldots, 1); &\textbf{if }\circuit.\type = + \\
\circuit.\lwght &= \frac{\abs{\circuit_\linput}(1,\ldots, 1)}{\abs{\circuit_\linput}(1,\ldots, 1) + \abs{\circuit_\rinput}(1,\ldots, 1)};\\
\circuit.\rwght &= \frac{\abs{\circuit_\rinput}(1,\ldots, 1)}{\abs{\circuit_\linput}(1,\ldots, 1)+ \abs{\circuit_\rinput}(1,\ldots, 1)}.
\end{align}
%\begin{align*}
%&\eval{\circuit~|~\circuit.\type = +}_{\wght} =&&\eval{\circuit_\lchild}_{\abs{\circuit}} + \eval{\circuit_\rchild}_{\abs{\circuit}}; \circuit_\lchild.\wght = \frac{\eval{\circuit_\lchild}_{\abs{\circuit}}}{\eval{\circuit_\lchild}_{\abs{\circuit}} + \eval{\circuit_\rchild}_{\abs{\circuit}}}; \circuit_\rchild.\wght = \frac{\eval{\circuit_\rchild}_{\abs{\circuit}}}{\eval{\circuit_\lchild}_{\abs{\circuit}} + \eval{\circuit_\rchild}_{\abs{\circuit}}}
%\end{align*}
\noindent \onepass\ (\Cref{alg:one-pass-iter} in \Cref{sec:proofs-approx-alg}) visits each gate exactly once, in the topological order of \circuit, annotating the \lwght, \rwght, and \prt variables of each gate according to the definitions above. \Cref{lem:one-pass} is also proved in~\Cref{sec:proofs-approx-alg}.
2020-08-17 17:12:25 -04:00
2020-12-15 01:09:00 -05:00
%\subsubsection{Psuedo Code}
%See algorithm ~\ref{alg:one-pass} for details.
2020-12-15 01:09:00 -05:00
\subsection{\sampmon\ Algorithm}
\label{sec:samplemonomial}
2020-08-17 13:52:18 -04:00
%Algorithm ~\ref{alg:sample} takes $\circuit$ as input, samples an arbitrary $(\monom, \coef)$ from $\expansion{\circuit}$ with probabilities $\stree_\lchild.\wght$ and $\stree_\rchild.\wght$ for each subtree $\stree$ with $\stree.\type = +$, outputting the tuple $(\monom, \sign(\coef))$. While one cannot compute $\expansion{\circuit}$ in time better than $O(N^k)$, the algorithm, similar to \textsc{OnePass}, uses a technique on $\circuit$ which produces a sample from $\expansion{\circuit}$ without ever materializing $\expansion{\circuit}$.
2020-08-17 13:52:18 -04:00
A naive (slow) implementation of \sampmon\ would first compute $\expansion{\circuit}$ and then sample from it.
2020-12-19 12:59:27 -05:00
% However, this would be too time consuming.
2020-12-15 01:09:00 -05:00
%
Instead, \Cref{alg:sample} selects a monomial from $\expansion{\circuit}$ by top-down traversal.
For a parent $+$ gate, the input to be visited is sampled from the weighted distribution precomputed by \onepass.
When a parent $\times$ node is visited, both inputs are visited.
2020-12-19 12:59:27 -05:00
The algorithm computes two properties: the set of all variable leaf nodes visited, and the product of signs of visited coefficient leaf nodes.
2020-08-17 13:52:18 -04:00
We will assume the TreeSet data structure to maintain sets with logarithmic time insertion and linear time traversal of its elements.
2021-01-28 11:50:33 -05:00
2020-12-19 12:59:27 -05:00
$\sampmon$ is given in \Cref{alg:sample}, and a proof of its correctness (via \Cref{lem:sample}) is provided in \Cref{sec:proofs-approx-alg}.
2020-09-07 17:03:22 -04:00
2020-12-19 12:59:27 -05:00
\begin{algorithm}[t]
\caption{\sampmon(\circuit)}
\label{alg:sample}
\begin{algorithmic}[1]
2021-01-28 11:50:33 -05:00
\revision{\Require \circuit: Circuit}
2020-08-25 11:18:08 -04:00
\Ensure \vari{vars}: TreeSet
\Ensure \vari{sgn} $\in \{-1, 1\}$
\Comment{\Cref{alg:one-pass-iter} should have been run before this one} % algorithm ~\ref{alg:sample}}
\State $\vari{vars} \gets \emptyset$ \label{alg:sample-global1}
\If{$\circuit.\type = +$}\Comment{Sample at every $+$ node}
\State $\circuit_{\vari{samp}} \gets$ Sample from left input ($\circuit_{\linput}$) and right input ($\circuit_{\rinput}$) w.p. $\circuit.\vari{Lweight}$ and $\circuit.\vari{Rweight}$. \label{alg:sample-plus-bsamp} \Comment{Each call to \sampmon uses fresh randomness}
\State $(\vari{v}, \vari{s}) \gets \sampmon(\circuit_{\vari{samp}})$\label{alg:sample-plus-traversal}
\State $\Return ~(\vari{v}, \vari{s})$
\ElsIf{$\circuit.\type = \times$}\Comment{Multiply the sampled values of all inputs}
\State $\vari{sgn} \gets 1$\label{alg:sample-global2}
\For {$input$ in $\circuit.\vari{input}$}\label{alg:sample-times-for-loop}
\State $(\vari{v}, \vari{s}) \gets \sampmon(input)$
\State $\vari{vars} \gets \vari{vars} \cup \vari{v}$\label{alg:sample-times-union}
\State $\vari{sgn} \gets \vari{sgn} \times \vari{s}$\label{alg:sample-times-product}
\EndFor
2020-08-25 11:18:08 -04:00
\State $\Return ~(\vari{vars}, \vari{sgn})$
\ElsIf{$\circuit.\type = \tnum$}\Comment{The leaf is a coefficient}
%\State $\vari{sgn} \gets \vari{sgn} \times sign(\circuit.\val)$
\State $\Return ~\left(\{\}, sign(\circuit.\val)\right)$\label{alg:sample-num-return}
\ElsIf{$\circuit.\type = \var$}
%\State $\vari{vars} \gets \vari{vars} \; \cup \; \{\;\circuit.\val\;\}\label{alg:sample-var-union}$\Comment{Add the variable to the set}
\State $\Return~\left(\{\circuit.\val\}, 1\right) $\label{alg:sample-var-return}
\EndIf
\end{algorithmic}
\end{algorithm}
2020-12-19 01:15:50 -05:00
% \subsection{Experimental results}
% \label{sec:experiments}
% We conducted an experiment running modified TPCH queries over uncertain data generated by pdbench~\cite{pdbench}, both of which (data and queries) represent what is typically encountered in practice. Queries were run two times, once filtering $\bi$ cancellations, and then second not filtering the cancellations. The purpose of this was to determine an indication for how many $\bi$ cancellations occur in practice. Details and results can be found in~.
2020-12-14 11:47:18 -05:00
2020-12-19 01:15:50 -05:00
%\AR{Experimental stuff about \bi should go in here}
2020-10-01 14:38:40 -04:00
%%%%%%%%%%%%%%%%%%%%%%%
2020-12-19 16:44:18 -05:00
%%% Local Variables:
%%% mode: latex
%%% TeX-master: "main"
%%% End: