Merge branch 'master' of gitlab.odin.cse.buffalo.edu:ahuber/SketchingWorlds

This commit is contained in:
Oliver Kennedy 2020-12-14 23:21:32 -05:00
commit fd23ea69f9
Signed by: okennedy
GPG key ID: 3E5F9B3ABD3FDB60
3 changed files with 120 additions and 69 deletions

View file

@ -2,7 +2,7 @@
\section{$1 \pm \epsilon$ Approximation Algorithm}
\label{sec:algo}
In~\cref{sec:hard}, we showed that computing the expected multiplicity of a compressed representation of a bag polynomial for TIDB (even just based on project-join queries) is unlikely to be possible in linear time (\cref{thm:mult-p-hard-result}), even if all tuples have the same probability of being present (\cref{cor:single-p-hard}). Given this, in this section we will design an approrixmation algorithm for our that runs in {\em linear time}. Unlike the results in~\cref{sec:hard} our approximation algorithm works for BIDB though our bounds are more meaningful for a non-trivial sublcass of BIDB that includes TIDB as well as PDB benchmarks (\cref{sec:experiments}).
In~\cref{sec:hard}, we showed that computing the expected multiplicity of a compressed representation of a bag polynomial for TIDB (even just based on project-join queries) is unlikely to be possible in linear time (\cref{thm:mult-p-hard-result}), even if all tuples have the same probability of being present (\cref{cor:single-p-hard}). Given this, in this section we design an approximation algorithm for our problem that runs in {\em linear time}. Unlike the results in~\cref{sec:hard}, our approximation algorithm works for BIDB, though our bounds are more meaningful for a non-trivial subclass of BIDB that includes TIDB as well as PDB benchmarks (\cref{sec:experiments}).
%it is then desirable to have an algorithm to approximate the multiplicity in linear time, which is what we describe next.
\subsection{Preliminaries and some more notation}
@ -138,7 +138,7 @@ Consider the factorized representation $(x + 2y)(2x - y)$ of the polynomial in~\
\begin{Definition}[Positive T]\label{def:positive-tree}
For any expression tree $\etree$, the correspondign
For any expression tree $\etree$, the corresponding
{\em positive tree}, denoted $\abs{\etree}$, is obtained from $\etree$ as follows. For each leaf node $\ell$ of $\etree$ where $\ell.\type$ is $\tnum$, update $\ell.\vari{value}$ to $|\ell.\vari{value}|$. %value $\coef$ of each coefficient leaf node in $\etree$ is set to %$\coef_i$ in $\etree$ is exchanged with its absolute value$|\coef|$.
\end{Definition}
@ -154,19 +154,20 @@ Given an expression tree $\etree$ and $\vct{v} \in \mathbb{R}^\numvar$, $\etree(
In the subsequent subsections we will prove the following theorem.
\begin{Theorem}\label{lem:approx-alg}
Let $\etree$ be an expression tree for a UCQ over BIDB and define $\poly(\vct{X})=\polyf(\etree)$ and let $k=\deg(\poly)$
Let $\etree$ be an expression tree for a UCQ over BIDB, define $\poly(\vct{X})=\polyf(\etree)$, and let $k=\degree(\poly)$.
%Let $\poly(\vct{X})$ be a query polynomial corresponding to the output of a UCQ in a BIDB.
An estimate $\mathcal{E}$ of $\rpoly(\prob_1,\ldots, \prob_\numvar)$ can be computed in time
\[O\left(\treesize(\etree) + \frac{\log{\frac{1}{\conf}}\cdot \abs{\etree}^2(1,\ldots, 1)\cdot k\cdot \log{k} \cdot depth(\etree))}{\error^2\cdot\rpoly^2(\prob_1,\ldots, \prob_\numvar)}\right),\]
An estimate $\mathcal{E}$ %=\approxq(\etree, (p_1,\dots,p_\numvar), \conf, \error')$
of $\rpoly(\prob_1,\ldots, \prob_\numvar)$ can be computed in time
\[O\left(\treesize(\etree) + \frac{\log{\frac{1}{\conf}}\cdot \abs{\etree}^2(1,\ldots, 1)\cdot k\cdot \log{k} \cdot depth(\etree)}{\inparen{\error'}^2\cdot\rpoly^2(\prob_1,\ldots, \prob_\numvar)}\right),\]
such that
\begin{equation}
\label{eq:approx-algo-bound}
P\left(\left|\mathcal{E} - \rpoly(\prob_1,\dots,\prob_\numvar)\right|> \error \cdot \rpoly(\prob_1,\dots,\prob_\numvar)\right) \leq \conf.
P\left(\left|\mathcal{E} - \rpoly(\prob_1,\dots,\prob_\numvar)\right|> \error' \cdot \rpoly(\prob_1,\dots,\prob_\numvar)\right) \leq \conf.
\end{equation}
%with multiplicative $(\error,\delta)$-bounds, where $k$ denotes the degree of $\poly$.
\end{Theorem}
It turns out that to get linear runtime resuls from~\cref{lem:approx-alg}, we will need to define another parameter (which roughly counts the (weighted) number of monomials in $\expandtree{\etree}$ that get `canceled' when modded with $\mathcal{B}$):
It turns out that to get linear runtime results from~\cref{lem:approx-alg}, we will need to define another parameter (which roughly counts the (weighted) number of monomials in $\expandtree{\etree}$ that get `canceled' when modded with $\mathcal{B}$):
\begin{Definition}[Parameter $\gamma$]\label{def:param-gamma}
Given an expression tree $\etree$, define
\[\gamma(\etree)=\frac{\sum_{(\monom, \coef)\in \expandtree{\etree}} \abs{\coef}\cdot \onesymbol\inparen{\monom\mod{\mathcal{B}}\equiv 0}}{\abs{\etree}(1,\ldots, 1)}\]
@ -177,10 +178,10 @@ We next present couple of corollaries of~\Cref{lem:approx-alg}.
\begin{Corollary}
\label{cor:approx-algo-const-p}
Let $\poly(\vct{X})$ be as in~\Cref{lem:approx-alg} and let $\gamma=\gamma(\etree)$. Further let it be the case that $p_i\ge p_0$ for all $i\in[\numvar]$. Then an estimate $\mathcal{E}$ of $\rpoly(\prob_1,\ldots, \prob_\numvar)$ satisfying~\cref{eq:approx-algo-bound} can be computed in time
\[O\left(\treesize(\etree) + \frac{\log{\frac{1}{\conf}}\cdot k\cdot \log{k} \cdot depth(\etree))}{\error^2\cdot(1-\gamma)^2\cdot p_0^{2k}}\right)\]
\[O\left(\treesize(\etree) + \frac{\log{\frac{1}{\conf}}\cdot k\cdot \log{k} \cdot depth(\etree)}{\inparen{\error'}^2\cdot(1-\gamma)^2\cdot p_0^{2k}}\right)\]
In particular, if $p_0>0$ and $\gamma<1$ are absolute constants then the above runtime simplifies to $O_k\left(\frac 1\eps\cdot\treesize(\etree)\cdot \log{\frac{1}{\conf}}\right)$.
\end{Corollary}
We note that the restiction on $\gamma$ is satisfied by TIDB (where $\gamma=0$) and for some BIDB benchmarks (see~\Cref{sec:experiments} for more on this claim).
We note that the restriction on $\gamma$ is satisfied by TIDB (where $\gamma=0$) and for some BIDB benchmarks (see~\Cref{sec:experiments} for more on this claim).
\AR{{\bf Boris/Oliver:} Is there a way to claim that all probabilities in practice are actually constants: i.e. they do not increase with the number of tuples?}
\begin{proof}[Proof of~\Cref{cor:approx-algo-const-p}]
@ -193,12 +194,21 @@ Applying this bound in the runtime bound in~\Cref{lem:approx-alg} gives the firs
\end{proof}
\subsection{Approximating $\rpoly$}
We state the approximation algorithm in terms of a $\bi$.
\subsubsection{Description}
Algorithm ~\ref{alg:mon-sam} approximates $\rpoly$ using the following steps. First, a call to $\onepass$ on its input $\etree$ produces a non-biased weight distribution over the monomials of $\expandtree{\etree}$ and a correct count of $|\etree|(1,\ldots, 1)$, i.e., the number of monomials in $\expandtree{\etree}$. Next, ~\cref{alg:mon-sam} calls $\sampmon$ to sample one monomial and its sign from $\expandtree{\etree}$. The sampling is repeated $\ceil{\frac{2\log{\frac{2}{\delta}}}{\epsilon^2}}$ times, where each of the samples are evaluated with input $\vct{p}$, multiplied by $1 \times sign$, and summed. The final result is scaled accordingly returning an estimate of $\rpoly$ with the claimed $(\error, \conf)$-bound of ~\cref{lem:mon-samp}.
The algorithm to prove~\Cref{lem:approx-alg} follows from the following observation. Given a query polynomial $\poly(\vct{X})=\polyf(\etree)$ for expression tree $\etree$ over $\bi$, we note that we can exactly represent $\rpoly(\vct{X})$ as follows:
\begin{equation}
\label{eq:tilde-Q-bi}
\rpoly\inparen{X_1,\dots,X_\numvar}=\sum_{(v,c)\in \expandtree{\etree}} \onesymbol\inparen{v\mod{\mathcal{B}}\not\equiv 0}\cdot c\cdot\prod_{X_i\in \var\inparen{v}} X_i.
\end{equation}
Given the above, our algorithm is a sampling-based estimator for the above sum: we sample $(v,c)\in \expandtree{\etree}$ with probability proportional\footnote{We could have also uniformly sampled from $\expandtree{\etree}$, but sampling proportional to $\abs{c}$ gives better parameters.} to $\abs{c}$ and compute $Y=\onesymbol\inparen{v\mod{\mathcal{B}}\not\equiv 0}\cdot \prod_{X_i\in \var\inparen{v}} p_i$. Taking enough samples and computing the average of $Y$ gives us our final estimate. Algorithm~\ref{alg:mon-sam} has the details.
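As a toy illustration of this estimator, consider the following sketch (in Python). It materializes the expansion of a small polynomial explicitly, which is exactly what~\Cref{alg:mon-sam} avoids doing; the polynomial, block structure, probabilities, and names (\texttt{expansion}, \texttt{block}, \texttt{p}) are made up purely for illustration.
\begin{verbatim}
import math, random

# Toy BIDB polynomial Q = (x1 + x2)(x1 + y), with x1, x2 in block 1 and y in block 2.
# expansion lists (variable set, coefficient) for every monomial of E(T).
expansion = [({"x1"}, 1), ({"x1", "y"}, 1), ({"x1", "x2"}, 1), ({"x2", "y"}, 1)]
block = {"x1": 1, "x2": 1, "y": 2}
p = {"x1": 0.3, "x2": 0.4, "y": 0.5}

weights = [abs(c) for _, c in expansion]
total = sum(weights)                               # plays the role of |etree|(1,...,1)
eps, delta = 0.05, 0.05
n = math.ceil(2 * math.log(4 / delta) / eps ** 2)  # sample count (natural log here)

acc = 0.0
for _ in range(n):
    (v, c), = random.choices(expansion, weights=weights, k=1)
    if len({block[x] for x in v}) == len(v):       # drop monomials with two variables in one block
        y = math.copysign(1, c)                    # sign of the sampled coefficient
        for x in v:
            y *= p[x]
        acc += y
print(acc * total / n)   # estimate of rpoly(p) = 0.3 + 0.3*0.5 + 0.4*0.5 = 0.65
\end{verbatim}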
%We state the approximation algorithm in terms of a $\bi$.
%\subsubsection{Description}
%Algorithm ~\ref{alg:mon-sam} approximates $\rpoly$ using the following steps. First, a call to $\onepass$ on its input $\etree$ produces a non-biased weight distribution over the monomials of $\expandtree{\etree}$ and a correct count of $|\etree|(1,\ldots, 1)$, i.e., the number of monomials in $\expandtree{\etree}$. Next, ~\cref{alg:mon-sam} calls $\sampmon$ to sample one monomial and its sign from $\expandtree{\etree}$. The sampling is repeated $\ceil{\frac{2\log{\frac{2}{\delta}}}{\epsilon^2}}$ times, where each of the samples are evaluated with input $\vct{p}$, multiplied by $1 \times sign$, and summed. The final result is scaled accordingly returning an estimate of $\rpoly$ with the claimed $(\error, \conf)$-bound of ~\cref{lem:mon-samp}.
\AR{Seems like the notation below belongs to the notation section (if we decide to state this explicitly at all)?}
Recall that the notation $[x, y]$ denotes the range of values between $x$ and $y$ inclusive. The notation $\{x, y\}$ denotes the set of values consisting of $x$ and $y$.
\subsubsection{Psuedo Code}
%\subsubsection{Psuedo Code}
%Original TIDB Algorithm
%\begin{algorithm}[H]
@ -232,32 +242,32 @@ Recall that the notation $[x, y]$ denotes the range of values between $x$ and $y
\begin{algorithm}[H]
\caption{$\approxq_{\biabb}$($\etree$, $\vct{p}$, $\conf$, $\error$, $\abs{\block}$)}
\caption{$\approxq(\etree, \vct{p}, \conf, \error)$}
\label{alg:mon-sam}
\begin{algorithmic}[1]
\Require \etree: Binary Expression Tree
\Require $\vct{p} = (\prob_1,\ldots, \prob_\numvar)$ $\in [0, 1]^N$
\Require $\conf$ $\in [0, 1]$
\Require $\error$ $\in [0, 1]$
\Require $\abs{\block} \in \mathbb{N}$%\bivec$ $\in [0, 1]^{\abs{\block}}$
%\Require $\abs{\block} \in \mathbb{N}$%\bivec$ $\in [0, 1]^{\abs{\block}}$
\Ensure \vari{acc} $\in \mathbb{R}$
\State $\vari{sample}_\vari{next} \gets 0$
%\State $\vari{sample}_\vari{next} \gets 0$
\State $\accum \gets 0$\label{alg:mon-sam-global1}
\State $\numsamp \gets \ceil{\frac{2 \log{\frac{4}{\conf}}}{\error^2}}$\label{alg:mon-sam-global2}
\State $(\vari{\etree}_\vari{mod}, \vari{size}) \gets $ \onepass($\etree$)\label{alg:mon-sam-onepass}\Comment{$\onepass$ is ~\cref{alg:one-pass} \;and \sampmon \; is ~\cref{alg:sample}}
\newline
\State $\vari{i} \gets 1$
\While{$\vari{i} \leq \numsamp$}\Comment{Perform the required number of samples}
\State $\bivec \gets [0]^{\abs{\block}}$\Comment{$\bivec$ is an array whose size is the number of blocks, used to check for cross-terms}\newline
\State $(\vari{M}, \vari{sgn}_\vari{i}) \gets $ \sampmon($\etree_\vari{mod}$)\label{alg:mon-sam-sample}
\For{$\vari{x}_\vari{\block,i}$ \text{ in } $\vari{M}$}
\If{$\bivec[\block] = 1$}\label{alg:mon-sam-check}\Comment{If we have already had a variable from this block, $\rpoly$ drops the sample.}
\newline
\State $\vari{sample}_{\vari{next}} \gets 1$
\State break
\Else
\State $\bivec[\block] = 1$
\State $\numsamp \gets \ceil{\frac{2 \log{\frac{2}{\conf}}}{\error^2}}$\label{alg:mon-sam-global2}
\State $(\vari{\etree}_\vari{mod}, \vari{size}) \gets $ \onepass($\etree$)\label{alg:mon-sam-onepass}\Comment{$\onepass$ is ~\cref{alg:one-pass}}
%\newline
%\State $\vari{i} \gets 1$
\For{$\vari{i} \in 1 \text{ to }\numsamp$}\Comment{Perform the required number of samples}
%\State $\bivec \gets [0]^{\abs{\block}}$\Comment{$\bivec$ is an array whose size is the number of blocks, used to check for cross-terms}\newline
\State $(\vari{M}, \vari{sgn}_\vari{i}) \gets $ \sampmon($\etree_\vari{mod}$)\label{alg:mon-sam-sample}\Comment{\sampmon \; is ~\cref{alg:sample}}
%\For{$\vari{x}_\vari{\block,i}$ \text{ in } $\vari{M}$}
% \If{$\bivec[\block] = 1$}\label{alg:mon-sam-check}\Comment{If we have already had a variable from this block, $\rpoly$ drops the sample.}
% \newline
% \State $\vari{sample}_{\vari{next}} \gets 1$
% \State break
% \Else
% \State $\bivec[\block] = 1$
% \State $\vari{sum} = 0$
% \For{$\ell \in [\abs{\block}]$}
% \State $\vari{sum} = \vari{sum} + \bivec[\block][\ell]$
@ -265,23 +275,24 @@ Recall that the notation $[x, y]$ denotes the range of values between $x$ and $y
% \If{$\vari{sum} \geq 2$}
% \State $\vari{sample}_{\vari{next}} \gets 1$
% \State continue\Comment{Not sure for psuedo code the best way to state this, but this is analogous to C language continue statement.}
\EndIf
\EndFor
\If{$\vari{sample}_{\vari{next}} = 1$}\label{alg:mon-sam-drop}
\State $\vari{sample}_{\vari{next}} \gets 0$\label{alg:mon-sam-resamp}
\Else
\State $\vari{Y}_\vari{i} \gets 1$\label{alg:mon-sam-assign1}\newline
\For{$\vari{x}_{\vari{j}}$ \text{ in } $\vari{M}$}%_{\vari{i}}$}
\State $\vari{Y}_\vari{i} \gets \vari{Y}_\vari{i} \times \; \vari{\prob}_\vari{j}$\label{alg:mon-sam-product2} \Comment{$\vari{p}_\vari{j}$ is the assignment to $\vari{x}_\vari{j}$ from input $\vct{p}$}
\EndFor
% \EndIf
% \EndFor
% \If{$\vari{sample}_{\vari{next}} = 1$}\label{alg:mon-sam-drop}
% \State $\vari{sample}_{\vari{next}} \gets 0$\label{alg:mon-sam-resamp}
% \Else
\If{$\vari{M}$ has at most one variable from each block}
\State $\vari{Y}_\vari{i} \gets \prod_{X_j\in\var\inparen{\vari{M}}}p_j$\label{alg:mon-sam-assign1}%\newline
%\For{$\vari{x}_{\vari{j}}$ \text{ in } $\vari{M}$}%_{\vari{i}}$}
% \State $\vari{Y}_\vari{i} \gets \vari{Y}_\vari{i} \times \; \vari{\prob}_\vari{j}$\label{alg:mon-sam-product2} \Comment{$\vari{p}_\vari{j}$ is the assignment to $\vari{x}_\vari{j}$ from input $\vct{p}$}
%\EndFor
\State $\vari{Y}_\vari{i} \gets \vari{Y}_\vari{i} \times\; \vari{sgn}_\vari{i}$\label{alg:mon-sam-product}
\State $\accum \gets \accum + \vari{Y}_\vari{i}$\Comment{Store the sum over all samples}\label{alg:mon-sam-add}
\State $\vari{i} \gets \vari{i} + 1$
%\State $\vari{i} \gets \vari{i} + 1$
\EndIf
\EndWhile
\EndFor
\State $\gamma \gets $ $\algname{Estimate}$ $\gamma(\etree, \numsamp, \abs{\block})$
\State $\vari{acc} \gets \vari{acc} \times \frac{\vari{size}}{\numsamp \cdot (1 - \gamma)}$\label{alg:mon-sam-global3}
%\State $\gamma \gets $ $\algname{Estimate}$ $\gamma(\etree, \numsamp, \abs{\block})$
\State $\vari{acc} \gets \vari{acc} \times \frac{\vari{size}}{\numsamp}$\label{alg:mon-sam-global3}
\State \Return \vari{acc}
\end{algorithmic}
\end{algorithm}
@ -319,16 +330,19 @@ Recall that the notation $[x, y]$ denotes the range of values between $x$ and $y
\subsubsection{Correctness}
We state the lemmas for $\onepass$ and \newline$\sampmon$, the auxiliary algorithms on which ~\cref{alg:mon-sam} relies. Their proofs are subsequent.
In order to prove~\Cref{lem:approx-alg}, we will need to argue the correctness of~\cref{alg:mon-sam}. Before we formally do that,
we first state the lemmas that summarize the relevant properties of $\onepass$ and \newline$\sampmon$, the auxiliary algorithms on which ~\cref{alg:mon-sam} relies. Their proofs are given in~\Cref{sec:onepass} and~\Cref{sec:samplemonomial} respectively.
\begin{Lemma}\label{lem:one-pass}
The $\onepass$ function completes in $O(size(\etree))$ time. After $\onepass$ returns the following post conditions hold. First, that $\abs{\vari{S}}(1,\ldots, 1)$ is correctly computed for each subtree $\vari{S}$ of $\etree$. Second, when $\vari{S}.\val = +$, the weighted distribution $\frac{\abs{\vari{S}_{\vari{child}}}(1,\ldots, 1)}{\abs{\vari{S}}(1,\ldots, 1)}$ is correctly computed for each child of $\vari{S}.$
The $\onepass$ function completes in $O(size(\etree))$ time. After $\onepass$ returns, the following postconditions hold. First, for each subtree $\vari{S}$ of $\etree$, we have that $\vari{S}.\vari{partial}$ is set to $\abs{\vari{S}}(1,\ldots, 1)$. Second, when $\vari{S}.\val = +$, for each $\vari{child}$ of $\vari{S}$, $\vari{child}.\vari{weight}$ is set to $\frac{\abs{\vari{S}_{\vari{child}}}(1,\ldots, 1)}{\abs{\vari{S}}(1,\ldots, 1)}$. % is correctly computed for each child of $\vari{S}.$
\end{Lemma}
At the conclusion of $\onepass$, $\etree.\vari{partial}$ will hold the sum of all coefficients in $\expandtree{\abs{\etree}}$, i.e., $\sum\limits_{(\monom, \coef) \in \expandtree{\abs{\etree}}}\coef$. $\etree.\vari{weight}$ will hold the weighted probability that $\etree$ is sampled from from its parent $+$ node.
In proving correctness of~\Cref{alg:mon-sam}, we will only use the following fact (which follows from the above lemma): $\etree_{\vari{mod}}.\vari{partial}=\abs{\etree}(1,\dots,1)$.
%At the conclusion of $\onepass$, $\etree.\vari{partial}$ will hold the sum of all coefficients in $\expandtree{\abs{\etree}}$, i.e., $\sum\limits_{(\monom, \coef) \in \expandtree{\abs{\etree}}}\coef$. $\etree.\vari{weight}$ will hold the weighted probability that $\etree$ is sampled from from its parent $+$ node.
\begin{Lemma}\label{lem:sample}
The function $\sampmon$ completes in $O(\log{k} \cdot k \cdot depth(\etree))$ time, where $k = \degree(poly(\abs{\etree})$. Upon completion, with probability $\frac{|\coef|}{\abs{\etree}(1,\ldots, 1)}$, $\sampmon$ returns the sampled term $\left(\monom, sign(\coef)\right)$ from $\expandtree{\abs{\etree}}$.
The function $\sampmon$ completes in $O(\log{k} \cdot k \cdot depth(\etree))$ time, where $k = \degree(\polyf(\abs{\etree}))$. Upon completion, every $\left(\monom, sign(\coef)\right)\in \expandtree{\abs{\etree}}$ is returned with probability $\frac{|\coef|}{\abs{\etree}(1,\ldots, 1)}$. %, $\sampmon$ returns the sampled term $\left(\monom, sign(\coef)\right)$ from $\expandtree{\abs{\etree}}$.
\end{Lemma}
\begin{Theorem}\label{lem:mon-samp}
@ -367,7 +381,7 @@ P\left(\left|\empmean - \expct\pbox{\empmean}\right| \geq \error\right) \leq 2\e
As implied above, Hoeffding's inequality assumes that the sum of random variables is divided by the number of variables. Since $\rpoly(\prob_1,\ldots, \prob_\numvar)\cdot(1 - \gamma) = \expct\pbox{\empmean} \cdot \abs{\etree}(1,\ldots, 1)$, our estimate is the sum of random samples multiplied by $\frac{\abs{\etree}(1,\ldots, 1)}{\samplesize \cdot (1 - \gamma)}$. This computation is performed in~\cref{alg:mon-sam-global3}.
%Also see that to properly estimate $\rpoly$, it is necessary to multiply by the number of monomials in $\rpoly$, i.e. $\abs{\etree}(1,\ldots, 1)$. Therefore it is the case that $\frac{acc}{N}$ gives the estimate of one monomial, and multiplying by $\abs{\etree}(1,\ldots, 1)$ yields the estimate of $\rpoly(\prob_1,\ldots, \prob_\numvar)$. This scaling is performed in line ~\ref{alg:mon-sam-global3}.
Line ~\ref{alg:mon-sam-sample} shows that $\vari{sgn}_\vari{i}$ has a value in $\{-1, 1\}$ that is mulitplied with at most $\degree(\polyf(\abs{\etree}))$ factors from $\vct{p}$ (\cref{alg:mon-sam-product2}) such that each $p_i$ is in $[0, 1]$, the range for each $\randvar_i$ ($\vari{Y}_\vari{i}$ in the psuedo code) is then strictly bounded by $[-1, 1]$. Bounding Hoeffding's results by $\conf$ ensures confidence no less than $1 - \conf$. Then by upperbounding Hoeffding with $\frac{\conf}{2}$ (since we take an additional estimate of $\gamma$), it is the case that
Line~\ref{alg:mon-sam-sample} shows that $\vari{sgn}_\vari{i}$ has a value in $\{-1, 1\}$ that is multiplied with at most $\degree(\polyf(\abs{\etree}))$ factors from $\vct{p}$ (\cref{alg:mon-sam-product2}). Since each $p_i$ is in $[0, 1]$, the range for each $\randvar_i$ ($\vari{Y}_\vari{i}$ in the pseudo code) is strictly bounded by $[-1, 1]$. Bounding Hoeffding's result by $\conf$ ensures confidence no less than $1 - \conf$. Then by upper bounding Hoeffding with $\frac{\conf}{2}$ (since we take an additional estimate of $\gamma$), it is the case that
\begin{equation*}
P\pbox{~\left| \empmean - \expct\pbox{\empmean} ~\right| \geq \error} \leq 2\exp{\left(-\frac{2\samplesize^2\error^2}{2^2 \samplesize}\right)} \leq \frac{\conf}{2}.
\end{equation*}
@ -382,7 +396,7 @@ Solving for the number of samples $\samplesize$ we get
&\frac{2\log{\frac{4}{\conf}}}{\error^2} \leq \samplesize.\label{eq:hoeff-6}
\end{align}
By Hoeffding we obtain the number of samples necessary to acheive the claimed additive error bounds.
By Hoeffding we obtain the number of samples necessary to achieve the claimed additive error bounds.
This concludes the proof for the first claim of theorem ~\ref{lem:mon-samp}.
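For a concrete sense of scale (assuming $\log$ denotes the natural logarithm), setting $\error = 0.1$ and $\conf = 0.05$ in~\cref{eq:hoeff-6} gives $\samplesize \geq \frac{2\log{80}}{0.01} \approx 876.4$, i.e., $877$ samples suffice, independent of the size of $\etree$.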
@ -419,6 +433,8 @@ and the runtime then follows, thus upholding ~\cref{lem:approx-alg}.
\subsection{OnePass Algorithm}
\label{sec:onepass}
\subsubsection{Description}
Algorithm ~\ref{alg:one-pass} satisfies the requirements of lemma ~\ref{lem:one-pass}.
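To make the postconditions of~\Cref{lem:one-pass} concrete, the following is a minimal sketch of a $\onepass$-style traversal in Python. The node layout (dictionaries with \texttt{type}, \texttt{children}, and \texttt{value} fields) and the helper names are our own illustration rather than the actual implementation; the tree built at the end encodes the factorized representation $(x + 2y)(2x - y)$ used earlier.
\begin{verbatim}
def one_pass(node):
    # node: {"type": "+", "*", "var", or "num", plus "children" or "value"}
    t = node["type"]
    if t == "num":
        node["partial"] = abs(node["value"])      # coefficient leaf: |c|
    elif t == "var":
        node["partial"] = 1                       # variable leaf evaluates to 1
    else:
        left, right = node["children"]
        one_pass(left)
        one_pass(right)
        if t == "+":
            node["partial"] = left["partial"] + right["partial"]
            left["weight"] = left["partial"] / node["partial"]
            right["weight"] = right["partial"] / node["partial"]
        else:                                     # "*" (product) node
            node["partial"] = left["partial"] * right["partial"]
    return node["partial"]

def var(v): return {"type": "var", "value": v}
def num(c): return {"type": "num", "value": c}
def op(t, l, r): return {"type": t, "children": [l, r]}

# (x + 2y)(2x - y): |etree|(1,...,1) = (1 + 2) * (2 + 1) = 9
etree = op("*", op("+", var("x"), op("*", num(2), var("y"))),
             op("+", op("*", num(2), var("x")), op("*", num(-1), var("y"))))
print(one_pass(etree))                            # 9
\end{verbatim}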
@ -604,13 +620,14 @@ Thus, the algorithm visits each node of $\etree$ one time, with a constant numbe
\subsection{Sample Algorithm}
\label{sec:samplemonomial}
Algorithm ~\ref{alg:sample} takes $\etree$ as input, samples an arbitrary $(\monom, \coef)$ from $\expandtree{\etree}$ with probabilities $\stree_\lchild.\wght$ and $\stree_\rchild.\wght$ for each subtree $\stree$ with $\stree.\type = +$, outputing the tuple $(\monom, \sign(\coef))$. While one cannot compute $\expandtree{\etree}$ in time better than $O(N^k)$, the algorithm, similar to \textsc{OnePass}, uses a technique on $\etree$ which produces a sample from $\expandtree{\etree}$ without ever materializing $\expandtree{\etree}$.
Algorithm ~\ref{alg:sample} takes $\etree$ as input, samples an arbitrary $(\monom, \coef)$ from $\expandtree{\etree}$ with probabilities $\stree_\lchild.\wght$ and $\stree_\rchild.\wght$ for each subtree $\stree$ with $\stree.\type = +$, outputting the tuple $(\monom, \sign(\coef))$. While one cannot compute $\expandtree{\etree}$ in time better than $O(N^k)$, the algorithm, similar to \textsc{OnePass}, uses a technique on $\etree$ which produces a sample from $\expandtree{\etree}$ without ever materializing $\expandtree{\etree}$.
Algorithm ~\ref{alg:sample} selects a monomial from $\expandtree{\etree}$ by the following top-down traversal. For a parent $+$ node, a subtree is chosen over the previously computed weighted sampling distribution. When a parent $\times$ node is visited, both children are visited. All variable leaf nodes of the subgraph traversal are added to a set. Additionally, the product of signs over all coefficient leaf nodes of the subgraph traversal is computed. The algorithm returns a set of the distinct variables of which the monomial is composed and the monomial's sign.
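The traversal just described can be sketched as follows (Python, reusing the hypothetical node layout and \texttt{weight} annotations from the $\onepass$ sketch above; variables are collected in a plain Python \texttt{set} rather than the TreeSet defined below).
\begin{verbatim}
import random

def sample_monomial(node):
    # Returns (set of variables, sign); assumes one_pass has already set a
    # "weight" field on every child of a "+" node.
    t = node["type"]
    if t == "num":
        return set(), 1 if node["value"] >= 0 else -1   # constant leaf: its sign
    if t == "var":
        return {node["value"]}, 1                       # variable leaf
    left, right = node["children"]
    if t == "+":                                        # choose one child by its weight
        child = left if random.random() < left["weight"] else right
        return sample_monomial(child)
    lvars, lsign = sample_monomial(left)                # "*" node: visit both children,
    rvars, rsign = sample_monomial(right)               # union variables, multiply signs
    return lvars | rvars, lsign * rsign
\end{verbatim}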
\begin{Definition}[TreeSet]
A TreeSet is a datastructure whose elements form a set, each of which are stored in a binary tree.
A TreeSet is a data structure whose elements form a set, each of which are stored in a binary tree.
\end{Definition}
Note that as stated, a TreeSet then facilitates logarithmic insertion.
@ -659,9 +676,9 @@ First, we need to show that $\sampmon$ indeed returns a monomial $\monom$, such
For the base case, let the depth $d$ of $\etree$ be $0$. We have that the root node is either a constant $\coef$ for which by line ~\ref{alg:sample-num-return} we return $\{~\}$, or we have that $\etree.\type = \var$ and $\etree.\val = x$, and by line ~\ref{alg:sample-var-return} we return $\{x\}$. Both cases satisfy ~\cref{def:monomial}, and the base case is proven.
By inductive hyptothesis, assume that for $d \leq k$ for $k \geq 1$, that it is indeed the case that $\sampmon$ returns a monomial.
For the inductive hypothesis, assume that for any tree of depth $d \leq k$, where $k \geq 1$, $\sampmon$ indeed returns a monomial.
For the inductive step, let us take a tree $\etree$ with $d = k + 1$. Note that each child has depth $d \leq k$, and by inductive hyptothesis both of them return a valid monomial. Then the root can be either a $+$ or $\times$ node. For the case of a $+$ root node, line ~\ref{alg:sample-plus-bsamp} of $\sampmon$ will choose one of the children of the root. Since by inductive hypothesis it is the case that a monomial is being returned from either child, and only one of these monomials is selected, we have for the case of $+$ root node that a valid monomial is returned by $\sampmon$. When the root is a $\times$ node, lines ~\ref{alg:sample-times-union} and ~\ref{alg:sample-times-product} multiply the monomials returned by the two children of the root, and by definition ~\ref{def:monomial} the product of two monomials is also a monomial, which means that $\sampmon$ returns a vaild monomial for the $\times$ root node, thus concluding the fact that $\sampmon$ indeed returns a monomial.
For the inductive step, let us take a tree $\etree$ with $d = k + 1$. Note that each child has depth $d \leq k$, and by inductive hypothesis both of them return a valid monomial. Then the root can be either a $+$ or $\times$ node. For the case of a $+$ root node, line ~\ref{alg:sample-plus-bsamp} of $\sampmon$ will choose one of the children of the root. Since by inductive hypothesis it is the case that a monomial is being returned from either child, and only one of these monomials is selected, we have for the case of $+$ root node that a valid monomial is returned by $\sampmon$. When the root is a $\times$ node, lines ~\ref{alg:sample-times-union} and ~\ref{alg:sample-times-product} multiply the monomials returned by the two children of the root, and by definition ~\ref{def:monomial} the product of two monomials is also a monomial, which means that $\sampmon$ returns a valid monomial for the $\times$ root node, thus concluding the fact that $\sampmon$ indeed returns a monomial.
%Note that for any monomial sampled by algorithm ~\ref{alg:sample}, the nodes traversed form a subgraph of $\etree$ that is \textit{not} a subtree in the general case. We thus seek to prove that the subgraph traversed produces the correct probability corresponding to the monomial sampled.
@ -689,7 +706,7 @@ and we obtain the desired result.
\paragraph{Run-time Analysis}
We now bound the number of recursive calls in $\sampmon$ by $O\left(k\cdot depth(\etree)\right)$. Take an arbitrary sample subgraph of expression tree $\etree$ of degree $k$ and pick an arbitrary level $i$. Call the number of $\times$ nodes in this level $y_i$, and the total number of nodes $x_i$. Given that both children of a $\times$ node are traversed in $\sampmon$ while only one child is traversed for a $+$ parent node, note that the number of nodes on level $i + 1$ in the general case is at most $y_i + x_i$, and the increase in the number of nodes from level $i$ to level $i + 1$ is upperbounded by $x_{i + 1} - x_i \leq y_i$.
We now bound the number of recursive calls in $\sampmon$ by $O\left(k\cdot depth(\etree)\right)$. Take an arbitrary sample subgraph of expression tree $\etree$ of degree $k$ and pick an arbitrary level $i$. Call the number of $\times$ nodes in this level $y_i$, and the total number of nodes $x_i$. Given that both children of a $\times$ node are traversed in $\sampmon$ while only one child is traversed for a $+$ parent node, note that the number of nodes on level $i + 1$ in the general case is at most $y_i + x_i$, and the increase in the number of nodes from level $i$ to level $i + 1$ is upper bounded by $x_{i + 1} - x_i \leq y_i$.
Now, we prove by induction on the depth $d$ of tree $\etree$ the following claim.
\begin{Claim}\label{claim:num-nodes-level-i}
@ -708,7 +725,7 @@ The inductive step is to show that for arbitrary $\etree$ with depth = $d + 1 \l
By ~\cref{def:degree}, a sampled monomial will have $O(k)$ $\times$ nodes, and this along with ~\cref{claim:num-nodes-level-i} implies $O(k)$ nodes at $\leq$ $depth(\etree)$ levels of the $\sampmon$ subgraph, bounding the number of recursive calls to $O(k \cdot depth(\etree))$.
Globally, lines ~\ref{alg:sample-global1} and ~\ref{alg:sample-global2} are $O(1)$ time. For the $+$ node, line ~\ref{alg:sample-plus-bsamp} has $O(1)$ time by the fact that $\etree$ is binary. Line ~\ref{alg:sample-plus-union} has $O(\log{k})$ time by nature of the TreeSet datastructure and the fact that by definition any monomial sampled from $\expandtree{\etree}$ has degree $\leq k$ and hence at most $k$ distinct variables, which in turn implies that the TreeSet has $\leq k$ elements in it at any time.
Globally, lines ~\ref{alg:sample-global1} and ~\ref{alg:sample-global2} are $O(1)$ time. For the $+$ node, line ~\ref{alg:sample-plus-bsamp} has $O(1)$ time by the fact that $\etree$ is binary. Line ~\ref{alg:sample-plus-union} has $O(\log{k})$ time by nature of the TreeSet data structure and the fact that by definition any monomial sampled from $\expandtree{\etree}$ has degree $\leq k$ and hence at most $k$ distinct variables, which in turn implies that the TreeSet has $\leq k$ elements in it at any time.
Finally, line ~\ref{alg:sample-times-product} is in $O(1)$ for a product and an assignment operation. When a times node is visited, the same union, product, and assignment operations take place, and we again have $O(\log{k})$ runtime. When a variable leaf node is traversed, the same union operation occurs with $O(\log{k})$ runtime, and a constant leaf node has the above mentioned product and assignment operations. Thus for each node visited, we have $O(\log{k})$ runtime, and the final runtime for $\sampmon$ is $O(\log{k} \cdot k \cdot depth(\etree))$.

View file

@ -88,7 +88,7 @@ We first argue that $\rpoly_{G}^\kElem(\prob,\ldots, \prob) = \sum\limits_{i = 0
%\sum_{\substack{(i_1, j_1),\\\cdots,\\(i_\kElem, j_\kElem) \in E}}X_{i_1}X_{j_1}\cdots X_{i_\kElem}X_{j_\kElem}
%\end{equation*}
%Since each of $(i_1, j_1),\ldots, (i_\kElem, j_\kElem)$ are from $E$, it follows that the set of $\kElem!$ permutations of the $\kElem$ $X_iX_j$ pairs which form the monomial products are of degree $2\kElem$ with the number of distinct variables in an arbitrary monomial $\leq 2\kElem$.
By definition, $\rpoly_{G}^{\kElem}(\vct{X})$ sets every exponent $e > 1$ to $e = 1$, which means that $\deg(\rpoly_{G}^\kElem)\le \deg\poly_G^\kElem=2k$. Thus, if we think of $\prob$ as a variable, then $\rpoly_{G}^{\kElem}(\prob,\dots,\prob)$ is a univariate polynomial of degree at most $\deg(\rpoly_{G}^\kElem)\le 2k$. Thus, we can write
By definition, $\rpoly_{G}^{\kElem}(\vct{X})$ sets every exponent $e > 1$ to $e = 1$, which means that $\degree(\rpoly_{G}^\kElem)\le \degree(\poly_G^\kElem)=2k$. Thus, if we think of $\prob$ as a variable, then $\rpoly_{G}^{\kElem}(\prob,\dots,\prob)$ is a univariate polynomial of degree at most $\degree(\rpoly_{G}^\kElem)\le 2k$. Thus, we can write
%thereby shrinking the degree a monomial product term in the SOP form of $\poly_{G}^{\kElem}(\vct{X})$ to the exact number of distinct variables the monomial contains. This implies that $\rpoly_{G}^\kElem$ is a polynomial of degree $2\kElem$ and hence $\rpoly_{G}^\kElem(\prob,\ldots, \prob)$ is a polynomial in $\prob$ of degree $2\kElem$. Then it is the case that
\begin{equation*}
\rpoly_{G}^{\kElem}(\prob,\ldots, \prob) = \sum_{i = 0}^{2\kElem} c_i \prob^i

View file

@ -4,43 +4,59 @@
\subsection{Polynomial Formulation and Equivalences}
Since we have shown that computing the expected multiplicity of a result tuple is equivalent to computing the expectation of a polynomial (for that tuple) given a probability distribution over all possible assignments of variables in the polynomial to $\{0,1\}$, we from now on focus on this problem exclusively.
Before proceeding, note that the following is assuming \bis (which subsume \tis as a special case). Thus, variables are independent of each other and each variable $X$ is associated with a probability $\vct{p}(X)$.
Let us use the expression $(x + y)^2$ for a running example in the following definitions.
Before proceeding, note that the following assumes \bis (which subsume \tis as a special case). Thus, variables are independent of each other and each variable $X$ is associated with a probability $\vct{p}(X) = \pd[X = 1]$.
Let us use the expression $(x + y)^2$ as a running example in this section.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Definition}[Monomial]\label{def:monomial}
A monomial is a product of a fixed set of variables, each raised to a non-negative integer power.
A monomial is a product of a set of variables, each raised to a non-negative integer power.
\end{Definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
For the term $2xy$, by ~\cref{def:monomial} the monomial is $xy$.
For instance, the term $2xy$ contains a single monomial $xy$. % \Cref{def:monomial} the monomial is $xy$.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Definition}[Standard Monomial Basis]\label{def:smb}
A polynomial is in standard monomial basis when it is fully expanded out such that no product of sums exist and where each unique monomial appears exactly once.
A polynomial is in standard monomial basis when it is of the form:
\[
\sum_{i=1}^n c_i \cdot m_i
\]
where each $c_i$ is a positive integer and each $m_i$ is a monomial and $m_i \neq m_j$ for $i \neq j$.
% fully expanded out such that no product of sums exist and where each unique monomial appears exactly once.
\end{Definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
The standard monomial basis for the running example is $x^2 +2xy + y^2$. While $x^2 + xy + xy + y^2$ is an expanded form of the expression, it is not the standard monomial basis since $xy$ appears more than once.
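For instance (a sketch using the \texttt{sympy} library, purely as a sanity check and not part of the formal development), expanding the running example mechanically recovers this basis.
\begin{verbatim}
import sympy as sp

x, y = sp.symbols('x y')
print(sp.expand((x + y)**2))   # x**2 + 2*x*y + y**2
\end{verbatim}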
Throughout this paper, we also make the following \textit{assumption}.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Assumption}\label{assump:poly-smb}
All polynomials considered are in standard monomial basis, i.e., $\poly(\vct{X}) = \sum\limits_{\vct{d} \in \mathbb{N}^\numvar}q_d \cdot \prod\limits_{i = 1, d_i \geq 1}^{\numvar}X_i^{d_i}$, where $q_d$ is the coefficient for the monomial encoded in $\vct{d}$ and $d_i$ is the $i^{th}$ element of $\vct{d}$.
\end{Assumption}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
While the definition of polynomial $\poly(\vct{X})$ over a $\bi$ input doesn't change, we introduce an alternative notation which will come in handy. Given $\ell$ blocks, we write $\poly(\vct{X})$ = $\poly(X_{\block_1, 1},\ldots, X_{\block_1, \abs{\block_1}},$ $\ldots, X_{\block_\ell, \abs{\block_\ell}})$, where $\abs{\block_i}$ denotes the size of $\block_i$, and $\block_{i, j}$ denotes tuple $j$ residing in block $i$ for $j$ in $[\abs{\block_i}]$.
The number of tuples in the $\bi$ instance can be (trivially) computed as $\numvar = \sum\limits_{i = 1}^{\ell}\abs{\block_i}$ .
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Definition}[Degree]\label{def:degree}
The degree of polynomial $\poly(\vct{X})$ is the maximum sum of the exponents of a monomial, over all monomials when $\poly(\vct{X})$ is in SOP form.
\end{Definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
The degree of the running example is $2$. In this paper we consider only finite degree polynomials.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Definition}[$\rpoly(\vct{X})$] \label{def:qtilde}
Define $\rpoly(X_1,\ldots, X_\numvar)$ as the reduced version of $\poly(X_1,\ldots, X_\numvar)$, of the form
$\rpoly(X_1,\ldots, X_\numvar) = $
\[\poly(X_1,\ldots, X_\numvar) \mod{X_1^2-X_1}\cdots\mod{X_\numvar^2 - X_\numvar}.\]
\end{Definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Example}\label{example:qtilde}
Consider when $\poly(x, y) = (x + y)(x + y)$. Then the expanded derivation for $\rpoly(x, y)$ is
\begin{align*}
@ -49,12 +65,14 @@ Consider when $\poly(x, y) = (x + y)(x + y)$. Then the expanded derivation for
= ~& x + 2xy + y
\end{align*}
\end{Example}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Intuitively, $\rpoly(\textbf{X})$ is the SOP form of $\poly(\textbf{X})$ such that if any $X_j$ term has an exponent $e > 1$, it is reduced to $1$, i.e. $X_j^e\mapsto X_j$ for any $e > 1$.
Alternatively, one can gain intuition for $\rpoly$ by thinking of it as the SOP that results from $\poly(\vct{X})$ under an idempotent product operator.
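The reduction itself is mechanical to carry out; the sketch below (again via \texttt{sympy}; the helper name \texttt{reduce\_poly} is ours) clamps every exponent above one down to one in the expanded form and reproduces~\Cref{example:qtilde}.
\begin{verbatim}
import sympy as sp

def reduce_poly(poly):
    # Mod out X_i^2 - X_i for every variable: clamp each exponent e > 1 to 1.
    expanded = sp.expand(poly)
    return expanded.replace(lambda t: t.is_Pow and t.exp > 1,
                            lambda t: t.base)

x, y = sp.symbols('x y')
print(reduce_poly((x + y)**2))   # x + 2*x*y + y
\end{verbatim}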
When considering $\bi$ input, it becomes necessary to redefine $\rpoly(\vct{X})$.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Definition}[$\rpoly$ $\bi$ Redefinition]
A polynomial $\poly(\vct{X})$ over a $\bi$ instance is reduced to $\rpoly(\vct{X})$ by the following criteria. First, all exponents $e > 1$ are reduced to $e = 1$. Second, all monomials containing more than one variable from the same block $\block$ are dropped. Formally, this is expressed as
@ -63,36 +81,44 @@ A polynomial $\poly(\vct{X})$ over a $\bi$ instance is reduced to $\rpoly(\vct{X
\end{equation*}
for all $i$ in $[\numvar]$ and for all $s$ in $[\ell]$, such that for all $t, u$ in $[\abs{\block_s}]$, $t \neq u$.
\end{Definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
The usefulness of this reduction will be seen in ~\cref{lem:exp-poly-rpoly}.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Lemma}\label{lem:pre-poly-rpoly}
When $\poly(X_1,\ldots, X_\numvar) = \sum\limits_{\vct{d} \in \{0,\ldots, B\}^\numvar}q_{\vct{d}} \cdot \prod\limits_{\substack{i = 1\\s.t. d_i\geq 1}}^{\numvar}X_i^{d_i}$, we have then that $\rpoly(X_1,\ldots, X_\numvar) = \sum\limits_{\vct{d} \in \{0,\ldots, B\}^\numvar} q_{\vct{d}}\cdot\prod\limits_{\substack{i = 1\\s.t. d_i\geq 1}}^{\numvar}X_i$.
\end{Lemma}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{proof}
Follows by the construction of $\rpoly$ in \cref{def:qtilde}.
Follows by the construction of $\rpoly$ in \cref{def:qtilde}. \qed
\end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\qed
Note the following fact:
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Proposition}\label{proposition:q-qtilde}
\[\text{For all } (X_1,\ldots, X_\numvar) \in \{0, 1\}^\numvar, \poly(X_1,\ldots, X_\numvar) = \rpoly(X_1,\ldots, X_\numvar).\]
\end{Proposition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{proof}[Proof for Proposition ~\ref{proposition:q-qtilde}]
Note that any $\poly$ in factorized form is equivalent to its sum of product expansion. For each term in the expanded form, further note that for all $b \in \{0, 1\}$ and all $e \geq 1$, $b^e = b$.
Note that any $\poly$ in factorized form is equivalent to its sum of product expansion. For each term in the expanded form, further note that for all $b \in \{0, 1\}$ and all $e \geq 1$, $b^e = b$. \qed
\end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\qed
Define all variables $X_i$ in $\poly$ to be independent.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Lemma}\label{lem:exp-poly-rpoly}
The expectation of $\poly(\vct{X})$ over possible worlds is equal to $\rpoly(\prob_1,\ldots, \prob_\numvar)$.
\begin{equation*}
\expct_{\vct{w}}\pbox{\poly(\vct{w})} = \rpoly(\prob_1,\ldots, \prob_\numvar).
\end{equation*}
\end{Lemma}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Note that in the preceding lemma, we have assigned $\vct{p}$ (introduced in ~\cref{subsec:def-data}) to the variables $\vct{X}$.
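As a quick numerical check of the lemma on the running example (a sketch; the probabilities $0.3$ and $0.6$ and the trial count are arbitrary choices for illustration):
\begin{verbatim}
import random

def q(x, y):                  # running example Q(x, y) = (x + y)^2
    return (x + y) ** 2

def q_tilde(p1, p2):          # reduced form x + 2xy + y, evaluated at the probabilities
    return p1 + 2 * p1 * p2 + p2

p1, p2, trials = 0.3, 0.6, 200_000
avg = sum(q(random.random() < p1, random.random() < p2) for _ in range(trials)) / trials
print(avg, q_tilde(p1, p2))   # both close to 0.3 + 2*0.18 + 0.6 = 1.26
\end{verbatim}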
@ -127,11 +153,19 @@ Finally, observe \cref{p1-s5} by construction in \cref{lem:pre-poly-rpoly}, that
\qed
\end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Corollary}\label{cor:expct-sop}
If $\poly$ is given as a sum of monomials, the expectation of $\poly$, i.e., $\expct\pbox{\poly} = \rpoly\left(\prob_1,\ldots, \prob_\numvar\right)$, can be computed in $O(|\poly|)$ time, where $|\poly|$ denotes the total number of multiplication/addition operators.
\end{Corollary}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{proof}[Proof For Corollary ~\ref{cor:expct-sop}]
Note that \cref{lem:exp-poly-rpoly} shows that $\expct\pbox{\poly} =$ $\rpoly(\prob_1,\ldots, \prob_\numvar)$. Therefore, if $\poly$ is already in sum of products form, one only needs to compute $\poly(\prob_1,\ldots, \prob_\numvar)$ ignoring exponent terms (note that such a polynomial is $\rpoly(\prob_1,\ldots, \prob_\numvar)$), which indeed takes $O(|\poly|)$ computations.\qed
\end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
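For the running example $\poly = x^2 + 2xy + y^2$, this amounts to evaluating $\prob_1 + 2\prob_1\prob_2 + \prob_2$, i.e., one constant-time operation per operator of $\poly$.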
%%% Local Variables:
%%% mode: latex
%%% TeX-master: "main"
%%% End: