Merge branch 'master' of gitlab.odin.cse.buffalo.edu:ahuber/SketchingWorlds

This commit is contained in:
Oliver Kennedy 2020-12-14 23:21:32 -05:00
commit fd23ea69f9
Signed by: okennedy
GPG key ID: 3E5F9B3ABD3FDB60
3 changed files with 120 additions and 69 deletions

View file

@ -2,7 +2,7 @@
\section{$1 \pm \epsilon$ Approximation Algorithm}
\label{sec:algo}
In~\cref{sec:hard}, we showed that computing the expected multiplicity of a compressed representation of a bag polynomial for TIDB (even just based on project-join queries) is unlikely to be possible in linear time (\cref{thm:mult-p-hard-result}), even if all tuples have the same probability of being present (\cref{cor:single-p-hard}). Given this, in this section we will design an approrixmation algorithm for our that runs in {\em linear time}. Unlike the results in~\cref{sec:hard} our approximation algorithm works for BIDB though our bounds are more meaningful for a non-trivial sublcass of BIDB that includes TIDB as well as PDB benchmarks (\cref{sec:experiments}).
In~\cref{sec:hard}, we showed that computing the expected multiplicity of a compressed representation of a bag polynomial for TIDB (even just based on project-join queries) is unlikely to be possible in linear time (\cref{thm:mult-p-hard-result}), even if all tuples have the same probability of being present (\cref{cor:single-p-hard}). Given this, in this section we design an approximation algorithm for our problem that runs in {\em linear time}. Unlike the results in~\cref{sec:hard}, our approximation algorithm works for BIDB, though our bounds are more meaningful for a non-trivial subclass of BIDB that includes TIDB as well as PDB benchmarks (\cref{sec:experiments}).
%it is then desirable to have an algorithm to approximate the multiplicity in linear time, which is what we describe next.
\subsection{Preliminaries and some more notation}
@ -138,7 +138,7 @@ Consider the factorized representation $(x + 2y)(2x - y)$ of the polynomial in~\
\begin{Definition}[Positive T]\label{def:positive-tree}
For any expression tree $\etree$, the correspondign
For any expression tree $\etree$, the corresponding
{\em positive tree}, denoted $\abs{\etree}$, is obtained from $\etree$ as follows. For each leaf node $\ell$ of $\etree$ where $\ell.\type$ is $\tnum$, update $\ell.\vari{value}$ to $|\ell.\vari{value}|$. %value $\coef$ of each coefficient leaf node in $\etree$ is set to %$\coef_i$ in $\etree$ is exchanged with its absolute value$|\coef|$.
\end{Definition}
@ -154,19 +154,20 @@ Given an expression tree $\etree$ and $\vct{v} \in \mathbb{R}^\numvar$, $\etree(
In the subsequent subsections we will prove the following theorem.
\begin{Theorem}\label{lem:approx-alg}
Let $\etree$ be an expression tree for a UCQ over BIDB and define $\poly(\vct{X})=\polyf(\etree)$ and let $k=\deg(\poly)$
Let $\etree$ be an expression tree for a UCQ over BIDB, define $\poly(\vct{X})=\polyf(\etree)$, and let $k=\degree(\poly)$.
%Let $\poly(\vct{X})$ be a query polynomial corresponding to the output of a UCQ in a BIDB.
An estimate $\mathcal{E}$ of $\rpoly(\prob_1,\ldots, \prob_\numvar)$ can be computed in time
\[O\left(\treesize(\etree) + \frac{\log{\frac{1}{\conf}}\cdot \abs{\etree}^2(1,\ldots, 1)\cdot k\cdot \log{k} \cdot depth(\etree))}{\error^2\cdot\rpoly^2(\prob_1,\ldots, \prob_\numvar)}\right),\]
An estimate $\mathcal{E}$ %=\approxq(\etree, (p_1,\dots,p_\numvar), \conf, \error')$
of $\rpoly(\prob_1,\ldots, \prob_\numvar)$ can be computed in time
\[O\left(\treesize(\etree) + \frac{\log{\frac{1}{\conf}}\cdot \abs{\etree}^2(1,\ldots, 1)\cdot k\cdot \log{k} \cdot depth(\etree)}{\inparen{\error'}^2\cdot\rpoly^2(\prob_1,\ldots, \prob_\numvar)}\right),\]
such that
\begin{equation}
\label{eq:approx-algo-bound}
P\left(\left|\mathcal{E} - \rpoly(\prob_1,\dots,\prob_\numvar)\right|> \error \cdot \rpoly(\prob_1,\dots,\prob_\numvar)\right) \leq \conf.
P\left(\left|\mathcal{E} - \rpoly(\prob_1,\dots,\prob_\numvar)\right|> \error' \cdot \rpoly(\prob_1,\dots,\prob_\numvar)\right) \leq \conf.
\end{equation}
%with multiplicative $(\error,\delta)$-bounds, where $k$ denotes the degree of $\poly$.
\end{Theorem}
It turns out that to get linear runtime resuls from~\cref{lem:approx-alg}, we will need to define another parameter (which roughly counts the (weighted) number of monomials in $\expandtree{\etree}$ that get `canceled' when modded with $\mathcal{B}$):
It turns out that to get linear runtime results from~\cref{lem:approx-alg}, we will need to define another parameter (which roughly counts the (weighted) number of monomials in $\expandtree{\etree}$ that get `canceled' when modded with $\mathcal{B}$):
\begin{Definition}[Parameter $\gamma$]\label{def:param-gamma}
Given an expression tree $\etree$, define
\[\gamma(\etree)=\frac{\sum_{(\monom, \coef)\in \expandtree{\etree}} \abs{\coef}\cdot \onesymbol\inparen{\monom\mod{\mathcal{B}}\equiv 0}}{\abs{\etree}(1,\ldots, 1)}\]
@ -177,10 +178,10 @@ We next present couple of corollaries of~\Cref{lem:approx-alg}.
\begin{Corollary}
\label{cor:approx-algo-const-p}
Let $\poly(\vct{X})$ be as in~\Cref{lem:approx-alg} and let $\gamma=\gamma(\etree)$. Further let it be the case that $p_i\ge p_0$ for all $i\in[\numvar]$. Then an estimate $\mathcal{E}$ of $\rpoly(\prob_1,\ldots, \prob_\numvar)$ satisfying~\cref{eq:approx-algo-bound} can be computed in time
\[O\left(\treesize(\etree) + \frac{\log{\frac{1}{\conf}}\cdot k\cdot \log{k} \cdot depth(\etree))}{\error^2\cdot(1-\gamma)^2\cdot p_0^{2k}}\right)\]
\[O\left(\treesize(\etree) + \frac{\log{\frac{1}{\conf}}\cdot k\cdot \log{k} \cdot depth(\etree)}{\inparen{\error'}^2\cdot(1-\gamma)^2\cdot p_0^{2k}}\right)\]
In particular, if $p_0>0$ and $\gamma<1$ are absolute constants then the above runtime simplifies to $O_k\left(\frac 1\eps\cdot\treesize(\etree)\cdot \log{\frac{1}{\conf}}\right)$.
\end{Corollary}
We note that the restiction on $\gamma$ is satisfied by TIDB (where $\gamma=0$) and for some BIDB benchmarks (see~\Cref{sec:experiments} for more on this claim).
We note that the restriction on $\gamma$ is satisfied by TIDB (where $\gamma=0$) and for some BIDB benchmarks (see~\Cref{sec:experiments} for more on this claim).
\AR{{\bf Boris/Oliver:} Is there a way to claim that all probabilities in practice are actually constants: i.e. they do not increase with the number of tuples?}
\begin{proof}[Proof of~\Cref{cor:approx-algo-const-p}]
@ -193,12 +194,21 @@ Applying this bound in the runtime bound in~\Cref{lem:approx-alg} gives the firs
\end{proof}
\subsection{Approximating $\rpoly$}
We state the approximation algorithm in terms of a $\bi$.
\subsubsection{Description}
Algorithm ~\ref{alg:mon-sam} approximates $\rpoly$ using the following steps. First, a call to $\onepass$ on its input $\etree$ produces a non-biased weight distribution over the monomials of $\expandtree{\etree}$ and a correct count of $|\etree|(1,\ldots, 1)$, i.e., the number of monomials in $\expandtree{\etree}$. Next, ~\cref{alg:mon-sam} calls $\sampmon$ to sample one monomial and its sign from $\expandtree{\etree}$. The sampling is repeated $\ceil{\frac{2\log{\frac{2}{\delta}}}{\epsilon^2}}$ times, where each of the samples are evaluated with input $\vct{p}$, multiplied by $1 \times sign$, and summed. The final result is scaled accordingly returning an estimate of $\rpoly$ with the claimed $(\error, \conf)$-bound of ~\cref{lem:mon-samp}.
The algorithm to prove~\Cref{lem:approx-alg} follows from the following observation. Given a query polynomial $\poly(\vct{X})=\polyf(\etree)$ for expression tree $\etree$ over $\bi$, we note that we can exactly represent $\rpoly(\vct{X})$ as follows:
\begin{equation}
\label{eq:tilde-Q-bi}
\rpoly\inparen{X_1,\dots,X_\numvar}=\sum_{(v,c)\in \expandtree{\etree}} \onesymbol\inparen{v\mod{\mathcal{B}}\not\equiv 0}\cdot c\cdot\prod_{X_i\in \var\inparen{v}} X_i.
\end{equation}
Given the above, our algorithm is a sampling-based estimator for the above sum: we sample $(v,c)\in \expandtree{\etree}$ with probability proportional\footnote{We could have also uniformly sampled from $\expandtree{\etree}$, but sampling proportional to $\abs{c}$ gives better parameters.} to $\abs{c}$ and compute $Y=\onesymbol\inparen{v\mod{\mathcal{B}}\not\equiv 0}\cdot \prod_{X_i\in \var\inparen{v}} p_i$. Taking enough samples and computing the average of $Y$ gives us our final estimate. Algorithm~\ref{alg:mon-sam} has the details.
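As a toy illustration of this estimator, consider the following sketch (in Python). It materializes the expansion of a small polynomial explicitly, which is exactly what~\Cref{alg:mon-sam} avoids doing; the polynomial, block structure, probabilities, and names (\texttt{expansion}, \texttt{block}, \texttt{p}) are made up purely for illustration.
\begin{verbatim}
import math, random

# Toy BIDB polynomial Q = (x1 + x2)(x1 + y), with x1, x2 in block 1 and y in block 2.
# expansion lists (variable set, coefficient) for every monomial of E(T).
expansion = [({"x1"}, 1), ({"x1", "y"}, 1), ({"x1", "x2"}, 1), ({"x2", "y"}, 1)]
block = {"x1": 1, "x2": 1, "y": 2}
p = {"x1": 0.3, "x2": 0.4, "y": 0.5}

weights = [abs(c) for _, c in expansion]
total = sum(weights)                               # plays the role of |etree|(1,...,1)
eps, delta = 0.05, 0.05
n = math.ceil(2 * math.log(4 / delta) / eps ** 2)  # sample count (natural log here)

acc = 0.0
for _ in range(n):
    (v, c), = random.choices(expansion, weights=weights, k=1)
    if len({block[x] for x in v}) == len(v):       # drop monomials with two variables in one block
        y = math.copysign(1, c)                    # sign of the sampled coefficient
        for x in v:
            y *= p[x]
        acc += y
print(acc * total / n)   # estimate of rpoly(p) = 0.3 + 0.3*0.5 + 0.4*0.5 = 0.65
\end{verbatim}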
%We state the approximation algorithm in terms of a $\bi$.
%\subsubsection{Description}
%Algorithm ~\ref{alg:mon-sam} approximates $\rpoly$ using the following steps. First, a call to $\onepass$ on its input $\etree$ produces a non-biased weight distribution over the monomials of $\expandtree{\etree}$ and a correct count of $|\etree|(1,\ldots, 1)$, i.e., the number of monomials in $\expandtree{\etree}$. Next, ~\cref{alg:mon-sam} calls $\sampmon$ to sample one monomial and its sign from $\expandtree{\etree}$. The sampling is repeated $\ceil{\frac{2\log{\frac{2}{\delta}}}{\epsilon^2}}$ times, where each of the samples are evaluated with input $\vct{p}$, multiplied by $1 \times sign$, and summed. The final result is scaled accordingly returning an estimate of $\rpoly$ with the claimed $(\error, \conf)$-bound of ~\cref{lem:mon-samp}.
\AR{Seems like the notation below belongs to the notation section (if we decide to state this explicitly at all)?}
Recall that the notation $[x, y]$ denotes the range of values between $x$ and $y$ inclusive. The notation $\{x, y\}$ denotes the set of values consisting of $x$ and $y$.
\subsubsection{Psuedo Code}
%\subsubsection{Psuedo Code}
%Original TIDB Algorithm
%\begin{algorithm}[H]
@ -232,32 +242,32 @@ Recall that the notation $[x, y]$ denotes the range of values between $x$ and $y
\begin{algorithm}[H]
\caption{$\approxq_{\biabb}$($\etree$, $\vct{p}$, $\conf$, $\error$, $\abs{\block}$)}
\caption{$\approxq(\etree, \vct{p}, \conf, \error)$}
\label{alg:mon-sam}
\begin{algorithmic}[1]
\Require \etree: Binary Expression Tree
\Require $\vct{p} = (\prob_1,\ldots, \prob_\numvar)$ $\in [0, 1]^N$
\Require $\conf$ $\in [0, 1]$
\Require $\error$ $\in [0, 1]$
\Require $\abs{\block} \in \mathbb{N}$%\bivec$ $\in [0, 1]^{\abs{\block}}$
%\Require $\abs{\block} \in \mathbb{N}$%\bivec$ $\in [0, 1]^{\abs{\block}}$
\Ensure \vari{acc} $\in \mathbb{R}$
\State $\vari{sample}_\vari{next} \gets 0$
%\State $\vari{sample}_\vari{next} \gets 0$
\State $\accum \gets 0$\label{alg:mon-sam-global1}
\State $\numsamp \gets \ceil{\frac{2 \log{\frac{4}{\conf}}}{\error^2}}$\label{alg:mon-sam-global2}
\State $(\vari{\etree}_\vari{mod}, \vari{size}) \gets $ \onepass($\etree$)\label{alg:mon-sam-onepass}\Comment{$\onepass$ is ~\cref{alg:one-pass} \;and \sampmon \; is ~\cref{alg:sample}}
\newline
\State $\vari{i} \gets 1$
\While{$\vari{i} \leq \numsamp$}\Comment{Perform the required number of samples}
\State $\bivec \gets [0]^{\abs{\block}}$\Comment{$\bivec$ is an array whose size is the number of blocks, used to check for cross-terms}\newline
\State $(\vari{M}, \vari{sgn}_\vari{i}) \gets $ \sampmon($\etree_\vari{mod}$)\label{alg:mon-sam-sample}
\For{$\vari{x}_\vari{\block,i}$ \text{ in } $\vari{M}$}
\If{$\bivec[\block] = 1$}\label{alg:mon-sam-check}\Comment{If we have already had a variable from this block, $\rpoly$ drops the sample.}
\newline
\State $\vari{sample}_{\vari{next}} \gets 1$
\State break
\Else
\State $\bivec[\block] = 1$
\State $\numsamp \gets \ceil{\frac{2 \log{\frac{2}{\conf}}}{\error^2}}$\label{alg:mon-sam-global2}
\State $(\vari{\etree}_\vari{mod}, \vari{size}) \gets $ \onepass($\etree$)\label{alg:mon-sam-onepass}\Comment{$\onepass$ is ~\cref{alg:one-pass}}
%\newline
%\State $\vari{i} \gets 1$
\For{$\vari{i} \in 1 \text{ to }\numsamp$}\Comment{Perform the required number of samples}
%\State $\bivec \gets [0]^{\abs{\block}}$\Comment{$\bivec$ is an array whose size is the number of blocks, used to check for cross-terms}\newline
\State $(\vari{M}, \vari{sgn}_\vari{i}) \gets $ \sampmon($\etree_\vari{mod}$)\label{alg:mon-sam-sample}\Comment{\sampmon \; is ~\cref{alg:sample}}
%\For{$\vari{x}_\vari{\block,i}$ \text{ in } $\vari{M}$}
% \If{$\bivec[\block] = 1$}\label{alg:mon-sam-check}\Comment{If we have already had a variable from this block, $\rpoly$ drops the sample.}
% \newline
% \State $\vari{sample}_{\vari{next}} \gets 1$
% \State break
% \Else
% \State $\bivec[\block] = 1$
% \State $\vari{sum} = 0$
% \For{$\ell \in [\abs{\block}]$}
% \State $\vari{sum} = \vari{sum} + \bivec[\block][\ell]$
@ -265,23 +275,24 @@ Recall that the notation $[x, y]$ denotes the range of values between $x$ and $y
% \If{$\vari{sum} \geq 2$}
% \State $\vari{sample}_{\vari{next}} \gets 1$
% \State continue\Comment{Not sure for psuedo code the best way to state this, but this is analogous to C language continue statement.}
\EndIf
\EndFor
\If{$\vari{sample}_{\vari{next}} = 1$}\label{alg:mon-sam-drop}
\State $\vari{sample}_{\vari{next}} \gets 0$\label{alg:mon-sam-resamp}
\Else
\State $\vari{Y}_\vari{i} \gets 1$\label{alg:mon-sam-assign1}\newline
\For{$\vari{x}_{\vari{j}}$ \text{ in } $\vari{M}$}%_{\vari{i}}$}
\State $\vari{Y}_\vari{i} \gets \vari{Y}_\vari{i} \times \; \vari{\prob}_\vari{j}$\label{alg:mon-sam-product2} \Comment{$\vari{p}_\vari{j}$ is the assignment to $\vari{x}_\vari{j}$ from input $\vct{p}$}
\EndFor
% \EndIf
% \EndFor
% \If{$\vari{sample}_{\vari{next}} = 1$}\label{alg:mon-sam-drop}
% \State $\vari{sample}_{\vari{next}} \gets 0$\label{alg:mon-sam-resamp}
% \Else
\If{$\vari{M}$ has at most one variable from each block}
\State $\vari{Y}_\vari{i} \gets \prod_{X_j\in\var\inparen{\vari{M}}}p_j$\label{alg:mon-sam-assign1}%\newline
%\For{$\vari{x}_{\vari{j}}$ \text{ in } $\vari{M}$}%_{\vari{i}}$}
% \State $\vari{Y}_\vari{i} \gets \vari{Y}_\vari{i} \times \; \vari{\prob}_\vari{j}$\label{alg:mon-sam-product2} \Comment{$\vari{p}_\vari{j}$ is the assignment to $\vari{x}_\vari{j}$ from input $\vct{p}$}
%\EndFor
\State $\vari{Y}_\vari{i} \gets \vari{Y}_\vari{i} \times\; \vari{sgn}_\vari{i}$\label{alg:mon-sam-product}
\State $\accum \gets \accum + \vari{Y}_\vari{i}$\Comment{Store the sum over all samples}\label{alg:mon-sam-add}
\State $\vari{i} \gets \vari{i} + 1$
%\State $\vari{i} \gets \vari{i} + 1$
\EndIf
\EndWhile
\EndFor
\State $\gamma \gets $ $\algname{Estimate}$ $\gamma(\etree, \numsamp, \abs{\block})$
\State $\vari{acc} \gets \vari{acc} \times \frac{\vari{size}}{\numsamp \cdot (1 - \gamma)}$\label{alg:mon-sam-global3}
%\State $\gamma \gets $ $\algname{Estimate}$ $\gamma(\etree, \numsamp, \abs{\block})$
\State $\vari{acc} \gets \vari{acc} \times \frac{\vari{size}}{\numsamp}$\label{alg:mon-sam-global3}
\State \Return \vari{acc}
\end{algorithmic}
\end{algorithm}
@ -319,16 +330,19 @@ Recall that the notation $[x, y]$ denotes the range of values between $x$ and $y
\subsubsection{Correctness}
We state the lemmas for $\onepass$ and \newline$\sampmon$, the auxiliary algorithms on which ~\cref{alg:mon-sam} relies. Their proofs are subsequent.
In order to prove~\Cref{lem:approx-alg}, we will need to argue the correctness of~\cref{alg:mon-sam}. Before we formally do that,
we first state the lemmas that summarize the relevant properties of $\onepass$ and \newline$\sampmon$, the auxiliary algorithms on which ~\cref{alg:mon-sam} relies. Their proofs are given in~\Cref{sec:onepass} and~\Cref{sec:samplemonomial} respectively.
\begin{Lemma}\label{lem:one-pass}
The $\onepass$ function completes in $O(size(\etree))$ time. After $\onepass$ returns the following post conditions hold. First, that $\abs{\vari{S}}(1,\ldots, 1)$ is correctly computed for each subtree $\vari{S}$ of $\etree$. Second, when $\vari{S}.\val = +$, the weighted distribution $\frac{\abs{\vari{S}_{\vari{child}}}(1,\ldots, 1)}{\abs{\vari{S}}(1,\ldots, 1)}$ is correctly computed for each child of $\vari{S}.$
The $\onepass$ function completes in $O(size(\etree))$ time. After $\onepass$ returns, the following postconditions hold. First, for each subtree $\vari{S}$ of $\etree$, we have that $\vari{S}.\vari{partial}$ is set to $\abs{\vari{S}}(1,\ldots, 1)$. Second, when $\vari{S}.\val = +$, for each $\vari{child}$ of $\vari{S}$, $\vari{child}.\vari{weight}$ is set to $\frac{\abs{\vari{S}_{\vari{child}}}(1,\ldots, 1)}{\abs{\vari{S}}(1,\ldots, 1)}$. % is correctly computed for each child of $\vari{S}.$
\end{Lemma}
At the conclusion of $\onepass$, $\etree.\vari{partial}$ will hold the sum of all coefficients in $\expandtree{\abs{\etree}}$, i.e., $\sum\limits_{(\monom, \coef) \in \expandtree{\abs{\etree}}}\coef$. $\etree.\vari{weight}$ will hold the weighted probability that $\etree$ is sampled from from its parent $+$ node.
In proving correctness of~\Cref{alg:mon-sam}, we will only use the following fact (which follows from the above lemma): $\etree_{\vari{mod}}.\vari{partial}=\abs{\etree}(1,\dots,1)$.
%At the conclusion of $\onepass$, $\etree.\vari{partial}$ will hold the sum of all coefficients in $\expandtree{\abs{\etree}}$, i.e., $\sum\limits_{(\monom, \coef) \in \expandtree{\abs{\etree}}}\coef$. $\etree.\vari{weight}$ will hold the weighted probability that $\etree$ is sampled from from its parent $+$ node.
\begin{Lemma}\label{lem:sample}
The function $\sampmon$ completes in $O(\log{k} \cdot k \cdot depth(\etree))$ time, where $k = \degree(poly(\abs{\etree})$. Upon completion, with probability $\frac{|\coef|}{\abs{\etree}(1,\ldots, 1)}$, $\sampmon$ returns the sampled term $\left(\monom, sign(\coef)\right)$ from $\expandtree{\abs{\etree}}$.
The function $\sampmon$ completes in $O(\log{k} \cdot k \cdot depth(\etree))$ time, where $k = \degree(\polyf(\abs{\etree}))$. Upon completion, every $\left(\monom, sign(\coef)\right)\in \expandtree{\abs{\etree}}$ is returned with probability $\frac{|\coef|}{\abs{\etree}(1,\ldots, 1)}$. %, $\sampmon$ returns the sampled term $\left(\monom, sign(\coef)\right)$ from $\expandtree{\abs{\etree}}$.
\end{Lemma}
\begin{Theorem}\label{lem:mon-samp}
@ -367,7 +381,7 @@ P\left(\left|\empmean - \expct\pbox{\empmean}\right| \geq \error\right) \leq 2\e
As implied above, Hoeffding's inequality assumes that the sum of random variables is divided by the number of variables. Since $\rpoly(\prob_1,\ldots, \prob_\numvar)\cdot(1 - \gamma) = \expct\pbox{\empmean} \cdot \abs{\etree}(1,\ldots, 1)$, our estimate is the sum of random samples multiplied by $\frac{\abs{\etree}(1,\ldots, 1)}{\samplesize \cdot (1 - \gamma)}$. This computation is performed in~\cref{alg:mon-sam-global3}.
%Also see that to properly estimate $\rpoly$, it is necessary to multiply by the number of monomials in $\rpoly$, i.e. $\abs{\etree}(1,\ldots, 1)$. Therefore it is the case that $\frac{acc}{N}$ gives the estimate of one monomial, and multiplying by $\abs{\etree}(1,\ldots, 1)$ yields the estimate of $\rpoly(\prob_1,\ldots, \prob_\numvar)$. This scaling is performed in line ~\ref{alg:mon-sam-global3}.
Line ~\ref{alg:mon-sam-sample} shows that $\vari{sgn}_\vari{i}$ has a value in $\{-1, 1\}$ that is mulitplied with at most $\degree(\polyf(\abs{\etree}))$ factors from $\vct{p}$ (\cref{alg:mon-sam-product2}) such that each $p_i$ is in $[0, 1]$, the range for each $\randvar_i$ ($\vari{Y}_\vari{i}$ in the psuedo code) is then strictly bounded by $[-1, 1]$. Bounding Hoeffding's results by $\conf$ ensures confidence no less than $1 - \conf$. Then by upperbounding Hoeffding with $\frac{\conf}{2}$ (since we take an additional estimate of $\gamma$), it is the case that
Line~\ref{alg:mon-sam-sample} shows that $\vari{sgn}_\vari{i}$ has a value in $\{-1, 1\}$ that is multiplied with at most $\degree(\polyf(\abs{\etree}))$ factors from $\vct{p}$ (\cref{alg:mon-sam-product2}). Since each $p_i$ is in $[0, 1]$, the range for each $\randvar_i$ ($\vari{Y}_\vari{i}$ in the pseudo code) is strictly bounded by $[-1, 1]$. Bounding Hoeffding's result by $\conf$ ensures confidence no less than $1 - \conf$. Then by upper bounding Hoeffding with $\frac{\conf}{2}$ (since we take an additional estimate of $\gamma$), it is the case that
\begin{equation*}
P\pbox{~\left| \empmean - \expct\pbox{\empmean} ~\right| \geq \error} \leq 2\exp{\left(-\frac{2\samplesize^2\error^2}{2^2 \samplesize}\right)} \leq \frac{\conf}{2}.
\end{equation*}
@ -382,7 +396,7 @@ Solving for the number of samples $\samplesize$ we get
&\frac{2\log{\frac{4}{\conf}}}{\error^2} \leq \samplesize.\label{eq:hoeff-6}
\end{align}
By Hoeffding we obtain the number of samples necessary to acheive the claimed additive error bounds.
By Hoeffding we obtain the number of samples necessary to achieve the claimed additive error bounds.
This concludes the proof for the first claim of theorem ~\ref{lem:mon-samp}.
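For a concrete sense of scale (assuming $\log$ denotes the natural logarithm), setting $\error = 0.1$ and $\conf = 0.05$ in~\cref{eq:hoeff-6} gives $\samplesize \geq \frac{2\log{80}}{0.01} \approx 876.4$, i.e., $877$ samples suffice, independent of the size of $\etree$.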
@ -419,6 +433,8 @@ and the runtime then follows, thus upholding ~\cref{lem:approx-alg}.
\subsection{OnePass Algorithm}
\label{sec:onepass}
\subsubsection{Description}
Algorithm ~\ref{alg:one-pass} satisfies the requirements of lemma ~\ref{lem:one-pass}.
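To make the postconditions of~\Cref{lem:one-pass} concrete, the following is a minimal sketch of a $\onepass$-style traversal in Python. The node layout (dictionaries with \texttt{type}, \texttt{children}, and \texttt{value} fields) and the helper names are our own illustration rather than the actual implementation; the tree built at the end encodes the factorized representation $(x + 2y)(2x - y)$ used earlier.
\begin{verbatim}
def one_pass(node):
    # node: {"type": "+", "*", "var", or "num", plus "children" or "value"}
    t = node["type"]
    if t == "num":
        node["partial"] = abs(node["value"])      # coefficient leaf: |c|
    elif t == "var":
        node["partial"] = 1                       # variable leaf evaluates to 1
    else:
        left, right = node["children"]
        one_pass(left)
        one_pass(right)
        if t == "+":
            node["partial"] = left["partial"] + right["partial"]
            left["weight"] = left["partial"] / node["partial"]
            right["weight"] = right["partial"] / node["partial"]
        else:                                     # "*" (product) node
            node["partial"] = left["partial"] * right["partial"]
    return node["partial"]

def var(v): return {"type": "var", "value": v}
def num(c): return {"type": "num", "value": c}
def op(t, l, r): return {"type": t, "children": [l, r]}

# (x + 2y)(2x - y): |etree|(1,...,1) = (1 + 2) * (2 + 1) = 9
etree = op("*", op("+", var("x"), op("*", num(2), var("y"))),
             op("+", op("*", num(2), var("x")), op("*", num(-1), var("y"))))
print(one_pass(etree))                            # 9
\end{verbatim}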
@ -604,13 +620,14 @@ Thus, the algorithm visits each node of $\etree$ one time, with a constant numbe
\subsection{Sample Algorithm}
\label{sec:samplemonomial}
Algorithm ~\ref{alg:sample} takes $\etree$ as input, samples an arbitrary $(\monom, \coef)$ from $\expandtree{\etree}$ with probabilities $\stree_\lchild.\wght$ and $\stree_\rchild.\wght$ for each subtree $\stree$ with $\stree.\type = +$, outputing the tuple $(\monom, \sign(\coef))$. While one cannot compute $\expandtree{\etree}$ in time better than $O(N^k)$, the algorithm, similar to \textsc{OnePass}, uses a technique on $\etree$ which produces a sample from $\expandtree{\etree}$ without ever materializing $\expandtree{\etree}$.
Algorithm ~\ref{alg:sample} takes $\etree$ as input, samples an arbitrary $(\monom, \coef)$ from $\expandtree{\etree}$ with probabilities $\stree_\lchild.\wght$ and $\stree_\rchild.\wght$ for each subtree $\stree$ with $\stree.\type = +$, outputting the tuple $(\monom, \sign(\coef))$. While one cannot compute $\expandtree{\etree}$ in time better than $O(N^k)$, the algorithm, similar to \textsc{OnePass}, uses a technique on $\etree$ which produces a sample from $\expandtree{\etree}$ without ever materializing $\expandtree{\etree}$.
Algorithm ~\ref{alg:sample} selects a monomial from $\expandtree{\etree}$ by the following top-down traversal. For a parent $+$ node, a subtree is chosen over the previously computed weighted sampling distribution. When a parent $\times$ node is visited, both children are visited. All variable leaf nodes of the subgraph traversal are added to a set. Additionally, the product of signs over all coefficient leaf nodes of the subgraph traversal is computed. The algorithm returns a set of the distinct variables of which the monomial is composed and the monomial's sign.
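The traversal just described can be sketched as follows (Python, reusing the hypothetical node layout and \texttt{weight} annotations from the $\onepass$ sketch above; variables are collected in a plain Python \texttt{set} rather than the TreeSet defined below).
\begin{verbatim}
import random

def sample_monomial(node):
    # Returns (set of variables, sign); assumes one_pass has already set a
    # "weight" field on every child of a "+" node.
    t = node["type"]
    if t == "num":
        return set(), 1 if node["value"] >= 0 else -1   # constant leaf: its sign
    if t == "var":
        return {node["value"]}, 1                       # variable leaf
    left, right = node["children"]
    if t == "+":                                        # choose one child by its weight
        child = left if random.random() < left["weight"] else right
        return sample_monomial(child)
    lvars, lsign = sample_monomial(left)                # "*" node: visit both children,
    rvars, rsign = sample_monomial(right)               # union variables, multiply signs
    return lvars | rvars, lsign * rsign
\end{verbatim}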
\begin{Definition}[TreeSet]
A TreeSet is a datastructure whose elements form a set, each of which are stored in a binary tree.
A TreeSet is a data structure whose elements form a set, each of which are stored in a binary tree.
\end{Definition}
Note that as stated, a TreeSet then facilitates logarithmic insertion.
@ -659,9 +676,9 @@ First, we need to show that $\sampmon$ indeed returns a monomial $\monom$, such
For the base case, let the depth $d$ of $\etree$ be $0$. We have that the root node is either a constant $\coef$ for which by line ~\ref{alg:sample-num-return} we return $\{~\}$, or we have that $\etree.\type = \var$ and $\etree.\val = x$, and by line ~\ref{alg:sample-var-return} we return $\{x\}$. Both cases satisfy ~\cref{def:monomial}, and the base case is proven.
By inductive hyptothesis, assume that for $d \leq k$ for $k \geq 1$, that it is indeed the case that $\sampmon$ returns a monomial.
For the inductive hypothesis, assume that for any tree of depth $d \leq k$, where $k \geq 1$, $\sampmon$ indeed returns a monomial.
For the inductive step, let us take a tree $\etree$ with $d = k + 1$. Note that each child has depth $d \leq k$, and by inductive hyptothesis both of them return a valid monomial. Then the root can be either a $+$ or $\times$ node. For the case of a $+$ root node, line ~\ref{alg:sample-plus-bsamp} of $\sampmon$ will choose one of the children of the root. Since by inductive hypothesis it is the case that a monomial is being returned from either child, and only one of these monomials is selected, we have for the case of $+$ root node that a valid monomial is returned by $\sampmon$. When the root is a $\times$ node, lines ~\ref{alg:sample-times-union} and ~\ref{alg:sample-times-product} multiply the monomials returned by the two children of the root, and by definition ~\ref{def:monomial} the product of two monomials is also a monomial, which means that $\sampmon$ returns a vaild monomial for the $\times$ root node, thus concluding the fact that $\sampmon$ indeed returns a monomial.
For the inductive step, let us take a tree $\etree$ with $d = k + 1$. Note that each child has depth $d \leq k$, and by inductive hypothesis both of them return a valid monomial. Then the root can be either a $+$ or $\times$ node. For the case of a $+$ root node, line ~\ref{alg:sample-plus-bsamp} of $\sampmon$ will choose one of the children of the root. Since by inductive hypothesis it is the case that a monomial is being returned from either child, and only one of these monomials is selected, we have for the case of $+$ root node that a valid monomial is returned by $\sampmon$. When the root is a $\times$ node, lines ~\ref{alg:sample-times-union} and ~\ref{alg:sample-times-product} multiply the monomials returned by the two children of the root, and by definition ~\ref{def:monomial} the product of two monomials is also a monomial, which means that $\sampmon$ returns a valid monomial for the $\times$ root node, thus concluding the fact that $\sampmon$ indeed returns a monomial.
%Note that for any monomial sampled by algorithm ~\ref{alg:sample}, the nodes traversed form a subgraph of $\etree$ that is \textit{not} a subtree in the general case. We thus seek to prove that the subgraph traversed produces the correct probability corresponding to the monomial sampled.
@ -689,7 +706,7 @@ and we obtain the desired result.
\paragraph{Run-time Analysis}
We now bound the number of recursive calls in $\sampmon$ by $O\left(k\cdot depth(\etree)\right)$. Take an arbitrary sample subgraph of expression tree $\etree$ of degree $k$ and pick an arbitrary level $i$. Call the number of $\times$ nodes in this level $y_i$, and the total number of nodes $x_i$. Given that both children of a $\times$ node are traversed in $\sampmon$ while only one child is traversed for a $+$ parent node, note that the number of nodes on level $i + 1$ in the general case is at most $y_i + x_i$, and the increase in the number of nodes from level $i$ to level $i + 1$ is upperbounded by $x_{i + 1} - x_i \leq y_i$.
We now bound the number of recursive calls in $\sampmon$ by $O\left(k\cdot depth(\etree)\right)$. Take an arbitrary sample subgraph of expression tree $\etree$ of degree $k$ and pick an arbitrary level $i$. Call the number of $\times$ nodes in this level $y_i$, and the total number of nodes $x_i$. Given that both children of a $\times$ node are traversed in $\sampmon$ while only one child is traversed for a $+$ parent node, note that the number of nodes on level $i + 1$ in the general case is at most $y_i + x_i$, and the increase in the number of nodes from level $i$ to level $i + 1$ is upper bounded by $x_{i + 1} - x_i \leq y_i$.
Now, we prove by induction on the depth $d$ of tree $\etree$ the following claim.
\begin{Claim}\label{claim:num-nodes-level-i}
@ -708,7 +725,7 @@ The inductive step is to show that for arbitrary $\etree$ with depth = $d + 1 \l
By ~\cref{def:degree}, a sampled monomial will have $O(k)$ $\times$ nodes, and this along with ~\cref{claim:num-nodes-level-i} implies $O(k)$ nodes at $\leq$ $depth(\etree)$ levels of the $\sampmon$ subgraph, bounding the number of recursive calls to $O(k \cdot depth(\etree))$.
Globally, lines ~\ref{alg:sample-global1} and ~\ref{alg:sample-global2} are $O(1)$ time. For the $+$ node, line ~\ref{alg:sample-plus-bsamp} has $O(1)$ time by the fact that $\etree$ is binary. Line ~\ref{alg:sample-plus-union} has $O(\log{k})$ time by nature of the TreeSet datastructure and the fact that by definition any monomial sampled from $\expandtree{\etree}$ has degree $\leq k$ and hence at most $k$ distinct variables, which in turn implies that the TreeSet has $\leq k$ elements in it at any time.
Globally, lines ~\ref{alg:sample-global1} and ~\ref{alg:sample-global2} are $O(1)$ time. For the $+$ node, line ~\ref{alg:sample-plus-bsamp} has $O(1)$ time by the fact that $\etree$ is binary. Line ~\ref{alg:sample-plus-union} has $O(\log{k})$ time by nature of the TreeSet data structure and the fact that by definition any monomial sampled from $\expandtree{\etree}$ has degree $\leq k$ and hence at most $k$ distinct variables, which in turn implies that the TreeSet has $\leq k$ elements in it at any time.
Finally, line ~\ref{alg:sample-times-product} is in $O(1)$ for a product and an assignment operation. When a times node is visited, the same union, product, and assignment operations take place, and we again have $O(\log{k})$ runtime. When a variable leaf node is traversed, the same union operation occurs with $O(\log{k})$ runtime, and a constant leaf node has the above mentioned product and assignment operations. Thus for each node visited, we have $O(\log{k})$ runtime, and the final runtime for $\sampmon$ is $O(\log{k} \cdot k \cdot depth(\etree))$.

View file

@ -88,7 +88,7 @@ We first argue that $\rpoly_{G}^\kElem(\prob,\ldots, \prob) = \sum\limits_{i = 0
%\sum_{\substack{(i_1, j_1),\\\cdots,\\(i_\kElem, j_\kElem) \in E}}X_{i_1}X_{j_1}\cdots X_{i_\kElem}X_{j_\kElem}
%\end{equation*}
%Since each of $(i_1, j_1),\ldots, (i_\kElem, j_\kElem)$ are from $E$, it follows that the set of $\kElem!$ permutations of the $\kElem$ $X_iX_j$ pairs which form the monomial products are of degree $2\kElem$ with the number of distinct variables in an arbitrary monomial $\leq 2\kElem$.
By definition, $\rpoly_{G}^{\kElem}(\vct{X})$ sets every exponent $e > 1$ to $e = 1$, which means that $\deg(\rpoly_{G}^\kElem)\le \deg\poly_G^\kElem=2k$. Thus, if we think of $\prob$ as a variable, then $\rpoly_{G}^{\kElem}(\prob,\dots,\prob)$ is a univariate polynomial of degree at most $\deg(\rpoly_{G}^\kElem)\le 2k$. Thus, we can write
By definition, $\rpoly_{G}^{\kElem}(\vct{X})$ sets every exponent $e > 1$ to $e = 1$, which means that $\degree(\rpoly_{G}^\kElem)\le \degree(\poly_G^\kElem)=2k$. Thus, if we think of $\prob$ as a variable, then $\rpoly_{G}^{\kElem}(\prob,\dots,\prob)$ is a univariate polynomial of degree at most $\degree(\rpoly_{G}^\kElem)\le 2k$. Thus, we can write
%thereby shrinking the degree a monomial product term in the SOP form of $\poly_{G}^{\kElem}(\vct{X})$ to the exact number of distinct variables the monomial contains. This implies that $\rpoly_{G}^\kElem$ is a polynomial of degree $2\kElem$ and hence $\rpoly_{G}^\kElem(\prob,\ldots, \prob)$ is a polynomial in $\prob$ of degree $2\kElem$. Then it is the case that
\begin{equation*}
\rpoly_{G}^{\kElem}(\prob,\ldots, \prob) = \sum_{i = 0}^{2\kElem} c_i \prob^i

View file

@ -4,43 +4,59 @@
\subsection{Polynomial Formulation and Equivalences}
Since we have shown that computing the expected multiplicity of a result tuple is equivalent to computing the expectation of a polynomial (for that tuple) given a probability distribution over all possible assignments of variables in the polynomial to $\{0,1\}$, we from now on focus on this problem exclusively.
Before proceeding, note that the following is assuming \bis (which subsume \tis as a special case). Thus, variables are independent of each other and each variable $X$ is associated with a probability $\vct{p}(X)$.
Let us use the expression $(x + y)^2$ for a running example in the following definitions.
Before proceeding, note that the following assumes \bis (which subsume \tis as a special case). Thus, variables are independent of each other and each variable $X$ is associated with a probability $\vct{p}(X) = \pd[X = 1]$.
Let us use the expression $(x + y)^2$ as a running example in this section.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Definition}[Monomial]\label{def:monomial}
A monomial is a product of a fixed set of variables, each raised to a non-negative integer power.
A monomial is a product of a set of variables, each raised to a non-negative integer power.
\end{Definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
For the term $2xy$, by ~\cref{def:monomial} the monomial is $xy$.
For instance, the term $2xy$ contains a single monomial $xy$. % \Cref{def:monomial} the monomial is $xy$.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Definition}[Standard Monomial Basis]\label{def:smb}
A polynomial is in standard monomial basis when it is fully expanded out such that no product of sums exist and where each unique monomial appears exactly once.
A polynomial is in standard monomial basis when it is of the form:
\[
\sum_{i=1}^n c_i \cdot m_i
\]
where each $c_i$ is a positive integer and each $m_i$ is a monomial and $m_i \neq m_j$ for $i \neq j$.
% fully expanded out such that no product of sums exist and where each unique monomial appears exactly once.
\end{Definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
The standard monomial basis for the running example is $x^2 +2xy + y^2$. While $x^2 + xy + xy + y^2$ is an expanded form of the expression, it is not the standard monomial basis since $xy$ appears more than once.
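For instance (a sketch using the \texttt{sympy} library, purely as a sanity check and not part of the formal development), expanding the running example mechanically recovers this basis.
\begin{verbatim}
import sympy as sp

x, y = sp.symbols('x y')
print(sp.expand((x + y)**2))   # x**2 + 2*x*y + y**2
\end{verbatim}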
Throughout this paper, we also make the following \textit{assumption}.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Assumption}\label{assump:poly-smb}
All polynomials considered are in standard monomial basis, i.e., $\poly(\vct{X}) = \sum\limits_{\vct{d} \in \mathbb{N}^\numvar}q_d \cdot \prod\limits_{i = 1, d_i \geq 1}^{\numvar}X_i^{d_i}$, where $q_d$ is the coefficient for the monomial encoded in $\vct{d}$ and $d_i$ is the $i^{th}$ element of $\vct{d}$.
\end{Assumption}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
While the definition of polynomial $\poly(\vct{X})$ over a $\bi$ input doesn't change, we introduce an alternative notation which will come in handy. Given $\ell$ blocks, we write $\poly(\vct{X})$ = $\poly(X_{\block_1, 1},\ldots, X_{\block_1, \abs{\block_1}},$ $\ldots, X_{\block_\ell, \abs{\block_\ell}})$, where $\abs{\block_i}$ denotes the size of $\block_i$, and $\block_{i, j}$ denotes tuple $j$ residing in block $i$ for $j$ in $[\abs{\block_i}]$.
The number of tuples in the $\bi$ instance can be (trivially) computed as $\numvar = \sum\limits_{i = 1}^{\ell}\abs{\block_i}$ .
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Definition}[Degree]\label{def:degree}
The degree of polynomial $\poly(\vct{X})$ is the maximum sum of the exponents of a monomial, over all monomials when $\poly(\vct{X})$ is in SOP form.
\end{Definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
The degree of the running example is $2$. In this paper we consider only finite degree polynomials.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Definition}[$\rpoly(\vct{X})$] \label{def:qtilde}
Define $\rpoly(X_1,\ldots, X_\numvar)$ as the reduced version of $\poly(X_1,\ldots, X_\numvar)$, of the form
$\rpoly(X_1,\ldots, X_\numvar) = $
\[\poly(X_1,\ldots, X_\numvar) \mod{X_1^2-X_1}\cdots\mod{X_\numvar^2 - X_\numvar}.\]
\end{Definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Example}\label{example:qtilde}
Consider when $\poly(x, y) = (x + y)(x + y)$. Then the expanded derivation for $\rpoly(x, y)$ is
\begin{align*}
@ -49,12 +65,14 @@ Consider when $\poly(x, y) = (x + y)(x + y)$. Then the expanded derivation for
= ~& x + 2xy + y
\end{align*}
\end{Example}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Intuitively, $\rpoly(\textbf{X})$ is the SOP form of $\poly(\textbf{X})$ such that if any $X_j$ term has an exponent $e > 1$, it is reduced to $1$, i.e. $X_j^e\mapsto X_j$ for any $e > 1$.
Alternatively, one can gain intuition for $\rpoly$ by thinking of it as the SOP that results from $\poly(\vct{X})$ under an idempotent product operator.
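The reduction itself is mechanical to carry out; the sketch below (again via \texttt{sympy}; the helper name \texttt{reduce\_poly} is ours) clamps every exponent above one down to one in the expanded form and reproduces~\Cref{example:qtilde}.
\begin{verbatim}
import sympy as sp

def reduce_poly(poly):
    # Mod out X_i^2 - X_i for every variable: clamp each exponent e > 1 to 1.
    expanded = sp.expand(poly)
    return expanded.replace(lambda t: t.is_Pow and t.exp > 1,
                            lambda t: t.base)

x, y = sp.symbols('x y')
print(reduce_poly((x + y)**2))   # x + 2*x*y + y
\end{verbatim}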
When considering $\bi$ input, it becomes necessary to redefine $\rpoly(\vct{X})$.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Definition}[$\rpoly$ $\bi$ Redefinition]
A polynomial $\poly(\vct{X})$ over a $\bi$ instance is reduced to $\rpoly(\vct{X})$ by the following criteria. First, all exponents $e > 1$ are reduced to $e = 1$. Second, all monomials containing more than one variable from the same block $\block$ are dropped. Formally, this is expressed as
@ -63,36 +81,44 @@ A polynomial $\poly(\vct{X})$ over a $\bi$ instance is reduced to $\rpoly(\vct{X
\end{equation*}
for all $i$ in $[\numvar]$ and for all $s$ in $[\ell]$, such that for all $t, u$ in $[\abs{\block_s}]$, $t \neq u$.
\end{Definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
The usefulness of this reduction will be seen in ~\cref{lem:exp-poly-rpoly}.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Lemma}\label{lem:pre-poly-rpoly}
When $\poly(X_1,\ldots, X_\numvar) = \sum\limits_{\vct{d} \in \{0,\ldots, B\}^\numvar}q_{\vct{d}} \cdot \prod\limits_{\substack{i = 1\\s.t. d_i\geq 1}}^{\numvar}X_i^{d_i}$, we have then that $\rpoly(X_1,\ldots, X_\numvar) = \sum\limits_{\vct{d} \in \{0,\ldots, B\}^\numvar} q_{\vct{d}}\cdot\prod\limits_{\substack{i = 1\\s.t. d_i\geq 1}}^{\numvar}X_i$.
\end{Lemma}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{proof}
Follows by the construction of $\rpoly$ in \cref{def:qtilde}.
Follows by the construction of $\rpoly$ in \cref{def:qtilde}. \qed
\end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\qed
Note the following fact:
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Proposition}\label{proposition:q-qtilde}
\[\text{For all } (X_1,\ldots, X_\numvar) \in \{0, 1\}^\numvar, \poly(X_1,\ldots, X_\numvar) = \rpoly(X_1,\ldots, X_\numvar).\]
\end{Proposition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{proof}[Proof for Proposition ~\ref{proposition:q-qtilde}]
Note that any $\poly$ in factorized form is equivalent to its sum of product expansion. For each term in the expanded form, further note that for all $b \in \{0, 1\}$ and all $e \geq 1$, $b^e = b$.
Note that any $\poly$ in factorized form is equivalent to its sum of product expansion. For each term in the expanded form, further note that for all $b \in \{0, 1\}$ and all $e \geq 1$, $b^e = b$. \qed
\end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\qed
Define all variables $X_i$ in $\poly$ to be independent.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Lemma}\label{lem:exp-poly-rpoly}
The expectation of $\poly(\vct{X})$ over possible worlds is equal to $\rpoly(\prob_1,\ldots, \prob_\numvar)$.
\begin{equation*}
\expct_{\vct{w}}\pbox{\poly(\vct{w})} = \rpoly(\prob_1,\ldots, \prob_\numvar).
\end{equation*}
\end{Lemma}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Note that in the preceding lemma, we have assigned $\vct{p}$ (introduced in ~\cref{subsec:def-data}) to the variables $\vct{X}$.
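As a quick numerical check of the lemma on the running example (a sketch; the probabilities $0.3$ and $0.6$ and the trial count are arbitrary choices for illustration):
\begin{verbatim}
import random

def q(x, y):                  # running example Q(x, y) = (x + y)^2
    return (x + y) ** 2

def q_tilde(p1, p2):          # reduced form x + 2xy + y, evaluated at the probabilities
    return p1 + 2 * p1 * p2 + p2

p1, p2, trials = 0.3, 0.6, 200_000
avg = sum(q(random.random() < p1, random.random() < p2) for _ in range(trials)) / trials
print(avg, q_tilde(p1, p2))   # both close to 0.3 + 2*0.18 + 0.6 = 1.26
\end{verbatim}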
@ -127,11 +153,19 @@ Finally, observe \cref{p1-s5} by construction in \cref{lem:pre-poly-rpoly}, that
\qed
\end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Corollary}\label{cor:expct-sop}
If $\poly$ is given as a sum of monomials, the expectation of $\poly$, i.e., $\expct\pbox{\poly} = \rpoly\left(\prob_1,\ldots, \prob_\numvar\right)$, can be computed in $O(|\poly|)$ time, where $|\poly|$ denotes the total number of multiplication/addition operators.
\end{Corollary}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{proof}[Proof For Corollary ~\ref{cor:expct-sop}]
Note that \cref{lem:exp-poly-rpoly} shows that $\expct\pbox{\poly} =$ $\rpoly(\prob_1,\ldots, \prob_\numvar)$. Therefore, if $\poly$ is already in sum of products form, one only needs to compute $\poly(\prob_1,\ldots, \prob_\numvar)$ ignoring exponent terms (note that such a polynomial is $\rpoly(\prob_1,\ldots, \prob_\numvar)$), which indeed takes $O(|\poly|)$ computations.\qed
\end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
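For the running example $\poly = x^2 + 2xy + y^2$, this amounts to evaluating $\prob_1 + 2\prob_1\prob_2 + \prob_2$, i.e., one constant-time operation per operator of $\poly$.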
%%% Local Variables:
%%% mode: latex
%%% TeX-master: "main"
%%% End: