Merge branch 'master' of gitlab.odin.cse.buffalo.edu:ahuber/SketchingWorlds

This commit is contained in:
Oliver Kennedy 2020-12-13 17:46:33 -05:00
commit ff71aa617e
Signed by: okennedy
GPG key ID: 3E5F9B3ABD3FDB60
6 changed files with 304 additions and 88 deletions

View file

@ -1,5 +1,6 @@
%root: main.tex
\section{$1 \pm \epsilon$ Approximation Algorithm}
\label{sec:algo}
Since it is the case that computing the expected multiplicity of a compressed representation of a bag polynomial is hard, it is then desirable to have an algorithm to approximate the multiplicity in linear time, which is what we describe next.
First, let us introduce some useful definitions and notation. For illustrative purposes in the definitions below, let us consider when $\poly(\vct{X}) = 2x^2 + 3xy - 2y^2$.
@ -30,10 +31,10 @@ Denote $poly(\etree)$ to be the function that takes as input expression tree $\e
\end{equation*}
\end{Definition}
Note that addition and multiplication above follow the standard interpretation over polynomials.
%Specifically, when adding two monomials whose variables and respective exponents agree, the coefficients corresponding to the monomials are added and their sum is multiplied to the monomial. Multiplication here is denoted by concatenation of the monomial and coefficient. When two monomials are multiplied, the product of each corresponding coefficient is computed, and the variables in each monomial are multiplied, i.e., the exponents of like variables are added. Again we notate this by the direct product of coefficient product and all distinct variables in the two monomials, with newly computed exponents.
\begin{Definition}[Expression Tree Set]\label{def:express-tree-set}$\etreeset{\smb}$ is the set of all possible expression trees $\etree$, such that $poly(\etree) = \poly(\vct{X})$.
\end{Definition}
For our running example, $\etreeset{\smb} = \{2x^2 + 3xy - 2y^2, (x + 2y)(2x - y), x(2x - y) + 2y(2x - y), 2x(x + 2y) - y(x + 2y)\}$. Note that \cref{def:express-tree-set} implies that $\etree \in \etreeset{poly(\etree)}$.
@ -59,7 +60,7 @@ $\expandtree{\etree}$ is the pure sum of products expansion of $\etree$. The lo
\elist{(\{\etree.\val\}, 1)} &\textbf{ if }\etree.\type = \var.\\
\end{cases}
\end{align*}
%where that the multiplication of two tuples %is the standard multiplication over monomials and the standard multiplication over coefficients to produce the product tuple, as in
%is their direct product $(\monom_1, \coef_1) \cdot (\monom_2, \coef_2) = (\monom_1 \cdot \monom_2, \coef_1 \times \coef_2)$ such that monomials $\monom_1$ and $\monom_2$ are concatenated in a product operation, while the standard product operation over reals applies to $\coef_1 \times \coef_2$. The product of $\expandtree{\etree_\lchild} \cdot \expandtree{\etree'_\rchild}$ is then the cross product of the multiplication of all such tuples returned to both $\expandtree{\etree_\lchild}$ and $\expandtree{\etree_\rchild}$. %The operator $\otimes$ is defined as the cross-product tuple multiplication of all such tuples returned by both $\expandtree{\etree_\lchild}$ and $\expandtree{\etree_\rchild}$.
@ -117,7 +118,7 @@ Given an expression tree $\etree$ and $\vct{v} \in \mathbb{R}^\numvar$, $\etree(
\end{Definition}
\begin{Definition}[Probability $\gamma$]
Define $\gamma$ to be the probability that a monomial with variables from the same block $\block$ is sampled.
\end{Definition}
Algorithm ~\ref{alg:est-gamma} estimates $\gamma$.
@ -167,8 +168,8 @@ Recall that the notation $[x, y]$ denotes the range of values between $x$ and $y
%\end{algorithm}
%BIDB Version of Approximation Algorithm
\begin{algorithm}[H]
\caption{$\approxq_{\biabb}$($\etree$, $\vct{p}$, $\conf$, $\error$, $\abs{\block}$)}
\label{alg:mon-sam}
@ -216,8 +217,8 @@ Recall that the notation $[x, y]$ denotes the range of values between $x$ and $y
\State $\accum \gets \accum + \vari{Y}_\vari{i}$\Comment{Store the sum over all samples}\label{alg:mon-sam-add}
\State $\vari{i} \gets \vari{i} + 1$
\EndIf
\EndWhile
\State $\gamma \gets $ $\algname{Estimate}$ $\gamma(\etree, \numsamp, \abs{\block})$
\State $\vari{acc} \gets \vari{acc} \times \frac{\vari{size}}{\numsamp \cdot (1 - \gamma)}$\label{alg:mon-sam-global3}
\State \Return \vari{acc}
@ -235,7 +236,7 @@ Recall that the notation $[x, y]$ denotes the range of values between $x$ and $y
\State $\vari{cTerms} \gets 0$
\State $\vari{isCross} \gets 0$
\For{$\vari{i} \text{ in } 1 \text{ to } \numsamp$}
\State $\bivec \gets [0]^{\abs{\block}}$
\State $(\vari{M}, \vari{sgn}) \gets $ \sampmon($\etree_\vari{mod}$)
\For{$\vari{x}_{\vari{b}, \vari{j}} \text{ in } \vari{M}$}
@ -249,9 +250,9 @@ Recall that the notation $[x, y]$ denotes the range of values between $x$ and $y
\If{$\vari{isCross} = 1$}
\State $\vari{cTerms} \gets \vari{cTerms} + 1$
\State $\vari{isCross} \gets 0$
\EndIf
\EndFor
\State \Return $\frac{\vari{cTerms}}{\numsamp}$
\end{algorithmic}
\end{algorithm}
@ -260,29 +261,29 @@ Recall that the notation $[x, y]$ denotes the range of values between $x$ and $y
We state the lemmas for $\onepass$ and \newline$\sampmon$, the auxiliary algorithms on which ~\cref{alg:mon-sam} relies. Their proofs are subsequent.
\begin{Lemma}\label{lem:one-pass}
The $\onepass$ function completes in $O(size(\etree))$ time. After $\onepass$ returns the following post conditions hold. First, that $\abs{\vari{S}}(1,\ldots, 1)$ is correctly computed for each subtree $\vari{S}$ of $\etree$. Second, when $\vari{S}.\val = +$, the weighted distribution $\frac{\abs{\vari{S}_{\vari{child}}}(1,\ldots, 1)}{\abs{\vari{S}}(1,\ldots, 1)}$ is correctly computed for each child of $\vari{S}.$
\end{Lemma}
At the conclusion of $\onepass$, $\etree.\vari{partial}$ will hold the sum of all coefficients in $\expandtree{\abs{\etree}}$, i.e., $\sum\limits_{(\monom, \coef) \in \expandtree{\abs{\etree}}}\coef$. $\etree.\vari{weight}$ will hold the weighted probability that $\etree$ is sampled from its parent $+$ node.
\begin{Lemma}\label{lem:sample}
The function $\sampmon$ completes in $O(\log{k} \cdot k \cdot depth(\etree))$ time, where $k = \degree(poly(\abs{\etree}))$. Upon completion, with probability $\frac{|\coef|}{\abs{\etree}(1,\ldots, 1)}$, $\sampmon$ returns the sampled term $\left(\monom, sign(\coef)\right)$ from $\expandtree{\abs{\etree}}$.
\end{Lemma}
\begin{Theorem}\label{lem:mon-samp}
If the contracts for $\onepass$ and $\sampmon$ hold, then for any $\etree$ with $\degree(poly(|\etree|)) = k$, algorithm \ref{alg:mon-sam} outputs an estimate $\empmean$ of $\rpoly(\prob_1,\ldots, \prob_\numvar)$ such that $\expct\pbox{\empmean} = \frac{\rpoly(\prob_1,\ldots, \prob_\numvar)\cdot(1 - \gamma)}{\abs{\etree}(1,\ldots, 1)}$. %within an additive $\error \cdot \abs{\etree}(1,\ldots, 1)$ error with
$\empmean$ has bounds $P\left(\left|\empmean - \expct\pbox{\empmean}\right|\geq \error \cdot \frac{\abs{\etree}(1,\ldots, 1)}{1 - \gamma}\right) \leq \conf$, in $O\left(\treesize(\etree)\right.$ $+$ $\left.\left(\frac{\log{\frac{1}{\conf}}}{\error^2} \cdot k \cdot\log{k} \cdot depth(\etree)\right)\right)$ time.
\end{Theorem}
\begin{proof}[Proof of Theorem \ref{lem:mon-samp}]
As previously noted, by lines ~\ref{alg:mon-sam-check} and ~\ref{alg:mon-sam-drop} the algorithm will resample when it encounters a sample with variables from the same block. The probability of sampling such a monomial is $\gamma$.
Now, consider $\expandtree{\etree}$ and let $(\monom, \coef)$ be an arbitrary tuple in $\expandtree{\etree}$. For convenience, over an alphabet $\Sigma$ of size $\numvar$, define
\begin{equation*}
\evalmp: \left(\left\{\monom^a~|~\monom \in \Sigma^b, a \in \mathbb{N}, b \in [k]\right\}, [0, 1]^\numvar\right)\mapsto \mathbb{R},
\end{equation*}
a function that takes a monomial $\monom$ in $\left\{\monom^a ~|~ \monom \in \Sigma^b, a \in \mathbb{N}, b \in [k]\right\}$ and probability vector $\vct{p}$ (introduced in ~\cref{subsec:def-data}) as input and outputs the evaluation of $\monom$ over $\vct{p}$. By ~\cref{lem:sample}, the sampling scheme samples $(\monom, \coef)$ in $\expandtree{\etree}$ with probability $\frac{|\coef|}{\abs{\etree}(1,\ldots, 1)}$. Note that $\coef \cdot \evalmp(\monom, \vct{p})$ is the value of $(\monom, \coef)$ in $\expandtree{\etree}$ when all variables in $\monom$ are assigned their corresponding probabilities.
Let $Vars(\monom) = \{X_{\block, i} \st X_{\block, i} \in \monom\}$. Define the set of elements containing no cross-terms in $\expandtree{\etree}$ as $\expandtree{\etree}' = \{(\monom, \coef) \st \forall (\monom, \coef) \in \expandtree{\etree}, \forall X_{\block, i}, X_{\block', j} \in Vars(\monom), \block \neq \block'\}$.
@ -290,7 +291,7 @@ Note again that the sum of $\coef \cdot \evalmp(\monom, \vct{p})$ over $\expandt
Consider now a set of $\samplesize$ random variables $\vct{\randvar}$, where each $\randvar_i$ is distributed as described above. Then for random variable $\randvar_i$, it is the case that
$\expct\pbox{\randvar_i} = \sum\limits_{(\monom, \coef) \in \expandtree{\etree}' }\frac{\coef \cdot \evalmp(\monom, p)}{\sum\limits_{(\monom, \coef) \in \expandtree{\etree}'}|\coef|} = \frac{\rpoly(\prob_1,\ldots, \prob_\numvar)}{\abs{\etree}(1,\ldots, 1)\cdot \frac{1}{1 - \gamma}} = \frac{\rpoly(\prob_1,\ldots, \prob_\numvar)\cdot (1 - \gamma)}{\abs{\etree}(1,\ldots, 1)}$. Let $\empmean = \frac{1}{\samplesize}\sum_{i = 1}^{\samplesize}\randvar_i$. It is also true that
\begin{align*}
&\expct\pbox{\empmean} = \expct\pbox{ \frac{1}{\samplesize}\sum_{i = 1}^{\samplesize}\randvar_i} = \frac{1}{\samplesize}\sum_{i = 1}^{\samplesize}\expct\pbox{\randvar_i}\nonumber\\
@ -305,7 +306,7 @@ P\left(\left|\empmean - \expct\pbox{\empmean}\right| \geq \error\right) \leq 2\e
As implied above, Hoeffding assumes that the sum of random variables is divided by the number of variables. Since $\rpoly(\prob_1,\ldots, \prob_\numvar)\cdot(1 - \gamma) = \expct\pbox{\empmean} \cdot \abs{\etree}(1,\ldots, 1)$, then our estimate is the sum of random samples multiplied by $\frac{\abs{\etree}(1,\ldots, 1)}{\samplesize \cdot (1 - \gamma)}$. This computation is performed on ~\cref{alg:mon-sam-global3}.
%Also see that to properly estimate $\rpoly$, it is necessary to multiply by the number of monomials in $\rpoly$, i.e. $\abs{\etree}(1,\ldots, 1)$. Therefore it is the case that $\frac{acc}{N}$ gives the estimate of one monomial, and multiplying by $\abs{\etree}(1,\ldots, 1)$ yields the estimate of $\rpoly(\prob_1,\ldots, \prob_\numvar)$. This scaling is performed in line ~\ref{alg:mon-sam-global3}.
Line ~\ref{alg:mon-sam-sample} shows that $\vari{sgn}_\vari{i}$ has a value in $\{-1, 1\}$ that is multiplied with at most $\degree(\polyf(\abs{\etree}))$ factors from $\vct{p}$ (\cref{alg:mon-sam-product2}) such that each $p_i$ is in $[0, 1]$, the range for each $\randvar_i$ ($\vari{Y}_\vari{i}$ in the pseudocode) is then strictly bounded by $[-1, 1]$. Bounding Hoeffding's results by $\conf$ ensures confidence no less than $1 - \conf$. Then by upperbounding Hoeffding with $\frac{\conf}{2}$ (since we take an additional estimate of $\gamma$), it is the case that
\begin{equation*}
P\pbox{~\left| \empmean - \expct\pbox{\empmean} ~\right| \geq \error} \leq 2\exp{\left(-\frac{2\samplesize^2\error^2}{2^2 \samplesize}\right)} \leq \frac{\conf}{2}.
\end{equation*}
@ -320,16 +321,16 @@ Solving for the number of samples $\samplesize$ we get
&\frac{2\log{\frac{4}{\conf}}}{\error^2} \leq \samplesize.\label{eq:hoeff-6}
\end{align}
By Hoeffding we obtain the number of samples necessary to achieve the claimed additive error bounds.
This concludes the proof for the first claim of theorem ~\ref{lem:mon-samp}.
\paragraph{Run-time Analysis}
%For a $\bi$ instance, it is possible that cancellations can occur as seen in ~\cref{alg:mon-sam-drop}, and by ~\cref{alg:mon-sam-resamp} the algorithm will then re-sample. This affects the overall runtime. Let us denote by $\gamma$ the number of cancellations.
Note that lines ~\ref{alg:mon-sam-global1}, ~\ref{alg:mon-sam-global2}, and ~\ref{alg:mon-sam-global3} are $O(1)$ global operations. The call to $\onepass$ in line ~\ref{alg:mon-sam-onepass} by lemma ~\ref{lem:one-pass} is $O(\treesize(\etree))$ time.
%First, algorithm ~\ref{alg:mon-sam} calls \textsc{OnePass} which takes $O(|\etree|)$ time.
Then for $\numsamp = \ceil{\frac{2 \log{\frac{4}{\conf}}}{\error^2}}$, the $O(1)$ assignment, product, and addition operations occur. Over the same $\numsamp$ iterations, $\sampmon$ is called, with a runtime of $O(\log{k}\cdot k \cdot depth(\etree))$ by lemma ~\ref{lem:sample}. Finally, over the same iterations, because $\degree(\polyf(\abs{\etree})) = k$, the assignment and product operations of line ~\ref{alg:mon-sam-product2} are called at most $k$ times.
Thus we have $O(\treesize(\etree)) + O(\left(\frac{\log{\frac{1}{\conf}}}{\error^2}\right) \cdot \left(k + \log{k}\cdot k \cdot depth(\etree)\right) = O\left(\treesize(\etree) + \left(\left(\frac{\log{\frac{1}{\conf}}}{\error^2}\right) \cdot \left(k \cdot\log{k} \cdot depth(\etree)\right)\right)\right)$ overall running time.
\end{proof}
@ -343,9 +344,9 @@ Thus we have $O(\treesize(\etree)) + O(\left(\frac{\log{\frac{1}{\conf}}}{\error
Setting $\error = \error \cdot \frac{\rpoly(\prob_1,\ldots, \prob_\numvar)\cdot (1 - \gamma)}{\abs{\etree}(1,\ldots, 1)}$ achieves $1 \pm \epsilon$ multiplicative error bounds, in $O\left(\treesize(\etree) + \frac{\log{\frac{1}{\conf}}\cdot \abs{\etree}^2(1,\ldots, 1)}{\error^2\cdot\rpoly^2(\prob_1,\ldots, \prob_\numvar)(1 - \gamma)^2}\right)$.
%\end{Corollary}
Since it is the case that we have $\error \cdot \abs{\etree}(1,\ldots, 1)$ additive error, one can set $\error = \error \cdot \frac{\rpoly(\prob_1,\ldots, \prob_\numvar)\cdot (1 - \gamma)}{\abs{\etree}(1,\ldots, 1)}$, yielding a multiplicative error proportional to $\rpoly(\prob_1,\ldots, \prob_\numvar)$. This only affects the runtime in the number of samples taken, changing the first factor of the second summand of the original runtime accordingly.
The derivation over the number of samples is then
\begin{align*}
&\frac{2\log{\frac{4}{\conf}}}{\error^2 \left(\frac{\rpoly(\prob_1,\ldots, \prob_N)\cdot (1 - \gamma)}{\abs{\etree}(1,\ldots, 1)}\right)^2}\\
= &\frac{2\log{\frac{4}{\conf}}\cdot \abs{\etree}^2(1,\ldots, 1)}{\error^2 \cdot \rpoly^2(\prob_1,\ldots, \prob_\numvar)\cdot (1 - \gamma)^2},
@ -370,20 +371,20 @@ The evaluation of $\abs{\etree}(1,\ldots, 1)$ can be defined recursively, as fol
|\etree.\val| &\textbf{if }\etree.\type = \tnum\\
1 &\textbf{if }\etree.\type = \var.
\end{cases}
\end{align*}
%\begin{align*}
%&\eval{\etree ~|~ \etree.\type = +}_{\abs{\etree}} =&& \eval{\etree_\lchild}_{\abs{\etree}} + \eval{\etree_\rchild}_{\abs{\etree}}\\
%&\eval{\etree ~|~ \etree.\type = \times}_{\abs{\etree}} = && \eval{\etree_\lchild}_{\abs{\etree}} \cdot \eval{\etree_\rchild}_{\abs{\etree}}\\
%&\eval{\etree ~|~ \etree.\type = \tnum}_{\abs{\etree}} = && \etree.\val\\
%&\eval{\etree ~|~ \etree.\val = \var}_{\abs{\etree}} = && 1
%\end{align*}
In the same fashion the weighted distribution can be described as above with the following modification for the case when $\etree.\type = +$:
\begin{align*}
&\abs{\etree_\lchild}(1,\ldots, 1) + \abs{\etree_\rchild}(1,\ldots, 1); &\textbf{if }\etree.\type = + \\
&\etree_\lchild.\vari{weight} \gets \frac{\abs{\etree_\lchild}(1,\ldots, 1)}{\abs{\etree_\lchild}(1,\ldots, 1) + \abs{\etree_\rchild}(1,\ldots, 1)};\\
&\etree_\rchild.\vari{weight} \gets \frac{\abs{\etree_\rchild}(1,\ldots, 1)}{\abs{\etree_\lchild}(1,\ldots, 1)+ \abs{\etree_\rchild}(1,\ldots, 1)}
\end{align*}
%\begin{align*}
@ -445,26 +446,26 @@ level 2/.style={sibling distance=0.7cm},
%sibling distance = 1cm,
%every node/.append style = {anchor=center}
]
\Tree [.\node(root){$\boldsymbol{+}$};
\edge [wght_color] node[midway, auto= right, font=\bfseries, gray] {$\bsym{\frac{4}{5}}$}; [.\node[highlight_color](tl){$\boldsymbol{\times}$};
[.\node(s){$\bsym{+}$};
\edge[wght_color] node[pos=0.35, left, font=\bfseries, gray]{$\bsym{\frac{1}{2}}$}; [.\node[highlight_color](sl){$\bsym{x_1}$}; ]
\edge[wght_color] node[pos=0.35, right, font=\bfseries, gray]{$\bsym{\frac{1}{2}}$}; [.\node[highlight_color](sr){$\bsym{x_2}$}; ]
]
[.\node(sp){$\bsym{+}$};
\edge[wght_color] node[pos=0.35, left, font=\bfseries, gray]{$\bsym{\frac{1}{2}}$}; [.\node[highlight_color](spl){$\bsym{x_1}$}; ]
\edge[wght_color] node[pos=0.35, right, font=\bfseries, gray]{$\bsym{\frac{1}{2}}$}; [.\node[highlight_color](spr){$\bsym{\times}$};
[.$\bsym{-1}$ ] [.$\bsym{x_2}$ ]
]
]
]
]
\edge [wght_color] node[midway, auto=left, font=\bfseries, gray] {$\bsym{\frac{1}{5}}$}; [.\node[highlight_color](tr){$\boldsymbol{\times}$};
[.$\bsym{x_2}$
\edge [draw=none]; [.\node[draw=none]{}; ]
\edge [draw=none]; [.\node[draw=none]{}; ]
]
[.$\bsym{x_2}$ ] ]
\edge [draw=none]; [.\node[draw=none]{}; ]
]
[.$\bsym{x_2}$ ] ]
]
% labels for plus node children, with arrows
\node[left=2pt of sl, highlight_color, inner sep=0pt] (sl-label) {$\stree_\lchild$};
@ -493,10 +494,10 @@ level 2/.style={sibling distance=0.7cm},
% \begin{tikzpicture}[thick, level distance=1.2cm, level 1/.style={sibling distance= 5cm}, level 2/.style={sibling distance=3cm}, level 3/.style={sibling distance=1.5cm}, level 4/.style={sibling distance= 1cm}, every child/.style={black}]
% \node[tree_node](root) {$\boldsymbol{+}$}
% child[red]{node[tree_node](tl) {$\boldsymbol{\times}$}
% child{node[tree_node] {$\boldsymbol{+}$}
% child{node[tree_node]{$\boldsymbol{x_1}$} }
% child{node[tree_node] {$\boldsymbol{x_2}$}}
% }
% }
% child{node[tree_node] {$\boldsymbol{+}$}
% child{node[tree_node] {$\boldsymbol{x_1}$}}
% %child[missing]{node[tree_node] {$\boldsymbol{1}$}}
@ -504,7 +505,7 @@ level 2/.style={sibling distance=0.7cm},
% child{node[tree_node] {$\boldsymbol{-1}$}}
% child{node[tree_node] {$\boldsymbol{x_2}$}}
% }
% }
% }
% }
% child{node[tree_node] {$\boldsymbol{\times}$} edge from parent [red]
% child{node[tree_node] {$\boldsymbol{x_2}$}}
@ -524,13 +525,13 @@ We prove the first part of lemma ~\ref{lem:one-pass}, i.e., correctness, by stru
For the base case, $d = 0$, it is the case that the node is a leaf and therefore by definition ~\ref{def:express-tree} must be a variable or coefficient. When it is a variable, \textsc{OnePass} returns $1$, and we have in this case that $\polyf(\etree) = X_i = \polyf(\abs{\etree})$ for some $i$ in $[\numvar]$, and this evaluated at all $1$'s indeed gives $1$, verifying the correctness of the returned value of $\abs{\etree}(1,\ldots, 1) = 1$. When the root is a coefficient, the absolute value of the coefficient is returned, which is indeed $\abs{\etree}(1,\ldots, 1)$. This proves the base case.
Let the inductive hypothesis be the assumption that for $d \leq k$ for $k \geq 1$, lemma ~\ref{lem:one-pass} is true for algorithm ~\ref{alg:one-pass}.
Now prove that lemma ~\ref{lem:one-pass} holds for $k + 1$. Notice that $\etree$ has at most two children, $\etree_\lchild$ and $\etree_\rchild$. Note also, that for each child, it is the case that $d \leq k$. Then, by inductive hypothesis, lemma ~\ref{lem:one-pass} holds for each existing child, and we are left with two possibilities for $\etree$. The first case is when $\etree$ is a $+$ node. When this happens, algorithm ~\ref{alg:one-pass} computes $|T_\lchild|(1,\ldots, 1) + |T_\rchild|(1,\ldots, 1)$ on line ~\ref{alg:one-pass-plus-add} which by definition is $\abs{\etree}(1,\ldots, 1)$ and hence the inductive hypothesis holds in this case. For the weight computation of the children of $+$, by lines ~\ref{alg:one-pass-plus-add}, ~\ref{alg:one-pass-plus-assign2}, and ~\ref{alg:one-pass-plus-prob} algorithm ~\ref{alg:one-pass} computes $\etree_i.\wght = \frac{|T_i|(1,\ldots, 1)}{|T_\lchild|(1,\ldots, 1) + |T_\rchild|(1,\ldots, 1)}$ which is indeed as claimed. The second case is when the $\etree.\val = \times$. By inductive hypothesis, it is the case that both $\abs{\etree_\lchild}\polyinput{1}{1}$ and $\abs{\etree_\rchild}\polyinput{1}{1}$ have been correctly computed. On line~\ref{alg:one-pass-times-product} algorithm ~\ref{alg:one-pass} then computes the product of the subtree partial values, $|T_\lchild|(1,\ldots, 1) \times |T_\rchild|(1,\ldots, 1)$ which by definition is $\abs{\etree}(1,\ldots, 1)$.
%That $\onepass$ makes exactly one traversal of $\etree$ follows by noting for lines ~\ref{alg:one-pass-equality1} and ~\ref{alg:one-pass-equality2} are the checks for the non-base cases, where in each matching exactly one recursive call is made on each of $\etree.\vari{children}$. For the base cases, lines ~\ref{alg:one-pass-equality3} and ~\ref{alg:one-pass-equality4} both return values without making any further recursive calls. Since all nodes are covered by the cases, and the base cases cover only leaf nodes, it follows that algorithm ~\ref{alg:one-pass} then terminates after it visits every node exactly one time.
%To conclude, note that when $\etree.\type = +$, the computation of $\etree_\lchild.\wght$ and $\etree_\rchild.\wght$ are solely dependent on the correctness of $\abs{\etree}\polyinput{1}{1}$, $\abs{\etree_\lchild}\polyinput{1}{1}$, and $\abs{\etree_\rchild}\polyinput{1}{1}$, which have already been argued to be correct.
\paragraph{Run-time Analysis}
The runtime for \textsc{OnePass} is fairly straightforward. Note that lines ~\ref{alg:one-pass-equality1}, ~\ref{alg:one-pass-equality2}, and ~\ref{alg:one-pass-equality3} give a constant number of equality checks per node. Then, for $+$ nodes, lines ~\ref{alg:one-pass-plus-add} and ~\ref{alg:one-pass-plus-prob} perform a constant number of arithmetic operations, while ~\ref{alg:one-pass-plus-assign1} ~\ref{alg:one-pass-plus-assign2}, and ~\ref{alg:one-pass-plus-assign3} all have $O(1)$ assignments. Similarly, when a $\times$ node is visited, lines \ref{alg:one-pass-times-assign1}, \ref{alg:one-pass-times-assign2}, and \ref{alg:one-pass-times-assign3} have $O(1)$ assignments, while line ~\ref{alg:one-pass-times-product} has $O(1)$ product operations per node. For leaf nodes, ~\cref{alg:one-pass-leaf-assign1} and ~\cref{alg:one-pass-global-assign} are both $O(1)$ assignments.
@ -574,7 +575,7 @@ See algorithm ~\ref{alg:sample} for the details of $\sampmon$ algorithm.
\State $\Return ~(\vari{v}, \vari{s})$
\ElsIf{$\etree.\type = \times$}\Comment{Multiply the sampled values of all subtree children}
\State $\vari{sgn} \gets 1$\label{alg:sample-global2}
\For {$child$ in $\etree.\vari{children}$}
\State $(\vari{v}, \vari{s}) \gets \sampmon(child)$
\State $\vari{vars} \gets \vari{vars} \cup \{\vari{v}\}$\label{alg:sample-times-union}
\State $\vari{sgn} \gets \vari{sgn} \times \vari{s}$\label{alg:sample-times-product}
@ -599,21 +600,21 @@ For the base case, let the depth $d$ of $\etree$ be $0$. We have that the root
By inductive hypothesis, assume that for $d \leq k$ with $k \geq 1$, it is indeed the case that $\sampmon$ returns a monomial.
For the inductive step, let us take a tree $\etree$ with $d = k + 1$. Note that each child has depth $d \leq k$, and by inductive hypothesis both of them return a valid monomial. Then the root can be either a $+$ or $\times$ node. For the case of a $+$ root node, line~\ref{alg:sample-plus-bsamp} of $\sampmon$ will choose one of the children of the root. Since by inductive hypothesis it is the case that a monomial is being returned from either child, and only one of these monomials is selected, we have for the case of a $+$ root node that a valid monomial is returned by $\sampmon$. When the root is a $\times$ node, lines~\ref{alg:sample-times-union} and~\ref{alg:sample-times-product} multiply the monomials returned by the two children of the root, and by Definition~\ref{def:monomial} the product of two monomials is also a monomial, which means that $\sampmon$ returns a valid monomial for the $\times$ root node, thus concluding the fact that $\sampmon$ indeed returns a monomial.
For the inductive step, let us take a tree $\etree$ with $d = k + 1$. Note that each child has depth $d \leq k$, and by inductive hypothesis both of them return a valid monomial. Then the root can be either a $+$ or $\times$ node. For the case of a $+$ root node, line~\ref{alg:sample-plus-bsamp} of $\sampmon$ will choose one of the children of the root. Since by inductive hypothesis it is the case that a monomial is being returned from either child, and only one of these monomials is selected, we have for the case of a $+$ root node that a valid monomial is returned by $\sampmon$. When the root is a $\times$ node, lines~\ref{alg:sample-times-union} and~\ref{alg:sample-times-product} multiply the monomials returned by the two children of the root, and by Definition~\ref{def:monomial} the product of two monomials is also a monomial, which means that $\sampmon$ returns a valid monomial for the $\times$ root node, thus concluding the fact that $\sampmon$ indeed returns a monomial.
%Note that for any monomial sampled by algorithm ~\ref{alg:sample}, the nodes traversed form a subgraph of $\etree$ that is \textit{not} a subtree in the general case. We thus seek to prove that the subgraph traversed produces the correct probability corresponding to the monomial sampled.
We seek to prove by induction on the depth $d$ of $\etree$ that the subgraph traversed by $\sampmon$ has a probability that is in accordance with the monomial sampled, $\frac{|\coef|}{\abs{\etree}\polyinput{1}{1}}$.
We seek to prove by induction on the depth $d$ of $\etree$ that the subgraph traversed by $\sampmon$ has a probability that is in accordance with the monomial sampled, $\frac{|\coef|}{\abs{\etree}\polyinput{1}{1}}$.
For the base case $d = 0$, by Definition~\ref{def:express-tree} we know that the root has to be either a coefficient or a variable. In either case, the probability of the value returned is $1$ since there is only one value to sample from. When the root is a variable $x$, the algorithm correctly returns $(\{x\}, 1)$. When the root is a coefficient, \sampmon~correctly returns $(\{~\}, \text{sign}(\coef_i))$.
For the base case $d = 0$, by Definition~\ref{def:express-tree} we know that the root has to be either a coefficient or a variable. In either case, the probability of the value returned is $1$ since there is only one value to sample from. When the root is a variable $x$, the algorithm correctly returns $(\{x\}, 1)$. When the root is a coefficient, \sampmon~correctly returns $(\{~\}, \text{sign}(\coef_i))$.
For the inductive hypothesis, assume that for $d \leq k$ and $k \geq 1$ $\sampmon$ indeed samples $\monom$ in $(\monom, \coef)$ in $\expandtree{\etree}$ with probability $\frac{|\coef|}{\abs{\etree}\polyinput{1}{1}}$.%bove is true.%lemma ~\ref{lem:sample} is true.
Prove now, that when $d = k + 1$ the correctness holds. It is the case that the root of $\etree$ has up to two children $\etree_\lchild$ and $\etree_\rchild$. Since $\etree_\lchild$ and $\etree_\rchild$ are both depth $d \leq k$, by inductive hypothesis correctness holds for both of them, thus, $\sampmon$ will sample both monomials $\monom_\lchild$ in $(\monom_\lchild, \coef_\lchild)$ of $\expandtree{\etree_\lchild}$ and $\monom_\rchild$ in $(\monom_\rchild, \coef_\rchild)$ of $\expandtree{\etree_\rchild}$, from $\etree_\lchild$ and $\etree_\rchild$ with probability $\frac{|\coef_\lchild|}{\abs{\etree_\lchild}\polyinput{1}{1}}$ and $\frac{|\coef_\rchild|}{\abs{\etree_\rchild}\polyinput{1}{1}}$.
Prove now, that when $d = k + 1$ the correctness holds. It is the case that the root of $\etree$ has up to two children $\etree_\lchild$ and $\etree_\rchild$. Since $\etree_\lchild$ and $\etree_\rchild$ are both depth $d \leq k$, by inductive hypothesis correctness holds for both of them, thus, $\sampmon$ will sample both monomials $\monom_\lchild$ in $(\monom_\lchild, \coef_\lchild)$ of $\expandtree{\etree_\lchild}$ and $\monom_\rchild$ in $(\monom_\rchild, \coef_\rchild)$ of $\expandtree{\etree_\rchild}$, from $\etree_\lchild$ and $\etree_\rchild$ with probability $\frac{|\coef_\lchild|}{\abs{\etree_\lchild}\polyinput{1}{1}}$ and $\frac{|\coef_\rchild|}{\abs{\etree_\rchild}\polyinput{1}{1}}$.
Then the root has to be either a $+$ or $\times$ node.
Then the root has to be either a $+$ or $\times$ node.
Consider the case when the root is $\times$. Note that we are sampling a term from $\expandtree{\etree}$. Consider $(\monom, \coef)$ in $\expandtree{\etree}$, where $\monom$ is the sampled monomial. Notice also that it is the case that $\monom = \monom_\lchild \times \monom_\rchild$, where $\monom_\lchild$ is coming from $\etree_\lchild$ and $\monom_\rchild$ from $\etree_\rchild$. The probability that \sampmon$(\etree_{\lchild})$ returns $\monom_\lchild$ is $\frac{|\coef_{\monom_\lchild}|}{\abs{\etree_\lchild}\polyinput{1}{1}}$, and the probability that \sampmon$(\etree_{\rchild})$ returns $\monom_\rchild$ is $\frac{|\coef_{\monom_\rchild}|}{\abs{\etree_\rchild}\polyinput{1}{1}}$. Since both $\monom_\lchild$ and $\monom_\rchild$ are sampled with independent randomness, the final probability for sample $\monom$ is then $\frac{|\coef_{\monom_\lchild}| \cdot |\coef_{\monom_\rchild}|}{\abs{\etree_\lchild}\polyinput{1}{1} \cdot \abs{\etree_\rchild}\polyinput{1}{1}}$. For $(\monom, \coef)$ in \expandtree{\etree}, it is indeed the case that $|\coef| = |\coef_{\monom_\lchild}| \cdot |\coef_{\monom_\rchild}|$ and that $\abs{\etree}\polyinput{1}{1} = \abs{\etree_\lchild}\polyinput{1}{1} \cdot \abs{\etree_\rchild}\polyinput{1}{1}$, and therefore $\monom$ is sampled with the correct probability $\frac{|\coef|}{\abs{\etree}\polyinput{1}{1}}$.
Consider the case when the root is $\times$. Note that we are sampling a term from $\expandtree{\etree}$. Consider $(\monom, \coef)$ in $\expandtree{\etree}$, where $\monom$ is the sampled monomial. Notice also that it is the case that $\monom = \monom_\lchild \times \monom_\rchild$, where $\monom_\lchild$ is coming from $\etree_\lchild$ and $\monom_\rchild$ from $\etree_\rchild$. The probability that \sampmon$(\etree_{\lchild})$ returns $\monom_\lchild$ is $\frac{|\coef_{\monom_\lchild}|}{\abs{\etree_\lchild}\polyinput{1}{1}}$, and the probability that \sampmon$(\etree_{\rchild})$ returns $\monom_\rchild$ is $\frac{|\coef_{\monom_\rchild}|}{\abs{\etree_\rchild}\polyinput{1}{1}}$. Since both $\monom_\lchild$ and $\monom_\rchild$ are sampled with independent randomness, the final probability for sample $\monom$ is then $\frac{|\coef_{\monom_\lchild}| \cdot |\coef_{\monom_\rchild}|}{\abs{\etree_\lchild}\polyinput{1}{1} \cdot \abs{\etree_\rchild}\polyinput{1}{1}}$. For $(\monom, \coef)$ in \expandtree{\etree}, it is indeed the case that $|\coef| = |\coef_{\monom_\lchild}| \cdot |\coef_{\monom_\rchild}|$ and that $\abs{\etree}\polyinput{1}{1} = \abs{\etree_\lchild}\polyinput{1}{1} \cdot \abs{\etree_\rchild}\polyinput{1}{1}$, and therefore $\monom$ is sampled with the correct probability $\frac{|\coef|}{\abs{\etree}\polyinput{1}{1}}$.
For the case when $\etree.\val = +$, \sampmon~will sample monomial $\monom$ from one of its children. By inductive hypothesis we know that any $\monom_\lchild$ in $\expandtree{\etree_\lchild}$ and any $\monom_\rchild$ in $\expandtree{\etree_\rchild}$ will both be sampled with correct probability $\frac{|\coef_{\monom_\lchild}|}{\abs{\etree_{\lchild}}\polyinput{1}{1}}$ and $\frac{|\coef_{\monom_\rchild}|}{\abs{\etree_\rchild}\polyinput{1}{1}}$, where either $\monom_\lchild$ or $\monom_\rchild$ will equal $\monom$, depending on whether $\etree_\lchild$ or $\etree_\rchild$ is sampled. Assume that $\monom$ is sampled from $\etree_\lchild$, and note that a symmetric argument holds for the case when $\monom$ is sampled from $\etree_\rchild$. Notice also that the probability of choosing $\etree_\lchild$ from $\etree$ is $\frac{\abs{\etree_\lchild}\polyinput{1}{1}}{\abs{\etree_\lchild}\polyinput{1}{1} + \abs{\etree_\rchild}\polyinput{1}{1}}$ as computed by $\onepass$. Then, since $\sampmon$ goes top-down, and each sampling choice is independent (which follows from the randomness in the root of $\etree$ being independent from the randomness used in its subtrees), the probability for $\monom$ to be sampled from $\etree$ is equal to the product of the probability that $\etree_\lchild$ is sampled from $\etree$ and $\monom$ is sampled in $\etree_\lchild$, and
\begin{align*}
@ -627,7 +628,7 @@ and we obtain the desired result.
\paragraph{Run-time Analysis}
We now bound the number of recursive calls in $\sampmon$ by $O\left(k\cdot \text{depth}(\etree)\right)$. Take an arbitrary sample subgraph of expression tree $\etree$ of degree $k$ and pick an arbitrary level $i$. Call the number of $\times$ nodes in this level $y_i$, and the total number of nodes $x_i$. Given that both children of a $\times$ node are traversed in $\sampmon$ while only one child is traversed for a $+$ parent node, note that the number of nodes on level $i + 1$ in the general case is at most $y_i + x_i$, and the increase in the number of nodes from level $i$ to level $i + 1$ is upper bounded by $x_{i + 1} - x_i \leq y_i$.
We now bound the number of recursive calls in $\sampmon$ by $O\left(k\cdot \text{depth}(\etree)\right)$. Take an arbitrary sample subgraph of expression tree $\etree$ of degree $k$ and pick an arbitrary level $i$. Call the number of $\times$ nodes in this level $y_i$, and the total number of nodes $x_i$. Given that both children of a $\times$ node are traversed in $\sampmon$ while only one child is traversed for a $+$ parent node, note that the number of nodes on level $i + 1$ in the general case is at most $y_i + x_i$, and the increase in the number of nodes from level $i$ to level $i + 1$ is upper bounded by $x_{i + 1} - x_i \leq y_i$.
Now, we prove by induction on the depth $d$ of tree $\etree$ the following claim.
\begin{Claim}\label{claim:num-nodes-level-i}
@ -639,7 +640,7 @@ For the base case, $d = 0$, we have the following cases. For both cases, when $
Assume that for $d \leq k$ with $k \geq 0$, \cref{claim:num-nodes-level-i} holds.
The inductive step is to show that for arbitrary $\etree$ with depth = $d + 1 \leq k + 1$ the claim still holds. Note that we have two possibilities for the value of $\etree$. First, $\etree.\type = +$, and it is the case in ~\cref{alg:sample-plus-traversal} that only one of $\etree_\lchild$ or $\etree_\rchild$ are part of the subgraph traversed by $\sampmon$. By inductive hypothesis, both subtrees satisfy the claim. Since only one child is part of the subgraph, there is exactly one node at level 1, which, as in the base case analysis, satisfies ~\cref{claim:num-nodes-level-i}. For the second case, $\etree.\type = \times$, $\sampmon$ traverses both children, and the number of nodes at level $1$ in the subgraph is then $2$, which satisfies ~\cref{claim:num-nodes-level-i} since the sum of $\times$ nodes in previous levels (level $0$) is $1$, and $1 + 1 = 2$, proving the claim.
The inductive step is to show that for arbitrary $\etree$ with depth = $d + 1 \leq k + 1$ the claim still holds. Note that we have two possibilities for the value of $\etree$. First, $\etree.\type = +$, and it is the case in ~\cref{alg:sample-plus-traversal} that only one of $\etree_\lchild$ or $\etree_\rchild$ are part of the subgraph traversed by $\sampmon$. By inductive hypothesis, both subtrees satisfy the claim. Since only one child is part of the subgraph, there is exactly one node at level 1, which, as in the base case analysis, satisfies ~\cref{claim:num-nodes-level-i}. For the second case, $\etree.\type = \times$, $\sampmon$ traverses both children, and the number of nodes at level $1$ in the subgraph is then $2$, which satisfies ~\cref{claim:num-nodes-level-i} since the sum of $\times$ nodes in previous levels (level $0$) is $1$, and $1 + 1 = 2$, proving the claim.
\end{proof}
\qed

21
atri.bib Normal file
View file

@ -0,0 +1,21 @@
@inproceedings{triang-hard,
author = {Tsvi Kopelowitz and
Virginia Vassilevska Williams},
editor = {Artur Czumaj and
Anuj Dawar and
Emanuela Merelli},
title = {Towards Optimal Set-Disjointness and Set-Intersection Data Structures},
booktitle = {47th International Colloquium on Automata, Languages, and Programming,
{ICALP} 2020, July 8-11, 2020, Saarbr{\"{u}}cken, Germany (Virtual
Conference)},
series = {LIPIcs},
volume = {168},
pages = {74:1--74:16},
publisher = {Schloss Dagstuhl - Leibniz-Zentrum f{\"{u}}r Informatik},
year = {2020},
url = {https://doi.org/10.4230/LIPIcs.ICALP.2020.74},
doi = {10.4230/LIPIcs.ICALP.2020.74},
timestamp = {Tue, 30 Jun 2020 17:15:44 +0200},
biburl = {https://dblp.org/rec/conf/icalp/KopelowitzW20.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}

View file

@ -15,6 +15,7 @@
\newcommand{\db}{D}
\newcommand{\idb}{\Omega}
\newcommand{\pdb}{\mathcal{D}}
\newcommand{\pxdb}{\mathbf{D}}
\newcommand{\nxdb}{D(\vct{X})}%\mathbb{N}[\vct{X}] db
\newcommand{\tset}{\mathcal{T}}%the set of tuples in a database
\newcommand{\pd}{P}%pd for probability distribution
@ -36,6 +37,12 @@
\newcommand{\dtrm}[1]{Det\left(#1\right)}
\newcommand{\tuple}[1]{\left<#1\right>}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Query Classes
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newcommand{\qClass}{\mathcal{Q}}
\newcommand{\raPlus}{\ensuremath{\mathcal{RA}^{+}}\xspace}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%Approx Alg
%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@ -73,9 +80,9 @@
\newcommand{\lchild}{\vari{L}}
\newcommand{\rchild}{\vari{R}}
%members of T
\newcommand{\val}{\vari{val}}
\newcommand{\type}{\vari{type}}
\newcommand{\wght}{\vari{weight}}
\newcommand{\val}{\vari{val}\xspace}
\newcommand{\type}{\vari{type}\xspace}
\newcommand{\wght}{\vari{weight}\xspace}
%types of T
\newcommand{\var}{var}
\newcommand{\tnum}{num}
@ -117,6 +124,11 @@
\newcommand{\bivar}{x_{\block, i}}
\newcommand{\tipdb}{\pdb_{\tiabb}}
% REPRESENTATIONS
\newcommand{\rmod}{Mod}
\newcommand{\reprs}{\mathcal{M}}
\newcommand{\repr}{M}
%Polynomial Reformulation
\newcommand{\wbit}{w}
\newcommand{\expct}{\mathop{\mathbb{E}}}
@ -250,16 +262,17 @@
\DeclareMathAlphabet{\mathbbold}{U}{bbold}{m}{n}
\newtheorem{Theorem}{Theorem}[section]
\newtheorem{Definition}{Definition}
\newtheorem{Lemma}{Lemma}
\newtheorem{Proposition}{Proposition}
\newtheorem{Property}{Property}
\newtheorem{Corollary}{Corollary}
\newtheorem{Claim}{Claim}
\newtheorem{Example}{Example}
\newtheorem{Axiom}{Axiom}
\newtheorem{Question}{Question}
\newtheorem{Assumption}{Assumption}
\newtheorem{Definition}[Theorem]{Definition}
\newtheorem{Lemma}[Theorem]{Lemma}
\newtheorem{Proposition}[Theorem]{Proposition}
\newtheorem{Property}[Theorem]{Property}
\newtheorem{Corollary}[Theorem]{Corollary}
\newtheorem{Claim}[Theorem]{Claim}
\newtheorem{Example}[Theorem]{Example}
\newtheorem{Axiom}[Theorem]{Axiom}
\newtheorem{Question}[Theorem]{Question}
\newtheorem{Assumption}[Theorem]{Assumption}
\newtheorem{hypo}[Theorem]{Conjecture}
@ -283,7 +296,7 @@
\newcommand{\semN}{\mathbb{N}}
\newcommand{\domN}{\mathbb{N}}
\newcommand{\semB}{\mathbb{B}}
\newcommand{\semNX}{\mathbb{N}[X]}
\newcommand{\semNX}{\mathbb{N}[\vct{X}]}
\newcommand{\domK}{K}
\newcommand{\semK}{\mathcal{K}}
@ -310,6 +323,8 @@
\newcommand{\dbDomK}[1]{\mathcal{DB}_{#1}}
\newcommand{\assign}{\varphi}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% COMPLEXITY
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@ -328,3 +343,7 @@
%%%Adding stuff below so that long chain of display equatoons can be split across pages
\allowdisplaybreaks
\newcommand{\eps}{\epsilon}
\newcommand{\inparen}[1]{\left({#1}\right)}
\newcommand{\inset}[1]{\left\{{#1}\right\}}

View file

@ -176,7 +176,7 @@ sensitive=true
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\bibliographystyle{plain}
\bibliography{aaron.bib}
\bibliography{aaron,atri}

View file

@ -23,23 +23,61 @@ Given a positive integer $k$ and an undirected graph $G$ with no self-loops or
The above result means that we cannot hope to count the number of $k$-matchings in $G=(V,E)$ in time $f(k)\cdot |V|^{O(1)}$ for any function $f$. In fact, all known algorithms to solve this problem take time $|V|^{\Omega(k)}$.
To prove our hardness result, consider a graph $G(V, E)$, where $|E| = \numedge$, $|V| = \numvar$, and $i, j \in [\numvar]$.
Our hardness result in Section~\ref{sec:single-p} is based on the following conjectured hardness result:
\begin{hypo}
\label{conj:graph}
There exists a constant $\eps_0>0$ such that given an undirected graph $G=(V,E)$, computing exactly the values $\numocc{G}{\tri}$, $\numocc{G}{\threepath}$ and $\numocc{G}{\threedis}$ cannot be done in time $o\inparen{|E|^{1+\eps_0}}$.
\end{hypo}
The so-called {\em Triangle detection hypothesis} (cf.~\cite{triang-hard}), which states that detecting whether $G$ has a triangle or not takes time $\Omega\inparen{|E|^{4/3}}$, implies that in Conjecture~\ref{conj:graph} we can take $\eps_0\ge \frac 13$.
\AR{Need to add something about 3-paths and 3-matchings as well.}
Consider the query $\poly_{G}(\vct{X}) = q_E(X_1,\ldots, X_\numvar) = \sum\limits_{(i, j) \in E} X_i \cdot X_j$.
Both of our hardness results use a query polynomial that is based on a simple encoding of the edges of a graph.
To prove our hardness result, consider a graph $G(V, E)$, where $|E| = \numedge$ and $|V| = \numvar$. Our query polynomial will have a variable $X_i$ for every $i \in [\numvar]$.
Now consider the query
\[\poly_{G}(\vct{X}) = \sum\limits_{(i, j) \in E} X_i \cdot X_j.\]
The hard query polynomial for our problem will be a suitable power $k\ge 3$ of the polynomial above, i.e.
\begin{Definition}
Let $G=([n],E)$ be a graph. Then for any $\kElem\ge 1$, define
\[\poly_{G}^\kElem(X_1,\dots,X_n) = \left(\sum\limits_{(i, j) \in E} X_i \cdot X_j\right)^\kElem.\]
\end{Definition}
\AR{need discussion on the `tightness' of various params. First, this is for degree 6 poly-- while things are easy for say deg 2. Second this is for any fixed p. Finally, we only need porject-join queries to get the hardness results. Also need to compare this with the generality of the approx upper bound results.}
Our hardness results only need a TIDB instance; further, we consider the special case when all the tuple probabilities are the same value. It is not too hard to see that we can encode the above polynomial in an expression tree of size $\Theta(km)$.
Following up on the discussion around Example~\ref{ex:intro}, it is easy to see that $\poly_{G}^\kElem(\vct{X})$ is the query polynomial corresponding to the following query:
\[\poly:- R(A_1),E(A_1,B_1),R(B_1),\dots,R(A_\kElem),E(A_\kElem,B_\kElem),R(B_\kElem)\]
where, generalizing the PDB instance in Example~\ref{ex:intro}, relation $R$ has $n$ tuples corresponding to each vertex in $V=[n]$, each with probability $p$, and $E(A,B)$ has tuples corresponding to the edges in $E$ (each with probability of $1$).\footnote{Technically, $\poly_{G}^\kElem(\vct{X})$ should have variables corresponding to tuples in $E$ as well, but since they are always present with probability $1$, we drop those. Our argument also works when all the tuples in $E$ are present with probability $p$, but to make notation a bit simpler, we make this simplification.}
For the following discussion, set $\poly_{G}^\kElem(\vct{X}) = \left(q_E(X_1,\ldots, X_\numvar)\right)^\kElem$.
Note that this implies that our hard query polynomial can be created from a join-project query---by contrast, our approximation algorithm in Section~\ref{sec:algo} can handle lineage polynomials generated by a union of select-project-join queries. % (i.e. we do not need union or select operator to derive our hardness result).
%\AR{need discussion on the `tightness' of various params. First, this is for degree 6 poly-- while things are easy for say deg 2. Second this is for any fixed p. Finally, we only need porject-join queries to get the hardness results. Also need to compare this with the generality of the approx upper bound results.}
\subsection{Multiple Distinct $\prob$ Values}
\label{sec:multiple-p}
We are now ready to present our main hardness result.
\begin{Theorem}\label{thm:mult-p-hard-result}
Computing $\rpoly_G^\kElem(\prob_i,\dots,\prob_i)$ for arbitrary $G$ and any $(2k+1)$ values $\prob_i$ ($0\le i \le 2k$) is \sharpwonehard.
\end{Theorem}
We will prove the above result by reducing from the problem of computing the number of $k$-matchings in $G$. Given the current best-known algorithm for this counting problem, our results imply that unless the state-of-the-art $k$-matching algorithms are improved, we cannot hope to solve our problem in time better than $\Omega_k\inparen{m^{k/2}}$, which is only quadratically faster than expanding $\poly_{G}^\kElem(\vct{X})$ into its SOP form and using~\Cref{cor:expct-sop}. By contrast, our approximation algorithm would run in time $O_k\inparen{m}$ on this query (since it runs in linear time on all query polynomials).
As mentioned earlier, we prove our hardness result by presenting a reduction from the problem of counting $\kElem$-matchings in a graph:
\begin{Lemma}\label{lem:qEk-multi-p}
Let $\prob_0,\ldots, \prob_{2\kElem}$ be distinct values in $(0, 1]$. Then given the values $\rpoly_{G}^\kElem(\prob_i,\ldots, \prob_i)$ for $0\leq i\leq 2\kElem$, the number of $\kElem$-matchings in $G$ can be computed in $poly(\kElem)$ time.
\end{Lemma}
Before we prove the above Lemma, let us use it to prove~\Cref{thm:mult-p-hard-result}:
\begin{proof}[Proof of Theorem~\ref{thm:mult-p-hard-result}]
For the sake of contradiction, let us assume we can solve our problem in $f(\kElem)\cdot m^c$ time for some absolute constant $c$. Then given a graph $G$ we can compute the query polynomial $\rpoly_G^\kElem$ (in the obvious way) in $O(km)$ time. Then after we run our algorithm on $\rpoly_G^\kElem$, we get $\rpoly_{G}^\kElem(\prob_i,\ldots, \prob_i)$ for $0\leq i\leq 2\kElem$ in additional $f(\kElem)\cdot m^c$ time. \Cref{lem:qEk-multi-p} then computes the number of $k$-matchings in $G$ in $poly(\kElem)$ time. Thus, overall we have an algorithm for computing the number of $k$-matchings in time
\begin{align*}
O(km) + f(\kElem)\cdot m^c + poly(\kElem)
&\le \inparen{poly(\kElem) + f(\kElem)}\cdot m^{c+1} \\
&\le \inparen{poly(\kElem) + f(\kElem)}\cdot n^{2c+2},
\end{align*}
which contradicts~\cref{thm:k-match-hard}.
\end{proof}
Finally, we are ready to prove~\Cref{lem:qEk-multi-p}:
\begin{proof}[Proof of ~\cref{lem:qEk-multi-p}]
%It is trivial to see that one can readily expand the exponential expression by performing the $n^\kElem$ product operations, yielding the polynomial in the sum of products form of the lemma statement. By definition $\rpoly_{G}^\kElem$ reduces all variable exponents greater than $1$ to $1$. Thus, a monomial such as $X_i^\kElem X_j^\kElem$ is $X_iX_j$ in $\rpoly_{G}^\kElem$, and the value after substitution is $p_i\cdot p_j = p^2$. Further, that the number of terms in the sum is no greater than $2\kElem + 1$, can be easily justified by the fact that each edge has two endpoints, and the most endpoints occur when we have $\kElem$ distinct edges (such a subgraph is also known as a $\kElem$-matching), with non-intersecting points, a case equivalent to $p^{2\kElem}$.
We will show that $\rpoly_{G}^\kElem(\prob,\ldots, \prob) = \sum\limits_{i = 0}^{2\kElem} c_i \cdot \prob^i$. First, since $\poly_G^\kElem(\vct{X})$ has $\kElem$ products of monomials of degree $2$, it follows that $\poly_G^\kElem(\vct{X})$ has degree $2\kElem$. We can further write $\poly_{G}^{\kElem}(\vct{X})$ in its expanded SOP form,
@ -62,14 +100,6 @@ Then, since we have $\kElem!$ duplicates of each distinct $\kElem$-matching, and
\qed
\begin{Corollary}\label{cor:mult-p-hard-result}
Computing $\rpoly(\vct{X})$ given multiple distinct $\prob$ values is $\#W[1]$-hard.
\end{Corollary}
\begin{proof}[Proof of Corollary ~\ref{cor:mult-p-hard-result}]
The proof follows by ~\cref{thm:k-match-hard} and ~\cref{lem:qEk-multi-p}.
\end{proof}
\qed

View file

@ -15,25 +15,167 @@ For a probabilistic database $\pdb = (\idb, \pd)$, the result of a query is th
\[\forall \db \in \query(\idb): \pd'(\db) = \sum_{\db' \in \idb: \query(\db') = \db} \pd(\db') \]
Note that in this work we consider multisets, i.e., each possible world is a set of multiset relations and queries are evaluated using bag semantics. We will use K-relations to model multisets. A \emph{K-relation}~\cite{DBLP:conf/pods/GreenKT07} is a relation whose tuples are each annotated with elements from a commutative semiring $\semK = (\domK, \addK, \multK, \zeroK, \oneK)$. A commutative semiring is a structure with a domain $\domK$ and associative and commutative binary operations $\addK$ and $\multK$ such that $\multK$ distributes over $\addK$, $\zeroK$ is the identity of $\addK$, $\oneK$ is the identity of $\multK$, and $\zeroK$ annihilates all elements of $\domK$ when being combined with $\multK$.
Note that in this work we consider multisets, i.e., each possible world is a set of multiset relations and queries are evaluated using bag semantics. We will use K-relations to model multisets. A \emph{K-relation}~\cite{DBLP:conf/pods/GreenKT07} is a relation whose tuples are annotated with elements from a commutative semiring $\semK = (\domK, \addK, \multK, \zeroK, \oneK)$. A commutative semiring is a structure with a domain $\domK$ and associative and commutative binary operations $\addK$ and $\multK$ such that $\multK$ distributes over $\addK$, $\zeroK$ is the identity of $\addK$, $\oneK$ is the identity of $\multK$, and $\zeroK$ annihilates all elements of $\domK$ when being combined with $\multK$.
Let $\udom$ be a countable domain of values.
Formally, an n-ary $\semK$-relation over $\udom$ is a function $\rel: \udom^n \to \domK$ with finite support $\support{\rel} = \{ \tup \mid \rel(\tup) \neq \zeroK \}$.
A $\semK$-database is a set of $\semK$-relations. It will be convenient to also interpret a $\semK$-database as a function from tuples to annotations. Thus, $\rel(t)$ ($\db(t)$) denotes the annotation associated by $\semK$-relation $\rel$ ($\semK$-database $\db$) to tuple $t$.
We review the semantics of positive relational algebra queries over $\semK$-relations below. \BG{should we use DL instead}
We review the semantics of positive relational algebra queries over $\semK$-relations below.
Consider the semiring $\semN = (\domN,+,\times,0,1)$ of natural number. $\semN$-databases are used to model bag semantics by annotating each tuple with its multiplicity. A probabilistic $\semN$-databases is a PDB where each possible world is a $\semN$-database. We will study the problem of evaluating statical moments of query results over such databases. Specifically, given a probabilistic $\semN$-database $\pdb = (\idb, \pd)$, query $\query$, and possible result tuple $t$, we treat $\query(\db)(t)$ as a random $\semN$-valued variable and are interested in computing its expectation $\expct_{\db \in \idb}[\query(\db)(t)]$:
Consider the semiring $\semN = (\domN,+,\times,0,1)$ of natural numbers. $\semN$-databases are used to model bag semantics by annotating each tuple with its multiplicity. A probabilistic $\semN$-database ($\semN$-PDB) is a PDB where each possible world is an $\semN$-database. We will study the problem of evaluating statistical moments of query results over such databases. Specifically, given a probabilistic $\semN$-database $\pdb = (\idb, \pd)$, a query $\query$, and a possible result tuple $t$, we treat $\query(\db)(t)$ as a random $\semN$-valued variable and are interested in computing its expectation $\expct_{\idb \sim \pd}[\query(\db)(t)]$:
\[\expct_{\db \in \idb}[\query(\db)(t)] = \sum_{\db \in \idb} \query(\db)(t) \cdot \pd(\db) \]
\begin{align}\label{eq:bag-expectation}
\expct_{\idb \sim \pd}[\query(\db)(t)] = \sum_{\db \in \idb} \query(\db)(t) \cdot \pd(\db)
\end{align}
Intuitively, the expectation of $\query(\db)(t)$ is the number of duplicates of $t$ we expect to find in the query result.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{$\semK$-relational Query Semantics}\label{sec:semk-query-semantics}
For completeness, we briefly review the semantics for $\raPlus$ queries over $\semK$-relations~\cite{DBLP:conf/pods/GreenKT07}.
We use $\eval{\cdot}{\db}$ to denote evaluating query $\query$ over $\semK$-database $\db$. In the definition shown below, we assume that tuples are of appropriate arity and use $\project_A(\tup)$ to denote the projection of tuple $\tup$ on a list of attributes $A$.
\begin{align*}
&\eval{\project_A(\rel)}(\tup)&& = &&\sum_{\tup': \project_A(\tup) = \tup} \eval{\rel}(\tup')\\
&\eval{(\rel_1 \union \rel_2)}(\tup)&& = &&\eval{\rel_1}(\tup) + \eval{\rel_2}(\tup)\\
&\eval{(\rel_1 \join \rel_2)}(\tup) && = &&\eval{\rel_1}(\project_{\sch(\rel_1)}(\tup)) \times \eval{\rel_2}(\project_{\sch(\rel_2)}(\tup)) \\
&\eval{\select_\theta(\rel)}(\tup) && = &&\begin{cases}
\eval{\rel}(\tup) &\text{if }\theta(\tup) = 1\\
0 &\text{otherwise}.
\end{cases}\\
&\eval{R}(\tup) && = &&\rel(\tup)
\end{align*}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{$\semNX$ as a Representation System}\label{sec:semnx-as-repr}
Let $\semNX$ denote the set of polynomials over variables $\vct{X}$ with natural number coefficients and exponents.
Consider now the semiring $(\semNX, +, \cdot, 0, 1)$ whose elements are $\semNX$ and addition and multiplication are standard addition and multiplication of polynomials. We will utilize $\semNX$-databases $\db$ paired with a probability distribution to represent $\semN$-PDBs.\BG{Need more motivation?} To justify the use of $\semNX$-databases, we need to show that we can encode any $\semN$-PDB in this way and that the query semantics over this representation coincide with the query semantics over $\semN$-PDBs. For that it will be opportune to define the notion of representation systems.\BG{cite}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Definition}[Representation System]\label{def:representation-syste}
A representation system for $\semN$-PDBs is a tuple $(\reprs, \rmod)$ where $\reprs$ is a set of representations and $\rmod$ associates with each $\repr \in \reprs$ a $\semN$-PDB $\pdb$. We say that a representation system is \emph{closed} under a class of queries $\qClass$ if for any query $\query \in \qClass$ we have:
%
\[ \rmod(\query(\repr)) = \query(\rmod(\repr)) \]
A representation system is \emph{complete} if for every $\semN$-PDB $\pdb$ there exists $\repr \in \reprs$ such that:
%
\[ \rmod(\repr) = \pdb \]
\end{Definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
As mentioned above we will use $\semNX$-databases paired with a probability distribution as a representation system.
We refer to such databases as $\semNX$-PDBs and use bold symbols to distinguish them from possible worlds (which are $\semN$-databases).
Formally, a $\semNX$-PDB is a $\semNX$-database and a probability distribution over assignments $\assign$ of the variables $\vct{X}$ occurring in annotations of $\db$ to $\{0,1\}$. Note that an assignment $\assign: \vct{X} \to \{0,1\}$ can be represented as a vector $\vct{w} \in \{0,1\}^n$ where $\vct{w}[i]$ records the value assigned to $X_i$. Thus, from now on we will solely use such vectors and implicitly understand them to represent assignments. Given an assignment $\assign$ we use $\assign(\pxdb)$ to denote the semiring homomorphism $\semNX \to \semN$ that applies the assignment $\assign$ to all variables of a polynomial and evaluates the resulting expression in $\semN$.\BG{explain connection to homomorphism lifting in K-relations}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Definition}[$\semNX$-PDBs]\label{def:semnx-pdbs}
A $\semNX$-PDB $\pxdb$ over variables $\vct{X} = \{X_1, \ldots, X_n\}$ is a tuple $(\db,\pd)$ where $\db$ is an $\semNX$-database and $\pd$ is a probability distribution over $\vct{w} \in \{0,1\}^n$. We use $\assign_{\vct{w}}$ to denote the assignment corresponding to $\vct{w} \in \{0,1\}^n$. The $\semN$-PDB $\rmod(\pxdb) = (\idb, \pd')$ encoded by $\pxdb$ is defined as:
\begin{align*}
\idb & = \{ \assign_{\vct{w}}(\pxdb) \mid \vct{w} \in \{0,1\}^n \} \\
\pd'(\db) & = \sum_{\vct{w} \in \{0,1\}^n: \assign_{\vct{w}}(\pxdb) = \db} \pd(\vct{w})
\end{align*}
\end{Definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\BG{Need an example here}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Proposition}\label{prop:semnx-pdbs-are-a-}
$\semNX$-PDBs are a complete representation system for $\semN$-PDBs that is closed under $\raPlus$ queries.
\end{Proposition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{proof}
To prove that $\semNX$-PDBs are complete consider the following construction that for any $\semN$-PDB $\pdb$ produces a $\semNX$-PDB $\db$ such that $\rmod(\db) = \pdb$.
\BG{Add: create a number of variables $X_{ij}$ for each possible world $i$ that correspond to the maximum multiplicity of a tuple in the world. Then each tuple is annotated with a sum of variables $\sum_{i} \sum_{j \leq D_i(t)} X_{ij}$ and the probability distribution assigns only vectors where $X_{ij} = 1$ for a fixed $i$ and all $j$ a probability of $\pd(D_i)$}
The closure under $\raPlus$ queries follows from the fact that an assignment $\vct{X} \to \{0,1\}$ is a semiring homomorphism and that semiring homomorphisms commute with queries over $\semK$-relations.
\end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Since $\semNX$-PDBs $\pxdb$ are a complete representation system closed under $\raPlus$, computing the expectation of the multiplicity of a tuple $t$ in the result of a $\raPlus$ query over the $\semN$-PDB $\rmod(\pxdb)$ is the same as computing the expectation of the polynomial $\query(\pxdb)(t)$.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Proposition}[Expectation of polynomials]\label{prop:expection-of-polynom}
Given a $\semN$-PDB $\pdb$ and $\semNX$-PDB $\pxdb$ such that $\rmod(\pxdb) = \pdb$, we have:
\[ \expct_{\idb \sim \pd}[\query(\db)(t)] = \expct_{\vct{\rw} \sim \pd}\pbox{\poly(\rw)} \]
\end{Proposition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\BG{Define TIDB, BIDB as subclasses of $\semNX$ with restrictions}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Definition}[TIDBs and BIDBs]\label{def:tidbs-and-bidbs}
A \emph{TIDB} $\pxdb = (\db, \pd)$ is a $\semNX$-PDB such that (i) every tuple is annotated with either $0$ or a unique variable $X_i$ and (ii) the probability distribution $\pd$ is such that all variables are independent.
A \emph{BIDB} $\pxdb = (\db, \pd)$ is a $\semNX$-PDB such that (i) every tuple is annotated with either $0$ or a unique variable $X_i$ and (ii) the tuples $\tup$ of $\pxdb$ for which $\pxdb(\tup) \neq 0$ can be partitioned into a set of blocks such that variables from separate blocks are independent of each other and variables from the same block are disjoint events.
\end{Definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Note that the main difference to the standard definitions of TIDBs and BIDBs is that we define them as subclasses of $\semNX$-PDBs and that we use bag semantics. Even though tuples cannot occur more than once in the input TIDB or BIDB, they can occur with a multiplicity larger than one in the result of a query.
\BG{Oliver's conjecture: Bag-TIDBs + Q can express any finite bag-PDB:
A well-known result for set semantics PDBs is that while not all finite PDBs can be encoded as TIDBs, any finite PDB can be encoded using a TIDB and a query. An analogous result holds in our case: any finite $\semN$-PDB can be encoded as a bag TIDB and a query (WHAT CLASS? ADD PROOF)
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Expression Trees}\label{sec:expression-trees}
In the following we will make use of expression trees to encode polynomials which we define formally in this subsection.
For illustrative purposes consider the polynomial $\poly(\vct{X}) = 2x^2 + 3xy - 2y^2$ over $\vct{X} = (x,y)$.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Definition}[Expression Tree]\label{def:express-tree}
Consider a vector of variables $\vct{X}$.
An expression tree $\etree$ over $\vct{X}$ is a binary %an ADT logically viewed as an n-ary
tree, whose internal nodes are from the set $\{+, \times\}$, with leaf nodes being either from the set $\mathbb{R}$ $(\tnum)$ or from the set of monomials $(\var)$. The members of $\etree$ are \type, \val, \vari{partial}, \vari{children}, and \vari{weight}, where \type is the type of value stored in the node $\etree$ (i.e., one of $\{+, \times, \var, \tnum\}$), \val is the value stored, and \vari{children} is the list of $\etree$'s children where $\etree_\lchild$ is the left child and $\etree_\rchild$ the right child.
\end{Definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
We ignore the remaining fields (\vari{partial} and \vari{weight}) for now. Their purpose will become clear in~\Cref{sec:approximation-algo}. Note that $\etree$ need not encode an expression in standard monomial basis. For instance, $\etree$ could represent a compressed form of the running example, such as $(x + 2y)(2x - y)$.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Definition}[poly$(\cdot)$]\label{def:poly-func}
Denote $poly(\etree)$ to be the function that takes as input expression tree $\etree$ and outputs its corresponding polynomial. $poly(\cdot)$ is recursively defined on $\etree$ as follows, where $\etree_\lchild$ and $\etree_\rchild$ denote the left and right child of $\etree$ respectively.
% \begin{align*}
% &\etree.\type = +\mapsto&& \polyf(\etree_\lchild) + \polyf(\etree_\rchild)\\
% &\etree.\type = \times\mapsto&& \polyf(\etree_\lchild) \cdot \polyf(\etree_\rchild)\\
% &\etree.\type = \var \text{ OR } \tnum\mapsto&& \etree.\val
% \end{align*}
\begin{equation*}
\polyf(\etree) = \begin{cases}
\polyf(\etree_\lchild) + \polyf(\etree_\rchild) &\text{ if \etree.\type } = +\\
\polyf(\etree_\lchild) \cdot \polyf(\etree_\rchild) &\text{ if \etree.\type } = \times\\
\etree.\val &\text{ if \etree.\type } = \var \text{ OR } \tnum.
\end{cases}
\end{equation*}
\end{Definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Note that addition and multiplication above follow the standard interpretation over polynomials.
%Specifically, when adding two monomials whose variables and respective exponents agree, the coefficients corresponding to the monomials are added and their sum is multiplied to the monomial. Multiplication here is denoted by concatenation of the monomial and coefficient. When two monomials are multiplied, the product of each corresponding coefficient is computed, and the variables in each monomial are multiplied, i.e., the exponents of like variables are added. Again we notate this by the direct product of coefficient product and all disitinct variables in the two monomials, with newly computed exponents.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Definition}[Expression Tree Set]\label{def:express-tree-set}$\etreeset{\smb}$ is the set of all possible expression trees $\etree$, such that $poly(\etree) = \poly(\vct{X})$.
\end{Definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
For our running example, $\etreeset{\smb} = \{2x^2 + 3xy - 2y^2, (x + 2y)(2x - y), x(2x - y) + 2y(2x - y), 2x(x + 2y) - y(x + 2y)\}$. Note that \cref{def:express-tree-set} implies that $\etree \in \etreeset{poly(\etree)}$.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Problem Definition}\label{sec:problem-definition}
We are now ready to formally state the main problem addressed in this work.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Definition}[The Expected Result Multiplicity Problem]\label{def:the-expected-multipl}
Let $\vct{X} = (X_1, \ldots, X_n)$, and $\pdb$ be an $\semNX$-PDB over $\vct{X}$ with probability distribution $\pd$ over assignments $\vct{X} \to [0,1]$, $\query$ an n-ary query, and $t$ an n-ary tuple.
The \expectProblem is defined as follows:
\begin{itemize}
\item \textbf{Input}: A $\semN$-PDB $\pdb$, n-ary query $\query$, an n-ary tuple $t$
\item \textbf{Output}:
\item \textbf{Input}: An expression tree $\etree \in \etreeset{\smb}$ for $\poly(\vct{X}) = \query(\pdb)(t)$
\item \textbf{Output}: $\expct_{\vct{X} \sim \pd}[\poly(\vct{X})]$
\end{itemize}
\end{Definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@ -44,6 +186,9 @@ We are now ready to formally state the main problem addressed in this work.
% The possible worlds semantics gives a framework for how to think about running queries over $\idb$. Given a query $\query$, $\query$ is deterministically run over each $\db \in \idb$, and the output of $\query(\idb)$ is defined as the set of results (worlds) from running $\query$ over each $\db_i \in \idb$. We write this formally as,
% \[\query(\idb) = \comprehension{\query(\db)}{\db \in \idb}.\]
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Previous}
\begin{Definition}[$\bi$~\cite{DBLP:series/synthesis/2011Suciu}]