Merge branch 'master' of gitlab.odin.cse.buffalo.edu:ahuber/SketchingWorlds

2020-12-13 17:46:33 -05:00 · 2020-12-13 17:46:33 -05:00 · ff71aa617e
parent d6825c38c6 9c638e6f4c
commit ff71aa617e
6 changed files with 304 additions and 88 deletions
--- a/approx_alg.tex
+++ b/approx_alg.tex
@ -1,5 +1,6 @@
 %root: main.tex
 \section{$1 \pm \epsilon$ Approximation Algorithm}
+\label{sec:algo}
 Since it is the case that computing the expected multiplicity of a compressed representation of a bag polynomial is hard, it is then desirable to have an algorithm to approximate the multiplicity in linear time, which is what we describe next.

 First, let us introduce some useful definitions and notation.  For illustrative purposes in the definitions below, let us consider when $\poly(\vct{X}) = 2x^2 + 3xy - 2y^2$.
--- a/atri.bib
+++ b/atri.bib
@ -0,0 +1,21 @@
+@inproceedings{triang-hard,
+  author    = {Tsvi Kopelowitz and
+               Virginia Vassilevska Williams},
+  editor    = {Artur Czumaj and
+               Anuj Dawar and
+               Emanuela Merelli},
+  title     = {Towards Optimal Set-Disjointness and Set-Intersection Data Structures},
+  booktitle = {47th International Colloquium on Automata, Languages, and Programming,
+               {ICALP} 2020, July 8-11, 2020, Saarbr{\"{u}}cken, Germany (Virtual
+               Conference)},
+  series    = {LIPIcs},
+  volume    = {168},
+  pages     = {74:1--74:16},
+  publisher = {Schloss Dagstuhl - Leibniz-Zentrum f{\"{u}}r Informatik},
+  year      = {2020},
+  url       = {https://doi.org/10.4230/LIPIcs.ICALP.2020.74},
+  doi       = {10.4230/LIPIcs.ICALP.2020.74},
+  timestamp = {Tue, 30 Jun 2020 17:15:44 +0200},
+  biburl    = {https://dblp.org/rec/conf/icalp/KopelowitzW20.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
--- a/macros.tex
+++ b/macros.tex
@ -15,6 +15,7 @@
 \newcommand{\db}{D}
 \newcommand{\idb}{\Omega}
 \newcommand{\pdb}{\mathcal{D}}
+\newcommand{\pxdb}{\mathbf{D}}
 \newcommand{\nxdb}{D(\vct{X})}%\mathbb{N}[\vct{X}] db
 \newcommand{\tset}{\mathcal{T}}%the set of tuples in a database
 \newcommand{\pd}{P}%pd for probability distribution
@ -36,6 +37,12 @@
 \newcommand{\dtrm}[1]{Det\left(#1\right)}
 \newcommand{\tuple}[1]{\left<#1\right>}

+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+% Query Classes
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\newcommand{\qClass}{\mathcal{Q}}
+\newcommand{\raPlus}{\ensuremath{\mathcal{RA}^{+}}\xspace}
+
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%
 %Approx Alg
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%
@ -73,9 +80,9 @@
 \newcommand{\lchild}{\vari{L}}
 \newcommand{\rchild}{\vari{R}}
 %members of T
-\newcommand{\val}{\vari{val}}
-\newcommand{\type}{\vari{type}}
-\newcommand{\wght}{\vari{weight}}
+\newcommand{\val}{\vari{val}\xspace}
+\newcommand{\type}{\vari{type}\xspace}
+\newcommand{\wght}{\vari{weight}\xspace}
 %types of T
 \newcommand{\var}{var}
 \newcommand{\tnum}{num}
@ -117,6 +124,11 @@
 \newcommand{\bivar}{x_{\block, i}}
 \newcommand{\tipdb}{\pdb_{\tiabb}}

+% REPRESENTATIONS
+\newcommand{\rmod}{Mod}
+\newcommand{\reprs}{\mathcal{M}}
+\newcommand{\repr}{M}
+
 %Polynomial Reformulation
 \newcommand{\wbit}{w}
 \newcommand{\expct}{\mathop{\mathbb{E}}}
@ -250,16 +262,17 @@
 \DeclareMathAlphabet{\mathbbold}{U}{bbold}{m}{n}

 \newtheorem{Theorem}{Theorem}[section]
-\newtheorem{Definition}{Definition}
-\newtheorem{Lemma}{Lemma}
-\newtheorem{Proposition}{Proposition}
-\newtheorem{Property}{Property}
-\newtheorem{Corollary}{Corollary}
-\newtheorem{Claim}{Claim}
-\newtheorem{Example}{Example}
-\newtheorem{Axiom}{Axiom}
-\newtheorem{Question}{Question}
-\newtheorem{Assumption}{Assumption}
+\newtheorem{Definition}[Theorem]{Definition}
+\newtheorem{Lemma}[Theorem]{Lemma}
+\newtheorem{Proposition}[Theorem]{Proposition}
+\newtheorem{Property}[Theorem]{Property}
+\newtheorem{Corollary}[Theorem]{Corollary}
+\newtheorem{Claim}[Theorem]{Claim}
+\newtheorem{Example}[Theorem]{Example}
+\newtheorem{Axiom}[Theorem]{Axiom}
+\newtheorem{Question}[Theorem]{Question}
+\newtheorem{Assumption}[Theorem]{Assumption}
+\newtheorem{hypo}[Theorem]{Conjecture}



@ -283,7 +296,7 @@
 \newcommand{\semN}{\mathbb{N}}
 \newcommand{\domN}{\mathbb{N}}
 \newcommand{\semB}{\mathbb{B}}
-\newcommand{\semNX}{\mathbb{N}[X]}
+\newcommand{\semNX}{\mathbb{N}[\vct{X}]}
 \newcommand{\domK}{K}
 \newcommand{\semK}{\mathcal{K}}

@ -310,6 +323,8 @@

 \newcommand{\dbDomK}[1]{\mathcal{DB}_{#1}}

+\newcommand{\assign}{\varphi}
+
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 % COMPLEXITY
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@ -328,3 +343,7 @@

 %%%Adding stuff below so that long chain of display equations can be split across pages
 \allowdisplaybreaks
+
+\newcommand{\eps}{\epsilon}
+\newcommand{\inparen}[1]{\left({#1}\right)}
+\newcommand{\inset}[1]{\left\{{#1}\right\}}
--- a/main.tex
+++ b/main.tex
@ -176,7 +176,7 @@ sensitive=true

 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \bibliographystyle{plain}
-\bibliography{aaron.bib}
+\bibliography{aaron,atri}



--- a/mult_distinct_p.tex
+++ b/mult_distinct_p.tex
@ -23,23 +23,61 @@ Given a positive integer $k$ and  an undirected graph $G$ with no self-loops or

 The above result means that we cannot hope to count the number of $k$-matchings in $G=(V,E)$ in time $f(k)\cdot |V|^{O(1)}$ for any function $f$. In fact, all known algorithms to solve this problem take time $|V|^{\Omega(k)}$.

-To prove our hardness result, consider a graph $G(V, E)$, where $|E| = \numedge$, $|V| = \numvar$, and $i, j \in [\numvar]$.
+Our hardness result in Section~\ref{sec:single-p} is based on the following conjectured hardness result:
+\begin{hypo}
+\label{conj:graph}
+There exists a constant $\eps_0>0$ such that given an undirected graph $G=(V,E)$, computing exactly the values $\numocc{G}{\tri}$, $\numocc{G}{\threepath}$ and $\numocc{G}{\threedis}$ cannot be done in time $o\inparen{|E|^{1+\eps_0}}$.
+\end{hypo}
+The so-called {\em Triangle detection hypothesis} (cf.~\cite{triang-hard}), which states that detecting whether $G$ has a triangle or not takes time $\Omega\inparen{|E|^{4/3}}$, implies that in Conjecture~\ref{conj:graph} we can take $\eps_0\ge \frac 13$.
+\AR{Need to add something about 3-paths and 3-matchings as well.}

-Consider the query $\poly_{G}(\vct{X}) = q_E(X_1,\ldots, X_\numvar) = \sum\limits_{(i, j) \in E} X_i \cdot X_j$.
+Both of our hardness results use a query polynomial that is based on a simple encoding of the edges of a graph.
+To prove our hardness result, consider a graph $G(V, E)$, where $|E| = \numedge$, $|V| = \numvar$. Our query polynomial will have a variable $X_i$ for every $i \in [\numvar]$.
+Now consider the query 
+\[\poly_{G}(\vct{X}) = \sum\limits_{(i, j) \in E} X_i \cdot X_j.\]
+The hard query polynomial for our problem will be a suitable power $k\ge 3$ of the polynomial above, i.e.
+\begin{Definition}
+Let $G=([n],E)$ be a graph. Then for any $\kElem\ge 1$, define
+\[\poly_{G}^\kElem(X_1,\dots,X_n) = \left(\sum\limits_{(i, j) \in E} X_i \cdot X_j\right)^\kElem.\]
+\end{Definition}

-\AR{need discussion on the `tightness' of various params. First, this is for degree 6 poly-- while things are easy for say deg 2. Second this is for any fixed p.  Finally, we only need porject-join queries to get the hardness results. Also need to compare this with the generality of the approx upper bound results.}
+Our hardness results only need a TIDB instance; further, we consider the special case when all the tuple probabilities are the same value. It is not too hard to see that we can encode the above polynomial in an expression tree of size $\Theta(km)$.

+Following up on the discussion around Example~\ref{ex:intro}, it is easy to see that $\poly_{G}^\kElem(\vct{X})$ is the query polynomial corresponding to the following query:
+\[\poly:- R(A_1),E(A_1,B_1),R(B_1),\dots,R(A_\kElem),E(A_\kElem,B_\kElem),R(B_\kElem)\]
+where generalizing the PDB instance in Example~\ref{ex:intro}, relation $R$ has $n$ tuples corresponding to each vertex in $V=[n]$ each with probability $p$ and $E(A,B)$ has tuples corresponding to the edges in $E$ (each with probability of $1$).\footnote{Technically, $\poly_{G}^\kElem(\vct{X})$ should have variables corresponding to tuples in $E$ as well but since they always are present with probability $1$, we drop those. Our argument also works when all the tuples in $E$ also are present with probability $p$ but to make notation a bit simpler, we make this simplification.}

-For the following discussion, set $\poly_{G}^\kElem(\vct{X}) = \left(q_E(X_1,\ldots, X_\numvar)\right)^\kElem$.
+Note that this implies that our hard query polynomial can be created from a join-project query---by contrast, our approximation algorithm in Section~\ref{sec:algo} can handle lineage polynomials generated by union of select-project-join queries. % (i.e. we do not need union or select operator to derive our hardness result).

+%\AR{need discussion on the `tightness' of various params. First, this is for degree 6 poly-- while things are easy for say deg 2. Second this is for any fixed p.  Finally, we only need porject-join queries to get the hardness results. Also need to compare this with the generality of the approx upper bound results.}

 \subsection{Multiple Distinct $\prob$ Values}
 \label{sec:multiple-p}

+We are now ready to present our main hardness result.
+\begin{Theorem}\label{thm:mult-p-hard-result}
+Computing $\rpoly_G^\kElem(\prob_i,\dots,\prob_i)$ for arbitrary $G$ and any $(2k+1)$ values $\prob_i$ ($0\le i \le 2k$) is \sharpwonehard.
+\end{Theorem}
+We will prove the above result by reducing from the problem of computing the number of $k$-matchings in $G$. Given the current best-known algorithm for this counting problem, our results imply that unless the state of the art $k$-matching algorithms are improved, we cannot hope to solve our problem in time better than $\Omega_k\inparen{m^{k/2}}$, which is only quadratically faster than expanding $\poly_{G}^\kElem(\vct{X})$ into its SOP form and using~\Cref{cor:expct-sop}. By contrast, our approximation algorithm would run in time $O_k\inparen{m}$ on this query (since it runs in linear-time on all query polynomials).
+
+
+As mentioned earlier, we prove our hardness result by presenting a reduction from the problem of counting $\kElem$-matchings in a graph:
 \begin{Lemma}\label{lem:qEk-multi-p}
 Let $\prob_0,\ldots, \prob_{2\kElem}$ be distinct values in $(0, 1]$.  Then given the values $\rpoly_{G}^\kElem(\prob_i,\ldots, \prob_i)$ for $0\leq i\leq 2\kElem$, the number of $\kElem$-matchings in $G$ can be computed in $poly(\kElem)$ time.
 \end{Lemma}

+Before we prove the above Lemma, let us use it to prove~\Cref{thm:mult-p-hard-result}:
+\begin{proof}[Proof of Theorem~\ref{thm:mult-p-hard-result}]
+For the sake of contradiction, let us assume we can solve our problem in $f(\kElem)\cdot m^c$ time for some absolute constant $c$. Then given a graph $G$ we can compute the query polynomial $\rpoly_G^\kElem$ (in the obvious way) in $O(km)$ time. Then after we run our algorithm on $\rpoly_G^\kElem$, we get $\rpoly_{G}^\kElem(\prob_i,\ldots, \prob_i)$ for $0\leq i\leq 2\kElem$ in additional $f(\kElem)\cdot m^c$ time. \Cref{lem:qEk-multi-p} then computes the number of $k$-matchings in $G$ in $poly(\kElem)$ time. Thus, overall we have an algorithm for computing the number of $k$-matchings in time
+\begin{align*}
+ O(km) + f(\kElem)\cdot m^c + poly(\kElem)
+&\le \inparen{poly(\kElem) + f(\kElem)}\cdot m^{c+1} \\
+&\le \inparen{poly(\kElem) + f(\kElem)}\cdot n^{2c+2},
+\end{align*}
+which contradicts~\cref{thm:k-match-hard}.
+\end{proof}
+
+Finally, we are ready to prove~\Cref{lem:qEk-multi-p}:
 \begin{proof}[Proof of ~\cref{lem:qEk-multi-p}]
 %It is trivial to see that one can readily expand the exponential expression by performing the $n^\kElem$ product operations, yielding the polynomial in the sum of products form of the lemma statement.  By definition $\rpoly_{G}^\kElem$ reduces all variable exponents greater than $1$ to $1$.  Thus, a monomial such as $X_i^\kElem X_j^\kElem$ is $X_iX_j$ in $\rpoly_{G}^\kElem$, and the value after substitution is $p_i\cdot p_j = p^2$.  Further, that the number of terms in the sum is no greater than $2\kElem + 1$, can be easily justified by the fact that each edge has two endpoints, and the most endpoints occur when we have $\kElem$ distinct edges (such a subgraph is also known as a $\kElem$-matching), with non-intersecting points, a case equivalent to $p^{2\kElem}$.
 We will show that $\rpoly_{G}^\kElem(\prob,\ldots, \prob) = \sum\limits_{i = 0}^{2\kElem} c_i \cdot \prob^i$.  First, since $\poly_G^\kElem(\vct{X})$ has $\kElem$ products of monomials of degree $2$, it follows that $\poly_G^\kElem(\vct{X})$ has degree $2\kElem$.  We can further write $\poly_{G}^{\kElem}(\vct{X})$ in its expanded SOP form,
@ -62,14 +100,6 @@ Then, since we have $\kElem!$ duplicates of each distinct $\kElem$-matching, and

 \qed

-\begin{Corollary}\label{cor:mult-p-hard-result}
-Computing $\rpoly(\vct{X})$ given multiple distinct $\prob$ values is $\#W[1]$-hard.
-\end{Corollary}
-\begin{proof}[Proof of Corollary ~\ref{cor:mult-p-hard-result}]
-The proof follows by ~\cref{thm:k-match-hard} and ~\cref{lem:qEk-multi-p}.
-\end{proof}
-
-\qed



--- a/ra-to-poly.tex
+++ b/ra-to-poly.tex
@ -15,25 +15,167 @@ For a probabilistic  database $\pdb = (\idb, \pd)$,  the result of a query is th

 \[\forall \db \in \query(\idb): \pd'(\db) = \sum_{\db' \in \idb: \query(\db') = \db} \pd(\db') \]

-Note that in this work we consider multisets, i.e., each possible world is a set of multiset relations and queries are evaluated using bag semantics. We will use K-relations to model multisets. A \emph{K-relation}~\cite{DBLP:conf/pods/GreenKT07} is a relation whose tuples are each annotated with elements from a commutative semiring $\semK = (\domK, \addK, \multK, \zeroK, \oneK)$.  A commutative semiring is a structure with a domain $\domK$ and associative and commutative binary operations $\addK$ and $\multK$ such that $\multK$ distributes over $\addK$, $\zeroK$ is the identity of $\addK$, $\oneK$ is the identity of $\multK$, and $\zeroK$ annihilates all elements of $\domK$ when being combined with $\multK$.
+Note that in this work we consider multisets, i.e., each possible world is a set of multiset relations and queries are evaluated using bag semantics. We will use K-relations to model multisets. A \emph{K-relation}~\cite{DBLP:conf/pods/GreenKT07} is a relation whose tuples are annotated with elements from a commutative semiring $\semK = (\domK, \addK, \multK, \zeroK, \oneK)$.  A commutative semiring is a structure with a domain $\domK$ and associative and commutative binary operations $\addK$ and $\multK$ such that $\multK$ distributes over $\addK$, $\zeroK$ is the identity of $\addK$, $\oneK$ is the identity of $\multK$, and $\zeroK$ annihilates all elements of $\domK$ when being combined with $\multK$.
 Let $\udom$ be a countable domain of values.
 Formally, an n-ary $\semK$-relation over $\udom$ is a function $\rel: \udom^n \to \domK$ with finite support $\support{\rel} = \{ \tup \mid \rel(\tup) \neq \zeroK \}$.
 A $\semK$-database is a set of $\semK$-relations. It will be convenient to also interpret a $\semK$-database as a function from tuples to annotations. Thus, $\rel(t)$ ($\db(t)$) denotes the annotation associated by $\semK$-relation $\rel$ ($\semK$-database $\db$) to tuple $t$.
-We review the semantics of positive relational algebra queries over $\semK$-relations below. \BG{should we use DL instead}
+We review the semantics of positive relational algebra queries over $\semK$-relations below.

-Consider the semiring $\semN = (\domN,+,\times,0,1)$ of natural number. $\semN$-databases are used to model bag semantics by annotating each tuple with its multiplicity. A  probabilistic $\semN$-databases is a PDB where each possible world is a $\semN$-database. We will study the problem of evaluating statical moments of query results over such databases.  Specifically, given a probabilistic $\semN$-database $\pdb = (\idb, \pd)$, query $\query$, and possible result tuple $t$,  we treat $\query(\db)(t)$ as a random $\semN$-valued variable and are interested in computing its expectation  $\expct_{\db \in \idb}[\query(\db)(t)]$:
+Consider the semiring $\semN = (\domN,+,\times,0,1)$ of natural numbers. $\semN$-databases are used to model bag semantics by annotating each tuple with its multiplicity. A probabilistic $\semN$-database ($\semN$-PDB) is a PDB where each possible world is a $\semN$-database. We will study the problem of evaluating statistical moments of query results over such databases.  Specifically, given a probabilistic $\semN$-database $\pdb = (\idb, \pd)$, query $\query$, and possible result tuple $t$,  we treat $\query(\db)(t)$ as a random $\semN$-valued variable and are interested in computing its expectation  $\expct_{\idb \sim \pd}[\query(\db)(t)]$:

-\[\expct_{\db \in \idb}[\query(\db)(t)] = \sum_{\db \in \idb} \query(\db)(t) \cdot \pd(\db) \]
+\begin{align}\label{eq:bag-expectation}
+\expct_{\idb \sim \pd}[\query(\db)(t)] = \sum_{\db \in \idb} \query(\db)(t) \cdot \pd(\db)
+\end{align}

 Intuitively, the expectation of $\query(\db)(t)$ is the number of duplicates of $t$ we expect to find in the query result.
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\subsection{$\semK$-relational Query Semantics}\label{sec:semk-query-semantics}
+
+For completeness, we briefly review the semantics for $\raPlus$ queries over $\semK$-relations~\cite{DBLP:conf/pods/GreenKT07}.
+We use $\eval{\cdot}{\db}$ to denote evaluating query $\query$ over $\semK$-database $\db$. In the definition shown below, we assume that tuples are of appropriate arity and use $\project_A(\tup)$ to denote the projection of tuple $\tup$ on a list of attributes $A$.
+
+\begin{align*}
+&\eval{\project_A(\rel)}(\tup)&& = &&\sum_{\tup': \project_A(\tup') = \tup} \eval{\rel}(\tup')\\
+&\eval{(\rel_1 \union \rel_2)}(\tup)&& = &&\eval{\rel_1}(\tup) + \eval{\rel_2}(\tup)\\
+&\eval{(\rel_1 \join \rel_2)}(\tup) && = &&\eval{\rel_1}(\project_{\sch(\rel_1)}(\tup)) \times \eval{\rel_2}(\project_{\sch(\rel_2)}(\tup))	\\
+&\eval{\select_\theta(\rel)}(\tup) && = &&\begin{cases}
+					\eval{\rel}(\tup)	&\text{if }\theta(\tup) = 1\\
+					0		&\text{otherwise}.
+				\end{cases}\\
+&\eval{R}(\tup) && = &&\rel(\tup)
+\end{align*}
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\subsection{$\semNX$ as a Representation System}\label{sec:semnx-as-repr}
+
+Let $\semNX$ denote the set of polynomials over variables $\vct{X}$ with natural number co-efficients and exponents.
+Consider now the semiring $(\semNX, +, \cdot, 0, 1)$ whose elements are $\semNX$ and addition and multiplication are standard addition and multiplication of polynomials. We will utilize $\semNX$-databases $\db$ paired with a probability distribution to represent $\semN$-PDBs.\BG{Need more motivation?}  To justify the use of $\semNX$-databases, we need to show that we can encode any $\semN$-PDB in this way and that the query semantics over this representation coincides with query semantics over $\semN$-PDBs. For that it will be opportune to define the notion of representation systems.\BG{cite}
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\begin{Definition}[Representation System]\label{def:representation-syste}
+  A representation system for $\semN$-PDBs is a tuple $(\reprs, \rmod)$ where $\reprs$ is a set of representations and $\rmod$ associates with each $\repr \in \reprs$ a $\semN$-PDB $\pdb$. We say that a representation system is \emph{closed} under a class of queries $\qClass$ if for any query $\query \in \qClass$ we have:
+%
+  \[ \rmod(\query(\repr)) = \query(\rmod(\repr)) \]
+
+  A representation system is \emph{complete} if for every $\semN$-PDB $\pdb$ there exists $\repr \in \reprs$ such that:
+%
+  \[ \rmod(\repr) = \pdb \]
+
+\end{Definition}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+As mentioned above we will use $\semNX$-databases paired with a probability distribution as a representation system.
+We refer to such databases as $\semNX$-PDBs and use bold symbols to distinguish them from possible worlds (which are $\semN$-databases).
+Formally, a $\semNX$-PDB is a $\semNX$-database and a probability distribution over assignments $\assign$ of the variables $\vct{X}$ occurring in annotations of $\db$ to $\{0,1\}$.  Note that an assignment $\assign: \vct{X} \to \{0,1\}$ can be represented as a vector $\vct{w} \in \{0,1\}^n$ where $\vct{w}[i]$ records the value assigned to $X_i$. Thus, from now on we will solely use such vectors and implicitly understand them to represent assignments. Given an assignment $\assign$ we use $\assign(\pxdb)$ to denote the semiring homomorphism $\semNX \to \semN$ that applies the assignment $\assign$ to all variables of a polynomial and evaluates the resulting expression in $\semN$.\BG{explain connection to homomorphism lifting in K-relations}
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\begin{Definition}[$\semNX$-PDBs]\label{def:semnx-pdbs}
+  A $\semNX$-PDB $\pxdb$ over variables $\vct{X} = \{X_1, \ldots, X_n\}$ is a tuple $(\db,\pd)$ where $\db$ is an $\semNX$-database and $\pd$ is a probability distribution over $\vct{w} \in \{0,1\}^n$. We use $\assign_{\vct{w}}$ to denote the assignment corresponding to $\vct{w} \in \{0,1\}^n$. The $\semN$-PDB $\rmod(\pxdb) = (\idb, \pd')$ encoded by $\pxdb$ is defined as:
+  \begin{align*}
+    \idb      & = \{ \assign_{\vct{w}}(\pxdb) \mid \vct{w} \in  \{0,1\}^n \} \\
+    \pd'(\db) & = \sum_{\vct{w} \in \{0,1\}^n: \assign_{\vct{w}}(\pxdb) = \db} \pd(\vct{w})
+  \end{align*}
+\end{Definition}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+\BG{Need an example here}
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\begin{Proposition}\label{prop:semnx-pdbs-are-a-}
+$\semNX$-PDBs are a complete representation system for $\semN$-PDBs that is closed under $\raPlus$ queries.
+\end{Proposition}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\begin{proof}
+  To prove that $\semNX$-PDBs are complete consider the following construction that for any $\semN$-PDB $\pdb$ produces a $\semNX$-PDB $\pxdb$  such that $\rmod(\pxdb) = \pdb$.
+\BG{Add: create a number of variables $X_{ij}$ for each possible world $i$ that correspond to the maximum multiplicity of a tuple in the world. Then each tuple is annotated with a sum of variables $\sum_{i} \sum_{j \leq D_i(t)} X_{ij}$ and the probability distribution assigns only vectors where $X_{ij} = 1$ for a fixed $i$ and all $j$ a probability of $\pd(D_i)$}
+The closure under $\raPlus$ queries follows from the fact that an assignment $\vct{X} \to \{0,1\}$ is a semiring homomorphism and that semiring homomorphisms commute with queries over $\semK$-relations.
+\end{proof}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+Since $\semNX$-PDBs $\pxdb$ are a complete representation system closed under $\raPlus$, computing the expectation of the  multiplicity of a tuple $t$ in the result of an $\raPlus$ query over the $\semN$-PDB $\rmod(\pxdb)$ is the same as computing the expectation of the polynomial $\query(\pxdb)(t)$.
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\begin{Proposition}[Expectation of polynomials]\label{prop:expection-of-polynom}
+  Given a $\semN$-PDB $\pdb$ and $\semNX$-PDB $\pxdb$ such that $\rmod(\pxdb) = \pdb$, we have:
+  \[ \expct_{\idb \sim \pd}[\query(\db)(t)] = \expct_{\vct{\rw} \sim \pd}\pbox{\poly(\rw)} \]
+\end{Proposition}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+\BG{Define TIDB, BIDB as subclasses of $\semNX$ with restrictions}
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\begin{Definition}[TIDBs and BIDBs]\label{def:tidbs-and-bidbs}
+  A \emph{TIDB} $\pxdb = (\db, \pd)$ is a $\semNX$-PDB such that (i) every tuple is annotated with either $0$ or a unique variable $X_i$ and (ii) the probability distribution $\pd$ is such that all variables are independent.
+
+  A \emph{BIDB} $\pxdb = (\db, \pd)$ is a $\semNX$-PDB  such that (i) every tuple is annotated with either $0$ or a unique variable $X_i$ and (ii) that the tuples $\tup$ of $\pxdb$ for which $\pxdb(\tup) \neq 0$ can be partitioned into a set of blocks such that variables from separate blocks are independent of each other and variables from the same blocks are disjoint events.
+\end{Definition}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+Note that the main difference to the standard definitions of TIDB and BIDBs is that we define them as subclasses of $\semNX$-PDBs and that we use bag semantics. Even though tuples cannot occur more than once in the input TIDB or BIDB, they can occur with a multiplicity larger than one in the result of a query.
+\BG{Oliver's conjecture: Bag-TIDBs + Q can express any finite bag-PDB:
+A well-known result for set semantics PDBs is that while not all finite PDBs can be encoded as TIDBs, any finite PDB can be encoded using a TIDB and a query. An analog result holds in our case: any finite $\semN$-PDB can be encoded as a bag TIDB and a query (WHAT CLASS? ADD PROOF)
+}
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\subsection{Expression Trees}\label{sec:expression-trees}
+
+In the following we will make use of expression trees to encode polynomials which we define formally in this subsection.
+For illustrative purposes consider the polynomial $\poly(\vct{X}) = 2x^2 + 3xy - 2y^2$ over $\vct{X} = (x,y)$.
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\begin{Definition}[Expression Tree]\label{def:express-tree}
+Consider a vector of variables $\vct{X}$.
+  An expression tree $\etree$ over $\vct{X}$ is a binary %an ADT logically viewed as an n-ary
+tree, whose internal nodes are from the set $\{+, \times\}$, with leaf nodes being either from the set $\mathbb{R}$ $(\tnum)$ or from the set of monomials $(\var)$.  The members of $\etree$ are \type, \val, \vari{partial}, \vari{children}, and \vari{weight}, where \type is the type of value stored in the node $\etree$ (i.e., one of $\{+, \times, \var, \tnum\}$), \val is the value stored, and \vari{children} is the list of $\etree$'s children where $\etree_\lchild$ is the left child and $\etree_\rchild$ the right child.
+\end{Definition}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+We ignore the remaining fields (\vari{partial} and \vari{weight}) for now. Their purpose will become clear in~\Cref{sec:algo}. Note that $\etree$ need not encode an expression in standard monomial basis.  For instance, $\etree$ could represent a compressed form of the running example, such as $(x + 2y)(2x - y)$.
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\begin{Definition}[poly$(\cdot)$]\label{def:poly-func}
+Denote $poly(\etree)$ to be the function that takes as input expression tree $\etree$ and outputs its corresponding polynomial.  $poly(\cdot)$ is recursively defined on $\etree$ as follows, where $\etree_\lchild$ and $\etree_\rchild$ denote the left and right child of $\etree$ respectively.
+
+%	\begin{align*}
+%		&\etree.\type = +\mapsto&& \polyf(\etree_\lchild) + \polyf(\etree_\rchild)\\
+%		&\etree.\type = \times\mapsto&& \polyf(\etree_\lchild) \cdot \polyf(\etree_\rchild)\\
+%		&\etree.\type =  \var \text{ OR } \tnum\mapsto&& \etree.\val
+%	\end{align*}
+
+
+\begin{equation*}
+	\polyf(\etree) = \begin{cases}
+					\polyf(\etree_\lchild) + \polyf(\etree_\rchild)			&\text{ if \etree.\type } = +\\
+					\polyf(\etree_\lchild) \cdot \polyf(\etree_\rchild)		&\text{ if \etree.\type } = \times\\
+					\etree.\val									&\text{ if \etree.\type } = \var \text{ OR } \tnum.
+				\end{cases}
+\end{equation*}
+\end{Definition}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+Note that addition and multiplication above follow the standard interpretation over polynomials.
+%Specifically, when adding two monomials whose variables and respective exponents agree, the coefficients corresponding to the monomials are added and their sum is multiplied to the monomial.  Multiplication here is denoted by concatenation of the monomial and coefficient.  When two monomials are multiplied, the product of each corresponding coefficient is computed, and the variables in each monomial are multiplied, i.e., the exponents of like variables are added.  Again we notate this by the direct product of coefficient product and all disitinct variables in the two monomials, with newly computed exponents.
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\begin{Definition}[Expression Tree Set]\label{def:express-tree-set}$\etreeset{\smb}$ is the set of all possible expression trees $\etree$, such that $poly(\etree) = \poly(\vct{X})$.
+\end{Definition}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+For our running example, $\etreeset{\smb} = \{2x^2 + 3xy - 2y^2, (x + 2y)(2x - y), x(2x - y) + 2y(2x - y), 2x(x + 2y) - y(x + 2y)\}$.  Note that \cref{def:express-tree-set} implies that $\etree \in \etreeset{poly(\etree)}$.
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\subsection{Problem Definition}\label{sec:problem-definition}
+
 We are now ready to formally state the main problem addressed in this work.

 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \begin{Definition}[The Expected Result Multiplicity Problem]\label{def:the-expected-multipl}
+Let $\vct{X} = (X_1, \ldots, X_n)$, and $\pdb$ be an $\semNX$-PDB over $\vct{X}$ with probability distribution $\pd$ over assignments $\vct{X}  \to \{0,1\}$, $\query$ an n-ary query, and $t$ an n-ary tuple.
  The \expectProblem is defined as follows:
 \begin{itemize}
-\item \textbf{Input}: A $\semN$-PDB $\pdb$, n-ary query $\query$, an n-ary tuple $t$
-\item \textbf{Output}:
+\item \textbf{Input}: Given an expression tree $\etree \in \etreeset{\smb}$ for $\poly(\vct{X}) = \query(\pdb)(t)$
+\item \textbf{Output}: $\expct_{\vct{X} \sim \pd}[\poly(\vct{X})]$
 \end{itemize}
 \end{Definition}
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@ -44,6 +186,9 @@ We are now ready to formally state the main problem addressed in this work.
 % The possible worlds semantics gives a framework for how to think about running queries over $\idb$.  Given a query $\query$, $\query$ is deterministically run over each $\db \in \idb$, and the output of $\query(\idb)$ is defined as the set of results (worlds) from running $\query$ over each $\db_i \in \idb$.  We write this formally as,
 % \[\query(\idb) = \comprehension{\query(\db)}{\db \in \idb}.\]

+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\subsection{Previous}
+


 \begin{Definition}[$\bi$~\cite{DBLP:series/synthesis/2011Suciu}]