paper-BagRelationalPDBsAreHard/ra-to-poly.tex

%root: main.tex
%!TEX root=./main.tex
%\onecolumn
\section{Background and Notation}\label{sec:background}


\subsection{Probabilistic Databases (PDBs)}

An \textit{incomplete database} $\idb$ is a set of deterministic databases $\db$ called possible worlds.
Denote the schema of $\db$ as $\sch(\db)$. A \textit{probabilistic database} $\pdb$ is a pair $(\idb, \pd)$ where $\idb$ is an incomplete database and $\pd$ is a probability distribution over $\idb$. Queries over probabilistic databases are evaluated using the so-called possible world semantics. Under possible world semantics, the result of a query $\query$ over an incomplete database $\idb$ is the set of query answers produced by evaluating $\query$ over each possible world:
\[\query(\idb) = \comprehension{\query(\db)}{\db \in \idb}\]

For a probabilistic  database $\pdb = (\idb, \pd)$,  the result of a query is the pair $(\query(\idb), \pd')$ where $\pd'$ is a probability distribution over $\query(\idb)$  that assigns to each possible query result the sum of the probabilities of the worlds that produce this answer:
\[\forall \db \in \query(\idb): \probOf'(\db) = \sum_{\db' \in \idb: \query(\db') = \db} \probOf(\db') \]

Note that in this work we consider multisets, i.e., each possible world is a set of multiset relations and queries are evaluated using bag semantics. We will use $\semK$-relations to model multisets. A \emph{$\semK$-relation}~\cite{DBLP:conf/pods/GreenKT07} is a relation whose tuples are annotated with elements from a commutative semiring $\semK = (\domK, \addK, \multK, \zeroK, \oneK)$.  A commutative semiring is a structure with a domain $\domK$ and associative and commutative binary operations $\addK$ and $\multK$ such that $\multK$ distributes over $\addK$, $\zeroK$ is the identity of $\addK$, $\oneK$ is the identity of $\multK$, and $\zeroK$ annihilates all elements of $\domK$ when combined by $\multK$.
Let $\udom$ be a countable domain of values.
Formally, an $n$-ary $\semK$-relation over $\udom$ is a function $\rel: \udom^n \to \domK$ with finite support $\support{\rel} = \{ \tup \mid \rel(\tup) \neq \zeroK \}$.
A $\semK$-database is a set of $\semK$-relations. It will be convenient to also interpret a $\semK$-database as a function from tuples to annotations. Thus, $\rel(t)$ (resp., $\db(t)$) denotes the annotation associated by $\semK$-relation $\rel$ ($\semK$-database $\db$) to $t$.
We review positive relational algebra semantics for $\semK$-relations below.


Consider the semiring $\semN = (\domN,+,\times,0,1)$ of natural numbers. $\semN$-databases model bag semantics by annotating each tuple with its multiplicity. A  probabilistic $\semN$-database ($\semN$-PDB) is a PDB where each possible world is an $\semN$-database. We study the problem of computing statistical moments for query results over such databases.  Specifically, given a probabilistic $\semN$-database $\pdb = (\idb, \pd)$, query $\query$, and possible result $t$,  we treat $\query(\db)(t)$ as a random $\semN$-valued variable and are interested in computing its expectation  $\expct_{\idb \sim \probDist}[\query(\db)(t)]$:
%
\begin{align}\label{eq:bag-expectation}
\expct_{\idb \sim \probDist}[\query(\db)(t)] = \sum_{\db \in \idb} \query(\db)(t) \cdot \probOf(\db)
\end{align}
%
Intuitively, the expectation of $\query(\db)(t)$ is the number of duplicates of $t$ we expect to find in the result of query $\query$.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Representation System and Semantics}\label{sec:repr-system-semantics}

\subsubsection{$\semK$-relational Query Semantics}
For completeness, we briefly review the semantics for $\raPlus$ queries over $\semK$-relations~\cite{DBLP:conf/pods/GreenKT07}.
We use $\evald{\cdot}{\db}$ to denote the result of evaluating query $\query$ over $\semK$-database $\db$. In the definition shown below, we assume that tuples are of appropriate arity and use $\project_A(\tup)$ to denote the projection of tuple $\tup$ on a list of attributes $A$.  Furthermore, $\theta(\tup)$ denotes the (Boolean) result of evaluating condition $\theta$ over $\tup$.
%
\begin{align*}
                                            & \evald{\project_A(\rel)}{\db}(\tup)       &  & = &  & \sum_{\tup': \project_A(\tup') = \tup} \evald{\rel}{\db}(\tup')                                               \\
                                            & \evald{(\rel_1 \union \rel_2)}{\db}(\tup) &  & = &  & \evald{\rel_1}{\db}(\tup) \addK \evald{\rel_2}{\db}(\tup)                                                        \\
                                            & \evald{(\rel_1 \join \rel_2)}{\db}(\tup)  &  & = &  & \evald{\rel_1}{\db}(\project_{\sch(\rel_1)}(\tup)) \multK \evald{\rel_2}{\db}(\project_{\sch(\rel_2)}(\tup)) \\
                                            & \evald{\select_\theta(\rel)}{\db}(\tup)   &  & = &  & \begin{cases}
					\evald{\rel}{\db}(\tup)	& \text{if }\theta(\tup)                                                                                                                                          \\
					\zeroK                       & \text{otherwise}.
				\end{cases}                                                                                                                                                                                      \\
                                            & \evald{R}{\db}(\tup)                      &  & = &  & \rel(\tup)
\end{align*}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection{$\semNX$ as a Representation System}\label{sec:semnx-as-repr}

Let $\semNX$ denote the set of polynomials over variables $\vct{X}$ with natural number coefficients and exponents.
Consider now the semiring $(\semNX, +, \cdot, 0, 1)$ whose domain is $\semNX$, with the standard addition and multiplication of polynomials. 
We will use $\semNX$-PDB $\pxdb$, defined as the tuple $(\db, \pd)$, where $\semNX$-database $\db$ is paired with probability distribution $\pd$.  
We denote by $\polyForTuple$ the annotation of tuple $t$ in the result of $\query$ on an implicit $\semNX$-PDB (i.e., $\polyForTuple = \query(\pxdb)(t)$ for some $\pxdb$) and as before, interpret it as a function $\polyForTuple: \{0,1\}^{|\vct X|} \rightarrow \semN$ from vectors of variable assignments to the corresponding value of the annotating polynomial.
$\semNX$-PDBs and a function $\rmod$ from an $\semNX$-PDB to an equivalent $\semN$-PDB are both formalized in \Cref{subsec:supp-mat-background}.

 
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Proposition}[Expectation of polynomials]\label{prop:expection-of-polynom}
  Given an $\semN$-PDB $\pdb = (\idb,\pd)$ and $\semNX$-PDB $\pxdb = (\db',\pd')$ where $\rmod(\pxdb) = \pdb$:
  \[ \expct_{\idb \sim \pd}[\query(\db)(t)] = \expct_{\vct{W} \sim \pd'}\pbox{\polyForTuple(\vct{W})} \]
\end{Proposition}
\noindent A formal proof of \Cref{prop:expection-of-polynom} is given in \Cref{subsec:expectation-of-polynom-proof}.  
This proposition shows that computing expected tuple multiplicities is equivalent to computing the expectation of a polynomial (for that tuple) from a probability distribution over all possible assignments of variables in the polynomial to $\{0,1\}$.
We focus on this problem from now on, assume an implicit result tuple, and so drop the subscript from $\polyForTuple$ (i.e., $\poly$ is used as a polynomial from now on).


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection{\tis and \bis}
\label{subsec:tidbs-and-bidbs}
In this paper, we focus on two popular forms of PDBs: Block-Independent (\bi) and Tuple-Independent (\ti) PDBs.
%
A \bi $\pxdb = (\db, \pd)$ is an $\semNX$-PDB  such that (i) every tuple is annotated with either $0$ or a unique variable $X_i$ and (ii) the tuples $\tup$ of $\pxdb$ for which $\pxdb(\tup) \neq 0$ can be partitioned into a set of blocks such that variables from separate blocks are independent of each other and variables from the same block are disjoint events.
%
A \emph{\ti} is a \bi where each block contains exactly one tuple.
\Cref{subsec:supp-mat-ti-bi-def} explains \tis and \bis in greater detail.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


\input{poly-form.tex}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Problem Definition}\label{sec:expression-trees}

We first formally define circuits, an encoding of polynomials that we use throughout the paper.  Since we work specifically with \emph{lineage} circuits, we drop the qualifier ``lineage'' and refer to them simply as circuits.

For illustrative purposes consider the polynomial $\poly(\vct{X}) = 2X^2 + 3XY - 2Y^2$ over $\vct{X} = [X, Y]$.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%\oldstuff{\begin{Definition}[Expression Tree]\label{def:express-tree}
%Consider a vector of variables $\vct{X}$.
%  An expression tree $\etree$ over $\vct{X}$ is a binary %an ADT logically viewed as an n-ary
%tree, whose internal nodes are from the set $\{+, \times\}$, with leaf nodes being either from the set $\mathbb{R}$ $(\tnum)$ or from the set of monomials $(\var)$.  The members of $\etree$ are \type, \val, \vari{partial}, \vari{children}, and \vari{weight}, where \type is the type of value stored in the node $\etree$ (i.e. one of $\{+, \times, \var, \tnum\}$, \val is the value stored, and \vari{children} is the list of $\etree$'s children where $\etree_\lchild$ is the left child and $\etree_\rchild$ the right child.
%\end{Definition}}

\revision{

We represent query polynomials via {\em arithmetic circuits}~\cite{arith-complexity}, a standard way to represent polynomials over fields (particularly in the field of algebraic complexity) that we use for polynomials over $\mathbb N$ in the obvious way.

\begin{Definition}[Circuit]\label{def:circuit}
A circuit $\circuit$ is a Directed Acyclic Graph (DAG) whose source nodes (in-degree of $0$) consist of elements in either $\reals$ or $\vct{X}$.  The internal nodes and sink node of $\circuit$ have binary input and are either sum ($\circplus$) or product ($\circmult$) gates.  

$\circuit$ additionally has the following members: \type, \val, \vari{partial}, \vari{input}, \vari{Lweight}, and \vari{Rweight}, where \type is the type of value stored in the node $\circuit$ (i.e., one of $\{\circplus, \circmult, \var, \tnum\}$), \val is the value stored (a constant or variable), and \vari{input} is the list of \circuit 's inputs, where $\circuit_\linput$ is the left input and $\circuit_\rinput$ the right input.  When the underlying DAG is a tree (with edges pointing towards the root), we will refer to the structure as an expression tree \etree.  Note that in such a case, the root of \etree is analogous to the sink of the \circuit.
\end{Definition}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


As stated in~\Cref{def:circuit}, every internal node has at most two in-edges, is labeled as an addition or a multiplication node, and has no limit on its outdegree.
Note that if we limit the outdegree to one, then we get expression trees. 
\begin{Example}
The circuit \circuit in~\Cref{fig:circuit-express-tree} encodes the polynomial $XY + WZ$.  Note that such an encoding lends itself naturally to having all gates with an outdegree of $1$.  Note further that \circuit is indeed a tree with edges pointing towards the root.
\end{Example}

\begin{figure}[t]
	\begin{tikzpicture}[thick]
		\node[tree_node] (a1) at (0, 0){$\boldsymbol{X}$};
		\node[tree_node] (b1) at (1, 0){$\boldsymbol{Y}$};
		\node[tree_node] (c1) at (2, 0){$\boldsymbol{W}$};
		\node[tree_node] (d1) at (3, 0){$\boldsymbol{Z}$};

		\node[tree_node] (a2) at (0.5, 1){$\boldsymbol{\circmult}$};
		\node[tree_node] (b2) at (2.5, 1){$\boldsymbol{\circmult}$};

		\node[tree_node] (a3) at (1.5, 2){$\boldsymbol{\circplus}$};

		\draw[->] (a1) -- (a2);
		\draw[->] (b1) -- (a2);
		\draw[->] (c1) -- (b2);
		\draw[->] (d1) -- (b2);
		\draw[->] (a2) -- (a3);
		\draw[->] (b2) -- (a3);
	\end{tikzpicture}
	\caption{Circuit encoding $XY + WZ$, a special case of an expression tree}
	\label{fig:circuit-express-tree}
\end{figure}

%\begin{figure}[t]
%
%\resizebox{0.65\columnwidth}{!}{
%\begin{tikzpicture}[thick, level distance=0.9cm,level 1/.style={sibling distance=3.55cm}, level 2/.style={sibling distance=1.8cm}, level 3/.style={sibling distance=0.8cm}]% level/.style={sibling distance=6cm/(#1 * 1.5)}]
%	\node[tree_node](root){$\boldsymbol{\times}$}
%		child{node[tree_node]{$\boldsymbol{+}$}
%			child{node[tree_node]{x}
%				%child[missing]{node[tree_node]{}}
%				%child{node[tree_node]{x}}
%				}
%			child{node[tree_node]{$\boldsymbol{\times}$}
%				child{node[tree_node]{2}}
%				child{node[tree_node]{y}}
%				}
%			}
%		child{node[highlight_treenode] (TR) {$\boldsymbol{+}$}
%			child{node[tree_node]{$\boldsymbol{\times}$}
%				child{node[tree_node]{2}}
%				child{node[tree_node]{x}}
%				}
%			child{node[tree_node]{$\boldsymbol{\times}$}
%				child{node[tree_node] (neg-leaf) {-1}}
%				child{node[tree_node]{y}}
%				}
%			%child[sibling distance= 0cm, grow=north east, red]{node[tree_node]{$\circuit_\rchild$}}
%			};
%%		\node[below=2pt  of neg-leaf, inner sep=1pt, blue] (neg-comment) {\textbf{Negation pushed to leaf nodes}};
%%		\draw[<-|, blue] (neg-leaf) -- (neg-comment);
%		\node[above right=0.7cm of TR, highlight_color, inner sep=0pt, font=\bfseries] (tr-label) {$\circuit_\rinput$};
%		\node[above right=0.7cm of root, highlight_color, inner sep=0pt, font=\bfseries] (t-label) {$\circuit$};
%		\draw[<-|, highlight_color] (TR) -- (tr-label);
%		\draw[<-|, highlight_color] (root) -- (t-label);
%\end{tikzpicture}
%}
%\vspace*{-2mm}
%\caption{Expression tree $\circuit$ for the product $\boldsymbol{(x + 2y)(2x - y)}$.}
%\label{fig:expr-tree-T}
%\trimfigurespacing
%\end{figure}


We ignore the remaining fields (\vari{partial}, \vari{Lweight}, and \vari{Rweight}) until \Cref{sec:algo}.
}
  %Also note that the out degree of any internal node can grow with the circuit size.

The semantics of \revision{circuits} follows the obvious interpretation.  We \revision{next} define \revision{their relationship with polynomials} formally:
\begin{Definition}[$\polyf(\cdot)$]\label{def:poly-func}
Denote \revision{$\polyf(\circuit)$} to be the function from circuit \revision{$\circuit$} to its corresponding polynomial.  $\polyf(\cdot)$ is recursively defined on \revision{$\circuit$} as follows, with addition and multiplication following the standard interpretation for polynomials:

\begin{equation*}
	\polyf(\revision{\circuit}) = \begin{cases}
					\polyf(\revision{\circuit_\linput}) + \polyf(\revision{\circuit_\rinput})			&\text{ if \revision{\circuit}.\type } = \revision{\circplus}\\
					\polyf(\revision{\circuit_\linput}) \cdot \polyf(\revision{\circuit_\rinput})		&\text{ if \revision{\circuit}.\type } = \revision{\circmult}\\
					\revision{\circuit.\val}									&\text{ if \revision{\circuit}.\type } = \var \text{ OR } \tnum.
				\end{cases}
\end{equation*}
\end{Definition}

Note that $\circuit$ need not encode an expression in standard monomial basis, while as stated previously a polynomial is considered to be in SMB, and the output of \polyf($\cdot$) is therefore in SMB.  For instance, $\circuit$ could represent a compressed form of the running example, such as $(X + 2Y)(2X - Y)$\revision{
, as shown in \Cref{fig:circuit}.

\begin{figure}[t]
	\begin{tikzpicture}[thick]
		\node[tree_node] (a1) at (0, 0) {$\boldsymbol{X}$};
		\node[tree_node] (b1) at (1.5, 0) {$\boldsymbol{2}$};
		\node[tree_node] (c1) at (3, 0) {$\boldsymbol{Y}$};
		\node[tree_node] (d1) at (4.5, 0) {$\boldsymbol{-1}$};

		\node[tree_node] (a2) at (0.75, 1) {$\boldsymbol{\circmult}$};
		\node[tree_node] (b2) at (2.25, 1) {$\boldsymbol{\circmult}$};
		\node[tree_node] (c2) at (3.75, 1) {$\boldsymbol{\circmult}$};

		\node[tree_node] (a3) at (0.55, 2) {$\boldsymbol{\circplus}$};
		\node[tree_node] (b3) at (3.75, 2) {$\boldsymbol{\circplus}$};

		\node[tree_node] (a4) at (2.25, 3) {$\boldsymbol{\circmult}$};
		

		\draw[->] (a1) -- (a2);
		\draw[->, thick] (a1) -- (a3);
		\draw[->] (b1) -- (a2);
		\draw[->] (b1) -- (b2);
		\draw[->] (c1) -- (c2);
		\draw[->] (c1) -- (b2);
		\draw[->] (d1) -- (c2);
		\draw[->] (a2) -- (b3);
		\draw[->] (b2) -- (a3);
		\draw[->] (c2) -- (b3);
		\draw[->] (a3) -- (a4);
		\draw[->] (b3) -- (a4);
		\draw[->] (a4) -- (2.25, 3.5);
	\end{tikzpicture}
	\caption{Circuit encoding of the formula $(X + 2Y)(2X - Y)$}
	\label{fig:circuit}
\end{figure}
}

\oldstuff{
\begin{Definition}[Expression Tree Set]\label{def:express-tree-set}$\etreeset{\smb}$ is the set of all possible expression trees $\etree$, such that $\polyf(\etree) = \poly(\vct{X})$.
\end{Definition}

\revision{
\begin{Definition}[Circuit Set]\label{def:circuit-set}
$\circuitset{\smb}$ is the set of all possible circuits $\circuit$ such that $\polyf(\circuit) = \poly(\vct{X})$.
\end{Definition}
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

For our running example, $\circuitset{\smb} \supset \{2X^2 + 3XY - 2Y^2, (X + 2Y)(2X - Y), X(2X - Y) + 2Y(2X - Y), 2X(X + 2Y) - Y(X + 2Y)\}$.  Note that~\Cref{def:circuit-set} implies that \revision{$\circuit \in \circuitset{\polyf(\circuit)}$}.
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\medskip

\noindent We are now ready to formally state our \textbf{main problem}.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Definition}[The Expected Result Multiplicity Problem]\label{def:the-expected-multipl}
Let $\vct{X} = (X_1, \ldots, X_n)$, and $\pxdb$ be an $\semNX$-PDB over $\vct{X}$ with probability distribution $\pd$ over assignments $\vct{X}  \to \{0,1\}$, $\query$ an n-ary query, and $t$ an n-ary tuple.
  The \expectProblem is defined as follows:
  % \AH{I think we mean $\poly(\vct{X}) = \query(\pxdb)(t)$ instead of $\poly(\vct{X}) = \query(\pdb)(t)$.  I changed the following to reflect this.}
  % \BG{Correct}
\\\hspace*{5mm}\textbf{Input}: A \revision{circuit $\circuit \in \circuitset{\smb}$}~ for $\poly(\vct{X}) = \query(\pxdb)(t)$
\\\hspace*{5mm}\textbf{Output}: $\expct_{\vct{W} \sim \pd}[\poly(\vct{W})]$
\end{Definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


% When $\idb$ is a probabilistic database, $\idb$ can be viewed as a two-tuple $(\wSet, \pd)$, where $\wSet$ as noted, is the set of possible worlds, and $\pd$ is a probability distribution over $\wSet$.

% The possible worlds semantics gives a framework for how to think about running queries over $\idb$.  Given a query $\query$, $\query$ is deterministically run over each $\db \in \idb$, and the output of $\query(\idb)$ is defined as the set of results (worlds) from running $\query$ over each $\db_i \in \idb$.  We write this formally as,
% \[\query(\idb) = \comprehension{\query(\db)}{\db \in \idb}.\]


%%% Local Variables:
%%% mode: latex
%%% TeX-master: "main"
%%% End: