paper-BagRelationalPDBsAreHard/ra-to-poly.tex
2020-12-20 00:13:58 -05:00

163 lines
14 KiB
TeX

%root: main.tex
%!TEX root=./main.tex
%\onecolumn
\section{Background and Notation}\label{sec:background}
\subsection{Probabilistic Databases (PDBs)}
An \textit{incomplete database} $\idb$ is a set of deterministic databases $\db$ called possible worlds.
Denote the schema of $\db$ as $\sch(\db)$. A \textit{probabilistic database} $\pdb$ is a pair $(\idb, \pd)$ where $\idb$ is an incomplete database and $\pd$ is a probability distribution over $\idb$. Queries over probabilistic databases are evaluated using the so-called possible world semantics. Under possible world semantics, the result of a query $\query$ over an incomplete database $\idb$ is the set of query answers produced by evaluating $\query$ over each possible world:
\[\query(\idb) = \comprehension{\query(\db)}{\db \in \idb}\]
For a probabilistic database $\pdb = (\idb, \pd)$, the result of a query is the pair $(\query(\idb), \pd')$ where $\pd'$ is a probability distribution over $\query(\idb)$ that assigns to each possible query result the sum of the probabilities of the worlds that produce this answer:
\[\forall \db \in \query(\idb): \probOf'(\db) = \sum_{\db' \in \idb: \query(\db') = \db} \probOf(\db') \]
Note that in this work we consider multisets, i.e., each possible world is a set of multiset relations and queries are evaluated using bag semantics. We will use K-relations to model multisets. A \emph{K-relation}~\cite{DBLP:conf/pods/GreenKT07} is a relation whose tuples are annotated with elements from a commutative semiring $\semK = (\domK, \addK, \multK, \zeroK, \oneK)$. A commutative semiring is a structure with a domain $\domK$ and associative and commutative binary operations $\addK$ and $\multK$ such that $\multK$ distributes over $\addK$, $\zeroK$ is the identity of $\addK$, $\oneK$ is the identity of $\multK$, and $\zeroK$ annihilates all elements of $\domK$ when combined by $\multK$.
Let $\udom$ be a countable domain of values.
Formally, an n-ary $\semK$-relation over $\udom$ is a function $\rel: \udom^n \to \domK$ with finite support $\support{\rel} = \{ \tup \mid \rel(\tup) \neq \zeroK \}$.
A $\semK$-database is a set of $\semK$-relations. It will be convenient to also interpret a $\semK$-database as a function from tuples to annotations. Thus, $\rel(t)$ (resp., $\db(t)$) denotes the annotation associated by $\semK$-relation $\rel$ ($\semK$-database $\db$) to $t$.
We review positive relational algebra semantics for $\semK$-relations below.
Consider the semiring $\semN = (\domN,+,\times,0,1)$ of natural numbers. $\semN$-databases model bag semantics by annotating each tuple with its multiplicity. A probabilistic $\semN$-database ($\semN$-PDB) is a PDB where each possible world is an $\semN$-database. We study the problem of computing statistical moments for query results over such databases. Specifically, given a probabilistic $\semN$-database $\pdb = (\idb, \pd)$, query $\query$, and possible result $t$, we treat $\query(\db)(t)$ as a random $\semN$-valued variable and are interested in computing its expectation $\expct_{\idb \sim \probDist}[\query(\db)(t)]$:
%
\begin{align}\label{eq:bag-expectation}
\expct_{\idb \sim \probDist}[\query(\db)(t)] = \sum_{\db \in \idb} \query(\db)(t) \cdot \probOf(\db)
\end{align}
%
Intuitively, the expectation of $\query(\db)(t)$ is the number of duplicates of $t$ we expect to find in result of query $\query$.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Representation System and Semantics}\label{sec:semnx-as-repr}
\subsubsection{$\semK$-relational Query Semantics}
For completeness, we briefly review the semantics for $\raPlus$ queries over $\semK$-relations~\cite{DBLP:conf/pods/GreenKT07}.
We use $\evald{\cdot}{\db}$ to denote the result of evaluating query $\query$ over $\semK$-database $\db$. In the definition shown below, we assume that tuples are of appropriate arity and use $\project_A(\tup)$ to denote the projection of tuple $\tup$ on a list of attributes $A$. Furthermore, $\theta(\tup)$ denotes the (Boolean) result of evaluating condition $\theta$ over $\tup$.
%
\begin{align*}
& \evald{\project_A(\rel)}{\db}(\tup) & & = & & \sum_{\tup': \project_A(\tup') = \tup} \evald{\rel}{\db}(\tup') \\
& \evald{(\rel_1 \union \rel_2)}{\db}(\tup) & & = & & \evald{\rel_1}{\db}(\tup) \addK \evald{\rel_2}{\db}(\tup) \\
& \evald{(\rel_1 \join \rel_2)}{\db}(\tup) & & = & & \evald{\rel_1}{\db}(\project_{\sch(\rel_1)}(\tup)) \multK \evald{\rel_2}{\db}(\project_{\sch(\rel_2)}(\tup)) \\
& \evald{\select_\theta(\rel)}{\db}(\tup) & & = & & \begin{cases}
\evald{\rel}{\db}(\tup) & \text{if }\theta(\tup) \\
\zeroK & \text{otherwise}.
\end{cases} \\
& \evald{R}{\db}(\tup) & & = & & \rel(\tup)
\end{align*}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection{$\semNX$ as a Representation System}\label{sec:semnx-as-repr}
Let $\semNX$ denote the set of polynomials over variables $\vct{X}$ with natural number coefficients and exponents.
Consider now the semiring $(\semNX, +, \cdot, 0, 1)$ whose domain is $\semNX$ and with the standard addition and multiplication of polynomials.
We will utilize $\semNX$-PDB $\pxdb$, defined as the tuple $(\db, \pd)$, where $\semNX$-database $\db$ is paired with probability distribution $\pd$.
We denote by $\polyForTuple$ the annotation of tuple $t$ in the result of $\query$ (i.e., $\polyForTuple = \query(\pxdb)(t)$) and as before, interpret it as a function $\polyForTuple: \{0,1\}^{|\vct X|} \rightarrow \semN$ from vectors of variable assignments to the corresponding value of the annotating polynomial.
$\semNX$-PDBs and a function $\rmod$ that takes an $\semNX$-PDB input and outputs an equivalent $\semN$-PDB are formally defined in \Cref{subsec:supp-mat-background}.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Proposition}[Expectation of polynomials]\label{prop:expection-of-polynom}
Given an $\semN$-PDB $\pdb = (\idb,\pd)$ and $\semNX$-PDB $\pxdb = (\db',\pd')$ where $\rmod(\pxdb) = \pdb$:
\[ \expct_{\idb \sim \pd}[\query(\db)(t)] = \expct_{\vct{W} \sim \pd'}\pbox{\polyForTuple(\vct{W})} \]
\end{Proposition}
\noindent A formal proof of \Cref{prop:expection-of-polynom} is given in \Cref{subsec:expectation-of-polynom-proof}.
This proposition shows that computing expected tuple multiplicities is equivalent to computing the expectation of a polynomial (for that tuple) from a probability distribution over all possible assignments of variables in the polynomial to $\{0,1\}$.
We focus on this problem from now on, assume an implicit result tuple, and so drop the subscript from $\polyForTuple$ (i.e., $\poly$ is used as a polynomial from now on).
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection{\tis and \bis}
\label{subsec:tidbs-and-bidbs}
In this paper, we focus on two popular forms of PDB: Block-Independent (\bi) and Tuple-Independent (\ti) PDBs.
%
A \bi $\pxdb = (\db, \pd)$ is an $\semNX$-PDB such that (i) every tuple is annotated with either $0$ or a unique variable $X_i$ and (ii) that the tuples $\tup$ of $\pxdb$ for which $\pxdb(\tup) \neq 0$ can be partitioned into a set of blocks such that variables from separate blocks are independent of each other and variables from the same blocks are disjoint events.
%
A \emph{\ti} is a \bi where each block contains exactly one tuple.
\Cref{subsec:supp-mat-ti-bi-def} explains \tis and \bis in greater detail.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\input{poly-form.tex}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Problem Definition}\label{sec:expression-trees}
We first formally define expression trees, an encoding of polynomials that we use throughout much of the paper before generalizing them to circuits in~\Cref{sec:gen}.
For illustrative purposes consider the polynomial $\poly(\vct{X}) = 2X^2 + 3XY - 2Y^2$ over $\vct{X} = [X, Y]$.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Definition}[Expression Tree]\label{def:express-tree}
Consider a vector of variables $\vct{X}$.
An expression tree $\etree$ over $\vct{X}$ is a binary %an ADT logically viewed as an n-ary
tree, whose internal nodes are from the set $\{+, \times\}$, with leaf nodes being either from the set $\mathbb{R}$ $(\tnum)$ or from the set of monomials $(\var)$. The members of $\etree$ are \type, \val, \vari{partial}, \vari{children}, and \vari{weight}, where \type is the type of value stored in the node $\etree$ (i.e. one of $\{+, \times, \var, \tnum\}$, \val is the value stored, and \vari{children} is the list of $\etree$'s children where $\etree_\lchild$ is the left child and $\etree_\rchild$ the right child.
\end{Definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
We ignore the remaining fields (\vari{partial} and \vari{weight}) until \Cref{sec:algo}. Note that $\etree$ need not encode an expression in standard monomial basis. For instance, $\etree$ could represent a compressed form of the running example, such as $(X + 2Y)(2X - Y)$.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%\begin{Definition}[poly$(\cdot)$]\label{def:poly-func}
%Denote $poly(\etree)$ to be the function that takes as input expression tree $\etree$ and outputs its corresponding polynomial in \abbrSMB. We define $poly(\cdot)$ recursively on $\etree$ as follows, where $\etree_\lchild$ and $\etree_\rchild$ denote the left/right child of $\etree$ respectively.
%%
%% \begin{align*}
%% &\etree.\type = +\mapsto&& \polyf(\etree_\lchild) + \polyf(\etree_\rchild)\\
%% &\etree.\type = \times\mapsto&& \polyf(\etree_\lchild) \cdot \polyf(\etree_\rchild)\\
%% &\etree.\type = \var \text{ OR } \tnum\mapsto&& \etree.\val
%% \end{align*}
%%
%\begin{equation*}
% \polyf(\etree) = \begin{cases}
% \polyf(\etree_\lchild) + \polyf(\etree_\rchild) &\text{ if \etree.\type } = +\\
% \polyf(\etree_\lchild) \cdot \polyf(\etree_\rchild) &\text{ if \etree.\type } = \times\\
% \etree.\val &\text{ if \etree.\type } = \var \text{ OR } \tnum.
% \end{cases}
%\end{equation*}
%\end{Definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%Note that addition and multiplication above follow the standard interpretation over polynomials.
%Specifically, when adding two monomials whose variables and respective exponents agree, the coefficients corresponding to the monomials are added and their sum is multiplied to the monomial. Multiplication here is denoted by concatenation of the monomial and coefficient. When two monomials are multiplied, the product of each corresponding coefficient is computed, and the variables in each monomial are multiplied, i.e., the exponents of like variables are added. Again we notate this by the direct product of coefficient product and all disitinct variables in the two monomials, with newly computed exponents.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Definition}[Expression Tree Set]\label{def:express-tree-set}$\etreeset{\smb}$ is the set of all possible expression trees $\etree$, such that $poly(\etree) = \poly(\vct{X})$.
\end{Definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
For our running example, $\etreeset{\smb} = \{2X^2 + 3XY - 2Y^2, (X + 2Y)(2X - Y), X(2X - Y) + 2Y(2X - Y), 2X(X + 2Y) - Y(X + 2Y)\}$. Note that \cref{def:express-tree-set} implies that $\etree \in \etreeset{poly(\etree)}$.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%\medskip
\noindent We are now ready to formally state our \textbf{main problem}.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Definition}[The Expected Result Multiplicity Problem]\label{def:the-expected-multipl}
Let $\vct{X} = (X_1, \ldots, X_n)$, and $\pdb$ be an $\semNX$-PDB over $\vct{X}$ with probability distribution $\pd$ over assignments $\vct{X} \to [0,1]$, $\query$ an n-ary query, and $t$ an n-ary tuple.
The \expectProblem is defined as follows:
% \AH{I think we mean $\poly(\vct{X}) = \query(\pxdb)(t)$ instead of $\poly(\vct{X}) = \query(\pdb)(t)$. I changed the following to reflect this.}
% \BG{Correct}
\\\hspace*{5mm}\textbf{Input}: An expression tree $\etree \in \etreeset{\smb}$ for $\poly(\vct{X}) = \query(\pxdb)(t)$
\\\hspace*{5mm}\textbf{Output}: $\expct_{\vct{W} \sim \pd}[\poly(\vct{W})]$
\end{Definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% When $\idb$ is a probabilistic database, $\idb$ can be viewed as a two-tuple $(\wSet, \pd)$, where $\wSet$ as noted, is the set of possible worlds, and $\pd$ is a probability distribution over $\wSet$.
% The possible worlds semantics gives a framework for how to think about running queries over $\idb$. Given a query $\query$, $\query$ is deterministically run over each $\db \in \idb$, and the output of $\query(\idb)$ is defined as the set of results (worlds) from running $\query$ over each $\db_i \in \idb$. We write this formally as,
% \[\query(\idb) = \comprehension{\query(\db)}{\db \in \idb}.\]
%%% Local Variables:
%%% mode: latex
%%% TeX-master: "main"
%%% End: