paper-BagRelationalPDBsAreHard/ra-to-poly.tex

181 lines
15 KiB
TeX

%root: main.tex
%!TEX root=./main.tex
%\onecolumn
\section{Background and Notation}\label{sec:background}
\subsection{Probabilistic Databases (PDBs)}
An \textit{incomplete database} $\idb$ is a set of deterministic databases $\db$ called possible worlds.
Denote the schema of $\db$ as $\sch(\db)$. A \textit{probabilistic database} $\pdb$ is a pair $(\idb, \pd)$ where $\idb$ is an incomplete database and $\pd$ is a probability distribution over $\idb$. Queries over probabilistic databases are evaluated using the so-called possible world semantics. Under possible world semantics, the result of a query $\query$ over an incomplete database $\idb$ is the set of query answers produced by evaluating $\query$ over each possible world:
\[\query(\idb) = \comprehension{\query(\db)}{\db \in \idb}\]
For a probabilistic database $\pdb = (\idb, \pd)$, the result of a query is the pair $(\query(\idb), \pd')$ where $\pd'$ is a probability distribution over $\query(\idb)$ that assigns to each possible query result the sum of the probabilities of the worlds that produce this answer:
\[\forall \db \in \query(\idb): \probOf'(\db) = \sum_{\db' \in \idb: \query(\db') = \db} \probOf(\db') \]
Note that in this work we consider multisets, i.e., each possible world is a set of multiset relations and queries are evaluated using bag semantics. We will use $\domK$-relations to model multisets. A \emph{$\domK$-relation}~\cite{DBLP:conf/pods/GreenKT07} is a relation whose tuples are annotated with elements from a commutative semiring $\semK = (\domK, \addK, \multK, \zeroK, \oneK)$. A commutative semiring is a structure with a domain $\domK$ and associative and commutative binary operations $\addK$ and $\multK$ such that $\multK$ distributes over $\addK$, $\zeroK$ is the identity of $\addK$, $\oneK$ is the identity of $\multK$, and $\zeroK$ annihilates all elements of $\domK$ when combined by $\multK$.
Let $\udom$ be a countable domain of values.
Formally, an n-ary $\semK$-relation over $\udom$ is a function $\rel: \udom^n \to \domK$ with finite support $\support{\rel} = \{ \tup \mid \rel(\tup) \neq \zeroK \}$.
A $\semK$-database is a set of $\semK$-relations. It will be convenient to also interpret a $\semK$-database as a function from tuples to annotations. Thus, $\rel(t)$ (resp., $\db(t)$) denotes the annotation associated by $\semK$-relation $\rel$ ($\semK$-database $\db$) to $t$.
We review positive relational algebra semantics for $\semK$-relations below.
Consider the semiring $\semN = (\domN,+,\times,0,1)$ of natural numbers. $\semN$-databases model bag semantics by annotating each tuple with its multiplicity. A probabilistic $\semN$-database ($\semN$-PDB) is a PDB where each possible world is an $\semN$-database. We study the problem of computing statistical moments for query results over such databases. Specifically, given a probabilistic $\semN$-database $\pdb = (\idb, \pd)$, query $\query$, and possible result $t$, we treat $\query(\db)(t)$ as a random $\semN$-valued variable and are interested in computing its expectation $\expct_{\idb \sim \probDist}[\query(\db)(t)]$:
%
\begin{align}\label{eq:bag-expectation}
\expct_{\idb \sim \probDist}[\query(\db)(t)] = \sum_{\db \in \idb} \query(\db)(t) \cdot \probOf(\db)
\end{align}
%
Intuitively, the expectation of $\query(\db)(t)$ is the number of duplicates of $t$ we expect to find in result of query $\query$.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Representation System and Semantics}\label{sec:semnx-as-repr}
\subsubsection{$\semK$-relational Query Semantics}
For completeness, we briefly review the semantics for $\raPlus$ queries over $\semK$-relations~\cite{DBLP:conf/pods/GreenKT07}.
We use $\evald{\cdot}{\db}$ to denote the result of evaluating query $\query$ over $\semK$-database $\db$. In the definition shown below, we assume that tuples are of appropriate arity and use $\project_A(\tup)$ to denote the projection of tuple $\tup$ on a list of attributes $A$. Furthermore, $\theta(\tup)$ denotes the (Boolean) result of evaluating condition $\theta$ over $\tup$.
%
\begin{align*}
& \evald{\project_A(\rel)}{\db}(\tup) & & = & & \sum_{\tup': \project_A(\tup') = \tup} \evald{\rel}{\db}(\tup') \\
& \evald{(\rel_1 \union \rel_2)}{\db}(\tup) & & = & & \evald{\rel_1}{\db}(\tup) \addK \evald{\rel_2}{\db}(\tup) \\
& \evald{(\rel_1 \join \rel_2)}{\db}(\tup) & & = & & \evald{\rel_1}{\db}(\project_{\sch(\rel_1)}(\tup)) \multK \evald{\rel_2}{\db}(\project_{\sch(\rel_2)}(\tup)) \\
& \evald{\select_\theta(\rel)}{\db}(\tup) & & = & & \begin{cases}
\evald{\rel}{\db}(\tup) & \text{if }\theta(\tup) \\
\zeroK & \text{otherwise}.
\end{cases} \\
& \evald{R}{\db}(\tup) & & = & & \rel(\tup)
\end{align*}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection{$\semNX$ as a Representation System}\label{sec:semnx-as-repr}
Let $\semNX$ denote the set of polynomials over variables $\vct{X}$ with natural number coefficients and exponents.
Consider now the semiring $(\semNX, +, \cdot, 0, 1)$ whose domain is $\semNX$, with the standard addition and multiplication of polynomials.
We will use $\semNX$-PDB $\pxdb$, defined as the tuple $(\db, \pd)$, where $\semNX$-database $\db$ is paired with probability distribution $\pd$.
We denote by $\polyForTuple$ the annotation of tuple $t$ in the result of $\query$ on an implicit $\semNX$-PDB (i.e., $\polyForTuple = \query(\pxdb)(t)$ for some $\pxdb$) and as before, interpret it as a function $\polyForTuple: \{0,1\}^{|\vct X|} \rightarrow \semN$ from vectors of variable assignments to the corresponding value of the annotating polynomial.
$\semNX$-PDBs and a function $\rmod$ from an $\semNX$-PDB to an equivalent $\semN$-PDB are both formalized in \Cref{subsec:supp-mat-background}.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Proposition}[Expectation of polynomials]\label{prop:expection-of-polynom}
Given an $\semN$-PDB $\pdb = (\idb,\pd)$ and $\semNX$-PDB $\pxdb = (\db',\pd')$ where $\rmod(\pxdb) = \pdb$:
\[ \expct_{\idb \sim \pd}[\query(\db)(t)] = \expct_{\vct{W} \sim \pd'}\pbox{\polyForTuple(\vct{W})} \]
\end{Proposition}
\noindent A formal proof of \Cref{prop:expection-of-polynom} is given in \Cref{subsec:expectation-of-polynom-proof}.
This proposition shows that computing expected tuple multiplicities is equivalent to computing the expectation of a polynomial (for that tuple) from a probability distribution over all possible assignments of variables in the polynomial to $\{0,1\}$.
We focus on this problem from now on, assume an implicit result tuple, and so drop the subscript from $\polyForTuple$ (i.e., $\poly$ is used as a polynomial from now on).
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection{\tis and \bis}
\label{subsec:tidbs-and-bidbs}
In this paper, we focus on two popular forms of PDB: Block-Independent (\bi) and Tuple-Independent (\ti) PDBs.
%
A \bi $\pxdb = (\db, \pd)$ is an $\semNX$-PDB such that (i) every tuple is annotated with either $0$ or a unique variable $X_i$ and (ii) that the tuples $\tup$ of $\pxdb$ for which $\pxdb(\tup) \neq 0$ can be partitioned into a set of blocks such that variables from separate blocks are independent of each other and variables from the same blocks are disjoint events.
%
A \emph{\ti} is a \bi where each block contains exactly one tuple.
\Cref{subsec:supp-mat-ti-bi-def} explains \tis and \bis in greater detail.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\input{poly-form.tex}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Problem Definition}\label{sec:expression-trees}
We first formally define circuits, an encoding of polynomials that we use throughout the paper. Since we are particularly using \emph{lineage} circuits, we drop the term lineage and only refer to them as circuits.
For illustrative purposes consider the polynomial $\poly(\vct{X}) = 2X^2 + 3XY - 2Y^2$ over $\vct{X} = [X, Y]$.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\oldstuff{\begin{Definition}[Expression Tree]\label{def:express-tree}
Consider a vector of variables $\vct{X}$.
An expression tree $\etree$ over $\vct{X}$ is a binary %an ADT logically viewed as an n-ary
tree, whose internal nodes are from the set $\{+, \times\}$, with leaf nodes being either from the set $\mathbb{R}$ $(\tnum)$ or from the set of monomials $(\var)$. The members of $\etree$ are \type, \val, \vari{partial}, \vari{children}, and \vari{weight}, where \type is the type of value stored in the node $\etree$ (i.e. one of $\{+, \times, \var, \tnum\}$, \val is the value stored, and \vari{children} is the list of $\etree$'s children where $\etree_\lchild$ is the left child and $\etree_\rchild$ the right child.
\end{Definition}}
\revision{
As in the introduction, we could consider polynomials to be represented as an expression tree.
However, they do not capture many of the compressed polynomial representations that we can get from query processing algorithms on bags, including the recent work on worst-case optimal join algorithms~\cite{ngo-survey,skew}, factorized databases~\cite{factorized-db}, and FAQ~\cite{DBLP:conf/pods/KhamisNR16}. Intuitively, the main reason is that an expression tree does not allow for `sharing' of intermediate results, which is crucial for these algorithms (and other query processing methods as well).
We represent query polynomials via {\em arithmetic circuits}~\cite{arith-complexity}, a standard way to represent polynomials over fields (particularly in the field of algebraic complexity) that we use for polynomials over $\mathbb N$ in the obvious way.
\begin{Definition}[Circuit]\label{def:circuit}
A circuit $\circuit$ is a Directed Acyclic Graph (DAG) whose source nodes (in degree of $0$) consist of elements in either $\reals$ or $\vct{X}$. The internal and sink nodes of $\circuit$ have binary input and are either sum ($\circplus$) or product ($\circmult$) gates.
Circuit $\circuit$ additionally has the following members: \type, \val, \vari{partial}, \vari{input}, and \vari{Lweight}, \vari{Rweight}, where \type is the type of value stored in the node $\circuit$ (i.e. one of $\{\circplus, \circmult, \var, \tnum\}$, \val is the value stored, and \vari{input} is the list of \circuit 's inputs where $\circuit_\linput$ is the left input and $\circuit_\rinput$ the right input. When the underlying DAG is a tree, we will refer to the structure as an expression tree.
\end{Definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
As stated in ~\Cref{def:circuit}, every internal node has at most two in-edges, is labeled as an addition or a multiplication node, and has no limit on its outdegree.
Note that if we limit the outdegree to one, then we get expression trees.
We ignore the remaining fields (\vari{partial}, \vari{Lweight}, and \vari{Rweight}) until \Cref{sec:algo}.
}
%Also note that the out degree of any internal node can grow with the circuit size.
The semantics of \revision{circuits} ~follows the obvious interpretation. We \revision{next} define \revision{its realtionship with polynomials } formally:
\begin{Definition}[$\polyf(\cdot)$]\label{def:poly-func}
Denote \revision{$\polyf(\circuit)$}~ to be the function from circuit \revision{$\circuit$}~ to its corresponding polynomial. $\polyf(\cdot)$ is recursively defined on \revision{$\circuit$}~ as follows, with addition and multiplication following the standard interpretation for polynomials:
\begin{equation*}
\polyf(\revision{\circuit}) = \begin{cases}
\polyf(\revision{\circuit_\lchild}) + \polyf(\revision{\circuit_\rchild}) &\text{ if \revision{\circuit}.\type } = \revision{\circplus}\\
\polyf(\revision{\circuit_\lchild}) \cdot \polyf(\revision{\circuit_\rchild}) &\text{ if \revision{\circuit}.\type } = \revision{\circmult}\\
\revision{\circuit.\val} &\text{ if \revision{\circuit}.\type } = \var \text{ OR } \tnum.
\end{cases}
\end{equation*}
\end{Definition}
Note that $\circuit$ need not encode an expression in standard monomial basis. For instance, $\circuit$ could represent a compressed form of the running example, such as $(X + 2Y)(2X - Y)$.
\oldstuff{
\begin{Definition}[Expression Tree Set]\label{def:express-tree-set}$\etreeset{\smb}$ is the set of all possible expression trees $\etree$, such that $poly(\etree) = \poly(\vct{X})$.
\end{Definition}
\revision{
\begin{Definition}[Circuit Set]\label{def:circuit-set}
$\circuitset{\smb}$ is the set of all possible circuits $\circuit$ such that $\polyf(\circuit) = \poly(\vct{X})$.
\end{Definition}
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
For our running example, $\circuitset{\smb} = \{2X^2 + 3XY - 2Y^2, (X + 2Y)(2X - Y), X(2X - Y) + 2Y(2X - Y), 2X(X + 2Y) - Y(X + 2Y)\}$. Note that ~\Cref{def:express-tree-set} implies that \revision{$\circuit \in \circuitset{\polyf(\circuit)}$}.
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\medskip
\noindent We are now ready to formally state our \textbf{main problem}.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Definition}[The Expected Result Multiplicity Problem]\label{def:the-expected-multipl}
Let $\vct{X} = (X_1, \ldots, X_n)$, and $\pdb$ be an $\semNX$-PDB over $\vct{X}$ with probability distribution $\pd$ over assignments $\vct{X} \to [0,1]$, $\query$ an n-ary query, and $t$ an n-ary tuple.
The \expectProblem is defined as follows:
% \AH{I think we mean $\poly(\vct{X}) = \query(\pxdb)(t)$ instead of $\poly(\vct{X}) = \query(\pdb)(t)$. I changed the following to reflect this.}
% \BG{Correct}
\\\hspace*{5mm}\textbf{Input}: A \revision{circuit $\circuit \in \circuitset{\smb}$}~ for $\poly(\vct{X}) = \query(\pxdb)(t)$
\\\hspace*{5mm}\textbf{Output}: $\expct_{\vct{W} \sim \pd}[\poly(\vct{W})]$
\end{Definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% When $\idb$ is a probabilistic database, $\idb$ can be viewed as a two-tuple $(\wSet, \pd)$, where $\wSet$ as noted, is the set of possible worlds, and $\pd$ is a probability distribution over $\wSet$.
% The possible worlds semantics gives a framework for how to think about running queries over $\idb$. Given a query $\query$, $\query$ is deterministically run over each $\db \in \idb$, and the output of $\query(\idb)$ is defined as the set of results (worlds) from running $\query$ over each $\db_i \in \idb$. We write this formally as,
% \[\query(\idb) = \comprehension{\query(\db)}{\db \in \idb}.\]
%%% Local Variables:
%%% mode: latex
%%% TeX-master: "main"
%%% End: