%root: main.tex %!TEX root=./main.tex %\onecolumn \section{Background and Notation}\label{sec:background} \subsection{Polynomial Definition and Terminology} %We now introduce some terminology %and develop a reduced form of lineage polynomials for a \abbrBIDB or \abbrTIDB. %Note that \secrev{A } polynomial over $\vct{X}=(X_1,\dots,X_n)$ with individual degree $B <\infty$ is formally defined as (where $c_{\vct{d}}\in \semN$): \begin{equation} \label{eq:sop-form} \poly\inparen{X_1,\dots,X_n}=\secrev{\sum_{\vct{d}\in\{0,\ldots,B\}^\tupset} c_{\vct{d}}\cdot \prod_{\tup\in\tupset} X_\tup^{d_\tup}.} \end{equation} %where $c_{\vct{d}}\in \semN$. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{Definition}[Standard Monomial Basis]\label{def:smb} The term $\prod_{\tup\in\tupset} X_\tup^{d_\tup}$ in \Cref{eq:sop-form} is a \emph{monomial}. A polynomial $\poly\inparen{\vct{X}}$ is in standard monomial basis (\abbrSMB) when we keep only the terms with $c_{\vct{d}}\ne 0$ from \Cref{eq:sop-form}. \end{Definition} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% Unless otherwise noted, we consider all polynomials to be in \abbrSMB representation. When it is unclear, we use $\smbOf{\poly}$ to denote the \abbrSMB form of a polynomial $\poly$. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{Definition}[Degree]\label{def:degree-of-poly} The degree of polynomial $\poly(\vct{X})$ is the largest \secrev{$\norm{\vct{d}}_1$}% = \sum_{\tup\in\tupset} d_\tup$ such that $c_{\vct{d}}\ne 0$. % maximum sum of exponents, over all monomials in $\smbOf{\poly(\vct{X})}$. \end{Definition} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% As an example, the degree of the polynomial $X^2+2XY^2+Y^2$ is $3$. Product terms in lineage arise only from join operations (\Cref{fig:nxDBSemantics}), so intuitively, the degree of a lineage polynomial is analogous to the largest number of joins needed to produce a result tuple. %in any clause of the $\raPlus$ query that created it. 
\secrev{ We call a polynomial $\poly\inparen{\vct{X}}$ a \emph{\abbrCTIDB-lineage polynomial} (%resp., \emph{\ti-lineage polynomial}, or simply lineage polynomial), if there exists a $\raPlus$ query $\query$, \abbrCTIDB $\pdb$, and result tuple $\tup$ such that $\poly\inparen{\vct{X}} = \apolyqdt\inparen{\vct{X}}.$ } \subsection{Probabilistic Databases} Following the typical representation of bags in production databases, for query inputs, we will use \abbrBPDB\xplural with multiplicities $\{0, 1\}$ (see \Cref{sec:gener-results-beyond} for more on this choice). An \emph{incomplete database} $\idb$ is a set of deterministic databases $\db$ called possible worlds. \secrev{ A \abbrCTIDB $\pdb$ is a pair $\inparen{\worlds, \bpd}$ such that $\worlds$ is an incomplete database and $\bpd$ is a probability distribution over $\worlds$. Queries over probabilistic databases (and thus \abbrCTIDB\xplural) are evaluated using the so-called possible world semantics. Under the possible world semantics, the result of a query $\query$ over an incomplete database $\worlds$ is the set of query answers produced by evaluating $\query$ over each possible world $\worldvec\in\worlds$: $\inset{\query\inparen{\worldvec}: \worldvec\in\worlds}$. The result of a query is the pair $\inparen{\query\inparen{\worlds}, \bpd'}$ where $\bpd'$ is a probability distribution that assigns to each possible query result the sum of the probabilities of the worlds that produce this answer: $\probOf\pbox{\worldvec\in\worlds} = \sum\limits_{\substack{\worldvec'\in\worlds,\\\query\inparen{\worldvec'}=\query\inparen{\worldvec}}}\probOf\pbox{\worldvec'}$. } \AH{Move~\Cref{prop:expection-of-polynom} to after/at the end of~\Cref{subsec:tidbs-and-bidbs}. We may have to introduce some new notation for $1$-\abbrBIDB, like the $\randDB$ in the expectation.} Recall \Cref{fig:nxDBSemantics} which defines the lineage polynomial $\apolyqdt$ for any $\raPlus$ query. 
We now make a meaningful connection between possible world semantics and world assignments on the lineage polynomial. \begin{Proposition}[Expectation of polynomials]\label{prop:expection-of-polynom} Given a \abbrBPDB $\pdb = (\idb,\pd)$, $\raPlus$ query $\query$, and lineage polynomial $\apolyqdt$ for arbitrary result tuple $\tup$, %$\semNX$-\abbrPDB $\pxdb = (\idb_{\semNX}',\pd')$ where $\rmod(\pxdb) = \pdb$, we have (denoting $\randDB$ as the random variable over $\idb$): $ \expct_{\randDB \sim \pd}[\query(\randDB)(\tup)] = \expct_{\vct{\randWorld}\sim \pdassign}\pbox{\apolyqdt\inparen{\vct{\randWorld}}}. $ \end{Proposition} \noindent A formal proof of \Cref{prop:expection-of-polynom} is given in \Cref{subsec:expectation-of-polynom-proof}.\footnote{Although \Cref{prop:expection-of-polynom} follows, e.g., as an obvious consequence of~\cite{IL84a}'s Theorem 7.1, we are unaware of any formal proof for bag-probabilistic databases.} We focus on the problem of computing $\expct_\pdassign\pbox{\apolyqdt\inparen{\vct{\randWorld}}}$ from now on, assume implicit $\query, \dbbase, \tup$, and drop them from $\apolyqdt$ (i.e., $\poly\inparen{\vct{X}}$ will denote a polynomial). \subsubsection{Reduction to $1$-\abbrBIDB} \label{subsec:tidbs-and-bidbs} \secrev{ A block-independent database (\abbrBIDB) is a common probabilistic data model $\pdb=\inparen{\Omega, \bpd}$ such that the base set of tuples $\tupset = \bigcup_{\omega\in\Omega,~\tup\in\omega}\tup$ is partitioned into a set of $\numvar$ independent blocks $\inset{\inparen{\block_\tup}_{\tup\in\pbox{\numvar}}}$ such that the set of tuples $\inset{\inparen{\tup_j}_{j\in\pbox{\abs{\block_\tup}}}}$ in block $\block_\tup$ are disjoint from one another. This construction produces the set of possible worlds $\Omega$ that consists of all unique combinations of tuples in $\tupset$ with the constraint that for any $\omega\in\Omega$, no two tuples $\tup_j\neq\tup_{j'}$ from the same block $\block_\tup$ exist together. 
A $\bound$-\abbrBIDB has the further requirement that each block has a multiplicity of at most $\bound$. We present a reduction that is useful in producing our results: \begin{Definition}[\abbrCTIDB reduction]\label{def:ctidb-reduct} Given \abbrCTIDB $\pdb = \inparen{\worlds, \bpd}$, let $\pdb' = \inparen{\Omega, \bpd'}$ be the \abbrOneBIDB obtained in the following manner: for each $\tup\in\tupset$, create block $\block_\tup$ with $\bound$ disjoint copies, such that $\tup_j$ is annotated with variable $X_{\tup, j}$ for $j\in\pbox{\bound}$. The probability distribution $\bpd'$ is the one induced by $\vct{p} = \inparen{\inparen{\prob_{\tup, j}}_{\tup\in\tupset, j\in\pbox{\bound}}}$ and the \abbrBIDB disjointness requirement. \end{Definition} The base case of~\Cref{fig:nxDBSemantics} for $\pdb'$ is now $\poly\pbox{\rel,\tupset, \tup} = \sum_{j\in\pbox{\bound}}X_{\tup, j}$. Then given the disjointness requirement and the semantics for constructing the lineage polynomial over a \abbrOneBIDB, $\poly\pbox{\rel,\tupset',\tup}$ is of the same form as the reformulated polynomial $\refpoly$ of step i) from~\Cref{def:reduced-poly}, which then implies that~\Cref{lem:tidb-reduce-poly} immediately follows for \abbrOneBIDB polynomials: $\expct_{\rvworld\sim\bpd'}\pbox{\poly\inparen{\rvworld}} = \rpoly\inparen{\vct{\prob}}$. } %In this paper, we focus on two popular forms of \abbrPDB\xplural: Block-Independent (\bi) and Tuple-Independent (\ti) \abbrPDB\xplural. 
%% %A \bi $\pdb$ is a \abbrPDB with the constraint that %%(i) every tuple $\tup_i$ is annotated with a unique random variable $\randWorld_i \in \{0, 1\}$ and (ii) that %the tuples in $\dbbase$ can be partitioned into a set of $\ell$ blocks such that tuples $\tup_{i, j}, \tup_{k, j'}$ from separate blocks $(i\neq k)$ are independent of each other while tuples $\tup_{i, j}, \tup_{i, k}$ from the same block are disjoint events.\footnote{ % Although only a single independent, $[\abs{\block_i}+1]$-valued variable is customarily used per block~\cite{DBLP:series/synthesis/2011Suciu}, we decompose it into $\abs{\block_i}$ correlated $\{0,1\}$-valued variables per block that can be used directly in polynomials (without an indicator function). For $t_{i, j} \in b_i$, the event $(\randWorld_{i,j} = 1)$ corresponds to the event $(\randWorld_i = j)$ in the customary annotation scheme. %} %Each tuple $\tup_{i, j}$ is annotated with a random variable $\randWorld_{i, j} \in \{0, 1\}$ denoting its presence in a possible world $\db$. The probability distribution $\pd$ over $\dbbase$ is the one induced from individual tuple probabilities $\prob_{i, j}\in \vct{\prob}=\inparen{\prob_{1, 1},\ldots,\prob_{\abs{\block},\ldots,\abs{\block_{\abs{\block}}}}}$ (where $\forall i$, $\sum_j p_{i,j}\le 1$) and the conditions on the blocks. A \abbrTIDB is a \abbrBIDB where each block has size exactly $1$. Instead of looking only at the possible worlds of $\pdb$, one can consider all worlds, including those that cannot exist due to disjointness. Then the set of all worlds can be modeled by $\vct{\randWorld}\in \{0, 1\}^\numvar$ \AH{We can use the new notation $\vct{W}\in\inset{0, 1}^\tupset$ here.} ,\footnote{Here and later, especially in \Cref{sec:algo}, we will rename the variables as $X_1,\dots,X_n$, where $n=\sum_{i=1}^\ell \abs{b_i}$.} such that $\randWorld_k \in \vct{\randWorld}$ represents the presence of $\tup_{i, j}$ (where $k = \sum_{\ell = 1}^{i - 1} \abs{b_\ell} + j$). 
We denote a probability distribution over all $\vct{\randWorld} \in \{0, 1\}^\numvar$ as $\pdassign$. When $\pdassign$ is the one induced from each $\prob_{i, j}$ while assigning $\probOf\pbox{\vct{\randWorld}} = 0$ for any $\vct{\randWorld}$ with $\randWorld_{i, j} = \randWorld_{i, k} = 1$ for any block $i$ and $j\neq k$, we end up with a bijective mapping from $\pd$ to $\pdassign$ under which every world is mapped to a world of the same probability, implying that the two distributions are equivalent. \Cref{subsec:supp-mat-ti-bi-def} has more details. \AH{Above, we need to use new notation of $\bpd$ instead of $\pd$, and we can use $\bpd'$ for the mapping discussion and note that $\bpd\equiv\bpd'$.}% explains \abbrTIDB\xplural and \abbrBIDB\xplural in greater detail. %%% Local Variables: %%% mode: latex %%% TeX-master: "main" %%% End: