%root: main.tex %!TEX root=./main.tex %\onecolumn \section{Background and Notation}\label{sec:background} \iffalse \subsection{Superlinearity of Bag PDBs}\label{sec:suplin-bags} Moving forward, we focus exclusively on bags. For $Q()\dlImp$$OnTime(\text{City}), Route(\text{City}_1, \text{City}_2),$ $OnTime(\text{City}')$ over the bag relations of \cref{fig:ex-shipping-simp}, consider the product query $\poly^2()\dlImp Q \times Q$. The factorized representation of $\poly^2$ is (for simplicity we ignore the random variables of $Route$ since each variable has probability $1$): \begin{equation*} \poly^2 = \left(L_aL_b + L_bL_d + L_bL_c\right) \cdot \left(L_aL_b + L_bL_d + L_bL_c\right) \end{equation*} The equivalent SOP representation is \begin{equation*} L_a^2L_b^2 + L_b^2L_d^2 + L_b^2L_c^2 + 2L_aL_b^2L_d + 2L_aL_b^2L_c + 2L_b^2L_dL_c. \end{equation*} The expectation $\expct\pbox{\poly^2}$ then is: \begin{footnotesize} \begin{equation*} \expct\pbox{L_a^2}\expct\pbox{L_b^2} + \expct\pbox{L_b^2}\expct\pbox{L_d^2} + \expct\pbox{L_b^2}\expct\pbox{L_c^2} + 2\expct\pbox{L_a}\expct\pbox{L_b^2}\expct\pbox{L_d} + 2\expct\pbox{L_a}\expct\pbox{L_b^2}\expct\pbox{L_c} + 2\expct\pbox{L_b^2}\expct\pbox{L_d}\expct\pbox{L_c} \end{equation*} \end{footnotesize} Note that if $Dom(W_i) = \{0, 1\}$, then for any $k > 0$, $\expct\pbox{W_i^k} = \expct\pbox{W_i}$. This property leads us to consider a structure related to $\poly$. \begin{Definition}\label{def:reduced-poly} For any polynomial $\poly(\vct{X})$, define the \emph{reduced polynomial} $\rpoly(\vct{X})$ to be the polynomial obtained by setting all exponents $e > 1$ in $\poly(\vct{X})$ to $1$. 
\end{Definition} With $\poly^2$ as an example, we have: \begin{align*} \rpoly^2(L_a, L_b, L_c, L_d) =&\; L_aL_b + L_bL_d + L_bL_c + 2L_aL_bL_d + 2L_aL_bL_c + 2L_bL_cL_d \end{align*} It can be verified that the reduced polynomial is a closed form of the expected count (i.e., $\expct\pbox{\poly^2} = \rpoly^2(\probOf\pbox{L_a=1}, \probOf\pbox{L_b=1}, \probOf\pbox{L_c=1}, \probOf\pbox{L_d=1})$). The reduced form of a lineage polynomial can be obtained but requires a linear scan over the clauses of an SOP encoding of the polynomial. Note that for a compressed representation, this scheme would require an exponential number of computations in the size of the compressed representation. In \Cref{sec:hard}, we use $\rpoly$ to prove our hardness results. %In prior work on lineage-based Bag-PDBs~\cite{kennedy:2010:icde:pip,DBLP:conf/vldb/AgrawalBSHNSW06,yang:2015:pvldb:lenses} where this encoding is implicitly assumed, computing the expected count is linear in the size of the encoding. %In general however, compressed encodings of the polynomial can be exponentially smaller in $k$ for $k$-products --- the query $\poly^k$ obtained by taking the product of $k$ copies of $\poly$ has a factorized encoding of size $6\cdot k$, while the SOP encoding is of size $2\cdot 3^k$. %This leads us to the \textbf{central question of this paper}: %\begin{quote} %{\em %Is it always the case that the expectation of a UCQ in a Bag-PDB can be computed in time linear in the size of the \textbf{compressed} lineage polynomial?} %\end{quote} %If so, then Bag-PDBs can indeed compete with deterministic databases. %This is unfortunately not the case, and an approximation is required. \fi \subsection{Probabilistic Databases (PDBs)} An \textit{incomplete database} $\idb$ is a set of deterministic databases $\db$ called possible worlds. Denote the schema of $\db$ as $\sch(\db)$. 
A \textit{probabilistic database} $\pdb$ is a pair $(\idb, \pd)$ where $\idb$ is an incomplete database and $\pd$ is a probability distribution over $\idb$. Queries over probabilistic databases are evaluated using the so-called possible world semantics. Under possible world semantics, the result of a query $\query$ over an incomplete database $\idb$ is the set of query answers produced by evaluating $\query$ over each possible world: $\query(\idb) = \comprehension{\query(\db)}{\db \in \idb}$. For a probabilistic database $\pdb = (\idb, \pd)$, the result of a query is the pair $(\query(\idb), \pd')$ where $\pd'$ is a probability distribution over $\query(\idb)$ that assigns to each possible query result the sum of the probabilities of the worlds that produce this answer: \[\forall \db \in \query(\idb): \pd'(\db) = \sum_{\db' \in \idb: \query(\db') = \db} \pd(\db') \] Let $\semNX$ denote the set of polynomials over variables $\vct{X}=(X_1,\dots,X_n)$ with natural number coefficients and exponents. We model incomplete relations using Green et~al.'s $\semNX$-databases~\cite{DBLP:conf/pods/GreenKT07}, discussed in detail in \Cref{subsec:supp-mat-krelations} and summarized here. In an $\semNX$-database, relations are defined as functions from tuples to elements of $\semNX$, typically called annotations. We write $R(t)$ to denote the polynomial annotating tuple $t$ in relation $R$. Each possible world is defined by an assignment of $n$ binary values $\vct{W} \in \{0, 1\}^{n}$ to the variables $\vct{X}$. The multiplicity of $t \in R$ in this possible world is obtained by evaluating the polynomial annotating it on $\vct{W}$ (i.e., $R(t)(\vct{W})$). $\semNX$-relations are closed under $\raPlus$ (\cref{fig:nxDBSemantics}). 
\begin{figure} \begin{align*} \evald{\project_A(\rel)}{\db}(\tup) =& \bigoplus_{\tup': \project_A(\tup') = \tup} \evald{\rel}{\db}(\tup') & \evald{(\rel_1 \union \rel_2)}{\db}(\tup) =& \evald{\rel_1}{\db}(\tup) + \evald{\rel_2}{\db}(\tup)\\ \evald{\select_\theta(\rel)}{\db}(\tup) =& \begin{cases} \evald{\rel}{\db}(\tup) & \text{if }\theta(\tup) \\ \zeroK & \text{otherwise}. \end{cases} & \begin{aligned} \evald{(\rel_1 \join \rel_2)}{\db}(\tup) =\\ ~ \end{aligned}& \begin{aligned} &\evald{\rel_1}{\db}(\project_{\sch(\rel_1)}(\tup)) \\ &~~~\cdot\evald{\rel_2}{\db}(\project_{\sch(\rel_2)}(\tup)) \end{aligned}\\ & & \evald{R}{\db}(\tup) =& \rel(\tup) \end{align*} \caption{Evaluation semantics $\evald{\cdot}{\db}$ for $\semNX$-DBs~\cite{DBLP:conf/pods/GreenKT07}.} \label{fig:nxDBSemantics} \end{figure} % For completeness, we briefly review the semantics for $\raPlus$ queries over $\semK$-relations~\cite{DBLP:conf/pods/GreenKT07}. % We use $\evald{\cdot}{\db}$ to denote the result of evaluating query $\query$ over $\semK$-database $\db$. Below, we assume that tuples are of appropriate arity, use $\sch(\rel)$ to denote the attributes of $\rel$, and use $\project_A(\tup)$ to denote the projection of tuple $\tup$ on a list of attributes $A$. Furthermore, $\theta(\tup)$ denotes the (Boolean) result of evaluating condition $\theta$ over $\tup$. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% We will use $\semNX$-PDB $\pxdb$, defined as the tuple $(\idb_{\semNX}, \pd)$, where $\semNX$-database $\idb_{\semNX}$ is paired with probability distribution $\pd$. We denote by $\polyForTuple$ the annotation of tuple $t$ in the result of $\query$ on an implicit $\semNX$-PDB (i.e., $\polyForTuple = \query(\pxdb)(t)$ for some $\pxdb$) and as before, interpret it as a function $\polyForTuple: \{0,1\}^{|\vct X|} \rightarrow \semN$ from vectors of variable assignments to the corresponding value of the annotating polynomial. 
$\semNX$-PDBs and a function $\rmod$ (which transforms an $\semNX$-PDB to classical, or $\semN$-PDB~\cite{DBLP:conf/pods/GreenKT07,feng:2019:sigmod:uncertainty}) are both formalized in \Cref{subsec:supp-mat-background}. \begin{Proposition}[Expectation of polynomials]\label{prop:expection-of-polynom} Given an $\semN$-PDB $\pdb = (\idb,\pd)$ and $\semNX$-PDB $\pxdb = (\idb_{\semNX}',\pd')$ where $\rmod(\pxdb) = \pdb$, we have: $ \expct_{\idb \sim \pd}[\query(\idb)(t)] = \expct_{\vct{W} \sim \pd'}\pbox{\polyForTuple(\vct{W})}. $ \footnote{Although assumed by most prior work on set-probabilistic databases, e.g., as an obvious consequence of~\cite{IL84a}'s Theorem 7.1, we are unaware of any formal proof for bag-probabilistic databases.} \end{Proposition} \noindent A formal proof of \Cref{prop:expection-of-polynom} is given in \Cref{subsec:expectation-of-polynom-proof}. This proposition shows that computing expected tuple multiplicities is equivalent to computing the expectation of a polynomial (for that tuple) from a probability distribution over all possible assignments of variables in the polynomial to $\{0,1\}$. We focus on this problem from now on, assume an implicit result tuple, and so drop the subscript from $\polyForTuple$ (i.e., $\poly$ will denote a polynomial). \subsubsection{\tis and \bis} \label{subsec:tidbs-and-bidbs} In this paper, we focus on two popular forms of PDB: Block-Independent (\bi) and Tuple-Independent (\ti) PDBs. % A \bi $\pxdb = (\idb_{\semNX}, \pd)$ is an $\semNX$-PDB such that (i) every tuple is annotated with either $0$ (i.e., the tuple does not exist) or a unique variable $X_i$ and (ii) the tuples $\tup$ of $\pxdb$ for which $\pxdb(\tup) \neq 0$ can be partitioned into a set of blocks such that variables from separate blocks are independent of each other and variables from the same block are disjoint events. % A \emph{\ti} is a \bi where each block contains exactly one tuple. 
\Cref{subsec:supp-mat-ti-bi-def} explains \tis and \bis in greater detail. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%% Local Variables: %%% mode: latex %%% TeX-master: "main" %%% End: