Restructuring S.2.

master
Aaron Huber 2021-09-02 12:06:47 -04:00
parent 89aa7dfaf5
commit 49efdb6668
3 changed files with 98 additions and 91 deletions

View File

@ -6,11 +6,11 @@
We now introduce some terminology % for polynomials
and develop a reduced form (a closed form of the polynomial's expectation) for polynomials over probability distributions derived from a \bi or \ti.
%We will use $(X + Y)^2$ as a running example.
Note that a polynomial over $\vct{X}=(X_1,\dots,X_n)$ is formally defined as (with $c_\vct{i} \in \domN$):
\AH{My attempt to clear up any confusion in the ambiguity of $c_{\vct{i}}$. We may want to say that $\domain\inparen{c_\vct{i}} = \domR$ instead?}
Note that a polynomial over $\vct{X}=(X_1,\dots,X_n)$ with highest power $B <\infty$\footnote{The standard definition of polynomials requires a finite number of terms.} and $c_\vct{i} \in \domN$ is formally defined as: %(with $c_\vct{i} \in \domN$):
\AH{Do we want to say that $\domain\inparen{c_\vct{i}} = \domR$ instead? I've only seen the $\semNX$ use case. Is there a legitimate use case for real valued coefficients?}
\begin{equation}
\label{eq:sop-form}
\poly\inparen{X_1,\dots,X_n}=\sum_{\vct{d}=(d_1,\dots,d_n)\in \semN^n} c_{\vct{d}}\cdot \prod_{i=1}^n X_i^{d_i}.
\poly\inparen{X_1,\dots,X_n}=\sum_{\vct{d}\in\{0,\ldots,B\}^n} c_{\vct{d}}\cdot \prod_{i=1}^n X_i^{d_i}.
\end{equation}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@ -26,60 +26,55 @@ When it is unclear, we use $\smbOf{\poly}$ to denote the \abbrSMB form of a poly
The degree of polynomial $\poly(\vct{X})$ is the largest $\sum_{i=1}^n d_i$ such that $c_{(d_1,\dots,d_n)}\ne 0$. % maximum sum of exponents, over all monomials in $\smbOf{\poly(\vct{X})}$.
\end{Definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\AH{The example needs to change to avoid ambiguity between different definitions of \emph{degree}. Our definition is the one we want, since we two different tuples join without any tuples joining on themselves, and the degree would be $2$, rather than $1$, for example.}
The degree of the polynomial $X^2+2XY+Y^2$ is $2$.
Product terms in lineage arise only from join operations (\Cref{fig:nxDBSemantics}), so intuitively, the degree of a lineage polynomial is analogous to the largest number of joins in any clause of the UCQ query that created it.
In this paper we consider only finite degree polynomials.
We call a polynomial $\poly\inparen{\vct{X}}$ a \emph{\bi-lineage polynomial} (resp., \emph{\ti-lineage polynomial}, or simply lineage polynomial), if there exists a \AH{Which formalism? UCQ?}$\raPlus$ query $\query$, \bi $\pxdb$ (\ti $\pxdb$, or $\semNX$-PDB $\pxdb$), and tuple $\tup$ such that $\poly\inparen{\vct{X}} = \query(\pxdb)(\tup)$.
As an example, the degree of the polynomial $X^2+2XY^2+Y^2$ is $3$, since the largest sum of any monomial's exponents is that of the monomial $XY^2$.
Product terms in lineage arise only from join operations (\Cref{fig:nxDBSemantics}), so intuitively, the degree of a lineage polynomial is analogous to the largest number of joins in any clause of the $\raPlus$ query that created it.
We call a polynomial $\poly\inparen{\vct{X}}$ a \emph{\bi-lineage polynomial} (resp., \emph{\ti-lineage polynomial}, or simply lineage polynomial), if there exists a $\raPlus$ query $\query$, \bi (\ti) $\pdb$, and tuple $\tup$ such that $\poly\inparen{\vct{X}} = \query(\pdb)(\tup)$.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\AH{There was reviewer disgruntlement with how we discussed modding and all in the next few definitions. Need to revisit these comments and adjust accordingly.}
\begin{Definition}[Modding with a set]\label{def:mod-set}
Let $S$ be a {\em set} of polynomials over $\vct{X}$. Then $\poly(\vct{X})\mod{S}$ is the polynomial obtained by taking the mod of $\poly(\vct{X})$ over {\em all} polynomials in $S$ (order does not matter).
\end{Definition}
For example for a set of polynomials $S=\inset{X^2-X, Y^2-Y}$, taking the polynomial $2X^2 + 3XY - 2Y^2\mod S$ yields $2X+3XY-2Y$.
%
\begin{Definition}[$\mathcal B$, $\mathcal T$]\label{def:mod-set-polys}
Given the set of BIDB variables $\inset{X_{i,j}}$, define
\setlength\parindent{0pt}
\vspace*{-3mm}
{\small
\begin{tabular}{@{}l l}
\begin{minipage}[b]{0.45\linewidth}
\centering
\begin{equation*}
\mathcal{B}=\comprehension{X_{i,j}\cdot X_{i,j'}}{i \in [\ell], j\neq j' \in [~\abs{\block_i}~]}
\end{equation*}
\end{minipage}%
\hspace{13mm}
&
\begin{minipage}[b]{0.45\linewidth}
\centering
\begin{equation*}
\mathcal{T}=\comprehension{X_{i,j}^2-X_{i,j}}{i \in [\ell], j \in [~\abs{\block_i}~]}
\end{equation*}
\end{minipage}
\\
\end{tabular}
}
\end{Definition}
%\begin{Definition}[Modding with a set]\label{def:mod-set}
%Let $S$ be a {\em set} of polynomials over $\vct{X}$. Then $\poly(\vct{X})\mod{S}$ is the polynomial obtained by taking the mod of $\poly(\vct{X})$ over {\em all} polynomials in $S$ (order does not matter).
%\end{Definition}
%For example for a set of polynomials $S=\inset{X^2-X, Y^2-Y}$, taking the polynomial $2X^2 + 3XY - 2Y^2\mod S$ yields $2X+3XY-2Y$.
%%
%\begin{Definition}[$\mathcal B$, $\mathcal T$]\label{def:mod-set-polys}
%Given the set of BIDB variables $\inset{X_{i,j}}$, define
%
%\setlength\parindent{0pt}
%\vspace*{-3mm}
%{\small
%\begin{tabular}{@{}l l}
% \begin{minipage}[b]{0.45\linewidth}
% \centering
% \begin{equation*}
% \mathcal{B}=\comprehension{X_{i,j}\cdot X_{i,j'}}{i \in [\ell], j\neq j' \in [~\abs{\block_i}~]}
% \end{equation*}
% \end{minipage}%
% \hspace{13mm}
% &
% \begin{minipage}[b]{0.45\linewidth}
% \centering
% \begin{equation*}
% \mathcal{T}=\comprehension{X_{i,j}^2-X_{i,j}}{i \in [\ell], j \in [~\abs{\block_i}~]}
% \end{equation*}
% \end{minipage}
% \\
%\end{tabular}
%}
%\end{Definition}
%%
\begin{Definition}[Reduced \bi Polynomials]\label{def:reduced-bi-poly}
Let $\poly(\vct{X})$ be a \bi-lineage polynomial.
The reduced form $\rpoly(\vct{X})$ of $\poly(\vct{X})$ is: $\rpoly(\vct{X}) = \poly(\vct{X}) \mod \inparen{\mathcal{T} \cup \mathcal{B}}$
% \begin{equation*}
% \rpoly(\vct{X}) = \poly(\vct{X}) \mod \inparen{\mathcal{T} \cup \mathcal{B}}%X_i^2 - X_i \mod X_{\block_s, t}X_{\block_s, u}
% \end{equation*}
%for all $i$ in $[\numvar]$ and for all $s$ in $\ell$, such that for all $t, u$ in $[\abs{\block_s}]$, $t \neq u$.
The reduced form $\rpoly(\vct{X})$ of $\poly(\vct{X})$ is the same as \cref{def:reduced-poly} with the added constraint that all monomials with variables $X_{\block, i}, X_{\block, j}, i\neq j$ from the same block $\block$ are omitted.
%: $\rpoly(\vct{X}) = \poly(\vct{X}) \mod \inparen{\mathcal{T} \cup \mathcal{B}}$
\end{Definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
All exponents $e > 1$ in $\smbOf{\poly(\vct{X})}$ are reduced to $e = 1$ via mod $\mathcal{T}$. Performing the modulus of $\rpoly(\vct{X})$ with $\mathcal{B}$ ensures the disjoint condition of \bi, removing monomials with lineage variables from the same block.
Consider a $\abbrBIDB$ polynomial $\poly\inparen{\vct{X}} = X_{1, 1}X_{1, 2} + X_{1, 2}X_{2, 1}$. Then by \cref{def:reduced-bi-poly}, we have that $\rpoly\inparen{\vct{X}} = X_{1, 2}X_{2, 1}$.
%, (recall the constraint on tuples from the same block being disjoint in a \bi).% any monomial containing more than one tuple from a block has $0$ probability and can be ignored).
%
For the special case of \tis, the second step is not necessary since every block contains a single tuple.
%For the special case of \tis, the second step is not necessary since every block contains a single tuple.
%Alternatively, one can think of $\rpoly$ as the \abbrSMB of $\poly(\vct{X})$ when the product operator is idempotent.
%
% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@ -111,14 +106,22 @@ Consider $\poly(X, Y) = (X + Y)(X + Y)$ where $X$ and $Y$ are from different blo
%
%\noindent The usefulness of this will reduction become clear in \Cref{lem:exp-poly-rpoly}.
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Definition}[Valid Worlds]
For probability distribution $\pd$, % and its corresponding probability mass function $\probOf$,
the set of valid worlds $\valworlds$ consists of all the worlds with probability value greater than $0$; i.e., for random world variable vector $\vct{W}$
\[
\valworlds = \comprehension{\vct{w}}{\probOf[\vct{W} = \vct{w}] > 0}
\]
\end{Definition}
%NEEDS to be moved to the appendix
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%\begin{Definition}[Valid Worlds]
%For probability distribution $\pd$, % and its corresponding probability mass function $\probOf$,
%the set of valid worlds $\valworlds$ consists of all the worlds with probability value greater than $0$; i.e., for random world variable vector $\vct{W}$
%\[
%\valworlds = \comprehension{\vct{w}}{\probOf[\vct{W} = \vct{w}] > 0}
%\]
%\end{Definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%END move to appendix
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%We state additional equivalences between $\poly(\vct{X})$ and $\rpoly(\vct{X})$ in \Cref{app:subsec-pre-poly-rpoly} and \Cref{app:subsec-prop-q-qtilde}.
\noindent Next, we show why the reduced form is useful for our purposes:
@ -135,7 +138,7 @@ the set of valid worlds $\valworlds$ consists of all the worlds with probability
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Lemma}\label{lem:exp-poly-rpoly}
Let $\pxdb$ be a \bi over variables $\vct{X} = \{X_1, \ldots, X_\numvar\}$ and with probability distribution $\pd$ induced by the tuple probability vector $\probAllTup = (\prob_1, \ldots, \prob_\numvar)$ over all $\vct{w}$ in $\valworlds$. For any \bi-lineage polynomial $\poly(\vct{X})$ based on $\pxdb$ and query $\query$ we have:
Let $\pdb$ be a \abbrBIDB over variables $\vct{X} = \{X_1, \ldots, X_\numvar\}$ and with probability distribution $\pd$ induced by the tuple probability vector $\probAllTup = (\prob_1, \ldots, \prob_\numvar)$ over all $\vct{w}$ in $\{0, 1\}^\numvar$. For any \abbrBIDB-lineage polynomial $\poly(\vct{X})$ based on $\pdb$ and query $\query$ we have:
% The expectation over possible worlds in $\poly(\vct{X})$ is equal to $\rpoly(\prob_1,\ldots, \prob_\numvar)$.
\begin{equation*}
\expct_{\vct{W}\sim \pd}\pbox{\poly(\vct{W})} = \rpoly(\probAllTup).
@ -159,8 +162,6 @@ to the variables $\vct{X}$. Intuitively, \Cref{lem:exp-poly-rpoly} states that w
\begin{Corollary}\label{cor:expct-sop}
If $\poly$ is a \bi-lineage polynomial already in \abbrSMB, then the expectation of $\poly$, i.e., $\expct\pbox{\poly} = \rpoly\left(\prob_1,\ldots, \prob_\numvar\right)$ can be computed in $\bigO{\size\inparen{\poly}}$, where $\size\inparen{\poly}$ (\Cref{def:size}) is proportional to the total number of multiplication/addition operators in $\poly$.
\end{Corollary}
\AH{Maybe state in terms of $\abs{\circuit}$.}
%\AH{What if $\poly$ is not in \abbrSMB form?}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

View File

@ -3,26 +3,25 @@
\subsection{Problem Definition}\label{sec:expression-trees}
We first formally define circuits, an encoding of polynomials that we use throughout the paper. Since we are particularly using \emph{lineage} circuits, we drop the term lineage and only refer to them as circuits.
%We first formally define circuits, an encoding of polynomials that we use throughout the paper.
%
For illustrative purposes consider the polynomial $\poly(\vct{X}) = 2X^2 + 3XY - 2Y^2$ over $\vct{X} = [X, Y]$.
%For illustrative purposes consider the polynomial $\poly(\vct{X}) = 2X^2 + 3XY - 2Y^2$ over $\vct{X} = [X, Y]$.
We represent query polynomials via {\em arithmetic circuits}~\cite{arith-complexity}, a standard way to represent polynomials over fields (particularly in the field of algebraic complexity) that we use for polynomials over $\mathbb N$ in the obvious way.
We represent query polynomials via {\em arithmetic circuits}~\cite{arith-complexity}, a standard way to represent polynomials over fields (particularly in the field of algebraic complexity) that we use for polynomials over $\mathbb N$ in the obvious way. Since we are particularly using \emph{lineage} circuits, we drop the term lineage and only refer to them as circuits.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Definition}[Circuit]\label{def:circuit}
A circuit $\circuit$ is a Directed Acyclic Graph (DAG) whose source gates (in degree of $0$) consist of elements in either $\domN$ or $\vct{X}$. The internal gates have binary input and are either sum ($\circplus$) or product ($\circmult$) gates.
%
Each internal node in a circuit $\circuit$ has the following members: \type, \vpartial, \vari{input}, \degval, \vari{Lweight}, and \vari{Rweight}, where \type is the type of value stored in the gate (one of $\{\circplus, \circmult, \var, \tnum\}$ and \vari{input} is the list of the gate's inputs. The source gates have an additional member \val, which holds the value stored (constant or variable). We use $\circuit_\linput$ to denote the left input and $\circuit_\rinput$ the right input of a sink of circuit $\circuit$.
%The member \degval holds the degree of \circuit.
When the underlying DAG is a tree (with edges pointing towards the root), we will refer to the structure as an expression tree \etree. Note that in such a case, the root of \etree is analogous to the sink of \circuit.
Each gate has the following members: \type, \vpartial, \vari{input}, \degval, \vari{Lweight}, and \vari{Rweight}, where \type is the value type $\{\circplus, \circmult, \var, \tnum\}$ and \vari{input} the list of inputs. Source gates have an additional member \val storing the value. $\circuit_\linput$ ($\circuit_\rinput$) denotes the left (right) input.
\end{Definition}
When the underlying DAG is a tree (with edges pointing towards the root), the structure is an expression tree \etree. In such a case, the root of \etree is analogous to the sink of \circuit. We ignore the fields \vari{partial}, \degval, \vari{Lweight}, and \vari{Rweight} until \Cref{sec:algo}.\AH{\emph{I think} \degval is used only in appendix proofs.}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
As stated in \Cref{def:circuit}, every internal node has at most two in-edges, is labeled as an addition or a multiplication node, and has no limit on its outdegree.
Note that if we limit the outdegree to one, then we get expression trees.
We ignore the fields \vari{partial}, \degval, \vari{Lweight}, and \vari{Rweight} until \Cref{sec:algo}.\AH{We omit degree here too, which {\emph I think} is used only in appendix proofs.}
%As stated in \Cref{def:circuit}, every internal node has at most two incoming edges, is labeled as an addition or a multiplication node, and has no limit on its outdegree.
%Note that if we limit the outdegree to one, then we get expression trees.
\begin{Example}
@ -92,8 +91,7 @@ The circuit \circuit in \Cref{fig:circuit-express-tree} encodes the polynomial $
\caption{Example circuit encodings}
\end{figure}
The semantics of circuits follows the obvious interpretation. We next define its relationship with polynomials formally:
We next formally define the relationship of circuits with polynomials.
\begin{Definition}[$\polyf(\cdot)$]\label{def:poly-func}
Denote $\polyf(\circuit)$ to be the function from circuit $\circuit$ to its corresponding polynomial (in \abbrSMB).\footnote{Recall our assumption that unless otherwise mentioned, all polynomials are considered in $\abbrSMB$.} $\polyf(\cdot)$ is recursively defined on $\circuit$ as follows, with addition and multiplication following the standard interpretation for polynomials:
\begin{equation*}
@ -105,16 +103,15 @@ Denote $\polyf(\circuit)$ to be the function from circuit $\circuit$ to its corr
\end{equation*}
\end{Definition}
Note that $\circuit$ need not encode an expression in SMB. For instance, $\circuit$ could represent a compressed form of the running example, such as $(X + 2Y)(2X - Y)$, as shown in \Cref{fig:circuit}, while $\polyf(\circuit) = 2X^2+3XY-2Y^2$.
$\circuit$ need not encode $\poly\inparen{\vct{X}}$ in the same, default \abbrSMB representation. For instance, $\circuit$ could encode the factorized representation $(X + 2Y)(2X - Y)$ of $\poly\inparen{\vct{X}} = 2X^2+3XY-2Y^2$, as shown in \Cref{fig:circuit}, while $\polyf(\circuit) = \poly\inparen{\vct{X}}$, the equivalent \abbrSMB representation.
\begin{Definition}[Circuit Set]\label{def:circuit-set}
$\circuitset{\polyX}$ is the set of all possible circuits $\circuit$ such that $\polyf(\circuit) = \polyX$.\footnote{Again, the representation of $\polyX$ is $\abbrSMB$.}
$\circuitset{\polyX}$ is the set of all possible circuits $\circuit$ such that $\polyf(\circuit) = \polyX$.
\end{Definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
The circuit of \Cref{fig:circuit} is an element of $\circuitset{2X^2+3XY-2Y^2}$. One can think of $\circuitset{\polyX}$ as the infinite set of circuits each of which equal $\polyX$ when represented in $\abbrSMB$.
Note that \Cref{def:circuit-set} implies that $\circuit \in \circuitset{\polyf(\circuit)}$.
The circuit of \Cref{fig:circuit} is an element of $\circuitset{2X^2+3XY-2Y^2}$. One can think of $\circuitset{\polyX}$ as the infinite set of circuits where for each element \circuit, $\polyf\inparen{\circuit} = \polyX$.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\medskip
@ -122,7 +119,7 @@ Note that \Cref{def:circuit-set} implies that $\circuit \in \circuitset{\polyf(\
\noindent We are now ready to formally state our \textbf{main problem}.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Definition}[The Expected Result Multiplicity Problem]\label{def:the-expected-multipl}
Let $\vct{X} = (X_1, \ldots, X_n)$ and $\pxdb$ be an arbitrary $\semNX$-PDB over $\vct{X}$ with probability distribution $\pd$ over assignments $\vct{X} \to \{0,1\}$. Fix a query $\query$ and an output tuple $\tup$.
Let $\vct{X} = (X_1, \ldots, X_n)$ and $\pdb$ be an arbitrary \abbrBIDB-PDB over $\vct{X}$ with probability distribution $\pd$ over assignments $\vct{X} \to \{0,1\}^\numvar$. Fix a query $\query$ and an output tuple $\tup$.
The \expectProblem is defined as follows:\\[-7mm]
\begin{center}
\textbf{Input}: A circuit $\circuit \in \circuitset{\polyX}$ for $\polyX = \query(\pxdb)(t)$

View File

@ -5,7 +5,7 @@
\subsection{Probabilistic Databases}
The setting used in this section is primarily that of a bag-\abbrPDB query with set-\abbrPDB inputs. Recall, as noted in \cref{sec:intro-rewrite-070921}, this is not limiting.
While the setting used in this section is primarily that of a bag-\abbrPDB query with set-\abbrPDB inputs, recall, as noted in \cref{sec:intro-rewrite-070921}, this is not limiting. All proofs are located in the appendix.
An \textit{incomplete database} $\idb$ is a set of deterministic databases $\db$ called possible worlds.
Denote the schema of $\db$ as $\sch(\db)$. A \textit{probabilistic database} $\pdb$ is a pair $(\idb, \pd)$ where $\idb$ is an incomplete database and $\pd$ is a probability distribution over $\idb$. Queries over probabilistic databases are evaluated using the so-called possible world semantics. Under the possible world semantics, the result of a query $\query$ over an incomplete database $\idb$ is the set of query answers produced by evaluating $\query$ over each possible world: $\query(\idb) = \comprehension{\query(\db)}{\db \in \idb}$.
@ -14,43 +14,52 @@ For a probabilistic database $\pdb = (\idb, \pd)$, the result of a query is th
%
\[\forall \db \in \query(\idb): \pd'(\db) = \sum_{\db' \in \idb: \query(\db') = \db} \pd(\db') \]
Let $\semNX$ denote the set of polynomials over variables $\vct{X}=(X_1,\dots,X_\numvar)$ with natural number coefficients and exponents.
We model incomplete relations using Green et. al.'s $\semNX$-databases~\cite{DBLP:conf/pods/GreenKT07}, discussed in detail in \Cref{subsec:supp-mat-krelations}.
$\semNX$-databases are functions from tuples to elements of $\semNX$, typically called annotations.
Given an $\semNX$-database $\db$, it is common to use $\db(\tup)$ to denote the polynomial annotating tuple $\tup$ in $\db$.
%Note that based on this definition of $\rel$, $\rel(\tup)$ is the lineage polynomial for $\tup$.
Let $\numvar$ be the number of tuples in $\pdb$. Then, each possible world is defined by an assignment of $\numvar$ binary values $\vct{\wElem} \in \{0, 1\}^{\numvar}$ to $\vct{X}$.
The multiplicity of $\tup \in \db$, denoted $\db(\tup)(\vct{\wElem})$, is obtained by evaluating the polynomial annotating $\tup$ on $\vct{\wElem}$.
$\semNX$-relations are closed under $\raPlus$ (\Cref{fig:nxDBSemantics}).
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%NEEDS to be moved to the appendix.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%Let $\semNX$ denote the set of polynomials over variables $\vct{X}=(X_1,\dots,X_\numvar)$ with natural number coefficients and exponents.
%We model incomplete relations using Green et. al.'s $\semNX$-databases~\cite{DBLP:conf/pods/GreenKT07}, discussed in detail in \Cref{subsec:supp-mat-krelations}.
% $\semNX$-databases are functions from tuples to elements of $\semNX$, typically called annotations.
%Given an $\semNX$-database $\db$, it is common to use $\db(\tup)$ to denote the polynomial annotating tuple $\tup$ in $\db$.
%%Note that based on this definition of $\rel$, $\rel(\tup)$ is the lineage polynomial for $\tup$.
%Let $\numvar$ be the number of tuples in $\pdb$. Then, each possible world is defined by an assignment of $\numvar$ binary values $\vct{\wElem} \in \{0, 1\}^{\numvar}$ to $\vct{X}$.
%The multiplicity of $\tup \in \db$, denoted $\db(\tup)(\vct{\wElem})$, is obtained by evaluating the polynomial annotating $\tup$ on $\vct{\wElem}$.
%$\semNX$-relations are closed under $\raPlus$ (\Cref{fig:nxDBSemantics}).
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
%We will use $\semNX$-\abbrPDB $\pxdb$, defined as the tuple $(\idb_{\semNX}, \pd)$, where $\semNX$-database $\idb_{\semNX}$ is paired with probability distribution $\pd$ over the assignments to $\vct{X}$.
%We denote by $\polyForTuple$ the annotation of tuple $t$ in the result of $\query$ on an implicit $\semNX$-\abbrPDB (i.e., $\polyForTuple = \query(\pxdb)(t)$ for some $\pxdb$) and as before, interpret it as a function $\polyForTuple: \{0,1\}^{\numvar} \rightarrow \semN$ from vectors of variable assignments to the corresponding value of the annotating polynomial.
%$\semNX$-\abbrPDB\xplural and a function $\rmod$ (which transforms an $\semNX$-\abbrPDB to a classical bag-\abbrPDB, or $\semN$-\abbrPDB~\cite{DBLP:conf/pods/GreenKT07,feng:2019:sigmod:uncertainty}) are both formalized in \Cref{subsec:supp-mat-background}.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%END: move to appendix.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
We will use $\semNX$-\abbrPDB $\pxdb$, defined as the tuple $(\idb_{\semNX}, \pd)$, where $\semNX$-database $\idb_{\semNX}$ is paired with probability distribution $\pd$ over the assignments to $\vct{X}$.
We denote by $\polyForTuple$ the annotation of tuple $t$ in the result of $\query$ on an implicit $\semNX$-\abbrPDB (i.e., $\polyForTuple = \query(\pxdb)(t)$ for some $\pxdb$) and as before, interpret it as a function $\polyForTuple: \{0,1\}^{\numvar} \rightarrow \semN$ from vectors of variable assignments to the corresponding value of the annotating polynomial.
$\semNX$-\abbrPDB\xplural and a function $\rmod$ (which transforms an $\semNX$-\abbrPDB to a classical bag-\abbrPDB, or $\semN$-\abbrPDB~\cite{DBLP:conf/pods/GreenKT07,feng:2019:sigmod:uncertainty}) are both formalized in \Cref{subsec:supp-mat-background}.
\begin{Proposition}[Expectation of polynomials]\label{prop:expection-of-polynom}
Given an $\semN$-\abbrPDB $\pdb = (\idb,\pd)$ and $\semNX$-\abbrPDB $\pxdb = (\idb_{\semNX}',\pd')$ where $\rmod(\pxdb) = \pdb$, we have:
$ \expct_{\randDB \sim \pd}[\query(\randDB)(t)] = \expct_{\randWorld\sim \pd'}\pbox{\polyForTuple(\randWorld)}. $
Given an $\semN$-\abbrPDB $\pdb = (\idb,\pd)$ and equivalent polynomial $\polyForTuple$ for aribitrary tuple $\tup \in \pdb$,%$\semNX$-\abbrPDB $\pxdb = (\idb_{\semNX}',\pd')$ where $\rmod(\pxdb) = \pdb$,
we have:
$ \expct_{\randDB \sim \pd}[\query(\randDB)(t)] = \expct_{\randWorld\sim \pd'}\pbox{\poly_{\query, \tup}(\randWorld)}. $
\footnote{Although assumed by most prior work on set-probabilistic databases, e.g., as an obvious consequence of~\cite{IL84a}'s Theorem 7.1, we are unaware of any formal proof for bag-probabilistic databases.}
\end{Proposition}
\noindent A formal proof of \Cref{prop:expection-of-polynom} is given in \Cref{subsec:expectation-of-polynom-proof}.
This proposition shows that computing expected tuple multiplicities is equivalent to computing the expectation of a polynomial (for that tuple) from a probability distribution over all possible assignments of variables in the polynomial to $\{0,1\}$.
We focus on this problem from now on, assume an implicit result tuple, and so drop the subscript from $\polyForTuple$ (i.e., $\poly$ will denote a polynomial).
We focus on this problem from now on, assume an implicit result tuple, and so drop the subscript from $\poly_{\query, \tup}$ (i.e., $\poly$ will denote a polynomial).
\subsubsection{\tis and \bis}
\label{subsec:tidbs-and-bidbs}
In this paper, we focus on two popular forms of \abbrPDB\xplural: Block-Independent (\bi) and Tuple-Independent (\ti) \abbrPDB\xplural.
%
A \bi $\pxdb = (\idb_{\semNX}, \pd)$ is an $\semNX$-\abbrPDB such that (i) every tuple is annotated with either $0$ (i.e., the tuple does not exist) or a unique variable $X_i$ and (ii) that the tuples $\tup$ of $\pxdb$ for which $\pxdb(\tup) \neq 0$ can be partitioned into a set of blocks such that variables from separate blocks are independent of each other and variables from the same block are disjoint events.
A \bi $\pdb$ is a \abbrPDB such that (i) every tuple is annotated with either $0$ (i.e., the tuple does not exist) or a unique variable $X_i$ and (ii) that the tuples $\tup$ of $\pdb$ for which $\pdb(\tup) \neq 0$ can be partitioned into a set of blocks such that variables from separate blocks are independent of each other and variables from the same block are disjoint events.
In other words, each random variable corresponds to the event of a single tuple's presence.
%
A \emph{\ti} is a \bi where each block contains exactly one tuple.
\Cref{subsec:supp-mat-ti-bi-def} explains \tis and \bis in greater detail.
%
In a \bi (and by extension a \ti) $\pxdb$, tuples are partitioned into $\ell$ blocks $\block_1, \ldots, \block_\ell$ where tuple $t_{i,j} \in \block_i$ is associated with a probability $\prob_{\tup_{i,j}} = \probOf[X_{i,j} = 1]$, and is annotated with a unique variable $X_{i,j}$.\footnote{
In a \bi (and by extension a \ti), tuples are partitioned into $\ell$ blocks $\block_1, \ldots, \block_\ell$ where tuple $t_{i,j} \in \block_i$ is associated with a probability $\prob_{\tup_{i,j}} = \probOf[X_{i,j} = 1]$, and is annotated with a unique variable $X_{i,j}$.\footnote{
Although only a single independent, $[\abs{\block_i}+1]$-valued variable is customarily used per block, we decompose it into $\abs{\block_i}$ correlated $\{0,1\}$-valued variables per block that can be used directly in polynomials (without an indicator function). For $t_{i, j} \in b_i$, the event $(X_{i,j} = 1)$ corresponds to the event $(X_i = j)$ in the customary annotation scheme.
}
Because blocks are independent and tuples from the same block are disjoint, the probabilities $\prob_{\tup_{i,j}}$ and the blocks induce the probability distribution $\pd$ of $\pxdb$.
Because blocks are independent and tuples from the same block are disjoint, the probabilities $\prob_{\tup_{i,j}}$ and the blocks induce the probability distribution $\pd$ of $\pdb$.
We will write a \bi-lineage polynomial $\poly(\vct{X})$ for a \bi with $\ell$ blocks as
$\poly(\vct{X})$ = $\poly(X_{1, 1},\ldots, X_{1, \abs{\block_1}},$ $\ldots, X_{\ell, \abs{\block_\ell}})$, where $\abs{\block_i}$ denotes the size of $\block_i$.\footnote{Later on in the paper, especially in \Cref{sec:algo}, we will overload notation and rename the variables as $X_1,\dots,X_n$, where $n=\sum_{i=1}^\ell \abs{b_i}$.}