Conformed S2 to notation convention for probabilities.

This commit is contained in:
Aaron Huber 2020-12-19 23:19:02 -05:00
parent 41f1d6dd38
commit 9aaf254977
3 changed files with 10 additions and 10 deletions

View file

@ -21,7 +21,7 @@
\newcommand{\pxdb}{\mathbf{D}}
\newcommand{\nxdb}{D(\vct{X})}%\mathbb{N}[\vct{X}] db
\newcommand{\tset}{\mathcal{T}}%the set of tuples in a database
\newcommand{\pd}{P}%pd for probability distribution
\newcommand{\pd}{\vct{P}}%pd for probability distribution
\newcommand{\eval}[1]{\llbracket #1 \rrbracket}%evaluation double brackets
\newcommand{\evald}[2]{\eval{{#1}}_{#2}}
\newcommand{\query}{Q}

View file

@ -52,7 +52,7 @@ We call a polynomial $\query(\vct{X})$ a \emph{\bi-lineage polynomial} (resp., \
there exists a $\raPlus$ query $\query$, \bi $\pxdb$ (\ti $\pxdb$, or $\semNX$-PDB $\pxdb$), and tuple $\tup$ such that $\query(\vct{X}) = \query(\pxdb)(\tup)$. % Before proceeding, note that the following is assume that polynomials are \bis (which subsume \tis as a special case).
As they are a special case of \bis, the following applies to \tis as well.
Recall that in a \bi $\pxdb$ with tuples $t_1, \ldots, t_n$, each input tuple $t_i$ is annotated with a unique variable $X_i$.
Tuples of $\pxdb$ are partitioned into $\ell$ blocks $\block_1, \ldots, \block_\ell$ where tuple $t_i$ is associated with a probability $\prob(\tup_i) = \pd[X_i = 1]$.\footnote{
Tuples of $\pxdb$ are partitioned into $\ell$ blocks $\block_1, \ldots, \block_\ell$ where tuple $t_i$ is associated with a probability $\prob_{\tup_i} = \pd[X_i = 1]$.\footnote{
Note the deviation from the more common approach of defining a single independent, $[\abs{\block_i}+1]$-valued variable per block; Here we define $\abs{\block_i}$ correlated variables per block.
}
Because blocks are independent and tuples from the same block are disjoint, $\prob$ and the blocks induce the probability distribution $\pd$ of $\pxdb$.
@ -123,9 +123,9 @@ Consider $\poly(X, Y) = (X + Y)(X + Y)$ where $X$ and $Y$ are from different blo
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Definition}[Valid Worlds]
For probability distribution $\vct{P}$ and its corresponding PMF $P$, the set of valid worlds $\eta$ is the worlds with probability value greater than $0$; i.e., for variable vector $\vct{W}$
For probability distribution $\probDist$ and its corresponding PMF $\probOf$, the set of valid worlds $\eta$ is the worlds with probability value greater than $0$; i.e., for variable vector $\vct{W}$
\[
\eta = \{\vct{w}\st P[\vct{W} = \vct{w}] > 0\}
\eta = \{\vct{w}\st \probOf[\vct{W} = \vct{w}] > 0\}
\]
\end{Definition}
@ -143,10 +143,10 @@ We state additional equivalences between $\poly(\vct{X})$ and $\rpoly(\vct{X})$
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Lemma}\label{lem:exp-poly-rpoly}
Let $\pxdb$ be a \bi over variables $\vct{X} = \{X_1, \ldots, X_\numvar\}$ and with probability distribution $\vct{p} = (\prob_1, \ldots, \prob_\numvar)$ over all $\vct{w}$ in $\eta$. For any \bi-lineage polynomial $\poly(\vct{X})$ based on $\pxdb$ and query $\query$ we have:
Let $\pxdb$ be a \bi over variables $\vct{X} = \{X_1, \ldots, X_\numvar\}$ and with probability distribution $\probDist$ produced by the tuple probability vector $\probAllTup = (\prob_1, \ldots, \prob_\numvar)$ over all $\vct{w}$ in $\eta$. For any \bi-lineage polynomial $\poly(\vct{X})$ based on $\pxdb$ and query $\query$ we have:
% The expectation over possible worlds in $\poly(\vct{X})$ is equal to $\rpoly(\prob_1,\ldots, \prob_\numvar)$.
\begin{equation*}
\expct_{\vct{w}\sim \vct{p}}\pbox{\poly(\vct{W})} = \rpoly(\vct{p}).
\expct_{\vct{W}\sim \probDist}\pbox{\poly(\vct{W})} = \rpoly(\probAllTup).
\end{equation*}
\end{Lemma}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

View file

@ -11,7 +11,7 @@ Denote the schema of $\db$ as $\sch(\db)$. A \textit{probabilistic database} $\p
\[\query(\idb) = \comprehension{\query(\db)}{\db \in \idb}\]
For a probabilistic database $\pdb = (\idb, \pd)$, the result of a query is the pair $(\query(\idb), \pd')$ where $\pd'$ is a probability distribution over $\query(\idb)$ that assigns to each possible query result the sum of the probabilities of the worlds that produce this answer:
\[\forall \db \in \query(\idb): \pd'(\db) = \sum_{\db' \in \idb: \query(\db') = \db} \pd(\db') \]
\[\forall \db \in \query(\idb): \probOf'(\db) = \sum_{\db' \in \idb: \query(\db') = \db} \probOf(\db') \]
Note that in this work we consider multisets, i.e., each possible world is a set of multiset relations and queries are evaluated using bag semantics. We will use K-relations to model multisets. A \emph{K-relation}~\cite{DBLP:conf/pods/GreenKT07} is a relation whose tuples are annotated with elements from a commutative semiring $\semK = (\domK, \addK, \multK, \zeroK, \oneK)$. A commutative semiring is a structure with a domain $\domK$ and associative and commutative binary operations $\addK$ and $\multK$ such that $\multK$ distributes over $\addK$, $\zeroK$ is the identity of $\addK$, $\oneK$ is the identity of $\multK$, and $\zeroK$ annihilates all elements of $\domK$ when combined by $\multK$.
Let $\udom$ be a countable domain of values.
@ -20,10 +20,10 @@ A $\semK$-database is a set of $\semK$-relations. It will be convenient to also
We review positive relational algebra semantics for $\semK$-relations below.
Consider the semiring $\semN = (\domN,+,\times,0,1)$ of natural numbers. $\semN$-databases model bag semantics by annotating each tuple with its multiplicity. A probabilistic $\semN$-database ($\semN$-PDB) is a PDB where each possible world is an $\semN$-database. We study the problem of computing statistical moments for query results over such databases. Specifically, given a probabilistic $\semN$-database $\pdb = (\idb, \pd)$, query $\query$, and possible result $t$, we treat $\query(\db)(t)$ as a random $\semN$-valued variable and are interested in computing its expectation $\expct_{\idb \sim \pd}[\query(\db)(t)]$:
Consider the semiring $\semN = (\domN,+,\times,0,1)$ of natural numbers. $\semN$-databases model bag semantics by annotating each tuple with its multiplicity. A probabilistic $\semN$-database ($\semN$-PDB) is a PDB where each possible world is an $\semN$-database. We study the problem of computing statistical moments for query results over such databases. Specifically, given a probabilistic $\semN$-database $\pdb = (\idb, \pd)$, query $\query$, and possible result $t$, we treat $\query(\db)(t)$ as a random $\semN$-valued variable and are interested in computing its expectation $\expct_{\idb \sim \probDist}[\query(\db)(t)]$:
%
\begin{align}\label{eq:bag-expectation}
\expct_{\idb \sim \pd}[\query(\db)(t)] = \sum_{\db \in \idb} \query(\db)(t) \cdot \pd(\db)
\expct_{\idb \sim \probDist}[\query(\db)(t)] = \sum_{\db \in \idb} \query(\db)(t) \cdot \probOf(\db)
\end{align}
%
Intuitively, the expectation of $\query(\db)(t)$ is the number of duplicates of $t$ we expect to find in result of query $\query$.
@ -59,7 +59,7 @@ $\semNX$-PDBs, a function $\rmod$, which takes an $\semNX$-PDB input and outputs
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Proposition}[Expectation of polynomials]\label{prop:expection-of-polynom}
Given an $\semN$-PDB $\pdb = (\idb,\pd)$ and $\semNX$-PDB $\pxdb = (\db,\pd')$ where $\rmod(\pxdb) = \pdb$:
\[ \expct_{\idb \sim \pd}[\query(\db)(t)] = \expct_{\vct{w} \sim \pd'}\pbox{\polyForTuple(\vct{w})} \]
\[ \expct_{\idb \sim \pd}[\query(\db)(t)] = \expct_{\vct{W} \sim \pd'}\pbox{\polyForTuple(\vct{W})} \]
\end{Proposition}
\noindent A formal proof of \Cref{prop:expection-of-polynom} is given in \Cref{subsec:expectation-of-polynom-proof}.
This proposition shows that computing expected tuple multiplicities is equivalent to computing the expectation of a polynomial (for that tuple) from a probability distribution over all possible assignments of variables in the polynomial to $\{0,1\}$.