Conformed S2 to notation convention for probabilities.

2020-12-19 23:19:02 -05:00 · 2020-12-19 23:19:02 -05:00 · 9aaf254977
parent 41f1d6dd38
commit 9aaf254977
3 changed files with 10 additions and 10 deletions
--- a/macros.tex
+++ b/macros.tex
@ -21,7 +21,7 @@
 \newcommand{\pxdb}{\mathbf{D}}%bold D, the symbol used in the text for a probabilistic database
 \newcommand{\nxdb}{D(\vct{X})}%\mathbb{N}[\vct{X}] db
 \newcommand{\tset}{\mathcal{T}}%the set of tuples in a database
-\newcommand{\pd}{P}%pd for probability distribution
+\newcommand{\pd}{\vct{P}}%pd for probability distribution
 \newcommand{\eval}[1]{\llbracket #1 \rrbracket}%evaluation double brackets
 \newcommand{\evald}[2]{\eval{{#1}}_{#2}}%\eval of #1 with #2 attached as a subscript
 \newcommand{\query}{Q}%symbol for a query
--- a/poly-form.tex
+++ b/poly-form.tex
@ -52,7 +52,7 @@ We call a polynomial $\query(\vct{X})$ a \emph{\bi-lineage polynomial} (resp., \
 there exists a $\raPlus$ query $\query$, \bi $\pxdb$ (\ti $\pxdb$, or $\semNX$-PDB $\pxdb$), and tuple $\tup$ such that $\query(\vct{X}) = \query(\pxdb)(\tup)$. % Before proceeding, note that the following is assume that polynomials are  \bis (which subsume \tis as a special case).
 As they are a special case of \bis, the following applies to \tis as well.
 Recall that in a \bi $\pxdb$ with tuples $t_1, \ldots, t_n$, each input tuple $t_i$ is annotated with a unique variable $X_i$. 
-Tuples of $\pxdb$ are partitioned into $\ell$ blocks $\block_1, \ldots, \block_\ell$ where tuple $t_i$ is associated with a probability $\prob(\tup_i) = \pd[X_i = 1]$.\footnote{
+Tuples of $\pxdb$ are partitioned into $\ell$ blocks $\block_1, \ldots, \block_\ell$ where tuple $t_i$ is associated with a probability $\prob_{\tup_i} = \probOf[X_i = 1]$.\footnote{
  Note the deviation from the more common approach of defining a single independent, $[\abs{\block_i}+1]$-valued variable per block; here we define $\abs{\block_i}$ correlated variables per block.
 } 
 Because blocks are independent and tuples from the same block are disjoint, $\prob$ and the blocks induce the probability distribution $\pd$ of $\pxdb$.
@ -123,9 +123,9 @@ Consider $\poly(X, Y) = (X + Y)(X + Y)$ where $X$ and $Y$ are from different blo
 %
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \begin{Definition}[Valid Worlds]
-For probability distribution $\vct{P}$ and its corresponding PMF $P$, the set of valid worlds $\eta$ is the worlds with probability value greater than $0$; i.e., for variable vector $\vct{W}$
+For probability distribution $\probDist$ and its corresponding PMF $\probOf$, the set of valid worlds $\eta$ consists of the worlds with probability greater than $0$; i.e., for variable vector $\vct{W}$
 \[
-\eta = \{\vct{w}\st P[\vct{W} = \vct{w}] > 0\}
+\eta = \{\vct{w}\st \probOf[\vct{W} = \vct{w}] > 0\}
 \]
 \end{Definition}

@ -143,10 +143,10 @@ We state additional equivalences between $\poly(\vct{X})$ and $\rpoly(\vct{X})$

 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \begin{Lemma}\label{lem:exp-poly-rpoly}
-Let $\pxdb$ be a \bi over variables $\vct{X} = \{X_1, \ldots, X_\numvar\}$ and with probability distribution $\vct{p} = (\prob_1, \ldots, \prob_\numvar)$ over all $\vct{w}$ in $\eta$. For any \bi-lineage polynomial $\poly(\vct{X})$ based on $\pxdb$ and query $\query$ we have:
+Let $\pxdb$ be a \bi over variables $\vct{X} = \{X_1, \ldots, X_\numvar\}$ and with probability distribution $\probDist$ produced by the tuple probability vector $\probAllTup = (\prob_1, \ldots, \prob_\numvar)$ over all $\vct{w}$ in $\eta$. For any \bi-lineage polynomial $\poly(\vct{X})$ based on $\pxdb$ and query $\query$ we have:
  % The expectation over possible worlds in $\poly(\vct{X})$ is equal to $\rpoly(\prob_1,\ldots, \prob_\numvar)$.
 \begin{equation*}
-\expct_{\vct{w}\sim \vct{p}}\pbox{\poly(\vct{W})}  = \rpoly(\vct{p}).
+\expct_{\vct{W}\sim \probDist}\pbox{\poly(\vct{W})}  = \rpoly(\probAllTup).
 \end{equation*}
 \end{Lemma}
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
--- a/ra-to-poly.tex
+++ b/ra-to-poly.tex
@ -11,7 +11,7 @@ Denote the schema of $\db$ as $\sch(\db)$. A \textit{probabilistic database} $\p
 \[\query(\idb) = \comprehension{\query(\db)}{\db \in \idb}\]

 For a probabilistic  database $\pdb = (\idb, \pd)$,  the result of a query is the pair $(\query(\idb), \pd')$ where $\pd'$ is a probability distribution over $\query(\idb)$  that assigns to each possible query result the sum of the probabilities of the worlds that produce this answer:
-\[\forall \db \in \query(\idb): \pd'(\db) = \sum_{\db' \in \idb: \query(\db') = \db} \pd(\db') \]
+\[\forall \db \in \query(\idb): \probOf'(\db) = \sum_{\db' \in \idb: \query(\db') = \db} \probOf(\db') \]

 Note that in this work we consider multisets, i.e., each possible world is a set of multiset relations and queries are evaluated using bag semantics. We will use K-relations to model multisets. A \emph{K-relation}~\cite{DBLP:conf/pods/GreenKT07} is a relation whose tuples are annotated with elements from a commutative semiring $\semK = (\domK, \addK, \multK, \zeroK, \oneK)$.  A commutative semiring is a structure with a domain $\domK$ and associative and commutative binary operations $\addK$ and $\multK$ such that $\multK$ distributes over $\addK$, $\zeroK$ is the identity of $\addK$, $\oneK$ is the identity of $\multK$, and $\zeroK$ annihilates all elements of $\domK$ when combined by $\multK$.
 Let $\udom$ be a countable domain of values.
@ -20,10 +20,10 @@ A $\semK$-database is a set of $\semK$-relations. It will be convenient to also
 We review positive relational algebra semantics for $\semK$-relations below.


-Consider the semiring $\semN = (\domN,+,\times,0,1)$ of natural numbers. $\semN$-databases model bag semantics by annotating each tuple with its multiplicity. A  probabilistic $\semN$-database ($\semN$-PDB) is a PDB where each possible world is an $\semN$-database. We study the problem of computing statistical moments for query results over such databases.  Specifically, given a probabilistic $\semN$-database $\pdb = (\idb, \pd)$, query $\query$, and possible result $t$,  we treat $\query(\db)(t)$ as a random $\semN$-valued variable and are interested in computing its expectation  $\expct_{\idb \sim \pd}[\query(\db)(t)]$:
+Consider the semiring $\semN = (\domN,+,\times,0,1)$ of natural numbers. $\semN$-databases model bag semantics by annotating each tuple with its multiplicity. A  probabilistic $\semN$-database ($\semN$-PDB) is a PDB where each possible world is an $\semN$-database. We study the problem of computing statistical moments for query results over such databases.  Specifically, given a probabilistic $\semN$-database $\pdb = (\idb, \pd)$, query $\query$, and possible result $t$,  we treat $\query(\db)(t)$ as a random $\semN$-valued variable and are interested in computing its expectation  $\expct_{\idb \sim \probDist}[\query(\db)(t)]$:
 %
 \begin{align}\label{eq:bag-expectation}
-\expct_{\idb \sim \pd}[\query(\db)(t)] = \sum_{\db \in \idb} \query(\db)(t) \cdot \pd(\db)
+\expct_{\idb \sim \probDist}[\query(\db)(t)] = \sum_{\db \in \idb} \query(\db)(t) \cdot \probOf(\db)
 \end{align}
 %
 Intuitively, the expectation of $\query(\db)(t)$ is the number of duplicates of $t$ we expect to find in the result of query $\query$.
@ -59,7 +59,7 @@ $\semNX$-PDBs, a function $\rmod$, which takes an $\semNX$-PDB input and outputs
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \begin{Proposition}[Expectation of polynomials]\label{prop:expection-of-polynom}
  Given an $\semN$-PDB $\pdb = (\idb,\pd)$ and $\semNX$-PDB $\pxdb = (\db,\pd')$ where $\rmod(\pxdb) = \pdb$:
-  \[ \expct_{\idb \sim \pd}[\query(\db)(t)] = \expct_{\vct{w} \sim \pd'}\pbox{\polyForTuple(\vct{w})} \]
+  \[ \expct_{\idb \sim \pd}[\query(\db)(t)] = \expct_{\vct{W} \sim \pd'}\pbox{\polyForTuple(\vct{W})} \]
 \end{Proposition}
 \noindent A formal proof of \Cref{prop:expection-of-polynom} is given in \Cref{subsec:expectation-of-polynom-proof}.  
 This proposition shows that computing expected tuple multiplicities is equivalent to computing the expectation of a polynomial (for that tuple) from a probability distribution over all possible assignments of variables in the polynomial to $\{0,1\}$.