Started pass on Sec 2 (Aaron)

2021-04-06 17:44:14 -04:00 · 2021-04-06 17:44:14 -04:00 · 69051b4949
parent f226af1dc3
commit 69051b4949
2 changed files with 20 additions and 27 deletions
--- a/poly-form.tex
+++ b/poly-form.tex
@ -8,7 +8,7 @@ We will use $(X + Y)^2$ as a running example.

 \begin{Definition}[Standard Monomial Basis]\label{def:smb}
 A monomial is a product of variable terms, each raised to a non-negative integer power.
-  A polynomial in \termSMB (\abbrSMB) has the form: $\sum_{i=1}^n c_i \cdot m_i$, where each $c_i \neq 0$ is an integer and each $m_i$ is a monomial and $m_i \neq m_j$ for $i \neq j$. The \abbrSMB of a polynomial $\poly$ is $\smbOf{\poly}$.
+  A polynomial in \termSMB (\abbrSMB) has the form $\sum_{i=1}^n c_i \cdot m_i$, i.e., a sum of $n$ terms, where each $c_i \neq 0$ is an integer, each $m_i$ is a monomial, and $m_i \neq m_j$ for $i \neq j$. The \abbrSMB of a polynomial $\poly$ is $\smbOf{\poly}$.
 %  fully expanded out such that no product of sums exist and where each unique monomial appears exactly once.
 \end{Definition}
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
--- a/ra-to-poly.tex
+++ b/ra-to-poly.tex
@ -3,42 +3,33 @@
 %\onecolumn
 \section{Background and Notation}\label{sec:background}

-\subsection{Prelim: Superlinearity of Bag PDBs}\label{sec:suplin-bags}
-Moving forward, we focus exclusively on bags.  The bag relations of \cref{fig:ex-shipping} are modeled by the atttribute $\Phi_{bag}$ (i.e., we can ignore the $\Phi_{set}$ attribute). 
-Consider the following product query, which can be thought of the set of all route pairs.
-\begin{equation}
-\poly^2_E():- Loc(\text{City}), Route(\text{City}_1, \text{City}_2), Loc(\text{City}'),  Loc(\text{City}''), Route(\text{City}_1', \text{City}_2'), Loc(\text{City}''')\label{eq:edge-query}
-\end{equation}
-%For an arbitrary polynomial, it is known that there may exist equivalent compressed representations.
-%One such compression is the factorized polynomial~\cite{factorized-db}, where the polynomial is broken up into separate factors.
-%For example:
-Consider the factorized representation of $\poly^2_E$:
+\subsection{Superlinearity of Bag PDBs}\label{sec:suplin-bags}
+Moving forward, we focus exclusively on bags.  For $Q():-$$OnTime(\text{City}), Route(\text{City}_1, \text{City}_2),$ $OnTime(\text{City}')$ over the bag relations of \cref{fig:ex-shipping-simp}, consider the product query $\poly^2():- Q \times Q$.
+The factorized representation of $\poly^2$ is (for simplicity we ignore the random variables of $Route$ since each variable has probability of $1$):
 \begin{equation*}
-\poly^2_E = \left(L_aL_b + L_bL_d + L_bL_c\right) \cdot \left(L_aL_b + L_bL_d + L_bL_c\right)
+\poly^2 = \left(L_aL_b + L_bL_d + L_bL_c\right) \cdot \left(L_aL_b + L_bL_d + L_bL_c\right)
 \end{equation*}
 The equivalent SOP representation is
 \begin{equation*}
 L_a^2L_b^2 + L_b^2L_d^2 + L_b^2L_c^2 + 2L_aL_b^2L_d + 2L_aL_b^2L_c + 2L_b^2L_dL_c.
 \end{equation*}
-The expectation $\expct\pbox{\poly^2_E()}$ then is:
+The expectation $\expct\pbox{\poly^2}$ then is:
 \begin{footnotesize}
 \begin{equation*}
 \expct\pbox{L_a^2}\expct\pbox{L_b^2} + \expct\pbox{L_b^2}\expct\pbox{L_d^2} + \expct\pbox{L_b^2}\expct\pbox{L_c^2} + 2\expct\pbox{L_a}\expct\pbox{L_b^2}\expct\pbox{L_d} + 2\expct\pbox{L_a}\expct\pbox{L_b^2}\expct\pbox{L_c} + 2\expct\pbox{L_b^2}\expct\pbox{L_d}\expct\pbox{L_c}
 \end{equation*}
 \end{footnotesize}
-%Recall the nice property of $\query$ that its expected count could be computed by evaluating its lineage on the probability vector (i.e., \Cref{eqn:can-inline-probabilities-into-polynomial}).
-%This property does not hold for $\poly^2$ (i.e., $\expct\pbox{\poly^2} \neq \poly^2(\probOf\pbox{W_a}, \probOf\pbox{W_b}, \probOf\pbox{W_c})$), but does suggest a related closed form formula.
 Note that if $Dom(W_i) = \{0, 1\}$, then for any $k > 0$, $\expct\pbox{W_i^k} = \expct\pbox{W_i}$.
 This property leads us to consider a structure related to $\poly$.
 \begin{Definition}\label{def:reduced-poly}
 For any polynomial $\poly(\vct{X})$, define the \emph{reduced polynomial} $\rpoly(\vct{X})$ to be the polynomial obtained by setting all exponents $e > 1$ in $\poly(\vct{X})$ to $1$.
 \end{Definition}
-With $\poly^2_E$ as an example, we have:
+With $\poly^2$ as an example, we have:
 \begin{align*}
-\rpoly^2_E(L_a, L_b, L_c, L_d)
+\rpoly^2(L_a, L_b, L_c, L_d)
 =&\; L_aL_b + L_bL_d + L_bL_c + 2L_aL_bL_d + 2L_aL_bL_c + 2L_bL_cL_d
 \end{align*}
-It can be verified that the reduced polynomial is a closed form of the expected count (i.e., $\expct\pbox{\poly^2_E} = \rpoly_E(\probOf\pbox{L_a=1}, \probOf\pbox{L_b=1}, \probOf\pbox{L_c=1}), \probOf\pbox{L_d=1})$).
+It can be verified that the reduced polynomial is a closed form of the expected count (i.e., $\expct\pbox{\poly^2} = \rpoly^2(\probOf\pbox{L_a=1}, \probOf\pbox{L_b=1}, \probOf\pbox{L_c=1}, \probOf\pbox{L_d=1})$).

 The reduced form of a lineage polynomial can be obtained, but doing so requires a linear scan over the clauses of an SOP encoding of the polynomial.  Note that for a compressed representation, this scheme would require an exponential number of computations in the size of the compressed representation.  In \Cref{sec:hard}, we use $\rpoly$ to prove our hardness results.
 %In prior work on lineage-based Bag-PDBs~\cite{kennedy:2010:icde:pip,DBLP:conf/vldb/AgrawalBSHNSW06,yang:2015:pvldb:lenses} where this encoding is implicitly assumed, computing the expected count is linear in the size of the encoding.
@ -59,7 +50,7 @@ Denote the schema of $\db$ as $\sch(\db)$. A \textit{probabilistic database} $\p
 \[\query(\idb) = \comprehension{\query(\db)}{\db \in \idb}\]

 For a probabilistic  database $\pdb = (\idb, \pd)$,  the result of a query is the pair $(\query(\idb), \pd')$ where $\pd'$ is a probability distribution over $\query(\idb)$  that assigns to each possible query result the sum of the probabilities of the worlds that produce this answer:
-\[\forall \db \in \query(\idb): \probOf'(\db) = \sum_{\db' \in \idb: \query(\db') = \db} \probOf(\db') \]
+\[\forall \db \in \query(\idb): \pd'(\db) = \sum_{\db' \in \idb: \query(\db') = \db} \pd(\db') \]

 Note that in this work, for the query output, we consider bags, i.e., each possible world in the query output is a set of bag relations and queries are evaluated using bag semantics. We will use $\domK$-relations to model bags. A \emph{$\domK$-relation}~\cite{DBLP:conf/pods/GreenKT07} is a relation whose tuples are annotated with elements from a commutative semiring $\semK = (\domK, \addK, \multK, \zeroK, \oneK)$.  A commutative semiring is a structure with a domain $\domK$ and associative and commutative binary operations $\addK$ and $\multK$ such that $\multK$ distributes over $\addK$, $\zeroK$ is the identity of $\addK$, $\oneK$ is the identity of $\multK$, and $\zeroK$ annihilates all elements of $\domK$ when combined by $\multK$.
 Let $\udom$ be a countable domain of values.
@ -68,12 +59,14 @@ A $\semK$-database is a set of $\semK$-relations. It will be convenient to also
 We review positive relational algebra semantics for $\semK$-relations below.


-Consider the semiring $\semN = (\domN,+,\times,0,1)$ of natural numbers. $\semN$-databases model bag semantics by annotating each tuple with its multiplicity. A  probabilistic $\semN$-database ($\semN$-PDB) is a PDB where each possible world is an $\semN$-database. We study the problem of computing statistical moments for query results over such databases.  Specifically, given a probabilistic $\semN$-database $\pdb = (\idb, \pd)$, query $\query$, and possible result tuple $t$,  we treat $\query(\db)(t)$ as a random $\semN$-valued variable and are interested in computing its expectation  $\expct_{\idb \sim \probDist}[\query(\db)(t)]$:
-%
-\begin{equation}\label{eq:bag-expectation}
-\expct_{\idb \sim \probDist}[\query(\db)(t)] = \sum_{\db \in \idb} \query(\db)(t) \cdot \probOf(\db)
-\end{equation}
-%
+Consider the semiring $\semN = (\domN,+,\times,0,1)$ of natural numbers. $\semN$-databases model bag semantics by annotating each tuple with its multiplicity. A  probabilistic $\semN$-database ($\semN$-PDB) is a PDB where each possible world is an $\semN$-database. We study the problem of computing statistical moments for query results over such databases.  Specifically, given a probabilistic $\semN$-database $\pdb = (\idb, \pd)$, query $\query$, and possible result tuple $t$,  we use $\query(\db)(t)$ for $\db \in \idb$ as input to compute the expected multiplicity defined in \cref{eq:intro-bag-expectation}.  Note that each tuple in the tables $OnTime$ and $Route$ of \cref{fig:ex-shipping-simp} carries an implicit $\semN$-valued annotation of $1$.
+
+%\cref{ex:intro-tbls} and \cref{ex:intro-lineage} $\semN$-valued variable and are interested in computing its expectation  $\expct_{\idb \sim \probDist}[\query(\db)(t)]$:
+%%
+%\begin{equation}\label{eq:bag-expectation}
+%\expct_{\idb \sim \probDist}[\query(\db)(t)] = \sum_{\db \in \idb} \query(\db)(t) \cdot \probOf(\db)
+%\end{equation}
+%%
 Intuitively, the expectation of $\query(\db)(t)$ is the number of duplicates of $t$ we expect to find in the result of query $\query$.

 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@ -101,13 +94,13 @@ Let $\semNX$ denote the set of polynomials over variables $\vct{X}$ with natural
 Consider now the semiring $(\semNX, +, \cdot, 0, 1)$ whose domain is $\semNX$, with the standard addition and multiplication of polynomials. 
 We will use $\semNX$-PDB $\pxdb$, defined as the tuple $(\idb_{\semNX}, \pd)$, where $\semNX$-database $\idb_{\semNX}$ is paired with probability distribution $\pd$.  
 We denote by $\polyForTuple$ the annotation of tuple $t$ in the result of $\query$ on an implicit $\semNX$-PDB (i.e., $\polyForTuple = \query(\pxdb)(t)$ for some $\pxdb$) and as before, interpret it as a function $\polyForTuple: \{0,1\}^{|\vct X|} \rightarrow \semN$ from vectors of variable assignments to the corresponding value of the annotating polynomial.
-$\semNX$-PDBs and a function $\rmod$ from an $\semNX$-PDB to an equivalent $\semN$-PDB are both formalized in \Cref{subsec:supp-mat-background}.
+$\semNX$-PDBs and a function $\rmod$ (which transforms an $\semNX$-PDB to an equivalent $\semN$-PDB) are both formalized in \Cref{subsec:supp-mat-background}.

 
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \begin{Proposition}[Expectation of polynomials]\label{prop:expection-of-polynom}
  Given an $\semN$-PDB $\pdb = (\idb,\pd)$ and $\semNX$-PDB $\pxdb = (\idb_{\semNX}',\pd')$ where $\rmod(\pxdb) = \pdb$:
-  \[ \expct_{\idb \sim \pd}[\query(\db)(t)] = \expct_{\vct{W} \sim \pd'}\pbox{\polyForTuple(\vct{W})} \]
+  \[ \expct_{\idb \sim \pd}[\query(\idb)(t)] = \expct_{\vct{W} \sim \pd'}\pbox{\polyForTuple(\vct{W})} \]
 \end{Proposition}
 \noindent A formal proof of \Cref{prop:expection-of-polynom} is given in \Cref{subsec:expectation-of-polynom-proof}.  
 This proposition shows that computing expected tuple multiplicities is equivalent to computing the expectation of a polynomial (for that tuple) from a probability distribution over all possible assignments of variables in the polynomial to $\{0,1\}$.