Started pass on Sec 2 (Aaron)

This commit is contained in:
Aaron Huber 2021-04-06 17:44:14 -04:00
parent f226af1dc3
commit 69051b4949
2 changed files with 20 additions and 27 deletions

View file

@ -8,7 +8,7 @@ We will use $(X + Y)^2$ as a running example.
\begin{Definition}[Standard Monomial Basis]\label{def:smb}
A monomial is a product of variable terms, each raised to a non-negative integer power.
A polynomial in \termSMB (\abbrSMB) has the form: $\sum_{i=1}^n c_i \cdot m_i$, where each $c_i \neq 0$ is an integer and each $m_i$ is a monomial and $m_i \neq m_j$ for $i \neq j$. The \abbrSMB of a polynomial $\poly$ is $\smbOf{\poly}$.
A polynomial with $n$ terms in \termSMB (\abbrSMB) has the form: $\sum_{i=1}^n c_i \cdot m_i$, where each $c_i \neq 0$ is an integer, each $m_i$ is a monomial, and $m_i \neq m_j$ for $i \neq j$. The \abbrSMB of a polynomial $\poly$ is $\smbOf{\poly}$.
% fully expanded out such that no product of sums exist and where each unique monomial appears exactly once.
\end{Definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

View file

@ -3,42 +3,33 @@
%\onecolumn
\section{Background and Notation}\label{sec:background}
\subsection{Prelim: Superlinearity of Bag PDBs}\label{sec:suplin-bags}
Moving forward, we focus exclusively on bags. The bag relations of \cref{fig:ex-shipping} are modeled by the attribute $\Phi_{bag}$ (i.e., we can ignore the $\Phi_{set}$ attribute).
Consider the following product query, which can be thought of as the set of all route pairs.
\begin{equation}
\poly^2_E():- Loc(\text{City}), Route(\text{City}_1, \text{City}_2), Loc(\text{City}'), Loc(\text{City}''), Route(\text{City}_1', \text{City}_2'), Loc(\text{City}''')\label{eq:edge-query}
\end{equation}
%For an arbitrary polynomial, it is known that there may exist equivalent compressed representations.
%One such compression is the factorized polynomial~\cite{factorized-db}, where the polynomial is broken up into separate factors.
%For example:
Consider the factorized representation of $\poly^2_E$:
\subsection{Superlinearity of Bag PDBs}\label{sec:suplin-bags}
Moving forward, we focus exclusively on bags. For $Q():-$$OnTime(\text{City}), Route(\text{City}_1, \text{City}_2),$ $OnTime(\text{City}')$ over the bag relations of \cref{fig:ex-shipping-simp}, consider the product query $\poly^2():- Q \times Q$.
The factorized representation of $\poly^2$ is (for simplicity we ignore the random variables of $Route$ since each variable has probability of $1$):
\begin{equation*}
\poly^2_E = \left(L_aL_b + L_bL_d + L_bL_c\right) \cdot \left(L_aL_b + L_bL_d + L_bL_c\right)
\poly^2 = \left(L_aL_b + L_bL_d + L_bL_c\right) \cdot \left(L_aL_b + L_bL_d + L_bL_c\right)
\end{equation*}
The equivalent SOP representation is
\begin{equation*}
L_a^2L_b^2 + L_b^2L_d^2 + L_b^2L_c^2 + 2L_aL_b^2L_d + 2L_aL_b^2L_c + 2L_b^2L_dL_c.
\end{equation*}
The expectation $\expct\pbox{\poly^2_E()}$ then is:
The expectation $\expct\pbox{\poly^2}$ then is:
\begin{footnotesize}
\begin{equation*}
\expct\pbox{L_a^2}\expct\pbox{L_b^2} + \expct\pbox{L_b^2}\expct\pbox{L_d^2} + \expct\pbox{L_b^2}\expct\pbox{L_c^2} + 2\expct\pbox{L_a}\expct\pbox{L_b^2}\expct\pbox{L_d} + 2\expct\pbox{L_a}\expct\pbox{L_b^2}\expct\pbox{L_c} + 2\expct\pbox{L_b^2}\expct\pbox{L_d}\expct\pbox{L_c}
\end{equation*}
\end{footnotesize}
%Recall the nice property of $\query$ that its expected count could be computed by evaluating its lineage on the probability vector (i.e., \Cref{eqn:can-inline-probabilities-into-polynomial}).
%This property does not hold for $\poly^2$ (i.e., $\expct\pbox{\poly^2} \neq \poly^2(\probOf\pbox{W_a}, \probOf\pbox{W_b}, \probOf\pbox{W_c})$), but does suggest a related closed form formula.
Note that if $Dom(W_i) = \{0, 1\}$, then for any $k > 0$, $\expct\pbox{W_i^k} = \expct\pbox{W_i}$.
This property leads us to consider a structure related to $\poly$.
\begin{Definition}\label{def:reduced-poly}
For any polynomial $\poly(\vct{X})$, define the \emph{reduced polynomial} $\rpoly(\vct{X})$ to be the polynomial obtained by setting all exponents $e > 1$ in $\poly(\vct{X})$ to $1$.
\end{Definition}
With $\poly^2_E$ as an example, we have:
With $\poly^2$ as an example, we have:
\begin{align*}
\rpoly^2_E(L_a, L_b, L_c, L_d)
\rpoly^2(L_a, L_b, L_c, L_d)
=&\; L_aL_b + L_bL_d + L_bL_c + 2L_aL_bL_d + 2L_aL_bL_c + 2L_bL_cL_d
\end{align*}
It can be verified that the reduced polynomial is a closed form of the expected count (i.e., $\expct\pbox{\poly^2_E} = \rpoly^2_E(\probOf\pbox{L_a=1}, \probOf\pbox{L_b=1}, \probOf\pbox{L_c=1}, \probOf\pbox{L_d=1})$).
It can be verified that the reduced polynomial is a closed form of the expected count (i.e., $\expct\pbox{\poly^2} = \rpoly^2(\probOf\pbox{L_a=1}, \probOf\pbox{L_b=1}, \probOf\pbox{L_c=1}, \probOf\pbox{L_d=1})$).
The reduced form of a lineage polynomial can be obtained but requires a linear scan over the clauses of an SOP encoding of the polynomial. Note that for a compressed representation, this scheme would require an exponential number of computations in the size of the compressed representation. In \Cref{sec:hard}, we use $\rpoly$ to prove our hardness results.
%In prior work on lineage-based Bag-PDBs~\cite{kennedy:2010:icde:pip,DBLP:conf/vldb/AgrawalBSHNSW06,yang:2015:pvldb:lenses} where this encoding is implicitly assumed, computing the expected count is linear in the size of the encoding.
@ -59,7 +50,7 @@ Denote the schema of $\db$ as $\sch(\db)$. A \textit{probabilistic database} $\p
\[\query(\idb) = \comprehension{\query(\db)}{\db \in \idb}\]
For a probabilistic database $\pdb = (\idb, \pd)$, the result of a query is the pair $(\query(\idb), \pd')$ where $\pd'$ is a probability distribution over $\query(\idb)$ that assigns to each possible query result the sum of the probabilities of the worlds that produce this answer:
\[\forall \db \in \query(\idb): \probOf'(\db) = \sum_{\db' \in \idb: \query(\db') = \db} \probOf(\db') \]
\[\forall \db \in \query(\idb): \pd'(\db) = \sum_{\db' \in \idb: \query(\db') = \db} \pd(\db') \]
Note that in this work, for the query output, we consider bags, i.e., each possible world in the query output is a set of bag relations and queries are evaluated using bag semantics. We will use $\domK$-relations to model bags. A \emph{$\domK$-relation}~\cite{DBLP:conf/pods/GreenKT07} is a relation whose tuples are annotated with elements from a commutative semiring $\semK = (\domK, \addK, \multK, \zeroK, \oneK)$. A commutative semiring is a structure with a domain $\domK$ and associative and commutative binary operations $\addK$ and $\multK$ such that $\multK$ distributes over $\addK$, $\zeroK$ is the identity of $\addK$, $\oneK$ is the identity of $\multK$, and $\zeroK$ annihilates all elements of $\domK$ when combined by $\multK$.
Let $\udom$ be a countable domain of values.
@ -68,12 +59,14 @@ A $\semK$-database is a set of $\semK$-relations. It will be convenient to also
We review positive relational algebra semantics for $\semK$-relations below.
Consider the semiring $\semN = (\domN,+,\times,0,1)$ of natural numbers. $\semN$-databases model bag semantics by annotating each tuple with its multiplicity. A probabilistic $\semN$-database ($\semN$-PDB) is a PDB where each possible world is an $\semN$-database. We study the problem of computing statistical moments for query results over such databases. Specifically, given a probabilistic $\semN$-database $\pdb = (\idb, \pd)$, query $\query$, and possible result tuple $t$, we treat $\query(\db)(t)$ as a random $\semN$-valued variable and are interested in computing its expectation $\expct_{\idb \sim \probDist}[\query(\db)(t)]$:
%
\begin{equation}\label{eq:bag-expectation}
\expct_{\idb \sim \probDist}[\query(\db)(t)] = \sum_{\db \in \idb} \query(\db)(t) \cdot \probOf(\db)
\end{equation}
%
Consider the semiring $\semN = (\domN,+,\times,0,1)$ of natural numbers. $\semN$-databases model bag semantics by annotating each tuple with its multiplicity. A probabilistic $\semN$-database ($\semN$-PDB) is a PDB where each possible world is an $\semN$-database. We study the problem of computing statistical moments for query results over such databases. Specifically, given a probabilistic $\semN$-database $\pdb = (\idb, \pd)$, query $\query$, and possible result tuple $t$, we use $\query(\db)(t)$ for $\db \in \idb$ as input to compute the expected multiplicity of \cref{eq:intro-bag-expectation}. Note that the tables of \cref{fig:ex-shipping-simp} have an implicit $1$ $\semN$-valued annotation for each tuple in tables $OnTime$ and $Route$.
%\cref{ex:intro-tbls} and \cref{ex:intro-lineage} $\semN$-valued variable and are interested in computing its expectation $\expct_{\idb \sim \probDist}[\query(\db)(t)]$:
%%
%\begin{equation}\label{eq:bag-expectation}
%\expct_{\idb \sim \probDist}[\query(\db)(t)] = \sum_{\db \in \idb} \query(\db)(t) \cdot \probOf(\db)
%\end{equation}
%%
Intuitively, the expectation of $\query(\db)(t)$ is the number of duplicates of $t$ we expect to find in the result of query $\query$.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@ -101,13 +94,13 @@ Let $\semNX$ denote the set of polynomials over variables $\vct{X}$ with natural
Consider now the semiring $(\semNX, +, \cdot, 0, 1)$ whose domain is $\semNX$, with the standard addition and multiplication of polynomials.
We will use $\semNX$-PDB $\pxdb$, defined as the tuple $(\idb_{\semNX}, \pd)$, where $\semNX$-database $\idb_{\semNX}$ is paired with probability distribution $\pd$.
We denote by $\polyForTuple$ the annotation of tuple $t$ in the result of $\query$ on an implicit $\semNX$-PDB (i.e., $\polyForTuple = \query(\pxdb)(t)$ for some $\pxdb$) and as before, interpret it as a function $\polyForTuple: \{0,1\}^{|\vct X|} \rightarrow \semN$ from vectors of variable assignments to the corresponding value of the annotating polynomial.
$\semNX$-PDBs and a function $\rmod$ from an $\semNX$-PDB to an equivalent $\semN$-PDB are both formalized in \Cref{subsec:supp-mat-background}.
$\semNX$-PDBs and a function $\rmod$ (which transforms an $\semNX$-PDB to an equivalent $\semN$-PDB) are both formalized in \Cref{subsec:supp-mat-background}.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Proposition}[Expectation of polynomials]\label{prop:expection-of-polynom}
Given an $\semN$-PDB $\pdb = (\idb,\pd)$ and $\semNX$-PDB $\pxdb = (\idb_{\semNX}',\pd')$ where $\rmod(\pxdb) = \pdb$:
\[ \expct_{\idb \sim \pd}[\query(\db)(t)] = \expct_{\vct{W} \sim \pd'}\pbox{\polyForTuple(\vct{W})} \]
\[ \expct_{\idb \sim \pd}[\query(\idb)(t)] = \expct_{\vct{W} \sim \pd'}\pbox{\polyForTuple(\vct{W})} \]
\end{Proposition}
\noindent A formal proof of \Cref{prop:expection-of-polynom} is given in \Cref{subsec:expectation-of-polynom-proof}.
This proposition shows that computing expected tuple multiplicities is equivalent to computing the expectation of a polynomial (for that tuple) from a probability distribution over all possible assignments of variables in the polynomial to $\{0,1\}$.