This commit is contained in:
Aaron Huber 2021-09-11 10:02:26 -04:00
commit 59c44f53d7

View file

@ -1,8 +1,7 @@
%!TEX root=./main.tex
%root: main.tex
\section{Introduction}\label{sec:intro}
\input{two-step-model}
A probabilistic database (PDB) $\pdb$ is a tuple $\inparen{\idb, \pd}$, where $\idb$ is a set of deterministic database instances called possible worlds and $\pd$ is a probability distribution over $\idb$.
A probabilistic database (PDB) $\pdb$ is a pair $\inparen{\idb, \pd}$, where $\idb$ is a set of deterministic database instances called possible worlds and $\pd$ is a probability distribution over $\idb$.
A commonly studied problem in probabilistic databases is, given a query $\query$, PDB $\pdb$, and possible query result tuple $\tup$, to compute the tuple's \textit{marginal probability} of being in the query's result, i.e., computing the expectation of a Boolean random variable over $\pd$ that is $1$ for every $\db \in \idb$ for which $\tup \in \query(\db)$ and $0$ otherwise. In this work, we are interested in bag semantics where each tuple $\tup$ is associated with a multiplicity $\db(\tup)$ from $\semN$ in each possible world\footnote{We find it convenient to use the notation from~\cite{DBLP:conf/pods/GreenKT07} which models bag relations as functions that map tuples to their multiplicity.}.
We refer to such a probabilistic database as a bag-probabilistic database or \abbrBPDB for short.
The natural generalization of the problem of computing marginal probabilities of query result tuples to bag semantics is to compute the expectation of a random variable over $\pd$ that assigns value $\query(\db)(\tup)$ in world $\db$:
@ -13,33 +12,83 @@ The natural generalization of the problem of computing marginal probabilities of
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Problem}[Expected Multiplicity]\label{prob:bag-pdb-query-eval}
Given a positive relational algebra query ($\raPlus$)\footnote{The class of $\raPlus$ queries consists of all queries that can be composed of the positive (monotonic) relational algebra operators: selection, projection, join, and union (SPJU).} $\query$, \abbrBPDB $\pdb$, and output tuple $\tup$, compute the expected
multiplicity ($\expct_\pd\pbox{\query\inparen{\pdb}\inparen{\tup}}$)
Given a positive relational algebra query\footnote{The class of $\raPlus$ queries consists of all queries that can be composed of the positive (monotonic) relational algebra operators: selection, projection, join, and union (SPJU).} ($\raPlus$) $\query$, \abbrBPDB $\pdb$, and output tuple $\tup$, compute the expected
multiplicity ($\expct_{\db\sim\pd}\pbox{\query\inparen{\db}\inparen{\tup}}$)
of tuple $\tup$.
\end{Problem}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
We are interested in the data complexity of this problem (i.e., we think of $Q$ as being of constant size). Unless stated otherwise, we implicitly assume the probability distribution $\pd$, and for notational convenience use $\expct\pbox{\cdot}$ instead of $\expct_\pd\pbox{\cdot}$.
We are interested in the parameterized complexity of this problem (i.e., we think of $Q$ as being parameterized by some parameter $k$, with the size of the database going to infinity relative to $k$). Unless stated otherwise, we implicitly assume the probability distribution $\pd$, and for notational convenience use $\expct\pbox{\cdot}$ instead of $\expct_\pd\pbox{\cdot}$. Further, define $D_\Omega=\bigcup_{D\in\Omega} D$.
A common encoding of probabilistic databases (e.g., in \cite{IL84a,Imielinski1989IncompleteII,Antova_fastand,DBLP:conf/vldb/AgrawalBSHNSW06} and many others) relies on annotating tuples with lineages, propositional formulas that describe the set of possible worlds that the tuple appears in.
Each valuation of the random variables appearing in this formula corresponds to one possible world.
Given a joint probability distribution over such assignments, the marginal probability of a query result tuple $\tup$ is the probability that the lineage formula of $\tup$ evaluates to true. Given a \abbrBPDB $\pdb$, we refer to the above encoding of $\pdb$ as \dbbaseName and denote it as $\dbbase$.
\AR{Removed couple of sentence on lineage formula since we explicitly define $\Phi$ now.}
%
%Each valuation of the random variables appearing in this formula corresponds to one possible world.
%Given a joint probability distribution over such assignments, the marginal probability of a query result tuple $\tup$ is the probability that the lineage formula of $\tup$ evaluates to true. Given a \abbrBPDB $\pdb$, we refer to the above encoding of $\pdb$ as \dbbaseName and denote it as $\dbbase$.
%
The bag semantics analog of a lineage formula is a provenance/lineage polynomial $\apolyqdt$~\cite{DBLP:conf/pods/GreenKT07}---see~\Cref{fig:nxDBSemantics} for a definition---a polynomial with integer coefficients and exponents over integer variables $\vct{X}$ encoding the multiplicity of input tuples.
\begin{figure}
\begin{align*}
\polyqdt{\project_A(\query)}{\dbbase}{\tup} =& \sum_{\tup': \project_A(\tup') = \tup} \polyqdt{\query}{\dbbase}{\tup'} &
\polyqdt{\query_1 \union \query_2}{\dbbase}{\tup} =& \polyqdt{\query_1}{\dbbase}{\tup} + \polyqdt{\query_2}{\dbbase}{\tup}\\
\polyqdt{\select_\theta(\query)}{\dbbase}{\tup} =& \begin{cases}
\polyqdt{\query}{\dbbase}{\tup} & \text{if }\theta(\tup) \\
0 & \text{otherwise}.
\end{cases} &
\begin{aligned}
\polyqdt{\query_1 \join \query_2}{\dbbase}{\tup} =\\ ~
\end{aligned}&
\begin{aligned}
&\polyqdt{\query_1}{\dbbase}{\project_{\attr{\query_1}}{\tup}} \\
&~~~\cdot\polyqdt{\query_2}{\dbbase}{\project_{\attr{\query_2}}{\tup}}
\end{aligned}\\
& & \polyqdt{\rel}{\dbbase}{\tup} =&\begin{cases}
X_\tup & \text{if }\dbbase.\rel\inparen{\tup} = 1 \\
0 &\text{otherwise.}\end{cases}
%\\
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% \evald{\project_A(\rel)}{\db}(\tup) =& \sum_{\tup': \project_A(\tup') = \tup} \evald{\rel}{\db}(\tup') &
% \evald{(\rel_1 \union \rel_2)}{\db}(\tup) =& \evald{\rel_1}{\db}(\tup) + \evald{\rel_2}{\db}(\tup)\\
% \evald{\select_\theta(\rel)}{\db}(\tup) =& \begin{cases}
% \evald{\rel}{\db}(\tup) & \text{if }\theta(\tup) \\
% 0 & \text{otherwise}.
% \end{cases} &
% \begin{aligned}
% \evald{(\rel_1 \join \rel_2)}{\db}(\tup) =\\ ~
% \end{aligned}&
% \begin{aligned}
% &\evald{\rel_1}{\db}(\project_{\attr{\rel_1}}(\tup)) \\
% &~~~\cdot\evald{\rel_2}{\db}(\project_{\attr{\rel_2}}(\tup))
% \end{aligned}\\
% & & \evald{R}{\db}(\tup) =& \rel(\tup)
\end{align*}\\[-10mm]
\caption{Construction of the lineage (polynomial) for an $\raPlus$ query over a \abbrBPDB, where $\vct{X}$ consists of all $X_\tup$ over all $\rel$ in $\dbbase$ and $\tup$ in $\rel$.} % Evaluation semantics $\evald{\cdot}{\db}$ for $\semNX$-DBs~\cite{DBLP:conf/pods/GreenKT07}.}
\label{fig:nxDBSemantics}
\end{figure}
The bag semantics analog of a lineage formula is a provenance polynomial $\apolyqdt$~\cite{DBLP:conf/pods/GreenKT07}, a polynomial with integer coefficients and exponents over integer random variables $\vct{\randWorld}$ encoding the multiplicity of input tuples.
Analogous to set semantics, computing the expected multiplicity of a tuple reduces to computing the expectation of this polynomial. We drop $\query$, $\dbbase$, and $\tup$ from $\apolyqdt$ when they are clear from the context or irrelevant to the discussion.
%Analog to set-semantics, computing the expected multiplicity of a tuple reduces to computing the expectation of this polynomial.
We drop $\query$, $\dbbase$, and $\tup$ from $\apolyqdt$ when they are clear from the context or irrelevant to the discussion. We now re-state~\Cref
{prob:bag-pdb-query-eval} in the language of lineage polynomials:
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Problem}[Expected Multiplicity of Lineage Polynomials]\label{prob:bag-pdb-poly-expected}
Given an $\raPlus$ query $\query$, \abbrBPDB $\pdb$, and output tuple $\tup$, compute the expected
multiplicity of $\apolyqdt$ ($\expct_\pd\pbox{\apolyqdt}$).
multiplicity of $\apolyqdt$ ($\expct_{\vct{W}\sim \pdassign}\pbox{\apolyqdt(\vct{W})}$),
where $\pdassign$ is the distribution induced by $\pd$ on the relevant assignments to variables of $\apolyqdt$.
\end{Problem}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\AH{I think that \Cref{prob:bag-pdb-poly-expected} needs to define the all worlds distribution $\pdassign$ over the set $\vct{W}\in\{0, 1\}^\numvar$, as well as the assumption or justification that $\pd \equiv \pdassign$. The prose ``propositional formulas that describe the set of possible worlds...'' perhaps `justifies' using $\pd$.}
%\AH{I think that \Cref{prob:bag-pdb-poly-expected} needs to define the all worlds distribution $\pdassign$ over the set $\vct{W}\in\{0, 1\}^\numvar$, as well as the assumption or justification that $\pd \equiv \pdassign$. The prose ``propositional formulas that describe the set of possible worlds...'' perhaps `justifies' using $\pd$.}
%\AR{Handled the above I think.}
Note that, if $\apolyqdt$ is given, then \Cref{prob:bag-pdb-query-eval} reduces to \Cref{prob:bag-pdb-poly-expected} (see \Cref{subsec:expectation-of-polynom-proof} for the proof). Evaluating queries over probabilistic databases in this fashion (first computing a tuple's lineage and then calculating the expectation of the lineage) has been referred to as \textit{intensional query evaluation}~\cite{DBLP:series/synthesis/2011Suciu}. In this work, we study the complexity of \Cref{prob:bag-pdb-poly-expected} for several models of probabilistic databases and various encodings of such polynomials, considering the size of the encoding as the input size. % specifically, the bag semantics version of tuple-independent probabilistic bag-databases (\abbrTIDB) and block-independent probabilistic databases (\abbrBIDB).
%Note that, if $\apolyqdt$ is given, then
We note that \Cref{prob:bag-pdb-query-eval} is equivalent to \Cref{prob:bag-pdb-poly-expected} (see \Cref{prop:expection-of-polynom}).
%(see \Cref{subsec:expectation-of-polynom-proof} for the proof). Evaluating queries over probabilistic databases in this fashion (first computing a tuple's lineage and then calculating the expectation of the lineage) has been referred to as \textit{intensional query evaluation}~\cite{DBLP:series/synthesis/2011Suciu}.
In this work, we study the complexity of \Cref{prob:bag-pdb-poly-expected} for several models of probabilistic databases and various encodings of such polynomials, considering the size of the encoding as the input size. % specifically, the bag semantics version of tuple-independent probabilistic bag-databases (\abbrTIDB) and block-independent probabilistic databases (\abbrBIDB).
% Our main technical focus is on studying the complexity of this problem for various encoding of such polynomials.
However, as we will show, these results have implications for the complexity of \Cref{prob:bag-pdb-query-eval} through intensional query evaluation, i.e., when also considering the cost of generating lineage polynomials.
%However, as we will show, these results have implications for the complexity of \Cref{prob:bag-pdb-query-eval} through intensional query evaluation, i.e., when also considering the cost of generating lineage polynomials.
\AR{Have done my pass till here}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\mypar{\abbrTIDB\xplural}
%Solving~\Cref{prob:bag-pdb-query-eval} for arbitrary $\pd$ is hopeless since we need exponential space to represent an arbitrary $\pd$.
@ -174,46 +223,7 @@ as the representation system of $\poly(\vct{X})$.
% In this case, we have for any output tuple $\tup$, $\expct\pbox{\poly(\vct{W})}=\Phi(1,\dots,1)$.
% Thus, we have another case where $\timeOf{\abbrStepTwo}(Q,\pdb)$ is $\bigO{\timeOf{\abbrStepOne}(Q,\pdb)}$ and we again achieve deterministic query runtime for $\query\inparen{\pdb}$ (up to a constant factor). These observations introduce our first formalization of~\Cref{prob:informal}:
\begin{figure}
\begin{align*}
\polyqdt{\project_A(\query)}{\dbbase}{\tup} =& \sum_{\tup': \project_A(\tup') = \tup} \polyqdt{\query}{\dbbase}{\tup'} &
\polyqdt{\query_1 \union \query_2}{\dbbase}{\tup} =& \polyqdt{\query_1}{\dbbase}{\tup} + \polyqdt{\query_2}{\dbbase}{\tup}\\
\polyqdt{\select_\theta(\query)}{\dbbase}{\tup} =& \begin{cases}
\polyqdt{\query}{\dbbase}{\tup} & \text{if }\theta(\tup) \\
0 & \text{otherwise}.
\end{cases} &
\begin{aligned}
\polyqdt{\query_1 \join \query_2}{\dbbase}{\tup} =\\ ~
\end{aligned}&
\begin{aligned}
&\polyqdt{\query_1}{\dbbase}{\project_{\attr{\query_1}}{\tup}} \\
&~~~\cdot\polyqdt{\query_2}{\dbbase}{\project_{\attr{\query_2}}{\tup}}
\end{aligned}\\
& & \polyqdt{\rel}{\dbbase}{\tup} =&\begin{cases}
X_\tup & \text{if }\dbbase.\rel\inparen{\tup} = 1 \\
0 &\text{otherwise.}\end{cases}
%\\
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% \evald{\project_A(\rel)}{\db}(\tup) =& \sum_{\tup': \project_A(\tup') = \tup} \evald{\rel}{\db}(\tup') &
% \evald{(\rel_1 \union \rel_2)}{\db}(\tup) =& \evald{\rel_1}{\db}(\tup) + \evald{\rel_2}{\db}(\tup)\\
% \evald{\select_\theta(\rel)}{\db}(\tup) =& \begin{cases}
% \evald{\rel}{\db}(\tup) & \text{if }\theta(\tup) \\
% 0 & \text{otherwise}.
% \end{cases} &
% \begin{aligned}
% \evald{(\rel_1 \join \rel_2)}{\db}(\tup) =\\ ~
% \end{aligned}&
% \begin{aligned}
% &\evald{\rel_1}{\db}(\project_{\attr{\rel_1}}(\tup)) \\
% &~~~\cdot\evald{\rel_2}{\db}(\project_{\attr{\rel_2}}(\tup))
% \end{aligned}\\
% & & \evald{R}{\db}(\tup) =& \rel(\tup)
\end{align*}\\[-10mm]
\caption{Construction of the lineage (polynomial) for an $\raPlus$ query over a \abbrBPDB, where $\vct{X}$ consists of all $X_\tup$ over all $\rel$ in $\dbbase$ and $\tup$ in $\rel$.} % Evaluation semantics $\evald{\cdot}{\db}$ for $\semNX$-DBs~\cite{DBLP:conf/pods/GreenKT07}.}
\label{fig:nxDBSemantics}
\end{figure}
\input{two-step-model}
Given $\timeOf{\abbrStepOne}(Q,\pdb) = O(\qruntime{Q, \dbbase})$, we can now focus on the complexity of \abbrStepTwo.
We can represent the factorized lineage polynomial by the size of its corresponding arithmetic circuit $\circuit$ (which we denote by $|\circuit|$).