Starting second section of Intro (poly equivalence).

This commit is contained in:
Aaron Huber 2022-01-18 11:49:12 -05:00
parent a2fcf0b468
commit 627e8745e7

View file

@ -136,6 +136,42 @@ In contrast, known approximation techniques (\cite{DBLP:conf/icde/OlteanuHK10,DB
}
\secrev{
\subsection{Polynomial Equivalence}
A common encoding of probabilistic databases (e.g., in \cite{IL84a,Imielinski1989IncompleteII,Antova_fastand,DBLP:conf/vldb/AgrawalBSHNSW06} and many others) relies on annotating tuples with lineages, propositional formulas that describe the set of possible worlds that the tuple appears in. The bag semantics analog is a provenance/lineage polynomial $\apolyqdt$~\cite{DBLP:conf/pods/GreenKT07} (see~\Cref{fig:nxDBSemantics} for a definition), a polynomial with non-zero integer coefficients and exponents, over integer variables $\vct{X}$ encoding input tuple multiplicities.
\begin{figure}
\begin{align*}
\polyqdt{\project_A(\query)}{\dbbase}{\tup} =& \sum_{\tup': \project_A(\tup') = \tup} \polyqdt{\query}{\dbbase}{\tup'} &
\polyqdt{\query_1 \union \query_2}{\dbbase}{\tup} =& \polyqdt{\query_1}{\dbbase}{\tup} + \polyqdt{\query_2}{\dbbase}{\tup}\\
\polyqdt{\select_\theta(\query)}{\dbbase}{\tup} =& \begin{cases}
\polyqdt{\query}{\dbbase}{\tup} & \text{if }\theta(\tup) \\
0 & \text{otherwise}.
\end{cases} &
\begin{aligned}
\polyqdt{\query_1 \join \query_2}{\dbbase}{\tup} =\\ ~
\end{aligned}&
\begin{aligned}
&\polyqdt{\query_1}{\dbbase}{\project_{\attr{\query_1}}{\tup}} \\
&~~~\cdot\polyqdt{\query_2}{\dbbase}{\project_{\attr{\query_2}}{\tup}}
\end{aligned}\\
& & & \polyqdt{\rel}{\dbbase}{\tup} = \sum_{j \in [c]}j\cdot\pVar_{\tup, j}
\end{align*}\\[-10mm]
\caption{Construction of the lineage (polynomial) for an $\raPlus$ query over a \abbrBPDB, where $\vct{X}$ consists of all $X_\tup$ over all $\rel$ in $\dbbase$ and $\tup$ in $\rel$. Here $\dbbase.\rel$ denotes the instance of relation $\rel$ in $\dbbase$.}
\label{fig:nxDBSemantics}
\end{figure}
We drop $\query$, $\dbbase$, and $\tup$ from $\apolyqdt$ when they are clear from the context or irrelevant to the discussion. We now specify the problem of computing the expectation of tuple multiplicity in the language of lineage polynomials:
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Problem}[Expected Multiplicity of Lineage Polynomials]\label{prob:bag-pdb-poly-expected}
Given an $\raPlus$ query $\query$,
\AHchange{
\abbrCTIDB $\pdb$
}
and result tuple $\tup$, compute the expected
multiplicity of the polynomial $\apolyqdt$ (i.e., $\expct_{\vct{W}\sim \pdassign}\pbox{\apolyqdt(\vct{W})}$).,
where $\pdassign$ is the distribution induced by $\pd$ on the relevant assignments $\vct{W}$ to variables of $\apolyqdt$.
\end{Problem}
We note that computing \Cref{prob:expect-mult}
is equivalent to computing \Cref{prob:bag-pdb-poly-expected} (see \Cref{prop:expection-of-polynom}).
In this work, we study the complexity of \Cref{prob:bag-pdb-poly-expected} for several models of probabilistic databases and various encodings of such polynomials.
}
A probabilistic database (PDB) $\pdb$ is a pair $\inparen{\idb, \pd}$, where $\idb$ is a set of deterministic database instances called possible worlds and $\pd$ is a probability distribution over $\idb$.
@ -173,89 +209,6 @@ iff $W_i=1$. Furthermore, $\pd$ is compactly described by a tuple $\vct{p}=\inpa
We then define a \abbrCTIDB be a bag \abbrTIDB with the further restriction that each tuple $\tup$ has a multiplicity of at most some constant $c$, formally: $\forall \db \in \pdb, ~\forall \tup \in \db, ~\db\inparen{\tup}\leq c$. That is, any tuple in a \abbrCTIDB has a multiplicity of at most $c$.
}
\noindent\AHchange{
For notational convenience we make use of the following definition.
\begin{Definition}[$\pdassign$]
Given a \abbrCTIDB $\pdb = \inparen{\idb, \pd}$ and the set of all $c^\numvar$ worlds $W$, denote the probability distribution induced from $\pd$ over each world $\wElem \in W$ as $\pdassign$.
\end{Definition}
}
\sout{Further, define $\dbbase=\bigcup_{\db\in\idb} \db$.}
A common encoding of probabilistic databases (e.g., in \cite{IL84a,Imielinski1989IncompleteII,Antova_fastand,DBLP:conf/vldb/AgrawalBSHNSW06} and many others) relies on annotating tuples with lineages, propositional formulas that describe the set of possible worlds that the tuple appears in.
%\AR{Removed couple of sentence on lineage formula since we explicitly define $\poly$ now.}
%
%Each valuation of the random variables appearing in this formula corresponds to one possible world.
%Given a joint probability distribution over such assignments, the marginal probability of a query result tuple $\tup$ is the probability that the lineage formula of $\tup$ evaluates to true. Given a \abbrBPDB $\pdb$, we refer to the above encoding of $\pdb$ as \dbbaseName and denote it as $\dbbase$.
%
The bag semantics analog is a provenance/lineage polynomial $\apolyqdt$~\cite{DBLP:conf/pods/GreenKT07} (see~\Cref{fig:nxDBSemantics} for a definition), a polynomial with non-zero integer coefficients and exponents, over integer variables $\vct{X}$ encoding input tuple multiplicities.
\begin{figure}
\begin{align*}
\polyqdt{\project_A(\query)}{\dbbase}{\tup} =& \sum_{\tup': \project_A(\tup') = \tup} \polyqdt{\query}{\dbbase}{\tup'} &
\polyqdt{\query_1 \union \query_2}{\dbbase}{\tup} =& \polyqdt{\query_1}{\dbbase}{\tup} + \polyqdt{\query_2}{\dbbase}{\tup}\\
\polyqdt{\select_\theta(\query)}{\dbbase}{\tup} =& \begin{cases}
\polyqdt{\query}{\dbbase}{\tup} & \text{if }\theta(\tup) \\
0 & \text{otherwise}.
\end{cases} &
\begin{aligned}
\polyqdt{\query_1 \join \query_2}{\dbbase}{\tup} =\\ ~
\end{aligned}&
\begin{aligned}
&\polyqdt{\query_1}{\dbbase}{\project_{\attr{\query_1}}{\tup}} \\
&~~~\cdot\polyqdt{\query_2}{\dbbase}{\project_{\attr{\query_2}}{\tup}}
\end{aligned}\\
& & & \polyqdt{\rel}{\dbbase}{\tup} = \sum_{j \in [c]}j\cdot\pVar_{\tup, j}%&\begin{cases}
% X_\tup & \text{if }\dbbase.\rel\inparen{\tup} = 1 \\
% 0 &\text{otherwise.}\end{cases}
%\\
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% \evald{\project_A(\rel)}{\db}(\tup) =& \sum_{\tup': \project_A(\tup') = \tup} \evald{\rel}{\db}(\tup') &
% \evald{(\rel_1 \union \rel_2)}{\db}(\tup) =& \evald{\rel_1}{\db}(\tup) + \evald{\rel_2}{\db}(\tup)\\
% \evald{\select_\theta(\rel)}{\db}(\tup) =& \begin{cases}
% \evald{\rel}{\db}(\tup) & \text{if }\theta(\tup) \\
% 0 & \text{otherwise}.
% \end{cases} &
% \begin{aligned}
% \evald{(\rel_1 \join \rel_2)}{\db}(\tup) =\\ ~
% \end{aligned}&
% \begin{aligned}
% &\evald{\rel_1}{\db}(\project_{\attr{\rel_1}}(\tup)) \\
% &~~~\cdot\evald{\rel_2}{\db}(\project_{\attr{\rel_2}}(\tup))
% \end{aligned}\\
% & & \evald{R}{\db}(\tup) =& \rel(\tup)
\end{align*}\\[-10mm]
\caption{Construction of the lineage (polynomial) for an $\raPlus$ query over a \abbrBPDB, where $\vct{X}$ consists of all $X_\tup$ over all $\rel$ in $\dbbase$ and $\tup$ in $\rel$. Here $\dbbase.\rel$ denotes the instance of relation $\rel$ in $\dbbase$.} % Evaluation semantics $\evald{\cdot}{\db}$ for $\semNX$-DBs~\cite{DBLP:conf/pods/GreenKT07}.}
\label{fig:nxDBSemantics}
\end{figure}
%Analog to set-semantics, computing the expected multiplicity of a tuple reduces to computing the expectation of this polynomial.
We drop $\query$, $\dbbase$, and $\tup$ from $\apolyqdt$ when they are clear from the context or irrelevant to the discussion. We now
\sout{
re-state
} %~\Cref{prob:bag-pdb-query-eval}
\AHchange{
specify the problem of computing the expectation of tuple multiplicity
}
in the language of lineage polynomials:
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Problem}[Expected Multiplicity of Lineage Polynomials]\label{prob:bag-pdb-poly-expected}
Given an $\raPlus$ query $\query$,
\AHchange{
\abbrCTIDB $\pdb$
}
and result tuple $\tup$, compute the expected
multiplicity of the polynomial $\apolyqdt$ (i.e., $\expct_{\vct{W}\sim \pdassign}\pbox{\apolyqdt(\vct{W})}$).
\sout{,
where $\pdassign$ is the distribution induced by $\pd$ on the relevant assignments $\vct{W}$ to variables of $\apolyqdt$.
}
\end{Problem}
We note that %\Cref{prob:bag-pdb-query-eval}
\AHchange{
computing $\expct_{\randDB\sim\pd}\pbox{\query\inparen{\randDB}\inparen{\tup}}$
}
is equivalent to \Cref{prob:bag-pdb-poly-expected} (see \Cref{prop:expection-of-polynom}).
In this work, we study the complexity of \Cref{prob:bag-pdb-poly-expected} for several models of probabilistic databases and various encodings of such polynomials.
%\mypar{\abbrTIDB\xplural}
%We initially focus on tuple-independent probabilistic bag-databases\footnote{See \cite{DBLP:series/synthesis/2011Suciu} for a survey of set-\abbrTIDBs; the bag encoding is analogous~\cite{DBLP:conf/pods/GreenKT07}.} (\abbrTIDB\xplural), a compressed encoding of probabilistic databases where the presence of each individual tuple (out of a total of $\numvar$ input tuples) in a possible world is modeled as an independent probabilistic event.\footnote{