Starting second section of Intro (poly equivalence).

2022-01-18 11:49:12 -05:00 · 2022-01-18 11:49:12 -05:00 · 627e8745e7
parent a2fcf0b468
commit 627e8745e7
1 changed files with 36 additions and 83 deletions
--- a/intro-rewrite-070921.tex
+++ b/intro-rewrite-070921.tex
@ -136,6 +136,42 @@ In contrast, known approximation techniques (\cite{DBLP:conf/icde/OlteanuHK10,DB
 }
 \secrev{
 \subsection{Polynomial Equivalence}
+A common encoding of probabilistic databases (e.g., in \cite{IL84a,Imielinski1989IncompleteII,Antova_fastand,DBLP:conf/vldb/AgrawalBSHNSW06} and many others) relies on annotating tuples with lineages, propositional formulas that describe the set of possible worlds that the tuple appears in.  The bag semantics analog is a provenance/lineage polynomial $\apolyqdt$~\cite{DBLP:conf/pods/GreenKT07} (see~\Cref{fig:nxDBSemantics} for a definition), a polynomial with non-zero integer coefficients and exponents, over integer variables $\vct{X}$ encoding input tuple multiplicities.
+\begin{figure}
+  \begin{align*}
+	  \polyqdt{\project_A(\query)}{\dbbase}{\tup} =& \sum_{\tup': \project_A(\tup') = \tup} \polyqdt{\query}{\dbbase}{\tup'} &
+	  \polyqdt{\query_1 \union \query_2}{\dbbase}{\tup} =& \polyqdt{\query_1}{\dbbase}{\tup} + \polyqdt{\query_2}{\dbbase}{\tup}\\
+	  \polyqdt{\select_\theta(\query)}{\dbbase}{\tup} =& \begin{cases}
+	    \polyqdt{\query}{\dbbase}{\tup} & \text{if }\theta(\tup) \\
+	    0                       & \text{otherwise}.
+	    \end{cases} &
+	       \begin{aligned}
+	          \polyqdt{\query_1 \join \query_2}{\dbbase}{\tup} =\\ ~
+	        \end{aligned}&
+	          \begin{aligned}
+	            &\polyqdt{\query_1}{\dbbase}{\project_{\attr{\query_1}}{\tup}}  \\
+	            &~~~\cdot\polyqdt{\query_2}{\dbbase}{\project_{\attr{\query_2}}{\tup}}
+	          \end{aligned}\\
+	                                           & & & \polyqdt{\rel}{\dbbase}{\tup} = \sum_{j \in [c]}j\cdot\pVar_{\tup, j}
+	\end{align*}\\[-10mm]
+	\caption{Construction of the lineage (polynomial) for an $\raPlus$ query over a \abbrBPDB, where $\vct{X}$ consists of all $\pVar_{\tup, j}$ over all $\rel$ in $\dbbase$, $\tup$ in $\rel$, and $j \in [c]$. Here $\dbbase.\rel$ denotes the instance of relation $\rel$ in $\dbbase$.}
+	\label{fig:nxDBSemantics}
+\end{figure}
+
+We drop $\query$, $\dbbase$, and $\tup$ from $\apolyqdt$ when they are clear from the context or irrelevant to the discussion. We now specify the problem of computing the expectation of tuple multiplicity in the language of lineage polynomials:
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\begin{Problem}[Expected Multiplicity of Lineage Polynomials]\label{prob:bag-pdb-poly-expected}
+Given an $\raPlus$ query $\query$, 
+\AHchange{
+\abbrCTIDB $\pdb$
+}
+and result tuple $\tup$, compute the expected
+multiplicity of the polynomial $\apolyqdt$ (i.e., $\expct_{\vct{W}\sim \pdassign}\pbox{\apolyqdt(\vct{W})}$),
+where $\pdassign$ is the distribution induced by $\pd$ on the relevant assignments $\vct{W}$ to variables of $\apolyqdt$.
+\end{Problem}
+We note that solving \Cref{prob:expect-mult}
+is equivalent to solving \Cref{prob:bag-pdb-poly-expected} (see \Cref{prop:expection-of-polynom}).
+In this work, we study the complexity of \Cref{prob:bag-pdb-poly-expected} for several models of probabilistic databases and various encodings of such polynomials.
 }

 A probabilistic database (PDB) $\pdb$ is a pair $\inparen{\idb, \pd}$, where $\idb$ is a set of deterministic database instances called possible worlds and $\pd$ is a probability distribution over $\idb$.
@ -173,89 +209,6 @@ iff $W_i=1$. Furthermore, $\pd$ is compactly described by a tuple $\vct{p}=\inpa
 We then define a \abbrCTIDB to be a bag \abbrTIDB with the further restriction that each tuple $\tup$ has a multiplicity of at most some constant $c$, formally: $\forall \db \in \pdb, ~\forall \tup \in \db, ~\db\inparen{\tup}\leq c$.  That is, any tuple in a \abbrCTIDB has a multiplicity of at most $c$.
 }

-\noindent\AHchange{
-For notational convenience we make use of the following definition.
-\begin{Definition}[$\pdassign$]
-Given a \abbrCTIDB $\pdb = \inparen{\idb, \pd}$ and the set of all $c^\numvar$ worlds $W$, denote the probability distribution induced from $\pd$ over each world $\wElem \in W$ as $\pdassign$.
-\end{Definition}
-}
-\sout{Further, define $\dbbase=\bigcup_{\db\in\idb} \db$.}
-
-A common encoding of probabilistic databases (e.g., in \cite{IL84a,Imielinski1989IncompleteII,Antova_fastand,DBLP:conf/vldb/AgrawalBSHNSW06} and many others) relies on annotating tuples with lineages, propositional formulas that describe the set of possible worlds that the tuple appears in.
-%\AR{Removed couple of sentence on lineage formula since we explicitly define $\poly$ now.}
-%
-%Each valuation of the random variables appearing in this formula corresponds to one possible world.
-%Given a joint probability distribution over such assignments, the marginal probability of a query result tuple $\tup$ is the probability that the lineage formula of $\tup$ evaluates to true.  Given a \abbrBPDB $\pdb$, we refer to the above encoding of $\pdb$ as \dbbaseName and denote it as $\dbbase$.
-%
-The bag semantics analog is a provenance/lineage polynomial $\apolyqdt$~\cite{DBLP:conf/pods/GreenKT07} (see~\Cref{fig:nxDBSemantics} for a definition), a polynomial with non-zero integer coefficients and exponents, over integer variables $\vct{X}$ encoding input tuple multiplicities.
-\begin{figure}
-  \begin{align*}
-  \polyqdt{\project_A(\query)}{\dbbase}{\tup} =& \sum_{\tup': \project_A(\tup') = \tup} \polyqdt{\query}{\dbbase}{\tup'} &
-  \polyqdt{\query_1 \union \query_2}{\dbbase}{\tup} =& \polyqdt{\query_1}{\dbbase}{\tup} + \polyqdt{\query_2}{\dbbase}{\tup}\\
-  \polyqdt{\select_\theta(\query)}{\dbbase}{\tup} =& \begin{cases}
-    \polyqdt{\query}{\dbbase}{\tup} & \text{if }\theta(\tup) \\
-    0                       & \text{otherwise}.
-    \end{cases} &
-       \begin{aligned}
-          \polyqdt{\query_1 \join \query_2}{\dbbase}{\tup} =\\ ~
-        \end{aligned}&
-          \begin{aligned}
-            &\polyqdt{\query_1}{\dbbase}{\project_{\attr{\query_1}}{\tup}}  \\
-            &~~~\cdot\polyqdt{\query_2}{\dbbase}{\project_{\attr{\query_2}}{\tup}}
-          \end{aligned}\\
-                                           & & & \polyqdt{\rel}{\dbbase}{\tup} = \sum_{j \in [c]}j\cdot\pVar_{\tup, j}%&\begin{cases}
-%                                           		X_\tup & \text{if }\dbbase.\rel\inparen{\tup} = 1 \\
-%                                           		0		 &\text{otherwise.}\end{cases}
-    %\\
-  %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-  % \evald{\project_A(\rel)}{\db}(\tup) =& \sum_{\tup': \project_A(\tup') = \tup} \evald{\rel}{\db}(\tup') &
-  % \evald{(\rel_1 \union \rel_2)}{\db}(\tup) =& \evald{\rel_1}{\db}(\tup) + \evald{\rel_2}{\db}(\tup)\\
-  % \evald{\select_\theta(\rel)}{\db}(\tup) =& \begin{cases}
-  %   \evald{\rel}{\db}(\tup) & \text{if }\theta(\tup) \\
-  %   0                       & \text{otherwise}.
-  %   \end{cases} &
-  %      \begin{aligned}
-  %         \evald{(\rel_1 \join \rel_2)}{\db}(\tup) =\\ ~
-  %       \end{aligned}&
-  %         \begin{aligned}
-  %           &\evald{\rel_1}{\db}(\project_{\attr{\rel_1}}(\tup))  \\
-  %           &~~~\cdot\evald{\rel_2}{\db}(\project_{\attr{\rel_2}}(\tup))
-  %         \end{aligned}\\
-  %      & & \evald{R}{\db}(\tup) =& \rel(\tup)
-\end{align*}\\[-10mm]
-\caption{Construction of the lineage (polynomial) for an $\raPlus$ query over a \abbrBPDB, where $\vct{X}$ consists of all $X_\tup$ over all $\rel$ in $\dbbase$ and $\tup$ in $\rel$. Here $\dbbase.\rel$ denotes the instance of relation $\rel$ in $\dbbase$.} % Evaluation semantics $\evald{\cdot}{\db}$ for $\semNX$-DBs~\cite{DBLP:conf/pods/GreenKT07}.}
-\label{fig:nxDBSemantics}
-\end{figure}
-
-
-%Analog to set-semantics, computing the expected multiplicity of a tuple reduces to computing the expectation of this polynomial.
-We drop $\query$, $\dbbase$, and $\tup$ from $\apolyqdt$ when they are clear from the context or irrelevant to the discussion. We now 
-\sout{
-re-state
-} %~\Cref{prob:bag-pdb-query-eval} 
-\AHchange{
-specify the problem of computing the expectation of tuple multiplicity
-}
-in the language of lineage polynomials:
-
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-\begin{Problem}[Expected Multiplicity of Lineage Polynomials]\label{prob:bag-pdb-poly-expected}
-Given an $\raPlus$ query $\query$, 
-\AHchange{
-\abbrCTIDB $\pdb$
-}
-and result tuple $\tup$, compute the expected
-multiplicity of the polynomial $\apolyqdt$ (i.e., $\expct_{\vct{W}\sim \pdassign}\pbox{\apolyqdt(\vct{W})}$).
-\sout{,
-where $\pdassign$ is the distribution induced by $\pd$ on the relevant assignments $\vct{W}$ to variables of $\apolyqdt$.
-}
-\end{Problem}
-We note that %\Cref{prob:bag-pdb-query-eval} 
-\AHchange{
-computing $\expct_{\randDB\sim\pd}\pbox{\query\inparen{\randDB}\inparen{\tup}}$
-}
-is equivalent to \Cref{prob:bag-pdb-poly-expected} (see \Cref{prop:expection-of-polynom}).
-In this work, we study the complexity of \Cref{prob:bag-pdb-poly-expected} for several models of probabilistic databases and various encodings of such polynomials.

 %\mypar{\abbrTIDB\xplural}
 %We initially focus on tuple-independent probabilistic bag-databases\footnote{See \cite{DBLP:series/synthesis/2011Suciu} for a survey of set-\abbrTIDBs; the bag encoding is analogous~\cite{DBLP:conf/pods/GreenKT07}.} (\abbrTIDB\xplural), a compressed encoding of probabilistic databases where the presence of each individual tuple (out of a total of $\numvar$ input tuples) in a possible world is modeled as an independent probabilistic event.\footnote{