master
Boris Glavic 2022-05-21 14:50:36 +02:00
parent a9d3984dbf
commit 347cca2f7d
2 changed files with 52 additions and 51 deletions

View File

@ -6,7 +6,7 @@ This work explores the problem of computing the expectation of the multiplicity
Formally, a \abbrCTIDB,
$\pdb = \inparen{\worlds, \bpd}$ is a set of tuples $\tupset$ and a probability distribution $\bpd$ over all possible worlds generated by assigning each tuple $\tup \in \tupset$ a multiplicity in the range $[0,\bound]$.
Any such world can be encoded as a vector (of length $\numvar=\abs{\tupset}$) from $\worlds$, such that the multiplicity of each $\tup \in \tupset$ is stored at a distinct index.
A given world $\worldvec \in\worlds$ can be interpreted as follows: for each $\tup \in \tupset$, $\worldvec_{\tup}$ is the multiplicity of $\tup$ in $\worldvec$.
A given world $\worldvec \in\worlds$ can be interpreted as follows: for each $\tup \in \tupset$, $\worldvec_{\tup}$ is the multiplicity of $\tup$ in $\worldvec$.
We note that encoding a possible world as a vector, while non-standard, is equivalent to encoding it as a set of tuples (\Cref{prop:expection-of-polynom} in \Cref{subsec:expectation-of-polynom-proof}).
Given that tuple multiplicities are independent events, the probability distribution $\bpd$ can be expressed compactly by assigning each tuple a (disjoint) probability distribution over $[0,\bound]$. Let $\prob_{\tup,j}$ denote the probability that tuple $\tup$ is assigned multiplicity $j$. The probability of a world $\worldvec$ is then $\prod_{\tup \in \tupset} \prob_{\tup,\worldvec_{\tup}}$.
@ -17,7 +17,7 @@ We can formally state our problem of computing the expected multiplicity of a re
\begin{Problem}\label{prob:expect-mult}
Given \abbrCTIDB $\pdb = \inparen{\worlds, \bpd}$, $\raPlus$ query\footnote{
An $\raPlus$ query is a query expressed in positive relational algebra, i.e., using only the operators selection ($\select$), projection ($\project$), natural join ($\join$) and union ($\union$).
An $\raPlus$ query is a query expressed in positive relational algebra, i.e., using only the operators selection ($\select$), projection ($\project$), natural join ($\join$) and union ($\union$).
}
$\query$, and result tuple $\tup$, compute the expectation $\expct_{\rvworld\sim\bpd}\pbox{\query\inparen{\rvworld}\inparen{\tup}}$.
\end{Problem}
@ -27,7 +27,7 @@ An $\raPlus$ query is a query expressed in positive relational algebra, i.e., us
&\begin{aligned}[t]
&\polyqdt{\project_A(\query)}{\gentupset}{\tup} =\\
&~~\sum_{\tup': \project_A(\tup') = \tup} \polyqdt{\query}{\gentupset}{\tup'}
\end{aligned}
\end{aligned}
&
&\begin{aligned}[t]
&\polyqdt{\query_1 \union \query_2}{\gentupset}{\tup} =\\
@ -42,7 +42,7 @@ An $\raPlus$ query is a query expressed in positive relational algebra, i.e., us
\end{aligned}
&
&\begin{aligned}
&\polyqdt{\query_1 \join \query_2}{\gentupset}{\tup} =\\
&\polyqdt{\query_1 \join \query_2}{\gentupset}{\tup} =\\
&\qquad\polyqdt{\query_1}{\gentupset}{\project_{\attr{\query_1}}{\tup}}\\
&\qquad\cdot\polyqdt{\query_2}{\gentupset}{\project_{\attr{\query_2}}{\tup}}
\end{aligned}
@ -64,7 +64,7 @@ Since we can compute~\Cref{prob:expect-mult} in polynomial time, the interesting
Specifically, in this work we ask if~\Cref{prob:expect-mult} can be solved in time linear in the runtime of an analogous deterministic query, which we make more precise shortly.
If this is true, then this would open up the way for deployment of \abbrCTIDB\xplural in practice. To analyze this question we denote by $\timeOf{}^*(Q,\pdb)$ the optimal runtime complexity of computing~\Cref{prob:expect-mult} over \abbrCTIDB $\pdb$.
Let $\qruntime{\query,\gentupset,\bound}$ (see~\Cref{sec:gen} for further details) denote the runtime for query $\query$, deterministic database $\gentupset$, and multiplicity bound $\bound$. This paper considers $\raPlus$ queries, for which order of operations is \emph{explicit}, as opposed to other query languages, e.g., Datalog, UCQ. Thus, since order of operations affects runtime, we denote the optimized $\raPlus$ query picked by an arbitrary production system as $\optquery{\query} \approx \arg\min_{\query'\in\raPlus, \query'\equiv\query}\qruntime{\query', \gentupset, \bound}$. Then $\qruntime{\optquery{\query}, \gentupset,\bound}$ is the runtime for the optimized query.\footnote{The upper bounds on runtime that we derive apply pointwise to any $\query \in\raPlus$, allowing us to abstract away the specific heuristics for choosing an optimized query (i.e., any deterministic query optimization heuristic is equally useful for \abbrCTIDB queries).}
Let $\qruntime{\query,\gentupset,\bound}$ (see~\Cref{sec:gen} for further details) denote the runtime for query $\query$, deterministic database $\gentupset$, and multiplicity bound $\bound$. This paper considers $\raPlus$ queries, for which order of operations is \emph{explicit}, as opposed to other query languages, e.g., Datalog, UCQ. Thus, since order of operations affects runtime, we denote the optimized $\raPlus$ query picked by an arbitrary production system as $\optquery{\query} \approx \arg\min_{\query'\in\raPlus, \query'\equiv\query}\qruntime{\query', \gentupset, \bound}$. Then $\qruntime{\optquery{\query}, \gentupset,\bound}$ is the runtime for the optimized query.\footnote{The upper bounds on runtime that we derive apply pointwise to any $\query \in\raPlus$, allowing us to abstract away the specific heuristics for choosing an optimized query (i.e., any deterministic query optimization heuristic is equally useful for \abbrCTIDB queries).}\BG{Rewrite: since an optimized Q is also a Q this also applies in the case where there is a query optimizer that rewrites Q}
\begin{table*}[t!]
\centering
@ -75,7 +75,7 @@ Let $\qruntime{\query,\gentupset,\bound}$ (see~\Cref{sec:gen} for further detail
\hline
$\Omega\inparen{\inparen{\qruntime{\optquery{\qhard}, \tupset, \bound}}^{1+\eps_0}}$ for {\em some} $\eps_0>0$ & Single & Triangle Detection hypothesis\\
$\omega\inparen{\inparen{\qruntime{\optquery{\qhard}, \tupset, \bound}}^{C_0}}$ for {\em all} $C_0>0$ & Multiple &$\sharpwzero\ne\sharpwone$\\
$\Omega\inparen{\inparen{\qruntime{\optquery{\qhard}, \tupset, \bound}}^{c_0\cdot k}}$ for {\em some} $c_0>0$ & Multiple & \Cref{conj:known-algo-kmatch}\\
$\Omega\inparen{\inparen{\qruntime{\optquery{\qhard}, \tupset, \bound}}^{c_0\cdot k}}$ for {\em some} $c_0>0$ & Multiple & \Cref{conj:known-algo-kmatch}\\
\hline
\end{tabular}
\caption{Our lower bounds for a specific hard query $\qhard$ parameterized by $k$. For $\pdb = \inparen{\worlds, \bpd}$, those with `Multiple' in the second column need the algorithm to be able to handle multiple $\bpd$, i.e., probability distributions (for a given $\tupset$). The last column states the hardness assumptions that imply the lower bounds in the first column ($\eps_0,C_0,c_0$ are constants that are independent of $k$).}
@ -94,10 +94,10 @@ What our lower bound in the third row says, is that one cannot get more than a p
} already imply the claimed lower bounds if we replace the $\qruntime{\optquery{\query}, \tupset, \bound}$ by just $\numvar = |\tupset|$ (indeed these results follow from known lower bounds for deterministic query processing). Our contribution is to identify a family of hard queries where deterministic query processing is `easy' but computing the expected multiplicities is hard.
\mypar{Our upper bound results} We introduce a $(1\pm \epsilon)$-approximation algorithm that computes~\Cref{prob:expect-mult} in time $O_\epsilon\inparen{\qruntime{\optquery{\query}, \tupset, \bound}}$. This means, when we are okay with approximation, that we solve~\Cref{prob:expect-mult} in time linear in the size of the deterministic query, and bag \abbrPDB\xplural are deployable in practice.
In contrast, known approximation techniques (\cite{DBLP:conf/icde/OlteanuHK10,DBLP:journals/jal/KarpLM89}) in set-\abbrPDB\xplural need time $\Omega(\qruntime{\optquery{\query}, \tupset, \bound}^{2k})$
In contrast, known approximation techniques (\cite{DBLP:conf/icde/OlteanuHK10,DBLP:journals/jal/KarpLM89}) in set-\abbrPDB\xplural need time $\Omega(\qruntime{\optquery{\query}, \tupset, \bound}^{2k})$
(see \Cref{sec:karp-luby}).
Further, our approximation algorithm works for a more general notion of bag \abbrPDB\xplural beyond \abbrCTIDB\xplural
(see \Cref{subsec:tidbs-and-bidbs}).
(see \Cref{subsec:tidbs-and-bidbs}).
\subsection{Polynomial Equivalence}\label{sec:intro-poly-equiv}
A common encoding of probabilistic databases (e.g., in \cite{IL84a,Imielinski1989IncompleteII,4497507,DBLP:conf/vldb/AgrawalBSHNSW06} and many others) annotates tuples with lineages, propositional formulas that describe the set of possible worlds that the tuple appears in. The bag semantics analog is a provenance/lineage polynomial (see~\Cref{fig:nxDBSemantics}) $\apolyqdt$~\cite{DBLP:conf/pods/GreenKT07}, a polynomial with non-zero integer coefficients and exponents, over variables $\vct{X}$ encoding input tuple multiplicities. The lineage polynomial for result tuple $t_{out}$ evaluates to $t_{out}$'s multiplicity in a given possible world when each $X_{t_{in}}$ is replaced by the multiplicity of $t_{in}$ in the possible world.
@ -188,7 +188,7 @@ We adopt a two-step intensional model of query evaluation used in set-\abbrPDB\x
(i) \termStepOne (\abbrStepOne): Given input $\tupset$ and $\query$, output every tuple $\tup$ that possibly satisfies $\query$, annotated with its lineage polynomial ($\poly(\vct{X})=\apolyqdt\inparen{\vct{X}}$);
(ii) \termStepTwo (\abbrStepTwo): Given $\poly(\vct{X})$ for each tuple, compute $\expct_{\randWorld\sim\bpd}\pbox{\poly(\vct{\randWorld})}$.
Let $\timeOf{\abbrStepOne}(Q,\tupset,\circuit)$ denote the runtime of \abbrStepOne when it outputs $\circuit$ (a representation of $\poly$ as an arithmetic circuit --- more on this representation in~\Cref{sec:expression-trees}).
Denote by $\timeOf{\abbrStepTwo}(\circuit, \epsilon)$ (recall $\circuit$ is the output of \abbrStepOne) the runtime of \abbrStepTwo, for which we can leverage~\Cref{def:reduced-poly} and~\Cref{lem:tidb-reduce-poly} to address the next formal objective:
Denote by $\timeOf{\abbrStepTwo}(\circuit, \epsilon)$ (recall $\circuit$ is the output of \abbrStepOne) the runtime of \abbrStepTwo, for which we can leverage~\Cref{def:reduced-poly} and~\Cref{lem:tidb-reduce-poly} to address the next formal objective:
\begin{Problem}[\abbrCTIDB linear time approximation]\label{prob:big-o-joint-steps}
Given \abbrCTIDB $\pdb$, $\raPlus$ query $\query$,
@ -207,23 +207,23 @@ as the representation system of $\poly(\vct{X})$, and we show in \Cref{sec:circu
Given that a representation $\circuit^*$ exists where $\timeOf{\abbrStepOne}(\query,\tupset,\circuit^*)\le \bigO{\qruntime{\optquery{\query}, \tupset, \bound}}$, we can focus on the complexity of \abbrStepTwo.
As we also show in \Cref{sec:circuit-runtime}, this size is also bounded by $\qruntime{\optquery{\query}, \tupset, \bound}$ (i.e., $|\circuit^*| \le \bigO{\qruntime{\optquery{\query}, \tupset, \bound}}$), where $|\circuit|$ is the size of circuit $\circuit$.
Thus, the question of approximation
Thus, the question of approximation
can be stated as the following stronger (since~\Cref{prob:big-o-joint-steps} has access to \emph{all} equivalent \circuit representing $\query\inparen{\vct{W}}\inparen{\tup}$), but sufficient condition:
\begin{Problem}\label{prob:intro-stmt}
Given one circuit $\circuit$ that encodes $\apolyqdt$ for all result tuples $\tup$ (one sink per $\tup$) for \abbrCTIDB $\pdb$ and $\raPlus$ query $\query$, does there exist an algorithm that computes a $(1\pm\epsilon)$-approximation of $\expct_{\rvworld\sim\bpd}\pbox{\query\inparen{\rvworld}\inparen{\tup}}$ (for all result tuples $\tup$) in $\bigO{|\circuit|}$ time?
\end{Problem}
For an upper bound on approximating the expected count, it is easy to check that if all the probabilities are constant then (with an additive adjustment) $\poly\left(\prob_1,\dots, \prob_n\right)$ (i.e., evaluating the original lineage polynomial over the probability values) is a constant factor approximation.
For an upper bound on approximating the expected count, it is easy to check that if all the probabilities are constant then (with an additive adjustment) $\poly\left(\prob_1,\dots, \prob_n\right)$ (i.e., evaluating the original lineage polynomial over the probability values) is a constant factor approximation.
This is illustrated in the following example using $\query_1^2$ from earlier. To aid in presentation we again limit our focus to $\refpoly{1, }^{\inparen{ABX}^2}$, assume $\bound = 2$ for variable $X$ and $\bound = 1$ for all other variables. Let $\prob_A$ denote $\probOf\pbox{A = 1}$.
In computing $\rpoly$, we have some cancellations to deal with:
\begin{footnotesize}
\begin{equation*}
\refpoly{1, }^{\inparen{ABX}^2}\inparen{\vct{X}} = A^2\inparen{X_1^2 + 4X_1X_2 + 4X_2^2}B^2 =A^2X_1^2B^2 + 4A^2X_1X_2B^2+4A^2X_2^2B^2
\refpoly{1, }^{\inparen{ABX}^2}\inparen{\vct{X}} = A^2\inparen{X_1^2 + 4X_1X_2 + 4X_2^2}B^2 =A^2X_1^2B^2 + 4A^2X_1X_2B^2+4A^2X_2^2B^2
%&\qquad+ 2AX_2B^2YE + 2AX_1B^2ZC + 2AX_2B^2ZC + 2B^2YEZC\\
\end{equation*}
\end{footnotesize}
This then implies
This then implies
%\begin{footnotesize}
%\begin{equation*}
$\rpoly_1^{\inparen{ABX}^2}\inparen{\vct{X}} = AX_1B+4AX_2B$.
@ -241,9 +241,9 @@ Substituting $\vct{\prob}$ for $\vct{X}$,
\end{align*}
\end{footnotesize}
If we assume that all probability values are at least $p_0>0$, then given access to $\refpoly{1, }^{\inparen{ABX}^2}\inparen{\vct{\prob}} - 4\prob_A^2\prob_{X_1}\prob_{X_2}\prob_B^2$
we get that $\refpoly{1, }^{\inparen{ABX}^2}\inparen{\vct{\prob}} - 4\prob_A^2\prob_{X_1}\prob_{X_2}\prob_B^2$ is in the range $\pbox{p_0^3\cdot\rpoly^{\inparen{ABX}^2}_1\inparen{\vct{\prob}}, \rpoly_1^{\inparen{ABX}^2}\inparen{\vct{\prob}}}$.
%We can simulate sampling from $\refpoly{1, }^2\inparen{\vct{X}}$ by sampling monomials from $\refpoly{1, }^2$ while ignoring any samples $A^2X_1X_2B^2$.
Note however, that this is \emph{not a tight approximation}.
we get that $\refpoly{1, }^{\inparen{ABX}^2}\inparen{\vct{\prob}} - 4\prob_A^2\prob_{X_1}\prob_{X_2}\prob_B^2$ is in the range $\pbox{p_0^3\cdot\rpoly^{\inparen{ABX}^2}_1\inparen{\vct{\prob}}, \rpoly_1^{\inparen{ABX}^2}\inparen{\vct{\prob}}}$.
%We can simulate sampling from $\refpoly{1, }^2\inparen{\vct{X}}$ by sampling monomials from $\refpoly{1, }^2$ while ignoring any samples $A^2X_1X_2B^2$.
Note however, that this is \emph{not a tight approximation}.
In~\Cref{sec:algo} we demonstrate that a $(1\pm\epsilon)$ (multiplicative) approximation with competitive performance is achievable.
To get a $(1\pm \epsilon)$-multiplicative approximation and solve~\Cref{prob:intro-stmt}, using \circuit we uniformly sample monomials from the equivalent \abbrSMB representation of $\poly$ (without materializing the \abbrSMB representation) and `adjust' their contribution to $\widetilde{\poly}\left(\cdot\right)$.
@ -263,7 +263,7 @@ In work independent of ours, Grohe, et. al.~\cite{https://doi.org/10.48550/arxiv
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\mypar{Paper Organization} We present background and notation in \Cref{sec:background}. We prove our main hardness results in \Cref{sec:hard} and present our approximation algorithm in \Cref{sec:algo}.
\mypar{Paper Organization} We present background and notation in \Cref{sec:background}. We prove our main hardness results in \Cref{sec:hard} and present our approximation algorithm in \Cref{sec:algo}.
Finally, we discuss related work in \Cref{sec:related-work} and conclude in \Cref{sec:concl-future-work}. All proofs are in the appendix.

View File

@ -1,9 +1,9 @@
\documentclass[sigconf, prologue, table]{acmart}
\documentclass[sigconf, prologue, table, anonymous]{acmart}
\AtBeginDocument{%
\providecommand\BibTeX{{%
\normalfont B\kern-0.5em{\scshape i\kern-0.25em b}\kern-0.8em\TeX}}}
\setcopyright{acmcopyright}
\copyrightyear{2022}
\acmYear{2022}
@ -46,7 +46,7 @@
\usepackage[normalem]{ulem}
\usepackage{subcaption}
\usepackage{booktabs}
\usepackage%[disable]
\usepackage[disable]
{todonotes}
\usepackage{graphicx}
@ -82,7 +82,7 @@ sensitive=true
\graphicspath{ {figures/} }
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\usepackage{outlines}%For outline capabilities
\usepackage{outlines}%For outline capabilities
\usepackage{enumitem}%used in tandem with outlines package
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@ -131,39 +131,40 @@ sensitive=true
\input{abstract}
\end{abstract}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% REMOVED UNTIL CR
% \begin{CCSXML}
% <ccs2012>
% <concept>
% <concept_id>10002951.10003227</concept_id>
% <concept_desc>Information systems~Information systems applications</concept_desc>
% <concept_significance>500</concept_significance>
% </concept>
% <concept>
% <concept_id>10002951.10002952</concept_id>
% <concept_desc>Information systems~Data management systems</concept_desc>
% <concept_significance>500</concept_significance>
% </concept>
% <concept>
% <concept_id>10003752.10003753.10003757</concept_id>
% <concept_desc>Theory of computation~Probabilistic computation</concept_desc>
% <concept_significance>500</concept_significance>
% </concept>
% <concept>
% <concept_id>10003752.10003777.10003778</concept_id>
% <concept_desc>Theory of computation~Complexity classes</concept_desc>
% <concept_significance>500</concept_significance>
% </concept>
% </ccs2012>
% \end{CCSXML}
\begin{CCSXML}
<ccs2012>
<concept>
<concept_id>10002951.10003227</concept_id>
<concept_desc>Information systems~Information systems applications</concept_desc>
<concept_significance>500</concept_significance>
</concept>
<concept>
<concept_id>10002951.10002952</concept_id>
<concept_desc>Information systems~Data management systems</concept_desc>
<concept_significance>500</concept_significance>
</concept>
<concept>
<concept_id>10003752.10003753.10003757</concept_id>
<concept_desc>Theory of computation~Probabilistic computation</concept_desc>
<concept_significance>500</concept_significance>
</concept>
<concept>
<concept_id>10003752.10003777.10003778</concept_id>
<concept_desc>Theory of computation~Complexity classes</concept_desc>
<concept_significance>500</concept_significance>
</concept>
</ccs2012>
\end{CCSXML}
% \ccsdesc[500]{Information systems~Information systems applications}
% \ccsdesc[500]{Information systems~Data management systems}
% \ccsdesc[500]{Theory of computation~Probabilistic computation}
% \ccsdesc[500]{Theory of computation~Complexity classes}
\ccsdesc[500]{Information systems~Information systems applications}
\ccsdesc[500]{Information systems~Data management systems}
\ccsdesc[500]{Theory of computation~Probabilistic computation}
\ccsdesc[500]{Theory of computation~Complexity classes}
\keywords{probabilistic data model, parameterized complexity, fine-grained complexity, lineage polynomial}
%\keywords{probabilistic data model, parameterized complexity, fine-grained complexity, lineage polynomial}
\maketitle