Trimming. Down by ~1 column so far.
parent
b839349c51
commit
212b915809
|
@ -90,8 +90,17 @@ Recall that tuple blocks in a TIDB always have size 1, so the outer summation of
|
|||
\end{proof}
|
||||
|
||||
|
||||
\subsection{Proof of~\Cref{prop:expection-of-polynom}}
|
||||
\subsection{Equating Expectations of Tuple Multiplicities and Polynomials}
|
||||
\label{subsec:expectation-of-polynom-proof}
|
||||
|
||||
\begin{Proposition}[Expectation of polynomials]\label{prop:expection-of-polynom}
|
||||
Given a \abbrBPDB $\pdb = (\Omega,\bpd)$, $\raPlus$ query $\query$, and lineage polynomial $\apolyqdt$ for arbitrary result tuple $\tup$,
|
||||
we have (denoting $\randDB$ as the random variable over $\Omega$):
|
||||
$ \expct_{\randDB \sim \bpd}[\query(\randDB)(t)] = \expct_{\vct{\randWorld}\sim \pdassign}\pbox{\apolyqdt\inparen{\vct{\randWorld}}}. $
|
||||
\end{Proposition}
|
||||
|
||||
Although \Cref{prop:expection-of-polynom} follows, e.g., as an obvious consequence of~\cite{IL84a}'s Theorem 7.1, we are unaware of any prior formal proof for bag-probabilistic databases.
|
||||
|
||||
\begin{proof}
|
||||
We need to prove for $\semN$-PDB $\pdb = (\idb,\pd)$ and \abbrNXPDB $\pxdb = (\db_{\semNX},\pd')$ where $\rmod(\pxdb) = \pdb$ that $\expct_{\randDB\sim \pd}[\query(\db)(t)] = \expct_{\vct{W} \sim \pd'}\pbox{\nxpolyqdt(\vct{W})}$
|
||||
By expanding $\nxpolyqdt$ and the expectation we have:
|
||||
|
|
|
@ -5,7 +5,8 @@
|
|||
We showed in~\Cref{sec:hard} that a runtime of $\bigO{\qruntime{\optquery{\query},\tupset,\bound}}$ cannot be acheived for~\Cref{prob:bag-pdb-poly-expected}. In light of this, we desire to produce an approximation algorithm that runs in time $\bigO{\qruntime{\optquery{\query},\tupset,\bound}}$. We do this by showing the result via circuits,
|
||||
such that our approximation algorithm for this problem runs in $\bigO{\abs{\circuit}}$ for a very broad class of circuits, (thus affirming~\Cref{prob:intro-stmt}); see the discussion after \Cref{lem:val-ub} for more.
|
||||
The following approximation algorithm applies to bag query semantics over both
|
||||
\abbrCTIDB lineage polynomials and general \abbrBIDB lineage polynomials in practice, where for the latter we note that a $1$-\abbrTIDB is equivalently a \abbrBIDB (blocks are size $1$). Our experimental results (see~\Cref{app:subsec:experiment}) which use queries from the PDBench benchmark~\cite{pdbench} show a low $\gamma$ (see~\Cref{def:param-gamma}) supporting the notion that our bounds hold for general \abbrBIDB in practice.
|
||||
\abbrCTIDB lineage polynomials and general \abbrBIDB lineage polynomials in practice, where for the latter we note that a $1$-\abbrTIDB is equivalently a \abbrBIDB (blocks are size $1$).
|
||||
Our experimental results (see~\Cref{app:subsec:experiment}), which use queries from the PDBench benchmark~\cite{pdbench} support the notion that our bounds hold for general \abbrBIDB in practice.
|
||||
|
||||
Corresponding proofs and pseudocode for all formal statements and algorithms
|
||||
can be found in \Cref{sec:proofs-approx-alg}.
|
||||
|
@ -41,7 +42,7 @@ The functions \size and \depth output the number of gates and levels respectivel
|
|||
\end{Definition}
|
||||
|
||||
\begin{Definition}[$\degree(\cdot)$]\label{def:degree}\footnote{Note that the degree of $\polyf(\abs{\circuit})$ is always upper bounded by $\degree(\circuit)$ and the latter can be strictly larger (e.g. consider the case when $\circuit$ multiplies two copies of the constant $1$-- here we have $\deg(\circuit)=1$ but degree of $\polyf(\abs{\circuit})$ is $0$).}
|
||||
$\degree(\circuit)$ is defined recursively as follows:
|
||||
$\degree(\circuit)$ is defined recursively:
|
||||
\[\degree(\circuit)=
|
||||
\begin{cases}
|
||||
\max(\degree(\circuit_\linput),\degree(\circuit_\rinput)) & \text{ if }\circuit.\type=+\\
|
||||
|
@ -52,7 +53,8 @@ $\degree(\circuit)$ is defined recursively as follows:
|
|||
\]
|
||||
\end{Definition}
|
||||
|
||||
Next, we use the following notation for the complexity of multiplying integers:
|
||||
\noindent
|
||||
We use the following notation for integer multiplication complexity:
|
||||
\begin{Definition}[$\multc{\cdot}{\cdot}$]\footnote{We note that when doing arithmetic operations on the RAM model for input of size $N$, we have that $\multc{O(\log{N})}{O(\log{N})}=O(1)$. More generally we have $\multc{N}{O(\log{N})}=O(N\log{N}\log\log{N})$.}
|
||||
In a RAM model of word size of $W$-bits, $\multc{M}{W}$ denotes the complexity of multiplying two integers represented with $M$-bits. (We will assume that for input of size $N$, $W=O(\log{N})$.)
|
||||
\end{Definition}
|
||||
|
@ -61,7 +63,7 @@ Finally, to get linear runtime results, we will need to define another parameter
|
|||
that need to be `canceled' when monomials with dependent variables are removed (\Cref{subsec:one-bidb}).
|
||||
Let $\isInd{\cdot}$ be a boolean function returning true if monomial $\encMon$ is composed of independent variables and false otherwise; further, let $\indicator{\theta}$ also be a boolean function returning true if $\theta$ evaluates to true.
|
||||
\begin{Definition}[Parameter $\gamma$]\label{def:param-gamma}
|
||||
Given a \abbrOneBIDB circuit $\circuit$ define
|
||||
Given \abbrOneBIDB circuit $\circuit$, let
|
||||
\[\gamma(\circuit)=\frac{\sum_{(\monom, \coef)\in \expansion{\circuit}} \abs{\coef}\cdot \indicator{\neg\isInd{\encMon}} }
|
||||
{\abs{\circuit}(1,\ldots, 1)}.\]
|
||||
\end{Definition}
|
||||
|
|
|
@ -40,7 +40,7 @@ We call a polynomial $\poly\inparen{\vct{X}}$ a \emph{\abbrCTIDB-lineage polynom
|
|||
\begin{Definition}[\abbrOneBIDB]\label{def:one-bidb}
|
||||
Define a \emph{\abbrOneBIDB} to be the pair $\pdb' = \inparen{\bigtimes_{\tup\in\tupset'}\inset{0, \bound_\tup}, \bpd'},$ where $\tupset'$ is the set of possible tuples such that each $\tup \in \tupset'$ has a multiplicity domain of $\inset{0, \bound_\tup}$, with $\bound_\tup\in\mathbb{N}$. $\tupset'$ is partitioned into $\numblock$ independent blocks $\block_i,$ for $i\in\pbox{\numblock}$, of disjoint tuples. $\bpd'$ is characterized by the vector $\inparen{\prob_\tup}_{\tup\in\tupset'}$ where for every block $\block_i$, $\sum_{\tup \in \block_i}\prob_\tup \leq 1$. Given $W\in\onebidbworlds{\tupset'}$ and for $i\in\pbox{\numblock}$, let $\prob_i(W) = \begin{cases}
|
||||
1 - \sum_{\tup\in\block_i}\prob_\tup & \text{if }W_\tup = 0\text{ for all }\tup\in\block_i\\
|
||||
0 & \text{if there exists } \tup,~\tup'\in\block_i, W_\tup, W_{\tup'}\neq 0\\
|
||||
0 & \text{if there exists } \tup \neq \tup'\in\block_i; W_\tup, W_{\tup'}\neq 0\\
|
||||
\prob_\tup & W_\tup \ne 0 \text{ for the unique } t\in B_i.\\
|
||||
\end{cases}$
|
||||
|
||||
|
@ -54,9 +54,8 @@ We slightly abuse notation here, denoting a world vector as $W$ rather than $\wo
|
|||
\begin{Proposition}[\abbrCTIDB reduction]\label{prop:ctidb-reduct}
|
||||
Given \abbrCTIDB $\pdb =$\newline $\inparen{\worlds, \bpd}$, let $\pdb' = \inparen{\onebidbworlds{\tupset'}, \bpd'}$ be the \emph{\abbrOneBIDB} obtained in the following manner: for each $\tup\in\tupset$, create block $\block_\tup = \inset{\intuple{\tup, j}_{j\in\pbox{\bound}}}$ of disjoint tuples, for all $j\in\pbox{\bound}$ where $\bound_{\tup_j} = j$ for each $\tup_j$ in $\tupset'$.
|
||||
The probability distribution $\bpd'$ is the characterized by the vector $\vct{p} = \inparen{\inparen{\prob_{\tup, j}}_{\tup\in\tupset, j\in\pbox{\bound}}}$.
|
||||
Then, the distributions $\mathcal{P}$ and $\mathcal{P}'$ are equivalent.
|
||||
Then, $\mathcal{P}$ and $\mathcal{P}'$ are equivalent.
|
||||
\end{Proposition}
|
||||
\AH{Not entirely sure if I the notation above $\bound_{\tup_j} = j$ for each $\tup_j$ in $\tupset'$.}
|
||||
|
||||
We now define the reduced polynomial $\rpoly'$ of a \abbrOneBIDB.
|
||||
\begin{figure}[t!]
|
||||
|
@ -92,7 +91,7 @@ We now define the reduced polynomial $\rpoly'$ of a \abbrOneBIDB.
|
|||
\end{figure}
|
||||
|
||||
\begin{Definition}[$\rpoly'$]\label{def:reduced-poly-one-bidb}
|
||||
Given a polynomial $\poly'\inparen{\vct{X}}$ generated from a \abbrOneBIDB and let $\rpoly'\inparen{\vct{X}}$ denote the reduced form of $\poly'\inparen{\vct{X}}$ derived as follows: i) compute $\smbOf{\poly'\inparen{\vct{X}}}$ eliminating all monomials with cross terms $X_{\tup}X_{\tup'}$ for $\tup\neq \tup' \in \block_i$ and ii) reduce all \emph{variable} exponents $e > 1$ to $1$.
|
||||
Given a polynomial $\poly'\inparen{\vct{X}}$ generated from a \abbrOneBIDB, let $\rpoly'\inparen{\vct{X}}$ denote the reduced $\poly'\inparen{\vct{X}}$ derived as follows: i) compute $\smbOf{\poly'\inparen{\vct{X}}}$ eliminating monomials with cross terms $X_{\tup}X_{\tup'}$ for $\tup\neq \tup' \in \block_i$ and ii) reduce all \emph{variable} exponents $e > 1$ to $1$.
|
||||
\end{Definition}
|
||||
Then given $\worldvec\in\inset{0,1}^{\tupset'}$ over the reduced \abbrOneBIDB of~\Cref{prop:ctidb-reduct}, the disjoint requirement and the semantics for constructing the lineage polynomial over a \abbrOneBIDB, $\poly'\inparen{\worldvec}$ is of the same structure as the reformulated polynomial $\refpoly{}\inparen{\worldvec}$ of step i) from~\Cref{def:reduced-poly}, which then implies that $\rpoly'$ is the reduced polynomial that results from step ii) of both~\Cref{def:reduced-poly} and~\Cref{def:reduced-poly-one-bidb}, and further that~\Cref{lem:tidb-reduce-poly} immediately follows for \abbrOneBIDB polynomials.
|
||||
\begin{Lemma}\label{lem:bin-bidb-phi-eq-redphi}
|
||||
|
|
|
@ -36,11 +36,10 @@ We require only that the algorithm must enumerate its output, i.e., that $\joint
|
|||
Worst-case optimal join algorithms~\cite{skew,ngo-survey} and query evaluation via factorized databases~\cite{factorized-db} (as well as work on FAQs~\cite{DBLP:conf/pods/KhamisNR16}) can be modeled as $\raPlus$ queries (though the query size is data dependent).
|
||||
For these algorithms, $\jointime{R_1, \ldots, R_n}$ is linear in the {\em AGM bound}~\cite{AGM}.
|
||||
% = |R_1| + \ldots + |R_n| + |R_1(\db) \bowtie \ldots \bowtie R_n(\db)|$.
|
||||
Our cost model for general query evaluation follows from the join cost:
|
||||
|
||||
Our cost model for general query evaluation follows:
|
||||
%\noindent\resizebox{1\linewidth}{!}{
|
||||
%\begin{minipage}{1.0\linewidth}
|
||||
\vspace{-0.57cm}
|
||||
{\small
|
||||
\begin{flalign*}
|
||||
&\begin{aligned}
|
||||
&\qruntimenoopt{R,\gentupset,\bound} = |\gentupset.R|
|
||||
|
@ -54,6 +53,7 @@ For these algorithms, $\jointime{R_1, \ldots, R_n}$ is linear in the {\em AGM bo
|
|||
&\qquad\qruntimenoopt{\query_1, \gentupset,\bound} + \ldots + \qruntimenoopt{\query_m,\gentupset,\bound} +
|
||||
\jointime{\query_1(\gentupset), \ldots, \query_m(\gentupset)}&\\
|
||||
\end{flalign*}
|
||||
}
|
||||
\vspace{-0.93cm}
|
||||
%\begin{align*}
|
||||
% &\qruntimenoopt{\query_1 \bowtie \ldots \bowtie \query_m, \gentupset,\bound} = \\
|
||||
|
@ -63,13 +63,13 @@ For these algorithms, $\jointime{R_1, \ldots, R_n}$ is linear in the {\em AGM bo
|
|||
%\end{minipage}
|
||||
%}\\
|
||||
|
||||
|
||||
\noindent
|
||||
Under this model, an $\raPlus$ query $\query$ evaluated over database $\gentupset$ has runtime $O(\qruntimenoopt{Q,\gentupset, \bound})$.
|
||||
We assume that full table scans are used for every base relation access. We can model index scans by treating an index scan query $\sigma_\theta(R)$ as a base relation.
|
||||
%Observe that
|
||||
% () .\footnote{This claim can be verified by e.g. simply looking at the {\em Generic-Join} algorithm in~\cite{skew} and {\em factorize} algorithm in~\cite{factorized-db}.} It can be verified that the above cost model on the corresponding $\raPlus$ join queries correctly captures the runtime of current best known .
|
||||
|
||||
\Cref{lem:circ-model-runtime} and \Cref{lem:tlc-is-the-same-as-det} show that for any $\raPlus$ query $\query$ and $\tupset$, there exists a circuit $\circuit^*$ such that $\timeOf{\abbrStepOne}(Q,\tupset,\circuit^*)$ and $|\circuit^*|$ are both $O(\qruntimenoopt{\optquery{\query}, \tupset,\bound})$. Recall we assumed these two bounds when we moved from \Cref{prob:big-o-joint-steps} to \Cref{prob:intro-stmt}. Lastly, we can handle FAQs and factorized databases by allowing for optimization, i.e. $\optquery{\query}$.
|
||||
\Cref{lem:circ-model-runtime} and \Cref{lem:tlc-is-the-same-as-det} show that for any $\raPlus$ query $\query$ and $\tupset$, there exists a circuit $\circuit^*$ such that $\timeOf{\abbrStepOne}(Q,\tupset,\circuit^*)$ and $|\circuit^*|$ are both $O(\qruntimenoopt{\optquery{\query}, \tupset,\bound})$, as we assumed when moving from \Cref{prob:big-o-joint-steps} to \Cref{prob:intro-stmt}. Lastly, we can handle FAQs and factorized databases by allowing for optimization, i.e. $\qruntimenoopt{\optquery{\query}, \gentupset, \bound}$.
|
||||
%
|
||||
%We now make a simple observation on the above cost model:
|
||||
%\begin{proposition}
|
||||
|
|
|
@ -4,20 +4,22 @@
|
|||
|
||||
This work explores the problem of computing the expectation of the multiplicity of a tuple in the result of a query over a \abbrCTIDB, a type of probabilistic database with bag semantics where the multiplicity of a tuple is a random variable with range $[0,\bound]$ for some fixed constant $\bound$ and multiplicities assigned to any two tuples are independent of each other.
|
||||
Formally, a \abbrCTIDB,
|
||||
$\pdb = \inparen{\worlds, \bpd}$ consists of a set of tuples $\tupset$ and a probability distribution $\bpd$ over all possible worlds generated by assigning each tuple $\tup \in \tupset$ a multiplicity in the range $[0,\bound]$.
|
||||
Any such world can be encoded as a vector from $\worlds$, the set of all vectors of length $\numvar=\abs{\tupset}$ such that each index corresponds to a distinct $\tup \in \tupset$ storing its multiplicity.
|
||||
A given world $\worldvec \in\worlds$ can be interpreted as follows: for each $\tup \in \tupset$, $\worldvec_{\tup}$ is the multiplicity of $\tup$ in $\worldvec$. Given that the multiplicities of tuples are independent events, the probability distribution $\bpd$ can be expressed compactly by assigning each tuple a (disjoint) probability distribution over $[0,\bound]$. Let $\prob_{\tup,j}$ denote the probability that tuple $\tup$ is assigned multiplicity $j$. The probability of a particular world $\worldvec$ is then $\prod_{\tup \in \tupset} \prob_{\tup,\worldvec_{\tup}}$.
|
||||
$\pdb = \inparen{\worlds, \bpd}$ is a set of tuples $\tupset$ and a probability distribution $\bpd$ over all possible worlds generated by assigning each tuple $\tup \in \tupset$ a multiplicity in the range $[0,\bound]$.
|
||||
Any such world can be encoded as a vector (of length $\numvar=\abs{\tupset}$) from $\worlds$, such that the multiplicity of each $\tup \in \tupset$ is stored at a distinct index.
|
||||
A given world $\worldvec \in\worlds$ can be interpreted as follows: for each $\tup \in \tupset$, $\worldvec_{\tup}$ is the multiplicity of $\tup$ in $\worldvec$.
|
||||
We note that encoding a possible world as a vector, while non-standard, is equivalent to encoding it as a set of tuples (\Cref{prop:expection-of-polynom} in \Cref{subsec:expectation-of-polynom-proof}).
|
||||
Given that tuple multiplicities are independent events, the probability distribution $\bpd$ can be expressed compactly by assigning each tuple a (disjoint) probability distribution over $[0,\bound]$. Let $\prob_{\tup,j}$ denote the probability that tuple $\tup$ is assigned multiplicity $j$. The probability of a world $\worldvec$ is then $\prod_{\tup \in \tupset} \prob_{\tup,\worldvec_{\tup}}$.
|
||||
|
||||
Allowing for $\leq \bound$ multiplicities across all tuples gives rise to having $\leq \inparen{\bound+1}^\numvar$ possible worlds instead of the usual $2^\numvar$ possible worlds of a $1$-\abbrTIDB, which (assuming set query semantics), is the same as the traditional set \abbrTIDB.
|
||||
In this work, since we are generally considering bag query input, we will only be considering bag query semantics. We denote by $\query\inparen{\worldvec}\inparen{\tup}$ the multiplicity of $\tup$ in query $\query$ over possible world $\worldvec\in\worlds$.
|
||||
|
||||
%
|
||||
We can formally state our problem of computing the expected multiplicity of a result tuple as:
|
||||
|
||||
\begin{Problem}\label{prob:expect-mult}
|
||||
Given a \abbrCTIDB $\pdb = \inparen{\worlds, \bpd}$, $\raPlus$ query\footnote{
|
||||
An $\raPlus$ query is a query expressed in positive relational algebra, i.e., using only the relational algebra operators selection ($\select$), projection ($\project$), natural join ($\join$) and union ($\union$).
|
||||
Given \abbrCTIDB $\pdb = \inparen{\worlds, \bpd}$, $\raPlus$ query\footnote{
|
||||
An $\raPlus$ query is a query expressed in positive relational algebra, i.e., using only the operators selection ($\select$), projection ($\project$), natural join ($\join$) and union ($\union$).
|
||||
}
|
||||
$\query$, and result tuple $\tup$, compute the expected multiplicity of $\tup$: $\expct_{\rvworld\sim\bpd}\pbox{\query\inparen{\rvworld}\inparen{\tup}}$.
|
||||
$\query$, and result tuple $\tup$, compute the expectation $\expct_{\rvworld\sim\bpd}\pbox{\query\inparen{\rvworld}\inparen{\tup}}$.
|
||||
\end{Problem}
|
||||
|
||||
\begin{figure}[t!]
|
||||
|
@ -43,27 +45,26 @@ An $\raPlus$ query is a query expressed in positive relational algebra, i.e., us
|
|||
&\polyqdt{\query_1 \join \query_2}{\gentupset}{\tup} =\\
|
||||
&\qquad\polyqdt{\query_1}{\gentupset}{\project_{\attr{\query_1}}{\tup}}\\
|
||||
&\qquad\cdot\polyqdt{\query_2}{\gentupset}{\project_{\attr{\query_2}}{\tup}}
|
||||
\end{aligned}\\
|
||||
&&&\polyqdt{\rel}{\gentupset}{\tup} = X_\tup
|
||||
\end{aligned}
|
||||
\end{align*}%\\[-10mm]
|
||||
\setlength{\abovecaptionskip}{-0.25cm}
|
||||
\caption{Construction of the lineage (polynomial) for an $\raPlus$ query $\query$ over an arbitrary deterministic database $\gentupset$, where $\vct{X}$ consists of all $X_\tup$ over all $\rel$ in $\gentupset$ and $\tup$ in $\rel$. Here $\gentupset.\rel$ denotes the instance of relation $\rel$ in $\gentupset$. Please note, after we introduce the reduction to $1$-\abbrBIDB, the base case will be expressed alternatively.}
|
||||
\caption{Construction of the lineage (polynomial) for an $\raPlus$ query $\query$ over an arbitrary deterministic database $\gentupset$, where $\vct{X}$ consists of all $X_\tup$ over all $\rel$ in $\gentupset$ and $\tup$ in $\rel$. Here $\gentupset.\rel$ denotes the instance of relation $\rel$ in $\gentupset$. Please note, after we introduce the reduction to $1$-\abbrBIDB, the base case will be expressed alternatively. The base case is $\polyqdt{\rel}{\gentupset}{\tup} = X_\tup$}
|
||||
\label{fig:nxDBSemantics}
|
||||
\vspace{-0.53cm}
|
||||
\end{figure}
|
||||
|
||||
It is natural to explore computing the expected multiplicity of a result tuple as this is the analog for computing the marginal probability of a tuple in a set \abbrPDB.
|
||||
In this work we will assume that $c =\bigO{1}$ since this is what is typically seen in practice.
|
||||
Computing the expected multiplicity of a result tuple in a \abbrCTIDB is the analog of the marginal probability in a set \abbrPDB.
|
||||
In this work we will assume that $c =\bigO{1}$, since this is what is typically seen in practice.
|
||||
Allowing for unbounded $c$ is an interesting open problem.
|
||||
|
||||
\mypar{Hardness of Set Query Semantics and Bag Query Semantics}
|
||||
Set query evaluation semantics over $1$-\abbrTIDB\xplural have been studied extensively, and the data complexity of the problem in general has been shown by Dalvi and Suicu to be \sharpphard\cite{10.1145/1265530.1265571}. For our setting, there exists a trivial polytime algorithm to compute~\Cref{prob:expect-mult} for any $\raPlus$ query over a \abbrCTIDB due to linearity of expection (see~\Cref{sec:intro-poly-equiv}).
|
||||
Since we can compute~\Cref{prob:expect-mult} in polynomial time, the interesting question that we explore deals with analyzing the hardness of computing expectation using fine-grained analysis and parameterized complexity, where we are interested in the exponent of polynomial runtime.
|
||||
Set query evaluation semantics over $1$-\abbrTIDB\xplural have been studied extensively, and their data complexity has, in general been shown by Dalvi and Suicu to be \sharpphard\cite{10.1145/1265530.1265571}. For our setting, there exists a trivial polytime algorithm to compute~\Cref{prob:expect-mult} for any $\raPlus$ query over a \abbrCTIDB due to linearity of expection (see~\Cref{sec:intro-poly-equiv}).
|
||||
Since we can compute~\Cref{prob:expect-mult} in polynomial time, the interesting question that we explore is the hardness of computing expectation using fine-grained analysis and parameterized complexity, where we are interested in the exponent of polynomial runtime.
|
||||
|
||||
Specifically, in this work we ask if~\Cref{prob:expect-mult} can be solved in time linear in the runtime of an analogous deterministic query which we make more precise shortly.
|
||||
Specifically, in this work we ask if~\Cref{prob:expect-mult} can be solved in time linear in the runtime of an analogous deterministic query, which we make more precise shortly.
|
||||
If this is true, then this would open up the way for deployment of \abbrCTIDB\xplural in practice. To analyze this question we denote by $\timeOf{}^*(Q,\pdb)$ the optimal runtime complexity of computing~\Cref{prob:expect-mult} over \abbrCTIDB $\pdb$.
|
||||
|
||||
Let $\qruntime{\query,\gentupset,\bound}$ (see~\Cref{sec:gen} for further details) denote the runtime for query $\query$, deterministic database $\gentupset$, and multiplicity bound $\bound$. This paper considers $\raPlus$ queries for which order of operations is \emph{explicit}, as opposed to other query languages, e.g. Datalog, UCQ. Thus, since order of operations affects runtime, we denote the optimized $\raPlus$ query picked by an arbitrary production system as $\optquery{\query} = \min_{\query'\in\raPlus, \query'\equiv\query}\qruntime{\query', \gentupset, \bound}$. Then $\qruntime{\optquery{\query}, \gentupset,\bound}$ is the runtime for the optimized query.\footnote{Note that our work applies to any $\query \in\raPlus$, which implies that specific heuristics for choosing an optimized query can be abstracted away, i.e., our work does not consider heuristic techniques.}
|
||||
Let $\qruntime{\query,\gentupset,\bound}$ (see~\Cref{sec:gen} for further details) denote the runtime for query $\query$, deterministic database $\gentupset$, and multiplicity bound $\bound$. This paper considers $\raPlus$ queries, for which order of operations is \emph{explicit}, as opposed to other query languages, e.g. Datalog, UCQ. Thus, since order of operations affects runtime, we denote the optimized $\raPlus$ query picked by an arbitrary production system as $\optquery{\query} \approx \min_{\query'\in\raPlus, \query'\equiv\query}\qruntime{\query', \gentupset, \bound}$. Then $\qruntime{\optquery{\query}, \gentupset,\bound}$ is the runtime for the optimized query.\footnote{The upper bounds on runtime that we derive apply pointwise to any $\query \in\raPlus$, allowing us to abstract away the specific heuristics for choosing an optimized query (i.e., Any deterministic query optimization heuristic is equally useful for \abbrCTIDB queries).}
|
||||
|
||||
\begin{table*}[t!]
|
||||
\centering
|
||||
|
@ -88,9 +89,9 @@ Our question is whether or not it is always true that $\timeOf{}^*\inparen{\quer
|
|||
|
||||
Specifically, depending on what hardness result/conjecture we assume, we get various weaker or stronger versions of {\em no} as an answer to our question. To make some sense of the other lower bounds in \Cref{tab:lbs}, we note that it is not too hard to show that $\timeOf{}^*(Q,\pdb) \le \bigO{\inparen{\qruntime{\optquery{\query}, \tupset, \bound}}^k}$, where $k$ is the join width (our notion of join width follows from~\Cref{def:degree-of-poly} and~\Cref{fig:nxDBSemantics}.) of the query $\query$ over all result tuples $\tup$ (and the parameter that defines our family of hard queries).
|
||||
|
||||
What our lower bound in the third row says is that one cannot get more than a polynomial improvement over essentially the trivial algorithm for~\Cref{prob:expect-mult}.
|
||||
What our lower bound in the third row says, is that one cannot get more than a polynomial improvement over essentially the trivial algorithm for~\Cref{prob:expect-mult}.
|
||||
However, this result assumes a hardness conjecture that is not as well studied as those in the first two rows of the table (see \Cref{sec:hard} for more discussion on the hardness assumptions). Further, we note that existing results\footnote{This claim follows from known results for the problem of counting $k$-cliques, where for a query $\query$ over database $\tupset$ that counts the number of $k$-cliques. Specifically, a lower bound of the form $\Omega\inparen{n^{1+\eps_0}}$ for {\em some} $\eps_0>0$ follows from the triangle detection hypothesis (this like our result is for $k=3$). Second, a lower bound of $\omega\inparen{n^{C_0}}$ for {\em all} $C_0>0$ under the assumption $\sharpwzero\ne\sharpwone$ for counting $k$-clique~\cite{10.5555/645413.652181}. Finally, a lower bound of $\Omega\inparen{n^{c_0\cdot k}}$ for {\em some} $c_0>0$ was shown by~\cite{CHEN20061346} (under the strong exponential time hypothesis).
|
||||
} already imply the claimed lower bounds if we were to replace the $\qruntime{\optquery{\query}, \tupset, \bound}$ by just $\numvar$ (indeed these results follow from known lower bounds for deterministic query processing). Our contribution is to then identify a family of hard queries where deterministic query processing is `easy' but computing the expected multiplicities is hard.
|
||||
} already imply the claimed lower bounds if we replace the $\qruntime{\optquery{\query}, \tupset, \bound}$ by just $\numvar = |\tupset|$ (indeed these results follow from known lower bounds for deterministic query processing). Our contribution is to identify a family of hard queries where deterministic query processing is `easy' but computing the expected multiplicities is hard.
|
||||
|
||||
\mypar{Our upper bound results} We introduce a $(1\pm \epsilon)$-approximation algorithm that computes ~\Cref{prob:expect-mult} in time $O_\epsilon\inparen{\qruntime{\optquery{\query}, \tupset, \bound}}$. This means, when we are okay with approximation, that we solve~\Cref{prob:expect-mult} in time linear in the size of the deterministic query and bag \abbrPDB\xplural are deployable in practice.
|
||||
In contrast, known approximation techniques (\cite{DBLP:conf/icde/OlteanuHK10,DBLP:journals/jal/KarpLM89}) in set-\abbrPDB\xplural need time $\Omega(\qruntime{\optquery{\query}, \tupset, \bound}^{2k})$
|
||||
|
@ -99,7 +100,7 @@ Further, our approximation algorithm works for a more general notion of bag \abb
|
|||
(see \Cref{subsec:tidbs-and-bidbs}).
|
||||
|
||||
\subsection{Polynomial Equivalence}\label{sec:intro-poly-equiv}
|
||||
A common encoding of probabilistic databases (e.g., in \cite{IL84a,Imielinski1989IncompleteII,4497507,DBLP:conf/vldb/AgrawalBSHNSW06} and many others) relies on annotating tuples with lineages or propositional formulas that describe the set of possible worlds that the tuple appears in. The bag semantics analog is a provenance/lineage polynomial (see~\Cref{fig:nxDBSemantics}) $\apolyqdt$~\cite{DBLP:conf/pods/GreenKT07}, a polynomial with non-zero integer coefficients and exponents, over variables $\vct{X}$ encoding input tuple multiplicities. Evaluating a lineage polynomial for a query result tuple $t_{out}$ by, for each tuple $\tup_{in}$, assigning the variable $X_{t_{in}}$ encoding the tuple's multiplicity to the tuple's multiplicity in the possible world yields the multiplicity of the $\tup_{out}$ in the query result for this world.
|
||||
A common encoding of probabilistic databases (e.g., in \cite{IL84a,Imielinski1989IncompleteII,4497507,DBLP:conf/vldb/AgrawalBSHNSW06} and many others) annotates tuples with lineages, propositional formulas that describe the set of possible worlds that the tuple appears in. The bag semantics analog is a provenance/lineage polynomial (see~\Cref{fig:nxDBSemantics}) $\apolyqdt$~\cite{DBLP:conf/pods/GreenKT07}, a polynomial with non-zero integer coefficients and exponents, over variables $\vct{X}$ encoding input tuple multiplicities. The lineage polynomial for result tuple $t_{out}$ evaluates to $t_{out}$'s multiplicity in a given possible world when each $X_{t_{in}}$ is replaced by the multiplicity of $t_{in}$ in the possible world.
|
||||
|
||||
We drop $\query$, $\tupset$, and $\tup$ from $\apolyqdt$ when they are clear from the context or irrelevant to the discussion. We now specify the problem of computing the expectation of tuple multiplicity in the language of lineage polynomials:
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
|
@ -130,12 +131,14 @@ To compute $\expct\pbox{\poly_1^2}$ we can use linearity of expectation and push
|
|||
Denote the variables of $\poly$ to be $\vars{\poly}.$ In the \abbrCTIDB setting, $\poly\inparen{\vct{X}}$ has an equivalent reformulation $\inparen{\refpoly{}\inparen{\vct{X_R}}}$ that is of use to us, where $\abs{\vct{X_R}} = \bound\cdot\abs{\vct{X}}$ . Given $X_\tup \in\vars{\poly}$ and integer valuation $X_\tup \in\inset{0,\ldots, c}$. We can replace $X_\tup$ by $\sum_{j\in\pbox{\bound}}jX_{\tup, j}$ where the variables $\inparen{X_{\tup, j}}_{j\in\pbox{\bound}}$ are disjoint with integer assignments $X_\tup\in\inset{0, 1}$. Then for any $\worldvec\in\worlds$ and corresponding reformulated world $\worldvec_{\vct{R}}\in\inset{0, 1}^{\tupset\bound}$, we set $\worldvec_{\vct{R}_{\tup, j}} = 1$ for $\worldvec_\tup = j$, while $\worldvec_{\vct{R}_{\tup, j'}} = 0$ for all $j'\neq j\in\pbox{\bound}$. By construction then $\poly\inparen{\vct{X}}\equiv\refpoly{}\inparen{\vct{X_R}}$ $\inparen{\vct{X_R} = \vars{\refpoly{}}}$ since for any integer valuation $X_\tup\in\pbox{\bound}$, $X_{\tup, j}\in\inset{0, 1}$ we have the equality $X_\tup = j = \sum_{j\in\pbox{\bound}}jX_{t, j}$.
|
||||
|
||||
Considering again our example,
|
||||
{\small
|
||||
\begin{multline*}
|
||||
\refpoly{1, }^{\inparen{ABX}^2}\inparen{A, X, B} = \poly_1^{\inparen{AXB}^2}\inparen{\sum_{j_1\in\pbox{\bound}}j_1A_{j_1}, \sum_{j_2\in\pbox{\bound}}j_2X_{j_2}, \sum_{j_3\in\pbox{\bound}}j_3B_{j_3}} \\
|
||||
= \inparen{\sum_{j_1\in\pbox{\bound}}j_1A_{j_1}}^2\inparen{\sum_{j_2\in\pbox{\bound}}j_2X_{j_2}}^2\inparen{\sum_{j_3\in\pbox{\bound}}j_3B_{j_3}}^2.
|
||||
\end{multline*}
|
||||
}
|
||||
Since the set of multiplicities for tuple $\tup$ by nature are disjoint we can drop all cross terms and have $\refpoly{1, }^2 = \sum_{j_1, j_2, j_3 \in \pbox{\bound}}j_1^2A^2_{j_1}j_2^2X_{j_2}^2j_3^2B^2_{j_3}$. Since we now have that all $\randWorld_{X_j}\in\inset{0, 1}$, computing expectation yields $\expct\pbox{\refpoly{1, }^2}=\sum_{j_1,j_2,j_3\in\pbox{\bound}}j_1^2j_2^2j_3^2$ \allowbreak $\expct\pbox{\randWorld_{A_{j_1}}}\expct\pbox{\randWorld_{X_{j_2}}}\expct\pbox{\randWorld_{B_{j_3}}}$.
|
||||
This leads us to consider a structure related to the lineage polynomial.
|
||||
This leads us to consider a structure related to lineage polynomials:
|
||||
|
||||
\begin{Definition}\label{def:reduced-poly}
|
||||
For any polynomial $\poly\inparen{\inparen{X_\tup}_{\tup\in\tupset}}$ define the reformulated polynomial $\refpoly{}\inparen{\inparen{X_{\tup, j}}_{\tup\in\tupset, j\in\pbox{\bound}}}
|
||||
|
@ -145,7 +148,7 @@ $ to be the polynomial resulting from converting $\refpoly{}$ into the standard
|
|||
\footnote{
|
||||
This is the representation, typically used in set-\abbrPDB\xplural, where the polynomial is reresented as sum of `pure' products. See \Cref{def:smb} for a formal definition.
|
||||
}
|
||||
removing all monomials containing the term $X_{\tup, j}X_{\tup, j'}$ for $\tup\in\tupset, j\neq j'\in\pbox{c}$, and setting all \emph{variable} exponents $e > 1$ to $1$.
|
||||
removing all monomials containing the term $X_{\tup, j}X_{\tup, j'}$ for $\tup\in\tupset, j\neq j'\in\pbox{c}$, and setting each \emph{variable}'s exponents $e > 1$ to $1$.
|
||||
\end{Definition}
|
||||
Continuing with the example
|
||||
\footnote{
|
||||
|
@ -158,20 +161,22 @@ To save clutter we do not show the full expansion for variables with greatest mu
|
|||
&= ABX_1 + AB\inparen{2}^2X_2+ BYE + BZC + 2AX_1BYE+ 2A\inparen{2}^2X_2BYE\\
|
||||
&\qquad + 2AX_1BZC + 2A\inparen{2}^2X_2BZC + 2BYEZC.
|
||||
\end{align*}
|
||||
Note that we have argued that for our specific example the expectation that we want is $\rpoly_1^2(\probOf\inparen{A=1},$\allowbreak$\probOf\inparen{B=1}, \probOf\inparen{C=1}$,\allowbreak $\probOf\inparen{E=1},$\allowbreak $\probOf\inparen{X_1=1}, \probOf\inparen{X_2=1}, \probOf\inparen{Y=1}, \probOf\inparen{Z=1})$.
|
||||
\Cref{lem:tidb-reduce-poly} generalizes the equivalence to {\em all} $\raPlus$ queries on \abbrCTIDB\xplural (proof in \Cref{subsec:proof-exp-poly-rpoly}).
|
||||
We have argued that for our specific example the expectation that we want is $\rpoly_1^2(\probOf\inparen{A=1},$\allowbreak$\probOf\inparen{B=1}, \probOf\inparen{C=1}$,\allowbreak $\probOf\inparen{E=1},$\allowbreak $\probOf\inparen{X_1=1}, \probOf\inparen{X_2=1}, \probOf\inparen{Y=1}, \probOf\inparen{Z=1})$.
|
||||
\Cref{lem:tidb-reduce-poly} generalizes the equivalence to {\em all} $\raPlus$ queries on \abbrCTIDB\xplural .
|
||||
\begin{Lemma}\label{lem:tidb-reduce-poly}
|
||||
For any \abbrCTIDB $\pdb$, $\raPlus$ query $\query$, and lineage polynomial
|
||||
$\poly\inparen{\vct{X}}=\poly\pbox{\query,\tupset,\tup}\inparen{\vct{X}}$, it holds that $
|
||||
\expct_{\vct{W} \sim \pdassign}\pbox{\poly\inparen{\vct{W}}} = \rpoly\inparen{\probAllTup}
|
||||
$, where $\probAllTup = \inparen{\prob_{\tup}}_{\tup\in\tupset}.$
|
||||
\end{Lemma}
|
||||
The proof follows by~\Cref{prop:ctidb-reduct} and~\Cref{lem:bin-bidb-phi-eq-redphi}.
|
||||
|
||||
\noindent
|
||||
The proof in \Cref{subsec:proof-exp-poly-rpoly} follows by~\Cref{prop:ctidb-reduct} and~\Cref{lem:bin-bidb-phi-eq-redphi}.
|
||||
|
||||
\subsection{Our Techniques}
|
||||
\mypar{Lower Bound Proof Techniques}
|
||||
Our main hardness result shows that computing~\Cref{prob:expect-mult} is $\sharpwonehard$ for $1$-\abbrTIDB. To prove this result we show that for the same $\query_1$ from the example above, for an arbitrary `product width' $k$, the query $\qhard^k$ is able to encode various hard graph-counting problems (assuming $\bigO{\numvar}$ tuples rather than the $\bigO{1}$ tuples in \Cref{fig:two-step}).
|
||||
We do so by considering an arbitrary graph $G$ (analogous to relation $\boldsymbol{R}$ of $\query_1$) and analyzing how the coefficients in the (univariate) polynomial $\widetilde{\poly}\left(p,\dots,p\right)$ relate to counts of subgraphs in $G$ that are isomorphic to various graphs with $k$ edges. E.g., we exploit the fact that the coefficient corresponding to the power of $2k$ in $\poly$ of $\qhard^k$ is proportional to the number of $k$-matchings in $G$,
|
||||
We do so by considering an arbitrary graph $G$ (analogous to relation $\boldsymbol{R}$ of $\query_1$) and analyzing how the coefficients in the (univariate) polynomial $\widetilde{\poly}\left(p,\dots,p\right)$ relate to counts of subgraphs in $G$ that are isomorphic to various subgraphs with $k$ edges. E.g., we exploit the fact that the coefficient corresponding to the power of $2k$ in $\poly$ of $\qhard^k$ is proportional to the number of $k$-matchings in $G$,
|
||||
a known hard problem in parameterized/fine-grained complexity literature.
|
||||
|
||||
|
||||
|
@ -182,7 +187,7 @@ Our negative results (\Cref{tab:lbs}) indicate that \abbrCTIDB{}s (even for $\bo
|
|||
We adopt a two-step intensional model of query evaluation used in set-\abbrPDB\xplural, as illustrated in \Cref{fig:two-step}:
|
||||
(i) \termStepOne (\abbrStepOne): Given input $\tupset$ and $\query$, output every tuple $\tup$ that possibly satisfies $\query$, annotated with its lineage polynomial ($\poly(\vct{X})=\apolyqdt\inparen{\vct{X}}$);
|
||||
(ii) \termStepTwo (\abbrStepTwo): Given $\poly(\vct{X})$ for each tuple, compute $\expct_{\randWorld\sim\bpd}\pbox{\poly(\vct{\randWorld})}$.
|
||||
Let $\timeOf{\abbrStepOne}(Q,\tupset,\circuit)$ denote the runtime of \abbrStepOne when it outputs $\circuit$ (which is a representation of $\poly$ as an arithmetic circuit --- more on this representation in~\Cref{sec:expression-trees}).
|
||||
Let $\timeOf{\abbrStepOne}(Q,\tupset,\circuit)$ denote the runtime of \abbrStepOne when it outputs $\circuit$ (a representation of $\poly$ as an arithmetic circuit --- more on this representation in~\Cref{sec:expression-trees}).
|
||||
Denote by $\timeOf{\abbrStepTwo}(\circuit, \epsilon)$ (recall $\circuit$ is the output of \abbrStepOne) the runtime of \abbrStepTwo, which we can leverage~\Cref{def:reduced-poly} and~\Cref{lem:tidb-reduce-poly} to address the next formal objective:
|
||||
|
||||
\begin{Problem}[\abbrCTIDB linear time approximation]\label{prob:big-o-joint-steps}
|
||||
|
@ -191,30 +196,25 @@ is there a $(1\pm\epsilon)$-approximation of $\expct_{\rvworld\sim\bpd}$\allowbr
|
|||
$\exists \circuit : \timeOf{\abbrStepOne}(Q,\tupset, \circuit) + \timeOf{\abbrStepTwo}(\circuit, \epsilon) \le$\allowbreak$ O_\epsilon(\qruntime{\optquery{\query}, \tupset, \bound})$?
|
||||
\end{Problem}
|
||||
|
||||
We show in \Cref{sec:circuit-depth} an $\bigO{\qruntime{\optquery{\query}, \tupset, \bound}}$ algorithm for constructing the lineage polynomial for all result tuples of an $\raPlus$ query $\query$ (or more more precisely, a single circuit $\circuit$ with one sink per tuple representing the tuple's lineage).
|
||||
A key insight of this paper is that the representation of $\circuit$ matters.
|
||||
For example, if we insist that $\circuit$ represent the lineage polynomial in \abbrSMB, the answer to the above question in general is no, since then we will need $\abs{\circuit}\ge \Omega\inparen{\inparen{\qruntime{\optquery{\query}, \tupset, \bound}}^k}$,
|
||||
and hence, just $\timeOf{\abbrStepOne}(\query,\tupset,\circuit)$ will be too large.
|
||||
|
||||
and hence, just $\timeOf{\abbrStepOne}(\query,\tupset,\circuit)$ is too large.
|
||||
However, systems can directly emit compact, factorized representations of $\poly(\vct{X})$ (e.g., as a consequence of the standard projection push-down optimization~\cite{DBLP:books/daglib/0020812}).
|
||||
For example, in~\Cref{fig:two-step}, $B(Y+Z)$ is a factorized representation of the SMB-form $BY+BZ$.
|
||||
Accordingly, this work uses (arithmetic) circuits\footnote{
|
||||
An arithmetic circuit is a DAG with variable and/or numeric source nodes and internal, each nodes representing either an addition or multiplication operator.
|
||||
}
|
||||
as the representation system of $\poly(\vct{X})$.
|
||||
as the representation system of $\poly(\vct{X})$, and we show in \Cref{sec:circuit-depth} an $\bigO{\qruntime{\optquery{\query}, \tupset, \bound}}$ algorithm for constructing the lineage polynomial for all result tuples of an $\raPlus$ query $\query$ (or more more precisely, a single circuit $\circuit$ with one sink per tuple representing the tuple's lineage).
|
||||
|
||||
Given that there exists a representation $\circuit^*$ such that $\timeOf{\abbrStepOne}(\query,\tupset,\circuit^*)\le \bigO{\qruntime{\optquery{\query}, \tupset, \bound}}$, we can now focus on the complexity of the \abbrStepTwo step.
|
||||
We can represent the factorized lineage polynomial by its correspoding arithmetic circuit $\circuit$ (whose size we denote by $|\circuit|$).
|
||||
As we also show in \Cref{sec:circuit-runtime}, this size is also bounded by $\qruntime{\optquery{\query}, \tupset, \bound}$ (i.e., $|\circuit^*| \le \bigO{\qruntime{\optquery{\query}, \tupset, \bound}}$).
|
||||
Given that a representation $\circuit^*$ exists where $\timeOf{\abbrStepOne}(\query,\tupset,\circuit^*)\le \bigO{\qruntime{\optquery{\query}, \tupset, \bound}}$, we can focus on the complexity of \abbrStepTwo.
|
||||
As we also show in \Cref{sec:circuit-runtime}, this size is also bounded by $\qruntime{\optquery{\query}, \tupset, \bound}$ (i.e., $|\circuit^*| \le \bigO{\qruntime{\optquery{\query}, \tupset, \bound}}$), where $|\circuit|$ is the size of circuit $\circuit$.
|
||||
Thus, the question of approximation
|
||||
can be stated as the following stronger (since~\Cref{prob:big-o-joint-steps} has access to \emph{all} equivalent \circuit representing $\query\inparen{\vct{W}}\inparen{\tup}$), but sufficient condition:
|
||||
\begin{Problem}\label{prob:intro-stmt}
|
||||
Given one circuit $\circuit$ that encodes $\apolyqdt$ for all result tuples $\tup$ (one sink per $\tup$) for \abbrCTIDB $\pdb$ and $\raPlus$ query $\query$, does there exist an algorithm that computes a $(1\pm\epsilon)$-approximation of $\expct_{\rvworld\sim\bpd}\pbox{\query\inparen{\rvworld}\inparen{\tup}}$ (for all result tuples $\tup$) in $\bigO{|\circuit|}$ time?
|
||||
\end{Problem}
|
||||
|
||||
For an upper bound on approximating the expected count, it is easy to check that if all the probabilties are constant then (with an additive adjustment) $\poly\left(\prob_1,\dots, \prob_n\right)$ (i.e. evaluating the original lineage polynomial
|
||||
over the probability values) is a constant factor approximation
|
||||
. This is illustrated in the following example using $\query_1^2$ from earlier. To aid in presentation we assume $\bound = 2$ for variable $X$ and $\bound = 1$ for all other variables. Let $\prob_A$ denote $\probOf\pbox{A = 1}$.
|
||||
For an upper bound on approximating the expected count, it is easy to check that if all the probabilties are constant then (with an additive adjustment) $\poly\left(\prob_1,\dots, \prob_n\right)$ (i.e. evaluating the original lineage polynomial over the probability values) is a constant factor approximation.
|
||||
This is illustrated in the following example using $\query_1^2$ from earlier. To aid in presentation we assume $\bound = 2$ for variable $X$ and $\bound = 1$ for all other variables. Let $\prob_A$ denote $\probOf\pbox{A = 1}$.
|
||||
In computing $\rpoly$, we have some cancellations to deal with:
|
||||
|
||||
\begin{footnotesize}
|
||||
|
@ -264,7 +264,7 @@ In work independent of ours, Grohe, et. al.~\cite{https://doi.org/10.48550/arxiv
|
|||
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
\mypar{Paper Organization} We present relevant background and notation in \Cref{sec:background}. We then prove our main hardness results in \Cref{sec:hard} and present our approximation algorithm in \Cref{sec:algo}.
|
||||
\mypar{Paper Organization} We present background and notation in \Cref{sec:background}. We prove our main hardness results in \Cref{sec:hard} and present our approximation algorithm in \Cref{sec:algo}.
|
||||
Finally, we discuss related work in \Cref{sec:related-work} and conclude in \Cref{sec:concl-future-work}. All proofs are in the appendix.
|
||||
|
||||
|
||||
|
|
|
@ -3,11 +3,11 @@
|
|||
\section{Hardness of Exact Computation}
|
||||
\label{sec:hard}
|
||||
In this section, we will prove the hardness results claimed in Table~\ref{tab:lbs} for a specific (family) of hard instance $(\qhard,\pdb)$ for \Cref{prob:bag-pdb-poly-expected} where $\pdb$ is a $1$-\abbrTIDB.
|
||||
Note that this implies hardness for \abbrCTIDB\xplural $\inparen{\bound\geq1}$, showing \Cref{prob:bag-pdb-poly-expected} cannot be done in $\bigO{\qruntime{\optquery{\query},\tupset,\bound}}$ runtime. The results also apply to \abbrOneBIDB and other more general \abbrPDB\xplural.
|
||||
Note that this implies hardness for \abbrCTIDB\xplural $\inparen{\bound\geq1}$; \Cref{prob:bag-pdb-poly-expected} cannot be done in $\bigO{\qruntime{\optquery{\query},\tupset,\bound}}$ runtime. The results also apply to \abbrOneBIDB and other \abbrPDB\xplural.
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
\subsection{Preliminaries}\label{sec:hard:sub:pre}
|
||||
Our hardness results are based on (exactly) counting the number of (not necessarily induced) subgraphs in $G$ isomorphic to $H$. Let $\numocc{G}{H}$ denote this quantity. We can think of $H$ as being of constant size and $G$ as growing.
|
||||
In particular, we will consider the problems of computing the following counts (given $G$ in its adjacency list representation): $\numocc{G}{\tri}$ (the number of triangles), $\numocc{G}{\threedis}$ (the number of $3$-matchings), and the latter's generalization $\numocc{G}{\kmatch}$ (the number of $k$-matchings). We use $\kmatchtime$ to denote the optimal runtime of computing $\numocc{G}{\kmatch}$ exactly. Our hardness results in \Cref{sec:multiple-p} are based on the following hardness results/conjectures:
|
||||
In particular, we will consider computing the following counts (given $G$ in its adjacency list representation): $\numocc{G}{\tri}$ (the number of triangles), $\numocc{G}{\threedis}$ (the number of $3$-matchings), and the latter's generalization $\numocc{G}{\kmatch}$ (the number of $k$-matchings). We use $\kmatchtime$ to denote the optimal runtime of computing $\numocc{G}{\kmatch}$ exactly. Our hardness results in \Cref{sec:multiple-p} are based on the following hardness results/conjectures:
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
\begin{Theorem}[\cite{k-match}]
|
||||
|
@ -20,7 +20,7 @@ Given positive integer $k$ and undirected graph $G=(\vset,\edgeSet)$ with no sel
|
|||
%\end{hypo}
|
||||
|
||||
\begin{hypo}[~\cite{DBLP:journals/corr/CurticapeanM14}]\label{conj:known-algo-kmatch}
|
||||
For every $G=\inparen{\vset, \edgeSet}$, we have $\kmatchtime\ge n^{\Omega\inparen{k/\log{k}}}$.
|
||||
For every $G=\inparen{\vset, \edgeSet}$, $\kmatchtime\ge n^{\Omega\inparen{k/\log{k}}}$.
|
||||
\end{hypo}
|
||||
|
||||
We note that the above conjecture is somewhat non-standard. In particular, the best known algorithm to compute $\numocc{G}{\kmatch}$ takes time $\Omega\inparen{|V|^{k/2}}$
|
||||
|
@ -56,11 +56,11 @@ For any graph $G=(V,\edgeSet)$ and $\kElem\ge 1$, define
|
|||
SELECT DISTINCT 1 FROM T $t_1$, R r, T $t_2$
|
||||
WHERE $t_1$.Point = r.Point$_1$ AND $t_2$.Point = r.Point$_2$
|
||||
\end{lstlisting}
|
||||
as $R$. The query $\qhard^k$ then becomes
|
||||
as $Q^1$. The query $\qhard^k$ then becomes
|
||||
\mdfdefinestyle{underbrace}{topline=false, rightline=false, bottomline=false, leftline=false, backgroundcolor=black!15!white, innerbottommargin=0pt}
|
||||
\begin{mdframed}[style=underbrace]
|
||||
\begin{lstlisting}
|
||||
SELECT COUNT(*) FROM $\underbrace{R\text{ JOIN }R\text{ JOIN}\cdots\text{JOIN }R}_{k\rm\ times}$
|
||||
SELECT COUNT(*) FROM $\underbrace{Q^1\text{ JOIN }Q^1\text{ JOIN}\cdots\text{JOIN }Q^1}_{k\rm\ times}$
|
||||
\end{lstlisting}
|
||||
\end{mdframed}
|
||||
\noindent Consider again the \abbrCTIDB instance $\pdb$ of~\Cref{fig:two-step} and, for our hard instance, let $\bound = 1$. $\pdb$ generalizes to one compatible to~\Cref{def:qk} as follows. Relation $T$ has $n$ tuples corresponding to each vertex for $i$ in $[n]$, each with probability $\prob$ and $R$ has tuples corresponding to the edges $\edgeSet$ (each with probability of $1$).\footnote{Technically, $\poly_{G}^\kElem(\vct{X})$ should have variables corresponding to tuples in $R$ as well, but since they always are present with probability $1$, we drop those. Our argument also works when all the tuples in $R$ also are present with probability $\prob$ but to simplify notation we assign probability $1$ to edges.}
|
||||
|
@ -69,7 +69,7 @@ Note that this implies that $\poly_{G}^\kElem$ is indeed a $1$-\abbrTIDB lineage
|
|||
|
||||
Next, we note that the runtime for answering $\qhard^k$ on deterministic database $\tupset$, as defined above, is $O_k\inparen{\numedge}$ (i.e. deterministic query processing is `easy' for this query):
|
||||
\begin{Lemma}\label{lem:tdet-om}
|
||||
Let $\qhard^k$ and $\tupset$ be as defined above. Then
|
||||
Define $\qhard^k$ and $\tupset$ as above. Then
|
||||
$\qruntimenoopt{\qhard^k, \tupset, \bound}$ is $O_k\inparen{\numedge}$.
|
||||
\end{Lemma}
|
||||
|
||||
|
@ -81,7 +81,7 @@ We are now ready to present our main hardness result.
|
|||
|
||||
\begin{Theorem}\label{thm:mult-p-hard-result}
|
||||
Let $\prob_0,\ldots,\prob_{2k}$ be $2k + 1$ distinct values in $(0, 1]$. Then computing $\rpoly_G^\kElem(\prob_i,\dots,\prob_i)$ (over all $i\in [2k+1]$) for arbitrary $G=(\vset,\edgeSet)$
|
||||
needs time $\bigOmega{\kmatchtime}$, assuming $\kmatchtime\ge \omega\inparen{\abs{\edgeSet}}$.
|
||||
needs time $\bigOmega{\kmatchtime}$, if $\kmatchtime\ge \omega\inparen{\abs{\edgeSet}}$.
|
||||
\end{Theorem}
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
%
|
||||
|
|
|
@ -9,9 +9,9 @@ We focus on the problem of computing $\expct_{\worldvec\sim\pdassign}\pbox{\apol
|
|||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
\begin{Definition}[Circuit]\label{def:circuit}
|
||||
A circuit $\circuit$ is a Directed Acyclic Graph (DAG) whose source gates (in degree of $0$) consist of elements in either $\domN$ or $\vct{X} = \inparen{X_1,\ldots,X_\numvar}$. For each result tuple there exists one sink gate. The internal gates have binary input and are either sum ($\circplus$) or product ($\circmult$) gates.
|
||||
A circuit $\circuit$ is a Directed Acyclic Graph (DAG) with source gates (in degree of $0$) drawn from either $\domN$ or $\vct{X} = \inparen{X_1,\ldots,X_\numvar}$ and one sink gate for each result tuple. Internal gates have binary input and are either sum ($\circplus$) or product ($\circmult$) gates.
|
||||
%
|
||||
Each gate has the following members: \type, \vari{input}, \val, \vpartial, \degval, \vari{Lweight}, and \vari{Rweight}, where \type is the value type $\{\circplus, \circmult, \var, \tnum\}$ and \vari{input} the list of inputs. Source gates have an extra member \val storing the value. $\circuit_\linput$ ($\circuit_\rinput$) denotes the left (right) input of \circuit.
|
||||
Each gate has the following members: \type, \vari{input}, \val, \vpartial, \degval, \vari{Lweight}, and \vari{Rweight}, where \type is the value type $\{\circplus, \circmult, \var, \tnum\}$ and \vari{input} the list of inputs. Source gates have an extra member \val for the value. $\circuit_\linput$ ($\circuit_\rinput$) denotes the left (right) input of \circuit.
|
||||
\end{Definition}
|
||||
When the underlying DAG is a tree (with edges pointing towards the root), the structure is an expression tree \etree. In such a case, the root of \etree is analogous to the sink of \circuit. The fields \vari{partial}, \degval, \vari{Lweight}, and \vari{Rweight} are used in the proofs of \Cref{sec:proofs-approx-alg}.
|
||||
|
||||
|
|
29
pwsem.tex
29
pwsem.tex
|
@ -20,29 +20,24 @@ Let $\abs{\poly}$ be the number of operators in $\poly$.
|
|||
If $\poly$ is a $1$-\abbrBIDB lineage polynomial already in \abbrSMB, then the expectation of $\poly$, i.e., $\expct\pbox{\poly} = \rpoly\left(\prob_1,\ldots, \prob_\numvar\right)$ can be computed in $\bigO{\abs{\poly}}$ time.
|
||||
\end{Corollary}
|
||||
|
||||
\subsubsection{Possible World Semantics}\label{subsub:possible-world-sem}
|
||||
In this section, we show how the traditional possible worlds semantics corresponds to our setup. Readers can safely skip this part without missing anything vital to the results of this paper.
|
||||
% \subsubsection{Possible World Semantics}\label{subsub:possible-world-sem}
|
||||
% In this section, we show how the traditional possible worlds semantics corresponds to our setup. Readers can safely skip this part without missing anything vital to the results of this paper.
|
||||
|
||||
Queries over probabilistic databases are traditionally viewed as being evaluated using the so-called possible world semantics. A general bag-\abbrPDB can be defined as the pair $\pdb = \inparen{\Omega, \bpd}$ where $\Omega$ is the set of possible worlds represented by $\pdb$ and $\bpd$ the probability distribution over $\Omega$. Under the possible world semantics, the result of a query $\query$ over an incomplete database $\Omega$ is the set of query answers produced by evaluating $\query$ over each possible world $\omega\in\Omega$: $\inset{\query\inparen{\omega}: \omega\in\Omega}$.
|
||||
The result of a query is the pair $\inparen{\query\inparen{\Omega}, \bpd'}$ where $\bpd'$ is a probability distribution that assigns to each possible query result the sum of the probabilites of the worlds that produce this answer: $\probOf\pbox{\omega\in\Omega} = \sum_{\omega'\in\Omega,\\\query\inparen{\omega'}=\query\inparen{\omega}}\probOf\pbox{\omega'}$.
|
||||
% Queries over probabilistic databases are traditionally viewed as being evaluated using the so-called possible world semantics. A general bag-\abbrPDB can be defined as the pair $\pdb = \inparen{\Omega, \bpd}$ where $\Omega$ is the set of possible worlds represented by $\pdb$ and $\bpd$ the probability distribution over $\Omega$. Under the possible world semantics, the result of a query $\query$ over an incomplete database $\Omega$ is the set of query answers produced by evaluating $\query$ over each possible world $\omega\in\Omega$: $\inset{\query\inparen{\omega}: \omega\in\Omega}$.
|
||||
% The result of a query is the pair $\inparen{\query\inparen{\Omega}, \bpd'}$ where $\bpd'$ is a probability distribution that assigns to each possible query result the sum of the probabilites of the worlds that produce this answer: $\probOf\pbox{\omega\in\Omega} = \sum_{\omega'\in\Omega,\\\query\inparen{\omega'}=\query\inparen{\omega}}\probOf\pbox{\omega'}$.
|
||||
|
||||
|
||||
Suppose that $\pdb'$ is a reduced \abbrOneBIDB from \abbrCTIDB $\pdb$ as defined by~\Cref{prop:ctidb-reduct}. Instead of looking only at the possible worlds of $\pdb'$, one can consider the set of all worlds, including those that cannot exist due to, e.g., disjointness. Since $\abs{\tupset} = \numvar$ the all worlds set can be modeled by $\worldvec\in \{0, 1\}^{\numvar\bound}$, such that $\worldvec_{\tup, j} \in \worldvec$ represents whether or not the multiplicity of $\tup$ is $j$ (\emph{here and later, especially in \Cref{sec:algo}, we will rename the variables as $X_1,\dots,X_{\numvar'}$, where $\numvar'=\sum_{\tup\in\tupset}\abs{\block_\tup}$}).
|
||||
\footnote{
|
||||
In this example, $\abs{\block_\tup} = \bound$ for all $\tup$.
|
||||
}
|
||||
We can denote a probability distribution over all $\worldvec \in \{0, 1\}^{\numvar\bound}$ as $\bpd'$. When $\bpd'$ is the one induced from each $\prob_{\tup, j}$ while assigning $\probOf\pbox{\worldvec} = 0$ for any $\worldvec$ with $\worldvec_{\tup, j}, \worldvec_{\tup, j'} \neq 0$ for $j\neq j'$, we end up with a bijective mapping from $\bpd$ to $\bpd'$, such that each mapping is equivalent, implying the distributions are equivalent, and thus query results.
|
||||
\Cref{subsec:supp-mat-ti-bi-def} has more details. \medskip
|
||||
% Suppose that $\pdb'$ is a reduced \abbrOneBIDB from \abbrCTIDB $\pdb$ as defined by~\Cref{prop:ctidb-reduct}. Instead of looking only at the possible worlds of $\pdb'$, one can consider the set of all worlds, including those that cannot exist due to, e.g., disjointness. Since $\abs{\tupset} = \numvar$ the all worlds set can be modeled by $\worldvec\in \{0, 1\}^{\numvar\bound}$, such that $\worldvec_{\tup, j} \in \worldvec$ represents whether or not the multiplicity of $\tup$ is $j$ (\emph{here and later, especially in \Cref{sec:algo}, we will rename the variables as $X_1,\dots,X_{\numvar'}$, where $\numvar'=\sum_{\tup\in\tupset}\abs{\block_\tup}$}).
|
||||
% \footnote{
|
||||
% In this example, $\abs{\block_\tup} = \bound$ for all $\tup$.
|
||||
% }
|
||||
% We can denote a probability distribution over all $\worldvec \in \{0, 1\}^{\numvar\bound}$ as $\bpd'$. When $\bpd'$ is the one induced from each $\prob_{\tup, j}$ while assigning $\probOf\pbox{\worldvec} = 0$ for any $\worldvec$ with $\worldvec_{\tup, j}, \worldvec_{\tup, j'} \neq 0$ for $j\neq j'$, we end up with a bijective mapping from $\bpd$ to $\bpd'$, such that each mapping is equivalent, implying the distributions are equivalent, and thus query results.
|
||||
% \Cref{subsec:supp-mat-ti-bi-def} has more details. \medskip
|
||||
|
||||
|
||||
We now make a meaningful connection between possible world semantics and world assignments on the lineage polynomial.
|
||||
% We now make a meaningful connection between possible world semantics and world assignments on the lineage polynomial.
|
||||
|
||||
|
||||
\begin{Proposition}[Expectation of polynomials]\label{prop:expection-of-polynom}
|
||||
Given a \abbrBPDB $\pdb = (\Omega,\bpd)$, $\raPlus$ query $\query$, and lineage polynomial $\apolyqdt$ for arbitrary result tuple $\tup$,
|
||||
we have (denoting $\randDB$ as the random variable over $\Omega$):
|
||||
$ \expct_{\randDB \sim \bpd}[\query(\randDB)(t)] = \expct_{\vct{\randWorld}\sim \pdassign}\pbox{\apolyqdt\inparen{\vct{\randWorld}}}. $
|
||||
\end{Proposition}
|
||||
\noindent A formal proof of \Cref{prop:expection-of-polynom} is given in \Cref{subsec:expectation-of-polynom-proof}.\footnote{Although \Cref{prop:expection-of-polynom} follows, e.g., as an obvious consequence of~\cite{IL84a}'s Theorem 7.1, we are unaware of any formal proof for bag-probabilistic databases.}
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -11,7 +11,7 @@ Fix $\prob\in (0,1)$. Then assuming \Cref{conj:graph} is true, any algorithm tha
|
|||
\end{Theorem}
|
||||
|
||||
Note that \Cref{prop:expection-of-polynom} and \Cref{th:single-p-hard} above imply the hardness result in the first row of \Cref{tab:lbs}.
|
||||
We note that \Cref{thm:k-match-hard} and \Cref{conj:known-algo-kmatch} (and the lower bounds in the second and third rows of Table~\ref{tab:lbs}) need $k$ to be large enough (in particular, we need a family of hard queries). But the above \Cref{th:single-p-hard} (and the lower bound in first row of Table~\ref{tab:lbs}) holds for $k=3$ (and hence for a fixed query).
|
||||
We note that \Cref{thm:k-match-hard} and \Cref{conj:known-algo-kmatch} (and the lower bounds in the second and third rows) need $k$ to be large enough (in particular, we need a family of hard queries). But the above \Cref{th:single-p-hard} (and the lower bound in first row of Table~\ref{tab:lbs}) holds for $k=3$ (and hence for a fixed query).
|
||||
|
||||
|
||||
%%% Local Variables:
|
||||
|
|
Loading…
Reference in New Issue