Trimming. Down by ~1 column so far.

master
Oliver Kennedy 2022-05-18 17:51:12 -04:00
parent b839349c51
commit 212b915809
Signed by: okennedy
GPG Key ID: 3E5F9B3ABD3FDB60
10 changed files with 84 additions and 79 deletions

View File

@ -90,8 +90,17 @@ Recall that tuple blocks in a TIDB always have size 1, so the outer summation of
\end{proof}
\subsection{Proof of~\Cref{prop:expection-of-polynom}}
\subsection{Equating Expectations of Tuple Multiplicities and Polynomials}
\label{subsec:expectation-of-polynom-proof}
\begin{Proposition}[Expectation of polynomials]\label{prop:expection-of-polynom}
Given a \abbrBPDB $\pdb = (\Omega,\bpd)$, $\raPlus$ query $\query$, and lineage polynomial $\apolyqdt$ for arbitrary result tuple $\tup$,
we have (denoting $\randDB$ as the random variable over $\Omega$):
$ \expct_{\randDB \sim \bpd}[\query(\randDB)(t)] = \expct_{\vct{\randWorld}\sim \pdassign}\pbox{\apolyqdt\inparen{\vct{\randWorld}}}. $
\end{Proposition}
Although \Cref{prop:expection-of-polynom} follows, e.g., as an obvious consequence of~\cite{IL84a}'s Theorem 7.1, we are unaware of any prior formal proof for bag-probabilistic databases.
\begin{proof}
We need to prove for $\semN$-PDB $\pdb = (\idb,\pd)$ and \abbrNXPDB $\pxdb = (\db_{\semNX},\pd')$ where $\rmod(\pxdb) = \pdb$ that $\expct_{\randDB\sim \pd}[\query(\db)(t)] = \expct_{\vct{W} \sim \pd'}\pbox{\nxpolyqdt(\vct{W})}$
By expanding $\nxpolyqdt$ and the expectation we have:

View File

@ -5,7 +5,8 @@
We showed in~\Cref{sec:hard} that a runtime of $\bigO{\qruntime{\optquery{\query},\tupset,\bound}}$ cannot be acheived for~\Cref{prob:bag-pdb-poly-expected}. In light of this, we desire to produce an approximation algorithm that runs in time $\bigO{\qruntime{\optquery{\query},\tupset,\bound}}$. We do this by showing the result via circuits,
such that our approximation algorithm for this problem runs in $\bigO{\abs{\circuit}}$ for a very broad class of circuits, (thus affirming~\Cref{prob:intro-stmt}); see the discussion after \Cref{lem:val-ub} for more.
The following approximation algorithm applies to bag query semantics over both
\abbrCTIDB lineage polynomials and general \abbrBIDB lineage polynomials in practice, where for the latter we note that a $1$-\abbrTIDB is equivalently a \abbrBIDB (blocks are size $1$). Our experimental results (see~\Cref{app:subsec:experiment}) which use queries from the PDBench benchmark~\cite{pdbench} show a low $\gamma$ (see~\Cref{def:param-gamma}) supporting the notion that our bounds hold for general \abbrBIDB in practice.
\abbrCTIDB lineage polynomials and general \abbrBIDB lineage polynomials in practice, where for the latter we note that a $1$-\abbrTIDB is equivalently a \abbrBIDB (blocks are size $1$).
Our experimental results (see~\Cref{app:subsec:experiment}), which use queries from the PDBench benchmark~\cite{pdbench} support the notion that our bounds hold for general \abbrBIDB in practice.
Corresponding proofs and pseudocode for all formal statements and algorithms
can be found in \Cref{sec:proofs-approx-alg}.
@ -41,7 +42,7 @@ The functions \size and \depth output the number of gates and levels respectivel
\end{Definition}
\begin{Definition}[$\degree(\cdot)$]\label{def:degree}\footnote{Note that the degree of $\polyf(\abs{\circuit})$ is always upper bounded by $\degree(\circuit)$ and the latter can be strictly larger (e.g. consider the case when $\circuit$ multiplies two copies of the constant $1$-- here we have $\deg(\circuit)=1$ but degree of $\polyf(\abs{\circuit})$ is $0$).}
$\degree(\circuit)$ is defined recursively as follows:
$\degree(\circuit)$ is defined recursively:
\[\degree(\circuit)=
\begin{cases}
\max(\degree(\circuit_\linput),\degree(\circuit_\rinput)) & \text{ if }\circuit.\type=+\\
@ -52,7 +53,8 @@ $\degree(\circuit)$ is defined recursively as follows:
\]
\end{Definition}
Next, we use the following notation for the complexity of multiplying integers:
\noindent
We use the following notation for integer multiplication complexity:
\begin{Definition}[$\multc{\cdot}{\cdot}$]\footnote{We note that when doing arithmetic operations on the RAM model for input of size $N$, we have that $\multc{O(\log{N})}{O(\log{N})}=O(1)$. More generally we have $\multc{N}{O(\log{N})}=O(N\log{N}\log\log{N})$.}
In a RAM model of word size of $W$-bits, $\multc{M}{W}$ denotes the complexity of multiplying two integers represented with $M$-bits. (We will assume that for input of size $N$, $W=O(\log{N})$.)
\end{Definition}
@ -61,7 +63,7 @@ Finally, to get linear runtime results, we will need to define another parameter
that need to be `canceled' when monomials with dependent variables are removed (\Cref{subsec:one-bidb}).
Let $\isInd{\cdot}$ be a boolean function returning true if monomial $\encMon$ is composed of independent variables and false otherwise; further, let $\indicator{\theta}$ also be a boolean function returning true if $\theta$ evaluates to true.
\begin{Definition}[Parameter $\gamma$]\label{def:param-gamma}
Given a \abbrOneBIDB circuit $\circuit$ define
Given \abbrOneBIDB circuit $\circuit$, let
\[\gamma(\circuit)=\frac{\sum_{(\monom, \coef)\in \expansion{\circuit}} \abs{\coef}\cdot \indicator{\neg\isInd{\encMon}} }
{\abs{\circuit}(1,\ldots, 1)}.\]
\end{Definition}

View File

@ -40,7 +40,7 @@ We call a polynomial $\poly\inparen{\vct{X}}$ a \emph{\abbrCTIDB-lineage polynom
\begin{Definition}[\abbrOneBIDB]\label{def:one-bidb}
Define a \emph{\abbrOneBIDB} to be the pair $\pdb' = \inparen{\bigtimes_{\tup\in\tupset'}\inset{0, \bound_\tup}, \bpd'},$ where $\tupset'$ is the set of possible tuples such that each $\tup \in \tupset'$ has a multiplicity domain of $\inset{0, \bound_\tup}$, with $\bound_\tup\in\mathbb{N}$. $\tupset'$ is partitioned into $\numblock$ independent blocks $\block_i,$ for $i\in\pbox{\numblock}$, of disjoint tuples. $\bpd'$ is characterized by the vector $\inparen{\prob_\tup}_{\tup\in\tupset'}$ where for every block $\block_i$, $\sum_{\tup \in \block_i}\prob_\tup \leq 1$. Given $W\in\onebidbworlds{\tupset'}$ and for $i\in\pbox{\numblock}$, let $\prob_i(W) = \begin{cases}
1 - \sum_{\tup\in\block_i}\prob_\tup & \text{if }W_\tup = 0\text{ for all }\tup\in\block_i\\
0 & \text{if there exists } \tup,~\tup'\in\block_i, W_\tup, W_{\tup'}\neq 0\\
0 & \text{if there exists } \tup \neq \tup'\in\block_i; W_\tup, W_{\tup'}\neq 0\\
\prob_\tup & W_\tup \ne 0 \text{ for the unique } t\in B_i.\\
\end{cases}$
@ -54,9 +54,8 @@ We slightly abuse notation here, denoting a world vector as $W$ rather than $\wo
\begin{Proposition}[\abbrCTIDB reduction]\label{prop:ctidb-reduct}
Given \abbrCTIDB $\pdb =$\newline $\inparen{\worlds, \bpd}$, let $\pdb' = \inparen{\onebidbworlds{\tupset'}, \bpd'}$ be the \emph{\abbrOneBIDB} obtained in the following manner: for each $\tup\in\tupset$, create block $\block_\tup = \inset{\intuple{\tup, j}_{j\in\pbox{\bound}}}$ of disjoint tuples, for all $j\in\pbox{\bound}$ where $\bound_{\tup_j} = j$ for each $\tup_j$ in $\tupset'$.
The probability distribution $\bpd'$ is the characterized by the vector $\vct{p} = \inparen{\inparen{\prob_{\tup, j}}_{\tup\in\tupset, j\in\pbox{\bound}}}$.
Then, the distributions $\mathcal{P}$ and $\mathcal{P}'$ are equivalent.
Then, $\mathcal{P}$ and $\mathcal{P}'$ are equivalent.
\end{Proposition}
\AH{Not entirely sure if I the notation above $\bound_{\tup_j} = j$ for each $\tup_j$ in $\tupset'$.}
We now define the reduced polynomial $\rpoly'$ of a \abbrOneBIDB.
\begin{figure}[t!]
@ -92,7 +91,7 @@ We now define the reduced polynomial $\rpoly'$ of a \abbrOneBIDB.
\end{figure}
\begin{Definition}[$\rpoly'$]\label{def:reduced-poly-one-bidb}
Given a polynomial $\poly'\inparen{\vct{X}}$ generated from a \abbrOneBIDB and let $\rpoly'\inparen{\vct{X}}$ denote the reduced form of $\poly'\inparen{\vct{X}}$ derived as follows: i) compute $\smbOf{\poly'\inparen{\vct{X}}}$ eliminating all monomials with cross terms $X_{\tup}X_{\tup'}$ for $\tup\neq \tup' \in \block_i$ and ii) reduce all \emph{variable} exponents $e > 1$ to $1$.
Given a polynomial $\poly'\inparen{\vct{X}}$ generated from a \abbrOneBIDB, let $\rpoly'\inparen{\vct{X}}$ denote the reduced $\poly'\inparen{\vct{X}}$ derived as follows: i) compute $\smbOf{\poly'\inparen{\vct{X}}}$ eliminating monomials with cross terms $X_{\tup}X_{\tup'}$ for $\tup\neq \tup' \in \block_i$ and ii) reduce all \emph{variable} exponents $e > 1$ to $1$.
\end{Definition}
Then given $\worldvec\in\inset{0,1}^{\tupset'}$ over the reduced \abbrOneBIDB of~\Cref{prop:ctidb-reduct}, the disjoint requirement and the semantics for constructing the lineage polynomial over a \abbrOneBIDB, $\poly'\inparen{\worldvec}$ is of the same structure as the reformulated polynomial $\refpoly{}\inparen{\worldvec}$ of step i) from~\Cref{def:reduced-poly}, which then implies that $\rpoly'$ is the reduced polynomial that results from step ii) of both~\Cref{def:reduced-poly} and~\Cref{def:reduced-poly-one-bidb}, and further that~\Cref{lem:tidb-reduce-poly} immediately follows for \abbrOneBIDB polynomials.
\begin{Lemma}\label{lem:bin-bidb-phi-eq-redphi}

View File

@ -36,11 +36,10 @@ We require only that the algorithm must enumerate its output, i.e., that $\joint
Worst-case optimal join algorithms~\cite{skew,ngo-survey} and query evaluation via factorized databases~\cite{factorized-db} (as well as work on FAQs~\cite{DBLP:conf/pods/KhamisNR16}) can be modeled as $\raPlus$ queries (though the query size is data dependent).
For these algorithms, $\jointime{R_1, \ldots, R_n}$ is linear in the {\em AGM bound}~\cite{AGM}.
% = |R_1| + \ldots + |R_n| + |R_1(\db) \bowtie \ldots \bowtie R_n(\db)|$.
Our cost model for general query evaluation follows from the join cost:
Our cost model for general query evaluation follows:
%\noindent\resizebox{1\linewidth}{!}{
%\begin{minipage}{1.0\linewidth}
\vspace{-0.57cm}
{\small
\begin{flalign*}
&\begin{aligned}
&\qruntimenoopt{R,\gentupset,\bound} = |\gentupset.R|
@ -54,6 +53,7 @@ For these algorithms, $\jointime{R_1, \ldots, R_n}$ is linear in the {\em AGM bo
&\qquad\qruntimenoopt{\query_1, \gentupset,\bound} + \ldots + \qruntimenoopt{\query_m,\gentupset,\bound} +
\jointime{\query_1(\gentupset), \ldots, \query_m(\gentupset)}&\\
\end{flalign*}
}
\vspace{-0.93cm}
%\begin{align*}
% &\qruntimenoopt{\query_1 \bowtie \ldots \bowtie \query_m, \gentupset,\bound} = \\
@ -63,13 +63,13 @@ For these algorithms, $\jointime{R_1, \ldots, R_n}$ is linear in the {\em AGM bo
%\end{minipage}
%}\\
\noindent
Under this model, an $\raPlus$ query $\query$ evaluated over database $\gentupset$ has runtime $O(\qruntimenoopt{Q,\gentupset, \bound})$.
We assume that full table scans are used for every base relation access. We can model index scans by treating an index scan query $\sigma_\theta(R)$ as a base relation.
%Observe that
% () .\footnote{This claim can be verified by e.g. simply looking at the {\em Generic-Join} algorithm in~\cite{skew} and {\em factorize} algorithm in~\cite{factorized-db}.} It can be verified that the above cost model on the corresponding $\raPlus$ join queries correctly captures the runtime of current best known .
\Cref{lem:circ-model-runtime} and \Cref{lem:tlc-is-the-same-as-det} show that for any $\raPlus$ query $\query$ and $\tupset$, there exists a circuit $\circuit^*$ such that $\timeOf{\abbrStepOne}(Q,\tupset,\circuit^*)$ and $|\circuit^*|$ are both $O(\qruntimenoopt{\optquery{\query}, \tupset,\bound})$. Recall we assumed these two bounds when we moved from \Cref{prob:big-o-joint-steps} to \Cref{prob:intro-stmt}. Lastly, we can handle FAQs and factorized databases by allowing for optimization, i.e. $\optquery{\query}$.
\Cref{lem:circ-model-runtime} and \Cref{lem:tlc-is-the-same-as-det} show that for any $\raPlus$ query $\query$ and $\tupset$, there exists a circuit $\circuit^*$ such that $\timeOf{\abbrStepOne}(Q,\tupset,\circuit^*)$ and $|\circuit^*|$ are both $O(\qruntimenoopt{\optquery{\query}, \tupset,\bound})$, as we assumed when moving from \Cref{prob:big-o-joint-steps} to \Cref{prob:intro-stmt}. Lastly, we can handle FAQs and factorized databases by allowing for optimization, i.e. $\qruntimenoopt{\optquery{\query}, \gentupset, \bound}$.
%
%We now make a simple observation on the above cost model:
%\begin{proposition}

View File

@ -4,20 +4,22 @@
This work explores the problem of computing the expectation of the multiplicity of a tuple in the result of a query over a \abbrCTIDB, a type of probabilistic database with bag semantics where the multiplicity of a tuple is a random variable with range $[0,\bound]$ for some fixed constant $\bound$ and multiplicities assigned to any two tuples are independent of each other.
Formally, a \abbrCTIDB,
$\pdb = \inparen{\worlds, \bpd}$ consists of a set of tuples $\tupset$ and a probability distribution $\bpd$ over all possible worlds generated by assigning each tuple $\tup \in \tupset$ a multiplicity in the range $[0,\bound]$.
Any such world can be encoded as a vector from $\worlds$, the set of all vectors of length $\numvar=\abs{\tupset}$ such that each index corresponds to a distinct $\tup \in \tupset$ storing its multiplicity.
A given world $\worldvec \in\worlds$ can be interpreted as follows: for each $\tup \in \tupset$, $\worldvec_{\tup}$ is the multiplicity of $\tup$ in $\worldvec$. Given that the multiplicities of tuples are independent events, the probability distribution $\bpd$ can be expressed compactly by assigning each tuple a (disjoint) probability distribution over $[0,\bound]$. Let $\prob_{\tup,j}$ denote the probability that tuple $\tup$ is assigned multiplicity $j$. The probability of a particular world $\worldvec$ is then $\prod_{\tup \in \tupset} \prob_{\tup,\worldvec_{\tup}}$.
$\pdb = \inparen{\worlds, \bpd}$ is a set of tuples $\tupset$ and a probability distribution $\bpd$ over all possible worlds generated by assigning each tuple $\tup \in \tupset$ a multiplicity in the range $[0,\bound]$.
Any such world can be encoded as a vector (of length $\numvar=\abs{\tupset}$) from $\worlds$, such that the multiplicity of each $\tup \in \tupset$ is stored at a distinct index.
A given world $\worldvec \in\worlds$ can be interpreted as follows: for each $\tup \in \tupset$, $\worldvec_{\tup}$ is the multiplicity of $\tup$ in $\worldvec$.
We note that encoding a possible world as a vector, while non-standard, is equivalent to encoding it as a set of tuples (\Cref{prop:expection-of-polynom} in \Cref{subsec:expectation-of-polynom-proof}).
Given that tuple multiplicities are independent events, the probability distribution $\bpd$ can be expressed compactly by assigning each tuple a (disjoint) probability distribution over $[0,\bound]$. Let $\prob_{\tup,j}$ denote the probability that tuple $\tup$ is assigned multiplicity $j$. The probability of a world $\worldvec$ is then $\prod_{\tup \in \tupset} \prob_{\tup,\worldvec_{\tup}}$.
Allowing for $\leq \bound$ multiplicities across all tuples gives rise to having $\leq \inparen{\bound+1}^\numvar$ possible worlds instead of the usual $2^\numvar$ possible worlds of a $1$-\abbrTIDB, which (assuming set query semantics), is the same as the traditional set \abbrTIDB.
In this work, since we are generally considering bag query input, we will only be considering bag query semantics. We denote by $\query\inparen{\worldvec}\inparen{\tup}$ the multiplicity of $\tup$ in query $\query$ over possible world $\worldvec\in\worlds$.
%
We can formally state our problem of computing the expected multiplicity of a result tuple as:
\begin{Problem}\label{prob:expect-mult}
Given a \abbrCTIDB $\pdb = \inparen{\worlds, \bpd}$, $\raPlus$ query\footnote{
An $\raPlus$ query is a query expressed in positive relational algebra, i.e., using only the relational algebra operators selection ($\select$), projection ($\project$), natural join ($\join$) and union ($\union$).
Given \abbrCTIDB $\pdb = \inparen{\worlds, \bpd}$, $\raPlus$ query\footnote{
An $\raPlus$ query is a query expressed in positive relational algebra, i.e., using only the operators selection ($\select$), projection ($\project$), natural join ($\join$) and union ($\union$).
}
$\query$, and result tuple $\tup$, compute the expected multiplicity of $\tup$: $\expct_{\rvworld\sim\bpd}\pbox{\query\inparen{\rvworld}\inparen{\tup}}$.
$\query$, and result tuple $\tup$, compute the expectation $\expct_{\rvworld\sim\bpd}\pbox{\query\inparen{\rvworld}\inparen{\tup}}$.
\end{Problem}
\begin{figure}[t!]
@ -43,27 +45,26 @@ An $\raPlus$ query is a query expressed in positive relational algebra, i.e., us
&\polyqdt{\query_1 \join \query_2}{\gentupset}{\tup} =\\
&\qquad\polyqdt{\query_1}{\gentupset}{\project_{\attr{\query_1}}{\tup}}\\
&\qquad\cdot\polyqdt{\query_2}{\gentupset}{\project_{\attr{\query_2}}{\tup}}
\end{aligned}\\
&&&\polyqdt{\rel}{\gentupset}{\tup} = X_\tup
\end{aligned}
\end{align*}%\\[-10mm]
\setlength{\abovecaptionskip}{-0.25cm}
\caption{Construction of the lineage (polynomial) for an $\raPlus$ query $\query$ over an arbitrary deterministic database $\gentupset$, where $\vct{X}$ consists of all $X_\tup$ over all $\rel$ in $\gentupset$ and $\tup$ in $\rel$. Here $\gentupset.\rel$ denotes the instance of relation $\rel$ in $\gentupset$. Please note, after we introduce the reduction to $1$-\abbrBIDB, the base case will be expressed alternatively.}
\caption{Construction of the lineage (polynomial) for an $\raPlus$ query $\query$ over an arbitrary deterministic database $\gentupset$, where $\vct{X}$ consists of all $X_\tup$ over all $\rel$ in $\gentupset$ and $\tup$ in $\rel$. Here $\gentupset.\rel$ denotes the instance of relation $\rel$ in $\gentupset$. Please note, after we introduce the reduction to $1$-\abbrBIDB, the base case will be expressed alternatively. The base case is $\polyqdt{\rel}{\gentupset}{\tup} = X_\tup$}
\label{fig:nxDBSemantics}
\vspace{-0.53cm}
\end{figure}
It is natural to explore computing the expected multiplicity of a result tuple as this is the analog for computing the marginal probability of a tuple in a set \abbrPDB.
In this work we will assume that $c =\bigO{1}$ since this is what is typically seen in practice.
Computing the expected multiplicity of a result tuple in a \abbrCTIDB is the analog of the marginal probability in a set \abbrPDB.
In this work we will assume that $c =\bigO{1}$, since this is what is typically seen in practice.
Allowing for unbounded $c$ is an interesting open problem.
\mypar{Hardness of Set Query Semantics and Bag Query Semantics}
Set query evaluation semantics over $1$-\abbrTIDB\xplural have been studied extensively, and the data complexity of the problem in general has been shown by Dalvi and Suicu to be \sharpphard\cite{10.1145/1265530.1265571}. For our setting, there exists a trivial polytime algorithm to compute~\Cref{prob:expect-mult} for any $\raPlus$ query over a \abbrCTIDB due to linearity of expection (see~\Cref{sec:intro-poly-equiv}).
Since we can compute~\Cref{prob:expect-mult} in polynomial time, the interesting question that we explore deals with analyzing the hardness of computing expectation using fine-grained analysis and parameterized complexity, where we are interested in the exponent of polynomial runtime.
Set query evaluation semantics over $1$-\abbrTIDB\xplural have been studied extensively, and their data complexity has, in general been shown by Dalvi and Suicu to be \sharpphard\cite{10.1145/1265530.1265571}. For our setting, there exists a trivial polytime algorithm to compute~\Cref{prob:expect-mult} for any $\raPlus$ query over a \abbrCTIDB due to linearity of expection (see~\Cref{sec:intro-poly-equiv}).
Since we can compute~\Cref{prob:expect-mult} in polynomial time, the interesting question that we explore is the hardness of computing expectation using fine-grained analysis and parameterized complexity, where we are interested in the exponent of polynomial runtime.
Specifically, in this work we ask if~\Cref{prob:expect-mult} can be solved in time linear in the runtime of an analogous deterministic query which we make more precise shortly.
Specifically, in this work we ask if~\Cref{prob:expect-mult} can be solved in time linear in the runtime of an analogous deterministic query, which we make more precise shortly.
If this is true, then this would open up the way for deployment of \abbrCTIDB\xplural in practice. To analyze this question we denote by $\timeOf{}^*(Q,\pdb)$ the optimal runtime complexity of computing~\Cref{prob:expect-mult} over \abbrCTIDB $\pdb$.
Let $\qruntime{\query,\gentupset,\bound}$ (see~\Cref{sec:gen} for further details) denote the runtime for query $\query$, deterministic database $\gentupset$, and multiplicity bound $\bound$. This paper considers $\raPlus$ queries for which order of operations is \emph{explicit}, as opposed to other query languages, e.g. Datalog, UCQ. Thus, since order of operations affects runtime, we denote the optimized $\raPlus$ query picked by an arbitrary production system as $\optquery{\query} = \min_{\query'\in\raPlus, \query'\equiv\query}\qruntime{\query', \gentupset, \bound}$. Then $\qruntime{\optquery{\query}, \gentupset,\bound}$ is the runtime for the optimized query.\footnote{Note that our work applies to any $\query \in\raPlus$, which implies that specific heuristics for choosing an optimized query can be abstracted away, i.e., our work does not consider heuristic techniques.}
Let $\qruntime{\query,\gentupset,\bound}$ (see~\Cref{sec:gen} for further details) denote the runtime for query $\query$, deterministic database $\gentupset$, and multiplicity bound $\bound$. This paper considers $\raPlus$ queries, for which order of operations is \emph{explicit}, as opposed to other query languages, e.g. Datalog, UCQ. Thus, since order of operations affects runtime, we denote the optimized $\raPlus$ query picked by an arbitrary production system as $\optquery{\query} \approx \min_{\query'\in\raPlus, \query'\equiv\query}\qruntime{\query', \gentupset, \bound}$. Then $\qruntime{\optquery{\query}, \gentupset,\bound}$ is the runtime for the optimized query.\footnote{The upper bounds on runtime that we derive apply pointwise to any $\query \in\raPlus$, allowing us to abstract away the specific heuristics for choosing an optimized query (i.e., Any deterministic query optimization heuristic is equally useful for \abbrCTIDB queries).}
\begin{table*}[t!]
\centering
@ -88,9 +89,9 @@ Our question is whether or not it is always true that $\timeOf{}^*\inparen{\quer
Specifically, depending on what hardness result/conjecture we assume, we get various weaker or stronger versions of {\em no} as an answer to our question. To make some sense of the other lower bounds in \Cref{tab:lbs}, we note that it is not too hard to show that $\timeOf{}^*(Q,\pdb) \le \bigO{\inparen{\qruntime{\optquery{\query}, \tupset, \bound}}^k}$, where $k$ is the join width (our notion of join width follows from~\Cref{def:degree-of-poly} and~\Cref{fig:nxDBSemantics}.) of the query $\query$ over all result tuples $\tup$ (and the parameter that defines our family of hard queries).
What our lower bound in the third row says is that one cannot get more than a polynomial improvement over essentially the trivial algorithm for~\Cref{prob:expect-mult}.
What our lower bound in the third row says, is that one cannot get more than a polynomial improvement over essentially the trivial algorithm for~\Cref{prob:expect-mult}.
However, this result assumes a hardness conjecture that is not as well studied as those in the first two rows of the table (see \Cref{sec:hard} for more discussion on the hardness assumptions). Further, we note that existing results\footnote{This claim follows from known results for the problem of counting $k$-cliques, where for a query $\query$ over database $\tupset$ that counts the number of $k$-cliques. Specifically, a lower bound of the form $\Omega\inparen{n^{1+\eps_0}}$ for {\em some} $\eps_0>0$ follows from the triangle detection hypothesis (this like our result is for $k=3$). Second, a lower bound of $\omega\inparen{n^{C_0}}$ for {\em all} $C_0>0$ under the assumption $\sharpwzero\ne\sharpwone$ for counting $k$-clique~\cite{10.5555/645413.652181}. Finally, a lower bound of $\Omega\inparen{n^{c_0\cdot k}}$ for {\em some} $c_0>0$ was shown by~\cite{CHEN20061346} (under the strong exponential time hypothesis).
} already imply the claimed lower bounds if we were to replace the $\qruntime{\optquery{\query}, \tupset, \bound}$ by just $\numvar$ (indeed these results follow from known lower bounds for deterministic query processing). Our contribution is to then identify a family of hard queries where deterministic query processing is `easy' but computing the expected multiplicities is hard.
} already imply the claimed lower bounds if we replace the $\qruntime{\optquery{\query}, \tupset, \bound}$ by just $\numvar = |\tupset|$ (indeed these results follow from known lower bounds for deterministic query processing). Our contribution is to identify a family of hard queries where deterministic query processing is `easy' but computing the expected multiplicities is hard.
\mypar{Our upper bound results} We introduce a $(1\pm \epsilon)$-approximation algorithm that computes ~\Cref{prob:expect-mult} in time $O_\epsilon\inparen{\qruntime{\optquery{\query}, \tupset, \bound}}$. This means, when we are okay with approximation, that we solve~\Cref{prob:expect-mult} in time linear in the size of the deterministic query and bag \abbrPDB\xplural are deployable in practice.
In contrast, known approximation techniques (\cite{DBLP:conf/icde/OlteanuHK10,DBLP:journals/jal/KarpLM89}) in set-\abbrPDB\xplural need time $\Omega(\qruntime{\optquery{\query}, \tupset, \bound}^{2k})$
@ -99,7 +100,7 @@ Further, our approximation algorithm works for a more general notion of bag \abb
(see \Cref{subsec:tidbs-and-bidbs}).
\subsection{Polynomial Equivalence}\label{sec:intro-poly-equiv}
A common encoding of probabilistic databases (e.g., in \cite{IL84a,Imielinski1989IncompleteII,4497507,DBLP:conf/vldb/AgrawalBSHNSW06} and many others) relies on annotating tuples with lineages or propositional formulas that describe the set of possible worlds that the tuple appears in. The bag semantics analog is a provenance/lineage polynomial (see~\Cref{fig:nxDBSemantics}) $\apolyqdt$~\cite{DBLP:conf/pods/GreenKT07}, a polynomial with non-zero integer coefficients and exponents, over variables $\vct{X}$ encoding input tuple multiplicities. Evaluating a lineage polynomial for a query result tuple $t_{out}$ by, for each tuple $\tup_{in}$, assigning the variable $X_{t_{in}}$ encoding the tuple's multiplicity to the tuple's multiplicity in the possible world yields the multiplicity of the $\tup_{out}$ in the query result for this world.
A common encoding of probabilistic databases (e.g., in \cite{IL84a,Imielinski1989IncompleteII,4497507,DBLP:conf/vldb/AgrawalBSHNSW06} and many others) annotates tuples with lineages, propositional formulas that describe the set of possible worlds that the tuple appears in. The bag semantics analog is a provenance/lineage polynomial (see~\Cref{fig:nxDBSemantics}) $\apolyqdt$~\cite{DBLP:conf/pods/GreenKT07}, a polynomial with non-zero integer coefficients and exponents, over variables $\vct{X}$ encoding input tuple multiplicities. The lineage polynomial for result tuple $t_{out}$ evaluates to $t_{out}$'s multiplicity in a given possible world when each $X_{t_{in}}$ is replaced by the multiplicity of $t_{in}$ in the possible world.
We drop $\query$, $\tupset$, and $\tup$ from $\apolyqdt$ when they are clear from the context or irrelevant to the discussion. We now specify the problem of computing the expectation of tuple multiplicity in the language of lineage polynomials:
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@ -130,12 +131,14 @@ To compute $\expct\pbox{\poly_1^2}$ we can use linearity of expectation and push
Denote the variables of $\poly$ to be $\vars{\poly}.$ In the \abbrCTIDB setting, $\poly\inparen{\vct{X}}$ has an equivalent reformulation $\inparen{\refpoly{}\inparen{\vct{X_R}}}$ that is of use to us, where $\abs{\vct{X_R}} = \bound\cdot\abs{\vct{X}}$ . Given $X_\tup \in\vars{\poly}$ and integer valuation $X_\tup \in\inset{0,\ldots, c}$. We can replace $X_\tup$ by $\sum_{j\in\pbox{\bound}}jX_{\tup, j}$ where the variables $\inparen{X_{\tup, j}}_{j\in\pbox{\bound}}$ are disjoint with integer assignments $X_\tup\in\inset{0, 1}$. Then for any $\worldvec\in\worlds$ and corresponding reformulated world $\worldvec_{\vct{R}}\in\inset{0, 1}^{\tupset\bound}$, we set $\worldvec_{\vct{R}_{\tup, j}} = 1$ for $\worldvec_\tup = j$, while $\worldvec_{\vct{R}_{\tup, j'}} = 0$ for all $j'\neq j\in\pbox{\bound}$. By construction then $\poly\inparen{\vct{X}}\equiv\refpoly{}\inparen{\vct{X_R}}$ $\inparen{\vct{X_R} = \vars{\refpoly{}}}$ since for any integer valuation $X_\tup\in\pbox{\bound}$, $X_{\tup, j}\in\inset{0, 1}$ we have the equality $X_\tup = j = \sum_{j\in\pbox{\bound}}jX_{t, j}$.
Considering again our example,
{\small
\begin{multline*}
\refpoly{1, }^{\inparen{ABX}^2}\inparen{A, X, B} = \poly_1^{\inparen{AXB}^2}\inparen{\sum_{j_1\in\pbox{\bound}}j_1A_{j_1}, \sum_{j_2\in\pbox{\bound}}j_2X_{j_2}, \sum_{j_3\in\pbox{\bound}}j_3B_{j_3}} \\
= \inparen{\sum_{j_1\in\pbox{\bound}}j_1A_{j_1}}^2\inparen{\sum_{j_2\in\pbox{\bound}}j_2X_{j_2}}^2\inparen{\sum_{j_3\in\pbox{\bound}}j_3B_{j_3}}^2.
\end{multline*}
}
Since the set of multiplicities for tuple $\tup$ by nature are disjoint we can drop all cross terms and have $\refpoly{1, }^2 = \sum_{j_1, j_2, j_3 \in \pbox{\bound}}j_1^2A^2_{j_1}j_2^2X_{j_2}^2j_3^2B^2_{j_3}$. Since we now have that all $\randWorld_{X_j}\in\inset{0, 1}$, computing expectation yields $\expct\pbox{\refpoly{1, }^2}=\sum_{j_1,j_2,j_3\in\pbox{\bound}}j_1^2j_2^2j_3^2$ \allowbreak $\expct\pbox{\randWorld_{A_{j_1}}}\expct\pbox{\randWorld_{X_{j_2}}}\expct\pbox{\randWorld_{B_{j_3}}}$.
This leads us to consider a structure related to the lineage polynomial.
This leads us to consider a structure related to lineage polynomials:
\begin{Definition}\label{def:reduced-poly}
For any polynomial $\poly\inparen{\inparen{X_\tup}_{\tup\in\tupset}}$ define the reformulated polynomial $\refpoly{}\inparen{\inparen{X_{\tup, j}}_{\tup\in\tupset, j\in\pbox{\bound}}}
@ -145,7 +148,7 @@ $ to be the polynomial resulting from converting $\refpoly{}$ into the standard
\footnote{
This is the representation, typically used in set-\abbrPDB\xplural, where the polynomial is reresented as sum of `pure' products. See \Cref{def:smb} for a formal definition.
}
removing all monomials containing the term $X_{\tup, j}X_{\tup, j'}$ for $\tup\in\tupset, j\neq j'\in\pbox{c}$, and setting all \emph{variable} exponents $e > 1$ to $1$.
removing all monomials containing the term $X_{\tup, j}X_{\tup, j'}$ for $\tup\in\tupset, j\neq j'\in\pbox{c}$, and setting each \emph{variable}'s exponents $e > 1$ to $1$.
\end{Definition}
Continuing with the example
\footnote{
@ -158,20 +161,22 @@ To save clutter we do not show the full expansion for variables with greatest mu
&= ABX_1 + AB\inparen{2}^2X_2+ BYE + BZC + 2AX_1BYE+ 2A\inparen{2}^2X_2BYE\\
&\qquad + 2AX_1BZC + 2A\inparen{2}^2X_2BZC + 2BYEZC.
\end{align*}
Note that we have argued that for our specific example the expectation that we want is $\rpoly_1^2(\probOf\inparen{A=1},$\allowbreak$\probOf\inparen{B=1}, \probOf\inparen{C=1}$,\allowbreak $\probOf\inparen{E=1},$\allowbreak $\probOf\inparen{X_1=1}, \probOf\inparen{X_2=1}, \probOf\inparen{Y=1}, \probOf\inparen{Z=1})$.
\Cref{lem:tidb-reduce-poly} generalizes the equivalence to {\em all} $\raPlus$ queries on \abbrCTIDB\xplural (proof in \Cref{subsec:proof-exp-poly-rpoly}).
We have argued that for our specific example the expectation that we want is $\rpoly_1^2(\probOf\inparen{A=1},$\allowbreak$\probOf\inparen{B=1}, \probOf\inparen{C=1}$,\allowbreak $\probOf\inparen{E=1},$\allowbreak $\probOf\inparen{X_1=1}, \probOf\inparen{X_2=1}, \probOf\inparen{Y=1}, \probOf\inparen{Z=1})$.
\Cref{lem:tidb-reduce-poly} generalizes the equivalence to {\em all} $\raPlus$ queries on \abbrCTIDB\xplural .
\begin{Lemma}\label{lem:tidb-reduce-poly}
For any \abbrCTIDB $\pdb$, $\raPlus$ query $\query$, and lineage polynomial
$\poly\inparen{\vct{X}}=\poly\pbox{\query,\tupset,\tup}\inparen{\vct{X}}$, it holds that $
\expct_{\vct{W} \sim \pdassign}\pbox{\poly\inparen{\vct{W}}} = \rpoly\inparen{\probAllTup}
$, where $\probAllTup = \inparen{\prob_{\tup}}_{\tup\in\tupset}.$
\end{Lemma}
The proof follows by~\Cref{prop:ctidb-reduct} and~\Cref{lem:bin-bidb-phi-eq-redphi}.
\noindent
The proof in \Cref{subsec:proof-exp-poly-rpoly} follows by~\Cref{prop:ctidb-reduct} and~\Cref{lem:bin-bidb-phi-eq-redphi}.
\subsection{Our Techniques}
\mypar{Lower Bound Proof Techniques}
Our main hardness result shows that computing~\Cref{prob:expect-mult} is $\sharpwonehard$ for $1$-\abbrTIDB. To prove this result we show that for the same $\query_1$ from the example above, for an arbitrary `product width' $k$, the query $\qhard^k$ is able to encode various hard graph-counting problems (assuming $\bigO{\numvar}$ tuples rather than the $\bigO{1}$ tuples in \Cref{fig:two-step}).
We do so by considering an arbitrary graph $G$ (analogous to relation $\boldsymbol{R}$ of $\query_1$) and analyzing how the coefficients in the (univariate) polynomial $\widetilde{\poly}\left(p,\dots,p\right)$ relate to counts of subgraphs in $G$ that are isomorphic to various graphs with $k$ edges. E.g., we exploit the fact that the coefficient corresponding to the power of $2k$ in $\poly$ of $\qhard^k$ is proportional to the number of $k$-matchings in $G$,
We do so by considering an arbitrary graph $G$ (analogous to relation $\boldsymbol{R}$ of $\query_1$) and analyzing how the coefficients in the (univariate) polynomial $\widetilde{\poly}\left(p,\dots,p\right)$ relate to counts of subgraphs in $G$ that are isomorphic to various subgraphs with $k$ edges. E.g., we exploit the fact that the coefficient corresponding to the power of $2k$ in $\poly$ of $\qhard^k$ is proportional to the number of $k$-matchings in $G$,
a known hard problem in parameterized/fine-grained complexity literature.
@ -182,7 +187,7 @@ Our negative results (\Cref{tab:lbs}) indicate that \abbrCTIDB{}s (even for $\bo
We adopt a two-step intensional model of query evaluation used in set-\abbrPDB\xplural, as illustrated in \Cref{fig:two-step}:
(i) \termStepOne (\abbrStepOne): Given input $\tupset$ and $\query$, output every tuple $\tup$ that possibly satisfies $\query$, annotated with its lineage polynomial ($\poly(\vct{X})=\apolyqdt\inparen{\vct{X}}$);
(ii) \termStepTwo (\abbrStepTwo): Given $\poly(\vct{X})$ for each tuple, compute $\expct_{\randWorld\sim\bpd}\pbox{\poly(\vct{\randWorld})}$.
Let $\timeOf{\abbrStepOne}(Q,\tupset,\circuit)$ denote the runtime of \abbrStepOne when it outputs $\circuit$ (which is a representation of $\poly$ as an arithmetic circuit --- more on this representation in~\Cref{sec:expression-trees}).
Let $\timeOf{\abbrStepOne}(Q,\tupset,\circuit)$ denote the runtime of \abbrStepOne when it outputs $\circuit$ (a representation of $\poly$ as an arithmetic circuit --- more on this representation in~\Cref{sec:expression-trees}).
Denote by $\timeOf{\abbrStepTwo}(\circuit, \epsilon)$ (recall $\circuit$ is the output of \abbrStepOne) the runtime of \abbrStepTwo, which we can leverage~\Cref{def:reduced-poly} and~\Cref{lem:tidb-reduce-poly} to address the next formal objective:
\begin{Problem}[\abbrCTIDB linear time approximation]\label{prob:big-o-joint-steps}
@ -191,30 +196,25 @@ is there a $(1\pm\epsilon)$-approximation of $\expct_{\rvworld\sim\bpd}$\allowbr
$\exists \circuit : \timeOf{\abbrStepOne}(Q,\tupset, \circuit) + \timeOf{\abbrStepTwo}(\circuit, \epsilon) \le$\allowbreak$ O_\epsilon(\qruntime{\optquery{\query}, \tupset, \bound})$?
\end{Problem}
We show in \Cref{sec:circuit-depth} an $\bigO{\qruntime{\optquery{\query}, \tupset, \bound}}$ algorithm for constructing the lineage polynomial for all result tuples of an $\raPlus$ query $\query$ (or more more precisely, a single circuit $\circuit$ with one sink per tuple representing the tuple's lineage).
A key insight of this paper is that the representation of $\circuit$ matters.
For example, if we insist that $\circuit$ represent the lineage polynomial in \abbrSMB, the answer to the above question in general is no, since then we will need $\abs{\circuit}\ge \Omega\inparen{\inparen{\qruntime{\optquery{\query}, \tupset, \bound}}^k}$,
and hence, just $\timeOf{\abbrStepOne}(\query,\tupset,\circuit)$ will be too large.
and hence, just $\timeOf{\abbrStepOne}(\query,\tupset,\circuit)$ is too large.
However, systems can directly emit compact, factorized representations of $\poly(\vct{X})$ (e.g., as a consequence of the standard projection push-down optimization~\cite{DBLP:books/daglib/0020812}).
For example, in~\Cref{fig:two-step}, $B(Y+Z)$ is a factorized representation of the SMB-form $BY+BZ$.
Accordingly, this work uses (arithmetic) circuits\footnote{
An arithmetic circuit is a DAG with variable and/or numeric source nodes and internal, each nodes representing either an addition or multiplication operator.
}
as the representation system of $\poly(\vct{X})$.
as the representation system of $\poly(\vct{X})$, and we show in \Cref{sec:circuit-depth} an $\bigO{\qruntime{\optquery{\query}, \tupset, \bound}}$ algorithm for constructing the lineage polynomial for all result tuples of an $\raPlus$ query $\query$ (or more more precisely, a single circuit $\circuit$ with one sink per tuple representing the tuple's lineage).
Given that there exists a representation $\circuit^*$ such that $\timeOf{\abbrStepOne}(\query,\tupset,\circuit^*)\le \bigO{\qruntime{\optquery{\query}, \tupset, \bound}}$, we can now focus on the complexity of the \abbrStepTwo step.
We can represent the factorized lineage polynomial by its correspoding arithmetic circuit $\circuit$ (whose size we denote by $|\circuit|$).
As we also show in \Cref{sec:circuit-runtime}, this size is also bounded by $\qruntime{\optquery{\query}, \tupset, \bound}$ (i.e., $|\circuit^*| \le \bigO{\qruntime{\optquery{\query}, \tupset, \bound}}$).
Given that a representation $\circuit^*$ exists where $\timeOf{\abbrStepOne}(\query,\tupset,\circuit^*)\le \bigO{\qruntime{\optquery{\query}, \tupset, \bound}}$, we can focus on the complexity of \abbrStepTwo.
As we also show in \Cref{sec:circuit-runtime}, this size is also bounded by $\qruntime{\optquery{\query}, \tupset, \bound}$ (i.e., $|\circuit^*| \le \bigO{\qruntime{\optquery{\query}, \tupset, \bound}}$), where $|\circuit|$ is the size of circuit $\circuit$.
Thus, the question of approximation
can be stated as the following stronger (since~\Cref{prob:big-o-joint-steps} has access to \emph{all} equivalent \circuit representing $\query\inparen{\vct{W}}\inparen{\tup}$), but sufficient condition:
\begin{Problem}\label{prob:intro-stmt}
Given one circuit $\circuit$ that encodes $\apolyqdt$ for all result tuples $\tup$ (one sink per $\tup$) for \abbrCTIDB $\pdb$ and $\raPlus$ query $\query$, does there exist an algorithm that computes a $(1\pm\epsilon)$-approximation of $\expct_{\rvworld\sim\bpd}\pbox{\query\inparen{\rvworld}\inparen{\tup}}$ (for all result tuples $\tup$) in $\bigO{|\circuit|}$ time?
\end{Problem}
For an upper bound on approximating the expected count, it is easy to check that if all the probabilties are constant then (with an additive adjustment) $\poly\left(\prob_1,\dots, \prob_n\right)$ (i.e. evaluating the original lineage polynomial
over the probability values) is a constant factor approximation
. This is illustrated in the following example using $\query_1^2$ from earlier. To aid in presentation we assume $\bound = 2$ for variable $X$ and $\bound = 1$ for all other variables. Let $\prob_A$ denote $\probOf\pbox{A = 1}$.
For an upper bound on approximating the expected count, it is easy to check that if all the probabilties are constant then (with an additive adjustment) $\poly\left(\prob_1,\dots, \prob_n\right)$ (i.e. evaluating the original lineage polynomial over the probability values) is a constant factor approximation.
This is illustrated in the following example using $\query_1^2$ from earlier. To aid in presentation we assume $\bound = 2$ for variable $X$ and $\bound = 1$ for all other variables. Let $\prob_A$ denote $\probOf\pbox{A = 1}$.
In computing $\rpoly$, we have some cancellations to deal with:
\begin{footnotesize}
@ -264,7 +264,7 @@ In work independent of ours, Grohe, et. al.~\cite{https://doi.org/10.48550/arxiv
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\mypar{Paper Organization} We present relevant background and notation in \Cref{sec:background}. We then prove our main hardness results in \Cref{sec:hard} and present our approximation algorithm in \Cref{sec:algo}.
\mypar{Paper Organization} We present background and notation in \Cref{sec:background}. We prove our main hardness results in \Cref{sec:hard} and present our approximation algorithm in \Cref{sec:algo}.
Finally, we discuss related work in \Cref{sec:related-work} and conclude in \Cref{sec:concl-future-work}. All proofs are in the appendix.

View File

View File

@ -3,11 +3,11 @@
\section{Hardness of Exact Computation}
\label{sec:hard}
In this section, we will prove the hardness results claimed in Table~\ref{tab:lbs} for a specific (family) of hard instance $(\qhard,\pdb)$ for \Cref{prob:bag-pdb-poly-expected} where $\pdb$ is a $1$-\abbrTIDB.
Note that this implies hardness for \abbrCTIDB\xplural $\inparen{\bound\geq1}$, showing \Cref{prob:bag-pdb-poly-expected} cannot be done in $\bigO{\qruntime{\optquery{\query},\tupset,\bound}}$ runtime. The results also apply to \abbrOneBIDB and other more general \abbrPDB\xplural.
Note that this implies hardness for \abbrCTIDB\xplural $\inparen{\bound\geq1}$; \Cref{prob:bag-pdb-poly-expected} cannot be done in $\bigO{\qruntime{\optquery{\query},\tupset,\bound}}$ runtime. The results also apply to \abbrOneBIDB and other \abbrPDB\xplural.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Preliminaries}\label{sec:hard:sub:pre}
Our hardness results are based on (exactly) counting the number of (not necessarily induced) subgraphs in $G$ isomorphic to $H$. Let $\numocc{G}{H}$ denote this quantity. We can think of $H$ as being of constant size and $G$ as growing.
In particular, we will consider the problems of computing the following counts (given $G$ in its adjacency list representation): $\numocc{G}{\tri}$ (the number of triangles), $\numocc{G}{\threedis}$ (the number of $3$-matchings), and the latter's generalization $\numocc{G}{\kmatch}$ (the number of $k$-matchings). We use $\kmatchtime$ to denote the optimal runtime of computing $\numocc{G}{\kmatch}$ exactly. Our hardness results in \Cref{sec:multiple-p} are based on the following hardness results/conjectures:
In particular, we will consider computing the following counts (given $G$ in its adjacency list representation): $\numocc{G}{\tri}$ (the number of triangles), $\numocc{G}{\threedis}$ (the number of $3$-matchings), and the latter's generalization $\numocc{G}{\kmatch}$ (the number of $k$-matchings). We use $\kmatchtime$ to denote the optimal runtime of computing $\numocc{G}{\kmatch}$ exactly. Our hardness results in \Cref{sec:multiple-p} are based on the following hardness results/conjectures:
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Theorem}[\cite{k-match}]
@ -20,7 +20,7 @@ Given positive integer $k$ and undirected graph $G=(\vset,\edgeSet)$ with no sel
%\end{hypo}
\begin{hypo}[~\cite{DBLP:journals/corr/CurticapeanM14}]\label{conj:known-algo-kmatch}
For every $G=\inparen{\vset, \edgeSet}$, we have $\kmatchtime\ge n^{\Omega\inparen{k/\log{k}}}$.
For every $G=\inparen{\vset, \edgeSet}$, $\kmatchtime\ge n^{\Omega\inparen{k/\log{k}}}$.
\end{hypo}
We note that the above conjecture is somewhat non-standard. In particular, the best known algorithm to compute $\numocc{G}{\kmatch}$ takes time $\Omega\inparen{|V|^{k/2}}$
@ -56,11 +56,11 @@ For any graph $G=(V,\edgeSet)$ and $\kElem\ge 1$, define
SELECT DISTINCT 1 FROM T $t_1$, R r, T $t_2$
WHERE $t_1$.Point = r.Point$_1$ AND $t_2$.Point = r.Point$_2$
\end{lstlisting}
as $R$. The query $\qhard^k$ then becomes
as $Q^1$. The query $\qhard^k$ then becomes
\mdfdefinestyle{underbrace}{topline=false, rightline=false, bottomline=false, leftline=false, backgroundcolor=black!15!white, innerbottommargin=0pt}
\begin{mdframed}[style=underbrace]
\begin{lstlisting}
SELECT COUNT(*) FROM $\underbrace{R\text{ JOIN }R\text{ JOIN}\cdots\text{JOIN }R}_{k\rm\ times}$
SELECT COUNT(*) FROM $\underbrace{Q^1\text{ JOIN }Q^1\text{ JOIN}\cdots\text{JOIN }Q^1}_{k\rm\ times}$
\end{lstlisting}
\end{mdframed}
\noindent Consider again the \abbrCTIDB instance $\pdb$ of~\Cref{fig:two-step} and, for our hard instance, let $\bound = 1$. $\pdb$ generalizes to one compatible to~\Cref{def:qk} as follows. Relation $T$ has $n$ tuples corresponding to each vertex for $i$ in $[n]$, each with probability $\prob$ and $R$ has tuples corresponding to the edges $\edgeSet$ (each with probability of $1$).\footnote{Technically, $\poly_{G}^\kElem(\vct{X})$ should have variables corresponding to tuples in $R$ as well, but since they always are present with probability $1$, we drop those. Our argument also works when all the tuples in $R$ also are present with probability $\prob$ but to simplify notation we assign probability $1$ to edges.}
@ -69,7 +69,7 @@ Note that this implies that $\poly_{G}^\kElem$ is indeed a $1$-\abbrTIDB lineage
Next, we note that the runtime for answering $\qhard^k$ on deterministic database $\tupset$, as defined above, is $O_k\inparen{\numedge}$ (i.e. deterministic query processing is `easy' for this query):
\begin{Lemma}\label{lem:tdet-om}
Let $\qhard^k$ and $\tupset$ be as defined above. Then
Define $\qhard^k$ and $\tupset$ as above. Then
$\qruntimenoopt{\qhard^k, \tupset, \bound}$ is $O_k\inparen{\numedge}$.
\end{Lemma}
@ -81,7 +81,7 @@ We are now ready to present our main hardness result.
\begin{Theorem}\label{thm:mult-p-hard-result}
Let $\prob_0,\ldots,\prob_{2k}$ be $2k + 1$ distinct values in $(0, 1]$. Then computing $\rpoly_G^\kElem(\prob_i,\dots,\prob_i)$ (over all $i\in [2k+1]$) for arbitrary $G=(\vset,\edgeSet)$
needs time $\bigOmega{\kmatchtime}$, assuming $\kmatchtime\ge \omega\inparen{\abs{\edgeSet}}$.
needs time $\bigOmega{\kmatchtime}$, if $\kmatchtime\ge \omega\inparen{\abs{\edgeSet}}$.
\end{Theorem}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%

View File

@ -9,9 +9,9 @@ We focus on the problem of computing $\expct_{\worldvec\sim\pdassign}\pbox{\apol
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Definition}[Circuit]\label{def:circuit}
A circuit $\circuit$ is a Directed Acyclic Graph (DAG) whose source gates (in degree of $0$) consist of elements in either $\domN$ or $\vct{X} = \inparen{X_1,\ldots,X_\numvar}$. For each result tuple there exists one sink gate. The internal gates have binary input and are either sum ($\circplus$) or product ($\circmult$) gates.
A circuit $\circuit$ is a Directed Acyclic Graph (DAG) with source gates (in degree of $0$) drawn from either $\domN$ or $\vct{X} = \inparen{X_1,\ldots,X_\numvar}$ and one sink gate for each result tuple. Internal gates have binary input and are either sum ($\circplus$) or product ($\circmult$) gates.
%
Each gate has the following members: \type, \vari{input}, \val, \vpartial, \degval, \vari{Lweight}, and \vari{Rweight}, where \type is the value type $\{\circplus, \circmult, \var, \tnum\}$ and \vari{input} the list of inputs. Source gates have an extra member \val storing the value. $\circuit_\linput$ ($\circuit_\rinput$) denotes the left (right) input of \circuit.
Each gate has the following members: \type, \vari{input}, \val, \vpartial, \degval, \vari{Lweight}, and \vari{Rweight}, where \type is the value type $\{\circplus, \circmult, \var, \tnum\}$ and \vari{input} the list of inputs. Source gates have an extra member \val for the value. $\circuit_\linput$ ($\circuit_\rinput$) denotes the left (right) input of \circuit.
\end{Definition}
When the underlying DAG is a tree (with edges pointing towards the root), the structure is an expression tree \etree. In such a case, the root of \etree is analogous to the sink of \circuit. The fields \vari{partial}, \degval, \vari{Lweight}, and \vari{Rweight} are used in the proofs of \Cref{sec:proofs-approx-alg}.

View File

@ -20,29 +20,24 @@ Let $\abs{\poly}$ be the number of operators in $\poly$.
If $\poly$ is a $1$-\abbrBIDB lineage polynomial already in \abbrSMB, then the expectation of $\poly$, i.e., $\expct\pbox{\poly} = \rpoly\left(\prob_1,\ldots, \prob_\numvar\right)$ can be computed in $\bigO{\abs{\poly}}$ time.
\end{Corollary}
\subsubsection{Possible World Semantics}\label{subsub:possible-world-sem}
In this section, we show how the traditional possible worlds semantics corresponds to our setup. Readers can safely skip this part without missing anything vital to the results of this paper.
% \subsubsection{Possible World Semantics}\label{subsub:possible-world-sem}
% In this section, we show how the traditional possible worlds semantics corresponds to our setup. Readers can safely skip this part without missing anything vital to the results of this paper.
Queries over probabilistic databases are traditionally viewed as being evaluated using the so-called possible world semantics. A general bag-\abbrPDB can be defined as the pair $\pdb = \inparen{\Omega, \bpd}$ where $\Omega$ is the set of possible worlds represented by $\pdb$ and $\bpd$ the probability distribution over $\Omega$. Under the possible world semantics, the result of a query $\query$ over an incomplete database $\Omega$ is the set of query answers produced by evaluating $\query$ over each possible world $\omega\in\Omega$: $\inset{\query\inparen{\omega}: \omega\in\Omega}$.
The result of a query is the pair $\inparen{\query\inparen{\Omega}, \bpd'}$ where $\bpd'$ is a probability distribution that assigns to each possible query result the sum of the probabilites of the worlds that produce this answer: $\probOf\pbox{\omega\in\Omega} = \sum_{\omega'\in\Omega,\\\query\inparen{\omega'}=\query\inparen{\omega}}\probOf\pbox{\omega'}$.
% Queries over probabilistic databases are traditionally viewed as being evaluated using the so-called possible world semantics. A general bag-\abbrPDB can be defined as the pair $\pdb = \inparen{\Omega, \bpd}$ where $\Omega$ is the set of possible worlds represented by $\pdb$ and $\bpd$ the probability distribution over $\Omega$. Under the possible world semantics, the result of a query $\query$ over an incomplete database $\Omega$ is the set of query answers produced by evaluating $\query$ over each possible world $\omega\in\Omega$: $\inset{\query\inparen{\omega}: \omega\in\Omega}$.
% The result of a query is the pair $\inparen{\query\inparen{\Omega}, \bpd'}$ where $\bpd'$ is a probability distribution that assigns to each possible query result the sum of the probabilites of the worlds that produce this answer: $\probOf\pbox{\omega\in\Omega} = \sum_{\omega'\in\Omega,\\\query\inparen{\omega'}=\query\inparen{\omega}}\probOf\pbox{\omega'}$.
Suppose that $\pdb'$ is a reduced \abbrOneBIDB from \abbrCTIDB $\pdb$ as defined by~\Cref{prop:ctidb-reduct}. Instead of looking only at the possible worlds of $\pdb'$, one can consider the set of all worlds, including those that cannot exist due to, e.g., disjointness. Since $\abs{\tupset} = \numvar$ the all worlds set can be modeled by $\worldvec\in \{0, 1\}^{\numvar\bound}$, such that $\worldvec_{\tup, j} \in \worldvec$ represents whether or not the multiplicity of $\tup$ is $j$ (\emph{here and later, especially in \Cref{sec:algo}, we will rename the variables as $X_1,\dots,X_{\numvar'}$, where $\numvar'=\sum_{\tup\in\tupset}\abs{\block_\tup}$}).
\footnote{
In this example, $\abs{\block_\tup} = \bound$ for all $\tup$.
}
We can denote a probability distribution over all $\worldvec \in \{0, 1\}^{\numvar\bound}$ as $\bpd'$. When $\bpd'$ is the one induced from each $\prob_{\tup, j}$ while assigning $\probOf\pbox{\worldvec} = 0$ for any $\worldvec$ with $\worldvec_{\tup, j}, \worldvec_{\tup, j'} \neq 0$ for $j\neq j'$, we end up with a bijective mapping from $\bpd$ to $\bpd'$, such that each mapping is equivalent, implying the distributions are equivalent, and thus query results.
\Cref{subsec:supp-mat-ti-bi-def} has more details. \medskip
% Suppose that $\pdb'$ is a reduced \abbrOneBIDB from \abbrCTIDB $\pdb$ as defined by~\Cref{prop:ctidb-reduct}. Instead of looking only at the possible worlds of $\pdb'$, one can consider the set of all worlds, including those that cannot exist due to, e.g., disjointness. Since $\abs{\tupset} = \numvar$ the all worlds set can be modeled by $\worldvec\in \{0, 1\}^{\numvar\bound}$, such that $\worldvec_{\tup, j} \in \worldvec$ represents whether or not the multiplicity of $\tup$ is $j$ (\emph{here and later, especially in \Cref{sec:algo}, we will rename the variables as $X_1,\dots,X_{\numvar'}$, where $\numvar'=\sum_{\tup\in\tupset}\abs{\block_\tup}$}).
% \footnote{
% In this example, $\abs{\block_\tup} = \bound$ for all $\tup$.
% }
% We can denote a probability distribution over all $\worldvec \in \{0, 1\}^{\numvar\bound}$ as $\bpd'$. When $\bpd'$ is the one induced from each $\prob_{\tup, j}$ while assigning $\probOf\pbox{\worldvec} = 0$ for any $\worldvec$ with $\worldvec_{\tup, j}, \worldvec_{\tup, j'} \neq 0$ for $j\neq j'$, we end up with a bijective mapping from $\bpd$ to $\bpd'$, such that each mapping is equivalent, implying the distributions are equivalent, and thus query results.
% \Cref{subsec:supp-mat-ti-bi-def} has more details. \medskip
We now make a meaningful connection between possible world semantics and world assignments on the lineage polynomial.
% We now make a meaningful connection between possible world semantics and world assignments on the lineage polynomial.
\begin{Proposition}[Expectation of polynomials]\label{prop:expection-of-polynom}
Given a \abbrBPDB $\pdb = (\Omega,\bpd)$, $\raPlus$ query $\query$, and lineage polynomial $\apolyqdt$ for arbitrary result tuple $\tup$,
we have (denoting $\randDB$ as the random variable over $\Omega$):
$ \expct_{\randDB \sim \bpd}[\query(\randDB)(t)] = \expct_{\vct{\randWorld}\sim \pdassign}\pbox{\apolyqdt\inparen{\vct{\randWorld}}}. $
\end{Proposition}
\noindent A formal proof of \Cref{prop:expection-of-polynom} is given in \Cref{subsec:expectation-of-polynom-proof}.\footnote{Although \Cref{prop:expection-of-polynom} follows, e.g., as an obvious consequence of~\cite{IL84a}'s Theorem 7.1, we are unaware of any formal proof for bag-probabilistic databases.}

View File

@ -11,7 +11,7 @@ Fix $\prob\in (0,1)$. Then assuming \Cref{conj:graph} is true, any algorithm tha
\end{Theorem}
Note that \Cref{prop:expection-of-polynom} and \Cref{th:single-p-hard} above imply the hardness result in the first row of \Cref{tab:lbs}.
We note that \Cref{thm:k-match-hard} and \Cref{conj:known-algo-kmatch} (and the lower bounds in the second and third rows of Table~\ref{tab:lbs}) need $k$ to be large enough (in particular, we need a family of hard queries). But the above \Cref{th:single-p-hard} (and the lower bound in first row of Table~\ref{tab:lbs}) holds for $k=3$ (and hence for a fixed query).
We note that \Cref{thm:k-match-hard} and \Cref{conj:known-algo-kmatch} (and the lower bounds in the second and third rows) need $k$ to be large enough (in particular, we need a family of hard queries). But the above \Cref{th:single-p-hard} (and the lower bound in first row of Table~\ref{tab:lbs}) holds for $k=3$ (and hence for a fixed query).
%%% Local Variables: