Fixing merge conflict.

master
Aaron Huber 2022-06-02 09:43:04 -04:00
commit 763edccf11
9 changed files with 182 additions and 124 deletions

View File

@ -4,11 +4,11 @@
\section{Background and Notation}\label{sec:background}
\subsection{Polynomial Definition and Terminology}
Given an index set $S$ over variables $X_\tup$ for $\tup\in S$, a (general) polynomial $\genpoly$ over $\inparen{X_\tup}_{\tup \in S}$ with individual degree $\hideg <\infty$
Given an index set $S$ and variables $X_\tup$ for $\tup\in S$, a (general) polynomial $\genpoly$ over $\inparen{X_\tup}_{\tup \in S}$ with individual degree $\hideg <\infty$
is formally defined as:
\begin{align}
\label{eq:sop-form}
\genpoly\inparen{\inparen{X_\tup}_{\tup\in S}}=\sum_{\vct{d}\in\{0,\ldots,\hideg\}^{S}} c_{\vct{d}}\cdot \prod_{\tup\in S}X_\tup^{d_\tup}&&\text{ where } c_{\vct{d}}\in \semN.
\genpoly\inparen{\inparen{X_\tup}_{\tup\in S}}=\sum_{\vct{d}=\inparen{d_\tup}_{\tup\in S}\in\{0,\ldots,\hideg\}^{S}} c_{\vct{d}}\cdot \prod_{\tup\in S}X_\tup^{d_\tup}&&\text{ where } c_{\vct{d}}\in \semN.
\end{align}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@ -17,7 +17,8 @@ The term $\prod_{\tup\in S} X_\tup^{d_\tup}$ in \Cref{eq:sop-form} is a {\em mon
\end{Definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Unless othewise noted, we consider all polynomials to be in \abbrSMB representation.
When it is unclear, we use $\smbOf{\genpoly}~\inparen{\smbOf{\poly}}$ to denote the \abbrSMB form of a polynomial (lineage polynomial) $\genpoly~\inparen{\poly}$.
When it is unclear, we use $\smbOf{\genpoly}$
to denote the \abbrSMB form of a polynomial $\genpoly~\inparen{\poly}$.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@ -26,7 +27,7 @@ We call a polynomial $\poly\inparen{\vct{X}}$ a \emph{\abbrCTIDB-lineage polynom
\subsection{\abbrOneBIDB}\label{subsec:one-bidb}
\label{subsec:tidbs-and-bidbs}
\noindent A block independent database \abbrBIDB $\pdb'$ models a set of worlds each of which consists of a subset of the possible tuples $\tupset'$, where $\tupset'$ is partitioned into $\numblock$ blocks $\block_i$ and all $\block_i$ are independent random events. $\pdb'$ further constrains that all $\tup\in\block_i$ for all $i\in\pbox{\numblock}$ of $\tupset'$ be disjoint events. We refer to any monomial that includes $X_\tup X_{\tup'}$ for $\tup\neq\tup'\in\block_i$ as a \emph{cancellation}. We define next a specific construction of \abbrBIDB that is useful for our work.
\noindent A block independent database \abbrBIDB $\pdb'$ models a set of worlds each of which consists of a subset of the possible tuples $\tupset'$, where $\tupset'$ is partitioned into $\numblock$ blocks $\block_i$ and all $\tup\in\block_i$ are independent random events for distinct blocks. $\pdb'$ further constrains that all $\tup\in\block_i$ for all $i\in\pbox{\numblock}$ of $\tupset'$ be disjoint events. We refer to any monomial that includes $X_\tup X_{\tup'}$ for $\tup\neq\tup'\in\block_i$ as a \emph{cancellation}. We define next a specific construction of \abbrBIDB that is useful for our work.
\begin{Definition}[\abbrOneBIDB]\label{def:one-bidb}
Define a \emph{\abbrOneBIDB} to be the pair $\pdb' = \inparen{\bigtimes_{\tup\in\tupset'}\inset{0, \bound_\tup}, \bpd'},$ where $\tupset'$ is the set of possible tuples such that each $\tup \in \tupset'$ has a multiplicity domain of $\inset{0, \bound_\tup}$, with $\bound_\tup\in\mathbb{N}$. $\tupset'$ is partitioned into $\numblock$ independent blocks $\block_i,$ for $i\in\pbox{\numblock}$, of disjoint tuples. $\bpd'$ is characterized by the vector $\inparen{\prob_\tup}_{\tup\in\tupset'}$ where for every block $\block_i$, $\sum_{\tup \in \block_i}\prob_\tup \leq 1$. Given $W\in\onebidbworlds{\tupset'}$ and for $i\in\pbox{\numblock}$, let $\prob_\tup(W) = \begin{cases}
@ -35,17 +36,18 @@ Define a \emph{\abbrOneBIDB} to be the pair $\pdb' = \inparen{\bigtimes_{\tup\in
\prob_\tup & W_\tup \ne 0 \text{ for the unique } t\in B_i.\\
\end{cases}$
\noindent$\bpd'$ is the probability distribution across all worlds such that, given $W\in\bigtimes_{\tup \in \tupset'}\inset{0,\bound_\tup}$, $\probOf\pbox{\worldvec = W} = \prod_{i\in\pbox{\numblock}}\prob_{i}(W)$.
\noindent$\bpd'$ is the probability distribution across all worlds such that, given $W\in\bigtimes_{\tup \in \tupset'}\inset{0,\bound_\tup}$, $\probOf\pbox{\worldvec = W} = \prod_{\tup\in\tupset'}\prob_{\tup}(W)$.
\footnote{
We slightly abuse notation here, denoting a world vector as $W$ rather than $\worldvec$ to distinguish between the random variable and the world instance. When there is no ambiguity, we will denote a world vector as $\worldvec$.}
\end{Definition}
Lineage polynomials for arbitrary deterministic $\gentupset'$ are constructed in a manner analogous to $1$-\abbrTIDB\xplural (see \Cref{fig:nxDBSemantics}), differing only in the base case.
In a $1$-\abbrTIDB, each tuple contributes a multiplicity of 0 or 1, and $\polyqdt{\rel}{\gentupset}{\tup} = X_\tup$.
In a $c$-\abbrTIDB, each expanded tuple $t_j$ contributes its corresponding multiplicity: $\polyqdt{\rel}{\gentupset}{\tup_j} = j\cdot X_\tup$. These semantics are fully detailed in \Cref{fig:lin-poly-bidb}.
In a $c$-\abbrTIDB, each tuple $\tup\in\tupset'$ contributes its corresponding multiplicity: $\polyqdt{\rel}{\gentupset}{\tup} = c_\tup\cdot X_\tup$. These semantics are fully detailed in \Cref{fig:lin-poly-bidb}.
\abbrOneBIDB are powerful enough to encode \abbrCTIDB:
\begin{Proposition}[\abbrCTIDB reduction]\label{prop:ctidb-reduct}
Given \abbrCTIDB $\pdb =$\newline $\inparen{\worlds, \bpd}$, let $\pdb' = \inparen{\onebidbworlds{\tupset'}, \bpd'}$ be the \emph{\abbrOneBIDB} obtained in the following manner: for each $\tup\in\tupset$, create block $\block_\tup = \inset{\intuple{\tup, j}_{j\in\pbox{\bound}}}$ of disjoint tuples, for all $j\in\pbox{\bound}$ where $\bound_{\tup_j} = j$ for each $\tup_j$ in $\tupset'$.
Given \abbrCTIDB $\pdb =$\newline $\inparen{\worlds, \bpd}$, let $\pdb' = \inparen{\onebidbworlds{\tupset'}, \bpd'}$ be the \emph{\abbrOneBIDB} obtained in the following manner: for each $\tup\in\tupset$, create block $\block_\tup = \inset{\intuple{\tup, j}_{j\in\pbox{\bound}}}$ of disjoint tuples, for all $j\in\pbox{\bound}$ where $\bound_{\intuple{\tup, j}} = j$ for each $\intuple{\tup, j}$ in $\tupset'$.
The probability distribution $\bpd'$ is the characterized by the vector $\vct{p} = \inparen{\inparen{\prob_{\tup, j}}_{\tup\in\tupset, j\in\pbox{\bound}}}$.
Then, $\mathcal{P}$ and $\mathcal{P}'$ are equivalent.
\end{Proposition}
@ -55,7 +57,9 @@ We now define the reduced polynomial $\rpoly'$ of a \abbrOneBIDB.
\begin{Definition}[$\rpoly'$]\label{def:reduced-poly-one-bidb}
Given a polynomial $\poly'\inparen{\vct{X}}$ generated from a \abbrOneBIDB, let $\rpoly'\inparen{\vct{X}}$ denote the reduced $\poly'\inparen{\vct{X}}$ derived as follows: i) compute $\smbOf{\poly'\inparen{\vct{X}}}$ eliminating monomials with cross terms $X_{\tup}X_{\tup'}$ for $\tup\neq \tup' \in \block_i$ and ii) reduce all \emph{variable} exponents $e > 1$ to $1$.
\end{Definition}
Then given $\worldvec\in\inset{0,1}^{\tupset'}$ over the reduced \abbrOneBIDB of~\Cref{prop:ctidb-reduct}, the disjoint requirement and the semantics for constructing the lineage polynomial over a \abbrOneBIDB, $\poly'\inparen{\worldvec}$ is of the same structure as the reformulated polynomial $\refpoly{}\inparen{\worldvec}$ of step i) from~\Cref{def:reduced-poly}, which then implies that $\rpoly'$ is the reduced polynomial that results from step ii) of both~\Cref{def:reduced-poly} and~\Cref{def:reduced-poly-one-bidb}, and further that~\Cref{lem:tidb-reduce-poly} immediately follows for \abbrOneBIDB polynomials.
%Then given $\worldvec\in\inset{0,1}^{\tupset'}$ over the reduced \abbrOneBIDB of~\Cref{prop:ctidb-reduct}, the disjoint requirement and the semantics for constructing the lineage polynomial over a \abbrOneBIDB, $\poly'\inparen{\worldvec}$ is of the same structure as the reformulated polynomial $\refpoly{}\inparen{\worldvec}$ of step i) from~\Cref{def:reduced-poly}, which then implies that $\rpoly'$ is the reduced polynomial that results from step ii) of both~\Cref{def:reduced-poly} and~\Cref{def:reduced-poly-one-bidb}, and further that~\Cref{lem:tidb-reduce-poly} immediately follows for \abbrOneBIDB polynomials.
We have the following generalization of \Cref{lem:tidb-reduce-poly}:
\begin{Lemma}\label{lem:bin-bidb-phi-eq-redphi}
Given any \emph{\abbrOneBIDB} $\pdb'$, $\raPlus$ query $\query$, and lineage polynomial
$\poly'\inparen{\vct{X}}=\poly'\pbox{\query,\tupset',\tup}\inparen{\vct{X}}$, it holds that \newline$

View File

@ -30,10 +30,10 @@ To decouple our results from specific join algorithms, we first lower bound the
\begin{Definition}[Join Cost]
\label{def:join-cost}
Denote by $\jointime{R_1, \ldots, R_m}$ the runtime of an algorithm for computing the $m$-ary join $R_1 \bowtie \ldots \bowtie R_m$.
We require only that the algorithm must enumerate its output, i.e., that $\jointime{R_1, \ldots, R_m} \geq |R_1 \bowtie \ldots \bowtie R_m|$. With this definition of $\jointime{\cdot}$, worst-case optimal join algorithms are handled.
We require only that the algorithm must enumerate its output, i.e., that $\jointime{R_1, \ldots, R_m} \geq |R_1 \bowtie \ldots \bowtie R_m|$. %With this definition of $\jointime{\cdot}$, worst-case optimal join algorithms are handled.
\end{Definition}
Worst-case optimal join algorithms~\cite{skew,ngo-survey} and query evaluation via factorized databases~\cite{factorized-db} (as well as work on FAQs~\cite{DBLP:conf/pods/KhamisNR16}) can be modeled as $\raPlus$ queries (though the query size is data dependent).
Worst-case optimal join algorithms~\cite{skew,ngo-survey} and query evaluation via factorized databases~\cite{factorized-db} (as well as work on FAQs~\cite{DBLP:conf/pods/KhamisNR16} and AJAR~\cite{ajar}) can be modeled as $\raPlus$ queries (though the query size is data dependent).
For these algorithms, $\jointime{R_1, \ldots, R_n}$ is linear in the {\em AGM bound}~\cite{AGM}.
% = |R_1| + \ldots + |R_n| + |R_1(\db) \bowtie \ldots \bowtie R_n(\db)|$.
Our cost model for general query evaluation follows:
@ -64,12 +64,12 @@ For these algorithms, $\jointime{R_1, \ldots, R_n}$ is linear in the {\em AGM bo
%}\\
\noindent
Under this model, an $\raPlus$ query $\query$ evaluated over database $\gentupset$ has runtime $O(\qruntimenoopt{Q,\gentupset, \bound})$.
Under this model, an $\raPlus$ query $\query$ evaluated over database $\gentupset$ has runtime $O\inparen{\qruntimenoopt{Q,\gentupset, \bound}}$.
We assume that full table scans are used for every base relation access. We can model index scans by treating an index scan query $\sigma_\theta(R)$ as a base relation.
%Observe that
% () .\footnote{This claim can be verified by e.g. simply looking at the {\em Generic-Join} algorithm in~\cite{skew} and {\em factorize} algorithm in~\cite{factorized-db}.} It can be verified that the above cost model on the corresponding $\raPlus$ join queries correctly captures the runtime of current best known .
\Cref{lem:circ-model-runtime} and \Cref{lem:tlc-is-the-same-as-det} show that for any $\raPlus$ query $\query$ and $\tupset$, there exists a circuit $\circuit^*$ such that $\timeOf{\abbrStepOne}(Q,\tupset,\circuit^*)$ and $|\circuit^*|$ are both $O(\qruntimenoopt{\optquery{\query}, \tupset,\bound})$, as we assumed when moving from \Cref{prob:big-o-joint-steps} to \Cref{prob:intro-stmt}. Lastly, we can handle FAQs and factorized databases by allowing for optimization, i.e. $\qruntimenoopt{\optquery{\query}, \gentupset, \bound}$.
\Cref{lem:circ-model-runtime} and \Cref{lem:tlc-is-the-same-as-det} show that for any $\raPlus$ query $\query$ and $\tupset$, there exists a circuit $\circuit^*$ such that $\timeOf{\abbrStepOne}(Q,\tupset,\circuit^*)$ and $|\circuit^*|$ are both $O(\qruntimenoopt{\optquery{\query}, \tupset,\bound})$, as we assumed when moving from \Cref{prob:big-o-joint-steps} to \Cref{prob:intro-stmt}. Lastly, we can handle FAQs and factorized databases by allowing for optimization. %, i.e. $\qruntimenoopt{\optquery{\query}, \gentupset, \bound}$.
%
%We now make a simple observation on the above cost model:
%\begin{proposition}

View File

@ -2,9 +2,9 @@
%root: main.tex
\section{Introduction}\label{sec:intro}
This work explores the problem of computing the expectation of the multiplicity of a tuple in the result of a query over a \abbrCTIDB (tuple independent database), a type of probabilistic database with bag semantics where the multiplicity of a tuple is a random variable with range $[0,\bound]$ for some fixed constant $\bound$ and multiplicities assigned to any two tuples are independent of each other.
This work explores the problem of computing the expectation of the multiplicity of a tuple in the result of a query over a \abbrCTIDB (tuple independent database), a type of probabilistic database with bag semantics where the multiplicity of a tuple is a random variable with range $[0,\bound]\stackrel{\text{def}}{=}\{0,1,\dots,\bound\}$ for some fixed constant $\bound$ and multiplicities assigned to any two tuples are independent of each other.
Formally, a \abbrCTIDB,
$\pdb = \inparen{\worlds, \bpd}$ is a set of tuples $\tupset$ and a probability distribution $\bpd$ over all possible worlds generated by assigning each tuple $\tup \in \tupset$ a multiplicity in the range $[0,\bound]$.
$\pdb = \inparen{\worlds, \bpd}$ is defined over a set of tuples $\tupset$ and a probability distribution $\bpd$ over all possible worlds generated by assigning each tuple $\tup \in \tupset$ a multiplicity in the range $[0,\bound]$.
Any such world can be encoded as a vector (of length $\numvar=\abs{\tupset}$) from $\worlds$, such that the multiplicity of each $\tup \in \tupset$ is stored at a distinct index.
A given world $\worldvec \in\worlds$ can be interpreted as follows: for each $\tup \in \tupset$, $\worldvec_{\tup}$ is the multiplicity of $\tup$ in $\worldvec$.
We note that encoding a possible world as a vector, while non-standard, is equivalent to encoding it as a set of tuples (\Cref{prop:expection-of-polynom} in \Cref{subsec:expectation-of-polynom-proof}).
@ -13,9 +13,9 @@ Given that tuple multiplicities are independent events, the probability distrib
% Allowing for $\leq \bound$ multiplicities across all tuples gives rise to having $\leq \inparen{\bound+1}^\numvar$ possible worlds instead of the usual $2^\numvar$ possible worlds of a $1$-\abbrTIDB, which (assuming set query semantics), is the same as the traditional set \abbrTIDB.
% In this work, since we are generally considering bag query input, we will only be considering bag query semantics.
Note that in this work, we consider queries with bag semantics over such bag probabilistic databases.
We denote by $\query\inparen{\worldvec}\inparen{\tup}$ the multiplicity of $\tup$ in query $\query$ over possible world $\worldvec\in\worlds$.
We denote by $\query\inparen{\worldvec}\inparen{\tup}$ the multiplicity of a result tuple $\tup$ in query $\query$ over possible world $\worldvec\in\worlds$.
%
We can formally state our problem of computing the expected multiplicity of a result tuple as:
We can formally state our problem of computing the expected multiplicity: % of a result tuple as:
\begin{Problem}\label{prob:expect-mult}
Given \abbrCTIDB $\pdb = \inparen{\worlds, \bpd}$, $\raPlus$ query\footnote{
@ -65,20 +65,23 @@ As also observed in \cite{https://doi.org/10.48550/arxiv.2201.11524}, computing
\mypar{Hardness of Set Query Semantics and Bag Query Semantics}
Set query evaluation semantics over $1$-\abbrTIDB\xplural have been studied extensively, and its data complexity has, in general been shown % by Dalvi and Suicu
to be \sharpphard\cite{10.1145/1265530.1265571}.
Grohe et. al.~\cite{https://doi.org/10.48550/arxiv.2201.11524} studied bag-\abbrTIDB\xplural allowing for unbounded multiplicities which requires them to explicitly address the issue of a succinct representation of probability distributions over infinitely many multiplicities.
This work demonstrated the existence of a dichotomy for
In an independent work Grohe et. al.~\cite{https://doi.org/10.48550/arxiv.2201.11524} studied bag-\abbrTIDB\xplural allowing for unbounded multiplicities, which requires them to explicitly address the issue of a succinct representation of probability distributions over infinitely many multiplicities.
Their work demonstrated the existence of a dichotomy for
the problem of computing the probability that an output tuple has a multiplicity of at most $s$.
% investigates the query evaluation problem over bag-\abbrTIDB\xplural when computing the probability of an output tuple having at most a multiplicity of $k$, showing that a dichotomy exists for this problem.
% While the authors observe that computing the expectation of an output tuple multiplicity is in polynomial time, no further (fine-grained) analysis of the expected value is considered.
% Our work in contrast assumes a finite bound on the multiplicities where we simply list the finitely many probability values (and hence do not need consider a more succinct representation). Further, our work primarily looks into the fine-grained analysis of computing the expected multiplicity of an output tuple.
In contrast to this work, we consider \abbrCTIDB\xplural, i.e., the multiplicity of input tuples is bound by a constant $\bound$.
In contrast to~\cite{https://doi.org/10.48550/arxiv.2201.11524}, we consider \abbrCTIDB\xplural, i.e., the multiplicity of input tuples is bound by a constant $\bound$.
For this setting, % (\abbrCTIDB\xplural, i.e., the multiplicity of input tuples is bound by a constant $\bound$), however,
there exists a trivial \ptime algorithm for computing the expectation of a result tuple's multiplicity~(\Cref{prob:expect-mult}) for any $\raPlus$ query due to linearity of expectation (see~\Cref{sec:intro-poly-equiv}).
Since we can solve~\Cref{prob:expect-mult} in \ptime, the interesting question that we explore is the hardness of computing expectation using fine-grained analysis and parameterized complexity, where we are interested in the exponent of polynomial runtime.\footnote{While the authors of \cite{https://doi.org/10.48550/arxiv.2201.11524} also observe that computing the expectation of an output tuple multiplicity is in \ptime, they do not investigate the fine-grained complexity of this problem.}
Since we can solve~\Cref{prob:expect-mult} in \ptime, the %interesting
question that we explore is %the hardness of
computing expectation using fine-grained analysis and parameterized complexity, where we are interested in the exponent of polynomial runtime.\footnote{While %the authors of
\cite{https://doi.org/10.48550/arxiv.2201.11524} also observe that computing the expectation of an output tuple multiplicity is in \ptime, they do not investigate the fine-grained complexity of this problem.}
Specifically, in this work we ask if~\Cref{prob:expect-mult} can be solved in time linear in the runtime of an analogous deterministic query, which we make more precise shortly.
If this is true, then this would open up the way for deployment of \abbrCTIDB\xplural in practice. To analyze this question we denote by $\timeOf{}^*(\query,\pdb, \bound)$ the optimal runtime complexity of computing~\Cref{prob:expect-mult} over \abbrCTIDB $\pdb$.
Specifically, in this work we ask if~\Cref{prob:expect-mult} can be solved in time linear in the runtime of an analogous deterministic query, which we make more precise shortly.
If this is true, then this would open up the way for deployment of \abbrCTIDB\xplural in practice. We expand on the potential practical implications of this problem later in the section but for now we stress that in practice, $\bound$ is indeed constant and most often $\bound=1$. To analyze this question we denote by $\timeOf{}^*(\query,\pdb, \bound)$ the optimal runtime complexity of computing~\Cref{prob:expect-mult} over \abbrCTIDB $\pdb$ and query $\query$.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@ -91,11 +94,11 @@ Specifically, in this work we ask if~\Cref{prob:expect-mult} can be solved in ti
\hline
$\Omega\inparen{\inparen{\qruntime{\optquery{\qhard}, \tupset, \bound}}^{1+\eps_0}}$ for {\em some} $\eps_0>0$ & Single & Triangle Detection hypothesis\\
$\omega\inparen{\inparen{\qruntime{\optquery{\qhard}, \tupset, \bound}}^{C_0}}$ for {\em all} $C_0>0$ & Multiple &$\sharpwzero\ne\sharpwone$\\
$\Omega\inparen{\inparen{\qruntime{\optquery{\qhard}, \tupset, \bound}}^{c_0\cdot k}}$ for {\em some} $c_0>0$ & Multiple & \Cref{conj:known-algo-kmatch}\\
$\Omega\inparen{\inparen{\qruntime{\optquery{\qhard}, \tupset, \bound}}^{c_0\cdot k}}$ for {\em some} $c_0>0$ & Multiple & Exponential Time Hypothesis\\%\Cref{conj:known-algo-kmatch}\\
\hline
\end{tabular}
\savecaptionspace{
\caption{Our lower bounds for $\qhard$ parameterized by $k$ $\inparen{\Cref{sec:hard:sub:pre}}$ over \abbrCTIDB $\pdb$. % = \inset{\worlds, \bpd}$.
\caption{Our lower bounds for $\qhard$ parameterized by $k$ $\inparen{\Cref{sec:hard:sub:pre}}$ over $1$-TIDB $\pdb$. % = \inset{\worlds, \bpd}$.
Those with `Multiple' in the second column need the algorithm to be able to handle multiple $\bpd$. See~\Cref{sec:hard} for further details.}%, i.e. probability distributions (for a given $\tupset$). The last column states the hardness assumptions that imply the lower bounds in the first column ($\eps_o,C_0,c_0$ are constants that are independent of $k$).}
\label{tab:lbs}
}{0cm}{-0.73cm}
@ -106,14 +109,22 @@ Those with `Multiple' in the second column need the algorithm to be able to hand
\mypar{Our lower bound results}
%
Let $\qruntime{\query,\gentupset,\bound}$ (see~\Cref{sec:gen} for further details) denote the runtime for query $\query$ over a deterministic database $\gentupset$ where the maximum multiplicity of any tuple is less than or equal to $\bound$. % This paper considers $\raPlus$ queries, for which order of operations is \emph{explicit}, as opposed to other query languages, e.g. Datalog, UCQ. Thus, since order of operations affects runtime, we denote the optimized $\raPlus$ query picked by an arbitrary production system as $\optquery{\query} \approx \min_{\query'\in\raPlus, \query'\equiv\query}\qruntime{\query', \gentupset, \bound}$. Then $\qruntime{\optquery{\query}, \gentupset,\bound}$ is the runtime for the optimized query.\footnote{The upper bounds on runtime that we derive apply pointwise to any $\query \in\raPlus$, allowing us to abstract away the specific heuristics for choosing an optimized query (i.e., Any deterministic query optimization heuristic is equally useful for \abbrCTIDB queries).}\BG{Rewrite: since an optimized Q is also a Q this also applies in the case where there is a query optimizer the rewrites Q}
Our question is whether or not it is always true that $\timeOf{}^*\inparen{\query, \pdb, \bound}\leq \bigO{\qruntime{\optquery{\query}, \tupset, \bound}}$. Unfortunately this is not the case.
~\Cref{tab:lbs} shows our results.
Our question is whether or not it is always true that for every $\query$ and $\timeOf{}^*\inparen{\query, \pdb, \bound}\leq \bigO{\qruntime{\optquery{\query}, \tupset, \bound}}$. We remark that the issue of query optimization is orthogonal to this question (recall that an $\raPlus$ query also encodes the `query plan') since we want to answer the above question for all $\query$. \emph{Specifically, if there is an equivalent query $\query'$ that is more efficient to evaluate, we allow both the deterministic and probabilistic query processing access to $\query'$}.
Specifically, depending on what hardness result/conjecture we assume, we get various weaker or stronger versions of {\em no} as an answer to our question. To make some sense of the other lower bounds in \Cref{tab:lbs}, we note that it is not too hard to show that $\timeOf{}^*(\query,\pdb, \bound) \le \bigO{\inparen{\qruntime{\optquery{\query}, \tupset, \bound}}^k}$, where $k$ is the join width of $\query$ (our notion of join width follows from~\Cref{def:degree-of-poly} and~\Cref{fig:nxDBSemantics}.) of the query $\query$ over all result tuples $\tup$ (and the parameter that defines our family of hard queries).
What our lower bound in the third row says, is that one cannot get more than a polynomial improvement over essentially the trivial algorithm for~\Cref{prob:expect-mult}.
However, this result assumes a hardness conjecture that is not as well studied as those in the first two rows of the table (see \Cref{sec:hard} for more discussion on the hardness assumptions). Further, we note that existing results\footnote{This claim follows from known results for the problem of evaluating a query $\query$ that counts the number of $k$-cliques over database $\tupset$. Specifically, a lower bound of the form $\Omega\inparen{n^{1+\eps_0}}$ for {\em some} $\eps_0>0$ follows from the triangle detection hypothesis (this like our result is for $k=3$). Second, a lower bound of $\omega\inparen{n^{C_0}}$ for {\em all} $C_0>0$ under the assumption $\sharpwzero\ne\sharpwone$~\cite{10.5555/645413.652181}. Finally, a lower bound of $\Omega\inparen{n^{c_0\cdot k}}$ for {\em some} $c_0>0$ was shown by~\cite{CHEN20061346} (under the strong exponential time hypothesis).
} already imply the claimed lower bounds if we replace the $\qruntime{\optquery{\query}, \tupset, \bound}$ by just $\numvar = |\tupset|$ (indeed these results follow from known lower bounds for deterministic query processing). Our contribution is to identify a family of hard queries where deterministic query processing is `easy' but computing the expected multiplicities is hard.
Unfortunately the the answer to the above question is no--
\Cref{tab:lbs} shows our results.
Specifically, depending on what hardness result/conjecture we assume, we get various weaker or stronger versions of {\em no} as an answer to our question. To make some sense of the other lower bounds in \Cref{tab:lbs}, we note that it is not too hard to show that $\timeOf{}^*(\query,\pdb, \bound) \le \bigO{\inparen{\qruntime{\optquery{\query}, \tupset, \bound}}^k}$, where $k$ is the join width of $\query$ (our notion of join width
%follows from~\Cref{def:degree-of-poly}
is essentially the degree of the corresponding polynomial defined in~\Cref{fig:nxDBSemantics}.) of the query $\query$ over all result tuples $\tup$ (and the parameter that defines our family of hard queries).
%
What our lower bound in the third row says, is that one cannot get more than a polynomial improvement (for fixed $k$) over essentially the trivial algorithm for~\Cref{prob:expect-mult}, assuming the Exponential Time Hypothesis (ETH)~\cite{eth}.
%However, this result assumes a hardness conjecture that is not as well studied as those in the first two rows of the table (see \Cref{sec:hard} for more discussion on the hardness assumptions).
Further, we note that existing results\footnote{This claim follows when we set $\query$ to the query that counts the number of $k$-cliques over database $\tupset$. Precisely the same bounds as in the three rows of~ \Cref{tab:lbs} (with $n$ replacing $\qruntime{\optquery{\query}, \tupset, \bound}$) follow from the same complexity assumptions we make: triangle detection hypothesis (by definition), $\sharpwzero\ne\sharpwone$~\cite{10.5555/645413.652181} and Strong ETH~\cite{CHEN20061346}. For the last result we can replace $k/\log{k}$ by just $k$.
%This claim follows from known results for the problem of evaluating a query $\query$ that counts the number of $k$-cliques over database $\tupset$. Specifically, a lower bound of the form $\Omega\inparen{n^{1+\eps_0}}$ for {\em some} $\eps_0>0$ follows from the triangle detection hypothesis (this like our result is for $k=3$). Second, a lower bound of $\omega\inparen{n^{C_0}}$ for {\em all} $C_0>0$ under the assumption $\sharpwzero\ne\sharpwone$~\cite{10.5555/645413.652181}. Finally, a lower bound of $\Omega\inparen{n^{c_0\cdot k}}$ for {\em some} $c_0>0$ was shown by~\cite{CHEN20061346} (under the strong exponential time hypothesis).
}
already imply the claimed lower bounds if we replace the $\qruntime{\optquery{\query}, \tupset, \bound}$ by just $\numvar = |\tupset|$.
%(indeed these results follow from known lower bounds for deterministic query processing).
Our contribution is to identify a family of hard queries where deterministic query processing is `easy' but computing the expected multiplicities is hard.
\mypar{Our upper bound results} We introduce a $(1\pm \epsilon)$-approximation algorithm that computes ~\Cref{prob:expect-mult} in time $O_\epsilon\inparen{\qruntime{\optquery{\query}, \tupset, \bound}}$. This means, when we are okay with approximation, that we solve~\Cref{prob:expect-mult} in time linear in the size of the deterministic query\BG{What is the size of the deterministic query?}. % and bag \abbrPDB\xplural are deployable in practice.
In contrast, known approximation techniques (\cite{DBLP:conf/icde/OlteanuHK10,DBLP:journals/jal/KarpLM89}) in set-\abbrPDB\xplural need time $\Omega(\qruntime{\optquery{\query}, \tupset, \bound}^{2k})$
@ -124,18 +135,19 @@ Further, our approximation algorithm works for a more general notion of bag \abb
\subsection{Polynomial Equivalence}\label{sec:intro-poly-equiv}
A common encoding of probabilistic databases (e.g., in \cite{IL84a,4497507,DBLP:conf/vldb/AgrawalBSHNSW06} and many others) annotates tuples with lineages, propositional formulas that describe the set of possible worlds that the tuple appears in. The bag semantics analog is a provenance/lineage polynomial (see~\Cref{fig:nxDBSemantics}) $\apolyqdt$~\cite{DBLP:conf/pods/GreenKT07}, a polynomial with non-zero integer coefficients and exponents, over variables $\vct{X}$ encoding input tuple multiplicities. The lineage polynomial for result tuple $t_{out}$ evaluates to $t_{out}$'s multiplicity in a given possible world when each $X_{t_{in}}$ is replaced by the multiplicity of $t_{in}$ in the possible world.
We drop $\query$, $\tupset$, and $\tup$ from $\apolyqdt$ when they are clear from the context or irrelevant to the discussion. We now specify the problem of computing the expectation of tuple multiplicity in the language of lineage polynomials:
We drop $\query$, $\tupset$, and $\tup$ from $\apolyqdt$ when they are clear from the context or irrelevant to the discussion. We now specify the problem of computing the expectation of tuple multiplicity in the language of lineage polynomials (which is equivalent to \Cref{prob:bag-pdb-poly-expected}-- see \Cref{prop:expection-of-polynom}):
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Problem}[Expected Multiplicity of Lineage Polynomials]\label{prob:bag-pdb-poly-expected}
Given an $\raPlus$ query $\query$, \abbrCTIDB $\pdb$ and result tuple $\tup$, compute the expected
multiplicity of the polynomial $\apolyqdt$ (i.e., for $\worldvec\in\worlds$, compute $\expct_{\vct{W}\sim \pdassign}\pbox{\apolyqdt\inparen{\worldvec}}$).
multiplicity of the polynomial $\apolyqdt$ (i.e.,
%for $\worldvec\in\worlds$,
compute $\expct_{\vct{W}\sim \pdassign}\pbox{\apolyqdt\inparen{\worldvec}}$).
\end{Problem}
We note that computing \Cref{prob:expect-mult}
is equivalent (yields the same result as) to computing \Cref{prob:bag-pdb-poly-expected} (see \Cref{prop:expection-of-polynom}).
%We note that computing \Cref{prob:expect-mult} is equivalent (yields the same result as) to computing \Cref{prob:bag-pdb-poly-expected} (see \Cref{prop:expection-of-polynom}).
All of our results rely on working with a {\em reduced} form $\inparen{\rpoly}$ of the lineage polynomial $\poly$. In fact, it turns out that for the $1$-\abbrTIDB case, computing the expected multiplicity (over bag query semantics) is {\em exactly} the same as evaluating this reduced polynomial over the probabilities that define the $1$-\abbrTIDB. This is also true when the query input(s) is a block independent disjoint probabilistic database~\cite{DBLP:conf/icde/OlteanuHK10} (bag query semantics with tuple multiplicity at most $1$), for which the proof of~\Cref{lem:tidb-reduce-poly} (introduced shortly) holds .
All of our results rely on working with a {\em reduced} form $\inparen{\rpoly}$ of the lineage polynomial $\poly$. In fact, it turns out that for the $1$-\abbrTIDB case, computing the expected multiplicity (over bag query semantics) is {\em exactly} the same as evaluating this reduced polynomial over the probabilities that define the $1$-\abbrTIDB. This is also true when the query input(s) is a block independent disjoint probabilistic database~\cite{DBLP:conf/icde/OlteanuHK10} (bag query semantics with tuple multiplicity at most $1$). %, for which the proof of~\Cref{lem:tidb-reduce-poly} (introduced shortly) holds .
Next, we motivate this reduced polynomial.
Next, we motivate this reduced polynomial $\rpoly$.
Consider the query $\query_1$ defined as follows over the bag relations of \Cref{fig:two-step}:
\begin{lstlisting}
@ -148,19 +160,25 @@ The lineage polynomial for $\query_1^2$ is given by $\poly_1^2\inparen{A, B, C,
$$
=A^2X^2B^2 + B^2Y^2E^2 + B^2Z^2C^2 + 2AXB^2YE + 2AXB^2ZC + 2B^2YEZC.
$$
To compute $\expct\pbox{\poly_1^2}$ we can use linearity of expectation and push the expectation through each summand. To keep things simple, let us focus on the monomial $\poly_1^{\inparen{ABX}^2} = A^2X^2B^2$ as the procedure is the same for all other monomials of $\poly_1^2$. Let $\randWorld_X$ be the random variable corresponding to a lineage variable $X$. Because the distinct variables in the product are independent, we can push expectation through them yielding $\expct\pbox{\randWorld_A^2\randWorld_X^2\randWorld_B^2}=\expct\pbox{\randWorld_A^2}\expct\pbox{\randWorld_X^2}\expct\pbox{\randWorld_B^2}$. Since $\randWorld_A, \randWorld_B\in \inset{0, 1}$ we can further derive $\expct\pbox{\randWorld_A}\expct\pbox{\randWorld_X^2}\expct\pbox{\randWorld_B}$ by the fact that for any $W\in \inset{0, 1}$, $W^2 = W$. Observe that if $X\in\inset{0, 1}$, then we further would have $\expct\pbox{\randWorld_A}\expct\pbox{\randWorld_X}\expct\pbox{\randWorld_B} = \prob_A\cdot\prob_X\cdot\prob_B$ (denoting $\probOf\pbox{\randWorld_A = 1} = \prob_A$) $= \rpoly_1^{\inparen{ABX}^2}\inparen{\prob_A, \prob_X, \prob_B}$ (see $ii)$ of~\Cref{def:reduced-poly}). However, in this example, we get stuck with $\expct\pbox{\randWorld_X^2}$, since $\randWorld_X\in\inset{0, 1, 2}$ and for $\randWorld_X \gets 2$, $\randWorld_X^2 \neq \randWorld_X$.
To compute $\expct\pbox{\poly_1^2}$ we can use linearity of expectation and push the expectation through each summand. To keep things simple, let us focus on the monomial $\poly_1^{\inparen{ABX}^2} = A^2X^2B^2$ as the procedure is the same for all other monomials of $\poly_1^2$. Let $\randWorld_X$ be the random variable corresponding to a lineage variable $X$. Because the distinct variables in the product are independent, we can push expectation through them yielding $\expct\pbox{\randWorld_A^2\randWorld_X^2\randWorld_B^2}=\expct\pbox{\randWorld_A^2}\expct\pbox{\randWorld_X^2}\expct\pbox{\randWorld_B^2}$. Since $\randWorld_A, \randWorld_B\in \inset{0, 1}$ we can further simplify to $\expct\pbox{\randWorld_A}\expct\pbox{\randWorld_X^2}\expct\pbox{\randWorld_B}$ by the fact that for any $W\in \inset{0, 1}$, $W^2 = W$. Observe that if $W_X\in\inset{0, 1}$, then we further would have $\expct\pbox{\randWorld_A}\expct\pbox{\randWorld_X}\expct\pbox{\randWorld_B} = \prob_A\cdot\prob_X\cdot\prob_B$ (denoting $\probOf\pbox{\randWorld_A = 1} = \prob_A$) $= \rpoly_1^{\inparen{ABX}^2}\inparen{\prob_A, \prob_X, \prob_B}$ (see $ii)$ of~\Cref{def:reduced-poly}). However, in this example, we get stuck with $\expct\pbox{\randWorld_X^2}$, since $\randWorld_X\in\inset{0, 1, 2}$ and for $\randWorld_X \gets 2$, $\randWorld_X^2 \neq \randWorld_X$.
Denote the variables of $\poly$ to be $\vars{\poly}.$ In the \abbrCTIDB setting, $\poly\inparen{\vct{X}}$ has an equivalent reformulation $\inparen{\refpoly{}\inparen{\vct{X_R}}}$ that is of use to us, where $\abs{\vct{X_R}} = \bound\cdot\abs{\vct{X}}$ . Given $X_\tup \in\vars{\poly}$ and integer valuation $X_\tup \in\inset{0,\ldots, c}$. We can replace $X_\tup$ by $\sum_{j\in\pbox{\bound}}jX_{\tup, j}$ where the variables $\inparen{X_{\tup, j}}_{j\in\pbox{\bound}}$ are disjoint with integer assignments $X_{\tup, j}\in\inset{0, 1}$. Then for any $\worldvec\in\worlds$ and corresponding reformulated world $\worldvec_{\vct{R}}\in\inset{0, 1}^{\tupset\bound}$, we set $\worldvec_{\vct{R}_{\tup, j}} = 1$ for $\worldvec_\tup = j$, while $\worldvec_{\vct{R}_{\tup, j'}} = 0$ for all $j'\neq j\in\pbox{\bound}$. By construction then $\poly\inparen{\vct{X}}\equiv\refpoly{}\inparen{\vct{X_R}}$ $\inparen{\vct{X_R} = \vars{\refpoly{}}}$ since for any integer valuation $X_\tup\in\pbox{\bound}$, $X_{\tup, j}\in\inset{0, 1}$ we have the equality $X_\tup = j = \sum_{j\in\pbox{\bound}}jX_{t, j}$.
The simple insight to get around this issue to note that the random variables $\randWorld_X$ and $\randWorld_{X_1}+2\randWorld_{X_2}$ have exactly the same distribution, where $\randWorld_{X_1},\randWorld_{X_2}\in\inset{0,1}$ and $\probOf\pbox{\randWorld_{X_j} = 1} = \probOf\pbox{\randWorld_{X} = j}$. Thus, the idea is to replace the variable $X$ by $X_1+2X_2$ (where $X_j$ corresponds to the event that $X$ has multiplicity $j$) to obtain the following polynomial:
%
%Denote the variables of $\poly$ to be $\vars{\poly}.$ In the \abbrCTIDB setting, $\poly\inparen{\vct{X}}$ has an equivalent reformulation $\inparen{\refpoly{}\inparen{\vct{X_R}}}$ that is of use to us, where $\abs{\vct{X_R}} = \bound\cdot\abs{\vct{X}}$ . Given $X_\tup \in\vars{\poly}$ and integer valuation $X_\tup \in\inset{0,\ldots, c}$. We can replace $X_\tup$ by $\sum_{j\in\pbox{\bound}}jX_{\tup, j}$ where the variables $\inparen{X_{\tup, j}}_{j\in\pbox{\bound}}$ are disjoint with integer assignments $X_{\tup, j}\in\inset{0, 1}$. Then for any $\worldvec\in\worlds$ and corresponding reformulated world $\worldvec_{\vct{R}}\in\inset{0, 1}^{\tupset\bound}$, we set $\worldvec_{\vct{R}_{\tup, j}} = 1$ for $\worldvec_\tup = j$, while $\worldvec_{\vct{R}_{\tup, j'}} = 0$ for all $j'\neq j\in\pbox{\bound}$. By construction then $\poly\inparen{\vct{X}}\equiv\refpoly{}\inparen{\vct{X_R}}$ $\inparen{\vct{X_R} = \vars{\refpoly{}}}$ since for any integer valuation $X_\tup\in\pbox{\bound}$, $X_{\tup, j}\in\inset{0, 1}$ we have the equality $X_\tup = j = \sum_{j\in\pbox{\bound}}jX_{t, j}$.
%
%Considering again our example,
%{\small
%\begin{multline*}
%\refpoly{1, }^{\inparen{ABX}^2}\inparen{A, X, B} = \poly_1^{\inparen{AXB}^2}\inparen{\sum_{j_1\in\pbox{\bound}}j_1A_{j_1}, \sum_{j_2\in\pbox{\bound}}j_2X_{j_2}, \sum_{j_3\in\pbox{\bound}}j_3B_{j_3}} \\
%= \inparen{\sum_{j_1\in\pbox{\bound}}j_1A_{j_1}}^2\inparen{\sum_{j_2\in\pbox{\bound}}j_2X_{j_2}}^2\inparen{\sum_{j_3\in\pbox{\bound}}j_3B_{j_3}}^2.
\[\overline{\refpoly{1, }^{\inparen{ABX}^2}}\inparen{A, X_1, X_2 B} = \poly_1^{\inparen{AXB}^2}\inparen{A,(X_1+2X_2),B}.\]
%\end{multline*}
%}
%Since the set of multiplicities for tuple $\tup$ by nature are disjoint we can drop all cross terms and have $\refpoly{1, }^2 = \sum_{j_1, j_2, j_3 \in \pbox{\bound}}j_1^2A^2_{j_1}j_2^2X_{j_2}^2j_3^2B^2_{j_3}$. Since we now have that all $\randWorld_{X_j}\in\inset{0, 1}$, computing expectation yields $\expct\pbox{\refpoly{1, }^2}=\sum_{j_1,j_2,j_3\in\pbox{\bound}}j_1^2j_2^2j_3^2$ \allowbreak $\expct\pbox{\randWorld_{A_{j_1}}}\expct\pbox{\randWorld_{X_{j_2}}}\expct\pbox{\randWorld_{B_{j_3}}}$.
Considering again our example,
{\small
\begin{multline*}
\refpoly{1, }^{\inparen{ABX}^2}\inparen{A, X, B} = \poly_1^{\inparen{AXB}^2}\inparen{\sum_{j_1\in\pbox{\bound}}j_1A_{j_1}, \sum_{j_2\in\pbox{\bound}}j_2X_{j_2}, \sum_{j_3\in\pbox{\bound}}j_3B_{j_3}} \\
= \inparen{\sum_{j_1\in\pbox{\bound}}j_1A_{j_1}}^2\inparen{\sum_{j_2\in\pbox{\bound}}j_2X_{j_2}}^2\inparen{\sum_{j_3\in\pbox{\bound}}j_3B_{j_3}}^2.
\end{multline*}
}
Since the set of multiplicities for tuple $\tup$ by nature are disjoint we can drop all cross terms and have $\refpoly{1, }^2 = \sum_{j_1, j_2, j_3 \in \pbox{\bound}}j_1^2A^2_{j_1}j_2^2X_{j_2}^2j_3^2B^2_{j_3}$. Since we now have that all $\randWorld_{X_j}\in\inset{0, 1}$, computing expectation yields $\expct\pbox{\refpoly{1, }^2}=\sum_{j_1,j_2,j_3\in\pbox{\bound}}j_1^2j_2^2j_3^2$ \allowbreak $\expct\pbox{\randWorld_{A_{j_1}}}\expct\pbox{\randWorld_{X_{j_2}}}\expct\pbox{\randWorld_{B_{j_3}}}$.
This leads us to consider a structure related to lineage polynomials:
Given that $X$ can only have multiplicity of $1$ or $2$ but not both, we drop the monomials with the term $X_1X_2$ to get
$\refpoly{1, }^{\inparen{ABX}^2}\inparen{A, X_1, X_2 B} = A^2X_1^2B^2+2^2\cdot A^2 X_2^2B^2.$
Now that all the world vectors $(\randWorld_A,\randWorld_{X_1},\randWorld_{X_2},\randWorld_A)\in\inset{0,1}^4$, we have $\expct\pbox{\refpoly{1, }^2}=\expct\pbox{\randWorld_{A}}\expct\pbox{\randWorld_{X_1}}\expct\pbox{\randWorld_{B}}+$ \\ $4\expct\pbox{\randWorld_{A}}\expct\pbox{\randWorld_{X_2}}\expct\pbox{\randWorld_{B}}\stackrel{\text{def}}{=}\rpoly_1^2\inparen{p_A,\probOf\inparen{X=1},\probOf\inparen{X=2},p_B}$. We only did the argument for a single monomial but by linearity of expectation we can apply the same argument to all monomials in $\poly_1^2$. Generalizing this argument to general $\poly$ leads to consider its follownig `reduced' version:
\begin{Definition}\label{def:reduced-poly}
For any polynomial $\poly\inparen{\inparen{X_\tup}_{\tup\in\tupset}}$ define the reformulated polynomial $\refpoly{}\inparen{\inparen{X_{\tup, j}}_{\tup\in\tupset, j\in\pbox{\bound}}}
@ -171,32 +189,31 @@ $ to be the polynomial resulting from converting $\refpoly{}$ into the standard
} (\abbrSMB),
removing all monomials containing the term $X_{\tup, j}X_{\tup, j'}$ for $\tup\in\tupset, j\neq j'\in\pbox{c}$, and setting each \emph{variable}'s exponents $e > 1$ to $1$.
\end{Definition}
Continuing with the example\footnote{
To save clutter we do not show the full expansion for variables with greatest multiplicity $= 1$ since e.g. for variable $A$, the sum of products itself evaluates to $1^2\cdot A^2 = A$.
}
$\poly_1^2\inparen{A, B, C, E, X_1, X_2, Y, Z}$ we have $\rpoly_1^2(A, B, C, E, X_1, X_2, Y, Z)=$
\begin{align*}
&A\inparen{\sum\limits_{j\in\pbox{\bound}}j^2X_j}B + BYE + BZC + 2A\inparen{\sum\limits_{j\in\pbox{\bound}}j^2X_j}BYE \\
&\qquad+ 2A\inparen{\sum\limits_{j\in\pbox{\bound}}j^2X_j}BZC + 2BYEZC \\
&= ABX_1 + AB\inparen{2}^2X_2+ BYE + BZC + 2AX_1BYE+ 2A\inparen{2}^2X_2BYE\\
&\qquad + 2AX_1BZC + 2A\inparen{2}^2X_2BZC + 2BYEZC.
\end{align*}
We have argued that for our specific example the expectation that we want is $\rpoly_1^2(\probOf\inparen{A=1},$\allowbreak$\probOf\inparen{B=1}, \probOf\inparen{C=1}$,\allowbreak $\probOf\inparen{E=1},$\allowbreak $\probOf\inparen{X_1=1}, \probOf\inparen{X_2=1}, \probOf\inparen{Y=1}, \probOf\inparen{Z=1})$.
\Cref{lem:tidb-reduce-poly} generalizes the equivalence to {\em all} $\raPlus$ queries on \abbrCTIDB\xplural .
%Continuing with the example\footnote{To save clutter we do not show the full expansion for variables with greatest multiplicity $= 1$ since e.g. for variable $A$, the sum of products itself evaluates to $1^2\cdot A^2 = A$.}
% $\poly_1^2\inparen{A, B, C, E, X_1, X_2, Y, Z}$ we have $\rpoly_1^2(A, B, C, E, X_1, X_2, Y, Z)=$
%\begin{align*}
%&A\inparen{\sum\limits_{j\in\pbox{\bound}}j^2X_j}B + BYE + BZC + 2A\inparen{\sum\limits_{j\in\pbox{\bound}}j^2X_j}BYE \\
%&\qquad+ 2A\inparen{\sum\limits_{j\in\pbox{\bound}}j^2X_j}BZC + 2BYEZC \\
%&= ABX_1 + AB\inparen{2}^2X_2+ BYE + BZC + 2AX_1BYE+ 2A\inparen{2}^2X_2BYE\\
%&\qquad + 2AX_1BZC + 2A\inparen{2}^2X_2BZC + 2BYEZC.
%\end{align*}
As we have essentially argued earlier, for our specific example the expectation that we want is $\rpoly_1^2(\probOf\inparen{A=1},$\allowbreak$\probOf\inparen{B=1}, \probOf\inparen{C=1}$,\allowbreak $\probOf\inparen{E=1},$\allowbreak $\probOf\inparen{X_1=1}, \probOf\inparen{X_2=1}, \probOf\inparen{Y=1}, \probOf\inparen{Z=1})$.
\Cref{lem:tidb-reduce-poly} generalizes the equivalence to {\em all} $\raPlus$ queries on \abbrCTIDB\xplural (proof in \Cref{subsec:proof-exp-poly-rpoly}):
\begin{Lemma}\label{lem:tidb-reduce-poly}
For any \abbrCTIDB $\pdb$, $\raPlus$ query $\query$, and lineage polynomial
$\poly\inparen{\vct{X}}=\poly\pbox{\query,\tupset,\tup}\inparen{\vct{X}}$, it holds that $
\expct_{\vct{W} \sim \pdassign}\pbox{\poly\inparen{\vct{W}}} = \rpoly\inparen{\probAllTup}
$, where $\probAllTup = \inparen{\prob_{\tup, j}}_{\tup\in\tupset, j\in\pbox{\bound}}.$
$, where $\probAllTup = \inparen{\prob_{\tup,j}}_{\tup\in\tupset,j\in[\bound]}.$
\end{Lemma}
\noindent
The proof in \Cref{subsec:proof-exp-poly-rpoly} follows by~\Cref{prop:ctidb-reduct} and~\Cref{lem:bin-bidb-phi-eq-redphi}.
%\noindent
%The proof in \Cref{subsec:proof-exp-poly-rpoly} follows by~\Cref{prop:ctidb-reduct} and~\Cref{lem:bin-bidb-phi-eq-redphi}.
\subsection{Our Techniques}
\mypar{Lower Bound Proof Techniques}
Our main hardness result shows that computing~\Cref{prob:expect-mult} is $\sharpwonehard$ for $1$-\abbrTIDB. To prove this result we show that for the same $\query_1$ from the example above, for an arbitrary `product width' $k$, the query $\qhard^k$ is able to encode various hard graph-counting problems (assuming $\bigO{\numvar}$ tuples rather than the $\bigO{1}$ tuples in \Cref{fig:two-step}).
We do so by considering an arbitrary graph $G$ (analogous to relation $\boldsymbol{R}$ of $\query_1$) and analyzing how the coefficients in the (univariate) polynomial $\widetilde{\poly}\left(p,\dots,p\right)$ relate to counts of subgraphs in $G$ that are isomorphic to various subgraphs with $k$ edges. E.g., we exploit the fact that the coefficient corresponding to $\prob^{2k}$ in $\rpoly\inparen{\prob,\ldots,\prob}$ of $\qhard^k$ is proportional to the number of $k$-matchings in $G$,
%Our main hardness result shows that computing~\Cref{prob:expect-mult} is $\sharpwonehard$ for $1$-\abbrTIDB.
To prove the lower bounds in \cref{tab:lbs} we show that for the same $\query_1$ from the example above, for an arbitrary `product width' $k$, the query $\qhard^k$ is able to encode various hard graph-counting problems (assuming $\bigO{\numvar}$ tuples rather than the $\bigO{1}$ tuples in \Cref{fig:two-step}).
We do so by considering an arbitrary graph $G$ (analogous to relation $\boldsymbol{R}$ of $\query_1$) and analyzing how the coefficients in the (univariate) polynomial $\widetilde{\poly}\left(p,\dots,p\right)$ relate to counts of subgraphs in $G$ that are isomorphic to various subgraphs with $k$ edges. E.g., for the last two rows in \cref{tab:lbs}, we exploit the fact that the coefficient corresponding to $\prob^{2k}$ in $\rpoly\inparen{\prob,\ldots,\prob}$ of $\qhard^k$ is proportional to the number of $k$-matchings in $G$,
a known hard problem in parameterized/fine-grained complexity literature.
@ -212,8 +229,8 @@ Denote by $\timeOf{\abbrStepTwo}(\circuit, \epsilon)$ (recall $\circuit$ is the
\begin{Problem}[\abbrCTIDB linear time approximation]\label{prob:big-o-joint-steps}
Given \abbrCTIDB $\pdb$, $\raPlus$ query $\query$,
is there a $(1\pm\epsilon)$-approximation of $\expct_{\rvworld\sim\bpd}$\allowbreak$\pbox{\query\inparen{\rvworld}\inparen{\tup}}$ for all result tuples $\tup$ where
$\exists \circuit : \timeOf{\abbrStepOne}(\query,\tupset, \circuit) + \timeOf{\abbrStepTwo}(\circuit, \epsilon) \le$\allowbreak$ O_\epsilon(\qruntime{\optquery{\query}, \tupset, \bound})$?
is there a $(1\pm\epsilon)$-approximation of $\expct_{\rvworld\sim\bpd}$\allowbreak$\pbox{\query\inparen{\rvworld}\inparen{\tup}}$ for all result tuples $\tup$ where there exists
$\circuit : \timeOf{\abbrStepOne}(\query,\tupset, \circuit) + \timeOf{\abbrStepTwo}(\circuit, \epsilon) \le$\allowbreak$ O_\epsilon(\qruntime{\optquery{\query}, \tupset, \bound})$?
\end{Problem}
A key insight of this paper is that the representation of $\circuit$ matters.
@ -226,41 +243,44 @@ Accordingly, this work uses (arithmetic) circuits\footnote{
as the representation system of $\poly(\vct{X})$, and we show in \Cref{sec:circuit-depth} an $\bigO{\qruntime{\optquery{\query}, \tupset, \bound}}$ algorithm for constructing the lineage polynomial for all result tuples of an $\raPlus$ query $\query$ (or more more precisely, a single circuit $\circuit$ with one sink per tuple representing the tuple's lineage).
Given that a representation $\circuit^*$ exists where $\timeOf{\abbrStepOne}(\query,\tupset,\circuit^*)\le \bigO{\qruntime{\optquery{\query}, \tupset, \bound}}$, we can focus on the complexity of \abbrStepTwo.
As we also show in \Cref{sec:circuit-runtime}, this size is also bounded by $\qruntime{\optquery{\query}, \tupset, \bound}$ (i.e., $|\circuit^*| \le \bigO{\qruntime{\optquery{\query}, \tupset, \bound}}$), where $|\circuit|$ is the size of circuit $\circuit$.
Thus, the question of approximation
can be stated as the following stronger (since~\Cref{prob:big-o-joint-steps} has access to \emph{all} equivalent \circuit representing $\query\inparen{\vct{W}}\inparen{\tup}$), but sufficient condition:
As we also show in \Cref{sec:circuit-runtime}, the size is also bounded by $\qruntime{\optquery{\query}, \tupset, \bound}$ (i.e., $|\circuit^*| \le \bigO{\qruntime{\optquery{\query}, \tupset, \bound}}$), where $|\circuit|$ is the size of circuit $\circuit$.
%Thus, the question of approximation can be stated as the following stronger (since~\Cref{prob:big-o-joint-steps} has access to \emph{all} equivalent \circuit representing $\query\inparen{\vct{W}}\inparen{\tup}$), but sufficient condition:
Given such a $\circuit^*$, to solve \Cref{prob:big-o-joint-steps}, it is \emph{sufficient} to solve: % the following problem:
\begin{Problem}\label{prob:intro-stmt}
Given one circuit $\circuit$ that encodes $\apolyqdt$ for all result tuples $\tup$ (one sink per $\tup$) for \abbrCTIDB $\pdb$ and $\raPlus$ query $\query$, does there exist an algorithm that computes a $(1\pm\epsilon)$-approximation of $\expct_{\rvworld\sim\bpd}\pbox{\query\inparen{\rvworld}\inparen{\tup}}$ (for all result tuples $\tup$) in $\bigO{|\circuit|}$ time?
\end{Problem}
For an upper bound on approximating the expected count, it is easy to check that if all the probabilties are constant then (with an additive adjustment) $\poly\left(\prob_1,\dots, \prob_n\right)$ (i.e. evaluating the original lineage polynomial over the probability values) is a constant factor approximation.
We will formalize the notions of circuits and hence, \Cref{prob:intro-stmt} in \cref{sec:expression-trees}. For an upper bound on approximating the expected count, it is easy to check that if all the probabilties are constant then (with an additive adjustment) $\rpoly\left(\prob_1,\dots, \prob_n\right)$ (recall \cref{def:reduced-poly}) is a constant factor approximation.
This is illustrated in the following example using $\query_1^2$ from earlier. To aid in presentation we again limit our focus to $\refpoly{1, }^{\inparen{ABX}^2}$, assume $\bound = 2$ for variable $X$ and $\bound = 1$ for all other variables. Let $\prob_A$ denote $\probOf\pbox{A = 1}$.
In computing $\rpoly$, we have some cancellations to deal with:
%In computing $\rpoly$, we have some cancellations to deal with:
Then we have:
\begin{footnotesize}
\begin{equation*}
\refpoly{1, }^{\inparen{ABX}^2}\inparen{\vct{X}} = A^2\inparen{X_1^2 + 4X_1X_2 + 4X_2^2}B^2 =A^2X_1^2B^2 + 4A^2X_1X_2B^2+4A^2X_2^2B^2
%&\qquad+ 2AX_2B^2YE + 2AX_1B^2ZC + 2AX_2B^2ZC + 2B^2YEZC\\
\end{equation*}
\end{footnotesize}
This then implies
Recall that
%\begin{footnotesize}
%\begin{equation*}
$\rpoly_1^{\inparen{ABX}^2}\inparen{\vct{X}} = AX_1B+4AX_2B$.
$\rpoly_1^{\inparen{ABX}^2}\inparen{\vct{X}} = AX_1B+4AX_2B$,
%\end{equation*}
%\end{footnotesize}
Substituting $\vct{\prob}$ for $\vct{X}$,
\begin{footnotesize}
\begin{align*}
\hspace*{-3mm}
\refpoly{1, }^{\inparen{ABX}^2}\inparen{\probAllTup} &=\prob_A^2\prob_{X_1}^2\prob_B^2 + 4\prob_A^2\prob_{X_1}\prob_{X_2}\prob_B^2 + 4\prob_A^2\prob_{X_2}^2\prob_B^2\\% + \prob_B^2\prob_Y^2\prob_E^2 + \prob_B^2\prob_Z^2\prob_C^2 + 2\prob_A\prob_{X_1}\prob_B^2\prob_Y\prob_E\\
which implies:
\[ \refpoly{1, }^{\inparen{ABX}^2}\inparen{\probAllTup} -4\prob_A^2\prob_{X_1}\prob_{X_2}\prob_B^2=\prob_A^2\prob_{X_1}^2\prob_B^2 + 4\prob_A^2\prob_{X_2}^2\prob_B^2.\]
%Substituting $\vct{\prob}$ for $\vct{X}$,
%\begin{footnotesize}
%\begin{align*}
%\hspace*{-3mm}
% \refpoly{1, }^{\inparen{ABX}^2}\inparen{\probAllTup} &=\prob_A^2\prob_{X_1}^2\prob_B^2 + 4\prob_A^2\prob_{X_1}\prob_{X_2}\prob_B^2 + 4\prob_A^2\prob_{X_2}^2\prob_B^2\\% + \prob_B^2\prob_Y^2\prob_E^2 + \prob_B^2\prob_Z^2\prob_C^2 + 2\prob_A\prob_{X_1}\prob_B^2\prob_Y\prob_E\\
%&\qquad+ 2\prob_A\prob_{X_2}\prob_B^2\prob_Y\prob_E + 2\prob_A\prob_{X_1}\prob_B^2\prob_Z\prob_C + 2\prob_A\prob_{X_2}\prob_B^2\prob_Z\prob_C+ 2\prob_B^2\prob_Y\prob_E\prob_Z\prob_C\\
&\leq\prob_A\prob_{X_1}\prob_B + 4\prob_A^2\prob_{X_1}\prob_{X_2}\prob_B^2 + 4\prob_A\prob_{X_2}\prob_b\\% + \prob_B\prob_Y\prob_E + \prob_B\prob_Z\prob_C + 2\prob_A\prob_{X_1}\prob_B\prob_Y\prob_E \\
% &\leq\prob_A\prob_{X_1}\prob_B + 4\prob_A^2\prob_{X_1}\prob_{X_2}\prob_B^2 + 4\prob_A\prob_{X_2}\prob_b\\% + \prob_B\prob_Y\prob_E + \prob_B\prob_Z\prob_C + 2\prob_A\prob_{X_1}\prob_B\prob_Y\prob_E \\
%&\qquad + 2\prob_A\prob_{X_2}\prob_B\prob_Y\prob_E+ 2\prob_A\prob_{X_1}\prob_B\prob_Z\prob_C + 2\prob_A\prob_{X_2}\prob_B\prob_Z\prob_C + 2\prob_B\prob_Y\prob_E\prob_Z\prob_C\\
&= \rpoly_1^{\inparen{ABX}^2}\inparen{\vct{p}} + 4\prob_A^2\prob_{X_1}\prob_{X_2}\prob_B^2.
\end{align*}
\end{footnotesize}
If we assume that all probability values are at least $p_0>0$, then given access to $\refpoly{1, }^{\inparen{ABX}^2}\inparen{\vct{\prob}} - 4\prob_A^2\prob_{X_1}\prob_{X_2}\prob_B^2$
% &= \rpoly_1^{\inparen{ABX}^2}\inparen{\vct{p}} + 4\prob_A^2\prob_{X_1}\prob_{X_2}\prob_B^2.
%\end{align*}
%\end{footnotesize}
If we assume that all probability values are in $[p_0,1]$ for some $p_0>0$,
%then given access to $\refpoly{1, }^{\inparen{ABX}^2}\inparen{\vct{\prob}} - 4\prob_A^2\prob_{X_1}\prob_{X_2}\prob_B^2$
we get that $\refpoly{1, }^{\inparen{ABX}^2}\inparen{\vct{\prob}} - 4\prob_A^2\prob_{X_1}\prob_{X_2}\prob_B^2$ is in the range $\pbox{p_0^3\cdot\rpoly^{\inparen{ABX}^2}_1\inparen{\vct{\prob}}, \rpoly_1^{\inparen{ABX}^2}\inparen{\vct{\prob}}}$.
%We can simulate sampling from $\refpoly{1, }^2\inparen{\vct{X}}$ by sampling monomials from $\refpoly{1, }^2$ while ignoring any samples $A^2X_1X_2B^2$.
Note however, that this is \emph{not a tight approximation}.

View File

@ -153,7 +153,7 @@
\newcommand{\tupsetsize}{n}
\newcommand{\tupset}{D}
\newcommand{\gentupset}{\overline{D}}
\newcommand{\world}{\inset{0,\ldots, c}}
\newcommand{\world}{[0,\bound]}
\newcommand{\worldvec}{\vct{W}}
\newcommand{\worlds}{\world^\tupset}
\newcommand{\bpd}{\mathcal{P}}%bpd for bag probability distribution
@ -357,7 +357,8 @@
\newcommand{\ptime}{{\sf PTIME}\xspace}
\newcommand{\timeOf}[1]{T_{#1}}
\newcommand{\qruntime}[1]{T_{det}\inparen{#1}}
\newcommand{\optquery}[1]{\func{OPT}\inparen{#1}}
%\newcommand{\optquery}[1]{\func{OPT}\inparen{#1}}
\newcommand{\optquery}[1]{{#1}} % Changing this since we decided not to use OPT specifically.
\newcommand{\qruntimenoopt}[1]{T_{det}\inparen{#1}}%need to get rid of this--needs to be propagated
\newcommand{\jointime}[1]{T_{join}(#1)}
\newcommand{\kmatchtime}{T_{match}\inparen{k, G}}

View File

@ -815,3 +815,17 @@ Maximilian Schleich},
year = {2021}
}
@article{eth,
title = {Which Problems Have Strongly Exponential Complexity?},
journal = {Journal of Computer and System Sciences},
volume = {63},
number = {4},
pages = {512-530},
year = {2001},
issn = {0022-0000},
doi = {https://doi.org/10.1006/jcss.2001.1774},
url = {https://www.sciencedirect.com/science/article/pii/S002200000191774X},
author = {Russell Impagliazzo and Ramamohan Paturi and Francis Zane},
abstract = {For several NP-complete problems, there have been a progression of better but still exponential algorithms. In this paper, we address the relative likelihood of sub-exponential algorithms for these problems. We introduce a generalized reduction that we call Sub-exponential Reduction Family (SERF) that preserves sub-exponential complexity. We show that Circuit-SAT is SERF-complete for all NP-search problems, and that for any fixed k⩾3, k-SAT, k-Colorability, k-Set Cover, Independent Set, Clique, and Vertex Cover, are SERF-complete for the class SNP of search problems expressible by second-order existential formulas whose first-order part is universal. In particular, sub-exponential complexity for any one of the above problems implies the same for all others. We also look at the issue of proving strongly exponential lower bounds for AC0, that is, bounds of the form 2Ω(n). This problem is even open for depth-3 circuits. In fact, such a bound for depth-3 circuits with even limited (at most nε) fan-in for bottom-level gates would imply a nonlinear size lower bound for logarithmic depth circuits. We show that with high probability even random degree 2 GF(2) polynomials require strongly exponential size for Σk3 circuits for k=o(loglogn). We thus exhibit a much smaller space of 2O(n2) functions such that almost every function in this class requires strongly exponential size Σk3 circuits. As a corollary, we derive a pseudorandom generator (requiring O(n2) bits of advice) that maps n bits into a larger number of bits so that computing parity on the range is hard for Σk3 circuits. Our main technical lemma is an algorithm that, for any fixed ε>0, represents an arbitrary k-CNF formula as a disjunction of 2εnk-CNF formulas that are sparse, that is, each disjunct has O(n) clauses.}
}

View File

@ -3,11 +3,13 @@
\section{Hardness of Exact Computation}
\label{sec:hard}
In this section, we will prove the hardness results claimed in Table~\ref{tab:lbs} for a specific (family) of hard instance $(\qhard,\pdb)$ for \Cref{prob:bag-pdb-poly-expected} where $\pdb$ is a $1$-\abbrTIDB.
Note that this implies hardness for \abbrCTIDB\xplural $\inparen{\bound\geq1}$; \Cref{prob:bag-pdb-poly-expected} cannot be done in $\bigO{\qruntime{\optquery{\query},\tupset,\bound}}$ runtime. The results also apply to \abbrOneBIDB and other \abbrPDB\xplural.
Note that this implies hardness for \abbrCTIDB\xplural $\inparen{\bound\geq1}$
%; \Cref{prob:bag-pdb-poly-expected} cannot be done in $\bigO{\qruntime{\optquery{\query},\tupset,\bound}}$ runtime. The results also apply to
as well as \abbrOneBIDB. % and other \abbrPDB\xplural.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Preliminaries}\label{sec:hard:sub:pre}
Our hardness results are based on (exactly) counting the number of (not necessarily induced) subgraphs in $G$ isomorphic to $H$. Let $\numocc{G}{H}$ denote this quantity. We can think of $H$ as being of constant size and $G$ as growing.
In particular, we will consider computing the following counts (given $G$ in its adjacency list representation): $\numocc{G}{\tri}$ (the number of triangles), $\numocc{G}{\threedis}$ (the number of $3$-matchings), and the latter's generalization $\numocc{G}{\kmatch}$ (the number of $k$-matchings). We use $\kmatchtime$ to denote the optimal runtime of computing $\numocc{G}{\kmatch}$ exactly. Our hardness results in \Cref{sec:multiple-p} are based on the following hardness results/conjectures:
In particular, we will consider computing the following counts (given $G$ in its adjacency list representation): $\numocc{G}{\tri}$ (the number of triangles), $\numocc{G}{\threedis}$ (the number of $3$-matchings), and the latter's generalization $\numocc{G}{\kmatch}$ (the number of $k$-matchings). We use $\kmatchtime$ to denote the optimal runtime of computing $\numocc{G}{\kmatch}$ exactly. Our hardness results in \Cref{sec:multiple-p} are based on the following known (conditional) hardness results:
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Theorem}[\cite{k-match}]
@ -19,13 +21,19 @@ Given positive integer $k$ and undirected graph $G=(\vset,\edgeSet)$ with no sel
%There exists an absolute constant $c_0>0$ such that for every $G=(\vset,\edgeSet)$, we have $\kmatchtime \ge \Omega\inparen{|E|^{c_0\cdot k}}$ for large enough $k$.
%\end{hypo}
\begin{hypo}[~\cite{10.1109/FOCS.2014.22}]\label{conj:known-algo-kmatch}
For every $G=\inparen{\vset, \edgeSet}$, $\kmatchtime\ge n^{\Omega\inparen{k/\log{k}}}$.
\end{hypo}
%<<<<<<< HEAD
%\begin{hypo}[~\cite{10.1109/FOCS.2014.22}]\label{conj:known-algo-kmatch}
%For every $G=\inparen{\vset, \edgeSet}$, $\kmatchtime\ge n^{\Omega\inparen{k/\log{k}}}$.
%\end{hypo}
%=======
\begin{Theorem}[~\cite{DBLP:journals/corr/CurticapeanM14}]\label{conj:known-algo-kmatch}
Given positive integer $k$ and undirected graph $G=(\vset,\edgeSet)$, $\kmatchtime\ge |\vset|^{\Omega\inparen{k/\log{k}}}$, assuming ETH.
\end{Theorem}
We note that the above conjecture is somewhat non-standard. In particular, the best known algorithm to compute $\numocc{G}{\kmatch}$ takes time $\Omega\inparen{|V|^{k/2}}$
%We note that the above conjecture is somewhat non-standard. In particular, the best known algorithm to compute $\numocc{G}{\kmatch}$ takes time $\Omega\inparen{|V|^{k/2}}$
%(i.e. if this is the best algorithm then $c_0=\frac 14$)
~\cite{k-match}. What the above conjecture is saying is that one can only hope for a polynomial improvement over the state of the art algorithm to compute $\numocc{G}{\kmatch}$.
%~\cite{k-match}.
The above result is saying is that (assuming ETH) one can only hope for a slightly super-polynomial improvement over the state of the art algorithm to compute $\numocc{G}{\kmatch}$.
%
Our hardness result in Section~\ref{sec:single-p} is based on the following conjectured hardness result:
@ -51,25 +59,28 @@ For any graph $G=(V,\edgeSet)$ and $\kElem\ge 1$, define
\end{Definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\noindent Returning to \Cref{fig:two-step}, it can be seen that $\poly_{G}^\kElem(\vct{X})$ is the lineage polynomial from query $\qhard^k$, which we define next ($\query_1$ from~\Cref{sec:intro} is the same query with $k=1$). Let us alias
\begin{lstlisting}
SELECT DISTINCT 1 FROM T $t_1$, R r, T $t_2$
WHERE $t_1$.Point = r.Point$_1$ AND $t_2$.Point = r.Point$_2$
\end{lstlisting}
as $Q^1$. The query $\qhard^k$ then becomes
\noindent Returning to \Cref{fig:two-step}, it can be seen that $\poly_{G}^\kElem(\vct{X})$ is the lineage polynomial from query $\qhard^k$, which we define next. Recall $\query_1$ from~\Cref{sec:intro}, which is $\qhard^1$.
%Let us alias
%\begin{lstlisting}
%SELECT DISTINCT 1 FROM T $t_1$, R r, T $t_2$
%WHERE $t_1$.Point = r.Point$_1$ AND $t_2$.Point = %r.Point$_2$
%\end{lstlisting}
%as $Q^1$.
The query $\qhard^k$ then becomes
\mdfdefinestyle{underbrace}{topline=false, rightline=false, bottomline=false, leftline=false, backgroundcolor=black!15!white, innerbottommargin=0pt}
\begin{mdframed}[style=underbrace]
\begin{lstlisting}
SELECT COUNT(*) FROM $\underbrace{Q^1\text{ JOIN }Q^1\text{ JOIN}\cdots\text{JOIN }Q^1}_{k\rm\ times}$
\end{lstlisting}
\end{mdframed}
\noindent Consider again the \abbrCTIDB instance $\pdb$ of~\Cref{fig:two-step} and, for our hard instance, let $\bound = 1$. $\pdb$ generalizes to one compatible to~\Cref{def:qk} as follows. Relation $T$ has $n$ tuples corresponding to each vertex for $i$ in $[n]$, each with probability $\prob$ and $R$ has tuples corresponding to the edges $\edgeSet$ (each with probability of $1$).\footnote{Technically, $\poly_{G}^\kElem(\vct{X})$ should have variables corresponding to tuples in $R$ as well, but since they always are present with probability $1$, we drop those. Our argument also works when all the tuples in $R$ also are present with probability $\prob$ but to simplify notation we assign probability $1$ to edges.}
\noindent %Consider again the \abbrCTIDB instance $\pdb$ of~\Cref{fig:two-step} and, for our hard instance, let $\bound = 1$. $\pdb$ generalizes to one compatible
We next define the instances for $T$ and $R$ that lead to the lineage polynomial in~\Cref{def:qk} as follows. Relation $T$ has $n$ tuples corresponding to each vertex for $i$ in $[n]$, each with probability $\prob$ and $R$ has tuples corresponding to the edges $\edgeSet$ (each with probability of $1$).\footnote{Technically, $\poly_{G}^\kElem(\vct{X})$ should have variables corresponding to tuples in $R$ as well, but since they always are present with probability $1$, we drop those. Our argument also works when all the tuples in $R$ also are present with probability $\prob$ but to simplify notation we assign probability $1$ to edges.}
In other words, this instance $\tupset$ contains the set of $\numvar$ unary tuples in $T$ (which corresponds to $\vset$) and $\numedge$ binary tuples in $R$ (which corresponds to $\edgeSet$).
Note that this implies that $\poly_{G}^\kElem$ is indeed a $1$-\abbrTIDB lineage polynomial.
Next, we note that the runtime for answering $\qhard^k$ on deterministic database $\tupset$, as defined above, is $O_k\inparen{\numedge}$ (i.e. deterministic query processing is `easy' for this query):
\begin{Lemma}\label{lem:tdet-om}
Define $\qhard^k$ and $\tupset$ as above. Then
For $\qhard^k,\tupset$ be as above,
$\qruntimenoopt{\qhard^k, \tupset, \bound}$ is $O_k\inparen{\numedge}$.
\end{Lemma}
@ -85,8 +96,12 @@ needs time $\bigOmega{\kmatchtime}$, if $\kmatchtime\ge \omega\inparen{\abs{\edg
\end{Theorem}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
Note that the second row of \Cref{tab:lbs} follows from \Cref{prop:expection-of-polynom}, \Cref{thm:mult-p-hard-result}, \Cref{lem:tdet-om}, and \Cref{thm:k-match-hard} while the third row is proved by \Cref{prop:expection-of-polynom}, \Cref{thm:mult-p-hard-result}, \Cref{lem:tdet-om}, and \Cref{conj:known-algo-kmatch}. Since \Cref{conj:known-algo-kmatch} is non-standard, the latter hardness result should be interpreted as follows. Any substantial polynomial improvement for \Cref{prob:bag-pdb-poly-expected} (over the trivial algorithm that converts $\poly$ into SMB and then uses \Cref{cor:expct-sop} for \abbrStepTwo) would lead to an improvement over the state of the art {\em upper} bounds on $\kmatchtime$. Finally, note that \Cref{thm:mult-p-hard-result} needs one to be able to compute the expected multiplicities over $(2k+1)$ distinct values of $p_i$, each of which corresponds to distinct $\bpd$ (for the same $\tupset$), which explain the `Multiple' entries in the second column of the second and third rows in \Cref{tab:lbs}. Next, we argue how to get rid of this latter requirement.
Note that the second row of \Cref{tab:lbs} follows from %\Cref{prop:expection-of-polynom},
\Cref{thm:mult-p-hard-result}, \Cref{lem:tdet-om}, and \Cref{thm:k-match-hard} while the third row is proved by %\Cref{prop:expection-of-polynom},
\Cref{thm:mult-p-hard-result}, \Cref{lem:tdet-om}, and \Cref{conj:known-algo-kmatch}.
%Since \Cref{conj:known-algo-kmatch} is non-standard, the latter hardness result should be interpreted as follows. Any substantial polynomial improvement for \Cref{prob:bag-pdb-poly-expected} (over the trivial algorithm that converts $\poly$ into SMB and then uses \Cref{cor:expct-sop} for \abbrStepTwo) would lead to an improvement over the state of the art {\em upper} bounds on $\kmatchtime$. Finally,
Note that \Cref{thm:mult-p-hard-result} needs one to be able to compute the expected multiplicities over $(2k+1)$ distinct values of $p_i$, each of which corresponds to distinct $\bpd$ (for the same $\tupset$), which explain the `Multiple' entries in the second column of the second and third rows in \Cref{tab:lbs}. Next, we argue how to get rid of this latter requirement.
\textcolor{red}{Need to put in a proof overview here-- Atri}
%%% Local Variables:
%%% mode: latex

View File

@ -19,8 +19,23 @@ We refer to the structure when the underlying DAG is a tree (with edges pointing
The circuits $\inparen{1}$ and $\inparen{2}$ in column $\poly$ of \Cref{fig:two-step} are both expression trees.%encode their respective polynomials in column $\poly$.
%Note that the ciricuit \circuit representing $AX$ and the circuit \circuit' representing $B\inparen{Y+Z}$ each encode a tree, with edges pointing towards the root.
The function $\polyf\inparen{\cdot}$ (\Cref{def:poly-func}) maps a circuit to its corresponding polynomial. We next define its inverse:
%of the function $\polyf(\cdot)$.% (\Cref{def:poly-func}).%, which maps a circuit to the polynomial it encodes.
\begin{figure}[t!]
\begin{Definition}[Circuit Set]\label{def:circuit-set}
$\circuitset{\polyX}$ is the set of all possible circuits $\circuit$ such that $\polyf(\circuit) = \polyX$.
\end{Definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
The circuit of \Cref{fig:circuit} is an element of $\circuitset{2X^2+3XY-2Y^2}$. %One can think of $\circuitset{\polyX}$ as the infinite set of circuits where for each element \circuit, $\polyf\inparen{\circuit} = \polyX$.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%\medskip
\noindent We are now ready to formally state the final version of \Cref{prob:intro-stmt}.%our \textbf{main problem}.
\begin{wrapfigure}{r}{0.3\textwidth}
%\begin{figure}[t!]
\centering
%see https://tex.stackexchange.com/questions/26846/how-to-scale-a-tikzpicture-including-texts#26852
%for a nice explanation of scaling a tikzpicture
@ -59,22 +74,10 @@ The circuits $\inparen{1}$ and $\inparen{2}$ in column $\poly$ of \Cref{fig:two-
\label{fig:circuit}
}{-0.025cm}{-0.58cm}
%\vspace{-0.58cm}
\end{figure}
\end{wrapfigure}
The function $\polyf\inparen{\cdot}$ (\Cref{def:poly-func}) maps a circuit to its corresponding polynomial. We next define the inverse of the function $\polyf(\cdot)$.% (\Cref{def:poly-func}).%, which maps a circuit to the polynomial it encodes.
\begin{Definition}[Circuit Set]\label{def:circuit-set}
$\circuitset{\polyX}$ is the set of all possible circuits $\circuit$ such that $\polyf(\circuit) = \polyX$.
\end{Definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
The circuit of \Cref{fig:circuit} is an element of $\circuitset{2X^2+3XY-2Y^2}$. %One can think of $\circuitset{\polyX}$ as the infinite set of circuits where for each element \circuit, $\polyf\inparen{\circuit} = \polyX$.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\medskip
\noindent We are now ready to formally state the final version of \Cref{prob:intro-stmt}.%our \textbf{main problem}.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Definition}[The Expected Result Multiplicity Problem]\label{def:the-expected-multipl}

View File

@ -13,11 +13,11 @@ Consider $\poly(X, Y) = (X + Y)(X + Y)$ where $X$ and $Y$ are from different blo
\fi
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Let $\abs{\poly}$ be the number of operators in $\poly$.
Let $\abs{\poly}$ be the number of operators in $\poly$. Then:
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Corollary}\label{cor:expct-sop}
If $\poly$ is a $1$-\abbrBIDB lineage polynomial already in \abbrSMB, then the expectation of $\poly$, i.e., $\expct\pbox{\poly} = \rpoly\left(\prob_1,\ldots, \prob_\numvar\right)$ can be computed in $\bigO{\abs{\poly}}$ time.
If $\poly$ is a \abbrOneBIDB lineage polynomial already in \abbrSMB, then the expectation of $\poly$, i.e., $\expct\pbox{\poly} = \rpoly\left(\prob_1,\ldots, \prob_\numvar\right)$ can be computed in $\bigO{\abs{\poly}}$ time.
\end{Corollary}
% \subsubsection{Possible World Semantics}\label{subsub:possible-world-sem}

View File

@ -4,14 +4,15 @@
\subsection{Single $\prob$ value}
\label{sec:single-p}
While \Cref{thm:mult-p-hard-result} shows that computing $\rpoly(\prob,\dots,\prob)$ for multiple values of $\prob$ in general is hard it does not rule out the possibility that one can compute this value exactly for a {\em fixed} value of $\prob$. Indeed, it is easy to check that one can compute $\rpoly(\prob,\dots,\prob)$ exactly in linear time for $\prob\in \inset{0,1}$. Next we show that these two are the only possibilities:
While \Cref{thm:mult-p-hard-result} shows that computing $\rpoly(\prob,\dots,\prob)$ for multiple values of $\prob$ in general is hard it does not rule out the possibility that one can compute this value exactly for a {\em fixed} value of $\prob$. %Indeed, it is easy to check that
One can compute $\rpoly(\prob,\dots,\prob)$ exactly in linear time for $\prob\in \inset{0,1}$. Next, we show that these are the only two possibilities:
\begin{Theorem}\label{th:single-p-hard}
Fix $\prob\in (0,1)$. Then assuming \Cref{conj:graph} is true, any algorithm that computes $\rpoly_{G}^3(\prob,\dots,\prob)$ for arbitrary $G = (\vset, \edgeSet)$ exactly has to run in time $\Omega\inparen{\abs{\edgeSet}^{1+\eps_0}}$, where $\eps_0$ is as defined in \Cref{conj:graph}.
Fix $\prob\in (0,1)$. Then assuming \Cref{conj:graph}, any algorithm that computes $\rpoly_{G}^3(\prob,\dots,\prob)$ for arbitrary $G = (\vset, \edgeSet)$ exactly has to run in time $\Omega\inparen{\abs{\edgeSet}^{1+\eps_0}}$, where $\eps_0$ is as in \Cref{conj:graph}.
\end{Theorem}
Note that \Cref{prop:expection-of-polynom} and \Cref{th:single-p-hard} above imply the hardness result in the first row of \Cref{tab:lbs}.
We note that \Cref{thm:k-match-hard} and \Cref{conj:known-algo-kmatch} (and the lower bounds in the second and third rows) need $k$ to be large enough (in particular, we need a family of hard queries). But the above \Cref{th:single-p-hard} (and the lower bound in first row of Table~\ref{tab:lbs}) holds for $k=3$ (and hence for a fixed query).
Note that \Cref{lem:tdet-om} and \Cref{th:single-p-hard} above imply the hardness result in the first row of \Cref{tab:lbs}.
We note that \Cref{thm:k-match-hard} and \Cref{conj:known-algo-kmatch} (and the lower bounds in the second and third rows) need $k$ to be large enough (in particular, we need a family of hard queries). But the above \Cref{th:single-p-hard} (and the lower bound in first row of Table~\ref{tab:lbs}) holds for $k=3$ (and hence for a fixed query). \textcolor{red}{Need to put in a proof overview here-- Atri}
%%% Local Variables: