paper-BagRelationalPDBsAreHard/mult_distinct_p.tex

%root:main.tex
%!TEX root=./main.tex
\section{Hardness of Exact Computation}
\label{sec:hard}
\AH{If anything need be changed in~\Cref{sec:hard}, it would only be in the following (opening) paragraph.}
In this section, we will prove the hardness results claimed in Table~\ref{tab:lbs} for a specific (family) of hard instance $(\query,\pdb)$ for \Cref{prob:bag-pdb-poly-expected} where $\pdb$ is a \abbrTIDB.
 Note that this implies hardness for \bis and general \abbrBPDB, showing \Cref{prob:bag-pdb-poly-expected} cannot be done in $O\inparen{\qruntime{\query,\dbbase}}$ runtime.
%(and hence the equivalent \Cref{prob:bag-pdb-query-eval})
 %in the negative.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Preliminaries}\label{sec:hard:sub:pre}
Our hardness results are based on (exactly) counting the number of (not necessarily induced) subgraphs in $G$ isomorphic to $H$. Let $\numocc{G}{H}$ denote this quantity.  We can think of $H$ as being of constant size and $G$ as growing.
In particular, we will consider the problems of computing the following counts (given $G$ in its adjacency list representation): $\numocc{G}{\tri}$ (the number of triangles), $\numocc{G}{\threedis}$ (the number of $3$-matchings), and the latter's generalization $\numocc{G}{\kmatch}$ (the number of $k$-matchings).  We use $\kmatchtime$ to denote the optimal runtime of computing $\numocc{G}{\kmatch}$ exactly.  Our hardness results in \Cref{sec:multiple-p} are based on the following hardness results/conjectures:

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Theorem}[\cite{k-match}]
\label{thm:k-match-hard}
Given positive integer $k$ and undirected graph $G=(\vset,\edgeSet)$ with no self-loops or parallel edges,  $\kmatchtime\ge \littleomega{f(k)\cdot |\edgeSet|^c}$ for any function $f$ and fixed constant $c$ independent of $\numedge$ and $k$ (assuming $\sharpwzero\ne\sharpwone$).
\end{Theorem}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{hypo}\label{conj:known-algo-kmatch}
There exists an absolute constant $c_0>0$ such that for every $G=(\vset,\edgeSet)$, we have $\kmatchtime \ge \Omega\inparen{|E|^{c_0\cdot k}}$ for large enough $k$.
\end{hypo}
We note that the above conjecture is somewhat non-standard. In particular, the best known algorithm to compute $\numocc{G}{\kmatch}$ takes time $\Omega\inparen{|V|^{k/2}}$ (i.e. if this is the best algorithm then $c_0=\frac 14$)~\cite{k-match}. What the above conjecture is saying is that one can only hope for a polynomial improvement over the state of the art algorithm to compute $\numocc{G}{\kmatch}$.
%

Our hardness result in Section~\ref{sec:single-p} is based on the following conjectured hardness result:
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{hypo}
\label{conj:graph}
There exists a constant $\eps_0>0$ such that given an undirected graph $G=(\vset,\edgeSet)$, computing $\numocc{G}{\tri}$ exactly cannot be done in time $o\inparen{|\edgeSet|^{1+\eps_0}}$.
\end{hypo}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
The so called {\em Triangle detection hypothesis} (cf.~\cite{triang-hard}), which states that detecting the presence of triangles in $G$ takes time $\Omega\inparen{|\edgeSet|^{4/3}}$, implies that in Conjecture~\ref{conj:graph} we can take $\eps_0\ge \frac 13$.

All of our hardness results rely on a simple lineage polynomial encoding of the edges of a graph.
To prove our hardness result, consider a graph $G=(\vset, \edgeSet)$, where $|\edgeSet| = m$, $\vset = [\numvar]$. Our lineage polynomial has a variable $X_i$ for every $i$ in $[\numvar]$.
Consider the polynomial
$\poly_{G}(\vct{X}) = \sum\limits_{(i, j) \in \edgeSet} X_i \cdot X_j.$
The hard polynomial for our problem will be a suitable power $k\ge 3$ of the polynomial above:
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Definition}\label{def:qk}
For any graph $G=(V,\edgeSet)$ and $\kElem\ge 1$, define
\[\poly_{G}^\kElem(X_1,\dots,X_n) = \left(\sum\limits_{(i, j) \in \edgeSet} X_i \cdot X_j\right)^\kElem.\]
\end{Definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\noindent Returning to \Cref{fig:two-step}, it is easy to see that $\poly_{G}^\kElem(\vct{X})$ is the lineage polynomial corresponding to the query that generalizes our example query from \Cref{sec:intro}. Let us alias
\begin{lstlisting}
SELECT 1 FROM OnTime a, Route r, OnTime b
WHERE a.city = r.city1 AND b.city = r.city2
\end{lstlisting}
as $R_i$ for each $i \in [k]$.  The query $\query^k$ then becomes
\begin{lstlisting}
SELECT COUNT(*) FROM $R_1$ JOIN $R_2$ JOIN$\cdots$JOIN $R_k$
\end{lstlisting}
\noindent Further, the PDB instance generalizes the one in \Cref{fig:two-step} as follows. Relation $OnTime$ has $n$ tuples corresponding to each vertex for $i$ in $[n]$, each with probability $\prob_i$ and $Route$ has tuples corresponding to the edges $\edgeSet$ (each with probability of $1$).\footnote{Technically, $\poly_{G}^\kElem(\vct{X})$ should have variables corresponding to tuples in $Route$ as well, but since they always are present with probability $1$, we drop those. Our argument also works when all the tuples in $Route$ also are present with probability $\prob$ but to simplify notation we assign probability $1$ to edges.}
In other words, for this instance $\dbbase$ contains the set of $n$ unary tuples in $OnTime$ (which corresponds to $\vset$) and $m$ binary tuples in $Route$ (which corresponds to $\edgeSet$).
Note that this implies that $\poly_{G}^\kElem$ is indeed a \abbrTIDB-lineage polynomial. % for a \abbrTIDB \abbrPDB.

Next, we note that the runtime for answering $\query^k$ on deterministic database $\dbbase$, as defined above, is $O(m)$ (i.e. deterministic query processing is `easy' for this query):
\begin{Lemma}\label{lem:tdet-om}
Let $\query^k$ and $\dbbase$ be as defined above. Then
% of \Cref{def:qk}, the runtime
$\qruntime{\query^k, \dbbase}$ is $O(\kElem\numedge)$.
\end{Lemma}

\subsection{Multiple Distinct $\prob$ Values}
\label{sec:multiple-p}
%Unless otherwise noted, all proofs for this section are in \Cref{app:single-mult-p}.
We are now ready to present our main hardness result.
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Theorem}\label{thm:mult-p-hard-result}
Let $\prob_0,\ldots,\prob_{2k}$ be $2k + 1$ distinct values in $(0, 1]$.  Then computing $\rpoly_G^\kElem(\prob_i,\dots,\prob_i)$ (over all $i\in [2k+1]$ for arbitrary $G=(\vset,\edgeSet)$
%and any $(2k+1)$ distinct values $\prob_i$ ($0\le i \le 2k$)
needs time $\bigOmega{\kmatchtime}$, assuming $\kmatchtime\ge \omega\inparen{\abs{\edgeSet}}$.
\end{Theorem}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
Note that the second row of \Cref{tab:lbs} follows from \Cref{prop:expection-of-polynom}, \Cref{thm:mult-p-hard-result}, \Cref{lem:tdet-om}, and \Cref{thm:k-match-hard} while the third row is proved by \Cref{prop:expection-of-polynom}, \Cref{thm:mult-p-hard-result}, \Cref{lem:tdet-om}, and \Cref{conj:known-algo-kmatch}. Since \Cref{conj:known-algo-kmatch} is non-standard, the latter hardness result should be interpreted as follows. Any substantial polynomial improvement for \Cref{prob:bag-pdb-poly-expected} (over the trivial algorithm that converts $\poly$ into SMB and then uses \Cref{cor:expct-sop} for \abbrStepTwo) would lead to an improvement over the state of the art {\em upper} bounds on  $\kmatchtime$. Finally, note that \Cref{thm:mult-p-hard-result} needs one to be able to compute the expected multiplicities over $(2k+1)$ distinct values of $p_i$, each of which corresponds to distinct $\pd$ (for the same $\dbbase$), which explain the `Multiple' entry in the second column in the second and third row in \Cref{tab:lbs}. Next, we argue how to get rid of this latter requirement.


%%% Local Variables:
%%% mode: latex
%%% TeX-master: "main"
%%% End: