Moved commented out material into the appendix.

master
Aaron Huber 2021-09-17 18:10:41 -04:00
parent e3faa018bc
commit ea8cb76bcd
7 changed files with 40 additions and 289 deletions

View File

@ -33,14 +33,17 @@ For the sake of contradiction, assume we can solve our problem in $\littleo{\kma
%= &\bigO{k}\cdot \littleo{\kmatchtime} + O(\kElem^3)\label{eq:proof-omega-kmatch3}\\
&\le \littleo{\kmatchtime}\label{eq:proof-omega-kmatch4}.
\end{align}
%Atri: The details below are fine to make sure our proofs are correct but the arguments below are bit too "basic" to include in an ICDT paper, I _think_
%We obtain \Cref{eq:proof-omega-kmatch2} by the assumption that $\kmatchtime \in \littleomega{\numedge}$, an assumption which is upheld by the assumption in \Cref{thm:k-match-hard} that $\#W[0]\neq\sharpwone$ and the further assumption that the best runtime to compute $k$-matchings is $\bigOmega{\numedge^{k/2}}$ in \cite{k-match}. \Cref{eq:proof-omega-kmatch3} then follows by the fact that $\bigO{\numedge}\in \littleo{\numedge}\in \littleo{\kmatchtime}$.
We obtain \Cref{eq:proof-omega-kmatch4} from the fact that $k$ is fixed (relative to $m$) and the assumption that $\kmatchtime\ge\omega(m)$.
%by the observation that $\bigO{k}\in\bigO{k^3}$ and $\bigO{k^3} \in \littleo{\numedge}$ since by definition of parameterized complexity we have $\frac{\numedge}{k}\rightarrow \infty$.
Thus we obtain the contradiction that we can achieve a runtime $\littleo{\kmatchtime}$ that is better than the optimal time $\kmatchtime$ required to compute $k$-matchings.
\qed
\end{proof}
\subsection{\Cref{lem:qEk-multi-p}}
\noindent The following lemma reduces the problem of counting $\kElem$-matchings in a graph to our problem (and proves \Cref{thm:mult-p-hard-result}):
\begin{Lemma}\label{lem:qEk-multi-p}
Let $\prob_0,\ldots, \prob_{2\kElem}$ be distinct values in $(0, 1]$. Then given the values $\rpoly_{G}^\kElem(\prob_i,\ldots, \prob_i)$ for $0\leq i\leq 2\kElem$, the number of $\kElem$-matchings in $G$ can be computed in $\bigO{\kElem^3}$ time.
\end{Lemma}
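\noindent To give some intuition for why $2\kElem+1$ evaluations suffice (a sketch only, under our reading of the construction; the full argument is the proof below): restricted to the diagonal $X_1=\cdots=X_\numvar=\prob$, the reduced polynomial is univariate of degree at most $2\kElem$,
\[
\rpoly_{G}^\kElem(\prob,\ldots,\prob) = \sum_{i=0}^{2\kElem} c_i\cdot \prob^i, \qquad\text{with } c_{2\kElem} = \kElem!\cdot\numocc{G}{\kmatch},
\]
since a monomial of $\poly_{G}^\kElem$ retains $2\kElem$ distinct variables exactly when its $\kElem$ edges are pairwise disjoint. The evaluations at the $2\kElem+1$ distinct points $\prob_0,\ldots,\prob_{2\kElem}$ thus form a $(2\kElem+1)\times(2\kElem+1)$ Vandermonde system in the unknowns $c_0,\ldots,c_{2\kElem}$, which Gaussian elimination solves in $\bigO{\kElem^3}$ time.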
\subsection{Proof of Lemma~\ref{lem:qEk-multi-p}}
\input{lem_mult-p}

View File

@ -100,6 +100,22 @@ Denote the vector $\vct{p}$ to be a vector whose elements are the individual pro
\end{align}
%
Recall that tuple blocks in a TIDB always have size 1, so the outer summation of \cref{eq:tidb-expectation} is over the full set of vectors.
\AH{Have cut and pasted the subsequent text. Need to verify this is the appropriate place for it.}
Let $\semNX$ denote the set of polynomials over variables $\vct{X}=(X_1,\dots,X_\numvar)$ with natural number coefficients and exponents.
We model incomplete relations using Green et al.'s $\semNX$-databases~\cite{DBLP:conf/pods/GreenKT07}, discussed in detail in \Cref{subsec:supp-mat-krelations}.
$\semNX$-databases are functions from tuples to elements of $\semNX$, typically called annotations.
Given an $\semNX$-database $\db$, it is common to use $\db(\tup)$ to denote the polynomial annotating tuple $\tup$ in $\db$.
%Note that based on this definition of $\rel$, $\rel(\tup)$ is the lineage polynomial for $\tup$.
Let $\numvar$ be the number of tuples in $\pdb$. Then, each possible world is defined by an assignment of $\numvar$ binary values $\vct{\wElem} \in \{0, 1\}^{\numvar}$ to $\vct{X}$.
The multiplicity of $\tup \in \db$, denoted $\db(\tup)(\vct{\wElem})$, is obtained by evaluating the polynomial annotating $\tup$ on $\vct{\wElem}$.
$\semNX$-relations are closed under $\raPlus$ (\Cref{fig:nxDBSemantics}).
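\noindent As a quick (hypothetical) illustration of this evaluation: if $\db(\tup) = X_1X_2 + 2X_3$, then for $\vct{\wElem} = (1, 1, 0)$ we get $\db(\tup)(\vct{\wElem}) = 1\cdot 1 + 2\cdot 0 = 1$, i.e., $\tup$ appears with multiplicity $1$ in that world, while for $\vct{\wElem} = (1, 1, 1)$ its multiplicity is $3$.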
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
We will use $\semNX$-\abbrPDB $\pxdb$, defined as the tuple $(\idb_{\semNX}, \pd)$, where $\semNX$-database $\idb_{\semNX}$ is paired with probability distribution $\pd$ over the assignments to $\vct{X}$.
We denote by $\polyForTuple$ the annotation of tuple $t$ in the result of $\query$ on an implicit $\semNX$-\abbrPDB (i.e., $\polyForTuple = \query(\pxdb)(t)$ for some $\pxdb$) and as before, interpret it as a function $\polyForTuple: \{0,1\}^{\numvar} \rightarrow \semN$ from vectors of variable assignments to the corresponding value of the annotating polynomial.
$\semNX$-\abbrPDB\xplural and a function $\rmod$ (which transforms an $\semNX$-\abbrPDB to a classical bag-\abbrPDB, or $\semN$-\abbrPDB~\cite{DBLP:conf/pods/GreenKT07,feng:2019:sigmod:uncertainty}) are both formalized in \Cref{subsec:supp-mat-background}.
\BG{Oliver's conjecture: Bag-\tis + Q can express any finite bag-PDB:
A well-known result for set-semantics PDBs is that while not all finite PDBs can be encoded as \tis, any finite PDB can be encoded using a \ti and a query. An analogous result holds in our case: any finite $\semN$-PDB can be encoded as a bag \ti and a query (WHAT CLASS? ADD PROOF)
}
@ -139,6 +155,14 @@ Follows by the construction of $\rpoly$ in \cref{def:reduced-bi-poly}.
\end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Definition}[Valid Worlds]
For probability distribution $\pd$, % and its corresponding probability mass function $\probOf$,
the set of valid worlds $\valworlds$ consists of all the worlds with probability value greater than $0$; i.e., for random world variable vector $\vct{W}$
\[
\valworlds = \comprehension{\vct{w}}{\probOf[\vct{W} = \vct{w}] > 0}
\]
\end{Definition}
\subsection{Proposition~\ref{proposition:q-qtilde}}\label{app:subsec-prop-q-qtilde}
\noindent Note the following fact:

View File

@ -12,15 +12,6 @@ The following approximation algorithm applies to \abbrBIDB lineage polynomials (o
We now introduce useful definitions and notation related to circuits and polynomials.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%\begin{Definition}[Variables in a monomial]\label{def:vars}
% Given a monomial $v$, we use $\var(v)$ to denote the set of variables in $v$.
%\end{Definition}
%\noindent For example the monomial $XY$ has $\var(XY)=\inset{X,Y}$.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Definition}[$\expansion{\circuit}$]\label{def:expand-circuit}
For a circuit $\circuit$, we define $\expansion{\circuit}$ as a list of tuples $(\monom, \coef)$, where $\monom$ is a set of variables and $\coef \in \domN$.
$\expansion{\circuit}$ has the following recursive definition ($\circ$ is list concatenation).
@ -46,15 +37,6 @@ Conveniently, $\abs{\circuit}\inparen{1,\ldots,1}$ gives us $\sum\limits_{\inpar
The functions \size and \depth output the number of gates and levels respectively for input \circuit.
\end{Definition}
%\begin{Definition}[\depth($\cdot$)]
%The function \depth has circuit $\circuit$ as input and outputs the number of levels in \circuit.
%\end{Definition}
%%%%%%%%%%%%%%%%%%%%%%%%%
%NEEDS to be moved to appendix
%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Definition}[$\degree(\cdot)$]\label{def:degree}\footnote{Note that the degree of $\polyf(\abs{\circuit})$ is always upper bounded by $\degree(\circuit)$, and the latter can be strictly larger (e.g., consider the case when $\circuit$ multiplies two copies of the constant $1$; here we have $\degree(\circuit)=1$ but the degree of $\polyf(\abs{\circuit})$ is $0$).}
$\degree(\circuit)$ is defined recursively as follows:
\[\degree(\circuit)=
@ -66,9 +48,6 @@ $\degree(\circuit)$ is defined recursively as follows:
\end{cases}
\]
\end{Definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%
%END move to appendix
%%%%%%%%%%%%%%%%%%%%%%%%%%
Finally, we use the following notation for the complexity of multiplying integers:
\begin{Definition}[$\multc{\cdot}{\cdot}$]\footnote{We note that when doing arithmetic operations on the RAM model for input of size $N$, we have that $\multc{O(\log{N})}{O(\log{N})}=O(1)$. More generally we have $\multc{N}{O(\log{N})}=O(N\log{N}\log\log{N})$.}
@ -145,64 +124,9 @@ Given a lineage polynomial $\poly(\vct{X})=\polyf(\circuit)$ for circuit \circui
\end{equation}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%
%NEED to move to appendix
%%%%%%%%%%%%%%%%%%%%%%%%%
%\input{app_approx-alg-pseudo-code}
%%%%%%%%%%%%%%%%%%%%%%%%%
%END move to appendix
%%%%%%%%%%%%%%%%%%%%%%%%%
Given the above, the algorithm is a sampling based algorithm for the above sum: we sample (via \sampmon) $(\monom,\coef)\in \expansion{\circuit}$ with probability proportional %\footnote{We could have also uniformly sampled from $\expansion{\circuit}$ but this gives better parameters.}
to $\abs{\coef}$ and compute $\vari{Y}=\indicator{\isInd{\encMon}}%\monom\mod{\mathcal{B}}\not\equiv 0}
Given the above, the algorithm is a sampling-based estimator of the above sum: we sample (via \sampmon) $(\monom,\coef)\in \expansion{\circuit}$ with probability proportional
to $\abs{\coef}$ and compute $\vari{Y}=\indicator{\isInd{\encMon}}
\cdot \prod_{X_i\in \monom} p_i$. Taking $\ceil{\frac{2 \log{\frac{2}{\conf}}}{\error^2}}$ samples and computing the average of $\vari{Y}$ gives us our final estimate. \onepass is used to compute the sampling probabilities needed in \sampmon (details are in \Cref{sec:proofs-approx-alg}).
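The sample count above can be justified by a standard Hoeffding argument (a sketch, assuming each sample $\vari{Y}$ lies in $[-1,1]$, i.e., that the sign of $\coef$ is folded into $\vari{Y}$): for $N$ independent samples $\vari{Y}_1,\ldots,\vari{Y}_N$ with empirical mean $\overline{\vari{Y}} = \frac{1}{N}\sum_{j=1}^{N}\vari{Y}_j$,
\[
\probOf\inparen{\abs{\overline{\vari{Y}} - \expct\pbox{\vari{Y}}} \ge \error} \le 2\exp\inparen{-\frac{N\error^2}{2}},
\]
which is at most $\conf$ whenever $N \ge \frac{2 \log{\frac{2}{\conf}}}{\error^2}$, matching the number of samples taken above.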
%\approxq (\Cref{alg:mon-sam}) modifies \circuit with a call to \onepass. It then samples from $\circuit_{\vari{mod}}\numsamp$ times and uses that information to approximate $\rpoly$.
%\subsubsection{Correctness}
%In order to prove \Cref{lem:approx-alg}, we will need to argue the correctness of \approxq, which relies on the correctness of auxiliary algorithms \onepass and \sampmon.
%\begin{Lemma}\label{lem:one-pass}
%The $\onepass$ function completes in time:
%$$O\left(\size(\circuit) \cdot \multc{\log\left(\abs{\circuit(1\ldots, 1)}\right)}{\log{\size(\circuit}}\right)$$
% $\onepass$ guarantees two post-conditions: First, for each subcircuit $\vari{S}$ of $\circuit$, we have that $\vari{S}.\vari{partial}$ is set to $\abs{\vari{S}}(1,\ldots, 1)$. Second, when $\vari{S}.\type = \circplus$, \subcircuit.\lwght $= \frac{\abs{\subcircuit_\linput}(1,\ldots, 1)}{\abs{\subcircuit}(1,\ldots, 1)}$ and likewise for \subcircuit.\rwght.
%\end{Lemma}
%To prove correctness of \Cref{alg:mon-sam}, we only use the following fact that follows from the above lemma: for the modified circuit ($\circuit_{\vari{mod}}$), $\circuit_{\vari{mod}}.\vari{partial}=\abs{\circuit}(1,\dots,1)$.
%\begin{Lemma}\label{lem:sample}
%The function $\sampmon$ completes in time
%$$O(\log{k} \cdot k \cdot \depth(\circuit)\cdot\multc{\log\left(\abs{\circuit}(1,\ldots, 1)\right)}{\log{\size(\circuit)}})$$
% where $k = \degree(\circuit)$. The function returns every $\left(\monom, sign(\coef)\right)$ for $(\monom, \coef)\in \expansion{\circuit}$ with probability $\frac{|\coef|}{\abs{\circuit}(1,\ldots, 1)}$.
%\end{Lemma}
%With the above two lemmas, we are ready to argue the following result (proof in \Cref{sec:proofs-approx-alg}):
%\begin{Theorem}\label{lem:mon-samp}
%For any $\circuit$ with $\degree(poly(|\circuit|)) = k$, algorithm \ref{alg:mon-sam} outputs an estimate $\vari{acc}$ of $\rpoly(\prob_1,\ldots, \prob_\numvar)$ such that
%\[\probOf\left(\left|\vari{acc} - \rpoly(\prob_1,\ldots, \prob_\numvar)\right|> \error \cdot \abs{\circuit}(1,\ldots, 1)\right) \leq \conf,\]
% in $O\left(\left(\size(\circuit)+\frac{\log{\frac{1}{\conf}}}{\error^2} \cdot k \cdot\log{k} \cdot \depth(\circuit)\right)\cdot \multc{\log\left(\abs{\circuit}(1,\ldots, 1)\right)}{\log{\size(\circuit)}}\right)$ time.
%\end{Theorem}
%\subsection{\onepass\ Algorithm}
%\label{sec:onepass}
%\noindent \onepass\ (Algorithm ~\ref{alg:one-pass-iter} in \Cref{sec:proofs-approx-alg}) iteratively visits each gate one time according to the topological ordering of \circuit annotating the \lwght, \rwght, and \prt variables of each node according to the definitions above. Lemma~\ref{lem:one-pass} is proved in \Cref{sec:proofs-approx-alg}.
%\subsection{\sampmon\ Algorithm}
%\label{sec:samplemonomial}
%A naive (slow) implementation of \sampmon\ would first compute $\expansion{\circuit}$ and then sample from it.
%Instead, \Cref{alg:sample} selects a monomial from $\expansion{\circuit}$ by top-down traversal of the input \circuit. More details on the traversal can be found in \Cref{subsec:sampmon-remarks}.
%
%$\sampmon$ is given in \Cref{alg:sample}, and a proof of its correctness (via \Cref{lem:sample}) is provided in \Cref{sec:proofs-approx-alg}.
%%%%%%%%%%%%%%%%%%%%%%%
%%% Local Variables:

View File

@ -4,22 +4,18 @@
\label{sec:hard}
In this section, we will prove the hardness results claimed in Table~\ref{tab:lbs} for a specific (family of) hard instances $(\query,\pdb)$ for \Cref{prob:bag-pdb-poly-expected} where $\pdb$ is a \abbrTIDB.
% that computing $\expct\pbox{\poly(\vct{W})}$ exactly for a \ti-lineage polynomial $\poly(\vct{X})$ generated from a project-join query (even an expression tree representation) is \sharpwonehard.
Note that this implies hardness for \bis and general \abbrBPDB, answering \Cref{prob:bag-pdb-poly-expected} (and hence the equivalent \Cref{prob:bag-pdb-query-eval}) in the negative.
%Furthermore, we demonstrate in \Cref{sec:single-p} that the problem remains hard, even if $\probOf[X_i=1] = \prob$ for all $X_i$ and any fixed valued $\prob \in (0, 1)$ as long as certain popular hardness conjectures in fine-grained complexity hold.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Preliminaries}\label{sec:hard:sub:pre}
Our hardness results are based on (exactly) counting the number of (not necessarily induced) subgraphs in $G$ isomorphic to $H$. Let $\numocc{G}{H}$ denote this quantity. We can think of $H$ as being of constant size and $G$ as growing. %In query processing, $H$ can be viewed as the query while $G$ as the database instance.
Our hardness results are based on (exactly) counting the number of (not necessarily induced) subgraphs in $G$ isomorphic to $H$. Let $\numocc{G}{H}$ denote this quantity. We can think of $H$ as being of constant size and $G$ as growing.
In particular, we will consider the problems of computing the following counts (given $G$ in its adjacency list representation): $\numocc{G}{\tri}$ (the number of triangles), $\numocc{G}{\threedis}$ (the number of $3$-matchings), and the latter's generalization $\numocc{G}{\kmatch}$ (the number of $k$-matchings). We use $\kmatchtime$ to denote the optimal runtime of computing $\numocc{G}{\kmatch}$. Our hardness results in \Cref{sec:multiple-p} are based on the following hardness results/conjectures:
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Theorem}[\cite{k-match}]
\label{thm:k-match-hard}
Given positive integer $k$ and undirected graph $G=(\vset,\edgeSet)$ with no self-loops or parallel edges, the time $\kmatchtime$ to compute $\numocc{G}{\kmatch}$ exactly is $\littleomega{f(k)\cdot |\edgeSet|^c}$ for any function $f$ and fixed constant $c$ independent of $\numedge$ and $k$ (assuming $\sharpwzero\ne\sharpwone$). %counting the number of $k$-matchings in $G$ is\sharpwonehard (parameterization is in $k$).
Given positive integer $k$ and undirected graph $G=(\vset,\edgeSet)$ with no self-loops or parallel edges, the time $\kmatchtime$ to compute $\numocc{G}{\kmatch}$ exactly is $\littleomega{f(k)\cdot |\edgeSet|^c}$ for any function $f$ and fixed constant $c$ independent of $\numedge$ and $k$ (assuming $\sharpwzero\ne\sharpwone$).
\end{Theorem}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%The above result means that we cannot hope to count the number of $k$-matchings in $G=(\vset,\edgeSet)$ in time $f(k)\cdot |\vset|^{c}$ for any function $f$ and constant $c$ independent of $k$.
\begin{hypo}\label{conj:known-algo-kmatch}
There exists an absolute constant $c_0>0$ such that for every $G=(\vset,\edgeSet)$, we have $\kmatchtime \ge \Omega\inparen{|E|^{c_0\cdot k}}$.
\end{hypo}
@ -35,8 +31,6 @@ There exists a constant $\eps_0>0$ such that given an undirected graph $G=(\vset
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
The so-called {\em Triangle detection hypothesis} (cf.~\cite{triang-hard}), which states that detecting whether $G$ has a triangle takes time $\Omega\inparen{|\edgeSet|^{4/3}}$, implies that in Conjecture~\ref{conj:graph} we can take $\eps_0\ge \frac{1}{3}$.
%The current best known algorithm to count the number of $3$-matchings, to
%\AR{Need to add something about 3-paths and 3-matchings as well.}
All of our hardness results rely on a simple lineage polynomial encoding of the edges of a graph.
To prove our hardness result, consider a graph $G=(\vset, \edgeSet)$, where $|\edgeSet| = m$, $\vset = [\numvar]$. Our lineage polynomial has a variable $X_i$ for every $i$ in $[\numvar]$.
@ -49,8 +43,6 @@ For any graph $G=(V,\edgeSet)$ and $\kElem\ge 1$, define
\[\poly_{G}^\kElem(X_1,\dots,X_n) = \left(\sum\limits_{(i, j) \in \edgeSet} X_i \cdot X_j\right)^\kElem.\]
\end{Definition}
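\noindent As a small worked example (our own illustration): let $G$ be the path $1-2-3-4$, i.e., $\edgeSet = \{(1,2), (2,3), (3,4)\}$, and let $\kElem = 2$. Then
\[
\poly_{G}^2(X_1,\dots,X_4) = \inparen{X_1X_2 + X_2X_3 + X_3X_4}^2,
\]
and the only monomial in its expansion with $2\kElem = 4$ distinct variables is $2\cdot X_1X_2X_3X_4$, arising (with coefficient $2!=2$) from the unique $2$-matching $\{(1,2), (3,4)\}$; every other monomial repeats a vertex variable.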
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%Our hardness results only need a \ti instance; We also consider the special case when all the tuple probabilities (probabilities assigned to $X_i$ by $\probAllTup$) are the same value. Note that our hardness results % do not require the general circuit representation and
%even hold for the expression trees. %this polynomial can be encoded in an expression tree of size $\Theta(km)$.
\noindent Returning to \Cref{fig:two-step}, it is easy to see that $\poly_{G}^\kElem(\vct{X})$ is the lineage polynomial corresponding to the query that generalizes our example query from \Cref{sec:intro}. Let us alias
\begin{lstlisting}
@ -61,22 +53,9 @@ as $R_i$ for each $i \in [k]$. The query $\query^k$ then becomes
\begin{lstlisting}
SELECT COUNT(*) FROM $R_1$ JOIN $R_2$ JOIN$\cdots$JOIN $R_k$
\end{lstlisting}
%RA format for the same query
%\begin{align*}
%\query^k_G \coloneqq &\inparen{\project_\emptyset\inparen{OnTime \join_{City = City_1} Route \join_{{City}_2 = City'}\rename_{City' \leftarrow City}(OnTime)}}\times_2\cdots\\
%&\cdots \times_k \inparen{\project_\emptyset\inparen{OnTime \join_{City = City_1} Route \join_{{City}_2 = City'}\rename_{City' \leftarrow City}(OnTime)}}
%\end{align*}
%\resizebox{1\linewidth}{!}{
%\begin{minipage}{1.05\linewidth}
%\[\poly^k_G\dlImp OnTime(C_1),Route(C_1, C_1'),OnTime(C_1'),\dots,OnTime(C_\kElem),Route(C_\kElem,C_\kElem'),OnTime(C_\kElem')\]
%\end{minipage}
%}
\noindent Further, the PDB instance generalizes the one in \Cref{fig:two-step} as follows. Relation $OnTime$ has $n$ tuples, one corresponding to each vertex $i \in [n]$, each with probability $\prob_i$, and $Route$ has tuples corresponding to the edges in $\edgeSet$ (each with probability $1$).\footnote{Technically, $\poly_{G}^\kElem(\vct{X})$ should have variables corresponding to tuples in $Route$ as well, but since they are always present with probability $1$, we drop those. Our argument also works when all the tuples in $Route$ are present with probability $\prob$, but to simplify notation we assign probability $1$ to edges.}
In other words, for this instance $\dbbase$ contains the set of $n$ unary tuples in $OnTime$ (which corresponds to $\vset$) and $m$ binary tuples in $Route$ (which corresponds to $\edgeSet$).
Note that this implies that $\poly_{G}^\kElem$
%our hard lineage polynomial can be represented as an expression tree produced by a project-join query with same probability value for each input tuple $\prob_i$, and hence
is indeed a lineage polynomial for a \abbrTIDB \abbrPDB.
Note that this implies that $\poly_{G}^\kElem$ is indeed a lineage polynomial for a \abbrTIDB \abbrPDB.
Next, we note that the runtime for \abbrStepOne with $\query^k$ and $\dbbase$ as defined above is $O(\kElem\numedge)$ (i.e., \abbrStepOne is `easy' for this query):
\begin{Lemma}\label{lem:tdet-om}
@ -85,16 +64,6 @@ Let $\query^k$ and $\dbbase$ be as defined above. Then
$\qruntime{\query^k, \dbbase}$ is $O(\kElem\numedge)$.
\end{Lemma}
%\begin{Corollary}\label{cor:at-least-kmatch}
%\end{Corollary}
%\begin{proof}[Proof of \Cref{cor:at-least-kmatch}
%\end{proof}
%
%\begin{Corollary}\label{cor:best-curr-algo}
%\end{Corollary}
%\begin{proof}[Proof of \Cref{cor:best-curr-algo}
%\end{proof}
\subsection{Multiple Distinct $\prob$ Values}
\label{sec:multiple-p}
%Unless otherwise noted, all proofs for this section are in \Cref{app:single-mult-p}.
@ -109,16 +78,7 @@ needs time $\bigOmega{\kmatchtime}$, assuming $\kmatchtime\ge \omega\inparen{\ab
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
Note that the second row of \Cref{tab:lbs} follows from \Cref{prop:expection-of-polynom}, \Cref{thm:mult-p-hard-result}, \Cref{lem:tdet-om}, and \Cref{thm:k-match-hard}, while the third row is proved by \Cref{prop:expection-of-polynom}, \Cref{thm:mult-p-hard-result}, \Cref{lem:tdet-om}, and \Cref{conj:known-algo-kmatch}. Since \Cref{conj:known-algo-kmatch} is non-standard, the latter hardness result should be interpreted as follows. Any substantial polynomial improvement for \Cref{prob:bag-pdb-poly-expected} (over the trivial algorithm that converts $\poly$ into SMB and then runs the obvious algorithm for \abbrStepTwo) would lead to an improvement over the state-of-the-art {\em upper} bounds on $\kmatchtime$. Finally, note that \Cref{thm:mult-p-hard-result} requires one to be able to compute the expected multiplicities over $(2k+1)$ distinct values of $p_i$, each of which corresponds to a distinct $\pd$ (for the same $\dbbase$), which explains the `Multiple' entry in the second column of the second and third rows of \Cref{tab:lbs}. Next, we argue how to get rid of this latter requirement.
%%%%%%%%%%%%%%%%%%%%%%%%%%%
%NEEDS to be moved to appendix
%%%%%%%%%%%%%%%%%%%%%%%%%%%
%\noindent The following lemma reduces the problem of counting $\kElem$-matchings in a graph to our problem (and proves \Cref{thm:mult-p-hard-result}):
%\begin{Lemma}\label{lem:qEk-multi-p}
%Let $\prob_0,\ldots, \prob_{2\kElem}$ be distinct values in $(0, 1]$. Then given the values $\rpoly_{G}^\kElem(\prob_i,\ldots, \prob_i)$ for $0\leq i\leq 2\kElem$, the number of $\kElem$-matchings in $G$ can be computed in $\bigO{\kElem^3}$ time.
%\end{Lemma}
%%%%%%%%%%%%%%%%%%%%%%%%%%%
%END move to appendix
%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% Local Variables:
%%% mode: latex

View File

@ -3,12 +3,10 @@
%\onecolumn
\subsection{Reduced Polynomials and Equivalences}
We now introduce some terminology % for polynomials
We now introduce some terminology
and develop a reduced form of lineage polynomials for a \abbrBIDB or \abbrTIDB.
%We will use $(X + Y)^2$ as a running example.
Note that a polynomial over $\vct{X}=(X_1,\dots,X_n)$ with individual degree $B <\infty$
%\footnote{The standard definition of polynomials requires a finite number of terms.} and $c_\vct{i} \in \domN$
is formally defined as: %(with $c_\vct{i} \in \domN$):
is formally defined as:
\begin{equation}
\label{eq:sop-form}
\poly\inparen{X_1,\dots,X_n}=\sum_{\vct{d}\in\{0,\ldots,B\}^n} c_{\vct{d}}\cdot \prod_{i=1}^n X_i^{d_i},
@ -33,62 +31,15 @@ Product terms in lineage arise only from join operations (\Cref{fig:nxDBSemantic
%in any clause of the $\raPlus$ query that created it.
We call a polynomial $\poly\inparen{\vct{X}}$ a \emph{\bi-lineage polynomial} (resp., \emph{\ti-lineage polynomial}, or simply lineage polynomial), if there exists a $\raPlus$ query $\query$, \bi (\ti) $\pdb$, and tuple $\tup$ such that $\poly\inparen{\vct{X}} = \apolyqdt\inparen{\vct{X}}.$
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%\begin{Definition}[Modding with a set]\label{def:mod-set}
%Let $S$ be a {\em set} of polynomials over $\vct{X}$. Then $\poly(\vct{X})\mod{S}$ is the polynomial obtained by taking the mod of $\poly(\vct{X})$ over {\em all} polynomials in $S$ (order does not matter).
%\end{Definition}
%For example for a set of polynomials $S=\inset{X^2-X, Y^2-Y}$, taking the polynomial $2X^2 + 3XY - 2Y^2\mod S$ yields $2X+3XY-2Y$.
%%
%\begin{Definition}[$\mathcal B$, $\mathcal T$]\label{def:mod-set-polys}
%Given the set of BIDB variables $\inset{X_{i,j}}$, define
%
%\setlength\parindent{0pt}
%\vspace*{-3mm}
%{\small
%\begin{tabular}{@{}l l}
% \begin{minipage}[b]{0.45\linewidth}
% \centering
% \begin{equation*}
% \mathcal{B}=\comprehension{X_{i,j}\cdot X_{i,j'}}{i \in [\ell], j\neq j' \in [~\abs{\block_i}~]}
% \end{equation*}
% \end{minipage}%
% \hspace{13mm}
% &
% \begin{minipage}[b]{0.45\linewidth}
% \centering
% \begin{equation*}
% \mathcal{T}=\comprehension{X_{i,j}^2-X_{i,j}}{i \in [\ell], j \in [~\abs{\block_i}~]}
% \end{equation*}
% \end{minipage}
% \\
%\end{tabular}
%}
%\end{Definition}
%%
\begin{Definition}[Reduced \bi Polynomials]\label{def:reduced-bi-poly}
Let $\poly(\vct{X})$ be a \bi-lineage polynomial.
The reduced form $\rpoly(\vct{X})$ of $\poly(\vct{X})$ is the same as \Cref{def:reduced-poly} with the added constraint that all monomials with variables $X_{\block, i}, X_{\block, j}, i\neq j$ from the same block $\block$ are omitted.
%: $\rpoly(\vct{X}) = \poly(\vct{X}) \mod \inparen{\mathcal{T} \cup \mathcal{B}}$
\end{Definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
Consider a $\abbrBIDB$ polynomial $\poly\inparen{\vct{X}} = X_{1, 1}X_{1, 2} + X_{1, 2}X_{2, 1}^2$. Then by \Cref{def:reduced-bi-poly}, we have that $\rpoly\inparen{\vct{X}} = X_{1, 2}X_{2, 1}$. Next, we show why the reduced form is useful for our purposes.
%, (recall the constraint on tuples from the same block being disjoint in a \bi).% any monomial containing more than one tuple from a block has $0$ probability and can be ignored).
%
%For the special case of \tis, the second step is not necessary since every block contains a single tuple.
%Alternatively, one can think of $\rpoly$ as the \abbrSMB of $\poly(\vct{X})$ when the product operator is idempotent.
%
% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% \begin{Definition}[$\rpoly(\vct{X})$] \label{def:qtilde}
% Define $\rpoly(X_1,\ldots, X_\numvar)$ as the reduced version of $\poly(X_1,\ldots, X_\numvar)$, of the form
% $\rpoly(X_1,\ldots, X_\numvar) = $
% \[\poly(X_1,\ldots, X_\numvar) \mod X_1^2-X_1\cdots\mod X_\numvar^2 - X_\numvar.\]
% \end{Definition}
% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%Removing this example to save space
\iffalse
\begin{Example}\label{example:qtilde}
@ -101,43 +52,8 @@ Consider $\poly(X, Y) = (X + Y)(X + Y)$ where $X$ and $Y$ are from different blo
\end{Example}
\fi
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
% Intuitively, $\rpoly(\textbf{X})$ is the \abbrSMB form of $\poly(\textbf{X})$ such that if any $X_j$ term has an exponent $e > 1$, it is reduced to $1$, i.e. $X_j^e\mapsto X_j$ for any $e > 1$.
%
%When considering $\bi$ input, it becomes necessary to redefine $\rpoly(\vct{X})$.
%
%\noindent The usefulness of this will reduction become clear in \Cref{lem:exp-poly-rpoly}.
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%NEEDS to be moved to the appendix
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%\begin{Definition}[Valid Worlds]
%For probability distribution $\pd$, % and its corresponding probability mass function $\probOf$,
%the set of valid worlds $\valworlds$ consists of all the worlds with probability value greater than $0$; i.e., for random world variable vector $\vct{W}$
%\[
%\valworlds = \comprehension{\vct{w}}{\probOf[\vct{W} = \vct{w}] > 0}
%\]
%\end{Definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%END move to appendix
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%We state additional equivalences between $\poly(\vct{X})$ and $\rpoly(\vct{X})$ in \Cref{app:subsec-pre-poly-rpoly} and \Cref{app:subsec-prop-q-qtilde}.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%Define all variables $X_i$ in $\poly$ to be independent.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Lemma}\label{lem:exp-poly-rpoly}
Let $\pdb$ be a \abbrBIDB over $\numvar$ input tuples such that the probability distribution $\pdassign$ over $\{0, 1\}^\numvar$ (the all-worlds set) is induced by the probability vector $\probAllTup = (\prob_1, \ldots, \prob_\numvar)$. As in \Cref{lem:tidb-reduce-poly} for \abbrTIDB, for any \abbrBIDB-lineage polynomial $\poly(\vct{X})$ based on $\pdb$ and query $\query$ we have:
@ -148,17 +64,6 @@ Let $\pdb$ be a \abbrBIDB over $\numvar$ input tuples such that the probability
\end{Lemma}
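\noindent As a sanity check on the example polynomial above (our own calculation): for the \abbrBIDB polynomial $\poly\inparen{\vct{X}} = X_{1, 1}X_{1, 2} + X_{1, 2}X_{2, 1}^2$,
\[
\expct\pbox{\poly\inparen{\vct{\randWorld}}} = \expct\pbox{\randWorld_{1,1}\randWorld_{1,2}} + \expct\pbox{\randWorld_{1,2}}\cdot\expct\pbox{\randWorld_{2,1}^2} = 0 + \prob_{1,2}\cdot\prob_{2,1} = \rpoly\inparen{\vct{\prob}},
\]
where the first term vanishes because $\tup_{1,1}$ and $\tup_{1,2}$ are disjoint, the factorization uses independence across blocks, and $\randWorld_{2,1}^2 = \randWorld_{2,1}$ since $\randWorld_{2,1}\in\{0,1\}$.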
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
By \Cref{lem:exp-poly-rpoly} and linearity of expectation, the following corollary results.
%Note that in the preceding lemma, we have assigned $\vct{p}$
%%(introduced in \Cref{subsec:def-data})
%to the variables $\vct{X}$. Intuitively, \Cref{lem:exp-poly-rpoly} states that when we replace each variable $X_i$ with its probability $\prob_i$ in the reduced form of a \bi-lineage polynomial and evaluate the resulting expression in $\mathbb{R}$, then the result is the expectation of the polynomial.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Corollary}\label{cor:expct-sop}

View File

@ -19,41 +19,10 @@ When the underlying DAG is a tree (with edges pointing towards the root), the st
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%As stated in \Cref{def:circuit}, every internal node has at most two incoming edges, is labeled as an addition or a multiplication node, and has no limit on its outdegree.
%Note that if we limit the outdegree to one, then we get expression trees.
The circuits in \Cref{fig:two-step} encode their respective polynomials in column $\poly$.
%\circuit in \Cref{fig:circuit-express-tree} encodes the polynomial $XY + WZ$.
Note that each circuit \circuit encodes a tree, with edges pointing towards the root.
%\begin{figure}[t]
% \begin{subfigure}[b]{0.45\linewidth}
% \centering
% \begin{tikzpicture}[thick]
% \node[tree_node] (a1) at (0, 0){$\boldsymbol{X}$};
% \node[tree_node] (b1) at (1, 0){$\boldsymbol{Y}$};
% \node[tree_node] (c1) at (2, 0){$\boldsymbol{W}$};
% \node[tree_node] (d1) at (3, 0){$\boldsymbol{Z}$};
%
% \node[tree_node] (a2) at (0.5, 1){$\boldsymbol{\circmult}$};
% \node[tree_node] (b2) at (2.5, 1){$\boldsymbol{\circmult}$};
%
% \node[tree_node] (a3) at (1.5, 2){$\boldsymbol{\circplus}$};
%
% \draw[->] (a1) -- (a2);
% \draw[->] (b1) -- (a2);
% \draw[->] (c1) -- (b2);
% \draw[->] (d1) -- (b2);
% \draw[->] (a2) -- (a3);
% \draw[->] (b2) -- (a3);
% \draw[->] (a3) -- (1.5, 2.5);
% \end{tikzpicture}
% \caption{Circuit encoding $XY + WZ$, a special case of an expression tree}
% \label{fig:circuit-express-tree}
% \end{subfigure}
% \hspace{5mm}
\begin{wrapfigure}{l}{0.45\linewidth}
\centering
\begin{tikzpicture}[thick]

View File

@ -11,39 +11,15 @@ An \textit{incomplete database} $\idb$ is a set of deterministic databases $\db$
A \textit{probabilistic database} $\pdb$ is a pair $(\idb, \pd)$ where $\idb$ is an incomplete database and $\pd$ is a probability distribution over $\idb$. Queries over probabilistic databases are evaluated using the so-called possible world semantics. Under the possible world semantics, the result of a query $\query$ over an incomplete database $\idb$ is the set of query answers produced by evaluating $\query$ over each possible world: $\query(\idb) = \comprehension{\query(\db)}{\db \in \idb}$.
For a probabilistic database $\pdb = (\idb, \pd)$, the result of a query is the pair $(\query(\idb), \pd')$ where $\pd'$ is a probability distribution over $\query(\idb)$ that assigns to each possible query result the sum of the probabilities of the worlds that produce this answer: $\forall \db' \in \query(\idb): \pd'(\db') = \sum_{\db \in \idb: \query(\db) = \db'} \pd(\db)$.
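As a minimal (hypothetical) illustration: if $\idb = \{\db_1, \db_2\}$ with $\pd(\db_1) = 0.7$ and $\pd(\db_2) = 0.3$, and $\query(\db_1) = \query(\db_2) = \db'$, then $\query(\idb) = \{\db'\}$ and $\pd'(\db') = 0.7 + 0.3 = 1$.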
%
%\[\forall \db' \in \query(\idb): \pd'(\db') = \sum_{\db \in \idb: \query(\db) = \db'} \pd(\db). \]
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%NEEDS to be moved to the appendix.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%Let $\semNX$ denote the set of polynomials over variables $\vct{X}=(X_1,\dots,X_\numvar)$ with natural number coefficients and exponents.
%We model incomplete relations using Green et. al.'s $\semNX$-databases~\cite{DBLP:conf/pods/GreenKT07}, discussed in detail in \Cref{subsec:supp-mat-krelations}.
% $\semNX$-databases are functions from tuples to elements of $\semNX$, typically called annotations.
%Given an $\semNX$-database $\db$, it is common to use $\db(\tup)$ to denote the polynomial annotating tuple $\tup$ in $\db$.
%%Note that based on this definition of $\rel$, $\rel(\tup)$ is the lineage polynomial for $\tup$.
%Let $\numvar$ be the number of tuples in $\pdb$. Then, each possible world is defined by an assignment of $\numvar$ binary values $\vct{\wElem} \in \{0, 1\}^{\numvar}$ to $\vct{X}$.
%The multiplicity of $\tup \in \db$, denoted $\db(\tup)(\vct{\wElem})$, is obtained by evaluating the polynomial annotating $\tup$ on $\vct{\wElem}$.
%$\semNX$-relations are closed under $\raPlus$ (\Cref{fig:nxDBSemantics}).
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
%We will use $\semNX$-\abbrPDB $\pxdb$, defined as the tuple $(\idb_{\semNX}, \pd)$, where $\semNX$-database $\idb_{\semNX}$ is paired with probability distribution $\pd$ over the assignments to $\vct{X}$.
%We denote by $\polyForTuple$ the annotation of tuple $t$ in the result of $\query$ on an implicit $\semNX$-\abbrPDB (i.e., $\polyForTuple = \query(\pxdb)(t)$ for some $\pxdb$) and as before, interpret it as a function $\polyForTuple: \{0,1\}^{\numvar} \rightarrow \semN$ from vectors of variable assignments to the corresponding value of the annotating polynomial.
%$\semNX$-\abbrPDB\xplural and a function $\rmod$ (which transforms an $\semNX$-\abbrPDB to a classical bag-\abbrPDB, or $\semN$-\abbrPDB~\cite{DBLP:conf/pods/GreenKT07,feng:2019:sigmod:uncertainty}) are both formalized in \Cref{subsec:supp-mat-background}.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%END: move to appendix.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Recall \Cref{fig:nxDBSemantics} which depicts the semantics for constructing a lineage polynomial $\apolyqdt$ for any $\raPlus$ query. We now make a meaningful connection between possible world semantics and world assignments on the lineage polynomial.
\begin{Proposition}[Expectation of polynomials]\label{prop:expection-of-polynom}
Given a \abbrBPDB $\pdb = (\idb,\pd)$ and lineage polynomial $\apolyqdt$ for arbitrary output tuple $\tup$, %$\semNX$-\abbrPDB $\pxdb = (\idb_{\semNX}',\pd')$ where $\rmod(\pxdb) = \pdb$,
Given a \abbrBPDB $\pdb = (\idb,\pd)$ and lineage polynomial $\apolyqdt$ for arbitrary output tuple $\tup$,
we have (denoting $\randDB$ as the random variable over $\idb$):
$ \expct_{\randDB \sim \pd}[\query(\randDB)(t)] = \expct_{\vct{\randWorld}\sim \pdassign}\pbox{\apolyqdt\inparen{\vct{\randWorld}}}. $
\end{Proposition}
\noindent A formal proof of \Cref{prop:expection-of-polynom} is given in \Cref{subsec:expectation-of-polynom-proof}.\footnote{Although \Cref{prop:expection-of-polynom} follows, e.g., as an obvious consequence of~\cite{IL84a}'s Theorem 7.1, we are unaware of any formal proof for bag-probabilistic databases.}
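\noindent As a minimal (hypothetical) instance of \Cref{prop:expection-of-polynom}: for a \abbrTIDB with a single tuple $\tup$ present with probability $\prob$ and a query returning $\tup$ unchanged, the left-hand side is $\prob\cdot 1 + (1-\prob)\cdot 0 = \prob$, while the right-hand side is $\expct\pbox{\randWorld_1} = \prob$.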
%This proposition shows that computing expected tuple multiplicities is equivalent to computing the expectation of a polynomial (for that tuple) from a probability distribution over all possible assignments of variables in the polynomial to $\{0,1\}$.
We focus on the problem of computing $\expct_\pdassign\pbox{\apolyqdt\inparen{\vct{\randWorld}}}$ from now on; we treat $\query, \dbbase, \tup$ as implicit and so drop the subscript from $\apolyqdt$ (i.e., $\poly\inparen{\vct{X}}$ will denote a polynomial).
\subsubsection{\tis and \bis}
@ -51,23 +27,13 @@ We focus on the problem of computing $\expct_\pdassign\pbox{\apolyqdt\inparen{\v
In this paper, we focus on two popular forms of \abbrPDB\xplural: Block-Independent (\bi) and Tuple-Independent (\ti) \abbrPDB\xplural.
%
A \bi $\pdb$ is a \abbrPDB with the constraint that
%(i) every tuple $\tup_i$ is annotated with a unique random variable $\randWorld_i \in \{0, 1\}$ and (ii) that
the tuples in $\dbbase$ can be partitioned into a set of $\ell$ blocks such that tuples $\tup_{i, j}, \tup_{k, j'}$ from separate blocks $(i\neq k, j \in [\abs{\block_i}], j' \in [\abs{\block_k}])$ are independent of each other while tuples $\tup_{i, j}, \tup_{i, k}$ from the same block are disjoint events.\footnote{
Although only a single independent, $[\abs{\block_i}+1]$-valued variable is customarily used per block~\cite{DBLP:series/synthesis/2011Suciu}, we decompose it into $\abs{\block_i}$ correlated $\{0,1\}$-valued variables per block that can be used directly in polynomials (without an indicator function). For $t_{i, j} \in b_i$, the event $(\randWorld_{i,j} = 1)$ corresponds to the event $(\randWorld_i = j)$ in the customary annotation scheme.
}
Each tuple $\tup_{i, j}$ is annotated with a random variable $\randWorld_{i, j} \in \{0, 1\}$ denoting its presence in a possible world $\db$. The probability distribution $\pd$ over $\dbbase$ is the one induced from the individual tuple probabilities $\prob_{i, j}\in \vct{\prob}=\inparen{\prob_{1, 1},\ldots,\prob_{\abs{\block}, \abs{\block_{\abs{\block}}}}}$ and the conditions on the blocks. A \abbrTIDB is a \abbrBIDB where each block has size exactly $1$.
Instead of looking only at the possible worlds of $\pdb$, one can consider all worlds, including those that cannot exist due to disjointness. The all-worlds set can be modeled by $\vct{\randWorld}\in \{0, 1\}^\numvar$,\footnote{Here and later on in the paper, especially in \Cref{sec:algo}, we will overload notation and rename the variables as $X_1,\dots,X_n$, where $n=\sum_{i=1}^\ell \abs{b_i}$.} such that $\randWorld_k \in \vct{\randWorld}$ represents the presence of $\tup_{i, j}$ (where $k = \sum_{i' = 1}^{i - 1} \abs{b_{i'}} + j$). We denote a probability distribution over all $\vct{\randWorld} \in \{0, 1\}^\numvar$ as $\pdassign$. When $\pdassign$ is induced from each $\prob_{i, j}$ while assigning $\probOf\pbox{\vct{\randWorld}} = 0$ to any $\vct{\randWorld}$ with $\randWorld_{i, j} = \randWorld_{i, k} = 1$ for some block $i$ and $j\neq k$, we obtain a probability-preserving bijection between the possible worlds of $\pd$ and the vectors in the support of $\pdassign$, i.e., the two distributions are equivalent.
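For instance (our own illustration): a single block $\block_1 = \{\tup_{1,1}, \tup_{1,2}\}$ with $\prob_{1,1} = 0.3$ and $\prob_{1,2} = 0.5$ has three possible worlds, $\emptyset$, $\{\tup_{1,1}\}$, and $\{\tup_{1,2}\}$; the induced $\pdassign$ assigns $\probOf\pbox{(0,0)} = 0.2$, $\probOf\pbox{(1,0)} = 0.3$, $\probOf\pbox{(0,1)} = 0.5$, and $\probOf\pbox{(1,1)} = 0$, so the vectors with nonzero probability are in probability-preserving correspondence with the possible worlds.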
%that $\forall i \in \abs{\block}, \forall j\neq k \in [\block_i] \suchthat \db\inparen{\tup_{i, j}} = 0 \vee \db\inparen{\tup_{i, k} = 0}$.In other words, each random variable corresponds to the event of a single tuple's presence.
%A \emph{\ti} is a \bi where each block contains exactly one tuple.
\Cref{subsec:supp-mat-ti-bi-def} explains \abbrTIDB\xplural and \abbrBIDB\xplural in greater detail.
%%
%In a \bi (and by extension a \ti), tuples are partitioned into $\ell$ blocks $\block_1, \ldots, \block_\ell$ where tuple $t_{i,j} \in \block_i$ is associated with a probability $\prob_{\tup_{i,j}} = \probOf[X_{i,j} = 1]$, and is annotated with a unique variable $X_{i,j}$.
%Because blocks are independent and tuples from the same block are disjoint, the probabilities $\prob_{\tup_{i,j}}$ and the blocks induce the probability distribution $\pd$ of $\pdb$.
%We will write a \bi-lineage polynomial $\poly(\vct{X})$ for a \bi with $\ell$ blocks as
%$\poly(\vct{X})$ = $\poly(X_{1, 1},\ldots, X_{1, \abs{\block_1}},$ $\ldots, X_{\ell, \abs{\block_\ell}})$, where $\abs{\block_i}$ denotes the size of $\block_i$.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% Local Variables: