Merge branch 'master' of gitlab.odin.cse.buffalo.edu:ahuber/SketchingWorlds

master
Boris Glavic 2020-12-19 00:21:03 -06:00
commit 61b46da038
11 changed files with 629 additions and 550 deletions

View File

@ -97,7 +97,7 @@ series = {PODS '07}
@inproceedings{BD05,
@inproceedings{DBLP:conf/sigmod/BoulosDMMRS05,
author = {Jihad Boulos and
Nilesh N. Dalvi and
Bhushan Mandhani and
@ -300,3 +300,33 @@ numpages = {12}
pages = {5--16},
year = {2017}
}
@inproceedings{GL16,
author = {Paolo Guagliardo and
Leonid Libkin},
booktitle = {PODS},
title = {Making {SQL} Queries Correct on Incomplete Databases: A Feasibility
Study},
year = {2016}
}
@inproceedings{jampani2008mcdb,
author = {Jampani, Ravi and Xu, Fei and Wu, Mingxi and Perez, Luis Leopoldo and Jermaine, Christopher and Haas, Peter J},
booktitle = {SIGMOD},
title = {{MCDB}: a {Monte Carlo} approach to managing uncertain data},
year = {2008}
}
@article{yang:2015:pvldb:lenses,
author = {Yang, Ying and Meneghetti, Niccolò and Fehling, Ronny and Liu, Zhen Hua and Gawlick, Dieter and Kennedy, Oliver},
title = {Lenses: An On-Demand Approach to {ETL}},
journal = {pVLDB},
volume = {8},
number = {12},
year = {2015},
pages = {1578--1589}
}

View File

@ -1,12 +1,17 @@
%root: main.tex
%!TEX root=./main.tex
\section{$1 \pm \epsilon$ Approximation Algorithm}\label{sec:algo}
In~\cref{sec:hard}, we showed that computing the expected multiplicity of a compressed representation of a bag polynomial for TIDB (even just based on project-join queries) is unlikely to be possible in linear time (\cref{thm:mult-p-hard-result}), even if all tuples have the same probability of being present (\cref{cor:single-p-hard}). Given this, in this section we will design an approximation algorithm for our problem that runs in {\em linear time}. Unlike the results in~\cref{sec:hard} our approximation algorithm works for BIDB though our bounds are more meaningful for a non-trivial subclass of BIDB that includes TIDB as well as PDB benchmarks (\cref{sec:experiments}).
In~\Cref{sec:hard}, we showed that computing the expected multiplicity of a compressed representation of a bag polynomial for \ti (even just based on project-join queries) is unlikely to be possible in linear time (\Cref{thm:mult-p-hard-result}), even if all tuples have the same probability of being present (\Cref{th:single-p-hard}).
Given this, in this section we design an approximation algorithm for our problem that runs in {\em linear time}.
Unlike the results in~\Cref{sec:hard}, our approximation algorithm works for \bi, though our bounds are more meaningful for a non-trivial subclass of \bis that contains both \tis and the PDBench benchmark.
%it is then desirable to have an algorithm to approximate the multiplicity in linear time, which is what we describe next.
\subsection{Preliminaries and some more notation}
First, let us introduce some useful definitions and notation related to polynomials and their representations. For illustrative purposes in the definitions below, we will use the following {\em bivariate} polynomial:
First, let us introduce some useful definitions and notation related to polynomials and their representations. For illustrative purposes in the definitions below, we use the following %{\em bivariate}
polynomial:
\begin{equation}
\label{eq:poly-eg}
\poly(X, Y) = 2X^2 + 3XY - 2Y^2.
@ -24,18 +29,17 @@ For example the monomial $XY$ has $\var(XY)=\inset{X,Y}$.
%tree, whose internal nodes are from the set $\{+, \times\}$, with leaf nodes being either from the set $\mathbb{R}$ $(\tnum)$ or from the set of monomials $(\var)$. The members of $\etree$ are \type, \val, \vari{partial}, \vari{children}, and \vari{weight}, where \type is the type of value stored in the node $\etree$ (i.e. one of $\{+, \times, \var, \tnum\}$, \val is the value stored, and \vari{children} is the list of $\etree$'s children where $\etree_\lchild$ is the left child and $\etree_\rchild$ the right child. Remaining fields hold values whose semantics we will fix later. When $\etree$ is used as input of ~\Cref{alg:mon-sam} and ~\Cref{alg:one-pass}, the values of \vari{partial} and \vari{weight} will not be set. %SEMANTICS FOR \etree: \vari{partial} is the sum of $\etree$'s coefficients , n, and \vari{weight} is the probability of $\etree$ being sampled.
%\end{Definition}
%Note that $\etree$ need not encode an expression in the standard monomial basis. For instance, $\etree$ could represent a compressed form of the polynomial in~\cref{eq:poly-eg}, such as $(x + 2y)(2x - y)$.
%Note that $\etree$ need not encode an expression in the standard monomial basis. For instance, $\etree$ could represent a compressed form of the polynomial in~\Cref{eq:poly-eg}, such as $(x + 2y)(2x - y)$.
\begin{Definition}[$\polyf(\cdot)$]\label{def:poly-func}
Let $\polyf(\cdot)$ denote the function that takes as input an expression tree $\etree$ and outputs its corresponding polynomial. $\polyf(\cdot)$ is recursively defined on $\etree$ as follows, where $\etree_\lchild$ and $\etree_\rchild$ denote the left and right child of $\etree$ respectively.
%
% \begin{align*}
% &\etree.\type = +\mapsto&& \polyf(\etree_\lchild) + \polyf(\etree_\rchild)\\
% &\etree.\type = \times\mapsto&& \polyf(\etree_\lchild) \cdot \polyf(\etree_\rchild)\\
% &\etree.\type = \var \text{ OR } \tnum\mapsto&& \etree.\val
% \end{align*}
%
\begin{equation*}
\polyf(\etree) = \begin{cases}
\polyf(\etree_\lchild) + \polyf(\etree_\rchild) &\text{ if \etree.\type } = +\\
@ -44,20 +48,19 @@ Denote $\polyf(\etree)$ to be the function that takes as input expression tree $
\end{cases}
\end{equation*}
\end{Definition}
%
Note that addition and multiplication above follow the standard interpretation over polynomials.
%Specifically, when adding two monomials whose variables and respective exponents agree, the coefficients corresponding to the monomials are added and their sum is multiplied to the monomial. Multiplication here is denoted by concatenation of the monomial and coefficient. When two monomials are multiplied, the product of each corresponding coefficient is computed, and the variables in each monomial are multiplied, i.e., the exponents of like variables are added. Again we notate this by the direct product of coefficient product and all disitinct variables in the two monomials, with newly computed exponents.
%\begin{Definition}[Expression Tree Set]\label{def:express-tree-set}$\etreeset{\smb}$ is the set of all possible expression trees $\etree$, such that $poly(\etree) = \poly(\vct{X})$.
%\end{Definition}
%
%For the polynomial in~\cref{eq:poly-eg}, $\etreeset{\smb}$ would include the following (represented as their corresponding expression trees): $2x^2 + 3xy - 2y^2, (x + 2y)(2x - y), x(2x - y) + 2y(2x - y), 2x(x + 2y) - y(x + 2y)$. Note that \cref{def:express-tree-set} implies that for any expression tree $\etree$, we have $\etree \in \etreeset{poly(\etree)}$.
%For the polynomial in~\Cref{eq:poly-eg}, $\etreeset{\smb}$ would include the following (represented as their corresponding expression trees): $2x^2 + 3xy - 2y^2, (x + 2y)(2x - y), x(2x - y) + 2y(2x - y), 2x(x + 2y) - y(x + 2y)$. Note that \Cref{def:express-tree-set} implies that for any expression tree $\etree$, we have $\etree \in \etreeset{poly(\etree)}$.
\begin{Definition}[Expanded T]\label{def:expand-tree}
$\expandtree{\etree}$ is the (pure) sum of products expansion of $\etree$, which we formally define next. The logical view of \expandtree{\etree} ~is a list of tuples $(\monom, \coef)$, where $\monom$ is a monomial and $\coef$ is in $\mathbb{R}$. \expandtree{\etree} has the following recursive definition (where $\circ$ is list concatenation).
\end{Definition}
%
% recursively defined as
% \begin{align*}
% &\etree.\type = + \mapsto&& \elist{\expandtree{\etree_\lchild}, \expandtree{\etree_\rchild}}\\
@ -74,12 +77,13 @@ $\expandtree{\etree}$ is the (pure) sum of products expansion of $\etree$, which
\elist{(\{\etree.\val\}, 1)} &\textbf{ if }\etree.\type = \var.\\
\end{cases}
\end{align*}
\end{Definition}
%where that the multiplication of two tuples %is the standard multiplication over monomials and the standard multiplication over coefficients to produce the product tuple, as in
%is their direct product $(\monom_1, \coef_1) \cdot (\monom_2, \coef_2) = (\monom_1 \cdot \monom_2, \coef_1 \times \coef_2)$ such that monomials $\monom_1$ and $\monom_2$ are concatenated in a product operation, while the standard product operation over reals applies to $\coef_1 \times \coef_2$. The product of $\expandtree{\etree_\lchild} \cdot \expandtree{\etree'_\rchild}$ is then the cross product of the multiplication of all such tuples returned to both $\expandtree{\etree_\lchild}$ and $\expandtree{\etree_\rchild}$. %The operator $\otimes$ is defined as the cross-product tuple multiplication of all such tuples returned by both $\expandtree{\etree_\lchild}$ and $\expandtree{\etree_\rchild}$.
\begin{Example}\label{example:expr-tree-T}
Consider the factorized representation $(X+ 2Y)(2X - Y)$ of the polynomial in~\cref{eq:poly-eg}. Its expression tree $\etree$ is illustrated in Figure ~\ref{fig:expr-tree-T}. The pure expansion of the product is $2X^2 - XY + 4XY - 2Y^2$ and the $\expandtree{\etree}$ is $[(2, X^2), (-1, XY), (4, XY), (-2, Y^2)]$.
Consider the factorized representation $(X+ 2Y)(2X - Y)$ of the polynomial in~\Cref{eq:poly-eg}. Its expression tree $\etree$ is illustrated in Figure~\ref{fig:expr-tree-T}. The pure expansion of the product is $2X^2 - XY + 4XY - 2Y^2$ and the $\expandtree{\etree}$ is $[(X^2, 2), (XY, -1), (XY, 4), (Y^2, -2)]$.
\end{Example}
@ -126,7 +130,7 @@ For any expression tree $\etree$, the corresponding
{\em positive tree}, denoted $\abs{\etree}$, is obtained from $\etree$ as follows. For each leaf node $\ell$ of $\etree$ where $\ell.\type$ is $\tnum$, update $\ell.\val$ to $|\ell.\val|$. %value $\coef$ of each coefficient leaf node in $\etree$ is set to %$\coef_i$ in $\etree$ is exchanged with its absolute value$|\coef|$.
\end{Definition}
Using the same factorization from ~\cref{example:expr-tree-T}, $poly(\abs{\etree}) = (X + 2Y)(2X + Y) = 2X^2 +XY +4XY + 2Y^2 = 2X^2 + 5XY + 2Y^2$. Note that this \textit{is not} the same as the polynomial from~\cref{eq:poly-eg}.
Using the same factorization from~\Cref{example:expr-tree-T}, $\polyf(\abs{\etree}) = (X + 2Y)(2X + Y) = 2X^2 +XY +4XY + 2Y^2 = 2X^2 + 5XY + 2Y^2$. Note that this \textit{is not} the same as the polynomial from~\Cref{eq:poly-eg}.
\begin{Definition}[Evaluation]\label{def:exp-poly-eval}
Given an expression tree $\etree$ and $\vct{v} \in \mathbb{R}^\numvar$, $\etree(\vct{v}) = poly(\etree)(\vct{v})$.
@ -138,11 +142,11 @@ Given an expression tree $\etree$ and $\vct{v} \in \mathbb{R}^\numvar$, $\etree(
In the subsequent subsections we will prove the following theorem.
\begin{Theorem}\label{lem:approx-alg}
Let $\etree$ be an expression tree for a UCQ over BIDB and define $\poly(\vct{X})=\polyf(\etree)$ and let $k=\degree(\poly)$
%Let $\poly(\vct{X})$ be a query polynomial corresponding to the output of a UCQ in a BIDB.
Let $\etree$ be an expression tree for a UCQ over \bi and define $\poly(\vct{X})=\polyf(\etree)$ and let $k=\degree(\poly)$.
%Let $\poly(\vct{X})$ be a query polynomial corresponding to the output of a UCQ in a \bi.
An estimate $\mathcal{E}$ %=\approxq(\etree, (p_1,\dots,p_\numvar), \conf, \error')$
of $\rpoly(\prob_1,\ldots, \prob_\numvar)$ can be computed in time
\[O\left(\treesize(\etree) + \frac{\log{\frac{1}{\conf}}\cdot \abs{\etree}^2(1,\ldots, 1)\cdot k\cdot \log{k} \cdot depth(\etree))}{\inparen{\error'}^2\cdot\rpoly^2(\prob_1,\ldots, \prob_\numvar)}\right),\]
\[O\left(\treesize(\etree) + \frac{\log{\frac{1}{\conf}}\cdot \abs{\etree}^2(1,\ldots, 1)\cdot k\cdot \log{k} \cdot depth(\etree)}{\inparen{\error'}^2\cdot\rpoly^2(\prob_1,\ldots, \prob_\numvar)}\right)\]
such that
\begin{equation}
\label{eq:approx-algo-bound}
@ -153,10 +157,10 @@ P\left(\left|\mathcal{E} - \rpoly(\prob_1,\dots,\prob_\numvar)\right|> \error' \
The proof of~\Cref{lem:approx-alg} can be found in~\Cref{sec:proofs-approx-alg}.
It turns out that to get linear runtime results from~\cref{lem:approx-alg}, we will need to define another parameter (which roughly counts the (weighted) number of monomials in $\expandtree{\etree}$ that get `canceled' when modded with $\mathcal{B}$):
It turns out that to get linear runtime results from~\Cref{lem:approx-alg}, we will need to define another parameter (which roughly counts the (weighted) number of monomials in $\expandtree{\etree}$ that get `canceled' when modded with $\mathcal{B}$):
\begin{Definition}[Parameter $\gamma$]\label{def:param-gamma}
Given an expression tree $\etree$, define
\[\gamma(\etree)=\frac{\sum_{(\monom, \coef)\in \expandtree{\etree}} \abs{\coef}\cdot \onesymbol\inparen{\monom\mod{\mathcal{B}}\equiv 0}}{\abs{\etree}(1,\ldots, 1)}\]
\[\gamma(\etree)=\frac{\sum_{(\monom, \coef)\in \expandtree{\etree}} \abs{\coef}\cdot \indicator{\monom\mod{\mathcal{B}}\equiv 0}}{\abs{\etree}(1,\ldots, 1)}\]
\end{Definition}
%\AH{This....combined with \Cref{def:mod-set-polys} is \emph{really} nice notation!}
\AR{Need to make sure use of indicator variable $\onesymbol$ above is consistent with the rest of the paper.}
@ -164,40 +168,41 @@ Given an expression tree $\etree$, define
We next present a couple of corollaries of~\Cref{lem:approx-alg}.
\begin{Corollary}
\label{cor:approx-algo-const-p}
Let $\poly(\vct{X})$ be as in~\Cref{lem:approx-alg} and let $\gamma=\gamma(\etree)$. Further let it be the case that $p_i\ge p_0$ for all $i\in[\numvar]$. Then an estimate $\mathcal{E}$ of $\rpoly(\prob_1,\ldots, \prob_\numvar)$ satisfying~\cref{eq:approx-algo-bound} can be computed in time
Let $\poly(\vct{X})$ be as in~\Cref{lem:approx-alg} and let $\gamma=\gamma(\etree)$. Further let it be the case that $p_i\ge p_0$ for all $i\in[\numvar]$. Then an estimate $\mathcal{E}$ of $\rpoly(\prob_1,\ldots, \prob_\numvar)$ satisfying~\Cref{eq:approx-algo-bound} can be computed in time
\[O\left(\treesize(\etree) + \frac{\log{\frac{1}{\conf}}\cdot k\cdot \log{k} \cdot depth(\etree)}{\inparen{\error'}^2\cdot(1-\gamma)^2\cdot p_0^{2k}}\right)\]
In particular, if $p_0>0$ and $\gamma<1$ are absolute constants then the above runtime simplifies to $O_k\left(\frac 1{\eps^2}\cdot\treesize(\etree)\cdot \log{\frac{1}{\conf}}\right)$.
\end{Corollary}
The proof for~\Cref{cor:approx-algo-const-p} can be seen in~\Cref{sec:proofs-approx-alg}.
We note that the restriction on $\gamma$ is satisfied by TIDB (where $\gamma=0$) and for some BIDB benchmarks (see~\Cref{sec:experiments} for more on this claim).
We note that the restriction on $\gamma$ is satisfied by \ti (where $\gamma=0$) as well as by the three queries of the popular PDBench \bi benchmark (see \Cref{app:subsec:experiment}).
\AH{I am thinking that perhaps the terminology and presentation of~\Cref{sec:experiments} may need word-smithing to clearly illustrate the $\bi$ benchmarks satisfied--although the substance is already written there.}
\AR{Yes! E.g. $\gamma$ is not used at all in~\Cref{sec:experiments}}
\AR{{\bf Boris/Oliver:} Is there a way to claim that all probabilities in practice are actually constants: i.e. they do not increase with the number of tuples?}
\OK{@Atri: This seems like a reasonable claim. It's too late for me to come up with a reasonable motivation (maybe something will come to me in the morning), but the intuition for me is that each tuple/block is independent... it would be hard for that to be the case if the probability were a function of the number of tuples.}
\subsection{Approximating $\rpoly$}
The algorithm to prove~\Cref{lem:approx-alg} follows from the following observation. Given a query polynomial $\poly(\vct{X})=\polyf(\etree)$ for expression tree $\etree$ over $\bi$, we note that we can exactly represent $\rpoly(\vct{X})$ as follows:
\begin{equation}
\label{eq:tilde-Q-bi}
\rpoly\inparen{X_1,\dots,X_\numvar}=\sum_{(v,c)\in \expandtree{\etree}} \onesymbol\inparen{\monom\mod{\mathcal{B}}\not\equiv 0}\cdot c\cdot\prod_{X_i\in \var\inparen{v}} X_i.
\rpoly\inparen{X_1,\dots,X_\numvar}=\sum_{(v,c)\in \expandtree{\etree}} \indicator{v\mod{\mathcal{B}}\not\equiv 0}\cdot c\cdot\prod_{X_i\in \var\inparen{v}} X_i.
\end{equation}
Given the above, the algorithm is a sampling based algorithm for the above sum: we sample $(v,c)\in \expandtree{\etree}$ with probability proportional\footnote{We could have also uniformly sampled from $\expandtree{\etree}$ but this gives better parameters.}
%\AH{Regarding the footnote, is there really a difference? I \emph{suppose} technically, but in this case they are \emph{effectively} the same. Just wondering.}
%\AR{Yes, there is! If we used uniform distribution then in our bounds we will have a parameter that depends on the largest $\abs{coef}$, which e.g. could be dependent on $n$. But with the weighted probability distribution, we avoid paying this price. Though I guess perhaps we can say for the kinds of queries we consider thhese coefficients are all constants?}
to $\abs{c}$ and compute $Y=\onesymbol\inparen{\monom\mod{\mathcal{B}}\not\equiv 0}\cdot \prod_{X_i\in \var\inparen{v}} p_i$. Taking enough samples and computing the average of $Y$ gives us our final estimate. Algorithm~\ref{alg:mon-sam} has the details.
to $\abs{c}$ and compute $Y=\indicator{v\mod{\mathcal{B}}\not\equiv 0}\cdot \prod_{X_i\in \var\inparen{v}} p_i$. Taking enough samples and computing the average of $Y$ gives us our final estimate. Algorithm~\ref{alg:mon-sam} has the details.
\OK{Even if the proof is offloaded to the appendix, it would be useful to state the formula for $N$ (line 4 of \Cref{alg:mon-sam}), along with a pointer to the appendix.}
%We state the approximation algorithm in terms of a $\bi$.
%\subsubsection{Description}
%Algorithm ~\ref{alg:mon-sam} approximates $\rpoly$ using the following steps. First, a call to $\onepass$ on its input $\etree$ produces a non-biased weight distribution over the monomials of $\expandtree{\etree}$ and a correct count of $|\etree|(1,\ldots, 1)$, i.e., the number of monomials in $\expandtree{\etree}$. Next, ~\cref{alg:mon-sam} calls $\sampmon$ to sample one monomial and its sign from $\expandtree{\etree}$. The sampling is repeated $\ceil{\frac{2\log{\frac{2}{\delta}}}{\epsilon^2}}$ times, where each of the samples are evaluated with input $\vct{p}$, multiplied by $1 \times sign$, and summed. The final result is scaled accordingly returning an estimate of $\rpoly$ with the claimed $(\error, \conf)$-bound of ~\cref{lem:mon-samp}.
%Algorithm ~\ref{alg:mon-sam} approximates $\rpoly$ using the following steps. First, a call to $\onepass$ on its input $\etree$ produces a non-biased weight distribution over the monomials of $\expandtree{\etree}$ and a correct count of $|\etree|(1,\ldots, 1)$, i.e., the number of monomials in $\expandtree{\etree}$. Next, ~\Cref{alg:mon-sam} calls $\sampmon$ to sample one monomial and its sign from $\expandtree{\etree}$. The sampling is repeated $\ceil{\frac{2\log{\frac{2}{\delta}}}{\epsilon^2}}$ times, where each of the samples are evaluated with input $\vct{p}$, multiplied by $1 \times sign$, and summed. The final result is scaled accordingly returning an estimate of $\rpoly$ with the claimed $(\error, \conf)$-bound of ~\Cref{lem:mon-samp}.
%\AR{Seems like the notation below belongs to the notation section (if we decide to state this explicitly at all)?}
%\AH{Yes, I only included this per your request a few months ago. Based on @lordpretzel removing my definition of monomial, perhaps we can assume that the reader understands the notation below. I \emph{think} this should be a reasonable assumption.}
%Recall that the notation $[x, y]$ denotes the range of values between $x$ and $y$ inclusive. The notation $\{x, y\}$ denotes the set of values consisting of $x$ and $y$.
%\subsubsection{Psuedo Code}
%Original TIDB Algorithm
%Original \ti Algorithm
%\begin{algorithm}[H]
% \caption{$\approxq$($\etree$, $\vct{p}$, $\conf$, $\error$)}
% \label{alg:mon-sam}
@ -209,7 +214,7 @@ to $\abs{c}$ and compute $Y=\onesymbol\inparen{\monom\mod{\mathcal{B}}\not\equiv
% \Ensure \vari{acc} $\in \mathbb{R}$
% \State $\accum \gets 0$\label{alg:mon-sam-global1}
% \State $\numsamp \gets \ceil{\frac{2 \log{\frac{2}{\conf}}}{\error^2}}$\label{alg:mon-sam-global2}
% \State $(\vari{\etree}_\vari{mod}, \vari{size}) \gets $ \onepass($\etree$)\label{alg:mon-sam-onepass}\Comment{$\onepass$ is ~\cref{alg:one-pass} \;and \sampmon \; is ~\cref{alg:sample}}\newline
% \State $(\vari{\etree}_\vari{mod}, \vari{size}) \gets $ \onepass($\etree$)\label{alg:mon-sam-onepass}\Comment{$\onepass$ is ~\Cref{alg:one-pass} \;and \sampmon \; is ~\Cref{alg:sample}}\newline
% \For{\vari{i} \text{ in } $1\text{ to }\numsamp$}\Comment{Perform the required number of samples}
% \State $(\vari{M}_\vari{i}, \vari{sgn}_\vari{i}) \gets $ \sampmon($\etree_\vari{mod}$)\label{alg:mon-sam-sample}
% \State $\vari{Y}_\vari{i} \gets 1$\label{alg:mon-sam-assign1}
@ -225,7 +230,7 @@ to $\abs{c}$ and compute $Y=\onesymbol\inparen{\monom\mod{\mathcal{B}}\not\equiv
% \end{algorithmic}
%\end{algorithm}
%BIDB Version of Approximation Algorithm
%\bi Version of Approximation Algorithm
\begin{algorithm}[H]
@ -242,12 +247,12 @@ to $\abs{c}$ and compute $Y=\onesymbol\inparen{\monom\mod{\mathcal{B}}\not\equiv
%\State $\vari{sample}_\vari{next} \gets 0$
\State $\accum \gets 0$\label{alg:mon-sam-global1}
\State $\numsamp \gets \ceil{\frac{2 \log{\frac{2}{\conf}}}{\error^2}}$\label{alg:mon-sam-global2}
\State $(\vari{\etree}_\vari{mod}, \vari{size}) \gets $ \onepass($\etree$)\label{alg:mon-sam-onepass}\Comment{$\onepass$ is ~\cref{alg:one-pass}}
\State $(\vari{\etree}_\vari{mod}, \vari{size}) \gets $ \onepass($\etree$)\label{alg:mon-sam-onepass}\Comment{$\onepass$ is ~\Cref{alg:one-pass}}
%\newline
%\State $\vari{i} \gets 1$
\For{$\vari{i} \in 1 \text{ to }\numsamp$}\label{alg:sampling-loop}\Comment{Perform the required number of samples}
%\State $\bivec \gets [0]^{\abs{\block}}$\Comment{$\bivec$ is an array whose size is the number of blocks, used to check for cross-terms}\newline
\State $(\vari{M}, \vari{sgn}_\vari{i}) \gets $ \sampmon($\etree_\vari{mod}$)\label{alg:mon-sam-sample}\Comment{\sampmon \; is ~\cref{alg:sample}}
\State $(\vari{M}, \vari{sgn}_\vari{i}) \gets $ \sampmon($\etree_\vari{mod}$)\label{alg:mon-sam-sample}\Comment{\sampmon \; is ~\Cref{alg:sample}}
%\For{$\vari{x}_\vari{\block,i}$ \text{ in } $\vari{M}$}
% \If{$\bivec[\block] = 1$}\label{alg:mon-sam-check}\Comment{If we have already had a variable from this block, $\rpoly$ drops the sample.}
% \newline
@ -318,12 +323,12 @@ to $\abs{c}$ and compute $Y=\onesymbol\inparen{\monom\mod{\mathcal{B}}\not\equiv
\subsubsection{Correctness}
In order to prove~\Cref{lem:approx-alg}, we will need to argue the correctness of~\cref{alg:mon-sam}. Before we formally do that,
we first state the lemmas that summarize the relevant properties of $\onepass$ and $\sampmon$, the auxiliary algorithms on which ~\cref{alg:mon-sam} relies. Their proofs are given in~\Cref{sec:onepass} and~\Cref{sec:samplemonomial} respectively.
In order to prove~\Cref{lem:approx-alg}, we will need to argue the correctness of~\Cref{alg:mon-sam}. Before we formally do that,
we first state the lemmas that summarize the relevant properties of $\onepass$ and $\sampmon$, the auxiliary algorithms on which ~\Cref{alg:mon-sam} relies. Their proofs are given in~\Cref{sec:onepass} and~\Cref{sec:samplemonomial} respectively.
\begin{Lemma}\label{lem:one-pass}
The $\onepass$ function completes in $O(size(\etree))$ time. After $\onepass$ returns the following post conditions hold. First, for each subtree $\vari{S}$ of $\etree$, we have that $\vari{S}.\vari{partial}$ is set to $\abs{\vari{S}}(1,\ldots, 1)$. Second, when $\vari{S}.\val = +$, each $\vari{child}$ of $\vari{S}$, $\vari{child}.\vari{weight}$ is set to $\frac{\abs{\vari{S}_{\vari{child}}}(1,\ldots, 1)}{\abs{\vari{S}}(1,\ldots, 1)}$. % is correctly computed for each child of $\vari{S}.$
The $\onepass$ function completes in $O(\treesize(\etree))$ time. After $\onepass$ returns, the following postconditions hold. First, for each subtree $\vari{S}$ of $\etree$, we have that $\vari{S}.\vari{partial}$ is set to $\abs{\vari{S}}(1,\ldots, 1)$. Second, when $\vari{S}.\type = +$, for each $\vari{child}$ of $\vari{S}$, $\vari{child}.\vari{weight}$ is set to $\frac{\abs{\vari{S}_{\vari{child}}}(1,\ldots, 1)}{\abs{\vari{S}}(1,\ldots, 1)}$. % is correctly computed for each child of $\vari{S}.$
\end{Lemma}
In proving correctness of~\Cref{alg:mon-sam}, we will only use the following fact (which follows from the above lemma), $\etree_{\vari{mod}}.\vari{partial}=\abs{\etree}(1,\dots,1)$.
%\AH{I'm wondering if there is a better notation to use here. I myself got confused by my own notation of $\etree_{\vari{mod}}$. \emph{But}, we need to to be referencing the modified $\etree$ returned by $\onepass$ in the algorithm, so maybe this is the best we can do?}
@ -387,133 +392,12 @@ Algorithm ~\ref{alg:one-pass} essentially implements the above definitions.
%\subsubsection{Psuedo Code}
%See algorithm ~\ref{alg:one-pass} for details.
\begin{algorithm}[h!]
\caption{\onepass$(\etree)$}
\label{alg:one-pass}
\begin{algorithmic}[1]
\Require \etree: Binary Expression Tree
\Ensure \etree: Binary Expression Tree
\Ensure \vari{sum} $\in \mathbb{R}$
\If{$\etree.\type = +$}\label{alg:one-pass-equality1}
\State $\accum \gets 0$\label{alg:one-pass-plus-assign1}
\For{$child$ in $\etree.\vari{children}$}\Comment{Sum up all children coefficients}
\State $(child, \vari{s}) \gets \onepass(child)$
\State $\accum \gets \accum + \vari{s}$\label{alg:one-pass-plus-add}
\EndFor
\State $\etree.\vari{partial} \gets \accum$\label{alg:one-pass-plus-assign2}
\For{$child$ in $\etree.\vari{children}$}\Comment{Record distributions for each child}
\State $child.\vari{weight} \gets \frac{child.\vari{partial}}{\etree.\vari{partial}}$\label{alg:one-pass-plus-prob}
\EndFor
%\State $\vari{sum} \gets \etree.\vari{partial}$\label{alg:one-pass-plus-assign3}
\State \Return (\etree, \etree.\vari{partial})
\ElsIf{$\etree.\type = \times$}\label{alg:one-pass-equality2}
\State $\accum \gets 1$\label{alg:one-pass-times-assign1}
\For{$child \text{ in } \etree.\vari{children}$}\Comment{Compute the product of all children coefficients}
\State $(child, \vari{s}) \gets \onepass(child)$
\State $\accum \gets \accum \times \vari{s}$\label{alg:one-pass-times-product}
\EndFor
\State $\etree.\vari{partial}\gets \accum$\label{alg:one-pass-times-assign2}
%\State $\vari{sum} \gets \etree.\vari{partial}$\label{alg:one-pass-times-assign3}
\State \Return (\etree, \etree.\vari{partial})
\ElsIf{$\etree.\type = \tnum$}\Comment{Base case}\label{alg:one-pass-equality3}
\State $\vari{sum} \gets |\etree.\val|$\label{alg:one-pass-leaf-assign1}\Comment{This step effectively converts $\etree$ into $\abs{\etree}$}
\State \Return (\etree, \vari{sum})
\Else\Comment{$\etree.\type = \var$}\label{alg:one-pass-equality4}
%\State $\vari{sum} \gets 1$\label{alg:one-pass-global-assign}
\State \Return (\etree,$1$) % \vari{sum})
\EndIf
\end{algorithmic}
\end{algorithm}
\begin{Example}\label{example:one-pass}
Let $\etree$ encode the expression $(x_1 + x_2)(x_1 - x_2) + x_2^2$. After one pass, \cref{alg:one-pass} would have computed the following weight distribution. For the two children of the root $+$ node $\etree$, $\etree_\lchild.\wght = \frac{4}{5}$ and $\etree_\rchild.\wght = \frac{1}{5}$. Similarly, let $\stree$ denote the left-subtree of $\etree_{\lchild}$, $\stree_\lchild.\wght = \stree_\rchild.\wght = \frac{1}{2}$. This is depicted in~\Cref{fig:expr-tree-T-wght}. %Note that in this example, the sampling probabilities for the children of each inner $+$ node of $\stree$ are equal to one another because both parents have the same number of children, and, in each case, the children of each parent $+$ node share the same $|\coef_i|$.
\end{Example}
\begin{figure}[h!]
\begin{tikzpicture}[thick, every tree node/.style={default_node, thick, draw=black, black, circle, text width=0.3cm, font=\bfseries, minimum size=0.65cm}, every child/.style={black}, edge from parent/.style={draw, thick},
level 1/.style={sibling distance=0.95cm},
level 2/.style={sibling distance=0.7cm},
%level 2+/.style={sibling distance=0.625cm}
%level distance = 1.25cm,
%sibling distance = 1cm,
%every node/.append style = {anchor=center}
]
\Tree [.\node(root){$\boldsymbol{+}$};
\edge [wght_color] node[midway, auto= right, font=\bfseries, gray] {$\bsym{\frac{4}{5}}$}; [.\node[highlight_color](tl){$\boldsymbol{\times}$};
[.\node(s){$\bsym{+}$};
\edge[wght_color] node[pos=0.35, left, font=\bfseries, gray]{$\bsym{\frac{1}{2}}$}; [.\node[highlight_color](sl){$\bsym{x_1}$}; ]
\edge[wght_color] node[pos=0.35, right, font=\bfseries, gray]{$\bsym{\frac{1}{2}}$}; [.\node[highlight_color](sr){$\bsym{x_2}$}; ]
]
[.\node(sp){$\bsym{+}$};
\edge[wght_color] node[pos=0.35, left, font=\bfseries, gray]{$\bsym{\frac{1}{2}}$}; [.\node[highlight_color](spl){$\bsym{x_1}$}; ]
\edge[wght_color] node[pos=0.35, right, font=\bfseries, gray]{$\bsym{\frac{1}{2}}$}; [.\node[highlight_color](spr){$\bsym{\times}$};
[.$\bsym{-1}$ ] [.$\bsym{x_2}$ ]
]
]
]
\edge [wght_color] node[midway, auto=left, font=\bfseries, gray] {$\bsym{\frac{1}{5}}$}; [.\node[highlight_color](tr){$\boldsymbol{\times}$};
[.$\bsym{x_2}$
\edge [draw=none]; [.\node[draw=none]{}; ]
\edge [draw=none]; [.\node[draw=none]{}; ]
]
[.$\bsym{x_2}$ ] ]
]
% labels for plus node children, with arrows
\node[left=2pt of sl, highlight_color, inner sep=0pt] (sl-label) {$\stree_\lchild$};
\draw[highlight_color] (sl) -- (sl-label);
\node[right=2pt of sr, highlight_color, inner sep=0pt] (sr-label) {$\stree_\rchild$};
\draw[highlight_color] (sr) -- (sr-label);
\node[below left=2pt of spl, inner sep=0pt, highlight_color](spl-label) {$\stree_\lchild'$};
\draw[highlight_color] (spl) -- (spl-label);
\node[right=2pt of spr, highlight_color, inner sep=0] (spr-label) {$\stree_\rchild'$};
\draw[highlight_color] (spr) -- (spr-label);
\node[above left=2pt of tl, inner sep=0pt, highlight_color] (tl-label) {$\etree_\lchild$};
\draw[highlight_color] (tl) -- (tl-label);
\node[above right=2pt of tr, highlight_color, inner sep=0pt] (tr-label) {$\etree_\rchild$};
\node[above = 2pt of root, highlight_color, inner sep=0pt, font=\bfseries] (root-label) {$\etree$};
\node[above = 2pt of s, highlight_color, inner sep=0pt, font=\bfseries] (s-label) {$\stree$};
\node[above = 2pt of sp, highlight_color, inner sep=0pt, font=\bfseries] (sp-label) {$\stree'$};
\draw[highlight_color] (tr) -- (tr-label);
% \draw[<-|, highlight_color] (s) -- (s-label);
% \draw[<-|, highlight_color] (sp) -- (sp-label);
% \draw[<-|, highlight_color] (root) -- (root-label);
%\node[above right=0.7cm of TR, highlight_color, inner sep=0pt, font=\bfseries] (tr-comment) {$\etree_\rchild$};
% \draw[<-|, highlight_color] (TR) -- (tr-comment);
\end{tikzpicture}
% \begin{tikzpicture}[thick, level distance=1.2cm, level 1/.style={sibling distance= 5cm}, level 2/.style={sibling distance=3cm}, level 3/.style={sibling distance=1.5cm}, level 4/.style={sibling distance= 1cm}, every child/.style={black}]
% \node[tree_node](root) {$\boldsymbol{+}$}
% child[red]{node[tree_node](tl) {$\boldsymbol{\times}$}
% child{node[tree_node] {$\boldsymbol{+}$}
% child{node[tree_node]{$\boldsymbol{x_1}$} }
% child{node[tree_node] {$\boldsymbol{x_2}$}}
% }
% child{node[tree_node] {$\boldsymbol{+}$}
% child{node[tree_node] {$\boldsymbol{x_1}$}}
% %child[missing]{node[tree_node] {$\boldsymbol{1}$}}
% child[red]{node[tree_node] {$\boldsymbol{\times}$}
% child{node[tree_node] {$\boldsymbol{-1}$}}
% child{node[tree_node] {$\boldsymbol{x_2}$}}
% }
% }
% }
% child{node[tree_node] {$\boldsymbol{\times}$} edge from parent [red]
% child{node[tree_node] {$\boldsymbol{x_2}$}}
% child{node[tree_node] {$\boldsymbol{x_2}$}}
% };
% \node[font=\bfseries, red] at (-2.8, -0.2) {$\etree_\lchild.\wght \boldsymbol{= \frac{4}{5} } $};
% \end{tikzpicture}
\caption{Weights computed by $\onepass$ in ~\cref{example:one-pass}.
%\AH{I fixed the labels; @atri, let me know if you would rather have the labels positioned in alternative locations.}
%\AR{Looks good-- thanks!}
}
\label{fig:expr-tree-T-wght}
\end{figure}
We prove the correctness of Algorithm ~\ref{alg:one-pass} by proving~\Cref{lem:one-pass} in~\Cref{sec:proofs-approx-alg}.
For an example of how $\onepass$ works, the pseudocode, and the proof of correctness (\Cref{lem:one-pass}) of Algorithm~\ref{alg:one-pass}, see~\Cref{sec:proofs-approx-alg}.
\subsection{\sampmon\ Algorithm}
\label{sec:samplemonomial}
@ -568,11 +452,9 @@ See algorithm ~\ref{alg:sample} for the details of $\sampmon$ algorithm.
\end{algorithm}
We argue the correctness of Algorithm~\ref{alg:sample} by proving~\Cref{lem:sample} in~\Cref{sec:proofs-approx-alg}.
% \subsection{Experimental results}
% \label{sec:experiments}
% We conducted an experiment running modified TPCH queries over uncertain data generated by pdbench~\cite{pdbench}, both of which (data and queries) represent what is typically encountered in practice. Queries were run two times, once filtering $\bi$ cancellations, and then second not filtering the cancellations. The purpose of this was to determine an indication for how many $\bi$ cancellations occur in practice. Details and results can be found in~.
\subsection{Experimental results}
\label{sec:experiments}
\input{experiments}
%\AR{Experimental stuff about BIDB should go in here}
%\AR{Experimental stuff about \bi should go in here}
%%%%%%%%%%%%%%%%%%%%%%%

View File

@ -20,13 +20,7 @@ We first note that since expression trees are a special case of them, all of our
For the approximation algorithm in~\Cref{sec:algo} we note that \textsc{Approx}\textsc{imate}$\rpoly$ (\Cref{alg:mon-sam}) works for lineage circuits as long as the same guarantees on $\onepass$ and $\sampmon$ (\Cref{lem:one-pass} and \Cref{lem:sample} respectively) hold for lineage circuits as well. It turns out that both $\onepass$ and $\sampmon$ work for lineage circuits as well, simply because the only property these use for expression trees is that each node has two children. This is still valid for lineage circuits, where for each non-source node the children correspond to the two nodes that have incoming edges to the given node. Put another way, our argument never used the fact that in an expression tree, each node has at most one parent.
More specifically consider $\onepass$. The algorithm (as well as its analysis) basically uses the fact that one can compute the corresponding polynomial at all $1$s input with a simple recursive formula (\cref{eq:T-all-ones}), and that we can compute a probability distribution based on these weights (as in~\cref{eq:T-weights}). It can be verified that all the arguments go through if we replace $\etree_\lchild$ and $\etree_\rchild$ for expression tree $\etree$ with the two incoming nodes of the sink for the given lineage circuit. Another way to look at this is we could `unroll' the recursion in $\onepass$ and think of the algorithm as doing the evaluation at each node bottom up from leaves to the root in the expression tree. For lineage circuits, we start from the source nodes and do the computation in the topological order till we reach the sink(s).
The argument for $\sampmon$ is similar. We argued that $\onepass$ works as intended for lineage circuits because~\Cref{alg:one-pass} only recurses on children of the current node in the expression tree, and so it generalizes to lineage circuits by recursing on the two children of the current node in the lineage circuit. Alternatively, as we have already used in the proof of~\Cref{lem:sample}, we can think of the sampling algorithm as sampling a sub-graph of the expression tree. For lineage circuits, we can think of $\sampmon$ as sampling the same sub-graph. Alternatively, one can implicitly expand the lineage circuit into a (larger but) equivalent expression tree. Since $\sampmon$ only explores one sub-graph during its run, we can think of its run on a lineage circuit as being done on the implicit equivalent expression tree\footnote{
Recall that $\sampmon$ scales only in the depth of the expression and its polynomial degree ($k$). There exist polynomials that can be encoded in size $O(\log k)$, but we follow convention in assuming that the circuit size is asymptotically larger than $k$ and thus treat the degree (i.e., join width) as a constant.
}. Hence, all of the results on $\sampmon$ on expression trees carry over to lineage circuits.
Thus, we have argued that~\Cref{lem:approx-alg} also holds if we use a lineage circuit instead of an expression tree as the input to our approximation algorithm.
For further discussion on why~\Cref{lem:approx-alg} holds for a lineage circuit, see~\Cref{app:lineage-circuit-ext}.
\subsubsection{The cost model}
\label{sec:cost-model}
@ -44,9 +38,8 @@ We adopt a minimalistic compute-bound model of query evaluation drawn from worst
Under this model the query plan $Q(D)$ has runtime $O(\qruntime{Q(D)})$.
Base relations assume that a full table scan is required; we model index scans by treating an index scan query $\sigma_\theta(R)$ as a single base relation.
It can be verified that the worst-case join algorithms~\cite{skew,ngo-survey}, as well as query evaluation via factorized databases~\cite{factorized-db} (and work on FAQs~\cite{DBLP:conf/pods/KhamisNR16}) can be modeled as select-union-project-join queries (though these queries can be data dependent).\footnote{This claim can be verified by e.g. simply looking at the {\em Generic-Join} algorithm in~\cite{skew} and {\em factorize} algorithm in~\cite{factorized-db}.} Further, it can be verified that the above cost model on the corresponding SUPJ join queries correctly captures their runtime.
\AH{I am used to folks using the order SPJU, is this ordering of operations a `standard' that we should follow?}
\AR{Am not sure if we need to motivate the cost model more.}
It can be verified that the worst-case join algorithms~\cite{skew,ngo-survey}, as well as query evaluation via factorized databases~\cite{factorized-db} (and work on FAQs~\cite{DBLP:conf/pods/KhamisNR16}) can be modeled as select-union-project-join queries (though these queries can be data dependent).\footnote{This claim can be verified by e.g. simply looking at the {\em Generic-Join} algorithm in~\cite{skew} and {\em factorize} algorithm in~\cite{factorized-db}.} Further, it can be verified that the above cost model on the corresponding SPJU join queries correctly captures their runtime.
%We now make a simple observation on the above cost model:
%\begin{proposition}
%\label{prop:queries-need-to-output-tuples}
@ -56,7 +49,7 @@ It can be verified that the worst-case join algorithms~\cite{skew,ngo-survey}, a
\subsubsection{Lineage circuit for query plans}
\label{sec:circuits-formal}
We now define a lineage circuit more formally and also show how to construct a lineage circuit given a SUPJ query $Q$.
We now define a lineage circuit more formally and also show how to construct a lineage circuit given a SPJU query $Q$.
As mentioned earlier, we represent lineage polynomials with arithmetic circuits over $\mathbb N$ with $+$, $\times$.
A circuit for query $Q$ is a directed acyclic graph $\tuple{V_Q, E_Q, \phi_Q, \ell_Q}$ with vertices $V_Q$ and directed edges $E_Q \subset V_Q^2$.
@ -66,140 +59,23 @@ We require that $\phi_Q$'s range be limited to sink vertices (i.e., vertices wit
A function $\ell_Q : V_Q \rightarrow \{\;+,\times\;\}\cup \mathbb N \cup \vct X$ assigns a label to each node: Source nodes (i.e., vertices with in-degree 0) are labeled with constants or variables (i.e., $\mathbb N \cup \vct X$), while the remaining nodes are labeled with the symbol $+$ or $\times$.
We require that vertices have an in-degree of at most two.
\newcommand{\getpoly}[1]{\textbf{poly}\inparen{#1}}
Each vertex $v \in V_Q$ in the arithmetic circuit for $\tuple{V_Q, E_Q, \phi_Q, \ell_Q}$ encodes a polynomial, realized as
\AH{We already have a function named poly (not in bold however). Is \textbf{poly} enough to convey to the reader that this is a \emph{different} function, or is another name a better idea ?}
$$\getpoly{v} = \begin{cases}
\sum_{v' : (v',v) \in E_Q} \getpoly{v'} & \textbf{if } \ell_Q(v) = +\\
\prod_{v' : (v',v) \in E_Q} \getpoly{v'} & \textbf{if } \ell_Q(v) = \times\\
\ell_Q(v) & \textbf{otherwise}
\end{cases}$$
For the specifics on how lineage circuits are translated to represent polynomials see~\Cref{app:subsec-rep-poly-lin-circ}.
\newcommand{\caseheading}[1]{\smallskip \noindent \textbf{#1}.~}
We define the circuit for a select-union-project-join $Q$ recursively by cases as follows. In each case, let $\tuple{V_{Q_i}, E_{Q_i}, \phi_{Q_i}, \ell_{Q_i}}$ denote the circuit for subquery $Q_i$.
\caseheading{Base Relation}
Let $Q$ be a base relation $R$. We define one node for each tuple. Formally, let $V_Q = \comprehension{v_t}{t\in R}$, let $\phi_Q(t) = v_t$, let $\ell_Q(v_t) = R(t)$, and let $E_Q = \emptyset$.
This circuit has $|R|$ vertices.
\caseheading{Selection}
Let $Q = \sigma_\theta \inparen{Q_1}$.
We re-use the circuit for $Q_1$. %, but define a new distinguished node $v_0$ with label $0$ and make it the sink node for all tuples that fail the selection predicate.
Formally, let $V_Q = V_{Q_1}$ and let $\ell_Q(v) = \ell_{Q_1}(v)$ for any $v \in V_{Q_1}$. Let $E_Q = E_{Q_1}$, and define
$$\phi_Q(t) =
\phi_{Q_1}(t) \text{ for } t \text{ s.t.}\; \theta(t).$$
Dead sinks are iteratively removed, and so
%\AH{While not explicit, I assume a reviewer would know that the notation above discards tuples/vertices not satisfying the selection predicate.}
%v_0 & \textbf{otherwise}
%\end{cases}$$
this circuit has at most $|V_{Q_1}|$ vertices.
\caseheading{Projection}
Let $Q = \pi_{\vct A} {Q_1}$.
We extend the circuit for ${Q_1}$ with a new set of sum vertices (i.e., vertices with label $+$) for each tuple in $Q$, and connect them to the corresponding sink nodes of the circuit for ${Q_1}$.
Naively, let $V_Q = V_{Q_1} \cup \comprehension{v_t}{t \in \pi_{\vct A} {Q_1}}$, let $\phi_Q(t) = v_t$, and let $\ell_Q(v_t) = +$. Finally let
$$E_Q = E_{Q_1} \cup \comprehension{(\phi_{Q_1}(t'), v_t)}{t = \pi_{\vct A} t', t' \in {Q_1}, t \in \pi_{\vct A} {Q_1}}$$
This formulation will produce vertices with an in-degree greater than two, a problem that we correct by replacing every vertex with an in-degree over two by an equivalent fan-in tree. The resulting structure has at most $|{Q_1}|-1$ new vertices.
% \AH{Is the rightmost operator \emph{supposed} to be a $-$? In the beginning we add $|\pi_{\vct A}{Q_1}|$ vertices.}
The corrected circuit thus has at most $|V_{Q_1}|+|{Q_1}|$ vertices.
\caseheading{Union}
Let $Q = {Q_1} \cup {Q_2}$.
We merge graphs and produce a sum vertex for all tuples in both sides of the union.
Formally, let $V_Q = V_{Q_1} \cup V_{Q_2} \cup \comprehension{v_t}{t \in {Q_1} \cap {Q_2}}$, let $\ell_Q(v_t) = +$, and let
$$E_Q = E_{Q_1} \cup E_{Q_2} \cup \comprehension{(\phi_{Q_1}(t), v_t), (\phi_{Q_2}(t), v_t)}{t \in {Q_1} \cap {Q_2}}$$
$$\phi_Q(t) = \begin{cases}
v_t & \textbf{if } t \in {Q_1} \cap {Q_2}\\
\phi_{Q_1}(t) & \textbf{if } t \not \in {Q_2}\\
\phi_{Q_2}(t) & \textbf{if } t \not \in {Q_1}\\
\end{cases}$$
This circuit has $|V_{Q_1}|+|V_{Q_2}|+|{Q_1} \cap {Q_2}|$ vertices.
\caseheading{$k$-ary Join}
Let $Q = {Q_1} \bowtie \ldots \bowtie {Q_k}$.
We merge graphs and produce a multiplication vertex for all tuples resulting from the join.
Naively, let $V_Q = V_{Q_1} \cup \ldots \cup V_{Q_k} \cup \comprehension{v_t}{t \in {Q_1} \bowtie \ldots \bowtie {Q_k}}$, let
{\small
\begin{multline*}
E_Q = E_{Q_1} \cup \ldots \cup E_{Q_k} \cup
\left\{\;
(\phi_{Q_1}(\pi_{\sch({Q_1})}t), v_t), \right.\\
\ldots, (\phi_{Q_k}(\pi_{\sch({Q_k})}t), v_t)
\;\left|\;t \in {Q_1} \bowtie \ldots \bowtie {Q_k}\;\right\}
\end{multline*}
}
Let $\ell_Q(v_t) = \times$, and let $\phi_Q(t) = v_t$.
As in projection, newly created vertices will have an in-degree of $k$, and a fan-in tree is required.
There are $|{Q_1} \bowtie \ldots \bowtie {Q_k}|$ such vertices, so the corrected circuit has $|V_{Q_1}|+\ldots+|V_{Q_k}|+(k-1)|{Q_1} \bowtie \ldots \bowtie {Q_k}|$ vertices.
\subsubsection{Circuit size vs. runtime}
\label{sec:circuit-runtime}
We now connect the size of a lineage circuit (where the size of a lineage circuit is the number of vertices in the corresponding DAG\footnote{since each node has indegree at most two, this also is the same up to constants to counting the number of edges in the DAG.})\AH{Wouldn't it be the same for an arbitrary indegree? On another note, for a base relation with no edges, is this still considered the same \emph{up to a constant}? What if the base relation contains $10^{10}$ tuples/vertices?} for a given SUPJ query $Q$ to its $\qruntime{Q}$. We do this formally by showing that the size of the lineage circuit is asymptotically no worse than the corresponding runtime of a large class of deterministic query processing algorithms.
We now connect the size of a lineage circuit (where the size of a lineage circuit is the number of vertices in the corresponding DAG\footnote{Since each node has in-degree at most two, this is the same, up to constants, as counting the number of edges in the DAG.}) for a given SPJU query $Q$ to its $\qruntime{Q}$. We do this formally by showing that the size of the lineage circuit is asymptotically no worse than the corresponding runtime of a large class of deterministic query processing algorithms.
\begin{lemma}
\label{lem:circuits-model-runtime}
For any query plan $Q$ and any specific database instance, the size of the lineage circuit of the corresponding query result has the same or better complexity as the runtime of $Q$. That is, for any query plan $Q$ we have $|V_Q| \leq (k-1)\qruntime{Q}$, where $k$ is the degree of the query polynomial corresponding to $Q$.
\end{lemma}
\begin{proof}
Proof by induction. The base case is a base relation: $Q = R$ and is trivially true since $|V_R| = |R|$.
For the inductive step, we assume that we have circuits for subplans $Q_1, \ldots, Q_n$ such that $|V_{Q_i}| \leq (k_i-1)\qruntime{Q_i}$ where $k_i$ is the degree of $Q_i$.
Proof is in~\Cref{app:subsec-lem-lin-vs-qplan}.
\caseheading{Selection}
Assume that $Q = \sigma_\theta(Q_1)$.
The circuit for $Q$ has $|V_Q| \leq |V_{Q_1}|$ vertices, so from the inductive assumption and $\qruntime{Q} = \qruntime{Q_1}$ by definition, we have $|V_Q| \leq (k-1) \qruntime{Q} $.
% \AH{Technically, $\kElem$ is the degree of $\poly_1$, but I guess this is a moot point since one can argue that $\kElem$ is also the degree of $\poly$.}
% OK: Correct
\caseheading{Projection}
Assume that $Q = \pi_{\vct A}(Q_1)$.
The circuit for $Q$ has at most $|V_{Q_1}|+|{Q_1}|$ vertices.
% \AH{The combination of terms above doesn't follow the details for projection above.}
\begin{align*}
|V_{Q}| & \leq |V_{Q_1}| + |Q_1|\\
%\intertext{By \Cref{prop:queries-need-to-output-tuples} $\qruntime{Q_1} \geq |Q_1|$}
%& \leq |V_{Q_1}| + 2 \qruntime{Q_1}\\
\intertext{(From the inductive assumption)}
& \leq (k-1)\qruntime{Q_1} + \abs{Q_1}\\
\intertext{(By definition of $\qruntime{Q}$)}
& \le (k-1)\qruntime{Q}.
\end{align*}
\AH{In the inductive step above, where does $\abs{\poly_1}$ come from? I understand that $b_i$ is part of the inductive hypothesis, but, is it \emph{legal/justifiable} to just throw in \emph{any} constant we so desire?}
\caseheading{Union}
Assume that $Q = Q_1 \cup Q_2$.
The circuit for $Q$ has $|V_{Q_1}|+|V_{Q_2}|+|{Q_1} \cap {Q_2}|$ vertices.
\begin{align*}
|V_{Q}| & \leq |V_{Q_1}|+|V_{Q_2}|+|{Q_1}|+|{Q_2}|\\
%\intertext{By \Cref{prop:queries-need-to-output-tuples} $\qruntime{Q_1} \geq |Q_1|$}
%& \leq |V_{Q_1}|+|V_{Q_2}|+\qruntime{Q_1}+\qruntime{Q_2}|\\
\intertext{(From the inductive assumption)}
& \leq (k-1)(\qruntime{Q_1} + \qruntime{Q_2}) + (\abs{Q_1} + \abs{Q_2})\\
\intertext{(By definition of $\qruntime{Q}$)}
& \leq (k-1)(\qruntime{Q}).
\end{align*}
\caseheading{$k$-ary Join}
Assume that $Q = Q_1 \bowtie \ldots \bowtie Q_k$.
The circuit for $Q$ has $|V_{Q_1}|+\ldots+|V_{Q_k}|+(k-1)|{Q_1} \bowtie \ldots \bowtie {Q_k}|$ vertices.
\begin{align*}
|V_{Q}| & = |V_{Q_1}|+\ldots+|V_{Q_k}|+(k-1)|{Q_1} \bowtie \ldots \bowtie {Q_k}|\\
\intertext{From the inductive assumption and noting $\forall i: k_i \leq k-1$}
& \leq (k-1)\qruntime{Q_1}+\ldots+(k-1)\qruntime{Q_k}+\\
&\;\;\; (k-1)|{Q_1} \bowtie \ldots \bowtie {Q_k}|\\
& \leq (k-1)(\qruntime{Q_1}+\ldots+\qruntime{Q_k}+\\
&\;\;\;|{Q_1} \bowtie \ldots \bowtie {Q_k}|)\\
\intertext{(By definition of $\qruntime{Q}$)}
& = (k-1)\qruntime{Q}.
\end{align*}
The property holds for all recursive queries, and the proof holds.
\end{proof}
\qed
We now have all the pieces to argue the following, which formally states that our approximation algorithm implies that approximating the expected multiplicities of SUPJ query can be done in essentially the same runtime as deterministic query processing of the same query:
We now have all the pieces to argue the following, which formally states that our approximation algorithm implies that approximating the expected multiplicities of an SPJU query can be done in essentially the same runtime as deterministic query processing of the same query:
\begin{Corollary}
Given an SUPJ query $Q$ for a TIDB, we can present $(1\pm\eps)$ approximation to the expectation of each output tuple with probability at least $1-\delta$ in time $O_k\left(\frac 1{\eps^2}\cdot\qruntime{Q}\cdot \log{\frac{1}{\conf}}\cdot \log(n)\right)$.
Given an SPJU query $Q$ for a TIDB, we can present a $(1\pm\eps)$-approximation to the expectation of each output tuple with probability at least $1-\delta$ in time $O_k\left(\frac 1{\eps^2}\cdot\qruntime{Q}\cdot \log{\frac{1}{\conf}}\cdot \log(n)\right)$.
\end{Corollary}
\begin{proof}
This follows from~\Cref{lem:circuits-model-runtime} and (the lineage circuit counterpart-- see~\Cref{sec:results-circuits} of)~\Cref{cor:approx-algo-const-p} (where the latter is used with $\delta$ being substituted\footnote{Recall that~\Cref{cor:approx-algo-const-p} is stated for a single output tuple so to get the required guarantee for all (at most $n^k$) output tuples of $Q$ we get at most $\frac \delta{n^k}$ probability of failure for each output tuple and then just a union bound over all output tuples. } with $\frac \delta{n^k}$).

View File

@ -1,5 +1,50 @@
\section{Missing details from Section~\ref{sec:background}}\label{sec:proofs-background}
\subsection{Supplementary Material for~\Cref{prop:expection-of-polynom}}\label{subsec:supp-mat-background}
To justify the use of $\semNX$-databases, we need to show that we can encode any $\semN$-PDB in this way and that the query semantics over this representation coincides with query semantics over $\semN$-PDB. For that it will be opportune to define representation systems for $\semN$-PDBs.\BG{cite}
Before we proceed, unless otherwise mentioned, all subsequent proofs for~\Cref{sec:background} can be found in~\Cref{sec:proofs-background}.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Definition}[Representation System]\label{def:representation-syste}
A representation system for $\semN$-PDBs is a tuple $(\reprs, \rmod)$ where $\reprs$ is a set of representations and $\rmod$ associates with each $\repr \in \reprs$ an $\semN$-PDB $\pdb$. We say that a representation system is \emph{closed} under a class of queries $\qClass$ if for any query $\query \in \qClass$ we have:
%
\[ \rmod(\query(\repr)) = \query(\rmod(\repr)) \]
A representation system is \emph{complete} if for every $\semN$-PDB $\pdb$ there exists $\repr \in \reprs$ such that:
%
\[ \rmod(\repr) = \pdb \]
\end{Definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
As mentioned above we will use $\semNX$-databases paired with a probability distribution as a representation system.
We refer to such databases as $\semNX$-PDBs and use bold symbols to distinguish them from possible worlds (which are $\semN$-databases).
Formally, an $\semNX$-PDB is an $\semNX$-database $\db$ and a probability distribution $\pd$ over assignments $\assign$ of the variables $\vct{X} = \{X_1, \ldots, X_n\}$ occurring in annotations of $\db$ to $\{0,1\}$. Note that an assignment $\assign: \vct{X} \to \{0,1\}$ can be represented as a vector $\vct{w} \in \{0,1\}^n$ where $\vct{w}[i]$ records the value assigned to variable $X_i$. Thus, from now on we will solely use such vectors which we refer to as \emph{world vectors} and implicitly understand them to represent assignments. Given an assignment $\assign$ we use $\assign(\pxdb)$ to denote the semiring homomorphism $\semNX \to \semN$ that applies the assignment $\assign$ to all variables of a polynomial and evaluates the resulting expression in $\semN$.\BG{explain connection to homomorphism lifting in K-relations}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Definition}[$\semNX$-PDBs]\label{def:semnx-pdbs}
An $\semNX$-PDB $\pxdb$ over variables $\vct{X} = \{X_1, \ldots, X_n\}$ is a tuple $(\db,\pd)$ where $\db$ is an $\semNX$-database and $\pd$ is a probability distribution over $\vct{w} \in \{0,1\}^n$. We use $\assign_{\vct{w}}$ to denote the assignment corresponding to $\vct{w} \in \{0,1\}^n$. The $\semN$-PDB $\rmod(\pxdb) = (\idb, \pd')$ encoded by $\pxdb$ is defined as:
\begin{align*}
\idb & = \{ \assign_{\vct{w}}(\pxdb) \mid \vct{w} \in \{0,1\}^n \} \\
\pd'(\db) & = \sum_{\vct{w} \in \{0,1\}^n: \assign_{\vct{w}}(\pxdb) = \db} \pd(\vct{w})
\end{align*}
\end{Definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
For instance, consider a $\pxdb$ consisting of a single tuple $\tup_1 = (1)$ annotated with $X_1 + X_2$ with probability distribution $\pd([0,0]) = 0$, $\pd([0,1]) = 0$, $\pd([1,0]) = 0.3$ and $\pd([1,1]) = 0.7$. This $\semNX$-PDB encodes two possible worlds (with non-zero probability) that we denote using their world vectors.
%
\[
D_{[1,0]}(\tup_1) = 1 \hspace{0.3cm} \mathbf{and} \hspace{0.3cm} D_{[1,1]}(\tup_1) = 2
\]
%
Importantly, as the following proposition shows, any finite $\semN$-PDB can be encoded as an $\semNX$-PDB and $\semNX$-PDBs are closed under positive relational algebra queries, the class of queries of interest in this work.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Proposition}\label{prop:semnx-pdbs-are-a-}
$\semNX$-PDBs are a complete representation system for $\semN$-PDBs that is closed under $\raPlus$ queries.
\end{Proposition}
\subsection{Proof of~\Cref{prop:semnx-pdbs-are-a-}}
\AH{I made small changes to the proof, noteably the summation, the variable definition and the world subscript, the latter of which I am not sure if it is the best notation or not.}
@ -14,9 +59,42 @@ The probability distribution $\pd'$ assigns all world vectors zero probability e
The closure under $\raPlus$ queries follows from the fact that an assignment $\vct{X} \to \{0,1\}$ is a semiring homomorphism and that semiring homomorphisms commute with queries over $\semK$-relations.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Now let us consider computing the expected multiplicity of a tuple $\tup$ in the result of a query $\query$ over an $\semN$-PDB $\pdb$ using the annotation of $\tup$ in the result of evaluating $\query$ over an $\semNX$-PDB $\pxdb$ for which $\rmod(\pxdb) = \pdb$. The expectation of the polynomial $\poly = \query(\pxdb)(\tup)$ based on the probability distribution of $\pxdb$ over the variables in $\pxdb$ is:
\begin{equation}
\expct_{\vct{W} \sim \pd}\pbox{\poly(\vct{W})} = \sum_{\vct{w} \in \{0,1\}^n} \query(\assign_{\vct{w}}(\pxdb))(\tup) \cdot \pd(\vct{w})\label{eq:expect-q-nx}
\end{equation}
Since $\semNX$-PDBs $\pxdb$ are a complete representation system for $\semN$-PDBs that is closed under $\raPlus$, computing the expectation of the multiplicity of a tuple $t$ in the result of an $\raPlus$ query over the $\semN$-PDB $\rmod(\pxdb)$ is the same as computing the expectation of the polynomial $\query(\pxdb)(t)$.
\subsection{Proof of~\Cref{prop:expection-of-polynom}}
\label{subsec:expectation-of-polynom-proof}
\BG{TODO}
\subsection{Supplementary Material for~\Cref{def:tidbs-and-bidbs}}\label{subsec:supp-mat-ti-bi-def}
Two important subclasses of $\semNX$-PDBs that are of interest to us are the bag versions of tuple-independent databases (\tis) and block-independent databases (\bis). Under set semantics, a \ti is a deterministic database $\db$ where each tuple $\tup$ is assigned a probability $\prob(\tup)$. The set of possible worlds represented by a \ti $\db$ is all subsets of $\db$. The probability of each world is the product of the probabilities of all tuples that exist with one minus the probability of all tuples of $\db$ that are not part of this world, i.e., tuples are treated as independent random events. In a \bi, we also assign each tuple a probability, but additionally partition $\db$ into blocks. The possible worlds of a \bi $\db$ are all subsets of $\db$ that contain at most one tuple from each block. Note then that the tuples sharing the same block are disjoint, and the sum of the probabilities of all the tuples in the same block $\block$ is $1$. The probability of such a world is the product of the probabilities of all tuples present in the world. %and one minus the sum of the probabilities of all tuples from blocks for which no tuple is present in the world.
For bag \tis and \bis, we define the probability of a tuple to be the probability that the tuple exists with multiplicity at least $1$.
\AH{This part \emph{below} needs more work if we include it.}
Note that the main difference to the standard definitions of \tis and \bis is that we define them as subclasses of $\semNX$-PDBs and that we use bag semantics. Even though tuples cannot occur more than once in the input \ti or \bi, they can occur with a multiplicity larger than one in the result of a query. Since in \tis and \bis, there is a one-to-one correspondence between tuples in the database and variables, we can interpret a vector $\vct{w} \in \{0,1\}^n$ as denoting which tuples exist in the possible world $\assign_{\vct{w}}(\pxdb)$ (the ones where $\vct{w}[i] = 1$). Denote the vector $\vct{p}$ to be a vector whose elements are the individual probabilities $\prob_i$ of each tuple $\tup_i$. Let $\pd^{(\vct{p})}$ denote the distribution induced by $\vct{p}$.
%
\begin{align}\label{eq:tidb-expectation}
\expct_{\vct{X} \sim \pd^{(\vct{p})}}\pbox{\poly(\vct{X})} = \sum\limits_{\vct{w} \in \{0, 1\}^\numvar} \poly(\vct{w})\prod_{\substack{i \in [\numvar]\\ \text{s.t. } \wElem_i = 1}}\prob_i \prod_{\substack{i \in [\numvar]\\ \text{s.t. } \wElem_i = 0}}\left(1 - \prob_i\right).
\end{align}
%
\BG{Do we need the BIDB formula?}
\BG{Oliver's conjecture: Bag-\tis + Q can express any finite bag-PDB:
A well-known result for set semantics PDBs is that while not all finite PDBs can be encoded as \tis, any finite PDB can be encoded using a \ti and a query. An analog result holds in our case: any finite $\semN$-PDB can be encoded as a bag \ti and a query (WHAT CLASS? ADD PROOF)
}
\subsection{Proof of Proposition~\ref{proposition:q-qtilde}}
Note that any $\poly$ in factorized form is equivalent to its \abbrSMB expansion. For each term in the expanded form, further note that for all $b \in \{0, 1\}$ and all $e \geq 1$, $b^e = b$. \qed
@ -48,7 +126,19 @@ Note that \cref{lem:exp-poly-rpoly} shows that $\expct\pbox{\poly} =$ $\rpoly(\p
\section{Missing details from Section~\ref{sec:hard}}
\label{app:hard}
\label{app:single-mult-p}
We use~\Cref{lem:qEk-multi-p} to prove~\Cref{thm:mult-p-hard-result}:
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Proof of Theorem~\ref{thm:mult-p-hard-result}}
For the sake of contradiction, let us assume we can solve our problem in $f(\kElem)\cdot m^c$ time for some absolute constant $c$. Then given a graph $G$ we can compute the query polynomial $\rpoly_G^\kElem$ (in the obvious way) in $O(km)$ time. Then after we run our algorithm on $\rpoly_G^\kElem$, we get $\rpoly_{G}^\kElem(\prob_i,\ldots, \prob_i)$ for $0\leq i\leq 2\kElem$ in additional $f(\kElem)\cdot m^c$ time. \Cref{lem:qEk-multi-p} then computes the number of $k$-matchings in $G$ in $O(\kElem^3)$ time. Thus, overall we have an algorithm for computing the number of $k$-matchings in time
\begin{align*}
O(km) + f(\kElem)\cdot m^c + O(\kElem^3)
&\le \inparen{O(\kElem^3) + f(\kElem)}\cdot m^{c+1} \\
&\le \inparen{O(\kElem^3) + f(\kElem)}\cdot n^{2c+2},
\end{align*}
which contradicts \Cref{thm:k-match-hard}.
\subsection{Proofs of~\cref{eq:1e}-\cref{eq:2pd-3d}}
\label{app:easy-counts}
@ -312,7 +402,111 @@ Applying this bound in the runtime bound in~\Cref{lem:approx-alg} gives the firs
\subsection{$\onepass$ Pseudocode}
\begin{algorithm}[h!]
\caption{\onepass$(\etree)$}
\label{alg:one-pass}
\begin{algorithmic}[1]
\Require \etree: Binary Expression Tree
\Ensure \etree: Binary Expression Tree
\Ensure \vari{sum} $\in \mathbb{R}$
\If{$\etree.\type = +$}\label{alg:one-pass-equality1}
\State $\accum \gets 0$\label{alg:one-pass-plus-assign1}
\For{$child$ in $\etree.\vari{children}$}\Comment{Sum up all children coefficients}
\State $(child, \vari{s}) \gets \onepass(child)$
\State $\accum \gets \accum + \vari{s}$\label{alg:one-pass-plus-add}
\EndFor
\State $\etree.\vari{partial} \gets \accum$\label{alg:one-pass-plus-assign2}
\For{$child$ in $\etree.\vari{children}$}\Comment{Record distributions for each child}
\State $child.\vari{weight} \gets \frac{child.\vari{partial}}{\etree.\vari{partial}}$\label{alg:one-pass-plus-prob}
\EndFor
%\State $\vari{sum} \gets \etree.\vari{partial}$\label{alg:one-pass-plus-assign3}
\State \Return (\etree, \etree.\vari{partial})
\ElsIf{$\etree.\type = \times$}\label{alg:one-pass-equality2}
\State $\accum \gets 1$\label{alg:one-pass-times-assign1}
\For{$child \text{ in } \etree.\vari{children}$}\Comment{Compute the product of all children coefficients}
\State $(child, \vari{s}) \gets \onepass(child)$
\State $\accum \gets \accum \times \vari{s}$\label{alg:one-pass-times-product}
\EndFor
\State $\etree.\vari{partial}\gets \accum$\label{alg:one-pass-times-assign2}
%\State $\vari{sum} \gets \etree.\vari{partial}$\label{alg:one-pass-times-assign3}
\State \Return (\etree, \etree.\vari{partial})
\ElsIf{$\etree.\type = numeric$}\Comment{Base case}\label{alg:one-pass-equality3}
\State $\vari{sum} \gets |\etree.\val|$\label{alg:one-pass-leaf-assign1}\Comment{This step effectively converts $\etree$ into $\abs{\etree}$}
\State \Return (\etree, \vari{sum})
\Else\Comment{$\etree.\type = \var$}\label{alg:one-pass-equality4}
%\State $\vari{sum} \gets 1$\label{alg:one-pass-global-assign}
\State \Return (\etree,$1$) % \vari{sum})
\EndIf
\end{algorithmic}
\end{algorithm}
\subsection{$\onepass$ Example}
\begin{Example}\label{example:one-pass}
Let $\etree$ encode the expression $(X_1 + X_2)(X_1 - X_2) + X_2^2$. After one pass, \cref{alg:one-pass} would have computed the following weight distribution. For the two children of the root $+$ node $\etree$, $\etree_\lchild.\wght = \frac{4}{5}$ and $\etree_\rchild.\wght = \frac{1}{5}$. Similarly, letting $\stree$ denote the left subtree of $\etree_{\lchild}$, we have $\stree_\lchild.\wght = \stree_\rchild.\wght = \frac{1}{2}$. This is depicted in~\Cref{fig:expr-tree-T-wght}. %Note that in this example, the sampling probabilities for the children of each inner $+$ node of $\stree$ are equal to one another because both parents have the same number of children, and, in each case, the children of each parent $+$ node share the same $|\coef_i|$.
\end{Example}
\begin{figure}[h!]
\begin{tikzpicture}[thick, every tree node/.style={default_node, thick, draw=black, black, circle, text width=0.3cm, font=\bfseries, minimum size=0.65cm}, every child/.style={black}, edge from parent/.style={draw, thick},
level 1/.style={sibling distance=0.95cm},
level 2/.style={sibling distance=0.7cm},
%level 2+/.style={sibling distance=0.625cm}
%level distance = 1.25cm,
%sibling distance = 1cm,
%every node/.append style = {anchor=center}
]
\Tree [.\node(root){$\boldsymbol{+}$};
\edge [wght_color] node[midway, auto= right, font=\bfseries, gray] {$\bsym{\frac{4}{5}}$}; [.\node[highlight_color](tl){$\boldsymbol{\times}$};
[.\node(s){$\bsym{+}$};
\edge[wght_color] node[pos=0.35, left, font=\bfseries, gray]{$\bsym{\frac{1}{2}}$}; [.\node[highlight_color](sl){$\bsym{x_1}$}; ]
\edge[wght_color] node[pos=0.35, right, font=\bfseries, gray]{$\bsym{\frac{1}{2}}$}; [.\node[highlight_color](sr){$\bsym{x_2}$}; ]
]
[.\node(sp){$\bsym{+}$};
\edge[wght_color] node[pos=0.35, left, font=\bfseries, gray]{$\bsym{\frac{1}{2}}$}; [.\node[highlight_color](spl){$\bsym{x_1}$}; ]
\edge[wght_color] node[pos=0.35, right, font=\bfseries, gray]{$\bsym{\frac{1}{2}}$}; [.\node[highlight_color](spr){$\bsym{\times}$};
[.$\bsym{-1}$ ] [.$\bsym{x_2}$ ]
]
]
]
\edge [wght_color] node[midway, auto=left, font=\bfseries, gray] {$\bsym{\frac{1}{5}}$}; [.\node[highlight_color](tr){$\boldsymbol{\times}$};
[.$\bsym{x_2}$
\edge [draw=none]; [.\node[draw=none]{}; ]
\edge [draw=none]; [.\node[draw=none]{}; ]
]
[.$\bsym{x_2}$ ] ]
]
% labels for plus node children, with arrows
\node[left=2pt of sl, highlight_color, inner sep=0pt] (sl-label) {$\stree_\lchild$};
\draw[highlight_color] (sl) -- (sl-label);
\node[right=2pt of sr, highlight_color, inner sep=0pt] (sr-label) {$\stree_\rchild$};
\draw[highlight_color] (sr) -- (sr-label);
\node[below left=2pt of spl, inner sep=0pt, highlight_color](spl-label) {$\stree_\lchild'$};
\draw[highlight_color] (spl) -- (spl-label);
\node[right=2pt of spr, highlight_color, inner sep=0] (spr-label) {$\stree_\rchild'$};
\draw[highlight_color] (spr) -- (spr-label);
\node[above left=2pt of tl, inner sep=0pt, highlight_color] (tl-label) {$\etree_\lchild$};
\draw[highlight_color] (tl) -- (tl-label);
\node[above right=2pt of tr, highlight_color, inner sep=0pt] (tr-label) {$\etree_\rchild$};
\node[above = 2pt of root, highlight_color, inner sep=0pt, font=\bfseries] (root-label) {$\etree$};
\node[above = 2pt of s, highlight_color, inner sep=0pt, font=\bfseries] (s-label) {$\stree$};
\node[above = 2pt of sp, highlight_color, inner sep=0pt, font=\bfseries] (sp-label) {$\stree'$};
\draw[highlight_color] (tr) -- (tr-label);
% \draw[<-|, highlight_color] (s) -- (s-label);
% \draw[<-|, highlight_color] (sp) -- (sp-label);
% \draw[<-|, highlight_color] (root) -- (root-label);
%\node[above right=0.7cm of TR, highlight_color, inner sep=0pt, font=\bfseries] (tr-comment) {$\etree_\rchild$};
% \draw[<-|, highlight_color] (TR) -- (tr-comment);
\end{tikzpicture}
\caption{Weights computed by $\onepass$ in~\cref{example:one-pass}.}
\label{fig:expr-tree-T-wght}
\end{figure}
\subsection{Proof of~\Cref{lem:one-pass}}
We prove the first part of Lemma~\ref{lem:one-pass}, i.e., correctness, by structural induction over the depth $d$ of the binary tree $\etree$.
For the base case, $d = 0$, it is the case that the node is a leaf and therefore by Definition~\ref{def:express-tree} must be a variable or coefficient. When it is a variable, \textsc{OnePass} returns $1$, and we have in this case that $\polyf(\etree) = X_i = \polyf(\abs{\etree})$ for some $i$ in $[\numvar]$, and this evaluated at all $1$'s indeed gives $1$, verifying the correctness of the returned value of $\abs{\etree}(1,\ldots, 1) = 1$. When the root is a coefficient, the absolute value of the coefficient is returned, which is indeed $\abs{\etree}(1,\ldots, 1)$. This proves the base case.
@ -368,4 +562,140 @@ We now bound the number of recursive calls in $\sampmon$ by $O\left(k\cdot depth
It is easy to check that except for~\Cref{alg:sample-times-union}, all other lines take $O(1)$ time. Thus, overall all lines except for~\Cref{alg:sample-times-union} take $O(k\cdot depth(\etree))$ time. Now consider all executions of~\Cref{alg:sample-times-union} together. We note that at each level we will be adding a given set of variables to some set at most once: since the sum of the sizes of the sets at a given level is at most $k$, each level involves $O(k\log{k})$ time. Thus, overall all executions of~\Cref{alg:sample-times-union} take $O(k\log{k}\cdot depth(\etree))$ time, as desired.
\subsection{Experimental Results}\label{app:subsec:experiment}
\input{experiments}
\section{Circuits}\label{app:sec-cicuits}
\subsection{Extending to Lineage Circuits}\label{app:lineage-circuit-ext}
More specifically consider $\onepass$. The algorithm (as well as its analysis) basically uses the fact that one can compute the corresponding polynomial at all $1$s input with a simple recursive formula (\cref{eq:T-all-ones}), and that we can compute a probability distribution based on these weights (as in~\cref{eq:T-weights}). It can be verified that all the arguments go through if we replace $\etree_\lchild$ and $\etree_\rchild$ for expression tree $\etree$ with the two incoming nodes of the sink for the given lineage circuit. Another way to look at this is we could `unroll' the recursion in $\onepass$ and think of the algorithm as doing the evaluation at each node bottom up from leaves to the root in the expression tree. For lineage circuits, we start from the source nodes and do the computation in the topological order till we reach the sink(s).
The argument for $\sampmon$ is similar. We argued above that $\onepass$ works as intended for lineage circuits: \Cref{alg:one-pass} only recurses on children of the current node in the expression tree, and we can generalize it to lineage circuits by recursing to the two children of the current node in the lineage circuit. Alternatively, as we have already used in the proof of~\Cref{lem:sample}, we can think of the sampling algorithm as sampling a sub-graph of the expression tree. For lineage circuits, we can think of $\sampmon$ as sampling the same sub-graph. Alternatively, one can implicitly expand the lineage circuit into a (larger but) equivalent expression tree. Since $\sampmon$ only explores one sub-graph during its run, we can think of its run on a lineage circuit as being done on the implicit equivalent expression tree\footnote{
Recall that $\sampmon$ scales only in the depth of the expression and its polynomial degree ($k$). There exist polynomials that can be encoded in size $\Omega(\log k)$, but we follow convention in assuming that the circuit size is asymptotically larger than $k$ and thus treat the degree (i.e., join width) as a constant.
}. Hence, all of the results on $\sampmon$ on expression trees carry over to lineage circuits.
Thus, we have argued that~\Cref{lem:approx-alg} also holds if we use a lineage circuit instead of an expression tree as the input to our approximation algorithm.
\subsection{Representing Polynomials with Lineage Circuits}\label{app:subsec-rep-poly-lin-circ}
\newcommand{\getpoly}[1]{\textbf{lin}\inparen{#1}}
Each vertex $v \in V_Q$ in the arithmetic circuit for $\tuple{V_Q, E_Q, \phi_Q, \ell_Q}$ encodes a polynomial, realized as
$$\getpoly{v} = \begin{cases}
\sum_{v' : (v',v) \in E_Q} \getpoly{v'} & \textbf{if } \ell(v) = +\\
\prod_{v' : (v',v) \in E_Q} \getpoly{v'} & \textbf{if } \ell(v) = \times\\
\ell(v) & \textbf{otherwise}
\end{cases}$$
We define the circuit for a select-union-project-join $Q$ recursively by cases as follows. In each case, let $\tuple{V_{Q_i}, E_{Q_i}, \phi_{Q_i}, \ell_{Q_i}}$ denote the circuit for subquery $Q_i$.
\caseheading{Base Relation}
Let $Q$ be a base relation $R$. We define one node for each tuple. Formally, let $V_Q = \comprehension{v_t}{t\in R}$, let $\phi_Q(t) = v_t$, let $\ell_Q(v_t) = R(t)$, and let $E_Q = \emptyset$.
This circuit has $|R|$ vertices.
\caseheading{Selection}
Let $Q = \sigma_\theta \inparen{Q_1}$.
We re-use the circuit for $Q_1$. %, but define a new distinguished node $v_0$ with label $0$ and make it the sink node for all tuples that fail the selection predicate.
Formally, let $V_Q = V_{Q_1}$, and let $\ell_Q(v) = \ell_{Q_1}(v)$ for any $v \in V_{Q_1}$. Let $E_Q = E_{Q_1}$, and define
$$\phi_Q(t) =
\phi_{Q_1}(t) \text{ for } t \text{ s.t.}\; \theta(t).$$
Dead sinks are iteratively removed, and so
%\AH{While not explicit, I assume a reviewer would know that the notation above discards tuples/vertices not satisfying the selection predicate.}
%v_0 & \textbf{otherwise}
%\end{cases}$$
this circuit has at most $|V_{Q_1}|$ vertices.
\caseheading{Projection}
Let $Q = \pi_{\vct A} {Q_1}$.
We extend the circuit for ${Q_1}$ with a new set of sum vertices (i.e., vertices with label $+$) for each tuple in $Q$, and connect them to the corresponding sink nodes of the circuit for ${Q_1}$.
Naively, let $V_Q = V_{Q_1} \cup \comprehension{v_t}{t \in \pi_{\vct A} {Q_1}}$, let $\phi_Q(t) = v_t$, and let $\ell_Q(v_t) = +$. Finally let
$$E_Q = E_{Q_1} \cup \comprehension{(\phi_{Q_1}(t'), v_t)}{t = \pi_{\vct A} t', t' \in {Q_1}, t \in \pi_{\vct A} {Q_1}}$$
This formulation will produce vertices with an in-degree greater than two, a problem that we correct by replacing every vertex with an in-degree over two by an equivalent fan-in tree. The resulting structure has at most $|{Q_1}|-1$ new vertices.
% \AH{Is the rightmost operator \emph{supposed} to be a $-$? In the beginning we add $|\pi_{\vct A}{Q_1}|$ vertices.}
The corrected circuit thus has at most $|V_{Q_1}|+|{Q_1}|$ vertices.
\caseheading{Union}
Let $Q = {Q_1} \cup {Q_2}$.
We merge graphs and produce a sum vertex for all tuples in both sides of the union.
Formally, let $V_Q = V_{Q_1} \cup V_{Q_2} \cup \comprehension{v_t}{t \in {Q_1} \cap {Q_2}}$, let $\ell_Q(v_t) = +$, and let
$$E_Q = E_{Q_1} \cup E_{Q_2} \cup \comprehension{(\phi_{Q_1}(t), v_t), (\phi_{Q_2}(t), v_t)}{t \in {Q_1} \cap {Q_2}}$$
$$\phi_Q(t) = \begin{cases}
v_t & \textbf{if } t \in {Q_1} \cap {Q_2}\\
\phi_{Q_1}(t) & \textbf{if } t \not \in {Q_2}\\
\phi_{Q_2}(t) & \textbf{if } t \not \in {Q_1}\\
\end{cases}$$
This circuit has $|V_{Q_1}|+|V_{Q_2}|+|{Q_1} \cap {Q_2}|$ vertices.
\caseheading{$k$-ary Join}
Let $Q = {Q_1} \bowtie \ldots \bowtie {Q_k}$.
We merge graphs and produce a multiplication vertex for all tuples resulting from the join.
Naively, let $V_Q = V_{Q_1} \cup \ldots \cup V_{Q_k} \cup \comprehension{v_t}{t \in {Q_1} \bowtie \ldots \bowtie {Q_k}}$, let
{\small
\begin{multline*}
E_Q = E_{Q_1} \cup \ldots \cup E_{Q_k} \cup
\left\{\;
(\phi_{Q_1}(\pi_{\sch({Q_1})}t), v_t), \right.\\
\ldots, (\phi_{Q_k}(\pi_{\sch({Q_k})}t), v_t)
\;\left|\;t \in {Q_1} \bowtie \ldots \bowtie {Q_k}\;\right\}
\end{multline*}
}
Let $\ell_Q(v_t) = \times$, and let $\phi_Q(t) = v_t$.
As in projection, newly created vertices will have an in-degree of $k$, and a fan-in tree is required.
There are $|{Q_1} \bowtie \ldots \bowtie {Q_k}|$ such vertices, so the corrected circuit has $|V_{Q_1}|+\ldots+|V_{Q_k}|+(k-1)|{Q_1} \bowtie \ldots \bowtie {Q_k}|$ vertices.
\subsection{Proof for~\Cref{lem:circuits-model-runtime}}\label{app:subsec-lem-lin-vs-qplan}
Proof by induction. The base case is a base relation: $Q = R$ and is trivially true since $|V_R| = |R|$.
For the inductive step, we assume that we have circuits for subplans $Q_1, \ldots, Q_n$ such that $|V_{Q_i}| \leq (k_i-1)\qruntime{Q_i}$ where $k_i$ is the degree of $Q_i$.
\caseheading{Selection}
Assume that $Q = \sigma_\theta(Q_1)$.
In the circuit for $Q$, $|V_Q| = |V_{Q_1}|$ vertices, so from the inductive assumption and $\qruntime{Q} = \qruntime{Q_1}$ by definition, we have $|V_Q| \leq (k-1) \qruntime{Q} $.
% \AH{Technically, $\kElem$ is the degree of $\poly_1$, but I guess this is a moot point since one can argue that $\kElem$ is also the degree of $\poly$.}
% OK: Correct
\caseheading{Projection}
Assume that $Q = \pi_{\vct A}(Q_1)$.
The circuit for $Q$ has at most $|V_{Q_1}|+|{Q_1}|$ vertices.
% \AH{The combination of terms above doesn't follow the details for projection above.}
\begin{align*}
|V_{Q}| & \leq |V_{Q_1}| + |Q_1|\\
%\intertext{By \Cref{prop:queries-need-to-output-tuples} $\qruntime{Q_1} \geq |Q_1|$}
%& \leq |V_{Q_1}| + 2 \qruntime{Q_1}\\
\intertext{(From the inductive assumption)}
& \leq (k-1)\qruntime{Q_1} + \abs{Q_1}\\
\intertext{(By definition of $\qruntime{Q}$)}
& \le (k-1)\qruntime{Q}.
\end{align*}
\AH{In the inductive step above, where does $\abs{\poly_1}$ come from? I understand that $b_i$ is part of the inductive hypothesis, but, is it \emph{legal/justifiable} to just throw in \emph{any} constant we so desire?}
\caseheading{Union}
Assume that $Q = Q_1 \cup Q_2$.
The circuit for $Q$ has $|V_{Q_1}|+|V_{Q_2}|+|{Q_1} \cap {Q_2}|$ vertices.
\begin{align*}
|V_{Q}| & \leq |V_{Q_1}|+|V_{Q_2}|+|{Q_1}|+|{Q_2}|\\
%\intertext{By \Cref{prop:queries-need-to-output-tuples} $\qruntime{Q_1} \geq |Q_1|$}
%& \leq |V_{Q_1}|+|V_{Q_2}|+\qruntime{Q_1}+\qruntime{Q_2}|\\
\intertext{(From the inductive assumption)}
& \leq (k-1)(\qruntime{Q_1} + \qruntime{Q_2}) + \abs{Q_1} + \abs{Q_2}\\
\intertext{(By definition of $\qruntime{Q}$)}
& \leq (k-1)(\qruntime{Q}).
\end{align*}
\caseheading{$k$-ary Join}
Assume that $Q = Q_1 \bowtie \ldots \bowtie Q_k$.
The circuit for $Q$ has $|V_{Q_1}|+\ldots+|V_{Q_k}|+(k-1)|{Q_1} \bowtie \ldots \bowtie {Q_k}|$ vertices.
\begin{align*}
|V_{Q}| & = |V_{Q_1}|+\ldots+|V_{Q_k}|+(k-1)|{Q_1} \bowtie \ldots \bowtie {Q_k}|\\
\intertext{From the inductive assumption and noting $\forall i: k_i \leq k-1$}
& \leq (k-1)\qruntime{Q_1}+\ldots+(k-1)\qruntime{Q_k}+\\
&\;\;\; (k-1)|{Q_1} \bowtie \ldots \bowtie {Q_k}|\\
& \leq (k-1)(\qruntime{Q_1}+\ldots+\qruntime{Q_k}+\\
&\;\;\;|{Q_1} \bowtie \ldots \bowtie {Q_k}|)\\
\intertext{(By definition of $\qruntime{Q}$)}
& = (k-1)\qruntime{Q}.
\end{align*}
The property holds for all recursive queries, and the proof holds.

128
intro.tex
View File

@ -4,25 +4,34 @@
\section{Introduction}
\label{sec:intro}
\AR{\textbf{Oliver/Boris:} What is missing from the intro is why would someone care about bag-PDBs in {\em practice}? This is kinda obliquely referred to in the first para but it would be good to motivate this more. The intro (rightly) focuses on the theoretical reasons to study bag PDBs but what (if any) are the practical significance of getting bag PDBs done in linear-time? Would this lead to much faster real-life PDB systems?}
% \AR{\textbf{Oliver/Boris:} What is missing from the intro is why would someone care about bag-PDBs in {\em practice}? This is kinda obliquely referred to in the first para but it would be good to motivate this more. The intro (rightly) focuses on the theoretical reasons to study bag PDBs but what (if any) are the practical significance of getting bag PDBs done in linear-time? Would this lead to much faster real-life PDB systems?}
Modern production databases like Postgres and Oracle use bag semantics, while research on probabilistic databases (PDBs)~\cite{DBLP:series/synthesis/2011Suciu,BD05,DBLP:conf/icde/AntovaKO07a,DBLP:conf/sigmod/SinghMMPHS08} focuses predominantly on query evaluation under set semantics.
This is not surprising, as the conventional strategy for encoding the lineage of a query result --- a key component of query evaluation in PDBs --- makes computing typical statistics like marginal probabilities or moments easy (at worst linear in the size of the lineage) for bags and hence, perhaps not worthy of research attention, but hard (at worst exponential in the size of the lineage) for sets and hence, interesting from a research perspective.
However, conventional encodings of a result's lineage are typically large, and even for Bag-PDBs, computing such statistics from lineage formulas still has a higher complexity than answering queries in a deterministic (i.e., non-probabilistic) database.
In this paper, we formally prove this limitation of PDBs, and address it by proposing an approximation algorithm that, to the best of our knowledge, is the first $(1-\epsilon)$-approximation for expectations of counts to have a runtime within a constant factor of deterministic query processing.
As explainability and fairness become more relevant to the data science community, it is now more critical than ever to understand how reliable a dataset is.
Probabilistic databases (PDBs)~\cite{DBLP:series/synthesis/2011Suciu} are a compelling solution, but a major roadblock to their adoption remains:
PDBs are orders of magnitude slower than classical (i.e., deterministic) database systems~\cite{feng:2019:sigmod:uncertainty}.
A naive strategy might be to move from the theoretically simpler set-relational model~\cite{DBLP:series/synthesis/2011Suciu,DBLP:conf/sigmod/BoulosDMMRS05,DBLP:conf/icde/AntovaKO07a,DBLP:conf/sigmod/SinghMMPHS08} to the computationally simpler bag-relational model, mirroring a similar transition in deterministic databases decades ago.
However, after discarding a long-held approach to representing lineage, we prove that query processing in Bag-PDBs is \sharpwonehard.
This finding shows that even Bag-PDB query processing has a higher complexity than deterministic query processing, and opens a rich landscape of opportunities for research on approximate algorithms.
The fundamental challenge is lineage formulas, a key component of query processing in PDBs.
Under standard assumptions about how these are encoded, computing typical statistics like marginal probabilities or moments is easy (at worst linear in the size of the lineage) for bags and hence, perhaps not worthy of research attention, but hard (at worst exponential in the size of the lineage) for sets and hence, interesting from a research perspective.
However, conventional encodings of a result's lineage are typically large, and so even for Bag-PDBs, computing such statistics from lineage formulas still has a higher complexity than answering queries in a deterministic (i.e., non-probabilistic) database.
In this paper, we formally prove this limitation of PDBs, and address it by proposing an approximation algorithm that, to the best of our knowledge, is the first $(1-\epsilon)$-approximation for expectations of counts to have a runtime within a constant factor of deterministic query processing\footnote{
MCDB~\cite{jampani2008mcdb} is notable in that it is also a constant factor slower, but only guarantees additive rather than multiplicative bounds.
}.
Consider the dominant problem in Set-PDBs: Computing marginal probabilities, and the corresponding problem in Bag-PDBs: computing expectations of counts.
In work that addresses the former problem~\cite{DBLP:series/synthesis/2011Suciu}, the lineage of a query result tuple is a Boolean formula over random variables that captures the conditions under which the tuple appears in the result.
Computing the probability of the tuple appearing in the result is thus analogous to weighted model counting (a known \sharpphard problem).
In the corresponding problem for Bag-PDBs~\cite{kennedy:2010:icde:pip,DBLP:conf/vldb/AgrawalBSHNSW06,feng:2019:sigmod:uncertainty}, lineage is a polynomial over random variables that captures the multiplicity of the output tuple.
In the corresponding problem for Bag-PDBs~\cite{kennedy:2010:icde:pip,DBLP:conf/vldb/AgrawalBSHNSW06,feng:2019:sigmod:uncertainty,GL16}, lineage is a polynomial over random variables that captures the multiplicity of the output tuple.
Thus, the expectation of the multiplicity is the expectation of this polynomial.
Lineage in Set-PDBs is typically encoded in disjunctive normal form.
This representation is significantly larger than the query result sans lineage.
However, even with alternative encodings~\cite{FH13}, the limiting factor in computing marginal probabilities remains the probability computation itself, and not the lineage formula.
However, even with alternative encodings~\cite{DBLP:journals/vldb/FinkHO13}, the limiting factor in computing marginal probabilities remains the probability computation itself, and not the lineage formula.
The corresponding lineage encoding for Bag-PDBs is a polynomial in sum of products (SOP) form --- a sum of `clauses', each of which is the product of a set of integer or variable atoms.
Thanks to linearity of expectation, computing the expectation of a count query is linear in the number of clauses in the SOP polynomial.
Unlike Set-PDBs, however, when we consider compressed representations of this polynomial, the complexity landscape becomes much more nuanced and is \textit{not} linear in general.
Unlike Set-PDBs, however, when we consider compressed representations of this polynomial, the complexity landscape becomes much more nuanced and is \textit{not} linear in general.
Such compressed representations like Factorized Databases~\cite{10.1145/3003665.3003667,DBLP:conf/tapp/Zavodny11} or Arithmetic/Polynomial Circuits~\cite{arith-complexity}, are analogous to deterministic query optimizations (e.g. pushing down projections)~\cite{DBLP:conf/pods/KhamisNR16,10.1145/3003665.3003667}.
Thus, measuring the performance of a PDB algorithm in terms of the size of the \emph{compressed} lineage formula allows us to more closely relate the algorithm's performance to the complexity of query evaluation in a deterministic database.
@ -30,7 +39,7 @@ The initial picture is not good.
In this paper, we prove that computing expected counts is \emph{not} linear in the size of a compressed --- specifically a factorized~\cite{10.1145/3003665.3003667} --- lineage polynomial by reduction from counting $k$-matchings.
Thus, even bag PDBs do not enjoy the same computational complexity as deterministic databases.
This motivates our second goal, a linear time approximation algorithm for computing expected counts in a bag database, with complexity linear in the size of a factorized lineage formula.
As we will show, the size of the factorized
As we will show, the size of the factorized
lineage formula for a query --- and by extension, our approximation algorithm --- is proportional to the complexity of evaluating the same query on a comparable deterministic database instance~\cite{DBLP:conf/pods/KhamisNR16,10.1145/3003665.3003667}.
In other words, our approximation algorithm can estimate expected multiplicities for tuples in the result of an SPJU query with a complexity comparable to deterministic query-processing.
@ -41,26 +50,26 @@ In other words, our approximation algorithm can estimate expected multiplicities
%Figures, etc
%Relations for example 1
\begin{figure}[ht]
\begin{subfigure}{0.15\textwidth}
\begin{subfigure}{0.2\textwidth}
\centering
\begin{tabular}{ c | c c}
$\rel$ & A & $\Phi$\\
\begin{tabular}{ c | c c c}
$\rel$ & A & $\Phi_{set}$ & $\Phi_{bag}$\\
\hline
& a & $W_a$\\
& b & $W_b$\\
& c & $W_c$\\
& a & $W_a$ & $W_a$\\
& b & $W_b$ & $W_b$\\
& c & $W_c$ & $W_c$\\
\end{tabular}
%\caption{Atom 1 of query $\poly$ in ~\Cref{intro:ex}}
\label{subfig:ex-atom1}
\end{subfigure}
\begin{subfigure}{0.15\textwidth}
\begin{subfigure}{0.24\textwidth}
\centering
\begin{tabular}{ c | c c c}
$E$ & A & B & $\Phi$\\
\begin{tabular}{ c | c c c c}
$E$ & A & B & $\Phi_{set}$ & $\Phi_{bag}$ \\
\hline
& a & b & $\top$\\
& b & c & $\top$\\
& c & a & $\top$\\
& a & b & $\top$ & $1$\\
& b & c & $\top$ & $1$\\
& c & a & $\top$ & $1$\\
\end{tabular}
%\caption{Atom 3 of query $\poly$ in ~\Cref{intro:ex}}
\label{subfig:ex-atom3}
@ -98,24 +107,24 @@ In other words, our approximation algorithm can estimate expected multiplicities
%\end{figure}
\begin{Example}\label{ex:intro}
Consider the Tuple Independent ($\ti$) Set-PDB\footnote{Our work also handles Block Independent Disjoint Databases ($\bi$)~\cite{BD05,DBLP:series/synthesis/2011Suciu}, we return to this model later.} given in \Cref{fig:intro-ex} with two input relations $R$ and $E$.
Each input tuple is assigned an annotation (attribute $\Phi$): an independent random Boolean variable ($W_i$) or the constant $\top$.
Each assignment of values to variables ($\{\;W_a,W_b,W_c\;\}\mapsto \{\;\top,\bot\;\}$) \SF{Do we need to state the meaning of $\top$ and $\bot$? Also do we want to add bag annotation to Figure 1 too since we are discussing both sets and bags later?} identifies one \emph{possible world}, a deterministic database instance that contains exactly the tuples annotated by the constant $\top$ or by a variable assigned to $\top$.
The probability of this world is the joint probability of the corresponding assignments.
For example, let $P[W_a] = P[W_b] = P[W_c] = p$ and consider the possible world where $R = \{\;\tuple{a}, \tuple{b}\;\}$.
Consider the Tuple Independent ($\ti$) Set-PDB\footnote{Our work also handles Block Independent Disjoint Databases ($\bi$)~\cite{DBLP:conf/sigmod/BoulosDMMRS05,DBLP:series/synthesis/2011Suciu} and we return to this model later.} given in \Cref{fig:intro-ex} with two input relations $R$ and $E$.
Each input tuple is assigned an annotation (attribute $\Phi_{set}$): an independent random Boolean variable ($W_i$) or the constant $\top$.
% Each assignment of values to variables ($\{\;W_a,W_b,W_c\;\}\mapsto \{\;\top,\bot\;\}$) \SF{Do we need to state the meaning of $\top$ and $\bot$? Also do we want to add bag annotation to Figure 1 too since we are discussing both sets and bags later?} identifies one \emph{possible world}, a deterministic database instance that contains exactly the tuples annotated by the constant $\top$ or by a variable assigned to $\top$.
The probability of this world is the joint probability of the corresponding assignments.
For example, let $P[W_a] = P[W_b] = P[W_c] = p$ and consider the possible world where $R = \{\;\tuple{a}, \tuple{b}\;\}$.
The corresponding variable assignment is $\{\;W_a \mapsto \top, W_b \mapsto \top, W_c \mapsto \bot\;\}$, and the probability of this world is $P[W_a]\cdot P[W_b] \cdot P[\neg W_c] = p\cdot p\cdot (1-p)=p^2-p^3$.
\end{Example}
Prior efforts to generalize incomplete databases to bags~\cite{feng:2019:sigmod:uncertainty,DBLP:conf/pods/GreenKT07,DBLP:journals/sigmod/GuagliardoL17} replace the Boolean annotations with natural numbers.
Analogously, we generalize the above model of Set-PDBs to bags by using natural-number-valued random variables (i.e., $Dom(W_i) \subseteq \mathbb N$) and positive natural number constants.
Analogously, we generalize the above model of Set-PDBs to bags by using natural-number-valued random variables (i.e., $Dom(W_i) \subseteq \mathbb N$) and positive natural number constants ($\Phi_{bag}$ in the example).
Without loss of generality, we assume that input relations are sets (i.e. $Dom(W_i) = \{0, 1\}$), while query evaluation follows bag semantics.
We contrast bag and set query evaluation with the following example:
\begin{Example}\label{ex:bag-vs-set}
Continuing the prior example, we are given the following Boolean (resp., count) query
Continuing the prior example, we are given the following Boolean (resp., count) query
$$\poly() :- R(A), E(A, B), R(B)$$
The lineage of the result in a Set-PDB (resp., Bag-PDB) is a Boolean (resp., polynomial) formula over random variables annotating the input relations (i.e., $W_a$, $W_b$, $W_c$).
Because the Boolean query has only a nullary relation, we write $Q(\cdot)$ to denote the function mapping variable assignments to a concrete value for the lineage in the corresponding possible world:
Because the Boolean query has only a nullary relation, we write $Q(\cdot)$ to denote the function that evaluates the lineage over one specific assignment of values to the variables (i.e., the value of the lineage in the corresponding possible world):
\begin{align*}
\poly_{set}(W_a, W_b, W_c) &= W_aW_b \vee W_bW_c \vee W_cW_a\\
\poly_{bag}(W_a, W_b, W_c) &= W_aW_b + W_bW_c + W_cW_a
@ -128,17 +137,19 @@ The polynomials evaluate as:
&\poly_{bag}(1, 1, 0) = 1 \cdot 1 + 1\cdot 0 + 0 \cdot 1 = 1
\end{align*}
The Set-PDB query is satisfied in this possible world, while the Bag-PDB query produces a nullary tuple with a multiplicity of 1.
The marginal probability (resp., expected count) of this query is computed over all possible worlds:\AR{What is $\mu$ below?}
The marginal probability (resp., expected count) of this query is computed over all possible worlds:
% \AR{What is $\mu$ below?}
{\small
\begin{align*}
P[\poly_{set}] &= \sum_{w_i \in \{\top,\bot\}} \mu(\poly_{set}(w_a, w_b, w_c))P[W_a = w_a,W_b = w_b,W_c = w_c]\\
P[\poly_{set}] &= \hspace*{-1mm}
\sum_{w_i \in \{\top,\bot\}} \indicator{\poly_{set}(w_a, w_b, w_c)}P[W_a = w_a,W_b = w_b,W_c = w_c]\\
\expct[\poly_{bag}] &= \sum_{w_i \in \{0,1\}} \poly_{bag}(w_a, w_b, w_c)\cdot P[W_a = w_a,W_b = w_b,W_c = w_c]
\end{align*}
}
\end{Example}
Note that the query of \Cref{ex:bag-vs-set} in set semantics is indeed \sharpphard, since it non-hierarchical~\cite{10.1145/1265530.1265571}.
To see why computing this probability is hard, observe that the clauses of the disjunctive normal form Boolean lineage are neither independent nor disjoint, leading to e.g.~\cite{FH13} the use of Shannon decomposition, which is at worst exponential in the size of the input.
Note that the query of \Cref{ex:bag-vs-set} in set semantics is indeed non-hierarchical~\cite{10.1145/1265530.1265571}, and thus \sharpphard.
To see why computing this probability is hard, observe that the clauses of the disjunctive normal form Boolean lineage are neither independent nor disjoint, leading to e.g.~\cite{DBLP:journals/vldb/FinkHO13} the use of Shannon decomposition, which is at worst exponential in the size of the input.
% \begin{equation*}
% \expct\pbox{\poly(W_a, W_b, W_c)} = W_aW_b + W_a\overline{W_b}W_c + \overline{W_a}W_bW_c = 3\prob^2 - 2\prob^3
% \end{equation*}
@ -148,28 +159,29 @@ To see why computing this probability is hard, observe that the clauses of the d
%&W_aW_b \vee W_bW_c \vee W_cW_a
%= &W_a
%\end{align*}
Conversely, in Bag-PDBs, correlations between clauses of the SOP polynomial are not problematic thanks to linearity of expectation.
Conversely, in Bag-PDBs, correlations between clauses of the SOP polynomial are not problematic thanks to linearity of expectation.
The expectation computation over the output lineage is simply the sum of expectations of each clause.
For \Cref{ex:intro}, the expectation is simply
{\small
\begin{align*}
\expct\pbox{\poly(W_a, W_b, W_c)} &= \expct\pbox{W_aW_b} + \expct\pbox{W_bW_c} + \expct\pbox{W_cW_a}\\
\intertext{\normalsize
\intertext{\normalsize
In this particular lineage polynomial, all variables in each product clause are independent, so we can push expectations through.
}
&= \expct\pbox{W_a}\expct\pbox{W_b} + \expct\pbox{W_b}\expct\pbox{W_c} + \expct\pbox{W_c}\expct\pbox{W_a}
\end{align*}
}
Computing such expectations is indeed linear in the size of the SOP as the number of operations in the computation is \textit{exactly} the number of multiplication and addition operations of the polynomial.
As a further interesting feature of this example, note that $\expct\pbox{W_i} = P[W_i = 1]$, and so taking the same polynomial over the reals:
Computing such expectations is indeed linear in the size of the SOP as the number of operations in the computation is \textit{exactly} the number of multiplication and addition operations of the polynomial.
As a further interesting feature of this example, note that $\expct\pbox{W_i} = P[W_i = 1]$, and so taking the same polynomial over the reals:
\begin{multline}
\label{eqn:can-inline-probabilities-into-polynomial}
\expct\pbox{\poly_{bag}} = P[W_a = 1]P[W_b = 1] + P[W_b = 1]P[W_c = 1]\\
+ P[W_c = 1]P[W_a = 1]\\
= \poly_{bag}(P[W_a=1], P[W_b=1], P[W_c=1])
= \poly_{bag}(P[W_a=1], P[W_b=1], P[W_c=1])
\end{multline}
\begin{figure}[h!]
\resizebox{\columnwidth}{!}{
\begin{tikzpicture}[thick, level distance=0.9cm,level 1/.style={sibling distance=4.55cm}, level 2/.style={sibling distance=1.5cm}, level 3/.style={sibling distance=0.7cm}]% level/.style={sibling distance=6cm/(#1 * 1.5)}]
\node[tree_node](root){$\boldsymbol{\times}$}
child{node[tree_node]{$\boldsymbol{+}$}
@ -201,26 +213,26 @@ As a further interesting feature of this example, note that $\expct\pbox{W_i} =
}
};
\end{tikzpicture}
}
\caption{Expression tree for query $\poly^2$.}
\label{fig:intro-q2-etree}
\end{figure}
\subsection{Superlinearity of Bag PDBs}
Moving forward, we focus exclusively on bags and drop the subscript from $\poly_{bag}$.
Moving forward, we focus exclusively on bags and drop the subscript from $\poly_{bag}$.
Consider the Cartesian product of $\poly$ with itself:
\begin{equation*}
\poly^2() := \rel(A), E(A, B), \rel(B),\; \rel(C), E(C, D), \rel(D)
\end{equation*}
For an arbitrary polynomial, it is known that there may exist equivalent compressed representations.
For an arbitrary polynomial, it is known that there may exist equivalent compressed representations.
One such compression is the factorized polynomial~\cite{10.1145/3003665.3003667}, where the polynomial is broken up into separate factors.
For example:
{\small
\begin{equation*}
\poly^2(W_a, W_b, W_c) = \left(W_aW_b + W_bW_c + W_cW_a\right) \cdot \left(W_aW_b + W_bW_c + W_cW_a\right).
\poly^2(W_a, W_b, W_c) = \left(W_aW_b + W_bW_c + W_cW_a\right) \cdot \left(W_aW_b + W_bW_c + W_cW_a\right)
\end{equation*}
}
This factorized expression can be easily modeled as an expression tree, as in \Cref{fig:intro-q2-etree}
This factorized expression can be easily modeled as an expression tree, as in \Cref{fig:intro-q2-etree},
while the equivalent SOP representation is
\begin{equation*}
W_a^2W_b^2 + W_b^2W_c^2 + W_c^2W_a^2 + 2W_a^2W_bW_c + 2W_aW_b^2W_c + 2W_aW_bW_c^2.
@ -232,31 +244,31 @@ The expectation then is
&\qquad \expct\pbox{2W_a^2}\expct\pbox{W_b}\expct\pbox{W_c} + \expct\pbox{2W_a}\expct\pbox{W_b^2}\expct\pbox{W_c} +\\
&\qquad \expct\pbox{2W_a}\expct\pbox{W_b}\expct\pbox{W_c^2}\\
\end{align*}
In our original example, the lineage polynomial for $\poly$ had the nice property that the expected count could be computed by simply replacing each variable with its probability (\Cref{eqn:can-inline-probabilities-into-polynomial}).
In our original example, the lineage polynomial for $\poly$ had the nice property that the expected count could be computed by replacing each variable with its probability (i.e., \Cref{eqn:can-inline-probabilities-into-polynomial}).
This property does not hold for $\poly^2$ (i.e., $\expct\pbox{\poly^2} \neq \poly^2(P\pbox{W_a}, P\pbox{W_b}, P\pbox{W_c})$).
Nevertheless, it suggests that a similar closed form formula for the expected count might be possible.
Observe that under the assumption that $Dom(W_i) = \{0, 1\}$, it is generally true that for any $k$, $\expct\pbox{W_i^k} = \expct\pbox{W_i}$.
However, a similar closed form formula for the expected count might be possible.
Under the assumption that $Dom(W_i) = \{0, 1\}$, it is generally true that for any $k$, $\expct\pbox{W_i^k} = \expct\pbox{W_i}$.
This property leads us to consider another structure related to $\poly$.
% \AH{I don't know if we want to include the following statement: \par \emph{ bags are only hard with self-joins }
% \par Atri suggests a proof in the appendix regarding this claim.}
For any polynomial $\poly(\vct{X})$, we define the \emph{reduced polynomial} $\rpoly(\vct{X})$ to be the polynomial obtained by setting all exponents $e > 1$ in $\poly(\vct{X})$ to $1$.
For any polynomial $\poly(\vct{X})$, we define the \emph{reduced polynomial} $\rpoly(\vct{X})$ to be the polynomial obtained by setting all exponents $e > 1$ in $\poly(\vct{X})$ to $1$.
With $\poly^2$ as an example, we have:
\begin{align*}
\rpoly^2(W_a, W_b, W_c) =&\; W_aW_b + W_bW_c + W_cW_a + 2W_aW_bW_c + 2W_aW_bW_c\\
&+ 2W_aW_bW_c\\
=&\; W_aW_b + W_bW_c + W_cW_a + 6W_aW_bW_c
\end{align*}
\SF{Should this be like $\tilde{\poly^2}$ to avoid ambiguous?}
Observe that the reduced polynomial is a closed form formula for the expected count (i.e., $\expct\pbox{\poly^2} = \rpoly(P\pbox{W_a=1}, P\pbox{W_b=1}, P\pbox{W_c=1})$).
%\SF{Should this be like $\tilde{\poly^2}$ to avoid ambiguous?}
Note that the reduced polynomial is a closed form of the expected count (i.e., $\expct\pbox{\poly^2} = \rpoly(P\pbox{W_a=1}, P\pbox{W_b=1}, P\pbox{W_c=1})$).
Also note that the $\poly$ in~\Cref{ex:bag-vs-set} is already in reduced form.
The reduced form of a polynomial can be obtained in a linear scan over the clauses of a SOP encoding of the polynomial.
In prior work on PDBs, where this encoding is implicitly assumed, computing the expected count is linear in the size of the encoding.
In prior work on lineage-based Bag-PDBs~\cite{kennedy:2010:icde:pip,DBLP:conf/vldb/AgrawalBSHNSW06,yang:2015:pvldb:lenses} where this encoding is implicitly assumed, computing the expected count is linear in the size of the encoding.
In general however, compressed encodings of the polynomial can be exponentially smaller in $k$ for $k$-products --- the query $\poly^k$ obtained by taking the Cartesian product of $k$ copies of $\poly$ has a factorized encoding of size $6\cdot k$, while the SOP encoding is of size $2\cdot 3^k$.
This leads us to the \textbf{central question of this paper}:
\begin{quote}
{\em
Is it always the case that the expectation of an UCQ in a Bag-PDB can be computed in time linear in the size of the \emph{compressed} lineage polynomial?}
{\em
Is it always the case that the expectation of a UCQ in a Bag-PDB can be computed in time linear in the size of the \textbf{compressed} lineage polynomial?}
\end{quote}
If the answer is yes, then it is possible for Bag-PDBs to achieve performance competitive with deterministic databases.
The answer, unfortunately, is no, and an approximation algorithm is required.
@ -268,18 +280,18 @@ The answer, unfortunately, is no, and an approximation algorithm is required.
% The factorized output polynomial consists of a product of three identical three-way summations, while the SOP encoding is exponential --- $3^3$ clauses to be precise.
\subsection{Overview of our results and techniques}
Concretely, in this paper:
(i) We show that conjunctive queries over a bag-$\ti$ are hard (i.e., superlinear in the size of a compressed lineage encoding) by reduction from counting the number of $k$-matchings over an arbitrary graph;
Concretely, in this paper:
(i) We show that conjunctive queries over a bag-$\ti$ are hard (i.e., superlinear in the size of a compressed lineage encoding) by reduction from counting the number of $k$-matchings over an arbitrary graph;
(ii) We present a $(1\pm\epsilon)$-approximation algorithm for bag-$\ti$s and show that its complexity is linear in the size of the compressed lineage encoding;
(iii) We generalize the approximation algorithm to bag-$\bi$s, a more general model of probabilistic data;
(iv) We further generalize our results to higher moments, polynomial circuits, and prove RA+ queries, the processing time in approximation is within a constant factor of the same query processed deterministically.
(iv) We further generalize our results to higher moments, polynomial circuits, and prove that for RA+ queries, the processing time in approximation is within a constant factor of the same query processed deterministically.
Our hardness results follow by considering a suitable generalization of the lineage polynomial in Example~\ref{ex:bag-vs-set}. First it is easy to generalize the polynomial in Example~\ref{ex:bag-vs-set} to $\poly_G^k(X_1,\dots,X_n)$ that represents the edge set of a graph $G$ in $n$ vertices. Then $\inparen{\poly_G^k(X_1,\dots,X_n)}^k$ encodes as its monomials all subgraphs of $G$ with at most $k$ edges in it. This implies that the corresponding reduced polynomial $\rpoly_G^k(p,\dots,p)$ can be written as $\sum_{i=0}^2k c_i\cdot p^i$ and we observe that $c_{2k}$ is proportional to the number of $k$-matchings (computing which is \sharpwonehard\ ) in $G$. Thus, if we have access to $\rpoly_G^k(p_i,\dots,p_)$ for distinct values of $p_i$ for $0\le i\le 2k$, then we can setup a system of linear equations and compute $c_{2k}$ (and hence the number of $k$-matchings in $G$). This result, however, does not rule out the possibility that computing $\rpoly_G^k(p,\dots,p)$ for a {\em single specific} value of $p$ might be easy: indeed it is easy for $p=0$ or $p=1$. However, we are able to show that for any other value of $p$, computing $\rpoly_G^k(p,\dots,p)$ exactly will most probably require super-linear time. This reduction needs more work (and we cannot yet extend our results to $k>3$). Further, we have to rely on more recent conjectures in {\em fine-grained} complexity on e.g. the complexity of counting the number of triangles in $G$ and not more standard parameterized hardness like \sharpwonehard.
Our hardness results follow by considering a suitable generalization of the lineage polynomial in Example~\ref{ex:bag-vs-set}. First, it is easy to generalize the polynomial in Example~\ref{ex:bag-vs-set} to $\poly_G(X_1,\dots,X_n)$ that represents the edge set of a graph $G$ on $n$ vertices. Then $\poly_G^k(X_1,\dots,X_n) = \inparen{\poly_G(X_1,\dots,X_n)}^k$ encodes as its monomials all subgraphs of $G$ with at most $k$ edges. This implies that the corresponding reduced polynomial $\rpoly_G^k(p,\dots,p)$ can be written as $\sum_{i=0}^{2k} c_i\cdot p^i$ and we observe that $c_{2k}$ is proportional to the number of $k$-matchings in $G$ (computing which is \sharpwonehard). Thus, if we have access to $\rpoly_G^k(p_i,\dots,p_i)$ for distinct values of $p_i$ for $0\le i\le 2k$, then we can set up a system of linear equations and compute $c_{2k}$ (and hence the number of $k$-matchings in $G$). This result, however, does not rule out the possibility that computing $\rpoly_G^k(p,\dots,p)$ for a {\em single specific} value of $p$ might be easy: indeed it is easy for $p=0$ or $p=1$. However, we are able to show that for any other value of $p$, computing $\rpoly_G^k(p,\dots,p)$ exactly will most probably require super-linear time. This reduction needs more work (and we cannot yet extend our results to $k>3$). Further, we have to rely on more recent conjectures in {\em fine-grained} complexity on, e.g., the complexity of counting the number of triangles in $G$ and not more standard parameterized hardness like \sharpwonehard.
The starting point of our approximation algorithm was the simple observation that for any lineage polynomial $\poly(X_1,\dots,X_n)$, we have $\rpoly(1,\dots,1)=Q(1,\dots,1)$ and if all the coefficients of $\poly$ are constants then $\poly(X_1,\dots,X_n)$ (which can be easily computed in linear time) is a $p^k$ approximation to the value $\rpoly(p,\dots,p)$ that we are after. If $p$ and $k=\deg(\poly)$ are constants, then this gives a constant factor approximation. We then use sampling to get a better approximation factor of $(1\pm \eps)$: we sample monomials from $\poly(X_1,\dots,X_n)$ and do an appropriate weighted sum of their coefficients. Standard tail bounds then allow us to get our desired approximation scheme. To get a linear runtime, it turns out that we need the following properties from our compressed representation of $\poly$: (i) be able to compute $\poly(X_1,\dots,X_n)$ in linear time and (ii) be able to sample monomials from $\poly(X_1,\dots,X_n)$ quickly as well. For the ease of exposition, we start off with expression trees (see~\Cref{fig:intro-q2-etree} for an example) and show that they satisfy both of these properties. Later we show that it easy to show that these properties also extend to polynomial circuits as well (we essentially show that in the requires time bound, we can simulate access to the `unrolled' expression tree by considering the polynomial circuit).
The starting point of our approximation algorithm was the simple observation that for any lineage polynomial $\poly(X_1,\dots,X_n)$, we have $\rpoly(1,\dots,1)=\poly(1,\dots,1)$ and if all the coefficients of $\poly$ are constants, then $\poly(X_1,\dots,X_n)$ (which can be easily computed in linear time) is a $p^k$ approximation to the value $\rpoly(p,\dots,p)$ that we are after. If $p$ and $k=\deg(\poly)$ are constants, then this gives a constant factor approximation. We then use sampling to get a better approximation factor of $(1\pm \eps)$: we sample monomials from $\poly(X_1,\dots,X_n)$ and do an appropriate weighted sum of their coefficients. Standard tail bounds then allow us to get our desired approximation scheme. To get a linear runtime, it turns out that we need the following properties from our compressed representation of $\poly$: (i) be able to compute $\poly(X_1,\dots,X_n)$ in linear time and (ii) be able to sample monomials from $\poly(X_1,\dots,X_n)$ quickly as well. For the ease of exposition, we start off with expression trees (see~\Cref{fig:intro-q2-etree} for an example) and show that they satisfy both of these properties. Later we show that these properties extend to polynomial circuits as well (we essentially show that in the required time bound, we can simulate access to the `unrolled' expression tree by considering the polynomial circuit).
We also formalize our claim since our approximation algorithm runs in time linear in the size of the polynomial circuit, we show that we can approximate the expected output tuple multiplicities with only a $O(\log{Z})$ overhead (where $Z$ is the number of output tuples) over the runtime of a broad class of query processing algorithms. We also observe that our results trivially extend to the problem of computing higher moments of the tuple multiplicity (instead of just the expectation).
We also formalize our claim that, since our approximation algorithm runs in time linear in the size of the polynomial circuit, we can approximate the expected output tuple multiplicities with only an $O(\log{Z})$ overhead (where $Z$ is the number of output tuples) over the runtime of a broad class of query processing algorithms. We also observe that our results trivially extend to higher moments of the tuple multiplicity (instead of just the expectation).
\paragraph{Paper Organization.} We present some relevant background and set up our notation in~\Cref{sec:background}. We present our hardness results in~\Cref{sec:hard} and our approximation algorithm in~\Cref{sec:algo}. We present some (easy) generalizations of our results in~\Cref{sec:gen}. We do a quick overview of related work in~\Cref{sec:related-work} and conclude with some open questions in~\Cref{sec:concl-future-work}.

View File

@ -2,6 +2,9 @@
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% NOTATION
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%Circuits
\newcommand{\caseheading}[1]{\smallskip \noindent \textbf{#1}.~}
%%%%%
\newcommand{\wElem}{w} %an element of \vct{w}
\newcommand{\st}{\;|\;} %such that
\newcommand{\kElem}{k}%the kth element
@ -37,6 +40,7 @@
\newcommand{\mtrix}[1]{M_{#1}}
\newcommand{\dtrm}[1]{Det\left(#1\right)}
\newcommand{\tuple}[1]{\left<#1\right>}
\newcommand{\indicator}[1]{\onesymbol\inparen{#1}}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Query Classes
@ -143,6 +147,7 @@
%using \wVec for world bit vector notation
\newcommand{\poly}{Q}
\newcommand{\rpoly}{\widetilde{Q}}%r for reduced as in reduced 'Q'
\newcommand{\polyForTuple}{\poly_{\tup}}
\newcommand{\out}{output}%output aggregation over the output vector
\newcommand{\numocc}[2]{\#\left(#1, #2\right)}

View File

@ -128,7 +128,7 @@ sensitive=true
\email{atri@buffalo.edu}
\pagestyle{plain}
\begin{document}

View File

@ -1,10 +1,10 @@
%root:main.tex
%!TEX root=./main.tex
\section{Hardness of exact computation}
\label{sec:hard}
\AH{The notation used here is different than in~\Cref{sec:background}, in particular~\Cref{eq:expect-q-nx}. Maybe we should decide on a notation and try to stick to it as much as possible?}
\BG{We sometimes use $\expct_{\vct{X} \sim P}$ sometimes $\expct_{\vct{X}}$}
In this section, we will prove that computing $\expct\limits_{\vct{X} \sim \pd}\pbox{\poly(\vct{X})}$ for a \tis-lineage polynomial $\poly(\vct{X})$ generated from a project-join query is \sharpwonehard. Note that this implies hardness for \bis and general $\semNX$-PDBs. Furthermore, we demonstrate \Cref{sec:single-p} that the problem remains hard, even if $\pd(X_i) = p$ for all $X_i$ and some fixed valued $p$ as long as these conjectures hold. Finally, using popular hardness conjectures in fine-grained complexity we show that if these conjectures hold and except for the trivial choices of $p \in \{0,1\}$, the problem is hard for any given $p$.
In this section, we will prove that computing $\expct\limits_{\vct{X} \sim \pd}\pbox{\poly(\vct{X})}$ for a \ti-lineage polynomial $\poly(\vct{X})$ generated from a project-join query is \sharpwonehard. Note that this implies hardness for \bis and general $\semNX$-PDBs. Furthermore, we demonstrate in \Cref{sec:single-p} that the problem remains hard even if $\pd(X_i) = p$ for all $X_i$ and some fixed value $p$, as long as certain popular hardness conjectures in fine-grained complexity hold. More precisely, we show that if these conjectures hold, then, except for the trivial choices of $p \in \{0,1\}$, the problem is hard for any given $p$.
% We would like to argue for a compressed version of $\poly(\vct{X})$, in general $\expct\limits_{\vct{X} \sim \pd}\pbox{\poly(\vct{X})}$ even for tis, cannot be computed in linear time. We will argue two flavors of such a hardness result. In Section~\ref{sec:multiple-p}, we argue that computing the expected value exactly for all query polynommials $\poly(\vct{X})$ for multiple values of $p$ is \sharpwonehard. However, this does not rule out the possibility of being able to solve the problem for a any {\em fixed} value of $p$ in linear time. In Section~\ref{sec:single-p}, we rule out even this possibility (based on some popular hardness conjectures in fine-grained complexity).
@ -12,10 +12,9 @@ In this section, we will prove that computing $\expct\limits_{\vct{X} \sim \pd}\
\subsection{Preliminaries}
Our hardness results are based on (exactly) counting the number of occurrences of a fixed graph $H$ as a subgraph in $G$. Let $\numocc{G}{H}$ denote the number of occurrences of pattern $H$ in graph $G$. %, where, for example, $\numocc{G}{\ed}$ means the number of single edges in $G$.
In particular, we will consider the problems of computing the following counts (given $G$ as an input in its adjacency list representation): $\numocc{G}{\tri}$ (the number of triangles), $\numocc{G}{\threepath}$ (the number of $3$-paths), $\numocc{G}{\threedis}$ (the number of $3$-matchings or collection of three node disjoint edges) and its generalization $\numocc{G}{\kmatch}$ (the number of $k$-matchings or collections of $k$ node-disjoint edges).
Our hardness result in \Cref{sec:multiple-p} is based on the following hardness result:
In particular, we will consider the problems of computing the following counts (given $G$ as an input in its adjacency list representation): $\numocc{G}{\tri}$ (the number of triangles), $\numocc{G}{\threepath}$ (the number of $3$-paths), $\numocc{G}{\threedis}$ (the number of $3$-matchings or collection of three node-disjoint edges) and its generalization $\numocc{G}{\kmatch}$ (the number of $k$-matchings or collections of $k$ node-disjoint edges).
%
Our hardness result in \Cref{sec:multiple-p} is based on the following result:
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Theorem}[\cite{k-match}]
@ -26,7 +25,7 @@ Given a positive integer $k$ and an undirected graph $G$ with no self-loops or
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
The above result means that we cannot hope to count the number of $k$-matchings in $G=(V,E)$ in time $f(k)\cdot |V|^{O(1)}$ for any function $f$. In fact, all known algorithms to solve this problem take time $|V|^{\Omega(k)}$.
%
Our hardness result in Section~\ref{sec:single-p} is based on the following conjectured hardness result:
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@ -39,30 +38,30 @@ There exists a constant $\eps_0>0$ such that given an undirected graph $G=(V,E)$
The so-called {\em Triangle detection hypothesis} (cf.~\cite{triang-hard}), which states that detecting whether $G$ has a triangle or not takes time $\Omega\inparen{|E|^{4/3}}$, implies that in Conjecture~\ref{conj:graph} we can take $\eps_0\ge \frac 13$.
\AR{Need to add something about 3-paths and 3-matchings as well.}
Both of our hardness results use a query polynomial that is based on a simple encoding of the edges of a graph.
To prove our hardness result, consider a graph $G(V, E)$, where $|E| = m$, $|V| = \numvar$. Our query polynomial will have a variable $X_i$ for every $i$ in $[\numvar]$.
Now consider the polynomial
Both of our hardness results rely on a simple query polynomial encoding of the edges of a graph.
To prove our hardness result, consider a graph $G(V, E)$, where $|E| = m$, $|V| = \numvar$. Our query polynomial has a variable $X_i$ for every $i$ in $[\numvar]$.
Consider the polynomial
\[\poly_{G}(\vct{X}) = \sum\limits_{(i, j) \in E} X_i \cdot X_j.\]
The hard polynomial for our problem will be a suitable power $k\ge 3$ of the polynomial above, i.e.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Definition}
Let $G=([n],E)$ be a graph. Then for any $\kElem\ge 1$, define
For any graph $G=([n],E)$ and $\kElem\ge 1$, define
\[\poly_{G}^\kElem(X_1,\dots,X_n) = \left(\sum\limits_{(i, j) \in E} X_i \cdot X_j\right)^\kElem.\]
\end{Definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Our hardness results only need a \ti instance and further, we consider the special case when all the tuple probabilities (probabilities assigned by to $X_i$ by $\vct{p}$) are the same value. It is not too hard to see that we can encode the above polynomial in an expression tree of size $\Theta(km)$.
Our hardness results only need a \ti instance; we also consider the special case when all the tuple probabilities (probabilities assigned to $X_i$ by $\vct{p}$) are the same value. Note that this polynomial can be encoded in an expression tree of size $\Theta(km)$.
Following up on the discussion around Example~\ref{ex:intro}, it is easy to see that $\poly_{G}^\kElem(\vct{X})$ is the query polynomial corresponding to the following query:
Following on Example~\ref{ex:intro}, it is easy to see that $\poly_{G}^\kElem(\vct{X})$ is the query polynomial corresponding to the query:
\[\poly^k_G:- R(A_1),E(A_1,B_1),R(B_1),\dots,R(A_\kElem),E(A_\kElem,B_\kElem),R(B_\kElem)\]
where generalizaing the PDB instance in Example~\ref{ex:intro}, relation $R$ has $n$ tuples corresponding to each vertex in $V=[n]$ each with probability $p$ and $E(A,B)$ has tuples corresponding to the edges in $E$ (each with probability of $1$).\footnote{Technically, $\poly_{G}^\kElem(\vct{X})$ should have variables corresponding to tuples in $E$ as well but since they always are present with probability $1$, we drop those. Our argument also works when all the tuples in $E$ also are present with probability $p$ but to simplify notation we assign probability $1$ to edges.}
where, generalizing the PDB instance in Example~\ref{ex:intro}, relation $R$ has $n$ tuples corresponding to each vertex in $V=[n]$, each with probability $p$, and $E(A,B)$ has tuples corresponding to the edges in $E$ (each with probability of $1$).\footnote{Technically, $\poly_{G}^\kElem(\vct{X})$ should have variables corresponding to tuples in $E$ as well, but since they are always present with probability $1$, we drop those. Our argument also works when all the tuples in $E$ are also present with probability $p$, but to simplify notation we assign probability $1$ to edges.}
Note that this imples that our hard query polynomial can be created from a project-join query -- by contrast our approximation algorithm in \Cref{sec:algo} can handle lineage polynomials generated by union of select-project-join queries. % (i.e. we do not need union or select operator to derive our hardness result).
Note that this implies that our hard query polynomial can be created from a project-join query --- by contrast, our approximation algorithm in \Cref{sec:algo} can handle lineage polynomials generated by union of select-project-join (SPJU) queries. % (i.e. we do not need union or select operator to derive our hardness result).
%\AR{need discussion on the `tightness' of various params. First, this is for degree 6 poly-- while things are easy for say deg 2. Second this is for any fixed p. Finally, we only need project-join queries to get the hardness results. Also need to compare this with the generality of the approx upper bound results.}
\subsection{Multiple Distinct $\prob$ Values}
\label{sec:multiple-p}
Unless otherwise noted, all proofs for this section are in~\Cref{app:single-mult-p}.
We are now ready to present our main hardness result.
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@ -71,27 +70,17 @@ Computing $\rpoly_G^\kElem(\prob_i,\dots,\prob_i)$ for arbitrary $G$ and any $(2
\end{Theorem}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
We will prove the above result by reduction from the problem of computing the number of $k$-matchings in $G$. Given the current best-known algorithm for this counting problem, our results imply that unless the state-of-the-art $k$-matching algorithms are improved, we cannot hope to solve our problem in time better than $\Omega_k\inparen{m^{k/2}}$, which is only quadratically faster than expanding $\poly_{G}^\kElem(\vct{X})$ into its \abbrSMB form and then using \Cref{cor:expct-sop}. By contrast the approximation algorithm we present in \Cref{sec:algo} runtime is in$O_k\inparen{m}$ for this query (since it runs in linear-time on all lineage polynomials).
We will prove the above result by reduction from the problem of computing the number of $k$-matchings in $G$. Given the current best-known algorithm for this counting problem, our results imply that unless the state-of-the-art $k$-matching algorithms are improved, we cannot hope to solve our problem in time better than $\Omega_k\inparen{m^{k/2}}$, which is only quadratically faster than expanding $\poly_{G}^\kElem(\vct{X})$ into its \abbrSMB form and then using \Cref{cor:expct-sop}. By contrast the approximation algorithm we present in \Cref{sec:algo} has runtime $O_k\inparen{m}$ for this query (since it runs in linear-time on all lineage polynomials).
As mentioned earlier, we prove our hardness result by presenting a reduction from the problem of counting $\kElem$-matchings in a graph:
\begin{Lemma}\label{lem:qEk-multi-p}
Let $\prob_0,\ldots, \prob_{2\kElem}$ be distinct values in $(0, 1]$. Then given the values $\rpoly_{G}^\kElem(\prob_i,\ldots, \prob_i)$ for $0\leq i\leq 2\kElem$, the number of $\kElem$-matchings in $G$ can be computed in $O\inparen{\kElem^3}$ time.
\end{Lemma}
Before we prove the above Lemma, let us use it to prove~\Cref{thm:mult-p-hard-result}:
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{proof}[Proof of Theorem~\ref{thm:mult-p-hard-result}]
For the sake of contradiction, let us assume we can solve our problem in $f(\kElem)\cdot m^c$ time for some absolute constant $c$. Then given a graph $G$ we can compute the query polynomial $\rpoly_G^\kElem$ (in the obvious way) in $O(km)$ time. Then after we run our algorithm on $\rpoly_G^\kElem$, we get $\rpoly_{G}^\kElem(\prob_i,\ldots, \prob_i)$ for $0\leq i\leq 2\kElem$ in additional $f(\kElem)\cdot m^c$ time. \Cref{lem:qEk-multi-p} then computes the number of $k$-matchings in $G$ in $O(\kElem^3)$ time. Thus, overall we have an algorithm for computing the number of $k$-matchings in time
\begin{align*}
O(km) + f(\kElem)\cdot m^c + O(\kElem^3)
&\le \inparen{O(\kElem^3) + f(\kElem)}\cdot m^{c+1} \\
&\le \inparen{O(\kElem^3) + f(\kElem)}\cdot n^{2c+2},
\end{align*}
which contradicts \Cref{thm:k-match-hard}.
\end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Finally, we are ready to prove \Cref{lem:qEk-multi-p}:
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{proof}[Proof of \Cref{lem:qEk-multi-p}]
%It is trivial to see that one can readily expand the exponential expression by performing the $n^\kElem$ product operations, yielding the polynomial in the sum of products form of the lemma statement. By definition $\rpoly_{G}^\kElem$ reduces all variable exponents greater than $1$ to $1$. Thus, a monomial such as $X_i^\kElem X_j^\kElem$ is $X_iX_j$ in $\rpoly_{G}^\kElem$, and the value after substitution is $p_i\cdot p_j = p^2$. Further, that the number of terms in the sum is no greater than $2\kElem + 1$, can be easily justified by the fact that each edge has two endpoints, and the most endpoints occur when we have $\kElem$ distinct edges (such a subgraph is also known as a $\kElem$-matching), with non-intersecting points, a case equivalent to $p^{2\kElem}$.
@ -102,24 +91,30 @@ We first argue that $\rpoly_{G}^\kElem(\prob,\ldots, \prob) = \sum\limits_{i = 0
%\sum_{\substack{(i_1, j_1),\\\cdots,\\(i_\kElem, j_\kElem) \in E}}X_{i_1}X_{j_1}\cdots X_{i_\kElem}X_{j_\kElem}
%\end{equation*}
%Since each of $(i_1, j_1),\ldots, (i_\kElem, j_\kElem)$ are from $E$, it follows that the set of $\kElem!$ permutations of the $\kElem$ $X_iX_j$ pairs which form the monomial products are of degree $2\kElem$ with the number of distinct variables in an arbitrary monomial $\leq 2\kElem$.
By definition, $\rpoly_{G}^{\kElem}(\vct{X})$ sets every exponent $e > 1$ to $e = 1$, which means that $\degree(\rpoly_{G}^\kElem)\le \degree(\poly_G^\kElem)=2k$. Thus, if we think of $\prob$ as a variable, then $\rpoly_{G}^{\kElem}(\prob,\dots,\prob)$ is a univariate polynomial of degree at most $\degree(\rpoly_{G}^\kElem)\le 2k$. Thus, we can write
By definition, $\rpoly_{G}^{\kElem}(\vct{X})$ sets every exponent $e > 1$ to $e = 1$, which means that $\degree(\rpoly_{G}^\kElem)\le \degree(\poly_G^\kElem)= 2k$. Thus, if we think of $\prob$ as a variable, then $\rpoly_{G}^{\kElem}(\prob,\dots,\prob)$ is a univariate polynomial of degree at most $\degree(\rpoly_{G}^\kElem)\le 2k$. Thus, we can write
%thereby shrinking the degree a monomial product term in the SOP form of $\poly_{G}^{\kElem}(\vct{X})$ to the exact number of distinct variables the monomial contains. This implies that $\rpoly_{G}^\kElem$ is a polynomial of degree $2\kElem$ and hence $\rpoly_{G}^\kElem(\prob,\ldots, \prob)$ is a polynomial in $\prob$ of degree $2\kElem$. Then it is the case that
\begin{equation*}
\rpoly_{G}^{\kElem}(\prob,\ldots, \prob) = \sum_{i = 0}^{2\kElem} c_i \prob^i
\end{equation*}
We note that $c_i$ is {\em exactly} the number of monomials in the \abbrSMB expansion of $\poly_{G}^{\kElem}(\vct{X})$ composed of $i$ distinct variables.%, with $\prob$ substituted for each distinct variable
We note that $c_i$ is {\em exactly} the number of monomials in the SMB %\BG{\abbrSMB?}
expansion of $\poly_{G}^{\kElem}(\vct{X})$ composed of $i$ distinct variables.%, with $\prob$ substituted for each distinct variable
\footnote{Since $\rpoly_G^\kElem(\vct{X})$ does not have any monomial with degree $< 2$, it is the case that $c_0 = c_1 = 0$ but for the sake of simplicity we will ignore this observation.}
Given that we then have $2\kElem + 1$ distinct values of $\rpoly_{G}^\kElem(\prob_i,\ldots, \prob_i)$ for $0\leq i\leq2\kElem$, it follows that
%we then have $2\kElem + 1$ distinct rows of the form $\prob_i^0\ldots\prob_i^{2\kElem}$ which form a matrix $M$.
we have a linear system of the form $\vec{M} \cdot \vct{c} = \vct{b}$ where the $i$th row of $\vec{M}$ is $\inparen{\prob_i^0\ldots\prob_i^{2\kElem}}$, $\vct{c}$ is the coefficient vector $\inparen{c_0,\ldots, c_{2\kElem}}$, and $\vct{b}$ is the vector such that $\vct{b}[i] = \rpoly_{G}^\kElem(\prob_i,\ldots, \prob_i)$. In other words, matrix $\vec{M}$ is the Vandermonde matrix, from which it follows that we have a matrix with full rank (since the $p_i$'s are distinct), and we can solve the linear system in $O(k^3)$ time (say using Gaussian Elimination) to determine $\vct{c}$ exactly. Thus, after $O(k^3)$ work, we know $\vct{c}$ and in particular, $c_{2k}$ exactly. Next we show why we can compute $\numocc{G}{\kmatch}$ from $c_{2k}$ in $O(1)$ additional time.
we have a linear system of the form $\vec{M} \cdot \vct{c} = \vct{b}$ where the $i$th row of $\vec{M}$ is $\inparen{\prob_i^0\ldots\prob_i^{2\kElem}}$, $\vct{c}$ is the coefficient vector $\inparen{c_0,\ldots, c_{2\kElem}}$, and $\vct{b}$ is the vector such that $\vct{b}[i] = \rpoly_{G}^\kElem(\prob_i,\ldots, \prob_i)$.
In other words, matrix $\vec{M}$ is the Vandermonde matrix, from which it follows that we have a matrix with full rank (the $p_i$'s are distinct), and we can solve the linear system in $O(k^3)$ time (e.g., using Gaussian Elimination) to determine $\vct{c}$ exactly.
Thus, after $O(k^3)$ work, we know $\vct{c}$ and in particular, $c_{2k}$ exactly.
Next, we show why we can compute $\numocc{G}{\kmatch}$ from $c_{2k}$ in $O(1)$ additional time.
%Denote the number of $\kElem$-matchings in $G$ as $\numocc{G}{\kmatch}$.
We claim that $c_{2\kElem}$ is $\kElem! \cdot \numocc{G}{\kmatch}$. This can be seen intuitively by looking at the original factorized representation
\[\poly_{G}^\kElem(\vct{X}) = \sum_{\substack{(i_1, j_1),\\\cdots,\\(i_\kElem, j_\kElem) \in E}}X_{i_1}X_{j_1}\cdots X_{i_\kElem}X_{j_\kElem},\]
where across each of the $\kElem$ products, an arbitrary $\kElem$-matching can be selected $\prod_{i = 1}^\kElem \kElem = \kElem!$ times. Next, note that each $\kElem$-matching $(i_1, j_1)\ldots$ $(i_k, j_k)$ in $G$ corresponds to the monomial $\prod_{\ell = 1}^\kElem X_{i_\ell}X_{j_\ell}$ in $\poly_{G}^\kElem(\vct{X})$, with all indexes distinct. %Since each index is distinct, then each variable has an exponent $e = 1$ and this monomial survives in $\rpoly_{G}^{\kElem}(\vct{X})$ Since $\rpoly$ contains only exponents $e \leq 1$, the only degree $2\kElem$ terms that can exist in $\rpoly_{G}^\kElem$ are $\kElem$-matchings since every other monomial in $\poly_{G}^\kElem(\vct{X})$ has strictly less than $2\kElem$ distinct variables, which, as stated earlier implies that every other non-$\kElem$-matching monomial in $\rpoly_{G}^\kElem(\vct{X})$ has degree $< 2\kElem$.
Second, the only surviving monomials $\prod_{\ell = 1}^\kElem X_{i_\ell}X_{j_\ell}$ of degree exactly $2k$ in $\rpoly_{G}^{\kElem}(\vct{X})$ must have that all of $i_1,j_1,\dots,i_\kElem,j_\kElem$ are distinct in $\poly_{G}^{\kElem}(\vct{X})$. Then, by the last two statements, only monomials composed of $2k$ distinct variables in $\poly_{G}^{\kElem}(\vct{X})$ (and hence of degree $2\kElem$ in $\rpoly_{G}^{\kElem}(\vct{X})$) correspond to a $k$-matching in $G$.
\[\poly_{G}^\kElem(\vct{X}) = \sum_{\substack{(i_1, j_1),\cdots,(i_\kElem, j_\kElem) \in E}}X_{i_1}X_{j_1}\cdots X_{i_\kElem}X_{j_\kElem},\]
where across each of the $\kElem$ products, an arbitrary $\kElem$-matching can be selected $\prod_{i = 1}^\kElem i = \kElem!$ times.
Next, note that each $\kElem$-matching $(i_1, j_1)\ldots$ $(i_k, j_k)$ in $G$ corresponds to the monomial $\prod_{\ell = 1}^\kElem X_{i_\ell}X_{j_\ell}$ in $\poly_{G}^\kElem(\vct{X})$, with distinct indexes. %Since each index is distinct, then each variable has an exponent $e = 1$ and this monomial survives in $\rpoly_{G}^{\kElem}(\vct{X})$ Since $\rpoly$ contains only exponents $e \leq 1$, the only degree $2\kElem$ terms that can exist in $\rpoly_{G}^\kElem$ are $\kElem$-matchings since every other monomial in $\poly_{G}^\kElem(\vct{X})$ has strictly less than $2\kElem$ distinct variables, which, as stated earlier implies that every other non-$\kElem$-matching monomial in $\rpoly_{G}^\kElem(\vct{X})$ has degree $< 2\kElem$.
Second, the only surviving monomials $\prod_{\ell = 1}^\kElem X_{i_\ell}X_{j_\ell}$ of degree exactly $2k$ in $\rpoly_{G}^{\kElem}(\vct{X})$ must have that all of $i_1,j_1,\dots,i_\kElem,j_\kElem$ are distinct in $\poly_{G}^{\kElem}(\vct{X})$.
By the last two statements, only monomials composed of $2k$ distinct variables in $\poly_{G}^{\kElem}(\vct{X})$ (and hence of degree $2\kElem$ in $\rpoly_{G}^{\kElem}(\vct{X})$) correspond to a $k$-matching in $G$.
%It has already been established above that a $\kElem$-matching ($\kmatch$) has coefficient $c_{2\kElem}$. As noted, a $\kElem$-matching occurs when there are $\kElem$ edges, $e_1, e_2,\ldots, e_\kElem$, such that all of them are disjoint, i.e., $e_1 \neq e_2 \neq \cdots \neq e_\kElem$. In all $\kElem$ factors of $\poly_{G}^\kElem(\vct{X})$ there are $k$ choices from the first factor to select an edge for a given $\kElem$ matching, $\kElem - 1$ choices in the second factor, and so on throughout all the factors, yielding $\kElem!$ duplicate terms for each $\kElem$ matching in the expansion of $\poly_{G}^\kElem(\vct{X})$.
Notice that %we have $\kElem!$ duplicates of

View File

@ -3,9 +3,8 @@
%\onecolumn
\subsection{Reduced Polynomials and Equivalences}
Since we have shown that computing the expected multiplicity of a query result tuple is equivalent to computing the expectation of a polynomial (for that tuple) given a probability distribution over all possible assignments of variables in the polynomial to $\{0,1\}$, we focus on this problem exclusively from now on.
We now introduce some basic terminology for polynomials and then develop a reduced normal form for polynomials that preserves a polynomial expectation for probability distributions that stem from \bis or \tis.
Let us use the expression $(X + Y)^2$ as a running example in this section.
We now introduce some terminology for polynomials and develop a reduced form for polynomials --- a closed form of the polynomial's expectation over probability distributions derived from a \bi or \ti.
Throughout, we will use $(X + Y)^2$ as a running example.
% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% \begin{Definition}[Monomial]\label{def:monomial}
@ -18,42 +17,48 @@ Let us use the expression $(X + Y)^2$ as a running example in this section.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Definition}[Standard Monomial Basis]\label{def:smb}
A monomial is a product of a set of variables, each raised to a non-negative integer power.
A polynomial is in \termSMB (\abbrSMB) when it is of the form:
A monomial is a product of variable terms, each raised to a non-negative integer power.
A polynomial in \termSMB (\abbrSMB) has the form:
\[
\sum_{i=1}^n c_i \cdot m_i
\]
where each $c_i$ is a positive integer and each $m_i$ is a monomial and $m_i \neq m_j$ for $i \neq j$. Given a polynomial $\poly$ we denote its \abbrSMB as $\smbOf{\poly}$.
where each $c_i$ is a positive integer and each $m_i$ is a monomial and $m_i \neq m_j$ for $i \neq j$. The \abbrSMB of a polynomial $\poly$ is $\smbOf{\poly}$.
% fully expanded out such that no product of sums exist and where each unique monomial appears exactly once.
\end{Definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
The \abbrSMB for the running example is $X^2 +2XY + Y^2$. While $X^2 + XY + XY + Y^2$ is an expanded form of the expression, it is not the standard monomial basis since $XY$ appears more than once.
\BG{Maybe inline degree?}
% \BG{Maybe inline degree?}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Definition}[Degree]\label{def:degree}
The degree of polynomial $\poly(\vct{X})$ is the maximum sum of the exponents of a monomial, over all monomials in $\smbOf{\poly(\vct{X})}$.
The degree of polynomial $\poly(\vct{X})$ is the maximum sum of exponents, over all monomials in $\smbOf{\poly(\vct{X})}$.
\end{Definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
The degree of the running example polynomial is $2$. In this paper we consider only finite degree polynomials.
%
% Throughout this paper, we also make the following \textit{assumption}.
%
% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% \begin{Assumption}\label{assump:poly-smb}
% All polynomials considered are in standard monomial basis, i.e., $\poly(\vct{X}) = \sum\limits_{\vct{d} \in \mathbb{N}^\numvar}q_d \cdot \prod\limits_{i = 1, d_i \geq 1}^{\numvar}X_i^{d_i}$, where $q_d$ is the coefficient for the monomial encoded in $\vct{d}$ and $d_i$ is the $i^{th}$ element of $\vct{d}$.
% \end{Assumption}
% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
We call a polynomial $\query(\vct{X})$ a \emph{\bi-lineage polynomial} (\emph{\ti-lineage polynomial}, or simply lineage polynomial), if
\AH{Why is it required for the tuple to be n-ary? I think this slightly confuses me since we have n tuples.} there exists an n-ary $\raPlus$ query $\query$, \bi $\pxdb$ (\ti $\pxdb$, or $\semNX$-PDB $\pxdb$), and n-ary tuple $\tup$ such that $\query(\vct{X}) = \query(\pxdb)(\tup)$. % Before proceeding, note that the following is assume that polynomials are \bis (which subsume \tis as a special case).
Note that \tis are a special case of \bis and, thus, the following applies to \tis as well.
Recall that in a \bi $\pxdb$ with tuples $t_1, \ldots, t_n$, each input tuple $t_i$ is annotated with a unique variable $X_i$. The tuples of $\pxdb$ are partitioned into $\ell$ blocks $\block_1, \ldots, \block_\ell$ and each tuple $t_i$ is associated with a probability $\prob(\tup_i) = \pd[X_i = 1]$. Together with the assumptions that blocks are independent and that tuples from the same block are disjoint events, $\prob$ and the blocks induce the probability distribution $\pd$ of $\pxdb$.
%
We call a polynomial $\query(\vct{X})$ a \emph{\bi-lineage polynomial} (resp., \emph{\ti-lineage polynomial}, or simply lineage polynomial), if
%\AH{Why is it required for the tuple to be n-ary? I think this slightly confuses me since we have n tuples.}
% OK: agreed w/ AH, this can be treated as implicit
there exists an $\raPlus$ query $\query$, \bi $\pxdb$ (\ti $\pxdb$, or $\semNX$-PDB $\pxdb$), and tuple $\tup$ such that $\query(\vct{X}) = \query(\pxdb)(\tup)$. % Before proceeding, note that the following is assume that polynomials are \bis (which subsume \tis as a special case).
As they are a special case of \bis, the following applies to \tis as well.
Recall that in a \bi $\pxdb$ with tuples $t_1, \ldots, t_n$, each input tuple $t_i$ is annotated with a unique variable $X_i$.
Tuples of $\pxdb$ are partitioned into $\ell$ blocks $\block_1, \ldots, \block_\ell$ where tuple $t_i$ is associated with a probability $\prob(\tup_i) = \pd[X_i = 1]$.\footnote{
Note the deviation from the more common approach of defining a single independent, $[\abs{\block_i}+1]$-valued variable per block; here we define $\abs{\block_i}$ correlated variables per block.
}
Because blocks are independent and tuples from the same block are disjoint, $\prob$ and the blocks induce the probability distribution $\pd$ of $\pxdb$.
We will write a \bi-lineage polynomial $\poly(\vct{X})$ for a \bi with $\ell$ blocks as
$\poly(\vct{X})$ = $\poly(X_{\block_1, 1},\ldots, X_{\block_1, \abs{\block_1}},$ $\ldots, X_{\block_\ell, \abs{\block_\ell}})$, where $\abs{\block_i}$ denotes the size of $\block_i$, and $\block_{i, j}$ denotes tuple $j$ residing in block $i$ for $j$ in $[\abs{\block_i}]$.
\SF{Where is $\block_{i, j}$ used? Is it $X_{\block_{1, 1}}$ or $X_{\block_1, 1}$ ?}
$\poly(\vct{X})$ = $\poly(X_{\block_1, 1},\ldots, X_{\block_1, \abs{\block_1}},$ $\ldots, X_{\block_\ell, \abs{\block_\ell}})$, where $\abs{\block_i}$ denotes the size of $\block_i$, and $X_{\block_i, j}$ denotes the annotation of tuple $j$ residing in block $i$ for $j$ in $[\abs{\block_i}]$.
%\SF{Where is $\block_{i, j}$ used? Is it $X_{\block_{1, 1}}$ or $X_{\block_1, 1}$ ?}
% and the probability distribution of $\pxdb$ is uniquely determined based on a probability vector $\vct{p}$ that associates each tuple a probability
% variables are independent of each other (or disjoint if they are from the same block) and each variable $X$ is associated with a probability $\vct{p}(X) = \pd[X = 1]$. Thus, we are dealing with polynomials $\poly(\vct{X})$ that are annotations of a tuple in the result of a query $\query$ over a BIDB $\pxdb$ where $\vct{X}$ is the set of variables that occur in annotations of tuples of $\pxdb$.
@ -66,29 +71,30 @@ $\poly(\vct{X})$ = $\poly(X_{\block_1, 1},\ldots, X_{\block_1, \abs{\block_1}},$
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{definition}[Modding with a set]\label{def:mod-set}
Let $S$ be a {\em set} of polynomials over $\vct{X}$. Then $\poly(\vct{X})\mod{S}$ is the polynomial obtained by taking the mod of $\poly(\vct{X})$ over {\em all} polynomials in $S$ (the order does not matter).
Let $S$ be a {\em set} of polynomials over $\vct{X}$. Then $\poly(\vct{X})\mod{S}$ is the polynomial obtained by taking the mod of $\poly(\vct{X})$ over {\em all} polynomials in $S$ (order does not matter).
\end{definition}
For example when $S_0=\inset{X^2-X, Y^2-Y}$, taking the polynomial in~\cref{eq:poly-eg} mod $S_0$, we get $2X+3XY-2Y$.
For example when $S_0=\inset{X^2-X, Y^2-Y}$, taking the polynomial $2X^2 + 3XY - 2Y^2\mod S_0$ gives $2X+3XY-2Y$.
%
\begin{Definition}\label{def:mod-set-polys}
Given the set of BIDB variables $\inset{X_{b,i}}$, define
\[\mathcal{B}=\inset{X_{b,i}\cdot X_{b,j}|\text{ for every block } b \text{ and } i\ne j},\]
\[\mathcal{T}=\inset{X_{b,i}^2-X_{b,i}|\text{ for every block } b \text{ and } i}.\]
\[\mathcal{B}=\comprehension{X_{b,i}\cdot X_{b,j}}{\text{ for every block } b \text{ and } i\ne j}\]
\[\mathcal{T}=\comprehension{X_{b,i}^2-X_{b,i}}{\text{ for every block } b \text{ and } i}\]
\end{Definition}
%
\begin{Definition}[Reduced \bi Polynomials]\label{def:reduced-bi-poly}
Let $\poly(\vct{X})$ be a \bi-lineage polynomial.
The reduced form $\rpoly(\vct{X})$ of $\poly(\vct{X})$ is defined as
The reduced form $\rpoly(\vct{X})$ of $\poly(\vct{X})$ is:
\begin{equation*}
\rpoly(\vct{X}) = \smbOf{\poly(\vct{X})} \mod \mathcal{T} \mod \mathcal{B}%X_i^2 - X_i \mod X_{\block_s, t}X_{\block_s, u}
\end{equation*}
%for all $i$ in $[\numvar]$ and for all $s$ in $\ell$, such that for all $t, u$ in $[\abs{\block_s}]$, $t \neq u$.
\end{Definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Intuitively, in the reduced form all exponents $e > 1$ are reduced to $e = 1$ and, all monomials containing more than one variable from the same block $\block$ are dropped (tuples from the same block are disjoint events in \bis and, thus, any world containing more than one tuple from a block has $0$ probability and can be ignored). Note that for the special case of \tis, the second step (dropping monomials with variables from the same block) is not necessary since every block contains a single tuple.
%
Intuitively, in the reduced form, all exponents $e > 1$ are reduced to $e = 1$ and all monomials with multiple variables from the same block $\block$ are dropped (any world containing more than one tuple from a block has $0$ probability and can be ignored).
For the special case of \tis, the second step is not necessary since every block contains a single tuple.
Alternatively, one can think of $\rpoly$ as the \abbrSMB of $\poly(\vct{X})$ when the product operator is idempotent.
%
% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% \begin{Definition}[$\rpoly(\vct{X})$] \label{def:qtilde}
% Define $\rpoly(X_1,\ldots, X_\numvar)$ as the reduced version of $\poly(X_1,\ldots, X_\numvar)$, of the form
@ -97,10 +103,10 @@ Alternatively, one can think of $\rpoly$ as the \abbrSMB of $\poly(\vct{X})$ whe
% \[\poly(X_1,\ldots, X_\numvar) \mod X_1^2-X_1\cdots\mod X_\numvar^2 - X_\numvar.\]
% \end{Definition}
% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Example}\label{example:qtilde}
Consider $\poly(X, Y) = (X + Y)(X + Y)$ where $X$ and $Y$ are from different blocks. Then the expanded derivation for $\rpoly(X, Y)$ is
Consider $\poly(X, Y) = (X + Y)(X + Y)$ where $X$ and $Y$ are from different blocks. The expanded derivation for $\rpoly(X, Y)$ is
\begin{align*}
(&X^2 + 2XY + Y^2 \mod X^2 - X) \mod Y^2 - Y\\
= ~&X + 2XY + Y^2 \mod Y^2 - Y\\
@ -108,17 +114,19 @@ Consider $\poly(X, Y) = (X + Y)(X + Y)$ where $X$ and $Y$ are from different blo
\end{align*}
\end{Example}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
% Intuitively, $\rpoly(\textbf{X})$ is the \abbrSMB form of $\poly(\textbf{X})$ such that if any $X_j$ term has an exponent $e > 1$, it is reduced to $1$, i.e. $X_j^e\mapsto X_j$ for any $e > 1$.
%
%When considering $\bi$ input, it becomes necessary to redefine $\rpoly(\vct{X})$.
The usefulness of this reduction will become clear in \Cref{lem:exp-poly-rpoly}.
%
\noindent The usefulness of this reduction will become clear in \Cref{lem:exp-poly-rpoly}.
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Lemma}\label{lem:pre-poly-rpoly}
When $\poly(X_1,\ldots, X_\numvar) = \sum\limits_{\vct{d} \in \{0,\ldots, B\}^\numvar}q_{\vct{d}} \cdot \prod\limits_{\substack{i = 1\\s.t. d_i\geq 1}}^{\numvar}X_i^{d_i}$, we have then that $\rpoly(X_1,\ldots, X_\numvar) = \sum\limits_{\vct{d} \in \{0,\ldots, B\}^\numvar} q_{\vct{d}}\cdot\prod\limits_{\substack{i = 1\\s.t. d_i\geq 1}}^{\numvar}X_i$.
If
$\poly(X_1,\ldots, X_\numvar) = \sum\limits_{\vct{d} \in \{0,\ldots, B\}^\numvar}q_{\vct{d}} \cdot \prod\limits_{\substack{i = 1\\s.t. d_i\geq 1}}^{\numvar}X_i^{d_i}$
then
$\rpoly(X_1,\ldots, X_\numvar) = \sum\limits_{\vct{d} \in \eta} q_{\vct{d}}\cdot\prod\limits_{\substack{i = 1\\s.t. d_i\geq 1}}^{\numvar}X_i$ \;\;\; for some $\eta \subseteq \{0,\ldots, B\}^\numvar$.
\end{Lemma}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@ -127,13 +135,13 @@ Follows by the construction of $\rpoly$ in \cref{def:reduced-bi-poly}. \qed
\end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Note the following fact:
\noindent Note the following fact:
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Proposition}\label{proposition:q-qtilde} For any \bi-lineage polynomial $\poly(X_1, \ldots, X_\numvar)$ and all $\vct{w} \in \{0,1\}^\numvar$,
\[
\begin{Proposition}\label{proposition:q-qtilde} For any \bi-lineage polynomial $\poly(X_1, \ldots, X_\numvar)$ and all $\vct{w} \in \{0,1\}^\numvar$, it holds that
$% \[
\poly(\vct{w}) = \rpoly(\vct{w}).
\]
$% \]
\end{Proposition}
@ -149,7 +157,7 @@ Note the following fact:
Let $\pxdb$ be a \bi over variables $\vct{X} = \{X_1, \ldots, X_\numvar\}$ and with probability distribution $\vct{p} = (\prob_1, \ldots, \prob_\numvar)$. For any \bi-lineage polynomial $\poly(\vct{X})$ based on $\pxdb$ and some query $\query$ we have
% The expectation over possible worlds in $\poly(\vct{X})$ is equal to $\rpoly(\prob_1,\ldots, \prob_\numvar)$.
\begin{equation*}
\expct_{\vct{X}}\pbox{\poly(\vct{X})} = \rpoly(\vct{p}).
\expct_{\vct{W}}\pbox{\poly(\vct{W})} = \rpoly(\vct{p}).
\end{equation*}
\end{Lemma}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@ -170,7 +178,7 @@ to the variables $\vct{X}$. Intuitively, \Cref{lem:exp-poly-rpoly} states that w
\begin{Corollary}\label{cor:expct-sop}
If $\poly$ is a \bi-lineage polynomial, then the expectation of $\poly$, i.e., $\expct\pbox{\poly} = \rpoly\left(\prob_1,\ldots, \prob_\numvar\right)$, can be computed in $O(|\smbOf{\poly}|)$ time, where $|\poly|$ denotes the total number of multiplication/addition operators in $\poly$.
\end{Corollary}
\AH{What if $\poly$ is not in \abbrSMB form?}
%\AH{What if $\poly$ is not in \abbrSMB form?}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

View File

@ -13,27 +13,28 @@ Denote the schema of $\db$ as $\sch(\db)$. A \textit{probabilistic database} $\p
For a probabilistic database $\pdb = (\idb, \pd)$, the result of a query is the pair $(\query(\idb), \pd')$ where $\pd'$ is a probability distribution over $\query(\idb)$ that assigns to each possible query result the sum of the probabilities of the worlds that produce this answer:
\[\forall \db \in \query(\idb): \pd'(\db) = \sum_{\db' \in \idb: \query(\db') = \db} \pd(\db') \]
Note that in this work we consider multisets, i.e., each possible world is a set of multiset relations and queries are evaluated using bag semantics. We will use K-relations to model multisets. A \emph{K-relation}~\cite{DBLP:conf/pods/GreenKT07} is a relation whose tuples are annotated with elements from a commutative semiring $\semK = (\domK, \addK, \multK, \zeroK, \oneK)$. A commutative semiring is a structure with a domain $\domK$ and associative and commutative binary operations $\addK$ and $\multK$ such that $\multK$ distributes over $\addK$, $\zeroK$ is the identity of $\addK$, $\oneK$ is the identity of $\multK$, and $\zeroK$ annihilates all elements of $\domK$ when being combined with $\multK$.
Note that in this work we consider multisets, i.e., each possible world is a set of multiset relations and queries are evaluated using bag semantics. We will use K-relations to model multisets. A \emph{K-relation}~\cite{DBLP:conf/pods/GreenKT07} is a relation whose tuples are annotated with elements from a commutative semiring $\semK = (\domK, \addK, \multK, \zeroK, \oneK)$. A commutative semiring is a structure with a domain $\domK$ and associative and commutative binary operations $\addK$ and $\multK$ such that $\multK$ distributes over $\addK$, $\zeroK$ is the identity of $\addK$, $\oneK$ is the identity of $\multK$, and $\zeroK$ annihilates all elements of $\domK$ when combined by $\multK$.
Let $\udom$ be a countable domain of values.
Formally, an n-ary $\semK$-relation over $\udom$ is a function $\rel: \udom^n \to \domK$ with finite support $\support{\rel} = \{ \tup \mid \rel(\tup) \neq \zeroK \}$.
A $\semK$-database is a set of $\semK$-relations. It will be convenient to also interpret a $\semK$-database as a function from tuples to annotations. Thus, $\rel(t)$ ($\db(t)$) denotes the annotation associated by $\semK$-relation $\rel$ ($\semK$-database $\db$) to tuple $t$.
We review the semantics of positive relational algebra queries over $\semK$-relations below.
A $\semK$-database is a set of $\semK$-relations. It will be convenient to also interpret a $\semK$-database as a function from tuples to annotations. Thus, $\rel(t)$ (resp., $\db(t)$) denotes the annotation associated by $\semK$-relation $\rel$ ($\semK$-database $\db$) to $t$.
We review positive relational algebra semantics for $\semK$-relations below.
Consider the semiring $\semN = (\domN,+,\times,0,1)$ of natural numbers. $\semN$-databases are used to model bag semantics by annotating each tuple with its multiplicity. A probabilistic $\semN$-database ($\semN$-PDB) is a PDB where each possible world is an $\semN$-database. We will study the problem of evaluating statistical moments of query results over such databases. Specifically, given a probabilistic $\semN$-database $\pdb = (\idb, \pd)$, query $\query$, and possible result tuple $t$, we treat $\query(\db)(t)$ as a random $\semN$-valued variable and are interested in computing its expectation $\expct_{\idb \sim \pd}[\query(\db)(t)]$:
Consider the semiring $\semN = (\domN,+,\times,0,1)$ of natural numbers. $\semN$-databases model bag semantics by annotating each tuple with its multiplicity. A probabilistic $\semN$-database ($\semN$-PDB) is a PDB where each possible world is an $\semN$-database. We study the problem of computing statistical moments for query results over such databases. Specifically, given a probabilistic $\semN$-database $\pdb = (\idb, \pd)$, query $\query$, and possible result $t$, we treat $\query(\db)(t)$ as a random $\semN$-valued variable and are interested in computing its expectation $\expct_{\idb \sim \pd}[\query(\db)(t)]$:
%
\begin{align}\label{eq:bag-expectation}
\expct_{\idb \sim \pd}[\query(\db)(t)] = \sum_{\db \in \idb} \query(\db)(t) \cdot \pd(\db)
\end{align}
%
Intuitively, the expectation of $\query(\db)(t)$ is the number of duplicates of $t$ we expect to find in the result of query $\query$.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{$\semK$-relational Query Semantics}\label{sec:semnx-as-repr}
\subsection{Representation System and Semantics}\label{sec:semnx-as-repr}
\subsubsection{$\semK$-relational Query Semantics}
For completeness, we briefly review the semantics for $\raPlus$ queries over $\semK$-relations~\cite{DBLP:conf/pods/GreenKT07}.
We use $\evald{\cdot}{\db}$ to denote the result of evaluating query $\query$ over $\semK$-database $\db$. In the definition shown below, we assume that tuples are of appropriate arity and use $\project_A(\tup)$ to denote the projection of tuple $\tup$ on a list of attributes $A$. Furthermore, $\theta(\tup)$ denotes the (boolean) result of evaluating condition $\theta$ over $\tup$.
%
\begin{align*}
& \evald{\project_A(\rel)}{\db}(\tup) & & = & & \sum_{\tup': \project_A(\tup') = \tup} \evald{\rel}{\db}(\tup') \\
& \evald{(\rel_1 \union \rel_2)}{\db}(\tup) & & = & & \evald{\rel_1}{\db}(\tup) \addK \evald{\rel_2}{\db}(\tup) \\
@ -46,107 +47,50 @@ We use $\evald{\cdot}{\db}$ to denote the result of evaluating query $\query$ ov
\end{align*}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{$\semNX$ as a Representation System}\label{sec:semnx-as-repr}
\subsubsection{$\semNX$ as a Representation System}\label{sec:semnx-as-repr}
Let $\semNX$ denote the set of polynomials over variables $\vct{X}$ with natural number co-efficients and exponents.
Consider now the semiring $(\semNX, +, \cdot, 0, 1)$ whose domain is $\semNX$ and addition and multiplication are standard addition and multiplication of polynomials. We will utilize $\semNX$-databases $\db$ paired with probability distributions to represent $\semN$-PDBs.\BG{Need more motivation?} To justify the use of $\semNX$-databases, we need to show that we can encode any $\semN$-PDB in this way and that the query semantics over this representation coincides with query semantics over $\semN$-PDB. For that it will be opportune to define representation systems for $\semN$-PDBs.\BG{cite}
Before we proceed, unless otherwise mentioned, all subsequent proofs for~\Cref{sec:background} can be found in~\Cref{sec:proofs-background}.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Definition}[Representation System]\label{def:representation-syste}
A representation system for $\semN$-PDBs is a tuple $(\reprs, \rmod)$ where $\reprs$ is a set of representations and $\rmod$ associates with each $\repr \in \reprs$ an $\semN$-PDB $\pdb$. We say that a representation system is \emph{closed} under a class of queries $\qClass$ if for any query $\query \in \qClass$ we have:
%
\[ \rmod(\query(\repr)) = \query(\rmod(\repr)) \]
A representation system is \emph{complete} if for every $\semN$-PDB $\pdb$ there exists $\repr \in \reprs$ such that:
%
\[ \rmod(\repr) = \pdb \]
\end{Definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
As mentioned above we will use $\semNX$-databases paired with a probability distribution as a representation system.
We refer to such databases as $\semNX$-PDBs and use bold symbols to distinguish them from possible worlds (which are $\semN$-databases).
Formally, an $\semNX$-PDB is an $\semNX$-database $\db$ and a probability distribution $\pd$ over assignments $\assign$ of the variables $\vct{X} = \{X_1, \ldots, X_n\}$ occurring in annotations of $\db$ to $\{0,1\}$. Note that an assignment $\assign: \vct{X} \to \{0,1\}$ can be represented as a vector $\vct{w} \in \{0,1\}^n$ where $\vct{w}[i]$ records the value assigned to variable $X_i$. Thus, from now on we will solely use such vectors which we refer to as \emph{world vectors} and implicitly understand them to represent assignments. Given an assignment $\assign$ we use $\assign(\pxdb)$ to denote the semiring homomorphism $\semNX \to \semN$ that applies the assignment $\assign$ to all variables of a polynomial and evaluates the resulting expression in $\semN$.\BG{explain connection to homomorphism lifting in K-relations}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Definition}[$\semNX$-PDBs]\label{def:semnx-pdbs}
An $\semNX$-PDB $\pxdb$ over variables $\vct{X} = \{X_1, \ldots, X_n\}$ is a tuple $(\db,\pd)$ where $\db$ is an $\semNX$-database and $\pd$ is a probability distribution over $\vct{w} \in \{0,1\}^n$. We use $\assign_{\vct{w}}$ to denote the assignment corresponding to $\vct{w} \in \{0,1\}^n$. The $\semN$-PDB $\rmod(\pxdb) = (\idb, \pd')$ encoded by $\pxdb$ is defined as:
\begin{align*}
\idb & = \{ \assign_{\vct{w}}(\pxdb) \mid \vct{w} \in \{0,1\}^n \} \\
\pd'(\db) & = \sum_{\vct{w} \in \{0,1\}^n: \assign_{\vct{w}}(\pxdb) = \db} \pd(\vct{w})
\end{align*}
\end{Definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
For instance, consider a $\pxdb$ consisting of a single tuple $\tup_1 = (1)$ annotated with $X_1 + X_2$ with probability distribution $\pd([0,0]) = 0$, $\pd([0,1]) = 0$, $\pd([1,0]) = 0.3$ and $\pd([1,1]) = 0.7$. This $\semNX$-PDB encodes two possible worlds (with non-zero probability) that we denote using their world vectors.
%
\[
D_{[1,0]}(\tup_1) = 1 \hspace{0.3cm} \mathbf{and} \hspace{0.3cm} D_{[1,1]}(\tup_1) = 2
\]
%
Importantly, as the following proposition shows, any finite $\semN$-PDB can be encoded as an $\semNX$-PDB and $\semNX$-PDBs are closed under positive relational algebra queries, the class of queries we are interested in in this work.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Proposition}\label{prop:semnx-pdbs-are-a-}
$\semNX$-PDBs are a complete representation system for $\semN$-PDBs that is closed under $\raPlus$ queries.
\end{Proposition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Now let us consider computing the expected multiplicity of a tuple $\tup$ in the result of a query $\query$ over an $\semN$-PDB $\pdb$ using the annotation of $\tup$ in the result of evaluating $\query$ over an $\semNX$-PDB $\pxdb$ for which $\rmod(\pxdb) = \pdb$. The expectation of the polynomial $\poly = \query(\pxdb)(\tup)$ based on the probability distribution of $\pxdb$ over the variables in $\pxdb$ is:
\begin{equation}
\expct_{\vct{X} \sim \pd}\pbox{\poly(\vct{X})} = \sum_{\vct{w} \in \{0,1\}^n} \query(\assign_{\vct{w}}(\pxdb))(\tup) \cdot \pd(\vct{w})\label{eq:expect-q-nx}
\end{equation}
Since $\semNX$-PDBs $\pxdb$ are a complete representation system for $\semN$-PDBs which are closed under $\raPlus$, computing the expectation of the multiplicity of a tuple $t$ in the result of an $\raPlus$ query over the $\semN$-PDB $\rmod(\pxdb)$, is the same as computing the expectation of the polynomial $\query(\pxdb)(t)$.
Let $\semNX$ denote the set of polynomials over variables $\vct{X}$ with natural number coefficients and exponents.
Consider now the semiring $(\semNX, +, \cdot, 0, 1)$ whose domain is $\semNX$ and with the standard addition and multiplication of polynomials.
We will utilize an $\semNX$-PDB $\pxdb$, defined as the tuple $(\db, \pd)$, where $\semNX$-database $\db$ is paired with probability distribution $\pd$.
We denote by $\polyForTuple$ the annotation of tuple $t$ in the result of $\query$ (i.e., $\polyForTuple = \query(\pxdb)(t)$) and as before, interpret it as a function $\polyForTuple: \{0,1\}^{|\vct X|} \rightarrow \semN$ from vectors of variable assignments to the corresponding value of the annotating polynomial.
$\semNX$-PDBs and the function $\rmod$, which takes an $\semNX$-PDB as input and outputs an equivalent $\semN$-PDB, are formally defined in \Cref{subsec:supp-mat-background}.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Proposition}[Expectation of polynomials]\label{prop:expection-of-polynom}
Given an $\semN$-PDB $\pdb = (\idb,\pd)$ and $\semNX$-PDB $\pxdb = (\db,\pd')$ such that $\rmod(\pxdb) = \pdb$, we have:
\[ \expct_{\idb \sim \pd}[\query(\db)(t)] = \expct_{\vct{X} \sim \pd'}\pbox{\poly(\vct{X})} \]
Given an $\semN$-PDB $\pdb = (\idb,\pd)$ and $\semNX$-PDB $\pxdb = (\db,\pd')$ where $\rmod(\pxdb) = \pdb$:
\[ \expct_{\idb \sim \pd}[\query(\db)(t)] = \expct_{\vct{w} \sim \pd'}\pbox{\polyForTuple(\vct{w})} \]
\end{Proposition}
\noindent A formal proof of \Cref{prop:expection-of-polynom} is given in \Cref{subsec:expectation-of-polynom-proof}.
This proposition shows that computing the expected multiplicity of a query result tuple is equivalent to computing the expectation of a polynomial (for that tuple) from a probability distribution over all possible assignments of variables in the polynomial to $\{0,1\}$.
We focus on this problem exclusively from now on, assume an implicit result tuple, and drop the subscript from $\polyForTuple$ (i.e., $\poly$ is used as a polynomial from this point on).
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Two important subclasses of $\semNX$-PDBs that are of interest to us are the bag versions of tuple-independent databases (\tis) and block-independent databases (\bis). Under set semantics, a \ti is a deterministic database $\db$ where each tuple $\tup$ is assigned a probability $\prob(\tup)$. The set of possible worlds represented by a \ti $\db$ is all subsets of $\db$. The probability of each world is the product of the probabilities of all tuples that exist with one minus the probability of all tuples of $\db$ that are not part of this world, i.e., tuples are treated as independent random events. In a \bi, we also assign each tuple a probability, but additionally partition $\db$ into blocks. The possible worlds of a \bi $\db$ are all subsets of $\db$ that contain at most one tuple from each block. Note then that the tuples sharing the same block are disjoint, and the sum of the probabilities of all the tuples in the same block $\block$ is $1$. The probability of such a world is the product of the probabilities of all tuples present in the world. %and one minus the sum of the probabilities of all tuples from blocks for which no tuple is present in the world.
For bag \tis and \bis, we define the probability of a tuple to be the probability that the tuple exists with multiplicity at least $1$.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Definition}[\tis and \bis]\label{def:tidbs-and-bidbs}
A \emph{\ti} $\pxdb = (\db, \pd)$ is an $\semNX$-PDB such that (i) every tuple is annotated with either $0$ or a unique variable $X_i$ and (ii) the probability distribution $\pd$ is such that all variables are independent.
A \emph{\bi} $\pxdb = (\db, \pd)$ is an $\semNX$-PDB such that (i) every tuple is annotated with either $0$ or a unique variable $X_i$ and (ii) that the tuples $\tup$ of $\pxdb$ for which $\pxdb(\tup) \neq 0$ can be partitioned into a set of blocks such that variables from separate blocks are independent of each other and variables from the same blocks are disjoint events.
\BG{Should this be written in math?}
\end{Definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Note that the main difference to the standard definitions of \tis and \bis is that we define them as subclasses of $\semNX$-PDBs and that we use bag semantics. Even though tuples cannot occur more than once in the input \ti or \bi, they can occur with a multiplicity larger than one in the result of a query. Since in \tis and \bis, there is a one-to-one correspondence between tuples in the database and variables, we can interpret a vector $\vct{w} \in \{0,1\}^n$ as denoting which tuples exist in the possible world $\assign_{\vct{w}}(\pxdb)$ (the ones where $\vct{w}[i] = 1$). Denote the vector $\vct{p}$ to be a vector whose elements are the individual probabilities $\prob_i$ of each tuple $\tup_i$. Let $\pd^{(\vct{p})}$ denote the distribution induced by $\vct{p}$.
\subsubsection{\tis and \bis}
In this paper, we focus on two popular forms of PDB: Block-Independent (\bi) and Tuple-Independent (\ti) PDBs.
%
\begin{align}\label{eq:tidb-expectation}
\expct_{\vct{X} \sim \pd^{(\vct{p})}}\pbox{\poly(\vct{X})} = \sum\limits_{\vct{w} \in \{0, 1\}^\numvar} \poly(\vct{w})\prod_{\substack{i \in [\numvar]\\ s.t. \wElem_i = 1}}\prob_i \prod_{\substack{i \in [\numvar]\\s.t. w_i = 0}}\left(1 - \prob_i\right).
\end{align}
A \bi $\pxdb = (\db, \pd)$ is an $\semNX$-PDB such that (i) every tuple is annotated with either $0$ or a unique variable $X_i$ and (ii) that the tuples $\tup$ of $\pxdb$ for which $\pxdb(\tup) \neq 0$ can be partitioned into a set of blocks such that variables from separate blocks are independent of each other and variables from the same blocks are disjoint events.
%
\BG{Do we need the BIDB formula?}
\BG{Oliver's conjecture: Bag-\tis + Q can express any finite bag-PDB:
A well-known result for set semantics PDBs is that while not all finite PDBs can be encoded as \tis, any finite PDB can be encoded using a \ti and a query. An analog result holds in our case: any finite $\semN$-PDB can be encoded as a bag \ti and a query (WHAT CLASS? ADD PROOF)
}
A \emph{\ti} is a \bi where each block contains exactly one tuple.
\Cref{subsec:supp-mat-ti-bi-def} explains \tis and \bis in greater detail.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\input{poly-form.tex}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Expression Trees}\label{sec:expression-trees}
In the following we will make use of expression trees to encode polynomials which we define formally in this subsection.
In this section, we formally define expression trees, an encoding of polynomials that we use throughout much of the paper before generalizing to circuits in~\Cref{sec:gen}.
For illustrative purposes consider the polynomial $\poly(\vct{X}) = 2X_1^2 + 3X_1X_2 - 2X_2^2$ over $\vct{X} = [X_1, X_2]$.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@ -157,19 +101,18 @@ tree, whose internal nodes are from the set $\{+, \times\}$, with leaf nodes bei
\end{Definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
We ignore the remaining fields (\vari{partial} and \vari{weight}) for now. Their purpose will become clear in~\Cref{sec:algo}. Note that $\etree$ need not encode an expression in standard monomial basis. For instance, $\etree$ could represent a compressed form of the running example, such as $(X_1 + 2X_2)(2X_1 - X_2)$.
We ignore the remaining fields (\vari{partial} and \vari{weight}) until \Cref{sec:algo}. Note that $\etree$ need not encode an expression in standard monomial basis. For instance, $\etree$ could represent a compressed form of the running example, such as $(X_1 + 2X_2)(2X_1 - X_2)$.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Definition}[poly$(\cdot)$]\label{def:poly-func}
Let $\polyf(\cdot)$ denote the function that takes as input an expression tree $\etree$ and outputs its corresponding polynomial in \abbrSMB. $\polyf(\cdot)$ is recursively defined on $\etree$ as follows, where $\etree_\lchild$ and $\etree_\rchild$ denote the left and right child of $\etree$, respectively.
%
% \begin{align*}
% &\etree.\type = +\mapsto&& \polyf(\etree_\lchild) + \polyf(\etree_\rchild)\\
% &\etree.\type = \times\mapsto&& \polyf(\etree_\lchild) \cdot \polyf(\etree_\rchild)\\
% &\etree.\type = \var \text{ OR } \tnum\mapsto&& \etree.\val
% \end{align*}
%
\begin{equation*}
\polyf(\etree) = \begin{cases}
\polyf(\etree_\lchild) + \polyf(\etree_\rchild) &\text{ if \etree.\type } = +\\
@ -193,18 +136,16 @@ For our running example, $\etreeset{\smb} = \{2X_1^2 + 3X_1X_2 - 2X_2^2, (X_1 +
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Problem Definition}\label{sec:problem-definition}
We are now ready to formally state the main problem addressed in this work.
We are now ready to formally state our main problem.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Definition}[The Expected Result Multiplicity Problem]\label{def:the-expected-multipl}
Let $\vct{X} = (X_1, \ldots, X_n)$, and $\pdb$ be an $\semNX$-PDB over $\vct{X}$ with probability distribution $\pd$ over assignments $\vct{X} \to [0,1]$, $\query$ an $n$-ary query, and $t$ an $n$-ary tuple.
The \expectProblem is defined as follows:
\AH{I think we mean $\poly(\vct{X}) = \query(\pxdb)(t)$ instead of $\poly(\vct{X}) = \query(\pdb)(t)$. I changed the following to reflect this.}
\BG{Correct}
\begin{itemize}
\item \textbf{Input}: Given an expression tree $\etree \in \etreeset{\smb}$ for $\poly(\vct{X}) = \query(\pxdb)(t)$
\item \textbf{Output}: $\expct_{\vct{X} \sim \pd}[\poly(\vct{X})]$
\end{itemize}
% \AH{I think we mean $\poly(\vct{X}) = \query(\pxdb)(t)$ instead of $\poly(\vct{X}) = \query(\pdb)(t)$. I changed the following to reflect this.}
% \BG{Correct}
\\\hspace*{5mm}\textbf{Input}: An expression tree $\etree \in \etreeset{\smb}$ for $\poly(\vct{X}) = \query(\pxdb)(t)$
\\\hspace*{5mm}\textbf{Output}: $\expct_{\vct{X} \sim \pd}[\poly(\vct{X})]$
\end{Definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

View File

@ -1,5 +1,5 @@
%root: main.tex
%!TEX root=./main.tex
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Single $\prob$ value}
@ -15,7 +15,9 @@ Fix $p\in (0,1)$. Then assuming \Cref{conj:graph} is true, then any algorithm th
%\begin{proof}[Proof of Corollary ~\ref{th:single-p-gen-k}]
%Consider $\poly^3_{G}$ and $\poly' = 1$ such that $\poly'' = \poly^3_{G} \cdot \poly'$. By \Cref{th:single-p}, query $\poly''$ with $\kElem = 4$ has $\Omega(\numvar^{\frac{4}{3}})$ complexity.
%\end{proof}
The above shows the hardness for a very specific query polynomial but it is easy to come up with an infinite family of hard query polynomials by `embedding' $\rpoly_{G}^3$ into an infinite family of trivial query polynomials. However, unlike \Cref{thm:mult-p-hard-result} the above result does not show that computing $\rpoly_{G}^3(\prob,\dots,\prob)$ for a fixed $p\in (0,1)$ is \sharpwonehard. By contrast, in \Cref{sec:algo} we show that if we are willing to compute an approximation that this problem (and indeed solving our problem for a much more general setting) is in linear time.
The above shows the hardness for a very specific query polynomial but it is easy to come up with an infinite family of hard query polynomials by `embedding' $\rpoly_{G}^3$ into an infinite family of trivial query polynomials.
Unlike \Cref{thm:mult-p-hard-result} the above result does not show that computing $\rpoly_{G}^3(\prob,\dots,\prob)$ for a fixed $p\in (0,1)$ is \sharpwonehard.
However, in \Cref{sec:algo} we show that if we are willing to settle for an approximation, then this problem (and indeed our problem in a much more general setting) can be solved in linear time.
%\AH{@atri needs to put in the result for triangles of $\numvar^{\frac{4}{3}}$ runtime.}
We will prove the above result by the following reduction:
@ -26,23 +28,21 @@ If we can compute $\rpoly_{G}^3(\prob,\dots,\prob)$ exactly in $T(\numedge)$ tim
in $O\inparen{T(\numedge) + \numedge}$ time.
\end{Theorem}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
We now use \Cref{th:single-p} to prove \Cref{th:single-p-hard}.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{proof}[Proof of \Cref{th:single-p-hard}]
For the sake of contradiction, let us assume that for any $G$, we can compute $\rpoly_{G}^3(\prob,\dots,\prob)$ in $o\inparen{m^{1+\eps_0}}$ time.
For the sake of contradiction, assume that for any $G$, we can compute $\rpoly_{G}^3(\prob,\dots,\prob)$ in $o\inparen{m^{1+\eps_0}}$ time.
Let $G$ be the input graph. It is easy to see that one can compute the expression tree for $\poly_{G}^3(\vct{X})$ in $O(m)$ time. Then by \Cref{th:single-p} we can compute $\numocc{G}{\tri}$, $\numocc{G}{\threepath}$ and $\numocc{G}{\threedis}$ in further time $o\inparen{m^{1+\eps_0}}+O(m)$. Thus, overall, the reduction takes $o\inparen{m^{1+\eps_0}}+O(m)= o\inparen{m^{1+\eps_0}}$ time, which violates \Cref{conj:graph}.
\end{proof}
\qed
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Before moving on to prove \Cref{th:single-p}, let us state the results, lemmas and definitions that will be useful in the proof.
In other words, if \Cref{th:single-p} holds, then so must \Cref{th:single-p-hard}.
Before we move on to the proof itself, we state the results, lemmas, and definitions that will be useful in the proof.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection{Preliminaries and Notation}
We need to list all possible edge patterns in an arbitrary $G$ consisting of at most three distinct edges. We have already seen $\tri,\threepath$ and $\threedis$, so here we define the remaining patterns:
We need all the possible edge patterns in an arbitrary $G$ with at most three distinct edges. We have already seen $\tri,\threepath$ and $\threedis$, so we define the remaining patterns:
\begin{itemize}
\item Single Edge $\left(\ed\right)$
@ -50,14 +50,14 @@ We need to list all possible edge patterns in an arbitrary $G$ consisting of at
\item 2-matching ($\twodis$)
%\item Triangle ($\tri$)
%\item 3-path ($\threepath$)
\item 3-star ($\oneint$)--this is the graph that results when all three edges share exactly one common endpoint. The remaining endpoint for each edge is disconnected from any endpoint of the three edges.
\item 3-star ($\oneint$)--this is the graph that results when all three edges share exactly one common endpoint. The remaining endpoint for each edge is disconnected from any endpoint of the remaining two edges.
\item Disjoint Two-Path ($\twopathdis$)--this subgraph consists of a two path and a remaining disjoint edge.
%\item 3-matching ($\threedis$)--this subgraph is composed of three disjoint edges.
\end{itemize}
%Let $\numocc{G}{H}$ denote the number of occurrences of pattern $H$ in graph $G$, where, for example, $\numocc{G}{\ed}$ means the number of single edges in $G$.
For any graph $G$, the following formulas for $\numocc{G}{H}$ for their respective patterns can be used to compute them exactly in $O(\numedge)$ time, with $d_i$ representing the degree of vertex $i$ (proofs are in \Cref{app:easy-counts}):
For any graph $G$, the following formulas compute $\numocc{G}{H}$ for their respective patterns exactly in $O(\numedge)$ time, with $d_i$ representing the degree of vertex $i$ (proofs are in \Cref{app:easy-counts}):
\begin{align}
&\numocc{G}{\ed} = \numedge, \label{eq:1e}\\
&\numocc{G}{\twopath} = \sum_{i \in V} \binom{d_i}{2} \label{eq:2p}\\
@ -76,7 +76,7 @@ For any graph $G$, the following formulas for $\numocc{G}{H}$ for their respecti
\subsubsection{The proofs}
Note that $\rpoly_{G}^3(\prob,\ldots, \prob)$ as a polynomial in $\prob$ has degree at most six. Next, we figure out the exact coefficients since this would be useful in our arguments:
Note that $\rpoly_{G}^3(\prob,\ldots, \prob)$ as a polynomial in $\prob$ has degree at most six. Next, we figure out the exact coefficients (i.e., $c_i$) since this would be useful in our arguments:
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Lemma}\label{lem:qE3-exp}
%When we expand $\poly_{G}^3(\vct{X})$ out and assign all exponents $e \geq 1$ a value of $1$, we have the following result,
@ -91,18 +91,18 @@ For any $p$, we have:
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{proof}%[Proof of \Cref{lem:qE3-exp}]
By definition we have that
\[\poly_{G}^3(\vct{X}) = \sum_{\substack{(i_1, j_1),\\ (i_2, j_2),\\ (i_3, j_3) \in E}} \prod_{\ell = 1}^{3}X_{i_\ell}X_{j_\ell}.\]
Hence $\rpoly_{G}^3(\vct{X})$ has degree six. Note that the monomial $\prod_{\ell = 1}^{3}X_{i_\ell}X_{j_\ell}$ will contribute to the coefficient of $p^i$ in $\rpoly_{G}^3(\vct{X})$, where $i$ is the number of distinct variables in the monomial.
\[\poly_{G}^3(\vct{X}) = \sum_{\substack{(i_1, j_1), (i_2, j_2), (i_3, j_3) \in E}}~\; \prod_{\ell = 1}^{3}X_{i_\ell}X_{j_\ell}.\]
Hence $\rpoly_{G}^3(\vct{X})$ has degree six. Note that the monomial $\prod_{\ell = 1}^{3}X_{i_\ell}X_{j_\ell}$ will contribute to the coefficient of $p^\nu$ in $\rpoly_{G}^3(\vct{X})$, where $\nu$ is the number of distinct variables in the monomial.
%Rather than list all the expressions in full detail, let us make some observations regarding the sum.
Let $e_1 = (i_1, j_1), e_2 = (i_2, j_2), e_3 = (i_3, j_3)$. Notice that each expression in the sum consists of a triple $(e_1, e_2, e_3)$. There are three forms the triple $(e_1, e_2, e_3)$ can take (and in each case, we will account for their contribution to $\rpoly_{G}^3(\vct{X})$).
Let $e_1 = (i_1, j_1), e_2 = (i_2, j_2), e_3 = (i_3, j_3)$. Notice that each expression in the sum is a triple $(e_1, e_2, e_3)$. There are three forms the triple $(e_1, e_2, e_3)$ can take (and in each case, we account for their contribution to $\rpoly_{G}^3(\vct{X})$).
\textsc{case 1:} $e_1 = e_2 = e_3$, where all edges are the same. There are exactly $\numedge=\numocc{G}{\ed}$ such triples, each with a $\prob^2$ factor in $\rpoly_{G}^3\left(\prob,\ldots, \prob\right)$.
\textsc{case 1:} $e_1 = e_2 = e_3$ (all edges are the same). There are exactly $\numedge=\numocc{G}{\ed}$ such triples, each with a $\prob^2$ factor in $\rpoly_{G}^3\left(\prob,\ldots, \prob\right)$.
\textsc{case 2:} This case occurs when there are two distinct edges of the three, call them $e$ and $e'$. When there are two distinct edges, there is then the occurrence when $2$ variables in the triple $(e_1, e_2, e_3)$ are bound to $e$. There are three combinations for this occurrence in $\poly_{G}^3(\vct{X})$. Analogously, there are three such occurrences in $\poly_{G}^3(\vct{X})$ when there is only one occurrence of $e$, i.e., $2$ of the variables in $(e_1, e_2, e_3)$ are $e'$. %Again, there are three combinations for this.
This implies that all $3 + 3 = 6$ combinations of two distinct edges $e$ and $e'$ contribute to the same monomial in $\rpoly_{G}^3$. % consist of the same monomial in $\rpoly$, i.e. $(e_1, e_1, e_2)$ is the same as $(e_2, e_1, e_2)$.
Since $e\ne e'$, this case produces the following edge patterns: $\twopath, \twodis$, which contribute $p^3$ and $p^4$ respectively to $\rpoly_{G}^3\left(\prob,\ldots, \prob\right)$.
\textsc{case 3:} All $e_1,e_2$ and $e_3$ are distinct. For this case, we have $3! = 6$ permutations of $(e_1, e_2, e_3)$, each of which contribute to a different monomial in the SOP (see \Cref{def:expand-tree}) expansion of $\poly_{G}^3(\vct{X})$. This case consists of the following edge patterns: $\tri, \oneint, \threepath, \twopathdis, \threedis$, which contribute $p^3,p^4,p^4,p^5$ and $p^6$ respectively to $\rpoly_{G}^3\left(\prob,\ldots, \prob\right)$.
\textsc{case 3:} All $e_1,e_2$ and $e_3$ are distinct. For this case, we have $3! = 6$ permutations of $(e_1, e_2, e_3)$, each of which contributes to a different monomial in the SOP expansion of $\poly_{G}^3(\vct{X})$. This case consists of the following edge patterns: $\tri, \oneint, \threepath, \twopathdis, \threedis$, which contribute $p^3,p^4,p^4,p^5$ and $p^6$ respectively to $\rpoly_{G}^3\left(\prob,\ldots, \prob\right)$.
\end{proof}
\qed
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@ -183,12 +183,12 @@ from which we can compute $\numocc{G}{\tri}, \numocc{G}{\threepath}$ and $\numoc
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Due to lack of space we defer the proofs of the above results to \Cref{subsec:proofs-struc-lemmas}.
The above result immediately implies \Cref{th:single-p-hard}:
%
This result immediately implies \Cref{th:single-p-hard}:
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{proof}[Proof of \Cref{th:single-p-hard}]
It is easy to check that in $O(m)$ time we can compute $\graph{2}$ and $\graph{3}$ from $\graph{1}=G$ (and further note that these graphs also have $O(m)$ edges). Thus,
in time $O(T(m))$, we can compute $\rpoly_{\graph{\ell}}^3(\prob,\dots,\prob)$ for $\ell\in [3]$. \Cref{lem:lin-sys} then completes the proof.
We can compute $\graph{2}$ and $\graph{3}$ from $\graph{1}=G$ in $O(m)$ time (note that these graphs also have $O(m)$ edges). Thus,
in time $O(T(m))$, we have $\rpoly_{\graph{\ell}}^3(\prob,\dots,\prob)$ for $\ell\in [3]$ and \Cref{lem:lin-sys} completes the proof.
\end{proof}
\qed
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%