Merge branch 'master' of gitlab.odin.cse.buffalo.edu:ahuber/SketchingWorlds

2020-12-14 19:57:11 -06:00 · 2020-12-14 19:57:11 -06:00 · 6a811cefc7
parent b4e27c5f12 9f140d22eb
commit 6a811cefc7
2 changed files with 49 additions and 3 deletions
--- a/approx_alg.tex
+++ b/approx_alg.tex
@ -42,7 +42,7 @@ tree, whose internal nodes are from the set $\{+, \times\}$, with leaf nodes bei

 Note that $\etree$ need not encode an expression in the standard monomial basis.  For instance, $\etree$ could represent a compressed form of the polynomial in~\cref{eq:poly-eg}, such as $(x + 2y)(2x - y)$.

-\begin{Definition}[poly$(\cdot)$]\label{def:poly-func}
+\begin{Definition}[$\polyf(\cdot)$]\label{def:poly-func}
 Let $\polyf(\etree)$ denote the function that takes as input an expression tree $\etree$ and outputs its corresponding polynomial.  $\polyf(\cdot)$ is recursively defined on $\etree$ as follows, where $\etree_\lchild$ and $\etree_\rchild$ denote the left and right child of $\etree$, respectively.

 %	\begin{align*}
@ -154,8 +154,15 @@ Given an expression tree $\etree$ and $\vct{v} \in \mathbb{R}^\numvar$, $\etree(
 In the subsequent subsections we will prove the following theorem.

 \begin{Theorem}\label{lem:approx-alg}
-Let $\poly(\vct{X})$ be a query polynomial corresponding to the output of a UCQ in a BIDB. An estimate $\mathcal{E}$  of $\rpoly(\prob_1,\ldots, \prob_\numvar)$ can be computed in time $O\left(\treesize(\etree) + \frac{\log{\frac{1}{\conf}}\cdot \abs{\etree}^2(1,\ldots, 1)\cdot  k\cdot \log{k} \cdot depth(\etree))}{\error^2\cdot\rpoly^2(\prob_1,\ldots, \prob_\numvar)}\right)$, such that
-\[P\left(\left|\mathcal{E} - \rpoly(\prob_1,\dots,\prob_\numvar)\right|> \error \cdot \rpoly(\prob_1,\dots,\prob_\numvar)\right) \leq \conf.\]
+Let $\etree$ be an expression tree for a UCQ over a BIDB, define $\poly(\vct{X})=\polyf(\etree)$, and let $k=\deg(\poly)$.
+%Let $\poly(\vct{X})$ be a query polynomial corresponding to the output of a UCQ in a BIDB. 
+An estimate $\mathcal{E}$  of $\rpoly(\prob_1,\ldots, \prob_\numvar)$ can be computed in time 
+\[O\left(\treesize(\etree) + \frac{\log{\frac{1}{\conf}}\cdot \abs{\etree}^2(1,\ldots, 1)\cdot  k\cdot \log{k} \cdot depth(\etree)}{\error^2\cdot\rpoly^2(\prob_1,\ldots, \prob_\numvar)}\right),\]
+such that
+\begin{equation}
+\label{eq:approx-algo-bound}
+P\left(\left|\mathcal{E} - \rpoly(\prob_1,\dots,\prob_\numvar)\right|> \error \cdot \rpoly(\prob_1,\dots,\prob_\numvar)\right) \leq \conf.
+\end{equation}
 %with multiplicative $(\error,\delta)$-bounds, where $k$ denotes the degree of $\poly$.
 \end{Theorem}

@ -166,6 +173,25 @@ Given an expression tree $\etree$, define
 \end{Definition}
 \AR{Need to make sure use of indicator variable $\onesymbol$ above is consistent with the rest of the paper.}

+We next present a couple of corollaries of~\Cref{lem:approx-alg}.
+\begin{Corollary}
+\label{cor:approx-algo-const-p}
+Let $\poly(\vct{X})$ be as in~\Cref{lem:approx-alg} and let $\gamma=\gamma(\etree)$. Further let it be the case that $p_i\ge p_0$ for all $i\in[\numvar]$. Then an estimate $\mathcal{E}$  of $\rpoly(\prob_1,\ldots, \prob_\numvar)$ satisfying~\cref{eq:approx-algo-bound} can be computed in time
+\[O\left(\treesize(\etree) + \frac{\log{\frac{1}{\conf}}\cdot k\cdot \log{k} \cdot depth(\etree)}{\error^2\cdot(1-\gamma)^2\cdot p_0^{2k}}\right).\]
+In particular, if $p_0>0$ and $\gamma<1$ are absolute constants then the above runtime simplifies to $O_k\left(\frac 1{\eps^2}\cdot\treesize(\etree)\cdot \log{\frac{1}{\conf}}\right)$.
+\end{Corollary}
+We note that the restriction on $\gamma$ is satisfied by TIDB (where $\gamma=0$) and by some BIDB benchmarks (see~\Cref{sec:experiments} for more on this claim).
+\AR{{\bf Boris/Oliver:} Is there a way to claim that all probabilities in practice are actually constants: i.e. they do not increase with the number of  tuples?}
+
+\begin{proof}[Proof of~\Cref{cor:approx-algo-const-p}]
+The result follows by first noting that by definition of $\gamma$, we have 
+\[\rpoly(1,\dots,1)\ge (1-\gamma)\cdot \abs{\etree}(1,\dots,1).\] 
+Further, since each $p_i\ge p_0$ and $\poly(\vct{X})$ (and hence $\rpoly(\vct{X})$) has degree at most $k$, we have that
+\[ \rpoly(\prob_1,\dots,\prob_\numvar) \ge p_0^k\cdot \rpoly(1,\dots,1).\]
+The above two inequalities imply $\rpoly(\prob_1,\dots,\prob_\numvar) \ge p_0^k\cdot (1-\gamma)\cdot \abs{\etree}(1,\dots,1)$.
+Applying this bound in the runtime bound in~\Cref{lem:approx-alg} gives the first claimed runtime. The final runtime of $O_k\left(\frac 1{\eps^2}\cdot\treesize(\etree)\cdot \log{\frac{1}{\conf}}\right)$ follows by noting that $depth(\etree)\le \treesize(\etree)$ and absorbing all factors that just depend on $k$.
+\end{proof}
+
 \subsection{Approximating $\rpoly$}
 We state the approximation algorithm in terms of a $\bi$.
 \subsubsection{Description}
--- a/experiments.tex
+++ b/experiments.tex
@ -1 +1,21 @@
+% root: main.tex
+We ran our experiments using Windows 10 WSL Operating System on a machine with an Intel Core i7 2.40GHz processor with 16GB RAM.  All experiments used the PostgreSQL 13.0 database system.
+
+The intention of the experiments was to determine whether queries over $\bi$ instances in practice generate a lot of cancellations or not.  Recall that by definition of $\bi$, a query result cannot be derived by a self-join between tuples belonging to the same block.
+
+For this purpose we used the MayBMS data generator tool~\cite{pdbench} to generate uncertain versions of the TPC-H tables.  We then ran $\poly_1$, $\poly_2$, and $\poly_3$ from~\cite{U-relations}, all of which are modified versions of TPC-H queries $\poly_3$, $\poly_6$, and $\poly_7$ where all aggregations have been dropped.
+
+As written, the queries disallow $\bi$ cross terms.  We ran all queries, and then rewrote the queries so as not to filter out the cross terms.  The results show that in practice, there are few to no cancelling terms, as shown in \Cref{fig:experiment-bidb-cancel}.  \Cref{tbl:cancel} has the number of result tuples returned when the query filters out tuples that are cancelled by $\bi$ constraints, the number of output tuples when the cancelled tuples are included in the result, and the difference between the two.
+
+\begin{figure}[ht]
+		\begin{tabular}{ c | c c c}
+			Query & Cancellations Filtered & Cancellations Included & Difference\\
+			\hline
+			 $\poly_1$ & $46,714$ & $46,768$ & $54$\\
+			 $\poly_2$ & $179,917$ & $179,917$ & $0$\\
+			 $\poly_3$ & $11,535$ & $11,535$ & $0$\\
+		\end{tabular}
+	\caption{Number of Cancellations for Queries Over $\bi$.}
+	\label{fig:experiment-bidb-cancel}
+	\label{tbl:cancel}
+\end{figure}
 \AR{Experimental stuff about BIDB should go in here}