From 345d95b651f9dd11dd562afbf6c512c4223dceda Mon Sep 17 00:00:00 2001 From: Oliver Date: Wed, 7 Apr 2021 01:02:46 -0400 Subject: [PATCH] Read through: Space, grammar, notation fixes --- approx_alg.tex | 44 ++++++++++++++++++++++---------------- circuits-model-runtime.tex | 1 + macros.tex | 12 +++++------ main.tex | 2 +- mult_distinct_p.tex | 17 +++++++++------ poly-form.tex | 29 ++++++++++++------------- prob-def.tex | 24 ++++++++++++--------- ra-to-poly.tex | 8 +++++-- 8 files changed, 77 insertions(+), 60 deletions(-) diff --git a/approx_alg.tex b/approx_alg.tex index 641fdad..fa57d6c 100644 --- a/approx_alg.tex +++ b/approx_alg.tex @@ -23,7 +23,7 @@ We now introduce useful definitions and notation related to polynomials. We use \revision{ \begin{Definition}[Pure Expansion] -The pure expansion of a polynomial $\poly$ is formed by computing all product of sums occurring in $\poly$, without combining like monomials. The pure expansion of $\poly$ generalizes ~\Cref{def:smb} by allowing monomials $m_i = m_j$ for $i \neq j$. +The pure expansion of a polynomial $\poly$ is formed by computing all product of sums occurring in $\poly$, without combining like monomials. The pure expansion of $\poly$ generalizes \Cref{def:smb} by allowing monomials $m_i = m_j$ for $i \neq j$. \end{Definition} } @@ -45,18 +45,19 @@ $ \end{Definition} \revision{ -Note that similar in spirit to ~\Cref{def:reduced-bi-poly}, $\expansion{\circuit}$ reduces all variable exponents $e > 1$ to $e = 1$, though ~\Cref{def:reduced-bi-poly} is more general. +Note that similar in spirit to \Cref{def:reduced-bi-poly}, $\expansion{\circuit}$ reduces all variable exponents $e > 1$ to $e = 1$, though \Cref{def:reduced-bi-poly} is more general. +\OK{More general, how?} } In the following, we abuse notation and write $\monom$ to denote the monomial obtained as the products of the variables in the set. \begin{Example}\label{example:expr-tree-T} Consider the factorized representation $(X+ 2Y)(2X - Y)$ of the polynomial in~\Cref{eq:poly-eg}. -Its circuit $\etree$ is illustrated in Figure ~\ref{fig:expr-tree-T}. +Its circuit $\etree$ is illustrated in \cref{fig:circuit}. The pure expansion of the product is $2X^2 - XY + 4XY - 2Y^2$ and the $\expansion{\circuit}$ is $[(X, 2), (XY, -1), (XY, 4), (Y, -2)]$. \end{Example} $\expansion{\circuit}$ encodes the \emph{reduced} form of $\polyf\inparen{\circuit}$, decoupling each monomial into a set of variables $\monom$ and a real coefficient $\coef$. -Note, however, that unlike $\rpoly$, $\expansion{\circuit}$ does not need to be in SOP form. +However, unlike $\rpoly$, $\expansion{\circuit}$ does not need to be in SOP form. \begin{Definition}[Positive \circuit]\label{def:positive-circuit} For any circuit $\circuit$, the corresponding @@ -64,7 +65,7 @@ For any circuit $\circuit$, the corresponding \end{Definition} -Using the same factorization from ~\Cref{example:expr-tree-T}, $\polyf(\abs{\circuit}) = (X + 2Y)(2X + Y) = 2X^2 +XY +4XY + 2Y^2 = 2X^2 + 5XY + 2Y^2$. Note that this \textit{is not} the same as the polynomial from~\Cref{eq:poly-eg}. +Using the same factorization from \Cref{example:expr-tree-T}, $\polyf(\abs{\circuit}) = (X + 2Y)(2X + Y) = 2X^2 +XY +4XY + 2Y^2 = 2X^2 + 5XY + 2Y^2$. Note that this \textit{is not} the same as the polynomial from~\Cref{eq:poly-eg}. @@ -112,7 +113,9 @@ In the subsequent subsections we will prove the following theorem. 
\begin{Theorem}\label{lem:approx-alg} Let \circuit be a circuit for a UCQ over \bi and define $\poly(\vct{X})=\polyf(\circuit)$ and let $k=\degree(\circuit)$. Then an estimate $\mathcal{E}$ of $\rpoly(\prob_1,\ldots, \prob_\numvar)$ can be computed in time
+{\small
\[O\left(\left(\size(\circuit) + \frac{\log{\frac{1}{\conf}}\cdot \abs{\circuit}^2(1,\ldots, 1)\cdot k\cdot \log{k} \cdot \depth(\circuit)}{\inparen{\error'}^2\cdot\rpoly^2(\prob_1,\ldots, \prob_\numvar)}\right)\cdot\multc{\log\left(\abs{\circuit}(1,\ldots, 1)\right)}{\log\left(\size(\circuit)\right)}\right)\]
+}
such that
\begin{equation}
\label{eq:approx-algo-bound}
@@ -120,16 +123,16 @@ such that
\end{equation}
\end{Theorem}

-\noindent The proof of~\Cref{lem:approx-alg} (which relies on ~\Cref{lem:one-pass} and ~\Cref{lem:sample}) can be found in~\Cref{sec:proof-lem-approx-alg}. The proofs for the referenced lemmas are also found in ~\Cref{sec:proof-one-pass} and ~\Cref{sec:proof-sample-monom}.
+\noindent The proof of~\Cref{lem:approx-alg} (which relies on \Cref{lem:one-pass} and \Cref{lem:sample}) can be found in~\Cref{sec:proof-lem-approx-alg}. The proofs for the referenced lemmas are also found in \Cref{sec:proof-one-pass} and \Cref{sec:proof-sample-monom}.

-To get linear runtime results from~\Cref{lem:approx-alg}, we will need to define another parameter modeling the (weighted) number of monomials in $\expansion{\circuit}$ to be `canceled' when it is modded with $\mathcal{B}$ (\Cref{def:mod-set-polys}):
+To get linear runtime results from~\Cref{lem:approx-alg}, we will need to define another parameter modeling the (weighted) number of monomials in $\expansion{\circuit}$ to be `canceled' when it is modded with $\mathcal{B}$ (\Cref{def:mod-set-polys}).

\begin{Definition}[Parameter $\gamma$]\label{def:param-gamma}
Given a circuit $\circuit$, define
\[\gamma(\circuit)=\frac{\sum_{(\monom, \coef)\in \expansion{\circuit}} \abs{\coef}\cdot \indicator{\monom\mod{\mathcal{B}}\equiv 0}}{\abs{\circuit}(1,\ldots, 1)}\]
\end{Definition}

-\noindent We next present couple of corollaries of~\Cref{lem:approx-alg}.
+\noindent We next present a few corollaries of~\Cref{lem:approx-alg}.
\begin{Corollary}
\label{cor:approx-algo-const-p}
Let $\poly(\vct{X})$ be as in~\Cref{lem:approx-alg} and let $\gamma=\gamma(\circuit)$. Further let it be the case that $\prob_i\ge \prob_0$ for all $i\in[\numvar]$.
Then an estimate $\mathcal{E}$ of $\rpoly(\prob_1,\ldots, \prob_\numvar)$ satisfying~\Cref{eq:approx-algo-bound} can be computed in time
@@ -147,7 +150,7 @@ Finally, we address the $\multc{\log\left(\abs{\circuit}(1,\ldots, 1)\right)}{\l
\label{lem:val-ub}
For any circuit $\circuit$ with $\degree(\circuit)=k$, we have
\[\abs{\circuit}(1,\ldots, 1)\le 2^{2^k\cdot \size(\circuit)}.\]
-Further, under the following conditions:
+Further, under either of the following conditions:
\begin{enumerate}
	\item $\circuit$ is a tree,
	\item $\circuit$ encodes the run of the algorithm in~\cite{DBLP:conf/pods/KhamisNR16} on an FAQ query,
@@ -156,7 +159,7 @@ we have
\[\abs{\circuit}(1,\ldots, 1)\le \size(\circuit)^{O(k)}.\]
\end{Lemma}

-Note that the above implies that with the assumption $\prob_0>0$ and $\gamma<1$ are absolute constants from Corollary~\Cref{cor:approx-algo-const-p}, then the runtime there simplies to $O_k\left(\frac 1{\inparen{\error'}^2}\cdot\size(\circuit)^2\cdot \log{\frac{1}{\conf}}\right)$ for general circuits $\circuit$ and to $O_k\left(\frac 1{\inparen{\error'}^2}\cdot\size(\circuit)\cdot \log{\frac{1}{\conf}}\right)$ for the case when $\circuit$ satisfies the special conditions in~\Cref{lem:val-ub}. In~\Cref{app:proof-lem-val-ub} we argue that these conditions are very general and encompass many interesting scenarios.
+Note that the above implies that if the $\prob_0$ and $\gamma$ of \Cref{cor:approx-algo-const-p} are absolute constants (with $\prob_0>0$ and $\gamma<1$), then the runtime there simplifies to $O_k\left(\frac 1{\inparen{\error'}^2}\cdot\size(\circuit)^2\cdot \log{\frac{1}{\conf}}\right)$ for general circuits $\circuit$ and to $O_k\left(\frac 1{\inparen{\error'}^2}\cdot\size(\circuit)\cdot \log{\frac{1}{\conf}}\right)$ for the case when $\circuit$ satisfies the special conditions in~\Cref{lem:val-ub}. In~\Cref{app:proof-lem-val-ub} we argue that these conditions are very general and encompass many interesting scenarios.

\subsection{Approximating $\rpoly$}

@@ -171,7 +174,7 @@ The number of samples is computed by (see \Cref{app:subsec-th-mon-samp}):
2\exp{\left(-\frac{\samplesize\error^2}{2}\right)}\leq \conf \implies\samplesize \geq \frac{2\log{\frac{2}{\conf}}}{\error^2}.
\end{equation*}

-To summarize, \approxq modifies \circuit with a call to \onepass. It then samples from \circuit $\numsamp$ times and uses that information to approximate $\rpoly$.
+To summarize, \approxq modifies \circuit with a call to \onepass. It then draws $\numsamp$ samples from \circuit and uses them to approximate $\rpoly$.

\begin{algorithm}[t]
\caption{$\approxq(\circuit, \vct{p}, \conf, \error)$}
@@ -185,11 +188,11 @@ To summarize, \approxq modifies \circuit with a call to \onepass.  It then sampl
\State $\accum \gets 0$\label{alg:mon-sam-global1}
\State $\numsamp \gets \ceil{\frac{2 \log{\frac{2}{\conf}}}{\error^2}}$\label{alg:mon-sam-global2}
- \State $(\circuit_\vari{mod}, \vari{size}) \gets $ \onepass($\circuit$)\label{alg:mon-sam-onepass}\Comment{$\onepass$ is ~\Cref{alg:one-pass-iter}}
+ \State $(\circuit_\vari{mod}, \vari{size}) \gets $ \onepass($\circuit$)\label{alg:mon-sam-onepass}\Comment{$\onepass$ is \Cref{alg:one-pass-iter}}
\For{$\vari{i} \in 1 \text{ to }\numsamp$}\label{alg:sampling-loop}\Comment{Perform the required number of samples}
\State $(\vari{M}, \vari{sgn}_\vari{i}) \gets $ \sampmon($\circuit_\vari{mod}$)\label{alg:mon-sam-sample}
- \State\Comment{\sampmon \; is ~\Cref{alg:sample}}
+ \State\Comment{\sampmon \; is \Cref{alg:sample}}
\If{$\vari{M}$ has at most one variable from each block}\label{alg:check-duplicate-block}
\State $\vari{Y}_\vari{i} \gets \prod_{X_j\in\var\inparen{\vari{M}}}p_j$\label{alg:mon-sam-assign1}
\State $\vari{Y}_\vari{i} \gets \vari{Y}_\vari{i} \times\; \vari{sgn}_\vari{i}$\label{alg:mon-sam-product}
@@ -205,19 +208,22 @@ To summarize, \approxq modifies \circuit with a call to \onepass.  It then sampl
\subsubsection{Correctness}

In order to prove~\Cref{lem:approx-alg}, we will need to argue the correctness of~\Cref{alg:mon-sam}. Before we formally do that,
-we first state the lemmas that summarize the relevant properties of $\onepass$ and $\sampmon$, the auxiliary algorithms on which ~\Cref{alg:mon-sam} relies.
+we first state the lemmas that summarize the relevant properties of $\onepass$ and $\sampmon$, the auxiliary algorithms on which \Cref{alg:mon-sam} relies.

\begin{Lemma}\label{lem:one-pass}
-The $\onepass$ function completes in $O\left(size(\circuit) \cdot \multc{\log\left(\abs{\circuit(1\ldots, 1)}\right)}{\log{\size(\circuit}}\right)$ time. %, where $N = \size(\circuit)$.\footnote{In the appendix we give a sufficient condition when $\abs{\circuit}(1,\ldots, 1)$ is indeed $O(1)$ in arithmetic computations. Most notably, WCOJ and FAQ results are not affected by the general runtime of arithmetic computations, a point which we also address in the appendix.}
+The $\onepass$ function completes in time:
+$$O\left(\size(\circuit) \cdot \multc{\log\left(\abs{\circuit}(1,\ldots, 1)\right)}{\log\left(\size(\circuit)\right)}\right)$$
+%, where $N = \size(\circuit)$.\footnote{In the appendix we give a sufficient condition when $\abs{\circuit}(1,\ldots, 1)$ is indeed $O(1)$ in arithmetic computations. Most notably, WCOJ and FAQ results are not affected by the general runtime of arithmetic computations, a point which we also address in the appendix.}
$\onepass$ guarantees two post-conditions: First, for each subcircuit $\vari{S}$ of $\circuit$, we have that $\vari{S}.\vari{partial}$ is set to $\abs{\vari{S}}(1,\ldots, 1)$. Second, when $\vari{S}.\type = \circplus$, \subcircuit.\lwght $= \frac{\abs{\subcircuit_\linput}(1,\ldots, 1)}{\abs{\subcircuit}(1,\ldots, 1)}$ and likewise for \subcircuit.\rwght.
\end{Lemma}

To prove correctness of~\Cref{alg:mon-sam}, we only use the following fact that follows from the above lemma: for the modified circuit $\circuit_{\vari{mod}}$, $\circuit_{\vari{mod}}.\vari{partial}=\abs{\circuit}(1,\dots,1)$.
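+For intuition, the following illustration works out these quantities on the running-example circuit for $(X+2Y)(2X-Y)$ from \Cref{example:expr-tree-T}, assuming the left/right input order drawn in \Cref{fig:circuit}:
+\begin{Example}
+Here $\abs{\circuit}$ encodes $(X+2Y)(2X+Y)$, so \onepass sets the root's \vari{partial} to $\abs{\circuit}(1,1)=3\cdot 3=9$, and sets \vari{partial}$\,=3$ at each of the two sum gates, with \lwght$\,=\frac{1}{3}$, \rwght$\,=\frac{2}{3}$ at the gate for $X+2Y$ and \lwght$\,=\frac{2}{3}$, \rwght$\,=\frac{1}{3}$ at the gate for $2X-Y$.
+\sampmon (\Cref{lem:sample} below) then returns $(X,+)$, $(XY,-)$, $(XY,+)$, and $(Y,-)$ with probabilities $\frac{2}{9}$, $\frac{1}{9}$, $\frac{4}{9}$, and $\frac{2}{9}$ respectively, matching $\expansion{\circuit}=[(X, 2), (XY, -1), (XY, 4), (Y, -2)]$.
+If $X$ and $Y$ were annotations of two tuples from the same block, both $XY$ monomials would be dropped modulo $\mathcal{B}$, so \Cref{def:param-gamma} would give $\gamma(\circuit)=\frac{1+4}{9}=\frac{5}{9}$; when $X$ and $Y$ annotate tuples from different blocks, $\gamma(\circuit)=0$.
+\end{Example}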
\begin{Lemma}\label{lem:sample} -The function $\sampmon$ completes in $O(\log{k} \cdot k \cdot \depth(\circuit)\cdot\multc{\log\left(\abs{\circuit}(1,\ldots, 1)\right)}{\log{\size(\circuit)}})$ time, +The function $\sampmon$ completes in time +$$O(\log{k} \cdot k \cdot \depth(\circuit)\cdot\multc{\log\left(\abs{\circuit}(1,\ldots, 1)\right)}{\log{\size(\circuit)}})$$ %\footnote{Note that the same sufficient condition on \circuit to guarentee $O(1)$ arithmetic computations applies here, and when this condition is met, the runtime loses the $\frac{\log{\abs{\circuit}(1,\ldots, 1)}}{\log{\size(\circuit)}}$ factor}, - where $k = \degree(\circuit)$. Upon completion, every $\left(\monom, sign(\coef)\right)\in \expansion{\abs{\circuit}}$ is returned with probability $\frac{|\coef|}{\abs{\circuit}(1,\ldots, 1)}$. + where $k = \degree(\circuit)$. The function returns every $\left(\monom, sign(\coef)\right)\in \expansion{\abs{\circuit}}$ with probability $\frac{|\coef|}{\abs{\circuit}(1,\ldots, 1)}$. \end{Lemma} With the above two lemmas, we are ready to argue the following result (proof in~\Cref{sec:proofs-approx-alg}): @@ -262,9 +268,9 @@ Instead, \Cref{alg:sample} selects a monomial from $\expansion{\circuit}$ by top For a parent $+$ gate, the input to be visited is sampled from the weighted distribution precomputed by \onepass. When a parent $\times$ node is visited, both inputs are visited. The algorithm computes two properties: the set of all variable leaf nodes visited, and the product of signs of visited coefficient leaf nodes. - +% We will assume the TreeSet data structure to maintain sets with logarithmic time insertion and linear time traversal of its elements. - +% $\sampmon$ is given in \Cref{alg:sample}, and a proof of its correctness (via \Cref{lem:sample}) is provided in \Cref{sec:proofs-approx-alg}. \begin{algorithm}[t] diff --git a/circuits-model-runtime.tex b/circuits-model-runtime.tex index 926de06..3668e4a 100644 --- a/circuits-model-runtime.tex +++ b/circuits-model-runtime.tex @@ -112,6 +112,7 @@ We now have all the pieces to argue that using our approximation algorithm, the \end{Corollary} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{proof} +\OK{Invalid reference} This follows from~\Cref{lem:circuits-model-runtime} and (the circuit counterpart-- see~\Cref{sec:results-circuits})~\Cref{cor:approx-algo-const-p} (where the latter is used with $\delta$ being substituted\footnote{Recall that~\Cref{cor:approx-algo-const-p} is stated for a single output tuple so to get the required guarantee for all (at most $n^k$) output tuples of $Q$ we get at most $\frac \delta{n^k}$ probability of failure for each output tuple and then just a union bound over all output tuples. } with $\frac \delta{n^k}$). 
\end{proof} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% diff --git a/macros.tex b/macros.tex index 56d66a4..beb202e 100644 --- a/macros.tex +++ b/macros.tex @@ -267,12 +267,12 @@ % COMMENTS %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\newcommand{\BG}[1]{\todo[inline]{\textbf{Boris says:$\,$} #1}} -\newcommand{\SF}[1]{\todo[inline]{\textbf{Su says:$\,$} #1}} -\newcommand{\OK}[1]{\todo[inline]{\textbf{Oliver says:$\,$} #1}} -\newcommand{\AH}[1]{\todo[inline, backgroundcolor=cyan]{\textbf{Aaron says:$\,$} #1}} -\newcommand{\SR}[1]{\todo[inline, backgroundcolor=white]{\textbf{Note to self:$\,$} #1}} -\newcommand{\AR}[1]{\todo[inline, color=green]{\textbf{Atri says:$\,$} #1}} +\newcommand{\BG}[1]{\todo{\textbf{Boris says:$\,$} #1}} +\newcommand{\SF}[1]{\todo{\textbf{Su says:$\,$} #1}} +\newcommand{\OK}[1]{\todo[color=gray]{\textbf{Oliver says:$\,$} #1}} +\newcommand{\AH}[1]{\todo[backgroundcolor=cyan]{\textbf{Aaron says:$\,$} #1}} +\newcommand{\SR}[1]{\todo[backgroundcolor=white]{\textbf{Note to self:$\,$} #1}} +\newcommand{\AR}[1]{\todo[color=green]{\textbf{Atri says:$\,$} #1}} %\newcommand{\AR}[1]{} %\newcommand{\AH}[1]{} diff --git a/main.tex b/main.tex index 067591d..ba6d183 100644 --- a/main.tex +++ b/main.tex @@ -19,7 +19,7 @@ \usepackage[normalem]{ulem} \usepackage{subcaption} \usepackage{booktabs} -\usepackage[disable]{todonotes} +\usepackage{todonotes} \usepackage{graphicx} \usepackage{listings} %%%%%%%%%% SQL + proveannce listing settings diff --git a/mult_distinct_p.tex b/mult_distinct_p.tex index aaf141e..7a85177 100644 --- a/mult_distinct_p.tex +++ b/mult_distinct_p.tex @@ -16,7 +16,7 @@ In particular, we will consider the problems of computing the following counts ( %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{Theorem}[\cite{k-match}] \label{thm:k-match-hard} -Given a positive integer $k$ and an undirected graph $G$ with no self-loops or parallel edges, computing $\numocc{G}{\kmatch}$ exactly is %counting the number of $k$-matchings in $G$ is +Given positive integer $k$ and undirected graph $G$ with no self-loops or parallel edges, computing $\numocc{G}{\kmatch}$ exactly is %counting the number of $k$-matchings in $G$ is \sharpwonehard (parameterization is in $k$). \end{Theorem} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% @@ -39,7 +39,7 @@ Both of our hardness results rely on a simple query polynomial encoding of the e To prove our hardness result, consider a graph $G(V, E)$, where $|E| = m$, $|V| = \numvar$. Our query polynomial has a variable $X_i$ for every $i$ in $[\numvar]$. Consider the polynomial \[\poly_{G}(\vct{X}) = \sum\limits_{(i, j) \in E} X_i \cdot X_j\] -The hard polynomial for our problem will be a suitable power $k\ge 3$ of the polynomial above, i.e. +The hard polynomial for our problem will be a suitable power $k\ge 3$ of the polynomial above: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{Definition}\label{def:qk} For any graph $G=([n],E)$ and $\kElem\ge 1$, define @@ -48,11 +48,14 @@ For any graph $G=([n],E)$ and $\kElem\ge 1$, define %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% Our hardness results only need a \ti instance; We also consider the special case when all the tuple probabilities (probabilities assigned to $X_i$ by $\probAllTup$) are the same value. Note that our hardness results do not require the general circuit representation and hold for even the expression tree representation. 
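+As a small concrete instance (assuming, per \Cref{def:qk}, that $\poly_{G}^{\kElem}$ denotes $\inparen{\poly_{G}}^{\kElem}$): for the triangle graph on vertices $\{1,2,3\}$,
+\[\poly_{G}(\vct{X}) = X_1X_2 + X_1X_3 + X_2X_3 \qquad\text{and}\qquad \poly_{G}^{2}(\vct{X}) = \inparen{X_1X_2 + X_1X_3 + X_2X_3}^2,\]
+where the latter is represented compactly by multiplying the sub-expression for $\poly_{G}$ with itself rather than expanding its nine monomials.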
%this polynomial can be encoded in an expression tree of size $\Theta(km)$.
-Using the tables in \cref{fig:ex-shipping}, it is easy to see that $\poly_{G}^\kElem(\vct{X})$ can be constructed as follows:
-\[\poly^k_G:- Loc(C_1),Route(C_1, C_1'),Loc(C_1'),\dots,Loc(C_\kElem),Route(C_\kElem,C_\kElem'),Loc(C_\kElem')\]
-where generalizaing the PDB instance in \cref{fig:ex-shipping}, relation $Loc$ has $n$ tuples corresponding to each vertex in $V=[n]$ each with probability $\prob$ and $Route(\text{City}_1, \text{City}_2)$ has tuples corresponding to the edges $E$ (each with probability of $1$).\footnote{Technically, $\poly_{G}^\kElem(\vct{X})$ should have variables corresponding to tuples in $Route$ as well, but since they always are present with probability $1$, we drop those. Our argument also works when all the tuples in $Route$ also are present with probability $\prob$ but to simplify notation we assign probability $1$ to edges.}
-Note that this imples that our hard query polynomial can be represented as an expression tree, created from a project-join query with some probability value for each $\prob_i$ -- by contrast our approximation algorithm in \Cref{sec:algo} can handle lineage polynomials represented as circuits generated by union of select-project-join (SPJU) queries with potentially distinct $\prob_i$ values. % (i.e. we do not need union or select operator to derive our hardness result).
+
+\noindent Returning to \cref{fig:ex-shipping-simp}, it is easy to see that $\poly_{G}^\kElem(\vct{X})$ generalizes our running example query:
+\[\poly^k_G:- Loc(C_1),Route(C_1, C_1'),Loc(C_1'),\dots,Loc(C_\kElem),Route(C_\kElem,C_\kElem'),Loc(C_\kElem')\]
+where, adapting the PDB instance in \cref{fig:ex-shipping-simp}, relation $Loc$ has $n$ tuples, one for each vertex in $V=[n]$, each with probability $\prob$, and $Route(\text{City}_1, \text{City}_2)$ has tuples corresponding to the edges $E$ (each with probability $1$).\footnote{Technically, $\poly_{G}^\kElem(\vct{X})$ should have variables corresponding to tuples in $Route$ as well, but since they always are present with probability $1$, we drop those. Our argument also works when all the tuples in $Route$ also are present with probability $\prob$ but to simplify notation we assign probability $1$ to edges.}
+Note that this implies that our hard query polynomial can be represented even as an expression tree, created from a project-join query with some probability value for each $\prob_i$; our hardness result transfers here as well.
+% OK: The following (commented-out) sentence feels a bit misplaced here.
+% -- by contrast our approximation algorithm in \Cref{sec:algo} can handle lineage polynomials represented as circuits generated by union of select-project-join (SPJU) queries with potentially distinct $\prob_i$ values. % (i.e. we do not need union or select operator to derive our hardness result).

%\AR{need discussion on the `tightness' of various params. First, this is for degree 6 poly-- while things are easy for say deg 2. Second this is for any fixed p. Finally, we only need project-join queries to get the hardness results. Also need to compare this with the generality of the approx upper bound results.}
@@ -69,7 +72,7 @@ Computing $\rpoly_G^\kElem(\prob_i,\dots,\prob_i)$ for arbitrary $G$ and any $(2

% We will prove the above result by reducing from the problem of computing the number of $k$-matchings in $G$.
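+As a quick sanity check of this connection for $\kElem=1$ (an illustration only; the general argument appears in the proof of \Cref{lem:qEk-multi-p} below): every monomial $X_iX_j$ of $\poly_{G}$ consists of two distinct variables with exponent one, so
+\[\rpoly_{G}^{1}(\prob,\ldots,\prob)=\sum_{(i,j)\in E}\prob^2=m\prob^2,\]
+and a single evaluation at any $\prob\in(0,1]$ already recovers the number of $1$-matchings (edges) $m$. For larger $\kElem$, $\rpoly_{G}^{\kElem}(\prob,\ldots,\prob)$ is a polynomial of degree at most $2\kElem$ in $\prob$, which is why $2\kElem+1$ distinct evaluations suffice to recover its coefficients by interpolation.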
Given the current best-known algorithm for this counting problem, our results imply that unless the state-of-the-art $k$-matching algorithms are improved, we cannot hope to solve our problem in time better than $\Omega_k\inparen{m^{k/2}}$, which is only quadratically faster than expanding $\poly_{G}^\kElem(\vct{X})$ into its \abbrSMB form and then using \Cref{cor:expct-sop}. By contrast the approximation algorithm we present in \Cref{sec:algo} has runtime $O_k\inparen{m}$ for this query (since it runs in linear-time on all lineage polynomials). -Here, we present a reduction from the problem of counting $\kElem$-matchings in a graph to our problem: +\noindent The following lemma reduces the problem of counting $\kElem$-matchings in a graph to our problem: \begin{Lemma}\label{lem:qEk-multi-p} Let $\prob_0,\ldots, \prob_{2\kElem}$ be distinct values in $(0, 1]$. Then given the values $\rpoly_{G}^\kElem(\prob_i,\ldots, \prob_i)$ for $0\leq i\leq 2\kElem$, the number of $\kElem$-matchings in $G$ can be computed in $O\inparen{\kElem^3}$ time. \end{Lemma} diff --git a/poly-form.tex b/poly-form.tex index 85dece2..4efd508 100644 --- a/poly-form.tex +++ b/poly-form.tex @@ -13,7 +13,7 @@ A monomial is a product of variable terms, each raised to a non-negative integer \end{Definition} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -The \abbrSMB for the running example is $X^2 +2XY + Y^2$. While $X^2 + XY + XY + Y^2$ is an expanded form of the expression, it is not the standard monomial basis since $XY$ appears more than once. +The \abbrSMB for the running example is $X^2 +2XY + Y^2$. $X^2 + XY + XY + Y^2$ is an expanded form of the expression, but is not in SMB since $XY$ appears twice. % \BG{Maybe inline degree?} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% @@ -23,7 +23,7 @@ The degree of polynomial $\poly(\vct{X})$ is the maximum sum of exponents, over %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% The degree of the running example polynomial is $2$. -Note that product terms can only arise as a consequence of join operations, so intuitively, the degree of a lineage polynomial is analogous to the largest number of joins in one clause of the UCQ query that created it. +Product terms in lineage arise only as a consequence of join operations, so intuitively, the degree of a lineage polynomial is analogous to the largest number of joins in any clause of the UCQ query that created it. In this paper we consider only finite degree polynomials. % % Throughout this paper, we also make the following \textit{assumption}. @@ -38,15 +38,13 @@ We call a polynomial $\query(\vct{X})$ a \emph{\bi-lineage polynomial} (resp., \ %\AH{Why is it required for the tuple to be n-ary? I think this slightly confuses me since we have n tuples.} % OK: agreed w/ AH, this can be treated as implicit there exists a $\raPlus$ query $\query$, \bi $\pxdb$ (\ti $\pxdb$, or $\semNX$-PDB $\pxdb$), and tuple $\tup$ such that $\query(\vct{X}) = \query(\pxdb)(\tup)$. % Before proceeding, note that the following is assume that polynomials are \bis (which subsume \tis as a special case). -As they are a special case of \bis, the following applies to \tis as well. -Recall that in a \bi $\pxdb$ with tuples $t_1, \ldots, t_n$, each input tuple $t_i$ is annotated with a unique variable $X_i$. -Tuples of $\pxdb$ are partitioned into $\ell$ blocks $\block_1, \ldots, \block_\ell$ where tuple $t_i$ is associated with a probability $\prob_{\tup_i} = \pd[X_i = 1]$. 
-\footnote{
-	Although it is customary to define a single independent, $[\abs{\block_i}+1]$-valued variable per block, we decompose it into $\abs{\block_i}$ correlated $\{0,1\}$-valued variables per block that can be used directly in polynomials (without an indicator function). For $t_j \in b_i$, the event $(X_j = 1)$ corresponds to the event $(X_i = j)$ in the customary annotation scheme.
+As a special case of \bis, the following applies to \tis as well.
+Recall that in a \bi $\pxdb$, tuples are partitioned into $\ell$ blocks $\block_1, \ldots, \block_\ell$ where tuple $t_{i,j} \in \block_i$ is associated with a probability $\prob_{\tup_{i,j}} = \pd[X_{i,j} = 1]$, and is annotated with a unique variable $X_{i,j}$.\footnote{
+	Although only a single independent, $[\abs{\block_i}+1]$-valued variable is customarily used per block, we decompose it into $\abs{\block_i}$ correlated $\{0,1\}$-valued variables per block that can be used directly in polynomials (without an indicator function). For $t_{i,j} \in \block_i$, the event $(X_{i,j} = 1)$ corresponds to the event $(X_i = j)$ in the customary annotation scheme.
}
-Because blocks are independent and tuples from the same block are disjoint, $\prob$ and the blocks induce the probability distribution $\pd$ of $\pxdb$.
+Because blocks are independent and tuples from the same block are disjoint, the probabilities $\prob_{\tup_{i,j}}$ and the blocks induce the probability distribution $\pd$ of $\pxdb$.
We will write a \bi-lineage polynomial $\poly(\vct{X})$ for a \bi with $\ell$ blocks as
-$\poly(\vct{X})$ = $\poly(X_{\block_1, 1},\ldots, X_{\block_1, \abs{\block_1}},$ $\ldots, X_{\block_\ell, \abs{\block_\ell}})$, where $\abs{\block_i}$ denotes the size of $\block_i$, and $X_{i, j}$ denotes the annotation of tuple $j$ residing in block $i$ for $j$ in $[\abs{\block_i}]$.\footnote{Later on in the paper, especially in~\Cref{sec:algo}, we will overload notation and rename the variables as $X_1,\dots,X_n$, where $n=\sum_{i=1}^\ell \abs{b_i}$.}
+$\poly(\vct{X})$ = $\poly(X_{1, 1},\ldots, X_{1, \abs{\block_1}},$ $\ldots, X_{\ell, \abs{\block_\ell}})$, where $\abs{\block_i}$ denotes the size of $\block_i$.\footnote{Later on in the paper, especially in~\Cref{sec:algo}, we will overload notation and rename the variables as $X_1,\dots,X_n$, where $n=\sum_{i=1}^\ell \abs{b_i}$.}
%\SF{Where is $\block_{i, j}$ used? Is it $X_{\block_{1, 1}}$ or $X_{\block_1, 1}$ ?}
% and the probability distribution of $\pxdb$ is uniquely determined based on a probability vector $\vct{p}$ that associates each tuple a probability
% variables are independent of each other (or disjoint if they are from the same block) and each variable $X$ is associated with a probability $\vct{p}(X) = \pd[X = 1]$. Thus, we are dealing with polynomials $\poly(\vct{X})$ that are annotations of a tuple in the result of a query $\query$ over a BIDB $\pxdb$ where $\vct{X}$ is the set of variables that occur in annotations of tuples of $\pxdb$.
@@ -65,7 +63,7 @@ Let $S$ be a {\em set} of polynomials over $\vct{X}$. Then $\poly(\vct{X})\mod{S
For example, for a set of polynomials $S=\inset{X^2-X, Y^2-Y}$, taking the polynomial $2X^2 + 3XY - 2Y^2\mod S$ yields $2X+3XY-2Y$.
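+For intuition behind this reduction: over $\{0,1\}$-valued variables we have $X^2=X$ and $Y^2=Y$, hence
+\[2X^2 + 3XY - 2Y^2 = 2X + 3XY - 2Y \qquad \text{for every } (X,Y)\in\{0,1\}^2,\]
+so taking the polynomial modulo $S$ simply makes this identification explicit.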
% \begin{Definition}\label{def:mod-set-polys} -Given the set of BIDB variables $\inset{X_{b,i}}$, define +Given the set of BIDB variables $\inset{X_{i,j}}$, define \setlength\parindent{0pt} \vspace*{-3mm} @@ -74,7 +72,7 @@ Given the set of BIDB variables $\inset{X_{b,i}}$, define \begin{minipage}[b]{0.45\linewidth} \centering \begin{equation*} - \mathcal{B}=\comprehension{X_{b,i}\cdot X_{b,j}}{\text{ for every block } b \text{ and } i\ne j \in [~\abs{\block}~]}, + \mathcal{B}=\comprehension{X_{i,j}\cdot X_{i,j'}}{i \in [\ell], j\neq j' \in [~\abs{\block_i}~]} \end{equation*} \end{minipage}% \hspace{13mm} @@ -82,7 +80,7 @@ Given the set of BIDB variables $\inset{X_{b,i}}$, define \begin{minipage}[b]{0.45\linewidth} \centering \begin{equation*} - \mathcal{T}=\comprehension{X_{b,i}^2-X_{b,i}}{\text{ for every block } b \text{ and } i \in [~\abs{\block}~]} + \mathcal{T}=\comprehension{X_{i,j}^2-X_{i,j}}{i \in [\ell], j \in [~\abs{\block_i}~]} \end{equation*} \end{minipage} \\ @@ -134,14 +132,15 @@ Consider $\poly(X, Y) = (X + Y)(X + Y)$ where $X$ and $Y$ are from different blo % %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{Definition}[Valid Worlds] -For probability distribution $\probDist$ and its corresponding probability mass function $\probOf$, the set of valid worlds $\eta$ consists of all the worlds with probability value greater than $0$; i.e., for variable vector $\vct{W}$ +For probability distribution $\probDist$, % and its corresponding probability mass function $\probOf$, +the set of valid worlds $\eta$ consists of all the worlds with probability value greater than $0$; i.e., for variable vector $\vct{W}$ \[ -\eta = \{\vct{w}\suchthat \probOf[\vct{W} = \vct{w}] > 0\} +\eta = \comprehension{\vct{w}}{\probOf[\vct{W} = \vct{w}] > 0} \] \end{Definition} %We state additional equivalences between $\poly(\vct{X})$ and $\rpoly(\vct{X})$ in~\Cref{app:subsec-pre-poly-rpoly} and~\Cref{app:subsec-prop-q-qtilde}. -Next, we show why the reduced form is useful for our purposes: +\noindent Next, we show why the reduced form is useful for our purposes: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% diff --git a/prob-def.tex b/prob-def.tex index a8179b1..abc2aab 100644 --- a/prob-def.tex +++ b/prob-def.tex @@ -1,4 +1,5 @@ %root: main.tex +%!TEX root=./main.tex \subsection{Problem Definition}\label{sec:expression-trees} @@ -11,16 +12,19 @@ We represent query polynomials via {\em arithmetic circuits}~\cite{arith-complex \begin{Definition}[Circuit]\label{def:circuit} A circuit $\circuit$ is a Directed Acyclic Graph (DAG) whose source nodes (in degree of $0$) consist of elements in either $\reals$ or $\vct{X}$. The internal nodes and sink node of $\circuit$ have binary input and are either sum ($\circplus$) or product ($\circmult$) gates. -$\circuit$ additionally has the following members: \type, \val, \vari{partial}, \vari{input}, \degval and \vari{Lweight}, \vari{Rweight}, where \type is the type of value stored in the node $\circuit$ (i.e. one of $\{\circplus, \circmult, \var, \tnum\}$, \val is the value stored (a constant or variable), and \vari{input} is the list of \circuit 's inputs where $\circuit_\linput$ is the left input and $\circuit_\rinput$ the right input. The member \degval holds the degree of \circuit. When the underlying DAG is a tree (with edges pointing towards the root), we will refer to the structure as an expression tree \etree. Note that in such a case, the root of \etree is analogous to the sink of the \circuit. 
+$\circuit$ additionally has the following members: \type, \vari{val}, \vari{partial}, \vari{input}, \degval and \vari{Lweight}, \vari{Rweight}, where \type is the type of value stored in the node $\circuit$ (i.e. one of $\{\circplus, \circmult, \var, \tnum\}$), \vari{val} is the value stored (a constant or variable), and \vari{input} is the list of \circuit 's inputs where $\circuit_\linput$ is the left input and $\circuit_\rinput$ the right input. The member \degval holds the degree of \circuit. When the underlying DAG is a tree (with edges pointing towards the root), we will refer to the structure as an expression tree \etree. Note that in such a case, the root of \etree is analogous to the sink of \circuit.
\end{Definition}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-As stated in ~\Cref{def:circuit}, every internal node has at most two in-edges, is labeled as an addition or a multiplication node, and has no limit on its outdegree.
+As stated in \Cref{def:circuit}, every internal node has at most two in-edges, is labeled as an addition or a multiplication node, and has no limit on its outdegree.
Note that if we limit the outdegree to one, then we get expression trees.
+We ignore the fields \vari{partial}, \vari{Lweight}, and \vari{Rweight} until \Cref{sec:algo}.
+
+
\begin{Example}
-The circuit \circuit in ~\Cref{fig:circuit-express-tree} encodes the polynomial $XY + WZ$. Note that such an encoding lends itself naturally to having all gates with an outdegree of $1$. Note further that \circuit is indeed a tree with edges pointing towards the root.
+The circuit \circuit in \Cref{fig:circuit-express-tree} encodes the polynomial $XY + WZ$. Note that such an encoding lends itself naturally to having all gates with an outdegree of $1$. Note further that \circuit is indeed a tree with edges pointing towards the root.
\end{Example}

\begin{figure}[t]
@@ -86,8 +90,6 @@ The circuit \circuit in ~\Cref{fig:circuit-express-tree} encodes the polynomial
\caption{ }
\end{figure}

-We ignore the remaining fields (\vari{partial}, \vari{Lweight}, and \vari{Rweight}) until \Cref{sec:algo}.
-
The semantics of circuits follows the obvious interpretation. We next define its relationship with polynomials formally:

\begin{Definition}[$\polyf(\cdot)$]\label{def:poly-func}
@@ -101,7 +103,7 @@ Denote $\polyf(\circuit)$ to be the function from circuit $\circuit$ to its corr
\end{equation*}
\end{Definition}

-Note that $\circuit$ need not encode an expression in standard monomial basis, while as stated previously a polynomial is considered to be in SMB, and the output of \polyf($\cdot$) is therefore in SMB. For instance, $\circuit$ could represent a compressed form of the running example, such as $(X + 2Y)(2X - Y)$
+Note that $\circuit$ need not encode an expression in standard monomial basis, while as stated previously, a polynomial is considered to be in SMB, and the output of \polyf($\cdot$) is therefore in SMB. For instance, $\circuit$ could represent a compressed form of the running example, such as $(X + 2Y)(2X - Y)$
, as shown in \Cref{fig:circuit}.

\begin{Definition}[Circuit Set]\label{def:circuit-set}
@@ -112,7 +114,7 @@ $\circuitset{\smb}$ is the set of all possible circuits $\circuit$ such that $\p
The circuit of \Cref{fig:circuit} is an element of $\circuitset{\smb}$. One can think of $\circuitset{\smb}$ as the infinite set of circuits each of which models an encoding (factorization) equal to $\polyf(\circuit)$.
%\supset \{2X^2 + 3XY - 2Y^2, (X + 2Y)(2X - Y), X(2X - Y) + 2Y(2X - Y), 2X(X + 2Y) - Y(X + 2Y)\}$.
-Note that ~\Cref{def:circuit-set} implies that $\circuit \in \circuitset{\polyf(\circuit)}$. +Note that \Cref{def:circuit-set} implies that $\circuit \in \circuitset{\polyf(\circuit)}$. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \medskip @@ -121,9 +123,11 @@ Note that ~\Cref{def:circuit-set} implies that $\circuit \in \circuitset{\polyf( %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{Definition}[The Expected Result Multiplicity Problem]\label{def:the-expected-multipl} Let $\vct{X} = (X_1, \ldots, X_n)$, and $\pdb$ be an $\semNX$-PDB over $\vct{X}$ with probability distribution $\pd$ over assignments $\vct{X} \to [0,1]$, $\query$ an n-ary query, and $t$ an n-ary tuple. - The \expectProblem is defined as follows: - -\hspace*{5mm}\textbf{Input}: A circuit $\circuit \in \circuitset{\smb}$ for $\poly(\vct{X}) = \query(\pxdb)(t)$\hspace*{5mm}\textbf{Output}: $\expct_{\vct{W} \sim \pd}[\poly(\vct{W})]$ + The \expectProblem is defined as follows:\\[-7mm] +\begin{center} +\textbf{Input}: A circuit $\circuit \in \circuitset{\smb}$ for $\poly(\vct{X}) = \query(\pxdb)(t)$ +\hspace*{5mm}\textbf{Output}: $\expct_{\vct{W} \sim \pd}[\poly(\vct{W})]$ +\end{center} \end{Definition} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% diff --git a/ra-to-poly.tex b/ra-to-poly.tex index 8751a65..8f789c8 100644 --- a/ra-to-poly.tex +++ b/ra-to-poly.tex @@ -82,7 +82,11 @@ We use $\evald{\cdot}{\db}$ to denote the result of evaluating query $\query$ ov \evald{\rel}{\db}(\tup) & \text{if }\theta(\tup) \\ \zeroK & \text{otherwise}. \end{cases} & - \evald{(\rel_1 \join \rel_2)}{\db}(\tup) &= \evald{\rel_1}{\db}(\project_{\sch(\rel_1)}(\tup)) \multK \evald{\rel_2}{\db}(\project_{\sch(\rel_2)}(\tup)) \\ + \evald{(\rel_1 \join \rel_2)}{\db}(\tup) &= + \begin{aligned} + \evald{\rel_1}{\db}(\project_{\sch(\rel_1)}(\tup)) \multK \\ + \evald{\rel_2}{\db}(\project_{\sch(\rel_2)}(\tup)) + \end{aligned}\\ & & \evald{R}{\db}(\tup) &= \rel(\tup) \end{align*} @@ -104,7 +108,7 @@ $\semNX$-PDBs and a function $\rmod$ (which transforms an $\semNX$-PDB to an equ \end{Proposition} \noindent A formal proof of \Cref{prop:expection-of-polynom} is given in \Cref{subsec:expectation-of-polynom-proof}. This proposition shows that computing expected tuple multiplicities is equivalent to computing the expectation of a polynomial (for that tuple) from a probability distribution over all possible assignments of variables in the polynomial to $\{0,1\}$. -We focus on this problem from now on, assume an implicit result tuple, and so drop the subscript from $\polyForTuple$ (i.e., $\poly$ is used as a polynomial from now on). +We focus on this problem from now on, assume an implicit result tuple, and so drop the subscript from $\polyForTuple$ (i.e., $\poly$ will denote a polynomial). \subsubsection{\tis and \bis} \label{subsec:tidbs-and-bidbs}