Read through: Space, grammar, notation fixes

master
Oliver Kennedy 2021-04-07 01:02:46 -04:00
parent 01581a4e4c
commit 345d95b651
Signed by: okennedy
GPG Key ID: 3E5F9B3ABD3FDB60
8 changed files with 77 additions and 60 deletions

View File

@ -23,7 +23,7 @@ We now introduce useful definitions and notation related to polynomials. We use
\revision{
\begin{Definition}[Pure Expansion]
The pure expansion of a polynomial $\poly$ is formed by computing all product of sums occurring in $\poly$, without combining like monomials. The pure expansion of $\poly$ generalizes ~\Cref{def:smb} by allowing monomials $m_i = m_j$ for $i \neq j$.
The pure expansion of a polynomial $\poly$ is formed by computing all products of sums occurring in $\poly$, without combining like monomials. The pure expansion of $\poly$ generalizes \Cref{def:smb} by allowing monomials $m_i = m_j$ for $i \neq j$.
\end{Definition}
}
@ -45,18 +45,19 @@ $
\end{Definition}
\revision{
Note that similar in spirit to ~\Cref{def:reduced-bi-poly}, $\expansion{\circuit}$ reduces all variable exponents $e > 1$ to $e = 1$, though ~\Cref{def:reduced-bi-poly} is more general.
Note that, similar in spirit to \Cref{def:reduced-bi-poly}, $\expansion{\circuit}$ reduces all variable exponents $e > 1$ to $e = 1$, though \Cref{def:reduced-bi-poly} is more general.
\OK{More general, how?}
}
In the following, we abuse notation and write $\monom$ to denote the monomial obtained as the product of the variables in the set.
\begin{Example}\label{example:expr-tree-T}
Consider the factorized representation $(X+ 2Y)(2X - Y)$ of the polynomial in~\Cref{eq:poly-eg}.
Its circuit $\etree$ is illustrated in Figure ~\ref{fig:expr-tree-T}.
Its circuit $\etree$ is illustrated in \cref{fig:circuit}.
The pure expansion of the product is $2X^2 - XY + 4XY - 2Y^2$ and the $\expansion{\circuit}$ is $[(X, 2), (XY, -1), (XY, 4), (Y, -2)]$.
\end{Example}
$\expansion{\circuit}$ encodes the \emph{reduced} form of $\polyf\inparen{\circuit}$, decoupling each monomial into a set of variables $\monom$ and a real coefficient $\coef$.
Note, however, that unlike $\rpoly$, $\expansion{\circuit}$ does not need to be in SOP form.
However, unlike $\rpoly$, $\expansion{\circuit}$ does not need to be in SOP form.
\begin{Definition}[Positive \circuit]\label{def:positive-circuit}
For any circuit $\circuit$, the corresponding
@ -64,7 +65,7 @@ For any circuit $\circuit$, the corresponding
\end{Definition}
Using the same factorization from ~\Cref{example:expr-tree-T}, $\polyf(\abs{\circuit}) = (X + 2Y)(2X + Y) = 2X^2 +XY +4XY + 2Y^2 = 2X^2 + 5XY + 2Y^2$. Note that this \textit{is not} the same as the polynomial from~\Cref{eq:poly-eg}.
Using the same factorization from \Cref{example:expr-tree-T}, $\polyf(\abs{\circuit}) = (X + 2Y)(2X + Y) = 2X^2 +XY +4XY + 2Y^2 = 2X^2 + 5XY + 2Y^2$. Note that this \textit{is not} the same as the polynomial from~\Cref{eq:poly-eg}.
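For concreteness, a minimal Python sketch (illustrative only, not part of the paper's implementation) that computes the pure expansion of the running example $(X + 2Y)(2X - Y)$ and the value $\abs{\circuit}(1,\ldots, 1)$ obtained by taking coefficients in absolute value:

    from itertools import product

    # Factors as lists of (frozenset-of-variables, coefficient) pairs;
    # the running example (X + 2Y)(2X - Y) is a product of two sums.
    factor1 = [(frozenset({"X"}), 1), (frozenset({"Y"}), 2)]
    factor2 = [(frozenset({"X"}), 2), (frozenset({"Y"}), -1)]

    # Pure expansion: multiply out the product of sums WITHOUT combining
    # like monomials (exponents collapse to 1 via set union).
    expansion = [(m1 | m2, c1 * c2)
                 for (m1, c1), (m2, c2) in product(factor1, factor2)]
    print(expansion)  # [(X, 2), (XY, -1), (XY, 4), (Y, -2)], up to ordering

    # |C|(1,...,1): all coefficients replaced by absolute values, all variables set to 1.
    print(sum(abs(c) for _, c in expansion))  # 2 + 1 + 4 + 2 = 9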
@ -112,7 +113,9 @@ In the subsequent subsections we will prove the following theorem.
\begin{Theorem}\label{lem:approx-alg}
Let \circuit be a circuit for a UCQ over \bi and define $\poly(\vct{X})=\polyf(\circuit)$ and let $k=\degree(\circuit)$.
Then an estimate $\mathcal{E}$ of $\rpoly(\prob_1,\ldots, \prob_\numvar)$ can be computed in time
{\small
\[O\left(\left(\size(\circuit) + \frac{\log{\frac{1}{\conf}}\cdot \abs{\circuit}^2(1,\ldots, 1)\cdot k\cdot \log{k} \cdot \depth(\circuit)}{\inparen{\error'}^2\cdot\rpoly^2(\prob_1,\ldots, \prob_\numvar)}\right)\cdot\multc{\log\left(\abs{\circuit}(1,\ldots, 1)\right)}{\log\left(\size(\circuit)\right)}\right)\]
}
such that
\begin{equation}
\label{eq:approx-algo-bound}
@ -120,16 +123,16 @@ such that
\end{equation}
\end{Theorem}
\noindent The proof of~\Cref{lem:approx-alg} (which relies on ~\Cref{lem:one-pass} and ~\Cref{lem:sample}) can be found in~\Cref{sec:proof-lem-approx-alg}. The proofs for the referenced lemmas are also found in ~\Cref{sec:proof-one-pass} and ~\Cref{sec:proof-sample-monom}.
\noindent The proof of~\Cref{lem:approx-alg} (which relies on \Cref{lem:one-pass} and \Cref{lem:sample}) can be found in~\Cref{sec:proof-lem-approx-alg}. The proofs for the referenced lemmas are also found in \Cref{sec:proof-one-pass} and \Cref{sec:proof-sample-monom}.
To get linear runtime results from~\Cref{lem:approx-alg}, we will need to define another parameter modeling the (weighted) number of monomials in $\expansion{\circuit}$ to be `canceled' when it is modded with $\mathcal{B}$ (\Cref{def:mod-set-polys}):
To get linear runtime results from~\Cref{lem:approx-alg}, we will need to define another parameter modeling the (weighted) number of monomials in $\expansion{\circuit}$ to be `canceled' when it is modded with $\mathcal{B}$ (\Cref{def:mod-set-polys}).
\begin{Definition}[Parameter $\gamma$]\label{def:param-gamma}
Given an expression tree $\circuit$, define
\[\gamma(\circuit)=\frac{\sum_{(\monom, \coef)\in \expansion{\circuit}} \abs{\coef}\cdot \indicator{\monom\mod{\mathcal{B}}\equiv 0}}{\abs{\circuit}(1,\ldots, 1)}\]
\end{Definition}
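As a quick illustration (a sketch with a made-up BIDB block assignment and expansion, not taken from the paper), $\gamma(\circuit)$ can be computed directly from $\expansion{\circuit}$ by checking which monomials contain two distinct variables from the same block:

    # A monomial is "canceled" (equivalent to 0 mod B) iff it contains two
    # distinct variables from the same block.
    block_of = {"X11": 1, "X12": 1, "X21": 2}   # hypothetical BIDB: two blocks

    def canceled(monomial):
        blocks = [block_of[v] for v in monomial]
        return len(blocks) != len(set(blocks))

    # A hypothetical E(C): list of (set-of-variables, coefficient) pairs.
    expansion = [({"X11", "X21"}, 3), ({"X11", "X12"}, 2), ({"X12"}, -1)]

    abs_at_one = sum(abs(c) for _, c in expansion)   # |C|(1,...,1) = 6
    gamma = sum(abs(c) for m, c in expansion if canceled(m)) / abs_at_one
    print(gamma)  # 2 / 6 = 0.333...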
\noindent We next present couple of corollaries of~\Cref{lem:approx-alg}.
\noindent We next present a few corollaries of~\Cref{lem:approx-alg}.
\begin{Corollary}
\label{cor:approx-algo-const-p}
Let $\poly(\vct{X})$ be as in~\Cref{lem:approx-alg} and let $\gamma=\gamma(\circuit)$. Further let it be the case that $\prob_i\ge \prob_0$ for all $i\in[\numvar]$. Then an estimate $\mathcal{E}$ of $\rpoly(\prob_1,\ldots, \prob_\numvar)$ satisfying~\Cref{eq:approx-algo-bound} can be computed in time
@ -147,7 +150,7 @@ Finally, we address the $\multc{\log\left(\abs{\circuit}(1,\ldots, 1)\right)}{\l
\label{lem:val-ub}
For any circuit $\circuit$ with $\degree(\circuit)=k$, we have
\[\abs{\circuit}(1,\ldots, 1)\le 2^{2^k\cdot \size(\circuit)}.\]
Further, under the following conditions:
Further, under either of the following conditions:
\begin{enumerate}
\item $\circuit$ is a tree,
\item $\circuit$ encodes the run of the algorithm in~\cite{DBLP:conf/pods/KhamisNR16} on an FAQ query,
@ -156,7 +159,7 @@ we have
\[\abs{\circuit}(1,\ldots, 1)\le \size(\circuit)^{O(k)}.\]
\end{Lemma}
Note that the above implies that with the assumption $\prob_0>0$ and $\gamma<1$ are absolute constants from Corollary~\Cref{cor:approx-algo-const-p}, then the runtime there simplies to $O_k\left(\frac 1{\inparen{\error'}^2}\cdot\size(\circuit)^2\cdot \log{\frac{1}{\conf}}\right)$ for general circuits $\circuit$ and to $O_k\left(\frac 1{\inparen{\error'}^2}\cdot\size(\circuit)\cdot \log{\frac{1}{\conf}}\right)$ for the case when $\circuit$ satisfies the special conditions in~\Cref{lem:val-ub}. In~\Cref{app:proof-lem-val-ub} we argue that these conditions are very general and encompass many interesting scenarios.
Note that the above implies that if $\prob_0>0$ and $\gamma<1$ (as in \Cref{cor:approx-algo-const-p}) are absolute constants, then the runtime there simplifies to $O_k\left(\frac 1{\inparen{\error'}^2}\cdot\size(\circuit)^2\cdot \log{\frac{1}{\conf}}\right)$ for general circuits $\circuit$ and to $O_k\left(\frac 1{\inparen{\error'}^2}\cdot\size(\circuit)\cdot \log{\frac{1}{\conf}}\right)$ when $\circuit$ satisfies the special conditions in~\Cref{lem:val-ub}. In~\Cref{app:proof-lem-val-ub} we argue that these conditions are very general and encompass many interesting scenarios.
\subsection{Approximating $\rpoly$}
@ -171,7 +174,7 @@ The number of samples is computed by (see \Cref{app:subsec-th-mon-samp}):
2\exp{\left(-\frac{\samplesize\error^2}{2}\right)}\leq \conf \implies\samplesize \geq \frac{2\log{\frac{2}{\conf}}}{\error^2}.
\end{equation*}
To summarize, \approxq modifies \circuit with a call to \onepass. It then samples from \circuit $\numsamp$ times and uses that information to approximate $\rpoly$.
To summarize, \approxq modifies \circuit with a call to \onepass. It then draws $\numsamp$ samples from \circuit and uses them to approximate $\rpoly$.
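As a concrete sketch of the sample-size computation above (the parameter values here are arbitrary):

    import math

    def num_samples(conf, error):
        # N >= 2*log(2/conf) / error^2 gives failure probability at most conf
        return math.ceil(2 * math.log(2 / conf) / error ** 2)

    print(num_samples(conf=0.05, error=0.01))  # about 7.4e4 samples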
\begin{algorithm}[t]
\caption{$\approxq(\circuit, \vct{p}, \conf, \error)$}
@ -185,11 +188,11 @@ To summarize, \approxq modifies \circuit with a call to \onepass. It then sampl
\State $\accum \gets 0$\label{alg:mon-sam-global1}
\State $\numsamp \gets \ceil{\frac{2 \log{\frac{2}{\conf}}}{\error^2}}$\label{alg:mon-sam-global2}
\State $(\circuit_\vari{mod}, \vari{size}) \gets $ \onepass($\circuit$)\label{alg:mon-sam-onepass}\Comment{$\onepass$ is ~\Cref{alg:one-pass-iter}}
\State $(\circuit_\vari{mod}, \vari{size}) \gets $ \onepass($\circuit$)\label{alg:mon-sam-onepass}\Comment{$\onepass$ is \Cref{alg:one-pass-iter}}
\For{$\vari{i} \in 1 \text{ to }\numsamp$}\label{alg:sampling-loop}\Comment{Perform the required number of samples}
\State $(\vari{M}, \vari{sgn}_\vari{i}) \gets $ \sampmon($\circuit_\vari{mod}$)\label{alg:mon-sam-sample}
\State\Comment{\sampmon \; is ~\Cref{alg:sample}}
\State\Comment{\sampmon \; is \Cref{alg:sample}}
\If{$\vari{M}$ has at most one variable from each block}\label{alg:check-duplicate-block}
\State $\vari{Y}_\vari{i} \gets \prod_{X_j\in\var\inparen{\vari{M}}}p_j$\label{alg:mon-sam-assign1}
\State $\vari{Y}_\vari{i} \gets \vari{Y}_\vari{i} \times\; \vari{sgn}_\vari{i}$\label{alg:mon-sam-product}
@ -205,19 +208,22 @@ To summarize, \approxq modifies \circuit with a call to \onepass. It then sampl
\subsubsection{Correctness}
In order to prove~\Cref{lem:approx-alg}, we will need to argue the correctness of~\Cref{alg:mon-sam}. Before we formally do that,
we first state the lemmas that summarize the relevant properties of $\onepass$ and $\sampmon$, the auxiliary algorithms on which ~\Cref{alg:mon-sam} relies.
we first state the lemmas that summarize the relevant properties of $\onepass$ and $\sampmon$, the auxiliary algorithms on which \Cref{alg:mon-sam} relies.
\begin{Lemma}\label{lem:one-pass}
The $\onepass$ function completes in $O\left(size(\circuit) \cdot \multc{\log\left(\abs{\circuit(1\ldots, 1)}\right)}{\log{\size(\circuit}}\right)$ time. %, where $N = \size(\circuit)$.\footnote{In the appendix we give a sufficient condition when $\abs{\circuit}(1,\ldots, 1)$ is indeed $O(1)$ in arithmetic computations. Most notably, WCOJ and FAQ results are not affected by the general runtime of arithmetic computations, a point which we also address in the appendix.}
The $\onepass$ function completes in time:
$$O\left(\size(\circuit) \cdot \multc{\log\left(\abs{\circuit}(1,\ldots, 1)\right)}{\log\left(\size(\circuit)\right)}\right)$$
%, where $N = \size(\circuit)$.\footnote{In the appendix we give a sufficient condition when $\abs{\circuit}(1,\ldots, 1)$ is indeed $O(1)$ in arithmetic computations. Most notably, WCOJ and FAQ results are not affected by the general runtime of arithmetic computations, a point which we also address in the appendix.}
$\onepass$ guarantees two post-conditions: First, for each subcircuit \subcircuit of $\circuit$, we have that $\subcircuit.\vari{partial}$ is set to $\abs{\subcircuit}(1,\ldots, 1)$. Second, when $\subcircuit.\type = \circplus$, \subcircuit.\lwght $= \frac{\abs{\subcircuit_\linput}(1,\ldots, 1)}{\abs{\subcircuit}(1,\ldots, 1)}$ and likewise for \subcircuit.\rwght.
\end{Lemma}
To prove correctness of~\Cref{alg:mon-sam}, we only use the following fact that follows from the above lemma: for the modified circuit $\circuit_{\vari{mod}}$, we have $\circuit_{\vari{mod}}.\vari{partial}=\abs{\circuit}(1,\dots,1)$.
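To make these post-conditions concrete, here is a rough Python sketch of the bottom-up pass for tree-shaped circuits (illustrative only; it is not the paper's \onepass pseudocode in \Cref{alg:one-pass-iter}):

    class Gate:
        def __init__(self, kind, val=None, left=None, right=None):
            self.kind, self.val = kind, val        # kind in {"+", "*", "var", "num"}
            self.left, self.right = left, right
            self.partial = None                    # will hold |S|(1,...,1)
            self.lweight = self.rweight = None     # set only at "+" gates

    def one_pass(c):
        """Bottom-up: partial = |S|(1,...,1); at '+' gates, store child weights."""
        if c.kind == "var":
            c.partial = 1
        elif c.kind == "num":
            c.partial = abs(c.val)                 # coefficients taken in absolute value
        else:
            one_pass(c.left)
            one_pass(c.right)
            if c.kind == "+":
                c.partial = c.left.partial + c.right.partial
                c.lweight = c.left.partial / c.partial
                c.rweight = c.right.partial / c.partial
            else:                                  # "*"
                c.partial = c.left.partial * c.right.partial
        return c.partial

    # Running example (X + 2Y)(2X - Y):
    X1, Y1, X2, Y2 = (Gate("var", v) for v in ("X", "Y", "X", "Y"))
    left = Gate("+", left=X1, right=Gate("*", left=Gate("num", 2), right=Y1))
    right = Gate("+", left=Gate("*", left=Gate("num", 2), right=X2),
                 right=Gate("*", left=Gate("num", -1), right=Y2))
    root = Gate("*", left=left, right=right)
    print(one_pass(root))  # |C|(1,...,1) = (1 + 2) * (2 + 1) = 9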
\begin{Lemma}\label{lem:sample}
The function $\sampmon$ completes in $O(\log{k} \cdot k \cdot \depth(\circuit)\cdot\multc{\log\left(\abs{\circuit}(1,\ldots, 1)\right)}{\log{\size(\circuit)}})$ time,
The function $\sampmon$ completes in time
$$O(\log{k} \cdot k \cdot \depth(\circuit)\cdot\multc{\log\left(\abs{\circuit}(1,\ldots, 1)\right)}{\log{\size(\circuit)}})$$
%\footnote{Note that the same sufficient condition on \circuit to guarentee $O(1)$ arithmetic computations applies here, and when this condition is met, the runtime loses the $\frac{\log{\abs{\circuit}(1,\ldots, 1)}}{\log{\size(\circuit)}}$ factor},
where $k = \degree(\circuit)$. Upon completion, every $\left(\monom, sign(\coef)\right)\in \expansion{\abs{\circuit}}$ is returned with probability $\frac{|\coef|}{\abs{\circuit}(1,\ldots, 1)}$.
where $k = \degree(\circuit)$. The function returns a single pair $\left(\monom, sign(\coef)\right)$; each element of $\expansion{\abs{\circuit}}$ is returned with probability $\frac{|\coef|}{\abs{\circuit}(1,\ldots, 1)}$.
\end{Lemma}
With the above two lemmas, we are ready to argue the following result (proof in~\Cref{sec:proofs-approx-alg}):
@ -262,9 +268,9 @@ Instead, \Cref{alg:sample} selects a monomial from $\expansion{\circuit}$ by top
For a parent $+$ gate, the input to be visited is sampled from the weighted distribution precomputed by \onepass.
When a parent $\times$ node is visited, both inputs are visited.
The algorithm computes two properties: the set of all variable leaf nodes visited, and the product of signs of visited coefficient leaf nodes.
%
We assume a TreeSet data structure that maintains sets with logarithmic-time insertion and linear-time traversal of its elements.
%
$\sampmon$ is given in \Cref{alg:sample}, and a proof of its correctness (via \Cref{lem:sample}) is provided in \Cref{sec:proofs-approx-alg}.
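A rough Python sketch of this top-down sampling, building on the Gate class and one_pass sketch above (illustrative only; it is not the paper's \sampmon pseudocode in \Cref{alg:sample}):

    import random

    def sample_monomial(c, variables=None, sign=1):
        """'+' picks one input by its precomputed weight, '*' visits both inputs.
        Returns (set of variable leaves visited, product of coefficient signs)."""
        if variables is None:
            variables = set()
        if c.kind == "var":
            variables.add(c.val)
        elif c.kind == "num":
            sign *= 1 if c.val >= 0 else -1
        elif c.kind == "+":
            child = c.left if random.random() < c.lweight else c.right
            variables, sign = sample_monomial(child, variables, sign)
        else:  # "*"
            variables, sign = sample_monomial(c.left, variables, sign)
            variables, sign = sample_monomial(c.right, variables, sign)
        return variables, sign

    # Example (uses root and one_pass from the earlier sketch):
    # one_pass(root); print(sample_monomial(root))  # e.g. ({'X', 'Y'}, -1) or ({'X'}, 1)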
\begin{algorithm}[t]

View File

@ -112,6 +112,7 @@ We now have all the pieces to argue that using our approximation algorithm, the
\end{Corollary}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{proof}
\OK{Invalid reference}
This follows from~\Cref{lem:circuits-model-runtime} and the circuit counterpart (see~\Cref{sec:results-circuits}) of~\Cref{cor:approx-algo-const-p} (where the latter is used with $\delta$ being substituted\footnote{Recall that~\Cref{cor:approx-algo-const-p} is stated for a single output tuple, so to get the required guarantee for all (at most $n^k$) output tuples of $Q$ we allow at most $\frac \delta{n^k}$ probability of failure per output tuple and then apply a union bound over all output tuples.} with $\frac \delta{n^k}$).
\end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

View File

@ -267,12 +267,12 @@
% COMMENTS
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newcommand{\BG}[1]{\todo[inline]{\textbf{Boris says:$\,$} #1}}
\newcommand{\SF}[1]{\todo[inline]{\textbf{Su says:$\,$} #1}}
\newcommand{\OK}[1]{\todo[inline]{\textbf{Oliver says:$\,$} #1}}
\newcommand{\AH}[1]{\todo[inline, backgroundcolor=cyan]{\textbf{Aaron says:$\,$} #1}}
\newcommand{\SR}[1]{\todo[inline, backgroundcolor=white]{\textbf{Note to self:$\,$} #1}}
\newcommand{\AR}[1]{\todo[inline, color=green]{\textbf{Atri says:$\,$} #1}}
\newcommand{\BG}[1]{\todo{\textbf{Boris says:$\,$} #1}}
\newcommand{\SF}[1]{\todo{\textbf{Su says:$\,$} #1}}
\newcommand{\OK}[1]{\todo[color=gray]{\textbf{Oliver says:$\,$} #1}}
\newcommand{\AH}[1]{\todo[backgroundcolor=cyan]{\textbf{Aaron says:$\,$} #1}}
\newcommand{\SR}[1]{\todo[backgroundcolor=white]{\textbf{Note to self:$\,$} #1}}
\newcommand{\AR}[1]{\todo[color=green]{\textbf{Atri says:$\,$} #1}}
%\newcommand{\AR}[1]{}
%\newcommand{\AH}[1]{}

View File

@ -19,7 +19,7 @@
\usepackage[normalem]{ulem}
\usepackage{subcaption}
\usepackage{booktabs}
\usepackage[disable]{todonotes}
\usepackage{todonotes}
\usepackage{graphicx}
\usepackage{listings}
%%%%%%%%%% SQL + provenance listing settings

View File

@ -16,7 +16,7 @@ In particular, we will consider the problems of computing the following counts (
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Theorem}[\cite{k-match}]
\label{thm:k-match-hard}
Given a positive integer $k$ and an undirected graph $G$ with no self-loops or parallel edges, computing $\numocc{G}{\kmatch}$ exactly is %counting the number of $k$-matchings in $G$ is
Given positive integer $k$ and undirected graph $G$ with no self-loops or parallel edges, computing $\numocc{G}{\kmatch}$ exactly is %counting the number of $k$-matchings in $G$ is
\sharpwonehard (parameterization is in $k$).
\end{Theorem}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@ -39,7 +39,7 @@ Both of our hardness results rely on a simple query polynomial encoding of the e
To prove our hardness result, consider a graph $G(V, E)$, where $|E| = m$, $|V| = \numvar$. Our query polynomial has a variable $X_i$ for every $i$ in $[\numvar]$.
Consider the polynomial
\[\poly_{G}(\vct{X}) = \sum\limits_{(i, j) \in E} X_i \cdot X_j\]
The hard polynomial for our problem will be a suitable power $k\ge 3$ of the polynomial above, i.e.
The hard polynomial for our problem will be a suitable power $k\ge 3$ of the polynomial above:
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Definition}\label{def:qk}
For any graph $G=([n],E)$ and $\kElem\ge 1$, define
@ -48,11 +48,14 @@ For any graph $G=([n],E)$ and $\kElem\ge 1$, define
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Our hardness results only need a \ti instance; we also consider the special case when all the tuple probabilities (probabilities assigned to $X_i$ by $\probAllTup$) are the same value. Note that our hardness results do not require the general circuit representation and hold even for the expression tree representation. %this polynomial can be encoded in an expression tree of size $\Theta(km)$.
Using the tables in \cref{fig:ex-shipping}, it is easy to see that $\poly_{G}^\kElem(\vct{X})$ can be constructed as follows:
\[\poly^k_G:- Loc(C_1),Route(C_1, C_1'),Loc(C_1'),\dots,Loc(C_\kElem),Route(C_\kElem,C_\kElem'),Loc(C_\kElem')\]
where generalizaing the PDB instance in \cref{fig:ex-shipping}, relation $Loc$ has $n$ tuples corresponding to each vertex in $V=[n]$ each with probability $\prob$ and $Route(\text{City}_1, \text{City}_2)$ has tuples corresponding to the edges $E$ (each with probability of $1$).\footnote{Technically, $\poly_{G}^\kElem(\vct{X})$ should have variables corresponding to tuples in $Route$ as well, but since they always are present with probability $1$, we drop those. Our argument also works when all the tuples in $Route$ also are present with probability $\prob$ but to simplify notation we assign probability $1$ to edges.}
Note that this imples that our hard query polynomial can be represented as an expression tree, created from a project-join query with some probability value for each $\prob_i$ -- by contrast our approximation algorithm in \Cref{sec:algo} can handle lineage polynomials represented as circuits generated by union of select-project-join (SPJU) queries with potentially distinct $\prob_i$ values. % (i.e. we do not need union or select operator to derive our hardness result).
\noindent Returning to \cref{fig:ex-shipping-simp}, it is easy to see that $\poly_{G}^\kElem(\vct{X})$ generalizes our running example query:
\[\poly^k_G:- Loc(C_1),Route(C_1, C_1'),Loc(C_1'),\dots,Loc(C_\kElem),Route(C_\kElem,C_\kElem'),Loc(C_\kElem')\]
where, adapting the PDB instance in \cref{fig:ex-shipping-simp}, relation $Loc$ has $n$ tuples, one for each vertex in $V=[n]$, each with probability $\prob$, and $Route(\text{City}_1, \text{City}_2)$ has one tuple for each edge in $E$ (each with probability $1$).\footnote{Technically, $\poly_{G}^\kElem(\vct{X})$ should have variables corresponding to tuples in $Route$ as well, but since they are always present with probability $1$, we drop those. Our argument also works when all the tuples in $Route$ are present with probability $\prob$, but to simplify notation we assign probability $1$ to edges.}
Note that this implies that our hard query polynomial can be represented even as an expression tree created from a project-join query with some probability value for each $\prob_i$; our hardness result thus transfers to this restricted setting as well.
% OK: The following (commented-out) sentence feels a bit misplaced here.
% -- by contrast our approximation algorithm in \Cref{sec:algo} can handle lineage polynomials represented as circuits generated by union of select-project-join (SPJU) queries with potentially distinct $\prob_i$ values. % (i.e. we do not need union or select operator to derive our hardness result).
%\AR{need discussion on the `tightness' of various params. First, this is for degree 6 poly-- while things are easy for say deg 2. Second this is for any fixed p. Finally, we only need project-join queries to get the hardness results. Also need to compare this with the generality of the approx upper bound results.}
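To make the encoding concrete, here is a small Python sketch (for intuition only; the graph is a hypothetical 4-cycle, not from the paper) that counts the pure-expansion terms of $\poly_G^\kElem$ whose $2\kElem$ variables are all distinct. Such terms correspond to ordered $\kElem$-tuples of pairwise vertex-disjoint edges, i.e., $\kElem!$ times the number of $\kElem$-matchings:

    from itertools import product
    from math import factorial

    E = [(1, 2), (2, 3), (3, 4), (4, 1)]   # hypothetical graph: a 4-cycle
    k = 2

    # Q_G(X) = sum over edges (i,j) of X_i * X_j; a term of Q_G^k picks one edge per factor.
    terms = 0
    for edges in product(E, repeat=k):
        variables = [v for e in edges for v in e]
        if len(set(variables)) == 2 * k:   # all distinct <=> edges pairwise disjoint
            terms += 1

    print(terms // factorial(k))  # 2: the 4-cycle has exactly two 2-matchings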
@ -69,7 +72,7 @@ Computing $\rpoly_G^\kElem(\prob_i,\dots,\prob_i)$ for arbitrary $G$ and any $(2
%
We will prove the above result by reducing from the problem of computing the number of $k$-matchings in $G$. Given the current best-known algorithm for this counting problem, our results imply that unless the state-of-the-art $k$-matching algorithms are improved, we cannot hope to solve our problem in time better than $\Omega_k\inparen{m^{k/2}}$, which is only quadratically faster than expanding $\poly_{G}^\kElem(\vct{X})$ into its \abbrSMB form and then using \Cref{cor:expct-sop}. By contrast, the approximation algorithm we present in \Cref{sec:algo} has runtime $O_k\inparen{m}$ for this query (since it runs in linear time on all lineage polynomials).
Here, we present a reduction from the problem of counting $\kElem$-matchings in a graph to our problem:
\noindent The following lemma reduces the problem of counting $\kElem$-matchings in a graph to our problem:
\begin{Lemma}\label{lem:qEk-multi-p}
Let $\prob_0,\ldots, \prob_{2\kElem}$ be distinct values in $(0, 1]$. Then given the values $\rpoly_{G}^\kElem(\prob_i,\ldots, \prob_i)$ for $0\leq i\leq 2\kElem$, the number of $\kElem$-matchings in $G$ can be computed in $O\inparen{\kElem^3}$ time.
\end{Lemma}

View File

@ -13,7 +13,7 @@ A monomial is a product of variable terms, each raised to a non-negative integer
\end{Definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
The \abbrSMB for the running example is $X^2 +2XY + Y^2$. While $X^2 + XY + XY + Y^2$ is an expanded form of the expression, it is not the standard monomial basis since $XY$ appears more than once.
The \abbrSMB for the running example is $X^2 + 2XY + Y^2$. While $X^2 + XY + XY + Y^2$ is an expanded form of the expression, it is not in SMB since $XY$ appears twice.
% \BG{Maybe inline degree?}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@ -23,7 +23,7 @@ The degree of polynomial $\poly(\vct{X})$ is the maximum sum of exponents, over
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
The degree of the running example polynomial is $2$.
Note that product terms can only arise as a consequence of join operations, so intuitively, the degree of a lineage polynomial is analogous to the largest number of joins in one clause of the UCQ query that created it.
Product terms in a lineage polynomial arise only as a consequence of join operations; intuitively, the degree of a lineage polynomial is thus analogous to the largest number of joins in any clause of the UCQ query that created it.
In this paper we consider only finite degree polynomials.
%
% Throughout this paper, we also make the following \textit{assumption}.
@ -38,15 +38,13 @@ We call a polynomial $\query(\vct{X})$ a \emph{\bi-lineage polynomial} (resp., \
%\AH{Why is it required for the tuple to be n-ary? I think this slightly confuses me since we have n tuples.}
% OK: agreed w/ AH, this can be treated as implicit
there exists a $\raPlus$ query $\query$, \bi $\pxdb$ (\ti $\pxdb$, or $\semNX$-PDB $\pxdb$), and tuple $\tup$ such that $\query(\vct{X}) = \query(\pxdb)(\tup)$. % Before proceeding, note that the following is assume that polynomials are \bis (which subsume \tis as a special case).
As they are a special case of \bis, the following applies to \tis as well.
Recall that in a \bi $\pxdb$ with tuples $t_1, \ldots, t_n$, each input tuple $t_i$ is annotated with a unique variable $X_i$.
Tuples of $\pxdb$ are partitioned into $\ell$ blocks $\block_1, \ldots, \block_\ell$ where tuple $t_i$ is associated with a probability $\prob_{\tup_i} = \pd[X_i = 1]$.
\footnote{
Although it is customary to define a single independent, $[\abs{\block_i}+1]$-valued variable per block, we decompose it into $\abs{\block_i}$ correlated $\{0,1\}$-valued variables per block that can be used directly in polynomials (without an indicator function). For $t_j \in b_i$, the event $(X_j = 1)$ corresponds to the event $(X_i = j)$ in the customary annotation scheme.
Since \tis are a special case of \bis, the following discussion applies to \tis as well.
Recall that in a \bi $\pxdb$, tuples are partitioned into $\ell$ blocks $\block_1, \ldots, \block_\ell$, where tuple $t_{i,j} \in \block_i$ is annotated with a unique variable $X_{i,j}$ and is associated with a probability $\prob_{\tup_{i,j}} = \pd[X_{i,j} = 1]$.\footnote{
Although only a single independent, $[\abs{\block_i}+1]$-valued variable is customarily used per block, we decompose it into $\abs{\block_i}$ correlated $\{0,1\}$-valued variables per block that can be used directly in polynomials (without an indicator function). For $t_{i,j} \in \block_i$, the event $(X_{i,j} = 1)$ corresponds to the event $(X_i = j)$ in the customary annotation scheme.
}
Because blocks are independent and tuples from the same block are disjoint, $\prob$ and the blocks induce the probability distribution $\pd$ of $\pxdb$.
Because blocks are independent and tuples from the same block are disjoint, the probabilities $\prob_{\tup_{i,j}}$ and the blocks induce the probability distribution $\pd$ of $\pxdb$.
We will write a \bi-lineage polynomial $\poly(\vct{X})$ for a \bi with $\ell$ blocks as
$\poly(\vct{X})$ = $\poly(X_{\block_1, 1},\ldots, X_{\block_1, \abs{\block_1}},$ $\ldots, X_{\block_\ell, \abs{\block_\ell}})$, where $\abs{\block_i}$ denotes the size of $\block_i$, and $X_{i, j}$ denotes the annotation of tuple $j$ residing in block $i$ for $j$ in $[\abs{\block_i}]$.\footnote{Later on in the paper, especially in~\Cref{sec:algo}, we will overload notation and rename the variables as $X_1,\dots,X_n$, where $n=\sum_{i=1}^\ell \abs{b_i}$.}
$\poly(\vct{X})$ = $\poly(X_{1, 1},\ldots, X_{1, \abs{\block_1}},$ $\ldots, X_{\ell, \abs{\block_\ell}})$, where $\abs{\block_i}$ denotes the size of $\block_i$.\footnote{Later on in the paper, especially in~\Cref{sec:algo}, we will overload notation and rename the variables as $X_1,\dots,X_n$, where $n=\sum_{i=1}^\ell \abs{b_i}$.}
%\SF{Where is $\block_{i, j}$ used? Is it $X_{\block_{1, 1}}$ or $X_{\block_1, 1}$ ?}
% and the probability distribution of $\pxdb$ is uniquely determined based on a probability vector $\vct{p}$ that associates each tuple a probability
% variables are independent of each other (or disjoint if they are from the same block) and each variable $X$ is associated with a probability $\vct{p}(X) = \pd[X = 1]$. Thus, we are dealing with polynomials $\poly(\vct{X})$ that are annotations of a tuple in the result of a query $\query$ over a BIDB $\pxdb$ where $\vct{X}$ is the set of variables that occur in annotations of tuples of $\pxdb$.
@ -65,7 +63,7 @@ Let $S$ be a {\em set} of polynomials over $\vct{X}$. Then $\poly(\vct{X})\mod{S
For example, for the set of polynomials $S=\inset{X^2-X, Y^2-Y}$, taking the polynomial $2X^2 + 3XY - 2Y^2 \mod S$ yields $2X+3XY-2Y$.
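A tiny Python sketch of this reduction (illustrative only), representing a polynomial as a map from exponent vectors to coefficients; reducing modulo polynomials of the form $X^2 - X$ amounts to capping every exponent at $1$ and re-combining like monomials:

    from collections import defaultdict

    # A polynomial as {exponent vector over (X, Y): coefficient}.
    poly = {(2, 0): 2, (1, 1): 3, (0, 2): -2}   # 2X^2 + 3XY - 2Y^2

    # mod S = {X^2 - X, Y^2 - Y}: rewrite X^e -> X for every e >= 1.
    reduced = defaultdict(int)
    for exps, coef in poly.items():
        reduced[tuple(min(e, 1) for e in exps)] += coef

    print(dict(reduced))  # {(1, 0): 2, (1, 1): 3, (0, 1): -2}, i.e. 2X + 3XY - 2Y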
%
\begin{Definition}\label{def:mod-set-polys}
Given the set of BIDB variables $\inset{X_{b,i}}$, define
Given the set of BIDB variables $\inset{X_{i,j}}$, define
\setlength\parindent{0pt}
\vspace*{-3mm}
@ -74,7 +72,7 @@ Given the set of BIDB variables $\inset{X_{b,i}}$, define
\begin{minipage}[b]{0.45\linewidth}
\centering
\begin{equation*}
\mathcal{B}=\comprehension{X_{b,i}\cdot X_{b,j}}{\text{ for every block } b \text{ and } i\ne j \in [~\abs{\block}~]},
\mathcal{B}=\comprehension{X_{i,j}\cdot X_{i,j'}}{i \in [\ell], j\neq j' \in [~\abs{\block_i}~]}
\end{equation*}
\end{minipage}%
\hspace{13mm}
@ -82,7 +80,7 @@ Given the set of BIDB variables $\inset{X_{b,i}}$, define
\begin{minipage}[b]{0.45\linewidth}
\centering
\begin{equation*}
\mathcal{T}=\comprehension{X_{b,i}^2-X_{b,i}}{\text{ for every block } b \text{ and } i \in [~\abs{\block}~]}
\mathcal{T}=\comprehension{X_{i,j}^2-X_{i,j}}{i \in [\ell], j \in [~\abs{\block_i}~]}
\end{equation*}
\end{minipage}
\\
@ -134,14 +132,15 @@ Consider $\poly(X, Y) = (X + Y)(X + Y)$ where $X$ and $Y$ are from different blo
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Definition}[Valid Worlds]
For probability distribution $\probDist$ and its corresponding probability mass function $\probOf$, the set of valid worlds $\eta$ consists of all the worlds with probability value greater than $0$; i.e., for variable vector $\vct{W}$
For probability distribution $\probDist$, % and its corresponding probability mass function $\probOf$,
the set of valid worlds $\eta$ consists of all the worlds with probability value greater than $0$; i.e., for variable vector $\vct{W}$
\[
\eta = \{\vct{w}\suchthat \probOf[\vct{W} = \vct{w}] > 0\}
\eta = \comprehension{\vct{w}}{\probOf[\vct{W} = \vct{w}] > 0}
\]
\end{Definition}
%We state additional equivalences between $\poly(\vct{X})$ and $\rpoly(\vct{X})$ in~\Cref{app:subsec-pre-poly-rpoly} and~\Cref{app:subsec-prop-q-qtilde}.
Next, we show why the reduced form is useful for our purposes:
\noindent Next, we show why the reduced form is useful for our purposes:
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

View File

@ -1,4 +1,5 @@
%root: main.tex
%!TEX root=./main.tex
\subsection{Problem Definition}\label{sec:expression-trees}
@ -11,16 +12,19 @@ We represent query polynomials via {\em arithmetic circuits}~\cite{arith-complex
\begin{Definition}[Circuit]\label{def:circuit}
A circuit $\circuit$ is a Directed Acyclic Graph (DAG) whose source nodes (with in-degree $0$) consist of elements of either $\reals$ or $\vct{X}$. The internal nodes and sink node of $\circuit$ each have two inputs and are either sum ($\circplus$) or product ($\circmult$) gates.
$\circuit$ additionally has the following members: \type, \val, \vari{partial}, \vari{input}, \degval and \vari{Lweight}, \vari{Rweight}, where \type is the type of value stored in the node $\circuit$ (i.e. one of $\{\circplus, \circmult, \var, \tnum\}$, \val is the value stored (a constant or variable), and \vari{input} is the list of \circuit 's inputs where $\circuit_\linput$ is the left input and $\circuit_\rinput$ the right input. The member \degval holds the degree of \circuit. When the underlying DAG is a tree (with edges pointing towards the root), we will refer to the structure as an expression tree \etree. Note that in such a case, the root of \etree is analogous to the sink of the \circuit.
$\circuit$ additionally has the following members: \type, \vari{val}, \vari{partial}, \vari{input}, \degval, \vari{Lweight}, and \vari{Rweight}, where \type is the type of value stored in the node $\circuit$ (i.e., one of $\{\circplus, \circmult, \var, \tnum\}$), \vari{val} is the value stored (a constant or variable), and \vari{input} is the list of \circuit 's inputs, where $\circuit_\linput$ is the left input and $\circuit_\rinput$ the right input. The member \degval holds the degree of \circuit. When the underlying DAG is a tree (with edges pointing towards the root), we will refer to the structure as an expression tree \etree. Note that in such a case, the root of \etree is analogous to the sink of \circuit.
\end{Definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
As stated in ~\Cref{def:circuit}, every internal node has at most two in-edges, is labeled as an addition or a multiplication node, and has no limit on its outdegree.
As stated in \Cref{def:circuit}, every internal node has at most two in-edges, is labeled as an addition or a multiplication node, and has no limit on its outdegree.
Note that if we limit the outdegree to one, then we get expression trees.
We ignore the fields \vari{partial}, \vari{Lweight}, and \vari{Rweight} until \Cref{sec:algo}.
\begin{Example}
The circuit \circuit in ~\Cref{fig:circuit-express-tree} encodes the polynomial $XY + WZ$. Note that such an encoding lends itself naturally to having all gates with an outdegree of $1$. Note further that \circuit is indeed a tree with edges pointing towards the root.
The circuit \circuit in \Cref{fig:circuit-express-tree} encodes the polynomial $XY + WZ$. Such an encoding lends itself naturally to having all gates with an outdegree of $1$; indeed, \circuit is a tree with edges pointing towards the root.
\end{Example}
\begin{figure}[t]
@ -86,8 +90,6 @@ The circuit \circuit in ~\Cref{fig:circuit-express-tree} encodes the polynomial
\caption{ }
\end{figure}
We ignore the remaining fields (\vari{partial}, \vari{Lweight}, and \vari{Rweight}) until \Cref{sec:algo}.
The semantics of circuits follows the obvious interpretation. We next formally define their relationship with polynomials:
\begin{Definition}[$\polyf(\cdot)$]\label{def:poly-func}
@ -101,7 +103,7 @@ Denote $\polyf(\circuit)$ to be the function from circuit $\circuit$ to its corr
\end{equation*}
\end{Definition}
Note that $\circuit$ need not encode an expression in standard monomial basis, while as stated previously a polynomial is considered to be in SMB, and the output of \polyf($\cdot$) is therefore in SMB. For instance, $\circuit$ could represent a compressed form of the running example, such as $(X + 2Y)(2X - Y)$
Note that $\circuit$ need not encode an expression in standard monomial basis; as stated previously, a polynomial is considered to be in SMB, so the output of \polyf($\cdot$) is in SMB. For instance, $\circuit$ could represent a compressed form of the running example, such as $(X + 2Y)(2X - Y)$
, as shown in \Cref{fig:circuit}.
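As an illustration (a minimal sketch with my own nested-tuple encoding of circuits, not the paper's implementation), one can evaluate a \polyf-style function on the circuit for $(X + 2Y)(2X - Y)$ and obtain the SMB polynomial by combining like monomials:

    from collections import Counter

    # A circuit as nested tuples: ("+"/"*", left, right), a variable name, or a number.
    circuit = ("*", ("+", "X", ("*", 2, "Y")), ("+", ("*", 2, "X"), ("*", -1, "Y")))

    def polyf(c):
        """Return the polynomial of c as {monomial (sorted variable tuple): coefficient}."""
        if isinstance(c, str):
            return {(c,): 1}
        if isinstance(c, (int, float)):
            return {(): c}
        op, left, right = c
        p, q = polyf(left), polyf(right)
        out = Counter()
        if op == "+":
            for poly in (p, q):
                for mono, coef in poly.items():
                    out[mono] += coef
        else:  # "*"
            for m1, c1 in p.items():
                for m2, c2 in q.items():
                    out[tuple(sorted(m1 + m2))] += c1 * c2
        return dict(out)

    print(polyf(circuit))  # {('X','X'): 2, ('X','Y'): 3, ('Y','Y'): -2}, i.e. 2X^2 + 3XY - 2Y^2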
\begin{Definition}[Circuit Set]\label{def:circuit-set}
@ -112,7 +114,7 @@ $\circuitset{\smb}$ is the set of all possible circuits $\circuit$ such that $\p
The circuit of \Cref{fig:circuit} is an element of $\circuitset{\smb}$. One can think of $\circuitset{\smb}$ as the infinite set of circuits, each of which models an encoding (factorization) of $\smb$.
%\supset \{2X^2 + 3XY - 2Y^2, (X + 2Y)(2X - Y), X(2X - Y) + 2Y(2X - Y), 2X(X + 2Y) - Y(X + 2Y)\}$.
Note that ~\Cref{def:circuit-set} implies that $\circuit \in \circuitset{\polyf(\circuit)}$.
Note that \Cref{def:circuit-set} implies that $\circuit \in \circuitset{\polyf(\circuit)}$.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\medskip
@ -121,9 +123,11 @@ Note that ~\Cref{def:circuit-set} implies that $\circuit \in \circuitset{\polyf(
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Definition}[The Expected Result Multiplicity Problem]\label{def:the-expected-multipl}
Let $\vct{X} = (X_1, \ldots, X_n)$, and $\pdb$ be an $\semNX$-PDB over $\vct{X}$ with probability distribution $\pd$ over assignments $\vct{X} \to \{0,1\}$, $\query$ an n-ary query, and $t$ an n-ary tuple.
The \expectProblem is defined as follows:
\hspace*{5mm}\textbf{Input}: A circuit $\circuit \in \circuitset{\smb}$ for $\poly(\vct{X}) = \query(\pxdb)(t)$\hspace*{5mm}\textbf{Output}: $\expct_{\vct{W} \sim \pd}[\poly(\vct{W})]$
The \expectProblem is defined as follows:\\[-7mm]
\begin{center}
\textbf{Input}: A circuit $\circuit \in \circuitset{\smb}$ for $\poly(\vct{X}) = \query(\pxdb)(t)$
\hspace*{5mm}\textbf{Output}: $\expct_{\vct{W} \sim \pd}[\poly(\vct{W})]$
\end{center}
\end{Definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

View File

@ -82,7 +82,11 @@ We use $\evald{\cdot}{\db}$ to denote the result of evaluating query $\query$ ov
\evald{\rel}{\db}(\tup) & \text{if }\theta(\tup) \\
\zeroK & \text{otherwise}.
\end{cases} &
\evald{(\rel_1 \join \rel_2)}{\db}(\tup) &= \evald{\rel_1}{\db}(\project_{\sch(\rel_1)}(\tup)) \multK \evald{\rel_2}{\db}(\project_{\sch(\rel_2)}(\tup)) \\
\evald{(\rel_1 \join \rel_2)}{\db}(\tup) &=
\begin{aligned}
\evald{\rel_1}{\db}(\project_{\sch(\rel_1)}(\tup)) \multK \\
\evald{\rel_2}{\db}(\project_{\sch(\rel_2)}(\tup))
\end{aligned}\\
& & \evald{R}{\db}(\tup) &= \rel(\tup)
\end{align*}
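A minimal sketch of these semantics over $\semNX$-annotated relations (the relation contents, city names, and variable names below are made up for illustration); an annotation is a sum of products, here a list of monomials, each a tuple of variable names:

    # A relation maps tuples to N[X] annotations; a missing tuple has annotation 0.
    Loc = {("Buffalo",): [("X1",)], ("Chicago",): [("X2",)]}
    Route = {("Buffalo", "Chicago"): [("Z1",)]}

    def select(rel, theta):
        # sigma_theta: keep the annotation when theta holds, 0 (dropped) otherwise
        return {t: ann for t, ann in rel.items() if theta(t)}

    def join(rel1, rel2, proj1, proj2, tuples):
        # (R1 join R2)(t) = R1(pi_sch(R1)(t)) * R2(pi_sch(R2)(t)); * concatenates monomials
        out = {}
        for t in tuples:
            a1, a2 = rel1.get(proj1(t), []), rel2.get(proj2(t), [])
            out[t] = [m1 + m2 for m1 in a1 for m2 in a2]
        return out

    print(select(Loc, lambda t: t[0] == "Buffalo"))  # {('Buffalo',): [('X1',)]}
    print(join(Loc, Route,
               proj1=lambda t: (t[0],), proj2=lambda t: (t[0], t[1]),
               tuples=[("Buffalo", "Chicago")]))     # {('Buffalo', 'Chicago'): [('X1', 'Z1')]}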
@ -104,7 +108,7 @@ $\semNX$-PDBs and a function $\rmod$ (which transforms an $\semNX$-PDB to an equ
\end{Proposition}
\noindent A formal proof of \Cref{prop:expection-of-polynom} is given in \Cref{subsec:expectation-of-polynom-proof}.
This proposition shows that computing expected tuple multiplicities is equivalent to computing the expectation of a polynomial (for that tuple) from a probability distribution over all possible assignments of variables in the polynomial to $\{0,1\}$.
We focus on this problem from now on, assume an implicit result tuple, and so drop the subscript from $\polyForTuple$ (i.e., $\poly$ is used as a polynomial from now on).
We focus on this problem from now on, assume an implicit result tuple, and drop the subscript from $\polyForTuple$ (i.e., $\poly$ will denote a polynomial).
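For intuition, a quick brute-force check (a sketch with hypothetical tuple probabilities, not the paper's method) that the expectation of the running example polynomial over all $\{0,1\}$ worlds of a \ti equals the reduced polynomial evaluated at the probabilities:

    from itertools import product

    p = {"X": 0.3, "Y": 0.6}                  # hypothetical tuple probabilities (TIDB)

    def poly(w):                              # running example 2X^2 + 3XY - 2Y^2
        return 2 * w["X"] ** 2 + 3 * w["X"] * w["Y"] - 2 * w["Y"] ** 2

    # Expectation over all possible worlds W in {0,1}^2 (independent variables).
    expectation = 0.0
    for wx, wy in product([0, 1], repeat=2):
        prob = (p["X"] if wx else 1 - p["X"]) * (p["Y"] if wy else 1 - p["Y"])
        expectation += prob * poly({"X": wx, "Y": wy})

    # Reduced polynomial 2X + 3XY - 2Y evaluated at the probabilities.
    reduced = 2 * p["X"] + 3 * p["X"] * p["Y"] - 2 * p["Y"]
    print(round(expectation, 10), round(reduced, 10))  # both -0.06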
\subsubsection{\tis and \bis}
\label{subsec:tidbs-and-bidbs}