master
Boris Glavic 2021-04-08 21:30:03 -05:00
parent ab6c53c52e
commit 0f704e7377
4 changed files with 46 additions and 38 deletions

View File

@ -34,7 +34,7 @@ For further explanation, please refer to \cref{example:expr-tree-T}.
\begin{Definition}[$\abs{\circuit}(\vct{X})$]\label{def:positive-circuit}
For any circuit $\circuit$, the corresponding
{\em positive circuit}, denoted $\abs{\circuit}$, is obtained from $\circuit$ as follows. For each leaf node $\ell$ of $\circuit$ where $\ell.\type$ is $\tnum$, update $\ell.\vari{value}$ to $|\ell.\vari{value}|$.
{\em positive circuit}, denoted $\abs{\circuit}$, is obtained from $\circuit$ as follows. For each leaf node $\ell$ of $\circuit$ where $\ell.\type$ is $\tnum$, update $\ell.\vari{value}$ to $|\ell.\vari{value}|$.
\end{Definition}
Please see \cref{ex:def-pos-circ} for an illustration.
@ -97,14 +97,13 @@ Finally, we address the $\multc{\log\left(\abs{\circuit}(1,\ldots, 1)\right)}{\l
\begin{Lemma}
\label{lem:val-ub}
For any circuit $\circuit$ with $\degree(\circuit)=k$, we have
\[\abs{\circuit}(1,\ldots, 1)\le 2^{2^k\cdot \size(\circuit)}.\]
$\abs{\circuit}(1,\ldots, 1)\le 2^{2^k\cdot \size(\circuit)}.$
Further, under either of the following conditions:
\begin{enumerate}
\item $\circuit$ is a tree,
\item $\circuit$ encodes the run of the algorithm in~\cite{DBLP:conf/pods/KhamisNR16} on an FAQ query,
\end{enumerate}
we have
\[\abs{\circuit}(1,\ldots, 1)\le \size(\circuit)^{O(k)}.\]
we have $\abs{\circuit}(1,\ldots, 1)\le \size(\circuit)^{O(k)}.$
\end{Lemma}
Note that the above implies that, under the assumption that $\prob_0>0$ and $\gamma<1$ from \Cref{cor:approx-algo-const-p} are absolute constants, the runtime there simplifies to $O_k\left(\frac 1{\inparen{\error'}^2}\cdot\size(\circuit)^2\cdot \log{\frac{1}{\conf}}\right)$ for general circuits $\circuit$ and to $O_k\left(\frac 1{\inparen{\error'}^2}\cdot\size(\circuit)\cdot \log{\frac{1}{\conf}}\right)$ for the case when $\circuit$ satisfies the special conditions in~\Cref{lem:val-ub}. In~\Cref{app:proof-lem-val-ub} we argue that these conditions are very general and encompass many interesting scenarios.
@ -128,7 +127,7 @@ To prove correctness of~\Cref{alg:mon-sam}, we only use the following fact that
\begin{Lemma}\label{lem:sample}
The function $\sampmon$ completes in time
\[O(\log{k} \cdot k \cdot \depth(\circuit)\cdot\multc{\log\left(\abs{\circuit}(1,\ldots, 1)\right)}{\log{\size(\circuit)}})\]
where $k = \degree(\circuit)$. The function returns every $\left(\monom, \operatorname{sign}(\coef)\right)$ for $(\monom, \coef)\in \expansion{\circuit}$ with probability $\frac{|\coef|}{\abs{\circuit}(1,\ldots, 1)}$.
where $k = \degree(\circuit)$. The function returns every $\left(\monom, \operatorname{sign}(\coef)\right)$ for $(\monom, \coef)\in \expansion{\circuit}$ with probability $\frac{|\coef|}{\abs{\circuit}(1,\ldots, 1)}$.
\end{Lemma}
With the above two lemmas, we are ready to argue the following result (proof in~\Cref{sec:proofs-approx-alg}):

View File

@ -30,10 +30,12 @@ We adopt a minimalistic compute-bound model of query evaluation drawn from the w
%
\noindent\resizebox{1\linewidth}{!}{
\begin{minipage}{1.0\linewidth}
\begin{align*}
\qruntime{R,D} & = |R| &
\qruntime{\sigma Q, D} & = \qruntime{Q,D} &
\qruntime{\pi Q, D} & = \qruntime{Q,D} + \abs{Q(D)}
\end{align*}\\[-15mm]
\begin{align*}
\qruntime{R,D} & = |R| \\
\qruntime{\sigma Q, D} & = \qruntime{Q,D} \\
\qruntime{\pi Q, D} & = \qruntime{Q,D} + \abs{Q(D)} \\
\qruntime{Q \cup Q', D} & = \qruntime{Q, D} + \qruntime{Q', D} +\abs{Q(D)}+\abs{Q'(D)} \\
\qruntime{Q_1 \bowtie \ldots \bowtie Q_n, D} & = \qruntime{Q_1, D} + \ldots + \qruntime{Q_n,D} + \abs{Q_1(D) \bowtie \ldots \bowtie Q_n(D)}
\end{align*}

View File

@ -3,18 +3,25 @@
%\onecolumn
\subsection{Reduced Polynomials and Equivalences}
We now introduce some terminology for polynomials and develop a reduced form for polynomials --- a closed form of the polynomial's expectation over probability distributions derived from a \bi or \ti.
We now introduce some terminology % for polynomials
and develop a reduced form for polynomials --- a closed form of the polynomial's expectation over probability distributions derived from a \bi or \ti.
%We will use $(X + Y)^2$ as a running example.
Recall that a polynomial over $\vct{X}=(X_1,\dots,X_n)$ is formally defined as:
\[Q(X_1,\dots,X_n)=\sum_{\vct{i}=(i_1,\dots,i_n)\in \semN^n} c_{\vct{i}}\cdot \prod_{j=1}^n X_j^{i_j}.\]
\begin{equation}
\label{eq:sop-form}
Q(X_1,\dots,X_n)=\sum_{\vct{i}=(i_1,\dots,i_n)\in \semN^n} c_{\vct{i}}\cdot \prod_{j=1}^n X_j^{i_j}.
\end{equation}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Definition}[Standard Monomial Basis]\label{def:smb}
%A monomial is a product of variable terms, each raised to a non-negative integer power.
% A polynomial in \termSMB (\abbrSMB) has the form: $\sum_{i=1}^n c_i \cdot m_i$ for each of its $n$ terms, where each $c_i \neq 0$ is an integer and each $m_i$ is a monomial and $m_i \neq m_j$ for $i \neq j$. We use $\smbOf{\poly}$ to denote the \abbrSMB of $\poly$.
The term $\prod_{j=1}^n X_j^{i_j}$ is a {\em monomial}. A polynomial $Q(\vct{X})$ is in standard monomial basis (or SMB) if in the above sum only terms with $c_{\vct{i}}\ne 0$ appear.
The term $\prod_{j=1}^n X_j^{i_j}$ is a {\em monomial}. A polynomial $Q(\vct{X})$ is in standard monomial basis (SMB) if % in the above sum
all terms with $c_{\vct{i}} = 0$ are removed from \Cref{eq:sop-form}, i.e., only terms with $c_{\vct{i}}\ne 0$ appear.
\end{Definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
In this paper we consider the default representation of a polynomial to be in \abbrSMB. Sometimes when we want to stress that we want to use the SMB representation of a polynomial $\poly$ we will explicitly state $\smbOf{\poly}$.
We consider \abbrSMB as the default representation of a polynomial. % When we want to stress the use of the SMB representation,
We use $\smbOf{\poly}$ to denote the SMB form of a polynomial $\poly$.
%The \abbrSMB for the running example is $X^2 +2XY + Y^2$. Note that the example's SOP expansion $X^2 + XY + XY + Y^2$ is not $\smbOf{(X+Y)^2}$ since $XY$ appears twice.
@ -25,7 +32,7 @@ The degree of polynomial $\poly(\vct{X})$ is the largest $\sum_{j=1}^n i_j$ such
\end{Definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
The degree of the polynomial $X^2+2XY+Y^2$ is $2$.
The degree of the polynomial $X^2+2XY+Y^2$ is $2$.
Product terms in lineage arise only as a consequence of join operations, so intuitively, the degree of a lineage polynomial is analogous to the largest number of joins in any clause of the UCQ query that created it.
In this paper we consider only finite degree polynomials.
%
@ -38,13 +45,13 @@ In this paper we consider only finite degree polynomials.
% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
We call a polynomial $\query(\vct{X})$ a \emph{\bi-lineage polynomial} (resp., \emph{\ti-lineage polynomial}, or simply lineage polynomial), if
%\AH{Why is it required for the tuple to be n-ary? I think this slightly confuses me since we have n tuples.}
%\AH{Why is it required for the tuple to be n-ary? I think this slightly confuses me since we have n tuples.}
% OK: agreed w/ AH, this can be treated as implicit
there exists a $\raPlus$ query $\query$, \bi $\pxdb$ (\ti $\pxdb$, or $\semNX$-PDB $\pxdb$), and tuple $\tup$ such that $\query(\vct{X}) = \query(\pxdb)(\tup)$. % Before proceeding, note that the following is assume that polynomials are \bis (which subsume \tis as a special case).
As a special case of \bis, the following applies to \tis as well.
In a \bi $\pxdb$, tuples are partitioned into $\ell$ blocks $\block_1, \ldots, \block_\ell$ where tuple $t_{i,j} \in \block_i$ is associated with a probability $\prob_{\tup_{i,j}} = \pd[X_{i,j} = 1]$, and is annotated with a unique variable $X_{i,j}$.\footnote{
Although only a single independent, $[\abs{\block_i}+1]$-valued variable is customarily used per block, we decompose it into $\abs{\block_i}$ correlated $\{0,1\}$-valued variables per block that can be used directly in polynomials (without an indicator function). For $t_{i,j} \in \block_i$, the event $(X_{i,j} = 1)$ corresponds to the event $(X_i = j)$ in the customary annotation scheme.
}
}
Because blocks are independent and tuples from the same block are disjoint, the probabilities $\prob_{\tup_{i,j}}$ and the blocks induce the probability distribution $\pd$ of $\pxdb$.
We will write a \bi-lineage polynomial $\poly(\vct{X})$ for a \bi with $\ell$ blocks as
$\poly(\vct{X}) = \poly(X_{1, 1},\ldots, X_{1, \abs{\block_1}},$ $\ldots, X_{\ell, \abs{\block_\ell}})$, where $\abs{\block_i}$ denotes the size of $\block_i$.\footnote{Later on in the paper, especially in~\Cref{sec:algo}, we will overload notation and rename the variables as $X_1,\dots,X_n$, where $n=\sum_{i=1}^\ell \abs{\block_i}$.}
@ -93,16 +100,16 @@ Given the set of BIDB variables $\inset{X_{i,j}}$, define
%
\begin{Definition}[Reduced \bi Polynomials]\label{def:reduced-bi-poly}
Let $\poly(\vct{X})$ be a \bi-lineage polynomial.
The reduced form $\rpoly(\vct{X})$ of $\poly(\vct{X})$ is:
\begin{equation*}
\rpoly(\vct{X}) = \poly(\vct{X}) \mod \inparen{\mathcal{T} \cup \mathcal{B}}%X_i^2 - X_i \mod X_{\block_s, t}X_{\block_s, u}
\end{equation*}
The reduced form $\rpoly(\vct{X})$ of $\poly(\vct{X})$ is: $\rpoly(\vct{X}) = \poly(\vct{X}) \mod \inparen{\mathcal{T} \cup \mathcal{B}}$
% \begin{equation*}
% \rpoly(\vct{X}) = \poly(\vct{X}) \mod \inparen{\mathcal{T} \cup \mathcal{B}}%X_i^2 - X_i \mod X_{\block_s, t}X_{\block_s, u}
% \end{equation*}
%for all $i$ in $[\numvar]$ and for all $s$ in $\ell$, such that for all $t, u$ in $[\abs{\block_s}]$, $t \neq u$.
\end{Definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
All exponents $e > 1$ in $\smbOf{\poly(\vct{X})}$ are reduced to $e = 1$ via mod $\mathcal{T}$. Performing the modulus of $\rpoly(\vct{X})$ with $\mathcal{B}$ ensures the disjoint condition of \bi, removing monomials with lineage variables from the same block.%, (recall the constraint on tuples from the same block being disjoint in a \bi).% any monomial containing more than one tuple from a block has $0$ probability and can be ignored).
All exponents $e > 1$ in $\smbOf{\poly(\vct{X})}$ are reduced to $e = 1$ via mod $\mathcal{T}$. Performing the modulus of $\rpoly(\vct{X})$ with $\mathcal{B}$ ensures the disjoint condition of \bi, removing monomials with lineage variables from the same block.%, (recall the constraint on tuples from the same block being disjoint in a \bi).% any monomial containing more than one tuple from a block has $0$ probability and can be ignored).
%
For the special case of \tis, the second step is not necessary since every block contains a single tuple.
%Alternatively, one can think of $\rpoly$ as the \abbrSMB of $\poly(\vct{X})$ when the product operator is idempotent.
@ -138,7 +145,7 @@ Consider $\poly(X, Y) = (X + Y)(X + Y)$ where $X$ and $Y$ are from different blo
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Definition}[Valid Worlds]
For probability distribution $\probDist$, % and its corresponding probability mass function $\probOf$,
For probability distribution $\probDist$, % and its corresponding probability mass function $\probOf$,
the set of valid worlds $\eta$ consists of all the worlds with probability value greater than $0$; i.e., for variable vector $\vct{W}$
\[
\eta = \comprehension{\vct{w}}{\probOf[\vct{W} = \vct{w}] > 0}
@ -168,8 +175,8 @@ Let $\pxdb$ be a \bi over variables $\vct{X} = \{X_1, \ldots, X_\numvar\}$ and w
\end{Lemma}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Note that in the preceding lemma, we have assigned $\vct{p}$
%(introduced in \Cref{subsec:def-data})
Note that in the preceding lemma, we have assigned $\vct{p}$
%(introduced in \Cref{subsec:def-data})
to the variables $\vct{X}$. Intuitively, \Cref{lem:exp-poly-rpoly} states that when we replace each variable $X_i$ with its probability $\prob_i$ in the reduced form of a \bi-lineage polynomial and evaluate the resulting expression in $\mathbb{R}$, then the result is the expectation of the polynomial.

View File

@ -10,10 +10,10 @@ For illustrative purposes consider the polynomial $\poly(\vct{X}) = 2X^2 + 3XY -
We represent query polynomials via {\em arithmetic circuits}~\cite{arith-complexity}, a standard way to represent polynomials over fields (particularly in the field of algebraic complexity) that we use for polynomials over $\mathbb N$ in the obvious way.
\begin{Definition}[Circuit]\label{def:circuit}
A circuit $\circuit$ is a Directed Acyclic Graph (DAG) whose source nodes (in-degree of $0$) consist of elements in either $\reals$ or $\vct{X}$. The internal nodes and (the single) sink node of $\circuit$ (corresponding to the result tuple $t$) have binary input and are either sum ($\circplus$) or product ($\circmult$) gates.
A circuit $\circuit$ is a Directed Acyclic Graph (DAG) whose source nodes (in-degree of $0$) consist of elements in either $\reals$ or $\vct{X}$. The internal nodes and (the single) sink node of $\circuit$ (corresponding to the result tuple $t$) have binary input and are either sum ($\circplus$) or product ($\circmult$) gates.
$\circuit$ additionally has the following members: \type, \vari{val}, \vari{partial}, \vari{input}, \degval and \vari{Lweight}, \vari{Rweight}, where \type is the type of value stored in the node $\circuit$ (i.e., one of $\{\circplus, \circmult, \var, \tnum\}$), \val is the value stored (a constant or variable), and \vari{input} is the list of \circuit 's inputs where $\circuit_\linput$ is the left input and $\circuit_\rinput$ the right input.
%The member \degval holds the degree of \circuit.
$\circuit$ additionally has the following members: \type, \vari{val}, \vari{partial}, \vari{input}, \degval and \vari{Lweight}, \vari{Rweight}, where \type is the type of value stored in the node $\circuit$ (i.e., one of $\{\circplus, \circmult, \var, \tnum\}$), \val is the value stored (a constant or variable), and \vari{input} is the list of \circuit 's inputs where $\circuit_\linput$ is the left input and $\circuit_\rinput$ the right input.
%The member \degval holds the degree of \circuit.
When the underlying DAG is a tree (with edges pointing towards the root), we will refer to the structure as an expression tree \etree. Note that in such a case, the root of \etree is analogous to the sink of \circuit.
\end{Definition}
@ -21,7 +21,7 @@ When the underlying DAG is a tree (with edges pointing towards the root), we wil
As stated in \Cref{def:circuit}, every internal node has at most two in-edges, is labeled as an addition or a multiplication node, and has no limit on its outdegree.
Note that if we limit the outdegree to one, then we get expression trees.
Note that if we limit the outdegree to one, then we get expression trees.
We ignore the fields \vari{partial}, \vari{Lweight}, and \vari{Rweight} until \Cref{sec:algo}.
@ -37,12 +37,12 @@ The circuit \circuit in \Cref{fig:circuit-express-tree} encodes the polynomial $
\node[tree_node] (b1) at (1, 0){$\boldsymbol{Y}$};
\node[tree_node] (c1) at (2, 0){$\boldsymbol{W}$};
\node[tree_node] (d1) at (3, 0){$\boldsymbol{Z}$};
\node[tree_node] (a2) at (0.5, 1){$\boldsymbol{\circmult}$};
\node[tree_node] (b2) at (2.5, 1){$\boldsymbol{\circmult}$};
\node[tree_node] (a3) at (1.5, 2){$\boldsymbol{\circplus}$};
\draw[->] (a1) -- (a2);
\draw[->] (b1) -- (a2);
\draw[->] (c1) -- (b2);
@ -62,16 +62,16 @@ The circuit \circuit in \Cref{fig:circuit-express-tree} encodes the polynomial $
\node[tree_node] (b1) at (1.5, 0) {$\boldsymbol{2}$};
\node[tree_node] (c1) at (3, 0) {$\boldsymbol{Y}$};
\node[tree_node] (d1) at (4.5, 0) {$\boldsymbol{-1}$};
\node[tree_node] (a2) at (0.75, 0.75) {$\boldsymbol{\circmult}$};
\node[tree_node] (b2) at (2.25, 0.75) {$\boldsymbol{\circmult}$};
\node[tree_node] (c2) at (3.75, 0.75) {$\boldsymbol{\circmult}$};
\node[tree_node] (a3) at (0.55, 1.5) {$\boldsymbol{\circplus}$};
\node[tree_node] (b3) at (3.75, 1.5) {$\boldsymbol{\circplus}$};
\node[tree_node] (a4) at (2.25, 2.25) {$\boldsymbol{\circmult}$};
\node[tree_node] (a4) at (2.25, 2.25) {$\boldsymbol{\circmult}$};
\draw[->] (a1) -- (a2);
\draw[->, thick] (a1) -- (a3);
\draw[->] (b1) -- (a2);
@ -89,7 +89,7 @@ The circuit \circuit in \Cref{fig:circuit-express-tree} encodes the polynomial $
\caption{Circuit encoding of $(X + 2Y)(2X - Y)$}
\label{fig:circuit}
\end{subfigure}
\caption{ }
\caption{Example circuit encodings}
\end{figure}
@ -113,8 +113,8 @@ $\circuitset{\smb}$ is the set of all possible circuits $\circuit$ such that $\p
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
The circuit of \Cref{fig:circuit} is an element of $\circuitset{2X^2+3XY-2Y^2}$. One can think of $\circuitset{\smb}$ as the infinite set of circuits each of which models an encoding (factorization) equal to $\polyf(\circuit)$.
%\supset \{2X^2 + 3XY - 2Y^2, (X + 2Y)(2X - Y), X(2X - Y) + 2Y(2X - Y), 2X(X + 2Y) - Y(X + 2Y)\}$.
The circuit of \Cref{fig:circuit} is an element of $\circuitset{2X^2+3XY-2Y^2}$. One can think of $\circuitset{\smb}$ as the infinite set of circuits each of which models an encoding (factorization) equal to $\polyf(\circuit)$.
%\supset \{2X^2 + 3XY - 2Y^2, (X + 2Y)(2X - Y), X(2X - Y) + 2Y(2X - Y), 2X(X + 2Y) - Y(X + 2Y)\}$.
Note that \Cref{def:circuit-set} implies that $\circuit \in \circuitset{\polyf(\circuit)}$.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%