Implemented @oliver's 021221 suggestions.

This commit is contained in:
Aaron Huber 2021-02-15 13:24:19 -05:00
parent 9aa7773219
commit cca1cb5dc8
3 changed files with 125 additions and 50 deletions

View file

@ -5,7 +5,7 @@
In~\Cref{sec:hard}, we showed that computing the expected multiplicity of a compressed representation of a bag polynomial for \ti (even just based on project-join queries) is unlikely to be possible in linear time (\Cref{thm:mult-p-hard-result}), even if all tuples have the same probability (\Cref{th:single-p-hard}).
Given this, we now design an approximation algorithm for our problem that runs in {\em linear time}.
Unlike the results in~\Cref{sec:hard} our approximation algorithm works for \bi, though our bounds are more meaningful for a non-trivial subclass of \bis that contains both \tis, as well as the PDBench benchmark~\cite{pdbench}.
The following approximation algorithm applies to \bi, though our bounds are more meaningful for a non-trivial subclass of \bis that contains both \tis, as well as the PDBench benchmark~\cite{pdbench}.
%it is then desirable to have an algorithm to approximate the multiplicity in linear time, which is what we describe next.
\subsection{Preliminaries and some more notation}
@ -69,45 +69,7 @@ $\expansion{\circuit}$ encodes the \emph{reduced} form of $\polyf\inparen{\circu
Note, however, that unlike $\rpoly$, $\expansion{\circuit}$ does not need to be in SOP form.
\begin{figure}[t]
\resizebox{0.65\columnwidth}{!}{
\begin{tikzpicture}[thick, level distance=0.9cm,level 1/.style={sibling distance=3.55cm}, level 2/.style={sibling distance=1.8cm}, level 3/.style={sibling distance=0.8cm}]% level/.style={sibling distance=6cm/(#1 * 1.5)}]
\node[tree_node](root){$\boldsymbol{\times}$}
child{node[tree_node]{$\boldsymbol{+}$}
child{node[tree_node]{x}
%child[missing]{node[tree_node]{}}
%child{node[tree_node]{x}}
}
child{node[tree_node]{$\boldsymbol{\times}$}
child{node[tree_node]{2}}
child{node[tree_node]{y}}
}
}
child{node[highlight_treenode] (TR) {$\boldsymbol{+}$}
child{node[tree_node]{$\boldsymbol{\times}$}
child{node[tree_node]{2}}
child{node[tree_node]{x}}
}
child{node[tree_node]{$\boldsymbol{\times}$}
child{node[tree_node] (neg-leaf) {-1}}
child{node[tree_node]{y}}
}
%child[sibling distance= 0cm, grow=north east, red]{node[tree_node]{$\circuit_\rchild$}}
};
% \node[below=2pt of neg-leaf, inner sep=1pt, blue] (neg-comment) {\textbf{Negation pushed to leaf nodes}};
% \draw[<-|, blue] (neg-leaf) -- (neg-comment);
\node[above right=0.7cm of TR, highlight_color, inner sep=0pt, font=\bfseries] (tr-label) {$\circuit_\rinput$};
\node[above right=0.7cm of root, highlight_color, inner sep=0pt, font=\bfseries] (t-label) {$\circuit$};
\draw[<-|, highlight_color] (TR) -- (tr-label);
\draw[<-|, highlight_color] (root) -- (t-label);
\end{tikzpicture}
}
\vspace*{-2mm}
\caption{Expression tree $\circuit$ for the product $\boldsymbol{(x + 2y)(2x - y)}$.}
\label{fig:expr-tree-T}
\trimfigurespacing
\end{figure}
\revision{
\begin{Definition}[Positive \circuit]\label{def:positive-circuit}
@ -158,9 +120,14 @@ such that
%with multiplicative $(\error,\delta)$-bounds, where $k$ denotes the degree of $\poly$.
\end{Theorem}
\noindent The proof of~\Cref{lem:approx-alg} can be found in~\Cref{sec:proofs-approx-alg}.
\noindent The proof of~\Cref{lem:approx-alg} \revision{
(which relies on~\Cref{lem:one-pass} and~\Cref{lem:sample})
}
can be found in~\Cref{sec:proof-lem-approx-alg}. \revision{
The proofs of the referenced lemmas can be found in~\Cref{sec:proof-one-pass} and~\Cref{sec:proof-sample-monom}, respectively.
}
To get linear runtime results from~\Cref{lem:approx-alg}, we will need to define another parameter modeling the (weighted) number of monomials in $\expansion{\circuit}$ to be `canceled' when it is modded with $\mathcal{B}$ (~\cref{def:mod-set-polys}):
To get linear runtime results from~\Cref{lem:approx-alg}, we will need to define another parameter modeling the (weighted) number of monomials in $\expansion{\circuit}$ to be `canceled' when it is modded with $\mathcal{B}$ (\Cref{def:mod-set-polys}):
\begin{Definition}[Parameter $\gamma$]\label{def:param-gamma}
Given an expression tree $\circuit$, define
\[\gamma(\circuit)=\frac{\sum_{(\monom, \coef)\in \expansion{\circuit}} \abs{\coef}\cdot \indicator{\monom\mod{\mathcal{B}}\equiv 0}}{\abs{\circuit}(1,\ldots, 1)}\]
@ -205,6 +172,8 @@ The number of samples is computed by (see \Cref{app:subsec-th-mon-samp}):
%\frac{\samplesize\error^2}{2}\geq \log{\frac{2}{\conf}}\\
\end{equation*}
To summarize, \approxq modifies \circuit with a call to \onepass. It then samples from \circuit $\numsamp$ times and uses that information to approximate $\rpoly$.
%We state the approximation algorithm in terms of a $\bi$.
%\subsubsection{Description}
%Algorithm ~\ref{alg:mon-sam} approximates $\rpoly$ using the following steps. First, a call to $\onepass$ on its input $\circuit$ produces a non-biased weight distribution over the monomials of $\expansion{\circuit}$ and a correct count of $|\circuit|(1,\ldots, 1)$, i.e., the number of monomials in $\expansion{\circuit}$. Next, ~\Cref{alg:mon-sam} calls $\sampmon$ to sample one monomial and its sign from $\expansion{\circuit}$. The sampling is repeated $\ceil{\frac{2\log{\frac{2}{\delta}}}{\epsilon^2}}$ times, where each of the samples are evaluated with input $\vct{p}$, multiplied by $1 \times sign$, and summed. The final result is scaled accordingly returning an estimate of $\rpoly$ with the claimed $(\error, \conf)$-bound of ~\Cref{lem:mon-samp}.
@ -264,7 +233,8 @@ The number of samples is computed by (see \Cref{app:subsec-th-mon-samp}):
%\State $\vari{i} \gets 1$
\For{$\vari{i} \in 1 \text{ to }\numsamp$}\label{alg:sampling-loop}\Comment{Perform the required number of samples}
%\State $\bivec \gets [0]^{\abs{\block}}$\Comment{$\bivec$ is an array whose size is the number of blocks, used to check for cross-terms}\newline
\State $(\vari{M}, \vari{sgn}_\vari{i}) \gets $ \sampmon($\circuit_\vari{mod}$)\label{alg:mon-sam-sample}\Comment{\sampmon \; is ~\Cref{alg:sample}}
\State $(\vari{M}, \vari{sgn}_\vari{i}) \gets $ \sampmon($\circuit_\vari{mod}$)\label{alg:mon-sam-sample}
\State\Comment{\sampmon \; is ~\Cref{alg:sample}}
%\For{$\vari{x}_\vari{\block,i}$ \text{ in } $\vari{M}$}
% \If{$\bivec[\block] = 1$}\label{alg:mon-sam-check}\Comment{If we have already had a variable from this block, $\rpoly$ drops the sample.}
% \newline
@ -340,7 +310,7 @@ we first state the lemmas that summarize the relevant properties of $\onepass$ a
\begin{Lemma}\label{lem:one-pass}
The $\onepass$ function completes in $O(size(\circuit))$ time. $\onepass$ guarantees two post conditions: First, for each subcircuit $\vari{S}$ of $\circuit$, we have that $\vari{S}.\vari{partial}$ is set to $\abs{\vari{S}}(1,\ldots, 1)$. Second, when $\vari{S}.\type = \circplus$, for each $\vari{input}$ of $\vari{S}$, $\vari{input}.\vari{weight}$ is set to $\frac{\abs{\vari{S}_{\vari{input}}}(1,\ldots, 1)}{\abs{\vari{S}}(1,\ldots, 1)}$. % is correctly computed for each child of $\vari{S}.$
The $\onepass$ function completes in $O(size(\circuit))$ time. $\onepass$ guarantees two post-conditions: First, for each subcircuit $\vari{S}$ of $\circuit$, we have that $\vari{S}.\vari{partial}$ is set to $\abs{\vari{S}}(1,\ldots, 1)$. Second, when $\vari{S}.\type = \circplus$, for each $\vari{input}$ of $\vari{S}$, $\vari{input}.\vari{weight}$ is set to $\frac{\abs{\vari{S}_{\vari{input}}}(1,\ldots, 1)}{\abs{\vari{S}}(1,\ldots, 1)}$. % is correctly computed for each child of $\vari{S}.$
\end{Lemma}
To prove correctness of~\Cref{alg:mon-sam}, we only use the following fact that follows from the above lemma: for the modified circuit $\circuit_{\vari{mod}}$, we have $\circuit_{\vari{mod}}.\vari{partial}=\abs{\circuit}(1,\dots,1)$.
%\AH{I'm wondering if there is a better notation to use here. I myself got confused by my own notation of $\circuit_{\vari{mod}}$. \emph{But}, we need to to be referencing the modified $\circuit$ returned by $\onepass$ in the algorithm, so maybe this is the best we can do?}
@ -400,7 +370,7 @@ It turns out that for proof of~\Cref{lem:sample}, we need to argue that when $\c
%\begin{align*}
%&\eval{\circuit~|~\circuit.\type = +}_{\wght} =&&\eval{\circuit_\lchild}_{\abs{\circuit}} + \eval{\circuit_\rchild}_{\abs{\circuit}}; \circuit_\lchild.\wght = \frac{\eval{\circuit_\lchild}_{\abs{\circuit}}}{\eval{\circuit_\lchild}_{\abs{\circuit}} + \eval{\circuit_\rchild}_{\abs{\circuit}}}; \circuit_\rchild.\wght = \frac{\eval{\circuit_\rchild}_{\abs{\circuit}}}{\eval{\circuit_\lchild}_{\abs{\circuit}} + \eval{\circuit_\rchild}_{\abs{\circuit}}}
%\end{align*}
\noindent \onepass\ (Algorithm ~\ref{alg:one-pass} in \Cref{sec:proofs-approx-alg}) essentially populates the \wght and \vpartial variables on each node with the definitions above. Lemma~\ref{lem:one-pass} is also proved in~\Cref{sec:proofs-approx-alg}.
\noindent \onepass\ (Algorithm~\ref{alg:one-pass-iter} in \Cref{sec:proofs-approx-alg}) iteratively visits each gate exactly once according to the topological ordering of \circuit, annotating the \lwght, \rwght, and \prt variables of each node according to the definitions above. Lemma~\ref{lem:one-pass} is also proved in~\Cref{sec:proofs-approx-alg}.
%\subsubsection{Psuedo Code}

View file

@ -379,7 +379,7 @@ The number of triangles in $\graph{\ell}$ for $\ell \geq 2$ will always be $0$ f
\section{Missing Details from Section~\ref{sec:algo}}\label{sec:proofs-approx-alg}
Before proving~\Cref{lem:mon-samp}, we use it to argue our main result,~\Cref{lem:approx-alg}:
\subsection{Proof of Theorem \ref{lem:approx-alg}}
\subsection{Proof of Theorem \ref{lem:approx-alg}}\label{sec:proof-lem-approx-alg}
Set $\mathcal{E}=\approxq(\revision{\circuit}, (\prob_1,\dots,\prob_\numvar),$ $\conf, \error')$, where
\[\error' = \error \cdot \frac{\rpoly(\prob_1,\ldots, \prob_\numvar)\cdot (1 - \gamma)}{\abs{\revision{\circuit}}(1,\ldots, 1)},\]
@ -632,7 +632,7 @@ level 2/.style={sibling distance=0.7cm},
\revision{
\subsection{Proof of ~\Cref{lem:one-pass} Iterative}
\subsection{Proof of~\Cref{lem:one-pass} Iterative}\label{sec:proof-one-pass}
\paragraph{\onepass Correctness}
We first note that all DAGs have a topological ordering. By definition, a topological ordering will always order the source nodes first, since if there exists an edge $(u, v)$ in the set of edges $E$, then gate $u$ will be ordered before gate $v$, since the edge goes from $u$ to $v$. Therefore \onepass will visit all source (leaf) nodes first, correctly annotating the \prt values. Recall that all internal gates are either $\circplus$ or $\circmult$. The parent gates of the source gates will be visited next in \topord(\circuit). If a parent node \subcircuit is $\circplus$, \onepass will add the values of the two source nodes $\subcircuit_\linput.\prt + \subcircuit_\rinput.\prt$ to correctly annotate \subcircuit.\prt. Further \subcircuit.\lwght will be computed correctly as $\frac{\subcircuit_\linput.\prt}{\subcircuit.\prt}$ and analogously for \subcircuit.\rwght. If the parent gate \subcircuit visited is a $\circmult$, then \onepass will correctly annotate \subcircuit.\prt as $\subcircuit_\linput.\prt \times \subcircuit_\rinput.\prt$. As gates further in the order are subsequently visited, note that it is the case that all previous gates visited will always contain the correct \prt values and it follows that all subsequent gates visited in \topord(\circuit) will correctly compute their respective \prt values. Since \lwght and \rwght are computed dependent only on \prt values, it follows that all \lwght and \rwght values will then be computed correctly.
@ -649,7 +649,7 @@ The efficiency gains of circuits over trees is found in the capability of circui
}
\subsection{Proof of~\Cref{lem:sample}}
\subsection{Proof of~\Cref{lem:sample}}\label{sec:proof-sample-monom}
We first need to show that $\sampmon$ indeed returns a monomial $\monom$,\footnote{Technically it returns $\var(\monom)$ but for less cumbersome notation we will refer to $\var(\monom)$ simply by $\monom$ in this proof.} such that $(\monom, \coef)$ is in $\expansion{\circuit}$, which we do by induction on the depth of $\circuit$.
For the base case, let the depth $d$ of $\circuit$ be $0$. We have that the root node is either a constant $\coef$ for which by line~\ref{alg:sample-num-return} we return $\{~\}$, or we have that $\circuit.\type = \var$ and $\circuit.\val = x$, and by line~\ref{alg:sample-var-return} we return $\{x\}$. Both cases sample a monomial%satisfy ~\cref{def:monomial}

View file

@ -109,14 +109,82 @@ We represent query polynomials via {\em arithmetic circuits}~\cite{arith-complex
\begin{Definition}[Circuit]\label{def:circuit}
A circuit $\circuit$ is a Directed Acyclic Graph (DAG) whose source nodes (in degree of $0$) consist of elements in either $\reals$ or $\vct{X}$. The internal and sink nodes of $\circuit$ have binary input and are either sum ($\circplus$) or product ($\circmult$) gates.
Circuit $\circuit$ additionally has the following members: \type, \val, \vari{partial}, \vari{input}, and \vari{Lweight}, \vari{Rweight}, where \type is the type of value stored in the node $\circuit$ (i.e. one of $\{\circplus, \circmult, \var, \tnum\}$, \val is the value stored, and \vari{input} is the list of \circuit 's inputs where $\circuit_\linput$ is the left input and $\circuit_\rinput$ the right input. When the underlying DAG is a tree, we will refer to the structure as an expression tree.
$\circuit$ additionally has the following members: \type, \val, \vari{partial}, \vari{input}, and \vari{Lweight}, \vari{Rweight}, where \type is the type of value stored in the node $\circuit$ (i.e., one of $\{\circplus, \circmult, \var, \tnum\}$), \val is the value stored (a constant or variable), and \vari{input} is the list of \circuit 's inputs where $\circuit_\linput$ is the left input and $\circuit_\rinput$ the right input. When the underlying DAG is a tree (with edges pointing away from the root), we will refer to the structure as an expression tree \etree. Note that in such a case, the root of \etree is analogous to the sink of the \circuit.
\end{Definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
As stated in~\Cref{def:circuit}, every internal node has at most two in-edges, is labeled as an addition or a multiplication node, and has no limit on its outdegree.
Note that if we limit the outdegree to one, then we get expression trees.
Note that if we limit the outdegree to one, then we get expression trees.
\begin{Example}
The circuit \circuit in~\Cref{fig:circuit-express-tree} encodes the polynomial $XY + WZ$. Note that such an encoding lends itself naturally to having all gates with an outdegree of $1$. Note further that \circuit is indeed a tree with edges pointing away from the root.
\end{Example}
\begin{figure}[t]
\begin{tikzpicture}[thick]
\node[tree_node] (a1) at (0, 0){$\boldsymbol{X}$};
\node[tree_node] (b1) at (1, 0){$\boldsymbol{Y}$};
\node[tree_node] (c1) at (2, 0){$\boldsymbol{W}$};
\node[tree_node] (d1) at (3, 0){$\boldsymbol{Z}$};
\node[tree_node] (a2) at (0.5, 1){$\boldsymbol{\circmult}$};
\node[tree_node] (b2) at (2.5, 1){$\boldsymbol{\circmult}$};
\node[tree_node] (a3) at (1.5, 2){$\boldsymbol{\circplus}$};
\draw[->] (a1) -- (a2);
\draw[->] (b1) -- (a2);
\draw[->] (c1) -- (b2);
\draw[->] (d1) -- (b2);
\draw[->] (a2) -- (a3);
\draw[->] (b2) -- (a3);
\end{tikzpicture}
\caption{Circuit encoding $XY + WZ$, a special case of an expression tree}
\label{fig:circuit-express-tree}
\end{figure}
%\begin{figure}[t]
%
%\resizebox{0.65\columnwidth}{!}{
%\begin{tikzpicture}[thick, level distance=0.9cm,level 1/.style={sibling distance=3.55cm}, level 2/.style={sibling distance=1.8cm}, level 3/.style={sibling distance=0.8cm}]% level/.style={sibling distance=6cm/(#1 * 1.5)}]
% \node[tree_node](root){$\boldsymbol{\times}$}
% child{node[tree_node]{$\boldsymbol{+}$}
% child{node[tree_node]{x}
% %child[missing]{node[tree_node]{}}
% %child{node[tree_node]{x}}
% }
% child{node[tree_node]{$\boldsymbol{\times}$}
% child{node[tree_node]{2}}
% child{node[tree_node]{y}}
% }
% }
% child{node[highlight_treenode] (TR) {$\boldsymbol{+}$}
% child{node[tree_node]{$\boldsymbol{\times}$}
% child{node[tree_node]{2}}
% child{node[tree_node]{x}}
% }
% child{node[tree_node]{$\boldsymbol{\times}$}
% child{node[tree_node] (neg-leaf) {-1}}
% child{node[tree_node]{y}}
% }
% %child[sibling distance= 0cm, grow=north east, red]{node[tree_node]{$\circuit_\rchild$}}
% };
%% \node[below=2pt of neg-leaf, inner sep=1pt, blue] (neg-comment) {\textbf{Negation pushed to leaf nodes}};
%% \draw[<-|, blue] (neg-leaf) -- (neg-comment);
% \node[above right=0.7cm of TR, highlight_color, inner sep=0pt, font=\bfseries] (tr-label) {$\circuit_\rinput$};
% \node[above right=0.7cm of root, highlight_color, inner sep=0pt, font=\bfseries] (t-label) {$\circuit$};
% \draw[<-|, highlight_color] (TR) -- (tr-label);
% \draw[<-|, highlight_color] (root) -- (t-label);
%\end{tikzpicture}
%}
%\vspace*{-2mm}
%\caption{Expression tree $\circuit$ for the product $\boldsymbol{(x + 2y)(2x - y)}$.}
%\label{fig:expr-tree-T}
%\trimfigurespacing
%\end{figure}
We ignore the remaining fields (\vari{partial}, \vari{Lweight}, and \vari{Rweight}) until \Cref{sec:algo}.
}
@ -135,7 +203,44 @@ Denote \revision{$\polyf(\circuit)$}~ to be the function from circuit \revision{
\end{equation*}
\end{Definition}
Note that $\circuit$ need not encode an expression in standard monomial basis, while as stated previously a polynomial is considered to be in SMB, and the output of \polyf($\cdot$) is therefore in SMB. For instance, $\circuit$ could represent a compressed form of the running example, such as $(X + 2Y)(2X - Y)$.
Note that $\circuit$ need not encode an expression in standard monomial basis, while as stated previously a polynomial is considered to be in SMB, and the output of \polyf($\cdot$) is therefore in SMB. For instance, $\circuit$ could represent a compressed form of the running example, such as $(X + 2Y)(2X - Y)$\revision{%
, as shown in \Cref{fig:circuit}.
\begin{figure}[t]
\begin{tikzpicture}[thick]
\node[tree_node] (a1) at (0, 0) {$\boldsymbol{X}$};
\node[tree_node] (b1) at (1.5, 0) {$\boldsymbol{2}$};
\node[tree_node] (c1) at (3, 0) {$\boldsymbol{Y}$};
\node[tree_node] (d1) at (4.5, 0) {$\boldsymbol{-1}$};
\node[tree_node] (a2) at (0.75, 1) {$\boldsymbol{\circmult}$};
\node[tree_node] (b2) at (2.25, 1) {$\boldsymbol{\circmult}$};
\node[tree_node] (c2) at (3.75, 1) {$\boldsymbol{\circmult}$};
\node[tree_node] (a3) at (0.55, 2) {$\boldsymbol{\circplus}$};
\node[tree_node] (b3) at (3.75, 2) {$\boldsymbol{\circplus}$};
\node[tree_node] (a4) at (2.25, 3) {$\boldsymbol{\circmult}$};
\draw[->] (a1) -- (a2);
\draw[->, thick] (a1) -- (a3);
\draw[->] (b1) -- (a2);
\draw[->] (b1) -- (b2);
\draw[->] (c1) -- (c2);
\draw[->] (c1) -- (b2);
\draw[->] (d1) -- (c2);
\draw[->] (b2) -- (a3);
\draw[->] (a2) -- (b3);
\draw[->] (c2) -- (b3);
\draw[->] (a3) -- (a4);
\draw[->] (b3) -- (a4);
\draw[->] (a4) -- (2.25, 3.5);
\end{tikzpicture}
\caption{Circuit encoding of the formula $(X + 2Y)(2X - Y)$.}
\label{fig:circuit}
\end{figure}
}
\oldstuff{
\begin{Definition}[Expression Tree Set]\label{def:express-tree-set}$\etreeset{\smb}$ is the set of all possible expression trees $\etree$, such that $poly(\etree) = \poly(\vct{X})$.