Finished pass on S4.

2021-09-10 11:49:29 -04:00 · 2021-09-10 11:49:29 -04:00 · d087ce7fbb
parent 615ec842d3
commit d087ce7fbb
3 changed files with 35 additions and 33 deletions
--- a/app_approx-alg-analysis.tex
+++ b/app_approx-alg-analysis.tex
@ -3,7 +3,7 @@
 %\input{app_approx-alg-pseudo-code}

 \subsection{Proof of Theorem \ref{lem:approx-alg}}\label{sec:proof-lem-approx-alg}
-
+\input{app_approx-alg-pseudo-code}
 In order to prove \Cref{lem:approx-alg}, we will need to argue the correctness of \approxq, which relies on the correctness of auxiliary algorithms \onepass and \sampmon.

 \begin{Lemma}\label{lem:one-pass}
--- a/approx_alg.tex
+++ b/approx_alg.tex
@ -3,9 +3,10 @@

 \section{$1 \pm \epsilon$ Approximation Algorithm}\label{sec:algo}

-In \Cref{sec:hard}, we showed that computing the expected multiplicity of a compressed lineage polynomial for \ti (even just based on project-join queries), and by extension \bi (or more general \abbrPDB models) %any $\semNX$-PDB) 
-is unlikely to be possible in linear time (\Cref{thm:mult-p-hard-result}), even if all tuples have the same probability  (\Cref{th:single-p-hard}).
-Given this, we now design an approximation algorithm for our problem that runs in {\em linear time}.\footnote{For a very broad class of circuits: please see the discussion after \Cref{lem:val-ub} for more.}
+In \Cref{sec:hard}, we showed that the answer to \Cref{prob:intro-stmt} is no.
+%computing the expected multiplicity of a compressed lineage polynomial for \ti (even just based on project-join queries), and by extension \bi (or more general \abbrPDB models) %any $\semNX$-PDB) 
+%is unlikely to be possible in linear time (\Cref{thm:mult-p-hard-result}), even if all tuples have the same probability  (\Cref{th:single-p-hard}).
+With this result, we now design an approximation algorithm for our problem that runs in {\em linear time}.\footnote{For a very broad class of circuits: please see the discussion after \Cref{lem:val-ub} for more.}
 The following approximation algorithm applies to \bi, though our bounds are more meaningful for a non-trivial subclass of \bis that contains both \tis, as well as the PDBench benchmark~\cite{pdbench}.  As before, all proofs and pseudocode can be found in \Cref{sec:proofs-approx-alg}.
 %it is then desirable to have an algorithm to approximate the multiplicity in linear time, which is what we describe next.

@ -23,9 +24,8 @@ We now introduce useful definitions and notation related to circuits and polynom


 \begin{Definition}[$\expansion{\circuit}$]\label{def:expand-circuit}
-For a circuit $\circuit$, we define $\expansion{\circuit}$ as a list of tuples $(\monom, \coef)$, where $\monom$ is a set of variables and $\coef \in \domN$.  We will denote the monomial composed of the variables in $\monom$ as $\encMon$.
+For a circuit $\circuit$, we define $\expansion{\circuit}$ as a list of tuples $(\monom, \coef)$, where $\monom$ is a set of variables and $\coef \in \domN$.  
 $\expansion{\circuit}$ has the following recursive definition ($\circ$ is list concatenation).
-
 $\expansion{\circuit} =
 \begin{cases}
 					\expansion{\circuit_\linput} \circ \expansion{\circuit_\rinput}		&\textbf{ if }\circuit.\type = \circplus\\
@ -35,12 +35,13 @@ $\expansion{\circuit} =
 \end{cases}
 $
 \end{Definition}
-Consider $\circuit$ illustrated in \Cref{fig:circuit}.  $\expansion{\circuit}$ is then $[(X, 2), (XY, -1), (XY, 4), (Y, -2)]$.
+Later on, we will denote the monomial composed of the variables in $\monom$ as $\encMon$.  As an example of $\expansion{\circuit}$, consider $\circuit$ illustrated in \Cref{fig:circuit}.  $\expansion{\circuit}$ is then $[(X, 2), (XY, -1), (XY, 4), (Y, -2)]$.

-\begin{Definition}[$\abs{\circuit}(\vct{X})$]\label{def:positive-circuit}
+\begin{Definition}[$\abs{\circuit}$]\label{def:positive-circuit}
 For any circuit $\circuit$, the corresponding
 {\em positive circuit}, denoted $\abs{\circuit}$, is obtained from $\circuit$ as follows. For each leaf node $\ell$ of $\circuit$ where $\ell.\type$ is $\tnum$, update $\ell.\vari{value}$ to $|\ell.\vari{value}|$.
 \end{Definition}
+We will overload notation and use $\abs{\circuit}\inparen{\vct{X}}$ to mean $\polyf\inparen{\abs{\circuit}}$.
 Conveniently, $\abs{\circuit}\inparen{1,\ldots,1}$ gives us the number of terms represented in $\expansion{\circuit}$, i.e. $\sum\limits_{\inparen{\monom, \coef} \in \expansion{\circuit}}\abs{\coef}$.

 \begin{Definition}[\size($\cdot$), \depth$\inparen{\cdot}$]\label{def:size-depth}
@ -56,30 +57,30 @@ The functions \size and \depth output the number of gates and levels respectivel
 %NEEDS to be moved to appendix
 %%%%%%%%%%%%%%%%%%%%%%%%%

-%\begin{Definition}[$\degree(\cdot)$]\label{def:degree}\footnote{Note that the degree of $\polyf(\abs{\circuit})$ is always upper bounded by $\degree(\circuit)$ and the latter can be strictly larger (e.g. consider the case when $\circuit$ multiplies two copies of the constant $1$-- here we have $\deg(\circuit)=1$ but degree of $\polyf(\abs{\circuit})$ is $0$).}
-%$\degree(\circuit)$ is defined recursively as follows:
-%\[\degree(\circuit)=
-%\begin{cases}
-%\max(\degree(\circuit_\linput),\degree(\circuit_\rinput)) & \text{ if }\circuit.\type=+\\
-%\degree(\circuit_\linput) + \degree(\circuit_\rinput)+1 &\text{ if }\circuit.\type=\times\\
-%1 & \text{ if }\circuit.\type = \var\\
-%0 & \text{otherwise}.
-%\end{cases}
-%\]
-%\end{Definition}
+\begin{Definition}[$\degree(\cdot)$]\label{def:degree}\footnote{Note that the degree of $\polyf(\abs{\circuit})$ is always upper bounded by $\degree(\circuit)$ and the latter can be strictly larger (e.g., consider the case when $\circuit$ multiplies two copies of the constant $1$---here we have $\degree(\circuit)=1$ but the degree of $\polyf(\abs{\circuit})$ is $0$).}
+$\degree(\circuit)$ is defined recursively as follows:
+\[\degree(\circuit)=
+\begin{cases}
+\max(\degree(\circuit_\linput),\degree(\circuit_\rinput)) & \text{ if }\circuit.\type=+\\
+\degree(\circuit_\linput) + \degree(\circuit_\rinput)+1 &\text{ if }\circuit.\type=\times\\
+1 & \text{ if }\circuit.\type = \var\\
+0 & \text{otherwise}.
+\end{cases}
+\]
+\end{Definition}
 %%%%%%%%%%%%%%%%%%%%%%%%%%
 %END move to appendix
 %%%%%%%%%%%%%%%%%%%%%%%%%%

-Finally, we will need the following notation for the complexity of multiplying large integers:
+Finally, we use the following notation for the complexity of multiplying integers:
 \begin{Definition}[$\multc{\cdot}{\cdot}$]\footnote{We note that when doing arithmetic operations on the RAM model for input of size $N$, we have that $\multc{O(\log{N})}{O(\log{N})}=O(1)$. More generally we have $\multc{N}{O(\log{N})}=O(N\log{N}\log\log{N})$.}
 In a RAM model with a word size of $W$ bits, $\multc{M}{W}$ denotes the complexity of multiplying two integers represented with $M$ bits. (We will assume that for input of size $N$, $W=O(\log{N})$.)
 \end{Definition}

 \subsection{Our main result}
-\AH{Verify that the proof for \cref{lem:approx-alg} doesn't rely on properties of $\raPlus$ or \abbrBIDB.}
+\AH{Verify that the proof for \Cref{lem:approx-alg} doesn't rely on properties of $\raPlus$ or \abbrBIDB.}
 \begin{Theorem}\label{lem:approx-alg}
-Let \circuit be an arbitrary arithmetic circuit %for a UCQ over \bi 
+Let \circuit be an arbitrary arithmetic circuit from a \abbrBIDB %for a UCQ over \bi 
 and define $\poly(\vct{X})=\polyf(\circuit)$ and let $k=\degree(\circuit)$.
 Then an estimate $\mathcal{E}$ of $\rpoly(\prob_1,\ldots, \prob_\numvar)$ can be computed in time
 {\small
@ -94,10 +95,10 @@ such that

 To get linear runtime results from \Cref{lem:approx-alg}, we will need to define another parameter modeling the (weighted) number of monomials in %$\poly\inparen{\vct{X}}$ 
 $\expansion{\circuit}$ 
-to be `canceled' monomials with dependent variables are removed (\cref{def:reduced-bi-poly}).  %def:hen it is modded with $\mathcal{B}$ (\Cref{def:mod-set-polys}).
-Let $\isInd{\cdot}$ be a boolean function returning true if monomial $\encMon$ is composed of independent variables and false otherwise.
+to be `canceled' when monomials with dependent variables are removed (\Cref{def:reduced-bi-poly}).  %def:hen it is modded with $\mathcal{B}$ (\Cref{def:mod-set-polys}).
+Let $\isInd{\cdot}$ be a boolean function returning true if monomial $\encMon$ is composed of independent variables and false otherwise; further, let $\indicator{\theta}$ also be a boolean function returning true if $\theta$ evaluates to true and false otherwise.
 \begin{Definition}[Parameter $\gamma$]\label{def:param-gamma}
-Given an expression tree $\circuit$, define
+Given a circuit $\circuit$ from a \abbrBIDB, define
 \AH{Technically, $\monom$ is a set of variables rather than a monomial.  Perhaps we don't need the $\var(\cdot)$ function and can replace it with a function that returns the monomial represented by a set of variables.  FIXED: need to propagate this to the appendix ($\encMon$)}
 \AH{To add, this is an issue on line 1073, 1117 of app C.}
 \[\gamma(\circuit)=\frac{\sum_{(\monom, \coef)\in \expansion{\circuit}} \abs{\coef}\cdot \indicator{\neg\isInd{\encMon}} }%\encMon\mod{\mathcal{B}}\equiv 0}}
@ -107,7 +108,7 @@ Given an expression tree $\circuit$, define
 \noindent We next present a few corollaries of \Cref{lem:approx-alg}.
 \begin{Corollary}
 \label{cor:approx-algo-const-p}
-Let $\poly(\vct{X})$ be as in \Cref{lem:approx-alg} and let $\gamma=\gamma(\circuit)$. Further let it be the case that $\prob_i\ge \prob_0$ for all $i\in[\numvar]$. Then an estimate $\mathcal{E}$  of $\rpoly(\prob_1,\ldots, \prob_\numvar)$ satisfying \Cref{eq:approx-algo-bound} can be computed in time
+Let $\poly(\vct{X})$ be as in \Cref{lem:approx-alg} and let $\gamma=\gamma(\circuit)$ for \abbrBIDB circuit \circuit. Further let it be the case that $\prob_i\ge \prob_0$ for all $i\in[\numvar]$. Then an estimate $\mathcal{E}$  of $\rpoly(\prob_1,\ldots, \prob_\numvar)$ satisfying \Cref{eq:approx-algo-bound} can be computed in time
 \[O\left(\left(\size(\circuit) + \frac{\log{\frac{1}{\conf}}\cdot k\cdot \log{k} \cdot \depth(\circuit)}{\inparen{\error'}^2\cdot(1-\gamma)^2\cdot \prob_0^{2k}}\right)\cdot\multc{\log\left(\abs{\circuit}(1,\ldots, 1)\right)}{\log\left(\size(\circuit)\right)}\right)\]
 In particular, if $\prob_0>0$ and $\gamma<1$ are absolute constants then the above runtime simplifies to $O_k\left(\left(\frac 1{\inparen{\error'}^2}\cdot\size(\circuit)\cdot \log{\frac{1}{\conf}}\right)\cdot\multc{\log\left(\abs{\circuit}(1,\ldots, 1)\right)}{\log\left(\size(\circuit)\right)}\right)$.
 \end{Corollary}
@ -117,30 +118,31 @@ The restriction on $\gamma$ is satisfied by any \ti (where $\gamma=0$) as well a
 Finally, we address the $\multc{\log\left(\abs{\circuit}(1,\ldots, 1)\right)}{\log\left(\size(\circuit)\right)}$ term in the runtime. %In \Cref{susec:proof-val-up}, we show the following:
 \begin{Lemma}
 \label{lem:val-ub}
-For any circuit $\circuit$ with $\degree(\circuit)=k$, we have
+For any \abbrBIDB circuit $\circuit$ with $\degree(\circuit)=k$, we have
 $\abs{\circuit}(1,\ldots, 1)\le 2^{2^k\cdot \size(\circuit)}.$
 Further, under either of the following conditions:
 \begin{enumerate}
 \item $\circuit$ is a tree,
-\item $\circuit$ encodes the run of the algorithm in~\cite{DBLP:conf/pods/KhamisNR16} on an FAQ\AH{citation would help here, as a reviewer complaint on this was ``What is FAQ?'', though we do cite (I think) in the appendix.} query,
+\item $\circuit$ encodes the run of the algorithm in~\cite{DBLP:conf/pods/KhamisNR16} on an FAQ\AH{AJAR citation.} query,
 \end{enumerate}
 we have $\abs{\circuit}(1,\ldots, 1)\le  \size(\circuit)^{O(k)}.$
 \end{Lemma}

 Note that the above implies that, under the assumption from \Cref{cor:approx-algo-const-p} that $\prob_0>0$ and $\gamma<1$ are absolute constants, the runtime there simplifies to $O_k\left(\frac 1{\inparen{\error'}^2}\cdot\size(\circuit)^2\cdot \log{\frac{1}{\conf}}\right)$ for general circuits $\circuit$ and to $O_k\left(\frac 1{\inparen{\error'}^2}\cdot\size(\circuit)\cdot \log{\frac{1}{\conf}}\right)$ for the case when $\circuit$ satisfies the specific conditions in \Cref{lem:val-ub}. In \Cref{app:proof-lem-val-ub} we argue that these conditions are very general and encompass many interesting scenarios, including query evaluation under \raPlus or FAQ.
+\AH{AJAR reference.}

 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \subsection{Approximating $\rpoly$}
-We prove \Cref{lem:approx-alg} by developing an approximation algorithm (\approxq detailed in \Cref{alg:mon-sam}) with the desired runtime. This algorithm is based on the following observation.
+We prove \Cref{lem:approx-alg} by developing an approximation algorithm (\approxq pseudo code in \Cref{sec:proof-lem-approx-alg}) with the desired runtime. This algorithm is based on the following observation.
 % The algorithm (\approxq detailed in \Cref{alg:mon-sam}) to prove \Cref{lem:approx-alg} follows from the following observation.
-Given a query polynomial $\poly(\vct{X})=\polyf(\circuit)$ for circuit \circuit over $\bi$, we have: % can exactly represent $\rpoly(\vct{X})$ as follows:
+Given a lineage polynomial $\poly(\vct{X})=\polyf(\circuit)$ for circuit \circuit over $\bi$, we have: % can exactly represent $\rpoly(\vct{X})$ as follows:

 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \begin{equation}
 \label{eq:tilde-Q-bi}
 \rpoly\inparen{X_1,\dots,X_\numvar}=\hspace*{-1mm}\sum_{(\monom,\coef)\in \expansion{\circuit}} %\hspace*{-2mm}
 \indicator{\isInd{\encMon}%\mod{\mathcal{B}}\not\equiv 0
-}\cdot \coef\cdot\hspace*{-2mm}\prod_{X_i\in \monom}\hspace*{-2mm} X_i
+}\cdot \coef\cdot\hspace*{-2mm}\prod_{X_i\in \monom}\hspace*{-2mm} X_i.
 \end{equation}
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

@ -156,7 +158,7 @@ Given a query polynomial $\poly(\vct{X})=\polyf(\circuit)$ for circuit \circuit

 Given the above, the algorithm is a sampling based algorithm for the above sum: we sample (via \sampmon) $(\monom,\coef)\in \expansion{\circuit}$ with probability proportional %\footnote{We could have also uniformly sampled from $\expansion{\circuit}$ but this gives better parameters.}
 to $\abs{\coef}$ and compute $\vari{Y}=\indicator{\isInd{\encMon}}%\monom\mod{\mathcal{B}}\not\equiv 0}
- \cdot \prod_{X_i\in \monom} p_i$. Taking $\numsamp$ samples and computing the average of $\vari{Y}$ gives us our final estimate. \onepass is used to compute the sampling probabilities needed in \sampmon (details are in \Cref{sec:proofs-approx-alg}).
+ \cdot \prod_{X_i\in \monom} p_i$. Taking $\ceil{\frac{2 \log{\frac{2}{\conf}}}{\error^2}}$ samples and computing the average of $\vari{Y}$ gives us our final estimate. \onepass is used to compute the sampling probabilities needed in \sampmon (details are in \Cref{sec:proofs-approx-alg}).
 %\approxq (\Cref{alg:mon-sam}) modifies \circuit with a call to \onepass.  It then samples from $\circuit_{\vari{mod}}\numsamp$ times and uses that information to approximate $\rpoly$.


--- a/macros.tex
+++ b/macros.tex
@ -237,7 +237,7 @@
 \newcommand{\mtrix}[1]{M_{#1}}
 \newcommand{\dtrm}[1]{Det\left(#1\right)}
 \newcommand{\tuple}[1]{\left<#1\right>}
-\newcommand{\indicator}[1]{\underset{#1}{\onesymbol}}
+\newcommand{\indicator}[1]{\onesymbol_{#1}}
 %----------------------------------------------

 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%