From f20577deadc0e8cb2f81c7178355b0b0ef537444 Mon Sep 17 00:00:00 2001 From: Aaron Huber Date: Mon, 7 Dec 2020 15:12:39 -0500 Subject: [PATCH 01/17] Finished restructuring mult p and single p arguments. --- approx_alg.tex | 3 +- lin_sys.tex | 120 +++++++++++++++++++++++++++----------------- macros.tex | 3 +- mult_distinct_p.tex | 91 ++++++++++++--------------------- poly-form.tex | 4 +- single_p.tex | 49 +++++++++++++----- 6 files changed, 150 insertions(+), 120 deletions(-) diff --git a/approx_alg.tex b/approx_alg.tex index e78c50d..b9ba4a7 100644 --- a/approx_alg.tex +++ b/approx_alg.tex @@ -99,7 +99,8 @@ $\expandtree{\etree}$ is the pure sum of products expansion of $\etree$. The lo % &\etree.\type = \var \mapsto&& \elist{(\etree.\val, 1)} % \end{align*} \begin{align*} -\expandtree{\etree} = \begin{cases} +&\expandtree{\etree} = \\ +&\begin{cases} \expandtree{\etree_\lchild} \circ \expandtree{\etree_\rchild} &\textbf{ if }\etree.\type = +\\ \left\{(\monom_\lchild \cup \monom_\rchild, \coef_\lchild \cdot \coef_\rchild) ~|~ (\monom_\lchild, \coef_\lchild) \in \expandtree{\etree_\lchild}, (\monom_\rchild, \coef_\rchild) \in \expandtree{\etree_\rchild}\right\} &\textbf{ if }\etree.\type = \times\\ \elist{(\emptyset, \etree.\val)} &\textbf{ if }\etree.\type = \tnum\\ diff --git a/lin_sys.tex b/lin_sys.tex index 80ebf13..6ce3535 100644 --- a/lin_sys.tex +++ b/lin_sys.tex @@ -14,34 +14,46 @@ As previously outlined, assume graph $\graph{1}$ to be an arbitrary graph, with \subsubsection{$\graph{2}$} -Let us call the linear equation for graph $\graph{2}$ $\linsys{2}$. Using the hard to compute terms of the RHS in ~\cref{eq:LS-subtract}, let us consider the RHS, +Let us call the linear equation for graph $\graph{2}$ $\linsys{2}$. 
Using the hard to compute terms of the RHS in ~\cref{lem:qE3-exp}, let us consider the RHS, \begin{align} & \numocc{\graph{2}}{\tri} + \numocc{\graph{2}}{\threepath}\prob - \numocc{\graph{2}}{\threedis}\left(3\prob^2 - \prob^3\right)\nonumber\\ = &\numocc{\graph{2}}{\threepath}\prob - \numocc{\graph{2}}{\threedis}\left(3\prob^2 - \prob^3\right)\label{eq:ls-2-1}\\ -= &2 \cdot \numocc{\graph{1}}{\twopath}\prob - \pbrace{8 \cdot \numocc{\graph{1}}{\threedis} + 6 \cdot \numocc{\graph{1}}{\twopathdis} + 4 \cdot \numocc{\graph{1}}{\oneint} + 4 \cdot \numocc{\graph{1}}{\threepath} + 2 \cdot \numocc{\graph{1}}{\tri}}\left(3\prob^2 - \prob^3\right)\label{eq:ls-2-2}\\ -= &\left(-2\cdot\numocc{\graph{1}}{\tri} - 4\cdot\numocc{\graph{1}}{\threepath} - 8\cdot\numocc{\graph{1}}{\threedis} - 6\cdot\numocc{\graph{1}}{\twopathdis}\right)\cdot\left(3\prob^2 - p^3\right) + 2\cdot\numocc{\graph{1}}{\twopath}\prob - 4\cdot\numocc{\graph{1}}{\oneint}\cdot\left(3\prob^2 - \prob^3\right).\label{eq:ls-2-3} += &2 \cdot \numocc{\graph{1}}{\twopath}\prob - \left(8 \cdot \numocc{\graph{1}}{\threedis} + 6 \cdot \numocc{\graph{1}}{\twopathdis} \right.\nonumber\\ +&\left.+ 4 \cdot \numocc{\graph{1}}{\oneint} + 4 \cdot \numocc{\graph{1}}{\threepath} + 2 \cdot \numocc{\graph{1}}{\tri}\right)\left(3\prob^2 - \prob^3\right)\label{eq:ls-2-2}\\ += &\left(-2\cdot\numocc{\graph{1}}{\tri} - 4\cdot\numocc{\graph{1}}{\threepath} - 8\cdot\numocc{\graph{1}}{\threedis}\right.\nonumber\\ +&\left.- 6\cdot\numocc{\graph{1}}{\twopathdis}\right)\cdot\left(3\prob^2 - p^3\right) + 2\cdot\numocc{\graph{1}}{\twopath}\prob\nonumber \\ +&- 4\cdot\numocc{\graph{1}}{\oneint}\cdot\left(3\prob^2 - \prob^3\right).\label{eq:ls-2-3} \end{align} %define $\linsys{2} = \numocc{\graph{2}}{\tri} + \numocc{\graph{2}}{\threepath}\prob - \numocc{\graph{2}}{\threedis}\left(3\prob^2 - \prob^3\right)$. 
By \cref{claim:four-two} we can compute $\linsys{2}$ in $O(T(\numedge) + \numedge)$ time with $\numedge = |E_2|$, and more generally, $\numedge = |E_k|$ for a graph $\graph{k}$. Equation ~\ref{eq:ls-2-1} follows by \cref{lem:tri}. Similarly ~\cref{eq:ls-2-2} follows by both \cref{lem:3m-G2} and \cref{lem:3p-G2}. Finally, ~\cref{eq:ls-2-3} follows by a simple rearrangement of terms. -Now, by simple algebraic manipulations of ~\cref{eq:LS-subtract}, we deduce, +Now, by simple algebraic manipulations of ~\cref{lem:qE3-exp}, we deduce, \begin{align} -&\frac{\rpoly_{\graph{2}}(\prob,\ldots, \prob)}{6\prob^3} - \frac{\numocc{\graph{2}}{\ed}}{6\prob} - \numocc{\graph{2}}{\twopath} - \numocc{\graph{2}}{\twodis}\prob - \numocc{\graph{2}}{\oneint}\prob - \big(\numocc{\graph{2}}{\twopathdis} + 3\numocc{\graph{2}}{\threedis}\big)\prob^2\nonumber\\ -&\qquad\qquad =\left(-2\cdot\numocc{\graph{1}}{\tri} - 4\cdot\numocc{\graph{1}}{\threepath} - 8\cdot\numocc{\graph{1}}{\threedis} - 6\cdot\numocc{\graph{1}}{\twopathdis}\right)\cdot\left(3\prob^2 - p^3\right) + 2\cdot\numocc{\graph{1}}{\twopath}\prob - 4\cdot\numocc{\graph{1}}{\oneint}\cdot\left(3\prob^2 - \prob^3\right)\label{eq:lem3-G2-1}\\ -&\frac{\rpoly_{\graph{2}}(\prob,\ldots, \prob)}{6\prob^3} - \frac{\numocc{\graph{2}}{\ed}}{6\prob} - \numocc{\graph{2}}{\twopath} - \numocc{\graph{2}}{\twodis}\prob - \numocc{\graph{2}}{\oneint}\prob - \big(\numocc{\graph{2}}{\twopathdis} + 3\numocc{\graph{2}}{\threedis}\big)\prob^2 - 2\cdot\numocc{\graph{1}}{\twopath}\prob\nonumber\\ -&\qquad + 4\cdot\numocc{\graph{1}}{\oneint}\left(3\prob^2 - \prob^3\right)\nonumber\\ -&\qquad\qquad =\left(-2\cdot\numocc{\graph{1}}{\tri} - 4\cdot\numocc{\graph{1}}{\threepath} - 8\cdot\numocc{\graph{1}}{\threedis} - 6\cdot\numocc{\graph{1}}{\twopathdis}\right)\cdot\left(3\prob^2 - p^3\right)\label{eq:lem3-G2-2}\\ -&\frac{\rpoly_{\graph{2}}(\prob,\ldots, \prob)}{6\prob^3} - \frac{\numocc{\graph{2}}{\ed}}{6\prob} - \numocc{\graph{2}}{\twopath} - 
\numocc{\graph{2}}{\twodis}\prob - \numocc{\graph{2}}{\oneint}\prob - \big(\numocc{\graph{2}}{\twopathdis} + 3\numocc{\graph{2}}{\threedis}\big)\prob^2 - 2\cdot\numocc{\graph{1}}{\twopath}\prob\nonumber\\ -&\qquad + \left(4\cdot\numocc{\graph{1}}{\oneint}+ 6\cdot\left(\numocc{\graph{1}}{\twopathdis} + 3\cdot\numocc{\graph{1}}{\threedis}\right)\right)\left(3\prob^2 - \prob^3\right)\nonumber\\ -&\qquad\qquad =\left(-2\cdot\numocc{\graph{1}}{\tri} - 4\cdot\numocc{\graph{1}}{\threepath} + 10\cdot\numocc{\graph{1}}{\threedis}\right)\cdot\left(3\prob^2 - \prob^3\right)\label{eq:lem3-G2-3} +&\frac{\rpoly_{\graph{2}}(\prob,\ldots, \prob)}{6\prob^3} - \frac{\numocc{\graph{2}}{\ed}}{6\prob} - \numocc{\graph{2}}{\twopath} - \numocc{\graph{2}}{\twodis}\prob - \numocc{\graph{2}}{\oneint}\prob\nonumber\\ +& - \big(\numocc{\graph{2}}{\twopathdis} + 3\numocc{\graph{2}}{\threedis}\big)\prob^2 \nonumber\\ +&=\left(-2\cdot\numocc{\graph{1}}{\tri} - 4\cdot\numocc{\graph{1}}{\threepath}\right.\nonumber\\ +&\left. - 8\cdot\numocc{\graph{1}}{\threedis} - 6\cdot\numocc{\graph{1}}{\twopathdis}\right)\cdot\left(3\prob^2 - p^3\right) + 2\cdot\numocc{\graph{1}}{\twopath}\prob\nonumber\\ +&- 4\cdot\numocc{\graph{1}}{\oneint}\cdot\left(3\prob^2 - \prob^3\right)\label{eq:lem3-G2-1}\\ +&\frac{\rpoly_{\graph{2}}(\prob,\ldots, \prob)}{6\prob^3} - \frac{\numocc{\graph{2}}{\ed}}{6\prob} - \numocc{\graph{2}}{\twopath} - \numocc{\graph{2}}{\twodis}\prob \nonumber\\ +&- \numocc{\graph{2}}{\oneint}\prob- \big(\numocc{\graph{2}}{\twopathdis} + 3\numocc{\graph{2}}{\threedis}\big)\prob^2 \nonumber\\ +&- 2\cdot\numocc{\graph{1}}{\twopath}\prob+ 4\cdot\numocc{\graph{1}}{\oneint}\left(3\prob^2 - \prob^3\right)\nonumber\\ +&=\left(-2\cdot\numocc{\graph{1}}{\tri} - 4\cdot\numocc{\graph{1}}{\threepath} - 8\cdot\numocc{\graph{1}}{\threedis}\right. 
\nonumber\\
+&\left.- 6\cdot\numocc{\graph{1}}{\twopathdis}\right)\cdot\left(3\prob^2 - \prob^3\right)\label{eq:lem3-G2-2}\\
+&\frac{\rpoly_{\graph{2}}(\prob,\ldots, \prob)}{6\prob^3} - \frac{\numocc{\graph{2}}{\ed}}{6\prob} - \numocc{\graph{2}}{\twopath} - \numocc{\graph{2}}{\twodis}\prob\nonumber\\
+&- \numocc{\graph{2}}{\oneint}\prob - \big(\numocc{\graph{2}}{\twopathdis} + 3\numocc{\graph{2}}{\threedis}\big)\prob^2\nonumber\\
+&- 2\cdot\numocc{\graph{1}}{\twopath}\prob + \left(4\cdot\numocc{\graph{1}}{\oneint}+ 6\cdot\left(\numocc{\graph{1}}{\twopathdis}\right.\right. \nonumber\\
+&\left.\left.+ 3\cdot\numocc{\graph{1}}{\threedis}\right)\right)\left(3\prob^2 - \prob^3\right)\nonumber\\
+&=\left(-2\cdot\numocc{\graph{1}}{\tri} - 4\cdot\numocc{\graph{1}}{\threepath} + 10\cdot\numocc{\graph{1}}{\threedis}\right)\cdot\left(3\prob^2 - \prob^3\right)\label{eq:lem3-G2-3}
 \end{align}
 
Equation ~\ref{eq:lem3-G2-1} follows by substituting ~\cref{eq:ls-2-3} in the RHS. We then arrive with ~\cref{eq:lem3-G2-2} by adding the inverse of the last 3 terms of ~\cref{eq:ls-2-3} to both sides. Finally, we arrive at ~\cref{eq:lem3-G2-3} by adding the $O(\numedge)$ computable term (by ~\cref{eq:2pd-3d}) $6\left(\cdot\numocc{\graph{1}}{\twopathdis} + 3\cdot\numocc{\graph{1}}{\threedis}\right)$ to both sides.
 
-Denote the matrix of the linear system as $\mtrix{\rpoly_{G}}$, where $\mtrix{\rpoly_{G}}[i]$ is the $i^{\text{th}}$ row of $\mtrix{\rpoly_{G}}$. From ~\cref{eq:lem3-G2-3} it follows that
-\[\mtrix{\rpoly_{\graph{2}}}[2] = \left(-2 \cdot \numocc{\graph{1}}{\tri} - 4 \cdot \numocc{\graph{1}}{\threepath} + 10 \cdot \numocc{\graph{1}}{\threedis}\right)\cdot \left(3\prob^2 - \prob^3\right)\]
+Denote the matrix of the linear system as $\mtrix{\rpoly_{G}}$, where $\mtrix{\rpoly_{G}}[i]$ is the $i^{\text{th}}$ row of $\mtrix{\rpoly_{G}}$. 
From ~\cref{eq:lem3-G2-3} it follows that $\mtrix{\rpoly_{\graph{2}}}[2] = $ +\begin{equation*} +\left(-2 \cdot \numocc{\graph{1}}{\tri} - 4 \cdot \numocc{\graph{1}}{\threepath} + 10 \cdot \numocc{\graph{1}}{\threedis}\right)\cdot \left(3\prob^2 - \prob^3\right) +\end{equation*} and @@ -67,32 +79,43 @@ and %Equation ~\ref{eq:LS-G2'} is the result of collecting $2\cdot\left(\numocc{\graph{1}}{\twopathdis} + 3\numocc{\graph{1}}{\threedis}\right)$ and moving them to the other side. Then ~\cref{eq:LS-G2'-1} results from adding $4\cdot\left(\numocc{\graph{1}}{\twopathdis} + 3\numocc{\graph{1}}{\threedis}\right)$ to both sides. Equation ~\ref{eq:LS-G2'-2} is the result of simplifying terms. % %For the left hand side, following the above steps, we obtain + + \begin{align*} -\vct{b}[2] &= \frac{\rpoly(\prob,\ldots, \prob)}{6\prob^3} - \frac{\numocc{\graph{2}}{\ed}}{6\prob} - \numocc{\graph{2}}{\twopath} - \numocc{\graph{2}}{\twodis}\prob - \numocc{\graph{2}}{\oneint}\prob - \left(\numocc{\graph{2}}{\twopathdis} + 3\numocc{\graph{2}}{\threedis}\right)\prob^2\\ -&- 2\cdot \numocc{\graph{1}}{\twopath}\prob + \left(4\cdot\numocc{\graph{1}}{\oneint}+ 6\cdot\left(\numocc{\graph{1}}{\twopathdis} + 3\cdot\numocc{\graph{1}}{\threedis}\right)\right)\left(3\prob^2 - \prob^3\right). +&\vct{b}[2] = \frac{\rpoly(\prob,\ldots, \prob)}{6\prob^3} - \frac{\numocc{\graph{2}}{\ed}}{6\prob} - \numocc{\graph{2}}{\twopath} - \numocc{\graph{2}}{\twodis}\prob \nonumber\\ + &- \numocc{\graph{2}}{\oneint}\prob- \left(\numocc{\graph{2}}{\twopathdis} + 3\numocc{\graph{2}}{\threedis}\right)\prob^2- 2\cdot \numocc{\graph{1}}{\twopath}\prob \\ +&+ \left(4\cdot\numocc{\graph{1}}{\oneint}+ 6\cdot\left(\numocc{\graph{1}}{\twopathdis} + 3\cdot\numocc{\graph{1}}{\threedis}\right)\right)\left(3\prob^2 - \prob^3\right). \end{align*} -We now have a linear equation in terms of $\graph{1}$ for $\graph{2}$. 
Note that by ~\cref{eq:2pd-3d}, it is the case that any term of the form $x \cdot \left(\numocc{\graph{i}}{\twopathdis} + 3\cdot \numocc{\graph{i}}{\threedis}\right)$ is computable in linear time. By ~\cref{eq:1e}, ~\cref{eq:2p}, ~\cref{eq:2m}, and ~\cref{eq:3s} the same is true for $\numocc{\graph{i}}{\ed}$, $\numocc{\graph{i}}{\twopath}$, $\numocc{\graph{i}}{\twodis}$, and $\numocc{\graph{i}}{\oneint}$ respectively. +We now have a linear equation in terms of $\graph{1}$ for $\graph{2}$. Note that by ~\cref{eq:2pd-3d}, it is the case that any term of the form $x \cdot \left(\numocc{\graph{i}}{\twopathdis}\right.$ + $\left.3\cdot \numocc{\graph{i}}{\threedis}\right)$ is computable in linear time. By ~\cref{eq:1e}, ~\cref{eq:2p}, ~\cref{eq:2m}, and ~\cref{eq:3s} the same is true for $\numocc{\graph{i}}{\ed}$, $\numocc{\graph{i}}{\twopath}$, $\numocc{\graph{i}}{\twodis}$, and $\numocc{\graph{i}}{\oneint}$ respectively. \subsubsection{$\graph{3}$} Following the same reasoning for $\graph{3}$, using \cref{lem:3m-G3}, \cref{lem:3p-G3}, and \cref{lem:tri}, starting with the RHS of ~\cref{eq:LS-subtract}, we derive \begin{align} &\numocc{\graph{3}}{\tri} + \numocc{\graph{3}}{\threepath}\prob - \numocc{\graph{3}}{\threedis}\left(3\prob^2 - \prob^3\right)\nonumber\\ -=& \pbrace{\numocc{\graph{1}}{\ed} + 2 \cdot \numocc{\graph{1}}{\twopath}}\prob - \left\{4 \cdot \numocc{\graph{1}}{\twopath} + 6 \cdot \numocc{\graph{1}}{\twodis} + 18 \cdot \numocc{\graph{1}}{\tri} + 21 \cdot \numocc{\graph{1}}{\threepath} + 24 \cdot \numocc{\graph{1}}{\twopathdis} +\right.\nonumber\\ +=& \pbrace{\numocc{\graph{1}}{\ed} + 2 \cdot \numocc{\graph{1}}{\twopath}}\prob - \left\{4 \cdot \numocc{\graph{1}}{\twopath} + 6 \cdot \numocc{\graph{1}}{\twodis}\right.\nonumber\\ +&\left.+ 18 \cdot \numocc{\graph{1}}{\tri} + 21 \cdot \numocc{\graph{1}}{\threepath} + 24 \cdot \numocc{\graph{1}}{\twopathdis} +\right.\nonumber\\ &\left.20 \cdot \numocc{\graph{1}}{\oneint} + 27 \cdot 
\numocc{\graph{1}}{\threedis}\right\}\left(3\prob^2 - \prob^3\right)\label{eq:LS-G3-sub}\\ -=&\pbrace{ -18\numocc{\graph{1}}{\tri} - 21 \cdot \numocc{\graph{1}}{\threepath} - 24 \cdot \numocc{\graph{1}}{\twopathdis} - 27 \cdot \numocc{\graph{1}}{\threedis}}\left(3\prob^2 - \prob^3\right) \nonumber\\ -&+ \pbrace{-20 \cdot \numocc{\graph{1}}{\oneint} - 4\cdot \numocc{\graph{1}}{\twopath} - 6 \cdot \numocc{\graph{1}}{\twodis}}\left(3\prob^2 - \prob^3\right)+ \numocc{\graph{1}}{\ed}\prob + 2 \cdot \numocc{\graph{1}}{\twopath}\prob. \label{eq:lem3-G3-1} +=&\left\{ -18\numocc{\graph{1}}{\tri} - 21 \cdot \numocc{\graph{1}}{\threepath} - 24 \cdot \numocc{\graph{1}}{\twopathdis}\right. \nonumber\\ +&\qquad\left.- 27 \cdot \numocc{\graph{1}}{\threedis}\right\}\left(3\prob^2 - \prob^3\right) \nonumber\\ +&+ \pbrace{-20 \cdot \numocc{\graph{1}}{\oneint} - 4\cdot \numocc{\graph{1}}{\twopath} - 6 \cdot \numocc{\graph{1}}{\twodis}}\left(3\prob^2 - \prob^3\right)\nonumber\\ +&+ \numocc{\graph{1}}{\ed}\prob + 2 \cdot \numocc{\graph{1}}{\twopath}\prob. \label{eq:lem3-G3-1} \end{align} Looking at ~\cref{eq:LS-subtract}, \begin{align} -&\frac{\rpoly_{\graph{3}}(\prob,\ldots, \prob)}{6\prob^3} - \frac{\numocc{\graph{3}}{\ed}}{6\prob} - \numocc{\graph{3}}{\twopath} - \numocc{\graph{3}}{\twodis}\prob - \numocc{\graph{3}}{\oneint}\prob - \big(\numocc{\graph{3}}{\twopathdis} + 3\numocc{\graph{3}}{\threedis}\big)\prob^2\nonumber\\ -&\qquad\qquad= \pbrace{ -18\numocc{\graph{1}}{\tri} - 21 \cdot \numocc{\graph{1}}{\threepath} - 24 \cdot \numocc{\graph{1}}{\twopathdis} - 27 \cdot \numocc{\graph{1}}{\threedis}}\left(3\prob^2 - \prob^3\right) \nonumber\\ -&\qquad\qquad\qquad+ \pbrace{-20 \cdot \numocc{\graph{1}}{\oneint} - 4\cdot \numocc{\graph{1}}{\twopath} - 6 \cdot \numocc{\graph{1}}{\twodis}}\left(3\prob^2 - \prob^3\right)+ \numocc{\graph{1}}{\ed}\prob + 2 \cdot \numocc{\graph{1}}{\twopath}\prob. 
\label{eq:lem3-G3-2}\\ -&\frac{\rpoly_{\graph{3}}(\prob,\ldots, \prob)}{6\prob^3} - \frac{\numocc{\graph{3}}{\ed}}{6\prob} - \numocc{\graph{3}}{\twopath} - \numocc{\graph{3}}{\twodis}\prob - \numocc{\graph{3}}{\oneint}\prob - \big(\numocc{\graph{3}}{\twopathdis} + 3\numocc{\graph{3}}{\threedis}\big)\prob^2 - \left(\numocc{\graph{1}}{\ed} + \numocc{\graph{1}}{\twopath}\right)\prob\nonumber\\ -&\qquad + \left(24\left(\numocc{\graph{1}}{\twopathdis} + 3\cdot\numocc{\graph{1}}{\threedis}\right) + 20\cdot\numocc{\graph{1}}{\oneint} + 4\cdot\numocc{\graph{1}}{\twopath} + 6\cdot\numocc{\graph{1}}{\twodis}\right)\left(3\prob^2 - \prob^3\right)\nonumber\\ -&\qquad\qquad = \pbrace{- 18 \cdot \numocc{\graph{1}}{\tri} - 21 \cdot \numocc{\graph{1}}{\threepath} + 45 \cdot \numocc{\graph{1}}{\threedis}}\left(3p^2 - p^3\right)\label{eq:lem3-G3-3} +&\frac{\rpoly_{\graph{3}}(\prob,\ldots, \prob)}{6\prob^3} - \frac{\numocc{\graph{3}}{\ed}}{6\prob} - \numocc{\graph{3}}{\twopath} - \numocc{\graph{3}}{\twodis}\prob - \numocc{\graph{3}}{\oneint}\prob \nonumber\\ +&- \big(\numocc{\graph{3}}{\twopathdis} + 3\numocc{\graph{3}}{\threedis}\big)\prob^2\nonumber\\ +&= \left\{ -18\numocc{\graph{1}}{\tri} - 21 \cdot \numocc{\graph{1}}{\threepath} - 24 \cdot \numocc{\graph{1}}{\twopathdis}\right. \nonumber\\ +&\left.- 27 \cdot \numocc{\graph{1}}{\threedis}\right\}\left(3\prob^2 - \prob^3\right) \nonumber\\ +&+ \pbrace{-20 \cdot \numocc{\graph{1}}{\oneint} - 4\cdot \numocc{\graph{1}}{\twopath} - 6 \cdot \numocc{\graph{1}}{\twodis}}\left(3\prob^2 - \prob^3\right)\nonumber\\ +&+ \numocc{\graph{1}}{\ed}\prob + 2 \cdot \numocc{\graph{1}}{\twopath}\prob. 
\label{eq:lem3-G3-2}\\
+&\frac{\rpoly_{\graph{3}}(\prob,\ldots, \prob)}{6\prob^3} - \frac{\numocc{\graph{3}}{\ed}}{6\prob} - \numocc{\graph{3}}{\twopath} - \numocc{\graph{3}}{\twodis}\prob - \numocc{\graph{3}}{\oneint}\prob\nonumber\\
+&- \big(\numocc{\graph{3}}{\twopathdis} + 3\numocc{\graph{3}}{\threedis}\big)\prob^2 - \left(\numocc{\graph{1}}{\ed} + \numocc{\graph{1}}{\twopath}\right)\prob\nonumber\\
+&+ \left(24\left(\numocc{\graph{1}}{\twopathdis} + 3\cdot\numocc{\graph{1}}{\threedis}\right) + 20\cdot\numocc{\graph{1}}{\oneint} + 4\cdot\numocc{\graph{1}}{\twopath}\right.\nonumber\\
+&\left.+ 6\cdot\numocc{\graph{1}}{\twodis}\right)\left(3\prob^2 - \prob^3\right)\nonumber\\
+&= \pbrace{- 18 \cdot \numocc{\graph{1}}{\tri} - 21 \cdot \numocc{\graph{1}}{\threepath} + 45 \cdot \numocc{\graph{1}}{\threedis}}\left(3\prob^2 - \prob^3\right)\label{eq:lem3-G3-3}
 \end{align}
 
Equation ~\ref{eq:lem3-G3-2} follows from substituting ~\cref{eq:lem3-G3-2} in for the RHS of ~\cref{eq:LS-subtract}. We derive ~\cref{eq:lem3-G3-3} by adding the inverse of all $O(\numedge)$ computable terms, and for the case of $\twopathdis$ and $\threedis$, we add the $O(\numedge)$ computable term $24\cdot\left(\numocc{\graph{1}}{\twopathdis} + \numocc{\graph{1}}{\threedis}\right)$ to both sides.
@@ -111,8 +134,10 @@ and %For the LHS we get \begin{align*} -\vct{b}[3] =& \frac{\rpoly(\prob,\ldots, \prob)}{6\prob^3} - \frac{\numocc{\graph{3}}{\ed}}{6\prob} - \numocc{\graph{3}}{\twopath} - \numocc{\graph{3}}{\twodis}\prob - \numocc{\graph{3}}{\oneint}\prob - \big(\numocc{\graph{3}}{\twopathdis} + 3\numocc{\graph{3}}{\threedis}\big)\prob^2 - \pbrace{\numocc{\graph{1}}{\ed} + 2 \cdot \numocc{\graph{1}}{\twopath}}\prob\\ -& + \pbrace{24 \cdot \left(\numocc{\graph{1}}{\twopathdis} + 3\numocc{\graph{1}}{\threedis}\right) + 20 \cdot \numocc{\graph{1}}{\oneint} + 4\cdot \numocc{\graph{1}}{\twopath} + 6 \cdot \numocc{\graph{1}}{\twodis}}\left(3\prob^2 - \prob^3\right) +&\vct{b}[3] = \frac{\rpoly(\prob,\ldots, \prob)}{6\prob^3} - \frac{\numocc{\graph{3}}{\ed}}{6\prob} - \numocc{\graph{3}}{\twopath} - \numocc{\graph{3}}{\twodis}\prob - \numocc{\graph{3}}{\oneint}\prob \nonumber\\ +&- \big(\numocc{\graph{3}}{\twopathdis} + 3\numocc{\graph{3}}{\threedis}\big)\prob^2 - \pbrace{\numocc{\graph{1}}{\ed} + 2 \cdot \numocc{\graph{1}}{\twopath}}\prob\\ +&+ \left\{24 \cdot \left(\numocc{\graph{1}}{\twopathdis} + 3\numocc{\graph{1}}{\threedis}\right) + 20 \cdot \numocc{\graph{1}}{\oneint} + 4\cdot \numocc{\graph{1}}{\twopath} \right.\nonumber\\ +&\left.+ 6 \cdot \numocc{\graph{1}}{\twodis}\right\}\left(3\prob^2 - \prob^3\right) \end{align*} We now have a linear system consisting of three linear combinations, for $\graph{1}, \graph{2}, \graph{3}$ in terms of $\graph{1}$. Note that the constants for $\graph{1}$ follow the RHS of ~\cref{eq:LS-subtract}. To make it easier, use the following variable representations: $x = \numocc{\graph{1}}{\tri}, y = \numocc{\graph{1}}{\threepath}, z = \numocc{\graph{1}}{\threedis}$. Using $\linsys{2}$ and $\linsys{3}$, the following matrix is obtained, @@ -135,46 +160,51 @@ Now we seek to show that all rows of the system are indeed independent. The method of minors can be used to compute the determinant, $\dtrm{\mtrix{\rpoly}}$. 
We also make use of the fact that for a matrix with entries $ab, ac, ad,$ and $ae$, the determinant is $a^2be - a^2cd = a^2(be - cd)$. -\begin{equation*} -\begin{vmatrix} +\begin{align*} +&\begin{vmatrix} 1 & \prob & -(3\prob^2 - \prob^3)\\ -2(3\prob^2 - \prob^3) & -4(3\prob^2 - \prob^3) & 10(3\prob^2 - \prob^3)\\ -18(3\prob^2 - \prob^3) & -21(3\prob^2 - \prob^3) & 45(3\prob^2 - \prob^3) \end{vmatrix} -= (3\prob^2 - \prob^3)^2 \cdot += \\ +&(3\prob^2 - \prob^3)^2 \cdot \begin{vmatrix} -4 & 10\\ -21 & 45 \end{vmatrix} -~ - ~ \prob(3\prob^2 - \prob^3)^2~ \cdot +- \prob(3\prob^2 - \prob^3)^2~ \cdot \begin{vmatrix} -2 & 10\\ -18 & 45 -\end{vmatrix} -+ \left(- ~(3\prob^2 - \prob^3)^3\right)~ \cdot +\end{vmatrix}\\ +&+ \left(- ~(3\prob^2 - \prob^3)^3\right)~ \cdot \begin{vmatrix} -2 & -4\\ -18 & -21 \end{vmatrix}. -\end{equation*} +\end{align*} Compute each RHS term starting with the left and working to the right, \begin{equation} (3\prob^2 - \prob^3)^2\cdot \left((-4 \cdot 45) - (-21 \cdot 10)\right) = (3\prob^2 - \prob^3)^2\cdot(-180 + 210) = 30(3\prob^2 - \prob^3)^2.\label{eq:det-1} \end{equation} The middle term then is -\begin{equation} --\prob(3\prob^2 - \prob^3)^2 \cdot \left((-2 \cdot 45) - (-18 \cdot 10)\right) = -\prob(3\prob^2 - \prob^3)^2 \cdot (-90 + 180) = -90\prob(3\prob^2 - \prob^3)^2.\label{eq:det-2} -\end{equation} +\begin{align} +&-\prob(3\prob^2 - \prob^3)^2 \cdot \left((-2 \cdot 45) - (-18 \cdot 10)\right) \nonumber\\ +&= -\prob(3\prob^2 - \prob^3)^2 \cdot (-90 + 180) = -90\prob(3\prob^2 - \prob^3)^2.\label{eq:det-2} +\end{align} Finally, the rightmost term, -\begin{equation} --\left(3\prob^2 - \prob^3\right)^3 \cdot \left((-2 \cdot -21) - (-18 \cdot -4)\right) = -\left(3\prob^2 - \prob^3\right)^3 \cdot (42 - 72) = 30\left(3\prob^2 - \prob^3\right)^3.\label{eq:det-3} -\end{equation} +\begin{align} +&-\left(3\prob^2 - \prob^3\right)^3 \cdot \left((-2 \cdot -21) - (-18 \cdot -4)\right) \nonumber\\ +&= -\left(3\prob^2 - \prob^3\right)^3 \cdot (42 - 72) 
= 30\left(3\prob^2 - \prob^3\right)^3.\label{eq:det-3} +\end{align} Putting \cref{eq:det-1}, \cref{eq:det-2}, \cref{eq:det-3} together, we have, \begin{align} -\dtrm{\mtrix{\rpoly}} =& 30(3\prob^2 - \prob^3)^2 - 90\prob(3\prob^2 - \prob^3)^2 +30(3\prob^2 - \prob^3)^3 = 30(3\prob^2 - \prob^3)^2\left(1 - 3\prob + (3\prob^2 - \prob^3)\right) = 30\left(9\prob^4 - 6\prob^5 + \prob^6\right)\left(-\prob^3 + 3\prob^2 - 3\prob + 1\right)\nonumber\\ -=&\left(30\prob^6 - 180\prob^5 + 270\prob^4\right)\cdot\left(-\prob^3 + 3\prob^2 - 3\prob + 1\right).\label{eq:det-final} +&\dtrm{\mtrix{\rpoly}} = 30(3\prob^2 - \prob^3)^2 - 90\prob(3\prob^2 - \prob^3)^2 +30(3\prob^2 - \prob^3)^3\nonumber\\ +&= 30(3\prob^2 - \prob^3)^2\left(1 - 3\prob + (3\prob^2 - \prob^3)\right) \nonumber\\ +&= 30\left(9\prob^4 - 6\prob^5 + \prob^6\right)\left(-\prob^3 + 3\prob^2 - 3\prob + 1\right)\nonumber\\ +&=\left(30\prob^6 - 180\prob^5 + 270\prob^4\right)\cdot\left(-\prob^3 + 3\prob^2 - 3\prob + 1\right).\label{eq:det-final} \end{align} \AH{It appears that the equation below has roots at p = 0 (left factor) and p = 1, with NO roots $\in (0, 1)$.} @@ -184,7 +214,7 @@ Putting \cref{eq:det-1}, \cref{eq:det-2}, \cref{eq:det-3} together, we have, \end{proof}\AH{End proof of Lemma \ref{lem:lin-sys}} \qed - -Thus, we have proved the ~\cref{lem:const-p} for fixed $p \in (0, 1)$. +\begin{proof}[Proof of \cref{th:single-p}] +Thus, by ~\cref{lem:lin-sys} we have proved ~\cref{th:single-p} for fixed $p \in (0, 1)$. 
\end{proof} \qed diff --git a/macros.tex b/macros.tex index 68ac58b..ac918a9 100644 --- a/macros.tex +++ b/macros.tex @@ -6,7 +6,8 @@ \newcommand{\wElem}{w} %an element of \vct{w} -\newcommand{\st}{\;|\;} +\newcommand{\st}{\;|\;} %such that +\newcommand{\kElem}{k}%the kth element %RA-to-Poly Notation \newcommand{\polyinput}[2]{\left(#1,\ldots, #2\right)} \newcommand{\numvar}{n} diff --git a/mult_distinct_p.tex b/mult_distinct_p.tex index 2069914..9b93ebb 100644 --- a/mult_distinct_p.tex +++ b/mult_distinct_p.tex @@ -1,40 +1,12 @@ %root:main.tex \subsection{When $\poly$ is not in sum of monomials form} -We would like to argue in the general case that $\expct_{\vct{w}}\pbox{\poly(\vct{w})}$ cannot be computed in linear time. + +We would like to argue for a compressed version of $\poly(\vct{w})$, in general $\expct_{\vct{w}}\pbox{\poly(\vct{w})}$ cannot be computed in linear time. To this end, consider the following graph $G(V, E)$, where $|E| = \numedge$, $|V| = \numvar$, and $i, j \in [\numvar]$. -Before proceeding, let us list all possible edge patterns in an arbitrary $G$ consisting of $\leq 3$ distinct edges. - -\begin{itemize} - \item Single Edge $\left(\ed\right)$ - \item 2-path ($\twopath$) - \item 2-matching ($\twodis$) - \item Triangle ($\tri$) - \item 3-path ($\threepath$) - \item 3-star ($\oneint$)--this is the graph that results when all three edges share exactly one common endpoint. The remaining endpoint for each edge is disconnected from any endpoint of the three edges. - \item Disjoint Two-Path ($\twopathdis$)--this subgraph consists of a two path and a remaining disjoint edge. - \item 3-matching ($\threedis$)--this subgraph is composed of three disjoint edges. -\end{itemize} - -Let $\numocc{G}{H}$ denote the number of occurrences of pattern $H$ in graph $G$, where, for example, $\numocc{G}{\ed}$ means the number of single edges in $G$. 
- -For any graph $G$, the following formulas compute $\numocc{G}{H}$ for their respective patterns in $O(\numedge)$ time, with $d_i$ representing the degree of vertex $i$. -\begin{align} - &\numocc{G}{\ed} = \numedge, \label{eq:1e}\\ - &\numocc{G}{\twopath} = \sum_{i \in V} \binom{d_i}{2} \label{eq:2p}\\ - &\numocc{G}{\twodis} = \sum_{(i, j) \in E} \frac{\numedge - d_i - d_j + 1}{2}\label{eq:2m}\\%\binom{\numedge - d_i - d_j + 1}{2}\label{eq:2m}\\ - &\numocc{G}{\oneint} = \sum_{i \in V} \binom{d_i}{3}\label{eq:3s}\\ - &\numocc{G}{\twopathdis} + 3\numocc{G}{\threedis} = \sum_{(i, j) \in E} \binom{\numedge - d_i - d_j + 1}{2}\label{eq:2pd-3d} -\end{align} - -A quick argument to why \cref{eq:2m} is true. Note that for edge $(i, j)$ connecting arbitrary vertices $i$ and $j$, finding all other edges in $G$ disjoint to $(i, j)$ is equivalent to finding all edges that are not connected to either vertex $i$ or $j$. The number of such edges is $m - d_i - d_j + 1$, where we add $1$ since edge $(i, j)$ is removed twice when subtracting both $d_i$ and $d_j$. Since the summation is iterating over all edges such that a pair $\left((i, j), (k, \ell)\right)$ will also be counted as $\left((k, \ell), (i, j)\right)$, division by $2$ then eliminates this double counting. - -Equation ~\ref{eq:2pd-3d} is true for similar reasons. For edge $(i, j)$, it is necessary to find two additional edges, disjoint or connected. As in ~\cref{eq:2m}, once the number of edges disjoint to $(i, j)$ have been computed, then we only need to consider all possible combinations of two edges from the set of disjoint edges, since it doesn't matter if the two edges are connected or not. Note, the factor $3$ of $\threedis$ is necessary to account for the triple counting of $3$-matchings. 
It is also the case that, since the two path in $\twopathdis$ is connected, that there will be no double counting by the fact that the summation automatically 'disconnects' the current edge, meaning that a two matching at the current vertex will not be counted. The sum over all such edge combinations is precisely then $\numocc{G}{\twopathdis} + 3\numocc{G}{\threedis}$. - - -Now consider the query $\poly_{G}(\vct{X}) = q_E(X_1,\ldots, X_\numvar) = \sum\limits_{(i, j) \in E} X_i \cdot X_j$. For the following discussion, set $\poly_{G}^3(\vct{X}) = \left(q_E(X_1,\ldots, X_\numvar)\right)^3$. +Consider the query $\poly_{G}(\vct{X}) = q_E(X_1,\ldots, X_\numvar) = \sum\limits_{(i, j) \in E} X_i \cdot X_j$. %Original lemma proving the exact coefficient terms in qE3 %%%%%%%%%%%%%%%%%%%%%%%%%%%%% @@ -58,56 +30,57 @@ Now consider the query $\poly_{G}(\vct{X}) = q_E(X_1,\ldots, X_\numvar) = \sum\l %\textsc{case 3:} $e_1 \neq e_2 \neq e_3$, i.e., when all edges are distinct. For this case, we have $3! = 6$ permutations of $(e_1, e_2, e_3)$. This case consists of the following edge patterns: $\tri, \oneint, \threepath, \twopathdis, \threedis$. %\end{proof} %\qed - -\begin{Lemma}\label{lem:alt-qE3-exp} -Given polynomial $\poly_{G}^3(\prob,\ldots, \prob)$, we can write $\rpoly_{G}^3$ as $\rpoly_{G}^3(\prob,\ldots, \prob) = \sum\limits_{i = 0}^6 c_i \cdot \prob^i$ for some fixed terms $\vct{c}$ and seven distinct $\prob$ values, one can compute each $c_i$ in $\vct{c}$ exactly. +\subsubsection{Multiple Distinct $\prob$ values} +For the following discussion, set $\poly_{G}^\kElem(\vct{X}) = \left(q_E(X_1,\ldots, X_\numvar)\right)^\kElem$. +\begin{Lemma}\label{lem:qEk-multi-p} +Given polynomial $\poly_{G}^\kElem(\prob,\ldots, \prob)$, we can write $\rpoly_{G}^\kElem$ as $\rpoly_{G}^\kElem(\prob,\ldots, \prob) = \sum\limits_{i = 0}^{2\kElem} c_i \cdot \prob^i$ for some fixed terms $\vct{c}$ and $2\kElem + 1$ distinct $\prob$ values, one can compute each $c_i$ in $\vct{c}$ exactly. 
\end{Lemma} -\begin{proof}[Proof of ~\cref{lem:alt-qE3-exp}] -By defintion we know that a polynomial consists of coefficient terms being multiplied to variables. In our case, one can readily expand the cubed expression by performing the $n^3$ product operations, yielding the polynomial in the sum of products form of the lemma statement. By definition $\rpoly_{G}^3$ reduces all variable exponents greater than $1$ to $1$. Thus, a monomial such as $X_i^3X_j^3$ is $X_iX_j$ in $\rpoly_{G}^3$, and the value after substition is $p_i\cdot p_j = p^2$. Further, that the number of terms in the sum is no greater than $7$, can be easily justified by the fact that each edge has two endpoints, and the most endpoints occur when we have $3$ distinct edges, with non-intersecting points, a case equivalent to $p^6$. +\begin{proof}[Proof of ~\cref{lem:qEk-multi-p}] +It is trivial to see that one can readily expand the exponential expression by performing the $n^\kElem$ product operations, yielding the polynomial in the sum of products form of the lemma statement. By definition $\rpoly_{G}^\kElem$ reduces all variable exponents greater than $1$ to $1$. Thus, a monomial such as $X_i^\kElem X_j^\kElem$ is $X_iX_j$ in $\rpoly_{G}^\kElem$, and the value after substitution is $p_i\cdot p_j = p^2$. Further, that the number of terms in the sum is no greater than $2\kElem + 1$, can be easily justified by the fact that each edge has two endpoints, and the most endpoints occur when we have $\kElem$ distinct edges, with non-intersecting points, a case equivalent to $p^{2\kElem}$. -Given that we have at least $7$ distinct values of $\prob$ by the lemma statement, it follows that we then have $7$ linear equations which are distinct. Further, by definition of the summation, these seven equations collectively form the Vandermonde matrix, from which it follows that we have a matrix with full rank, and we can solve the linear system to determine $\vct{c}$ exactly. 
+Given that we have $2\kElem + 1$ distinct values of $\prob$ by the lemma statement, it follows that we then have $2\kElem + 1$ linear equations which are distinct. Further, by construction of the summation, these $2\kElem + 1$ equations collectively form the Vandermonde matrix, from which it follows that we have a matrix with full rank, and we can solve the linear system to determine $\vct{c}$ exactly.
 \end{proof}
 
 \qed
 
-\begin{Lemma}\label{lem:alt-qE3-exp-3-match}
-The number of $3$-matchings in $\poly_{G}(\vct{X})$ is exactly $6\cdot\numocc{G}{\threedis}$.
+\begin{Lemma}\label{lem:qEk-multi-p-k-match}
+The number of $\kElem$-matchings in $\poly_{G}^\kElem(\vct{X})$ is exactly $\kElem!\cdot\numocc{G}{\threedis}$.
 \end{Lemma}
 
-\begin{proof}
-A $3$-matching occurs when there are $3$ edges, $e_1, e_2, e_3$, such that all of them are disjoint, i.e., $e_1 \neq e_2 \neq e_3$. In $\poly_{G}(\vct{X})$ there are $3$ choices from the first factor to select an edge of a given three matching. In the second factor, only $2$ choices, and so on, yielding $3! = 6$ terms in the expansion of $\poly_{G}(\vct{X})$ of an arbitrary $3-matching$.
+\begin{proof}[Proof of Lemma ~\ref{lem:qEk-multi-p-k-match}]
+A $\kElem$-matching occurs when there are $\kElem$ edges, $e_1, e_2,\ldots, e_\kElem$, such that all of them are disjoint, i.e., $e_1 \neq e_2 \neq \cdots \neq e_\kElem$. In all $\kElem$ factors of $\poly_{G}^\kElem(\vct{X})$ there are $\kElem$ choices from the first factor to select an edge for a given $\kElem$ matching, $\kElem - 1$ choices in the second factor, and so on throughout all the factors, yielding $\kElem!$ duplicate terms for each $\kElem$ matching in the expansion of $\poly_{G}^\kElem(\vct{X})$.
 
-Thus, the product $6\cdot\numocc{G}{\threedis}$ is the exact number of $3$-matchings in $\poly_{G}(\vct{X})$.
+Thus, the product $\kElem!\cdot\numocc{G}{\threedis}$ is the exact number of $\kElem$-matchings in $\poly_{G}^\kElem(\vct{X})$. 
\end{proof} \qed -\begin{Corollary}\label{cor:lem-alt-qE3} -One can comute $\numocc{G}{\threedis}$ in $\query_{G}(\vct{X})$ exactly. +\begin{Corollary}\label{cor:lem-qEk} +One can compute $\numocc{G}{\threedis}$ in $\query_{G}^\kElem(\vct{X})$ exactly. \end{Corollary} -\begin{proof}[Proof for Corollary ~\ref{cor:lem-alt-qE3}] -By ~\cref{lem:alt-qE3-exp}, the term $c_6$ can be exactly computed. By ~\cref{lem:alt-qE3-exp-3-match}, we know that $c_6$ can be broken into two factors, and by dividing $c_6$ by the factor $6$, it follows that the resulting value is indeed $\numocc{G}{\threedis}$. +\begin{proof}[Proof for Corollary ~\ref{cor:lem-qEk}] +By ~\cref{lem:qEk-multi-p}, the term $c_{2\kElem}$ can be exactly computed. By ~\cref{lem:qEk-multi-p-k-match}, we know that $c_{2\kElem}$ can be broken into two factors, and by dividing $c_{2\kElem}$ by the factor $\kElem!$, it follows that the resulting value is indeed $\numocc{G}{\threedis}$. \end{proof} \qed -\begin{Lemma}\label{lem:alt-qEk} -Given $k$ distinct $\prob$ values and $\poly_{G}^k(\prob,\ldots, \prob)$, one can solve the number of $3$-matchings exactly. -\end{Lemma} - -\begin{proof}[Proof for Lemma ~\ref{lem:alt-qEk}] -By the same logic as ~\cref{lem:alt-qE3-exp} it is the case that there are $k$ $\prob^i$ values for $i$ in $[0, k - 1]$. This, combined with $k$ distinct $\prob$ values yields the Vandermonde matrix with full rank, and thus all the values $c_i$ in $\vct{c}$ can be computed exactly. Finally, along the same lines as ~\cref{lem:alt-qE3-exp-3-match}, dividing by $k!$ yields the desired result, $\numocc{G}{k-matchings}$. This can be seen, since it is the case that only a $k-matching$ can have a $\prob^{2k}$ factor, and, secondly, for a $k-product$, there are $k$ choices in the first product, $k - 1$ choices in the second factor, and so on, yielding $k!$ copies of each $k-matching$. 
- - -\AH{Any suggestions for a better notation/representation of k-matching??} -\end{proof} - -\qed +%\begin{Lemma}\label{lem:alt-qEk} +%Given $k$ distinct $\prob$ values and $\poly_{G}^k(\prob,\ldots, \prob)$, one can solve the number of $3$-matchings exactly. +%\end{Lemma} +% +%\begin{proof}[Proof for Lemma ~\ref{lem:alt-qEk}] +%By the same logic as ~\cref{lem:qEk-multi-p} it is the case that there are $k$ $\prob^i$ values for $i$ in $[0, k - 1]$. This, combined with $k$ distinct $\prob$ values yields the Vandermonde matrix with full rank, and thus all the values $c_i$ in $\vct{c}$ can be computed exactly. Finally, along the same lines as ~\cref{lem:qEk-multi-p-k-match}, dividing by $k!$ yields the desired result, $\numocc{G}{k-matchings}$. This can be seen, since it is the case that only a $k-matching$ can have a $\prob^{2k}$ factor, and, secondly, for a $k-product$, there are $k$ choices in the first product, $k - 1$ choices in the second factor, and so on, yielding $k!$ copies of each $k-matching$. +% +% +%\AH{Any suggestions for a better notation/representation of k-matching??} +%\end{proof} +% +%\qed \begin{Corollary}\label{cor:reduct} -Since lemmas ~\ref{lem:alt-qE3-exp} and ~\ref{lem:alt-qEk} are true, it follows that computing $\rpoly(\vct{X})$ is hard. +By ~\cref{lem:qEk-multi-p}, ~\cref{lem:qEk-multi-p-k-match}, and ~\cref{cor:lem-qEk} it follows that computing $\rpoly(\vct{X})$ is hard. \end{Corollary} %Old proof diff --git a/poly-form.tex b/poly-form.tex index c8ec296..045a21f 100644 --- a/poly-form.tex +++ b/poly-form.tex @@ -13,13 +13,13 @@ A monomial is a product of a fixed set of variables, each raised to a non-negati For the term $2xy$, by ~\cref{def:monomial} the monomial is $xy$. -\begin{Definition}[Standard Monomial Basis] +\begin{Definition}[Standard Monomial Basis]\label{def:smb} A polynomial is in standard monomial basis when it is fully expanded out such that no product of sums exist and where each unique monomial appears exactly once. 
\end{Definition}
 
 For example, consider the expression $(x + y)^2$.  The standard monomial basis for this expression is $x^2 +2xy + y^2$.  While $x^2 + xy + xy + y^2$ is an expanded form of the expression, it is not the standard monomial basis since $xy$ appears more than once.
 
-\begin{Assumption}
+\begin{Assumption}\label{assump:poly-smb}
 All polynomials considered are in standard monomial basis, i.e., $\poly(\vct{X}) = \sum\limits_{\vct{d} \in \mathbb{N}^\numvar}q_d \cdot \prod\limits_{i = 1, d_i \geq 1}^{\numvar}X_i^{d_i}$, where $q_d$ is the coefficient for the monomial encoded in $\vct{d}$ and $d_i$ is the $i^{th}$ element of $\vct{d}$.
 \end{Assumption}
diff --git a/single_p.tex b/single_p.tex
index 1087532..744ed73 100644
--- a/single_p.tex
+++ b/single_p.tex
@@ -1,19 +1,52 @@
 %root: main.tex
+\subsubsection{Single $\prob$ value}
 
-\begin{Theorem}\label{lem:const-p}
+In this discussion, let us fix $\kElem = 3$.
+
+\begin{Theorem}\label{th:single-p}
 If we can compute $\rpoly_{G}^3(\vct{X})$ in T(\numedge) time for $X_1 =\cdots= X_\numvar = \prob$, then we can count the number of triangles, 3-paths, and 3-matchings in $G$ in $T(\numedge) + O(\numedge)$ time.
 \end{Theorem}
 
-Before moving on to prove ~\cref{lem:const-p}, let us state the lemmas and defintions that will be useful in the proof.
+Before moving on to prove ~\cref{th:single-p}, let us state the results, lemmas and definitions that will be useful in the proof.
+
+We need to list all possible edge patterns in an arbitrary $G$ consisting of $\leq 3$ distinct edges.
+
+\begin{itemize}
+	\item Single Edge $\left(\ed\right)$
+	\item 2-path ($\twopath$)
+	\item 2-matching ($\twodis$)
+	\item Triangle ($\tri$)
+	\item 3-path ($\threepath$)
+	\item 3-star ($\oneint$)--this is the graph that results when all three edges share exactly one common endpoint.  The remaining endpoint for each edge is disconnected from any endpoint of the three edges.
+ \item Disjoint Two-Path ($\twopathdis$)--this subgraph consists of a two path and a remaining disjoint edge. + \item 3-matching ($\threedis$)--this subgraph is composed of three disjoint edges. +\end{itemize} + +Let $\numocc{G}{H}$ denote the number of occurrences of pattern $H$ in graph $G$, where, for example, $\numocc{G}{\ed}$ means the number of single edges in $G$. + +For any graph $G$, the following formulas compute $\numocc{G}{H}$ for their respective patterns in $O(\numedge)$ time, with $d_i$ representing the degree of vertex $i$. +\begin{align} + &\numocc{G}{\ed} = \numedge, \label{eq:1e}\\ + &\numocc{G}{\twopath} = \sum_{i \in V} \binom{d_i}{2} \label{eq:2p}\\ + &\numocc{G}{\twodis} = \sum_{(i, j) \in E} \frac{\numedge - d_i - d_j + 1}{2}\label{eq:2m}\\%\binom{\numedge - d_i - d_j + 1}{2}\label{eq:2m}\\ + &\numocc{G}{\oneint} = \sum_{i \in V} \binom{d_i}{3}\label{eq:3s}\\ + &\numocc{G}{\twopathdis} + 3\numocc{G}{\threedis} = \sum_{(i, j) \in E} \binom{\numedge - d_i - d_j + 1}{2}\label{eq:2pd-3d} +\end{align} + +A quick argument to why \cref{eq:2m} is true. Note that for edge $(i, j)$ connecting arbitrary vertices $i$ and $j$, finding all other edges in $G$ disjoint to $(i, j)$ is equivalent to finding all edges that are not connected to either vertex $i$ or $j$. The number of such edges is $m - d_i - d_j + 1$, where we add $1$ since edge $(i, j)$ is removed twice when subtracting both $d_i$ and $d_j$. Since the summation is iterating over all edges such that a pair $\left((i, j), (k, \ell)\right)$ will also be counted as $\left((k, \ell), (i, j)\right)$, division by $2$ then eliminates this double counting. + +Equation ~\ref{eq:2pd-3d} is true for similar reasons. For edge $(i, j)$, it is necessary to find two additional edges, disjoint or connected. 
As in ~\cref{eq:2m}, once the number of edges disjoint to $(i, j)$ have been computed, then we only need to consider all possible combinations of two edges from the set of disjoint edges, since it doesn't matter if the two edges are connected or not.  Note, the factor $3$ of $\threedis$ is necessary to account for the triple counting of $3$-matchings.  It is also the case that, since the two path in $\twopathdis$ is connected, that there will be no double counting by the fact that the summation automatically `disconnects' the current edge, meaning that a two matching at the current vertex will not be counted.  The sum over all such edge combinations is precisely then $\numocc{G}{\twopathdis} + 3\numocc{G}{\threedis}$.
+
+
 %Original lemma proving the exact coefficient terms in qE3
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \begin{Lemma}\label{lem:qE3-exp}
-When we expand $\poly_{G}(\vct{X}) = \left(q_E(X_1,\ldots, X_\numvar)\right)^3$ out and assign all exponents $e \geq 1$ a value of $1$, we have the following,
+When we expand $\poly_{G}^3(\vct{X})$ out and assign all exponents $e \geq 1$ a value of $1$, we have the following result,
 \begin{align}
-	&\rpoly_{G}(\prob,\ldots, \prob) = \numocc{G}{\ed}\prob^2 + 6\numocc{G}{\twopath}\prob^3 + 6\numocc{G}{\twodis} + 6\numocc{G}{\tri}\prob^3\nonumber\\
+	&\rpoly_{G}^3(\prob,\ldots, \prob) = \numocc{G}{\ed}\prob^2 + 6\numocc{G}{\twopath}\prob^3 + 6\numocc{G}{\twodis}\prob^4 + 6\numocc{G}{\tri}\prob^3\nonumber\\
 	&+ 6\numocc{G}{\oneint}\prob^4 + 6\numocc{G}{\threepath}\prob^4 + 6\numocc{G}{\twopathdis}\prob^5 + 6\numocc{G}{\threedis}\prob^6.\label{claim:four-one}
 \end{align}
 \end{Lemma}
@@ -31,9 +64,6 @@ By definition we have that
 \end{proof}
 \qed
-
-\begin{proof}[Proof of \cref{lem:const-p}]
-
 \begin{Definition}\label{def:Gk}
 For $k > 1$, let graph $\graph{k}$ be a graph generated from an arbitrary graph $\graph{1}$, by replacing every edge $e$ of $\graph{1}$ with a $k$-path, such that all $k$-path replacement edges are disjoint in the sense
that they only intersect at the original intersection endpoints as seen in $\graph{1}$. \end{Definition} @@ -84,11 +114,6 @@ We then show that for any of the patterns $\threedis, \threepath, \tri$ which ar Before proceeding, let us introduce a few more helpful definitions. -\subsubsection{$f_k$ and $\graph{k}$} - - - - \begin{Definition}\label{def:ed-nota} For the set of edges in $\graph{k}$ we write $E_k$. For any graph $\graph{k}$, its edges are denoted by the a pair $(e, b)$, such that $b \in \{0,\ldots, k-1\}$ and $e\in E_1$. \end{Definition} From 3bae3a47462150609c0f176e2c821b13a6eafe5c Mon Sep 17 00:00:00 2001 From: Aaron Huber Date: Mon, 7 Dec 2020 17:02:12 -0500 Subject: [PATCH 02/17] Changes to Approx. Alg section, mostly cosmetic --- approx_alg.tex | 75 +++++++++++++------------------------------------- lin_sys.tex | 9 ++++++ 2 files changed, 28 insertions(+), 56 deletions(-) diff --git a/approx_alg.tex b/approx_alg.tex index b9ba4a7..75cd41d 100644 --- a/approx_alg.tex +++ b/approx_alg.tex @@ -1,40 +1,8 @@ %root: main.tex \section{$1 \pm \epsilon$ Approximation Algorithm} -%\AH{I am attempting to rewrite this section mostly from scratch. This will involve taking 'baby' steps towards the goals we spoke of on Friday 080720 as well as throughout the following week on chat channel.} -% -%\AH{\textbf{BEGIN}: Old stuff.} -% -% -%\begin{proof} -% -%Let us now show a sampling scheme which can run in $O\left(|\poly|\cdot k\right)$ per sample. -% -%First, consider when $\poly$ is already an SOP of pure products. In this case, sampling is trivial, and one would sample from the $\setsize$ terms with probability proportional to the product of probabilitites for each variable in the sampled monomial. -% -%Second, consider when $\poly$ has a POS form with a product width of $k$. In this case, we can view $\poly$ as an expression tree, where the leaves represent the individual values of each factor. 
The leaves are joined together by either a $\times$ or $+$ internal node, and so on, until we reach the root, which is joining the $k$-$\times$ nodes. -% -%Then for each $\times$ node, we multiply its subtree values, while for each $+$ node, we pick one of its children with probability proportional to the product of probabilities across its variables. -% -%\AH{I think I mean to say a probability proportional to the number of elements in it's given subtree.} -% -%The above sampling scheme is in $O\left(|\poly|\cdot k\right)$ time then, since we have for either case, that at most the scheme would perform within a factor of the $|\poly|$ operations, and those operations are repeated the product width of $k$ times. -% -%Thus, it is the case, that we can approximate $\rpoly(\prob_1,\ldots, \prob_n)$ within the claimed confidence bounds and computation time, thus proving the lemma.\AH{State why.} -% -%\AH{Discuss how we have that $\rpoly \geq O(\setsize)$. Discuss that we need $b-a$ to be small.} -%\end{proof} -% -%\qed -%\AH{{\bf END:} Old Stuff} +Since it is the case that computing the expected multiplicity of a compressed representation of a bag polynomial is hard, it is then desirable to have an algorithm to approximate the multiplicity in linear time, which is what we describe next. - -%\begin{Definition}[Polynomial]\label{def:polynomial} -%The expression $\poly(\vct{X})$ is a polynomial if it satisfies the standard mathematical definition of polynomial, and additionally is in the standard monomial basis. -%\end{Definition} - -%To clarify defintion ~\ref{def:polynomial}, a polynomial in the standard monomial basis is one whose monomials are in SOP form, and whose non-distinct monomials have been collapsed into one distinct monomial, with its corresponding coefficient accurately reflecting the number of monomials combined. - -Now, some useful definitions and notation. 
For illustrative purposes in the definitions below, let us consider when $\poly(\vct{X}) = 2x^2 + 3xy - 2y^2$. +First, let us introduce some useful definitions and notation. For illustrative purposes in the definitions below, let us consider when $\poly(\vct{X}) = 2x^2 + 3xy - 2y^2$. \begin{Definition}[Degree]\label{def:degree} The degree of polynomial $\poly(\vct{X})$ is the maximum sum of the exponents of a monomial, over all monomials. @@ -42,23 +10,15 @@ The degree of polynomial $\poly(\vct{X})$ is the maximum sum of the exponents of The degree of $\poly(\vct{X})$ in the above example is $2$. In this paper we consider only finite degree polynomials. -For example, the expression $xy$ is a monomial from the term $3xy$ of $\poly(\vct{X})$, produced from the set of variables $\vct{X} = \{x, y\}$. - -%\begin{Definition}[$|\vct{X}|$]\label{def:num-vars} -%Denote the number of variables in $\poly(\vct{X})$ as $|\vct{X}|$. -%\end{Definition} -% -%In the running example, $|\vct{X}| = 2$. - \begin{Definition}[Expression Tree]\label{def:express-tree} An expression tree $\etree$ is a binary %an ADT logically viewed as an n-ary tree, whose internal nodes are from the set $\{+, \times\}$, with leaf nodes being either from the set $\mathbb{R}$ $(\tnum)$ or from the set of monomials $(\var)$. The members of $\etree$ are \type, \val, \vari{partial}, \vari{children}, and \vari{weight}, where \type is the type of value stored in the node $\etree$ (i.e. one of $\{+, \times, \var, \tnum\}$, \val is the value stored, and \vari{children} is the list of $\etree$'s children where $\etree_\lchild$ is the left child and $\etree_\rchild$ the right child. Remaining fields hold values whose semantics we will fix later. When $\etree$ is used as input of ~\cref{alg:mon-sam} and ~\cref{alg:one-pass}, the values of \vari{partial} and \vari{weight} will not be set. 
%SEMANTICS FOR \etree: \vari{partial} is the sum of $\etree$'s coefficients , n, and \vari{weight} is the probability of $\etree$ being sampled. \end{Definition} -Note that $\etree$ need not encode an expression in the standard monomial basis, for example, when $\etree$ represents the expression $(x + 2y)(2x - y)$. +Note that $\etree$ need not encode an expression in the standard monomial basis. For example, instead of our running example, $\etree$ could represent a compressed form such as $(x + 2y)(2x - y)$. \begin{Definition}[poly$(\cdot)$]\label{def:poly-func} -Denote $poly(\etree)$ to be the function that takes as input expression tree $\etree$ and outputs its corresponding polynomial. Recursively defined on $\etree$ as follows, where $\etree_\lchild$ and $\etree_\rchild$ denote the left and right child of $\etree$ respectively. +Denote $poly(\etree)$ to be the function that takes as input expression tree $\etree$ and outputs its corresponding polynomial. $poly(\cdot)$ is recursively defined on $\etree$ as follows, where $\etree_\lchild$ and $\etree_\rchild$ denote the left and right child of $\etree$ respectively. % \begin{align*} % &\etree.\type = +\mapsto&& \polyf(\etree_\lchild) + \polyf(\etree_\rchild)\\ @@ -76,8 +36,6 @@ Denote $poly(\etree)$ to be the function that takes as input expression tree $\e \end{equation*} \end{Definition} -\AH{ -\par2) Below seems like over-defining to me. Is this really necessary? The first sentence I think is \textit{enough}.} Note that addition and multiplication above follow the standard interpretation over polynomials. %Specifically, when adding two monomials whose variables and respective exponents agree, the coefficients corresponding to the monomials are added and their sum is multiplied to the monomial. Multiplication here is denoted by concatenation of the monomial and coefficient. 
When two monomials are multiplied, the product of each corresponding coefficient is computed, and the variables in each monomial are multiplied, i.e., the exponents of like variables are added. Again we notate this by the direct product of coefficient product and all disitinct variables in the two monomials, with newly computed exponents. @@ -102,7 +60,7 @@ $\expandtree{\etree}$ is the pure sum of products expansion of $\etree$. The lo &\expandtree{\etree} = \\ &\begin{cases} \expandtree{\etree_\lchild} \circ \expandtree{\etree_\rchild} &\textbf{ if }\etree.\type = +\\ - \left\{(\monom_\lchild \cup \monom_\rchild, \coef_\lchild \cdot \coef_\rchild) ~|~ (\monom_\lchild, \coef_\lchild) \in \expandtree{\etree_\lchild}, (\monom_\rchild, \coef_\rchild) \in \expandtree{\etree_\rchild}\right\} &\textbf{ if }\etree.\type = \times\\ + \left\{(\monom_\lchild \cup \monom_\rchild, \coef_\lchild \cdot \coef_\rchild) ~|~\right.&\\ \left.(\monom_\lchild, \coef_\lchild) \in \expandtree{\etree_\lchild}, (\monom_\rchild, \coef_\rchild) \in \expandtree{\etree_\rchild}\right\} &\textbf{ if }\etree.\type = \times\\ \elist{(\emptyset, \etree.\val)} &\textbf{ if }\etree.\type = \tnum\\ \elist{(\{\etree.\val\}, 1)} &\textbf{ if }\etree.\type = \var.\\ \end{cases} @@ -173,9 +131,9 @@ For any query polynomial $\poly(\vct{X})$, an approximation of $\rpoly(\prob_1,\ \subsection{Approximating $\rpoly$} \subsubsection{Description} -Algorithm ~\ref{alg:mon-sam} approximates $\rpoly$ using the following steps. First, a call to $\onepass$ on its input $\etree$ produces a non-biased weight distribution over the monomials of $\expandtree{\etree}$ and a correct count of $|\etree|(1,\ldots, 1)$, i.e., the number of monomials in $\expandtree{\etree}$. Next, ~\cref{alg:mon-sam} calls $\sampmon$ to sample one monomial and its sign from $\expandtree{\etree}$. 
The sampling is repeated $\ceil{\frac{2\log{\frac{2}{\delta}}}{\epsilon^2}}$ times, where each of the samples are evaluated over $\vct{p}$, multiplied by $1 \times sign$, and summed. The final result is scaled accordingly returning an estimate of $\rpoly$ with the claimed $(\error, \conf)$-bound of ~\cref{lem:mon-samp}. +Algorithm ~\ref{alg:mon-sam} approximates $\rpoly$ using the following steps. First, a call to $\onepass$ on its input $\etree$ produces a non-biased weight distribution over the monomials of $\expandtree{\etree}$ and a correct count of $|\etree|(1,\ldots, 1)$, i.e., the number of monomials in $\expandtree{\etree}$. Next, ~\cref{alg:mon-sam} calls $\sampmon$ to sample one monomial and its sign from $\expandtree{\etree}$. The sampling is repeated $\ceil{\frac{2\log{\frac{2}{\delta}}}{\epsilon^2}}$ times, where each of the samples are evaluated with input $\vct{p}$, multiplied by $1 \times sign$, and summed. The final result is scaled accordingly returning an estimate of $\rpoly$ with the claimed $(\error, \conf)$-bound of ~\cref{lem:mon-samp}. -Kindly recall that the notaion $[x, y]$ denotes the range of values between $x$ and $y$ inclusive. The notation $\{x, y\}$ denotes the set of values consisting of $x$ and $y$. +Recall that the notation $[x, y]$ denotes the range of values between $x$ and $y$ inclusive. The notation $\{x, y\}$ denotes the set of values consisting of $x$ and $y$. \subsubsection{Psuedo Code} \begin{algorithm}[H] @@ -212,10 +170,10 @@ We state the lemmas for $\onepass$ and $\sampmon$, the auxiliary algorithms on w The $\onepass$ function completes in $O(size(\etree))$ time. After $\onepass$ returns the following post conditions hold. First, that $\abs{\vari{S}}(1,\ldots, 1)$ is correctly computed for each subtree $\vari{S}$ of $\etree$. 
Second, when $\vari{S}.\val = +$, the weighted distribution $\frac{\abs{\vari{S}_{\vari{child}}}(1,\ldots, 1)}{\abs{\vari{S}}(1,\ldots, 1)}$ is correctly computed for each child of $\vari{S}.$
 \end{Lemma}
 
-At the conclusion of $\onepass$, $\etree.\vari{partial}$ will hold sum of all coefficients in $\expandtree{\abs{\etree}}$, i.e., $\sum\limits_{(\monom, \coef) \in \expandtree{\abs{\etree}}}\coef$.  $\etree.\vari{weight}$ will hold the weighted probability that $\etree$ is sampled from from its parent $+$ node.
+At the conclusion of $\onepass$, $\etree.\vari{partial}$ will hold the sum of all coefficients in $\expandtree{\abs{\etree}}$, i.e., $\sum\limits_{(\monom, \coef) \in \expandtree{\abs{\etree}}}\coef$.  $\etree.\vari{weight}$ will hold the weighted probability that $\etree$ is sampled from its parent $+$ node.
 
 \begin{Lemma}\label{lem:sample}
-The function $\sampmon$ complete in $O(\log{k} \cdot k \cdot depth(\etree))$ time, where $k = \degree(poly(\abs{\etree})$.  Upon completion, with probability $\frac{|\coef|}{\abs{\etree}(1,\ldots, 1)}$, $\sampmon$ returns the sampled term $\left(\monom, sign(\coef)\right)$ from $\expandtree{\abs{\etree}}$.
+The function $\sampmon$ completes in $O(\log{k} \cdot k \cdot depth(\etree))$ time, where $k = \degree(poly(\abs{\etree}))$.  Upon completion, with probability $\frac{|\coef|}{\abs{\etree}(1,\ldots, 1)}$, $\sampmon$ returns the sampled term $\left(\monom, sign(\coef)\right)$ from $\expandtree{\abs{\etree}}$.
 \end{Lemma}
 
 \begin{Theorem}\label{lem:mon-samp}
@@ -238,7 +196,10 @@ Consider now a set of $\samplesize$ random variables $\vct{\randvar}$, where each
 $\expct\pbox{\randvar_i} = \sum\limits_{(\monom, \coef) \in \expandtree{\etree}}\frac{\coef \cdot \evalmp(\monom, p)}{\sum\limits_{(\monom, \coef) \in \expandtree{\etree}}|\coef|} = \frac{\rpoly(\prob_1,\ldots, \prob_\numvar)}{\abs{\etree}(1,\ldots, 1)}$.  Let $\empmean = \frac{1}{\samplesize}\sum_{i = 1}^{\samplesize}\randvar_i$.
It is also true that -\[\expct\pbox{\empmean} = \expct\pbox{ \frac{1}{\samplesize}\sum_{i = 1}^{\samplesize}\randvar_i} = \frac{1}{\samplesize}\sum_{i = 1}^{\samplesize}\expct\pbox{\randvar_i} = \frac{1}{\samplesize}\sum_{i = 1}^{\samplesize}\sum\limits_{(\monom, \coef) \in \expandtree{\etree}}\frac{\coef \cdot \evalmp(\monom, \vct{p})}{\sum\limits_{(\monom, \coef) \in \expandtree{\etree}}|\coef|} = \frac{\rpoly(\prob_1,\ldots, \prob_\numvar)}{\abs{\etree}(1,\ldots, 1)}.\] +\begin{align*} +&\expct\pbox{\empmean} = \expct\pbox{ \frac{1}{\samplesize}\sum_{i = 1}^{\samplesize}\randvar_i} = \frac{1}{\samplesize}\sum_{i = 1}^{\samplesize}\expct\pbox{\randvar_i}\nonumber\\ +&= \frac{1}{\samplesize}\sum_{i = 1}^{\samplesize}\sum\limits_{(\monom, \coef) \in \expandtree{\etree}}\frac{\coef \cdot \evalmp(\monom, \vct{p})}{\sum\limits_{(\monom, \coef) \in \expandtree{\etree}}|\coef|} = \frac{\rpoly(\prob_1,\ldots, \prob_\numvar)}{\abs{\etree}(1,\ldots, 1)}. +\end{align*} Hoeffding' inequality can be used to compute an upper bound on the number of samples $\samplesize$ needed to establish the $(\error, \conf)$-bound. 
The inequality states that if we know that each $\randvar_i$ is strictly bounded by the intervals $[a_i, b_i]$, then it is true that \begin{equation*} @@ -322,7 +283,9 @@ The evaluation of $\abs{\etree}(1,\ldots, 1)$ can be defined recursively, as fol In the same fashion the weighted distribution can be described as above with the following modification for the case when $\etree.\type = +$: \begin{align*} -&\abs{\etree_\lchild}(1,\ldots, 1) + \abs{\etree_\rchild}(1,\ldots, 1); \etree_\lchild.\vari{weight} \gets \frac{\abs{\etree_\lchild}(1,\ldots, 1)}{\abs{\etree_\lchild}(1,\ldots, 1) + \abs{\etree_\rchild}(1,\ldots, 1)}, \etree_\rchild.\vari{weight} \gets \frac{\abs{\etree_\rchild}(1,\ldots, 1)}{\abs{\etree_\lchild}(1,\ldots, 1)+ \abs{\etree_\rchild}(1,\ldots, 1)} &\textbf{if }\etree.\type = + +&\abs{\etree_\lchild}(1,\ldots, 1) + \abs{\etree_\rchild}(1,\ldots, 1); &\textbf{if }\etree.\type = + \\ +&\etree_\lchild.\vari{weight} \gets \frac{\abs{\etree_\lchild}(1,\ldots, 1)}{\abs{\etree_\lchild}(1,\ldots, 1) + \abs{\etree_\rchild}(1,\ldots, 1)};\\ +&\etree_\rchild.\vari{weight} \gets \frac{\abs{\etree_\rchild}(1,\ldots, 1)}{\abs{\etree_\lchild}(1,\ldots, 1)+ \abs{\etree_\rchild}(1,\ldots, 1)} \end{align*} %\begin{align*} @@ -372,13 +335,13 @@ See algorithm ~\ref{alg:one-pass} for details. \end{algorithm} \begin{Example}\label{example:one-pass} -Consider the when $\etree$ is $+\left(\times\left(+\left(\times\left(1, x_1\right), \times\left(1, x_2\right)\right), +\left(\times\left(1, x_1\right) as seen in ~\cref{fig:expr-tree-T-wght}, \times\left(-1, x_2\right)\right)\right), \times\left(\times\left(1, x_2\right), \times\left(1, x_2\right)\right)\right)$, which encodes the expression $(x_1 + x_2)(x_1 - x_2) + x_2^2$. After one pass, \cref{alg:one-pass} would have computed the following weight distribution. For the two children of the root $+$ node $\etree$, $\etree_\lchild.\wght = \frac{4}{5}$ and $\etree_\rchild.\wght = \frac{1}{5}$. 
Similarly, $\stree \gets \etree_\lchild$, $\stree_\lchild.\wght = \stree_\rchild.\wght = \frac{1}{2}$. Note that in this example, the sampling probabilities for the children of each inner $+$ node of $\stree$ are equal to one another because both parents have the same number of children, and, in each case, the children of each parent $+$ node share the same $|\coef_i|$. +Consider when $\etree$ encodes the expression $(x_1 + x_2)(x_1 - x_2) + x_2^2$. After one pass, \cref{alg:one-pass} would have computed the following weight distribution. For the two children of the root $+$ node $\etree$, $\etree_\lchild.\wght = \frac{4}{5}$ and $\etree_\rchild.\wght = \frac{1}{5}$. Similarly, $\stree \gets \etree_{\lchild_\lchild}$, $\stree_\lchild.\wght = \stree_\rchild.\wght = \frac{1}{2}$. Note that in this example, the sampling probabilities for the children of each inner $+$ node of $\stree$ are equal to one another because both parents have the same number of children, and, in each case, the children of each parent $+$ node share the same $|\coef_i|$. \end{Example} \begin{figure}[h!] \begin{tikzpicture}[thick, every tree node/.style={default_node, thick, draw=black, black, circle, text width=0.3cm, font=\bfseries, minimum size=0.65cm}, every child/.style={black}, edge from parent/.style={draw, thick}, -level 1/.style={sibling distance=2.5cm}, -level 2/.style={sibling distance=1.25cm}, +level 1/.style={sibling distance=1.25cm}, +level 2/.style={sibling distance=1.0cm}, %level 2+/.style={sibling distance=0.625cm} %level distance = 1.25cm, %sibling distance = 1cm, diff --git a/lin_sys.tex b/lin_sys.tex index 6ce3535..7b82037 100644 --- a/lin_sys.tex +++ b/lin_sys.tex @@ -218,3 +218,12 @@ Putting \cref{eq:det-1}, \cref{eq:det-2}, \cref{eq:det-3} together, we have, Thus, by ~\cref{lem:lin-sys} we have proved ~\cref{th:single-p} for fixed $p \in (0, 1)$. 
\end{proof} \qed + +\begin{Corollary}\label{cor:single-p-gen-k} +For every value $\kElem \geq 3$, there exists a query with $\kElem$ product width that is hard. +\end{Corollary} +\begin{proof}[Proof of Corollary ~\cref{cor:single-p-gen-k}] +Consider $\poly^3_{G}$ and $\poly' = 1$ such that $\poly'' = \poly^3_{G} \cdot \poly'$. By ~\cref{th:single-p}, query $\poly''$ with $\kElem = 4$ is hard. +\end{proof} + +\qed From a8331ef52e64202b331be8bbf9ee5f416067e26d Mon Sep 17 00:00:00 2001 From: Aaron Huber Date: Tue, 8 Dec 2020 11:59:46 -0500 Subject: [PATCH 03/17] Small adjustments to approx algo. --- approx_alg.tex | 485 ++------------------------------------- lin_sys.tex | 16 +- macros.tex | 2 + main.tex | 2 +- mult_distinct_p.tex | 104 +-------- retracted_bidb_stuff.tex | 451 ++++++++++++++++++++++++++++++++++++ single_p.tex | 14 +- 7 files changed, 496 insertions(+), 578 deletions(-) create mode 100644 retracted_bidb_stuff.tex diff --git a/approx_alg.tex b/approx_alg.tex index 75cd41d..fc78c93 100644 --- a/approx_alg.tex +++ b/approx_alg.tex @@ -428,16 +428,16 @@ For the base case, $d = 0$, it is the case that the node is a leaf and therefore Let the inductive hypothesis be the assumption that for $d \leq k$ for $k \geq 1$, lemma ~\ref{lem:one-pass} is true for algorithm ~\ref{alg:one-pass}. -Now prove that lemma ~\ref{lem:one-pass} holds for $k + 1$. Notice that $\etree$ has at most two children, $\etree_\lchild$ and $\etree_\rchild$. Note also, that for each child, it is the case that $d \leq k$. Then, by inductive hypothesis, lemma ~\ref{lem:one-pass} holds for each existing child, and we are left with two possibilities for $\etree$. The first case is when $\etree$ is a $+$ node. When this happens, algorithm ~\ref{alg:one-pass} computes $|T_\lchild|(1,\ldots, 1) + |T_\rchild|(1,\ldots, 1)$ on line ~\ref{alg:one-pass-plus-add} which by definition is $\abs{\etree}(1,\ldots, 1)$ and hence the inductive hypothesis holds in this case. 
For the distribution of the children of $+$, algorithm ~\ref{alg:one-pass} computes $P(\etree_i) = \frac{|T_i|(1,\ldots, 1)}{|T_\lchild|(1,\ldots, 1) + |T_\rchild|(1,\ldots, 1)}$ which is indeed the case. The second case is when the $\etree.\val = \times$. By inductive hypothesis, it is the case that both $\abs{\etree_\lchild}\polyinput{1}{1}$ and $\abs{\etree_\rchild}\polyinput{1}{1}$ have been correctly computed. On line~\ref{alg:one-pass-times-product} algorithm ~\ref{alg:one-pass} then computes the product of the subtree partial values, $|T_\lchild|(1,\ldots, 1) \times |T_\rchild|(1,\ldots, 1)$ which by definition is $\abs{\etree}(1,\ldots, 1)$. +Now prove that lemma ~\ref{lem:one-pass} holds for $k + 1$. Notice that $\etree$ has at most two children, $\etree_\lchild$ and $\etree_\rchild$. Note also, that for each child, it is the case that $d \leq k$. Then, by inductive hypothesis, lemma ~\ref{lem:one-pass} holds for each existing child, and we are left with two possibilities for $\etree$. The first case is when $\etree$ is a $+$ node. When this happens, algorithm ~\ref{alg:one-pass} computes $|T_\lchild|(1,\ldots, 1) + |T_\rchild|(1,\ldots, 1)$ on line ~\ref{alg:one-pass-plus-add} which by definition is $\abs{\etree}(1,\ldots, 1)$ and hence the inductive hypothesis holds in this case. For the weight computation of the children of $+$, by lines ~\ref{alg:one-pass-plus-add}, ~\ref{alg:one-pass-plus-assign2}, and ~\ref{alg:one-pass-plus-prob} algorithm ~\ref{alg:one-pass} computes $\etree_i.\wght = \frac{|T_i|(1,\ldots, 1)}{|T_\lchild|(1,\ldots, 1) + |T_\rchild|(1,\ldots, 1)}$ which is indeed as claimed. The second case is when the $\etree.\val = \times$. By inductive hypothesis, it is the case that both $\abs{\etree_\lchild}\polyinput{1}{1}$ and $\abs{\etree_\rchild}\polyinput{1}{1}$ have been correctly computed. 
On line~\ref{alg:one-pass-times-product} algorithm ~\ref{alg:one-pass} then computes the product of the subtree partial values, $|T_\lchild|(1,\ldots, 1) \times |T_\rchild|(1,\ldots, 1)$ which by definition is $\abs{\etree}(1,\ldots, 1)$. %That $\onepass$ makes exactly one traversal of $\etree$ follows by noting for lines ~\ref{alg:one-pass-equality1} and ~\ref{alg:one-pass-equality2} are the checks for the non-base cases, where in each matching exactly one recursive call is made on each of $\etree.\vari{children}$. For the base cases, lines ~\ref{alg:one-pass-equality3} and ~\ref{alg:one-pass-equality4} both return values without making any further recursive calls. Since all nodes are covered by the cases, and the base cases cover only leaf nodes, it follows that algorithm ~\ref{alg:one-pass} then terminates after it visits every node exactly one time. -To conclude, note that when $\etree.\type = +$, the compuatation of $\etree_\lchild.\wght$ and $\etree_\rchild.\wght$ are solely dependent on the correctness of $\abs{\etree}\polyinput{1}{1}$, $\abs{\etree_\lchild}\polyinput{1}{1}$, and $\abs{\etree_\rchild}\polyinput{1}{1}$, which have already been argued to be correct. +%To conclude, note that when $\etree.\type = +$, the compuatation of $\etree_\lchild.\wght$ and $\etree_\rchild.\wght$ are solely dependent on the correctness of $\abs{\etree}\polyinput{1}{1}$, $\abs{\etree_\lchild}\polyinput{1}{1}$, and $\abs{\etree_\rchild}\polyinput{1}{1}$, which have already been argued to be correct. \paragraph{Run-time Analysis} -The runtime for \textsc{OnePass} is fairly straight forward. Note that line ~\ref{alg:one-pass-equality1}, ~\ref{alg:one-pass-equality2}, and ~\ref{alg:one-pass-equality3} give a constant number of equality checks per node. 
Then, for $+$ nodes, lines ~\ref{alg:one-pass-plus-add} and ~\ref{alg:one-pass-plus-prob} (note there is a \textit{constant} factor of $2$ here) perform a constant number of arithmetic operations, while ~\ref{alg:one-pass-plus-assign1} ~\ref{alg:one-pass-plus-assign2}, and ~\ref{alg:one-pass-plus-assign3} all have $O(1)$ assignments. Similarly, when a $\times$ node is visited, lines \ref{alg:one-pass-times-assign1}, \ref{alg:one-pass-times-assign2}, and \ref{alg:one-pass-times-assign3} have $O(1)$ assignments, while line ~\ref{alg:one-pass-times-product} has $O(1)$ product operations per node. For leaf nodes, ~\cref{alg:one-pass-leaf-assign1} and ~\cref{alg:one-pass-global-assign} are both $O(1)$ assignment.
+The runtime for \textsc{OnePass} is fairly straightforward. Note that lines ~\ref{alg:one-pass-equality1}, ~\ref{alg:one-pass-equality2}, and ~\ref{alg:one-pass-equality3} give a constant number of equality checks per node. Then, for $+$ nodes, lines ~\ref{alg:one-pass-plus-add} and ~\ref{alg:one-pass-plus-prob} perform a constant number of arithmetic operations, while ~\ref{alg:one-pass-plus-assign1}, ~\ref{alg:one-pass-plus-assign2}, and ~\ref{alg:one-pass-plus-assign3} all have $O(1)$ assignments. Similarly, when a $\times$ node is visited, lines \ref{alg:one-pass-times-assign1}, \ref{alg:one-pass-times-assign2}, and \ref{alg:one-pass-times-assign3} have $O(1)$ assignments, while line ~\ref{alg:one-pass-times-product} has $O(1)$ product operations per node. For leaf nodes, ~\cref{alg:one-pass-leaf-assign1} and ~\cref{alg:one-pass-global-assign} are both $O(1)$ assignments.
 
-Thus, the algorithm visits each node of $\etree$ one time, with a constant number of operations for all of the $+$, $\times$, and leaf nodes, leading to a runtime of $O\left(\treesize(\etree)\right)$, and completes the proof. 
+Thus, the algorithm visits each node of $\etree$ one time, with a constant number of operations for all of the $+$, $\times$, and leaf nodes, leading to a runtime of $O\left(\treesize(\etree)\right)$, and this completes the proof.
 \end{proof}
 
 \qed
 
@@ -497,31 +497,32 @@ See algorithm ~\ref{alg:sample} for the details of $\sampmon$ algorithm.
 
 \begin{proof}[Proof of Lemma ~\ref{lem:sample}]
 First, we need to show that $\sampmon$ indeed returns a monomial $\monom$, such that $(\monom, \coef)$ is in $\expandtree{\etree}$.
-For the base case of the depth $d$ of $\etree$ is $0$, we have that the root node is either a constant $\coef$ for which case lines ~\ref{alg:sample-global1} and ~\ref{alg:sample-num-return} we return $\{~\}$, or we have that $\etree.\type = \var$ and $\etree.\val = x$, in which case by lines ~\ref{alg:sample-var-union} and ~\ref{alg:sample-var-return} we return $\{x\}$. Both cases satisfy the definition of a monomial, and the base case is proven.
+For the base case, let the depth $d$ of $\etree$ be $0$. We have that the root node is either a constant $\coef$ for which by line ~\ref{alg:sample-num-return} we return $\{~\}$, or we have that $\etree.\type = \var$ and $\etree.\val = x$, and by line ~\ref{alg:sample-var-return} we return $\{x\}$. Both cases satisfy ~\cref{def:monomial}, and the base case is proven.
 
-By inductive hyptothesis, assume that for $d \leq k$ for $k \geq 0$, that it is indeed the case that $\sampmon$ returns a monomial.
+By inductive hypothesis, assume that for $d \leq k$ for $k \geq 1$, it is indeed the case that $\sampmon$ returns a monomial.
 
-For the inductive step, let us take a tree $\etree$ with $d = k + 1$. Note that each child has depth $d \leq k$, and by inductive hyptothesis both of them return a valid monomial. Then the root can be either a $+$ or $\times$ node. For the case of a $+$ root node, line ~\ref{alg:sample-plus-bsamp} of $\sampmon$ will choose one of the children of the root. 
Since by hypothesis it is the case that a monomial is being returned from either child, and only one of these monomials is selected, we have for the case of $+$ root node that a valid monomial is returned by $\sampmon$. When the root is a $\times$ node, lines ~\ref{alg:sample-times-union} and ~\ref{alg:sample-times-product} multiply the monomials returned by the two children of the root, and by definition ~\ref{def:monomial} the product of two monomials is also a monomial, which means that $\sampmon$ returns a vaild monomial for the $\times$ root node, thus concluding the fact that $\sampmon$ indeed returns a monomial.
+For the inductive step, let us take a tree $\etree$ with $d = k + 1$. Note that each child has depth $d \leq k$, and by inductive hypothesis both of them return a valid monomial. Then the root can be either a $+$ or $\times$ node. For the case of a $+$ root node, line ~\ref{alg:sample-plus-bsamp} of $\sampmon$ will choose one of the children of the root. Since by inductive hypothesis it is the case that a monomial is being returned from either child, and only one of these monomials is selected, we have for the case of $+$ root node that a valid monomial is returned by $\sampmon$. When the root is a $\times$ node, lines ~\ref{alg:sample-times-union} and ~\ref{alg:sample-times-product} multiply the monomials returned by the two children of the root, and by definition ~\ref{def:monomial} the product of two monomials is also a monomial, which means that $\sampmon$ returns a valid monomial for the $\times$ root node, thus concluding the fact that $\sampmon$ indeed returns a monomial.
 
 %Note that for any monomial sampled by algorithm ~\ref{alg:sample}, the nodes traversed form a subgraph of $\etree$ that is \textit{not} a subtree in the general case. We thus seek to prove that the subgraph traversed produces the correct probability corresponding to the monomial sampled. 
We seek to prove by induction on the depth $d$ of $\etree$ that the subgraph traversed by $\sampmon$ has a probability that is in accordance with the monomial sampled, $\frac{|\coef|}{\abs{\etree}\polyinput{1}{1}}$.
 
-For the base case $d = 0$, by definition ~\ref{def:express-tree} we know that the root has to be either a coefficient or a variable. For either case, the probability of the value returned is $1$ since there is only one value to sample from. When the root is a variable $x$ the algorithm correctly returns $(\{x\}, 1 )$. When the root is a coefficient, \sampmon ~correctly returns $(\{~\}, sign(\coef_i) \times 1)$.
+For the base case $d = 0$, by definition ~\ref{def:express-tree} we know that the root has to be either a coefficient or a variable. For either case, the probability of the value returned is $1$ since there is only one value to sample from. When the root is a variable $x$ the algorithm correctly returns $(\{x\}, 1 )$. When the root is a coefficient, \sampmon ~correctly returns $(\{~\}, \text{sign}(\coef_i))$.
 
-For the inductive hypothesis, assume that for $d \leq k$ and $k \geq 0$ $\sampmon$ indeed samples $\monom$ in $(\monom, \coef)$ in $\expandtree{\etree}$ with probability $\frac{|\coef|}{\abs{\etree}\polyinput{1}{1}}$.%bove is true.%lemma ~\ref{lem:sample} is true.
+For the inductive hypothesis, assume that for $d \leq k$ and $k \geq 1$, $\sampmon$ indeed samples $\monom$ in $(\monom, \coef)$ in $\expandtree{\etree}$ with probability $\frac{|\coef|}{\abs{\etree}\polyinput{1}{1}}$.%bove is true.%lemma ~\ref{lem:sample} is true.
 
-Prove now, that when $d = k + 1$ the correctness holds. It is the case that the root of $\etree$ has up to two children $\etree_\lchild$ and $\etree_\rchild$. 
Since $\etree_\lchild$ and $\etree_\rchild$ are both depth $d \leq k$, by inductive hypothesis correctness holds for both of them, thus, $\sampmon$ has sampled both monomials $\monom_\lchild$ in $(\monom_\lchild, \coef_\lchild)$ of $\expandtree{\etree_\lchild}$ and $\monom_\rchild$ in $(\monom_\rchild, \coef_\rchild)$ of $\expandtree{\etree_\rchild}$, from $\etree_\lchild$ and $\etree_\rchild$ with probability $\frac{|\coef_\lchild|}{\abs{\etree_\lchild}\polyinput{1}{1}}$ and $\frac{|\coef_\rchild|}{\abs{\etree_\rchild}\polyinput{1}{1}}$. +Prove now, that when $d = k + 1$ the correctness holds. It is the case that the root of $\etree$ has up to two children $\etree_\lchild$ and $\etree_\rchild$. Since $\etree_\lchild$ and $\etree_\rchild$ are both depth $d \leq k$, by inductive hypothesis correctness holds for both of them, thus, $\sampmon$ will sample both monomials $\monom_\lchild$ in $(\monom_\lchild, \coef_\lchild)$ of $\expandtree{\etree_\lchild}$ and $\monom_\rchild$ in $(\monom_\rchild, \coef_\rchild)$ of $\expandtree{\etree_\rchild}$, from $\etree_\lchild$ and $\etree_\rchild$ with probability $\frac{|\coef_\lchild|}{\abs{\etree_\lchild}\polyinput{1}{1}}$ and $\frac{|\coef_\rchild|}{\abs{\etree_\rchild}\polyinput{1}{1}}$. Then the root has to be either a $+$ or $\times$ node. Consider the case when the root is $\times$. Note that we are sampling a term from $\expandtree{\etree}$. Consider $(\monom, \coef)$ in $\expandtree{\etree}$, where $\monom$ is the sampled monomial. Notice also that it is the case that $\monom = \monom_\lchild \times \monom_\rchild$, where $\monom_\lchild$ is coming from $\etree_\lchild$ and $\monom_\rchild$ from $\etree_\rchild$. The probability that \sampmon$(\etree_{\lchild})$ returns $\monom_\lchild$ is $\frac{|\coef_{\monom_\lchild}|}{|\etree_\lchild|(1,\ldots, 1)}$ and $\frac{|\coef_{\monom_\lchild}|}{\abs{\etree_\rchild}\polyinput{1}{1}}$ for $\monom_R$. 
Since both $\monom_\lchild$ and $\monom_\rchild$ are sampled with independent randomness, the final probability for sample $\monom$ is then $\frac{|\coef_{\monom_\lchild}| \cdot |\coef_{\monom_R}|}{|\etree_\lchild|(1,\ldots, 1) \cdot |\etree_\rchild|(1,\ldots, 1)}$. For $(\monom, \coef)$ in \expandtree{\etree}, it is indeed the case that $|\coef_i| = |\coef_{\monom_\lchild}| \cdot |\coef_{\monom_\rchild}|$ and that $\abs{\etree}(1,\ldots, 1) = |\etree_\lchild|(1,\ldots, 1) \cdot |\etree_\rchild|(1,\ldots, 1)$, and therefore $\monom$ is sampled with correct probability $\frac{|\coef_i|}{\abs{\etree}(1,\ldots, 1)}$. -For the case when $\etree.\val = +$, \sampmon ~will sample monomial $\monom$ from one of its children. By inductive hypothesis we know that any $\monom_\lchild$ in $\expandtree{\etree_\lchild}$ and any $\monom_\rchild$ in $\expandtree{\etree_\rchild}$ will both be sampled with correct probability $\frac{|\coef_{\monom_\lchild}|}{\etree_{\lchild}(1,\ldots, 1)}$ and $\frac{|\coef_{\monom_\rchild}|}{|\etree_\rchild|(1,\ldots, 1)}$, where either $\monom_\lchild$ or $\monom_\rchild$ will equal $\monom$, depending on whether $\etree_\lchild$ or $\etree_\rchild$ is sampled. Assume that $\monom$ is sampled from $\etree_\lchild$, and note that a symmetric argument holds for the case when $\monom$ is sampled from $\etree_\rchild$. Notice also that the probability of sampling $\etree_\lchild$ from $\etree$ is $\frac{\abs{\etree_\lchild}\polyinput{1}{1}}{\abs{\etree_\lchild}\polyinput{1}{1} + \abs{\etree_\rchild}\polyinput{1}{1}}$ as computed by $\onepass$. 
Then, since $\sampmon$ goes top-down, and each sampling choice is independent (which follows from the randomness in the root of $\etree$ being independent from the randomness used in its subtrees), the probability for $\monom$ to be sampled from $\etree$ is equal to the product of the probability that $\etree_\lchild$ is sampled from $\etree$ and $\monom$ is sampled in $\etree_\lchild$, and
+For the case when $\etree.\val = +$, \sampmon ~will sample monomial $\monom$ from one of its children. By inductive hypothesis we know that any $\monom_\lchild$ in $\expandtree{\etree_\lchild}$ and any $\monom_\rchild$ in $\expandtree{\etree_\rchild}$ will both be sampled with correct probability $\frac{|\coef_{\monom_\lchild}|}{|\etree_{\lchild}|(1,\ldots, 1)}$ and $\frac{|\coef_{\monom_\rchild}|}{|\etree_\rchild|(1,\ldots, 1)}$, where either $\monom_\lchild$ or $\monom_\rchild$ will equal $\monom$, depending on whether $\etree_\lchild$ or $\etree_\rchild$ is sampled. Assume that $\monom$ is sampled from $\etree_\lchild$, and note that a symmetric argument holds for the case when $\monom$ is sampled from $\etree_\rchild$. Notice also that the probability of choosing $\etree_\lchild$ from $\etree$ is $\frac{\abs{\etree_\lchild}\polyinput{1}{1}}{\abs{\etree_\lchild}\polyinput{1}{1} + \abs{\etree_\rchild}\polyinput{1}{1}}$ as computed by $\onepass$. 
Then, since $\sampmon$ goes top-down, and each sampling choice is independent (which follows from the randomness in the root of $\etree$ being independent from the randomness used in its subtrees), the probability for $\monom$ to be sampled from $\etree$ is equal to the product of the probability that $\etree_\lchild$ is sampled from $\etree$ and $\monom$ is sampled in $\etree_\lchild$, and \begin{align*} -P(\sampmon(\etree) = \monom) = &P(\sampmon(\etree_\lchild) = \monom) \cdot P(SampledChild(\etree) = \etree_\lchild)\\ -= &\frac{|\coef_\monom|}{|\etree_\lchild|(1,\ldots, 1)} \cdot \frac{\abs{\etree_\lchild}(1,\ldots, 1)}{|\etree_\lchild|(1,\ldots, 1) + |\etree_\rchild|(1,\ldots, 1)}\\ -= &\frac{|\coef_\monom|}{\abs{\etree}(1,\ldots, 1)}, +&P(\sampmon(\etree) = \monom) = \\ +&P(\sampmon(\etree_\lchild) = \monom) \cdot P(SampledChild(\etree) = \etree_\lchild)\\ +&= \frac{|\coef_\monom|}{|\etree_\lchild|(1,\ldots, 1)} \cdot \frac{\abs{\etree_\lchild}(1,\ldots, 1)}{|\etree_\lchild|(1,\ldots, 1) + |\etree_\rchild|(1,\ldots, 1)}\\ +&= \frac{|\coef_\monom|}{\abs{\etree}(1,\ldots, 1)}, \end{align*} and we obtain the desired result. @@ -532,7 +533,7 @@ We now bound the number of recursive calls in $\sampmon$ by $O\left(k\cdot depth Now, we prove by induction on the depth $d$ of tree $\etree$ the following claim. \begin{Claim}\label{claim:num-nodes-level-i} -The number of nodes in expression tree $\etree$ at arbitrary level $i$ is bounded by the count of $\times$ nodes in levels $[0, i - 1] + 1$. +The number of nodes in a sample subgraph of expression tree $\etree$ at arbitrary level $i$ is bounded by the count of $\times$ nodes in levels $[0, i - 1] + 1$. 
\end{Claim} \begin{proof}[Proof of Claim ~\ref{claim:num-nodes-level-i}] @@ -545,462 +546,12 @@ The inductive step is to show that for arbitrary $\etree$ with depth = $d + 1 \l \qed -By ~\cref{def:degree}, a sampled monomial will have $O(k)$ $\times$ nodes, and this implies $O(k)$ nodes at $\leq$ $depth(\etree)$ levels of the $\sampmon$ subgraph, bounding the number of recursive calls to $O(k \cdot depth(\etree))$. +By ~\cref{def:degree}, a sampled monomial will have $O(k)$ $\times$ nodes, and this along with ~\cref{claim:num-nodes-level-i} implies $O(k)$ nodes at $\leq$ $depth(\etree)$ levels of the $\sampmon$ subgraph, bounding the number of recursive calls to $O(k \cdot depth(\etree))$. Globally, lines ~\ref{alg:sample-global1} and ~\ref{alg:sample-global2} are $O(1)$ time. For the $+$ node, line ~\ref{alg:sample-plus-bsamp} has $O(1)$ time by the fact that $\etree$ is binary. Line ~\ref{alg:sample-plus-union} has $O(\log{k})$ time by nature of the TreeSet datastructure and the fact that by definition any monomial sampled from $\expandtree{\etree}$ has degree $\leq k$ and hence at most $k$ distinct variables, which in turn implies that the TreeSet has $\leq k$ elements in it at any time. -Finally, line ~\ref{alg:sample-plus-product} is in $O(1)$ for a product and an assignment operation. When a times node is visited, the same union, product, and assignment operations take place, and we again have $O(\log{k})$ runtime. When a variable leaf node is traversed, the same union operation occurs with $O(\log{k})$ runtime, and a constant leaf node has the above mentioned product and assignment operations. Thus for each node visited, we have $O(\log{k})$ runtime, and the final runtime for $\sampmon$ is $O(\log{k} \cdot k \cdot depth(\etree))$. +Finally, line ~\ref{alg:sample-times-product} is in $O(1)$ for a product and an assignment operation. 
When a times node is visited, the same union, product, and assignment operations take place, and we again have $O(\log{k})$ runtime. When a variable leaf node is traversed, the same union operation occurs with $O(\log{k})$ runtime, and a constant leaf node has the above mentioned product and assignment operations. Thus for each node visited, we have $O(\log{k})$ runtime, and the final runtime for $\sampmon$ is $O(\log{k} \cdot k \cdot depth(\etree))$. \end{proof} \qed - -\AH{\large\bf{New stuff 092520.}} - -\begin{Claim}\label{claim:constpk-TI} -Given a positive query polynomial $\poly$ over a $\ti$, with constant $\prob$ such that there exists a $\prob_0$ where for all $\prob_i, \prob_0 \leq \prob_i$, and constant $k = \degree(\poly)$, the ratio $\frac{\abs{\etree}(1,\ldots, 1)}{\rpoly(\prob_1,\ldots, \prob_\numvar)}$ is constant. -\end{Claim} - -\begin{proof}[Proof of Claim ~\ref{claim:constpk-TI}] -By independence, a $\ti$ has the property that all of its annotations are positive. Combined with the fact that ~\cref{claim:constpk-TI} uses only positive queries, i.e., queries that only use $\oplus$ and $\otimes$ semiring operators over its polynomial annotations, it is the case that no negation exists pre or post query. - -For any $\poly$ then, it is true that all coefficients in $\abs{\etree}(1,\ldots, 1)$ are positive and thus the same as their $\rpoly$ counterparts. This then implies that the ratio $\frac{\abs{\etree}(1,\ldots, 1)}{\rpoly(\prob_1,\ldots, \prob_\numvar)} \leq \frac{\abs{\etree}(1,\ldots, 1)}{\abs{\etree}(1,\ldots, 1) \cdot \prob_0^k}$, which is indeed a constant. -\end{proof} - -\qed - -\subsection{$\rpoly$ over $\bi$} -\AH{A general sufficient condition is the $\bi$ having fixed block size (thus implying increasing number of blocks for growing $\numvar$). 
For increasing $\numvar$, the ratio $\frac{\abs{\etree}(1,\ldots, 1)}{\rpoly(\prob_1,\ldots, \prob_\numvar)}$ can be proven to be a constant since, as $\numvar$ increases, it has to be the case that new blocks are added, and this results in a constant number of terms cancelled out by $\rpoly$, with the rest surviving, which gives us a constant $\frac{\abs{\etree}(1,\ldots, 1)}{\rpoly(\prob_1,\ldots, \prob_\numvar)}$. -\par In the general case, with fixed number of blocks and growing $\numvar$, all additional terms will be cancelled out by $\rpoly$ while for $\abs{\etree}(1,\ldots, 1)$ it is the case that it will grow exponentially with $\numvar$, yielding a ratio $\frac{O(2^\numvar)}{O(1)}$ and (as will be seen) greater.} - -\subsubsection{Known Reduction Result $\bi \mapsto \ti$} - -Denote an arbitrary $\bi$ as $\bipdb = (\bipd, \biwset)$ and a constructed $\ti$ to be $\tipdb = (\tipd, \tiwset)$, the details to be described next. -It is well known that $\bipdb$ can be reduced to a query $\poly$ over $\tipdb$. For completeness, let us describe the reduction. - -Let tuples in $\bipdb$ be denoted $a_{\block, i}$ and their $\tipdb$ counterparts as $x_{\block, i}$, where $\block$ represents the block id in which $a_{\block, i}$ resides. - -\begin{Theorem}\label{theorem:bi-red-ti} -For any $\bipdb$, there exists a query $\poly$ and $\tipdb$ such that $\poly(\tiwset)$ over distribution $\tipd$ outputs elements in $\biwset$ according to their respective probabilities in $\bipd$. -\end{Theorem} - -\begin{Definition}[Total Ordering $\biord$]\label{def:bi-red-ti-order} -The order $\biord$ is a fixed total order across all tuples in block $\block$ of $\bipdb$. -\end{Definition} -\begin{Definition}[Query $\poly$]\label{def:bi-red-ti-q} -$\poly$ is constructed to map all possible worlds of $\db_{ti} \in \tiwset$ for which $x_i$ is the greatest according to $\biord$, to the worlds $\vct{w}$ in $\biwset$ in which $a_{\block, i}$ is present and $\bipd(\vct{w}) > 0$. 
Recall the constraint on $\bipdb$ to be that if $a_{\block, i}$ is present, then it is the case that for all $j \neq i$, tuple $a_{\block, j}$ is not present. For $\bipdb$ with exactly one block, all such worlds $\db_{ti}$ are mapped to the world $\{a_i\}$. -\end{Definition} - -For simplicity, we will consider $\bipdb$ to consist of one block $\block$. By independence of blocks in $\bi$, the proofs below immediately generalize to the case of $\bipdb$ with multiple blocks\textcolor{blue}{...umm, we'll see, we made need to argue this}. - -The reduction consists of the construction of a query $\poly$ and $\tipdb$ such that $\poly$ is computed over $\tipdb$. To construct the $\tipdb$ given an arbitrary $\bipdb$ a tuple alternative $a_{\block, i}$ is transcribed to a tuple in $\tipdb$ with probability - -\begin{equation} - P(x_{b, i}) = \begin{cases} - \frac{P(a_{\block, i})}{\prod_{j = 1}^{i - 1}(1 - P(x_{\block, j}))} &\textbf{if }i > 1\\ - P(a_i) &\textbf{if } i = 1. - \end{cases}\label{eq:bi-red-ti-func} -\end{equation} - -The above is more simply written as - -\begin{equation*} -\tipd(x_{\block, i}) = \frac{P(a_{\block, i})}{1 - \sum_{j = 1}^{i - 1} P(a_{\block, j})} -\end{equation*} - -The above mapping is applied across all tuples of $\bipdb$. - -This method for computing the probabilities of the tuples in $\tipdb$ allows for the following. According to $\biord$, the powerset of possible worlds is mapped in such a way that the first ordered tuple appearing in a possible world $\db_{\tiabb}$of $\tiwset$ has that world mapped to the world $\db_{\biabb} \in \biwset$ where $a_{\block, i}$ is present with $\bipd(\db_{\biabb}) > 0$. Recall that since we are considering a $\bi$ with one block, there is only one such world in $\biwset$. - -\begin{Lemma}\label{lem:bi-red-ti-prob} -The sum of the probabilities of all $\db_{\tiabb} \in \tiwset$ database worlds mapped to a a given tuple $x_{b, i}$ equals the probability of the tuple $a_{\block, i}$ in the original $\bipdb$. 
-\end{Lemma} - -\begin{proof}[Proof of Lemma ~\ref{lem:bi-red-ti-prob}] -The proof is by induction. Given a tuple $a_{\block, i}$ in $\bipdb$ such that $1 \leq i \leq \abs{b}$, (where $\abs{b}$ denotes the number of alternative tuples in block $\block$), by ~\cref{eq:bi-red-ti-func} $P(x_{\block, i}) = \frac{P(a_{\block, i})}{1 \cdot \prod_{j = 1}^{i - 1} (1 - P(x_{\block, j}))}$. - -For the base case, we have that $i = 1$ which implies that $P(x_{\block, i}) = P(a_{\block, i})$ and the base case is satisfied. - -%Other neat tidbits include that $\abs{b} = 1$, the set $b = \{a_1\}$, and the powerset $2^b = \{\emptyset, \{1\}\} = \tiwset$. For coolness, also see that $P(\neg x_i) = 1 - P(x_i) = 1 - P(a_i) = \emptyset$, so there is, in this case, a one to one correspondence of possible worlds and their respective probabilities in both $\ti$ and $\bi$, but this is extraneous information for the proof. - -The hypothesis is then that for $k \geq 1$ tuple alternatives, ~\cref{lem:bi-red-ti-prob} holds. - -For the inductive step, prove that ~\cref{lem:bi-red-ti-prob} holds for $k + 1$ alternatives. By definition of the query $\poly$ (~\cref{def:bi-red-ti-q}), it is a fact that only the world $\wElem_{x_{\block, k + 1}} = \{x_{\block, k + 1}\}$ in the set of possible worlds is mapped to $\bi$ world $\{a_{\block, k + 1}\}$. Then for world $\wElem_{x_{\block, k + 1}}$ it is the case that $P(\wElem_{x_{\block, k + 1}}) = \prod_{j = 1}^{k} (1 - P(x_j)) \cdot P(x_{\block k + 1})$. Since by ~\cref{eq:bi-red-ti-func} $P(x_{\block, k + 1}) = \frac{P(a_{\block, k + 1})}{\prod_{j = 1}^{k}(1 - P(x_{\block, j}))}$, we get -\begin{align*} -P(\wElem_{x_{\block, k + 1}}) =& \prod_{j = 1}^{k} (1 - P(x_{\block, j})) \cdot P(x_{\block, k + 1})\\ -=&\prod_{j = 1}^{k} (1 - P(x_{\block, j})) \cdot \frac{P(a_{\block, k + 1})}{\prod_{j = 1}^{k}(1 - P(x_{\block, j}))}\\ -=&P(a_{\block, k + 1}). 
-\end{align*} -\end{proof} - -\qed - -This leaves us with the task of constructing a query $\poly$ over $\tipdb$ to perform the desired mapping of possible worlds. Setting $\poly$ to the following query yields the desired result. -\begin{lstlisting} -SELECT A FROM TI as a - WHERE A = 1 OR - OR A = 2 AND NOT EXISTS(SELECT A FROM TI as b - WHERE A = 1 AND a.blockID = b.blockID) - $\vdots$ - OR A = $|$b.blockID$|$ AND NOT EXISTS(SELECT A FROM TI as b - WHERE A = 1 OR A = 2 $\ldots$ A = $|$b.blockID$|$ AND a.blockID = b.blockID -\end{lstlisting} - -\begin{Lemma}\label{lem:bi-red-ti-q} -The query $\poly$ satisfies the requirements of ~\cref{def:bi-red-ti-q}. -\end{Lemma} - -\begin{proof}[Proof of Lemma ~\ref{lem:bi-red-ti-q}] -For any possible world in $2^b$, notice that the WHERE clause selects the tuple with the greatest ordering in the possible world. For all other tuples, disjunction of predicates dictates that no other tuple will be in the output by mutual exclusivity of the disjunction. Thus, it is the case for any $\ti$ possible world, that the tuple $x_{\block, i}$ with the greatest ordering appearing in that possible world will alone be in the output, and all such possible worlds with $x_{\block, i}$ as the greatest in the ordering will output the same world corresponding to the $\bi$ world for the disjoint tuple $a_{\block, i}$. -\end{proof} - -\qed - -\begin{proof}[Proof of Theorem ~\ref{theorem:bi-red-ti}] - -For multiple blocks in $\bipdb$, note that the above reduction to $\poly(\tipdb)$ with multiple 'blocks' will behave the same as $\bipdb$ since the property of independence for $\ti$ ensures that all tuples in the $\ti$ will have the same marginal probability across all possible worlds as their tuple probability, regardless of how many tuples and, thus, worlds the $\tipdb$ has. Note that this propety is unchanging no matter what probabilities additional tuples in $\tipdb$ are assigned. - -To see this consider the following. 
-\begin{Lemma}\label{lem:bi-red-ti-ind} -For any set of independent variables $S$ with size $\abs{S}$, when adding another distinct independent variable $y$ to $S$ with probability $\prob_y$, it is the case that the probability of each variable $x_i$ in $S$ remains unchanged. -\AH{This may be a well known property that I might not even have the need to prove, but since I am not certain, here goes.} -\end{Lemma} - -\begin{proof}[Proof of Lemma ~\ref{lem:bi-red-ti-ind}] -The proof is by induction. For the base case, consider a set of one element $S = \{x\}$ with probability $\prob_x$. The set of possible outcomes includes $2^S = \{\emptyset, \{x\}\}$, with $P(\emptyset) = 1 - \prob_x$ and $P(x) = p_x$. Now, consider $S' = \{y\}$ with $P(y) = \prob_y$ and $S \cup S' = \{x, y\}$ with the set of possible outcomes now $2^{S \cup S'} = \{\emptyset, \{x\}, \{y\}, \{x, y\}\}$. The probabilities for each world then are $P(\emptyset) = (1 - \prob_x)\cdot(1 - \prob_y), P(x) = \prob_x \cdot (1 - \prob_y), P(y) = (1 - \prob_x)\cdot \prob_y$, and $P(xy) = \prob_x \cdot \prob_y$. For the worlds where $x$ appears we have - -\[P(x) + P(xy) = \prob_x \cdot (1 - \prob_y) + \prob_x \cdot \prob_y = \prob_x \cdot \left((1 - \prob_y) + \prob_y\right) = \prob_x \cdot 1 = \prob_x.\] -Thus, the base case is satisfied. - -For the hypothesis, assume that $\abs{S} = k$ for some $k \geq 1$, and for $S'$ such that $\abs{S'} = 1$ where its element is distinct from all elements in $S$, the probability of each independent variable in $S$ is the same in $S \cup S'$. - -For the inductive step, let us prove that for $\abs{S_{k + 1}} = k + 1$ elements, adding another element will not change the probabilities of the independent variables in $S$. By the hypothesis, that $S_k \cup S_{k + 1}$, all probabilities in $S_k$ remained untainted after the union. Now consider a set $S' = \{z\}$ and the union $S_{k + 1} \cup S'$. 
Since all variables are distinct and independent, it is the case that the set of possible outcomes of $S_{k + 1} \cup S' = 2^{S_{k + 1} \cup S'}$ with $\abs{2^{S_{k + 1} \cup S'}} = 2^{\abs{S_{k + 1}} + \abs{S'}}$ since $\abs{S_{k + 1}} + \abs{S'} = \abs{S_{k + 1} \cup S'}$. Then, since $2^{\abs{S_{k + 1}} + \abs{S'}} = 2^{\abs{S_{k + 1}}} \cdot 2^{\abs{S'}}$, and $2^{S'} = \{\emptyset, \{x\}\}$, it is the case that all elements in the original set of out comes will appear \textit{exactly one} time without $z$ and \textit{exactly one }time with $z$, such that for element $x \in 2^{S_{k + 1}}$ with probability $\prob_x$ we have $P(x\text{ }OR\text{ }xz) = \prob_x \cdot (1 - \prob_z) + \prob_x \cdot \prob_z = \prob_x\cdot \left((1 - z) + z\right) = \prob_x \cdot 1 = \prob_x$, and the probabilities remain unchanged, and, thus, the marginal probabilities for each variable in $S_{k + 1}$ across all possible outcomes remain unchanged. -\end{proof} - -\qed - -The repeated application of ~\cref{lem:bi-red-ti-ind} to any 'block' of independent variables in $\tipdb$ provides the same result as joining two sets of distinct elements of size $\abs{S_1}, \abs{S_2} > 1$. - -Thus, by lemmas ~\ref{lem:bi-red-ti-prob}, ~\ref{lem:bi-red-ti-q}, and ~\ref{lem:bi-red-ti-ind}, the proof follows. -\end{proof} - -\qed - -\subsubsection{General results for $\bi$}\label{subsubsec:bi-gen} -\AH{One thing I don't see in the argument below is that as $\numvar \rightarrow \infty$, we have that $\prob_0 \rightarrow 0$.} -The general results of approximating a $\bi$ using the reduction and ~\cref{alg:mon-sam} do not allow for the ratio $\frac{\abs{\etree}(1,\ldots, 1)}{\rpoly(\prob_1,\ldots, \prob_\numvar)}$ to be a constant. Consider the following example. - -Let monomial $y_i = P(x_i) \cdot \prod_{j = 1}^{i - 1}(1 - P(x_j))$ Let $\poly(\vct{X}) = \sum_{i = 1}^{\numvar}y_i$. 
Note that this query output can exist on a projection for which each tuple agrees on the projected values of the query in a $\bi$ consisting of one block and $\numvar$ tuples. - -First, let's analyze the numerator $\abs{\etree}(1,\ldots, 1)$. Expanding $\abs{\etree}$ yields $X_i + (1 + X_1)\cdot X_2 + \cdots + (1 + X_1)\cdot(1 + X_2)\cdots(1 + X_{\numvar - 1})\cdot X_n$ which yields a geometric series $S_{\abs{\etree}} = 2^0 + 2^1 +\cdots+2^{\numvar - 1}$. We can perform the following manipulations to obtain the following closed form. - -\begin{align*} -2 \cdot S_{\abs{\etree}} =& 2^1 +\cdots+2^\numvar = 2^{\numvar} + S_2 - 1 \\ -S_{\abs{\etree}} =& 2^{\numvar + 1} - 1 -\end{align*} - -So, then $\abs{\etree}(1,\ldots, 1) = 2^{\numvar} - 1$. - -On the other hand, considering $\rpoly(\prob_1,\ldots, \prob_\numvar)$, since we are simply summing up the probabilities of one block of disjoint tuples (recall that $P(x_i) = \frac{P(a_i)}{1\cdot\prod_{j = 1}^{i - 1}(1 - P(x_j))}$ in the reduction for $a_i$ the original $\bi$ probability), it is the case that $\rpoly(\prob_1,\ldots, \prob_\numvar) \leq 1$, and the ratio $\frac{\abs{\etree}(1,\ldots, 1)}{\rpoly(\prob_1,\ldots, \prob_\numvar)}$ in this case is exponential $O(2^\numvar)$. Further note that setting $\poly(\vct{X}) = \sum_{i = 1}^{\numvar} y_i^k$ will yield an $O(2^{\numvar \cdot k})$ bound. -\subsubsection{Sufficient Condition for $\bi$ for linear time Approximation Algorithm} - -Let us introduce a sufficient condition on $\bipdb$ for a linear time approximation algorithm. - -\AH{Lemma ~\ref{lem:bi-suf-cond} is not true for the case of $\sigma$, where a $\sigma(\bowtie)$ query could select tuples from the same block, and self join them such that all tuples cancel out. 
We need a definition for 'safe' (in this context) queries, to prove the lemma.} -\begin{Lemma}\label{lem:bi-suf-cond} -For $\bipdb$ with fixed block size $\abs{b}$, the ratio $\frac{\abs{\etree}(1,\ldots, 1)}{\rpoly(\prob_1,\ldots, \prob_\numvar)}$ is a constant. -\end{Lemma} - -\AH{Two observations. -\par -1) I am not sure that the argument below is correct, as I think we would still get something exponential in the numerator $\abs{\etree}(1,\ldots, 1)$. -\par2) I \textit{think} a similar argument will hold however for the method of not using the reduction.} -\begin{proof}[Prood of Lemma ~\ref{lem:bi-suf-cond}] -For increasing $\numvar$ and fixed block size $\abs{b}$ in $\bipdb$ given query $\poly = \sum_{i = 1}^{\numvar}$ where $y_i = x_i \cdot \prod_{j = 1}^{i - 1} (1 - x_j)$, a query whose output is the maximum possible output, it has to be the case as seen in ~\cref{subsubsec:bi-gen} that for each block $b$, $\rpoly(\prob_{b, 1},\ldots, \prob_{b, \abs{b}}) = P(a_{b, 1}) + P(a_{b, 2}) + \cdots + P(a_{b, \abs{b}})$ for $a_i$ in $\bipdb$. As long as there exists no block in $\bipdb$ such that the sum of alternatives is $0$ (which by definition of $\bi$ should be the case), we can bound the $\rpoly(p_1,\ldots, \prob_\numvar) \geq \frac{\prob_0 \cdot \numvar}{\abs{\block}}$ for $\prob_0 > 0$, and then we have that $\frac{\abs{\etree}(1,\ldots, 1)}{\rpoly(\prob_1,\ldots, \prob_\numvar)}$ is indeed a constant. -\end{proof} - -\qed - -Given a $\bipdb$ satisfying ~\cref{lem:bi-suf-cond}, it is the case by ~\cref{lem:approx-alg} that ~\cref{alg:mon-sam} runs in linear time. - -\AH{\Large \bf{092520 -- 100220 New material.}} - -\section{Algorithm ~\ref{alg:mon-sam} for $\bi$} - -We may be able to get a better run time by developing a separate approximation algorithm for the case of $\bi$. Instead performing the reduction from $\bi \mapsto \poly(\ti)$, we decide to work with the original variable annotations given to each tuple alternative in $\bipdb$. 
For clarity, let us assume the notation of $\bivar$ for the annotation of a tuple alternative. The algorithm yields $0$ for any monomial sampled that cannot exist in $\bipdb$ due to the disjoint property characterizing $\bi$. The semantics for $\rpoly$ change in this case. $\rpoly$ not only performs the same modding function, but also sets all monomial terms to $0$ if they contain variables which appear within the same block. - -\begin{algorithm}[H] - \caption{$\approxq_{\biabb}$($\etree$, $\vct{p}$, $\conf$, $\error$, $\bivec$)} - \label{alg:bi-mon-sam} - \begin{algorithmic}[1] - \Require \etree: Binary Expression Tree - \Require $\vct{p} = (\prob_1,\ldots, \prob_\numvar)$ $\in [0, 1]^N$ - \Require $\conf$ $\in [0, 1]$ - \Require $\error$ $\in [0, 1]$ - \Require $\bivec$ $\in [0, 1]^{\abs{\block}}$\Comment{$\abs{\block}$ is the number of blocks} - \Ensure \vari{acc} $\in \mathbb{R}$ - \State $\vari{sample}_\vari{next} \gets 0$ - \State $\accum \gets 0$\label{alg:mon-sam-global1} - \State $\numsamp \gets \ceil{\frac{2 \log{\frac{2}{\conf}}}{\error^2}}$\label{alg:mon-sam-global2} - \State $(\vari{\etree}_\vari{mod}, \vari{size}) \gets $ \onepass($\etree$)\label{alg:mon-sam-onepass}\Comment{$\onepass$ is ~\cref{alg:one-pass} \;and \sampmon \; is ~\cref{alg:sample}} - \For{\vari{i} \text{ in } $1\text{ to }\numsamp$}\Comment{Perform the required number of samples} - \State $(\vari{M}, \vari{sgn}_\vari{i}) \gets $ \sampmon($\etree_\vari{mod}$)\label{alg:mon-sam-sample} - \For{$\vari{x}_\vari{\block,i}$ \text{ in } $\vari{M}$} - \If{$\bivec[\block] = 1$}\Comment{If we have already had a variable from this block, $\rpoly$ drops the sample.} - \State $\vari{sample}_{\vari{next}} \gets 1$ - \State break - \Else - \State $\bivec[\block] = 1$ -% \State $\vari{sum} = 0$ -% \For{$\ell \in [\abs{\block}]$} -% \State $\vari{sum} = \vari{sum} + \bivec[\block][\ell]$ -% \EndFor -% \If{$\vari{sum} \geq 2$} -% \State $\vari{sample}_{\vari{next}} \gets 1$ -% \State 
continue\Comment{Not sure for psuedo code the best way to state this, but this is analogous to C language continue statement.} - \EndIf - \EndFor - \If{$\vari{sample}_{\vari{next}} = 1$} - \State $\vari{sample}_{\vari{next}} \gets 0$ - \State continue - \EndIf - \State $\vari{Y}_\vari{i} \gets 1$\label{alg:mon-sam-assign1} - \For{$\vari{x}_{\vari{j}}$ \text{ in } $\vari{M}$}%_{\vari{i}}$} - \State $\vari{Y}_\vari{i} \gets \vari{Y}_\vari{i} \times \; \vari{\prob}_\vari{j}$\label{alg:mon-sam-product2} \Comment{$\vari{p}_\vari{j}$ is the assignment to $\vari{x}_\vari{j}$ from input $\vct{p}$} - \EndFor - \State $\vari{Y}_\vari{i} \gets \vari{Y}_\vari{i} \times\; \vari{sgn}_\vari{i}$\label{alg:mon-sam-product} - \State $\accum \gets \accum + \vari{Y}_\vari{i}$\Comment{Store the sum over all samples}\label{alg:mon-sam-add} - \EndFor - - \State $\vari{acc} \gets \vari{acc} \times \frac{\vari{size}}{\numsamp}$\label{alg:mon-sam-global3} - \State \Return \vari{acc} - \end{algorithmic} -\end{algorithm} - -Before redefining $\rpoly$ in terms of the $\bi$ model, we need to define the notion of performing a mod operation with a set of polynomials. - -\begin{Definition}[Mod with a set of polynomials]\label{def:mod-set-poly} -To mod a polynomial $\poly$ with a set $\vct{Z} = \{Z_1,\ldots Z_x\}$ of polynomials, the mod operation is performed successively on the $\poly$ modding out each element of the set $\vct{Z}$ from $\poly$. 
-\end{Definition} - -\begin{Example}\label{example:mod-set-poly} -To illustrate for $\poly = X_1^2 + X_1X_2^3$ and the set $\vct{Z} = \{X_1^2 - X_1, X_2^2 - X_2, X_1X_2\}$ we get - -\begin{align*} -&X_1^2 + X_1X_2^3 \mod X_1^2 - X_1 \mod X_2^2 - X_2 \mod X_1X_2\\ -=&X_1 + X_1X_2^3 \mod X_2^2 - X_2 \mod X_1X_2\\ -=&X_1 + X_1X_2 \mod X_1X_2\\ -=&X_1 -\end{align*} - -\end{Example} - -\begin{Definition}[$\rpoly$ for $\bi$ Data Model]\label{def:bi-alg-rpoly} -$\rpoly(\vct{X})$ over the $\bi$ data model is redefined to include the following mod operation in addition to definition ~\ref{def:qtilde}. For every $j \neq i$, we add the operation $\mod X_{\block, i}\cdot X_{\block, j}$. For set of blocks $\mathcal{B}$ and the size of block $\block$ as $\abs{\block}$, - -\[\rpoly(\vct{X}) = \poly(\vct{X}) \mod \{X_{\block, i}^2 - X_{\block, i} \st \block \in \mathcal{B}, i \in [\abs{\block}]\} \cup_{\block \in \mathcal{B}} \{X_{\block, i}X_{\block, j} \st i, j \in [\abs{\block}], i \neq j\} -% \mod X_{\block_1, 1}^2 - X_{\block_1, 1} \cdots \mod X_{\block_k, \abs{\block_k}}^2 - X_{\block_k, \abs{\block_k}} \mod X_{b_1, 1} \cdot X_{b_1, 2}\cdots \mod X_{\block_1, \abs{\block_1} -1} \cdot X_{\block, \abs{\block_1}}\cdots \mod X_{\block_k, 1} \cdot X_{\block_k, 2} \cdots \mod X_{\block_k, \abs{\block_k} - 1}\cdot X_{\block_K, \abs{\block_k}}. -\] -\end{Definition} - -\subsection{Correctness} -\begin{Theorem}\label{theorem:bi-approx-rpoly-bound} -For any query polynomial $\poly(\vct{X})$, an approximation of $\rpoly(\prob_1,\ldots, \prob_\numvar)$ in the $\bi$ setting can be computed in $O\left(\treesize(\etree) + \frac{\log{\frac{1}{\conf}}\cdot \abs{\etree}^2(1,\ldots, 1)}{\error^2\cdot\rpoly^2(\prob_1,\ldots, \prob_\numvar)}\right)$, with multiplicative $(\error,\delta)$-bounds, where $k$ denotes the degree of $\poly$. 
-\end{Theorem} - -\begin{proof}[Proof of Theorem ~\ref{theorem:bi-approx-rpoly-bound}] -By the proof of ~\cref{lem:approx-alg}, with a minor adjustment on $\evalmp$, such that we define the function to output $0$ for any monomial sharing disjoint variables, coupled with the fact that additional operations in ~\cref{alg:bi-mon-sam} are $O(1)$ occuring at most $k$ times for each of the $\numsamp$ samples, the proof of ~\cref{theorem:bi-approx-rpoly-bound} immediately follows. -\end{proof} - -\qed - -\subsection{Safe Query Class for $\bi$} -We want to analyze what is the class of queries and data restrictions that are necessary to guarantee that $\frac{\abs{\etree}(1,\ldots, 1)}{\rpoly(\prob_{1},\ldots, \prob_{\numvar})}$ is $O(1)$. - -\subsubsection{When $\rpoly$ is zero} - -First, consider the case when $\rpoly$ cancels out all terms in $\poly$, where $\poly \neq \emptyset$. For $\rpoly$ to cancel out a tuple $\tup$, by ~\cref{def:bi-alg-rpoly} it must be the case that output tuple $\tup$ is dependent on two different tuples appearing in the same block. For this condition to occur, it must be that the query $\poly$ contains a self join operation on a table $\rel$, from which $\tup$ has been derived. - -Certain conditions on both the data and query must exist for all tuples $\tup$ to be cancelled out by $\rpoly$ as described above. - -For $\rpoly$ to be $0$, the data of a $\bi$ must satisfy certain conditions. - -\begin{Definition}[Data Restrictions]\label{def:bi-qtilde-data} -Consider $\bi$ table $\rel$. For $\rpoly$ to potentially cancel all its terms, $\rel$ must be such that given a self join, the join constraints remain unsatisfied for all tuple combinations $x_{\block_i, \ell} \times x_{\block_j, \ell'}$ for $i \neq j$, $\ell \in [\abs{\block_i}], \ell' \in [\abs{\block_j}]$, i.e. combinations across different blocks. Note that this is trivially satisfied with a $\rel$ composed of just one block. 
Further, it must be the case that the self join constraint is only satisfied in one or more crossterm combinations $x_{\block, i} \times x_{\block_j}$ for $i \neq j$, i.e., within the same block of the input data. -\end{Definition} - -To be precise, only equijoins are considered in the following definition. Before preceding, note that a natural self join will never result in $\rpoly$ cancelling all terms, since it is the case that each tuple will necessarily join with itself, and $\rpoly$ will not mod out this case. Also, although we are using the term self join, we consider cases such that query operations over $\rel$ might be performed on each join input prior to the join operation. While technically the inputs may not be the same set of tuples, this case must be considered, since all the tuples originate from the table $\rel$. To this end, let $\poly_1(\rel) = S_1$ and $\poly_2(\rel) = S_2$ be the input tables to the join operation. -\begin{Definition}[Class of Cancelling Queries]\label{def:bi-qtilde-query-class} -When ~\cref{def:bi-qtilde-data} is satisfied, it must be that $\poly$ contains a join $S_1 \bowtie_\theta S_2$ such that either% that satisfies the following constraints based on its structure. - -\textsc{Case 1:} $S_1 \cap S_2 = \emptyset$ - -%Any join over this structure will produce a $\poly$ such that $\rpoly$ cancels all monomials out. -%Such a condition implies $\rpoly$ is $0$ regardless of join condition $\theta$. Note the beginning premise of this definition, and the fact that such premise rules out the natural join across all attributes, since we would have that $\poly = \rpoly = 0$. -Or - -\textsc{Case 2:} $S_1 \cap S_2 \neq \emptyset$, the attributes in the join predicate are non-matching, i.e., neither operand of the comparison is a strict subset of the other, and no input tuple has agreeing values across the join attributes. 
- - -%\begin{enumerate} -% \item When the join condition $\theta$ involves equality between matching attributes, it must be that the attributes of the join conditon $\attr{\theta}$ are a strict subset of $\attr{\rel}$. Then, to satisfy ~\cref{def:bi-qtilde-data} it must be that the join input consists of non-intersecting strict subsets of $\rel$, meaning $S_1 \cap S_2 = \emptyset$ and $S_1, S_2 \neq \emptyset$. $\poly_1$ in ~\cref{ex:bi-tildeq-0} illustrates this condition. -% \item If $\theta$ involves an equality on non-matching attributes, there exist two cases. -% \begin{enumerate} -% \item The first case consists of when the join inputs intersect, i.e., $S_1 \cap S_2 \neq \emptyset$ . To satisfy ~\cref{def:bi-qtilde-data} it must be the case that no tuple can exist with agreeing values across all attributes in $\attr{\theta}$. $\poly_3$ of ~\cref{ex:bi-tildeq-0} demonstrates this condition. -% \item The second case consists of when $S_1 \cap S_2 = \emptyset$ and $S_1, S_2 \neq \emptyset$ in the join input, and this case does not contradict the requirements of ~\cref{def:bi-qtilde-query-class}. This case is illustrated in $\poly_2$ of ~\cref{ex:bi-tildeq-0}. -% \end{enumerate} -%\end{enumerate}% , cause $\rpoly$ to be $0$ must have the following characteristics. First, there must be a self join. Second, prior to the self join, there must be operations that produce non-intersecting sets of tuples for each block in $\bi$ as input to the self join operation. -\end{Definition} - -In ~\cref{ex:bi-tildeq-0}, $\poly_1$ and $\poly_2$ are both examples of \textsc{Case 1}, while $\poly_3$ is an example of \textsc{Case 2}. - -\begin{Theorem}\label{theorem:bi-safe-q} -When both ~\cref{def:bi-qtilde-data} and ~\cref{def:bi-qtilde-query-class} are satisfied, $\rpoly$ cancels out all monomials. -\end{Theorem} - -\begin{proof}[Proof of Theorem ~\ref{theorem:bi-safe-q}] -Starting with the case that $S_1 \cap S_2 = \emptyset$. 
When this is the case, by definition, all joins on tuples in $S_1$ and $S_2$ will be will involve elements in $S_1 \times S_2$ such that both tuples are distinct. Further, ~\cref{def:bi-qtilde-data} rules out joins across different blocks, while calling for joins of the above form within the same block. Thus all tuples in the query output are dependent on more than one tuple from the same block, thus implying by ~\cref{def:bi-alg-rpoly} that $\rpoly$ will cancel all monomials. - -For the next case where $S_1 \cap S_2 \neq \emptyset$, note that there exists at least one tuple in both $S_1$ and $S_2$ that is the same. Therefore, all equijoins involving matching attributes will produce at least one self joined tuple in the output, breaking the last property of ~\cref{def:bi-qtilde-data}. For the case of equijoins with predicates involving non-matching attribute operands, note that by definition of equijoin, the only case that a tuple shared in both $S_1$ and $S_2$ can join on itself is precisely when that tuple's values agree across all the join attributes in $\theta$. Thus, it is the case that when $S_1 \cap S_2 \neq \emptyset$ and the join predicate involves equality comparison between non-matching attributes such that the values of the non-matching comparison attributes for each tuple in $\{S_1 \cap S_2\}$ do not agree, we have that ~\cref{def:bi-qtilde-data} is not contradicted, and when ~\cref{def:bi-qtilde-data} is fulfilled, it must be the case that $\poly \neq 0$ while $\rpoly = 0$. - -This concludes the proof. -\end{proof} - -\qed - - -Note then that the class of queries described in ~\cref{def:bi-qtilde-query-class} belong to the set of queries containing some form of selction over self cross product. 
-%\begin{proof}[Proof of Lemma ~\ref{lem:bi-qtilde-data}] -%\end{proof} -%\begin{proof}[Proof of Lemma ~\ref{lem:bi-qtilde-query-class}] -%\end{proof} - - %%%%%%%%%%%%%%%%%%%%%%% - -%The condition that causes $\rpoly(\prob_1,\ldots, \prob_\numvar)$ to be $0$ is when all the output tuples in each block cancel each other out. Such occurs when the annotations of each output tuple break the required $\bi$ property that tuples in the same block must be disjoint. This can only occur for the case when a self-join outputs tuples each of which have been joined to another tuple from its block other than itself. -% -%The observation is then the following. In order for such a condition to occur, we must have a query that is a self-join such that the join is on two different sets of atoms for each block. This condition can occur when inner query operations with different constraints on input table $\rel$ produce two non-intersecting sets of tuples and then performs a self join on them, such that the join condition \textit{only} holds for tuples that are members of the same block. -% -%There are two operators that can produce the aforementioned selectivity. First, consider $\sigma$, where two different selection conditions $\theta_1$ and $\theta_2$ over $\rel$ can output sets $S_{\sigma_{\theta_1}}$ and $S_{\sigma_{\theta_2}}$ where $S_{\sigma_{\theta_1}} \cap S_{\sigma_{\theta_2}} = \emptyset$. A join over these two outputs can produce an ouput $\poly$ where all annotations will be disjoint and $\rpoly$ will effectively cancel them all out. Second, consider the projection operator $\pi$, such that projections over $\rel$ which project on different attributes can output two non-intersecting sets of tuples, which when joined, again, provided that the join condition holds only for tuples appearing in the same block, can output tuples all of which will break the disjoint requirement and $\rpoly$ will cancel them out. 
- -\begin{Example}\label{ex:bi-tildeq-0} -Consider the following $\bi$ table $\rel$ consisting of one block, with the following queries $\poly_1 = \sigma_{A = 1}(\rel)\bowtie_{B = B'} \sigma_{A = 2}(\rel)$, $\poly_2 = \sigma_{A = 1}(\rel)\bowtie_{A = B'} \sigma_{A = 2}(\rel)$, and $\poly_3 = \rel \bowtie_{A = B} \rel$. While the output $\poly_i \neq \emptyset$, all queries have that $\rpoly_i = 0$. Since $\rel$ consists of only one block, we will use single indexing over the annotations. -\end{Example} - - -\begin{figure}[ht] - \begin{tabular}{ c | c c c } - \rel & A & B & $\phi$\\ - \hline - & 1 & 2 & $x_1$\\ - & 2 & 1 & $x_2$\\ - & 1 & 3 & $x_3$\\ - & 3 & 1 & $x_4$\\ - \end{tabular} - \caption{Example~\ref{ex:bi-tildeq-0} Table $\rel$} - \label{fig:bi-ex-table} -\end{figure} -%%%%%%%%%%Query 1 and 2 -\begin{figure}[ht] - \begin{subfigure}{0.2\textwidth} - \centering - \begin{tabular}{ c | c c c } - $\sigma_{\theta_{A = 1}}(\rel )$& A & B & $\phi$\\ - \hline - & 1 & 2 & $x_1$\\ - & 1 & 3 & $x_3$\\ - \end{tabular} - \caption{$\poly_1, \poly_2$ First Selection} - \label{subfig:bi-q1-sigma1} - \end{subfigure} - \begin{subfigure}{0.2\textwidth} - \centering - \begin{tabular}{ c | c c c} - $\sigma_{\theta_{A = 2}}(\rel)$ & A & B' & $\phi$\\ - \hline - & 2 & 1 & $x_2$\\ - \end{tabular} - \caption{$\poly_1, \poly_2$ Second Selection} - \label{subfig:bi-q1-sigma2} - \end{subfigure} - \begin{subfigure}{0.25\textwidth} - \centering - \begin{tabular}{ c | c c c c c} - $\poly_1(\rel)$ & $A_R$ & $B_R$ & $A_{\rel'}$ & $B_{\rel'}$ & $\phi$\\ - \hline - & 1 & 2 & 2 & 1 & $x_1x_2$\\ - \end{tabular} - \caption{$\poly_1(\rel)$ Output} - \label{subfig:bi-q1-output} - \end{subfigure} - \begin{subfigure}{0.4\textwidth} - \centering - \begin{tabular}{ c | c c c c c} - $\poly_2(\rel)$ & $A_R$ & $B_R$ & $A_{\rel'}$ & $B_{\rel'}$ & $\phi$\\ - \hline - & 1 & 2 & 2 & 1 & $x_1x_2$\\ - & 1 & 3 & 2 & 1 & $x_2x_3$\\ - \end{tabular} - \caption{$\poly_2(\rel)$ Output} - \label{subfig:bi-q2-output} 
- \end{subfigure} - \caption{$\poly_1, \poly_2(\rel)$} - \label{fig:bi-q1-q2} -\end{figure} -%%%%%%%%%%%Query 3 -\begin{figure}[ht] -% \begin{subfigure}{0.2\textwidth} -% \centering -% \begin{tabular}{ c | c c } -% $\pi_{A}(\rel)$ & A & $\phi$\\ -% \hline -% & 1 & $x_1$\\ -% & 2 & $x_2$\\ -% & 1 & $x_3$\\ -% & 3 & $x_4$\\ -% \end{tabular} -% \caption{$\poly_3$ First Projection} -% \label{subfig:bi-q3-pi1} -% \end{subfigure} -% \begin{subfigure}{0.2\textwidth} -% \centering -% \begin{tabular}{ c | c c } -% $\pi_{B}(\rel)$ & B & $\phi$\\ -% \hline -% & 2 & $x_1$\\ -% & 1 & $x_2$\\ -% & 3 & $x_3$\\ -% & 1 & $x_4$\\ -% \end{tabular} -% \caption{$\poly_3$ Second Projection} -% \label{subfig:bi-q3-pi2} -% \end{subfigure} - \begin{subfigure}{0.2\textwidth} - \centering - \begin{tabular}{ c | c c c c c } - $\poly_3(\rel)$ & A & B & $A_{\rel'}$ & $B_{\rel'}$ & $\phi$\\ - \hline - & 1 & 2& 2 & 1 & $x_1x_2$\\ - & 1 & 2 & 3 & 1 & $x_1x_2$\\ - & 2 & 1 & 1 & 2 & $x_1x_2$\\ - & 1 & 3 & 2 & 1 & $x_2x_3$\\ - & 1 & 3 & 3 & 1 & $x_3x_4$\\ - & 3 & 1 & 1 & 3 & $x_3x_4$\\ - \end{tabular} - \caption{$\poly_3(\rel)$ Output} - \label{subfig:bi-q3-output} - \end{subfigure} - \caption{$\poly_3(\rel)$} - \label{fig:bi-q3} -\end{figure} - -Note that all of ~\cref{subfig:bi-q1-output}, ~\cref{subfig:bi-q2-output}, and ~\cref{subfig:bi-q3-output} each have a set of tuples, where each annotation has cross terms from its block, and by ~\cref{def:bi-alg-rpoly} $\rpoly$ will eliminate all tuples output in the respective queries. - -\subsubsection{When $\rpoly > 0$} -\par\AH{General Case and Sufficient Condition for $\bi$ and $\rpoly_{\bi}$ approx alg needs to be written.} -\paragraph{General Case} -Consider the query $\poly = \sum_{i = 1}^{\numvar}x_i$, analogous to a projection where all tuples match on the projected set of attributes, meaning $\tup_i[A] = \tup_j[A]$ for $i, j \in [\numvar]$ such that $i \neq j$. When $\numvar$ grows unboundedly, $\abs{\etree}(1,\ldots, 1) = \numvar$. 
We assume that the sum of the probabilities of all $\numvar$ tuples in the block remains a constant as $\numvar$ grows. Thus, we have that $\frac{\abs{\etree}(1,\ldots, 1)}{\rpoly(\vct{\prob})} = \frac{\numvar}{c}$ for some constant $c$, and this implies $O(\numvar)$ growth. -% while $\rpoly(\vct{\prob}) \leq 1$, which implies that the ratio is linear, i.e., $\frac{\abs{\etree}(1,\ldots, 1)}{\rpoly(\vct{p})} = \frac{\numvar}{\numvar \cdot \prob_0} = \frac{1}{\prob_0}$ for $\prob_0 = min(\vct{\prob})$. However, note that for $\numvar \rightarrow \infty$ it is the case that $\prob_0 \rightarrow 0$, and as $\numvar$ grows, so does $\frac{1}{\prob_0}$. Intuitively, consider when $p_0 = \frac{1}{\numvar}$. Then we know that the bound is $\frac{\numvar}{1}$ which is $O(\numvar)$. - -\paragraph{Sufficient Condition for $\bi$ to achieve linear approximation} -Consider the same query $\poly = \sum_{i = 1}^{\numvar} y_i$, but this time conditioned on a fixed block size which we denote $\abs{\block}$. Then it is the case that $\abs{\etree}(1,\ldots, 1) = \numvar$, but if we assume that all blocks have a sum of probabilities equal to $1$, $\rpoly(\vct{\prob}) = \frac{\numvar}{\abs{b}}$, and this means that $\frac{\abs{\etree}(1,\ldots, 1)}{\rpoly(\vct{\prob})} = \frac{\numvar}{\frac{\numvar}{\abs{\block}}} = \abs{\block}$. For the general case when all blocks do not have the property that the sum of the probabilities of the alternatives equal $1$, we can lower bound the sum of probabilities as $\frac{\numvar}{\abs{\block}} \cdot \prob_0$ for $\prob_0 = \min(\vct{\prob})$. Note that in $\numvar \cdot \frac{\prob_0}{\abs{\block}}$, $\frac{\prob_0}{\abs{\block}}$ is indeed a constant, and this gives an overall ratio of $O(1)$ as $\numvar$ increases. diff --git a/lin_sys.tex b/lin_sys.tex index 7b82037..240c2b8 100644 --- a/lin_sys.tex +++ b/lin_sys.tex @@ -1,8 +1,6 @@ %root: main.tex -\subsection{Developing a Linear System} - -\AH{The changes in ~\cref{eq:2pd-3d} have been propagated 110420.
Barring any errors, everything should be updated and correct.} +\subsubsection{Developing a Linear System} \begin{proof}[Proof of Lemma \ref{lem:lin-sys}] Our goal is to build a linear system $M \cdot (x~y~z)^T = \vct{b}$, such that, assuming an indexing starting at $1$, each $i^{th}$ row in $M$ corresponds to the RHS of ~\cref{eq:LS-subtract} for $\graph{i}$ \textit{in} terms of $\graph{1}$. The vector $\vct{b}$ analogously has the terms computable in $O(\numedge)$ time for each $\graph{i}$ at its corresponing $i^{th}$ entry for the LHS of ~\cref{eq:LS-subtract}. Lemma ~\ref{lem:qE3-exp} gives the identity for $\rpoly_{G}(\prob,\ldots, \prob)$ when $\poly_{G}(\vct{X}) = q_E(X_1,\ldots, X_\numvar)^3$, and using @@ -185,9 +183,10 @@ We also make use of the fact that for a matrix with entries $ab, ac, ad,$ and $a \end{align*} Compute each RHS term starting with the left and working to the right, -\begin{equation} -(3\prob^2 - \prob^3)^2\cdot \left((-4 \cdot 45) - (-21 \cdot 10)\right) = (3\prob^2 - \prob^3)^2\cdot(-180 + 210) = 30(3\prob^2 - \prob^3)^2.\label{eq:det-1} -\end{equation} +\begin{align} +&(3\prob^2 - \prob^3)^2\cdot \left((-4 \cdot 45) - (-21 \cdot 10)\right) = (3\prob^2 - \prob^3)^2\cdot(-180 + 210)\nonumber\\ +&= 30(3\prob^2 - \prob^3)^2.\label{eq:det-1} +\end{align} The middle term then is \begin{align} &-\prob(3\prob^2 - \prob^3)^2 \cdot \left((-2 \cdot 45) - (-18 \cdot 10)\right) \nonumber\\ @@ -209,9 +208,8 @@ Putting \cref{eq:det-1}, \cref{eq:det-2}, \cref{eq:det-3} together, we have, \AH{It appears that the equation below has roots at p = 0 (left factor) and p = 1, with NO roots $\in (0, 1)$.} -%Equation \cref{eq:det-final} has no roots in $(0, 1)$. 
-\AH{I need to understand how lemma ~\ref{lem:lin-sys} follows.} -\end{proof}\AH{End proof of Lemma \ref{lem:lin-sys}} +It can be shown through standard polynomial roots computation techniques \footnote{An online roots solver such as https://www.mathportal.org/calculators/polynomials-solvers/polynomial-roots-calculator.php will suffice}, that $\dtrm{\mtrix{\rpoly}}$ has no roots in $(0, 1)$, ensuring independence for all $\prob$ values in $(0, 1)$, and thus ~\cref{lem:lin-sys} follows. +\end{proof} \qed \begin{proof}[Proof of \cref{th:single-p}] diff --git a/macros.tex b/macros.tex index ac918a9..3657c6d 100644 --- a/macros.tex +++ b/macros.tex @@ -159,6 +159,8 @@ \end{tikzpicture} } } + + \newcommand{\kmatch}{\ed\cdots\ed^\kElem} \newcommand{\twodis}{\patternshift{ \begin{tikzpicture}[every path/.style={thick, draw}] \node at (0, 0) [pattern_node] (bottom1) {}; diff --git a/main.tex b/main.tex index 21b1243..3fc8792 100644 --- a/main.tex +++ b/main.tex @@ -170,7 +170,7 @@ sensitive=true \input{single_p} \input{lin_sys} \input{approx_alg} -\input{bi_cancellation} +%\input{bi_cancellation} diff --git a/mult_distinct_p.tex b/mult_distinct_p.tex index 9b93ebb..563adcc 100644 --- a/mult_distinct_p.tex +++ b/mult_distinct_p.tex @@ -1,6 +1,6 @@ %root:main.tex -\subsection{When $\poly$ is not in sum of monomials form} +\subsection{Multiple Distinct $\prob$ Values} We would like to argue for a compressed version of $\poly(\vct{w})$, in general $\expct_{\vct{w}}\pbox{\poly(\vct{w})}$ cannot be computed in linear time. @@ -8,122 +8,38 @@ To this end, consider the following graph $G(V, E)$, where $|E| = \numedge$, $|V Consider the query $\poly_{G}(\vct{X}) = q_E(X_1,\ldots, X_\numvar) = \sum\limits_{(i, j) \in E} X_i \cdot X_j$. 
-%Original lemma proving the exact coefficient terms in qE3 -%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%\begin{Lemma}\label{lem:qE3-exp} -%When we expand $\poly_{G}(\vct{X}) = \left(q_E(X_1,\ldots, X_\numvar)\right)^3$ out and assign all exponents $e \geq 1$ a value of $1$, we have the following, -% \begin{align} -% &\rpoly_{G}(\prob,\ldots, \prob) = \numocc{G}{\ed}\prob^2 + 6\numocc{G}{\twopath}\prob^3 + 6\numocc{G}{\twodis} + 6\numocc{G}{\tri}\prob^3 + 6\numocc{G}{\oneint}\prob^4 + 6\numocc{G}{\threepath}\prob^4 + 6\numocc{G}{\twopathdis}\prob^5 + 6\numocc{G}{\threedis}\prob^6.\label{claim:four-one} -% \end{align} -%\end{Lemma} -% -%\begin{proof}[Proof of \cref{lem:qE3-exp}] -%By definition we have that -% \[\poly_{G}(\vct{X}) = \sum_{\substack{(i_1, j_1),\\ (i_2, j_2),\\ (i_3, j_3) \in E}} \prod_{\ell = 1}^{3}X_{i_\ell}X_{j_\ell}.\] -% Rather than list all the expressions in full detail, let us make some observations regarding the sum. Let $e_1 = (i_1, j_1), e_2 = (i_2, j_2), e_3 = (i_3, j_3)$. Notice that each expression in the sum consists of a triple $(e_1, e_2, e_3)$. There are three forms the triple $(e_1, e_2, e_3)$ can take. -% -%\textsc{case 1:} $e_1 = e_2 = e_3$, where all edges are the same. There are exactly $\numedge$ such triples, each with a $\prob^2$ factor in $\rpoly_{G}\left(\prob_1,\ldots, \prob_\numvar\right)$. -% -%\textsc{case 2:} This case occurs when there are two distinct edges of the three, call them $e$ and $e'$. When there are two distinct edges, there is then the occurence when $2$ variables in the triple $(e_1, e_2, e_3)$ are bound to $e$. There are three combinations for this occurrence. It is the analogue for when there is only one occurrence of $e$, i.e. $2$ of the variables in $(e_1, e_2, e_3)$ are $e'$. Again, there are three combinations for this. All $3 + 3 = 6$ combinations of two distinct values consist of the same monomial in $\rpoly$, i.e. $(e_1, e_1, e_2)$ is the same as $(e_2, e_1, e_2)$. 
This case produces the following edge patterns: $\twopath, \twodis$. -% -%\textsc{case 3:} $e_1 \neq e_2 \neq e_3$, i.e., when all edges are distinct. For this case, we have $3! = 6$ permutations of $(e_1, e_2, e_3)$. This case consists of the following edge patterns: $\tri, \oneint, \threepath, \twopathdis, \threedis$. -%\end{proof} -%\qed -\subsubsection{Multiple Distinct $\prob$ values} For the following discussion, set $\poly_{G}^\kElem(\vct{X}) = \left(q_E(X_1,\ldots, X_\numvar)\right)^\kElem$. \begin{Lemma}\label{lem:qEk-multi-p} -Given polynomial $\poly_{G}^\kElem(\prob,\ldots, \prob)$, we can write $\rpoly_{G}^\kElem$ as $\rpoly_{G}^\kElem(\prob,\ldots, \prob) = \sum\limits_{i = 0}^{2\kElem} c_i \cdot \prob^i$ for some fixed terms $\vct{c}$ and $2\kElem + 1$ distinct $\prob$ values, one can compute each $c_i$ in $\vct{c}$ exactly. +Given polynomial $\poly_{G}^\kElem(\prob,\ldots, \prob)$, we can write $\rpoly_{G}^\kElem$ as $\rpoly_{G}^\kElem(\prob,\ldots, \prob) = \sum\limits_{i = 0}^{2\kElem} c_i \cdot \prob^i$ for some fixed terms $\vct{c}$. Given $2\kElem + 1$ distinct $\prob$ values, one can compute each $c_i$ in $\vct{c}$ exactly. Additionally, the number of $\kElem$-matchings can be computed exactly. \end{Lemma} \begin{proof}[Proof of ~\cref{lem:qEk-multi-p}] -It is trivial to see that one can readily expand the exponential expression by performing the $n^\kElem$ product operations, yielding the polynomial in the sum of products form of the lemma statement. By definition $\rpoly_{G}^\kElem$ reduces all variable exponents greater than $1$ to $1$. Thus, a monomial such as $X_i^\kElem X_j^\kElem$ is $X_iX_j$ in $\rpoly_{G}^\kElem$, and the value after substitution is $p_i\cdot p_j = p^2$. 
Further, that the number of terms in the sum is no greater than $2\kElem + 1$, can be easily justified by the fact that each edge has two endpoints, and the most endpoints occur when we have $\kElem$ distinct edges, with non-intersecting points, a case equivalent to $p^{2\kElem}$. +It is trivial to see that one can readily expand the exponential expression by performing the $n^\kElem$ product operations, yielding the polynomial in the sum of products form of the lemma statement. By definition $\rpoly_{G}^\kElem$ reduces all variable exponents greater than $1$ to $1$. Thus, a monomial such as $X_i^\kElem X_j^\kElem$ is $X_iX_j$ in $\rpoly_{G}^\kElem$, and the value after substitution is $p_i\cdot p_j = p^2$. Further, that the number of terms in the sum is no greater than $2\kElem + 1$, can be easily justified by the fact that each edge has two endpoints, and the most endpoints occur when we have $\kElem$ distinct edges (such a subgraph is also known as a $\kElem$-matching), with non-intersecting points, a case equivalent to $p^{2\kElem}$. Given that we have $2\kElem + 1$ distinct values of $\prob$ by the lemma statement, it follows that we then have $2\kElem + 1$ linear equations which are distinct. Further, by construction of the summation, these $2\kElem + 1$ equations collectively form the Vandermonde matrix, from which it follows that we have a matrix with full rank, and we can solve the linear system to determine $\vct{c}$ exactly. + +It has already been established above that a $\kElem$-matching ($\kmatch$) has coefficient $c_{2\kElem}$. As noted, a $\kElem$-matching occurs when there are $\kElem$ edges, $e_1, e_2,\ldots, e_\kElem$, such that all of them are disjoint, i.e., $e_1 \neq e_2 \neq \cdots \neq e_\kElem$. 
In all $\kElem$ factors of $\poly_{G}^\kElem(\vct{X})$ there are $\kElem$ choices from the first factor to select an edge for a given $\kElem$-matching, $\kElem - 1$ choices in the second factor, and so on throughout all the factors, yielding $\kElem!$ duplicate terms for each $\kElem$-matching in the expansion of $\poly_{G}^\kElem(\vct{X})$. + +Thus, the product $\kElem!\cdot\numocc{G}{\kmatch}$ is the exact number of $\kElem$-matchings in $\poly_{G}^\kElem(\vct{X})$. \end{proof} \qed -\begin{Lemma}\label{lem:qEk-multi-p-k-match} -The number of $\kElem$-matchings in $\poly_{G}^\kElem(\vct{X})$ is exactly $\kElem!\cdot\numocc{G}{\threedis}$. -\end{Lemma} -\begin{proof}[Proof of Lemma ~\ref{lem:qEk-multi-p-k-match}] -A $\kElem$-matching occurs when there are $\kElem$ edges, $e_1, e_2,\ldots, e_\kElem$, such that all of them are disjoint, i.e., $e_1 \neq e_2 \neq \cdots \neq e_\kElem$. In all $\kElem$ factors of $\poly_{G}^\kElem(\vct{X})$ there are $k$ choices from the first factor to select an edge for a given $\kElem$ matching, $\kElem - 1$ choices in the second factor, and so on throughout all the factors, yielding $\kElem!$ duplicate terms for each $\kElem$ matching in the expansion of $\poly_{G}^\kElem(\vct{X})$. -Thus, the product $\kElem!\cdot\numocc{G}{\threedis}$ is the exact number of $\kElem$-matchings in $\poly_{G}^\kElem(\vct{X})$. -\end{proof} - -\qed \begin{Corollary}\label{cor:lem-qEk} -One can compute $\numocc{G}{\threedis}$ in $\query_{G}^\kElem(\vct{X})$ exactly. +One can compute $\numocc{G}{\kmatch}$ in $\query_{G}^\kElem(\vct{X})$ exactly. \end{Corollary} \begin{proof}[Proof for Corollary ~\ref{cor:lem-qEk}] -By ~\cref{lem:qEk-multi-p}, the term $c_{2\kElem}$ can be exactly computed. By ~\cref{lem:qEk-multi-p-k-match}, we know that $c_{2\kElem}$ can be broken into two factors, and by dividing $c_{2\kElem}$ by the factor $\kElem!$, it follows that the resulting value is indeed $\numocc{G}{\threedis}$.
+By ~\cref{lem:qEk-multi-p}, the term $c_{2\kElem}$ can be exactly computed. Additionally we know that $c_{2\kElem}$ can be broken into two factors, and by dividing $c_{2\kElem}$ by the factor $\kElem!$, it follows that the resulting value is indeed $\numocc{G}{\kmatch}$. \end{proof} \qed -%\begin{Lemma}\label{lem:alt-qEk} -%Given $k$ distinct $\prob$ values and $\poly_{G}^k(\prob,\ldots, \prob)$, one can solve the number of $3$-matchings exactly. -%\end{Lemma} -% -%\begin{proof}[Proof for Lemma ~\ref{lem:alt-qEk}] -%By the same logic as ~\cref{lem:qEk-multi-p} it is the case that there are $k$ $\prob^i$ values for $i$ in $[0, k - 1]$. This, combined with $k$ distinct $\prob$ values yields the Vandermonde matrix with full rank, and thus all the values $c_i$ in $\vct{c}$ can be computed exactly. Finally, along the same lines as ~\cref{lem:qEk-multi-p-k-match}, dividing by $k!$ yields the desired result, $\numocc{G}{k-matchings}$. This can be seen, since it is the case that only a $k-matching$ can have a $\prob^{2k}$ factor, and, secondly, for a $k-product$, there are $k$ choices in the first product, $k - 1$ choices in the second factor, and so on, yielding $k!$ copies of each $k-matching$. -% -% -%\AH{Any suggestions for a better notation/representation of k-matching??} -%\end{proof} -% -%\qed - \begin{Corollary}\label{cor:reduct} -By ~\cref{lem:qEk-multi-p}, ~\cref{lem:qEk-multi-p-k-match}, and ~\cref{cor:lem-qEk} it follows that computing $\rpoly(\vct{X})$ is hard. +By ~\cref{lem:qEk-multi-p} and ~\cref{cor:lem-qEk} it follows that computing $\rpoly(\vct{X})$ is hard. \end{Corollary} -%Old proof -%%%%%%%%%%%%%%%%%%%%% -%Notice that ~\cref{lem:qE3-exp} is an example of a query that reduces to the hard problems in graph theory of counting triangles, three-matchings, three-paths, etc. Thus, in general, computing $\expct_{\vct{w}}\pbox{\poly(\vct{w})} = \rpoly\left(\prob_1,\ldots, \prob_\numvar\right)$ is a hard problem. 
-% -%\begin{Claim}\label{claim:four-two} -% If one can compute $\rpoly_{G}(\prob,\ldots, \prob)$ in time T(\numedge), then we can compute the following in O(T(\numedge) + \numedge): -%\[\numocc{G}{\tri} + \numocc{G}{\threepath} \cdot \prob - \numocc{G}{\threedis}\cdot(3\prob^2 - \prob^3).\] -%\end{Claim} -%\begin{proof}[Proof of Claim \ref{claim:four-two}] -%%We have shown that the following subgraph cardinalities can be computed in $O(\numedge)$ time: -%%\[\numocc{G}{\ed}, \numocc{G}{\twopath}, \numocc{G}{\twodis}, \numocc{G}{\oneint}, \numocc{G}{\twopathdis} + \numocc{G}{\threedis}.\] -%It has already been shown previously that $\numocc{G}{\ed}, \numocc{G}{\twopath}, \numocc{G}{\twodis},$ and $\numocc{G}{\twopathdis} + 3\numocc{G}{\threedis}$ can be computed in $O(\numedge)$ time. -% -%Using the result of \cref{lem:qE3-exp}, let us show a derivation to the identity of the consequent in \cref{claim:four-two}. -% -%All of \cref{eq:1e}, \cref{eq:2p}, \cref{eq:2m}, \cref{eq:3s}, \cref{eq:2pd-3d} show that we can compute the respective edge patterns in $O(\numedge)$ time. 
Rearrange ~\cref{claim:four-one}, $\rpoly_{G}$, with all linear time computations on one side, leaving only the hard computations, -%\begin{align} -%&\rpoly_{G}(\prob,\ldots, \prob) = \numocc{G}{\ed}\prob^2 + 6\numocc{G}{\twopath}\prob^3 + 6\numocc{G}{\twodis}\prob^4 + 6\numocc{G}{\oneint}\prob^4 + 6\numocc{G}{\tri}\prob^3 + 6\numocc{G}{\threepath}\prob^4 + 6\numocc{G}{\twopathdis}\prob^5 + 6\numocc{G}{\threedis}\prob^6\nonumber\\ -%&\rpoly_{G}(\prob,\ldots, \prob) - \numocc{G}{\ed}\prob^2 - 6\numocc{G}{\twopath}\prob^3 - 6\numocc{G}{\twodis}\prob^4 - 6\numocc{G}{\oneint}\prob^4 = 6\numocc{G}{\tri}\prob^3 + 6\numocc{G}{\threepath}\prob^4 + 6\numocc{G}{\twopathdis}\prob^5 + 6\numocc{G}{\threedis}\prob^6\label{eq:LS-rearrange}\\ -%&\frac{\rpoly_{G}(\prob,\ldots, \prob)}{6\prob^3} - \frac{\numocc{G}{\ed}}{6\prob} - \numocc{G}{\twopath} - \numocc{G}{\twodis}\prob - \numocc{G}{\oneint}\prob = \numocc{G}{\tri} + \numocc{G}{\threepath}\prob + \numocc{G}{\twopathdis}\prob^2 + \numocc{G}{\threedis}\prob^3\label{eq:LS-reduce}\\ -%&\frac{\rpoly_{G}(\prob,\ldots, \prob)}{6\prob^3} - \frac{\numocc{G}{\ed}}{6\prob} - \numocc{G}{\twopath} - \numocc{G}{\twodis}\prob - \numocc{G}{\oneint}\prob - \big(\numocc{G}{\twopathdis} + 3\numocc{G}{\threedis}\big)\prob^2 = \numocc{G}{\tri} + \numocc{G}{\threepath}\prob - \numocc{G}{\threedis}\left(3\prob^2 - \prob^3\right)\label{eq:LS-subtract} -%\end{align} -% -%\cref{eq:LS-rearrange} is the result of simply subtracting from both sides terms that have $O(\numedge)$ complexity. Dividing all terms by the common factor of $6\prob^3$ gives \cref{eq:LS-reduce}. Equation ~\ref{eq:LS-subtract}, is the result of subtracting the $O(\numedge)$ computable term $\left(\numocc{G}{\twopathdis} + 3\numocc{G}{\threedis}\right)\prob^2$ from both sides. 
-% -%%\begin{equation} -%%\frac{\rpoly_{G}(\prob,\ldots, \prob)}{6\prob^3} - \frac{\numocc{G}{\ed}}{6\prob} - \numocc{G}{\twopath} - \numocc{G}{\twodis}\prob - \numocc{G}{\oneint}\prob - \big(\numocc{G}{\twopathdis} + 3\numocc{G}{\threedis}\big)\prob^2 = \numocc{G}{\tri} + \numocc{G}{\threepath}\prob - \numocc{G}{\threedis}\left(3\prob^2 - \prob^3\right) -%%\end{equation} -% -% -%The implication in \cref{claim:four-two} follows by the above and \cref{lem:qE3-exp}. -%\end{proof} -%\qed -% -%\begin{Lemma}\label{lem:gen-p} -%If we can compute $\rpoly_{G}(\vct{X})$ in $T(\numedge)$ time for $O(1)$ distinct values $\vct{\prob}$ such that all $\prob_i = \prob$ for all $i \in [\numvar], \prob_i \in \vct{\prob}$, then we can count the number of triangles, 3-paths, and 3-matchings in $G$ in $T(\numedge) + O(\numedge)$ time. -%\end{Lemma} -% -%\begin{proof}[Proof of \cref{lem:gen-p}] -% -%\cref{claim:four-two} says that if we know $\rpoly_{G}(\prob,\ldots, \prob)$, then we can know in O(\numedge) additional time -%\[\numocc{G}{\tri} + \numocc{G}{\threepath} \cdot \prob - \numocc{G}{\threedis}\cdot(3\prob^2 - \prob^3).\] We can think of each term in the above equation as a variable, where one can solve a linear system given 3 distinct $\prob$ values, assuming independence of the three linear equations. In the worst case, without independence, 4 distinct values of $\prob$ would suffice. 
This follows from the fact that the corresponding coefficient matrix is the so called Vandermonde matrix, which has full rank -%\end{proof} -%\qed \ No newline at end of file diff --git a/retracted_bidb_stuff.tex b/retracted_bidb_stuff.tex new file mode 100644 index 0000000..e7c6847 --- /dev/null +++ b/retracted_bidb_stuff.tex @@ -0,0 +1,451 @@ +%root = main.tex +\AH{\large\bf{New stuff 092520.}} + +\begin{Claim}\label{claim:constpk-TI} +Given a positive query polynomial $\poly$ over a $\ti$, with constant $\prob$ such that there exists a $\prob_0$ where for all $\prob_i, \prob_0 \leq \prob_i$, and constant $k = \degree(\poly)$, the ratio $\frac{\abs{\etree}(1,\ldots, 1)}{\rpoly(\prob_1,\ldots, \prob_\numvar)}$ is constant. +\end{Claim} + +\begin{proof}[Proof of Claim ~\ref{claim:constpk-TI}] +By independence, a $\ti$ has the property that all of its annotations are positive. Combined with the fact that ~\cref{claim:constpk-TI} uses only positive queries, i.e., queries that only use $\oplus$ and $\otimes$ semiring operators over its polynomial annotations, it is the case that no negation exists pre or post query. + +For any $\poly$ then, it is true that all coefficients in $\abs{\etree}(1,\ldots, 1)$ are positive and thus the same as their $\rpoly$ counterparts. This then implies that the ratio $\frac{\abs{\etree}(1,\ldots, 1)}{\rpoly(\prob_1,\ldots, \prob_\numvar)} \leq \frac{\abs{\etree}(1,\ldots, 1)}{\abs{\etree}(1,\ldots, 1) \cdot \prob_0^k}$, which is indeed a constant. +\end{proof} + +\qed + +\subsection{$\rpoly$ over $\bi$} +\AH{A general sufficient condition is the $\bi$ having fixed block size (thus implying increasing number of blocks for growing $\numvar$). 
For increasing $\numvar$, the ratio $\frac{\abs{\etree}(1,\ldots, 1)}{\rpoly(\prob_1,\ldots, \prob_\numvar)}$ can be proven to be a constant since, as $\numvar$ increases, it has to be the case that new blocks are added, and this results in a constant number of terms cancelled out by $\rpoly$, with the rest surviving, which gives us a constant $\frac{\abs{\etree}(1,\ldots, 1)}{\rpoly(\prob_1,\ldots, \prob_\numvar)}$. +\par In the general case, with fixed number of blocks and growing $\numvar$, all additional terms will be cancelled out by $\rpoly$ while for $\abs{\etree}(1,\ldots, 1)$ it is the case that it will grow exponentially with $\numvar$, yielding a ratio $\frac{O(2^\numvar)}{O(1)}$ and (as will be seen) greater.} + +\subsubsection{Known Reduction Result $\bi \mapsto \ti$} + +Denote an arbitrary $\bi$ as $\bipdb = (\bipd, \biwset)$ and a constructed $\ti$ to be $\tipdb = (\tipd, \tiwset)$, the details to be described next. +It is well known that $\bipdb$ can be reduced to a query $\poly$ over $\tipdb$. For completeness, let us describe the reduction. + +Let tuples in $\bipdb$ be denoted $a_{\block, i}$ and their $\tipdb$ counterparts as $x_{\block, i}$, where $\block$ represents the block id in which $a_{\block, i}$ resides. + +\begin{Theorem}\label{theorem:bi-red-ti} +For any $\bipdb$, there exists a query $\poly$ and $\tipdb$ such that $\poly(\tiwset)$ over distribution $\tipd$ outputs elements in $\biwset$ according to their respective probabilities in $\bipd$. +\end{Theorem} + +\begin{Definition}[Total Ordering $\biord$]\label{def:bi-red-ti-order} +The order $\biord$ is a fixed total order across all tuples in block $\block$ of $\bipdb$. +\end{Definition} +\begin{Definition}[Query $\poly$]\label{def:bi-red-ti-q} +$\poly$ is constructed to map all possible worlds of $\db_{ti} \in \tiwset$ for which $x_i$ is the greatest according to $\biord$, to the worlds $\vct{w}$ in $\biwset$ in which $a_{\block, i}$ is present and $\bipd(\vct{w}) > 0$. 
Recall the constraint on $\bipdb$ to be that if $a_{\block, i}$ is present, then it is the case that for all $j \neq i$, tuple $a_{\block, j}$ is not present. For $\bipdb$ with exactly one block, all such worlds $\db_{ti}$ are mapped to the world $\{a_i\}$. +\end{Definition} + +For simplicity, we will consider $\bipdb$ to consist of one block $\block$. By independence of blocks in $\bi$, the proofs below immediately generalize to the case of $\bipdb$ with multiple blocks\textcolor{blue}{...umm, we'll see, we made need to argue this}. + +The reduction consists of the construction of a query $\poly$ and $\tipdb$ such that $\poly$ is computed over $\tipdb$. To construct the $\tipdb$ given an arbitrary $\bipdb$ a tuple alternative $a_{\block, i}$ is transcribed to a tuple in $\tipdb$ with probability + +\begin{equation} + P(x_{b, i}) = \begin{cases} + \frac{P(a_{\block, i})}{\prod_{j = 1}^{i - 1}(1 - P(x_{\block, j}))} &\textbf{if }i > 1\\ + P(a_i) &\textbf{if } i = 1. + \end{cases}\label{eq:bi-red-ti-func} +\end{equation} + +The above is more simply written as + +\begin{equation*} +\tipd(x_{\block, i}) = \frac{P(a_{\block, i})}{1 - \sum_{j = 1}^{i - 1} P(a_{\block, j})} +\end{equation*} + +The above mapping is applied across all tuples of $\bipdb$. + +This method for computing the probabilities of the tuples in $\tipdb$ allows for the following. According to $\biord$, the powerset of possible worlds is mapped in such a way that the first ordered tuple appearing in a possible world $\db_{\tiabb}$of $\tiwset$ has that world mapped to the world $\db_{\biabb} \in \biwset$ where $a_{\block, i}$ is present with $\bipd(\db_{\biabb}) > 0$. Recall that since we are considering a $\bi$ with one block, there is only one such world in $\biwset$. + +\begin{Lemma}\label{lem:bi-red-ti-prob} +The sum of the probabilities of all $\db_{\tiabb} \in \tiwset$ database worlds mapped to a a given tuple $x_{b, i}$ equals the probability of the tuple $a_{\block, i}$ in the original $\bipdb$. 
+\end{Lemma} + +\begin{proof}[Proof of Lemma ~\ref{lem:bi-red-ti-prob}] +The proof is by induction. Given a tuple $a_{\block, i}$ in $\bipdb$ such that $1 \leq i \leq \abs{b}$, (where $\abs{b}$ denotes the number of alternative tuples in block $\block$), by ~\cref{eq:bi-red-ti-func} $P(x_{\block, i}) = \frac{P(a_{\block, i})}{1 \cdot \prod_{j = 1}^{i - 1} (1 - P(x_{\block, j}))}$. + +For the base case, we have that $i = 1$ which implies that $P(x_{\block, i}) = P(a_{\block, i})$ and the base case is satisfied. + +%Other neat tidbits include that $\abs{b} = 1$, the set $b = \{a_1\}$, and the powerset $2^b = \{\emptyset, \{1\}\} = \tiwset$. For coolness, also see that $P(\neg x_i) = 1 - P(x_i) = 1 - P(a_i) = \emptyset$, so there is, in this case, a one to one correspondence of possible worlds and their respective probabilities in both $\ti$ and $\bi$, but this is extraneous information for the proof. + +The hypothesis is then that for $k \geq 1$ tuple alternatives, ~\cref{lem:bi-red-ti-prob} holds. + +For the inductive step, prove that ~\cref{lem:bi-red-ti-prob} holds for $k + 1$ alternatives. By definition of the query $\poly$ (~\cref{def:bi-red-ti-q}), it is a fact that only the world $\wElem_{x_{\block, k + 1}} = \{x_{\block, k + 1}\}$ in the set of possible worlds is mapped to $\bi$ world $\{a_{\block, k + 1}\}$. Then for world $\wElem_{x_{\block, k + 1}}$ it is the case that $P(\wElem_{x_{\block, k + 1}}) = \prod_{j = 1}^{k} (1 - P(x_j)) \cdot P(x_{\block k + 1})$. Since by ~\cref{eq:bi-red-ti-func} $P(x_{\block, k + 1}) = \frac{P(a_{\block, k + 1})}{\prod_{j = 1}^{k}(1 - P(x_{\block, j}))}$, we get +\begin{align*} +P(\wElem_{x_{\block, k + 1}}) =& \prod_{j = 1}^{k} (1 - P(x_{\block, j})) \cdot P(x_{\block, k + 1})\\ +=&\prod_{j = 1}^{k} (1 - P(x_{\block, j})) \cdot \frac{P(a_{\block, k + 1})}{\prod_{j = 1}^{k}(1 - P(x_{\block, j}))}\\ +=&P(a_{\block, k + 1}). 
+\end{align*} +\end{proof} + +\qed + +This leaves us with the task of constructing a query $\poly$ over $\tipdb$ to perform the desired mapping of possible worlds. Setting $\poly$ to the following query yields the desired result. +\begin{lstlisting} +SELECT A FROM TI as a + WHERE A = 1 OR + OR A = 2 AND NOT EXISTS(SELECT A FROM TI as b + WHERE A = 1 AND a.blockID = b.blockID) + $\vdots$ + OR A = $|$b.blockID$|$ AND NOT EXISTS(SELECT A FROM TI as b + WHERE A = 1 OR A = 2 $\ldots$ A = $|$b.blockID$|$ AND a.blockID = b.blockID +\end{lstlisting} + +\begin{Lemma}\label{lem:bi-red-ti-q} +The query $\poly$ satisfies the requirements of ~\cref{def:bi-red-ti-q}. +\end{Lemma} + +\begin{proof}[Proof of Lemma ~\ref{lem:bi-red-ti-q}] +For any possible world in $2^b$, notice that the WHERE clause selects the tuple with the greatest ordering in the possible world. For all other tuples, disjunction of predicates dictates that no other tuple will be in the output by mutual exclusivity of the disjunction. Thus, it is the case for any $\ti$ possible world, that the tuple $x_{\block, i}$ with the greatest ordering appearing in that possible world will alone be in the output, and all such possible worlds with $x_{\block, i}$ as the greatest in the ordering will output the same world corresponding to the $\bi$ world for the disjoint tuple $a_{\block, i}$. +\end{proof} + +\qed + +\begin{proof}[Proof of Theorem ~\ref{theorem:bi-red-ti}] + +For multiple blocks in $\bipdb$, note that the above reduction to $\poly(\tipdb)$ with multiple 'blocks' will behave the same as $\bipdb$ since the property of independence for $\ti$ ensures that all tuples in the $\ti$ will have the same marginal probability across all possible worlds as their tuple probability, regardless of how many tuples and, thus, worlds the $\tipdb$ has. Note that this propety is unchanging no matter what probabilities additional tuples in $\tipdb$ are assigned. + +To see this consider the following. 
+\begin{Lemma}\label{lem:bi-red-ti-ind} +For any set of independent variables $S$ with size $\abs{S}$, when adding another distinct independent variable $y$ to $S$ with probability $\prob_y$, it is the case that the probability of each variable $x_i$ in $S$ remains unchanged. +\AH{This may be a well known property that I might not even have the need to prove, but since I am not certain, here goes.} +\end{Lemma} + +\begin{proof}[Proof of Lemma ~\ref{lem:bi-red-ti-ind}] +The proof is by induction. For the base case, consider a set of one element $S = \{x\}$ with probability $\prob_x$. The set of possible outcomes includes $2^S = \{\emptyset, \{x\}\}$, with $P(\emptyset) = 1 - \prob_x$ and $P(x) = p_x$. Now, consider $S' = \{y\}$ with $P(y) = \prob_y$ and $S \cup S' = \{x, y\}$ with the set of possible outcomes now $2^{S \cup S'} = \{\emptyset, \{x\}, \{y\}, \{x, y\}\}$. The probabilities for each world then are $P(\emptyset) = (1 - \prob_x)\cdot(1 - \prob_y), P(x) = \prob_x \cdot (1 - \prob_y), P(y) = (1 - \prob_x)\cdot \prob_y$, and $P(xy) = \prob_x \cdot \prob_y$. For the worlds where $x$ appears we have + +\[P(x) + P(xy) = \prob_x \cdot (1 - \prob_y) + \prob_x \cdot \prob_y = \prob_x \cdot \left((1 - \prob_y) + \prob_y\right) = \prob_x \cdot 1 = \prob_x.\] +Thus, the base case is satisfied. + +For the hypothesis, assume that $\abs{S} = k$ for some $k \geq 1$, and for $S'$ such that $\abs{S'} = 1$ where its element is distinct from all elements in $S$, the probability of each independent variable in $S$ is the same in $S \cup S'$. + +For the inductive step, let us prove that for $\abs{S_{k + 1}} = k + 1$ elements, adding another element will not change the probabilities of the independent variables in $S$. By the hypothesis, that $S_k \cup S_{k + 1}$, all probabilities in $S_k$ remained untainted after the union. Now consider a set $S' = \{z\}$ and the union $S_{k + 1} \cup S'$. 
Since all variables are distinct and independent, it is the case that the set of possible outcomes of $S_{k + 1} \cup S' = 2^{S_{k + 1} \cup S'}$ with $\abs{2^{S_{k + 1} \cup S'}} = 2^{\abs{S_{k + 1}} + \abs{S'}}$ since $\abs{S_{k + 1}} + \abs{S'} = \abs{S_{k + 1} \cup S'}$. Then, since $2^{\abs{S_{k + 1}} + \abs{S'}} = 2^{\abs{S_{k + 1}}} \cdot 2^{\abs{S'}}$, and $2^{S'} = \{\emptyset, \{x\}\}$, it is the case that all elements in the original set of out comes will appear \textit{exactly one} time without $z$ and \textit{exactly one }time with $z$, such that for element $x \in 2^{S_{k + 1}}$ with probability $\prob_x$ we have $P(x\text{ }OR\text{ }xz) = \prob_x \cdot (1 - \prob_z) + \prob_x \cdot \prob_z = \prob_x\cdot \left((1 - z) + z\right) = \prob_x \cdot 1 = \prob_x$, and the probabilities remain unchanged, and, thus, the marginal probabilities for each variable in $S_{k + 1}$ across all possible outcomes remain unchanged. +\end{proof} + +\qed + +The repeated application of ~\cref{lem:bi-red-ti-ind} to any 'block' of independent variables in $\tipdb$ provides the same result as joining two sets of distinct elements of size $\abs{S_1}, \abs{S_2} > 1$. + +Thus, by lemmas ~\ref{lem:bi-red-ti-prob}, ~\ref{lem:bi-red-ti-q}, and ~\ref{lem:bi-red-ti-ind}, the proof follows. +\end{proof} + +\qed + +\subsubsection{General results for $\bi$}\label{subsubsec:bi-gen} +\AH{One thing I don't see in the argument below is that as $\numvar \rightarrow \infty$, we have that $\prob_0 \rightarrow 0$.} +The general results of approximating a $\bi$ using the reduction and ~\cref{alg:mon-sam} do not allow for the ratio $\frac{\abs{\etree}(1,\ldots, 1)}{\rpoly(\prob_1,\ldots, \prob_\numvar)}$ to be a constant. Consider the following example. + +Let monomial $y_i = P(x_i) \cdot \prod_{j = 1}^{i - 1}(1 - P(x_j))$ Let $\poly(\vct{X}) = \sum_{i = 1}^{\numvar}y_i$. 
Note that this query output can exist on a projection for which each tuple agrees on the projected values of the query in a $\bi$ consisting of one block and $\numvar$ tuples. + +First, let's analyze the numerator $\abs{\etree}(1,\ldots, 1)$. Expanding $\abs{\etree}$ yields $X_i + (1 + X_1)\cdot X_2 + \cdots + (1 + X_1)\cdot(1 + X_2)\cdots(1 + X_{\numvar - 1})\cdot X_n$ which yields a geometric series $S_{\abs{\etree}} = 2^0 + 2^1 +\cdots+2^{\numvar - 1}$. We can perform the following manipulations to obtain the following closed form. + +\begin{align*} +2 \cdot S_{\abs{\etree}} =& 2^1 +\cdots+2^\numvar = 2^{\numvar} + S_2 - 1 \\ +S_{\abs{\etree}} =& 2^{\numvar + 1} - 1 +\end{align*} + +So, then $\abs{\etree}(1,\ldots, 1) = 2^{\numvar} - 1$. + +On the other hand, considering $\rpoly(\prob_1,\ldots, \prob_\numvar)$, since we are simply summing up the probabilities of one block of disjoint tuples (recall that $P(x_i) = \frac{P(a_i)}{1\cdot\prod_{j = 1}^{i - 1}(1 - P(x_j))}$ in the reduction for $a_i$ the original $\bi$ probability), it is the case that $\rpoly(\prob_1,\ldots, \prob_\numvar) \leq 1$, and the ratio $\frac{\abs{\etree}(1,\ldots, 1)}{\rpoly(\prob_1,\ldots, \prob_\numvar)}$ in this case is exponential $O(2^\numvar)$. Further note that setting $\poly(\vct{X}) = \sum_{i = 1}^{\numvar} y_i^k$ will yield an $O(2^{\numvar \cdot k})$ bound. +\subsubsection{Sufficient Condition for $\bi$ for linear time Approximation Algorithm} + +Let us introduce a sufficient condition on $\bipdb$ for a linear time approximation algorithm. + +\AH{Lemma ~\ref{lem:bi-suf-cond} is not true for the case of $\sigma$, where a $\sigma(\bowtie)$ query could select tuples from the same block, and self join them such that all tuples cancel out. 
We need a definition for 'safe' (in this context) queries, to prove the lemma.} +\begin{Lemma}\label{lem:bi-suf-cond} +For $\bipdb$ with fixed block size $\abs{b}$, the ratio $\frac{\abs{\etree}(1,\ldots, 1)}{\rpoly(\prob_1,\ldots, \prob_\numvar)}$ is a constant. +\end{Lemma} + +\AH{Two observations. +\par +1) I am not sure that the argument below is correct, as I think we would still get something exponential in the numerator $\abs{\etree}(1,\ldots, 1)$. +\par2) I \textit{think} a similar argument will hold however for the method of not using the reduction.} +\begin{proof}[Prood of Lemma ~\ref{lem:bi-suf-cond}] +For increasing $\numvar$ and fixed block size $\abs{b}$ in $\bipdb$ given query $\poly = \sum_{i = 1}^{\numvar}$ where $y_i = x_i \cdot \prod_{j = 1}^{i - 1} (1 - x_j)$, a query whose output is the maximum possible output, it has to be the case as seen in ~\cref{subsubsec:bi-gen} that for each block $b$, $\rpoly(\prob_{b, 1},\ldots, \prob_{b, \abs{b}}) = P(a_{b, 1}) + P(a_{b, 2}) + \cdots + P(a_{b, \abs{b}})$ for $a_i$ in $\bipdb$. As long as there exists no block in $\bipdb$ such that the sum of alternatives is $0$ (which by definition of $\bi$ should be the case), we can bound the $\rpoly(p_1,\ldots, \prob_\numvar) \geq \frac{\prob_0 \cdot \numvar}{\abs{\block}}$ for $\prob_0 > 0$, and then we have that $\frac{\abs{\etree}(1,\ldots, 1)}{\rpoly(\prob_1,\ldots, \prob_\numvar)}$ is indeed a constant. +\end{proof} + +\qed + +Given a $\bipdb$ satisfying ~\cref{lem:bi-suf-cond}, it is the case by ~\cref{lem:approx-alg} that ~\cref{alg:mon-sam} runs in linear time. + +\AH{\Large \bf{092520 -- 100220 New material.}} + +\section{Algorithm ~\ref{alg:mon-sam} for $\bi$} + +We may be able to get a better run time by developing a separate approximation algorithm for the case of $\bi$. Instead performing the reduction from $\bi \mapsto \poly(\ti)$, we decide to work with the original variable annotations given to each tuple alternative in $\bipdb$. 
For clarity, let us assume the notation of $\bivar$ for the annotation of a tuple alternative. The algorithm yields $0$ for any monomial sampled that cannot exist in $\bipdb$ due to the disjoint property characterizing $\bi$. The semantics for $\rpoly$ change in this case. $\rpoly$ not only performs the same modding function, but also sets all monomial terms to $0$ if they contain variables which appear within the same block. + +\begin{algorithm}[H] + \caption{$\approxq_{\biabb}$($\etree$, $\vct{p}$, $\conf$, $\error$, $\bivec$)} + \label{alg:bi-mon-sam} + \begin{algorithmic}[1] + \Require \etree: Binary Expression Tree + \Require $\vct{p} = (\prob_1,\ldots, \prob_\numvar)$ $\in [0, 1]^N$ + \Require $\conf$ $\in [0, 1]$ + \Require $\error$ $\in [0, 1]$ + \Require $\bivec$ $\in [0, 1]^{\abs{\block}}$\Comment{$\abs{\block}$ is the number of blocks} + \Ensure \vari{acc} $\in \mathbb{R}$ + \State $\vari{sample}_\vari{next} \gets 0$ + \State $\accum \gets 0$\label{alg:mon-sam-global1} + \State $\numsamp \gets \ceil{\frac{2 \log{\frac{2}{\conf}}}{\error^2}}$\label{alg:mon-sam-global2} + \State $(\vari{\etree}_\vari{mod}, \vari{size}) \gets $ \onepass($\etree$)\label{alg:mon-sam-onepass}\Comment{$\onepass$ is ~\cref{alg:one-pass} \;and \sampmon \; is ~\cref{alg:sample}} + \For{\vari{i} \text{ in } $1\text{ to }\numsamp$}\Comment{Perform the required number of samples} + \State $(\vari{M}, \vari{sgn}_\vari{i}) \gets $ \sampmon($\etree_\vari{mod}$)\label{alg:mon-sam-sample} + \For{$\vari{x}_\vari{\block,i}$ \text{ in } $\vari{M}$} + \If{$\bivec[\block] = 1$}\Comment{If we have already had a variable from this block, $\rpoly$ drops the sample.} + \State $\vari{sample}_{\vari{next}} \gets 1$ + \State break + \Else + \State $\bivec[\block] = 1$ +% \State $\vari{sum} = 0$ +% \For{$\ell \in [\abs{\block}]$} +% \State $\vari{sum} = \vari{sum} + \bivec[\block][\ell]$ +% \EndFor +% \If{$\vari{sum} \geq 2$} +% \State $\vari{sample}_{\vari{next}} \gets 1$ +% \State 
continue\Comment{Not sure for psuedo code the best way to state this, but this is analogous to C language continue statement.} + \EndIf + \EndFor + \If{$\vari{sample}_{\vari{next}} = 1$} + \State $\vari{sample}_{\vari{next}} \gets 0$ + \State continue + \EndIf + \State $\vari{Y}_\vari{i} \gets 1$\label{alg:mon-sam-assign1} + \For{$\vari{x}_{\vari{j}}$ \text{ in } $\vari{M}$}%_{\vari{i}}$} + \State $\vari{Y}_\vari{i} \gets \vari{Y}_\vari{i} \times \; \vari{\prob}_\vari{j}$\label{alg:mon-sam-product2} \Comment{$\vari{p}_\vari{j}$ is the assignment to $\vari{x}_\vari{j}$ from input $\vct{p}$} + \EndFor + \State $\vari{Y}_\vari{i} \gets \vari{Y}_\vari{i} \times\; \vari{sgn}_\vari{i}$\label{alg:mon-sam-product} + \State $\accum \gets \accum + \vari{Y}_\vari{i}$\Comment{Store the sum over all samples}\label{alg:mon-sam-add} + \EndFor + + \State $\vari{acc} \gets \vari{acc} \times \frac{\vari{size}}{\numsamp}$\label{alg:mon-sam-global3} + \State \Return \vari{acc} + \end{algorithmic} +\end{algorithm} + +Before redefining $\rpoly$ in terms of the $\bi$ model, we need to define the notion of performing a mod operation with a set of polynomials. + +\begin{Definition}[Mod with a set of polynomials]\label{def:mod-set-poly} +To mod a polynomial $\poly$ with a set $\vct{Z} = \{Z_1,\ldots Z_x\}$ of polynomials, the mod operation is performed successively on the $\poly$ modding out each element of the set $\vct{Z}$ from $\poly$. 
+\end{Definition} + +\begin{Example}\label{example:mod-set-poly} +To illustrate for $\poly = X_1^2 + X_1X_2^3$ and the set $\vct{Z} = \{X_1^2 - X_1, X_2^2 - X_2, X_1X_2\}$ we get + +\begin{align*} +&X_1^2 + X_1X_2^3 \mod X_1^2 - X_1 \mod X_2^2 - X_2 \mod X_1X_2\\ +=&X_1 + X_1X_2^3 \mod X_2^2 - X_2 \mod X_1X_2\\ +=&X_1 + X_1X_2 \mod X_1X_2\\ +=&X_1 +\end{align*} + +\end{Example} + +\begin{Definition}[$\rpoly$ for $\bi$ Data Model]\label{def:bi-alg-rpoly} +$\rpoly(\vct{X})$ over the $\bi$ data model is redefined to include the following mod operation in addition to definition ~\ref{def:qtilde}. For every $j \neq i$, we add the operation $\mod X_{\block, i}\cdot X_{\block, j}$. For set of blocks $\mathcal{B}$ and the size of block $\block$ as $\abs{\block}$, + +\[\rpoly(\vct{X}) = \poly(\vct{X}) \mod \{X_{\block, i}^2 - X_{\block, i} \st \block \in \mathcal{B}, i \in [\abs{\block}]\} \cup_{\block \in \mathcal{B}} \{X_{\block, i}X_{\block, j} \st i, j \in [\abs{\block}], i \neq j\} +% \mod X_{\block_1, 1}^2 - X_{\block_1, 1} \cdots \mod X_{\block_k, \abs{\block_k}}^2 - X_{\block_k, \abs{\block_k}} \mod X_{b_1, 1} \cdot X_{b_1, 2}\cdots \mod X_{\block_1, \abs{\block_1} -1} \cdot X_{\block, \abs{\block_1}}\cdots \mod X_{\block_k, 1} \cdot X_{\block_k, 2} \cdots \mod X_{\block_k, \abs{\block_k} - 1}\cdot X_{\block_K, \abs{\block_k}}. +\] +\end{Definition} + +\subsection{Correctness} +\begin{Theorem}\label{theorem:bi-approx-rpoly-bound} +For any query polynomial $\poly(\vct{X})$, an approximation of $\rpoly(\prob_1,\ldots, \prob_\numvar)$ in the $\bi$ setting can be computed in $O\left(\treesize(\etree) + \frac{\log{\frac{1}{\conf}}\cdot \abs{\etree}^2(1,\ldots, 1)}{\error^2\cdot\rpoly^2(\prob_1,\ldots, \prob_\numvar)}\right)$, with multiplicative $(\error,\delta)$-bounds, where $k$ denotes the degree of $\poly$. 
+\end{Theorem} + +\begin{proof}[Proof of Theorem ~\ref{theorem:bi-approx-rpoly-bound}] +By the proof of ~\cref{lem:approx-alg}, with a minor adjustment on $\evalmp$, such that we define the function to output $0$ for any monomial sharing disjoint variables, coupled with the fact that additional operations in ~\cref{alg:bi-mon-sam} are $O(1)$ occuring at most $k$ times for each of the $\numsamp$ samples, the proof of ~\cref{theorem:bi-approx-rpoly-bound} immediately follows. +\end{proof} + +\qed + +\subsection{Safe Query Class for $\bi$} +We want to analyze what is the class of queries and data restrictions that are necessary to guarantee that $\frac{\abs{\etree}(1,\ldots, 1)}{\rpoly(\prob_{1},\ldots, \prob_{\numvar})}$ is $O(1)$. + +\subsubsection{When $\rpoly$ is zero} + +First, consider the case when $\rpoly$ cancels out all terms in $\poly$, where $\poly \neq \emptyset$. For $\rpoly$ to cancel out a tuple $\tup$, by ~\cref{def:bi-alg-rpoly} it must be the case that output tuple $\tup$ is dependent on two different tuples appearing in the same block. For this condition to occur, it must be that the query $\poly$ contains a self join operation on a table $\rel$, from which $\tup$ has been derived. + +Certain conditions on both the data and query must exist for all tuples $\tup$ to be cancelled out by $\rpoly$ as described above. + +For $\rpoly$ to be $0$, the data of a $\bi$ must satisfy certain conditions. + +\begin{Definition}[Data Restrictions]\label{def:bi-qtilde-data} +Consider $\bi$ table $\rel$. For $\rpoly$ to potentially cancel all its terms, $\rel$ must be such that given a self join, the join constraints remain unsatisfied for all tuple combinations $x_{\block_i, \ell} \times x_{\block_j, \ell'}$ for $i \neq j$, $\ell \in [\abs{\block_i}], \ell' \in [\abs{\block_j}]$, i.e. combinations across different blocks. Note that this is trivially satisfied with a $\rel$ composed of just one block. 
Further, it must be the case that the self join constraint is only satisfied in one or more crossterm combinations $x_{\block, i} \times x_{\block_j}$ for $i \neq j$, i.e., within the same block of the input data. +\end{Definition} + +To be precise, only equijoins are considered in the following definition. Before preceding, note that a natural self join will never result in $\rpoly$ cancelling all terms, since it is the case that each tuple will necessarily join with itself, and $\rpoly$ will not mod out this case. Also, although we are using the term self join, we consider cases such that query operations over $\rel$ might be performed on each join input prior to the join operation. While technically the inputs may not be the same set of tuples, this case must be considered, since all the tuples originate from the table $\rel$. To this end, let $\poly_1(\rel) = S_1$ and $\poly_2(\rel) = S_2$ be the input tables to the join operation. +\begin{Definition}[Class of Cancelling Queries]\label{def:bi-qtilde-query-class} +When ~\cref{def:bi-qtilde-data} is satisfied, it must be that $\poly$ contains a join $S_1 \bowtie_\theta S_2$ such that either% that satisfies the following constraints based on its structure. + +\textsc{Case 1:} $S_1 \cap S_2 = \emptyset$ + +%Any join over this structure will produce a $\poly$ such that $\rpoly$ cancels all monomials out. +%Such a condition implies $\rpoly$ is $0$ regardless of join condition $\theta$. Note the beginning premise of this definition, and the fact that such premise rules out the natural join across all attributes, since we would have that $\poly = \rpoly = 0$. +Or + +\textsc{Case 2:} $S_1 \cap S_2 \neq \emptyset$, the attributes in the join predicate are non-matching, i.e., neither operand of the comparison is a strict subset of the other, and no input tuple has agreeing values across the join attributes. 
+ + +%\begin{enumerate} +% \item When the join condition $\theta$ involves equality between matching attributes, it must be that the attributes of the join conditon $\attr{\theta}$ are a strict subset of $\attr{\rel}$. Then, to satisfy ~\cref{def:bi-qtilde-data} it must be that the join input consists of non-intersecting strict subsets of $\rel$, meaning $S_1 \cap S_2 = \emptyset$ and $S_1, S_2 \neq \emptyset$. $\poly_1$ in ~\cref{ex:bi-tildeq-0} illustrates this condition. +% \item If $\theta$ involves an equality on non-matching attributes, there exist two cases. +% \begin{enumerate} +% \item The first case consists of when the join inputs intersect, i.e., $S_1 \cap S_2 \neq \emptyset$ . To satisfy ~\cref{def:bi-qtilde-data} it must be the case that no tuple can exist with agreeing values across all attributes in $\attr{\theta}$. $\poly_3$ of ~\cref{ex:bi-tildeq-0} demonstrates this condition. +% \item The second case consists of when $S_1 \cap S_2 = \emptyset$ and $S_1, S_2 \neq \emptyset$ in the join input, and this case does not contradict the requirements of ~\cref{def:bi-qtilde-query-class}. This case is illustrated in $\poly_2$ of ~\cref{ex:bi-tildeq-0}. +% \end{enumerate} +%\end{enumerate}% , cause $\rpoly$ to be $0$ must have the following characteristics. First, there must be a self join. Second, prior to the self join, there must be operations that produce non-intersecting sets of tuples for each block in $\bi$ as input to the self join operation. +\end{Definition} + +In ~\cref{ex:bi-tildeq-0}, $\poly_1$ and $\poly_2$ are both examples of \textsc{Case 1}, while $\poly_3$ is an example of \textsc{Case 2}. + +\begin{Theorem}\label{theorem:bi-safe-q} +When both ~\cref{def:bi-qtilde-data} and ~\cref{def:bi-qtilde-query-class} are satisfied, $\rpoly$ cancels out all monomials. +\end{Theorem} + +\begin{proof}[Proof of Theorem ~\ref{theorem:bi-safe-q}] +Starting with the case that $S_1 \cap S_2 = \emptyset$. 
When this is the case, by definition, all joins on tuples in $S_1$ and $S_2$ will involve elements in $S_1 \times S_2$ such that both tuples are distinct. Further, ~\cref{def:bi-qtilde-data} rules out joins across different blocks, while calling for joins of the above form within the same block. Thus all tuples in the query output are dependent on more than one tuple from the same block, thus implying by ~\cref{def:bi-alg-rpoly} that $\rpoly$ will cancel all monomials.
+
+For the next case where $S_1 \cap S_2 \neq \emptyset$, note that there exists at least one tuple in both $S_1$ and $S_2$ that is the same. Therefore, all equijoins involving matching attributes will produce at least one self joined tuple in the output, breaking the last property of ~\cref{def:bi-qtilde-data}. For the case of equijoins with predicates involving non-matching attribute operands, note that by definition of equijoin, the only case that a tuple shared in both $S_1$ and $S_2$ can join on itself is precisely when that tuple's values agree across all the join attributes in $\theta$. Thus, it is the case that when $S_1 \cap S_2 \neq \emptyset$ and the join predicate involves equality comparison between non-matching attributes such that the values of the non-matching comparison attributes for each tuple in $\{S_1 \cap S_2\}$ do not agree, we have that ~\cref{def:bi-qtilde-data} is not contradicted, and when ~\cref{def:bi-qtilde-data} is fulfilled, it must be the case that $\poly \neq 0$ while $\rpoly = 0$.
+
+This concludes the proof.
+\end{proof}
+
+\qed
+
+
+Note then that the class of queries described in ~\cref{def:bi-qtilde-query-class} belongs to the set of queries containing some form of selection over self cross product.
+%\begin{proof}[Proof of Lemma ~\ref{lem:bi-qtilde-data}] +%\end{proof} +%\begin{proof}[Proof of Lemma ~\ref{lem:bi-qtilde-query-class}] +%\end{proof} + + +%%%%%%%%%%%%%%%%%%%%%%% + +%The condition that causes $\rpoly(\prob_1,\ldots, \prob_\numvar)$ to be $0$ is when all the output tuples in each block cancel each other out. Such occurs when the annotations of each output tuple break the required $\bi$ property that tuples in the same block must be disjoint. This can only occur for the case when a self-join outputs tuples each of which have been joined to another tuple from its block other than itself. +% +%The observation is then the following. In order for such a condition to occur, we must have a query that is a self-join such that the join is on two different sets of atoms for each block. This condition can occur when inner query operations with different constraints on input table $\rel$ produce two non-intersecting sets of tuples and then performs a self join on them, such that the join condition \textit{only} holds for tuples that are members of the same block. +% +%There are two operators that can produce the aforementioned selectivity. First, consider $\sigma$, where two different selection conditions $\theta_1$ and $\theta_2$ over $\rel$ can output sets $S_{\sigma_{\theta_1}}$ and $S_{\sigma_{\theta_2}}$ where $S_{\sigma_{\theta_1}} \cap S_{\sigma_{\theta_2}} = \emptyset$. A join over these two outputs can produce an ouput $\poly$ where all annotations will be disjoint and $\rpoly$ will effectively cancel them all out. Second, consider the projection operator $\pi$, such that projections over $\rel$ which project on different attributes can output two non-intersecting sets of tuples, which when joined, again, provided that the join condition holds only for tuples appearing in the same block, can output tuples all of which will break the disjoint requirement and $\rpoly$ will cancel them out. 
+ +\begin{Example}\label{ex:bi-tildeq-0} +Consider the following $\bi$ table $\rel$ consisting of one block, with the following queries $\poly_1 = \sigma_{A = 1}(\rel)\bowtie_{B = B'} \sigma_{A = 2}(\rel)$, $\poly_2 = \sigma_{A = 1}(\rel)\bowtie_{A = B'} \sigma_{A = 2}(\rel)$, and $\poly_3 = \rel \bowtie_{A = B} \rel$. While the output $\poly_i \neq \emptyset$, all queries have that $\rpoly_i = 0$. Since $\rel$ consists of only one block, we will use single indexing over the annotations. +\end{Example} + + +\begin{figure}[ht] + \begin{tabular}{ c | c c c } + \rel & A & B & $\phi$\\ + \hline + & 1 & 2 & $x_1$\\ + & 2 & 1 & $x_2$\\ + & 1 & 3 & $x_3$\\ + & 3 & 1 & $x_4$\\ + \end{tabular} + \caption{Example~\ref{ex:bi-tildeq-0} Table $\rel$} + \label{fig:bi-ex-table} +\end{figure} +%%%%%%%%%%Query 1 and 2 +\begin{figure}[ht] + \begin{subfigure}{0.2\textwidth} + \centering + \begin{tabular}{ c | c c c } + $\sigma_{\theta_{A = 1}}(\rel )$& A & B & $\phi$\\ + \hline + & 1 & 2 & $x_1$\\ + & 1 & 3 & $x_3$\\ + \end{tabular} + \caption{$\poly_1, \poly_2$ First Selection} + \label{subfig:bi-q1-sigma1} + \end{subfigure} + \begin{subfigure}{0.2\textwidth} + \centering + \begin{tabular}{ c | c c c} + $\sigma_{\theta_{A = 2}}(\rel)$ & A & B' & $\phi$\\ + \hline + & 2 & 1 & $x_2$\\ + \end{tabular} + \caption{$\poly_1, \poly_2$ Second Selection} + \label{subfig:bi-q1-sigma2} + \end{subfigure} + \begin{subfigure}{0.25\textwidth} + \centering + \begin{tabular}{ c | c c c c c} + $\poly_1(\rel)$ & $A_R$ & $B_R$ & $A_{\rel'}$ & $B_{\rel'}$ & $\phi$\\ + \hline + & 1 & 2 & 2 & 1 & $x_1x_2$\\ + \end{tabular} + \caption{$\poly_1(\rel)$ Output} + \label{subfig:bi-q1-output} + \end{subfigure} + \begin{subfigure}{0.4\textwidth} + \centering + \begin{tabular}{ c | c c c c c} + $\poly_2(\rel)$ & $A_R$ & $B_R$ & $A_{\rel'}$ & $B_{\rel'}$ & $\phi$\\ + \hline + & 1 & 2 & 2 & 1 & $x_1x_2$\\ + & 1 & 3 & 2 & 1 & $x_2x_3$\\ + \end{tabular} + \caption{$\poly_2(\rel)$ Output} + \label{subfig:bi-q2-output} 
+	\end{subfigure}
+	\caption{$\poly_1, \poly_2(\rel)$}
+	\label{fig:bi-q1-q2}
+\end{figure}
+%%%%%%%%%%%Query 3
+\begin{figure}[ht]
+%	\begin{subfigure}{0.2\textwidth}
+%		\centering
+%		\begin{tabular}{ c | c c }
+%			$\pi_{A}(\rel)$ & A & $\phi$\\
+%			\hline
+%			& 1 & $x_1$\\
+%			& 2 & $x_2$\\
+%			& 1 & $x_3$\\
+%			& 3 & $x_4$\\
+%		\end{tabular}
+%		\caption{$\poly_3$ First Projection}
+%		\label{subfig:bi-q3-pi1}
+%	\end{subfigure}
+%	\begin{subfigure}{0.2\textwidth}
+%		\centering
+%		\begin{tabular}{ c | c c }
+%			$\pi_{B}(\rel)$ & B & $\phi$\\
+%			\hline
+%			& 2 & $x_1$\\
+%			& 1 & $x_2$\\
+%			& 3 & $x_3$\\
+%			& 1 & $x_4$\\
+%		\end{tabular}
+%		\caption{$\poly_3$ Second Projection}
+%		\label{subfig:bi-q3-pi2}
+%	\end{subfigure}
+	\begin{subfigure}{0.2\textwidth}
+		\centering
+		\begin{tabular}{ c | c c c c c }
+			$\poly_3(\rel)$ & A & B & $A_{\rel'}$ & $B_{\rel'}$ & $\phi$\\
+			\hline
+			& 1 & 2 & 2 & 1 & $x_1x_2$\\
+			& 1 & 2 & 3 & 1 & $x_1x_4$\\
+			& 2 & 1 & 1 & 2 & $x_1x_2$\\
+			& 1 & 3 & 2 & 1 & $x_2x_3$\\
+			& 1 & 3 & 3 & 1 & $x_3x_4$\\
+			& 3 & 1 & 1 & 3 & $x_3x_4$\\
+		\end{tabular}
+		\caption{$\poly_3(\rel)$ Output}
+		\label{subfig:bi-q3-output}
+	\end{subfigure}
+	\caption{$\poly_3(\rel)$}
+	\label{fig:bi-q3}
+\end{figure}
+
+Note that all of ~\cref{subfig:bi-q1-output}, ~\cref{subfig:bi-q2-output}, and ~\cref{subfig:bi-q3-output} each have a set of tuples, where each annotation has cross terms from its block, and by ~\cref{def:bi-alg-rpoly} $\rpoly$ will eliminate all tuples output in the respective queries.
+
+\subsubsection{When $\rpoly > 0$}
+\par\AH{General Case and Sufficient Condition for $\bi$ and $\rpoly_{\bi}$ approx alg needs to be written.}
+\paragraph{General Case}
+Consider the query $\poly = \sum_{i = 1}^{\numvar}x_i$, analogous to a projection where all tuples match on the projected set of attributes, meaning $\tup_i[A] = \tup_j[A]$ for $i, j \in [\numvar]$ such that $i \neq j$. When $\numvar$ grows unboundedly, $\abs{\etree}(1,\ldots, 1) = \numvar$.
We assume that the sum of the probabilities of all $\numvar$ tuples in the block remains a constant as $\numvar$ grows. Thus, we have that $\frac{\abs{\etree}(1,\ldots, 1)}{\rpoly(\vct{\prob})} = \frac{\numvar}{c}$ for some constant $c$, and this implies $O(\numvar)$ growth.
+% while $\rpoly(\vct{\prob}) \leq 1$, which implies that the ratio is linear, i.e., $\frac{\abs{\etree}(1,\ldots, 1)}{\rpoly(\vct{p})} = \frac{\numvar}{\numvar \cdot \prob_0} = \frac{1}{\prob_0}$ for $\prob_0 = min(\vct{\prob})$. However, note that for $\numvar \rightarrow \infty$ it is the case that $\prob_0 \rightarrow 0$, and as $\numvar$ grows, so does $\frac{1}{\prob_0}$. Intuitively, consider when $p_0 = \frac{1}{\numvar}$. Then we know that the bound is $\frac{\numvar}{1}$ which is $O(\numvar)$.
+
+\paragraph{Sufficient Condition for $\bi$ to achieve linear approximation}
+Consider the same query $\poly = \sum_{i = 1}^{\numvar}x_i$, but this time conditioned on a fixed block size which we denote $\abs{\block}$. Then it is the case that $\abs{\etree}(1,\ldots, 1) = \numvar$, but if we assume that all blocks have a sum of probabilities equal to $1$, $\rpoly(\vct{\prob}) = \frac{\numvar}{\abs{\block}}$, and this means that $\frac{\abs{\etree}(1,\ldots, 1)}{\rpoly(\vct{\prob})} = \frac{\numvar}{\frac{\numvar}{\abs{\block}}} = \abs{\block}$. For the general case when all blocks do not have the property that the sum of the probabilities of the alternatives equal $1$, we can lower bound the sum of probabilities as $\frac{\numvar}{\abs{\block}} \cdot \prob_0$ for $\prob_0 = \min(\vct{\prob})$. Note that in $\numvar \cdot \frac{\prob_0}{\abs{\block}}$, $\frac{\prob_0}{\abs{\block}}$ is indeed a constant, and this gives an overall ratio of $O(1)$ as $\numvar$ increases.
\ No newline at end of file diff --git a/single_p.tex b/single_p.tex index 744ed73..1e63aac 100644 --- a/single_p.tex +++ b/single_p.tex @@ -1,7 +1,7 @@ %root: main.tex -\subsubsection{Single $\prob$ value} +\subsection{Single $\prob$ value} In this discussion, let us fix $\kElem = 3$. @@ -156,7 +156,7 @@ Note that $f_k$ is properly defined. For any $S \in \binom{E_k}{3}$, $|f(S)| \l \qed -\subsection{Three Matchings in $\graph{2}$} +\subsubsection{Three Matchings in $\graph{2}$} \AR{TODO for {\em later}: I think the proof will be much easier to follow with figures: just drawing out $S\times \{0,1\}$ along with the $(e_i,b_i)$ explicity notated on the edges will make the proof much easier to follow.} \begin{proof}[Proof of Lemma \ref{lem:3m-G2}] For each edge pattern $S$, we count the number of $3$-matchings in the $3$-edge subgraphs of $\graph{2}$ in $f_2^{-1}(S)$. We start with $S \in \binom{E_1}{3}$, where $S$ is composed of the edges $e_1, e_2, e_3$ and $f_2^{-1}(S)$ is the set of all $3$-edge subsets of the set @@ -198,7 +198,7 @@ Observe that all of the arguments above focused solely on the shape/pattern of $ \end{proof} \qed -\subsection{Three matchings in $\graph{3}$} +\subsubsection{Three matchings in $\graph{3}$} \begin{proof}[Proof of Lemma \ref{lem:3m-G3}] @@ -283,9 +283,9 @@ All of the observations above focused only on the shape of $S$, and since we see \end{proof} \qed -\subsection{Three Paths} +\subsubsection{Three Paths in $\graph{2}$} Computing the number of 3-paths in $\graph{2}$ and $\graph{3}$ consists of much simpler linear combinations. -\subsubsection{$\graph{2}$} + \begin{proof}[Proof of Lemma \ref{lem:3p-G2}] @@ -294,7 +294,7 @@ For $\mathcal{P} \subseteq \eset{2}$ such that $\mathcal{P} $ is a $3$-path, it \qed -\subsubsection{$\graph{3}$} +\subsubsection{Three Paths in $\graph{3}$} \begin{proof}[Proof of Lemma \ref{lem:3p-G3}] @@ -304,7 +304,7 @@ The argument follows along the same lines as in the proof of \cref{lem:3p-G2}. 
-\subsection{Triangle} +\subsubsection{Triangles} \begin{proof}[Proof of Lemma \ref{lem:tri}] From 0b397d728d02fe7085d7ba45ab7e78ed0252b8e5 Mon Sep 17 00:00:00 2001 From: Aaron Huber Date: Tue, 8 Dec 2020 12:54:22 -0500 Subject: [PATCH 04/17] Incorporated Boris' suggestions. --- intro.tex | 12 +++++------- macros.tex | 1 + 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/intro.tex b/intro.tex index ea69edd..243673b 100644 --- a/intro.tex +++ b/intro.tex @@ -2,10 +2,9 @@ \section{Introduction} -\BG{In this paragraph we are not stating the precise problem that we are trying to solve yet. What I mean is: what is the input, what is the output. The case for sets is clear now, but bags require a bit of extra explanation because there is no marginal probability of output tuples and the input encoding also needs to be made clear.} -In practice, modern production databases, e.g., Postgres, Oracle, etc. use bag semantics. In contrast, most implementations of probabilistic databases (PDBs) are built in the setting of set semantics, where computing expectations and other moments\BG{for what?} is analogous to counting the number of solutions to a boolean formula\BG{call it by its name: weighted model counting}, a known \#-P\BG{\#P?} problem +In practice, modern production databases, e.g., Postgres, Oracle, etc. use bag semantics. In contrast, most implementations of probabilistic databases (PDBs) are built in the setting of set semantics, where computing expectations and other moments of an output tuple is analogous to weighted model counting, a known $\sharpP$ problem. %the annotation of the tuple is a lineage formula ~\cite{DBLP:series/synthesis/2011Suciu}, which can essentially be thought of as a boolean formula. It is known that computing the probability of a lineage formula is \#-P hard in general -~\cite{DBLP:series/synthesis/2011Suciu}. 
In PDBs, the boolean formula\BG{maybe say a bit more here: what does this boolean formula encode?} is called a lineage formula ~\cite{DBLP:series/synthesis/2011Suciu}, a formula generated by query processing\BG{do we have one formula per query? state what the formula encodes (see above)}. However computing expectation\BG{of what?} in the bag setting is linear\BG{in what?} with the result that many regard bags to be easy. In this work we consider compressed representations of the lineage formula\BG{maybe we should say that in the bag case instead of using boolean lineage formulas, we have to use polynomials over random variables representing the probability distribution of the multiplicity of input tuples.} showing that the complexity landscape becomes much more nuanced, and is not linear\BG{the complexity landscape is linear?} in general. +~\cite{DBLP:series/synthesis/2011Suciu}. In PDBs, the boolean formula encodes which input tuples contributed to an output tuple in an arbitrary query. This formula is also called a lineage formula ~\cite{DBLP:series/synthesis/2011Suciu}, and is a deterministic formula for each output tuple generated by query processing. However, instead of using boolean lineage formulas, the bag case requires the use of polynomials over random variables, representing the probability distribution of the multiplicity of input tuples. In this case, the probability is interpreted as the probability of the input tuple contributing to the multiplicity of the output tuple. Computing the expectation of the polynomial in the bag setting is linear in the number of terms of the expanded formula, with the result that many regard bags to be easy. In this work we consider compressed representations of the lineage formula showing that the complexity landscape becomes much more nuanced, and is \textit{not} linear in general. @@ -71,12 +70,11 @@ In practice, modern production databases, e.g., Postgres, Oracle, etc. 
use bag s \end{figure} \begin{Example}\label{ex:intro} - \BG{I think we need to clarify that this example is sets first} -Suppose we are given the following boolean query $\poly() :- R(A), E(A, B), R(B)$ over a Tuple Independent Database ($\ti$)\BG{briefly say what this is}, where the annotation\BG{we have not used the term yet, so it should be explained} of the output will consist of all contributing tuple annotations\BG{same here}. The $\ti$ relations\BG{example instances?} are given in~\cref{fig:intro-ex}. While for completeness we should include annotations for Table E, since each tuple has a probability of $1$, we drop them for simplicity. Note that the attribute column $\Phi$ contains a variable/value, where in the former case the variable ranges over $[0, 1]$ denoting its\BG{what is its here?} marginal probability of appearing in the set of possible worlds\BG{that is strangely worded. The probability of appearing in the set of possible worlds is always 1 if the tuple exists in at least one world. I think (marginal) probability of the input tuple would be enough.}, and the latter is the fixed (marginal) probability of the tuple across the set of possible worlds.\BG{The previous sentence is a bit hard to follow, can we try to simplify it?} Finally, see that the tuples in table E can be visualized as the graph in ~\cref{fig:intro-ex-graph}. -This query is hard in set semantics because of correlations in the lineage formula, but under bag semantics it is easy since we enjoy linearity of expectation.\BG{I think this could be confusing to people. We have to clarify what our interpretation of the database under bag semantics is.} +Assume a set semantics setting. Suppose we are given a Tuple Independent Database ($\ti$), which is a PDB of which all its tuples are assumed to be independent from one another. 
We are given the following boolean query $\poly() :- R(A), E(A, B), R(B)$, where the lineage of the output will consist of the lineages of all contributing tuples. The $\ti$ example instances are given in~\cref{fig:intro-ex}. While for completeness we should include annotations for Table E, since each tuple has a probability of $1$, we drop them for simplicity. Note that the attribute column $\Phi$ contains a variable/value in the range of $[0, 1]$ denoting its marginal probability. Finally, see that the tuples in table E can be visualized as the graph in ~\cref{fig:intro-ex-graph}. +This query is hard in set semantics because of correlations in the lineage formula, but under bag semantics with a polynomial formula representing the multiple contributing tuples from the input set $\ti$, it is easy since we enjoy linearity of expectation. \end{Example} -While our work handles Block Independent Disjoint Databases\BG{what are these?} ($\bi$), for now we consider the $\ti$ model. Define the probability distribution to be $P[W_i = 1] = \prob$ for $i$ in $\{a, b, c\}$.\BG{say that is a fixed probability for all tuples. Also again what is the interpretation of a probability for bag semantics?} +Our work also handles Block Independent Disjoint Databases($\bi$), a PDB model in which tuples are arranged in blocks, where all blocks are independent from one another, but tuples within the same block are mutually exclusive. For now, let us consider the $\ti$ model. In the example, consider a fixed probability for all tuples. 
Note that computing the probability of the query of ~\cref{ex:intro} in set semantics is indeed \#-P hard, since it is a query that is non-hierarchical %, i.e., for $Vars(\poly)$ denoting the set of variables occuring across all atoms of $\poly$, a function $sg(x)$ whose output is the set of all atoms that contain variable $x$, we have that $sg(A) \cap sg(B) \neq \emptyset$ and $sg(A)\not\subseteq sg(B)$ and $sg(B)\not\subseteq sg(A)$, as defined by Dalvi and Suciu in ~\cite{10.1145/1265530.1265571}. For the purposes of this work, we define hard to be anything greater than linear time. %Thus, computing $\expct\pbox{\poly(W_a, W_b, W_c)}$, i.e. the probability of the output with annotation $\poly(W_a, W_b, W_c)$, ($\prob(q)$ in Dalvi, Sucui) is hard in set semantics. diff --git a/macros.tex b/macros.tex index 3657c6d..8d3b1d6 100644 --- a/macros.tex +++ b/macros.tex @@ -8,6 +8,7 @@ \newcommand{\wElem}{w} %an element of \vct{w} \newcommand{\st}{\;|\;} %such that \newcommand{\kElem}{k}%the kth element +\newcommand{\sharpP}{\#P} %RA-to-Poly Notation \newcommand{\polyinput}[2]{\left(#1,\ldots, #2\right)} \newcommand{\numvar}{n} From d21244a4e7df86f8f45d3eeccf80a18812d13d9b Mon Sep 17 00:00:00 2001 From: Aaron Huber Date: Tue, 8 Dec 2020 15:45:41 -0500 Subject: [PATCH 05/17] More touch up on the 2-col format. --- approx_alg.tex | 4 ++-- intro.tex | 2 +- lin_sys.tex | 35 +++++++++++++++++++---------------- single_p.tex | 9 +++++++-- 4 files changed, 29 insertions(+), 21 deletions(-) diff --git a/approx_alg.tex b/approx_alg.tex index fc78c93..4846a03 100644 --- a/approx_alg.tex +++ b/approx_alg.tex @@ -340,8 +340,8 @@ Consider when $\etree$ encodes the expression $(x_1 + x_2)(x_1 - x_2) + x_2^2$. \begin{figure}[h!] 
\begin{tikzpicture}[thick, every tree node/.style={default_node, thick, draw=black, black, circle, text width=0.3cm, font=\bfseries, minimum size=0.65cm}, every child/.style={black}, edge from parent/.style={draw, thick}, -level 1/.style={sibling distance=1.25cm}, -level 2/.style={sibling distance=1.0cm}, +level 1/.style={sibling distance=0.95cm}, +level 2/.style={sibling distance=0.7cm}, %level 2+/.style={sibling distance=0.625cm} %level distance = 1.25cm, %sibling distance = 1cm, diff --git a/intro.tex b/intro.tex index 243673b..96647f0 100644 --- a/intro.tex +++ b/intro.tex @@ -74,7 +74,7 @@ Assume a set semantics setting. Suppose we are given a Tuple Independent Databa This query is hard in set semantics because of correlations in the lineage formula, but under bag semantics with a polynomial formula representing the multiple contributing tuples from the input set $\ti$, it is easy since we enjoy linearity of expectation. \end{Example} -Our work also handles Block Independent Disjoint Databases($\bi$), a PDB model in which tuples are arranged in blocks, where all blocks are independent from one another, but tuples within the same block are mutually exclusive. For now, let us consider the $\ti$ model. In the example, consider a fixed probability for all tuples. +Our work also handles Block Independent Disjoint Databases ($\bi$), a PDB model in which tuples are arranged in blocks, where all blocks are independent from one another, but tuples within the same block are mutually exclusive. For now, let us consider the $\ti$ model. In the example, consider a fixed probability for all tuples. 
Note that computing the probability of the query of ~\cref{ex:intro} in set semantics is indeed \#-P hard, since it is a query that is non-hierarchical %, i.e., for $Vars(\poly)$ denoting the set of variables occuring across all atoms of $\poly$, a function $sg(x)$ whose output is the set of all atoms that contain variable $x$, we have that $sg(A) \cap sg(B) \neq \emptyset$ and $sg(A)\not\subseteq sg(B)$ and $sg(B)\not\subseteq sg(A)$, as defined by Dalvi and Suciu in ~\cite{10.1145/1265530.1265571}. For the purposes of this work, we define hard to be anything greater than linear time. %Thus, computing $\expct\pbox{\poly(W_a, W_b, W_c)}$, i.e. the probability of the output with annotation $\poly(W_a, W_b, W_c)$, ($\prob(q)$ in Dalvi, Sucui) is hard in set semantics. diff --git a/lin_sys.tex b/lin_sys.tex index 240c2b8..4d96660 100644 --- a/lin_sys.tex +++ b/lin_sys.tex @@ -29,8 +29,8 @@ Equation ~\ref{eq:ls-2-1} follows by \cref{lem:tri}. Similarly ~\cref{eq:ls-2-2 Now, by simple algebraic manipulations of ~\cref{lem:qE3-exp}, we deduce, \begin{align} -&\frac{\rpoly_{\graph{2}}(\prob,\ldots, \prob)}{6\prob^3} - \frac{\numocc{\graph{2}}{\ed}}{6\prob} - \numocc{\graph{2}}{\twopath} - \numocc{\graph{2}}{\twodis}\prob - \numocc{\graph{2}}{\oneint}\prob\nonumber\\ -& - \big(\numocc{\graph{2}}{\twopathdis} + 3\numocc{\graph{2}}{\threedis}\big)\prob^2 \nonumber\\ +&\frac{\rpoly_{\graph{2}}(\prob,\ldots, \prob)}{6\prob^3} - \frac{\numocc{\graph{2}}{\ed}}{6\prob} - \numocc{\graph{2}}{\twopath} - \numocc{\graph{2}}{\twodis}\prob \nonumber\\ +&- \numocc{\graph{2}}{\oneint}\prob - \big(\numocc{\graph{2}}{\twopathdis} + 3\numocc{\graph{2}}{\threedis}\big)\prob^2 \nonumber\\ &=\left(-2\cdot\numocc{\graph{1}}{\tri} - 4\cdot\numocc{\graph{1}}{\threepath}\right.\nonumber\\ &\left. 
- 8\cdot\numocc{\graph{1}}{\threedis} - 6\cdot\numocc{\graph{1}}{\twopathdis}\right)\cdot\left(3\prob^2 - p^3\right) + 2\cdot\numocc{\graph{1}}{\twopath}\prob\nonumber\\ &- 4\cdot\numocc{\graph{1}}{\oneint}\cdot\left(3\prob^2 - \prob^3\right)\label{eq:lem3-G2-1}\\ @@ -103,17 +103,18 @@ Following the same reasoning for $\graph{3}$, using \cref{lem:3m-G3}, \cref{lem: Looking at ~\cref{eq:LS-subtract}, \begin{align} -&\frac{\rpoly_{\graph{3}}(\prob,\ldots, \prob)}{6\prob^3} - \frac{\numocc{\graph{3}}{\ed}}{6\prob} - \numocc{\graph{3}}{\twopath} - \numocc{\graph{3}}{\twodis}\prob - \numocc{\graph{3}}{\oneint}\prob \nonumber\\ -&- \big(\numocc{\graph{3}}{\twopathdis} + 3\numocc{\graph{3}}{\threedis}\big)\prob^2\nonumber\\ +&\frac{\rpoly_{\graph{3}}(\prob,\ldots, \prob)}{6\prob^3} - \frac{\numocc{\graph{3}}{\ed}}{6\prob} - \numocc{\graph{3}}{\twopath} - \numocc{\graph{3}}{\twodis}\prob \nonumber\\ +& - \numocc{\graph{3}}{\oneint}\prob - \big(\numocc{\graph{3}}{\twopathdis} + 3\numocc{\graph{3}}{\threedis}\big)\prob^2\nonumber\\ &= \left\{ -18\numocc{\graph{1}}{\tri} - 21 \cdot \numocc{\graph{1}}{\threepath} - 24 \cdot \numocc{\graph{1}}{\twopathdis}\right. \nonumber\\ &\left.- 27 \cdot \numocc{\graph{1}}{\threedis}\right\}\left(3\prob^2 - \prob^3\right) \nonumber\\ &+ \pbrace{-20 \cdot \numocc{\graph{1}}{\oneint} - 4\cdot \numocc{\graph{1}}{\twopath} - 6 \cdot \numocc{\graph{1}}{\twodis}}\left(3\prob^2 - \prob^3\right)\nonumber\\ &+ \numocc{\graph{1}}{\ed}\prob + 2 \cdot \numocc{\graph{1}}{\twopath}\prob. 
\label{eq:lem3-G3-2}\\ -&\frac{\rpoly_{\graph{3}}(\prob,\ldots, \prob)}{6\prob^3} - \frac{\numocc{\graph{3}}{\ed}}{6\prob} - \numocc{\graph{3}}{\twopath} - \numocc{\graph{3}}{\twodis}\prob - \numocc{\graph{3}}{\oneint}\prob\nonumber\\ -&- \big(\numocc{\graph{3}}{\twopathdis} + 3\numocc{\graph{3}}{\threedis}\big)\prob^2 - \left(\numocc{\graph{1}}{\ed} + \numocc{\graph{1}}{\twopath}\right)\prob\nonumber\\ -&+ \left(24\left(\numocc{\graph{1}}{\twopathdis} + 3\cdot\numocc{\graph{1}}{\threedis}\right) + 20\cdot\numocc{\graph{1}}{\oneint} + 4\cdot\numocc{\graph{1}}{\twopath}\right.\nonumber\\ -&\left.+ 6\cdot\numocc{\graph{1}}{\twodis}\right)\left(3\prob^2 - \prob^3\right)\nonumber\\ -&= \pbrace{- 18 \cdot \numocc{\graph{1}}{\tri} - 21 \cdot \numocc{\graph{1}}{\threepath} + 45 \cdot \numocc{\graph{1}}{\threedis}}\left(3p^2 - p^3\right)\label{eq:lem3-G3-3} +&\frac{\rpoly_{\graph{3}}(\prob,\ldots, \prob)}{6\prob^3} - \frac{\numocc{\graph{3}}{\ed}}{6\prob} - \numocc{\graph{3}}{\twopath} - \numocc{\graph{3}}{\twodis}\prob \nonumber\\ +&- \numocc{\graph{3}}{\oneint}\prob - \big(\numocc{\graph{3}}{\twopathdis} + 3\numocc{\graph{3}}{\threedis}\big)\prob^2 - \left(\numocc{\graph{1}}{\ed}\right.\nonumber\\ +&\left.+ \numocc{\graph{1}}{\twopath}\right)\prob+ \left(24\left(\numocc{\graph{1}}{\twopathdis} + 3\cdot\numocc{\graph{1}}{\threedis}\right) \right.\nonumber\\ +&\left.+ 20\cdot\numocc{\graph{1}}{\oneint} + 4\cdot\numocc{\graph{1}}{\twopath}+ 6\cdot\numocc{\graph{1}}{\twodis}\right)\left(3\prob^2 - \prob^3\right)\nonumber\\ +&= \pbrace{- 18 \cdot \numocc{\graph{1}}{\tri} - 21 \cdot \numocc{\graph{1}}{\threepath} + 45 \cdot \numocc{\graph{1}}{\threedis}}\nonumber\\ +&\cdot\left(3p^2 - p^3\right)\label{eq:lem3-G3-3} \end{align} Equation ~\ref{eq:lem3-G3-2} follows from substituting ~\cref{eq:lem3-G3-2} in for the RHS of ~\cref{eq:LS-subtract}. 
We derive ~\cref{eq:lem3-G3-3} by adding the inverse of all $O(\numedge)$ computable terms, and for the case of $\twopathdis$ and $\threedis$, we add the $O(\numedge)$ computable term $24\cdot\left(\numocc{\graph{1}}{\twopathdis} + \numocc{\graph{1}}{\threedis}\right)$ to both sides. @@ -122,9 +123,10 @@ Equation \ref{eq:LS-G3-sub} follows from simple substitution of all lemma identi It then follows that %Removing $O(\numedge)$ computable terms to the other side of \cref{eq:LS-subtract}, we get -\begin{equation} -\mtrix{\rpoly_{G}}[3] = \pbrace{- 18 \cdot \numocc{\graph{1}}{\tri} - 21 \cdot \numocc{\graph{1}}{\threepath} + 45 \cdot \numocc{\graph{1}}{\threedis}}\left(3p^2 - p^3\right)\label{eq:LS-G3'} -\end{equation} +\begin{align} +&\mtrix{\rpoly_{G}}[3] = \pbrace{- 18 \cdot \numocc{\graph{1}}{\tri} - 21 \cdot \numocc{\graph{1}}{\threepath} + 45 \cdot \numocc{\graph{1}}{\threedis}}\nonumber\\ +&\cdot\left(3p^2 - p^3\right)\label{eq:LS-G3'} +\end{align} and %The same justification for the derivation of $\linsys{2}$ applies to the derivation above of $\linsys{3}$. To arrive at ~\cref{eq:LS-G3'}, we move $O(\numedge)$ computable terms to the left hand side. For the term $-24\cdot\numocc{\graph{1}}{\twopathdis}$ we need to add the inverse to both sides AND $72\cdot\numocc{\graph{1}}{\threedis}$ to both sides, in order to satisfy the constraint of $\cref{eq:2pd-3d}$. 
@@ -132,10 +134,11 @@ and %For the LHS we get \begin{align*} -&\vct{b}[3] = \frac{\rpoly(\prob,\ldots, \prob)}{6\prob^3} - \frac{\numocc{\graph{3}}{\ed}}{6\prob} - \numocc{\graph{3}}{\twopath} - \numocc{\graph{3}}{\twodis}\prob - \numocc{\graph{3}}{\oneint}\prob \nonumber\\ -&- \big(\numocc{\graph{3}}{\twopathdis} + 3\numocc{\graph{3}}{\threedis}\big)\prob^2 - \pbrace{\numocc{\graph{1}}{\ed} + 2 \cdot \numocc{\graph{1}}{\twopath}}\prob\\ -&+ \left\{24 \cdot \left(\numocc{\graph{1}}{\twopathdis} + 3\numocc{\graph{1}}{\threedis}\right) + 20 \cdot \numocc{\graph{1}}{\oneint} + 4\cdot \numocc{\graph{1}}{\twopath} \right.\nonumber\\ -&\left.+ 6 \cdot \numocc{\graph{1}}{\twodis}\right\}\left(3\prob^2 - \prob^3\right) +&\vct{b}[3] = \frac{\rpoly(\prob,\ldots, \prob)}{6\prob^3} - \frac{\numocc{\graph{3}}{\ed}}{6\prob} - \numocc{\graph{3}}{\twopath} - \numocc{\graph{3}}{\twodis}\prob \nonumber\\ +& - \numocc{\graph{3}}{\oneint}\prob - \big(\numocc{\graph{3}}{\twopathdis} + 3\numocc{\graph{3}}{\threedis}\big)\prob^2\\ +& - \pbrace{\numocc{\graph{1}}{\ed} + 2 \cdot \numocc{\graph{1}}{\twopath}}\prob + \left\{24 \cdot \left(\numocc{\graph{1}}{\twopathdis} \right. \right.\nonumber\\ +&\left.\left.+ 3\numocc{\graph{1}}{\threedis}\right) + 20 \cdot \numocc{\graph{1}}{\oneint} + 4\cdot \numocc{\graph{1}}{\twopath}\right.\\ +&\left.+ 6 \cdot \numocc{\graph{1}}{\twodis}\right\}\cdot\left(3\prob^2 - \prob^3\right) \end{align*} We now have a linear system consisting of three linear combinations, for $\graph{1}, \graph{2}, \graph{3}$ in terms of $\graph{1}$. Note that the constants for $\graph{1}$ follow the RHS of ~\cref{eq:LS-subtract}. To make it easier, use the following variable representations: $x = \numocc{\graph{1}}{\tri}, y = \numocc{\graph{1}}{\threepath}, z = \numocc{\graph{1}}{\threedis}$. 
Using $\linsys{2}$ and $\linsys{3}$, the following matrix is obtained, diff --git a/single_p.tex b/single_p.tex index 1e63aac..3f309fe 100644 --- a/single_p.tex +++ b/single_p.tex @@ -155,9 +155,10 @@ Note that $f_k$ is properly defined. For any $S \in \binom{E_k}{3}$, $|f(S)| \l \end{proof} \qed +\AR{TODO for {\em later}: I think the proof will be much easier to follow with figures: just drawing out $S\times \{0,1\}$ along with the $(e_i,b_i)$ explicity notated on the edges will make the proof much easier to follow.} \subsubsection{Three Matchings in $\graph{2}$} -\AR{TODO for {\em later}: I think the proof will be much easier to follow with figures: just drawing out $S\times \{0,1\}$ along with the $(e_i,b_i)$ explicity notated on the edges will make the proof much easier to follow.} + \begin{proof}[Proof of Lemma \ref{lem:3m-G2}] For each edge pattern $S$, we count the number of $3$-matchings in the $3$-edge subgraphs of $\graph{2}$ in $f_2^{-1}(S)$. We start with $S \in \binom{E_1}{3}$, where $S$ is composed of the edges $e_1, e_2, e_3$ and $f_2^{-1}(S)$ is the set of all $3$-edge subsets of the set \begin{equation*} @@ -172,7 +173,11 @@ Consider the $\eset{1} = \threedis$ pattern. Note that edges in $\eset{2}$ are \begin{itemize} \item Disjoint Two-Path ($\twopathdis$) \end{itemize} -For $\eset{1} = \twopathdis$ edges $e_2, e_3$ form a $2$-path with $e_1$ being disjoint. This means that $(e_2, 0), (e_2, 1), (e_3, 0), (e_3, 1)$ form a $4$-path while $(e_1, 0), (e_1, 1)$ is its own disjoint $2$-path. We can only pick either $(e_1, 0)$ or $(e_1, 1)$ from $f_2^{-1}(S)$, and then we need to pick a $2$-matching from $e_2$ and $e_3$. Note that a four path allows there to be 3 possible 2 matchings, specifically, $\pbrace{(e_2, 0), (e_3, 0)}, \pbrace{(e_2, 0), (e_3, 1)}, \pbrace{(e_2, 1), (e_3, 1)}$. Since these two selections can be made independently, there are $2 \cdot 3 = 6$ choices for $3$-matchings in $f_2^{-1}(S)$. 
+For $\eset{1} = \twopathdis$ edges $e_2, e_3$ form a $2$-path with $e_1$ being disjoint. This means that $(e_2, 0), (e_2, 1), (e_3, 0), (e_3, 1)$ form a $4$-path while $(e_1, 0), (e_1, 1)$ is its own disjoint $2$-path. We can only pick either $(e_1, 0)$ or $(e_1, 1)$ from $f_2^{-1}(S)$, and then we need to pick a $2$-matching from $e_2$ and $e_3$. Note that a four path allows there to be 3 possible 2 matchings, specifically, +\begin{equation*} +\pbrace{(e_2, 0), (e_3, 0)}, \pbrace{(e_2, 0), (e_3, 1)}, \pbrace{(e_2, 1), (e_3, 1)}. +\end{equation*} +Since these two selections can be made independently, there are $2 \cdot 3 = 6$ choices for $3$-matchings in $f_2^{-1}(S)$. \begin{itemize} \item $3$-star ($\oneint$) From 4f8fd2bb7e303972f801adba6c4671c9847c4d6c Mon Sep 17 00:00:00 2001 From: Aaron Huber Date: Tue, 8 Dec 2020 16:51:21 -0500 Subject: [PATCH 06/17] Replaced TIDB outer algo with BIDB outer algo. --- approx_alg.tex | 72 ++++++++++++++++++++++++++++++++++++++++++++------ intro.tex | 26 +++++++++--------- 2 files changed, 77 insertions(+), 21 deletions(-) diff --git a/approx_alg.tex b/approx_alg.tex index 4846a03..35bc054 100644 --- a/approx_alg.tex +++ b/approx_alg.tex @@ -136,35 +136,91 @@ Algorithm ~\ref{alg:mon-sam} approximates $\rpoly$ using the following steps. F Recall that the notation $[x, y]$ denotes the range of values between $x$ and $y$ inclusive. The notation $\{x, y\}$ denotes the set of values consisting of $x$ and $y$. 
\subsubsection{Psuedo Code} +%Original TIDB Algorithm +%\begin{algorithm}[H] +% \caption{$\approxq$($\etree$, $\vct{p}$, $\conf$, $\error$)} +% \label{alg:mon-sam} +% \begin{algorithmic}[1] +% \Require \etree: Binary Expression Tree +% \Require $\vct{p} = (\prob_1,\ldots, \prob_\numvar)$ $\in [0, 1]^N$ +% \Require $\conf$ $\in [0, 1]$ +% \Require $\error$ $\in [0, 1]$ +% \Ensure \vari{acc} $\in \mathbb{R}$ +% \State $\accum \gets 0$\label{alg:mon-sam-global1} +% \State $\numsamp \gets \ceil{\frac{2 \log{\frac{2}{\conf}}}{\error^2}}$\label{alg:mon-sam-global2} +% \State $(\vari{\etree}_\vari{mod}, \vari{size}) \gets $ \onepass($\etree$)\label{alg:mon-sam-onepass}\Comment{$\onepass$ is ~\cref{alg:one-pass} \;and \sampmon \; is ~\cref{alg:sample}}\newline +% \For{\vari{i} \text{ in } $1\text{ to }\numsamp$}\Comment{Perform the required number of samples} +% \State $(\vari{M}_\vari{i}, \vari{sgn}_\vari{i}) \gets $ \sampmon($\etree_\vari{mod}$)\label{alg:mon-sam-sample} +% \State $\vari{Y}_\vari{i} \gets 1$\label{alg:mon-sam-assign1} +% \For{$\vari{x}_{\vari{j}}$ \text{ in } $\vari{M}_{\vari{i}}$} +% \State $\vari{Y}_\vari{i} \gets \vari{Y}_\vari{i} \times \; \vari{\prob}_\vari{j}$\label{alg:mon-sam-product2} \Comment{$\vari{p}_\vari{j}$ is the assignment to $\vari{x}_\vari{j}$ from input $\vct{p}$} +% \EndFor +% \State $\vari{Y}_\vari{i} \gets \vari{Y}_\vari{i} \times\; \vari{sgn}_\vari{i}$\label{alg:mon-sam-product} +% \State $\accum \gets \accum + \vari{Y}_\vari{i}$\Comment{Store the sum over all samples}\label{alg:mon-sam-add} +% \EndFor +% +% \State $\vari{acc} \gets \vari{acc} \times \frac{\vari{size}}{\numsamp}$\label{alg:mon-sam-global3} +% \State \Return \vari{acc} +% \end{algorithmic} +%\end{algorithm} + +%BIDB Version of Approximation Algorithm \begin{algorithm}[H] - \caption{$\approxq$($\etree$, $\vct{p}$, $\conf$, $\error$)} - \label{alg:mon-sam} + \caption{$\approxq_{\biabb}$($\etree$, $\vct{p}$, $\conf$, $\error$, $\abs{\block}$)} + \label{alg:bi-mon-sam} 
\begin{algorithmic}[1] \Require \etree: Binary Expression Tree \Require $\vct{p} = (\prob_1,\ldots, \prob_\numvar)$ $\in [0, 1]^N$ \Require $\conf$ $\in [0, 1]$ \Require $\error$ $\in [0, 1]$ + \Require $\abs{\block} \in \mathbb{N}$%\bivec$ $\in [0, 1]^{\abs{\block}}$ \Ensure \vari{acc} $\in \mathbb{R}$ + \State $\bivec \gets [0]^{\abs{\block}}$\Comment{$\bivec$ is an array whose size is the number of blocks, initialized to all $0$'s}\newline + \State $\vari{sample}_\vari{next} \gets 0$ \State $\accum \gets 0$\label{alg:mon-sam-global1} \State $\numsamp \gets \ceil{\frac{2 \log{\frac{2}{\conf}}}{\error^2}}$\label{alg:mon-sam-global2} \State $(\vari{\etree}_\vari{mod}, \vari{size}) \gets $ \onepass($\etree$)\label{alg:mon-sam-onepass}\Comment{$\onepass$ is ~\cref{alg:one-pass} \;and \sampmon \; is ~\cref{alg:sample}} - \For{\vari{i} \text{ in } $1\text{ to }\numsamp$}\Comment{Perform the required number of samples} - \State $(\vari{M}_\vari{i}, \vari{sgn}_\vari{i}) \gets $ \sampmon($\etree_\vari{mod}$)\label{alg:mon-sam-sample} - \State $\vari{Y}_\vari{i} \gets 1$\label{alg:mon-sam-assign1} - \For{$\vari{x}_{\vari{j}}$ \text{ in } $\vari{M}_{\vari{i}}$} + \newline + \State $\vari{i} \gets 1$ + \While{$\vari{i} \leq \numsamp$}\Comment{Perform the required number of samples} + \State $(\vari{M}, \vari{sgn}_\vari{i}) \gets $ \sampmon($\etree_\vari{mod}$)\label{alg:mon-sam-sample} + \For{$\vari{x}_\vari{\block,i}$ \text{ in } $\vari{M}$} + \If{$\bivec[\block] = 1$}\Comment{If we have already had a variable from this block, $\rpoly$ drops the sample.} + \newline + \State $\vari{sample}_{\vari{next}} \gets 1$ + \State break + \Else + \State $\bivec[\block] = 1$ +% \State $\vari{sum} = 0$ +% \For{$\ell \in [\abs{\block}]$} +% \State $\vari{sum} = \vari{sum} + \bivec[\block][\ell]$ +% \EndFor +% \If{$\vari{sum} \geq 2$} +% \State $\vari{sample}_{\vari{next}} \gets 1$ +% \State continue\Comment{Not sure for psuedo code the best way to state this, but this is analogous to C 
language continue statement.} + \EndIf + \EndFor + \If{$\vari{sample}_{\vari{next}} = 1$} + \State $\vari{sample}_{\vari{next}} \gets 0$ + \Else + \State $\vari{Y}_\vari{i} \gets 1$\label{alg:mon-sam-assign1}\newline + \For{$\vari{x}_{\vari{j}}$ \text{ in } $\vari{M}$}%_{\vari{i}}$} \State $\vari{Y}_\vari{i} \gets \vari{Y}_\vari{i} \times \; \vari{\prob}_\vari{j}$\label{alg:mon-sam-product2} \Comment{$\vari{p}_\vari{j}$ is the assignment to $\vari{x}_\vari{j}$ from input $\vct{p}$} \EndFor \State $\vari{Y}_\vari{i} \gets \vari{Y}_\vari{i} \times\; \vari{sgn}_\vari{i}$\label{alg:mon-sam-product} \State $\accum \gets \accum + \vari{Y}_\vari{i}$\Comment{Store the sum over all samples}\label{alg:mon-sam-add} - \EndFor + \State $\vari{i} \gets \vari{i} + 1$ + \EndIf + \EndWhile \State $\vari{acc} \gets \vari{acc} \times \frac{\vari{size}}{\numsamp}$\label{alg:mon-sam-global3} \State \Return \vari{acc} \end{algorithmic} \end{algorithm} + \subsubsection{Correctness} -We state the lemmas for $\onepass$ and $\sampmon$, the auxiliary algorithms on which ~\cref{alg:mon-sam} relies. Their proofs are subsequent. +We state the lemmas for $\onepass$ and \newline$\sampmon$, the auxiliary algorithms on which ~\cref{alg:mon-sam} relies. Their proofs are subsequent. \begin{Lemma}\label{lem:one-pass} The $\onepass$ function completes in $O(size(\etree))$ time. After $\onepass$ returns the following post conditions hold. First, that $\abs{\vari{S}}(1,\ldots, 1)$ is correctly computed for each subtree $\vari{S}$ of $\etree$. Second, when $\vari{S}.\val = +$, the weighted distribution $\frac{\abs{\vari{S}_{\vari{child}}}(1,\ldots, 1)}{\abs{\vari{S}}(1,\ldots, 1)}$ is correctly computed for each child of $\vari{S}.$ diff --git a/intro.tex b/intro.tex index 96647f0..8c33b94 100644 --- a/intro.tex +++ b/intro.tex @@ -56,21 +56,21 @@ In practice, modern production databases, e.g., Postgres, Oracle, etc. 
use bag s \end{figure} %Graph of query output for intro example -\begin{figure} - \begin{tikzpicture} - \node at (1.5, 3) [tree_node](top){a}; - \node at (0, 0) [tree_node](left){b}; - \node at (3, 0) [tree_node](right){c}; - \draw (top)--(left); - \draw (left)--(right); - \draw (right)--(top); - \end{tikzpicture} -\caption{Graph of tuples in table E} -\label{fig:intro-ex-graph} -\end{figure} +%\begin{figure} +% \begin{tikzpicture} +% \node at (1.5, 3) [tree_node](top){a}; +% \node at (0, 0) [tree_node](left){b}; +% \node at (3, 0) [tree_node](right){c}; +% \draw (top)--(left); +% \draw (left)--(right); +% \draw (right)--(top); +% \end{tikzpicture} +%\caption{Graph of tuples in table E} +%\label{fig:intro-ex-graph} +%\end{figure} \begin{Example}\label{ex:intro} -Assume a set semantics setting. Suppose we are given a Tuple Independent Database ($\ti$), which is a PDB of which all its tuples are assumed to be independent from one another. We are given the following boolean query $\poly() :- R(A), E(A, B), R(B)$, where the lineage of the output will consist of the lineages of all contributing tuples. The $\ti$ example instances are given in~\cref{fig:intro-ex}. While for completeness we should include annotations for Table E, since each tuple has a probability of $1$, we drop them for simplicity. Note that the attribute column $\Phi$ contains a variable/value in the range of $[0, 1]$ denoting its marginal probability. Finally, see that the tuples in table E can be visualized as the graph in ~\cref{fig:intro-ex-graph}. +Assume a set semantics setting. Suppose we are given a Tuple Independent Database ($\ti$), which is a PDB of which all its tuples are assumed to be independent from one another. We are given the following boolean query $\poly() :- R(A), E(A, B), R(B)$, where the lineage of the output will consist of the lineages of all contributing tuples. The $\ti$ example instances are given in~\cref{fig:intro-ex}. 
While for completeness we should include annotations for Table E, since each tuple has a probability of $1$, we drop them for simplicity. Note that the attribute column $\Phi$ contains a variable/value in the range of $[0, 1]$ denoting its marginal probability. %Finally, see that the tuples in table E can be visualized as the graph in ~\cref{fig:intro-ex-graph}. This query is hard in set semantics because of correlations in the lineage formula, but under bag semantics with a polynomial formula representing the multiple contributing tuples from the input set $\ti$, it is easy since we enjoy linearity of expectation. \end{Example} From 9412b2bed0d6bf19f27225da0762f7255a6cf5e1 Mon Sep 17 00:00:00 2001 From: Atri Rudra Date: Wed, 9 Dec 2020 00:00:04 -0500 Subject: [PATCH 07/17] Added hardness result for k-matchings --- aaron.bib | 21 +++++++++++++++++++++ mult_distinct_p.tex | 9 ++++++++- 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/aaron.bib b/aaron.bib index e9f9dc0..2ebf4f7 100644 --- a/aaron.bib +++ b/aaron.bib @@ -168,3 +168,24 @@ series = {PODS '07} } +@inproceedings{k-match, + author = {Radu Curticapean}, + editor = {Fedor V. Fomin and + Rusins Freivalds and + Marta Z. 
Kwiatkowska and + David Peleg}, + title = {Counting Matchings of Size k Is W[1]-Hard}, + booktitle = {Automata, Languages, and Programming - 40th International Colloquium, + {ICALP} 2013, Riga, Latvia, July 8-12, 2013, Proceedings, Part {I}}, + series = {Lecture Notes in Computer Science}, + volume = {7965}, + pages = {352--363}, + publisher = {Springer}, + year = {2013}, + url = {https://doi.org/10.1007/978-3-642-39206-1\_30}, + doi = {10.1007/978-3-642-39206-1\_30}, + timestamp = {Tue, 14 May 2019 10:00:44 +0200}, + biburl = {https://dblp.org/rec/conf/icalp/Curticapean13.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + diff --git a/mult_distinct_p.tex b/mult_distinct_p.tex index 563adcc..66be7cc 100644 --- a/mult_distinct_p.tex +++ b/mult_distinct_p.tex @@ -2,7 +2,14 @@ \subsection{Multiple Distinct $\prob$ Values} -We would like to argue for a compressed version of $\poly(\vct{w})$, in general $\expct_{\vct{w}}\pbox{\poly(\vct{w})}$ cannot be computed in linear time. +We would like to argue for a compressed version of $\poly(\vct{w})$, in general $\expct_{\vct{w}}\pbox{\poly(\vct{w})}$ cannot be computed in linear time. +\AR{Added the hardness result below.} +The hardness result is based on the following hardness result: +\begin{theorem}[\cite{k-match}] +\label{thm:k-match-hard} +Given a positive integer $k$ and an undirected graph $G$ with no self-loops of parallel edges, couting the number of $k$-matchings in $G$ is $\#W[1]$-hard. +\end{theorem} +The above result means that we cannot hope to count the number of $k$-matchings in $G=(V,E)$ in time $f(k)\cdot |V|^{O(1)}$ for any function $f$. In fact, all known algorithms to solve this problem takes time $|V|^{\Omega(k)}$. To this end, consider the following graph $G(V, E)$, where $|E| = \numedge$, $|V| = \numvar$, and $i, j \in [\numvar]$. 
From 55e945248e106409aae42761bc331c767dfba9d7 Mon Sep 17 00:00:00 2001 From: Aaron Huber Date: Wed, 9 Dec 2020 11:24:46 -0500 Subject: [PATCH 08/17] Changes per Oliver 120920. --- aaron.bib | 19 ++++++++++++++++++- intro.tex | 28 +++++++++++++++------------- 2 files changed, 33 insertions(+), 14 deletions(-) diff --git a/aaron.bib b/aaron.bib index e9f9dc0..41c1d97 100644 --- a/aaron.bib +++ b/aaron.bib @@ -150,7 +150,24 @@ series = {PODS '07} bdsk-url-1 = {http://sites.computer.org/debull/A18mar/p51.pdf} } - +@article{10.1145/3003665.3003667, +author = {Olteanu, Dan and Schleich, Maximilian}, +title = {Factorized Databases}, +year = {2016}, +issue_date = {June 2016}, +publisher = {Association for Computing Machinery}, +address = {New York, NY, USA}, +volume = {45}, +number = {2}, +issn = {0163-5808}, +url = {https://doi.org/10.1145/3003665.3003667}, +doi = {10.1145/3003665.3003667}, +abstract = {This paper overviews factorized databases and their application to machine learning. The key observation underlying this work is that state-of-the-art relational query processing entails a high degree of redundancy in the computation and representation of query results. This redundancy can be avoided and is not necessary for subsequent analytics such as learning regression models.}, +journal = {SIGMOD Rec.}, +month = sep, +pages = {5–16}, +numpages = {12} +} @inproceedings{DBLP:conf/tapp/Zavodny11, author = {Jakub Z{\'{a}}vodn{\'{y}}}, diff --git a/intro.tex b/intro.tex index 8c33b94..3098543 100644 --- a/intro.tex +++ b/intro.tex @@ -2,9 +2,9 @@ \section{Introduction} -In practice, modern production databases, e.g., Postgres, Oracle, etc. use bag semantics. In contrast, most implementations of probabilistic databases (PDBs) are built in the setting of set semantics, where computing expectations and other moments of an output tuple is analogous to weighted model counting, a known $\sharpP$ problem. +Modern production databases, e.g., Postgres, Oracle, etc. 
use bag semantics. In contrast, most implementations of probabilistic databases (PDBs) are built in the setting of set semantics, where computing the probability of an output tuple is analogous to weighted model counting, a known $\sharpP$ problem.
%the annotation of the tuple is a lineage formula ~\cite{DBLP:series/synthesis/2011Suciu}, which can essentially be thought of as a boolean formula. It is known that computing the probability of a lineage formula is \#-P hard in general
-~\cite{DBLP:series/synthesis/2011Suciu}. In PDBs, the boolean formula encodes which input tuples contributed to an output tuple in an arbitrary query. This formula is also called a lineage formula ~\cite{DBLP:series/synthesis/2011Suciu}, and is a deterministic formula for each output tuple generated by query processing. However, instead of using boolean lineage formulas, the bag case requires the use of polynomials over random variables, representing the probability distribution of the multiplicity of input tuples. In this case, the probability is interpreted as the probability of the input tuple contributing to the multiplicity of the output tuple. Computing the expectation of the polynomial in the bag setting is linear in the number of terms of the expanded formula, with the result that many regard bags to be easy. In this work we consider compressed representations of the lineage formula showing that the complexity landscape becomes much more nuanced, and is \textit{not} linear in general.
+~\cite{DBLP:series/synthesis/2011Suciu}. In PDBs, a boolean formula encodes the conditions under which each output tuple appears in the result. This formula is also called a lineage formula ~\cite{DBLP:series/synthesis/2011Suciu}. The marginal probability of a tuple is the probability that it appears in a possible world. The variables are each mapped to a probability, and by substituting the probability mappings of each variable into the lineage formula, one can compute the marginal probability. 
However, instead of using boolean lineage formulas, the bag case requires the use of polynomials over random variables to represent the probability distribution of the multiplicity of input tuples. In this case, the polynomial is interpreted as the probability of the input tuple contributing to the multiplicity of the output tuple. Or in other words, the expectation of the polynomial is the expected multiplicity of the output tuple. Due to linearity of expectation, computing the expectation of the polynomial in the bag setting is linear in the number of terms of the expanded formula, with the result that many regard bags to be easy. In this work we consider compressed representations of the lineage formula showing that the complexity landscape becomes much more nuanced, and is \textit{not} linear in general. Thus, even bag PDBs do not enjoy the same computational complexity as deterministic databases. @@ -70,39 +70,39 @@ In practice, modern production databases, e.g., Postgres, Oracle, etc. use bag s %\end{figure} \begin{Example}\label{ex:intro} -Assume a set semantics setting. Suppose we are given a Tuple Independent Database ($\ti$), which is a PDB of which all its tuples are assumed to be independent from one another. We are given the following boolean query $\poly() :- R(A), E(A, B), R(B)$, where the lineage of the output will consist of the lineages of all contributing tuples. The $\ti$ example instances are given in~\cref{fig:intro-ex}. While for completeness we should include annotations for Table E, since each tuple has a probability of $1$, we drop them for simplicity. Note that the attribute column $\Phi$ contains a variable/value in the range of $[0, 1]$ denoting its marginal probability. %Finally, see that the tuples in table E can be visualized as the graph in ~\cref{fig:intro-ex-graph}. +Assume a set semantics setting. Suppose we are given a Tuple Independent Database ($\ti$), which is a PDB whose tuples are independent. 
We are given the following boolean query $\poly() :- R(A), E(A, B), R(B)$, where the lineage of the output will consist of the products of all input tuple lineages whose combination satisfies the join condition, summed together. The $\ti$ example instances are given in~\cref{fig:intro-ex}. While for completeness we should include annotations for Table E, since each tuple has a probability of $1$, we drop them for simplicity. The attribute column $\Phi$ contains its respective tuple's marginal probability. %Finally, see that the tuples in table E can be visualized as the graph in ~\cref{fig:intro-ex-graph}.
This query is hard in set semantics because of correlations in the lineage formula, but under bag semantics with a polynomial formula representing the multiple contributing tuples from the input set $\ti$, it is easy since we enjoy linearity of expectation.
\end{Example}
 
-Our work also handles Block Independent Disjoint Databases ($\bi$), a PDB model in which tuples are arranged in blocks, where all blocks are independent from one another, but tuples within the same block are mutually exclusive. For now, let us consider the $\ti$ model. In the example, consider a fixed probability for all tuples.
-Note that computing the probability of the query of ~\cref{ex:intro} in set semantics is indeed \#-P hard, since it is a query that is non-hierarchical
+Our work also handles Block Independent Disjoint Databases ($\bi$), a PDB model in which tuples are arranged in blocks, where all blocks are independent from one another, but tuples within the same block are mutually exclusive. For now, let us consider the $\ti$ model. In the example $Dom(W_i) = \{0, 1\}$ and we consider a fixed probability $\prob$ for all tuple variables such that $P[W_i = 1] = \prob$. 
Let us also be explicit in mentioning that the input tables are \textit{sets}, and the difference when we speak of bag semantics, is that we consider the output tuple to potentially have duplicates, or in other words we are thinking about query output (over set instances) in the bag context when we speak of the output formula under \textit{bag semantics}. +Note that computing the probability of the query of ~\cref{ex:intro} in set semantics is indeed $\sharpP$ hard, since it is a query that is non-hierarchical %, i.e., for $Vars(\poly)$ denoting the set of variables occuring across all atoms of $\poly$, a function $sg(x)$ whose output is the set of all atoms that contain variable $x$, we have that $sg(A) \cap sg(B) \neq \emptyset$ and $sg(A)\not\subseteq sg(B)$ and $sg(B)\not\subseteq sg(A)$, as defined by Dalvi and Suciu in ~\cite{10.1145/1265530.1265571}. For the purposes of this work, we define hard to be anything greater than linear time. %Thus, computing $\expct\pbox{\poly(W_a, W_b, W_c)}$, i.e. the probability of the output with annotation $\poly(W_a, W_b, W_c)$, ($\prob(q)$ in Dalvi, Sucui) is hard in set semantics. -To see why this computation is hard for query $\poly$ over set semantics, we have an output lineage formula of $\poly(W_a, W_b, W_c) = W_aW_b \vee W_bW_c \vee W_cW_a$. Note that the conjunctive clauses are not independent of one another and the computation +To see why this computation is hard for query $\poly$ over set semantics, from the query input we compute an output lineage formula of $\poly(W_a, W_b, W_c) = W_aW_b \vee W_bW_c \vee W_cW_a$. Note that the conjunctive clauses are not independent of one another and the computation of the probability is not linear in the size of $\poly(W_a, W_b, W_c)$. \begin{equation*} \expct\pbox{\poly(W_a, W_b, W_c)} = W_aW_b + W_a\overline{W_b}W_c + \overline{W_a}W_bW_c = 3\prob^2 - 2\prob^3 \end{equation*} -of the probability is not linear in the size of $\poly(W_a, W_b, W_c)$. 
In general, such a computation can be exponential.
+In general, such a computation can be exponential.
%Using Shannon's Expansion,
%\begin{align*}
%&W_aW_b \vee W_bW_c \vee W_cW_a
%= &W_a
%\end{align*}
-However, in the bag setting, the lineage formula is $\poly(W_a, W_b, W_c) = W_aW_b + W_bW_c + W_cW_a$. To be precise, the output lineage formula is produced from a query over a set $\ti$ input, where duplicates are allowed in the output. The expectation computation over the output lineage is a computation of the 'average' multiplicity of an output tuple across possible worlds. In ~\cref{ex:intro}, the expectation is simply
+However, in the bag setting, the polynomial is $\poly(W_a, W_b, W_c) = W_aW_b + W_bW_c + W_cW_a$. To reiterate, the output lineage formula is produced from a query over a set $\ti$ input, where duplicates are allowed in the output. The expectation computation over the output lineage is a computation of the 'average' multiplicity of an output tuple across possible worlds. In ~\cref{ex:intro}, the expectation is simply
\begin{align*}
&\expct\pbox{\poly(W_a, W_b, W_c)} = \expct\pbox{W_aW_b} + \expct\pbox{W_bW_c} + \expct\pbox{W_cW_a}\\
= &\expct\pbox{W_a}\expct\pbox{W_b} + \expct\pbox{W_b}\expct\pbox{W_c} + \expct\pbox{W_c}\expct\pbox{W_a}\\
= &\prob^2 + \prob^2 + \prob^2 = 3\prob^2,
\end{align*}
-which is indeed linear in the size of the lineage as the number of operations in the computation is \textit{exactly} the number of lineage operations. The above equalities hold, since expectation is linear over addition of the natural numbers. Further, we exploited linearity of expectation over multiplication since in the $\ti$ model, all variables are independent. Note that the answer is the same as $\poly(\prob, \prob, \prob)$, although this is coincidental and not true for the general case. 
+which is indeed linear in the size of the lineage as the number of operations in the computation is \textit{exactly} the number of multiplication and addition operations of the polynomial. The above equalities hold, since expectation is linear over addition of the natural numbers. We were also able to push expectation into the product due to the $\ti$ independence property, where all variables are independent. Note that the answer is the same as $\poly(\prob, \prob, \prob)$, where substituting $\prob$ in for each variable yields $\prob \cdot \prob + \prob \cdot \prob + \prob \cdot \prob = 3\prob^2$. This however is coincidental and not true for the general case. Now, consider the query \begin{equation*} \poly^2() := \rel(A), E(A, B), \rel(B), \rel(C), E(C, D), \rel(D), \end{equation*} -For an arbitrary lineage formula, which we can view as a polynomial, it is known that there may exist equivalent compressed representations of the polynomial. One such compression is known as the factorized polynomial, where the polynomial can be broken up into separate factors, and this is generally smaller than the expanded polynomial. Another equivalent form of the polynomial is the SOP, which is the expansion of the factorized polynomial by multiplying out all terms, and in general is exponentially larger (in the number of products) than the factorized version. +For an arbitrary lineage formula, which we can view as a polynomial, it is known that there may exist equivalent compressed representations of the polynomial. One such compression is known as the factorized polynomial ~\cite{10.1145/3003665.3003667}, where the polynomial can be broken up into separate factors. Another equivalent form of the polynomial is the sum of products (SOP), which is the expansion of the factorized polynomial by multiplying out all terms, and in general is exponentially larger (in the number of products) than the factorized version. 
A factorized polynomial of $\poly^2$ is @@ -154,6 +154,8 @@ This factorized expression can be easily modeled as an expression tree as depict W_a^2W_b^2 + W_b^2W_c^2 + W_c^2W_a^2 + 2W_a^2W_bW_c + 2W_aW_b^2W_c + 2W_aW_bW_c^2. \end{equation*} +One can see that the factorized form more closely models the optimizations of deterministic query evaluation. + The expectation then is \begin{align*} &\expct\pbox{\poly^2(W_a, W_b, W_c)}\\ @@ -164,7 +166,7 @@ The expectation then is = & 3\prob^2(1 + 2\prob) \neq \poly^2(\prob, \prob, \prob). \end{align*} -In this case, even though we substituting probability or expecation values in for each variable, $\poly^2(\prob, \prob, \prob)$ is not the answer we seek since for a random variable $X$, $\expct\pbox{X^2} = \sum_{x \in Dom(X)}x^2 \cdot p(x)$. Note, that for our example, $Dom(W_i) = \{0, 1\}$. Intuitively, bags are only hard with self-joins.\AH{Atri suggests a proof in the appendix regarding the last claim.} +In this case, even though we substitute probability values in for each variable, $\poly^2(\prob, \prob, \prob)$ is not the answer we seek since for a random variable $X$, $\expct\pbox{X^2} = \sum_{x \in Dom(X)}x^2 \cdot p(x)$. Intuitively, bags are only hard with self-joins.\AH{Atri suggests a proof in the appendix regarding the last claim.} Define $\rpoly^2(\vct{X})$ to be the resulting polynomial when all exponents $e > 1$ are set to $1$ in $\poly^2$. Note that this structure $\rpoly^2(\prob, \prob, \prob)$ is the expectation we computed, since it is always the case that $i^2 = i$ for all $i$ in $\{0, 1\}$. And, $\poly^2()$ is still computable in linear time in the size of the output polynomial, compressed or SOP. @@ -172,7 +174,7 @@ A compressed polynomial can be exponentially smaller in $k$ for $k$-products. I This works seeks to explore the complexity landscape for compressed representations of polynomials. We use the term 'easy' to mean linear time, and the term 'hard' to mean superlinear time or greater. 
Note that when we are linear in the size of the lineage formula, we essentially have runtime that is of deterministic query complexity. -Up to this point the message seems consistent that bags are always easy, but +Up to this point the message seems consistent that bags are always easy in the size of the SOP representation, but \begin{Question} Is it always the case that bags are easy in the size of the compressed polynomial? \end{Question} @@ -180,7 +182,7 @@ If bags \textit{are} always easy for any compressed version of the polynomial, t Consider the query \begin{equation*} -\poly^3() := \rel(A), E(A, B), R(B), \rel(C), E(C, D), R(D), \rel(F), E(F, G), R(G). +\poly^3() := \left(\rel(A), E(A, B), R(B)\right), \left(\rel(C), E(C, D), R(D)\right), \left(\rel(F), E(F, G), R(G)\right). \end{equation*} Upon inspection one can see that the factorized output polynomial consists of three product terms, while the SOP version consists of $3^3$ terms. We show in this paper that, given a $\ti$ and any conjunctive query with input $\prob$ for all variables of $\poly^3$, this particular query is hard given a factorized polynomial as input. We show this via a reduction to computing the number of $3$-matchings over an arbitrary graph. The fact that bags are not easy in the general case when considering compressed polynomials necessitates an approximation algorithm that computes the expected multiplicity of the output in linear time when the output polynomial is in factorized form. We introduce such an approximation algorithm with confidence guarantees to compute $\rpoly(\vct{X})$ in linear time. Our apporximation algorithm generalizes to the $\bi$ model as well. This shows that for all RA+ queries, the processing time in approximation is essentially the same deterministic processing. From 98ac2ced8fb157ad8cd4e768fc8c87c2abf48cab Mon Sep 17 00:00:00 2001 From: Aaron Huber Date: Wed, 9 Dec 2020 12:20:44 -0500 Subject: [PATCH 09/17] Incorporated all of @atri Riot 120920 suggestions. 
--- lin_sys.tex | 14 ++++++-------- mult_distinct_p.tex | 15 +++++++++------ poly-form.tex | 2 +- single_p.tex | 1 + 4 files changed, 17 insertions(+), 15 deletions(-) diff --git a/lin_sys.tex b/lin_sys.tex index 4d96660..0cb70b7 100644 --- a/lin_sys.tex +++ b/lin_sys.tex @@ -205,26 +205,24 @@ Putting \cref{eq:det-1}, \cref{eq:det-2}, \cref{eq:det-3} together, we have, \begin{align} &\dtrm{\mtrix{\rpoly}} = 30(3\prob^2 - \prob^3)^2 - 90\prob(3\prob^2 - \prob^3)^2 +30(3\prob^2 - \prob^3)^3\nonumber\\ &= 30(3\prob^2 - \prob^3)^2\left(1 - 3\prob + (3\prob^2 - \prob^3)\right) \nonumber\\ -&= 30\left(9\prob^4 - 6\prob^5 + \prob^6\right)\left(-\prob^3 + 3\prob^2 - 3\prob + 1\right)\nonumber\\ -&=\left(30\prob^6 - 180\prob^5 + 270\prob^4\right)\cdot\left(-\prob^3 + 3\prob^2 - 3\prob + 1\right).\label{eq:det-final} +&= 30\prob^4\left(3 - \prob\right)^2\left(-\prob^3 + 3\prob^2 - 3\prob + 1\right)\nonumber\\ +&= 30\prob^4\left(3 - \prob\right)^2\left(1 - \prob\right)^3.\label{eq:det-final} \end{align} -\AH{It appears that the equation below has roots at p = 0 (left factor) and p = 1, with NO roots $\in (0, 1)$.} - -It can be shown through standard polynomial roots computation techniques \footnote{An online roots solver such as https://www.mathportal.org/calculators/polynomials-solvers/polynomial-roots-calculator.php will suffice}, that $\dtrm{\mtrix{\rpoly}}$ has no roots in $(0, 1)$, ensuring independence for all $\prob$ values in $(0, 1)$, and thus ~\cref{lem:lin-sys} follows. +From ~\cref{eq:det-final} it can easily be seen that the roots of $\dtrm{\mtrix{\rpoly}}$ are $0, 1,$ and $3$. Hence there are no roots in $(0, 1)$ and ~\cref{lem:lin-sys} follows. \end{proof} \qed \begin{proof}[Proof of \cref{th:single-p}] -Thus, by ~\cref{lem:lin-sys} we have proved ~\cref{th:single-p} for fixed $p \in (0, 1)$. +The proof follows by ~\cref{lem:lin-sys}. 
\end{proof} \qed \begin{Corollary}\label{cor:single-p-gen-k} For every value $\kElem \geq 3$, there exists a query with $\kElem$ product width that is hard. \end{Corollary} -\begin{proof}[Proof of Corollary ~\cref{cor:single-p-gen-k}] -Consider $\poly^3_{G}$ and $\poly' = 1$ such that $\poly'' = \poly^3_{G} \cdot \poly'$. By ~\cref{th:single-p}, query $\poly''$ with $\kElem = 4$ is hard. +\begin{proof}[Proof of Corollary ~\ref{cor:single-p-gen-k}] +Consider $\poly^3_{G}$ and $\poly' = 1$ such that $\poly'' = \poly^3_{G} \cdot \poly'$. By ~\cref{th:single-p}, query $\poly''$ with $\kElem = 4$ has $\Omega(\numvar^{\frac{4}{3}})$ complexity. \end{proof} \qed diff --git a/mult_distinct_p.tex b/mult_distinct_p.tex index 66be7cc..e5d799c 100644 --- a/mult_distinct_p.tex +++ b/mult_distinct_p.tex @@ -5,10 +5,10 @@ We would like to argue for a compressed version of $\poly(\vct{w})$, in general $\expct_{\vct{w}}\pbox{\poly(\vct{w})}$ cannot be computed in linear time. \AR{Added the hardness result below.} The hardness result is based on the following hardness result: -\begin{theorem}[\cite{k-match}] +\begin{Theorem}[\cite{k-match}] \label{thm:k-match-hard} -Given a positive integer $k$ and an undirected graph $G$ with no self-loops of parallel edges, couting the number of $k$-matchings in $G$ is $\#W[1]$-hard. -\end{theorem} +Given a positive integer $k$ and an undirected graph $G$ with no self-loops or parallel edges, counting the number of $k$-matchings in $G$ is $\#W[1]$-hard. +\end{Theorem} The above result means that we cannot hope to count the number of $k$-matchings in $G=(V,E)$ in time $f(k)\cdot |V|^{O(1)}$ for any function $f$. In fact, all known algorithms to solve this problem takes time $|V|^{\Omega(k)}$. To this end, consider the following graph $G(V, E)$, where $|E| = \numedge$, $|V| = \numvar$, and $i, j \in [\numvar]$. @@ -45,8 +45,11 @@ By ~\cref{lem:qEk-multi-p}, the term $c_{2\kElem}$ can be exactly computed. 
Add
 \end{proof}
 \qed
-
-\begin{Corollary}\label{cor:reduct}
-By ~\cref{lem:qEk-multi-p} and ~\cref{cor:lem-qEk} it follows that computing $\rpoly(\vct{X})$ is hard.
+\begin{Corollary}\label{cor:tilde-q-hard}
+Computing $\rpoly(\vct{X})$ is $\#W[1]$-hard.
 \end{Corollary}
 
+\begin{proof}[Proof of Corollary ~\ref{cor:tilde-q-hard}]
+The proof follows by ~\cref{thm:k-match-hard}, ~\cref{lem:qEk-multi-p} and ~\cref{cor:lem-qEk}.
+\end{proof}
+

diff --git a/poly-form.tex b/poly-form.tex
index 045a21f..968eaec 100644
--- a/poly-form.tex
+++ b/poly-form.tex
@@ -112,7 +112,7 @@ If $\poly$ is given as a sum of monomials, the expectation of $\poly$, i.e., $\e
 \end{Corollary}
 
 \begin{proof}[Proof For Corollary ~\ref{cor:expct-sop}]
-Note that \cref{lem:exp-poly-rpoly} shows that $\expct\pbox{\poly} = \rpoly(\prob_1,\ldots, \prob_\numvar)$. Therefore, if $\poly$ is already in sum of products form, one only needs to compute $\poly(\prob_1,\ldots, \prob_\numvar)$ ignoring exponent terms (note that such a polynomial is $\rpoly(\prob_1,\ldots, \prob_\numvar)$), which is indeed has $O(|\poly|)$ compututations.\qed
+Note that \cref{lem:exp-poly-rpoly} shows that $\expct\pbox{\poly} =$ $\rpoly(\prob_1,\ldots, \prob_\numvar)$. Therefore, if $\poly$ is already in sum of products form, one only needs to compute $\poly(\prob_1,\ldots, \prob_\numvar)$ ignoring exponent terms (note that such a polynomial is $\rpoly(\prob_1,\ldots, \prob_\numvar)$), which indeed has $O(|\poly|)$ computations.\qed
 \end{proof}
 
 

diff --git a/single_p.tex b/single_p.tex
index 3f309fe..9bea7db 100644
--- a/single_p.tex
+++ b/single_p.tex
@@ -5,6 +5,7 @@
 
 In this discussion, let us fix $\kElem = 3$.
+\AH{@atri needs to put in the result for triangles of $\numvar^{\frac{4}{3}}$ runtime.}
 \begin{Theorem}\label{th:single-p}
 If we can compute $\rpoly_{G}^3(\vct{X})$ in $T(\numedge)$ time for $X_1 =\cdots= X_\numvar = \prob$, then we can count the number of triangles, 3-paths, and 3-matchings in $G$ in $T(\numedge) + O(\numedge)$ time.
 \end{Theorem}

From a9c3d362eeea906edc8d391873a4bc5f3ce5a47f Mon Sep 17 00:00:00 2001
From: Aaron Huber
Date: Wed, 9 Dec 2020 13:41:44 -0500
Subject: [PATCH 10/17] Incorporated @atri pdf 120920 suggestions.

---
 mult_distinct_p.tex | 50 +++++++++++++++++++++++----------------------
 1 file changed, 26 insertions(+), 24 deletions(-)

diff --git a/mult_distinct_p.tex b/mult_distinct_p.tex
index e5d799c..492a8a4 100644
--- a/mult_distinct_p.tex
+++ b/mult_distinct_p.tex
@@ -4,31 +4,33 @@
 We would like to argue for a compressed version of $\poly(\vct{w})$, in general
 $\expct_{\vct{w}}\pbox{\poly(\vct{w})}$ cannot be computed in linear time.
 \AR{Added the hardness result below.}
-The hardness result is based on the following hardness result:
+Our hardness result is based on the following hardness result:
 \begin{Theorem}[\cite{k-match}]
 \label{thm:k-match-hard}
 Given a positive integer $k$ and an undirected graph $G$ with no self-loops or parallel edges, counting the number of $k$-matchings in $G$ is $\#W[1]$-hard.
 \end{Theorem}
 
-The above result means that we cannot hope to count the number of $k$-matchings in $G=(V,E)$ in time $f(k)\cdot |V|^{O(1)}$ for any function $f$. In fact, all known algorithms to solve this problem takes time $|V|^{\Omega(k)}$.
+The above result means that we cannot hope to count the number of $k$-matchings in $G=(V,E)$ in time $f(k)\cdot |V|^{O(1)}$ for any function $f$. In fact, all known algorithms to solve this problem take time $|V|^{\Omega(k)}$.
 
-To this end, consider the following graph $G(V, E)$, where $|E| = \numedge$, $|V| = \numvar$, and $i, j \in [\numvar]$.
+To prove our hardness result, consider a graph $G(V, E)$, where $|E| = \numedge$, $|V| = \numvar$, and $i, j \in [\numvar]$. Consider the query $\poly_{G}(\vct{X}) = q_E(X_1,\ldots, X_\numvar) = \sum\limits_{(i, j) \in E} X_i \cdot X_j$. For the following discussion, set $\poly_{G}^\kElem(\vct{X}) = \left(q_E(X_1,\ldots, X_\numvar)\right)^\kElem$. \begin{Lemma}\label{lem:qEk-multi-p} -Given polynomial $\poly_{G}^\kElem(\prob,\ldots, \prob)$, we can write $\rpoly_{G}^\kElem$ as $\rpoly_{G}^\kElem(\prob,\ldots, \prob) = \sum\limits_{i = 0}^{2\kElem} c_i \cdot \prob^i$ for some fixed terms $\vct{c}$. Given $2\kElem + 1$ distinct $\prob$ values, one can compute each $c_i$ in $\vct{c}$ exactly. Additionally, the number of $\kElem$-matchings can be computed exactly. +Given polynomial $\poly_{G}^\kElem(\prob,\ldots, \prob)$, we can write $\rpoly_{G}^\kElem$ as $\rpoly_{G}^\kElem(\prob,\ldots, \prob) = \sum\limits_{i = 0}^{2\kElem} c_i \cdot \prob^i$ for some fixed terms $\vct{c}$. Given $2\kElem + 1$ distinct $\prob$ values, one can compute $\rpoly_{G}^\kElem$ for the number of $\kElem$-matchings in $G$ in $poly(\kElem)$ time. \end{Lemma} \begin{proof}[Proof of ~\cref{lem:qEk-multi-p}] -It is trivial to see that one can readily expand the exponential expression by performing the $n^\kElem$ product operations, yielding the polynomial in the sum of products form of the lemma statement. By definition $\rpoly_{G}^\kElem$ reduces all variable exponents greater than $1$ to $1$. Thus, a monomial such as $X_i^\kElem X_j^\kElem$ is $X_iX_j$ in $\rpoly_{G}^\kElem$, and the value after substitution is $p_i\cdot p_j = p^2$. Further, that the number of terms in the sum is no greater than $2\kElem + 1$, can be easily justified by the fact that each edge has two endpoints, and the most endpoints occur when we have $\kElem$ distinct edges (such a subgraph is also known as a $\kElem$-matching), with non-intersecting points, a case equivalent to $p^{2\kElem}$. 
+%It is trivial to see that one can readily expand the exponential expression by performing the $n^\kElem$ product operations, yielding the polynomial in the sum of products form of the lemma statement. By definition $\rpoly_{G}^\kElem$ reduces all variable exponents greater than $1$ to $1$. Thus, a monomial such as $X_i^\kElem X_j^\kElem$ is $X_iX_j$ in $\rpoly_{G}^\kElem$, and the value after substitution is $p_i\cdot p_j = p^2$. Further, that the number of terms in the sum is no greater than $2\kElem + 1$, can be easily justified by the fact that each edge has two endpoints, and the most endpoints occur when we have $\kElem$ distinct edges (such a subgraph is also known as a $\kElem$-matching), with non-intersecting points, a case equivalent to $p^{2\kElem}$. +Since $\rpoly_{G}^\kElem(\prob,\ldots, \prob) = \sum\limits_{i = 0}^{2\kElem} c_i \cdot \prob^i$, this implies that $\rpoly_{G}^\kElem$ is a polynomial of degree $2\kElem$ and hence $\rpoly_{G}^\kElem(\prob,\ldots, \prob)$ is a polynomial in $\prob$ of degree $2\kElem$. -Given that we have $2\kElem + 1$ distinct values of $\prob$ by the lemma statement, it follows that we then have $2\kElem + 1$ linear equations which are distinct. Further, by construction of the summation, these $2\kElem + 1$ equations collectively form the Vandermonde matrix, from which it follows that we have a matrix with full rank, and we can solve the linear system to determine $\vct{c}$ exactly. +Given that we then have $2\kElem + 1$ distinct values of $\rpoly_{G}^\kElem(\prob,\ldots, \prob)$, it follows that we then have $2\kElem + 1$ linear equations which are distinct. Further, by construction of the summation, the coefficient matrix of the $2\kElem + 1$ equations is the Vandermonde matrix, from which it follows that we have a matrix with full rank, and we can solve the linear system in $O(k^3)$ time to determine $\vct{c}$ exactly. 
-It has already been established above that a $\kElem$-matching ($\kmatch$) has coefficient $c_{2\kElem}$. As noted, a $\kElem$-matching occurs when there are $\kElem$ edges, $e_1, e_2,\ldots, e_\kElem$, such that all of them are disjoint, i.e., $e_1 \neq e_2 \neq \cdots \neq e_\kElem$. In all $\kElem$ factors of $\poly_{G}^\kElem(\vct{X})$ there are $k$ choices from the first factor to select an edge for a given $\kElem$ matching, $\kElem - 1$ choices in the second factor, and so on throughout all the factors, yielding $\kElem!$ duplicate terms for each $\kElem$ matching in the expansion of $\poly_{G}^\kElem(\vct{X})$. +Note that $c_{2\kElem}$ is $\kElem! \cdot \numocc{G}{\kmatch}$. This can be seen intuitively by looking at the original factorized representation $\poly_{G}^\kElem(\vct{X})$, where, across each of the $\kElem$ products, an arbitrary $\kElem$-matching can be selected $\prod_{i = 1}^\kElem \kElem = \kElem!$ times. Note that each $\kElem$-matching $(X_{i_1} X_{j_1})\ldots$ $(X_{i_k} X_{j_k})$ corresponds to the unique monomial $\prod_{\ell = 1}^\kElem X_{i_\ell}X_{j_\ell}$ in $\rpoly_{G}^\kElem(\vct{X})$, where each index is distinct. Since $\rpoly$ contains only exponents $e \leq 1$, the only degree $2\kElem$ terms that can exist in $\rpoly_{G}^\kElem$ are $\kElem$-matchings since every other monomial in $\rpoly_{G}^\kElem(\vct{X})$ has degree $< 2\kElem$. +%It has already been established above that a $\kElem$-matching ($\kmatch$) has coefficient $c_{2\kElem}$. As noted, a $\kElem$-matching occurs when there are $\kElem$ edges, $e_1, e_2,\ldots, e_\kElem$, such that all of them are disjoint, i.e., $e_1 \neq e_2 \neq \cdots \neq e_\kElem$. 
In all $\kElem$ factors of $\poly_{G}^\kElem(\vct{X})$ there are $k$ choices from the first factor to select an edge for a given $\kElem$ matching, $\kElem - 1$ choices in the second factor, and so on throughout all the factors, yielding $\kElem!$ duplicate terms for each $\kElem$ matching in the expansion of $\poly_{G}^\kElem(\vct{X})$. -Thus, the product $\kElem!\cdot\numocc{G}{\kmatch}$ is the exact number of $\kElem$-matchings in $\poly_{G}^\kElem(\vct{X})$. +Then, since we have $\kElem!$ duplicates of each $\kElem$-matching in $\numocc{G}{\kmatch}$, $c_{2\kElem} = \kElem!\cdot\numocc{G}{\kmatch}$. This allows us to solve for $\numocc{G}{\kmatch}$ by simply dividing $c_{2\kElem}$ by $\kElem!$. By ~\cref{thm:k-match-hard} it follows then that computing $\rpoly(\vct{X})$ given multiple distinct $\prob$ values is $\#W[1]$-hard. \end{proof} \qed @@ -36,20 +38,20 @@ Thus, the product $\kElem!\cdot\numocc{G}{\kmatch}$ is the exact number of $\kEl -\begin{Corollary}\label{cor:lem-qEk} -One can compute $\numocc{G}{\kmatch}$ in $\query_{G}^\kElem(\vct{X})$ exactly. -\end{Corollary} - -\begin{proof}[Proof for Corollary ~\ref{cor:lem-qEk}] -By ~\cref{lem:qEk-multi-p}, the term $c_{2\kElem}$ can be exactly computed. Additionally we know that $c_{2\kElem}$ can be broken into two factors, and by dividing $c_{2\kElem}$ by the factor $\kElem!$, it follows that the resulting value is indeed $\numocc{G}{\kmatch}$. -\end{proof} - -\qed -\begin{Corollary}\label{cor:tilde-q-hard} -Computing $\rpoly(\vct{X})$ is $\#W[1]$-hard. -\end{Corollary} - -\begin{proof}[Proof of Corollary ~\ref{cor:tilde-q-hard}] -The proof follows by ~\cref{thm:k-match-hard}, ~\cref{lem:qEk-multi-p} and ~\cref{cor:lem-qEk}. -\end{proof} +%\begin{Corollary}\label{cor:lem-qEk} +%One can compute $\numocc{G}{\kmatch}$ in $\query_{G}^\kElem(\vct{X})$ exactly. +%\end{Corollary} +% +%\begin{proof}[Proof for Corollary ~\ref{cor:lem-qEk}] +%By ~\cref{lem:qEk-multi-p}, the term $c_{2\kElem}$ can be exactly computed. 
Additionally we know that $c_{2\kElem}$ can be broken into two factors, and by dividing $c_{2\kElem}$ by the factor $\kElem!$, it follows that the resulting value is indeed $\numocc{G}{\kmatch}$. +%\end{proof} +% +%\qed +%\begin{Corollary}\label{cor:tilde-q-hard} +%Computing $\rpoly(\vct{X})$ is $\#W[1]$-hard. +%\end{Corollary} +% +%\begin{proof}[Proof of Corollary ~\ref{cor:tilde-q-hard}] +%The proof follows by ~\cref{thm:k-match-hard}, ~\cref{lem:qEk-multi-p} and ~\cref{cor:lem-qEk}. +%\end{proof} From dcf0ec7c9df38025a9f0abd21e0cb8ba21115c4d Mon Sep 17 00:00:00 2001 From: Aaron Huber Date: Wed, 9 Dec 2020 16:51:37 -0500 Subject: [PATCH 11/17] Incorporated @oliver 120920 pdf suggestions. --- intro.tex | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/intro.tex b/intro.tex index 3098543..ee9180c 100644 --- a/intro.tex +++ b/intro.tex @@ -2,9 +2,12 @@ \section{Introduction} -Modern production databases, e.g., Postgres, Oracle, etc. use bag semantics. In contrast, most implementations of probabilistic databases (PDBs) are built in the setting of set semantics, where computing the probability of an output tuple is analogous to weighted model counting, a known $\sharpP$ problem. +Modern production databases, e.g., Postgres, Oracle, etc. use bag semantics. In contrast, most implementations of probabilistic databases (PDBs) are built in the setting of set semantics, where computing the probability of an output tuple is analogous to weighted model counting (a known $\sharpP$ problem). %the annotation of the tuple is a lineage formula ~\cite{DBLP:series/synthesis/2011Suciu}, which can essentially be thought of as a boolean formula. It is known that computing the probability of a lineage formula is \#-P hard in general -~\cite{DBLP:series/synthesis/2011Suciu}. In PDBs, a boolean formula encodes the conditions under which each output tuple appears in the result. 
This formula is also called a lineage formula ~\cite{DBLP:series/synthesis/2011Suciu}. The marginal probability of a tuple is its probability to appear in a possible world. The set of variables are each mapped to a probability, and by substituting the probability mappings of each variable into the lineage formula, one can compute the marginal probability. However, instead of using boolean lineage formulas, the bag case requires the use of polynomials over random variables to represent the probability distribution of the multiplicity of input tuples. In this case, the polynomial is interpreted as the probability of the input tuple contributing to the multiplicity of the output tuple. Or in other words, the expectation of the polynomial is the expected multiplicity of the output tuple. Due to linearity of expectation, computing the expectation of the polynomial in the bag setting is linear in the number of terms of the expanded formula, with the result that many regard bags to be easy. In this work we consider compressed representations of the lineage formula showing that the complexity landscape becomes much more nuanced, and is \textit{not} linear in general. Thus, even bag PDBs do not enjoy the same computational complexity as deterministic databases. +~\cite{DBLP:series/synthesis/2011Suciu}. In PDBs, a boolean formula, also called a lineage formula ~\cite{DBLP:series/synthesis/2011Suciu}, encodes the conditions under which each output tuple appears in the result. The marginal probability of this formula being true is the tuple's probability to appear in a possible world. The set of variables are each mapped to probability values, from which the marginal probability can be computed. The corresponding problem for bag PDBs is computing the expected multiplicity of the output tuple, which requires polynomials to represent the probability distribution of the multiplicity of input tuples. 
+%In this case, the polynomial is interpreted as the probability of the input tuple contributing to the multiplicity of the output tuple. Or in other words, the expectation of the polynomial is the expected multiplicity of the output tuple.
+The standard representation for lineage formulas in PDBs is sum of products (SOP), which is much bigger than the lineage-free representation that deterministic databases employ.
+Due to linearity of expectation, computing the expectation of the polynomial in the bag setting is linear in the number of terms in the SOP formula, with the result that many regard bags to be easy. In this work we consider compressed representations of the lineage formula. We show that the complexity landscape becomes much more nuanced, and is \textit{not} linear in general. The compressed representation of the formula is analogous to deterministic query optimizations (e.g. pushing down projections). Thus, even bag PDBs do not enjoy the same computational complexity as deterministic databases.
 
 
 
@@ -70,8 +73,9 @@ Modern production databases, e.g., Postgres, Oracle, etc. use bag semantics. In
 %\end{figure}
 
 \begin{Example}\label{ex:intro}
-Assume a set semantics setting. Suppose we are given a Tuple Independent Database ($\ti$), which is a PDB whose tuples are independent. We are given the following boolean query $\poly() :- R(A), E(A, B), R(B)$, where the lineage of the output will consist of the products of all input tuple lineages whose combination satsifies the join condition, summed together. The $\ti$ example instances are given in~\cref{fig:intro-ex}. While for completeness we should include annotations for Table E, since each tuple has a probability of $1$, we drop them for simplicity. The attribute column $\Phi$ contains its repsective tuple's marginal probability. %Finally, see that the tuples in table E can be visualized as the graph in ~\cref{fig:intro-ex-graph}.
-This query is hard in set semantics because of correlations in the lineage formula, but under bag semantics with a polynomial formula representing the multiple contributing tuples from the input set $\ti$, it is easy since we enjoy linearity of expectation.
+Assume a set semantics setting. Suppose we are given a Tuple Independent Database ($\ti$), which is a PDB whose tuples are independent. We are given the following boolean query $\poly() :- R(A), E(A, B), R(B)$. The lineage of the output is computed by adding variables when a union operation is performed, and by multiplying variables for a join operation. This yields the products of all input tuple lineages whose combination satisfies the join condition, summed together. A $\ti$ example instance is given in~\cref{fig:intro-ex}. While for completeness we should include random variables for Table E, since each tuple has a probability of $1$, we drop them for simplicity. The attribute column $\Phi$ contains its respective random variable, where $P[W_i = 1]$ is its marginal probability. %Finally, see that the tuples in table E can be visualized as the graph in ~\cref{fig:intro-ex-graph}.
+Next we explain why this query is hard in set semantics % due to correlations in the lineage formula. But
+and easy under bag semantics.% with a polynomial formula representing the multiple contributing tuples from the input set $\ti$, it is easy since we enjoy linearity of expectation.
 \end{Example}
 
 Our work also handles Block Independent Disjoint Databases ($\bi$), a PDB model in which tuples are arranged in blocks, where all blocks are independent from one another, but tuples within the same block are mutually exclusive. For now, let us consider the $\ti$ model. In the example $Dom(W_i) = \{0, 1\}$ and we consider a fixed probability $\prob$ for all tuple variables such that $P[W_i = 1] = \prob$.
Let us also be explicit in mentioning that the input tables are \textit{sets}, and the difference when we speak of bag semantics, is that we consider the output tuple to potentially have duplicates, or in other words we are thinking about query output (over set instances) in the bag context when we speak of the output formula under \textit{bag semantics}. From 2caa90b1142328f06261ff1bbdd7f3252b0d0630 Mon Sep 17 00:00:00 2001 From: Aaron Huber Date: Wed, 9 Dec 2020 17:40:54 -0500 Subject: [PATCH 12/17] Implemented changes to Sec. 3.1 per Riot conversation w/@atri 120920 --- mult_distinct_p.tex | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mult_distinct_p.tex b/mult_distinct_p.tex index 492a8a4..455741a 100644 --- a/mult_distinct_p.tex +++ b/mult_distinct_p.tex @@ -25,7 +25,7 @@ Given polynomial $\poly_{G}^\kElem(\prob,\ldots, \prob)$, we can write $\rpoly_{ %It is trivial to see that one can readily expand the exponential expression by performing the $n^\kElem$ product operations, yielding the polynomial in the sum of products form of the lemma statement. By definition $\rpoly_{G}^\kElem$ reduces all variable exponents greater than $1$ to $1$. Thus, a monomial such as $X_i^\kElem X_j^\kElem$ is $X_iX_j$ in $\rpoly_{G}^\kElem$, and the value after substitution is $p_i\cdot p_j = p^2$. Further, that the number of terms in the sum is no greater than $2\kElem + 1$, can be easily justified by the fact that each edge has two endpoints, and the most endpoints occur when we have $\kElem$ distinct edges (such a subgraph is also known as a $\kElem$-matching), with non-intersecting points, a case equivalent to $p^{2\kElem}$. Since $\rpoly_{G}^\kElem(\prob,\ldots, \prob) = \sum\limits_{i = 0}^{2\kElem} c_i \cdot \prob^i$, this implies that $\rpoly_{G}^\kElem$ is a polynomial of degree $2\kElem$ and hence $\rpoly_{G}^\kElem(\prob,\ldots, \prob)$ is a polynomial in $\prob$ of degree $2\kElem$. 
-Given that we then have $2\kElem + 1$ distinct values of $\rpoly_{G}^\kElem(\prob,\ldots, \prob)$, it follows that we then have $2\kElem + 1$ linear equations which are distinct. Further, by construction of the summation, the coefficient matrix of the $2\kElem + 1$ equations is the Vandermonde matrix, from which it follows that we have a matrix with full rank, and we can solve the linear system in $O(k^3)$ time to determine $\vct{c}$ exactly. +Given that we then have $2\kElem + 1$ distinct values of $\rpoly_{G}^\kElem(\prob,\ldots, \prob)$, it follows that we then have $2\kElem + 1$ linear equations of the form $\prob_i^0\ldots\prob_i^{2\kElem}$ which are distinct. We have then a linear system of the form $M \cdot \vct{c} = \vct{b}$ where $M$ holds the aforementioned linear equations, $\vct{c}$ is the coefficient vector ($c_0,\ldots, c_{2\kElem}$), and $\vct{b}$ is the vector containing each $\rpoly_{G}^\kElem(\prob_i,\ldots, \prob_i)$. By construction of the summation, matrix $M$ is the Vandermonde matrix, from which it follows that we have a matrix with full rank, and we can solve the linear system in $O(k^3)$ time to determine $\vct{c}$ exactly. Note that $c_{2\kElem}$ is $\kElem! \cdot \numocc{G}{\kmatch}$. This can be seen intuitively by looking at the original factorized representation $\poly_{G}^\kElem(\vct{X})$, where, across each of the $\kElem$ products, an arbitrary $\kElem$-matching can be selected $\prod_{i = 1}^\kElem \kElem = \kElem!$ times. Note that each $\kElem$-matching $(X_{i_1} X_{j_1})\ldots$ $(X_{i_k} X_{j_k})$ corresponds to the unique monomial $\prod_{\ell = 1}^\kElem X_{i_\ell}X_{j_\ell}$ in $\rpoly_{G}^\kElem(\vct{X})$, where each index is distinct. Since $\rpoly$ contains only exponents $e \leq 1$, the only degree $2\kElem$ terms that can exist in $\rpoly_{G}^\kElem$ are $\kElem$-matchings since every other monomial in $\rpoly_{G}^\kElem(\vct{X})$ has degree $< 2\kElem$. 
%It has already been established above that a $\kElem$-matching ($\kmatch$) has coefficient $c_{2\kElem}$. As noted, a $\kElem$-matching occurs when there are $\kElem$ edges, $e_1, e_2,\ldots, e_\kElem$, such that all of them are disjoint, i.e., $e_1 \neq e_2 \neq \cdots \neq e_\kElem$. In all $\kElem$ factors of $\poly_{G}^\kElem(\vct{X})$ there are $k$ choices from the first factor to select an edge for a given $\kElem$ matching, $\kElem - 1$ choices in the second factor, and so on throughout all the factors, yielding $\kElem!$ duplicate terms for each $\kElem$ matching in the expansion of $\poly_{G}^\kElem(\vct{X})$. From 60d3a772a5eac0e5e26a51337c08db7e32146876 Mon Sep 17 00:00:00 2001 From: Aaron Huber Date: Wed, 9 Dec 2020 17:53:26 -0500 Subject: [PATCH 13/17] Added the \gamma cancellations to runtime analysis of outer approx algo --- approx_alg.tex | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/approx_alg.tex b/approx_alg.tex index 35bc054..3325439 100644 --- a/approx_alg.tex +++ b/approx_alg.tex @@ -185,7 +185,7 @@ Recall that the notation $[x, y]$ denotes the range of values between $x$ and $y \While{$\vari{i} \leq \numsamp$}\Comment{Perform the required number of samples} \State $(\vari{M}, \vari{sgn}_\vari{i}) \gets $ \sampmon($\etree_\vari{mod}$)\label{alg:mon-sam-sample} \For{$\vari{x}_\vari{\block,i}$ \text{ in } $\vari{M}$} - \If{$\bivec[\block] = 1$}\Comment{If we have already had a variable from this block, $\rpoly$ drops the sample.} + \If{$\bivec[\block] = 1$}\label{alg:mon-sam-drop}\Comment{If we have already had a variable from this block, $\rpoly$ drops the sample.} \newline \State $\vari{sample}_{\vari{next}} \gets 1$ \State break @@ -201,7 +201,7 @@ Recall that the notation $[x, y]$ denotes the range of values between $x$ and $y \EndIf \EndFor \If{$\vari{sample}_{\vari{next}} = 1$} - \State $\vari{sample}_{\vari{next}} \gets 0$ + \State $\vari{sample}_{\vari{next}} \gets 0$\label{alg:mon-sam-resamp} \Else \State 
$\vari{Y}_\vari{i} \gets 1$\label{alg:mon-sam-assign1}\newline \For{$\vari{x}_{\vari{j}}$ \text{ in } $\vari{M}$}%_{\vari{i}}$} @@ -285,11 +285,13 @@ By Hoeffding we obtain the number of samples necessary to acheive the claimed ad This concludes the proof for the first claim of theorem ~\ref{lem:mon-samp}. \paragraph{Run-time Analysis} +For a $\bi$ instance, it is possible that cancellations can occur as seen ~\cref{alg:mon-sam-drop}, and by ~\cref{alg:mon-sam-resamp} the algorithm will then re-sample. This affects the overall runtime. Let us denote by $\gamma$ the number of cancellations. + Note that lines ~\ref{alg:mon-sam-global1}, ~\ref{alg:mon-sam-global2}, and ~\ref{alg:mon-sam-global3} are $O(1)$ global operations. The call to $\onepass$ in line ~\ref{alg:mon-sam-onepass} by lemma ~\ref{lem:one-pass} is $O(\treesize(\etree))$ time. %First, algorithm ~\ref{alg:mon-sam} calls \textsc{OnePass} which takes $O(|\etree|)$ time. Then for $\numsamp = \ceil{\frac{2 \log{\frac{2}{\conf}}}{\error^2}}$, the $O(1)$ assignment, product, and addition operations occur. Over the same $\numsamp$ iterations, $\sampmon$ is called, with a runtime of $O(\log{k}\cdot k \cdot depth(\etree))$ by lemma ~\ref{lem:sample}. Finally, over the same iterations, because $\degree(\polyf(\abs{\etree})) = k$, the assignment and product operations of line ~\ref{alg:mon-sam-product2} are called at most $k$ times. -Thus we have $O(\treesize(\etree)) + O(\frac{\log{\frac{1}{\conf}}}{\error^2} \cdot \left(k + \log{k}\cdot k \cdot depth(\etree)\right) = O\left(\treesize(\etree) + \left(\frac{\log{\frac{1}{\conf}}}{\error^2} \cdot \left(k \cdot\log{k} \cdot depth(\etree)\right)\right)\right)$ overall running time. 
+Thus we have $O(\treesize(\etree)) + O(\left(\frac{\log{\frac{1}{\conf}}}{\error^2} + \gamma\right) \cdot \left(k + \log{k}\cdot k \cdot depth(\etree)\right) = O\left(\treesize(\etree) + \left(\left(\frac{\log{\frac{1}{\conf}}}{\error^2} + \gamma\right) \cdot \left(k \cdot\log{k} \cdot depth(\etree)\right)\right)\right)$ overall running time. \end{proof} \qed From 81baf437b65da3f6de071f9ba013a8f7139e422a Mon Sep 17 00:00:00 2001 From: Aaron Huber Date: Thu, 10 Dec 2020 12:07:09 -0500 Subject: [PATCH 14/17] Implemented @atri 121020 pdf suggestions for sec 3. --- approx_alg.tex | 2 +- mult_distinct_p.tex | 27 ++++++++++++++++++++++----- 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/approx_alg.tex b/approx_alg.tex index 3325439..80ec762 100644 --- a/approx_alg.tex +++ b/approx_alg.tex @@ -285,7 +285,7 @@ By Hoeffding we obtain the number of samples necessary to acheive the claimed ad This concludes the proof for the first claim of theorem ~\ref{lem:mon-samp}. \paragraph{Run-time Analysis} -For a $\bi$ instance, it is possible that cancellations can occur as seen ~\cref{alg:mon-sam-drop}, and by ~\cref{alg:mon-sam-resamp} the algorithm will then re-sample. This affects the overall runtime. Let us denote by $\gamma$ the number of cancellations. +For a $\bi$ instance, it is possible that cancellations can occur as seen in ~\cref{alg:mon-sam-drop}, and by ~\cref{alg:mon-sam-resamp} the algorithm will then re-sample. This affects the overall runtime. Let us denote by $\gamma$ the number of cancellations. Note that lines ~\ref{alg:mon-sam-global1}, ~\ref{alg:mon-sam-global2}, and ~\ref{alg:mon-sam-global3} are $O(1)$ global operations. The call to $\onepass$ in line ~\ref{alg:mon-sam-onepass} by lemma ~\ref{lem:one-pass} is $O(\treesize(\etree))$ time. %First, algorithm ~\ref{alg:mon-sam} calls \textsc{OnePass} which takes $O(|\etree|)$ time. 
diff --git a/mult_distinct_p.tex b/mult_distinct_p.tex index 455741a..cd95079 100644 --- a/mult_distinct_p.tex +++ b/mult_distinct_p.tex @@ -18,19 +18,36 @@ Consider the query $\poly_{G}(\vct{X}) = q_E(X_1,\ldots, X_\numvar) = \sum\limit For the following discussion, set $\poly_{G}^\kElem(\vct{X}) = \left(q_E(X_1,\ldots, X_\numvar)\right)^\kElem$. \begin{Lemma}\label{lem:qEk-multi-p} -Given polynomial $\poly_{G}^\kElem(\prob,\ldots, \prob)$, we can write $\rpoly_{G}^\kElem$ as $\rpoly_{G}^\kElem(\prob,\ldots, \prob) = \sum\limits_{i = 0}^{2\kElem} c_i \cdot \prob^i$ for some fixed terms $\vct{c}$. Given $2\kElem + 1$ distinct $\prob$ values, one can compute $\rpoly_{G}^\kElem$ for the number of $\kElem$-matchings in $G$ in $poly(\kElem)$ time. +Let $\prob_0,\ldots, \prob_{2\kElem}$ be distinct values in $(0, 1]$. Then given the values $\rpoly_{G}^\kElem(\prob_i,\ldots, \prob_i)$ for $0\leq i\leq 2\kElem$, the number of $\kElem$-matchings in $G$ can be computed in $poly(\kElem)$ time. \end{Lemma} \begin{proof}[Proof of ~\cref{lem:qEk-multi-p}] %It is trivial to see that one can readily expand the exponential expression by performing the $n^\kElem$ product operations, yielding the polynomial in the sum of products form of the lemma statement. By definition $\rpoly_{G}^\kElem$ reduces all variable exponents greater than $1$ to $1$. Thus, a monomial such as $X_i^\kElem X_j^\kElem$ is $X_iX_j$ in $\rpoly_{G}^\kElem$, and the value after substitution is $p_i\cdot p_j = p^2$. Further, that the number of terms in the sum is no greater than $2\kElem + 1$, can be easily justified by the fact that each edge has two endpoints, and the most endpoints occur when we have $\kElem$ distinct edges (such a subgraph is also known as a $\kElem$-matching), with non-intersecting points, a case equivalent to $p^{2\kElem}$. 
-Since $\rpoly_{G}^\kElem(\prob,\ldots, \prob) = \sum\limits_{i = 0}^{2\kElem} c_i \cdot \prob^i$, this implies that $\rpoly_{G}^\kElem$ is a polynomial of degree $2\kElem$ and hence $\rpoly_{G}^\kElem(\prob,\ldots, \prob)$ is a polynomial in $\prob$ of degree $2\kElem$. +We will show that $\rpoly_{G}^\kElem(\prob,\ldots, \prob) = \sum\limits_{i = 0}^{2\kElem} c_i \cdot \prob^i$. First, since $\poly_G^\kElem(\vct{X})$ has $\kElem$ products of monomials of degree $2$, it follows that $\poly_G^\kElem(\vct{X})$ has degree $2\kElem$. We can further write $\poly_{G}^{\kElem}(\vct{X})$ in its expanded SOP form, +\begin{equation*} +\sum_{\substack{(i_1, j_1),\\\cdots,\\(i_\kElem, j_\kElem) \in E}}X_{i_1}X_{j_1}\cdots X_{i_\kElem}X_{j_\kElem} +\end{equation*} +Since each of $(i_1, j_1),\ldots, (i_\kElem, j_\kElem)$ are from $E$, it follows that the set of $\kElem!$ permutations of the $\kElem$ $X_iX_j$ pairs which form the monomial products are of degree $2\kElem$ with the number of distinct variables in an arbitrary monomial $\leq 2\kElem$. By definition, $\rpoly_{G}^{\kElem}(\vct{X})$ sets every exponent $e > 1$ to $e = 1$, thereby shrinking the degree a monomial product term in the SOP form of $\poly_{G}^{\kElem}(\vct{X})$ to the exact number of distinct variables the monomial contains. This implies that $\rpoly_{G}^\kElem$ is a polynomial of degree $2\kElem$ and hence $\rpoly_{G}^\kElem(\prob,\ldots, \prob)$ is a polynomial in $\prob$ of degree $2\kElem$. Then it is the case that +\begin{equation*} +\rpoly_{G}^{\kElem}(\prob,\ldots, \prob) = \sum_{i = 0}^{2\kElem} c_i \prob^i +\end{equation*} +where $c_i$ denotes all monomials in the expansion of $\poly_{G}^{\kElem}(\vct{X})$ composed of $i$ distinct variables, with $\prob$ substituted for each distinct variable. 
-Given that we then have $2\kElem + 1$ distinct values of $\rpoly_{G}^\kElem(\prob,\ldots, \prob)$, it follows that we then have $2\kElem + 1$ linear equations of the form $\prob_i^0\ldots\prob_i^{2\kElem}$ which are distinct. We have then a linear system of the form $M \cdot \vct{c} = \vct{b}$ where $M$ holds the aforementioned linear equations, $\vct{c}$ is the coefficient vector ($c_0,\ldots, c_{2\kElem}$), and $\vct{b}$ is the vector containing each $\rpoly_{G}^\kElem(\prob_i,\ldots, \prob_i)$. By construction of the summation, matrix $M$ is the Vandermonde matrix, from which it follows that we have a matrix with full rank, and we can solve the linear system in $O(k^3)$ time to determine $\vct{c}$ exactly. +Given that we then have $2\kElem + 1$ distinct values of $\rpoly_{G}^\kElem(\prob,\ldots, \prob)$ for $0\leq i\leq2\kElem$, it follows that we then have $2\kElem + 1$ distinct rows of the form $\prob_i^0\ldots\prob_i^{2\kElem}$ which form a matrix $M$. We have then a linear system of the form $M \cdot \vct{c} = \vct{b}$ where $\vct{c}$ is the coefficient vector ($c_0,\ldots, c_{2\kElem}$), and $\vct{b}$ is the vector such that $\vct{b}[i] = \rpoly_{G}^\kElem(\prob_i,\ldots, \prob_i)$. By construction of the summation, matrix $M$ is the Vandermonde matrix, from which it follows that we have a matrix with full rank, and we can solve the linear system in $O(k^3)$ time to determine $\vct{c}$ exactly. -Note that $c_{2\kElem}$ is $\kElem! \cdot \numocc{G}{\kmatch}$. This can be seen intuitively by looking at the original factorized representation $\poly_{G}^\kElem(\vct{X})$, where, across each of the $\kElem$ products, an arbitrary $\kElem$-matching can be selected $\prod_{i = 1}^\kElem \kElem = \kElem!$ times. Note that each $\kElem$-matching $(X_{i_1} X_{j_1})\ldots$ $(X_{i_k} X_{j_k})$ corresponds to the unique monomial $\prod_{\ell = 1}^\kElem X_{i_\ell}X_{j_\ell}$ in $\rpoly_{G}^\kElem(\vct{X})$, where each index is distinct. 
Since $\rpoly$ contains only exponents $e \leq 1$, the only degree $2\kElem$ terms that can exist in $\rpoly_{G}^\kElem$ are $\kElem$-matchings since every other monomial in $\rpoly_{G}^\kElem(\vct{X})$ has degree $< 2\kElem$.
+Denote the number of $\kElem$-matchings in $G$ as $\numocc{G}{\kmatch}$. Note that $c_{2\kElem}$ is $\kElem! \cdot \numocc{G}{\kmatch}$. This can be seen intuitively by looking at the original factorized representation $\poly_{G}^\kElem(\vct{X})$, where, across each of the $\kElem$ products, an arbitrary $\kElem$-matching can be selected $\prod_{i = 1}^{\kElem} \left(\kElem - (i - 1)\right) = \kElem!$ times. Note that each $\kElem$-matching $(i_1, j_1)\ldots$ $(i_\kElem, j_\kElem)$ in $G$ corresponds to the unique monomial $\prod_{\ell = 1}^\kElem X_{i_\ell}X_{j_\ell}$ in $\poly_{G}^\kElem(\vct{X})$, where each index is distinct. Since each index is distinct, each variable has an exponent $e = 1$ and this monomial survives in $\rpoly_{G}^{\kElem}(\vct{X})$. Since $\rpoly$ contains only exponents $e \leq 1$, the only degree $2\kElem$ terms that can exist in $\rpoly_{G}^\kElem$ are $\kElem$-matchings since every other monomial in $\poly_{G}^\kElem(\vct{X})$ has strictly less than $2\kElem$ distinct variables, which, as stated earlier, implies that every other non-$\kElem$-matching monomial in $\rpoly_{G}^\kElem(\vct{X})$ has degree $< 2\kElem$.

%It has already been established above that a $\kElem$-matching ($\kmatch$) has coefficient $c_{2\kElem}$. As noted, a $\kElem$-matching occurs when there are $\kElem$ edges, $e_1, e_2,\ldots, e_\kElem$, such that all of them are disjoint, i.e., $e_1 \neq e_2 \neq \cdots \neq e_\kElem$. In all $\kElem$ factors of $\poly_{G}^\kElem(\vct{X})$ there are $k$ choices from the first factor to select an edge for a given $\kElem$ matching, $\kElem - 1$ choices in the second factor, and so on throughout all the factors, yielding $\kElem!$ duplicate terms for each $\kElem$ matching in the expansion of $\poly_{G}^\kElem(\vct{X})$.
-Then, since we have $\kElem!$ duplicates of each $\kElem$-matching in $\numocc{G}{\kmatch}$, $c_{2\kElem} = \kElem!\cdot\numocc{G}{\kmatch}$. This allows us to solve for $\numocc{G}{\kmatch}$ by simply dividing $c_{2\kElem}$ by $\kElem!$. By ~\cref{thm:k-match-hard} it follows then that computing $\rpoly(\vct{X})$ given multiple distinct $\prob$ values is $\#W[1]$-hard. +Then, since we have $\kElem!$ duplicates of each distinct $\kElem$-matching, and the fact that $c_{2\kElem}$ contains all monomials with degree $2\kElem$, it follows that $c_{2\kElem} = \kElem!\cdot\numocc{G}{\kmatch}$. This allows us to solve for $\numocc{G}{\kmatch}$ by simply dividing $c_{2\kElem}$ by $\kElem!$. +\end{proof} + +\qed + +\begin{Corollary}\label{cor:mult-p-hard-result} +Computing $\rpoly(\vct{X})$ given multiple distinct $\prob$ values is $\#W[1]$-hard. +\end{Corollary} +\begin{proof}[Proof of Corollary ~\ref{cor:mult-p-hard-result}] +The proof follows by ~\cref{thm:k-match-hard} and ~\cref{lem:qEk-multi-p}. \end{proof} \qed From bbc47b292345827dce317445c52f43fbc3188ccb Mon Sep 17 00:00:00 2001 From: Aaron Huber Date: Thu, 10 Dec 2020 18:13:59 -0500 Subject: [PATCH 15/17] Attempted to fix runtime analysis for outer approx alg. --- approx_alg.tex | 59 ++++++++++++++++++++++++++++++--------------- mult_distinct_p.tex | 2 +- 2 files changed, 41 insertions(+), 20 deletions(-) diff --git a/approx_alg.tex b/approx_alg.tex index 80ec762..c5555fb 100644 --- a/approx_alg.tex +++ b/approx_alg.tex @@ -1,6 +1,28 @@ %root: main.tex \section{$1 \pm \epsilon$ Approximation Algorithm} Since it is the case that computing the expected multiplicity of a compressed representation of a bag polynomial is hard, it is then desirable to have an algorithm to approximate the multiplicity in linear time, which is what we describe next. 
+
+\begin{Definition}[$\bi$~\cite{DBLP:series/synthesis/2011Suciu}]
+A Block Independent Database ($\bi$) is a PDB whose tuples are partitioned into blocks, where we denote block $i$ as $\block_i$. Each $\block_i$ is independent of all other blocks, while all tuples sharing the same $\block_i$ are mutually exclusive.
+\end{Definition}
+
+A $\ti$ is also a $\bi$ where each tuple is its own block.
+
+While the definition of polynomial $\poly(\vct{X})$ over a $\bi$ input does not change, we introduce an alternative notation which will come in handy. Given $\ell$ blocks, we write $\poly(\vct{X}) = \poly(X_{\block_1, 1},\ldots, X_{\block_1, \abs{\block_1}},$ $\ldots, X_{\block_\ell, \abs{\block_\ell}})$, where $\abs{\block_i}$ denotes the size of $\block_i$, and $\block_{i, j}$ denotes tuple $j$ residing in block $i$ for $j$ in $[\abs{\block_i}]$.
+The number of tuples in the $\bi$ instance can be computed as $\numvar = \sum\limits_{i = 1}^{\ell}\abs{\block_i}$.
+
+When considering $\bi$ input, it becomes necessary to redefine $\rpoly(\vct{X})$.
+
+\begin{Definition}[$\rpoly$ $\bi$ Redefinition]
+A polynomial $\poly(\vct{X})$ over a $\bi$ instance is reduced to $\rpoly(\vct{X})$ with the following criteria. First, all exponents $e > 1$ are reduced to $e = 1$. Second, all monomials whose variables share the same block $\block$ are dropped. Formally, this is expressed as
+\begin{equation*}
+\rpoly(\vct{X}) = \poly(\vct{X}) \mod X_i^2 - X_i \mod X_{\block_s, t}X_{\block_s, u}
+\end{equation*}
+for all $i$ in $[\numvar]$ and for all $s$ in $[\ell]$, such that for all $t, u$ in $[\abs{\block_s}]$, $t \neq u$.
+\end{Definition}
+
+We state the approximation algorithm in terms of a $\bi$. First, let us introduce some useful definitions and notation. For illustrative purposes in the definitions below, let us consider when $\poly(\vct{X}) = 2x^2 + 3xy - 2y^2$.
@@ -167,7 +189,7 @@ Recall that the notation $[x, y]$ denotes the range of values between $x$ and $y %BIDB Version of Approximation Algorithm \begin{algorithm}[H] \caption{$\approxq_{\biabb}$($\etree$, $\vct{p}$, $\conf$, $\error$, $\abs{\block}$)} - \label{alg:bi-mon-sam} + \label{alg:mon-sam} \begin{algorithmic}[1] \Require \etree: Binary Expression Tree \Require $\vct{p} = (\prob_1,\ldots, \prob_\numvar)$ $\in [0, 1]^N$ @@ -185,7 +207,7 @@ Recall that the notation $[x, y]$ denotes the range of values between $x$ and $y \While{$\vari{i} \leq \numsamp$}\Comment{Perform the required number of samples} \State $(\vari{M}, \vari{sgn}_\vari{i}) \gets $ \sampmon($\etree_\vari{mod}$)\label{alg:mon-sam-sample} \For{$\vari{x}_\vari{\block,i}$ \text{ in } $\vari{M}$} - \If{$\bivec[\block] = 1$}\label{alg:mon-sam-drop}\Comment{If we have already had a variable from this block, $\rpoly$ drops the sample.} + \If{$\bivec[\block] = 1$}\label{alg:mon-sam-check}\Comment{If we have already had a variable from this block, $\rpoly$ drops the sample.} \newline \State $\vari{sample}_{\vari{next}} \gets 1$ \State break @@ -200,7 +222,7 @@ Recall that the notation $[x, y]$ denotes the range of values between $x$ and $y % \State continue\Comment{Not sure for psuedo code the best way to state this, but this is analogous to C language continue statement.} \EndIf \EndFor - \If{$\vari{sample}_{\vari{next}} = 1$} + \If{$\vari{sample}_{\vari{next}} = 1$}\label{alg:mon-sam-drop} \State $\vari{sample}_{\vari{next}} \gets 0$\label{alg:mon-sam-resamp} \Else \State $\vari{Y}_\vari{i} \gets 1$\label{alg:mon-sam-assign1}\newline @@ -238,23 +260,22 @@ with bound $P\left(\left|\mathcal{X} - \expct\pbox{\mathcal{X}}\right|\geq \erro \end{Theorem} - - - - - - \begin{proof}[Proof of Theorem \ref{lem:mon-samp}] +First, define $\gamma$ to be the probability that a monomial with variables from the same block $\block$ is sampled. 
When such a monomial is sampled, the algorithm effectively drops the sample and samples another monomial due to the mutual exclusion property of $\bi$. This can be seen in ~\cref{alg:mon-sam-check} and ~\cref{alg:mon-sam-drop} of the code. -Consider $\expandtree{\etree}$ and let $(\monom, \coef)$ be an arbitrary tuple in $\expandtree{\etree}$. For convenience, over an alphabet $\Sigma$ of size $\numvar$, define $\evalmp: \left(\left\{\monom^a~|~\monom \in \Sigma^b, a \in \mathbb{N}, b \in [k]\right\}, [0, 1]^\numvar\right)\mapsto \mathbb{R}$, a function that takes a monomial $\monom$ in $\left\{\monom^a ~|~ \monom \in \Sigma^b, a \in \mathbb{N}, b \in [k]\right\}$ and probability vector $\vct{p}$ (introduced in ~\cref{subsec:def-data}) as input and outputs the evaluation of $\monom$ over $\vct{p}$. By ~\cref{lem:sample}, the sampling scheme samples $(\monom, \coef)$ in $\expandtree{\etree}$ with probability $\frac{|\coef|}{\abs{\etree}(1,\ldots, 1)}$. Note that $\coef \cdot \evalmp(\monom, \vct{p})$ is the value of $(\monom, \coef)$ in $\expandtree{\etree}$ when all variables in $\monom$ are assigned their corresponding probabilities. Note again that the sum of this computation over $\expandtree{\etree}$ is equivalently $\rpoly(\prob_1,\ldots, \prob_\numvar)$. +Now, consider $\expandtree{\etree}$ and let $(\monom, \coef)$ be an arbitrary tuple in $\expandtree{\etree}$. For convenience, over an alphabet $\Sigma$ of size $\numvar$, define +\begin{equation*} +\evalmp: \left(\left\{\monom^a~|~\monom \in \Sigma^b, a \in \mathbb{N}, b \in [k]\right\}, [0, 1]^\numvar\right)\mapsto \mathbb{R}, +\end{equation*} +a function that takes a monomial $\monom$ in $\left\{\monom^a ~|~ \monom \in \Sigma^b, a \in \mathbb{N}, b \in [k]\right\}$ and probability vector $\vct{p}$ (introduced in ~\cref{subsec:def-data}) as input and outputs the evaluation of $\monom$ over $\vct{p}$. 
By ~\cref{lem:sample}, the sampling scheme samples $(\monom, \coef)$ in $\expandtree{\etree}$ with probability $\frac{|\coef|}{\abs{\etree}(1,\ldots, 1)}$. Note that $\coef \cdot \evalmp(\monom, \vct{p})$ is the value of $(\monom, \coef)$ in $\expandtree{\etree}$ when all variables in $\monom$ are assigned their corresponding probabilities. Note again that the sum of this computation over $\expandtree{\etree}$ is equivalently $\rpoly(\prob_1,\ldots, \prob_\numvar)$. Consider now a set of $\samplesize$ random variables $\vct{\randvar}$, where each $\randvar_i$ is distributed as described above. Then for random variable $\randvar_i$, it is the case that - $\expct\pbox{\randvar_i} = \sum\limits_{(\monom, \coef) \in \expandtree{\etree}}\frac{\coef \cdot \evalmp(\monom, p)}{\sum\limits_{(\monom, \coef) \in \expandtree{\etree}}|\coef|} = \frac{\rpoly(\prob_1,\ldots, \prob_\numvar)}{\abs{\etree}(1,\ldots, 1)}$. Let $\empmean = \frac{1}{\samplesize}\sum_{i = 1}^{\samplesize}\randvar_i$. It is also true that + $\expct\pbox{\randvar_i} = \sum\limits_{(\monom, \coef) \in \expandtree{\etree}}\frac{\coef \cdot \evalmp(\monom, p)}{\sum\limits_{(\monom, \coef) \in \expandtree{\etree}}|\coef|}\cdot \frac{1}{1 - \gamma} = \frac{\rpoly(\prob_1,\ldots, \prob_\numvar)}{\abs{\etree}(1,\ldots, 1)} \cdot \frac{1}{1 - \gamma}$. Let $\empmean = \frac{1}{\samplesize}\sum_{i = 1}^{\samplesize}\randvar_i$. It is also true that \begin{align*} &\expct\pbox{\empmean} = \expct\pbox{ \frac{1}{\samplesize}\sum_{i = 1}^{\samplesize}\randvar_i} = \frac{1}{\samplesize}\sum_{i = 1}^{\samplesize}\expct\pbox{\randvar_i}\nonumber\\ -&= \frac{1}{\samplesize}\sum_{i = 1}^{\samplesize}\sum\limits_{(\monom, \coef) \in \expandtree{\etree}}\frac{\coef \cdot \evalmp(\monom, \vct{p})}{\sum\limits_{(\monom, \coef) \in \expandtree{\etree}}|\coef|} = \frac{\rpoly(\prob_1,\ldots, \prob_\numvar)}{\abs{\etree}(1,\ldots, 1)}. 
+&= \frac{1}{\samplesize}\sum_{i = 1}^{\samplesize}\sum\limits_{(\monom, \coef) \in \expandtree{\etree}}\frac{\coef \cdot \evalmp(\monom, \vct{p})}{\sum\limits_{(\monom, \coef) \in \expandtree{\etree}}|\coef|}\cdot \frac{1}{1 - \gamma} = \frac{\rpoly(\prob_1,\ldots, \prob_\numvar)}{\abs{\etree}(1,\ldots, 1)}\cdot \frac{1}{1 - \gamma}. \end{align*} Hoeffding' inequality can be used to compute an upper bound on the number of samples $\samplesize$ needed to establish the $(\error, \conf)$-bound. The inequality states that if we know that each $\randvar_i$ is strictly bounded by the intervals $[a_i, b_i]$, then it is true that @@ -262,7 +283,7 @@ Hoeffding' inequality can be used to compute an upper bound on the number of sam P\left(\left|\empmean - \expct\pbox{\empmean}\right| \geq \error\right) \leq 2\exp{\left(-\frac{2\samplesize^2\error^2}{\sum_{i = 1}^{\samplesize}(b_i -a_i)^2}\right)}. \end{equation*} -As implied above, Hoeffding is assuming the sum of random variables be divided by the number of variables. Since $\rpoly(\prob_1,\ldots, \prob_\numvar) = \expct\pbox{\empmean} \cdot \abs{\etree}(1,\ldots, 1)$, then our estimate is the sum of random samples multiplied by $\frac{\abs{\etree}(1,\ldots, 1)}{\samplesize}$. This computation is performed on ~\cref{alg:mon-sam-global3}. +As implied above, Hoeffding is assuming the sum of random variables be divided by the number of variables. Since $\rpoly(\prob_1,\ldots, \prob_\numvar)\cdot\frac{1}{1 - \gamma} = \expct\pbox{\empmean} \cdot \abs{\etree}(1,\ldots, 1)$, then our estimate is the sum of random samples multiplied by $\frac{\abs{\etree}(1,\ldots, 1)}{\samplesize}$. This computation is performed on ~\cref{alg:mon-sam-global3}. %Also see that to properly estimate $\rpoly$, it is necessary to multiply by the number of monomials in $\rpoly$, i.e. $\abs{\etree}(1,\ldots, 1)$. 
Therefore it is the case that $\frac{acc}{N}$ gives the estimate of one monomial, and multiplying by $\abs{\etree}(1,\ldots, 1)$ yields the estimate of $\rpoly(\prob_1,\ldots, \prob_\numvar)$. This scaling is performed in line ~\ref{alg:mon-sam-global3}. Line ~\ref{alg:mon-sam-sample} shows that $\vari{sgn}_\vari{i}$ has a value in $\{-1, 1\}$ that is multiplied with at most $\degree(\polyf(\abs{\etree}))$ factors from $\vct{p}$ (\cref{alg:mon-sam-product2}) such that each $p_i$ is in $[0, 1]$; the range for each $\randvar_i$ ($\vari{Y}_\vari{i}$ in the pseudocode) is then strictly bounded by $[-1, 1]$. Bounding Hoeffding's results by $\conf$ ensures confidence no less than $1 - \conf$. Then by upper bounding Hoeffding with $\conf$, it is the case that
@@ -285,13 +306,13 @@ By Hoeffding we obtain the number of samples necessary to achieve the claimed ad
 This concludes the proof for the first claim of theorem ~\ref{lem:mon-samp}.
 
 \paragraph{Run-time Analysis}
-For a $\bi$ instance, it is possible that cancellations can occur as seen in ~\cref{alg:mon-sam-drop}, and by ~\cref{alg:mon-sam-resamp} the algorithm will then re-sample. This affects the overall runtime. Let us denote by $\gamma$ the number of cancellations.
+%For a $\bi$ instance, it is possible that cancellations can occur as seen in ~\cref{alg:mon-sam-drop}, and by ~\cref{alg:mon-sam-resamp} the algorithm will then re-sample. This affects the overall runtime. Let us denote by $\gamma$ the number of cancellations.
 
 Note that lines ~\ref{alg:mon-sam-global1}, ~\ref{alg:mon-sam-global2}, and ~\ref{alg:mon-sam-global3} are $O(1)$ global operations. The call to $\onepass$ in line ~\ref{alg:mon-sam-onepass} by lemma ~\ref{lem:one-pass} is $O(\treesize(\etree))$ time.
 %First, algorithm ~\ref{alg:mon-sam} calls \textsc{OnePass} which takes $O(|\etree|)$ time.
 
 Then for $\numsamp = \ceil{\frac{2 \log{\frac{2}{\conf}}}{\error^2}}$, the $O(1)$ assignment, product, and addition operations occur.
Over the same $\numsamp$ iterations, $\sampmon$ is called, with a runtime of $O(\log{k}\cdot k \cdot depth(\etree))$ by lemma ~\ref{lem:sample}. Finally, over the same iterations, because $\degree(\polyf(\abs{\etree})) = k$, the assignment and product operations of line ~\ref{alg:mon-sam-product2} are called at most $k$ times. -Thus we have $O(\treesize(\etree)) + O(\left(\frac{\log{\frac{1}{\conf}}}{\error^2} + \gamma\right) \cdot \left(k + \log{k}\cdot k \cdot depth(\etree)\right) = O\left(\treesize(\etree) + \left(\left(\frac{\log{\frac{1}{\conf}}}{\error^2} + \gamma\right) \cdot \left(k \cdot\log{k} \cdot depth(\etree)\right)\right)\right)$ overall running time. +Thus we have $O(\treesize(\etree)) + O(\left(\frac{\log{\frac{1}{\conf}}}{\error^2}\right) \cdot \left(k + \log{k}\cdot k \cdot depth(\etree)\right) = O\left(\treesize(\etree) + \left(\left(\frac{\log{\frac{1}{\conf}}}{\error^2}\right) \cdot \left(k \cdot\log{k} \cdot depth(\etree)\right)\right)\right)$ overall running time. \end{proof} \qed @@ -300,15 +321,15 @@ Thus we have $O(\treesize(\etree)) + O(\left(\frac{\log{\frac{1}{\conf}}}{\error \begin{proof}[Proof of Theorem \ref{lem:approx-alg}] %\begin{Corollary}\label{cor:adj-err} -Setting $\error = \error \cdot \frac{\rpoly(\prob_1,\ldots, \prob_\numvar)}{\abs{\etree}(1,\ldots, 1)}$ achieves $1 \pm \epsilon$ multiplicative error bounds, in $O\left(\treesize(\etree) + \frac{\log{\frac{1}{\conf}}\cdot \abs{\etree}^2(1,\ldots, 1)}{\error^2\cdot\rpoly^2(\prob_1,\ldots, \prob_\numvar)}\right)$. +Setting $\error = \error \cdot \frac{\rpoly(\prob_1,\ldots, \prob_\numvar)}{\abs{\etree}(1,\ldots, 1)\cdot (1 - \gamma)}$ achieves $1 \pm \epsilon$ multiplicative error bounds, in $O\left(\treesize(\etree) + \frac{\log{\frac{1}{\conf}}\cdot \abs{\etree}^2(1,\ldots, 1)(1 - \gamma)}{\error^2\cdot\rpoly^2(\prob_1,\ldots, \prob_\numvar)}\right)$. 
%\end{Corollary} -Since it is the case that we have $\error \cdot \abs{\etree}(1,\ldots, 1)$ additive error, one can set $\error = \error \cdot \frac{\rpoly(\prob_1,\ldots, \prob_\numvar)}{\abs{\etree}(1,\ldots, 1)}$, yielding a multiplicative error proportional to $\rpoly(\prob_1,\ldots, \prob_\numvar)$. This only affects the runtime in the number of samples taken, changing the first factor of the second summand of the original runtime accordingly. +Since it is the case that we have $\error \cdot \abs{\etree}(1,\ldots, 1)$ additive error, one can set $\error = \error \cdot \frac{\rpoly(\prob_1,\ldots, \prob_\numvar)}{\abs{\etree}(1,\ldots, 1)\cdot (1 - \gamma)}$, yielding a multiplicative error proportional to $\rpoly(\prob_1,\ldots, \prob_\numvar)$. This only affects the runtime in the number of samples taken, changing the first factor of the second summand of the original runtime accordingly. The derivation over the number of samples is then \begin{align*} -&\frac{2\log{\frac{2}{\conf}}}{\error^2 \left(\frac{\rpoly(\prob_1,\ldots, \prob_N)}{\abs{\etree}(1,\ldots, 1)}\right)^2}\\ -= &\frac{2\log{\frac{2}{\conf}}\cdot \abs{\etree}^2(1,\ldots, 1)}{\error^2 \cdot \rpoly^2(\prob_1,\ldots, \prob_\numvar)}, +&\frac{2\log{\frac{2}{\conf}}}{\error^2 \left(\frac{\rpoly(\prob_1,\ldots, \prob_N)}{\abs{\etree}(1,\ldots, 1)\cdot (1 - \gamma)}\right)^2}\\ += &\frac{2\log{\frac{2}{\conf}}\cdot \abs{\etree}^2(1,\ldots, 1)\cdot (1 - \gamma)}{\error^2 \cdot \rpoly^2(\prob_1,\ldots, \prob_\numvar)}, \end{align*} and the runtime then follows, thus upholding ~\cref{lem:approx-alg}. 
\end{proof}

diff --git a/mult_distinct_p.tex b/mult_distinct_p.tex
index cd95079..cedcaae 100644
--- a/mult_distinct_p.tex
+++ b/mult_distinct_p.tex
@@ -31,7 +31,7 @@ Since each of $(i_1, j_1),\ldots, (i_\kElem, j_\kElem)$ are from $E$, it follows
 \begin{equation*}
 \rpoly_{G}^{\kElem}(\prob,\ldots, \prob) = \sum_{i = 0}^{2\kElem} c_i \prob^i
 \end{equation*}
-where $c_i$ denotes all monomials in the expansion of $\poly_{G}^{\kElem}(\vct{X})$ composed of $i$ distinct variables, with $\prob$ substituted for each distinct variable.
+where $c_i$ denotes all monomials in the expansion of $\poly_{G}^{\kElem}(\vct{X})$ composed of $i$ distinct variables, with $\prob$ substituted for each distinct variable\footnote{Since $\rpoly_G^\kElem(\vct{X})$ does not have any monomial with degree $< 2$, it is the case that $c_0 = c_1 = 0$.}.
 
 Given that we then have $2\kElem + 1$ distinct values of $\rpoly_{G}^\kElem(\prob_i,\ldots, \prob_i)$ for $0\leq i\leq2\kElem$, it follows that we then have $2\kElem + 1$ distinct rows of the form $\prob_i^0\ldots\prob_i^{2\kElem}$ which form a matrix $M$. We have then a linear system of the form $M \cdot \vct{c} = \vct{b}$ where $\vct{c}$ is the coefficient vector ($c_0,\ldots, c_{2\kElem}$), and $\vct{b}$ is the vector such that $\vct{b}[i] = \rpoly_{G}^\kElem(\prob_i,\ldots, \prob_i)$. By construction of the summation, matrix $M$ is the Vandermonde matrix, from which it follows that we have a matrix with full rank, and we can solve the linear system in $O(\kElem^3)$ time to determine $\vct{c}$ exactly.

From 8211a9bfa0fa51e7aabf1bd9779d6227a88cf0de Mon Sep 17 00:00:00 2001
From: Aaron Huber
Date: Fri, 11 Dec 2020 10:15:35 -0500
Subject: [PATCH 16/17] Incorporated \gamma into outer approx alg analysis.
--- approx_alg.tex | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/approx_alg.tex b/approx_alg.tex index c5555fb..07f1853 100644 --- a/approx_alg.tex +++ b/approx_alg.tex @@ -144,10 +144,17 @@ Using the same polynomial from the above example, $poly(\abs{\etree}) = (x + 2y) Given an expression tree $\etree$ and $\vct{v} \in \mathbb{R}^\numvar$, $\etree(\vct{v}) = poly(\etree)(\vct{v})$. \end{Definition} +\begin{Definition}[Probability $\gamma$] +Define $\gamma$ to be the probability that a monomial with variables from the same block $\block$ is sampled. +\end{Definition} + +When a monomial with cross terms from the same block $\block$ is sampled, our algorithm will drop the sample and produce a new sample. + + In the subsequent subsections we lay the groundwork to prove the following theorem. \begin{Theorem}\label{lem:approx-alg} -For any query polynomial $\poly(\vct{X})$, an approximation of $\rpoly(\prob_1,\ldots, \prob_\numvar)$ can be computed in $O\left(\treesize(\etree) + \frac{\log{\frac{1}{\conf}}\cdot \abs{\etree}^2(1,\ldots, 1)}{\error^2\cdot\rpoly^2(\prob_1,\ldots, \prob_\numvar)}\right)$, with multiplicative $(\error,\delta)$-bounds, where $k$ denotes the degree of $\poly$. +For any query polynomial $\poly(\vct{X})$, an approximation of $\rpoly(\prob_1,\ldots, \prob_\numvar)$ can be computed in $O\left(\treesize(\etree) + \frac{\log{\frac{1}{\conf}}\cdot \abs{\etree}^2(1,\ldots, 1)}{\error^2\cdot\rpoly^2(\prob_1,\ldots, \prob_\numvar)\cdot(1 - \gamma)^2}\right)$, with multiplicative $(\error,\delta)$-bounds, where $k$ denotes the degree of $\poly$. \end{Theorem} \subsection{Approximating $\rpoly$} @@ -261,7 +268,7 @@ with bound $P\left(\left|\mathcal{X} - \expct\pbox{\mathcal{X}}\right|\geq \erro \begin{proof}[Proof of Theorem \ref{lem:mon-samp}] -First, define $\gamma$ to be the probability that a monomial with variables from the same block $\block$ is sampled. 
When such a monomial is sampled, the algorithm effectively drops the sample and samples another monomial due to the mutual exclusion property of $\bi$. This can be seen in ~\cref{alg:mon-sam-check} and ~\cref{alg:mon-sam-drop} of the code. +As previously noted, by lines ~\ref{alg:mon-sam-check} and ~\ref{alg:mon-sam-drop} the algorithm will resample when it encounters a sample with variables from the same block. The probability of sampling such a monomial is $\gamma$. Now, consider $\expandtree{\etree}$ and let $(\monom, \coef)$ be an arbitrary tuple in $\expandtree{\etree}$. For convenience, over an alphabet $\Sigma$ of size $\numvar$, define \begin{equation*} @@ -271,11 +278,11 @@ a function that takes a monomial $\monom$ in $\left\{\monom^a ~|~ \monom \in \Si Consider now a set of $\samplesize$ random variables $\vct{\randvar}$, where each $\randvar_i$ is distributed as described above. Then for random variable $\randvar_i$, it is the case that - $\expct\pbox{\randvar_i} = \sum\limits_{(\monom, \coef) \in \expandtree{\etree}}\frac{\coef \cdot \evalmp(\monom, p)}{\sum\limits_{(\monom, \coef) \in \expandtree{\etree}}|\coef|}\cdot \frac{1}{1 - \gamma} = \frac{\rpoly(\prob_1,\ldots, \prob_\numvar)}{\abs{\etree}(1,\ldots, 1)} \cdot \frac{1}{1 - \gamma}$. Let $\empmean = \frac{1}{\samplesize}\sum_{i = 1}^{\samplesize}\randvar_i$. It is also true that + $\expct\pbox{\randvar_i} = \sum\limits_{\substack{(\monom, \coef) \in \expandtree{\etree} \st\\ \forall X_{b_i, j}, X_{b_k, \ell}\\ \in Vars(v),\\ \block_i \neq \block_k}}\frac{\coef \cdot \evalmp(\monom, p)}{\sum\limits_{\substack{(\monom, \coef) \in \expandtree{\etree}\st\\ \forall X_{b_i, j}, X_{b_k, \ell}\\ \in Vars(v),\\ \block_i \neq \block_k}}|\coef|} = \frac{\rpoly(\prob_1,\ldots, \prob_\numvar)}{\abs{\etree}(1,\ldots, 1)\cdot \frac{1}{1 - \gamma}} = \frac{\rpoly(\prob_1,\ldots, \prob_\numvar)\cdot (1 - \gamma)}{\abs{\etree}(1,\ldots, 1)}$. 
Let $\empmean = \frac{1}{\samplesize}\sum_{i = 1}^{\samplesize}\randvar_i$. It is also true that \begin{align*} &\expct\pbox{\empmean} = \expct\pbox{ \frac{1}{\samplesize}\sum_{i = 1}^{\samplesize}\randvar_i} = \frac{1}{\samplesize}\sum_{i = 1}^{\samplesize}\expct\pbox{\randvar_i}\nonumber\\ -&= \frac{1}{\samplesize}\sum_{i = 1}^{\samplesize}\sum\limits_{(\monom, \coef) \in \expandtree{\etree}}\frac{\coef \cdot \evalmp(\monom, \vct{p})}{\sum\limits_{(\monom, \coef) \in \expandtree{\etree}}|\coef|}\cdot \frac{1}{1 - \gamma} = \frac{\rpoly(\prob_1,\ldots, \prob_\numvar)}{\abs{\etree}(1,\ldots, 1)}\cdot \frac{1}{1 - \gamma}. +&= \frac{1}{\samplesize}\sum_{i = 1}^{\samplesize}\sum\limits_{\substack{(\monom, \coef) \in \expandtree{\etree} \st\\ \forall X_{b_i, j}, X_{b_k, \ell}\\ \in Vars(v),\\ \block_i \neq \block_k}}\frac{\coef \cdot \evalmp(\monom, \vct{p})}{\sum\limits_{\substack{(\monom, \coef) \in \expandtree{\etree} \st\\ \forall X_{b_i, j}, X_{b_k, \ell}\\ \in Vars(v),\\ \block_i \neq \block_k}}|\coef|} = \frac{\rpoly(\prob_1,\ldots, \prob_\numvar)\cdot (1 - \gamma)}{\abs{\etree}(1,\ldots, 1)}. \end{align*} Hoeffding' inequality can be used to compute an upper bound on the number of samples $\samplesize$ needed to establish the $(\error, \conf)$-bound. The inequality states that if we know that each $\randvar_i$ is strictly bounded by the intervals $[a_i, b_i]$, then it is true that @@ -283,7 +290,7 @@ Hoeffding' inequality can be used to compute an upper bound on the number of sam P\left(\left|\empmean - \expct\pbox{\empmean}\right| \geq \error\right) \leq 2\exp{\left(-\frac{2\samplesize^2\error^2}{\sum_{i = 1}^{\samplesize}(b_i -a_i)^2}\right)}. \end{equation*} -As implied above, Hoeffding is assuming the sum of random variables be divided by the number of variables. 
Since $\rpoly(\prob_1,\ldots, \prob_\numvar)\cdot\frac{1}{1 - \gamma} = \expct\pbox{\empmean} \cdot \abs{\etree}(1,\ldots, 1)$, then our estimate is the sum of random samples multiplied by $\frac{\abs{\etree}(1,\ldots, 1)}{\samplesize}$. This computation is performed on ~\cref{alg:mon-sam-global3}. +As implied above, Hoeffding is assuming the sum of random variables be divided by the number of variables. Since $\rpoly(\prob_1,\ldots, \prob_\numvar)\cdot(1 - \gamma) = \expct\pbox{\empmean} \cdot \abs{\etree}(1,\ldots, 1)$, then our estimate is the sum of random samples multiplied by $\frac{\abs{\etree}(1,\ldots, 1)}{\samplesize}$. This computation is performed on ~\cref{alg:mon-sam-global3}. %Also see that to properly estimate $\rpoly$, it is necessary to multiply by the number of monomials in $\rpoly$, i.e. $\abs{\etree}(1,\ldots, 1)$. Therefore it is the case that $\frac{acc}{N}$ gives the estimate of one monomial, and multiplying by $\abs{\etree}(1,\ldots, 1)$ yields the estimate of $\rpoly(\prob_1,\ldots, \prob_\numvar)$. This scaling is performed in line ~\ref{alg:mon-sam-global3}. Line ~\ref{alg:mon-sam-sample} shows that $\vari{sgn}_\vari{i}$ has a value in $\{-1, 1\}$ that is mulitplied with at most $\degree(\polyf(\abs{\etree}))$ factors from $\vct{p}$ (\cref{alg:mon-sam-product2}) such that each $p_i$ is in $[0, 1]$, the range for each $\randvar_i$ ($\vari{Y}_\vari{i}$ in the psuedo code) is then strictly bounded by $[-1, 1]$. Bounding Hoeffding's results by $\conf$ ensures confidence no less than $1 - \conf$. 
Then by upperbounding Hoeffding with $\conf$, it is the case that @@ -321,15 +328,15 @@ Thus we have $O(\treesize(\etree)) + O(\left(\frac{\log{\frac{1}{\conf}}}{\error \begin{proof}[Proof of Theorem \ref{lem:approx-alg}] %\begin{Corollary}\label{cor:adj-err} -Setting $\error = \error \cdot \frac{\rpoly(\prob_1,\ldots, \prob_\numvar)}{\abs{\etree}(1,\ldots, 1)\cdot (1 - \gamma)}$ achieves $1 \pm \epsilon$ multiplicative error bounds, in $O\left(\treesize(\etree) + \frac{\log{\frac{1}{\conf}}\cdot \abs{\etree}^2(1,\ldots, 1)(1 - \gamma)}{\error^2\cdot\rpoly^2(\prob_1,\ldots, \prob_\numvar)}\right)$. +Setting $\error = \error \cdot \frac{\rpoly(\prob_1,\ldots, \prob_\numvar)\cdot (1 - \gamma)}{\abs{\etree}(1,\ldots, 1)}$ achieves $1 \pm \epsilon$ multiplicative error bounds, in $O\left(\treesize(\etree) + \frac{\log{\frac{1}{\conf}}\cdot \abs{\etree}^2(1,\ldots, 1)}{\error^2\cdot\rpoly^2(\prob_1,\ldots, \prob_\numvar)(1 - \gamma)^2}\right)$. %\end{Corollary} -Since it is the case that we have $\error \cdot \abs{\etree}(1,\ldots, 1)$ additive error, one can set $\error = \error \cdot \frac{\rpoly(\prob_1,\ldots, \prob_\numvar)}{\abs{\etree}(1,\ldots, 1)\cdot (1 - \gamma)}$, yielding a multiplicative error proportional to $\rpoly(\prob_1,\ldots, \prob_\numvar)$. This only affects the runtime in the number of samples taken, changing the first factor of the second summand of the original runtime accordingly. +Since it is the case that we have $\error \cdot \abs{\etree}(1,\ldots, 1)$ additive error, one can set $\error = \error \cdot \frac{\rpoly(\prob_1,\ldots, \prob_\numvar)\cdot (1 - \gamma)}{\abs{\etree}(1,\ldots, 1)}$, yielding a multiplicative error proportional to $\rpoly(\prob_1,\ldots, \prob_\numvar)$. This only affects the runtime in the number of samples taken, changing the first factor of the second summand of the original runtime accordingly. 
The derivation over the number of samples is then \begin{align*} -&\frac{2\log{\frac{2}{\conf}}}{\error^2 \left(\frac{\rpoly(\prob_1,\ldots, \prob_N)}{\abs{\etree}(1,\ldots, 1)\cdot (1 - \gamma)}\right)^2}\\ -= &\frac{2\log{\frac{2}{\conf}}\cdot \abs{\etree}^2(1,\ldots, 1)\cdot (1 - \gamma)}{\error^2 \cdot \rpoly^2(\prob_1,\ldots, \prob_\numvar)}, +&\frac{2\log{\frac{2}{\conf}}}{\error^2 \left(\frac{\rpoly(\prob_1,\ldots, \prob_N)\cdot (1 - \gamma)}{\abs{\etree}(1,\ldots, 1)}\right)^2}\\ += &\frac{2\log{\frac{2}{\conf}}\cdot \abs{\etree}^2(1,\ldots, 1)}{\error^2 \cdot \rpoly^2(\prob_1,\ldots, \prob_\numvar)\cdot (1 - \gamma)^2}, \end{align*} and the runtime then follows, thus upholding ~\cref{lem:approx-alg}. \end{proof} From ba79c9ffd75a574a6c05335fc81d9e9969ae7308 Mon Sep 17 00:00:00 2001 From: Aaron Huber Date: Fri, 11 Dec 2020 11:48:55 -0500 Subject: [PATCH 17/17] Finished @oliver 121020 suggestions modulo last Riot p suggestion --- intro.tex | 68 ++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 50 insertions(+), 18 deletions(-) diff --git a/intro.tex b/intro.tex index ee9180c..022abb5 100644 --- a/intro.tex +++ b/intro.tex @@ -2,12 +2,14 @@ \section{Introduction} -Modern production databases, e.g., Postgres, Oracle, etc. use bag semantics. In contrast, most implementations of probabilistic databases (PDBs) are built in the setting of set semantics, where computing the probability of an output tuple is analogous to weighted model counting (a known $\sharpP$ problem). +Modern production databases like Postgres and Oracle use bag semantics. In contrast, most implementations of probabilistic databases (PDBs) are built in the setting of set semantics, where computing the probability of an output tuple is analogous to weighted model counting (a known $\sharpP$ problem). %the annotation of the tuple is a lineage formula ~\cite{DBLP:series/synthesis/2011Suciu}, which can essentially be thought of as a boolean formula. 
It is known that computing the probability of a lineage formula is $\sharpP$ hard in general
-~\cite{DBLP:series/synthesis/2011Suciu}. In PDBs, a boolean formula, also called a lineage formula ~\cite{DBLP:series/synthesis/2011Suciu}, encodes the conditions under which each output tuple appears in the result. The marginal probability of this formula being true is the tuple's probability to appear in a possible world. The set of variables are each mapped to probability values, from which the marginal probability can be computed. The corresponding problem for bag PDBs is computing the expected multiplicity of the output tuple, which requires polynomials to represent the probability distribution of the multiplicity of input tuples.
+ In PDBs, a boolean formula ~\cite{DBLP:series/synthesis/2011Suciu}, also called a lineage formula, encodes the conditions under which each output tuple appears in the result.
+%The marginal probability of this formula being true is the tuple's probability to appear in a possible world.
+The variables in a lineage formula are each drawn from a probability distribution, from which the marginal probability can be computed. The corresponding problem for bag PDBs is computing the expected multiplicity of the output tuple, where polynomials are used to represent the probability distribution of the multiplicity of input tuples.
 %In this case, the polynomial is interpreted as the probability of the input tuple contributing to the multiplicity of the output tuple. Or in other words, the expectation of the polynomial is the expected multiplicity of the output tuple.
-The standard representation for lineage formulas in PDBs is sum of products (SOP), which is much bigger than the lineage-free representation that deterministic databases employ.
-Due to linearity of expectation, computing the expectation of the polynomial in the bag setting is linear in the number of terms in the SOP formula, with the result that many regard bags to be easy.
In this work we consider compressed representations of the lineage formula. We show that the complexity landscape becomes much more nuanced, and is \textit{not} linear in general. The compressed representation of the formula is analogous to deterministic query optimizations (e.g. pushing down projections). Thus, even bag PDBs do not enjoy the same computational complexity as deterministic databases.
+The standard representation for lineage formulas in PDBs is the sum of products (SOP). The SOP is essentially the expansion of all products of sums, so that the formula is a sum of variable products. The SOP representation is much bigger than the lineage-free representation that deterministic databases employ.
+Due to linearity of expectation, computing the expectation of tuple multiplicities is linear in the number of terms in the SOP formula, so many regard bags to be easy. In this work we consider compressed representations of the lineage formula. We show that the complexity landscape becomes much more nuanced, and is \textit{not} linear in general. Such compressed representations of the formula are analogous to deterministic query optimizations (e.g. pushing down projections). In this work, we define hard to be anything greater than linear time. Thus, even bag PDBs do not enjoy the same computational complexity as deterministic databases and are hard in general. This makes it desirable to find a linear time approximation algorithm.

@@ -73,40 +75,58 @@ Due to linearity of expectation, computing the expectation of the polynomial in
 %\end{figure}

 \begin{Example}\label{ex:intro}
-Assume a set semantics setting. Suppose we are given a Tuple Independent Database ($\ti$), which is a PDB whose tuples are independent. We are given the following boolean query $\poly() :- R(A), E(A, B), R(B)$. The lineage of the output is computed by adding variables when a union operation is performed, and by multiplying variables for a join operation.
This yields the products of all input tuple lineages whose combination satisfies the join condition, summed together. A $\ti$ example instance is given in~\cref{fig:intro-ex}. While for completeness we should include random variables for Table E, since each tuple has a probability of $1$, we drop them for simplicity. The attribute column $\Phi$ contains its respective random variable, where $P[W_i = 1]$ is its marginal probability. %Finally, see that the tuples in table E can be visualized as the graph in ~\cref{fig:intro-ex-graph}.
+Assume a set semantics setting. Suppose we are given a Tuple Independent Database ($\ti$), which is a PDB whose tuples are independently present or not. We are given the following boolean query $\poly() :- R(A), E(A, B), R(B)$. The lineage of the output is computed by adding polynomials when a union operation is performed, and by multiplying polynomials for a join operation. This yields the products of all input tuple lineages whose combination satisfies the join condition, summed together. A $\ti$ example instance is given in~\cref{fig:intro-ex}. The attribute column $\Phi$ contains its respective random variable, where $P[W_i = 1]$ is its marginal probability. While for completeness we should include random variables for Table E, since each tuple has a probability of $1$, we drop them for simplicity. %Finally, see that the tuples in table E can be visualized as the graph in ~\cref{fig:intro-ex-graph}.
 
 Next we explain why this query is hard in set semantics % due to correlations in the lineage formula.
 but easy under bag semantics.% with a polynomial formula representing the multiple contributing tuples from the input set $\ti$, it is easy since we enjoy linearity of expectation.
 \end{Example}
 
-Our work also handles Block Independent Disjoint Databases ($\bi$), a PDB model in which tuples are arranged in blocks, where all blocks are independent from one another, but tuples within the same block are mutually exclusive.
For now, let us consider the $\ti$ model. In the example $Dom(W_i) = \{0, 1\}$ and we consider a fixed probability $\prob$ for all tuple variables such that $P[W_i = 1] = \prob$. Let us also be explicit in mentioning that the input tables are \textit{sets}, and the difference when we speak of bag semantics, is that we consider the output tuple to potentially have duplicates, or in other words we are thinking about query output (over set instances) in the bag context when we speak of the output formula under \textit{bag semantics}.
+Our work also handles Block Independent Disjoint Databases ($\bi$), a PDB model in which tuples are arranged in blocks, where all blocks are independent from one another, but tuples within the same block are mutually exclusive. For now, let us consider the $\ti$ model. In the example we consider a fixed probability $\prob$ for all tuple variables such that $P[W_i = 1] = \prob$. Let us also be explicit in mentioning that the input tables are \textit{sets}, i.e. $Dom(W_i) = \{0, 1\}$, and the difference when we speak of bag semantics is that we consider the query output to potentially have duplicates, or in other words we are thinking about query output (over set instances) in the bag context.
+
+To contrast the bag/polynomial and set/lineage interpretations, we provide another example.
+\begin{Example}\label{ex:bag-vs-set}
+The output of the query in ~\cref{ex:intro} has the following lineage formula (top) and polynomial (bottom).
+\begin{align*}
+&\poly(W_a, W_b, W_c) = W_aW_b \vee W_bW_c \vee W_cW_a\\
+&\poly(W_a, W_b, W_c) = W_aW_b + W_bW_c + W_cW_a
+\end{align*}
+
+Notice that in the set/lineage setting above, $\poly: (\mathbb{B})^3 \mapsto \mathbb{B}$, while under bag/polynomial semantics we define $\poly: (\mathbb{N})^3 \mapsto \mathbb{N}$.
+
+Assume the following $\mathbb{B}/\mathbb{N}$ variable assignments: $W_a\mapsto T/1, W_b \mapsto T/1, W_c \mapsto F/0.$ Then the polynomials evaluate as
+\begin{align*}
+&\poly(T, T, F) = TT \vee TF \vee FT = T\\
+&\poly(1, 1, 0) = 1 \cdot 1 + 1\cdot 0 + 0 \cdot 1 = 1
+\end{align*}
+In the set/lineage setting, we find that the boolean query is satisfied, while in the bags evaluation we see how many combinations of the input satisfy the query.
+\end{Example}
 
 Note that computing the probability of the query of ~\cref{ex:intro} in set semantics is indeed $\sharpP$ hard, since it is a query that is non-hierarchical
 %, i.e., for $Vars(\poly)$ denoting the set of variables occurring across all atoms of $\poly$, a function $sg(x)$ whose output is the set of all atoms that contain variable $x$, we have that $sg(A) \cap sg(B) \neq \emptyset$ and $sg(A)\not\subseteq sg(B)$ and $sg(B)\not\subseteq sg(A)$,
-as defined by Dalvi and Suciu in ~\cite{10.1145/1265530.1265571}. For the purposes of this work, we define hard to be anything greater than linear time. %Thus, computing $\expct\pbox{\poly(W_a, W_b, W_c)}$, i.e. the probability of the output with annotation $\poly(W_a, W_b, W_c)$, ($\prob(q)$ in Dalvi, Sucui) is hard in set semantics.
+~\cite{10.1145/1265530.1265571}. %Thus, computing $\expct\pbox{\poly(W_a, W_b, W_c)}$, i.e. the probability of the output with annotation $\poly(W_a, W_b, W_c)$, ($\prob(q)$ in Dalvi, Suciu) is hard in set semantics.
+To see why this computation is hard for query $\poly$ over set semantics, from the query input we compute an output lineage formula of $\poly(W_a, W_b, W_c) = W_aW_b \vee W_bW_c \vee W_cW_a$. Note that the conjunctive clauses are not independent of one another and the computation of the probability is not linear in the size of $\poly(W_a, W_b, W_c)$:
 \begin{equation*}
 \expct\pbox{\poly(W_a, W_b, W_c)} = \expct\pbox{W_aW_b} + \expct\pbox{W_a\overline{W_b}W_c} + \expct\pbox{\overline{W_a}W_bW_c} = 3\prob^2 - 2\prob^3.
 \end{equation*}
-In general, such a computation can be exponential.
+In general, such a computation can be exponential in the size of the database.
 %Using Shannon's Expansion,
 %\begin{align*}
 %&W_aW_b \vee W_bW_c \vee W_cW_a
 %= &W_a
 %\end{align*}
-However, in the bag setting, the polynomial is $\poly(W_a, W_b, W_c) = W_aW_b + W_bW_c + W_cW_a$. To reiterate, the output lineage formula is produced from a query over a set $\ti$ input, where duplicates are allowed in the output. The expectation computation over the output lineage is a computation of the 'average' multiplicity of an output tuple across possible worlds. In ~\cref{ex:intro}, the expectation is simply
+However, in the bag setting, the polynomial is $\poly(W_a, W_b, W_c) = W_aW_b + W_bW_c + W_cW_a$. To reiterate, the output lineage formula is produced from a query over a set $\ti$ input, where duplicates are allowed in the output. The expectation computation over the output lineage is a computation of the expected multiplicity of an output tuple across possible worlds. In ~\cref{ex:intro}, the expectation is simply
 \begin{align*}
 &\expct\pbox{\poly(W_a, W_b, W_c)} = \expct\pbox{W_aW_b} + \expct\pbox{W_bW_c} + \expct\pbox{W_cW_a}\\
 = &\expct\pbox{W_a}\expct\pbox{W_b} + \expct\pbox{W_b}\expct\pbox{W_c} + \expct\pbox{W_c}\expct\pbox{W_a}\\
-= &\prob^2 + \prob^2 + \prob^2 = 3\prob^2,
\end{align*} -which is indeed linear in the size of the lineage as the number of operations in the computation is \textit{exactly} the number of multiplication and addition operations of the polynomial. The above equalities hold, since expectation is linear over addition of the natural numbers. We were also able to push expectation into the product due to the $\ti$ independence property, where all variables are independent. Note that the answer is the same as $\poly(\prob, \prob, \prob)$, where substituting $\prob$ in for each variable yields $\prob \cdot \prob + \prob \cdot \prob + \prob \cdot \prob = 3\prob^2$. This however is coincidental and not true for the general case. +Computing such expectations is indeed linear in the size of the SOP as the number of operations in the computation is \textit{exactly} the number of multiplication and addition operations of the polynomial. The above equalities hold, since expectation is linear over addition of the natural numbers. We were also able to push expectation into the product due to the $\ti$ independence property, where all variables are independent. Note that the answer is the same as substituting $\prob$ in for each variable. For example, $\poly(\prob, \prob, \prob)$ $=$ $\prob \cdot \prob + \prob \cdot \prob + \prob \cdot \prob = 3\prob^2$. This however is coincidental and not true for the general case. Now, consider the query \begin{equation*} \poly^2() := \rel(A), E(A, B), \rel(B), \rel(C), E(C, D), \rel(D), \end{equation*} -For an arbitrary lineage formula, which we can view as a polynomial, it is known that there may exist equivalent compressed representations of the polynomial. One such compression is known as the factorized polynomial ~\cite{10.1145/3003665.3003667}, where the polynomial can be broken up into separate factors. 
Another equivalent form of the polynomial is the sum of products (SOP), which is the expansion of the factorized polynomial by multiplying out all terms, and in general is exponentially larger (in the number of products) than the factorized version. +For an arbitrary lineage formula, which we can view as a polynomial, it is known that there may exist equivalent compressed representations of the polynomial. One such compression is the factorized polynomial ~\cite{10.1145/3003665.3003667}, where the polynomial can be broken up into separate factors. %Another form of the polynomial is the SOP, which is the expansion of the factorized polynomial by multiplying out all terms, and in general is exponentially larger (in the number of products) than the factorized version. A factorized polynomial of $\poly^2$ is @@ -153,7 +173,7 @@ This factorized expression can be easily modeled as an expression tree as depict \label{fig:intro-q2-etree} \end{figure} - In contrast, the SOP equivalent representation is + In contrast, the equivalent SOP representation is \begin{equation*} W_a^2W_b^2 + W_b^2W_c^2 + W_c^2W_a^2 + 2W_a^2W_bW_c + 2W_aW_b^2W_c + 2W_aW_bW_c^2. \end{equation*} @@ -172,17 +192,29 @@ The expectation then is In this case, even though we substitute probability values in for each variable, $\poly^2(\prob, \prob, \prob)$ is not the answer we seek since for a random variable $X$, $\expct\pbox{X^2} = \sum_{x \in Dom(X)}x^2 \cdot p(x)$. Intuitively, bags are only hard with self-joins.\AH{Atri suggests a proof in the appendix regarding the last claim.} -Define $\rpoly^2(\vct{X})$ to be the resulting polynomial when all exponents $e > 1$ are set to $1$ in $\poly^2$. Note that this structure $\rpoly^2(\prob, \prob, \prob)$ is the expectation we computed, since it is always the case that $i^2 = i$ for all $i$ in $\{0, 1\}$. And, $\poly^2()$ is still computable in linear time in the size of the output polynomial, compressed or SOP. 
+
+Define $\rpoly^2(\vct{X})$ to be the resulting polynomial when all exponents $e > 1$ are set to $1$ in $\poly^2$. For example, when we have
+
+\begin{align*}
+&\poly^2(W_a, W_b, W_c) = W_a^2W_b^2 + W_b^2W_c^2 + W_c^2W_a^2 + 2W_a^2W_bW_c + 2W_aW_b^2W_c\\
+&+ 2W_aW_bW_c^2,
+\end{align*}
+then
+\begin{align*}
+&\rpoly^2(W_a, W_b, W_c) = W_aW_b + W_bW_c + W_cW_a + 2W_aW_bW_c + 2W_aW_bW_c\\
+&+ 2W_aW_bW_c\\
+&= W_aW_b + W_bW_c + W_cW_a + 6W_aW_bW_c
+\end{align*}
+Note that this structure $\rpoly^2(\prob, \prob, \prob)$ is the expectation we computed, since it is always the case that $i^2 = i$ for all $i$ in $\{0, 1\}$. And, $\poly^2()$ is still computable in linear time in the size of the output polynomial, compressed or SOP.
 
 A compressed polynomial can be exponentially smaller in $k$ for $k$-products. It is also the case that computing the expectation of an output polynomial in SOP is always linear in the size of the polynomial, since expectation can be pushed through addition.
 
-This works seeks to explore the complexity landscape for compressed representations of polynomials. We use the term 'easy' to mean linear time, and the term 'hard' to mean superlinear time or greater. Note that when we are linear in the size of the lineage formula, we essentially have runtime that is of deterministic query complexity.
+This work seeks to explore the complexity landscape for compressed representations of polynomials. Note that when we are linear in the size of the lineage formula, we essentially have runtime that is of deterministic query complexity.
 
 Up to this point the message seems consistent that bags are always easy in the size of the SOP representation, but
 \begin{Question}
-Is it always the case that bags are easy in the size of the compressed polynomial?
+Is it always the case that bags are easy in the size of the \emph{compressed} polynomial?
\end{Question}
 
-If bags \textit{are} always easy for any compressed version of the polynomial, then there is no need for improvement. But, if proveably not, then the option to approximate the computation over a compressed polynomial in linear time is desirable.
+If bags \textit{are} always easy for any compressed version of the polynomial, then there is no need for improvement. But, if provably not, then the option to approximate the computation over a compressed polynomial in linear time is critical for making PDBs practical.
 
 Consider the query
 \begin{equation*}