paper-BagRelationalPDBsAreHard/poly-form.tex

%root: main.tex
%!TEX root = ./main.tex

\section{Polynomial Formulation}

Further, define $\rpoly(X_1,\ldots, X_\numTup)$ as the reduced version of $\poly(X_1,\ldots, X_\numTup)$, of the form
\[\rpoly(X_1,\ldots, X_\numTup) = \poly(X_1,\ldots, X_\numTup) \mod \wbit_1^2-\wbit_1\cdots\mod \wbit_\numTup^2 - \wbit_\numTup.\]

\OK{Shouldn't it be $\wbit_1^2 - \wbit_1$ (missing a subscript)?
This definition of $\rpoly$ may be inappropriately concise, as it doesn't (I think?) get across the
``expanded SoP form" constraint.
Also, one way to establish a preliminary intuition for $\rpoly$ might be to associate it with idempotent multiplication operations --- it's the simplest sum-of-products representation of $\poly$ that is equivalent under an idempotent $\otimes$.
}

Intuitively, $\rpoly(\textbf{X})$ is the expanded sum of products form of $\poly(\textbf{X})$ such that if any $X_j$ term  has an exponent $e > 1$, it is reduced to $1$, i.e. $X_j^e\mapsto X_j$ for any $e > 1$.  The usefulness of this reduction will be seen shortly.

\begin{Lemma}\label{lem:pre-poly-rpoly}
When $\poly(X_1,\ldots, X_\numTup) = \sum\limits_{\vct{d} \in \{0,\ldots, D\}^\numTup}q_{\vct{d}} \cdot \prod\limits_{\substack{i = 1\\s.t. d_i\geq 1}}^{\numTup}X_i^{d_i}$, we then have that $\rpoly(X_1,\ldots, X_\numTup) = \sum\limits_{\vct{d} \in \{0,\ldots, D\}^\numTup} q_{\vct{d}}\cdot\prod\limits_{\substack{i = 1\\s.t. d_i\geq 1}}^{\numTup}X_i$.
\end{Lemma}

First, note the following fact:
\begin{Proposition}
\[\text{For all } (\wbit_1,\ldots, \wbit_\numTup) \in \{0, 1\}^\numTup, \poly(\wbit_1,\ldots, \wbit_\numTup) = \rpoly(\wbit_1,\ldots, \wbit_\numTup).\]
\end{Proposition}

\begin{proof}
For all $b \in \{0, 1\}$ and all $e \geq 1$, $b^e = b$.\qed
\end{proof}

\OK{Might help to emphasize the Sum of Products constraint.}


\begin{Lemma}\label{lem:exp-poly-rpoly}
The expectation of $\poly$ over the possible worlds is equal to $\rpoly(\prob_1,\ldots, \prob_\numTup)$:
\begin{equation*}
\expct_{\wVec}\pbox{\poly(\wVec)}  = \rpoly(\prob_1,\ldots, \prob_\numTup).
\end{equation*}
\end{Lemma}

\begin{proof}
%Using the fact above, we need to compute \[\sum_{(\wbit_1,\ldots, \wbit_\numTup) \in \{0, 1\}}\rpoly(\wbit_1,\ldots, \wbit_\numTup)\].  We therefore argue that
%\[\sum_{(\wbit_1,\ldots, \wbit_\numTup) \in \{0, 1\}}\rpoly(\wbit_1,\ldots, \wbit_\numTup) = 2^\numTup \cdot \rpoly(\frac{1}{2},\ldots, \frac{1}{2}).\]

Let $\poly$ be the generalized polynomial, i.e., the polynomial of $\numTup$ variables with highest degree $= D$: %, in which every possible monomial permutation appears,
\[\poly(X_1,\ldots, X_\numTup) = \sum_{\vct{d} \in \{0,\ldots, D\}^\numTup}q_{\vct{d}}\cdot \prod_{\substack{i = 1\\s.t. d_i \geq 1}}^\numTup X_i^{d_i}.\]


Then for expectation we have
\begin{align}
\expct_{\wVec}\pbox{\poly(\wVec)} &= \sum_{\vct{d} \in \{0,\ldots, D\}^\numTup}q_{\vct{d}}\cdot \expct_{\wVec}\pbox{\prod_{\substack{i = 1\\s.t. d_i \geq 1}}^\numTup w_i^{d_i}}\label{p1-s1}\\
&= \sum_{\vct{d} \in \{0,\ldots, D\}^\numTup}q_{\vct{d}}\cdot \prod_{\substack{i = 1\\s.t. d_i \geq 1}}^\numTup \expct_{\wVec}\pbox{w_i^{d_i}}\label{p1-s2}\\
&= \sum_{\vct{d} \in \{0,\ldots, D\}^\numTup}q_{\vct{d}}\cdot \prod_{\substack{i = 1\\s.t. d_i \geq 1}}^\numTup \expct_{\wVec}\pbox{w_i}\label{p1-s3}\\
&= \sum_{\vct{d} \in \{0,\ldots, D\}^\numTup}q_{\vct{d}}\cdot \prod_{\substack{i = 1\\s.t. d_i \geq 1}}^\numTup \prob_i\label{p1-s4}\\
&= \rpoly(\prob_1,\ldots, \prob_\numTup)\label{p1-s5}
\end{align}

In step \cref{p1-s1}, linearity of expectation pushes the expectation inside of the sum, and in \cref{p1-s2}, independence of the variables $w_i$ lets the expectation be pushed all the way inside of the product.  In \cref{p1-s3}, note that $w_i \in \{0, 1\}$ which further implies that for any exponent $e \geq 1$, $w_i^e = w_i$.  Next, by definition of TIDB, in \cref{p1-s4} the expectation of a tuple across all possible worlds is indeed its probability.

\OK{
	You don't need to tie this to TI-DBs if you define the variables ($X_i$) to be independent.
	Annotations
	Boolean expressions over uncorrelated boolean variables are sufficient to model TI-, BI-, and
	PC-Tables.  This should still hold for arithmetic over the naturals.
}


Finally, \cref{p1-s5} follows by construction in \cref{lem:pre-poly-rpoly}: $\rpoly(\prob_1,\ldots, \prob_\numTup)$ is exactly the sum, over all monomials, of the monomial's coefficient times the product of the probabilities of the variables appearing in that monomial.

\qed
\end{proof}

\begin{Corollary}
If $\poly$ is given to us in a sum of monomials form, the expectation of $\poly$ ($\ex{\poly}$) can be computed in $O(|\poly|)$, where $|\poly|$ denotes the total number of multiplication/addition operators.
\end{Corollary}

\begin{proof}
Note that \cref{lem:exp-poly-rpoly} shows that $\ex{\poly} = \rpoly(\prob_1,\ldots, \prob_\numTup)$.  Therefore, if $\poly$ is already in sum of products form, one only needs to compute $\poly(\prob_1,\ldots, \prob_\numTup)$ ignoring exponent terms (note that such a polynomial is $\rpoly(\prob_1,\ldots, \prob_\numTup)$), which indeed takes $O(|\poly|)$ computations.\qed
\end{proof}

\subsection{When $\poly$ is not in sum of monomials form}


We would like to argue that in the general case there is no computation of expectation in linear time.

To this end, consider the following graph $G(V, E)$, where $|E| = m$, $|V| = \numTup$, and $i, j \in [\numTup]$.  Consider the query $q_E(X_1,\ldots, X_\numTup) = \sum\limits_{(i, j) \in E} X_i \cdot X_j$.
\AR{The two lemmas need to be re-written once notation for representing a query is finalized in Section 1.}
\AH{\^-----This is an issue that we are currently discussing.}
\begin{Lemma}\label{lem:const-p}
If we can compute $\poly(\wElem_1,\ldots, \wElem_\numTup) = q_E(\wElem_1,\ldots, \wElem_\numTup)^3$ in T(m) time for $\wElem_1 = \ldots = \wElem_\numTup = \prob$, then we can count the number of 3-matchings in $G$ in $T(m) + O(m)$ time.
\end{Lemma}

\begin{Lemma}\label{lem:gen-p}
If we can compute $\poly(\wElem_1,\ldots, \wElem_\numTup) = q_E(\wElem_1,\ldots, \wElem_\numTup)^3$ in T(m) time for O(1) distinct values of $\prob$ then we can count the number of triangles (and the number of 3-paths, the number of 3-matchings) in $G$ in O(T(m) + m) time.
\end{Lemma}

\AH{The warm-up below is fine for now, but will need to be removed for the final draft}
First, let us do a warm-up by computing $\rpoly(\wElem_1,\dots, \wElem_\numTup)$ when $\poly = q_E(\wElem_1,\ldots, \wElem_\numTup)^2$.  Before doing so, we introduce a notation.  Let $\numocc{H}$ denote the number of occurrences of $H$ in $G$.  So, e.g., $\numocc{\ed}$ is the number of edges ($m$) in $G$.

\AH{We need to make a decision on subgraph notation, and number of occurrences notation.  Waiting to hear back from Oliver before making a decision.}

\OK{
	I'm not sure what I can add.  The existing notation is fine (for now).  I would suggest adding
	a definition table.
}

\begin{Claim}
We can compute $\rpoly_2$ in O(m) time.
\end{Claim}
	\begin{proof}
		The proof basically follows by definition.  When we expand $\poly$, and make all exponents $e = 1$, substituting $\prob$ for all $\wElem_i$ we get $\rpoly_2(\prob,\ldots, \prob) = \numocc{\ed} \cdot \prob^2 + 2\cdot \numocc{\twopath}\cdot \prob^3 + 2\cdot \numocc{\twodis}\cdot \prob^4$.
		\begin{enumerate}
			\item First note that
				\begin{align*}
					\poly_2(\wVec) &= \sum_{(i, j) \in E} (\wElem_i\wElem_j)^2 + \sum_{\substack{(i, j), (k, \ell) \in E\\s.t. (i, j) \neq (k, \ell)}} \wElem_i\wElem_j\wElem_k\wElem_\ell\\
					&= \sum_{(i, j) \in E} (\wElem_i\wElem_j)^2 + \sum_{\substack{(i, j), (j, \ell) \in E\\s.t. i \neq \ell}}\wElem_i
					\wElem_j^2\wElem_\ell + \sum_{\substack{(i, j), (k, \ell) \in E\\s.t. i \neq j \neq k \neq \ell}} \wElem_i\wElem_j\wElem_k\wElem_\ell
				\end{align*}
				By definition of $\rpoly$,
				\begin{equation}
					\rpoly_2(\wVec) = \sum_{(i, j) \in E} \wElem_i\wElem_j + \sum_{\substack{(i, j), (j, \ell) \in E\\s.t. i \neq \ell}}\wElem_i\wElem_j\wElem_\ell + \sum_{\substack{(i, j), (k, \ell) \in E\\s.t. i \neq j \neq k \neq \ell}} \wElem_i\wElem_j\wElem_k\wElem_\ell\label{eq:part-1}
				\end{equation}
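				Notice that, after substituting $\prob$ for every $\wElem_i$, the first term is $\numocc{\ed}\cdot \prob^2$, the second $2\cdot\numocc{\twopath}\cdot \prob^3$, and the third $2\cdot\numocc{\twodis}\cdot \prob^4$, since each unordered pair of distinct edges appears twice (once per ordering) in the sums.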
			\item Note that
\AH{We need the correct formula for two-matchings below.}
				\begin{align*}
					&\numocc{\ed} = m,\\
					&\numocc{\twopath} = \sum_{u \in V} \binom{d_u}{2} \text{ where $d_u$ is the degree of vertex $u$},\\
					&\numocc{\twodis} = \binom{m}{2} - \sum_{u \in V} \binom{d_u}{2}.
				\end{align*}
		\end{enumerate}
		Thus, since each of the summations can be computed in O(m) time, this implies that by \cref{eq:part-1} $\rpoly(\prob,\ldots, \prob)$ can be computed in O(m) time.\qed
	\end{proof}
\AH{END of the 'warm-up'}
We are now ready to state the claim we need to prove \cref{lem:const-p} and \cref{lem:gen-p}.

Let $\poly(\wVec) = q_E(\wVec)^3$.
\begin{Claim}\label{claim:four-two}
 If one can compute $\rpoly_3(\prob,\ldots, \prob)$ in time T(m), then we can compute the following in O(T(m) + m):
\[\numocc{\tri} + \numocc{\threepath} \cdot \prob - \numocc{\threedis}\cdot(\prob^2 - \prob^3).\]
\end{Claim}

\begin{proof}
\AH{Use this equation as its own lemma, to be used in lemmas 3 and 4.}
When we expand $\poly$ out and assign all exponents $e \geq 1$ a value of $1$, we have the following,
	\begin{align}
		&\rpoly(\prob,\ldots, \prob) = \numocc{\ed}\prob^2 + 6\numocc{\twopath}\prob^3 + 6\numocc{\twodis}\prob^4 + 6\numocc{\tri}\prob^3 +\nonumber\\
		&\qquad\qquad6\numocc{\oneint}\prob^4 + 6\numocc{\threepath}\prob^4 + 6\numocc{\twopathdis}\prob^5 + 6\numocc{\threedis}\prob^6.\label{claim:four-one}
	\end{align}

We have shown and will show that the following subgraph cardinalities can be computed in $O(m)$ time:
\[\numocc{\ed}, \numocc{\twopath}, \numocc{\twodis}, \numocc{\oneint}, \numocc{\twopathdis} + \numocc{\threedis}.\]

		By definition we have that
		\[\poly_3(\wElem_1,\ldots, \wElem_\numTup) = \sum_{\substack{(i_1, j_1),\\ (i_2, j_2),\\ (i_3, j_3) \in E}} \prod_{\ell = 1}^{3}\wElem_{i_\ell}\wElem_{j_\ell}.\]
		Rather than list all the expressions in full detail, let us make some observations regarding the sum.  Let $e_1 = (i_1, j_1), e_2 = (i_2, j_2), e_3 = (i_3, j_3)$.  Notice that each expression in the sum consists of a triple $(e_1, e_2, e_3)$.  There are three forms the triple $(e_1, e_2, e_3)$ can take.

\underline{case 1:} $e_1 = e_2 = e_3$, where all edges are the same.  There are exactly $m$ such triples, each with a $\prob^2$ factor.

\underline{case 2:}  This case occurs when there are two distinct edges of the three.  All 6 combinations of two distinct values consist of the same monomial in $\rpoly_3$, i.e. $(e_1, e_1, e_2)$ is the same as $(e_2, e_1, e_2)$.  This case produces the following edge patterns: $\twodis, \twopath$.

\underline{case 3:} $e_1 \neq e_2 \neq e_3$, i.e., when all edges are distinct.  This case consists of the following edge patterns: $\threedis, \twopathdis, \threepath, \oneint, \tri$.

It has already been shown previously that $\numocc{\ed}, \numocc{\twopath}, \numocc{\twodis}$ can be computed in O(m) time.  Here are the arguments for the rest.
\[\numocc{\oneint} = \sum_{u \in V} \binom{d_u}{3}\]
$\numocc{\twopathdis} + \numocc{\threedis} = $ the number of occurrences of three distinct edges with five or six vertices.  This can be counted in the following manner.  For every edge $(u, v) \in E$, throw away all neighbors of $u$ and $v$ and pick two more distinct edges.
\[\numocc{\twopathdis} + \numocc{\threedis} = \sum_{(u, v) \in E} \binom{m - d_u - d_v - 1}{2}\]  The implication in \cref{claim:four-two} follows by the above and \cref{claim:four-one}.\qed
	\end{proof}

\begin{proof}[Proof of \cref{lem:gen-p}]

%\AR{Also you can modify the text of \textsc{Proof} by using the following latex command \texttt{\\begin\{proof\}[Proof of Lemma 2]} and Latex will typeset this as \textsc{Proof of Lemma 2}, which is what you really want.}

\cref{claim:four-two} says that if we know $\rpoly_3(\prob,\ldots, \prob)$, then we can know in O(m) additional time
\[\numocc{\tri} + \numocc{\threepath} \cdot \prob - \numocc{\threedis}\cdot(\prob^2 - \prob^3).\]  We can think of each term in the above equation as a variable, where one can solve a linear system given 3 distinct $\prob$ values, assuming independence of the three linear equations.  In the worst case, without independence, 4 distinct values of $\prob$ would suffice...because Atri said so, and I need to ask him for understanding why this is the case, of which I suspect that it has to do with basic result(s) in linear algebra.\AR{Follows from the fact that the corresponding coefficient matrix is the so-called Vandermonde matrix, which has full rank.}\qed
\end{proof}
\AH{Below is only a transcription of the notes.  The claims need to be verified and further worked out.}

\begin{proof}[Proof of \cref{lem:const-p}]

The argument for \cref{lem:gen-p} cannot be applied to \cref{lem:const-p} since we have that $\prob$ is fixed.  We have hope in the following:  we assume that we can solve this problem for all graphs, and the hope would be to solve the problem for say $G_1, G_2, G_3$, where $G_1$ is arbitrary, and relate the values of $\numocc{H}$, where $H$ is a placeholder for the relevant edge combination.  The hope is that these relations would result in three independent linear equations, and then we would be done.

The following is an option.
\begin{enumerate}
	\item Let $G_1$ be an arbitrary graph
	\item Build $G_2$ from $G_1$, where each edge in $G_1$ gets replaced by a 2 path.
\end{enumerate}

Then $\numocc{\tri}_2 = 0$, and if we can prove that\AR{Again you are not transcribing the handwritten notes. If the notes has a claim without proof, then you need to finish off the proof. Of course am happy to help if you get stuck but one of the primary goals of you latexing up the handwritten notes is for you to verify what is in the notes is correct and that cannot happen unless you write down complete proofs for all claims and convince yourself that the claims are correct-- e.g. they {\em could} be wrong and the hope is that your pass will catch bugs.}
\begin{itemize}
	\item $\numocc{\threepath}_2 = 2 \cdot \numocc{\twopath}_1$
	\item $\numocc{\threedis}_2 = 8 \cdot \numocc{\threedis}_1$
\end{itemize}
we solve our problem for $q_E^3$ based on $G_2$ and we can compute $\numocc{\threedis}$, a hard problem.
\end{proof}
{\bf TESTING}
$\vec{w}\sim\mathcal{D}$