Done with S2 pass

2021-09-20 18:04:04 -04:00 · 2021-09-20 18:04:04 -04:00 · e0015f15b4
parent bcd5cfb818
commit e0015f15b4
4 changed files with 10 additions and 9 deletions
--- a/circuits-model-runtime.tex
+++ b/circuits-model-runtime.tex
@ -29,8 +29,8 @@ To decouple our results from specific join algorithms, we first abstract the cos

 \begin{Definition}[Join Cost]
 \label{def:join-cost}
-Denote by $\jointime{R_1, \ldots, R_n}$ the runtime of an algorithm for computing the n-ary join $R_1 \bowtie \ldots \bowtie R_n$.
-We require only that the algorithm must enumerate its output, i.e., that $\jointime{R_1, \ldots, R_n} \geq |R_1 \bowtie \ldots \bowtie R_n|$.
+Denote by $\jointime{R_1, \ldots, R_m}$ the runtime of an algorithm for computing the $m$-ary join $R_1 \bowtie \ldots \bowtie R_m$.
+We require only that the algorithm must enumerate its output, i.e., that $\jointime{R_1, \ldots, R_m} \geq |R_1 \bowtie \ldots \bowtie R_m|$.
 \end{Definition}

 Worst-case optimal join algorithms~\cite{skew,ngo-survey} and query evaluation via factorized databases~\cite{factorized-db} (as well as work on FAQs~\cite{DBLP:conf/pods/KhamisNR16}) can be modeled as $\raPlus$ queries (though the query size is data dependent).
@ -63,7 +63,7 @@ We assume that full table scans are used for every base relation access. We can
 %Observe that 
 % () .\footnote{This claim can be verified by e.g. simply looking at the {\em Generic-Join} algorithm in~\cite{skew} and {\em factorize} algorithm in~\cite{factorized-db}.} It can be verified that the above cost model on the corresponding $\raPlus$ join queries correctly captures the runtime of current best known .

-More specifically \Cref{lem:circ-model-runtime} and \Cref{lem:tlc-is-the-same-as-det} show that for any $\raPlus$ query $\query$ and $\dbbase$, there exists a circuit $\circuit^*$ such that $\timeOf{\abbrStepOne}(Q,\dbbase,\circuit^*)$ and $|\circuit^*|$ are both $O(\qruntime{Q, \dbbase})$. Recall we assumed these two bounds when we moved from \Cref{prob:big-o-joint-steps} to \Cref{prob:intro-stmt}.
+Finally, \Cref{lem:circ-model-runtime} and \Cref{lem:tlc-is-the-same-as-det} show that for any $\raPlus$ query $\query$ and $\dbbase$, there exists a circuit $\circuit^*$ such that $\timeOf{\abbrStepOne}(Q,\dbbase,\circuit^*)$ and $|\circuit^*|$ are both $O(\qruntime{Q, \dbbase})$. Recall we assumed these two bounds when we moved from \Cref{prob:big-o-joint-steps} to \Cref{prob:intro-stmt}.
 %
 %We now make a simple observation on the above cost model:
 %\begin{proposition}
--- a/poly-form.tex
+++ b/poly-form.tex
@ -9,7 +9,7 @@ Note that a polynomial over $\vct{X}=(X_1,\dots,X_n)$ with individual degree $B
 is formally defined as (where $c_{\vct{d}}\in \semN$): 
 \begin{equation}
  \label{eq:sop-form}
-\poly\inparen{X_1,\dots,X_n}=\sum_{\vct{d}\in\{0,\ldots,B\}^n} c_{\vct{d}}\cdot \prod_{i=1}^n X_i^{d_i},
+\poly\inparen{X_1,\dots,X_n}=\sum_{\vct{d}\in\{0,\ldots,B\}^n} c_{\vct{d}}\cdot \prod_{i=1}^n X_i^{d_i}.
 \end{equation}
 %where $c_{\vct{d}}\in \semN$.

@ -63,7 +63,7 @@ Let $\pdb$ be a \abbrBIDB over $\numvar$ input tuples such that the probability
 \end{equation*}
 \end{Lemma}
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-Let $\abs{\poly}$ be proportional to the number of operators in $\phi$.
+Let $\abs{\poly}$ be the number of operators in $\phi$.

 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \begin{Corollary}\label{cor:expct-sop}
--- a/prob-def.tex
+++ b/prob-def.tex
@ -11,7 +11,7 @@ We represent lineage polynomials via {\em arithmetic circuits}~\cite{arith-compl

 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \begin{Definition}[Circuit]\label{def:circuit}
-A circuit $\circuit$ is a Directed Acyclic Graph (DAG) whose source gates (in degree of $0$) consist of elements in either $\domN$ or $\vct{X}$.  For each output tuple there exists one sink gate.  The internal gates have binary input and are either sum ($\circplus$) or product ($\circmult$) gates.
+A circuit $\circuit$ is a Directed Acyclic Graph (DAG) whose source gates (in-degree of $0$) consist of elements in either $\domN$ or $\vct{X}$.  For each result tuple there exists one sink gate.  The internal gates have binary input and are either sum ($\circplus$) or product ($\circmult$) gates.
 %
 Each gate has the following members: \type, \vpartial, \vari{input}, \degval, \vari{Lweight}, and \vari{Rweight}, where \type is the value type $\{\circplus, \circmult, \var, \tnum\}$ and \vari{input} the list of inputs. Source gates have an extra member \val storing the value.  $\circuit_\linput$ ($\circuit_\rinput$) denotes the left (right) input of \circuit.
 \end{Definition}
@ -87,7 +87,7 @@ The circuit of \Cref{fig:circuit} is an element of $\circuitset{2X^2+3XY-2Y^2}$.
 \noindent We are now ready to formally state the final version of \Cref{prob:intro-stmt}.%our \textbf{main problem}.
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \begin{Definition}[The Expected Result Multiplicity Problem]\label{def:the-expected-multipl}
-Let $\pdb$ be an arbitrary \abbrBIDB-PDB and $\vct{X}$ be the set of variables annotating tuples in $\dbbase$.  Fix a query $\query$ and a result tuple $\tup$.
+Let $\pdb$ be an arbitrary \abbrBIDB-PDB and $\vct{X}$ be the set of variables annotating tuples in $\dbbase$.  Fix an $\raPlus$ query $\query$ and a result tuple $\tup$.
  The \expectProblem is defined as follows:\\[-7mm]
 \begin{center}
 \textbf{Input}: $\circuit \in \circuitset{\polyX}$ for $\polyX = \apolyqdt$
--- a/ra-to-poly.tex
+++ b/ra-to-poly.tex
@ -5,14 +5,15 @@

 \subsection{Probabilistic Databases}

-Following typical representation of bags in production databases, for query inputs, we will use \abbrBPDB\xplural with multiplicities $\{0, 1\}$ and a unique tuple-id field to allow duplicate tuples.
+Following the typical representation of bags in production databases, for query inputs, we will use \abbrBPDB\xplural with multiplicities $\{0, 1\}$ (see \Cref{sec:gener-results-beyond} for more on this choice).
+% and a unique tuple-id field to allow duplicate tuples.

 An \textit{incomplete database} $\idb$ is a set of deterministic databases $\db$ called possible worlds.
 A \textit{probabilistic database} $\pdb$ is a pair $(\idb, \pd)$ where $\idb$ is an incomplete database and $\pd$ is a probability distribution over $\idb$. Queries over probabilistic databases are evaluated using the so-called possible world semantics. Under the possible world semantics, the result of a query $\query$ over an incomplete database $\idb$ is the set of query answers produced by evaluating $\query$ over each possible world: $\query(\idb) = \comprehension{\query(\db)}{\db \in \idb}$.

 For a probabilistic database $\pdb = (\idb, \pd)$, the result of a query is the pair $(\query(\idb), \pd')$ where $\pd'$ is a probability distribution over $\query(\idb)$ that assigns to each possible query result the sum of the probabilities of the worlds that produce this answer.

-Recall \Cref{fig:nxDBSemantics} which depicts the semantics for constructing a lineage polynomial $\apolyqdt$ for any $\raPlus$ query.  We now make a meaningful connection between possible world semantics and world assignments on the lineage polynomial.
+Recall \Cref{fig:nxDBSemantics}, which defines the lineage polynomial $\apolyqdt$ for any $\raPlus$ query.  We now make a meaningful connection between possible world semantics and world assignments on the lineage polynomial.

 \begin{Proposition}[Expectation of polynomials]\label{prop:expection-of-polynom}
 Given a \abbrBPDB $\pdb = (\idb,\pd)$, $\raPlus$ query $\query$, and lineage polynomial $\apolyqdt$ for arbitrary result tuple $\tup$, %$\semNX$-\abbrPDB $\pxdb = (\idb_{\semNX}',\pd')$ where $\rmod(\pxdb) = \pdb$,