Added two lemmas to S3.

master
Aaron Huber 2021-09-14 08:21:57 -04:00
parent 65f7b1bddc
commit 74d67ae4b0
1 changed files with 12 additions and 5 deletions

View File

@ -51,7 +51,7 @@ For any graph $G=(V,\edgeSet)$ and $\kElem\ge 1$, define
SELECT 1 FROM OnTime a, Route r, OnTime b
WHERE a.city = r.city1 AND b.city = r.city2
\end{lstlisting}
as $R_i$ for each $i \in [k]$. The query then becomes
as $R_i$ for each $i \in [k]$. The query $\query^k$ then becomes
\begin{lstlisting}
SELECT 1 FROM $R_1$ JOIN $R_2$ JOIN$\cdots$JOIN $R_k$
\end{lstlisting}
@ -66,14 +66,21 @@ SELECT 1 FROM $R_1$ JOIN $R_2$ JOIN$\cdots$JOIN $R_k$
%\[\poly^k_G\dlImp OnTime(C_1),Route(C_1, C_1'),OnTime(C_1'),\dots,OnTime(C_\kElem),Route(C_\kElem,C_\kElem'),OnTime(C_\kElem')\]
%\end{minipage}
%}
where adapting the PDB instance in \Cref{fig:two-step}, relation $OnTime$ has $4$ tuples corresponding to each vertex for $i$ in $[4]$, each with probability $\prob_i$ and $Route$ has tuples corresponding to the edges $\edgeSet$ (each with probability of $1$).\footnote{Technically, $\poly_{G}^\kElem(\vct{X})$ should have variables corresponding to tuples in $Route$ as well, but since they always are present with probability $1$, we drop those. Our argument also works when all the tuples in $Route$ also are present with probability $\prob$ but to simplify notation we assign probability $1$ to edges.}
\noindent where, adapting the PDB instance in \Cref{fig:two-step}, relation $OnTime$ has $4$ tuples, one for each vertex $i \in [4]$, each with probability $\prob_i$, and $Route$ has tuples corresponding to the edges $\edgeSet$ (each with probability $1$).\footnote{Technically, $\poly_{G}^\kElem(\vct{X})$ should have variables corresponding to tuples in $Route$ as well, but since they are always present with probability $1$, we drop those. Our argument also works when all the tuples in $Route$ are present with probability $\prob$, but to simplify notation we assign probability $1$ to edges.}
Note that this implies that our hard lineage polynomial can be represented as an expression tree produced by a project-join query with the same probability value for each input tuple $\prob_i$, and hence is indeed a lineage polynomial for a \abbrTIDB \abbrPDB.
\begin{Lemma}\label{lem:pdb-for-def-qk}
The relations encoding the edges for the hard query of \Cref{def:qk} can be computed in $\bigO{\numedge}$ time.
Assuming that each $v \in \vset$ has degree at least $1$,\footnote{We argue that this is a reasonable assumption, since any vertex with degree $0$ can be dropped without affecting the result of our hard query.} the \abbrPDB relations encoding the edges for the hard query of \Cref{def:qk} can be computed in $\bigO{\numedge}$ time.
\end{Lemma}
\begin{proof}
Only two relations need be constructed, one for the vertices and one for the edges. By a simple linear scan, each can be constructed in time $\bigO{\numedge + \numvar}$. If we assume that the number of vertices is within a constant factor of the number of edges, then we have $\bigO{\numedge}$ time.
\begin{proof}[Proof of \Cref{lem:pdb-for-def-qk}]
Only two relations need be constructed, one for the set $\vset$ and one for the set $\edgeSet$. By a simple linear scan, each can be constructed in time $\bigO{\numedge + \numvar}$. Given that the degree of each $v \in \vset$ is at least $1$, we have $\abs{\vset} \leq 2\abs{\edgeSet}$, i.e., the number of vertices is within a constant factor of the number of edges, and this yields the claimed runtime.
\end{proof}
\begin{Lemma}\label{lem:tdet-om}
For $\query^k$ defined above, the runtime $\qruntime{\query^k, \dbbase}$ is $O_k(\numedge)$.
\end{Lemma}
\begin{proof}[Proof of \Cref{lem:tdet-om}]
Since, by definition, $\dbbase = \bigcup_{\db \in \idb}\db$, it follows that $\dbbase$ consists of the relations that contain all possible $v \in \vset$ and $e \in \edgeSet$. Because the result for $\query^1$ cannot be any larger than the relation encoding $\edgeSet$ (i.e., $\abs{\edgeSet}$), it follows that (using an efficient query evaluation strategy such as indexing) $\qruntime{\query^1, \dbbase}$ is indeed $O(\numedge)$. When $k > 1$, since by \Cref{def:qk} $\query^k$ is simply a cross product of $k$ copies of the original query $\query^1$, we arrive at the desired runtime of $O_k(\numedge)$.
\end{proof}
\subsection{Multiple Distinct $\prob$ Values}