%root: main.tex
%!TEX root=./main.tex
%\onecolumn
\section{Background and Notation}\label{sec:background}
\subsection{Polynomial Definition and Terminology}
%We now introduce some terminology
%and develop a reduced form of lineage polynomials for a \abbrBIDB or \abbrTIDB.
%Note that
\secrev{
A polynomial over a set of variables $\vct{S}$ with $\abs{\vct{S}}=\numedge$ and individual degree $B <\infty$
is formally defined as (where $c_{\vct{d}}\in \semN$):
\begin{equation}
\label{eq:sop-form}
\poly\inparen{S_1,\dots,S_\numedge}=\sum_{\vct{d}\in\{0,\ldots,B\}^\numedge} c_{\vct{d}}\cdot \prod_{i\in\pbox{\numedge}}S_i^{d_i}.
\end{equation}
}
2022-02-07 12:09:43 -05:00
%where $c_{\vct{d}}\in \semN$.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Definition}[Standard Monomial Basis]\label{def:smb}
\secrev{The term $\prod_{i\in\pbox{\numedge}} S_i^{d_i}$ }in \Cref{eq:sop-form} is a {\em monomial}. A polynomial $\poly\inparen{\vct{X}}$ is in standard monomial basis (\abbrSMB) when we keep only the terms with $c_{\vct{d}}\ne 0$ from \Cref{eq:sop-form}.
2022-02-07 12:09:43 -05:00
\end{Definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Unless otherwise noted, we consider all polynomials to be in \abbrSMB representation.
When it is unclear, we use $\smbOf{\poly}$ to denote the \abbrSMB form of a polynomial $\poly$.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Definition}[Degree]\label{def:degree-of-poly}
The degree of polynomial $\poly(\vct{X})$ is the largest \secrev{$\sum_{i\in\pbox{\numedge}}d_i %= \norm{\vct{d}}_1
$}% = \sum_{\tup\in\tupset} d_\tup$
such that $c_{(d_1,\dots,d_\numedge)}\ne 0$. \secrev{
We denote the degree of $\poly$ as $\deg\inparen{\poly}$.
}% maximum sum of exponents, over all monomials in $\smbOf{\poly(\vct{X})}$.
2022-02-07 12:09:43 -05:00
\end{Definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
As an example, the degree of the polynomial $X^2+2XY^2+Y^2$ is $3$.
Product terms in lineage arise only from join operations (\Cref{fig:nxDBSemantics}), so intuitively, the degree of a lineage polynomial is analogous to the largest number of joins needed to produce a result tuple.
%in any clause of the $\raPlus$ query that created it.
\secrev{
We call a polynomial $\poly\inparen{\vct{X}}$ a \emph{\abbrCTIDB-lineage polynomial} (%resp., \emph{\ti-lineage polynomial},
or simply lineage polynomial), if there exists a $\raPlus$ query $\query$, \abbrCTIDB $\pdb$, and result tuple $\tup$ such that $\poly\inparen{\vct{X}} = \apolyqdt\inparen{\vct{X}}.$
}
2020-07-02 16:15:35 -04:00
2022-02-08 16:39:14 -05:00
%Following the typical representation of bags in production databases, for query inputs, we will use \abbrBPDB\xplural with multiplicities $\{0, 1\}$ (see \Cref{sec:gener-results-beyond} for more on this choice).
2022-02-17 10:07:33 -05:00
\subsection{$\mathbf{1}$-BIDB}\label{subsec:one-bidb}
2020-12-19 12:59:27 -05:00
\label{subsec:tidbs-and-bidbs}
2022-02-08 16:39:14 -05:00
\noindent\secrev{
A block independent database (\abbrBIDB) $\pdb'$ can be viewed as a $1$-\abbrTIDB $\pdb$ with the added flexibility that each $\tup\in\tupset$ has multiple disjoint alternatives, i.e., all $\tup \in \tupset'$ are partitioned into $m$ independent blocks with the condition that tuples $\tup \in \block_i$ for $i \in \pbox{m}$ are disjoint events. We define next a specific construction of \abbrBIDB that is useful for our work.
\begin{Definition}[$1$-\abbrBIDB]\label{def:one-bidb}
Define a $1$-\abbrBIDB to be the pair $\pdb' = \inparen{\prod_{\tup\in\tupset'}\inset{0, \bound_\tup}, \bpd'},$ where $\tupset'$ is the set of possible tuples such that each $\tup \in \tupset'$ has a multiplicity domain of $\inset{0, \bound_\tup}$, with $\bound_\tup\in\mathbb{N}$. The operation $\prod_{\tup\in\tupset'}$ is the direct product of all such multiplicity domain pairs. The tuples $\tup\in\tupset'$ are partitioned into $m$ independent blocks $\block_i,~i\in\pbox{m}$, of disjoint tuples. $\bpd'$ is the probability distribution across all worlds such that, given $\worldvec\in\prod_{\tup\in\tupset'}\inset{0,\bound_\tup}$ and any two distinct tuples $\tup,~\tup'\in\block_i$, we have $\probOf\pbox{\worldvec_\tup, \worldvec_{\tup'}>0} = 0$.
\end{Definition}
%A \abbrCTIDB $\pdb$ is a pair $\inparen{\worlds, \bpd}$ such that $\worlds$ is an incomplete database whose set of possible worlds is the $c+1^\numvar$ tuple/multiplicity combinations across all $\tup\in\tupset$, where $\abs{\tupset} = \numvar$, $\tupset = \bigcup_{\worldvec\in\worlds,~\worldvec_{\tup}\geq 1}\tup$ is the set of possible tuples across possible worlds, and $\bpd$ is a probability distribution over $\worlds$.
2022-02-15 13:11:33 -05:00
%\begin{Definition}[$\bound$-Block Independent Disjoint Database ($\bound$-\abbrBIDB)]\label{def:bidb}
%A $\bound$-block independent database ($\bound$-\abbrBIDB) $\pdb' = \inparen{\inset{0,\ldots,\bound}^{\tupset'}, \bpd'}$ is a probabilistic database such that the all worlds set is encoded as the set of vectors $\worldvec\in\inset{0,\ldots,\bound}^{\abs{\tupset'}}$ where $\worldvec_\tup\leq\bound$ is the multiplicity for tuple $\tup$. $\pdb'$ requires the set of all possible tuples $\tupset = \bigcup_{\worldvec\in\inset{0,\ldots, \bound}^{\tupset'},~\worldvec_\tup \geq 1}\tup$ to be partitioned into $m$ independent blocks $\block_i$ ($i\in\pbox{m}$) where all tuples $\tup_{i, j}\in \block_i$ are disjoint. $\bpd'$ is the probability distribution where, for all $\worldvec\in\inset{0,\ldots,\bound}^{\tupset'}$ such that $\worldvec_{\tup_{i, j}},\worldvec_{\tup_{i, j'}}\neq 0, j\neq j'$ for any $\block_i$, $\probOf\pbox{\worldvec} = 0$, where all other $\worldvec$ has $0<\probOf\pbox{\worldvec}\leq 1$.%bpd'$ set with the all worlds set $\worlds$ and probability distribution $\bpd'$ such that $\tupset' = \bigcup_{\worldvec\in\worlds, \worldvec_\tup \geq 1}\tup$ is the set of all possible tuples for which all $\tup\in\tupset'$ can be partitioned into $\numedge$ blocks $\block_i$ where the set of tuples $\tup_j \in \block_i$ are all disjoint, while blocks $\block_i$ are independent of one another. Each $\tup\in\tupset'$ has a multiplicity of at most $\bound$. $\bpd'$ is the distribution such that for any $\worldvec\in\worlds$ with $\worldvec_{\tup_{i, j}}\geq 1$ and $\worldvec_{\tup_{i, j'}}\geq 1$, $j\neq j'$ in any $\block_i$ more than one tuple present from the same block $\block_i$ has probability $\probOf\pbox{\worldvec} = 0$.
%\end{Definition}
%A block independent database (\abbrBIDB) is a related probabilistic data model $\pdb=\inparen{\Omega, \bpd}$ such that the base set of tuples $\tupset = \bigcup_{\omega\in\Omega,~\tup\in\omega}\tup$ is partitioned into a set of $\numvar$ independent blocks $\inset{\inparen{\block_\tup}_{\tup\in\pbox{\numvar}}}$ such that the set of tuples $\inset{\inparen{\tup_j}_{j\in\pbox{\abs{\block}}}}$ in block $\block_\tup$ are disjoint from one another. This construction produces the set of possible worlds $\Omega$ that consists of all unique combinations of tuples in $\tupset$ with the constraint that for any $\omega\in\Omega$, no two tuples $\tup_j, \tup_{j'}, j\neq j'$ from the same block $\block_\tup$ exist together. A $\bound$-\abbrBIDB has the further requirement that each block has a multiplicity of at most $c$.
2022-02-16 09:41:00 -05:00
We now present a reduction that is useful in deriving our results:
2022-02-08 12:51:15 -05:00
\begin{Definition}[\abbrCTIDB reduction]\label{def:ctidb-reduct}
Given \abbrCTIDB $\pdb = \inparen{\worlds, \bpd}$, let $\pdb' = \inparen{\prod_{\tup\in\tupset'}\inset{0, \bound_\tup}, \bpd'}$ be the \abbrOneBIDB obtained in the following manner: for each $\tup\in\tupset$, create block $\block_\tup = \inset{\intup{\tup, j}_{j\in\pbox{\bound}}}$ of disjoint tuples, for all $j\in\pbox{\bound}$.% such that $X_{\tup, j}\in\inset{0,1}$.
The probability distribution $\bpd'$ is the one induced by $\vct{p} = \inparen{\inparen{\prob_{\tup, j}}_{\tup\in\tupset, j\in\pbox{\bound}}}$ and the \abbrBIDB disjoint requirement, where given any $\worldvec\in\prod_{\tup\in\tupset'}\inset{0, \bound_\tup}$, $\probOf\pbox{\worldvec_{\tup, j}, \worldvec_{\tup, j'} > 0} = 0$ for any $j \neq j' \in \pbox{\bound}$, such that for any $W\in\prod_{\tup\in\tupset'}\inset{0, \bound_\tup}$, $\probOf\pbox{\worldvec = W} = \prod_{\tup\in\tupset', j\in\pbox{\bound}}W_{\tup, j}\cdot j\cdot\prob_\tup$ if $\forall \tup \in \tupset'\not\exists j\neq j'\in\pbox{\bound}, W_{\tup, j}, W_{\tup, j'} \geq 1$; otherwise $\probOf\pbox{\worldvec = W} = 0$.\footnote{
We slightly abuse notation here, denoting a world vector as $W$ rather than $\worldvec$ to distinguish between the random variable and the world instance. When there is no ambiguity, we will denote a world vector as $\worldvec$.}% that for any $X_{\tup, j} = 1, j'\in\pbox{\bound} - \inset{j}, X_{\tup, j'} = 0$.
2022-02-15 13:11:33 -05:00
% $\block_\tup,~j\in\pbox{\bound}~|~X_{\tup, j} = 1,\not\exists j'\neq j~|~X_{\tup, j'} = 1$.
%$\tup_j\geq1\implies \tup_{j'} = 0$.$\forall j, j' \in \pbox{\bound},\forall \tup\in\tupset, \tup_j\geq 1\implies \tup_{j'} = 0$ for any block $\block_\tup$.
2022-02-08 12:51:15 -05:00
\end{Definition}
2022-02-09 09:35:36 -05:00
For the \abbrCTIDB $\pdb$, each $X_\tup\in\pbox{\bound}$, while in the reduced \abbrOneBIDB $\pdb'$, each $X_{\tup, j}\in\inset{0, 1}$. %As previously noted, unlike $X_{\tup}\in\inset{0,\ldots,\bound}$ for $X_{\tup}\in\vars{\pdb}$, $X_{\tup, j}\in\inset{0,1}$ for $X_{\tup, j}\in\vars{\pdb'}$.
2022-02-15 13:11:33 -05:00
Hence, in the setting of \abbrOneBIDB, we have the following semantics for generating lineage polynomials in $\raPlus$ queries: $\poly'\pbox{\project_A\inparen{\query}, \tupset', \tup_j} = \sum_{\tup_{j'} \in \project_{A}\inparen{\query\inparen{\tupset'}}: \tup_{j'} = \tup_j}\poly'\pbox{\query, \tupset', \tup_{j'}}$,
$\poly'\pbox{\select_\theta\inparen{\query}, \tupset', \tup_j} = \begin{cases}\poly'\pbox{\query, \tupset', \tup_j}&\text{if }\theta = 1\\0&\text{if }\theta = 0\end{cases}$,
$\poly'\pbox{\query_1\join\query_2, \tupset', \tup_j} = \poly'\pbox{\query_1, \tupset', \project_{attr\inparen{\query_1}}\inparen{\tup_j}}\cdot\poly'\pbox{\query_2, \tupset', \project_{attr\inparen{\query_2}}\inparen{\tup_j}}$,
$\poly'\pbox{\query_1\union\query_2, \tupset', \tup_j} = \poly'\pbox{\query_1, \tupset', \tup_j}+\poly'\pbox{\query_2, \tupset', \tup_j}$,
and the base case now becomes $\poly'\pbox{\rel,\tupset', \tup_j} = j\cdot X_{\tup, j}$ (cf.~\Cref{fig:nxDBSemantics}). Then given the disjoint requirement and the semantics for constructing the lineage polynomial over a \abbrOneBIDB, $\poly'\pbox{\rel,\tupset',\tup}$ is of the same structure as the reformulated polynomial $\refpoly{}$ of step i) from~\Cref{def:reduced-poly}, which then implies that $\rpoly'$ is the reduced polynomial that results from step ii) of~\Cref{def:reduced-poly}, and further that~\Cref{lem:tidb-reduce-poly} immediately follows for \abbrOneBIDB polynomials: $\expct_{\rvworld\sim\bpd'}\pbox{\poly'\inparen{\rvworld}} = \rpoly'\inparen{\vct{\prob}}$.
}
2022-02-08 12:51:15 -05:00
%In this paper, we focus on two popular forms of \abbrPDB\xplural: Block-Independent (\bi) and Tuple-Independent (\ti) \abbrPDB\xplural.
%%
%A \bi $\pdb$ is a \abbrPDB with the constraint that
%%(i) every tuple $\tup_i$ is annotated with a unique random variable $\randWorld_i \in \{0, 1\}$ and (ii) that
%the tuples in $\dbbase$ can be partitioned into a set of $\ell$ blocks such that tuples $\tup_{i, j}, \tup_{k, j'}$ from separate blocks $(i\neq k)$ are independent of each other while tuples $\tup_{i, j}, \tup_{i, k}$ from the same block are disjoint events.\footnote{
% Although only a single independent, $[\abs{\block_i}+1]$-valued variable is customarily used per block~\cite{DBLP:series/synthesis/2011Suciu}, we decompose it into $\abs{\block_i}$ correlated $\{0,1\}$-valued variables per block that can be used directly in polynomials (without an indicator function). For $t_{i, j} \in b_i$, the event $(\randWorld_{i,j} = 1)$ corresponds to the event $(\randWorld_i = j)$ in the customary annotation scheme.
%}
%Each tuple $\tup_{i, j}$ is annotated with a random variable $\randWorld_{i, j} \in \{0, 1\}$ denoting its presence in a possible world $\db$. The probability distribution $\pd$ over $\dbbase$ is the one induced from individual tuple probabilities $\prob_{i, j}\in \vct{\prob}=\inparen{\prob_{1, 1},\ldots,\prob_{\abs{\block},\ldots,\abs{\block_{\abs{\block}}}}}$ (where $\forall i$, $\sum_j p_{i,j}\le 1$) and the conditions on the blocks. A \abbrTIDB is a \abbrBIDB where each block has size exactly $1$.
2021-09-07 10:33:13 -04:00
2022-02-07 12:09:43 -05:00
2020-12-13 15:51:55 -05:00
2021-04-08 22:51:36 -04:00
%%% Local Variables:
%%% mode: latex
%%% TeX-master: "main"
%%% End: