Almost done fixing layout issues with PODs format.

master
Aaron Huber 2022-03-15 11:21:11 -04:00
parent a9aea5ecc3
commit 0920523d57
69 changed files with 20011 additions and 58 deletions

31
Sketching Worlds/.gitignore vendored Normal file
View File

@ -0,0 +1,31 @@
acmart.cls
acmart.pdf
acmguide.pdf
samples/sample-*.pdf
*.log
*.aux
*.cfg
*.glo
*.idx
*.toc
*.ilg
*.ind
*.out
*.lof
*.lot
*.bbl
*.blg
*.gls
*.cut
*.hd
*.dvi
*.ps
*.thm
*.tgz
*.zip
*.rpi
*~
*.bcf
*.run.xml
samples/ACM-Reference-Format.bst
samples/*.tex

View File

@ -0,0 +1,830 @@
% Biblatex bibliography style implementing the ACM reference format.
\ProvidesFile{ACM-Reference-Format.bbx}[2017-09-27 v0.1 biblatex bibliography style]
% Inherit a default style
\RequireBibliographyStyle{trad-plain}
%%% Localisation strings for ACM
\DefineBibliographyStrings{american}{%
mathesis = {Master's thesis},
phdthesis = {Ph\adddot{}D\adddotspace Dissertation},
editor = {(Ed\adddot)},
editors = {(Eds\adddot)},
edition = {ed\adddot},
}
%%% Formatting for fields
%\DeclareFieldFormat
% [article,inbook,incollection,inproceedings,patent,thesis,unpublished]
% {title}{#1}
% Pages and numbers are printed plain (no "pp."/"no." prefixes), per ACM style.
\DeclareFieldFormat{pages}{#1}
\DeclareFieldFormat{numpages}{#1 pages}
\DeclareFieldFormat{number}{#1}
% Electronic article numbers are prefixed with "Article".
\DeclareFieldFormat{articleno}{Article #1}
\DeclareFieldFormat{key}{#1}
% Access date for online resources: "Retrieved <date> from <url>".
\DeclareFieldFormat{urldate}{Retrieved\space{}#1\space{}from}
\DeclareFieldAlias{lastaccessed}{urldate}
\DeclareFieldFormat{url}{\url{#1}}
% Edition in parentheses, e.g. "(2nd ed.)"; ordinal form only for integer input.
\DeclareFieldFormat{edition}{%
\printtext[parens]{\ifinteger{#1}
{\mkbibordedition{#1}~\bibstring{edition}}
{#1\isdot~\bibstring{edition}}}}
% Handle urls field containing 'and' separated list of URLs
% https://github.com/plk/biblatex/issues/229
% Comma-separate all but the last URL in the list.
\DeclareListFormat{urls}{%
\url{#1}%
\ifthenelse{\value{listcount}<\value{liststop}}
{\addcomma\space}
{}}
% Prefer the single url field; fall back to the urls list when url is undefined.
\renewbibmacro*{url}{\iffieldundef{url}{\printlist{urls}}{\printfield{url}}}
%%% Bibmacro definitions
% Translator names (with role string) when the usetranslator option is set
% and a translator exists; otherwise fall back to printing the key field.
\renewbibmacro*{translator+others}{%
\ifboolexpr{
test \ifusetranslator
and
not test {\ifnameundef{translator}}
}
{\printnames{translator}%
\setunit{\addcomma\space}%
\usebibmacro{translator+othersstrg}%
\clearname{translator}}
{\printfield{key}}}
% Year, or the ACM "no date" placeholder [n. d.] when the year is missing.
\newbibmacro*{year}{%
\iffieldundef{year}%
{\printtext{[n.\ d.]}}%
{\printfield{year}}%
}
% Full dates are set in parentheses.
\renewbibmacro*{date}{\printtext[parens]{\printdate}}
% Access date (if any) followed by the URL, joined by a space.
\renewbibmacro*{url+urldate}{\iffieldundef{urlyear}
{}
{\usebibmacro{urldate}%
\setunit*{\addspace}}%
\usebibmacro{url}%
}
% Journal name, optional series, then volume/number/article-no/date/pages,
% then issue information separated by a colon.
\renewbibmacro*{journal+issuetitle}{%
\usebibmacro{journal}%
\setunit*{\addcomma\space}%
\iffieldundef{series}
{}
{\newunit%
\printfield{series}%
\setunit{\addspace}}%
\usebibmacro{volume+number+date+pages+eid}%
% NOTE(review): \newcommaunit is not a core biblatex command; presumably
% defined elsewhere in this style bundle -- confirm before editing.
\newcommaunit%
% \setunit{\addspace}%
\usebibmacro{issue-issue}%
\setunit*{\addcolon\space}%
\usebibmacro{issue}%
\newunit}
% volume, number, Article N (month year), pages (or "N pages"), eid.
\newbibmacro*{volume+number+date+pages+eid}{%
\printfield{volume}%
\setunit*{\addcomma\space}%
\printfield{number}%
\setunit*{\addcomma\space}%
\printfield{articleno}
\setunit{\addcomma\space}
\usebibmacro{date-ifmonth}
\setunit{\addcomma\space}%
\iffieldundef{pages}%
{\printfield{numpages}}%
{\printfield{pages}}%
\newcommaunit%
\printfield{eid}}%
% Chapter and pages; fall back to the numpages field when pages is missing.
\renewbibmacro*{chapter+pages}{%
\printfield{chapter}%
\setunit{\bibpagespunct}%
\iffieldundef{pages}%
{\printfield{numpages}}%
{\printfield{pages}}%
\newunit}
% Editor names (with role string) when usable; otherwise the organization list.
\renewbibmacro*{editor+others}{%
\ifboolexpr{
test \ifuseeditor
and
not test {\ifnameundef{editor}}
}
{\printnames{editor}%
\setunit{\addcomma\space}%
\usebibmacro{editor+othersstrg}%
\clearname{editor}}
{\iflistundef{organization}{}{\printlist{organization}}}}
% Issue string followed by the month/year date, when an issue is present.
\newbibmacro*{issue-issue}{%
\iffieldundef{issue}%
{}%
{\printfield{issue}%
\setunit*{\addcomma\space}%
\usebibmacro{date-ifmonth}%
}%
\newunit}
% Maintitle (with volume/part) if any, then booktitle, parenthesized series,
% number, and article number -- used by the inproceedings driver.
\newbibmacro*{maintitle+booktitle+series+number}{%
\iffieldundef{maintitle}
{}
{\usebibmacro{maintitle}%
\newunit\newblock
\iffieldundef{volume}
{}
{\printfield{volume}%
\printfield{part}%
\setunit{\addcolon\space}}}%
\usebibmacro{booktitle}%
\setunit*{\addspace}
\printfield[parens]{series}%
\setunit*{\addspace}%
\printfield{number}%
\setunit*{\addcomma\space}%
\printfield{articleno}
\newunit
}
% Booktitle plus optional subtitle and booktitleaddon.
\renewbibmacro*{booktitle}{%
\ifboolexpr{
test {\iffieldundef{booktitle}}
and
test {\iffieldundef{booksubtitle}}
}
{}
{\printtext[booktitle]{%
\printfield[titlecase]{booktitle}%
\iffieldundef{booksubtitle}{}{
\setunit{\subtitlepunct}%
\printfield[titlecase]{booksubtitle}}%
}%
}%
\printfield{booktitleaddon}}
% volume, number, Article N, eid -- comma separated.
\renewbibmacro*{volume+number+eid}{%
\printfield{volume}%
\setunit*{\addcomma\space}%
\printfield{number}%
\setunit*{\addcomma\space}%
\printfield{articleno}
\setunit{\addcomma\space}%
\printfield{eid}}
% Publisher, location, and (month year) date.
\renewbibmacro*{publisher+location+date}{%
\printlist{publisher}%
\setunit*{\addcomma\space}%
\printlist{location}%
\setunit*{\addcomma\space}%
\usebibmacro{date-ifmonth}%
\newunit}
% Print the full date only when a month is present
% (the year is already printed up front by the year macro).
\newbibmacro{date-ifmonth}{%
\iffieldundef{month}{}{%
\usebibmacro{date}
}%
}
% School/institution, location, and date -- used by thesis and report drivers.
\renewbibmacro*{institution+location+date}{%
\printlist{school}%
\setunit*{\addcomma\space}%
\printlist{institution}%
\setunit*{\addcomma\space}%
\printlist{location}%
\setunit*{\addcomma\space}%
\usebibmacro{date-ifmonth}%
\newunit}
% Periodical title (with subtitle) followed by the journal macro.
\renewbibmacro*{periodical}{%
\iffieldundef{title}
{}
{\printtext[title]{%
\printfield[titlecase]{title}%
\setunit{\subtitlepunct}%
\printfield[titlecase]{subtitle}}}%
\newunit%
\usebibmacro{journal}}
% Issue string and date; just the date when no issue is present.
\renewbibmacro*{issue+date}{%
\iffieldundef{issue}
{\usebibmacro{date}}
{\printfield{issue}%
\setunit*{\addspace}%
\usebibmacro{date}}%
\newunit}
% Periodical title, series, volume/number/article-no/eid, issue and date.
\renewbibmacro*{title+issuetitle}{%
\usebibmacro{periodical}%
\setunit*{\addspace}%
\iffieldundef{series}
{}
{\newunit
\printfield{series}%
\setunit{\addspace}}%
\printfield{volume}%
\setunit*{\addcomma\space}%
\printfield{number}%
\setunit*{\addcomma\space}%
\printfield{articleno}
\setunit{\addcomma\space}%
\printfield{eid}%
\setunit{\addspace}%
\usebibmacro{issue+date}%
\setunit{\addcolon\space}%
\usebibmacro{issue}%
\newunit}
% DOI, eprint, and URL; the URL is suppressed when a DOI is present.
\renewbibmacro*{doi+eprint+url}{%
\iftoggle{bbx:url}
{\iffieldundef{doi}{\usebibmacro{url+urldate}}{}}
{}%
\newunit\newblock
\iftoggle{bbx:eprint}
{\usebibmacro{eprint}}
{}%
\newunit\newblock
\iftoggle{bbx:doi}
{\printfield{doi}}
{}}
%%% Definitions for drivers (alphabetical)
% Driver for @article entries.
% Order: author. year. title. journal+issue info. note. issn. doi/eprint/url.
\DeclareBibliographyDriver{article}{%
\usebibmacro{bibindex}%
\usebibmacro{begentry}%
\usebibmacro{author/translator+others}%
\setunit{\labelnamepunct}\newblock%
\usebibmacro{year}%
\newunit%
\usebibmacro{title}%
\newunit%
\printlist{language}%
\newunit\newblock%
\usebibmacro{byauthor}%
\newunit\newblock%
\usebibmacro{bytranslator+others}%
\newunit\newblock%
\printfield{version}%
\newunit\newblock%
\usebibmacro{journal+issuetitle}%
\newunit%
\usebibmacro{byeditor+others}%
\newunit%
\printfield{note}%
\newunit\newblock%
% ISSN is printed under the isbn toggle (one switch controls all id numbers).
\iftoggle{bbx:isbn}
{\printfield{issn}}
{}%
\newunit\newblock%
\usebibmacro{doi+eprint+url}%
\newunit\newblock%
\usebibmacro{addendum+pubstate}%
\setunit{\bibpagerefpunct}\newblock
\usebibmacro{pageref}%
\newunit\newblock%
\usebibmacro{related}%
\usebibmacro{finentry}}
% Driver for @book entries.
% Order: author/editor. year. title. edition. series. volumes. note.
% publisher/location/date. chapter+pages. isbn. doi/eprint/url.
\DeclareBibliographyDriver{book}{%
\usebibmacro{bibindex}%
\usebibmacro{begentry}%
\usebibmacro{author/editor+others/translator+others}%
\setunit{\labelnamepunct}\newblock
\usebibmacro{year}%
\newunit%
\usebibmacro{maintitle+title}%
\newunit%
\printlist{language}%
\newunit\newblock
\usebibmacro{byauthor}%
\newunit\newblock
\usebibmacro{byeditor+others}%
\newunit\newblock
\printfield{edition}%
\newunit
\usebibmacro{series+number}%
% Volume/part are printed here only when there is no maintitle
% (otherwise maintitle+title already handled them).
\iffieldundef{maintitle}
{\printfield{volume}%
\printfield{part}}
{}%
\newunit
\newunit\newblock
\printfield{volumes}%
\newunit\newblock
\printfield{note}%
\newunit\newblock
\usebibmacro{publisher+location+date}%
\newunit\newblock
\usebibmacro{chapter+pages}%
\newunit
\printfield{pagetotal}%
\newunit\newblock
\iftoggle{bbx:isbn}
{\printfield{isbn}}
{}%
\newunit\newblock
\usebibmacro{doi+eprint+url}%
\newunit\newblock
\usebibmacro{addendum+pubstate}%
\setunit{\bibpagerefpunct}\newblock
\usebibmacro{pageref}%
\newunit\newblock
\iftoggle{bbx:related}
{\usebibmacro{related:init}%
\usebibmacro{related}}
{}%
\usebibmacro{finentry}}
% Driver for @inbook entries.
% When no author exists, the editors take the author position up front,
% and are then not repeated later in the entry.
\DeclareBibliographyDriver{inbook}{%
\usebibmacro{bibindex}%
\usebibmacro{begentry}%
\iffieldundef{author}%
{\usebibmacro{byeditor+others}}%
{\usebibmacro{author/translator+others}}%
\setunit{\labelnamepunct}\newblock
\usebibmacro{year}
\newunit\newblock
\usebibmacro{title}%
\newunit
\printlist{language}%
\newunit\newblock
\usebibmacro{byauthor}%
\newunit\newblock
% \usebibmacro{in:}%
\usebibmacro{bybookauthor}%
\newunit\newblock
\usebibmacro{maintitle+booktitle}%
\newunit\newblock
\iffieldundef{author}{}%if undef then we already printed editor
{\usebibmacro{byeditor+others}}%
\newunit\newblock
\printfield{edition}%
\newunit
\iffieldundef{maintitle}
{\printfield{volume}%
\printfield{part}}
{}%
\newunit
\printfield{volumes}%
\newunit\newblock
\usebibmacro{series+number}%
\newunit\newblock
\printfield{note}%
\newunit\newblock
\usebibmacro{publisher+location+date}%
\newunit\newblock
\usebibmacro{chapter+pages}%
\newunit\newblock
\iftoggle{bbx:isbn}
{\printfield{isbn}}
{}%
\newunit\newblock
\usebibmacro{doi+eprint+url}%
\newunit\newblock
\usebibmacro{addendum+pubstate}%
\setunit{\bibpagerefpunct}\newblock
\usebibmacro{pageref}%
\newunit\newblock
\iftoggle{bbx:related}
{\usebibmacro{related:init}%
\usebibmacro{related}}
{}%
\usebibmacro{finentry}}
% Driver for @incollection entries.
% Order: author. year. title. In: booktitle. series. edition. editors. note.
% publisher/location/date. chapter+pages. isbn. doi/eprint/url.
\DeclareBibliographyDriver{incollection}{%
\usebibmacro{bibindex}%
\usebibmacro{begentry}%
\usebibmacro{author/translator+others}%
\setunit{\labelnamepunct}\newblock
\usebibmacro{year}
\newunit\newblock
\usebibmacro{title}%
\newunit
\printlist{language}%
\newunit\newblock
\usebibmacro{byauthor}%
\newunit\newblock
\usebibmacro{in:}%
\usebibmacro{maintitle+booktitle}%
\newunit\newblock
\usebibmacro{series+number}%
\newunit\newblock
\printfield{edition}%
\newunit
\iffieldundef{maintitle}
{\printfield{volume}%
\printfield{part}}
{}%
\newunit
\printfield{volumes}%
\newunit\newblock
\usebibmacro{byeditor+others}%
\newunit\newblock
\printfield{note}%
\newunit\newblock
\usebibmacro{publisher+location+date}%
\newunit\newblock
\usebibmacro{chapter+pages}%
\newunit\newblock
\iftoggle{bbx:isbn}
{\printfield{isbn}}
{}%
\newunit\newblock
\usebibmacro{doi+eprint+url}%
\newunit\newblock
\usebibmacro{addendum+pubstate}%
\setunit{\bibpagerefpunct}\newblock
\usebibmacro{pageref}%
\newunit\newblock
\iftoggle{bbx:related}
{\usebibmacro{related:init}%
\usebibmacro{related}}
{}%
\usebibmacro{finentry}}
% Driver for @inproceedings entries.
% Uses maintitle+booktitle+series+number so the series/number appear
% parenthesized right after the proceedings title, per ACM style.
\DeclareBibliographyDriver{inproceedings}{%
\usebibmacro{bibindex}%
\usebibmacro{begentry}%
\usebibmacro{author/translator+others}%
\setunit{\labelnamepunct}\newblock
\usebibmacro{year}
\newunit\newblock
\usebibmacro{title}%
\newunit
\printlist{language}%
\newunit\newblock
\usebibmacro{byauthor}%
\newunit\newblock
\usebibmacro{in:}%
\usebibmacro{maintitle+booktitle+series+number}%
\newunit\newblock
\usebibmacro{event+venue+date}%
\newunit\newblock
\usebibmacro{byeditor+others}%
\newunit\newblock
\iffieldundef{maintitle}
{\printfield{volume}%
\printfield{part}}
{}%
\newunit
\printfield{volumes}%
\newunit\newblock
\printfield{note}%
\newunit\newblock
\printlist{organization}%
\newunit
\usebibmacro{publisher+location+date}%
\newunit\newblock
\usebibmacro{chapter+pages}%
\newunit\newblock
\iftoggle{bbx:isbn}
{\printfield{isbn}}
{}%
\newunit\newblock
\usebibmacro{doi+eprint+url}%
\newunit\newblock
\usebibmacro{addendum+pubstate}%
\setunit{\bibpagerefpunct}\newblock
\usebibmacro{pageref}%
\newunit\newblock
\iftoggle{bbx:related}
{\usebibmacro{related:init}%
\usebibmacro{related}}
{}%
\usebibmacro{finentry}}
% Driver for @manual entries.
% Order: author/editor. year. title. edition. series. type/version. note.
% organization. publisher/location/date. pages. isbn. doi/eprint/url.
\DeclareBibliographyDriver{manual}{%
\usebibmacro{bibindex}%
\usebibmacro{begentry}%
\usebibmacro{author/editor+others}%
\setunit{\labelnamepunct}\newblock
\usebibmacro{year}
\newunit\newblock
\usebibmacro{title}%
\newunit
\printlist{language}%
\newunit\newblock
\usebibmacro{byauthor}%
\newunit\newblock
\usebibmacro{byeditor}%
\newunit\newblock
\printfield{edition}%
\newunit\newblock
\usebibmacro{series+number}%
\newunit\newblock
\printfield{type}%
\newunit
\printfield{version}%
\newunit
\printfield{note}%
\newunit\newblock
\printlist{organization}%
\newunit
\usebibmacro{publisher+location+date}%
\newunit\newblock
\usebibmacro{chapter+pages}%
\newunit
\printfield{pagetotal}%
\newunit\newblock
\iftoggle{bbx:isbn}
{\printfield{isbn}}
{}%
\newunit\newblock
\usebibmacro{doi+eprint+url}%
\newunit\newblock
\usebibmacro{addendum+pubstate}%
\setunit{\bibpagerefpunct}\newblock
\usebibmacro{pageref}%
\newunit\newblock
\iftoggle{bbx:related}
{\usebibmacro{related:init}%
\usebibmacro{related}}
{}%
\usebibmacro{finentry}}
% Driver for @misc entries.
% Order: author/editor. year. title. howpublished. type/version. note.
% organization/location/date. doi/eprint/url.
\DeclareBibliographyDriver{misc}{%
\usebibmacro{bibindex}%
\usebibmacro{begentry}%
\usebibmacro{author/editor+others/translator+others}%
\setunit{\labelnamepunct}\newblock
\usebibmacro{year}
\newunit\newblock
\usebibmacro{title}%
\newunit
\printlist{language}%
\newunit\newblock
\usebibmacro{byauthor}%
\newunit\newblock
\usebibmacro{byeditor+others}%
\newunit\newblock
\printfield{howpublished}%
\newunit\newblock
\printfield{type}%
\newunit
\printfield{version}%
\newunit
\printfield{note}%
\newunit\newblock
\usebibmacro{organization+location+date}%
\newunit\newblock
\usebibmacro{doi+eprint+url}%
\newunit\newblock
\usebibmacro{addendum+pubstate}%
\setunit{\bibpagerefpunct}\newblock
\usebibmacro{pageref}%
\newunit\newblock
\iftoggle{bbx:related}
{\usebibmacro{related:init}%
\usebibmacro{related}}
{}%
\usebibmacro{finentry}}
% Driver for @online entries.
% Unlike other drivers this always ends with url+urldate (access date + URL),
% and prints the eprint under its toggle directly rather than via doi+eprint+url.
\DeclareBibliographyDriver{online}{%
\usebibmacro{bibindex}%
\usebibmacro{begentry}%
\usebibmacro{author/editor+others/translator+others}%
\setunit{\labelnamepunct}\newblock
\usebibmacro{year}%
\setunit{\labelnamepunct}\newblock
\usebibmacro{title}%
\newunit
\printlist{language}%
\newunit\newblock
\usebibmacro{byauthor}%
\newunit\newblock
\usebibmacro{byeditor+others}%
\newunit\newblock
\printfield{version}%
\newunit
\printfield{note}%
\newunit\newblock
\printlist{organization}%
\newunit\newblock
\usebibmacro{date-ifmonth}%
\newunit\newblock
\iftoggle{bbx:eprint}
{\usebibmacro{eprint}}
{}%
\newunit\newblock
\usebibmacro{url+urldate}%
\newunit\newblock
\usebibmacro{addendum+pubstate}%
\setunit{\bibpagerefpunct}\newblock
\usebibmacro{pageref}%
\newunit\newblock
\iftoggle{bbx:related}
{\usebibmacro{related:init}%
\usebibmacro{related}}
{}%
\usebibmacro{finentry}}
% Patent numbers are prefixed with "Patent No.".
\DeclareFieldFormat[patent]{number}{Patent No.~#1}
% Driver for @patent entries.
% Order: author. year. title. date. type + number (location). holder. note.
\DeclareBibliographyDriver{patent}{%
\usebibmacro{bibindex}%
\usebibmacro{begentry}%
\usebibmacro{author}%
\setunit{\labelnamepunct}\newblock
\usebibmacro{year}%
\newunit
\usebibmacro{title}%
\newunit
\printlist{language}%
\newunit\newblock
\usebibmacro{byauthor}%
\newunit\newblock
\usebibmacro{date}%
\newunit\newblock
\printfield{type}%
\setunit*{\addspace}%
\printfield{number}%
% Location list (if any) is set in parentheses after the number.
\iflistundef{location}
{}
{\setunit*{\addspace}%
\printtext[parens]{%
\printlist[][-\value{listtotal}]{location}}}%
\newunit\newblock
\usebibmacro{byholder}%
\newunit\newblock
\printfield{note}%
\newunit\newblock
\usebibmacro{doi+eprint+url}%
\newunit\newblock
\usebibmacro{addendum+pubstate}%
\setunit{\bibpagerefpunct}\newblock
\usebibmacro{pageref}%
\newunit\newblock
\iftoggle{bbx:related}
{\usebibmacro{related:init}%
\usebibmacro{related}}
{}%
\usebibmacro{finentry}}
% Driver for @periodical entries.
% Order: editor. year. title+issue info. note. issn. doi/eprint/url.
\DeclareBibliographyDriver{periodical}{%
\usebibmacro{bibindex}%
\usebibmacro{begentry}%
\usebibmacro{editor}%
\setunit{\labelnamepunct}\newblock
\usebibmacro{year}
\newunit
\usebibmacro{title+issuetitle}%
\newunit
\printlist{language}%
\newunit\newblock
\usebibmacro{byeditor}%
\newunit\newblock
\printfield{note}%
\newunit\newblock
\iftoggle{bbx:isbn}
{\printfield{issn}}
{}%
\newunit\newblock
\usebibmacro{doi+eprint+url}%
\newunit\newblock
\usebibmacro{addendum+pubstate}%
\setunit{\bibpagerefpunct}\newblock
\usebibmacro{pageref}%
\newunit\newblock
\iftoggle{bbx:related}
{\usebibmacro{related:init}%
\usebibmacro{related}}
{}%
\usebibmacro{finentry}}
% Driver for @report entries.
% Order: author. year. title. type + number. version. note.
% institution/location/date. pages. isrn. doi/eprint/url.
\DeclareBibliographyDriver{report}{%
\usebibmacro{bibindex}%
\usebibmacro{begentry}%
\usebibmacro{author}%
\setunit{\labelnamepunct}\newblock
\usebibmacro{year}
\newunit
\usebibmacro{title}%
\newunit
\printlist{language}%
\newunit\newblock
\usebibmacro{byauthor}%
\newunit\newblock
\printfield{type}%
\setunit*{\addspace}%
\printfield{number}%
\newunit\newblock
\printfield{version}%
\newunit
\printfield{note}%
\newunit\newblock
\usebibmacro{institution+location+date}%
\newunit\newblock
\usebibmacro{chapter+pages}%
\newunit
\printfield{pagetotal}%
\newunit\newblock
% NOTE(review): this prints isrn (report number), not isbn, under the
% bbx:isbn toggle -- presumably intentional for reports; confirm.
\iftoggle{bbx:isbn}
{\printfield{isrn}}
{}%
\newunit\newblock
\usebibmacro{doi+eprint+url}%
\newunit\newblock
\usebibmacro{addendum+pubstate}%
\setunit{\bibpagerefpunct}\newblock
\usebibmacro{pageref}%
\newunit\newblock
\iftoggle{bbx:related}
{\usebibmacro{related:init}%
\usebibmacro{related}}
{}%
\usebibmacro{finentry}}
% Driver for @thesis entries.
% Order: author. year. title. type. school/institution/location/date. pages.
% isbn. doi/eprint/url. note.
\DeclareBibliographyDriver{thesis}{%
\usebibmacro{bibindex}%
\usebibmacro{begentry}%
\usebibmacro{author}%
\setunit{\labelnamepunct}\newblock
\usebibmacro{year}
\newunit
\usebibmacro{title}%
\newunit
\printlist{language}%
\newunit\newblock
\usebibmacro{byauthor}%
\newunit\newblock
\printfield{type}%
\newunit
\usebibmacro{institution+location+date}%
\newunit\newblock
\usebibmacro{chapter+pages}%
\newunit
\printfield{pagetotal}%
\newunit\newblock
\iftoggle{bbx:isbn}
{\printfield{isbn}}
{}%
\newunit\newblock
\usebibmacro{doi+eprint+url}%
\newunit\newblock
\usebibmacro{addendum+pubstate}%
\setunit{\bibpagerefpunct}\newblock
\usebibmacro{pageref}%
\newunit\newblock
% NOTE(review): note is printed after pageref here, unlike the other
% drivers where it precedes the identifiers -- confirm this is intended.
\printfield{note}%
\newunit\newblock
\iftoggle{bbx:related}
{\usebibmacro{related:init}%
\usebibmacro{related}}
{}%
\usebibmacro{finentry}}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,5 @@
\ProvidesFile{ACM-Reference-Format.cbx}[2017-09-27 v0.1]
\RequireCitationStyle{numeric}
\endinput

View File

@ -0,0 +1,18 @@
% Teach biblatex about numpages field
\DeclareDatamodelFields[type=field, datatype=literal]{numpages}
\DeclareDatamodelEntryfields{numpages}
% Teach biblatex about articleno field
\DeclareDatamodelFields[type=field, datatype=literal]{articleno}
\DeclareDatamodelEntryfields{articleno}
% Teach biblatex about urls field
\DeclareDatamodelFields[type=list, datatype=uri]{urls}
\DeclareDatamodelEntryfields{urls}
% Teach biblatex about school field
\DeclareDatamodelFields[type=list, datatype=literal]{school}
\DeclareDatamodelEntryfields[thesis]{school}
\DeclareDatamodelFields[type=field, datatype=literal]{key}
\DeclareDatamodelEntryfields{key}

View File

@ -0,0 +1,171 @@
You don't need to obsess over the structure or the wording; just focus on the concepts; think only about the outline rather than specific phrasing.
Q: Where can I find the semantics as defined for RA+ set operations
!Thinking about query results as (a giant pile of) monomials vs. (compressed representations of, e.g. factorized db is an example of) polynomials
(since our audience is PODS, we don't need to spend more than a line or two on this; needs to come out as EARLY as possible; this isn't entirely out of the blue;
the landscape changes for bags IF you think of the annotation in terms of a polynomial rather than a giant pile of monomials-->better than linear in the number
of monomials; don't tie this to specific structure but RATHER to the general flow of the text...
You want to get into this twice.
Focus on the 1st paragraph,
once you're done, get it to them
spend a lot of time on our contributions
also a table for sets/bags, data models, etc
===================================================================================================
BEGIN: Introduction Outline
1st Paragraph
-------------
-Motivation (Reader must be convinced that this problem is interesting from a DB perspective)
-In practice PDBs are bags
-Thus, it is relevant and interesting to explore PDBs from a bag perspective
-in practice, Production Databases (Postgres, Oracle, etc.) use bags; modern pdbs are slow since they are dealing with sets rather than bags
-Brief overview of the challenges in the very beginning--1st paragraph, be very economical with words
--COMPUTATIONS (efficiently) over the output polynomial, in our case, expectation
-focus on intuition
-what are the key points?
-one cannot generate a result better than linear time in the size of the polynomial; sets, are even harder than that--in
#P in sets; as a result if you assume that the result is given to you in SOP, then the naive method is the optimal;
however, factorized form of the polynomial allows for better results; runtime in the number of monomials vs. runtime in the
size of the polynomial is the same when the polynomial is given to you in DNF; however, they are not the same when given
compressed version(s) of the polynomial; this work looks into the case when the polynomial is NOT given to us in SOP;
Naive alg: generate all the monomials, and compute each of their probabilities
-why do people think bags are easy?
-???
THIS COULD BE BROUGHT OUT MORE -how does this tie into how do people approach implementing pdbs?
-for one, the customary rule of fixed data size on attributes has significantly influenced how folks implement PDBs, i.e., with polynomials in DNF
-how can we do better than the standard bar that most pdb use
-by accepting factorized polynomials as input
-don't worry about specifics
-Result--here is what we show (in gentle English and not too technical terms)
MAYBE VERIFY THIS IS EFFECTIVELY BROUGHT OUT -Computation over bags is hard, i.e. superlinear time if we are given a compressed/factorized db; on the other hand, when we use an approximation
algorithm, we get linear time
Can have another segue paragraph
After motivation up front, forget it all, and get into the nitty gritty
Typical Theory Paper Structure (Look at PODS papers you've read and see their structures):
-------------------------------
#Define the (mathematical) problem
-computing expectation over bag PDB query output polynomial
#Here are known results (you want to articulate why this problem (that you are addressing) is non-trivial)
-people have not really studied this
#Here are our results
-hard in the general case via a reduction to computing the number of 3-paths, 3-matchings, and triangles in an arbitrary graph
#Here are the techniques for our results
-the algorithm uniformly samples monomials from expression tree of the polynomial, approximating $\rpoly{Q}$.
-we perform an analysis of the approximation algorithm that proves linear time with confidence guarantees
2nd Paragraph:
-------------
-Somewhere we need to mention...
THIS WAS NOT INCLUDED IN THE ORIGINAL PASS OF INTRO-------v
-Interesting mathematically
-\tilde{Q} equivalence with \poly{Q} under \vct{X} \in {0, 1}^n
-what does this buy us, aside from being an interesting fact?
-\tilde{Q} is the expectation of Q under the above assumption and the additional assumption of independent variables in \vct{X}
-which allows us to build an approximation alg of \tilde{Q} for the purpose of estimating E[\poly{Q}]
-I may need to think about this more.
-the computation of 3-paths, 3-matchings, and triangles via a linear system and approximation of \tilde{Q}
-Thm 2.1 shows that such a computation is hard (superlinear) without approximation
-what does this buy us practically speaking?
-eeee, are we just saying that we can compute approximations of hard problems in linear time, but, that
should be a given I would think?
-???
-Interesting results
-???
-Why bags are more interesting than previously thought
-the landscape for bags changes when we think of the annotation as a factorized polynomial rather than a giant pile of monomials
-Describe the problem
-Computing expectation over bag PDBs
-Discuss hardness results
-using PJ queries over TIDB with all p_i = p is hard in general--link with thm 2.1
------------->I think here we either decide that there is only one subtlety we want to bring out to the forefront
and/or forget about listing subtleties and just talk about them in some decent order
-list the subtleties that we want to make VERY CLEAR and will subsequently detail in the next paragraphs
BE CERTAIN THAT THIS IS EXPLICITLY STATED -#2 clarify how you are counting the input size (the size of the db instance vs. the size of the query polynomial--which might be significantly larger or smaller
PLACEHOLDER IN THE TEXT FOR THE EXAMPLE than the former); this may be a good place to have an example
-better than linear time in the output
-since we take as input the polynomial encoding the query output
-by the fact that this output polynomial can be in factorized form
Q: -the above is the only subtlety that comes to mind currently
<-------------
DONE------------->ADD THE HISTORY
-Historical Overview
-why is this the way it is
-the customary fixed attribute data size rule
-existing work
-most systems (MayBMS, MystiQ, Orion, GProM, etc) use an encoding of possible tuples that is essentially
an enumerating through the monomials
-this classical approach disallows doing anything clever
-those that use a factorized encoding assume sets, as is the case for Sprout
SKIPPED--->with new encodings, the bag problem is actually hard in a non-obvious way
- a common convention in DBs is a fixed size on the size of a column, the size of the data
-if you know how big the tuple is, there are a bunch of optimizations that you can do
-you want to avoid the situation where the field gets too big
-BUT, annotations break this, since a projection (or join--less so, since you end up with an annotation that is linear in the number of joins) can give
you an annotation that is of arbitrary size greater (in the size of the data).
-therefore, every implemented pdb system (MystiQ, Sprout, etc.) really wants to avoid creating an arbitrarily sized annotation column
-take the provenance polynomial,
-flatten it into individual monomials
-store the individual monomials in a table
*PDB implementations have restricted themselves to a giant pile of monomials because of the fixed data requirement
-in the worst case, polynomial in the size of the input tables (the table sizes) to materialize all monomials
*Orchestra (Val Tannen, Zach Ives--take the earliest journal papers (any journal--SIGMOD Record paper perhaps?)),
Factorized Databases implement factorizations (SIGMOD 2012? Olteanu) cgpgrey (England/Northern Ireland)
-think about MayBMS
-
END HISTORY
CAN PERHAPS INCORPORATE THIS PART OF THE OUTLINE MORE STRONGLY IN THE BEGINNING PARAs
-Describe the subtlety of our scheme performing "better than linear in the output size"
-explicitly define our definition of 'hard' query in this setting
-hard is anything worse than linear time in the size of the SOP polynomial
-explain how the traditional 'set' setting for PDBs is an oversimplification
-note that most PDB implementations use DNF to model tuple polynomials, essentially an enumeration through the monomials
-thus computing the expectation (though trivial by linearity of expectation) is linear in the number of monomials
-limited results in complexity over PDBs in the bag setting
EEEEEEEEEEEEEEEEE-a good spot to discuss work in bags setting, but as noted below, I need to do a Lit Survey to add any more to this
bullet point
-the richness of the problem: lower, upper bound [factorized polynomial, sop polynomial]
-discuss the 'expression tree' representation of the polynomial, emphasizing the ability to model the factorized form of a polynomial
-we can factorize the polynomial, which produces an output polynomial which is smaller than the equivalent one in DNF, and
this gives us less than linear time.
EEEEEEEEEEEEEEEEEEEEEEEE-link this to work in 'factorized dbs'
-again, I need to reread the Olteanu Factorized DB paper
-motivating example(s)
-don't have any clearly worked out running examples at this point in time
END: Introduction Outline
===============================================================================================
*What other subtleties would be good to explicitly bring out here?
-this is a good question, perhaps either would be beneficial
-reread what we have written so far
-think about any other subtleties that need to be brought out
-think about why this problem is interesting from a mathematical perspective, and the mathematical results that we have
*Somewhere we need to list all the ways the annotation polynomial can be compressed*
-my understanding was that the factorization is limited to Products of Sums
-Boris' comment on element
-pushing projections down
-this is seen on p. 3 paragraph 2 of Factorisation of Provenance Polynomials
EEEEEEEEEEEEEEEEEEBackground Work: What have people done with PDBs? What have people done with Bag-PDBs?
-need to do a Lit Survey before tackling this

112
Sketching Worlds/Makefile Normal file
View File

@ -0,0 +1,112 @@
#
# Makefile for acmart package
#
# This file is in public domain
#
# $Id: Makefile,v 1.10 2016/04/14 21:55:57 boris Exp $
#
# NOTE(review): recipe lines below must be TAB-indented in the real Makefile;
# leading tabs appear to have been lost in this paste -- confirm before use.
PACKAGE=acmart
PDF = $(PACKAGE).pdf acmguide.pdf
# Default target: class documentation, user guide, and all sample PDFs.
all: ${PDF} ALLSAMPLES
# Typeset a .dtx into a PDF; "-" prefixes tolerate bibtex/makeindex failures.
# The final loop re-runs pdflatex until cross-references stabilize.
%.pdf: %.dtx $(PACKAGE).cls
pdflatex $<
- bibtex $*
pdflatex $<
- makeindex -s gind.ist -o $*.ind $*.idx
- makeindex -s gglo.ist -o $*.gls $*.glo
pdflatex $<
while ( grep -q '^LaTeX Warning: Label(s) may have changed' $*.log) \
do pdflatex $<; done
# The user guide is the same .dtx typeset under the "acmguide" jobname.
acmguide.pdf: $(PACKAGE).dtx $(PACKAGE).cls
pdflatex -jobname acmguide $(PACKAGE).dtx
- bibtex acmguide
pdflatex -jobname acmguide $(PACKAGE).dtx
while ( grep -q '^LaTeX Warning: Label(s) may have changed' acmguide.log) \
do pdflatex -jobname acmguide $(PACKAGE).dtx; done
# Extract the class file from the .ins installer.
%.cls: %.ins %.dtx
pdflatex $<
# Unpack the sample sources, then build each sample PDF via the rules below.
ALLSAMPLES:
cd samples; pdflatex samples.ins; cd ..
for texfile in samples/*.tex; do \
pdffile=$${texfile%.tex}.pdf; \
${MAKE} $$pdffile; \
done
# Copy build prerequisites into the samples directory.
samples/%: %
cp $^ samples
samples/$(PACKAGE).cls: $(PACKAGE).cls
samples/ACM-Reference-Format.bst: ACM-Reference-Format.bst
# Default sample build uses the pdflatex development format.
samples/%.pdf: samples/%.tex samples/$(PACKAGE).cls samples/ACM-Reference-Format.bst
cd $(dir $@) && pdflatex-dev $(notdir $<)
- cd $(dir $@) && bibtex $(notdir $(basename $<))
cd $(dir $@) && pdflatex-dev $(notdir $<)
cd $(dir $@) && pdflatex-dev $(notdir $<)
while ( grep -q '^LaTeX Warning: Label(s) may have changed' $(basename $<).log) \
do cd $(dir $@) && pdflatex-dev $(notdir $<); done
# Engine-specific samples override the pattern rule.
samples/sample-xelatex.pdf: samples/sample-xelatex.tex samples/$(PACKAGE).cls samples/ACM-Reference-Format.bst
cd $(dir $@) && xelatex-dev $(notdir $<)
- cd $(dir $@) && bibtex $(notdir $(basename $<))
cd $(dir $@) && xelatex-dev $(notdir $<)
cd $(dir $@) && xelatex-dev $(notdir $<)
while ( grep -q '^LaTeX Warning: Label(s) may have changed' $(basename $<).log) \
do cd $(dir $@) && xelatex-dev $(notdir $<); done
samples/sample-lualatex.pdf: samples/sample-lualatex.tex samples/$(PACKAGE).cls samples/ACM-Reference-Format.bst
cd $(dir $@) && lualatex-dev $(notdir $<)
- cd $(dir $@) && bibtex $(notdir $(basename $<))
cd $(dir $@) && lualatex-dev $(notdir $<)
cd $(dir $@) && lualatex-dev $(notdir $<)
while ( grep -q '^LaTeX Warning: Label(s) may have changed' $(basename $<).log) \
do cd $(dir $@) && lualatex-dev $(notdir $<); done
.PRECIOUS: $(PACKAGE).cfg $(PACKAGE).cls
# Remove typesetting byproducts but keep the PDFs.
docclean:
$(RM) *.log *.aux \
*.cfg *.glo *.idx *.toc \
*.ilg *.ind *.out *.lof \
*.lot *.bbl *.blg *.gls *.cut *.hd \
*.dvi *.ps *.thm *.tgz *.zip *.rpi \
samples/$(PACKAGE).cls samples/ACM-Reference-Format.bst \
samples/*.log samples/*.aux samples/*.out \
samples/*.bbl samples/*.blg samples/*.cut
# Also remove generated class and sample sources.
clean: docclean
$(RM) $(PACKAGE).cls \
samples/*.tex
# Also remove all generated PDFs.
distclean: clean
$(RM) *.pdf samples/sample-*.pdf
#
# Archive for the distribution. Includes typeset documentation
#
archive: all clean
COPYFILE_DISABLE=1 tar -C .. -czvf ../$(PACKAGE).tgz --exclude '*~' --exclude '*.tgz' --exclude '*.zip' --exclude CVS --exclude '.git*' $(PACKAGE); mv ../$(PACKAGE).tgz .
zip: all clean
zip -r $(PACKAGE).zip * -x '*~' -x '*.tgz' -x '*.zip' -x CVS -x 'CVS/*'
documents.zip: all docclean
zip -r $@ acmart.pdf acmguide.pdf samples *.cls ACM-Reference-Format.*
.PHONY: all ALLSAMPLES docclean clean distclean archive zip

311
Sketching Worlds/README Normal file
View File

@ -0,0 +1,311 @@
This package provides a class for typesetting publications of the
Association for Computing Machinery.
Your TeX distribution probably includes the latest released version of
this package. If you decide to install it yourself, please see the
Installation section of the User's Guide.
Please note that the version on Github is a development (or
experimental) version: please download it for testing new features.
The production version is the one on CTAN and ACM sites.
Changes
Version 1.83 Support for multilanguage papers
ISSN changes for some journals
Version 1.82 Bug fixes.
New command \anon for anonymization of short strings.
Documentation update.
Version 1.81 Bug fixes
New bib field distinctURL to print URL even if doi is present.
Reworded samples
Version 1.80 New journals: DLT, FAC
Version 1.79 Fixed pages with index
(https://github.com/borisveytsman/acmart/issues/440)
Updated information for TAP, TCPS, TEAC
Version 1.78 Documentation update.
Magic texcount comments for samples.
Title page now is split if there are too many authors
Bug fixes.
Version 1.77 Changed the way to typeset multiple affiliations (Christoph Sommer)
Version 1.76 Added many journal abbreviations to the bst.
New experimental option: pbalance
ORCID linking code
Version 1.75 \country is now obligatory for addresses.
Added \AtBeginMaketitle
Version 1.74 Bug fixes. A regression introduced in the font changes
is reverted.
Version 1.73 Bug fixes
The elements institution, city and country are now obligatory
for affiliations
Version 1.72 Bug fixes. Better handling of metadata.
Version 1.71 Bug fixes
Formats sigchi and sigchi-a are retired
Bibliography formatting changes for @inproceedings entries
having both series and volume
LuaLaTeX now uses the same OTF fonts as XeLaTeX
Version 1.70 Title change for ACM/IMS Transactions on Data Science
Bug fixes for bibliography
Version 1.69 Bug fixes
Compatibility with LaTeX 2020-02-02 release
Version 1.68 Bug fixes
BST now recognizes words `Paper' or `Article' in
eid or articleno
Version 1.67 Urgent bug fixes:
BibTeX style bug fixed (Michael D. Adams)
Sigplan special section bugfix
Version 1.66 Bug fixes
BibTeX change: location is now a synonym for city (Feras Saad)
ACM reference format is now mandatory for papers over one page.
CCS concepts and keywords are now mandatory for
papers over two pages.
Authors' addresses are mandatory for journal articles.
Version 1.65 Bug fixes
New journal: DGOV
DTRAP and HEALTH are now using acmlarge format
Version 1.64 Produce error if abstract is entered after maketitle
(previously abstract was silently dropped)
Bug fixes for line numbering
Version 1.63a Moved TQUANT to TQC
Version 1.63 New journals: TQUANT, FACMP
Version 1.62 Documentation update
New journal: TELO
Bug fixes
Version 1.61 Bug fixes
New bibtex types for artifacts
Version 1.60 New option: urlbreakonhyphens (thanks to Peter Kemp)
Smaller header size for acmsmall
Version 1.59 Now a journal format can be used for conference proceedings
All samples are now generated from the same .dtx file
Bug fixes
Version 1.58 Suppressed spurious warnings.
New journal: HEALTH.
TDSCI is renamed to TDS.
Version 1.57 Change of \baselinestretch now produces an error
Booktabs is now always loaded
Added option `balance' to balance last page in two-column mode
E-mail is no longer split in addresses
New samples (Stephen Spencer)
Version 1.56 Bug fixes
Added \flushbottom to two column formats (Philip Quinn)
The final punctuation for the list of concepts
is now a period instead of a semicolon (Philip Quinn)
New command \Description to describe images for visually
impaired users.
Version 1.55 Bug fixes
Font changes for SIGCHI table captions
Version 1.54 New option: 'nonacm' (Gabriel Scherer)
Deleted indent for subsubsection (suggested by Ross Moore)
Suppressed some obscure warnings in BibTeX processing
Suppressed hyperref warnings (Paolo G. Giarrusso)
New code for sections to help with accessibility patches
(Ross Moore)
Submission id, if present, is printed in anon mode
Bug fixes
Version 1.53 New journals: PACMCGIT, TIOT, TDSCI
Version 1.52 Another rewording of licenses
Version 1.51 Journal footers now use abbreviated journal titles.
Corrected the bug with acmPrice.
Do not show price when copyright is set to iw3c2w3 and iw3c2w3g.
The package now is compatible with polyglossia (Joachim Breitner).
Slightly reworded copyright statements.
Version 1.50 Changes in iw3c2w3 and iw3c2w3g
Version 1.49 New journal: DTRAP
Version 1.48 Bug fixes
Review mode now switches on folios
Code prettying (Michael D. Adams)
Bibliography changes: @MISC entries no longer have a
separate date
Sigchi-a sample bibliography renamed
Bib code cleanup (Zack Weinberg)
Acmart and version info are added to pdfcreator tag
\citeyear no longer produces parenthetical year
Added initial support for Biblatex (Daniel Thomas)
Added support for IW3C2 conferences
Version 1.47 New journal: THRI
Version 1.46 Bug fixes for bibliography: label width is now calculated
correctly.
All PACM now use screen option. This requires etoolbox.
Added subtitle to ACM reference format.
Now acmart is compatible with fontspec.
\thanks is now obsolete. The addresses are automatically
added to the journal version; this can be overridden with
\authorsaddresses command.
Deleted the rule at the end of frontmatter for all formats.
Deleted new line before doi in the reference format.
Reintegrated theorem code into acmart.dtx (Matthew Fluet)
Version 1.45 Workaround for a Libertine bug. Thanks to LianTze Lim
from Overleaf
Version 1.44 Bug fixes.
Empty DOI and ISBN suppress printing DOI or ISBN lines
Separated theorem code into acmthm.sty, loaded by default.
Article number can be set for proceedings.
New commands: \acmBooktitle, \editor.
Reference citation format updated.
Version 1.43 Bug fixes
Version 1.42 Deleted ACM badges
Bug fixes
Version 1.41 Rearranged bib files
Added new badges
Version 1.40 Bibliography changes
Added processing of one-component ccsdesc nodes
Bug fixes.
Made the height a multiple of \baselineskip + \topskip
Added cleveref
We no longer print street address in SIGs
Version 1.39 Added \authornotemark command
Version 1.38 Increase default font size for SIGPLAN
Version 1.37 Reduce list indentation (Matthew Fluet)
Version 1.36 Bug fixes
Moved PACMPL to acmlarge format
New journal: PACMHCI
Added the possibility to adjust number of author
boxes per row in conference formats
Version 1.35 Author-year bib style now uses square brackets.
Changed defaults for TOG sample
Price is suppressed for usgov and rightsretained modes.
Bugs fixed
Version 1.34 Deleted DOI from doi numbers
Changed bibstrip formatting
The command \terms is now obsolete
The rulers in review mode now have continuous numbering
Version 1.33 New option `timestamp' (Michael D. Adams)
New option `authordraft'
Documentation updates
Bug fixes
We now use Type 1 versions of Libertine fonts even with XeTeX.
New hook acmart-preload-hook.tex (wizards only!)
Added new options `obeypunctuation' for \affiliation command
Added SubmissionID
Added right line count ruler for two-column formats
Added workaround for Adobe Acrobat bugs in selection
Added eid field to the bibliography
Version 1.32 New DOI formatting.
Format siggraph is now obsolete, and sigconf
is used instead.
New proceedings title: POMACS.
Version 1.31 Changed default year and month to the current ones
(thanks to Matteo Riondato)
Table of contents now works
Marginalia now work in all formats
New command \additionalaffiliation
Documentation changes
Version 1.30 Bibtex style now recognizes https:// in doi.
Added \frenchspacing.
\department now has an optional hierarchy level.
Switched to T1 encoding
Updated IMWUT and PACMPL
Version 1.29 Documentation changes. Head height increased from 12pt to 13pt.
Removed spurious indent at start of abstract.
Improved kerning in CCS description list.
Version 1.28 Bug fixes: natbib=false now behaves correctly.
Version 1.27 Bug fixes
Version 1.26 Bug fixes
Version 1.25 Updated PACMPL journal option.
Version 1.24 Added IMWUT journal option.
Version 1.23 Added PACM PL journal option.
Version 1.22 Bibliography changes for Aptara backend; should be
invisible for the users.
Version 1.21 Bibliography changes: added arXiv, some cleanup
Version 1.20 Bug fixes, documentation updates
Version 1.19 Include 'Abstract', 'Acknowledgements', and 'References'
in PDF bookmarks.
Version 1.18 Natbib is now the default for all versions. A unified bib
file is used for all styles. Better treatment
of multiple affiliations.
Version 1.17 Formatting changes for margins and lists. Bug fixes.
Version 1.16 Formatting changes for headers and footers.
Version 1.15 New structured affiliation command.
New commands for acknowledgements.
Version 1.14 Warn about undefined citation styles; move definitions
of acmauthoryear and acmnumeric citation styles before
use.
Version 1.13 Formatting changes: headers, folios etc.
Bibliography changes.
Version 1.12 Bug fixes and documentation updates.
Footnotes rearranged.
Option natbib is now mostly superfluous: the class
makes a guess based on the format chosen.
Version 1.11 Customization of ACM theorem styles and proof
environment (Matthew Fluet).
Version 1.10 Bug fixes
Version 1.09 SIGPLAN: revert caption rules (Matthew Fluet)
Version 1.08 SIGPLAN reformatting (Matthew Fluet); bug fixes

343
Sketching Worlds/aaron.bib Normal file
View File

@ -0,0 +1,343 @@
@misc{pdbench,
title = {pdbench},
howpublished = {\url{http://pdbench.sourceforge.net/}},
note = {Accessed: 2020-12-15}
}
@MISC{Antova_fastand,
author = {Lyublena Antova and Thomas Jansen and Christoph Koch and Dan Olteanu},
title = {Fast and Simple Relational Processing of Uncertain Data},
year = {}
}
@book{DBLP:series/synthesis/2011Suciu,
author = {Dan Suciu and
Dan Olteanu and
Christopher R{\'{e}} and
Christoph Koch},
title = {Probabilistic Databases},
series = {Synthesis Lectures on Data Management},
publisher = {Morgan {\&} Claypool Publishers},
year = {2011},
url = {https://doi.org/10.2200/S00362ED1V01Y201105DTM016},
doi = {10.2200/S00362ED1V01Y201105DTM016},
timestamp = {Tue, 16 May 2017 14:24:20 +0200},
biburl = {https://dblp.org/rec/series/synthesis/2011Suciu.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{10.1145/1265530.1265571,
author = {Dalvi, Nilesh and Suciu, Dan},
title = {The Dichotomy of Conjunctive Queries on Probabilistic Structures},
year = {2007},
isbn = {9781595936851},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi-org.gate.lib.buffalo.edu/10.1145/1265530.1265571},
doi = {10.1145/1265530.1265571},
abstract = {We show that for every conjunctive query, the complexity of evaluating it on a probabilistic database is either PTIME or P-complete, and we give an algorithm for deciding whether a given conjunctive query is PTIME or P-complete. The dichotomy property is a fundamental result on query evaluation on probabilistic databases and it gives a complete classification of the complexity of conjunctive queries.},
booktitle = {Proceedings of the Twenty-Sixth ACM SIGMOD-SIGACT-SIGART Symposium on Principles of Database Systems},
pages = {293--302},
numpages = {10},
keywords = {probabilistic databases, dichotomy, conjunctive queries},
location = {Beijing, China},
series = {PODS '07}
}
@inproceedings{DBLP:conf/icde/OlteanuHK10,
author = {Dan Olteanu and
Jiewen Huang and
Christoph Koch},
editor = {Feifei Li and
Mirella M. Moro and
Shahram Ghandeharizadeh and
Jayant R. Haritsa and
Gerhard Weikum and
Michael J. Carey and
Fabio Casati and
Edward Y. Chang and
Ioana Manolescu and
Sharad Mehrotra and
Umeshwar Dayal and
Vassilis J. Tsotras},
title = {Approximate confidence computation in probabilistic databases},
booktitle = {Proceedings of the 26th International Conference on Data Engineering,
{ICDE} 2010, March 1-6, 2010, Long Beach, California, {USA}},
pages = {145--156},
publisher = {{IEEE} Computer Society},
year = {2010},
url = {https://doi.org/10.1109/ICDE.2010.5447826},
doi = {10.1109/ICDE.2010.5447826},
timestamp = {Wed, 16 Oct 2019 14:14:56 +0200},
biburl = {https://dblp.org/rec/conf/icde/OlteanuHK10.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/icde/AntovaKO07a,
author = {Lyublena Antova and
Christoph Koch and
Dan Olteanu},
editor = {Rada Chirkova and
Asuman Dogac and
M. Tamer {\"{O}}zsu and
Timos K. Sellis},
title = {MayBMS: Managing Incomplete Information with Probabilistic World-Set
Decompositions},
booktitle = {Proceedings of the 23rd International Conference on Data Engineering,
{ICDE} 2007, The Marmara Hotel, Istanbul, Turkey, April 15-20, 2007},
pages = {1479--1480},
publisher = {{IEEE} Computer Society},
year = {2007},
url = {https://doi.org/10.1109/ICDE.2007.369042},
doi = {10.1109/ICDE.2007.369042},
timestamp = {Wed, 16 Oct 2019 14:14:56 +0200},
biburl = {https://dblp.org/rec/conf/icde/AntovaKO07a.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/sigmod/BoulosDMMRS05,
author = {Jihad Boulos and
Nilesh N. Dalvi and
Bhushan Mandhani and
Shobhit Mathur and
Christopher R{\'{e}} and
Dan Suciu},
editor = {Fatma {\"{O}}zcan},
title = {{MYSTIQ:} a system for finding more answers by using probabilities},
booktitle = {Proceedings of the {ACM} {SIGMOD} International Conference on Management
of Data, Baltimore, Maryland, USA, June 14-16, 2005},
pages = {891--893},
publisher = {{ACM}},
year = {2005},
url = {https://doi.org/10.1145/1066157.1066277},
doi = {10.1145/1066157.1066277},
timestamp = {Tue, 06 Nov 2018 11:07:39 +0100},
biburl = {https://dblp.org/rec/conf/sigmod/BoulosDMMRS05.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/sigmod/SinghMMPHS08,
author = {Sarvjeet Singh and
Chris Mayfield and
Sagar Mittal and
Sunil Prabhakar and
Susanne E. Hambrusch and
Rahul Shah},
editor = {Jason Tsong{-}Li Wang},
title = {Orion 2.0: native support for uncertain data},
booktitle = {Proceedings of the {ACM} {SIGMOD} International Conference on Management
of Data, {SIGMOD} 2008, Vancouver, BC, Canada, June 10-12, 2008},
pages = {1239--1242},
publisher = {{ACM}},
year = {2008},
url = {https://doi.org/10.1145/1376616.1376744},
doi = {10.1145/1376616.1376744},
timestamp = {Tue, 06 Nov 2018 11:07:37 +0100},
biburl = {https://dblp.org/rec/conf/sigmod/SinghMMPHS08.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{AF18,
author = {Arab, Bahareh and Feng, Su and Glavic, Boris and Lee, Seokki and Niu, Xing and Zeng, Qitian},
bibsource = {dblp computer science bibliography, https://dblp.org},
biburl = {https://dblp.org/rec/bib/journals/debu/ArabFGLNZ17},
journal = {{IEEE} Data Engineering Bulletin},
keywords = {GProM; Provenance; Annotations},
number = {1},
pages = {51--62},
pdfurl = {http://sites.computer.org/debull/A18mar/p51.pdf},
projects = {GProM; Reenactment},
timestamp = {Fri, 02 Mar 2018 18:50:49 +0100},
title = {{GProM} - {A} Swiss Army Knife for Your Provenance Needs},
venueshort = {Data Eng. Bull.},
volume = {41},
year = {2018},
bdsk-url-1 = {http://sites.computer.org/debull/A18mar/p51.pdf}
}
@article{10.1145/3003665.3003667,
author = {Olteanu, Dan and Schleich, Maximilian},
title = {Factorized Databases},
year = {2016},
issue_date = {June 2016},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
volume = {45},
number = {2},
issn = {0163-5808},
url = {https://doi.org/10.1145/3003665.3003667},
doi = {10.1145/3003665.3003667},
abstract = {This paper overviews factorized databases and their application to machine learning. The key observation underlying this work is that state-of-the-art relational query processing entails a high degree of redundancy in the computation and representation of query results. This redundancy can be avoided and is not necessary for subsequent analytics such as learning regression models.},
journal = {SIGMOD Rec.},
month = sep,
pages = {5--16},
numpages = {12}
}
@inproceedings{DBLP:conf/tapp/Zavodny11,
author = {Jakub Z{\'{a}}vodn{\'{y}}},
editor = {Peter Buneman and
Juliana Freire},
title = {On Factorisation of Provenance Polynomials},
booktitle = {3rd Workshop on the Theory and Practice of Provenance, TaPP'11, Heraklion,
Crete, Greece, June 20-21, 2011},
publisher = {{USENIX} Association},
year = {2011},
url = {https://www.usenix.org/conference/tapp11/factorisation-provenance-polynomials},
timestamp = {Wed, 04 Jul 2018 13:06:34 +0200},
biburl = {https://dblp.org/rec/conf/tapp/Zavodny11.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{k-match,
author = {Radu Curticapean},
editor = {Fedor V. Fomin and
Rusins Freivalds and
Marta Z. Kwiatkowska and
David Peleg},
title = {Counting Matchings of Size k Is W[1]-Hard},
booktitle = {Automata, Languages, and Programming - 40th International Colloquium,
{ICALP} 2013, Riga, Latvia, July 8-12, 2013, Proceedings, Part {I}},
series = {Lecture Notes in Computer Science},
volume = {7965},
pages = {352--363},
publisher = {Springer},
year = {2013},
url = {https://doi.org/10.1007/978-3-642-39206-1\_30},
doi = {10.1007/978-3-642-39206-1\_30},
timestamp = {Tue, 14 May 2019 10:00:44 +0200},
biburl = {https://dblp.org/rec/conf/icalp/Curticapean13.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{kennedy:2010:icde:pip,
author = {Kennedy, Oliver and Koch, Christoph},
title = {PIP: A Database System for Great and Small Expectations},
booktitle = {ICDE},
year = {2010}
}
@inproceedings{DBLP:conf/vldb/AgrawalBSHNSW06,
author = {Parag Agrawal and
Omar Benjelloun and
Anish Das Sarma and
Chris Hayworth and
Shubha U. Nabar and
Tomoe Sugihara and
Jennifer Widom},
title = {Trio: {A} System for Data, Uncertainty, and Lineage},
booktitle = {{VLDB}},
pages = {1151--1154},
publisher = {{ACM}},
year = {2006}
}
@inproceedings{feng:2019:sigmod:uncertainty,
author = {Feng, Su and Huber, Aaron and Glavic, Boris and Kennedy, Oliver},
title = {Uncertainty Annotated Databases - A Lightweight Approach for Approximating Certain Answers},
booktitle = {SIGMOD},
year = {2019}
}
@article{DBLP:journals/vldb/FinkHO13,
author = {Robert Fink and
Jiewen Huang and
Dan Olteanu},
title = {Anytime approximation in probabilistic databases},
journal = {{VLDB} J.},
volume = {22},
number = {6},
pages = {823--848},
year = {2013}
}
@inproceedings{DBLP:conf/pods/KhamisNR16,
author = {Mahmoud Abo Khamis and
Hung Q. Ngo and
Atri Rudra},
title = {{FAQ:} Questions Asked Frequently},
booktitle = {{PODS}},
pages = {13--28},
publisher = {{ACM}},
year = {2016}
}
@inproceedings{DBLP:conf/pods/GreenKT07,
author = {Todd J. Green and
Gregory Karvounarakis and
Val Tannen},
title = {Provenance semirings},
booktitle = {{PODS}},
pages = {31--40},
publisher = {{ACM}},
year = {2007}
}
@article{DBLP:journals/sigmod/GuagliardoL17,
author = {Paolo Guagliardo and
Leonid Libkin},
title = {Correctness of {SQL} Queries on Databases with Nulls},
journal = {{SIGMOD} Rec.},
volume = {46},
number = {3},
pages = {5--16},
year = {2017}
}
@inproceedings{GL16,
author = {Paolo Guagliardo and
Leonid Libkin},
booktitle = {PODS},
title = {Making SQL Queries Correct on Incomplete Databases: A Feasibility
Study},
year = {2016}
}
@inproceedings{jampani2008mcdb,
author = {Jampani, Ravi and Xu, Fei and Wu, Mingxi and Perez, Luis Leopoldo and Jermaine, Christopher and Haas, Peter J},
booktitle = {SIGMOD},
title = {MCDB: a monte carlo approach to managing uncertain data},
year = {2008}
}
@article{yang:2015:pvldb:lenses,
author = {Yang, Ying and Meneghetti, Niccolò and Fehling, Ronny and Liu, Zhen Hua and Gawlick, Dieter and Kennedy, Oliver},
title = {Lenses: An On-Demand Approach to ETL},
journal = {pVLDB},
volume = {8},
number = {12},
year = {2015},
pages = {1578--1589}
}
@book{DBLP:books/daglib/0020812,
author = {Hector Garcia{-}Molina and
Jeffrey D. Ullman and
Jennifer Widom},
title = {Database systems - the complete book {(2.} ed.)},
publisher = {Pearson Education},
year = {2009}
}

View File

@ -0,0 +1,13 @@
%root: main.tex
%!TEX root=./main.tex
In this work, we study the problem of computing a tuple's expected multiplicity over probabilistic databases with bag semantics (where each tuple is associated with a multiplicity) exactly and approximately.
We consider bag-\abbrTIDB\xplural where we have a bound $\bound$ on the maximum multiplicity of each tuple and tuples are independent probabilistic events (we refer to such databases as \abbrCTIDB\xplural).
We are specifically interested in the fine-grained complexity of computing expected multiplicities and how it compares to the complexity of deterministic query evaluation algorithms --- if these complexities are comparable, it opens the door to practical deployment of probabilistic databases.
Unfortunately, our results imply that computing expected multiplicities for \abbrCTIDB\xplural based on the results produced by such query evaluation algorithms introduces super-linear overhead (under parameterized complexity hardness assumptions/conjectures).
We proceed to study approximation of expected result tuple multiplicities for positive relational algebra queries ($\raPlus$) over \abbrCTIDB\xplural and for a non-trivial subclass of block-independent databases (\abbrBIDB\xplural).
We develop a sampling algorithm that computes a $(1 \pm \epsilon)$-approximation of the expected multiplicity of an output tuple in time linear in the runtime of the corresponding deterministic query for any $\raPlus$ query.
%%% Local Variables:
%%% mode: latex
%%% TeX-master: "main"
%%% End:

View File

@ -0,0 +1,3 @@
%root: main.tex
\section{Acknowledgements}
We thank Virginia Williams for showing us \Cref{eq:3p-3tri}, which greatly simplified our earlier proof of Lemma 3.8, and for graciously allowing us to use it.

View File

@ -0,0 +1,95 @@
@Misc{TeXFAQ,
title = {{UK} List of {\TeX} Frequently Asked Questions},
author = {{UK \TeX{} Users Group}},
year = 2019,
howpublished = {\url{https://texfaq.org}}
}
@Manual{Downes04:amsart,
title = {The \textsf{amsart}, \textsf{amsproc}, and
\textsf{amsbook} document~classes},
author = {Michael Downes and Barbara Beeton},
organization = {American Mathematical Society},
year = 2004,
month = aug,
note = {\url{http://www.ctan.org/pkg/amslatex}}
}
@Manual{Fiorio15,
title = {{a}lgorithm2e.sty---package for algorithms},
author = {Christophe Fiorio},
year = 2015,
month = oct,
note = {\url{http://www.ctan.org/pkg/algorithm2e}}
}
@Manual{Brito09,
title = {The algorithms bundle},
author = {Rog\'erio Brito},
year = 2009,
month = aug,
note = {\url{http://www.ctan.org/pkg/algorithms}}
}
@Manual{Heinz15,
title = {The Listings Package},
author = {Carsten Heinz and Brooks Moses and Jobst Hoffmann},
year = 2015,
month = jun,
note = {\url{http://www.ctan.org/pkg/listings}}
}
@Manual{Fear05,
title = {Publication quality tables in {\LaTeX}},
author = {Simon Fear},
year = 2005,
month = apr,
note = {\url{http://www.ctan.org/pkg/booktabs}}
}
@Manual{ACMIdentityStandards,
title = {{ACM} Visual Identity Standards},
organization = {Association for Computing Machinery},
year = 2007,
note = {\url{http://identitystandards.acm.org}}
}
@Manual{Sommerfeldt13:Subcaption,
title = {The subcaption package},
author = {Axel Sommerfeldt},
year = 2013,
month = apr,
note = {\url{http://www.ctan.org/pkg/subcaption}}
}
@Manual{Nomencl,
title = {A package to create a nomenclature},
author = {Boris Veytsman and Bern Schandl and Lee Netherton
and C. V. Radhakrishnan},
year = 2005,
month = sep,
note = {\url{http://www.ctan.org/pkg/nomencl}}
}
@Manual{Talbot16:Glossaries,
title = {User Manual for glossaries.sty v4.44},
author = {Nicola L. C. Talbot},
year = 2019,
month = dec,
note = {\url{http://www.ctan.org/pkg/glossaries}}
}
@Manual{Carlisle04:Textcase,
title = {The \textsl{textcase} package},
author = {David Carlisle},
month = oct,
year = 2004,
note = {\url{http://www.ctan.org/pkg/textcase}}
}
@Manual{Braams22:Babel,
title = {Babel},
author = {Johannes L. Braams and Javier Bezos},
year = 2022,
note = {\url{http://www.ctan.org/pkg/babel}}}

7934
Sketching Worlds/acmart.dtx Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,30 @@
%
% Doctrip file for acmart
% This file is in public domain
% $Id: acmart.ins,v 1.1 2015/11/23 22:42:55 boris Exp $
%
\def\batchfile{acmart.ins}
\input docstrip
\keepsilent
\showprogress
\askforoverwritefalse
\generate{%
\file{acmart.cls}{\from{acmart.dtx}{class}}
}
\obeyspaces
\Msg{*****************************************************}%
\Msg{* Congratulations! You successfully generated the *}%
\Msg{* acmart package. *}%
\Msg{* *}%
\Msg{* Please move the file acmart.cls to where LaTeX *}%
\Msg{* files are stored in your system. The manual is *}%
\Msg{* acmart.pdf. *}%
\Msg{* *}%
\Msg{* The package is released under LPPL *}%
\Msg{* *}%
\Msg{* Happy TeXing! *}%
\Msg{*****************************************************}%

View File

@ -0,0 +1,197 @@
%root: main.tex
The following results assume input circuit \circuit computed from an arbitrary $\raPlus$ query $\query$ and arbitrary \abbrBIDB $\pdb$. We refer to \circuit as a \abbrBIDB circuit.
\begin{Theorem}\label{lem:approx-alg}
Let \circuit be an arbitrary \abbrBIDB circuit
and define $\poly(\vct{X})=\polyf(\circuit)$ and let $k=\degree(\circuit)$.
Then an estimate $\mathcal{E}$ of $\rpoly(\prob_1,\ldots, \prob_\numvar)$ can be computed in time
{\small
\[O\left(\left(\size(\circuit) + \frac{\log{\frac{1}{\conf}}\cdot \abs{\circuit}^2(1,\ldots, 1)\cdot k\cdot \log{k} \cdot \depth(\circuit)}{\inparen{\error}^2\cdot\rpoly^2(\prob_1,\ldots, \prob_\numvar)}\right)\cdot\multc{\log\left(\abs{\circuit}(1,\ldots, 1)\right)}{\log\left(\size(\circuit)\right)}\right)\]
}
such that
\begin{equation}
\label{eq:approx-algo-bound}
\probOf\left(\left|\mathcal{E} - \rpoly(\prob_1,\dots,\prob_\numvar)\right|> \error \cdot \rpoly(\prob_1,\dots,\prob_\numvar)\right) \leq \conf.
\end{equation}
\end{Theorem}
The slight abuse of notation seen in $\abs{\circuit}\inparen{1,\ldots,1}$ is explained after \Cref{def:positive-circuit} and an example is given in \Cref{ex:def-pos-circ}. The only difference in the use of this notation in \Cref{lem:approx-alg} is that we include an additional exponent to square the quantity.
\subsection{Proof of Theorem \ref{lem:approx-alg}}\label{sec:proof-lem-approx-alg}
\input{app_approx_alg-pseudo-code}
We prove \Cref{lem:approx-alg} constructively by presenting an algorithm \approxq (\Cref{alg:mon-sam}) which has the desired runtime and computes an approximation with the desired approximation guarantee. Algorithm \approxq uses Algorithm \onepass to compute weights on the edges of a circuit. These weights are then used to sample a set of monomials of $\poly(\circuit)$ from the circuit $\circuit$ by traversing the circuit using the weights to ensure that monomials are sampled with an appropriate probability. The correctness of \approxq relies on the correctness (and runtime behavior) of auxiliary algorithms \onepass and \sampmon that we state in the following lemmas (and prove later in this part of the appendix).
\begin{Lemma}\label{lem:one-pass}
The $\onepass$ function completes in time:
$$O\left(\size(\circuit) \cdot \multc{\log\left(\abs{\circuit}(1,\ldots, 1)\right)}{\log{\size(\circuit)}}\right)$$
$\onepass$ guarantees two post-conditions: First, for each subcircuit $\vari{S}$ of $\circuit$, we have that $\vari{S}.\vari{partial}$ is set to $\abs{\vari{S}}(1,\ldots, 1)$. Second, when $\vari{S}.\type = \circplus$, \subcircuit.\lwght $= \frac{\abs{\subcircuit_\linput}(1,\ldots, 1)}{\abs{\subcircuit}(1,\ldots, 1)}$ and likewise for \subcircuit.\rwght.
\end{Lemma}
To prove correctness of \Cref{alg:mon-sam}, we only use the following fact that follows from the above lemma: for the modified circuit ($\circuit_{\vari{mod}}$) output by \onepass, $\circuit_{\vari{mod}}.\vari{partial}=\abs{\circuit}(1,\dots,1)$.
\begin{Lemma}\label{lem:sample}
The function $\sampmon$ completes in time
$$O(\log{k} \cdot k \cdot \depth(\circuit)\cdot\multc{\log\left(\abs{\circuit}(1,\ldots, 1)\right)}{\log{\size(\circuit)}})$$
where $k = \degree(\circuit)$. The function returns every $\left(\monom, sign(\coef)\right)$ for $(\monom, \coef)\in \expansion{\circuit}$ with probability $\frac{|\coef|}{\abs{\circuit}(1,\ldots, 1)}$.
\end{Lemma}
With the above two lemmas, we are ready to argue the following result:
\begin{Theorem}\label{lem:mon-samp}
For any $\circuit$ with
$\degree(\polyf(\abs{\circuit})) = k$, \Cref{alg:mon-sam} outputs an estimate $\vari{acc}$ of $\rpoly(\prob_1,\ldots, \prob_\numvar)$ such that
\[\probOf\left(\left|\vari{acc} - \rpoly(\prob_1,\ldots, \prob_\numvar)\right|> \error \cdot \abs{\circuit}(1,\ldots, 1)\right) \leq \conf,\]
in $O\left(\left(\size(\circuit)+\frac{\log{\frac{1}{\conf}}}{\error^2} \cdot k \cdot\log{k} \cdot \depth(\circuit)\right)\cdot \multc{\log\left(\abs{\circuit}(1,\ldots, 1)\right)}{\log{\size(\circuit)}}\right)$ time.
\end{Theorem}
Before proving \Cref{lem:mon-samp}, we use it to argue the claimed runtime of our main result, \Cref{lem:approx-alg}.
\begin{proof}[Proof of \Cref{lem:approx-alg}]
Set $\mathcal{E}=\approxq({\circuit}, (\prob_1,\dots,\prob_\numvar),$ $\conf, \error')$, where
\[\error' = \error \cdot \frac{\rpoly(\prob_1,\ldots, \prob_\numvar)}{\abs{{\circuit}}(1,\ldots, 1)},\]
which achieves the claimed error bound on $\mathcal{E}$ (\vari{acc}) trivially due to the assignment to $\error'$ and \cref{lem:mon-samp}, since $\error' \cdot \abs{\circuit}(1,\ldots, 1) = \error\cdot\frac{\rpoly(\prob_1,\ldots, \prob_\numvar)}{\abs{\circuit}(1,\ldots, 1)} \cdot \abs{\circuit}(1,\ldots, 1) = \error\cdot\rpoly(\prob_1,\ldots, \prob_\numvar)$.
The claim on the runtime follows from \Cref{lem:mon-samp} since
\begin{align*}
\frac 1{\inparen{\error'}^2}\cdot \log\inparen{\frac 1\conf}=&\frac{\log{\frac{1}{\conf}}}{\error^2 \left(\frac{\rpoly(\prob_1,\ldots, \prob_N)}{\abs{{\circuit}}(1,\ldots, 1)}\right)^2}\\
= &\frac{\log{\frac{1}{\conf}}\cdot \abs{{\circuit}}^2(1,\ldots, 1)}{\error^2 \cdot \rpoly^2(\prob_1,\ldots, \prob_\numvar)}.
\end{align*}
\qed
\end{proof}
Let us now prove \Cref{lem:mon-samp}:
\subsection{Proof of Theorem \ref{lem:mon-samp}}\label{app:subsec-th-mon-samp}
\begin{proof}
Consider now the random variables $\randvar_1,\dots,\randvar_\numsamp$, where each $\randvar_\vari{i}$ is the value of $\vari{Y}_{\vari{i}}$ in \cref{alg:mon-sam} after \cref{alg:mon-sam-product} is executed. Overloading $\isInd{\cdot}$ to receive monomial input (recall $\encMon$ is the monomial composed of the variables in the set $\monom$), we have
\[\randvar_\vari{i}= \indicator{\inparen{\isInd{\encMon}}}\cdot \prod_{X_i\in \var\inparen{v}} p_i,\]
where the indicator variable handles the check in \Cref{alg:check-duplicate-block}.
Then for random variable $\randvar_i$, it is the case that
\begin{align*}
\expct\pbox{\randvar_\vari{i}} &= \sum\limits_{(\monom, \coef) \in \expansion{{\circuit}} }\frac{\indicator{\inparen{\isInd{\encMon}}}\cdot \coef\cdot\prod_{X_i\in \var\inparen{v}} p_i }{\abs{{\circuit}}(1,\dots,1)} \\
&= \frac{\rpoly(\prob_1,\ldots, \prob_\numvar)}{\abs{{\circuit}}(1,\ldots, 1)},
\end{align*}
where in the first equality we use the fact that $\vari{sgn}_{\vari{i}}\cdot \abs{\coef}=\coef$ and the second equality follows from \Cref{eq:tilde-Q-bi} with $X_i$ substituted by $\prob_i$.
Let $\empmean = \frac{1}{\samplesize}\sum_{i = 1}^{\samplesize}\randvar_\vari{i}$. It is also true that
\[\expct\pbox{\empmean}
= \frac{1}{\samplesize}\sum_{i = 1}^{\samplesize}\expct\pbox{\randvar_\vari{i}}
= \frac{\rpoly(\prob_1,\ldots, \prob_\numvar)}{\abs{{\circuit}}(1,\ldots, 1)}.\]
Hoeffding's inequality states that if we know that each $\randvar_i$ (which are all independent) always lie in the intervals $[a_i, b_i]$, then it is true that
\begin{equation*}
\probOf\left(\left|\empmean - \expct\pbox{\empmean}\right| \geq \error\right) \leq 2\exp{\left(-\frac{2\samplesize^2\error^2}{\sum_{i = 1}^{\samplesize}(b_i -a_i)^2}\right)}.
\end{equation*}
Line~\ref{alg:mon-sam-sample} shows that $\vari{sgn}_\vari{i}$ has a value in $\{-1, 1\}$ that is multiplied with $O(k)$ $\prob_i\in [0, 1]$, which implies the range for each $\randvar_i$ is $[-1, 1]$.
Using Hoeffding's inequality, we then get:
\begin{equation*}
\probOf\left(~\left| \empmean - \expct\pbox{\empmean} ~\right| \geq \error\right) \leq 2\exp{\left(-\frac{2\samplesize^2\error^2}{2^2 \samplesize}\right)} = 2\exp{\left(-\frac{\samplesize\error^2}{2 }\right)}\leq \conf,
\end{equation*}
where the last inequality dictates our choice of $\samplesize$ in \Cref{alg:mon-sam-global2}.
For the claimed probability bound of $\probOf\left(\left|\vari{acc} - \rpoly(\prob_1,\ldots, \prob_\numvar)\right|> \error \cdot \abs{\circuit}(1,\ldots, 1)\right) \leq \conf$, note that in the algorithm, \vari{acc} is exactly $\empmean \cdot \abs{\circuit}(1,\ldots, 1)$. Multiplying the rest of the terms by the additional factor $\abs{\circuit}(1,\ldots, 1)$ yields the said bound.
This concludes the proof for the first claim of \Cref{lem:mon-samp}. Next, we prove the claim on the runtime.
\paragraph*{Run-time Analysis}
The runtime of the algorithm is dominated first by \Cref{alg:mon-sam-onepass} (which by \Cref{lem:one-pass} takes time $O\left({\size(\circuit)}\cdot \multc{\log\left(\abs{\circuit}(1,\ldots, 1)\right)}{\log\left(\size(\circuit)\right)}\right)$) and then by $\samplesize$ iterations of the loop in \Cref{alg:sampling-loop}. Each iteration's run time is dominated by the call to \sampmon in \Cref{alg:mon-sam-sample} (which by \Cref{lem:sample} takes $O\left(\log{k} \cdot k \cdot {\depth(\circuit)}\cdot \multc{\log\left(\abs{\circuit}(1,\ldots, 1)\right)}{\log\left(\size(\circuit)\right)}\right)$
) and the check \Cref{alg:check-duplicate-block}, which by the subsequent argument takes $O(k\log{k})$ time. We sort the $O(k)$ variables by their block IDs and then check if there is a duplicate block ID or not. Combining all the times discussed here gives us the desired overall runtime.
\qed
\end{proof}
\subsection{Proof of \Cref{cor:approx-algo-const-p}}
\begin{proof}
The result follows by first noting that by definition of $\gamma$, we have
\[\rpoly(1,\dots,1)= (1-\gamma)\cdot \abs{{\circuit}}(1,\dots,1).\]
Further, since each $\prob_i\ge \prob_0$ and $\poly(\vct{X})$ (and hence $\rpoly(\vct{X})$) has degree at most $k$, we have that
\[ \rpoly(\prob_1,\dots,\prob_\numvar) \ge \prob_0^k\cdot \rpoly(1,\dots,1).\]
The above two inequalities imply $\rpoly(\prob_1,\dots,\prob_\numvar) \ge \prob_0^k\cdot (1-\gamma)\cdot \abs{{\circuit}}(1,\dots,1)$.
Applying this bound in the runtime bound in \Cref{lem:approx-alg} gives the first claimed runtime. The final runtime of $O_k\left(\frac 1{\eps^2}\cdot\size(\circuit)\cdot \log{\frac{1}{\conf}}\cdot \multc{\log\left(\abs{\circuit}(1,\ldots, 1)\right)}{\log\left(\size(\circuit)\right)}\right)$ follows by noting that $\depth({\circuit})\le \size({\circuit})$ and absorbing all factors that just depend on $k$.
\qed
\end{proof}
\subsection{Proof of~\Cref{lem:ctidb-gamma}}
\begin{proof}
The circuit \circuit' is built from \circuit in the following manner. For each input gate $\gate_i$ with $\gate_i.\val = X_\tup$, replace $\gate_i$ with the circuit \subcircuit encoding the sum $\sum_{j = 1}^\bound j\cdot X_{\tup, j}$. We argue that \circuit' is a valid circuit by the following facts. Let $\pdb = \inparen{\worlds, \bpd}$ be the original \abbrCTIDB \circuit was generated from. Then, by~\Cref{prop:ctidb-reduct} there exists a \abbrOneBIDB $\pdb' = \inparen{\onebidbworlds{\tupset'}, \bpd'}$, with $\tupset' = \inset{\intup{\tup, j}~|~\tup\in\tupset, j\in\pbox{\bound}}$, from which the conversion from \circuit to \circuit' follows. Both $\polyf\inparen{\circuit}$ and $\polyf\inparen{\circuit'}$ have the same expected multiplicity since (by~\Cref{prop:ctidb-reduct}) the distributions $\bpd$ and $\bpd'$ are equivalent and each $j\cdot\worldvec'_{\tup, j} = \worldvec_\tup$ for $\worldvec'\in\inset{0, 1}^{\bound\numvar}$ and $\worldvec\in\worlds$. Finally, note that because there exists a (sub) circuit encoding $\sum_{j = 1}^\bound j\cdot X_{\tup, j}$ that is a \emph{balanced} binary tree, the above conversion implies the claimed size and depth bounds of the lemma.
Next we argue the claim on $\gamma\inparen{\circuit'}$. Consider the list of expanded monomials $\expansion{\circuit}$ for \abbrCTIDB circuit \circuit. Let
$\encMon = X_{\tup_1}^{d_1},\ldots,X_{\tup_\ell}^{d_\ell}$ be an arbitrary monomial with $\ell$ variables. Then \monom yields the set of monomials $\vari{E}_\monom\inparen{\circuit'}=\inset{j_1^{d_1}\cdot X_{\tup, j_1}^{d_1}\times\cdots\times j_\ell^{d_\ell}\cdot X_{\tup, j_\ell}^{d_\ell}}_{j_1,\ldots, j_\ell \in \pbox{\bound}}$ in $\expansion{\circuit'}$. Recall that a cancellation occurs when we have a monomial \monom' such that there exists $\tup\neq\tup'$ in the same block $\block$ where variables $X_\tup, X_{\tup'}$ are in the set of variables $\encMon'$ of \monom'. Observe that cancellations can only occur for each $X_{\tup}^{d_\tup}\in \encMon$, where the expansion $\inparen{\sum_{j = 1}^\bound j\cdot X_{\tup, j}}^{d_\tup}$ represents the monomial $X_\tup^{d_\tup}$ in $\tupset'$. Consider the number of cancellations for $\inparen{\sum_{j = 1}^\bound j\cdot X_{\tup, j}}^{d_\tup}$. Then $\gamma \leq 1 - \bound^{-\inparen{d_\tup - 1}}$, since
for each element in the set of cross products $\inset{\bigtimes_{i\in\pbox{d_\tup}, j_i\in\pbox{\bound}}X_{\tup, j_i}}$ there are \emph{exactly} $\bound$ surviving elements with $j_1=\cdots=j_{d_\tup}=j$, i.e. $X_{t,j}^{d_\tup}$ for each $j\in\pbox{\bound}$. The rest of the $\bound^{d_\tup}-\bound$ cross terms cancel. Regarding the whole monomial \monom', it is the case that the proportion of non-cancellations across each $X_\tup^{d_\tup}\in\encMon'$ multiply because non-cancelling terms for $X_\tup$ can only be joined with non-cancelling terms of $X_{\tup'}^{d_{\tup'}}\in\encMon'$ for $\tup\neq\tup'$. This then yields the fraction of cancelled monomials $\gamma\le 1 - \prod_{i = 1}^{\ell}\bound^{-\inparen{d_i - 1}} \leq 1 - \bound^{-\inparen{k - 1}}$ where the inequalities take into account the fact that $\sum_{i = 1}^\ell d_i \leq k$.
Since this is true for arbitrary \monom, the bound follows for $\polyf\inparen{\circuit'}$.
\qed
\end{proof}
\subsection{Proof of \Cref{lem:val-ub}}\label{susec:proof-val-up}
\label{app:proof-lem-val-ub}
We will prove \Cref{lem:val-ub} by considering the two cases separately. We start by considering the case when $\circuit$ is a tree:
\begin{Lemma}
\label{lem:C-ub-tree}
Let $\circuit$ be a tree (i.e. the sub-circuits corresponding to two children of a node in $\circuit$ are completely disjoint). Then we have
\[\abs{\circuit}(1,\dots,1)\le \left(\size(\circuit)\right)^{\degree(\circuit)+1}.\]
\end{Lemma}
\begin{proof}[Proof of \Cref{lem:C-ub-tree}]
For notational simplicity define $N=\size(\circuit)$ and $k=\degree(\circuit)$.
We use induction on $\depth(\circuit)$ to show that $\abs{\circuit}(1,\ldots, 1) \leq N^{k+1 }$.
For the base case, we have that \depth(\circuit) $= 0$, and there can only be one node which must contain a coefficient or constant. In this case, $\abs{\circuit}(1,\ldots, 1) = 1$, and \size(\circuit) $= 1$, and by \Cref{def:degree} it is the case that $0 \leq k = \degree\inparen{\circuit} \leq 1$, and it is true that $\abs{\circuit}(1,\ldots, 1) = 1 \leq N^{k+1} = 1^{k + 1} = 1$ for $k \in \inset{0, 1}$.
Assume for $\ell > 0$ an arbitrary circuit \circuit of $\depth(\circuit) \leq \ell$ that it is true that $\abs{\circuit}(1,\ldots, 1) \leq N^{k+1 }$.
For the inductive step we consider a circuit \circuit such that $\depth(\circuit) = \ell + 1$. The sink can only be either a $\circmult$ or $\circplus$ gate. Let $k_\linput, k_\rinput$ denote \degree($\circuit_\linput$) and \degree($\circuit_\rinput$) respectively. Consider when sink node is $\circmult$.
Then note that
\begin{align}
\abs{\circuit}(1,\ldots, 1) &= \abs{\circuit_\linput}(1,\ldots, 1)\cdot \abs{\circuit_\rinput}(1,\ldots, 1) \nonumber\\
&\leq (N-1)^{k_\linput+1} \cdot (N - 1)^{k_\rinput+1}\nonumber\\
&= (N-1)^{k+1}\label{eq:sumcoeff-times-upper}\\
&\leq N^{k + 1}.\nonumber
\end{align}
In the above the first inequality follows from the inductive hypothesis (and the fact that the size of either subtree is at most $N-1$) and \Cref{eq:sumcoeff-times-upper} follows by \cref{def:degree} which states that for $k = \degree(\circuit)$ we have $k=k_\linput+k_\rinput+1$.
For the case when the sink gate is a $\circplus$ gate, then for $N_\linput = \size(\circuit_\linput)$ and $N_\rinput = \size(\circuit_\rinput)$ we have
\begin{align}
\abs{\circuit}(1,\ldots, 1) &= \abs{\circuit_\linput}(1,\ldots, 1) \circplus \abs{\circuit_\rinput}(1,\ldots, 1) \nonumber\\
&\leq
N_\linput^{k+1} + N_\rinput^{k+1}\nonumber\\
&\leq (N-1)^{k+1 } \label{eq:sumcoeff-plus-upper}\\
&\leq N^{k+1}.\nonumber
\end{align}
In the above, the first inequality follows from the inductive hypothesis and \cref{def:degree} (which implies the fact that $k_\linput,k_\rinput\le k$). Note that the RHS of this inequality is maximized when the base and exponent of one of the terms is maximized. The second inequality follows from this fact as well as the fact that since $\circuit$ is a tree we have $N_\linput+N_\rinput=N-1$ and, lastly, the fact that $k\ge 0$. This completes the proof.
\end{proof}
The upper bound in \Cref{lem:val-ub} for the general case is a simple variant of the above proof (but we present a proof sketch of the bound below for completeness):
\begin{Lemma}
\label{lem:C-ub-gen}
Let $\circuit$ be a (general) circuit.
Then we have
\[\abs{\circuit}(1,\dots,1)\le 2^{2^{\degree(\circuit)}\cdot \depth(\circuit)}.\]
\end{Lemma}
\begin{proof}[Proof Sketch of \Cref{lem:C-ub-gen}]
We use the same notation as in the proof of \Cref{lem:C-ub-tree} and further define $d=\depth(\circuit)$. We will prove by induction on $\depth(\circuit)$ that $\abs{\circuit}(1,\ldots, 1) \leq 2^{2^k\cdot d }$. The base case argument is similar to that in the proof of \Cref{lem:C-ub-tree}. In the inductive case we have that $d_\linput,d_\rinput\le d-1$.
For the case when the sink node is $\times$, we get that
\begin{align*}
\abs{\circuit}(1,\ldots, 1) &= \abs{\circuit_\linput}(1,\ldots, 1)\circmult \abs{\circuit_\rinput}(1,\ldots, 1) \\
&\leq {2^{2^{k_\linput}\cdot d_\linput}} \circmult {2^{2^{k_\rinput}\cdot d_\rinput}}\\
&\leq 2^{2\cdot 2^{k-1}\cdot (d-1)}\\
&\leq 2^{2^k d}.
\end{align*}
In the above the first inequality follows from inductive hypothesis while the second inequality follows from the fact that $k_\linput,k_\rinput\le k-1$ and $d_\linput, d_\rinput\le d-1$, where we substitute the upper bound into every respective term.
Now consider the case when the sink node is $+$, we get that
\begin{align*}
\abs{\circuit}(1,\ldots, 1) &= \abs{\circuit_\linput}(1,\ldots, 1) \circplus \abs{\circuit_\rinput}(1,\ldots, 1) \\
&\leq 2^{2^{k_\linput}\cdot d_\linput} + 2^{2^{k_\rinput}\cdot d_\rinput}\\
&\leq 2\cdot {2^{2^k(d-1)} } \\
&\leq 2^{2^kd}.
\end{align*}
In the above the first inequality follows from the inductive hypothesis while the second inequality follows from the facts that $k_\linput,k_\rinput\le k$ and $d_\linput,d_\rinput\le d-1$. The final inequality follows from the fact that $k\ge 0$.
\qed
\end{proof}
%%% Local Variables:
%%% mode: latex
%%% TeX-master: "main"
%%% End:

View File

@ -0,0 +1,27 @@
%root: main.tex
In the following definitions and examples, we use the following polynomial as an example:
\begin{equation}
\label{eq:poly-eg}
\poly(X, Y) = 2X^2 + 3XY - 2Y^2.
\end{equation}
\begin{Definition}[Pure Expansion]
The pure expansion of a polynomial $\poly$ is formed by computing all products of sums occurring in $\poly$, without combining like monomials. The pure expansion of $\poly$ generalizes \Cref{def:smb} by allowing monomials $m_i = m_j$ for $i \neq j$.
\end{Definition}
Note that similar in spirit to \Cref{def:reduced-bi-poly}, $\expansion{\circuit}$ \Cref{def:expand-circuit} reduces all variable exponents $e > 1$ to $e = 1$. Further, it is true that $\expansion{\circuit}$ is the pure expansion of $\circuit$.
\begin{Example}[Example of Pure Expansion]\label{example:expr-tree-T}
Consider the factorized representation $(X+ 2Y)(2X - Y)$ of the polynomial in \Cref{eq:poly-eg}.
Its circuit $\circuit$ is illustrated in \Cref{fig:circuit}.
The pure expansion of the product is $2X^2 - XY + 4XY - 2Y^2$. As an additional example of \Cref{def:expand-circuit}, $\expansion{\circuit}=[(X, 2), (XY, -1), (XY, 4), (Y, -2)]$.
\end{Example}
$\expansion{\circuit}$ effectively\footnote{The minor difference here is that $\expansion{\circuit}$ encodes the \emph{reduced} form over the SOP pure expansion of the compressed representation, as opposed to the \abbrSMB representation} encodes the \emph{reduced} form of $\polyf\inparen{\circuit}$, decoupling each monomial into a set of variables $\monom$ and a real coefficient $\coef$.
However, unlike the constraint on the input $\poly$ to compute $\rpoly$, the input circuit $\circuit$ does not need to be in \abbrSMB/SOP form.
\begin{Example}[Example for \Cref{def:positive-circuit}]\label{ex:def-pos-circ}
Using the same factorization from \Cref{example:expr-tree-T}, $\polyf(\abs{\circuit}) = (X + 2Y)(2X + Y) = 2X^2 +XY +4XY + 2Y^2 = 2X^2 + 5XY + 2Y^2$. Note that this \textit{is not} the same as the polynomial from \Cref{eq:poly-eg}. As an example of the slight abuse of notation we alluded to, $\polyf\inparen{\abs{\circuit}\inparen{1,\ldots, 1}} =2\inparen{1}^2 + 5\inparen{1}\inparen{1} + 2\inparen{1}^2 = 9$.
\end{Example}
\begin{Definition}[Subcircuit]
A subcircuit of a circuit $\circuit$ is a circuit \subcircuit such that \subcircuit is a DAG \textit{subgraph} of the DAG representing \circuit. The sink of \subcircuit has exactly one gate \gate.
\end{Definition}

View File

@ -0,0 +1,28 @@
%root:main.tex
\begin{algorithm}[t]
\caption{$\approxq(\circuit, \vct{p}, \conf, \error)$}
\label{alg:mon-sam}
\begin{algorithmic}[1]
\Require \circuit: Circuit
\Require $\vct{p} = (\prob_1,\ldots, \prob_\numvar)$ $\in [0, 1]^N$
\Require $\conf$ $\in [0, 1]$
\Require $\error$ $\in [0, 1]$
\Ensure \vari{acc} $\in \mathbb{R}$
\State $\accum \gets 0$\label{alg:mon-sam-global1}
\State $\numsamp \gets \ceil{\frac{2 \log{\frac{2}{\conf}}}{\error^2}}$\label{alg:mon-sam-global2}
\State $(\circuit_\vari{mod}, \vari{size}) \gets $ \onepass($\circuit$)\label{alg:mon-sam-onepass}\Comment{$\onepass$ is \Cref{alg:one-pass-iter}}
\For{$\vari{i} \in 1 \text{ to }\numsamp$}\label{alg:sampling-loop}\Comment{Perform the required number of samples}
\State $(\vari{M}, \vari{sgn}_\vari{i}) \gets $ \sampmon($\circuit_\vari{mod}$)\label{alg:mon-sam-sample}\Comment{\sampmon is \Cref{alg:sample}. Note that $\vari{sgn}_\vari{i}$ is the \emph{sign} of the monomial's coefficient and \emph{not} the coefficient itself}
\If{$\vari{M}$ has at most one variable from each block}\label{alg:check-duplicate-block}
\State $\vari{Y}_\vari{i} \gets \prod_{X_j\in\vari{M}}p_j$\label{alg:mon-sam-assign1}\Comment{\vari{M} is the sampled monomial's set of variables (cf.\ \cref{subsec:sampmon-remarks})}
\State $\vari{Y}_\vari{i} \gets \vari{Y}_\vari{i} \times\; \vari{sgn}_\vari{i}$\label{alg:mon-sam-product}
\State $\accum \gets \accum + \vari{Y}_\vari{i}$\Comment{Store the sum over all samples}\label{alg:mon-sam-add}
\EndIf
\EndFor
\State $\vari{acc} \gets \accum \times \frac{\vari{size}}{\numsamp}$\label{alg:mon-sam-global3}
\State \Return \vari{acc}
\end{algorithmic}
\end{algorithm}

View File

@ -0,0 +1,22 @@
%root: main.tex
\begin{proof}
We first argue that $\rpoly_{G}^\kElem(\prob,\ldots, \prob) = \sum\limits_{i = 0}^{2\kElem} c_i \cdot \prob^i$. First, since $\poly_G(\vct{X})$ has degree $2$, it follows that $\poly_G^\kElem(\vct{X})$ has degree $2\kElem$. By definition, $\rpoly_{G}^{\kElem}(\vct{X})$ sets every exponent $e > 1$ to $e = 1$, which means that $\degree(\rpoly_{G}^\kElem)\le \degree(\poly_G^\kElem)= 2k$. Thus, if we think of $\prob$ as a variable, then $\rpoly_{G}^{\kElem}(\prob,\dots,\prob)$ is a univariate polynomial of degree at most $\degree(\rpoly_{G}^\kElem)\le 2k$. Thus, we can write
\begin{equation*}
\rpoly_{G}^{\kElem}(\prob,\ldots, \prob) = \sum_{i = 0}^{2\kElem} c_i \prob^i
\end{equation*}
We note that $c_i$ is {\em exactly} the number of monomials in the SMB expansion of $\poly_{G}^{\kElem}(\vct{X})$ composed of $i$ distinct variables.\footnote{Since $\rpoly_G^\kElem(\vct{X})$ does not have any monomial with degree $< 2$, it is the case that $c_0 = c_1 = 0$ but for the sake of simplicity we will ignore this observation.}
Given that we then have $2\kElem + 1$ distinct values of $\rpoly_{G}^\kElem(\prob_i,\ldots, \prob_i)$ for $0\leq i\leq2\kElem$, it follows that we have a linear system of the form $\vct{M} \cdot \vct{c} = \vct{b}$ where the $i$th row of $\vct{M}$ is $\inparen{\prob_i^0\ldots\prob_i^{2\kElem}}$, $\vct{c}$ is the coefficient vector $\inparen{c_0,\ldots, c_{2\kElem}}$, and $\vct{b}$ is the vector such that $\vct{b}[i] = \rpoly_{G}^\kElem(\prob_i,\ldots, \prob_i)$. In other words, matrix $\vct{M}$ is the Vandermonde matrix, from which it follows that we have a matrix with full rank (the $p_i$'s are distinct), and we can solve the linear system in $O(k^3)$ time (e.g., using Gaussian Elimination) to determine $\vct{c}$ exactly.
Thus, after $O(k^3)$ work, we know $\vct{c}$ and in particular, $c_{2k}$ exactly.
Next, we show why we can compute $\numocc{G}{\kmatch}$ from $c_{2k}$ in $O(1)$ additional time.
We claim that $c_{2\kElem}$ is $\kElem! \cdot \numocc{G}{\kmatch}$. This can be seen intuitively by looking at the expansion of the original factorized representation
\[\poly_{G}^\kElem(\vct{X}) = \sum_{\substack{(i_1, j_1),\cdots,(i_\kElem, j_\kElem) \in E}}X_{i_1}X_{j_1}\cdots X_{i_\kElem}X_{j_\kElem},\]
where a unique $\kElem$-matching in the multi-set of product terms can be selected $\prod_{i = 1}^\kElem i = \kElem!$ times.
Indeed, note that each $\kElem$-matching $(i_1, j_1)\ldots$ $(i_k, j_k)$ in $G$ corresponds to the monomial $\prod_{\ell = 1}^\kElem X_{i_\ell}X_{j_\ell}$ in $\poly_{G}^\kElem(\vct{X})$, with distinct indexes, and this implies that each distinct $\kElem$-matching appears the exact number of permutations that exist for its particular set of $\kElem$ edges, or $k!$.
Since, as noted earlier, $c_{2\kElem}$ represents the number of monomials with $2\kElem$ distinct variables, then it must be that $c_{2\kElem}$ is the overall number of $\kElem$-matchings. And since we have $\kElem!$ copies of each distinct $\kElem$-matching, it follows that
$c_{2\kElem}= \kElem! \cdot \numocc{G}{\kmatch}$.
Thus, simply dividing $c_{2\kElem}$ by $\kElem!$ gives us $\numocc{G}{\kmatch}$, as needed. \qed
\end{proof}

View File

@ -0,0 +1,78 @@
%root: main.tex
\begin{proof}
The proof consists of two parts. First we need to show that a vector $\vct{b}$ satisfying the linear system exists and further can be computed in $O(m)$ time. Second we need to show that $\numocc{G}{\tri}, \numocc{G}{\threedis}$ can indeed be computed in time $O(1)$.
The lemma claims that for $\vct{M} =
\begin{pmatrix}
1 - 3p & -(3\prob^2 - \prob^3)\\
10(3\prob^2 - \prob^3) & 10(3\prob^2 - \prob^3)
\end{pmatrix}$, $\vct{x} =
\begin{pmatrix}
\numocc{G}{\tri}\\
\numocc{G}{\threedis}
\end{pmatrix}$
satisfies the linear system $\vct{M} \cdot \vct{x} = \vct{b}$.
To prove the first step, we use \Cref{lem:qE3-exp} to derive the following equality (dropping the superscript and referring to $G^{(1)}$ as $G$):
\begin{align}
\numocc{G}{\ed}\prob^2 &+ 6\numocc{G}{\twopath}\prob^3 + 6\numocc{G}{\twodis}\prob^4 + 6\numocc{G}{\tri}\prob^3 + 6\numocc{G}{\oneint}\prob^4 \nonumber\\
&+ 6\numocc{G}{\threepath}\prob^4 + 6\numocc{G}{\twopathdis}\prob^5 + 6\numocc{G}{\threedis}\prob^6 = \rpoly_{G}^3(\prob,\ldots, \prob)\label{eq:lem-qE3-exp}\\
\numocc{G}{\tri}&+\numocc{G}{\threepath}\prob+\numocc{G}{\twopathdis}\prob^2+\numocc{G}{\threedis}\prob^3\nonumber\\
&= \frac{\rpoly_{G}^3(\prob,\ldots, \prob)}{6\prob^3} - \frac{\numocc{G}{\ed}}{6\prob} - \numocc{G}{\twopath}-\numocc{G}{\twodis}\prob-\numocc{G}{\oneint}\prob\label{eq:b1-alg-1}\\
\numocc{G}{\tri}(1-3p) &- \numocc{G}{\threedis}(3\prob^2 -\prob^3) = \nonumber\\
\frac{\rpoly_{G}^3(\prob,\ldots, \prob)}{6\prob^3} &- \frac{\numocc{G}{\ed}}{6\prob} - \numocc{G}{\twopath}-\numocc{G}{\twodis}\prob-\numocc{G}{\oneint}\prob\nonumber\\
&-\left[\numocc{G}{\threepath}\prob+3\numocc{G}{\tri}\prob\right]-\left[\numocc{G}{\twopathdis}\prob^2+3\numocc{G}{\threedis}\prob^2\right]\label{eq:b1-alg-2}
\end{align}
\Cref{eq:lem-qE3-exp} is the result of \Cref{lem:qE3-exp}. We obtain the remaining equations through standard algebraic manipulations.
Note that the LHS of \Cref{eq:b1-alg-2} is obtained using \cref{eq:2pd-3d} and \cref{eq:3p-3tri} and is indeed the product $\vct{M}[1] \cdot \vct{x}[1]$. Further note that this product is equal to the RHS of \Cref{eq:b1-alg-2}, where every term is computable in $O(m)$ time (by equations (\ref{eq:1e})-(\ref{eq:3p-3tri})). We set $\vct{b}[1]$ to the RHS of \Cref{eq:b1-alg-2}.
We follow the same process in deriving an equality for $G^{(2)}$. Replacing occurrences of $G$ with $G^{(2)}$, we obtain an equation (below) of the form of \cref{eq:b1-alg-2} for $G^{(2)}$. Substituting identities from \cref{lem:3m-G2} and \Cref{lem:tri} we obtain
\begin{align}
0-\left(8\numocc{G}{\threedis}\right.&\left.+6\numocc{G}{\twopathdis}+4\numocc{G}{\oneint}+4\numocc{G}{\threepath}+2\numocc{G}{\tri}\right)(3\prob^2 -\prob^3)=\nonumber\\
&\frac{\rpoly_{\graph{2}}^3(\prob,\ldots, \prob)}{6\prob^3} - \frac{\numocc{\graph{2}}{\ed}}{6\prob} - \numocc{\graph{2}}{\twopath}-\numocc{\graph{2}}{\twodis}\prob-\numocc{\graph{2}}{\oneint}\prob\nonumber\\
&-\left[\numocc{\graph{2}}{\twopathdis}\prob^2+3\numocc{\graph{2}}{\threedis}\prob^2\right]-\left[\numocc{\graph{2}}{\threepath}\prob + 3\numocc{\graph{2}}{\tri}\prob\right]\label{eq:b2-sub-lem}\\
(10\numocc{G}{\tri} &+ 10\numocc{G}{\threedis})(3\prob^2 -\prob^3) = \nonumber\\
&\frac{\rpoly_{\graph{2}}^3(\prob,\ldots, \prob)}{6\prob^3} - \frac{\numocc{\graph{2}}{\ed}}{6\prob} - \numocc{\graph{2}}{\twopath}-\numocc{\graph{2}}{\twodis}\prob-\numocc{\graph{2}}{\oneint}\prob\nonumber\\
&-\left[\numocc{\graph{2}}{\threepath}\prob+3\numocc{\graph{2}}{\tri}\prob\right]-\left[\numocc{\graph{2}}{\twopathdis}\prob^2+3\numocc{\graph{2}}{\threedis}\prob^2\right]\nonumber\\
&+\left(4\numocc{G}{\oneint}+\left[6\numocc{G}{\twopathdis}+18\numocc{G}{\threedis}\right]+\left[4\numocc{G}{\threepath}+12\numocc{G}{\tri}\right]\right)(3\prob^2 - \prob^3)\label{eq:b2-final}
\end{align}
The steps to obtaining \cref{eq:b2-final} are analogous to the derivation immediately preceding. As in the previous derivation, note that the LHS of \Cref{eq:b2-final} is the same as $\vct{M}[2]\cdot \vct{x}[2]$. The RHS of \Cref{eq:b2-final} has terms all computable (by equations (\ref{eq:1e})-(\ref{eq:3p-3tri})) in $O(m)$ time. Setting $\vct{b}[2]$ to the RHS then completes the proof of step 1.
Note that if $\vct{M}$ has full rank then one can compute $\numocc{G}{\tri}$ and $\numocc{G}{\threedis}$ in $O(1)$ using Gaussian elimination.
To show that $\vct{M}$ indeed has full rank, we show in what follows that $\dtrm{\vct{M}}\ne 0$ for every $\prob\in (0,1)$.
$\dtrm{\vct{M}} = $
\begin{align}
&\begin{vmatrix}
1-3\prob &-(3\prob^2 - \prob^3)\\
10(3\prob^2 - \prob^3) &10(3\prob^2 - \prob^3)
\end{vmatrix}
= (1-3\prob)\cdot 10(3\prob^2-\prob^3) +10(3\prob^2-\prob^3)\cdot(3\prob^2 - \prob^3)\nonumber\\
&=10(3\prob^2-\prob^3)\cdot(1-3\prob+3\prob^2-\prob^3) = 10(3\prob^2-\prob^3)\cdot(-\prob^3+3\prob^2-3\prob + 1)\nonumber\\
&=10\prob^2(3 - \prob)\cdot(1-\prob)^3\label{eq:det-final}
\end{align}
From \Cref{eq:det-final} it can easily be seen that the roots of $\dtrm{\vct{M}}$ are $0, 1,$ and $3$. Hence there are no roots in $(0, 1)$ and \Cref{lem:lin-sys} follows.
\qed
\end{proof}
\subsection{Proof of \Cref{th:single-p}}
\begin{proof}
We can compute $\graph{2}$ from $\graph{1}$ in $O(m)$ time. Additionally, if in time $O(T(m))$, we have $\rpoly_{\graph{\ell}}^3(\prob,\dots,\prob)$ for $\ell\in [2]$, then the theorem follows by \Cref{lem:lin-sys}.
\qed
\end{proof}
In other words, if \Cref{th:single-p} holds, then so must \Cref{th:single-p-hard}.
\subsection{Proof of \Cref{th:single-p-hard}}
\begin{proof}
For the sake of contradiction, assume that for any $G$, we can compute $\rpoly_{G}^3(\prob,\dots,\prob)$ in $o\inparen{m^{1+\eps_0}}$ time.
Let $G$ be the input graph.
Then by \Cref{th:single-p} we can compute $\numocc{G}{\tri}$ in further time $o\inparen{m^{1+\eps_0}}+O(m)$. Thus, the overall reduction takes $o\inparen{m^{1+\eps_0}}+O(m)= o\inparen{m^{1+\eps_0}}$ time, which violates \Cref{conj:graph}.
\qed
\end{proof}

View File

@ -0,0 +1,21 @@
%root: main.tex
We need all the possible edge patterns in an arbitrary $G$ with at most three distinct edges. We have already seen $\tri,\threepath$ and $\threedis$, so we define the remaining patterns:
\begin{itemize}
\item Single Edge $\left(\ed\right)$
\item 2-path ($\twopath$)
\item 2-matching ($\twodis$)
\item 3-star ($\oneint$)--this is the graph that results when all three edges share exactly one common endpoint. The remaining endpoint for each edge is disconnected from any endpoint of the remaining two edges.
\item Disjoint Two-Path ($\twopathdis$)--this subgraph consists of a two-path and a remaining disjoint edge.
\end{itemize}
For any graph $G$, the following formulas for $\numocc{G}{H}$ compute their respective patterns exactly in $O(\numedge)$ time, with $d_i$ representing the degree of vertex $i$ (proofs are in \Cref{app:easy-counts}):
\begin{align}
&\numocc{G}{\ed} = \numedge, \label{eq:1e}\\
&\numocc{G}{\twopath} = \sum_{i \in V} \binom{d_i}{2} \label{eq:2p}\\
&\numocc{G}{\twodis} = \sum_{(i, j) \in E} \frac{\numedge - d_i - d_j + 1}{2}\label{eq:2m}\\
&\numocc{G}{\oneint} = \sum_{i \in V} \binom{d_i}{3}\label{eq:3s}\\
&\numocc{G}{\twopathdis} + 3\numocc{G}{\threedis} = \sum_{(i, j) \in E} \binom{\numedge - d_i - d_j + 1}{2}\label{eq:2pd-3d}\\
&\numocc{G}{\threepath} + 3\numocc{G}{\tri} = \sum_{(i, j) \in E} (d_i - 1) \cdot (d_j - 1)\label{eq:3p-3tri}
\end{align}

View File

@ -0,0 +1,79 @@
%root: main.tex
\subsection{Tools to prove \Cref{th:single-p-hard}}
Note that $\rpoly_{G}^3(\prob,\ldots, \prob)$ as a polynomial in $\prob$ has degree at most six. Next, we figure out the exact coefficients since this would be useful in our arguments:
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Lemma}\label{lem:qE3-exp}
For any $\prob$, we have:
{\small
\begin{align}
\rpoly_{G}^3(\prob,\ldots, \prob) &= \numocc{G}{\ed}\prob^2 + 6\numocc{G}{\twopath}\prob^3 + 6\numocc{G}{\twodis}\prob^4 + 6\numocc{G}{\tri}\prob^3\nonumber\\
&+ 6\numocc{G}{\oneint}\prob^4 + 6\numocc{G}{\threepath}\prob^4 + 6\numocc{G}{\twopathdis}\prob^5 + 6\numocc{G}{\threedis}\prob^6.\label{claim:four-one}
\end{align}}
\end{Lemma}
\subsubsection{Proof for \Cref{lem:qE3-exp}}
\begin{proof}
By definition we have that
\[\poly_{G}^3(\vct{X}) = \sum_{\substack{(i_1, j_1), (i_2, j_2), (i_3, j_3) \in E}}~\; \prod_{\ell = 1}^{3}X_{i_\ell}X_{j_\ell}.\]
Hence $\rpoly_{G}^3(\vct{X})$ has degree six. Note that the monomial $\prod_{\ell = 1}^{3}X_{i_\ell}X_{j_\ell}$ will contribute to the coefficient of $\prob^\nu$ in $\rpoly_{G}^3(\vct{X})$, where $\nu$ is the number of distinct variables in the monomial.
Let $e_1 = (i_1, j_1), e_2 = (i_2, j_2),$ and $e_3 = (i_3, j_3)$.
We compute $\rpoly_{G}^3(\vct{X})$ by considering each of the three forms that the triple $(e_1, e_2, e_3)$ can take.
\textsc{case 1:} $e_1 = e_2 = e_3$ (all edges are the same). When we have that $e_1 = e_2 = e_3$, then the monomial corresponds to $\numocc{G}{\ed}$. There are exactly $\numedge$ such triples, each with a $\prob^2$ factor in $\rpoly_{G}^3\left(\prob,\ldots, \prob\right)$.
\textsc{case 2:} This case occurs when there are two distinct edges of the three, call them $e$ and $e'$. When there are two distinct edges, there is then the occurrence when $2$ variables in the triple $(e_1, e_2, e_3)$ are bound to $e$. There are three combinations for this occurrence in $\poly_{G}^3(\vct{X})$. Analogously, there are three such occurrences in $\poly_{G}^3(\vct{X})$ when there is only one occurrence of $e$, i.e. $2$ of the variables in $(e_1, e_2, e_3)$ are $e'$.
This implies that all $3 + 3 = 6$ combinations of two distinct edges $e$ and $e'$ contribute to the same monomial in $\rpoly_{G}^3$.
Since $e\ne e'$, this case produces the following edge patterns: $\twopath, \twodis$, which contribute $6\prob^3$ and $6\prob^4$ respectively to $\rpoly_{G}^3\left(\prob,\ldots, \prob\right)$.
\textsc{case 3:} All $e_1,e_2$ and $e_3$ are distinct. For this case, we have $3! = 6$ permutations of $(e_1, e_2, e_3)$, each of which contribute to the same monomial in the \textsc{SMB} representation of $\poly_{G}^3(\vct{X})$. This case consists of the following edge patterns: $\tri, \oneint, \threepath, \twopathdis, \threedis$, which contribute $6\prob^3, 6\prob^4, 6\prob^4, 6\prob^5$ and $6\prob^6$ respectively to $\rpoly_{G}^3\left(\prob,\ldots, \prob\right)$.
\qed
\end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Since $\prob$ is fixed, \Cref{lem:qE3-exp} gives us one linear equation in $\numocc{G}{\tri}$ and $\numocc{G}{\threedis}$ (we can handle the other counts due to equations (\ref{eq:1e})-(\ref{eq:3p-3tri})). However, we need to generate one more independent linear equation in these two variables. Towards this end we generate another graph related to $G$:
\begin{Definition}\label{def:Gk}
For $\ell \geq 1$, let graph $\graph{\ell}$ be a graph generated from an arbitrary graph $G$, by replacing every edge $e$ of $G$ with an $\ell$-path, such that all inner vertices of an $\ell$-path replacement edge are disjoint from all other vertices.\footnote{Note that $G\equiv \graph{1}$.}
\end{Definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
We will prove \Cref{th:single-p-hard} by the following reduction:
\begin{Theorem}\label{th:single-p}
Fix $\prob\in (0,1)$. Let $G$ be a graph on $\numedge$ edges.
If we can compute $\rpoly_{G}^3(\prob,\dots,\prob)$ exactly in $T(\numedge)$ time, then we can exactly compute $\numocc{G}{\tri}$
in $O\inparen{T(\numedge) + \numedge}$ time.
\end{Theorem}
For clarity, we repeat the notion of $\numocc{G}{H}$ to mean the count of subgraphs in $G$ isomorphic to $H$.
The following lemmas relate these counts in $\graph{2}$ to $\graph{1}$ ($G$). The lemmas are used to prove \Cref{lem:lin-sys}.
\begin{Lemma}\label{lem:3m-G2}
The $3$-matchings in graph $\graph{2}$ satisfy the identity:
\begin{align*}
\numocc{\graph{2}}{\threedis} &= 8 \cdot \numocc{\graph{1}}{\threedis} + 6 \cdot \numocc{\graph{1}}{\twopathdis}\\
&+ 4 \cdot \numocc{\graph{1}}{\oneint} + 4 \cdot \numocc{\graph{1}}{\threepath} + 2 \cdot \numocc{\graph{1}}{\tri}.
\end{align*}
\end{Lemma}
\begin{Lemma}\label{lem:tri}
For $\ell > 1$ and any graph $\graph{\ell}$, $\numocc{\graph{\ell}}{\tri} = 0$.
\end{Lemma}
Finally, the following result immediately implies \Cref{th:single-p}:
\begin{Lemma}\label{lem:lin-sys}
Fix $\prob\in (0,1)$. Given $\rpoly_{\graph{\ell}}^3(\prob,\dots,\prob)$ for $\ell\in [2]$, we can compute in $O(m)$ time a vector $\vct{b}\in\mathbb{R}^3$ such that
\[ \begin{pmatrix}
1 - 3\prob & -(3\prob^2 - \prob^3)\\
10(3\prob^2 - \prob^3) & 10(3\prob^2 - \prob^3)
\end{pmatrix}
\cdot
\begin{pmatrix}
\numocc{G}{\tri}\\
\numocc{G}{\threedis}
\end{pmatrix}
=\vct{b},
\]
allowing us to compute $\numocc{G}{\tri}$ and $\numocc{G}{\threedis}$ in $O(1)$ time.
\end{Lemma}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

View File

@ -0,0 +1,157 @@
%root: main.tex
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{\Cref{lem:pdb-for-def-qk}}
\begin{Lemma}\label{lem:pdb-for-def-qk}
Assuming that each $v \in \vset$ has degree $\geq 1$,\footnote{This is WLOG, since any vertex with degree $0$ can be dropped without affecting the result of our hard query.} the \abbrPDB relations encoding the edges for $\poly_G^\kElem$ of \Cref{def:qk} can be computed in $\bigO{\numedge}$ time.
\end{Lemma}
\begin{proof}[Proof of \Cref{lem:pdb-for-def-qk}]
Only two relations need be constructed, one for the set $\vset$ and one for the set $\edgeSet$. By a simple linear scan, each can be constructed in time $\bigO{\numedge + \numvar}$. Given that the degree of each $v \in \vset$ is at least $1$, we have that $m\ge \Omega(n)$,
and this yields the claimed runtime.
\qed
\end{proof}
\subsection{Proof of \Cref{lem:tdet-om}}
\begin{proof}
By the recursive definition of $\qruntimenoopt{\cdot, \cdot}$ (see \Cref{sec:gen}), we have the following equation for our hard query $\query$ when $k = 1$, (we denote this as $\query^1$).
\begin{equation*}
\qruntimenoopt{\query^1, \tupset} = \abs{\tupset.\vset} + \abs{\tupset.\edgeSet} + \abs{\tupset.\vset} + \jointime{\tupset.\vset , \tupset.\edgeSet , \tupset.\vset}.
\end{equation*}
We argue that $\jointime{\tupset.\vset , \tupset.\edgeSet , \tupset.\vset}$ is at most $O(\numedge)$ by noting that there exists an algorithm that computes $\tupset.\vset\join\tupset.\edgeSet\join\tupset.\vset$ in the same runtime\footnote{Indeed the trivial algorithm that computes the obvious pair-wise joins has the claimed runtime. That is, we first compute $\tupset.\vset\join\tupset.\edgeSet$, which takes $O(m)$ (assuming $\tupset.\vset$ is stored in hash map) since tuples in $\tupset.\vset$ can only filter tuples in $\tupset.\edgeSet$. The resulting subset of tuples in $\tupset.\edgeSet$ are then again joined (on the right) with $\tupset.\vset$, which by the same argument as before also takes $O(m)$ time, as desired.}. Then by the assumption of \Cref{lem:pdb-for-def-qk} (each $v \in \vset$ has degree $\geq 1$), the sum of the first three terms is $\bigO{\numedge}$. We then obtain that $\qruntimenoopt{\query^1, \tupset} = \bigO{\numedge} + \bigO{\numedge} = \bigO{\numedge}$. For $\query^k = \query_1^1 \times\cdots\times\query_k^1$, we have the recurrence $\qruntimenoopt{\query^k, \tupset} = \qruntimenoopt{\query_1^1, \tupset} + \cdots +\qruntimenoopt{\query_k^1, \tupset} + \jointime{\query_1^1,\cdots,\query_k^1}$. Since $\query^1$ outputs a count, computing the join $\query_1^1\join\cdots\join\query_k^1$ is just multiplying $k$ numbers, which takes $O(k)$ time. Thus, we have
\[\qruntimenoopt{\query^k, \tupset} \le k\cdot O(m)+O(k)\le O(km),\]
as desired.
\qed
\end{proof}
\subsection{\Cref{lem:qEk-multi-p}}
\noindent The following lemma reduces the problem of counting $\kElem$-matchings in a graph to our problem (and proves \Cref{thm:mult-p-hard-result}):
\begin{Lemma}\label{lem:qEk-multi-p}
Let $\prob_0,\ldots, \prob_{2\kElem}$ be distinct values in $(0, 1]$. Then given the values $\rpoly_{G}^\kElem(\prob_i,\ldots, \prob_i)$ for $0\leq i\leq 2\kElem$, the number of $\kElem$-matchings in $G$ can be computed in $\bigO{\kElem^3}$ time.
\end{Lemma}
\subsection{Proof of Lemma~\ref{lem:qEk-multi-p}}\label{subsec:c2k-proportional}
\input{app_hard_lem-mult-p}
\subsection{Proof of Theorem~\ref{thm:mult-p-hard-result}}
\begin{proof}
For the sake of contradiction, assume we can solve our problem in $\littleo{\kmatchtime}$ time. Given a graph $G$ by \Cref{lem:pdb-for-def-qk} we can compute the \abbrPDB encoding in $\bigO{\numedge}$ time. Then after we run our algorithm on $\rpoly_G^\kElem$, we get $\rpoly_{G}^\kElem(\prob_i,\ldots, \prob_i)$ for every $0\leq i\leq 2\kElem$ in additional $\bigO{k}\cdot \littleo{\kmatchtime}$ time. \Cref{lem:qEk-multi-p} then computes the number of $k$-matchings in $G$ in $O(\kElem^3)$ time. Adding the runtime of all of these steps, we have an algorithm for computing the number of $k$-matchings that runs in time
\begin{align}
&\bigO{\numedge} + \bigO{k}\cdot \littleo{\kmatchtime} + O(\kElem^3)\label{eq:proof-omega-kmatch2}\\
&\le \littleo{\kmatchtime}\label{eq:proof-omega-kmatch4}.
\end{align}
We obtain \Cref{eq:proof-omega-kmatch4} from the facts that $k$ is fixed (related to $m$) and the assumption that $\kmatchtime\ge\omega(m)$.
Thus we obtain the contradiction that we can achieve a runtime $\littleo{\kmatchtime}$ that is better than the optimal time $\kmatchtime$ required to compute $k$-matchings.
\qed
\end{proof}
\subsection{Subgraph Notation and $O(1)$ Closed Formulas}
\input{app_hard_notation-easy-counts}
\subsection{Proofs of \Cref{eq:1e}-\Cref{eq:3p-3tri}}
\label{app:easy-counts}
The proofs for \Cref{eq:1e}, \Cref{eq:2p} and \Cref{eq:3s} are immediate.
\begin{proof}[Proof of \Cref{eq:2m}]
For edge $(i, j)$ connecting arbitrary vertices $i$ and $j$, finding all other edges in $G$ disjoint to $(i, j)$ is equivalent to finding all edges that are not connected to either vertex $i$ or $j$. The number of such edges is $m - d_i - d_j + 1$, where we add $1$ since edge $(i, j)$ is removed twice when subtracting both $d_i$ and $d_j$. Since the summation is iterating over all edges such that a pair $\left((i, j), (k, \ell)\right)$ will also be counted as $\left((k, \ell), (i, j)\right)$, division by $2$ then eliminates this double counting. Note that $m$ and $d_i$ for all $i \in V$ can be computed in one pass over the set of edges by simply maintaining counts for each quantity. Finally, the summation is also one traversal through the set of edges where each operation is either a lookup ($O(1)$ time) or an addition operation (also $O(1)$) time.
\qed
\end{proof}
\begin{proof}[Proof of \Cref{eq:2pd-3d}]
\Cref{eq:2pd-3d} is true for similar reasons. For edge $(i, j)$, it is necessary to find two additional edges, disjoint or connected. As in our argument for \Cref{eq:2m}, once the number of edges disjoint to $(i, j)$ have been computed, then we only need to consider all possible combinations of two edges from the set of disjoint edges, since it doesn't matter if the two edges are connected or not. Note, the factor $3$ of $\threedis$ is necessary to account for the triple counting of $3$-matchings, since it is indistinguishable to the closed form expression which of the remaining edges are either disjoint or connected to each of the edges in the {\emph{initial}} set of edges disjoint to the edge under consideration. Observe that the disjoint case will be counted $3$ times since each edge of a $3$-path is visited once, and the same $3$-path counted in each visitation. For the latter case however, it is true that since the two path in $\twopathdis$ is connected, there will be no multiple counting by the fact that the summation automatically disconnects the current edge, meaning that a two matching at the current vertex under consideration will not be counted. Thus, $\twopathdis$ will only be counted once, precisely when the single disjoint edge is visited in the summation. The sum over all such edge combinations is precisely then $\numocc{G}{\twopathdis} + 3\numocc{G}{\threedis}$. Note that all factorials can be computed in $O(m)$ time, and then each combination $\binom{n}{2}$ can be performed with constant time operations, yielding the claimed $O(m)$ run time.
\qed
\end{proof}
\begin{proof}[Proof of \Cref{eq:3p-3tri}]
To compute $\numocc{G}{\threepath}$, note that for an arbitrary edge $(i, j)$, a 3-path exists for edge pair $(i, \ell)$ and $(j, k)$ where $i, j, k, \ell$ are distinct. Further, the quantity $(d_i - 1) \cdot (d_j - 1)$ represents the number of 3-edge subgraphs with middle edge $(i, j)$ and outer edges $(i, \ell), (j, k)$ such that $\ell \neq j$ and $k \neq i$. When $k = \ell$, the resulting subgraph is a triangle, and when $k \neq \ell$, the subgraph is a 3-path. Summing over all edges (i, j) gives \Cref{eq:3p-3tri} by observing that each triangle is counted thrice, while each 3-path is counted just once. For reasons similar to \Cref{eq:2m}, all $d_i$ can be computed in $O(m)$ time and each summand can then be computed in $O(1)$ time, yielding an overall $O(m)$ run time.
\qed
\end{proof}
\input{app_hard_single-p-proof-defs}
\subsection{Proofs for \Cref{lem:3m-G2}, \Cref{lem:tri}, and \Cref{lem:lin-sys}}\label{subsec:proofs-struc-lemmas}
Before proceeding, let us introduce a few more helpful definitions.
\begin{Definition}[$\esetType{\ell}$]\label{def:ed-nota}
For $\ell > 1$, we use $\esetType{\ell}$ to denote the set of edges in $\graph{\ell}$. For any graph $\graph{\ell}$, its edges are denoted by a pair $(e, b)$, such that $b \in \{0,\ldots, \ell-1\}$ where $(e,0),\dots,(e,\ell-1)$ is the $\ell$-path that replaces the edge $e$ for $e\in \esetType{1}$.
\end{Definition}
\begin{Definition}[$\eset{\ell}$]
Given an arbitrary subgraph $\sg{1}$ of $\graph{1}$, let $\eset{1}$ denote the set of edges in $\sg{1}$. Define then $\eset{\ell}$ for $\ell > 1$ as the set of edges in the generated subgraph $\sg{\ell}$ (i.e. when we apply \Cref{def:Gk} to $S$ to generate $\sg{\ell}$).
\end{Definition}
For example, consider $\sg{1}$ with edges $\eset{1} = \{e_1\}$. Then the edge set of $\sg{2}$ is defined as $\eset{2} = \{(e_1, 0), (e_1, 1)\}$.
\begin{Definition}[$\binom{\edgeSet}{t}$ and $\binom{\edgeSet}{\leq t}$]\label{def:ed-sub}
Let $\binom{E}{t}$ denote the set of subsets in $E$ with exactly $t$ edges. In a similar manner, $\binom{E}{\leq t}$ is used to mean the subsets of $E$ with $t$ or fewer edges.
\end{Definition}
The following function $f_\ell$ is a mapping from every $3$-edge shape in $\graph{\ell}$ to its `projection' in $\graph{1}$.
\begin{Definition}\label{def:fk}
Let $f_\ell: \binom{\esetType{\ell}}{3} \rightarrow \binom{\esetType{1}}{\leq3}$ be defined as follows. For any element $s \in \binom{\esetType{\ell}}{3}$ such that $s = \pbrace{(e_1, b_1), (e_2, b_2), (e_3, b_3)}$, define:
\[ f_\ell\left(\pbrace{(e_1, b_1), (e_2, b_2), (e_3, b_3)}\right) = \pbrace{e_1, e_2, e_3}.\]
\end{Definition}
\begin{Definition}[$f_\ell^{-1}$]\label{def:fk-inv}
For an arbitrary subgraph $\sg{1}$ of $\graph{1}$ with at most $m \leq 3$ edges, the inverse function $f_\ell^{-1}: \binom{\esetType{1}}{\leq 3}\rightarrow 2^{\binom{\esetType{\ell}}{3}}$ takes $\eset{1}$ and outputs the set of all elements $s \in \binom{\eset{\ell}}{3}$ such that
$f_\ell(s) = \eset{1}$.
\end{Definition}
Note, importantly, that when we discuss $f_\ell^{-1}$, that each \textit{edge} present in $\eset{1}$ must have an edge in $s\in f_\ell^{-1}(\eset{1})$ that projects down to it. In particular, if $|\eset{1}| = 3$, then it must be the case that each $s\in f_\ell^{-1}(\eset{1})$ consists of the following set of edges: $\{ (e_i, b), (e_j, b'), (e_m, b'') \}$, where $i,j$ and $m$ are distinct.
We are now ready to prove the structural lemmas.
To prove the structural lemmas, we count the number of occurrences of $\tri$ and $\threedis$ in $\graph{\ell}$ by counting, for each $S\in\binom{E_1}{\le 3}$, how many $\threedis$ and $\tri$ subgraphs appear in $f_\ell^{-1}(\eset{1})$.
\subsubsection{Proof of Lemma \ref{lem:3m-G2}}
\begin{proof}
For each subset $\eset{1}\in \binom{E_1}{\le 3}$, we count the number of {\emph{$3$-matchings }}in the $3$-edge subgraphs of $\graph{2}$ in $f_2^{-1}(\eset{1})$. We first consider the case of $\eset{1} \in \binom{E_1}{3}$, where $\eset{1}$ is composed of the edges $e_1, e_2, e_3$ and $f_2^{-1}(\eset{1})$ is the set of all $3$-edge subsets $s \in \{(e_1, 0), (e_1, 1), (e_2, 0), (e_2, 1),$ $(e_3, 0), (e_3, 1)\}$ such that $f_\ell(s) = \{e_1, e_2, e_3\}$. The size of the output is denoted $\abs{f_2^{-1}(\eset{1})}$. For the case where each set of edges of the form $\{(e_1, b_1), (e_2, b_2), (e_3, b_3)\}$ for $b_i \in [2], i \in [3]$ is present, we have $\abs{f_2^{-1}(\eset{1})} = 8$. We count the number of $3$-matchings from the set $f_2^{-1}(\eset{1})$.
We do a case analysis based on the subgraph $\sg{1}$ induced by $\eset{1}$.
\begin{itemize}
\item $3$-matching ($\threedis$)
\end{itemize}
When $\sg{1}$ is isomorphic to $\threedis$, it is the case that edges in $\eset{2}$ are {\em not} disjoint only for the pairs $(e_i, 0), (e_i, 1)$ for $i\in \{1,2,3\}$. By definition, each set of edges in $f_2^{-1}\inparen{\eset{1}}$ is a $3$-matching, yielding $\abs{f_2^{-1}\inparen{\eset{1}}} = 8$ possible $3$-matchings.
\begin{itemize}
\item Disjoint Two-Path ($\twopathdis$)
\end{itemize}
For $\sg{1}$ isomorphic to $\twopathdis$ edges $e_2, e_3$ form a $2$-path with $e_1$ being disjoint. This means that in $\sg{2}$ edges $(e_2, 0), (e_2, 1), (e_3, 0), (e_3, 1)$ form a $4$-path while $(e_1, 0), (e_1, 1)$ is its own disjoint $2$-path. We can pick either $(e_1, 0)$ or $(e_1, 1)$ for the first edge in the $3$-matching, while it is necessary to have a $2$-matching from path $(e_2, 0),\ldots, (e_3, 1)$. Note that the $4$-path allows for three possible $2$-matchings, specifically,
\begin{equation*}
\pbrace{(e_2, 0), (e_3, 0)}, \pbrace{(e_2, 0), (e_3, 1)}, \pbrace{(e_2, 1), (e_3, 1)}.
\end{equation*}
Since these two selections can be made independently, $\abs{f_2^{-1}\inparen{\eset{1}}} = 2 \cdot 3 = 6$ \emph{distinct} $3$-matchings in $f_2^{-1}(\eset{1})$.
\begin{itemize}
\item $3$-star ($\oneint$)
\end{itemize}
When $\sg{1}$ is isomorphic to $\oneint$, the inner edges $(e_i, 1)$ of $\sg{2}$ are all connected, and the outer edges $(e_i, 0)$ are all disjoint. Note that for a valid $3$-matching it must be the case that at most one inner edge can be part of the set of disjoint edges. For the case of when exactly one inner edge is chosen, there exist $3$ possibilities, based on which inner edge is chosen. Note that if $(e_i, 1)$ is chosen, the matching has to choose $(e_j, 0)$ for $j \neq i$ and $(e_{j'}, 0)$ for $j' \neq i, j' \neq j$. The remaining possible 3-matching occurs when all 3 outer edges are chosen, and $\abs{f_2^{-1}\inparen{\eset{1}}} = 4$.
\begin{itemize}
\item $3$-path ($\threepath$)
\end{itemize}
When $\sg{1}$ is isomorphic to $\threepath$ it is the case that all edges beginning with $e_1$ and ending with $e_3$ are successively connected. This means that the edges of $\eset{2}$ form a $6$-path. For a $3$-matching to exist in $f_2^{-1}(\eset{1})$, we cannot pick both $(e_i,0)$ and $(e_i,1)$ or both $(e_i, 1)$ and $(e_j, 0)$ where $j = i + 1$.
There are four such possibilities: $\pbrace{(e_1, 0), (e_2, 0), (e_3, 0)}$, $\pbrace{(e_1, 0), (e_2, 0), (e_3, 1)}$, $\pbrace{(e_1, 0), (e_2, 1), (e_3, 1)},$ $\pbrace{(e_1, 1), (e_2, 1), (e_3, 1)}$ and $\abs{f^{-1}_2\inparen{\eset{1}}} = 4.$
\begin{itemize}
\item Triangle ($\tri$)
\end{itemize}
For $\sg{1}$ isomorphic to $\tri$, note that it is the case that the edges in $\eset{2}$ are connected in a successive manner, but this time in a cycle, such that $(e_1, 0)$ and $(e_3, 1)$ are also connected. While this is similar to the discussion of the three path above, the first and last edges are not disjoint.
This rules out both subsets of $(e_1, 0), (e_2, 0), (e_3, 1)$ and $(e_1, 0), (e_2, 1), (e_3, 1)$, so that $\abs{f_2^{-1}\inparen{\eset{1}}} = 2$.
\noindent Let us now consider when $\eset{1} \in \binom{E_1}{\leq 2}$, i.e. fixed subgraphs among
\begin{itemize}
\item $2$-matching ($\twodis$), $2$-path ($\twopath$), $1$ edge ($\ed$)
\end{itemize}
When $|\eset{1}| = 2$, we can only pick one from each of two pairs, $\pbrace{(e_1, 0), (e_1, 1)}$ and $\pbrace{(e_2, 0), (e_2, 1)}$. The third edge choice in $\eset{2}$ will break the disjoint property of a $3$-matching. Thus, a $3$-matching cannot exist in $f_2^{-1}(\eset{1})$. A similar argument holds for $|\eset{1}| = 1$, where the output of $f_2^{-1}$ is $\{\emptyset\}$ since there are not enough edges in the input to produce any other output.
Observe that all of the arguments above focused solely on the isomorphism type of subgraph $\sg{1}$. In other words, all $\eset{1}$ of a given ``shape'' yield the same number of $3$-matchings in $f_2^{-1}(\eset{1})$, and this is why we get the required identity using the above case analysis.
\qed
\end{proof}
\subsubsection{Proof of \Cref{lem:tri}}
\begin{proof}
The number of triangles in $\graph{\ell}$ for $\ell \geq 2$ will always be $0$ for the simple fact that all cycles in $\graph{\ell}$ will have at least six edges.
\qed
\end{proof}
\subsubsection{Proof of \Cref{lem:lin-sys}}
\input{app_hard_linsys}

View File

@ -0,0 +1,16 @@
%!TEX root=./main.tex
We can use $\semK$-relations to model bags. A \emph{$\semK$-relation}~\cite{DBLP:conf/pods/GreenKT07} is a relation whose tuples are annotated with elements from a commutative semiring $\semK = \inset{\domK, \addK, \multK, \zeroK, \oneK}$. A commutative semiring is a structure with a domain $\domK$ and associative and commutative binary operations $\addK$ and $\multK$ such that $\multK$ distributes over $\addK$, $\zeroK$ is the identity of $\addK$, $\oneK$ is the identity of $\multK$, and $\zeroK$ annihilates all elements of $\domK$ when combined by $\multK$.
Let $\udom$ be a countable domain of values.
Formally, an $n$-ary $\semK$-relation $\rel$ over $\udom$ is a function $\rel: \udom^n \to \domK$ with finite support $\support{\rel} = \{ \tup \mid \rel(\tup) \neq \zeroK \}$. A $\semK$-database is defined similarly, where we view the $\semK$-database (relation) as a function mapping tuples to their respective annotations.
$\raPlus$ query semantics over $\semK$-relations are analogous to the lineage construction semantics of \Cref{fig:nxDBSemantics}, with the exception of replacing $+$ with $\addK$ and $\cdot$ with $\multK$.
Consider the semiring $\semN = \inset{\domN,+,\times,0,1}$ of natural numbers. $\semN$-databases model bag semantics by annotating each tuple with its multiplicity. A probabilistic $\semN$-database ($\semN$-PDB) is a PDB where each possible world is an $\semN$-database. We study the problem of computing statistical moments for query results over such databases. Given an $\semN$-\abbrPDB $\pdb = (\idb, \pd)$, ($\raPlus$) query $\query$, and possible result tuple $\tup$, we sum $\query(\db)(\tup)\cdot\pd\inparen{\db}$ for all $\db \in \idb$ to compute the expected multiplicity of $\tup$. Intuitively, the expectation of $\query(\db)(t)$ is the number of duplicates of $t$ we expect to find in the result of query $\query$.
Let $\semNX$ denote the set of polynomials over variables $\vct{X}=(X_1,\dots,X_n)$ with natural number coefficients and exponents.
Consider now the semiring (abusing notation) $\semNX = \inset{\semNX, +, \cdot, 0, 1}$ whose domain is $\semNX$, with the standard addition and multiplication of polynomials.
We define an \abbrNXPDB $\pxdb$ as the tuple $(\db_{\semNX}, \pd)$, where $\semNX$-database $\db_{\semNX}$ is paired with the probability distribution $\pd$ across the set of possible worlds \emph{represented} by $\db_{\semNX}$, i.e. the one induced from $\mathcal{P}_{\semNX}$, the probability distribution over $\vct{X}$. Note that the notation is slightly abused since the first element of the pair is an encoded set of possible worlds, i.e. $\db_{\semNX}$ is the \dbbaseName.
We denote by $\nxpolyqdt$ the annotation of tuple $t$ in the result of $\query(\db_{\semNX})(t)$, and as before, interpret it as a function $\nxpolyqdt: \{0,1\}^{|\vct X|} \rightarrow \semN$ from vectors of variable assignments to the corresponding value of the annotating polynomial.
\abbrNXPDB\xplural and a function $\rmod$ (which transforms an \abbrNXPDB to an equivalent $\semN$-PDB) are both formalized next.

View File

@ -0,0 +1,156 @@
%root: main.tex
%!TEX root=./main.tex
To justify the use of $\semNX$-databases, we need to show that we can encode any $\semN$-PDB in this way and that the query semantics over this representation coincides with query semantics over its respective $\semN$-PDB. For that it will be opportune to define representation systems for $\semN$-PDBs.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Definition}[Representation System]\label{def:representation-syste}
A representation system for $\semN$-PDBs is a tuple $(\reprs, \rmod)$ where $\reprs$ is a set of representations and $\rmod$ associates with each $\repr \in \reprs$ an $\semN$-PDB $\pdb$. We say that a representation system is \emph{closed} under a class of queries $\qClass$ if for any query $\query \in \qClass$ and $\repr \in \reprs$ we have:
%
\[ \rmod(\query(\repr)) = \query(\rmod(\repr)) \]
A representation system is \emph{complete} if for every $\semN$-PDB $\pdb$ there exists $\repr \in \reprs$ such that:
%
\[ \rmod(\repr) = \pdb \]
\end{Definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
As mentioned above we will use $\semNX$-databases paired with a probability distribution as a representation system, referring to such databases as \abbrNXPDB\xplural.
Given \abbrNXPDB $\pxdb$, one can think of $\pd$ as the probability distribution across all worlds $\inset{0, 1}^\numvar$. Denote a particular world to be $\vct{w}$. For convenience let $\assign_\vct{w}: \pxdb\rightarrow\pndb$ be a function that computes the corresponding $\semN$-\abbrPDB upon assigning all values $w_i \in \vct{w}$ to $X_i \in \vct{X}$ of $\db_{\semNX}$. Note the one-to-one correspondence between elements $\vct{w}\in\inset{0, 1}^\numvar$ and the worlds encoded by $\db_{\semNX}$ when $\vct{w}$ is assigned to $\vct{X}$ (assuming a domain of $\inset{0, 1}$ for each $X_i$).
We can think of $\assign_\vct{w}(\pxdb)\inparen{\tup}$ as the semiring homomorphism $\semNX \to \semN$ that applies the assignment $\vct{w}$ to all variables $\vct{X}$ of a polynomial and evaluates the resulting expression in $\semN$.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Definition}[$\rmod\inparen{\pxdb}$]\label{def:semnx-pdbs}
Given an \abbrNXPDB $\pxdb$, we compute its equivalent $\semN$-\abbrPDB $\pndb = \rmod\inparen{\pxdb} = \inparen{\idb, \pd'}$ as:
\begin{align*}
\idb & = \{ \assign_{\vct{w}}(\pxdb) \mid \vct{w} \in \{0,1\}^n \} \\
\forall \db \in \idb: \probOf(\db) & = \sum_{\vct{w} \in \{0,1\}^n: \assign_{\vct{w}}(\pxdb) = \db} \probOf(\vct{w})
\end{align*}
\end{Definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
For instance, consider a $\pxdb$ consisting of a single tuple $\tup_1 = (1)$ annotated with $X_1 + X_2$ with probability distribution $\probOf([0,0]) = 0$, $\probOf([0,1]) = 0$, $\probOf([1,0]) = 0.3$ and $\probOf([1,1]) = 0.7$. This \abbrNXPDB encodes two possible worlds (with non-zero probability) that we denote using their world vectors.
%
\[
D_{[0,1]}(\tup_1) = 1 \hspace{0.3cm} \text{and} \hspace{0.3cm} D_{[1,1]}(\tup_1) = 2
\]
%
Importantly, as the following proposition shows, any finite $\semN$-PDB can be encoded as an \abbrNXPDB and \abbrNXPDB\xplural are closed under $\raPlus$~\cite{DBLP:conf/pods/GreenKT07}.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Proposition}\label{prop:semnx-pdbs-are-a-}
\abbrNXPDB\xplural are a complete representation system for $\semN$-PDBs that is closed under $\raPlus$ queries.
\end{Proposition}
\begin{proof}
To prove that \abbrNXPDB\xplural are complete consider the following construction that for any $\semN$-PDB $\pdb = (\idb, \pd)$ produces an \abbrNXPDB $\pxdb = (\db_{\semNX}, \pd')$ such that $\rmod(\pxdb) = \pdb$. Let $\idb = \{D_1, \ldots, D_{\abs{\idb}}\}.$
For each world $D_i$ we create a corresponding variable $X_i$.
In $\db_{\semNX}$ we assign each tuple $\tup$ the polynomial:
%
\[
\db_{\semNX}(\tup) = \sum_{i=1}^{\abs{\idb}} D_i(\tup)\cdot X_{i}
\]
The probability distribution $\pd'$ assigns all world vectors zero probability except for $\abs{\idb}$ world vectors (representing the possible worlds) $\vct{w}_i$. All elements of $\vct{w}_i$ are zero except for the position corresponding to variables $X_{i}$ which is set to $1$. Unfolding definitions it is trivial to show that $\rmod(\pxdb) = \pdb$. Thus, \abbrNXPDB\xplural are a complete representation system.
Since $\semNX$ is the free object in the variety of semirings, Birkhoff's HSP theorem implies that any assignment $\vct{X} \to \semN$, which includes as a special case the assignments $\assign_{\vct{w}}$ used here, uniquely extends to the semiring homomorphism alluded to above, $\assign_\vct{w}\inparen{\pxdb}\inparen{\tup}: \semNX \to \semN$. Given a polynomial, $\assign_\vct{w}$ substitutes variables based on $\vct{w}$ and then evaluates the resulting expression in $\semN$. For instance, consider the polynomial $\pxdb\inparen{\tup} = \poly = X + Y$ and assignment $\vct{w} := X = 0, Y=1$. We get $\assign_\vct{w}\inparen{\pxdb}\inparen{\tup} = 0 + 1 = 1$.
Closure under $\raPlus$ queries follows from this and from \cite{DBLP:conf/pods/GreenKT07}'s Proposition 3.5, which states that semiring homomorphisms commute with queries over $\semK$-relations.
\qed
\end{proof}
\subsection{\tis and \bis in the \abbrNXPDB model}\label{subsec:supp-mat-ti-bi-def}
Two important subclasses of \abbrNXPDB\xplural that are of interest to us are the bag versions of tuple-independent databases (\tis) and block-independent databases (\bis). Under set semantics, a \ti is a deterministic database $\db$ where each tuple $\tup$ is assigned a probability $\prob_\tup$. The set of possible worlds represented by a \ti $\db$ is all subsets of $\db$. The probability of each world is the product of the probabilities of all tuples that exist with one minus the probability of all tuples of $\db$ that are not part of this world, i.e., tuples are treated as independent random events. In a \bi, we also assign each tuple a probability, but additionally partition $\db$ into blocks. The possible worlds of a \bi $\db$ are all subsets of $\db$ that contain at most one tuple from each block. Note then that the tuples sharing the same block are disjoint, and the sum of the probabilities of all the tuples in the same block $\block$ is at most $1$.
The probability of such a world is the product of the probabilities of all tuples present in the world.
For bag \tis and \bis, we define the probability of a tuple to be the probability that the tuple exists with multiplicity at least $1$.
In this work, we define \tis and \bis as subclasses of \abbrNXPDB\xplural defined over variables $\vct{X}$ (\Cref{def:semnx-pdbs}) where $\vct{X}$ can be partitioned into blocks that satisfy the conditions of a \ti or \bi (stated formally in \Cref{subsec:tidbs-and-bidbs}).
In this work, we consider one further deviation from the standard: We use bag semantics for queries.
Even though tuples cannot occur more than once in the input \ti or \bi, they can occur with a multiplicity larger than one in the result of a query.
Since in \tis and \bis, there is a one-to-one correspondence between tuples in the database and variables, we can interpret a vector $\vct{w} \in \{0,1\}^n$ as denoting which tuples exist in the possible world $\assign_{\vct{w}}(\pxdb)$ (the ones where $w_i = 1$).
For BIDBs specifically, note that at most one of the bits corresponding to tuples in each block will be set (i.e., for any pair of bits $w_j$, $w_{j'}$ that are part of the same block $b_i \supseteq \{t_{i,j}, t_{i,j'}\}$, at most one of them will be set).
Denote by $\vct{p}$ the vector whose elements are the individual probabilities $\prob_i$ of each tuple $\tup_i$. Given \abbrPDB $\pdb$, $\pd$ is the distribution induced by $\vct{p}$, which we will denote $\pd^{\inparen{\vct{\prob}}}$.
%
\begin{align}\label{eq:tidb-expectation}
\expct_{\vct{W} \sim \pd^{(\vct{p})}}\pbox{\poly(\vct{W})}
= \sum\limits_{\substack{\vct{w} \in \{0, 1\}^\numvar\\ s.t. w_j,w_{j'} = 1 \rightarrow \not \exists b_i \supseteq \{t_{i,j}, t_{i,j'}\}}} \poly(\vct{w})\prod_{\substack{j \in [\numvar]\\ s.t. w_j = 1}}\prob_j \prod_{\substack{j \in [\numvar]\\s.t. w_j = 0}}\left(1 - \prob_j\right)
\end{align}
%
Recall that tuple blocks in a TIDB always have size 1, so the outer summation of \cref{eq:tidb-expectation} is over the full set of vectors.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Proof of~\Cref{prop:expection-of-polynom}}
\label{subsec:expectation-of-polynom-proof}
\begin{proof}
We need to prove for $\semN$-PDB $\pdb = (\idb,\pd)$ and \abbrNXPDB $\pxdb = (\db_{\semNX}',\pd')$ where $\rmod(\pxdb) = \pdb$ that $\expct_{\randDB\sim \pd}[\query(\db)(t)] = \expct_{\vct{W} \sim \pd'}\pbox{\nxpolyqdt(\vct{W})}$.
By expanding $\nxpolyqdt$ and the expectation we have:
\begin{align*}
\expct_{\vct{W} \sim \pd'}\pbox{\poly(\vct{W})}
& = \sum_{\vct{w} \in \{0,1\}^n}\probOf(\vct{w}) \cdot Q(\db_{\semNX})(t)(\vct{w})\\
\intertext{From $\rmod(\pxdb) = \pdb$, we have that the range of $\assign_{\vct{w}}(\pxdb)$ is $\idb$, so}
& = \sum_{\db \in \idb}\;\;\sum_{\vct{w} \in \{0,1\}^n : \assign_{\vct{w}}(\pxdb) = \db}\probOf(\vct{w}) \cdot Q(\db_{\semNX})(t)(\vct{w})\\
\intertext{The inner sum is only over $\vct{w}$ where $\assign_{\vct{w}}(\pxdb) = \db$ (i.e., $Q(\db_{\semNX})(t)(\vct{w}) = \query(\db)(t)$)}
& = \sum_{\db \in \idb}\;\;\sum_{\vct{w} \in \{0,1\}^n : \assign_{\vct{w}}(\pxdb) = \db}\probOf(\vct{w}) \cdot \query(\db)(t)\\
\intertext{By distributivity of $\times$ over $+$}
& = \sum_{\db \in \idb}\query(\db)(t)\sum_{\vct{w} \in \{0,1\}^n : \assign_{\vct{w}}(\pxdb) = \db}\probOf(\vct{w})\\
\intertext{From the definition of $\pd$ in \cref{def:semnx-pdbs}, given $\rmod(\pxdb) = \pdb$, we get}
& = \sum_{\db \in \idb}\query(\db)(t) \cdot \probOf(\db) \quad = \expct_{\randDB \sim \pd}[\query(\db)(t)]
\end{align*}
\qed
\end{proof}
\subsection{Proposition~\ref{proposition:q-qtilde}}\label{app:subsec-prop-q-qtilde}
\noindent Note the following fact:
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Proposition}\label{proposition:q-qtilde} For any \bi-lineage polynomial $\poly(X_1, \ldots, X_\numvar)$ and all $\vct{w}$ such that $\probOf[\vct{W} = \vct{w}] > 0$, it holds that
$% \[
\poly(\vct{w}) = \rpoly(\vct{w}).
$% \]
\end{Proposition}
\begin{proof}
Note that any $\poly$ in factorized form is equivalent to its \abbrSMB expansion. For each term in the expanded form, further note that for all $b \in \{0, 1\}$ and all $e \geq 1$, $b^e = b$.
Finally, note that there are exactly three cases where the expectation of a monomial term $\expct\left[c_{\vct{d}}\prod_{i \in [n]\; s.t.\; \vct{d}_i \geq 1}X_i\right]$ is zero:
(i) when $c_{\vct{d}} = 0$,
(ii) when $\prob_i = 0$ for some $i$ where $\vct{d}_i \geq 1$, and
(iii) when $X_i$ and $X_j$ are in the same block for some $i,j$ where $\vct{d}_i, \vct{d}_j \geq 1$.
\qed
\end{proof}
\subsection{Proof for Lemma~\ref{lem:tidb-reduce-poly}}\label{subsec:proof-exp-poly-rpoly}
\begin{proof}
Let $\poly$ be a polynomial of $\numvar$ variables with highest degree $= \hideg$, defined as follows:
\[\poly(X_1,\ldots, X_\numvar) = \sum_{\vct{d} \in \{0,\ldots, \hideg\}^\numvar}c_{\vct{d}}\cdot \prod_{\substack{i = 1\\s.t. d_i \geq 1}}^\numvar X_i^{d_i}.\]
Let the boolean function $\isInd{\cdot}$ take $\vct{d}$ as input and return true if there do not exist any dependent variables in $\vct{d}$, i.e., $\not\exists ~\block, i\neq j\suchthat d_{\block, i}, d_{\block, j} \geq 1$.\footnote{This \abbrBIDB notation is used and discussed in \cref{subsec:tidbs-and-bidbs}.}
Then in expectation we have
\begin{align}
\expct_{\vct{\randWorld}}\pbox{\poly(\vct{\randWorld})} &= \expct_{\vct{\randWorld}}\pbox{\sum_{\substack{\vct{d} \in \{0,\ldots,\hideg\}^\numvar\\\wedge~\isInd{\vct{d}}}}c_{\vct{d}}\cdot \prod_{\substack{i = 1\\s.t. d_i \geq 1}}^\numvar \randWorld_i^{d_i} + \sum_{\substack{\vct{d} \in \{0,\ldots, \hideg\}^\numvar\\\wedge ~\neg\isInd{\vct{d}}}} c_{\vct{d}}\cdot\prod_{\substack{i = 1\\s.t. d_i \geq 1}}^\numvar\randWorld_i^{d_i}}\label{p1-s1a}\\
&= \sum_{\substack{\vct{d} \in \{0,\ldots,\hideg\}^\numvar\\\wedge~\isInd{\vct{d}}}}c_{\vct{d}}\cdot \expct_{\vct{\randWorld}}\pbox{\prod_{\substack{i = 1\\s.t. d_i \geq 1}}^\numvar \randWorld_i^{d_i}} + \sum_{\substack{\vct{d} \in \{0,\ldots, \hideg\}^\numvar\\\wedge ~\neg\isInd{\vct{d}}}} c_{\vct{d}}\cdot\expct_{\vct{\randWorld}}\pbox{\prod_{\substack{i = 1\\s.t. d_i \geq 1}}^\numvar\randWorld_i^{d_i}}\label{p1-s1b}\\
&= \sum_{\substack{\vct{d} \in \{0,\ldots,\hideg\}^\numvar\\~\wedge\isInd{\vct{d}}}}c_{\vct{d}}\cdot \expct_{\vct{\randWorld}}\pbox{\prod_{\substack{i = 1\\s.t. d_i \geq 1}}^\numvar \randWorld_i^{d_i}}\label{p1-s1c}\\
&= \sum_{\substack{\vct{d} \in \{0,\ldots,\hideg\}^\numvar\\\wedge~\isInd{\vct{d}}}}c_{\vct{d}}\cdot \prod_{\substack{i = 1\\s.t. d_i \geq 1}}^\numvar \expct_{\vct{\randWorld}}\pbox{\randWorld_i^{d_i}}\label{p1-s2}\\
&= \sum_{\substack{\vct{d} \in \{0,\ldots,\hideg\}^\numvar\\\wedge~\isInd{\vct{d}}}}c_{\vct{d}}\cdot \prod_{\substack{i = 1\\s.t. d_i \geq 1}}^\numvar \expct_{\vct{\randWorld}}\pbox{\randWorld_i}\label{p1-s3}\\
&= \sum_{\substack{\vct{d} \in \{0,\ldots,\hideg\}^\numvar\\\wedge~\isInd{\vct{d}}}}c_{\vct{d}}\cdot \prod_{\substack{i = 1\\s.t. d_i \geq 1}}^\numvar \prob_i\label{p1-s4}\\
&= \rpoly(\prob_1,\ldots, \prob_\numvar).\label{p1-s5}
\end{align}
\Cref{p1-s1a} is the result of substituting in the definition of $\poly$ given above. Then we arrive at \cref{p1-s1b} by linearity of expectation. Next, \cref{p1-s1c} is the result of the independence constraint of \abbrBIDB\xplural, specifically that any monomial composed of dependent variables, i.e., variables from the same block $\block$, has a probability of $0$. \Cref{p1-s2} is obtained by the fact that all variables in each monomial are independent, which allows for the expectation to be pushed through the product. In \cref{p1-s3}, since $\randWorld_i \in \{0, 1\}$ it is the case that for any exponent $e \geq 1$, $\randWorld_i^e = \randWorld_i$. Next, in \cref{p1-s4} the expectation of a tuple is indeed its probability.
Finally, it can be verified that \Cref{p1-s5} follows since \cref{p1-s4} satisfies the construction of $\rpoly(\prob_1,\ldots, \prob_\numvar)$ in \Cref{def:reduced-poly}.
\qed
\end{proof}
\subsection{Proof For Corollary~\ref{cor:expct-sop}}
\begin{proof}
Note that~\Cref{lem:tidb-reduce-poly} shows that $\expct\pbox{\poly} =$ $\rpoly(\prob_1,\ldots, \prob_\numvar)$. Therefore, if $\poly$ is already in \abbrSMB form, one only needs to compute $\poly(\prob_1,\ldots, \prob_\numvar)$ ignoring exponent terms (note that such a polynomial is $\rpoly(\prob_1,\ldots, \prob_\numvar)$), which indeed has $\bigO{\abs{\poly}}$ computations.
\qed
\end{proof}
%%% Local Variables:
%%% mode: latex
%%% TeX-master: "main"
%%% End:

View File

@ -0,0 +1,126 @@
%root: main.tex
\subsection{$\onepass$ Remarks}
Please note that it is \textit{assumed} that the original call to \onepass consists of a call on an input circuit \circuit such that the values of members \prt, \lwght and \rwght have been initialized to Null across all gates.
\input{app_onepass_eval-notes}
\subsection{$\onepass$ Example}
\begin{Example}\label{example:one-pass}
Let $\etree$ encode the expression $(X + Y)(X - Y) + Y^2$. After one pass, \Cref{alg:one-pass-iter} would have computed the following weight distribution. For the two inputs of the sink gate $\circuit$, $\circuit.\lwght = \frac{4}{5}$ and $\circuit.\rwght = \frac{1}{5}$. Similarly, for $\stree$ denoting the left input of $\circuit_{\lchild}$, $\stree.\lwght = \stree.\rwght = \frac{1}{2}$. This is depicted in \Cref{fig:expr-tree-T-wght}.
\end{Example}
\begin{figure}[h!]
\centering
\begin{tikzpicture}[thick]
%First level
\node[tree_node] (a1) at (1, 0) {$\boldsymbol{Y}$};
\node[tree_node] (b1) at (3, 0) {$\boldsymbol{-1}$};
%Second level
\node[tree_node] (a2) at (-0.75, 0) {$\boldsymbol{X}$};
\node[tree_node] (b2) at (1.6,1.25) {$\boldsymbol{\circmult}$};
\node[tree_node] (c2) at (2.9, 1.25) {$\boldsymbol{\circmult}$};
%Third level
\node[tree_node] (a3) at (0.7, 2.5) {$\boldsymbol{\circplus}$};
\node[tree_node] (b3) at (1.6, 2.5) {$\boldsymbol{\circplus}$};
%Fourth level
\node[tree_node] (a4) at (1.5, 3.75) {$\boldsymbol{\circmult}$};
\node[tree_node] (b4) at (2.8, 4) {$\boldsymbol{\circplus}$};
\node[above right=0.15cm of b4, inner sep=0pt, font=\bfseries](labelC) {$\circuit$};
\draw[->] (a1) edge[right] node{$\frac{1}{2}$} (a3);
\draw[->] (b1) -- (b2);
\draw[->] (a1) -- (b2);
\draw[->] (a1) edge[bend left=15] (c2);
\draw[->] (a1) edge[bend right=15] (c2);
\draw[->] (a2) edge[left] node{$\frac{1}{2}$} (a3);
\draw[->] (a2) edge[below] node{$\frac{1}{2}$} (b3);
\draw[->] (b2) edge[right] node{$\frac{1}{2}$} (b3);
\draw[->] (c2) edge[right] node{$\frac{1}{5}$} (b4);
\draw[->] (a3) -- (a4);
\draw[->] (b3) -- (a4);
\draw[->] (a4) edge[above] node{$\frac{4}{5}$} (b4);
\draw[black] (b4) -- (labelC);
\end{tikzpicture}
\caption{Weights computed by $\onepass$ in \Cref{example:one-pass}.}
\label{fig:expr-tree-T-wght}
\end{figure}
\begin{algorithm}[h!]
\caption{\onepass$(\circuit)$}
\label{alg:one-pass-iter}
\begin{algorithmic}[1]
\Require \circuit: Circuit
\Ensure \circuit: Annotated Circuit
\Ensure \vari{sum} $\in \domN$
\For{\gate in \topord(\circuit)}\label{alg:one-pass-loop}\Comment{\topord($\cdot$) is the topological order of \circuit}
\If{\gate.\type $=$ \var}
\State \gate.\prt $\gets 1$\label{alg:one-pass-var}
\ElsIf{\gate.\type $=$ \tnum}
\State \gate.\prt $\gets \abs{\gate.\val}$\label{alg:one-pass-num}
\ElsIf{\gate.\type $= \circmult$}
\State \gate.\prt $\gets \gate_\linput.\prt \times \gate_\rinput.\prt$\label{alg:one-pass-mult}
\Else
\State \gate.\prt $\gets \gate_\linput.\prt + \gate_\rinput.\prt$\label{alg:one-pass-plus}
\State \gate.\lwght $\gets \frac{\gate_\linput.\prt}{\gate.\prt}$\label{alg:one-pass-lwght}
\State \gate.\rwght $\gets \frac{\gate_\rinput.\prt}{\gate.\prt}$\label{alg:one-pass-rwght}
\EndIf
\State \vari{sum} $\gets \gate.\prt$
\EndFor
\State \Return (\vari{sum}, $\circuit$)
\end{algorithmic}
\end{algorithm}
\subsection{Proof of \onepass (\Cref{lem:one-pass})}\label{sec:proof-one-pass}
\begin{proof}
We prove the correct computation of \prt, \lwght, \rwght values on \circuit by induction over the number of iterations in the topological order \topord (line~\ref{alg:one-pass-loop}) of the input circuit \circuit. \topord follows the standard definition of a topological ordering over the DAG structure of \circuit.
For the base case, we have only one gate, which by definition is a source gate and must be either \var or \tnum. In this case, as per \cref{eq:T-all-ones}, lines~\ref{alg:one-pass-var} and~\ref{alg:one-pass-num} correctly compute \circuit.\prt as $1$.
For the inductive hypothesis, assume that \onepass correctly computes \subcircuit.\prt, \subcircuit.\lwght, and \subcircuit.\rwght for all gates \gate in \circuit with $k \geq 0$ iterations over \topord.
We now prove for $k + 1$ iterations that \onepass correctly computes the \prt, \lwght, and \rwght values for each gate $\gate_\vari{i}$ in \circuit for $i \in [k + 1]$.
Gate $\gate_\vari{k + 1}$ must be the last gate in the topological ordering of all gates $\gate_\vari{i}$. When $\size(\circuit) > 1$, if $\gate_{k+1}$ is a leaf node, we are back to the base case. Otherwise $\gate_{k + 1}$ is an internal node
which requires binary input.
When $\gate_{k+1}.\type = \circplus$, then by line~\ref{alg:one-pass-plus} $\gate_{k+1}$.\prt $= \gate_{{k+1}_\lchild}$.\prt $+ \gate_{{k+1}_\rchild}$.\prt, a correct computation, as per \cref{eq:T-all-ones}. Further, lines~\ref{alg:one-pass-lwght} and~\ref{alg:one-pass-rwght} compute $\gate_{{k+1}}.\lwght = \frac{\gate_{{k+1}_\lchild}.\prt}{\gate_{{k+1}}.\prt}$ and analogously for $\gate_{{k+1}}.\rwght$. All values needed for each computation have been correctly computed by the inductive hypothesis.
When $\gate_{k+1}.\type = \circmult$, then line~\ref{alg:one-pass-mult} computes $\gate_{k+1}.\prt = \gate_{{k+1}_\lchild}.\prt \circmult \gate_{{k+1}_\rchild}.\prt$, which indeed by \cref{eq:T-all-ones} is correct. This concludes the proof of correctness.
\paragraph*{Runtime Analysis}
It is known that $\topord(G)$ is computable in linear time. There are $\size(\circuit)$ iterations, each of which takes $O\left( \multc{\log\left(\abs{\circuit}(1,\ldots, 1)\right)}{\log\inparen{\size(\circuit)}}\right)$ time. This can be seen since every number that the algorithm computes is at most $\abs{\circuit}(1,\dots,1)$. Hence, by definition each such operation takes $\multc{\log\left(\abs{\circuit}(1,\ldots, 1)\right)}{\log{\size(\circuit)}}$ time, which proves the claimed runtime.
\qed
\end{proof}
\iffalse
\paragraph*{Sufficient condition for $\abs{\circuit}(1,\ldots, 1)$ to be size $O(N)$}
For our runtime results to be relevant, it must be the case that the sum of the coefficients computed by \onepass is indeed size $O(N)$ since there are $O(\log{N})$ bits in the RAM model where $N$ is the size of the input. The size of the input here is \size(\circuit). We show that when \size$(\circuit_\linput) = N_\linput$, \size$(\circuit_\rinput) = N_\rinput$, where $N_\linput + N_\rinput \leq N$, this is indeed the case.
\begin{proof}
To prove this result, we start by proving that $\abs{\circuit}(1,\ldots, 1) \leq N^{2^k }$ for \degree(\circuit) $= k$.
For the base case, we have that \depth(\circuit) $= 0$, and there can only be one node which must contain a coefficient (or constant) of $1$. In this case, $\abs{\circuit}(1,\ldots, 1) = 1$, and \size(\circuit) $= 1$, and it is true that $\abs{\circuit}(1,\ldots, 1) = 1 \leq N^{2^k} = 1^{2^0} = 1$.
Assume for $\ell > 0$ an arbitrary circuit \circuit of $\depth(\circuit) \leq \ell$ that it is true that $\abs{\circuit}(1,\ldots, 1) \leq N^{2^k }$.
For the inductive step we consider a circuit \circuit such that $\depth(\circuit) \leq \ell + 1$. The sink can only be either a $\circmult$ or $\circplus$ gate. Consider when sink node is $\circmult$. Let $k_\linput, k_\rinput$ denote \degree($\circuit_\linput$) and \degree($\circuit_\rinput$) respectively. Note that this case does not require the constraint on $N_\linput$ or $N_\rinput$.
\begin{align}
\abs{\circuit}(1,\ldots, 1) &= \abs{\circuit_\linput}(1,\ldots, 1)\circmult \abs{\circuit_\rinput}(1,\ldots, 1) \leq (N-1)^{2^{k_\linput}} \circmult (N - 1)^{2^{k_\rinput}}\nonumber\\
&\leq (N-1)^{2^{k}-1}\label{eq:sumcoeff-times-upper}\\
&\leq N^{2^k}.\nonumber
\end{align}
We derive the upperbound of \Cref{eq:sumcoeff-times-upper} by noting that the maximum value of the LHS occurs when both the base and exponent are maximized.
For the case when the sink node is a $\circplus$ node, then we have
\begin{align}
\abs{\circuit}(1,\ldots, 1) &= \abs{\circuit_\linput}(1,\ldots, 1) \circplus \abs{\circuit_\rinput}(1,\ldots, 1) \leq
N_\linput^{2^{k_\linput}} + N_\rinput^{2^{k_\rinput}}\nonumber\\
&\leq N_\linput^{2^k } + N_\rinput\label{eq:sumcoeff-plus-upper}\\
&\leq N^{2^k}.\nonumber
\end{align}
Similar to the $\circmult$ case, \Cref{eq:sumcoeff-plus-upper} upperbounds its LHS by the fact that the maximum base and exponent combination is always greater than or equal to the sum of lower base/exponent combinations. The final equality is true given the constraint over the inputs.
Since $\abs{\circuit}(1,\ldots, 1) \leq N^{2^k}$ for all circuits such that all $\circplus$ gates share at most one gate with their sibling (across their respective subcircuits), then $\log{N^{2^k}} = 2^k \cdot \log{N}$ which for fixed $k$ yields the desired $O(\log{N})$ bits for $O(1)$ arithmetic operations.
\end{proof}
\fi

View File

@ -0,0 +1,21 @@
%root: main.tex
The evaluation of $\abs{\circuit}(1,\ldots, 1)$ can be defined recursively, as follows (where $\circuit_\linput$ and $\circuit_\rinput$ are the `left' and `right' inputs of $\circuit$ if they exist):
{\small
\begin{align}
\label{eq:T-all-ones}
\abs{\circuit}(1,\ldots, 1) = \begin{cases}
\abs{\circuit_\linput}(1,\ldots, 1) \cdot \abs{\circuit_\rinput}(1,\ldots, 1) &\textbf{if }\circuit.\type = \circmult\\
\abs{\circuit_\linput}(1,\ldots, 1) + \abs{\circuit_\rinput}(1,\ldots, 1) &\textbf{if }\circuit.\type = \circplus \\
|\circuit.\val| &\textbf{if }\circuit.\type = \tnum\\
1 &\textbf{if }\circuit.\type = \var.
\end{cases}
\end{align}
}
It turns out that for proof of \Cref{lem:sample}, we need to argue that when $\circuit.\type = +$, we indeed have
\begin{align}
\label{eq:T-weights}
\circuit.\lwght &\gets \frac{\abs{\circuit_\linput}(1,\ldots, 1)}{\abs{\circuit_\linput}(1,\ldots, 1) + \abs{\circuit_\rinput}(1,\ldots, 1)};\\
\circuit.\rwght &\gets \frac{\abs{\circuit_\rinput}(1,\ldots, 1)}{\abs{\circuit_\linput}(1,\ldots, 1)+ \abs{\circuit_\rinput}(1,\ldots, 1)}
\end{align}

View File

@ -0,0 +1,153 @@
%root: main.tex
\subsection{\sampmon Remarks}\label{subsec:sampmon-remarks}
\input{app_samp-monom_pseudo-code}
We briefly describe the top-down traversal of \sampmon. When \circuit.\type $= +$, the input to be visited is sampled from the weighted distribution precomputed by \onepass.
When a \circuit.\type$= \times$ node is visited, both inputs are visited.
The algorithm computes two properties: the set of all variable leaf nodes visited, and the product of the signs of visited coefficient leaf nodes.
%
We will assume the TreeSet data structure to maintain sets with logarithmic time insertion and linear time traversal of its elements.
While we would like to take advantage of the space efficiency gained in using a circuit \circuit instead of an expression tree \etree, we do not know that such a method exists when computing a sample of the input polynomial representation.
The efficiency gains of circuits over trees is found in the capability of circuits to only require space for each \emph{distinct} term in the compressed representation. This saves space in such polynomials containing non-distinct terms multiplied or added to each other, e.g., $x^4$. However, to avoid biased sampling, it is imperative to sample from both inputs of a multiplication gate, independently, which is indeed the approach of \sampmon.
\subsection{Proof of \sampmon (\Cref{lem:sample})}\label{sec:proof-sample-monom}
\begin{proof}
We first need to show that $\sampmon$ samples a valid monomial $\encMon$ by sampling and returning a set of variables $\monom$, such that $(\monom, \coef)$ is in $\expansion{\circuit}$ and $\encMon$ is indeed a monomial of the $\rpoly\inparen{\vct{X}}$ encoded in \circuit. We show this via induction over the depth of \circuit.
For the base case, let the depth $d$ of $\circuit$ be $0$. We have that the single gate is either a constant $\coef$ for which by line~\ref{alg:sample-num-return} we return $\{~\}$, or we have that $\circuit.\type = \var$ and $\circuit.\val = x$, and by line~\ref{alg:sample-var-return} we return $\{x\}$. By \cref{def:expand-circuit}, both cases return a valid $\monom$ for some $(\monom, \coef)$ from $\expansion{\circuit}$, and the base case is proven.
For the inductive hypothesis, assume that for $d \leq k$ for some $k \geq 0$, that it is indeed the case that $\sampmon$ returns a valid monomial.
For the inductive step, let us take a circuit $\circuit$ with $d = k + 1$. Note that each input has depth $d - 1 \leq k$, and by inductive hypothesis both of them sample a valid monomial. Then the sink can be either a $\circplus$ or $\circmult$ gate. For the case when $\circuit.\type = \circplus$, line~\ref{alg:sample-plus-bsamp} of $\sampmon$ will choose one of the inputs of the source. By inductive hypothesis it is the case that some valid monomial is being randomly sampled from each of the inputs. Then it follows when $\circuit.\type = \circplus$ that a valid monomial is sampled by $\sampmon$. When the $\circuit.\type = \circmult$, line~\ref{alg:sample-times-union} computes the set union of the monomials returned by the two inputs of the sink, and it is trivial to see by \cref{def:expand-circuit} that $\encMon$ is a valid monomial encoded by some $(\monom, \coef)$ of $\expansion{\circuit}$.
We will next prove by induction on the depth $d$ of $\circuit$ that for $(\monom,\coef) \in \expansion{\circuit}$, $\monom$ is sampled with a probability $\frac{|\coef|}{\abs{\circuit}\polyinput{1}{1}}$.
For the base case $d = 0$, by definition~\ref{def:circuit} we know that the $\size\inparen{\circuit} = 1$ and \circuit.\type$=$ \tnum or \var. For either case, the probability of the value returned is $1$ since there is only one value to sample from. When \circuit.\val $= x$, the algorithm always return the variable set $\{x\}$. When $\circuit.\type = \tnum$, \sampmon will always return $\emptyset$.
For the inductive hypothesis, assume that for $d \leq k$ and $k \geq 0$ $\sampmon$ indeed returns $\monom$ in $(\monom, \coef)$ of $\expansion{\circuit}$ with probability $\frac{|\coef|}{\abs{\circuit}\polyinput{1}{1}}$.
We prove now for $d = k + 1$ the inductive step holds. It is the case that the sink of $\circuit$ has two inputs $\circuit_\linput$ and $\circuit_\rinput$. Since $\circuit_\linput$ and $\circuit_\rinput$ are both depth $d - 1 \leq k$, by inductive hypothesis, $\sampmon$ will return $\monom_\lchild$ in $(\monom_\lchild, \coef_\lchild)$ of $\expansion{\circuit_\linput}$ and $\monom_\rchild$ in $(\monom_\rchild, \coef_\rchild)$ of $\expansion{\circuit_\rinput}$, from $\circuit_\linput$ and $\circuit_\rinput$ with probability $\frac{|\coef_\lchild|}{\abs{\circuit_\linput}\polyinput{1}{1}}$ and $\frac{|\coef_\rchild|}{\abs{\circuit_\rinput}\polyinput{1}{1}}$.
Consider the case when $\circuit.\type = \circmult$. For the term $(\monom, \coef)$ from $\expansion{\circuit}$ that is being sampled it is the case that $\monom = \monom_\lchild \cup \monom_\rchild$, where $\monom_\lchild$ is coming from $\circuit_\linput$ and $\monom_\rchild$ from $\circuit_\rinput$. The probability that \sampmon$(\circuit_{\lchild})$ returns $\monom_\lchild$ is $\frac{|\coef_{\monom_\lchild}|}{|\circuit_\linput|(1,\ldots, 1)}$ and $\frac{|\coef_{\monom_\rchild}|}{\abs{\circuit_\rinput}\polyinput{1}{1}}$ for $\monom_\rchild$. Since both $\monom_\lchild$ and $\monom_\rchild$ are sampled with independent randomness, the final probability for sample $\monom$ is then $\frac{|\coef_{\monom_\lchild}| \cdot |\coef_{\monom_\rchild}|}{|\circuit_\linput|(1,\ldots, 1) \cdot |\circuit_\rinput|(1,\ldots, 1)}$. For $(\monom, \coef)$ in $\expansion{\circuit}$, by \cref{def:expand-circuit} it is indeed the case that $|\coef| = |\coef_{\monom_\lchild}| \cdot |\coef_{\monom_\rchild}|$ and that (as shown in \cref{eq:T-all-ones}) $\abs{\circuit}(1,\ldots, 1) = |\circuit_\linput|(1,\ldots, 1) \cdot |\circuit_\rinput|(1,\ldots, 1)$, and therefore $\monom$ is sampled with correct probability $\frac{|\coef|}{\abs{\circuit}(1,\ldots, 1)}$.
For the case when $\circuit.\type = \circplus$, \sampmon ~will sample $\monom$ from one of its inputs. By inductive hypothesis we know that any $\monom_\lchild$ in $\expansion{\circuit_\linput}$ and any $\monom_\rchild$ in $\expansion{\circuit_\rinput}$ will both be sampled with correct probability $\frac{|\coef_{\monom_\lchild}|}{\abs{\circuit_{\lchild}}(1,\ldots, 1)}$ and $\frac{|\coef_{\monom_\rchild}|}{|\circuit_\rinput|(1,\ldots, 1)}$, where either $\monom_\lchild$ or $\monom_\rchild$ will equal $\monom$, depending on whether $\circuit_\linput$ or $\circuit_\rinput$ is sampled. Assume that $\monom$ is sampled from $\circuit_\linput$, and note that a symmetric argument holds for the case when $\monom$ is sampled from $\circuit_\rinput$. Notice also that the probability of choosing $\circuit_\linput$ from $\circuit$ is $\frac{\abs{\circuit_\linput}\polyinput{1}{1}}{\abs{\circuit_\linput}\polyinput{1}{1} + \abs{\circuit_\rinput}\polyinput{1}{1}}$ as computed by $\onepass$. Then, since $\sampmon$ goes top-down, and each sampling choice is independent (which follows from the randomness in the root of $\circuit$ being independent from the randomness used in its subtrees), the probability for $\monom$ to be sampled from $\circuit$ is equal to the product of the probability that $\circuit_\linput$ is sampled from $\circuit$ and $\monom$ is sampled in $\circuit_\linput$, and
\begin{align*}
&\probOf(\sampmon(\circuit) = \monom) = \\
&\probOf(\sampmon(\circuit_\linput) = \monom) \cdot \probOf(SampledChild(\circuit) = \circuit_\linput)\\
&= \frac{|\coef_\monom|}{|\circuit_\linput|(1,\ldots, 1)} \cdot \frac{\abs{\circuit_\linput}(1,\ldots, 1)}{|\circuit_\linput|(1,\ldots, 1) + |\circuit_\rinput|(1,\ldots, 1)}\\
&= \frac{|\coef_\monom|}{\abs{\circuit}(1,\ldots, 1)},
\end{align*}
and we obtain the desired result.
Lastly, we show by simple induction of the depth $d$ of \circuit that \sampmon indeed returns the correct sign value of $\coef$ in $(\monom, \coef)$.
In the base case, $\circuit.\type = \tnum$ or $\var$. For the former, \sampmon correctly returns the sign value of the gate. For the latter, \sampmon returns the correct sign of $1$, since a variable is a neutral element, and $1$ is the multiplicative identity, whose product with another sign element will not change that sign element.
For the inductive hypothesis, we assume for a circuit of depth $d \leq k$ and $k \geq 0$ that the algorithm correctly returns the sign value of $\coef$.
Similar to before, for depth $d = k + 1$, it is true that $\circuit_\linput$ and $\circuit_\rinput$ both return the correct sign of $\coef$. For the case that $\circuit.\type = \circmult$, the sign values of both inputs are multiplied, which is the correct behavior by \cref{def:expand-circuit}. When $\circuit.\type = \circplus$, only one input of $\circuit$ is sampled, and the algorithm returns the correct sign value of $\coef$ by inductive hypothesis.
\paragraph*{Run-time Analysis}
It is easy to check that except for lines~\ref{alg:sample-plus-bsamp} and~\ref{alg:sample-times-union}, all lines take $O(1)$ time. Consider an execution of \cref{alg:sample-times-union}. We note that we will be adding a given set of variables to some set at most once: since the sum of the sizes of the sets at a given level is at most $\degree(\circuit)$, each gate visited takes $O(\log{\degree(\circuit)})$ time. For \Cref{alg:sample-plus-bsamp}, note that we pick $\circuit_\linput$ with probability $\frac a{a+b}$ where $a=\circuit.\vari{Lweight}$ and $b=\circuit.\vari{Rweight}$. We can implement this step by picking a random number $r\in[a+b]$ and then checking if $r\le a$. It is easy to check that $a+b\le \abs{\circuit}(1,\dots,1)$. This means we need to add and compare $\log{\abs{\circuit}(1,\ldots, 1)}$-bit numbers, which can certainly be done in time $\multc{\log\left(\abs{\circuit}(1,\ldots, 1)\right)}{\log{\size(\circuit)}}$ (note that this is an over-estimate).
Denote \cost(\circuit) (\Cref{eq:cost-sampmon}) to be an upper bound of the number of gates visited by \sampmon. Then the runtime is $O\left(\cost(\circuit)\cdot \log{\degree(\circuit)}\cdot \multc{\log\left(\abs{\circuit}(1,\ldots, 1)\right)}{\log{\size(\circuit)}}\right)$.
We now bound the number of recursive calls in $\sampmon$ by $O\left(\left(\degree(\circuit) + 1\right)\cdot \depth(\circuit)\right)$, which by the above will prove the claimed runtime.
Let \cost$(\cdot)$ be a function that models an upper bound on the number of gates that can be visited in the run of \sampmon. We define \cost$(\cdot)$ recursively as follows.
\begin{equation}
\cost(\circuit) =
\begin{cases}
1 + \cost(\circuit_\linput) + \cost(\circuit_\rinput) & \textbf{if } \text{\circuit.\type = }\circmult\\
1 + \max\left(\cost(\circuit_\linput), \cost(\circuit_\rinput)\right) & \textbf{if } \text{\circuit.\type = \circplus}\\
1 & \textbf{otherwise}
\end{cases}\label{eq:cost-sampmon}
\end{equation}
First note that the number of gates visited in \sampmon is $\leq\cost(\circuit)$. To show that \cref{eq:cost-sampmon} upper bounds the number of nodes visited by \sampmon, note that when \sampmon visits a gate such that \circuit.\type $ =\circmult$, line~\ref{alg:sample-times-for-loop} visits each input of \circuit, as defined in (\ref{eq:cost-sampmon}). For the case when \circuit.\type $= \circplus$, line~\ref{alg:sample-plus-bsamp} visits exactly one of the input gates, which may or may not be the subcircuit with the maximum number of gates traversed, which makes \cost$(\cdot)$ an upperbound. Finally, it is trivial to see that when \circuit.\type $\in \{\var, \tnum\}$, i.e., a source gate, that only one gate is visited.
We prove the following inequality holds.
\begin{equation}
2\left(\degree(\circuit) + 1\right) \cdot \depth(\circuit) + 1 \geq \cost(\circuit)\label{eq:strict-upper-bound}
\end{equation}
Note that \cref{eq:strict-upper-bound} implies the claimed runtime. We prove \cref{eq:strict-upper-bound} for the number of gates traversed in \sampmon using induction over $\depth(\circuit)$. Recall how degree is defined in \cref{def:degree}.
For the base case $\degree(\circuit) \in \inset{0, 1}$, $\depth(\circuit) = 0$, $\cost(\circuit) = 1$, and it is trivial to see that the inequality $2\left(\degree(\circuit) + 1\right) \cdot \depth(\circuit) + 1 \geq \cost(\circuit)$ holds.
For the inductive hypothesis, we assume the bound holds for any circuit where $\ell \geq \depth(\circuit) \geq 0$.
Now consider the case when \sampmon has an arbitrary circuit \circuit input with $\depth(\circuit) = \ell + 1$. By definition \circuit.\type $\in \{\circplus, \circmult\}$. Note that since $\depth(\circuit) \geq 1$, \circuit must have input(s). Further we know that by the inductive hypothesis the inputs $\circuit_i$ for $i \in \{\linput, \rinput\}$ of the sink gate \circuit uphold the bound
\begin{equation}
2\left(\degree(\circuit_i) + 1\right)\cdot \depth(\circuit_i) + 1 \geq \cost(\circuit_i).\label{eq:ih-bound-cost}
\end{equation}
In particular, since for any $i$, \cref{eq:ih-bound-cost} holds, then it immediately follows that an inequality whose operands consist of a sum of the aforementioned inequalities must also hold. This is readily seen in the inequality of \cref{eq:times-middle} and \cref{eq:times-rhs}, where $2\inparen{\degree(\circuit_\linput) + 1}\cdot \depth(\circuit_\linput) \geq \cost(\circuit_\linput)$, likewise for $\circuit_\rinput$, and $1\geq 1$.
It is also true that $\depth(\circuit_\linput) \leq \depth(\circuit) - 1$ and $\depth(\circuit_\rinput) \leq \depth(\circuit) - 1$.
If \circuit.\type $= \circplus$, then $\degree(\circuit) = \max\left(\degree(\circuit_\linput), \degree(\circuit_\rinput)\right)$. Otherwise \circuit.\type = $\circmult$ and $\degree(\circuit) = \degree(\circuit_\linput) + \degree(\circuit_\rinput) + 1$. In either case it is true that $\depth(\circuit) = \max\inparen{\depth(\circuit_\linput), \depth(\circuit_\rinput)} + 1$.
If \circuit.\type $= \circmult$, then, by \cref{eq:cost-sampmon}, substituting values, the following should hold,
\begin{align}
&2\left(\degree(\circuit_\linput) + \degree(\circuit_\rinput) + 2\right) \cdot \left(\max(\depth(\circuit_\linput), \depth(\circuit_\rinput)) + 1\right) + 1 \label{eq:times-lhs}\\
&\qquad\geq 2\left(\degree(\circuit_\linput) + 1\right) \cdot \depth(\circuit_\linput) + 2\left(\degree(\circuit_\rinput) + 1\right)\cdot \depth(\circuit_\rinput) + 3\label{eq:times-middle} \\
&\qquad\geq 1 + \cost(\circuit_\linput) + \cost(\circuit_\rinput) = \cost(\circuit) \label{eq:times-rhs}.
\end{align}
To prove (\ref{eq:times-middle}), first, \cref{eq:times-lhs} expands to,
\begin{equation}
2\degree(\circuit_\linput)\cdot\depth_{\max} + 2\degree(\circuit_\rinput)\cdot\depth_{\max} + 4\depth_{\max} + 2\degree(\circuit_\linput) + 2\degree(\circuit_\rinput) + 4 + 1\label{eq:times-lhs-expanded}
\end{equation}
where $\depth_{\max}$ is used to denote the maximum depth of the two input subcircuits. \Cref{eq:times-middle} expands to
\begin{equation}
2\degree(\circuit_\linput)\cdot\depth(\circuit_\linput) + 2\depth(\circuit_\linput) + 2\degree(\circuit_\rinput)\cdot\depth(\circuit_\rinput) + 2\depth(\circuit_\rinput) + 3\label{eq:times-middle-expanded}
\end{equation}
Putting \Cref{eq:times-lhs-expanded} and \Cref{eq:times-middle-expanded} together we get
\begin{align}
&2\degree(\circuit_\linput)\cdot\depth_{\max} + 2\degree(\circuit_\rinput)\cdot\depth_{\max} + 4\depth_{\max} + 2\degree(\circuit_\linput) + 2\degree(\circuit_\rinput) + 5\nonumber\\
&\qquad\geq 2\degree(\circuit_\linput)\cdot\depth(\circuit_\linput) + 2\degree(\circuit_\rinput)\cdot\depth(\circuit_\rinput) + 2\depth(\circuit_\linput) + 2\depth(\circuit_\rinput) + 3\label{eq:times-lhs-middle}
\end{align}
Since the following is always true,
\begin{align*}
&2\degree(\circuit_\linput)\cdot\depth_{\max} + 2\degree(\circuit_\rinput)\cdot\depth_{\max} + 4\depth_{\max} + 5\\
&\qquad \geq 2\degree(\circuit_\linput)\cdot\depth(\circuit_\linput) + 2\degree(\circuit_\rinput)\cdot\depth(\circuit_\rinput) + 2\depth(\circuit_\linput) + 2\depth(\circuit_\rinput) + 3,
\end{align*}
then it is the case that \Cref{eq:times-lhs-middle} is \emph{always} true.
We now justify (\ref{eq:times-rhs}), which holds for the following reasons. First, \cref{eq:times-rhs}
is the result of \Cref{eq:cost-sampmon} when $\circuit.\type = \circmult$. \Cref{eq:times-middle}
is then produced by substituting the upperbound of (\ref{eq:ih-bound-cost}) for each $\cost(\circuit_i)$, trivially establishing the upper bound of (\ref{eq:times-rhs}). This proves \cref{eq:strict-upper-bound} for the $\circmult$ case.
For the case when \circuit.\type $= \circplus$, substituting values yields
\begin{align}
&2\left(\max(\degree(\circuit_\linput), \degree(\circuit_\rinput)) + 1\right) \cdot \left(\max(\depth(\circuit_\linput), \depth(\circuit_\rinput)) + 1\right) +1\label{eq:plus-lhs-inequality}\\
&\qquad \geq \max\left(2\left(\degree(\circuit_\linput) + 1\right) \cdot \depth(\circuit_\linput) + 1, 2\left(\degree(\circuit_\rinput) + 1\right) \cdot \depth(\circuit_\rinput) +1\right) + 1\label{eq:plus-middle}\\
&\qquad \geq 1 + \max(\cost(\circuit_\linput), \cost(\circuit_\rinput)) = \cost(\circuit)\label{eq:plus-rhs}
\end{align}
To prove (\ref{eq:plus-middle}), \cref{eq:plus-lhs-inequality} expands to
\begin{equation}
2\degree_{\max}\depth_{\max} + 2\degree_{\max} + 2\depth_{\max} + 2 + 1.\label{eq:plus-lhs-expanded}
\end{equation}
Since $\degree_{\max} \cdot \depth_{\max} \geq \degree(\circuit_i)\cdot \depth(\circuit_i),$ the following upper bound holds for the expansion of \cref{eq:plus-middle}:
\begin{equation}
2\degree_{\max}\depth_{\max} + 2\depth_{\max} + 2
\label{eq:plus-middle-expanded}
\end{equation}
Putting it together we obtain the following for (\ref{eq:plus-middle}):
\begin{align}
&2\degree_{\max}\depth_{\max} + 2\degree_{\max} + 2\depth_{\max} + 3\nonumber\\
&\qquad \geq 2\degree_{\max}\depth_{\max} + 2\depth_{\max} + 2, \label{eq:plus-upper-bound-final}
\end{align}
where it can be readily seen that the inequality stands and (\ref{eq:plus-upper-bound-final}) follows. This proves (\ref{eq:plus-middle}).
Similar to the case of $\circuit.\type = \circmult$, (\ref{eq:plus-rhs}) follows by equations $(\ref{eq:cost-sampmon})$ and $(\ref{eq:ih-bound-cost})$.
This proves (\ref{eq:strict-upper-bound}) as desired.
\qed
\end{proof}

View File

@ -0,0 +1,29 @@
%root: main.tex
\begin{algorithm}[t]
\caption{\sampmon(\circuit)}
\label{alg:sample}
\begin{algorithmic}[1]
\Require \circuit: Circuit
\Ensure \vari{vars}: TreeSet
\Ensure \vari{sgn} $\in \{-1, 1\}$
\Comment{\Cref{alg:one-pass-iter} should have been run before this one}
\State $\vari{vars} \gets \emptyset$ \label{alg:sample-global1}
\If{$\circuit.\type = +$}\Comment{Sample at every $+$ node}
\State $\circuit_{\vari{samp}} \gets$ Sample from left input ($\circuit_{\linput}$) and right input ($\circuit_{\rinput}$) w.p. $\circuit.\vari{Lweight}$ and $\circuit.\vari{Rweight}$. \label{alg:sample-plus-bsamp} \Comment{Each call to \sampmon uses fresh randomness}
\State $(\vari{v}, \vari{s}) \gets \sampmon(\circuit_{\vari{samp}})$\label{alg:sample-plus-traversal}
\State $\Return ~(\vari{v}, \vari{s})$
\ElsIf{$\circuit.\type = \times$}\Comment{Multiply the sampled values of all inputs}
\State $\vari{sgn} \gets 1$\label{alg:sample-global2}
\For {$input$ in $\circuit.\vari{input}$}\label{alg:sample-times-for-loop}
\State $(\vari{v}, \vari{s}) \gets \sampmon(input)$
\State $\vari{vars} \gets \vari{vars} \cup \{\vari{v}\}$\label{alg:sample-times-union}
\State $\vari{sgn} \gets \vari{sgn} \times \vari{s}$\label{alg:sample-times-product}
\EndFor
\State $\Return ~(\vari{vars}, \vari{sgn})$
\ElsIf{$\circuit.\type = \tnum$}\Comment{The leaf is a coefficient}
\State $\Return ~\left(\{\}, \func{sgn}(\circuit.\val)\right)$\label{alg:sample-num-return}\Comment{$\func{sgn}(\cdot)$ outputs $1$ for \circuit.\val $\geq 1$ and $-1$ for \circuit.\val $\leq -1$}
\ElsIf{$\circuit.\type = \var$}
\State $\Return~\left(\{\circuit.\val\}, 1\right) $\label{alg:sample-var-return}
\EndIf
\end{algorithmic}
\end{algorithm}

View File

@ -0,0 +1,31 @@
\section{Generalizing Beyond Set Inputs}
\label{sec:gener-results-beyond}
\subsection{\abbrTIDB{}s}
\label{sec:abbrtidbs}
In our definition of \abbrTIDBs (\Cref{subsec:tidbs-and-bidbs}), we assumed a model of \abbrTIDBs where each input tuple is assigned a probability $p$ of having multiplicity $1$. That is, we assumed inputs to be sets, but interpret queries under bag semantics. Other sensible generalizations of \abbrTIDBs from set semantics to bag semantics also exist.
One very natural such generalization is to assign each input tuple $\tup$ a multiplicity $m_\tup$ and probability $p$: the tuple has probability $p$ to exist with multiplicity $m_\tup$, and otherwise has multiplicity $0$. If the maximal multiplicity of all input tuples in the \abbrTIDB is bounded by some constant, then a generalization of our hardness results and approximation algorithm can be achieved by changing the construction of lineage polynomials (in \Cref{fig:nxDBSemantics}) as follows (all other cases remain the same as in \cref{fig:nxDBSemantics}):
\begin{align*}
\polyqdt{\rel}{\dbbase}{\tup} =&\begin{cases}
m_\tup X_\tup & \text{if }\dbbase.\rel\inparen{\tup} = m_\tup \\
0 &\text{otherwise.}\end{cases}
\end{align*}
That is, the variable representing a tuple is multiplied by $m_\tup$ to encode the tuple's multiplicity $m_\tup$. We note that our lower bounds still hold for this model since we only need $m_\tup=1$ for all tuples $\tup$. Further, it can be argued that our proofs (as is) for approximation algorithms also work for this model. The only change is that since we now allow $m_\tup>1$ some of the constants in the runtime analysis of our algorithms change but the overall asymptotic runtime bound remains the same.
Yet another option would be to assign each tuple a probability distribution over multiplicities. It seems very unlikely that our results would extend to a model that allows arbitrary probability distributions over multiplicities (our current proof techniques definitely break down). However, we would like to note that the special case of a Poisson binomial distribution (sum of independent but not necessarily identical Bernoulli trials) over multiplicities can be handled as follows: we add an additional identifier attribute to each relation in the database. For a tuple $\tup$ with maximal multiplicity $m_\tup$, we create $m_\tup$ copies of $\tup$ with different identifiers. To answer a query over this encoding, we first project away the identifier attribute (note that as per \Cref{fig:nxDBSemantics}, in $\poly$ this would add up all the variables corresponding to the same tuple $\tup$).
\subsection{\abbrBIDB{}s}
\label{sec:abbrbidbs}
The approach described above works for \abbrBIDB\xplural as well if we define the bag version of \abbrBIDB{}s to associate each tuple $\tup$ with a multiplicity $m_\tup$. Recall that we associate each tuple in a block with a unique variable. Thus, the modified lineage polynomial construction shown above can be applied for \abbrBIDB{}s too (and our approximation results also hold).
%%% Local Variables:
%%% mode: latex
%%% TeX-master: "main"
%%% End:

View File

@ -0,0 +1,311 @@
%!TEX root=./main.tex
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\input{app_set-to-bag-pdb}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Missing details from Section~\ref{sec:background}}\label{sec:proofs-background}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{$\semK$-relations and \abbrNXPDB\xplural}\label{subsec:supp-mat-background}\label{subsec:supp-mat-krelations}
\input{app_k-relations}
\input{app_notation-background}
\section{Missing details from Section~\ref{sec:hard}}
\label{app:single-mult-p}
\input{app_hardness-results}
\section{Missing Details from Section~\ref{sec:algo}}\label{sec:proofs-approx-alg}
\input{app_approx-algo-defs-and-examples}
\input{app_approx-alg-analysis}
\input{app_onepass-analysis}
\input{app_samp-monom-analysis}
\subsection{Experimental Results}\label{app:subsec:experiment}
\input{experiments}
\section{Circuits}\label{app:sec-cicuits}
\subsection{Representing Polynomials with Circuits}\label{app:subsec-rep-poly-lin-circ}
\subsubsection{Circuits for query plans}
\label{sec:circuits-formal}
We now formalize circuits and the construction of circuits for $\raPlus$ queries.
As mentioned earlier, we represent lineage polynomials as arithmetic circuits over $\mathbb N$-valued variables with $+$, $\times$.
A circuit for query $Q$ and \abbrNXPDB $\pxdb$ is a directed acyclic graph $\tuple{V_{Q,\pxdb}, E_{Q,\pxdb}, \phi_{Q,\pxdb}, \ell_{Q,\pxdb}}$ with vertices $V_{Q,\pxdb}$ and directed edges $E_{Q,\pxdb} \subset {V_{Q,\pxdb}}^2$.
The sink function $\phi_{Q,\pxdb} : \udom^n \rightarrow V_{Q,\pxdb}$ is a partial function that maps the tuples of the $n$-ary relation $Q(\pxdb)$ to vertices.
We require that $\phi_{Q,\pxdb}$'s range be limited to sink vertices (i.e., vertices with out-degree 0).
A function $\ell_{Q,\pxdb} : V_{Q,\pxdb} \rightarrow \{\;+,\times\;\}\cup \mathbb N \cup \vct X$ assigns a label to each node: Source nodes (i.e., vertices with in-degree 0) are labeled with constants or variables (i.e., $\mathbb N \cup \vct X$), while the remaining nodes are labeled with the symbol $+$ or $\times$.
We require that vertices have an in-degree of at most two.
Note that we can construct circuits for \bis in time linear in the time required for deterministic query processing over a possible world of the \bi under the aforementioned assumption that $\abs{\pxdb} \leq c \cdot \abs{\db}$.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Modeling Circuit Construction}
\newcommand{\bagdbof}{\textsc{bag}(\pxdb)}
We now connect the size of a circuit (where the size of a circuit is the number of vertices in the corresponding DAG)
for a given $\raPlus$ query $Q$ and \abbrNXPDB $\pxdb$ to
the runtime $\qruntime{Q,\dbbase}$ of the PDB's \dbbaseName $\dbbase$.
We do this formally by showing that the size of the circuit is asymptotically no worse than the corresponding runtime of a large class of deterministic query processing algorithms.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newcommand{\getpoly}[1]{\textbf{lin}\inparen{#1}}
Each vertex $v \in V_{Q,\pxdb}$ in the arithmetic circuit for
\[\tuple{V_{Q,\pxdb}, E_{Q,\pxdb}, \phi_{Q,\pxdb}, \ell_{Q,\pxdb}}\]
encodes a polynomial, realized as
\[\getpoly{v} = \begin{cases}
\sum_{v' : (v',v) \in E_{Q,\pxdb}} \getpoly{v'} & \textbf{if } \ell(v) = +\\
\prod_{v' : (v',v) \in E_{Q,\pxdb}} \getpoly{v'} & \textbf{if } \ell(v) = \times\\
\ell(v) & \textbf{otherwise}
\end{cases}\]
We define the circuit for a $\raPlus$ query $\query$ recursively by cases as follows. In each case, let $\tuple{V_{Q_i,\pxdb}, E_{Q_i,\pxdb}, \phi_{Q_{i},\pxdb}, \ell_{Q_i,\pxdb}}$ denote the circuit for subquery $Q_i$. We implicitly include in all circuits a global zero node $v_0$ s.t., $\ell_{Q, \pxdb}(v_0) = 0$ for any $Q,\pxdb$.
\begin{algorithm}
\caption{\abbrStepOne$(\query, \dbbase, E, V, \ell)$}
\label{alg:lc}
\begin{algorithmic}[1]
\Require $\query$: query
\Require $\dbbase$: a \dbbaseName
\Require $E, V, \ell$: accumulators for the edge list, vertex list, and vertex label list.
\Ensure $\circuit = \tuple{E, V, \phi, \ell}$: a circuit encoding the lineage of each tuple in $\query(\dbbase)$
\If{$\query$ is $R$} \Comment{\textbf{Case 1}: $\query$ is a relation atom}
\For{$t \in \dbbase.R$}
\State $V \leftarrow V \cup \{v_t\}$; $\ell \leftarrow \ell \cup \{(v_t, R(t))\}$ \Comment{Allocate a fresh node $v_t$}
\State $\phi(t) \gets v_t$
\EndFor
\ElsIf{$\query$ is $\sigma_\theta(\query')$} \Comment{\textbf{Case 2}: $\query$ is a Selection}
\State $\tuple{V, E, \phi', \ell} \gets \abbrStepOne(\query', \dbbase, V, E, \ell)$
\For{$t \in \domain(\phi')$}
\State \textbf{if }$\theta(t)$
\textbf{ then } $\phi(t) \gets \phi'(t)$
\textbf{ else } $\phi(t) \gets v_0$
\EndFor
\ElsIf{$\query$ is $\pi_{\vec{A}}(\query')$} \Comment{\textbf{Case 3}: $\query$ is a Projection}
\State $\tuple{V, E, \phi', \ell} \gets \abbrStepOne(\query', \dbbase, V, E, \ell)$
\For{$t \in \pi_{\vec{A}}(\query'(\dbbase))$}
\State $V \leftarrow V \cup \{v_t\}$; $\ell \leftarrow \ell \cup \{(v_t, +)\}$\Comment{Allocate a fresh node $v_t$}
\State $\phi(t) \leftarrow v_t$
\EndFor
\For{$t \in \query'(\dbbase)$}
\State $E \leftarrow E \cup \{(\phi'(t), \phi(\pi_{\vec{A}}t))\}$
\EndFor
\State Correct nodes with in-degrees $>2$ by appending an equivalent fan-in two tree instead
\ElsIf{$\query$ is $\query_1 \cup \query_2$} \Comment{\textbf{Case 4}: $\query$ is a Bag Union}
\State $\tuple{V, E, \phi_1, \ell} \gets \abbrStepOne(\query_1, \dbbase, V, E, \ell)$
\State $\tuple{V, E, \phi_2, \ell} \gets \abbrStepOne(\query_2, \dbbase, V, E, \ell)$
\State $\phi \gets \phi_1 \cup \phi_2$
\For{$t \in \domain(\phi_1) \cap \domain(\phi_2)$}
\State $V \leftarrow V \cup \{v_t\}$; $\ell \leftarrow \ell \cup \{(v_t, +)\}$ \Comment{Allocate a fresh node $v_t$}
\State $\phi(t) \gets v_t$
\State $E \leftarrow E \cup \{(\phi_1(t), v_t), (\phi_2(t), v_t)\}$
\EndFor
\ElsIf{$\query$ is $\query_1 \bowtie \ldots \bowtie \query_m$} \Comment{\textbf{Case 5}: $\query$ is a $m$-ary Join}
\For{$i \in [m]$}
\State $\tuple{V, E, \phi_i, \ell} \gets \abbrStepOne(\query_i, \dbbase, V, E, \ell)$
\EndFor
\For{$t \in \domain(\phi_1) \bowtie \ldots \bowtie \domain(\phi_m)$}
\State $V \leftarrow V \cup \{v_t\}$; $\ell \leftarrow \ell \cup \{(v_t, \times)\}$ \Comment{Allocate a fresh node $v_t$}
\State $\phi(t) \gets v_t$
\State $E \leftarrow E \cup \comprehension{(\phi_i(\pi_{sch(\query_i(\dbbase))}(t)), v_t)}{i \in [n]}$
\EndFor
\State Correct nodes with in-degrees $>2$ by appending an equivalent fan-in two tree instead
\EndIf
\end{algorithmic}
\end{algorithm}
\Cref{alg:lc} defines how the circuit for a query result is constructed. We quickly review the number of vertices emitted in each case.
\caseheading{Base Relation}
This circuit has $|\dbbase.R|$ vertices.
\caseheading{Selection}
If we assume dead sinks are iteratively garbage collected,
this circuit has at most $|V_{Q_1,\pxdb}|$ vertices.
\caseheading{Projection}
This formulation will produce vertices with an in-degree greater than two, a problem that we correct by replacing every vertex with an in-degree over two by an equivalent fan-in two tree. The resulting structure has at most $|{Q_1}|-1$ new vertices.
The corrected circuit thus has at most $|V_{Q_1,\pxdb}|+|{Q_1}|$ vertices.
\caseheading{Union}
This circuit has $|V_{Q_1,\pxdb}|+|V_{Q_2,\pxdb}|+|{Q_1} \cap {Q_2}|$ vertices.
\caseheading{$k$-ary Join}
As in projection, newly created vertices will have an in-degree of $k$, and a fan-in two tree is required.
There are $|{Q_1} \bowtie \ldots \bowtie {Q_k}|$ such vertices, so the corrected circuit has $|V_{Q_1,\pxdb}|+\ldots+|V_{Q_k,\pxdb}|+(k-1)|{Q_1} \bowtie \ldots \bowtie {Q_k}|$ vertices.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection{Bounding circuit depth}
\label{sec:circuit-depth}
We first show that the depth of the circuit (\depth; \Cref{def:size-depth}) is bounded by the size of the query. Denote by $|\query|$ the number of relational operators in query $\query$, which recall we assume is a constant.
\begin{Proposition}[Circuit depth is bounded]
\label{prop:circuit-depth}
Let $\query$ be a relational query and $\dbbase$ be a \dbbaseName with $n$ tuples. There exists a (lineage) circuit $\circuit^*$ encoding the lineage of all tuples $\tup \in \query(\dbbase)$ for which
$\depth(\circuit^*) \leq O(k|\query|\log(n))$.
\end{Proposition}
\begin{proof}
We show that the bound of \Cref{prop:circuit-depth} holds for the circuit constructed by \Cref{alg:lc}.
First, observe that \Cref{alg:lc} is (recursively) invoked exactly once for every relational operator or base relation in $\query$; it thus suffices to show that a call to \Cref{alg:lc} adds at most $O_k(\log(n))$ to the depth of a circuit produced by any recursive invocation.
Second, observe that modulo the logarithmic fan-in of the projection and join cases, the depth of the output is at most one greater than the depth of any input (or at most 1 in the base case of relation atoms).
For the join case, the number of in-edges can be no greater than the join width, which itself is bounded by $k$. The depth thus increases by at most a constant factor of $\lceil \log(k) \rceil = O_k(1)$.
For the projection case, observe that the fan-in is bounded by $|\query'(\dbbase)|$, which is in turn bounded by $n^k$. The depth increase for any projection node is thus at most $\lceil \log(n^k)\rceil = O(k\log(n))$, as desired.
\qed
\end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection{Circuit size vs. runtime}
\label{sec:circuit-runtime}
\begin{Lemma}\label{lem:circ-model-runtime}
\label{lem:circuits-model-runtime}
Given a \abbrNXPDB $\pxdb$ with \dbbaseName $\dbbase$, and an $\raPlus$ query $Q$, the runtime of $Q$ over $\dbbase$ has the same or greater complexity as the size of the lineage of $Q(\pxdb)$. That is, we have $\abs{V_{Q,\pxdb}} \leq k\qruntime{Q, \dbbase}+1$, where $k\ge 1$ is the maximal degree of any polynomial in $Q(\pxdb)$.
\end{Lemma}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{proof}
We prove by induction that $\abs{V_{Q,\pxdb} \setminus \{v_0\}} \leq k\qruntime{Q, \dbbase}$. For clarity, we implicitly exclude $v_0$ in the proof below.
The base case is a base relation: $Q = R$ and is trivially true since $|V_{R,\pxdb}| = |\dbbase.R|=\qruntime{R, \dbbase}$ (note that here the degree $k=1$).
For the inductive step, we assume that we have circuits for subqueries $Q_1, \ldots, Q_m$ such that $|V_{Q_i,\pxdb}| \leq k_i\qruntime{Q_i,\dbbase}$ where $k_i$ is the degree of $Q_i$.
\caseheading{Selection}
Assume that $Q = \sigma_\theta(Q_1)$.
The circuit for $Q$ has $|V_{Q,\pxdb}| = |V_{Q_1,\pxdb}|$ vertices, so from the inductive assumption and $\qruntime{Q,\dbbase} = \qruntime{Q_1,\dbbase}$ by definition, we have $|V_{Q,\pxdb}| \leq k \qruntime{Q,\dbbase}$.
\caseheading{Projection}
Assume that $Q = \pi_{\vct A}(Q_1)$.
The circuit for $Q$ has at most $|V_{Q_1,\pxdb}|+|{Q_1}|$ vertices.
\begin{align*}
|V_{Q,\pxdb}| & \leq |V_{Q_1,\pxdb}| + |Q_1|\\
\intertext{(From the inductive assumption)}
& \leq k\qruntime{Q_1,\dbbase} + \abs{Q_1}\\
\intertext{(By definition of $\qruntime{Q,\dbbase}$)}
& \le k\qruntime{Q,\dbbase}.
\end{align*}
\caseheading{Union}
Assume that $Q = Q_1 \cup Q_2$.
The circuit for $Q$ has $|V_{Q_1,\pxdb}|+|V_{Q_2,\pxdb}|+|{Q_1} \cap {Q_2}|$ vertices.
\begin{align*}
|V_{Q,\pxdb}| & \leq |V_{Q_1,\pxdb}|+|V_{Q_2,\pxdb}|+|{Q_1}|+|{Q_2}|\\
\intertext{(From the inductive assumption)}
& \leq k(\qruntime{Q_1,\dbbase} + \qruntime{Q_2,\dbbase}) + (|Q_1| + |Q_2|)\\
\intertext{(By definition of $\qruntime{Q,\dbbase}$)}
& \leq k(\qruntime{Q,\dbbase}).
\end{align*}
\caseheading{$m$-ary Join}
Assume that $Q = Q_1 \bowtie \ldots \bowtie Q_m$. Note that $k=\sum_{i=1}^m k_i\ge m$.
The circuit for $Q$ has $|V_{Q_1,\pxdb}|+\ldots+|V_{Q_m,\pxdb}|+(m-1)|{Q_1} \bowtie \ldots \bowtie {Q_m}|$ vertices.
\begin{align*}
|V_{Q,\pxdb}| & = |V_{Q_1,\pxdb}|+\ldots+|V_{Q_m,\pxdb}|+(m-1)|{Q_1} \bowtie \ldots \bowtie {Q_m}|\\
\intertext{From the inductive assumption and noting $\forall i: k_i \leq k$ and $m\le k$}
& \leq k\qruntime{Q_1,\dbbase}+\ldots+k\qruntime{Q_m,\dbbase}+\\
&\;\;\; (m-1)|{Q_1} \bowtie \ldots \bowtie {Q_m}|\\
& \leq k(\qruntime{Q_1,\dbbase}+\ldots+\qruntime{Q_m,\dbbase}+\\
&\;\;\;|{Q_1} \bowtie \ldots \bowtie {Q_m}|)\\
\intertext{(By definition of $\qruntime{Q,\dbbase}$ and assumption on $\jointime{\cdot}$)}
& \le k\qruntime{Q,\dbbase}.
\end{align*}
The property holds for all recursive queries, completing the proof.
\qed
\end{proof}
\subsubsection{Runtime of \abbrStepOne}
\label{sec:lc-runtime}
We next need to show that we can construct the circuit in time linear in the deterministic runtime.
\begin{Lemma}\label{lem:tlc-is-the-same-as-det}
Given a query $\query$ over a \dbbaseName $\dbbase$ and the $\circuit^*$ output by \Cref{alg:lc}, the runtime $\timeOf{\abbrStepOne}(\query,\dbbase,\circuit^*) \le O(\qruntime{\query, \dbbase})$.
\end{Lemma}
\begin{proof}
By analysis of \Cref{alg:lc}, invoked as $\circuit^*\gets\abbrStepOne(\query, \dbbase, \{v_0\}, \emptyset, \{(v_0, 0)\})$.
We assume that the vertex list $V$, edge list $E$, and vertex label list $\ell$ are mutable accumulators with $O(1)$ amortized append.
We assume that the tuple to sink mapping $\phi$ is a linked hashmap, with $O(1)$ insertions and retrievals, and $O(n)$ iteration over the domain of keys.
We assume that the n-ary join $\domain(\phi_1) \bowtie \ldots \bowtie\domain(\phi_n)$ can be computed in time $\jointime{\domain(\phi_1), \ldots, \domain(\phi_n)}$ (\Cref{def:join-cost}) and that an intersection $\domain(\phi_1) \cap \domain(\phi_2)$ can be computed in time $O(|\domain(\phi_1)| + |\domain(\phi_2)|)$ (e.g., with a hash table).
Before proving our runtime bound, we first observe that $\qruntime{\query, \db} \geq \Omega(|\query(\db)|)$.
This is true by construction for the relation, projection, and union cases, by \Cref{def:join-cost} for joins, and by the observation that $|\sigma(R)| \leq |R|$.
We show that $\qruntime{\query, \dbbase}$ is an upper-bound for the runtime of \Cref{alg:lc} by recursion.
The base case of a relation atom requires only an $O(|\dbbase.R|)$ iteration over the source tuples.
For the remaining cases, we make the recursive assumption that for every subquery $\query'$, it holds that $O(\qruntime{\query', \dbbase})$ bounds the runtime of \Cref{alg:lc}.
\caseheading{Selection}
Selection requires a recursive call to \Cref{alg:lc}, which by the recursive assumption is bounded by $O(\qruntime{\query', \dbbase})$.
\Cref{alg:lc} requires a loop over every element of $\query'(\dbbase)$.
By the observation above that $\qruntime{\query, \db} \geq \Omega(|\query(\db)|)$, this iteration is also bounded by $O(\qruntime{\query', \dbbase})$.
\caseheading{Projection}
Projection requires a recursive call to \Cref{alg:lc}, which by the recursive assumption is bounded by $O(\qruntime{\query', \dbbase})$, which in turn is a term in $\qruntime{\pi_{\vec{A}}\query', \dbbase}$.
What remains is an iteration over $\pi_{\vec A}(\query(\dbbase))$ (lines 13--16), an iteration over $\query'(\dbbase)$ (lines 17--19), and the construction of a fan-in tree (line 20).
The first iteration is $O(|\query(\dbbase)|) \leq O(\qruntime{\query, \dbbase})$.
The second iteration and the construction of the bounded fan-in tree are both $O(|\query'(\dbbase)|) \leq O(\qruntime{\query', \dbbase}) \leq O(\qruntime{\query, \dbbase}) $, by the observation above that $\qruntime{\query, \db} \geq \Omega(|\query(\db)|)$.
\caseheading{Bag Union}
As above, the recursive calls explicitly correspond to terms in the expansion of $\qruntime{\query_1 \cup \query_2, \dbbase}$.
Initializing $\phi$ (line 24) can be accomplished in $O(\domain(\phi_1) + \domain(\phi_2)) = O(|\query_1(\dbbase)| + |\query_2(\dbbase)|) \leq O(\qruntime{\query_1, \dbbase} + \qruntime{\query_2, \dbbase})$.
The remainder requires computing $\query_1 \cup \query_2$ (line 25) and iterating over it (lines 25--29), which is $O(|\query_1| + |\query_2|)$ as noted above --- this directly corresponds to terms in $\qruntime{\query_1 \cup \query_2, \dbbase}$.
\caseheading{$m$-ary Join}
As in the prior cases, recursive calls explicitly correspond to terms in our target runtime.
The remaining logic involves (i) computing $\domain(\phi_1) \bowtie \ldots \bowtie \domain(\phi_m)$, (ii) iterating over the results, and (iii) creating a fan-in tree.
Respectively, these are: \\
~(i)~$\jointime{\domain(\phi_1), \ldots, \domain(\phi_m)}$\\
~(ii)~$O(|\query_1(\dbbase) \bowtie \ldots \bowtie \query_m(\dbbase)|) \leq O(\jointime{\domain(\phi_1), \ldots, \domain(\phi_m)})$ (\Cref{def:join-cost})\\
~(iii)~$O(m|\query_1(\dbbase) \bowtie \ldots \bowtie \query_m(\dbbase)|)$ (as (ii), noting that $m \leq k = O(1)$)
\qed
\end{proof}
\section{Higher Moments}
\label{sec:momemts}
%
We make a simple observation to conclude the presentation of our results.
So far we have only focused on the expectation of $\poly$.
In addition, we could e.g.\ prove bounds on the probability of a tuple's multiplicity being at least $1$.
Progress can be made on this as follows:
For any positive integer $m$ we can compute the $m$-th moment of the multiplicities, allowing us to e.g.\ use the Chebyshev inequality or other higher-moment based probability bounds on the events we might be interested in.
We leave further investigations for future work.
\section{The Karp-Luby Estimator}
\label{sec:karp-luby}
%
Computing the marginal probability of a tuple in the output of a set-probabilistic database query has been studied extensively.
To the best of our knowledge, the current state of the art approximation algorithm for this problem is the Karp-Luby estimator~\cite{DBLP:journals/jal/KarpLM89}, which first appeared in MayBMS/Sprout~\cite{DBLP:conf/icde/OlteanuHK10}, and more recently as part of an online ``anytime'' approximation algorithm~\cite{FH13,heuvel-19-anappdsd}.
The estimator works by observing that for any $\ell$ random binary (but not necessarily independent) events $\vct{W}_1, \ldots, \vct{W}_\ell$, the probability of at least one event occurring (i.e., $\probOf\inparen{\vct{W}_1 \vee \ldots \vee\vct{W}_\ell}$) is bounded from above by the sum of the independent event probabilities (i.e., $\probOf\inparen{\vct{W}_1 \vee \ldots \vee \vct{W}_\ell} \leq \probOf\inparen{\vct{W}_1} + \ldots + \probOf\inparen{\vct{W}_\ell}$).
Starting from this (`easily' computable and large) value, the estimator proceeds to correct the estimate by estimating how much of an over-estimate it is.
Specifically, if $\mathcal P$ is the joint distribution over $\vct{W}$, the estimator computes an approximation of:
$$\mathcal O = \underset{\vct{W} \sim \mathcal P}{\expct}\Big[
\left|\comprehension{i}{\vct{W}_i = 1, i \in [\ell]}\right|
\Big].$$
The accuracy of this estimate is improved by conditioning $\mathcal P$ on a $W_i$ chosen uniformly at random (which ensures that the sampled count will be at least 1) and correcting the resulting estimate by $\probOf\inparen{W_i}$. With an estimate of $\mathcal O$, it can easily be verified that the probability of the disjunction can be computed as:
$$\probOf\inparen{\vct{W}_1 \vee \ldots \vee\vct{W}_\ell} = \probOf\inparen{\vct{W}_1} + \ldots + \probOf\inparen{\vct{W}_\ell} - \mathcal O$$
The Karp-Luby estimator is employed on the \abbrSMB representation\footnote{Note that since we are in the set semantics, in the lineage polynomial/formula, addition is logical OR and multiplication is logical AND.} of $\circuit$ (to solve the set-PDB version of \Cref{prob:intro-stmt}), where each $W_i$ represents the event that one monomial is true.
By simple inspection, if there are $\ell$ monomials, this estimator has runtime $\Omega(\ell)$. Further, a minimum of $\left\lceil\frac{3\cdot \ell\cdot \log(\frac{2}{\delta})}{\epsilon^2}\right\rceil$ invocations of the estimator are required to achieve $1\pm\epsilon$ approximation with probability at least $1-\delta$~\cite{DBLP:conf/icde/OlteanuHK10}, entailing a runtime at least quadratic in $\ell$.
As an arbitrary lineage circuit $\circuit$ may encode $\Omega\inparen{|\circuit|^k}$ monomials, the worst case runtime is at least $\Omega\inparen{|\circuit|^{2k}}$ (where $k$ is the `degree' of lineage polynomial encoded by $\circuit$). By contrast note that by the discussion after \Cref{lem:val-ub} we can solve \Cref{prob:intro-stmt} in time $O\inparen{|\circuit|^2}$ for all \abbrBIDB circuits {\em independent} of the degree $k$.
%%% Local Variables:
%%% mode: latex
%%% TeX-master: "main"
%%% End:

View File

@ -0,0 +1,158 @@
%root: main.tex
%!TEX root=./main.tex
\section{$1 \pm \epsilon$ Approximation Algorithm}\label{sec:algo}
In \Cref{sec:hard}, we showed that \Cref{prob:bag-pdb-poly-expected} cannot be solved in $\bigO{\qruntime{\optquery{\query},\tupset,\bound}}$ runtime. In light of this, we desire to produce an approximation algorithm that runs in time $\bigO{\qruntime{\optquery{\query},\tupset,\bound}}$. We do this by showing the result via circuits,
such that our approximation algorithm for this problem runs in $\bigO{\abs{\circuit}}$ for a very broad class of circuits, (thus affirming~\Cref{prob:intro-stmt}); see the discussion after \Cref{lem:val-ub} for more.
The following approximation algorithm applies to bag query semantics over both
\abbrCTIDB lineage polynomials and general \abbrBIDB lineage polynomials in practice, where for the latter we note that a $1$-\abbrTIDB is equivalently a \abbrBIDB (blocks are size $1$). Our experimental results (see~\Cref{app:subsec:experiment}) which use queries from the PDBench benchmark~\cite{pdbench} show a low $\gamma$ (see~\Cref{def:param-gamma}) supporting the notion that our bounds hold for general \abbrBIDB in practice.
Corresponding proofs and pseudocode for all formal statements and algorithms
can be found in \Cref{sec:proofs-approx-alg}.
\subsection{Preliminaries and some more notation}
We now introduce definitions and notation related to circuits and polynomials that we will need to state our upper bound results. First we introduce the expansion $\expansion{\circuit}$ of circuit $\circuit$ which
is used in our auxiliary algorithm \sampmon for sampling monomials when computing the approximation.
\begin{Definition}[$\expansion{\circuit}$]\label{def:expand-circuit}
For a circuit $\circuit$, we define $\expansion{\circuit}$ as a list of tuples $(\monom, \coef)$, where $\monom$ is a set of variables and $\coef \in \domN$.
$\expansion{\circuit}$ has the following recursive definition ($\circ$ is list concatenation).
$\expansion{\circuit} =
\begin{cases}
\expansion{\circuit_\linput} \circ \expansion{\circuit_\rinput} &\textbf{ if }\circuit.\type = \circplus\\
\left\{(\monom_\linput \cup \monom_\rinput, \coef_\linput \cdot \coef_\rinput) ~|~(\monom_\linput, \coef_\linput) \in \expansion{\circuit_\linput}, (\monom_\rinput, \coef_\rinput) \in \expansion{\circuit_\rinput}\right\} &\textbf{ if }\circuit.\type = \circmult\\
\elist{(\emptyset, \circuit.\val)} &\textbf{ if }\circuit.\type = \tnum\\
\elist{(\{\circuit.\val\}, 1)} &\textbf{ if }\circuit.\type = \var.\\
\end{cases}
$
\end{Definition}
Later on, we will denote the monomial composed of the variables in $\monom$ as $\encMon$. As an example of $\expansion{\circuit}$, consider $\circuit$ illustrated in \Cref{fig:circuit}. $\expansion{\circuit}$ is then $[(X, 2), (XY, -1), (XY, 4), (Y, -2)]$. This helps us redefine $\rpoly$ (see \Cref{eq:tilde-Q-bi}) in a way that makes our algorithm more transparent.
\begin{Definition}[$\abs{\circuit}$]\label{def:positive-circuit}
For any circuit $\circuit$, the corresponding
{\em positive circuit}, denoted $\abs{\circuit}$, is obtained from $\circuit$ as follows. For each leaf node $\ell$ of $\circuit$ where $\ell.\type$ is $\tnum$, update $\ell.\vari{value}$ to $|\ell.\vari{value}|$.
\end{Definition}
We will overload notation and use $\abs{\circuit}\inparen{\vct{X}}$ to mean $\polyf\inparen{\abs{\circuit}}$.
Conveniently, $\abs{\circuit}\inparen{1,\ldots,1}$ gives us $\sum\limits_{\inparen{\monom, \coef} \in \expansion{\circuit}}\abs{\coef}$.
\begin{Definition}[\size($\cdot$), \depth$\inparen{\cdot}$]\label{def:size-depth}
The functions \size and \depth output the number of gates and levels respectively for input \circuit.
\end{Definition}
\begin{Definition}[$\degree(\cdot)$]\label{def:degree}\footnote{Note that the degree of $\polyf(\abs{\circuit})$ is always upper bounded by $\degree(\circuit)$ and the latter can be strictly larger (e.g. consider the case when $\circuit$ multiplies two copies of the constant $1$---here we have $\deg(\circuit)=1$ but degree of $\polyf(\abs{\circuit})$ is $0$).}
$\degree(\circuit)$ is defined recursively as follows:
\[\degree(\circuit)=
\begin{cases}
\max(\degree(\circuit_\linput),\degree(\circuit_\rinput)) & \text{ if }\circuit.\type=+\\
\degree(\circuit_\linput) + \degree(\circuit_\rinput)+1 &\text{ if }\circuit.\type=\times\\
1 & \text{ if }\circuit.\type = \var\\
0 & \text{otherwise}.
\end{cases}
\]
\end{Definition}
Next, we use the following notation for the complexity of multiplying integers:
\begin{Definition}[$\multc{\cdot}{\cdot}$]\footnote{We note that when doing arithmetic operations on the RAM model for input of size $N$, we have that $\multc{O(\log{N})}{O(\log{N})}=O(1)$. More generally we have $\multc{N}{O(\log{N})}=O(N\log{N}\log\log{N})$.}
In a RAM model of word size of $W$-bits, $\multc{M}{W}$ denotes the complexity of multiplying two integers represented with $M$-bits. (We will assume that for input of size $N$, $W=O(\log{N})$.)
\end{Definition}
Finally, to get linear runtime results, we will need to define another parameter modeling the (weighted) number of monomials in $\expansion{\circuit}$
that need to be `canceled' when monomials with dependent variables are removed (\Cref{subsec:one-bidb}).
Let $\isInd{\cdot}$ be a boolean function returning true if monomial $\encMon$ is composed of independent variables and false otherwise; further, let $\indicator{\theta}$ also be a boolean function returning true if $\theta$ evaluates to true.
\begin{Definition}[Parameter $\gamma$]\label{def:param-gamma}
Given a \abbrOneBIDB circuit $\circuit$ define
\[\gamma(\circuit)=\frac{\sum_{(\monom, \coef)\in \expansion{\circuit}} \abs{\coef}\cdot \indicator{\neg\isInd{\encMon}} }
{\abs{\circuit}(1,\ldots, 1)}.\]
\end{Definition}
\subsection{Our main result}\label{sec:algo:sub:main-result}
We solve~\Cref{prob:intro-stmt} for any fixed $\epsilon > 0$ in what follows.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\mypar{Algorithm Idea}
Our approximation algorithm (\approxq pseudo code in \Cref{sec:proof-lem-approx-alg})
is based on the following observation.
Given a lineage polynomial $\poly(\vct{X})=\polyf(\circuit)$ for circuit \circuit over
\abbrOneBIDB (recall that all \abbrCTIDB can be reduced to \abbrOneBIDB by~\Cref{prop:ctidb-reduct}), we have:
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{equation}
\label{eq:tilde-Q-bi}
\rpoly\inparen{p_1,\dots,p_\numvar}=\hspace*{-1mm}\sum_{(\monom,\coef)\in \expansion{\circuit}}
\indicator{\isInd{\encMon}
}\cdot \coef\cdot\hspace*{-2mm}\prod_{X_i\in \monom}\hspace*{-2mm} p_i.
\end{equation}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Given the above, the algorithm is a sampling based algorithm for the above sum: we sample (via \sampmon) $(\monom,\coef)\in \expansion{\circuit}$ with probability proportional
to $\abs{\coef}$ and compute $\vari{Y}=\indicator{\isInd{\encMon}}
\cdot \prod_{X_i\in \monom} p_i$.
Repeating the sampling an appropriate number of times
and computing the average of $\vari{Y}$ gives us our final estimate. \onepass is used to compute the sampling probabilities needed in \sampmon (details are in \Cref{sec:proofs-approx-alg}).
%%%%%%%%%%%%%%%%%%%%%%%
\mypar{Runtime analysis} We can argue the following runtime for the algorithm outlined above:
\begin{Theorem}
\label{cor:approx-algo-const-p}
Let \circuit be an arbitrary \emph{\abbrOneBIDB} circuit, define $\poly(\vct{X})=\polyf(\circuit)$, let $k=\degree(\circuit)$, and let $\gamma=\gamma(\circuit)$. Further let it be the case that $\prob_i\ge \prob_0$ for all $i\in[\numvar]$. Then an estimate $\mathcal{E}$ of $\rpoly(\prob_1,\ldots, \prob_\numvar)$
satisfying
\begin{equation}
\label{eq:approx-algo-bound-main}
\probOf\left(\left|\mathcal{E} - \rpoly(\prob_1,\dots,\prob_\numvar)\right|> \error' \cdot \rpoly(\prob_1,\dots,\prob_\numvar)\right) \leq \conf
\end{equation}
can be computed in time
\begin{equation}
\label{eq:approx-algo-runtime}
O\left(\left(\size(\circuit) + \frac{\log{\frac{1}{\conf}}\cdot k\cdot \log{k} \cdot \depth(\circuit))}{\inparen{\error'}^2\cdot(1-\gamma)^2\cdot \prob_0^{2k}}\right)\cdot\multc{\log\left(\abs{\circuit}(1,\ldots, 1)\right)}{\log\left(\size(\circuit)\right)}\right).
\end{equation}
In particular, if $\prob_0>0$ and $\gamma<1$ are absolute constants then the above runtime simplifies to $O_k\left(\left(\frac 1{\inparen{\error'}^2}\cdot\size(\circuit)\cdot \log{\frac{1}{\conf}}\right)\cdot\multc{\log\left(\abs{\circuit}(1,\ldots, 1)\right)}{\log\left(\size(\circuit)\right)}\right)$.
\end{Theorem}
The restriction on $\gamma$ is satisfied by any
$1$-\abbrTIDB (where $\gamma=0$ in the equivalent $1$-\abbrBIDB of~\Cref{prop:ctidb-reduct})
as well as for all three queries of the PDBench \abbrBIDB benchmark (see \Cref{app:subsec:experiment} for experimental results). Further, we can also argue the following result, recalling from~\Cref{sec:intro} for \abbrCTIDB $\pdb = \inparen{\worlds, \bpd}$, where $\tupset$ is the set of possible tuples across all possible worlds of $\pdb$.
\begin{Lemma}
\label{lem:ctidb-gamma}
Given $\raPlus$ query $\query$ and \abbrCTIDB $\pdb$, let \circuit be the circuit computed by $\query\inparen{\tupset}$. Then, for the reduced \abbrOneBIDB $\pdb'$ there exists an equivalent circuit \circuit' obtained from $\query\inparen{\tupset'}$, such that $\gamma\inparen{\circuit'}\leq 1 - \bound^{-\inparen{k-1}}$ with $\size\inparen{\circuit'} \leq \size\inparen{\circuit} + \bigO{\numvar\bound}$
and $\depth\inparen{\circuit'} = \depth\inparen{\circuit} + \bigO{\log{\bound}}$.
\end{Lemma}
We briefly connect the runtime in \Cref{eq:approx-algo-runtime} to the algorithm outlined earlier (where we ignore the dependence on $\multc{\cdot}{\cdot}$, which is needed to handle the cost of arithmetic operations over integers). The $\size(\circuit)$ comes from the time taken to run \onepass once (\onepass essentially computes $\abs{\circuit}(1,\ldots, 1)$ using the natural circuit evaluation algorithm on $\circuit$). We make $\frac{\log{\frac{1}{\conf}}}{\inparen{\error'}^2\cdot(1-\gamma)^2\cdot \prob_0^{2k}}$ many calls to \sampmon (each of which essentially traces $O(k)$ random sink to source paths in $\circuit$ all of which by definition have length at most $\depth(\circuit)$).
Finally, we address the $\multc{\log\left(\abs{\circuit}(1,\ldots, 1)\right)}{\log\left(\size(\circuit)\right)}$ term in the runtime.
\begin{Lemma}
\label{lem:val-ub}
For any \emph{\abbrOneBIDB} circuit $\circuit$ with $\degree(\circuit)=k$, we have
$\abs{\circuit}(1,\ldots, 1)\le 2^{2^k\cdot \depth(\circuit)}.$
Further, if $\circuit$ is a tree, then we have $\abs{\circuit}(1,\ldots, 1)\le \size(\circuit)^{O(k)}.$
\end{Lemma}
Note that the above implies that with the assumption $\prob_0>0$ and $\gamma<1$ are absolute constants from \Cref{cor:approx-algo-const-p}, then the runtime there simplifies to $O_k\left(\frac 1{\inparen{\error'}^2}\cdot\size(\circuit)^2\cdot \log{\frac{1}{\conf}}\right)$ for general circuits $\circuit$. If $\circuit$ is a tree, then the runtime simplifies to $O_k\left(\frac 1{\inparen{\error'}^2}\cdot\size(\circuit)\cdot \log{\frac{1}{\conf}}\right)$, which then answers \Cref{prob:intro-stmt} with yes for such circuits.
Finally, note that by \Cref{prop:circuit-depth} and \Cref{lem:circ-model-runtime} for any $\raPlus$ query $\query$, there exists a circuit $\circuit^*$ for $\apolyqdt$ such that $\depth(\circuit^*)\le O_{|Q|}(\log{n})$ and $\size(\circuit)\le O_k\inparen{\qruntime{\query, \tupset, \bound}}$. Using this along with \Cref{lem:val-ub}, \Cref{cor:approx-algo-const-p} and the fact that $n\le \qruntime{\query, \tupset, \bound}$, we have the following corollary:
\begin{Corollary}
\label{cor:approx-algo-punchline}
Let $\query$ be an $\raPlus$ query and $\pdb$ be a \emph{\abbrOneBIDB} with $p_0>0$ and $\gamma<1$ (where $p_0,\gamma$ as in \Cref{cor:approx-algo-const-p}) are absolute constants. Let $\poly(\vct{X})=\apolyqdt$ for any result tuple $\tup$ with $\deg(\poly)=k$. Then one can compute an approximation satisfying \Cref{eq:approx-algo-bound-main} in time $O_{k,|Q|,\error',\conf}\inparen{\qruntime{\optquery{\query}, \tupset, \bound}}$ (given $\query,\tupset$ and $p_i$ for each $i\in [n]$ that defines $\pd$).
\end{Corollary}
Next, we note that the above result along with \Cref{lem:ctidb-gamma}
answers \Cref{prob:big-o-joint-steps} in the affirmative as follows:
\begin{Corollary}
\label{cor:approx-algo-punchline-ctidb}
Let $\query$ be an $\raPlus$ query and $\pdb$ be a \abbrCTIDB with $p_0>0$ (where $p_0$ as in \Cref{cor:approx-algo-const-p}) is an absolute constant. Let $\poly(\vct{X})=\apolyqdt$ for any result tuple $\tup$ with $\deg(\poly)=k$. Then one can compute an approximation satisfying \Cref{eq:approx-algo-bound-main} in time $O_{k,|Q|,\error',\conf,\bound}\inparen{\qruntime{\optquery{\query}, \tupset, \bound}}$ (given $\query,\tupset$ and $\prob_{\tup, j}$ for each $\tup\in\tupset,~j\in\pbox{\bound}$ that defines $\bpd$).
\end{Corollary}
\begin{proof}[Proof of~\Cref{cor:approx-algo-punchline-ctidb}]
The proof follows by~\Cref{lem:ctidb-gamma} and~\Cref{cor:approx-algo-punchline}.
\qed
\end{proof}
If we want to approximate the expected multiplicities of all $Z=O(n^k)$ result tuples $\tup$ simultaneously, we just need to run the above result with $\conf$ replaced by $\frac \conf Z$. Note this increases the runtime by only a logarithmic factor.
%%% Local Variables:
%%% mode: latex
%%% TeX-master: "main"
%%% End:

142
Sketching Worlds/atri.bib Normal file
View File

@ -0,0 +1,142 @@
@inproceedings{triang-hard,
author = {Tsvi Kopelowitz and
Virginia Vassilevska Williams},
editor = {Artur Czumaj and
Anuj Dawar and
Emanuela Merelli},
title = {Towards Optimal Set-Disjointness and Set-Intersection Data Structures},
booktitle = {47th International Colloquium on Automata, Languages, and Programming,
{ICALP} 2020, July 8-11, 2020, Saarbr{\"{u}}cken, Germany (Virtual
Conference)},
series = {LIPIcs},
volume = {168},
pages = {74:1--74:16},
publisher = {Schloss Dagstuhl - Leibniz-Zentrum f{\"{u}}r Informatik},
year = {2020},
url = {https://doi.org/10.4230/LIPIcs.ICALP.2020.74},
doi = {10.4230/LIPIcs.ICALP.2020.74},
timestamp = {Tue, 30 Jun 2020 17:15:44 +0200},
biburl = {https://dblp.org/rec/conf/icalp/KopelowitzW20.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@book{arith-complexity,
author = {Peter B{\"{u}}rgisser and
Michael Clausen and
Mohammad Amin Shokrollahi},
title = {Algebraic complexity theory},
series = {Grundlehren der mathematischen Wissenschaften},
volume = {315},
publisher = {Springer},
year = {1997},
isbn = {3-540-60582-7},
timestamp = {Thu, 31 Jan 2013 18:02:56 +0100},
biburl = {https://dblp.org/rec/books/daglib/0090316.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{NPRR,
author = {Hung Q. Ngo and
Ely Porat and
Christopher R{\'{e}} and
Atri Rudra},
title = {Worst-case Optimal Join Algorithms},
journal = {J. {ACM}},
volume = {65},
number = {3},
pages = {16:1--16:40},
year = {2018},
url = {https://doi.org/10.1145/3180143},
doi = {10.1145/3180143},
timestamp = {Wed, 21 Nov 2018 12:44:29 +0100},
biburl = {https://dblp.org/rec/journals/jacm/NgoPRR18.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{skew,
author = {Hung Q. Ngo and
Christopher R{\'{e}} and
Atri Rudra},
title = {Skew strikes back: new developments in the theory of join algorithms},
journal = {{SIGMOD} Rec.},
volume = {42},
number = {4},
pages = {5--16},
year = {2013},
url = {https://doi.org/10.1145/2590989.2590991},
doi = {10.1145/2590989.2590991},
timestamp = {Fri, 06 Mar 2020 21:55:55 +0100},
biburl = {https://dblp.org/rec/journals/sigmod/NgoRR13.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{factorized-db,
author = {Dan Olteanu and
Maximilian Schleich},
title = {Factorized Databases},
journal = {{SIGMOD} Rec.},
volume = {45},
number = {2},
pages = {5--16},
year = {2016},
url = {https://doi.org/10.1145/3003665.3003667},
doi = {10.1145/3003665.3003667},
timestamp = {Fri, 06 Mar 2020 21:56:19 +0100},
biburl = {https://dblp.org/rec/journals/sigmod/OlteanuS16.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{ngo-survey,
author = {Hung Q. Ngo},
editor = {Jan Van den Bussche and
Marcelo Arenas},
title = {Worst-Case Optimal Join Algorithms: Techniques, Results, and Open
Problems},
booktitle = {Proceedings of the 37th {ACM} {SIGMOD-SIGACT-SIGAI} Symposium on Principles
of Database Systems, Houston, TX, USA, June 10-15, 2018},
pages = {111--124},
publisher = {{ACM}},
year = {2018},
url = {https://doi.org/10.1145/3196959.3196990},
doi = {10.1145/3196959.3196990},
timestamp = {Wed, 21 Nov 2018 12:44:18 +0100},
biburl = {https://dblp.org/rec/conf/pods/000118.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{ajar,
author = {Manas R. Joglekar and
Rohan Puttagunta and
Christopher R{\'{e}}},
editor = {Tova Milo and
Wang{-}Chiew Tan},
title = {{AJAR:} Aggregations and Joins over Annotated Relations},
booktitle = {Proceedings of the 35th {ACM} {SIGMOD-SIGACT-SIGAI} Symposium on Principles
of Database Systems, {PODS} 2016, San Francisco, CA, USA, June 26
- July 01, 2016},
pages = {91--106},
publisher = {{ACM}},
year = {2016},
url = {https://doi.org/10.1145/2902251.2902293},
doi = {10.1145/2902251.2902293},
timestamp = {Tue, 06 Nov 2018 16:58:02 +0100},
biburl = {https://dblp.org/rec/conf/pods/JoglekarPR16.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{AGM,
author = {Albert Atserias and
Martin Grohe and
D{\'{a}}niel Marx},
title = {Size Bounds and Query Plans for Relational Joins},
journal = {{SIAM} J. Comput.},
volume = {42},
number = {4},
pages = {1737--1767},
year = {2013},
url = {https://doi.org/10.1137/110859440},
doi = {10.1137/110859440},
timestamp = {Thu, 08 Jun 2017 08:59:24 +0200},
biburl = {https://dblp.org/rec/journals/siamcomp/AtseriasGM13.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}

View File

@ -0,0 +1,11 @@
%root main.tex
\section{$\rpoly$ cancellations due to $\bi$ constraint}
\paragraph{Problem Definition}
Since $\bi$ has the constraint that all tuples from the same block are mutually exclusive from one another, it is the case that there exist query polynomials $\poly$ such that $\rpoly$ will cancel out monomials that violate this condition. Let us assume that we have the following $\poly = \poly_1 \cdot \poly_2$, where $\poly_1 = \sum_{i = 1}^\numvar \tup^{1_i}$ and $\poly_2 = \sum_{j = 1}^\numvar \tup^{2_j}$, and $\tup^{a_i}$ is a monomial as defined in \Cref{def:monomial}, i.e., every term in $\tup^{a_i}$ is a single variable factor of the monomial as opposed to allowing product of sums. Note that each $\tup^{a_i}$ has at most a degree of $k$ and that each of its variables are associated with a particular block $\block$. We can assume WLOG that each monomial $\tup^{a_i}$ has at most one variable from each block since any $\tup^{a_i}$ having non-identical variables from the same $\block$ can easily be pruned in a $O(\numvar)$ scan.
\paragraph{High Level Description of Solution}
We claim that we can compute the number of cancellations in $O(\numvar\cdot \log{\numvar})$ time.
Before digging into the details of computing the exact number of cancellations of $\rpoly$, we describe the high-level details of the solution.

View File

@ -0,0 +1,101 @@
%root: main.tex
%!TEX root=./main.tex
\section{Background and Notation}\label{sec:background}
\subsection{Polynomial Definition and Terminology}
Given an index set $S$ over variables $X_\tup$ for $\tup\in S$, a (general) polynomial $\genpoly$ over $\inparen{X_\tup}_{\tup \in S}$ with individual degree $\hideg <\infty$
is formally defined as:
\begin{align}
\label{eq:sop-form}
\genpoly\inparen{\inparen{X_\tup}_{\tup\in S}}=\sum_{\vct{d}\in\{0,\ldots,\hideg\}^{S}} c_{\vct{d}}\cdot \prod_{\tup\in S}X_\tup^{d_\tup}&&\text{ where } c_{\vct{d}}\in \semN.
\end{align}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Definition}[Standard Monomial Basis]\label{def:smb}
The term $\prod_{\tup\in S} X_\tup^{d_\tup}$ in \Cref{eq:sop-form} is a {\em monomial}. A polynomial $\genpoly\inparen{\vct{X}}$ is in standard monomial basis (\abbrSMB) when we keep only the terms with $c_{\vct{d}}\ne 0$ from \Cref{eq:sop-form}.
\end{Definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Unless otherwise noted, we consider all polynomials to be in \abbrSMB representation.
When it is unclear, we use $\smbOf{\genpoly}~\inparen{\smbOf{\poly}}$ to denote the \abbrSMB form of a polynomial (lineage polynomial) $\genpoly~\inparen{\poly}$.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Definition}[Degree]\label{def:degree-of-poly}
The degree of polynomial $\genpoly(\vct{X})$ is the largest $\sum_{\tup\in S}d_\tup
$ for all $\vct{d}\in\inset{0,\ldots,\hideg}^S$
such that $c_{(d_1,\dots,d_n)}\ne 0$.
We denote the degree of $\genpoly$ as $\deg\inparen{\genpoly}$.
\end{Definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
As an example, the degree of the polynomial $X^2+2XY^2+Y^2$ is $3$.
Product terms in lineage arise only from join operations (\Cref{fig:nxDBSemantics}), so intuitively, the degree of a lineage polynomial is analogous to the largest number of joins needed to produce a result tuple.
We call a polynomial $\poly\inparen{\vct{X}}$ a \emph{\abbrCTIDB-lineage polynomial} (
or simply lineage polynomial), if it is clear from context that there exists an $\raPlus$ query $\query$, \abbrCTIDB $\pdb$, and result tuple $\tup$ such that $\poly\inparen{\vct{X}} = \apolyqdt\inparen{\vct{X}}.$
\subsection{\abbrOneBIDB}\label{subsec:one-bidb}
\label{subsec:tidbs-and-bidbs}
\noindent A block independent database \abbrBIDB $\pdb'$ models a set of worlds each of which consists of a subset of the possible tuples $\tupset'$, where $\tupset'$ is partitioned into $\numblock$ blocks $\block_i$ and all $\block_i$ are independent random events. $\pdb'$ further constrains that all $\tup\in\block_i$ for all $i\in\pbox{\numblock}$ of $\tupset'$ be disjoint events. We refer to any monomial that includes $X_\tup X_{\tup'}$ for $\tup\neq\tup'\in\block_i$ as a \emph{cancellation}. We define next a specific construction of \abbrBIDB that is useful for our work.
\begin{Definition}[\abbrOneBIDB]\label{def:one-bidb}
Define a \emph{\abbrOneBIDB} to be the pair $\pdb' = \inparen{\bigtimes_{\tup\in\tupset'}\inset{0, \bound_\tup}, \bpd'},$ where $\tupset'$ is the set of possible tuples such that each $\tup \in \tupset'$ has a multiplicity domain of $\inset{0, \bound_\tup}$, with $\bound_\tup\in\mathbb{N}$. $\tupset'$ is partitioned into $\numblock$ independent blocks $\block_i,$ for $i\in\pbox{\numblock}$, of disjoint tuples. $\bpd'$ is characterized by the vector $\inparen{\prob_\tup}_{\tup\in\tupset'}$ where for every block $\block_i$, $\sum_{\tup \in \block_i}\prob_\tup \leq 1$. Given $W\in\onebidbworlds{\tupset'}$ and for $i\in\pbox{\numblock}$, let $\prob_i(W) = \begin{cases}
1 - \sum_{\tup\in\block_i}\prob_\tup & \text{if }W_\tup = 0\text{ for all }\tup\in\block_i\\
0 & \text{if there exists } \tup,~\tup'\in\block_i, W_\tup, W_{\tup'}\neq 0\\
\prob_\tup & \text{if } W_\tup \ne 0 \text{ for the unique } \tup\in\block_i.\\
\end{cases}$
\noindent$\bpd'$ is the probability distribution across all worlds such that, given $W\in\bigtimes_{\tup \in \tupset'}\inset{0,\bound_\tup}$, $\probOf\pbox{\worldvec = W} = \prod_{i\in\pbox{\numblock}}\prob_{i}(W)$.%
\footnote{
We slightly abuse notation here, denoting a world vector as $W$ rather than $\worldvec$ to distinguish between the random variable and the world instance. When there is no ambiguity, we will denote a world vector as $\worldvec$.}
\end{Definition}
\Cref{fig:lin-poly-bidb} shows the lineage construction of $\poly'\inparen{\vct{X}}$ given $\raPlus$ query $\query$ for arbitrary deterministic $\gentupset'$. Note that the semantics differ from~\Cref{fig:nxDBSemantics} only in the base case.
\begin{Proposition}[\abbrCTIDB reduction]\label{prop:ctidb-reduct}
Given \abbrCTIDB $\pdb = \inparen{\worlds, \bpd}$, let $\pdb' = \inparen{\onebidbworlds{\tupset'}, \bpd'}$ be the \emph{\abbrOneBIDB} obtained in the following manner: for each $\tup\in\tupset$, create block $\block_\tup = \inset{\intup{\tup, j}_{j\in\pbox{\bound}}}$ of disjoint tuples, for all $j\in\pbox{\bound}$.
The probability distribution $\bpd'$ is characterized by the vector $\vct{p} = \inparen{\inparen{\prob_{\tup, j}}_{\tup\in\tupset, j\in\pbox{\bound}}}$.
Then, the distributions $\mathcal{P}$ and $\mathcal{P}'$ are equivalent.
\end{Proposition}
We now define the reduced polynomial $\rpoly'$ of a \abbrOneBIDB.
\begin{figure}[t!]
\centering
\resizebox{\textwidth}{!}{
\begin{minipage}{\textwidth}
\begin{align*}
\poly'\pbox{\project_A\inparen{\query}, \gentupset', \tup_j} =& \sum_{\substack{\tup_{j'},\\\project_{A}\inparen{\tup_{j'}} = \tup_j}}\poly'\pbox{\query, \gentupset', \tup_{j'}} &
\poly'\pbox{\query_1\union\query_2, \gentupset', \tup_j} =& \poly'\pbox{\query_1, \gentupset', \tup_j}+\poly'\pbox{\query_2, \gentupset', \tup_j}\\
\poly'\pbox{\select_\theta\inparen{\query}, \gentupset', \tup_j} =& \begin{cases}\theta = 1&\poly'\pbox{\query, \gentupset', \tup_j}\\\theta = 0& 0\\\end{cases} &
\begin{aligned}
\poly'\pbox{\query_1\join\query_2, \gentupset', \tup_j} = \\~
\end{aligned} &
\begin{aligned}
&\poly'\pbox{\query_1, \gentupset', \project_{attr\inparen{\query_1}}\inparen{\tup_j}}\\ &~~~\cdot\poly'\pbox{\query_2, \gentupset', \project_{attr\inparen{\query_2}}\inparen{\tup_j}}
\end{aligned}\\
&&&\poly'\pbox{\rel,\gentupset', \tup_j} = j\cdot X_{\tup, j}.
\end{align*}\\[-10mm]
\end{minipage}}
\caption{Construction of the lineage (polynomial) for an $\raPlus$ query $\query$ over $\gentupset'$.}
\label{fig:lin-poly-bidb}
\end{figure}
\begin{Definition}[$\rpoly'$]\label{def:reduced-poly-one-bidb}
Given a polynomial $\poly'\inparen{\vct{X}}$ generated from a \abbrOneBIDB, let $\rpoly'\inparen{\vct{X}}$ denote the reduced form of $\poly'\inparen{\vct{X}}$ derived as follows: i) compute $\smbOf{\poly'\inparen{\vct{X}}}$ eliminating all monomials with cross terms $X_{\tup}X_{\tup'}$ for $\tup\neq \tup' \in \block_i$ and ii) reduce all \emph{variable} exponents $e > 1$ to $1$.
\end{Definition}
Then given $\worldvec\in\inset{0,1}^{\tupset'}$ over the reduced \abbrOneBIDB of~\Cref{prop:ctidb-reduct}, the disjoint requirement and the semantics for constructing the lineage polynomial over a \abbrOneBIDB, $\poly'\inparen{\worldvec}$ is of the same structure as the reformulated polynomial $\refpoly{}\inparen{\worldvec}$ of step i) from~\Cref{def:reduced-poly}, which then implies that $\rpoly'$ is the reduced polynomial that results from step ii) of both~\Cref{def:reduced-poly} and~\Cref{def:reduced-poly-one-bidb}, and further that~\Cref{lem:tidb-reduce-poly} immediately follows for \abbrOneBIDB polynomials.
\begin{Lemma}
Given any \emph{\abbrOneBIDB} $\pdb'$, $\raPlus$ query $\query$, and lineage polynomial
$\poly'\inparen{\vct{X}}=\poly'\pbox{\query,\tupset',\tup}\inparen{\vct{X}}$, it holds that $
\expct_{\vct{W} \sim \pdassign'}\pbox{\poly'\inparen{\vct{W}}} = \rpoly'\inparen{\probAllTup}.
$
\end{Lemma}
%%% Local Variables:
%%% mode: latex
%%% TeX-master: "main"
%%% End:

View File

@ -0,0 +1,95 @@
%!TEX root=./main.tex
\subsection{Relationship to Deterministic Query Runtimes}\label{sec:gen}
%We formalize our claim from \Cref{sec:intro} that a linear approximation algorithm for our problem implies that PDB queries (under bag semantics) can be answered (approximately) in the same runtime as deterministic queries under reasonable assumptions.
%Lastly, we generalize our result for expectation to other moments.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%\revision{
%\subsection{Cost Model, Query Plans, and Runtime}
%As in the introduction, we could consider polynomials to be represented as an expression tree.
%However, they do not capture many of the compressed polynomial representations that we can get from query processing algorithms on bags, including the recent work on worst-case optimal join algorithms~\cite{ngo-survey,skew}, factorized databases~\cite{factorized-db}, and FAQ~\cite{DBLP:conf/pods/KhamisNR16}. Intuitively, the main reason is that an expression tree does not allow for `sharing' of intermediate results, which is crucial for these algorithms (and other query processing methods as well).
%}
%
%\label{sec:circuits}
%\mypar{The cost model}
%\label{sec:cost-model}
%So far our analysis of \Cref{prob:intro-stmt} has been in terms of the size of the lineage circuits.
%We now show that this model corresponds to the behavior of a deterministic database by proving that for any \raPlus query $\query$, we can construct a compressed circuit for $\poly$ and \bi $\pdb$ of size and runtime linear in that of a general class of query processing algorithms for the same query $\query$ on $\pdb$'s \dbbaseName $\dbbase$.
% Note that by definition, there exists a linear relationship between input sizes $|\pxdb|$ and $|\dbbase|$ (i.e., $\exists c, \db \in \pxdb$ s.t. $\abs{\pxdb} \leq c \cdot \abs{\db})$).
% \footnote{This is a reasonable assumption because each block of a \bi represents entities with uncertain attributes.
% In practice there is often a limited number of alternatives for each block (e.g., which of five conflicting data sources to trust). Note that all \tis trivially fulfill this condition (i.e., $c = 1$).}
%That is for \bis that fulfill this restriction approximating the expectation of results of SPJU queries is only has a constant factor overhead over deterministic query processing (using one of the algorithms for which we prove the claim).
% with the same complexity as it would take to evaluate the query on a deterministic \emph{bag} database of the same size as the input PDB.
In~\Cref{sec:intro}, we introduced the structure $T_{det}\inparen{\cdot}$ to analyze the runtime complexity of~\Cref{prob:expect-mult}.
To decouple our results from specific join algorithms, we first lower bound the cost of a join.
\begin{Definition}[Join Cost]
\label{def:join-cost}
Denote by $\jointime{R_1, \ldots, R_m}$ the runtime of an algorithm for computing the $m$-ary join $R_1 \bowtie \ldots \bowtie R_m$.
We require only that the algorithm must enumerate its output, i.e., that $\jointime{R_1, \ldots, R_m} \geq |R_1 \bowtie \ldots \bowtie R_m|$. With this definition of $\jointime{\cdot}$, worst-case optimal join algorithms are handled.
\end{Definition}
Worst-case optimal join algorithms~\cite{skew,ngo-survey} and query evaluation via factorized databases~\cite{factorized-db} (as well as work on FAQs~\cite{DBLP:conf/pods/KhamisNR16}) can be modeled as $\raPlus$ queries (though the query size is data dependent).
For these algorithms, $\jointime{R_1, \ldots, R_n}$ is linear in the {\em AGM bound}~\cite{AGM}.
% = |R_1| + \ldots + |R_n| + |R_1(\db) \bowtie \ldots \bowtie R_n(\db)|$.
Our cost model for general query evaluation follows from the join cost:
\noindent\resizebox{1\linewidth}{!}{
\begin{minipage}{1.0\linewidth}
\begin{align*}
\qruntimenoopt{R,\gentupset,\bound} & = |\gentupset.R| &
\qruntimenoopt{\sigma \query, \gentupset,\bound} & = \qruntimenoopt{\query,\gentupset,\bound} &
\qruntimenoopt{\pi \query, \gentupset,\bound} & = \qruntimenoopt{\query,\gentupset,\bound} + \abs{\query(\gentupset)}
\end{align*}\\[-15mm]
\begin{align*}
\qruntimenoopt{\query \cup \query', \gentupset,\bound} & = \qruntimenoopt{\query, \gentupset,\bound} +
\qruntimenoopt{\query', \gentupset,\bound} +
\abs{\query\inparen{\gentupset}}+\abs{\query'\inparen{\gentupset}} \\
\qruntimenoopt{\query_1 \bowtie \ldots \bowtie \query_m, \gentupset,\bound}
& = \qruntimenoopt{\query_1, \gentupset,\bound} + \ldots +
\qruntimenoopt{\query_m,\gentupset,\bound} +
\jointime{\query_1(\gentupset), \ldots, \query_m(\gentupset)}
\end{align*}
\end{minipage}
}\\
Under this model, an $\raPlus$ query $\query$ evaluated over database $\gentupset$ has runtime $O(\qruntimenoopt{\query,\gentupset,\bound})$.
We assume that full table scans are used for every base relation access. We can model index scans by treating an index scan query $\sigma_\theta(R)$ as a base relation.
%Observe that
% () .\footnote{This claim can be verified by e.g. simply looking at the {\em Generic-Join} algorithm in~\cite{skew} and {\em factorize} algorithm in~\cite{factorized-db}.} It can be verified that the above cost model on the corresponding $\raPlus$ join queries correctly captures the runtime of current best known .
\Cref{lem:circ-model-runtime} and \Cref{lem:tlc-is-the-same-as-det} show that for any $\raPlus$ query $\query$ and $\tupset$, there exists a circuit $\circuit^*$ such that $\timeOf{\abbrStepOne}(Q,\tupset,\circuit^*)$ and $|\circuit^*|$ are both $O(\qruntimenoopt{\optquery{\query}, \tupset,\bound})$. Recall we assumed these two bounds when we moved from \Cref{prob:big-o-joint-steps} to \Cref{prob:intro-stmt}. Lastly, we can handle FAQs and factorized databases by allowing for optimization, i.e. $\optquery{\query}$.
%
%We now make a simple observation on the above cost model:
%\begin{proposition}
%\label{prop:queries-need-to-output-tuples}
%The runtime $\qruntimenoopt{Q}$ of any query $Q$ is at least $|Q|$
%\end{proposition}
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
%We are now ready to formally state our claim with respect to \Cref{prob:intro-stmt}:
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%\begin{Corollary}\label{cor:cost-model}
% Given an $\raPlus$ query $\query$ over a \ti $\pdb$ with \dbbaseName $\dbbase$, we can compute a $(1\pm\eps)$-approximation of the expectation for each output tuple in $\query(\pdb)$ with probability at least $1-\delta$ in time
%
% \[
% O_k\left(\frac 1{\eps^2}\cdot\qruntimenoopt{Q,\dbbase}\cdot \log{\frac{1}{\conf}}\cdot \log(n)\right)
% \]
%\end{Corollary}
%Atri: The above is no longer needed
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% Local Variables:
%%% mode: latex
%%% TeX-master: "main"
%%% End:

View File

@ -0,0 +1,14 @@
%!TEX root=./main.tex
\section{Conclusions and Future Work}\label{sec:concl-future-work}
We have studied the problem of calculating the expected multiplicity of a bag-query result tuple,
a problem that has a practical application in probabilistic databases over multisets.
We show that under various parameterized complexity hardness results/conjectures, computing the expected multiplicities exactly is not possible in time linear in the corresponding deterministic query processing time.
We prove that it is possible to approximate the expectation of a lineage polynomial in linear time
in the deterministic query processing over TIDBs and BIDBs (assuming that there are few cancellations).
Interesting directions for future work include development of a dichotomy for bag \abbrPDB\xplural. While we can handle higher moments (this follows fairly easily from our existing results-- see \Cref{sec:momemts}), more general approximations are an interesting area for exploration, including those for more general data models.
%%% Local Variables:
%%% mode: latex
%%% TeX-master: "main"
%%% End:

View File

@ -0,0 +1,26 @@
% root: main.tex
Recall that by definition of $\abbrBIDB$, a query result cannot be derived by a self-join between non-identical tuples belonging to the same block. Note, that by \Cref{cor:approx-algo-const-p}, $\gamma$ must be a constant in order for \Cref{alg:mon-sam} to achieve linear time. We would like to determine experimentally whether queries over $\abbrBIDB$ instances in practice generate a constant number of cancellations or not. Such an experiment would ideally use a database instance with queries both considered to be typical representations of what is seen in practice.
We ran our experiments using Windows 10 WSL Operating System with an Intel Core i7 2.40GHz processor and 16GB RAM. All experiments used the PostgreSQL 13.0 database system.
For the data we used the MayBMS data generator~\cite{pdbench} tool to randomly generate uncertain versions of TPC-H tables. The queries computed over the database instance are $\query_1$, $\query_2$, and $\query_3$ from~\cite{Antova_fastand}, all of which are modified versions of TPC-H queries $\query_3$, $\query_6$, and $\query_7$ where all aggregations have been dropped.
As written, the queries disallow $\abbrBIDB$ cross terms. We first ran all queries, noting the result size for each. Next the queries were rewritten so as not to filter out the cross terms. The comparison of the sizes of both result sets should then suggest in one way or another whether or not there exist many cross terms in practice. As seen, the experimental query results contain little to no cancelling terms. \Cref{fig:experiment-bidb-cancel} shows the result sizes of the queries, where column CF is the result size when all cross terms are filtered out, column CI shows the number of output tuples when the cancelled tuples are included in the result, and the last column is the value of $\gamma$. The experiments show $\gamma$ to be in a range between $[0, 0.1]\%$, indicating that only a negligible or constant (compare the result sizes of $\query_1 < \query_2$ and their respective $\gamma$ values) amount of tuples are cancelled in practice when running queries over a typical \abbrBIDB instance. Interestingly, only one of the three queries had tuples that violated the \abbrBIDB constraint.
To conclude, the results in \Cref{fig:experiment-bidb-cancel} show experimentally that $\gamma$ is negligible in practice for BIDB queries. We also observe that (i) tuple presence is independent across blocks, so the corresponding probabilities (and hence $\prob_0$) are independent of the number of blocks, and (ii) \bis model uncertain attributes, so block size (and hence $\gamma$) is a function of the ``messiness'' of a dataset, rather than its size.
Thus, we expect \Cref{cor:approx-algo-const-p} to hold in general.
\begin{figure}[ht]
\begin{tabular}{ c | c c c}\label{tbl:cancel}
Query & CF & CI & $\gamma$\\
\hline
$\query_1$ & $46,714$ & $46,768$ & $0.1\%$\\
$\query_2$ & $179,917$ & $179,917$ & $0\%$\\
$\query_3$ & $11,535$ & $11,535$ & $0\%$\\
\end{tabular}
\caption{Number of Cancellations for Queries Over $\abbrBIDB$.}
\label{fig:experiment-bidb-cancel}
\end{figure}

View File

@ -0,0 +1,222 @@
%root: main.tex
%!TEX root=./main.tex
\AH{1st Submission ICDT Intro}
\section{Introduction}
\label{sec:intro}
A \emph{probabilistic database} $\pdb = (\idb, \pd)$ is a set of deterministic databases $\idb = \{ \db_1, \ldots, \db_n\}$ called possible worlds, paired with a probability distribution $\pd$ over these worlds.
A well-studied problem in probabilistic databases is to take a query $\query$ and a probabilistic database $\pdb$, and compute the \emph{marginal probability} of a tuple $\tup$ (i.e., its probability of appearing in the result of query $\query$ over $\pdb$).
This problem is \sharpphard for set semantics, even for \emph{tuple-independent probabilistic databases}~\cite{DBLP:series/synthesis/2011Suciu} (TIDBs), which are a subclass of probabilistic databases where tuples are independent events. The dichotomy of Dalvi and Suciu~\cite{10.1145/1265530.1265571} separates the hard cases, from cases that are in \ptime for unions of conjunctive queries (UCQs).
In this work we consider bag semantics, where each tuple is associated with a multiplicity $\db_i(\tup)$ in each possible world $\db_i$ and study the analogous problem of computing the expectation (where $\overline{\db}$ denotes a random variable) of the multiplicity of a query result tuple $\tup$ (denoted $\query(\db)(t)$):
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{equation}\label{eq:intro-bag-expectation}
\expct_{\randDB \sim \pd}[\query(\overline{\db})(t)] = \sum_{\db \in \idb} \query(\db)(t) \cdot \probOf\pbox{\db} \hspace{2cm}\text{\textbf{(Expected Result Multiplicity)}}
\end{equation}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{figure}[t]
\begin{subfigure}[b]{0.49\linewidth}
\centering
{\small
\begin{tabular}{ c | c c c}
$OnTime$ & City$_\ell$ & $\Phi$ & \textbf{p}\\
\hline
& Buffalo & $L_a$ & 0.9 \\
& Chicago & $L_b$ & 0.5\\
& Bremen & $L_c$ & 0.5\\
& Zurich & $L_d$ & 1.0\\
\end{tabular}
}
\caption{Relation $OnTime$}
\label{subfig:ex-shipping-simp-loc}
\end{subfigure}%
\begin{subfigure}[b]{0.49\linewidth}
\centering
{\small
\begin{tabular}{ c | c c c c}
$Route$ & $\text{City}_1$ & $\text{City}_2$ & $\Phi$ & \textbf{p} \\
\hline
& Buffalo & Chicago & $R_a$ & 1.0 \\
& Chicago & Zurich & $R_b$ & 1.0 \\
%& $\cdots$ & $\cdots$ & $\cdots$ & $\cdots$ \\
& Chicago & Bremen & $R_c$ & 1.0 \\
\end{tabular}
}
\caption{Relation $Route$}
\label{subfig:ex-shipping-simp-route}
\end{subfigure}%
% \begin{subfigure}[b]{0.17\linewidth}
% \centering
% \caption{Circuit for $(Chicago)$}
% \label{subfig:ex-proj-push-circ-q3}
% \end{subfigure}
\begin{subfigure}[b]{0.66\linewidth}
\centering
{\small
\begin{tabular}{ c | c c c}
$\query_1$ & City & $\Phi$ & $\expct_{\idb \sim \probDist}[\query(\db)(t)]$ \\ \hline
& Buffalo & $L_a \cdot R_a$ & $0.9$ \\
& Chicago & $L_b \cdot R_b + L_b \cdot R_c$ & $0.5 \cdot 1.0 + 0.5 \cdot 1.0 = 1.0$ \\
%& $\cdots$ & $\cdots$ & $\cdots$ \\
\end{tabular}
}
\caption{$Q_1$'s Result}
\label{subfig:ex-shipping-simp-queries}
\end{subfigure}%
\begin{subfigure}[b]{0.33\linewidth}
\centering
\resizebox{!}{16mm} {
\begin{tikzpicture}[thick]
\node[tree_node] (a2) at (0, 0){$R_b$};
\node[tree_node] (b2) at (1, 0){$L_b$};
\node[tree_node] (c2) at (2, 0){$R_c$};
%level 1
\node[tree_node] (a1) at (0.5, 0.8){$\boldsymbol{\circmult}$};
\node[tree_node] (b1) at (1.5, 0.8){$\boldsymbol{\circmult}$};
%level 0
\node[tree_node] (a0) at (1.0, 1.6){$\boldsymbol{\circplus}$};
%edges
\draw[->] (a2) -- (a1);
\draw[->] (b2) -- (a1);
\draw[->] (b2) -- (b1);
\draw[->] (c2) -- (b1);
\draw[->] (a1) -- (a0);
\draw[->] (b1) -- (a0);
\end{tikzpicture}
}
\resizebox{!}{16mm} {
\begin{tikzpicture}[thick]
\node[tree_node] (a1) at (1, 0){$R_b$};
\node[tree_node] (b1) at (2, 0){$R_c$};
%level 1
\node[tree_node] (a2) at (0.75, 0.8){$L_b$};
\node[tree_node] (b2) at (1.5, 0.8){$\boldsymbol{\circplus}$};
%level 0
\node[tree_node] (a3) at (1.1, 1.6){$\boldsymbol{\circmult}$};
%edges
\draw[->] (a1) -- (b2);
\draw[->] (b1) -- (b2);
\draw[->] (a2) -- (a3);
\draw[->] (b2) -- (a3);
\end{tikzpicture}
}
\caption{Two circuits for $Q_1(Chicago)$}
\label{subfig:ex-proj-push-circ-q4}
\end{subfigure}%
\vspace*{-3mm}
\caption{\ti instance and query results for \Cref{ex:intro-tbls}.}%{$\ti$ relations for $\poly$}
\label{fig:ex-shipping-simp}
\trimfigurespacing
\end{figure}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Example}\label{ex:intro-tbls}
Consider the bag-\ti relations shown in \Cref{fig:ex-shipping-simp}. We define a \ti under bag semantics analogously to the set case: each input tuple is associated with a probability of having a multiplicity of one (and otherwise multiplicity zero), and tuples are independent random events. Ignore column $\Phi$ for now. In this example, we have shipping routes that are certain (probability 1.0) and information about whether shipping at locations is on time (with a certain probability). Query $\query_1$, shown below, returns starting points of shipping routes where shipment processing is on time.
$$Q_1(\text{City}) \dlImp OnTime(\text{City}), Route(\text{City}, \dlDontcare)$$
\Cref{subfig:ex-shipping-simp-queries} shows the possible results of this query.
For example, there is a 90\% probability there is a single route starting in Buffalo that is on time, and the expected multiplicity of this result tuple is $0.9$.
There are two shipping routes starting in Chicago.
Since the Chicago location has a 50\% probability of being on schedule (we assume that delays are linked), the expected multiplicity of this result tuple is $0.5 + 0.5 = 1.0$.
\end{Example}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
A well-known result in probabilistic databases is that under set semantics, the marginal probability of a query result $\tup$ can be computed based on the tuple's lineage. The lineage of a tuple is a Boolean formula (an element of the semiring $\text{PosBool}[\vct{X}]$~\cite{DBLP:conf/pods/GreenKT07} of positive Boolean expressions)
over random variables
($\vct{X}=(X_1,\dots,X_n)$)
that encode the existence of input tuples. Each possible world $\db$ corresponds to an assignment $\{0,1\}^\numvar$ of the variables in $\vct{X}$ to either true (the tuple exists in this world) or false (the tuple does not exist in this world). Importantly, the following holds: if the lineage formula for $t$ evaluates to true under the assignment for a world $\db$, then $\tup \in \query(\db)$.
Thus, the marginal probability of tuple $\tup$ is equal to the probability that its lineage evaluates to true (with respect to the obvious analog of probability distribution $\pd$ defined over $\Omega$ and induced over $\vct{X}$).
For bag semantics, the lineage of a tuple is a polynomial over variables $\vct{X}=(X_1,\dots,X_n)$ with % \in \mathbb{N}^\numvar$ with
coefficients in the set of natural numbers $\mathbb{N}$ (an element of semiring $\mathbb{N}[\vct{X}]$).
Analogously to sets, evaluating the lineage for $t$ over an assignment corresponding to a possible world yields the multiplicity of the result tuple $\tup$ in this world. Thus, instead of using \Cref{eq:intro-bag-expectation} to compute the expected result multiplicity of a tuple $\tup$, we can equivalently compute the expectation of the lineage polynomial of $\tup$, which for this example we denote as $\linsett{\query}{\pdb}{\tup}$ or $\Phi$ if the parameters are clear from the context\footnote{
In later sections, where we focus on a single lineage polynomial, we will simply refer to $\linsett{\query}{\pdb}{\tup}$ as $Q$.
}. In this work, we study the complexity of computing the expectation of such polynomials encoded as arithmetic circuits.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Example}\label{ex:intro-lineage}
Associating a lineage variable with every input tuple as shown in \Cref{fig:ex-shipping-simp}, we can compute the lineage of every result tuple as shown in \Cref{subfig:ex-shipping-simp-queries}. For example, the tuple Chicago is in the result, because $L_b$ joins with both $R_b$ and $R_c$. Its lineage is $\Phi = L_b \cdot R_b + L_b \cdot R_c$. The expected multiplicity of this result tuple is calculated by summing the multiplicity of the result tuple, weighted by its probability, over all possible worlds.
In this example, $\Phi$ is a sum of products (SOP), and so we can use linearity of expectation to solve the problem in linear time (in the size of $\Phi$).
The expectation of the sum is the sum of the expectations of monomials.
The expectation of each monomial is then computed by multiplying the probabilities of the variables (tuples) occurring in the monomial, since we have independence of tuples.
The expected multiplicity for Chicago is $1.0$.
\end{Example}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
The expected multiplicity of a query result can be computed in linear time (in the size of the result's lineage) if the lineage is in SOP form.
However, this need not be true for compressed representations of polynomials, including factorized polynomials or arithmetic circuits.
For instance, \Cref{subfig:ex-proj-push-circ-q4} shows two circuits encoding the lineage of the result tuple $(Chicago)$ from \Cref{ex:intro-lineage}.
The left circuit encodes the lineage as a SOP while the right circuit uses distributivity to push the addition gate below the multiplication, resulting in a smaller circuit.
Given that there is a large body of work (on, e.g., deterministic bag-relational query processing) that can output such compressed representations~\cite{DBLP:conf/pods/KhamisNR16,factorized-db}, %\BG{cite FDBs and FAQ},
an interesting question is whether computing expectations is still in linear time for such compressed representations.
If the answer is in the affirmative, then probabilities over bag-PDBs can be computed with linear overhead (in the size of the compressed representation) using any algorithm that computes compressed lineage polynomials.
% and if lineage formulas can also be computed in linear time (in the lineage size), then bag-relational probabilistic databases can theoretically match the performance of deterministic databases.
Unfortunately, we prove that this is not the case: computing the expected count of a query result tuple is super-linear under standard complexity assumptions (\sharpwonehard) in the size of a lineage circuit.
Concretely, we make the following contributions:
(i) We show that the expected result multiplicity problem (\Cref{def:the-expected-multipl}) for conjunctive queries for bag-$\ti$s is \sharpwonehard in the size of a lineage circuit by reduction from counting the number of $k$-matchings over an arbitrary graph;
(ii) We present a $(1\pm\epsilon)$-\emph{multiplicative} approximation algorithm for bag-$\ti$s and show that for typical database usage patterns (e.g. when the circuit is a tree or is generated by recent worst-case optimal join algorithms or their FAQ followups~\cite{DBLP:conf/pods/KhamisNR16}) its complexity is linear in the size of the compressed lineage encoding; %;\BG{Fix not linear in all cases, restate after 4 is done}
(iii) We generalize the approximation algorithm to bag-$\bi$s, a more general model of probabilistic data;
(iv) We further prove that for \raPlus queries (an equivalently expressive, but factorizable form of UCQs), we can approximate the expected output tuple multiplicities with only $O(\log{Z})$ overhead (where $Z$ is the number of output tuples) over the runtime of a broad class of query processing algorithms. We also observe that our results trivially extend to higher moments of the tuple multiplicity (instead of just the expectation).
%\mypar{Implications of our Results} As mentioned above
\mypar{Overview of our Techniques} All of our results rely on working with a {\em reduced} form of the lineage polynomial $\Phi$. In fact, it turns out that for the TIDB (and BIDB) case, computing the expected multiplicity is {\em exactly} the same as evaluating this reduced polynomial over the probabilities that define the TIDB/BIDB. Next, we motivate this reduced polynomial by continuing \Cref{ex:intro-tbls}.
%Moving forward, we focus exclusively on bags.
Consider the query $Q()\dlImp$$OnTime(\text{City}), Route(\text{City}, \text{City}'),$ $OnTime(\text{City}')$ over the bag relations of \Cref{fig:ex-shipping-simp}. It can be verified that $\Phi$ for $Q$ is $L_aL_b + L_bL_d + L_bL_c$. Now consider the product query $\poly^2()\dlImp Q(), Q()$.
%The factorized representation of $\poly^2$ is (for simplicity we ignore the random variables of $Route$ since each variable has probability of $1$):
%\begin{equation*}
%\poly^2 = \left(L_aL_b + L_bL_d + L_bL_c\right) \cdot \left(L_aL_b + L_bL_d + L_bL_c\right)
%\end{equation*}
%This equivalent SOP representation is
The lineage polynomial for $Q^2$ is given by $\Phi^2$:
\begin{equation*}
\left(L_aL_b + L_bL_d + L_bL_c\right)^2=L_a^2L_b^2 + L_b^2L_d^2 + L_b^2L_c^2 + 2L_aL_b^2L_d + 2L_aL_b^2L_c + 2L_b^2L_dL_c.
\end{equation*}
The expectation $\expct\pbox{\Phi^2}$ then is:
\begin{multline*}
\expct\pbox{L_a^2}\expct\pbox{L_b^2} + \expct\pbox{L_b^2}\expct\pbox{L_d^2} + \expct\pbox{L_b^2}\expct\pbox{L_c^2} + 2\expct\pbox{L_a}\expct\pbox{L_b^2}\expct\pbox{L_d} \\
+ 2\expct\pbox{L_a}\expct\pbox{L_b^2}\expct\pbox{L_c} + 2\expct\pbox{L_b^2}\expct\pbox{L_d}\expct\pbox{L_c}
\end{multline*}
\noindent If the domain of a random variable $W$ is $\{0, 1\}$, then for any $k > 0$, $\expct\pbox{W^k} = \expct\pbox{W}$, which means that $\expct\pbox{\Phi^2}$ simplifies to:
\begin{footnotesize}
\begin{equation*}
\expct\pbox{L_a}\expct\pbox{L_b} + \expct\pbox{L_b}\expct\pbox{L_d} + \expct\pbox{L_b}\expct\pbox{L_c} + 2\expct\pbox{L_a}\expct\pbox{L_b}\expct\pbox{L_d} + 2\expct\pbox{L_a}\expct\pbox{L_b}\expct\pbox{L_c} + 2\expct\pbox{L_b}\expct\pbox{L_d}\expct\pbox{L_c}
\end{equation*}
\end{footnotesize}
\noindent This property leads us to consider a structure related to the lineage polynomial.
\begin{Definition}\label{def:reduced-poly}
For any polynomial $\poly(\vct{X})$, define the \emph{reduced polynomial} $\rpoly(\vct{X})$ to be the polynomial obtained by setting all exponents $e > 1$ in the SOP form of $\poly(\vct{X})$ to $1$.
\end{Definition}
With $\Phi^2$ as an example, we have:
\begin{align*}
\widetilde{\Phi^2}(L_a, L_b, L_c, L_d)
=&\; L_aL_b + L_bL_d + L_bL_c + 2L_aL_bL_d + 2L_aL_bL_c + 2L_bL_cL_d
\end{align*}
It can be verified that the reduced polynomial is a closed form of the expected count (i.e., $\expct\pbox{\Phi^2} = \widetilde{\Phi^2}(\probOf\pbox{L_a=1}, \probOf\pbox{L_b=1}, \probOf\pbox{L_c=1}, \probOf\pbox{L_d=1})$). In fact, we show in \Cref{lem:exp-poly-rpoly} that this equivalence holds for {\em all} UCQs over TIDB/BIDB.
%The reduced form of a lineage polynomial can be obtained but requires a linear scan over the clauses of an SOP encoding of the polynomial. Note that for a compressed representation, this scheme would require an exponential number of computations in the size of the compressed representation. In \Cref{sec:hard}, we use $\rpoly$ to prove our hardness results .
To prove our hardness result we show that for the same $Q$ considered in the running example, the query $Q^k$ is able to encode various hard graph-counting problems. We do so by analyzing how the coefficients in the (univariate) polynomial $\widetilde{\Phi}\left(p,\dots,p\right)$ relate to counts of various sub-graphs on $k$ edges in an arbitrary graph $G$ (which is used to define the relations in $Q$). For the upper bound it is easy to check that if all the probabilities are constant then ${\Phi}\left(\probOf\pbox{X_1=1},\dots, \probOf\pbox{X_n=1}\right)$ (i.e. evaluating the original lineage polynomial over the probability values) is a constant factor approximation. \AH{Why do we say `approximation'? This is a linear {\emph exact} computation.} To get a $(1\pm \epsilon)$-multiplicative approximation we sample monomials from $\Phi$ and `adjust' their contribution to $\widetilde{\Phi}\left(\cdot\right)$.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\mypar{Paper Organization} We present relevant background and notation in \Cref{sec:background}. We then prove our main hardness results in \Cref{sec:hard} and present our approximation algorithm in \Cref{sec:algo}. We present some (easy) generalizations of our results in \Cref{sec:gen} and also discuss extensions from computing expectations of polynomials to the expected result multiplicity problem (\Cref{def:the-expected-multipl}). Finally, we discuss related work in \Cref{sec:related-work} and conclude in \Cref{sec:concl-future-work}.
%%% Local Variables:
%%% mode: latex
%%% TeX-master: "main"
%%% End:

View File

@ -0,0 +1,363 @@
%root: main.tex
%\setenumerate[1]{label = \Roman*.}
% \setenumerate[2]{label=\Alph*.}
% \setenumerate[3]{label=\roman*.}
% \setenumerate[4]{label=\alph*.}
\section{Introduction (Rewrite)}
% %for outline functionality
% \begin{outline}[enumerate]
% \1 Overall Problem
% \2 Hardness of deterministic queries, e.g., counting cliques, multiple joins, etc.
% \2 Assuming $\query$ is easy, how does the probabilistic computation compare in \abbrPDB\xplural?
% \3 Introduce two-step process
% \4 Deterministic Process: compute query, lineage, representation aka circuit
% \4 Probability Computation
% \3 Why the two-step process?
% \4 Semiring provenance nicely follows this model
% \4 Set \abbrPDB\xplural use this process
% \4 Model allows us to separate the deterministic from the probability computation
% \AH{The part below should maybe be moved further down. The order in the current draft is further down.}
% \3 Assuming a bag-\abbrTIDB, when the probability of all tuples $\prob_i = 1$, the problem of computing the expected count is linear
% \3 However, when $\prob_i < 1$, the problem is not linear (in circuit size)
% \3 An approximation algorithm exists to bring the second step back down to linear time
% \3 For set-\abbrPDB, the problem is \sharpphard with respect to exact computation
% \3 For set-\abbrPDB, the problem is quadratic with respect to approximation
% \2 Note that in all cases, Step 2 is at least as hard as Step 1
% \3 Bag-\abbrPDB\xplural are useful for things like a count query
% \3 This work focuses on Step 2 for bag-\abbrPDB\xplural
% \4 Given a lineage polynomial generated by a query, compute the expected multiplicity
% \3 Why focus on tuple expected multiplicity?
% \4 Note that bag-\abbrPDB query output is a probability distribution over counts; this contrasts the marginal probability paradigm of set-\abbrPDB\xplural
% \4 From a theoretical perspective, bag-\abbrPDB\xplural are not well studied
% \4 There are several statistical measures that can be done
% \4 We focus on expected count since it is a natural and simplistic statistic to consider
% \4 Appendix also considers higher moments
% \2 The setting for our problem assumes set-\abbrPDB inputs
% \3 A simple generalization exists
%
% \end{outline}
%\begin{figure}[H]
% \centering
% \includegraphics[width=\textwidth]{twostep}
% \caption{Old inkscape graphic}
% \label{fig:old-inkscape}
%\end{figure}
\usetikzlibrary{shapes.geometric}%for cylinder
\usetikzlibrary{shapes.arrows}%for arrow shape
\usetikzlibrary{shapes.misc}
%rid of vertical spacing for booktabs rules
\renewcommand{\aboverulesep}{0pt}
\renewcommand{\belowrulesep}{0pt}
\begin{figure}[h!]
\centering
\resizebox{\textwidth}{5.5cm}{%
\begin{tikzpicture}
%pdb cylinder
\node[cylinder, text width=0.28\textwidth, align=center, draw=black, text=black, cylinder uses custom fill, cylinder body fill=blue!10, aspect=0.12, minimum height=5cm, minimum width=2.5cm, cylinder end fill=blue!50, shape border rotate=90] (cylinder) at (0, 0) {
\tabcolsep=0.1cm
\begin{tabular}{>{\small}c | >{\small}c | >{\small}c}
\multicolumn{3}{c}{$\boldsymbol{OnTime}$}\\
%\toprule
City$_\ell$ & $\Phi$ & \textbf{p}\\
\midrule
Buffalo & $L_a$ & 0.9 \\
Chicago & $L_b$ & 0.5\\
Bremen & $L_c$ & 0.5\\
Zurich & $L_d$ & 1.0\\
\end{tabular}\\
\tabcolsep=0.05cm
%\captionof{table}{Route}
\begin{tabular}{>{\footnotesize}c | >{\footnotesize}c | >{\footnotesize}c | >{\footnotesize}c}
        \multicolumn{4}{c}{$\boldsymbol{Route}$}\\
%\toprule
$\text{City}_1$ & $\text{City}_2$ & $\Phi$ & \textbf{p} \\
\midrule
Buffalo & Chicago & $R_a$ & 1.0 \\
Chicago & Zurich & $R_b$ & 1.0 \\
%& $\cdots$ & $\cdots$ & $\cdots$ & $\cdots$ \\
Chicago & Bremen & $R_c$ & 1.0 \\
\end{tabular}};
%label below cylinder
\node[below=0.2 cm of cylinder]{{\LARGE$ \pdb$}};
%First arrow
\node[single arrow, right=0.25 of cylinder, draw=black, fill=black!65, text=white, minimum height=0.75cm, minimum width=0.25cm](arrow1) {\textbf{Step 1}};
\node[above=of arrow1](arrow1Label) {$\query$};
\usetikzlibrary{arrows.meta}%for the following arrow configurations
\draw[line width=0.5mm, dashed, arrows = -{Latex[length=3mm, open]}] (arrow1Label)->(arrow1);
%Query output (output of step 1)
\node[rectangle, right=0.175 of arrow1, draw=black, text=black, fill=purple!10, minimum height=4.5cm, minimum width=2cm](rect) {
\tabcolsep=0.075cm
%\captionof{table}{Q}
\begin{tabular}{>{\normalsize}c | >{\normalsize}c | >{\centering\arraybackslash\small}m{1.95cm}}
%\multicolumn{3}{c}{$\boldsymbol{\query(\pdb)}$}\\[1mm]
%\toprule
City & $\Phi$ & Circuit\\% & $\expct_{\idb \sim \probDist}[\query(\db)(t)]$ \\ \hline
\midrule
%\hline
\\\\[-3.5\medskipamount]
Buffalo & $L_a R_a$ &\resizebox{!}{10mm}{
\begin{tikzpicture}[thick]
\node[gen_tree_node](sink) at (0.5, 0.8){$\boldsymbol{\circmult}$};
\node[gen_tree_node](source1) at (0, 0){$L_a$};
\node[gen_tree_node](source2) at (1, 0){$R_a$};
\draw[->](source1)--(sink);
\draw[->] (source2)--(sink);
\end{tikzpicture}% & $0.5 \cdot 1.0 + 0.5 \cdot 1.0 = 1.0$
}\\[5mm]% & $0.9$ \\
Chicago & $L_b(R_b + R_c)$&
\resizebox{!}{16mm} {
\begin{tikzpicture}[thick]
\node[gen_tree_node] (a1) at (1, 0){$R_b$};
\node[gen_tree_node] (b1) at (2, 0){$R_c$};
%level 1
\node[gen_tree_node] (a2) at (0.75, 0.8){$L_b$};
\node[gen_tree_node] (b2) at (1.5, 0.8){$\boldsymbol{\circplus}$};
%level 0
\node[gen_tree_node] (a3) at (1.1, 1.6){$\boldsymbol{\circmult}$};
%edges
\draw[->] (a1) -- (b2);
\draw[->] (b1) -- (b2);
\draw[->] (a2) -- (a3);
\draw[->] (b2) -- (a3);
\end{tikzpicture}
}\newline\text{Or}\newline
%%%%%%%%%%%
%Non factorized circuit%
%%%%%%%%%%%
\resizebox{!}{16mm} {
\begin{tikzpicture}[thick]
\node[gen_tree_node] (a2) at (0, 0){$R_b$};
\node[gen_tree_node] (b2) at (1, 0){$L_b$};
\node[gen_tree_node] (c2) at (2, 0){$R_c$};
%level 1
\node[gen_tree_node] (a1) at (0.5, 0.8){$\boldsymbol{\circmult}$};
\node[gen_tree_node] (b1) at (1.5, 0.8){$\boldsymbol{\circmult}$};
%level 0
\node[gen_tree_node] (a0) at (1.0, 1.6){$\boldsymbol{\circplus}$};
%edges
\draw[->] (a2) -- (a1);
\draw[->] (b2) -- (a1);
\draw[->] (b2) -- (b1);
\draw[->] (c2) -- (b1);
\draw[->] (a1) -- (a0);
\draw[->] (b1) -- (a0);
\end{tikzpicture}
}\\
\end{tabular}
};
%label below rectangle
\node[below=0.2cm of rect]{{\LARGE $\query(\pdb)$}};
%Second arrow
\node[single arrow, right=0.25 of rect, draw=black, fill=black!65, text=white, minimum height=0.75cm, minimum width=0.25cm](arrow2) {\textbf{Step 2}};
%Expectation computation; (output of step 2)
\node[rectangle, right=0.25 of arrow2, rounded corners, draw=black, fill=red!10, text=black, minimum height=4.5cm, minimum width=2cm](rrect) {
\tabcolsep=0.09cm
%\captionof{table}{Q}
\begin{tabular}{>{\small}c | >{\centering\arraybackslash\small}m{1.95cm}}
%\multicolumn{2}{c}{$\expct\pbox{\poly(\vct{X})}$}\\[1mm]
%\toprule
City & $\mathbb{E}[\poly(\vct{X})]$\\
\midrule%[0.05pt]
Buffalo & $1.0 \cdot 0.9 = 0.9$\\[3mm]
Chicago & $(0.5 \cdot 1.0) + $\newline $\hspace{0.2cm}(0.5 \cdot 1.0)$\newline $= 1.0$\\
\end{tabular}
};
%label of rounded rectangle
\node[below=0.2cm of rrect]{{\LARGE $\expct\pbox{\poly(\vct{X})}$}};
\end{tikzpicture}
}
\caption{Two step model of computation}
\label{fig:two-step}
\end{figure}
A bag probabilistic database (\abbrPDB) is a probability distribution over $\numvar$ (not necessarily \emph{unique}) tuples in a deterministic database $\db$. A tuple independent bag probabilistic database (\abbrTIDB) $\pdb$ has the further restriction that each tuple $\tup$ in $\db$ be an independent random event, with all base relation tuples annotated with a unique random variable. Given a query $\query$ from the set of positive relational algebra queries ($\raPlus$) over $\pdb$, the goal is to compute the expected count of each \emph{distinct} output tuple $\tup$ in $\query(\pdb)$, where each $\tup$ of $\query(\pdb)$ is annotated with its respective lineage polynomial, $\poly({\vct{X}})$ such that $\vct{X}$ is the vector of all unique $\numvar$ variables in $\pdb$. Bag \abbrPDB\xplural are a natural fit for evaluating queries involving multiplicity such as count queries.
In general the runtime of $\query$ over a deterministic database is \sharpwonehard, meaning the runtime is superlinear in the $\numvar$ sized input, based on a specific parameter $k$, as is the case for counting $k$-cliques and multiple joins also known as $k$-joins. This hardness result is unsatisfying in the sense that it doesn't account for computing the expected count, $\expct\pbox{\poly(\vct{X})}$. A natural question is if we can quantify the hardness of the probability computation beyond the complexity of deterministic (pre) processing. The model illustrated in \cref{fig:two-step} is one way to do this. %Assuming $\query$ is linear or better, how does query computation of a $\abbrPDB$ compare to deterministic query processing?
This model views \abbrPDB query processing as two steps. As depicted, computing $\query$ over a $\abbrPDB$ consists of the first step, which is essentially the deterministic computation of both the query output and $\poly(\vct{X})$.%result tuple lineage polynomial(s) encoded in the respective representation.
\footnote{Note that, assuming standard query algorithms over $\raPlus$ operators, computing a lineage polynomial is of the same complexity as computing the query output.}
% the runtime of the first step is the same in both the deterministic and \abbrPDB settings, since the computation of the linage is never greater than the query processing time.}
The second step consists of computing the expectation of $\poly({\vct{X}})$, $\expct\pbox{\poly(\vct{X})}$. This model of computation is nicely followed by set-\abbrPDB semantics \cite{DBLP:series/synthesis/2011Suciu} (where e.g. computing the marginal probability in intensional evaluation is a separate step; further, computing the marginal probability in extensional evaluation occurs as a separate step of each operator, and therefore implies that both concerns can be separated) and also by that of semiring provenance \cite{DBLP:conf/pods/GreenKT07} (where the $\semNX$-DB first computes the annotation via the query, and then the polynomial is evaluated on a specific valuation), and further, it is useful in this work for the purpose of separating the deterministic computation from the probability computation.
This work seeks to explore whether or not step two is \emph{always} of equal or lesser complexity relative to step one in the setting of bag \abbrPDB\xplural while establishing theoretical foundations supporting the answer to this question. For step one, as alluded above, in general query processing in deterministic databases is polynomial in $k$. Then our question is, ``Is it always the case that computing the expected count of a result tuple $\tup$ is linear in the complexity of step one, or are there classes of queries where computing step two is \emph{superlinear} in the query complexity?''
Most work done in \abbrPDB\xplural has been done in the setting of set \abbrPDB\xplural, where lineage is represented as a propositional formula rather than a polynomial. Each output tuple of a query $\query$ appears once with a marginal probability of $\expct\pbox{\poly(\vct{X})}$\footnote{We abuse notation and denote the propositional formula as $\poly(\vct{X})$}. The problem of computing $\query$ \emph{exactly} over a set-\abbrPDB is known to be \sharpphard in the general case. The dichotomy of Dalvi and Suciu \cite{10.1145/1265530.1265571} shows that for set-\abbrPDB\xplural it is the case that $\query(\pdb)$ is either polynomial or \sharpphard in $\numvar$ for any polytime step one. Since the hardness is non-parameterized, it is not necessary to consider the two step model. It is noteworthy to point out that this dichotomy is \emph{based} on the query structure and in general is independent of the representation of the lineage polynomial.\footnote{We do note that there exist specific cases when given a specific database instance combined with an amenable representation, that a hard $\query$ can become easy, but this is \emph{not} the general case.} If we are okay with approximation, then this problem can then be brought back down to at most quadratic time.\AH{Citation necessary.}
%Since set-\abbrPDB\xplural are essentially limited to computing the marginal probability of $\tup$, bag-\abbrPDB\xplural are a more natural fit for computing queries such as count queries.
Traditionally, bag-\abbrPDB\xplural have long been considered to be bottlenecked in step one only, or linear in the size of query. This may partially be due to the prevalence that exists in using a sum of products (\abbrSOP) representation of the lineage polynomial amongst many of the most well-known implementations of set-\abbrPDB\xplural. Such a representation used in the bag-\abbrPDB setting \emph{indeed} allows for step two to be linear in the \emph{size} of the \abbrSOP representation, a result due to linearity of expectation.
The main insight of the paper is that we should not stop here. One can have compact representations of $\poly(\vct{X})$ resulting from, for example, optimizations like projection push-down produce factorized representations of $\poly(\vct{X})$. To capture such factorizations, this work uses (arithmetic) circuits as the representation system of $\poly(\vct{X})$, which are a natural fit to $\raPlus$ queries as each operator maps to either a $\circplus$ or $\circmult$ operation \cite{DBLP:conf/pods/GreenKT07} (as shown in \cref{fig:nxDBSemantics}). Our work explores whether or not step two in the computation model is \emph{always} linear in the \emph{size} of the representation of the lineage polynomial when step one of $\query(\pdb)$ is easy. %This works focuses on step two of the computation model specifically in regards to bag-\abbrPDB queries.
Consider again the bag-\abbrTIDB $\pdb$. When the probability of all tuples $\prob_i = 1$, the problem of computing the expected count is linear in the size of the arithmetic circuit, and we have polytime complexity for computing $\query(\pdb)$. This leads us to our problem statement:
\begin{Problem}\label{prob:intro-stmt}
Given a query $\query$ in $\raPlus$ and bag \abbrPDB $\pdb$, what is the complexity (in the size of the circuit representation) of computing step two ($\expct\pbox{\poly(\vct{X})}$) for each tuple $\tup$ in the output of $\query(\pdb)$?
\end{Problem}
We show, for the class of \abbrTIDB\xplural with $\prob_i < 1$, the problem of computing step two in general is no longer linear in the size of the lineage polynomial representation.
Our work further introduces an approximation algorithm of the expected count of $\tup$ from the bag-\abbrPDB query $\query$ which runs in linear time.
As noted, bag-\abbrPDB query output is a probability distribution over the possible multiplicities of $\tup$, which is a stark contrast to the marginal probability ($\expct\pbox{\poly\inparen{\vct{X}}}$) paradigm of set-\abbrPDB\xplural. From a theoretical perspective, not much work has been done considering bag-\abbrPDB\xplural. Focusing on computing the expected count ($\expct\pbox{\poly\inparen{\vct{X}}}$) of $\tup$ is therefore a natural (and simplistic) statistic to consider in further developing the theoretical foundations of bag-\abbrPDB\xplural. There are indeed other statistical measures that can be computed, which are beyond the scope of this paper, though we additionally consider higher moments, which can be found in the appendix.
Our work focuses on the following setting for query computation. Inputs of $\query$ are set-\abbrPDB\xplural, while the output of $\query$ is a bag-\abbrPDB. This setting, however, is not limiting as a simple generalization exists, reducing a bag \abbrPDB to a set \abbrPDB with typically only an $O(1)$ increase in size.
%%%%%%%%%%%%%%%%%%%%%%%%%
%Contributions, Overview, Paper Organization
%%%%%%%%%%%%%%%%%%%%%%%%%
Concretely, we make the following contributions:
(i) We show that \cref{prob:intro-stmt} for bag-\abbrTIDB\xplural is \sharpwonehard in the size of the lineage circuit by reduction from counting the number of $k$-matchings over an arbitrary graph; we further show superlinear hardness for a specific cubic graph query for the special case of all $\prob_i = \prob$ for some $\prob$ in $(0, 1)$;
(ii) We present a $(1\pm\epsilon)$-\emph{multiplicative} approximation algorithm for bag-\abbrTIDB\xplural and $\raPlus$ queries; we further show that for typical database usage patterns (e.g. when the circuit is a tree or is generated by recent worst-case optimal join algorithms or their FAQ followups~\cite{DBLP:conf/pods/KhamisNR16}) have complexity linear in the size of the compressed lineage encoding (in contrast, known approximation techniques in set-\abbrPDB\xplural are at most quadratic); (iii) We generalize the approximation algorithm to a class of bag-\abbrBIDB\xplural, a more general model of probabilistic data; (iv) We further prove that for $\raPlus$ queries
%(an equivalently expressive, but factorizable form of UCQs),
\AH{This point \emph{\Large seems} weird to me. I thought we just said that the approximation complexity is linear in step one, but now it's as if we're saying that it's $\log{\text{step one}} + $ the runtime of step one. Where am I missing it?}
we can approximate the expected output tuple multiplicities with only $O(\log{Z})$ overhead (where $Z$ is the number of output tuples) over the runtime of a broad class of query processing algorithms. We also observe that our results trivially extend to higher moments of the tuple multiplicity (instead of just the expectation).
\mypar{Overview of our Techniques} All of our results rely on working with a {\em reduced} form of the lineage polynomial $\Phi$. In fact, it turns out that for the TIDB (and BIDB) case, computing the expected multiplicity is {\em exactly} the same as evaluating this reduced polynomial over the probabilities that define the TIDB/BIDB. Next, we motivate this reduced polynomial in what follows.%continuing \Cref{ex:intro-tbls}.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%Old Figure from 1st ICDT submission
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%\begin{figure}[t]
% \begin{subfigure}[b]{0.49\linewidth}
% \centering
%{\small
% \begin{tabular}{ c | c c c}
% $OnTime$ & City$_\ell$ & $\Phi$ & \textbf{p}\\
% \hline
% & Buffalo & $L_a$ & 0.9 \\
% & Chicago & $L_b$ & 0.5\\
% & Bremen & $L_c$ & 0.5\\
% & Zurich & $L_d$ & 1.0\\
% \end{tabular}
% }
% \caption{Relation $OnTime$}
% \label{subfig:ex-shipping-simp-loc}
% \end{subfigure}%
% \begin{subfigure}[b]{0.49\linewidth}
% \centering
%{\small
% \begin{tabular}{ c | c c c c}
% $Route$ & $\text{City}_1$ & $\text{City}_2$ & $\Phi$ & \textbf{p} \\
% \hline
% & Buffalo & Chicago & $R_a$ & 1.0 \\
% & Chicago & Zurich & $R_b$ & 1.0 \\
% %& $\cdots$ & $\cdots$ & $\cdots$ & $\cdots$ \\
% & Chicago & Bremen & $R_c$ & 1.0 \\
% \end{tabular}
% }
% \caption{Relation $Route$}
% \label{subfig:ex-shipping-simp-route}
% \end{subfigure}%
% % \begin{subfigure}[b]{0.17\linewidth}
% % \centering
%
% % \caption{Circuit for $(Chicago)$}
% % \label{subfig:ex-proj-push-circ-q3}
% % \end{subfigure}
%
% \begin{subfigure}[b]{0.66\linewidth}
% \centering
%{\small
% \begin{tabular}{ c | c c c}
% $\query_1$ & City & $\Phi$ & $\expct_{\idb \sim \probDist}[\query(\db)(t)]$ \\ \hline
% & Buffalo & $L_a \cdot R_a$ & $0.9$ \\
% & Chicago & $L_b \cdot R_b + L_b \cdot R_c$ & $0.5 \cdot 1.0 + 0.5 \cdot 1.0 = 1.0$ \\
% %& $\cdots$ & $\cdots$ & $\cdots$ \\
% \end{tabular}
% }
% \caption{$Q_1$'s Result}
% \label{subfig:ex-shipping-simp-queries}
% \end{subfigure}%
% \begin{subfigure}[b]{0.33\linewidth}
% \centering
% \resizebox{!}{16mm} {
% \begin{tikzpicture}[thick]
% \node[tree_node] (a2) at (0, 0){$R_b$};
% \node[tree_node] (b2) at (1, 0){$L_b$};
% \node[tree_node] (c2) at (2, 0){$R_c$};
% %level 1
% \node[tree_node] (a1) at (0.5, 0.8){$\boldsymbol{\circmult}$};
% \node[tree_node] (b1) at (1.5, 0.8){$\boldsymbol{\circmult}$};
% %level 0
% \node[tree_node] (a0) at (1.0, 1.6){$\boldsymbol{\circplus}$};
% %edges
% \draw[->] (a2) -- (a1);
% \draw[->] (b2) -- (a1);
% \draw[->] (b2) -- (b1);
% \draw[->] (c2) -- (b1);
% \draw[->] (a1) -- (a0);
% \draw[->] (b1) -- (a0);
% \end{tikzpicture}
% }
% \resizebox{!}{16mm} {
% \begin{tikzpicture}[thick]
% \node[tree_node] (a1) at (1, 0){$R_b$};
% \node[tree_node] (b1) at (2, 0){$R_c$};
% %level 1
% \node[tree_node] (a2) at (0.75, 0.8){$L_b$};
% \node[tree_node] (b2) at (1.5, 0.8){$\boldsymbol{\circplus}$};
% %level 0
% \node[tree_node] (a3) at (1.1, 1.6){$\boldsymbol{\circmult}$};
% %edges
% \draw[->] (a1) -- (b2);
% \draw[->] (b1) -- (b2);
% \draw[->] (a2) -- (a3);
% \draw[->] (b2) -- (a3);
% \end{tikzpicture}
% }
% \caption{Two circuits for $Q_1(Chicago)$}
% \label{subfig:ex-proj-push-circ-q4}
% \end{subfigure}%
% \vspace*{-3mm}
% \caption{\ti instance and query results for \cref{ex:overview}}%\Cref{ex:intro-tbls}.}%{$\ti$ relations for $\poly$}
% \label{fig:ex-shipping-simp}
% \trimfigurespacing
%\end{figure}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Consider the query $\query(\pdb) \coloneqq \project_\emptyset(OnTime \join_{City = City_1} Route \join_{{City}_2 = City'}\rename_{City' \leftarrow City}(OnTime))$
%$Q()\dlImp$$OnTime(\text{City}), Route(\text{City}, \text{City}'),$ $OnTime(\text{City}')$
over the bag relations of \cref{fig:two-step}. It can be verified that $\Phi$ for $Q$ is $L_aR_aL_b + L_bR_bL_d + L_bR_cL_c$. Now consider the product query $\query^2(\pdb) = \query(\pdb) \times \query(\pdb)$.
The lineage polynomial for $Q^2$ is given by $\Phi^2$:
\begin{multline*}
\left(L_aR_aL_b + L_bR_bL_d + L_bR_cL_c\right)^2\\
=L_a^2R_a^2L_b^2 + L_b^2R_b^2L_d^2 + L_b^2R_c^2L_c^2 + 2L_aR_aL_b^2R_bL_d + 2L_aR_aL_b^2R_cL_c + 2L_b^2R_bL_dR_cL_c.
\end{multline*}
The expectation $\expct\pbox{\Phi^2}$ then is:
\begin{footnotesize}
\begin{multline*}
\expct\pbox{L_a^2}\expct\pbox{R_a^2}\expct\pbox{L_b^2} + \expct\pbox{L_b^2}\expct\pbox{R_b^2}\expct\pbox{L_d^2} + \expct\pbox{L_b^2}\expct\pbox{R_c^2}\expct\pbox{L_c^2} + 2\expct\pbox{L_a}\expct\pbox{R_a}\expct\pbox{L_b^2}\expct\pbox{R_b}\expct\pbox{L_d}\\
+ 2\expct\pbox{L_a}\expct\pbox{R_a}\expct\pbox{L_b^2}\expct\pbox{R_c}\expct\pbox{L_c} + 2\expct\pbox{L_b^2}\expct\pbox{R_b}\expct\pbox{L_d}\expct\pbox{R_c}\expct\pbox{L_c}
\end{multline*}
\end{footnotesize}
\noindent If the domain of a random variable $W$ is $\{0, 1\}$, then for any $k > 0$, $\expct\pbox{W^k} = \expct\pbox{W}$, which means that $\expct\pbox{\Phi^2}$ simplifies to:
\begin{footnotesize}
\begin{multline*}
\expct\pbox{L_a}\expct\pbox{R_a}\expct\pbox{L_b} + \expct\pbox{L_b}\expct\pbox{R_b}\expct\pbox{L_d} + \expct\pbox{L_b}\expct\pbox{R_c}\expct\pbox{L_c} + 2\expct\pbox{L_a}\expct\pbox{R_a}\expct\pbox{L_b}\expct\pbox{R_b}\expct\pbox{L_d} \\
+ 2\expct\pbox{L_a}\expct\pbox{R_a}\expct\pbox{L_b}\expct\pbox{R_c}\expct\pbox{L_c} + 2\expct\pbox{L_b}\expct\pbox{R_b}\expct\pbox{L_d}\expct\pbox{R_c}\expct\pbox{L_c}
\end{multline*}
\end{footnotesize}
\noindent This property leads us to consider a structure related to the lineage polynomial.
\begin{Definition}\label{def:reduced-poly}
For any polynomial $\poly(\vct{X})$, define the \emph{reduced polynomial} $\rpoly(\vct{X})$ to be the polynomial obtained by setting all exponents $e > 1$ in the SOP form of $\poly(\vct{X})$ to $1$.
\end{Definition}
With $\Phi^2$ as an example, we have:
\begin{align*}
&\widetilde{\Phi^2}(L_a, L_b, L_c, L_d, R_a, R_b, R_c)\\
&\; = L_aR_aL_b + L_bR_bL_d + L_bR_cL_c + 2L_aR_aL_bR_bL_d + 2L_aR_aL_bR_cL_c + 2L_bR_bL_dR_cL_c
\end{align*}
It can be verified that the reduced polynomial parameterized with each variable's respective marginal probability is a closed form of the expected count (i.e., $\expct\pbox{\Phi^2} = \widetilde{\Phi^2}(\probOf\pbox{L_a=1},$ $\probOf\pbox{L_b=1}, \probOf\pbox{L_c=1}, \probOf\pbox{L_d=1})$). In fact, we show in \Cref{lem:exp-poly-rpoly} that this equivalence holds for {\em all} $\raPlus$ queries over TIDB/BIDB.
To prove our hardness result we show that for the same $Q$ considered in the running example, the query $Q^k$ is able to encode various hard graph-counting problems. We do so by analyzing how the coefficients in the (univariate) polynomial $\widetilde{\Phi}\left(p,\dots,p\right)$ relate to counts of various sub-graphs on $k$ edges in an arbitrary graph $G$ (which is used to define the relations in $Q$). For an upper bound on approximating the expected count, it is easy to check that if all the probabilities are constant then ${\Phi}\left(\probOf\pbox{X_1=1},\dots, \probOf\pbox{X_n=1}\right)$ (i.e. evaluating the original lineage polynomial over the probability values) is a constant factor approximation. For example, if we know that $\prob_0 = \max_{i \in [\numvar]}\prob_i$, then $\poly(\prob_0,\ldots, \prob_0)$ is an upper bound constant factor approximation. The opposite holds true for determining a constant factor lower bound. To get a $(1\pm \epsilon)$-multiplicative approximation we sample monomials from $\Phi$ and `adjust' their contribution to $\widetilde{\Phi}\left(\cdot\right)$.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\mypar{Paper Organization} We present relevant background and notation in \Cref{sec:background}. We then prove our main hardness results in \Cref{sec:hard} and present our approximation algorithm in \Cref{sec:algo}. We present some (easy) generalizations of our results in \Cref{sec:gen} and also discuss extensions from computing expectations of polynomials to the expected result multiplicity problem (\Cref{def:the-expected-multipl})\AH{Aren't they the same?}. Finally, we discuss related work in \Cref{sec:related-work} and conclude in \Cref{sec:concl-future-work}.

393
Sketching Worlds/intro.tex Normal file
View File

@ -0,0 +1,393 @@
%root: main.tex
%!TEX root=./main.tex
\section{Introduction}
\label{sec:intro}
In their most general form, tuple-independent set-probabilistic databases~\cite{DBLP:series/synthesis/2011Suciu} (TIDBs) answer existential queries (queries for the probability of a specific condition holding over the input database) in two steps: (i) lineage and (ii) probability.
The lineage is a boolean formula, an element of the $\text{PosBool}[\vct{X}]$ semiring, where lineage variables $\vct{X}\in \mathbb{B}^\numvar$ are random variables corresponding to the presence of each of the $\numvar$ input tuples in one possible world of the input database.
The lineage models the relationship between the presence of these input tuples and the query condition being satisfied, and thus the probability of this formula is exactly the query result.
The analogous query in the bag setting~\cite{DBLP:journals/sigmod/GuagliardoL17,feng:2019:sigmod:uncertainty} asks for the expectation of the number (multiplicity) of result tuples that satisfy the query condition.
The process for responding to such queries is also analogous, save that the lineage is a polynomial, an element from the $\mathbb{N}[\vct{X}]$ semiring, with coefficients in the set of natural numbers $\mathbb{N}$ and random variables from the set $\vct{X} \in \mathbb{N}^\numvar$.
The expectation of this polynomial is the query result.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{figure}[t]
\begin{subfigure}[b]{0.33\linewidth}
\centering
\resizebox{!}{9mm}{
\begin{tabular}{ c | c c c}
$Loc$ & City$_\ell$ & $\Phi_{set}$ & $\Phi_{bag}$\\
\hline
& Buffalo & $L_a$ & $L_a$\\
& Chicago & $L_b$ & $L_b$\\
& Bremen & $L_c$ & $L_c$\\
%& Tel Aviv & $L_d$ & $L_d$\\
 & Zurich & $L_d$ & $L_d$\\
\end{tabular}
}
\caption{Relation $Loc$ in \Cref{ex:intro-tbls}}
\label{subfig:ex-shipping-loc}
\end{subfigure}%
\begin{subfigure}[b]{0.33\linewidth}
\centering
\resizebox{!}{9mm}{
\begin{tabular}{ c | c c c c}
$Route$ & $\text{City}_1$ & $\text{City}_2$ & $\Phi_{set}$ & $\Phi_{bag}$ \\
\hline
& Buffalo & Chicago & $\top$ & $1$\\
& Chicago & Zurich & $\top$ & $1$\\
& $\cdots$ & $\cdots$ & $\cdots$ & $\cdots$\\
& Chicago & Bremen & $\top$ & $1$\\
\end{tabular}
}
\caption{Relation $Route$ in \Cref{ex:intro-tbls}}
\label{subfig:ex-shipping-route}
\end{subfigure}%
\begin{subfigure}[b]{0.33\linewidth}
\centering
\resizebox{!}{9mm}{
\begin{tabular}{ c | c c c}
$Q_{1}$ & $\text{City}_1$ & $\Phi_{set}$ & $\Phi_{bag}$ \\
\hline
& Chicago & $\top \vee \top = \top$ & $1 + 1 = 2$\\
\multicolumn{1}{c}{\vspace{1mm}}\\
$Q_{2}$ & $\text{City}_1$ & $\Phi_{set}$ & $\Phi_{bag}$ \\
\hline
& Chicago & $L_a \wedge \top$ & $2L_a$\\
\end{tabular}
}
\caption{$Q_1$ and $Q_2$ in \Cref{ex:intro-tbls}}
\label{subfig:ex-shipping-queries}
\end{subfigure}%
\vspace*{-3mm}
\caption{ }%{$\ti$ relations for $\poly$}
\label{fig:ex-shipping}
\trimfigurespacing
\end{figure}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Example}\label{ex:intro-tbls}
Consider the \ti tables (\Cref{fig:ex-shipping}) from an international shipping company.
Table Loc lists all the locations of airports.
Table Route identifies all flight routes.
The tuples of both tables are annotated with elements of the $\text{PosBool}[\vct{X}]$ ($\Phi_{set}$) and $\mathbb{N}[\vct{X}]$ ($\Phi_{bag}$) semirings that indicate the tuples' presence or multiplicity, respectively.
Tuples of Loc are annotated with random variables $L_i$ that model the probability of no delays at the airport on a given day.\footnote{We assume for simplicity that these variables are independent events.}
Tuples of Routes are annotated with a constant ($\top$ or $1$ respectively), and are deterministic; queries over this table follow classical query evaluation semantics.
Consider a customer service representative who needs to expedite a shipment to Western Europe.
The query $Q_1 := \pi_{\text{City}_1}\left(\sigma_{\text{City}_2 = \text{``Bremen''} ~OR~ \text{City}_2 = \text{``Zurich''}}\right.$$\left.(Route)\right)$ asks for all cities with routes to either Zurich or Bremen.
Both routes exist from Chicago, and so the result lineage~\cite{DBLP:conf/pods/GreenKT07} of the corresponding tuple (\Cref{subfig:ex-shipping-queries}) indicates that the tuple is deterministically present, either via Zurich or Bremen.
Analogously, under bag semantics Chicago appears in the result twice.
Observe that even when the input is a set (i.e., input tuple annotations are at most $1$), we can still evaluate bag queries over it.
Suppose the representative would like to consider delays from the originating city, as per the query $Q_2 := \pi_{\text{City}_1}(Loc$ $\bowtie_{\text{City}_\ell = \text{City}_1} Q_{1})$.
The resulting lineage formulas (\Cref{subfig:ex-shipping-queries}) concisely describe the event of delivering a shipment to Zurich or Bremen without departure delay, or the number of departure-delay-free routes to these cities given an assignment to $L_b$.
If Chicago is delay-free ($L_b = \top$, $L_b = 1$, respectively), there exists a route (set semantics) or there are two routes (bag semantics).
\end{Example}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%The computation of the marginal probability in is a known . Its corresponding problem in bag PDBs is computing the expected count of a tuple $\tup$. The computation is a two step process. The first step involves actually computing the lineage formula of $\tup$. The second step then computes the marginal probability (expected count) of the lineage formula of $\tup$, a boolean formula (polynomial) in the set (bag) semantics setting.
A well-known dichotomy~\cite{10.1145/1265530.1265571} separates the common-case \sharpphard problem of computing the probability of a boolean lineage formula from the case where the probability computation can be inlined into the polynomial-time lineage construction process.
Historically, the bottleneck for \emph{set-}probabilistic databases has been the second step; an instrumented query can compute a circuit encoding of the result lineage with at most a constant-factor overhead over the un-instrumented query ((TODO: Find citation)).
Because the probability computation is the bottleneck, it is typical to assume that the lineage formula is provided in disjunctive normal form (DNF), as even when this assumption holds the problem remains \sharpphard in general.
However, for bag semantics the analogous sum of products (SOP) lineage representation admits a trivial naive implementation due to linearity of expectation.
However, what can be said about lineage polynomials (i.e., bag-probabilistic database query results) that are in a compressed (e.g, circuit) representation instead?
In this paper we study computing the expected count of an output bag PDB tuple whose lineage formula is in a compressed representation, using the more general intensional query evaluation semantics.
%
%%Most theoretical developments in probabilistic databases (PDBs) have been made in the setting of set semantics. This is largely due to the stark contrast in hardness results when computing the first moment of a tuple's lineage formula (a boolean formula encoding the contributing input tuples to the output tuple) in set semantics versus the linear runtime when computing the expectation over the lineage polynomial (a standard polynomial analogously encoding contributing input tuples) of a tuple from an output bag PDB. However, when viewed more closely, the assumption of linear runtime in the bag setting relies on the lineage polynomial being in its "expanded" sum of products (SOP) form (each term is a product, where all (product) terms are summed). What can be said about computing the expectation of a more compressed form of the lineage polyomial (e.g. factorized polynomial) under bag semantics?
%
%%As explainability and fairness become more relevant to the data science community, it is now more critical than ever to understand how reliable a dataset is.
%%Probabilistic databases (PDBs)~\cite{DBLP:series/synthesis/2011Suciu} are a compelling solution, but a major roadblock to their adoption remains:
%%PDBs are orders of magnitude slower than classical (i.e., deterministic) database systems~\cite{feng:2019:sigmod:uncertainty}.
%%Naively, one might suggest that this is because most work on probabilistic databases assumes set semantics, while, virtually all implementations of the relational data model use bag semantics.
%%However, as we show in this paper, there is a more subtle problem behind this barrier to adoption.
%\subsection{Sets vs. Bags}
%In the setting of set semantics, this problem can be defined as: given a query, probabilistic database, and possible result tuple, compute the marginal probability of the tuple appearing in the result. It has been shown that this is equivalent to computing the probability of the lineage formula. %, which records how the result tuple was derived from input tuples.
%Given this correspondence, the problem reduces to weighted model counting over the lineage (a \sharpphard problem, even if the lineage is in DNF--the "expanded" form of the lineage formula in set semantics, corresponding to SOP of bag semantics).
%%A large body of work has focused on identifying tractable cases by either identifying tractable classes of queries (e.g.,~\cite{DS12}) or studying compressed representations of lineage formulas that are tractable for certain classes of input databases (e.g.,~\cite{AB15}). In this work we define a compressed representation as any one of the possible circuit representations of the lineage formula (please see Definitions~\ref{def:circuit},~\ref{def:poly-func}, and~\ref{def:circuit-set}).
%
%In bag semantics this problem corresponds to computing the expected multiplicity of a query result tuple, which can be reduced to computing the expectation of the lineage polynomial.
%
%\begin{Example}\label{ex:intro}
%The tables $\rel$ and $E$ in \Cref{fig:intro-ex} are examples of an incomplete database. In the setting of set semantics (disregard $\Phi_{bag}$ for the moment), every tuple $\tup$ of these tables is annotated with a variable or the symbol $\top$. Each assignment of values to variables ($\{\;W_a,W_b,W_c\;\}\mapsto \{\;\top,\bot\;\}$) identifies one \emph{possible world}, a deterministic database instance containing exactly the tuples annotated by the constant $\top$ or by a variable assigned to $\top$. When each variable represents an \emph{independent} event, this encoding is called a Tuple Independent Database $(\ti)$.
%
%The probability of this world is the joint probability of the corresponding assignments.
%For example, let $\probOf[W_a] = \probOf[W_b] = \probOf[W_c] = \prob$ and consider the possible world where $R = \{\;\tuple{a}, \tuple{b}\;\}$.
%The corresponding variable assignment is $\{\;W_a \mapsto \top, W_b \mapsto \top, W_c \mapsto \bot\;\}$, and its probability is $\probOf[W_a]\cdot \probOf[W_b] \cdot \probOf[\neg W_c] = \prob\cdot \prob\cdot (1-\prob)=\prob^2-\prob^3$.
%\end{Example}
%
%\begin{figure}[t]
% \begin{subfigure}{0.33\linewidth}
% \centering
% \resizebox{!}{10mm}{
% \begin{tabular}{ c | c c c}
% $\rel$ & A & $\Phi_{set}$ & $\Phi_{bag}$\\
% \hline
% & a & $W_a$ & $W_a$\\
% & b & $W_b$ & $W_b$\\
% & c & $W_c$ & $W_c$\\
% \end{tabular}
%} \caption{Relation $R$ in \Cref{ex:intro}}
% \label{subfig:ex-atom1}
% \end{subfigure}%
% \begin{subfigure}{0.33\linewidth}
% \centering
% \resizebox{!}{10mm}{
% \begin{tabular}{ c | c c c c}
% $E$ & A & B & $\Phi_{set}$ & $\Phi_{bag}$ \\
% \hline
% & a & b & $\top$ & $1$\\
% & b & c & $\top$ & $1$\\
% & c & a & $\top$ & $1$\\
% \end{tabular}
% }
% \caption{Relation $E$ in \Cref{ex:intro}}
% \label{subfig:ex-atom3}
% \end{subfigure}%
% \begin{subfigure}{0.33\linewidth}
% \centering
% \resizebox{!}{29mm}{
% \begin{tikzpicture}[thick]
% \node[tree_node] (a1) at (0, 0){$W_a$};
% \node[tree_node] (b1) at (1, 0){$W_b$};
% \node[tree_node] (c1) at (2, 0){$W_c$};
% \node[tree_node] (d1) at (3, 0){$W_d$};
%
% \node[tree_node] (a2) at (0.75, 0.8){$\boldsymbol{\circmult}$};
% \node[tree_node] (b2) at (1.5, 0.8){$\boldsymbol{\circmult}$};
% \node[tree_node] (c2) at (2.25, 0.8){$\boldsymbol{\circmult}$};
%
% \node[tree_node] (a3) at (1.9, 1.6){$\boldsymbol{\circplus}$};
% \node[tree_node] (a4) at (0.75, 1.6){$\boldsymbol{\circplus}$};
% \node[tree_node] (a5) at (0.75, 2.5){$\boldsymbol{\circmult}$};
%
% \draw[->] (a1) -- (a2);
% \draw[->] (b1) -- (a2);
% \draw[->] (b1) -- (b2);
% \draw[->] (c1) -- (b2);
% \draw[->] (c1) -- (c2);
% \draw[->] (d1) -- (c2);
% \draw[->] (c2) -- (a3);
% \draw[->] (a2) -- (a4);
% \draw[->] (b2) -- (a3);
% \draw[->] (a3) -- (a4);
% %sink
% \draw[thick, ->] (a4.110) -- (a5.250);
% \draw[thick, ->] (a4.70) -- (a5.290);
% \draw[thick, ->] (a5) -- (0.75, 3.0);
% \end{tikzpicture}
% }
% \caption{Circuit encoding for query $\poly^2$.}
% \label{fig:circuit-q2-intro}
% \end{subfigure}
% %\vspace*{3mm}
% \vspace*{-3mm}
% \caption{ }%{$\ti$ relations for $\poly$}
% \label{fig:intro-ex}
% \trimfigurespacing
%\end{figure}
%
%
%Following prior efforts~\cite{feng:2019:sigmod:uncertainty,DBLP:conf/pods/GreenKT07,GL16}, we generalize this model of Set-PDBs to Bag-PDBs using $\semN$-valued random variables (i.e., $\domain(\randomvar_i) \subseteq \mathbb N$) and constants (annotation $\Phi_{bag}$ in the example).
%Without loss of generality, we assume that input relations are sets (i.e. $Dom(W_i) = \{0, 1\}$), while \emph{query evaluation follows bag semantics}.
%
%\begin{Example}\label{ex:bag-vs-set}
%Continuing the prior example, we are given the following Boolean (resp,. count) query
%$$\poly() \dlImp R(A), E(A, B), R(B)$$
%The lineage of the result in a Set-PDB (Bag-PDB) is a Boolean formula (polynomial) over random variables annotating the input relations (i.e., $W_a$, $W_b$, $W_c$).
%Because the query result is a nullary relation, in what follows we can write $Q(\cdot)$ to denote the function that evaluates the lineage over one specific assignment of values to the variables (i.e., the value of the lineage in the corresponding possible world):
%
%\setlength\parindent{0pt}
%\vspace*{-3mm}
%\begin{tabular}{@{}l l}
% \begin{minipage}[b]{0.45\linewidth}
% \begin{equation}
% \poly_{set}(W_a, W_b, W_c) = W_aW_b \vee W_bW_c \vee W_cW_a\label{eq:poly-set}
% \end{equation}
% \end{minipage}\hspace*{5mm}
% &
% \begin{minipage}[b]{0.45\linewidth}
% \begin{equation}
% \poly_{bag}(W_a, W_b, W_c) = W_aW_b + W_bW_c + W_cW_a\label{eq:poly-bag}
% \end{equation}
% \end{minipage}\\
%\end{tabular}
%\vspace*{1mm}
%
%
%
%These functions compute the existence (count) of the nullary tuple resulting from applying $\poly$ on the PDB of \Cref{fig:intro-ex}.
%For the same possible world identified in \Cref{ex:intro}:
%$$
%\begin{tabular}{c c}
% \begin{minipage}[b]{0.45\linewidth}
% $\poly_{set}(\top, \top, \bot) = \top\top \vee \top\bot \vee \bot\top = \top$
% \end{minipage}
% &
% \begin{minipage}[b]{0.45\linewidth}
% $\poly_{bag}(1, 1, 0) = 1 \cdot 1 + 1\cdot 0 + 0 \cdot 1 = 1$
% \end{minipage}\\
%\end{tabular}
%$$
%
%The Set-PDB query is satisfied in this possible world and the output Bag-PDB tuple has a multiplicity of 1.
%The marginal probability (expected count) of this query is computed over all possible worlds:
%{\small
%\begin{align*}
%\probOf[\poly_{set}] &= \hspace*{-1mm}
% \sum_{w_i \in \{\top,\bot\}} \indicator{\poly_{set}(w_a, w_b, w_c)}\probOf[W_a = w_a,W_b = w_b,W_c = w_c]\\
%\expct[\poly_{bag}] &= \sum_{w_i \in \{0,1\}} \poly_{bag}(w_a, w_b, w_c)\cdot \probOf[W_a = w_a,W_b = w_b,W_c = w_c]
%\end{align*}
%}
%\end{Example}
%
%Note that the query of \Cref{ex:bag-vs-set} in set semantics is indeed non-hierarchical~\cite{DS12}, and thus \sharpphard.
%To see why computing this probability is hard, observe that the three clauses $(W_aW_b, W_bW_c, W_aW_c)$ of $(\ref{eq:poly-set})$ are not independent (the same variables appear in multiple clauses) nor disjoint (the clauses are not mutually exclusive). Computing the probability of such formulas exactly requires exponential time algorithms (e.g., Shanon Decomposition).
%Conversely, in Bag-PDBs, correlations between monomials of the SOP polynomial (\ref{eq:poly-bag}) are not problematic thanks to linearity of expectation.
%The expectation computation over the output lineage is simply the sum of expectations of each clause.
%Referring again to example~\ref{ex:intro}, the expectation is simply
%\begin{equation*}
%\expct\pbox{\poly_{bag}(W_a, W_b, W_c)} = \expct\pbox{W_aW_b} + \expct\pbox{W_bW_c} + \expct\pbox{W_cW_a}
%\end{equation*}
%In this particular lineage polynomial, all variables in each product clause are independent, so we can push expectations through.
%\begin{equation*}
%= \expct\pbox{W_a}\expct\pbox{W_b} + \expct\pbox{W_b}\expct\pbox{W_c} + \expct\pbox{W_c}\expct\pbox{W_a}
%\end{equation*}
%Computing such expectations is indeed linear in the size of the SOP as the number of operations in the computation is \textit{exactly} the number of multiplication and addition operations of the polynomial.
%As a further interesting feature of this example, note that $\expct\pbox{W_i} = \probOf[W_i = 1]$, and so taking the same polynomial over the reals:
%\begin{equation}
%\label{eqn:can-inline-probabilities-into-polynomial}
%\expct\pbox{\poly_{bag}}
%= \poly_{bag}(\probOf[W_a=1], \probOf[W_b=1], \probOf[W_c=1])
%\end{equation}
%\Cref{eqn:can-inline-probabilities-into-polynomial} is not true in general, as we shall see in \Cref{sec:suplin-bags}.
%
%The workflow modeling this particular problem can be broken down into two steps. We start with converting the output boolean formula (polynomial) into a representation. This representation is then the interface for the second step, which is computing the marginal probability (count) of the encoded boolean formula (polynomial). A natural question arises as to which representation to use. Our choice to use circuits (\Cref{def:circuit}) to represent the lineage polynomials follows from the observation that the work in WCOJ/FAQ/Factorized DB's --\color{red}CITATION HERE\color{black}-- all contain algorithms that can be easily be modified to output circuits without changing their runtime. Further, circuits generally allow for greater compression than other respresentations, such as expression trees. By the former observation, step one is always linear in the size of the circuit representation of the boolean formula (polynomial), implying that if the second step of the workflow is computed in time greater, then reducing the complexity of the second step would indeed improve the overall efficiency of computing the marginal probability (count) of an output set (bag) PDB tuple. This, however, as noted earlier, cannot be done in the set semantics setting, due to known hardness results.
%
%Though computing the expected count of an output bag PDB tuple $\tup$ is linear (in the size of the polynomial) when the lineage polynomial of $\tup$ is in SOP form, %has received much less attention, perhaps due to the property of linearity of expectation noted above.
%%, perhaps because on the surface, the problem is trivially tractable.In fact, as mentioned, it is linear time when the lineage polynomial is encoded in an SOP representation.
%is this computation also linear (in the size of an equivalent compressed representation) when the lineage polynomial of $\tup$ is in compressed form?
%there exist compressed representations of polynomials, e.g., factorizations~\cite{factorized-db}, that can be polynomially more concise than their SOP counterpart.
Such compressed forms naturally occur in typical database optimizations, e.g., projection push-down~\cite{DBLP:books/daglib/0020812}, (where e.g. in the case of a projection followed by a join, addition would be performed prior to multiplication, yielding a product of sums instead of a SOP).
\begin{figure}[t]
\begin{subfigure}[b]{0.51\linewidth}
\centering
\resizebox{!}{20mm} {
\begin{tabular}{c | c c c}
$Route$ & $\text{City}_1$ & $\text{City}_2$ &$\Phi$\\
\hline
& Buffalo & Chicago & $R_a$\\
& Chicago & Zurich & $R_b$\\
& $\cdots$ & $\cdots$ & $\cdots$\\
& Chicago & Bremen & $R_c$\\
\multicolumn{1}{c}{\vspace{1mm}}\\
$Q_3$ & \text{City} & $\Phi_{set}$ & $\Phi_{bag}$\\
\hline
& Chicago & $L_b \wedge R_b \vee L_b \wedge R_c$ & $ L_b \cdot R_b + L_b \cdot R_c$\\
\multicolumn{1}{c}{\vspace{1mm}}\\
$Q_4$ & \text{City} & $\Phi_{set}$ & $\Phi_{bag}$\\
\hline
& Chicago & $L_b \wedge (R_b \vee R_c)$ & $L_b \cdot (R_b + R_c)$\\
\end{tabular}
}
\caption{$Route$, $Q_3$, $Q_4$}
\label{subfig:ex-proj-push-q4}
\end{subfigure}%
\begin{subfigure}[b]{0.24\linewidth}
\centering
\resizebox{!}{29mm} {
\begin{tikzpicture}[thick]
\node[tree_node] (a2) at (0, 0){$R_b$};
\node[tree_node] (b2) at (1, 0){$L_b$};
\node[tree_node] (c2) at (2, 0){$R_c$};
%level 1
\node[tree_node] (a1) at (0.5, 0.8){$\boldsymbol{\circmult}$};
\node[tree_node] (b1) at (1.5, 0.8){$\boldsymbol{\circmult}$};
%level 0
\node[tree_node] (a0) at (1.0, 1.6){$\boldsymbol{\circplus}$};
%edges
\draw[->] (a2) -- (a1);
\draw[->] (b2) -- (a1);
\draw[->] (b2) -- (b1);
\draw[->] (c2) -- (b1);
\draw[->] (a1) -- (a0);
\draw[->] (b1) -- (a0);
\end{tikzpicture}
}
\caption{Circuit encoding $Q_3$}
\label{subfig:ex-proj-push-circ-q3}
\end{subfigure}%
\begin{subfigure}[b]{0.24\linewidth}
\centering
\resizebox{!}{29mm} {
\begin{tikzpicture}[thick]
\node[tree_node] (a1) at (1, 0){$R_b$};
\node[tree_node] (b1) at (2, 0){$R_c$};
%level 1
\node[tree_node] (a2) at (0.75, 0.8){$L_b$};
\node[tree_node] (b2) at (1.5, 0.8){$\boldsymbol{\circplus}$};
%level 0
\node[tree_node] (a3) at (1.1, 1.6){$\boldsymbol{\circmult}$};
%edges
\draw[->] (a1) -- (b2);
\draw[->] (b1) -- (b2);
\draw[->] (a2) -- (a3);
\draw[->] (b2) -- (a3);
\end{tikzpicture}
}
\caption{Circuit encoding $Q_4$.}
\label{subfig:ex-proj-push-circ-q4}
\end{subfigure}%
\label{fig:ex-proj-push}
\end{figure}
\begin{Example}
Consider again the tables in \Cref{subfig:ex-shipping-loc} and \Cref{subfig:ex-shipping-route} and let us assume that the tuples in $Route$ are annotated with random variables as shown in \Cref{subfig:ex-proj-push-q4}.
Consider the equivalent queries $Q_3 := \pi_{\text{City}_1}(Loc \bowtie_{\text{City}_\ell = \text{City}_1}Route)$ and $Q_4 := Loc \bowtie_{\text{City}_\ell = \text{City}_1}\pi_{\text{City}_1}(Route)$.
The latter's ``pushed down'' projection produces a compressed annotation, both in the polynomial, as well as its circuit encoding (\Cref{subfig:ex-proj-push-circ-q3,subfig:ex-proj-push-circ-q4}).
In general, compressed representations of the lineage polynomial can be exponentially smaller than the polynomial.
\end{Example}
This suggests that perhaps even Bag-PDBs have higher query processing complexity than deterministic databases.
In this paper, we confirm this intuition, first proving that computing the expected count of a query result tuple is super-linear (\sharpwonehard) in the size of a compressed lineage representation, and then relating the size of the compressed lineage to the cost of answering a deterministic query.
In view of this hardness result (i.e., step 2 of the workflow is the bottleneck in the bag setting as well), we develop an approximation algorithm for expected counts of SPJU query Bag-PDB output, that is, to our knowledge, the first linear time (in the size of the factorized lineage) $(1-\epsilon)$-\emph{multiplicative} approximation, eliminating step 2 from being the bottleneck of the workflow.
By extension, this algorithm only has a constant factor slower runtime relative to deterministic query processing.\footnote{
Monte-carlo sampling~\cite{jampani2008mcdb} is also trivially a constant factor slower, but can only guarantee additive rather than our stronger multiplicative bounds.
}
This is an important result, because it implies that computing approximate expectations for bag output PDBs of SPJU queries can indeed be competitive with deterministic query evaluation over bag databases.
\subsection{Overview of our results and techniques}
Concretely, in this paper:
(i) We show that computing the expected count of conjunctive queries whose output is a bag-$\ti$ is hard (i.e., superlinear in the size of a compressed lineage encoding) by reduction from counting the number of $k$-matchings over an arbitrary graph;
(ii) We present a $(1-\epsilon)$-\emph{multiplicative} approximation algorithm for bag-$\ti$s and show that its complexity is linear in the size of the compressed lineage encoding;
(iii) We generalize the approximation algorithm to bag-$\bi$s, a more general model of probabilistic data;
(iv) We further generalize our results to higher moments and prove that for $\raPlus$ queries, the processing time in approximation is within a constant factor of the same query processed deterministically.
Our hardness results follow by considering a suitable generalization of the lineage polynomial in \Cref{eq:edge-query}. First it is easy to generalize the polynomial to $\poly_G(X_1,\dots,X_n)$ that represents the edge set of a graph $G$ on $n$ vertices. Then $\poly_G^k(X_1,\dots,X_n)$ (i.e., $\inparen{\poly_G(X_1,\dots,X_n)}^k$) encodes as its monomials all subgraphs of $G$ with at most $k$ edges in it. This implies that the corresponding reduced polynomial $\rpoly_G^k(\prob,\dots,\prob)$ (see \Cref{def:reduced-poly}) can be written as $\sum_{i=0}^{2k} c_i\cdot \prob^i$ and we observe that $c_{2k}$ is proportional to the number of $k$-matchings (counting which is \sharpwonehard) in $G$. Thus, if we have access to $\rpoly_G^k(\prob_i,\dots,\prob_i)$ for distinct values of $\prob_i$ for $0\le i\le 2k$, then we can set up a system of linear equations and compute $c_{2k}$ (and hence the number of $k$-matchings in $G$). This result, however, does not rule out the possibility that computing $\rpoly_G^k(\prob,\dots, \prob)$ for a {\em single specific} value of $\prob$ might be easy: indeed it is easy for $\prob=0$ or $\prob=1$. However, we are able to show that for any other value of $\prob$, computing $\rpoly_G^k(\prob,\dots, \prob)$ exactly will most probably require super-linear time. This reduction needs more work (and we cannot yet extend our results to $k>3$). Further, we have to rely on more recent conjectures in {\em fine-grained} complexity on e.g. the complexity of counting the number of triangles in $G$ and not more standard parameterized hardness like \sharpwonehard.
The starting point of our approximation algorithm was the simple observation that for any lineage polynomial $\poly(X_1,\dots,X_n)$, we have $\rpoly(1,\dots,1)=\poly(1,\dots,1)$ and if all the coefficients of $\poly$ are constants, then $\poly(\prob,\dots, \prob)$ (which can be easily computed in linear time) is a $\prob^k$ approximation to the value $\rpoly(\prob,\dots, \prob)$ that we are after. If $\prob$ (i.e., the \emph{input} tuple probabilities) and $k=\degree(\poly)$ are constants, then this gives a constant factor approximation. We then use sampling to get a better approximation factor of $(1\pm \eps)$: we sample monomials from $\poly(X_1,\dots,X_\numvar)$ and do an appropriate weighted sum of their coefficients. Standard tail bounds then allow us to get our desired approximation scheme. To get a linear runtime, it turns out that we need the following properties from our compressed representation of $\poly$: (i) be able to compute $\poly(1,\ldots, 1)$ in linear time and (ii) be able to sample monomials from $\poly(X_1,\dots,X_n)$ quickly as well.
%For the ease of exposition, we start off with expression trees (see \Cref{fig:circuit-q2-intro} for an example) and show that they satisfy both of these properties. Later we show that it is easy to show that these properties also extend to polynomial circuits as well (we essentially show that in the required time bound, we can simulate access to the `unrolled' expression tree by considering the polynomial circuit).
We formalize our claim that, since our approximation algorithm runs in time linear in the size of the polynomial circuit, we can approximate the expected output tuple multiplicities with only a $O(\log{Z})$ overhead (where $Z$ is the number of output tuples) over the runtime of a broad class of query processing algorithms. We also observe that our results trivially extend to higher moments of the tuple multiplicity (instead of just the expectation).
\paragraph{Paper Organization.} We present some relevant background and set up our notation in \Cref{sec:background}. We present our hardness results in \Cref{sec:hard} and our approximation algorithm in \Cref{sec:algo}. We present some (easy) generalizations of our results in \Cref{sec:gen}. We do a quick overview of related work in \Cref{sec:related-work} and conclude with some open questions in \Cref{sec:concl-future-work}.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% Local Variables:
%%% mode: latex
%%% TeX-master: "main"
%%% End:

View File

@ -0,0 +1,272 @@
%!TEX root=./main.tex
%root: main.tex
\section{Introduction}\label{sec:intro}
This work explores the problem of computing the expectation of the multiplicity of a tuple in the result of a query over a \abbrCTIDB, a type of probabilistic database with bag semantics where the multiplicity of a tuple is a random variable with range $[0,\bound]$ for some fixed constant $\bound$ and multiplicities assigned to any two tuples are independent of each other.
Formally, a \abbrCTIDB,
$\pdb = \inparen{\worlds, \bpd}$ consists of a set of tuples $\tupset$ and a probability distribution $\bpd$ over all possible worlds generated by assigning each tuple $\tup \in \tupset$ a multiplicity in the range $[0,\bound]$.
Any such world can be encoded as a vector from $\worlds$, the set of all vectors of length $\numvar=\abs{\tupset}$ such that each index corresponds to a distinct $\tup \in \tupset$ storing its multiplicity.
A given world $\worldvec \in\worlds$ can be interpreted as follows: for each $\tup \in \tupset$, $\worldvec_{\tup}$ is the multiplicity of $\tup$ in $\worldvec$. Given that the multiplicities of tuples are independent events, the probability distribution $\bpd$ can be expressed compactly by assigning each tuple a (disjoint) probability distribution over $[0,\bound]$. Let $\prob_{\tup,j}$ denote the probability that tuple $\tup$ is assigned multiplicity $j$. The probability of a particular world $\worldvec$ is then $\prod_{\tup \in \tupset} \prob_{\tup,\worldvec_{\tup}}$.
Allowing for $\leq \bound$ multiplicities across all tuples gives rise to having $\leq \inparen{\bound+1}^\numvar$ possible worlds instead of the usual $2^\numvar$ possible worlds of a $1$-\abbrTIDB, which (assuming set query semantics) is the same as the traditional set \abbrTIDB.
In this work, since we are generally considering bag query input, we will only be considering bag query semantics. We denote by $\query\inparen{\worldvec}\inparen{\tup}$ the multiplicity of $\tup$ in query $\query$ over possible world $\worldvec\in\worlds$.
We can formally state our problem of computing the expected multiplicity of a result tuple as:
\begin{Problem}\label{prob:expect-mult}
Given a \abbrCTIDB $\pdb = \inparen{\worlds, \bpd}$, $\raPlus$ query\footnote{
An $\raPlus$ query is a query expressed in positive relational algebra, i.e., using only the relational algebra operators selection ($\select$), projection ($\project$), natural join ($\join$) and union ($\union$).
}
$\query$, and result tuple $\tup$, compute the expected multiplicity of $\tup$: $\expct_{\rvworld\sim\bpd}\pbox{\query\inparen{\rvworld}\inparen{\tup}}$.
\end{Problem}
\begin{figure}[t!]
\begin{align*}
&\begin{aligned}[t]
&\polyqdt{\project_A(\query)}{\gentupset}{\tup} =\\
&~~\sum_{\tup': \project_A(\tup') = \tup} \polyqdt{\query}{\gentupset}{\tup'}
\end{aligned}
&
&\begin{aligned}[t]
&\polyqdt{\query_1 \union \query_2}{\gentupset}{\tup} =\\
&\qquad \polyqdt{\query_1}{\gentupset}{\tup} + \polyqdt{\query_2}{\gentupset}{\tup}\\
\end{aligned}\\
&\begin{aligned}
&\polyqdt{\select_\theta(\query)}{\gentupset}{\tup} =\\
&~~ \begin{cases}
\polyqdt{\query}{\gentupset}{\tup} & \text{if }\theta(\tup) \\
0 & \text{otherwise}.
\end{cases}
\end{aligned}
&
&\begin{aligned}
&\polyqdt{\query_1 \join \query_2}{\gentupset}{\tup} =\\
&\qquad\polyqdt{\query_1}{\gentupset}{\project_{\attr{\query_1}}{\tup}}\\
&\qquad\cdot\polyqdt{\query_2}{\gentupset}{\project_{\attr{\query_2}}{\tup}}
\end{aligned}\\
&&&\polyqdt{\rel}{\gentupset}{\tup} = X_\tup
\end{align*}%\\[-10mm]
\setlength{\abovecaptionskip}{-0.25cm}
\caption{Construction of the lineage (polynomial) for an $\raPlus$ query $\query$ over an arbitrary deterministic database $\gentupset$, where $\vct{X}$ consists of all $X_\tup$ over all $\rel$ in $\gentupset$ and $\tup$ in $\rel$. Here $\gentupset.\rel$ denotes the instance of relation $\rel$ in $\gentupset$. Please note, after we introduce the reduction to $1$-\abbrBIDB, the base case will be expressed alternatively.}
\label{fig:nxDBSemantics}
\vspace{-0.53cm}
\end{figure}
It is natural to explore computing the expected multiplicity of a result tuple as this is the analog for computing the marginal probability of a tuple in a set \abbrPDB.
In this work we will assume that $\bound =\bigO{1}$ since this is what is typically seen in practice.
Allowing for unbounded $\bound$ is an interesting open problem.
\mypar{Hardness of Set Query Semantics and Bag Query Semantics}
Set query evaluation semantics over $1$-\abbrTIDB\xplural have been studied extensively, and the data complexity of the problem in general has been shown by Dalvi and Suciu to be \sharpphard~\cite{10.1145/1265530.1265571}. For our setting, there exists a trivial polytime algorithm to compute~\Cref{prob:expect-mult} for any $\raPlus$ query over a \abbrCTIDB due to linearity of expectation (see~\Cref{sec:intro-poly-equiv}).
Since we can compute~\Cref{prob:expect-mult} in polynomial time, the interesting question that we explore deals with analyzing the hardness of computing expectation using fine-grained analysis and parameterized complexity, where we are interested in the exponent of polynomial runtime.
Specifically, in this work we ask if~\Cref{prob:expect-mult} can be solved in time linear in the runtime of an analogous deterministic query which we make more precise shortly.
If this is true, then this would open up the way for deployment of \abbrCTIDB\xplural in practice. To analyze this question we denote by $\timeOf{}^*(Q,\pdb)$ the optimal runtime complexity of computing~\Cref{prob:expect-mult} over \abbrCTIDB $\pdb$.
Let $\qruntime{\query,\gentupset,\bound}$ (see~\Cref{sec:gen} for further details) denote the runtime for query $\query$, deterministic database $\gentupset$, and multiplicity bound $\bound$. This paper considers $\raPlus$ queries for which order of operations is \emph{explicit}, as opposed to other query languages, e.g. Datalog, UCQ. Thus, since order of operations affects runtime, we denote the optimized $\raPlus$ query picked by an arbitrary production system as $\optquery{\query} = \min_{\query'\in\raPlus, \query'\equiv\query}\qruntime{\query', \gentupset, \bound}$. Then $\qruntime{\optquery{\query}, \gentupset,\bound}$ is the runtime for the optimized query.\footnote{Note that our work applies to any $\query \in\raPlus$, which implies that specific heuristics for choosing an optimized query can be abstracted away, i.e., our work does not consider heuristic techniques.}
\begin{table*}[t!]
\centering
\begin{tabular}{|p{0.43\textwidth}|p{0.12\textwidth}|p{0.35\textwidth}|}
\hline
\textbf{Lower bound on $\timeOf{}^*(\qhard,\pdb)$} & \textbf{Num.} $\bpd$s
& \textbf{Hardness Assumption}\\
\hline
$\Omega\inparen{\inparen{\qruntime{\optquery{\qhard}, \tupset, \bound}}^{1+\eps_0}}$ for {\em some} $\eps_0>0$ & Single & Triangle Detection hypothesis\\
$\omega\inparen{\inparen{\qruntime{\optquery{\qhard}, \tupset, \bound}}^{C_0}}$ for {\em all} $C_0>0$ & Multiple &$\sharpwzero\ne\sharpwone$\\
$\Omega\inparen{\inparen{\qruntime{\optquery{\qhard}, \tupset, \bound}}^{c_0\cdot k}}$ for {\em some} $c_0>0$ & Multiple & \Cref{conj:known-algo-kmatch}\\
\hline
\end{tabular}
\caption{Our lower bounds for a specific hard query $\qhard$ parameterized by $k$. For $\pdb = \inparen{\worlds, \bpd}$, the rows with `Multiple' in the second column need the algorithm to be able to handle multiple $\bpd$, i.e., probability distributions (for a given $\tupset$). The last column states the hardness assumptions that imply the lower bounds in the first column ($\eps_0,C_0,c_0$ are constants that are independent of $k$).}
\label{tab:lbs}
\vspace{-0.73cm}
\end{table*}
\mypar{Our lower bound results}
Our question is whether or not it is always true that $\timeOf{}^*\inparen{\query, \pdb}\leq\qruntime{\optquery{\query}, \tupset, \bound}$. Unfortunately this is not the case.
~\Cref{tab:lbs} shows our results.
Specifically, depending on what hardness result/conjecture we assume, we get various weaker or stronger versions of {\em no} as an answer to our question. To make some sense of the other lower bounds in \Cref{tab:lbs}, we note that it is not too hard to show that $\timeOf{}^*(Q,\pdb) \le \bigO{\inparen{\qruntime{\optquery{\query}, \tupset, \bound}}^k}$, where $k$ is the join width (our notion of join width follows from~\Cref{def:degree-of-poly} and~\Cref{fig:nxDBSemantics}.) of the query $\query$ over all result tuples $\tup$ (and the parameter that defines our family of hard queries).
What our lower bound in the third row says is that one cannot get more than a polynomial improvement over essentially the trivial algorithm for~\Cref{prob:expect-mult}.
However, this result assumes a hardness conjecture that is not as well studied as those in the first two rows of the table (see \Cref{sec:hard} for more discussion on the hardness assumptions). Further, we note that existing results\footnote{This claim follows from known results for the problem of counting $k$-cliques, where the query $\query$ over database $\tupset$ counts the number of $k$-cliques. Specifically, a lower bound of the form $\Omega\inparen{n^{1+\eps_0}}$ for {\em some} $\eps_0>0$ follows from the triangle detection hypothesis (this, like our result, is for $k=3$). Second, a lower bound of $\omega\inparen{n^{C_0}}$ for {\em all} $C_0>0$ holds under the assumption $\sharpwzero\ne\sharpwone$ for counting $k$-cliques~\cite{10.5555/645413.652181}. Finally, a lower bound of $\Omega\inparen{n^{c_0\cdot k}}$ for {\em some} $c_0>0$ was shown by~\cite{CHEN20061346} (under the strong exponential time hypothesis).
} already imply the claimed lower bounds if we were to replace the $\qruntime{\optquery{\query}, \tupset, \bound}$ by just $\numvar$ (indeed these results follow from known lower bounds for deterministic query processing). Our contribution is to then identify a family of hard queries where deterministic query processing is `easy' but computing the expected multiplicities is hard.
\mypar{Our upper bound results} We introduce a $(1\pm \epsilon)$-approximation algorithm that computes ~\Cref{prob:expect-mult} in time $O_\epsilon\inparen{\qruntime{\optquery{\query}, \tupset, \bound}}$. This means, when we are okay with approximation, that we solve~\Cref{prob:expect-mult} in time linear in the size of the deterministic query and bag \abbrPDB\xplural are deployable in practice.
In contrast, known approximation techniques (\cite{DBLP:conf/icde/OlteanuHK10,DBLP:journals/jal/KarpLM89}) in set-\abbrPDB\xplural need time $\Omega(\qruntime{\optquery{\query}, \tupset, \bound}^{2k})$
(see \Cref{sec:karp-luby}).
Further, our approximation algorithm works for a more general notion of bag \abbrPDB\xplural beyond \abbrCTIDB\xplural
(see \Cref{subsec:tidbs-and-bidbs}).
\subsection{Polynomial Equivalence}\label{sec:intro-poly-equiv}
A common encoding of probabilistic databases (e.g., in \cite{IL84a,Imielinski1989IncompleteII,Antova_fastand,DBLP:conf/vldb/AgrawalBSHNSW06} and many others) relies on annotating tuples with lineages or propositional formulas that describe the set of possible worlds that the tuple appears in. The bag semantics analog is a provenance/lineage polynomial (see~\Cref{fig:nxDBSemantics}) $\apolyqdt$~\cite{DBLP:conf/pods/GreenKT07}, a polynomial with non-zero integer coefficients and exponents, over variables $\vct{X}$ encoding input tuple multiplicities. Evaluating a lineage polynomial for a query result tuple $t_{out}$ by, for each tuple $\tup_{in}$, assigning the variable $X_{t_{in}}$ encoding the tuple's multiplicity to the tuple's multiplicity in the possible world yields the multiplicity of the $\tup_{out}$ in the query result for this world.
We drop $\query$, $\tupset$, and $\tup$ from $\apolyqdt$ when they are clear from the context or irrelevant to the discussion. We now specify the problem of computing the expectation of tuple multiplicity in the language of lineage polynomials:
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Problem}[Expected Multiplicity of Lineage Polynomials]\label{prob:bag-pdb-poly-expected}
Given an $\raPlus$ query $\query$, \abbrCTIDB $\pdb$ and result tuple $\tup$, compute the expected
multiplicity of the polynomial $\apolyqdt$ (i.e., $\expct_{\vct{W}\sim \pdassign}\pbox{\apolyqdt(\vct{W})}$, where $\vct{W} \in \worlds$).
\end{Problem}
We note that computing \Cref{prob:expect-mult}
is equivalent (yields the same result as) to computing \Cref{prob:bag-pdb-poly-expected} (see \Cref{prop:expection-of-polynom}).
All of our results rely on working with a {\em reduced} form $\inparen{\rpoly}$ of the lineage polynomial $\poly$. In fact, it turns out that for the $1$-\abbrTIDB case, computing the expected multiplicity (over bag query semantics) is {\em exactly} the same as evaluating this reduced polynomial over the probabilities that define the $1$-\abbrTIDB. This is also true when the query input(s) is a block independent disjoint probabilistic database~\cite{DBLP:conf/icde/OlteanuHK10} (bag query semantics with tuple multiplicity at most $1$), for which the proof of~\Cref{lem:tidb-reduce-poly} (introduced shortly) holds.
Next, we motivate this reduced polynomial.
Consider the query $\query_1$ defined as follows over the bag relations of \Cref{fig:two-step}:
\begin{lstlisting}
SELECT DISTINCT 1 FROM T $t_1$, R r, T $t_2$
WHERE $t_1$.Point = r.Point$_1$ AND $t_2$.Point = r.Point$_2$
\end{lstlisting}
It can be verified that $\poly\inparen{A, B, C, E, X, Y, Z}$ for the sole result tuple of $\query_1$ is $AXB + BYE + BZC$. Now consider the product query $\query_1^2 = \query_1 \times \query_1$.
The lineage polynomial for $Q_1^2$ is given by $\poly_1^2\inparen{A, B, C, E, X, Y, Z}$
$$
=A^2X^2B^2 + B^2Y^2E^2 + B^2Z^2C^2 + 2AXB^2YE + 2AXB^2ZC + 2B^2YEZC.
$$
To compute $\expct\pbox{\poly_1^2}$ we can use linearity of expectation and push the expectation through each summand. To keep things simple, let us focus on the monomial $\poly_1^{\inparen{ABX}^2} = A^2X^2B^2$ as the procedure is the same for all other monomials of $\poly_1^2$. Let $\randWorld_X$ be the random variable corresponding to a lineage variable $X$. Because the distinct variables in the product are independent, we can push expectation through them yielding $\expct\pbox{\randWorld_A^2\randWorld_X^2\randWorld_B^2}=\expct\pbox{\randWorld_A^2}\expct\pbox{\randWorld_X^2}\expct\pbox{\randWorld_B^2}$. Since $\randWorld_A, \randWorld_B\in \inset{0, 1}$ we can further derive $\expct\pbox{\randWorld_A}\expct\pbox{\randWorld_X^2}\expct\pbox{\randWorld_B}$ by the fact that for any $W\in \inset{0, 1}$, $W^2 = W$. Observe that if $X\in\inset{0, 1}$, then we further would have $\expct\pbox{\randWorld_A}\expct\pbox{\randWorld_X}\expct\pbox{\randWorld_B} = \prob_A\cdot\prob_X\cdot\prob_B$ (denoting $\probOf\pbox{\randWorld_A = 1} = \prob_A$) $= \rpoly_1^{\inparen{ABX}^2}\inparen{\prob_A, \prob_X, \prob_B}$ (see $ii)$ of~\Cref{def:reduced-poly}). However, in this example, we get stuck with $\expct\pbox{\randWorld_X^2}$, since $\randWorld_X\in\inset{0, 1, 2}$ and for $\randWorld_X \gets 2$, $\randWorld_X^2 \neq \randWorld_X$.
Denote the variables of $\poly$ to be $\vars{\poly}.$ In the \abbrCTIDB setting, $\poly\inparen{\vct{X}}$ has an equivalent reformulation $\inparen{\refpoly{}\inparen{\vct{X_R}}}$ that is of use to us, where $\abs{\vct{X_R}} = \bound\cdot\abs{\vct{X}}$. Given $X_\tup \in\vars{\poly}$ with integer valuation $X_\tup \in\inset{0,\ldots, \bound}$, we can replace $X_\tup$ by $\sum_{j\in\pbox{\bound}}jX_{\tup, j}$ where the variables $\inparen{X_{\tup, j}}_{j\in\pbox{\bound}}$ are disjoint with integer assignments $X_{\tup, j}\in\inset{0, 1}$. Then for any $\worldvec\in\worlds$ and corresponding reformulated world $\worldvec_{\vct{R}}\in\inset{0, 1}^{\tupset\bound}$, we set $\worldvec_{\vct{R}_{\tup, j}} = 1$ for $\worldvec_\tup = j$, while $\worldvec_{\vct{R}_{\tup, j'}} = 0$ for all $j'\neq j\in\pbox{\bound}$. By construction then $\poly\inparen{\vct{X}}\equiv\refpoly{}\inparen{\vct{X_R}}$ $\inparen{\vct{X_R} = \vars{\refpoly{}}}$ since for any integer valuation $X_\tup\in\pbox{\bound}$ and $X_{\tup, j}\in\inset{0, 1}$ we have the equality $X_\tup = j = \sum_{j\in\pbox{\bound}}jX_{\tup, j}$.
Considering again our example,
\begin{multline*}
\refpoly{1, }^{\inparen{ABX}^2}\inparen{A, X, B} = \poly_1^{\inparen{AXB}^2}\inparen{\sum_{j_1\in\pbox{\bound}}j_1A_{j_1}, \sum_{j_2\in\pbox{\bound}}j_2X_{j_2}, \sum_{j_3\in\pbox{\bound}}j_3B_{j_3}} \\
= \inparen{\sum_{j_1\in\pbox{\bound}}j_1A_{j_1}}^2\inparen{\sum_{j_2\in\pbox{\bound}}j_2X_{j_2}}^2\inparen{\sum_{j_3\in\pbox{\bound}}j_3B_{j_3}}^2.
\end{multline*}
Since the set of multiplicities for tuple $\tup$ by nature are disjoint we can drop all cross terms and have $\refpoly{1, }^2 = \sum_{j_1, j_2, j_3 \in \pbox{\bound}}j_1^2A^2_{j_1}j_2^2X_{j_2}^2j_3^2B^2_{j_3}$. Computing expectation we get $\expct\pbox{\refpoly{1, }^2}=\sum_{j_1,j_2,j_3\in\pbox{\bound}}j_1^2j_2^2j_3^2\expct\pbox{\randWorld_{A_{j_1}}}\expct\pbox{\randWorld_{X_{j_2}}}\expct\pbox{\randWorld_{B_{j_3}}}$, since we now have that all $\randWorld_{X_j}\in\inset{0, 1}$.
This leads us to consider a structure related to the lineage polynomial.
\begin{Definition}\label{def:reduced-poly}
For any polynomial $\poly\inparen{\inparen{X_\tup}_{\tup\in\tupset}}$ define the reformulated polynomial $\refpoly{}\inparen{\inparen{X_{\tup, j}}_{\tup\in\tupset, j\in\pbox{\bound}}}
$ to be the polynomial $\refpoly{}$ = $\poly\inparen{\inparen{\sum_{j\in\pbox{\bound}}j\cdot X_{\tup, j}}_{\tup\in\tupset}}
$ and ii) define the \emph{reduced polynomial} $\rpoly\inparen{\inparen{X_{\tup, j}}_{\tup\in\tupset, j\in\pbox{\bound}}}
$ to be the polynomial resulting from converting $\refpoly{}$ into the standard monomial basis (\abbrSMB),
\footnote{
This is the representation, typically used in set-\abbrPDB\xplural, where the polynomial is represented as a sum of `pure' products. See \Cref{def:smb} for a formal definition.
}
removing all monomials containing the term $X_{\tup, j}X_{\tup, j'}$ for $\tup\in\tupset, j\neq j'\in\pbox{c}$, and setting all \emph{variable} exponents $e > 1$ to $1$.
\end{Definition}
Continuing with the example
\footnote{
To save clutter we do not show the full expansion for variables with greatest multiplicity $= 1$ since e.g. for variable $A$, the sum of products itself evaluates to $1^2\cdot A^2 = A$.
}
$\poly_1^2\inparen{A, B, C, E, X_1, X_2, Y, Z}$ we have
\begin{multline*}
\rpoly_1^2(A, B, C, E, X_1, X_2, Y, Z) = \\
A\inparen{\sum\limits_{j\in\pbox{\bound}}j^2X_j}B + BYE + BZC + 2A\inparen{\sum\limits_{j\in\pbox{\bound}}j^2X_j}BYE + 2A\inparen{\sum\limits_{j\in\pbox{\bound}}j^2X_j}BZC + 2BYEZC =\\
ABX_1 + AB\inparen{2}^2X_2 + BYE + BZC + 2AX_1BYE + 2A\inparen{2}^2X_2BYE + 2AX_1BZC + 2A\inparen{2}^2X_2BZC + 2BYEZC.
\end{multline*}
Note that we have argued that for our specific example the expectation that we want is $\rpoly_1^2(\probOf\inparen{A=1},$ $\probOf\inparen{B=1}, \probOf\inparen{C=1}, \probOf\inparen{E=1}, \probOf\inparen{X_1=1}, \probOf\inparen{X_2=1}, \probOf\inparen{Y=1}, \probOf\inparen{Z=1})$.
\Cref{lem:tidb-reduce-poly} generalizes the equivalence to {\em all} $\raPlus$ queries on \abbrCTIDB\xplural (proof in \Cref{subsec:proof-exp-poly-rpoly}).
\begin{Lemma}\label{lem:tidb-reduce-poly}
For any \abbrCTIDB $\pdb$, $\raPlus$ query $\query$, and lineage polynomial
$\poly\inparen{\vct{X}}=\poly\pbox{\query,\tupset,\tup}\inparen{\vct{X}}$, it holds that $
\expct_{\vct{W} \sim \pdassign}\pbox{\refpoly{}\inparen{\vct{W}}} = \rpoly\inparen{\probAllTup}
$, where $\probAllTup = \inparen{\inparen{\prob_{\tup, j}}_{\tup\in\tupset, j\in\pbox{c}}}.$
\end{Lemma}
\subsection{Our Techniques}
\mypar{Lower Bound Proof Techniques}
Our main hardness result shows that computing~\Cref{prob:expect-mult} is $\sharpwonehard$ for $1$-\abbrTIDB. To prove this result we show that for the same $\query_1$ from the example above, for an arbitrary `product width' $k$, the query $\qhard^k$ is able to encode various hard graph-counting problems (assuming $\bigO{\numvar}$ tuples rather than the $\bigO{1}$ tuples in \Cref{fig:two-step}).
We do so by considering an arbitrary graph $G$ (analogous to relation $\boldsymbol{R}$ of $\query_1$) and analyzing how the coefficients in the (univariate) polynomial $\widetilde{\poly}\left(p,\dots,p\right)$ relate to counts of subgraphs in $G$ that are isomorphic to various graphs with $k$ edges. E.g., we exploit the fact that the coefficient corresponding to the power of $2k$ in $\poly$ of $\qhard^k$ is proportional to the number of $k$-matchings in $G$,
a known hard problem in parameterized/fine-grained complexity literature.
\mypar{Upper Bound Techniques}
Our negative results (\Cref{tab:lbs}) indicate that \abbrCTIDB{}s (even for $\bound=1$) can not achieve comparable performance to deterministic databases for exact results (under complexity assumptions). In fact, under plausible hardness conjectures, one cannot (drastically) improve upon the trivial algorithm to exactly compute the expected multiplicities for $1$-\abbrTIDB\xplural. A natural followup is whether we can do better if we are willing to settle for an approximation to the expected multiplicities.
\input{two-step-model}
We adopt a two-step intensional model of query evaluation used in set-\abbrPDB\xplural, as illustrated in \Cref{fig:two-step}:
(i) \termStepOne (\abbrStepOne): Given input $\tupset$ and $\query$, output every tuple $\tup$ that possibly satisfies $\query$, annotated with its lineage polynomial ($\poly(\vct{X})=\apolyqdt\inparen{\vct{X}}$);
(ii) \termStepTwo (\abbrStepTwo): Given $\poly(\vct{X})$ for each tuple, compute $\expct_{\randWorld\sim\bpd}\pbox{\poly(\vct{\randWorld})}$.
Let $\timeOf{\abbrStepOne}(Q,\tupset,\circuit)$ denote the runtime of \abbrStepOne when it outputs $\circuit$ (which is a representation of $\poly$ as an arithmetic circuit --- more on this representation in~\Cref{sec:expression-trees}).
Denote by $\timeOf{\abbrStepTwo}(\circuit, \epsilon)$ (recall $\circuit$ is the output of \abbrStepOne) the runtime of \abbrStepTwo, which we can leverage~\Cref{def:reduced-poly} and~\Cref{lem:tidb-reduce-poly} to address the next formal objective:
\begin{Problem}[\abbrCTIDB linear time approximation]\label{prob:big-o-joint-steps}
Given \abbrCTIDB $\pdb$, $\raPlus$ query $\query$,
is there a $(1\pm\epsilon)$-approximation of $\expct_{\rvworld\sim\bpd}\pbox{\query\inparen{\rvworld}\inparen{\tup}}$ for all result tuples $\tup$ where
$\exists \circuit : \timeOf{\abbrStepOne}(Q,\tupset, \circuit) + \timeOf{\abbrStepTwo}(\circuit, \epsilon) \le O_\epsilon(\qruntime{\optquery{\query}, \tupset, \bound})$?
\end{Problem}
We show in \Cref{sec:circuit-depth} an $\bigO{\qruntime{\optquery{\query}, \tupset, \bound}}$ algorithm for constructing the lineage polynomial for all result tuples of an $\raPlus$ query $\query$ (or more precisely, a single circuit $\circuit$ with one sink per tuple representing the tuple's lineage).
A key insight of this paper is that the representation of $\circuit$ matters.
For example, if we insist that $\circuit$ represent the lineage polynomial in \abbrSMB, the answer to the above question in general is no, since then we will need $\abs{\circuit}\ge \Omega\inparen{\inparen{\qruntime{\optquery{\query}, \tupset, \bound}}^k}$,
and hence, just $\timeOf{\abbrStepOne}(\query,\tupset,\circuit)$ will be too large.
However, systems can directly emit compact, factorized representations of $\poly(\vct{X})$ (e.g., as a consequence of the standard projection push-down optimization~\cite{DBLP:books/daglib/0020812}).
For example, in~\Cref{fig:two-step}, $B(Y+Z)$ is a factorized representation of the SMB-form $BY+BZ$.
Accordingly, this work uses (arithmetic) circuits\footnote{
An arithmetic circuit is a DAG with variable and/or numeric source nodes and internal nodes, each representing either an addition or multiplication operator.
}
as the representation system of $\poly(\vct{X})$.
Given that there exists a representation $\circuit^*$ such that $\timeOf{\abbrStepOne}(\query,\tupset,\circuit^*)\le \bigO{\qruntime{\optquery{\query}, \tupset, \bound}}$, we can now focus on the complexity of the \abbrStepTwo step.
We can represent the factorized lineage polynomial by its corresponding arithmetic circuit $\circuit$ (whose size we denote by $|\circuit|$).
As we also show in \Cref{sec:circuit-runtime}, this size is also bounded by $\qruntime{\optquery{\query}, \tupset, \bound}$ (i.e., $|\circuit^*| \le \bigO{\qruntime{\optquery{\query}, \tupset, \bound}}$).
Thus, the question of approximation
can be stated as the following stronger (since~\Cref{prob:big-o-joint-steps} has access to \emph{all} equivalent \circuit representing $\query\inparen{\vct{W}}\inparen{\tup}$), but sufficient condition:
\begin{Problem}\label{prob:intro-stmt}
Given one circuit $\circuit$ that encodes $\apolyqdt$ for all result tuples $\tup$ (one sink per $\tup$) for \abbrCTIDB $\pdb$ and $\raPlus$ query $\query$, does there exist an algorithm that computes a $(1\pm\epsilon)$-approximation of $\expct_{\rvworld\sim\bpd}\pbox{\query\inparen{\rvworld}\inparen{\tup}}$ (for all result tuples $\tup$) in $\bigO{|\circuit|}$ time?
\end{Problem}
For an upper bound on approximating the expected count, it is easy to check that if all the probabilities are constant then (with an additive adjustment) $\poly\left(\prob_1,\dots, \prob_n\right)$
(i.e. evaluating the original lineage polynomial
over the probability values) is a constant factor approximation.
This is illustrated in the following example using $\query_1^2$ from earlier. To aid in presentation we assume $\bound = 2$ for variable $X$ and $\bound = 1$ for all other variables. Let $\prob_A$ denote $\probOf\pbox{A = 1}$.
In computing $\rpoly$, we have some cancellations to deal with:
\begin{footnotesize}
\begin{align*}
\refpoly{1, }^2\inparen{\vct{X}} &= A^2\inparen{X_1^2 + 4X_1X_2 + 4X_2^2}B^2 + B^2Y^2E^2 + B^2Z^2C^2 + 2AX_1B^2YE \\
&\qquad+ 2AX_2B^2YE + 2AX_1B^2ZC + 2AX_2B^2ZC + 2B^2YEZC\\
\end{align*}
\end{footnotesize}
This then implies
\begin{footnotesize}
\begin{align*}
\rpoly^2\inparen{\vct{X}} &= AX_1B+4AX_2B+BYE+BZC+2AX_1BYE+2AX_2BYE+2AX_1BZC\\
&\qquad+2AX_2BZC+2BYEZC\\
\end{align*}
\end{footnotesize}
Substituting $\vct{\prob}$ for $\vct{X}$,
\begin{footnotesize}
\begin{align*}
\hspace*{-3mm}
\refpoly{1, }^2\inparen{\probAllTup} &= \prob_A^2\prob_{X_1}^2\prob_B^2 + 4\prob_A^2\prob_{X_1}\prob_{X_2}\prob_B^2 + 4\prob_A^2\prob_{X_2}^2\prob_B^2 + \prob_B^2\prob_Y^2\prob_E^2 + \prob_B^2\prob_Z^2\prob_C^2 + 2\prob_A\prob_{X_1}\prob_B^2\prob_Y\prob_E + 2\prob_A\prob_{X_2}\prob_B^2\prob_Y\prob_E\\
&\qquad+ 2\prob_A\prob_{X_1}\prob_B^2\prob_Z\prob_C + 2\prob_A\prob_{X_2}\prob_B^2\prob_Z\prob_C+ 2\prob_B^2\prob_Y\prob_E\prob_Z\prob_C\\
&\leq\prob_A\prob_{X_1}\prob_B + 4\prob_A^2\prob_{X_1}\prob_{X_2}\prob_B^2 + 4\prob_A\prob_{X_2}\prob_B + \prob_B\prob_Y\prob_E + \prob_B\prob_Z\prob_C + 2\prob_A\prob_{X_1}\prob_B\prob_Y\prob_E+ 2\prob_A\prob_{X_2}\prob_B\prob_Y\prob_E \\
&\qquad+ 2\prob_A\prob_{X_1}\prob_B\prob_Z\prob_C + 2\prob_A\prob_{X_2}\prob_B\prob_Z\prob_C + 2\prob_B\prob_Y\prob_E\prob_Z\prob_C
= \rpoly_1^2\inparen{\vct{p}} + 4\prob_A^2\prob_{X_1}\prob_{X_2}\prob_B^2.
\end{align*}
\end{footnotesize}
If we assume that all probability values are at least $p_0>0$, then given access to $\refpoly{1, }^2\inparen{\vct{\prob}} - 4\prob_A^2\prob_{X_1}\prob_{X_2}\prob_B^2$
we get that $\refpoly{1, }^2\inparen{\vct{\prob}} - 4\prob_A^2\prob_{X_1}\prob_{X_2}\prob_B^2$ is in the range $\left(\inparen{p_0}^3\cdot\rpoly^2_1\inparen{\vct{\prob}}, \rpoly_1^2\inparen{\vct{\prob}}\right]$.
%We can simulate sampling from $\refpoly{1, }^2\inparen{\vct{X}}$ by sampling monomials from $\refpoly{1, }^2$ while ignoring any samples $A^2X_1X_2B^2$.
Note however, that this is \emph{not a tight approximation}.
In~\cref{sec:algo} we demonstrate that a $(1\pm\epsilon)$ (multiplicative) approximation with competitive performance is achievable.
To get an $(1\pm \epsilon)$-multiplicative approximation and solve~\Cref{prob:intro-stmt}, using \circuit we uniformly sample monomials from the equivalent \abbrSMB representation of $\poly$ (without materializing the \abbrSMB representation) and `adjust' their contribution to $\widetilde{\poly}\left(\cdot\right)$.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\mypar{Applications}
Recent work in heuristic data cleaning~\cite{yang:2015:pvldb:lenses,DBLP:journals/vldb/SaRR0W0Z17,DBLP:journals/pvldb/RekatsinasCIR17,DBLP:journals/pvldb/BeskalesIG10,DBLP:journals/vldb/SaRR0W0Z17} emits a \abbrPDB when insufficient data exists to select the `correct' data repair.
Probabilistic data cleaning is a crucial innovation, as the alternative is to arbitrarily select one repair and `hope' that queries receive meaningful results.
Although \abbrPDB queries instead convey the trustworthiness of results~\cite{kumari:2016:qdb:communicating}, they are impractically slow~\cite{feng:2019:sigmod:uncertainty,feng:2021:sigmod:efficient}, even in approximation (see \Cref{sec:karp-luby}).
Bags, as we consider, are sufficient for production use, where bag-relational algebra is already the default for performance reasons.
Our results show that bag-\abbrPDB\xplural can be competitive, laying the groundwork for probabilistic functionality in production database engines.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\mypar{Paper Organization} We present relevant background and notation in \Cref{sec:background}. We then prove our main hardness results in \Cref{sec:hard} and present our approximation algorithm in \Cref{sec:algo}.
Finally, we discuss related work in \Cref{sec:related-work} and conclude in \Cref{sec:concl-future-work}. All proofs are in the appendix.
%%% Local Variables:
%%% mode: latex
%%% TeX-master: "main"
%%% End:

579
Sketching Worlds/macros.tex Normal file
View File

@ -0,0 +1,579 @@
% -*- root: main.tex -*-
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%Temporary Macros for Outline Comparison
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newcommand{\isIncluded}[1]{\textcolor{blue}{#1}}
\newcommand{\notIncluded}[1]{\textcolor{red}{#1}}
\newcommand{\xplural}{s\xspace}
\xspaceaddexceptions{\xplural}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% COMMENTS
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%editing/highlighting sections
\newcommand{\AHchange}[1]{\textcolor{blue}{#1}}
\newcommand{\secrev}[1]{\color{red}#1\color{black}}
\newcommand{\draft}{0} %%% Change this to non-zero to remove comments
%Reviewer/author comment commands. With \draft=0 they render as visible
%(colored/todo) notes; with \draft nonzero they must ALL still be defined,
%otherwise the document fails with "Undefined control sequence".
\ifnum\draft=0
\newcommand{\currentWork}[1]{\textcolor{red}{#1}}
\newcommand{\BG}[1]{\todo[inline]{\textbf{Boris says:$\,$} #1}}
\newcommand{\SF}[1]{\todo{\textbf{Su says:$\,$} #1}}
\newcommand{\OK}[1]{\todo[color=gray]{\textbf{Oliver says:$\,$} #1}}
\newcommand{\AH}[1]{\todo[inline, backgroundcolor=cyan, caption={}]{\textbf{Aaron says:$\,$} #1}}
\newcommand{\AR}[1]{\todo[inline,color=green]{\textbf{Atri says:$\,$} #1}}
\newcommand{\BGdel}[2]{\todo[inline]{\textbf{Boris deleted [#2]: {#1}}}}
\else
%Non-draft mode: comments disappear; \currentWork keeps its text (it marks
%content, not a margin note) but drops the highlight. \currentWork and
%\BGdel were previously missing from this branch.
\newcommand{\currentWork}[1]{#1}
\newcommand{\BG}[1]{}
\newcommand{\SF}[1]{}
\newcommand{\OK}[1]{}
\newcommand{\AH}[1]{}
\newcommand{\AR}[1]{}
\newcommand{\BGdel}[2]{}
\fi
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% THEOREM LIKE ENVIRONMENTS
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%\mathbbold selects the ``bbold'' blackboard-bold font family directly at
%the font level; it is used for the semiring identities \onesymbol/\zerosymbol.
%NOTE(review): this relies on the bbold fonts being available even though
%\usepackage{bbold} is commented out in main.tex -- confirm it resolves.
\DeclareMathAlphabet{\mathbbold}{U}{bbold}{m}{n}
%All theorem-like environments share the Theorem counter, numbered per section.
\newtheorem{Theorem}{Theorem}[section]
\newtheorem{Definition}[Theorem]{Definition}
\newtheorem{Lemma}[Theorem]{Lemma}
\newtheorem{Proposition}[Theorem]{Proposition}
\newtheorem{Corollary}[Theorem]{Corollary}
\newtheorem{Example}[Theorem]{Example}
\newtheorem{hypo}[Theorem]{Conjecture}%used in mult_distinct_p.tex
\newtheorem{Problem}[Theorem]{Problem}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Rel model
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%Relational model notation: tuples, relations, databases, queries, and the
%standard relational-algebra operators. Keep all Rel stuff in one place.
\newcommand{\tup}{t}
\newcommand{\rel}{R}
\newcommand{\reli}{S}%<----better names?
\newcommand{\relii}{T}
\newcommand{\db}{D}
\newcommand{\query}{Q}
\newcommand{\qhard}{\query_{hard}}
\newcommand{\tset}{\mathcal{T}}%the set of tuples in a database
%NOTE(review): \mathlarger comes from the relsize package and \Join from
%latexsym/amssymb; neither appears in main.tex's visible preamble (amssymb
%is commented out) -- confirm these commands resolve at compile time.
\newcommand{\join}{\mathlarger\Join}
\newcommand{\select}{\sigma}
\newcommand{\project}{\pi}
\newcommand{\union}{\cup}
\newcommand{\rename}{\mathlarger\rho}
\newcommand{\sch}{sch}
\newcommand{\attr}[1]{attr\left(#1\right)}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% TERMINOLOGY AND ABBREVIATIONS
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%Perhaps PDB abbreviations should go here?
%Two-step (intensional evaluation model)
\newcommand{\termStepOne}{Lineage Computation\xspace}
\newcommand{\abbrStepOne}{LC\xspace}
\newcommand{\termStepTwo}{Expectation Computation\xspace}
\newcommand{\abbrStepTwo}{EC\xspace}
%
\newcommand{\expectProblem}{\textsc{Expected Result Multiplicity Problem}\xspace}
\newcommand{\termSMB}{standard monomial basis\xspace}
\newcommand{\abbrSMB}{SMB\xspace}%we already have this; one has to go
\newcommand{\termSOP}{sum of products\xspace}
\newcommand{\abbrSOP}{SOP\xspace}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%Function Names and Typesetting %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newcommand{\domain}{\func{Dom}}
\newcommand{\func}[1]{\textsc{#1}\xspace}
\newcommand{\isInd}[1]{\func{isInd}\inparen{#1}}
\newcommand{\polyf}{\func{poly}}
\newcommand{\evalmp}{\func{eval}}
\newcommand{\degree}{\func{deg}}
\newcommand{\size}{\func{size}}
\newcommand{\depth}{\func{depth}}
\newcommand{\topord}{\func{TopOrd}}
\newcommand{\smbOf}[1]{\func{\abbrSMB}\inparen{#1}}
%Verify if we need the above...
%saving \treesize for now to keep latex from breaking
\newcommand{\treesize}{\func{size}}
%I believe this is used in the algo psuedocode
\newcommand{\sign}{\func{sgn}}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% SEMIRINGS
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newcommand{\udom}{\mathcal{U}}
\newcommand{\domK}{K}
\newcommand{\semK}{\mathcal{K}}
\newcommand{\semB}{\mathbb{B}}
\newcommand{\semN}{\mathbb{N}}
\newcommand{\semNX}{\mathbb{N}[\vct{X}]}
\newcommand{\onesymbol}{\mathbbold{1}}
\newcommand{\zerosymbol}{\mathbbold{0}}
\newcommand{\multsymb}{\otimes}
\newcommand{\addsymbol}{\oplus}
\newcommand{\addK}{\addsymbol_{\semK}}
\newcommand{\multK}{\multsymb_{\semK}}
\newcommand{\oneK}{\onesymbol_{\semK}}
\newcommand{\zeroK}{\zerosymbol_{\semK}}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Incomplete DB/PDBs %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newcommand{\idb}{{\Omega}}
\newcommand{\pd}{{\mathcal{P}}}%pd for probability distribution
\newcommand{\pdassign}{\mathcal{P}}
\newcommand{\pdb}{\mathcal{D}}
\newcommand{\dbbase}{\db_\idb}
\newcommand{\dbbaseName}{deterministic bounding database\xspace}
\newcommand{\pxdb}{\pdb_{\semNX}}
\newcommand{\pndb}{\pdb_{\semN}}
\newcommand{\nxdb}{D(\vct{X})}%\mathbb{N}[\vct{X}] db--Are we currently using this?
\newcommand{\valworlds}{\eta}%valid worlds--in particular referring to something like a BIDB, where not all worlds have Pr[w] > 0.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%Bag c-TIDB Notation %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newcommand{\bound}{c}
\newcommand{\tupsetsize}{n}
\newcommand{\tupset}{D}
\newcommand{\gentupset}{\overline{D}}
\newcommand{\world}{\inset{0,\ldots, c}}
\newcommand{\worldvec}{\vct{W}}
\newcommand{\worlds}{\world^\tupset}
\newcommand{\bpd}{\mathcal{P}}%bpd for bag probability distribution
%BIDB
\newcommand{\block}{B}
\newcommand{\bivar}{x_{\block, i}}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%Binary-BIDB Notation %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newcommand{\onebidbworlds}[1]{\bigtimes_{\tup\in #1}\inset{0, \bound_\tup}}
%PDB Abbreviations
\newcommand{\abbrOneBIDB}{\text{Binary-BIDB}\xspace}
\newcommand{\abbrPDB}{\textnormal{PDB}\xspace}
\newcommand{\abbrBPDB}{\textnormal{bag-PDB}\xspace}
\newcommand{\abbrTIDB}{\textnormal{TIDB}\xspace}%replace \ti with this
\newcommand{\abbrCTIDB}{\textnormal{$\bound$-TIDB}\xspace}
\newcommand{\abbrTIDBs}{\textnormal{TIDBs}\xspace}%replace \ti with this
\newcommand{\abbrBIDB}{\textnormal{BIDB}\xspace}
\newcommand{\ti}{TIDB\xspace}
\newcommand{\tis}{TIDBs\xspace}
\newcommand{\bi}{BIDB\xspace}
\newcommand{\bis}{BIDBs\xspace}
\newcommand{\abbrNXPDB}{$\semNX$-encoded PDB\xspace}
%not sure if we use these; arguably the above abbrev macros should have a name change
\newcommand{\tiabb}{ti}
\newcommand{\biabb}{bi}
\newcommand{\biwset}{\idb_{\biabb}}
\newcommand{\biord}{\leq_{x_\block}}
\newcommand{\tiwset}{\idb_{\tiabb}}
\newcommand{\bipd}{\pd_{\biabb}}
\newcommand{\tipd}{\pd_{\tiabb}}
\newcommand{\bipdb}{\pdb_{\biabb}}
\newcommand{\tipdb}{\pdb_{\tiabb}}
%--------------------------------
\newcommand{\probDist}{\vct{\probOf}}%<---I don't think we need this.
\newcommand{\probAllTup}{\vct{\prob}}%<---I was using simply \vct{\prob}; decide on a convention
\newcommand{\wSet}{\Omega}%<---We have \idb, the set of possible worlds; decide on one of these
%Is this being used?
\newcommand{\pdbx}{X_{DB}}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%Math Symbols, Functions/Operators, Containers %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%Number Sets
\newcommand{\domR}{\mathbb{R}}
\newcommand{\domN}{\mathbb{N}}
%Probability, Expectation
\newcommand{\expct}{\mathop{\mathbb{E}}}%why not just call this \expect
\newcommand{\probOf}{\Pr}%probability operator; \Pr typesets upright with correct operator spacing (bare ``Pr'' in math mode renders as the italic product P*r)
%Functions/Operators
\newcommand{\abs}[1]{\left|#1\right|}
\newcommand{\suchthat}{\;|\;} %such that
\newcommand{\comprehension}[2]{\left\{\;#1\;|\;#2\;\right\}}
\newcommand{\eval}[1]{\llbracket #1 \rrbracket}%evaluation double brackets
\newcommand{\evald}[2]{\eval{{#1}}_{#2}}
%Containers
\newcommand{\pbox}[1]{\left[#1\right]}%<---used for expectation
\newcommand{\pbrace}[1]{\left\{#1\right\}}
%consider replacing \pbrace with what is below
\newcommand{\inparen}[1]{\left({#1}\right)}
\newcommand{\inset}[1]{\left\{{#1}\right\}}%we already have this as \pbrace; need to pick one
\newcommand{\intuple}[1]{\left\langle{#1}\right\rangle}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Variable, Polynomial and Vector Notation
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%Instance Variables
\newcommand{\prob}{p}
\newcommand{\wElem}{w} %an element of \vct{w}
\newcommand{\worldinst}{W}
%Polynomial Variables
\newcommand{\pVar}{X}%<----not used but recomment instituting this--pVar for polyVar
\newcommand{\kElem}{k}%the kth element<---where and how are we using this?
%Random Variables
\newcommand{\randWorld}{W}
\newcommand{\rvworld}{\vct{\randWorld}}
\newcommand{\randDB}{\vct{\db}}
\newcommand{\rvW}{W}%\rvW for random variable of type World<---this is the same as \randWorld
%One of these needs to go...I think...
\newcommand{\randomvar}{W}%this little guy needs a home!
%Container for Polynomial Params
\newcommand{\polyinput}[2]{\left(#1,\ldots, #2\right)}%do we still use this?
%Number of Variables--this could easily be number of tups--maybe move to Rel Model?
\newcommand{\numvar}{n}
%Number of blocks (BIDB)
\newcommand{\numblock}{m}
%Vector
\newcommand{\vct}[1]{{\bf #1}}
%norm
\newcommand{\norm}[1]{\left\lVert#1\right\rVert}
%using \wVec for world bit vector notation<-----Is this still the case?
%Polynomial
\newcommand{\hideg}{K}
\newcommand{\poly}{\Phi}
\newcommand{\genpoly}{\phi}
\newcommand{\vars}[1]{\func{Vars}\inparen{#1}}
\newcommand{\polyOf}[1]{\poly[#1]}
\newcommand{\polyqdt}[3]{\polyOf{#1,#2,#3}}
\newcommand{\apolyqdt}{\polyqdt{\query}{\tupset}{\tup}}
\newcommand{\nxpolyqdt}{\polyqdt{\query}{\db_{\semNX}}{\tup}}
\newcommand{\tupvar}[2]{X_{#1,#2}}
\newcommand{\atupvar}{\tupvar{\rel}{\tup}}
\newcommand{\polyX}{\poly\inparen{\vct{\pVar}}}%<---let's see if this proves handy
\newcommand{\rpoly}{\widetilde{\poly}}%r for reduced as in reduced 'Q'
\newcommand{\refpoly}[1]{\poly_{#1R}}
\newcommand{\rpolyX}{\rpoly\inparen{\pVar}}%<---if this isn't something we use much, we can get rid of it
\newcommand{\biDisProd}{\mathcal{B}}%bidb disjoint tuple products (def 2.5)
\newcommand{\rExp}{\mathcal{T}}%the set of variables to reduce all exponents to 1 via modulus operation; I think \mathcal T collides with the notation used for the set of tuples in D
\newcommand{\polyForTuple}{\poly_{\tup}}%do we use this?<--S 2
%Do we use this?
\newcommand{\out}{output}%output aggregation over the output vector
\newcommand{\prel}{\mathcal{\rel}}%What is this?
\newcommand{\linsett}[3]{\Phi_{#1,#2}^{#3}}%Where is this used?
\newcommand{\wbit}{w}%don't think we need this one
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%Graph Notation %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newcommand{\vset}{V}
\newcommand{\edgeSet}{E}
\newcommand{\gtype}[1]{\inparen{#1}}
\newcommand{\esetType}[1]{\edgeSet^{\gtype{#1}}}%edge set for induced graph G^{\inparen{\ell}}
\newcommand{\graph}[1]{G^{(#1)}}
\newcommand{\numocc}[2]{\#\left(#1,#2\right)}
\newcommand{\eset}[1]{E^{(#1)}_S} %edge set for arbitrary subgraph
%I don't think we use these anymore
\newcommand{\linsys}[1]{LS(\graph{#1})}
\newcommand{\lintime}[1]{LT^{\graph{#1}}}
\newcommand{\aug}[1]{AUG^{\graph{#1}}}
\newcommand{\mtrix}[1]{M_{#1}}
\newcommand{\dtrm}[1]{Det\left(#1\right)}
\newcommand{\tuple}[1]{\left<#1\right>}
\newcommand{\indicator}[1]{\onesymbol_{#1}}
%----------------------------------------------
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%Circuit Notation
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newcommand{\circuit}{\vari{C}}
\newcommand{\circuitset}[1]{\vari{CSet}\inparen{#1}}
\newcommand{\circmult}{\times}
\newcommand{\circplus}{+}
\newcommand{\rinput}{\vari{R}}
\newcommand{\linput}{\vari{L}}
\newcommand{\inp}{\vari{input}}
\newcommand{\inputs}{\vari{inputs}}%do we use this?
\newcommand{\subcircuit}{\vari{S}}%does this clash/conflict with \coeffset?
\newcommand{\gate}{\vari{g}}
\newcommand{\lwght}{\vari{Lweight}}
\newcommand{\rwght}{\vari{Rweight}}
\newcommand{\prt}{\vari{partial}}
\newcommand{\degval}{\vari{degree}}
\newcommand{\type}{\vari{type}}
\newcommand{\val}{\vari{val}}
%types of C
\newcommand{\var}{\textsc{var}\xspace}
\newcommand{\tnum}{\textsc{num}\xspace}
%Do we use this?
\newcommand{\subgraph}{\vari{S}_{\equivtree(\circuit)}}
%-----
\newcommand{\cost}{\func{Cost}}
\newcommand{\nullval}{NULL}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Datalog
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newcommand{\dlImp}[0]{\,\ensuremath{\mathtt{{:}-}}\,}
\newcommand{\dlDontcare}{\_}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Query Classes
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newcommand{\qClass}{\mathcal{Q}}
\newcommand{\raPlus}{\ensuremath{\mathcal{RA}^{+}}\xspace}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% COMPLEXITY
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newcommand{\bigO}[1]{O\inparen{#1}}
\newcommand{\littleo}[1]{o\inparen{#1}}
\newcommand{\bigOmega}[1]{\Omega\inparen{#1}}
\newcommand{\littleomega}[1]{\omega\inparen{#1}}
\newcommand{\np}{{\sf NP}\xspace}
\newcommand{\polytime}{{\sf P}\xspace}
\newcommand{\sharpp}{\#{\sf P}\xspace}
\newcommand{\sharpphard}{\#{\sf P}-hard\xspace}
\newcommand{\sharpwone}{\#{\sf W}[1]\xspace}
\newcommand{\sharpwzero}{\#{\sf W}[0]\xspace}
\newcommand{\sharpwonehard}{\#{\sf W}[1]-hard\xspace}
\newcommand{\ptime}{{\sf PTIME}\xspace}
\newcommand{\timeOf}[1]{T_{#1}}
\newcommand{\qruntime}[1]{T_{det}\inparen{#1}}
\newcommand{\optquery}[1]{\func{OPT}\inparen{#1}}
\newcommand{\qruntimenoopt}[1]{T_{det}\inparen{#1}}%need to get rid of this--needs to be propagated
\newcommand{\jointime}[1]{T_{join}(#1)}
\newcommand{\kmatchtime}{T_{match}\inparen{k, G}}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%Approx Alg
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newcommand{\randvar}{\vari{Y}}
\newcommand{\coeffset}{S}
\newcommand{\distinctvars}{d}
\newcommand{\coeffitem}[1]{c_{#1}\cdot\prob^{\distinctvars_{#1}}}
\newcommand{\unidist}[1]{Uniform\left(#1\right)}
\newcommand{\samplesize}{N}
\newcommand{\setsize}{m}
\newcommand{\empmean}{\overline{\vct{\randvar}}}
\newcommand{\setsum}{SUM}
\newcommand{\ave}[1]{AVG(#1)}
\newcommand{\hoeffestsum}{EST_{\setsum}}
\newcommand{\error}{\epsilon}
\newcommand{\conf}{\delta}
%Pseudo Code Notation
\newcommand{\plus}{\texttt{+}}
\newcommand{\mult}{\texttt{$\times$}}%\times is math-only; without $...$ this raised ``Missing $ inserted'' when \mult was used in text mode
\newcommand{\algname}[1]{\textsc{#1}\xspace}
\newcommand{\approxq}{\algname{Approximate$\rpoly$}}
\newcommand{\onepass}{\algname{OnePass}}
\newcommand{\sampmon}{\algname{SampleMonomial}}
%I don't think we use reduce anymore
\newcommand{\reduce}{\algname{Reduce}}
\newcommand{\ceil}[1]{\left\lceil #1 \right\rceil}
\newcommand{\vari}[1]{\texttt{#1}\xspace}
\newcommand{\accum}{\vari{acc}}
\newcommand{\numsamp}{\vari{N}}%we have \samplesize above; we can get rid of one of these
\newcommand{\numedge}{m}%we have set size above; we can get rid of one of these
\newcommand{\bivec}{\vari{b}_{\vari{vec}}}%Section 3--proof in appendix for last theorem
%Major cleaning needed to get rid of obsolete notation like expression trees, etc.
%I don't know that we use any of the expression tree macros anymore; if we do, they would be predominantly in S 3 and 4 and their respective appendices
%expression tree T
\newcommand{\etree}{\vari{T}}
\newcommand{\stree}{\vari{S}}
\newcommand{\lchild}{\vari{L}}
\newcommand{\rchild}{\vari{R}}
%I don't think we talk of T but of C; let's update this. These should be used only in S 2 and S4
%members of T
\newcommand{\wght}{\vari{weight}}
\newcommand{\vpartial}{\vari{partial}}
%%%%%%%
\renewcommand{\algorithmicrequire}{\textbf{Input:}}
\renewcommand{\algorithmicensure}{\textbf{Output:}}
%\newcommand{\smb}{\poly\left(\vct{X}\right)}%smb for standard monomial basis; S 2<---this command is, I believe, unnecessary
%not sure if we use this
%not sure if we use this
\newcommand{\etreeset}[1]{\vari{ET}\left(#1\right)}
%verify this
%\expandtree is a placeholder until I change other files with the new macro name \expansion
\newcommand{\expandtree}[1]{\vari{E}(#1)}
\newcommand{\expansion}[1]{\vari{E}(#1)}
%not sure if we use this; I think the only occurrence would be in the def section of S 4
\newcommand{\elist}[1]{\vari{List}\pbox{#1}}
%not sure if we use this anymore either
\newcommand{\equivtree}{\vari{EET}}
%expandtree tuple elements:
\newcommand{\monom}{\vari{v}}
\newcommand{\encMon}{\monom_{\vari{m}}}
\newcommand{\lencMon}{\monom_{\vari{m}_\linput}}
\newcommand{\rencMon}{\monom_{\vari{m}_\rinput}}
\newcommand{\coef}{\vari{c}}
%----------------------------------
% REPRESENTATIONS--this might be Boris' or Atri's stuff; verify if these macros are current
\newcommand{\rmod}{Mod}%mod function which transforms N[X]-DB to N-DB (S 2 and App A)
\newcommand{\reprs}{\mathcal{M}}%used to define Representation System in App A
\newcommand{\repr}{M}
%not sure about these? Perhaps in appendix B for \assign and S 5 for \support?
\newcommand{\assign}{\psi}%assignment function from a world vector to polynomial output in App A
\newcommand{\support}[1]{supp({#1})}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newcommand{\eps}{\epsilon}%<----this is already defined as \error; need to pick one
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Forcing Layouts
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newcommand{\trimfigurespacing}{\vspace*{-5mm}}
\newcommand{\mypar}[1]{\smallskip\noindent\textbf{{#1}.}}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%Proof/Section Headings %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%Is this being used?
\newcommand{\caseheading}[1]{\smallskip \noindent \textbf{#1}.~}
%%%%%
%%%Adding stuff below so that long chain of display equatoons can be split across pages
\allowdisplaybreaks
%Macro for mult complexity
\newcommand{\multc}[2]{\overline{\mathcal{M}}\left({#1},{#2}\right)}
%consider perhaps putting the tikz code into a separate file.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Tikz Graph Symbols
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%Shift macro
\newcommand{\patternshift}[1]{\hspace*{-0.5mm}\raisebox{-0.35mm}{#1}\hspace*{-0.5mm} }
%Global styles
\tikzset{
default_node/.style={align=center, inner sep=0pt},
pattern_node/.style={fill=gray!50, draw=black, semithick, inner sep=0pt, minimum size = 2pt, circle},
tree_node/.style={default_node, draw=black, black, circle, text width=0.5cm, font=\bfseries, minimum size=0.65cm},
gen_tree_node/.style={default_node, draw, circle, text width=0.5cm, font=\bfseries, minimum size=0.65cm},
highlight_color/.style={black}, wght_color/.style={black},
highlight_treenode/.style={tree_node, draw=black, black},
edge from parent path={(\tikzparentnode) -- (\tikzchildnode)}
}
%Subgraph patterns
\newcommand{\ed}{\patternshift{
\begin{tikzpicture}[every path/.style={thick, draw}]%[baseline=0.00005cm]
%\begin{scope}[yshift=-5cm]
\node at (0, 0)[pattern_node](bottom){};
\node [above=0.07cm of bottom, pattern_node] (top){};
\draw (top) -- (bottom);
% \node at (0, -2)[pattern_node, blue](b2){};
% \node [above=0.07cm of b2, pattern_node, blue] (t2){};
% \draw (t2) -- (b2);
%\end{scope}
\end{tikzpicture}
}
}
\newcommand{\kmatch}{\ed\cdots\ed^\kElem}
\newcommand{\twodis}{\patternshift{
\begin{tikzpicture}[every path/.style={thick, draw}]
\node at (0, 0) [pattern_node] (bottom1) {};
\node[above=0.07cm of bottom1, pattern_node] (top1) {} edge (bottom1);
\node at (0.14, 0) [pattern_node] (bottom2) {};
\node [above=0.07cm of bottom2, pattern_node] (top2) {} edge (bottom2);
\end{tikzpicture}
}
}
\newcommand{\twopath}{\patternshift{
\begin{tikzpicture}[every path/.style={thick, draw}]
\node at (0, 0.08) [pattern_node] (top){};
\node [below left=0.095cm and 0.05cm of top, pattern_node](left){};
\node[below right=0.095cm and 0.05cm of top, pattern_node](right){};
\draw (top) -- (left);
\draw (top) -- (right);
\end{tikzpicture}
}
}
\newcommand{\threedis}{\patternshift{
\begin{tikzpicture}[every path/.style={thick, draw}]
\node at (0, 0) [pattern_node] (bottom1) {};
\node[above=0.07cm of bottom1, pattern_node] (top1) {} edge (bottom1);
\node at (0.14, 0) [pattern_node] (bottom2) {};
\node [above=0.07cm of bottom2, pattern_node] (top2) {} edge (bottom2);
\node at (0.28, 0) [pattern_node] (bottom3) {};
\node [above=0.07cm of bottom3, pattern_node] (top3) {} edge (bottom3);
\end{tikzpicture}
}
}
\newcommand{\tri}{\patternshift{
\begin{tikzpicture}[every path/.style={ thick, draw}]
\node at (0, 0.08) [pattern_node] (top){};
\node [below left=0.08cm and 0.01cm of top, pattern_node](left){} edge (top);
\node[below right=0.08cm and 0.01cm of top, pattern_node](right){} edge (top) edge (left);
\end{tikzpicture}
}
}
\newcommand{\twopathdis}{\ed~\twopath}
\newcommand{\threepath}{\patternshift{
\begin{tikzpicture}[every path/.style={thick, draw}]
\node at (0, 0) [pattern_node] (node1a) {};
\node [above=0.07cm of node1a, pattern_node] (node1b) {} edge (node1a);
\node [right=0.099cm of node1a, pattern_node] (node2b) {}; %edge [semithick] (node1b);
\node [above=0.07cm of node2b, pattern_node] (node3b) {} edge (node2b);
\draw (node1b) -- (node3b);
\end{tikzpicture}
}
}
\newcommand{\oneint}{\patternshift{
\begin{tikzpicture}[level/.style={sibling distance=0.14cm, level distance=0.15cm}, every path/.style={thick, draw}]
\node at (0, 0) [pattern_node] {} [grow=down]
child{node [pattern_node]{}}
child {node [pattern_node] {}}
child{node [pattern_node] {}};
\end{tikzpicture}
}
}
\newcommand{\bsym}[1]{\boldsymbol{#1}}%b for bold; sym for symbol
\newcommand{\sg}[1]{S^{(#1)}}
%%% Local Variables:
%%% mode: latex
%%% TeX-master: "main"
%%% End:

View File

@ -0,0 +1,210 @@
\documentclass[sigconf]{acmart}
\usepackage{algpseudocode}
\usepackage{algorithm}
\usepackage{tikz}
\usepackage{tikz-qtree}
\usepackage{comment}
\usepackage{amsmath}
% \usepackage{amssymb}
%\let\proof\relax
%
\let\endproof\relax
\usepackage{amsthm}
\usepackage{mathtools}
\usepackage{etoolbox}
\usepackage{xstring} %for conditionals in \newcommand
\usepackage{stmaryrd}
\usepackage[normalem]{ulem}
\usepackage{subcaption}
\usepackage{booktabs}
\usepackage[disable]{todonotes}
\usepackage{graphicx}
\usepackage{listings}
%%%%%%%%%% SQL + proveannce listing settings
\lstdefinestyle{psql}
{
tabsize=2,
basicstyle=\small\upshape\ttfamily,
language=SQL,
morekeywords={PROVENANCE,BASERELATION,INFLUENCE,COPY,ON,TRANSPROV,TRANSSQL,TRANSXML,CONTRIBUTION,COMPLETE,TRANSITIVE,NONTRANSITIVE,EXPLAIN,SQLTEXT,GRAPH,IS,ANNOT,THIS,XSLT,MAPPROV,cxpath,OF,TRANSACTION,SERIALIZABLE,COMMITTED,INSERT,INTO,WITH,SCN,UPDATED,LENS,SCHEMA_MATCHING,string,WINDOW,max,OVER,PARTITION,FIRST_VALUE,WITH},
extendedchars=false,
keywordstyle=\bfseries,
mathescape=true,
escapechar=@,
sensitive=true
}
\lstset{style=psql}
%%%%%%%%%%%%%%%%%%BORROWED FROM UADB paper^-----
\usepackage{fancyvrb}
\usepackage{caption}
\usepackage{subcaption}
\usepackage{braket}
\usepackage[inline]{enumitem}
\usepackage{xspace}
\usepackage{hyperref}
\usepackage{url}
\usepackage{cleveref}
\usepackage{color}
% \usepackage{bbold}
\graphicspath{ {figures/} }
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\input{macros}
% Copyright
\setcopyright{none}
%\setcopyright{acmcopyright}
%\setcopyright{acmlicensed}
% \setcopyright{rightsretained}
%\setcopyright{usgov}
%\setcopyright{usgovmixed}
%\setcopyright{cagov}
%\setcopyright{cagovmixed}
% DOI
\acmDOI{10.475/123_4}
% ISBN
\acmISBN{123-4567-24-567/08/06}
%Conference
\acmConference[WOODSTOCK'97]{ACM Woodstock conference}{July 1997}{El
Paso, Texas USA}
\acmYear{1997}
\copyrightyear{2016}
\acmArticle{4}
\acmPrice{15.00}
%%%%%%%%%%%%%%%%%%%%
% \textbullet Modelling Uncertainty as Attribute-level Taints and its Relationship to Provenance}
\title{Standard Operating Procedure in PDBs Considered Harmful}
\subtitle{(for bags)}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\author{Su Feng, Boris Glavic}
% \orcid{1234-5678-9012}
\affiliation{%
\institution{Illinois Institute of Technology}
\country{USA}
}
\email{sfeng14@hawk.iit.edu,bglavic@iit.edu}
\author{Aaron Huber, Oliver Kennedy, Atri Rudra}
% \orcid{1234-5678-9012}
\affiliation{%
\institution{University at Buffalo}
\country{USA}
}
\email{ahuber,okennedy,atri@buffalo.edu}
% %%%%%%%%%%%%%%%%%%%%
% \author{Su Feng}
% % \orcid{1234-5678-9012}
% \affiliation{%
% \institution{Illinois Institute of Technology}
% }
% \email{sfeng14@hawk.iit.edu}
% \author{Boris Glavic}
% % \orcid{1234-5678-9012}
% \affiliation{%
% \institution{Illinois Institute of Technology}
% }
% \email{bglavic@iit.edu}
% \author{Aaron Huber}
% % \orcid{1234-5678-9012}
% \affiliation{%
% \institution{University at Buffalo}
% }
% \email{ahuber@buffalo.edu}
% \author{Oliver Kennedy}
% % \orcid{1234-5678-9012}
% \affiliation{%
% \institution{University at Buffalo}
% }
% \email{okennedy@buffalo.edu}
% \author{Atri Rudra}
% % \orcid{1234-5678-9012}
% \affiliation{%
% \institution{University at Buffalo}
% }
% \email{atri@buffalo.edu}
\pagestyle{plain}
\begin{document}
\input{abstract}
\lstset{language=sql}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% TECH REPORT TITLE PAGE
%\input{tr/techreport_page.tex}
\maketitle
%%%%%%%%%%%%%%%%%%%%
%\input{abstract}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%\input{prob_def}
%\input{notation}
%\input{analysis}
%\input{est_bounds}
%\input{combining}
%
%\input{instantiation}
%\input{hash_const}
%\input{exact}
%\input{var_estj}
%\input{pos}
%\input{sop}
%\input{davidscheme}
\input{intro}
\input{ra-to-poly}
%\input{poly-form}
\input{mult_distinct_p}
\input{single_p}
%\input{lin_sys}
\input{approx_alg}
% \input{bi_cancellation}
\input{circuits-model-runtime}
\input{related-work}
\input{conclusions}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\bibliographystyle{ACM-Reference-Format}% acmart expects the ACM reference style, not plain
\bibliography{main}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% APPENDIX
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\clearpage
\appendix
\normalsize
\input{hardness-app}
\input{related-work-extra}
% \input{glossary.tex}
% \input{addproofappendix.tex}
\end{document}

802
Sketching Worlds/main.bib Normal file
View File

@ -0,0 +1,802 @@
@article{CHEN20061346,
title = {Strong computational lower bounds via parameterized complexity},
journal = {Journal of Computer and System Sciences},
volume = {72},
number = {8},
pages = {1346-1367},
year = {2006},
issn = {0022-0000},
doi = {https://doi.org/10.1016/j.jcss.2006.04.007},
url = {https://www.sciencedirect.com/science/article/pii/S0022000006000675},
author = {Jianer Chen and Xiuzhen Huang and Iyad A. Kanj and Ge Xia},
keywords = {Parameterized computation, Computational complexity, Lower bound, Clique, Polynomial time approximation scheme},
abstract = {We develop new techniques for deriving strong computational lower bounds for a class of well-known NP-hard problems. This class includes weighted satisfiability, dominating set, hitting set, set cover, clique, and independent set. For example, although a trivial enumeration can easily test in time O(nk) if a given graph of n vertices has a clique of size k, we prove that unless an unlikely collapse occurs in parameterized complexity theory, the problem is not solvable in time f(k)no(k) for any function f, even if we restrict the parameter values to be bounded by an arbitrarily small function of n. Under the same assumption, we prove that even if we restrict the parameter values k to be of the order Θ(μ(n)) for any reasonable function μ, no algorithm of running time no(k) can test if a graph of n vertices has a clique of size k. Similar strong lower bounds on the computational complexity are also derived for other NP-hard problems in the above class. Our techniques can be further extended to derive computational lower bounds on polynomial time approximation schemes for NP-hard optimization problems. For example, we prove that the NP-hard distinguishing substring selection problem, for which a polynomial time approximation scheme has been recently developed, has no polynomial time approximation schemes of running time f(1/ϵ)no(1/ϵ) for any function f unless an unlikely collapse occurs in parameterized complexity theory.}
}
@inproceedings{10.5555/645413.652181,
author = {Flum, J\"{o}rg and Grohe, Martin},
title = {The Parameterized Complexity of Counting Problems},
year = {2002},
isbn = {0769518222},
publisher = {IEEE Computer Society},
address = {USA},
abstract = {We develop a parameterized complexity theory for counting problems. As the basis of this theory, we introduce a hierarchy of parameterized counting complexity classes #W[t], for t geqslant 1 , that corresponds to Downey and Fellows's W-hierarchy [12] and show that a few central W-completeness results for decision problems translate to #W-completeness results for the corresponding counting problems.Counting complexity gets interesting with problems whose decision version is tractable, but whose counting version is hard. Our main result states that counting cycles and paths of length k in both directed and undirected graphs, parameterized by k , is#W[1]-complete. This makes it highly unlikely that any of these problems is fixed-parameter tractable, even though their decision versions are fixed-parameter tractable. More explicitly, our result shows that most likely there is no f(k) cdot n^c-algorithm for counting cycles or paths of length k in a graph of size n for any computable function f: mathbb{N} to mathbb{N} and constant c , even though there is a 2^{0(k)}cdot n^{2.376}algorithm for finding a cycle or path of length k [2].},
booktitle = {Proceedings of the 43rd Symposium on Foundations of Computer Science},
pages = {538},
series = {FOCS '02}
}
@misc{pdbench,
howpublished = {\url{http://pdbench.sourceforge.net/}},
note = {Accessed: 2020-12-15},
title = {pdbench}
}
@article{AF18,
author = {Arab, Bahareh and Feng, Su and Glavic, Boris and Lee, Seokki and Niu, Xing and Zeng, Qitian},
journal = {IEEE Data Eng. Bull.},
number = {1},
pages = {51--62},
title = {GProM - A Swiss Army Knife for Your Provenance Needs},
volume = {41},
year = {2018}
}
@inproceedings{Imielinski1989IncompleteII,
title={Incomplete Information in Relational Databases},
author={T. Imielinski and W. Lipski},
year={1989}
}
@inproceedings{10.1145/1265530.1265571,
author = {Dalvi, Nilesh and Suciu, Dan},
booktitle = {PODS},
numpages = {10},
pages = {293--302},
title = {The Dichotomy of Conjunctive Queries on Probabilistic Structures},
year = {2007}
}
@inproceedings{DBLP:conf/icde/OlteanuHK10,
author = {Dan Olteanu and
Jiewen Huang and
Christoph Koch},
booktitle = {ICDE},
pages = {145--156},
title = {Approximate confidence computation in probabilistic databases},
year = {2010}
}
@book{DBLP:series/synthesis/2011Suciu,
author = {Dan Suciu and
Dan Olteanu and
Christopher Ré and
Christoph Koch},
publisher = {Morgan \& Claypool Publishers},
title = {Probabilistic Databases},
year = {2011}
}
@inproceedings{feng:2019:sigmod:uncertainty,
author = {Feng, Su and Huber, Aaron and Glavic, Boris and Kennedy, Oliver},
booktitle = {SIGMOD},
title = {Uncertainty Annotated Databases - A Lightweight Approach for Approximating Certain Answers},
year = {2019}
}
@article{FH12,
author = {Fink, Robert and Han, Larisa and Olteanu, Dan},
journal = {PVLDB},
number = {5},
pages = {490--501},
title = {Aggregation in probabilistic databases via knowledge compilation},
volume = {5},
year = {2012}
}
@inproceedings{DBLP:conf/tapp/Zavodny11,
author = {Jakub Závodný},
booktitle = {TaPP},
title = {On Factorisation of Provenance Polynomials},
year = {2011}
}
@inproceedings{kennedy:2010:icde:pip,
author = {Kennedy, Oliver and Koch, Christoph},
booktitle = {ICDE},
title = {PIP: A Database System for Great and Small Expectations},
year = {2010}
}
@inproceedings{DBLP:conf/icde/AntovaKO07a,
author = {Lyublena Antova and
Christoph Koch and
Dan Olteanu},
booktitle = {ICDE},
title = {MayBMS: Managing Incomplete Information with Probabilistic World-Set
Decompositions},
year = {2007}
}
@misc{Antova_fastand,
author = {Lyublena Antova and Thomas Jansen and Christoph Koch and Dan Olteanu},
title = {Fast and Simple Relational Processing of Uncertain Data},
year = {}
}
@inproceedings{DBLP:conf/pods/KhamisNR16,
author = {Mahmoud Abo Khamis and
Hung Q. Ngo and
Atri Rudra},
booktitle = {PODS},
pages = {13--28},
title = {FAQ: Questions Asked Frequently},
year = {2016}
}
@article{DBLP:journals/sigmod/GuagliardoL17,
author = {Paolo Guagliardo and
Leonid Libkin},
journal = {SIGMOD Rec.},
number = {3},
pages = {5--16},
title = {Correctness of SQL Queries on Databases with Nulls},
volume = {46},
year = {2017}
}
@inproceedings{DBLP:conf/vldb/AgrawalBSHNSW06,
author = {Parag Agrawal and
Omar Benjelloun and
Anish Das Sarma and
Chris Hayworth and
Shubha U. Nabar and
Tomoe Sugihara and
Jennifer Widom},
booktitle = {VLDB},
pages = {1151--1154},
title = {Trio: A System for Data, Uncertainty, and Lineage},
year = {2006}
}
@inproceedings{k-match,
author = {Radu Curticapean},
booktitle = {ICALP},
pages = {352--363},
title = {Counting Matchings of Size k Is W[1]-Hard},
volume = {7965},
year = {2013}
}
@inproceedings{DBLP:conf/sigmod/SinghMMPHS08,
author = {Sarvjeet Singh and
Chris Mayfield and
Sagar Mittal and
Sunil Prabhakar and
Susanne E. Hambrusch and
Rahul Shah},
booktitle = {SIGMOD},
pages = {1239--1242},
title = {Orion 2.0: native support for uncertain data},
year = {2008}
}
@inproceedings{DBLP:conf/pods/GreenKT07,
author = {Todd J. Green and
Gregory Karvounarakis and
Val Tannen},
booktitle = {PODS},
pages = {31--40},
title = {Provenance semirings},
year = {2007}
}
@inproceedings{ngo-survey,
author = {Hung Q. Ngo},
booktitle = {PODS},
title = {Worst-Case Optimal Join Algorithms: Techniques, Results, and Open
Problems},
year = {2018}
}
@article{skew,
author = {Hung Q. Ngo and
Christopher Ré and
Atri Rudra},
journal = {SIGMOD Rec.},
number = {4},
pages = {5--16},
title = {Skew strikes back: new developments in the theory of join algorithms},
volume = {42},
year = {2013}
}
@article{NPRR,
author = {Hung Q. Ngo and
Ely Porat and
Christopher Ré and
Atri Rudra},
journal = {J. ACM},
number = {3},
pages = {16:1--16:40},
title = {Worst-case Optimal Join Algorithms},
volume = {65},
year = {2018}
}
@book{arith-complexity,
author = {Peter Bürgisser and
Michael Clausen and
Mohammad Amin Shokrollahi},
publisher = {Springer},
title = {Algebraic complexity theory},
volume = {315},
year = {1997}
}
@inproceedings{triang-hard,
author = {Tsvi Kopelowitz and
Virginia Vassilevska Williams},
booktitle = {ICALP},
pages = {74:1--74:16},
title = {Towards Optimal Set-Disjointness and Set-Intersection Data Structures},
volume = {168},
year = {2020}
}
@article{LL97,
author = {Lakshmanan, L.V.S. and Leone, N. and Ross, R. and Subrahmanian, VS},
journal = {TODS},
number = {3},
pages = {419--469},
title = {Probview: A flexible probabilistic database system},
volume = {22},
year = {1997}
}
@article{jha-13-kcmdt,
author = {Jha, Abhay and Suciu, Dan},
title = {Knowledge Compilation Meets Database Theory: Compiling Queries
To Decision Diagrams},
journal = {Theory of Computing Systems},
volume = 52,
number = 3,
pages = {403--440},
year = 2013,
publisher = {Springer},
}
@inproceedings{BS06,
author = {Omar Benjelloun and Anish Das Sarma and Alon Y. Halevy and Jennifer Widom},
booktitle = {VLDB},
pages = {953--964},
title = {ULDBs: Databases with Uncertainty and Lineage},
year = {2006}
}
@conference{RS07,
author = {Ré, C. and Suciu, D.},
booktitle = {VLDB},
pages = {51--62},
title = {Materialized views in probabilistic databases: for information exchange and query optimization},
year = {2007}
}
@article{VS17,
  Author = {Van den Broeck, Guy and Suciu, Dan},
  Journal = {Foundations and Trends in Databases},
  Title = {Query Processing on Probabilistic Data: A Survey},
  Year = {2017},
}
@incollection{GT06,
author = {Green, Todd J and Tannen, Val},
booktitle = {EDBT},
pages = {278--296},
title = {Models for incomplete and probabilistic information},
year = {2006}
}
@article{IL84a,
author = {Imieli\'nski, Tomasz and Lipski Jr, Witold},
journal = {JACM},
number = {4},
pages = {761--791},
title = {Incomplete Information in Relational Databases},
volume = {31},
year = {1984}
}
@article{DS12,
author = {Dalvi, Nilesh and Suciu, Dan},
journal = {JACM},
number = {6},
pages = {30},
title = {The dichotomy of probabilistic inference for unions of conjunctive queries},
volume = {59},
year = {2012}
}
@inproceedings{heuvel-19-anappdsd,
author = {Maarten Van den Heuvel and Peter Ivanov and Wolfgang Gatterbauer and Floris Geerts and Martin Theobald},
booktitle = {SIGMOD},
pages = {1295--1312},
title = {Anytime Approximation in Probabilistic Databases via Scaled Dissociations},
year = {2019}
}
@inproceedings{AB15,
  author = {Amarilli, Antoine and Bourhis, Pierre and Senellart, Pierre},
  booktitle = {PODS},
  title = {Probabilities and provenance via tree decompositions},
  year = {2015}
}
@inproceedings{OH09a,
author = {Olteanu, Dan and Huang, Jiewen},
booktitle = {SIGMOD},
pages = {389--402},
title = {Secondary-storage confidence computation for conjunctive queries with inequalities},
year = {2009}
}
@article{FO16,
author = {Robert Fink and Dan Olteanu},
journal = {TODS},
number = {1},
pages = {4:1--4:47},
title = {Dichotomies for Queries with Negation in Probabilistic Databases},
volume = {41},
year = {2016}
}
@article{FH13,
author = {Robert Fink and Jiewen Huang and Dan Olteanu},
journal = {VLDBJ},
number = {6},
pages = {823--848},
title = {Anytime approximation in probabilistic databases},
volume = {22},
year = {2013}
}
@inproceedings{AB15c,
author = {Antoine Amarilli and Pierre Bourhis and Pierre Senellart},
booktitle = {ICALP},
pages = {56--68},
title = {Provenance Circuits for Trees and Treelike Instances},
year = {2015}
}
@inproceedings{kenig-13-nclexpdc,
author = {Batya Kenig and Avigdor Gal and Ofer Strichman},
booktitle = {SUM},
pages = {219--232},
title = {A New Class of Lineage Expressions over Probabilistic Databases Computable in P-Time},
volume = {8078},
year = {2013}
}
@inproceedings{cavallo-87-tpd,
author = {Roger Cavallo and Michael Pittarelli},
booktitle = {VLDB},
pages = {71--81},
title = {The Theory of Probabilistic Databases},
year = {1987}
}
@inproceedings{roy-11-f,
author = {Sudeepa Roy and Vittorio Perduca and Val Tannen},
booktitle = {ICDT},
title = {Faster query answering in probabilistic databases using read-once functions},
year = {2011}
}
@article{sen-10-ronfqevpd,
author = {Prithviraj Sen and Amol Deshpande and Lise Getoor},
journal = {PVLDB},
number = {1},
pages = {1068--1079},
title = {Read-Once Functions and Query Evaluation in Probabilistic Databases},
volume = {3},
year = {2010}
}
@article{provan-83-ccccptg,
author = {J. Scott Provan and Michael O. Ball},
journal = {SIAM J. Comput.},
number = {4},
pages = {777--788},
title = {The Complexity of Counting Cuts and of Computing the Probability That a Graph Is Connected},
volume = {12},
year = {1983}
}
@article{valiant-79-cenrp,
author = {Leslie G. Valiant},
journal = {SIAM J. Comput.},
number = {3},
pages = {410--421},
title = {The Complexity of Enumeration and Reliability Problems},
volume = {8},
year = {1979}
}
@inproceedings{AD11d,
author = {Amsterdamer, Yael and Deutch, Daniel and Tannen, Val},
booktitle = {PODS},
pages = {153--164},
title = {Provenance for Aggregate Queries},
year = {2011}
}
@article{S18a,
author = {Senellart, Pierre},
journal = {SIGMOD Record},
number = {4},
pages = {5--15},
title = {Provenance and Probabilities in Relational Databases},
volume = {46},
year = {2018}
}
@article{RS09b,
author = {Christopher Ré and Dan Suciu},
journal = {VLDBJ},
number = {5},
pages = {1091--1116},
title = {The trichotomy of HAVING queries on a probabilistic database},
volume = {18},
year = {2009}
}
@article{gatterbauer-17-dpaplinws,
author = {Wolfgang Gatterbauer and Dan Suciu},
title = {Dissociation and Propagation for Approximate Lifted Inference
With Standard Relational Database Management Systems},
journal = {{VLDB} J.},
volume = 26,
number = 1,
pages = {5--30},
year = 2017
}
@inproceedings{fink-11,
author = {Robert Fink and Dan Olteanu},
booktitle = {ICDT},
pages = {174--185},
title = {On the optimal approximation of queries using tractable propositional languages},
year = {2011}
}
@article{jha-12-pdwm,
author = {Abhay Kumar Jha and Dan Suciu},
journal = {PVLDB},
number = {11},
pages = {1160--1171},
title = {Probabilistic Databases With Markoviews},
volume = {5},
year = {2012}
}
@conference{BD05,
author = {Boulos, J. and Dalvi, N. and Mandhani, B. and Mathur, S. and Re, C. and Suciu, D.},
booktitle = {SIGMOD},
title = {MYSTIQ: a system for finding more answers by using probabilities},
year = {2005}
}
@article{DS07,
  author = {Dalvi, N. and Suciu, D.},
  journal = {VLDBJ},
  number = {4},
  pages = {523--544},
  title = {Efficient query evaluation on probabilistic databases},
  volume = {16},
  year = {2007}
}
@inproceedings{re-07-eftqevpd,
author = {Christopher Ré and Nilesh N. Dalvi and Dan Suciu},
booktitle = {ICDE},
pages = {886--895},
title = {Efficient Top-k Query Evaluation on Probabilistic Data},
year = {2007}
}
@inproceedings{DM14c,
author = {Deutch, Daniel and Milo, Tova and Roy, Sudeepa and Tannen, Val},
booktitle = {ICDT},
pages = {201--212},
title = {Circuits for Datalog Provenance},
year = {2014}
}
@inproceedings{bahar-93-al,
author = {R. Iris Bahar and Erica A. Frohm and Charles M. Gaona and Gary
D. Hachtel and Enrico Macii and Abelardo Pardo and Fabio
Somenzi},
booktitle = {IEEE CAD},
title = {Algebraic decision diagrams and their applications},
year = {1993}
}
@inproceedings{gogate-13-smp,
author = {Vibhav Gogate and Pedro M. Domingos},
booktitle = {UAI},
title = {Structured Message Passing},
year = {2013}
}
@article{chen-10-cswssr,
author = {Hubie Chen and Martin Grohe},
journal = {J. Comput. Syst. Sci.},
number = {8},
pages = {847--860},
title = {Constraint Satisfaction With Succinctly Specified Relations},
volume = {76},
year = {2010}
}
@inproceedings{GL16,
author = {Paolo Guagliardo and
Leonid Libkin},
booktitle = {PODS},
title = {Making SQL Queries Correct on Incomplete Databases: A Feasibility
Study},
year = {2016}
}
@inproceedings{jampani2008mcdb,
author = {Jampani, Ravi and Xu, Fei and Wu, Mingxi and Perez, Luis Leopoldo and Jermaine, Christopher and Haas, Peter J},
booktitle = {SIGMOD},
title = {MCDB: a monte carlo approach to managing uncertain data},
year = {2008}
}
@article{yang:2015:pvldb:lenses,
author = {Yang, Ying and Meneghetti, Niccolò and Fehling, Ronny and Liu, Zhen Hua and Gawlick, Dieter and Kennedy, Oliver},
title = {Lenses: An On-Demand Approach to ETL},
journal = {PVLDB},
volume = {8},
number = {12},
year = {2015},
pages = {1578--1589}
}
@misc{pdbench,
title = {pdbench},
howpublished = {\url{http://pdbench.sourceforge.net/}},
note = {Accessed: 2020-12-15}
}
@article{factorized-db,
author = {Dan Olteanu and
Maximilian Schleich},
journal = {SIGMOD Rec.},
number = {2},
pages = {5--16},
title = {Factorized Databases},
volume = {45},
year = {2016}
}
@article{virgi-survey,
author = {Virginia Vassilevska Williams},
title = {Some Open Problems in Fine-Grained Complexity},
journal = {{SIGACT} News},
volume = {49},
number = {4},
pages = {29--35},
year = {2018},
url = {https://doi.org/10.1145/3300150.3300158},
doi = {10.1145/3300150.3300158},
timestamp = {Tue, 18 Dec 2018 15:19:27 +0100},
biburl = {https://dblp.org/rec/journals/sigact/Williams18.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@book{param-comp,
author = {J{\"{o}}rg Flum and
Martin Grohe},
title = {Parameterized Complexity Theory},
series = {Texts in Theoretical Computer Science. An {EATCS} Series},
publisher = {Springer},
year = {2006},
url = {https://doi.org/10.1007/3-540-29953-X},
doi = {10.1007/3-540-29953-X},
isbn = {978-3-540-29952-3},
timestamp = {Tue, 16 May 2017 14:24:38 +0200},
biburl = {https://dblp.org/rec/series/txtcs/FlumG06.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@book{DBLP:books/daglib/0020812,
author = {Hector Garcia{-}Molina and
Jeffrey D. Ullman and
Jennifer Widom},
title = {Database systems - the complete book {(2.} ed.)},
publisher = {Pearson Education},
year = {2009}
}
@article{DBLP:journals/jal/KarpLM89,
author = {Richard M. Karp and
Michael Luby and
Neal Madras},
title = {Monte-Carlo Approximation Algorithms for Enumeration Problems},
journal = {J. Algorithms},
volume = {10},
number = {3},
pages = {429--448},
year = {1989}
}
@inproceedings{ajar,
author = {Manas R. Joglekar and
Rohan Puttagunta and
Christopher R{\'{e}}},
editor = {Tova Milo and
Wang{-}Chiew Tan},
title = {{AJAR:} Aggregations and Joins over Annotated Relations},
booktitle = {Proceedings of the 35th {ACM} {SIGMOD-SIGACT-SIGAI} Symposium on Principles
of Database Systems, {PODS} 2016, San Francisco, CA, USA, June 26
- July 01, 2016},
pages = {91--106},
publisher = {{ACM}},
year = {2016},
url = {https://doi.org/10.1145/2902251.2902293},
doi = {10.1145/2902251.2902293},
timestamp = {Tue, 06 Nov 2018 16:58:02 +0100},
biburl = {https://dblp.org/rec/conf/pods/JoglekarPR16.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@book{graetzer-08-un,
author = {Gr{\"a}tzer, George},
title = {Universal algebra},
year = 2008,
publisher = {Springer Science \& Business Media}
}
@article{AGM,
author = {Albert Atserias and
Martin Grohe and
D{\'{a}}niel Marx},
title = {Size Bounds and Query Plans for Relational Joins},
journal = {{SIAM} J. Comput.},
volume = {42},
number = {4},
pages = {1737--1767},
year = {2013},
url = {https://doi.org/10.1137/110859440},
doi = {10.1137/110859440},
timestamp = {Thu, 08 Jun 2017 08:59:24 +0200},
biburl = {https://dblp.org/rec/journals/siamcomp/AtseriasGM13.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/vldb/SaRR0W0Z17,
author = {Christopher De Sa and
Alexander Ratner and
Christopher R{\'{e}} and
Jaeho Shin and
Feiran Wang and
Sen Wu and
Ce Zhang},
title = {Incremental knowledge base construction using DeepDive},
journal = {{VLDB} J.},
volume = {26},
number = {1},
pages = {81--105},
year = {2017}
}
@article{DBLP:journals/pvldb/RekatsinasCIR17,
author = {Theodoros Rekatsinas and
Xu Chu and
Ihab F. Ilyas and
Christopher R{\'{e}}},
title = {HoloClean: Holistic Data Repairs with Probabilistic Inference},
journal = {Proc. {VLDB} Endow.},
volume = {10},
number = {11},
pages = {1190--1201},
year = {2017}
}
@article{DBLP:journals/pvldb/BeskalesIG10,
author = {George Beskales and
Ihab F. Ilyas and
Lukasz Golab},
title = {Sampling the Repairs of Functional Dependency Violations under Hard
Constraints},
journal = {Proc. {VLDB} Endow.},
volume = {3},
number = {1},
pages = {197--207},
year = {2010}
}
@article{DBLP:journals/tods/OlteanuS16,
author = {Dan Olteanu and
Sebastiaan J. van Schaik},
title = {ENFrame: {A} Framework for Processing Probabilistic Data},
journal = {{ACM} Trans. Database Syst.},
volume = {41},
number = {1},
pages = {3:1--3:44},
year = {2016}
}
@inproceedings{DBLP:conf/sigmod/GaoLPJ17,
author = {Zekai J. Gao and
Shangyu Luo and
Luis Leopoldo Perez and
Chris Jermaine},
title = {The {BUDS} Language for Distributed Bayesian Machine Learning},
booktitle = {{SIGMOD} Conference},
pages = {961--976},
publisher = {{ACM}},
year = {2017}
}
@inproceedings{DBLP:conf/sigmod/CaiVPAHJ13,
author = {Zhuhua Cai and
Zografoula Vagena and
Luis Leopoldo Perez and
Subramanian Arumugam and
Peter J. Haas and
Christopher M. Jermaine},
title = {Simulation of database-valued markov chains using SimSQL},
booktitle = {{SIGMOD} Conference},
pages = {637--648},
publisher = {{ACM}},
year = {2013}
}
@inproceedings{kumari:2016:qdb:communicating,
author = {Kumari, Poonam and Achmiz, Said and Kennedy, Oliver},
title = {Communicating Data Quality in On-Demand Curation},
booktitle = {QDB},
year = {2016}
}
@inproceedings{feng:2021:sigmod:efficient,
author = {Feng, Su and Glavic, Boris and Huber, Aaron and Kennedy, Oliver},
title = {Efficient Uncertainty Tracking for Complex Queries with Attribute-level Bounds},
booktitle = {SIGMOD},
year = {2021}
}

BIN
Sketching Worlds/main.pdf Normal file

Binary file not shown.

Binary file not shown.

196
Sketching Worlds/main.tex Normal file
View File

@ -0,0 +1,196 @@
\documentclass[sigconf, prologue, table]{acmart}
\AtBeginDocument{%
\providecommand\BibTeX{{%
\normalfont B\kern-0.5em{\scshape i\kern-0.25em b}\kern-0.8em\TeX}}}
\setcopyright{acmcopyright}
\copyrightyear{2022}
\acmYear{2022}
\acmDOI{XXXXXXX.XXXXXXX}
\acmConference[Conference acronym 'XX]{Make sure to enter the correct
conference title from your rights confirmation email}{June 03--05,
2018}{Woodstock, NY}
\acmPrice{15.00}
\acmISBN{978-1-4503-XXXX-X/18/06}
\usepackage{xcolor}%for rebuttal document, in particular \rowcolor
\usepackage{caption}%caption for table
\usepackage{cellspace}%padding of tabular cells
\usepackage{relsize}%\mathlarger
\usepackage{algpseudocode}
\usepackage{algorithm}
\usepackage{tikz}
\usepackage{tikz-qtree}
\usepackage{comment}
\let\endproof\relax
\usepackage{amsmath}
\newcommand\hmmax{0} % default 3
\newcommand\bmmax{0} % default 4
\usepackage{bm}%for math mode bold font
\usepackage{amsthm}
\usepackage{mathtools}
\usepackage{etoolbox}
\usepackage{xstring} %for conditionals in \newcommand
\usepackage{stmaryrd}
\usepackage[normalem]{ulem}
\usepackage{subcaption}
\usepackage{booktabs}
\usepackage{todonotes}
\usepackage{graphicx}
\usepackage{listings}
%%%%%%%%%% SQL + proveannce listing settings
\usepackage{mdframed}
\lstdefinestyle{psql}
{
tabsize=2,
basicstyle=\small\upshape\ttfamily,
language=SQL,
morekeywords={PROVENANCE,BASERELATION,INFLUENCE,COPY,ON,TRANSPROV,TRANSSQL,TRANSXML,CONTRIBUTION,COMPLETE,TRANSITIVE,NONTRANSITIVE,EXPLAIN,SQLTEXT,GRAPH,IS,ANNOT,THIS,XSLT,MAPPROV,cxpath,OF,TRANSACTION,SERIALIZABLE,COMMITTED,INSERT,INTO,WITH,SCN,UPDATED,LENS,SCHEMA_MATCHING,string,WINDOW,max,OVER,PARTITION,FIRST_VALUE,WITH},
extendedchars=false,
keywordstyle=\bfseries,
mathescape=true,
escapechar=@,
sensitive=true
}
\lstset{style=psql}
%%%%%%%%%%%%%%%%%%
\usepackage{wrapfig}
\usepackage{fancyvrb}
\usepackage{caption}
\usepackage{subcaption}
\usepackage{braket}
\usepackage[inline]{enumitem}
\usepackage{xspace}
\usepackage{hyperref}
\usepackage{url}
\usepackage{cleveref}
\usepackage{color}
\graphicspath{ {figures/} }
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\usepackage{outlines}%For outline capabilities
\usepackage{enumitem}%used in tandem with outlines package
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\input{macros}
% reference names
\crefname{example}{ex.}{ex.}
\Crefname{example}{Ex.}{Ex.}
\Crefname{figure}{Fig.}{Fig.}
\Crefname{section}{Sec.}{Sec.}
\Crefname{definition}{Def.}{Def.}
\Crefname{theorem}{Thm.}{Thm.}
\Crefname{lemma}{Lem.}{Lem.}
\crefname{equation}{eq.}{eq.}
\Crefname{equation}{Eq.}{Eq.}
%%%%%%%%%%%%%%%%%%%%
\begin{document}
\title{Computing expected multiplicities for bag-TIDBs with bounded multiplicities}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\author{Su Feng}\email{sfeng14@hawk.iit.edu}
\author{Boris Glavic}\email{bglavic@iit.edu}
\affiliation{%
\institution{Illinois Institute of Technology}
\city{Chicago}
\state{Illinois}
\country{USA}
}
\author{Aaron Huber}\email{ahuber@buffalo.edu}
\author{Oliver Kennedy}\email{okennedy@buffalo.edu}
\author{Atri Rudra}\email{atri@buffalo.edu}
\affiliation{%
\institution{University at Buffalo}
\city{Buffalo}
\state{New York}
\country{USA}
}
\renewcommand{\shortauthors}{Huber, Kennedy, Rudra, et al.}
\begin{abstract}
\input{abstract}
\end{abstract}
\begin{CCSXML}
<ccs2012>
<concept>
<concept_id>10002951.10003227</concept_id>
<concept_desc>Information systems~Information systems applications</concept_desc>
<concept_significance>500</concept_significance>
</concept>
<concept>
<concept_id>10002951.10002952</concept_id>
<concept_desc>Information systems~Data management systems</concept_desc>
<concept_significance>500</concept_significance>
</concept>
<concept>
<concept_id>10003752.10003753.10003757</concept_id>
<concept_desc>Theory of computation~Probabilistic computation</concept_desc>
<concept_significance>500</concept_significance>
</concept>
<concept>
<concept_id>10003752.10003777.10003778</concept_id>
<concept_desc>Theory of computation~Complexity classes</concept_desc>
<concept_significance>500</concept_significance>
</concept>
</ccs2012>
\end{CCSXML}
\ccsdesc[500]{Information systems~Information systems applications}
\ccsdesc[500]{Information systems~Data management systems}
\ccsdesc[500]{Theory of computation~Probabilistic computation}
\ccsdesc[500]{Theory of computation~Complexity classes}
\keywords{probabilistic data model, parameterized complexity, fine-grained complexity, lineage polynomial}
\maketitle
\lstset{language=sql}
\input{introduction}
\input{binarybidb}
\input{pwsem}
\input{prob-def}
\input{mult_distinct_p}
\input{single_p}
\input{approx_alg}
\input{related-work}
\input{conclusions}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{acks}
\input{acknowledgements}
\end{acks}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\bibliographystyle{ACM-Reference-Format}
\bibliography{main}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% APPENDIX
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\clearpage
\appendix
\normalsize
\input{appendix}
\input{related-work-extra}
\end{document}

View File

@ -0,0 +1,87 @@
%root:main.tex
%!TEX root=./main.tex
\section{Hardness of Exact Computation}
\label{sec:hard}
In this section, we prove the hardness results claimed in Table~\ref{tab:lbs} via a specific family of hard instances $(\qhard,\pdb)$ for \Cref{prob:bag-pdb-poly-expected}, where $\pdb$ is a $1$-\abbrTIDB.
Note that this implies hardness for \abbrCTIDB\xplural $\inparen{\bound\geq1}$, showing \Cref{prob:bag-pdb-poly-expected} cannot be done in $\bigO{\qruntime{\optquery{\query},\tupset,\bound}}$ runtime. The results also apply to \abbrOneBIDB and other more general \abbrPDB\xplural.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Preliminaries}\label{sec:hard:sub:pre}
Our hardness results are based on (exactly) counting the number of (not necessarily induced) subgraphs in $G$ isomorphic to $H$. Let $\numocc{G}{H}$ denote this quantity. We can think of $H$ as being of constant size and $G$ as growing.
In particular, we will consider the problems of computing the following counts (given $G$ in its adjacency list representation): $\numocc{G}{\tri}$ (the number of triangles), $\numocc{G}{\threedis}$ (the number of $3$-matchings), and the latter's generalization $\numocc{G}{\kmatch}$ (the number of $k$-matchings). We use $\kmatchtime$ to denote the optimal runtime of computing $\numocc{G}{\kmatch}$ exactly. Our hardness results in \Cref{sec:multiple-p} are based on the following hardness results/conjectures:
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Theorem}[\cite{k-match}]
\label{thm:k-match-hard}
Given positive integer $k$ and undirected graph $G=(\vset,\edgeSet)$ with no self-loops or parallel edges, $\kmatchtime\ge \littleomega{f(k)\cdot \abs{\edgeSet}^c}$ for any function $f$ and any constant $c$ independent of $\abs{\edgeSet}$ and $k$ (assuming $\sharpwzero\ne\sharpwone$).
\end{Theorem}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{hypo}\label{conj:known-algo-kmatch}
There exists an absolute constant $c_0>0$ such that for every $G=(\vset,\edgeSet)$, we have $\kmatchtime \ge \Omega\inparen{\abs{\edgeSet}^{c_0\cdot k}}$ for large enough $k$.
\end{hypo}
We note that the above conjecture is somewhat non-standard. In particular, the best known algorithm to compute $\numocc{G}{\kmatch}$ takes time $\Omega\inparen{|V|^{k/2}}$ (i.e. if this is the best algorithm then $c_0=\frac 14$)~\cite{k-match}. What the above conjecture is saying is that one can only hope for a polynomial improvement over the state of the art algorithm to compute $\numocc{G}{\kmatch}$.
%
Our hardness result in Section~\ref{sec:single-p} is based on the following conjectured hardness result:
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{hypo}
\label{conj:graph}
There exists a constant $\eps_0>0$ such that given an undirected graph $G=(\vset,\edgeSet)$, computing $\numocc{G}{\tri}$ exactly cannot be done in time $o\inparen{|\edgeSet|^{1+\eps_0}}$.
\end{hypo}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
The so-called \emph{Triangle detection hypothesis} (cf.~\cite{triang-hard}), which states that detecting the presence of triangles in $G$ takes time $\Omega\inparen{|\edgeSet|^{4/3}}$, implies that in Conjecture~\ref{conj:graph} we can take $\eps_0\ge \frac 13$.
All of our hardness results rely on a simple lineage polynomial encoding of the edges of a graph.
To prove our hardness result, consider a graph $G=(\vset, \edgeSet)$, where $|\edgeSet| = m$, $\vset = [\numvar]$. Our lineage polynomial has a variable $X_i$ for every $i$ in $[\numvar]$.
Consider the polynomial
$\poly_{G}(\vct{X}) = \sum\limits_{(i, j) \in \edgeSet} X_i \cdot X_j.$
The hard polynomial for our problem will be a suitable power $k\ge 3$ of the polynomial above:
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Definition}\label{def:qk}
For any graph $G=(V,\edgeSet)$ and $\kElem\ge 1$, define
\[\poly_{G}^\kElem(X_1,\dots,X_n) = \left(\sum\limits_{(i, j) \in \edgeSet} X_i \cdot X_j\right)^\kElem.\]
\end{Definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\noindent Returning to \Cref{fig:two-step}, it can be seen that $\poly_{G}^\kElem(\vct{X})$ is the lineage polynomial from query $\qhard^k$, which we define next ($\query_2$ from~\Cref{sec:intro} is the same query with $k=2$). Let us alias
\begin{lstlisting}
SELECT DISTINCT 1 FROM T $t_1$, R r, T $t_2$
WHERE $t_1$.Point = r.Point$_1$ AND $t_2$.Point = r.Point$_2$
\end{lstlisting}
as $R$. The query $\qhard^k$ then becomes
\mdfdefinestyle{underbrace}{topline=false, rightline=false, bottomline=false, leftline=false, backgroundcolor=black!15!white, innerbottommargin=0pt}
\begin{mdframed}[style=underbrace]
\begin{lstlisting}
SELECT COUNT(*) FROM $\underbrace{R\text{ JOIN }R\text{ JOIN }\cdots\text{ JOIN }R}_{k\text{ times}}$
\end{lstlisting}
\end{mdframed}
\noindent Consider again the \abbrCTIDB instance $\pdb$ of~\Cref{fig:two-step} and, for our hard instance, let $\bound = 1$. $\pdb$ generalizes to one compatible with~\Cref{def:qk} as follows. Relation $T$ has $n$ tuples, one for each vertex $i \in [n]$, each with probability $\prob$, and $R$ has tuples corresponding to the edges $\edgeSet$ (each with probability $1$).\footnote{Technically, $\poly_{G}^\kElem(\vct{X})$ should have variables corresponding to tuples in $R$ as well, but since they always are present with probability $1$, we drop those. Our argument also works when all the tuples in $R$ also are present with probability $\prob$ but to simplify notation we assign probability $1$ to edges.}
In other words, this instance $\tupset$ contains the set of $\numvar$ unary tuples in $T$ (which corresponds to $\vset$) and $\numedge$ binary tuples in $R$ (which corresponds to $\edgeSet$).
Note that this implies that $\poly_{G}^\kElem$ is indeed a $1$-\abbrTIDB lineage polynomial.
Next, we note that the runtime for answering $\qhard^k$ on deterministic database $\tupset$, as defined above, is $O_k\inparen{\numedge}$ (i.e. deterministic query processing is `easy' for this query):
\begin{Lemma}\label{lem:tdet-om}
Let $\qhard^k$ and $\tupset$ be as defined above. Then
$\qruntimenoopt{\qhard^k, \tupset}$ is $O_k\inparen{\numedge}$.
\end{Lemma}
\subsection{Multiple Distinct $\prob$ Values}
\label{sec:multiple-p}
We are now ready to present our main hardness result.
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Theorem}\label{thm:mult-p-hard-result}
Let $\prob_0,\ldots,\prob_{2k}$ be $2k + 1$ distinct values in $(0, 1]$. Then computing $\rpoly_G^\kElem(\prob_i,\dots,\prob_i)$ (over all $i\in [2k+1]$) for arbitrary $G=(\vset,\edgeSet)$
needs time $\bigOmega{\kmatchtime}$, assuming $\kmatchtime\ge \omega\inparen{\abs{\edgeSet}}$.
\end{Theorem}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
Note that the second row of \Cref{tab:lbs} follows from \Cref{prop:expection-of-polynom}, \Cref{thm:mult-p-hard-result}, \Cref{lem:tdet-om}, and \Cref{thm:k-match-hard} while the third row is proved by \Cref{prop:expection-of-polynom}, \Cref{thm:mult-p-hard-result}, \Cref{lem:tdet-om}, and \Cref{conj:known-algo-kmatch}. Since \Cref{conj:known-algo-kmatch} is non-standard, the latter hardness result should be interpreted as follows. Any substantial polynomial improvement for \Cref{prob:bag-pdb-poly-expected} (over the trivial algorithm that converts $\poly$ into SMB and then uses \Cref{cor:expct-sop} for \abbrStepTwo) would lead to an improvement over the state of the art {\em upper} bounds on $\kmatchtime$. Finally, note that \Cref{thm:mult-p-hard-result} needs one to be able to compute the expected multiplicities over $(2k+1)$ distinct values of $p_i$, each of which corresponds to distinct $\bpd$ (for the same $\tupset$), which explains the `Multiple' entry in the second column in the second and third row in \Cref{tab:lbs}. Next, we argue how to get rid of this latter requirement.
%%% Local Variables:
%%% mode: latex
%%% TeX-master: "main"
%%% End:

View File

@ -0,0 +1,67 @@
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Previous}
\begin{Definition}[$\bi$~\cite{DBLP:series/synthesis/2011Suciu}]
A Block Independent Database ($\bi$) is a PDB whose tuples are partitioned in blocks, where we denote block $i$ as $\block_i$. Each $\block_i$ is independent of all other blocks, while all tuples sharing the same $\block_i$ are mutually exclusive.
\end{Definition}
\begin{Definition}[$\ti$]
A Tuple Independent Database ($\ti$) is a special case of a $\bi$ such that each tuple is its own block.
\end{Definition}
\subsection{Modeling and Semantics}
Define $\vct{X}$ to be the vector of variables $X_1,\dots,X_M$. Let the set of all tuples in domain of $\sch(\db)$ be $\tset$.
\subsubsection{K-relations}\label{subsubsec:k-rel}
The information encoded in the annotation depends on the underlying semiring of the relation.
As noted in \cite{DBLP:conf/pods/GreenKT07}, the $\mathbb{N}[\vct{X}]$-semiring is a semiring over the set $\mathbb{N}[\vct{X}]$ of all polynomials, whose variables can then be substituted with $K$-values from other semirings, evaluating the operators with the operators of the substituted semiring, to produce varying semantics such as set, bag, and security.
Further define $\nxdb$ as an $\mathbb{N}[\vct{X}]$ database where each tuple $\tup \in \db$ is annotated with a polynomial over variables $X_1,\ldots, X_M$.
Since $\nxdb$ is a database that maps tuples to polynomials, it is customary for arbitrary table $\rel$ to be viewed as a function $\rel: \tset \mapsto \mathbb{N}[\vct{X}]$, where $\rel(\tup)$ denotes the polynomial annotating tuple $\tup$.
It has been shown in previous work that commutative semirings precisely model translations of RA+ query operations to $K$-annotations.
%The evalution semantics notation $\llbracket \cdot \rrbracket = x$ simply mean that the result of evaluating expression $\cdot$ is given by following the semantics $x$.
Given a query $\query$, operations in $\query$ are translated into the following polynomial expressions.
\begin{align*}
&\evald{\project_A(\rel)}{\db}(\tup)&& = &&\sum_{\tup': \project_A(\tup) = \tup} \evald{\rel}{\db}(\tup')\\
&\evald{(\rel_1 \union \rel_2)}{\db}(\tup)&& = &&\evald{\rel_1}{\db}(\tup) + \evald{\rel_2}{\db}(\tup)\\
&\evald{(\rel_1 \join \rel_2)}{\db}(\tup) && = &&\evald{\rel_1}{\db}(\project_{\sch(\rel_1)}(\tup)) \times \evald{\rel_2}{\db}(\project_{\sch(\rel_2)}(\tup)) \\
&\evald{\select_\theta(\rel)}{\db}(\tup) && = &&\begin{cases}
\evald{\rel}{\db}(\tup) &\text{if }\theta(\tup) = 1\\
0 &\text{otherwise}.
\end{cases}\\
&\evald{R}{\db}(\tup) && = &&\rel(\tup)
\end{align*}
The above semantics show us how to obtain the $K$-annotation on a tuple in the result of query $\query$ from the annotations of the input tuples. When used with $\mathbb B$-typed variables, an $\mathbb{N}[\vct{X}]$ relation is effectively a C-Table \cite{DBLP:conf/pods/GreenKT07}, since all first order formulas can be equivalently modeled by polynomials, where $\oplus$ is disjunction and $\otimes$ is conjunction.
This is the equivalent to substituting values and operators from the $\{\mathbb{B}, \vee, \wedge, \bot, \top\}$ semiring. In like manner, when assigning values from the $\mathbb{N}$ domain, the polynomials then model bag semantics, where the variables and $\oplus$ and $\otimes$ operations come from the natural numbers semiring $\{\mathbb{N}, +, \times, 0, 1\}$.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Defining the Data}\label{subsec:def-data}
For the set of possible worlds, $\wSet$, i.e. the set of all $\db_i \in \idb$, define an injective mapping to the set $\{0, 1\}^M$, where for each vector $\vct{w} \in \{0, 1\}^M$ there is at most one element $\db_i \in \idb$ mapped to $\vct{w}$.
In the general case, the binary value of $\vct{w}$ uniquely identifies a potential possible world. For example, consider the case of the Tuple Independent Database $(\ti)$ data model in which each table is a set of tuples, each of which is independent of one another, and individually occur with a specific probability $\prob_\tup$. Because of independence, a $\ti$ with $\numvar$ tuples naturally has $2^\numvar$ possible worlds, thus $\numvar = M$, and the injective mapping for each $\vct{w} \in \{0, 1\}^M$ is trivial. In the Block Independent Disjoint data model ($\bi$), because of the disjoint condition on tuples within the same block, a $\bi$ may not have exactly $2^M$ possible worlds since there are combinations of tuples that cannot exist in the encoding.
Denote a random variable selecting a world according to distribution $P$ to be $\rw$. Provided that for any non-possible world $\vct{w} \in \{0, 1\}^M, \pd[\rw = \vct{w}] = 0$, a probability distribution over $\{0, 1\}^M$ is a distribution over $\Omega$.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%This could be a way to think of world binary vectors in the general case
%Let $\vct{w}$ be a $\left\lceil\log_2\left(\left|\wSet\right|\right)\right\rceil = \numvar$ binary bit vector, uniquely identifying possible world $\db_i \in \idb$.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
From this point on our discussion focuses on exactly one specific tuple $\tup$. Thus, we abuse notation by using $\poly(\vct{X})$ to be the annotated polynomial $\llbracket\poly(\db)\rrbracket(\tup)$, and for a domain of $\{0, 1\}$ for each $X_i \in \vct{X}$, the injective mapping maps $\db$ to $\vct{X}$.
One of the aggregates we desire to compute over the annotated polynomial is the expectation over possible worlds, denoted,
\[\expct_{\vct{\rw} \sim \pd}\pbox{\poly(\rw)} = \sum\limits_{\vct{w} \in \{0, 1\}^\numvar} \poly(\vct{w})\cdot \pd[\rw = \vct{w}].\]
Above, $\poly(\vct{w})$ is used to mean the assignment of $\vct{w}$ to $\vct{X}$.
For a $\ti$, the bit-string world value $\vct{w}$ can be used as indexing to determine which tuples are present in the $\vct{w}$ world, where the $i^{th}$ bit position $(\wbit_i)$ represents whether a tuple $\tup_i$ appears in the unique world $\vct{w}$. Denote the vector $\vct{p}$ to be a vector whose elements are the individual probabilities $\prob_i$ of each tuple $\tup_i$ such that those probabilities produce the possible worlds in D with a distribution $\pd$ over all worlds. Let $\pd^{(\vct{p})}$ represent the distribution induced by $\vct{p}$.
\[\expct_{\rw\sim \pd^{(\vct{p})}}\pbox{\poly(\rw)} = \sum\limits_{\vct{w} \in \{0, 1\}^\numvar} \poly(\vct{w})\prod_{\substack{i \in [\numvar]\\ s.t. \wElem_i = 1}}\prob_i \prod_{\substack{i \in [\numvar]\\s.t. w_i = 0}}\left(1 - \prob_i\right).\]

Binary file not shown.

View File

@ -0,0 +1,40 @@
%root: main.tex
\paragraph{Outline of 1st ICDT Introduction Submission}
\begin{outline}[enumerate]
\1 Problem Introduction and Background
\2 Set-\abbrPDB notation, concepts, common (known) results \isIncluded{I. B. vi.}
\3 \notIncluded{Dichotomy} \isIncluded{Can be included in I. B. vi.}
\3 Exact Computation \sharpphard \isIncluded{I. B. vi.}
\2 Formal definition of expected result multiplicity \isIncluded{Can be included in I. C. ii. a.}
\2 \notIncluded{Example} \isIncluded{Can be used potentially sometime after I. C. ii. a.; maybe around D.}
\3 Assumed setting of \emph{set inputs} \isIncluded{I. D.}
\3 \notIncluded{Example based on explaining and motivating formal definition of expected result multiplicity}
\1 \notIncluded{Discussion of set-\abbrPDB\xplural} \isIncluded{Perhaps this might be useful around I. B. vi.}
\2 \notIncluded{Lineage from PosBool$[\vct{X}]$}
\2 \notIncluded{Encoding of possible worlds via $\vct{X}$}
\2 \notIncluded{Computing probability can be done using only the lineage}
\1 \notIncluded{Discussion of bag-\abbrPDB\xplural} \isIncluded{Might be useful around I. C.}
\2 \notIncluded{Link to $\domN[\vct{X}]$}
\2 \notIncluded{Link to computing the expected count of a lineage polynomial}
\2 \notIncluded{Example to illustrate computing an expected count over a lineage polynomial}
\1 \isIncluded{Computing expected multiplicity for an \abbrSOP representation versus a factorized representation -- I. B. iii. and I. B. iv.}
\2 Linear for \abbrSOP
\2 Introduce the problem by asking if it's linear in the size of the representation for factorized representation produced by such query optimizations as projection push-down.
\2 State our theoretical results (informally) that it is not linear in general
\1 \notIncluded{Contributions}
\2 \notIncluded{Hardness results for the expected multiplicity problem}
\3 \notIncluded{Reduction to counting $\kElem$-matchings}
\2 \isIncluded{Introduce our approximation algorithm and its guarantees -- I. B. v.}
\2 \notIncluded{Generalization to bag-\abbrPDB\xplural}
\2 \notIncluded{Result over $\raPlus$ queries }
\2 \isIncluded{Higher moments I. C. e.}
\1 \notIncluded{Overview of our techniques}
\2 \notIncluded{Informal introduction to $\rpoly$ with example}
\2 \notIncluded{Definition of reduced polynomial}
\2 \notIncluded{Equivalence of $\rpoly$ and computing $\expct$}
\2 \notIncluded{Further details into the technique of obtaining our hardness result}
\1 \notIncluded{Paper organization}
\2 \notIncluded{Also includes evaluation semantics figure}
\end{outline}

View File

@ -0,0 +1,104 @@
%root: main.tex
%!TEX root=./main.tex
\subsection{Formalizing \Cref{prob:intro-stmt}}\label{sec:expression-trees}
We focus on the problem of computing $\expct_{\worldvec\sim\pdassign}\pbox{\apolyqdt\inparen{\vct{\randWorld}}}$ from now on, assume implicit $\query, \tupset, \tup$, and drop them from $\apolyqdt$ (i.e., $\poly\inparen{\vct{X}}$ will denote a polynomial).
\Cref{prob:intro-stmt} asks if there exists a linear time approximation algorithm in the size of a given circuit \circuit which encodes $\poly\inparen{\vct{X}}$. Recall that in this work we
represent lineage polynomials via {\em arithmetic circuits}~\cite{arith-complexity}, a standard way to represent polynomials over fields (particularly in the field of algebraic complexity) that we use for polynomials over $\mathbb N$ in the obvious way. Since we are specifically using circuits to model lineage polynomials, we can refer to these circuits as lineage circuits. However, when the meaning is clear, we will drop the term lineage and only refer to them as circuits.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Definition}[Circuit]\label{def:circuit}
A circuit $\circuit$ is a Directed Acyclic Graph (DAG) whose source gates (in degree of $0$) consist of elements in either $\domN$ or $\vct{X} = \inparen{X_1,\ldots,X_\numvar}$. For each result tuple there exists one sink gate. The internal gates have binary input and are either sum ($\circplus$) or product ($\circmult$) gates.
%
Each gate has the following members: \type, \vari{input}, \val, \vpartial, \degval, \vari{Lweight}, and \vari{Rweight}, where \type is the value type $\{\circplus, \circmult, \var, \tnum\}$ and \vari{input} the list of inputs. Source gates have an extra member \val storing the value. $\circuit_\linput$ ($\circuit_\rinput$) denotes the left (right) input of \circuit.
\end{Definition}
When the underlying DAG is a tree (with edges pointing towards the root), the structure is an expression tree \etree. In such a case, the root of \etree is analogous to the sink of \circuit. The fields \vari{partial}, \degval, \vari{Lweight}, and \vari{Rweight} are used in the proofs of \Cref{sec:proofs-approx-alg}.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
The circuits in \Cref{fig:two-step} encode their respective polynomials in column $\poly$.
Note that the circuit \circuit representing $AX$ and the circuit \circuit' representing $B\inparen{Y+Z}$ each encode a tree, with edges pointing towards the root.
\begin{wrapfigure}{L}{0.45\linewidth}
\centering
\begin{tikzpicture}[thick]
\node[tree_node] (a1) at (0, 0) {$\boldsymbol{X}$};
\node[tree_node] (b1) at (1.5, 0) {$\boldsymbol{2}$};
\node[tree_node] (c1) at (3, 0) {$\boldsymbol{Y}$};
\node[tree_node] (d1) at (4.5, 0) {$\boldsymbol{-1}$};
\node[tree_node] (a2) at (0.75, 0.75) {$\boldsymbol{\circmult}$};
\node[tree_node] (b2) at (2.25, 0.75) {$\boldsymbol{\circmult}$};
\node[tree_node] (c2) at (3.75, 0.75) {$\boldsymbol{\circmult}$};
\node[tree_node] (a3) at (0.55, 1.5) {$\boldsymbol{\circplus}$};
\node[tree_node] (b3) at (3.75, 1.5) {$\boldsymbol{\circplus}$};
\node[tree_node] (a4) at (2.25, 2.25) {$\boldsymbol{\circmult}$};
\draw[->] (a1) -- (a2);
\draw[->] (a1) -- (a3);
\draw[->] (b1) -- (a2);
\draw[->] (b1) -- (b2);
\draw[->] (c1) -- (c2);
\draw[->] (c1) -- (b2);
\draw[->] (d1) -- (c2);
\draw[->] (a2) -- (b3);
\draw[->] (b2) -- (a3);
\draw[->] (c2) -- (b3);
\draw[->] (a3) -- (a4);
\draw[->] (b3) -- (a4);
\draw[->] (a4) -- (2.25, 2.75);
\end{tikzpicture}
\caption{Circuit encoding of $(X + 2Y)(2X - Y)$}
\label{fig:circuit}
\end{wrapfigure}
We next formally define the relationship of circuits with polynomials. While the definition assumes one sink for notational convenience, it easily generalizes to the multiple sinks case.
\begin{Definition}[$\polyf(\cdot)$]\label{def:poly-func}
$\polyf(\circuit)$ maps the sink of circuit $\circuit$ to its corresponding polynomial (in \abbrSMB). $\polyf(\cdot)$ is recursively defined on $\circuit$ as follows, with addition and multiplication following the standard interpretation for polynomials:
\begin{equation*}
\polyf(\circuit) = \begin{cases}
\polyf(\circuit_\linput) + \polyf(\circuit_\rinput) &\text{ if \circuit.\type } = \circplus\\
\polyf(\circuit_\linput) \cdot \polyf(\circuit_\rinput) &\text{ if \circuit.\type } = \circmult\\
\circuit.\val &\text{ if \circuit.\type } = \var \text{ OR } \tnum.
\end{cases}
\end{equation*}
\end{Definition}
$\circuit$ need not encode $\poly\inparen{\vct{X}}$ in the same, default \abbrSMB representation. For instance, $\circuit$ could encode the factorized representation $(X + 2Y)(2X - Y)$ of $\poly\inparen{\vct{X}} = 2X^2+3XY-2Y^2$, as shown in \Cref{fig:circuit}, while $\polyf(\circuit) = \poly\inparen{\vct{X}}$ is always the equivalent \abbrSMB representation.
\begin{Definition}[Circuit Set]\label{def:circuit-set}
$\circuitset{\polyX}$ is the set of all possible circuits $\circuit$ such that $\polyf(\circuit) = \polyX$.
\end{Definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
The circuit of \Cref{fig:circuit} is an element of $\circuitset{2X^2+3XY-2Y^2}$. One can think of $\circuitset{\polyX}$ as the infinite set of circuits where for each element \circuit, $\polyf\inparen{\circuit} = \polyX$.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\medskip
\noindent We are now ready to formally state the final version of \Cref{prob:intro-stmt}.%our \textbf{main problem}.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Definition}[The Expected Result Multiplicity Problem]\label{def:the-expected-multipl}
Let $\pdb'$ be an arbitrary \abbrCTIDB and $\vct{X}$ be the set of variables annotating tuples in $\tupset'$. Fix an $\raPlus$ query $\query$ and a result tuple $\tup$.
The \expectProblem is defined as follows:\\[-7mm]
\begin{center}
\textbf{Input}: $\circuit \in \circuitset{\polyX}$ for $\poly'\inparen{\vct{X}} = \poly'\pbox{\query,\tupset',\tup}$
\hspace*{2mm}
\textbf{Output}: $\expct_{\vct{W} \sim \bpd}\pbox{\poly'\pbox{\query, \tupset', \tup}\inparen{\vct{W}}}$
\end{center}
\end{Definition}
\input{circuits-model-runtime}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% Local Variables:
%%% mode: latex
%%% TeX-master: "main"
%%% End:

View File

@ -0,0 +1,56 @@
%root: main.tex
%!TEX root = ./main.tex
\iffalse
\begin{Example}\label{example:qtilde}
Consider $\poly(X, Y) = (X + Y)(X + Y)$ where $X$ and $Y$ are from different blocks. The expanded derivation for $\rpoly(X, Y)$ is
\begin{align*}
(&X^2 + 2XY + Y^2 \mod X^2 - X) \mod Y^2 - Y\\
= ~&X + 2XY + Y^2 \mod Y^2 - Y\\
= ~& X + 2XY + Y
\end{align*}
\end{Example}
\fi
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Let $\abs{\poly}$ be the number of operators in $\poly$.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Corollary}\label{cor:expct-sop}
If $\poly$ is a $1$-\abbrBIDB lineage polynomial already in \abbrSMB, then the expectation of $\poly$, i.e., $\expct\pbox{\poly} = \rpoly\left(\prob_1,\ldots, \prob_\numvar\right)$ can be computed in $\bigO{\abs{\poly}}$ time.
\end{Corollary}
\subsubsection{Possible World Semantics}\label{subsub:possible-world-sem}
In this section, we show how the traditional possible worlds semantics corresponds to our setup. Readers can safely skip this part without missing anything vital to the results of this paper.
Queries over probabilistic databases are traditionally viewed as being evaluated using the so-called possible world semantics. A general bag-\abbrPDB can be defined as the pair $\pdb = \inparen{\Omega, \bpd}$ where $\Omega$ is the set of possible worlds represented by $\pdb$ and $\bpd$ the probability distribution over $\Omega$. Under the possible world semantics, the result of a query $\query$ over an incomplete database $\Omega$ is the set of query answers produced by evaluating $\query$ over each possible world $\omega\in\Omega$: $\inset{\query\inparen{\omega}: \omega\in\Omega}$.
The result of a query is the pair $\inparen{\query\inparen{\Omega}, \bpd'}$ where $\bpd'$ is a probability distribution that assigns to each possible query result the sum of the probabilities of the worlds that produce this answer: $\probOf\pbox{\omega\in\Omega} = \sum_{\substack{\omega'\in\Omega,\\\query\inparen{\omega'}=\query\inparen{\omega}}}\probOf\pbox{\omega'}$.
Suppose that $\pdb'$ is a reduced \abbrOneBIDB from \abbrCTIDB $\pdb$ as defined by~\Cref{def:ctidb-reduct}. Instead of looking only at the possible worlds of $\pdb'$, one can consider the set of all worlds, including those that cannot exist due to, e.g., disjointness. Since $\abs{\tupset} = \numvar$, the set of all worlds can be modeled by $\worldvec\in \{0, 1\}^{\numvar\bound}$, such that $\worldvec_{\tup, j} \in \worldvec$ represents whether or not the multiplicity of $\tup$ is $j$ (\emph{here and later, especially in \Cref{sec:algo}, we will rename the variables as $X_1,\dots,X_{\numvar'}$, where $\numvar'=\sum_{\tup\in\tupset}\abs{\block_\tup}$}).
\footnote{
In this example, $\abs{\block_\tup} = \bound$ for all $\tup$.
}
We can denote a probability distribution over all $\worldvec \in \{0, 1\}^{\numvar\bound}$ as $\bpd'$. When $\bpd'$ is the one induced from each $\prob_{\tup, j}$ while assigning $\probOf\pbox{\worldvec} = 0$ for any $\worldvec$ with $\worldvec_{\tup, j}, \worldvec_{\tup, j'} \neq 0$ for $j\neq j'$, we end up with a bijective mapping from $\bpd$ to $\bpd'$, such that each mapping is equivalent, implying that the distributions are equivalent, and thus so are the query results.
\Cref{subsec:supp-mat-ti-bi-def} has more details. \medskip
We now make a meaningful connection between possible world semantics and world assignments on the lineage polynomial.
\begin{Proposition}[Expectation of polynomials]\label{prop:expection-of-polynom}
Given a \abbrBPDB $\pdb = (\Omega,\bpd)$, $\raPlus$ query $\query$, and lineage polynomial $\apolyqdt$ for arbitrary result tuple $\tup$,
we have (denoting $\randDB$ as the random variable over $\Omega$):
$ \expct_{\randDB \sim \bpd}[\query(\randDB)(t)] = \expct_{\vct{\randWorld}\sim \pdassign}\pbox{\apolyqdt\inparen{\vct{\randWorld}}}. $
\end{Proposition}
\noindent A formal proof of \Cref{prop:expection-of-polynom} is given in \Cref{subsec:expectation-of-polynom-proof}.\footnote{Although \Cref{prop:expection-of-polynom} follows, e.g., as an obvious consequence of~\cite{IL84a}'s Theorem 7.1, we are unaware of any formal proof for bag-probabilistic databases.}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% Local Variables:
%%% mode: latex
%%% TeX-master: "main"
%%% End:

View File

@ -0,0 +1,307 @@
%root: main.tex
%!TEX root=./main.tex
\definecolor{GrayRew}{gray}{0.85}
\newcommand{\RCOMMENT}[1]{\medskip\noindent \begin{tabular}{|p{\linewidth-3ex}}\rowcolor{GrayRew} #1 \end{tabular}\smallskip\\}
\section{Response to first cycle reviewer comments}
\label{sec:rebuttal}
This paper is a resubmission of our submission to the ICDT first cycle. We thank the reviewers for their insightful comments, which we believe have helped improve the presentation of the paper tremendously.
We use this section to document the changes that have been made since our prior submission, and in particular, how we have addressed reviewer comments (reviewer comments are shaded followed by our responses).
\subsection{Meta Review}
\RCOMMENT{Problem definition not stated rigorously nor motivated. Discussion needed on the standard PDB approach vs your approach.}
We rewrote \Cref{sec:intro} to specifically address this concern. The opening paragraph precisely and formally states the query evaluation problem in \abbrBPDB\xplural. We use a series of problem statements to clearly define the problem we are addressing as it relates to the query evaluation problem.
We made the concrete problem statements more precise by more clearly formalizing $\qruntime{Q, \dbbase}$ and stating our runtime objectives relative to it (\Cref{prob:informal},~\ref{prob:big-o-joint-steps},~\ref{prob:intro-stmt}).
%Notably, explicit discussion of provenance polynomials is limited to the proofs in the appendices.
We have included a discussion of the standard approach, e.g. see the paragraph \textbf{Relationship to Set-Probabilistic Query Evaluation} on page 4.
\RCOMMENT{Definition 2.6 on reduced BIDB polynomials seem not the right tool for the studied problem.}
We have chosen to stick with a less formal, ad-hoc definition (please see \Cref{def:reduced-poly} and \Cref{def:reduced-bi-poly}) of the general problem as suggested by both Reviewer 1 and Reviewer 2. Our earlier proof of the current \Cref{lem:exp-poly-rpoly} (in the appendix) had a small bug, which also has been fixed.
\RCOMMENT{The paper is very difficult to read. Improvements are needed in particular for the presentation of the approximation results and their proofs. Also for the notation. Missing definitions for used notions need to be added. Ideally use one instead of three query languages (UCQ, RA+, SPJU).}
%\AH{How have we handled the presentation of the approximation results and their proofs?}
%\AR{Added text at the end of the answer
We have chosen one specific query language throughout the paper ($\raPlus$) and made a concerted effort to use clean, defined, non-ambiguous notation.
We have also simplified the notation by limiting the paper's use of provenance semirings (which are needed solely for proofs) to the appendix.
To the best of our examination, all notation conflicts have been addressed and definitions for used notions are added (see e.g. \Cref{def:Gk} appears before \Cref{lem:3m-G2} and \Cref{lem:lin-sys}).
After the rewrite of \Cref{sec:intro}, we had even less space for \Cref{sec:algo}. However, we have modified \Cref{sec:algo} so that it flows better. In particular, we start off with the algorithm idea first (paragraph \textbf{Overview of our Techniques} in \Cref{sec:intro} also has more details on the intuition behind the approximation algorithm) and then state the results (with more details on how we argue the claimed runtime). Finally, we clearly state \Cref{cor:approx-algo-punchline} for which queries our linear-time approximation result holds.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Reviewer 1}
\RCOMMENT{l.24 "is \#W[1]-hard": parameterized by what?}
\RCOMMENT{l.103 and l.105: again, what is the parameter exactly?}
While the above references do not exist in the revised \Cref{sec:intro} anymore, all theorem statements and claims on \sharpwone runtime have been stated in a way so as to avoid ambiguity in the parameter. Please see e.g. \Cref{thm:k-match-hard} and \Cref{thm:mult-p-hard-result}.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\RCOMMENT{You might want to explain your title somewhere (probably in the introduction): in the end, what exactly should be considered harmful and why?}
We have modified the title to be more descriptive.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\RCOMMENT{l.45 when discussing Dalvi and Suciu's dichotomy, you might want to mention that they consider *data complexity*. Currently the second sentence of your introduction ("take a query Q and a pdb D") suggests that you are considering combined complexity.}
We have made an explicit mention of data complexity when alluding to Dalvi and Suciu's dichotomy. We have further rewritten \Cref{sec:intro} in such a way as to explicitly note the type(s) of complexity we are considering (mostly it's parameterized complexity).
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\RCOMMENT{l.51 "Consider ... and tuples are independent random event": so this is actually a set PDB... You might want to use an example where the input PDB is actually a bag PDB. The last sentence before the example makes the reader *expect* that the example will be of a bag PDB that is not a set PDB}
Our revision has removed the example referred to above. While the paper considers inputs to queries that are equivalent to set-\abbrPDB, this is not limiting. Please see \Cref{footnote:set-not-limit} on \Cpageref{footnote:set-not-limit}. Furthermore, we have added a discussion to the appendix that expands on why our results do extend beyond set inputs (\Cref{sec:gener-results-beyond}).
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\RCOMMENT{- In the case of set semantics, the lineage of a tuple can be defined for *any* query: it is the unique Boolean function that satisfies the if and only if property that you mention on line 70. For bag semantics however, to the best of my knowledge there is no general definition of what is a lineage for an arbitrary query. On line 73, it is not clear at all how the polynomial should be defined, since this will depend on the type of query that you consider}
Note that lineage for a set semantics query, expressed as a positive Boolean formula, is only defined for positive relational algebra. For instance, for aggregate queries a more powerful model~(\cite{AD11d}) is needed.
The definition of the lineage polynomial (bag \abbrPDB) semantics over an arbitrary $\raPlus$ query $\query$ is now given in \Cref{fig:nxDBSemantics}.
We also note that these semantics are not novel (e.g., similar semantics appear for both provenance \cite{DBLP:conf/pods/GreenKT07} and probabilistic database \cite{kennedy:2010:icde:pip,FH12} contexts). %feng:2019:sigmod:uncertainty,
However, as we were unable to find a formal proof of the equivalence between the expectation of the query multiplicity and of the lineage polynomial in related work, we have included a proof of \Cref{prop:expection-of-polynom}.
\RCOMMENT{l.75 "evaluating the lineage of t over an assignment corresponding to a possible world": here, does the assignment assigns each tuple to true or false? In other words, do the variables X still represent individual tuples? From what I see later in the article it seems that no, so this is confusing if we compare to what is explained in the previous paragraph about set TIDB}
The discussion after \Cref{prob:bag-pdb-poly-expected} (in particular, the paragraph \textbf{\abbrTIDB\xplural}) specifically address these questions. While values for possible worlds assigned are from $\{0, 1\}$, which is analog to Boolean, this is not limiting. Please see \Cref{footnote:set-not-limit} $\inparen{\Cpageref{footnote:set-not-limit}}$ and the new appendix section \Cref{sec:gener-results-beyond}.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\RCOMMENT{- l.135 "polynomial Q(X)": Q should be reserved for queries... You could use $\varphi$ or $\phi$ or... anything else but Q really}
We now use $\poly\inparen{\vct{X}}$ for (lineage) polynomials.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\RCOMMENT{- If we consider data complexity (as did Dalvi and Suciu) and fix an UCQ Q, given as input a bag TIDB PDB we can always compute the lineage in $O(|D|^|Q|)$ in SOP form and from there compute the expected multiplicity with the same complexity, so in polynomial time. How does this relate to your hardness result? Is it that you are only interested in combined complexity? Why one shouldn't be happy with this running time? Usually queries are much smaller than databases and this motivates studying data complexity.}
We have rewritten \Cref{sec:intro} in a way to stress that we are primarily interested in data complexity, but we cannot stop there. As the reviewer has noted, the problem we explore requires further analysis, where we require parameterized and fine grained complexity analysis to provide a theoretical foundation for the question we ask in \Cref{prob:informal}. We have discussed this in the prose following \Cref{prob:bag-pdb-poly-expected}.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\RCOMMENT{A discussion is missing about the difference between the approach usually taken in PDB literature and your approach. In which case would one be more interested in the expected multiplicity or in the marginal probability of a tuple? This should be discussed clearly in the introduction, as currently there is no clear "motivation" to what you do. There is a section about related work at the end but it is mostly a set of facts and there is no insightful comparison to what you do.}
We provide more motivating examples in the first paragraph, and include a more detailed discussion of the relationship to sets in paragraph \textbf{Relationship to Set-Probabilistic Query Evaluation} after \Cref{prob:informal}.
For example, expected multiplicities can model expectation of a \lstinline{COUNT(*)} query, while in many contexts computing the probability that this count is non-zero is not that useful.
%As a trivial (albeit relevant) example, consider a model of a contact network.
%The probability that there exists at least one new COVID infection in the graph is far less informative than the expected number of new infections.
As we now explain in the introduction, another motivation for generalizing marginal probability to expected multiplicity is that it is a natural generalization. The marginal probability of a tuple $t$ is the expectation of a Boolean random variable that is assigned 1 in every world where tuple $t$ exists and $0$ otherwise. For bag-PDBs the multiplicity of a query result tuple can be modeled as a natural-number random variable that for a world $\db$ is assigned the multiplicity of the tuple in $\db$. Thus, a natural generalization of the marginal probability (expectation of a Boolean random variable) to bags is the expectation of this variable: the tuple's expected multiplicity.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\RCOMMENT{l.176 "N[X] relations are closed under RA+": is this a *definition* of what it means to take an RA+ query and evaluate it over an N[X] database, or does this sentence say something more? Also, I think it would be clearer to use UCQs in the whole paper instead of constantly changing between UCQs, RA+ and SPJU formalisms}
To make the paper more accessible and general, we found it better to not use $\semNX$-DBs. While we wanted to use UCQ, we found the choice of $\raPlus$ to be more amenable to the presentation of the paper, and have, as suggested stuck with one query formalism.
\RCOMMENT{There are too many things undefined in from l.182 to the end of page. l.182 and in Proposition 2.1 N-PDBs are not defined, the function mod is undefined, etc. The article should be self-contained: important definitions should be in the article and the appendix should only be used to hide proof details. I think it would not take a lot of space to properly define the main concepts that you are using, without hiding things in the appendix}
%We have done as the reviewer has suggested.
All material in \Cref{sec:background} that is proof-related is in the appendix, while \Cref{sec:background} (modulo the proofs) is itself now self-contained.
\RCOMMENT{l.622 and l.632-634: so a N-PDB is a PDB where each possible world is an N-database, but an N[X]-PDB is not a PDB where each possible world is an N[X]-database... Confusing notation}
The text now refers to latter as an \abbrNXPDB\xplural.
\RCOMMENT{If you want to be in the setting of bag PDBs, why not consider that the value of the variables are integers rather that Boolean? I.e., consider valuations $\nu: X \rightarrow$ N (or even to R, why not?) instead of $X \rightarrow \{0,1\}$; this would seem more natural to me than having this ad-hoc "mix" of Boolean and non-Boolean setting. If you consider this then your "reduced polynomial" trick does not seem to work anymore.}
Our objective is to establish the feasibility of bag-probabilistic databases as compared to existing deterministic query processing systems.
Accordingly, we take our input model from production database systems like PostgreSQL, Oracle, DB2, SQL Server, etc. (e.g., see \Cref{footnote:set-not-limit} on \Cpageref{footnote:set-not-limit}), where duplicate tuples are represented as independent entities.
As a convenient benefit, this leads to a direct translation of TIDBs (which are defined over $\{0,1\}$ inputs).
Finally, as we mentioned earlier, an easy generalization exists to encode a \abbrBPDB in a set-\abbrPDB (which then allows for bag inputs); see \Cref{sec:gener-results-beyond}.
\RCOMMENT{- l.656 "Thus, from now on we will solely use such vectors...": this seems to
be false. Moreover you keep switching notation which makes it very hard to read... Sometimes it is $\varphi$, sometimes it is small w, sometimes it is big W (l.174 or l.722), sometimes the database is $\varphi(D)$, sometimes it is $\varphi_w(D)$, other times it is $D_{[w]}$ (l.671), and so on.}
We have made effort to be consistent with the use of notation, following standard usage whenever possible.
\RCOMMENT{l.658 "we use $\varphi(D)$ to denote the semiring homomorphism $\semNX \rightarrow \semN$
that...": I don't understand why you need a database to extend an assignment to its semiring homomorphism from $\semNX \rightarrow \semN$}
$\varphi$~\cite{DBLP:conf/pods/GreenKT07} lifts the valuation function (with kind $\semNX \rightarrow \semN$) to databases (i.e., a mapping from an $\semNX$-DB to a deterministic $\semN$-DB).
We note that the main body of the paper no longer references $\semNX$-DBs, and thus $\varphi$ is discussed exclusively in the appendix.
\RCOMMENT{Figure 2, K is undefined}
We have updated \Cref{fig:nxDBSemantics} (originally figure 2) to not need $K$.
\RCOMMENT{l.178 "$Q_t$", l.189 "Q will denote a polynomial": this is a very poor choice of notation}
\RCOMMENT{l.242 "and query Q": is Q a query or a lineage?}
We have reserved $\query$ to mean an $\raPlus$ query and nothing else.
\RCOMMENT{Section 2.1.1: here you are considering set semantics no? Otherwise, one would think that for bag semantics the annotation of a tuple could be 0 or something of the form c $\times$ X, where X is a variable and c is a natural number}
Please see \Cref{sec:gener-results-beyond} for a discussion on going beyond set inputs.
\RCOMMENT{Proof of Proposition A.3. I seems the proof should end after l.687, since you already proved everything from the statement of the proposition. I don't understand what it is that you do after this line.}
This text is an informal proof of \Cref{prop:expection-of-polynom} originally intended to motivate \Cref{prop:semnx-pdbs-are-a-}.
We agree that this should not be part of the proof of the later, and have removed the text.
\RCOMMENT{l.686 "The closure of ... over K-relations": you should give more details on this part. It is not obvious to me that the relations from l.646 hold.}
The core of this (otherwise trivial) argument, that semiring homomorphisms commute through queries, was already proven in \cite{DBLP:conf/pods/GreenKT07}. We now make this reference explicit.
We apologize for not explaining this in more detail. In universal algebra~\cite{graetzer-08-un}, it has been proven (the HSP theorem) that for any variety (the set of all structures, called objects, with a certain signature that obey a set of equational laws), there exists a ``most general'' object called the \emph{free object}. The elements of the free object are equivalence classes (with respect to the laws of the variety) of symbolic expressions over a set of variables $\vct{X}$ that consist of the operations of the structure. The operations of the free object combine symbolic expressions using the operations of the structure. It has been shown that for any other object $K$ of a variety, any assignment $\phi: \vct{X} \to K$ uniquely extends to a homomorphism from the free object to $K$ by substituting for the variables based on $\phi$ in the symbolic expression and then evaluating the resulting expression in $K$.
Commutative semirings form a variety where $\semNX$ is the free object. Thus, for any polynomial (element of $\semNX$), for any assignment $\phi: \vct{X} \to \semN$ (also a semiring) there exists a unique semiring homomorphism $\textsc{Eval}_{\phi}: \semNX \to \semN$. Homomorphisms by definition commute with the operations of a semiring. Green et al. \cite{GK07} did prove that semiring homomorphisms extend to homomorphisms over K-relations (by applying the homomorphism to each tuple's annotation) and these homomorphisms over K-relations commute with queries.
\RCOMMENT{l.711 "As already noted...": ah? I don't see where you define which subclass of N[X]-PDBs define bag version of TIDBs. If this is supposed to be in Section 2.1.1 this is not clear, since the world "bag" does not even appear there (and as already mentioned everything seems to be set semantics in this section). I fact, nowhere in the article can I see a definition of what are bag TIDBs/BIDBs}
The new text precisely defines TIDBs (\Cref{sec:intro}), and the BIDB generalization (\Cref{subsec:tidbs-and-bidbs}). The specific text referenced in this comment has now been moved to the appendix and restructured to reference \Cref{def:semnx-pdbs} (which defines an \abbrNXPDB defined over variables $\vct{X}$) and relate it to the formal structure of BIDBs in \Cref{subsec:tidbs-and-bidbs}.
\RCOMMENT{- l.707 "the sum of the probabilities of all the tuples in the same block b is 1": no, traditionally it can be less than 1, which means that there could be no tuple in the block.}
The reviewer is correct and we have updated our appendix text accordingly. %\AR{Um, no we haven't. {\bf TODO for Atri:} do this.}
\RCOMMENT{it is not clear to me how you can go from l.733 to l.736, which is sad because this is actually the whole point of this proof. If I understand correctly, in l.733, Q(D)(t) is the polynomial annotation of t when you use the semantics of Figure 2 with the semiring K being N[X], so I don't see how you go from this to l.736}
This result follows from the inner sum looping only over $\vct{w}$ s.t. $\assign_{\vct{w}}(\pxdb) = \db$. As a consequence of this constraint, we have $Q(\db_{\semNX})(t)(\vct{w}) = \query(\db)(t)$. The latter term is independent of the summation, and so can be pulled out by distributivity of multiplication over addition.
We agree with the reviewer that this could be presented more clearly, and have now split the distributivity argument into a separate step.
\RCOMMENT{l.209-227: so you define what is a polynomial and what is the degree of a polynomial (things that everyone knows), but you don't bother explaining what "taking the mod of Q(X) over all polynomials in S" means? This is a bit weird.}
Based on this and other reviewer comments, we removed the earlier definition of $\rpoly\inparen{\vct{X}}$ and have defined it in a more ad-hoc manner, as suggested by the reviewers, including the comment immediately following.
\RCOMMENT{Definition 2.6: to me, using polynomial long division to define $\tilde{Q}$(X) seems like a pedantic way of reformulating something similar to Definition 1.3, which was perfectly fine and understandable already! You could just define $\tilde{Q}$(X) to set all exponents in the SOP that are >1 to 1 and to remove all monomials with variables from the same block, or using Lemma A.4 as a definition?}
As alluded to above, we have incorporated the reviewer's suggestion, c.f. \Cref{def:reduced-poly} and \Cref{def:reduced-bi-poly}.
\RCOMMENT{Definition 2.14. It is not clear what is the input exactly. Are the query Q and database D fixed? Moreover, I have the impression that your hardness results have nothing to do with lineages and that you don't need them to express your results. I think the problem you should consider is simply the following: Expected Multiplicity Problem: Input: query Q, N[X]-database D, tuple t. Output: expected multiplicity of t in Q(D). Your main hardness result would then look like this: the Expected
Multiplicity problem restricted to conjunctive queries is \#W[1]-hard, parameterized by query size. Indeed if I look at the proof, all you need is the queries $Q^k_G$. The problem is \#W[1]-hard and it should not matter how one tries to solve it: using an approach with lineages or using anything else.
Currently it is confusing because you make it look like the problem is hard only when you consider general arithmetic circuits, but your hardness proof has nothing to do with circuits. Moreover, it is not surprising that computing the expected output of an arithmetic circuit is hard: it is trivial, given a CNF $\phi$, to build an arithmetic circuit C such that for any valuation $\nu$ of the variables the formula $\phi$ evaluates to True under $\nu$ if C evaluates to 1 and the formula $\phi$ evaluates to False under $\nu$ if C evaluates to 0, so this problem is \sharpphard anyways.}
The reviewer is correct. Our hardness results are now stated independently of circuits. We note that the hardness result alluded to at the end of the comment above is not applicable in our case since for fixed queries $\query$, \Cref{prob:bag-pdb-query-eval} and \Cref{prob:bag-pdb-poly-expected} can be solved in polynomial time.
Further, as we point out in \Cref{sec:intro}, what is new in our hardness results is that we show a query $Q^k$ such that $\qruntime{\query^k,\dbbase}$ is small (linear in $\abs{\dbbase}$) but solving \Cref{prob:bag-pdb-query-eval} and \Cref{prob:bag-pdb-poly-expected} is hard. We note that it is well-known that one can reduce the problem of counting $k$-cliques or $k$-matchings to a query $\query$ for which computing $\query(\dbbase)$ is $\sharpwone$-hard. So our contribution is to come up with a different reduction from counting $k$-matchings so that the hardness manifests itself in the probabilistic computing part of our problem.
%We have rewritten \Cref{sec:intro} with a series of refined problem statements to show that the problem we explore and the results we obtain directly involve lineage polynomials. The reviewer is correct that the output is the expected multiplicity, and we hope that our updated presentation of the paper makes it clear that $\expct_{\vct{\randWorld}\sim\pdassign}\pbox{\apolyqdt\inparen{\vct{\randWorld}}}$ is indeed the expected multiplicity spoken of. We have also addressed the ambiguity in the complexity we are focusing on, both explicitly in the intro and in the revised definition, \Cref{def:the-expected-multipl}.
%
%Regarding the use of circuits, it is true that our hardness results do not require circuits while our approximation algorithm and cost model both rely on circuits. We have adjusted our presentation (e.g. the segway between \Cref{prob:informal} and \Cref{prob:big-o-joint-steps}) to make this distinction clear and eliminate any confusion.
\RCOMMENT{Section 3.3. It seems to me the important part of this section is not so much the fact that we have fixed values of p but that the query is now fixed and that you are looking at the fine-grained complexity. If what you really cared about was having fixed value of p, then the result of this section should be exactly like the one in Theorem 3.4, but starting with "fix p". So something like "Fix p. Computing $\tilde{Q}^k_G$ for arbitrary G is \#W1-hard".}
We agree with the reviewer that the result on fixed value of $p$ is mostly of (narrow) theoretical interest. We have added a discussion summarizing the reviewer's point above below \Cref{th:single-p-hard}.
\RCOMMENT{General remark: The story of the paper I think should be this: we can always compute the expected multiplicity for a UCQ Q and N[X]-database D and tuple t by first computing the lineage in SOP form and then using linearity of expectation, which gives an upper bound of (roughly) $O(|D|^|Q|)$. We show that this exponential dependence in |Q| is unavoidable by proving that this problem is \#W1 hard parameterized by |Q| (which implies that we cannot solve it in $f(|Q|) |D|^c$ ). Furthermore we obtain fine-grained superlinear lower bounds for a fix conjunctive query Q. (Observe how up to here, there is no need to talk about lineages at all). We then obtain an approximation algorithm for this problem for [this class of queries] and [that class of bag PDBs] with [that running time (Q,D)]. The method is to first compute the lineage as an arithmetic circuit C in [this running time (Q,D)], and then from the arithmetic circuit C compute in [running time(C)] an approximation of its expected output. Currently I don't understand to which queries your approximation algorithm can be applied (see later comments).}
%We have followed the suggestions of the reviewer to delineate between the `coarse' polynomial time and the fine grained complexity analysis. We found it necessary to introduce polynomials earlier since our hard query, hardness results, and their proofs are easier to present (and we feel make the paper more accessible) than doing so without the lineage polynomials.
%We have taken pains to be very clear that this work only considers $\raPlus$ queries, adding a reminder to this end in the first paragraph of \Cref{sec:algo}.
%\AH{We need to address the last line of the reviewer's comment. Also, not sure if I answered the comment perfectly.}
We have restructured \Cref{sec:intro} to more or less follow the reviewer's outline above. The only deviation is that we still introduce lineage polynomials. We do this because the polynomial view is very helpful in the proofs of our hardness result (in addition to the obvious relevance for the approximation algorithm). We have also clarified that our approximation result applied to all $\raPlus$ queries (see \Cref{cor:approx-algo-punchline}).
\RCOMMENT{l.381: Here again, I think it would be simpler to consider that the input of the problem is the query, the database and a tuple and claim that you can compute an approximation of the expected multiplicity in linear time. The algo is to first compute the lineage as an arithmetic circuit, and then to use what you currently use (which could be put in a lemma or in a proposition).}
We have implemented the above overview in \Cref{sec:intro} when we move from \Cref{prob:informal} to \Cref{prob:intro-stmt}. For the approximation algorithm we focus on \Cref{prob:intro-stmt}, which still takes a circuit as an input.
%Our appoximation algorithm assumes an input circuit \circuit that has been computed via an arbitrary $\raPlus$ query $\query$ and arbitrary \abbrBIDB $\pdb$. We have included prose to describe this at the beginning of {sec:algo:sub:main-result}.
\RCOMMENT{Definition 4.2: would you mind giving an intuition of what this is? It is not OK to define something and just tell the reader to refer the appendix to understand what this is and why this is needed; the article should be understandable without having to look at the appendix. It is simply something that gives the coefficient of each monomial in the reduced polynomial?}
We have provided an example directly after \Cref{def:expand-circuit} as well as a sentence pointing out why this definition is useful. %\AR{This is not enough: we need a line for {\em why} this notation is useful. {\bf TODO for Atri}}
\RCOMMENT{- l.409: how does it matter that the circuit C is the lineage of a UCQ? Doesn't this work for any arithmetic circuit?}
The reviewer is correct that the earlier Theorem 4.9 works for any circuit (this result is now in the appendix).
%The reviewer is correct that our approximation results apply to $\raPlus$ queries over \abbrBIDB\xplural. This we specify this in the formal statements of \Cref{sec:algo}, e.g. see \Cref{def:param-gamma} and \Cref{cor:approx-algo-const-p}.
%More specifically, our proofs rely on (i) circuits with a bounded polynomial degree (we use a slightly non standard definition of degree --- \Cref{def:degree}), which is the case for any circuit resulting from an $\raPlus$ query; and (ii) specific assumptions about variable independence, which hold when the input to the query is a BIDB.
\RCOMMENT{l.411: what are $|C|^2(1,...,1)$ and $|C|(1,...,1)$? }
We clarify this overloaded notation immediately after \Cref{def:positive-circuit}.
\RCOMMENT{Sometimes you consider UCQs, sometimes RA+ queries. I think it would be simpler if you stick to one formalism (probably UCQs is cleaner?)}
As alluded to previously, we have followed the reviewer's suggestion and have found $\raPlus$ queries to be most amenable for this work.
\RCOMMENT{l.432 what is an FAQ query?}
We actually no longer need that result since \Cref{lem:val-ub} now has a bound on $|\circuit|(1,\dots,1)$ in terms of $\depth(\circuit)$ and the latter is used in \Cref{cor:approx-algo-punchline} for all $\raPlus$ queries. Please see \Cref{lem:val-ub} and the followup discussion for more on this.
\RCOMMENT{Generally speaking, I think I don't understand much about Section 4, and the convolutedness of the appendix does not help to understand. I don't even see in which result you get a linear runtime and to which queries the linear runtime applies. Somewhere there should be a corollary that clearly states a linear time approximation algorithm for some queries.}
We have re-organized \Cref{sec:algo} to address the above comments as follows:
\begin{itemize}
\item We now start off \Cref{sec:algo:sub:main-result} with the algorithm idea.
\item We give a quick overview of how the claimed runtime follows from the algorithm idea mentioned above.
\item Added \Cref{cor:approx-algo-punchline} that clearly states that we get an $O(\qruntime{\query,\dbbase})$ runtime for {\em all} $\raPlus$ queries $Q$.
\end{itemize}
\RCOMMENT{In section 5, it seems you are arguing that we can compute lineages as arithmetic circuits at the same time as we would be running an ordinary query evaluation plan. How is that different from using the relations in Figure 2 for computing the lineage?}
There is not a major difference between the two. This observation has persuaded us to eliminate $\semNX$-DB query evaluation and have only an algorithm for lineage.
We have also re-organized the earlier Section 5 and moved the definition of $\qruntime{\cdot}$ (earlier denoted as $\mathbf{cost}(\cdot)$) to \Cref{sec:gen} and moved the rest of the material to the appendix.
\RCOMMENT{l.679 where do you use $max(D_i)$ later in the proof?}
Thank you. This reference was unnecessary and has been removed.
\RCOMMENT{l.688 That sentence is hard to parse, consider reformulating it}
As the reviewer notes above, this paragraph is unnecessary and we have removed it.
\RCOMMENT{it seems you are defining N[X]-PDB at two places in the appendix: once near l.632, and another time near l.652}
Thank you. The latter definition has been removed.
\subsection{Reviewer 2}
\RCOMMENT{First, the paper should state rigorously the problem definition. There are three well-known definitions in database theory: data complexity, combined complexity, and parameterized complexity. If I understand correctly, Theorem 3.4 refers to the parameterized complexity, Theorem 3.6 refers to the data complexity (of a fixed query), while the positive results in Sec. 4 (e.g. Th. 4.8) introduce yet another notion of complexity, which requires discussion.}
We have addressed these concerns by rewriting the entirety of \Cref{sec:intro}, explicitly mentioning the complexity metrics considered, while forming a series of problem statements that describe the exact problem we are considering. We have also adjusted the phrasing of the said theorems and definitions to eliminate the ambiguity described.
\RCOMMENT{The problem definition is supposed to be in Definition 2.14, but this definition is sloppy. It states that the input to the problem is a circuit C: but then, what is the role of the PDB and the query Q? Currently Definition 2.14 reads as follows: "Given a circuit C defining some polynomial Q(X), compute E[Q(W)]", and, thus, the PDB and the query play no role at all. All results in Section 4 seem to assume this simplified version of Definition 2.14. On the other hand, if one interprets the definition in the traditional framework of data complexity (Q is fixed, the inputs are D and C) then the problem is solvable in PTIME (and there is no need for C), since E[Q(W)] is the sum of expectations of the monomials in Q (this is mentioned in Example 1.2).}
We have rephrased \Cref{def:the-expected-multipl} to qualify data complexity. The paper (especially in \Cref{sec:intro}) builds up the fact that we aren't stopping at polynomial time, but exploring parameterized complexity and fine grained analysis (as the reviewer aptly noted in the first comment).
\RCOMMENT{Second, Definition 2.6 of Reduced BIDB polynomials is simply wrong. It uses "mod" of two multivariate polynomials, but "mod" doesn't exists for multivariate polynomials...Either state Definition 2.6 directly, in an ad-hoc manner (which seems doable), or do a more thorough job grounding it in the ring of multivariate polynomials and its ideals.
}
The reviewer is correct in their comment on the ``mod'' part---we apologize for the error.
We have implemented the reviewer's ad-hoc suggestion in light of Reviewer 1's similar suggestions.
\RCOMMENT{the paper uses three notations (UCQ, RA+, SPJU) for the same thing, and never defines formally any of them.}
We have chosen $\raPlus$ for consistent use throughout the paper. We have included \Cref{footnote:ra-def} on \Cpageref{footnote:ra-def} for an explicit definition of $\raPlus$ queries.
\RCOMMENT{$G^{\ell}$ is used in Lemma 3.8 but defined only in the Appendix (Def. B.2), without even a forward pointer. This is a major omission: Lemma 3.8 is a key step for a key result, but it is impossible to read.}
We have fixed this mistake. Unfortunately, because of the changes in the paper (especially expanding on \Cref{sec:intro}), the earlier Lemma 3.8 had to be moved to the appendix.
\RCOMMENT{Definition 2.7. "valid worlds $\eta$". This is confusing. A "possible world" is an element of $\idb$: this is not stated explicitly in the paper, but it is implicit on line 163, so I assumed that possible worlds refer to elements of $\idb$. If I assumed correctly, then calling $\eta$ a "world" in Def. 2.7 is misleading, because $\eta$ is not an element of $\idb$. More, it is unclear to me why this definition is needed: it is used right below, in Lemma 2.8, but that lemma seems to continue to hold even if w is not restricted.}
We agree with the reviewer that this notation is confusing;
$\eta$ is meant to cope with the fact that tuples from the same group in a BIDB cannot co-exist, even though our $\{0,1\}$-input vectors can encode such worlds.
We now address this constraint by embedding it directly into the reduced polynomial with \Cref{def:reduced-bi-poly}.
\RCOMMENT{line 305: please define what is an "occurrence of H in G". It could mean: a homomorphic image, a subgraph of G isomorphic to H, an induced subgraph of G isomorphic to H, or maybe something else.}
We agree with the reviewer's suggestion and have rephrased the wording to be clear. Please see the beginning of \Cref{sec:hard:sub:pre}.
\RCOMMENT{If the proofs are given in the appendix, please say so. Lemmas 3.5 and 3.8 are stated without any mention, and one has to guess whether they are obvious, or proven somewhere else. On this note: I found Lemma 3.5 quite easy, since the number of k-matching is the coefficient of the leading monomial (of degree 2k) in $Q^k(p,p,...,p)$, while Lemma 3.8 appears much harder. It would help to briefly mention this in the main body of the paper.}
We have implemented the reviewer's suggestion. Please see the last sentence of \Cref{sec:intro} (as well as the expanded discussion on the hardness result in the {\bf Overview of our Techniques} part of \Cref{sec:intro}).
\RCOMMENT{line 177: what is $\Omega_{\semNX}$?}
We have eliminated the use of $\semNX$-DBs in the paper proper, using them only when necessary in the proofs of the appendix.
\RCOMMENT{line 217. The polynomial $X^2 + 2XY + Y^2$ is a poor choice to illustrate the degree. There are two standard definitions of the degree of a multivariate polynomial, and one has to always clarify which one is meant. One definition is the total degree (which is Def. 2.3 in the paper), the other is the maximum degree of any single variable. It is nice that you are trying to clarify for the reader which definition you are using, but the polynomial $X^2 + 2XY + Y^2$ is worst choice, since here the two coincide.}
We have adjusted the example to account for the reviewer's correct observation.
\RCOMMENT{line 220. "we consider only finite degree polynomials". This is a surprise. Polynomials, by definition, are of finite degree; there are extensions (I'm aware of powerseries, maybe you have other extensions in mind), but they are usually not called polynomials, plus, nothing in the paper so far suggests that it might refer to those extensions.}
We have removed the redundant terminology the reviewer has pointed out, and refined the discussion surrounding (and including) \Cref{eq:sop-form} to be explicit to the novice reader that polynomials are by definition of finite degree.
\RCOMMENT{"Note that our hardness results even hold for the expression trees". At this point we haven't seen the hardness results, nor their proofs, and we don't know what expression trees are. It's unclear what we can note.}
%We have accounted for the reviewer's concern in the rewrite of \Cref{sec:hard} adjusting the prose accordingly.
Our hardness results are now stated independently of circuits so the above statement no longer appears in the paper.
\RCOMMENT{paragraph at the top of pp.10 is confusing. My guess is that it is trying to this say: "there exists a query Q, such that, for each graph G, there exists a database D s.t. the lineage of Q on D is the polynomial $Q_G$."}
Our revision has eliminated this statement.
\subsection{Reviewer 3}
\RCOMMENT{The overall study is then extended to a multiplicative approximation algorithm for the expectation of polynomial circuits in linear time in the size of the polynomial. It was much harder to read this part, and I found the examples and flow in the appendix quite helpful. I suggest to include these examples into the body of the paper. }
%\AH{Need to address this.}
In our revision we expanded on \Cref{sec:intro} to give a better overview of the problems we are considering in this paper. This meant we had to cut out material in later sections, which unfortunately meant we did not have space in \Cref{sec:algo} to include any examples that the reviewer suggested above. However, we have tried to make \Cref{sec:algo} more readable as a whole.
\RCOMMENT{While ApproximateQ is linear in the size of the circuit, it is quadratic in epsilon and so we need quadratically many samples for the desired accuracy -- overall runtime is not linear therefore and it may be better to elaborate this. It may also be helpful to comment on how this relates to Karp, Luby, Madras algorithm [1] for \#DNF which is also quadratic in epsilon.}
%\AH{Need to elaborate on this.}
In \Cref{prob:big-o-joint-steps} we note explicitly that we care about linear dependence on $\qruntime{\inparen{\query,\dbbase}}$ and do not care about the exact dependence on $\epsilon$. While it would be nice to design an approximation algorithm that is linear in $1/\epsilon$ as well, we believe it is out of scope of this initial work.
\RCOMMENT{The coverage of related work is adequate. Fink et. al seems as the closest related work to me and I would appreciate a more elaborate comparison with this paper. My understanding is that Fink et. al considers exact evaluation only and focuses on knowledge compilation techniques based on decompositions. They also note that "Expected values can lead to unintuitive query answers, for instance when data values and their probabilities follow skewed and non-aligned distributions" attributed to [2]. Does this apply to the current work? Can you please comment on this?}
The work is indeed quite close to our own.
It targets a broader class of queries (aggregates include COUNT/SUM/MIN/MAX, rather than our more narrow focus on COUNT), but has significantly less general theoretical results.
Most notably, their proof of linear runtime in the size of the input polynomial is based on a tree-based encoding of the polynomial.
The tree-based representation (and hence the runtime of the Fink et al.\ algorithm) is, as we note several times, superlinear in $\qruntime{\query, \dbbase}$.
This result is also limited to a specific class of (hierarchical) queries, devolving to exponential time (as in \cite{FH13}) in general.
By contrast, our results apply to all of $\raPlus$.
Our revised related work section now addresses both points.
\RCOMMENT{I assume the authors focus on parameterized complexity throughout the paper, and even this is not stated unambiguously. The authors should make an extra effort to make the paper more accessible by using the explanations and examples from the appendix in the body of the paper. It is also important to highlight the differences with the complexity of standard query evaluation over PDBs.}
Our revision has focused on explicitly mentioning the complexity metrics we are interested in. This can be seen in e.g. \Cref{sec:intro} and formal statement (theorems, lemmas, etc.), which have been rewritten to eliminate ambiguities.
We have also taken pains to promote accessibility, keeping the paper self-contained, and using examples for difficult or potentially unclear concepts. This can be seen in e.g. eliminating unnecessary machinery (e.g. $\semNX$-DB machinery from the paper proper), providing/modifying examples (c.f. \Cref{def:expand-circuit}, \Cref{def:degree}), and ensuring consistency in notational use, e.g. using one query evaluation formalism ($\raPlus$).
We decided to focus on beefing up \Cref{sec:intro} and cleaning up definitions of problems, which unfortunately meant we ran out of space to bring back examples from the appendix (especially into \Cref{sec:algo}).
\subsection{Reviewer 4}
\RCOMMENT{I wonder whether the writing could be revisited to give the reader a better overview of the technical challenges, motivation, and the high level ideas of the algorithm and hardness results. The current exposition seems slightly too tailored for the expert in probabilistic databases rather than the average ICDT attendee. Also the current exposition is structured such that the reader needs to get through quite a few definitions and technical lemmas until they get to the new ideas in the paper.}
We have (as noted throughout this section) revised the writing to provide precision and clarity to the problem we explore as well as the results we obtain. Part of this revision was a complete rewriting of \Cref{sec:intro} where we sought to be extremely precise in language and through a series of problem statements to help the reader navigate and understand the problem we explore as well as how we have gone about exploring that problem coupled with the validity of the exploration strategy. We have simultaneously sought to make the paper more accessible by assuming the average ICDT attendee and defining or explaining concepts that might not be known to them. Finally, we have expanded on the {\bf Overview of our Techniques} part of \Cref{sec:intro} to provide more intuition on how we prove our lower and upper bounds.
\RCOMMENT{}
\RCOMMENT{}
\RCOMMENT{}
%%% Local Variables:
%%% mode: latex
%%% TeX-master: "main"
%%% End:

View File

@ -0,0 +1,7 @@
%!TEX root=./main.tex
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Parameterized Complexity}\label{sec:param-compl}
In \Cref{sec:hard}, we utilized common conjectures from fine-grained complexity theory. The notion of $\sharpwonehard$ is a standard notion in {\em parameterized complexity}, which by now is a standard complexity tool in providing data complexity bounds on query processing results~\cite{param-comp}. E.g. the fact that $k$-matching is $\sharpwonehard$ implies that we cannot count $k$-matchings in time $f(k)\cdot n^{O(1)}$ for any computable function $f$. However, these results do not carefully track the exponent in the hardness result. E.g. $\sharpwonehard$ for the general $k$-matching problem does not imply anything specific for the $3$-matching problem. Similar questions have led to intense research into the new sub-field of {\em fine-grained complexity} (see~\cite{virgi-survey}), where we care about the exponent in our hardness assumptions as well---e.g. \Cref{conj:graph} is based on the popular {\em Triangle detection hypothesis} in this area (cf.~\cite{triang-hard}).

View File

@ -0,0 +1,34 @@
%!TEX root=./main.tex
\section{Related Work}\label{sec:related-work}
\textbf{Probabilistic Databases} (PDBs) have been studied predominantly for set semantics.
Approaches for probabilistic query processing (i.e., computing marginal probabilities of tuples) fall into two broad categories.
\emph{Intensional} (or \emph{grounded}) query evaluation computes the \emph{lineage} of a tuple
and then the probability of the lineage formula.
It has been shown that computing the marginal probability of a tuple is \sharpphard~\cite{valiant-79-cenrp} (by reduction from weighted model counting).
The second category, \emph{extensional} query evaluation,
is in \ptime, but is limited to certain classes of queries.
Dalvi et al.~\cite{DS12} and Olteanu et al.~\cite{FO16} proved dichotomies for UCQs and two classes of queries with negation, respectively.
Amarilli et al. investigated tractable classes of databases for more complex queries~\cite{AB15}.
Another line of work studies which structural properties of lineage formulas lead to tractable cases~\cite{kenig-13-nclexpdc,roy-11-f,sen-10-ronfqevpd}.
In this paper we focus on intensional query evaluation with polynomials.
Many data models have been proposed for encoding PDBs more compactly than as sets of possible worlds.
These include tuple-independent databases~\cite{VS17} (\tis), block-independent databases (\bis)~\cite{RS07}, and \emph{PC-tables}~\cite{GT06}.
%
Fink et al.~\cite{FH12} study aggregate queries over a probabilistic version of the extension of K-relations for aggregate queries proposed in~\cite{AD11d} (\emph{pvc-tables}) that supports bags, and has runtime complexity linear in the size of the lineage.
However, this lineage is encoded as a tree; the size (and thus the runtime) are still superlinear in $\qruntime{\query, \tupset, \bound}$.
The runtime bound is also limited to a specific class of (hierarchical) queries, suggesting the possibility of a generalization of \cite{DS12}'s dichotomy result to \abbrBPDB\xplural.
Several techniques for approximating tuple probabilities have been proposed in related work~\cite{FH13,heuvel-19-anappdsd,DBLP:conf/icde/OlteanuHK10,DS07}, relying on Monte Carlo sampling, e.g.,~\cite{DS07}, or a branch-and-bound paradigm~\cite{DBLP:conf/icde/OlteanuHK10}.
Our approximation algorithm is also based on sampling.
\noindent \textbf{Compressed Encodings} are used for Boolean formulas (e.g, various types of circuits including OBDDs~\cite{jha-12-pdwm}) and polynomials (e.g., factorizations~\cite{factorized-db}) some of which have been utilized for probabilistic query processing, e.g.,~\cite{jha-12-pdwm}.
Compact representations for which probabilities can be computed in linear time include OBDDs, SDDs, d-DNNF, and FBDD.
\cite{DM14c} studies circuits for absorptive semirings while~\cite{S18a} studies circuits that include negation (expressed as the monus operation). Algebraic Decision Diagrams~\cite{bahar-93-al} (ADDs) generalize BDDs to variables with more than two values. Chen et al.~\cite{chen-10-cswssr} introduced the generalized disjunctive normal form.
\Cref{sec:param-compl} covers more related work on fine-grained complexity.
%%% Local Variables:
%%% mode: latex
%%% TeX-master: "main"
%%% End:

%%% NOTE(review): removed stray VCS diff artifacts ("View File", "@ -0,0 +1,451 @@") that were pasted into the source; the material below is appended from a separate notes file rooted at main.tex.
%root = main.tex
\AH{\large\textbf{New stuff 092520.}}
\begin{Claim}\label{claim:constpk-TI}
Given a positive query polynomial $\poly$ over a $\ti$, with constant $\prob$ such that there exists a $\prob_0$ where for all $\prob_i, \prob_0 \leq \prob_i$, and constant $k = \degree(\poly)$, the ratio $\frac{\abs{\etree}(1,\ldots, 1)}{\rpoly(\prob_1,\ldots, \prob_\numvar)}$ is constant.
\end{Claim}
\begin{proof}[Proof of Claim~\ref{claim:constpk-TI}]
By independence, a $\ti$ has the property that all of its annotations are positive. Combined with the fact that \Cref{claim:constpk-TI} uses only positive queries, i.e., queries that only use $\oplus$ and $\otimes$ semiring operators over its polynomial annotations, it is the case that no negation exists pre or post query.
For any $\poly$ then, it is true that all coefficients in $\abs{\etree}(1,\ldots, 1)$ are positive and thus the same as their $\rpoly$ counterparts. This then implies that the ratio $\frac{\abs{\etree}(1,\ldots, 1)}{\rpoly(\prob_1,\ldots, \prob_\numvar)} \leq \frac{\abs{\etree}(1,\ldots, 1)}{\abs{\etree}(1,\ldots, 1) \cdot \prob_0^k}$, which is indeed a constant.
\end{proof}
\qed
\subsection{$\rpoly$ over $\bi$}
\AH{A general sufficient condition is the $\bi$ having fixed block size (thus implying increasing number of blocks for growing $\numvar$). For increasing $\numvar$, the ratio $\frac{\abs{\etree}(1,\ldots, 1)}{\rpoly(\prob_1,\ldots, \prob_\numvar)}$ can be proven to be a constant since, as $\numvar$ increases, it has to be the case that new blocks are added, and this results in a constant number of terms cancelled out by $\rpoly$, with the rest surviving, which gives us a constant $\frac{\abs{\etree}(1,\ldots, 1)}{\rpoly(\prob_1,\ldots, \prob_\numvar)}$.
\par In the general case, with fixed number of blocks and growing $\numvar$, all additional terms will be cancelled out by $\rpoly$ while for $\abs{\etree}(1,\ldots, 1)$ it is the case that it will grow exponentially with $\numvar$, yielding a ratio $\frac{O(2^\numvar)}{O(1)}$ and (as will be seen) greater.}
\subsubsection{Known Reduction Result $\bi \mapsto \ti$}
Denote an arbitrary $\bi$ as $\bipdb = (\bipd, \biwset)$ and a constructed $\ti$ to be $\tipdb = (\tipd, \tiwset)$, the details to be described next.
It is well known that $\bipdb$ can be reduced to a query $\poly$ over $\tipdb$. For completeness, let us describe the reduction.
Let tuples in $\bipdb$ be denoted $a_{\block, i}$ and their $\tipdb$ counterparts as $x_{\block, i}$, where $\block$ represents the block id in which $a_{\block, i}$ resides.
\begin{Theorem}\label{theorem:bi-red-ti}
For any $\bipdb$, there exists a query $\poly$ and $\tipdb$ such that $\poly(\tiwset)$ over distribution $\tipd$ outputs elements in $\biwset$ according to their respective probabilities in $\bipd$.
\end{Theorem}
\begin{Definition}[Total Ordering $\biord$]\label{def:bi-red-ti-order}
The order $\biord$ is a fixed total order across all tuples in block $\block$ of $\bipdb$.
\end{Definition}
\begin{Definition}[Query $\poly$]\label{def:bi-red-ti-q}
$\poly$ is constructed to map all possible worlds of $\db_{ti} \in \tiwset$ for which $x_i$ is the greatest according to $\biord$, to the worlds $\vct{w}$ in $\biwset$ in which $a_{\block, i}$ is present and $\bipd(\vct{w}) > 0$. Recall the constraint on $\bipdb$ to be that if $a_{\block, i}$ is present, then it is the case that for all $j \neq i$, tuple $a_{\block, j}$ is not present. For $\bipdb$ with exactly one block, all such worlds $\db_{ti}$ are mapped to the world $\{a_i\}$.
\end{Definition}
For simplicity, we will consider $\bipdb$ to consist of one block $\block$. By independence of blocks in $\bi$, the proofs below immediately generalize to the case of $\bipdb$ with multiple blocks\textcolor{blue}{...umm, we'll see, we may need to argue this}.
The reduction consists of the construction of a query $\poly$ and $\tipdb$ such that $\poly$ is computed over $\tipdb$. To construct the $\tipdb$ given an arbitrary $\bipdb$ a tuple alternative $a_{\block, i}$ is transcribed to a tuple in $\tipdb$ with probability
\begin{equation}
P(x_{b, i}) = \begin{cases}
\frac{P(a_{\block, i})}{\prod_{j = 1}^{i - 1}(1 - P(x_{\block, j}))} &\textbf{if }i > 1\\
P(a_i) &\textbf{if } i = 1.
\end{cases}\label{eq:bi-red-ti-func}
\end{equation}
The above is more simply written as
\begin{equation*}
\tipd(x_{\block, i}) = \frac{P(a_{\block, i})}{1 - \sum_{j = 1}^{i - 1} P(a_{\block, j})}
\end{equation*}
The above mapping is applied across all tuples of $\bipdb$.
This method for computing the probabilities of the tuples in $\tipdb$ allows for the following. According to $\biord$, the powerset of possible worlds is mapped in such a way that the first ordered tuple appearing in a possible world $\db_{\tiabb}$ of $\tiwset$ has that world mapped to the world $\db_{\biabb} \in \biwset$ where $a_{\block, i}$ is present with $\bipd(\db_{\biabb}) > 0$. Recall that since we are considering a $\bi$ with one block, there is only one such world in $\biwset$.
\begin{Lemma}\label{lem:bi-red-ti-prob}
The sum of the probabilities of all $\db_{\tiabb} \in \tiwset$ database worlds mapped to a given tuple $x_{\block, i}$ equals the probability of the tuple $a_{\block, i}$ in the original $\bipdb$.
\end{Lemma}
\begin{proof}[Proof of Lemma~\ref{lem:bi-red-ti-prob}]
The proof is by induction. Given a tuple $a_{\block, i}$ in $\bipdb$ such that $1 \leq i \leq \abs{b}$, (where $\abs{b}$ denotes the number of alternative tuples in block $\block$), by \Cref{eq:bi-red-ti-func} $P(x_{\block, i}) = \frac{P(a_{\block, i})}{1 \cdot \prod_{j = 1}^{i - 1} (1 - P(x_{\block, j}))}$.
For the base case, we have that $i = 1$ which implies that $P(x_{\block, i}) = P(a_{\block, i})$ and the base case is satisfied.
%Other neat tidbits include that $\abs{b} = 1$, the set $b = \{a_1\}$, and the powerset $2^b = \{\emptyset, \{1\}\} = \tiwset$. For coolness, also see that $P(\neg x_i) = 1 - P(x_i) = 1 - P(a_i) = \emptyset$, so there is, in this case, a one to one correspondence of possible worlds and their respective probabilities in both $\ti$ and $\bi$, but this is extraneous information for the proof.
The hypothesis is then that for $k \geq 1$ tuple alternatives, \Cref{lem:bi-red-ti-prob} holds.
For the inductive step, prove that \Cref{lem:bi-red-ti-prob} holds for $k + 1$ alternatives. By definition of the query $\poly$ ( \Cref{def:bi-red-ti-q}), it is a fact that only the world $\wElem_{x_{\block, k + 1}} = \{x_{\block, k + 1}\}$ in the set of possible worlds is mapped to $\bi$ world $\{a_{\block, k + 1}\}$. Then for world $\wElem_{x_{\block, k + 1}}$ it is the case that $P(\wElem_{x_{\block, k + 1}}) = \prod_{j = 1}^{k} (1 - P(x_j)) \cdot P(x_{\block k + 1})$. Since by \Cref{eq:bi-red-ti-func} $P(x_{\block, k + 1}) = \frac{P(a_{\block, k + 1})}{\prod_{j = 1}^{k}(1 - P(x_{\block, j}))}$, we get
\begin{align*}
P(\wElem_{x_{\block, k + 1}}) =& \prod_{j = 1}^{k} (1 - P(x_{\block, j})) \cdot P(x_{\block, k + 1})\\
=&\prod_{j = 1}^{k} (1 - P(x_{\block, j})) \cdot \frac{P(a_{\block, k + 1})}{\prod_{j = 1}^{k}(1 - P(x_{\block, j}))}\\
=&P(a_{\block, k + 1}).
\end{align*}
\end{proof}
\qed
This leaves us with the task of constructing a query $\poly$ over $\tipdb$ to perform the desired mapping of possible worlds. Setting $\poly$ to the following query yields the desired result.
\begin{lstlisting}
SELECT A FROM TI as a
WHERE A = 1
   OR A = 2 AND NOT EXISTS(SELECT A FROM TI as b
                           WHERE A = 1 AND a.blockID = b.blockID)
   $\vdots$
   OR A = $|$b.blockID$|$ AND NOT EXISTS(SELECT A FROM TI as b
                           WHERE A = 1 OR A = 2 $\ldots$ OR A = $|$b.blockID$|$ - 1 AND a.blockID = b.blockID)
\end{lstlisting}
\begin{Lemma}\label{lem:bi-red-ti-q}
The query $\poly$ satisfies the requirements of \Cref{def:bi-red-ti-q}.
\end{Lemma}
\begin{proof}[Proof of Lemma~\ref{lem:bi-red-ti-q}]
For any possible world in $2^b$, notice that the WHERE clause selects the tuple with the greatest ordering in the possible world. For all other tuples, disjunction of predicates dictates that no other tuple will be in the output by mutual exclusivity of the disjunction. Thus, it is the case for any $\ti$ possible world, that the tuple $x_{\block, i}$ with the greatest ordering appearing in that possible world will alone be in the output, and all such possible worlds with $x_{\block, i}$ as the greatest in the ordering will output the same world corresponding to the $\bi$ world for the disjoint tuple $a_{\block, i}$.
\end{proof}
\qed
\begin{proof}[Proof of Theorem~\ref{theorem:bi-red-ti}]
For multiple blocks in $\bipdb$, note that the above reduction to $\poly(\tipdb)$ with multiple ``blocks'' will behave the same as $\bipdb$ since the property of independence for $\ti$ ensures that all tuples in the $\ti$ will have the same marginal probability across all possible worlds as their tuple probability, regardless of how many tuples and, thus, worlds the $\tipdb$ has. Note that this property is unchanging no matter what probabilities additional tuples in $\tipdb$ are assigned.
To see this consider the following.
\begin{Lemma}\label{lem:bi-red-ti-ind}
For any set of independent variables $S$ with size $\abs{S}$, when adding another distinct independent variable $y$ to $S$ with probability $\prob_y$, it is the case that the probability of each variable $x_i$ in $S$ remains unchanged.
\AH{This may be a well known property that I might not even have the need to prove, but since I am not certain, here goes.}
\end{Lemma}
\begin{proof}[Proof of Lemma~\ref{lem:bi-red-ti-ind}]
The proof is by induction. For the base case, consider a set of one element $S = \{x\}$ with probability $\prob_x$. The set of possible outcomes includes $2^S = \{\emptyset, \{x\}\}$, with $P(\emptyset) = 1 - \prob_x$ and $P(x) = p_x$. Now, consider $S' = \{y\}$ with $P(y) = \prob_y$ and $S \cup S' = \{x, y\}$ with the set of possible outcomes now $2^{S \cup S'} = \{\emptyset, \{x\}, \{y\}, \{x, y\}\}$. The probabilities for each world then are $P(\emptyset) = (1 - \prob_x)\cdot(1 - \prob_y), P(x) = \prob_x \cdot (1 - \prob_y), P(y) = (1 - \prob_x)\cdot \prob_y$, and $P(xy) = \prob_x \cdot \prob_y$. For the worlds where $x$ appears we have
\[P(x) + P(xy) = \prob_x \cdot (1 - \prob_y) + \prob_x \cdot \prob_y = \prob_x \cdot \left((1 - \prob_y) + \prob_y\right) = \prob_x \cdot 1 = \prob_x.\]
Thus, the base case is satisfied.
For the hypothesis, assume that $\abs{S} = k$ for some $k \geq 1$, and for $S'$ such that $\abs{S'} = 1$ where its element is distinct from all elements in $S$, the probability of each independent variable in $S$ is the same in $S \cup S'$.
For the inductive step, let us prove that for $\abs{S_{k + 1}} = k + 1$ elements, adding another element will not change the probabilities of the independent variables in $S$. By the hypothesis, for the union $S_k \cup S_{k + 1}$, all probabilities in $S_k$ remained unchanged after the union. Now consider a set $S' = \{z\}$ and the union $S_{k + 1} \cup S'$. Since all variables are distinct and independent, it is the case that the set of possible outcomes of $S_{k + 1} \cup S' = 2^{S_{k + 1} \cup S'}$ with $\abs{2^{S_{k + 1} \cup S'}} = 2^{\abs{S_{k + 1}} + \abs{S'}}$ since $\abs{S_{k + 1}} + \abs{S'} = \abs{S_{k + 1} \cup S'}$. Then, since $2^{\abs{S_{k + 1}} + \abs{S'}} = 2^{\abs{S_{k + 1}}} \cdot 2^{\abs{S'}}$, and $2^{S'} = \{\emptyset, \{z\}\}$, it is the case that all elements in the original set of outcomes will appear \textit{exactly one} time without $z$ and \textit{exactly one} time with $z$, such that for element $x \in 2^{S_{k + 1}}$ with probability $\prob_x$ we have $P(x \text{ or } xz) = \prob_x \cdot (1 - \prob_z) + \prob_x \cdot \prob_z = \prob_x \cdot \left((1 - \prob_z) + \prob_z\right) = \prob_x \cdot 1 = \prob_x$, and the probabilities remain unchanged, and, thus, the marginal probabilities for each variable in $S_{k + 1}$ across all possible outcomes remain unchanged.
\end{proof}
\qed
The repeated application of \Cref{lem:bi-red-ti-ind} to any 'block' of independent variables in $\tipdb$ provides the same result as joining two sets of distinct elements of size $\abs{S_1}, \abs{S_2} > 1$.
Thus, by Lemmas~\ref{lem:bi-red-ti-prob}, \ref{lem:bi-red-ti-q}, and~\ref{lem:bi-red-ti-ind}, the proof follows.
\end{proof}
\qed
\subsubsection{General results for $\bi$}\label{subsubsec:bi-gen}
\AH{One thing I don't see in the argument below is that as $\numvar \rightarrow \infty$, we have that $\prob_0 \rightarrow 0$.}
The general results of approximating a $\bi$ using the reduction and \Cref{alg:mon-sam} do not allow for the ratio $\frac{\abs{\etree}(1,\ldots, 1)}{\rpoly(\prob_1,\ldots, \prob_\numvar)}$ to be a constant. Consider the following example.
Let monomial $y_i = P(x_i) \cdot \prod_{j = 1}^{i - 1}(1 - P(x_j))$. Let $\poly(\vct{X}) = \sum_{i = 1}^{\numvar}y_i$. Note that this query output can exist on a projection for which each tuple agrees on the projected values of the query in a $\bi$ consisting of one block and $\numvar$ tuples.
First, let's analyze the numerator $\abs{\etree}(1,\ldots, 1)$. Expanding $\abs{\etree}$ yields $X_1 + (1 + X_1)\cdot X_2 + \cdots + (1 + X_1)\cdot(1 + X_2)\cdots(1 + X_{\numvar - 1})\cdot X_\numvar$ which yields a geometric series $S_{\abs{\etree}} = 2^0 + 2^1 +\cdots+2^{\numvar - 1}$. We can perform the following manipulations to obtain the following closed form.
\begin{align*}
2 \cdot S_{\abs{\etree}} =& 2^1 +\cdots+2^\numvar = 2^{\numvar} + S_{\abs{\etree}} - 1 \\
S_{\abs{\etree}} =& 2^{\numvar} - 1
\end{align*}
So, then $\abs{\etree}(1,\ldots, 1) = 2^{\numvar} - 1$.
On the other hand, considering $\rpoly(\prob_1,\ldots, \prob_\numvar)$, since we are simply summing up the probabilities of one block of disjoint tuples (recall that $P(x_i) = \frac{P(a_i)}{1\cdot\prod_{j = 1}^{i - 1}(1 - P(x_j))}$ in the reduction for $a_i$ the original $\bi$ probability), it is the case that $\rpoly(\prob_1,\ldots, \prob_\numvar) \leq 1$, and the ratio $\frac{\abs{\etree}(1,\ldots, 1)}{\rpoly(\prob_1,\ldots, \prob_\numvar)}$ in this case is exponential $O(2^\numvar)$. Further note that setting $\poly(\vct{X}) = \sum_{i = 1}^{\numvar} y_i^k$ will yield an $O(2^{\numvar \cdot k})$ bound.
\subsubsection{Sufficient Condition for $\bi$ for linear time Approximation Algorithm}
Let us introduce a sufficient condition on $\bipdb$ for a linear time approximation algorithm.
\AH{Lemma~\ref{lem:bi-suf-cond} is not true for the case of $\sigma$, where a $\sigma(\bowtie)$ query could select tuples from the same block, and self join them such that all tuples cancel out. We need a definition for 'safe' (in this context) queries, to prove the lemma.}
\begin{Lemma}\label{lem:bi-suf-cond}
For $\bipdb$ with fixed block size $\abs{b}$, the ratio $\frac{\abs{\etree}(1,\ldots, 1)}{\rpoly(\prob_1,\ldots, \prob_\numvar)}$ is a constant.
\end{Lemma}
\AH{Two observations.
\par
1) I am not sure that the argument below is correct, as I think we would still get something exponential in the numerator $\abs{\etree}(1,\ldots, 1)$.
\par2) I \textit{think} a similar argument will hold however for the method of not using the reduction.}
\begin{proof}[Proof of Lemma~\ref{lem:bi-suf-cond}]
For increasing $\numvar$ and fixed block size $\abs{b}$ in $\bipdb$ given query $\poly = \sum_{i = 1}^{\numvar} y_i$ where $y_i = x_i \cdot \prod_{j = 1}^{i - 1} (1 - x_j)$, a query whose output is the maximum possible output, it has to be the case as seen in \Cref{subsubsec:bi-gen} that for each block $b$, $\rpoly(\prob_{b, 1},\ldots, \prob_{b, \abs{b}}) = P(a_{b, 1}) + P(a_{b, 2}) + \cdots + P(a_{b, \abs{b}})$ for $a_i$ in $\bipdb$. As long as there exists no block in $\bipdb$ such that the sum of alternatives is $0$ (which by definition of $\bi$ should be the case), we can bound $\rpoly(\prob_1,\ldots, \prob_\numvar) \geq \frac{\prob_0 \cdot \numvar}{\abs{\block}}$ for $\prob_0 > 0$, and then we have that $\frac{\abs{\etree}(1,\ldots, 1)}{\rpoly(\prob_1,\ldots, \prob_\numvar)}$ is indeed a constant.
\end{proof}
\qed
Given a $\bipdb$ satisfying \Cref{lem:bi-suf-cond}, it is the case by \Cref{lem:approx-alg} that \Cref{alg:mon-sam} runs in linear time.
\AH{\Large\textbf{092520 -- 100220 New material.}}
\section{Algorithm~\ref{alg:mon-sam} for $\bi$}
We may be able to get a better run time by developing a separate approximation algorithm for the case of $\bi$. Instead of performing the reduction from $\bi \mapsto \poly(\ti)$, we decide to work with the original variable annotations given to each tuple alternative in $\bipdb$. For clarity, let us assume the notation of $\bivar$ for the annotation of a tuple alternative. The algorithm yields $0$ for any monomial sampled that cannot exist in $\bipdb$ due to the disjoint property characterizing $\bi$. The semantics for $\rpoly$ change in this case. $\rpoly$ not only performs the same modding function, but also sets all monomial terms to $0$ if they contain variables which appear within the same block.
\begin{algorithm}[H]
\caption{$\approxq_{\biabb}$($\etree$, $\vct{p}$, $\conf$, $\error$, $\bivec$)}
\label{alg:bi-mon-sam}
\begin{algorithmic}[1]
\Require \etree: Binary Expression Tree
\Require $\vct{p} = (\prob_1,\ldots, \prob_\numvar)$ $\in [0, 1]^N$
\Require $\conf$ $\in [0, 1]$
\Require $\error$ $\in [0, 1]$
\Require $\bivec$ $\in [0, 1]^{\abs{\block}}$\Comment{$\abs{\block}$ is the number of blocks}
\Ensure \vari{acc} $\in \mathbb{R}$
\State $\vari{sample}_\vari{next} \gets 0$
\State $\accum \gets 0$\label{alg:mon-sam-global1}
\State $\numsamp \gets \ceil{\frac{2 \log{\frac{2}{\conf}}}{\error^2}}$\label{alg:mon-sam-global2}
\State $(\vari{\etree}_\vari{mod}, \vari{size}) \gets $ \onepass($\etree$)\label{alg:mon-sam-onepass}\Comment{$\onepass$ is \Cref{alg:one-pass} \;and \sampmon \; is \Cref{alg:sample}}
\For{\vari{i} \text{ in } $1\text{ to }\numsamp$}\Comment{Perform the required number of samples}
\State $(\vari{M}, \vari{sgn}_\vari{i}) \gets $ \sampmon($\etree_\vari{mod}$)\label{alg:mon-sam-sample}
\For{$\vari{x}_\vari{\block,i}$ \text{ in } $\vari{M}$}
\If{$\bivec[\block] = 1$}\Comment{If we have already had a variable from this block, $\rpoly$ drops the sample.}
\State $\vari{sample}_{\vari{next}} \gets 1$
\State break
\Else
\State $\bivec[\block] \gets 1$
% \State $\vari{sum} = 0$
% \For{$\ell \in [\abs{\block}]$}
% \State $\vari{sum} = \vari{sum} + \bivec[\block][\ell]$
% \EndFor
% \If{$\vari{sum} \geq 2$}
% \State $\vari{sample}_{\vari{next}} \gets 1$
% \State continue\Comment{Not sure for psuedo code the best way to state this, but this is analogous to C language continue statement.}
\EndIf
\EndFor
\If{$\vari{sample}_{\vari{next}} = 1$}
\State $\vari{sample}_{\vari{next}} \gets 0$
\State continue
\EndIf
\State $\vari{Y}_\vari{i} \gets 1$\label{alg:mon-sam-assign1}
\For{$\vari{x}_{\vari{j}}$ \text{ in } $\vari{M}$}%_{\vari{i}}$}
\State $\vari{Y}_\vari{i} \gets \vari{Y}_\vari{i} \times \; \vari{\prob}_\vari{j}$\label{alg:mon-sam-product2} \Comment{$\vari{p}_\vari{j}$ is the assignment to $\vari{x}_\vari{j}$ from input $\vct{p}$}
\EndFor
\State $\vari{Y}_\vari{i} \gets \vari{Y}_\vari{i} \times\; \vari{sgn}_\vari{i}$\label{alg:mon-sam-product}
\State $\accum \gets \accum + \vari{Y}_\vari{i}$\Comment{Store the sum over all samples}\label{alg:mon-sam-add}
\EndFor
\State $\accum \gets \accum \times \frac{\vari{size}}{\numsamp}$\label{alg:mon-sam-global3}
\State \Return \accum
\end{algorithmic}
\end{algorithm}
Before redefining $\rpoly$ in terms of the $\bi$ model, we need to define the notion of performing a mod operation with a set of polynomials.
\begin{Definition}[Mod with a set of polynomials]\label{def:mod-set-poly}
To mod a polynomial $\poly$ with a set $\vct{Z} = \{Z_1,\ldots Z_x\}$ of polynomials, the mod operation is performed successively on the $\poly$ modding out each element of the set $\vct{Z}$ from $\poly$.
\end{Definition}
\begin{Example}\label{example:mod-set-poly}
To illustrate for $\poly = X_1^2 + X_1X_2^3$ and the set $\vct{Z} = \{X_1^2 - X_1, X_2^2 - X_2, X_1X_2\}$ we get
\begin{align*}
&X_1^2 + X_1X_2^3 \mod X_1^2 - X_1 \mod X_2^2 - X_2 \mod X_1X_2\\
=&X_1 + X_1X_2^3 \mod X_2^2 - X_2 \mod X_1X_2\\
=&X_1 + X_1X_2 \mod X_1X_2\\
=&X_1
\end{align*}
\end{Example}
\begin{Definition}[$\rpoly$ for $\bi$ Data Model]\label{def:bi-alg-rpoly}
$\rpoly(\vct{X})$ over the $\bi$ data model is redefined to include the following mod operation in addition to definition ~\ref{def:qtilde}. For every $j \neq i$, we add the operation $\mod X_{\block, i}\cdot X_{\block, j}$. For set of blocks $\mathcal{B}$ and the size of block $\block$ as $\abs{\block}$,
\[\rpoly(\vct{X}) = \poly(\vct{X}) \mod \{X_{\block, i}^2 - X_{\block, i} \st \block \in \mathcal{B}, i \in [\abs{\block}]\} \cup_{\block \in \mathcal{B}} \{X_{\block, i}X_{\block, j} \st i, j \in [\abs{\block}], i \neq j\}
% \mod X_{\block_1, 1}^2 - X_{\block_1, 1} \cdots \mod X_{\block_k, \abs{\block_k}}^2 - X_{\block_k, \abs{\block_k}} \mod X_{b_1, 1} \cdot X_{b_1, 2}\cdots \mod X_{\block_1, \abs{\block_1} -1} \cdot X_{\block, \abs{\block_1}}\cdots \mod X_{\block_k, 1} \cdot X_{\block_k, 2} \cdots \mod X_{\block_k, \abs{\block_k} - 1}\cdot X_{\block_K, \abs{\block_k}}.
\]
\end{Definition}
\subsection{Correctness}
\begin{Theorem}\label{theorem:bi-approx-rpoly-bound}
For any query polynomial $\poly(\vct{X})$, an approximation of $\rpoly(\prob_1,\ldots, \prob_\numvar)$ in the $\bi$ setting can be computed in $O\left(\treesize(\etree) + \frac{\log{\frac{1}{\conf}}\cdot \abs{\etree}^2(1,\ldots, 1)}{\error^2\cdot\rpoly^2(\prob_1,\ldots, \prob_\numvar)}\right)$, with multiplicative $(\error,\delta)$-bounds, where $k$ denotes the degree of $\poly$.
\end{Theorem}
\begin{proof}[Proof of Theorem~\ref{theorem:bi-approx-rpoly-bound}]
By the proof of \Cref{lem:approx-alg}, with a minor adjustment on $\evalmp$, such that we define the function to output $0$ for any monomial containing disjoint (same-block) variables, coupled with the fact that additional operations in \Cref{alg:bi-mon-sam} are $O(1)$ occurring at most $k$ times for each of the $\numsamp$ samples, the proof of \Cref{theorem:bi-approx-rpoly-bound} immediately follows.
\end{proof}
\qed
\subsection{Safe Query Class for $\bi$}
We want to analyze what is the class of queries and data restrictions that are necessary to guarantee that $\frac{\abs{\etree}(1,\ldots, 1)}{\rpoly(\prob_{1},\ldots, \prob_{\numvar})}$ is $O(1)$.
\subsubsection{When $\rpoly$ is zero}
First, consider the case when $\rpoly$ cancels out all terms in $\poly$, where $\poly \neq \emptyset$. For $\rpoly$ to cancel out a tuple $\tup$, by \Cref{def:bi-alg-rpoly} it must be the case that output tuple $\tup$ is dependent on two different tuples appearing in the same block. For this condition to occur, it must be that the query $\poly$ contains a self join operation on a table $\rel$, from which $\tup$ has been derived.
Certain conditions on both the data and query must exist for all tuples $\tup$ to be cancelled out by $\rpoly$ as described above.
For $\rpoly$ to be $0$, the data of a $\bi$ must satisfy certain conditions.
\begin{Definition}[Data Restrictions]\label{def:bi-qtilde-data}
Consider $\bi$ table $\rel$. For $\rpoly$ to potentially cancel all its terms, $\rel$ must be such that given a self join, the join constraints remain unsatisfied for all tuple combinations $x_{\block_i, \ell} \times x_{\block_j, \ell'}$ for $i \neq j$, $\ell \in [\abs{\block_i}], \ell' \in [\abs{\block_j}]$, i.e., combinations across different blocks. Note that this is trivially satisfied with a $\rel$ composed of just one block. Further, it must be the case that the self join constraint is only satisfied in one or more cross-term combinations $x_{\block, i} \times x_{\block, j}$ for $i \neq j$, i.e., within the same block of the input data.
\end{Definition}
To be precise, only equijoins are considered in the following definition. Before proceeding, note that a natural self join will never result in $\rpoly$ cancelling all terms, since it is the case that each tuple will necessarily join with itself, and $\rpoly$ will not mod out this case. Also, although we are using the term self join, we consider cases such that query operations over $\rel$ might be performed on each join input prior to the join operation. While technically the inputs may not be the same set of tuples, this case must be considered, since all the tuples originate from the table $\rel$. To this end, let $\poly_1(\rel) = S_1$ and $\poly_2(\rel) = S_2$ be the input tables to the join operation.
\begin{Definition}[Class of Cancelling Queries]\label{def:bi-qtilde-query-class}
When \Cref{def:bi-qtilde-data} is satisfied, it must be that $\poly$ contains a join $S_1 \bowtie_\theta S_2$ such that either% that satisfies the following constraints based on its structure.
\textsc{Case 1:} $S_1 \cap S_2 = \emptyset$
%Any join over this structure will produce a $\poly$ such that $\rpoly$ cancels all monomials out.
%Such a condition implies $\rpoly$ is $0$ regardless of join condition $\theta$. Note the beginning premise of this definition, and the fact that such premise rules out the natural join across all attributes, since we would have that $\poly = \rpoly = 0$.
Or
\textsc{Case 2:} $S_1 \cap S_2 \neq \emptyset$, the attributes in the join predicate are non-matching, i.e., neither operand of the comparison is a strict subset of the other, and no input tuple has agreeing values across the join attributes.
%\begin{enumerate}
% \item When the join condition $\theta$ involves equality between matching attributes, it must be that the attributes of the join conditon $\attr{\theta}$ are a strict subset of $\attr{\rel}$. Then, to satisfy \Cref{def:bi-qtilde-data} it must be that the join input consists of non-intersecting strict subsets of $\rel$, meaning $S_1 \cap S_2 = \emptyset$ and $S_1, S_2 \neq \emptyset$. $\poly_1$ in \Cref{ex:bi-tildeq-0} illustrates this condition.
% \item If $\theta$ involves an equality on non-matching attributes, there exist two cases.
% \begin{enumerate}
% \item The first case consists of when the join inputs intersect, i.e., $S_1 \cap S_2 \neq \emptyset$ . To satisfy \Cref{def:bi-qtilde-data} it must be the case that no tuple can exist with agreeing values across all attributes in $\attr{\theta}$. $\poly_3$ of \Cref{ex:bi-tildeq-0} demonstrates this condition.
% \item The second case consists of when $S_1 \cap S_2 = \emptyset$ and $S_1, S_2 \neq \emptyset$ in the join input, and this case does not contradict the requirements of \Cref{def:bi-qtilde-query-class}. This case is illustrated in $\poly_2$ of \Cref{ex:bi-tildeq-0}.
% \end{enumerate}
%\end{enumerate}% , cause $\rpoly$ to be $0$ must have the following characteristics. First, there must be a self join. Second, prior to the self join, there must be operations that produce non-intersecting sets of tuples for each block in $\bi$ as input to the self join operation.
\end{Definition}
In \Cref{ex:bi-tildeq-0}, $\poly_1$ and $\poly_2$ are both examples of \textsc{Case 1}, while $\poly_3$ is an example of \textsc{Case 2}.
\begin{Theorem}\label{theorem:bi-safe-q}
When both \Cref{def:bi-qtilde-data} and \Cref{def:bi-qtilde-query-class} are satisfied, $\rpoly$ cancels out all monomials.
\end{Theorem}
\begin{proof}[Proof of Theorem~\ref{theorem:bi-safe-q}]
Starting with the case that $S_1 \cap S_2 = \emptyset$. When this is the case, by definition, all joins on tuples in $S_1$ and $S_2$ will involve elements in $S_1 \times S_2$ such that both tuples are distinct. Further, \Cref{def:bi-qtilde-data} rules out joins across different blocks, while calling for joins of the above form within the same block. Thus all tuples in the query output are dependent on more than one tuple from the same block, thus implying by \Cref{def:bi-alg-rpoly} that $\rpoly$ will cancel all monomials.
For the next case where $S_1 \cap S_2 \neq \emptyset$, note that there exists at least one tuple in both $S_1$ and $S_2$ that is the same. Therefore, all equijoins involving matching attributes will produce at least one self joined tuple in the output, breaking the last property of \Cref{def:bi-qtilde-data}. For the case of equijoins with predicates involving non-matching attribute operands, note that by definition of equijoin, the only case that a tuple shared in both $S_1$ and $S_2$ can join on itself is precisely when that tuple's values agree across all the join attributes in $\theta$. Thus, it is the case that when $S_1 \cap S_2 \neq \emptyset$ and the join predicate involves equality comparison between non-matching attributes such that the values of the non-matching comparison attributes for each tuple in $\{S_1 \cap S_2\}$ do not agree, we have that \Cref{def:bi-qtilde-data} is not contradicted, and when \Cref{def:bi-qtilde-data} is fulfilled, it must be the case that $\poly \neq 0$ while $\rpoly = 0$.
This concludes the proof.
\end{proof}
\qed
Note then that the class of queries described in \Cref{def:bi-qtilde-query-class} belongs to the set of queries containing some form of selection over a self cross product.
%\begin{proof}[Proof of Lemma ~\ref{lem:bi-qtilde-data}]
%\end{proof}
%\begin{proof}[Proof of Lemma ~\ref{lem:bi-qtilde-query-class}]
%\end{proof}
%%%%%%%%%%%%%%%%%%%%%%%
%The condition that causes $\rpoly(\prob_1,\ldots, \prob_\numvar)$ to be $0$ is when all the output tuples in each block cancel each other out. Such occurs when the annotations of each output tuple break the required $\bi$ property that tuples in the same block must be disjoint. This can only occur for the case when a self-join outputs tuples each of which have been joined to another tuple from its block other than itself.
%
%The observation is then the following. In order for such a condition to occur, we must have a query that is a self-join such that the join is on two different sets of atoms for each block. This condition can occur when inner query operations with different constraints on input table $\rel$ produce two non-intersecting sets of tuples and then performs a self join on them, such that the join condition \textit{only} holds for tuples that are members of the same block.
%
%There are two operators that can produce the aforementioned selectivity. First, consider $\sigma$, where two different selection conditions $\theta_1$ and $\theta_2$ over $\rel$ can output sets $S_{\sigma_{\theta_1}}$ and $S_{\sigma_{\theta_2}}$ where $S_{\sigma_{\theta_1}} \cap S_{\sigma_{\theta_2}} = \emptyset$. A join over these two outputs can produce an ouput $\poly$ where all annotations will be disjoint and $\rpoly$ will effectively cancel them all out. Second, consider the projection operator $\pi$, such that projections over $\rel$ which project on different attributes can output two non-intersecting sets of tuples, which when joined, again, provided that the join condition holds only for tuples appearing in the same block, can output tuples all of which will break the disjoint requirement and $\rpoly$ will cancel them out.
\begin{Example}\label{ex:bi-tildeq-0}
Consider the following $\bi$ table $\rel$ consisting of one block, with the following queries $\poly_1 = \sigma_{A = 1}(\rel)\bowtie_{B = B'} \sigma_{A = 2}(\rel)$, $\poly_2 = \sigma_{A = 1}(\rel)\bowtie_{A = B'} \sigma_{A = 2}(\rel)$, and $\poly_3 = \rel \bowtie_{A = B} \rel$. While the output $\poly_i \neq \emptyset$, all queries have that $\rpoly_i = 0$. Since $\rel$ consists of only one block, we will use single indexing over the annotations.
\end{Example}
\begin{figure}[ht]
\begin{tabular}{ c | c c c }
\rel & A & B & $\phi$\\
\hline
& 1 & 2 & $x_1$\\
& 2 & 1 & $x_2$\\
& 1 & 3 & $x_3$\\
& 3 & 1 & $x_4$\\
\end{tabular}
\caption{Example~\ref{ex:bi-tildeq-0} Table $\rel$}
\label{fig:bi-ex-table}
\end{figure}
%%%%%%%%%%Query 1 and 2
\begin{figure}[ht]
\begin{subfigure}{0.2\textwidth}
\centering
\begin{tabular}{ c | c c c }
$\sigma_{\theta_{A = 1}}(\rel )$& A & B & $\phi$\\
\hline
& 1 & 2 & $x_1$\\
& 1 & 3 & $x_3$\\
\end{tabular}
\caption{$\poly_1, \poly_2$ First Selection}
\label{subfig:bi-q1-sigma1}
\end{subfigure}
\begin{subfigure}{0.2\textwidth}
\centering
\begin{tabular}{ c | c c c}
$\sigma_{\theta_{A = 2}}(\rel)$ & A & B' & $\phi$\\
\hline
& 2 & 1 & $x_2$\\
\end{tabular}
\caption{$\poly_1, \poly_2$ Second Selection}
\label{subfig:bi-q1-sigma2}
\end{subfigure}
\begin{subfigure}{0.25\textwidth}
\centering
\begin{tabular}{ c | c c c c c}
$\poly_1(\rel)$ & $A_R$ & $B_R$ & $A_{\rel'}$ & $B_{\rel'}$ & $\phi$\\
\hline
& 1 & 2 & 2 & 1 & $x_1x_2$\\
\end{tabular}
\caption{$\poly_1(\rel)$ Output}
\label{subfig:bi-q1-output}
\end{subfigure}
\begin{subfigure}{0.4\textwidth}
\centering
\begin{tabular}{ c | c c c c c}
$\poly_2(\rel)$ & $A_R$ & $B_R$ & $A_{\rel'}$ & $B_{\rel'}$ & $\phi$\\
\hline
& 1 & 2 & 2 & 1 & $x_1x_2$\\
& 1 & 3 & 2 & 1 & $x_2x_3$\\
\end{tabular}
\caption{$\poly_2(\rel)$ Output}
\label{subfig:bi-q2-output}
\end{subfigure}
\caption{$\poly_1, \poly_2(\rel)$}
\label{fig:bi-q1-q2}
\end{figure}
%%%%%%%%%%%Query 3
\begin{figure}[ht]
% \begin{subfigure}{0.2\textwidth}
% \centering
% \begin{tabular}{ c | c c }
% $\pi_{A}(\rel)$ & A & $\phi$\\
% \hline
% & 1 & $x_1$\\
% & 2 & $x_2$\\
% & 1 & $x_3$\\
% & 3 & $x_4$\\
% \end{tabular}
% \caption{$\poly_3$ First Projection}
% \label{subfig:bi-q3-pi1}
% \end{subfigure}
% \begin{subfigure}{0.2\textwidth}
% \centering
% \begin{tabular}{ c | c c }
% $\pi_{B}(\rel)$ & B & $\phi$\\
% \hline
% & 2 & $x_1$\\
% & 1 & $x_2$\\
% & 3 & $x_3$\\
% & 1 & $x_4$\\
% \end{tabular}
% \caption{$\poly_3$ Second Projection}
% \label{subfig:bi-q3-pi2}
% \end{subfigure}
\begin{subfigure}{0.2\textwidth}
\centering
\begin{tabular}{ c | c c c c c }
$\poly_3(\rel)$ & A & B & $A_{\rel'}$ & $B_{\rel'}$ & $\phi$\\
\hline
& 1 & 2& 2 & 1 & $x_1x_2$\\
		 & 1 & 2 & 3 & 1 & $x_1x_4$\\
& 2 & 1 & 1 & 2 & $x_1x_2$\\
& 1 & 3 & 2 & 1 & $x_2x_3$\\
& 1 & 3 & 3 & 1 & $x_3x_4$\\
& 3 & 1 & 1 & 3 & $x_3x_4$\\
\end{tabular}
\caption{$\poly_3(\rel)$ Output}
\label{subfig:bi-q3-output}
\end{subfigure}
\caption{$\poly_3(\rel)$}
\label{fig:bi-q3}
\end{figure}
Note that \Cref{subfig:bi-q1-output,subfig:bi-q2-output,subfig:bi-q3-output} each contain a set of tuples whose annotations consist of cross terms from the same block, and by \Cref{def:bi-alg-rpoly} $\rpoly$ will eliminate all tuples output in the respective queries.
\subsubsection{When $\rpoly > 0$}
\par\AH{General Case and Sufficient Condition for $\bi$ and $\rpoly_{\bi}$ approx alg needs to be written.}
\paragraph{General Case}
Consider the query $\poly = \sum_{i = 1}^{\numvar}x_i$, analogous to a projection where all tuples match on the projected set of attributes, meaning $\tup_i[A] = \tup_j[A]$ for $i, j \in [\numvar]$ such that $i \neq j$. When $\numvar$ grows unboundedly, $\abs{\etree}(1,\ldots, 1) = \numvar$. We assume that the sum of the probabilities of all $\numvar$ tuples in the block remains constant as $\numvar$ grows. Thus, we have that $\frac{\abs{\etree}(1,\ldots, 1)}{\rpoly(\vct{\prob})} = \frac{n}{c}$ for some constant $c$, and this implies $O(\numvar)$ growth.
% while $\rpoly(\vct{\prob}) \leq 1$, which implies that the ratio is linear, i.e., $\frac{\abs{\etree}(1,\ldots, 1)}{\rpoly(\vct{p})} = \frac{\numvar}{\numvar \cdot \prob_0} = \frac{1}{\prob_0}$ for $\prob_0 = min(\vct{\prob})$. However, note that for $\numvar \rightarrow \infty$ it is the case that $\prob_0 \rightarrow 0$, and as $\numvar$ grows, so does $\frac{1}{\prob_0}$. Intuitively, consider when $p_0 = \frac{1}{\numvar}$. Then we know that the bound is $\frac{\numvar}{1}$ which is $O(\numvar)$.
\paragraph{Sufficient Condition for $\bi$ to achieve linear approximation}
Consider the same query $\poly = \sum_{i = 1}^{\numvar}x_i$, but this time conditioned on a fixed block size which we denote $\abs{\block}$. Then it is the case that $\abs{\etree}(1,\ldots, 1) = \numvar$, but if we assume that all blocks have a sum of probabilities equal to $1$, $\rpoly(\vct{\prob}) = \frac{\numvar}{\abs{\block}}$, and this means that $\frac{\abs{\etree}(1,\ldots, 1)}{\rpoly(\vct{\prob})} = \frac{\numvar}{\frac{\numvar}{\abs{\block}}} = \abs{\block}$. For the general case when all blocks do not have the property that the sum of the probabilities of the alternatives equals $1$, we can lower bound the sum of probabilities as $\frac{\numvar}{\abs{\block}} \cdot \prob_0$ for $\prob_0 = \min(\vct{\prob})$. Note that in $\numvar \cdot \frac{\prob_0}{\abs{\block}}$, $\frac{\prob_0}{\abs{\block}}$ is indeed a constant, and this gives an overall ratio of $O(1)$ as $\numvar$ increases.

View File

@ -0,0 +1,167 @@
----------------------- REVIEW 1 ---------------------
SUBMISSION: 61
TITLE: Standard Operating Procedure in PDBs Considered Harmful (for bags)
AUTHORS: Su Feng, Boris Glavic, Aaron Huber, Oliver Kennedy and Atri Rudra
----------- Overall evaluation -----------
SCORE: 3 (strong accept)
----- TEXT:
Assume a probabilistic database (PDB) where every tuple is annotated with a random variable whose domain is the set of natural numbers. That is, the possible worlds of this PDB represent the different combinations of multiplicities of the tuples. As in the classic tuple-independent setting, it is assumed that the random variables are pairwise independent.
The problem investigated is to compute the expected multiplicity of every result tuple.
The paper examines the intensional approach to query evaluation that translates to computing the expectation of the lineage polynomial.
If the polynomial is given in sum-of-product form, then applying linearity of expectation along with the assumption that the RVs are pairwise independent allows to compute the expectation in time that is linear in the size of the lineage.
The main question considered by the authors is whether there is an efficient algorithm (for computing the expectation) when the lineage is given in compressed form.
The paper proves several results: hardness of computing the expectation when the lineage is in compressed form, a (restricted) generalization of the former when all RVs have the same probability, and a linear-time approximation algorithm with a multiplicative approximation factor. The proofs are elegant and use recent results and conjectures in fine-grained-complexity. The use of polynomial algebra to process the lineage and prove equivalences in this setting is novel. Finally, the exposition is good, with many examples providing intuition. Therefore, I recommend acceptance.
RESULTS:
The first result is a proof showing that if the expectation of the lineage can be computed in linear time in the size of its compressed form, then it is possible to count the number of k-matchings in a graph in time O(k^3). Since counting k-matchings is #W[1]-hard (when parameterized by k) then this proves the hardness of the compressed evaluation.
The technique used is common in the PDB literature (i.e., Dalvi and Suciu 2008) where the PDB is used to encode a graph, and query evaluation is used to construct a system of linear equations represented by a Vandermonde matrix, which can be solved in polynomial time.
The authors then generalize this result to the case where all RVs have the same probability by using the triangle detection hypothesis stating that counting triangles takes time at least \Omega(|E|^{4/3}).
Comments:
1. The paper would benefit if the transition from N-PDB to N[X]-PDB (while proved in the appendix) were explained in the paper because the results that follow rest on the correctness of this encoding.
2. Conjecture 3.2: the fine-grained complexity bound refers only to triangle counting, and not the P4-path and the 3-matching. It is confusing that these are mentioned in the conjecture.
Minor Comments:
Section 2.2.1 - mention that \pi_{sch} refers to the schema of R_1
Definition 2.11 - poly(T) is not yet defined at this point (defined in def. 4.2)
Lemma 3.8: p^4 is missing on the third term of the polynomial.
----------- Reviewer's confidence -----------
SCORE: 3 ((medium))
----------------------- REVIEW 2 ---------------------
SUBMISSION: 61
TITLE: Standard Operating Procedure in PDBs Considered Harmful (for bags)
AUTHORS: Su Feng, Boris Glavic, Aaron Huber, Oliver Kennedy and Atri Rudra
----------- Overall evaluation -----------
SCORE: -2 (reject)
----- TEXT:
From what I can tell: This paper studies the problem of computing, in the context of probabilistic DBs, the marginal probability of a result tuple, under bag semantics. While this problem is tractable, when the lineage polynomial is presented in a typical representation (sum-of-products), there exist compressed representations that are relevant, and the paper focuses on studying the problem with respect to these compressed representations. While a negative result is shown, a positive result, an approximation algorithm, is also given.
I had an extreme amount of difficulty reading the paper, to the point where I am not sure of the meaning of notions that seem basic to the paper's aims. As a result of this, I don't think that the paper can be accepted in the present form.
Below are some examples of places where I had difficulty.
A number of these refer to the introduction. To be clear, I don't expect the introduction to be completely rigorous mathematically, but where not rigorous, it should give useful hints that drive the reader towards understanding.
- Pg 1, "computing the expectation of the lineage, which under bag semantics is a polynomial". Make clear, what is a polynomial: the expectation or the lineage? Also, a polynomial in what?
- Pg 1, "These compression schemes are analogous to typical...optimizations like projection push-down". What is the analogy? What is projection push-down? And why is 27 an appropriate reference for it, did they invent it?
- Pg 1, "this algorithm only has a constant factor overhead". What does that mean?
- Pg 2, Example 1.2. I read that the lineage of the result in a Bag-PDB is a polynomial formula. What is a polynomial formula?.
- Pg 2, "observe that the clauses...are neither independent nor disjoint". What does it mean for clauses to be independent or disjoint?
Same sentence: what is the Shannon decomposition, and what does it mean for it to be "at worst exponential" in the size of the input?
Also, what are clauses of a polynomial? Here, you say "of the SOP polynomial": of what SOP polynomial?
- Pg 2, Col 2, "further interesting feature". Can you say more about why this is interesting?
- Pg 2, Col 2, on dropping subscript from Q_{bag}: This is very confusing, not in the least because the first usage of Q after you say this seems to refer to a query.
I think this overloading leads to a ton of confusion, indeed, throughout the paper, one sees (for example) Q(W_a,W_b,W_c), Q(\Omega), Q(X), Q(D), Q(W), Q(), and each time the reader potentially has to invoke some sort of search over the possible meanings to determine the actual meaning.
- Pg 2, Col 2, "Cartesian product of Q". What do you mean by Cartesian products of queries?
- Pg 3, Col 1, central question of this paper: I'm not sure how to interpret this given what comes before it. The central question refers to "the compressed lineage polynomial", but I do not know what this is, and miss a definition. The beginning of Section 1.2 says some vague-sounding things about compressed representations: 'For an arbitrary polynomial, it is known that there may exist equivalent compressed representations. One such compression is the factorized polynomial...', but I don't see any real definition.
- Sect 1.3, first sentence. In (i), did you define TIDB? (This is mentioned in Ex 1.1, but is it defined anywhere?) Also, you say "over a bag-TIDB"; do you mean over one particular bag-TIDB, or is the bag-TIDB part of the input?
- Defs. 2.2 and 2.3. Do you need to state that each c_i in Def. 2.2 is nonzero in order so that the degree in Def. 2.3 is defined correctly?
- Def. 2.11. Is "poly" defined anywhere?
Right after Def. 2.11, an example of a set ET(Q(X)) is given, but the elements of the set do not appear to be binary trees.
- Sect. 3, first sentence. You say 'project-join query', earlier you used the term 'UCQ': why the inconsistency?
- Sect. 3.1, beginning. You say "fixed graph H", but how is this graph fixed? (It varies in, say, Theorem 3.1.) Also, why do you suddenly switch to saying "pattern" after that, also, what is an occurrence of a pattern?
- Thms. 3.1 and 3.4. What is the parameterization? (Without stating this, there is no way to understand what the #W[1]-hardness refers to.)
----------- Reviewer's confidence -----------
SCORE: 4 ((high))
----------------------- REVIEW 3 ---------------------
SUBMISSION: 61
TITLE: Standard Operating Procedure in PDBs Considered Harmful (for bags)
AUTHORS: Su Feng, Boris Glavic, Aaron Huber, Oliver Kennedy and Atri Rudra
----------- Overall evaluation -----------
SCORE: 2 (accept)
----- TEXT:
This paper investigates the problem of computing the expected number of times a tuple is produced as a result of a bag-semantics query evaluation over probabilistic databases (BIDs). Though the computation of this expected value can be done in linear time on the monomial expansions of the provenance, it is observed that this is not the case on factorized representations of the provenance. This is formalized by a (conditional) super-linear hardness result, through some quite technical reductions from counting patterns in graphs. A scheme for approximate computation is proposed, which is linear-time for some specific settings, including when the input database is a TID.
This is a sound and novel paper, and the attention given to probabilistic counting is interesting. It is generally quite well-written.
I am not a huge fan of the title of the paper, which does not give a good overview of what the paper is about. At the very least, I would urge the authors to add the "(for bags)" to the actual title and not just subtitle of their papers, to avoid any metadata issue with the paper -- the notion of bags is quite crucial in this paper.
The introduction states that the assumption that input tuples have cardinality 0 or 1 is done without loss of generality, but it seems to actually have quite a strong impact, since the assumption that E[X]=Pr(X=1) is used in a number of places. This should be discussed in more depth.
Section 2.2.1, maybe replace the \sum symbol with \bigoplus?
Definition 2.11: at this point, poly has not been defined. ET does not really seem well defined. Contrarily to the example that follows, there are infinitely many trees that evaluate to the same polynomial (you can always have things like X-X+X-X+X-X...).
Before Section 3.1, when mentioning for the first time "these conjectures hold", there is no conjecture mentioned at this point.
Lemma 3.8, factor p⁴ missing in the monomial with the 2-matchings.
Typos:
- (page 1) "the the first linear"
- (page 3) extraneous space after "is #W[1]-hard"
- (page 4) something wrong with the formatting of the subscript in footnote 4
- (page 5) "demonstrate Section 3.3"
- (page 6) Main verb missing in the sentence after Conjecture 3.2
- (page 7) "that this problem" -> does not parse
- (page 7) "Analogusly"
- (page 9) The statement of Corollary 4.9 has an extraneous closing parenthesis
- (page 11) "So far focused" -> "So far we focused"
- (page 11) "to e.g. to"
----------- Reviewer's confidence -----------
SCORE: 5 ((expert))
----------------------- REVIEW 4 ---------------------
SUBMISSION: 61
TITLE: Standard Operating Procedure in PDBs Considered Harmful (for bags)
AUTHORS: Su Feng, Boris Glavic, Aaron Huber, Oliver Kennedy and Atri Rudra
----------- Overall evaluation -----------
SCORE: 0 (borderline paper)
----- TEXT:
The paper addresses the problem of computing the expected multiplicity of a tuple in the result of evaluating a query on a probabilistic databases under bag semantics. The paper shows that while this problem is hard, one can compute efficiently an additive approximation.
Strong points:
The problem seems interesting and the results seem to require non-trivial effort
Weak points:
The presentation is hard to follow, in particular the introduction. There were basic definitions that were unclear to me.
Detailed comments:
Introduction - I find the structure of the introduction almost impossible to follow as it is built on things that are only defined in the body of the paper. I read it several times and returned to it after reading the whole paper and there are still things that are vague to me. I would prefer a shorter introduction that gives an overview of the results and the techniques used to obtain them with much fewer technical details.
Main question - I did not understand why the main question is "Is it always the case that the expectation of a UCQ in a
Bag-PDB can be computed in time linear in the size of the compressed lineage polynomial?"
Why is the complexity measured with respect to the size of the compressed lineage polynomial and what is the complexity required to compute this polynomial from the input? You discuss close issues on 5.1.2 but it is still not clear to me.
Also, is reduced polynomial and compressed polynomial refer to the same notion? If so, why do we need both?
Definition 2.2.2 - It would have been useful to elaborate more here on the correspondence between the original PDB and the N[X}-PDB. This is a key element and without it it is impossible to understand why the main problem is indeed the one presented in definition 2.12.
Definition 2.5 - I did not understand the intuitive explanation. Can you explain why this is the case?
Section 2.1: you use both multi-sets and bags for the same notion
Theorem 3.4: the index of p_i in the reduced polynomial are wrong (should be p_0, \cdots, p_{2k})
----------- Reviewer's confidence -----------
SCORE: 2 ((low))

View File

@ -0,0 +1,450 @@
\documentclass[sigconf,9pt]{acmart}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% COMMENTS
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% \newcommand{\BG}[1]{\todo[inline]{\textbf{Boris says:$\,$} #1}}
% \newcommand{\SF}[1]{\todo[inline]{\textbf{Su says:$\,$} #1}}
% \newcommand{\OK}[1]{\todo[inline]{\textbf{Oliver says:$\,$} #1}}
% \newcommand{\AH}[1]{\todo[inline]{\textbf{Aaron says:$\,$} #1}}
%\newcommand{\comment}[1]{}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% ACMART settings
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\setcopyright{none}
\settopmatter{printacmref=false, printccs=false, printfolios=false}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% PACKAGES
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\usepackage{comment}
\usepackage{amsmath}
%\usepackage{amssymb}
% \let\proof\relax
% \let\endproof\relax
\usepackage{amsthm}
\usepackage{mathtools}
\usepackage{etoolbox}
\usepackage{url}
\def\UrlBreaks{\do\/\do-}
\usepackage{hyperref}
\hypersetup{breaklinks=true}
\usepackage{stmaryrd}
\usepackage[normalem]{ulem}
\usepackage{subcaption}
\usepackage{booktabs}
\usepackage{graphicx}
\usepackage{listings}
\usepackage{fancyvrb}
\usepackage{caption}
% \usepackage{subcaption} % duplicate: already loaded above
\usepackage{braket}
\usepackage[inline]{enumitem}
\usepackage{xspace}
\usepackage{colortbl}
\usepackage{hyphenat}
% \usepackage{bbold}
%\usepackage[breaklinks]{hyperref}
%\allowdisplaybreaks
\usepackage{multirow}
%\usepackage{makecell}
\usepackage{cleveref}
% \usepackage{footnote}
% \makesavenoteenv{tabular}
\usepackage{todonotes}
%\usepackage[disable]{todonotes}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Colors
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\definecolor{black}{rgb}{0,0,0}
\definecolor{grey}{rgb}{0.8,0.8,0.8}
\definecolor{red}{rgb}{1,0,0}
\definecolor{green}{rgb}{0,1,0}
\definecolor{darkgreen}{rgb}{0,0.5,0}
\definecolor{darkpurple}{rgb}{0.5,0,0.5}
\definecolor{darkdarkpurple}{rgb}{0.3,0,0.3}
\definecolor{blue}{rgb}{0,0,1}
\definecolor{shadegreen}{rgb}{0.95,1,0.95}
\definecolor{shadeblue}{rgb}{0.95,0.95,1}
\definecolor{shadered}{rgb}{1,0.85,0.85}
\definecolor{shadegrey}{rgb}{0.85,0.85,0.85}
\definecolor{oddRowGrey}{rgb}{0.80,0.80,0.80}
\definecolor{evenRowGrey}{rgb}{0.85,0.85,0.85}
%%%%% workaround for citations spanning multiple pages breaking pdflatex
%%%%% see: https://tex.stackexchange.com/questions/1522/pdfendlink-ended-up-in-different-nesting-level-than-pdfstartlink
% \usepackage{etoolbox} % duplicate: already loaded above
% \makeatletter
% \patchcmd\@combinedblfloats{\box\@outputbox}{\unvbox\@outputbox}{}{\errmessage{\noexpand patch failed}}
% \makeatother
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% DOCS
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\title{Revision}
\subtitle{Efficient Uncertainty Tracking for Complex Queries with Attribute-level Bounds}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Include information below and uncomment for camera ready
\definecolor{GrayRew}{gray}{0.85}
\newcommand{\RCOMMENT}[1]{\medskip\noindent \begin{tabular}{|p{\linewidth-3ex}}\rowcolor{GrayRew} #1 \end{tabular}\smallskip\\}
\newcommand{\MREV}[1]{\noindent {\color{blue}{#1}}}
\newcommand{\MREVO}[1]{\noindent {\color{darkgreen}{#1}}}
\newcommand{\FillInValue}{\textbf{\underline{XXX}}\xspace}
% REVISION COLORING
\definecolor{revgreen}{rgb}{0,0.5,0}
\newrobustcmd{\reva}[1]{\textcolor{blue}{{#1}}}
\newrobustcmd{\revb}[1]{\textcolor{revgreen}{{#1}}}
\newrobustcmd{\revc}[1]{\textcolor{magenta}{{#1}}}
\newrobustcmd{\revm}[1]{\textcolor{red}{{#1}}}
\newcommand{\todoMarker}{\textcolor{red}{TODO}}
\newcommand{\todoC}[1]{\textcolor{red}{TODO[{#1}]}}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% ALLOW REFERENCES TO MAIN DOCUMENT
\usepackage{xcite}
\usepackage{xr-hyper}
\externalcitedocument{./main}
%\externaldocument[techrep-]{../techreport}
\externaldocument{../main}
\begin{document}
\maketitle
We thank the reviewers for their detailed reviews and constructive suggestions. Our changes in the paper are highlighted:
\reva{blue} for changes addressing comments of reviewer 1, \revb{green} for changes addressing comments of reviewer 2, \revc{magenta} for changes addressing comments of reviewer 3, and \revm{red} for general changes or changes addressing comments from more than one reviewer.
We do not highlight fixed typos and deletions to improve readability.
% This revision report explains in detail how we
% addressed the comments and edited the paper.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section*{Meta Reviewer}\label{sec:meta-reviewer}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\RCOMMENT{
\textbf{R1} We look forward to a revision that incorporates the authors responses in the rebuttal in the final revised version. This set of promised revisions was agreed to be a minimal acceptable set of revisions and the reviewers have faith that the authors will deliver. A major concern is how dense the paper is and the lack of space to provide further details. After discussion, the reviewers agreed to leave it to the authors to find the right balance and given the limited space, the reviewers hope the authors can address all the comments as much as they can.
}
We addressed the ``opportunity for improvement'' sections of all reviewers and added all promised changes while keeping as much essential content as possible. Please see detailed responses addressing individual reviews below.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Reviewer 1}
\label{sec:reviewer-1}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\RCOMMENT{
\textbf{Response to author feedback}
The author feedback is convincing in that I believe the authors will be able to handle my comments in a revision
\textbf{List required changes for a revision}
opportunities for improvement (e.g., O1, O3, O6).
Please clean up the model and add all details requested in my comments in the "opportunities for improvement" part.
}
Thank you for the detailed comments, we explain how we have addressed each of your concerns in the following.
%%%%%%%%%%%%%%%%%%%%
\RCOMMENT{
\textbf{O1} Section 7 claims to define a query semantics for RA but it only focuses on selection. Please include all operators.
}
Note that $\uaaN$ is a semiring and standard K-relational query semantics for $\raPlus$ preserves bounds on relations over $\db$ annotated with $\uaaN$. As we note at the start of Section 7, this is also true for relations over range-annotated values, with the exception of selection. Since the other operators have standard K-relational semantics, we did not show their definitions in Section 7 (they are the same as in Section 3.1). We now state more clearly in Section 7 that $\uaaN$ is a semiring and standard K-relational query semantics applies.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\RCOMMENT{
\textbf{O2} Specifically, is there a way to handle joins that would not often degenerate into cartesian products? With C-tables we still need to maintain all combinations but at least we can recover the join condition; here it seems that we both "pay" in impreciseness caused by lose the predicates and "pay" in size. There is a discussion of join optimizations in p10 but I think it is too late and too terse, and I propose the authors include both the basic join construction and the optimization in Section 7 along with examples.
}
% In the variant of C-tables where variables are allowed as attribute values (as defined in~\cite{DBLP:journals/jacm/ImielinskiL84}), joins may degenerate to cartesian products. For instance, if we are joining two tables on attributes whose values are unconstrained variables (no restrictions on the values of the variables through global constraints allow us to filter out any join results).
Without our optimization, joins over AU-DBs may degenerate to cross-products in the worst-case (when the bounds on the join attribute values of all tuples from both input tables overlap). The join optimization factors out attribute-level uncertainty and possible answers from the SGW + under-approximation of the certain answer. The cross product is restricted to the (ideally already small) possible part of both inputs, which we can further compress in a bound preserving way (albeit at a loss of accuracy). The larger SGW part can be joined using a regular join operation with the join condition from the user's query. We moved the discussion on join optimizations to section 7.1 and added more details. Furthermore, we have added additional experiments evaluating performance for queries with multiple joins (Figure 14) when controlling the aggressiveness of compression.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\RCOMMENT{
\textbf{O3} Similarly, the formal treatment of aggregates (Section 8) should be improved. I see how it may work for K=N, but the generalization to semirings is deferred to the full version and is unclear. Specifically, the model heavily uses the $*_{M}$ function, that essentially embeds elements of the semi-module back into the monoid. It is not obvious to me that it makes sense to have such an operation for every semiring and monoid: what if the semiring is access control/tropical from the PODS '07 semiring paper? How would such an embedding make sense there?
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\RCOMMENT{
\textbf{O4}
Continuing O3, I think the crucial definition 15 can only be bound-preserving if one makes some monotonicity assumptions on the $*_{M}$ operation. These assumption should be spelled out. Again monotonicity makes sense if K=N (but even then, it must be stated as an assumption), but otherwise I'm not sure: first, there is the issue in O1 of how *{M} would even work; then, what if K is only partially ordered? Etc.
One solution is of course to restrict the whole construction to K=N, i.e. bag semantics. The authors indeed include a paragraph saying that the treatment of other semirings is deferred to the full version. But this hurts the generality of the contribution, so I do hope the authors can do better and formalize their assumptions in a more general fashion than that within the conference paper.
}
We apologize for not clarifying this better. As shown in [6], indeed $*_M$ is only well-behaved for certain combinations of semirings and aggregation monoids. The solution in [6] is to use monoids whose domains are symbolic (bags of pairs of monoid elements and semiring elements). However, these tensors do not correspond to regular monoid elements in the general case. It may be possible to allow such symbolic expressions as bounds, but may require storing expressions as large as the input (e.g., aggregation without group-by), which are probably meaningless to a user. Definition 15 should have been limited to the aggregation monoids mentioned earlier: SUM, MIN, MAX. We have fixed this definition and now state the assumptions and limitations of our aggregation semantics more clearly upfront.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\RCOMMENT{
\textbf{O5} Corrolary 1: What's $RA^{agg}$? I thought it means RA with agg+group-by as last operation, because it seems this the supported class. However the Introduction seems to criticize other approaches for supporting this very same class (please see last paragraph of Page 1), so this needs to be clarified.
}
Note that $RA^{agg}$ is any query with aggregation, not just queries where there is a single aggregation as the last operation. A major advantage of our data model is that it is closed under aggregation which allows us to support queries with multiple aggregations. We state this now explicitly in the list of contributions at the end of the introduction. Also note that in the experiments (Section 9.1, TPC-H queries) we are using actual TPC-H benchmark queries and query Q7 has multiple aggregations.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\RCOMMENT{
\textbf{O6} Query evaluation for $RA^{agg}$ using your semantics is PTIME, right? Maybe worthwhile to have a proposition to this effect.
}
We added the statement that $RA^{agg}$ is PTIME in section 8 and have added a proposition to our technical report.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\RCOMMENT{
\textbf{O7} Continuing O2, how "join-intensive" are the queries used in the experiments? I.e. how many joins, how large are the relations being joined, what is the selectivity, etc? It may be worthwhile adding experiments that explicitly measure the effect of the number of joins on the algorithms performance.
}
We have added a benchmark (Figure 12) showing the effect of the number of joins on performance. For that experiment we used tables of a fixed size (4k) and varied the number of joins, uncertainty rate (3\% and 10\%) and compressed data size (i.e., the size of the compressed possible part of an input table or intermediate query result generated by our join optimization) and measured execution time for a given number of joins. We used equality joins with a single join condition. The join graph is a chain, e.g., $R \join S \join T$. As expected, the difference in runtime between the uncompressed and optimized version can be significant, especially for larger number of joins (up to $\sim$ 3 orders of magnitude for 2 joins and up to $\sim$ 5 orders of magnitude for 4 joins on 10\% uncertainty rate).
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Reviewer 2}\label{sec:reviewer-2}
%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\RCOMMENT{\textbf{Response to author feedback} O1: Per author feedback, I also
appreciate the blending of systems and theory, and how difficult it is to
condense the work to conference-paper length. I have faith that the authors
will do their best to balance out the material as best they can within the
current space constraints when responding to the reviewers. I hope that there
is a follow-up full journal paper that can fully archive this nice piece of
work.}
\RCOMMENT{O3: I certainly agree that ranges are helpful in communicating uncertainty. I
was concerned that the fine-grained level of information provided could be
hard to assimilate for decision makers. Maybe just indicate that the results
could potentially be fed into either appropriate analytics tools or
uncertainty-sensitive visual displays?
\textbf{List required changes for a revision} I think all of O1-O6 need to be
addressed to some degree. This may require splitting up the paper as
mentioned.
}
\textbf{Regarding O1:} thank you for your kind words, we tried our best to present the work as clearly as possible within the space constraints.
\textbf{Regarding O3:} thank you, this is a good suggestion, we have added a comment to this effect to the introduction (last paragraph).
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% \RCOMMENT{
% \textbf{O1} The paper reads more like a PODS submission, with a lot of formalism (semirings, monoids, semimodules, and so on) and only 1.5 columns devoted to implementation. Im wondering whether the authors should split this into two papers, a PODS paper that fills out the theory in the current draft (right now there are many pointers to a full tech report and a missing result as described in O2), and then writing a second paper going into some interesting implementation and perhaps usability details, which seem to be just hinted at in the current paper. Then both the various theoretical and practical issues can be given the full treatment that they deserve.
% }
% We believe that blending formalism and implementation is one of our paper's strengths. Without the discussion of the implementation, it is difficult to see the practical implications of the formalism (i.e., performance, accuracy in practice). Without the discussion of the formalism, it is not clear that the implementation is computing anything sensible.\BG{Maybe remove, because superseeded by the response ot author feedback}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\RCOMMENT{
\textbf{O2} A related comment is that the authors assert in Sections 1, 4, and 11 that query evaluation has PTIME complexity, but they never discuss or prove this result. This needs to be fleshed out.
}
We have added a statement to Section 7 that our use of K-Relations implies PTIME query evaluation and added a more detailed argument to our supplementary technical report arguing why aggregation (and set difference) is also in PTIME.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\RCOMMENT{
\textbf{O3} In terms of real-world practicality, I am not sure how digestible a query output such as the one in Figure 1(c) would be to an analyst or decision maker who is trying to make decisions under uncertainty. (There are HCI studies indicating that people can barely deal with confidence intervals.) It would greatly strengthen the paper to provide some convincing scenarios around how the output of an AU-DB query could be used in various important real-world applications.
}
We have added relevant references to the introduction to explain that ranges are established as a user-friendly representation of uncertainty.
% Figure 1.c is playing double-duty as an example and a view "under-the-hood", and as such we agree that a cleaner interface would be preferable, e.g. see [1, 2] for examples. However, we disagree that ranges are unhelpful. [4], as well as many others, demonstrates that communicating uncertainty (via ranges) can lead to better decision-making. \\
% \\
% $[1]$ Hellerstein, Haas, Wang. Online aggregation. SIGMOD, 1997. \\
% $[2]$ Kumari, Achmiz, Kennedy. Communicating data quality in on-demand curation. QDB, 2016. \\
% $[4]$ Jung, Sirkin, Gür, and Steinert. Displayed uncertainty improves driving experience and behavior: The case of range anxiety in an electric car. CHI, 2015.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\RCOMMENT{
\textbf{O4}
The formalism seems to struggle when it comes to categorical attributes. E.g., the required total order on attribute values seems pretty artificial, and simply saying that for some attribute any value is possible seems crude. It is often the case that the value of an uncertain categorical attribute is known to lie in some small set (having more than three possible values). I don't see how this be represented in an AU-DB; if the point is that this level of detail is sacrificed in order to obtain computational efficiency, then this should be made clear.
}
The reviewer is correct that AU-DBs shine for numerical domains or other domains that have a clear total order. Note that aggregation, which is one of our main use cases, is producing numerical results. The examples have been reworked to use categorical attributes that have a hierarchical relationship defining a sensible total order (e.g., town $<$ city $<$ metro). We now explicitly state that attribute bounds would typically degrade to a binary marker for unordered categorical attributes. We believe that there are opportunities for interesting follow-up work that considers other compact representations of sets of possible values, perhaps using the lowest common ancestor of values in a taxonomy for unordered domains (for instance, a set of cities may be replaced by the state or country they belong to).
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\RCOMMENT{
\textbf{O5} I am puzzled by the comparison to probabilistic databases, both in the literature review and in the experiments. PDB's provide a different functionality from AU-DB's, in that AU-DB's efficiently compute and represent sets of possible worlds, but PDB's also provide a probability distribution over the possible worlds. I.e., knowing that some world is possible is not so interesting if we know that the probability of seeing that world is negligible. Given this greater functionality (which of course comes with greater demands on the input data), it is not surprising that AU-DB is more computationally efficient than, say, MCDB. I think a more precise discussion is needed about where each of these types of databases is more appropriate.
}
We compare against probabilistic databases (PDBs) in related work, because PDBs also capture uncertainty. Of course, probabilistic databases are more expressive than incomplete databases (on which AU-DBs are based). The choice to use PDBs for evaluation is motivated by the lack of available incomplete database (IDB) implementations that support possible answers (and aggregation). As we note in Section 9, we skip MayBMS's probability computation step, and MCDB is already independent of probabilities once samples are generated.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\RCOMMENT{
\textbf{O6} It is also unclear how one goes from a PDB representation to an AU-DB representation, which the paper asserts can be done. It seems as if you need to throw away a lot of information. E.g., if an attribute is real-valued with a normal probability distribution. Then the lower and upper bounds are -infinity and +infinity respectively, which is not very informative.
}
To be clear, the claim that we are making is that existing schemes designed to create a PDB can be re-used to create AU-DBs (e.g., MayBMS' probabilistic repair-key operator). % We now clarify that this comes at a loss of accuracy.
We envision our techniques to be applied to PDBs when queries have to be answered that are computationally infeasible over PDBs. In this case, we may map the PDB into an AU-DB at query time and preserve the input PDB for future use. Our approach may only return a coarse approximation (the example presented by the reviewer of a continuous domain where all values have a non-zero probability is a worst-case scenario), but does so within reasonable time.
We now clearly state these limitations in the paper in \Cref{sec:creating-abbruaadbs}.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Additional remarks
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Reviewer 2 additional remarks}
\RCOMMENT{
\textbf{12. } The example does not involve any cardinalities greater than 1. I was briefly confused about whether, for a tuple with uncertain multiplicity and uncertain attribute values, in a possible world where the multiplicity is, say, 3, all three instances would have the same attribute values. I think that they would, but a reworked example might make this clearer. The terms under- and over-approximate are used in Sec. 1 but not defined until Sec. 2.
}
Note that a tuple in an AU-DB with multiplicity 3 may represent different tuples in different worlds. Otherwise, the attribute-level uncertainty would give us only very limited power to compactly encode possible answers.
\RCOMMENT{
\textbf{12. Sec 3} It would be helpful to say more explicitly that UA-DB's handle uncertainty only at the level of tuple multiplicity.
}
We now mention this explicitly.
\RCOMMENT{
\textbf{12. Sec 8} In the paragraph on aggregation monoids, there seems to be a slight notation clash, because M is used both as the domain of a monoid and an element of {SUM, MIN, MAX}. In Section 8.1, the notation $*_{N_{AU},SUM}$ is a bit confusing.
}
Using common practice in work on K-relations, we abuse notation to use $M$ to mean both the structure (monoid) as well as the domain over which the structure is defined.
\RCOMMENT{
\textbf{12. Sec 8.2} It appears that the circled-star operator could lead to some rather loose bounds. Is this related to the occasionally high metric values in Fig. 11? (Those high values should be explained in any case.) Does this have something to do with the rather loose-looking upper bounds in Def. 19?
}
With sincere apologies, this was due to a poor choice of datasets from our part:
the errors in our ``real world'' benchmarks, taken from a data cleaning
benchmark (\url{http://db.unibas.it/projects/bart/}), are synthetically
generated. Because of their synthetic origins, the error values in these
datasets had an unrealistically high variance, creating wide bounds on uncertain
attributes, and in turn leading to the high imprecision in aggregate results
that appeared in the original Figure 11. We have moved the synthetic real world
experiments to our technical report, and identified several new real world
datasets with conflicting duplicate entries. The precision on these real world
datasets with real world errors turns out to be significantly better.
\RCOMMENT{
\textbf{12. p9} bottom left par.: Some more explanation around exactly why the given assumptions are "worst case" would be helpful. In the top right par., change "[6] did extend" -> "[6] extended", and in the bottom left par., change "From Thm. 4,... it follows our main technical result" to "Our main technical result follows from Thm. 4..."
}
Thanks, we have fixed these.
\RCOMMENT{
\textbf{} It would be interesting to know whether the new techniques could be applied to OLAP databases, improving on Sismanis et al., ICDE 2009.
}
In general there is nothing that prevents the application of our work on OLAP databases. We assume the reviewer is referring to the particular type of uncertainty from Sismanis et al., ICDE 2009, where uncertainty stems from unresolved entity resolution decisions. This type of uncertainty can be represented as a block-independent database / x-db (each entity is a block and all tuples that could represent this entity are the tuples belonging to the block). In the supplementary technical report we present a scheme for mapping x-dbs to AU-DBs. This scheme could be used for entity resolution. We now discuss Sismanis et al., ICDE 2009, in related work as one of the approaches that represents aggregation results using bounds.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Reviewer 3}
\label{sec:reviewer-3}
%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\RCOMMENT{
\textbf{List required changes for a revision}
O1 and O2 are the most important. Followed by O5.
}
We have addressed all opportunities of improvement, please see below.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\RCOMMENT{
\textbf{O1} a discussion of runtime of the AU-DB method. The details may be in the technical report, but some overall results would be helpful to understand the performance.
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
We added a statement to this effect to Section 7 and at the beginning of Section 8. Since we are using K-relational semantics, query evaluation for $\raPlus$ has PTIME data complexity. We prove in [24] that aggregation (and set difference) also has PTIME data complexity.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\RCOMMENT{
\textbf{O2} I appreciate the real-world data experiment, but that section needs to be better explained. What real world datasets are these? What are their attributes? Why only modify a single tuple per group? Where is the percentage of uncertain tuples shown? (The table is a bit confusing)
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
We modified the real-world data experiments section to add detailed explanations and present additional results. The percentage of uncertain tuples is labeled in the table by the name of the dataset, together with the average number of variations for each uncertain tuple. More details about the datasets have also been added to the technical report. For possible recall, we used two metrics: one uses the tuple id and group-by attributes to identify tuples (all tuple variations in a block are the same tuple), and the other uses all values in the tuple to identify it (all variations in a block are different tuples). Please also see our response to Reviewer 2 (additional comments, Sec 8.2) for why we used different datasets for the revision.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\RCOMMENT{
\textbf{O3} The paper could use a better discussion of the challenges and cons of UA-DBs. They touch on this at the beginning of section 4, but if they expanded this, it would be more clear as to the real benefits of their approach.
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Critically, UA-DBs do not support aggregation, which requires a compact over-approximation of possible tuples and uncertain attributes, nor do they support set difference, which requires the former. Please see our response to your detailed comments.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\RCOMMENT{
\textbf{O4} A discussion of the impact of tuple bounds on query results. Perhaps measure the error of your upper/lower bounds one the non-altered deterministic table. The point of this work is clearly not to make the tightest bounds and be highly accurate, but it would be nice to understand the impact of bounds on query results.
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
We added a benchmark (Figure 11) showing the effect of over-estimation of attribute bounds on input tuples on the over-estimation of output tuple bounds produced by our query semantics. We construct x-DBs (block-independent incomplete databases) and create AU-DB instances accordingly. We compute query results over the x-DBs to calculate tight attribute-level bounds as the ground truth and compare these with the bounds produced by AU-DBs.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\RCOMMENT{
\textbf{O5} they discuss that an improvement over UA-DBs is in their compactness, but they do not have experiments/metrics/analysis showing this. This type of result would strengthen their argument.
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
We have added the comparison with UA-DBs to Figure 13.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% additional remarks
\subsection{Reviewer 3 additional remarks}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\RCOMMENT{
High level suggestion: it might be worth changing the font of AU-DB to be different than UA-DB. I often misread one
for the other and got very confused. This is not critical though, just a suggestion as they are very similar acronyms.
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Thanks, we have done this.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\RCOMMENT{
Sec 4: this first part could be improved to really strengthen your argument. Can you expand upon what the
challenges of using UA-DBs are. Some more detailed comments on that section are: (1) is "precise" really the right
term to describe UA-DBs. It seems you are looking more for compactness of representation? Also, can you explain
the UA-DB query semantics not supporting non-monotone operations due to over-approximation statement more?
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
As you point out, we want a representation of possible answers that is more compact than in UA-DBs. However, AU-DBs can also be more precise, because it is possible to express the fact that a tuple certainly exists even though some of its attribute values are not precisely known. This has real implications on precision. For example, our aggregation semantics over AU-DBs can often determine that a group in the aggregation result certainly exists, but its aggregation value is unknown: if at least one certain tuple with these group-by values exists, then the group exists certainly in the output. In UA-DBs, even if we extended the semantics to keep an over-approximation of possible tuples, we would still have to mark such tuples in the aggregation result as being uncertain. We have tried to express this more clearly in the beginning of Section 4. Also note that we only claim that AU-DBs are more precise than UA-DBs. Obviously, PDBs or incomplete databases, by encoding exactly all possible tuples, are often more precise than AU-DBs.
% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% \RCOMMENT{
% Figure 4: can you explain the annotation of 2 for the D2 tuple 2 row? I also don't know where you use D2 in the
% paper.
% }
% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Note that this is an example incomplete database with two possible worlds ($D_1$ and $D_2$). We are using $D_2$, because the example AU-DB bounds the incomplete database (it bounds both $D_1$ and $D_2$). With just a single possible world $D_1$ it would be hard to explaining the bounding properties of AU-DBs. Regarding your specific question, the annotation means that tuple $<1,3>$ appears twice in the possible world $D_2$.\BG{Maybe not necessary, should we remove?}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\RCOMMENT{
Sec 9: the join optimization section was a bit confusion. I understand you are pressed for space and the details are
in the report, but I struggled to understand at a high level what you were doing. Maybe explain it at a higher level of
abstraction? You also mention that the latter part of the join (upper bound) is expensive, how expensive?
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Following the suggestion of reviewer 1, we have merged the discussion of the query semantics with the join optimization and have tried to present it at a higher-level of abstraction as suggested. We hope this is now more clear.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\RCOMMENT{
Experiments: my main comment on this is explaining the real word data section. The table did not make much
sense, and I didn't understand the datasets. Where are they coming from, what are the queries, what is your
reasoning for only altering one tuple per group. My other comments on this section are in the "opportunities for
improvement section".
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Originally we used some datasets that are commonly used in data cleaning, but the errors in these datasets were synthetically generated, leading to unrealistically large attribute bounds. We now use three real world datasets (Netflix, Crimes, Healthcare) which have real errors. Detailed information about these datasets and all queries we used are shown in our technical report. To keep the paper more self-contained, we have added links to the datasets. We have updated the description of this section and provide additional details in the technical report. Please also see our response to reviewer 2's additional comments.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\RCOMMENT{
- Experiments: can you define scale factor as SF (I don't think you define that acronym).
- Def 9: the bold fact t and t are switched in other definitions of TM (boldface is usually first)
- You sometime use $N^3$ and sometimes use $N_{AU}$ to define the annotation. Can you make this consistent?
- Example 6: it might help to add a sentence about why you don't need the min(0, ...) formulation for t1's participation
in the result. It's because the annotations are not uncertain (3=3=3)
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Thanks, we have fixed these.
% \bibliographystyle{plain}
% \bibliography{../uaadb}
\end{document}

View File

@ -0,0 +1,20 @@
%root: main.tex
%!TEX root=./main.tex
\subsection{Single $\prob$ value}
\label{sec:single-p}
While \Cref{thm:mult-p-hard-result} shows that computing $\rpoly(\prob,\dots,\prob)$ for multiple values of $\prob$ in general is hard, it does not rule out the possibility that one can compute this value exactly for a {\em fixed} value of $\prob$. Indeed, it is easy to check that one can compute $\rpoly(\prob,\dots,\prob)$ exactly in linear time for $\prob\in \inset{0,1}$. Next we show that these two are the only possibilities:
\begin{Theorem}\label{th:single-p-hard}
Fix $\prob\in (0,1)$. Then assuming \Cref{conj:graph} is true, any algorithm that computes $\rpoly_{G}^3(\prob,\dots,\prob)$ for arbitrary $G = (\vset, \edgeSet)$ exactly has to run in time $\Omega\inparen{\abs{\edgeSet}^{1+\eps_0}}$, where $\eps_0$ is as defined in \Cref{conj:graph}.
\end{Theorem}
Note that \Cref{prop:expection-of-polynom} and \Cref{th:single-p-hard} above imply the hardness result in the first row of \Cref{tab:lbs}.
We note that \Cref{thm:k-match-hard} and \Cref{conj:known-algo-kmatch} (and the lower bounds in the second and third row of Table~\ref{tab:lbs}) need $k$ to be large enough (in particular, we need a family of hard queries). But the above \Cref{th:single-p-hard} (and the lower bound in first row of Table~\ref{tab:lbs}) holds for $k=3$ (and hence for a fixed query).
%%% Local Variables:
%%% mode: latex
%%% TeX-master: "main"
%%% End:

View File

@ -0,0 +1,132 @@
%root: main.tex
\usetikzlibrary{shapes.geometric}%for cylinder
\usetikzlibrary{shapes.arrows}%for arrow shape
\usetikzlibrary{shapes.misc}
%rid of vertical spacing for booktabs rules
\renewcommand{\aboverulesep}{0pt}
\renewcommand{\belowrulesep}{0pt}
\begin{figure}[t!]
\centering
\resizebox{\textwidth}{5.2cm}{%
\begin{tikzpicture}
%pdb cylinder
\node[cylinder, text width=0.28\textwidth, align=center, draw=black, text=black, cylinder uses custom fill, cylinder body fill=blue!10, aspect=0.12, minimum height=5cm, minimum width=2.5cm, cylinder end fill=blue!50, shape border rotate=90] (cylinder) at (0, 0) {
\tabcolsep=0.1cm
\begin{tabular}{>{\small}c | >{\small}c | >{\small}c}
\multicolumn{2}{c}{$\boldsymbol{T}$}\\
%\toprule
Point & $\Phi$ \\%& $\semN$\\
\midrule
$e_1$ & $A$ \\%& 1 \\
$e_2$ & $B$ \\%& 1\\
$e_3$ & $C$ \\%& 1\\
$e_4$ & $E$ \\%& 1\\
\end{tabular}\\
\tabcolsep=0.05cm
%\captionof{table}{Route}
\begin{tabular}{>{\footnotesize}c | >{\footnotesize}c | >{\footnotesize}c | >{\footnotesize}c}
\multicolumn{3}{c}{$\boldsymbol{R}$}\\
%\toprule
$\text{Point}_1$ & $\text{Point}_2$ & $\Phi$\\% & $\semN$ \\
\midrule
$e_1$ & $e_2$ & $X$\\% & 2 \\
$e_2$ & $e_4$ & $Y$\\% & 4 \\
%& $\cdots$ & $\cdots$ & $\cdots$ & $\cdots$ \\
$e_2$ & $e_3$ & $Z$\\% & 3 \\
\end{tabular}};
%label below cylinder
\node[below=0.2 cm of cylinder]{{\LARGE$ \pdb$}};
%First arrow
\node[single arrow, right=0.25 of cylinder, draw=black, fill=black!65, text=white, minimum height=0.75cm, minimum width=0.25cm](arrow1) {\textbf{\abbrStepOne}};
\node[above=of arrow1](arrow1Label) {$\query_2$};
\usetikzlibrary{arrows.meta}%for the following arrow configurations
\draw[line width=0.5mm, dashed, arrows = -{Latex[length=3mm, open]}] (arrow1Label)->(arrow1);
%Query output (output of step 1)
\node[rectangle, right=0.175 of arrow1, draw=black, text=black, fill=purple!10, minimum height=4.5cm, minimum width=2cm](rect) {
\tabcolsep=0.075cm
%\captionof{table}{Q}
%\setlength{\cellspacetoplimit}{4pt}
\begin{tabular}{>{\normalsize}c | >{\centering\arraybackslash\normalsize}m{1.95cm} | >{\centering\arraybackslash\small}m{1.95cm}}
%\multicolumn{3}{c}{$\boldsymbol{\query_2(\pdb)}$}\\[1mm]
%\toprule
Point & $\Phi$ & Circuit\\% & $\expct_{\idb \sim \probDist}[\query_2(\db)(t)]$ \\ \hline
\midrule
%\hline
%\\\\[-3.5\medskipamount]
$e_1$ & $AX$ &\resizebox{!}{10mm}{
\begin{tikzpicture}[thick]
\node[gen_tree_node](sink) at (0.5, 0.8){$\boldsymbol{\circmult}$};
\node[gen_tree_node](source1) at (0, 0){$A$};
\node[gen_tree_node](source2) at (1, 0){$X$};
\draw[->](source1)--(sink);
\draw[->] (source2)--(sink);
\end{tikzpicture}% & $0.5 \cdot 1.0 + 0.5 \cdot 1.0 = 1.0$
}\\% & $0.9$ \\
$e_2$ & $B(Y + Z)$\newline \text{Or}\newline $BY+ BZ$&
\resizebox{!}{16mm} {
\begin{tikzpicture}[thick]
\node[gen_tree_node] (a1) at (1, 0){$Y$};
\node[gen_tree_node] (b1) at (2, 0){$Z$};
%level 1
\node[gen_tree_node] (a2) at (0.75, 0.8){$B$};
\node[gen_tree_node] (b2) at (1.5, 0.8){$\boldsymbol{\circplus}$};
%level 0
\node[gen_tree_node] (a3) at (1.1, 1.6){$\boldsymbol{\circmult}$};
%edges
\draw[->] (a1) -- (b2);
\draw[->] (b1) -- (b2);
\draw[->] (a2) -- (a3);
\draw[->] (b2) -- (a3);
\end{tikzpicture}
}\newline\text{Or}\newline
%%%%%%%%%%%
%Non factorized circuit%
%%%%%%%%%%%
\resizebox{!}{16mm} {
\begin{tikzpicture}[thick]
\node[gen_tree_node] (a2) at (0, 0){$Y$};
\node[gen_tree_node] (b2) at (1, 0){$B$};
\node[gen_tree_node] (c2) at (2, 0){$Z$};
%level 1
\node[gen_tree_node] (a1) at (0.5, 0.8){$\boldsymbol{\circmult}$};
\node[gen_tree_node] (b1) at (1.5, 0.8){$\boldsymbol{\circmult}$};
%level 0
\node[gen_tree_node] (a0) at (1.0, 1.6){$\boldsymbol{\circplus}$};
%edges
\draw[->] (a2) -- (a1);
\draw[->] (b2) -- (a1);
\draw[->] (b2) -- (b1);
\draw[->] (c2) -- (b1);
\draw[->] (a1) -- (a0);
\draw[->] (b1) -- (a0);
\end{tikzpicture}
}\\
\end{tabular}
};
%label below rectangle
\node[below=0.2cm of rect]{{\LARGE $\query_2(\pdb)\inparen{\tup}\equiv \poly\inparen{\vct{X}}$}};
%Second arrow
\node[single arrow, right=0.25 of rect, draw=black, fill=black!65, text=white, minimum height=0.75cm, minimum width=0.25cm](arrow2) {\textbf{\abbrStepTwo}};
%Expectation computation; (output of step 2)
\node[rectangle, right=0.25 of arrow2, rounded corners, draw=black, fill=red!10, text=black, minimum height=4.5cm, minimum width=2cm](rrect) {
\tabcolsep=0.09cm
%\captionof{table}{Q}
\begin{tabular}{>{\small}c | >{\arraybackslash\normalsize}c}%m{1.95cm}}
%\multicolumn{2}{c}{$\expct\pbox{\poly(\vct{X})}$}\\[1mm]
%\toprule
Point & $\mathbb{E}[\poly(\vct{X})]$\\
\midrule%[0.05pt]
$e_1$ & $\inparen{\prob_{A, 1} +\prob_{A, 2}}\cdot\left(\prob_{X, 1} + 2\prob_{X, 2}\right)$\\%$1.0 \cdot 0.9 = 0.9$\\[3mm]
$e_2$ & $\inparen{\prob_{B, 1} + \prob_{B, 2}}\inparen{\prob_{Y, 1}+2\prob_{Y, 2} + \prob_{Z, 1} + 2\prob_{Z, 2}}$\\%$(0.5 \cdot 1.0) + $\newline $\hspace{0.2cm}(0.5 \cdot 1.0)$\newline $= 1.0$\\
\end{tabular}
};
%label of rounded rectangle
\node[below=0.2cm of rrect]{{\LARGE $\expct\pbox{\poly(\vct{X})}$}};
\end{tikzpicture}
}
\caption{Intensional Query Evaluation Model $(\query_2 = \project_{\text{Point}}$ $\inparen{T\join_{\text{Point} = \text{Point}_1}R}$ where, for table $R,~\bound = 2$, while for $T,~\bound = 1.)$}
\label{fig:two-step}
\end{figure}

Binary file not shown.

After

Width:  |  Height:  |  Size: 305 KiB

View File

@ -60,24 +60,35 @@ Given \abbrCTIDB $\pdb = \inparen{\worlds, \bpd}$, let $\pdb' = \inparen{\onebid
We now define the reduced polynomial $\rpoly'$ of a \abbrOneBIDB.
\begin{figure}[t!]
\centering
\resizebox{\textwidth}{!}{
\begin{minipage}{\textwidth}
%\centering
%\resizebox{0.5\textwidth}{!}{
%\begin{minipage}{0.5\textwidth}
\begin{align*}
\poly'\pbox{\project_A\inparen{\query}, \gentupset', \tup_j} =& \sum_{\substack{\tup_{j'},\\\project_{A}\inparen{\tup_{j'}} = \tup_j}}\poly'\pbox{\query, \gentupset', \tup_{j'}} &
\poly'\pbox{\query_1\union\query_2, \gentupset', \tup_j} =& \poly'\pbox{\query_1, \gentupset', \tup_j}+\poly'\pbox{\query_2, \gentupset', \tup_j}\\
\poly'\pbox{\select_\theta\inparen{\query}, \gentupset', \tup_j} =& \begin{cases}\theta = 1&\poly'\pbox{\query, \gentupset', \tup_j}\\\theta = 0& 0\\\end{cases} &
\begin{aligned}
\poly'\pbox{\query_1\join\query_2, \gentupset', \tup_j} = \\~
\end{aligned} &
\begin{aligned}
&\poly'\pbox{\query_1, \gentupset', \project_{attr\inparen{\query_1}}\inparen{\tup_j}}\\ &~~~\cdot\poly'\pbox{\query_2, \gentupset', \project_{attr\inparen{\query_2}}\inparen{\tup_j}}
&\begin{aligned}[t]
&\poly'\pbox{\project_A\inparen{\query}, \gentupset', \tup_j} =\\
&~\sum_{\substack{\tup_{j'},\\\project_{A}\inparen{\tup_{j'}} = \tup_j}}\poly'\pbox{\query, \gentupset', \tup_{j'}}
\end{aligned}
&
&\begin{aligned}[t]
&\poly'\pbox{\query_1\union\query_2, \gentupset', \tup_j} = \\
&\qquad\poly'\pbox{\query_1, \gentupset', \tup_j}+\poly'\pbox{\query_2, \gentupset', \tup_j}
\end{aligned}\\
&\begin{aligned}
&\poly'\pbox{\select_\theta\inparen{\query}, \gentupset', \tup_j} =\\
&~\begin{cases}\theta = 1 &\poly'\pbox{\query, \gentupset', \tup_j}\\\theta = 0& 0\\\end{cases}
\end{aligned}
&
&\begin{aligned}
&\poly'\pbox{\query_1\join\query_2, \gentupset', \tup_j} = \\
&\qquad \poly'\pbox{\query_1, \gentupset', \project_{attr\inparen{\query_1}}\inparen{\tup_j}}\\ &\qquad\cdot\poly'\pbox{\query_2, \gentupset', \project_{attr\inparen{\query_2}} \inparen{\tup_j}}
\end{aligned}\\
&&&\poly'\pbox{\rel,\gentupset', \tup_j} = j\cdot X_{\tup, j}.
\end{align*}\\[-10mm]
\end{minipage}}
\end{align*}\\%[-10mm]
%\end{minipage}}
\setlength{\abovecaptionskip}{-0.2cm}
\caption{Construction of the lineage (polynomial) for an $\raPlus$ query $\query$ over $\gentupset'$.}
\label{fig:lin-poly-bidb}
\vspace{-0.53cm}
\end{figure}
\begin{Definition}[$\rpoly'$]\label{def:reduced-poly-one-bidb}

View File

@ -38,24 +38,30 @@ For these algorithms, $\jointime{R_1, \ldots, R_n}$ is linear in the {\em AGM bo
% = |R_1| + \ldots + |R_n| + |R_1(\db) \bowtie \ldots \bowtie R_n(\db)|$.
Our cost model for general query evaluation follows from the join cost:
\noindent\resizebox{1\linewidth}{!}{
\begin{minipage}{1.0\linewidth}
\begin{align*}
\qruntimenoopt{R,\gentupset,\bound} & = |\gentupset.R| &
\qruntimenoopt{\sigma \query, \gentupset,\bound} & = \qruntimenoopt{\query,\gentupset} &
\qruntimenoopt{\pi \query, \gentupset,\bound} & = \qruntimenoopt{\query,\gentupset,\bound} + \abs{\query(\gentupset)}
\end{align*}\\[-15mm]
\begin{align*}
\qruntimenoopt{\query \cup \query', \gentupset,\bound} & = \qruntimenoopt{\query, \gentupset,\bound} +
\qruntimenoopt{\query', \gentupset,\bound} +
\abs{\query\inparen{\gentupset}}+\abs{\query'\inparen{\gentupset}} \\
\qruntimenoopt{\query_1 \bowtie \ldots \bowtie \query_m, \gentupset,\bound}
& = \qruntimenoopt{\query_1, \gentupset,\bound} + \ldots +
\qruntimenoopt{\query_m,\gentupset,\bound} +
\jointime{\query_1(\gentupset), \ldots, \query_m(\gentupset)}
\end{align*}
\end{minipage}
}\\
%\noindent\resizebox{1\linewidth}{!}{
%\begin{minipage}{1.0\linewidth}
\vspace{-0.57cm}
\begin{flalign*}
&\begin{aligned}
&\qruntimenoopt{R,\gentupset,\bound} = |\gentupset.R|
&
&\qquad\qquad\qruntimenoopt{\sigma \query, \gentupset,\bound} = \qruntimenoopt{\query,\gentupset,\bound}\\
\end{aligned}&\\
%\vspace{-.6cm}
&\qruntimenoopt{\pi \query, \gentupset,\bound} = \qruntimenoopt{\query,\gentupset,\bound} +\abs{\query(\gentupset)}&\\
&\qruntimenoopt{\query \cup \query', \gentupset,\bound} = \qruntimenoopt{\query, \gentupset,\bound} + \qruntimenoopt{\query', \gentupset,\bound} + \abs{\query\inparen{\gentupset}}+\abs{\query'\inparen{\gentupset}}& \\
&\qruntimenoopt{\query_1 \bowtie \ldots \bowtie \query_m, \gentupset,\bound} = &\\
&\qquad\qruntimenoopt{\query_1, \gentupset,\bound} + \ldots + \qruntimenoopt{\query_m,\gentupset,\bound} +
\jointime{\query_1(\gentupset), \ldots, \query_m(\gentupset)}&\\
\end{flalign*}
\vspace{-0.93cm}
%\begin{align*}
% &\qruntimenoopt{\query_1 \bowtie \ldots \bowtie \query_m, \gentupset,\bound} = \\
% &\qquad\qruntimenoopt{\query_1, \gentupset,\bound} + \ldots + \qruntimenoopt{\query_m,\gentupset,\bound} +
% \jointime{\query_1(\gentupset), \ldots, \query_m(\gentupset)}
%\end{align*}\\%[-10mm]
%\end{minipage}
%}\\
Under this model, an $\raPlus$ query $\query$ evaluated over database $\gentupset$ has runtime $O(\qruntimenoopt{\query,\gentupset, \bound})$.

View File

@ -105,7 +105,7 @@ We drop $\query$, $\tupset$, and $\tup$ from $\apolyqdt$ when they are clear fro
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{Problem}[Expected Multiplicity of Lineage Polynomials]\label{prob:bag-pdb-poly-expected}
Given an $\raPlus$ query $\query$, \abbrCTIDB $\pdb$ and result tuple $\tup$, compute the expected
multiplicity of the polynomial $\apolyqdt$ (i.e., $\expct_{\vct{W}\sim \pdassign}\pbox{\apolyqdt(\vct{W})}$, where $\vct{W} \in \worlds$).
multiplicity of the polynomial $\apolyqdt$ (i.e., for $\worldvec\in\worlds$, compute $\expct_{\vct{W}\sim \pdassign}\pbox{\apolyqdt\inparen{\worldvec}}$).
\end{Problem}
We note that computing \Cref{prob:expect-mult}
is equivalent to (i.e., yields the same result as) computing \Cref{prob:bag-pdb-poly-expected} (see \Cref{prop:expection-of-polynom}).
@ -134,7 +134,7 @@ Considering again our example,
\refpoly{1, }^{\inparen{AXB}^2}\inparen{A, X, B} = \poly_1^{\inparen{AXB}^2}\inparen{\sum_{j_1\in\pbox{\bound}}j_1A_{j_1}, \sum_{j_2\in\pbox{\bound}}j_2X_{j_2}, \sum_{j_3\in\pbox{\bound}}j_3B_{j_3}} \\
= \inparen{\sum_{j_1\in\pbox{\bound}}j_1A_{j_1}}^2\inparen{\sum_{j_2\in\pbox{\bound}}j_2X_{j_2}}^2\inparen{\sum_{j_3\in\pbox{\bound}}j_3B_{j_3}}^2.
\end{multline*}
Since the set of multiplicities for tuple $\tup$ by nature are disjoint we can drop all cross terms and have $\refpoly{1, }^2 = \sum_{j_1, j_2, j_3 \in \pbox{\bound}}j_1^2A^2_{j_1}j_2^2X_{j_2}^2j_3^2B^2_{j_3}$. Computing expectation we get $\expct\pbox{\refpoly{1, }^2}=\sum_{j_1,j_2,j_3\in\pbox{\bound}}j_1^2j_2^2j_3^2\expct\pbox{\randWorld_{A_{j_1}}}\expct\pbox{\randWorld_{X_{j_2}}}\expct\pbox{\randWorld_{B_{j_3}}}$, since we now have that all $\randWorld_{X_j}\in\inset{0, 1}$.
Since the multiplicities for tuple $\tup$ are by nature disjoint, we can drop all cross terms and have $\refpoly{1, }^2 = \sum_{j_1, j_2, j_3 \in \pbox{\bound}}j_1^2A^2_{j_1}j_2^2X_{j_2}^2j_3^2B^2_{j_3}$. Computing expectation we get $\expct\pbox{\refpoly{1, }^2}=\sum_{j_1,j_2,j_3\in\pbox{\bound}}j_1^2j_2^2j_3^2$ \allowbreak $\expct\pbox{\randWorld_{A_{j_1}}}\expct\pbox{\randWorld_{X_{j_2}}}\expct\pbox{\randWorld_{B_{j_3}}}$, since we now have that all $\randWorld_{X_j}\in\inset{0, 1}$.
This leads us to consider a structure related to the lineage polynomial.
\begin{Definition}\label{def:reduced-poly}
@ -151,13 +151,14 @@ Continuing with the example
\footnote{
To save clutter we do not show the full expansion for variables with greatest multiplicity $= 1$ since e.g. for variable $A$, the sum of products itself evaluates to $1^2\cdot A^2 = A$.
}
$\poly_1^2\inparen{A, B, C, E, X_1, X_2, Y, Z}$ we have
\begin{multline*}
\rpoly_1^2(A, B, C, E, X_1, X_2, Y, Z) = \\
A\inparen{\sum\limits_{j\in\pbox{\bound}}j^2X_j}B + BYE + BZC + 2A\inparen{\sum\limits_{j\in\pbox{\bound}}j^2X_j}BYE + 2A\inparen{\sum\limits_{j\in\pbox{\bound}}j^2X_j}BZC + 2BYEZC =\\
ABX_1 + AB\inparen{2}^2X_2 + BYE + BZC + 2AX_1BYE + 2A\inparen{2}^2X_2BYE + 2AX_1BZC + 2A\inparen{2}^2X_2BZC + 2BYEZC.
\end{multline*}
Note that we have argued that for our specific example the expectation that we want is $\rpoly_1^2(\probOf\inparen{A=1},$ $\probOf\inparen{B=1}, \probOf\inparen{C=1}, \probOf\inparen{E=1}, \probOf\inparen{X_1=1}, \probOf\inparen{X_2=1}, \probOf\inparen{Y=1}, \probOf\inparen{Z=1})$.
$\poly_1^2\inparen{A, B, C, E, X_1, X_2, Y, Z}$ we have $\rpoly_1^2(A, B, C, E, X_1, X_2, Y, Z)=$
\begin{align*}
&A\inparen{\sum\limits_{j\in\pbox{\bound}}j^2X_j}B + BYE + BZC + 2A\inparen{\sum\limits_{j\in\pbox{\bound}}j^2X_j}BYE \\
&\qquad+ 2A\inparen{\sum\limits_{j\in\pbox{\bound}}j^2X_j}BZC + 2BYEZC \\
&= ABX_1 + AB\inparen{2}^2X_2+ BYE + BZC + 2AX_1BYE+ 2A\inparen{2}^2X_2BYE\\
&\qquad + 2AX_1BZC + 2A\inparen{2}^2X_2BZC + 2BYEZC.
\end{align*}
Note that we have argued that for our specific example the expectation that we want is $\rpoly_1^2(\probOf\inparen{A=1},$\allowbreak$\probOf\inparen{B=1}, \probOf\inparen{C=1}$,\allowbreak $\probOf\inparen{E=1},$\allowbreak $\probOf\inparen{X_1=1}, \probOf\inparen{X_2=1}, \probOf\inparen{Y=1}, \probOf\inparen{Z=1})$.
\Cref{lem:tidb-reduce-poly} generalizes the equivalence to {\em all} $\raPlus$ queries on \abbrCTIDB\xplural (proof in \Cref{subsec:proof-exp-poly-rpoly}).
\begin{Lemma}\label{lem:tidb-reduce-poly}
For any \abbrCTIDB $\pdb$, $\raPlus$ query $\query$, and lineage polynomial
@ -185,8 +186,8 @@ Denote by $\timeOf{\abbrStepTwo}(\circuit, \epsilon)$ (recall $\circuit$ is the
\begin{Problem}[\abbrCTIDB linear time approximation]\label{prob:big-o-joint-steps}
Given \abbrCTIDB $\pdb$, $\raPlus$ query $\query$,
is there a $(1\pm\epsilon)$-approximation of $\expct_{\rvworld\sim\bpd}\pbox{\query\inparen{\rvworld}\inparen{\tup}}$ for all result tuples $\tup$ where
$\exists \circuit : \timeOf{\abbrStepOne}(Q,\tupset, \circuit) + \timeOf{\abbrStepTwo}(\circuit, \epsilon) \le O_\epsilon(\qruntime{\optquery{\query}, \tupset, \bound})$?
is there a $(1\pm\epsilon)$-approximation of $\expct_{\rvworld\sim\bpd}$\allowbreak$\pbox{\query\inparen{\rvworld}\inparen{\tup}}$ for all result tuples $\tup$ where
$\exists \circuit : \timeOf{\abbrStepOne}(\query,\tupset, \circuit) + \timeOf{\abbrStepTwo}(\circuit, \epsilon) \le$\allowbreak$ O_\epsilon(\qruntime{\optquery{\query}, \tupset, \bound})$?
\end{Problem}
We show in \Cref{sec:circuit-depth} an $\bigO{\qruntime{\optquery{\query}, \tupset, \bound}}$ algorithm for constructing the lineage polynomial for all result tuples of an $\raPlus$ query $\query$ (or more precisely, a single circuit $\circuit$ with one sink per tuple representing the tuple's lineage).
@ -230,15 +231,15 @@ This then implies
&\qquad+2AX_2BZC+2BYEZC\\
\end{align*}
\end{footnotesize}
Substituting $\vct{\prob}$ for $\vct{X}$,
Substituting $\vct{\prob}$ for $\vct{X}$, $\refpoly{1, }^2\inparen{\probAllTup} =$
\begin{footnotesize}
\begin{align*}
\hspace*{-3mm}
\refpoly{1, }^2\inparen{\probAllTup} &= \prob_A^2\prob_{X_1}^2\prob_B^2 + 4\prob_A^2\prob_{X_1}\prob_{X_2}\prob_B^2 + 4\prob_A^2\prob_{X_2}^2\prob_B^2 + \prob_B^2\prob_Y^2\prob_E^2 + \prob_B^2\prob_Z^2\prob_C^2 + 2\prob_A\prob_{X_1}\prob_B^2\prob_Y\prob_E + 2\prob_A\prob_{X_2}\prob_B^2\prob_Y\prob_E\\
&\qquad+ 2\prob_A\prob_{X_1}\prob_B^2\prob_Z\prob_C + 2\prob_A\prob_{X_2}\prob_B^2\prob_Z\prob_C+ 2\prob_B^2\prob_Y\prob_E\prob_Z\prob_C\\
&\leq\prob_A\prob_{X_1}\prob_B + 4\prob_A^2\prob_{X_1}\prob_{X_2}\prob_B^2 + 4\prob_A\prob_{X_2}\prob_b + \prob_B\prob_Y\prob_E + \prob_B\prob_Z\prob_C + 2\prob_A\prob_{X_1}\prob_B\prob_Y\prob_E+ 2\prob_A\prob_{X_2}\prob_B\prob_Y\prob_E \\
&\qquad+ 2\prob_A\prob_{X_1}\prob_B\prob_Z\prob_C + 2\prob_A\prob_{X_2}\prob_B\prob_Z\prob_C + 2\prob_B\prob_Y\prob_E\prob_Z\prob_C
= \rpoly_1^2\inparen{\vct{p}} + 4\prob_A^2\prob_{X_1}\prob_{X_2}\prob_B^2.
&\prob_A^2\prob_{X_1}^2\prob_B^2 + 4\prob_A^2\prob_{X_1}\prob_{X_2}\prob_B^2 + 4\prob_A^2\prob_{X_2}^2\prob_B^2 + \prob_B^2\prob_Y^2\prob_E^2 + \prob_B^2\prob_Z^2\prob_C^2 + 2\prob_A\prob_{X_1}\prob_B^2\prob_Y\prob_E\\
&\qquad+ 2\prob_A\prob_{X_2}\prob_B^2\prob_Y\prob_E + 2\prob_A\prob_{X_1}\prob_B^2\prob_Z\prob_C + 2\prob_A\prob_{X_2}\prob_B^2\prob_Z\prob_C+ 2\prob_B^2\prob_Y\prob_E\prob_Z\prob_C\\
&\leq\prob_A\prob_{X_1}\prob_B + 4\prob_A^2\prob_{X_1}\prob_{X_2}\prob_B^2 + 4\prob_A\prob_{X_2}\prob_B + \prob_B\prob_Y\prob_E + \prob_B\prob_Z\prob_C + 2\prob_A\prob_{X_1}\prob_B\prob_Y\prob_E \\
&\qquad + 2\prob_A\prob_{X_2}\prob_B\prob_Y\prob_E+ 2\prob_A\prob_{X_1}\prob_B\prob_Z\prob_C + 2\prob_A\prob_{X_2}\prob_B\prob_Z\prob_C + 2\prob_B\prob_Y\prob_E\prob_Z\prob_C\\
&= \rpoly_1^2\inparen{\vct{p}} + 4\prob_A^2\prob_{X_1}\prob_{X_2}\prob_B^2.
\end{align*}
\end{footnotesize}
If we assume that all probability values are at least $p_0>0$, then given access to $\refpoly{1, }^2\inparen{\vct{\prob}} - 4\prob_A^2\prob_{X_1}\prob_{X_2}\prob_B^2$

BIN
main.pdf

Binary file not shown.

Binary file not shown.

View File

@ -21,7 +21,7 @@ The circuits in \Cref{fig:two-step} encode their respective polynomials in colum
Note that the circuit \circuit representing $AX$ and the circuit \circuit' representing $B\inparen{Y+Z}$ each encode a tree, with edges pointing towards the root.
\begin{wrapfigure}{L}{0.45\linewidth}
\begin{figure}[t!]
\centering
\begin{tikzpicture}[thick]
\node[tree_node] (a1) at (0, 0) {$\boldsymbol{X}$};
@ -54,7 +54,8 @@ Note that the ciricuit \circuit representing $AX$ and the circuit \circuit' repr
\end{tikzpicture}
\caption{Circuit encoding of $(X + 2Y)(2X - Y)$}
\label{fig:circuit}
\end{wrapfigure}
\vspace{-0.53cm}
\end{figure}
We next formally define the relationship of circuits with polynomials. While the definition assumes one sink for notational convenience, it easily generalizes to the multiple sinks case.
\begin{Definition}[$\polyf(\cdot)$]\label{def:poly-func}
$\polyf(\circuit)$ maps the sink of circuit $\circuit$ to its corresponding polynomial (in \abbrSMB). $\polyf(\cdot)$ is recursively defined on $\circuit$ as follows, with addition and multiplication following the standard interpretation for polynomials:
@ -84,12 +85,11 @@ The circuit of \Cref{fig:circuit} is an element of $\circuitset{2X^2+3XY-2Y^2}$.
\begin{Definition}[The Expected Result Multiplicity Problem]\label{def:the-expected-multipl}
Let $\pdb'$ be an arbitrary \abbrCTIDB and $\vct{X}$ be the set of variables annotating tuples in $\tupset'$. Fix an $\raPlus$ query $\query$ and a result tuple $\tup$.
The \expectProblem is defined as follows:\\[-7mm]
\begin{center}
\textbf{Input}: $\circuit \in \circuitset{\polyX}$ for $\poly'\inparen{\vct{X}} = \poly'\pbox{\query,\tupset',\tup}$
\hspace*{2mm}
\textbf{Output}: $\expct_{\vct{W} \sim \bpd}\pbox{\poly'\pbox{\query, \tupset', \tup}\inparen{\vct{W}}}$
\end{center}
The \expectProblem is defined as follows:%\\[-7mm]
\begin{flalign*}
&\textbf{Input}: \circuit \in \circuitset{\polyX} \text{ for }\poly'\inparen{\vct{X}} = \poly'\pbox{\query,\tupset',\tup}&\\
&\textbf{Output}: \expct_{\vct{W} \sim \bpd}\pbox{\poly'\pbox{\query, \tupset', \tup}\inparen{\vct{W}}}.&
\end{flalign*}
\end{Definition}
\input{circuits-model-runtime}

View File

@ -7,7 +7,7 @@
\renewcommand{\belowrulesep}{0pt}
\begin{figure}[t!]
\begin{figure*}[t!]
\centering
\resizebox{\textwidth}{5.2cm}{%
\begin{tikzpicture}
@ -128,5 +128,5 @@
}
\caption{Intensional Query Evaluation Model $(\query_2 = \project_{\text{Point}}$ $\inparen{T\join_{\text{Point} = \text{Point}_1}R}$ where, for table $R,~\bound = 2$, while for $T,~\bound = 1.)$}
\label{fig:two-step}
\end{figure}
\end{figure*}