Import from http://maybms.cvs.sourceforge.net/
This commit is contained in:
commit
7b2599df3b
11
Documents/Makefile
Normal file
11
Documents/Makefile
Normal file

@@ -0,0 +1,11 @@


# Build the manual: latex/bibtex must run multiple times so that
# citations and cross-references resolve, then convert DVI to PDF.
all:
	latex manual
	bibtex manual
	latex manual
	latex manual
	dvipdf manual

# Remove all generated artifacts (use -f so a clean tree does not fail).
clean:
	rm -f manual.ps manual.dvi manual.log manual.aux manual.blg
	rm -f manual.bbl manual.out manual.pdf manual.toc



827
Documents/bibtex.bib
Normal file
827
Documents/bibtex.bib
Normal file

@@ -0,0 +1,827 @@




% Query reliability / probabilistic databases. Page-range dashes restored;
% entry delimiters normalized to braces.
@inproceedings{GGH1998,
  author    = {Erich Gr\"adel and Yuri Gurevich and Colin Hirsch},
  title     = "{``The Complexity of Query Reliability''}",
  booktitle = {Proc.\ PODS},
  year      = 1998,
  pages     = {227--234}
}

@inproceedings{SD2007,
  author    = {Prithviraj Sen and Amol Deshpande},
  title     = "{``Representing and Querying Correlated Tuples in
               Probabilistic Databases''}",
  booktitle = {Proc.\ ICDE},
  year      = 2007,
  pages     = {596--605}
}

@inproceedings{Koch2008,
  author    = {Christoph Koch},
  title     = "{``Approximating Predicates and
               Expressive Queries on Probabilistic Databases''}",
  booktitle = {Proc.\ PODS},
  year      = 2008
}






@incollection{KochBook2008,
  author    = {Christoph Koch},
  title     = "{``MayBMS: A System for Managing Large Uncertain and Probabilistic Databases''}",
  chapter   = 6,
  editor    = {Charu Aggarwal},
  booktitle = {Managing and Mining Uncertain Data},
  publisher = {Springer-Verlag},
  year      = 2008
}

% Author is the logician Hilary Putnam (was misspelled "Hillary").
@article{DP1960,
  author  = {Martin Davis and Hilary Putnam},
  title   = "{``A Computing Procedure for Quantification Theory''}",
  journal = {Journal of the ACM},
  volume  = {{\bf 7}},
  number  = 3,
  pages   = {201--215},
  year    = 1960
}

% "tods" was an undefined @string macro; the journal name is spelled out.
@article{Bat1979,
  author  = {Don S. Batory},
  title   = "{``On Searching Transposed Files''}",
  journal = {ACM Transactions on Database Systems},
  volume  = {{\bf 4}},
  number  = 4,
  pages   = {531--544},
  year    = 1979
}






@inproceedings{SAB+2005,
  author    = {Michael Stonebraker and Daniel J. Abadi and Adam Batkin and
               Xuedong Chen and Mitch Cherniack and Miguel Ferreira and
               Edmond Lau and Amerson Lin and Samuel Madden and
               Elizabeth J. O'Neil and Patrick E. O'Neil and Alex Rasin and
               Nga Tran and Stanley B. Zdonik},
  title     = "{``C-Store: A Column-oriented DBMS''}",
  booktitle = {Proc.\ VLDB},
  year      = 2005,
  pages     = {553--564}
}

@article{IL1984,
  author  = {T. Imielinski and W. Lipski},
  title   = "{``Incomplete information in relational databases''}",
  journal = {Journal of the ACM},
  volume  = {{\bf 31}},
  number  = 4,
  year    = 1984,
  pages   = {761--791}
}

% Dropped the OPTaddress left over from a template (Canadian AI 2007 was
% not in Denver).
@inproceedings{Wachter:Multistate:2007,
  author    = {Michael Wachter and Rolf Haenni},
  title     = "{``Multi-State Directed Acyclic Graphs''}",
  booktitle = {Proc.\ Canadian {AI}},
  year      = 2007,
  pages     = {464--475}
}




@article{Birnbaum:DP:1999,
  author  = {Elazar Birnbaum and Eliezer Lozinskii},
  title   = "{``The Good Old Davis-Putnam Procedure Helps Counting Models''}",
  journal = {Journal of AI Research},
  volume  = {{\bf 10}},
  number  = 6,
  year    = 1999,
  pages   = {457--477}
}

@inproceedings{INV1991,
  author     = {T. Imielinski and S. Naqvi and K. Vadaparty},
  title      = "{``Incomplete objects -- a data model for design and planning
                applications''}",
  booktitle  = {Proc.\ SIGMOD},
  OPTaddress = {Denver, Colorado},
  year       = 1991,
  pages      = {288--297}
}

@inproceedings{ASV2001,
  author    = {Serge Abiteboul and Luc Segoufin and Victor Vianu},
  title     = "{``Representing and Querying {XML} with
               Incomplete Information''}",
  booktitle = {Proc.\ PODS},
  year      = 2001
}

@article{AKG1991,
  author  = {Serge Abiteboul and Paris Kanellakis and G\"osta Grahne},
  title   = "{``On the Representation and Querying of Sets of
             Possible Worlds''}",
  journal = {Theor.\ Comput.\ Sci.},
  volume  = {{\bf 78}},
  number  = 1,
  pages   = {158--187},
  year    = 1991
}






@inproceedings{Gra1984,
  author    = {G\"osta Grahne},
  title     = "{``Dependency Satisfaction in Databases with
               Incomplete Information''}",
  booktitle = {Proc.\ VLDB},
  year      = 1984,
  pages     = {37--45}
}

@book{Gra1991,
  author    = {G\"osta Grahne},
  title     = "{The Problem of Incomplete Information in Relational Databases}",
  publisher = {Springer-Verlag},
  series    = {LNCS},
  number    = 554,
  year      = 1991
}

@inproceedings{LW1993,
  author    = {Leonid Libkin and Limsoon Wong},
  title     = "{``Semantic Representations and Query Languages for {OR}-Sets''}",
  booktitle = {Proc.\ PODS},
  pages     = {37--48},
  year      = 1993
}

@book{AHV95,
  author    = {Serge Abiteboul and Richard Hull and Victor Vianu},
  title     = "{Foundations of Databases}",
  year      = 1995,
  publisher = {Addison-Wesley}
}






% ancestor paper on the chase, see also MMS79
% ("tods" was an undefined @string macro; the journal name is spelled out)
@article{ABU79,
  author  = {Alfred V. Aho and Catriel Beeri and Jeffrey D. Ullman},
  title   = "{``The Theory of Joins in Relational Databases''}",
  journal = {ACM Transactions on Database Systems},
  volume  = {{\bf 4}},
  number  = 3,
  pages   = {297--314},
  year    = 1979
}

% original articulation of the chase (after ABU79)
@article{MMS79,
  author  = {David Maier and Alberto O. Mendelzon and Yehoshua Sagiv},
  title   = "{``Testing Implications of Data Dependencies''}",
  journal = {ACM Transactions on Database Systems},
  volume  = {{\bf 4}},
  number  = 4,
  pages   = {455--469},
  year    = 1979
}






% Removed the stray leading "Erhard " (the author's first name) that had
% leaked into the title.
@article{RD2000,
  author  = {Erhard Rahm and Hong Hai Do},
  title   = "{``Data Cleaning: Problems and Current Approaches''}",
  journal = {IEEE Data Engineering Bulletin},
  year    = 2000
}

@inproceedings{GFSS2000,
  title     = "{``AJAX: An Extensible Data Cleaning Tool''}",
  author    = {H. Galhardas and D. Florescu and D. Shasha and E. Simon},
  booktitle = {Proc.\ SIGMOD},
  year      = 2000
}

@inproceedings{RH2001,
  title     = "{``Potter's Wheel: An Interactive Data Cleaning System''}",
  author    = {V. Raman and J. M. Hellerstein},
  booktitle = {Proc.\ VLDB},
  year      = 2001
}

@inproceedings{CGGM2003,
  title     = "{``Robust and Efficient Fuzzy Match for Online Data Cleaning''}",
  author    = {S. Chaudhuri and K. Ganjam and V. Ganti and R. Motwani},
  booktitle = {Proc.\ SIGMOD},
  year      = 2003
}

@inproceedings{Koch2005a,
  title     = "{``On the Complexity of Nonrecursive XQuery and Functional Query Languages on Complex Values''}",
  author    = {Christoph Koch},
  booktitle = {Proc.\ PODS},
  year      = 2005
}






@inproceedings{INFOMIX,
  author    = {Nicola Leone and Gianluigi Greco and Giovambattista Ianni and
               Vincenzino Lio and Giorgio Terracina and Thomas Eiter and
               Wolfgang Faber and Michael Fink and Georg Gottlob and
               Riccardo Rosati and Domenico Lembo and Maurizio Lenzerini and
               Marco Ruzzi and Edyta Kalka and Bartosz Nowicki and
               Witold Staniszkis},
  title     = "{``The INFOMIX system for advanced integration of incomplete and inconsistent data''}",
  booktitle = {Proc.\ SIGMOD},
  year      = 2005,
  pages     = {915--917}
}

@inproceedings{Koch2005b,
  title     = "{``On the Role of Composition in XQuery''}",
  author    = {Christoph Koch},
  booktitle = {Proc.\ WebDB},
  year      = 2005
}

% introduction of the notions expression complexity and data complexity
@inproceedings{Var82,
  author     = {Moshe Y. Vardi},
  title      = "{``The Complexity of Relational Query Languages''}",
  booktitle  = {Proc.\ STOC},
  pages      = {137--146},
  OPTaddress = {San Francisco, CA, USA},
  OPTmonth   = may,
  day        = {5--7},
  year       = 1982
}

@inproceedings{ABC1999,
  author    = {Marcelo Arenas and Leopoldo E. Bertossi and Jan Chomicki},
  title     = "{``Consistent Query Answers in Inconsistent Databases''}",
  booktitle = {Proc.\ PODS},
  pages     = {68--79},
  year      = 1999
}






@misc{IPUMS,
  author       = {Steven Ruggles and Matthew Sobek and Trent Alexander and
                  Catherine A. Fitch and Ronald Goeken and
                  Patricia Kelly Hall and Miriam King and Chad Ronnander},
  title        = "{``Integrated Public Use Microdata Series: V3.0''}",
  year         = 2004,
  OPTaddress   = {Minneapolis, MN, USA},
  OPTpublisher = {Minnesota Population Center},
  OPTnote      = {http://www.ipums.org}
}

@inproceedings{AD98,
  author    = {Serge Abiteboul and Oliver M. Duschka},
  title     = "{``Complexity of Answering Queries Using
               Materialized Views''}",
  booktitle = {Proc.\ PODS},
  pages     = {254--263},
  year      = 1998
}

@article{PG1992,
  author  = {Jan Paredaens and Dirk Van Gucht},
  title   = "{``Converting nested algebra expressions into
             flat algebra expressions''}",
  pages   = {65--93},
  year    = 1992,
  journal = {{TODS}},
  volume  = {17},
  number  = {1}
}

@article{RBG96,
  author  = {Sudhir Rao and Antonio Badia and Dirk Van Gucht},
  title   = "{``Providing better support for a class of decision support queries''}",
  pages   = {217--227},
  year    = 1996,
  journal = {{SIGMOD} Record},
  volume  = {25},
  number  = {2}
}






@inproceedings{Rosa06c,
  author    = {Riccardo Rosati},
  title     = "{``On the decidability and finite controllability of query
               processing in databases with incomplete information''}",
  booktitle = {Proc.\ PODS},
  year      = 2006
}

@inproceedings{CDLR2004,
  author    = {Diego Calvanese and Giuseppe De Giacomo and Maurizio Lenzerini
               and Riccardo Rosati},
  title     = "{``Logical Foundations of Peer-to-Peer Data Integration''}",
  booktitle = {Proc.\ PODS},
  pages     = {241--251},
  year      = 2004
}

@inproceedings{BB2005,
  author    = {Leopoldo E. Bertossi and Loreto Bravo},
  title     = "{``Consistent Query Answers in
               Virtual Data Integration Systems''}",
  booktitle = {Inconsistency Tolerance},
  year      = 2005,
  pages     = {42--83}
}

@inproceedings{BBFL2005,
  author    = {Leopoldo E. Bertossi and Loreto Bravo and Enrico Franconi and
               Andrei Lopatenko},
  title     = "{``Complexity and Approximation of Fixing Numerical Attributes
               in Databases Under Integrity Constraints''}",
  booktitle = {Proc.\ DBPL},
  year      = 2005,
  pages     = {262--278}
}

@article{ABC2003,
  author  = {Marcelo Arenas and Leopoldo E. Bertossi and Jan Chomicki},
  title   = "{``Answer sets for consistent query answering in
             inconsistent databases''}",
  journal = {TPLP},
  volume  = {{\bf 3}},
  number  = {4--5},
  pages   = {393--424},
  year    = 2003
}

@inproceedings{CMS2004,
  author    = {Jan Chomicki and Jerzy Marcinkowski and Slawomir Staworko},
  title     = "{``Computing consistent query answers using conflict
               hypergraphs''}",
  booktitle = {Proc.\ CIKM},
  year      = 2004,
  pages     = {417--426}
}






% axioms for inequalities: p.886
@book{Ull89,
  author    = {Jeffrey D. Ullman},
  title     = "{Principles of Database \& Knowledge-Base Systems Vol. 2:
               The New Technologies}",
  publisher = {Computer Science Press},
  year      = 1989
}

% Restored the missing closing quotes at the end of the title.
@inproceedings{BFFR2005,
  author    = {Philip Bohannon and Wenfei Fan and Michael Flaster and
               Rajeev Rastogi},
  title     = "{``A Cost-Based Model and Effective Heuristic for
               Repairing Constraints by Value Modification''}",
  booktitle = {Proc.\ SIGMOD},
  month     = jun,
  year      = 2005
}

@inproceedings{BFGJK2007,
  author    = {Philip Bohannon and Wenfei Fan and Floris Geerts and
               Xibei Jia and Anastasios Kementsietsidis},
  title     = "{``Conditional Functional Dependencies for Data Cleaning''}",
  booktitle = {Proc.\ ICDE},
  year      = 2007
}

@misc{medicinenet,
  key   = {MedicineNet},
  title = {\textsf{http://www.medicinenet.com}}
}






@article{FKMP2005,
  author  = {Ronald Fagin and Phokion G. Kolaitis and Ren{\'e}e J. Miller and Lucian Popa},
  title   = "{``Data exchange: semantics and query answering''}",
  journal = {Theoretical Computer Science},
  volume  = {{\bf 336}},
  number  = 1,
  year    = 2005,
  pages   = {89--124}
}

@inproceedings{GT2006,
  author    = {Todd J. Green and Val Tannen},
  title     = "{``Models for Incomplete and Probabilistic Information''}",
  booktitle = {International Workshop on Incompleteness and
               Inconsistency in Databases (IIDB)},
  year      = 2006
}

% Dropped the OPT volume/number/pages fields, which were copy-paste
% leftovers from the FKMP2005 entry (TCS 336) and wrong for this journal.
@article{BSHW2006,
  author  = {Omar Benjelloun and Anish Das Sarma and Chris Hayworth and Jennifer Widom},
  title   = "{``An Introduction to ULDBs and the Trio System''}",
  journal = {{IEEE Data Engineering Bulletin}},
  year    = 2006
}

@inproceedings{BDSHW2006,
  author    = {Omar Benjelloun and Anish Das Sarma and Alon Halevy and
               Jennifer Widom},
  title     = "{``{ULDBs}: Databases with Uncertainty and Lineage''}",
  booktitle = {Proc.\ VLDB},
  year      = 2006
}




@inproceedings{STW2008,
  author    = {Anish Das Sarma and Martin Theobald and Jennifer Widom},
  title     = "{``Exploiting Lineage for Confidence Computation in Uncertain and Probabilistic Databases''}",
  booktitle = {Proc.\ ICDE},
  year      = 2008
}

@manual{triql2006,
  organization = {{Stanford Trio Project}},
  title        = "{``TriQL -- The Trio Query Language''}",
  year         = {2006},
  OPTedition   = {Revision 2.6.0},
  OPTnote      = {http://infolab.stanford.edu/~widom/triql.html}
}

@inproceedings{dalvi04efficient,
  author    = {Nilesh Dalvi and Dan Suciu},
  title     = "{``Efficient query evaluation on probabilistic databases''}",
  booktitle = {Proc.\ VLDB},
  pages     = {864--875},
  year      = 2004
}

@article{dalvi07efficient,
  author  = {Nilesh Dalvi and Dan Suciu},
  title   = "{``Efficient query evaluation on probabilistic databases''}",
  journal = {VLDB Journal},
  pages   = {523--544},
  volume  = {{\bf 16}},
  number  = {4},
  year    = 2007
}






@inproceedings{RDS07,
  author    = {Christopher Re and Nilesh Dalvi and Dan Suciu},
  title     = {Efficient Top-k Query Evaluation on Probabilistic Data},
  booktitle = {Proc.\ ICDE},
  year      = {2007},
  pages     = {886--895}
}

@inproceedings{suciu05mystiq,
  author    = {Jihad Boulos and Nilesh Dalvi and Bhushan Mandhani and
               Shobhit Mathur and Chris Re and Dan Suciu},
  title     = {{MYSTIQ}: a system for finding more answers by using probabilities},
  booktitle = {Proc.\ SIGMOD},
  year      = {2005},
  pages     = {891--893}
}

@inproceedings{miller06clean,
  author    = {Periklis Andritsos and Ariel Fuxman and Ren{\'e}e J. Miller},
  title     = "{``Clean Answers over Dirty Databases:
               A Probabilistic Approach''}",
  booktitle = {Proc.\ ICDE},
  year      = {2006},
  OPTpages  = {30}
}

@misc{GK2008,
  author = {Michaela Goetz and Christoph Koch},
  title  = "{``A Compositional Framework
            for Complex Queries over Uncertain Data''}",
  note   = {Under submission},
  year   = 2008
}






% Added the institution, which @techreport requires.
@techreport{Koch2008SO,
  author      = {Christoph Koch},
  title       = "{``A Compositional Query Algebra for Second-Order Logic and
                 Uncertain Databases''}",
  institution = {arXiv},
  number      = {arXiv:0807.4620},
  year        = 2008
}

@inproceedings{OHK2008,
  author    = {Dan Olteanu and Jiewen Huang and Christoph Koch},
  title     = "{``SPROUT: Lazy vs. Eager Query Plans for Tuple-Independent
               Probabilistic Databases''}",
  booktitle = {Proc.\ ICDE},
  year      = 2009
}

@inproceedings{KO2008,
  author    = {Christoph Koch and Dan Olteanu},
  title     = "{``Conditioning Probabilistic Databases''}",
  booktitle = {Proc.\ VLDB},
  year      = 2008
}

@article{OKA2008,
  author  = {Dan Olteanu and Christoph Koch and Lyublena Antova},
  title   = "{``World-set Decompositions: Expressiveness and
             Efficient Algorithms''}",
  journal = {Theoretical Computer Science},
  volume  = {{\bf 403}},
  number  = {2--3},
  pages   = {265--284},
  year    = 2008
}






@inproceedings{AKMUD2008,
  author    = {Lyublena Antova and Christoph Koch},
  title     = "{``On APIs for Probabilistic Databases''}",
  booktitle = {Proc.\ 2nd International Workshop on Management of Uncertain Data},
  address   = {Auckland, New Zealand},
  year      = 2008
}

@inproceedings{AKO07WSD,
  author    = {Lyublena Antova and Christoph Koch and Dan Olteanu},
  title     = "{``$10^{10^6}$ Worlds and Beyond: Efficient Representation
               and Processing of Incomplete Information''}",
  booktitle = {Proc.\ ICDE},
  year      = {2007}
}

@inproceedings{AKO07WSDb,
  author    = {Lyublena Antova and Christoph Koch and Dan Olteanu},
  title     = "{``MayBMS: Managing Incomplete Information with
               Probabilistic World-Set Decompositions''}",
  booktitle = {Proc.\ ICDE},
  year      = {2007},
  OPTnote   = {Demonstration Paper}
}

@inproceedings{AKO07ISQL,
  author    = {Lyublena Antova and Christoph Koch and Dan Olteanu},
  title     = "{``From Complete to Incomplete Information and Back''}",
  booktitle = {Proc.\ SIGMOD},
  year      = {2007}
}

@inproceedings{AKOVLDBDEMO2007,
  author    = {Lyublena Antova and Christoph Koch and Dan Olteanu},
  title     = "{``Query Language Support for
               Incomplete Information in the MayBMS System''}",
  booktitle = {Proc.\ VLDB},
  year      = 2007
}

@inproceedings{AJKO2008,
  author    = {Lyublena Antova and Thomas Jansen and
               Christoph Koch and Dan Olteanu},
  title     = "{``Fast and Simple Relational Processing of Uncertain Data''}",
  booktitle = {Proc.\ ICDE},
  year      = {2008}
}




@inproceedings{OH:SUM:2008,
  author    = {Dan Olteanu and Jiewen Huang},
  title     = "{``Conjunctive Queries with Inequalities on Probabilistic Databases''}",
  booktitle = {Proc.\ SUM},
  year      = 2008
}

@article{Bryant86,
  author  = {Randal E. Bryant},
  title   = {Graph-Based Algorithms for Boolean Function Manipulation},
  journal = {IEEE Trans. Computers},
  volume  = {35},
  number  = {8},
  year    = {1986},
  pages   = {677--691}
}

% Removed an unbalanced extra brace in the volume field, which broke
% parsing of the entries that follow; dropped the empty OPT placeholders.
@article{brayton87,
  author  = {R. K. Brayton},
  title   = "{``Factoring logic functions''}",
  journal = {{IBM J. Res. Develop.}},
  year    = {1987},
  volume  = {{\bf 31}},
  number  = {2}
}






% Fixed the "knowlege" typo in the title.
@article{darwicheJAIR02,
  author  = {Adnan Darwiche and Pierre Marquis},
  title   = "{``A knowledge compilation map''}",
  journal = {{Journal of AI Research}},
  volume  = {17},
  pages   = {229--264},
  year    = {2002}
}

@book{Meinel:OBDD:1998,
  author    = {Christoph Meinel and Thorsten Theobald},
  title     = {Algorithms and Data Structures in {VLSI} Design},
  year      = {1998},
  publisher = {Springer-Verlag}
}

@book{gj79,
  author    = {M. R. Garey and D. S. Johnson},
  title     = {Computers and intractability; a guide to the theory of
               {\it NP}-completeness},
  year      = {1979},
  publisher = {W.H. Freeman}
}

@techreport{re06prob,
  author      = {Chris Re and Nilesh Dalvi and Dan Suciu},
  title       = "{``Probabilistic Databases: Where and How''}",
  institution = {University of Washington},
  year        = {2005}
}








% Author's first name is "Meikel" (was misspelled "Mikel"); dropped the
% empty OPT placeholders.
@article{tpch2000,
  author  = {Meikel Poess and Chris Floyd},
  title   = "{``New TPC Benchmarks for Decision Support and
             Web Commerce''}",
  journal = {{SIGMOD Record}},
  year    = {2000},
  volume  = {29},
  number  = {4}
}

@manual{tpch2006,
  organization = {{Transaction Processing Performance Council}},
  title        = {{TPC Benchmark H (Decision Support)}},
  year         = {2006},
  edition      = {Revision 2.6.0},
  note         = {http://www.tpc.org/tpch/spec/tpch2.6.0.pdf}
}

@inproceedings{division,
  author    = {Ralf Rantzau and Christoph Mangold},
  title     = "{``Laws for Rewriting Queries
               Containing Division Operators''}",
  booktitle = {Proc.\ ICDE},
  year      = {2006}
}






@inproceedings{BunemanICDT01,
  author    = {Peter Buneman and Sanjeev Khanna and Wang-Chiew Tan},
  title     = "{``Why and Where: A Characterization of Data Provenance''}",
  booktitle = {Proc.\ ICDT},
  year      = 2001
}

@inproceedings{DS2007,
  author    = {Nilesh Dalvi and Dan Suciu},
  title     = "{``Management of Probabilistic Data: Foundations and
               Challenges''}",
  booktitle = {Proc.\ PODS},
  year      = 2007
}

@article{EIV2006,
  author  = {Ahmed K. Elmagarmid and Panagiotis G. Ipeirotis and
             Vassilios K. Verykios},
  title   = "{``Duplicate Record Detection''}",
  journal = {IEEE TKDE},
  volume  = {{\bf 19}},
  number  = 1,
  year    = 2006
}

@article{FR1997,
  author  = {Norbert Fuhr and Thomas R\"olleke},
  title   = "{``A Probabilistic Relational Algebra for the
             Integration of Information Retrieval and Database Systems''}",
  journal = {ACM Trans.\ Inf.\ Syst.},
  volume  = {{\bf 15}},
  number  = 1,
  pages   = {32--66},
  year    = 1997
}






@book{Vazirani2001,
  author    = {Vijay V. Vazirani},
  title     = "{Approximation Algorithms}",
  publisher = {Springer},
  year      = 2001
}

@inproceedings{KL1983,
  author    = {Richard M. Karp and Michael Luby},
  title     = "{``Monte-Carlo Algorithms for Enumeration and
               Reliability Problems''}",
  booktitle = {Proc.\ FOCS},
  year      = 1983,
  pages     = {56--64}
}

@article{KLM1989,
  author  = {Richard M. Karp and Michael Luby and Neal Madras},
  title   = "{``Monte-Carlo Approximation Algorithms for
             Enumeration Problems''}",
  journal = {J.\ Algorithms},
  volume  = {{\bf 10}},
  number  = 3,
  pages   = {429--448},
  year    = 1989
}

@article{DKLR2000,
  author  = {Paul Dagum and Richard M. Karp and Michael Luby and
             Sheldon M. Ross},
  title   = "{``An Optimal Algorithm for Monte Carlo Estimation''}",
  journal = {SIAM J.\ Comput.},
  volume  = {{\bf 29}},
  number  = 5,
  pages   = {1484--1496},
  year    = 2000
}

@unpublished{OHK2010,
  author = {Dan Olteanu and Jiewen Huang and Christoph Koch},
  title  = "{``Approximate Confidence Computation in Probabilistic Databases''}",
  note   = {Submitted to ICDE 2010}
}



BIN
Documents/census.eps
Normal file
BIN
Documents/census.eps
Normal file
Binary file not shown.
106
Documents/codebase.tex
Normal file
106
Documents/codebase.tex
Normal file

@@ -0,0 +1,106 @@






\chapter{The MayBMS Codebase}


\label{sect:codebase}




MayBMS is currently implemented in PostgreSQL 8.3.3. Integration into


an existing full-fledged DBMS brings two major advantages. First,


integration makes it possible to reuse the internal functions and


structures of the DBMS. Secondly, it often increases the efficiency of


query processing.




Figures~\ref{fig:modifiedfiles1} and \ref{fig:modifiedfiles2} give a


list of source files modified or added to the original PostgreSQL


8.3.3. All modifications are explicitly marked in the source files by


\begin{verbatim}


/* MAYBMS BEGIN */


... [some code goes here]


/* MAYBMS END */


\end{verbatim}


All files in directory \texttt{maybms} are newly created and the


others are existing files in PostgreSQL 8.3.3. Header files (*.h) refer


to \texttt{src/include/directory/filename}. Source files (*.c and *.y)


refer to \texttt{src/backend/directory/filename}.




\begin{figure}[ht]


\begin{center}


\small


\begin{tabular}{ll}


\hline


File & Description \\


\hline


parser/gram.y & Adds new constructs such as repair-key and possible. \\ \hline


parser/keyword.c & Adds necessary keywords. \\ \hline


nodes/parsenodes.h & Adds the relation type to structure CreateStmt. \\ \hline


catalog/pg\_class.h & Adds an extra column specifying the type of a relation \\


catalog/pg\_attribute.h & in the catalog. \\ \hline


nodes/copyfuncs.c & Copying the relation type. \\ \hline


catalog/heap.c & Execution of creating U-relations. \\ \hline


catalog/heap.h & An argument tabletype is added to function \\


catalog/toasting.c & heap\_create\_with\_catalog in heap.h. \\


commands/tablecmds.c & All files accessing this function are modified.\\


commands/cluster.c & \\


bootstrap/bootparse.y & \\


executor/execMain.c & \\ \hline


\end{tabular}


\end{center}




\vspace*{1em}


\caption{ Files related to U-relation creation.}


\label{fig:modifiedfiles1}


\end{figure}






\begin{figure}[ht]


\begin{center}


\small


\begin{tabular}{ll}


\hline


File & Description \\


\hline


catalog/pg\_proc.h & Registers conf, tconf, aconf, argmax, esum, ecount \\


& and the related functions. \\ \hline


catalog/pg\_aggregate.h & Specifies the relationships between conf, aconf and \\


& the related state, final functions. \\ \hline


nodes/execnodes.h & Adds confidence computation states to structure AggState. \\


executor/nodeAgg.c & \\ \hline


tcop/postgres.c & Access point to query rewriting. \\ \hline


maybms/conf\_comp.h & Prototypes for conf, tconf, aconf and their related functions. \\ \hline


maybms/SPROUT.c & Confidence computation of conf for hierarchical \\


& queries on tuple-independent U-relations using SPROUT. \\ \hline


maybms/tupleconf.c & Confidence computation for tconf. \\ \hline


maybms/wstree.c & Confidence computation of conf for arbitrary \\


& U-relations using the ws-tree-based algorithm. \\ \hline


maybms/bitset.h & Auxiliary files for the ws-tree-based algorithm. \\


maybms/bitset.c & \\ \hline


maybms/aconf.c & Implementation of approximate confidence computation. \\ \hline


maybms/signature.h & Derives signatures for hierarchical queries. \\


maybms/signature.c & \\ \hline


maybms/repair\_key.c & Implementation of the repair-key construct by pure rewriting. \\ \hline


maybms/pick\_tuples.c & Implementation of the pick-tuples construct by pure rewriting. \\ \hline


maybms/localcond.h & Storing the condition columns for confidence computation. \\


maybms/localcond.c & \\ \hline


maybms/argmax.c & Implementation of aggregate function argmax. \\ \hline


maybms/rewrite.c & Rewriting of select and create commands involving uncertainty. \\


maybms/rewrite\_utils.c & \\ \hline


maybms/rewrite\_updates.c & Rewriting of update commands (insert, delete, update). \\ \hline


maybms/supported.c & Checking whether a query is supported and should be rewritten. \\ \hline


maybms/utils.h & Utility functions. \\


maybms/utils.c & \\ \hline


\end{tabular}




\end{center}




\vspace*{1em}


\caption{ Files related to confidence computation and query rewriting. }


\label{fig:modifiedfiles2}


\end{figure}




\vspace*{10em}















292
Documents/experiments.tex
Normal file
292
Documents/experiments.tex
Normal file

@@ -0,0 +1,292 @@






\chapter{Experiments}








This section reports on experiments performed with the first MayBMS release


(beta) and a benchmark consisting of two parts,


which are described in more detail in the remainder of this chapter:


%


\begin{enumerate}


\item


Computing the probability of triangles in random graphs.




\item


A modified subset of the TPC-H queries on uncertain TPC-H datasets.


\end{enumerate}






By this benchmark, we do not attempt to simulate a representative set of


use cases: the jury is still out on what such a set of use cases might be.


Instead, we focus on a benchmark that allows us to see how the performance


of MayBMS develops across releases on the two core technical problems solved


by MayBMS: polynomial-time query evaluation for the polynomial-time fragment


of our query language and the efficient approximation of query results for


queries that do not belong to the polynomial-time fragment. (Finding triangles


in random graphs is a near-canonical example of such queries.)




We will keep monitoring the development of the state of the art and will


continue to survey applications and collect use cases; we will extend or


replace this benchmark as consensus develops regarding the most important


applications of probabilistic databases.






\medskip




Experimental setup.


All the experiments reported on in this chapter were conducted on an Athlon-X2(4600+) 64-bit / 1.8GB / Linux-2.6.20 / gcc-4.1.2 machine.








\section{Random Graphs}


\subsection{Experiments with Varying Levels of Precision}




In this experiment, we create undirected random graphs in which


the presence of each edge is independent of that of the other edges. The probability that an edge is in the graph is 0.5 and


this applies to each edge. Then we compute the probability that there exists a triangle in the graphs using approximation.


The queries can be found in Appendix~\ref{app:randgraph}.




We report wall-clock execution times


of queries run in the PostgreSQL 8.3.3 psql shell with a warm


cache obtained by running a query once and then reporting


the average execution time over three subsequent, identical executions.


Figure \ref{fig:randgraph} shows the execution time of approximation with different precision parameters for random graphs composed of 5 to 33 nodes. An ($\epsilon, \delta$) approximation has the following property: let $p$ be the exact probability and $\hat{p}$ be the approximate probability, then


$\Pr\big[ |p - \hat{p}| \ge \epsilon \cdot p \big] \le \delta$.




\begin{figure}[htp]




\begin{center}


\begin{tabular}{ | c | c | c | c | c | c | }


\hline


\multirow{2}{*}{\#nodes} & \multirow{2}{*}{\#clauses} & \multicolumn{4}{c}{Execution Time(Seconds)} \\ \cline{3-6}


& & (.05,.05) & (.01,.01) & (.005,.005) & (.001,.001) \\ \hline


5 & 10 & 0.01 & 0.03 & 0.11 & 2.08 \\ \hline


6 & 20 & 0.01 & 0.08 & 0.26 & 5.27 \\ \hline


7 & 35 & 0.02 & 0.14 & 0.46 & 9.15 \\ \hline


8 & 56 & 0.03 & 0.22 & 0.7 & 12.49 \\ \hline


9 & 84 & 0.04 & 0.28 & 0.85 & 14.95 \\ \hline


10 & 120 & 0.08 & 0.44 & 1.13 & 16.19 \\ \hline


11 & 165 & 0.15 & 0.60 & 1.60 & 17.98 \\ \hline


12 & 220 & 0.29 & 1.24 & 2.48 & 24.31 \\ \hline


13 & 286 & 0.55 & 2.38 & 4.74 & 35.29 \\ \hline


14 & 364 & 0.98 & 4.26 & 8.38 & 51.51 \\ \hline


15 & 455 & 1.56 & 6.74 & 13.29 & 73.00 \\ \hline


16 & 560 & 2.37 & 10.26 & 19.21 & 102.97 \\ \hline


17 & 680 & 3.46 & 14.6 & 28.76 & 144.02 \\ \hline


18 & 816 & 4.92 & 20.49 & 41.1 & 206.18 \\ \hline


19 & 969 & 7.03 & 28.52 & 56.43 & 291.21 \\ \hline


20 & 1140 & 9.97 & 39.72 & 81.01 & 395.18 \\ \hline


21 & 1330 & 14.74 & 57.13 & 123.79 & 597.86 \\ \hline


22 & 1540 & 23.94 & 119.81 & 218.62 & 600+ \\ \hline


23 & 1771 & 46.21 & 204.83 & 416.42 & 600+ \\ \hline


24 & 2024 & 79.03 & 411.67 & 600+ & 600+ \\ \hline


25 & 2300 & 115.64 & 515.65 & 600+ & 600+ \\ \hline


26 & 2600 & 159.66 & 600+ & 600+ & 600+ \\ \hline


27 & 2925 & 202.98 & 600+ & 600+ & 600+ \\ \hline


28 & 3276 & 251.82 & 600+ & 600+ & 600+ \\ \hline


29 & 3654 & 312.89 & 600+ & 600+ & 600+ \\ \hline


30 & 4060 & 387.72 & 600+ & 600+ & 600+ \\ \hline


31 & 4495 & 475.78 & 600+ & 600+ & 600+ \\ \hline


32 & 4960 & 582.4 & 600+ & 600+ & 600+ \\ \hline


33 & 5456 & 600+ & 600+ & 600+ & 600+ \\ \hline




\end{tabular}


\end{center}




\caption{Comparison between execution time of approximation with different precision}




\label{fig:randgraph}


\end{figure}




\subsection{Experiments with Different Edge Probabilities}




In the previous experiments, each edge had probability 0.5. We use other values as the edge probability (all edges still have the same probability) and run the experiment again with (0.05,0.05) approximation. The SQL statements in Appendix~\ref{app:randgraph} should be modified accordingly. Let $p$ be the edge probability; change the following statements


\begin{verbatim}


insert into inout values (1, 0.5);


insert into inout values (0, 0.5);


\end{verbatim}


to


\begin{verbatim}


insert into inout values (1, p);


insert into inout values (0, 1 - p);


\end{verbatim}


Figure \ref{fig:edgeprob} shows the execution time for queries of random graphs composed of 25 to 101 nodes with different fixed edge probabilities.




\begin{figure}[htp]




\begin{center}


\begin{tabular}{ | c | c | c | c | c | }


\hline


\multirow{2}{*}{\#nodes} & \multirow{2}{*}{\#clauses} & \multicolumn{3}{c}{Execution Time(Seconds)} \\ \cline{3-5}


& & p=0.5 & p=0.1 & p=0.05 \\ \hline


25 & 2300 & 115.64 & 1.77 & 0.55 \\ \hline


%26 & 2600 & 159.66 & 2.28 & 0.72 \\ \hline


%27 & 2925 & 202.98 & 2.52 & 0.83 \\ \hline


%28 & 3276 & 251.82 & 3.19 & 1.02 \\ \hline


%29 & 3654 & 312.89 & 3.73 & 1.19 \\ \hline


30 & 4060 & 387.72 & 4.13 & 1.35 \\ \hline


31 & 4495 & 475.78 & 4.94 & 1.54 \\ \hline


32 & 4960 & 582.40 & 5.72 & 1.82 \\ \hline


33 & 5456 & 600+ & 6.87 & 2.12 \\ \hline


%34 & 5984 & 600+ & 7.48 & 2.60 \\ \hline


35 & 6545 & 600+ & 8.74 & 2.74 \\ \hline


%36 & 7140 & 600+ & 9.59 & 3.12 \\ \hline


%37 & 7770 & 600+ & 11.53 & 3.63 \\ \hline


%38 & 8436 & 600+ & 13.38 & 3.92 \\ \hline


%39 & 9139 & 600+ & 15.32 & 4.6 \\ \hline


40 & 9880 & 600+ & 18.32 & 5.06 \\ \hline


%41 & 10660 & 600+ & 20.65 & 5.76 \\ \hline


%42 & 11480 & 600+ & 23.91 & 6.51 \\ \hline


%43 & 12341 & 600+ & 28.44 & 7.7 \\ \hline


%44 & 13244 & 600+ & 32.38 & 8.48 \\ \hline


45 & 14190 & 600+ & 36.77 & 8.96 \\ \hline


%46 & 15180 & 600+ & 41.09 & 9.99 \\ \hline


%47 & 16215 & 600+ & 48.68 & 11.45 \\ \hline


%48 & 17296 & 600+ & 54.66 & 12.62 \\ \hline


%49 & 18424 & 600+ & 61.05 & 13.39 \\ \hline


50 & 19600 & 600+ & 70.79 & 15.79 \\ \hline


%51 & 20825 & 600+ & 80.19 & 16.09 \\ \hline


%52 & 22100 & 600+ & 88.32 & 17.16 \\ \hline


%53 & 23426 & 600+ & 97.99 & 19.49 \\ \hline


%54 & 24804 & 600+ & 112.07 & 21.58 \\ \hline


55 & 26235 & 600+ & 123.69 & 21.97 \\ \hline


%56 & 27720 & 600+ & 138.92 & 25.73 \\ \hline


%57 & 29260 & 600+ & 155.86 & 27.52 \\ \hline


%58 & 30856 & 600+ & 172.39 & 29.37 \\ \hline


%59 & 32509 & 600+ & 190.98 & 32.06 \\ \hline


60 & 34220 & 600+ & 214.06 & 33.94 \\ \hline


%61 & 35990 & 600+ & & 36.97 \\ \hline


%62 & 37820 & 600+ & & 38.40 \\ \hline


%63 & 39711 & 600+ & & 42.80 \\ \hline


%64 & 41664 & 600+ & & 43.89 \\ \hline


65 & 43680 & 600+ & 343.66 & 47.09 \\ \hline


%66 & 45760 & 600+ & & 51.56 \\ \hline


%67 & 47905 & 600+ & & 54.87 \\ \hline


68 & 50116 & 600+ & 451.06 & 59.87 \\ \hline


69 & 52934 & 600+ & 490.64 & 64.69 \\ \hline


70 & 54740 & 600+ & 542.61 & 68.98 \\ \hline


71 & 57155 & 600+ & 595.03 & 72.88 \\ \hline


72 & 59640 & 600+ & 600+ & 82.30 \\ \hline


75 & 67525 & 600+ & 600+ & 106.49 \\ \hline


80 & 82160 & 600+ & 600+ & 154.92 \\ \hline


85 & 98770 & 600+ & 600+ & 224.3 \\ \hline


90 & 117480 & 600+ & 600+ & 316.28 \\ \hline


95 & 138415 & 600+ & 600+ & 437.39 \\ \hline


97 & 147440 & 600+ & 600+ & 510.39 \\ \hline


98 & 152096 & 600+ & 600+ & 543.87 \\ \hline


99 & 156849 & 600+ & 600+ & 558.44 \\ \hline


100 & 161700 & 600+ & 600+ & 593.84 \\ \hline


101 & 166650 & 600+ & 600+ & 600+ \\ \hline




\end{tabular}


\end{center}




\caption{Comparison between execution time of queries of random graphs with different fixed edge probabilities}




\label{fig:edgeprob}


\end{figure}




\subsection{Experiments with General Random Graphs}




The previous experiments were conducted on undirected graphs in which every pair of nodes had a possibly present edge. However, this may not be the case in general. In many scenarios, each pair of nodes may have a certainly present, certainly absent or possibly present edge. In our following experiments, we construct such general probabilistic random graphs from data representing directed links between webpages within the nd.edu domain\footnote{\url{http://www.nd.edu/~networks/resources/www/www.dat.gz}}. If a link between two pages is absent from the data, then it is also absent from our graphs. If a link is present in the data, then it is a certainly or possibly present edge in our graphs. We run again the queries computing the probabilities of existence of triangles in such graphs with (0.05,0.05) approximation. The probabilities that possibly present edges are in the graphs are randomly distributed in (0,0.1). The queries of the graph constructions and confidence computation can be found in Appendix~\ref{app:generalrandgraph}. Figure \ref{fig:generalrandgraph} shows the execution time for queries of such random graphs composed of 1000 to 30000 nodes.




\begin{figure}[htp]




\begin{center}


\begin{tabular}{ | c | c | c | c | }


\hline


\#nodes & \#possible edges & \#clauses & Execution Time(Seconds) \\ \hline


1000 & 3271 & 6367 & 4.04 \\ \hline


2000 & 6446 & 12598 & 11.84 \\ \hline


3000 & 9056 & 19836 & 21.88 \\ \hline


4000 & 11366 & 22455 & 28.57 \\ \hline


5000 & 13497 & 24574 & 31.38 \\ \hline


6000 & 16095 & 25731 & 35.36 \\ \hline


7000 & 17958 & 26070 & 35.82 \\ \hline


8000 & 23113 & 39481 & 80.14 \\ \hline


9000 & 26114 & 43369 & 115.45 \\ \hline


10000 & 32975 & 51586 & 140.00 \\ \hline


11000 & 35507 & 55562 & 157.34 \\ \hline


12000 & 37623 & 57260 & 170.05 \\ \hline


13000 & 40246 & 61060 & 197.67 \\ \hline


14000 & 44045 & 66530 & 225.88 \\ \hline


15000 & 45434 & 66966 & 230.51 \\ \hline


16000 & 47814 & 69787 & 260.70 \\ \hline


17000 & 50456 & 72710 & 278.48 \\ \hline


18000 & 52145 & 73043 & 280.76 \\ \hline


19000 & 53849 & 73437 & 288.01 \\ \hline


20000 & 55584 & 73953 & 289.30 \\ \hline


21000 & 57654 & 74688 & 290.37 \\ \hline


22000 & 59274 & 74991 & 295.66 \\ \hline


23000 & 61308 & 75954 & 296.13 \\ \hline


24000 & 63000 & 76288 & 313.13 \\ \hline


25000 & 65538 & 79404 & 354.95 \\ \hline


26000 & 69741 & 89888 & 439.01 \\ \hline


27000 & 72741 & 93016 & 479.78 \\ \hline


28000 & 76148 & 98065 & 553.75 \\ \hline


29000 & 79414 & 104328 & 573.24 \\ \hline


30000 & 82714 & 107633 & 601.33 \\ \hline




\end{tabular}


\end{center}




\caption{Execution time of confidence computation for existence of triangles in general random graphs}




\label{fig:generalrandgraph}


\end{figure}




\section{Probabilistic TPCH}




SPROUT\footnote{http://web.comlab.ox.ac.uk/projects/SPROUT/index.html} is a part


of the query engine of MayBMS and provides state-of-the-art techniques for efficient


exact confidence computation. In this section, we show how TPCH queries can


benefit from these techniques. For each TPCH query, we consider its largest subquery


without aggregations and inequality joins but with


conf() for specifying exact probability computation


for distinct tuples in query answers. We consider two


flavours of each of these queries: A version with original


selection attributes (again, without aggregations), and a version


where we drop keys from the selection attributes. Queries are included in the


experiments if SPROUT's techniques can be applied to them. Our data set consists


of tupleindependent probabilistic databases obtained from deterministic


databases produced by TPCH 2.7.0 by associating each


tuple with a Boolean random variable and by choosing at random


a probability distribution over these variables. We perform experiments with TPCH


scale factor 1 (1GB database size) and evaluate the


TPCH-like queries mentioned above. The queries can


be found in Appendix~\ref{app:tpch}. In addition, we compare our results


with the reported time from \cite{OHK2008} in which SPROUT was only partially


integrated into PostgreSQL and storing temporary relations to the disk was


sometimes necessary. The average time shown below is obtained from ten subsequent,


identical executions with a warm cache obtained by running the query once.




\begin{figure}[htp]




\begin{center}


\begin{tabular}{ | c | c | c | }


\hline


Query & \multicolumn{2}{c}{Average Time(Seconds)} \\ \cline{2-3}


& Current running time & Time reported in \cite{OHK2008} \\ \hline


1 & 8.21 & 120.13 \\ \hline


4 & 40.57 & 39.52 \\ \hline


12 & 17.1 & 21.94 \\ \hline


15 & 5.5 & 3.2 \\ \hline


B1 & 5.37 & 14.92 \\ \hline


B4 & 31.88 & 33.02 \\ \hline


B6 & 3.82 & 6.37 \\ \hline


B12 & 15.91 & 18.56 \\ \hline


B14 & 4.17 & 4.86 \\ \hline


B15 & 4.81 & 5.24 \\ \hline


B16 & 0.87 & 3.16 \\ \hline


B17 & 3.25 & 2.43 \\ \hline




\end{tabular}


\end{center}


\caption{Current running times vs.\ running times reported in \cite{OHK2008}.


Boolean queries are prefixed by B. }




\end{figure}







1361
Documents/foundations.tex
Normal file
1361
Documents/foundations.tex
Normal file
File diff suppressed because it is too large
Load diff
75
Documents/generalrandgraph.tex
Normal file
75
Documents/generalrandgraph.tex
Normal file

@ 0,0 +1,75 @@


\chapter{Queries in General Random Graph Experiments}


\label{app:generalrandgraph}




\begin{verbatim}




drop table data0;


drop table data;




create table data0(u int, v int);


create table data(u int, v int);




/* Copy the data to a relation. */


copy data0


from 'path_of_the_data_file/www.dat' with delimiter as ' ';




/* Since the data represents a directed graph, we need to


insert all tuples again with u and v swapped.


*/


insert into data0


select v, u from data0;




/* This fetches the distinct pairs of (u,v), which represents


all edges of an undirected graph.


*/


insert into data


select distinct u, v from data0;




drop table edges;


drop table edge0;




create table edges (u integer, v integer, p float4);




/* This fetches all the edges related to the nodes we intend to


keep in the graph.


'1000' in 'u < 1000 and v < 1000' is the number of nodes


which will appear in the graph.


'0.01' in 'random() < 0.01' is the proportion of certainly


present edges in all edges.


'0.1' is the upper bound of the probability that a possibly


present edge is in the graph.


You may change the abovementioned three parameters in the


experiments.


*/


insert into edges


select u, v,


CASE WHEN random() < 0.01 THEN 1.0


ELSE random() * 0.1


END


from data


where u < 1000 and v < 1000 and u < v;




/* The number of edges in the graph */


select count(*) as edge_count from edges;




/* The number of clauses in the confidence computation */


select count(*) as clause_count from


edges e1, edges e2, edges e3


where e1.v = e2.u and e2.v = e3.v and e1.u = e3.u


and e1.u < e2.u and e2.u < e3.v;




/* Creation of an uncertain relations representing the graph */


create table edge0 as


(pick tuples from edges independently with probability p);




/* Confidence computation of existence of at least


a triangle in the graph


*/


select aconf(.05,.05) as triangle_prob


from edge0 e1, edge0 e2, edge0 e3


where e1.v = e2.u and e2.v = e3.v and e1.u = e3.u


and e1.u < e2.u and e2.u < e3.v;




\end{verbatim}


\newpage



436
Documents/language.tex
Normal file
436
Documents/language.tex
Normal file

@ 0,0 +1,436 @@




\chapter{The MayBMS Query and Update Language}






\section{Language Overview}


\label{sect:ql}






This section describes the query and update language of MayBMS, which is based on SQL.


In fact, our language is a generalization of SQL on classical relational databases.


To simplify the presentation, a fragment of the full language supported in MayBMS is presented here.




The representation system used in MayBMS, Urelations, has as a special case classical relational tables, that is, tables with no condition columns. We will call these tables {\em typedcertain (tcertain) tables}\/ in this section. Tables that are not tcertain are called uncertain.


Note that this notion of certainty is


purely syntactic, and


\[


\mbox{cert}(R) = \pi_{sch(R)}(\sigma_{P=1}(\mbox{conf}(R)))


\]


may well be equal to the projection of a Urelation $U_R$ to its attribute (noncondition) columns despite $R$ not being tcertain according to this definition.




\paragraph{Aggregates}


In MayBMS, full SQL is supported on tcertain tables.


Beyond tcertain tables, some restrictions are in place to assure that query evaluation is feasible. In particular, we do not support the standard SQL aggregates such as {\tt sum} or {\tt count} on uncertain relations. This can be easily justified: In general, these aggregates will produce exponentially many different numerical results in the various possible worlds, and there is no way of representing these results efficiently. However, MayBMS supports a different set of aggregate operations on uncertain relations. These include the computations of {\em expected}\/ sums and counts (using aggregates {\tt esum} and {\tt ecount}).




Moreover, the confidence computation operation is an aggregate in the MayBMS query language.


This is a deviation from the language flavor of our algebra, but there is a justification for


this. The algebra presented earlier assumed a setbased semantics for relations, where operations


such as projections automatically remove duplicates. In the MayBMS query language, just like in SQL, duplicates have to be eliminated explicitly, and confidence is naturally an aggregate that computes a single confidence value for each group of tuples that agree on (a subset of) the noncondition columns.


By using aggregation syntax for {\tt conf} and not supporting {\tt select distinct} on uncertain relations, we avoid a need for conditions beyond the special conjunctions that can be stored with each tuple in


Urelations.




All supported aggregates on uncertain tables produce tcertain tables.




\paragraph{Duplicate tuples}


SQL databases in general support multiset tables, i.e., tables in which there may be duplicate tuples. There is no conceptual difficulty at all in supporting multiset Urelations. In fact, since Urelations are just relations in which some columns are interpreted to have a special meaning (conditions), just storing them in a standard relational database management system which supports duplicates in tables yields support for multiset Urelations.




\paragraph{Syntax}


The MayBMS query language is compositional and built from uncertain and tcertain queries.


The uncertain queries are those that produce a possibly uncertain relation (represented by a Urelation with more than zero $V$ and $D$ columns). Uncertain queries can be constructed, inductively, from tcertain queries, {\tt selectfromwhere} queries over uncertain tables, the multiset union of uncertain queries (using the SQL {\tt union} construct), and the {\tt repairkey} and {\tt picktuples} statements that can be specified as follows


\begin{verbatim}


repair key <attributes> in


(<tcertainquery> | <tcertainrelation>)


[weight by <expression>];




pick tuples from


<tcertainquery> | <tcertainrelation>


[independently]


[with probability <expression>];


\end{verbatim}


Note that {\tt repairkey} is a query, rather than an update statement.


Details on these constructs can be found in Section~\ref{sec:langref}, Language reference.




The {\tt selectfromwhere} queries may use any tcertain subqueries in the conditions, plus uncertain subqueries in atomic conditions of the form


\begin{verbatim}


<tuple> in <uncertainquery>


\end{verbatim}


that occur positively in the condition. (That is, if the condition is turned into DNF, these literals are not negated.)




\nop{


Uncertain queries also support a construct {\tt tconf()} that can be used in the select list and outputs, for each tuple selected, its confidence. Applied to a multiset Urelation, this operation does not eliminate duplicates and compute aggregate confidence values as {\tt conf} does, but outputs the probability of the (conjunctive) condition for each of the tuples in the multiset.


} % end nop




The tcertain queries (i.e., queries that produce a tcertain table) are given by


\begin{itemize}


\item


all constructs of SQL on tcertain tables and tcertain subqueries, extended by a new aggregate


\begin{verbatim}


argmax(<argumentattribute>, <valueattribute>)


\end{verbatim}


which outputs one of the {\tt argumentattribute} values in the current group (determined by the groupby clause) whose tuples have a maximum {\tt valueattribute} value within the group. Thus, this is the typical argmax construct from mathematics added as an SQL extension.




\item


{\tt selectfromwheregroupby} on uncertain queries using the {\tt possible} construct for computing possible tuples, or the aggregates {\tt conf}, {\tt esum}, and {\tt ecount}, but none of the standard SQL aggregates. There is an exact and an approximate version of the {\tt conf} aggregate. The


latter takes two parameters $\epsilon$ and $\delta$ (see the earlier discussion of the KarpLuby FPRAS).


\end{itemize}






The aggregates {\tt esum} and {\tt ecount} compute expected sums and counts across groups of tuples.


While it may seem that these aggregates are at least as hard as confidence computation (which is \#Phard), this is in fact not so.


These aggregates can be efficiently computed exploiting linearity of expectation.


A query


\begin{verbatim}


select A, esum(B) from R group by A;


\end{verbatim}


is equivalent to a query


\begin{verbatim}


select A, sum(B * P) from R' group by A;


\end{verbatim}


where {\tt R'} is obtained from the Urelation of {\tt R} by


replacing each local condition $V_1, D_1$, $\dots$, $V_k$, $D_k$ by the probability


$\Pr[V_1=D_1 \land \dots \land V_k=D_k]$, not eliminating duplicates.


That is, expected sums can be computed efficiently tuple by tuple, and only require to determine the probability of a conjunction, which is easy, rather than a DNF of variable assignments


as in the case of the {\tt conf} aggregate.


The {\tt ecount} aggregate is a special case of {\tt esum} applied to a column of ones.






\begin{example}\em


\label{ex:coins_sql}


The query of Example~\ref{ex:twotosses} can be expressed in the query language of MayBMS as follows.


Let {\tt R} be {\tt repair key in Coins weight by Count} and let {\tt S} be


\begin{verbatim}


select R.Type, Toss, Face


from (repair key Type, Toss in (select * from Faces, Tosses)


weight by FProb) S0, R


where R.Type = S0.Type;


\end{verbatim}




It is not hard to verify that $\pi_{\mathrm{Toss}, \mathrm{Face}}(S) \neq Ev$


exactly if there exist tuples $\vec{s} \in S, \vec{t} \in Ev$ such that


$\vec{s}.\mathrm{Toss}=\vec{t}.\mathrm{Toss}$ and


$\vec{s}.\mathrm{Face} \neq \vec{t}.\mathrm{Face}$.


Let {\tt C} be


\begin{verbatim}


select S.Type from S, Ev


where S.Toss = Ev.Toss and S.Face <> Ev.Face;


\end{verbatim}




Then we can compute {\tt Q} using the trick of Example~\ref{ex:trick} as


% BUGFIX BEGIN


\begin{verbatim}


select Type, (P1-P2)/(1-P3) as P


from (select Type, conf() as P1 from S group by Type) Q1,


((select Type, conf() as P2 from C group by Type)


union


(


(select Type, 0 as P2 from Coins)


except


(select Type, 0 as P2 from


(select Type, conf() from C group by Type) Dummy)


)) Q2,


(select conf() as P3 from C) Q3


where Q1.Type = Q2.Type;


\end{verbatim}


% BUGFIX END




The argmax aggregate can be used to compute maximumaposteriori (MAP) and maximumlikelihood estimates.


For example,


the MAP coin type


\[


\mbox{argmax}_{\mathrm{Type}} \; \Pr[\mbox{evidence is twice heads} \land \mbox{coin type is Type}]


\]


can be computed as


{\tt select argmax(Type, P) from Q}


because the normalizing factor {\tt (1-P3)} has no impact on argmax. Thus, the answer in this example


is the doubleheaded coin. (See table $Q$ of Figure~\ref{fig:twotosses_tables}: The fair coin has $P=1/3$, while the doubleheaded coin has $P=2/3$.)




The maximum likelihood estimate


\[


\mbox{argmax}_{\mathrm{Type}} \; \Pr[\mbox{evidence is twice heads} \mid \mbox{coin type is Type}]


\]


can be computed as


\begin{verbatim}


select argmax(Q.Type, Q.P/R'.P)


from Q, (select Type, conf() as P from R) R'


where Q.Type = R'.Type;


\end{verbatim}


Here, again, the result is 2headed, but this time with likelihood


1. (The fair coin has likelihood 1/4).


%


\punto


\end{example}






\paragraph{Supported Queries}


\index{Supported Queries}


MayBMS supports full SQL on tcertain tables. In addition it supports a large subset of SQL on tuncertain tables, with even more features supported when fragments of the uncertain query involve tcertain subqueries. The following restrictions apply:


\begin{itemize}


\item


Exact aggregates and duplicate elimination using {\tt distinct} in a select statement are supported as long as the from clause subqueries and the subqueries in the where condition are tcertain.


\item


If a tcertain subquery Q in the where condition of a select statement contains references to tuncertain tables, then the containing query is supported if Q is not correlated with it.


\item


The set operations {\tt except} and {\tt union} with duplicate elimination are supported when both the left and the right argument are tcertain queries.


\item


{\tt repairkey} and {\tt picktuples} are supported on tcertain queries.


\end{itemize}




Restrictions on the update statements are discussed below.




\paragraph{Updates}


\index{Updates}


%


MayBMS supports the usual schema modification and update statements of SQL. In fact, our use of Urelations makes this quite easy.


An insertion of the form


\begin{verbatim}


insert into <uncertaintable> (<uncertainquery>);


\end{verbatim}


is just the standard SQL insertion for tables we interpret as Urelations. Thus, the table inserted into must have the right number (that is, a sufficient number) of condition columns.


Schemamodifying operations such as


\begin{verbatim}


create table <uncertaintable> as (<uncertainquery>);


\end{verbatim}


are similarly straightforward.


A deletion


\begin{verbatim}


delete from <uncertaintable>


where <condition>;


\end{verbatim}


admits conditions that refer to the attributes of the current tuple and may use tcertain subqueries.


One can also update an uncertain table with an update statement


\begin{verbatim}


update <uncertaintable>


set <attribute> = <expr> [,...]


where <condition>;


\end{verbatim}


where the set list does not modify the condition columns and the where condition satisfies the same conditions as that of the delete statement. MayBMS allows users to insert a constant tuple by specifying values for the data columns in an insert statement:


\begin{verbatim}


insert into <uncertaintable> [<attribute_list>] <tuple>;


\end{verbatim}






\section{Language Reference}


\label{sec:langref}




We next discuss the extensions to SQL by MayBMS.


For a description of the standard SQL constructs please see the Postgres SQL language reference available at




\url{http://www.postgresql.org/docs/8.3/interactive/sqlcommands.html}






\subsection{repairkey}


\textbf{Syntax:}


\begin{verbatim}


repair key <attributes> in


(<tcertainquery> | <tcertainrelation>)


[ weight by <expression> ]


\end{verbatim}




\noindent \textbf{Description:}


The {\tt repairkey} operation turns a {\em tcertainquery}\/


(or, as a special case, a {\em tcertainrelation}\/) into the set of worlds consisting of all possible


{\em maximal repairs}\/ of key $attributes$. A repair of key $\vec{A}$ in


relation $R$ is a subset of $R$ for which $\vec{A}$ is a key.


We say that relation $R'$ is a {\em maximal repair}\/ of a functional dependency


for relation $R$ if $R'$ is a maximal subset of $R$ which satisfies that


functional dependency. The numericallyvalued $expression$ is used for


weighting the newly created alternative repairs.


If the {\tt weight by} clause is omitted, a uniform probability distribution is assumed among all tuples with


the same key. Suppose there are $n$ tuples sharing the same key, each of them is


associated with a probability of $1/n$. If the weight is specified by $expression$,


the value of $expression$ will be the probability of the tuple before normalization.


Suppose there are $n$ tuples sharing the same key, tuple $t_i$ is associated


with probability $expression_i$ / $\sum_{k=1}^n expression_k$. In either case,


the sum of the probabilities among all tuples with the same key is 1.


There will be an error message if the value of $expression$ in any tuple is


negative. The tuples for which probability is 0 are ignored and not included in any resulting possible world.




{\tt repairkey} can be placed wherever a select statement is allowed in SQL.


See Section~\ref{sect:pwsa} for more details on {\tt repairkey}.




\noindent \textbf{Example:}


Suppose $Customer$ is a certain


relation with columns $ID$ and $name$, the following query performs a {\tt repairkey} operation on column $ID$ in $Customer$:




\begin{verbatim}


repair key ID in Customer;


\end{verbatim}




Suppose $Accounts$ is a certain relation with columns $ID$ and $account$, the following is an example of {\tt repairkey} operation on column $ID$ in the output of a certain query:




\begin{verbatim}


repair key ID in


(select * from Customer natural join Accounts);


\end{verbatim}








\subsection{picktuples}


\textbf{Syntax:}


\begin{verbatim}


pick tuples from


<tcertainquery> | <tcertainrelation>


[independently]


[with probability <expression>];


\end{verbatim}




\noindent \textbf{Description:}


%


The {\tt picktuples} operation generates the set of worlds which can be obtained from a {\it tcertainquery} or a {\it tcertainrelation} by selecting a subset of the tuples of that query or relation. In the current version of MayBMS, the presence of {\tt independently} does not affect query evaluation. It is the default; in the future, MayBMS may be extended by other options.




By default, every tuple in a possible world is associated with probability 0.5. If {\tt with probability} $expression$ is specified, the numerical value of $expression$ is the probability of the tuple. Note that only values in (0,1] are valid. There will be an error message if the value of $expression$ is negative or larger than 1. Tuples for which $expression$ is 0 are ignored.




{\tt picktuples} can be placed wherever a select statement is allowed in SQL.




\subsection{possible}


\noindent \textbf{Syntax:}


\begin{verbatim}


select possible <attributes> from <query> | <relation>;


\end{verbatim}




\noindent \textbf{Description:}


The operation {\tt possible} selects the set of tuples appearing in at least one possible world. This construct is a shortcut for the query which selects all distinct tuples with confidence greater than zero:


\begin{verbatim}


select distinct <attributes> from


(select <attributes>, tconf() as conf from <query> | <relation>


where conf > 0) Q;


\end{verbatim}






\noindent \textbf{Example:}


Suppose R and S are uncertain relations, the following query displays distinct pairs (A,B) with positive probabilities.


\begin{verbatim}


select possible A, B from R, S;


\end{verbatim}






\subsection{Confidence computation and approximate aggregates}




{\tt argmax}, {\tt conf}, {\tt aconf}, {\tt tconf}, {\tt esum} and {\tt ecount} are functions introduced by MayBMS. Following is the summary of the functions. \\




\begin{small}


\begin{tabular}{ll}


\hline


Name & Brief Description \\


\hline


argmax(argument, value) & Returns the argument with the maximum value. \\ \hline


conf() & Returns the exact confidence of distinct tuples. \\ \hline


conf(approach, $\epsilon$) & Returns the approximate confidence of distinct tuples. \\ \hline


aconf($\epsilon$, $\delta$) & Returns the approximate confidence of distinct tuples. \\ \hline


tconf() & Returns the exact confidence of tuples. \\ \hline


esum(attribute) & Returns the expected sum over distinct tuples. \\ \hline


ecount(attribute) & Returns the expected count over distinct tuples. \\ \hline


\end{tabular}


\end{small}




\setcounter{secnumdepth}{3}




\subsubsection{argmax(argumentattribute, valueattribute)}




Outputs an {\tt argumentattribute} value in the current group (determined by the groupby clause) whose tuples have a maximum {\tt valueattribute} value within the group. If there are several tuples sharing the same maximum {\tt valueattribute} value with different {\tt argumentattribute} values, an arbitrary value among them is returned. For example,


\begin{verbatim}


select location, argmax(date, temperature)


from weather_reports


group by location;


\end{verbatim}


retrieves one of the dates with the highest temperature for each location.




{\tt argmax} can be used on all relations and queries.



