paper-HILDA-2016-Spreadsheets/okennedy.bib

%% This BibTeX bibliography file was created using BibDesk.
%% http://bibdesk.sourceforge.net/

%% Created for Oliver Kennedy at 2016-04-21 20:44:40 -0400


%% Saved with string encoding Unicode (UTF-8)


% Zloof's "Query by Example" paper, 1975 National Computer Conference (AFIPS '75).
@inproceedings{Zloof:1975:QE:1499949.1500034,
  author        = {Zloof, Mosh{\'e} M.},
  title         = {Query by Example},
  booktitle     = {Proceedings of the May 19-22, 1975, National Computer Conference and Exposition},
  series        = {AFIPS '75},
  location      = {Anaheim, California},
  year          = {1975},
  pages         = {431--438},
  numpages      = {8},
  publisher     = {ACM},
  address       = {New York, NY, USA},
  doi           = {10.1145/1499949.1500034},
  acmid         = {1500034},
  url           = {http://doi.acm.org/10.1145/1499949.1500034},
  date-added    = {2016-04-22 00:44:16 +0000},
  date-modified = {2016-04-22 00:44:16 +0000},
  bdsk-url-1    = {http://doi.acm.org/10.1145/1499949.1500034},
  bdsk-url-2    = {http://dx.doi.org/10.1145/1499949.1500034},
}

% Saltzer and Kaashoek's textbook on computer system design (2009).
% The title is stored in Title Case and initials carry periods, because many
% styles print book titles and full names exactly as entered.
@book{saltzer2009principles,
	Author = {Saltzer, Jerome H. and Kaashoek, M. Frans},
	Date-Added = {2016-04-11 20:51:46 +0000},
	Date-Modified = {2016-04-11 20:51:46 +0000},
	Publisher = {Morgan Kaufmann},
	Title = {Principles of Computer System Design: An Introduction},
	Year = {2009}}

% Erwig and Burnett, "Adding Apples and Oranges", a paper in the PADL 2002
% proceedings volume. Stored as a proceedings paper: the paper title is the
% Title and the volume title is the Booktitle. (The publisher export used an
% inbook entry with the paper title in Chapter, which classic styles render as
% a chapter number and which conflicts with having both Author and Editor.)
@inproceedings{Erwig2002,
	Address = {Berlin, Heidelberg},
	Author = {Erwig, Martin and Burnett, Margaret},
	Booktitle = {Practical Aspects of Declarative Languages: 4th International Symposium, PADL 2002 Portland, OR, USA, January 19--20, 2002 Proceedings},
	Date-Added = {2016-04-11 18:48:43 +0000},
	Date-Modified = {2016-04-11 18:48:43 +0000},
	Doi = {10.1007/3-540-45587-6_12},
	Editor = {Krishnamurthi, Shriram and Ramakrishnan, C. R.},
	Isbn = {978-3-540-45587-5},
	Pages = {173--191},
	Publisher = {Springer Berlin Heidelberg},
	Title = {Adding Apples and Oranges},
	Url = {http://dx.doi.org/10.1007/3-540-45587-6_12},
	Year = {2002},
	Bdsk-Url-1 = {http://dx.doi.org/10.1007/3-540-45587-6_12}}

% Ives et al., "Looking at Everything in Context".
% The entry lacked the booktitle and year that proceedings papers require.
% Year is taken from the citation key.
% NOTE(review): venue filled in as CIDR (2015) from memory -- confirm against the paper.
@inproceedings{ives2015looking,
	Author = {Ives, Zachary G. and Yan, Zhepeng and Zheng, Nan and Litt, Brian and Wagenaar, Joost B.},
	Booktitle = {Conference on Innovative Data Systems Research ({CIDR})},
	Date-Added = {2016-04-04 15:50:57 +0000},
	Date-Modified = {2016-04-04 15:50:57 +0000},
	Title = {Looking at Everything in Context},
	Year = {2015}}

% Wang, Meliou and Wu: the QFix arXiv preprint (arXiv:1601.07539).
% Typed as misc because there is no journal (an article entry without one
% triggers a missing-field warning); the arXiv identity is carried by
% Archiveprefix + Eprint. Month uses the standard macro, and the system name
% is brace-protected so sentence-casing styles do not turn it into "Qfix".
@misc{Wang:2016aa,
	Abstract = {Data-driven applications rely on the correctness of their data to function properly and effectively. Errors in data can be incredibly costly and disruptive, leading to loss of revenue, incorrect conclusions, and misguided policy decisions. While data cleaning tools can purge datasets of many errors before the data is used, applications and users interacting with the data can introduce new errors. Subsequent valid updates can obscure these errors and propagate them through the dataset causing more discrepancies. Even when some of these discrepancies are discovered, they are often corrected superficially, on a case-by-case basis, further obscuring the true underlying cause, and making detection of the remaining errors harder. In this paper, we propose QFix, a framework that derives explanations and repairs for discrepancies in relational data, by analyzing the effect of queries that operated on the data and identifying potential mistakes in those queries. QFix is flexible, handling scenarios where only a subset of the true discrepancies is known, and robust to different types of update workloads. We make four important contributions: (a) we formalize the problem of diagnosing the causes of data errors based on the queries that operated on and introduced errors to a dataset; (b) we develop exact methods for deriving diagnoses and fixes for identified errors using state-of-the-art tools; (c) we present several optimization techniques that improve our basic approach without compromising accuracy, and (d) we leverage a tradeoff between accuracy and performance to scale diagnosis to large datasets and query logs, while achieving near-optimal results. We demonstrate the effectiveness of QFix through extensive evaluation over benchmark and synthetic data.},
	Archiveprefix = {arXiv},
	Author = {Wang, Xiaolan and Meliou, Alexandra and Wu, Eugene},
	Date-Added = {2016-04-04 15:31:10 +0000},
	Date-Modified = {2016-04-04 15:31:10 +0000},
	Eprint = {1601.07539},
	Month = jan,
	Title = {{QFix}: Diagnosing Errors Through Query Histories},
	Url = {http://arxiv.org/abs/1601.07539},
	Year = {2016},
	Bdsk-Url-1 = {http://arxiv.org/abs/1601.07539}}

% Krishnan et al.: the ActiveClean arXiv preprint (arXiv:1601.03797).
% Typed as misc because there is no journal (an article entry without one
% triggers a missing-field warning); the arXiv identity is carried by
% Archiveprefix + Eprint. Month uses the standard macro, and the system name
% is brace-protected so sentence-casing styles do not turn it into "Activeclean".
@misc{Krishnan:2016aa,
	Abstract = {Data cleaning is often an important step to ensure that predictive models, such as regression and classification, are not affected by systematic errors such as inconsistent, out-of-date, or outlier data. Identifying dirty data is often a manual and iterative process, and can be challenging on large datasets. However, many data cleaning workflows can introduce subtle biases into the training processes due to violation of independence assumptions. We propose ActiveClean, a progressive cleaning approach where the model is updated incrementally instead of re-training and can guarantee accuracy on partially cleaned data. ActiveClean supports a popular class of models called convex loss models (e.g., linear regression and SVMs). ActiveClean also leverages the structure of a user's model to prioritize cleaning those records likely to affect the results. We evaluate ActiveClean on five real-world datasets UCI Adult, UCI EEG, MNIST, Dollars For Docs, and WorldBank with both real and synthetic errors. Our results suggest that our proposed optimizations can improve model accuracy by up-to 2.5x for the same amount of data cleaned. Furthermore for a fixed cleaning budget and on all real dirty datasets, ActiveClean returns more accurate models than uniform sampling and Active Learning.},
	Archiveprefix = {arXiv},
	Author = {Krishnan, Sanjay and Wang, Jiannan and Wu, Eugene and Franklin, Michael J. and Goldberg, Ken},
	Date-Added = {2016-04-04 15:31:02 +0000},
	Date-Modified = {2016-04-04 15:31:02 +0000},
	Eprint = {1601.03797},
	Month = jan,
	Title = {{ActiveClean}: Interactive Data Cleaning While Learning Convex Loss Models},
	Url = {http://arxiv.org/abs/1601.03797},
	Year = {2016},
	Bdsk-Url-1 = {http://arxiv.org/abs/1601.03797}}

% Haas et al., the Wisteria paper, Proc. VLDB Endow. 8(12), August 2015.
@article{Haas:2015:WNS:2824032.2824122,
  author        = {Haas, Daniel and Krishnan, Sanjay and Wang, Jiannan and Franklin, Michael J. and Wu, Eugene},
  title         = {Wisteria: Nurturing Scalable Data Cleaning Infrastructure},
  journal       = {Proc. VLDB Endow.},
  volume        = {8},
  number        = {12},
  pages         = {2004--2007},
  numpages      = {4},
  month         = aug,
  year          = {2015},
  issue_date    = {August 2015},
  publisher     = {VLDB Endowment},
  issn          = {2150-8097},
  doi           = {10.14778/2824032.2824122},
  acmid         = {2824122},
  url           = {http://dx.doi.org/10.14778/2824032.2824122},
  date-added    = {2016-04-04 15:29:23 +0000},
  date-modified = {2016-04-04 15:29:23 +0000},
  bdsk-url-1    = {http://dx.doi.org/10.14778/2824032.2824122},
}

% Wang et al., sample-and-clean query processing on dirty data, SIGMOD '14.
@inproceedings{Wang:2014:SFF:2588555.2610505,
  author        = {Wang, Jiannan and Krishnan, Sanjay and Franklin, Michael J. and Goldberg, Ken and Kraska, Tim and Milo, Tova},
  title         = {A Sample-and-clean Framework for Fast and Accurate Query Processing on Dirty Data},
  booktitle     = {Proceedings of the 2014 ACM SIGMOD International Conference on Management of Data},
  series        = {SIGMOD '14},
  location      = {Snowbird, Utah, USA},
  year          = {2014},
  pages         = {469--480},
  numpages      = {12},
  publisher     = {ACM},
  address       = {New York, NY, USA},
  isbn          = {978-1-4503-2376-5},
  doi           = {10.1145/2588555.2610505},
  acmid         = {2610505},
  url           = {http://doi.acm.org/10.1145/2588555.2610505},
  keywords      = {aggregate query, data cleaning, dirty data, sampling},
  date-added    = {2016-04-04 15:28:48 +0000},
  date-modified = {2016-04-04 15:28:48 +0000},
  bdsk-url-1    = {http://doi.acm.org/10.1145/2588555.2610505},
  bdsk-url-2    = {http://dx.doi.org/10.1145/2588555.2610505},
}

% Suciu, Olteanu, Re and Koch: the "Probabilistic Databases" book (2011).
% The series name had been fused into the title; it now lives in Series so
% styles can format it separately.
@book{suciu2011probabilistic,
	Author = {Suciu, Dan and Olteanu, Dan and R{\'e}, Christopher and Koch, Christoph},
	Date-Added = {2016-04-03 20:39:53 +0000},
	Date-Modified = {2016-04-03 20:40:12 +0000},
	Publisher = {Morgan \& Claypool},
	Series = {Synthesis Lectures on Data Management},
	Title = {Probabilistic Databases},
	Year = {2011}}

% Norman, "The Design of Everyday Things", revised and expanded edition (2013).
% The edition statement is moved out of the title into Edition, and the title
% and publisher are capitalised as they should be printed.
@book{norman2013design,
	Author = {Norman, Donald A.},
	Date-Added = {2016-04-03 17:47:41 +0000},
	Date-Modified = {2016-04-03 17:47:41 +0000},
	Edition = {Revised and Expanded},
	Publisher = {Basic Books},
	Title = {The Design of Everyday Things},
	Year = {2013}}

% Bendre et al., the DataSpread paper, Proc. VLDB Endow. 8(12), August 2015.
% The system name is brace-protected so sentence-casing styles keep its
% camel case instead of printing "Dataspread".
@article{Bendre:2015:DUD:2824032.2824121,
 author = {Bendre, Mangesh and Sun, Bofan and Zhang, Ding and Zhou, Xinyan and Chang, Kevin Chen-Chuan and Parameswaran, Aditya},
 title = {{DataSpread}: Unifying Databases and Spreadsheets},
 journal = {Proc. VLDB Endow.},
 issue_date = {August 2015},
 volume = {8},
 number = {12},
 month = aug,
 year = {2015},
 issn = {2150-8097},
 pages = {2000--2003},
 numpages = {4},
 url = {http://dx.doi.org/10.14778/2824032.2824121},
 doi = {10.14778/2824032.2824121},
 acmid = {2824121},
 publisher = {VLDB Endowment},
}


% Liu and Jagadish, spreadsheet algebra for direct-manipulation querying, ICDE '09.
@inproceedings{Liu:2009:SAD:1546683.1547431,
  author    = {Liu, Bin and Jagadish, H. V.},
  title     = {A Spreadsheet Algebra for a Direct Data Manipulation Query Interface},
  booktitle = {Proceedings of the 2009 IEEE International Conference on Data Engineering},
  series    = {ICDE '09},
  year      = {2009},
  pages     = {417--428},
  numpages  = {12},
  publisher = {IEEE Computer Society},
  address   = {Washington, DC, USA},
  isbn      = {978-0-7695-3545-6},
  doi       = {10.1109/ICDE.2009.34},
  acmid     = {1547431},
  url       = {http://dx.doi.org/10.1109/ICDE.2009.34},
  keywords  = {database usability, user interface, spreadsheet},
}

% Tyszkiewicz, spreadsheets as a relational database engine, SIGMOD '10.
% The url points at the public ACM DOI resolver; the exported link went
% through a university library proxy host that outside readers cannot use.
@inproceedings{Tyszkiewicz:2010:SRD:1807167.1807191,
 author = {Tyszkiewicz, Jerzy},
 title = {Spreadsheet As a Relational Database Engine},
 booktitle = {Proceedings of the 2010 ACM SIGMOD International Conference on Management of Data},
 series = {SIGMOD '10},
 year = {2010},
 isbn = {978-1-4503-0032-2},
 location = {Indianapolis, Indiana, USA},
 pages = {195--206},
 numpages = {12},
 url = {http://doi.acm.org/10.1145/1807167.1807191},
 doi = {10.1145/1807167.1807191},
 acmid = {1807191},
 publisher = {ACM},
 address = {New York, NY, USA},
 keywords = {performance, relational algebra, relational databases, spreadsheets, sql},
}


% Witkowski et al., "Query by Excel", VLDB '05.
% The url points at the public ACM Digital Library host; the exported link
% went through a university library proxy host. The product name is
% brace-protected so sentence-casing styles do not lowercase it.
@inproceedings{Witkowski:2005:QE:1083592.1083733,
 author = {Witkowski, Andrew and Bellamkonda, Srikanth and Bozkaya, Tolga and Naimat, Aman and Sheng, Lei and Subramanian, Sankar and Waingold, Allison},
 title = {Query by {Excel}},
 booktitle = {Proceedings of the 31st International Conference on Very Large Data Bases},
 series = {VLDB '05},
 year = {2005},
 isbn = {1-59593-154-6},
 location = {Trondheim, Norway},
 pages = {1204--1215},
 numpages = {12},
 url = {http://dl.acm.org/citation.cfm?id=1083592.1083733},
 acmid = {1083733},
 publisher = {VLDB Endowment},
}

% Witkowski et al., "Spreadsheets in RDBMS for OLAP", SIGMOD '03.
% The url points at the public ACM DOI resolver instead of a university
% library proxy host, and the acronyms in the title are brace-protected so
% sentence-casing styles do not lowercase them.
% NOTE(review): the export spelled the seventh author "Shen, Lei"; changed to
% "Sheng, Lei" to agree with the Query by Excel entry in this file -- confirm
% against the paper.
@inproceedings{Witkowski:2003:SRO:872757.872767,
 author = {Witkowski, Andrew and Bellamkonda, Srikanth and Bozkaya, Tolga and Dorman, Gregory and Folkert, Nathan and Gupta, Abhinav and Sheng, Lei and Subramanian, Sankar},
 title = {Spreadsheets in {RDBMS} for {OLAP}},
 booktitle = {Proceedings of the 2003 ACM SIGMOD International Conference on Management of Data},
 series = {SIGMOD '03},
 year = {2003},
 isbn = {1-58113-634-X},
 location = {San Diego, California},
 pages = {52--63},
 numpages = {12},
 url = {http://doi.acm.org/10.1145/872757.872767},
 doi = {10.1145/872757.872767},
 acmid = {872767},
 publisher = {ACM},
 address = {New York, NY, USA},
}