% Bibliography database: provenance capture for scripts, computational
% notebooks and data-science pipelines, plus a few background references.
%
% Conventions used in this file:
%   - one field per line, lowercase field names, aligned equals signs;
%   - every citation key is defined exactly once;
%   - words in a title that must keep their capitalisation (tool names,
%     language names, acronyms) are wrapped in braces as whole words;
%   - LaTeX special characters inside field values are escaped.

@inproceedings{chapman-21-fphdsdp,
  author    = {Adriane Chapman and Paolo Missier and Giulia Simonelli and Riccardo Torlone},
  title     = {Fine-grained Provenance for High-quality Data Science (Discussion Paper)},
  booktitle = {Proceedings of the 29th Italian Symposium on Advanced Database Systems, SEBD 2021, Pizzo Calabro (VV), Italy, September 5-9, 2021},
  pages     = {411--418},
  year      = {2021},
}

@article{chapman-20-cqfgppp,
  author  = {Adriane Chapman and Paolo Missier and Giulia Simonelli and Riccardo Torlone},
  title   = {Capturing and Querying Fine-Grained Provenance of Preprocessing Pipelines in Data Science},
  journal = {PVLDB},
  volume  = {14},
  number  = {4},
  pages   = {507--520},
  year    = {2020},
}

@article{lerner-18-usintcpr,
  author  = {Barbara Lerner and Emery R. Boose and Luis Perez},
  title   = {Using Introspection To Collect Provenance in {R}},
  journal = {Informatics},
  volume  = {5},
  number  = {1},
  pages   = {12},
  year    = {2018},
}

@inproceedings{brachmann:2020:cidr:your,
  author    = {Brachmann, Michael and Spoth, William and Kennedy, Oliver and Glavic, Boris and Mueller, Heiko and Castelo, Sonia and Bautista, Carlos and Freire, Juliana},
  title     = {Your notebook is not crumby enough, {REPLace} it},
  booktitle = {CIDR},
  year      = {2020},
}

@inproceedings{brachmann:2019:sigmod:data,
  author    = {Brachmann, Mike and Bautista, Carlos and Castelo, Sonia and Feng, Su and Freire, Juliana and Glavic, Boris and Kennedy, Oliver and Mueller, Heiko and Rampin, Remi and Spoth, William and Yang, Ying},
  title     = {Data Debugging and Exploration with {Vizier}},
  booktitle = {SIGMOD-Demo},
  year      = {2019},
}

@inproceedings{CW17a,
  author    = {Carvalho, Lucas AMC and Wang, Regina and Gil, Yolanda and Garijo, Daniel},
  title     = {{NiW}: Converting Notebooks into Workflows to Capture Dataflow and Provenance},
  booktitle = {K-CAP Workshops},
  pages     = {12--16},
  year      = {2017},
}

@incollection{chapman-21-rphj,
  author    = {Chapman, Adriane and Sasikant, Abhirami and Simonelli, Giulia and Missier, Paolo and Torlone, Riccardo},
  title     = {The Right (Provenance) Hammer for the Job: A Comparison of Data Provenance Instrumentation},
  booktitle = {Provenance in Data Science},
  pages     = {25--45},
  year      = {2021},
}

@inproceedings{silles-10-pawr,
  author    = {Chris A. Silles and Andrew R. Runnalls},
  title     = {Provenance-Awareness in {R}},
  booktitle = {IPAW},
  series    = {Lecture Notes in Computer Science},
  volume    = {6378},
  pages     = {64--72},
  year      = {2010},
}

@inproceedings{koop-21-nar,
  author    = {David Koop},
  title     = {Notebook Archaeology: Inferring Provenance from Computational Notebooks},
  booktitle = {IPAW},
  pages     = {109--126},
  year      = {2021},
}

@inproceedings{KP17a,
  author    = {David Koop and Jay Patel},
  title     = {Dataflow Notebooks: Encoding and Tracking Dependencies of Cells},
  booktitle = {TaPP},
  year      = {2017},
}

@inproceedings{DBLP:conf/ipaw/PimentelFMB16,
  author    = {João Felipe Pimentel and Juliana Freire and Leonardo Murta and Vanessa Braganholo},
  title     = {Fine-Grained Provenance Collection over Scripts Through Program Slicing},
  booktitle = {IPAW},
  series    = {Lecture Notes in Computer Science},
  volume    = {9672},
  pages     = {199--203},
  year      = {2016},
}

@article{DBLP:journals/ese/PimentelMBF21,
  author  = {João Felipe Pimentel and Leonardo Murta and Vanessa Braganholo and Juliana Freire},
  title   = {Understanding and improving the quality and reproducibility of {Jupyter} notebooks},
  journal = {Empir. Softw. Eng.},
  volume  = {26},
  number  = {4},
  pages   = {65},
  year    = {2021},
}

@inproceedings{DBLP:conf/tapp/PimentelBMF15,
  author    = {João Felipe Pimentel and Vanessa Braganholo and Leonardo Murta and Juliana Freire},
  title     = {Collecting and Analyzing Provenance on Interactive Notebooks: When {IPython} Meets {noWorkflow}},
  booktitle = {TaPP},
  year      = {2015},
}

@article{pimentel-19-scmanpfs,
  author  = {João Felipe Pimentel and Juliana Freire and Leonardo Murta and Vanessa Braganholo},
  title   = {A Survey on Collecting, Managing, and Analyzing Provenance From Scripts},
  journal = {ACM Comput. Surv.},
  volume  = {52},
  number  = {3},
  pages   = {47:1--47:38},
  year    = {2019},
}

@article{pimentel-17-n,
  author  = {João Felipe Pimentel and Leonardo Murta and Vanessa Braganholo and Juliana Freire},
  title   = {{noWorkflow}: a Tool for Collecting, Analyzing, and Managing Provenance From {Python} Scripts},
  journal = {PVLDB},
  volume  = {10},
  number  = {12},
  pages   = {1841--1844},
  year    = {2017},
}

@article{rupprecht-20-imrdsptt,
  author  = {Lukas Rupprecht and James C. Davis and Constantine Arnold and Yaniv Gur and Deepavali Bhagwat},
  title   = {Improving Reproducibility of Data Science Pipelines Through Transparent Provenance Capture},
  journal = {PVLDB},
  volume  = {13},
  number  = {12},
  pages   = {3354--3368},
  year    = {2020},
}

@inproceedings{namaki-20-v,
  author    = {Mohammad Hossein Namaki and Avrilia Floratou and Fotis Psallidas and Subru Krishnan and Ashvin Agrawal and Yinghui Wu and Yiwen Zhu and Markus Weimer},
  title     = {Vamsa: Automated Provenance Tracking in Data Science Scripts},
  booktitle = {SIGKDD},
  pages     = {1542--1551},
  year      = {2020},
}

@inproceedings{samuel-18-p,
  author    = {Sheeba Samuel and Birgitta König-Ries},
  title     = {{ProvBook}: Provenance-based Semantic Enrichment of Interactive Notebooks for Reproducibility},
  booktitle = {Proceedings of the ISWC 2018 Posters \& Demonstrations, Industry and Blue Sky Ideas Tracks co-located with 17th International Semantic Web Conference (ISWC 2018), Monterey, USA, October 8th - to - 12th, 2018},
  year      = {2018},
}

@article{macke-21-fglsnin,
  author  = {Stephen Macke and Aditya G. Parameswaran and Hongpu Gong and Doris Jung Lin Lee and Doris Xin and Andrew Head},
  title   = {Fine-Grained Lineage for Safer Notebook Interactions},
  journal = {PVLDB},
  volume  = {14},
  number  = {6},
  pages   = {1093--1101},
  year    = {2021},
}

@inproceedings{PG18,
  author    = {Tomas Petricek and James Geddes and Charles A. Sutton},
  title     = {Wrattler: Reproducible, live and polyglot notebooks},
  booktitle = {TaPP},
  year      = {2018},
}

@article{silva-18-d,
  author  = {Vítor Silva and Daniel de Oliveira and Marta Mattoso and Patrick Valduriez},
  title   = {Dfanalyzer: Runtime Dataflow Analysis of Scientific Applications Using Provenance},
  journal = {PVLDB},
  volume  = {11},
  number  = {12},
  pages   = {2082--2085},
  year    = {2018},
}

% NOTE(review): DC07 pairs volume 32 with year 2007, which looks inconsistent
% for this journal -- confirm volume and year against the publisher's record.
@article{DC07,
  author  = {Davidson, Susan B. and Cohen-Boulakia, Sarah and Eyal, Anat and Ludäscher, Bertram and McPhillips, Timothy and Bowers, Shawn and Freire, Juliana},
  title   = {Provenance in Scientific Workflow Systems},
  journal = {IEEE Data Eng. Bull.},
  volume  = {32},
  number  = {4},
  pages   = {44--50},
  year    = {2007},
}

@incollection{MB14a,
  author    = {Murta, Leonardo and Braganholo, Vanessa and Chirigati, Fernando and Koop, David and Freire, Juliana},
  title     = {{noWorkflow}: Capturing and analyzing provenance of scripts},
  booktitle = {Provenance and Annotation of Data and Processes},
  pages     = {71--83},
  year      = {2014},
}

@book{NN99,
  author        = {Nielson, F. and Nielson, H. R. and Hankin, C.},
  title         = {Principles of Program Analysis},
  publisher     = {Springer},
  year          = {1999},
  date-added    = {2008-03-18 11:52:57 +0100},
  date-modified = {2013-08-21 01:11:11 +0000},
}

@article{DBLP:journals/tse/Weiser84,
  author  = {Mark D. Weiser},
  title   = {Program Slicing},
  journal = {{IEEE} Trans. Software Eng.},
  volume  = {10},
  number  = {4},
  pages   = {352--357},
  year    = {1984},
}

% Optional fields: subtitle, titleaddon, language, howpublished, type, version, note, organization, location, date, month, year, addendum, pubstate, doi, eprint, eprintclass, eprinttype, url, urldate
@misc{papermill,
  author       = {Netflix},
  title        = {Papermill},
  howpublished = {https://github.com/nteract/papermill},
}

@incollection{FS06,
  author    = {Freire, Juliana and Silva, Cláudio T and Callahan, Steven P and Santos, Emanuele and Scheidegger, Carlos E and Vo, Huy T},
  title     = {Managing rapidly-evolving scientific workflows},
  booktitle = {Provenance and Annotation of Data},
  pages     = {10--18},
  year      = {2006},
}

@misc{joelgrus,
  author       = {Joel Grus},
  title        = {I don't like notebooks},
  howpublished = {https://www.youtube.com/watch?v=7jiPeIFXb6U},
  year         = {2018},
}

@misc{nodebook,
  author       = {Kevin Zielnicki and Juan Nunez-Iglesias},
  title        = {Nodebook},
  howpublished = {https://github.com/stitchfix/nodebook},
  year         = {2018},
}

@book{WV02,
  author        = {Weikum, G. and Vossen, G.},
  title         = {Transactional Information Systems: Theory, Algorithms, and the Practice of Concurrency Control and Recovery},
  publisher     = {Morgan Kaufmann},
  year          = {2002},
  keywords      = {transactions},
  date-added    = {2008-04-08 16:42:46 +0200},
  date-modified = {2013-08-21 01:11:11 +0000},
}