diff --git a/build.js b/build.js index 0f7e4b9e..e3d10d72 100644 --- a/build.js +++ b/build.js @@ -111,12 +111,22 @@ var metalsmith = Metalsmith(__dirname) })) ) // Render HTML - .use(branch() - .pattern('**/*.html') + .use(branch('**/*.html') + // Render LaTeX inlined into the HTML .use(renderTeX()) + // Assign a mimir-specific layout to the mimir subdirectory + .use(branch('research/mimir/*.html') + .use(function(files, smith, done) { + for(i in files){ files[i].layout = 'mimir.hbs' } + done(); + }) + ) + // Render outer layouts .use(layouts({ engine: "handlebars", - default: "default.hbs" + default: "default.hbs", + directory: "layouts", + partials: "partials" })) ) // Validate diff --git a/layouts/default.hbs b/layouts/default.hbs index 1692d9cd..a14f0049 100644 --- a/layouts/default.hbs +++ b/layouts/default.hbs @@ -1,58 +1,45 @@ - - -
- -+A lot of analytics is based on information that starts off incomplete, is inconsistent, or is simply used incorrectly. Although people find ways of coping with these sources of uncertainty, those ways usually require lots of pain, effort and suffering before the data can be used, even when using automation. +
+Mimir is a database wrapper that helps you to embrace uncertainty rather than trying to fight it. Mimir attaches to a database of your choice using JDBC and provides a suite of lightweight, easy-to-use data cleaning and data analysis tools. +
+Unlike other automated data cleaning systems, Mimir doesn't claim that it will clean your data correctly. Instead, whenever you query data cleaned by Mimir, Mimir helps you to understand the choices it had to make, how they could impact your query results, and how confident it is in those results. +
+If you want more reliable results, no problem! Mimir streamlines the process of manual curation, focusing you on those parts of the data that need it most. +
+ +Documentation will be posted soon.
+ ++Curating data, or making sure that it is correct, consistent, and complete, +can be very slow and expensive. Most of this effort is often wasted, since +only a small portion of the curated data will ever be relevant to analysts +using it. Unfortunately, without basing an analysis on trustworthy, curated +data, it's currently foolish to trust the analysis' results. Our +on-demand certainty effort links query results to potential sources of +uncertainty that could affect them using a provenance model called Virtual +C-Tables. Seeing the impact of uncertainty can help analysts to evaluate +the quality and trustworthiness of those results. +
+ ++Mimir is built around a probabilistic database system. Classical +deterministic databases assume that all of your data is fixed: Every +cell has exactly one value, and every table has a fixed set of rows in it. +Probabilistic databases instead track multiple possibilities: for example, +the results of OCR software parsing a glyph as being either a 4 or a 9. +That could be useful, but no one really wants to move their data to an +entirely new database system. We're exploring ways to enable probabilistic +database functionality within existing deterministic database engines, +allowing legacy database applications to transparently co-exist with +probability-aware applications. +
+ ++Quantitative metrics like standard deviations and probabilities help to +measure how reliable query results are, but don't really provide a good +sense of why the results aren't reliable or what can be done to fix them. +Mimir can provide users with a list of explanations of why a particular +result is uncertain, and rank that list in order of relevance. We are +exploring what contextual cues make an explanation relevant, and ways of +efficiently ranking explanations in bulk. +
diff --git a/src/research/mimir/index.md b/src/research/mimir/index.md deleted file mode 100644 index ab65b7c3..00000000 --- a/src/research/mimir/index.md +++ /dev/null @@ -1,72 +0,0 @@ ---- -title: Mimir ---- - - -__Students:__ {{list projectStudents.mimir}} - -_(Mimir is supported by gifts from Oracle University Relations, and is being developed in collaboration with Ronny Fehling, Dieter Gawlick, Zhen Hua Liu, Boris Glavic, and Jan Chomicki)_ - - - -Many analytics tasks are based on information that is initially incomplete, inconsistent, or simply used incorrectly. Although a variety of strategies exist to help people cope with these sources of uncertainty, they often require users to undertake heavyweight upfront organizationational tasks (i.e., tagging, data-cleaning, or modeling) before the data can be used. Automated techniques exist, but typically introduce their own forms of uncertainty. - -Mimir takes a step back and accepts that uncertainty is a fact of life. Rather than trying to fight it, Mimir embraces uncertainty, and helps users to understand it better. Combining automated data cleaning and data analysis techniques, Mimir's goal is to help users clean and query uncertain data, and to understand the impact of that uncertainty on the results of their analyses. - ------- -## Active Research Efforts - -### On-Demand Data Certainty -Curating data, or making sure that it is correct, consistent, and complete -can be very slow and expensive. Most of this effort is often wasted, since -only a small portion of the curated data will ever be relevant to analysts -using it. Unfortunately, without basing an analysis on trustworthy, curated -data, it's currently foolish to trust the analysis' results. Our -on-demand certainty effort links query results to potential sources of -uncertainty that could affect them using a provenance model called Virtual -C-Tables. 
Seeing the impact of uncertainty can help analysts to evaluate -the quality and trustworthiness of those results. - -### Transparent Probabilistic Databases -Mimir is built around a probabilistic database system. Classical -deterministic databases assume that all of your data is fixed: Every -cell has exactly one value, and every table has a fixed set of rows in it. -Probabilistic databases instead track multiple possibilities: for example -the results of OCR software parsing a glyph as being either a 4 or a 9. -That could be useful, but no one really wants to move their data to an -entirely new database system. We're exploring ways to enable probabilistic -database functionality within existing deterministic database engines, -allowing legacy database applications to transparently co-exist with -probability-aware applications. - -### Sensitivity Analysis -Quantitative metrics like standard deviations and probabilities help to -measure how reliable query results are, but don't really provide a good -sense of why the results aren't reliable or what can be done to fix them. -Mimir can provide users with a list of explanations of why a particular -result is uncertain, and rank that list in order of relevance. We are -exploring what contextual cues make an explanation relevant, and ways of -efficiently ranking explanations in bulk. - -{{! -### Consistent Query Semantics -Minor differences in data semantics can easily combine to produce subtle errors in the correctness of a query. For example, when a table listing historical orders is joined with a table of current currency conversions, the result may be inaccurate (depending on what the user's intent is): The exchange rate listed will be valid as of today, and not when the order was placed. Unfortunately, detecting these errors is difficult, as it is not generally possible to gauge user intent, or to ask users to provide such fine-grained semantic information about data. 
Using a combination of natural language processing, and usage modeling, we instead seek to answer a simpler, though closely related question: "Will the answer to my query be the same if I ask it tomorrow?" -}} - ------- - -## Software -