diff --git a/lib/gemsmith.rb b/lib/gemsmith.rb
index 2ddfad64..54a713bf 100644
--- a/lib/gemsmith.rb
+++ b/lib/gemsmith.rb
@@ -104,7 +104,7 @@ module GemSmith
target_dirs.push out_path
task "#{site_name}_static" => target_dirs do
static_assets.each { |in_path, out_in_dir|
- system("rsync -a #{in_path} #{File.join(out_path, out_in_dir)}")
+ system("rsync -a #{in_path} #{File.join(out_path, File.basename(out_in_dir))}")
}
end
end
diff --git a/slides/talks/2016-3-NYU-Mimir/graphics/BI-Analyst.jpg b/slides/talks/2016-3-NYU-Mimir/graphics/BI-Analyst.jpg
new file mode 100644
index 00000000..45d77c69
Binary files /dev/null and b/slides/talks/2016-3-NYU-Mimir/graphics/BI-Analyst.jpg differ
diff --git a/slides/talks/2016-3-NYU-Mimir/graphics/Calendar_Base.png b/slides/talks/2016-3-NYU-Mimir/graphics/Calendar_Base.png
new file mode 100644
index 00000000..9e885986
Binary files /dev/null and b/slides/talks/2016-3-NYU-Mimir/graphics/Calendar_Base.png differ
diff --git a/slides/talks/2016-3-NYU-Mimir/graphics/Calendar_Explain.png b/slides/talks/2016-3-NYU-Mimir/graphics/Calendar_Explain.png
new file mode 100644
index 00000000..4097cc0c
Binary files /dev/null and b/slides/talks/2016-3-NYU-Mimir/graphics/Calendar_Explain.png differ
diff --git a/slides/talks/2016-3-NYU-Mimir/graphics/FullText-black.png b/slides/talks/2016-3-NYU-Mimir/graphics/FullText-black.png
new file mode 100644
index 00000000..a3153259
Binary files /dev/null and b/slides/talks/2016-3-NYU-Mimir/graphics/FullText-black.png differ
diff --git a/slides/talks/2016-3-NYU-Mimir/graphics/FullText-white.png b/slides/talks/2016-3-NYU-Mimir/graphics/FullText-white.png
new file mode 100644
index 00000000..b3f42b46
Binary files /dev/null and b/slides/talks/2016-3-NYU-Mimir/graphics/FullText-white.png differ
diff --git a/slides/talks/2016-3-NYU-Mimir/graphics/MSImageID.png b/slides/talks/2016-3-NYU-Mimir/graphics/MSImageID.png
new file mode 100644
index 00000000..8d4547fe
Binary files /dev/null and b/slides/talks/2016-3-NYU-Mimir/graphics/MSImageID.png differ
diff --git a/slides/talks/2016-3-NYU-Mimir/graphics/UIExample.png b/slides/talks/2016-3-NYU-Mimir/graphics/UIExample.png
new file mode 100644
index 00000000..e6300fa3
Binary files /dev/null and b/slides/talks/2016-3-NYU-Mimir/graphics/UIExample.png differ
diff --git a/slides/talks/2016-3-NYU-Mimir/graphics/User-Icon-Remix-by-Merlin2525-800px.png b/slides/talks/2016-3-NYU-Mimir/graphics/User-Icon-Remix-by-Merlin2525-800px.png
new file mode 100644
index 00000000..8d798b00
Binary files /dev/null and b/slides/talks/2016-3-NYU-Mimir/graphics/User-Icon-Remix-by-Merlin2525-800px.png differ
diff --git a/slides/talks/2016-3-NYU-Mimir/graphics/azure-data-lake.png b/slides/talks/2016-3-NYU-Mimir/graphics/azure-data-lake.png
new file mode 100644
index 00000000..f55d3221
Binary files /dev/null and b/slides/talks/2016-3-NYU-Mimir/graphics/azure-data-lake.png differ
diff --git a/slides/talks/2016-3-NYU-Mimir/graphics/credit_entropy.pdf b/slides/talks/2016-3-NYU-Mimir/graphics/credit_entropy.pdf
new file mode 100644
index 00000000..766bd274
Binary files /dev/null and b/slides/talks/2016-3-NYU-Mimir/graphics/credit_entropy.pdf differ
diff --git a/slides/talks/2016-3-NYU-Mimir/graphics/credit_entropy.png b/slides/talks/2016-3-NYU-Mimir/graphics/credit_entropy.png
new file mode 100644
index 00000000..f0e0184f
Binary files /dev/null and b/slides/talks/2016-3-NYU-Mimir/graphics/credit_entropy.png differ
diff --git a/slides/talks/2016-3-NYU-Mimir/graphics/crystalball-800px.png b/slides/talks/2016-3-NYU-Mimir/graphics/crystalball-800px.png
new file mode 100644
index 00000000..4a4624cc
Binary files /dev/null and b/slides/talks/2016-3-NYU-Mimir/graphics/crystalball-800px.png differ
diff --git a/slides/talks/2016-3-NYU-Mimir/graphics/dagobert83-female-user-icon-800px.png b/slides/talks/2016-3-NYU-Mimir/graphics/dagobert83-female-user-icon-800px.png
new file mode 100644
index 00000000..e3c69b0a
Binary files /dev/null and b/slides/talks/2016-3-NYU-Mimir/graphics/dagobert83-female-user-icon-800px.png differ
diff --git a/slides/talks/2016-3-NYU-Mimir/graphics/data-lake-to-data-swamp.jpg b/slides/talks/2016-3-NYU-Mimir/graphics/data-lake-to-data-swamp.jpg
new file mode 100644
index 00000000..94f43b48
Binary files /dev/null and b/slides/talks/2016-3-NYU-Mimir/graphics/data-lake-to-data-swamp.jpg differ
diff --git a/slides/talks/2016-3-NYU-Mimir/graphics/database-server-800px.png b/slides/talks/2016-3-NYU-Mimir/graphics/database-server-800px.png
new file mode 100644
index 00000000..574dbffd
Binary files /dev/null and b/slides/talks/2016-3-NYU-Mimir/graphics/database-server-800px.png differ
diff --git a/slides/talks/2016-3-NYU-Mimir/graphics/db.svg b/slides/talks/2016-3-NYU-Mimir/graphics/db.svg
new file mode 100644
index 00000000..974e0cba
--- /dev/null
+++ b/slides/talks/2016-3-NYU-Mimir/graphics/db.svg
@@ -0,0 +1,330 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ image/svg+xml
+
+
+
+
+ Openclipart
+
+
+ database symbol
+ 2010-11-08T22:08:43
+ database symbol in metallic style
+ https://openclipart.org/detail/94723/database-symbol-by-rg1024
+
+
+ rg1024
+
+
+
+
+ database
+ server
+ symbol
+
+
+
+
+
+
+
+
+
+
+
diff --git a/slides/talks/2016-3-NYU-Mimir/graphics/ericlemerdy-Server-1-800px.png b/slides/talks/2016-3-NYU-Mimir/graphics/ericlemerdy-Server-1-800px.png
new file mode 100644
index 00000000..ae8fca7c
Binary files /dev/null and b/slides/talks/2016-3-NYU-Mimir/graphics/ericlemerdy-Server-1-800px.png differ
diff --git a/slides/talks/2016-3-NYU-Mimir/graphics/iu.jpeg b/slides/talks/2016-3-NYU-Mimir/graphics/iu.jpeg
new file mode 100644
index 00000000..40d11ed8
Binary files /dev/null and b/slides/talks/2016-3-NYU-Mimir/graphics/iu.jpeg differ
diff --git a/slides/talks/2016-3-NYU-Mimir/graphics/jean-victor-balin-icon-table.svg b/slides/talks/2016-3-NYU-Mimir/graphics/jean-victor-balin-icon-table.svg
new file mode 100644
index 00000000..2d4e84e1
--- /dev/null
+++ b/slides/talks/2016-3-NYU-Mimir/graphics/jean-victor-balin-icon-table.svg
@@ -0,0 +1,482 @@
+
+
+
+ icon_table
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ image/svg+xml
+
+
+
+
+ Openclipart
+
+
+ icon_table
+ 2010-01-29T14:02:11
+
+ https://openclipart.org/detail/29121/icon_table-by-jean_victor_balin
+
+
+ jean_victor_balin
+
+
+
+
+ calc
+ icon
+ table
+ unchecked
+
+
+
+
+
+
+
+
+
+
+
diff --git a/slides/talks/2016-3-NYU-Mimir/graphics/list-add-800px.png b/slides/talks/2016-3-NYU-Mimir/graphics/list-add-800px.png
new file mode 100644
index 00000000..8dbb6fb0
Binary files /dev/null and b/slides/talks/2016-3-NYU-Mimir/graphics/list-add-800px.png differ
diff --git a/slides/talks/2016-3-NYU-Mimir/graphics/littlestorefront-800px.png b/slides/talks/2016-3-NYU-Mimir/graphics/littlestorefront-800px.png
new file mode 100644
index 00000000..b4d5ace2
Binary files /dev/null and b/slides/talks/2016-3-NYU-Mimir/graphics/littlestorefront-800px.png differ
diff --git a/slides/talks/2016-3-NYU-Mimir/graphics/matt-icons_text-x-log-300px.png b/slides/talks/2016-3-NYU-Mimir/graphics/matt-icons_text-x-log-300px.png
new file mode 100644
index 00000000..a61f729e
Binary files /dev/null and b/slides/talks/2016-3-NYU-Mimir/graphics/matt-icons_text-x-log-300px.png differ
diff --git a/slides/talks/2016-3-NYU-Mimir/graphics/maybe-address.png b/slides/talks/2016-3-NYU-Mimir/graphics/maybe-address.png
new file mode 100644
index 00000000..0f68aee3
Binary files /dev/null and b/slides/talks/2016-3-NYU-Mimir/graphics/maybe-address.png differ
diff --git a/slides/talks/2016-3-NYU-Mimir/graphics/maybe-detail.png b/slides/talks/2016-3-NYU-Mimir/graphics/maybe-detail.png
new file mode 100644
index 00000000..da73143a
Binary files /dev/null and b/slides/talks/2016-3-NYU-Mimir/graphics/maybe-detail.png differ
diff --git a/slides/talks/2016-3-NYU-Mimir/graphics/maybe-screen.png b/slides/talks/2016-3-NYU-Mimir/graphics/maybe-screen.png
new file mode 100644
index 00000000..d54fc386
Binary files /dev/null and b/slides/talks/2016-3-NYU-Mimir/graphics/maybe-screen.png differ
diff --git a/slides/talks/2016-3-NYU-Mimir/graphics/mimir_logo_final.png b/slides/talks/2016-3-NYU-Mimir/graphics/mimir_logo_final.png
new file mode 100644
index 00000000..fe23d4e0
Binary files /dev/null and b/slides/talks/2016-3-NYU-Mimir/graphics/mimir_logo_final.png differ
diff --git a/slides/talks/2016-3-NYU-Mimir/graphics/performance-dbx1g.png b/slides/talks/2016-3-NYU-Mimir/graphics/performance-dbx1g.png
new file mode 100644
index 00000000..d3fcb643
Binary files /dev/null and b/slides/talks/2016-3-NYU-Mimir/graphics/performance-dbx1g.png differ
diff --git a/slides/talks/2016-3-NYU-Mimir/graphics/performance-sqlite1g.png b/slides/talks/2016-3-NYU-Mimir/graphics/performance-sqlite1g.png
new file mode 100644
index 00000000..647058d2
Binary files /dev/null and b/slides/talks/2016-3-NYU-Mimir/graphics/performance-sqlite1g.png differ
diff --git a/slides/talks/2016-3-NYU-Mimir/graphics/primary-queries.svg b/slides/talks/2016-3-NYU-Mimir/graphics/primary-queries.svg
new file mode 100644
index 00000000..5006a888
--- /dev/null
+++ b/slides/talks/2016-3-NYU-Mimir/graphics/primary-queries.svg
@@ -0,0 +1,113 @@
+
+
+
+
+
+
+
+
+
+
+
+
+ image/svg+xml
+
+
+
+
+ Openclipart
+
+
+
+
+
+
+
+
+
+
+
diff --git a/slides/talks/2016-3-NYU-Mimir/graphics/product_entropy.pdf b/slides/talks/2016-3-NYU-Mimir/graphics/product_entropy.pdf
new file mode 100644
index 00000000..37baf94d
Binary files /dev/null and b/slides/talks/2016-3-NYU-Mimir/graphics/product_entropy.pdf differ
diff --git a/slides/talks/2016-3-NYU-Mimir/graphics/product_entropy.png b/slides/talks/2016-3-NYU-Mimir/graphics/product_entropy.png
new file mode 100644
index 00000000..15c77940
Binary files /dev/null and b/slides/talks/2016-3-NYU-Mimir/graphics/product_entropy.png differ
diff --git a/slides/talks/2016-3-NYU-Mimir/graphics/realestate_entropy.pdf b/slides/talks/2016-3-NYU-Mimir/graphics/realestate_entropy.pdf
new file mode 100644
index 00000000..4f1bfd76
Binary files /dev/null and b/slides/talks/2016-3-NYU-Mimir/graphics/realestate_entropy.pdf differ
diff --git a/slides/talks/2016-3-NYU-Mimir/graphics/saco-800px.png b/slides/talks/2016-3-NYU-Mimir/graphics/saco-800px.png
new file mode 100644
index 00000000..56f10287
Binary files /dev/null and b/slides/talks/2016-3-NYU-Mimir/graphics/saco-800px.png differ
diff --git a/slides/talks/2016-3-NYU-Mimir/graphics/test.svg b/slides/talks/2016-3-NYU-Mimir/graphics/test.svg
new file mode 100644
index 00000000..2bb6dccd
--- /dev/null
+++ b/slides/talks/2016-3-NYU-Mimir/graphics/test.svg
@@ -0,0 +1,4 @@
+
+ Foo
+
+
\ No newline at end of file
diff --git a/slides/talks/2016-3-NYU-Mimir/graphics/weka.png b/slides/talks/2016-3-NYU-Mimir/graphics/weka.png
new file mode 100644
index 00000000..37a1188a
Binary files /dev/null and b/slides/talks/2016-3-NYU-Mimir/graphics/weka.png differ
diff --git a/slides/talks/2016-3-NYU-Mimir/index.html b/slides/talks/2016-3-NYU-Mimir/index.html
new file mode 100644
index 00000000..784b52f8
--- /dev/null
+++ b/slides/talks/2016-3-NYU-Mimir/index.html
@@ -0,0 +1,1090 @@
+
+
+
+
+
+
+ Embracing Uncertainty
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Embracing uncertainty with
+
+
+
+
+ Joint work with:
+
+ Ying Yang, Niccolo Meneghetti, Arindam Nandi,
+ Vinayak Karuppasamy (now at Bloomberg), Ronny Fehling (Airbus),
+ Zhen-Hua Liu (Oracle), Dieter Gawlick (Oracle),
+ Boris Glavic (IIT), Wolfgang Gatterbauer (CMU)
+ (and soon Juliana Freire (NYU))
+
+
+
+
+
+
+ A Big Data Fairy Tale
+
+
+
+
+ Meet Alice
+
+ (OpenClipArt.org)
+
+
+
+
+
+ Alice has a Store
+
+ (OpenClipArt.org)
+
+
+
+
+ →
+
+ Alice's store collects sales data
+
+ (OpenClipArt.org)
+
+
+
+
+ +
+
+ =
+
+ Alice wants to use her sales data to run a promotion
+
+ (OpenClipArt.org)
+
+
+
+
+ →
+
+ So Alice loads up her sales data in her trusty database/hadoop/spark/etc... server.
+
+ (OpenClipArt.org)
+
+
+
+
+ + ?
+ ... asks her question ...
+
+ (OpenClipArt.org)
+
+
+
+
+ + ? →
+
+ ... and basks in the limitless possibilities of big data.
+
+ (OpenClipArt.org)
+
+
+
+
+
+ Why is this a fairy tale?
+
+
+
+
+ →
+
+ It's never this easy...
+
+
+
+
+
+ CSV Import
+ Run a SELECT
on a raw CSV File
+
+ File may not have column headers
+ CSV does not provide "types"
+ Lines may be missing fields
+ Fields may be mistyped (typo, missing comma)
+ Comment text can be inlined into the file
+
+
+ State of the art : External Table Defn + "Manually" edit CSV
+
+
+
+
+ Merge Two Datasets
+ UNION
two data sources
+
+ Schema matching
+ Deduplication
+ Format alignment (GIS coordinates, $ vs €)
+ Precision alignment (State vs County)
+
+
+ State of the art : Manually map schema
+
+
+
+
+ JSON Shredding
+ Run a SELECT
on JSON or a Doc Store
+
+ Separating fields and record sets: (e.g., { A: "Bob", B: "Alice" }
)
+ Missing fields (Records with no 'address')
+ Type alignment (Records with 'address' as an array)
+ Schema matching$^2$
+
+
+ State of the art : DataGuide, Wrangler, etc...
+
+
+
+
+
+
+
+
+
+ Structure is hard!
+
+ Structured models (RelDBs) force curation during loading.
+ Problem: All curation costs are upfront.
+
+ Unstructured models (NoSQL) force curation into queries.
+ Problem: Complexity/redundancy blowup in queries.
+
+
+ Add structure, curation effort On-Demand
+
+
+
+ But... you still need some sort of structure?!?
+
+ Let the database make a guess!
+
+
+
+
+ In the name of Codd, thou shalt not give the user a wrong answer.
+
+
+
+ ... but what if we did?
+
+
+ What would it take for that to be ok?
+
+
+
+
+
+
+
+ Mimir
+
+ Lenses : Generic, best-guess data curation operators.
+
+ Explanations : How certain is my data?
+
+ Provenance : What issues still need to be fixed?
+
+
+
+
+
+ Lenses
+ Here's a problem with my data. Fix it.
+
+ What type is this column? (majority vote)
+ How do the columns of these relations line up? (pick your favorite schema matching paper)
+ How do I query heterogeneous JSON objects? (see above)
+ What should these missing values be? (learning-based interpolation)
+
+
+
+
+
+
+ View:
+
+ SELECT
+
+
+
+
+
+ Lens:
+
+ SELECT
+
+
+
+ [ ]
+
+
+
+
+
+
+ (best guess)
+
+
+
+
+
+
+ Lenses introduce uncertainty
+ (OpenClipArt.org)
+
+
+ The User's View
+
+ SELECT NAME, DEPARTMENT FROM PRODUCTS;
+
+
+ Name Department
+ Apple 6s, White Phone
+ Dell, Intel 4 core Computer
+ HP, AMD 2 core Computer
+ ... ...
+
+ Simple UI: Highlight values that are based on guesses.
+
+
+
+
+ SELECT NAME, DEPARTMENT FROM PRODUCTS;
+
+
+
+ Name Department
+ Apple 6s, White Phone
+ Dell, Intel 4 core Computer
+ HP, AMD 2 core Computer
+ ... ...
+
+
+
+ Produced by OmniGraffle 6.2.5 2015-09-20 14:45:55 +0000
+
+
+ Canvas 1
+
+ Layer 1
+
+
+ Probability: 95% Reason: Because I guessed ‘Computer’ for ‘Department’ on Row ‘3’ of ‘PRODUCTS’
+
+
+
+ Allow users to EXPLAIN
uncertain outputs
+ Explanations include reasons given in English
+
+
+
+
+
$PRODUCTS.DEPARTMENT_{3}$
+
⬍
+
"I guessed 'Computer' for 'Department' on Row '3'"
+
+
+
+
+ Explanations
+
+ Mark uncertain data and results.
+ Upon request, provide more detail:
+
+ Why is my data uncertain? (provenance)
+ How bad is it? (confidence, entropy, bounds)
+ What are other possible answers? (samples)
+ What can I do to fix it? (repairs)
+
+
+
+
+
+
+
+
+ Mimir is a DB Overlay
+
+
+
+
+
+
+
+ (Any DB)
+
+
+
+
+ (Lens)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ (Any DB)
+
+
+
+
+
+ SELECT
+
+
+
+
+ SELECT
+
+
+
+
+ SELECT
+
+
+
+
+
+
+ UNION
+ UNION
+
+
+ Mimir virtualizes uncertainty
+ (OpenClipArt.org)
+
+
+
+
+
+
+
+ Labeled Nulls
+ $Var(\ldots)$ constructs new variables
+
+ $Var('X')$ constructs a new variable $X$
+ $Var('X', 1)$ constructs a new variable $X_{1}$
+ $Var('X', ROWID)$ evaluates $ROWID$ and then constructs a new variable $X_{ROWID}$
+
+
+
+
+
+ Lazy Evaluation
+ Variables can't be evaluated until they are bound. So, we allow arbitrary expressions to represent data.
+
+ $X$ is a legitimate data value.
+ $X+1$ is a legitimate data value.
+ $1+1$ is a legitimate data value, but can be reduced to $2$.
+
+ A lazy value without variables is deterministic
+
+
+
+ Mimir SQL allows the $Var()$ operator to be inlined
+
+ SELECT A, VAR('X', B)+2 AS C FROM R;
+
+
+
+
+
+ A B
+
+ 1 2
+ 3 4
+ 5 6
+
+
+
+ A C
+ 1 $X_2+2$
+ 3 $X_4+2$
+ 5 $X_6+2$
+
+
+
+
+
+
+ Selects on $Var()$ need to be deferred too...
+
+ SELECT A FROM R WHERE VAR('X', B) > 2;
+
+
+
+
+
+ A B
+
+ 1 2
+ 3 4
+ 5 6
+
+
+
+ A $\phi$
+ 1 $X_2>2$
+ 3 $X_4>2$
+ 5 $X_6>2$
+
+
+
+ When evaluating the table, rows where $\phi = \bot$ are dropped.
+
+
+
+ C-Tables
+
+ Original Formulation [Imielinski, Lipski 1981]
+ PC-Tables [Green, Tannen 2006]
+ Systems
+ Orchestra [Green, Karvounarakis, Taylor, Biton, Ives, Tannen 2007]
+ MayBMS [Huang, Antova, Koch, Olteanu 2009]
+ Pip [Kennedy, Koch 2009]
+ Sprout [Fink, Hogue, Olteanu, Rath 2011]
+
+ Generalized PC-Tables [Kennedy, Koch 2009]
+
+
+
+
+
+
+ Labeled nulls capture a lens' uncertainty
+
+
+
+
+ CREATE LENS PRODUCTS
+ AS SELECT * FROM PRODUCTS_RAW
+ USING DOMAIN_REPAIR(DEPARTMENT NOT NULL);
+
+
+
+
is (almost) the same as the query...
+
+ CREATE VIEW PRODUCTS
+ AS SELECT ID, NAME, ...,
+ CASE WHEN DEPARTMENT IS NOT NULL THEN DEPARTMENT
+ ELSE VAR('PRODUCTS.DEPARTMENT', ROWID)
+ END AS DEPARTMENT
+ FROM PRODUCTS_RAW;
+
+
+
+
+
+ ID Name ... Department
+ 123 Apple 6s, White ... Phone
+ 34234 Dell, Intel 4 core ... Computer
+ 34235 HP, AMD 2 core ... $Prod.Dept_3$
+ ... ... ... ...
+
+
+
+
+
+ CREATE LENS PRODUCTS
+ AS SELECT * FROM PRODUCTS_RAW
+ USING DOMAIN_REPAIR(DEPARTMENT NOT NULL);
+
+
+
+
Behind the scenes, a lens also creates a model...
+
+ SELECT * FROM PRODUCTS_RAW;
+
+
+
+
↓
+
+
+
+
+
+
↓
+
An estimator for $PRODUCTS.DEPARTMENT_{ROWID}$
+
+
+
+
+
+
+
+ ... but databases don't support labeled nulls
+
+
+
+ Labeled Nulls Percolate Up
+
+ SELECT A, VAR('X', B)+2 AS C FROM R;
+
+
+
Mimir dispatches this query to the DB:
+
+ SELECT A, B FROM R;
+
+
+
+
And for each row of the result, evaluates:
+
+ SELECT A, VAR('X', B)+2 AS C FROM RESULT;
+
+
+
+
+
+ Generating Explanations
+ All uncertainty comes from labeled nulls in the expressions that Mimir evaluates for each row of the output.
+
+ Why is the data uncertain?
+ All relevant lenses referenced in VAR('X', B)+2
.
+
+ How uncertain?
+ Estimate by sampling from VAR('X', B)
.
+
+ How do I fix it?
+ Each lens fixes one well-defined type of error.
+
+
+
+
+ Lazy evaluation can cause problems
+
+ SELECT R.A, S.C FROM R, S WHERE VAR('X', R.B) = S.B;
+
+
+
Mimir dispatches this query to the DB:
+
+ SELECT R.A, S.C, R.B AS TEMP_1, S.B AS TEMP_2 FROM R, S;
+
+
+
+
And for each row of the result, evaluates:
+
+ SELECT A, C FROM RESULT WHERE VAR('X', TEMP_1) = TEMP_2;
+
+
+
+
+
+ Helper views allow the DB to interpret labeled nulls
+
+ SELECT R.A, S.C FROM R, S
+ WHERE S.B = (SELECT VALUE FROM VARIABLE_X WHERE KEY = R.B);
+
+ ... but we lose the ability to explain outputs
+
+
+
+ Provenance Recovers Explanations
+
+ SELECT R.A, S.C FROM R, S WHERE VAR('X', R.B) = S.B;
+
+ Mimir dispatches this query to the DB:
+
+ SELECT R.A, S.C,
+ R.ROWID AS ID_1, S.ROWID AS ID_2 FROM R, S
+ WHERE S.B = (SELECT VALUE FROM VARIABLE_X WHERE KEY = R.B);
+
+
+
Then to explain, Mimir dispatches the query:
+
+ SELECT R.A, S.C, R.B AS TEMP_1, S.B AS TEMP_2 FROM R, S
+ WHERE R.ROWID = ID_1 AND S.ROWID = ID_2
+
+
+
+
+
+
+
+ Performance
+ TPC-H Data, but replace 0.1% of FK references with NULL. Ask Mimir to fix.
+ (a worst case from a performance standpoint)
+
+ Query 1: Table scan. Overhead for a no-op.
+ Query 3: 3-way join on an FK chain.
+ Query 5: 6-way join on an FK tree.
+ Query 9: 6-way join with cycles.
+
+
+
+
+
+ Mimir over SQLite in 4 different execution modes. 100% = Zero overhead
+
+
+
+
+
+
+
+ Questions?
+
+
+
+
+
+
+
+ C-Tables
+
+ Original Formulation [Imielinski, Lipski 1981]
+ PC-Tables [Green, Tannen 2006]
+ Systems
+ Orchestra [Green, Karvounarakis, Taylor, Biton, Ives, Tannen 2007]
+ MayBMS [Huang, Antova, Koch, Olteanu 2009]
+ Pip [Kennedy, Koch 2009]
+ Sprout [Fink, Hogue, Olteanu, Rath 2011]
+
+ Generalized PC-Tables [Kennedy, Koch 2009]
+
+
+
+
+ Lenses
+
+ A VG-RA Expression
+ A 'Model' that defines for each variable...
+ A sampling process
+ A best guess estimator
+ A human-readable description
+
+
+ Lenses implement PC-Tables
+
+
+
+
+ CREATE LENS PRODUCTS
+ AS SELECT * FROM PRODUCTS_RAW
+ USING DOMAIN_REPAIR(DEPARTMENT NOT NULL);
+
+
+
+ AS
clause defines source data.
+ USING
clause requests repairs.
+
+
+
+
+
+
+
+ Selection (Filtering)
+
+ SELECT NAME FROM PRODUCTS
+ WHERE DEPARTMENT='PHONE'
+ AND ( VENDOR='APPLE'
+ OR PLATFORM='ANDROID' )
+
+ Recall, row-level uncertainty is a boolean formula $\phi$.
+
+ For this query, $\phi$ can be as complex as:
+ $$DEPT_{ROWID}='P\ldots' \wedge \left( VEND_{ROWID}='Ap\ldots' \vee PLAT_{ROWID} = 'An\ldots' \right)$$
+ Too many variables! Which is the most important?
+
+
+
+ What is important?
+ Data Cleaning
+ Which variables are important?
+ The ones that keep us from knowing everything
+
+
+
+ $$D_{ROWID}='P' \wedge \left( V_{ROWID}='Ap' \vee PLAT_{ROWID} = 'An' \right)$$
+ ⬍
+ $$A \wedge (B \vee C)$$
+
+
+
+ Naive Approach
+
+ Consider a game between a database and an impartial oracle.
+
+ The DB picks a variable $v$ in $\phi$ and pays a cost $c_v$.
+ The Oracle reveals the truth value of $v$.
+ The DB updates $\phi$ accordingly and repeats until $\phi$ is deterministic.
+
+ Naive Algorithm: Pick all variables!
+ Less Naive Algorithm: Minimize $E\left[\sum c_v\right]$.
+
+
+
+ Exponential Time Bad!
+
+
+
+
+
+ The Value of What We Don't Know
+ $$\phi = A \wedge (B \vee C)$$
+
+ Generate Samples for $A$, $B$, $C$
+ Estimate $p(\phi)$
+ Compute $H[\phi] = -p(\phi)\log_2 p(\phi) - (1-p(\phi))\log_2\left(1-p(\phi)\right)$
+
+ Entropy is intuitive: $H = 1$ means we know nothing, $H = 0$ means we know everything.
+
+
+
+ Information Gain
+ $$\mathcal I_{A \leftarrow \top} (\phi) = H\left[\phi\right] - H\left[\phi(A \leftarrow \top)\right]$$
+ Information gain of $v$: The reduction in entropy from knowing the truth value of a variable $v$.
+
+
+
+ Expected Information Gain
+ $$\mathcal I_{A} (\phi) = \left(p(A)\cdot \mathcal I_{A\leftarrow \top}(\phi)\right) + \left(p(\neg A)\cdot \mathcal I_{A\leftarrow \bot}(\phi)\right)$$
+ Expected information gain of $v$: The probability-weighted average of the information gain for $v$ and $\neg v$.
+
+
+
+ The Cost of Perfect Information
+ Combine Information Gain and Cost
+ $$f(\mathcal I_{A}(\phi), c_A)$$
+ For example: $EG2(\mathcal I_{A}(\phi), c_A) = \frac{2^{\mathcal I_{A}(\phi)} - 1}{c_A}$
+ Greedy Algorithm: Maximize $f(\mathcal I_{A}(\phi), c_A)$ at each step
+
+
+
+ Experimental Data
+
+
+ Start with a large dataset.
+ Delete random fields (~50%).
+
+
+
+
+ Experimental Queries
+
+ Simulate an analyst trying to manually explore correlations.
+
+ Train a tree-classifier on the base data.
+ Convert the decision tree to a query for all rows where the tree predicts a specific value.
+
+
+
+
+ Cost vs Entropy: Credit Data
+
+
+ EG2: Greedy Cost/Value Ordering
+ NMETC: Naive Minimal Expected Total Cost
+ Random: Completely Random Order
+
+
+
+
+ Cost vs Entropy: Product Data
+
+
+ EG2: Greedy Cost/Value Ordering
+ NMETC: Naive Minimal Expected Total Cost
+ Random: Completely Random Order
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/slides/talks/2016-3-NYU-Mimir/ubodin.css b/slides/talks/2016-3-NYU-Mimir/ubodin.css
new file mode 100644
index 00000000..1e51f073
--- /dev/null
+++ b/slides/talks/2016-3-NYU-Mimir/ubodin.css
@@ -0,0 +1,362 @@
+@font-face {
+ font-family: 'News Cycle';
+ font-style: normal;
+ font-weight: 400;
+ src: local('News Cycle'), local('NewsCycle'), url(../reveal.js-3.1.0/fonts/9Xe8dq6pQDsPyVH2D3tMQsDdSZkkecOE1hvV7ZHvhyU.ttf) format('truetype');
+}
+@font-face {
+ font-family: 'News Cycle';
+ font-style: normal;
+ font-weight: 700;
+ src: local('News Cycle Bold'), local('NewsCycle-Bold'), url(../reveal.js-3.1.0/fonts/G28Ny31cr5orMqEQy6ljt8BaWKZ57bY3RXgXH6dOjZ0.ttf) format('truetype');
+}
+@font-face {
+ font-family: 'Lato';
+ font-style: normal;
+ font-weight: 400;
+ src: local('Lato Regular'), local('Lato-Regular'), url(../reveal.js-3.1.0/fonts/1EqTbJWOZQBfhZ0e3RL9uvesZW2xOQ-xsNqO47m55DA.ttf) format('truetype');
+}
+@font-face {
+ font-family: 'Lato';
+ font-style: normal;
+ font-weight: 700;
+ src: local('Lato Bold'), local('Lato-Bold'), url(../reveal.js-3.1.0/fonts/MZ1aViPqjfvZwVD_tzjjkwLUuEpTyoUstqEm5AMlJo4.ttf) format('truetype');
+}
+@font-face {
+ font-family: 'Lato';
+ font-style: italic;
+ font-weight: 400;
+ src: local('Lato Italic'), local('Lato-Italic'), url(../reveal.js-3.1.0/fonts/61V2bQZoWB5DkWAUJStypevvDin1pK8aKteLpeZ5c0A.ttf) format('truetype');
+}
+@font-face {
+ font-family: 'Lato';
+ font-style: italic;
+ font-weight: 700;
+ src: local('Lato Bold Italic'), local('Lato-BoldItalic'), url(../reveal.js-3.1.0/fonts/HkF_qI1x_noxlxhrhMQYECZ2oysoEQEeKwjgmXLRnTc.ttf) format('truetype');
+}
+
+
+
+/**@import url(https://fonts.googleapis.com/css?family=News+Cycle:400,700);
+@import url(https://fonts.googleapis.com/css?family=Lato:400,700,400italic,700italic);
+**/
+/**
+ * A simple theme for reveal.js presentations, similar
+ * to the default theme. The accent color is darkblue.
+ *
+ * This theme is Copyright (C) 2012 Owen Versteeg, https://github.com/StereotypicalApps. It is MIT licensed.
+ * reveal.js is Copyright (C) 2011-2012 Hakim El Hattab, http://hakim.se
+ */
+/*********************************************
+ * GLOBAL STYLES
+ *********************************************/
+body {
+ background: #fff;
+ background-color: #fff; }
+
+.reveal {
+ font-family: 'Lato', sans-serif;
+ font-size: 36px;
+ font-weight: normal;
+ color: #000; }
+
+::selection {
+ color: #fff;
+ background: rgba(0, 0, 0, 0.99);
+ text-shadow: none; }
+
+.reveal .slides > section, .reveal .slides > section > section {
+ line-height: 1.3;
+ font-weight: inherit; }
+
+/*********************************************
+ * STATIC HEADER/FOOTER
+ *********************************************/
+
+.reveal .header {
+ position: absolute;
+ top: 0px;
+ left: 0px;
+ right: 0px;
+ height: 25px;
+ text-align: center;
+ padding-left: 15px;
+ padding-right: 15px;
+ padding-bottom: 10px;
+ padding-top: 15px;
+ background-color: #041a9b;
+ color: white;
+ font-size: 0.5em;
+ z-index: 100;
+}
+.reveal .footer {
+ position: absolute;
+ bottom: 0px;
+ left: 0px;
+ right: 0px;
+ height: 40px;
+ text-align: center;
+ padding-left: 15px;
+ padding-right: 15px;
+ padding-bottom: 10px;
+ padding-top: 20px;
+ background-color: #041a9b;
+ color: white;
+ font-size: 0.5em;
+ z-index: 100;
+}
+
+
+/*********************************************
+ * HEADERS
+ *********************************************/
+.reveal h1, .reveal h2, .reveal h3, .reveal h4, .reveal h5, .reveal h6 {
+ margin: 0 0 20px 0;
+ color: #000;
+ font-family: 'News Cycle', Impact, sans-serif;
+ font-weight: normal;
+ line-height: 1.2;
+ letter-spacing: normal;
+ text-transform: none;
+ text-shadow: none;
+ word-wrap: break-word; }
+
+.reveal h1 {
+ font-size: 3.77em; }
+
+.reveal h2 {
+ font-size: 2.11em; }
+
+.reveal h3 {
+ font-size: 1.55em; }
+
+.reveal h4 {
+ font-size: 1em; }
+
+.reveal h1 {
+ text-shadow: none; }
+
+/*********************************************
+ * OTHER
+ *********************************************/
+.reveal p {
+ margin: 20px 0;
+ line-height: 1.3; }
+
+.reveal attribution {
+ font-size: 0.5em;
+ position: absolute;
+ right: -10px;
+ bottom: -10px;
+ text-align: right;
+}
+
+/* Ensure certain elements are never larger than the slide itself */
+.reveal img, .reveal video, .reveal iframe {
+ max-width: 95%;
+ max-height: 95%; }
+
+.reveal strong, .reveal b {
+ font-weight: bold; }
+
+.reveal em {
+ font-style: italic; }
+
+.reveal ol, .reveal dl, .reveal ul {
+ display: inline-block;
+ text-align: left;
+ margin: 0 0 0 1em; }
+
+.reveal ol {
+ list-style-type: decimal; }
+
+.reveal ul {
+ list-style-type: disc; }
+
+.reveal ul > li {
+ margin-top: 20px; }
+
+.reveal ul ul {
+ list-style-type: square; }
+
+.reveal ul ul ul {
+ list-style-type: circle; }
+
+.reveal ul ul, .reveal ul ol, .reveal ol ol, .reveal ol ul {
+ display: block;
+ margin-left: 40px; }
+
+.reveal dt {
+ font-weight: bold; }
+
+.reveal dd {
+ margin-left: 40px; }
+
+.reveal q, .reveal blockquote {
+ quotes: none; }
+
+.reveal blockquote {
+ display: block;
+ position: relative;
+ width: 70%;
+ margin: 20px auto;
+ padding: 5px;
+ font-style: italic;
+ background: rgba(255, 255, 255, 0.05);
+ box-shadow: 0px 0px 2px rgba(0, 0, 0, 0.2); }
+
+.reveal blockquote p:first-child, .reveal blockquote p:last-child {
+ display: inline-block; }
+
+.reveal q {
+ font-style: italic; }
+
+.reveal pre {
+ display: block;
+ position: relative;
+ width: 90%;
+ margin: 20px auto;
+ text-align: left;
+ font-size: 0.55em;
+ font-family: monospace;
+ line-height: 1.2em;
+ word-wrap: break-word;
+ box-shadow: 0px 0px 6px rgba(0, 0, 0, 0.3); }
+
+.reveal code {
+ font-family: monospace;
+}
+
+.reveal pre code {
+ display: block;
+ padding: 5px;
+ overflow: auto;
+ max-height: 400px;
+ word-wrap: normal;
+ background: #3F3F3F;
+ color: #DCDCDC; }
+
+.reveal table {
+ margin: auto;
+ border-collapse: collapse;
+ border-spacing: 0; }
+
+.reveal table th {
+ font-weight: bold;
+ border-bottom: 1px solid; }
+
+.reveal table th, .reveal table td {
+ text-align: center;
+ padding: 0.2em 0.5em 0.2em 0.5em;}
+
+.reveal table th[align="left"], .reveal table td[align="left"] {
+ text-align: left; }
+
+.reveal table th[align="right"], .reveal table td[align="right"] {
+ text-align: right; }
+
+.reveal table tr:last-child td {
+ border-bottom: none; }
+
+.reveal sup {
+ vertical-align: super; }
+
+.reveal sub {
+ vertical-align: sub; }
+
+.reveal small {
+ display: inline-block;
+ font-size: 0.6em;
+ line-height: 1.2em;
+ vertical-align: top; }
+
+.reveal small * {
+ vertical-align: top; }
+
+/*********************************************
+ * LINKS
+ *********************************************/
+.reveal a {
+ color: #00008B;
+ text-decoration: none;
+ -webkit-transition: color 0.15s ease;
+ -moz-transition: color 0.15s ease;
+ transition: color 0.15s ease; }
+
+.reveal a:hover {
+ color: #0000f1;
+ text-shadow: none;
+ border: none; }
+
+.reveal .roll span:after {
+ color: #fff;
+ background: #00003f; }
+
+/*********************************************
+ * IMAGES
+ *********************************************/
+.reveal section img {
+ margin: 15px 0px;
+ background: rgba(255, 255, 255, 0.12);
+}
+
+.reveal section img.bordered
+{
+ border: 4px solid #000;
+ box-shadow: 0 0 10px rgba(0, 0, 0, 0.15);
+}
+
+.reveal a img {
+ -webkit-transition: all 0.15s linear;
+ -moz-transition: all 0.15s linear;
+ transition: all 0.15s linear; }
+
+.reveal a:hover img {
+ background: rgba(255, 255, 255, 0.2);
+ border-color: #00008B;
+ box-shadow: 0 0 20px rgba(0, 0, 0, 0.55); }
+
+/*********************************************
+ * NAVIGATION CONTROLS
+ *********************************************/
+.reveal .controls div.navigate-left, .reveal .controls div.navigate-left.enabled {
+ border-right-color: #00008B; }
+
+.reveal .controls div.navigate-right, .reveal .controls div.navigate-right.enabled {
+ border-left-color: #00008B; }
+
+.reveal .controls div.navigate-up, .reveal .controls div.navigate-up.enabled {
+ border-bottom-color: #00008B; }
+
+.reveal .controls div.navigate-down, .reveal .controls div.navigate-down.enabled {
+ border-top-color: #00008B; }
+
+.reveal .controls div.navigate-left.enabled:hover {
+ border-right-color: #0000f1; }
+
+.reveal .controls div.navigate-right.enabled:hover {
+ border-left-color: #0000f1; }
+
+.reveal .controls div.navigate-up.enabled:hover {
+ border-bottom-color: #0000f1; }
+
+.reveal .controls div.navigate-down.enabled:hover {
+ border-top-color: #0000f1; }
+
+/*********************************************
+ * PROGRESS BAR
+ *********************************************/
+.reveal .progress {
+ background: rgba(0, 0, 0, 0.2); }
+
+.reveal .progress span {
+ background: #00008B;
+ -webkit-transition: width 800ms cubic-bezier(0.26, 0.86, 0.44, 0.985);
+ -moz-transition: width 800ms cubic-bezier(0.26, 0.86, 0.44, 0.985);
+ transition: width 800ms cubic-bezier(0.26, 0.86, 0.44, 0.985); }
+
+/*********************************************
+ * SLIDE NUMBER
+ *********************************************/
+.reveal .slide-number {
+ color: #00008B; }
diff --git a/slides/talks/2016-3-HILDA/images/assemblyline.jpeg b/slides/talks/2016-4-HILDA/images/assemblyline.jpeg
similarity index 100%
rename from slides/talks/2016-3-HILDA/images/assemblyline.jpeg
rename to slides/talks/2016-4-HILDA/images/assemblyline.jpeg
diff --git a/slides/talks/2016-3-HILDA/images/handcarved.jpeg b/slides/talks/2016-4-HILDA/images/handcarved.jpeg
similarity index 100%
rename from slides/talks/2016-3-HILDA/images/handcarved.jpeg
rename to slides/talks/2016-4-HILDA/images/handcarved.jpeg
diff --git a/slides/talks/2016-3-HILDA/images/twinkie.jpeg b/slides/talks/2016-4-HILDA/images/twinkie.jpeg
similarity index 100%
rename from slides/talks/2016-3-HILDA/images/twinkie.jpeg
rename to slides/talks/2016-4-HILDA/images/twinkie.jpeg
diff --git a/slides/talks/2016-3-HILDA/index.html b/slides/talks/2016-4-HILDA/index.html
similarity index 100%
rename from slides/talks/2016-3-HILDA/index.html
rename to slides/talks/2016-4-HILDA/index.html