diff --git a/cvs/okennedy_current_pending.pdf b/cvs/okennedy_current_pending.pdf index 186a646c..d198e57d 100644 Binary files a/cvs/okennedy_current_pending.pdf and b/cvs/okennedy_current_pending.pdf differ diff --git a/db/cv/okennedy/grants.json b/db/cv/okennedy/grants.json index bad7e21d..5a93331b 100644 --- a/db/cv/okennedy/grants.json +++ b/db/cv/okennedy/grants.json @@ -170,7 +170,7 @@ "effort" : "25%", "copis" : ["Hung Ngo", "Shambhu Upadhyaya", "Varun Chandola"], "status" : "accepted", - "start" : "09/2014", "end" : "08/2017", + "start" : "09/2014", "end" : "08/2018", "type" : "grant", "collaborative" : [ { "institution" : "University of Michigan Ann Arbor", diff --git a/slides/talks/2016-4-HILDA/index.html b/slides/talks/2016-4-HILDA/index.html index 2e7ca427..aa7f8047 100755 --- a/slides/talks/2016-4-HILDA/index.html +++ b/slides/talks/2016-4-HILDA/index.html @@ -125,12 +125,7 @@
-

Making spreadsheeds a lot less like...

- -
- -
-

And a lot more like...

+

Making spreadsheets a lot more like...

diff --git a/slides/talks/2016-5-Oracle-Mimir/graphics/BI-Analyst.jpg b/slides/talks/2016-5-Oracle-Mimir/graphics/BI-Analyst.jpg new file mode 100644 index 00000000..45d77c69 Binary files /dev/null and b/slides/talks/2016-5-Oracle-Mimir/graphics/BI-Analyst.jpg differ diff --git a/slides/talks/2016-5-Oracle-Mimir/graphics/Calendar_Base.png b/slides/talks/2016-5-Oracle-Mimir/graphics/Calendar_Base.png new file mode 100644 index 00000000..9e885986 Binary files /dev/null and b/slides/talks/2016-5-Oracle-Mimir/graphics/Calendar_Base.png differ diff --git a/slides/talks/2016-5-Oracle-Mimir/graphics/Calendar_Explain.png b/slides/talks/2016-5-Oracle-Mimir/graphics/Calendar_Explain.png new file mode 100644 index 00000000..4097cc0c Binary files /dev/null and b/slides/talks/2016-5-Oracle-Mimir/graphics/Calendar_Explain.png differ diff --git a/slides/talks/2016-5-Oracle-Mimir/graphics/FullText-black.png b/slides/talks/2016-5-Oracle-Mimir/graphics/FullText-black.png new file mode 100644 index 00000000..a3153259 Binary files /dev/null and b/slides/talks/2016-5-Oracle-Mimir/graphics/FullText-black.png differ diff --git a/slides/talks/2016-5-Oracle-Mimir/graphics/FullText-white.png b/slides/talks/2016-5-Oracle-Mimir/graphics/FullText-white.png new file mode 100644 index 00000000..b3f42b46 Binary files /dev/null and b/slides/talks/2016-5-Oracle-Mimir/graphics/FullText-white.png differ diff --git a/slides/talks/2016-5-Oracle-Mimir/graphics/MSImageID.png b/slides/talks/2016-5-Oracle-Mimir/graphics/MSImageID.png new file mode 100644 index 00000000..8d4547fe Binary files /dev/null and b/slides/talks/2016-5-Oracle-Mimir/graphics/MSImageID.png differ diff --git a/slides/talks/2016-5-Oracle-Mimir/graphics/UIExample.png b/slides/talks/2016-5-Oracle-Mimir/graphics/UIExample.png new file mode 100644 index 00000000..e6300fa3 Binary files /dev/null and b/slides/talks/2016-5-Oracle-Mimir/graphics/UIExample.png differ diff --git 
a/slides/talks/2016-5-Oracle-Mimir/graphics/User-Icon-Remix-by-Merlin2525-800px.png b/slides/talks/2016-5-Oracle-Mimir/graphics/User-Icon-Remix-by-Merlin2525-800px.png new file mode 100644 index 00000000..8d798b00 Binary files /dev/null and b/slides/talks/2016-5-Oracle-Mimir/graphics/User-Icon-Remix-by-Merlin2525-800px.png differ diff --git a/slides/talks/2016-5-Oracle-Mimir/graphics/azure-data-lake.png b/slides/talks/2016-5-Oracle-Mimir/graphics/azure-data-lake.png new file mode 100644 index 00000000..f55d3221 Binary files /dev/null and b/slides/talks/2016-5-Oracle-Mimir/graphics/azure-data-lake.png differ diff --git a/slides/talks/2016-5-Oracle-Mimir/graphics/credit_entropy.pdf b/slides/talks/2016-5-Oracle-Mimir/graphics/credit_entropy.pdf new file mode 100644 index 00000000..766bd274 Binary files /dev/null and b/slides/talks/2016-5-Oracle-Mimir/graphics/credit_entropy.pdf differ diff --git a/slides/talks/2016-5-Oracle-Mimir/graphics/credit_entropy.png b/slides/talks/2016-5-Oracle-Mimir/graphics/credit_entropy.png new file mode 100644 index 00000000..f0e0184f Binary files /dev/null and b/slides/talks/2016-5-Oracle-Mimir/graphics/credit_entropy.png differ diff --git a/slides/talks/2016-5-Oracle-Mimir/graphics/crystalball-800px.png b/slides/talks/2016-5-Oracle-Mimir/graphics/crystalball-800px.png new file mode 100644 index 00000000..4a4624cc Binary files /dev/null and b/slides/talks/2016-5-Oracle-Mimir/graphics/crystalball-800px.png differ diff --git a/slides/talks/2016-5-Oracle-Mimir/graphics/dagobert83-female-user-icon-800px.png b/slides/talks/2016-5-Oracle-Mimir/graphics/dagobert83-female-user-icon-800px.png new file mode 100644 index 00000000..e3c69b0a Binary files /dev/null and b/slides/talks/2016-5-Oracle-Mimir/graphics/dagobert83-female-user-icon-800px.png differ diff --git a/slides/talks/2016-5-Oracle-Mimir/graphics/data-lake-to-data-swamp.jpg b/slides/talks/2016-5-Oracle-Mimir/graphics/data-lake-to-data-swamp.jpg new file mode 100644 index 
00000000..94f43b48 Binary files /dev/null and b/slides/talks/2016-5-Oracle-Mimir/graphics/data-lake-to-data-swamp.jpg differ diff --git a/slides/talks/2016-5-Oracle-Mimir/graphics/database-server-800px.png b/slides/talks/2016-5-Oracle-Mimir/graphics/database-server-800px.png new file mode 100644 index 00000000..574dbffd Binary files /dev/null and b/slides/talks/2016-5-Oracle-Mimir/graphics/database-server-800px.png differ diff --git a/slides/talks/2016-5-Oracle-Mimir/graphics/db.svg b/slides/talks/2016-5-Oracle-Mimir/graphics/db.svg new file mode 100644 index 00000000..974e0cba --- /dev/null +++ b/slides/talks/2016-5-Oracle-Mimir/graphics/db.svg @@ -0,0 +1,330 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + image/svg+xml + + + + + Openclipart + + + database symbol + 2010-11-08T22:08:43 + database symbol in metallic style + https://openclipart.org/detail/94723/database-symbol-by-rg1024 + + + rg1024 + + + + + database + server + symbol + + + + + + + + + + + diff --git a/slides/talks/2016-5-Oracle-Mimir/graphics/ericlemerdy-Server-1-800px.png b/slides/talks/2016-5-Oracle-Mimir/graphics/ericlemerdy-Server-1-800px.png new file mode 100644 index 00000000..ae8fca7c Binary files /dev/null and b/slides/talks/2016-5-Oracle-Mimir/graphics/ericlemerdy-Server-1-800px.png differ diff --git a/slides/talks/2016-5-Oracle-Mimir/graphics/interfaces.png b/slides/talks/2016-5-Oracle-Mimir/graphics/interfaces.png new file mode 100644 index 00000000..69053d85 Binary files /dev/null and b/slides/talks/2016-5-Oracle-Mimir/graphics/interfaces.png differ diff --git a/slides/talks/2016-5-Oracle-Mimir/graphics/iu.jpeg b/slides/talks/2016-5-Oracle-Mimir/graphics/iu.jpeg new file mode 100644 index 00000000..40d11ed8 Binary files /dev/null and b/slides/talks/2016-5-Oracle-Mimir/graphics/iu.jpeg differ diff --git a/slides/talks/2016-5-Oracle-Mimir/graphics/jean-victor-balin-icon-table.svg 
b/slides/talks/2016-5-Oracle-Mimir/graphics/jean-victor-balin-icon-table.svg new file mode 100644 index 00000000..2d4e84e1 --- /dev/null +++ b/slides/talks/2016-5-Oracle-Mimir/graphics/jean-victor-balin-icon-table.svg @@ -0,0 +1,482 @@ + + + + icon_table + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + image/svg+xml + + + + + Openclipart + + + icon_table + 2010-01-29T14:02:11 + + https://openclipart.org/detail/29121/icon_table-by-jean_victor_balin + + + jean_victor_balin + + + + + calc + icon + table + unchecked + + + + + + + + + + + diff --git a/slides/talks/2016-5-Oracle-Mimir/graphics/list-add-800px.png b/slides/talks/2016-5-Oracle-Mimir/graphics/list-add-800px.png new file mode 100644 index 00000000..8dbb6fb0 Binary files /dev/null and b/slides/talks/2016-5-Oracle-Mimir/graphics/list-add-800px.png differ diff --git a/slides/talks/2016-5-Oracle-Mimir/graphics/littlestorefront-800px.png b/slides/talks/2016-5-Oracle-Mimir/graphics/littlestorefront-800px.png new file mode 100644 index 00000000..b4d5ace2 Binary files /dev/null and b/slides/talks/2016-5-Oracle-Mimir/graphics/littlestorefront-800px.png differ diff --git a/slides/talks/2016-5-Oracle-Mimir/graphics/matt-icons_text-x-log-300px.png b/slides/talks/2016-5-Oracle-Mimir/graphics/matt-icons_text-x-log-300px.png new file mode 100644 index 00000000..a61f729e Binary files /dev/null and b/slides/talks/2016-5-Oracle-Mimir/graphics/matt-icons_text-x-log-300px.png differ diff --git a/slides/talks/2016-5-Oracle-Mimir/graphics/maybe-address.png b/slides/talks/2016-5-Oracle-Mimir/graphics/maybe-address.png new file mode 100644 index 00000000..0f68aee3 Binary files /dev/null and b/slides/talks/2016-5-Oracle-Mimir/graphics/maybe-address.png differ diff --git a/slides/talks/2016-5-Oracle-Mimir/graphics/maybe-detail.png b/slides/talks/2016-5-Oracle-Mimir/graphics/maybe-detail.png new file mode 100644 index 00000000..da73143a Binary 
files /dev/null and b/slides/talks/2016-5-Oracle-Mimir/graphics/maybe-detail.png differ diff --git a/slides/talks/2016-5-Oracle-Mimir/graphics/maybe-screen.png b/slides/talks/2016-5-Oracle-Mimir/graphics/maybe-screen.png new file mode 100644 index 00000000..d54fc386 Binary files /dev/null and b/slides/talks/2016-5-Oracle-Mimir/graphics/maybe-screen.png differ diff --git a/slides/talks/2016-5-Oracle-Mimir/graphics/mimir_logo_final.png b/slides/talks/2016-5-Oracle-Mimir/graphics/mimir_logo_final.png new file mode 100644 index 00000000..fe23d4e0 Binary files /dev/null and b/slides/talks/2016-5-Oracle-Mimir/graphics/mimir_logo_final.png differ diff --git a/slides/talks/2016-5-Oracle-Mimir/graphics/performance-dbx1g.png b/slides/talks/2016-5-Oracle-Mimir/graphics/performance-dbx1g.png new file mode 100644 index 00000000..d3fcb643 Binary files /dev/null and b/slides/talks/2016-5-Oracle-Mimir/graphics/performance-dbx1g.png differ diff --git a/slides/talks/2016-5-Oracle-Mimir/graphics/performance-sqlite1g.png b/slides/talks/2016-5-Oracle-Mimir/graphics/performance-sqlite1g.png new file mode 100644 index 00000000..647058d2 Binary files /dev/null and b/slides/talks/2016-5-Oracle-Mimir/graphics/performance-sqlite1g.png differ diff --git a/slides/talks/2016-5-Oracle-Mimir/graphics/primary-queries.svg b/slides/talks/2016-5-Oracle-Mimir/graphics/primary-queries.svg new file mode 100644 index 00000000..5006a888 --- /dev/null +++ b/slides/talks/2016-5-Oracle-Mimir/graphics/primary-queries.svg @@ -0,0 +1,113 @@ + + + + + + + + + + + + + image/svg+xml + + + + + Openclipart + + + + + + + + + + + diff --git a/slides/talks/2016-5-Oracle-Mimir/graphics/product_entropy.pdf b/slides/talks/2016-5-Oracle-Mimir/graphics/product_entropy.pdf new file mode 100644 index 00000000..37baf94d Binary files /dev/null and b/slides/talks/2016-5-Oracle-Mimir/graphics/product_entropy.pdf differ diff --git a/slides/talks/2016-5-Oracle-Mimir/graphics/product_entropy.png 
b/slides/talks/2016-5-Oracle-Mimir/graphics/product_entropy.png new file mode 100644 index 00000000..15c77940 Binary files /dev/null and b/slides/talks/2016-5-Oracle-Mimir/graphics/product_entropy.png differ diff --git a/slides/talks/2016-5-Oracle-Mimir/graphics/realestate_entropy.pdf b/slides/talks/2016-5-Oracle-Mimir/graphics/realestate_entropy.pdf new file mode 100644 index 00000000..4f1bfd76 Binary files /dev/null and b/slides/talks/2016-5-Oracle-Mimir/graphics/realestate_entropy.pdf differ diff --git a/slides/talks/2016-5-Oracle-Mimir/graphics/saco-800px.png b/slides/talks/2016-5-Oracle-Mimir/graphics/saco-800px.png new file mode 100644 index 00000000..56f10287 Binary files /dev/null and b/slides/talks/2016-5-Oracle-Mimir/graphics/saco-800px.png differ diff --git a/slides/talks/2016-5-Oracle-Mimir/graphics/test.svg b/slides/talks/2016-5-Oracle-Mimir/graphics/test.svg new file mode 100644 index 00000000..2bb6dccd --- /dev/null +++ b/slides/talks/2016-5-Oracle-Mimir/graphics/test.svg @@ -0,0 +1,4 @@ + + Foo + + \ No newline at end of file diff --git a/slides/talks/2016-5-Oracle-Mimir/graphics/weka.png b/slides/talks/2016-5-Oracle-Mimir/graphics/weka.png new file mode 100644 index 00000000..37a1188a Binary files /dev/null and b/slides/talks/2016-5-Oracle-Mimir/graphics/weka.png differ diff --git a/slides/talks/2016-5-Oracle-Mimir/index.html b/slides/talks/2016-5-Oracle-Mimir/index.html new file mode 100644 index 00000000..09cd6ebf --- /dev/null +++ b/slides/talks/2016-5-Oracle-Mimir/index.html @@ -0,0 +1,1127 @@ + + + + + + + Embracing Uncertainty + + + + + + + + + + + + + + + + + + + + + + + +
+ + +
+ + Embracing Uncertainty +
+ + +
+ +
+

Embracing uncertainty with

+ +
+ +
+

Joint work with:

+

+ PhD Students: Ying Yang, Niccolo Meneghetti, Will Spoth, Aaron Huber, Poonam Kumari
+ BS Students: Lisa Lu, Mike Kulbacki, Jacob P. Verghese
+ Alums: Arindam Nandi (HPE/Vertica), Vinayak Karuppasamy (Bloomberg)
+ Collabs: Ronny Fehling (Airbus), Zhen-Hua Liu (Oracle), Dieter Gawlick (Oracle), Beda Hammerschmidt (Oracle), + Boris Glavic (IIT), Wolfgang Gatterbauer (CMU), Juliana Freire (NYU), Heiko Mueller (NYU) +

+
+ + +
+
+

A Big Data Fairy Tale

+
+ +
+ +

Meet Alice

+ + (OpenClipArt.org) +
+ +
+ + +

Alice has a Store

+ + (OpenClipArt.org) +
+ +
+ + + +

Alice's store collects sales data

+ + (OpenClipArt.org) +
+ +
+ + + + + = + +

Alice wants to use her sales data to run a promotion

+ + (OpenClipArt.org) +
+ +
+ + + +

So Alice loads up her sales data in her trusty database/hadoop/spark/etc... server.

+ + (OpenClipArt.org) +
+ +
+ + + ? +

... asks her question ...

+ + (OpenClipArt.org) +
+ +
+ + + ? → + +

... and basks in the limitless possibilities of big data.

+ + (OpenClipArt.org) +
+
+ +
+
+

Why is this a fairy tale?

+
+ +
+ + + +

It's never this easy...

+
+
+ +
+
+

CSV Import

+

Run a SELECT on a raw CSV File

+
    +
  • File may not have column headers
  • +
  • CSV does not provide "types"
  • +
  • Lines may be missing fields
  • +
  • Fields may be mistyped (typo, missing comma)
  • +
  • Comment text can be inlined into the file
  • +
+

+ State of the art: External Table Defn + "Manually" edit CSV +

+
+ +
+

Merge Two Datasets

+

UNION two data sources

+
    +
  • Schema matching
  • +
  • Deduplication
  • +
  • Format alignment (GIS coordinates, $ vs €) +
  • Precision alignment (State vs County)
  • +
+

+ State of the art: Manually map schema +

+
+ +
+

JSON Shredding

+

Run a SELECT on JSON or a Doc Store

+
    +
  • Separating fields and record sets:
    (e.g., { A: "Bob", B: "Alice" })
  • +
  • Missing fields (Records with no 'address')
  • +
  • Type alignment (Records with 'address' as an array)
  • +
  • Schema matching$^2$
  • +
+

+ State of the art: DataGuide, Wrangler, etc... +

+
+
+ +
+
+

Data Cleaning is Hard!

+
+ +
+

State of the Art

+ + + (skilledup.com) + +

Alice spends weeks cleaning her data before using it.

+
+ +
+

Newer State of the Art

+ + (azure.microsoft.com) +
+ +
+ + (timoelliott.com) +
+
+ +
+ +
+

Structure is hard!

+
    +
  • Structured models (RelDBs) force curation during loading. +
    • Problem: All curation costs are upfront.
    +
  • +
  • Unstructured models (NoSQL) force curation into queries. +
    • Problem: Complexity/redundancy blowup in queries.
    +
  • +
+

Add structure, curation effort On-Demand

+
+ +
+

But... you still need some sort of structure?!?

+ +

Let the database make a guess!

+
+ +
+

+ In the name of Codd,
thou shalt not give the user a wrong answer. +

+ +

+ ... but what if we did? +

+

+ What would it take for that to be ok? +

+
+
+ +
+
+

Industry says...

+
+ +
+ +              +
+

My phone is guessing, but is letting me know that it did

+
+ +
+ +

Microsoft Image ID tells you something (and how sure it is)

+
+ +
+ +
+ +
+ +

Easy interactions to accept, reject, or explain uncertainty

+
+ +
+

Communication

+
    +
  • Why is my data uncertain?
  • +
  • How bad is it?
  • +
  • What can I do about it?
  • +
+
+ +
+

What if a database did the same?

+
+ +
+
    +
  • A: Standard SQL.
  • +
  • B: Annotated Output.
  • +
  • C: Lens Diagram.
  • +
  • D: Result Explanations.
  • +
+ +
+
+ +
+

Mimir

+
    +
  • Lenses: Generic, best-guess data curation operators.
  • + +
  • Explanations: How certain is my data?
  • + +
  • Provenance: What issues still need to be fixed?
  • +
+
+ +
+
+

Lenses

+

Here's a problem with my data. Fix it.

+
    +
  • What type is this column? (majority vote)
  • +
  • How do the columns of these relations line up? (pick your favorite schema matching paper)
  • +
  • How do I query heterogeneous JSON objects? (see above)
  • +
  • What should these missing values be? (learning-based interpolation)
  • +
+
+ +
+ + + View: + + SELECT + + + + + + Lens: + + SELECT + + + + [      ] + + + + + + + (best guess) + + + + + + +

Lenses introduce uncertainty

+ (OpenClipArt.org) +
+
+

The User's View

+

+  SELECT NAME, DEPARTMENT FROM PRODUCTS;
+					
+ + + + + + +
NameDepartment
Apple 6s, WhitePhone
Dell, Intel 4 coreComputer
HP, AMD 2 coreComputer
......
+

Simple UI: Highlight values that are based on guesses.

+
+ +
+

+  SELECT NAME, DEPARTMENT FROM PRODUCTS;
+					
+ + + + + + + +
NameDepartment
Apple 6s, WhitePhone
Dell, Intel 4 coreComputer
HP, AMD 2 coreComputer
......
+
+ + Produced by OmniGraffle 6.2.5 2015-09-20 14:45:55 +0000 + + + Canvas 1 + + Layer 1 + + + Probability: 95%Reason: Because I guessed ‘Computer’ for ‘Department’ on Row ‘3’ of ‘PRODUCTS’ + + + +

Allow users to EXPLAIN uncertain outputs

+

Explanations include reasons given in English

+
+ +
+
+

$PRODUCTS.DEPARTMENT_{3}$

+
+

"I guessed 'Computer' for 'Department' on Row '3'"

+
+
+ +
+

Explanations

+
    +
  1. Mark uncertain data and results.
  2. +
  3. Upon request, provide more detail: +
      +
    • Why is my data uncertain? (provenance)
    • +
    • How bad is it? (confidence, entropy, bounds)
    • +
    • What are other possible answers? (samples)
    • +
    • What can I do to fix it? (repairs)
    • +
  4. +
+
+ +
+ +
+
+

Mimir is a DB Overlay

+
+ +
+ + + + + (Any DB) + + + + + (Lens) + + + + + + + + + + + + + + + + + (Any DB) + + + + + + SELECT + + + + + SELECT + + + + + SELECT + + + + + + + UNION + UNION + + +

Mimir virtualizes uncertainty + (OpenClipArt.org) +

+
+ +
+

How?

+
+ +
+
+

Labeled Nulls

+

$Var(\ldots)$ constructs new variables

+
    +
  • $Var('X')$ constructs a new variable $X$
  • +
  • $Var('X', 1)$ constructs a new variable $X_{1}$
  • +
  • $Var('X', ROWID)$ evaluates $ROWID$ and then constructs a new variable $X_{ROWID}$
  • +
+
+ + +
+

Lazy Evaluation

+

Variables can't be evaluated until they are bound.
So, we allow arbitrary expressions to represent data.

+
    +
  • $X$ is a legitimate data value.
  • +
  • $X+1$ is a legitimate data value.
  • +
  • $1+1$ is a legitimate data value, but can be reduced to $2$.
  • +
+

A lazy value without variables is deterministic

+
+ +
+

Mimir SQL allows the $Var()$ operator to be inlined

+

+                  SELECT A, VAR('X', B)+2 AS C FROM R;
+					
+ +
+ + + + + + + + +
AB
12
34
56
+ + + + + +
AC
1$X_2+2$
3$X_4+2$
5$X_6+2$
+
+
 
+
+ +
+

Selects on $Var()$ need to be deferred too...

+

+                  SELECT A FROM R WHERE VAR('X', B) > 2;
+					
+ +
+ + + + + + + + +
AB
12
34
56
+ + + + + +
A$\phi$
1$X_2>2$
3$X_4>2$
5$X_6>2$
+
+
 
+

When evaluating the table, rows where $\phi = \bot$ are dropped.

+
+ +
+

C-Tables

+
    +
  • Original Formulation [Imielinski, Lipski 1981]
  • +
  • PC-Tables [Green, Tannen 2006]
  • +
  • Systems
      +
    • Orchestra [Green, Karvounarakis, Taylor, Biton, Ives, Tannen 2007]
    • +
    • MayBMS [Huang, Antova, Koch, Olteanu 2009]
    • +
    • Pip [Kennedy, Koch 2009] +
    • Sprout [Fink, Hogue, Olteanu, Rath 2011]
    • +
  • +
  • Generalized PC-Tables [Kennedy, Koch 2009]
  • +
+
+
+ +
+
+

Labeled nulls capture a lens' uncertainty

+
+ +
+

+  CREATE LENS PRODUCTS 
+     AS SELECT * FROM PRODUCTS_RAW
+     USING DOMAIN_REPAIR(DEPARTMENT NOT NULL);
+					
+ +
+

is (almost) the same as the query...

+

+  CREATE VIEW PRODUCTS 
+     AS SELECT ID, NAME, ...,
+          CASE WHEN DEPARTMENT IS NOT NULL THEN DEPARTMENT
+               ELSE VAR('PRODUCTS.DEPARTMENT', ROWID)
+          END AS DEPARTMENT
+     FROM PRODUCTS_RAW;
+						
+
+ + + + + + + + +
IDName...Department
123Apple 6s, White...Phone
34234Dell, Intel 4 core...Computer
34235HP, AMD 2 core...$Prod.Dept_3$
............
+
+
+
+

+  CREATE LENS PRODUCTS 
+     AS SELECT * FROM PRODUCTS_RAW
+     USING DOMAIN_REPAIR(DEPARTMENT NOT NULL);
+					
+ +
+

Behind the scenes, a lens also creates a model...

+

+                      SELECT * FROM PRODUCTS_RAW;
+						
+
+
+
+
+ +
+
+
+
+

An estimator for $PRODUCTS.DEPARTMENT_{ROWID}$

+
+ +
+
+ +
+
+

... but databases don't support labeled nulls

+
+ +
+

Labeled Nulls Percolate Up

+

+                  SELECT A, VAR('X', B)+2 AS C FROM R;
+					
+
+

Mimir dispatches this query to the DB:

+

+                          SELECT A, B FROM R;
+					
+
+
+

And for each row of the result, evaluates:

+

+               SELECT A, VAR('X', B)+2 AS C FROM RESULT;
+					
+
+
+ +
+

Generating Explanations

+

All uncertainty comes from labeled nulls in the expressions that Mimir evaluates for each row of the output.

+
+
Why is the data uncertain?
+
All relevant lenses referenced in VAR('X', B)+2.
+ +
How uncertain?
+
Estimate by sampling from VAR('X', B).
+ +
How do I fix it?
+
Each lens fixes one well-defined type of error.
+
+
+ +
+

Lazy evaluation can cause problems

+

+        SELECT R.A, S.C FROM R, S WHERE VAR('X', R.B) = S.B;
+					
+
+

Mimir dispatches this query to the DB:

+

+      SELECT R.A, S.C, R.B AS TEMP_1, S.B AS TEMP_2 FROM R, S;
+					
+
+
+

And for each row of the result, evaluates:

+

+      SELECT A, C FROM RESULT WHERE VAR('X', TEMP_1) = TEMP_2;
+					
+
+
+ +
+

Helper views allow the DB to interpret labeled nulls +


+      SELECT R.A, S.C FROM R, S
+      WHERE S.B = (SELECT VALUE FROM VARIABLE_X WHERE KEY = R.B);
+					
+

... but we lose the ability to explain outputs

+
+ +
+

Provenance Recovers Explanations

+

+        SELECT R.A, S.C FROM R, S WHERE VAR('X', R.B) = S.B;
+					
+

Mimir dispatches this query to the DB:

+

+      SELECT R.A, S.C, 
+             R.ROWID AS ID_1, S.ROWID AS ID_2
+      WHERE S.B = (SELECT VALUE FROM VARIABLE_X WHERE KEY = R.B);
+					
+
+

Then to explain, Mimir dispatches the query:

+

+      SELECT R.A, S.C, R.B AS TEMP_1, S.B AS TEMP_2
+      WHERE R.ROWID = ID_1 AND S.ROWID = ID_2
+					
+
+
+
+ +
+
+

Performance

+

TPC-H Data, but replace 0.1% of FK references with NULL. Ask Mimir to fix.

+

(a worst case from a performance standpoint)

+
    +
  • Query 1: Table scan. Overhead for a no-op.
  • +
  • Query 3: 3-way join on an FK chain.
  • +
  • Query 5: 6-way join on an FK tree.
  • +
  • Query 9: 6-way join with cycles.
  • +
+
+ +
+
+
Classic:
+
Naive execution strategy where Mimir computes joins.
+
Partition:
+
Separate query fragments compute 'certain' results and one or more classes of uncertain results.
+
Inline:
+
Nested SELECTs dynamically inject best guess values into the query.
+
Hybrid:
+
Combine Partition and Inline.
+
+
+ +
+ +

Mimir over "DB X" in 4 different execution modes.
100% = Zero overhead, Timeout at 1400x

+
+
+ +
+
+

Presentation

+

Participants were shown a table of 3 products with 3 ratings (e.g., Amazon, Best Buy, Walmart) each

+

Step 1: The randomly generated ratings were biased to encourage a predictable, but mildly ambiguous ordering of the three products.

+
+ +
+

Step 2: We used the same randomization, but this time we marked several of the values as uncertain: +

    +
  • Red Text
  • +
  • Red Background
  • +
  • $value \pm tolerance$
  • +
  • $value*$
  • +
+

+
+ +
+

Probability of Agreement With Target Order

+ +
+
+ +
+
+ +
    +
  • On-Demand Data Curation makes data exploration easier.
  • +
  • "Best-Guess" results streamline analytics. +
       ... if the DB communicates the resulting uncertainty.
  • +
+

Questions?

+
+
+ +
+
+

Backup Slides

+
+
+ +
+
+

C-Tables

+
    +
  • Original Formulation [Imielinski, Lipski 1981]
  • +
  • PC-Tables [Green, Tannen 2006]
  • +
  • Systems
      +
    • Orchestra [Green, Karvounarakis, Taylor, Biton, Ives, Tannen 2007]
    • +
    • MayBMS [Huang, Antova, Koch, Olteanu 2009]
    • +
    • Pip [Kennedy, Koch 2009] +
    • Sprout [Fink, Hogue, Olteanu, Rath 2011]
    • +
  • +
  • Generalized PC-Tables [Kennedy, Koch 2009]
  • +
+
+ +
+

Lenses

+
    +
  • A VG-RA Expression
  • +
  • A 'Model' that defines for each variable...
      +
    • A sampling process
    • +
    • A best guess estimator
    • +
    • A human-readable description
    • +
  • +
+

Lenses implement PC-Tables

+
+ +
+

+  CREATE LENS PRODUCTS 
+     AS SELECT * FROM PRODUCTS_RAW
+     USING DOMAIN_REPAIR(DEPARTMENT NOT NULL);
+					
+ +
    +
  • AS clause defines source data.
  • +
  • USING clause requests repairs.
  • +
+ +
+
+ +
+
+

Selection (Filtering)

+

+                  SELECT NAME FROM PRODUCTS
+                  WHERE DEPARTMENT='PHONE' 
+                     AND ( VENDOR='APPLE' 
+                           OR PLATFORM='ANDROID' )
+					
+

Recall, row-level uncertainty is a boolean formula $\phi$.

+

+ For this query, $\phi$ can be as complex as: + $$DEPT_{ROWID}='P\ldots' \wedge \left( VEND_{ROWID}='Ap\ldots' \vee PLAT_{ROWID} = 'An\ldots' \right)$$

+

Too many variables! Which is the most important?

+
+ +
+

What is important?

+

Data Cleaning

+

Which variables are important?

+

The ones that keep us from knowing everything

+
+ +
+

$$D_{ROWID}='P' \wedge \left( V_{ROWID}='Ap' \vee PLAT_{ROWID} = 'An' \right)$$

+
+

$$A \wedge (B \vee C)$$

+
+ +
+

Naive Approach

+ +

Consider a game between a database and an impartial oracle.

+
    +
  • The DB picks a variable $v$ in $\phi$ and pays a cost $c_v$.
  • +
  • The Oracle reveals the truth value of $v$.
  • +
  • The DB updates $\phi$ accordingly and repeats until $\phi$ is deterministic.
  • +
+

Naive Algorithm: Pick all variables!

+

Less Naive Algorithm: Minimize $E\left[\sum c_v\right]$.

+
+ +
+

Exponential Time Bad!

+
+
+ +
+
+

The Value of What We Don't Know

+

$$\phi = A \wedge (B \vee C)$$

+
    +
  1. Generate Samples for $A$, $B$, $C$
  2. +
  3. Estimate $p(\phi)$
  4. +
  5. Compute $H[\phi] = -p(\phi)\log_2 p(\phi) - (1-p(\phi))\log_2(1-p(\phi))$
  6. +
+

Entropy is intuitive:
$H = 1$ means we know nothing,
$H = 0$ means we know everything.

+
+ +
+

Information Gain

+

$$\mathcal I_{A \leftarrow \top} (\phi) = H\left[\phi\right] - H\left[\phi(A \leftarrow \top)\right]$$

+

Information gain of $v$: The reduction in entropy from knowing the truth value of a variable $v$.

+
+ +
+

Expected Information Gain

+

$$\mathcal I_{A} (\phi) = \left(p(A)\cdot \mathcal I_{A\leftarrow \top}(\phi)\right) + \left(p(\neg A)\cdot \mathcal I_{A\leftarrow \bot}(\phi)\right)$$

+

Expected information gain of $v$: The probability-weighted average of the information gain for $v$ and $\neg v$.

+
+ +
+

The Cost of Perfect Information

+

Combine Information Gain and Cost

+

$$f(\mathcal I_{A}(\phi), c_A)$$

+

For example: $EG2(\mathcal I_{A}(\phi), c_A) = \frac{2^{\mathcal I_{A}(\phi)} - 1}{c_A}$

+

Greedy Algorithm: Maximize $f(\mathcal I_{A}(\phi), c_A)$ at each step

+
+ +
+

Experimental Data

+ +
    +
  • Start with a large dataset.
  • +
  • Delete random fields (~50%).
  • +
+
+ +
+

Experimental Queries

+ +

Simulate an analyst trying to manually explore correlations.

+
    +
  • Train a tree-classifier on the base data.
  • +
  • Convert the decision tree to a query for all rows where the tree predicts a specific value.
  • +
+
+ +
+

Cost vs Entropy: Credit Data

+ +

+ EG2: Greedy Cost/Value Ordering
+ NMETC: Naive Minimal Expected Total Cost
+ Random: Completely Random Order +

+
+ +
+

Cost vs Entropy: Product Data

+ +

+ EG2: Greedy Cost/Value Ordering
+ NMETC: Naive Minimal Expected Total Cost
+ Random: Completely Random Order +

+
+ +
+ +
+ + + + + + + + diff --git a/slides/talks/2016-5-Oracle-Mimir/ubodin.css b/slides/talks/2016-5-Oracle-Mimir/ubodin.css new file mode 100644 index 00000000..1e51f073 --- /dev/null +++ b/slides/talks/2016-5-Oracle-Mimir/ubodin.css @@ -0,0 +1,362 @@ +@font-face { + font-family: 'News Cycle'; + font-style: normal; + font-weight: 400; + src: local('News Cycle'), local('NewsCycle'), url(../reveal.js-3.1.0/fonts/9Xe8dq6pQDsPyVH2D3tMQsDdSZkkecOE1hvV7ZHvhyU.ttf) format('truetype'); +} +@font-face { + font-family: 'News Cycle'; + font-style: normal; + font-weight: 700; + src: local('News Cycle Bold'), local('NewsCycle-Bold'), url(../reveal.js-3.1.0/fonts/G28Ny31cr5orMqEQy6ljt8BaWKZ57bY3RXgXH6dOjZ0.ttf) format('truetype'); +} +@font-face { + font-family: 'Lato'; + font-style: normal; + font-weight: 400; + src: local('Lato Regular'), local('Lato-Regular'), url(../reveal.js-3.1.0/fonts/1EqTbJWOZQBfhZ0e3RL9uvesZW2xOQ-xsNqO47m55DA.ttf) format('truetype'); +} +@font-face { + font-family: 'Lato'; + font-style: normal; + font-weight: 700; + src: local('Lato Bold'), local('Lato-Bold'), url(../reveal.js-3.1.0/fonts/MZ1aViPqjfvZwVD_tzjjkwLUuEpTyoUstqEm5AMlJo4.ttf) format('truetype'); +} +@font-face { + font-family: 'Lato'; + font-style: italic; + font-weight: 400; + src: local('Lato Italic'), local('Lato-Italic'), url(../reveal.js-3.1.0/fonts/61V2bQZoWB5DkWAUJStypevvDin1pK8aKteLpeZ5c0A.ttf) format('truetype'); +} +@font-face { + font-family: 'Lato'; + font-style: italic; + font-weight: 700; + src: local('Lato Bold Italic'), local('Lato-BoldItalic'), url(../reveal.js-3.1.0/fonts/HkF_qI1x_noxlxhrhMQYECZ2oysoEQEeKwjgmXLRnTc.ttf) format('truetype'); +} + + + +/**@import url(https://fonts.googleapis.com/css?family=News+Cycle:400,700); +@import url(https://fonts.googleapis.com/css?family=Lato:400,700,400italic,700italic); +**/ +/** + * A simple theme for reveal.js presentations, similar + * to the default theme. The accent color is darkblue. 
+ * + * This theme is Copyright (C) 2012 Owen Versteeg, https://github.com/StereotypicalApps. It is MIT licensed. + * reveal.js is Copyright (C) 2011-2012 Hakim El Hattab, http://hakim.se + */ +/********************************************* + * GLOBAL STYLES + *********************************************/ +body { + background: #fff; + background-color: #fff; } + +.reveal { + font-family: 'Lato', sans-serif; + font-size: 36px; + font-weight: normal; + color: #000; } + +::selection { + color: #fff; + background: rgba(0, 0, 0, 0.99); + text-shadow: none; } + +.reveal .slides > section, .reveal .slides > section > section { + line-height: 1.3; + font-weight: inherit; } + +/********************************************* + * STATIC HEADER/FOOTER + *********************************************/ + +.reveal .header { + position: absolute; + top: 0px; + left: 0px; + right: 0px; + height: 25px; + text-align: center; + padding-left: 15px; + padding-right: 15px; + padding-bottom: 10px; + padding-top: 15px; + background-color: #041a9b; + color: white; + font-size: 0.5em; + z-index: 100; +} +.reveal .footer { + position: absolute; + bottom: 0px; + left: 0px; + right: 0px; + height: 40px; + text-align: center; + padding-left: 15px; + padding-right: 15px; + padding-bottom: 10px; + padding-top: 20px; + background-color: #041a9b; + color: white; + font-size: 0.5em; + z-index: 100; +} + + +/********************************************* + * HEADERS + *********************************************/ +.reveal h1, .reveal h2, .reveal h3, .reveal h4, .reveal h5, .reveal h6 { + margin: 0 0 20px 0; + color: #000; + font-family: 'News Cycle', Impact, sans-serif; + font-weight: normal; + line-height: 1.2; + letter-spacing: normal; + text-transform: none; + text-shadow: none; + word-wrap: break-word; } + +.reveal h1 { + font-size: 3.77em; } + +.reveal h2 { + font-size: 2.11em; } + +.reveal h3 { + font-size: 1.55em; } + +.reveal h4 { + font-size: 1em; } + +.reveal h1 { + text-shadow: none; } + 
+/*********************************************
+ * OTHER
+ *
+ * Defaults for text, lists, quotes, code,
+ * tables and inline helpers under .reveal.
+ *********************************************/
+.reveal p {
+  line-height: 1.3;
+  margin: 20px 0;
+}
+
+/* Type selector: matches an <attribution> element (not a class).
+   It is absolutely positioned 10px outside the right/bottom edge
+   of its positioned ancestor and rendered at half size. */
+.reveal attribution {
+  position: absolute;
+  right: -10px;
+  bottom: -10px;
+  font-size: 0.5em;
+  text-align: right;
+}
+
+/* Cap embedded media at 95% of the available width and height */
+.reveal img,
+.reveal video,
+.reveal iframe {
+  max-width: 95%;
+  max-height: 95%;
+}
+
+.reveal strong,
+.reveal b {
+  font-weight: bold;
+}
+
+.reveal em {
+  font-style: italic;
+}
+
+/* Lists: inline-block with left-aligned text and a 1em left margin */
+.reveal ol,
+.reveal dl,
+.reveal ul {
+  display: inline-block;
+  margin: 0 0 0 1em;
+  text-align: left;
+}
+
+.reveal ol {
+  list-style-type: decimal;
+}
+
+.reveal ul {
+  list-style-type: disc;
+}
+
+.reveal ul > li {
+  margin-top: 20px;
+}
+
+/* Bullet style by nesting depth: disc, then square, then circle */
+.reveal ul ul {
+  list-style-type: square;
+}
+
+.reveal ul ul ul {
+  list-style-type: circle;
+}
+
+/* Nested lists revert to block display and are indented 40px */
+.reveal ul ul,
+.reveal ul ol,
+.reveal ol ol,
+.reveal ol ul {
+  display: block;
+  margin-left: 40px;
+}
+
+.reveal dt {
+  font-weight: bold;
+}
+
+.reveal dd {
+  margin-left: 40px;
+}
+
+/* Quotations: no generated quote marks, italic text */
+.reveal q,
+.reveal blockquote {
+  quotes: none;
+}
+
+.reveal blockquote {
+  display: block;
+  position: relative;
+  width: 70%;
+  margin: 20px auto;
+  padding: 5px;
+  font-style: italic;
+  background: rgba(255, 255, 255, 0.05);
+  box-shadow: 0 0 2px rgba(0, 0, 0, 0.2);
+}
+
+.reveal blockquote p:first-child,
+.reveal blockquote p:last-child {
+  display: inline-block;
+}
+
+.reveal q {
+  font-style: italic;
+}
+
+/* Preformatted blocks: 90% wide, centred, small monospace text */
+.reveal pre {
+  display: block;
+  position: relative;
+  width: 90%;
+  margin: 20px auto;
+  font-family: monospace;
+  font-size: 0.55em;
+  line-height: 1.2em;
+  text-align: left;
+  word-wrap: break-word;
+  box-shadow: 0 0 6px rgba(0, 0, 0, 0.3);
+}
+
+.reveal code {
+  font-family: monospace;
+}
+
+/* Code inside <pre>: light text on a dark panel that scrolls
+   once it exceeds 400px in height */
+.reveal pre code {
+  display: block;
+  max-height: 400px;
+  padding: 5px;
+  overflow: auto;
+  word-wrap: normal;
+  color: #dcdcdc;
+  background: #3f3f3f;
+}
+
+/* Tables: centred, collapsed borders, header row underlined */
+.reveal table {
+  margin: auto;
+  border-collapse: collapse;
+  border-spacing: 0;
+}
+
+.reveal table th {
+  font-weight: bold;
+  border-bottom: 1px solid;
+}
+
+.reveal table th,
+.reveal table td {
+  padding: 0.2em 0.5em;
+  text-align: center;
+}
+
+/* Honour the HTML align attribute on individual cells */
+.reveal table th[align="left"],
+.reveal table td[align="left"] {
+  text-align: left;
+}
+
+.reveal table th[align="right"],
+.reveal table td[align="right"] {
+  text-align: right;
+}
+
+.reveal table tr:last-child td {
+  border-bottom: none;
+}
+
+.reveal sup {
+  vertical-align: super;
+}
+
+.reveal sub {
+  vertical-align: sub;
+}
+
+.reveal small {
+  display: inline-block;
+  font-size: 0.6em;
+  line-height: 1.2em;
+  vertical-align: top;
+}
+
+.reveal small * {
+  vertical-align: top;
+}
+
+/*********************************************
+ * LINKS
+ *
+ * Dark blue (#00008b) by default, lighter
+ * blue (#0000f1) on hover, 0.15s colour fade.
+ *********************************************/
+.reveal a {
+  color: #00008b;
+  text-decoration: none;
+  -webkit-transition: color 0.15s ease;
+  -moz-transition: color 0.15s ease;
+  transition: color 0.15s ease;
+}
+
+.reveal a:hover {
+  color: #0000f1;
+  border: none;
+  text-shadow: none;
+}
+
+.reveal .roll span:after {
+  color: #ffffff;
+  background: #00003f;
+}
+
+/*********************************************
+ * IMAGES
+ *********************************************/
+.reveal section img {
+  margin: 15px 0;
+  background: rgba(255, 255, 255, 0.12);
+}
+
+/* Opt-in frame: add class "bordered" to an image */
+.reveal section img.bordered {
+  border: 4px solid #000000;
+  box-shadow: 0 0 10px rgba(0, 0, 0, 0.15);
+}
+
+/* Linked images animate every property change over 0.15s */
+.reveal a img {
+  -webkit-transition: all 0.15s linear;
+  -moz-transition: all 0.15s linear;
+  transition: all 0.15s linear;
+}
+
+.reveal a:hover img {
+  border-color: #00008b;
+  background: rgba(255, 255, 255, 0.2);
+  box-shadow: 0 0 20px rgba(0, 0, 0, 0.55);
+}
+
+/*********************************************
+ * NAVIGATION CONTROLS
+ *
+ * One border edge per direction carries the
+ * accent colour; enabled controls switch to
+ * the lighter blue on hover.
+ *********************************************/
+.reveal .controls div.navigate-left,
+.reveal .controls div.navigate-left.enabled {
+  border-right-color: #00008b;
+}
+
+.reveal .controls div.navigate-right,
+.reveal .controls div.navigate-right.enabled {
+  border-left-color: #00008b;
+}
+
+.reveal .controls div.navigate-up,
+.reveal .controls div.navigate-up.enabled {
+  border-bottom-color: #00008b;
+}
+
+.reveal .controls div.navigate-down,
+.reveal .controls div.navigate-down.enabled {
+  border-top-color: #00008b;
+}
+
+.reveal .controls div.navigate-left.enabled:hover {
+  border-right-color: #0000f1;
+}
+
+.reveal .controls div.navigate-right.enabled:hover {
+  border-left-color: #0000f1;
+}
+
+.reveal .controls div.navigate-up.enabled:hover {
+  border-bottom-color: #0000f1;
+}
+
+.reveal .controls div.navigate-down.enabled:hover {
+  border-top-color: #0000f1;
+}
+
+/*********************************************
+ * PROGRESS BAR
+ *
+ * Translucent track; the accent-coloured
+ * span animates its width over 800ms.
+ *********************************************/
+.reveal .progress {
+  background: rgba(0, 0, 0, 0.2);
+}
+
+.reveal .progress span {
+  background: #00008b;
+  -webkit-transition: width 800ms cubic-bezier(0.26, 0.86, 0.44, 0.985);
+  -moz-transition: width 800ms cubic-bezier(0.26, 0.86, 0.44, 0.985);
+  transition: width 800ms cubic-bezier(0.26, 0.86, 0.44, 0.985);
+}
+
+/*********************************************
+ * SLIDE NUMBER
+ *********************************************/
+.reveal .slide-number {
+  color: #00008b;
+}