From aab72ced0fcdb892811df484efe718e6537ed839 Mon Sep 17 00:00:00 2001 From: Oliver Date: Mon, 9 Dec 2019 21:29:42 -0500 Subject: [PATCH] Revisions --- .../graphics/time_series_with_errors.svg | 1 + slides/talks/2019-5-VizierCaveats/index.html | 306 ++++++++++-------- slides/talks/2019-5-VizierCaveats/notes.txt | 58 ++++ 3 files changed, 238 insertions(+), 127 deletions(-) create mode 100644 slides/talks/2019-5-VizierCaveats/graphics/time_series_with_errors.svg diff --git a/slides/talks/2019-5-VizierCaveats/graphics/time_series_with_errors.svg b/slides/talks/2019-5-VizierCaveats/graphics/time_series_with_errors.svg new file mode 100644 index 00000000..7fa055ce --- /dev/null +++ b/slides/talks/2019-5-VizierCaveats/graphics/time_series_with_errors.svg @@ -0,0 +1 @@ +Time SeriesPotential ErrorsTimeseries020004000600080001000000.20.40.60.81 \ No newline at end of file diff --git a/slides/talks/2019-5-VizierCaveats/index.html b/slides/talks/2019-5-VizierCaveats/index.html index 599c9855..696c08bb 100644 --- a/slides/talks/2019-5-VizierCaveats/index.html +++ b/slides/talks/2019-5-VizierCaveats/index.html @@ -92,8 +92,11 @@
+

Act 1

Alice wants to analyze two unaligned time series.

+
+
@@ -159,12 +162,6 @@

Interpolate missing values

Hand tune around the switchover as-needed

-

-            SELECT a.time, a.reading AS reading_one
-                           b.reading AS reading_two
-            FROM series_one_buckets a, series_two_buckets b
-            WHERE a.time = b.time
-          
@@ -203,7 +200,7 @@

Act 2

-

Carol gets a dataset from Dave

+

Carol gets a dataset from Dave

@@ -239,6 +236,7 @@
+

Act 3

Eve needs to load a CSV file

→ @@ -255,7 +253,7 @@ I'm sorry, I can't do that, Eve.

- You have a stray comma on line 1252538. + You have a non-numerical value at position 1252538:24.

@@ -308,11 +306,42 @@
+

Wouldn't it be nice if...

-

There needs to be a better way!

+

Wouldn't it be nice if...

+

... this is what Bob saw:

+ +
+ +
+

Wouldn't it be nice if...

+

... this is what Carol saw:

+
TimeReading
15757310010
+ + + + + +
+ The data included an unexpected value: 'Non-Hispanic White'
The most similar known value is 'White Non-Hispanic' +
@@ -324,19 +353,33 @@
-

Declare a caveat when volating an assumption might...

-
    -
  • ... change one or more values
  • -
  • ... remove one or more records
  • -
  • ... add one or more record
  • -
  • (rarely) ... change the db schema
  • -
+

Why?

+

Propagation

+
+
Caveats...
+ +
+
... can go where the data goes
+
Derived values retain caveats on source data.
+
+ +
+
... stop where the data stops
+
Irrelevant caveats don't get propagated
+
+
+
+ +
+

Wouldn't it be nice if...

+

... this is what Eve saw:

+
-

So what is a caveat?

+

What is a Caveat?

A brief digression...

@@ -358,49 +401,98 @@
-

Possible tuples exist in at least one possible world. $$possible(\mathcal R) = \bigcup_{R \in \mathcal R} R$$

Certain tuples exist in all possible worlds. $$certain(\mathcal R) = \bigcap_{R \in \mathcal R} R$$

+

Uncertain tuples exist in at least one,
but not all possible worlds. $$uncertain(\mathcal R) = \bigcup_{R \in \mathcal R} R - certain(\mathcal R)$$

(not limited to set semantics)

+
+

A caveat is an assumption tied to one or more data elements (cells or rows).

+

If the assumption is wrong, so is the element.

+
+ +
+

Alice / Bob

+ +
+ +
+

Carol / Dave

+ +
+ +
+

Eve / Hal

+ +
+ +
+

An element has a caveat → The element is uncertain.

+ +

... and btw, here's why.

+
+ + +
+
+

Caveats

+ +
    +
  1. Story Time
  2. +
  3. What is a Caveat?
  4. +
  5. Applying Caveats
  6. +
  7. Propagating Caveats
  8. +
  9. Caveats Beyond SQL
  10. +
  11. The Vizier Notebook
  12. +
+
+ +
+

+            SELECT setting_1, setting_2, estimate
+            FROM Simulation;
+          
+ +

We want to indicate that the estimate column is only accurate if (for example) P ≠ NP.

+
+ +
+

caveat(value, assumption)

+ +

returns value, annotated with assumption.

+
+

             SELECT setting_1, setting_2,
-                   caveat(estimate, 'Only correct if phi is 42')
+                   caveat(estimate, 'Only correct if P ≠ NP')
                      AS estimate
             FROM Simulation;
           
- is the same as -

-            SELECT setting_1, setting_2, estimate
-            FROM Simulation;
-          
-

Caveat: If it turns out that phi ≠ 42,
all estimate values could be wrong.

-

(The first query annotates all `estimate` values with the caveat)

+

annotation is just a human-readable string.

-

caveat(value, assumption)

-

Each call fragments reality into multiple possible worlds.

-
-
-
value
-
Indicates the value in one of those worlds.
-
- -
-
assumption
-
being wrong indicates that we need a different world.
-
-
-
- -
-

Applying Caveats

-

a few examples...

+

Incomplete Databases

+

+ caveat() creates 2 sets of possible worlds: +

    +
  • The assumption holds: value is correct.
  • +
  • The assumption does not hold: value is unknown.
  • +
+

+

Alice / Bob

Mark multi-valued buckets (key repair).


     SELECT bucket, 
@@ -412,20 +504,24 @@
              FIRST(reading) AS reading,
              COUNT(*) AS bucket_size
       FROM sensor
+      GROUP BY bucket
     )
           

Interpolation is more complex... but similar.

+

Carol / Dave

Mark unexpected values the model wasn't trained on.


   SELECT
     CASE WHEN race_ethnicity 
-      IN ('white non-hispanic', 'black non-hispanic', /* ... */)
+      IN ('White Non-Hispanic', 'Black Non-Hispanic', /* ... */)
       THEN race_ethnicity
+
       ELSE caveat(race_ethnicity, 
                     'Unexpected race_ethnicity: ' & race_ethnicity)
+
     END, /* ... */
   FROM R
           
@@ -433,41 +529,29 @@
-

Spark's CSV loader can augment tables with a $\texttt{parse_error}$ column.

+

Eve / Hal


-        SELECT * FROM csv_file
-        WHERE 
-          CASE WHEN parse_error IS NULL THEN TRUE ELSE
-            caveat(FALSE, parse_error)
-          END;
+      SELECT /* ... */, 
+          CASE WHEN CAST(salary AS float) IS NULL THEN
+
+            caveat(NULL, 'Could not cast [ '&salary&' ] to float.')
+            
+            ELSE CAST(salary AS float) END AS salary
+      FROM raw_csv_data;
           
+
-
-

Why?

-

Propagation

-
-
Caveats...
- -
-
... can go where the data goes
-
Derived values retain caveats on source data.
-
- -
-
... stop where the data stops
-
Irrelevant caveats don't get propagated
-
-
-
- - +

Caveats

    -
  1. Propagating Caveats
  2. +
  3. Story Time
  4. +
  5. What is a Caveat?
  6. +
  7. Applying Caveats
  8. +
  9. Propagating Caveats
  10. Caveats Beyond SQL
  11. The Vizier Notebook
@@ -553,21 +637,6 @@ - Step 1: Which values are affected by a caveat - Step 2: Which caveats affect those values --> -
-

Caveats

- -
    -
  1. Propagating Caveats
  2. -
  3. Caveats Beyond SQL
  4. -
  5. The Vizier Notebook
  6. -
-
- -
-

What semantics do we want?

-

Caveatted data elements could be wrong.

-
-

Certain Data Elements: Elements guaranteed to be in the result in all possible worlds.

@@ -578,8 +647,9 @@

If a caveatted element can't affect an output element, don't propagate its caveats!

Propagate caveats to any data elements that could be affected by a change.

+
-

Challenge: How do we propagate caveats
without penalizing query evaluation.

+

Challenge: How do we propagate caveats
without penalizing query evaluation?

Don't!

@@ -649,19 +719,17 @@

-    CREATE VIEW by_language AS
+    CREATE VIEW survey_responses AS
       SELECT language, 
-          CASE WHEN CAST(salary AS float) IS NOT NULL THEN
-
+          CASE WHEN CAST(salary AS float) IS NULL THEN
             caveat(NULL, 'Could not cast [ '&salary&' ] to float.')
-            
             ELSE CAST(salary AS float) END AS salary
       FROM raw_csv_data;
           
becomes

-    CREATE VIEW by_language AS
+    CREATE VIEW survey_responses AS
       SELECT language, CAST(salary AS float) AS salary,
              FALSE                         AS _caveat_field_language,
              CAST(salary AS float) IS NULL AS _caveat_field_salary
@@ -674,7 +742,7 @@
         

             SELECT salary 
-            FROM by_language
+            FROM survey_responses
             WHERE language = 'Scala'
           
@@ -683,7 +751,7 @@ SELECT salary, _caveat_field_salary AS _caveat_field_salary, _caveat_row AND _caveat_field_language AS _caveat_row - FROM by_language + FROM survey_responses WHERE language = 'Scala'
@@ -692,7 +760,7 @@

             SELECT AVG(salary) AS salary
-            FROM by_language
+            FROM survey_responses
           
becomes @@ -700,7 +768,7 @@ SELECT salary, GROUP_OR(_caveat_field_salary) AS _caveat_field_salary, FALSE AS _caveat_row - FROM by_language + FROM survey_responses
@@ -708,21 +776,21 @@

             SELECT language, AVG(salary) AS salary
-            FROM by_language
+            FROM survey_responses
             GROUP BY language
           
... first we evaluate

       SELECT GROUP_OR(_caveat_field_language)
-      FROM by_language
+      FROM survey_responses
           

Can often be evaluated statically.

-

If TRUE

+

If GROUP BY has caveats


     SELECT language, AVG(salary) AS salary
@@ -736,7 +804,7 @@
         
-

If FALSE

+

If no GROUP BY caveats


         SELECT language, AVG(salary) AS salary
@@ -749,10 +817,6 @@
           
-
-

Ongoing work with Boris Glavic + Su Feng @ IIT

-
-