diff --git a/lib/slide_utils.rb b/lib/slide_utils.rb index 0261cc8b..50abebb1 100644 --- a/lib/slide_utils.rb +++ b/lib/slide_utils.rb @@ -61,4 +61,147 @@ def data_table(schema, data, params = {}) data.zip(row_args).map { |row, args| tag("tr", row.join, args) }.join("\n"), params.fetch(:table_args, {}) ) +end + +class RATreeNode + def initialize(type, params, children = []) + @type = type + @params = params + @children = children + @self_width = 100 + @self_height = 100 + case @type + when :table then + @self_width = 40*params[:name].length + @self_height = 50 + when :select, :join then + @self_width += 15*params[:pred].length + when :project then + @self_width += 15*params[:attrs].length + end + @height_above_children = 100 + end + + def subscript(x) + "#{x}" + end + + def symbol + case @type + when :select then " 𝛔#{subscript @params[:pred]}" + when :project then " 𝛑#{subscript @params[:attrs]}" + when :aggregate then "#{subscript @params[:groupby] if @params.has_key? :groupby}𝛄#{subscript @params[:aggregates]}" + when :join then "⋈#{subscript @params[:pred]}" + when :cross then "" + when :diff then " -" + when :union then "" + when :table then "#{@params[:name]}" + else type.to_s + end + end + + def height(config = {}) + unless @height + if @children.nil? + @height = @self_height + else + @height = @children.map { |c| c.height(config) }.max + (@self_height + @height_above_children) + end + end + @height + end + + def child_width(config = {}) + return 0 if @children.nil? + unless @child_width + separator_x = 20 + @child_width = @children.map { |c| c.width(config) }.sum + separator_x * (@children.size-1) + end + @child_width + end + + def width(config = {}) + unless @width + if @children.nil? 
+ @width = @self_width + else + @width = [ + child_width, + @self_width + ].max + end + end + @width + end + + def symbol_text(config) + symbol_x = width(config) / 2 - (@self_width / 2) + symbol_y = 0 + debug = "#{config.fetch(:indent, "")}\n" if config.fetch(:debug, false) + "#{debug}#{config.fetch(:indent, "")}#{symbol}\n" + end + + def render(config = {}) + return symbol_text(config) if @children.nil? + indent = config.fetch(:indent, "") + separator_x = 20 + separator_y = @height_above_children + children_x = [0] + children_x = [(width(config) - child_width(config)) / 2] if width(config) > child_width(config) + (1..@children.length).each { |i| children_x[i] = children_x[i-1] + @children[i-1].width + separator_x } + children_y = separator_y + @self_height + + child_blobs = @children.map.with_index do |c, i| + rendered = c.render(config.merge( indent: indent+" " )) + p rendered if config.fetch(:debug, false) + p children_x[i] if config.fetch(:debug, false) + "#{indent} \n#{rendered}\n" + end + + line_x = width(config) / 2 + line_y = (@self_height) * 1.1 + target_y = line_y + @height_above_children + + child_lines = @children.map.with_index do |c, i| + target_x = (children_x[i] + children_x[i+1] - separator_x) / 2 + "#{indent} \n" + end + + symbol_text(config)+child_blobs.join+child_lines.join + end +end + +def ra_table(name) + RATreeNode.new(:table, { name: name }, nil) +end +def ra_union(*children) + RATreeNode.new(:union, {}, children) +end +def ra_diff(*children) + RATreeNode.new(:diff, {}, children) +end +def ra_join(predicate, lhs, rhs) + RATreeNode.new(:table, { pred: predicate }, [lhs, rhs]) +end +def ra_aggregate(groupby, aggregates, input) + RATreeNode.new(:aggregate, { groupby: groupby, aggregates: aggregates}, [input]) +end +def ra_select(predicate, input) + RATreeNode.new(:select, { pred: predicate }, [input]) +end +def ra_project(attrs, input) + attrs = attrs.map { |k, v| "#{k} ← #{v}"}.join("; ") if attrs.is_a? 
Hash + RATreeNode.new(:project, { attrs: attrs }, [input]) +end + +def relational_algebra(params = {}) + indent = params.fetch(:indent, "") + ra = yield + scale = if ra.height > 500 then 500.0 / ra.height else 1 end + return ( + "#{indent}\n"+ + "#{indent}"+ + ra.render(params.merge( indent: indent+" " ))+ + "#{indent}\n" + ) end \ No newline at end of file diff --git a/src/teaching/cse-562/2019sp/slide/2019-05-01-IncompleteDBs.erb b/src/teaching/cse-562/2019sp/slide/2019-05-01-IncompleteDBs.erb new file mode 100644 index 00000000..a0df9fa1 --- /dev/null +++ b/src/teaching/cse-562/2019sp/slide/2019-05-01-IncompleteDBs.erb @@ -0,0 +1,326 @@ +--- +template: templates/cse4562_2019_slides.erb +title: Incomplete and Probabilistic Databases +date: May 1, 2019 +textbook: "PDB Concepts and C-Tables" +dependencies: + - lib/slide_utils.rb +--- +<% + require "slide_utils.rb" +%> +
+
+ +
+ +
+ + https://www.anishathalye.com/2017/07/25/synthesizing-adversarial-examples/ +
+ +
+ + Deep Learning Demystified +
+ +
+

What happens when you don't know your data precisely?

+
+ +
+

+      SELECT * FROM Posts WHERE image_class = 'Cat';
+    
+

+      SELECT COUNT(*) FROM Posts WHERE image_class = 'Cat';
+    
+

+      SELECT user_id FROM Posts
+      WHERE image_class = 'Cat'
+      GROUP BY user_id HAVING COUNT(*) > 10;
+    
+
+
+ +
+
+

Incomplete Databases

+

Probabilistic Databases

+
+ +
+
    +
  1. Representing Incompleteness
  2. +
  3. Querying Incomplete Data
  4. +
  5. Implementing It
  6. +
+
+ +
+ +
+ <%= data_table(["Name", "ZipCode"], [["Alice", "10003"], ["Bob","14260"]], name: "$R_1$", rowids: true) %> + or + <%= data_table(["Name", "ZipCode"], [["Alice", "10003"], ["Bob","19260"]], name: "$R_2$", rowids: true) %> +
+
+ +
+

Incomplete Database ($\mathcal D$): A set of possible worlds

+

Possible World ($D \in \mathcal D$): One (of many) database instances

+

(Require all possible worlds to have the same schema)

+
+ +
+

What does it mean to run a query on an incomplete database?

+

$Q(\mathcal D) = ?$

+

$Q(\mathcal D) = \{\;Q(D)\;|\;D \in \mathcal D \}$

+
+ +
+
+ <%= data_table(["Name", "ZipCode"], [["Alice", "10003"], ["Bob","14260"]], name: "$R_1$", rowids: true) %> + or + <%= data_table(["Name", "ZipCode"], [["Alice", "10003"], ["Bob","19260"]], name: "$R_2$", rowids: true) %> +
+

$$Q_1 = \pi_{Name}\big( \sigma_{state = \texttt{'NY'}} (R \bowtie_{zip} ZipLookups) \big)$$

+ + + + +
{ + <%= data_table(["Name"], [["Alice"], ["Bob"]], name: "$Q(R_1)$", rowids: true) %> + or + <%= data_table(["Name"], [["Alice"]], name: "$Q(R_2)$", rowids: true) %> + }
+ +
+ +
+
+ <%= data_table(["Name", "ZipCode"], [["Alice", "10003"], ["Bob","14260"]], name: "$R_1$", rowids: true) %> + or + <%= data_table(["Name", "ZipCode"], [["Alice", "10003"], ["Bob","19260"]], name: "$R_2$", rowids: true) %> +
+

$$Q_2 = \pi_{Name}\big( \sigma_{region = \texttt{'Northeast'}} (R \bowtie_{zip} ZipLookups) \big)$$

+ + + + +
{ + <%= data_table(["Name"], [["Alice"], ["Bob"]], name: "$Q(R_1)$", rowids: true) %> + or + <%= data_table(["Name"], [["Alice"], ["Bob"]], name: "$Q(R_2)$", rowids: true) %> + }
+
+ +
+
+ <%= data_table(["Name", "ZipCode"], [["Alice", "10003"], ["Bob","14260"]], name: "$R_1$", rowids: true) %> + or + <%= data_table(["Name", "ZipCode"], [["Alice", "10003"], ["Bob","19260"]], name: "$R_2$", rowids: true) %> +
+

$$Q_2 = \pi_{Name}\big( \sigma_{region = \texttt{'Northeast'}} (R \bowtie_{zip} ZipLookups) \big)$$

+ + + + +
{ + <%= data_table(["Name"], [["Alice"], ["Bob"]], name: "$Q(R_1)$ or $Q(R_2)$", rowids: true) %> + }
+
+ +
+
+
+ +
+
+ +
+
+

Challenge: There can be lots of possible worlds.

+
+ +
+

Observation: Possibilities for database creation break down into lots of independent choices.

+ +

Factorize the database.

+
+ +
+ +
+ <%= data_table(["Name", "ZipCode"], [["Alice", "10003"], ["Bob","14260"], ["Carol", "13201"]], name: "$R_1$", rowids: true) %> + + <%= data_table(["Name", "ZipCode"], [["Alice", "10003"], ["Bob","19260"], ["Carol", "18201"]], name: "$R_2$", rowids: true) %> +
+ <%= data_table(["Name", "ZipCode"], [["Alice", "10003"], ["Bob","14260"], ["Carol", "18201"]], name: "$R_3$", rowids: true) %> + + <%= data_table(["Name", "ZipCode"], [["Alice", "10003"], ["Bob","19260"], ["Carol", "13201"]], name: "$R_4$", rowids: true) %> +
+

Alice appears in all four databases.
The only differences are Bob's and Carol's zip codes.

+
+ +
+

List Out Choices

+ +
    +
  • $\texttt{bob} \in \{ 4, 9 \}$ (the second digit of Bob's zip code)
  • +
  • $\texttt{carol} \in \{ 3, 8 \}$ (the second digit of Carol's zip code)
  • +
+
+ + <% [false, true].each do |with_annotations| %> +
+ <%= data_table( + ["Name", "ZipCode"], + [ ["Alice", "10003"], + ["Bob","14260"], + ["Bob","19260"], + ["Carol","13201"], + ["Carol","18201"] + ], + name: "$\\mathcal R$", + rowids: true, + annotations: if with_annotations then [ + "always", + "if $\\texttt{bob} = 4$", + "if $\\texttt{bob} = 9$", + "if $\\texttt{carol} = 3$", + "if $\\texttt{carol} = 8$" + ] else nil end + ) %> +
+
+
+

$\big[\;\texttt{bob} \in \{4, 9\},\; \texttt{carol} \in \{3, 8\}\;\big]$

+
+
+ <% end %> +
+ <%= data_table( + ["Name", "ZipCode"], + [ ["Alice", "10003"], + ["Bob","14260"], + ["Bob","19260"], + ["Carol","13201"], + ["Carol","18201"] + ], + name: "$\\mathcal R$", + rowids: true, + annotations: [ + "a", + "b", + "c", + "d", + "e" + ] + ) %> +
+
+

Pick one of each: $\big[\;\{a\},\; \{b, c\},\; \{d, e\}\;\big]$

+

Set those variables to $T$ and all others to $F$

+
+ +
+

$R_1 \equiv \big[a \rightarrow T, b \rightarrow T, d \rightarrow T, * \rightarrow F\big]$

+ <%= data_table( + ["Name", "ZipCode"], + [ ["Alice", "10003"], + ["Bob","14260"], + ["Bob","19260"], + ["Carol","13201"], + ["Carol","18201"] + ], + name: "$\\mathcal R$", + rowids: true, + annotations: [ + "T (a)", + "T (b)", + "F (c)", + "T (d)", + "F (e)" + ] + ) %> +
+ +
+

Use provenance as before...

+

... but what about aggregates?

+
+ +
+

+                SELECT COUNT(*) 
+                FROM R NATURAL JOIN ZipCodeLookup 
+                WHERE State = 'NY'
+    
+

+ $$= \begin{cases} + 1 & \textbf{if } \texttt{bob} = 9 \wedge \texttt{carol} = 8\\ + 2 & \textbf{if } \texttt{bob} = 4 \wedge \texttt{carol} = 8 \\&\; \vee\; \texttt{bob} = 9 \wedge \texttt{carol} = 3\\ + 3 & \textbf{if } \texttt{bob} = 4 \wedge \texttt{carol} = 3 + \end{cases}$$

+

Problem: A combinatorial explosion of possibilities

+
+ +
+

Idea: Simplify the problem

+
    +
  1. Is a particular tuple Possible?
  2. +
  3. Is a particular tuple Certain?
  4. +
+
+ +
+
+
+
Certain Tuple
+
A tuple that appears in all possible worlds
+
$\forall D \in \mathcal D : t \in D$
+
+ +
+
Possible Tuple
+
A tuple that appears in at least one possible world
+
$\exists D \in \mathcal D : t \in D$
+
+
+
+ +
+

Non-aggregate queries

+
+
Is a tuple Certain?
+
Is the provenance polynomial a tautology?
+ +
Is a tuple Possible?
+
Is the provenance polynomial satisfiable (i.e., not a contradiction)?
+
+

Pick your favorite SAT solver, plug in and go

+
+ +
+

Aggregate queries

+ +

+ As before, factorize the possible outcomes +

+

+ $$1 + \{\;1\;\textbf{if}\;\texttt{bob} = 4\;\} + \{\;1\;\textbf{if}\;\texttt{carol} = 3\;\}$$ +

+

+ Not bigger than the aggregate input... +

+

+ ...but at least it only reduces to bin-packing
(or a similarly NP-hard problem.) +

+
+ +
+

In short, incomplete databases are limited, but have some uses.

+

What about probabilities?

+
+
diff --git a/src/teaching/cse-562/2019sp/slide/2019-05-03-Checkpoint4.erb b/src/teaching/cse-562/2019sp/slide/2019-05-03-Checkpoint4.erb new file mode 100644 index 00000000..770218e3 --- /dev/null +++ b/src/teaching/cse-562/2019sp/slide/2019-05-03-Checkpoint4.erb @@ -0,0 +1,355 @@ +--- +template: templates/cse4562_2019_slides.erb +title: Checkpoint 4 +date: May 3, 2019 +textbook: +dependencies: + - lib/slide_utils.rb +--- +<% + require "slide_utils.rb" +%> +
+
+

A few things first...

+
+ +
+ +
+ +
+

4/562 Databake Off @ 3:00

+

RSVP (limited space available) to participate

+
+ +
+

A note on optimization...

+ +

Lots of interesting strategies used in Checkpoint 3

+
    +
  • Pre-parsing
  • +
  • Column Stores
  • +
  • Cost-based Opt
  • +
  • Hyper-optimize the slowest query
  • +
+
+
+ +
+
+

Checkpoint 4

+

Implement Updates

+

(lambda-architecture edition)

+

Due May 20

+
+ +
+
    +
  • A stream of inserts, deletes, updates, and queries.
  • +
  • No restarts.
  • +
  • Answer queries as fast as possible.
  • +
  • Make sure query results account for DML effects.
  • +
+
+ +
+
+
Stage 0
+
10 minutes of prep
+
Stage 1
+
Inserts only
+
Stage 2
+
Inserts + Deletes
+
Stage 3
+
Inserts + Deletes + Updates
+
+

No restarts.

+
+
+ +
+
+

Do I need to implement block-based storage?

+

No (although you can).

+

Ok... so what else can I do?

+
+ +
+

Classical Databases

+ +
+ +
+

Problem 1: More indexes = Slower writes (bad for OLTP)

+

Problem 2: Fewer indexes = Slower reads (bad for OLAP)

+
+ +
+

What if you have both OLAP and OLTP workloads?

+
+ +
+

Idea: Weekly / Nightly / Hourly dump
from OLTP System to OLAP system.

+

(Index the data while dumping)

+
+ +
+

Problem: Not seeing the freshest data!

+
+ +
+

Better Idea: OLTP DB + OLAP DB.

+

OLTP DB has few indexes, but only stores recent updates.

+

OLAP DB has many indexes, and stores everything except recent updates.

+

Periodically migrate updates into OLAP DB.

+

(Lambda Architecture)

+
+ +
+

Checkpoint 4

+

Suggested Approach: Lambda-Lite

+
+
+ +
+
+

Handling Inserts

+
+ +
+

+              INSERT INTO FOO(A, B, C) VALUES (1, 2, 3);
+    
+
+ +
+ <%= + relational_algebra() do + ra_table("Orig") + end + %> +
+
+ <%= + relational_algebra(debug: false) do + ra_union( + ra_table("Orig"), + ra_table("New") + ) + end + %> +
+
+ +
+
+

Example

+
+
+

+      SELECT COUNT(*) FROM lineitem WHERE mktsegment = 'BUILDING';
+    
+
+
+ <%= + relational_algebra do + ra_aggregate(nil, "COUNT(*)", + ra_select("mktsegment = 'BUILDING'", + ra_table("lineitem") + ) + ) + end + %> +
+
+ <%= + relational_algebra do + ra_aggregate(nil, "COUNT(*)", + ra_select("mktsegment = 'BUILDING'", + ra_union( + ra_table("lineitem"), + ra_table("inserts") + ) + ) + ) + end + %> +
+
+ +
+
+

Handling Deletes

+
+ +
+

+                  DELETE FROM FOO WHERE A > 5;
+    
+
+ +
+ <%= + relational_algebra do + ra_table("Orig") + end + %> +
+ +
+ <%= + relational_algebra do + ra_diff( + ra_table("Orig"), + ra_table("New") + ) + end + %> +

... but that's not quite how SQL Delete works.

+
+ +
+

+                      DELETE FROM FOO WHERE A > 5;
+    
+
+ <%= + relational_algebra do + ra_select("A ≤ 5", + ra_table("FOO") + ) + end + %> +
+
+ +
+

+                      DELETE FROM Orig WHERE Something;
+    
+ <%= + relational_algebra do + ra_select("NOT Something", + ra_table("Orig") + ) + end + %> +
+
+ +
+
+

Example

+
+
+

+    INSERT INTO lineitem(...) VALUES (...);
+    INSERT INTO lineitem(...) VALUES (...);
+    DELETE FROM lineitem WHERE shipdate BETWEEN date('1997-10-01') 
+                                            AND date('1997-10-30');
+    SELECT COUNT(*) FROM lineitem WHERE mktsegment = 'BUILDING';
+    
+
+
+ <%= + relational_algebra do + ra_aggregate(nil, "COUNT(*)", + ra_select("mktsegment = 'BUILDING'", + ra_table("lineitem") + ) + ) + end + %> +
+
+ <%= + relational_algebra do + ra_aggregate(nil, "COUNT(*)", + ra_select("mktsegment = 'BUILDING'", + ra_union( + ra_table("lineitem"), + ra_table("inserts") + ) + ) + ) + end + %> +
+
+ <%= + relational_algebra do + ra_aggregate(nil, "COUNT(*)", + ra_select("mktsegment = 'BUILDING'", + ra_select("shipdate NOT BETWEEN ...", + ra_union( + ra_table("lineitem"), + ra_table("inserts") + ) + ) + ) + ) + end + %> +
+
+ +
+
+

Handling Updates

+
+ +
+

+            UPDATE Foo SET A = 1, B = 2 WHERE C = 3;
+    
+
+ +
+

+            UPDATE Foo SET A = 1, B = 2 WHERE C = 3;
+    
+ <%= + relational_algebra do + ra_union( + ra_select( "C = 3", + ra_project( { A: "1", B: "2", C: "C" }, + ra_table("Foo") + ) + ), + ra_select( "C ≠ 3", + ra_table("Foo") + ) + ) + end + %> +
+ +
+

+            UPDATE Foo SET A = 1, B = 2 WHERE C = 3;
+    
+ <%= + relational_algebra do + ra_project( { A: "CASE WHEN C = 3 THEN 1 ELSE A END", B: "CASE ...", C: "C"}, + ra_table("Foo") + ) + end + %> +

+      SELECT CASE WHEN C = 3 THEN 1 ELSE A END AS A,
+             CASE WHEN C = 3 THEN 2 ELSE B END AS B,
+             C AS C
+      FROM Foo;
+    
+
+
+ +
+

Final Advice

+
    +
  • This isn't the only way to implement updates.
  • +
  • Optimizer performance is crucial!
  • +
  • Consider periodically pausing to collapse updates
  • +
+
\ No newline at end of file diff --git a/src/teaching/cse-562/2019sp/slide/2019-05-06-ProbDBs.erb b/src/teaching/cse-562/2019sp/slide/2019-05-06-ProbDBs.erb new file mode 100644 index 00000000..f35c50b4 --- /dev/null +++ b/src/teaching/cse-562/2019sp/slide/2019-05-06-ProbDBs.erb @@ -0,0 +1,63 @@ +--- +template: templates/cse4562_2019_slides.erb +title: Incomplete and Probabilistic Databases +date: May 6, 2019 +textbook: "PDB Concepts and C-Tables" +dependencies: + - lib/slide_utils.rb +--- +<% + require "slide_utils.rb" +%> +
+
+

Idea: Make $\texttt{bob}$ and $\texttt{carol}$ random variables.

+
+ +
+

$$\texttt{bob} = \begin{cases} 4 & p = 0.8 \\ 9 & p = 0.2\end{cases}$$

+

$$\texttt{carol} = \begin{cases} 3 & p = 0.4 \\ 8 & p = 0.6\end{cases}$$

+
+ +
+

+ $$Q(\mathcal D) = \begin{cases} + 1 & \textbf{if } \texttt{bob} = 9 \wedge \texttt{carol} = 8\\ + 2 & \textbf{if } \texttt{bob} = 4 \wedge \texttt{carol} = 8 \\&\; \vee\; \texttt{bob} = 9 \wedge \texttt{carol} = 3\\ + 3 & \textbf{if } \texttt{bob} = 4 \wedge \texttt{carol} = 3 + \end{cases}$$

+

+ $$ = \begin{cases} + 1 & p = 0.2 \times 0.6\\ + 2 & p = 0.8 \times 0.6 + 0.2 \times 0.4\\ + 3 & p = 0.8 \times 0.4 \end{cases}$$ +

+

+ $$ = \begin{cases} + 1 & p = 0.12\\ + 2 & p = 0.56\\ + 3 & p = 0.32\end{cases}$$ +

+
+ +
+

+ $$Q(\mathcal D) = \begin{cases} + 1 & p = 0.12\\ + 2 & p = 0.56\\ + 3 & p = 0.32\end{cases}$$ +

+

$E\left[Q(\mathcal D)\right] = 0.12+1.12+0.96 = 2.20$

+

$P\left[Q(\mathcal D) \geq 2\right] = 0.56+0.32 = 0.88$

+
+ +
+

In general, computing probabilities exactly is #P-hard

+ +

... so we approximate

+
+ +
+

Idea 1: Sample. Pick 10 random possible worlds and compute results for each.

+
+
\ No newline at end of file diff --git a/src/teaching/cse-562/2019sp/slide/graphics/2019-05-03-DemoDay.png b/src/teaching/cse-562/2019sp/slide/graphics/2019-05-03-DemoDay.png new file mode 100644 index 00000000..f25b3b2e Binary files /dev/null and b/src/teaching/cse-562/2019sp/slide/graphics/2019-05-03-DemoDay.png differ