From ce252b6b230fd60678dac04569d8c93f757e0332 Mon Sep 17 00:00:00 2001 From: Oliver Date: Sun, 21 Feb 2021 22:35:33 -0500 Subject: [PATCH] Slides --- src/teaching/cse-562/2021sp/index.erb | 4 + .../cse-562/2021sp/slide/2021-02-23-ExtRA.erb | 517 +++++++++++ .../graphics/2021-02-23-2PassHashAgg.svg | 359 ++++++++ .../slide/graphics/2021-02-23-Merge-Run.svg | 806 ++++++++++++++++++ .../slide/graphics/2021-02-23-Merge-Tree.svg | 679 +++++++++++++++ .../slide/graphics/2021-02-23-ReplSort-1.svg | 194 +++++ .../slide/graphics/2021-02-23-ReplSort-2.svg | 194 +++++ .../slide/graphics/2021-02-23-ReplSort-3.svg | 193 +++++ .../slide/graphics/2021-02-23-ReplSort-4.svg | 193 +++++ .../slide/graphics/2021-02-23-ReplSort-5.svg | 194 +++++ .../slide/graphics/2021-02-23-ReplSort-6.svg | 194 +++++ .../slide/graphics/2021-02-23-ReplSort-7.svg | 167 ++++ .../slide/graphics/2021-02-23-ReplSort-8.svg | 167 ++++ .../slide/graphics/2021-02-23-Sort-Run.svg | 529 ++++++++++++ 14 files changed, 4390 insertions(+) create mode 100644 src/teaching/cse-562/2021sp/slide/2021-02-23-ExtRA.erb create mode 100644 src/teaching/cse-562/2021sp/slide/graphics/2021-02-23-2PassHashAgg.svg create mode 100644 src/teaching/cse-562/2021sp/slide/graphics/2021-02-23-Merge-Run.svg create mode 100644 src/teaching/cse-562/2021sp/slide/graphics/2021-02-23-Merge-Tree.svg create mode 100644 src/teaching/cse-562/2021sp/slide/graphics/2021-02-23-ReplSort-1.svg create mode 100644 src/teaching/cse-562/2021sp/slide/graphics/2021-02-23-ReplSort-2.svg create mode 100644 src/teaching/cse-562/2021sp/slide/graphics/2021-02-23-ReplSort-3.svg create mode 100644 src/teaching/cse-562/2021sp/slide/graphics/2021-02-23-ReplSort-4.svg create mode 100644 src/teaching/cse-562/2021sp/slide/graphics/2021-02-23-ReplSort-5.svg create mode 100644 src/teaching/cse-562/2021sp/slide/graphics/2021-02-23-ReplSort-6.svg create mode 100644 src/teaching/cse-562/2021sp/slide/graphics/2021-02-23-ReplSort-7.svg create mode 100644 src/teaching/cse-562/2021sp/slide/graphics/2021-02-23-ReplSort-8.svg create mode 100644 src/teaching/cse-562/2021sp/slide/graphics/2021-02-23-Sort-Run.svg diff --git a/src/teaching/cse-562/2021sp/index.erb b/src/teaching/cse-562/2021sp/index.erb index 7bc828e8..97b4db86 100644 --- a/src/teaching/cse-562/2021sp/index.erb +++ b/src/teaching/cse-562/2021sp/index.erb @@ -34,8 +34,12 @@ schedule: slides: slide/2021-02-18-QueryAlgorithms.html - date: "Feb. 23" topic: "Extended Relational Algebra" + materials: + slides: slide/2021-02-23-ExtRA.html - date: "Feb. 25" topic: "Physical Data Layout" + materials: + slides: slide/2021-02-25-PhysicalLayout.html - date: "Mar. 2" topic: "Indexes: Tree-Based, Hash" - date: "Mar. 4" diff --git a/src/teaching/cse-562/2021sp/slide/2021-02-23-ExtRA.erb b/src/teaching/cse-562/2021sp/slide/2021-02-23-ExtRA.erb new file mode 100644 index 00000000..a9369e10 --- /dev/null +++ b/src/teaching/cse-562/2021sp/slide/2021-02-23-ExtRA.erb @@ -0,0 +1,517 @@ +--- +template: templates/cse4562_2019_slides.erb +title: "Extended Relational Algebra" +date: February 13, 2019 +textbook: "Ch. 5.2, 15.4" +--- +
+
+

Extended Relational Algebra

+ +
+
Set/Bag Operations
+
Select ($\sigma$), Project ($\pi$), Join ($\bowtie$), Union ($\cup$)
+ +
Bag Operations
+
Distinct ($\delta$), Outer Joins (⟗)
+ +
List Operations
+
Sort ($\tau$), Limit (L)
+ +
Arithmetic Operations
+
Extended Projection ($\pi$), Aggregation ($\Sigma$), Grouping ($\gamma$)
+
+ +
+ +
+
+
+

Extended Projection

+ +

Like normal projection, but can create new columns

+ + $\pi_{M \leftarrow A+B*C,\; N \leftarrow 2}(R)$ +

produces 1 row for every row of R, with 2 columns: M and N

+
+
+ +
+
+

Outer Join

+

... but first

+
+ +
+

NULL Values

+ +
    +
  • Field values can be unknown or inapplicable.
      +
    • A tree with an unknown species.
    • +
    • The street of a tree in a park.
    • +
    • The '.' on many of your ID cards
    • +
  • +
  • SQL provides a special NULL value for these cases
  • +
+

NULL makes things more complicated.

+
+ +
+ $$\textbf{Trees.SPC_COMMON} = \texttt{'Brooklyn'}$$ + +

+ What happens if Trees.SPC_COMMON is NULL? +

+ +
+ $$\texttt{NULL} = \textbf{'Brooklyn'} \equiv \textbf{Unknown}$$ +
+
+ +
+ + + + + + + + + + +
UnknownANDUnknown$\equiv$Unknown
UnknownANDTrue$\equiv$Unknown
UnknownANDFalse$\equiv$False
UnknownORUnknown$\equiv$Unknown
UnknownORTrue$\equiv$True
UnknownORFalse$\equiv$Unknown
NOTUnknown$\equiv$Unknown
+ +

WHERE clauses eliminate all non-True rows

+
+
+ +
+
+ $$Streets \bowtie_{StreetName} Trees$$ + +

What happens if some streets have no trees?

+
+ +
+

Outer Join (⟗, ⟕, ⟖)

+ +
    +
  1. Include all results from the normal (inner) join.
  2. +
  3. Also include rows that don't get joined.
  4. +
+
+ +
+

Outer Join

+ +
+
Inner Join
+
Normal, plain, simple join
+ +
Left Outer Join (⟕)
+
Include un-joined rows from the left hand side
+ +
Right Outer Join (⟖)
+
Include un-joined rows from the right hand side
+ +
[Full] Outer Join (⟗)
+
Include un-joined rows from either side
+
+
+
+
+

Streets

+ + + + + + +
DistrictStreetName
AApple St.
BBorder St.
CCoventry St.
DDurham St.
+
+
+

Trees

+ + + + + + +
TreeIdStreetName
1Apple St.
2Border St.
3Durham St.
4Elicott St.
+
+
+
+

Streets ⟗ Trees

+ + + + + + + +
DistrictStreetName TreeIdStreetName
A Apple St. 1 Apple St.
B Border St. 2 Border St.
C Coventry St.NULL NULL
D Durham St. 3 Durham St.
NULL NULL 4 Elicott St.
+

Only LEFT outer join

+

Only RIGHT outer join

+
+
+ +
+
+

Sort / Limit

+ +

+ $$\tau_{A}(R)$$ + The tuples of $R$ in ascending order according to 'A' +

+ +

+ $$\textbf{L}_{n}(R)$$ + The first $n$ tuples of R +

(Typically combined with sort. If not, pick arbitrarily.)
+

+
+ +
+

Sort

+ +

+ Pick your favorite sort algorithm. +

+ +

What happens if you don't have enough memory?

+
+ +
+

Key Idea: Merging 2 sorted lists requires $O(1)$ memory.

+
+ +
+

2-Way Sort

+
+
Pass 1
+
Create lots of (small) sorted lists.
+ +
Pass 2+
+
Merge sorted lists of size $N$ into sorted lists of size $2N$
+
+
+ +
+

Pass 1: Create Sorted Runs

+ +
+ +
+

Pass 2: Merge Sorted Runs

+ +
+ +
+

Repeat Pass 2 As Needed.

+
+ +
+

What's the bottleneck?

+ +

IO Cost: $O(N \cdot \lceil\log_2(N)\rceil)$
(with $N$ blocks)

+
+ +
+ +
+ +
+

Using More Memory

+ +
+
Pass 1
+
Sort Bigger Buffers
+
Re-Use Memory When Done
+ +
Pass 2
+
Merge $K$ Runs Simultaneously: $O(N \cdot \lceil\log_K(N)\rceil)$ IO
+
+
+ +
+

Replacement Sort

+ +
+ +
+

Replacement Sort

+ +
+ +
+

Replacement Sort

+ +
+ +
+

Replacement Sort

+ +
+ +
+

Replacement Sort

+ +
+ +
+

Replacement Sort

+ +
+ +
+

Replacement Sort

+ +
+ +
+

Replacement Sort

+ +
+ +
+

On average, we'll get runs of size $2 \cdot |WS|$

+
+
+ +
+
+

Aggregation

+ +
+
+
Normal Aggregates
+
SELECT COUNT(*) FROM R
+
SELECT SUM(A) FROM R
+
+ +
+
Group-By Aggregates
+
SELECT A, SUM(B) FROM R GROUP BY A
+
+ +
+
Distinct
+
SELECT DISTINCT A FROM R
+
SELECT A FROM R GROUP BY A
+
+
+
+ +
+

Normal Aggregates

+ + + + + + + + + +
TREE_IDSPC_COMMONBORONAMETREE_DBH
180683'red maple''Queens'3
315986'pin oak''Queens'21
204026'honeylocust''Brooklyn'3
204337'honeylocust''Brooklyn'10
189565'American linden''Brooklyn'21
... and 683783 more
+ +
+ SELECT COUNT(*) FROM TREES +
+
+ +
+ + + + + + + + + + + + + + + +
TREE_IDSPC_COMMONBORONAMETREE_DBH
COUNT = 0
180683'red maple''Queens'3
COUNT = 1
315986'pin oak''Queens'21
COUNT = 2
204026'honeylocust''Brooklyn'3
COUNT = 3
204337'honeylocust''Brooklyn'10
COUNT = 4
189565'American linden''Brooklyn'21
COUNT = 5
... and 683783 more
COUNT = 683788
+
+ +
+
+ SELECT SUM(TREE_DBH) FROM TREES +
+ + + + + + + + + + + + + + +
TREE_IDSPC_COMMONBORONAMETREE_DBH
SUM = 0
180683'red maple''Queens'3
SUM = 3
315986'pin oak''Queens'21
SUM = 24
204026'honeylocust''Brooklyn'3
SUM = 27
204337'honeylocust''Brooklyn'10
SUM = 37
189565'American linden''Brooklyn'21
SUM = 58
... and 683783 more
+
+ +
+

Basic Aggregate Pattern

+

This is also sometimes called a "fold"

+
+
Init
+
Define a starting value for the accumulator
+
Fold(Accum, New)
+
Merge a new value into the accumulator
+
+
+ +
+

COUNT(*)

+
+
Init
+
$0$
+
Fold(Accum, New)
+
$Accum + 1$
+
+
+ +
+

SUM(A)

+
+
Init
+
$0$
+
Fold(Accum, New)
+
$Accum + New$
+
+
+ +
+

AVG(A)

+
+
Init
+
$\{ sum = 0, count = 0 \}$
+
Fold(Accum, New)
+
$\{ sum = Accum.sum + New, \\\;count = Accum.count + 1\}$
+
Finalize(Accum)
+
$\frac{Accum.sum}{Accum.count}$
+
+
+ +
+

Basic Aggregate Pattern

+
+
Init
+
Define a starting value for the accumulator
+
Fold(Accum, New)
+
Merge a new value into the accumulator
+
Finalize(Accum)
+
Extract the aggregate from the accumulator.
+
+
+ +
+

Basic Aggregate Types

+

Grey et. al. "Data Cube: A Relational Aggregation Operator Generalizing Group-By, Cross-Tab, and Sub-Totals

+ +
+
Distributive
+
Finite-sized accumulator and doesn't need a finalize (COUNT, SUM)
+
Algebraic
+
Finite-sized accumulator but needs a finalize (AVG)
+
Holistic
+
Unbounded accumulator (MEDIAN)
+
+
+
+ +
+
+

Group-By Aggregates

+
+ SELECT SPC_COMMON, COUNT(*) FROM TREES GROUP BY SPC_COMMON +
+
+ +
+

Naive Idea: Keep a separate accumulator for each group

+
+ +
+ + + + + + + + + + + +
TREE_IDSPC_COMMONBORONAMETREE_DBH
{}
180683'red maple''Queens'3
{ 'red maple' = 1 }
204337'honeylocust''Brooklyn'10
{ 'red maple' = 1, 'honeylocust' = 1 }
315986'pin oak''Queens'21
{ 'red maple' = 1, 'honeylocust' = 1, 'pin oak' = 1 }
204026'honeylocust''Brooklyn'3
{ 'red maple' = 1, 'honeylocust' = 2, 'pin oak' = 1 }
+
+ +
+

What could go wrong?

+
+ +
+

Alternative Grouping Algorithms

+ +
+
2-pass Hash Aggregate
+
Like 2-pass Hash Join: Distribute groups across buckets, then do an in-memory aggregate for each bucket.
+ +
Sort-Aggregate
+
Like Sort-Merge Join: Sort data by groups, then group elements will be adjacent.
+
+
+ +
+ +
+ +
+ + + + + + + + + + + + + +
TREE_IDSPC_COMMONBORONAMETREE_DBH
{}
204337'honeylocust''Brooklyn'10
{ 'honeylocust' = 1 }
204026'honeylocust''Brooklyn'3
{ 'honeylocust' = 2 }
... and more
315986'pin oak''Queens'21
{ 'honeylocust' = 3206, 'pin oak' = 1 }
... and more
180683'red maple''Queens'3
{ 'pin oak' = 53814, 'red maple' = 1 }
+
+
+ +
+
+
Set/Bag Operations
+
Select ($\sigma$), Project ($\pi$), Join ($\bowtie$), Union ($\cup$)
+ +
Bag Operations
+
Distinct ($\delta$), Outer Joins (⟗)
+ +
List Operations
+
Sort ($\tau$), Limit (L)
+ +
Arithmetic Operations
+
Extended Projection ($\pi$), Aggregation ($\sigma$), Grouping ($\gamma$)
+
+
+ +
+

Next Time...

+

Physical Layouts

+
\ No newline at end of file diff --git a/src/teaching/cse-562/2021sp/slide/graphics/2021-02-23-2PassHashAgg.svg b/src/teaching/cse-562/2021sp/slide/graphics/2021-02-23-2PassHashAgg.svg new file mode 100644 index 00000000..411f8578 --- /dev/null +++ b/src/teaching/cse-562/2021sp/slide/graphics/2021-02-23-2PassHashAgg.svg @@ -0,0 +1,359 @@ + + + + + + + + + + image/svg+xml + + + + + + + + + + h(A) = 0 + h(A) = 1 + + + + + 1 + + + + 1 + + + + 3 + + + + 3 + + + + 2 + + + + 2 + + + + 1 + + + + 1 + + + + + + 1: Count = 2 + + + + 2: Count = 1 + + + + 3: Count = 1 + + + diff --git a/src/teaching/cse-562/2021sp/slide/graphics/2021-02-23-Merge-Run.svg b/src/teaching/cse-562/2021sp/slide/graphics/2021-02-23-Merge-Run.svg new file mode 100644 index 00000000..e9ecce4c --- /dev/null +++ b/src/teaching/cse-562/2021sp/slide/graphics/2021-02-23-Merge-Run.svg @@ -0,0 +1,806 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/teaching/cse-562/2021sp/slide/graphics/2021-02-23-Merge-Tree.svg b/src/teaching/cse-562/2021sp/slide/graphics/2021-02-23-Merge-Tree.svg new file mode 100644 index 00000000..4bfcaa5c --- /dev/null +++ b/src/teaching/cse-562/2021sp/slide/graphics/2021-02-23-Merge-Tree.svg @@ -0,0 +1,679 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + + 1 + 2 + 4 + + log2(N) + + + + + + + + + + + + + + + N + + + + + + + + + + + + + + + + + 1 + + diff --git a/src/teaching/cse-562/2021sp/slide/graphics/2021-02-23-ReplSort-1.svg b/src/teaching/cse-562/2021sp/slide/graphics/2021-02-23-ReplSort-1.svg new file mode 100644 index 00000000..5aa69c37 --- /dev/null +++ b/src/teaching/cse-562/2021sp/slide/graphics/2021-02-23-ReplSort-1.svg @@ -0,0 +1,194 @@ + + + + + + + + + + image/svg+xml + + + + + + + + 12 + 8 + 10 + ... + Working Set + Input + Output + 35 + 2 + 4 + ... + + diff --git a/src/teaching/cse-562/2021sp/slide/graphics/2021-02-23-ReplSort-2.svg b/src/teaching/cse-562/2021sp/slide/graphics/2021-02-23-ReplSort-2.svg new file mode 100644 index 00000000..f6b7e4c2 --- /dev/null +++ b/src/teaching/cse-562/2021sp/slide/graphics/2021-02-23-ReplSort-2.svg @@ -0,0 +1,194 @@ + + + + + + + + + + image/svg+xml + + + + + + + + 12 + 8 + 10 + ... + Working Set + Input + Output + 35 + 2 + 4 + ... + + diff --git a/src/teaching/cse-562/2021sp/slide/graphics/2021-02-23-ReplSort-3.svg b/src/teaching/cse-562/2021sp/slide/graphics/2021-02-23-ReplSort-3.svg new file mode 100644 index 00000000..87559e28 --- /dev/null +++ b/src/teaching/cse-562/2021sp/slide/graphics/2021-02-23-ReplSort-3.svg @@ -0,0 +1,193 @@ + + + + + + + + image/svg+xml + + + + + + + + 12 + 8 + 10 + ... + Working Set + Input + Output + 35 + 2 + 4 + ... + + diff --git a/src/teaching/cse-562/2021sp/slide/graphics/2021-02-23-ReplSort-4.svg b/src/teaching/cse-562/2021sp/slide/graphics/2021-02-23-ReplSort-4.svg new file mode 100644 index 00000000..45f5bb9f --- /dev/null +++ b/src/teaching/cse-562/2021sp/slide/graphics/2021-02-23-ReplSort-4.svg @@ -0,0 +1,193 @@ + + + + + + + + image/svg+xml + + + + + + + + 10 + 8 + 12 + ... + Working Set + Input + Output + 35 + 2 + ... + 4 + + diff --git a/src/teaching/cse-562/2021sp/slide/graphics/2021-02-23-ReplSort-5.svg b/src/teaching/cse-562/2021sp/slide/graphics/2021-02-23-ReplSort-5.svg new file mode 100644 index 00000000..e365a9e8 --- /dev/null +++ b/src/teaching/cse-562/2021sp/slide/graphics/2021-02-23-ReplSort-5.svg @@ -0,0 +1,194 @@ + + + + + + + + + + image/svg+xml + + + + + + + + 10 + 8 + 12 + ... + Working Set + Input + Output + 35 + 2 + 4 + ... + + diff --git a/src/teaching/cse-562/2021sp/slide/graphics/2021-02-23-ReplSort-6.svg b/src/teaching/cse-562/2021sp/slide/graphics/2021-02-23-ReplSort-6.svg new file mode 100644 index 00000000..02c49355 --- /dev/null +++ b/src/teaching/cse-562/2021sp/slide/graphics/2021-02-23-ReplSort-6.svg @@ -0,0 +1,194 @@ + + + + + + + + + + image/svg+xml + + + + + + + + 10 + 8 + 12 + ... + Working Set + Input + Output + 35 + 2 + 4 + ... + + diff --git a/src/teaching/cse-562/2021sp/slide/graphics/2021-02-23-ReplSort-7.svg b/src/teaching/cse-562/2021sp/slide/graphics/2021-02-23-ReplSort-7.svg new file mode 100644 index 00000000..2746c5fb --- /dev/null +++ b/src/teaching/cse-562/2021sp/slide/graphics/2021-02-23-ReplSort-7.svg @@ -0,0 +1,167 @@ + + + + + + + + + + image/svg+xml + + + + + + + + 12 + ... + Working Set + Input + Output + 2 + 4 + ... + max = 10 + + diff --git a/src/teaching/cse-562/2021sp/slide/graphics/2021-02-23-ReplSort-8.svg b/src/teaching/cse-562/2021sp/slide/graphics/2021-02-23-ReplSort-8.svg new file mode 100644 index 00000000..c88d6589 --- /dev/null +++ b/src/teaching/cse-562/2021sp/slide/graphics/2021-02-23-ReplSort-8.svg @@ -0,0 +1,167 @@ + + + + + + + + + + image/svg+xml + + + + + + + + 12 + ... + Working Set + Input + Output + 2 + 4 + ... + max = 10 + + diff --git a/src/teaching/cse-562/2021sp/slide/graphics/2021-02-23-Sort-Run.svg b/src/teaching/cse-562/2021sp/slide/graphics/2021-02-23-Sort-Run.svg new file mode 100644 index 00000000..f40228ee --- /dev/null +++ b/src/teaching/cse-562/2021sp/slide/graphics/2021-02-23-Sort-Run.svg @@ -0,0 +1,529 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + image/svg+xml + + + + + + + + + σ/π/⋈ + + + τ + + + + + + + + + + + + + + + + + + + + +