From 035e606ffb91b8229bcb37003f4a240008651a11 Mon Sep 17 00:00:00 2001 From: Oliver Date: Tue, 27 Apr 2021 01:29:27 -0400 Subject: [PATCH] slides --- src/teaching/cse-562/2021sp/index.erb | 5 +- .../2021sp/slide/2021-04-27-Checkpoint4.html | 243 ++++++++++++++++++ 2 files changed, 246 insertions(+), 2 deletions(-) create mode 100644 src/teaching/cse-562/2021sp/slide/2021-04-27-Checkpoint4.html diff --git a/src/teaching/cse-562/2021sp/index.erb b/src/teaching/cse-562/2021sp/index.erb index 7b487e49..a9a95a3a 100644 --- a/src/teaching/cse-562/2021sp/index.erb +++ b/src/teaching/cse-562/2021sp/index.erb @@ -124,10 +124,11 @@ schedule: topic: "Distributed Commit" materials: slides: slide/2021-04-22-ParallelUpdates.html - - date: "Apr. 26" - due: "Checkpoint 3" - date: "Apr. 27" + due: "Checkpoint 3" topic: "Indexing Review + Checkpoint 4" + materials: + slides: slide/2021-04-27-Checkpoint4.html - date: "Apr. 29" topic: "TBD" - date: "May 4" diff --git a/src/teaching/cse-562/2021sp/slide/2021-04-27-Checkpoint4.html b/src/teaching/cse-562/2021sp/slide/2021-04-27-Checkpoint4.html new file mode 100644 index 00000000..a120172b --- /dev/null +++ b/src/teaching/cse-562/2021sp/slide/2021-04-27-Checkpoint4.html @@ -0,0 +1,243 @@ +--- +template: templates/cse4562_2021_slides.erb +title: "Checkpoint 4" +date: April, 2021 +textbook: "Ch. 8.3-8.4, 14.1-14.2, 14.4" +--- + +
+ +
+

Checkpoint 4

+ +

Just like Checkpoint 3, but now...

+ +
    +
  • Tighter Bounds
  • +
  • More Start-Up Time
  • +
  • Hints About the Table
  • +
+
+
+ +
+ +
+

Primary Key

+ +

+    CREATE TABLE CUSTOMER(
+      CUSTKEY INT, 
+      NAME STRING, 
+      ADDRESS STRING, 
+      NATIONKEY INT, 
+      PHONE STRING, 
+      ACCTBAL FLOAT, 
+      MKTSEGMENT STRING, 
+      COMMENT STRING
+    ) USING csv OPTIONS(
+      path 'data/CUSTOMER.data', 
+      delimiter = '|',
+      primary_key = 'custkey'
+    )
+    
+ +
+ +
+

Primary Key

+

+    primary_key = 'custkey',
+    
+ or +

+    primary_key = 'orderkey,linenumber',
+    
+ +

comma-separated list describing the primary key of the table

+
+ +
+
+
+
The Reference Implementation
+
In-Memory Tables
+
Primary Key Index
+
Index Scans
+
+ +
+
Index-Nested Loop Join
+
Ideas From Past Submissions
+
Materialized Views
+
Gather Statistics
+
Secondary Indexes
+
+
+
+
+ +
+
+

In-Memory Tables

+ +

Time to scan SF 0.1 LINEITEM

+ + + + + + + + + + +
SourceTime
CSV on NVME SSD0.88745s
  
+

 

+
+ +
+

What does that 1s include?

+
    +
  • Disk to RAM
  • +
  • Buffering in the OS
  • +
  • IPC
  • +
  • Split on |
  • +
  • Parse Int, Float, Date
  • +
  • Iterate Over Each Tuple
  • +
+
+ +
+

In-Memory Tables

+ +

Time to scan SF 0.1 LINEITEM

+ + + + + + + + + + +
SourceTime
CSV on NVME SSD0.88745s
IndexedSeq[InternalRow]0.018s
+

~50x speedup

+
+ +
+

Takeaway: Read data in at CREATE TABLE

+
+ +
+

Compiler

+ +

+      Table(...)
+    
+

Return an iterator over the preloaded table.

+
+
+ +
+
+

Primary Key Index

+ +

Time to filter SF 0.1 LINEITEM for one orderkey

+ + + + + + + + + + + + + + +
SourceTime
CSV on NVME SSD0.9196s
IndexedSeq[InternalRow]0.0624s
  
+

 

+
+ +
+

What does that 62ms include?

+
    +
  • EqualTo(...).eval(...) for each row
  • +
  • Iterate Over Each Tuple
  • +
+
+ +
+

Primary Key Index

+ +

Time to filter SF 0.1 LINEITEM for one orderkey

+ + + + + + + + + + + + + + +
SourceTime
CSV on NVME SSD0.9196s
IndexedSeq[InternalRow]0.0624s
Sorted IndexedSeq[InternalRow] + Bin Search0.0008s
+

~80x speedup

+
+ +
+

Takeaway: Sort on primary key and binary search.

+
+ +
+

Compiler

+

+      Filter(expression, Table(...))
+    
+

If expression is a ...

+
+ +
+

If expression is a ...

+
+
EqualTo between the Table key and a constant
+
Binary search for the key!
+
[Greater|Less]Than[OrEqual] between the Table key and a constant
+
Binary search for the lower/upper bound
+
Greater and Less together
+
Binary search for the lower and upper bound
+
One of the above and more
+
Binary search + Filter the rest
+
+
+ +
+

But TPC-H doesn't have filters on keys...

+
+ +
+

Also Index

+

+      USING csv OPTIONS(
+      path '../TPCH/LINEITEM.csv', 
+      delimiter = '|',
+      primary_key = 'orderkey,linenumber',
+      tree_index = 'shipdate',
+      hash_index = 'linestatus|shipmode'
+    )
+    
+

tree_index and hash_index are |-separated lists of indexes; each index is a ,-separated list of columns.

+
+
+ +
+ Questions? +
\ No newline at end of file