slides

2021-04-27 01:29:27 -04:00 · 2021-04-27 01:29:27 -04:00 · 035e606ffb
parent d0d01bef23
commit 035e606ffb
2 changed files with 246 additions and 2 deletions
--- a/src/teaching/cse-562/2021sp/index.erb
+++ b/src/teaching/cse-562/2021sp/index.erb
@ -124,10 +124,11 @@ schedule:
    topic: "Distributed Commit"
    materials:
      slides: slide/2021-04-22-ParallelUpdates.html
-  - date: "Apr. 26"
-    due: "Checkpoint 3"
  - date: "Apr. 27"
+    due: "Checkpoint 3"
    topic: "Indexing Review + Checkpoint 4"
+    materials: 
+      slides: slide/2021-04-27-Checkpoint4.html
  - date: "Apr. 29"
    topic: "TBD"
  - date: "May 4"
--- a/src/teaching/cse-562/2021sp/slide/2021-04-27-Checkpoint4.html
+++ b/src/teaching/cse-562/2021sp/slide/2021-04-27-Checkpoint4.html
@ -0,0 +1,243 @@
+---
+template: templates/cse4562_2021_slides.erb
+title: "Checkpoint 4"
+date: April, 2021
+textbook: "Ch. 8.3-8.4, 14.1-14.2, 14.4"
+---
+
+<section>
+
+  <section>
+    <h2>Checkpoint 4</h2>
+
+    <p>Just like Checkpoint 3, but now...</p>
+
+    <ul>
+      <li class="fragment">Tighter Bounds</li>
+      <li class="fragment">More Start-Up Time</li>
+      <li class="fragment">Hints About the Table</li>
+    </ul>
+  </section>
+</section>
+
+<section>
+  
+  <section>
+    <h3>Primary Key</h3>
+
+    <pre><code class="sql">
+    CREATE TABLE CUSTOMER(
+      CUSTKEY INT, 
+      NAME STRING, 
+      ADDRESS STRING, 
+      NATIONKEY INT, 
+      PHONE STRING, 
+      ACCTBAL FLOAT, 
+      MKTSEGMENT STRING, 
+      COMMENT STRING
+    ) USING csv OPTIONS(
+      path 'data/CUSTOMER.data', 
+      delimiter = '|',
+      primary_key = 'custkey'
+    )
+    </code></pre>
+
+  </section>
+
+  <section>
+    <h3>Primary Key</h3>
+    <pre><code class="sql">
+    primary_key = 'custkey',
+    </code></pre>
+    or
+    <pre><code class="sql">
+    primary_key = 'orderkey,linenumber',
+    </code></pre>
+
+    <p>comma-separated list describing the primary key of the table</p>
+  </section>
+
+  <section>
+    <dl>
+      <div class="fragment highlight-blue" data-fragment-index="5">
+        <dt>The Reference Implementation</dt>
+        <dd class="fragment" data-fragment-index="1">In-Memory Tables</dd>
+        <dd class="fragment" data-fragment-index="2">Primary Key Index</dd>
+        <dd class="fragment" data-fragment-index="3">Index Scans</dd>
+      </div>
+
+      <div class="fragment highlight-grey" data-fragment-index="5">
+        <dd class="fragment" data-fragment-index="4">Index-Nested Loop Join</dd>
+        <dt class="fragment" data-fragment-index="4">Ideas From Past Submissions</dt>
+        <dd class="fragment" data-fragment-index="4">Materialized Views</dd>
+        <dd class="fragment" data-fragment-index="4">Gather Statistics</dd>
+        <dd class="fragment" data-fragment-index="4">Secondary Indexes</dd>
+      </div>
+    </dl>
+  </section>
+</section>
+
+<section>
+  <section>
+    <h3>In-Memory Tables</h3>
+
+    <p>Time to scan SF 0.1 LINEITEM</p>
+    <table class="fragment">
+      <tr><th>Source</th><th>Time</th></tr>
+      <tr class="fragment">
+        <td><b>CSV on NVME SSD</b></td>
+        <td>0.88745s</td>
+      </tr>
+      <tr>
+        <td><b>&nbsp;</b></td>
+        <td>&nbsp;</td>
+      </tr>
+    </table>
+    <p>&nbsp;</p>
+  </section>
+
+  <section>
+    <h3>What does that 1s include?</h3>
+    <ul>
+      <li class="fragment">Disk to RAM</li>
+      <li class="fragment">Buffering in the OS</li>
+      <li class="fragment">IPC</li>
+      <li class="fragment">Split on |</li>
+      <li class="fragment">Parse Int, Float, Date</li>
+      <li class="fragment">Iterate Over Each Tuple</li>
+    </ul>
+  </section>
+
+  <section>
+    <h3>In-Memory Tables</h3>
+
+    <p>Time to scan SF 0.1 LINEITEM</p>
+    <table>
+      <tr><th>Source</th><th>Time</th></tr>
+      <tr>
+        <td><b>CSV on NVME SSD</b></td>
+        <td>0.88745s</td>
+      </tr>
+      <tr>
+        <td><b>IndexedSeq[InternalRow]</b></td>
+        <td class="fragment">0.018s</td>
+      </tr>
+    </table>
+    <p class="fragment">~50x speedup</p>
+  </section>
+
+  <section>
+    <p><b>Takeaway:</b> Read data in at <tt>CREATE TABLE</tt></p>
+  </section>
+
+  <section>
+    <h3>Compiler</h3>
+
+    <pre><code class="scala">
+      Table(...)
+    </code></pre>
+    <p>Return an iterator over the preloaded table.</p>
+  </section>
+</section>
+
+<section>
+  <section>
+    <h3>Primary Key Index</h3>
+
+    <p>Time to filter SF 0.1 LINEITEM for one orderkey</p>
+    <table>
+      <tr><th>Source</th><th>Time</th></tr>
+      <tr>
+        <td><b>CSV on NVME SSD</b></td>
+        <td>0.9196s</td>
+      </tr>
+      <tr>
+        <td><b>IndexedSeq[InternalRow]</b></td>
+        <td>0.0624s</td>
+      </tr>
+      <tr>
+        <td><b>&nbsp;</b></td>
+        <td>&nbsp;</td>
+      </tr>
+    </table>
+    <p>&nbsp;</p>
+  </section>
+
+  <section>
+    <h3>What does that 62ms include?</h3>
+    <ul>
+      <li class="fragment">EqualTo(...).eval(...) for each row</li>
+      <li class="fragment">Iterate Over Each Tuple</li>
+    </ul>
+  </section>
+
+  <section>
+    <h3>Primary Key Index</h3>
+
+    <p>Time to filter SF 0.1 LINEITEM for one orderkey</p>
+    <table>
+      <tr><th>Source</th><th>Time</th></tr>
+      <tr>
+        <td><b>CSV on NVME SSD</b></td>
+        <td>0.9196s</td>
+      </tr>
+      <tr>
+        <td><b>IndexedSeq[InternalRow]</b></td>
+        <td>0.0624s</td>
+      </tr>
+      <tr>
+        <td><b>Sorted IndexedSeq[InternalRow] + Bin Search</b></td>
+        <td class="fragment">0.0008s</td>
+      </tr>
+    </table>
+    <p class="fragment">~80x speedup</p>
+  </section>
+
+  <section>
+    <p><b>Takeaway: </b> Sort on primary key and binary search.</p>
+  </section>
+
+  <section>
+    <h3>Compiler</h3>
+    <pre><code class="scala">
+      Filter(expression, Table(...))
+    </code></pre>
+    <p class="fragment">If <tt>expression</tt> is a ...</p>
+  </section>
+
+  <section>
+    <p>If <tt>expression</tt> is a ...</p>
+    <dl>
+      <dt class="fragment"><tt>EqualTo</tt> between the Table key and a constant</dt>
+      <dd class="fragment">Binary search for the key!</dd>
+      <dt class="fragment"><tt>[Greater|Less]Than[OrEqual]</tt> between the Table key and a constant</dt>
+      <dd class="fragment">Binary search for the lower/upper bound</dd>
+      <dt class="fragment">Greater and Lower</dt>
+      <dd class="fragment">Binary search for the lower and upper bound</dd>
+      <dt class="fragment">One of the above and more</dt>
+      <dd class="fragment">Binary search + Filter the rest</dd>
+    </dl>
+  </section>
+
+  <section>
+    <p>But TPC-H doesn't have filters on keys...</p>
+  </section>
+
+  <section>
+    <h3>Also Index</h3>
+    <pre><code class="sql">
+    USING csv OPTIONS(
+      path '../TPCH/LINEITEM.csv', 
+      delimiter = '|',
+      primary_key = 'orderkey,linenumber',
+      tree_index = 'shipdate',
+      hash_index = 'linestatus|shipmode'
+    )
+    </code></pre>
+    <p><tt>tree_index</tt> and <tt>hash_index</tt> are each a |-separated list of indexes; each index is a ,-separated list of columns.</p>
+  </section>
+</section>
+
+<section>
+  <b>Questions?</b>
+</section>