Website/slides/cse4562sp2018/2018-02-19-Indexing1.html
2018-02-26 19:44:30 -05:00

366 lines
12 KiB
HTML
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>CSE 4/562 - Spring 2018</title>
<meta name="description" content="CSE 4/562 - Spring 2018">
<meta name="author" content="Oliver Kennedy">
<meta name="apple-mobile-web-app-capable" content="yes" />
<meta name="apple-mobile-web-app-status-bar-style" content="black-translucent" />
<meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no, minimal-ui">
<link rel="stylesheet" href="../reveal.js-3.6.0/css/reveal.css">
<link rel="stylesheet" href="ubodin.css" id="theme">
<!-- Code syntax highlighting -->
<link rel="stylesheet" href="../reveal.js-3.6.0/lib/css/zenburn.css">
<!-- Printing and PDF exports -->
<script>
// Build a <link rel="stylesheet"> for the print styles and attach it to <head>.
var link = document.createElement( 'link' );
link.rel = 'stylesheet';
link.type = 'text/css';
// A "print-pdf" token in the query string (matched case-insensitively)
// selects the PDF-export stylesheet; any other URL gets the paper stylesheet.
if ( window.location.search.match( /print-pdf/gi ) ) {
  link.href = '../reveal.js-3.6.0/css/print/pdf.css';
} else {
  link.href = '../reveal.js-3.6.0/css/print/paper.css';
}
document.getElementsByTagName( 'head' )[0].appendChild( link );
</script>
<script src="../reveal.js-3.6.0/lib/js/head.min.js"></script>
<!--[if lt IE 9]>
<script src="../reveal.js-3.6.0/lib/js/html5shiv.js"></script>
<![endif]-->
</head>
<body>
<div class="reveal">
<!-- Any section element inside of this container is displayed as a slide -->
<div class="header">
<!-- Any Talk-Specific Header Content Goes Here -->
CSE 4/562 - Database Systems
</div>
<div class="slides">
<!-- 2018 by OK:
Indexing really belongs *before* we discuss join algorithms.
Not only is indexing fundamental to several joins (1-pass, 2-pass, INLJ), but it
organizes the discussion of algorithms in a more contiguous block.
-->
<section>
<h1>Indexes</h1>
<h3>CSE 4/562 Database Systems</h3>
<h5>February 19, 2018</h5>
</section>
<section>
<section>
<!-- Motivating example: two book covers side by side, then (as fragments) their
     prices and whether each has an index / table of contents.
     NOTE(review): alt text is inferred from the image file names; confirm it
     matches the pictures. -->
<table>
<tr>
<td>
<img src="graphics/Books/DBSystemsHardcover.jpg" height="200" alt="Database Systems textbook, hardcover edition">
</td>
<td>
<img src="graphics/Books/DBSystemsSoftcover.jpg" height="200" alt="Database Systems textbook, softcover edition">
</td>
</tr>
<tr class="fragment">
<td>$150</td>
<td>$50</td>
</tr>
<tr class="fragment">
<td>Index<br/>ToC</td>
<td>No Index<br/>ToC Summary</td>
</tr>
</table>
</section>
<section>
<h3>Today's Focus</h3>
<p style="margin: 100px;">
$\sigma_C(R)$ <span style="margin: 50px">and</span> $(\ldots \bowtie_C R)$
</p>
<p class="fragment" style="font-size: 70%">(Finding records in a table <span class="fragment">really fast</span>)</p>
</section>
<section>
<h3>Indexing Strategies</h3>
<dl>
<div class="fragment">
<dt>Rearrange the data.</dt>
<dd>Put things in a predictable location or a specific order.</dd>
<dd class="fragment">("clustering" the data)</dd>
</div>
<div class="fragment">
<dt>Wrap the data.</dt>
<dd>Record where specific data values live</dd>
<dd class="fragment">("indexing" the data).</dd>
</div>
</dl>
</section>
<section>
<h3>Data Organization</h3>
<dl>
<div class="fragment">
<dt>Unordered Heap</dt>
<dd>No organization at all. $O(N)$ reads.</dd>
</div>
<div class="fragment">
<dt>(Secondary) Index</dt>
<dd>Index structure over unorganized data. $O(\ll N)$ <b>random</b> reads for <b>some</b> queries.</dd>
</div>
<div class="fragment">
<dt>Clustered (Primary) Index</dt>
<dd>Index structure over clustered data. $O(\ll N)$ <b>sequential</b> reads for <b>some</b> queries.</dd>
</div>
</dl>
</section>
<!-- Two figure-only slides under the "Data Organization" heading.
     NOTE(review): alt text is inferred from the file names and the slide
     heading; refine it after checking the actual figures. -->
<section>
<h3>Data Organization</h3>
<img src="graphics/2018-02-19-Index-Types.svg" alt="Diagram of the data organization and index types">
</section>
<section>
<h3>Data Organization</h3>
<img src="graphics/2018-02-19-PrimaryVsSecondary.png" alt="Diagram contrasting a primary (clustered) index with a secondary index">
</section>
<section>
<h3>Index Types</h3>
<dl>
<dt>Tree-Based</dt>
<dd>A hierarchy of decisions leads to data at the leaves.</dd>
<div class="fragment highlight-grey" data-fragment-index=2>
<dt>Hash-Based</dt>
<dd>A hash function puts data in predictable locations.</dd>
<dt>CDF-Based <span style="color: red" class="fragment" data-fragment-index=1>(new)</span></dt>
<dd>A more complex function predicts where data lives.</dd>
</div>
</dl>
</section>
</section>
<section>
<section>
<h3>Tree-Based Indexes</h3>
<svg data-src="graphics/2018-02-19-Tree-BinSearch.svg"/>
</section>
<section>
<h3>Tree-Based Indexes</h3>
<svg data-src="graphics/2018-02-19-Tree-Motivation.svg"/>
</section>
<section>
<h3>Challenges</h3>
<dl>
<dt>Balance</dt>
<dd>Bad question orders lead to poor performance!</dd>
<dt>IO</dt>
<dd>Each access to a binary tree node is a random access.</dd>
<dt class="fragment highlight-grey" data-fragment-index="1">Which Dimension</dt>
<dd class="fragment highlight-grey" data-fragment-index="1">Why limit ourselves to asking about one dimension?</dd>
</dl>
</section>
<section>
<svg data-src="graphics/2018-02-19-Tree-Unbalanced.svg" class="stretch"/>
</section>
<section>
<h3>Worst-Case Tree?</h3>
<div class="fragment" style="margin: 100px">$O(N)$ with the tree laid out left/right-deep</div>
<h3 class="fragment">Best-Case Tree?</h3>
<div class="fragment" style="margin: 100px">$O(\log N)$ with the tree perfectly balanced</div>
</section>
</section>
<section>
<section>
<h3>Binary Trees are Bad for IO</h3>
<p style="margin-top: 100px;">Every step of binary search is a random access</p>
<p style="margin-top: 100px;" class="fragment">Every tree node access is a random access</p>
</section>
<section>
<p>Random access IO is bad.</p>
</section>
<section>
<p><b>Idea: </b> Load a bunch of binary tree nodes together.</p>
</section>
<section>
<p><b>Binary Tree: </b> $1$ separator &amp; $2$ pointers</p>
<p>$\log_2(N)$ Deep</p>
<p class="fragment" data-fragment-index="1" style="margin-top: 50px;"><b>$K$-ary Tree: </b> $(K-1)$ separators &amp; $K$ pointers</p>
<p class="fragment" data-fragment-index="1">$\log_K(N)$ Deep</p>
</section>
<section>
<p><b>Important:</b> You still need to do binary search on each node of a $K$-ary tree, but now you're doing random access on memory (or cache) instead of disk (or memory)</p>
</section>
<section>
<h3>ISAM Trees</h3>
<!-- NOTE(review): alt text is inferred from the slide heading and file name. -->
<img src="graphics/2018-02-19-ISAM.png" height="500" alt="Diagram of an ISAM tree">
</section>
</section>
<section>
<section>
<p>How do you handle updates?</p>
<p style="margin-top: 50px;" class="fragment">B+Tree = ISAM + Updates</p>
</section>
<section>
<h3>Challenges</h3>
<ul>
<li class="fragment">Finding space for new records</li>
<li class="fragment">Keeping the tree balanced as new records are added</li>
</ul>
</section>
<section>
<p><b>Idea 1:</b> Reserve space for new records</p>
</section>
<section>
<svg data-src="graphics/2018-02-19-BTree-Reserved.svg" />
</section>
<section>
<p>Just maintaining open space won't work forever...</p>
</section>
<section>
<h3>Rules of B+Trees</h3>
<dl>
<dt>Keep space open for insertions in inner/data nodes.</dt>
<dd>Split nodes when they're full</dd>
<dt>Avoid under-using space</dt>
<dd>Merge nodes when they're under-filled</dd>
</dl>
<p class="fragment"><b>Maintain Invariant:</b> All Nodes ≥ 50% Full</p>
<p class="fragment">(Exception: The Root)</p>
</section>
</section>
<section>
<!-- Insertion walkthrough: one image per nested slide (steps 1-8), followed by
     a closing note on deletion.
     NOTE(review): alt text is inferred from the file names and the surrounding
     B+Tree slides; refine it if the figures show something more specific. -->
<section><img src="graphics/2018-02-19-InsertExample-1.png" height="300" alt="B+Tree insertion example, step 1 of 8"></section>
<section><img src="graphics/2018-02-19-InsertExample-2.png" height="300" alt="B+Tree insertion example, step 2 of 8"></section>
<section><img src="graphics/2018-02-19-InsertExample-3.png" height="300" alt="B+Tree insertion example, step 3 of 8"></section>
<section><img src="graphics/2018-02-19-InsertExample-4.png" height="300" alt="B+Tree insertion example, step 4 of 8"></section>
<section><img src="graphics/2018-02-19-InsertExample-5.png" height="300" alt="B+Tree insertion example, step 5 of 8"></section>
<section><img src="graphics/2018-02-19-InsertExample-6.png" height="300" alt="B+Tree insertion example, step 6 of 8"></section>
<section><img src="graphics/2018-02-19-InsertExample-7.png" height="300" alt="B+Tree insertion example, step 7 of 8"></section>
<section><img src="graphics/2018-02-19-InsertExample-8.png" height="300" alt="B+Tree insertion example, step 8 of 8"></section>
<section><p>Deletions reverse this process (at 50% fill).</p></section>
</section>
<section>
<a href="https://www.cs.usfca.edu/~galles/visualization/BPlusTree.html" style="font-size: 200%; font-weight: bold;">Demo</a>
</section>
<section>
<p><b>Next Class</b>: Hash- and CDF-Based Indexes</p>
</section>
</div></div>
<script src="../reveal.js-3.6.0/js/reveal.js"></script>
<script>
// Start the presentation. Full list of configuration options available at:
// https://github.com/hakimel/reveal.js#configuration
Reveal.initialize({
controls: true,
progress: true,
history: true,
center: true,
slideNumber: true,
transition: 'fade', // none/fade/slide/convex/concave/zoom
// Chart styling defaults. Presumably consumed by the chart plugin loaded in
// `dependencies` below (Chart.min.js + csv2chart.js) — verify against the plugin.
chart: {
defaults: {
global: {
title: { fontColor: "#333", fontSize: 24 },
legend: {
labels: { fontColor: "#333", fontSize: 20 },
},
// NOTE(review): Chart.js documents this global option as `responsive`;
// confirm that `responsiveness` is intended and actually has an effect.
responsiveness: true
},
scale: {
scaleLabel: { fontColor: "#333", fontSize: 20 },
gridLines: { color: "#333", zeroLineColor: "#333" },
ticks: { fontColor: "#333", fontSize: 16 },
}
},
// Per-chart-type colour palettes (and dash patterns for line charts).
line: { borderColor: [ "rgba(20,220,220,.8)" , "rgba(220,120,120,.8)", "rgba(20,120,220,.8)" ], "borderDash": [ [5,10], [0,0] ]},
bar: { backgroundColor: [
"rgba(220,220,220,0.8)",
"rgba(151,187,205,0.8)",
"rgba(205,151,187,0.8)",
"rgba(187,205,151,0.8)"
]
},
pie: { backgroundColor: [ ["rgba(0,0,0,.8)" , "rgba(220,20,20,.8)", "rgba(20,220,20,.8)", "rgba(220,220,20,.8)", "rgba(20,20,220,.8)"] ]},
radar: { borderColor: [ "rgba(20,220,220,.8)" , "rgba(220,120,120,.8)", "rgba(20,120,220,.8)" ]},
},
// Optional reveal.js plugins.
// Each entry names a script (`src`); an entry with a `condition` function is
// only loaded when that function returns true.
dependencies: [
// classList shim: only when the browser lacks document.body.classList.
{ src: '../reveal.js-3.6.0/lib/js/classList.js', condition: function() { return !document.body.classList; } },
// Math plugin: always loaded (the slides use $...$ formulas).
// NOTE(review): the stock reveal.js math plugin reads the MathJax location from
// a top-level `math: { mathjax: ... }` option, not from a key on the dependency
// entry — confirm the `mathjax` key below is actually picked up.
{ src: '../reveal.js-3.6.0/plugin/math/math.js',
condition: function() { return true; },
mathjax: '../reveal.js-3.6.0/js/MathJax.js'
},
// Markdown support: only when some element carries a data-markdown attribute.
{ src: '../reveal.js-3.6.0/plugin/markdown/marked.js', condition: function() { return !!document.querySelector( '[data-markdown]' ); } },
{ src: '../reveal.js-3.6.0/plugin/markdown/markdown.js', condition: function() { return !!document.querySelector( '[data-markdown]' ); } },
// Syntax highlighting: only when the page contains a <pre><code> block.
{ src: '../reveal.js-3.6.0/plugin/highlight/highlight.js', async: true, condition: function() { return !!document.querySelector( 'pre code' ); }, callback: function() { hljs.initHighlightingOnLoad(); } },
{ src: '../reveal.js-3.6.0/plugin/zoom-js/zoom.js', async: true },
{ src: '../reveal.js-3.6.0/plugin/notes/notes.js', async: true },
// Chart.js library (Chart.min.js)
{ src: '../reveal.js-3.6.0/plugin/chart/Chart.min.js'},
// the chart plugin itself (csv2chart.js)
{ src: '../reveal.js-3.6.0/plugin/chart/csv2chart.js'},
// Inline-SVG support, loaded with async: false. Presumably es6-promise is a
// Promise polyfill for data-src-svg.js, which would handle the
// <svg data-src="..."> elements in the slides — verify.
{ src: '../reveal.js-3.6.0/plugin/svginline/es6-promise.auto.js', async: false },
{ src: '../reveal.js-3.6.0/plugin/svginline/data-src-svg.js', async: false }
]
});
</script>
</body>
</html>