<!doctype html>
|
|
|
|
|
<html lang="en">
|
|
|
|
|
|
|
|
|
|
<head>
  <meta charset="utf-8">

  <title>CSE 4/562 - Spring 2018</title>

  <meta name="description" content="CSE 4/562 - Spring 2018">
  <meta name="author" content="Oliver Kennedy">

  <meta name="apple-mobile-web-app-capable" content="yes">
  <meta name="apple-mobile-web-app-status-bar-style" content="black-translucent">

  <!-- NOTE(review): maximum-scale=1.0 and user-scalable=no disable pinch-zoom.
       Left unchanged here; confirm whether the deck still needs them. -->
  <meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no, minimal-ui">

  <link rel="stylesheet" href="../reveal.js-3.6.0/css/reveal.css">
  <link rel="stylesheet" href="ubodin.css" id="theme">

  <!-- Code syntax highlighting -->
  <link rel="stylesheet" href="../reveal.js-3.6.0/lib/css/zenburn.css">

  <!-- Printing and PDF exports -->
  <script>
    // Append a print stylesheet to <head>: pdf.css when the query string
    // contains "print-pdf" (case-insensitive match), paper.css otherwise.
    var link = document.createElement( 'link' );
    link.rel = 'stylesheet';
    link.type = 'text/css';
    link.href = window.location.search.match( /print-pdf/gi ) ? '../reveal.js-3.6.0/css/print/pdf.css' : '../reveal.js-3.6.0/css/print/paper.css';
    document.getElementsByTagName( 'head' )[0].appendChild( link );
  </script>

  <script src="../reveal.js-3.6.0/lib/js/head.min.js"></script>

  <!-- Conditional comment: only IE versions below 9 load the shiv -->
  <!--[if lt IE 9]>
  <script src="../reveal.js-3.6.0/lib/js/html5shiv.js"></script>
  <![endif]-->
</head>
|
|
|
|
|
|
|
|
|
|
<body>
|
|
|
|
|
|
|
|
|
|
<div class="reveal">
|
|
|
|
|
<!-- Any section element inside of this container is displayed as a slide -->
|
|
|
|
|
|
|
|
|
|
<div class="header">
  <!-- Any Talk-Specific Header Content Goes Here -->
  CSE 4/562 - Database Systems
</div>
|
|
|
|
|
|
|
|
|
|
<div class="slides">
|
|
|
|
|
|
|
|
|
|
<section>
  <!-- Title slide: lecture topic, course name, and lecture date -->
  <h1>Indexes</h1>
  <h3>CSE 4/562 – Database Systems</h3>
  <h5>February 23, 2018</h5>
</section>
|
|
|
|
|
|
|
|
|
|
<section>
  <!-- Vertical stack: recap slides -->
  <section>
    <h3>Recap</h3>
  </section>

  <section>
    <!-- Two side-by-side inline-block panels.
         NOTE(review): alt text is inferred from the headings and file names; confirm against the images. -->
    <div style="display: inline-block; margin-right: 100px;">
      <h3>Index</h3>
      <img src="graphics/2018-02-23-Index.png" alt="Index illustration" height="300">
    </div>
    <div style="display: inline-block;">
      <h3>Data</h3>
      <img src="graphics/2018-02-23-Data.png" alt="Data illustration" height="300">
    </div>
  </section>

  <section>
    <p>Data, even if well organized, still requires you to page through a lot.</p>

    <p>An index helps you quickly jump to specific data you might be interested in.</p>
  </section>

  <section>
    <h3>Data Organization</h3>

    <!-- Each term/definition pair is wrapped in a div so it appears as a single fragment -->
    <dl>
      <div class="fragment">
        <dt>Unordered Heap</dt>
        <dd>No organization at all. $O(N)$ reads.</dd>
      </div>

      <div class="fragment">
        <dt>(Secondary) Index</dt>
        <dd>Index structure over unorganized data. $O(\ll N)$ <b>random</b> reads for <b>some</b> queries.</dd>
      </div>

      <div class="fragment">
        <dt>Clustered (Primary) Index</dt>
        <dd>Index structure over clustered data. $O(\ll N)$ <b>sequential</b> reads for <b>some</b> queries.</dd>
      </div>
    </dl>
  </section>
</section>
|
|
|
|
|
|
|
|
|
|
<section>
  <!-- Vertical stack: hash indexes and dynamic hashing -->
  <section>
    <h3>Hash Indexes</h3>

    <!-- The paragraph is closed before the list: a <dl> cannot be nested inside a <p> -->
    <p style="margin-top: 100px; text-align: left;">
      A hash function $h(k)$ is ...
    </p>
    <dl>
      <dt>... deterministic</dt>
      <dd>The same $k$ always produces the same hash value.</dd>

      <dt>... (pseudo-)random</dt>
      <dd>Different $k$s are unlikely to have the same hash value.</dd>
    </dl>
    <p class="fragment">Modulus $h(k)\%N$ gives you a random number in $[0, N)$</p>
  </section>

  <section>
    <svg data-src="graphics/2018-02-23-HashTable.svg" class="stretch"/>
  </section>

  <section>
    <h3>Problems</h3>
    <dl>
      <dt>$N$ is too small</dt>
      <dd>Too many overflow pages (slower reads).</dd>
      <dt>$N$ is too big</dt>
      <dd>Too many normal pages (wasted space).</dd>
    </dl>
  </section>

  <section>
    <p><b>Idea:</b> Resize the structure as needed</p>
  </section>

  <section>
    <p>To keep things simple, let's use $$h(k) = k$$</p>
    <p class="fragment" style="font-size: 70%">(you wouldn't actually do this in practice)</p>
  </section>

  <section>
    <svg data-src="graphics/2018-02-23-HashResize-Naive.svg" class="stretch"/>
  </section>

  <section>
    <h3>Problems</h3>
    <!-- data-fragment-index groups items that should appear in the same step -->
    <dl>
      <dt class="fragment" data-fragment-index="1">Changing hash functions reallocates everything</dt>
      <dd class="fragment" data-fragment-index="1">Only double/halve the size of a hash function</dd>

      <dt class="fragment" data-fragment-index="2">Changing sizes still requires reading everything</dt>
      <dd class="fragment" data-fragment-index="3"><b>Idea:</b> Only redistribute buckets that are too big</dd>
    </dl>
  </section>

  <section>
    <svg data-src="graphics/2018-02-23-HashResize-Dynamic.svg" class="stretch" />
  </section>

  <section>
    <h3>Dynamic Hashing</h3>
    <ul>
      <li>Add a level of indirection (Directory).</li>
      <!-- "%" must be escaped inside math: a bare % starts a TeX comment -->
      <li>A data page $i$ can store data with $h(k)\%2^n=i$ for any $n$.</li>
      <li>Double the size of the directory (almost free) by duplicating existing entries.</li>
      <li>When bucket $i$ fills up, split on the next power of 2.</li>
      <li>Can also merge buckets/halve the directory size.</li>
    </ul>
  </section>
</section>
|
|
|
|
|
|
|
|
|
|
<section>
  <!-- Vertical stack: CDF-based (learned) indexing -->
  <section>
    <h3>CDF-Based Indexing</h3>
    <p class="fragment" style="margin-top: 100px;"><b>"The Case for Learned Index Structures"</b><br>by Kraska, Beutel, Chi, Dean, Polyzotis</p>
  </section>

  <section>
    <svg data-src="graphics/2018-02-23-CDF-Linear.svg"/>
  </section>

  <section>
    <svg data-src="graphics/2018-02-23-CDF-LinearApprox.svg"/>
  </section>

  <section>
    <h3>Cumulative Distribution Function (CDF)</h3>
    <img src="graphics/2018-02-23-CDF-Plot.png" alt="Plot of a cumulative distribution function">
    <p>$f(key) \mapsto position$</p>
    <p class="fragment" style="font-size: 50%">(not exactly true, but close enough for today)</p>
  </section>

  <section>
    <h3>Using CDFs to find records</h3>
    <dl>
      <dt>Ideal: $f(k) = position$</dt>
      <dd>$f$ encodes the <b>exact</b> location of a record</dd>

      <!-- "<" is written as &lt; so the HTML parser never treats it as a tag opener -->
      <dt class="fragment">Ok: $f(k) \approx position$ <span class="fragment">($\left|f(k) - position\right| &lt; \epsilon$)</span></dt>
      <dd class="fragment">$f$ gets you to within $\epsilon$ of the key</dd>
      <dd class="fragment">Only need local search on one (or so) leaf pages.</dd>
    </dl>
    <p class="fragment"><b>Simplified Use Case:</b> Static data with "infinite" prep time.</p>
  </section>

  <section>
    <h3>How to define $f$?</h3>
    <ul>
      <li class="fragment">Linear ($f(k) = a\cdot k + b$)</li>
      <li class="fragment">Polynomial ($f(k) = a\cdot k + b \cdot k^2 + \ldots$)</li>
      <li class="fragment">Neural Network ($f(k) = $<img src="graphics/Clipart/magic-wand.png" alt="magic wand" height="100" style="vertical-align: middle;">)</li>
    </ul>
  </section>

  <section>
    <p>We have infinite prep time, so fit a (tiny) neural network to the CDF.</p>
  </section>

  <section>
    <h3>Neural Networks</h3>
    <!-- Term/description pairs belong in a <dl>, not a <ul> -->
    <dl>
      <dt class="fragment" data-fragment-index="1">Extremely Generalized Regression</dt>
      <dd class="fragment" data-fragment-index="1">Essentially a really really really complex, fittable function with a lot of parameters.</dd>
      <dt class="fragment" data-fragment-index="2">Captures Nonlinearities</dt>
      <dd class="fragment" data-fragment-index="2">Most regressions can't handle discontinuous functions, which many key spaces have.</dd>
      <dt class="fragment" data-fragment-index="3">No Branching</dt>
      <dd class="fragment" data-fragment-index="3"><code>if</code> statements are <b>really</b> expensive on modern processors.</dd>
      <dd class="fragment" data-fragment-index="4">(Compare to B+Trees with $\log_2 N$ if statements)</dd>
    </dl>
  </section>

  <section>
    <h3>Summary</h3>

    <dl>
      <dt>Tree Indexes</dt>
      <dd>$O(\log N)$ access, supports range queries, easy size changes.</dd>

      <dt>Hash Indexes</dt>
      <dd>$O(1)$ access, doesn't change size efficiently, only equality tests.</dd>

      <dt>CDF Indexes</dt>
      <dd>$O(1)$ access, supports range queries, static data only.</dd>
    </dl>
  </section>

</section>
|
|
|
|
|
|
|
|
|
|
<section>
  <!-- Closing slide: pointer to the next lecture -->
  <p><b>Next Class:</b> Using Indexes</p>
</section>
|
|
|
|
|
|
|
|
|
|
</div></div>
|
|
|
|
|
|
|
|
|
|
<script src="../reveal.js-3.6.0/js/reveal.js"></script>
|
|
|
|
|
|
|
|
|
|
<script>

  // Full list of configuration options available at:
  // https://github.com/hakimel/reveal.js#configuration
  Reveal.initialize({
    controls: true,
    progress: true,
    history: true,
    center: true,
    slideNumber: true,

    transition: 'fade', // none/fade/slide/convex/concave/zoom

    // Settings passed under the "chart" key (the chart plugin scripts are
    // listed in the dependencies below).
    chart: {
      defaults: {
        global: {
          title: { fontColor: "#333", fontSize: 24 },
          legend: {
            labels: { fontColor: "#333", fontSize: 20 }
          },
          // NOTE(review): Chart.js documents this option as `responsive`;
          // confirm whether `responsiveness` is actually read anywhere.
          responsiveness: true
        },
        scale: {
          scaleLabel: { fontColor: "#333", fontSize: 20 },
          gridLines: { color: "#333", zeroLineColor: "#333" },
          ticks: { fontColor: "#333", fontSize: 16 }
        }
      },
      line: { borderColor: [ "rgba(20,220,220,.8)" , "rgba(220,120,120,.8)", "rgba(20,120,220,.8)" ], "borderDash": [ [5,10], [0,0] ]},
      bar: { backgroundColor: [
        "rgba(220,220,220,0.8)",
        "rgba(151,187,205,0.8)",
        "rgba(205,151,187,0.8)",
        "rgba(187,205,151,0.8)"
      ]
      },
      pie: { backgroundColor: [ ["rgba(0,0,0,.8)" , "rgba(220,20,20,.8)", "rgba(20,220,20,.8)", "rgba(220,220,20,.8)", "rgba(20,20,220,.8)"] ]},
      radar: { borderColor: [ "rgba(20,220,220,.8)" , "rgba(220,120,120,.8)", "rgba(20,120,220,.8)" ]}
    },

    // Optional reveal.js plugins
    dependencies: [
      // classList polyfill: loaded only when document.body.classList is missing
      { src: '../reveal.js-3.6.0/lib/js/classList.js', condition: function() { return !document.body.classList; } },
      // Math plugin: always loaded; points at the MathJax copy under the reveal.js directory
      { src: '../reveal.js-3.6.0/plugin/math/math.js',
        condition: function() { return true; },
        mathjax: '../reveal.js-3.6.0/js/MathJax.js'
      },
      // Markdown plugins: loaded only when some element carries a data-markdown attribute
      { src: '../reveal.js-3.6.0/plugin/markdown/marked.js', condition: function() { return !!document.querySelector( '[data-markdown]' ); } },
      { src: '../reveal.js-3.6.0/plugin/markdown/markdown.js', condition: function() { return !!document.querySelector( '[data-markdown]' ); } },
      // Syntax highlighting: loaded only when the page contains a <pre><code> block
      { src: '../reveal.js-3.6.0/plugin/highlight/highlight.js', async: true, condition: function() { return !!document.querySelector( 'pre code' ); }, callback: function() { hljs.initHighlightingOnLoad(); } },
      { src: '../reveal.js-3.6.0/plugin/zoom-js/zoom.js', async: true },
      { src: '../reveal.js-3.6.0/plugin/notes/notes.js', async: true },
      // Chart.min.js
      { src: '../reveal.js-3.6.0/plugin/chart/Chart.min.js'},
      // the plugin
      { src: '../reveal.js-3.6.0/plugin/chart/csv2chart.js'},
      // Inline-SVG plugin and its Promise polyfill, both loaded synchronously
      { src: '../reveal.js-3.6.0/plugin/svginline/es6-promise.auto.js', async: false },
      { src: '../reveal.js-3.6.0/plugin/svginline/data-src-svg.js', async: false }
    ]
  });

</script>
|
|
|
|
|
|
|
|
|
|
</body>
|
|
|
|
|
</html>
|