Website/slides/cse4562sp2018/2018-02-12-Algorithms.html
2018-02-12 17:18:08 -05:00

437 lines
15 KiB
HTML
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>CSE 4/562 - Spring 2018</title>
<meta name="description" content="CSE 4/562 - Spring 2018">
<meta name="author" content="Oliver Kennedy">
<meta name="apple-mobile-web-app-capable" content="yes" />
<meta name="apple-mobile-web-app-status-bar-style" content="black-translucent" />
<meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no, minimal-ui">
<link rel="stylesheet" href="../reveal.js-3.6.0/css/reveal.css">
<link rel="stylesheet" href="ubodin.css" id="theme">
<!-- Code syntax highlighting -->
<link rel="stylesheet" href="../reveal.js-3.6.0/lib/css/zenburn.css">
<!-- Printing and PDF exports -->
<script>
// Attach the print stylesheet: pdf.css when the URL query string contains
// "print-pdf" (any letter case), paper.css otherwise.
var link = document.createElement( 'link' );
link.rel = 'stylesheet';
link.type = 'text/css';
if ( /print-pdf/i.test( window.location.search ) ) {
link.href = '../reveal.js-3.6.0/css/print/pdf.css';
} else {
link.href = '../reveal.js-3.6.0/css/print/paper.css';
}
document.getElementsByTagName( 'head' )[0].appendChild( link );
</script>
<script src="../reveal.js-3.6.0/lib/js/head.min.js"></script>
<!--[if lt IE 9]>
<script src="../reveal.js-3.6.0/lib/js/html5shiv.js"></script>
<![endif]-->
</head>
<body>
<div class="reveal">
<!-- Any section element inside of this container is displayed as a slide -->
<div class="header">
<!-- Any Talk-Specific Header Content Goes Here -->
CSE 4/562 - Database Systems
</div>
<div class="slides">
<section>
<h1>Query Evaluation</h1>
<h3>CSE 4/562 Database Systems</h3>
<h5>February 12, 2018</h5>
</section>
<section>
<section>
<h3>Query Evaluation Styles</h3>
<dl>
<dt class="fragment highlight-grey" data-fragment-index="2">All-At-Once (Collections)</dt>
<dd class="fragment highlight-grey" data-fragment-index="2">Bottom-up, one operator at a time.</dd>
<dt>Volcano-Style (Iterators)</dt>
<dd>Operators "request" one tuple at a time from children.</dd>
<dt class="fragment highlight-grey" data-fragment-index="1">Push-Style (Buffers)</dt>
<dd class="fragment highlight-grey" data-fragment-index="1">Operators continuously produce/consume tuples.</dd>
</dl>
</section>
<section>
<h3>Basic Mindset</h3>
<img src="graphics/2018-02-05-RA-Tree.svg" alt="Relational algebra tree for the example query" style="display: inline-block; vertical-align: middle;" />
<pre style="display: inline-block; vertical-align: middle; margin-left: 20px; width:550px;"><code class="python">
r = get_table("R")
s = get_table("S")
temp1 = apply_join(r, s, "R.B = S.B")
temp2 = apply_select(temp1, "S.C = 10")
result = apply_projection(temp2, "R.A")
</code></pre>
</section>
<section>
<h3>Basic Mindset</h3>
<pre><code class="python">
def build_tree(operator):
    """Evaluate an operator tree bottom-up, all at once."""
    if """ operator is a base table """:
        return get_table(...)
    elif """ operator is a selection """:
        # Evaluate the child first: apply_select expects rows.
        child_rows = build_tree(operator.child)
        return apply_select(child_rows, operator.condition)
    elif """ handle remaining cases similarly """:
        ...
</code></pre>
</section>
<section>
<h3>Select</h3>
<p class="fragment" style="display: inline-block; vertical-align: middle; margin-right: 100px">
$$\sigma_{A \neq 3} R$$
</p>
<table style="display: inline-block; vertical-align: middle;">
<tr><th>A</th><th>B</th></tr>
<tr><td>1</td><td>2</td></tr>
<tr class="fragment highlight-grey"><td>3</td><td>4</td></tr>
<tr><td>5</td><td>6</td></tr>
</table>
</section>
<section>
<h3>Select</h3>
<pre><code class="python">
def apply_select(input, condition):
    """Keep the rows where condition(row) is true."""
    # All-at-once: the full result list is built before returning.
    result = []
    for row in input:
        if condition(row):
            result += [row]
    return result
</code></pre>
<p class="fragment">(All-At-Once)</p>
</section>
<section>
<h3>Select</h3>
<p style="display: inline-block; vertical-align: middle; margin-right: 100px">
$$\sigma_{A \neq 3} R$$
</p>
<table style="display: inline-block; vertical-align: middle; font-size: 80%">
<tr><th>A</th><th>B</th><td></td></tr>
<tr class="fragment"><td colspan="2"><code>getNext()</code></td><td style="text-align: left"><code>for row in input:</code></td></tr>
<tr class="fragment"><td>1</td><td>2</td><td class="fragment" style="color: green; text-align: left;"><code style="margin-left: 30px;">return row;</code></td></tr>
<tr class="fragment"><td colspan="2"><code>getNext()</code></td><td style="text-align: left"><code>for row in input:</code></td></tr>
<tr class="fragment"><td>3</td><td>4</td><td class="fragment" style="color: red; text-align: left;"><span style="margin-left: 30px;">X</span></td></tr>
<tr class="fragment" ><td>5</td><td>6</td><td class="fragment" style="color: green; text-align: left;"><code style="margin-left: 30px;">return row;</code></td></tr>
<tr class="fragment"><td colspan="2"><code>getNext()</code></td><td style="text-align: left"><code>for row in input:</code></td></tr>
<tr class="fragment"><td colspan="2"><code>None</code></td><td class="fragment" style="color: red; text-align: left;"><code>return None;</code></td></tr>
</table>
</section>
<section>
<h3>Select</h3>
<svg data-src="graphics/2018-02-12-Flow-Select.svg" />
</section>
<section>
<h3>Project</h3>
<svg data-src="graphics/2018-02-12-Flow-Project.svg" />
</section>
<section>
<h3>Union</h3>
<svg data-src="graphics/2018-02-12-Flow-Union.svg" />
</section>
<section>
<h3>Cross</h3>
<pre><code class="python">
def apply_cross(lhs, rhs):
    """Pair every row of lhs with every row of rhs."""
    # rhs is iterated once per lhs row, so it must be
    # re-iterable (a list, not a one-shot iterator).
    result = []
    for r in lhs:
        for s in rhs:
            result += [r + s]
    return result
</code></pre>
</section>
<section>
<h3>Cross</h3>
<svg data-src="graphics/2018-02-12-Flow-Cross.svg" />
</section>
<section>
<p>What's the complexity of this cross-product algorithm?</p>
<p>... in terms of compute</p>
<p>... in terms of IOs</p>
</section>
</section>
<section>
<section>
<h3>Cross Product Problems</h3>
<dl>
<dt>Need to scan the inner relation multiple times!</dt>
<dd class="fragment">Load data intelligently to mitigate expensive IOs</dd>
<dt>Every tuple needs to be paired with every other tuple!</dt>
<dd class="fragment">Exploit join conditions to minimize pairs of tuples</dd>
</dl>
</section>
<section>
<h3>Preloading Data</h3>
<p class="fragment">Nested-Loop Join</p>
<pre><code class="python">
def apply_cross(lhs, rhs):
    """Nested-loop cross product of two row iterators."""
    result = []
    # next() returns a falsy value (None) when exhausted.
    while r := lhs.next():
        while s := rhs.next():
            result += [r + s]
        # Rewind rhs for the next lhs row.
        rhs.reset()
    return result
</code></pre>
</section>
<section>
<h3>Nested-Loop Join</h3>
<svg data-src="graphics/2018-02-12-Join-NLJ.svg" />
</section>
<section>
<p><b>Problem</b>: We need to evaluate <code>rhs</code> iterator<br/> once per record in <code>lhs</code></p>
</section>
<section>
<h3>Preloading Data</h3>
<p><b>Naive Solution</b>: Preload records from <code>rhs</code></p>
<pre><code class="python">
def apply_cross(lhs, rhs):
    """Cross product that reads rhs only once."""
    result = []
    # Buffer all of rhs in memory up front.
    # next() returns a falsy value (None) when exhausted.
    rhs_preloaded = []
    while s := rhs.next():
        rhs_preloaded += [s]
    # Stream lhs, pairing each row with the buffered rhs rows.
    while r := lhs.next():
        for s in rhs_preloaded:
            result += [r + s]
    return result
</code></pre>
<p class="fragment">Any problems with this?</p>
</section>
<section>
<h3>Preloading Data</h3>
<p><b>Better Solution</b>: Load both <code>lhs</code> and <code>rhs</code> records in blocks.</p>
<pre><code class="python">
def apply_cross(lhs, rhs):
    """Block nested-loop cross product (100-row blocks)."""
    result = []
    # take(100) returns up to 100 rows; an empty (falsy)
    # block ends the loop.
    while r_block := lhs.take(100):
        while s_block := rhs.take(100):
            for r in r_block:
                for s in s_block:
                    result += [r + s]
        # Rewind rhs once per lhs block, not once per lhs row.
        rhs.reset()
    return result
</code></pre>
</section>
<section>
<h3>Block-Nested Loop Join</h3>
<svg data-src="graphics/2018-02-12-Join-BNLJ.svg" class="stretch" />
</section>
<section>
<p>How big should the blocks be?</p>
<p class="fragment">What is the IO complexity of the algorithm?</p>
</section>
<section>
<h3>Join Conditions</h3>
<svg data-src="graphics/2018-02-12-Join-Grid.svg" />
<p class="fragment"><b>Problem</b>: Naively, any tuple matches any other</p>
</section>
<section>
<h3>Join Conditions</h3>
<svg data-src="graphics/2018-02-12-Join-OrderGrid.svg" />
<p><b>Solution</b>: First organize the data</p>
</section>
</section>
<section>
<section>
<h3>Strategies</h3>
<dl>
<dt>Sort/Merge Join</dt>
<dd>Sort all of the data upfront, then scan over both sides.</dd>
<dt>In-Memory Index Join (1-pass Hash; Hash Join)</dt>
<dd>Build an in-memory index on one table, scan the other.</dd>
<dt>Partition Join (2-pass Hash; External Hash Join)</dt>
<dd>Partition both sides so that tuples don't join across partitions.</dd>
</dl>
</section>
<section>
<h3>Sort/Merge Join</h3>
<svg data-src="graphics/2018-02-12-Join-SortMerge.svg" />
</section>
<section>
<h3>Hash Functions</h3>
<ul>
<li>A hash function is a function that maps a large data value to a small fixed-size value<ul>
<li>Typically is deterministic &amp; pseudorandom</li>
</ul></li>
<li>Used in Checksums, Hash Tables, Partitioning, Bloom Filters, Caching, Cryptography, Password Storage, …</li>
<li>Examples: MD5, SHA1, SHA2<ul>
<li>MD5() part of OpenSSL (on most OSX / Linux / Unix)</li>
</ul></li>
<li>Can map h(k) to range [0,N) with h(k) % N (modulus)</li>
</ul>
</section>
<section>
<h3>Hash Functions</h3>
<p style="margin-top: 50px">
$$h(X) \bmod N$$
</p>
<ul>
<li>Pseudorandom output between $[0, N)$</li>
<li>Always the same output for a given $X$</li>
</ul>
</section>
<section>
<h3>1-Pass Hash Join</h3>
<svg data-src="graphics/2018-02-12-Join-1PassHash.svg" />
</section>
<section>
<h3>2-Pass Hash Join</h3>
<svg data-src="graphics/2018-02-12-Join-2PassHash.svg" />
</section>
<section>
<p>Why is it important that the hash function is pseudorandom?</p>
</section>
</section>
<section>
<h3>Next Class</h3>
<p style="margin-top: 100px">More operators, More algorithms</p>
</section>
</div></div>
<script src="../reveal.js-3.6.0/js/reveal.js"></script>
<script>
// Start the presentation with this deck's settings.
// Full list of configuration options available at:
// https://github.com/hakimel/reveal.js#configuration
Reveal.initialize({
controls: true,
progress: true,
history: true,
center: true,
slideNumber: true,
transition: 'fade', // none/fade/slide/convex/concave/zoom
// Chart styling; presumably consumed by the chart plugin
// (Chart.min.js / csv2chart.js) listed under `dependencies` - confirm.
chart: {
defaults: {
global: {
title: { fontColor: "#333", fontSize: 24 },
legend: {
labels: { fontColor: "#333", fontSize: 20 },
},
// NOTE(review): Chart.js documents this option as `responsive`;
// confirm that `responsiveness` is actually read by anything.
responsiveness: true
},
scale: {
scaleLabel: { fontColor: "#333", fontSize: 20 },
gridLines: { color: "#333", zeroLineColor: "#333" },
ticks: { fontColor: "#333", fontSize: 16 },
}
},
// Per-chart-type color palettes.
line: { borderColor: [ "rgba(20,220,220,.8)" , "rgba(220,120,120,.8)", "rgba(20,120,220,.8)" ], "borderDash": [ [5,10], [0,0] ]},
bar: { backgroundColor: [
"rgba(220,220,220,0.8)",
"rgba(151,187,205,0.8)",
"rgba(205,151,187,0.8)",
"rgba(187,205,151,0.8)"
]
},
pie: { backgroundColor: [ ["rgba(0,0,0,.8)" , "rgba(220,20,20,.8)", "rgba(20,220,20,.8)", "rgba(220,220,20,.8)", "rgba(20,20,220,.8)"] ]},
radar: { borderColor: [ "rgba(20,220,220,.8)" , "rgba(220,120,120,.8)", "rgba(20,120,220,.8)" ]},
},
// Optional reveal.js plugins.
// An entry with a `condition` function is guarded by that function's result.
dependencies: [
// classList polyfill, requested only when document.body.classList is missing.
{ src: '../reveal.js-3.6.0/lib/js/classList.js', condition: function() { return !document.body.classList; } },
// Math typesetting for the $...$ / $$...$$ formulas on the slides.
// NOTE(review): the reveal.js math plugin documents its MathJax location
// under a top-level `math: { mathjax: ... }` option; confirm that this
// `mathjax` key on the dependency entry is actually read.
{ src: '../reveal.js-3.6.0/plugin/math/math.js',
condition: function() { return true; },
mathjax: '../reveal.js-3.6.0/js/MathJax.js'
},
// Markdown support, requested only when some element has a data-markdown attribute.
{ src: '../reveal.js-3.6.0/plugin/markdown/marked.js', condition: function() { return !!document.querySelector( '[data-markdown]' ); } },
{ src: '../reveal.js-3.6.0/plugin/markdown/markdown.js', condition: function() { return !!document.querySelector( '[data-markdown]' ); } },
// Syntax highlighting, requested only when the deck contains a `pre code` element.
{ src: '../reveal.js-3.6.0/plugin/highlight/highlight.js', async: true, condition: function() { return !!document.querySelector( 'pre code' ); }, callback: function() { hljs.initHighlightingOnLoad(); } },
{ src: '../reveal.js-3.6.0/plugin/zoom-js/zoom.js', async: true },
{ src: '../reveal.js-3.6.0/plugin/notes/notes.js', async: true },
// Chart.js library, listed before the chart plugin below.
{ src: '../reveal.js-3.6.0/plugin/chart/Chart.min.js'},
// The chart plugin itself.
{ src: '../reveal.js-3.6.0/plugin/chart/csv2chart.js'},
// Presumably inlines the <svg data-src="..."> placeholders used on the
// slides; es6-promise looks like a Promise polyfill for it - confirm.
{ src: '../reveal.js-3.6.0/plugin/svginline/es6-promise.auto.js', async: false },
{ src: '../reveal.js-3.6.0/plugin/svginline/data-src-svg.js', async: false }
]
});
</script>
</body>
</html>