<!doctype html>
<html lang="en">

<head>
  <meta charset="utf-8">

  <!-- Metadata, stylesheets and bootstrap scripts for this reveal.js 3.6.0 deck. -->
  <title>CSE 4/562 - Spring 2018</title>

  <meta name="description" content="CSE 4/562 - Spring 2018">
  <meta name="author" content="Oliver Kennedy">

  <meta name="apple-mobile-web-app-capable" content="yes">
  <meta name="apple-mobile-web-app-status-bar-style" content="black-translucent">

  <!-- NOTE(review): maximum-scale/user-scalable=no lock pinch-zoom; confirm this is still wanted. -->
  <meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no, minimal-ui">

  <link rel="stylesheet" href="../reveal.js-3.6.0/css/reveal.css">
  <link rel="stylesheet" href="ubodin.css" id="theme">

  <!-- Code syntax highlighting -->
  <link rel="stylesheet" href="../reveal.js-3.6.0/lib/css/zenburn.css">

  <!-- Printing and PDF exports -->
  <script>
    // Pick the print stylesheet from the query string: "print-pdf" anywhere in
    // window.location.search selects pdf.css, anything else selects paper.css.
    var link = document.createElement( 'link' );
    link.rel = 'stylesheet';
    link.type = 'text/css';
    link.href = window.location.search.match( /print-pdf/gi ) ? '../reveal.js-3.6.0/css/print/pdf.css' : '../reveal.js-3.6.0/css/print/paper.css';
    document.getElementsByTagName( 'head' )[0].appendChild( link );
  </script>

  <script src="../reveal.js-3.6.0/lib/js/head.min.js"></script>

  <!--[if lt IE 9]>
  <script src="../reveal.js-3.6.0/lib/js/html5shiv.js"></script>
  <![endif]-->
</head>

<body>

<div class="reveal">
<!-- Any section element inside of this container is displayed as a slide -->

<div class="header">
  <!-- Any Talk-Specific Header Content Goes Here -->
  CSE 4/562 - Database Systems
</div>

<div class="slides">

<!-- Title slide -->
<section>
  <h1>Query Evaluation</h1>
  <h3>CSE 4/562 – Database Systems</h3>
  <h5>February 12, 2018</h5>
</section>

<!-- Slide group: evaluation styles, then Select / Project / Union / Cross.
     Each nested <section> is one slide. Code samples sit at column 0 inside
     <pre> because leading whitespace there is rendered verbatim. -->
<section>
  <section>
    <h3>Query Evaluation Styles</h3>

    <dl>
      <dt class="fragment highlight-grey" data-fragment-index="2">All-At-Once (Collections)</dt>
      <dd class="fragment highlight-grey" data-fragment-index="2">Bottom-up, one operator at a time.</dd>

      <dt>Volcano-Style (Iterators)</dt>
      <dd>Operators "request" one tuple at a time from children.</dd>

      <dt class="fragment highlight-grey" data-fragment-index="1">Push-Style (Buffers)</dt>
      <dd class="fragment highlight-grey" data-fragment-index="1">Operators continuously produce/consume tuples.</dd>
    </dl>
  </section>

  <section>
    <h3>Basic Mindset</h3>
    <!-- NOTE(review): alt text inferred from the file name and the adjacent code; confirm against the image. -->
    <img src="graphics/2018-02-05-RA-Tree.svg" alt="Relational algebra tree for the example query" style="display: inline-block; vertical-align: middle;">
    <pre style="display: inline-block; vertical-align: middle; margin-left: 20px; width:550px;"><code class="python">
r = get_table("R")

s = get_table("S")

temp1 = apply_join(r, s, "R.B = S.B")

temp2 = apply_select(temp1, "S.C = 10")

result = apply_projection(temp2, "R.A")
</code></pre>
  </section>

  <section>
    <h3>Basic Mindset</h3>
    <pre><code class="python">
def build_tree(operator):

  if """ operator is a base table """:
    return get_table(...)

  elif """ operator is a selection """:
    return apply_select(operator.child, operator.condition)

  elif """ handle remaining cases similarly """:
</code></pre>
  </section>

  <section>
    <h3>Select</h3>

    <p class="fragment" style="display: inline-block; vertical-align: middle; margin-right: 100px">
      $$\sigma_{A \neq 3} R$$
    </p>
    <table style="display: inline-block; vertical-align: middle;">
      <tr><th>A</th><th>B</th></tr>
      <tr><td>1</td><td>2</td></tr>
      <tr class="fragment highlight-grey"><td>3</td><td>4</td></tr>
      <tr><td>5</td><td>6</td></tr>
    </table>
  </section>

  <section>
    <h3>Select</h3>

    <pre><code class="python">
def apply_select(input, condition):
  result = []

  for row in input:
    if condition(row):
      result += [row]

  return result
</code></pre>
    <p class="fragment">(All-At-Once)</p>
  </section>

  <section>
    <h3>Select</h3>

    <p style="display: inline-block; vertical-align: middle; margin-right: 100px">
      $$\sigma_{A \neq 3} R$$
    </p>
    <!-- Third column traces what the iterator does on each getNext() call. -->
    <table style="display: inline-block; vertical-align: middle; font-size: 80%">
      <tr><th>A</th><th>B</th><td></td></tr>
      <tr class="fragment"><td colspan="2"><code>getNext()</code></td><td style="text-align: left"><code>for row in input:</code></td></tr>
      <tr class="fragment"><td>1</td><td>2</td><td class="fragment" style="color: green; text-align: left;"><code style="margin-left: 30px;">return row;</code></td></tr>
      <tr class="fragment"><td colspan="2"><code>getNext()</code></td><td style="text-align: left"><code>for row in input:</code></td></tr>
      <tr class="fragment"><td>3</td><td>4</td><td class="fragment" style="color: red; text-align: left;"><span style="margin-left: 30px;">X</span></td></tr>
      <tr class="fragment"><td>5</td><td>6</td><td class="fragment" style="color: green; text-align: left;"><code style="margin-left: 30px;">return row;</code></td></tr>
      <tr class="fragment"><td colspan="2"><code>getNext()</code></td><td style="text-align: left"><code>for row in input:</code></td></tr>
      <tr class="fragment"><td colspan="2"><code>None</code></td><td class="fragment" style="color: red; text-align: left;"><code>return None;</code></td></tr>
    </table>
  </section>

  <section>
    <!-- 2018-OK: Idea for future revision.
         Introducing these algorithms alone is kind of a boring info-dump. A good way to
         spice this segment up would be to combine it with some basic algorithm analysis.

         Introduce a framework for analysis
         - Memory Size
         - Compute Requirement
         - IO Requirement

         In particular, spend a bunch of time setting up the framework first. We'll need to
         cover attribution of costs to individual operators (e.g., Select doesn't introduce IOs)
         Also worth covering: IOs vs Record Reads.
    -->
    <h3>Select</h3>
    <svg data-src="graphics/2018-02-12-Flow-Select.svg" />
  </section>

  <section>
    <h3>Project</h3>
    <svg data-src="graphics/2018-02-12-Flow-Project.svg" />
  </section>

  <section>
    <h3>Union</h3>
    <svg data-src="graphics/2018-02-12-Flow-Union.svg" />
  </section>

  <section>
    <h3>Cross</h3>
    <pre><code class="python">
def apply_cross(lhs, rhs):
  result = []

  for r in lhs:
    for s in rhs:
      result += [r + s]

  return result
</code></pre>
  </section>

  <section>
    <h3>Cross</h3>
    <svg data-src="graphics/2018-02-12-Flow-Cross.svg" />
  </section>

  <section>
    <p>What's the complexity of this cross-product algorithm?</p>
    <p>... in terms of compute</p>
    <p>... in terms of IOs</p>
  </section>
</section>

<!-- Slide group: cross-product problems, nested-loop join, block-nested-loop join,
     and the motivation for exploiting join conditions.
     Code samples sit at column 0 inside <pre> because leading whitespace there
     is rendered verbatim. -->
<section>
  <section>
    <h3>Cross Product Problems</h3>
    <dl>
      <dt>Need to scan the inner relation multiple times!</dt>
      <dd class="fragment">Load data intelligently to mitigate expensive IOs</dd>

      <dt>Every tuple needs to be paired with every other tuple!</dt>
      <dd class="fragment">Exploit join conditions to minimize pairs of tuples</dd>
    </dl>
  </section>

  <section>
    <!-- 2018-OK: This slide was a little awkward...
         In particular, it doesn't map directly to the iterator model that
         we've established so far in the lecture (at least not unless you
         can mentally map imperative to continuation programming).

         Suggestion: Extend this with a state machine as before, or somehow
         tie it back to the original model. If nothing else, start with a
         transition back to the all-at-once model.
    -->
    <h3>Preloading Data</h3>
    <p class="fragment">Nested-Loop Join</p>
    <pre><code class="python">
def apply_cross(lhs, rhs):
  result = []

  while r = lhs.next():
    while s = rhs.next():
      result += [r + s]
    rhs.reset()

  return result
</code></pre>
  </section>

  <section>
    <h3>Nested-Loop Join</h3>
    <svg data-src="graphics/2018-02-12-Join-NLJ.svg" />
  </section>

  <section>
    <p><b>Problem</b>: We need to evaluate <code>rhs</code> iterator<br> once per record in <code>lhs</code></p>
  </section>

  <section>
    <h3>Preloading Data</h3>

    <!-- The sample below buffers rhs (rhs_preloaded), so the prose names rhs. -->
    <p><b>Naive Solution</b>: Preload records from <code>rhs</code></p>
    <pre><code class="python">
def apply_cross(lhs, rhs):
  result = []
  rhs_preloaded = []

  while s = rhs.next():
    rhs_preloaded += [s]

  while r = lhs.next():
    for s in rhs_preloaded:
      result += [r + s]

  return result
</code></pre>

    <p class="fragment">Any problems with this?</p>
  </section>

  <section>
    <h3>Preloading Data</h3>

    <p><b>Better Solution</b>: Load both <code>lhs</code> and <code>rhs</code> records in blocks.</p>

    <pre><code class="python">
def apply_cross(lhs, rhs):
  result = []

  while r_block = lhs.take(100):
    while s_block = rhs.take(100):
      for r in r_block:
        for s in s_block:
          result += [r + s]
    rhs.reset()

  return result
</code></pre>
  </section>

  <section>
    <h3>Block-Nested Loop Join</h3>
    <svg data-src="graphics/2018-02-12-Join-BNLJ.svg" class="stretch" />
  </section>

  <section>
    <!-- 2018-OK: Missed opportunity on the slides...
         Specifically, we could do a little more analysis of the inner loop. Maybe spend a slide
         on an animation showing the overall cost of BNLJ?

    -->
    <p>How big should the blocks be?</p>

    <p class="fragment">What is the IO complexity of the algorithm?</p>
  </section>

  <section>
    <h3>Join Conditions</h3>
    <svg data-src="graphics/2018-02-12-Join-Grid.svg" />
    <p class="fragment"><b>Problem</b>: Naively, any tuple matches any other</p>
  </section>

  <section>
    <h3>Join Conditions</h3>
    <svg data-src="graphics/2018-02-12-Join-OrderGrid.svg" />
    <p><b>Solution</b>: First organize the data</p>
  </section>

</section>

<!-- Slide group: join strategies (sort/merge, 1-pass hash, 2-pass hash) and
     a short primer on hash functions. Each nested <section> is one slide. -->
<section>

  <section>
    <h3>Strategies for Implementing $R \bowtie_{R.A = S.A} S$</h3>

    <dl>
      <dt>Sort/Merge Join</dt>
      <dd>Sort all of the data upfront, then scan over both sides.</dd>

      <dt>In-Memory Index Join (1-pass Hash; Hash Join)</dt>
      <dd>Build an in-memory index on one table, scan the other.</dd>

      <dt>Partition Join (2-pass Hash; External Hash Join)</dt>
      <dd>Partition both sides so that tuples don't join across partitions.</dd>
    </dl>
  </section>

  <section>
    <!-- 2018-OK: The motivation for this algorithm fell completely flat.
         It might help if we approach SortMerge with IOs/Mem/CPU in mind.
         The slide also deserves some discussion of *which* conditions it
         can be used to support efficiently.

         It might also help to discuss use cases where it's appropriate.
    -->
    <h3>Sort/Merge Join</h3>
    <svg data-src="graphics/2018-02-12-Join-SortMerge.svg" />
  </section>

  <section>
    <h3>Sort/Merge Join</h3>
    <dl>
      <dt>Limited Queries</dt>
      <dd>Only supports join conditions of the form $R.A = S.B$</dd>

      <dt>Low Memory</dt>
      <dd>Only needs to keep ~2 rows in memory at a time (not counting sort).</dd>

      <dt>Low Added CPU/IO Cost</dt>
      <dd>Only requires 1 scan over each input (not counting sort).</dd>
    </dl>
  </section>

  <section>
    <h3>Hash Functions</h3>

    <ul>
      <li>A hash function is a function that maps a large data value to a small fixed-size value
        <ul>
          <li>Typically is deterministic &amp; pseudorandom</li>
        </ul>
      </li>
      <li>Used in Checksums, Hash Tables, Partitioning, Bloom Filters, Caching, Cryptography, Password Storage, …</li>
      <li>Examples: MD5, SHA1, SHA2
        <ul>
          <li>MD5() part of OpenSSL (on most OSX / Linux / Unix)</li>
        </ul>
      </li>
      <li>Can map h(k) to range [0,N) with h(k) % N (modulus)</li>
    </ul>
  </section>

  <section>
    <h3>Hash Functions</h3>

    <!-- The list is a sibling of the paragraph: a <ul> cannot be nested inside <p>. -->
    <p style="margin-top: 50px">
      $$h(X) \mod N$$
    </p>

    <ul>
      <li>Pseudorandom output in $[0, N)$</li>
      <li>Always the same output for a given $X$</li>
    </ul>
  </section>

  <section>
    <h3>1-Pass Hash Join</h3>
    <svg data-src="graphics/2018-02-12-Join-1PassHash.svg" />
  </section>

  <section>
    <h3>1-Pass Hash Join</h3>
    <dl>
      <dt>Limited Queries</dt>
      <dd>Only supports join conditions of the form $R.A = S.B$</dd>

      <dt>Moderate-High Memory</dt>
      <dd>Keeps 1 relation in memory</dd>

      <dt>Low Added CPU/IO Cost</dt>
      <dd>Only requires 1 scan over each input.</dd>
    </dl>
    <p class="fragment">Can use other in-memory indexes to support other join conditions.</p>
  </section>

  <section>
    <h3>2-Pass Hash Join</h3>
    <svg data-src="graphics/2018-02-12-Join-2PassHash.svg" />
  </section>

  <section>
    <h3>2-Pass Hash Join</h3>
    <dl>
      <dt>Limited Queries</dt>
      <dd>Only supports join conditions of the form $R.A = S.B$</dd>

      <dt>Low Memory</dt>
      <dd>Never need more than 1 pair of partitions in memory</dd>

      <dt>High IO Cost</dt>
      <dd>Every record gets written out to disk, and back in.</dd>
    </dl>
    <p class="fragment">Can partition on data-values to support other types of queries.</p>
  </section>

  <section>
    <p>Why is it important that the hash function is pseudorandom?</p>
  </section>
</section>

<!-- Closing slide -->
<section>
  <h3>Next Class</h3>
  <p style="margin-top: 100px">More operators, More algorithms</p>
</section>

</div></div>

<script src="../reveal.js-3.6.0/js/reveal.js"></script>

<script>

  // Full list of configuration options available at:
  // https://github.com/hakimel/reveal.js#configuration
  Reveal.initialize({
    controls: true,
    progress: true,
    history: true,
    center: true,
    slideNumber: true,

    transition: 'fade', // none/fade/slide/convex/concave/zoom

    // Styling defaults passed through to the chart plugin loaded in `dependencies`.
    chart: {
      defaults: {
        global: {
          title: { fontColor: "#333", fontSize: 24 },
          legend: {
            labels: { fontColor: "#333", fontSize: 20 },
          },
          responsiveness: true
        },
        scale: {
          scaleLabel: { fontColor: "#333", fontSize: 20 },
          gridLines: { color: "#333", zeroLineColor: "#333" },
          ticks: { fontColor: "#333", fontSize: 16 },
        }
      },
      line: { borderColor: [ "rgba(20,220,220,.8)" , "rgba(220,120,120,.8)", "rgba(20,120,220,.8)" ], "borderDash": [ [5,10], [0,0] ]},
      bar: { backgroundColor: [
        "rgba(220,220,220,0.8)",
        "rgba(151,187,205,0.8)",
        "rgba(205,151,187,0.8)",
        "rgba(187,205,151,0.8)"
      ]
      },
      pie: { backgroundColor: [ ["rgba(0,0,0,.8)" , "rgba(220,20,20,.8)", "rgba(20,220,20,.8)", "rgba(220,220,20,.8)", "rgba(20,20,220,.8)"] ]},
      radar: { borderColor: [ "rgba(20,220,220,.8)" , "rgba(220,120,120,.8)", "rgba(20,120,220,.8)" ]},
    },

    // Optional reveal.js plugins
    dependencies: [
      { src: '../reveal.js-3.6.0/lib/js/classList.js', condition: function() { return !document.body.classList; } },
      { src: '../reveal.js-3.6.0/plugin/math/math.js',
        condition: function() { return true; },
        mathjax: '../reveal.js-3.6.0/js/MathJax.js'
      },
      // The markdown and highlight plugins load only when the page has matching elements.
      { src: '../reveal.js-3.6.0/plugin/markdown/marked.js', condition: function() { return !!document.querySelector( '[data-markdown]' ); } },
      { src: '../reveal.js-3.6.0/plugin/markdown/markdown.js', condition: function() { return !!document.querySelector( '[data-markdown]' ); } },
      { src: '../reveal.js-3.6.0/plugin/highlight/highlight.js', async: true, condition: function() { return !!document.querySelector( 'pre code' ); }, callback: function() { hljs.initHighlightingOnLoad(); } },
      { src: '../reveal.js-3.6.0/plugin/zoom-js/zoom.js', async: true },
      { src: '../reveal.js-3.6.0/plugin/notes/notes.js', async: true },
      // Chart.min.js
      { src: '../reveal.js-3.6.0/plugin/chart/Chart.min.js'},
      // the plugin
      { src: '../reveal.js-3.6.0/plugin/chart/csv2chart.js'},
      { src: '../reveal.js-3.6.0/plugin/svginline/es6-promise.auto.js', async: false },
      { src: '../reveal.js-3.6.0/plugin/svginline/data-src-svg.js', async: false }
    ]
  });

</script>

</body>
</html>