Website/slides/cse4562sp2018/2018-02-12-Algorithms.html

<!doctype html>
<html lang="en">

	<head>
		<meta charset="utf-8">

		<title>CSE 4/562 - Spring 2018</title>

		<meta name="description" content="CSE 4/562 - Spring 2018">
		<meta name="author" content="Oliver Kennedy">

		<meta name="apple-mobile-web-app-capable" content="yes" />
		<meta name="apple-mobile-web-app-status-bar-style" content="black-translucent" />

		<meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no, minimal-ui">

		<link rel="stylesheet" href="../reveal.js-3.6.0/css/reveal.css">
		<link rel="stylesheet" href="ubodin.css" id="theme">

		<!-- Code syntax highlighting -->
		<link rel="stylesheet" href="../reveal.js-3.6.0/lib/css/zenburn.css">

		<!-- Printing and PDF exports -->
		<script>
			// Inject the print stylesheet at load time: the PDF-export sheet when the
			// query string contains "print-pdf" (any letter case), the paper sheet otherwise.
			var link = document.createElement( 'link' );
			link.rel = 'stylesheet';
			link.type = 'text/css';
			if( /print-pdf/i.test( window.location.search ) ) {
				link.href = '../reveal.js-3.6.0/css/print/pdf.css';
			}
			else {
				link.href = '../reveal.js-3.6.0/css/print/paper.css';
			}
			document.getElementsByTagName( 'head' )[0].appendChild( link );
		</script>

    <script src="../reveal.js-3.6.0/lib/js/head.min.js"></script>

		<!--[if lt IE 9]>
		<script src="../reveal.js-3.6.0/lib/js/html5shiv.js"></script>
		<![endif]-->
	</head>

	<body>

		<div class="reveal">
		<!-- Any section element inside of this container is displayed as a slide -->

		<div class="header">
			<!-- Any Talk-Specific Header Content Goes Here -->
			CSE 4/562 - Database Systems
		</div>

		<div class="slides">

      <section>
        <h1>Query Evaluation</h1>
        <h3>CSE 4/562 – Database Systems</h3>
        <h5>February 12, 2018</h5>
      </section>

      <section>
        <section>
          <h3>Query Evaluation Styles</h3>

          <dl>
            <dt class="fragment highlight-grey" data-fragment-index="2">All-At-Once (Collections)</dt>
            <dd class="fragment highlight-grey" data-fragment-index="2">Bottom-up, one operator at a time.</dd>

            <dt>Volcano-Style (Iterators)</dt>
            <dd>Operators "request" one tuple at a time from children.</dd>

            <dt class="fragment highlight-grey" data-fragment-index="1">Push-Style (Buffers)</dt>
            <dd class="fragment highlight-grey" data-fragment-index="1">Operators continuously produce/consume tuples.</dd>
          </dl>
        </section>

        <section>
          <h3>Basic Mindset</h3>
          <img src="graphics/2018-02-05-RA-Tree.svg" alt="Relational algebra operator tree" style="display: inline-block; vertical-align: middle;" />
          <pre style="display: inline-block; vertical-align: middle; margin-left: 20px; width:550px;"><code class="python">
  r = get_table("R")

  s = get_table("S")

  temp1 = apply_join(r, s, "R.B = S.B")

  temp2 = apply_select(temp1, "S.C = 10")

  result = apply_projection(temp2, "R.A")
          </code></pre>
        </section>

        <section>
          <h3>Basic Mindset</h3>
          <pre><code class="python">
      def build_tree(operator):

        if """ operator is a base table """:
          return get_table(...)

        elif """ operator is a selection """:
          return apply_select(operator.child, operator.condition)

        elif """ handle remaining cases similarly """:
          </code></pre>
        </section>

        <section>
          <h3>Select</h3>

          <p class="fragment" style="display: inline-block; vertical-align: middle; margin-right: 100px">
            $$\sigma_{A \neq 3} R$$
          </p>
          <table style="display: inline-block; vertical-align: middle;">
            <tr><th>A</th><th>B</th></tr>
            <tr><td>1</td><td>2</td></tr>
            <tr class="fragment highlight-grey"><td>3</td><td>4</td></tr>
            <tr><td>5</td><td>6</td></tr>
          </table>
        </section>

        <section>
          <h3>Select</h3>

          <pre><code class="python">
                  def apply_select(input, condition):
                    result = []

                    for row in input:
                      if condition(row):
                        result += [row]

                    return result
          </code></pre>
          <p class="fragment">(All-At-Once)</p>
        </section>

        <section>
          <h3>Select</h3>

          <p style="display: inline-block; vertical-align: middle; margin-right: 100px">
            $$\sigma_{A \neq 3} R$$
          </p>
          <table style="display: inline-block; vertical-align: middle; font-size: 80%">
            <tr><th>A</th><th>B</th><td></td></tr>
            <tr class="fragment"><td colspan="2"><code>getNext()</code></td><td style="text-align: left"><code>for row in input:</code></td></tr>
            <tr class="fragment"><td>1</td><td>2</td><td class="fragment" style="color: green; text-align: left;"><code style="margin-left: 30px;">return row;</code></td></tr>
            <tr class="fragment"><td colspan="2"><code>getNext()</code></td><td style="text-align: left"><code>for row in input:</code></td></tr>
            <tr class="fragment"><td>3</td><td>4</td><td class="fragment" style="color: red; text-align: left;"><span style="margin-left: 30px;">X</span></td></tr>
            <tr class="fragment" ><td>5</td><td>6</td><td class="fragment" style="color: green; text-align: left;"><code style="margin-left: 30px;">return row;</code></td></tr>
            <tr class="fragment"><td colspan="2"><code>getNext()</code></td><td style="text-align: left"><code>for row in input:</code></td></tr>
            <tr class="fragment"><td colspan="2"><code>None</code></td><td class="fragment" style="color: red; text-align: left;"><code>return None;</code></td></tr>
          </table>
        </section>

        <section>
          <!-- 2018-OK: Idea for future revision.
                        Introducing these algorithms alone is kind of a boring info-dump.  A good way to
                        spice this segment up would be to combine it with some basic algorithm analysis.

                        Introduce a framework for analysis
                          - Memory Size
                          - Compute Requirement
                          - IO Requirement

                        In particular, spend a bunch of time setting up the framework first.  We'll need to
                        cover attribution of costs to individual operators (e.g., Select doesn't introduce IOs)
                        Also worth covering: IOs vs Record Reads.
          -->
          <h3>Select</h3>
          <svg data-src="graphics/2018-02-12-Flow-Select.svg" />
        </section>

        <section>
          <h3>Project</h3>
          <svg data-src="graphics/2018-02-12-Flow-Project.svg" />
        </section>

        <section>
          <h3>Union</h3>
          <svg data-src="graphics/2018-02-12-Flow-Union.svg" />
        </section>

        <section>
          <h3>Cross</h3>
          <pre><code class="python">
                    def apply_cross(lhs, rhs):
                      result = []

                      for r in lhs:
                        for s in rhs:
                          result += [r + s]

                      return result
          </code></pre>
        </section>

        <section>
          <h3>Cross</h3>
          <svg data-src="graphics/2018-02-12-Flow-Cross.svg" />
        </section>

        <section>
          <p>What's the complexity of this cross-product algorithm?</p>
          <p>... in terms of compute</p>
          <p>... in terms of IOs</p>
        </section>
      </section>

      <section>
        <section>
          <h3>Cross Product Problems</h3>
          <dl>
            <dt>Need to scan the inner relation multiple times!</dt>
            <dd class="fragment">Load data intelligently to mitigate expensive IOs</dd>

            <dt>Every tuple needs to be paired with every other tuple!</dt>
            <dd class="fragment">Exploit join conditions to minimize pairs of tuples</dd>
          </dl>
        </section>

        <section>
          <!-- 2018-OK: This slide was a little awkward...
                        In particular, it doesn't map directly to the iterator model that
                        we've established so far in the lecture (at least not unless you
                        can mentally map imperative to continuation programming).

                        Suggestion: Extend this with a state machine as before, or somehow
                        tie it back to the original model.  If nothing else, start with a
                        transition back to the all-at-once model.
          -->
          <h3>Preloading Data</h3>
          <p class="fragment">Nested-Loop Join</p>
          <pre><code class="python">
                    def apply_cross(lhs, rhs):
                      result = []

                      while r = lhs.next():
                        while s = rhs.next():
                          result += [r + s]
                        rhs.reset()

                      return result
          </code></pre>
        </section>

        <section>
          <h3>Nested-Loop Join</h3>
          <svg data-src="graphics/2018-02-12-Join-NLJ.svg" />
        </section>

        <section>
          <p><b>Problem</b>: We need to evaluate the <code>rhs</code> iterator<br/> once per record in <code>lhs</code></p>
        </section>

        <section>
          <h3>Preloading Data</h3>

          <p><b>Naive Solution</b>: Preload records from <code>rhs</code></p>
          <pre><code class="python">
                    def apply_cross(lhs, rhs):
                      result = []
                      rhs_preloaded = []

                      while s = rhs.next():
                        rhs_preloaded += [s]

                      while r = lhs.next():
                        for s in rhs_preloaded:
                          result += [r + s]

                      return result
          </code></pre>

          <p class="fragment">Any problems with this?</p>
        </section>

        <section>
          <h3>Preloading Data</h3>

          <p><b>Better Solution</b>: Load both <code>lhs</code> and <code>rhs</code> records in blocks.</p>

          <pre><code class="python">
                    def apply_cross(lhs, rhs):
                      result = []

                      while r_block = lhs.take(100):
                        while s_block = rhs.take(100):
                          for r in r_block:
                            for s in s_block:
                              result += [r + s]
                        rhs.reset()

                      return result
          </code></pre>
        </section>

        <section>
          <h3>Block-Nested Loop Join</h3>
          <svg data-src="graphics/2018-02-12-Join-BNLJ.svg" class="stretch" />
        </section>

        <section>
          <!-- 2018-OK: Missed opportunity on the slides...
                        Specifically, we could do a little more analysis of the inner loop.  Maybe spend a slide
                        on an animation showing the overall cost of BNLJ?

          -->
          <p>How big should the blocks be?</p>

          <p class="fragment">What is the IO complexity of the algorithm?</p>
        </section>

        <section>
          <h3>Join Conditions</h3>
          <svg data-src="graphics/2018-02-12-Join-Grid.svg" />
          <p class="fragment"><b>Problem</b>: Naively, any tuple matches any other</p>
        </section>

        <section>
          <h3>Join Conditions</h3>
          <svg data-src="graphics/2018-02-12-Join-OrderGrid.svg" />
          <p><b>Solution</b>: First organize the data</p>
        </section>

      </section>

      <section>

        <section>
          <h3>Strategies for Implementing $R \bowtie_{R.A = S.A} S$</h3>

          <dl>
            <dt>Sort/Merge Join</dt>
            <dd>Sort all of the data upfront, then scan over both sides.</dd>

            <dt>In-Memory Index Join (1-pass Hash; Hash Join)</dt>
            <dd>Build an in-memory index on one table, scan the other.</dd>

            <dt>Partition Join (2-pass Hash; External Hash Join)</dt>
            <dd>Partition both sides so that tuples don't join across partitions.</dd>
          </dl>
        </section>

        <section>
          <!-- 2018-OK: The motivation for this algorithm fell completely flat.
                        It might help if we approach SortMerge with IOs/Mem/CPU in mind.
                        The slide also deserves some discussion of *which* conditions it
                        can be used to support efficiently.

                        It might also help to discuss use cases where it's appropriate.
          -->
          <h3>Sort/Merge Join</h3>
          <svg data-src="graphics/2018-02-12-Join-SortMerge.svg" />
        </section>

        <section>
          <h3>Sort/Merge Join</h3>
          <dl>
            <dt>Limited Queries</dt>
            <dd>Only supports join conditions of the form $R.A = S.B$</dd>

            <dt>Low Memory</dt>
            <dd>Only needs to keep ~2 rows in memory at a time (not counting sort).</dd>

            <dt>Low Added CPU/IO Cost</dt>
            <dd>Only requires 1 scan over each input (not counting sort).</dd>
          </dl>
        </section>

        <section>
          <h3>Hash Functions</h3>

          <ul>
            <li>A hash function is a function that maps a large data value to a small fixed-size value<ul>
              <li>Typically is deterministic &amp; pseudorandom</li>
            </ul></li>
            <li>Used in Checksums, Hash Tables, Partitioning, Bloom Filters, Caching, Cryptography, Password Storage, …</li>
            <li>Examples: MD5, SHA1, SHA2<ul>
              <li>MD5() part of OpenSSL (on most OSX / Linux / Unix)</li>
            </ul></li>
            <li>Can map h(k) to range [0,N) with h(k) % N (modulus)</li>
          </ul>
        </section>

        <section>
          <h3>Hash Functions</h3>

          <div style="margin-top: 50px">
            $$h(X) \mod N$$

            <ul>
              <li>Pseudorandom output between $[0, N)$</li>
              <li>Always the same output for a given $X$</li>
            </ul>
          </div>
        </section>

        <section>
          <h3>1-Pass Hash Join</h3>
          <svg data-src="graphics/2018-02-12-Join-1PassHash.svg" />
        </section>

        <section>
          <h3>1-Pass Hash Join</h3>
          <dl>
            <dt>Limited Queries</dt>
            <dd>Only supports join conditions of the form $R.A = S.B$</dd>

            <dt>Moderate-High Memory</dt>
            <dd>Keeps 1 relation in memory</dd>

            <dt>Low Added CPU/IO Cost</dt>
            <dd>Only requires 1 scan over each input.</dd>
          </dl>
          <p class="fragment">Can use other in-memory indexes to support other join conditions.</p>
        </section>

        <section>
          <h3>2-Pass Hash Join</h3>
          <svg data-src="graphics/2018-02-12-Join-2PassHash.svg" />
        </section>

        <section>
          <h3>2-Pass Hash Join</h3>
          <dl>
            <dt>Limited Queries</dt>
            <dd>Only supports join conditions of the form $R.A = S.B$</dd>

            <dt>Low Memory</dt>
            <dd>Never need more than 1 pair of partitions in memory</dd>

            <dt>High IO Cost</dt>
            <dd>Every record gets written out to disk, and back in.</dd>
          </dl>
          <p class="fragment">Can partition on data-values to support other types of queries.</p>
        </section>

        <section>
          <p>Why is it important that the hash function is pseudorandom?</p>
        </section>
      </section>

      <section>
        <h3>Next Class</h3>
        <p style="margin-top: 100px">More operators, More algorithms</p>
      </section>

		</div></div>

		<script src="../reveal.js-3.6.0/js/reveal.js"></script>

		<script>

			// Configure and start the presentation.
			// Full list of configuration options available at:
			// https://github.com/hakimel/reveal.js#configuration
			Reveal.initialize({
				controls: true,
				progress: true,
				history: true,
				center: true,
				slideNumber: true,

				transition: 'fade', // none/fade/slide/convex/concave/zoom

        // Default chart styling. Presumably consumed by the chart plugin
        // (csv2chart.js) listed under `dependencies` below; confirm against
        // that plugin's documentation.
        chart: {
					defaults: {
						global: {
							title: { fontColor: "#333", fontSize: 24 },
							legend: {
								labels: { fontColor: "#333", fontSize: 20 },
							},
							// NOTE(review): Chart.js documents this option as `responsive`;
							// confirm that anything actually reads `responsiveness`.
							responsiveness: true
						},
						scale: {
							scaleLabel: { fontColor: "#333", fontSize: 20 },
							gridLines: { color: "#333", zeroLineColor: "#333" },
							ticks: { fontColor: "#333", fontSize: 16 },
						}
					},
					// Per-chart-type color palettes.
					line: { borderColor: [ "rgba(20,220,220,.8)" , "rgba(220,120,120,.8)", "rgba(20,120,220,.8)" ], "borderDash": [ [5,10], [0,0] ]},
					bar: { backgroundColor: [
  						"rgba(220,220,220,0.8)",
    					"rgba(151,187,205,0.8)",
    					"rgba(205,151,187,0.8)",
    					"rgba(187,205,151,0.8)"
    				]
    			},
					pie: { backgroundColor: [ ["rgba(0,0,0,.8)" , "rgba(220,20,20,.8)", "rgba(20,220,20,.8)", "rgba(220,220,20,.8)", "rgba(20,20,220,.8)"] ]},
					radar: { borderColor: [ "rgba(20,220,220,.8)" , "rgba(220,120,120,.8)", "rgba(20,120,220,.8)" ]},
				},

				// Optional reveal.js plugins.
				// Each entry names a script to load; per the reveal.js README, an entry
				// with a `condition` is only loaded when that function returns true.
				dependencies: [
					// classList polyfill, only when the browser lacks document.body.classList
					{ src: '../reveal.js-3.6.0/lib/js/classList.js', condition: function() { return !document.body.classList; } },
					// Math plugin, always loaded; `mathjax` points at the local MathJax copy
					{ src: '../reveal.js-3.6.0/plugin/math/math.js',
						condition: function() { return true; },
						mathjax: '../reveal.js-3.6.0/js/MathJax.js'
					 },
					// Markdown support, only when some element carries a data-markdown attribute
					{ src: '../reveal.js-3.6.0/plugin/markdown/marked.js', condition: function() { return !!document.querySelector( '[data-markdown]' ); } },
					{ src: '../reveal.js-3.6.0/plugin/markdown/markdown.js', condition: function() { return !!document.querySelector( '[data-markdown]' ); } },
					// Syntax highlighting, only when the page contains a <pre><code> block
					{ src: '../reveal.js-3.6.0/plugin/highlight/highlight.js', async: true, condition: function() { return !!document.querySelector( 'pre code' ); }, callback: function() { hljs.initHighlightingOnLoad(); } },
					{ src: '../reveal.js-3.6.0/plugin/zoom-js/zoom.js', async: true },
					{ src: '../reveal.js-3.6.0/plugin/notes/notes.js', async: true },
			    // Chart.js library
			    { src: '../reveal.js-3.6.0/plugin/chart/Chart.min.js'},
			    // Chart plugin (csv2chart)
			    { src: '../reveal.js-3.6.0/plugin/chart/csv2chart.js'},
          // Promise polyfill followed by the data-src-svg script, both loaded with
          // async: false. Presumably this is what handles the <svg data-src="...">
          // elements in the slides; confirm against the plugin source.
          { src: '../reveal.js-3.6.0/plugin/svginline/es6-promise.auto.js', async: false },
          { src: '../reveal.js-3.6.0/plugin/svginline/data-src-svg.js', async: false }
				]
			});

		</script>

	</body>
</html>