Cost based optimization 2 slides

2018-03-04 15:00:02 -05:00 · 2018-03-04 15:00:02 -05:00 · e8870d34dc
parent 5c8e187858
commit e8870d34dc
2 changed files with 574 additions and 1 deletions
--- a/slides/cse4562sp2018/2018-02-28-CostBasedOptimization1.html
+++ b/slides/cse4562sp2018/2018-02-28-CostBasedOptimization1.html
@ -159,7 +159,7 @@
            </tr>
            <tr class="fragment" data-fragment-index="6">
              <td>Union</td>
-              <td>$R \cup S$</td>
+              <td>$R \uplus S$</td>
              <td>$0$</td>
              <td>$O(1)$</td>
            </tr>
--- a/slides/cse4562sp2018/2018-03-05-CostBasedOptimization2.html
+++ b/slides/cse4562sp2018/2018-03-05-CostBasedOptimization2.html
@ -0,0 +1,573 @@
+<!doctype html>
+<html lang="en">
+
+  <head>
+    <meta charset="utf-8">
+
+    <title>CSE 4/562 - Spring 2018</title>
+
+    <meta name="description" content="CSE 4/562 - Spring 2018">
+    <meta name="author" content="Oliver Kennedy">
+
+    <meta name="apple-mobile-web-app-capable" content="yes" />
+    <meta name="apple-mobile-web-app-status-bar-style" content="black-translucent" />
+
+    <meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no, minimal-ui">
+
+    <link rel="stylesheet" href="../reveal.js-3.6.0/css/reveal.css">
+    <link rel="stylesheet" href="ubodin.css" id="theme">
+
+    <!-- Code syntax highlighting -->
+    <link rel="stylesheet" href="../reveal.js-3.6.0/lib/css/zenburn.css">
+
+    <!-- Printing and PDF exports -->
+    <script>
+      var link = document.createElement( 'link' );
+      link.rel = 'stylesheet';
+      link.type = 'text/css';
+      link.href = window.location.search.match( /print-pdf/gi ) ? '../reveal.js-3.6.0/css/print/pdf.css' : '../reveal.js-3.6.0/css/print/paper.css';
+      document.getElementsByTagName( 'head' )[0].appendChild( link );
+    </script>
+
+    <script src="../reveal.js-3.6.0/lib/js/head.min.js"></script>
+
+    <!--[if lt IE 9]>
+    <script src="../reveal.js-3.6.0/lib/js/html5shiv.js"></script>
+    <![endif]-->
+  </head>
+
+  <body>
+
+    <div class="reveal">
+    <!-- Any section element inside of this container is displayed as a slide -->
+
+    <div class="header">
+      <!-- Any Talk-Specific Header Content Goes Here -->
+      CSE 4/562 - Database Systems
+    </div>
+
+    <div class="slides">
+
+      <section>
+        <h1>Cost Based Optimization</h1>
+        <h3>CSE 4/562 – Database Systems</h3>
+        <h5>March 5, 2018</h5>
+      </section>
+      <!-- ============================================ -->
+
+      <section>
+        <section>
+          <h3>Remember the Real Goals</h3>
+          <ol>
+            <li>Accurately <b>rank</b> the plans.</li>
+            <li>Don't spend more time optimizing than you get back.</li>
+            <li>Don't pick a plan that uses more memory than you have.</li>
+          </ol>
+        </section>
+
+        <section>
+          <h3>Accounting</h3>
+          <p style="margin-top: 50px;">Figure out the cost of each <b>individual</b> operator.</p>
+          <p style="margin-top: 50px;">Only count the number of IOs <b>added</b> by each operator.</p>
+        </section>
+
+        <section>
+          <table style="font-size: 70%">
+            <tr><th>Operation</th><th>RA</th><th>IOs Added (#pages)</th><th>Memory (#tuples)</th></tr>
+            <tr>
+              <td>Table Scan</td>
+              <td>$R$</td>
+              <td>$\frac{|R|}{\mathcal P}$</td>
+              <td>$O(1)$</td>
+            </tr>
+            <tr>
+              <td>Projection</td>
+              <td>$\pi(R)$</td>
+              <td>$0$</td>
+              <td>$O(1)$</td>
+            </tr>
+            <tr>
+              <td>Selection</td>
+              <td>$\sigma(R)$</td>
+              <td>$0$</td>
+              <td>$O(1)$</td>
+            </tr>
+            <tr>
+              <td>Union</td>
+              <td>$R \uplus S$</td>
+              <td>$0$</td>
+              <td>$O(1)$</td>
+            </tr>
+            <tr>
+              <td style="vertical-align: middle;">Sort <span>(In-Mem)</span></td>
+              <td style="vertical-align: middle;">$\tau(R)$</td>
+              <td>$0$</td>
+              <td>$O(|R|)$</td>
+            </tr>
+            <tr>
+              <td>Sort (On-Disk)</td>
+              <td>$\tau(R)$</td>
+              <td>$\frac{2 \cdot \lfloor \log_{\mathcal B}(|R|) \rfloor}{\mathcal P}$</td>
+              <td>$O(\mathcal B)$</td>
+            </tr>
+            <tr>
+              <td><span>(B+Tree)</span> Index Scan</td>
+              <td>$Index(R, c)$</td>
+              <td>$\log_{\mathcal I}(|R|) + \frac{|\sigma_c(R)|}{\mathcal P}$</td>
+              <td>$O(1)$</td>
+            </tr>
+            <tr>
+              <td>(Hash) Index Scan</td>
+              <td>$Index(R, c)$</td>
+              <td>$1$</td>
+              <td>$O(1)$</td>
+            </tr>
+          </table>
+
+          <ol style="font-size: 50%; margin-top: 50px;">
+            <li>Tuples per Page ($\mathcal P$) <span>– Normally defined per-schema</span></li>
+            <li>Size of $R$ ($|R|$)</li>
+            <li>Pages of Buffer ($\mathcal B$)</li>
+            <li>Keys per Index Page ($\mathcal I$)</li>
+          </ol>
+        </section>
+        <section>
+          <table style="font-size: 70%">
+            <tr><th width="300px">Operation</th><th>RA</th><th>IOs Added (#pages)</th><th>Memory (#tuples)</th></tr>
+            <tr>
+              <td style="font-size: 60%">Nested Loop Join <span>(Buffer $S$ in mem)</span></td>
+              <td>$R \times S$</td>
+              <td>$0$</td>
+              <td>$O(|S|)$</td>
+            </tr>
+            <tr>
+              <td style="font-size: 60%">Nested Loop Join (Buffer $S$ on disk)</td>
+              <td>$R \times_{disk} S$</td>
+              <td>$(1+ |R|) \cdot \frac{|S|}{\mathcal P}$</td>
+              <td>$O(1)$</td>
+            </tr>
+            <tr>
+              <td>1-Pass Hash Join</td>
+              <td>$R \bowtie_{1PH, c} S$</td>
+              <td>$0$</td>
+              <td>$O(|S|)$</td>
+            </tr>
+            <tr>
+              <td>2-Pass Hash Join</td>
+              <td>$R \bowtie_{2PH, c} S$</td>
+              <td>$\frac{2|R| + 2|S|}{\mathcal P}$</td>
+              <td>$O(1)$</td>
+            </tr>
+            <tr>
+              <td>Sort-Merge Join </td>
+              <td>$R \bowtie_{SM, c} S$</td>
+              <td>[Sort]</td>
+              <td>[Sort]</td>
+            </tr>
+            <tr>
+              <td><span>(Tree)</span> Index NLJ</td>
+              <td>$R \bowtie_{INL, c} S$</td>
+              <td>$|R| \cdot (\log_{\mathcal I}(|S|) + \frac{|\sigma_c(S)|}{\mathcal P})$</td>
+              <td>$O(1)$</td>
+            </tr>
+            <tr>
+              <td>(Hash) Index NLJ</td>
+              <td>$R \bowtie_{INL, c} S$</td>
+              <td>$|R| \cdot 1$</td>
+              <td>$O(1)$</td>
+            </tr>
+            <tr>
+              <td><span>(In-Mem)</span> Aggregate</td>
+              <td>$\gamma_A(R)$</td>
+              <td>$0$</td>
+              <td>$adom(A)$</td>
+            </tr>
+            <tr>
+              <td style="font-size: 90%">(Sort/Merge) Aggregate</td>
+              <td>$\gamma_A(R)$</td>
+              <td>[Sort]</td>
+              <td>[Sort]</td>
+            </tr>
+          </table>
+
+          <ol style="font-size: 50%;">
+            <li>Tuples per Page ($\mathcal P$) <span>– Normally defined per-schema</span></li>
+            <li>Size of $R$ ($|R|$)</li>
+            <li>Pages of Buffer ($\mathcal B$)</li>
+            <li>Keys per Index Page ($\mathcal I$)</li>
+            <li>Number of distinct values of $A$ ($adom(A)$)</li>
+          </ol>
+        </section>
+      </section>
+
+      <!-- ============================================ -->
+
+      <section>
+        <section>
+          <p>Estimating IOs requires Estimating $|Q(R)|$</p>
+        </section>
+
+        <section>
+          <h3>Cardinality Estimation</h3>
+          <p class="fragment">Unlike estimating IOs, cardinality estimation doesn't care about the algorithm, so we'll just be working with raw RA.</p>
+
+          <p class="fragment">Also unlike estimating IOs, we care about the cardinality of $Q(R)$ as a whole, rather than the contribution of each individual operator.</p>
+        </section>
+
+        <section>
+          <table style="font-size: 70%">
+            <tr>
+              <th>Operator</th>
+              <th>RA</th>
+              <th>Estimated Size</th>
+            </tr>
+
+            <tr>
+              <td>Table</td>
+              <td>$R$</td>
+              <td class="fragment" data-fragment-index="1">$|R|$</td>
+            </tr>
+
+            <tr>
+              <td>Projection</td>
+              <td>$\pi(Q)$</td>
+              <td class="fragment" data-fragment-index="2">$|Q|$</td>
+            </tr>
+
+            <tr>
+              <td>Union</td>
+              <td>$Q_1 \uplus Q_2$</td>
+              <td class="fragment" data-fragment-index="3">$|Q_1| + |Q_2|$</td>
+            </tr>
+
+            <tr>
+              <td>Cross Product</td>
+              <td>$Q_1 \times Q_2$</td>
+              <td class="fragment" data-fragment-index="4">$|Q_1| \times |Q_2|$</td>
+            </tr>
+
+            <tr>
+              <td>Sort</td>
+              <td>$\tau(Q)$</td>
+              <td class="fragment" data-fragment-index="5">$|Q|$</td>
+            </tr>
+
+            <tr>
+              <td>Limit</td>
+              <td>$\texttt{LIMIT}_N(Q)$</td>
+              <td class="fragment" data-fragment-index="6">$N$</td>
+            </tr>
+
+            <tr>
+              <td>Selection</td>
+              <td>$\sigma_c(Q)$</td>
+              <td class="fragment" data-fragment-index="8">$|Q| \times \texttt{SEL}(c, Q)$</td>
+            </tr>
+
+            <tr>
+              <td>Join</td>
+              <td>$Q_1 \bowtie_c Q_2$</td>
+              <td class="fragment" data-fragment-index="9">$|Q_1| \times |Q_2| \times \texttt{SEL}(c, Q_1\times Q_2)$</td>
+            </tr>
+
+            <tr>
+              <td>Distinct</td>
+              <td>$\delta_A(Q)$</td>
+              <td class="fragment" data-fragment-index="11">$\texttt{UNIQ}(A, Q)$</td>
+            </tr>
+
+            <tr>
+              <td>Aggregate</td>
+              <td>$\gamma_{A, B \leftarrow \Sigma}(Q)$</td>
+              <td class="fragment" data-fragment-index="12">$\texttt{UNIQ}(A, Q)$</td>
+            </tr>
+          </table>
+
+          <ul style="font-size: 50%; margin-top: 20px">
+            <li class="fragment" data-fragment-index="7">$\texttt{SEL}(c, Q)$: Selectivity of $c$ on $Q$, or $\frac{|\sigma_c(Q)|}{|Q|}$</li>
+            <li class="fragment" data-fragment-index="10">$\texttt{UNIQ}(A, Q)$: # of distinct values of $A$ in $Q$.</li>
+          </ul>
+        </section>
+
+        <!-- 2018 by OK: 
+          Things to cover:
+            - Defaults: The 10% rule
+            - Basic Assumptions: 
+              - Selectivity: MIN/MAX+COUNT, Uniform distribution, No correlations
+              - Unique Values: COUNT DISTINCT, No correlations
+            - Histograms: Nonuniform distributions
+            - Constraints: Keys, FDs, FKey (implications for Joins)
+        -->
+
+        <section>
+          <h3>Cardinality Estimation</h3>
+          <h4>(The Hard Parts)</h4>
+
+          <dl>
+            <dt style="margin-top: 50px;">$\sigma_c(Q)$ (Selectivity Estimation)</dt>
+            <dd>How many tuples will a condition $c$ allow to pass?</dd>
+
+            <dt style="margin-top: 50px;">$\delta_A(Q)$ (Distinct Values Estimation)</dt>
+            <dd>How many distinct values of attribute(s) $A$ exist?</dd>
+          </dl>
+        </section>
+
+        <section>
+          <h3>Remember the Real Goals</h3>
+          <ol>
+            <li>Accurately <b>rank</b> the plans.</li>
+            <li>Don't spend more time optimizing than you get back.</li>
+          </ol>
+        </section>
+
+        <section>
+          <h3>(Some) Estimation Techniques</h3>
+
+          <dl style="font-size: 80%">
+            <div class="fragment">
+              <dt>Guess Randomly</dt>
+              <dd>Rules of thumb if you have no other options...</dd>
+            </div>
+
+            <div class="fragment">
+              <dt>Uniform Prior</dt>
+              <dd>Use basic statistics to make a very rough guess.</dd>
+            </div>
+
+            <div class="fragment">
+              <dt>Sampling / History</dt>
+              <dd>Small, Quick Sampling Runs (or prior executions of the query).</dd>
+            </div>
+
+            <div class="fragment">
+              <dt>Histograms</dt>
+              <dd>Using more detailed statistics for improved guesses.</dd>
+            </div>
+
+            <div class="fragment">
+              <dt>Constraints</dt>
+              <dd>Using rules about the data for improved guesses.</dd>
+            </div>
+          </dl>
+        </section>
+      </section>
+
+      <!-- ============================================ -->
+
+      <section>
+        <section>
+          <h3>(Some) Estimation Techniques</h3>
+
+          <dl style="font-size: 80%">
+            <dt>Guess Randomly</dt>
+            <dd>Rules of thumb if you have no other options...</dd>
+
+            <dt style="color: grey;">Uniform Prior</dt>
+            <dd style="color: grey;">Use basic statistics to make a very rough guess.</dd>
+
+            <dt style="color: grey;">Sampling / History</dt>
+            <dd style="color: grey;">Small, Quick Sampling Runs (or prior executions of the query).</dd>
+
+            <dt style="color: grey;">Histograms</dt>
+            <dd style="color: grey;">Using more detailed statistics for improved guesses.</dd>
+
+            <dt style="color: grey;">Constraints</dt>
+            <dd style="color: grey;">Using rules about the data for improved guesses.</dd>
+          </dl>
+        </section>
+
+        <section>
+          <h3>The 10% Selectivity Rule</h3>
+
+          <p>Every select or distinct operator passes 10% of all rows.</p>
+
+          <div class="fragment">
+            $$\sigma_{A = 1 \wedge B = 2}(R)$$
+          </div>
+          <div class="fragment">
+            $$|\sigma_{A = 1 \wedge B = 2}(R)| = 0.1 \cdot |R|$$
+          </div>
+
+          <div class="fragment" style="margin-top: 50px;">
+            $$\sigma_{A = 1}(\sigma_{B = 2}(R))$$
+          </div>
+          <div class="fragment">
+            $$|\sigma_{A = 1}(\sigma_{B = 2}(R))| = 0.1 \cdot |\sigma_{B = 2}(R)| = 0.1 \cdot 0.1 \cdot |R|$$
+          </div>
+
+          <p class="fragment" style="font-size: 80%; font-weight: bold; margin-top: 50px;">(Queries are typically standardized first)</p>
+
+          <p class="fragment" style="font-size: 80%; font-weight: bold; margin-top: 20px;">(The specific % varies by DBMS.  E.g., Teradata uses 10% for the first <code>AND</code> clause, and 75% for every subsequent clause)</p>
+        </section>
+
+        <section>
+          <p>The 10% rule is a fallback when everything else fails. <br/> Usually, databases collect statistics...</p>
+        </section>
+      </section>
+
+      <!-- ============================================ -->
+
+      <section>
+        <section>
+          <h3>(Some) Estimation Techniques</h3>
+
+          <dl style="font-size: 80%">
+            <dt style="color: grey;">Guess Randomly</dt>
+            <dd style="color: grey;">Rules of thumb if you have no other options...</dd>
+
+            <dt>Uniform Prior</dt>
+            <dd>Use basic statistics to make a very rough guess.</dd>
+
+            <dt style="color: grey;">Sampling / History</dt>
+            <dd style="color: grey;">Small, Quick Sampling Runs (or prior executions of the query).</dd>
+
+            <dt style="color: grey;">Histograms</dt>
+            <dd style="color: grey;">Using more detailed statistics for improved guesses.</dd>
+
+            <dt style="color: grey;">Constraints</dt>
+            <dd style="color: grey;">Using rules about the data for improved guesses.</dd>
+          </dl>
+        </section>
+
+        <section>
+          <h3>Uniform Prior</h3>
+
+          <p style="text-align: left; margin-bottom: 0px; font-weight: bold;">We assume that for $\sigma_c(Q)$...</p>
+          <ol>
+            <li>Basic statistics are known about $Q$: <ul>
+              <li style="margin-top: 0px;"><code>COUNT(*)</code></li>
+              <li style="margin-top: 0px;"><code>COUNT(DISTINCT A)</code> (for each A)</li>
+              <li style="margin-top: 0px;"><code>MIN(A)</code>, <code>MAX(A)</code> (for each numeric A)</li>
+            </ul></li>
+            <li>Attribute values are uniformly distributed.</li>
+            <li>No inter-attribute correlations.</li>
+          </ol>
+          <p class="fragment" style="font-size: 80%; font-weight: bold; margin-top: 20px;">
+            If (1) fails, fall back to the 10% rule.  
+          </p>
+          <p class="fragment" style="font-size: 80%; font-weight: bold; margin-top: 0px;">
+            If (2) or (3) fails, it'll often still be a <i>good enough</i> estimate.
+          </p>
+        </section>
+
+        <section>
+          <h3>Some Conditions</h3>
+
+          <p>Selectivity is a probability ($\texttt{SEL}(c, Q) = P(c)$)</p>
+          <table style="font-size: 85%">
+            <tr class="fragment">
+              <td>$P(A = x_1)$</td>
+              <td>$=$</td>
+              <td class="fragment">$\frac{1}{\texttt{COUNT(DISTINCT A)}}$</td>
+            </tr>
+
+            <tr class="fragment">
+              <td>$P(A \in (x_1, x_2, \ldots, x_N))$</td>
+              <td>$=$</td>
+              <td class="fragment">$\frac{N}{\texttt{COUNT(DISTINCT A)}}$</td>
+            </tr>
+
+            <tr class="fragment">
+              <td>$P(A \leq x_1)$</td>
+              <td>$=$</td>
+              <td class="fragment">$\frac{x_1 - \texttt{MIN(A)}}{\texttt{MAX(A)} - \texttt{MIN(A)}}$</td>
+            </tr>
+
+            <tr class="fragment">
+              <td>$P(x_1 \leq A \leq x_2)$</td>
+              <td>$=$</td>
+              <td class="fragment">$\frac{x_2 - x_1}{\texttt{MAX(A)} - \texttt{MIN(A)}}$</td>
+            </tr>
+
+            <tr class="fragment">
+              <td>$P(A = B)$</td>
+              <td>$=$</td>
+              <td class="fragment" style="font-size: 60%">$\textbf{min}\left( \frac{1}{\texttt{COUNT(DISTINCT A)}}, \frac{1}{\texttt{COUNT(DISTINCT B)}} \right)$</td>
+            </tr>
+
+            <tr class="fragment">
+              <td>$P(c_1 \wedge c_2)$</td>
+              <td>$=$</td>
+              <td class="fragment" >$P(c_1) \cdot P(c_2)$</td>
+            </tr>
+
+            <tr class="fragment">
+              <td>$P(c_1 \vee c_2)$</td>
+              <td>$=$</td>
+              <td class="fragment" >$1 - (1 - P(c_1)) \cdot (1 - P(c_2))$</td>
+            </tr>
+          </table>
+
+          <p style="font-size: 60%">(With constants $x_1$, $x_2$, ...)</p>
+        </section>
+      </section>
+
+    </div></div>
+
+    <script src="../reveal.js-3.6.0/js/reveal.js"></script>
+
+    <script>
+
+      // Presentation setup. Full list of configuration options available at:
+      // https://github.com/hakimel/reveal.js#configuration
+      Reveal.initialize({
+        controls: false,
+        progress: true,
+        history: true,
+        center: true,
+        slideNumber: true,
+
+        transition: 'fade', // none/fade/slide/convex/concave/zoom
+
+        chart: { // NOTE(review): presumably read by the chart plugin listed in dependencies below — confirm
+          defaults: { 
+            global: { 
+              title: { fontColor: "#333", fontSize: 24 }, 
+              legend: {
+                labels: { fontColor: "#333", fontSize: 20 },
+              },
+              responsiveness: true // NOTE(review): Chart.js spells this option "responsive" — confirm this key is actually read
+            },
+            scale: { 
+              scaleLabel: { fontColor: "#333", fontSize: 20 }, 
+              gridLines: { color: "#333", zeroLineColor: "#333" }, 
+              ticks: { fontColor: "#333", fontSize: 16 }, 
+            } 
+          },
+          line: { borderColor: [ "rgba(20,220,220,.8)" , "rgba(220,120,120,.8)", "rgba(20,120,220,.8)" ], "borderDash": [ [5,10], [0,0] ]}, 
+          bar: { backgroundColor: [ 
+              "rgba(220,220,220,0.8)",
+              "rgba(151,187,205,0.8)",
+              "rgba(205,151,187,0.8)",
+              "rgba(187,205,151,0.8)"
+            ]
+          }, 
+          pie: { backgroundColor: [ ["rgba(0,0,0,.8)" , "rgba(220,20,20,.8)", "rgba(20,220,20,.8)", "rgba(220,220,20,.8)", "rgba(20,20,220,.8)"] ]},
+          radar: { borderColor: [ "rgba(20,220,220,.8)" , "rgba(220,120,120,.8)", "rgba(20,120,220,.8)" ]}, 
+        },
+
+        // Optional reveal.js plugins; an entry with a `condition` is loaded only when that function returns true
+        dependencies: [
+          { src: '../reveal.js-3.6.0/lib/js/classList.js', condition: function() { return !document.body.classList; } },
+          { src: '../reveal.js-3.6.0/plugin/math/math.js', 
+            condition: function() { return true; }, // unconditional: the math plugin is always requested
+            mathjax: '../reveal.js-3.6.0/js/MathJax.js' // NOTE(review): reveal.js documents the MathJax path under a top-level `math: { mathjax: ... }` option — confirm this per-dependency key is read
+           },
+          { src: '../reveal.js-3.6.0/plugin/markdown/marked.js', condition: function() { return !!document.querySelector( '[data-markdown]' ); } },
+          { src: '../reveal.js-3.6.0/plugin/markdown/markdown.js', condition: function() { return !!document.querySelector( '[data-markdown]' ); } },
+          { src: '../reveal.js-3.6.0/plugin/highlight/highlight.js', async: true, condition: function() { return !!document.querySelector( 'pre code' ); }, callback: function() { hljs.initHighlightingOnLoad(); } },
+          { src: '../reveal.js-3.6.0/plugin/zoom-js/zoom.js', async: true },
+          { src: '../reveal.js-3.6.0/plugin/notes/notes.js', async: true },
+          // Chart.min.js (the charting library)
+          { src: '../reveal.js-3.6.0/plugin/chart/Chart.min.js'},
+          // csv2chart.js (the chart plugin itself)
+          { src: '../reveal.js-3.6.0/plugin/chart/csv2chart.js'},
+          { src: '../reveal.js-3.6.0/plugin/svginline/es6-promise.auto.js', async: false },
+          { src: '../reveal.js-3.6.0/plugin/svginline/data-src-svg.js', async: false }
+        ]
+      });
+
+    </script>
+
+  </body>
+</html>