Website/slides/cse4562sp2018/2018-03-05-CostBasedOptimization2.html

<!doctype html>
<html lang="en">

  <head>
    <meta charset="utf-8">

    <title>CSE 4/562 - Spring 2018</title>

    <meta name="description" content="CSE 4/562 - Spring 2018">
    <meta name="author" content="Oliver Kennedy">

    <meta name="apple-mobile-web-app-capable" content="yes" />
    <meta name="apple-mobile-web-app-status-bar-style" content="black-translucent" />

    <meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no, minimal-ui">

    <link rel="stylesheet" href="../reveal.js-3.6.0/css/reveal.css">
    <link rel="stylesheet" href="ubodin.css" id="theme">

    <!-- Code syntax highlighting -->
    <link rel="stylesheet" href="../reveal.js-3.6.0/lib/css/zenburn.css">

    <!-- Printing and PDF exports -->
    <script>
      // Inject the print stylesheet at load time: the PDF-export sheet when the
      // URL query string mentions "print-pdf", the plain paper sheet otherwise.
      var link = document.createElement( 'link' );
      if ( window.location.search.match( /print-pdf/gi ) ) {
        link.href = '../reveal.js-3.6.0/css/print/pdf.css';
      } else {
        link.href = '../reveal.js-3.6.0/css/print/paper.css';
      }
      link.type = 'text/css';
      link.rel = 'stylesheet';
      // Attach to <head> only once the element is fully configured.
      document.getElementsByTagName( 'head' )[0].appendChild( link );
    </script>

    <script src="../reveal.js-3.6.0/lib/js/head.min.js"></script>

    <!--[if lt IE 9]>
    <script src="../reveal.js-3.6.0/lib/js/html5shiv.js"></script>
    <![endif]-->
  </head>

  <body>

    <div class="reveal">
    <!-- Any section element inside of this container is displayed as a slide -->

    <div class="header">
      <!-- Any Talk-Specific Header Content Goes Here -->
      CSE 4/562 - Database Systems
    </div>

    <div class="slides">

      <section>
        <h1>Cost Based Optimization</h1>
        <h3>CSE 4/562 – Database Systems</h3>
        <h5>March 5-7, 2018</h5>
      </section>
      <!-- ============================================ -->

      <section>
        <section>
          <h3>Remember the Real Goals</h3>
          <ol>
            <li>Accurately <b>rank</b> the plans.</li>
            <li>Don't spend more time optimizing than you get back.</li>
            <li>Don't pick a plan that uses more memory than you have.</li>
          </ol>
        </section>

        <section>
          <h3>Accounting</h3>
          <p style="margin-top: 50px;">Figure out the cost of each <b>individual</b> operator.</p>
          <p style="margin-top: 50px;">Only count the number of IOs <b>added</b> by each operator.</p>
        </section>

        <section>
          <table style="font-size: 70%">
            <tr><th>Operation</th><th>RA</th><th>IOs Added (#pages)</th><th>Memory (#tuples)</th></tr>
            <tr>
              <td>Table Scan</td>
              <td>$R$</td>
              <td>$\frac{|R|}{\mathcal P}$</td>
              <td>$O(1)$</td>
            </tr>
            <tr>
              <td>Projection</td>
              <td>$\pi(R)$</td>
              <td>$0$</td>
              <td>$O(1)$</td>
            </tr>
            <tr>
              <td>Selection</td>
              <td>$\sigma(R)$</td>
              <td>$0$</td>
              <td>$O(1)$</td>
            </tr>
            <tr>
              <td>Union</td>
              <td>$R \cup S$</td>
              <td>$0$</td>
              <td>$O(1)$</td>
            </tr>
            <tr>
              <td style="vertical-align: middle;">Sort <span>(In-Mem)</span></td>
              <td style="vertical-align: middle;">$\tau(R)$</td>
              <td>$0$</td>
              <td>$O(|R|)$</td>
            </tr>
            <tr>
              <td>Sort (On-Disk)</td>
              <td>$\tau(R)$</td>
              <td>$\frac{2 \cdot \lfloor \log_{\mathcal B}(|R|) \rfloor}{\mathcal P}$</td>
              <td>$O(\mathcal B)$</td>
            </tr>
            <tr>
              <td><span>(B+Tree)</span> Index Scan</td>
              <td>$Index(R, c)$</td>
              <td>$\log_{\mathcal I}(|R|) + \frac{|\sigma_c(R)|}{\mathcal P}$</td>
              <td>$O(1)$</td>
            </tr>
            <tr>
              <td>(Hash) Index Scan</td>
              <td>$Index(R, c)$</td>
              <td>$1$</td>
              <td>$O(1)$</td>
            </tr>
          </table>

          <ol style="font-size: 50%; margin-top: 50px;">
            <li>Tuples per Page ($\mathcal P$) <span>– Normally defined per-schema</span></li>
            <li>Size of $R$ ($|R|$)</li>
            <li>Pages of Buffer ($\mathcal B$)</li>
            <li>Keys per Index Page ($\mathcal I$)</li>
          </ol>
        </section>
        <section>
          <table style="font-size: 70%">
            <tr><th width="300px">Operation</th><th>RA</th><th>IOs Added (#pages)</th><th>Memory (#tuples)</th></tr>
            <tr>
              <td style="font-size: 60%">Nested Loop Join <span>(Buffer $S$ in mem)</span></td>
              <td>$R \times S$</td>
              <td>$0$</td>
              <td>$O(|S|)$</td>
            </tr>
            <tr>
              <td style="font-size: 60%">Nested Loop Join (Buffer $S$ on disk)</td>
              <td>$R \times_{disk} S$</td>
              <td>$(1+ |R|) \cdot \frac{|S|}{\mathcal P}$</td>
              <td>$O(1)$</td>
            </tr>
            <tr>
              <td>1-Pass Hash Join</td>
              <td>$R \bowtie_{1PH, c} S$</td>
              <td>$0$</td>
              <td>$O(|S|)$</td>
            </tr>
            <tr>
              <td>2-Pass Hash Join</td>
              <td>$R \bowtie_{2PH, c} S$</td>
              <td>$\frac{2|R| + 2|S|}{\mathcal P}$</td>
              <td>$O(1)$</td>
            </tr>
            <tr>
              <td>Sort-Merge Join </td>
              <td>$R \bowtie_{SM, c} S$</td>
              <td>[Sort]</td>
              <td>[Sort]</td>
            </tr>
            <tr>
              <td><span>(Tree)</span> Index NLJ</td>
              <td>$R \bowtie_{INL, c} S$</td>
              <td>$|R| \cdot (\log_{\mathcal I}(|S|) + \frac{|\sigma_c(S)|}{\mathcal P})$</td>
              <td>$O(1)$</td>
            </tr>
            <tr>
              <td>(Hash) Index NLJ</td>
              <td>$R \bowtie_{INL, c} S$</td>
              <td>$|R| \cdot 1$</td>
              <td>$O(1)$</td>
            </tr>
            <tr>
              <td><span>(In-Mem)</span> Aggregate</td>
              <td>$\gamma_A(R)$</td>
              <td>$0$</td>
              <td>$adom(A)$</td>
            </tr>
            <tr>
              <td style="font-size: 90%">(Sort/Merge) Aggregate</td>
              <td>$\gamma_A(R)$</td>
              <td>[Sort]</td>
              <td>[Sort]</td>
            </tr>
          </table>

          <ol style="font-size: 50%;">
            <li>Tuples per Page ($\mathcal P$) <span>– Normally defined per-schema</span></li>
            <li>Size of $R$ ($|R|$)</li>
            <li>Pages of Buffer ($\mathcal B$)</li>
            <li>Keys per Index Page ($\mathcal I$)</li>
            <li>Number of distinct values of $A$ ($adom(A)$)</li>
          </ol>
        </section>
      </section>

      <!-- ============================================ -->

      <section>
        <section>
          <p>Estimating IOs requires Estimating $|Q(R)|$</p>
        </section>

        <section>
          <h3>Cardinality Estimation</h3>
          <p class="fragment">Unlike estimating IOs, cardinality estimation doesn't care about the algorithm, so we'll just be working with raw RA.</p>

          <p class="fragment">Also unlike estimating IOs, we care about the cardinality of $|Q(R)|$ as a whole, rather than the contribution of each individual operator.</p>
        </section>

        <section>
          <table style="font-size: 70%">
            <tr>
              <th>Operator</th>
              <th>RA</th>
              <th>Estimated Size</th>
            </tr>

            <tr>
              <td>Table</td>
              <td>$R$</td>
              <td class="fragment" data-fragment-index="1">$|R|$</td>
            </tr>

            <tr>
              <td>Projection</td>
              <td>$\pi(Q)$</td>
              <td class="fragment" data-fragment-index="2">$|Q|$</td>
            </tr>

            <tr>
              <td>Union</td>
              <td>$Q_1 \uplus Q_2$</td>
              <td class="fragment" data-fragment-index="3">$|Q_1| + |Q_2|$</td>
            </tr>

            <tr>
              <td>Cross Product</td>
              <td>$Q_1 \times Q_2$</td>
              <td class="fragment" data-fragment-index="4">$|Q_1| \times |Q_2|$</td>
            </tr>

            <tr>
              <td>Sort</td>
              <td>$\tau(Q)$</td>
              <td class="fragment" data-fragment-index="5">$|Q|$</td>
            </tr>

            <tr>
              <td>Limit</td>
              <td>$\texttt{LIMIT}_N(Q)$</td>
              <td class="fragment" data-fragment-index="6">$N$</td>
            </tr>

            <tr>
              <td>Selection</td>
              <td>$\sigma_c(Q)$</td>
              <td class="fragment" data-fragment-index="8">$|Q| \times \texttt{SEL}(c, Q)$</td>
            </tr>

            <tr>
              <td>Join</td>
              <td>$Q_1 \bowtie_c Q_2$</td>
              <td class="fragment" data-fragment-index="9">$|Q_1| \times |Q_2| \times \texttt{SEL}(c, Q_1\times Q_2)$</td>
            </tr>

            <tr>
              <td>Distinct</td>
              <td>$\delta_A(Q)$</td>
              <td class="fragment" data-fragment-index="11">$\texttt{UNIQ}(A, Q)$</td>
            </tr>

            <tr>
              <td>Aggregate</td>
              <td>$\gamma_{A, B \leftarrow \Sigma}(Q)$</td>
              <td class="fragment" data-fragment-index="12">$\texttt{UNIQ}(A, Q)$</td>
            </tr>
          </table>

          <ul style="font-size: 50%; margin-top: 20px">
            <li class="fragment" data-fragment-index="7">$\texttt{SEL}(c, Q)$: Selectivity of $c$ on $Q$, or $\frac{|\sigma_c(Q)|}{|Q|}$</li>
            <li class="fragment" data-fragment-index="10">$\texttt{UNIQ}(A, Q)$: # of distinct values of $A$ in $Q$.</li>
          </ul>
        </section>

        <!-- 2018 by OK: 
          Things to cover:
            - Defaults: The 10% rule
            - Basic Assumptions: 
              - Selectivity: MIN/MAX+COUNT, Uniform distribution, No correlations
              - Unique Values: COUNT DISTINCT, No correlations
            - Histograms: Nonuniform distributions
            - Constraints: Keys, FDs, FKey (implications for Joins)
        -->

        <section>
          <h3>Cardinality Estimation</h3>
          <h4>(The Hard Parts)</h4>

          <dl>
            <dt style="margin-top: 50px;">$\sigma_c(Q)$ (Selectivity Estimation)</dt>
            <dd>How many tuples will a condition $c$ allow to pass?</dd>

            <dt style="margin-top: 50px;">$\delta_A(Q)$ (Distinct Values Estimation)</dt>
            <dd>How many distinct values of attribute(s) $A$ exist?</dd>
          </dl>
        </section>

        <section>
          <h3>Remember the Real Goals</h3>
          <ol>
            <li>Accurately <b>rank</b> the plans.</li>
            <li>Don't spend more time optimizing than you get back.</li>
          </ol>
        </section>

        <section>
          <h3>(Some) Estimation Techniques</h3>

          <dl style="font-size: 80%">
            <div class="fragment">
              <dt>Guess Randomly</dt>
              <dd>Rules of thumb if you have no other options...</dd>
            </div>

            <div class="fragment">
              <dt>Uniform Prior</dt>
              <dd>Use basic statistics to make a very rough guess.</dd>
            </div>

            <div class="fragment">
              <dt>Sampling / History</dt>
              <dd>Small, Quick Sampling Runs (or prior executions of the query).</dd>
            </div>

            <div class="fragment">
              <dt>Histograms</dt>
              <dd>Using more detailed statistics for improved guesses.</dd>
            </div>

            <div class="fragment">
              <dt>Constraints</dt>
              <dd>Using rules about the data for improved guesses.</dd>
            </div>
          </dl>
        </section>
      </section>

      <!-- ============================================ -->

      <section>
        <section>
          <h3>(Some) Estimation Techniques</h3>

          <dl style="font-size: 80%">
            <dt style="color: blue;">Guess Randomly</dt>
            <dd style="color: blue;">Rules of thumb if you have no other options...</dd>

            <dt style="color: grey;">Uniform Prior</dt>
            <dd style="color: grey;">Use basic statistics to make a very rough guess.</dd>

            <dt style="color: grey;">Sampling / History</dt>
            <dd style="color: grey;">Small, Quick Sampling Runs (or prior executions of the query).</dd>

            <dt style="color: grey;">Histograms</dt>
            <dd style="color: grey;">Using more detailed statistics for improved guesses.</dd>

            <dt style="color: grey;">Constraints</dt>
            <dd style="color: grey;">Using rules about the data for improved guesses.</dd>
          </dl>
        </section>

        <section>
          <h3>The 10% Selectivity Rule</h3>

          <p>Every select or distinct operator passes 10% of all rows.</p>

          <div class="fragment">
            $$\sigma_{A = 1 \wedge B = 2}(R)$$
          </div>
          <div class="fragment">
            $$|\sigma_{A = 1 \wedge B = 2}(R)| = 0.1 \cdot |R|$$
          </div>

          <div class="fragment" style="margin-top: 50px;">
            $$\sigma_{A = 1}(\sigma_{B = 2}(R))$$
          </div>
          <div class="fragment">
            $$|\sigma_{A = 1}(\sigma_{B = 2}(R))| = 0.1 \cdot |\sigma_{B = 2}(R)| = 0.1 \cdot 0.1 \cdot |R|$$
          </div>

          <p class="fragment" style="font-size: 80%; font-weight: bold; margin-top: 50px;">(Queries are typically standardized first)</p>

          <p class="fragment" style="font-size: 80%; font-weight: bold; margin-top: 20px;">(The specific % varies by DBMS.  E.g., Teradata uses 10% for the first <code>AND</code> clause, and 75% for every subsequent clause)</p>
        </section>

        <section>
          <p>The 10% rule is a fallback when everything else fails. <br/> Usually, databases collect statistics...</p>
        </section>
      </section>

      <!-- ============================================ -->

      <section>
        <section>
          <h3>(Some) Estimation Techniques</h3>

          <dl style="font-size: 80%">
            <dt style="color: grey;">Guess Randomly</dt>
            <dd style="color: grey;">Rules of thumb if you have no other options...</dd>

            <dt style="color: blue;">Uniform Prior</dt>
            <dd style="color: blue;">Use basic statistics to make a very rough guess.</dd>

            <dt style="color: grey;">Sampling / History</dt>
            <dd style="color: grey;">Small, Quick Sampling Runs (or prior executions of the query).</dd>

            <dt style="color: grey;">Histograms</dt>
            <dd style="color: grey;">Using more detailed statistics for improved guesses.</dd>

            <dt style="color: grey;">Constraints</dt>
            <dd style="color: grey;">Using rules about the data for improved guesses.</dd>
          </dl>
        </section>

        <section>
          <h3>Uniform Prior</h3>

          <p style="text-align: left; margin-bottom: 0px; font-weight: bold;">We assume that for $\sigma_c(Q)$ or $\delta_A(Q)$...</p>
          <ol>
            <li>Basic statistics are known about $Q$: <ul>
              <li style="margin-top: 0px;"><code>COUNT(*)</code></li>
              <li style="margin-top: 0px;"><code>COUNT(DISTINCT A)</code> (for each A)</li>
              <li style="margin-top: 0px;"><code>MIN(A)</code>, <code>MAX(A)</code> (for each numeric A)</li>
            </ul></li>
            <li>Attribute values are uniformly distributed.</li>
            <li>No inter-attribute correlations.</li>
          </ol>
          <p class="fragment" style="font-size: 80%; font-weight: bold; margin-top: 20px;">
            If (1) fails, fall back to the 10% rule.  
          </p>
          <p class="fragment" style="font-size: 80%; font-weight: bold; margin-top: 0px;">
            If (2) or (3) fails, it'll often still be a <i>good enough</i> estimate.
          </p>
        </section>

        <section>
          <p>Estimating $\delta_A(Q)$ requires only <code>COUNT(DISTINCT A)</code></p>
        </section>

        <section>
          <h3>Estimating Selectivity</h3>

          <p>Selectivity is a probability ($\texttt{SEL}(c, Q) = P(c)$)</p>
          <table style="font-size: 85%">
            <tr class="fragment">
              <td>$P(A = x_1)$</td>
              <td>$=$</td>
              <td class="fragment">$\frac{1}{\texttt{COUNT(DISTINCT A)}}$</td>
            </tr>

            <tr class="fragment">
              <td>$P(A \in (x_1, x_2, \ldots, x_N))$</td>
              <td>$=$</td>
              <td class="fragment">$\frac{N}{\texttt{COUNT(DISTINCT A)}}$</td>
            </tr>

            <tr class="fragment">
              <td>$P(A \leq x_1)$</td>
              <td>$=$</td>
              <td class="fragment">$\frac{x_1 - \texttt{MIN(A)}}{\texttt{MAX(A)} - \texttt{MIN(A)}}$</td>
            </tr>

            <tr class="fragment">
              <td>$P(x_1 \leq A \leq x_2)$</td>
              <td>$=$</td>
              <td class="fragment">$\frac{x_2 - x_1}{\texttt{MAX(A)} - \texttt{MIN(A)}}$</td>
            </tr>

            <tr class="fragment">
              <td>$P(A = B)$</td>
              <td>$=$</td>
              <td class="fragment" style="font-size: 60%">$\textbf{min}\left( \frac{1}{\texttt{COUNT(DISTINCT A)}}, \frac{1}{\texttt{COUNT(DISTINCT B)}} \right)$</td>
            </tr>

            <tr class="fragment">
              <td>$P(c_1 \wedge c_2)$</td>
              <td>$=$</td>
              <td class="fragment" >$P(c_1) \cdot P(c_2)$</td>
            </tr>

            <tr class="fragment">
              <td>$P(c_1 \vee c_2)$</td>
              <td>$=$</td>
              <td class="fragment" >$1 - (1 - P(c_1)) \cdot (1 - P(c_2))$</td>
            </tr>
          </table>

          <p style="font-size: 60%">(With constants $x_1$, $x_2$, ...)</p>
        </section>

        <section>
          <h3>Limitations</h3>

          <dl>
            <div class="fragment">
              <dt>Don't always have statistics for $Q$</dt>
              <dd>For example, $\pi_{A \leftarrow (B \times C)}(R)$</dd>
            </div>

            <div class="fragment">
              <dt>Don't always have clear rules for $c$</dt>
              <dd>For example, $\sigma_{\texttt{FitsModel}(A, B, C)}(R)$</dd>
            </div>

            <div class="fragment">
              <dt>Attribute values are not always uniformly distributed.</dt>
              <dd>For example, <span style="font-size: 60%"> $|\sigma_{SPC\_COMMON = 'pin\ oak'}(T)|$ vs $|\sigma_{SPC\_COMMON = 'honeylocust'}(T)|$</span></dd>
            </div>

            <div class="fragment">
              <dt>Attribute values are sometimes correlated.</dt>
              <dd>For example, $\sigma_{(stump &lt; 5) \wedge (diam &gt; 3)}(T)$</dd>
            </div>

          </dl>
        </section>
      </section>
      <section>
        <section>
          <h3>(Some) Estimation Techniques</h3>

          <dl style="font-size: 80%">
            <dt style="color: grey;">Guess Randomly</dt>
            <dd style="color: grey;">Rules of thumb if you have no other options...</dd>

            <dt style="color: grey;">Uniform Prior</dt>
            <dd style="color: grey;">Use basic statistics to make a very rough guess.</dd>

            <dt style="color: blue;">Sampling / History</dt>
            <dd style="color: blue;">Small, Quick Sampling Runs (or prior executions of the query).</dd>

            <dt style="color: grey;">Histograms</dt>
            <dd style="color: grey;">Using more detailed statistics for improved guesses.</dd>

            <dt style="color: grey;">Constraints</dt>
            <dd style="color: grey;">Using rules about the data for improved guesses.</dd>
          </dl>
        </section>

        <section>
          <p><b>Idea 1:</b> Pick 100 tuples at random from each input table.</p>
        </section>

        <section>
          <svg data-src="graphics/2018-03-05-JoinIssue.svg" />
        </section>

        <section>
          <h3>The Birthday Paradox</h3>

          <p style="margin-top: 50px;">
            Assume: $\texttt{UNIQ}(A, R) = \texttt{UNIQ}(A, S) = N$
          </p>

          <p style="margin-top: 50px;">
            It takes $O(\sqrt{N})$ samples from both $R$ and $S$ <br/> to get even <b>one match.</b>
          </p>
        </section>

        <section>
          <p>To be resumed later in the term when we talk about AQP</p>
        </section>

        <section>
          <p><b>How DBs Do It</b>: Instrument queries while running them.</p>
          <ul>
            <li class="fragment">The first time you run a query it <i>might</i> be slow.</li>
            <li class="fragment">The second, third, fourth, etc... times it'll be fast.</li>
          </ul>
        </section>
      </section>

      <section>

        <section>
          <h3>(Some) Estimation Techniques</h3>

          <dl style="font-size: 80%">
            <dt style="color: grey;">Guess Randomly</dt>
            <dd style="color: grey;">Rules of thumb if you have no other options...</dd>

            <dt style="color: grey;">Uniform Prior</dt>
            <dd style="color: grey;">Use basic statistics to make a very rough guess.</dd>

            <dt style="color: grey;">Sampling / History</dt>
            <dd style="color: grey;">Small, Quick Sampling Runs (or prior executions of the query).</dd>

            <dt style="color: blue;">Histograms</dt>
            <dd style="color: blue;">Using more detailed statistics for improved guesses.</dd>

            <dt style="color: grey;">Constraints</dt>
            <dd style="color: grey;">Using rules about the data for improved guesses.</dd>
          </dl>
        </section>

        <section>
          <h3>Limitations of Uniform Prior</h3>

          <dl>
            <div class="fragment highlight-grey" data-fragment-index="1">
              <dt>Don't always have statistics for $Q$</dt>
              <dd>For example, $\pi_{A \leftarrow (B \times C)}(R)$</dd>
            </div>

            <div class="fragment highlight-grey" data-fragment-index="1">
              <dt>Don't always have clear rules for $c$</dt>
              <dd>For example, $\sigma_{\texttt{FitsModel}(A, B, C)}(R)$</dd>
            </div>

            <div class="fragment highlight-blue" data-fragment-index="1">
              <dt>Attribute values are not always uniformly distributed.</dt>
              <dd>For example, <span style="font-size: 60%"> $|\sigma_{SPC\_COMMON = 'pin\ oak'}(T)|$ vs $|\sigma_{SPC\_COMMON = 'honeylocust'}(T)|$</span></dd>
            </div>

            <div class="fragment highlight-grey" data-fragment-index="1">
              <dt>Attribute values are sometimes correlated.</dt>
              <dd>For example, $\sigma_{(stump &lt; 5) \wedge (diam &gt; 3)}(T)$</dd>
            </div>

          </dl>
        </section>

        <section>
          <p class="fragment highlight-grey" data-fragment-index="1">
            <b>Ideal Case:</b> You have some 
            $$f(x) = \left(\texttt{SELECT COUNT(*) WHERE A = x}\right)$$
            (and similarly for the other aggregates)
          </p>
          <p class="fragment" data-fragment-index="1">
            <b>Slightly Less Ideal Case:</b> You have some 
            $$f(x) \approx \left(\texttt{SELECT COUNT(*) WHERE A = x}\right)$$
          </p>
        </section>

        <section>
          <p>If this sounds like CDF-based indexing... you're right!</p>

          <p class="fragment">... but we're not going to talk about NNs today</p>
        </section>
      </section>

      <section>
        <section>
          <p>
            <b>Simpler/Faster Idea: </b> Break $f(x)$ into chunks
          </p>
        </section>

        <section>
          <h3>Example Data</h3>
          <table style="font-size: 80%">
            <tr><th>Name</th>      <th>YearsEmployed</th>  <th>Role</th></tr>
            <tr><td>'Alice'</td>   <td>3</td>              <td>1</td></tr>
            <tr><td>'Bob'</td>     <td>2</td>              <td>2</td></tr>
            <tr><td>'Carol'</td>   <td>3</td>              <td>1</td></tr>
            <tr><td>'Dave'</td>    <td>1</td>              <td>3</td></tr>
            <tr><td>'Eve'</td>     <td>2</td>              <td>2</td></tr>
            <tr><td>'Fred'</td>    <td>2</td>              <td>3</td></tr>
            <tr><td>'Gwen'</td>    <td>4</td>              <td>1</td></tr>
            <tr><td>'Harry'</td>   <td>2</td>              <td>3</td></tr>
          </table>
        </section>

        <section>
          <h3>Histograms</h3>
          <table style="font-size: 70%">
            <tr><th>YearsEmployed</th><th>COUNT</th></tr>
            <tr><td>1</td>            <td>1</td>    </tr>
            <tr><td>2</td>            <td>4</td>    </tr>
            <tr><td>3</td>            <td>2</td>    </tr>
            <tr><td>4</td>            <td>1</td>    </tr>
          </table>

          <table>
            <tr class="fragment"><td style="font-size: 70%"><code>COUNT(DISTINCT YearsEmployed)</code> </td><td class="fragment">$= 4$</td></tr>
            <tr class="fragment"><td style="font-size: 70%"><code>MIN(YearsEmployed)</code>            </td><td class="fragment">$= 1$</td></tr>
            <tr class="fragment"><td style="font-size: 70%"><code>MAX(YearsEmployed)</code>            </td><td class="fragment">$= 4$</td></tr>
            <tr class="fragment"><td style="font-size: 70%"><code>COUNT(*) WHERE YearsEmployed = 2</code> </td><td class="fragment">$= 4$</td></tr>
          </table>
        </section>

        <section>
          <h3>Histograms</h3>
          <table style="font-size: 70%">
            <tr><th>YearsEmployed</th><th>COUNT</th></tr>
            <tr><td>1-2</td>          <td>5</td>    </tr>
            <tr><td>3-4</td>          <td>3</td>    </tr>
          </table>

          <table>
            <tr class="fragment"><td style="font-size: 70%"><code>COUNT(DISTINCT YearsEmployed)</code> </td><td class="fragment">$= 4$</td></tr>
            <tr class="fragment"><td style="font-size: 70%"><code>MIN(YearsEmployed)</code>            </td><td class="fragment">$= 1$</td></tr>
            <tr class="fragment"><td style="font-size: 70%"><code>MAX(YearsEmployed)</code>            </td><td class="fragment">$= 4$</td></tr>
            <tr class="fragment"><td style="font-size: 70%"><code>COUNT(*) WHERE YearsEmployed = 2</code> </td><td class="fragment">$= \frac{5}{2}$</td></tr>
          </table>
        </section>

        <section>
          <h3>The Extreme Case</h3>
          <table style="font-size: 70%">
            <tr><th>YearsEmployed</th><th>COUNT</th></tr>
            <tr><td>1-4</td>          <td>8</td>    </tr>
          </table>

          <table>
            <tr class="fragment"><td style="font-size: 70%"><code>COUNT(DISTINCT YearsEmployed)</code> </td><td class="fragment">$= 4$</td></tr>
            <tr class="fragment"><td style="font-size: 70%"><code>MIN(YearsEmployed)</code>            </td><td class="fragment">$= 1$</td></tr>
            <tr class="fragment"><td style="font-size: 70%"><code>MAX(YearsEmployed)</code>            </td><td class="fragment">$= 4$</td></tr>
            <tr class="fragment"><td style="font-size: 70%"><code>COUNT(*) WHERE YearsEmployed = 2</code> </td><td class="fragment">$= \frac{8}{4}$</td></tr>
          </table>
        </section>

        <section>
          <h3>More Example Data</h3>
          <table style="font-size: 80%; float: left;">
            <tr><th>Value</th>  <th>COUNT</th>  </tr>
            <tr><td> 1-10</td>  <td>20</td>     </tr>
            <tr><td>11-20</td>  <td> 0</td>     </tr>
            <tr><td>21-30</td>  <td>15</td>     </tr>
            <tr><td>31-40</td>  <td>30</td>     </tr>
            <tr><td>41-50</td>  <td>22</td>     </tr>
            <tr><td>51-60</td>  <td>63</td>     </tr>
            <tr><td>61-70</td>  <td>10</td>     </tr>
            <tr><td>71-80</td>  <td>10</td>     </tr>
          </table>

          <table style="margin-top: 100px;">
            <tr class="fragment">
              <td style="font-size: 70%; width: 350px;"><code>SELECT … WHERE A = 33</code> </td>
              <td class="fragment" style="font-size: 80%; text-align: left; width: 200px;">$= \frac{1}{40-30}\cdot 30 = 3$</td>
            </tr>
            <tr><td style="height: 70px;"></td><td></td></tr>
            <tr class="fragment">
              <td style="font-size: 70%; width: 350px;"><code>SELECT … WHERE A > 33</code> </td>
              <td class="fragment" style="font-size: 80%; text-align: left; width: 200px;">$= \frac{40-33}{40-30}\cdot 30+22$ $\;\;\;+63+10+10$ $= 126$ </td>
            </tr>
          </table>
        </section>
      </section>

      <section>
        <section>
          <h3>(Some) Estimation Techniques</h3>

          <dl style="font-size: 80%">
            <dt style="color: grey;">Guess Randomly</dt>
            <dd style="color: grey;">Rules of thumb if you have no other options...</dd>

            <dt style="color: grey;">Uniform Prior</dt>
            <dd style="color: grey;">Use basic statistics to make a very rough guess.</dd>

            <dt style="color: grey;">Sampling / History</dt>
            <dd style="color: grey;">Small, Quick Sampling Runs (or prior executions of the query).</dd>

            <dt style="color: grey;">Histograms</dt>
            <dd style="color: grey;">Using more detailed statistics for improved guesses.</dd>

            <dt style="color: blue;">Constraints</dt>
            <dd style="color: blue;">Using rules about the data for improved guesses.</dd>
          </dl>
        </section>
      </section>

      <section>
        <section>
          <h3>(Some) Estimation Techniques</h3>

          <dl style="font-size: 80%">
            <dt style="color: grey;">Guess Randomly</dt>
            <dd style="color: grey;">Rules of thumb if you have no other options...</dd>

            <dt style="color: grey;">Uniform Prior</dt>
            <dd style="color: grey;">Use basic statistics to make a very rough guess.</dd>

            <dt style="color: grey;">Sampling / History</dt>
            <dd style="color: grey;">Small, Quick Sampling Runs (or prior executions of the query).</dd>

            <dt style="color: grey;">Histograms</dt>
            <dd style="color: grey;">Using more detailed statistics for improved guesses.</dd>

            <dt style="color: blue;">Constraints</dt>
            <dd style="color: blue;">Using rules about the data for improved guesses.</dd>
          </dl>
        </section>

        <section>
          <h3>Key / Unique Constraints</h3>
          <pre style="margin-top: 50px;"><code class="sql">
            CREATE TABLE R ( 
              A int,
              B int UNIQUE,
              ... 
              PRIMARY KEY (A)
            );
          </code></pre>
          <p style="margin-top: 50px;">
            No duplicate values in the column.
            $$\texttt{COUNT(DISTINCT A)} = \texttt{COUNT(*)}$$
          </p>
        </section>

        <section>
          <h3>Foreign Key Constraints</h3>
          <pre style="margin-top: 50px;"><code class="sql">
            CREATE TABLE S ( 
              B int,
              ... 
              FOREIGN KEY (B) REFERENCES R(B)
            );
          </code></pre>
          <p style="margin-top: 50px;">
            All values in the column appear in another table.
            $$\pi_{attrs(S)}\left(S \bowtie_B R\right) \subseteq S$$
          </p>
        </section>

        <section>
          <h3>Functional Dependencies</h3>

          <pre style="margin-top: 50px;"><code class="sql">
            Not expressible in SQL
          </code></pre>

          <p style="margin-top: 50px;">
            One set of columns uniquely determines another.<br/>
            $\pi_{A}(\delta(\pi_{A, B}(R)))$ has no duplicates and...
            $$\pi_{attrs(R)-B}(R) \bowtie_A \delta(\pi_{A, B}(R)) = R$$
          </p>
        </section>

        <section>
          <h3>Constraints</h3>

          <h4>The Good</h4>
          <ul>
            <li style="font-size: 70%" class="fragment">Sanity check on your data: Inconsistent data triggers failures.</li>
            <li style="font-size: 70%" class="fragment">More opportunities for query optimization.</li>
          </ul>

          <h4 style="margin-top: 50px;" class="fragment">The Not-So-Good</h4>
          <ul>
            <li style="font-size: 70%" class="fragment">Validating constraints whenever data changes is (usually) expensive.</li>
            <li style="font-size: 70%" class="fragment">Inconsistent data triggers failures.</li>
          </ul>

        </section>

        <section>
          <h3>Foreign Key Constraints</h3>

          <p style="margin-top: 50px;">Foreign keys are like pointers.  What happens with broken pointers?</p>
        </section>

        <section>
          <h3>Foreign Key Enforcement</h3>

          <p>Foreign keys are defined with update triggers <code>ON INSERT [X]</code>, <code>ON UPDATE [X]</code>, <code>ON DELETE [X]</code>.  Depending on what [X] is, the constraint is enforced differently:</p>

          <dl style="font-size: 80%">
            <dt><code>CASCADE</code></dt>
            <dd>Create/delete rows as needed to avoid invalid foreign keys.</dd>

            <dt><code>NO ACTION</code></dt>
            <dd>Abort any transaction that ends with an invalid foreign key reference.</dd>

            <dt><code>SET NULL</code></dt>
            <dd>Automatically replace any invalid foreign key references with NULL.</dd>
          </dl>
        </section>

        <section>
          <p style="font-weight: bold;">
            <code>CASCADE</code> and <code>NO ACTION</code> ensure that the data never has broken pointers, so
          </p>
          $$\pi_{attrs(S)}\left(S \bowtie_B R\right) = S$$
        </section>

        <section>
          <h3>Functional Dependencies</h3>

          <p style="margin-top: 50px;"><b>A generalization of keys:</b> One set of attributes that uniquely identifies another.</p>

          <ul>
            <li>SS# uniquely identifies Name.</li>
            <li>Employee uniquely identifies Manager.</li>
            <li>Order number uniquely identifies Customer Address.</li>
          </ul>

          <p class="fragment">Two rows with the same As must have the same Bs</p>
          <p class="fragment" style="font-size: 80%">(but can still have identical Bs for two different As)</p>
        </section>

        <section>
          <h3>Normal Forms</h3>
          <p style="margin-top: 50px;">"All functional dependencies should be keys."</p>
          <p class="fragment">(Otherwise you want two separate relations)</p>
          <p class="fragment">(for more details, see CSE 560)</p>
        </section>
        
        <section>
          
          $$P(A = B) = \min\left(\frac{1}{\texttt{COUNT}(\texttt{DISTINCT } A)}, \frac{1}{\texttt{COUNT}(\texttt{DISTINCT } B)}\right)$$

        </section>
        <section>

          <p>
            $$R \bowtie_{R.A = S.B} S = \sigma_{R.A = S.B}(R \times S)$$
            (and $S.B$ is a foreign key referencing $R.A$)
          </p>

          <p class="fragment" style="margin-top: 30px; font-size: 80%">
            The (foreign) key constraint gives us two things...
            $$\texttt{COUNT}(\texttt{DISTINCT } A) \approx \texttt{COUNT}(\texttt{DISTINCT } B)$$
            <span style="font-size: 60%; font-weight: bold; margin: 0px;">and</span>
            $$\texttt{COUNT}(\texttt{DISTINCT } A) = |R|$$
          </p>

          <p class="fragment" style="margin-top: 30px; font-size: 80%">
            Based on the first property the total number of rows is roughly...
            $$|R| \times |S| \times \frac{1}{\texttt{COUNT}(\texttt{DISTINCT } A)}$$
          </p>

          <p class="fragment" style="margin-top: 30px; font-size: 80%">
            Then based on the second property...
            $$ = |R| \times |S| \times \frac{1}{|R|} = |S|$$
          </p>

          <p class="fragment" style="margin-top: 30px; font-size: 50%">(Statistics/Histograms will give you the same outcome... but constraints can be easier to propagate)</p>
        </section>
      </section>

      <section>
        <p><b>Next class:</b> Exam Review</p>
      </section>

    </div></div>

    <script src="../reveal.js-3.6.0/js/reveal.js"></script>

    <script>

      // Initialize the reveal.js presentation for this slide deck.
      // Full list of configuration options available at:
      // https://github.com/hakimel/reveal.js#configuration
      Reveal.initialize({
        controls: true,
        progress: true,
        history: true,
        center: true,
        slideNumber: true,

        transition: 'fade', // none/fade/slide/convex/concave/zoom

        // Chart styling settings. Presumably consumed by the chart plugin
        // (Chart.min.js / csv2chart.js) listed in `dependencies` below -- TODO confirm.
        chart: {
          defaults: { 
            global: { 
              title: { fontColor: "#333", fontSize: 24 }, 
              legend: {
                labels: { fontColor: "#333", fontSize: 20 },
              },
              // NOTE(review): Chart.js documents this option as `responsive`;
              // confirm that `responsiveness` is actually read by anything.
              responsiveness: true
            },
            scale: { 
              scaleLabel: { fontColor: "#333", fontSize: 20 }, 
              gridLines: { color: "#333", zeroLineColor: "#333" }, 
              ticks: { fontColor: "#333", fontSize: 16 }, 
            } 
          },
          // Per-chart-type color lists.
          line: { borderColor: [ "rgba(20,220,220,.8)" , "rgba(220,120,120,.8)", "rgba(20,120,220,.8)" ], "borderDash": [ [5,10], [0,0] ]}, 
          bar: { backgroundColor: [ 
              "rgba(220,220,220,0.8)",
              "rgba(151,187,205,0.8)",
              "rgba(205,151,187,0.8)",
              "rgba(187,205,151,0.8)"
            ]
          }, 
          pie: { backgroundColor: [ ["rgba(0,0,0,.8)" , "rgba(220,20,20,.8)", "rgba(20,220,20,.8)", "rgba(220,220,20,.8)", "rgba(20,20,220,.8)"] ]},
          radar: { borderColor: [ "rgba(20,220,220,.8)" , "rgba(220,120,120,.8)", "rgba(20,120,220,.8)" ]}, 
        },

        // Optional reveal.js plugins
        dependencies: [
          // classList polyfill: only requested when document.body.classList is missing.
          { src: '../reveal.js-3.6.0/lib/js/classList.js', condition: function() { return !document.body.classList; } },
          // Math plugin: always requested (the condition returns true).
          // NOTE(review): the reveal.js README configures the MathJax location via a
          // top-level `math: { mathjax: ... }` option; confirm that a `mathjax` key on
          // this dependency entry is actually honored.
          { src: '../reveal.js-3.6.0/plugin/math/math.js', 
            condition: function() { return true; },
            mathjax: '../reveal.js-3.6.0/js/MathJax.js'
           },
          // Markdown plugins: only requested when some element carries a data-markdown attribute.
          { src: '../reveal.js-3.6.0/plugin/markdown/marked.js', condition: function() { return !!document.querySelector( '[data-markdown]' ); } },
          { src: '../reveal.js-3.6.0/plugin/markdown/markdown.js', condition: function() { return !!document.querySelector( '[data-markdown]' ); } },
          // Syntax highlighting: only requested when a `pre code` element exists;
          // the callback then calls hljs.initHighlightingOnLoad().
          { src: '../reveal.js-3.6.0/plugin/highlight/highlight.js', async: true, condition: function() { return !!document.querySelector( 'pre code' ); }, callback: function() { hljs.initHighlightingOnLoad(); } },
          { src: '../reveal.js-3.6.0/plugin/zoom-js/zoom.js', async: true },
          { src: '../reveal.js-3.6.0/plugin/notes/notes.js', async: true },
          // Chart.min.js
          { src: '../reveal.js-3.6.0/plugin/chart/Chart.min.js'},
          // the plugin
          { src: '../reveal.js-3.6.0/plugin/chart/csv2chart.js'},
          // Inline-SVG plugin and its Promise polyfill, both with async explicitly disabled.
          { src: '../reveal.js-3.6.0/plugin/svginline/es6-promise.auto.js', async: false },
          { src: '../reveal.js-3.6.0/plugin/svginline/data-src-svg.js', async: false }
        ]
      });

    </script>

  </body>
</html>