<!doctype html>
< html lang = "en" >
< head >
< meta charset = "utf-8" >
< title > CSE 4/562 - Spring 2018< / title >
< meta name = "description" content = "CSE 4/562 - Spring 2018" >
< meta name = "author" content = "Oliver Kennedy" >
< meta name = "apple-mobile-web-app-capable" content = "yes" / >
< meta name = "apple-mobile-web-app-status-bar-style" content = "black-translucent" / >
< meta name = "viewport" content = "width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no, minimal-ui" >
< link rel = "stylesheet" href = "../reveal.js-3.6.0/css/reveal.css" >
< link rel = "stylesheet" href = "ubodin.css" id = "theme" >
<!-- Code syntax highlighting -->
< link rel = "stylesheet" href = "../reveal.js-3.6.0/lib/css/zenburn.css" >
<!-- Printing and PDF exports -->
< script >
// Inject the print stylesheet at load time: the PDF-export sheet when the
// URL query string contains "print-pdf" (case-insensitive), otherwise the
// regular paper sheet.
var link = document.createElement( 'link' );
link.type = 'text/css';
link.rel = 'stylesheet';
if ( /print-pdf/gi.test( window.location.search ) ) {
  link.href = '../reveal.js-3.6.0/css/print/pdf.css';
} else {
  link.href = '../reveal.js-3.6.0/css/print/paper.css';
}
document.getElementsByTagName( 'head' )[0].appendChild( link );
< / script >
< script src = "../reveal.js-3.6.0/lib/js/head.min.js" > < / script >
<!--[if lt IE 9]>
<script src="../reveal.js-3.6.0/lib/js/html5shiv.js"></script>
<![endif]-->
< / head >
< body >
< div class = "reveal" >
<!-- Any section element inside of this container is displayed as a slide -->
< div class = "header" >
<!-- Any Talk - Specific Header Content Goes Here -->
CSE 4/562 - Database Systems
< / div >
< div class = "slides" >
< section >
< h1 > Cost Based Optimization< / h1 >
< h3 > CSE 4/562 – Database Systems< / h3 >
2018-03-05 15:49:38 -05:00
< h5 > March 5-7, 2018< / h5 >
2018-03-04 15:00:02 -05:00
< / section >
<!-- ============================================ -->
< section >
< section >
< h3 > Remember the Real Goals< / h3 >
< ol >
< li > Accurately < b > rank< / b > the plans.< / li >
< li > Don't spend more time optimizing than you get back.< / li >
< li > Don't pick a plan that uses more memory than you have.< / li >
< / ol >
< / section >
< section >
< h3 > Accounting< / h3 >
< p style = "margin-top: 50px;" > Figure out the cost of each < b > individual< / b > operator.< / p >
< p style = "margin-top: 50px;" > Only count the number of IOs < b > added< / b > by each operator.< / p >
< / section >
< section >
< table style = "font-size: 70%" >
< tr > < th > Operation< / th > < th > RA< / th > < th > IOs Added (#pages)< / th > < th > Memory (#tuples)< / th > < / tr >
< tr >
< td > Table Scan< / td >
< td > $R$< / td >
< td > $\frac{|R|}{\mathcal P}$< / td >
< td > $O(1)$< / td >
< / tr >
< tr >
< td > Projection< / td >
< td > $\pi(R)$< / td >
< td > $0$< / td >
< td > $O(1)$< / td >
< / tr >
< tr >
< td > Selection< / td >
< td > $\sigma(R)$< / td >
< td > $0$< / td >
< td > $O(1)$< / td >
< / tr >
< tr >
< td > Union< / td >
< td > $R \cup S$< / td >
< td > $0$< / td >
< td > $O(1)$< / td >
< / tr >
< tr >
< td style = "vertical-align: middle;" > Sort < span > (In-Mem)< / span > < / td >
< td style = "vertical-align: middle;" > $\tau(R)$< / td >
< td > $0$< / td >
< td > $O(|R|)$< / td >
< / tr >
< tr >
< td > Sort (On-Disk)< / td >
< td > $\tau(R)$< / td >
<td>$\frac{2 \cdot \lfloor \log_{\mathcal B}(|R|) \rfloor}{\mathcal P}$</td>
< td > $O(\mathcal B)$< / td >
< / tr >
< tr >
< td > < span > (B+Tree)< / span > Index Scan< / td >
< td > $Index(R, c)$< / td >
< td > $\log_{\mathcal I}(|R|) + \frac{|\sigma_c(R)|}{\mathcal P}$< / td >
< td > $O(1)$< / td >
< / tr >
< tr >
< td > (Hash) Index Scan< / td >
< td > $Index(R, c)$< / td >
< td > $1$< / td >
< td > $O(1)$< / td >
< / tr >
< / table >
< ol style = "font-size: 50%; margin-top: 50px;" >
< li > Tuples per Page ($\mathcal P$) < span > – Normally defined per-schema< / span > < / li >
< li > Size of $R$ ($|R|$)< / li >
< li > Pages of Buffer ($\mathcal B$)< / li >
< li > Keys per Index Page ($\mathcal I$)< / li >
< / ol >
< / section >
< section >
< table style = "font-size: 70%" >
< tr > < th width = "300px" > Operation< / th > < th > RA< / th > < th > IOs Added (#pages)< / th > < th > Memory (#tuples)< / th > < / tr >
< tr >
< td style = "font-size: 60%" > Nested Loop Join < span > (Buffer $S$ in mem)< / span > < / td >
< td > $R \times S$< / td >
< td > $0$< / td >
< td > $O(|S|)$< / td >
< / tr >
< tr >
< td style = "font-size: 60%" > Nested Loop Join (Buffer $S$ on disk)< / td >
< td > $R \times_{disk} S$< / td >
< td > $(1+ |R|) \cdot \frac{|S|}{\mathcal P}$< / td >
< td > $O(1)$< / td >
< / tr >
< tr >
< td > 1-Pass Hash Join< / td >
< td > $R \bowtie_{1PH, c} S$< / td >
< td > $0$< / td >
< td > $O(|S|)$< / td >
< / tr >
< tr >
< td > 2-Pass Hash Join< / td >
< td > $R \bowtie_{2PH, c} S$< / td >
< td > $\frac{2|R| + 2|S|}{\mathcal P}$< / td >
< td > $O(1)$< / td >
< / tr >
< tr >
< td > Sort-Merge Join < / td >
< td > $R \bowtie_{SM, c} S$< / td >
< td > [Sort]< / td >
< td > [Sort]< / td >
< / tr >
< tr >
< td > < span > (Tree)< / span > Index NLJ< / td >
<td>$R \bowtie_{INL, c} S$</td>
< td > $|R| \cdot (\log_{\mathcal I}(|S|) + \frac{|\sigma_c(S)|}{\mathcal P})$< / td >
< td > $O(1)$< / td >
< / tr >
< tr >
< td > (Hash) Index NLJ< / td >
<td>$R \bowtie_{INL, c} S$</td>
< td > $|R| \cdot 1$< / td >
< td > $O(1)$< / td >
< / tr >
< tr >
< td > < span > (In-Mem)< / span > Aggregate< / td >
< td > $\gamma_A(R)$< / td >
< td > $0$< / td >
< td > $adom(A)$< / td >
< / tr >
< tr >
< td style = "font-size: 90%" > (Sort/Merge) Aggregate< / td >
< td > $\gamma_A(R)$< / td >
< td > [Sort]< / td >
< td > [Sort]< / td >
< / tr >
< / table >
< ol style = "font-size: 50%;" >
< li > Tuples per Page ($\mathcal P$) < span > – Normally defined per-schema< / span > < / li >
< li > Size of $R$ ($|R|$)< / li >
< li > Pages of Buffer ($\mathcal B$)< / li >
< li > Keys per Index Page ($\mathcal I$)< / li >
< li > Number of distinct values of $A$ ($adom(A)$)< / li >
< / ol >
< / section >
< / section >
<!-- ============================================ -->
< section >
< section >
< p > Estimating IOs requires Estimating $|Q(R)|$< / p >
< / section >
< section >
< h3 > Cardinality Estimation< / h3 >
< p class = "fragment" > Unlike estimating IOs, cardinality estimation doesn't care about the algorithm, so we'll just be working with raw RA.< / p >
< p class = "fragment" > Also unlike estimating IOs, we care about the cardinality of $|Q(R)|$ as a whole, rather than the contribution of each individual operator.< / p >
< / section >
< section >
< table style = "font-size: 70%" >
< tr >
< th > Operator< / th >
< th > RA< / th >
< th > Estimated Size< / th >
< / tr >
< tr >
< td > Table< / td >
< td > $R$< / td >
< td class = "fragment" data-fragment-index = "1" > $|R|$< / td >
< / tr >
< tr >
< td > Projection< / td >
< td > $\pi(Q)$< / td >
< td class = "fragment" data-fragment-index = "2" > $|Q|$< / td >
< / tr >
< tr >
< td > Union< / td >
< td > $Q_1 \uplus Q_2$< / td >
< td class = "fragment" data-fragment-index = "3" > $|Q_1| + |Q_2|$< / td >
< / tr >
< tr >
< td > Cross Product< / td >
< td > $Q_1 \times Q_2$< / td >
< td class = "fragment" data-fragment-index = "4" > $|Q_1| \times |Q_2|$< / td >
< / tr >
< tr >
< td > Sort< / td >
< td > $\tau(Q)$< / td >
< td class = "fragment" data-fragment-index = "5" > $|Q|$< / td >
< / tr >
< tr >
< td > Limit< / td >
< td > $\texttt{LIMIT}_N(Q)$< / td >
< td class = "fragment" data-fragment-index = "6" > $N$< / td >
< / tr >
< tr >
< td > Selection< / td >
< td > $\sigma_c(Q)$< / td >
< td class = "fragment" data-fragment-index = "8" > $|Q| \times \texttt{SEL}(c, Q)$< / td >
< / tr >
< tr >
< td > Join< / td >
< td > $Q_1 \bowtie_c Q_2$< / td >
< td class = "fragment" data-fragment-index = "9" > $|Q_1| \times |Q_2| \times \texttt{SEL}(c, Q_1\times Q_2)$< / td >
< / tr >
< tr >
< td > Distinct< / td >
< td > $\delta_A(Q)$< / td >
< td class = "fragment" data-fragment-index = "11" > $\texttt{UNIQ}(A, Q)$< / td >
< / tr >
< tr >
< td > Aggregate< / td >
< td > $\gamma_{A, B \leftarrow \Sigma}(Q)$< / td >
< td class = "fragment" data-fragment-index = "12" > $\texttt{UNIQ}(A, Q)$< / td >
< / tr >
< / table >
< ul style = "font-size: 50%; margin-top: 20px" >
< li class = "fragment" data-fragment-index = "7" > $\texttt{SEL}(c, Q)$: Selectivity of $c$ on $Q$, or $\frac{|\sigma_c(Q)|}{|Q|}$< / li >
<li class="fragment" data-fragment-index="10">$\texttt{UNIQ}(A, Q)$: # of distinct values of $A$ in $Q$.</li>
< / ul >
< / section >
<!-- 2018 by OK:
Things to cover:
- Defaults: The 10% rule
- Basic Assumptions:
- Selectivity: MIN/MAX+COUNT, Uniform distribution, No correlations
- Unique Values: COUNT DISTINCT, No correlations
- Histograms: Nonuniform distributions
- Constraints: Keys, FDs, FKey (implications for Joins)
-->
< section >
< h3 > Cardinality Estimation< / h3 >
< h4 > (The Hard Parts)< / h4 >
< dl >
< dt style = "margin-top: 50px;" > $\sigma_c(Q)$ (Cardinality Estimation)< / dt >
< dd > How many tuples will a condition $c$ allow to pass?< / dd >
< dt style = "margin-top: 50px;" > $\delta_A(Q)$ (Distinct Values Estimation)< / dt >
< dd > How many distinct values of attribute(s) $A$ exist?< / dd >
< / dl >
< / section >
< section >
< h3 > Remember the Real Goals< / h3 >
< ol >
< li > Accurately < b > rank< / b > the plans.< / li >
< li > Don't spend more time optimizing than you get back.< / li >
< / ol >
< / section >
< section >
< h3 > (Some) Estimation Techniques< / h3 >
< dl style = "font-size: 80%" >
< div class = "fragment" >
< dt > Guess Randomly< / dt >
< dd > Rules of thumb if you have no other options...< / dd >
< / div >
< div class = "fragment" >
< dt > Uniform Prior< / dt >
< dd > Use basic statistics to make a very rough guess.< / dd >
< / div >
< div class = "fragment" >
< dt > Sampling / History< / dt >
< dd > Small, Quick Sampling Runs (or prior executions of the query).< / dd >
< / div >
< div class = "fragment" >
< dt > Histograms< / dt >
< dd > Using more detailed statistics for improved guesses.< / dd >
< / div >
< div class = "fragment" >
< dt > Constraints< / dt >
< dd > Using rules about the data for improved guesses.< / dd >
< / div >
< / dl >
< / section >
< / section >
<!-- ============================================ -->
< section >
< section >
< h3 > (Some) Estimation Techniques< / h3 >
< dl style = "font-size: 80%" >
2018-03-04 16:37:37 -05:00
< dt style = "color: blue;" > Guess Randomly< / dt >
< dd style = "color: blue;" > Rules of thumb if you have no other options...< / dd >
2018-03-04 15:00:02 -05:00
< dt style = "color: grey;" > Uniform Prior< / dt >
< dd style = "color: grey;" > Use basic statistics to make a very rough guess.< / dd >
< dt style = "color: grey;" > Sampling / History< / dt >
< dd style = "color: grey;" > Small, Quick Sampling Runs (or prior executions of the query).< / dd >
< dt style = "color: grey;" > Histograms< / dt >
< dd style = "color: grey;" > Using more detailed statistics for improved guesses.< / dd >
< dt style = "color: grey;" > Constraints< / dt >
< dd style = "color: grey;" > Using rules about the data for improved guesses.< / dd >
< / dl >
< / section >
< section >
< h3 > The 10% Selectivity Rule< / h3 >
< p > Every select or distinct operator passes 10% of all rows.< / p >
< div class = "fragment" >
$$\sigma_{A = 1 \wedge B = 2}(R)$$
< / div >
< div class = "fragment" >
$$|\sigma_{A = 1 \wedge B = 2}(R)| = 0.1 \cdot |R|$$
< / div >
< div class = "fragment" style = "margin-top: 50px;" >
$$\sigma_{A = 1}(\sigma_{B = 2}(R))$$
< / div >
< div class = "fragment" >
$$|\sigma_{A = 1}(\sigma_{B = 2}(R))| = 0.1 \cdot |\sigma_{B = 2}(R)| = 0.1 \cdot 0.1 \cdot |R|$$
< / div >
< p class = "fragment" style = "font-size: 80%; font-weight: bold; margin-top: 50px;" > (Queries are typically standardized first)< / p >
< p class = "fragment" style = "font-size: 80%; font-weight: bold; margin-top: 20px;" > (The specific % varies by DBMS. E.g., Teradata uses 10% for the first < code > AND< / code > clause, and 75% for every subsequent clause)< / p >
< / section >
< section >
< p > The 10% rule is a fallback when everything else fails. < br / > Usually, databases collect statistics...< / p >
< / section >
< / section >
<!-- ============================================ -->
< section >
< section >
< h3 > (Some) Estimation Techniques< / h3 >
< dl style = "font-size: 80%" >
< dt style = "color: grey;" > Guess Randomly< / dt >
< dd style = "color: grey;" > Rules of thumb if you have no other options...< / dd >
2018-03-04 16:37:37 -05:00
< dt style = "color: blue;" > Uniform Prior< / dt >
< dd style = "color: blue;" > Use basic statistics to make a very rough guess.< / dd >
2018-03-04 15:00:02 -05:00
< dt style = "color: grey;" > Sampling / History< / dt >
< dd style = "color: grey;" > Small, Quick Sampling Runs (or prior executions of the query).< / dd >
< dt style = "color: grey;" > Histograms< / dt >
< dd style = "color: grey;" > Using more detailed statistics for improved guesses.< / dd >
< dt style = "color: grey;" > Constraints< / dt >
< dd style = "color: grey;" > Using rules about the data for improved guesses.< / dd >
< / dl >
< / section >
< section >
< h3 > Uniform Prior< / h3 >
2018-03-04 21:29:25 -05:00
< p style = "text-align: left; margin-bottom: 0px; font-weight: bold;" > We assume that for $\sigma_c(Q)$ or $\delta_A(Q)$...< / p >
2018-03-04 15:00:02 -05:00
< ol >
< li > Basic statistics are known about $Q$: < ul >
< li style = "margin-top: 0px;" > < code > COUNT(*)< / code > < / li >
< li style = "margin-top: 0px;" > < code > COUNT(DISTINCT A)< / code > (for each A)< / li >
< li style = "margin-top: 0px;" > < code > MIN(A)< / code > , < code > MAX(A)< / code > (for each numeric A)< / li >
< / ul > < / li >
< li > Attribute values are uniformly distributed.< / li >
< li > No inter-attribute correlations.< / li >
< / ol >
< p class = "fragment" style = "font-size: 80%; font-weight: bold; margin-top: 20px;" >
If (1) fails, fall back to the 10% rule.
< / p >
< p class = "fragment" style = "font-size: 80%; font-weight: bold; margin-top: 0px;" >
If (2) or (3) fails, it'll often still be a < i > good enough< / i > estimate.
< / p >
< / section >
< section >
2018-03-04 21:29:25 -05:00
< p > Estimating $\delta_A(Q)$ requires only < code > COUNT(DISTINCT A)< / code > < / p >
< / section >
< section >
< h3 > Estimating Selectivity< / h3 >
2018-03-04 15:00:02 -05:00
< p > Selectivity is a probability ($\texttt{SEL}(c, Q) = P(c)$)< / p >
< table style = "font-size: 85%" >
< tr class = "fragment" >
< td > $P(A = x_1)$< / td >
< td > $=$< / td >
< td class = "fragment" > $\frac{1}{\texttt{COUNT(DISTINCT A)}}$< / td >
< / tr >
< tr class = "fragment" >
< td > $P(A \in (x_1, x_2, \ldots, x_N))$< / td >
< td > $=$< / td >
< td class = "fragment" > $\frac{N}{\texttt{COUNT(DISTINCT A)}}$< / td >
< / tr >
< tr class = "fragment" >
< td > $P(A \leq x_1)$< / td >
< td > $=$< / td >
< td class = "fragment" > $\frac{x_1 - \texttt{MIN(A)}}{\texttt{MAX(A)} - \texttt{MIN(A)}}$< / td >
< / tr >
< tr class = "fragment" >
< td > $P(x_1 \leq A \leq x_2)$< / td >
< td > $=$< / td >
< td class = "fragment" > $\frac{x_2 - x_1}{\texttt{MAX(A)} - \texttt{MIN(A)}}$< / td >
< / tr >
< tr class = "fragment" >
< td > $P(A = B)$< / td >
< td > $=$< / td >
< td class = "fragment" style = "font-size: 60%" > $\textbf{min}\left( \frac{1}{\texttt{COUNT(DISTINCT A)}}, \frac{1}{\texttt{COUNT(DISTINCT B)}} \right)$< / td >
< / tr >
< tr class = "fragment" >
< td > $P(c_1 \wedge c_2)$< / td >
< td > $=$< / td >
< td class = "fragment" > $P(c_1) \cdot P(c_2)$< / td >
< / tr >
< tr class = "fragment" >
< td > $P(c_1 \vee c_2)$< / td >
< td > $=$< / td >
< td class = "fragment" > $1 - (1 - P(c_1)) \cdot (1 - P(c_2))$< / td >
< / tr >
< / table >
< p style = "font-size: 60%" > (With constants $x_1$, $x_2$, ...)< / p >
< / section >
2018-03-04 16:37:37 -05:00
< section >
< h3 > Limitations< / h3 >
< dl >
< div class = "fragment" >
< dt > Don't always have statistics for $Q$< / dt >
< dd > For example, $\pi_{A \leftarrow (B \times C)}(R)$< / dd >
< / div >
< div class = "fragment" >
< dt > Don't always have clear rules for $c$< / dt >
< dd > For example, $\sigma_{\texttt{FitsModel}(A, B, C)}(R)$< / dd >
< / div >
< div class = "fragment" >
< dt > Attribute values are not always uniformly distributed.< / dt >
< dd > For example, < span style = "font-size: 60%" > $|\sigma_{SPC\_COMMON = 'pin\ oak'}(T)|$ vs $|\sigma_{SPC\_COMMON = 'honeylocust'}(T)|$< / span > < / dd >
< / div >
< div class = "fragment" >
< dt > Attribute values are sometimes correlated.< / dt >
<dd>For example, $\sigma_{(stump &lt; 5) \wedge (diam &gt; 3)}(T)$</dd>
< / div >
< / dl >
< / section >
< / section >
< section >
< section >
< h3 > (Some) Estimation Techniques< / h3 >
< dl style = "font-size: 80%" >
< dt style = "color: grey;" > Guess Randomly< / dt >
< dd style = "color: grey;" > Rules of thumb if you have no other options...< / dd >
< dt style = "color: grey;" > Uniform Prior< / dt >
< dd style = "color: grey;" > Use basic statistics to make a very rough guess.< / dd >
< dt style = "color: blue;" > Sampling / History< / dt >
< dd style = "color: blue;" > Small, Quick Sampling Runs (or prior executions of the query).< / dd >
< dt style = "color: grey;" > Histograms< / dt >
< dd style = "color: grey;" > Using more detailed statistics for improved guesses.< / dd >
< dt style = "color: grey;" > Constraints< / dt >
< dd style = "color: grey;" > Using rules about the data for improved guesses.< / dd >
< / dl >
< / section >
< section >
< p > < b > Idea 1:< / b > Pick 100 tuples at random from each input table.< / p >
< / section >
2018-03-04 21:29:25 -05:00
< section >
< svg data-src = "graphics/2018-03-05-JoinIssue.svg" / >
< / section >
< section >
< h3 > The Birthday Paradox< / h3 >
< p style = "margin-top: 50px;" >
Assume: $\texttt{UNIQ}(A, R) = \texttt{UNIQ}(A, S) = N$
< / p >
< p style = "margin-top: 50px;" >
It takes $O(\sqrt{N})$ samples from both $R$ and $S$ < br / > to get even < b > one match.< / b >
< / p >
< / section >
< section >
< p > To be resumed later in the term when we talk about AQP< / p >
< / section >
< section >
<p><b>How DBs Do It</b>: Instrument queries while running them.</p>
<ul>
<li class="fragment">The first time you run a query it <i>might</i> be slow.</li>
<li class="fragment">The second, third, fourth, etc... times it'll be fast.</li>
</ul>
< / section >
2018-03-04 16:37:37 -05:00
< / section >
< section >
2018-03-04 21:29:25 -05:00
2018-03-04 16:37:37 -05:00
< section >
2018-03-04 21:29:25 -05:00
< h3 > (Some) Estimation Techniques< / h3 >
< dl style = "font-size: 80%" >
< dt style = "color: grey;" > Guess Randomly< / dt >
< dd style = "color: grey;" > Rules of thumb if you have no other options...< / dd >
< dt style = "color: grey;" > Uniform Prior< / dt >
< dd style = "color: grey;" > Use basic statistics to make a very rough guess.< / dd >
< dt style = "color: grey;" > Sampling / History< / dt >
< dd style = "color: grey;" > Small, Quick Sampling Runs (or prior executions of the query).< / dd >
< dt style = "color: blue;" > Histograms< / dt >
< dd style = "color: blue;" > Using more detailed statistics for improved guesses.< / dd >
< dt style = "color: grey;" > Constraints< / dt >
< dd style = "color: grey;" > Using rules about the data for improved guesses.< / dd >
< / dl >
< / section >
< section >
< h3 > Limitations of Uniform Prior< / h3 >
2018-03-04 16:37:37 -05:00
< dl >
2018-03-04 21:29:25 -05:00
< div class = "fragment highlight-grey" data-fragment-index = "1" >
2018-03-04 16:37:37 -05:00
< dt > Don't always have statistics for $Q$< / dt >
< dd > For example, $\pi_{A \leftarrow (B \times C)}(R)$< / dd >
< / div >
2018-03-04 21:29:25 -05:00
< div class = "fragment highlight-grey" data-fragment-index = "1" >
2018-03-04 16:37:37 -05:00
< dt > Don't always have clear rules for $c$< / dt >
< dd > For example, $\sigma_{\texttt{FitsModel}(A, B, C)}(R)$< / dd >
< / div >
2018-03-04 21:29:25 -05:00
< div class = "fragment highlight-blue" data-fragment-index = "1" >
2018-03-04 16:37:37 -05:00
< dt > Attribute values are not always uniformly distributed.< / dt >
< dd > For example, < span style = "font-size: 60%" > $|\sigma_{SPC\_COMMON = 'pin\ oak'}(T)|$ vs $|\sigma_{SPC\_COMMON = 'honeylocust'}(T)|$< / span > < / dd >
< / div >
2018-03-04 21:29:25 -05:00
< div class = "fragment highlight-grey" data-fragment-index = "1" >
2018-03-04 16:37:37 -05:00
< dt > Attribute values are sometimes correlated.< / dt >
<dd>For example, $\sigma_{(stump &lt; 5) \wedge (diam &gt; 3)}(T)$</dd>
< / div >
< / dl >
< / section >
2018-03-04 21:29:25 -05:00
< section >
< p class = "fragment highlight-grey" data-fragment-index = "1" >
< b > Ideal Case:< / b > You have some
$$f(x) = \left(\texttt{SELECT COUNT(*) WHERE A = x}\right)$$
(and similarly for the other aggregates)
< / p >
< p class = "fragment" data-fragment-index = "1" >
< b > Slightly Less Ideal Case:< / b > You have some
$$f(x) \approx \left(\texttt{SELECT COUNT(*) WHERE A = x}\right)$$
< / p >
< / section >
< section >
< p > If this sounds like CDF-based indexing... you're right!< / p >
< p class = "fragment" > ... but we're not going to talk about NNs today< / p >
< / section >
< / section >
< section >
< section >
< p >
< b > Simpler/Faster Idea: < / b > Break $f(x)$ into chunks
< / p >
< / section >
< section >
< h3 > Example Data< / h3 >
< table style = "font-size: 80%" >
< tr > < th > Name< / th > < th > YearsEmployed< / th > < th > Role< / th > < / tr >
< tr > < td > 'Alice'< / td > < td > 3< / td > < td > 1< / td > < / tr >
< tr > < td > 'Bob'< / td > < td > 2< / td > < td > 2< / td > < / tr >
< tr > < td > 'Carol'< / td > < td > 3< / td > < td > 1< / td > < / tr >
< tr > < td > 'Dave'< / td > < td > 1< / td > < td > 3< / td > < / tr >
< tr > < td > 'Eve'< / td > < td > 2< / td > < td > 2< / td > < / tr >
< tr > < td > 'Fred'< / td > < td > 2< / td > < td > 3< / td > < / tr >
< tr > < td > 'Gwen'< / td > < td > 4< / td > < td > 1< / td > < / tr >
< tr > < td > 'Harry'< / td > < td > 2< / td > < td > 3< / td > < / tr >
< / table >
< / section >
< section >
< h3 > Histograms< / h3 >
< table style = "font-size: 70%" >
< tr > < th > YearsEmployed< / th > < th > COUNT< / th > < / tr >
< tr > < td > 1< / td > < td > 1< / td > < / tr >
< tr > < td > 2< / td > < td > 4< / td > < / tr >
< tr > < td > 3< / td > < td > 2< / td > < / tr >
< tr > < td > 4< / td > < td > 1< / td > < / tr >
< / table >
< table >
< tr class = "fragment" > < td style = "font-size: 70%" > < code > COUNT(DISTINCT YearsEmployed)< / code > < / td > < td class = "fragment" > $= 4$< / td > < / tr >
< tr class = "fragment" > < td style = "font-size: 70%" > < code > MIN(YearsEmployed)< / code > < / td > < td class = "fragment" > $= 1$< / td > < / tr >
<tr class="fragment"><td style="font-size: 70%"><code>MAX(YearsEmployed)</code></td><td class="fragment">$= 4$</td></tr>
< tr class = "fragment" > < td style = "font-size: 70%" > < code > COUNT(*) YearsEmployed = 2< / code > < / td > < td class = "fragment" > $= 4$< / td > < / tr >
< / table >
< / section >
< section >
< h3 > Histograms< / h3 >
< table style = "font-size: 70%" >
< tr > < th > YearsEmployed< / th > < th > COUNT< / th > < / tr >
< tr > < td > 1-2< / td > < td > 5< / td > < / tr >
< tr > < td > 3-4< / td > < td > 3< / td > < / tr >
< / table >
< table >
< tr class = "fragment" > < td style = "font-size: 70%" > < code > COUNT(DISTINCT YearsEmployed)< / code > < / td > < td class = "fragment" > $= 4$< / td > < / tr >
< tr class = "fragment" > < td style = "font-size: 70%" > < code > MIN(YearsEmployed)< / code > < / td > < td class = "fragment" > $= 1$< / td > < / tr >
<tr class="fragment"><td style="font-size: 70%"><code>MAX(YearsEmployed)</code></td><td class="fragment">$= 4$</td></tr>
< tr class = "fragment" > < td style = "font-size: 70%" > < code > COUNT(*) YearsEmployed = 2< / code > < / td > < td class = "fragment" > $= \frac{5}{2}$< / td > < / tr >
< / table >
< / section >
< section >
< h3 > The Extreme Case< / h3 >
< table style = "font-size: 70%" >
< tr > < th > YearsEmployed< / th > < th > COUNT< / th > < / tr >
< tr > < td > 1-4< / td > < td > 8< / td > < / tr >
< / table >
< table >
< tr class = "fragment" > < td style = "font-size: 70%" > < code > COUNT(DISTINCT YearsEmployed)< / code > < / td > < td class = "fragment" > $= 4$< / td > < / tr >
< tr class = "fragment" > < td style = "font-size: 70%" > < code > MIN(YearsEmployed)< / code > < / td > < td class = "fragment" > $= 1$< / td > < / tr >
<tr class="fragment"><td style="font-size: 70%"><code>MAX(YearsEmployed)</code></td><td class="fragment">$= 4$</td></tr>
< tr class = "fragment" > < td style = "font-size: 70%" > < code > COUNT(*) YearsEmployed = 2< / code > < / td > < td class = "fragment" > $= \frac{8}{4}$< / td > < / tr >
< / table >
< / section >
< section >
< h3 > More Example Data< / h3 >
< table style = "font-size: 80%; float: left;" >
< tr > < th > Value< / th > < th > COUNT< / th > < / tr >
< tr > < td > 1-10< / td > < td > 20< / td > < / tr >
< tr > < td > 11-20< / td > < td > 0< / td > < / tr >
< tr > < td > 21-30< / td > < td > 15< / td > < / tr >
< tr > < td > 31-40< / td > < td > 30< / td > < / tr >
< tr > < td > 41-50< / td > < td > 22< / td > < / tr >
< tr > < td > 51-60< / td > < td > 63< / td > < / tr >
< tr > < td > 61-70< / td > < td > 10< / td > < / tr >
< tr > < td > 71-80< / td > < td > 10< / td > < / tr >
< / table >
< table style = "margin-top: 100px;" >
< tr class = "fragment" >
< td style = "font-size: 70%; width: 350px;" > < code > SELECT … WHERE A = 33< / code > < / td >
< td class = "fragment" style = "font-size: 80%; text-align: left; width: 200px;" > $= \frac{1}{40-30}\cdot 30 = 3$< / td >
< / tr >
< tr > < td style = "height: 70px;" > < / td > < td > < / td > < / tr >
< tr class = "fragment" >
< td style = "font-size: 70%; width: 350px;" > < code > SELECT … WHERE A > 33< / code > < / td >
< td class = "fragment" style = "font-size: 80%; text-align: left; width: 200px;" > $= \frac{40-33}{40-30}\cdot 30+22$ $\;\;\;+63+10+10$ $= 126$ < / td >
< / tr >
< / table >
< / section >
< / section >
< section >
2018-03-04 16:37:37 -05:00
< section >
< h3 > (Some) Estimation Techniques< / h3 >
< dl style = "font-size: 80%" >
< dt style = "color: grey;" > Guess Randomly< / dt >
< dd style = "color: grey;" > Rules of thumb if you have no other options...< / dd >
< dt style = "color: grey;" > Uniform Prior< / dt >
< dd style = "color: grey;" > Use basic statistics to make a very rough guess.< / dd >
< dt style = "color: grey;" > Sampling / History< / dt >
< dd style = "color: grey;" > Small, Quick Sampling Runs (or prior executions of the query).< / dd >
2018-03-04 21:29:25 -05:00
< dt style = "color: grey;" > Histograms< / dt >
< dd style = "color: grey;" > Using more detailed statistics for improved guesses.< / dd >
2018-03-04 16:37:37 -05:00
2018-03-04 21:29:25 -05:00
< dt style = "color: blue;" > Constraints< / dt >
< dd style = "color: blue;" > Using rules about the data for improved guesses.< / dd >
2018-03-04 16:37:37 -05:00
< / dl >
< / section >
2018-03-04 15:00:02 -05:00
< / section >
2018-03-07 00:45:02 -05:00
< section >
< section >
< h3 > (Some) Estimation Techniques< / h3 >
< dl style = "font-size: 80%" >
< dt style = "color: grey;" > Guess Randomly< / dt >
< dd style = "color: grey;" > Rules of thumb if you have no other options...< / dd >
< dt style = "color: grey;" > Uniform Prior< / dt >
< dd style = "color: grey;" > Use basic statistics to make a very rough guess.< / dd >
< dt style = "color: grey;" > Sampling / History< / dt >
< dd style = "color: grey;" > Small, Quick Sampling Runs (or prior executions of the query).< / dd >
< dt style = "color: grey;" > Histograms< / dt >
< dd style = "color: grey;" > Using more detailed statistics for improved guesses.< / dd >
< dt style = "color: blue;" > Constraints< / dt >
< dd style = "color: blue;" > Using rules about the data for improved guesses.< / dd >
< / dl >
< / section >
< section >
< h3 > Key / Unique Constraints< / h3 >
< pre style = "margin-top: 50px;" > < code class = "sql" >
CREATE TABLE R (
A int,
B int UNIQUE
...
PRIMARY KEY A
);
< / code > < / pre >
< p style = "margin-top: 50px;" >
No duplicate values in the column.
$$\texttt{COUNT(DISTINCT A)} = \texttt{COUNT(*)}$$
< / p >
< / section >
< section >
< h3 > Foreign Key Constraints< / h3 >
< pre style = "margin-top: 50px;" > < code class = "sql" >
CREATE TABLE S (
B int,
...
FOREIGN KEY B REFERENCES R.B
);
< / code > < / pre >
< p style = "margin-top: 50px;" >
All values in the column appear in another table.
$$\pi_{attrs(S)}\left(S \bowtie_B R\right) \subseteq S$$
< / p >
< / section >
< section >
< h3 > Functional Dependencies< / h3 >
< pre style = "margin-top: 50px;" > < code class = "sql" >
Not expressible in SQL
< / code > < / pre >
< p style = "margin-top: 50px;" >
One set of columns uniquely determines another.< br / >
$\pi_{A}(\delta(\pi_{A, B}(R)))$ has no duplicates and...
$$\pi_{attrs(R)-A}(R) \bowtie_A \delta(\pi_{A, B}(R)) = R$$
< / p >
< / section >
< section >
< h3 > Constraints< / h3 >
< h4 > The Good< / h4 >
< ul >
< li style = "font-size: 70%" class = "fragment" > Sanity check on your data: Inconsistent data triggers failures.< / li >
< li style = "font-size: 70%" class = "fragment" > More opportunities for query optimization.< / li >
< / ul >
< h4 style = "margin-top: 50px;" class = "fragment" > The Not-So Good< / h4 >
< ul >
< li style = "font-size: 70%" class = "fragment" > Validating constraints whenever data changes is (usually) expensive.< / li >
< li style = "font-size: 70%" class = "fragment" > Inconsistent data triggers failures.< / li >
< / ul >
< / section >
< section >
< h3 > Foreign Key Constraints< / h3 >
< p style = "margin-top: 50px;" > Foreign keys are like pointers. What happens with broken pointers?< / p >
< / section >
< section >
< h3 > Foreign Key Enforcement< / h3 >
< p > Foreign keys are defined with update triggers < code > ON INSERT [X]< / code > , < code > ON UPDATE [X]< / code > , < code > ON DELETE [X]< / code > . Depending on what [X] is, the constraint is enforced differently:< / p >
< dl style = "font-size: 80%" >
< dt > < code > CASCADE< / code > < / dt >
< dd > Create/delete rows as needed to avoid invalid foreign keys.< / dd >
< dt > < code > NO ACTION< / code > < / dt >
< dd > Abort any transaction that ends with an invalid foreign key reference.< / dd >
< dt > < code > SET NULL< / code > < / dt >
<dd>Automatically replace any invalid foreign key references with NULL.</dd>
< / dl >
< / section >
< section >
< p style = "font-weight: bold;" >
< code > CASCADE< / code > and < code > NO ACTION< / code > ensure that the data never has broken pointers, so
< / p >
$$\pi_{attrs(S)}\left(S \bowtie_B R\right) = S$$
< / section >
< section >
< h3 > Functional Dependencies< / h3 >
<p style="margin-top: 50px;"><b>A generalization of keys:</b> One set of attributes that uniquely identifies another.</p>
< ul >
< li > SS# uniquely identifies Name.< / li >
< li > Employee uniquely identifies Manager.< / li >
< li > Order number uniquely identifies Customer Address.< / li >
< / ul >
< p class = "fragment" > Two rows with the same As must have the same Bs< / p >
< p class = "fragment" style = "font-size: 80%" > (but can still have identical Bs for two different As)< / p >
< / section >
< section >
< h3 > Normal Forms< / h3 >
< p style = "margin-top: 50px;" > "All functional dependencies should be keys."< / p >
< p class = "fragment" > (Otherwise you want two separate relations)< / p >
< p class = "fragment" > (for more details, see CSE 560)< / p >
< / section >
< section >
$$P(A = B) = min\left(\frac{1}{\texttt{COUNT}(\texttt{DISTINCT } A)}, \frac{1}{\texttt{COUNT}(\texttt{DISTINCT } B)}\right)$$
< / section >
< section >
< p >
$$R \bowtie_{R.A = S.B} S = \sigma_{R.A = S.B}(R \times S)$$
(and $S.B$ is a foreign key referencing $R.A$)
< / p >
< p class = "fragment" style = "margin-top: 30px; font-size: 80%" >
The (foreign) key constraint gives us two things...
$$\texttt{COUNT}(\texttt{DISTINCT } A) \approx \texttt{COUNT}(\texttt{DISTINCT } B)$$
< span style = "font-size: 60%; font-weight: bold; margin: 0px;" > and< / span >
$$\texttt{COUNT}(\texttt{DISTINCT } A) = |R|$$
< / p >
< p class = "fragment" style = "margin-top: 30px; font-size: 80%" >
Based on the first property the total number of rows is roughly...
$$|R| \times |S| \times \frac{1}{\texttt{COUNT}(\texttt{DISTINCT } A)}$$
< / p >
< p class = "fragment" style = "margin-top: 30px; font-size: 80%" >
Then based on the second property...
$$ = |R| \times |S| \times \frac{1}{|R|} = |S|$$
< / p >
< p class = "fragment" style = "margin-top: 30px; font-size: 50%" > (Statistics/Histograms will give you the same outcome... but constraints can be easier to propagate)< / p >
< / section >
< / section >
< section >
< p > < b > Next class:< / b > Exam Review< / p >
< / section >
2018-03-04 15:00:02 -05:00
< / div > < / div >
< script src = "../reveal.js-3.6.0/js/reveal.js" > < / script >
< script >
// Start the presentation. Full list of configuration options available at:
// https://github.com/hakimel/reveal.js#configuration
Reveal.initialize({
  // Presentation chrome and navigation.
  controls: true,
  progress: true,
  history: true,
  center: true,
  slideNumber: true,
  transition: 'fade', // none/fade/slide/convex/concave/zoom

  // Chart styling defaults.
  // NOTE(review): presumably consumed by the chart plugin (csv2chart.js)
  // listed in `dependencies` below — confirm against the plugin's README.
  chart: {
    defaults: {
      global: {
        title: { fontColor: "#333", fontSize: 24 },
        legend: {
          labels: { fontColor: "#333", fontSize: 20 },
        },
        responsiveness: true
      },
      scale: {
        scaleLabel: { fontColor: "#333", fontSize: 20 },
        gridLines: { color: "#333", zeroLineColor: "#333" },
        ticks: { fontColor: "#333", fontSize: 16 },
      }
    },
    line: { borderColor: [ "rgba(20,220,220,.8)" , "rgba(220,120,120,.8)", "rgba(20,120,220,.8)" ], "borderDash": [ [5,10], [0,0] ]},
    bar: { backgroundColor: [
        "rgba(220,220,220,0.8)",
        "rgba(151,187,205,0.8)",
        "rgba(205,151,187,0.8)",
        "rgba(187,205,151,0.8)"
      ]
    },
    pie: { backgroundColor: [ ["rgba(0,0,0,.8)" , "rgba(220,20,20,.8)", "rgba(20,220,20,.8)", "rgba(220,220,20,.8)", "rgba(20,20,220,.8)"] ]},
    radar: { borderColor: [ "rgba(20,220,220,.8)" , "rgba(220,120,120,.8)", "rgba(20,120,220,.8)" ]},
  },

  // Optional reveal.js plugins
  dependencies: [
    // classList polyfill: only requested when the browser lacks document.body.classList.
    { src: '../reveal.js-3.6.0/lib/js/classList.js', condition: function() { return !document.body.classList; } },
    // Math plugin, always loaded.
    // NOTE(review): `mathjax` is set on the dependency entry itself; confirm the
    // math plugin reads it from here rather than from a top-level `math` option.
    { src: '../reveal.js-3.6.0/plugin/math/math.js',
      condition: function() { return true; },
      mathjax: '../reveal.js-3.6.0/js/MathJax.js'
    },
    // Markdown support: only requested when some element carries a data-markdown attribute.
    { src: '../reveal.js-3.6.0/plugin/markdown/marked.js', condition: function() { return !!document.querySelector( '[data-markdown]' ); } },
    { src: '../reveal.js-3.6.0/plugin/markdown/markdown.js', condition: function() { return !!document.querySelector( '[data-markdown]' ); } },
    // Syntax highlighting: only requested when the deck contains a <pre><code> block.
    { src: '../reveal.js-3.6.0/plugin/highlight/highlight.js', async: true, condition: function() { return !!document.querySelector( 'pre code' ); }, callback: function() { hljs.initHighlightingOnLoad(); } },
    { src: '../reveal.js-3.6.0/plugin/zoom-js/zoom.js', async: true },
    { src: '../reveal.js-3.6.0/plugin/notes/notes.js', async: true },
    // Chart.min.js
    { src: '../reveal.js-3.6.0/plugin/chart/Chart.min.js'},
    // the plugin
    { src: '../reveal.js-3.6.0/plugin/chart/csv2chart.js'},
    // Inline-SVG support, loaded synchronously (async: false) in this order.
    { src: '../reveal.js-3.6.0/plugin/svginline/es6-promise.auto.js', async: false },
    { src: '../reveal.js-3.6.0/plugin/svginline/data-src-svg.js', async: false }
  ]
});
< / script >
< / body >
< / html >