Website/slides/cse4562sp2018/2018-01-31-SQL+Physical.html
2018-02-05 10:39:46 -05:00

451 lines
21 KiB
HTML
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>CSE 4/562 - Spring 2018</title>
<meta name="description" content="CSE 4/562 - Spring 2018">
<meta name="author" content="Oliver Kennedy">
<meta name="apple-mobile-web-app-capable" content="yes" />
<meta name="apple-mobile-web-app-status-bar-style" content="black-translucent" />
<meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no, minimal-ui">
<link rel="stylesheet" href="../reveal.js-3.6.0/css/reveal.css">
<link rel="stylesheet" href="ubodin.css" id="theme">
<!-- Code syntax highlighting -->
<link rel="stylesheet" href="../reveal.js-3.6.0/lib/css/zenburn.css">
<!-- Printing and PDF exports -->
<script>
// Inject the print stylesheet at load time: the PDF-export sheet when the
// URL query string contains "print-pdf" (any letter case), otherwise the
// regular paper sheet.
var link = document.createElement( 'link' );
if ( /print-pdf/i.test( window.location.search ) ) {
    link.href = '../reveal.js-3.6.0/css/print/pdf.css';
} else {
    link.href = '../reveal.js-3.6.0/css/print/paper.css';
}
link.type = 'text/css';
link.rel = 'stylesheet';
document.getElementsByTagName( 'head' )[0].appendChild( link );
</script>
<!--[if lt IE 9]>
<script src="../reveal.js-3.6.0/lib/js/html5shiv.js"></script>
<![endif]-->
</head>
<body>
<div class="reveal">
<!-- Any section element inside of this container is displayed as a slide -->
<div class="header">
<!-- Any Talk-Specific Header Content Goes Here -->
CSE 4/562 - Database Systems
</div>
<div class="slides">
<section>
<h1>SQL &amp;<br/> Physical Layout</h1>
<h3>CSE 4/562 Database Systems</h3>
<h5>January 31, 2018</h5>
</section>
<section>
<section>
<h2>SQL</h2>
<ul>
<li>Developed by IBM (for System R) in the 1970s.</li>
<li>Standard used by many vendors.<ul style="font-size: 70%" class="tight">
<li>SQL-86 (original standard)</li>
<li>SQL-89 (minor revisions; integrity constraints)</li>
<li>SQL-92 (major revision; basis for modern SQL)</li>
<li>SQL-99 (XML, window queries, generated default values)</li>
<li>SQL 2003 (major revisions to XML support)</li>
<li>SQL 2008 (minor extensions)</li>
<li>SQL 2011 (minor extensions; temporal databases)</li>
</ul></li>
</ul>
</section>
<section>
<h3>A Basic SQL Query</h3>
<svg data-src="graphics/2018-01-31-parts_of_sql.svg" height="400px"/>
</section>
<section>
<pre><code class="sql">
SELECT [DISTINCT] targetlist
FROM relationlist
WHERE condition
</code></pre>
<ol>
<li class="fragment">Compute the $2^n$ combinations of tuples in all relations appearing in <span style="color: red;">relationlist</span></li>
<li class="fragment">Discard tuples that fail the <span style="color: red;">condition</span></li>
<li class="fragment">Delete attributes not in <span style="color: red;">targetlist</span></li>
<li class="fragment">If <span style="font-family: Courier, monospace;">DISTINCT</span> is specified, eliminate duplicate rows</li>
</ol>
<p style="font-size: 70%;" class="fragment">
This is the least efficient strategy to compute a query!
A good optimizer will find <b>more efficient strategies</b> to compute <b>the same answer.</b>
</p>
</section>
<section>
<h3>Example Data</h3>
<img src="graphics/2018-01-31-Trees.png" height="500px" alt="Example data: the Trees table">
</section>
<section>
<pre><code class="SQL">SELECT * FROM Trees;</code></pre>
<p class="fragment" style="font-size: 70%">Wildcards (<code>*</code>, <code>tablename.*</code>) are special targets that select all attributes.</p>
<div style="width: 800px; overflow-x: scroll; font-size: small; margin-left: auto; margin-right: auto;" class="fragment">
<table>
<tr><th>CREATED_AT</th><th>TREE_ID</th><th>BLOCK_ID</th><th>THE_GEOM</th><th>TREE_DBH</th><th>STUMP_DIAM</th><th>CURB_LOC</th><th>STATUS</th><th>HEALTH</th><th>SPC_LATIN</th><th>SPC_COMMON</th><th>STEWARD</th><th>GUARDS</th><th>SIDEWALK</th><th>USER_TYPE</th><th>PROBLEMS</th><th>ROOT_STONE</th><th>ROOT_GRATE</th><th>ROOT_OTHER</th><th>TRNK_WIRE</th><th>TRNK_LIGHT</th><th>TRNK_OTHER</th><th>BRNCH_LIGH</th><th>BRNCH_SHOE</th><th>BRNCH_OTHE</th><th>ADDRESS</th><th>ZIPCODE</th><th>ZIP_CITY</th><th>CB_NUM</th><th>BOROCODE</th><th>BORONAME</th><th>CNCLDIST</th><th>ST_ASSEM</th><th>ST_SENATE</th><th>NTA</th><th>NTA_NAME</th><th>BORO_CT</th><th>STATE</th><th>LATITUDE</th><th>LONGITUDE</th><th>X_SP</th><th>Y_SP</th></tr>
<tr><td>'08/27/2015'</td><td>180683</td><td>348711</td><td>'POINT (-73.84421521958048 40.723091773924274)'</td><td>3</td><td>0</td><td>'OnCurb'</td><td>'Alive'</td><td>'Fair'</td><td>'Acer rubrum'</td><td>'red maple'</td><td>'None'</td><td>'None'</td><td>'NoDamage'</td><td>'TreesCount Staff'</td><td>'None'</td><td>'No'</td><td>'No'</td><td>'No'</td><td>'No'</td><td>'No'</td><td>'No'</td><td>'No'</td><td>'No'</td><td>'No'</td><td>'108-005 70 AVENUE'</td><td>'11375'</td><td>'Forest Hills'</td><td>406</td><td>4</td><td>'Queens'</td><td>29</td><td>28</td><td>16</td><td>'QN17'</td><td>'Forest Hills'</td><td>4073900</td><td>'New York'</td><td>40.72309177</td><td>-73.84421522</td><td>1027431.14821</td><td>202756.768749</td></tr>
<tr><td>'09/03/2015'</td><td>200540</td><td>315986</td><td>'POINT (-73.81867945834878 40.79411066708779)'</td><td>21</td><td>0</td><td>'OnCurb'</td><td>'Alive'</td><td>'Fair'</td><td>'Quercus palustris'</td><td>'pin oak'</td><td>'None'</td><td>'None'</td><td>'Damage'</td><td>'TreesCount Staff'</td><td>'Stones'</td><td>'Yes'</td><td>'No'</td><td>'No'</td><td>'No'</td><td>'No'</td><td>'No'</td><td>'No'</td><td>'No'</td><td>'No'</td><td>'147-074 7 AVENUE'</td><td>'11357'</td><td>'Whitestone'</td><td>407</td><td>4</td><td>'Queens'</td><td>19</td><td>27</td><td>11</td><td>'QN49'</td><td>'Whitestone'</td><td>4097300</td><td>'New York'</td><td>40.79411067</td><td>-73.81867946</td><td>1034455.70109</td><td>228644.837379</td></tr>
<tr><td>'09/05/2015'</td><td>204026</td><td>218365</td><td>'POINT (-73.93660770459083 40.717580740099116)'</td><td>3</td><td>0</td><td>'OnCurb'</td><td>'Alive'</td><td>'Good'</td><td>'Gleditsia triacanthos var. inermis'</td><td>'honeylocust'</td><td>'1or2'</td><td>'None'</td><td>'Damage'</td><td>'Volunteer'</td><td>'None'</td><td>'No'</td><td>'No'</td><td>'No'</td><td>'No'</td><td>'No'</td><td>'No'</td><td>'No'</td><td>'No'</td><td>'No'</td><td>'390 MORGAN AVENUE'</td><td>'11211'</td><td>'Brooklyn'</td><td>301</td><td>3</td><td>'Brooklyn'</td><td>34</td><td>50</td><td>18</td><td>'BK90'</td><td>'East Williamsburg'</td><td>3044900</td><td>'New York'</td><td>40.71758074</td><td>-73.9366077</td><td>1001822.83131</td><td>200716.891267</td></tr>
<tr><td>'09/05/2015'</td><td>204337</td><td>217969</td><td>'POINT (-73.93445615919741 40.713537494833226)'</td><td>10</td><td>0</td><td>'OnCurb'</td><td>'Alive'</td><td>'Good'</td><td>'Gleditsia triacanthos var. inermis'</td><td>'honeylocust'</td><td>'None'</td><td>'None'</td><td>'Damage'</td><td>'Volunteer'</td><td>'Stones'</td><td>'Yes'</td><td>'No'</td><td>'No'</td><td>'No'</td><td>'No'</td><td>'No'</td><td>'No'</td><td>'No'</td><td>'No'</td><td>'1027 GRAND STREET'</td><td>'11211'</td><td>'Brooklyn'</td><td>301</td><td>3</td><td>'Brooklyn'</td><td>34</td><td>53</td><td>18</td><td>'BK90'</td><td>'East Williamsburg'</td><td>3044900</td><td>'New York'</td><td>40.71353749</td><td>-73.93445616</td><td>1002420.35833</td><td>199244.253136</td></tr>
<tr><td>'08/30/2015'</td><td>189565</td><td>223043</td><td>'POINT (-73.97597938483258 40.66677775537875)'</td><td>21</td><td>0</td><td>'OnCurb'</td><td>'Alive'</td><td>'Good'</td><td>'Tilia americana'</td><td>'American linden'</td><td>'None'</td><td>'None'</td><td>'Damage'</td><td>'Volunteer'</td><td>'Stones'</td><td>'Yes'</td><td>'No'</td><td>'No'</td><td>'No'</td><td>'No'</td><td>'No'</td><td>'No'</td><td>'No'</td><td>'No'</td><td>'603 6 STREET'</td><td>'11215'</td><td>'Brooklyn'</td><td>306</td><td>3</td><td>'Brooklyn'</td><td>39</td><td>44</td><td>21</td><td>'BK37'</td><td>'Park Slope-Gowanus'</td><td>3016500</td><td>'New York'</td><td>40.66677776</td><td>-73.97597938</td><td>990913.775046</td><td>182202.425999</td></tr>
<tr class="fragment"><td colspan="42" style="text-align: left; font-weight: bold;">... and 683783 more</td></tr>
</table>
</div>
</section>
<section>
<pre><code class="sql">
SELECT tree_id, spc_common, boroname
FROM Trees
WHERE boroname = 'Brooklyn'
</code></pre>
<p>In English, what does this query compute?</p>
<p style="font-size: smaller;" class="fragment">What is the ID, Common Name and Borough of Trees in Brooklyn?</p>
<table style="font-size: small;" class="fragment">
<tr><th>TREE_ID</th><th>SPC_COMMON</th><th>BORONAME</th></tr>
<tr><td>204026</td><td>'honeylocust'</td><td>'Brooklyn'</td></tr>
<tr><td>204337</td><td>'honeylocust'</td><td>'Brooklyn'</td></tr>
<tr><td>189565</td><td>'American linden'</td><td>'Brooklyn'</td></tr>
<tr><td>192755</td><td>'London planetree'</td><td>'Brooklyn'</td></tr>
<tr><td>189465</td><td>'London planetree'</td><td>'Brooklyn'</td></tr>
<tr><td style="font-weight: bold;" colspan="3">... and 177287 more</td></tr>
</table>
</section>
<section>
<pre><code class="sql">
SELECT latitude, longitude
FROM Trees, SpeciesInfo
WHERE Trees.spc_common = SpeciesInfo.name
AND SpeciesInfo.has_unpleasant_smell = 'Yes';
</code></pre>
<p>In English, what does this query compute?</p>
<p style="font-size: smaller;" class="fragment">What are the coordinates of Trees with bad smells?</p>
<table style="font-size: small;" class="fragment">
<tr><th>LATITUDE</th><th>LONGITUDE</th></tr>
<tr><td>40.59378755</td><td>-73.9915968</td></tr>
<tr><td>40.69149917</td><td>-73.97258754</td></tr>
<tr><td>40.74829709</td><td>-73.98065645</td></tr>
<tr><td>40.68767857</td><td>-73.96764605</td></tr>
<tr><td>40.739991</td><td>-73.86526993</td></tr>
<tr><td style="font-weight: bold;" colspan="2">... and more</td></tr>
</table>
</section>
<section>
<pre><code class="sql">
SELECT Trees.latitude, Trees.longitude
FROM Trees, SpeciesInfo
WHERE Trees.spc_common = SpeciesInfo.name
AND SpeciesInfo.has_unpleasant_smell = 'Yes';
</code></pre>
<p style="font-size: smaller;">... is the same as ...</p>
<pre><code class="sql">
SELECT T.latitude, T.longitude
FROM Trees T, SpeciesInfo S
WHERE T.spc_common = S.name
AND S.has_unpleasant_smell = 'Yes';
</code></pre>
<p style="font-size: smaller;">... is (usually) the same as ...</p>
<pre><code class="sql">
SELECT latitude, longitude
FROM Trees, SpeciesInfo
WHERE spc_common = name
AND has_unpleasant_smell = 'Yes';
</code></pre>
</section>
<section>
<h2>Expressions</h2>
<pre><code class="sql">
SELECT tree_id,
stump_diam / 2 AS stump_radius,
stump_area = 3.14 * stump_diam * stump_diam / 4
FROM Trees;
</code></pre>
<p style="font-size: 70%;">
Arithmetic expressions can appear in targets or conditions.
Use = or AS to assign names to these attributes.
(The behavior of unnamed attributes is unspecified)
</p>
</section>
<section>
<h2>Expressions</h2>
<pre><code class="sql">
SELECT tree_id, spc_common FROM Trees WHERE spc_common LIKE '%maple'
</code></pre>
<table style="font-size: small;">
<tr><th>TREE_ID</th><th>SPC_COMMON</th></tr>
<tr><td>180683</td><td>'red maple'</td></tr>
<tr><td>204325</td><td>'sycamore maple'</td></tr>
<tr><td>205044</td><td>'Amur maple'</td></tr>
<tr><td>184031</td><td>'red maple'</td></tr>
<tr><td>208974</td><td>'red maple'</td></tr>
</table>
<p style="font-size: 70%;">SQL uses single quotes for string literals</p>
<p style="font-size: 70%;"><code>LIKE</code> is used for String Matches</p>
<p style="font-size: 70%;"><code>%</code> matches 0 or more characters</p>
</section>
<section>
<h2>Union</h2>
<pre><code class="sql">
SELECT tree_id FROM Trees WHERE spc_common = 'red maple'
UNION [ALL]
SELECT tree_id FROM Trees WHERE spc_common = 'sycamore maple'
</code></pre>
<p style="font-size: 70%">Computes the <b>set-union</b> of any two <b>union-compatible</b> sets of tuples</p>
<p style="font-size: 70%">Adding <code>ALL</code> preserves duplicates across the inputs (<b>bag-union</b>).</p>
</section>
<section>
<h2>Aggregate Queries</h2>
<pre><code class="sql">
SELECT [DISTINCT] targetlist
FROM relationlist
WHERE condition
GROUP BY groupinglist
HAVING groupcondition
</code></pre>
<div style="font-size: 70%">
<p>The <span style="color: red;">targetlist</span> now contains <b>(a)</b> Grouped attributes, and <b>(b)</b> Aggregate expressions.</p>
<p>Targets of type (a) must be a subset of the grouping-list</p>
<p style="font-size: 70%">(intuitively each answer tuple corresponds to a single group, and each group must have a single value for each attribute)</p>
<p style="margin-top: 20px"><span style="color: red;">groupcondition</span> is applied <i>after</i> aggregation and may contain aggregate expressions.</p>
</div>
</section>
<section>
<h2>Aggregate Queries</h2>
<pre><code class="sql">
SELECT spc_common, count(*) FROM Trees GROUP BY spc_common
</code></pre>
<table style="font-size: small;">
<tr><th>SPC_COMMON </th><th>COUNT</th></tr>
<tr><td>''Schubert' chokecherry' </td><td>4888</td></tr>
<tr><td>'American beech' </td><td>273</td></tr>
<tr><td>'American elm' </td><td>7975</td></tr>
<tr><td>'American hophornbeam' </td><td>1081</td></tr>
<tr><td>'American hornbeam' </td><td>1517</td></tr>
<tr><td colspan="2" style="font-weight: bold;">... and more</td></tr>
</table>
</section>
</section>
<section>
<section>
<h2>Physical Layout</h2>
</section>
<section>
<pre><code class="python">
# Scan Trees.csv and print field 0 of every row whose field 30 is 'Brooklyn'.
from re import split;
with open('Trees.csv', 'r') as f:
    for line in f:
        fields = split(",", line);
        if(fields[30] == 'Brooklyn'):
            print(fields[0]);
</code></pre>
<aside class="notes">
Problems:
(1) Expensive: Full scan over the data + split expensive
(2) 'split' sensitive to formatting bugs
(3) Hardcoded Schema (e.g., 30 = BORONAME, 0 = TREE_ID)
(4) No type information (e.g., fields[5] / 2 for STUMP_DIAM)
</aside>
</section>
<section>
<h2>Record Layouts</h2>
</section>
<section>
<h3>Record Layout 1: Fixed</h3>
<svg data-src="graphics/2018-01-31-record-fixed.svg" />
</section>
<section>
<h3>Record Layout 2: Delimiters</h3>
<svg data-src="graphics/2018-01-31-record-separator.svg" />
</section>
<section>
<h3>Record Layout 3: Headers</h3>
<svg data-src="graphics/2018-01-31-record-header.svg" />
</section>
<section>
<h3>Record Formats</h3>
<dl>
<dt>Fixed</dt>
<dd>Constant-size fields. Field $i$ at byte $\sum_{j &lt; i} |Field_j|$</dd>
<dt>Delimited</dt>
<dd>Special character or string (e.g., <code>,</code>) between fields</dd>
<dt>Header</dt>
<dd>Fixed-size header points to start of each field</dd>
<dt>&nbsp;</dt>
<dd>&nbsp;</dd>
</dl>
</section>
<section>
<h3>File Formats</h3>
<dl>
<dt>Fixed</dt>
<dd>Constant-size records. Record $i$ at byte $|Record| \times i$</dd>
<dt>Delimited</dt>
<dd>Special character or string (e.g., <code>\r\n</code>) at record end</dd>
<dt>Header</dt>
<dd>Index in file points to start of each record</dd>
<dt class="fragment" data-fragment-index="1">Paged</dt>
<dd class="fragment" data-fragment-index="1">Align records to paging boundaries</dd>
</dl>
</section>
<section>
<img src="graphics/2018-01-31-mem_hierarchy.png" alt="Memory hierarchy">
</section>
<section>
<svg data-src="graphics/2018-01-31-mem_bulk_loading.svg" class="stretch"/>
<imagecredits>openclipart.org</imagecredits>
</section>
<section>
<dl>
<dt>File</dt>
<dd>A collection of pages (or records)</dd>
<dt>Page</dt>
<dd>A fixed-size collection of records</dd>
<dd style="font-size: smaller;">Page size is usually dictated by hardware.<br/>Mem Page $\approx$ 4KB&nbsp;&nbsp;&nbsp;Cache Line $\approx$ 64B</dd>
<dt>Record</dt>
<dd>One or more fields (for now)</dd>
<dt>Field</dt>
<dd>A primitive value (for now)</dd>
</dl>
</section>
<section>
<svg data-src="graphics/2018-01-29-db_as_mediator.svg" class="stretch"/>
</section>
<section>
<pre><code class="python">
# Same scan, but fields are looked up by name on each record.
with db_open('Trees') as data:
    for record in data:
        if(record['BORONAME'] == 'Brooklyn'):
            print(record['TREE_ID']);
</code></pre>
</section>
</section>
</div></div>
<script src="../reveal.js-3.6.0/lib/js/head.min.js"></script>
<script src="../reveal.js-3.6.0/js/reveal.js"></script>
<script>
// Start the slide deck. The complete list of configuration options is at:
// https://github.com/hakimel/reveal.js#configuration
(function () {
    // One colour is used for every chart label, grid line and tick.
    var INK = '#333';

    // True when at least one element in the page carries data-markdown.
    function usesMarkdown() {
        return document.querySelector( '[data-markdown]' ) !== null;
    }

    Reveal.initialize({
        controls: true,
        progress: true,
        history: true,
        center: true,
        slideNumber: true,
        // Accepted values: none/fade/slide/convex/concave/zoom
        transition: 'fade',

        // Settings handed to the chart plugin loaded in `dependencies`.
        chart: {
            defaults: {
                global: {
                    title: { fontColor: INK, fontSize: 24 },
                    legend: {
                        labels: { fontColor: INK, fontSize: 20 }
                    },
                    responsiveness: true
                },
                scale: {
                    scaleLabel: { fontColor: INK, fontSize: 20 },
                    gridLines: { color: INK, zeroLineColor: INK },
                    ticks: { fontColor: INK, fontSize: 16 }
                }
            },
            line: {
                borderColor: [
                    'rgba(20,220,220,.8)',
                    'rgba(220,120,120,.8)',
                    'rgba(20,120,220,.8)'
                ],
                borderDash: [ [5,10], [0,0] ]
            },
            bar: {
                backgroundColor: [
                    'rgba(220,220,220,0.8)',
                    'rgba(151,187,205,0.8)',
                    'rgba(205,151,187,0.8)',
                    'rgba(187,205,151,0.8)'
                ]
            },
            pie: {
                // A single nested palette, kept as an array inside an array.
                backgroundColor: [
                    [
                        'rgba(0,0,0,.8)',
                        'rgba(220,20,20,.8)',
                        'rgba(20,220,20,.8)',
                        'rgba(220,220,20,.8)',
                        'rgba(20,20,220,.8)'
                    ]
                ]
            },
            radar: {
                borderColor: [
                    'rgba(20,220,220,.8)',
                    'rgba(220,120,120,.8)',
                    'rgba(20,120,220,.8)'
                ]
            }
        },

        // Optional reveal.js plugins, listed in their original order.
        dependencies: [
            // classList shim, only when document.body has no classList.
            {
                src: '../reveal.js-3.6.0/lib/js/classList.js',
                condition: function () { return !document.body.classList; }
            },
            // Math plugin, always loaded.
            // NOTE(review): `mathjax` is set on this dependency entry; the
            // reveal.js math plugin is documented to take its MathJax path
            // from a top-level `math: { mathjax: ... }` option - confirm
            // that this key is actually read.
            {
                src: '../reveal.js-3.6.0/plugin/math/math.js',
                condition: function () { return true; },
                mathjax: '../reveal.js-3.6.0/js/MathJax.js'
            },
            // Markdown support, only when a slide asks for it.
            {
                src: '../reveal.js-3.6.0/plugin/markdown/marked.js',
                condition: usesMarkdown
            },
            {
                src: '../reveal.js-3.6.0/plugin/markdown/markdown.js',
                condition: usesMarkdown
            },
            // Syntax highlighting, only when the page has a <pre><code> block.
            {
                src: '../reveal.js-3.6.0/plugin/highlight/highlight.js',
                async: true,
                condition: function () {
                    return document.querySelector( 'pre code' ) !== null;
                },
                callback: function () { hljs.initHighlightingOnLoad(); }
            },
            { src: '../reveal.js-3.6.0/plugin/zoom-js/zoom.js', async: true },
            { src: '../reveal.js-3.6.0/plugin/notes/notes.js', async: true },
            // Chart.min.js
            { src: '../reveal.js-3.6.0/plugin/chart/Chart.min.js' },
            // The chart plugin itself
            { src: '../reveal.js-3.6.0/plugin/chart/csv2chart.js' },
            // Scripts from the svginline plugin directory, loaded synchronously.
            { src: '../reveal.js-3.6.0/plugin/svginline/es6-promise.auto.js', async: false },
            { src: '../reveal.js-3.6.0/plugin/svginline/data-src-svg.js', async: false }
        ]
    });
})();
</script>
</body>
</html>