Website/slides/cse4562sp2018/2018-01-31-SQL+Physical.html

451 lines
21 KiB
HTML
Raw Normal View History

2018-01-30 23:40:56 -05:00
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>CSE 4/562 - Spring 2018</title>
<meta name="description" content="CSE 4/562 - Spring 2018">
<meta name="author" content="Oliver Kennedy">
<meta name="apple-mobile-web-app-capable" content="yes" />
<meta name="apple-mobile-web-app-status-bar-style" content="black-translucent" />
<meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no, minimal-ui">
<link rel="stylesheet" href="../reveal.js-3.6.0/css/reveal.css">
<link rel="stylesheet" href="ubodin.css" id="theme">
<!-- Code syntax highlighting -->
<link rel="stylesheet" href="../reveal.js-3.6.0/lib/css/zenburn.css">
<!-- Printing and PDF exports -->
<script>
// Attach the print stylesheet: the PDF-export sheet when the URL query
// string contains "print-pdf", the paper sheet otherwise.
var link = document.createElement( 'link' );
link.type = 'text/css';
link.rel = 'stylesheet';
if ( window.location.search.match( /print-pdf/gi ) ) {
  link.href = '../reveal.js-3.6.0/css/print/pdf.css';
}
else {
  link.href = '../reveal.js-3.6.0/css/print/paper.css';
}
document.getElementsByTagName( 'head' )[0].appendChild( link );
</script>
<!--[if lt IE 9]>
<script src="../reveal.js-3.6.0/lib/js/html5shiv.js"></script>
<![endif]-->
</head>
<body>
<div class="reveal">
<!-- Any section element inside of this container is displayed as a slide -->
<div class="header">
<!-- Any Talk-Specific Header Content Goes Here -->
CSE 4/562 - Database Systems
</div>
<div class="slides">
<section>
<h1>SQL &amp;<br/> Physical Layout</h1>
<h3>CSE 4/562 Database Systems</h3>
<h5>January 31, 2018</h5>
</section>
<section>
<section>
<h2>SQL</h2>
<ul>
<li>Developed by IBM (for System R) in the 1970s.</li>
<li>Standard used by many vendors.<ul style="font-size: 70%" class="tight">
<li>SQL-86 (original standard)</li>
<li>SQL-89 (minor revisions; integrity constraints)</li>
<li>SQL-92 (major revision; basis for modern SQL)</li>
<li>SQL-99 (XML, window queries, generated default values)</li>
<li>SQL 2003 (major revisions to XML support)</li>
<li>SQL 2008 (minor extensions)</li>
<li>SQL 2011 (minor extensions; temporal databases)</li>
</ul></li>
</ul>
</section>
<section>
<h3>A Basic SQL Query</h3>
<svg data-src="graphics/2018-01-31-parts_of_sql.svg" height="400px"/>
</section>
<section>
<pre><code class="sql">
SELECT [DISTINCT] targetlist
FROM relationlist
WHERE condition
</code></pre>
<ol>
<li class="fragment">Compute every combination of tuples (the cross product) of the relations appearing in <span style="color: red;">relationlist</span></li>
<li class="fragment">Discard tuples that fail the <span style="color: red;">condition</span></li>
<li class="fragment">Delete attributes not in <span style="color: red;">targetlist</span></li>
<li class="fragment">If <span style="font-family: Courier, monospace;">DISTINCT</span> is specified, eliminate duplicate rows</li>
</ol>
<p style="font-size: 70%;" class="fragment">
This is the least efficient strategy to compute a query!
A good optimizer will find <b>more efficient strategies</b> to compute <b>the same answer.</b>
</p>
</section>
<section>
<h3>Example Data</h3>
<img src="graphics/2018-01-31-Trees.png" height="500px" alt="Example data: the Trees table">
</section>
<section>
<pre><code class="sql">SELECT * FROM Trees;</code></pre>
<p class="fragment" style="font-size: 70%">Wildcards (<code>*</code>, <code>tablename.*</code>) are special targets that select all attributes.</p>
<div style="width: 800px; overflow-x: scroll; font-size: small; margin-left: auto; margin-right: auto;" class="fragment">
<table>
<tr><th>CREATED_AT</th><th>TREE_ID</th><th>BLOCK_ID</th><th>THE_GEOM</th><th>TREE_DBH</th><th>STUMP_DIAM</th><th>CURB_LOC</th><th>STATUS</th><th>HEALTH</th><th>SPC_LATIN</th><th>SPC_COMMON</th><th>STEWARD</th><th>GUARDS</th><th>SIDEWALK</th><th>USER_TYPE</th><th>PROBLEMS</th><th>ROOT_STONE</th><th>ROOT_GRATE</th><th>ROOT_OTHER</th><th>TRNK_WIRE</th><th>TRNK_LIGHT</th><th>TRNK_OTHER</th><th>BRNCH_LIGH</th><th>BRNCH_SHOE</th><th>BRNCH_OTHE</th><th>ADDRESS</th><th>ZIPCODE</th><th>ZIP_CITY</th><th>CB_NUM</th><th>BOROCODE</th><th>BORONAME</th><th>CNCLDIST</th><th>ST_ASSEM</th><th>ST_SENATE</th><th>NTA</th><th>NTA_NAME</th><th>BORO_CT</th><th>STATE</th><th>LATITUDE</th><th>LONGITUDE</th><th>X_SP</th><th>Y_SP</th></tr>
<tr><td>'08/27/2015'</td><td>180683</td><td>348711</td><td>'POINT (-73.84421521958048 40.723091773924274)'</td><td>3</td><td>0</td><td>'OnCurb'</td><td>'Alive'</td><td>'Fair'</td><td>'Acer rubrum'</td><td>'red maple'</td><td>'None'</td><td>'None'</td><td>'NoDamage'</td><td>'TreesCount Staff'</td><td>'None'</td><td>'No'</td><td>'No'</td><td>'No'</td><td>'No'</td><td>'No'</td><td>'No'</td><td>'No'</td><td>'No'</td><td>'No'</td><td>'108-005 70 AVENUE'</td><td>'11375'</td><td>'Forest Hills'</td><td>406</td><td>4</td><td>'Queens'</td><td>29</td><td>28</td><td>16</td><td>'QN17'</td><td>'Forest Hills'</td><td>4073900</td><td>'New York'</td><td>40.72309177</td><td>-73.84421522</td><td>1027431.14821</td><td>202756.768749</td></tr>
<tr><td>'09/03/2015'</td><td>200540</td><td>315986</td><td>'POINT (-73.81867945834878 40.79411066708779)'</td><td>21</td><td>0</td><td>'OnCurb'</td><td>'Alive'</td><td>'Fair'</td><td>'Quercus palustris'</td><td>'pin oak'</td><td>'None'</td><td>'None'</td><td>'Damage'</td><td>'TreesCount Staff'</td><td>'Stones'</td><td>'Yes'</td><td>'No'</td><td>'No'</td><td>'No'</td><td>'No'</td><td>'No'</td><td>'No'</td><td>'No'</td><td>'No'</td><td>'147-074 7 AVENUE'</td><td>'11357'</td><td>'Whitestone'</td><td>407</td><td>4</td><td>'Queens'</td><td>19</td><td>27</td><td>11</td><td>'QN49'</td><td>'Whitestone'</td><td>4097300</td><td>'New York'</td><td>40.79411067</td><td>-73.81867946</td><td>1034455.70109</td><td>228644.837379</td></tr>
<tr><td>'09/05/2015'</td><td>204026</td><td>218365</td><td>'POINT (-73.93660770459083 40.717580740099116)'</td><td>3</td><td>0</td><td>'OnCurb'</td><td>'Alive'</td><td>'Good'</td><td>'Gleditsia triacanthos var. inermis'</td><td>'honeylocust'</td><td>'1or2'</td><td>'None'</td><td>'Damage'</td><td>'Volunteer'</td><td>'None'</td><td>'No'</td><td>'No'</td><td>'No'</td><td>'No'</td><td>'No'</td><td>'No'</td><td>'No'</td><td>'No'</td><td>'No'</td><td>'390 MORGAN AVENUE'</td><td>'11211'</td><td>'Brooklyn'</td><td>301</td><td>3</td><td>'Brooklyn'</td><td>34</td><td>50</td><td>18</td><td>'BK90'</td><td>'East Williamsburg'</td><td>3044900</td><td>'New York'</td><td>40.71758074</td><td>-73.9366077</td><td>1001822.83131</td><td>200716.891267</td></tr>
<tr><td>'09/05/2015'</td><td>204337</td><td>217969</td><td>'POINT (-73.93445615919741 40.713537494833226)'</td><td>10</td><td>0</td><td>'OnCurb'</td><td>'Alive'</td><td>'Good'</td><td>'Gleditsia triacanthos var. inermis'</td><td>'honeylocust'</td><td>'None'</td><td>'None'</td><td>'Damage'</td><td>'Volunteer'</td><td>'Stones'</td><td>'Yes'</td><td>'No'</td><td>'No'</td><td>'No'</td><td>'No'</td><td>'No'</td><td>'No'</td><td>'No'</td><td>'No'</td><td>'1027 GRAND STREET'</td><td>'11211'</td><td>'Brooklyn'</td><td>301</td><td>3</td><td>'Brooklyn'</td><td>34</td><td>53</td><td>18</td><td>'BK90'</td><td>'East Williamsburg'</td><td>3044900</td><td>'New York'</td><td>40.71353749</td><td>-73.93445616</td><td>1002420.35833</td><td>199244.253136</td></tr>
<tr><td>'08/30/2015'</td><td>189565</td><td>223043</td><td>'POINT (-73.97597938483258 40.66677775537875)'</td><td>21</td><td>0</td><td>'OnCurb'</td><td>'Alive'</td><td>'Good'</td><td>'Tilia americana'</td><td>'American linden'</td><td>'None'</td><td>'None'</td><td>'Damage'</td><td>'Volunteer'</td><td>'Stones'</td><td>'Yes'</td><td>'No'</td><td>'No'</td><td>'No'</td><td>'No'</td><td>'No'</td><td>'No'</td><td>'No'</td><td>'No'</td><td>'603 6 STREET'</td><td>'11215'</td><td>'Brooklyn'</td><td>306</td><td>3</td><td>'Brooklyn'</td><td>39</td><td>44</td><td>21</td><td>'BK37'</td><td>'Park Slope-Gowanus'</td><td>3016500</td><td>'New York'</td><td>40.66677776</td><td>-73.97597938</td><td>990913.775046</td><td>182202.425999</td></tr>
<tr class="fragment"><td colspan="42" style="text-align: left; font-weight: bold;">... and 683783 more</td></tr>
</table>
</div>
</section>
<section>
<pre><code class="sql">
SELECT tree_id, spc_common, boroname
FROM Trees
WHERE boroname = 'Brooklyn'
</code></pre>
<p>In English, what does this query compute?</p>
<p style="font-size: smaller;" class="fragment">What is the ID, Common Name and Borough of Trees in Brooklyn?</p>
<table style="font-size: small;" class="fragment">
2018-02-05 10:39:46 -05:00
<tr><th>TREE_ID</th><th>SPC_COMMON</th><th>BORONAME</th></tr>
<tr><td>204026</td><td>'honeylocust'</td><td>'Brooklyn'</td></tr>
<tr><td>204337</td><td>'honeylocust'</td><td>'Brooklyn'</td></tr>
<tr><td>189565</td><td>'American linden'</td><td>'Brooklyn'</td></tr>
<tr><td>192755</td><td>'London planetree'</td><td>'Brooklyn'</td></tr>
<tr><td>189465</td><td>'London planetree'</td><td>'Brooklyn'</td></tr>
<tr><td style="font-weight: bold;" colspan="3">... and 177287 more</td></tr>
2018-01-30 23:40:56 -05:00
</table>
</section>
<section>
<pre><code class="sql">
SELECT latitude, longitude
FROM Trees, SpeciesInfo
WHERE Trees.spc_common = SpeciesInfo.name
AND SpeciesInfo.has_unpleasant_smell = 'Yes';
</code></pre>
<p>In English, what does this query compute?</p>
<p style="font-size: smaller;" class="fragment">What are the coordinates of Trees with bad smells?</p>
<table style="font-size: small;" class="fragment">
<tr><th>LATITUDE</th><th>LONGITUDE</th></tr>
<tr><td>40.59378755</td><td>-73.9915968</td></tr>
<tr><td>40.69149917</td><td>-73.97258754</td></tr>
<tr><td>40.74829709</td><td>-73.98065645</td></tr>
<tr><td>40.68767857</td><td>-73.96764605</td></tr>
<tr><td>40.739991</td><td>-73.86526993</td></tr>
<tr><td style="font-weight: bold;" colspan="2">... and more</td></tr>
</table>
</section>
<section>
<pre><code class="sql">
SELECT Trees.latitude, Trees.longitude
FROM Trees, SpeciesInfo
WHERE Trees.spc_common = SpeciesInfo.name
AND SpeciesInfo.has_unpleasant_smell = 'Yes';
</code></pre>
<p style="font-size: smaller;">... is the same as ...</p>
<pre><code class="sql">
SELECT T.latitude, T.longitude
FROM Trees T, SpeciesInfo S
WHERE T.spc_common = S.name
AND S.has_unpleasant_smell = 'Yes';
</code></pre>
<p style="font-size: smaller;">... is (usually) the same as ...</p>
<pre><code class="sql">
SELECT latitude, longitude
FROM Trees, SpeciesInfo
WHERE spc_common = name
AND has_unpleasant_smell = 'Yes';
</code></pre>
</section>
<section>
<h2>Expressions</h2>
<pre><code class="sql">
SELECT tree_id,
stump_diam / 2 AS stump_radius,
stump_area = 3.14 * stump_diam * stump_diam / 4
FROM Trees;
</code></pre>
<p style="font-size: 70%;">
Arithmetic expressions can appear in targets or conditions.
Use = or AS to assign names to these attributes.
(The behavior of unnamed attributes is unspecified)
</p>
</section>
<section>
<h2>Expressions</h2>
<pre><code class="sql">
SELECT tree_id, spc_common FROM Trees WHERE spc_common LIKE '%maple'
</code></pre>
<table style="font-size: small;">
<tr><th>TREE_ID</th><th>SPC_COMMON</th></tr>
<tr><td>180683</td><td>'red maple'</td></tr>
<tr><td>204325</td><td>'sycamore maple'</td></tr>
<tr><td>205044</td><td>'Amur maple'</td></tr>
<tr><td>184031</td><td>'red maple'</td></tr>
<tr><td>208974</td><td>'red maple'</td></tr>
</table>
<p style="font-size: 70%;">SQL uses single quotes for string literals</p>
<p style="font-size: 70%;"><code>LIKE</code> is used for String Matches</p>
<p style="font-size: 70%;"><code>%</code> matches 0 or more characters</p>
</section>
<section>
<h2>Union</h2>
<pre><code class="sql">
SELECT tree_id FROM Trees WHERE spc_common = 'red maple'
UNION [ALL]
SELECT tree_id FROM Trees WHERE spc_common = 'sycamore maple'
</code></pre>
<p style="font-size: 70%">Computes the <b>set-union</b> of any two <b>union-compatible</b> sets of tuples</p>
2018-01-31 13:40:33 -05:00
<p style="font-size: 70%">Adding <code>ALL</code> preserves duplicates across the inputs (<b>bag-union</b>).</p>
2018-01-30 23:40:56 -05:00
</section>
<section>
<h2>Aggregate Queries</h2>
<pre><code class="sql">
SELECT [DISTINCT] targetlist
FROM relationlist
WHERE condition
GROUP BY groupinglist
2018-01-31 13:40:33 -05:00
HAVING groupcondition
2018-01-30 23:40:56 -05:00
</code></pre>
<div style="font-size: 70%">
2018-01-31 13:40:33 -05:00
<p>The <span style="color: red;">targetlist</span> now contains <b>(a)</b> Grouped attributes, and <b>(b)</b> Aggregate expressions.</p>
<p>Targets of type (a) must be a subset of the grouping-list</p>
2018-01-30 23:40:56 -05:00
<p style="font-size: 70%">(intuitively each answer tuple corresponds to a single group, and each group must have a single value for each attribute)</p>
2018-01-31 13:40:33 -05:00
<p style="margin-top: 20px"><span style="color: red;">groupcondition</span> is applied <i>after</i> aggregation and may contain aggregate expressions.</p>
2018-01-30 23:40:56 -05:00
</div>
</section>
<section>
<h2>Aggregate Queries</h2>
<pre><code class="sql">
SELECT spc_common, count(*) FROM Trees GROUP BY spc_common
</code></pre>
<table style="font-size: small;">
<tr><th>SPC_COMMON </th><th>COUNT</th></tr>
<tr><td>''Schubert' chokecherry' </td><td>4888</td></tr>
<tr><td>'American beech' </td><td>273</td></tr>
<tr><td>'American elm' </td><td>7975</td></tr>
<tr><td>'American hophornbeam' </td><td>1081</td></tr>
<tr><td>'American hornbeam' </td><td>1517</td></tr>
<tr><td colspan="2" style="font-weight: bold;">... and more</td></tr>
</table>
</section>
</section>
<section>
<section>
<h2>Physical Layout</h2>
</section>
<section>
<pre><code class="python">
from re import split;
with open('Trees.csv', 'r') as f:
for line in f:
2018-02-04 22:03:49 -05:00
fields = split(",", line);
2018-01-30 23:40:56 -05:00
if(fields[30] == 'Brooklyn'):
print(fields[0]);
</code></pre>
<aside class="notes">
Problems:
(1) Expensive: Full scan over the data + split expensive
(2) 'split' sensitive to formatting bugs
(3) Hardcoded Schema (e.g., 30 = BORONAME, 0 = TREE_ID)
(4) No type information (e.g., fields[5] / 2 for STUMP_DIAM)
</aside>
</section>
<section>
<h2>Record Layouts</h2>
</section>
<section>
<h3>Record Layout 1: Fixed</h3>
<svg data-src="graphics/2018-01-31-record-fixed.svg" />
</section>
<section>
<h3>Record Layout 2: Delimiters</h3>
<svg data-src="graphics/2018-01-31-record-separator.svg" />
</section>
<section>
2018-01-31 13:44:01 -05:00
<h3>Record Layout 3: Headers</h3>
2018-01-30 23:40:56 -05:00
<svg data-src="graphics/2018-01-31-record-header.svg" />
</section>
2018-01-31 13:44:01 -05:00
<section>
<h3>Record Formats</h3>
<dl>
<dt>Fixed</dt>
<dd>Constant-size fields. Field $i$ at byte $\sum_{j &lt; i} |Field_j|$</dd>
<dt>Delimited</dt>
<dd>Special character or string (e.g., <code>,</code>) between fields</dd>
<dt>Header</dt>
<dd>Fixed-size header points to start of each field</dd>
2018-01-31 13:45:47 -05:00
<dt>&nbsp;</dt>
<dd>&nbsp;</dd>
2018-01-31 13:44:01 -05:00
</dl>
</section>
2018-01-30 23:40:56 -05:00
<section>
<h3>File Formats</h3>
<dl>
<dt>Fixed</dt>
<dd>Constant-size records. Record $i$ at byte $|Record| \times i$</dd>
<dt>Delimited</dt>
<dd>Special character or string (e.g., <code>\r\n</code>) at record end</dd>
<dt>Header</dt>
<dd>Index in file points to start of each record</dd>
<dt class="fragment" data-fragment-index="1">Paged</dt>
<dd class="fragment" data-fragment-index="1">Align records to paging boundaries</dd>
</dl>
</section>
<section>
<img src="graphics/2018-01-31-mem_hierarchy.png" alt="Memory hierarchy">
</section>
<section>
<svg data-src="graphics/2018-01-31-mem_bulk_loading.svg" class="stretch"/>
2018-01-31 13:51:02 -05:00
<imagecredits>openclipart.org</imagecredits>
2018-01-30 23:40:56 -05:00
</section>
<section>
<dl>
<dt>File</dt>
<dd>A collection of pages (or records)</dd>
<dt>Page</dt>
<dd>A fixed-size collection of records</dd>
<dd style="font-size: smaller;">Page size is usually dictated by hardware.<br/>Mem Page $\approx$ 4KB&nbsp;&nbsp;&nbsp;Cache Line $\approx$ 64B</dd>
<dt>Record</dt>
<dd>One or more fields (for now)</dd>
<dt>Field</dt>
<dd>A primitive value (for now)</dd>
</dl>
</section>
2018-01-31 15:13:26 -05:00
<section>
2018-01-31 15:15:51 -05:00
<svg data-src="graphics/2018-01-29-db_as_mediator.svg" class="stretch"/>
2018-01-31 15:13:26 -05:00
</section>
<section>
<pre><code class="python">
with db_open('Trees') as data:
for record in data:
2018-02-04 22:03:49 -05:00
if(record['BORONAME'] == 'Brooklyn'):
print(record['TREE_ID']);
2018-01-31 15:13:26 -05:00
</code></pre>
</section>
2018-01-30 23:40:56 -05:00
</section>
</div></div>
<script src="../reveal.js-3.6.0/lib/js/head.min.js"></script>
<script src="../reveal.js-3.6.0/js/reveal.js"></script>
<script>
// Start the reveal.js presentation with this deck's settings.
// Full list of configuration options available at:
// https://github.com/hakimel/reveal.js#configuration
// NOTE(review): the bare timestamp lines inside this call look like
// blame-view residue rather than JavaScript -- confirm against the real file.
Reveal.initialize({
2018-02-02 01:24:15 -05:00
controls: true,
2018-01-30 23:40:56 -05:00
progress: true,
history: true,
center: true,
slideNumber: true,
transition: 'fade', // none/fade/slide/convex/concave/zoom
// Default chart styling, grouped by chart type (line, bar, pie, radar).
// Presumably read by the chart plugin listed under dependencies -- confirm.
chart: {
defaults: {
global: {
title: { fontColor: "#333", fontSize: 24 },
legend: {
labels: { fontColor: "#333", fontSize: 20 },
},
responsiveness: true
},
scale: {
scaleLabel: { fontColor: "#333", fontSize: 20 },
gridLines: { color: "#333", zeroLineColor: "#333" },
ticks: { fontColor: "#333", fontSize: 16 },
}
},
line: { borderColor: [ "rgba(20,220,220,.8)" , "rgba(220,120,120,.8)", "rgba(20,120,220,.8)" ], "borderDash": [ [5,10], [0,0] ]},
bar: { backgroundColor: [
"rgba(220,220,220,0.8)",
"rgba(151,187,205,0.8)",
"rgba(205,151,187,0.8)",
"rgba(187,205,151,0.8)"
]
},
pie: { backgroundColor: [ ["rgba(0,0,0,.8)" , "rgba(220,20,20,.8)", "rgba(20,220,20,.8)", "rgba(220,220,20,.8)", "rgba(20,20,220,.8)"] ]},
radar: { borderColor: [ "rgba(20,220,220,.8)" , "rgba(220,120,120,.8)", "rgba(20,120,220,.8)" ]},
},
// Optional reveal.js plugins
dependencies: [
// classList.js is requested only when document.body.classList is missing.
{ src: '../reveal.js-3.6.0/lib/js/classList.js', condition: function() { return !document.body.classList; } },
// The math plugin is always requested (its condition returns true).
{ src: '../reveal.js-3.6.0/plugin/math/math.js',
condition: function() { return true; },
mathjax: '../reveal.js-3.6.0/js/MathJax.js'
},
// The markdown scripts are requested only when the page has a [data-markdown] element.
{ src: '../reveal.js-3.6.0/plugin/markdown/marked.js', condition: function() { return !!document.querySelector( '[data-markdown]' ); } },
{ src: '../reveal.js-3.6.0/plugin/markdown/markdown.js', condition: function() { return !!document.querySelector( '[data-markdown]' ); } },
// The highlighter is requested only when a 'pre code' element exists; its callback starts highlighting.
{ src: '../reveal.js-3.6.0/plugin/highlight/highlight.js', async: true, condition: function() { return !!document.querySelector( 'pre code' ); }, callback: function() { hljs.initHighlightingOnLoad(); } },
{ src: '../reveal.js-3.6.0/plugin/zoom-js/zoom.js', async: true },
{ src: '../reveal.js-3.6.0/plugin/notes/notes.js', async: true },
// Chart.min.js (listed ahead of the csv2chart plugin below)
{ src: '../reveal.js-3.6.0/plugin/chart/Chart.min.js'},
// the csv2chart plugin
{ src: '../reveal.js-3.6.0/plugin/chart/csv2chart.js'},
// Scripts for the svginline plugin, both requested with async: false.
{ src: '../reveal.js-3.6.0/plugin/svginline/es6-promise.auto.js', async: false },
{ src: '../reveal.js-3.6.0/plugin/svginline/data-src-svg.js', async: false }
]
});
</script>
</body>
</html>