<!doctype html>
|
||
|
<html lang="en">
|
||
|
|
||
|
<head>
|
||
|
<meta charset="utf-8">
|
||
|
|
||
|
<title>Spark</title>
|
||
|
|
||
|
<!-- NOTE(review): was "Mimir", which matches nothing else in this deck; presumably a template leftover, so confirm the wording -->
<meta name="description" content="Spark: NoSQL, but with SQL">
|
||
|
<meta name="author" content="Oliver Kennedy">
|
||
|
|
||
|
<meta name="apple-mobile-web-app-capable" content="yes" />
|
||
|
<meta name="apple-mobile-web-app-status-bar-style" content="black-translucent" />
|
||
|
|
||
|
<!-- Zoom is left enabled: maximum-scale / user-scalable=no prevent low-vision users from enlarging the slides -->
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||
|
|
||
|
<link rel="stylesheet" href="../reveal.js-3.5.0/css/reveal.css">
|
||
|
<link rel="stylesheet" href="ubodin.css" id="theme">
|
||
|
|
||
|
<!-- Code syntax highlighting -->
|
||
|
<link rel="stylesheet" href="../reveal.js-3.5.0/lib/css/zenburn.css">
|
||
|
|
||
|
<!-- Printing and PDF exports -->
|
||
|
<script>
  // Attach a print stylesheet at load time: the PDF-export sheet when the
  // query string contains "print-pdf" (case-insensitive), otherwise the
  // regular paper sheet.
  var link = document.createElement( 'link' );
  link.rel = 'stylesheet';
  link.type = 'text/css';
  link.href = window.location.search.match( /print-pdf/gi ) ? '../reveal.js-3.5.0/css/print/pdf.css' : '../reveal.js-3.5.0/css/print/paper.css';
  document.getElementsByTagName( 'head' )[0].appendChild( link );
</script>
|
||
|
|
||
|
<script>
  // Writes a <script> tag for livereload.js served on port 35729 of the host
  // this page came from; falls back to "localhost" when location.host is empty.
  (function () {
    var hostName = (location.host || 'localhost').split(':')[0];
    // The closing tag is split so the HTML parser does not end this script early.
    document.write('<script src="http://' + hostName + ':35729/livereload.js?snipver=1"></' + 'script>');
  })();
</script>
|
||
|
|
||
|
<!--[if lt IE 9]>
|
||
|
<script src="../reveal.js-3.5.0/lib/js/html5shiv.js"></script>
|
||
|
<![endif]-->
|
||
|
</head>
|
||
|
|
||
|
<body>
|
||
|
|
||
|
<div class="reveal">
|
||
|
<!-- Any section element inside of this container is displayed as a slide -->
|
||
|
|
||
|
<div class="header">
|
||
|
<!-- Any Talk-Specific Header Content Goes Here -->
|
||
|
Spark
|
||
|
</div>
|
||
|
<div class="footer">
  <!-- Any Talk-Specific Footer Content Goes Here -->
  <div style="float: left; margin-top: 15px; ">
    Exploring <u><b>O</b></u>nline <u><b>D</b></u>ata <u><b>In</b></u>teractions
  </div>
  <!-- NOTE(review): empty alt assumes the logo only repeats the lab name shown beside it; give it real alt text if it conveys more -->
  <img src="graphics/FullText-white.png" alt="" height="40" style="float: right;">
</div>
|
||
|
|
||
|
<div class="slides">
|
||
|
|
||
|
<section>
|
||
|
<section>
|
||
|
<h2>Spark</h2>
|
||
|
<h4>(NoSQL, but with SQL)</h4>
|
||
|
</section>
|
||
|
|
||
|
<section>
|
||
|
<h3>First a little history</h3>
|
||
|
|
||
|
<dl>
|
||
|
<div class="fragment">
|
||
|
<dt>Early-Mid 1900s</dt>
|
||
|
<dd>Computers used for tabulating data</dd>
|
||
|
</div>
|
||
|
<div class="fragment">
|
||
|
<dt>1970s</dt>
|
||
|
<dd>Relational model, Postgres, System-R, Oracle, DB2</dd>
|
||
|
</div>
|
||
|
<div class="fragment">
|
||
|
<dt>1980s</dt>
|
||
|
<dd>Lotus, dBase</dd>
|
||
|
</div>
|
||
|
<div class="fragment">
|
||
|
<dt>1990s</dt>
|
||
|
<dd>Object/Object-Relational Databases, Distributed Databases</dd>
|
||
|
</div>
|
||
|
<div class="fragment">
|
||
|
<dt>2000s</dt>
|
||
|
<dd>The Dark Ages...</dd>
|
||
|
</div>
|
||
|
</dl>
|
||
|
</section>
|
||
|
|
||
|
<section>
  <p><b>Google: </b> Databases suck! Use Map/Reduce Instead</p>
  <!-- height is a bare pixel count; the HTML height attribute does not take CSS units. -->
  <!-- NOTE(review): alt text is inferred from the file name; refine it to describe the figure. -->
  <img src="graphics/mapreduce.png" alt="MapReduce" height="500">
</section>
|
||
|
|
||
|
<section>
  <p><b>Yahoo: </b> Our Map/Reduce implementation is open source</p>
  <!-- height is a bare pixel count; the HTML height attribute does not take CSS units. -->
  <!-- NOTE(review): alt text is inferred from the file name; refine it to describe the figure. -->
  <img src="graphics/hadoop.png" alt="Hadoop" height="500">
</section>
|
||
|
|
||
|
<section>
|
||
|
<h3>The Good</h3>
|
||
|
<ul>
|
||
|
<li class="fragment">Programmer-Friendly Language</li>
|
||
|
<li class="fragment">Distributed-Computing-Friendly Metaphors</li>
|
||
|
<li class="fragment">Extremely Resilient Runtime</li>
|
||
|
</ul>
|
||
|
</section>
|
||
|
|
||
|
<section>
  <h3>The Bad</h3>
  <!-- Each bullet strikes out a claim from "The Good" slide and follows it with its replacement. -->
  <!-- <s> replaces the obsolete <strike>; the space after it keeps the two phrases from running together. -->
  <ul>
    <li class="fragment"><s>Programmer-Friendly</s> Non-Declarative Language</li>
    <li class="fragment"><s>Distributed-Computing-Friendly</s> Programmer-Hostile Metaphors</li>
    <li class="fragment">Extremely <s>Resilient</s> Slow Runtime</li>
  </ul>
</section>
|
||
|
|
||
|
<section>
  <!-- NOTE(review): alt text is inferred from the file name; refine it to summarize what the figure compares. -->
  <img src="graphics/hadoopVSdbs.svg" alt="Hadoop vs. databases">
</section>
|
||
|
|
||
|
</section>
|
||
|
|
||
|
<section>
|
||
|
<section>
  <!-- height is a bare pixel count; the HTML height attribute does not take CSS units. -->
  <!-- NOTE(review): alt text is inferred from the file name; confirm against the image. -->
  <img src="graphics/spark.png" alt="Spark" height="400">
</section>
|
||
|
|
||
|
<section>
|
||
|
<h3>Key Features</h3>
|
||
|
<ul>
|
||
|
<li>High-performance resilience.</li>
|
||
|
<li>Use of metaphors to extract parallelism.</li>
|
||
|
<li>Lots of metaphors for distributed programming.</li>
|
||
|
<li>If you can do it in { Scala, Python, Java, R }, you can do it in Spark.</li>
|
||
|
<li>If you know SQL and { Scala, Python, Java, R }, you know Spark</li>
|
||
|
</ul>
|
||
|
</section>
|
||
|
|
||
|
<section>
  <!-- Placeholder carrying a data-src attribute; closed explicitly so the markup is well-formed. -->
  <svg data-src="graphics/sparkstack.svg" height="600px"></svg>
</section>
|
||
|
</section>
|
||
|
|
||
|
<section>
|
||
|
<section>
|
||
|
<h3>Resilient Distributed Datasets (RDDs)</h3>
|
||
|
|
||
|
<dl style="font-size: 75%">
|
||
|
<div class="fragment">
|
||
|
<dt>Read-Only</dt>
|
||
|
<dd>You can't insert, update, or modify rows...</dd>
|
||
|
</div>
|
||
|
|
||
|
<div class="fragment">
|
||
|
<dt>Transformable</dt>
|
||
|
<dd>... but you can create (cheaply) new RDDs by modifying existing RDDs.</dd>
|
||
|
</div>
|
||
|
|
||
|
<div class="fragment">
|
||
|
<dt>Opaque</dt>
|
||
|
<dd>Spark just sees a bunch of rows. It doesn't know how to interpret them.</dd>
|
||
|
</div>
|
||
|
|
||
|
<div class="fragment">
|
||
|
<dt>Lazy</dt>
|
||
|
<dd>Spark saves <b>how</b> to construct an RDD, but waits to actually do so.</dd>
|
||
|
</div>
|
||
|
|
||
|
<div class="fragment">
|
||
|
<dt>Distributed</dt>
|
||
|
<dd>When Spark constructs an RDD, it automatically assigns rows to workers.</dd>
|
||
|
</div>
|
||
|
</dl>
|
||
|
</section>
|
||
|
|
||
|
<section>
|
||
|
<h3>Where do RDDs come from?</h3>
|
||
|
|
||
|
<ul>
|
||
|
<li>Call "parallelize" on a { Scala, Python, Java, R } array/collection</li>
|
||
|
<li>Load a text file from disk or HDFS (1 row per line).</li>
|
||
|
<li>Load a database table (1 row per row).</li>
|
||
|
<li>Transform (map, flatMap, filter) an existing RDD.</li>
|
||
|
</ul>
|
||
|
</section>
|
||
|
|
||
|
<section>
|
||
|
<div>
|
||
|
<h3>FlatMap?</h3>
|
||
|
<p>A function that reads in one row and returns any number of rows.</p>
|
||
|
</div>
|
||
|
<div>
|
||
|
<h3>Map?</h3>
|
||
|
<p>A function that reads in one row and returns one row.</p>
|
||
|
</div>
|
||
|
<div>
|
||
|
<h3>Filter?</h3>
|
||
|
<p>A function that reads in one row and returns true (keep) or false (toss).</p>
|
||
|
</div>
|
||
|
</section>
|
||
|
|
||
|
<section>
|
||
|
<h3>Resilient Distributed Datasets (RDDs)</h3>
|
||
|
|
||
|
<dl style="font-size: 75%">
|
||
|
<div>
|
||
|
<dt>Read-Only</dt>
|
||
|
<dd>You can't insert, update, or modify rows...</dd>
|
||
|
</div>
|
||
|
|
||
|
<div>
|
||
|
<dt>Transformable</dt>
|
||
|
<dd>... but you can create (cheaply) new RDDs by modifying existing RDDs.</dd>
|
||
|
</div>
|
||
|
|
||
|
<div class="fragment highlight-blue">
|
||
|
<dt>Opaque</dt>
|
||
|
<dd>Spark just sees a bunch of rows. It doesn't know how to interpret them.</dd>
|
||
|
</div>
|
||
|
|
||
|
<div>
|
||
|
<dt>Lazy</dt>
|
||
|
<dd>Spark saves <b>how</b> to construct an RDD, but waits to actually do so.</dd>
|
||
|
</div>
|
||
|
|
||
|
<div>
|
||
|
<dt>Distributed</dt>
|
||
|
<dd>When Spark constructs an RDD, it automatically assigns rows to workers.</dd>
|
||
|
</div>
|
||
|
</dl>
|
||
|
</section>
|
||
|
</section>
|
||
|
|
||
|
<section>
|
||
|
<section>
|
||
|
<h3>DataFrames</h3>
|
||
|
|
||
|
<p>RDDs with Schemas: Every row has a set of attributes and all of the records have the same attributes.</p>
|
||
|
</section>
|
||
|
|
||
|
<section>
|
||
|
<h1>Demo</h1>
|
||
|
</section>
|
||
|
</section>
|
||
|
</div>
|
||
|
|
||
|
</div>
|
||
|
|
||
|
<script src="../reveal.js-3.5.0/lib/js/head.min.js"></script>
|
||
|
<script src="../reveal.js-3.5.0/js/reveal.js"></script>
|
||
|
|
||
|
<script>

  // Full list of configuration options available at:
  // https://github.com/hakimel/reveal.js#configuration
  Reveal.initialize({
    controls: false,
    progress: true,
    history: true,
    center: true,
    slideNumber: true,

    transition: 'fade', // none/fade/slide/convex/concave/zoom

    // The math plugin takes the MathJax location from this top-level `math`
    // option; a `mathjax` key placed on the dependency entry is not read.
    math: {
      mathjax: '../reveal.js-3.5.0/js/MathJax.js'
    },

    // Optional reveal.js plugins
    dependencies: [
      // classList polyfill, loaded only when the browser lacks document.body.classList
      { src: '../reveal.js-3.5.0/lib/js/classList.js', condition: function() { return !document.body.classList; } },
      // Math plugin: always loaded (no condition)
      { src: '../reveal.js-3.5.0/plugin/math/math.js' },
      // Markdown support, loaded only when some element carries data-markdown
      { src: '../reveal.js-3.5.0/plugin/markdown/marked.js', condition: function() { return !!document.querySelector( '[data-markdown]' ); } },
      { src: '../reveal.js-3.5.0/plugin/markdown/markdown.js', condition: function() { return !!document.querySelector( '[data-markdown]' ); } },
      // Syntax highlighting, loaded only when the deck contains a <pre><code> block
      { src: '../reveal.js-3.5.0/plugin/highlight/highlight.js', async: true, condition: function() { return !!document.querySelector( 'pre code' ); }, callback: function() { hljs.initHighlightingOnLoad(); } },
      { src: '../reveal.js-3.5.0/plugin/zoom-js/zoom.js', async: true },
      { src: '../reveal.js-3.5.0/plugin/notes/notes.js', async: true },
      // Loaded synchronously and in this order: the Promise polyfill is listed ahead of the data-src SVG script
      { src: '../reveal.js-3.5.0/plugin/svginline/es6-promise.auto.js', async: false },
      { src: '../reveal.js-3.5.0/plugin/svginline/data-src-svg.js', async: false }
    ]
  });

</script>
|
||
|
|
||
|
</body>
|
||
|
</html>
|