Website/slides/cse662fa2018/2018-08-29-Seeds.html

<!doctype html>
<html lang="en">

	<head>
		<meta charset="utf-8">

		<title>CSE 662 - Languages and Runtimes for Big Data</title>

		<meta name="description" content="Material for the University at Buffalo's CSE-662 'Languages and Runtimes for Big Data'">
		<meta name="author" content="Oliver Kennedy">

		<meta name="apple-mobile-web-app-capable" content="yes">
		<meta name="apple-mobile-web-app-status-bar-style" content="black-translucent">

		<meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no">

		<link rel="stylesheet" href="../reveal.js-3.7.0/css/reveal.css">
		<link rel="stylesheet" href="../reveal.js-3.7.0/css/theme/moon.css" id="theme">

		<!-- Theme used for syntax highlighting of code -->
		<link rel="stylesheet" href="../reveal.js-3.7.0/lib/css/zenburn.css">

		<!-- Printing and PDF exports -->
		<script>
			// Attach the print stylesheet: the PDF-export sheet when the query
			// string mentions "print-pdf" (any letter case), the paper sheet otherwise.
			var link = document.createElement( 'link' );
			link.type = 'text/css';
			link.rel = 'stylesheet';
			if ( /print-pdf/i.test( window.location.search ) ) {
				link.href = '../reveal.js-3.7.0/css/print/pdf.css';
			} else {
				link.href = '../reveal.js-3.7.0/css/print/paper.css';
			}
			document.querySelector( 'head' ).appendChild( link );
		</script>

		<!--[if lt IE 9]>
		<script src="../reveal.js-3.7.0/lib/js/html5shiv.js"></script>
		<![endif]-->
	</head>

	<body>

		<div class="reveal">

			<!-- Any section element inside of this container is displayed as a slide -->
			<div class="slides">
        <section>
  				<section>
            <h2>Project Seeds</h2>
          </section>

          <section>
            <h3>Reminder</h3>
            <p>Learned Index Structures due Weds (1 week)</p>
          </section>

          <section>
            <h3>Expectations</h3>

            <div style="font-size: 50%">
              <h4>Checkpoint 1: Project Description (Due Sept 23, 11:59)</h4>
              <ul style="width: 700px">
                <li>What is the specific challenge that you will solve?</li>
                <li>What metrics will you use to evaluate success?</li>
                <li>What deliverables will you produce?</li>
              </ul>
              <hr/>
              <h4>Checkpoint 2: Progress Report (Due Oct 21, 11:59)</h4>
              <ul style="width: 700px">
                <li>What challenges have you overcome so far?</li>
                <li>How does your existing work compare to other, similar approaches?</li>
                <li>What design decisions have you made so far and why?</li>
                <li>How have your goals changed from checkpoint 1?</li>
                <li>What challenges remain for you to overcome?</li>
              </ul>
              <hr/>
              <h4>Checkpoint 3: Final Report (Due Dec 9, 11:59)</h4>
              <ul style="width: 700px">
                <li>What specific challenges did you solve?</li>
                <li>How does your final solution compare to other, similar approaches?</li>
                <li>Were the design decisions you made correct and why?</li>
              </ul>
            </div>

          </section>
        </section>

        <section>

          <section>
            <h3>Decentralized IoT Plumbing</h3>
          </section>

          <section>
            <img src="graphics/InternetOfThings.svg" height="600px" />
          </section>

          <section>
            <svg class="fragment" data-src="graphics/computer.svg" height="200px" style="vertical-align: middle;" />
            <span style="font-size: 300%; vertical-align: middle; opacity: 0;">+</span>
            <svg class="fragment" data-src="graphics/Energy-Saver-Lightbulb-Bright.svg"  height="200px"  style="vertical-align: middle;"  />
          </section>

          <section>
             <svg data-src="graphics/mad-scientist.svg" height="400px"/>
          </section>

          <section>
            <img src="graphics/Computer-Bulb.svg" height="600px">
          </section>

          <section>
            <h3>What IoT Means</h3>

            <p>Lots of devices with...</p>
            <dl>
              <div class="fragment">
                <dt>Sensors (Temperature, RFID, Cameras)</dt>
                <dd>Inputs from the outside world.</dd>
              </div>
              <div class="fragment">
                <dt>Actuators (Robots, Lightbulbs, Conveyor Belts)</dt>
                <dd>Outputs to affect the outside world.</dd>
              </div>
              <div class="fragment">
                <dt>Reasonable Compute Resources</dt>
                <dd>The ability to actually decide how.</dd>
              </div>
            </dl>
          </section>

          <section>
            <svg data-src="graphics/2018-08-29-ClassicalIoT.svg" class="stretch" style="background-color: white"/>
          </section>

          <section>
            <svg data-src="graphics/2018-08-29-DistributedIoT.svg" class="stretch" style="background-color: white"/>
          </section>

          <section>
            <h3>Core Idea</h3>
            <dl>
              <dt>The user gives you...</dt>
              <dd>A list of nodes (sensors/actuators)</dd>
              <dd>A list of activities (globally what to do and when)</dd>
              <dt>Your code compiles and deploys...</dt>
              <dd>Triggers for nodes (locally what to do and when)</dd>
            </dl>
          </section>

          <section>
            <h3>Things to Think About...</h3>
            <ul>
              <li class="fragment">How does the user specify activities to your system?</li>
              <li class="fragment">Which node(s) is(/are) responsible for required computation?</li>
              <li class="fragment">How do you get data from where it is to where the compute happens?</li>
              <li class="fragment">What resources (compute, network) will be needed to execute on your plan?</li>
              <li class="fragment">How do you optimize the necessary compute for one activity? <span class="fragment">across <u>all</u> activities?</span></li>
            </ul>
          </section>

        </section>

        <section>

          <section>
            <h3>Uncertainty-Aware Machine Learning</h3>
	        </section>

          <section>
            <dl>
              <img src="graphics/2018-08-29-obamacare_stats_fail.jpg" />
              <p class="fragment">Not all data sources are created equal.</p>
            </dl>
          </section>

          <section>
            <img src="graphics/2018-08-29-missing.png">
            <p class="fragment">Even within one data set, some data may be more trustworthy than others.</p>
          </section>

          <section>
            <h3>Mixed-Quality Training</h3>
            <p>How do you train a classifier/neural net/markov model/etc... on mixed-quality data?</p>
            <ul>
              <li class="fragment">Preprocess the data <span class="fragment">("fix" the errors)</span></li>
              <li class="fragment">Train separate models on subsets of the data</li>
              <li class="fragment">Ignore the errors and hope for the best</li>
            </ul>
            <p class="fragment"><b>Problem:</b> Usually easier to "fix" than to label missing data.</p>
          </section>

          <section>
            <p>But what if the data is already labeled!</p>
          </section>

          <section>
            <h3>Core Idea</h3>
            <dl>
              <dt>You get...</dt>
              <dd>A dataset</dd>
              <dd>Descriptions of uncertainty (what kind is up to you)</dd>
              <dt>You make...</dt>
              <dd>A model (of some sort) that is of higher quality using labels than not using them.</dd>
            </dl>
            <p class="fragment">Ideally the model is interpretable as well.</p>
          </section>

          <section>
            <h3>Things to Think About</h3>

            <ul>
              <li class="fragment">What statistical properties are you aiming for?</li>
              <li class="fragment">How should you describe uncertain data?</li>
              <li class="fragment">How should the model interact with missing data? <span class="fragment">... to less reliable data?</span></li>
              <li class="fragment">How does uncertainty in the training data affect the model's predictions?</li>
            </ul>
          </section>

        </section>

        <section>

          <section>
            <h3>Web-of-Trust for Crowdsourced Data</h3>
          </section>

          <section>
            <img src="graphics/2018-08-29-crowdsourcing.jpg" />
          </section>

          <section>
            <h3>Crowdsourcing</h3>

            <p class="fragment">Have a question?</p>

            <p class="fragment">Most people will give you a bad answer.</p>

            <p class="fragment">A few will give you a good answer.</p>

            <p class="fragment">The average of a bunch of bad answers and a few good answers is a good answer?</p>
          </section>

          <section>
            <h3>Crowdsourcing with Trust!</h3>
          </section>

          <section>
            <h3>Web of Trust</h3>

            <img src="graphics/2018-08-29-WebOfTrustsvg.svg" height="400px" />
          </section>

          <section>
            <svg data-src="graphics/2018-08-29-WebOfTrustAnim.svg" class="stretch" />
          </section>

          <section>
            <h3>Core Idea</h3>
            <dl>
              <dt>You get...</dt>
              <dd>A set of participants</dd>
              <dd>A set of (possibly contradictory) facts stated by each participant</dd>
              <dd>A set of trust levels for each pair of participants</dd>
              <dt>You produce...</dt>
              <dd>A (weighted?) set of facts for each user.</dd>
            </dl>
          </section>

          <section>
            <h3>Things to Think About</h3>

            <ul>
              <li class="fragment">How do trust levels combine? (Transitively vs Additively)</li>
              <li class="fragment">How do derivations of contradictory facts combine? (e.g., average trust vs most trusted wins)</li>
              <li class="fragment">Can the model be maintained incrementally as new facts arrive/users change how much they trust other users?</li>
              <li class="fragment">What happens for pairs of users who don't know how much they trust each other?</li>
            </ul>
          </section>

        </section>

        <section>

          <section>
            <h3>Sensitivity Analysis in Mimir</h3>
          </section>

          <section>
            <svg data-src="graphics/2018-08-29-NormalDBVsProbDB.svg" class="stretch" style="background-color: lightgrey"/>
          </section>

          <section>
            <p><b>Problem:</b> Often there is a very large number of possible worlds.</p>

            <p class="fragment"><b>Solution:</b> Break down possible worlds by choices.</p>

            <p class="fragment"><b>Question:</b> Which choices have the biggest impact on a query result?</p>
          </section>

          <section>
            <h3>Sensitivity/Influence</h3>

            <p><i>Sensitivity analysis and explanations for robust query evaluation in probabilistic databases.</i><br/>
            Kanagal, Li, Deshpande (SIGMOD 2011)</p>

            <p><i>Tracing data errors with view-conditioned causality</i><br/>
            Meliou, Gatterbauer, Nath, Suciu (SIGMOD 2011)</p>

          </section>

          <section>
            <h3>Approach</h3>
            <p class="fragment"><b>Unit of Choice: </b> Is a tuple (fact) in the source data or not?</p>
            <ol>
              <li class="fragment">Compute the "derivative" of the query result with respect to the probability of each source tuple.</li>
              <li class="fragment">Find the tuple that maximizes the derivative.</li>
            </ol>
          </section>

          <section>
            <h3>Mimir</h3>

            <p>Let queries call a nondeterministic "choice" function that decides which "world" to visit.</p>

            <pre><code>
    SELECT CASE VGTerm("A", ROWID) WHEN 1 THEN "FOO"
                                          ELSE "BAR"
           END AS A, Input.*
    FROM Input;
            </code></pre>

            <p><tt>VGTerm("A", ROWID)</tt> generates a separate value for each row.</p>
          </section>


          <section>
            <h3>Core Idea</h3>
            <dl>
              <dt>You get...</dt>
              <dd>A deterministic database</dd>
              <dd>A non-deterministic query (and a set of tools for sampling from its outputs).</dd>
              <dt>You produce...</dt>
              <dd>Which "call" to the query has the biggest influence on the output.</dd>
            </dl>
          </section>

          <section>
            <h3>Things to Think About</h3>

            <ul>
              <li class="fragment">What kind(s) of influence measures make sense?</li>
              <li class="fragment">How to compute influence efficiently for all tuples in parallel?</li>
              <li class="fragment">Early pruning: Can some influence measures be computed exactly?</li>
            </ul>
          </section>

        </section>

        <section>

          <section>
            <h3>Sandboxed Python</h3>

            <img src="graphics/Python.svg" height="300px"  style="border: 0px; vertical-align: middle; background-color: inherit; box-shadow: none;" />
            <span style="color: red; font-size: 500%; vertical-align: middle;" class="fragment" data-fragment-index="2">♥</span>
            <img src="graphics/Apache_Spark_logo.svg" height="200px"  style="border: 0px; vertical-align: middle; background-color: lightgrey; padding: 10px; box-shadow: none;" class="fragment" data-fragment-index="1" />
          </section>

          <section>
            <img src="graphics/Python.svg" height="300px"  style="border: 0px; vertical-align: middle; background-color: inherit; box-shadow: none;" />
            <span style="color: lightgrey; font-size: 500%; vertical-align: middle;">→</span>
            <img src="graphics/Apache_Spark_logo.svg" height="200px"  style="border: 0px; vertical-align: middle; background-color: lightgrey; padding: 10px; box-shadow: none;"/>
          </section>

          <section>
            <img src="graphics/server3d.svg" width="100px" style="border: 0px; vertical-align: middle; background-color: inherit; box-shadow: none;" />
            <img src="graphics/server3d.svg" width="100px" style="border: 0px; vertical-align: middle; background-color: inherit; box-shadow: none;" />
            <img src="graphics/server3d.svg" width="100px" style="border: 0px; vertical-align: middle; background-color: inherit; box-shadow: none;" />
            <img src="graphics/2018-08-29-PyBurglar.svg" height="500px" style="border: 0px; vertical-align: middle; background-color: inherit; box-shadow: none;" class="fragment" />
          </section>

          <section>
            <img src="graphics/Python.svg" height="300px"  style="border: 0px; vertical-align: middle; background-color: inherit; box-shadow: none;" />
            <span style="color: red; font-size: 500%; vertical-align: middle;">←</span>
            <img src="graphics/Apache_Spark_logo.svg" height="200px"  style="border: 0px; vertical-align: middle; background-color: lightgrey; padding: 10px; box-shadow: none;"/>
          </section>

          <section>
            <img src="graphics/2018-08-29-Sandbox.svg"/>
          </section>

          <section>
            <img src="graphics/2018-08-29-Sandbox-Real.svg"/>
          </section>


          <section>
            <h3>Core Idea</h3>
            <dl>
              <dt>You get...</dt>
              <dd>Python Code</dd>
              <dd>Inputs to the code (or a socket)</dd>
              <dt>Your system produces...</dt>
              <dd>Output for the code... without calling out of the sandbox.</dd>
            </dl>
          </section>

          <section>
            <h3>Things to Think About</h3>

            <ul>
              <li class="fragment">What security guarantees are you providing?</li>
              <li class="fragment">How can you prove to yourselves that those guarantees are enforced?</li>
              <li class="fragment">What tooling can you use to wrap/execute python?</li>
            </ul>
          </section>

        </section>

        <section>
          <h3>In-Class Assignment</h3>
          <ul>
            <li>Form a group of 3-4 people that you'll work with for the duration of the semester.</li>
            <li>Come up with a clever group name (or one will be made up for you).</li>
            <li>Challenge: Form a group with people you don't know or don't know well.</li>
          </ul>
        </section>


			</div>

		</div>

		<script src="../reveal.js-3.7.0/lib/js/head.min.js"></script>
		<script src="../reveal.js-3.7.0/js/reveal.js"></script>

		<script>

			// Start the slide deck with the options below.
			// More info https://github.com/hakimel/reveal.js#configuration
			Reveal.initialize({
				controls: true,
				progress: true,
				history: true,
				center: true,

				transition: 'slide', // none/fade/slide/convex/concave/zoom

				// Extra scripts handed to reveal.js for loading. Entries with a
				// `condition` carry a predicate deciding whether they are needed.
				// More info https://github.com/hakimel/reveal.js#dependencies
				dependencies: [
					// Only when the browser lacks document.body.classList.
					{ src: '../reveal.js-3.7.0/lib/js/classList.js', condition: function() { return !document.body.classList; } },

					// Only when some element in the page carries a data-markdown attribute.
					{ src: '../reveal.js-3.7.0/plugin/markdown/marked.js', condition: function() { return !!document.querySelector( '[data-markdown]' ); } },
					{ src: '../reveal.js-3.7.0/plugin/markdown/markdown.js', condition: function() { return !!document.querySelector( '[data-markdown]' ); } },

					// Once loaded, asks highlight.js to highlight code on page load.
					{ src: '../reveal.js-3.7.0/plugin/highlight/highlight.js', async: true, callback: function() { hljs.initHighlightingOnLoad(); } },
					{ src: '../reveal.js-3.7.0/plugin/search/search.js', async: true },
					{ src: '../reveal.js-3.7.0/plugin/zoom-js/zoom.js', async: true },
					{ src: '../reveal.js-3.7.0/plugin/notes/notes.js', async: true },

					// Math plugin, taken from the same reveal.js-3.7.0 tree as every
					// other asset in this file.
					// NOTE(review): the stock math plugin appears to read its MathJax
					// URL from a top-level `math: { mathjax: ... }` option, so the
					// `mathjax` key on this entry is presumably ignored - confirm
					// before relying on the local MathJax copy.
					{ src: '../reveal.js-3.7.0/plugin/math/math.js',
					  condition: function() { return true; },
					  mathjax: '../reveal.js-3.7.0/js/MathJax.js'
					},

					// Loaded synchronously, promise polyfill first.
					// NOTE(review): presumably this is what resolves the
					// <svg data-src="..."> placeholders in the slides - confirm.
					{ src: '../reveal.js-3.7.0/plugin/svginline/es6-promise.auto.js', async: false },
					{ src: '../reveal.js-3.7.0/plugin/svginline/data-src-svg.js', async: false }
				]
			});

		</script>

	</body>
</html>