Slides
parent
ce7deb1e68
commit
9b34d953cc
|
@ -61,4 +61,147 @@ def data_table(schema, data, params = {})
|
||||||
data.zip(row_args).map { |row, args| tag("tr", row.join, args) }.join("\n"),
|
data.zip(row_args).map { |row, args| tag("tr", row.join, args) }.join("\n"),
|
||||||
params.fetch(:table_args, {})
|
params.fetch(:table_args, {})
|
||||||
)
|
)
|
||||||
|
end
|
||||||
|
|
||||||
|
# A node of a relational-algebra expression tree that can measure itself and
# render itself (and its subtree) as SVG markup.
#
# A node is a leaf when it has no children (+nil+ or an empty list); leaves
# render as just their symbol. Inner nodes render their symbol, each child
# subtree inside a translated <g>, and one connecting <line> per child.
class RATreeNode
  # Horizontal gap left between adjacent child subtrees.
  CHILD_SEPARATOR_X = 20

  # @param type [Symbol] operator kind; :table, :select, :project, :aggregate,
  #   :join, :cross, :diff and :union have dedicated symbols
  # @param params [Hash] operator arguments; :table reads :name, :select and
  #   :join read :pred, :project reads :attrs, :aggregate reads :aggregates and
  #   optionally :groupby
  # @param children [Array<RATreeNode>, nil] child subtrees; nil or empty for a leaf
  def initialize(type, params, children = [])
    @type = type
    @params = params
    @children = children
    @self_width = 100
    @self_height = 100
    # Size the node's own box from the length of the text it has to show.
    case @type
    when :table
      @self_width = 40 * params[:name].length
      @self_height = 50
    when :select, :join
      @self_width += 15 * params[:pred].length
    when :project
      @self_width += 15 * params[:attrs].length
    end
    @height_above_children = 100
  end

  # Wraps +x+ in a <tspan> styled as a subscript.
  #
  # @return [String]
  def subscript(x)
    "<tspan style='font-size: 40%; vertical-align: sub;'>#{x}</tspan>"
  end

  # SVG text markup for this node's operator symbol.
  #
  # @return [String] a <tspan> for the known operator types; for any other
  #   type, the type name itself
  def symbol
    case @type
    when :select then "<tspan style='font-size: 200%'> 𝛔#{subscript @params[:pred]}</tspan>"
    when :project then "<tspan style='font-size: 200%'> 𝛑#{subscript @params[:attrs]}</tspan>"
    when :aggregate then "<tspan style='font-size: 200%'>#{subscript(@params[:groupby]) if @params.key?(:groupby)}𝛄#{subscript @params[:aggregates]}</tspan>"
    when :join then "<tspan style='font-size: 400%'>⋈#{subscript @params[:pred]}</tspan>"
    when :cross then "<tspan style='font-size: 400%'>⨉</tspan>"
    when :diff then "<tspan style='font-size: 300%; font-weight: bold'> -</tspan>"
    when :union then "<tspan style='font-size: 400%'>⊎</tspan>"
    when :table then "<tspan style='font-weight: bold; font-family: Courier, fixed-width; font-size: 150%'>#{@params[:name]}</tspan>"
    else @type.to_s # was a bare `type`, which is not defined on this class
    end
  end

  # Total height of the subtree rooted here (memoized after the first call).
  #
  # @param config [Hash] rendering options, forwarded to the children
  # @return [Integer]
  def height(config = {})
    @height ||=
      if leaf?
        @self_height
      else
        @children.map { |c| c.height(config) }.max + (@self_height + @height_above_children)
      end
  end

  # Combined width of all child subtrees including the gaps between them
  # (memoized after the first call).
  #
  # @param config [Hash] rendering options, forwarded to the children
  # @return [Integer] 0 for a leaf
  def child_width(config = {})
    return 0 if leaf?

    @child_width ||= @children.map { |c| c.width(config) }.sum + CHILD_SEPARATOR_X * (@children.size - 1)
  end

  # Total width of the subtree rooted here: the wider of this node's own box
  # and its children laid side by side (memoized after the first call).
  #
  # @param config [Hash] rendering options, forwarded to the children
  # @return [Integer]
  def width(config = {})
    @width ||= leaf? ? @self_width : [child_width(config), @self_width].max
  end

  # Renders only this node's symbol, horizontally centred within the subtree.
  #
  # @param config [Hash] reads :indent (prefix for each emitted line) and
  #   :debug (when truthy, also draws a red <rect> over the symbol's box)
  # @return [String] SVG markup ending in a newline
  def symbol_text(config)
    indent = config.fetch(:indent, "")
    symbol_x = width(config) / 2 - (@self_width / 2)
    symbol_y = 0
    debug = "#{indent}<rect x='#{symbol_x}' y='#{symbol_y}' width='#{@self_width}' height='#{@self_height}' style='fill: red'/>\n" if config.fetch(:debug, false)
    "#{debug}#{indent}<text x='#{symbol_x}' y='#{symbol_y + @self_height}'>#{symbol}</text>\n"
  end

  # Renders the whole subtree: this node's symbol, then every child inside a
  # translated <g>, then the connector lines from this node to each child.
  #
  # @param config [Hash] reads :indent and :debug (when truthy, each rendered
  #   child and its x offset are also printed with +p+)
  # @return [String] SVG markup
  def render(config = {})
    return symbol_text(config) if leaf?

    indent = config.fetch(:indent, "")
    debug = config.fetch(:debug, false)
    total_width = width(config)
    children_total = child_width(config)

    # children_x[i] is the left edge of child i; the extra final entry is where
    # a further child would start, which the connector maths below relies on.
    first_x = total_width > children_total ? (total_width - children_total) / 2 : 0
    children_x = [first_x]
    @children.each { |c| children_x << children_x.last + c.width(config) + CHILD_SEPARATOR_X }
    children_y = @height_above_children + @self_height

    child_config = config.merge(indent: indent + " ")
    child_blobs = @children.each_with_index.map do |c, i|
      rendered = c.render(child_config)
      if debug
        p rendered
        p children_x[i]
      end
      "#{indent} <g transform='translate(#{children_x[i]}, #{children_y})'>\n#{rendered}</g>\n"
    end

    line_x = total_width / 2
    line_y = @self_height * 1.1
    target_y = line_y + @height_above_children

    # Each connector ends at the horizontal centre of its child's slot.
    child_lines = children_x.each_cons(2).map do |left, next_left|
      target_x = (left + next_left - CHILD_SEPARATOR_X) / 2
      "#{indent} <line x1='#{line_x}' y1='#{line_y}' x2='#{target_x}' y2='#{target_y}' stroke='black' stroke-width='4'/>\n"
    end

    symbol_text(config) + child_blobs.join + child_lines.join
  end

  private

  # True when there are no child subtrees to lay out. An empty list (the
  # constructor default) counts as a leaf so that #height does not fail.
  def leaf?
    @children.nil? || @children.empty?
  end
end
|
||||||
|
|
||||||
|
# Builds the leaf node for a base relation called +name+.
#
# @param name [String] relation name shown in the diagram
# @return [RATreeNode] a :table node with no children (nil)
def ra_table(name)
  table_params = { name: name }
  RATreeNode.new(:table, table_params, nil)
end
|
||||||
|
# Builds a union node over any number of input subtrees.
#
# @param children [Array<RATreeNode>] the inputs, in left-to-right order
# @return [RATreeNode] a :union node with empty params
def ra_union(*children)
  no_params = {}
  RATreeNode.new(:union, no_params, children)
end
|
||||||
|
# Builds a difference node over the given input subtrees.
#
# @param children [Array<RATreeNode>] the inputs, in left-to-right order
# @return [RATreeNode] a :diff node with empty params
def ra_diff(*children)
  no_params = {}
  RATreeNode.new(:diff, no_params, children)
end
|
||||||
|
# Builds a join node over two input subtrees.
#
# @param predicate [String] join condition, shown as the symbol's subscript
# @param lhs [RATreeNode] left input
# @param rhs [RATreeNode] right input
# @return [RATreeNode] a :join node whose children are [lhs, rhs]
def ra_join(predicate, lhs, rhs)
  # The node type must be :join. It was :table, which made the constructor
  # look for a :name param that is never supplied here.
  RATreeNode.new(:join, { pred: predicate }, [lhs, rhs])
end
|
||||||
|
# Builds an aggregation node over a single input subtree.
#
# @param groupby [String, nil] group-by attributes; stored under :groupby as given
# @param aggregates [String] aggregate expressions, e.g. "COUNT(*)"
# @param input [RATreeNode] the subtree being aggregated
# @return [RATreeNode] an :aggregate node with one child
def ra_aggregate(groupby, aggregates, input)
  aggregate_params = { groupby: groupby, aggregates: aggregates }
  RATreeNode.new(:aggregate, aggregate_params, [input])
end
|
||||||
|
# Builds a selection (filter) node over a single input subtree.
#
# @param predicate [String] filter condition, shown as the symbol's subscript
# @param input [RATreeNode] the subtree being filtered
# @return [RATreeNode] a :select node with one child
def ra_select(predicate, input)
  select_params = { pred: predicate }
  RATreeNode.new(:select, select_params, [input])
end
|
||||||
|
# Builds a projection node over a single input subtree.
#
# @param attrs [String, Hash] the projection list. A Hash is flattened into
#   "target ← expression" pairs joined by "; "; anything else is used as given.
# @param input [RATreeNode] the subtree being projected
# @return [RATreeNode] a :project node with one child
def ra_project(attrs, input)
  if attrs.is_a? Hash
    attrs = attrs.map { |target, expression| "#{target} ← #{expression}" }.join("; ")
  end
  RATreeNode.new(:project, { attrs: attrs }, [input])
end
|
||||||
|
|
||||||
|
# Wraps the tree returned by the given block in an <svg> element.
#
# The block must return an object responding to +height+, +width+ and
# +render+. Trees taller than 500 units are scaled down to a height of 500;
# shorter trees are drawn at scale 1.
#
# @param params [Hash] reads :indent (prefix for the emitted lines); the whole
#   hash, with :indent extended by one space, is passed on to +render+
# @yieldreturn [RATreeNode] root of the tree to draw
# @return [String] SVG markup ending in a newline
def relational_algebra(params = {})
  indent = params.fetch(:indent, "")
  ra = yield

  scale = ra.height > 500 ? 500.0 / ra.height : 1
  svg_height = (ra.height + 20) * scale
  svg_width = (ra.width + 20) * scale
  tree_markup = ra.render(params.merge(indent: indent + " "))

  [
    "#{indent}<svg height='#{svg_height}' width='#{svg_width}'>\n",
    "#{indent}<g transform='scale(#{scale})'>",
    tree_markup,
    "#{indent}</g></svg>\n"
  ].join
end
|
|
@ -0,0 +1,326 @@
|
||||||
|
---
|
||||||
|
template: templates/cse4562_2019_slides.erb
|
||||||
|
title: Incomplete and Probabilistic Databases
|
||||||
|
date: May 1, 2019
|
||||||
|
textbook: "<a href='https://github.com/UBOdin/mimir/wiki/Concepts-CTables'>PDB Concepts and C-Tables</a>"
|
||||||
|
dependencies:
|
||||||
|
- lib/slide_utils.rb
|
||||||
|
---
|
||||||
|
<%
|
||||||
|
require "slide_utils.rb"
|
||||||
|
%>
|
||||||
|
<section>
|
||||||
|
<section>
|
||||||
|
<img src="graphics/2019-04-31-4or9.png" height="300px" />
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<img src="graphics/2019-04-31-guacamole.png" class="stretch" />
|
||||||
|
<attribution><a href="https://www.anishathalye.com/2017/07/25/synthesizing-adversarial-examples/">https://www.anishathalye.com/2017/07/25/synthesizing-adversarial-examples/</a></attribution>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<img src="graphics/2019-04-31-catVSdog.jpg" class="stretch" />
|
||||||
|
<attribution><a href="https://www.pyimagesearch.com/pyimagesearch-gurus/?src=post-deep-learning-libs">Deep Learning Demystified</a></attribution>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<h3>What happens when you don't know your data precisely?</h3>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<pre><code>
|
||||||
|
SELECT * FROM Posts WHERE image_class = 'Cat';
|
||||||
|
</code></pre>
|
||||||
|
<pre class="fragment"><code>
|
||||||
|
SELECT COUNT(*) FROM Posts WHERE image_class = 'Cat';
|
||||||
|
</code></pre>
|
||||||
|
<pre class="fragment"><code>
|
||||||
|
SELECT user_id FROM Posts
|
||||||
|
WHERE image_class = 'Cat'
|
||||||
|
GROUP BY user_id HAVING COUNT(*) > 10;
|
||||||
|
</code></pre>
|
||||||
|
</section>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<section>
|
||||||
|
<h3 class="fragment">Incomplete Databases<br/>↓</h3>
|
||||||
|
<h3>Probabilistic Databases</h3>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<ol>
|
||||||
|
<li>Representing Incompleteness</li>
|
||||||
|
<li class="fragment">Querying Incomplete Data</li>
|
||||||
|
<li class="fragment">Implementing It</li>
|
||||||
|
</ol>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<table><tr><td>
|
||||||
|
<%= data_table(["Name", "ZipCode"], [["Alice", "10003"], ["Bob","1<span class='fragment highlight-current-red' data-fragment-index='1'>4</span>260"]], name: "$R_1$", rowids: true) %>
|
||||||
|
</td><td class="fragment" data-fragment-index="3">or</td>
|
||||||
|
<td class="fragment highlight-current-grey" data-fragment-index="2">
|
||||||
|
<%= data_table(["Name", "Division"], [["Alice", "10003"], ["Bob","1<span class='fragment highlight-current-red' data-fragment-index='1'>9</span>260"]], name: "$R_2$", rowids: true) %>
|
||||||
|
</td></tr></table>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<p><b>Incomplete Database</b> ($\mathcal D$): A set of <i>possible worlds</i></p>
|
||||||
|
<p class="fragment"><b>Possible World</b> ($D \in \mathcal D$): One (of many) database instances</p>
|
||||||
|
<p class="fragment">(Require all possible worlds to have the same schema)</p>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<p>What does it mean to run a query on an incomplete database?</p>
|
||||||
|
<p class="fragment" data-fragment-index="1"><span class="fragment fade-out" data-fragment-index="2">$Q(\mathcal D) = ?$</span></p>
|
||||||
|
<p class="fragment" data-fragment-index="2">$Q(\mathcal D) = \{\;Q(D)\;|\;D \in \mathcal D \}$</p>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<table><tr><td>
|
||||||
|
<%= data_table(["Name", "ZipCode"], [["Alice", "10003"], ["Bob","14260"]], name: "$R_1$", rowids: true) %>
|
||||||
|
</td><td>or</td><td>
|
||||||
|
<%= data_table(["Name", "Division"], [["Alice", "10003"], ["Bob","19260"]], name: "$R_2$", rowids: true) %>
|
||||||
|
</td></tr></table>
|
||||||
|
<p class="fragment" style="font-size: 90%">$$Q_1 = \pi_{Name}\big( \sigma_{state = \texttt{'NY'}} (R \bowtie_{zip} ZipLookups) \big)$$</p>
|
||||||
|
<table class="fragment"><tr>
|
||||||
|
<td style="font-size: 600%; margin: 0px; padding: 0px; height: 0.5em;">{</td>
|
||||||
|
<td style="vertical-align: middle;">
|
||||||
|
<%= data_table(["Name"], [["Alice"], ["Bob"]], name: "$Q(R_1)$", rowids: true) %>
|
||||||
|
</td><td style="vertical-align: middle; font-weight: bold;">or</td><td style="vertical-align: middle;">
|
||||||
|
<%= data_table(["Name"], [["Alice"]], name: "$Q(R_2)$", rowids: true) %>
|
||||||
|
</td>
|
||||||
|
<td style="font-size: 600%; margin: 0px; padding: 0px; height: 0.5em;">}</td>
|
||||||
|
</tr></table>
|
||||||
|
<aside class="notes">
|
||||||
|
19260 is Phoenixville, PA
|
||||||
|
</aside>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<table><tr><td>
|
||||||
|
<%= data_table(["Name", "ZipCode"], [["Alice", "10003"], ["Bob","14260"]], name: "$R_1$", rowids: true) %>
|
||||||
|
</td><td>or</td><td>
|
||||||
|
<%= data_table(["Name", "Division"], [["Alice", "10003"], ["Bob","19260"]], name: "$R_2$", rowids: true) %>
|
||||||
|
</td></tr></table>
|
||||||
|
<p class="fragment" style="font-size: 80%">$$Q_2 = \pi_{Name}\big( \sigma_{region = \texttt{'Northeast'}} (R \bowtie_{zip} ZipLookups) \big)$$</p>
|
||||||
|
<table class="fragment"><tr>
|
||||||
|
<td style="font-size: 600%; margin: 0px; padding: 0px; height: 0.5em;">{</td>
|
||||||
|
<td style="vertical-align: middle;">
|
||||||
|
<%= data_table(["Name"], [["Alice"], ["Bob"]], name: "$Q(R_1)$", rowids: true) %>
|
||||||
|
</td><td style="vertical-align: middle; font-weight: bold;">or</td><td style="vertical-align: middle;">
|
||||||
|
<%= data_table(["Name"], [["Alice"], ["Bob"]], name: "$Q(R_2)$", rowids: true) %>
|
||||||
|
</td>
|
||||||
|
<td style="font-size: 600%; margin: 0px; padding: 0px; height: 0.5em;">}</td>
|
||||||
|
</tr></table>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<table><tr><td>
|
||||||
|
<%= data_table(["Name", "ZipCode"], [["Alice", "10003"], ["Bob","14260"]], name: "$R_1$", rowids: true) %>
|
||||||
|
</td><td>or</td><td>
|
||||||
|
<%= data_table(["Name", "Division"], [["Alice", "10003"], ["Bob","19260"]], name: "$R_2$", rowids: true) %>
|
||||||
|
</td></tr></table>
|
||||||
|
<p style="font-size: 80%">$$Q_2 = \pi_{Name}\big( \sigma_{region = \texttt{'Northeast'}} (R \bowtie_{zip} ZipLookups) \big)$$</p>
|
||||||
|
<table><tr>
|
||||||
|
<td style="font-size: 600%; margin: 0px; padding: 0px; height: 0.5em;">{</td>
|
||||||
|
<td style="vertical-align: middle;">
|
||||||
|
<%= data_table(["Name"], [["Alice"], ["Bob"]], name: "$Q(R_1)$ or $Q(R_2)$", rowids: true) %>
|
||||||
|
</td>
|
||||||
|
<td style="font-size: 600%; margin: 0px; padding: 0px; height: 0.5em;">}</td>
|
||||||
|
</tr></table>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<img src="graphics/2019-04-31-NormalDB.svg" /><br/>
|
||||||
|
<hr class="fragment" data-fragment-index="1"/>
|
||||||
|
<svg data-src="graphics/2019-04-31-IncompleteDB.svg" class="fragment" data-fragment-index="1"/>
|
||||||
|
</section>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<section>
|
||||||
|
<p><b>Challenge:</b> There can be <u>lots</u> of possible worlds.</p>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<p><b>Observation: </b> Possibilities for database creation break down into lots of independent choices.</p>
|
||||||
|
|
||||||
|
<p class="fragment"><u>Factorize</u> the database.</p>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<table><tr><td>
|
||||||
|
<%= data_table(["Name", "ZipCode"], [["Alice", "10003"], ["Bob","14260"], ["Carol", "13201"]], name: "$R_1$", rowids: true) %>
|
||||||
|
</td><td>
|
||||||
|
<%= data_table(["Name", "Division"], [["Alice", "10003"], ["Bob","19260"], ["Carol", "18201"]], name: "$R_2$", rowids: true) %>
|
||||||
|
</td></tr>
|
||||||
|
<tr><td>
|
||||||
|
<%= data_table(["Name", "ZipCode"], [["Alice", "10003"], ["Bob","14260"], ["Carol", "13201"]], name: "$R_3$", rowids: true) %>
|
||||||
|
</td><td>
|
||||||
|
<%= data_table(["Name", "Division"], [["Alice", "10003"], ["Bob","19260"], ["Carol", "18201"]], name: "$R_4$", rowids: true) %>
|
||||||
|
</td></tr></table>
|
||||||
|
<p class="fragment">Alice appears in every database. <br/>The only differences are Bob and Carol's zip codes.</p>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<h3>List Out Choices</h3>
|
||||||
|
|
||||||
|
<ul>
|
||||||
|
<li>$\texttt{bob}$<span class="fragment" data-fragment-index="1">$ \in \{ 4, 9 \}$</span> (Bob's zip code digit)</li>
|
||||||
|
<li>$\texttt{carol}$<span class="fragment" data-fragment-index="1">$ \in \{ 3, 8 \}$</span> (Carol's zip code digit)</li>
|
||||||
|
</ul>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<% [false, true].each do |with_annotations| %>
|
||||||
|
<section>
|
||||||
|
<%= data_table(
|
||||||
|
["Name", "ZipCode"],
|
||||||
|
[ ["Alice", "10003"],
|
||||||
|
["Bob","14260"],
|
||||||
|
["Bob","14290"],
|
||||||
|
["Carol","13201"],
|
||||||
|
["Carol","18201"]
|
||||||
|
],
|
||||||
|
name: "$\\mathcal R$",
|
||||||
|
rowids: true,
|
||||||
|
annotations: if with_annotations then [
|
||||||
|
"always",
|
||||||
|
"if $\\texttt{bob} = 4$",
|
||||||
|
"if $\\texttt{bob} = 9$",
|
||||||
|
"if $\\texttt{carol} = 3$",
|
||||||
|
"if $\\texttt{carol} = 8$"
|
||||||
|
] else nil end
|
||||||
|
) %>
|
||||||
|
<div class="fragment">
|
||||||
|
<div style="font-size: 200%">+</div>
|
||||||
|
<p>$\big[\;\texttt{bob} \in \{4, 9\},\; \texttt{carol} \in \{3, 8\}\;\big]$</p>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
<% end %>
|
||||||
|
<section>
|
||||||
|
<%= data_table(
|
||||||
|
["Name", "ZipCode"],
|
||||||
|
[ ["Alice", "10003"],
|
||||||
|
["Bob","14260"],
|
||||||
|
["Bob","14290"],
|
||||||
|
["Carol","13201"],
|
||||||
|
["Carol","18201"]
|
||||||
|
],
|
||||||
|
name: "$\\mathcal R$",
|
||||||
|
rowids: true,
|
||||||
|
annotations: [
|
||||||
|
"a",
|
||||||
|
"b",
|
||||||
|
"c",
|
||||||
|
"d",
|
||||||
|
"e"
|
||||||
|
]
|
||||||
|
) %>
|
||||||
|
<div style="font-size: 200%">+</div>
|
||||||
|
<p>Pick one of each: $\big[\;\{a\},\; \{b, c\},\; \{d, e\}\;\big]$</p>
|
||||||
|
<p>Set those variables to $T$ and all others to $F$</p>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<p>$R_1 \equiv \big[a \rightarrow T, b \rightarrow T, d \rightarrow T, * \rightarrow F\big]$</p>
|
||||||
|
<%= data_table(
|
||||||
|
["Name", "ZipCode"],
|
||||||
|
[ ["Alice", "10003"],
|
||||||
|
["Bob","14260"],
|
||||||
|
["Bob","14290"],
|
||||||
|
["Carol","13201"],
|
||||||
|
["Carol","18201"]
|
||||||
|
],
|
||||||
|
name: "$\\mathcal R$",
|
||||||
|
rowids: true,
|
||||||
|
annotations: [
|
||||||
|
"T (a)",
|
||||||
|
"T (b)",
|
||||||
|
"F (c)",
|
||||||
|
"T (d)",
|
||||||
|
"F (e)"
|
||||||
|
]
|
||||||
|
) %>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<p>Use provenance as before...</p>
|
||||||
|
<p class="fragment">... but what about aggregates?</p>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<pre><code>
|
||||||
|
SELECT COUNT(*)
|
||||||
|
FROM R NATURAL JOIN ZipCodeLookup
|
||||||
|
WHERE State = 'NY'
|
||||||
|
</code></pre>
|
||||||
|
<p style="font-size: 70%" class="fragment">
|
||||||
|
$$= \begin{cases}
|
||||||
|
1 & \textbf{if } \texttt{bob} = 9 \wedge \texttt{carol} = 8\\
|
||||||
|
2 & \textbf{if } \texttt{bob} = 4 \wedge \texttt{carol} = 8 \\&\; \vee\; \texttt{bob} = 9 \wedge \texttt{carol} = 3\\
|
||||||
|
3 & \textbf{if } \texttt{bob} = 4 \wedge \texttt{carol} = 3
|
||||||
|
\end{cases}$$</p>
|
||||||
|
<p class="fragment"><b>Problem: </b> A combinatorial explosion of possibilities</p>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<p><b>Idea: </b> Simplify the problem</p>
|
||||||
|
<ol>
|
||||||
|
<li class="fragment">Is a particular tuple <i>Possible</i>?</li>
|
||||||
|
<li class="fragment">Is a particular tuple <i>Certain</i>?</li>
|
||||||
|
</ol>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<dl>
|
||||||
|
<div class="fragment">
|
||||||
|
<dt>Certain Tuple</dt>
|
||||||
|
<dd>A tuple that appears in all possible worlds</dd>
|
||||||
|
<dd class="fragment">$\forall D \in \mathcal D : t \in D$</dd>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="fragment">
|
||||||
|
<dt>Possible Tuple</dt>
|
||||||
|
<dd>A tuple that appears in at least one possible world</dd>
|
||||||
|
<dd class="fragment">$\exists D \in \mathcal D : t \in D$</dd>
|
||||||
|
</div>
|
||||||
|
</dl>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<h3>Non-aggregate queries</h3>
|
||||||
|
<dl>
|
||||||
|
<dt>Is a tuple Certain?</dt>
|
||||||
|
<dd class="fragment">Is the provenance polynomial a tautology?</dd>
|
||||||
|
|
||||||
|
<dt>Is a tuple Possible?</dt>
|
||||||
|
<dd class="fragment">Is the provenance polynomial satisfiable (i.e., not a contradiction)?</dd>
|
||||||
|
</dl>
|
||||||
|
<p class="fragment">Pick your favorite SAT solver, plug in and go</p>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<h3>Aggregate queries</h3>
|
||||||
|
|
||||||
|
<p style="margin-top: 50px; margin-bottom: 50px;">
|
||||||
|
As before, factorize the possible outcomes
|
||||||
|
</p>
|
||||||
|
<p class="fragment">
|
||||||
|
$$1 + \{\;1\;\textbf{if}\;\texttt{bob} = 4\;\} + \{\;1\;\textbf{if}\;\texttt{carol} = 3\;\}$$
|
||||||
|
</p>
|
||||||
|
<p style="margin-top: 50px;" class="fragment">
|
||||||
|
Not bigger than the aggregate input...
|
||||||
|
</p>
|
||||||
|
<p class="fragment">
|
||||||
|
...but at least it only reduces to bin-packing <br/>(or a similar NP-hard problem).
|
||||||
|
</p>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<p>In short, incomplete databases are limited, but have some uses.</p>
|
||||||
|
<p class="fragment">What about probabilities?</p>
|
||||||
|
</section>
|
||||||
|
</section>
|
|
@ -0,0 +1,355 @@
|
||||||
|
---
|
||||||
|
template: templates/cse4562_2019_slides.erb
|
||||||
|
title: Checkpoint 4
|
||||||
|
date: May 3, 2019
|
||||||
|
textbook:
|
||||||
|
dependencies:
|
||||||
|
- lib/slide_utils.rb
|
||||||
|
---
|
||||||
|
<%
|
||||||
|
require "slide_utils.rb"
|
||||||
|
%>
|
||||||
|
<section>
|
||||||
|
<section>
|
||||||
|
<h3>A few things first...</h3>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<img src="graphics/2019-05-03-DemoDay.png" class="stretch" />
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<h3>4/562 Databake Off @ 3:00</h3>
|
||||||
|
<p>RSVP (limited space available) to participate</p>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<h3>A note on optimization...</h3>
|
||||||
|
|
||||||
|
<p>Lots of interesting strategies used in Checkpoint 3</p>
|
||||||
|
<ul>
|
||||||
|
<li>Pre-parsing</li>
|
||||||
|
<li>Column Stores</li>
|
||||||
|
<li>Cost-based Opt</li>
|
||||||
|
<li class="fragment">Hyper-optimize the slowest query</li>
|
||||||
|
</ul>
|
||||||
|
</section>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<section>
|
||||||
|
<h2>Checkpoint 4</h2>
|
||||||
|
<h3>Implement Updates</h3>
|
||||||
|
<p class="fragment">(lambda-architecture edition)</p>
|
||||||
|
<p class="fragment">Due May 20</p>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<ul>
|
||||||
|
<li>A stream of inserts, deletes, updates, and queries.</li>
|
||||||
|
<li>No restarts.</li>
|
||||||
|
<li>Answer queries as fast as possible.</li>
|
||||||
|
<li>Make sure query results account for DML effects.</li>
|
||||||
|
</ul>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<dl>
|
||||||
|
<dt>Stage 0</dt>
|
||||||
|
<dd>10 minutes of prep</dd>
|
||||||
|
<dt>Stage 1</dt>
|
||||||
|
<dd>Inserts only</dd>
|
||||||
|
<dt>Stage 2</dt>
|
||||||
|
<dd>Inserts + Deletes</dd>
|
||||||
|
<dt>Stage 3</dt>
|
||||||
|
<dd>Inserts + Deletes + Updates</dd>
|
||||||
|
</dl>
|
||||||
|
<p class="fragment">No restarts.</p>
|
||||||
|
</section>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<section>
|
||||||
|
<h3>Do I need to implement block-based storage?</h3>
|
||||||
|
<p class="fragment">No (although you can).</p>
|
||||||
|
<p class="fragment">Ok... so what else can I do?</p>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<h3>Classical Databases</h3>
|
||||||
|
<img src="graphics/2018-02-19-PrimaryVsSecondary.png" />
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<p><b>Problem 1:</b> More indexes = Slower writes (bad for OLTP)</p>
|
||||||
|
<p><b>Problem 2:</b> Fewer indexes = Slower reads (bad for OLAP)</p>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<p>What if you have both OLAP and OLTP workloads?</p>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<p><b>Idea:</b> Weekly / Nightly / Hourly dump<br/>from OLTP System to OLAP system.</p>
|
||||||
|
<p class="fragment">(Index the data while dumping)</p>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<p><b>Problem:</b> Not seeing the freshest data!</p>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<p><b>Better Idea:</b> OLTP DB + OLAP DB.</p>
|
||||||
|
<p class="fragment">OLTP DB has few indexes, but only stores recent updates.</p>
|
||||||
|
<p class="fragment">OLAP DB has many indexes, and stores everything except recent updates.</p>
|
||||||
|
<p class="fragment">Periodically migrate updates into OLAP DB.</p>
|
||||||
|
<p class="fragment">(Lambda Architecture)</p>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<h2>Checkpoint 4</h2>
|
||||||
|
<h3>Suggested Approach: Lambda-Lite</h3>
|
||||||
|
</section>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<section>
|
||||||
|
<h3>Handling Inserts</h3>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<pre><code class="sql">
|
||||||
|
INSERT INTO FOO(A, B, C) VALUES (1, 2, 3);
|
||||||
|
</code></pre>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<%=
|
||||||
|
relational_algebra() do
|
||||||
|
ra_table("Orig")
|
||||||
|
end
|
||||||
|
%>
|
||||||
|
</section>
|
||||||
|
<section>
|
||||||
|
<%=
|
||||||
|
relational_algebra(debug: false) do
|
||||||
|
ra_union(
|
||||||
|
ra_table("Orig"),
|
||||||
|
ra_table("New")
|
||||||
|
)
|
||||||
|
end
|
||||||
|
%>
|
||||||
|
</section>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<section>
|
||||||
|
<h3>Example</h3>
|
||||||
|
</section>
|
||||||
|
<section>
|
||||||
|
<pre><code class="sql">
|
||||||
|
SELECT COUNT(*) FROM lineitem WHERE mktsegment = 'BUILDING';
|
||||||
|
</code></pre>
|
||||||
|
</section>
|
||||||
|
<section>
|
||||||
|
<%=
|
||||||
|
relational_algebra do
|
||||||
|
ra_aggregate(nil, "COUNT(*)",
|
||||||
|
ra_select("mktsegment = 'BUILDING'",
|
||||||
|
ra_table("lineitem")
|
||||||
|
)
|
||||||
|
)
|
||||||
|
end
|
||||||
|
%>
|
||||||
|
</section>
|
||||||
|
<section>
|
||||||
|
<%=
|
||||||
|
relational_algebra do
|
||||||
|
ra_aggregate(nil, "COUNT(*)",
|
||||||
|
ra_select("mktsegment = 'BUILDING'",
|
||||||
|
ra_union(
|
||||||
|
ra_table("lineitem"),
|
||||||
|
ra_table("inserts")
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
end
|
||||||
|
%>
|
||||||
|
</section>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<section>
|
||||||
|
<h3>Handling Deletes</h3>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<pre><code class="sql">
|
||||||
|
DELETE FROM FOO WHERE A > 5;
|
||||||
|
</code></pre>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<%=
|
||||||
|
relational_algebra do
|
||||||
|
ra_table("Orig")
|
||||||
|
end
|
||||||
|
%>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<%=
|
||||||
|
relational_algebra do
|
||||||
|
ra_diff(
|
||||||
|
ra_table("Orig"),
|
||||||
|
ra_table("New")
|
||||||
|
)
|
||||||
|
end
|
||||||
|
%>
|
||||||
|
<p class="fragment">... but that's not quite how SQL Delete works.</p>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<pre><code class="sql">
|
||||||
|
DELETE FROM FOO WHERE A > 5;
|
||||||
|
</code></pre>
|
||||||
|
<div class="fragment">
|
||||||
|
<%=
|
||||||
|
relational_algebra do
|
||||||
|
ra_select("A ≤ 5",
|
||||||
|
ra_table("FOO")
|
||||||
|
)
|
||||||
|
end
|
||||||
|
%>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<pre><code class="sql">
|
||||||
|
DELETE FROM Orig WHERE Something;
|
||||||
|
</code></pre>
|
||||||
|
<%=
|
||||||
|
relational_algebra do
|
||||||
|
ra_select("NOT Something",
|
||||||
|
ra_table("Orig")
|
||||||
|
)
|
||||||
|
end
|
||||||
|
%>
|
||||||
|
</section>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<section>
|
||||||
|
<h3>Example</h3>
|
||||||
|
</section>
|
||||||
|
<section>
|
||||||
|
<pre><code class="sql">
|
||||||
|
INSERT INTO lineitem(...) VALUES (...);
|
||||||
|
INSERT INTO lineitem(...) VALUES (...);
|
||||||
|
DELETE FROM lineitem WHERE shipdate BETWEEN date(1997-10-01)
|
||||||
|
AND date(1997-10-30);
|
||||||
|
SELECT COUNT(*) FROM lineitem WHERE mktsegment = 'BUILDING';
|
||||||
|
</code></pre>
|
||||||
|
</section>
|
||||||
|
<section>
|
||||||
|
<%=
|
||||||
|
relational_algebra do
|
||||||
|
ra_aggregate(nil, "COUNT(*)",
|
||||||
|
ra_select("mktsegment = 'BUILDING'",
|
||||||
|
ra_table("lineitem")
|
||||||
|
)
|
||||||
|
)
|
||||||
|
end
|
||||||
|
%>
|
||||||
|
</section>
|
||||||
|
<section>
|
||||||
|
<%=
|
||||||
|
relational_algebra do
|
||||||
|
ra_aggregate(nil, "COUNT(*)",
|
||||||
|
ra_select("mktsegment = 'BUILDING'",
|
||||||
|
ra_union(
|
||||||
|
ra_table("lineitem"),
|
||||||
|
ra_table("inserts")
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
end
|
||||||
|
%>
|
||||||
|
</section>
|
||||||
|
<section>
|
||||||
|
<%=
|
||||||
|
relational_algebra do
|
||||||
|
ra_aggregate(nil, "COUNT(*)",
|
||||||
|
ra_select("mktsegment = 'BUILDING'",
|
||||||
|
ra_select("shipdate NOT BETWEEN ...",
|
||||||
|
ra_union(
|
||||||
|
ra_table("lineitem"),
|
||||||
|
ra_table("inserts")
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
end
|
||||||
|
%>
|
||||||
|
</section>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<section>
|
||||||
|
<h3>Handling Updates</h3>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<pre><code class="sql">
|
||||||
|
UPDATE Foo SET A = 1, B = 2 WHERE C = 3;
|
||||||
|
</code></pre>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<pre><code class="sql">
|
||||||
|
UPDATE Foo SET A = 1, B = 2 WHERE C = 3;
|
||||||
|
</code></pre>
|
||||||
|
<%=
|
||||||
|
relational_algebra do
|
||||||
|
ra_union(
|
||||||
|
ra_select( "C = 3",
|
||||||
|
ra_project( { A: "1", B: "2", C: "C" },
|
||||||
|
ra_table("Foo")
|
||||||
|
)
|
||||||
|
),
|
||||||
|
ra_select( "C ≠ 3",
|
||||||
|
ra_table("Foo")
|
||||||
|
)
|
||||||
|
)
|
||||||
|
end
|
||||||
|
%>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<pre><code class="sql">
|
||||||
|
UPDATE Foo SET A = 1, B = 2 WHERE C = 3;
|
||||||
|
</code></pre>
|
||||||
|
<%=
|
||||||
|
relational_algebra do
|
||||||
|
ra_project( { A: "CASE WHEN C = 3 THEN 1 ELSE A END", B: "CASE ...", C: "C"},
|
||||||
|
ra_table("Foo")
|
||||||
|
)
|
||||||
|
end
|
||||||
|
%>
|
||||||
|
<pre class="fragment "><code class="sql">
|
||||||
|
SELECT CASE WHEN C = 3 THEN 1 ELSE A END AS A,
|
||||||
|
CASE WHEN C = 3 THEN 2 ELSE B END AS B,
|
||||||
|
C AS C
|
||||||
|
FROM Foo;
|
||||||
|
</code></pre>
|
||||||
|
</section>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<h3>Final Advice</h3>
|
||||||
|
<ul>
|
||||||
|
<li class="fragment">This isn't the only way to implement updates.</li>
|
||||||
|
<li class="fragment">Optimizer performance is crucial!</li>
|
||||||
|
<li class="fragment">Consider periodically pausing to collapse updates</li>
|
||||||
|
</ul>
|
||||||
|
</section>
|
|
@ -0,0 +1,63 @@
|
||||||
|
---
|
||||||
|
template: templates/cse4562_2019_slides.erb
|
||||||
|
title: Incomplete and Probabilistic Databases
|
||||||
|
date: May 6, 2019
|
||||||
|
textbook: "<a href='https://github.com/UBOdin/mimir/wiki/Concepts-CTables'>PDB Concepts and C-Tables</a>"
|
||||||
|
dependencies:
|
||||||
|
- lib/slide_utils.rb
|
||||||
|
---
|
||||||
|
<%
|
||||||
|
require "slide_utils.rb"
|
||||||
|
%>
|
||||||
|
<section>
|
||||||
|
<section>
|
||||||
|
<p><b>Idea: </b> Make $\texttt{bob}$ and $\texttt{carol}$ random variables.</p>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<p>$$\texttt{bob} = \begin{cases} 4 & p = 0.8 \\ 9 & p = 0.2\end{cases}$$</p>
|
||||||
|
<p>$$\texttt{carol} = \begin{cases} 3 & p = 0.4 \\ 8 & p = 0.6\end{cases}$$</p>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<p style="font-size: 70%">
|
||||||
|
$$Q(\mathcal D) = \begin{cases}
|
||||||
|
1 & \textbf{if } \texttt{bob} = 9 \wedge \texttt{carol} = 8\\
|
||||||
|
2 & \textbf{if } \texttt{bob} = 4 \wedge \texttt{carol} = 8 \\&\; \vee\; \texttt{bob} = 9 \wedge \texttt{carol} = 3\\
|
||||||
|
3 & \textbf{if } \texttt{bob} = 4 \wedge \texttt{carol} = 3
|
||||||
|
\end{cases}$$</p>
|
||||||
|
<p style="font-size: 90%" class="fragment">
|
||||||
|
$$ = \begin{cases}
|
||||||
|
1 & p = 0.2 \times 0.6\\
|
||||||
|
2 & p = 0.8 \times 0.6 + 0.2 \times 0.4\\
|
||||||
|
3 & p = 0.8 \times 0.4 \end{cases}$$
|
||||||
|
</p>
|
||||||
|
<p class="fragment">
|
||||||
|
$$ = \begin{cases}
|
||||||
|
1 & p = 0.12\\
|
||||||
|
2 & p = 0.56\\
|
||||||
|
3 & p = 0.32\end{cases}$$
|
||||||
|
</p>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<p>
|
||||||
|
$$Q(\mathcal D) = \begin{cases}
|
||||||
|
1 & p = 0.12\\
|
||||||
|
2 & p = 0.56\\
|
||||||
|
3 & p = 0.32\end{cases}$$
|
||||||
|
</p>
|
||||||
|
<p class="fragment" style="margin-top: 50px;">$E\left[Q(\mathcal D)\right] = 0.12+1.12+0.96 = 2.20$</p>
|
||||||
|
<p class="fragment" style="margin-top: 50px;">$P\left[Q(\mathcal D) \geq 2\right] = 0.56+0.32 = 0.88$</p>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<p>In general, computing probabilities exactly is <code>#P</code>-hard</p>
|
||||||
|
|
||||||
|
<p style="margin-top: 50px;" class="fragment">... so we approximate</p>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<p><b>Idea 1</b>: Sample. Pick 10 random possible worlds and compute results for each.</p>
|
||||||
|
</section>
|
||||||
|
</section>
|
Binary file not shown.
After Width: | Height: | Size: 756 KiB |
Loading…
Reference in New Issue