From 8260cf4aa057d4714703ccdf2aaedf6c118d3a84 Mon Sep 17 00:00:00 2001
From: Oliver Kennedy <okennedy@buffalo.edu>
Date: Sun, 5 May 2019 23:24:26 -0400
Subject: [PATCH] ProbDB slides

---
 .../2019sp/slide/2019-05-01-IncompleteDBs.erb |  15 +-
 .../2019sp/slide/2019-05-06-ProbDBs.erb       | 333 +++++++++++++++++-
 2 files changed, 334 insertions(+), 14 deletions(-)
diff --git a/src/teaching/cse-562/2019sp/slide/2019-05-01-IncompleteDBs.erb b/src/teaching/cse-562/2019sp/slide/2019-05-01-IncompleteDBs.erb
index a0df9fa1..9bc9c698 100644
--- a/src/teaching/cse-562/2019sp/slide/2019-05-01-IncompleteDBs.erb
+++ b/src/teaching/cse-562/2019sp/slide/2019-05-01-IncompleteDBs.erb
@@ -1,6 +1,6 @@
 ---
 template: templates/cse4562_2019_slides.erb
-title: Incomplete and Probabilistic Databases
+title: Querying Incomplete Databases
 date: May 1, 2019
 textbook: "<a href='https://github.com/UBOdin/mimir/wiki/Concepts-CTables'>PDB Concepts and C-Tables</a>"
 dependencies: 
@@ -180,7 +180,7 @@ dependencies:
         ["Name", "ZipCode"], 
         [ ["Alice", "10003"], 
           ["Bob","14260"], 
-          ["Bob","14290"], 
+          ["Bob","19260"], 
           ["Carol","13201"], 
           ["Carol","18201"]
         ], 
@@ -205,7 +205,7 @@ dependencies:
       ["Name", "ZipCode"], 
       [ ["Alice", "10003"], 
         ["Bob","14260"], 
-        ["Bob","14290"], 
+        ["Bob","19260"], 
         ["Carol","13201"], 
         ["Carol","18201"]
       ], 
@@ -230,7 +230,7 @@ dependencies:
       ["Name", "ZipCode"], 
       [ ["Alice", "10003"], 
         ["Bob","14260"], 
-        ["Bob","14290"], 
+        ["Bob","19260"], 
         ["Carol","13201"], 
         ["Carol","18201"]
       ], 
@@ -315,12 +315,7 @@ dependencies:
       Not bigger than the aggregate input...
     </p>
     <p class="fragment">
-      ...but at least it only reduces to bin-packing <br/>(or a similarly NP problem.) 
+      ...but at least it only reduces to bin-packing <br/>(or a similarly known NP problem.) 
     </p>
   </section>
-
-  <section>
-    <p>In short, incomplete databases are limited, but have some uses.</p>
-    <p class="fragment">What about probabilities?</p>
-  </section>
 </section>
diff --git a/src/teaching/cse-562/2019sp/slide/2019-05-06-ProbDBs.erb b/src/teaching/cse-562/2019sp/slide/2019-05-06-ProbDBs.erb
index f35c50b4..fb158152 100644
--- a/src/teaching/cse-562/2019sp/slide/2019-05-06-ProbDBs.erb
+++ b/src/teaching/cse-562/2019sp/slide/2019-05-06-ProbDBs.erb
@@ -1,6 +1,6 @@
 ---
 template: templates/cse4562_2019_slides.erb
-title: Incomplete and Probabilistic Databases
+title: Probabilistic Databases
 date: May 6, 2019
 textbook: "<a href='https://github.com/UBOdin/mimir/wiki/Concepts-CTables'>PDB Concepts and C-Tables</a>"
 dependencies: 
@@ -10,15 +10,74 @@ dependencies:
   require "slide_utils.rb" 
 %>
 <section>
+
   <section>
-    <p><b>Idea: </b> Make $\texttt{bob}$ and $\texttt{carol}$ random variables.</p>
+    <img src="graphics/2019-04-31-NormalDB.svg" /><br/>
+    <hr class="fragment" data-fragment-index="1"/>
+    <svg data-src="graphics/2019-04-31-IncompleteDB.svg" class="fragment" data-fragment-index="1"/>
   </section>
 
   <section>
+    <h3>(One Form of) Incomplete Databases</h3>
+    <ul>
+      <li>Define each choice as a variable</li>
+      <li>Tag each row with a boolean formula over variables</li>
+      <li>Each possible world is one assignment of values to variables</li>
+      <li>The possible world has all rows tagged with formulas that evaluate to "true"</li>
+    </ul>
+  </section>
+
+  <section>
+    <dl>
+      <dt>Certain Tuple</dt>
+      <dd>A tuple that appears in all possible worlds</dd>
+
+      <dt>Possible Tuple</dt>
+      <dd>A tuple that appears in at least one possible world</dd>
+    </dl>
+  </section>
+
+  <section>
+    <p><b>Limitation: </b> Can't distinguish between possible-but-unlikely and possible-but-very-likely.</p>
+  </section>
+
+  <section>
+    <p><b>Idea: </b> Make variables probabilistic</p>
+  </section>
+
+  <section>
+    <h3>Example</h3>
     <p>$$\texttt{bob} = \begin{cases} 4 & p = 0.8 \\ 9 & p = 0.2\end{cases}$$</p>
     <p>$$\texttt{carol} = \begin{cases} 3 & p = 0.4 \\ 8 & p = 0.6\end{cases}$$</p>
   </section>
 
+    <section>
+      <%= data_table(
+        ["Name", "ZipCode"], 
+        [ ["Alice", "10003"], 
+          ["Bob","14260"], 
+          ["Bob","19260"], 
+          ["Carol","13201"], 
+          ["Carol","18201"]
+        ], 
+        name: "$\\mathcal R$", 
+        rowids: true, 
+        annotations: [
+          "always", 
+          "if $\\texttt{bob} = 4$", 
+          "if $\\texttt{bob} = 9$", 
+          "if $\\texttt{carol} = 3$", 
+          "if $\\texttt{carol} = 8$"
+        ]
+      ) %>
+
+    <pre class="fragment"><code>
+                SELECT COUNT(*) 
+                FROM R NATURAL JOIN ZipCodeLookup 
+                WHERE State = 'NY'
+    </code></pre>
+  </section>
+
   <section>
     <p style="font-size: 70%">
     $$Q(\mathcal D) = \begin{cases} 
@@ -52,12 +111,278 @@ dependencies:
   </section>
 
   <section>
-    <p>In general, computing probabilities exactly is <code>#P</code></p>
+    <p>In general, computing marginal probabilities for result tuples exactly is <code>#P</code>-hard</p>
 
     <p style="margin-top: 50px;" class="fragment">... so we approximate</p>
   </section>
+</section>
+
+<section>
+  <section>
+    <p><b>Idea 1</b>: Sample.  Pick (e.g.) 10 random possible worlds and compute results for each.</p>
+  </section>
+
+  <% 
+    bob = []     # sampled value of the `bob` variable, one entry per sampled world
+    carol = []   # sampled value of the `carol` variable, one entry per sampled world
+    counts = []  # rows counted in each sampled world (see counts[i] below)
+  %>
+  <% (0...5).each do |i| %>
+    <section>
+      <% 
+        bob[i] = if rand() < 0.8 then 4 else 9 end      # 4 with p = 0.8, else 9
+        carol[i] = if rand() < 0.4 then 3 else 8 end    # 3 with p = 0.4, else 8
+        counts[i] = [
+          1, # alice is counted in every sample
+          (if bob[i] == 4 then 1 else 0 end),   # bob is counted only when bob == 4
+          (if carol[i] == 3 then 1 else 0 end), # carol is counted only when carol == 3
+        ].sum
+      %>
+      <p>$$R_{<%=i+1%>} \Leftarrow \{\; \texttt{bob} \rightarrow <%= bob[i] %>, \; \texttt{carol} \rightarrow <%= carol[i] %>\}$$</p>
+
+      <%= data_table(
+        ["Name", "ZipCode"], 
+        [ ["Alice", "10003"], 
+          ["Bob","1#{bob[i]}260"],
+          ["Carol","1#{carol[i]}201"]
+        ], 
+        name: "$\\mathcal R_{#{i+1}}$", 
+        rowids: true, 
+      ) %>
+
+      <p>$$\mathcal Q = \{\;<%=counts.join(",\\;")%>\;\}$$</p>
+      <% if i == 4 %>
+        <p class="fragment">$$E[\mathcal Q] \approx <%=(counts.sum / counts.size.to_f).round(2)%>$$</p>
+        <p class="fragment">$$P[\mathcal Q \geq 2] \approx <%=(counts.select { |c| c >= 2 }.count / counts.size.to_f).round(2) %>$$</p>
+      <% else %>
+        <p>&nbsp;</p>
+        <p>&nbsp;</p>
+      <% end %>
+    </section>
+  <% end %>
+</section>
+
+<section>
+  <section>
+    <p><b>Problem</b>: Sloooooooooooow.</p>
+    <p class="fragment">Can we make it faster?</p>
+  </section>
 
   <section>
-    <p><b>Idea 1</b>: Sample.  Pick 10 random possible worlds and compute results for each.</p>
+    <p><b>Idea 1.A</b>: Combine all samples into one query.</p>
+  </section>
+
+  <section>
+    <div style="font-size: 80%">
+    <%= data_table(
+      [  "Name",  "ZipCode",       "$\\mathcal{ID}$"], 
+      (0...5).map do |i|
+        [ ["Alice", "10003",         "#{i+1}"],
+          ["Bob",   "1#{bob[i]}260", "#{i+1}"],
+          ["Carol","1#{carol[i]}201","#{i+1}"] ]
+      end.flatten(1), 
+        name: "$\\mathcal R$", 
+        rowids: true, 
+      ) %>
+    </div>
+  </section>
+
+  <section>
+    <%= data_table(
+      [  "Count",       "$\\mathcal{ID}$"], 
+      (0...5).map do |i|
+        [counts[i], "#{i+1}"]
+      end,
+        name: "$\\mathcal Q$", 
+        rowids: true, 
+      ) %>
+  </section>
+
+  <section>
+    <h3>Querying Joint Sample Tables</h3>
+
+    <p>$\pi_A(R) \rightarrow $ <span class="fragment">$\pi_{A, \mathcal{ID}}(R)$</span></p>
+    <p class="fragment">$\sigma_\phi(R) \rightarrow $ <span class="fragment">$\sigma_{\phi}(R)$</span></p>
+    <p class="fragment">$R \uplus S \rightarrow $ <span class="fragment">$R \uplus S$</span></p>
+    <p class="fragment">$R \times S \rightarrow $ <span class="fragment"><span class="fragment"><span class="fragment">$\pi_{R.*, S.*, R.\mathcal{ID}}\big($</span>$\sigma_{R.\mathcal{ID} = S.\mathcal{ID}}( $</span>$ R \times S)\big)$</span></p>
+    <p class="fragment">$\delta R \rightarrow $ <span class="fragment">$\delta R$</span></p>
+    <p class="fragment">$_A\gamma_{Agg(*)}(R) \rightarrow $ <span class="fragment">$_{A, \mathcal{ID}}\gamma_{Agg(*)}(R)$</span></p>
+  </section>
+</section>
+
+<section>
+  <section>
+    <p>Still sloooooow.</p>
+    <p class="fragment">There's a lot of repetition.</p>
+  </section>
+
+  <section>
+    <p><b>Idea 1.B</b>: Use native array-types in DBs</p>
+  </section>
+
+  <section>
+    <h3 class="fragment">Tuple Bundles</h3>
+    <div style="font-size: 80%">
+    <%= data_table(
+        [  "Name",  "ZipCode" ], 
+        [ ["Alice", "10003" ],
+          ["Bob",   "["+(0...5).map { |i| "1#{bob[i]}260" }.join(", ")+"]"],
+          ["Carol", "["+(0...5).map { |i| "1#{carol[i]}201" }.join(", ")+"]"]
+        ],
+        name: "$\\mathcal R$", 
+        rowids: true, 
+      ) %>
+    </div>
+    <attribution><a href="https://dl.acm.org/citation.cfm?id=1376686">MCDB: a monte carlo approach to managing uncertain data</a> (Jampani et al.)</attribution>
+  </section>
+
+  <section>
+    <h3>Querying Tuple Bundles</h3>
+
+    <p>$\pi_A(R) \rightarrow $ <span class="fragment">$\pi_{A}(R)$</span></p>
+    <p class="fragment">$\sigma_\phi(R) \rightarrow $ <span class="fragment">?</span></p>
+    <p>&nbsp;</p>
+    <p>&nbsp;</p>
+    <p>&nbsp;</p>
+    <p>&nbsp;</p>
+  </section>
+
+  <section>
+    <p><b>Idea 1.B'</b>: Also mark which tuples are present in which samples</p>
+  </section>
+
+  <section>
+    <div style="font-size: 70%">
+    <%= data_table(
+        [  "Name",  "ZipCode", "$\\mathcal W$" ], 
+        [ ["Alice", "10003", "11111" ],
+          ["Bob",   "["+(0...5).map { |i| "1#{bob[i]}260" }.join(", ")+"]", "11111"],
+          ["Carol", "["+(0...5).map { |i| "1#{carol[i]}201" }.join(", ")+"]", "11111"]
+        ],
+        name: "$\\mathcal R$", 
+        rowids: true, 
+      ) %>
+    </div>
+    <div class="fragment" style="margin: 50px;">↓ $\sigma_{InNYS(ZipCode)}(\mathcal R)$ ↓</div>
+    <div style="font-size: 70%" class="fragment">
+
+    <%= data_table(
+        [  "Name",  "ZipCode", "$\\mathcal W$" ], 
+        [ ["Alice", "10003", "11111" ],
+          ["Bob",   
+            "["+(0...5).map { |i| "1#{bob[i]}260" }.join(", ")+"]", 
+            (0...5).map { |i| if bob[i] == 4 then 1 else 0 end }.join ],
+          ["Carol", 
+            "["+(0...5).map { |i| "1#{carol[i]}201" }.join(", ")+"]", 
+            (0...5).map { |i| if carol[i] == 3 then 1 else 0 end }.join ]
+        ],
+        name: "$\\mathcal R$", 
+        rowids: true, 
+      ) %>
+    </div>
+  </section>
+
+  <section>
+    <h3>Querying Tuple Bundles</h3>
+
+    <p>$\pi_A(R) \rightarrow $ $\pi_{A}(R)$</p>
+    <p class="fragment">$\sigma_\phi(R) \rightarrow $ <span class="fragment"><span class="fragment">$\sigma_{\mathcal W \neq 0}($</span>$\pi_{\mathcal W \;\&\; \vec \phi}(R))$</span></p>
+    <p class="fragment">$R \uplus S \rightarrow $ <span class="fragment">$R \uplus S$</span></p>
+    <p class="fragment">$R \times S \rightarrow $ <span class="fragment"><span class="fragment"><span class="fragment">$\sigma_{\mathcal{W} \neq 0}\big($</span>$\pi_{R.*, S.*, R.\mathcal{W} \;\&\; S.\mathcal{W}}( $</span>$ R \times S)\big)$</span></p>
+    <p class="fragment">$_A\gamma_{Agg(B)}(R) \rightarrow $ <span class="fragment">$_A\gamma_{[ Agg\big(\textbf{if}(W[1])\{R.B[1]\}\big), Agg\big(\textbf{if}(W[2])\{R.B[2]\}\big), \ldots ]}(R)$</span></p>
+  </section>
+
+  <section>
+    <h3>Querying Tuple Bundles</h3>
+
+    <p>$\pi_A(R) \rightarrow \pi_{A}(R)$</p>
+    <p>$\sigma_\phi(R) \rightarrow \sigma_{\mathcal W \neq 0}(\pi_{\mathcal W \;\&\; \vec \phi}(R))$</p>
+    <p>$R \uplus S \rightarrow R \uplus S$</p>
+    <p class="fragment highlight-blue" data-fragment-index="1">$R \times S \rightarrow \sigma_{\mathcal{W} \neq 0}\big(\pi_{R.*, S.*, R.\mathcal{W} \;\&\; S.\mathcal{W}}(  R \times S)\big)$</p>
+    <p class="fragment highlight-blue" data-fragment-index="2">$_A\gamma_{Agg(B)}(R) \rightarrow $ $_A\gamma_{[ Agg\big(\textbf{if}(W[1])\{R.B[1]\}\big), Agg\big(\textbf{if}(W[2])\{R.B[2]\}\big), \ldots ]}(R)$</p>
+    <p>(Generate aggregates for each sample separately)</p>
+    <p class="fragment" data-fragment-index="1" style="margin-top: 60px; font-size: 60%">Good luck ever doing an equi-join.</p>
+    <p class="fragment" data-fragment-index="2" style="margin-top: 0px; font-size: 60%">Hope your group-by variables aren't uncertain.</p>
+  </section>
+
+
+</section>
+
+<section>
+  <section>
+    <p>Inefficient equi-joins on uncertain variables.</p>
+    <p>Inefficient aggregates with uncertain variables.</p>
+    <p class="fragment">How many samples are necessary to get the desired precision?</p>
+  </section>
+
+  <section>
+    <p><b>Idea 2</b>: Symbolic Execution (Provenance)</p>
+  </section>
+
+  <section>
+    <p>$\sigma_{count \geq 2}(Q) =$</p>
+    <p class="fragment">$\texttt{bob} = 4 \wedge \texttt{carol} = 8 $<br/>
+                        $\vee\; \texttt{bob} = 9 \wedge \texttt{carol} = 3 $<br/>
+                        $\vee\; \texttt{bob} = 4 \wedge \texttt{carol} = 3$</p>
+    <p class="fragment">$P[\sigma_{count \geq 2}(Q)] = ?$ <span class="fragment">$\approx$ #SAT</span></p>
+  </section>
+
+  <section>
+    <h3>Computing Probabilities</h3>
+
+    <p class="fragment">$P[\texttt{x} \wedge \texttt{y}] = P[\texttt{x}] \cdot P[\texttt{y}]$<br/>(iff $\texttt{x}$ and $\texttt{y}$ are independent)</p>
+
+    <p class="fragment">$P[\texttt{x} \wedge \texttt{y}] = 0$<br/>(iff $\texttt{x}$ and $\texttt{y}$ are mutually exclusive)</p>
+
+    <p class="fragment">$P[\texttt{x} \vee \texttt{y}] = 1- (1-P[\texttt{x}]) \cdot (1-P[\texttt{y}])$<br/>(iff $\texttt{x}$ and $\texttt{y}$ are independent)</p>
+
+    <p class="fragment">$P[\texttt{x} \vee \texttt{y}] = P[\texttt{x}] + P[\texttt{y}]$<br/>(iff $\texttt{x}$ and $\texttt{y}$ are mutually exclusive)</p>
+
+    <p class="fragment" style="font-size: 70%; font-weight: bold;">Good enough to get us the probability of any boolean formula over mutually exclusive or independent variables</p>
+
+    <p class="fragment" style="font-size: 70%; font-weight: bold;">... and otherwise?</p>
+  </section>
+
+  <section>
+     <h3>Shannon Expansion</h3>
+
+     <p>For a boolean formula $f$ and variable $\texttt{x}$:</p>
+
+     <p>$$f = (\texttt{x} \wedge f[\texttt{x}\backslash T]) \vee (\neg \texttt{x} \wedge f[\texttt{x}\backslash F])$$</p>
+
+     <p class="fragment" style="margin-top: 50px;margin-bottom:0px;">Disjunction of mutually-exclusive terms!</p>
+     <p class="fragment" style="margin-top: 0px;margin-bottom:0px;">... each a conjunction of independent terms.</p>
+     <p class="fragment" style="margin-top: 0px;margin-bottom:0px;">... and $\texttt{x}$ removed from $f$</p>
+
+     <p class="fragment">Ok... just keep applying Shannon!</p>
+     <p class="fragment">Each application creates 2 new formulas (ExpTime!)</p>
+  </section>
+
+  <section>
+    <p><b>Idea 2.A</b>: Combine the two.  Use Shannon expansion as long as time/resources permit, then use a #SAT approximation.</p>
+
+    <attribution class="fragment"><a href="https://ieeexplore.ieee.org/abstract/document/4812442/">Sprout: Lazy vs. eager query plans for tuple-independent probabilistic databases</a> (Olteanu et al.)</attribution>
+  </section>
+</section>
+
+<section>
+  <section>
+    <h3>More Resources</h3>
+    <dl style="font-size: 80%">
+      <dt><a href="https://dl.acm.org/citation.cfm?id=1376686">MCDB</a></dt>
+      <dd>Sampling-based probabilistic databases</dd>
+
+      <dt><a href="https://ieeexplore.ieee.org/abstract/document/4812442/">Sprout</a></dt>
+      <dd>"Any-time" Approximation.</dd>
+
+      <dt><a href="https://dl.acm.org/citation.cfm?id=2824055">Mimir</a></dt>
+      <dd>PL tricks to make ProbDBs faster</dd>
+
+      <dt><a href="https://dl.acm.org/citation.cfm?id=2809991">DeepDive</a></dt>
+      <dd>ProbDBs used in practice to populate Knowledge Bases.</dd>
+
+      <dt><a href="https://ieeexplore.ieee.org/abstract/document/4812509/">Integrating and Ranking Uncertain Scientific Data</a></dt>
+      <dd>ProbDBs used in practice to predict gene expressions / propose experiments.</dd>
+    </dl>
   </section>
 </section>
\ No newline at end of file