From 8260cf4aa057d4714703ccdf2aaedf6c118d3a84 Mon Sep 17 00:00:00 2001 From: Oliver Kennedy Date: Sun, 5 May 2019 23:24:26 -0400 Subject: [PATCH] ProbDB slides --- .../2019sp/slide/2019-05-01-IncompleteDBs.erb | 15 +- .../2019sp/slide/2019-05-06-ProbDBs.erb | 333 +++++++++++++++++- 2 files changed, 334 insertions(+), 14 deletions(-) diff --git a/src/teaching/cse-562/2019sp/slide/2019-05-01-IncompleteDBs.erb b/src/teaching/cse-562/2019sp/slide/2019-05-01-IncompleteDBs.erb index a0df9fa1..9bc9c698 100644 --- a/src/teaching/cse-562/2019sp/slide/2019-05-01-IncompleteDBs.erb +++ b/src/teaching/cse-562/2019sp/slide/2019-05-01-IncompleteDBs.erb @@ -1,6 +1,6 @@ --- template: templates/cse4562_2019_slides.erb -title: Incomplete and Probabilistic Databases +title: Querying Incomplete Databases date: May 1, 2019 textbook: "PDB Concepts and C-Tables" dependencies: @@ -180,7 +180,7 @@ dependencies: ["Name", "ZipCode"], [ ["Alice", "10003"], ["Bob","14260"], - ["Bob","14290"], + ["Bob","19260"], ["Carol","13201"], ["Carol","18201"] ], @@ -205,7 +205,7 @@ dependencies: ["Name", "ZipCode"], [ ["Alice", "10003"], ["Bob","14260"], - ["Bob","14290"], + ["Bob","19260"], ["Carol","13201"], ["Carol","18201"] ], @@ -230,7 +230,7 @@ dependencies: ["Name", "ZipCode"], [ ["Alice", "10003"], ["Bob","14260"], - ["Bob","14290"], + ["Bob","19260"], ["Carol","13201"], ["Carol","18201"] ], @@ -315,12 +315,7 @@ dependencies: Not bigger than the aggregate input...

- ...but at least it only reduces to bin-packing
(or a similarly NP problem.) + ...but at least it only reduces to bin-packing
(or a similarly known NP problem.)

- -
-

In short, incomplete databases are limited, but have some uses.

-

What about probabilities?

-
diff --git a/src/teaching/cse-562/2019sp/slide/2019-05-06-ProbDBs.erb b/src/teaching/cse-562/2019sp/slide/2019-05-06-ProbDBs.erb index f35c50b4..fb158152 100644 --- a/src/teaching/cse-562/2019sp/slide/2019-05-06-ProbDBs.erb +++ b/src/teaching/cse-562/2019sp/slide/2019-05-06-ProbDBs.erb @@ -1,6 +1,6 @@ --- template: templates/cse4562_2019_slides.erb -title: Incomplete and Probabilistic Databases +title: Probabilistic Databases date: May 6, 2019 textbook: "PDB Concepts and C-Tables" dependencies: @@ -10,15 +10,74 @@ dependencies: require "slide_utils.rb" %>
+
-

Idea: Make $\texttt{bob}$ and $\texttt{carol}$ random variables.

+
+
+
+

(One Form of) Incomplete Databases

+
    +
  • Define each choice as a variable
  • +
  • Tag each row with a boolean formula over variables
  • +
  • Each possible world is one assignment of values to variables
  • +
  • The possible world has all rows tagged with formulas that evaluate to "true"
  • +
+
+ +
+
+
Certain Tuple
+
A tuple that appears in all possible worlds
+ +
Possible Tuple
+
A tuple that appears in at least one possible world
+
+
+ +
+

Limitation: Can't distinguish between possible-but-unlikely and possible-but-very-likely.

+
+ +
+

Idea: Make variables probabilistic

+
+ +
+

Example

$$\texttt{bob} = \begin{cases} 4 & p = 0.8 \\ 9 & p = 0.2\end{cases}$$

$$\texttt{carol} = \begin{cases} 3 & p = 0.4 \\ 8 & p = 0.6\end{cases}$$

+
+ <%= data_table( + ["Name", "ZipCode"], + [ ["Alice", "10003"], + ["Bob","14260"], + ["Bob","19260"], + ["Carol","13201"], + ["Carol","18201"] + ], + name: "$\\mathcal R$", + rowids: true, + annotations: [ + "always", + "if $\\texttt{bob} = 4$", + "if $\\texttt{bob} = 9$", + "if $\\texttt{carol} = 3$", + "if $\\texttt{carol} = 8$" + ] + ) %> + +

+                SELECT COUNT(*) 
+                FROM R NATURAL JOIN ZipCodeLookup 
+                WHERE State = 'NY'
+    
+
+

$$Q(\mathcal D) = \begin{cases} @@ -52,12 +111,278 @@ dependencies:

-

In general, computing probabilities exactly is #P

+

In general, computing marginal probabilities for result tuples exactly is #P

... so we approximate

+
+ +
+
+

Idea 1: Sample. Pick (e.g.) 10 random possible worlds and compute results for each.

+
+ + <% + bob = [] + carol = [] + counts = [] + %> + <% (0...5).each do |i| %> +
+ <% + bob[i] = if rand() < 0.8 then 4 else 9 end + carol[i] = if rand() < 0.4 then 3 else 8 end + counts[i] = [ + 1, # alice + (if bob[i] == 4 then 1 else 0 end), + (if carol[i] == 3 then 1 else 0 end), + ].compact.sum + %> +

$$R_{<%=i+1%>} \Leftarrow \{\; \texttt{bob} \rightarrow <%= bob[i] %>, \; \texttt{carol} \rightarrow <%= carol[i] %>\}$$

+ + <%= data_table( + ["Name", "ZipCode"], + [ ["Alice", "10003"], + ["Bob","1#{bob[i]}260"], + ["Carol","1#{carol[i]}201"] + ], + name: "$\\mathcal R_{#{i+1}}$", + rowids: true, + ) %> + +

$$\mathcal Q = \{\;<%=counts.join(",\\;")%>\;\}$$

+ <%# The enclosing loop iterates over (0...5), so index 4 is the final sample; reveal the aggregate estimates only on that slide. %> + <% if i == 4 %> +

$$E[\mathcal Q] \approx <%=counts.avg.round(2)%>$$

+

$$P[\mathcal Q \geq 2] \approx <%=(counts.select { |c| c >= 2 }.count / counts.size.to_f).round(2) %>$$

+ <% else %> +

 

+

 

+ <% end %> +
+ <% end %> +
+ +
+
+

Problem: Sloooooooooooow.

+

Can we make it faster?

+
-

Idea 1: Sample. Pick 10 random possible worlds and compute results for each.

+

Idea 1.A: Combine all samples into one query.

+
+ +
+
+ <%= data_table( + [ "Name", "ZipCode", "$\\mathcal{ID}$"], + (0...5).map do |i| + [ ["Alice", "10003", "#{i+1}"], + ["Bob", "1#{bob[i]}260", "#{i+1}"], + ["Carol","1#{carol[i]}201","#{i+1}"] ] + end.flatten(1), + name: "$\\mathcal R$", + rowids: true, + ) %> +
+
+ +
+ <%= data_table( + [ "Count", "$\\mathcal{ID}$"], + (0...5).map do |i| + [counts[i], "#{i+1}"] + end, + name: "$\\mathcal Q$", + rowids: true, + ) %> +
+ +
+

Querying Joint Sample Tables

+ +

$\pi_A(R) \rightarrow $ $\pi_{A, \mathcal{ID}}(R)$

+

$\sigma_\phi(R) \rightarrow $ $\sigma_{\phi}(R)$

+

$R \uplus S \rightarrow $ $R \uplus S$

+

$R \times S \rightarrow $ $\pi_{R.*, S.*, R.\mathcal{ID}}\big(\sigma_{R.\mathcal{ID} = S.\mathcal{ID}}( R \times S)\big)$

+

$\delta R \rightarrow $ $\delta R$

+

$_A\gamma_{Agg(*)}(R) \rightarrow $ $_{A, \mathcal{ID}}\gamma_{Agg(*)}(R)$

+
+
+ +
+
+

Still sloooooow.

+

There's a lot of repetition.

+
+ +
+

Idea 1.B: Use native array-types in DBs

+
+ +
+

Tuple Bundles

+
+ <%= data_table( + [ "Name", "ZipCode" ], + [ ["Alice", "10003" ], + ["Bob", "["+(0...5).map { |i| "1#{bob[i]}260" }.join(", ")+"]"], + ["Carol", "["+(0...5).map { |i| "1#{carol[i]}201" }.join(", ")+"]"] + ], + name: "$\\mathcal R$", + rowids: true, + ) %> +
+ MCDB: a Monte Carlo approach to managing uncertain data (Jampani et al.) +
+ +
+

Querying Tuple Bundles

+ +

$\pi_A(R) \rightarrow $ $\pi_{A}(R)$

+

$\sigma_\phi(R) \rightarrow $ ?

+

 

+

 

+

 

+

 

+
+ +
+

Idea 1.B': Also mark which tuples are present in which samples

+
+ +
+
+ <%= data_table( + [ "Name", "ZipCode", "$\\mathcal W$" ], + [ ["Alice", "10003", "11111" ], + ["Bob", "["+(0...5).map { |i| "1#{bob[i]}260" }.join(", ")+"]", "11111"], + ["Carol", "["+(0...5).map { |i| "1#{carol[i]}201" }.join(", ")+"]", "11111"] + ], + name: "$\\mathcal R$", + rowids: true, + ) %> +
+
↓ $\sigma_{InNYS(ZipCode)}(\mathcal R)$ ↓
+
+ + <%= data_table( + [ "Name", "ZipCode", "$\\mathcal W$" ], + [ ["Alice", "10003", "11111" ], + ["Bob", + "["+(0...5).map { |i| "1#{bob[i]}260" }.join(", ")+"]", + (0...5).map { |i| if bob[i] == 4 then 1 else 0 end }.join ], + ["Carol", + "["+(0...5).map { |i| "1#{carol[i]}201" }.join(", ")+"]", + (0...5).map { |i| if carol[i] == 3 then 1 else 0 end }.join ] + ], + name: "$\\mathcal R$", + rowids: true, + ) %> +
+
+ +
+

Querying Tuple Bundles

+ +

$\pi_A(R) \rightarrow $ $\pi_{A}(R)$

+

$\sigma_\phi(R) \rightarrow $ $\sigma_{\mathcal W \neq 0}(\pi_{\mathcal W \;\&\; \vec \phi}(R))$

+

$R \uplus S \rightarrow $ $R \uplus S$

+

$R \times S \rightarrow $ $\sigma_{\mathcal{W} \neq 0}\big(\pi_{R.*, S.*, R.\mathcal{W} \;\&\; S.\mathcal{W}}( R \times S)\big)$

+

$_A\gamma_{Agg(B)}(R) \rightarrow $ $_A\gamma_{[ Agg\big(\textbf{if}(W[1])\{R.B[1]\}\big), Agg\big(\textbf{if}(W[2])\{R.B[2]\}\big), \ldots ]}(R)$

+
+ +
+

Querying Tuple Bundles

+ +

$\pi_A(R) \rightarrow \pi_{A}(R)$

+

$\sigma_\phi(R) \rightarrow \sigma_{\mathcal W \neq 0}(\pi_{\mathcal W \;\&\; \vec \phi}(R))$

+

$R \uplus S \rightarrow R \uplus S$

+

$R \times S \rightarrow \sigma_{\mathcal{W} \neq 0}\big(\pi_{R.*, S.*, R.\mathcal{W} \;\&\; S.\mathcal{W}}( R \times S)\big)$

+

$_A\gamma_{Agg(B)}(R) \rightarrow $ $_A\gamma_{[ Agg\big(\textbf{if}(W[1])\{R.B[1]\}\big), Agg\big(\textbf{if}(W[2])\{R.B[2]\}\big), \ldots ]}(R)$

+

(Generate aggregates for each sample separately)

+

Good luck ever doing an equi-join.

+

Hope your group-by variables aren't uncertain.

+
+ + +
+ +
+
+

Inefficient equi-joins on uncertain variables.

+

Inefficient aggregates with uncertain variables.

+

How many samples are necessary to get the desired precision?

+
+ +
+

Idea 2: Symbolic Execution (Provenance)

+
+ +
+

$\sigma_{count \geq 2}(Q) =$

+

$\texttt{bob} = 4 \wedge \texttt{carol} = 8 $
+ $\vee\; \texttt{bob} = 9 \wedge \texttt{carol} = 3 $
+ $\vee\; \texttt{bob} = 4 \wedge \texttt{carol} = 3$

+

$P[\sigma_{count \geq 2}(Q)] = ?$ $\approx$ #SAT

+
+ +
+

Computing Probabilities

+ +

$P[\texttt{x} \wedge \texttt{y}] = P[\texttt{x}] \cdot P[\texttt{y}]$
(iff $\texttt{x}$ and $\texttt{y}$ are independent)

+ +

$P[\texttt{x} \wedge \texttt{y}] = 0$
(iff $\texttt{x}$ and $\texttt{y}$ are mutually exclusive)

+ +

$P[\texttt{x} \vee \texttt{y}] = 1- (1-P[\texttt{x}]) \cdot (1-P[\texttt{y}])$
(iff $\texttt{x}$ and $\texttt{y}$ are independent)

+ +

$P[\texttt{x} \vee \texttt{y}] = P[\texttt{x}] + P[\texttt{y}]$
(iff $\texttt{x}$ and $\texttt{y}$ are mutually exclusive)

+ +

Good enough to get us the probability of any boolean formula over mutually exclusive or independent variables

+ +

... and otherwise?

+
+ +
+

Shannon Expansion

+ +

For a boolean formula $f$ and variable $\texttt{x}$:

+ +

$$f = (\texttt{x} \wedge f[\texttt{x}\backslash T]) \vee (\neg \texttt{x} \wedge f[\texttt{x}\backslash F])$$

+ +

Disjunction of mutually-exclusive terms!

+

... each a conjunction of independent terms.

+

... and $\texttt{x}$ removed from $f$

+ +

Ok... just keep applying Shannon!

+

Each application creates 2 new formulas (ExpTime!)

+
+ +
+

Idea 2.A: Combine the two. Use Shannon expansion as long as time/resources permit, then use a #SAT approximation.

+ + Sprout: Lazy vs. eager query plans for tuple-independent probabilistic databases (Olteanu et al.) +
+
+ +
+
+

More Resources

+
+
MCDB
+
Sampling-based probabilistic databases
+ +
Sprout
+
"Any-time" Approximation.
+ +
Mimir
+
PL tricks to make ProbDBs faster
+ +
DeepDive
+
ProbDBs used in practice to populate Knowledge Bases.
+ +
Integrating and Ranking Uncertain Scientific Data
+
ProbDBs used in practice to predict gene expressions / propose experiments.
+
\ No newline at end of file