From 7b317cef400fb0804295f938c1d4a30496236975 Mon Sep 17 00:00:00 2001 From: Oliver Date: Sun, 19 Feb 2023 22:05:51 -0500 Subject: [PATCH] CornellDB Talk --- src/talks/2023-02-20-CornellDB.erb | 510 +++++++++++++++++++++++++++++ 1 file changed, 510 insertions(+) create mode 100644 src/talks/2023-02-20-CornellDB.erb diff --git a/src/talks/2023-02-20-CornellDB.erb b/src/talks/2023-02-20-CornellDB.erb new file mode 100644 index 00000000..2f03cd71 --- /dev/null +++ b/src/talks/2023-02-20-CornellDB.erb @@ -0,0 +1,510 @@ +--- +template: templates/talk_slides_v1.erb +title: "Microkernel Notebooks" +--- + +
+

μKernel Notebooks

+ +

Oliver Kennedy

+
University at Buffalo
+
+ +
+ +
+ +
+ +
+ +

Pimentel et al: "4.03% of notebooks on github are reproducible"

+
+ +
+ +

Joel Grus: "For beginners, with dozens of cells and more complex code [the ability to run code snippets out of order] is utterly confusing."

+
+
+ +
+ +
+ +
+

Nodebook

+ + + https://github.com/stitchfix/nodebook +
+ +
+ + https://openclipart.org +
+ +
+ +
+ +
+

A modest proposal...

+
+ +
+ + https://openclipart.org +
+ +
+

So now...

+
+ +
+ +
+ +
+ +
+ +
+

and...

+
+ +
+ +
+ +
+

and...

+
+ +
+ +

... or worse ...

+
+ +
+ +
+ +
+

and...

+
+ +
+ +
+ +
+

and...

+
+ +
+ +
+ +
+

Provenance

+ +
+ +
+

Why are you getting my hopes up?

+
+ +
+ + + +

The Vizier Notebook

+
+ +
+

... work in progress

+ +
+ +
+

+def social_link(base, provider = "facebook.com"):
+    if base is None:
+        return None
+    if base.startswith("http://"):
+        base = base.replace("http://", "https://")
+    if base.startswith("https://"):
+        return base
+    if base.startswith(provider) or base.startswith(f"www.{provider}"):
+        return "https://"+base
+    return f"https://{provider}/"+base
+
+vizierdb.export_module(social_link)
+	
+
+ +
+

+
+
+
+
+
+
+
+
+
+
+
+vizierdb.export_module(social_link)
+	
+
+ +
+

Explicit Exports

+ + + +

... but they're annoying

+
+ +
+

+		c = 19
+	
+ +

+		b = 23
+	
+ +

+		a = b + c
+	
+
+ +
+

Writes: c

+ +
+ +

Writes: b

+ +
+ +

Reads: b, c; Writes: a

+
+ +
+

🧹

+
+ +
+

Python's scoping rules are a mess.

+ +

+	x = 1
+	def foo():
+	  x = 2         
+	  def bar():
+	    print(x)
+	  return bar
+	x += 10
+	baz = foo()     
+	baz()           # What is printed?
+	
+ +

... fortunately we only care about cross-cell dependencies (for the most part).

+
+ +
+

+import urllib.request as r
+with r.urlopen("https://not.sus.com/code.py") as response:
+    eval( response.read() )
+	
+
+ +
+

 

+

???

+

... fortunately eval isn't a major part of notebook use.

+
+ +
+

+import pandas as pd
+pd.read_csv("myfile.csv")
+	
+
+ +
+

 

+

maybe safe???

+

... fortunately libraries are usually good at abstracting.

+
+ +
+

Idea: Optimistic Concurrency Control.

+ +

(Work in progress)

+
+ +
+

... work in progress

+ +
+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + +
SystemDependenciesExecutionParallelism
NotebookUnknownManualNone
WorkflowsFully KnownDAG
VizierBounded+Trace???
+
+ +
+

State

+ +
+ +
+

How do we know when it is safe to reuse a result?

+

How do we know what is safe to parallelize?

+
+ +
+

State?

+
+ +
+

+		df = pd.read_csv("foo.csv")
+	
+ +
    +
  1. The cell runs
  2. +
  3. The object returned by pd.read_csv is serialized and stored in a persistent store. (Artifact)
  4. +
  5. The persistent store assigns the serialized object an identifier. (Version)
  6. +
  7. The state is updated with a mapping from symbol df to the identifier. (Variable)
  8. +
+ +
+ +
+

State

+ + $$\Sigma \rightarrow \mathbb N \cup \{ \emptyset \}$$ +

(variable → version)

+

(e.g., $\{ retail \rightarrow 937, markets \rightarrow 252 \}$)

+
+ +
+

Cell History

+ +
+
Last Read
+
$2^{\Sigma \times \mathbb N}$ (e.g., $\{ retail \rightarrow 937 \}$)
+ +
Last Write
+
$2^{\Sigma \times (\mathbb N \cup \{\emptyset\})}$ (e.g., $\{ farmstands \rightarrow 939 \}$)
+
+
+ +
+

Execution

+ +
+
Current State
+
$\{ retail \rightarrow 946, markets \rightarrow 252 \}$
+ +
+
Last Read
+
$\{ retail \rightarrow 937 \}$
+
+
+
+ +
+

Execution

+ +
+
Current State
+
$\{ retail \rightarrow 937, markets \rightarrow 252 \}$
+ +
Last Read
+
$\{ retail \rightarrow 937 \}$
+ +
+
Last Write
+
$\{ farmstands \rightarrow 939 \}$
+
+ +
+
Next State
+
$\{ retail \rightarrow 937, markets \rightarrow 252, farmstands \rightarrow 939 \}$
+
+
+
+ +
+

Cell Dependencies

+ +
+
Could Read
+
$2^{\Sigma}$ (e.g., $\{retail\}$ or $everything$)
+ +
Could Write
+
$2^{\Sigma}$
+
+
+ +
+

Execution

+ +
+
Current State
+
$\{ markets \rightarrow 252 \}$
+ +
+
Could Write
+
$\{ retail \}$
+
+ +
+
Next State
+
$\{ retail \rightarrow ?, markets \rightarrow 252 \}$
+
+
+
+ +
+

Execution

+ +
+
Current State
+
$\{ retail \rightarrow ?, markets \rightarrow 252 \}$
+ +
+
Last Read
+
$\{ retail \rightarrow 937 \}$
+
(i.e., State Unknown)
+
+
+
+ +
+

Scheduling

+ +

is the cell...

+ +
+ +
+

Future/Work in progress...

+ + +
+ + + +
+ + +

https://vizierdb.info

+
+ +

Mike Brachmann, Boris Glavic, Nachiket Deo, Stefan Muller, Juliana Freire, Heiko Mueller, Sonia Castello, Munaf Arshad Qazi, William Spoth, Poonam Kumari, Soham Patel, and more...

+