From 3d01a15434864fe03ca9bbf490d44664508bc06d Mon Sep 17 00:00:00 2001 From: Oliver Kennedy Date: Thu, 11 Apr 2024 22:50:35 -0500 Subject: [PATCH] final talk version --- src/talks/2024-04-12-UIC-script.txt | 45 +++ src/talks/2024-04-12-UIC.erb | 203 ++++++++++--- .../2024-04-12/DataframeAbstraction.svg | 280 ++++++++++++++++++ .../2024-04-12/Dependencies-State.svg | 229 ++++++++++++++ .../2024-04-12/NotebookExtensions.svg | 261 ++++++++++++++++ src/talks/graphics/logos/breadcrumb.png | Bin 0 -> 5753 bytes 6 files changed, 974 insertions(+), 44 deletions(-) create mode 100644 src/talks/2024-04-12-UIC-script.txt create mode 100644 src/talks/graphics/2024-04-12/DataframeAbstraction.svg create mode 100644 src/talks/graphics/2024-04-12/Dependencies-State.svg create mode 100644 src/talks/graphics/2024-04-12/NotebookExtensions.svg create mode 100644 src/talks/graphics/logos/breadcrumb.png diff --git a/src/talks/2024-04-12-UIC-script.txt b/src/talks/2024-04-12-UIC-script.txt new file mode 100644 index 00000000..8244de11 --- /dev/null +++ b/src/talks/2024-04-12-UIC-script.txt @@ -0,0 +1,45 @@ +Open farmersmarket_2024-42231059.xlsx + +Geotag: Lon - Y; Lat - X + +Geoplot + - 1. too much data + - 2. oops, flipped + +Alter geotag. See geoplot rerun + +Still too much data. Have a .shp file, but vizier doesn't support an adaptor. Python: +------------------- +# Extract County Shapes +import shapefile + +with shapefile.Reader("cb_2018_us_county_500k.zip") as sf: + #for field in sf.fields: + # print(field) + # Get object containing an empty dataset. 
+ ds = vizierdb.new_dataset() + ds.insert_column("county") + ds.insert_column("zip") + ds.insert_column("geometry", "geometry") + for entry in sf.shapeRecords(): + if entry.record[0] == '36': # 36 is NYS + row = [ entry.record[5], entry.record[4], entry.shape ] + #print(row) + ds.insert_row( row ) + ds.save("nys_counties") + ds.show() +------------------- + +Spatial join +------------------- +SELECT * +FROM nys_counties nys + JOIN wny_counties wny ON nys.county = wny.county +------------------- + +Add below, and name: usda_farmers_markets +------------------- + JOIN usda_farmers_markets f ON ST_CONTAINS(nys.geometry, f.geometry) +------------------- + +Watch updated chart \ No newline at end of file diff --git a/src/talks/2024-04-12-UIC.erb b/src/talks/2024-04-12-UIC.erb index 64897c03..e0476a91 100644 --- a/src/talks/2024-04-12-UIC.erb +++ b/src/talks/2024-04-12-UIC.erb @@ -304,7 +304,7 @@ end nbcell("if z:\n y = x + 2", idx: 2) end %> - +

If z == False:

Reads: $\{\;\textbf{z}\;\}$

Writes: $\{\;\;\}$

@@ -423,7 +423,6 @@ end "Bolt-on, Compact, and Rapid Program Slicing for Notebooks" (Shenkar et. al.; VLDB 2023) - (Similar ideas in Nodebook, etc...) @@ -446,10 +445,6 @@ end -
-

Vizier Demo

-
-
@@ -467,12 +462,6 @@ end

We need to be able to recover the kernel to any state.

-
-

Why have only one kernel?

- -

🤷

-
-
<%= notebook() do @@ -489,8 +478,17 @@ end
-

When is parallelism allowed?

-

When is a cell runnable?

+

Why have only one kernel?

+ +

🤷

+
+ +
+

Parallelism

+
@@ -522,14 +520,14 @@ end
Active if: $\forall (x \rightarrow \textbf{@i}) \in \texttt{DynamicReads} : \texttt{InState}[x] = \textbf{@i}$
$\texttt{OutState} = \texttt{InState} + \{\;x \rightarrow \textbf{@i}\;|\;\forall (x \rightarrow \textbf{@i}) \in \texttt{DynamicWrites}\;\}$
-
Stale
-
Active if: first run or $\exists (x \rightarrow \textbf{@i}) \in \texttt{DynamicReads} : \texttt{InState}[x] \neq \textbf{@i}$
-
$\texttt{OutState} = \texttt{InState} + \{\;x \rightarrow \textbf{???}\;|\;\forall x \in \texttt{StaticWrites}\;\}$
-
Runnable
Active if: $\forall x \in \texttt{StaticReads} : \texttt{InState}[x] \neq \textbf{???}$
$\texttt{OutState} = \texttt{InState} + \{\;x \rightarrow \textbf{???}\;|\;\forall x \in \texttt{StaticWrites}\;\}$
+
Stale
+
Active if: first run or $\exists (x \rightarrow \textbf{@i}) \in \texttt{DynamicReads} : \texttt{InState}[x] \neq \textbf{@i}$
+
$\texttt{OutState} = \texttt{InState} + \{\;x \rightarrow \textbf{???}\;|\;\forall x \in \texttt{StaticWrites}\;\}$
+
Unknown
Active otherwise.
$\texttt{OutState} = \texttt{InState} + \{\;x \rightarrow \textbf{???}\;|\;\forall x \in \texttt{StaticWrites}\;\}$
@@ -537,19 +535,19 @@ end
- - "The Right Tool for the Job: Data-Centric Workflows in Vizier" (Kennedy et. al.; IEEE DEB 2022) -
- -
-

Serial

- -

Parallel

- +
+

Serial

+ +
+
+

Parallel

+ +
"Runtime Provenance Refinement for Notebooks" (Deo et. al.; TaPP 2022)
+

Microkernel Notebooks

https://openclipart.com
@@ -580,6 +578,15 @@ end

🤷

+
+

Vizier Demo

+
+ +
+ + "The Right Tool for the Job: Data-Centric Workflows in Vizier" (Kennedy et al.; IEEE DEB 2022) +
+

Repeatable Spreadsheet Dataframe Editing

@@ -608,48 +615,156 @@ end
-

... but this requires migrating state.

+

... but this requires migrating state... across languages.

https://openclipart.com
-

State Management

+ +
+ +
+

Approach 1: Pickle

+ +

Python's native serialization support.

+ +
+
+
The Good
+
Easy
+
+
+
The Bad
+
Not everything is serializable†
+
Limited compatibility with ¬Python
+
Expensive for e.g., dataframes
+
+
+
+ +
+

Approach 2: JSON

+ +

Standard data interchange format.

+ +
+
+
The Good
+
Easy
+
Near universal platform compatibility
+
+
+
The Bad
+
Even less state is supported
+
Even more expensive for e.g., dataframes
+
Limited support for nuanced types (e.g., dates)
+
+
+
+ +
+

Approach 3: Arrow, Shapefile, Parquet, NPY

+ +

Specialized formats for specific datatypes.

+
+
+
The Good
+
High Performance
+
Precise, Well Typed
+
+
+
The Bad
+
Only one type of state is supported
+
+
+
+ +
+

Vizier (Now)

+ +

Vizier-level Typing.

    -
  • State needs to be checkpointed out of the process that created it.
  • -
  • State needs to be restored into the cell that is about to consume it.
  • +
  • Simple Data: JSON
  • +
  • Typed Data: Standard JSON Encoding
  • +
  • Special Data: 'Active' Data
  • +
  • Fallback: Pickle
- Naive approach: Pickle +

Active Data

- ... but pickle doesn't allow interop - ... but pickle doesn't always work (e.g., for 'File' objects) +

Datasets, Functions/Classes, etc...

+
    +
  • One concept, Many physical representations (Arrow, Parquet, CSV). +
      +
    • A cell interpreter may not support a representation.
    • +
    • Generating a standard representation can be expensive.
    • +
    +
  • +
  • State (e.g., Datasets) can get big. +
      +
    • An interpreter may not want/need to load the entire state.
    • +
    • Versioning all checkpoints becomes infeasible.
    • +
    +
  • +
- Interop: Define standards +

Desiderata

- - Primitive Values (int, float, date, etc...) - - Collection Types (map, list, etc...) - - Libraries - - Function [Challenge: Chained Dependencies] - - Dataframe/Series [Challenge: These are BIG] +
+

An abstraction that...

+
    +
  • ... represents the concept.
  • +
  • ... allows on-demand conversion between representations.
  • +
  • ... allows partial in-store interactions.
  • +
  • ... allows incremental changes.
  • +
+
+

Vizier's artifact store provides a thin wrapper around standards-compliant libraries (e.g., Apache Spark).

- +

"Active" Data

+
+
+

... but it's a lot of special case code.

+
+ +
+

Generalizing Active Data

+

(future work)

+ +
    +
  • What's the right abstraction?
  • +
  • Efficient type coercion (without $N^2$)
  • +
  • Microservice RPCs
  • +
  • Caching Strategies
  • +
+ +

Questions?

+
+ + -<%# +

https://vizierdb.info

-

Mike Brachmann, Boris Glavic, Nachiket Deo, Juliana Freire, Heiko Mueller, Sonia Castello, Munaf Arshad Qazi, William Spoth, Poonam Kumari, Soham Patel, and more...

+

Mike Brachmann, Boris Glavic, Nachiket Deo, Juliana Freire, Heiko Mueller, Sonia Castello, Munaf Arshad Qazi, William Spoth, Poonam Kumari, Nicholas Brown, Soham Patel, Thomas Slowe, and more...

+ + +
+ Supported by: + + +
- %> diff --git a/src/talks/graphics/2024-04-12/DataframeAbstraction.svg b/src/talks/graphics/2024-04-12/DataframeAbstraction.svg new file mode 100644 index 00000000..f1cda281 --- /dev/null +++ b/src/talks/graphics/2024-04-12/DataframeAbstraction.svg @@ -0,0 +1,280 @@ + + + +Vdbcreate artifactartifact @1 createdwrite dataSQLExportSummary diff --git a/src/talks/graphics/2024-04-12/Dependencies-State.svg b/src/talks/graphics/2024-04-12/Dependencies-State.svg new file mode 100644 index 00000000..07b36255 --- /dev/null +++ b/src/talks/graphics/2024-04-12/Dependencies-State.svg @@ -0,0 +1,229 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + 1 + State needs to be checkpointed. + + + + + + 2 + + State needs to be restored. + + + + diff --git a/src/talks/graphics/2024-04-12/NotebookExtensions.svg b/src/talks/graphics/2024-04-12/NotebookExtensions.svg new file mode 100644 index 00000000..63f00831 --- /dev/null +++ b/src/talks/graphics/2024-04-12/NotebookExtensions.svg @@ -0,0 +1,261 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + 1 + + Explore + + + + + + + + + + + + + + + 2 + Revise + + + + + diff --git a/src/talks/graphics/logos/breadcrumb.png b/src/talks/graphics/logos/breadcrumb.png new file mode 100644 index 0000000000000000000000000000000000000000..86de55c38b870cbd616c4559d0ff3edf6095bcb1 GIT binary patch literal 5753 zcmeHLhcg`P^Ot)OM}#11j*EV}XwfBzs7JVyi*P}7M+DJNE)hLMucx=@oZeqG>O~8u zM~%+uCH#EU-CRHYJDXkVFuct z1--oBCw;LW9(;pGaqa6_x`c@zHZn@ z{<9{xS~<7|vC022)WQ0#-QdQYWB=QYT478UDfd7YH&vm{PM;2ABBVva_4VNV>!^p-_FwgHt)AFNVXkMqH1e#XG1;P#Y=od%Yu>0f+PT`!E z!e~zKDGv=mv%~CSv48Q?czWA_j{yXVahDB+qhFT`zf0Deqrmg@`Pf$G+Y4UeyWA_77)jps~Ml< zXL1$DVlqPJmFi7?Y8O*nT9LB0-WrHIWSVXmB~mk4FX zjhf5GWAZV7+)F2G6lX_%1Ry{JyM>tu_FF%5wuN&8P+<`~fWjuDB@R0sGH(^!-vcSOT9RHa?$G_eP~U#sz{4t(`5 zTC5Ol`0qFr>F(t%K6Ru*x@YuSbtcv$hr{hoZoc5HF8^z{OF2pJNL<#y@BNFt=dYZp zKKtd1E`WTlo?ZI#I@$kSlI$@*5#|J&GrED-iU({wR@Pk&W-mV#8tJ+K*tjsriTRWg zp;9fW>gDu;}Ey@7`Qz9e&x*1DVCFZG4B)dA>M>l7@D 
z2S#Vaf4!Uu0Jzk=Gq5rC&U$p~Ienn$GKArsmKoRa%>6S1ECf~PBGHwi4sQ9tf}}US zY-`^-vj!qk4f!&H5ZUe@g$rEhyHYg4+G|@^`lX}I8v4K$(BV+LB0lkfrp8s^a?pPJ@W@UjNE%f?5x|A7Wq;wX>M2fQW z&vs5pwjC0V?bmM5X!Y@KFf+VTEb!i}vfjO-DSfb#ijU7Zv45(39X0<}#;@Cxs?$-e zA7Hc2TqMj}ZY7omMB;T0|0Ek{6tSr8BBENAqY3SaXBVBPRo71k3jXG+S91k^yVM&y z8a_>e8DTf~KslTW|hoT@|%Jf$}{`~wU%(Or z%R6fyT?zl1zXzqcyf_NO`LmQc;tMZJKwjP^j)q=i+hj{|)e&tk0a0St@^pFMe|H1y zD^_y#660=%c9E7LDGvzM4H}%6E|N~*xu4J;JZ@O<>zZq>-%{&Z_wIY;jhbKG0`s&O zPZ~v5j*&=>6=>IBYyP6SGxV+M`h`Qn+c+yT6u*97080q*;}53^^=JtLXSNnd{`cM# z%hr=_tEAmdR|3bM8F}jBvp2^vcA;HP+5-9w0kY1pIeOfiYnUg<;DvGYP{OP^(Wtua{WA(s+ z5sC~Q7$wFfu3d=fJAl1C1_DiVg|G0r-_0%S`Igy{@=9;dih((_aV~vBNzN2^9`0Bk zek=5QJz1WldADmiUb90yeW3~|i`x0jdu~7`t&S)Dv&m>f6VdSNDS(VZ2hkqBi)j^& zUfGVex}~9e--ohOEc2S6jIG_j+~ScWWrc|G041WWoVoi_vL2qte^p3E!95Q7m;Bc^ zuI3JdAYr1S-uG7z4YlH_l@hn%eodKkOfnzhHRHEGEa#UsQ*WDh zhp!~4jVj6e4dT||48gblh;|0;5yWXW&f$*f(s9Na=0FK+Atb|KOGT3|6T*;kdA*+5 z#Oqwe`S^$Jeh-W8Z2oH0#~wv0Zx@I2oq-*<9ovf%p=qV2B|3~T6_q5ndAC%*>r#np zOw1l5dQC@lRKW1KO9-B?Ug_(|`XOD~tYRteh1Jk_AyWL{SLVF~pTfB}wV%WtMrbBZ ztdT`FCexD7GWG&wYem!PGtoRsMQmulDsgUm;cS%^Nd-FmC->1o8fL|uNY~B_Br+GFq-MnJu!z@3UsTPJa&;{uhhWL5U*Q)w{O)m+C^Ab1-IpR zCn^dOaZ3@WS&S7w!=ouWX|tTWjoF=rH6Q0vWj{)Hv`G_TW_jt>H9;fPC!iYD#LBJj z(f+!eGreMV;gB2@e_T@Y=&_<+m#94w6Jjzan|k-FVFx}^h-FFo8XT{g$YoA>*|-zH z(o59JUnFA(!kPuXlon9Gw>T_XZN~Mp?v#@ghB9Ul+t3UXGM;^AUtE$ zkm%Kaz{Q2Q`cPqBP%=wq!94u=Z&+pD=lv#1x=C5f$*6H7CYf+}ml_8YZdiluA4?U( z>HQNjeMUW6P@~b_R1@65mz-4+?5E=cQe9U*7&72J{4tV-mtfV?umY{0+Ao5p(vFMr zUNo1?UiFvlCfqR0~<}Q*nGZc4m}Opl^i|q zT?ZoqM;EN5cm}qid9Iv@mdt;}zqbsMF1qbSnpF4BBcc`AQH{m0=?SOp!tFe0nuwhY zcpFu$k{a4N;%D@z{+VUcxY;+AaUjkHhJJY{;$NTfw=l5nE~c)*bz(!J^2r0z=kcYc(jDTcLgt{LQEWEZ1N`clg~IA`Np^{#E#bI23^z(ys4a!!)OT+We_NlS zt0U>UK`QOxk>k&d@^C3c`|7kAc+G?`VMZv7uwqNpzVUah?IKLHN$GaKUl^ZQoAk{j z`aWj&j?Zl=wA|8(lb>8_nHx@Mt#{>jk2EQWh`Ry3q-yO-rp?JQUKpiOoR}p%Yl#!V z{=y0ydapnatbGDEes``_Vv_17acp)5*{U{qV*PBpvTDZ1*`)F!1WbXe9J{=Zz-WVQLGoTh09tG~YnI zxUz4Z>-)7(8j+#2ImNBSg(cAb?-LWQ6uMDmw9*jx_|oa($5Y)?4;WBl6_R>iRiTN3 
z0kSvCDQGy+3BY34W3cp0Y_)3Kvqj`0gXpTFpYPJ-78{8d;WGu5zRQ*H)5gL8mU?12 z+_qRcRs+2AM+uQ;YsS%Quul=MDz}mHug!sy-DdXzF2M|0#4{)&X5Kz?#ykt`my!=S z&4+{8(=L|Bb}DL`yDatRQ>b&H<2B<&xinL$0tepcsx%bJ&$r?-!N!(=K@$+fOR+d-7} zdjrp__{LsLh~xA8uhCkD@nU=mJY}THO(ziqtjxp+trajF@{LbMt-G@#YXAV_8Nh^9 z+E~~Q2lda;R@?)tWDEmf*l6QVe{L>2TAFI*jQF4_89C1*nS7NyhD95n8)Qu;S`1En zdFS~+sSyvL;I!GpNzcvxmeP#?gxzxm@*_N_9j_*CE2#Q#e~{lV08os0GmT#VDsD5I z1(d!#CEe6+Zdiq$YmJ1bsAyX}y*rVq; z*;$hjjxJ+fQ5M_>L5gq@G0P{uG2K72hFye&pS{sTqJ53yMK}=C`e{yx*2iqj^-amG z4m;*Mg>D^R2(B)s2P1Xq{?CM_Uk(n%J}lRMBL>~KVig5^r4Qb%+XYn{^D50R@s(yy z_-PjG1k@KLZTjHGZ_ZSDE(iQq`E?n|(ldJ}9?WUR-z2zwWKC62TMqrILx@B}-SASA zqqeb}q_sf3966tVqasChB|C;W0J^uyP=o5+4u#p=snM6}vPp$On Hn@|4-8BHt| literal 0 HcmV?d00001