Edits
This commit is contained in:
parent
fdff835c63
commit
f6cb33c19e
|
@ -0,0 +1,274 @@
|
|||
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||
<!-- Created with Inkscape (http://www.inkscape.org/) -->
|
||||
<svg
|
||||
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
|
||||
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
|
||||
xmlns="http://www.w3.org/2000/svg"
|
||||
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
|
||||
xmlns:cc="http://creativecommons.org/ns#"
|
||||
xmlns:xlink="http://www.w3.org/1999/xlink"
|
||||
xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||
xmlns:svg="http://www.w3.org/2000/svg"
|
||||
xmlns:ns1="http://sozi.baierouge.fr"
|
||||
id="svg2"
|
||||
sodipodi:docname="package.svg"
|
||||
viewBox="0 0 188.33 197.75"
|
||||
sodipodi:version="0.32"
|
||||
version="1.0"
|
||||
inkscape:output_extension="org.inkscape.output.svg.inkscape"
|
||||
inkscape:version="0.46"
|
||||
>
|
||||
<sodipodi:namedview
|
||||
id="base"
|
||||
bordercolor="#666666"
|
||||
inkscape:pageshadow="2"
|
||||
guidetolerance="10.0"
|
||||
pagecolor="#ffffff"
|
||||
gridtolerance="10.0"
|
||||
inkscape:zoom="2.8284272"
|
||||
objecttolerance="10.0"
|
||||
borderopacity="1.0"
|
||||
inkscape:current-layer="svg2"
|
||||
inkscape:cx="142.58555"
|
||||
inkscape:guide-bbox="true"
|
||||
inkscape:cy="108.73565"
|
||||
inkscape:window-y="-3"
|
||||
inkscape:window-x="-5"
|
||||
inkscape:window-height="992"
|
||||
showgrid="false"
|
||||
inkscape:pageopacity="0.0"
|
||||
inkscape:window-width="1680"
|
||||
showguides="true"
|
||||
>
|
||||
<sodipodi:guide
|
||||
id="guide2886"
|
||||
position="555.07882,515.48084"
|
||||
orientation="0,1"
|
||||
/>
|
||||
<sodipodi:guide
|
||||
id="guide3436"
|
||||
position="1801,402.99999"
|
||||
orientation="1,0"
|
||||
/>
|
||||
</sodipodi:namedview
|
||||
>
|
||||
<defs
|
||||
id="defs5"
|
||||
>
|
||||
<linearGradient
|
||||
id="linearGradient3498"
|
||||
y2="722.69"
|
||||
gradientUnits="userSpaceOnUse"
|
||||
x2="1800.9"
|
||||
gradientTransform="matrix(2.5111 0 0 2.5111 -4465.6 -1731)"
|
||||
y1="721.07"
|
||||
x1="1779.5"
|
||||
inkscape:collect="always"
|
||||
>
|
||||
<stop
|
||||
id="stop3359"
|
||||
style="stop-color:#bb9652"
|
||||
offset="0"
|
||||
/>
|
||||
<stop
|
||||
id="stop3361"
|
||||
style="stop-color:#d4bf8a"
|
||||
offset="1"
|
||||
/>
|
||||
</linearGradient
|
||||
>
|
||||
<linearGradient
|
||||
id="linearGradient3501"
|
||||
y2="707.71"
|
||||
gradientUnits="userSpaceOnUse"
|
||||
x2="1828.8"
|
||||
gradientTransform="matrix(2.5111 0 0 2.5111 -4464.8 -1730.2)"
|
||||
y1="754.31"
|
||||
x1="1834"
|
||||
inkscape:collect="always"
|
||||
>
|
||||
<stop
|
||||
id="stop3375"
|
||||
style="stop-color:#a28650"
|
||||
offset="0"
|
||||
/>
|
||||
<stop
|
||||
id="stop3377"
|
||||
style="stop-color:#dac491"
|
||||
offset="1"
|
||||
/>
|
||||
</linearGradient
|
||||
>
|
||||
<linearGradient
|
||||
id="linearGradient3504"
|
||||
y2="695.62"
|
||||
gradientUnits="userSpaceOnUse"
|
||||
x2="1835.6"
|
||||
gradientTransform="matrix(2.5111 0 0 2.5111 -4464.8 -1730.2)"
|
||||
y1="704.37"
|
||||
x1="1788.1"
|
||||
inkscape:collect="always"
|
||||
>
|
||||
<stop
|
||||
id="stop3367"
|
||||
style="stop-color:#dccb94"
|
||||
offset="0"
|
||||
/>
|
||||
<stop
|
||||
id="stop3369"
|
||||
style="stop-color:#c4a95a"
|
||||
offset="1"
|
||||
/>
|
||||
</linearGradient
|
||||
>
|
||||
</defs
|
||||
>
|
||||
<path
|
||||
id="path2894"
|
||||
style="fill-rule:evenodd;fill:#d3bc5f"
|
||||
d="m110.29 1.9932l76.29 32.459v112.91l-76.29-56.297v-89.07z"
|
||||
/>
|
||||
<path
|
||||
id="path2896"
|
||||
style="fill-rule:evenodd;fill:url(#linearGradient3504)"
|
||||
d="m1.5117 18.829l55.887 43.794 129.18-28.171-76.29-32.459-108.78 16.836z"
|
||||
/>
|
||||
<path
|
||||
id="path2898"
|
||||
style="fill-rule:evenodd;fill:url(#linearGradient3501)"
|
||||
d="m57.399 62.623v133.77l129.18-49.03v-112.91l-129.18 28.173z"
|
||||
/>
|
||||
<path
|
||||
id="path2900"
|
||||
style="fill-rule:evenodd;fill:url(#linearGradient3498)"
|
||||
d="m0.6238 17.941l55.887 43.794v133.78l-55.887-76.02-0.0002-101.56z"
|
||||
/>
|
||||
<path
|
||||
id="path2575"
|
||||
sodipodi:nodetypes="cc"
|
||||
style="stroke:#784421;stroke-width:2.5111px;fill:none"
|
||||
d="m60.085 10.357l68.255 36.476"
|
||||
/>
|
||||
<path
|
||||
id="path3353"
|
||||
style="stroke:#c7a768;stroke-width:2.5111px;fill:none"
|
||||
d="m1.5695 18.833l55.872 43.631 129.64-27.936"
|
||||
/>
|
||||
<path
|
||||
id="path2577"
|
||||
sodipodi:nodetypes="cccccccccccc"
|
||||
style="opacity:.42169;fill-rule:evenodd;fill-opacity:.36486;stroke:#ffffff;stroke-width:.50222;fill:#34dbdb"
|
||||
d="m140.9 44.454l0.19 10.79-3.84-6.513-3.06 7.534-2.91-6.278-3.68 6.513-1.73-3.767-3.92 6.592-1.57-5.022-4.63 7.533-0.36-11.849 25.51-5.533z"
|
||||
/>
|
||||
<path
|
||||
id="path3355"
|
||||
style="stroke:#c7a768;stroke-width:2.5111px;fill:none"
|
||||
d="m57.128 62.15v134.34"
|
||||
/>
|
||||
<path
|
||||
id="path3349"
|
||||
sodipodi:nodetypes="ccccc"
|
||||
style="opacity:.5;fill-rule:evenodd;stroke:#ffffff;stroke-width:.50222;fill:#a9c7c9"
|
||||
d="m115.35 49.908l-66.226-38.451 21.03-3.139 70.936 36.097-25.74 5.493z"
|
||||
/>
|
||||
<path
|
||||
id="path3351"
|
||||
sodipodi:nodetypes="ccccccc"
|
||||
style="stroke:#6c5434;stroke-width:2.5111px;fill:none"
|
||||
d="m1.2556 18.833l0.3139 101.39 54.93 75.65 130.26-48.66-0.31-113.31-76.59-32.644-108.6 17.577z"
|
||||
/>
|
||||
<metadata
|
||||
>
|
||||
<rdf:RDF
|
||||
>
|
||||
<cc:Work
|
||||
>
|
||||
<dc:format
|
||||
>image/svg+xml</dc:format
|
||||
>
|
||||
<dc:type
|
||||
rdf:resource="http://purl.org/dc/dcmitype/StillImage"
|
||||
/>
|
||||
<cc:license
|
||||
rdf:resource="http://creativecommons.org/licenses/publicdomain/"
|
||||
/>
|
||||
<dc:publisher
|
||||
>
|
||||
<cc:Agent
|
||||
rdf:about="http://openclipart.org/"
|
||||
>
|
||||
<dc:title
|
||||
>Openclipart</dc:title
|
||||
>
|
||||
</cc:Agent
|
||||
>
|
||||
</dc:publisher
|
||||
>
|
||||
<dc:title
|
||||
>Cardboard box / package</dc:title
|
||||
>
|
||||
<dc:date
|
||||
>2009-05-21T05:05:20</dc:date
|
||||
>
|
||||
<dc:description
|
||||
>A cardboard packing box with tape holding it shut. Classic packing box / software package image. Enjoy!</dc:description
|
||||
>
|
||||
<dc:source
|
||||
>https://openclipart.org/detail/26101/cardboard-box-/-package-by-kliponius</dc:source
|
||||
>
|
||||
<dc:creator
|
||||
>
|
||||
<cc:Agent
|
||||
>
|
||||
<dc:title
|
||||
>Kliponius</dc:title
|
||||
>
|
||||
</cc:Agent
|
||||
>
|
||||
</dc:creator
|
||||
>
|
||||
<dc:subject
|
||||
>
|
||||
<rdf:Bag
|
||||
>
|
||||
<rdf:li
|
||||
>box</rdf:li
|
||||
>
|
||||
<rdf:li
|
||||
>cardboard</rdf:li
|
||||
>
|
||||
<rdf:li
|
||||
>icon</rdf:li
|
||||
>
|
||||
<rdf:li
|
||||
>package</rdf:li
|
||||
>
|
||||
<rdf:li
|
||||
>packaging</rdf:li
|
||||
>
|
||||
</rdf:Bag
|
||||
>
|
||||
</dc:subject
|
||||
>
|
||||
</cc:Work
|
||||
>
|
||||
<cc:License
|
||||
rdf:about="http://creativecommons.org/licenses/publicdomain/"
|
||||
>
|
||||
<cc:permits
|
||||
rdf:resource="http://creativecommons.org/ns#Reproduction"
|
||||
/>
|
||||
<cc:permits
|
||||
rdf:resource="http://creativecommons.org/ns#Distribution"
|
||||
/>
|
||||
<cc:permits
|
||||
rdf:resource="http://creativecommons.org/ns#DerivativeWorks"
|
||||
/>
|
||||
</cc:License
|
||||
>
|
||||
</rdf:RDF
|
||||
>
|
||||
</metadata
|
||||
>
|
||||
</svg
|
||||
>
|
After Width: | Height: | Size: 7.1 KiB |
|
@ -264,7 +264,7 @@
|
|||
<ul>
|
||||
<li>Schema matching</li>
|
||||
<li>Deduplication</li>
|
||||
<li>Format alignment (GIS coordinates, $ vs €)
|
||||
<li>Format alignment (GIS coordinates, $ vs €)</li>
|
||||
<li>Precision alignment (State vs County)</li>
|
||||
</ul>
|
||||
<p>
|
||||
|
@ -893,18 +893,21 @@
|
|||
</section>
|
||||
-->
|
||||
<section>
|
||||
<h3>VGTerm</h3>
|
||||
<p>$VGTerm(\ldots)$ constructs new variables<br/>(it's a skolem function)</p>
|
||||
<h3>VGTerms</h3>
|
||||
<p>A $VGTerm(\ldots)$ references configuration parameters<br/>(aka "variables").</p>
|
||||
<ul>
|
||||
<li class="fragment">$VGTerm('X')$ constructs a new variable $X$</li>
|
||||
<li class="fragment">$VGTerm('X', 1)$ constructs a new variable $X_{1}$</li>
|
||||
<li class="fragment">$VGTerm('X', B)$ evaluates $B$ and then constructs a new variable $X_{B}$</li>
|
||||
<li class="fragment">$VGTerm('X')$ references the variable $X$</li>
|
||||
<li class="fragment">$VGTerm('X', 1)$ references the variable $X_{1}$</li>
|
||||
<li class="fragment">$VGTerm('X', B)$ evaluates $B$ and then references the variable $X_{B}$</li>
|
||||
</ul>
|
||||
<citation>Lenses: An On-Demand Approach to ETL; Yang et al.; VLDB 2015</citation>
|
||||
<aside class="notes">
|
||||
This is basically a Skolem function.
|
||||
</aside>
|
||||
</section>
|
||||
|
||||
<section>
|
||||
<p>$VGTerm()$s behave like normal expressions</p>
|
||||
<p>$VGTerm()$s can be used like normal expressions</p>
|
||||
<pre><code>
|
||||
SELECT A, VGTerm('X', B) AS C FROM R;
|
||||
</code></pre>
|
||||
|
@ -914,16 +917,16 @@
|
|||
<thead>
|
||||
<tr><th>R |</th><th>A</th><th>B</th></tr>
|
||||
</thead><tbody>
|
||||
<tr><td align="right">|</td><td>1</td><td>2</th></tr>
|
||||
<tr><td align="right">|</td><td>3</td><td>4</th></tr>
|
||||
<tr><td align="right">|</td><td>5</td><td>4</th></tr>
|
||||
<tr><td align="right">|</td><td>1</td><td>2</td></tr>
|
||||
<tr><td align="right">|</td><td>3</td><td>4</td></tr>
|
||||
<tr><td align="right">|</td><td>5</td><td>4</td></tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<table style="float: right" class="fragment" data-fragment-index="2">
|
||||
<tr><th>A</th><th>C</th></tr>
|
||||
<tr><td>1</td><td>$X_2$</th></tr>
|
||||
<tr><td>3</td><td>$X_4$</th></tr>
|
||||
<tr><td>5</td><td>$X_4$</th></tr>
|
||||
<tr><td>1</td><td>$X_2$</td></tr>
|
||||
<tr><td>3</td><td>$X_4$</td></tr>
|
||||
<tr><td>5</td><td>$X_4$</td></tr>
|
||||
</table>
|
||||
</div></center>
|
||||
<div style="clear: both;"> </div>
|
||||
|
@ -932,6 +935,7 @@
|
|||
</p>
|
||||
</section>
|
||||
|
||||
<!--
|
||||
<section>
|
||||
<p>Mimir defines a synthetic $ROWID$ value guaranteed to be unique for each row of a query.</p>
|
||||
<pre><code>
|
||||
|
@ -956,9 +960,10 @@
|
|||
</table>
|
||||
</div></center>
|
||||
</section>
|
||||
-->
|
||||
|
||||
<section>
|
||||
<h3>Schema Matching<h3>
|
||||
<h3>Schema Matching</h3>
|
||||
<div style="font-size: 16pt">
|
||||
$$ratings2(pid, num\_ratings, evaluation) \rightarrow (pid, rating)$$
|
||||
</div>
|
||||
|
@ -972,10 +977,13 @@
|
|||
END AS rating
|
||||
FROM ratings2;
|
||||
</code></pre>
|
||||
<p class="fragment" style="font-size: 18pt;">
|
||||
One global configuration variable decides which column gets mapped to "rating".
|
||||
</p>
|
||||
</section>
|
||||
|
||||
<section>
|
||||
<h3>Missing Value Imputation<h3>
|
||||
<h3>Missing Value Imputation</h3>
|
||||
<div style="font-size: 16pt">
|
||||
$$ratings1(pid, rating, review\_ct) \text{ s.t. } rating \text{ is not NULL}$$
|
||||
</div>
|
||||
|
@ -989,15 +997,157 @@
|
|||
review_ct
|
||||
FROM ratings1;
|
||||
</code></pre>
|
||||
<p class="fragment" style="font-size: 18pt;">
|
||||
A family of variables indexed by <tt>ROWID</tt> represents the imputed values.
|
||||
</p>
|
||||
</section>
|
||||
|
||||
</section>
|
||||
|
||||
<section>
|
||||
<section>
|
||||
<h2>Defining Configurations</h2>
|
||||
|
||||
<svg width="700" height="400">
|
||||
|
||||
<image
|
||||
xlink:href="graphics/hawk88-personal-information.svg"
|
||||
width="89" height="85"
|
||||
x="0" y="170"
|
||||
/>
|
||||
<text x="10" y="220" style="font-size: 22px; font-weight: bold;">Config.</text>
|
||||
|
||||
<g class="fragment" data-fragment-index="1">
|
||||
<polyline
|
||||
style="
|
||||
fill: rgba(0, 0, 0, 0);
|
||||
stroke: rgba(0, 0, 0, 1);
|
||||
stroke-width: 3;
|
||||
"
|
||||
points="95,210 200,210 95,210 200,70 95,210 200,350"
|
||||
/>
|
||||
<image
|
||||
xlink:href="graphics/Kliponius-Cardboard-box-package.svg"
|
||||
width="94" height="99"
|
||||
x="220" y="20"
|
||||
/>
|
||||
<text x="240" y="80" style="font-size: 22px; font-weight: bold;">Model</text>
|
||||
|
||||
<image
|
||||
xlink:href="graphics/Kliponius-Cardboard-box-package.svg"
|
||||
width="94" height="99"
|
||||
x="220" y="160"
|
||||
/>
|
||||
<text x="240" y="220" style="font-size: 22px; font-weight: bold;">Model</text>
|
||||
|
||||
<image
|
||||
xlink:href="graphics/Kliponius-Cardboard-box-package.svg"
|
||||
width="94" height="99"
|
||||
x="220" y="300"
|
||||
/>
|
||||
<text x="240" y="360" style="font-size: 22px; font-weight: bold;">Model</text>
|
||||
</g>
|
||||
|
||||
<g>
|
||||
<g class="fragment" data-fragment-index="2">
|
||||
<polyline
|
||||
style="
|
||||
fill: rgba(0, 0, 0, 0);
|
||||
stroke: rgba(0, 0, 0, 1);
|
||||
stroke-width: 3;
|
||||
"
|
||||
points="340,210 380,210"
|
||||
/>
|
||||
<text x="385" y="216" style="font-size: 18px; font-weight: bold;">All assignments for one family.</text>
|
||||
</g>
|
||||
<g class="fragment" data-fragment-index="3">
|
||||
<polyline
|
||||
style="
|
||||
fill: rgba(0, 0, 0, 0);
|
||||
stroke: rgba(0, 0, 0, 1);
|
||||
stroke-width: 3;
|
||||
"
|
||||
points="340,210 380,180"
|
||||
/>
|
||||
<text x="385" y="186" style="font-size: 18px; font-weight: bold;">Description of the family in English.</text>
|
||||
</g>
|
||||
<g class="fragment" data-fragment-index="4">
|
||||
<polyline
|
||||
style="
|
||||
fill: rgba(0, 0, 0, 0);
|
||||
stroke: rgba(0, 0, 0, 1);
|
||||
stroke-width: 3;
|
||||
"
|
||||
points="340,210 380,240"
|
||||
/>
|
||||
<text x="385" y="246" style="font-size: 18px; font-weight: bold;">Other feasible assignments.</text>
|
||||
</g>
|
||||
</g>
|
||||
|
||||
<g class="fragment" data-fragment-index="4">
|
||||
<image
|
||||
xlink:href="graphics/hawk88-personal-information.svg"
|
||||
width="89" height="85"
|
||||
x="0" y="20"
|
||||
/>
|
||||
<text x="10" y="70" style="font-size: 22px; font-weight: bold;">Config.</text>
|
||||
<image
|
||||
xlink:href="graphics/hawk88-personal-information.svg"
|
||||
width="89" height="85"
|
||||
x="0" y="310"
|
||||
/>
|
||||
<text x="10" y="360" style="font-size: 22px; font-weight: bold;">Config.</text>
|
||||
<polyline
|
||||
style="
|
||||
fill: rgba(0, 0, 0, 0);
|
||||
stroke: rgba(0, 0, 0, 1);
|
||||
stroke-width: 3;
|
||||
"
|
||||
points="95,70 200,210 95,350 200,350 95,70 200,70 95,350"
|
||||
/>
|
||||
</g>
|
||||
|
||||
<g class="fragment" data-fragment-index="5">
|
||||
<text x="15" y="195" style="font-size: 22px; font-weight: bold;">(Best)</text>
|
||||
</g>
|
||||
|
||||
</svg>
|
||||
|
||||
<p class="fragment" data-fragment-index="5" style="font-size: 16pt;">
|
||||
Models designate one "best-guess" configuration.
|
||||
</p>
|
||||
</section>
|
||||
|
||||
<section>
|
||||
<h3>Non-Deterministic Queries</h3>
|
||||
<h3>Example Models</h3>
|
||||
|
||||
<ul>
|
||||
<li>VG-Terms create non-deterministic branch-points in queries.</li>
|
||||
<li>Non-deterministic branch points are uniquely identified.</li>
|
||||
<li>Imputation using a SparkML classifier</li>
|
||||
<li>Heuristic detection of order-by columns for interpolation</li>
|
||||
<li>Schema matching based on edit-distance</li>
|
||||
<li>MayBMS-style probabilistic repair-key</li>
|
||||
<li>And more...</li>
|
||||
</ul>
|
||||
<p class="fragment">... so which branch gets taken?</p>
|
||||
</section>
|
||||
|
||||
<section>
|
||||
<h3>Convenience Operators: Lenses</h3>
|
||||
|
||||
<p>Lenses instantiate/train a model and wrap a query</p>
|
||||
<ul style="font-size: 16pt">
|
||||
<li>Domain Constraint Repair / Missing Value Imputation †</li>
|
||||
<li>Schema Matching †</li>
|
||||
<li>Sequence Repair</li>
|
||||
<li>Key Repair</li>
|
||||
<li>Arbitrary Choice</li>
|
||||
<li>Type Detection *</li>
|
||||
<li>Header Detection *</li>
|
||||
<li>JSON Shredder *</li>
|
||||
</ul>
|
||||
<citation>
|
||||
†Lenses: An On-Demand Approach to ETL; Yang et al.; VLDB 2015<br/>
|
||||
*Adaptive Schema Databases; Spoth et al.; CIDR 2017
|
||||
</citation>
|
||||
</section>
|
||||
|
||||
</section>
|
||||
|
@ -1006,52 +1156,32 @@
|
|||
|
||||
<section>
|
||||
<h3>Uncertainty as Provenance</h3>
|
||||
|
||||
<p style="font-size: smaller;">(aka fun with query compilers)</p>
|
||||
</section>
|
||||
|
||||
<section>
|
||||
<p>Data Curation ends up with <b><i>one</i></b> canonical <i>good</i> dataset.</p>
|
||||
|
||||
<p class="fragment">Mimir picks one "best-guess" configuration</p>
|
||||
<p class="fragment">Mimir starts with the default "guess" configuration.</p>
|
||||
|
||||
<p class="fragment">As users explore, they validate or refine guesses for configuration variables as necessary.</p>
|
||||
|
||||
<aside class="notes">
|
||||
It's worth noting here that this allows users to use the same tool (or at least the same backend) for analytics, exploration, ... the entire analytics workflow.
|
||||
</aside>
|
||||
</section>
|
||||
|
||||
<section>
|
||||
<h2>But...</h2>
|
||||
<h3>Are there any <u>relevant</u> configuration variables that I haven't validated yet?</h3>
|
||||
</section>
|
||||
|
||||
<section>
|
||||
<h3>We might be wrong!?!</h3>
|
||||
|
||||
<p>If there's a possibility we might be wrong we need to...</p>
|
||||
<ul>
|
||||
<li>... communicate the fact to users.</li>
|
||||
<li>... help users understand why.</li>
|
||||
<li>... help users to fix it.</li>
|
||||
</ul>
|
||||
</section>
|
||||
|
||||
<section>
|
||||
<p>Each VG-Term family is associated with a <i>Model</i> object that facilitates introspection.
|
||||
<ul>
|
||||
<li>Selecting Best Guesses</li>
|
||||
<li class="fragment">Enumerating/Sampling alternatives</li>
|
||||
<li class="fragment">Generating Human-Readable descriptions of the branch</li>
|
||||
</ul>
|
||||
</p>
|
||||
</section>
|
||||
|
||||
<section>
|
||||
<h3 style="margin-bottom: 50px">Uncertainty as Provenance</h3>
|
||||
|
||||
<ul>
|
||||
<li class="fragment">Replace each VG-Term with a "best-guess" lookup function.</li>
|
||||
<li class="fragment">Best guess values are "tagged" with nondeterminsm.
|
||||
<ol>
|
||||
<li class="fragment">Which result cells/rows depend on tagged inputs?</li>
|
||||
<li class="fragment">What are the tag dependencies for a specific result?</li>
|
||||
<li class="fragment">How much do cells/rows depend on specific tags?</li>
|
||||
</ol></li>
|
||||
</ul>
|
||||
|
||||
<ol>
|
||||
<li>How much of my query result is affected by unvalidated variables?</li>
|
||||
<li class="fragment">Which variables affect my query results?</li>
|
||||
<li class="fragment">How bad is the situation?</li>
|
||||
</ol>
|
||||
</section>
|
||||
|
||||
</section>
|
||||
|
@ -1059,9 +1189,18 @@
|
|||
<section>
|
||||
|
||||
<section>
|
||||
<h3>Which cells/rows depend on tagged inputs?</h3>
|
||||
<h3>How much of my query result is affected by unvalidated variables?</h3>
|
||||
|
||||
<p class="fragment"><b>Idea:</b> Extend query schemas with taint annotations.</p>
|
||||
<p class="fragment"><b>Idea:</b> Mark values in query results that depend on unvalidated variables.</p>
|
||||
</section>
|
||||
|
||||
<section>
|
||||
<img src="graphics/console_results.png" />
|
||||
<citation>Communicating Data Quality in On-Demand Curation; Kumari et al.; QDB 2016</citation>
|
||||
</section>
|
||||
|
||||
<section>
|
||||
<img src="graphics/console_plot.png" />
|
||||
</section>
|
||||
|
||||
<section>
|
||||
|
@ -1077,8 +1216,21 @@
|
|||
TRUE AS B_TAINTED
|
||||
FROM R;
|
||||
</code></pre>
|
||||
<p class="fragment" style="font-size: smaller;">Add <tt>*_TAINTED</tt> fields to each row.</p>
|
||||
<p style="font-size: smaller;">The Mimir compiler adds <tt>*_TAINTED</tt> fields to each row.</p>
|
||||
</section>
|
||||
|
||||
<section>
|
||||
<h3>Non-Determinism Taint</h3>
|
||||
|
||||
<dl>
|
||||
<dt>A row is untainted if...</dt>
|
||||
<dd>... we can guarantee that it (or a counterpart) appears in the result regardless of configuration.</dd>
|
||||
<dt>A cell is untainted if...</dt>
|
||||
<dd>... we can guarantee that its value in the result is independent of the configuration.</dd>
|
||||
</dl>
|
||||
</section>
|
||||
|
||||
|
||||
<section>
|
||||
<h3>Non-Determinism Taint</h3>
|
||||
<pre><code>
|
||||
|
@ -1096,7 +1248,7 @@
|
|||
(B IS NULL) AS B_TAINTED
|
||||
FROM R;
|
||||
</code></pre>
|
||||
<p class="fragment" style="font-size: smaller;">Sometimes outputs are independent of VGTerms.</p>
|
||||
<p style="font-size: smaller;">Expressions with VGTerms can be conditionally tainted.</p>
|
||||
</section>
|
||||
|
||||
<section>
|
||||
|
@ -1109,6 +1261,7 @@ CREATE VIEW R_CLEANED AS
|
|||
</code></pre>
|
||||
</section>
|
||||
|
||||
<!--
|
||||
<section>
|
||||
<h3>Non-Determinism Taint</h3>
|
||||
<pre><code>
|
||||
|
@ -1122,6 +1275,7 @@ CREATE VIEW R_CLEANED AS
|
|||
</code></pre>
|
||||
<p class="fragment" style="font-size: smaller;">Selections can potentially taint rows.</p>
|
||||
</section>
|
||||
-->
|
||||
|
||||
<section>
|
||||
<h3>Non-Determinism Taint</h3>
|
||||
|
@ -1132,29 +1286,32 @@ CREATE VIEW R_CLEANED AS
|
|||
<pre><code>
|
||||
SELECT A, SUM(B) AS B,
|
||||
FALSE AS A_TAINTED,
|
||||
GROUP_OR(A_TAINTED OR B_TAINTED OR ROW_TAINTED) AS B_TAINTED
|
||||
GROUP_OR(B_TAINTED OR ROW_TAINTED)
|
||||
OR (SELECT GROUP_OR(A_TAINTED) FROM R_CLEANED) AS B_TAINTED,
|
||||
GROUP_AND(A_TAINTED OR ROW_TAINTED) AS ROW_TAINTED
|
||||
FROM R_CLEANED;
|
||||
</code></pre>
|
||||
<p class="fragment" style="font-size: smaller;">Aggregates: Group-taint affects rows, not group-by attrs.</p>
|
||||
<p style="font-size: smaller;">Aggregates work too!</p>
|
||||
</section>
|
||||
|
||||
<section>
|
||||
<h3>Taint Benefits</h3>
|
||||
<ul>
|
||||
<li class="fragment">Much faster than classical Prob. DBs (comparable to deterministic queries).</li>
|
||||
<li class="fragment">At-a-glance visual of how bad your data is.</li>
|
||||
<li class="fragment">Can help to focus subsequent analysis.</li>
|
||||
</ul>
|
||||
</section>
|
||||
|
||||
<section>
|
||||
<h3>Taint Limitations</h3>
|
||||
<ul>
|
||||
<li>Taint is <span style="color: grey;">(probably)</span> C-Sound, but <b>not</b> C-Complete.</li>
|
||||
<li>Taint on group-by aggregates can be misleading.</li>
|
||||
<li>Taint does not work well with set difference.</li>
|
||||
<li class="fragment" data-fragment-index="1">Taint is <span style="color: grey; font-size: smaller;">(probably *)</span> C-Sound, but <span style="color: grey; font-size: smaller;">(usually *)</span> not C-Complete.<br/></li>
|
||||
<li class="fragment">Taint on group-by aggregates can be misleading.</li>
|
||||
<li class="fragment">Taint does not work well with set difference.</li>
|
||||
</ul>
|
||||
<p class="fragment">In spite of this, taint works well in practice.</p>
|
||||
</section>
|
||||
|
||||
<section>
|
||||
<img src="graphics/console_results.png" />
|
||||
</section>
|
||||
|
||||
<section>
|
||||
<img src="graphics/console_plot.png" />
|
||||
<citation class="fragment" data-fragment-index="1">*Ongoing work w/ Su Feng, Aaron Huber, Boris Glavic</citation>
|
||||
</section>
|
||||
|
||||
</section>
|
||||
|
@ -1162,24 +1319,9 @@ CREATE VIEW R_CLEANED AS
|
|||
<section>
|
||||
|
||||
<section>
|
||||
<h3>One more thing...</h3>
|
||||
<pre><code>
|
||||
SELECT A, VGTerm('X', ROWID) AS B FROM R;
|
||||
</code></pre>
|
||||
↓ ↓ ↓ ↓
|
||||
<pre><code>
|
||||
SELECT A, BEST_GUESS('X', ROWID) AS B,
|
||||
FALSE AS ROW_TAINTED,
|
||||
FALSE AS A_TAINTED,
|
||||
NOT IS_ACKNOWLEDGED('X', ROWID) AS B_TAINTED
|
||||
FROM R;
|
||||
</code></pre>
|
||||
<p style="font-size: smaller;">Allow users to turn-off taint for specific tags.</p>
|
||||
</section>
|
||||
|
||||
<section>
|
||||
<h3>What are the tags affecting a result?</h3>
|
||||
<p class="fragment"><b>Solution: </b> Static dependency analysis produces a list of tag families and queries to generate all relevant indexes.</p>
|
||||
<h3>Which variables affect my query results?</h3>
|
||||
<p class="fragment" data-fragment-index="1"><b>Idea: </b> Static dependency analysis produces a list of variable families and queries to generate all relevant indexes.</p>
|
||||
<citation class="fragment" data-fragment-index="1">Mimir: Bringing CTables into Practice; Nandi et al.; arXiv</citation>
|
||||
</section>
|
||||
|
||||
<section>
|
||||
|
@ -1188,18 +1330,24 @@ CREATE VIEW R_CLEANED AS
|
|||
|
||||
|
||||
<section>
|
||||
<h3 style="margin-top: 100px">How much do results depend on a tag?</h3>
|
||||
<p class="fragment"><b>Solution: </b> Sensitivity analysis <span style="font-size: 18pt;">(Kanagal & Deshpande; SIGMOD 2011)</span>.</p>
|
||||
</section>
|
||||
|
||||
<section>
|
||||
<p>... but sensitivity analysis requires sampling...</p>
|
||||
<h3>How bad is the situation?</h3>
|
||||
<p class="fragment"><b>Idea: </b> Sample from the space of alternatives to...</p>
|
||||
<ul>
|
||||
<li class="fragment">Estimate error, expectations, or other statistical measures.</li>
|
||||
<li class="fragment">Highlight other possible query results.</li>
|
||||
<li class="fragment">Compute sensitivity <span style="font-size: smaller">(Kanagal &amp; Deshpande; SIGMOD 2011)</span></li>
|
||||
</ul>
|
||||
</section>
|
||||
|
||||
</section>
|
||||
|
||||
<section>
|
||||
|
||||
<section>
|
||||
<h2>Sampling is slooooow</h2>
|
||||
</section>
|
||||
|
||||
<section>
|
||||
<h3>Trivial Sampling</h3>
|
||||
<p>Evaluate the query $N$ times.<br/>Plug in samples instead of best guesses.</p>
|
||||
|
@ -1230,7 +1378,7 @@ CREATE VIEW R_CLEANED AS
|
|||
|
||||
<p class="fragment">Mimir isn't committed to one fixed data representation.</p>
|
||||
|
||||
<p class="fragment" style="font-size: smaller;">(work in progress)</p>
|
||||
<p class="fragment" style="font-size: smaller;">(optimization is a work in progress)</p>
|
||||
</section>
|
||||
|
||||
</section>
|
||||
|
|
Loading…
Reference in a new issue