diff --git a/slides/talks/2018-1-Tour-Mimir/data/CT-InPractice/Rakefile b/slides/talks/2018-1-Tour-Mimir/data/CT-InPractice/Rakefile
new file mode 100755
index 00000000..8b9848ba
--- /dev/null
+++ b/slides/talks/2018-1-Tour-Mimir/data/CT-InPractice/Rakefile
@@ -0,0 +1,95 @@
+require 'rubygems'
+require 'gnuplot'
+
+$:.push(".")
+require 'util'
+require 'plot'
+require 'csvx'
+
+$modes = [:deterministic,:classic,:partition,:inline,:hybrid]
+$query_names = {
+  :q1_noagg => "Q1",
+  :q3_noagg => "Q3",
+  :q5_noagg => "Q5",
+  :q9_noagg => "Q9"
+}
+# plot_output :pdf, size: "5in,2.5in", fsize: "12"
+plot_output :aqua
+# auto_open_plots :true
+
+# $plot_auto_open = true
+
+def to_seconds(time)
+  case time
+  when /([0-9]+)m([0-9.]+)s/ then
+    $1.to_i * 60 + $2.to_f
+  when /\?/ then
+    0
+  when /Timeout/ then
+    1000000
+  else
+    raise "Unknown time value '#{time}'"
+  end
+end
+
+def sort_by_cols(order,hash)
+  order.map { |col|
+    hash[col]
+  }
+end
+
+$data =
+  File.csv("data.csv", separator: / *, */).
+  map { |db,q,sf,mode,time|
+    [ (db+"_"+sf).to_sym,
+      [q.to_sym,
+       [mode.to_sym, to_seconds(time)]]]
+  }.reduce { |db, db_trials|
+    db_trials.reduce { |q, q_trials|
+      sort_by_cols($modes, q_trials.to_h)
+    }
+  }
+
+def plot_timing_bar_plot(gp, db, details = {})
+  clusters = $query_names.keys.sort;
+  data = sort_by_cols(
+    clusters,
+    $data[db]
+  ).map {|timings|
+    det_t = timings.shift
+    timings.map { |t| t / det_t * 100}
+  }
+  gp.yrange details.fetch(:yrange, "[0:300]")
+  gp.key "font \"Times-Roman,10\" opaque box top left"
+  gp.ylabel "% of Deterministic Time"
+  draw_clustered_bar_plot(gp,
+    data: data,
+    dataset_labels:
+      $modes.map {|m| m.to_s.capitalize}[1..-1],
+    group_labels:
+      clusters.map {|c| $query_names[c]},
+    box_style:
+      lambda {|i| "boxes fill solid #{(i.to_f/6)+0.25} lc #{$pretty_styles[4-i][:lt]}"}
+  )
+end
+
+plot 'sqlite100m' => "Rakefile" do |gp|
+  plot_timing_bar_plot(gp, :sqlite_100m)
+end
+plot 'sqlite1g' => "Rakefile" do |gp|
+  plot_timing_bar_plot(gp, :sqlite_1g)
+end
+plot 'dbx100m' => "Rakefile" do |gp|
+  plot_timing_bar_plot(gp, :oracle_100m,
+    yrange: "[0:400]"
+  )
+end
+plot 'dbx1g' => "Rakefile" do |gp|
+  plot_timing_bar_plot(gp, :oracle_1g,
+    yrange: "[0:1400]"
+  )
+end
+
+task :all => ['sqlite100m', 'sqlite1g', 'dbx100m', 'dbx1g']
+
+task :default => :all
\ No newline at end of file
diff --git a/slides/talks/2018-1-Tour-Mimir/data/CT-InPractice/csvx.rb b/slides/talks/2018-1-Tour-Mimir/data/CT-InPractice/csvx.rb
new file mode 100755
index 00000000..6b4e6bdf
--- /dev/null
+++ b/slides/talks/2018-1-Tour-Mimir/data/CT-InPractice/csvx.rb
@@ -0,0 +1,46 @@
+class String
+  def from_csv(sep = /,/)
+    ret = chomp.split(sep)
+    idx = 0;
+    while idx < ret.length do
+      if ret[idx][0] == "\""[0]
+        while ret[idx][-1] != "\""[0]
+          raise "Unterminated quote" if idx+1 >= ret.length
+          ret[idx] = ret[idx]+","+ret[idx+1]
+          ret.delete_at(idx+1)
+        end
+        ret[idx] = ret[idx].sub(/^"/, "").sub(/"$/, "")
+      end
+      idx += 1
+    end
+    ret
+  end
+end
+
+class Array
+
+  def from_csv
+    self.map { |l| l.to_s.chomp.from_csv }
+  end
+
+  def to_csv(f)
+    File.open(f, "w+") { |f| each { |row| f.puts(row.join(',')) }}
+  end
+end
+
+class IO
+  def from_csv(args = {})
+    header = args.fetch(:header, false)
+    separator = args.fetch(:separator, /,/)
+    keys = readline.chomp.
+      sub(/ *$/, "").sub(/^ */,"").
+      from_csv(separator) if header;
+    map { |l| l.to_s.chomp.from_csv(separator) }.
+    map { |a| if header then keys.zip(a).to_h else a end }
+  end
+end
+
+class File
+  def File.csv(f, args = {})
+    File.open(f) {|io| io.from_csv(args) }
+  end
+end
diff --git a/slides/talks/2018-1-Tour-Mimir/data/CT-InPractice/data.csv b/slides/talks/2018-1-Tour-Mimir/data/CT-InPractice/data.csv
new file mode 100755
index 00000000..7134eece
--- /dev/null
+++ b/slides/talks/2018-1-Tour-Mimir/data/CT-InPractice/data.csv
@@ -0,0 +1,80 @@
+sqlite,q1_noagg,100m,deterministic,0m16.125s
+sqlite,q1_noagg,100m,classic ,0m16.163s
+sqlite,q1_noagg,100m,partition ,0m16.226s
+sqlite,q1_noagg,100m,inline ,0m16.417s
+sqlite,q1_noagg,100m,hybrid ,0m16.830s
+sqlite,q1_noagg,1g ,deterministic,2m41.632s
+sqlite,q1_noagg,1g ,classic ,2m55.999s
+sqlite,q1_noagg,1g ,partition ,3m2.070s
+sqlite,q1_noagg,1g ,inline ,2m54.776s
+sqlite,q1_noagg,1g ,hybrid ,2m55.679s
+sqlite,q3_noagg,100m,deterministic,0m1.743s
+sqlite,q3_noagg,100m,classic ,Timeout
+sqlite,q3_noagg,100m,partition ,Timeout
+sqlite,q3_noagg,100m,inline ,0m2.048s
+sqlite,q3_noagg,100m,hybrid ,0m2.012s
+sqlite,q3_noagg,1g ,deterministic,0m4.687s
+sqlite,q3_noagg,1g ,classic ,Timeout
+sqlite,q3_noagg,1g ,partition ,Timeout
+sqlite,q3_noagg,1g ,inline ,0m7.572s
+sqlite,q3_noagg,1g ,hybrid ,0m7.992s
+sqlite,q5_noagg,100m,deterministic,0m1.542s
+sqlite,q5_noagg,100m,classic ,Timeout
+sqlite,q5_noagg,100m,partition ,Timeout
+sqlite,q5_noagg,100m,inline ,0m2.676s
+sqlite,q5_noagg,100m,hybrid ,0m3.597s
+sqlite,q5_noagg,1g ,deterministic,0m6.696s
+sqlite,q5_noagg,1g ,classic ,Timeout
+sqlite,q5_noagg,1g ,partition ,Timeout
+sqlite,q5_noagg,1g ,inline ,0m11.351s +sqlite,q5_noagg,1g ,hybrid ,0m57.637s +sqlite,q9_noagg,100m,deterministic,0m3.037s +sqlite,q9_noagg,100m,classic ,Timeout +sqlite,q9_noagg,100m,partition ,Timeout +sqlite,q9_noagg,100m,inline ,12m22.873s +sqlite,q9_noagg,100m,hybrid ,0m6.037s +sqlite,q9_noagg,1g ,deterministic,0m38.967s +sqlite,q9_noagg,1g ,classic ,Timeout +sqlite,q9_noagg,1g ,partition ,Timeout +sqlite,q9_noagg,1g ,inline ,Timeout +sqlite,q9_noagg,1g ,hybrid ,1m9.280s +oracle,q1_noagg,100m,deterministic,0m19.716s +oracle,q1_noagg,100m,classic ,0m23.760s +oracle,q1_noagg,100m,partition ,0m21.517s +oracle,q1_noagg,100m,inline ,0m20.570s +oracle,q1_noagg,100m,hybrid ,0m21.685s +oracle,q3_noagg,100m,deterministic,0m1.887s +oracle,q3_noagg,100m,classic ,Timeout +oracle,q3_noagg,100m,partition ,Timeout +oracle,q3_noagg,100m,inline ,0m2.831s +oracle,q3_noagg,100m,hybrid ,0m2.482s +oracle,q5_noagg,100m,deterministic,0m2.165s +oracle,q5_noagg,100m,classic ,Timeout +oracle,q5_noagg,100m,partition ,Timeout +oracle,q5_noagg,100m,inline ,0m3.738s +oracle,q5_noagg,100m,hybrid ,0m5.722s +oracle,q9_noagg,100m,deterministic,0m3.883s +oracle,q9_noagg,100m,classic ,Timeout +oracle,q9_noagg,100m,partition ,Timeout +oracle,q9_noagg,100m,inline ,8m0.466s +oracle,q9_noagg,100m,hybrid ,0m10.610s +oracle,q1_noagg,1g ,deterministic,3m29.131s +oracle,q1_noagg,1g ,classic ,3m32.163s +oracle,q1_noagg,1g ,partition ,3m45.280s +oracle,q1_noagg,1g ,inline ,3m39.893s +oracle,q1_noagg,1g ,hybrid ,3m23.962s +oracle,q3_noagg,1g ,deterministic,0m5.437s +oracle,q3_noagg,1g ,classic ,Timeout +oracle,q3_noagg,1g ,partition ,Timeout +oracle,q3_noagg,1g ,inline ,0m3.738s +oracle,q3_noagg,1g ,hybrid ,0m5.722s +oracle,q5_noagg,1g ,deterministic,0m5.092s +oracle,q5_noagg,1g ,classic ,Timeout +oracle,q5_noagg,1g ,partition ,Timeout +oracle,q5_noagg,1g ,inline ,0m22.339s +oracle,q5_noagg,1g ,hybrid ,Timeout +oracle,q9_noagg,1g ,deterministic,0m28.182s +oracle,q9_noagg,1g ,classic ,Timeout 
+oracle,q9_noagg,1g ,partition ,Timeout +oracle,q9_noagg,1g ,inline ,Timeout +oracle,q9_noagg,1g ,hybrid ,5m8.406s \ No newline at end of file diff --git a/slides/talks/2018-1-Tour-Mimir/data/CT-InPractice/dbx100m.pdf b/slides/talks/2018-1-Tour-Mimir/data/CT-InPractice/dbx100m.pdf new file mode 100755 index 00000000..e69de29b diff --git a/slides/talks/2018-1-Tour-Mimir/data/CT-InPractice/dbx1g.pdf b/slides/talks/2018-1-Tour-Mimir/data/CT-InPractice/dbx1g.pdf new file mode 100755 index 00000000..e69de29b diff --git a/slides/talks/2018-1-Tour-Mimir/data/CT-InPractice/plot.rb b/slides/talks/2018-1-Tour-Mimir/data/CT-InPractice/plot.rb new file mode 100755 index 00000000..ff0aeffe --- /dev/null +++ b/slides/talks/2018-1-Tour-Mimir/data/CT-InPractice/plot.rb @@ -0,0 +1,213 @@ + +$plot_terminal = "aqua" +$plot_suffix = nil; +$plot_auto_open = false; +$current_plot = nil; + +def plot_output(output, settings = {}) + case output + when :aqua then $plot_terminal = "aqua"; $plot_suffix = nil; + when :pdf then $plot_terminal = "pdf"; $plot_suffix = ".pdf" + when :png then $plot_terminal = "png"; $plot_suffix = ".png" + end + $plot_terminal_opts = settings +end + +def plot_terminal(setting_overrides = {}) + settings = $plot_terminal_opts.merge setting_overrides + $plot_terminal+( + if settings.size < 1 then "" else + " " + settings.to_a.flatten.join(" ") + end + ) +end + +$pretty_styles = [ + { :lt => "rgb \"#A00000\"", + # :fs => "rgb \"#A00000\"", + :lw => 2, + :pt => 1 + }, + { :lt => "rgb \"#00A000\"", + # :fs => "rgb \"#00A000\"", + :lw => 2, + :pt => 6 + }, + { :lt => "rgb \"#5060D0\"", + # :fs => "rgb \"#5060D0\"", + :lw => 2, + :pt => 2 + }, + { :lt => "rgb \"#F25900\"", + # :fs => "rgb \"#F25900\"", + :lw => 2, + :pt => 9 + } +]; + +def pretty_style(idx, opts = {}) + opts = opts.clone; + $pretty_styles[idx].each { |k, v| opts[k] = v unless opts.has_key? k } + opts.map { |kv| kv.to_a.join(" ") unless kv[1].nil? 
}.compact.join(" ") +end + +def pretty_plot(plot, opts = {}) + # plot based on Brighten Godfrey's blog post: + # http://youinfinitesnake.blogspot.com/2011/02/attractive-scientific-plots-with.html + + plot.terminal [ + "pdf", + "font \"#{opts.fetch(:fontface, "Times-Roman")},#{opts.fetch(:fontsize, 10)}\"", + "linewidth #{opts.fetch(:linewidth, 4)} rounded", + "fontscale #{opts.fetch(:fontscale, 1.0)}", + "size #{opts.fetch(:sizex, 5)}in,#{opts.fetch(:sizey, 3)}in" + ].join(" ") + + # Line style for axes + plot.style "line 80 lc #{opts.fetch(:bordercolor, "rgb \"#808080\"")}" + + # Line style for grid + plot.style "line 81 lt 0" # dashed + plot.style "line 81 lc #{opts.fetch(:gridcolor, "rgb \"#808080\"")}" # grey + + plot.grid "back linestyle 81" + + border_groups = + opts.fetch(:border, [:left, :bottom]).map do |b| + case b + when :bottom then 1 + when :left then 2 + when :top then 4 + when :right then 8 + when :all then 1+2+4+8 + else raise "Invalid border type : #{b}" + end + end.sum + + plot.border "#{border_groups} back linestyle 80" # Remove border on top and right. These + # borders are useless and make it harder + # to see plotted lines near the border. + # Also, put it in grey; no need for so much emphasis on a border. + plot.xtics "nomirror" + plot.ytics "nomirror" + + if(opts.fetch(:logx, false)) then + plot.logscal "x" + plot.mxtics "10" # Makes logscale look good. + end + if(opts.fetch(:logy, false)) then + plot.logscal "y" + plot.mytics "10" # Makes logscale look good. + end + + # Line styles: try to pick pleasing colors, rather + # than strictly primary colors or hard-to-see colors + # like gnuplot's default yellow. Make the lines thick + # so they're easy to see in small plots in papers. 
+ $pretty_styles.each_index { |x| plot.style "line #{x+1} #{pretty_style(x)}" } + + plot.key "bottom right" +end + +def auto_open_plots(new_val = true) + $plot_auto_open = new_val; +end + +def row_data(data) + $current_plot.data << Gnuplot::DataSet.new(data.unzip) { |ds| yield ds } +end + + +def plot(args = {}) + task(args) do + task_name = case args + when Hash then args.keys[0] + when Symbol,String then args.to_s + end + Gnuplot.open do |gp| + Gnuplot::Plot.new(gp) do |plot| + $current_plot = plot; + + plot.terminal plot_terminal + if $plot_suffix and task_name then + plot.output "#{task_name}#{$plot_suffix}" + end + yield plot; + end + end + if $plot_auto_open and [".pdf", ".png"].include? $plot_suffix + system("open #{task_name}#{$plot_suffix}") + end + end +end + +def line_plot(args = {}) + plot(args) do |plot| + data_elements = yield(plot) + data_elements = { :data => data_elements } unless data_elements.is_a? Hash; + + data = data_elements[:data].unzip; + xaxis = data_elements.fetch(:xaxis) { data.shift }; + keys = data_elements.fetch(:keys) { data.map { nil; } } + withs = data_elements.fetch(:with, "linespoints"); + withs = data.map { withs } unless withs.is_a? Array; + + raise "Missing data!" if data.nil?; + raise "Missing X Axis!" if xaxis.nil?; + + data.zip(keys, withs).each do |line, key, with| + plot.data << Gnuplot::DataSet.new([xaxis, line]) do |ds| + ds.title = key unless key.nil? + ds.with = with unless with.nil? 
+ end + end + end +end + +def draw_clustered_bar_plot plot, args = {} + data = args.fetch(:data).unzip; + base_offset = args.fetch(:base_offset, 0); + interbar_offset = args.fetch(:interbar_offset, 18); + intergroup_offset = args.fetch(:intergroup_offset, interbar_offset); + margins = args.fetch(:margins, intergroup_offset); + bar_width = args.fetch(:bar_width, 10); + tic_commands = args.fetch(:tic_commands, ""); + label_offset = args.fetch(:label_offset, 0); + box_style = args.fetch(:box_style, + lambda { |i| "boxes fill pattern #{i}" }); + + plot.grid "noxtics" + group_offset = base_offset + margins + group_size = interbar_offset * data.length + intergroup_offset; + plot.boxwidth bar_width.to_s; + pattern = 0; + data.zip(args[:dataset_labels]).each do |dataset, dataset_title| + offset = group_offset - group_size; + group_offset += interbar_offset; + + indices = dataset.map { |i| offset += group_size; } + plot.data << Gnuplot::DataSet.new([indices,dataset]) do |ds| + ds.title = dataset_title + ds.with = box_style.call(pattern += 1); + end + end + + label_offset += (group_size+intergroup_offset-margins)/2 + group_offset = base_offset - label_offset; + plot.xtics "(#{args[:group_labels].map do |label| + "\"#{label}\" #{group_offset += group_size}"; + end.join(", ")}) scale 0 #{tic_commands}"; + + plot.xrange "[-10:#{group_offset+label_offset+margins-intergroup_offset}]" +end + +def draw_bar_plot plot, args + plot.key "off" + args = args.clone + args[:data] = args[:data].map {|d| [d]} + args[:dataset_labels] = [""]; + args[:group_labels] = args[:labels]; + + draw_clustered_bar_plot plot, args +end + diff --git a/slides/talks/2018-1-Tour-Mimir/data/CT-InPractice/sqlite100m.pdf b/slides/talks/2018-1-Tour-Mimir/data/CT-InPractice/sqlite100m.pdf new file mode 100755 index 00000000..e69de29b diff --git a/slides/talks/2018-1-Tour-Mimir/data/CT-InPractice/sqlite1g.pdf b/slides/talks/2018-1-Tour-Mimir/data/CT-InPractice/sqlite1g.pdf new file mode 100755 index 
00000000..e69de29b diff --git a/slides/talks/2018-1-Tour-Mimir/data/CT-InPractice/util.rb b/slides/talks/2018-1-Tour-Mimir/data/CT-InPractice/util.rb new file mode 100755 index 00000000..7436d622 --- /dev/null +++ b/slides/talks/2018-1-Tour-Mimir/data/CT-InPractice/util.rb @@ -0,0 +1,394 @@ +class Tokenizer + def initialize(string, token, input_source = nil) + @tokens = string.scan(token); + @last = nil; + @input_source = input_source; + @string = string; + end + + def scan + while @tokens.size > 0 + if !(yield @tokens.shift) then break; end + end + end + + def peek + if @tokens.size > 0 then @tokens[0] + else nil; end + end + + def next + @last = + if @tokens.size > 0 then @tokens.shift + else nil; end + end + + def last + @last; + end + + def more? + @tokens.size > 0; + end + + def flatten + @tokens = @tokens.flatten; + end + + def assert_next(token, errstr = nil) + case token + when String then raise_error(errstr || "Expected '#{token}' but found '#{last}'") unless self.next == token + when Array then raise_error(errstr || "Expected '#{token.join("','")}' but found '#{last}'") unless token.include? self.next; + end + self.last; + end + + def raise_error(errstr); + errstr = "#{errstr} (line #{@input_source.lineno})" if @input_source; + errstr = "#{errstr} (#{@string})" unless @input_source; + raise "Parse Error: #{errstr}"; + end + + def tokens_up_to(token) + ret = Array.new; + while (more? 
&& (self.next != token)) + ret.push(last); + end + ret; + end +end + +class Array + def map_index + (0...length).to_a.map { |i| yield(i, self[i]) } + end + + def to_h + ret = Hash.new; + each { |k,v| ret[k] = v; } + return ret; + end + + def unzip + ret = Array.new; + each_index do |i| + ret.push Array.new(i) while ret.length < self[i].length + ret.each_index do |j| + ret[j][i] = self[i][j] + end + end + return ret; + end + + def count + size + end + + def sum + ret = 0; + each { |item| ret += item } + return ret; + end + + def avg + sum.to_f / length.to_f + end + + def rms_avg + Math.sqrt(map { |x| x.to_f * x.to_f }.avg) + end + + def stddev + Math.sqrt((avg ** 2 - (map{|i| i.to_f ** 2}.avg)).abs) + end + + def reduce(&reducer) + ret = Hash.new; + each do |k,v| + ret[k] = Array.new unless ret.has_key? k; + ret[k].push(v); + end + if reducer.nil? then ret + else + ret.to_a.collect do |k,vs| + [ k, reducer.call(k, vs) ] + end.to_h + end + end + + # Round-robin partition into K arrays + def subdivide(k) + cnt = 0; + ret = (0...k).map {|i| Array.new }; + each { |i| ret[cnt % k].push i; cnt += 1; }; + ret; + end + + # Inorder partition into groups of K elements + def partition(k) + (0...(size / k.to_f).ceil).map do |i| + self[k*i...[k*(i+1), size].min] + end + end + + def zip_members + self[0].zip(*(self[1..-1])) + end + + def grep(pattern, &block) + ret = []; + if block.nil? + then each { |l| ret.push(l) if pattern =~ l; } + else each { |l| match = pattern.match(l); + ret.push(block.call(match)) if match; } + end + ret + end + + def window(window_size = 10, &block) + if length <= window_size then + if block.nil? then return [self.clone]; + else return [block.call(self)]; + end + else + ret = Array.new; + w = Array.new; + each do |item| + w.push(item); + w.shift if w.length > window_size; + if w.length >= window_size then + ret.push(if block.nil? 
then [w.clone] else block.call(w) end) + end + end + ret + end + end + + def fold(accum = nil) + each { |i| accum = yield accum, i } + accum + end + + def pick_samples_evenly(num_samples) + return self if(self.length <= num_samples); + keep_steps = (self.length / num_samples).to_i + step = 0; + self.delete_if { step += 1; (step % keep_step) == 0 } + end + + def to_table(headers = nil) + row_sizes = + ((headers.nil? ? [] : [headers]) + self). + map { |row| row.map { |c| c.to_s.length } }. + unzip. + map { |col| col.compact.max } + + ( unless headers.nil? then + [ " " + headers.zip(row_sizes).map do |col, exp_size| + col + (if col.size < exp_size then + (" " * (exp_size - col.size)) + else "" end) + end.join(" | "), + ("-" * (row_sizes.sum + 2 + (row_sizes.length - 1) * 3)) + ] + else [] end + + map do |row| + " " + row.zip(row_sizes).map do |col, exp_size| + col = col.to_s + if col.size < exp_size + then col.center(exp_size) + else col + end + end.join(" | ") + end + ).join("\n") + end + + def for_all + each { |v| return false unless yield v } + true; + end + + def each_prefix + each_index do |i| + yield self[0..i]; + end + end + + def select + map { |x| x if yield x }.compact + end + + def cogroup + ret = Hash.new { |h,k| h[k] = [nil] * size } + each_index do |i| + self[i].each do |k, v| + ret[k][i] = v + end + end + ret + end + + # Return every cnt'th element of the array. + def every(cnt, start = 0) + (0..(((size-1-start)/cnt).to_i)).map { |i| self[i*cnt+start] } + end + + # Create batches of up to size cnt. + def batch(cnt) + (0..(((size-1)/cnt).to_i)).map { |i| self[(i*cnt)...((i+1)*cnt)] } + end + + def flatmap + ret = [] + each { |i| ret = ret + yield(i) } + ret + end + + def project(*keys) + map { |x| x.project(*keys) } + end + + def unique + last = nil + sort. + map { |c| last = c if c != last }. +# map { |c| p c }. 
+ compact + end + + def histogram(bin_width = 5) + min_val = (min - min % bin_width).to_i + max_val = (max - max % bin_width + bin_width).to_i + + (min_val..max_val).to_a.every(bin_width). + map { |x| [x, 0] }. + to_h. + join(map { |x| (x.to_f / bin_width).to_i * bin_width }. + reduce { |k,v| v.count }, + :left + ). + map { |bin, cnt| [bin, cnt.compact.sum] }. + sort { |a, b| a[0] <=> b[0] } + end + + def cumulative_sum + tot = 0; + map { |x| tot += x } + end +end + +class Hash + def intersect(other) + keys.find_all { |k| other.has_key?(k) } + end + + def bar_graph_dataset(bar = 0.5, set_sep = 1.0, bar_sep = 0.2) + curr_width = 0; + tics = collect do |human,data| + next_delta = data.length * bar + (data.length - 1) * bar_sep; + curr_width += next_delta + set_sep; + "\"#{human}\" #{curr_width - next_delta / 2}" + end + + curr_width = 0; + points = values.collect do |data| + curr_width += set_sep - bar_sep + data.collect do |point| + curr_width += bar_sep + bar; + [curr_width - bar / 2, point] + end + end.unzip; + + return ["(#{tics.join(', ')})" , points, "[0:#{curr_width+set_sep}]"]; + end + + def to_sorted_a + keys.sort.map do |k| + [k, self[k]] + end + end + + def map_leaves(prefix = []) + keys.to_a.map do |k| + [ k, + if self[k].is_a? Hash + then self[k].map_leaves(prefix+[k]) { |ik,v| yield(ik, v) } + else yield(prefix+[k], v) + end + ] + end.to_h + end + + def project(*keys) + keys.map { |k| self[k] } + end + + def join(h, outer = :no) + case outer + when :full then + keys + h.keys.find_all { |k| not has_key? k } + when :left then + keys + when :right then + h.keys + else + intersect(h) + end. 
+ map { |k| [k, [self[k], h[k]]] }.to_h + end +end + +class Float + def sig_figs(n) + if self == 0.0 then self + else + mult = (10.0 ** (Math.log10(self).ceil.to_f - n.to_i.to_f)) + (self / mult).round * mult; + end + end +end + +class IO + def tee_readlines + ret = []; + each { |l| yield l; ret.push l } + ret + end + + def grep + map {|x| x if yield x}.compact + end +end + +class Integer + def to_bytestring + return "-#{(-self).to_bytestring}" if self < 0; + depth = (Math.log(self/2) / (10.0 * Math.log(2))).to_i + scales = ["B", "KB", "MB", "GB", "PB", "EB"]; + depth = scales.length-1 if depth >= scales.length; + "#{(self.to_f / (1024.0**(depth))).to_f.sig_figs(4)} #{scales[depth]}" + end + + def d(die) + (0...self).map { rand(die)+1 } + end +end + +class String + def pluralize(num) + if num == 1 then self + else self+"s" + end + end +end + +class Dir + def Dir.in_dir(d) + old_d = Dir.getwd + Dir.chdir d + ret = yield + Dir.chdir old_d + ret + end +end \ No newline at end of file diff --git a/slides/talks/2018-1-Tour-Mimir/data/PushdownVis/PDB.pdf b/slides/talks/2018-1-Tour-Mimir/data/PushdownVis/PDB.pdf new file mode 100755 index 00000000..65999bb4 Binary files /dev/null and b/slides/talks/2018-1-Tour-Mimir/data/PushdownVis/PDB.pdf differ diff --git a/slides/talks/2018-1-Tour-Mimir/data/PushdownVis/Rakefile b/slides/talks/2018-1-Tour-Mimir/data/PushdownVis/Rakefile new file mode 100755 index 00000000..7eaf2f21 --- /dev/null +++ b/slides/talks/2018-1-Tour-Mimir/data/PushdownVis/Rakefile @@ -0,0 +1,102 @@ +$:.push "." 
+ +require "gnuplot" +require "util.rb" +require "plot.rb" +require "csvx.rb" + +data = File.csv("data.csv", header: true) + +plot_output :aqua +# $plot_auto_open = true + +all_strategies = + [ + "Mimir-Mat", + "Mimir-Inline", + "Mimir-Sample", + "Mimir-Partition", + "MCDB-Mimir", + "SQLite-Det", + "MayBMS-PGSQL", + "MayBMS-SQLite", + ] + +data.map { |r| [r["Query"].split(/-/)[0], r] } + .reduce + .each do |group, records| + + plot group => ["data.csv", "Rakefile"] do |plot| + + strategies = + all_strategies + .where { |s| records.index { |r| r["Strategy"] == s } } + queries = + records + .map { |r| r["Query"] } + .uniq.sort + lookup = records.map { |r| [[r["Strategy"], r["Query"]], r["Time"]] }.to_h + strategies + + pretty_plot(plot, border: [:all]) + plot.key "left top opaque" + plot.ylabel "Time (s)" + max_y = + records.map { |r| r["Time"] } + .where { |r| /[\-0-9.]+/ =~ r } + .map { |r| r.to_f } + .max + + case group + when "TPCH" + max_y = 300 + plot.key "center top outside maxcols 3 maxrows 2" + when "PDB" + max_y = 45 + plot.key "center top outside maxcols 3 maxrows 3" + end + plot.yrange "[0:#{max_y}]" + + labels = [] + + draw_clustered_bar_plot(plot, + data: (queries.map.with_index { |q,qi| + strategies.map.with_index { |s,si| + time = lookup[[s,q]] + x_pos = bar_plot_position(si, qi, strategies.length)+1 + font = "Helvetica-Bold,6" + case time + when nil then + labels.push "'?? Missing ??' 
at #{x_pos},6 font '#{font}' rotate by 90" + when "TIMEOUT" then + labels.push "'TIME OUT' at #{x_pos},#{max_y * 0.80} font '#{font}' rotate by 90 front" + max_y * 1.2 + when "UNSUPPORTED" + labels.push "'UNSUPPORTED' at #{x_pos},5 font '#{font}' rotate by 90 front tc ls #{si+1}" + 1 + else + time = time.to_f + if time > max_y + labels.push "'[ #{time.to_i}s ]' at #{x_pos},#{max_y * 0.80} font 'Helvetica-Bold,8' rotate by 90 front" + end + time + end + } + }), + dataset_labels: strategies, + group_labels: queries, + bar_width: 15 + ) + labels.each { |l| plot.label l } + + (1...queries.length).each { |qi| + x_pos = bar_plot_position(0, qi, strategies.length) - 18 + plot.arrow("from #{x_pos},0 to #{x_pos},#{max_y} nohead lc rgb \"#808080\"") + } + + + end + + task :default => group + + end diff --git a/slides/talks/2018-1-Tour-Mimir/data/PushdownVis/TPCH.pdf b/slides/talks/2018-1-Tour-Mimir/data/PushdownVis/TPCH.pdf new file mode 100755 index 00000000..6ed703e9 Binary files /dev/null and b/slides/talks/2018-1-Tour-Mimir/data/PushdownVis/TPCH.pdf differ diff --git a/slides/talks/2018-1-Tour-Mimir/data/PushdownVis/csvx.rb b/slides/talks/2018-1-Tour-Mimir/data/PushdownVis/csvx.rb new file mode 100755 index 00000000..38ae2661 --- /dev/null +++ b/slides/talks/2018-1-Tour-Mimir/data/PushdownVis/csvx.rb @@ -0,0 +1,64 @@ +class String + def from_csv(sep = /,/) + ret = [[]] + c = chars + quote = "\"" + comma = "," + i = 0 + expecting_quote = false + while i < c.length + if c[i] == quote + if ret[-1].empty? 
then expecting_quote = true + elsif c[i+1] == quote then i += 1; ret[-1].push(quote) + elsif expecting_quote and (c[i+1] == comma) then + ret.push([]) + expecting_quote = false + i += 1 + elsif expecting_quote and (c[i+1] == nil) then + expecting_quote = false + else + raise "Invalid CSV Line (misplaced quote at #{i}; #{c[i+1]}): #{self}" + end + elsif c[i] == comma + if expecting_quote + ret[-1].push(comma) + else + ret.push([]) + expecting_quote = false + end + else + ret[-1].push(c[i]) + end + i += 1 + end + ret.map { |col| col.join} + end +end + +class Array + def from_csv + self.map { |l| l.to_s.chomp.from_csv } + end + + def to_csv(f) + File.open(f, "w+") { |f| each { |row| f.puts(row.join(',')) }} + end +end + +class IO + def from_csv(args = {}) + header = args.fetch(:header, false) + separator = args.fetch(:separator, /,/) + keys = readline.chomp. + sub(/ *$/, "").sub(/^ */,""). + from_csv(separator) if header; + map { |l| l.to_s.chomp.from_csv(separator) }. + map { |a| if header then keys.zip(a).to_h else a end } + end +end + +class File + def File.csv(f, args = {}) + File.open(f) {|io| io.from_csv(args) } + end +end diff --git a/slides/talks/2018-1-Tour-Mimir/data/PushdownVis/data.csv b/slides/talks/2018-1-Tour-Mimir/data/PushdownVis/data.csv new file mode 100755 index 00000000..b0f323ba --- /dev/null +++ b/slides/talks/2018-1-Tour-Mimir/data/PushdownVis/data.csv @@ -0,0 +1,57 @@ +Strategy,Query,Time +Mimir-Mat,PDB-1,25.98781968615949 +Mimir-Mat,PDB-2,20.71622445844114 +Mimir-Mat,PDB-3,41.98619099296629 +MayBMS-PGSQL,PDB-1,23.439012999999996 +MayBMS-PGSQL,PDB-2,13.000651999999999 +MayBMS-PGSQL,PDB-3,20.2954832 +MayBMS-SQLite,PDB-1,22.1345477 +MayBMS-SQLite,PDB-2,7.291376699999999 +MayBMS-SQLite,PDB-3,29.1511957 +Mimir-Inline,TPCH-1,16.040970255620778 +Mimir-Inline,TPCH-3,19.171183695830404 +Mimir-Inline,TPCH-5,43.3495686205104 +Mimir-Inline,TPCH-9,98.61139780338854 +Mimir-Mat,TPCH-1,33.623222251608965 +Mimir-Mat,TPCH-3,4.8350385190919045 
+Mimir-Mat,TPCH-5,11.789478918723763 +Mimir-Mat,TPCH-9,28.924315941147505 +Mimir-Sample,TPCH-1,119.61607021316885 +Mimir-Sample,TPCH-3,162.00108394436538 +Mimir-Sample,TPCH-5,258.74168805666267 +Mimir-Sample,TPCH-9,TIMEOUT +Mimir-Partition,TPCH-1,UNSUPPORTED +Mimir-Partition,TPCH-3,UNSUPPORTED +Mimir-Partition,TPCH-5,UNSUPPORTED +Mimir-Partition,TPCH-9,UNSUPPORTED +--MayBMS-PGSQL,TPCH-1,UNSUPPORTED +--MayBMS-PGSQL,TPCH-3,UNSUPPORTED +--MayBMS-PGSQL,TPCH-5,UNSUPPORTED +--MayBMS-PGSQL,TPCH-9,UNSUPPORTED +--MayBMS-SQLite,TPCH-1,UNSUPPORTED +--MayBMS-SQLite,TPCH-3,UNSUPPORTED +--MayBMS-SQLite,TPCH-5,UNSUPPORTED +--MayBMS-SQLite,TPCH-9,UNSUPPORTED +MCDB-Mimir,TPCH-1,14.65919488966465 +MCDB-Mimir,TPCH-3,TIMEOUT +MCDB-Mimir,TPCH-5,TIMEOUT +MCDB-Mimir,TPCH-9,TIMEOUT +SQLite-Det,PDB-1,9.521 +SQLite-Det,PDB-2,7.59 +SQLite-Det,PDB-3,31.22 +SQLite-Det,TPCH-1,19.561 +SQLite-Det,TPCH-3,22.835 +SQLite-Det,TPCH-5,33.308 +SQLite-Det,TPCH-9,51.125 +Mimir-Inline,PDB-1,TIMEOUT +Mimir-Inline,PDB-2,30.827455023303628 +Mimir-Inline,PDB-3,TIMEOUT +Mimir-Sample,PDB-1,TIMEOUT +Mimir-Sample,PDB-2,242.5666234549135 +Mimir-Sample,PDB-3,TIMEOUT +Mimir-Partition,PDB-1,TIMEOUT +Mimir-Partition,PDB-2,TIMEOUT +Mimir-Partition,PDB-3,TIMEOUT +MCDB-Mimir,PDB-1,TIMEOUT +MCDB-Mimir,PDB-2,TIMEOUT +MCDB-Mimir,PDB-3,TIMEOUT diff --git a/slides/talks/2018-1-Tour-Mimir/data/PushdownVis/plot.rb b/slides/talks/2018-1-Tour-Mimir/data/PushdownVis/plot.rb new file mode 100755 index 00000000..8c3da93e --- /dev/null +++ b/slides/talks/2018-1-Tour-Mimir/data/PushdownVis/plot.rb @@ -0,0 +1,227 @@ + +$plot_terminal = "aqua" +$plot_suffix = nil; +$plot_auto_open = false; +$current_plot = nil; + +def plot_output(output, settings = {}) + case output + when :aqua then $plot_terminal = "aqua"; $plot_suffix = nil; + when :pdf then $plot_terminal = "pdf"; $plot_suffix = ".pdf" + when :png then $plot_terminal = "png"; $plot_suffix = ".png" + end + $plot_terminal_opts = settings +end + +def plot_terminal(plot, settings) + 
settings = $plot_terminal_opts + settings + $plot_terminal+( + if settings.size < 1 then "" else + " " + settings.to_a.flatten.join(" ") + end + ) +end + +$pretty_styles = [ + { :lt => "rgb \"#A00000\"", + :lw => 2, + :pt => 1 + }, + { :lt => "rgb \"#00A000\"", + :lw => 2, + :pt => 6 + }, + { :lt => "rgb \"#5060D0\"", + :lw => 2, + :pt => 2 + }, + { :lt => "rgb \"#F25900\"", + :lw => 2, + :pt => 9 + } +]; + +def pretty_style(idx, opts = {}) + opts = opts.clone; + $pretty_styles[idx].each { |k, v| opts[k] = v unless opts.has_key? k } + opts.map { |kv| kv.to_a.join(" ") unless kv[1].nil? }.compact.join(" ") +end + +def pretty_plot(plot, opts = {}) + # plot based on Brighten Godfrey's blog post: + # http://youinfinitesnake.blogspot.com/2011/02/attractive-scientific-plots-with.html + + plot.terminal [ + "pdf", + "font \"#{opts.fetch(:fontface, "Times-Roman")},#{opts.fetch(:fontsize, 10)}\"", + "linewidth #{opts.fetch(:linewidth, 4)} rounded", + "fontscale #{opts.fetch(:fontscale, 1.0)}", + "size #{opts.fetch(:sizex, 5)}in,#{opts.fetch(:sizey, 3)}in" + ].join(" ") + + # Line style for axes + plot.style "line 80 lc #{opts.fetch(:bordercolor, "rgb \"#808080\"")}" + + # Line style for grid + plot.style "line 81 lt 0" # dashed + plot.style "line 81 lc #{opts.fetch(:gridcolor, "rgb \"#808080\"")}" # grey + + plot.grid "back linestyle 81" + + border_groups = + opts.fetch(:border, [:left, :bottom]).map do |b| + case b + when :bottom then 1 + when :left then 2 + when :top then 4 + when :right then 8 + when :all then 1+2+4+8 + else raise "Invalid border type : #{b}" + end + end.sum + + plot.border "#{border_groups} back linestyle 80" # Remove border on top and right. These + # borders are useless and make it harder + # to see plotted lines near the border. + # Also, put it in grey; no need for so much emphasis on a border. + plot.xtics "nomirror" + plot.ytics "nomirror" + + if(opts.fetch(:logx, false)) then + plot.logscal "x" + plot.mxtics "10" # Makes logscale look good. 
+ end + if(opts.fetch(:logy, false)) then + plot.logscal "y" + plot.mytics "10" # Makes logscale look good. + end + + # Line styles: try to pick pleasing colors, rather + # than strictly primary colors or hard-to-see colors + # like gnuplot's default yellow. Make the lines thick + # so they're easy to see in small plots in papers. + $pretty_styles.each_index { |x| plot.style "line #{x+1} #{pretty_style(x)}" } + + plot.key "bottom right" +end + +def auto_open_plots(new_val = true) + $plot_auto_open = new_val; +end + +def row_data(data) + $current_plot.data << Gnuplot::DataSet.new(data.unzip) { |ds| yield ds } +end + + +def plot(args = {}) + task(args) do + task_name = case args + when Hash then args.keys[0] + when Symbol,String then args.to_s + end + Gnuplot.open do |gp| + Gnuplot::Plot.new(gp) do |plot| + $current_plot = plot; + + $plot_terminal + + plot.terminal $plot_terminal + if $plot_suffix and task_name then + plot.output "#{task_name}#{$plot_suffix}" + end + yield plot; + end + end + if $plot_auto_open and [".pdf", ".png"].include? $plot_suffix + system("open #{task_name}#{$plot_suffix}") + end + end +end + +def line_plot(args = {}) + plot(args) do |plot| + data_elements = yield(plot) + data_elements = { :data => data_elements } unless data_elements.is_a? Hash; + + data = data_elements[:data].unzip; + xaxis = data_elements.fetch(:xaxis) { data.shift }; + keys = data_elements.fetch(:keys) { data.map { nil; } } + withs = data_elements.fetch(:with, "linespoints"); + withs = data.map { withs } unless withs.is_a? Array; + + raise "Missing data!" if data.nil?; + raise "Missing X Axis!" if xaxis.nil?; + + data.zip(keys, withs).each do |line, key, with| + plot.data << Gnuplot::DataSet.new([xaxis, line]) do |ds| + ds.title = key unless key.nil? + ds.with = with unless with.nil? 
+ end + end + end +end + +def bar_plot_position(group, element, num_elements, args = {}) + base_offset = args.fetch(:base_offset, 0); + interbar_offset = args.fetch(:interbar_offset, 18); + intergroup_offset = args.fetch(:intergroup_offset, interbar_offset); + margins = args.fetch(:margins, intergroup_offset); + bar_width = args.fetch(:bar_width, 10); + + group_offset = base_offset + margins + group_size = interbar_offset * num_elements + intergroup_offset; + + position_relative_to_element = (interbar_offset * (group)) + position_of_group = (group_size * element) + + group_offset + position_relative_to_element + position_of_group +end + +def draw_clustered_bar_plot plot, args = {} + data = args.fetch(:data).unzip; + base_offset = args.fetch(:base_offset, 0); + interbar_offset = args.fetch(:interbar_offset, 18); + intergroup_offset = args.fetch(:intergroup_offset, interbar_offset); + margins = args.fetch(:margins, intergroup_offset); + bar_width = args.fetch(:bar_width, 10); + tic_commands = args.fetch(:tic_commands, ""); + label_offset = args.fetch(:label_offset, 0); + box_style = args.fetch(:box_style, + lambda { |i| "boxes fill pattern #{i}" }); + + plot.grid "noxtics" + group_offset = base_offset + margins + group_size = interbar_offset * data.length + intergroup_offset; + plot.boxwidth bar_width.to_s; + pattern = 0; + data.zip(args[:dataset_labels]).each do |dataset, dataset_title| + offset = group_offset - group_size; + group_offset += interbar_offset; + + indices = dataset.map { |i| offset += group_size; } + plot.data << Gnuplot::DataSet.new([indices,dataset]) do |ds| + ds.title = dataset_title + ds.with = box_style.call(pattern += 1); + end + end + + label_offset += (group_size+intergroup_offset-margins)/2 + group_offset = base_offset - label_offset; + plot.xtics "(#{args[:group_labels].map do |label| + "\"#{label}\" #{group_offset += group_size}"; + end.join(", ")}) scale 0 #{tic_commands}"; + + plot.xrange 
"[-10:#{group_offset+label_offset+margins-intergroup_offset}]" +end + +def draw_bar_plot plot, args + plot.key "off" + args = args.clone + args[:data] = args[:data].map {|d| [d]} + args[:dataset_labels] = [""]; + args[:group_labels] = args[:labels]; + + draw_clustered_bar_plot plot, args +end + diff --git a/slides/talks/2018-1-Tour-Mimir/data/PushdownVis/util.rb b/slides/talks/2018-1-Tour-Mimir/data/PushdownVis/util.rb new file mode 100755 index 00000000..20526547 --- /dev/null +++ b/slides/talks/2018-1-Tour-Mimir/data/PushdownVis/util.rb @@ -0,0 +1,471 @@ +class Tokenizer + def initialize(string, token, input_source = nil) + @tokens = string.scan(token); + @last = nil; + @input_source = input_source; + @string = string; + end + + def scan + while @tokens.size > 0 + if !(yield @tokens.shift) then break; end + end + end + + def peek + if @tokens.size > 0 then @tokens[0] + else nil; end + end + + def next + @last = + if @tokens.size > 0 then @tokens.shift + else nil; end + end + + def last + @last; + end + + def more? + @tokens.size > 0; + end + + def flatten + @tokens = @tokens.flatten; + end + + def assert_next(token, errstr = nil) + case token + when String then raise_error(errstr || "Expected '#{token}' but found '#{last}'") unless self.next == token + when Array then raise_error(errstr || "Expected '#{token.join("','")}' but found '#{last}'") unless token.include? self.next; + end + self.last; + end + + def raise_error(errstr); + errstr = "#{errstr} (line #{@input_source.lineno})" if @input_source; + errstr = "#{errstr} (#{@string})" unless @input_source; + raise "Parse Error: #{errstr}"; + end + + def tokens_up_to(token) + ret = Array.new; + while (more? 
&& (self.next != token)) + ret.push(last); + end + ret; + end +end + +class Array + def map_index + (0...length).to_a.map { |i| yield(i, self[i]) } + end + + def to_h + ret = Hash.new; + each { |k,v| ret[k] = v; } + return ret; + end + + def unzip + ret = Array.new; + each_index do |i| + ret.push Array.new(i) while ret.length < self[i].length + ret.each_index do |j| + ret[j][i] = self[i][j] + end + end + return ret; + end + + def count + size + end + + def sum + ret = 0; + each { |item| ret += item } + return ret; + end + + def avg + sum.to_f / length.to_f + end + + def prod + ret = 1; + each { |item| ret *= item } + return ret; + end + + def rms_avg + Math.sqrt(map { |x| x.to_f ** 2 }.avg) + end + + def rms_err + Math.sqrt(map { |x,y| (x.to_f - y.to_f) ** 2 }.avg) + end + + def stddev + Math.sqrt((avg ** 2 - (map{|i| i.to_f ** 2}.avg)).abs) + end + + def reduce(&reducer) + ret = Hash.new; + each do |k,v| + ret[k] = Array.new unless ret.has_key? k; + ret[k].push(v); + end + if reducer.nil? then ret + else + ret.to_a.collect do |k,vs| + [ k, reducer.call(k, vs) ] + end.to_h + end + end + + # Round-robin partition into K arrays + def subdivide(k) + cnt = 0; + ret = (0...k).map {|i| Array.new }; + each { |i| ret[cnt % k].push i; cnt += 1; }; + ret; + end + + # Inorder partition into groups of K elements + def take_groups(k) + (0...(size / k.to_f).ceil).map do |i| + self[k*i...[k*(i+1), size].min] + end + end + + def zip_members + self[0].zip(*(self[1..-1])) + end + + def grep(pattern, &block) + ret = []; + if block.nil? + then each { |l| ret.push(l) if pattern =~ l; } + else each { |l| match = pattern.match(l); + ret.push(block.call(match)) if match; } + end + ret + end + + def window(window_size = 10, &block) + if length <= window_size then + if block.nil? 
then return [self.clone]; + else return [block.call(self)]; + end + else + ret = Array.new; + w = Array.new; + each do |item| + w.push(item); + w.shift if w.length > window_size; + if w.length >= window_size then + ret.push(if block.nil? then [w.clone] else block.call(w) end) + end + end + ret + end + end + + def fold(accum = nil) + each { |i| accum = yield accum, i } + accum + end + + def pick_samples_evenly(num_samples) + return self if(self.length <= num_samples); + keep_steps = (self.length / num_samples).to_i + step = 0; + self.delete_if { step += 1; (step % keep_step) == 0 } + end + + def to_table(headers = nil) + row_sizes = + ((headers.nil? ? [] : [headers]) + self). + map { |row| row.map { |c| c.to_s.length } }. + unzip. + map { |col| col.compact.max } + + ( unless headers.nil? then + [ " " + headers.zip(row_sizes).map do |col, exp_size| + col + (if col.size < exp_size then + (" " * (exp_size - col.size)) + else "" end) + end.join(" | "), + ("-" * (row_sizes.sum + 2 + (row_sizes.length - 1) * 3)) + ] + else [] end + + map do |row| + " " + row.zip(row_sizes).map do |col, exp_size| + col = col.to_s + if col.size < exp_size + then col.center(exp_size) + else col + end + end.join(" | ") + end + ).join("\n") + end + + def tabulate_schemaless_records + keys = map {|r| r.keys}.flatten.unique.sort + + [ keys , + map {|r| keys.map {|k| r[k] }} + ] + end + + def for_all + each { |v| return false unless yield v } + true; + end + + def each_prefix + each_index do |i| + yield self[0..i]; + end + end + + def select + map { |x| x if yield x }.compact + end + + def cogroup + ret = Hash.new { |h,k| h[k] = [nil] * size } + each_index do |i| + self[i].each do |k, v| + ret[k][i] = v + end + end + ret + end + + # Return every cnt'th element of the array. + def every(cnt, start = 0) + (0..(((size-1-start)/cnt).to_i)).map { |i| self[i*cnt+start] } + end + + # Create batches of up to size cnt. 
+ def batch(cnt) + (0..(((size-1)/cnt).to_i)).map { |i| self[(i*cnt)...((i+1)*cnt)] } + end + + def flatmap + ret = [] + each { |i| ret = ret + yield(i) } + ret + end + + def project(*keys) + map { |x| x.project(*keys) } + end + + def unique + last = nil + sort. + map { |c| last = c if c != last }. +# map { |c| p c }. + compact + end + + def histogram(bin_width = 5) + min_val = (min - min % bin_width).to_i + max_val = (max - max % bin_width + bin_width).to_i + + (min_val..max_val).to_a.every(bin_width). + map { |x| [x, 0] }. + to_h. + join(map { |x| (x.to_f / bin_width).to_i * bin_width }. + reduce { |k,v| v.count }, + :left + ). + map { |bin, cnt| [bin, cnt.compact.sum] }. + sort { |a, b| a[0] <=> b[0] } + end + + def cumulative_sum + tot = 0; + map { |x| tot += x } + end + + def splice(val, idx) + return [val] + self if idx <= 0 + return self + [val] if idx >= length + return self[0...idx] + [val] + self[idx..-1] + end + + def all_sorts + return [[]] if empty? + return [self] if length == 1 + hd = self[0] + self[1..-1].all_sorts.map do |rest| + (0..rest.length).map { |i| rest.splice(hd, i) } + end.flatten(1) + end + + def merge(other, args = {}) + if args.has_key?(:eq) + args[:eq] = [args[:eq], args[:eq]] unless args[:eq].is_a? 
Array + a, b = args[:eq] + idx = Hash.new { |h,k| h[k] = [] } + self.each {|i| idx[i[a]].push i } + other.map {|j| idx[i[b]].map { |i| i + j } }.flatten(1) + else + self.map {|i| + other.map {|j| + i + j if yield i,j + }.compact + }.flatten(1) + end + end + + def where + map {|i| i if yield i }.compact + end +end + +class Hash + def intersect(other) + keys.find_all { |k| other.has_key?(k) } + end + + def bar_graph_dataset(bar = 0.5, set_sep = 1.0, bar_sep = 0.2) + curr_width = 0; + tics = collect do |human,data| + next_delta = data.length * bar + (data.length - 1) * bar_sep; + curr_width += next_delta + set_sep; + "\"#{human}\" #{curr_width - next_delta / 2}" + end + + curr_width = 0; + points = values.collect do |data| + curr_width += set_sep - bar_sep + data.collect do |point| + curr_width += bar_sep + bar; + [curr_width - bar / 2, point] + end + end.unzip; + + return ["(#{tics.join(', ')})" , points, "[0:#{curr_width+set_sep}]"]; + end + + def to_sorted_a + keys.sort.map do |k| + [k, self[k]] + end + end + + def map_leaves(prefix = []) + keys.to_a.map do |k| + [ k, + if self[k].is_a? Hash + then self[k].map_leaves(prefix+[k]) { |ik,v| yield(ik, v) } + else yield(prefix+[k], v) + end + ] + end.to_h + end + + def project(*keys) + keys.map { |k| self[k] } + end + + def join(h, outer = :no) + case outer + when :full then + keys + h.keys.find_all { |k| not has_key? k } + when :left then + keys + when :right then + h.keys + else + intersect(h) + end. + map { |k| [k, [self[k], h[k]]] }.to_h + end + + def flatten_tree(sep = nil, prefix = nil) + map { |k,v| + unless prefix.nil? + k = sep + k.to_s unless sep.nil? 
+ k = prefix.to_s + k.to_s + end + case v + when Hash then v.flatten_tree(sep, k).to_a + else [ [k.to_sym, v] ] + end + }.flatten(1).to_h + end +end + +class Float + def sig_figs(n) + if self == 0.0 then self + else + mult = (10.0 ** (Math.log10(self).ceil.to_f - n.to_i.to_f)) + (self / mult).round * mult; + end + end +end + +class IO + def tee_readlines + ret = []; + each { |l| yield l; ret.push l } + ret + end + + def grep + map {|x| x if yield x}.compact + end +end + +class File + def File.stream(inFile, outFile, mode = "w+") + File.open(inFile) do |inHandle| + File.open(outFile, mode) do |outHandle| + yield(inHandle, outHandle) + end + end + end +end + +class Integer + def to_bytestring + return "-#{(-self).to_bytestring}" if self < 0; + depth = (Math.log(self/2) / (10.0 * Math.log(2))).to_i + scales = ["B", "KB", "MB", "GB", "PB", "EB"]; + depth = scales.length-1 if depth >= scales.length; + "#{(self.to_f / (1024.0**(depth))).to_f.sig_figs(4)} #{scales[depth]}" + end + + def d(die) + (0...self).map { rand(die)+1 } + end +end + +class String + def pluralize(num) + if num == 1 then self + else self+"s" + end + end +end + +class Dir + def Dir.in_dir(d) + old_d = Dir.getwd + Dir.chdir d + ret = yield + Dir.chdir old_d + ret + end +end + diff --git a/slides/talks/2018-1-Tour-Mimir/index.html b/slides/talks/2018-1-Tour-Mimir/index.html index ee5acf71..1fa5745b 100644 --- a/slides/talks/2018-1-Tour-Mimir/index.html +++ b/slides/talks/2018-1-Tour-Mimir/index.html @@ -288,1210 +288,51 @@ State of the art: DataGuide, Wrangler, etc...

+ +

Loading requires curation...

Data Curation is Hard!

-

State of the Art

- - - (skilledup.com) - -

Alice spends weeks curating her data before using it.

+ Automation: Put up a bunch of figure PNGs from papers on ER, schema detection, missing value repair, etc...
-

Relational databases make this worse...

-

The data needs... -

-

-

This is all required upfront. Before asking a single question.

-
-
- -
- -
-

Relational DBs are useless in early stages of curation.

-

Why?

+ Oops: 12-year-old terrorist, etc...
-

- In the name of Codd,
thou shalt not give the user a wrong answer. -

-

There are tons of good heuristics available for guessing how to clean data.

+ Problem: Fox graph vs https://blog.udacity.com/wp-content/uploads/2014/11/data_analysis1.jpg
-

- Thou shalt not give the user a wrong answer. -

- -

- ... but what if we did? -

-

- What would it take for that to be ok? -

+ ProbDBs are a possible solution, but... + +Label , PDB-1, PDB-2, PDB-3, TPCH-1, TPCH-3, TPCH-5, TPCH-9 +SQLite , 9.521, 7.59, 31.22, 19.561, 22.835, 33.308, 51.125 +MayBMS-SQLite, 22.1345477, 7.291376699999999, 29.1511957 +MayBMS-PGSql , 23.439012999999996, 13.000651999999999, 20.2954832 +Sampling , TIMEOUT, 242.5666234549135, 300, 119.61607021316885, 162.00108394436538, 258.74168805666267, 300 + +
-

Industry says...

-
- - - - - - - - - - - -              -

My phone is guessing, but is letting me know that it did

- Apple iOS 10; Phone App -
- -
- -

Good Explanations, Alternatives, and Feedback Vectors

- Bing Translate (c.a. 2016) -
- -
-

Communication

- -
- -
-

What if a database did the same?

-

(they can)

-
- -
- -
-
-

On representing incomplete information in a relational data base

-

T. Imielinski & W. Lipski Jr.(VLDB 1981)

-

- Incomplete and Probabilistic Databases
have existed since the 1980s -

-
- -
- - - - - - - - - - - - Q(D) - - - - - - Q(D) - Q(D) - Q(D) - - - - - - - - - - - ? - - - - - - - - Probability - Expectation - Variance - Histogram - - - - - - -

- We've gotten good at query processing on uncertain data.
- But not sourcing uncertain data - ... or communicating results to humans. -

-
- -
-

Challenges

- -

A small shift in how we think about PDBs addresses all three points.

-
-
- -
-
-

It's not the data that's uncertain,
it's the interpretation

-
- -
- - - - - - - - -
TimeSensor ReadingTemp Around Sensor
131.6Roughly 31.6˚C
2-999Around 30˚C?
328.1Roughly 28.1˚C?
432.2Roughly 32.2˚C
-

The reading is deterministic

-

... but what we care about is what the reading measures

-
- -
- - - - - - - - Q1(D) - Q2(D) - Q3(D) - Q4(D) - - - - - - - - - - - - - - -

Insight: Treat data as 100% deterministic.

-

Instead, queries propose alternative interpretations.

-
- -
-

ratings1.rating matches either ratings2.numratings or ratings2.evaluation.

-

-	SELECT pid, rating FROM ratings1 UNION ALL
-	SELECT pid, num_ratings AS rating FROM ratings2;
-					
- or -

-	SELECT pid, rating FROM ratings1 UNION ALL
-	SELECT pid, evaluation AS rating FROM ratings2;
-					
-
- -
-

Repair missing values in rating

-

- SELECT pid, CASE WHEN rating is null 
-             THEN interpolate(...) ELSE rating END AS rating 
- FROM ratings;
-					
- or -

- SELECT pid, CASE WHEN rating is null 
-             THEN classifier(...) ELSE rating END AS rating 
- FROM ratings;
-					
- or ... -
- -
-

Effects

-
    -
  1. It's clear where uncertainty comes from.
  2. -
  3. Results can be communicated through provenance.
  4. -
  5. Query evaluation is decoupled from physical layout.
  6. -
-
-
- -
- -
-

Non-Deterministic Queries

-
- -
- - - - - - Q(D) - - - - - - - - Q1(D) - - 1 - - - - - - Q2(D) - - 2 - - - - - - Q1(D) - - 1 - - - - - - - - -

- Non-deterministic queries reference an external configuration. -

- (OpenClipArt.org) -
- - - -
-

VGTerms

-

A $VGTerm(\ldots)$ references configuration parameters
(aka "variables").

- - Lenses: An On-Demand Approach to ETL; Yang et. al.; VLDB 2015 - -
- -
-

$VGTerm()$s can be used like normal expressions

-

-                  SELECT A, VGTerm('X', B) AS C FROM R;
-					
- -
- - - - - - - - -
RAB
12
34
54
- - - - - -
Q(R)AC
1$X_2$
3$X_4$
5$X_4$
-
-
 
-

- ... variables are identified by a family (i.e. $'X'$),
and optional indexes (i.e., $B$). -

-
- - - -
-

Schema Matching

-
- $$ratings2(pid, num\_ratings, evaluation) \rightarrow (pid, rating)$$ -
-

- SELECT 
-    pid, 
-    CASE VGTerm('MATCH_RATING') 
-      WHEN 'NUM_RATINGS' THEN num_ratings
-      WHEN 'EVALUATION'  THEN evaluation
-      ELSE null
-    END AS rating
- FROM ratings2;
-					
-

- One global configuration variable decides which column gets mapped to "rating". -

-
- -
-

Missing Value Imputation

-
- $$ratings1(pid, rating, review\_ct) \text{ s.t. } rating \text{ is not NULL}$$ -
-

- SELECT 
-    pid, 
-    CASE WHEN rating IS NULL
-         THEN VGTerm('RATING', ROWID) 
-         ELSE rating
-      END AS rating,
-    review_ct
- FROM ratings1;
-					
-

- A family of variables indexed by ROWID represent each imputed value. -

-
- -
- -
-
-

Defining Configurations

- - - - - Config. - - - - - Model - - - Model - - - Model - - - - - - All assignments for one family. - - - - Description of the family in English. - - - - Other feasible assignments. - - - - - - Config. - - Config. - - - - - (Best) - - - - -

- Models designate one "best-guess" configuration. -

-
- -
-

Example Models

- - -
- -
-

Convenience Operators: Lenses

- -

Lenses instantiate/train a model and wrap a query

- - - †Lenses: An On-Demand Approach to ETL; Yang et. al.; VLDB 2015
- *Adaptive Schema Databases; Spoth et. al.; CIDR 2017 -
-
- -
- -
- -
-

Probabilistic ETL

-
- -
-

ETL: Extract/Transform/Load

-

One big query that gets you to a clean dataset

- -

Challenge: Designing ETL pipelines can be a full-time job.

-
- -
- -

Mimir starts with the default "guess" configuration.

- -

As users explore, they validate or refine guesses for configuration variables as necessary.

- - -
- -
-

Useful Provenance Questions

-
    -
  1. How much of my query result is affected by unvalidated variables?
  2. -
  3. Which variables affect my query results?
  4. -
  5. How bad is the situation?
  6. -
-
- -
- -
- -
-

Provenance Question 1

-

How much of my query result is affected by unvalidated variables?

- -

Idea: Mark values in query results that depend on unvalidated variables.

-
- -
- - Communicating Data Quality in On-Demand Curation; Kumari et. al.; QDB 2016 -
- -
- -
- -
-

Non-Determinism Taint

-

- SELECT A, VGTerm('X', ROWID) AS B FROM R;
-					
-↓    ↓    ↓    ↓ -

- SELECT A, VGTerm('X', ROWID) AS B, 
-        FALSE AS ROW_TAINTED,
-        FALSE AS A_TAINTED,
-        TRUE AS B_TAINTED
-  FROM R;
-					
-

The Mimir compiler adds *_TAINTED fields to each row.

-
- -
-

Non-Determinism Taint

- -
-
A row is untainted if...
-
... we can guarantee that it (or a counterpart) appears in the result regardless of configuration.
-
A cell is untainted if...
-
... we can guarantee that its value in the result is independent of the configuration.
-
-
- - -
-

Non-Determinism Taint

-

- SELECT A, CASE WHEN B IS NULL 
-                THEN VGTerm('X', ROWID) 
-                ELSE B END AS B 
- FROM R;
-					
-↓    ↓    ↓    ↓ -

- SELECT A, CASE WHEN B IS NULL 
-                THEN VGTerm('X', ROWID) 
-                ELSE B END AS B, 
-        FALSE AS ROW_TAINTED, FALSE AS A_TAINTED,
-        (B IS NULL) AS B_TAINTED
-  FROM R;
-					
-

Expressions with VGTerms can be conditionally tainted.

-
- - - - -
-

- CREATE VIEW R_CLEANED AS
-    SELECT A, CASE WHEN B IS NULL 
-                   THEN VGTerm('X', ROWID) 
-                   ELSE B END AS B 
-    FROM R;
-
- SELECT A, SUM(B) AS B FROM R_CLEANED GROUP BY A;
-					
-↓    ↓    ↓    ↓ -

- SELECT A, SUM(B) AS B, 
-        FALSE AS A_TAINTED,
-        GROUP_OR(B_TAINTED OR ROW_TAINTED) 
-          OR (SELECT GROUP_OR(A_TAINTED) FROM R_CLEANED) AS B_TAINTED
-        GROUP_AND(A_TAINTED OR ROW_TAINTED) AS ROW_TAINTED
- FROM R_CLEANED;
-					
-

Aggregates work too!

-
- -
-

Taint Benefits

- -
- -
-

Taint Limitations

- -

In spite of this, taint works well in practice.

- *Ongong work w/ Su Feng, Aaron Huber, Boris Glavic -
- -
- -
- -
-

Provenance Question 2

-

Which variables affect my query results?

-

Idea: Static dependency analysis produces a list of variable families and queries to generate all relevant indexes.

- Mimir: Bringing CTables into Practice; Nandi et. al.; ArXiV -
- -
- -
- - -
-

Provenance Question 3

-

How bad is the situation?

-

Idea: Sample from the space of alternatives to... -

-

-
- -
- -
- -
-

Sampling is slooooow

-
- -
-

Trivial Sampling

-

Evaluate the query $N$ times.
Plug in samples instead of best guesses.

-
-

Better Solutions

-

Merge evaluation to mitigate redundancy.

-
-
- -
-

Sparse Encoding

- - -
- - - - - - - -
$R_1$AB
12
34
$R_2$AB
15
-
- ➔ - - - - - - -
$R_{sparse}$ABS#
121
341
152
-
-
- -
-

Tuple Bundles

- - -
- - - - - - - -
$R_1$AB
12
34
$R_2$AB
15
-
- ➔ - - - - - -
$R_{bundle}$AB$\phi$
1[2,5][T,T]
34[T,F]
-
-
- -
- -
- -
- -

Which one to use?

-
- -
-

Either!

- -

Mimir isn't committed to one fixed data representation.

- -

(optimization is a work in progress)

-
- -
- -
- -

Demo

@@ -1527,6 +368,33 @@ CREATE VIEW R_CLEANED AS transition: 'fade', // none/fade/slide/convex/concave/zoom + chart: { + defaults: { + global: { + title: { fontColor: "#333" }, + legend: { + labels: { fontColor: "#333", fontSize: 20 }, + }, + responsiveness: true + }, + scale: { + scaleLabel: { fontColor: "#333", fontSize: 20 }, + gridLines: { color: "#333", zeroLineColor: "#333" }, + ticks: { fontColor: "#333", fontSize: 16 }, + } + }, + line: { borderColor: [ "rgba(20,220,220,.8)" , "rgba(220,120,120,.8)", "rgba(20,120,220,.8)" ], "borderDash": [ [5,10], [0,0] ]}, + bar: { backgroundColor: [ + "rgba(220,220,220,0.8)", + "rgba(151,187,205,0.8)", + "rgba(205,151,187,0.8)", + "rgba(187,205,151,0.8)" + ] + }, + pie: { backgroundColor: [ ["rgba(0,0,0,.8)" , "rgba(220,20,20,.8)", "rgba(20,220,20,.8)", "rgba(220,220,20,.8)", "rgba(20,20,220,.8)"] ]}, + radar: { borderColor: [ "rgba(20,220,220,.8)" , "rgba(220,120,120,.8)", "rgba(20,120,220,.8)" ]}, + }, + // Optional ../reveal.js plugins dependencies: [ { src: '../reveal.js-3.5.0/lib/js/classList.js', condition: function() { return !document.body.classList; } }, @@ -1538,7 +406,11 @@ CREATE VIEW R_CLEANED AS { src: '../reveal.js-3.5.0/plugin/markdown/markdown.js', condition: function() { return !!document.querySelector( '[data-markdown]' ); } }, { src: '../reveal.js-3.5.0/plugin/highlight/highlight.js', async: true, condition: function() { return !!document.querySelector( 'pre code' ); }, callback: function() { hljs.initHighlightingOnLoad(); } }, { src: '../reveal.js-3.5.0/plugin/zoom-js/zoom.js', async: true }, - { src: '../reveal.js-3.5.0/plugin/notes/notes.js', async: true } + { src: '../reveal.js-3.5.0/plugin/notes/notes.js', async: true }, + // Chart.min.js + { src: '../reveal.js-3.5.0/plugin/chart/Chart.min.js'}, + // the plugin + { src: '../reveal.js-3.5.0/plugin/chart/csv2chart.js'} ] });