paper-Vizier-SpreadsheetOve.../results/gen_graph.py

186 lines
4.8 KiB
Python

import re
#import matplotlib as mpl
import matplotlib.pyplot as plt
#import numpy as np
def read_dataspread(testbed, experiment):
    """Parse DataSpread timings from ``{testbed}-dataspread-{experiment}.log``.

    Lines shaped like ``<prefix>@<size>: <stage>: <seconds>`` become records;
    every other line is skipped silently.  Each line that is parsed is echoed
    to stdout.

    Args:
        testbed: Platform name; used in the file name and stored as field 0.
        experiment: Experiment name; used in the file name and stored as
            field 5.

    Returns:
        A list of ``(testbed, "dataspread", size, stage, seconds, experiment)``
        tuples in file order, where ``size`` is an int, ``stage`` is
        lower-cased and ``seconds`` is a float.

    Raises:
        OSError: If the log file cannot be opened.
        ValueError: If the matched time is not a valid float (e.g. ``1.2.3``).
    """
    # The optional exponent keeps values written in scientific notation
    # (e.g. "5.0E-4") from being silently truncated to their mantissa.
    pattern = re.compile(
        r"(.+)@(\d+): ([^:]+): ([0-9.]+(?:[eE][-+]?[0-9]+)?)"
    )

    def extract(line):
        """Return a one-element list with the parsed record, or ``[]``."""
        match = pattern.match(line)
        if match is None:
            return []
        print(line)
        return [(
            testbed,
            "dataspread",
            int(match.group(2)),
            match.group(3).lower(),
            float(match.group(4)),
            experiment,
        )]

    with open(f"{testbed}-dataspread-{experiment}.log") as f:
        return [record for line in f for record in extract(line)]
def read_vizier(testbed, experiment):
    """Parse Vizier timings from ``{testbed}-vizier-{experiment}.log``.

    Lines shaped like ``<prefix>@<size>/<true|false>: <stage>: <seconds>``
    become records.  Lines that do not match are echoed to stdout and skipped.

    Args:
        testbed: Platform name; used in the file name and stored as field 0.
        experiment: Experiment name; used in the file name and stored as
            field 5.

    Returns:
        A list of ``(testbed, system, size, stage, seconds, experiment)``
        tuples in file order.  ``system`` is ``"vizier-batch"`` when the flag
        after the slash is ``true`` and ``"vizier"`` otherwise; ``size`` is an
        int, ``stage`` is lower-cased and ``seconds`` is a float.

    Raises:
        OSError: If the log file cannot be opened.
        ValueError: If the matched time is not a valid float (e.g. ``1.2.3``).
    """
    # The optional exponent keeps values written in scientific notation
    # (e.g. "5.0E-4") from being silently truncated to their mantissa.
    pattern = re.compile(
        r"(.*)@(\d+)/(true|false): ([^:]+): ([0-9.]+(?:[eE][-+]?[0-9]+)?)"
    )

    def extract(line):
        """Return a one-element list with the parsed record, or ``[]``."""
        match = pattern.match(line)
        if match is None:
            # Surface lines that were dropped so bad input is visible.
            print(line)
            return []
        return [(
            testbed,
            "vizier-batch" if match.group(3) == "true" else "vizier",
            int(match.group(2)),
            match.group(4).lower(),
            float(match.group(5)),
            experiment,
        )]

    with open(f"{testbed}-vizier-{experiment}.log") as f:
        return [record for line in f for record in extract(line)]
# Schema of each record (a 6-tuple, indexed by position):
#   0. testbed platform: 'desktop' or 'laptop'
#   1. system: 'vizier', 'vizier-batch', or 'dataspread'
#   2. data-size: int (number of rows of lineitem)
#   3. test-stage:
#        'init spreadsheet'    - time to load the spreadsheet
#        'init formulas'       - time to start processing formulas
#        'monitoring overhead' - Vizier-specific; not relevant
#        'update one'          - time to update one cell
#        'update all'          - time to update an entire column (not used)
#        'init'                - derived: 'init spreadsheet' + 'init formulas'
#   4. time: float (number of seconds)
#   5. experiment: 'varystart', 'varysize', or 'varystartandsize'
# Load every laptop log: for each experiment, Vizier first and DataSpread
# second, flattened into a single list of records.
data = [
    row
    for experiment_name in ("varystart", "varysize", "varystartandsize")
    for reader in (read_vizier, read_dataspread)
    for row in reader("laptop", experiment_name)
]

# Distinct stage names (field 3) and data sizes (field 2) seen in the logs.
stages = {row[3] for row in data}
sizes = {row[2] for row in data}
# X-axis caption used for each experiment's plots.
experiment_xlabels = dict([
    ("varystart", "First visible row"),
    ("varysize", "Number of rows"),
    ("varystartandsize", "Number of rows"),
])

# Per system: (legend label, matplotlib format string).  Insertion order is
# the order in which the series are drawn.
system_labels = {
    name: (legend, fmt)
    for name, legend, fmt in (
        ("vizier", "Vizier", "v-"),
        ("vizier-batch", "Vizier (Simulated Batching)", "^-"),
        ("dataspread", "DataSpread", "o-"),
    )
}
# Total initialisation time per (testbed, system, size, experiment): the sum
# of the stages listed in init_fields.
init_costs = {}
init_fields = ["init spreadsheet", "init formulas"]

for record in data:
    if record[3] not in init_fields:
        continue
    key = (record[0], record[1], record[2], record[5])
    print(key)
    init_costs[key] = init_costs.get(key, 0) + record[4]

# Append one synthetic "init" record per aggregated key so the totals can be
# selected like any other stage.
data.extend(
    (group[0], group[1], group[2], "init", total, group[3])
    for group, total in init_costs.items()
)

# Debug output: all records, then the stage names found in the raw logs.
print(data)
print(stages)
def plot_one(testbed, stage, experiment):
    """Plot one timing series per system and save it as PDF, PNG and CSV.

    Selects the module-level ``data`` records whose testbed, stage and
    experiment fields match the arguments, and draws one line per entry of
    ``system_labels`` on log-log axes (x = data size, y = time in seconds).

    Files written to the current directory:

    * ``{testbed}-{stage}-{experiment}-{system}.csv`` -- the plotted points,
      one ``size,time`` line each.  ``stage`` is used verbatim here, so any
      spaces stay in the file name.
    * ``{testbed}-{stage}-{experiment}.pdf`` and ``.png`` -- the figure, with
      spaces in ``stage`` replaced by underscores.

    Args:
        testbed: Platform name to select (field 0 of a record).
        stage: Test stage to select (field 3 of a record).
        experiment: Experiment to select (field 5 of a record); must be a key
            of ``experiment_xlabels``.

    Raises:
        KeyError: If ``experiment`` has no entry in ``experiment_xlabels``.
    """
    fig, ax = plt.subplots(
        figsize=(4, 2),
        constrained_layout=True,
    )
    try:
        ax.set_ylabel("Time (s)")
        ax.set_xlabel(experiment_xlabels[experiment])
        ax.set_xscale("log")
        ax.set_yscale("log")

        for system, (label, marker) in system_labels.items():
            # (size, seconds) pairs for this system, ordered by size only so
            # that equal sizes keep their original relative order.
            points = sorted(
                (
                    (record[2], record[4])
                    for record in data
                    if record[0] == testbed
                    and record[1] == system
                    and record[3] == stage
                    and record[5] == experiment
                ),
                key=lambda pt: pt[0],
            )
            ax.plot(
                [pt[0] for pt in points],
                [pt[1] for pt in points],
                marker,
                label=label,
            )
            # NOTE(review): the CSV name keeps the raw stage (with spaces),
            # unlike the figure files below; left as-is because consumers of
            # these files may rely on the existing names.
            with open(f"{testbed}-{stage}-{experiment}-{system}.csv", "w") as f:
                f.writelines(
                    ",".join(str(c) for c in pt) + "\n" for pt in points
                )

        ax.legend()
        file_stage = stage.replace(" ", "_")
        fig.savefig(f"{testbed}-{file_stage}-{experiment}.pdf")
        fig.savefig(f"{testbed}-{file_stage}-{experiment}.png")
    finally:
        # pyplot keeps every figure alive until it is closed explicitly;
        # release it so repeated calls do not accumulate open figures.
        plt.close(fig)
# Render the combined "init" cost and the single-cell update cost for every
# experiment.  The individual "init spreadsheet" / "init formulas" stages are
# intentionally not plotted; add them to the inner tuple to plot them as well.
for experiment_name in ("varystart", "varysize", "varystartandsize"):
    for stage_name in ("init", "update one"):
        plot_one("laptop", stage_name, experiment_name)