[libc++] Produce summary reports in compare-benchmarks
This patch adds the ability to produce a summary report with a few KPIs
in the compare-benchmarks script. This is useful for regularly monitoring
the library's progress on these KPIs.
Example usage:
compare-benchmarks libstdcxx.lnt llvm-20.lnt llvm-21.lnt main.lnt \
--series-names "GNU,LLVM 20,LLVM 21,LLVM main" \
--format kpi \
--noise-threshold 0.1 \
--meta-candidate 'LLVM'
This would produce a short report showing how the benchmarks evolve
across the given LLVM releases, compared against a GNU baseline.
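For illustration only (not part of the patch): a minimal, self-contained sketch
of the bucketing behind the KPI report, assuming a pandas data frame with one
column per input series. The column names, sample timings and thresholds below
are hypothetical.

import pandas as pd

# Hypothetical timings: one row per benchmark, one column per series
# (series_0 = baseline, series_1 = candidate). Lower is faster.
data = pd.DataFrame({
    'series_0': [10.0, 20.0, 30.0, 40.0],
    'series_1': [8.0, 20.5, 45.0, 70.0],
}, index=['find', 'sort', 'copy', 'hash'])

noise, extrema = 0.1, 0.5  # cf. --noise-threshold and --top-performer-threshold
pct = (data['series_1'] - data['series_0']) / data['series_0']

faster = data[(data['series_0'] > data['series_1']) & (pct.abs() > noise)]
neutral = data[pct.abs() <= noise]
slower = data[(data['series_0'] < data['series_1']) & (pct.abs() > noise)]
worst = data[(data['series_0'] < data['series_1']) & (pct.abs() >= extrema)]
best = data[(data['series_0'] > data['series_1']) & (pct.abs() >= extrema)]

# Each KPI is the fraction of all benchmarks that falls into a bucket.
kpis = [len(bucket) / len(data) for bucket in (faster, neutral, slower, worst, best)]
print(kpis)  # [0.25, 0.25, 0.5, 0.5, 0.0]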
GitOrigin-RevId: 20c67c75ec3a08938808646f849a98fd6141ce74
diff --git a/utils/compare-benchmarks b/utils/compare-benchmarks
index b5bd880..63e1e6b 100755
--- a/utils/compare-benchmarks
+++ b/utils/compare-benchmarks
@@ -86,6 +86,46 @@
figure.update_layout(xaxis_title='', yaxis_title='', legend_title='')
return figure
+def produce_kpis(data, noise, extrema, series, series_names, meta_candidate, title):
+ addendum = f"{noise:.0%} noise threshold, based on {len(data)} benchmarks"
+ top_addendum = f"by >= {extrema:.0%}, {noise:.0%} noise threshold, based on {len(data)} benchmarks"
+ headers = [title if title else '']
+ columns = [[
+ f'Benchmarks where {meta_candidate} is faster than {series_names[0]} ({addendum})',
+ f'Neutral benchmarks ({addendum})',
+ f'Benchmarks where {meta_candidate} is slower than {series_names[0]} ({addendum})',
+ f'Worst performers ({top_addendum})',
+ f'Best performers ({top_addendum})',
+ ]]
+ fmt = [None]
+
+ def compute_kpis(base, cand):
+ diff = data[cand] - data[base]
+ pct = diff / data[base]
+ faster = data[(data[base] > data[cand]) & (pct.abs() > noise)]
+ neutral = data[pct.abs() <= noise]
+ slower = data[(data[base] < data[cand]) & (pct.abs() > noise)]
+ worst = data[(data[base] < data[cand]) & (pct.abs() >= extrema)]
+ best = data[(data[base] > data[cand]) & (pct.abs() >= extrema)]
+ return list(map(lambda k: len(k) / len(data), [faster, neutral, slower, worst, best]))
+
+ baseline = series[0]
+ for (i, candidate) in enumerate(series[1:], start=1):
+ kpis = compute_kpis(baseline, candidate)
+ if i > 1: # after the first series, also generate a relative difference
+ previous_kpis = columns[-1]
+ rel_deltas = [(kpis[k] - previous_kpis[k]) / previous_kpis[k] for k in range(len(kpis))]
+ headers.append('rel Δ')
+ columns.append(rel_deltas)
+ fmt.append('+.2%')
+
+ headers.append(series_names[i])
+ columns.append(kpis)
+ fmt.append('.2%')
+
+ rows = list(zip(*columns))
+ print(tabulate.tabulate(rows, headers=headers, floatfmt=fmt))
+
def main(argv):
parser = argparse.ArgumentParser(
prog='compare-benchmarks',
@@ -113,9 +153,10 @@
'`candidate` (sort using the absolute number of the candidate run), '
'and `percent_diff` (sort using the percent difference between the baseline and the candidate). '
'Note that when more than two input files are compared, the only valid sorting order is `benchmark`.')
- parser.add_argument('--format', type=str, choices=['text', 'chart'], default='text',
- help='Select the output format. `text` generates a plain-text comparison in tabular form, and `chart` '
- 'generates a self-contained HTML graph that can be opened in a browser. The default is `text`.')
+ parser.add_argument('--format', type=str, choices=['text', 'chart', 'kpi'], default='text',
+ help='Select the output format. `text` generates a plain-text comparison in tabular form, `chart` '
+ 'generates a self-contained HTML graph that can be opened in a browser, and `kpi` generates a '
+ 'summary report based on a few KPIs. The default is `text`.')
parser.add_argument('--open', action='store_true',
help='Whether to automatically open the generated HTML file when finished. This option only makes sense '
'when the output format is `chart`.')
@@ -125,6 +166,24 @@
parser.add_argument('--subtitle', type=str, required=False,
help='Optional subtitle to use for the chart. This can be used to help identify the contents of the chart. '
'This option cannot be used with the plain text output.')
+ parser.add_argument('--noise-threshold', type=float, required=False,
+ help='Noise threshold used when computing the KPIs. This is a floating point number between '
+ '0 and 1 that represents the percentage of difference required between two results in order '
+ 'for them not to be considered "within the noise" of each other.')
+ parser.add_argument('--top-performer-threshold', type=float, required=False, default=0.5,
+ help='Threshold used by the KPI report to determine top (and worst) performers. This is a floating '
+ 'point number between 0 and 1 that represents the percentage of difference required to consider '
+ 'a benchmark to be a top/worst performer. For example, if this number is 0.5, we consider top/worst '
+ 'performers in the data to be benchmarks that differ by at least 50%% between the baseline '
+ 'and the candidate.')
+ parser.add_argument('--meta-candidate', type=str, required=False,
+ help='The name to use for the candidate when producing a KPI report. Required for --format=kpi.')
+ parser.add_argument('--discard-benchmarks-introduced-after', type=str, required=False,
+ help='Discard benchmarks introduced after the given candidate. This is useful to stabilize reports '
+ 'when new benchmarks are introduced as time goes on, which would change the total number of '
+ 'benchmarks and hence appear to retroactively change the report for previous candidates. '
+ 'If used, the name given here must correspond to the name of a series (as passed to, or '
+ 'defaulted via, `--series-names`).')
args = parser.parse_args(argv)
# Validate arguments (the values admissible for various arguments depend on other
@@ -137,6 +196,14 @@
if args.open:
parser.error('Passing --open makes no sense with --format=text')
+ if args.format == 'kpi':
+ if args.open:
+ parser.error('Passing --open makes no sense with --format=kpi')
+ if args.noise_threshold is None:
+ parser.error('--format=kpi requires passing a --noise-threshold')
+ if args.meta_candidate is None:
+ parser.error('--format=kpi requires passing a --meta-candidate')
+
if len(args.files) != 2 and args.sort != 'benchmark':
parser.error('Using any sort order other than `benchmark` requires exactly two input files.')
@@ -182,6 +249,19 @@
do_open = args.output is None or args.open
output = args.output or tempfile.NamedTemporaryFile(suffix='.html').name
plotly.io.write_html(figure, file=output, auto_open=do_open)
+ elif args.format == 'kpi':
+ if args.discard_benchmarks_introduced_after is not None:
+ index = args.series_names.index(args.discard_benchmarks_introduced_after)
+ series_to_filter = [f'{args.metric}_{i}' for i in range(index+1, len(lnt_inputs))]
+ for candidate in series_to_filter:
+ first_candidate = f'{args.metric}_1'
+ data = data[~(data[first_candidate].isna() & data[candidate].notna())]
+ produce_kpis(data, noise=args.noise_threshold,
+ extrema=args.top_performer_threshold,
+ series=[f'{args.metric}_{i}' for i in range(len(lnt_inputs))],
+ series_names=args.series_names,
+ meta_candidate=args.meta_candidate,
+ title=args.subtitle)
else:
diff = plain_text_comparison(data, args.metric, baseline_name=args.series_names[0],
candidate_name=args.series_names[1])