[libc++] Produce summary reports in compare-benchmarks

This patch adds the ability to produce a summary report with a few KPIs
in the compare-benchmarks script. This is useful for regularly monitoring
the library's progress on those KPIs.

Example usage:

     compare-benchmarks libstdcxx.lnt llvm-20.lnt llvm-21.lnt main.lnt     \
         --series-names "GNU,LLVM 20,LLVM 21,LLVM main"                    \
         --format kpi                                                      \
         --noise-threshold 0.1                                             \
         --meta-candidate 'LLVM'

This would produce a short report showing the evolution of benchmarks
in the given LLVM releases as compared to a GNU baseline.
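
The resulting table contains one row per KPI and one column per candidate
series, with a relative-delta column between consecutive candidates. It is
roughly structured as follows (placeholder values):

                                                        LLVM 20    rel Δ    LLVM 21   ...
     Benchmarks where LLVM is faster than GNU (...)      xx.xx%   +x.xx%     xx.xx%   ...
     Neutral benchmarks (...)                            xx.xx%   +x.xx%     xx.xx%   ...
     Benchmarks where LLVM is slower than GNU (...)      xx.xx%   +x.xx%     xx.xx%   ...
     Worst performers (...)                              xx.xx%   +x.xx%     xx.xx%   ...
     Best performers (...)                               xx.xx%   +x.xx%     xx.xx%   ...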

NOKEYCHECK=True
GitOrigin-RevId: 20c67c75ec3a08938808646f849a98fd6141ce74
diff --git a/utils/compare-benchmarks b/utils/compare-benchmarks
index b5bd880..63e1e6b 100755
--- a/utils/compare-benchmarks
+++ b/utils/compare-benchmarks
@@ -86,6 +86,46 @@
     figure.update_layout(xaxis_title='', yaxis_title='', legend_title='')
     return figure
 
+def produce_kpis(data, noise, extrema, series, series_names, meta_candidate, title):
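+    # Print a summary table: the first column holds the KPI descriptions (headed by
+    # `title`, if given), followed by one column per candidate series with a relative
+    # delta column between consecutive candidates. `noise` and `extrema` are fractions
+    # between 0 and 1; `series` holds the data column names and `series_names` their
+    # display names, the first entry being the baseline.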
+    addendum = f"{noise * 100:g}% noise threshold, based on {len(data)} benchmarks"
+    top_addendum = f"by >= {extrema * 100:g}%, {noise * 100:g}% noise threshold, based on {len(data)} benchmarks"
+    headers = [title if title else '']
+    columns = [[
+        f'Benchmarks where {meta_candidate} is faster than {series_names[0]} ({addendum})',
+        f'Neutral benchmarks ({addendum})',
+        f'Benchmarks where {meta_candidate} is slower than {series_names[0]} ({addendum})',
+        f'Worst performers ({top_addendum})',
+        f'Best performers ({top_addendum})',
+    ]]
+    fmt = [None]
+
+    def compute_kpis(base, cand):
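+        # Classify each benchmark by the relative difference between `base` and `cand`:
+        # faster/slower when the difference exceeds the noise threshold, neutral when it
+        # is within the noise, and best/worst performers when it reaches `extrema`.
+        # Each KPI is returned as a fraction of the total number of benchmarks.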
+        diff = data[cand] - data[base]
+        pct = diff / data[base]
+        faster = data[(data[base] > data[cand]) & (pct.abs() > noise)]
+        neutral = data[pct.abs() <= noise]
+        slower = data[(data[base] < data[cand]) & (pct.abs() > noise)]
+        worst = data[(data[base] < data[cand]) & (pct.abs() >= extrema)]
+        best = data[(data[base] > data[cand]) & (pct.abs() >= extrema)]
+        return [len(k) / len(data) for k in (faster, neutral, slower, worst, best)]
+
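+    # Every candidate series is compared against the first series, which acts as the baseline.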
+    baseline = series[0]
+    for (i, candidate) in enumerate(series[1:], start=1):
+        kpis = compute_kpis(baseline, candidate)
+        if i > 1: # after the first series, also generate a relative difference
+            previous_kpis = columns[-1]
+            rel_deltas = [(kpis[k] - previous_kpis[k]) / previous_kpis[k] for k in range(len(kpis))]
+            headers.append('rel Δ')
+            columns.append(rel_deltas)
+            fmt.append('+.2%')
+
+        headers.append(series_names[i])
+        columns.append(kpis)
+        fmt.append('.2%')
+
+    rows = list(zip(*columns))
+    print(tabulate.tabulate(rows, headers=headers, floatfmt=fmt))
+
 def main(argv):
     parser = argparse.ArgumentParser(
         prog='compare-benchmarks',
@@ -113,9 +153,10 @@
              '`candidate` (sort using the absolute number of the candidate run), '
              'and `percent_diff` (sort using the percent difference between the baseline and the candidate). '
              'Note that when more than two input files are compared, the only valid sorting order is `benchmark`.')
-    parser.add_argument('--format', type=str, choices=['text', 'chart'], default='text',
-        help='Select the output format. `text` generates a plain-text comparison in tabular form, and `chart` '
-             'generates a self-contained HTML graph that can be opened in a browser. The default is `text`.')
+    parser.add_argument('--format', type=str, choices=['text', 'chart', 'kpi'], default='text',
+        help='Select the output format. `text` generates a plain-text comparison in tabular form, `chart` '
+             'generates a self-contained HTML graph that can be opened in a browser, and `kpi` generates a '
+             'summary report based on a few KPIs. The default is `text`.')
     parser.add_argument('--open', action='store_true',
         help='Whether to automatically open the generated HTML file when finished. This option only makes sense '
              'when the output format is `chart`.')
@@ -125,6 +166,24 @@
     parser.add_argument('--subtitle', type=str, required=False,
         help='Optional subtitle to use for the chart. This can be used to help identify the contents of the chart. '
              'This option cannot be used with the plain text output.')
+    parser.add_argument('--noise-threshold', type=float, required=False,
+        help='Noise threshold used when computing KPIs. This is a floating point number between 0 and 1 '
+             'representing the relative difference required between two results for them not to be '
+             'considered "within the noise" of each other. Required for --format=kpi.')
+    parser.add_argument('--top-performer-threshold', type=float, required=False, default=0.5,
+        help='Threshold used by KPIs to determine top (and worst) performers. This is a floating point '
+             'number between 0 and 1 that represents the relative difference required to consider a '
+             'benchmark a top/worst performer. For example, if this number is 0.5, top/worst performers '
+             'are the benchmarks that differ by at least 50%% between the baseline and the candidate.')
+    parser.add_argument('--meta-candidate', type=str, required=False,
+        help='The name used to collectively refer to the candidate series when producing a KPI report. '
+             'Required for --format=kpi.')
+    parser.add_argument('--discard-benchmarks-introduced-after', type=str, required=False,
+        help='Discard benchmarks introduced after the given candidate. This is useful to stabilize reports '
+             'when new benchmarks are introduced over time, which would change the total number of '
+             'benchmarks and hence appear to retroactively change the report for previous candidates. '
+             'If used, the name must correspond to the name of a series (as passed to, or defaulted by, '
+             '`--series-names`).')
     args = parser.parse_args(argv)
 
     # Validate arguments (the values admissible for various arguments depend on other
@@ -137,6 +196,14 @@
         if args.open:
             parser.error('Passing --open makes no sense with --format=text')
 
+    if args.format == 'kpi':
+        if args.open:
+            parser.error('Passing --open makes no sense with --format=kpi')
+        if args.noise_threshold is None:
+            parser.error('--format=kpi requires passing a --noise-threshold')
+        if args.meta_candidate is None:
+            parser.error('--format=kpi requires passing a --meta-candidate')
+
     if len(args.files) != 2 and args.sort != 'benchmark':
         parser.error('Using any sort order other than `benchmark` requires exactly two input files.')
 
@@ -182,6 +249,19 @@
         do_open = args.output is None or args.open
         output = args.output or tempfile.NamedTemporaryFile(suffix='.html').name
         plotly.io.write_html(figure, file=output, auto_open=do_open)
+    elif args.format == 'kpi':
+        if args.discard_benchmarks_introduced_after is not None:
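+            # Drop benchmarks that have no result for the reference candidate but do have
+            # results in a later series, i.e. benchmarks introduced later on.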
+            index = args.series_names.index(args.discard_benchmarks_introduced_after)
+            reference = f'{args.metric}_{index}'
+            series_to_filter = [f'{args.metric}_{i}' for i in range(index+1, len(lnt_inputs))]
+            for candidate in series_to_filter:
+                data = data[~(data[reference].isna() & data[candidate].notna())]
+        produce_kpis(data, noise=args.noise_threshold,
+                           extrema=args.top_performer_threshold,
+                           series=[f'{args.metric}_{i}' for i in range(len(lnt_inputs))],
+                           series_names=args.series_names,
+                           meta_candidate=args.meta_candidate,
+                           title=args.subtitle)
     else:
         diff = plain_text_comparison(data, args.metric, baseline_name=args.series_names[0],
                                                         candidate_name=args.series_names[1])