#!/usr/bin/env python3
# Copyright 2024 The ChromiumOS Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Analyze advanced bisection statistics.
- Generate view.json at each 'failed' event and count the reason/number of
retries.
$ export SKYLAB_CLOUD_SERVICE_ACCOUNT_JSON=<PATH_TO_PRODUCT_BISECT_RUNNER.JSON>
$ ./analyze_bisect_progress.py --work_dir=<WORK_DIR> gen_view --bisect_id <BISECT_ID>
or
$ ./analyze_bisect_progress.py --work_dir=<WORK_DIR> gen_view --after <DATE_TIME> [--before <BEFORE_TIME>]
For each bisection, it would generate view_<timestamp>.json at each failed
timestamp and an index file "progress.json" in <WORK_DIR>/<BISECT_ID>.
It would also generate a "retries.json" recording the reason and number of retries.
- Get the counter of total retries.
$ ./analyze_bisect_progress.py --work_dir=<WORK_DIR> count_retries
It would aggregate "retries.json" in all sub directories under <WORK_DIR>.
It should be run after the sub command "gen_view".
- Generate the DUT waiting time.
$ export SKYLAB_CLOUD_SERVICE_ACCOUNT_JSON=<PATH_TO_PRODUCT_BISECT_RUNNER.JSON>
$ ./analyze_bisect_progress.py --work_dir=<WORK_DIR> gen_waiting_time --bisect_id <BISECT_ID>
or
$ ./analyze_bisect_progress.py --work_dir=<WORK_DIR> gen_waiting_time --after <DATE_TIME> [--before <BEFORE_TIME>]
For each bisection, it would generate "waiting_time.json" under <WORK_DIR>/<BISECT_ID>.
  - If the bisect is unable to complete due to a DUT leasing issue (retry
    limit reached or timeout), output "failed".
  - If the bisect failed due to reasons other than DUT leasing issues while
    waiting for a DUT, output "unknown" because the waiting time data is
    incomplete.
  - If the bisect is completed (either success or failed due to reasons other
    than DUT leasing issues), output "success" with the lease records and the
    total waiting time.
- Get the total waiting time.
$ ./analyze_bisect_progress.py --work_dir=<WORK_DIR> count_waiting_time
It would aggregate "waiting_time.json" in all sub directories under <WORK_DIR>.
It should be run after the sub command "gen_waiting_time".
"""
from __future__ import annotations
import argparse
import collections
import datetime
import json
import logging
import os
import re
import subprocess
import sys
from bisect_kit import bisect_db
from bisect_kit import common
from bisect_kit import gs_util
from google.cloud import datastore
logger = logging.getLogger(__name__)
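# Locations of the per-bisect session artifacts in the crosperf gs bucket.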
_GS_DIR_PATTTERN = r'gs://crosperf/b/{bisect_id}'
_GS_FILE_PATTERN = r'gs://crosperf/b/[^/]+/([^/]+)'
_DOMAIN_SESSION_FILE_SUFFIX = 'Domain'
_MAIN_SESSION_FILE_SUFFIX = 'CommandLine'
_SUFFIXES_OF_INTERESTS = [
_MAIN_SESSION_FILE_SUFFIX,
_DOMAIN_SESSION_FILE_SUFFIX,
]
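# Maps a main session file name to the diagnose script whose 'view' subcommand
# is used to generate view.json.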
_MAIN_SCRIPT_MAP = {
'DiagnoseAutotestCommandLine': 'diagnose_cros_autotest.py',
'DiagnoseTastCommandLine': 'diagnose_cros_tast.py',
'DiagnoseCustomEvalCommandLine': 'diagnose_cros_custom_eval.py',
}
def output_view_json(
    records: list[dict],
bisect_id: str,
main_script: str,
bisect_dir: str,
timestamp: float | None = None,
):
"""Generate a view.json for a bisect at some timestamp.
Args:
        records: a list of dicts; this function fills the last dict with {'view_json': <output file>}.
bisect_id: the bisect ID
main_script: the main diagnose script to execute to generate view.json
bisect_dir: the output bisect dir
timestamp: the timestamp or None
"""
logger.info(
'generating view.json for %s at timestamp %s', bisect_id, timestamp
)
try:
cmd = [
'./%s' % main_script,
'view',
'--log-base-dir',
os.path.dirname(bisect_dir),
'--session',
bisect_id,
'--json',
]
if timestamp is not None:
cmd.extend(
[
'--timestamp',
str(timestamp),
]
)
# pylint: disable=subprocess-run-check
# The return code is checked manually below.
# check=True would raise an exception immediately so stderr has no
# chance to output.
result = subprocess.run(
cmd,
capture_output=True,
text=True,
)
except Exception as e:
logger.error(
'unable to execute %s for %s: %s', main_script, bisect_id, e
)
return
if result.returncode != 0:
logger.error(
'failed to generate view.json with returncode %s', result.returncode
)
if result.stderr:
logger.error('stderr: %s', result.stderr)
return
    # To save disk space, do not write a new file if the view.json content is
    # identical to the one generated for the previous timestamp.
should_write = True
if len(records) >= 2:
previous_output_file = records[-2]['view_json']
with open(os.path.join(bisect_dir, previous_output_file)) as f:
previous_content = f.read()
if previous_content == result.stdout:
should_write = False
logger.info(
'skip writing file at timestamp %s because it is the same as %s',
timestamp,
previous_output_file,
)
records[-1]['view_json'] = previous_output_file
if should_write:
output_file = 'view_%s.json' % (timestamp or 'final')
with open(os.path.join(bisect_dir, output_file), 'w') as f:
f.write(result.stdout)
records[-1]['view_json'] = output_file
logger.info('%s generated successfully', output_file)
def download_session_from_gs(
bisect_id: str, bisect_dir: str, suffix_of_interests: list[str]
):
"""Download session files from gs bucket to local directory.
Args:
bisect_id: bisect ID.
bisect_dir: local directory path.
        suffix_of_interests: a list of file name suffixes to download.
"""
try:
gs_paths = [
line.strip()
for line in gs_util.ls(_GS_DIR_PATTTERN.format(bisect_id=bisect_id))
]
except Exception as e:
logger.error('failed to list gsbucket files of %s: %s', bisect_id, e)
return
for gs_path in gs_paths:
matched = re.fullmatch(_GS_FILE_PATTERN, gs_path)
if not matched:
continue
file_name = matched.group(1)
logger.debug('matched file %s', file_name)
for suffix in suffix_of_interests:
if file_name.endswith(suffix):
file_path = os.path.join(bisect_dir, file_name)
logger.info('downloading %s to %s', gs_path, file_path)
try:
gs_util.cp(gs_path, file_path)
except Exception as e:
logger.error('failed to download %s: %s', gs_path, e)
break
def gen_view(
bisect_id: str,
out_dir: str,
regenerate: bool = False,
download: bool = True,
):
"""Generates view.json for a single bisect.
Args:
bisect_id: bisect ID.
out_dir: local dir name.
regenerate: whether to analyze it again if the bisect directory already exists.
        download: whether to download session files from the gs bucket. If False, assume the files were downloaded previously.
    """
logger.info('processing %s', bisect_id)
bisect_dir = os.path.join(out_dir, bisect_id)
try:
os.makedirs(bisect_dir, exist_ok=False)
except FileExistsError as e:
logger.warning('directory %s already exists: %s', bisect_dir, e)
if not regenerate:
logger.info('skip %s since the directory already exists', bisect_id)
return
logger.info('re-generate %s', bisect_id)
except Exception as e:
logger.error('failed to create directory %s: %s', bisect_dir, e)
return
if download:
download_session_from_gs(
bisect_id,
bisect_dir,
[
_MAIN_SESSION_FILE_SUFFIX,
_DOMAIN_SESSION_FILE_SUFFIX,
],
)
main_session_file = None
for file_name in os.listdir(bisect_dir):
if file_name.endswith(_MAIN_SESSION_FILE_SUFFIX):
main_session_file = file_name
logger.info('main session file for %s: %s', bisect_id, main_session_file)
if not main_session_file:
logger.error('Can not find main session file for %s, abort', bisect_id)
return
main_script = _MAIN_SCRIPT_MAP.get(main_session_file)
logger.info('main script for %s: %s', bisect_id, main_script)
if not main_script:
logger.error(
'Can not find corresponding main script name for file %s',
main_session_file,
)
return
with open(os.path.join(bisect_dir, main_session_file)) as f:
data = json.load(f)
retries = collections.Counter()
records = []
previous_entry = None
    for entry in data.get('history', []):
event = entry.get('event')
if event == 'failed':
timestamp = entry.get('timestamp')
records.append(entry.copy())
output_view_json(
records, bisect_id, main_script, bisect_dir, timestamp
)
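        # A 'start' event right after a retriable failure means the bisector
        # retried; count the retry under the exception recorded for that failure.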
elif event == 'start':
if (
previous_entry
and previous_entry.get('error_type') == 'Retriable'
):
retries[previous_entry.get('exception')] += 1
previous_entry = entry
records.append({})
output_view_json(records, bisect_id, main_script, bisect_dir)
with open(os.path.join(bisect_dir, 'progress.json'), 'w') as f:
json.dump(records, f, indent=4)
with open(os.path.join(bisect_dir, 'retries.json'), 'w') as f:
json.dump(retries, f, indent=4)
def cmd_count_retries(opts):
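    """Aggregates retries.json from every bisect directory under work_dir.

    Writes the combined counter to <work_dir>/total_retries.json.
    """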
total_retries = collections.Counter()
count = 0
for dir_name in os.listdir(opts.work_dir):
if not os.path.isdir(os.path.join(opts.work_dir, dir_name)):
continue
retries_file = os.path.join(opts.work_dir, dir_name, 'retries.json')
try:
with open(retries_file) as f:
data = json.load(f)
except Exception as e:
logger.error('failed to load %s: %s', retries_file, e)
continue
count += 1
total_retries += collections.Counter(data)
logger.info('total %s bisections', count)
    with open(os.path.join(opts.work_dir, 'total_retries.json'), 'w') as f:
json.dump(total_retries, f, indent=4)
logger.info('%s total retries: %s', total_retries.total(), total_retries)
def cmd_gen_view(opts):
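    """Generates view.json snapshots for one bisect or a date range of bisects.

    With --bisect_id a single bisection is processed; with --after/--before the
    finished bisections in that range are queried from the Datastore and each
    one is processed.
    """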
if opts.after:
if not opts.before:
opts.before = datetime.datetime.now()
logger.info('query bisections in [%s, %s)', opts.after, opts.before)
ds_client = bisect_db.Client()
bisect_ids = [
entity.key.id_or_name
for entity in ds_client.query_finished_bisects(
opts.after, opts.before
)
]
num_bisects = len(bisect_ids)
logger.info('total %s bisections found', num_bisects)
for i, bisect_id in enumerate(bisect_ids):
logger.info('processing %s of %s', i + 1, num_bisects)
gen_view(
bisect_id,
opts.work_dir,
opts.regenerate,
opts.download,
)
elif opts.bisect_id:
gen_view(
opts.bisect_id,
opts.work_dir,
opts.regenerate,
opts.download,
)
def cmd_gen_waiting_time(opts):
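    """Generates waiting_time.json for one bisect or a date range of bisects.

    When querying by date range, only bisections with the 'stateless'
    experiment enabled are processed.
    """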
ds_client = bisect_db.Client()
if opts.after:
if not opts.before:
opts.before = datetime.datetime.now()
logger.info('query bisections in [%s, %s)', opts.after, opts.before)
bisects = [
entity
for entity in ds_client.query_finished_bisects(
opts.after,
opts.before,
)
            if 'stateless' in entity.get('Experiments', [])
            # and 'shared_dut_pool' in entity.get('Experiments', [])
]
num_bisects = len(bisects)
logger.info('total %s bisections found', num_bisects)
for i, bisect in enumerate(bisects):
            logger.info('processing %s of %s', i + 1, num_bisects)
gen_waiting_time(
bisect.key.id_or_name,
bisect,
opts.work_dir,
opts.regenerate,
opts.download,
)
elif opts.bisect_id:
entity = ds_client.get_bisect(opts.bisect_id)
gen_waiting_time(
opts.bisect_id,
entity,
opts.work_dir,
opts.regenerate,
opts.download,
)
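# Error description substrings indicating that the bisection stopped because
# it could not obtain a DUT (lease retries exhausted or timeout).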
_DUT_FAILURES_MESSAGES = [
'cannot allocate a lab DUT more than 30 times',
'reached retry limit of 30 times',
'Bisection exceeds timeout',
]
def is_dut_failure(message: str) -> bool:
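    """Returns True if the error description indicates a DUT leasing failure."""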
for dut_failure_message in _DUT_FAILURES_MESSAGES:
if dut_failure_message in message:
return True
return False
def gen_waiting_time(
bisect_id: str,
entity: datastore.Entity,
out_dir: str,
regenerate: bool = False,
download: bool = True,
):
"""Generates waiting_time.json for a single bisection.
Args:
bisect_id: the bisect ID.
entity: a bisect entity from the Datastore.
out_dir: the output directory.
regenerate: whether to regenerate if the directory already exists.
        download: whether to download session files from the gs bucket. If False, assume the files were downloaded previously.
"""
logger.info('processing %s', bisect_id)
bisect_dir = os.path.join(out_dir, bisect_id)
try:
os.makedirs(bisect_dir, exist_ok=False)
except FileExistsError as e:
logger.warning('directory %s already exists: %s', bisect_dir, e)
if not regenerate:
logger.info('skip %s since the directory already exists', bisect_id)
return
logger.info('re-generate %s', bisect_id)
except Exception as e:
logger.error('failed to create directory %s: %s', bisect_dir, e)
return
if download:
download_session_from_gs(
bisect_id,
bisect_dir,
[
_MAIN_SESSION_FILE_SUFFIX,
],
)
main_session_file = None
for file_name in os.listdir(bisect_dir):
if file_name.endswith(_MAIN_SESSION_FILE_SUFFIX):
main_session_file = file_name
logger.info('main session file for %s: %s', bisect_id, main_session_file)
if not main_session_file:
logger.error('Can not find main session file for %s, abort', bisect_id)
return
try:
with open(os.path.join(bisect_dir, main_session_file)) as f:
data = json.load(f)
    except Exception as e:
        logger.error('failed to load %s: %s', main_session_file, e)
        return
dut_leases = []
dut_lease_file_path = data.get('statistics', {}).get('dut_leases_log_path')
if not dut_lease_file_path:
logger.info('No DUT lease log file')
else:
dut_lease_file_name = os.path.basename(dut_lease_file_path)
dut_lease_local_path = os.path.join(bisect_dir, dut_lease_file_name)
if download:
gs_path = os.path.join(
_GS_DIR_PATTTERN.format(bisect_id=bisect_id),
'log',
dut_lease_file_name,
)
try:
gs_util.cp(gs_path, dut_lease_local_path)
except Exception as e:
logger.error('failed to download %s: %s', gs_path, e)
return
try:
with open(dut_lease_local_path) as f:
dut_leases = json.load(f)
except Exception as e:
logger.error('failed to load %s: %s', dut_lease_local_path, e)
return
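    # Collect the timestamps of failures caused by DUT lease problems; each one
    # marks the start of a period spent waiting for the next DUT.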
fail_timestamps = []
    for entry in data.get('history', []):
event = entry.get('event')
exception = entry.get('exception')
if event == 'failed':
if exception in ['DutLeaseTimeout', 'NoDutAvailable']:
fail_timestamps.append(entry.get('timestamp'))
records = []
waiting_time = 0
previous_lease = None
summary = None
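    # Match each DUT-lease failure with the next lease that starts after it and
    # count the gap as waiting time. Failures that happened before the
    # previously matched lease started are already covered by that interval and
    # are skipped.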
for fail_timestamp in fail_timestamps:
if previous_lease and fail_timestamp < previous_lease:
continue
found = False
for lease in dut_leases:
start_timestamp = lease.get('start_timestamp')
if start_timestamp > fail_timestamp:
duration = start_timestamp - fail_timestamp
waiting_time += duration
records.append(
{
'fail': fail_timestamp,
'next_lease': start_timestamp,
'duration': duration,
}
)
previous_lease = start_timestamp
found = True
break
if not found:
reason = entity.get('RunInfo.ErrDesc', '')
if is_dut_failure(reason):
logger.info('is dut failure: %s %s', bisect_id, reason)
summary = {
'result': 'failed',
'reason': reason,
}
else:
# Otherwise, the error is not caused by DUT lease error.
# We exclude the data point from calculation.
summary = {
'result': 'unknown',
'reason': reason,
}
break
if not summary:
summary = {
'result': 'success',
'leases': records,
'waiting_time': waiting_time,
}
with open(os.path.join(bisect_dir, 'waiting_time.json'), 'w') as f:
json.dump(
summary,
f,
indent=4,
)
logger.info('waiting_time.json generated for %s', bisect_id)
def cmd_count_waiting_time(opts):
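    """Aggregates waiting_time.json from every bisect directory under work_dir.

    Writes the success/failed counts and the total waiting time to
    <work_dir>/total_waiting_time.json.
    """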
count = 0
failed_count = 0
success_count = 0
total_waiting_time = 0
for dir_name in os.listdir(opts.work_dir):
if not os.path.isdir(os.path.join(opts.work_dir, dir_name)):
continue
waiting_time_file = os.path.join(
opts.work_dir, dir_name, 'waiting_time.json'
)
try:
with open(waiting_time_file) as f:
data = json.load(f)
except Exception as e:
logger.error('failed to load %s: %s', waiting_time_file, e)
continue
count += 1
result = data.get('result')
if result == 'failed':
failed_count += 1
elif result == 'success':
success_count += 1
total_waiting_time += float(data.get('waiting_time'))
logger.info('total %s bisections', count)
with open(os.path.join(opts.work_dir, 'total_waiting_time.json'), 'w') as f:
json.dump(
{
'success': success_count,
'failed': failed_count,
'total_waiting_time': total_waiting_time,
},
f,
indent=4,
)
def main(args):
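    """Parses command line arguments and dispatches to the selected subcommand."""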
parser = argparse.ArgumentParser(description='analyze bisect progress')
parser.add_argument(
'--debug', action='store_true', help='Output DEBUG log to console'
)
parser.add_argument(
'--work_dir',
required=True,
help='output directory',
)
subparsers = parser.add_subparsers(required=True)
parser_gen_view = subparsers.add_parser('gen_view')
parser_gen_view.add_argument('--bisect_id', help='bisection id')
parser_gen_view.add_argument(
'--regenerate',
action=argparse.BooleanOptionalAction,
default=False,
help='whether to regenerate if the bisect has been processed',
)
parser_gen_view.add_argument(
'--download',
action=argparse.BooleanOptionalAction,
default=True,
help='whether to download files from gs bucket',
)
parser_gen_view.add_argument(
'--before',
type=datetime.datetime.fromisoformat,
help='query bisections < the date',
)
parser_gen_view.add_argument(
'--after',
type=datetime.datetime.fromisoformat,
help='query bisections >= the date',
)
parser_gen_view.set_defaults(func=cmd_gen_view)
parser_count_retries = subparsers.add_parser('count_retries')
parser_count_retries.set_defaults(func=cmd_count_retries)
parser_gen_waiting_time = subparsers.add_parser('gen_waiting_time')
parser_gen_waiting_time.add_argument('--bisect_id', help='bisection id')
parser_gen_waiting_time.add_argument(
'--regenerate',
action=argparse.BooleanOptionalAction,
default=False,
help='whether to regenerate if the bisect has been processed',
)
parser_gen_waiting_time.add_argument(
'--download',
action=argparse.BooleanOptionalAction,
default=True,
help='whether to download files from gs bucket',
)
parser_gen_waiting_time.add_argument(
'--before',
type=datetime.datetime.fromisoformat,
help='query bisections < the date',
)
parser_gen_waiting_time.add_argument(
'--after',
type=datetime.datetime.fromisoformat,
help='query bisections >= the date',
)
parser_gen_waiting_time.set_defaults(func=cmd_gen_waiting_time)
parser_count_waiting_time = subparsers.add_parser('count_waiting_time')
parser_count_waiting_time.set_defaults(func=cmd_count_waiting_time)
opts = parser.parse_args(args)
opts.log_file = os.path.join(opts.work_dir, 'log')
common.config_logging(opts)
opts.func(opts)
if __name__ == '__main__':
main(sys.argv[1:])