| #!/usr/bin/env python3 |
| # Copyright 2024 The ChromiumOS Authors |
| # Use of this source code is governed by a BSD-style license that can be |
| # found in the LICENSE file. |
| |
| """Analyze advanced bisection statistics. |
| |
| - Generate view.json at each 'failed' event and count the reason/number of |
| retries. |
| |
| $ export SKYLAB_CLOUD_SERVICE_ACCOUNT_JSON=<PATH_TO_PRODUCT_BISECT_RUNNER.JSON> |
| |
$ ./analyze_bisect_progress.py --work_dir=<WORK_DIR> gen_view --bisect_id <BISECT_ID>
or
$ ./analyze_bisect_progress.py --work_dir=<WORK_DIR> gen_view --after <AFTER_TIME> [--before <BEFORE_TIME>]
| |
For each bisection, it generates a view_<timestamp>.json at each failed
timestamp and an index file "progress.json" in <WORK_DIR>/<BISECT_ID>.
It also generates a "retries.json" with the reason and number of retries.
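
An illustrative "retries.json" (the exception names and counts below are
examples only):

  {
      "NoDutAvailable": 2,
      "DutLeaseTimeout": 1
  }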
| |
| - Get the counter of total retries. |
| |
$ ./analyze_bisect_progress.py --work_dir=<WORK_DIR> count_retries
| |
| It would aggregate "retries.json" in all sub directories under <WORK_DIR>. |
| It should be run after the sub command "gen". |
| |
- Generate the DUT waiting time.
| |
| $ export SKYLAB_CLOUD_SERVICE_ACCOUNT_JSON=<PATH_TO_PRODUCT_BISECT_RUNNER.JSON> |
| |
$ ./analyze_bisect_progress.py --work_dir=<WORK_DIR> gen_waiting_time --bisect_id <BISECT_ID>
or
$ ./analyze_bisect_progress.py --work_dir=<WORK_DIR> gen_waiting_time --after <AFTER_TIME> [--before <BEFORE_TIME>]
| |
For each bisection, it generates "waiting_time.json" under <WORK_DIR>/<BISECT_ID>.
- If the bisect is unable to complete due to DUT leasing issues (retry limit
  reached or timeout), the result is "failed".
- If the bisect failed due to reasons other than DUT leasing issues while
  waiting for a DUT, the result is "unknown" because the waiting time data is
  incomplete.
- If the bisect completed (either success, or failed due to reasons other than
  DUT leasing), the result is "success" and the file records each wait for a
  DUT and the total waiting time.
| |
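An illustrative "waiting_time.json" for a completed bisection (the timestamps
and durations are examples only):

  {
      "result": "success",
      "leases": [
          {"fail": 1700000000.0, "next_lease": 1700000600.0, "duration": 600.0}
      ],
      "waiting_time": 600.0
  }

For the "failed" and "unknown" cases the file contains only "result" and
"reason".
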
| - Get the total waiting time. |
| |
$ ./analyze_bisect_progress.py --work_dir=<WORK_DIR> count_waiting_time
| |
| It would aggregate "waiting_time.json" in all sub directories under <WORK_DIR>. |
| It should be run after the sub command "gen_waiting_time". |
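
An illustrative "total_waiting_time.json" (the numbers are examples only):

  {
      "success": 12,
      "failed": 3,
      "total_waiting_time": 86400.0
  }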
| """ |
| |
| from __future__ import annotations |
| |
| import argparse |
| import collections |
| import datetime |
| import json |
| import logging |
| import os |
| import re |
| import subprocess |
| import sys |
| |
| from bisect_kit import bisect_db |
| from bisect_kit import common |
| from bisect_kit import gs_util |
| from google.cloud import datastore |
| |
| |
| logger = logging.getLogger(__name__) |
| |
| |
_GS_DIR_PATTERN = r'gs://crosperf/b/{bisect_id}'
| |
| _GS_FILE_PATTERN = r'gs://crosperf/b/[^/]+/([^/]+)' |
| |
| _DOMAIN_SESSION_FILE_SUFFIX = 'Domain' |
| |
| _MAIN_SESSION_FILE_SUFFIX = 'CommandLine' |
| |
| _SUFFIXES_OF_INTERESTS = [ |
| _MAIN_SESSION_FILE_SUFFIX, |
| _DOMAIN_SESSION_FILE_SUFFIX, |
| ] |
| |
| |
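# Maps the main session file name to the diagnose script that can regenerate
# its view.json.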
| _MAIN_SCRIPT_MAP = { |
| 'DiagnoseAutotestCommandLine': 'diagnose_cros_autotest.py', |
| 'DiagnoseTastCommandLine': 'diagnose_cros_tast.py', |
| 'DiagnoseCustomEvalCommandLine': 'diagnose_cros_custom_eval.py', |
| } |
| |
| |
| def output_view_json( |
    records: list[dict[str, str]],
| bisect_id: str, |
| main_script: str, |
| bisect_dir: str, |
| timestamp: float | None = None, |
| ): |
| """Generate a view.json for a bisect at some timestamp. |
| |
| Args: |
      records: a list of dicts; the last dict will be updated with
        {'view_json': <output file>}.
| bisect_id: the bisect ID |
| main_script: the main diagnose script to execute to generate view.json |
| bisect_dir: the output bisect dir |
| timestamp: the timestamp or None |
| """ |
| logger.info( |
| 'generating view.json for %s at timestamp %s', bisect_id, timestamp |
| ) |
| try: |
| cmd = [ |
| './%s' % main_script, |
| 'view', |
| '--log-base-dir', |
| os.path.dirname(bisect_dir), |
| '--session', |
| bisect_id, |
| '--json', |
| ] |
| if timestamp is not None: |
| cmd.extend( |
| [ |
| '--timestamp', |
| str(timestamp), |
| ] |
| ) |
| # pylint: disable=subprocess-run-check |
| # The return code is checked manually below. |
| # check=True would raise an exception immediately so stderr has no |
| # chance to output. |
| result = subprocess.run( |
| cmd, |
| capture_output=True, |
| text=True, |
| ) |
| except Exception as e: |
| logger.error( |
| 'unable to execute %s for %s: %s', main_script, bisect_id, e |
| ) |
| return |
| |
| if result.returncode != 0: |
| logger.error( |
| 'failed to generate view.json with returncode %s', result.returncode |
| ) |
| if result.stderr: |
| logger.error('stderr: %s', result.stderr) |
| return |
| |
    # To save disk space, do not write a new file if the view.json is the same
    # as the one at the previous timestamp.
    should_write = True
    if len(records) >= 2:
        # The previous record has no 'view_json' if its generation failed.
        previous_output_file = records[-2].get('view_json')
        if previous_output_file:
            with open(os.path.join(bisect_dir, previous_output_file)) as f:
                previous_content = f.read()
            if previous_content == result.stdout:
                should_write = False
                logger.info(
                    'skip writing file at timestamp %s because it is the'
                    ' same as %s',
                    timestamp,
                    previous_output_file,
                )
                records[-1]['view_json'] = previous_output_file
| |
| if should_write: |
| output_file = 'view_%s.json' % (timestamp or 'final') |
| with open(os.path.join(bisect_dir, output_file), 'w') as f: |
| f.write(result.stdout) |
| records[-1]['view_json'] = output_file |
| logger.info('%s generated successfully', output_file) |
| |
| |
| def download_session_from_gs( |
| bisect_id: str, bisect_dir: str, suffix_of_interests: list[str] |
| ): |
| """Download session files from gs bucket to local directory. |
| |
| Args: |
| bisect_id: bisect ID. |
| bisect_dir: local directory path. |
      suffix_of_interests: a list of file name suffixes to download.
| """ |
| try: |
| gs_paths = [ |
| line.strip() |
            for line in gs_util.ls(_GS_DIR_PATTERN.format(bisect_id=bisect_id))
| ] |
| except Exception as e: |
| logger.error('failed to list gsbucket files of %s: %s', bisect_id, e) |
| return |
| |
| for gs_path in gs_paths: |
| matched = re.fullmatch(_GS_FILE_PATTERN, gs_path) |
| if not matched: |
| continue |
| file_name = matched.group(1) |
| logger.debug('matched file %s', file_name) |
| for suffix in suffix_of_interests: |
| if file_name.endswith(suffix): |
| file_path = os.path.join(bisect_dir, file_name) |
| logger.info('downloading %s to %s', gs_path, file_path) |
| try: |
| gs_util.cp(gs_path, file_path) |
| except Exception as e: |
| logger.error('failed to download %s: %s', gs_path, e) |
| break |
| |
| |
def gen_view(
    bisect_id: str,
    out_dir: str,
    regenerate: bool = False,
    download: bool = True,
) -> None:
    """Generates view.json for a single bisect.

    Args:
      bisect_id: bisect ID.
      out_dir: local dir name.
      regenerate: whether to analyze it again if the bisect directory already
        exists.
      download: whether to download session files from the gs bucket. If False,
        assume the files were downloaded before.
    """
| logger.info('processing %s', bisect_id) |
| |
| bisect_dir = os.path.join(out_dir, bisect_id) |
| try: |
| os.makedirs(bisect_dir, exist_ok=False) |
| except FileExistsError as e: |
| logger.warning('directory %s already exists: %s', bisect_dir, e) |
| if not regenerate: |
| logger.info('skip %s since the directory already exists', bisect_id) |
| return |
| logger.info('re-generate %s', bisect_id) |
| except Exception as e: |
| logger.error('failed to create directory %s: %s', bisect_dir, e) |
| return |
| |
| if download: |
| download_session_from_gs( |
| bisect_id, |
| bisect_dir, |
| [ |
| _MAIN_SESSION_FILE_SUFFIX, |
| _DOMAIN_SESSION_FILE_SUFFIX, |
| ], |
| ) |
| |
| main_session_file = None |
| for file_name in os.listdir(bisect_dir): |
| if file_name.endswith(_MAIN_SESSION_FILE_SUFFIX): |
| main_session_file = file_name |
| |
| logger.info('main session file for %s: %s', bisect_id, main_session_file) |
| if not main_session_file: |
| logger.error('Can not find main session file for %s, abort', bisect_id) |
| return |
| |
| main_script = _MAIN_SCRIPT_MAP.get(main_session_file) |
| logger.info('main script for %s: %s', bisect_id, main_script) |
| if not main_script: |
| logger.error( |
| 'Can not find corresponding main script name for file %s', |
| main_session_file, |
| ) |
| return |
| |
| with open(os.path.join(bisect_dir, main_session_file)) as f: |
| data = json.load(f) |
| |
| retries = collections.Counter() |
| records = [] |
| previous_entry = None |
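    # Walk the session history: snapshot view.json at every 'failed' event,
    # and count a retry whenever a new 'start' follows a retriable failure.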
    for entry in data.get('history', []):
| event = entry.get('event') |
| if event == 'failed': |
| timestamp = entry.get('timestamp') |
| records.append(entry.copy()) |
| output_view_json( |
| records, bisect_id, main_script, bisect_dir, timestamp |
| ) |
| elif event == 'start': |
| if ( |
| previous_entry |
| and previous_entry.get('error_type') == 'Retriable' |
| ): |
| retries[previous_entry.get('exception')] += 1 |
| |
| previous_entry = entry |
| |
| records.append({}) |
| output_view_json(records, bisect_id, main_script, bisect_dir) |
| |
| with open(os.path.join(bisect_dir, 'progress.json'), 'w') as f: |
| json.dump(records, f, indent=4) |
| |
| with open(os.path.join(bisect_dir, 'retries.json'), 'w') as f: |
| json.dump(retries, f, indent=4) |
| |
| |
| def cmd_count_retries(opts): |
| total_retries = collections.Counter() |
| count = 0 |
| for dir_name in os.listdir(opts.work_dir): |
| if not os.path.isdir(os.path.join(opts.work_dir, dir_name)): |
| continue |
| retries_file = os.path.join(opts.work_dir, dir_name, 'retries.json') |
| try: |
| with open(retries_file) as f: |
| data = json.load(f) |
| except Exception as e: |
| logger.error('failed to load %s: %s', retries_file, e) |
| continue |
| count += 1 |
| total_retries += collections.Counter(data) |
| logger.info('total %s bisections', count) |
    with open(os.path.join(opts.work_dir, 'total_retries.json'), 'w') as f:
| json.dump(total_retries, f, indent=4) |
| logger.info('%s total retries: %s', total_retries.total(), total_retries) |
| |
| |
| def cmd_gen_view(opts): |
| if opts.after: |
| if not opts.before: |
| opts.before = datetime.datetime.now() |
| logger.info('query bisections in [%s, %s)', opts.after, opts.before) |
| ds_client = bisect_db.Client() |
| bisect_ids = [ |
| entity.key.id_or_name |
| for entity in ds_client.query_finished_bisects( |
| opts.after, opts.before |
| ) |
| ] |
| num_bisects = len(bisect_ids) |
| logger.info('total %s bisections found', num_bisects) |
| for i, bisect_id in enumerate(bisect_ids): |
| logger.info('processing %s of %s', i + 1, num_bisects) |
| gen_view( |
| bisect_id, |
| opts.work_dir, |
| opts.regenerate, |
| opts.download, |
| ) |
| elif opts.bisect_id: |
| gen_view( |
| opts.bisect_id, |
| opts.work_dir, |
| opts.regenerate, |
| opts.download, |
| ) |
| |
| |
| def cmd_gen_waiting_time(opts): |
| ds_client = bisect_db.Client() |
| if opts.after: |
| if not opts.before: |
| opts.before = datetime.datetime.now() |
| logger.info('query bisections in [%s, %s)', opts.after, opts.before) |
        bisects = [
            entity
            for entity in ds_client.query_finished_bisects(
                opts.after,
                opts.before,
            )
            if 'stateless' in entity.get('Experiments', [])
            # and 'shared_dut_pool' in entity.get('Experiments', [])
        ]
| num_bisects = len(bisects) |
| logger.info('total %s bisections found', num_bisects) |
| for i, bisect in enumerate(bisects): |
        logger.info('processing %s of %s', i + 1, num_bisects)
| gen_waiting_time( |
| bisect.key.id_or_name, |
| bisect, |
| opts.work_dir, |
| opts.regenerate, |
| opts.download, |
| ) |
| elif opts.bisect_id: |
| entity = ds_client.get_bisect(opts.bisect_id) |
| gen_waiting_time( |
| opts.bisect_id, |
| entity, |
| opts.work_dir, |
| opts.regenerate, |
| opts.download, |
| ) |
| |
| |
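# Substrings of the bisect error description that indicate the bisection could
# not finish because of DUT leasing issues (retry limit reached or timeout).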
| _DUT_FAILURES_MESSAGES = [ |
| 'cannot allocate a lab DUT more than 30 times', |
| 'reached retry limit of 30 times', |
| 'Bisection exceeds timeout', |
| ] |
| |
| |
| def is_dut_failure(message: str) -> bool: |
| for dut_failure_message in _DUT_FAILURES_MESSAGES: |
| if dut_failure_message in message: |
| return True |
| return False |
| |
| |
| def gen_waiting_time( |
| bisect_id: str, |
| entity: datastore.Entity, |
| out_dir: str, |
| regenerate: bool = False, |
| download: bool = True, |
| ): |
| """Generates waiting_time.json for a single bisection. |
| |
| Args: |
| bisect_id: the bisect ID. |
| entity: a bisect entity from the Datastore. |
| out_dir: the output directory. |
| regenerate: whether to regenerate if the directory already exists. |
      download: whether to download session files from the gs bucket. If False,
        assume the files were downloaded before.
| """ |
| logger.info('processing %s', bisect_id) |
| |
| bisect_dir = os.path.join(out_dir, bisect_id) |
| try: |
| os.makedirs(bisect_dir, exist_ok=False) |
| except FileExistsError as e: |
| logger.warning('directory %s already exists: %s', bisect_dir, e) |
| if not regenerate: |
| logger.info('skip %s since the directory already exists', bisect_id) |
| return |
| logger.info('re-generate %s', bisect_id) |
| except Exception as e: |
| logger.error('failed to create directory %s: %s', bisect_dir, e) |
| return |
| |
| if download: |
| download_session_from_gs( |
| bisect_id, |
| bisect_dir, |
| [ |
| _MAIN_SESSION_FILE_SUFFIX, |
| ], |
| ) |
| |
| main_session_file = None |
| for file_name in os.listdir(bisect_dir): |
| if file_name.endswith(_MAIN_SESSION_FILE_SUFFIX): |
| main_session_file = file_name |
| |
| logger.info('main session file for %s: %s', bisect_id, main_session_file) |
| if not main_session_file: |
| logger.error('Can not find main session file for %s, abort', bisect_id) |
| return |
| |
| try: |
| with open(os.path.join(bisect_dir, main_session_file)) as f: |
| data = json.load(f) |
    except Exception as e:
        logger.error('failed to load %s: %s', main_session_file, e)
        return
| |
| dut_leases = [] |
| dut_lease_file_path = data.get('statistics', {}).get('dut_leases_log_path') |
| if not dut_lease_file_path: |
| logger.info('No DUT lease log file') |
| else: |
| dut_lease_file_name = os.path.basename(dut_lease_file_path) |
| dut_lease_local_path = os.path.join(bisect_dir, dut_lease_file_name) |
| |
| if download: |
| gs_path = os.path.join( |
            _GS_DIR_PATTERN.format(bisect_id=bisect_id),
| 'log', |
| dut_lease_file_name, |
| ) |
| try: |
| gs_util.cp(gs_path, dut_lease_local_path) |
| except Exception as e: |
| logger.error('failed to download %s: %s', gs_path, e) |
| return |
| |
| try: |
| with open(dut_lease_local_path) as f: |
| dut_leases = json.load(f) |
| except Exception as e: |
| logger.error('failed to load %s: %s', dut_lease_local_path, e) |
| return |
| |
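    # Collect the timestamps of failures caused by DUT leasing, i.e. the lease
    # timed out or no DUT was available.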
| fail_timestamps = [] |
    for entry in data.get('history', []):
| event = entry.get('event') |
| exception = entry.get('exception') |
| if event == 'failed': |
| if exception in ['DutLeaseTimeout', 'NoDutAvailable']: |
| fail_timestamps.append(entry.get('timestamp')) |
| |
| records = [] |
| waiting_time = 0 |
| previous_lease = None |
| summary = None |
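    # For each DUT-lease failure, count the gap until the next DUT lease
    # starts as waiting time. If no later lease exists, classify the bisection
    # by its final error description instead of reporting an incomplete
    # waiting time.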
| for fail_timestamp in fail_timestamps: |
| if previous_lease and fail_timestamp < previous_lease: |
| continue |
| found = False |
| for lease in dut_leases: |
| start_timestamp = lease.get('start_timestamp') |
| if start_timestamp > fail_timestamp: |
| duration = start_timestamp - fail_timestamp |
| waiting_time += duration |
| records.append( |
| { |
| 'fail': fail_timestamp, |
| 'next_lease': start_timestamp, |
| 'duration': duration, |
| } |
| ) |
| previous_lease = start_timestamp |
| found = True |
| break |
| if not found: |
| reason = entity.get('RunInfo.ErrDesc', '') |
| if is_dut_failure(reason): |
| logger.info('is dut failure: %s %s', bisect_id, reason) |
| summary = { |
| 'result': 'failed', |
| 'reason': reason, |
| } |
| else: |
| # Otherwise, the error is not caused by DUT lease error. |
| # We exclude the data point from calculation. |
| summary = { |
| 'result': 'unknown', |
| 'reason': reason, |
| } |
| break |
| |
| if not summary: |
| summary = { |
| 'result': 'success', |
| 'leases': records, |
| 'waiting_time': waiting_time, |
| } |
| |
| with open(os.path.join(bisect_dir, 'waiting_time.json'), 'w') as f: |
| json.dump( |
| summary, |
| f, |
| indent=4, |
| ) |
| logger.info('waiting_time.json generated for %s', bisect_id) |
| |
| |
| def cmd_count_waiting_time(opts): |
| count = 0 |
| failed_count = 0 |
| success_count = 0 |
| total_waiting_time = 0 |
| |
| for dir_name in os.listdir(opts.work_dir): |
| if not os.path.isdir(os.path.join(opts.work_dir, dir_name)): |
| continue |
| waiting_time_file = os.path.join( |
| opts.work_dir, dir_name, 'waiting_time.json' |
| ) |
| try: |
| with open(waiting_time_file) as f: |
| data = json.load(f) |
| except Exception as e: |
| logger.error('failed to load %s: %s', waiting_time_file, e) |
| continue |
| count += 1 |
| result = data.get('result') |
| if result == 'failed': |
| failed_count += 1 |
| elif result == 'success': |
| success_count += 1 |
| total_waiting_time += float(data.get('waiting_time')) |
| |
| logger.info('total %s bisections', count) |
| with open(os.path.join(opts.work_dir, 'total_waiting_time.json'), 'w') as f: |
| json.dump( |
| { |
| 'success': success_count, |
| 'failed': failed_count, |
| 'total_waiting_time': total_waiting_time, |
| }, |
| f, |
| indent=4, |
| ) |
| |
| |
| def main(args): |
| parser = argparse.ArgumentParser(description='analyze bisect progress') |
| parser.add_argument( |
| '--debug', action='store_true', help='Output DEBUG log to console' |
| ) |
| parser.add_argument( |
| '--work_dir', |
| required=True, |
| help='output directory', |
| ) |
| subparsers = parser.add_subparsers(required=True) |
| |
| parser_gen_view = subparsers.add_parser('gen_view') |
| parser_gen_view.add_argument('--bisect_id', help='bisection id') |
| parser_gen_view.add_argument( |
| '--regenerate', |
| action=argparse.BooleanOptionalAction, |
| default=False, |
| help='whether to regenerate if the bisect has been processed', |
| ) |
| parser_gen_view.add_argument( |
| '--download', |
| action=argparse.BooleanOptionalAction, |
| default=True, |
| help='whether to download files from gs bucket', |
| ) |
| parser_gen_view.add_argument( |
| '--before', |
| type=datetime.datetime.fromisoformat, |
| help='query bisections < the date', |
| ) |
| parser_gen_view.add_argument( |
| '--after', |
| type=datetime.datetime.fromisoformat, |
| help='query bisections >= the date', |
| ) |
| parser_gen_view.set_defaults(func=cmd_gen_view) |
| |
| parser_count_retries = subparsers.add_parser('count_retries') |
| parser_count_retries.set_defaults(func=cmd_count_retries) |
| |
| parser_gen_waiting_time = subparsers.add_parser('gen_waiting_time') |
| parser_gen_waiting_time.add_argument('--bisect_id', help='bisection id') |
| parser_gen_waiting_time.add_argument( |
| '--regenerate', |
| action=argparse.BooleanOptionalAction, |
| default=False, |
| help='whether to regenerate if the bisect has been processed', |
| ) |
| parser_gen_waiting_time.add_argument( |
| '--download', |
| action=argparse.BooleanOptionalAction, |
| default=True, |
| help='whether to download files from gs bucket', |
| ) |
| parser_gen_waiting_time.add_argument( |
| '--before', |
| type=datetime.datetime.fromisoformat, |
| help='query bisections < the date', |
| ) |
| parser_gen_waiting_time.add_argument( |
| '--after', |
| type=datetime.datetime.fromisoformat, |
| help='query bisections >= the date', |
| ) |
| parser_gen_waiting_time.set_defaults(func=cmd_gen_waiting_time) |
| |
| parser_count_waiting_time = subparsers.add_parser('count_waiting_time') |
| parser_count_waiting_time.set_defaults(func=cmd_count_waiting_time) |
| |
| opts = parser.parse_args(args) |
| opts.log_file = os.path.join(opts.work_dir, 'log') |
| common.config_logging(opts) |
| |
| opts.func(opts) |
| |
| |
| if __name__ == '__main__': |
| main(sys.argv[1:]) |