#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright 2021 The ChromiumOS Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""This script cleans up the vendor directory."""
import argparse
import collections
import copy
import functools
import hashlib
import itertools
import json
import logging
import os
from pathlib import Path
import re
import shutil
import subprocess
import sys
import textwrap
from typing import Any, Dict, List, Optional, Sequence, Set, Tuple
sys.path.append(str(Path(__file__).resolve().parent / "scripts"))
import rust_crates
rust_crates.die_if_running_as_root()
rust_crates.run_inside_chroot()
rust_crates.install_tomli_and_reexec_if_unavailable()
import tomli
import tomli_w
# E.g. matches `crate` or `crate-1.2.3+blah`; groups are (name, version).
_PATCH_VERSION_REGEX = re.compile(r"^(.*?)(?:-(\d+\.\d+\.\d+(?:\+.*)?)?)?$")
_METALLURGY_CARGO_TOML_HEADER = """# Copyright 2023 The ChromiumOS Authors.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
#
# !! Autogenerated by `vendor.py`; please don't edit. !!
[package]
name = "metallurgy-crates"
version = "0.0.1"
[workspace]
[dependencies]
"""
_FAKE_NO_STD_CRATES = (
# num-* uses std feature to toggle std in dependencies.
# This is the subset of num-* that requires std to function.
"num",
"num-bigint",
"num-rational",
# svd2rust depends on env_logger, which relies on std, and has the
# following line in the Cargo.toml:
# log = { version = "~0.4", features = ["std"] }
"svd2rust",
)
_NO_STD_FEATURES = ("no-std", "no_std")
_STD_FEATURES = ("std", "use_std")
# We only care about crates we're actually going to use and that's usually
# limited to ones with cfg(linux). For running `cargo metadata`, limit results
# to only these platforms.
ALL_SUPPORTED_PLATFORMS = (
# Main targets.
"x86_64-cros-linux-gnu",
"armv7a-cros-linux-gnueabihf",
"aarch64-cros-linux-gnu",
# As far as we care, this is the same as x86_64-cros-linux-gnu.
# "x86_64-pc-linux-gnu",
# Baremetal targets.
"thumbv6m-none-eabi",
"thumbv7m-none-eabi",
"thumbv7em-none-eabihf",
"i686-unknown-uefi",
"x86_64-unknown-uefi",
)
# A series of crates which are to be made empty by having no (non-comment)
# contents in their `lib.rs`, rather than by inserting a compilation error.
NOP_EMPTY_CRATES = frozenset({"windows"})
EMPTY_CRATE_BODY = """\
compile_error!("This crate cannot be built for this configuration.");
"""
NOP_EMPTY_CRATE_BODY = "// " + EMPTY_CRATE_BODY
_ALL_MODES = {"alchemy", "std", "no_std"}
# This ends up being a JSON-y map that provides data for
# https://bazelbuild.github.io/rules_rust/crate_universe.html#crateannotation
BazelAnnotation = Dict[str, Any]
def _rerun_checksums(package_path):
"""Re-run checksums for given package.
Writes resulting checksums to $package_path/.cargo-checksum.json.
"""
hashes = dict()
checksum_path = os.path.join(package_path, ".cargo-checksum.json")
if not Path(checksum_path).is_file():
return False
with open(checksum_path, "r") as fread:
contents = json.load(fread)
for root, _, files in os.walk(package_path, topdown=True):
for f in files:
# Don't checksum an existing checksum file
if f == ".cargo-checksum.json":
continue
file_path = os.path.join(root, f)
with open(file_path, "rb") as frb:
m = hashlib.sha256()
m.update(frb.read())
d = m.hexdigest()
# Key is relative to the package path so strip from beginning
key = os.path.relpath(file_path, package_path)
hashes[key] = d
if hashes:
logging.info(
"%s regenerated %d hashes", package_path, len(hashes.keys())
)
contents["files"] = hashes
with open(checksum_path, "w") as fwrite:
json.dump(contents, fwrite, sort_keys=True)
return True
def _remove_OWNERS_checksum(root):
"""Delete all OWNERS files from the checksum file.
Args:
root: Root directory for the vendored crate.
Returns:
True if OWNERS was found and cleaned up. Otherwise False.
"""
checksum_path = os.path.join(root, ".cargo-checksum.json")
if not Path(checksum_path).is_file():
return False
with open(checksum_path, "r") as fread:
contents = json.load(fread)
del_keys = []
for cfile in contents["files"]:
if "OWNERS" in cfile:
del_keys.append(cfile)
for key in del_keys:
del contents["files"][key]
if del_keys:
logging.info("%s deleted: %s", root, del_keys)
with open(checksum_path, "w") as fwrite:
json.dump(contents, fwrite, sort_keys=True)
return bool(del_keys)
def cleanup_owners(vendor_path):
"""Remove owners checksums from the vendor directory.
We currently do not check in the OWNERS files from vendored crates because
they interfere with the find-owners functionality in gerrit. This cleanup
simply finds all instances of "OWNERS" in the checksum files within and
removes them.
Args:
vendor_path: Absolute path to vendor directory.
"""
deps_cleaned = []
for root, dirs, _ in os.walk(vendor_path):
for d in dirs:
removed = _remove_OWNERS_checksum(os.path.join(root, d))
if removed:
deps_cleaned.append(d)
if deps_cleaned:
logging.info("Cleanup owners:\n%s", "\n".join(deps_cleaned))
def apply_single_patch(patch, workdir):
"""Apply a single patch and return whether it was successful.
Returns:
True if successful. False otherwise.
"""
proc = subprocess.run(
[
"patch",
"-p1",
"--no-backup-if-mismatch",
"-i",
patch,
],
cwd=workdir,
)
return proc.returncode == 0
def apply_patch_script(script, workdir):
"""Run the given patch script, returning whether it exited cleanly.
Returns:
True if successful. False otherwise.
"""
return subprocess.run([script], cwd=workdir).returncode == 0
def determine_vendor_crates(vendor_path):
"""Returns a map of {crate_name: [directory]} at the given vendor_path."""
result = collections.defaultdict(list)
crate_version_re = re.compile(r"-\d+\.\d+\.\d+(?:[+-]|$)")
for crate_name_plus_ver in os.listdir(vendor_path):
version = crate_version_re.search(crate_name_plus_ver)
assert version, crate_name_plus_ver
name = crate_name_plus_ver[: version.start()]
result[name].append(crate_name_plus_ver)
for crate_list in result.values():
crate_list.sort()
return result
def apply_patches(patches_path, vendor_path):
"""Finds patches and applies them to sub-folders in the vendored crates.
Args:
patches_path: Path to folder with patches. Expect all patches to be one
level down (matching the crate name).
vendor_path: Root path to vendored crates directory.
"""
checksums_for = {}
# Don't bother running if patches directory is empty
if not Path(patches_path).is_dir():
return
patches_failed = False
vendor_crate_map = determine_vendor_crates(vendor_path)
# Look for all patches and apply them
for d in os.listdir(patches_path):
dir_path = os.path.join(patches_path, d)
# We don't process patches in root dir
if not os.path.isdir(dir_path):
continue
# We accept one of two forms here:
# - direct targets (these are named `${crate_name}-${version}`)
# - simply the crate name (which applies to all versions of the
# crate)
direct_target = os.path.join(vendor_path, d)
if os.path.isdir(direct_target):
patch_targets = [d]
elif d in vendor_crate_map:
patch_targets = vendor_crate_map[d]
else:
raise RuntimeError(f"Unknown crate in {vendor_path}: {d}")
for patch in os.listdir(dir_path):
file_path = os.path.join(dir_path, patch)
# Skip if not a patch file
if not os.path.isfile(file_path):
continue
if patch.endswith(".patch"):
apply = apply_single_patch
elif os.access(file_path, os.X_OK):
apply = apply_patch_script
else:
# Unrecognized. Skip it.
continue
for target_name in patch_targets:
checksums_for[target_name] = True
target = os.path.join(vendor_path, target_name)
logging.info("Applying %s to %s", file_path, target)
if not apply(file_path, target):
logging.error("Failed to apply %s to %s", file_path, target)
patches_failed = True
# Do this late, so we can report all of the failing patches in one
# invocation.
if patches_failed:
raise ValueError("Patches failed; please see above logs")
# Re-run checksums for all modified packages since we applied patches.
for key in checksums_for.keys():
_rerun_checksums(os.path.join(vendor_path, key))
def generate_patches_manifest(
patch_dirs: Dict[str, Path],
) -> Dict[str, List[BazelAnnotation]]:
"""Returns a dictionary containing json configuration of the patch file."""
patches = collections.defaultdict(list)
for label, patch_dir in patch_dirs.items():
for d in sorted(patch_dir.iterdir()):
if d.is_dir():
crate, version = _PATCH_VERSION_REGEX.match(d.name).groups()
patch_files = [
p.relative_to(patch_dir) for p in d.glob("*.patch")
]
# Some directories instead have shell scripts to remove the
# executable bit from files. We don't care about these ones,
# since we're not vendoring with bazel, which won't let you
# execute them anyway.
if patch_files:
# `glob` has no ordering guarantees, so sort to ensure the output
# is independent of dirent ordering.
patch_files.sort()
patch = dict(
version=version or "*",
# The default bazel patch tool doesn't support fuzz (approximate
# matching), which our patches rely on.
patch_tool="patch",
patch_args=["-p1"],
patches=[
label.format(patch=patch) for patch in patch_files
],
)
patches[crate].append(patch)
return patches
def get_workspace_cargo_toml(working_dir):
"""Returns all Cargo.toml files under working_dir."""
return [working_dir / "projects" / "Cargo.toml"]
def run_cargo_vendor(working_dir):
"""Runs cargo vendor.
Args:
working_dir: Directory to run inside. This should be the directory where
Cargo.toml is kept.
"""
# `cargo vendor` may update dependencies (which may update metadata).
load_all_package_metadata.cache_clear()
# Cargo will refuse to revendor into versioned directories, which leads to
# repeated `./vendor.py` invocations trying to apply patches to
# already-patched sources. Remove the existing vendor directory to avoid
# this.
vendor_dir = working_dir / "vendor"
if vendor_dir.exists():
shutil.rmtree(vendor_dir)
cargo_cmdline = [
"cargo",
"vendor",
"--versioned-dirs",
"-v",
"--manifest-path=projects/Cargo.toml",
"--",
"vendor",
]
subprocess.check_call(cargo_cmdline, cwd=working_dir)
def load_single_metadata(working_dir, filter_platform):
"""Load metadata for all projects under a given directory.
Args:
working_dir: Base directory to run from.
filter_platform: Filter packages to ones configured for this platform.
"""
cmd = [
"cargo",
"metadata",
"--format-version=1",
# Use `--quiet` here, since cargo may warn about dependencies which
# don't strictly make sense (e.g., some third-party packages depend on
# things like bindgen-cli, which doesn't have a library target).
#
# We don't care. :)
"--quiet",
"--manifest-path=projects/Cargo.toml",
]
# Conditionally add platform filter
if filter_platform:
cmd += ("--filter-platform", filter_platform)
output = subprocess.check_output(cmd, cwd=working_dir)
return json.loads(output)
# Calls to this are somewhat expensive, and repeated a fair few times
# throughout `./vendor.py`. Measuring locally, having a cache here speeds this
# script up by 1.4x.
@functools.lru_cache()
def load_all_package_metadata(working_dir, platforms=ALL_SUPPORTED_PLATFORMS):
"""Loads and merges metadata for all platforms in `platforms`.
This drops a lot of data from `cargo metadata`. Some of this metadata is
hard to merge, other bits of it just aren't worth keeping at the moment.
"""
assert platforms, f"`platforms` should have things; has {platforms}"
found_package_ids = set()
results = []
for platform in platforms:
metadata = load_single_metadata(working_dir, platform)["packages"]
for package in metadata:
package_id = package["id"]
if package_id in found_package_ids:
continue
found_package_ids.add(package_id)
results.append(
{
"id": package["id"],
"license": package["license"],
"license_file": package["license_file"],
"name": package["name"],
"version": package["version"],
}
)
return results
class LicenseManager:
"""Manage consolidating licenses for all packages."""
# These are all the licenses we support. Keys are what is seen in metadata
# and values are what is expected by ebuilds.
SUPPORTED_LICENSES = {
"0BSD": "0BSD",
"Apache-2.0": "Apache-2.0",
"BSD-2-Clause": "BSD-2",
"BSD-3-Clause": "BSD-3",
"ISC": "ISC",
"MIT": "MIT",
"MPL-2.0": "MPL-2.0",
"unicode": "unicode",
"Unicode-3.0": "unicode",
"Zlib": "ZLIB",
}
# Prefer to take attribution licenses in this order. All these require that
# we actually use the license file found in the package so they MUST have
# a license file set.
PREFERRED_ATTRIB_LICENSE_ORDER = ["MIT", "BSD-3", "ISC"]
# If Apache license is found, always prefer it (simplifies attribution)
APACHE_LICENSE = "Apache-2.0"
# Regex for license files found in the vendored directories. Search for
# these files with re.IGNORECASE.
#
# These will be searched in order with the earlier entries being preferred.
LICENSE_NAMES_REGEX = [
r"^license-mit$",
r"^copyright$",
r"^licen[cs]e.*$",
]
# Some crates have their license file in other crates. This usually occurs
# because multiple crates are published from the same git repository and the
# license isn't updated in each sub-crate. In these cases, we can just
# ignore these packages.
MAP_LICENSE_TO_OTHER = {
"blazesym-c": "blazesym",
"failure_derive": "failure",
"grpcio-compiler": "grpcio",
"grpcio-sys": "grpcio",
"mocktopus_macros": "mocktopus",
"protobuf-codegen": "protobuf",
"protobuf-parse": "protobuf",
"protobuf-support": "protobuf",
"rustyline-derive": "rustyline",
}
# Map a package to a specific license and license file. Only use this if
# a package doesn't have an easily discoverable license or exports its
# license in a weird way. Prefer to patch the project with a license and
# upstream the patch instead.
STATIC_LICENSE_MAP = {
# "package name": ("license name", "license file relative location")
# Patches for adding these are upstream, but applying them doesn't
# affect `cargo metadata`. This is presumably because it can't
# detect our vendor directory.
# https://gitlab.freedesktop.org/slirp/libslirp-sys/-/merge_requests/6
"libslirp-sys": ("MIT", "LICENSE"),
# https://gitlab.freedesktop.org/anholt/deqp-runner/-/merge_requests/48
"deqp-runner": ("MIT", "LICENSE"),
# https://github.com/DimaKudosh/difflib/blob/master/LICENSE
"difflib": ("MIT", "LICENSE"),
# Upstream prefers to embed license text inside README.md:
"riscv": ("ISC", "README.md"),
"riscv-rt": ("ISC", "README.md"),
"zerocopy": ("BSD-2", "LICENSE"),
"zerocopy-derive": ("BSD-2", "LICENSE"),
}
def __init__(self, working_dir, vendor_dir):
self.working_dir = working_dir
self.vendor_dir = vendor_dir
def _find_license_in_dir(self, search_dir):
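"""Yields paths of license-like files found directly inside `search_dir`."""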
for p in os.listdir(search_dir):
# Ignore anything that's not a file
if not os.path.isfile(os.path.join(search_dir, p)):
continue
# Now check if the name matches any of the regexes
# We'll return the first matching file.
for regex in self.LICENSE_NAMES_REGEX:
if re.search(regex, p, re.IGNORECASE):
yield os.path.join(search_dir, p)
break
def _guess_license_type(self, license_file):
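"""Guesses the license type from the file name or contents; "" if unknown."""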
if "-MIT" in license_file:
return "MIT"
elif "-APACHE" in license_file:
return "APACHE"
elif "-BSD" in license_file:
return "BSD-3"
with open(license_file, "r") as f:
lines = f.read()
if "MIT" in lines:
return "MIT"
elif "Apache" in lines:
return "APACHE"
elif "BSD 3-Clause" in lines:
return "BSD-3"
return ""
def generate_license(
self,
skip_license_check,
print_map_to_file,
license_shorthand_file,
destroyed_crates,
):
"""Generate single massive license file from metadata."""
metadata = load_all_package_metadata(self.working_dir)
special_unicode_license = "(MIT OR Apache-2.0) AND Unicode-DFS-2016"
special_whatwg_license = "(Apache-2.0 OR MIT) AND BSD-3-Clause"
bad_licenses = {}
# Keep the license map ordered so the generated output is consistent.
license_map = {}
skip_license_check = skip_license_check or []
has_unicode_license = False
has_whatwg_license = False
for package in metadata:
# Skip the synthesized Cargo.toml packages that exist solely to
# list dependencies.
if "path+file:///" in package["id"]:
continue
pkg_name = package["name"]
pkg_version = package["version"]
if pkg_name in skip_license_check:
logging.info(
"Skipped license check on %s. Reason: Skipped from command line",
pkg_name,
)
continue
# Skip the license check for packages we have destroyed.
if (pkg_name, pkg_version) in destroyed_crates:
continue
if pkg_name in self.MAP_LICENSE_TO_OTHER:
logging.info(
"Skipped license check on %s. Reason: License already in %s",
pkg_name,
self.MAP_LICENSE_TO_OTHER[pkg_name],
)
continue
# Check if we have a static license map for this package. Use the
# static values if we have it already set.
if pkg_name in self.STATIC_LICENSE_MAP:
license, license_file = self.STATIC_LICENSE_MAP[pkg_name]
license_map[pkg_name] = {
"license": license,
"license_file": license_file,
}
continue
license_files = []
# use `or ''` instead of get's default, since `package` may have a
# None value for 'license'.
license = package.get("license") or ""
# We ignore the metadata for license file because most crates don't
# have it set. Just scan the source for licenses.
license_files = list(
self._find_license_in_dir(
os.path.join(self.vendor_dir, f"{pkg_name}-{pkg_version}")
)
)
# FIXME(b/240953811): The code later in this loop is only
# structured to handle ORs, not ANDs. Fortunately, this license in
# particular is `AND`ed between a super common license (Apache) and
# a more obscure one (unicode). This hack is specifically intended
# for the `unicode-ident` crate, though no crate name check is
# made, since it's OK if other crates happen to have this license.
if license == special_unicode_license:
has_unicode_license = True
# We'll check later to be sure MIT or Apache-2.0 is represented
# properly.
for x in license_files:
if os.path.basename(x) == "LICENSE-UNICODE":
license_file = x
break
else:
raise ValueError(
"No LICENSE-UNICODE found in " f"{license_files}"
)
license_map[pkg_name] = {
"license": license,
"license_file": license_file,
}
continue
# FIXME(b/240953811): This is the same hack as above for
# `unicode-ident`, except this time it handles the license of the
# `encoding_rs` crate.
if license == special_whatwg_license:
has_whatwg_license = True
# We'll check later to be sure MIT or Apache-2.0 is represented
# properly.
for x in license_files:
if os.path.basename(x) == "LICENSE-WHATWG":
license_file = x
break
else:
raise ValueError(
"No LICENSE-WHATWG found in " f"{license_files}"
)
license_map[pkg_name] = {
"license": license,
"license_file": license_file,
}
continue
# If there are multiple licenses, they are delimited with "OR" or "/"
delim = " OR " if " OR " in license else "/"
found = [x.strip() for x in license.split(delim)]
# Filter licenses to ones we support
licenses_or = [
self.SUPPORTED_LICENSES[f]
for f in found
if f in self.SUPPORTED_LICENSES
]
# If apache license is found, always prefer it because it simplifies
# license attribution (we can use existing Apache notice)
if self.APACHE_LICENSE in licenses_or:
license_map[pkg_name] = {"license": self.APACHE_LICENSE}
# Handle single license that has at least one license file
# We pick the first license file and the license
elif len(licenses_or) == 1:
if license_files:
l = licenses_or[0]
lf = license_files[0]
license_map[pkg_name] = {
"license": l,
"license_file": os.path.relpath(lf, self.working_dir),
}
else:
bad_licenses[pkg_name] = "{} missing license file".format(
licenses_or[0]
)
# Handle multiple licenses
elif len(licenses_or) > 1:
# Check preferred licenses in order
license_found = False
for l in self.PREFERRED_ATTRIB_LICENSE_ORDER:
if l not in licenses_or:
continue
for f in license_files:
if self._guess_license_type(f) == l:
license_found = True
license_map[pkg_name] = {
"license": l,
"license_file": os.path.relpath(
f, self.working_dir
),
}
break
# Break out of loop if license is found
if license_found:
break
else:
bad_licenses[pkg_name] = license
# If we had any bad licenses, we need to abort
if bad_licenses:
for k in bad_licenses.keys():
logging.error(
"%s had no acceptable licenses: %s", k, bad_licenses[k]
)
raise Exception("Bad licenses in vendored packages.")
# Write license map to file
if print_map_to_file:
with open(
os.path.join(self.working_dir, print_map_to_file), "w"
) as lfile:
json.dump(license_map, lfile, sort_keys=True)
# Raise an error for missing license files unless we have a valid
# reason to ignore them.
raise_missing_license = False
for name, v in license_map.items():
if (
"license_file" not in v
and v.get("license", "") != self.APACHE_LICENSE
):
raise_missing_license = True
logging.error(
" %s: Missing license file. Fix or add to ignorelist.",
name,
)
if raise_missing_license:
raise Exception(
"Unhandled missing license file. "
"Make sure all are accounted for before continuing."
)
has_license_types = {x["license"] for x in license_map.values()}
if has_unicode_license:
# Replace this license with the actual SPDX license we plan to use.
has_license_types.remove(special_unicode_license)
has_license_types.add("unicode")
if self.APACHE_LICENSE not in has_license_types:
raise ValueError(
"Need the apache license; currently have: "
f"{sorted(has_license_types)}"
)
if has_whatwg_license:
# Replace this license with the actual SPDX license we plan to use.
has_license_types.remove(special_whatwg_license)
has_license_types.add("BSD-3")
if self.APACHE_LICENSE not in has_license_types:
raise ValueError(
"Need the apache license; currently have: "
f"{sorted(has_license_types)}"
)
sorted_licenses = sorted(has_license_types)
logging.info("The following licenses are in use: %s", sorted_licenses)
header = textwrap.dedent(
"""\
# File to describe the licenses used by this registry.
# Used so it's easy to automatically verify ebuilds are updated.
# Each line is a license. Lines starting with # are comments.
"""
)
with open(license_shorthand_file, "w", encoding="utf-8") as f:
f.write(header)
f.write("\n".join(sorted_licenses))
# Ensure there's a newline at the end to appease `cros format`.
f.write("\n")
def clean_source_related_lines_in_place(cargo_toml):
"""Removes all [[bin]] (and similar) sections in `cargo_toml`."""
cargo_toml.pop("bench", None)
cargo_toml.pop("bin", None)
cargo_toml.pop("examples", None)
cargo_toml.pop("test", None)
lib = cargo_toml.get("lib")
if lib:
lib.pop("path", None)
package = cargo_toml.get("package")
if package:
package.pop("build", None)
package.pop("default-run", None)
package.pop("include", None)
def clean_features_in_place(cargo_toml):
"""Removes all side-effects of features in `cargo_toml`."""
features = cargo_toml.get("features")
if not features:
return
for name in features:
features[name] = []
def remove_all_dependencies_in_place(cargo_toml):
"""Removes all `target.*.dependencies` from `cargo_toml`."""
cargo_toml.pop("build-dependencies", None)
cargo_toml.pop("dependencies", None)
cargo_toml.pop("dev-dependencies", None)
target = cargo_toml.get("target")
if not target:
return
empty_keys = []
for key, values in target.items():
values.pop("build-dependencies", None)
values.pop("dependencies", None)
values.pop("dev-dependencies", None)
if not values:
empty_keys.append(key)
if len(empty_keys) == len(target):
del cargo_toml["target"]
else:
for key in empty_keys:
del target[key]
class CrateDestroyer:
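"""Empties out crates that shouldn't be built, leaving stub packages behind."""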
def __init__(self, working_dir, vendor_dir):
self.working_dir = working_dir
self.vendor_dir = vendor_dir
def _modify_cargo_toml(self, pkg_path):
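"""Rewrites an emptied crate's Cargo.toml so it still parses but does nothing."""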
with open(os.path.join(pkg_path, "Cargo.toml"), "rb") as cargo:
contents = tomli.load(cargo)
package = contents["package"]
# Change the description and license, and delete the license_file key.
package["description"] = "Empty crate that should not build."
package["license"] = "Apache-2.0"
package.pop("license_file", None)
# If there's no build.rs but we specify `links = "foo"`, Cargo gets
# upset.
package.pop("links", None)
# Some packages have cfg-specific dependencies. Remove them here; we
# don't care about the dependencies of an empty package.
#
# This is a load-bearing optimization: `dev-python/toml` doesn't
# always round-trip dumps(loads(x)) correctly when `x` has keys with
# strings (b/242589711#comment3). The place this has bitten us so far
# is target dependencies, which can be harmlessly removed for now.
#
# Cleaning features in-place is also necessary, since we're removing
# dependencies, and a feature can enable features in dependencies.
# Cargo errors out on `[features] foo = "bar/baz"` if `bar` isn't a
# dependency.
clean_features_in_place(contents)
remove_all_dependencies_in_place(contents)
# Since we're removing all source files, also be sure to remove
# source-related keys.
clean_source_related_lines_in_place(contents)
with open(os.path.join(pkg_path, "Cargo.toml"), "wb") as cargo:
tomli_w.dump(contents, cargo)
def _replace_source_contents(self, package_path, compile_error):
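"""Replaces the crate's sources with a stub lib.rs, keeping Cargo.toml and checksums."""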
# First load the checksum file before starting
checksum_file = os.path.join(package_path, ".cargo-checksum.json")
with open(checksum_file, "r") as csum:
checksum_contents = json.load(csum)
# Also load the Cargo.toml file, which we need to write back.
cargo_file = os.path.join(package_path, "Cargo.toml")
with open(cargo_file, "rb") as cfile:
cargo_contents = cfile.read()
shutil.rmtree(package_path)
# Make package and src dirs and replace lib.rs
os.makedirs(os.path.join(package_path, "src"), exist_ok=True)
with open(os.path.join(package_path, "src", "lib.rs"), "w") as librs:
librs.write(
EMPTY_CRATE_BODY if compile_error else NOP_EMPTY_CRATE_BODY
)
# Restore cargo.toml
with open(cargo_file, "wb") as cfile:
cfile.write(cargo_contents)
# Restore checksum
with open(checksum_file, "w") as csum:
json.dump(checksum_contents, csum)
def destroy_unused_crates(
self, destroyed_crates_file: Path
) -> List[Tuple[str, str]]:
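"""Empties crates unused on all supported platforms, plus a few force-emptied ones.

Records the emptied crates in `destroyed_crates_file` and returns them as
(name, version) pairs.
"""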
metadata = [
(x["name"], x["version"])
for x in load_single_metadata(
self.working_dir, filter_platform=None
)["packages"]
]
used_packages = {
(x["name"], x["version"])
for x in load_all_package_metadata(self.working_dir)
}
cleaned_packages = []
# Since we're asking for _all_ metadata packages, we may see
# duplication.
for package_desc in metadata:
package_name, package_version = package_desc
if package_desc in used_packages:
# b/239449434: Due to RUSTSEC-2020-0071, we manually empty
# time-0.1. It's present in the depgraph because chrono brings it
# in by default under the `oldtime` feature. Nothing in our
# depgraph actually makes use of this.
#
# b/271837931: bindgen versions before 0.63 do not work correctly
# with newer versions of LLVM. We patch grpcio-sys to depend on
# bindgen-0.63, but vendoring happens before that, so bindgen-0.57
# is still pulled in. This causes build errors, so remove it.
#
# b/288421251: The failure crate is no longer maintained and
# has an open CVE, so we patched the only user while we wait
# for a pull request to be accepted upstream.
#
# b/321669037: shlex 0.1 is affected by RUSTSEC-2024-0006. The
# only user is bindgen 0.57, which is patched out.
#
# b/347443966: blazesym by default uses simd-adler32. We'd
# rather patch it to avoid using "custom" crypto. Unfortunately
# this happens after vendoring, so remove it.
force_destroy_crate = (
(
package_name == "time"
and package_version.startswith("0.1")
)
or (
package_name == "bindgen"
and package_version.startswith("0.57")
)
or (package_name == "failure")
or (
package_name == "shlex"
and package_version.startswith("0.1")
)
or (package_name == "simd-adler32")
)
if not force_destroy_crate:
continue
logging.info(
f"Forcibly emptying %s@%s", package_name, package_version
)
# Detect the correct package path to destroy
pkg_path = os.path.join(
self.vendor_dir,
"{}-{}".format(package_name, package_version),
)
if not os.path.isdir(pkg_path):
logging.info(
f"Crate %s not found at %s", package_name, pkg_path
)
continue
self._replace_source_contents(
pkg_path, compile_error=package_name not in NOP_EMPTY_CRATES
)
self._modify_cargo_toml(pkg_path)
_rerun_checksums(pkg_path)
cleaned_packages.append((package_name, package_version))
for pkg, ver in cleaned_packages:
logging.info("Removed unused crate %s@%s", pkg, ver)
# Write a list of crates that've been destroyed. This is used by
# `scripts/cargo-vet.py`.
file_header = "# List of destroyed crates autogenerated by vendor.py."
file_lines = [f"{pkg} {ver}" for pkg, ver in cleaned_packages]
# Ensure there's a newline at the end of the file, so `cros format`
# remains happy.
file_lines.append("")
destroyed_crates_file.write_text(
"\n".join([file_header] + file_lines), encoding="utf-8"
)
return cleaned_packages
class InProgressStamp:
"""Class that represents an 'in-progress' file.
This file helps make it more obvious when vendor.py has not completed
successfully: b/278073343. It's intended to stick around until vendor.py
terminates successfully, so this isn't phrased as a contextmanager or
similar.
"""
def __init__(self, vendor_artifacts: Path):
in_progress_stamp = vendor_artifacts / "vendor_script_in_progress"
message = "\n".join(
(
"# Stamp file that's created when vendor.py started running,",
"# and removed when vendor.py completes successfully.",
"# If this file is hanging around, vendor.py did not terminate",
"# successfully. Please try not to land a change that leaves",
"# vendor.py broken.",
)
)
in_progress_stamp.write_text(message, encoding="utf-8")
self._in_progress_stamp = in_progress_stamp
def note_successful_termination(self):
"""Called when vendor.py is about to exit successfully."""
self._in_progress_stamp.unlink()
@functools.total_ordering
class Package:
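"""A crate from `cargo metadata`, with its resolved dependency edges."""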
@classmethod
def from_metadata(cls, metadata: Dict[str, Any]) -> List["Package"]:
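"""Builds the sorted list of Packages described by `cargo metadata` output."""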
crates_by_id = {}
versions_by_name = collections.defaultdict(list)
packages = {pkg["id"]: pkg for pkg in metadata["packages"]}
resolved = {pkg["id"]: pkg for pkg in metadata["resolve"]["nodes"]}
assert packages.keys() == resolved.keys()
for pkg_id, pkg in packages.items():
crate = Package(pkg_id, pkg, resolved[pkg_id])
versions_by_name[crate.name].append(crate)
crates_by_id[crate.id] = crate
latest_crates = {
name: max(crates) for name, crates in versions_by_name.items()
}
for crate in crates_by_id.values():
crate.is_latest = crate == latest_crates[crate.name]
for pkg_id in crate.dep_ids:
dep = crates_by_id[pkg_id]
crate.deps.append(dep)
dep.reverse_deps.append(crate)
return sorted(crates_by_id.values())
def __init__(
self, pkg_id: str, package: Dict[str, Any], resolved: Dict[str, Any]
):
self.name = package["name"]
self.version = package["version"]
source = package["source"]
self.id = pkg_id
self.is_external = source is not None and source.startswith("registry+")
self.dep_ids = [dep["pkg"] for dep in resolved["deps"]]
self.deps: List[Package] = []
self.reverse_deps: List[Package] = []
self.is_latest = True
# Sometimes features can enable a feature in a transitive crate.
# Ignore them and set them manually in annotations.json.
default_features = package["features"].get("default", [])
self.default_features = {f for f in default_features if "/" not in f}
self.features = set(resolved["features"])
# Inline default features here. Undo this when outputting features.
if "default" in self.features:
self.features.discard("default")
self.features.update(self.default_features)
def find_feature(
options: List[str], features: List[str]
) -> Optional[str]:
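"""Returns the first entry of `options` present in `features`, else None."""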
for option in options:
if option in features:
return option
return None
enable_std_flag = find_feature(_STD_FEATURES, package["features"])
disable_std_flag = find_feature(_NO_STD_FEATURES, package["features"])
supports_no_std = (
enable_std_flag
or disable_std_flag
or find_feature(_NO_STD_FEATURES, package["categories"])
or find_feature(_NO_STD_FEATURES, package["keywords"])
)
if supports_no_std and self.name not in _FAKE_NO_STD_CRATES:
self.no_std_features = set(self.features)
if disable_std_flag:
self.no_std_features.add(disable_std_flag)
elif enable_std_flag:
self.no_std_features.discard(enable_std_flag)
else:
self.no_std_features = None
def format(self, features: Set[str]) -> str:
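"""Renders this crate as a Cargo.toml dependency entry with the given features.

Returns "" for crates that don't need to be listed.
"""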
# Only bother outputting 3pp crates that we directly depend on.
if not self.is_external or all(
dep.is_external for dep in self.reverse_deps
):
return ""
default_features = self.default_features.issubset(features)
if default_features:
features = features.difference(self.default_features)
lines = []
if self.is_latest:
name = self.name
else:
name = (
self.name
+ "-"
+ self.version.replace(".", "-").replace("+", "-")
)
# This is used for us to see "What do I need to remove to be able to
# put everything on the same version?". There's not much value to
# showing this for the latest versions, especially since they will be
# a much longer list.
for rdep in self.reverse_deps:
lines.append(f"# Used by {rdep.name}-{rdep.version}")
if name == self.name and default_features and not features:
lines.append(f'{name} = "{self.version}"')
else:
default_features = (
"" if default_features else ', "default-features" = false'
)
features = (
f", features = {sorted(features)}" if self.features else ""
)
crate = "" if name == self.name else f'package = "{self.name}", '
lines.append(
f'{name} = {{ {crate}"version" = "{self.version}"{default_features}{features} }}'
)
return "\n".join(lines) + "\n"
def _key(self):
return (
self.name,
# TODO: strings can't compare against ints; implement proper semver
# ordering if this ever becomes a problem: https://semver.org/
[int(x) if x.isdigit() else x for x in self.version.split(".")],
)
def __repr__(self):
return f"{self.name} {self.version}"
def __eq__(self, other: "Package"):
return self._key() == other._key()
def __lt__(self, other: "Package"):
return self._key() < other._key()
def merge_annotations(a: Any, b: Any):
"""Merges two BazelAnnotations together, if possible."""
if a is None:
return b
if b is None:
return a
if isinstance(a, list) and isinstance(b, list):
return a + b
if isinstance(a, dict) and isinstance(b, dict):
result = {}
for key in set(itertools.chain(a.keys(), b.keys())):
result[key] = merge_annotations(a.get(key), b.get(key))
return result
# Merging strings may be doable in some cases (e.g., rustflags), but not
# all (e.g., patch program names). Also catch merging of different types.
raise TypeError(
f"Can't merge values {a} and {b} of types {type(a)} and {type(b)}"
)
def merge_annotation_maps(
map_a: Dict[str, List[BazelAnnotation]],
map_b: Dict[str, List[BazelAnnotation]],
mode: str,
) -> Dict[str, List[BazelAnnotation]]:
"""Merges two bazel annotation maps into one."""
# {crate_name: {crate_version: BazelAnnotation}}
annotations_by_version = collections.defaultdict(
lambda: collections.defaultdict(dict)
)
for crate_name, annotations in itertools.chain(
map_a.items(), map_b.items()
):
crate_annotations = annotations_by_version[crate_name]
for a in annotations:
a = copy.deepcopy(a)
modes = set(a.pop("modes", _ALL_MODES))
if not modes.issubset(_ALL_MODES):
raise ValueError(
f"Invalid modes {modes}. Modes must be a subset of {_ALL_MODES}"
)
if mode in modes:
version = a.get("version", "*")
existing = crate_annotations[version]
# `merge_annotations` can't merge the version field; handle that
# manually.
existing.pop("version", None)
crate_annotations[version] = merge_annotations(a, existing)
existing["version"] = version
return {
crate_name: [v for k, v in sorted(annotations.items())]
for crate_name, annotations in annotations_by_version.items()
}
def generate_annotations_file(
cargo_dir: Path,
mode,
all_annotations: Sequence[Dict[str, List[BazelAnnotation]]],
):
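"""Merges the given annotation maps for `mode` and writes annotations.json.

Annotations whose version doesn't appear in Cargo.lock are dropped.
"""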
merged_annotations = {}
for annotation in all_annotations:
merged_annotations = merge_annotation_maps(
merged_annotations, annotation, mode=mode
)
package_versions = collections.defaultdict(set)
with (cargo_dir / "Cargo.lock").open("rb") as f:
for package in tomli.load(f)["package"]:
package_versions[package["name"]].add(package["version"])
for package, versions in package_versions.items():
# All of these versions are treated as "*" and are always valid
versions.update(["*", "", None])
filtered_annotations = collections.defaultdict(list)
for package, annotations in merged_annotations.items():
for annotation in annotations:
if annotation.get("version") in package_versions[package]:
filtered_annotations[package].append(annotation)
with (cargo_dir / "annotations.json").open("w", encoding="utf-8") as f:
json.dump(filtered_annotations, f, indent=2, sort_keys=True)
f.write("\n")
def generate_metallurgy_crates(
projects_dir: Path,
vendor_artifacts_dir: Path,
bazel_artifacts_dir: Path,
available_patches: Dict[str, List[BazelAnnotation]],
destroyed_crates: Set[Tuple[str, str]],
):
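"""Regenerates the metallurgy Cargo.toml and annotation files from the vendored crates."""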
logging.info(
"Ensuring metallurgy crates are in sync with non-metallurgy crates..."
)
with (bazel_artifacts_dir / "annotations.toml").open("rb") as f:
annotations = tomli.load(f)
generate_annotations_file(
cargo_dir=vendor_artifacts_dir / "alchemy",
mode="alchemy",
all_annotations=[available_patches, annotations],
)
metadata = load_single_metadata(projects_dir.parent, filter_platform=None)
crates = Package.from_metadata(metadata)
for subdir, std in [("std", True), ("no_std", False)]:
logging.info("Syncing %s crates", subdir)
cargo_dir = vendor_artifacts_dir / subdir
metallurgy_crates = cargo_dir / "Cargo.toml"
with metallurgy_crates.open("w", encoding="utf-8") as f:
f.write(_METALLURGY_CARGO_TOML_HEADER)
for crate in crates:
if (crate.name, crate.version) in destroyed_crates:
continue
features = crate.features if std else crate.no_std_features
if features is not None:
f.write(crate.format(features))
# Copy from Cargo.lock to ensure we start with the same deps, and then do a
# minimal update (--workspace does a minimal update).
orig_lockfile = projects_dir / "Cargo.lock"
new_lockfile = cargo_dir / "Cargo.lock"
shutil.copyfile(orig_lockfile, new_lockfile)
subprocess.run(
["cargo", "update", "--workspace"], cwd=cargo_dir, check=True
)
generate_annotations_file(
cargo_dir=cargo_dir,
mode=subdir,
all_annotations=[available_patches, annotations],
)
logging.info("Crates in sync.")
def main():
logging.basicConfig(
format=">> %(asctime)s: %(levelname)s: %(filename)s:%(lineno)d: "
"%(message)s",
level=logging.INFO,
)
parser = argparse.ArgumentParser(description="Vendor packages properly")
parser.add_argument(
"--skip-license-check",
"-s",
help="Skip the license check on a specific package",
action="append",
)
parser.add_argument("--license-map", help="Write license map to this file")
parser.add_argument(
"--skip-version-check",
action="store_true",
help="Don't exit if our repo isn't up-to-date.",
)
parser.add_argument(
"--skip-cargo-vendor",
action="store_true",
help="Don't run cargo-vendor. Please don't upload changes that skip "
"this check. This flag is for local development use only.",
)
parser.add_argument(
"--skip-metallurgy",
action="store_true",
help="Skip metallurgy file generation. Please don't upload changes "
"that skip this step. This flag is for local development use only.",
)
args = parser.parse_args()
current_path = Path(__file__).parent.absolute()
if not args.skip_version_check:
rust_crates.exit_if_head_is_not_up_to_date(
current_path, disable_check_flag="--skip-version-check"
)
patches = os.path.join(current_path, "patches")
vendor = os.path.join(current_path, "vendor")
vendor_artifacts = current_path / "vendor_artifacts"
license_shorthand_file = os.path.join(vendor_artifacts, "licenses_used.txt")
destroyed_crates_file = vendor_artifacts / "destroyed_crates.txt"
in_progress_stamp = InProgressStamp(vendor_artifacts)
# First, actually run cargo vendor
if args.skip_cargo_vendor:
# Since this is for dev only, this is fine.
destroyed_crates = set()
else:
run_cargo_vendor(current_path)
# Order matters here:
# - Apply patches (also re-calculates checksums)
# - Cleanup any owners files (otherwise, git check-in or checksums are
# unhappy)
# - Destroy unused crates
apply_patches(patches, vendor)
cleanup_owners(vendor)
destroyer = CrateDestroyer(current_path, vendor)
destroyed_crates = destroyer.destroy_unused_crates(
destroyed_crates_file
)
destroyed_crates = set(destroyed_crates)
if not args.skip_metallurgy:
patches_manifest = generate_patches_manifest(
{
"@@//third_party/rust_crates/patches:{patch}": Path(patches),
"@@//bazel/rust/alchemy_crates/patches:{patch}": (
vendor_artifacts / "alchemy/patches"
),
}
)
generate_metallurgy_crates(
current_path / "projects",
vendor_artifacts,
current_path / "bazel_files",
patches_manifest,
destroyed_crates,
)
if not args.skip_cargo_vendor:
# Combine license file and check for any bad licenses
lm = LicenseManager(current_path, vendor)
lm.generate_license(
args.skip_license_check,
args.license_map,
license_shorthand_file,
destroyed_crates,
)
if args.skip_cargo_vendor or args.skip_metallurgy:
what_skipped = (
"cargo-vendor" if args.skip_cargo_vendor else "metallurgy checks"
)
logging.warning(
f"Skipped %s. This is for local dev only.", what_skipped
)
# Don't remove `in_progress_stamp`; it should hopefully serve as an
# extra reminder to rerun this without the skip flags.
return
in_progress_stamp.note_successful_termination()
if __name__ == "__main__":
main()