| #!/usr/bin/env python3 |
| # Copyright 2025 The Emscripten Authors. All rights reserved. |
| # Emscripten is available under two separate licenses, the MIT license and the |
| # University of Illinois/NCSA Open Source License. Both these licenses can be |
| # found in the LICENSE file. |
| |
| """ |
| Wrapper for 'wasm-split --multi-split' functionality. This script generates a |
| .manifest file based on the list of user source paths, using source map |
| information. |
| |
| This assumes the name section exists in the input wasm file, and also assumes |
| the sourceMappingURL section exists in the input or a source map file is |
| separately supplied with --sourcemap. If we have two files a.c and b.c, to |
| generate a source map and the name section, if you compile and link within a |
| single command, you can do something like |
| $ emcc -g2 -gsource-map a.c b.c -o result.js |
| If you want to compile and link in separate commands, you can do |
| $ emcc -gsource-map a.c -o a.o |
| $ emcc -gsource-map b.c -o b.o |
| $ emcc -g2 -gsource-map a.o b.o -o result.js |
| See https://emscripten.org/docs/porting/Debugging.html for more details. |
| |
| This takes a wasm file and a paths file, which is a text file containing a list |
| of paths as inputs. The paths file should contain a single path per line. A |
| single split module will be generated per specified path. If a specified path |
| contains another specified path, functions contained in the inner path will be |
| split as the inner path's module, and the rest of the functions will be split as |
| the outer path's module. Functions that do not belong to any of the specified |
| paths will remain in the primary module. |
| |
| The paths in the paths file can be either absolute or relative, but they should |
| match those of 'sources' field in the source map file. Sometimes a source map's |
| 'sources' field contains paths relative to a build directory, so source files |
| may be recorded as '../src/subdir/test.c', for example. In this case, if you |
| want to split the directory src/subdir, you should list it as ../src/subdir. You |
| can manually open the source map file and check the 'sources' field, but we |
| also provide an option to help with that. You can run |
| $ empath-split --print-sources test.wasm |
| or |
| $ empath-split --print-sources --sourcemap test.wasm.map |
| to print the list of sources in 'sources' field in the source map. Note that |
| emscripten's libraries' source files have /emsdk/emscripten prefix, which is a |
| fake deterministic prefix to produce reproducible builds across platforms. |
| """ |
| |
| import argparse |
| import json |
| import os |
| import sys |
| import tempfile |
| from pathlib import PurePath |
| |
| __scriptdir__ = os.path.dirname(os.path.abspath(__file__)) |
| __rootdir__ = os.path.dirname(__scriptdir__) |
| sys.path.insert(0, __rootdir__) |
| |
| from tools import building |
| from tools import diagnostics |
| from tools import emsymbolizer |
| from tools import shared |
| from tools import utils |
| from tools import webassembly |
| from tools.utils import exit_with_error |
| |
| |
def parse_args():
  """Parse the command line of this script.

  Options that this script does not recognize are collected separately so that
  they can be passed on to wasm-split.

  Returns:
    A tuple (args, forwarded_args): the argparse namespace holding this
    script's own options, and the list of unrecognized arguments.

  Exits through parser.error() when a required argument is missing or when
  --manifest is supplied by the user.
  """
  parser = argparse.ArgumentParser(
    description='Split a wasm file based on user paths',
    epilog="""
This is a wrapper for 'wasm-split --multi-split' functionality, so you should
add wasm-split's command line options as well. You should or may want to add
wasm-split options like -o (--output), --out-prefix, -g, and feature
enabling/disabling options. Run 'wasm-split -h' for the list of options. But you
should NOT add --manifest, because this will be generated from this script.
""")
  parser.add_argument('wasm', nargs='?', help='Path to the input wasm file')
  parser.add_argument('paths_file', nargs='?', help='Path to the input file containing paths')
  parser.add_argument('-s', '--sourcemap', help='Force source map file')
  parser.add_argument('-v', '--verbose', action='store_true',
                      help='Print verbose info for debugging this script')
  parser.add_argument('--wasm-split', help='Path to wasm-split executable')
  parser.add_argument('--preserve-manifest', action='store_true',
                      help='Preserve generated manifest file. This sets --verbose too.')
  parser.add_argument('--print-sources', action='store_true',
                      help='Print the list of sources in the source map to help figure out splitting boundaries. Does NOT perform the splitting.')

  args, forwarded_args = parser.parse_known_args()

  def is_forwarded(*names):
    # An option may be spelled either '--opt value' or '--opt=value'. Checking
    # only for the bare name would let '--manifest=foo' through and would
    # reject a perfectly valid '--output=foo'.
    return any(arg == name or arg.startswith(name + '=')
               for arg in forwarded_args
               for name in names)

  if args.preserve_manifest:
    args.verbose = True
  if not args.wasm_split:
    args.wasm_split = os.path.join(building.get_binaryen_bin(), shared.exe_suffix('wasm-split'))

  if is_forwarded('--manifest'):
    parser.error('manifest file will be generated by this script and should not be given')

  # --print-sources only needs something to read the source map from; the
  # paths file and the output options are irrelevant in that mode.
  if args.print_sources:
    if not args.wasm and not args.sourcemap:
      parser.error('--print-sources requires either wasm or --sourcemap')
    return args, forwarded_args

  # Both positionals are declared optional (nargs='?') so that --print-sources
  # can be used without them; enforce them by hand for the splitting mode.
  if not args.wasm and not args.paths_file:
    parser.error("the following arguments are required: wasm, paths_file")
  if not args.paths_file:
    parser.error("the following arguments are required: paths_file")
  if not is_forwarded('-o', '--output'):
    parser.error('-o (--output) is required')
  return args, forwarded_args
| |
| |
def check_errors(args):
  """Validate the files referenced by the parsed command line.

  Checks that the wasm file, the paths file, the source map and the wasm-split
  executable exist, that the wasm file has a name section (and a
  sourceMappingURL section when no --sourcemap was given), and that the source
  map is JSON containing the mandatory fields. Every failure is reported
  through exit_with_error().

  Args:
    args: The argparse namespace returned by parse_args().
  """
  if args.wasm and not os.path.isfile(args.wasm):
    exit_with_error(f"'{args.wasm}' was not found or not a file")
  if args.paths_file and not os.path.isfile(args.paths_file):
    exit_with_error(f"'{args.paths_file}' was not found or not a file")

  # An explicitly given source map wins; otherwise it is taken from the wasm
  # file's sourceMappingURL section below.
  if args.sourcemap:
    sourcemap = args.sourcemap

  if args.wasm:
    with webassembly.Module(args.wasm) as module:
      if not args.sourcemap:
        if not emsymbolizer.get_sourceMappingURL_section(module):
          exit_with_error('sourceMappingURL section does not exist')
        sourcemap = module.get_sourceMappingURL()
      if not module.has_name_section():
        exit_with_error('Name section does not exist')

  if not os.path.isfile(sourcemap):
    exit_with_error(f"'{sourcemap}' was not found or not a file")
  if not os.path.isfile(args.wasm_split):
    exit_with_error(f"'{args.wasm_split}' was not found or not a file")

  # Check source map validity. Just perform simple checks to make sure mandatory
  # fields exist.
  try:
    with open(sourcemap) as f:
      source_map_data = json.load(f)
  except json.JSONDecodeError:
    # Name the file that was actually read: args.sourcemap is None when the
    # path was taken from the wasm file's sourceMappingURL section.
    exit_with_error(f'Invalid JSON format in file {sourcemap}')
  for field in ['version', 'sources', 'mappings']:
    if field not in source_map_data:
      exit_with_error(f"Field '{field}' is missing in the source map")
| |
| |
def get_sourceMappingURL(wasm, arg_sourcemap):
  """Return the source map location to use for `wasm`.

  A non-empty `arg_sourcemap` is returned as is. Otherwise the wasm file is
  opened and the value of its get_sourceMappingURL() is returned.
  """
  if not arg_sourcemap:
    with webassembly.Module(wasm) as wasm_module:
      return wasm_module.get_sourceMappingURL()
  return arg_sourcemap
| |
| |
def print_sources(sourcemap):
  """Print every entry of the 'sources' field of a source map, one per line.

  Args:
    sourcemap: Path to the source map (JSON) file.
  """
  with open(sourcemap) as map_file:
    source_map = json.load(map_file)
  sources = source_map.get('sources')
  assert isinstance(sources, list)
  for entry in sources:
    print(entry)
| |
| |
def get_path_to_functions_map(wasm, sourcemap, paths):
  """Compute which functions of `wasm` belong to each of the given paths.

  Each function is attributed to a source file by looking up addresses inside
  its body in the source map. A function is then assigned to a path when its
  source file equals the path or lies underneath it. When paths are nested, a
  function is assigned only to the innermost matching path.

  Args:
    wasm: Path to the input wasm file.
    sourcemap: Path to the source map file.
    paths: List of normalized paths to split by.

  Returns:
    A dict mapping every entry of `paths` to a (possibly empty) list of
    function names.
  """
  # TODO There can be more
  synthesized_names = (
    'main',
    '__wasm_call_ctors',
    '__clang_call_terminate',
  )
  synthesized_prefixes = (
    'legalstub$',
    'legalfunc$',
    '__cxx_global_',
    '_GLOBAL__',
    'virtual thunk to ',
  )

  def is_synthesized_func(func):
    # Functions with these names are not warned about when they have no source
    # information.
    return func in synthesized_names or func.startswith(synthesized_prefixes)

  # Compute {func_name: src file} map, and invert it to get
  # {src file: list of functions} map, and construct {path: list of functions}
  # map from it
  func_to_src = {}
  with webassembly.Module(wasm) as module:
    funcs = module.get_functions()
    func_names = module.get_function_names()
    assert len(funcs) == len(func_names)

    sm = emsymbolizer.WasmSourceMap()
    sm.parse(sourcemap)

    for func_name, func in zip(func_names, funcs):
      # From the last address, decrement the address by 1 until we find location
      # info with source file information. The reason we do this is to reduce
      # the probability of picking an address where another function is inlined
      # into, picking the inlined function's source.
      # We start from the end because it is simpler; it is harder to compute the
      # first instruction's address, because there is a gap for local types
      # between function offset and the first instruction.
      #
      # Reset the result for every function: if the loop below does not run at
      # all, the location found for the previous function must not be reused.
      loc = None
      addr = func.offset + func.size - 1
      while addr > func.offset:
        loc = sm.lookup(addr, func.offset)
        # This means there is no source map mappings for the entire function
        # (because we give func.offset as a lower bound). Exit the loop.
        if not loc:
          break
        # Exit the loop only if a location info with source file information is
        # found. If not, continue the search.
        if loc.source:
          break
        addr -= 1

      if loc and loc.source:
        func_to_src[func_name] = utils.normalize_path(loc.source)
      elif not is_synthesized_func(func_name):
        diagnostics.warn(f"No source file information found in the source map for function '{func_name}'")

  src_to_funcs = {}
  for func_name, src in func_to_src.items():
    src_to_funcs.setdefault(src, []).append(func_name)

  # Visit paths in the reverse sorting order, so that we can process inner paths
  # first.
  # e.g. If we have /a/b and /a/b/c, /a/b/c will come first, so we can assign
  # functions contained in /a/b/c to it first and assign the remaining functions
  # to /a/b.
  visited_funcs = set()
  path_to_funcs = {}
  for path in sorted(paths, reverse=True):
    ppath = PurePath(path)
    matched = path_to_funcs[path] = []
    for src, src_funcs in src_to_funcs.items():
      psrc = PurePath(src)
      if ppath != psrc and ppath not in psrc.parents:
        continue
      for func_name in src_funcs:
        # A function already claimed by an inner path stays there.
        if func_name not in visited_funcs:
          visited_funcs.add(func_name)
          matched.append(func_name)
  return path_to_funcs
| |
| |
def main():
  """Entry point: generate a manifest from the paths file and run wasm-split.

  With --print-sources, only the source map's list of sources is printed and no
  splitting is performed. Otherwise a temporary .manifest file is written, with
  one module per path listed in the paths file, and
  'wasm-split --multi-split' is run on it together with the forwarded
  arguments. The manifest is removed afterwards unless --preserve-manifest was
  given.
  """
  args, forwarded_args = parse_args()
  check_errors(args)

  sourcemap = get_sourceMappingURL(args.wasm, args.sourcemap)
  if args.print_sources:
    print_sources(sourcemap)
    return

  paths = utils.read_file(args.paths_file).splitlines()
  paths = [utils.normalize_path(path.strip()) for path in paths if path.strip()]
  # To make /a/b/c and /a/b/c/ equivalent
  paths = [path.rstrip(os.sep) for path in paths]
  # Remove duplicates
  paths = list(dict.fromkeys(paths))

  # Compute {path: list of functions} map
  path_to_funcs = get_path_to_functions_map(args.wasm, sourcemap, paths)

  # Write .manifest file.
  # The file is created with delete=False and removed by hand below. Tying
  # `delete` to --preserve-manifest directly would remove the file exactly when
  # the user asked to keep it and leak it otherwise. Closing the file before
  # running wasm-split also means no other process has to open a file that is
  # still held open here.
  f = tempfile.NamedTemporaryFile(suffix=".manifest", mode='w+', delete=False)
  manifest = f.name
  try:
    with f:
      for i, path in enumerate(paths):
        # Each module section starts with the module name, here the index of
        # the path, followed by one function name per line.
        f.write(f'{i}\n')
        if not path_to_funcs[path]:
          diagnostics.warn(f'{path} does not match any functions')
        if args.verbose:
          print(f'{path}: {len(path_to_funcs[path])} functions')
          for func in path_to_funcs[path]:
            print('  ' + func)
          print()
        for func in path_to_funcs[path]:
          f.write(func + '\n')
        # Sections are separated by an empty line.
        if i < len(paths) - 1:
          f.write('\n')

    cmd = [args.wasm_split, '--multi-split', args.wasm, '--manifest', manifest]
    if args.verbose:
      # This option is used both in this script and wasm-split
      cmd.append('-v')
    cmd += forwarded_args
    if args.verbose:
      print('\n' + ' '.join(cmd))
    shared.run_process(cmd)
  finally:
    if not args.preserve_manifest:
      os.remove(manifest)
| |
| |
# Run main() only when executed as a script, and hand its return value to
# sys.exit() as the exit status.
if __name__ == '__main__':
  exit_status = main()
  sys.exit(exit_status)