Reland "Support names field in source maps" (#25928)
This relands #25870. #25870 was reverted in #25923 due to CI errors
(https://app.circleci.com/pipelines/github/emscripten-core/emscripten/47832/workflows/ea0292aa-124d-4a3f-b988-0a96823e9bcd/jobs/1089017/tests),
which were fixed by
https://github.com/WebAssembly/binaryen/pull/8113.
diff --git a/ChangeLog.md b/ChangeLog.md
index 3fd97ff..1a6ad5c 100644
--- a/ChangeLog.md
+++ b/ChangeLog.md
@@ -20,6 +20,10 @@
4.0.22 (in development)
-----------------------
+- Source maps now support the 'names' field with function name information.
+ emsymbolizer will show function names when used with a source map. The size
+ of source maps may increase 2-3x and the link time can increase slightly due
+ to more processing on source map creation. (#25298)
- Emscripten will now cache the JS code that it generates and re-use when
linking with the same settings at a later date. This should improve link
times generally but should especially noticeable when linking lots of small
diff --git a/test/core/test_dwarf.cpp b/test/core/test_dwarf.cpp
new file mode 100644
index 0000000..ad91ccd
--- /dev/null
+++ b/test/core/test_dwarf.cpp
@@ -0,0 +1,26 @@
+#include <emscripten.h>
+
+EM_JS(int, out_to_js, (int x), {})
+
+class MyClass {
+public:
+ void foo();
+ void bar();
+};
+
+void __attribute__((noinline)) MyClass::foo() {
+ out_to_js(0); // line 12
+ out_to_js(1);
+ out_to_js(2);
+}
+
+void __attribute__((always_inline)) MyClass::bar() {
+ out_to_js(3);
+ __builtin_trap(); // line 19
+}
+
+int main() {
+ MyClass mc;
+ mc.foo();
+ mc.bar();
+}
diff --git a/test/test_other.py b/test/test_other.py
index 76b051c..00ada9a 100644
--- a/test/test_other.py
+++ b/test/test_other.py
@@ -9629,12 +9629,49 @@
for loc in locs:
self.assertIn(loc, out)
- def check_source_map_loc_info(address, loc):
+ def check_source_map_loc_info(address, func, loc):
out = self.run_process(
[emsymbolizer, '-s', 'sourcemap', 'test_dwarf.wasm', address],
stdout=PIPE).stdout
+ self.assertIn(func, out)
self.assertIn(loc, out)
+ def do_tests(src):
+ # 1. Test DWARF + source map together
+ # For DWARF, we check for the full inlined info for both function names and
+ # source locations. Source maps do not provide inlined info, so we only
+ # check for the info of the innermost function (the first list entry).
+ self.run_process([EMCC, test_file(src), '-g', '-gsource-map', '-O1', '-o',
+ 'test_dwarf.js'])
+ check_dwarf_loc_info(out_to_js_call_addr, out_to_js_call_func,
+ out_to_js_call_loc)
+ check_source_map_loc_info(out_to_js_call_addr, out_to_js_call_func[0],
+ out_to_js_call_loc[0])
+ check_dwarf_loc_info(unreachable_addr, unreachable_func, unreachable_loc)
+ # Source map shows the original (inlined) source location with the original
+ # function name
+ check_source_map_loc_info(unreachable_addr, unreachable_func[0],
+ unreachable_loc[0])
+
+ # 2. Test source map only
+ # The addresses, function names, and source locations are the same across
+ # the builds because they are relative offsets from the code section, so we
+ # don't need to recompute them
+ self.run_process([EMCC, test_file(src), '-gsource-map', '-O1', '-o',
+ 'test_dwarf.js'])
+ check_source_map_loc_info(out_to_js_call_addr, out_to_js_call_func[0],
+ out_to_js_call_loc[0])
+ check_source_map_loc_info(unreachable_addr, unreachable_func[0],
+ unreachable_loc[0])
+
+ # 3. Test DWARF only
+ self.run_process([EMCC, test_file(src), '-g', '-O1', '-o',
+ 'test_dwarf.js'])
+ check_dwarf_loc_info(out_to_js_call_addr, out_to_js_call_func,
+ out_to_js_call_loc)
+ check_dwarf_loc_info(unreachable_addr, unreachable_func, unreachable_loc)
+
+ # -- C program test --
# We test two locations within test_dwarf.c:
# out_to_js(0); // line 6
# __builtin_trap(); // line 13
@@ -9657,31 +9694,32 @@
# The first one corresponds to the innermost inlined location.
unreachable_loc = ['test_dwarf.c:13:3', 'test_dwarf.c:18:3']
- # 1. Test DWARF + source map together
- # For DWARF, we check for the full inlined info for both function names and
- # source locations. Source maps provide neither function names nor inlined
- # info. So we only check for the source location of the outermost function.
- check_dwarf_loc_info(out_to_js_call_addr, out_to_js_call_func,
- out_to_js_call_loc)
- check_source_map_loc_info(out_to_js_call_addr, out_to_js_call_loc[0])
- check_dwarf_loc_info(unreachable_addr, unreachable_func, unreachable_loc)
- check_source_map_loc_info(unreachable_addr, unreachable_loc[0])
+ do_tests('core/test_dwarf.c')
- # 2. Test source map only
- # The addresses, function names, and source locations are the same across
- # the builds because they are relative offsets from the code section, so we
- # don't need to recompute them
- self.run_process([EMCC, test_file('core/test_dwarf.c'),
- '-gsource-map', '-O1', '-o', 'test_dwarf.js'])
- check_source_map_loc_info(out_to_js_call_addr, out_to_js_call_loc[0])
- check_source_map_loc_info(unreachable_addr, unreachable_loc[0])
+ # -- C++ program test --
+ # We test two locations within test_dwarf.cpp:
+ # out_to_js(0); // line 12
+ # __builtin_trap(); // line 19
+ self.run_process([EMCC, test_file('core/test_dwarf.cpp'),
+ '-g', '-gsource-map', '-O1', '-o', 'test_dwarf.js'])
+ # Address of out_to_js(0) within MyClass::foo(), uninlined
+ out_to_js_call_addr = self.get_instr_addr('call\t0', 'test_dwarf.wasm')
+ # Address of __builtin_trap() within MyClass::bar(), inlined into main()
+ unreachable_addr = self.get_instr_addr('unreachable', 'test_dwarf.wasm')
- # 3. Test DWARF only
- self.run_process([EMCC, test_file('core/test_dwarf.c'),
- '-g', '-O1', '-o', 'test_dwarf.js'])
- check_dwarf_loc_info(out_to_js_call_addr, out_to_js_call_func,
- out_to_js_call_loc)
- check_dwarf_loc_info(unreachable_addr, unreachable_func, unreachable_loc)
+ # Function name of out_to_js(0) within MyClass::foo(), uninlined
+ out_to_js_call_func = ['MyClass::foo()']
+ # Function names of __builtin_trap() within MyClass::bar(), inlined into
+ # main(). The first one corresponds to the innermost inlined function.
+ unreachable_func = ['MyClass::bar()', 'main']
+
+ # Source location of out_to_js(0) within MyClass::foo(), uninlined
+ out_to_js_call_loc = ['test_dwarf.cpp:12:3']
+ # Source locations of __builtin_trap() within MyClass::bar(), inlined into
+ # main(). The first one corresponds to the innermost inlined location.
+ unreachable_loc = ['test_dwarf.cpp:19:3', 'test_dwarf.cpp:25:6']
+
+ do_tests('core/test_dwarf.cpp')
def test_emsymbolizer_functions(self):
'Test emsymbolizer use cases that only provide function-granularity info'
diff --git a/tools/emsymbolizer.py b/tools/emsymbolizer.py
index 37d50cb..a4046ce 100755
--- a/tools/emsymbolizer.py
+++ b/tools/emsymbolizer.py
@@ -117,6 +117,7 @@
def __init__(self):
self.version = None
self.sources = []
+ self.funcs = []
self.mappings = {}
self.offsets = []
@@ -128,6 +129,7 @@
self.version = source_map_json['version']
self.sources = source_map_json['sources']
+ self.funcs = source_map_json['names']
chars = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/='
vlq_map = {c: i for i, c in enumerate(chars)}
@@ -155,6 +157,7 @@
src = 0
line = 1
col = 1
+ func = 0
for segment in source_map_json['mappings'].split(','):
data = decodeVLQ(segment)
info = []
@@ -169,7 +172,9 @@
if len(data) >= 4:
col += data[3]
info.append(col)
- # TODO: see if we need the name, which is the next field (data[4])
+ if len(data) == 5:
+ func += data[4]
+ info.append(func)
self.mappings[offset] = WasmSourceMap.Location(*info)
self.offsets.append(offset)
@@ -207,6 +212,7 @@
self.sources[info.source] if info.source is not None else None,
info.line,
info.column,
+ self.funcs[info.func] if info.func is not None else None,
)
diff --git a/tools/wasm-sourcemap.py b/tools/wasm-sourcemap.py
index 31f112f..63ae7ea 100755
--- a/tools/wasm-sourcemap.py
+++ b/tools/wasm-sourcemap.py
@@ -25,6 +25,8 @@
from tools import shared, utils
from tools.system_libs import DETERMINISTIC_PREFIX
+LLVM_CXXFILT = shared.llvm_tool_path('llvm-cxxfilt')
+
EMSCRIPTEN_PREFIX = utils.normalize_path(utils.path_from_root())
logger = logging.getLogger('wasm-sourcemap')
@@ -217,32 +219,189 @@
def extract_comp_dir_map(text):
+ compile_unit_pattern = re.compile(r"0x[0-9a-f]*: DW_TAG_compile_unit")
+ stmt_list_pattern = re.compile(r"DW_AT_stmt_list\s+\((0x[0-9a-f]*)\)")
+ comp_dir_pattern = re.compile(r"DW_AT_comp_dir\s+\(\"([^\"]+)\"\)")
+
map_stmt_list_to_comp_dir = {}
- chunks = re.split(r"0x[0-9a-f]*: DW_TAG_compile_unit", text)
+ chunks = compile_unit_pattern.split(text) # DW_TAG_compile_unit
for chunk in chunks[1:]:
- stmt_list_match = re.search(r"DW_AT_stmt_list\s+\((0x[0-9a-f]*)\)", chunk)
+ stmt_list_match = stmt_list_pattern.search(chunk) # DW_AT_stmt_list
if stmt_list_match is not None:
stmt_list = stmt_list_match.group(1)
- comp_dir_match = re.search(r"DW_AT_comp_dir\s+\(\"([^\"]+)\"\)", chunk)
+ comp_dir_match = comp_dir_pattern.search(chunk) # DW_AT_comp_dir
comp_dir = decode_octal_encoded_utf8(comp_dir_match.group(1)) if comp_dir_match is not None else ''
map_stmt_list_to_comp_dir[stmt_list] = comp_dir
return map_stmt_list_to_comp_dir
-def read_dwarf_entries(wasm, options):
+def demangle_names(names):
+ # Only demangle names that look mangled
+ mangled_names = sorted({n for n in names if n.startswith('_Z')})
+ if not mangled_names:
+ return {}
+ if not os.path.exists(LLVM_CXXFILT):
+ logger.warning('llvm-cxxfilt does not exist')
+ return {}
+
+ # Gather all mangled names and call llvm-cxxfilt only once for all of them
+ input_str = '\n'.join(mangled_names)
+ proc = shared.check_call([LLVM_CXXFILT], input=input_str, stdout=shared.PIPE, stderr=shared.PIPE, text=True)
+ if proc.returncode != 0:
+ logger.warning('llvm-cxxfilt failed: %s' % proc.stderr)
+ return {}
+
+ demangled_list = proc.stdout.splitlines()
+ if len(demangled_list) != len(mangled_names):
+ logger.warning('llvm-cxxfilt output length mismatch')
+ return {}
+
+ return dict(zip(mangled_names, demangled_list, strict=True))
+
+
+class FuncRange:
+ def __init__(self, name, low_pc, high_pc):
+ self.name = name
+ self.low_pc = low_pc
+ self.high_pc = high_pc
+
+
+# This function parses DW_TAG_subprogram and DW_TAG_inlined_subroutine entries
+# and returns those with a name, low_pc, and high_pc as a list of FuncRanges,
+# sorted by increasing low_pc (ties broken by decreasing high_pc).
+def extract_func_ranges(text):
+ # This function handles four cases:
+ # 1. DW_TAG_subprogram with DW_AT_name, DW_AT_low_pc, and DW_AT_high_pc.
+ # 0x000000ba: DW_TAG_subprogram
+ # DW_AT_low_pc (0x0000005f)
+ # DW_AT_high_pc (0x00000071)
+ # DW_AT_name ("foo")
+ # ...
+ #
+ # 2. DW_TAG_subprogram with DW_AT_linkage_name, DW_AT_low_pc, and
+ # DW_AT_high_pc. Applies to mangled C++ functions.
+ # (We parse DW_AT_linkage_name instead of DW_AT_name here.)
+ # 0x000000ba: DW_TAG_subprogram
+ # DW_AT_low_pc (0x0000005f)
+ # DW_AT_high_pc (0x00000071)
+ # DW_AT_linkage_name ("_ZN7MyClass3fooEv")
+ # DW_AT_name ("foo")
+ # ...
+ #
+ # 3. DW_TAG_subprogram with DW_AT_specification, DW_AT_low_pc, and
+ # DW_AT_high_pc. C++ function info can be split into two DIEs (one with
+ # DW_AT_linkage_name and DW_AT_declaration (true) and the other with
+ # DW_AT_specification). In this case we parse DW_AT_specification for the
+ # function name.
+ # 0x0000006d: DW_TAG_subprogram
+ # DW_AT_linkage_name ("_ZN7MyClass3fooEv")
+ # DW_AT_name ("foo")
+ # DW_AT_declaration (true)
+ # ...
+ # 0x00000097: DW_TAG_subprogram
+ # DW_AT_low_pc (0x00000007)
+ # DW_AT_high_pc (0x0000004c)
+ # DW_AT_specification (0x0000006d "_ZN7MyClass3fooEv")
+ # ...
+ #
+ # 4. DW_TAG_inlined_subroutine with DW_AT_abstract_origin, DW_AT_low_pc, and
+ # DW_AT_high_pc. This represents an inlined function. We parse
+ # DW_AT_abstract_origin for the original function name.
+ # 0x0000011a: DW_TAG_inlined_subroutine
+ # DW_AT_abstract_origin (0x000000da "_ZN7MyClass3barEv")
+ # DW_AT_low_pc (0x00000078)
+ # DW_AT_high_pc (0x00000083)
+ # ...
+
+ tag_pattern = re.compile(r'\r?\n(?=0x[0-9a-f]+:)')
+ subprogram_pattern = re.compile(r"0x[0-9a-f]+:\s+DW_TAG_subprogram")
+ inlined_pattern = re.compile(r"0x[0-9a-f]+:\s+DW_TAG_inlined_subroutine")
+ low_pc_pattern = re.compile(r'DW_AT_low_pc\s+\(0x([0-9a-f]+)\)')
+ high_pc_pattern = re.compile(r'DW_AT_high_pc\s+\(0x([0-9a-f]+)\)')
+ abstract_origin_pattern = re.compile(r'DW_AT_abstract_origin\s+\(0x[0-9a-f]+\s+"([^"]+)"\)')
+ linkage_name_pattern = re.compile(r'DW_AT_linkage_name\s+\("([^"]+)"\)')
+ name_pattern = re.compile(r'DW_AT_name\s+\("([^"]+)"\)')
+ specification_pattern = re.compile(r'DW_AT_specification\s+\(0x[0-9a-f]+\s+"([^"]+)"\)')
+
+ func_ranges = []
+ dw_tags = tag_pattern.split(text)
+
+ def get_name_from_tag(tag):
+ m = linkage_name_pattern.search(tag) # DW_AT_linkage_name
+ if m:
+ return m.group(1)
+ m = name_pattern.search(tag) # DW_AT_name
+ if m:
+ return m.group(1)
+ # If name is missing, check for DW_AT_specification annotation
+ m = specification_pattern.search(tag)
+ if m:
+ return m.group(1)
+ return None
+
+ for tag in dw_tags:
+ is_subprogram = subprogram_pattern.search(tag) # DW_TAG_subprogram
+ is_inlined = inlined_pattern.search(tag) # DW_TAG_inlined_subroutine
+
+ if is_subprogram or is_inlined:
+ name = None
+ low_pc = None
+ high_pc = None
+ m = low_pc_pattern.search(tag) # DW_AT_low_pc
+ if m:
+ low_pc = int(m.group(1), 16)
+ m = high_pc_pattern.search(tag) # DW_AT_high_pc
+ if m:
+ high_pc = int(m.group(1), 16)
+ if is_subprogram:
+ name = get_name_from_tag(tag)
+ else: # is_inlined
+ m = abstract_origin_pattern.search(tag) # DW_AT_abstract_origin
+ if m:
+ name = m.group(1)
+ if name and low_pc is not None and high_pc is not None:
+ func_ranges.append(FuncRange(name, low_pc, high_pc))
+
+ # Demangle names
+ all_names = [item.name for item in func_ranges]
+ demangled_map = demangle_names(all_names)
+ for func_range in func_ranges:
+ if func_range.name in demangled_map:
+ func_range.name = demangled_map[func_range.name]
+
+ # To correctly identify the innermost function for a given address,
+ # func_ranges is sorted primarily by low_pc in ascending order and secondarily
+ # by high_pc in descending order. This ensures that for overlapping ranges,
+ # the more specific (inner) range appears later in the list.
+ func_ranges.sort(key=lambda item: (item.low_pc, -item.high_pc))
+ return func_ranges
+
+
+def read_dwarf_info(wasm, options):
if options.dwarfdump_output:
output = utils.read_file(options.dwarfdump_output)
elif options.dwarfdump:
logger.debug('Reading DWARF information from %s' % wasm)
if not os.path.exists(options.dwarfdump):
utils.exit_with_error('llvm-dwarfdump not found: ' + options.dwarfdump)
- proc = shared.check_call([options.dwarfdump, '-debug-info', '-debug-line', '--recurse-depth=0', wasm], stdout=shared.PIPE)
+ # We need only three tags in the debug info: DW_TAG_compile_unit for
+ # source location, and DW_TAG_subprogram and DW_TAG_inlined_subroutine
+ # for the function ranges.
+ dwarfdump_cmd = [options.dwarfdump, '-debug-info', '-debug-line', wasm,
+ '-t', 'DW_TAG_compile_unit', '-t', 'DW_TAG_subprogram',
+ '-t', 'DW_TAG_inlined_subroutine']
+ proc = shared.check_call(dwarfdump_cmd, stdout=shared.PIPE)
output = proc.stdout
else:
utils.exit_with_error('Please specify either --dwarfdump or --dwarfdump-output')
+ debug_line_pattern = re.compile(r"debug_line\[(0x[0-9a-f]*)\]")
+ include_dir_pattern = re.compile(r"include_directories\[\s*(\d+)\] = \"([^\"]*)")
+ file_pattern = re.compile(r"file_names\[\s*(\d+)\]:\s+name: \"([^\"]*)\"\s+dir_index: (\d+)")
+ line_pattern = re.compile(r"\n0x([0-9a-f]+)\s+(\d+)\s+(\d+)\s+(\d+)(.*?end_sequence)?")
+
entries = []
- debug_line_chunks = re.split(r"debug_line\[(0x[0-9a-f]*)\]", output)
+ debug_line_chunks = debug_line_pattern.split(output)
map_stmt_list_to_comp_dir = extract_comp_dir_map(debug_line_chunks[0])
for stmt_list, line_chunk in zip(debug_line_chunks[1::2], debug_line_chunks[2::2], strict=True):
comp_dir = map_stmt_list_to_comp_dir.get(stmt_list, '')
@@ -263,16 +422,16 @@
# 0x0000000000000011 28 0 1 0 0 is_stmt
include_directories = {'0': comp_dir}
- for dir in re.finditer(r"include_directories\[\s*(\d+)\] = \"([^\"]*)", line_chunk):
+ for dir in include_dir_pattern.finditer(line_chunk):
include_directories[dir.group(1)] = os.path.join(comp_dir, decode_octal_encoded_utf8(dir.group(2)))
files = {}
- for file in re.finditer(r"file_names\[\s*(\d+)\]:\s+name: \"([^\"]*)\"\s+dir_index: (\d+)", line_chunk):
+ for file in file_pattern.finditer(line_chunk):
dir = include_directories[file.group(3)]
file_path = os.path.join(dir, decode_octal_encoded_utf8(file.group(2)))
files[file.group(1)] = file_path
- for line in re.finditer(r"\n0x([0-9a-f]+)\s+(\d+)\s+(\d+)\s+(\d+)(.*?end_sequence)?", line_chunk):
+ for line in line_pattern.finditer(line_chunk):
entry = {'address': int(line.group(1), 16), 'line': int(line.group(2)), 'column': int(line.group(3)), 'file': files[line.group(4)], 'eos': line.group(5) is not None}
if not entry['eos']:
entries.append(entry)
@@ -288,22 +447,61 @@
remove_dead_entries(entries)
# return entries sorted by the address field
- return sorted(entries, key=lambda entry: entry['address'])
+ entries = sorted(entries, key=lambda entry: entry['address'])
+
+ func_ranges = extract_func_ranges(debug_line_chunks[0])
+ return entries, func_ranges
-def build_sourcemap(entries, code_section_offset, options):
+def build_sourcemap(entries, func_ranges, code_section_offset, options):
base_path = options.basepath
collect_sources = options.sources
prefixes = SourceMapPrefixes(options.prefix, options.load_prefix, base_path)
+ # Add code section offset to the low/high pc in the function PC ranges
+ for func_range in func_ranges:
+ func_range.low_pc += code_section_offset
+ func_range.high_pc += code_section_offset
+
sources = []
sources_content = []
+ # There can be duplicate names in case an original source function has
+ # multiple disjoint PC ranges or is inlined to multiple callsites. Make the
+ # 'names' list a unique list of names, and map the function ranges to the
+ # indices in that list.
+ names = sorted({item.name for item in func_ranges})
+ name_to_id = {name: i for i, name in enumerate(names)}
mappings = []
sources_map = {}
last_address = 0
last_source_id = 0
last_line = 1
last_column = 1
+ last_func_id = 0
+
+ active_funcs = []
+ next_func_range_id = 0
+
+ # Get the function name ID that the given address falls into
+ def get_function_id(address):
+ nonlocal active_funcs
+ nonlocal next_func_range_id
+
+ # Maintain a list of "active functions" whose ranges currently cover the
+ # address. As the address advances, it adds new functions that start and
+ # removes functions that end. The last function remaining in the active list
+ # at any point is the innermost function.
+ while next_func_range_id < len(func_ranges) and func_ranges[next_func_range_id].low_pc <= address:
+ # active_funcs contains (high_pc, id) pair
+ active_funcs.append((func_ranges[next_func_range_id].high_pc, next_func_range_id))
+ next_func_range_id += 1
+ active_funcs = [f for f in active_funcs if f[0] > address]
+
+ if active_funcs:
+ func_range_id = active_funcs[-1][1]
+ name = func_ranges[func_range_id].name
+ return name_to_id[name]
+ return None
for entry in entries:
line = entry['line']
@@ -334,21 +532,27 @@
sources_content.append(None)
else:
source_id = sources_map[source_name]
+ func_id = get_function_id(address)
address_delta = address - last_address
source_id_delta = source_id - last_source_id
line_delta = line - last_line
column_delta = column - last_column
- mappings.append(encode_vlq(address_delta) + encode_vlq(source_id_delta) + encode_vlq(line_delta) + encode_vlq(column_delta))
last_address = address
last_source_id = source_id
last_line = line
last_column = column
+ mapping = encode_vlq(address_delta) + encode_vlq(source_id_delta) + encode_vlq(line_delta) + encode_vlq(column_delta)
+ if func_id is not None:
+ func_id_delta = func_id - last_func_id
+ last_func_id = func_id
+ mapping += encode_vlq(func_id_delta)
+ mappings.append(mapping)
return {'version': 3,
'sources': sources,
'sourcesContent': sources_content,
- 'names': [],
+ 'names': names,
'mappings': ','.join(mappings)}
@@ -359,12 +563,12 @@
with open(wasm_input, 'rb') as infile:
wasm = infile.read()
- entries = read_dwarf_entries(wasm_input, options)
+ entries, func_ranges = read_dwarf_info(wasm_input, options)
code_section_offset = get_code_section_offset(wasm)
logger.debug('Saving to %s' % options.output)
- map = build_sourcemap(entries, code_section_offset, options)
+ map = build_sourcemap(entries, func_ranges, code_section_offset, options)
with open(options.output, 'w', encoding='utf-8') as outfile:
json.dump(map, outfile, separators=(',', ':'), ensure_ascii=False)