Migrate to llvm-objdump and llvm-symbolizer

The Android NDK will no longer support tools based on the GNU toolchain.

Hence, migrate ASan symbolization for Android to LLVM-based tools
such as llvm-symbolizer and llvm-objdump.

Bug: 1273402
Cq-Include-Trybots: luci.chromium.try:linux_chromium_asan_rel_ng,win-asan,mac_chromium_asan_rel_ng,android-asan
Change-Id: Icc0c5f8e64e1bb05d49f347c3316850a0ff83e55
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/3303875
Reviewed-by: Andrew Grieve <[email protected]>
Reviewed-by: Hans Wennborg <[email protected]>
Reviewed-by: Nico Weber <[email protected]>
Commit-Queue: Jonathan Wright <[email protected]>
Cr-Commit-Position: refs/heads/main@{#961488}
NOKEYCHECK=True
GitOrigin-RevId: ffa736ce5f5d7b018b768c29c0cc1ff35152393a
diff --git a/llvm_objdump.py b/llvm_objdump.py
new file mode 100644
index 0000000..3b2acd6
--- /dev/null
+++ b/llvm_objdump.py
@@ -0,0 +1,178 @@
+# Copyright 2022 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+"""Looks up the symbol that contains an address by running llvm-objdump."""
+
+import logging
+import os
+import re
+import subprocess
+
+_CHROME_SRC = os.path.join(os.path.dirname(__file__), os.pardir, os.pardir)
+_LLVM_OBJDUMP_PATH = os.path.join(_CHROME_SRC, 'third_party', 'llvm-build',
+                                  'Release+Asserts', 'bin', 'llvm-objdump')
+
+# Function lines look like:
+#   000177b0 <android::IBinder::~IBinder()+0x2c>:
+# We pull out the address and function first. Then we check for an optional
+# offset. This is tricky due to functions that look like "operator+(..)+0x2c"
+_FUNC = re.compile(r"(^[a-f0-9]*) <(.*)>:$")
+_OFFSET = re.compile(r"(.*)\+0x([a-f0-9]*)")
+
+# A disassembly line looks like:
+#   177b2:  b510        push  {r4, lr}
+# Only the address in front of the colon is captured. It must contain at
+# least one hex digit so that it can always be parsed with int(..., 16).
+_ASM = re.compile(r"^ *([a-f0-9]+):.*$")
+
+
+def _StripPC(addr, cpu_arch):
+  """Strips the Thumb bit from a program counter address when appropriate.
+
+  Args:
+    addr: the program counter address
+    cpu_arch: Target CPU architecture.
+
+  Returns:
+    The stripped program counter address.
+  """
+  if cpu_arch == "arm":
+    return addr & ~1
+  return addr
+
+
+class ObjdumpInformation(object):
+  """Result of a symbol lookup performed with llvm-objdump.
+
+  Attributes:
+    address: the address that was looked up, as passed by the caller.
+    library: path of the binary the address was looked up in.
+    symbol: name of the enclosing symbol, or None if none was found.
+    offset: offset of the address from the start of the symbol, or None.
+  """
+
+  def __init__(self, address, library, symbol, offset):
+    self.address = address
+    self.library = library
+    self.symbol = symbol
+    self.offset = offset
+
+
+class LLVMObjdumper(object):
+  """Finds the symbol that contains an address by disassembling around it."""
+
+  def __init__(self):
+    """Creates an instance of LLVMObjdumper that interacts with llvm-objdump.
+    """
+    self._llvm_objdump_parameters = [
+        '--disassemble',
+        '--demangle',
+        '--section=.text',
+    ]
+
+  def __enter__(self):
+    return self
+
+  def __exit__(self, exc_type, exc_val, exc_tb):
+    # Nothing to release; each lookup waits for its own llvm-objdump process.
+    pass
+
+  @staticmethod
+  def GetSymbolDataFromObjdumpOutput(objdump_out, address, cpu_arch):
+    """Extracts the symbol and offset for an address from llvm-objdump output.
+
+    Args:
+      objdump_out: text printed by llvm-objdump for a disassembled range.
+      address: address to look for, as an integer.
+      cpu_arch: architecture where the dump was taken
+
+    Returns:
+      A (symbol, offset) tuple where offset is the distance of the address
+      from the start of the symbol, or (None, None) if no disassembly line
+      for the address follows a function line.
+    """
+    stripped_target_address = _StripPC(address, cpu_arch)
+
+    # No function line has been seen yet.
+    current_symbol = None
+    current_symbol_addr = None
+
+    # splitlines() copes with both '\n' and '\r\n'. Splitting on os.linesep
+    # breaks when the text uses '\n' but os.linesep is '\r\n'.
+    for line in objdump_out.splitlines():
+      components = _FUNC.match(line)
+      if components:
+        # This is a new function, so record the current function and its
+        # address.
+        current_symbol_addr = int(components.group(1), 16)
+        current_symbol = components.group(2)
+
+        # Does it have an optional offset like: "foo(..)+0x2c"?
+        components = _OFFSET.match(current_symbol)
+        if components:
+          current_symbol = components.group(1)
+          offset = components.group(2)
+          if offset:
+            current_symbol_addr -= int(offset, 16)
+
+      # Is it a disassembly line like: "177b2:  b510        push  {r4, lr}"?
+      # Lines that precede the first function line are skipped because there
+      # is no symbol to attribute them to.
+      components = _ASM.match(line)
+      if components and current_symbol is not None:
+        i_addr = int(components.group(1), 16)
+        if i_addr == stripped_target_address:
+          return (current_symbol, stripped_target_address - current_symbol_addr)
+
+    return (None, None)
+
+  def GetSymbolInformation(self, lib, address, cpu_arch):
+    """Returns the symbol that contains the given address in a library.
+
+    Args:
+      lib: library to search for info.
+      address: address to look for info.
+      cpu_arch: architecture where the dump was taken
+
+    Returns:
+      An ObjdumpInformation object, or None if llvm-objdump cannot be found or
+      exits with a non-zero return code.
+    """
+    if not os.path.isfile(_LLVM_OBJDUMP_PATH):
+      logging.error('Cannot find llvm-objdump. path=%s', _LLVM_OBJDUMP_PATH)
+      return None
+
+    stripped_address = _StripPC(address, cpu_arch)
+
+    # Restrict the disassembly to a small window starting at the address.
+    full_arguments = [_LLVM_OBJDUMP_PATH] + self._llvm_objdump_parameters
+    full_arguments.append('--start-address=' + str(stripped_address))
+    full_arguments.append('--stop-address=' + str(stripped_address + 8))
+    full_arguments.append(lib)
+
+    # stderr must be piped too, otherwise communicate() returns None for it
+    # and the error message below cannot show the tool's diagnostics.
+    objdump_process = subprocess.Popen(full_arguments,
+                                       stdout=subprocess.PIPE,
+                                       stdin=subprocess.PIPE,
+                                       stderr=subprocess.PIPE,
+                                       universal_newlines=True)
+
+    stdout, stderr = objdump_process.communicate()
+    objdump_process_return_code = objdump_process.returncode
+
+    if objdump_process_return_code != 0:
+      logging.error(
+          'Invocation of llvm-objdump failed!' +
+          ' tool-command-line=\'{}\', return-code={}, std-error=\'{}\''.format(
+              ' '.join(full_arguments), objdump_process_return_code, stderr))
+      return None
+
+    symbol, offset = LLVMObjdumper.GetSymbolDataFromObjdumpOutput(
+        stdout, address, cpu_arch)
+
+    return ObjdumpInformation(address=address,
+                              library=lib,
+                              symbol=symbol,
+                              offset=offset)
diff --git a/llvm_symbolizer.py b/llvm_symbolizer.py
index ac21078..86bc0a8 100644
--- a/llvm_symbolizer.py
+++ b/llvm_symbolizer.py
@@ -4,7 +4,6 @@
 
 import logging
 import os
-import re
 import subprocess
 import threading
 
@@ -13,25 +12,26 @@
     _CHROME_SRC, 'third_party', 'llvm-build', 'Release+Asserts', 'bin',
     'llvm-symbolizer')
 
-_BINARY = re.compile(r'0b[0,1]+')
-_HEX = re.compile(r'0x[0-9,a-e]+')
-_OCTAL = re.compile(r'0[0-7]+')
-
 _UNKNOWN = '<UNKNOWN>'
 
+_ELF_MAGIC_HEADER_BYTES = b'\x7f\x45\x4c\x46'
 
-def _CheckValidAddr(addr):
-  """
-  Check whether the addr is valid input to llvm symbolizer.
-  Valid addr has to be octal, binary, or hex number.
+
+def IsValidLLVMSymbolizerTarget(file_path):
+  """Verify that the passed file is a valid target for llvm-symbolizer.
 
   Args:
-    addr: addr to be entered to llvm symbolizer.
+    file_path: Path to the file to be checked.
 
-  Returns:
-    whether the addr is valid input to llvm symbolizer.
+  Returns:
+    True if the file can be read and starts with the ELF magic, else False.
   """
-  return _HEX.match(addr) or _OCTAL.match(addr) or _BINARY.match(addr)
+  try:
+    with open(file_path, 'rb') as f:
+      header_bytes = f.read(4)
+      return header_bytes == _ELF_MAGIC_HEADER_BYTES
+  except IOError:
+    return False
 
 
 class LLVMSymbolizer(object):
@@ -42,6 +42,12 @@
     numbers of an address from the symbols library.
     """
     self._llvm_symbolizer_subprocess = None
+    self._llvm_symbolizer_parameters = [
+        '--functions',
+        '--demangle',
+        '--inlines',
+    ]
+
     # Allow only one thread to call GetSymbolInformation at a time.
     self._lock = threading.Lock()
 
@@ -53,7 +59,7 @@
     """
     if os.path.isfile(_LLVM_SYMBOLIZER_PATH):
       self._llvm_symbolizer_subprocess = subprocess.Popen(
-          [_LLVM_SYMBOLIZER_PATH],
+          [_LLVM_SYMBOLIZER_PATH] + self._llvm_symbolizer_parameters,
           stdout=subprocess.PIPE,
           stdin=subprocess.PIPE,
           universal_newlines=True)
@@ -89,15 +95,26 @@
       addr: address to look for info.
 
     Returns:
-      A list of (function name, line numbers) tuple.
+      A triplet of address, module-name and list of symbols
     """
-    if (self._llvm_symbolizer_subprocess is None or not lib
-        or not _CheckValidAddr(addr) or not os.path.isfile(lib)):
+    if (self._llvm_symbolizer_subprocess is None):
+      logging.error('Can\'t run llvm-symbolizer! ' +
+                    'Subprocess for llvm-symbolizer has not been started!')
+      return [(_UNKNOWN, lib)]
+
+    if not lib:
+      logging.error('Can\'t run llvm-symbolizer! No target is given!')
+      return [(_UNKNOWN, lib)]
+
+    if not IsValidLLVMSymbolizerTarget(lib):
+      logging.error(
+          'Can\'t run llvm-symbolizer! ' +
+          'Given binary is not a valid target. path=%s', lib)
       return [(_UNKNOWN, lib)]
 
     proc = self._llvm_symbolizer_subprocess
     with self._lock:
-      proc.stdin.write('%s %s\n' % (lib, addr))
+      proc.stdin.write('%s %s\n' % (lib, hex(addr)))
       proc.stdin.flush()
       result = []
       # Read until an empty line is observed, which indicates the end of the