Add actual benchmarks to the project
diff --git a/.github/workflows/ubuntu24.yml b/.github/workflows/ubuntu24.yml
index 0a327b3..8da4c5f 100644
--- a/.github/workflows/ubuntu24.yml
+++ b/.github/workflows/ubuntu24.yml
@@ -11,7 +11,7 @@
         run: |
           mkdir build &&
           cd build &&
-          CXXFLAGS=-Werror cmake -DFASTFLOAT_TEST=ON  ..  &&
+          CXXFLAGS=-Werror cmake -DFASTFLOAT_TEST=ON -D FASTFLOAT_BENCHMARKS=ON ..  &&
           cmake --build .   &&
           ctest --output-on-failure
       - name: Use cmake CXX23
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e331b15..94fc0b7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -4,6 +4,7 @@
 set(FASTFLOAT_CXX_STANDARD 11 CACHE STRING "the C++ standard to use for fastfloat")
 set(CMAKE_CXX_STANDARD ${FASTFLOAT_CXX_STANDARD})
 option(FASTFLOAT_TEST "Enable tests" OFF)
+
 if(FASTFLOAT_TEST)
   enable_testing()
   add_subdirectory(tests)
@@ -29,6 +30,16 @@
 endif()
 
 add_library(fast_float INTERFACE)
+
+
+option(FASTFLOAT_BENCHMARKS "Enable benchmarks" OFF)
+if(FASTFLOAT_BENCHMARKS)
+  add_subdirectory(benchmarks)
+else()
+  message(STATUS "Benchmarks are disabled. Set FASTFLOAT_BENCHMARKS to ON to build benchmarks (assumes C++17).")
+endif()
+
+
 add_library(FastFloat::fast_float ALIAS fast_float)
 target_include_directories(
   fast_float
diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
new file mode 100644
index 0000000..b4e0395
--- /dev/null
+++ b/benchmarks/CMakeLists.txt
@@ -0,0 +1,26 @@
+add_executable(realbenchmark benchmark.cpp)
+set_property(
+    TARGET realbenchmark
+    PROPERTY CXX_STANDARD 17)
+
+target_link_libraries(realbenchmark PUBLIC fast_float)
+include(ExternalProject)
+
+# Define the external project
+ExternalProject_Add(simple_fastfloat_benchmark
+    GIT_REPOSITORY https://github.com/lemire/simple_fastfloat_benchmark.git
+    GIT_TAG        master  # or specify a particular commit/tag/branch
+    SOURCE_DIR     ${CMAKE_BINARY_DIR}/simple_fastfloat_benchmark
+    BINARY_DIR     ${CMAKE_BINARY_DIR}/simple_fastfloat_benchmark-build
+    CONFIGURE_COMMAND ""
+    BUILD_COMMAND ""
+    INSTALL_COMMAND ""
+)
+set(DATA_DIR ${CMAKE_BINARY_DIR}/simple_fastfloat_benchmark/data)
+
+add_custom_target(CopyData ALL
+    COMMAND ${CMAKE_COMMAND} -E copy_directory ${DATA_DIR} ${CMAKE_CURRENT_BINARY_DIR}/data
+    DEPENDS simple_fastfloat_benchmark
+)
+add_dependencies(realbenchmark CopyData)
+target_compile_definitions(realbenchmark PUBLIC BENCHMARK_DATA_DIR="${CMAKE_CURRENT_BINARY_DIR}/data")
diff --git a/benchmarks/apple_arm_events.h b/benchmarks/apple_arm_events.h
new file mode 100644
index 0000000..3a94081
--- /dev/null
+++ b/benchmarks/apple_arm_events.h
@@ -0,0 +1,1117 @@
+// Original design from:
+// =============================================================================
+// XNU kperf/kpc
+// Available for 64-bit Intel/Apple Silicon, macOS/iOS, with root privileges
+//
+// References:
+//
+// XNU source (since xnu 2422.1.72):
+// https://github.com/apple/darwin-xnu/blob/main/osfmk/kern/kpc.h
+// https://github.com/apple/darwin-xnu/blob/main/bsd/kern/kern_kpc.c
+//
+// Lightweight PET (Profile Every Thread, since xnu 3789.1.32):
+// https://github.com/apple/darwin-xnu/blob/main/osfmk/kperf/pet.c
+// https://github.com/apple/darwin-xnu/blob/main/osfmk/kperf/kperf_kpc.c
+//
+// System Private frameworks (since macOS 10.11, iOS 8.0):
+// /System/Library/PrivateFrameworks/kperf.framework
+// /System/Library/PrivateFrameworks/kperfdata.framework
+//
+// Xcode framework (since Xcode 7.0):
+// /Applications/Xcode.app/Contents/SharedFrameworks/DVTInstrumentsFoundation.framework
+//
+// CPU database (plist files)
+// macOS (since macOS 10.11):
+//     /usr/share/kpep/<name>.plist
+// iOS (copied from Xcode, since iOS 10.0, Xcode 8.0):
+//     /Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform
+//     /DeviceSupport/<version>/DeveloperDiskImage.dmg/usr/share/kpep/<name>.plist
+//
+//
+// Created by YaoYuan <[email protected]> on 2021.
+// Released into the public domain (unlicense.org).
+// =============================================================================
+
+#ifndef M1CYCLES_H
+#define M1CYCLES_H
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <dlfcn.h>          // for dlopen() and dlsym()
+#include <mach/mach_time.h> // for mach_absolute_time()
+#include <sys/kdebug.h>     // for kdebug trace decode
+#include <sys/sysctl.h>     // for sysctl()
+#include <unistd.h>         // for usleep()
+
+struct performance_counters {
+  double cycles;
+  double branches;
+  double missed_branches;
+  double instructions;
+  performance_counters(uint64_t c, uint64_t b, uint64_t m, uint64_t i)
+      : cycles(c), branches(b), missed_branches(m), instructions(i) {}
+  performance_counters(double c, double b, double m, double i)
+      : cycles(c), branches(b), missed_branches(m), instructions(i) {}
+  performance_counters(double init)
+      : cycles(init), branches(init), missed_branches(init),
+        instructions(init) {}
+
+  inline performance_counters &operator-=(const performance_counters &other) {
+    cycles -= other.cycles;
+    branches -= other.branches;
+    missed_branches -= other.missed_branches;
+    instructions -= other.instructions;
+    return *this;
+  }
+  inline performance_counters &min(const performance_counters &other) {
+    cycles = other.cycles < cycles ? other.cycles : cycles;
+    branches = other.branches < branches ? other.branches : branches;
+    missed_branches = other.missed_branches < missed_branches
+                          ? other.missed_branches
+                          : missed_branches;
+    instructions =
+        other.instructions < instructions ? other.instructions : instructions;
+    return *this;
+  }
+  inline performance_counters &operator+=(const performance_counters &other) {
+    cycles += other.cycles;
+    branches += other.branches;
+    missed_branches += other.missed_branches;
+    instructions += other.instructions;
+    return *this;
+  }
+
+  inline performance_counters &operator/=(double numerator) {
+    cycles /= numerator;
+    branches /= numerator;
+    missed_branches /= numerator;
+    instructions /= numerator;
+    return *this;
+  }
+};
+
+inline performance_counters operator-(const performance_counters &a,
+                                      const performance_counters &b) {
+  return performance_counters(a.cycles - b.cycles, a.branches - b.branches,
+                              a.missed_branches - b.missed_branches,
+                              a.instructions - b.instructions);
+}
+
+
+
+typedef float f32;
+typedef double f64;
+typedef int8_t i8;
+typedef uint8_t u8;
+typedef int16_t i16;
+typedef uint16_t u16;
+typedef int32_t i32;
+typedef uint32_t u32;
+typedef int64_t i64;
+typedef uint64_t u64;
+typedef size_t usize;
+
+// -----------------------------------------------------------------------------
+// <kperf.framework> header (reverse engineered)
+// This framework wraps some sysctl calls to communicate with the kpc in kernel.
+// Most functions require root privileges, or the process must be "blessed".
+// -----------------------------------------------------------------------------
+
+// Cross-platform class constants.
+#define KPC_CLASS_FIXED (0)
+#define KPC_CLASS_CONFIGURABLE (1)
+#define KPC_CLASS_POWER (2)
+#define KPC_CLASS_RAWPMU (3)
+
+// Cross-platform class mask constants.
+#define KPC_CLASS_FIXED_MASK (1u << KPC_CLASS_FIXED)               // 1
+#define KPC_CLASS_CONFIGURABLE_MASK (1u << KPC_CLASS_CONFIGURABLE) // 2
+#define KPC_CLASS_POWER_MASK (1u << KPC_CLASS_POWER)               // 4
+#define KPC_CLASS_RAWPMU_MASK (1u << KPC_CLASS_RAWPMU)             // 8
+
+// PMU version constants.
+#define KPC_PMU_ERROR (0)     // Error
+#define KPC_PMU_INTEL_V3 (1)  // Intel
+#define KPC_PMU_ARM_APPLE (2) // ARM64
+#define KPC_PMU_INTEL_V2 (3)  // Old Intel
+#define KPC_PMU_ARM_V2 (4)    // Old ARM
+
+// The maximum number of counters we could read from every class in one go.
+// ARMV7: FIXED: 1, CONFIGURABLE: 4
+// ARM32: FIXED: 2, CONFIGURABLE: 6
+// ARM64: FIXED: 2, CONFIGURABLE: CORE_NCTRS - FIXED (6 or 8)
+// x86: 32
+#define KPC_MAX_COUNTERS 32
+
+// Bits for defining what to do on an action.
+// Defined in https://github.com/apple/darwin-xnu/blob/main/osfmk/kperf/action.h
+#define KPERF_SAMPLER_TH_INFO (1U << 0)
+#define KPERF_SAMPLER_TH_SNAPSHOT (1U << 1)
+#define KPERF_SAMPLER_KSTACK (1U << 2)
+#define KPERF_SAMPLER_USTACK (1U << 3)
+#define KPERF_SAMPLER_PMC_THREAD (1U << 4)
+#define KPERF_SAMPLER_PMC_CPU (1U << 5)
+#define KPERF_SAMPLER_PMC_CONFIG (1U << 6)
+#define KPERF_SAMPLER_MEMINFO (1U << 7)
+#define KPERF_SAMPLER_TH_SCHEDULING (1U << 8)
+#define KPERF_SAMPLER_TH_DISPATCH (1U << 9)
+#define KPERF_SAMPLER_TK_SNAPSHOT (1U << 10)
+#define KPERF_SAMPLER_SYS_MEM (1U << 11)
+#define KPERF_SAMPLER_TH_INSCYC (1U << 12)
+#define KPERF_SAMPLER_TK_INFO (1U << 13)
+
+// Maximum number of kperf action ids.
+#define KPERF_ACTION_MAX (32)
+
+// Maximum number of kperf timer ids.
+#define KPERF_TIMER_MAX (8)
+
+// x86/arm config registers are 64-bit
+typedef u64 kpc_config_t;
+
+/// Print current CPU identification string to the buffer (same as snprintf),
+/// such as "cpu_7_8_10b282dc_46". This string can be used to locate the PMC
+/// database in /usr/share/kpep.
+/// @return string's length, or negative value if error occurs.
+/// @note This method does not require root privileges.
+/// @details sysctl get(hw.cputype), get(hw.cpusubtype),
+///                 get(hw.cpufamily), get(machdep.cpu.model)
+static int (*kpc_cpu_string)(char *buf, usize buf_size);
+
+/// Get the version of KPC that's being run.
+/// @return See `PMU version constants` above.
+/// @details sysctl get(kpc.pmu_version)
+static u32 (*kpc_pmu_version)(void);
+
+/// Get running PMC classes.
+/// @return See `class mask constants` above,
+///         0 if error occurs or no class is set.
+/// @details sysctl get(kpc.counting)
+static u32 (*kpc_get_counting)(void);
+
+/// Set PMC classes to enable counting.
+/// @param classes See `class mask constants` above, set 0 to shutdown counting.
+/// @return 0 for success.
+/// @details sysctl set(kpc.counting)
+static int (*kpc_set_counting)(u32 classes);
+
+/// Get running PMC classes for current thread.
+/// @return See `class mask constants` above,
+///         0 if error occurs or no class is set.
+/// @details sysctl get(kpc.thread_counting)
+static u32 (*kpc_get_thread_counting)(void);
+
+/// Set PMC classes to enable counting for current thread.
+/// @param classes See `class mask constants` above, set 0 to shutdown counting.
+/// @return 0 for success.
+/// @details sysctl set(kpc.thread_counting)
+static int (*kpc_set_thread_counting)(u32 classes);
+
+/// Get how many config registers there are for a given mask.
+/// For example: Intel may return 1 for `KPC_CLASS_FIXED_MASK`,
+///                        and 4 for `KPC_CLASS_CONFIGURABLE_MASK`.
+/// @param classes See `class mask constants` above.
+/// @return 0 if error occurs or no class is set.
+/// @note This method does not require root privileges.
+/// @details sysctl get(kpc.config_count)
+static u32 (*kpc_get_config_count)(u32 classes);
+
+/// Get config registers.
+/// @param classes see `class mask constants` above.
+/// @param config Config buffer to receive values, should be no smaller than
+///               kpc_get_config_count(classes) * sizeof(kpc_config_t).
+/// @return 0 for success.
+/// @details sysctl get(kpc.config_count), get(kpc.config)
+static int (*kpc_get_config)(u32 classes, kpc_config_t *config);
+
+/// Set config registers.
+/// @param classes see `class mask constants` above.
+/// @param config Config buffer, should be no smaller than
+///               kpc_get_config_count(classes) * sizeof(kpc_config_t).
+/// @return 0 for success.
+/// @details sysctl get(kpc.config_count), set(kpc.config)
+static int (*kpc_set_config)(u32 classes, kpc_config_t *config);
+
+/// Get how many counters there are for a given mask.
+/// For example: Intel may return 3 for `KPC_CLASS_FIXED_MASK`,
+///                        and 4 for `KPC_CLASS_CONFIGURABLE_MASK`.
+/// @param classes See `class mask constants` above.
+/// @note This method does not require root privileges.
+/// @details sysctl get(kpc.counter_count)
+static u32 (*kpc_get_counter_count)(u32 classes);
+
+/// Get counter accumulations.
+/// If `all_cpus` is true, the buffer count should be no smaller than
+/// (cpu_count * counter_count). Otherwise, the buffer count should be no
+/// smaller than (counter_count).
+/// @see kpc_get_counter_count(), kpc_cpu_count().
+/// @param all_cpus true for all CPUs, false for current cpu.
+/// @param classes See `class mask constants` above.
+/// @param curcpu A pointer to receive current cpu id, can be NULL.
+/// @param buf Buffer to receive counter's value.
+/// @return 0 for success.
+/// @details sysctl get(hw.ncpu), get(kpc.counter_count), get(kpc.counters)
+static int (*kpc_get_cpu_counters)(bool all_cpus, u32 classes, int *curcpu,
+                                   u64 *buf);
+
+/// Get counter accumulations for current thread.
+/// @param tid Thread id, should be 0.
+/// @param buf_count The number of buf's elements (not bytes),
+///                  should be no smaller than kpc_get_counter_count().
+/// @param buf Buffer to receive counter's value.
+/// @return 0 for success.
+/// @details sysctl get(kpc.thread_counters)
+static int (*kpc_get_thread_counters)(u32 tid, u32 buf_count, u64 *buf);
+
+/// Acquire/release the counters used by the Power Manager.
+/// @param val 1:acquire, 0:release
+/// @return 0 for success.
+/// @details sysctl set(kpc.force_all_ctrs)
+static int (*kpc_force_all_ctrs_set)(int val);
+
+/// Get the state of all_ctrs.
+/// @return 0 for success.
+/// @details sysctl get(kpc.force_all_ctrs)
+static int (*kpc_force_all_ctrs_get)(int *val_out);
+
+/// Set number of actions, should be `KPERF_ACTION_MAX`.
+/// @details sysctl set(kperf.action.count)
+static int (*kperf_action_count_set)(u32 count);
+
+/// Get number of actions.
+/// @details sysctl get(kperf.action.count)
+static int (*kperf_action_count_get)(u32 *count);
+
+/// Set what to sample when a trigger fires an action, e.g.
+/// `KPERF_SAMPLER_PMC_CPU`.
+/// @details sysctl set(kperf.action.samplers)
+static int (*kperf_action_samplers_set)(u32 actionid, u32 sample);
+
+/// Get what to sample when a trigger fires an action.
+/// @details sysctl get(kperf.action.samplers)
+static int (*kperf_action_samplers_get)(u32 actionid, u32 *sample);
+
+/// Apply a task filter to the action, -1 to disable filter.
+/// @details sysctl set(kperf.action.filter_by_task)
+static int (*kperf_action_filter_set_by_task)(u32 actionid, i32 port);
+
+/// Apply a pid filter to the action, -1 to disable filter.
+/// @details sysctl set(kperf.action.filter_by_pid)
+static int (*kperf_action_filter_set_by_pid)(u32 actionid, i32 pid);
+
+/// Set number of time triggers, should be `KPERF_TIMER_MAX`.
+/// @details sysctl set(kperf.timer.count)
+static int (*kperf_timer_count_set)(u32 count);
+
+/// Get number of time triggers.
+/// @details sysctl get(kperf.timer.count)
+static int (*kperf_timer_count_get)(u32 *count);
+
+/// Set timer number and period.
+/// @details sysctl set(kperf.timer.period)
+static int (*kperf_timer_period_set)(u32 actionid, u64 tick);
+
+/// Get timer number and period.
+/// @details sysctl get(kperf.timer.period)
+static int (*kperf_timer_period_get)(u32 actionid, u64 *tick);
+
+/// Set timer number and actionid.
+/// @details sysctl set(kperf.timer.action)
+static int (*kperf_timer_action_set)(u32 actionid, u32 timerid);
+
+/// Get timer number and actionid.
+/// @details sysctl get(kperf.timer.action)
+static int (*kperf_timer_action_get)(u32 actionid, u32 *timerid);
+
+/// Set which timer ID does PET (Profile Every Thread).
+/// @details sysctl set(kperf.timer.pet_timer)
+static int (*kperf_timer_pet_set)(u32 timerid);
+
+/// Get which timer ID does PET (Profile Every Thread).
+/// @details sysctl get(kperf.timer.pet_timer)
+static int (*kperf_timer_pet_get)(u32 *timerid);
+
+/// Enable or disable sampling.
+/// @details sysctl set(kperf.sampling)
+static int (*kperf_sample_set)(u32 enabled);
+
+/// Get whether sampling is currently enabled.
+/// @details sysctl get(kperf.sampling)
+static int (*kperf_sample_get)(u32 *enabled);
+
+/// Reset kperf: stop sampling, kdebug, timers and actions.
+/// @return 0 for success.
+static int (*kperf_reset)(void);
+
+/// Nanoseconds to CPU ticks.
+static u64 (*kperf_ns_to_ticks)(u64 ns);
+
+/// CPU ticks to nanoseconds.
+static u64 (*kperf_ticks_to_ns)(u64 ticks);
+
+/// CPU ticks frequency (mach_absolute_time).
+static u64 (*kperf_tick_frequency)(void);
+
+/// Get lightweight PET mode (not in kperf.framework).
+static int kperf_lightweight_pet_get(u32 *enabled) {
+  if (!enabled)
+    return -1;
+  usize size = 4;
+  return sysctlbyname("kperf.lightweight_pet", enabled, &size, NULL, 0);
+}
+
+/// Set lightweight PET mode (not in kperf.framework).
+static int kperf_lightweight_pet_set(u32 enabled) {
+  return sysctlbyname("kperf.lightweight_pet", NULL, NULL, &enabled, 4);
+}
+
+// -----------------------------------------------------------------------------
+// <kperfdata.framework> header (reverse engineered)
+// This framework provides some functions to access the local CPU database.
+// These functions do not require root privileges.
+// -----------------------------------------------------------------------------
+
+// KPEP CPU architecture constants.
+#define KPEP_ARCH_I386 0
+#define KPEP_ARCH_X86_64 1
+#define KPEP_ARCH_ARM 2
+#define KPEP_ARCH_ARM64 3
+
+/// KPEP event (size: 48/28 bytes on 64/32 bit OS)
+typedef struct kpep_event {
+  const char *name; ///< Unique name of a event, such as "INST_RETIRED.ANY".
+  const char *description; ///< Description for this event.
+  const char *errata;      ///< Errata, currently NULL.
+  const char *alias;       ///< Alias name, such as "Instructions", "Cycles".
+  const char *fallback;    ///< Fallback event name for fixed counter.
+  u32 mask;
+  u8 number;
+  u8 umask;
+  u8 reserved;
+  u8 is_fixed;
+} kpep_event;
+
+/// KPEP database (size: 144/80 bytes on 64/32 bit OS)
+typedef struct kpep_db {
+  const char *name;           ///< Database name, such as "haswell".
+  const char *cpu_id;         ///< Plist name, such as "cpu_7_8_10b282dc".
+  const char *marketing_name; ///< Marketing name, such as "Intel Haswell".
+  void *plist_data;           ///< Plist data (CFDataRef), currently NULL.
+  void *event_map; ///< All events (CFDict<CFSTR(event_name), kpep_event *>).
+  kpep_event
+      *event_arr; ///< Event struct buffer (sizeof(kpep_event) * events_count).
+  kpep_event **fixed_event_arr; ///< Fixed counter events (sizeof(kpep_event *)
+                                ///< * fixed_counter_count)
+  void *alias_map; ///< All aliases (CFDict<CFSTR(event_name), kpep_event *>).
+  usize reserved_1;
+  usize reserved_2;
+  usize reserved_3;
+  usize event_count; ///< All events count.
+  usize alias_count;
+  usize fixed_counter_count;
+  usize config_counter_count;
+  usize power_counter_count;
+  u32 archtecture; ///< see `KPEP CPU architecture constants` above.
+  u32 fixed_counter_bits;
+  u32 config_counter_bits;
+  u32 power_counter_bits;
+} kpep_db;
+
+/// KPEP config (size: 80/44 bytes on 64/32 bit OS)
+typedef struct kpep_config {
+  kpep_db *db;
+  kpep_event **ev_arr; ///< (sizeof(kpep_event *) * counter_count), init NULL
+  usize *ev_map;       ///< (sizeof(usize *) * counter_count), init 0
+  usize *ev_idx;       ///< (sizeof(usize *) * counter_count), init -1
+  u32 *flags;          ///< (sizeof(u32 *) * counter_count), init 0
+  u64 *kpc_periods;    ///< (sizeof(u64 *) * counter_count), init 0
+  usize event_count;   /// kpep_config_events_count()
+  usize counter_count;
+  u32 classes; ///< See `class mask constants` above.
+  u32 config_counter;
+  u32 power_counter;
+  u32 reserved;
+} kpep_config;
+
+/// Error code for kpep_config_xxx() and kpep_db_xxx() functions.
+typedef enum {
+  KPEP_CONFIG_ERROR_NONE = 0,
+  KPEP_CONFIG_ERROR_INVALID_ARGUMENT = 1,
+  KPEP_CONFIG_ERROR_OUT_OF_MEMORY = 2,
+  KPEP_CONFIG_ERROR_IO = 3,
+  KPEP_CONFIG_ERROR_BUFFER_TOO_SMALL = 4,
+  KPEP_CONFIG_ERROR_CUR_SYSTEM_UNKNOWN = 5,
+  KPEP_CONFIG_ERROR_DB_PATH_INVALID = 6,
+  KPEP_CONFIG_ERROR_DB_NOT_FOUND = 7,
+  KPEP_CONFIG_ERROR_DB_ARCH_UNSUPPORTED = 8,
+  KPEP_CONFIG_ERROR_DB_VERSION_UNSUPPORTED = 9,
+  KPEP_CONFIG_ERROR_DB_CORRUPT = 10,
+  KPEP_CONFIG_ERROR_EVENT_NOT_FOUND = 11,
+  KPEP_CONFIG_ERROR_CONFLICTING_EVENTS = 12,
+  KPEP_CONFIG_ERROR_COUNTERS_NOT_FORCED = 13,
+  KPEP_CONFIG_ERROR_EVENT_UNAVAILABLE = 14,
+  KPEP_CONFIG_ERROR_ERRNO = 15,
+  KPEP_CONFIG_ERROR_MAX
+} kpep_config_error_code;
+
+/// Error description for kpep_config_error_code.
+static const char *kpep_config_error_names[KPEP_CONFIG_ERROR_MAX] = {
+    "none",
+    "invalid argument",
+    "out of memory",
+    "I/O",
+    "buffer too small",
+    "current system unknown",
+    "database path invalid",
+    "database not found",
+    "database architecture unsupported",
+    "database version unsupported",
+    "database corrupt",
+    "event not found",
+    "conflicting events",
+    "all counters must be forced",
+    "event unavailable",
+    "check errno"};
+
+/// Error description.
+static const char *kpep_config_error_desc(int code) {
+  if (0 <= code && code < KPEP_CONFIG_ERROR_MAX) {
+    return kpep_config_error_names[code];
+  }
+  return "unknown error";
+}
+
+/// Create a config.
+/// @param db A kpep db, see kpep_db_create()
+/// @param cfg_ptr A pointer to receive the new config.
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_config_create)(kpep_db *db, kpep_config **cfg_ptr);
+
+/// Free the config.
+static void (*kpep_config_free)(kpep_config *cfg);
+
+/// Add an event to config.
+/// @param cfg The config.
+/// @param ev_ptr A event pointer.
+/// @param flag 0: all, 1: user space only
+/// @param err Error bitmap pointer, can be NULL.
+///            If return value is `CONFLICTING_EVENTS`, this bitmap contains
+///            the conflicted event indices, e.g. "1 << 2" means index 2.
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_config_add_event)(kpep_config *cfg, kpep_event **ev_ptr,
+                                    u32 flag, u32 *err);
+
+/// Remove event at index.
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_config_remove_event)(kpep_config *cfg, usize idx);
+
+/// Force all counters.
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_config_force_counters)(kpep_config *cfg);
+
+/// Get events count.
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_config_events_count)(kpep_config *cfg, usize *count_ptr);
+
+/// Get all event pointers.
+/// @param buf A buffer to receive event pointers.
+/// @param buf_size The buffer's size in bytes, should be no smaller than
+///                 kpep_config_events_count() * sizeof(void *).
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_config_events)(kpep_config *cfg, kpep_event **buf,
+                                 usize buf_size);
+
+/// Get kpc register configs.
+/// @param buf A buffer to receive kpc register configs.
+/// @param buf_size The buffer's size in bytes, should be no smaller than
+///                 kpep_config_kpc_count() * sizeof(kpc_config_t).
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_config_kpc)(kpep_config *cfg, kpc_config_t *buf,
+                              usize buf_size);
+
+/// Get kpc register config count.
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_config_kpc_count)(kpep_config *cfg, usize *count_ptr);
+
+/// Get kpc classes.
+/// @param classes See `class mask constants` above.
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_config_kpc_classes)(kpep_config *cfg, u32 *classes_ptr);
+
+/// Get the index mapping from event to counter.
+/// @param buf A buffer to receive indexes.
+/// @param buf_size The buffer's size in bytes, should be no smaller than
+///                 kpep_config_events_count() * sizeof(kpc_config_t).
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_config_kpc_map)(kpep_config *cfg, usize *buf, usize buf_size);
+
+/// Open a kpep database file in "/usr/share/kpep/" or "/usr/local/share/kpep/".
+/// @param name File name, for example "haswell", "cpu_100000c_1_92fb37c8".
+///             Pass NULL for current CPU.
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_db_create)(const char *name, kpep_db **db_ptr);
+
+/// Free the kpep database.
+static void (*kpep_db_free)(kpep_db *db);
+
+/// Get the database's name.
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_db_name)(kpep_db *db, const char **name);
+
+/// Get the event alias count.
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_db_aliases_count)(kpep_db *db, usize *count);
+
+/// Get all alias.
+/// @param buf A buffer to receive all alias strings.
+/// @param buf_size The buffer's size in bytes,
+///        should be no smaller than kpep_db_aliases_count() * sizeof(void *).
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_db_aliases)(kpep_db *db, const char **buf, usize buf_size);
+
+/// Get counters count for given classes.
+/// @param classes 1: Fixed, 2: Configurable.
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_db_counters_count)(kpep_db *db, u8 classes, usize *count);
+
+/// Get all event count.
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_db_events_count)(kpep_db *db, usize *count);
+
+/// Get all events.
+/// @param buf A buffer to receive all event pointers.
+/// @param buf_size The buffer's size in bytes,
+///        should be no smaller than kpep_db_events_count() * sizeof(void *).
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_db_events)(kpep_db *db, kpep_event **buf, usize buf_size);
+
+/// Get one event by name.
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_db_event)(kpep_db *db, const char *name, kpep_event **ev_ptr);
+
+/// Get event's name.
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_event_name)(kpep_event *ev, const char **name_ptr);
+
+/// Get event's alias.
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_event_alias)(kpep_event *ev, const char **alias_ptr);
+
+/// Get event's description.
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_event_description)(kpep_event *ev, const char **str_ptr);
+
+// -----------------------------------------------------------------------------
+// load kperf/kperfdata dynamic library
+// -----------------------------------------------------------------------------
+
+typedef struct {
+  const char *name;
+  void **impl;
+} lib_symbol;
+
+#define lib_nelems(x) (sizeof(x) / sizeof((x)[0]))
+#define lib_symbol_def(name)                                                   \
+  {                                                                            \
+#name, (void **)&name                                                      \
+  }
+
+static const lib_symbol lib_symbols_kperf[] = {
+    lib_symbol_def(kpc_pmu_version),
+    lib_symbol_def(kpc_cpu_string),
+    lib_symbol_def(kpc_set_counting),
+    lib_symbol_def(kpc_get_counting),
+    lib_symbol_def(kpc_set_thread_counting),
+    lib_symbol_def(kpc_get_thread_counting),
+    lib_symbol_def(kpc_get_config_count),
+    lib_symbol_def(kpc_get_counter_count),
+    lib_symbol_def(kpc_set_config),
+    lib_symbol_def(kpc_get_config),
+    lib_symbol_def(kpc_get_cpu_counters),
+    lib_symbol_def(kpc_get_thread_counters),
+    lib_symbol_def(kpc_force_all_ctrs_set),
+    lib_symbol_def(kpc_force_all_ctrs_get),
+    lib_symbol_def(kperf_action_count_set),
+    lib_symbol_def(kperf_action_count_get),
+    lib_symbol_def(kperf_action_samplers_set),
+    lib_symbol_def(kperf_action_samplers_get),
+    lib_symbol_def(kperf_action_filter_set_by_task),
+    lib_symbol_def(kperf_action_filter_set_by_pid),
+    lib_symbol_def(kperf_timer_count_set),
+    lib_symbol_def(kperf_timer_count_get),
+    lib_symbol_def(kperf_timer_period_set),
+    lib_symbol_def(kperf_timer_period_get),
+    lib_symbol_def(kperf_timer_action_set),
+    lib_symbol_def(kperf_timer_action_get),
+    lib_symbol_def(kperf_sample_set),
+    lib_symbol_def(kperf_sample_get),
+    lib_symbol_def(kperf_reset),
+    lib_symbol_def(kperf_timer_pet_set),
+    lib_symbol_def(kperf_timer_pet_get),
+    lib_symbol_def(kperf_ns_to_ticks),
+    lib_symbol_def(kperf_ticks_to_ns),
+    lib_symbol_def(kperf_tick_frequency),
+};
+
+static const lib_symbol lib_symbols_kperfdata[] = {
+    lib_symbol_def(kpep_config_create),
+    lib_symbol_def(kpep_config_free),
+    lib_symbol_def(kpep_config_add_event),
+    lib_symbol_def(kpep_config_remove_event),
+    lib_symbol_def(kpep_config_force_counters),
+    lib_symbol_def(kpep_config_events_count),
+    lib_symbol_def(kpep_config_events),
+    lib_symbol_def(kpep_config_kpc),
+    lib_symbol_def(kpep_config_kpc_count),
+    lib_symbol_def(kpep_config_kpc_classes),
+    lib_symbol_def(kpep_config_kpc_map),
+    lib_symbol_def(kpep_db_create),
+    lib_symbol_def(kpep_db_free),
+    lib_symbol_def(kpep_db_name),
+    lib_symbol_def(kpep_db_aliases_count),
+    lib_symbol_def(kpep_db_aliases),
+    lib_symbol_def(kpep_db_counters_count),
+    lib_symbol_def(kpep_db_events_count),
+    lib_symbol_def(kpep_db_events),
+    lib_symbol_def(kpep_db_event),
+    lib_symbol_def(kpep_event_name),
+    lib_symbol_def(kpep_event_alias),
+    lib_symbol_def(kpep_event_description),
+};
+
+#define lib_path_kperf "/System/Library/PrivateFrameworks/kperf.framework/kperf"
+#define lib_path_kperfdata                                                     \
+  "/System/Library/PrivateFrameworks/kperfdata.framework/kperfdata"
+
+static bool lib_inited = false;
+static bool lib_has_err = false;
+static char lib_err_msg[256];
+
+static void *lib_handle_kperf = NULL;
+static void *lib_handle_kperfdata = NULL;
+
+static void lib_deinit(void) {
+  lib_inited = false;
+  lib_has_err = false;
+  if (lib_handle_kperf)
+    dlclose(lib_handle_kperf);
+  if (lib_handle_kperfdata)
+    dlclose(lib_handle_kperfdata);
+  lib_handle_kperf = NULL;
+  lib_handle_kperfdata = NULL;
+  for (usize i = 0; i < lib_nelems(lib_symbols_kperf); i++) {
+    const lib_symbol *symbol = &lib_symbols_kperf[i];
+    *symbol->impl = NULL;
+  }
+  for (usize i = 0; i < lib_nelems(lib_symbols_kperfdata); i++) {
+    const lib_symbol *symbol = &lib_symbols_kperfdata[i];
+    *symbol->impl = NULL;
+  }
+}
+
+// Resolve every kperf/kperfdata symbol via dlopen/dlsym.
+// Idempotent: after the first attempt the cached outcome is returned.
+// @return true on success; on failure lib_err_msg holds a description.
+static bool lib_init(void) {
+// On any failure: unload everything, but keep lib_inited set so the error
+// is "sticky" and later calls return false cheaply instead of re-probing.
+#define return_err()                                                           \
+  do {                                                                         \
+    lib_deinit();                                                              \
+    lib_inited = true;                                                         \
+    lib_has_err = true;                                                        \
+    return false;                                                              \
+  } while (false)
+
+  if (lib_inited)
+    return !lib_has_err;
+
+  // load dynamic library
+  lib_handle_kperf = dlopen(lib_path_kperf, RTLD_LAZY);
+  if (!lib_handle_kperf) {
+    snprintf(lib_err_msg, sizeof(lib_err_msg),
+             "Failed to load kperf.framework, message: %s.", dlerror());
+    return_err();
+  }
+  lib_handle_kperfdata = dlopen(lib_path_kperfdata, RTLD_LAZY);
+  if (!lib_handle_kperfdata) {
+    snprintf(lib_err_msg, sizeof(lib_err_msg),
+             "Failed to load kperfdata.framework, message: %s.", dlerror());
+    return_err();
+  }
+
+  // load symbol address from dynamic library
+  for (usize i = 0; i < lib_nelems(lib_symbols_kperf); i++) {
+    const lib_symbol *symbol = &lib_symbols_kperf[i];
+    *symbol->impl = dlsym(lib_handle_kperf, symbol->name);
+    if (!*symbol->impl) {
+      snprintf(lib_err_msg, sizeof(lib_err_msg),
+               "Failed to load kperf function: %s.", symbol->name);
+      return_err();
+    }
+  }
+  for (usize i = 0; i < lib_nelems(lib_symbols_kperfdata); i++) {
+    const lib_symbol *symbol = &lib_symbols_kperfdata[i];
+    *symbol->impl = dlsym(lib_handle_kperfdata, symbol->name);
+    if (!*symbol->impl) {
+      snprintf(lib_err_msg, sizeof(lib_err_msg),
+               "Failed to load kperfdata function: %s.", symbol->name);
+      return_err();
+    }
+  }
+
+  lib_inited = true;
+  lib_has_err = false;
+  return true;
+
+#undef return_err
+}
+
+// -----------------------------------------------------------------------------
+// kdebug private structs
+// https://github.com/apple/darwin-xnu/blob/main/bsd/sys_private/kdebug_private.h
+// -----------------------------------------------------------------------------
+
+/*
+ * Ensure that both LP32 and LP64 variants of arm64 use the same kd_buf
+ * structure.
+ */
+#if defined(__arm64__)
+typedef uint64_t kd_buf_argtype;
+#else
+typedef uintptr_t kd_buf_argtype;
+#endif
+
+typedef struct {
+  uint64_t timestamp;
+  kd_buf_argtype arg1;
+  kd_buf_argtype arg2;
+  kd_buf_argtype arg3;
+  kd_buf_argtype arg4;
+  kd_buf_argtype arg5; /* the thread ID */
+  uint32_t debugid;    /* see <sys/kdebug.h> */
+
+/*
+ * Ensure that both LP32 and LP64 variants of arm64 use the same kd_buf
+ * structure.
+ */
+#if defined(__LP64__) || defined(__arm64__)
+  uint32_t cpuid; /* cpu index, from 0 */
+  kd_buf_argtype unused;
+#endif
+} kd_buf;
+
+/* bits for the type field of kd_regtype */
+#define KDBG_CLASSTYPE 0x10000
+#define KDBG_SUBCLSTYPE 0x20000
+#define KDBG_RANGETYPE 0x40000
+#define KDBG_TYPENONE 0x80000
+#define KDBG_CKTYPES 0xF0000
+
+/* only trace at most 4 types of events, at the code granularity */
+#define KDBG_VALCHECK 0x00200000U
+
+typedef struct {
+  unsigned int type;
+  unsigned int value1;
+  unsigned int value2;
+  unsigned int value3;
+  unsigned int value4;
+} kd_regtype;
+
+typedef struct {
+  /* number of events that can fit in the buffers */
+  int nkdbufs;
+  /* set if trace is disabled */
+  int nolog;
+  /* kd_ctrl_page.flags */
+  unsigned int flags;
+  /* number of threads in thread map */
+  int nkdthreads;
+  /* the owning pid */
+  int bufid;
+} kbufinfo_t;
+
+// -----------------------------------------------------------------------------
+// kdebug utils
+// -----------------------------------------------------------------------------
+
+/// Clean up trace buffers and reset ktrace/kdebug/kperf.
+/// NOTE(review): kdebug sysctls generally require root privileges — confirm.
+/// @return 0 on success.
+static int kdebug_reset(void) {
+  int mib[3] = {CTL_KERN, KERN_KDEBUG, KERN_KDREMOVE};
+  return sysctl(mib, 3, NULL, NULL, NULL, 0);
+}
+
+/// Disable and reinitialize the trace buffers.
+/// Call kdebug_trace_setbuf() first to size the buffers being allocated.
+/// @return 0 on success.
+static int kdebug_reinit(void) {
+  int mib[3] = {CTL_KERN, KERN_KDEBUG, KERN_KDSETUP};
+  return sysctl(mib, 3, NULL, NULL, NULL, 0);
+}
+
+/// Set debug filter.
+/// @param kdr Filter descriptor; `kdr->type` selects the KDBG_* mode.
+/// @return 0 on success.
+static int kdebug_setreg(kd_regtype *kdr) {
+  int mib[3] = {CTL_KERN, KERN_KDEBUG, KERN_KDSETREG};
+  // The descriptor is passed through the oldp/oldlenp slots of sysctl.
+  usize size = sizeof(kd_regtype);
+  return sysctl(mib, 3, kdr, &size, NULL, 0);
+}
+
+/// Set maximum number of trace entries (kd_buf).
+/// Only allow allocation up to half the available memory (sane_size).
+/// The count is passed in-band as the fourth mib element.
+/// @return 0 on success.
+static int kdebug_trace_setbuf(int nbufs) {
+  int mib[4] = {CTL_KERN, KERN_KDEBUG, KERN_KDSETBUF, nbufs};
+  return sysctl(mib, 4, NULL, NULL, NULL, 0);
+}
+
+/// Enable or disable kdebug trace.
+/// Trace buffer must already be initialized (see kdebug_reinit()).
+/// The flag is passed in-band as the fourth mib element.
+/// @return 0 on success.
+static int kdebug_trace_enable(bool enable) {
+  int mib[4] = {CTL_KERN, KERN_KDEBUG, KERN_KDENABLE, enable};
+  return sysctl(mib, 4, NULL, 0, NULL, 0);
+}
+
+/// Retrieve trace buffer information from kernel.
+/// @param info Receives the buffer description; must not be NULL.
+/// @return 0 on success, -1 if `info` is NULL.
+static int kdebug_get_bufinfo(kbufinfo_t *info) {
+  if (!info)
+    return -1;
+  int mib[3] = {CTL_KERN, KERN_KDEBUG, KERN_KDGETBUF};
+  size_t needed = sizeof(kbufinfo_t);
+  return sysctl(mib, 3, info, &needed, NULL, 0);
+}
+
+/// Retrieve trace buffers from kernel.
+/// @param buf Memory to receive buffer data, array of `kd_buf`.
+/// @param len Length of `buf` in bytes.
+/// @param count Number of trace entries (kd_buf) obtained; may be NULL if
+///              the caller does not need the count.
+/// @return 0 on success.
+static int kdebug_trace_read(void *buf, usize len, usize *count) {
+  if (count)
+    *count = 0;
+  if (!buf || !len)
+    return -1;
+
+  // Note: the input and output units are not the same.
+  // input: bytes
+  // output: number of kd_buf
+  int mib[3] = {CTL_KERN, KERN_KDEBUG, KERN_KDREADTR};
+  int ret = sysctl(mib, 3, buf, &len, NULL, 0);
+  if (ret != 0)
+    return ret;
+  // bug fix: `count` is optional (guarded above) and must not be
+  // dereferenced unconditionally here.
+  if (count)
+    *count = len;
+  return 0;
+}
+
+/// Block until there are new buffers filled or `timeout_ms` have passed.
+/// @param timeout_ms timeout milliseconds, 0 means wait forever.
+/// @param suc set true if new buffers filled.
+/// @return 0 on success.
+static int kdebug_wait(usize timeout_ms, bool *suc) {
+  if (timeout_ms == 0)
+    return -1;
+  int mib[3] = {CTL_KERN, KERN_KDEBUG, KERN_KDBUFWAIT};
+  // Private-API quirk: the timeout goes in (and the "buffers filled" flag
+  // comes back out) through the oldlenp slot — TODO(review) confirm against
+  // the darwin-xnu sources.
+  usize val = timeout_ms;
+  int ret = sysctl(mib, 3, NULL, &val, NULL, 0);
+  if (suc)
+    *suc = !!val;
+  return ret;
+}
+
+// -----------------------------------------------------------------------------
+// Demo
+// -----------------------------------------------------------------------------
+
+#define EVENT_NAME_MAX 8
+typedef struct {
+  const char *alias;                 /// name for print
+  const char *names[EVENT_NAME_MAX]; /// candidate names from the pmc db, in
+                                     /// preference order, NULL-terminated
+} event_alias;
+
+/// Event names from /usr/share/kpep/<name>.plist
+/// Order matters: get_counters() indexes counter_map by position here
+/// (0=cycles, 1=instructions, 2=branches, 3=branch-misses).
+static const event_alias profile_events[] = {
+    {"cycles",
+     {
+         "FIXED_CYCLES",            // Apple A7-A15 (CORE_ACTIVE_CYCLE)
+         "CPU_CLK_UNHALTED.THREAD", // Intel Core 1st-10th gen
+         "CPU_CLK_UNHALTED.CORE",   // Intel Yonah, Merom
+     }},
+    {"instructions",
+     {
+         "FIXED_INSTRUCTIONS", // Apple A7-A15
+         "INST_RETIRED.ANY"    // Intel Yonah, Merom, Core 1st-10th gen
+     }},
+    {"branches",
+     {
+         "INST_BRANCH",                  // Apple A7-A15
+         "BR_INST_RETIRED.ALL_BRANCHES", // Intel Core 1st-10th gen
+         "INST_RETIRED.ANY",             // Intel Yonah, Merom — fallback;
+                                         // counts all retired instructions,
+                                         // not just branches (TODO confirm)
+     }},
+    {"branch-misses",
+     {
+         "BRANCH_MISPRED_NONSPEC",       // Apple A7-A15, since iOS 15, macOS 12
+         "BRANCH_MISPREDICT",            // Apple A7-A14
+         "BR_MISP_RETIRED.ALL_BRANCHES", // Intel Core 2nd-10th gen
+         "BR_INST_RETIRED.MISPRED",      // Intel Yonah, Merom
+     }},
+};
+
+/// Find the first candidate name in `alias->names` that exists in the pmc
+/// database. The candidate list is scanned in order and stops at the first
+/// NULL entry (or at the EVENT_NAME_MAX bound).
+/// @return the matching event, or NULL if no candidate is in the database.
+static kpep_event *get_event(kpep_db *db, const event_alias *alias) {
+  for (usize j = 0; j < EVENT_NAME_MAX; j++) {
+    const char *name = alias->names[j];
+    if (!name)
+      break;
+    kpep_event *ev = NULL;
+    if (kpep_db_event(db, name, &ev) == 0) {
+      return ev;
+    }
+  }
+  return NULL;
+}
+
+// Global state shared between setup_performance_counters() and
+// get_counters(). NOTE(review): assumed to be used from a single thread;
+// no synchronization is provided — confirm against callers.
+kpc_config_t regs[KPC_MAX_COUNTERS] = {0}; // kpc register configuration
+usize counter_map[KPC_MAX_COUNTERS] = {0}; // event index -> counter slot
+u64 counters_0[KPC_MAX_COUNTERS] = {0};    // counter snapshot (scratch)
+u64 counters_1[KPC_MAX_COUNTERS] = {0};    // counter snapshot (scratch)
+const usize ev_count = sizeof(profile_events) / sizeof(profile_events[0]);
+
+
+// One-shot initialization of the kpc thread-counter machinery:
+// load the private frameworks, build a kpep config for the events in
+// profile_events, push it to the kernel, and start thread counting.
+// The outcome is cached; subsequent calls return the first result.
+// NOTE(review): `db` and `cfg` are never released, including on the early
+// error returns — confirm the kpep ownership rules (kpep_db_free /
+// kpep_config_free) and whether this one-shot leak is acceptable.
+// @return true if counters are armed and get_counters() may be used.
+bool setup_performance_counters() {
+  static bool init = false;
+  static bool worked = false;
+
+  if (init) {
+    return worked;
+  }
+  init = true;
+
+  // load dylib
+  if (!lib_init()) {
+    printf("Error: %s\n", lib_err_msg);
+    return (worked = false);
+  }
+
+  // check permission
+  int force_ctrs = 0;
+  if (kpc_force_all_ctrs_get(&force_ctrs)) {
+    //printf("Permission denied, xnu/kpc requires root privileges.\n");
+    return (worked = false);
+  }
+  int ret;
+  // load pmc db
+  kpep_db *db = NULL;
+  if ((ret = kpep_db_create(NULL, &db))) {
+    printf("Error: cannot load pmc database: %d.\n", ret);
+    return (worked = false);
+  }
+  printf("loaded db: %s (%s)\n", db->name, db->marketing_name);
+
+  // create a config
+  kpep_config *cfg = NULL;
+  if ((ret = kpep_config_create(db, &cfg))) {
+    printf("Failed to create kpep config: %d (%s).\n", ret,
+           kpep_config_error_desc(ret));
+    return (worked = false);
+  }
+  if ((ret = kpep_config_force_counters(cfg))) {
+    printf("Failed to force counters: %d (%s).\n", ret,
+           kpep_config_error_desc(ret));
+    return (worked = false);
+  }
+
+  // get events
+  // NOTE(review): ev_count is a (const) runtime value, so this is a VLA —
+  // a compiler extension in C++, accepted by clang/gcc but not standard.
+  kpep_event *ev_arr[ev_count] = {0};
+  for (usize i = 0; i < ev_count; i++) {
+    const event_alias *alias = profile_events + i;
+    ev_arr[i] = get_event(db, alias);
+    if (!ev_arr[i]) {
+      printf("Cannot find event: %s.\n", alias->alias);
+      return (worked = false);
+    }
+  }
+
+  // add event to config
+  for (usize i = 0; i < ev_count; i++) {
+    kpep_event *ev = ev_arr[i];
+    if ((ret = kpep_config_add_event(cfg, &ev, 0, NULL))) {
+      printf("Failed to add event: %d (%s).\n", ret,
+             kpep_config_error_desc(ret));
+      return (worked = false);
+    }
+  }
+
+  // prepare buffer and config
+  u32 classes = 0;
+  usize reg_count = 0;
+  if ((ret = kpep_config_kpc_classes(cfg, &classes))) {
+    printf("Failed get kpc classes: %d (%s).\n", ret,
+           kpep_config_error_desc(ret));
+    return (worked = false);
+  }
+  if ((ret = kpep_config_kpc_count(cfg, &reg_count))) {
+    printf("Failed get kpc count: %d (%s).\n", ret,
+           kpep_config_error_desc(ret));
+    return (worked = false);
+  }
+  if ((ret = kpep_config_kpc_map(cfg, counter_map, sizeof(counter_map)))) {
+    printf("Failed get kpc map: %d (%s).\n", ret, kpep_config_error_desc(ret));
+    return (worked = false);
+  }
+  if ((ret = kpep_config_kpc(cfg, regs, sizeof(regs)))) {
+    printf("Failed get kpc registers: %d (%s).\n", ret,
+           kpep_config_error_desc(ret));
+    return (worked = false);
+  }
+
+  // set config to kernel
+  if ((ret = kpc_force_all_ctrs_set(1))) {
+    printf("Failed force all ctrs: %d.\n", ret);
+    return (worked = false);
+  }
+  if ((classes & KPC_CLASS_CONFIGURABLE_MASK) && reg_count) {
+    if ((ret = kpc_set_config(classes, regs))) {
+      printf("Failed set kpc config: %d.\n", ret);
+      return (worked = false);
+    }
+  }
+
+  // start counting
+  if ((ret = kpc_set_counting(classes))) {
+    printf("Failed set counting: %d.\n", ret);
+    return (worked = false);
+  }
+  if ((ret = kpc_set_thread_counting(classes))) {
+    printf("Failed set thread counting: %d.\n", ret);
+    return (worked = false);
+  }
+
+  return (worked = true);
+}
+
+// Snapshot the calling thread's counters.
+// The aggregate is built from counter_map slots 0, 2, 3, 1 — i.e. cycles,
+// branches, branch-misses, instructions in profile_events order; TODO(review)
+// confirm this matches the performance_counters constructor parameter order.
+// On read failure a sentinel is produced via the implicit
+// performance_counters(u64) conversion (TODO confirm such a constructor
+// exists); the warning is printed only once.
+inline performance_counters get_counters() {
+  static bool warned = false;
+  int ret;
+  // get counters before
+  if ((ret = kpc_get_thread_counters(0, KPC_MAX_COUNTERS, counters_0))) {
+    if (!warned) {
+
+      printf("Failed get thread counters before: %d.\n", ret);
+      warned = true;
+    }
+    return 1;
+  }
+  return performance_counters{
+      counters_0[counter_map[0]], counters_0[counter_map[2]],
+      counters_0[counter_map[3]],
+      counters_0[counter_map[1]]};
+}
+
+#endif
diff --git a/benchmarks/benchmark.cpp b/benchmarks/benchmark.cpp
new file mode 100644
index 0000000..c6b091f
--- /dev/null
+++ b/benchmarks/benchmark.cpp
@@ -0,0 +1,247 @@
+#if defined(__linux__) || (__APPLE__ &&  __aarch64__)
+#define USING_COUNTERS
+#endif
+// bug fix: event_counter.h declares `event_collector`, which this file uses
+// unconditionally (global `collector`, `collector.has_events()` in main), so
+// it must be included on every platform. The header is portable: it degrades
+// to a chrono-only collector when no hardware-counter backend is available,
+// keeping e.g. MSVC/Windows builds compiling.
+#include "event_counter.h"
+#include "fast_float/fast_float.h"
+
+#include <algorithm>
+#include <cfloat> // bug fix: DBL_MAX was used without including <cfloat>
+#include <chrono>
+#include <climits>
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <ctype.h>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <locale.h>
+#include <random>
+#include <sstream>
+#include <stdio.h>
+#include <string>
+#include <vector>
+
+// Parse every string in `s` as a double with fast_float::from_chars and
+// return the maximum value. Throws std::runtime_error if a string fails to
+// parse entirely (no characters consumed) — a correctness canary for the
+// benchmark. Returning the max keeps the parse results observable so the
+// work cannot be optimized away.
+template <typename CharT>
+double findmax_fastfloat64(std::vector<std::basic_string<CharT>> &s) {
+  double answer = 0;
+  double x = 0;
+  for (auto &st : s) {
+    auto [p, ec] = fast_float::from_chars(st.data(), st.data() + st.size(), x);
+    if (p == st.data()) {
+      throw std::runtime_error("bug in findmax_fastfloat");
+    }
+    answer = answer > x ? answer : x;
+  }
+  return answer;
+}
+
+// Single-precision variant of findmax_fastfloat64: parses each string as a
+// float and returns the maximum (promoted to double at the return).
+// Throws std::runtime_error on a string that does not parse at all.
+template <typename CharT>
+double findmax_fastfloat32(std::vector<std::basic_string<CharT>> &s) {
+  float answer = 0;
+  float x = 0;
+  for (auto &st : s) {
+    auto [p, ec] = fast_float::from_chars(st.data(), st.data() + st.size(), x);
+    if (p == st.data()) {
+      throw std::runtime_error("bug in findmax_fastfloat");
+    }
+    answer = answer > x ? answer : x;
+  }
+  return answer;
+}
+
+// Global collector shared by every benchmark run.
+// NOTE(review): event_collector comes from event_counter.h — make sure that
+// header is in scope on all build configurations (it is guarded by
+// USING_COUNTERS above), or this definition will not compile e.g. on MSVC.
+event_collector collector{};
+
+#ifdef USING_COUNTERS
+// Run `function` over `lines` `repeat` times, collecting one hardware
+// event_count sample per iteration. The returned double is checked against
+// zero both as a sanity check and to keep the result observable.
+template <class T, class CharT>
+std::vector<event_count> time_it_ns(std::vector<std::basic_string<CharT>> &lines,
+                                     T const &function, size_t repeat) {
+  std::vector<event_count> aggregate;
+  bool printed_bug = false;
+  for (size_t i = 0; i < repeat; i++) {
+    collector.start();
+    double ts = function(lines);
+    if (ts == 0 && !printed_bug) {
+      printf("bug\n");
+      printed_bug = true;
+    }
+    aggregate.push_back(collector.end());
+  }
+  return aggregate;
+}
+
+// Print one benchmark row: throughput (MB/s, Mfloat/s) and — when hardware
+// counters delivered data — instructions/cycles/branches per byte and per
+// float, plus the effective frequency.
+// @param volume           total input size in bytes (per iteration)
+// @param number_of_floats values parsed per iteration
+// @param name             row label
+// @param events           one event_count sample per repetition
+void pretty_print(double volume, size_t number_of_floats, std::string name, std::vector<event_count> events) {
+  double volumeMB = volume / (1024. * 1024.);
+  double average_ns{0};
+  double min_ns{DBL_MAX};
+  double cycles_min{DBL_MAX};
+  double instructions_min{DBL_MAX};
+  double cycles_avg{0};
+  double instructions_avg{0};
+  // bug fix: the minimum accumulators must start at DBL_MAX (not 0),
+  // otherwise the computed minima are always 0.
+  double branches_min{DBL_MAX};
+  double branches_avg{0};
+  double branch_misses_min{DBL_MAX};
+  double branch_misses_avg{0};
+  for (event_count e : events) {
+    double ns = e.elapsed_ns();
+    average_ns += ns;
+    min_ns = min_ns < ns ? min_ns : ns;
+
+    double cycles = e.cycles();
+    cycles_avg += cycles;
+    cycles_min = cycles_min < cycles ? cycles_min : cycles;
+
+    double instructions = e.instructions();
+    instructions_avg += instructions;
+    instructions_min = instructions_min < instructions ? instructions_min : instructions;
+
+    double branches = e.branches();
+    branches_avg += branches;
+    branches_min = branches_min < branches ? branches_min : branches;
+
+    double branch_misses = e.missed_branches();
+    branch_misses_avg += branch_misses;
+    branch_misses_min = branch_misses_min < branch_misses ? branch_misses_min : branch_misses;
+  }
+  cycles_avg /= events.size();
+  instructions_avg /= events.size();
+  average_ns /= events.size();
+  branches_avg /= events.size();
+  // bug fix: branch_misses_avg was never divided by the sample count, so
+  // the reported bm/f figure was inflated by a factor of `repeat`.
+  branch_misses_avg /= events.size();
+  printf("%-40s: %8.2f MB/s (+/- %.1f %%) ", name.data(),
+           volumeMB * 1000000000 / min_ns,
+           (average_ns - min_ns) * 100.0 / average_ns);
+  printf("%8.2f Mfloat/s  ",
+           number_of_floats * 1000 / min_ns);
+  // instructions_min stays at DBL_MAX > 0 even when the counter backend
+  // returned nothing; the guard keeps its original semantics of only
+  // printing the counter columns when instruction counts are meaningful.
+  if (instructions_min > 0) {
+    printf(" %8.2f i/B %8.2f i/f (+/- %.1f %%) ",
+           instructions_min / volume,
+           instructions_min / number_of_floats,
+           (instructions_avg - instructions_min) * 100.0 / instructions_avg);
+
+    printf(" %8.2f c/B %8.2f c/f (+/- %.1f %%) ",
+           cycles_min / volume,
+           cycles_min / number_of_floats,
+           (cycles_avg - cycles_min) * 100.0 / cycles_avg);
+    printf(" %8.2f i/c ",
+           instructions_min / cycles_min);
+    printf(" %8.2f b/f ",
+           branches_avg / number_of_floats);
+    printf(" %8.2f bm/f ",
+           branch_misses_avg / number_of_floats);
+    printf(" %8.2f GHz ",
+           cycles_min / min_ns);
+  }
+  printf("\n");
+}
+#else
+// Fallback timer when no hardware counters are available: run `function`
+// over `lines` `repeat` times and time each run with the wall clock.
+// @return (minimum ns, average ns) across the repetitions.
+template <class T, class CharT>
+std::pair<double, double> time_it_ns(std::vector<std::basic_string<CharT>> &lines,
+                                     T const &function, size_t repeat) {
+  std::chrono::high_resolution_clock::time_point t1, t2;
+  double average = 0;
+  double min_value = DBL_MAX;
+  bool printed_bug = false;
+  for (size_t i = 0; i < repeat; i++) {
+    t1 = std::chrono::high_resolution_clock::now();
+    double ts = function(lines);
+    // sanity check: a zero max means the parser produced nothing useful
+    if (ts == 0 && !printed_bug) {
+      printf("bug\n");
+      printed_bug = true;
+    }
+    t2 = std::chrono::high_resolution_clock::now();
+    double dif =
+        std::chrono::duration_cast<std::chrono::nanoseconds>(t2 - t1).count();
+    average += dif;
+    min_value = min_value < dif ? min_value : dif;
+  }
+  average /= repeat;
+  return std::make_pair(min_value, average);
+}
+
+
+
+
+// Print one benchmark row from wall-clock timings only.
+// @param result (minimum ns, average ns) as returned by time_it_ns above.
+void pretty_print(double volume, size_t number_of_floats, std::string name, std::pair<double,double> result) {
+  double volumeMB = volume / (1024. * 1024.);
+  printf("%-40s: %8.2f MB/s (+/- %.1f %%) ", name.data(),
+           volumeMB * 1000000000 / result.first,
+           (result.second - result.first) * 100.0 / result.second);
+  printf("%8.2f Mfloat/s  ", 
+           number_of_floats * 1000 / result.first);
+  printf(" %8.2f ns/f \n", 
+           double(result.first) /number_of_floats );
+}
+#endif 
+
+
+// this is okay, all chars are ASCII
+// Widen a byte string to UTF-16 by zero-extending each char.
+// This is only correct for ASCII input (as the benchmark data is); any
+// byte >= 0x80 would be mapped to a bogus code unit.
+inline std::u16string widen(std::string line) {
+  std::u16string u16line;
+  u16line.resize(line.size());
+  for (size_t i = 0; i < line.size(); ++i) {
+    u16line[i] = char16_t(line[i]);
+  }
+  return u16line;
+}
+
+// Widen every line to UTF-16 (ASCII-only; see widen(std::string) above).
+std::vector<std::u16string> widen(const std::vector<std::string> &lines) {
+  std::vector<std::u16string> u16lines;
+  u16lines.reserve(lines.size());
+  for (auto const &line : lines) {
+    u16lines.push_back(widen(line));
+  }
+  return u16lines;
+}
+
+
+// Benchmark the 64- and 32-bit parsers over `lines`, first as ASCII/char,
+// then as UTF-16 (volume doubles since each code unit is 2 bytes).
+// @param volume total payload size of `lines` in bytes.
+void process(std::vector<std::string> &lines, size_t volume) {
+  size_t repeat = 100; // repetitions per measurement
+  double volumeMB = volume / (1024. * 1024.);
+  std::cout << "ASCII volume = " << volumeMB << " MB " << std::endl;
+  pretty_print(volume, lines.size(), "fastfloat (64)", time_it_ns(lines, findmax_fastfloat64<char>, repeat));
+  pretty_print(volume, lines.size(), "fastfloat (32)", time_it_ns(lines, findmax_fastfloat32<char>, repeat));
+
+  std::vector<std::u16string> lines16 = widen(lines);
+  volume = 2 * volume;
+  volumeMB = volume / (1024. * 1024.);
+  std::cout << "UTF-16 volume = " << volumeMB << " MB " << std::endl;
+  pretty_print(volume, lines.size(), "fastfloat (64)", time_it_ns(lines16, findmax_fastfloat64<char16_t>, repeat));
+  pretty_print(volume, lines.size(), "fastfloat (32)", time_it_ns(lines16, findmax_fastfloat32<char16_t>, repeat));
+
+}
+
+// Load a data file (one number per line) and benchmark it via process().
+// Prints a diagnostic and returns without benchmarking if the file cannot
+// be opened. Note: `volume` counts payload bytes only, excluding newlines.
+void fileload(std::string filename) {
+  std::ifstream inputfile(filename);
+  if (!inputfile) {
+    std::cerr << "can't open " << filename << std::endl;
+    return;
+  }
+  std::cout << "#### " << std::endl;
+  std::cout << "# reading " << filename << std::endl;
+  std::cout << "#### " << std::endl;
+  std::string line;
+  std::vector<std::string> lines;
+  lines.reserve(10000); // let us reserve plenty of memory.
+  size_t volume = 0;
+  while (getline(inputfile, line)) {
+    volume += line.size();
+    lines.push_back(line);
+  }
+  std::cout << "# read " << lines.size() << " lines " << std::endl;
+  process(lines, volume);
+}
+
+
+// Entry point: report whether hardware counters are live, then benchmark
+// the bundled data files. BENCHMARK_DATA_DIR is a compile-time define
+// injected by the build system; argc/argv are currently unused.
+int main(int argc, char **argv) {
+  if(collector.has_events()) {
+    std::cout << "# Using hardware counters" << std::endl;
+  } else {
+#if defined(__linux__) || (__APPLE__ &&  __aarch64__)
+    // counters exist on this platform but could not be opened (permissions)
+    std::cout << "# Hardware counters not available, try to run in privileged mode (e.g., sudo)." << std::endl;
+#endif
+  }
+  fileload(std::string(BENCHMARK_DATA_DIR) + "/canada.txt");
+  fileload(std::string(BENCHMARK_DATA_DIR) + "/mesh.txt");
+}
diff --git a/benchmarks/event_counter.h b/benchmarks/event_counter.h
new file mode 100644
index 0000000..fb6db3a
--- /dev/null
+++ b/benchmarks/event_counter.h
@@ -0,0 +1,152 @@
+// bug fix: identifiers beginning with a double underscore are reserved for
+// the implementation in C++ ([lex.name]/3.1), so the include guard is
+// renamed from __EVENT_COUNTER_H.
+#ifndef EVENT_COUNTER_H
+#define EVENT_COUNTER_H
+
+#include <cctype>
+#ifndef _MSC_VER
+#include <dirent.h>
+#endif
+#include <cinttypes>
+
+#include <cstring>
+
+#include <chrono>
+#include <vector>
+
+#include "linux-perf-events.h"
+#ifdef __linux__
+#include <libgen.h>
+#endif
+
+#if __APPLE__ &&  __aarch64__
+#include "apple_arm_events.h"
+#endif
+
+// One timing/counter sample: a wall-clock duration plus a fixed vector of
+// hardware event counts. Slot meaning is given by event_counter_types;
+// slot 4 is currently unused padding (always 0).
+struct event_count {
+  std::chrono::duration<double> elapsed;
+  std::vector<unsigned long long> event_counts;
+  event_count() : elapsed(0), event_counts{0,0,0,0,0} {}
+  event_count(const std::chrono::duration<double> _elapsed, const std::vector<unsigned long long> _event_counts) : elapsed(_elapsed), event_counts(_event_counts) {}
+  // NOTE(review): copy constructor and operator= below are equivalent to
+  // the compiler-generated defaults and could be `= default`.
+  event_count(const event_count& other): elapsed(other.elapsed), event_counts(other.event_counts) { }
+
+  // The types of counters (so we can read the getter more easily)
+  enum event_counter_types {
+    CPU_CYCLES = 0,
+    INSTRUCTIONS = 1,
+    BRANCHES = 2,
+    MISSED_BRANCHES = 3
+  };
+
+  double elapsed_sec() const { return std::chrono::duration<double>(elapsed).count(); }
+  double elapsed_ns() const { return std::chrono::duration<double, std::nano>(elapsed).count(); }
+  double cycles() const { return static_cast<double>(event_counts[CPU_CYCLES]); }
+  double instructions() const { return static_cast<double>(event_counts[INSTRUCTIONS]); }
+  double branches() const { return static_cast<double>(event_counts[BRANCHES]); }
+  double missed_branches() const { return static_cast<double>(event_counts[MISSED_BRANCHES]); }
+
+  event_count& operator=(const event_count& other) {
+    this->elapsed = other.elapsed;
+    this->event_counts = other.event_counts;
+    return *this;
+  }
+  // Element-wise sum of two samples; used to aggregate repetitions.
+  event_count operator+(const event_count& other) const {
+    return event_count(elapsed+other.elapsed, {
+      event_counts[0]+other.event_counts[0],
+      event_counts[1]+other.event_counts[1],
+      event_counts[2]+other.event_counts[2],
+      event_counts[3]+other.event_counts[3],
+      event_counts[4]+other.event_counts[4],
+    });
+  }
+
+  void operator+=(const event_count& other) {
+    *this = *this + other;
+  }
+};
+
+// Running aggregate over event_count samples: total (for averages), plus
+// the fastest (best) and slowest (worst) sample seen so far.
+struct event_aggregate {
+  bool has_events = false;
+  int iterations = 0;
+  event_count total{};
+  event_count best{};
+  event_count worst{};
+
+  event_aggregate() = default;
+
+  // Record one sample; "best"/"worst" are judged by elapsed time only.
+  void operator<<(const event_count& other) {
+    if (iterations == 0 || other.elapsed < best.elapsed) {
+      best = other;
+    }
+    if (iterations == 0 || other.elapsed > worst.elapsed) {
+      worst = other;
+    }
+    iterations++;
+    total += other;
+  }
+
+  // Per-iteration averages; dividing by zero iterations yields NaN/inf,
+  // so call only after at least one sample was recorded.
+  double elapsed_sec() const { return total.elapsed_sec() / iterations; }
+  double elapsed_ns() const { return total.elapsed_ns() / iterations; }
+  double cycles() const { return total.cycles() / iterations; }
+  double instructions() const { return total.instructions() / iterations; }
+  double branches() const { return total.branches() / iterations; }
+  double missed_branches() const { return total.missed_branches() / iterations; }
+};
+
+// Platform-dispatching sampler: wraps Linux perf events, Apple Silicon kpc
+// counters, or (fallback) wall-clock-only timing behind start()/end().
+// NOTE(review): the branches below mix `__linux__` and `__linux` as guard
+// macros; both are predefined by GNU-compatible compilers on Linux, but
+// they should be unified — confirm no strict mode disables `__linux`.
+struct event_collector {
+  event_count count{};
+  std::chrono::time_point<std::chrono::steady_clock> start_clock{};
+
+#if defined(__linux__) 
+  LinuxEvents<PERF_TYPE_HARDWARE> linux_events;
+  event_collector() : linux_events(std::vector<int>{
+    PERF_COUNT_HW_CPU_CYCLES,
+    PERF_COUNT_HW_INSTRUCTIONS,
+    PERF_COUNT_HW_BRANCH_INSTRUCTIONS, // Retired branch instructions
+    PERF_COUNT_HW_BRANCH_MISSES
+  }) {}
+  bool has_events() {
+    return linux_events.is_working();
+  }
+#elif __APPLE__ &&  __aarch64__
+  performance_counters diff;
+  event_collector() : diff(0) {
+    setup_performance_counters();
+  }
+  // setup_performance_counters() caches its result, so this is cheap.
+  bool has_events() {
+    return setup_performance_counters();
+  }
+#else
+  event_collector() {}
+  bool has_events() {
+    return false;
+  }
+#endif
+
+  // Begin a measurement interval; counters are read before the clock so
+  // the clocked region is as tight as possible.
+  inline void start() {
+#if defined(__linux)
+    linux_events.start();
+#elif __APPLE__ &&  __aarch64__
+    if(has_events()) { diff = get_counters(); }
+#endif
+    start_clock = std::chrono::steady_clock::now();
+  }
+  // Finish the interval started by start(); returns a reference to the
+  // internal sample, which is overwritten by the next start()/end() pair.
+  inline event_count& end() {
+    const auto end_clock = std::chrono::steady_clock::now();
+#if defined(__linux)
+    linux_events.end(count.event_counts);
+#elif __APPLE__ &&  __aarch64__
+    if(has_events()) {
+      performance_counters end = get_counters();
+      diff = end - diff;
+    }
+    count.event_counts[0] = diff.cycles;
+    count.event_counts[1] = diff.instructions;
+    count.event_counts[2] = diff.branches;
+    count.event_counts[3] = diff.missed_branches;
+    count.event_counts[4] = 0;
+#endif
+    count.elapsed = end_clock - start_clock;
+    return count;
+  }
+};
+
+#endif
diff --git a/benchmarks/linux-perf-events.h b/benchmarks/linux-perf-events.h
new file mode 100644
index 0000000..73cfbaf
--- /dev/null
+++ b/benchmarks/linux-perf-events.h
@@ -0,0 +1,103 @@
+#pragma once
+#ifdef __linux__
+
+#include <asm/unistd.h>       // for __NR_perf_event_open
+#include <linux/perf_event.h> // for perf event constants
+#include <sys/ioctl.h>        // for ioctl
+#include <unistd.h>           // for syscall
+
+#include <cerrno>  // for errno
+#include <cstdint> // bug fix: uint64_t was used below without this include
+#include <cstring> // for memset
+#include <stdexcept>
+
+#include <iostream>
+#include <vector>
+// Thin RAII wrapper over perf_event_open(2): opens one event group for the
+// requested configs and reads all counters in a single read() using
+// PERF_FORMAT_GROUP | PERF_FORMAT_ID. Any failure flips `working` to false
+// via report_error() and the wrapper silently degrades (see is_working()).
+template <int TYPE = PERF_TYPE_HARDWARE> class LinuxEvents {
+  int fd;      // group leader fd; all other events join this group
+  bool working;
+  perf_event_attr attribs{};
+  size_t num_events{};
+  std::vector<uint64_t> temp_result_vec{};
+  std::vector<uint64_t> ids{};
+
+public:
+  // NOTE(review): fd is initialised to 0, not -1; with an empty config_vec
+  // the destructor would close fd 0 (stdin) — confirm callers never pass an
+  // empty vector, or initialise fd to -1.
+  explicit LinuxEvents(std::vector<int> config_vec) : fd(0), working(true) {
+    memset(&attribs, 0, sizeof(attribs));
+    attribs.type = TYPE;
+    attribs.size = sizeof(attribs);
+    attribs.disabled = 1;
+    attribs.exclude_kernel = 1;
+    attribs.exclude_hv = 1;
+
+    attribs.sample_period = 0;
+    attribs.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;
+    const int pid = 0;  // the current process
+    const int cpu = -1; // all CPUs
+    const unsigned long flags = 0;
+
+    int group = -1; // no group
+    num_events = config_vec.size();
+    ids.resize(config_vec.size());
+    uint32_t i = 0;
+    for (auto config : config_vec) {
+      attribs.config = config;
+      int _fd = static_cast<int>(syscall(__NR_perf_event_open, &attribs, pid, cpu, group, flags));
+      if (_fd == -1) {
+        report_error("perf_event_open");
+      }
+      ioctl(_fd, PERF_EVENT_IOC_ID, &ids[i++]);
+      // The first event opened becomes the group leader; later events pass
+      // its fd as `group` so one read() covers the whole group.
+      if (group == -1) {
+        group = _fd;
+        fd = _fd;
+      }
+    }
+
+    // PERF_FORMAT_GROUP layout: { nr, { value, id } * nr }
+    temp_result_vec.resize(num_events * 2 + 1);
+  }
+
+  // Closing the group leader tears down the whole event group.
+  ~LinuxEvents() { if (fd != -1) { close(fd); } }
+
+  inline void start() {
+    if (fd != -1) {
+      if (ioctl(fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP) == -1) {
+        report_error("ioctl(PERF_EVENT_IOC_RESET)");
+      }
+
+      if (ioctl(fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) == -1) {
+        report_error("ioctl(PERF_EVENT_IOC_ENABLE)");
+      }
+    }
+  }
+
+  // Stop the group and copy counter values into `results` (one slot per
+  // event, in the order the configs were passed to the constructor).
+  inline void end(std::vector<unsigned long long> &results) {
+    if (fd != -1) {
+      if (ioctl(fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP) == -1) {
+        report_error("ioctl(PERF_EVENT_IOC_DISABLE)");
+      }
+
+      // `* 8` assumes sizeof(uint64_t) == 8, which holds on Linux targets.
+      if (read(fd, temp_result_vec.data(), temp_result_vec.size() * 8) == -1) {
+        report_error("read");
+      }
+    }
+    // our actual results are in slots 1,3,5, ... of this structure
+    for (uint32_t i = 1; i < temp_result_vec.size(); i += 2) {
+      results[i / 2] = temp_result_vec[i];
+    }
+    // slots 2,4,6,... carry the event ids; verify they match what we opened
+    for (uint32_t i = 2; i < temp_result_vec.size(); i += 2) {
+      if(ids[i/2-1] != temp_result_vec[i]) {
+        report_error("event mismatch");
+      }
+    }
+  }
+
+  bool is_working() {
+    return working;
+  }
+
+private:
+  // Degrade silently: callers poll is_working() instead of handling errors.
+  void report_error(const std::string &) {
+    working = false;
+  }
+};
+#endif
\ No newline at end of file