Add actual benchmarks to the project
diff --git a/.github/workflows/ubuntu24.yml b/.github/workflows/ubuntu24.yml
index 0a327b3..8da4c5f 100644
--- a/.github/workflows/ubuntu24.yml
+++ b/.github/workflows/ubuntu24.yml
@@ -11,7 +11,7 @@
run: |
mkdir build &&
cd build &&
- CXXFLAGS=-Werror cmake -DFASTFLOAT_TEST=ON .. &&
+ CXXFLAGS=-Werror cmake -DFASTFLOAT_TEST=ON -D FASTFLOAT_BENCHMARKS=ON .. &&
cmake --build . &&
ctest --output-on-failure
- name: Use cmake CXX23
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e331b15..94fc0b7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -4,6 +4,7 @@
set(FASTFLOAT_CXX_STANDARD 11 CACHE STRING "the C++ standard to use for fastfloat")
set(CMAKE_CXX_STANDARD ${FASTFLOAT_CXX_STANDARD})
option(FASTFLOAT_TEST "Enable tests" OFF)
+
if(FASTFLOAT_TEST)
enable_testing()
add_subdirectory(tests)
@@ -29,6 +30,16 @@
endif()
add_library(fast_float INTERFACE)
+
+
+option(FASTFLOAT_BENCHMARKS "Enable benchmarks" OFF)
+if(FASTFLOAT_BENCHMARKS)
+ add_subdirectory(benchmarks)
+else(FASTFLOAT_BENCHMARKS)
+ message(STATUS "Benchmarks are disabled. Set FASTFLOAT_BENCHMARKS to ON to build benchmarks (assumes C++17).")
+endif(FASTFLOAT_BENCHMARKS)
+
+
add_library(FastFloat::fast_float ALIAS fast_float)
target_include_directories(
fast_float
diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
new file mode 100644
index 0000000..b4e0395
--- /dev/null
+++ b/benchmarks/CMakeLists.txt
@@ -0,0 +1,26 @@
+add_executable(realbenchmark benchmark.cpp)
+set_property(
+ TARGET realbenchmark
+ PROPERTY CXX_STANDARD 17)
+
+target_link_libraries(realbenchmark PUBLIC fast_float)
+include(ExternalProject)
+
+# Define the external project
+ExternalProject_Add(simple_fastfloat_benchmark
+ GIT_REPOSITORY https://github.com/lemire/simple_fastfloat_benchmark.git
+ GIT_TAG master # or specify a particular commit/tag/branch
+ SOURCE_DIR ${CMAKE_BINARY_DIR}/simple_fastfloat_benchmark
+ BINARY_DIR ${CMAKE_BINARY_DIR}/simple_fastfloat_benchmark-build
+ CONFIGURE_COMMAND ""
+ BUILD_COMMAND ""
+ INSTALL_COMMAND ""
+)
+set(DATA_DIR ${CMAKE_BINARY_DIR}/simple_fastfloat_benchmark/data)
+
+add_custom_target(CopyData ALL
+ COMMAND ${CMAKE_COMMAND} -E copy_directory ${DATA_DIR} ${CMAKE_CURRENT_BINARY_DIR}/data
+ DEPENDS simple_fastfloat_benchmark
+)
+add_dependencies(realbenchmark CopyData)
+target_compile_definitions(realbenchmark PUBLIC BENCHMARK_DATA_DIR="${CMAKE_CURRENT_BINARY_DIR}/data")
diff --git a/benchmarks/apple_arm_events.h b/benchmarks/apple_arm_events.h
new file mode 100644
index 0000000..3a94081
--- /dev/null
+++ b/benchmarks/apple_arm_events.h
@@ -0,0 +1,1117 @@
+// Original design from:
+// =============================================================================
+// XNU kperf/kpc
+// Available for 64-bit Intel/Apple Silicon, macOS/iOS, with root privileges
+//
+// References:
+//
+// XNU source (since xnu 2422.1.72):
+// https://github.com/apple/darwin-xnu/blob/main/osfmk/kern/kpc.h
+// https://github.com/apple/darwin-xnu/blob/main/bsd/kern/kern_kpc.c
+//
+// Lightweight PET (Profile Every Thread, since xnu 3789.1.32):
+// https://github.com/apple/darwin-xnu/blob/main/osfmk/kperf/pet.c
+// https://github.com/apple/darwin-xnu/blob/main/osfmk/kperf/kperf_kpc.c
+//
+// System Private frameworks (since macOS 10.11, iOS 8.0):
+// /System/Library/PrivateFrameworks/kperf.framework
+// /System/Library/PrivateFrameworks/kperfdata.framework
+//
+// Xcode framework (since Xcode 7.0):
+// /Applications/Xcode.app/Contents/SharedFrameworks/DVTInstrumentsFoundation.framework
+//
+// CPU database (plist files)
+// macOS (since macOS 10.11):
+// /usr/share/kpep/<name>.plist
+// iOS (copied from Xcode, since iOS 10.0, Xcode 8.0):
+// /Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform
+// /DeviceSupport/<version>/DeveloperDiskImage.dmg/usr/share/kpep/<name>.plist
+//
+//
+// Created by YaoYuan <[email protected]> on 2021.
+// Released into the public domain (unlicense.org).
+// =============================================================================
+
+#ifndef M1CYCLES_H
+#define M1CYCLES_H
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <dlfcn.h> // for dlopen() and dlsym()
+#include <mach/mach_time.h> // for mach_absolute_time()
+#include <sys/kdebug.h> // for kdebug trace decode
+#include <sys/sysctl.h> // for sysctl()
+#include <unistd.h> // for usleep()
+
+struct performance_counters {
+ double cycles;
+ double branches;
+ double missed_branches;
+ double instructions;
+ performance_counters(uint64_t c, uint64_t b, uint64_t m, uint64_t i)
+ : cycles(c), branches(b), missed_branches(m), instructions(i) {}
+ performance_counters(double c, double b, double m, double i)
+ : cycles(c), branches(b), missed_branches(m), instructions(i) {}
+ performance_counters(double init)
+ : cycles(init), branches(init), missed_branches(init),
+ instructions(init) {}
+
+ inline performance_counters &operator-=(const performance_counters &other) {
+ cycles -= other.cycles;
+ branches -= other.branches;
+ missed_branches -= other.missed_branches;
+ instructions -= other.instructions;
+ return *this;
+ }
+ inline performance_counters &min(const performance_counters &other) {
+ cycles = other.cycles < cycles ? other.cycles : cycles;
+ branches = other.branches < branches ? other.branches : branches;
+ missed_branches = other.missed_branches < missed_branches
+ ? other.missed_branches
+ : missed_branches;
+ instructions =
+ other.instructions < instructions ? other.instructions : instructions;
+ return *this;
+ }
+ inline performance_counters &operator+=(const performance_counters &other) {
+ cycles += other.cycles;
+ branches += other.branches;
+ missed_branches += other.missed_branches;
+ instructions += other.instructions;
+ return *this;
+ }
+
+ inline performance_counters &operator/=(double numerator) {
+ cycles /= numerator;
+ branches /= numerator;
+ missed_branches /= numerator;
+ instructions /= numerator;
+ return *this;
+ }
+};
+
+inline performance_counters operator-(const performance_counters &a,
+ const performance_counters &b) {
+ return performance_counters(a.cycles - b.cycles, a.branches - b.branches,
+ a.missed_branches - b.missed_branches,
+ a.instructions - b.instructions);
+}
+
+
+
+typedef float f32;
+typedef double f64;
+typedef int8_t i8;
+typedef uint8_t u8;
+typedef int16_t i16;
+typedef uint16_t u16;
+typedef int32_t i32;
+typedef uint32_t u32;
+typedef int64_t i64;
+typedef uint64_t u64;
+typedef size_t usize;
+
+// -----------------------------------------------------------------------------
+// <kperf.framework> header (reverse engineered)
+// This framework wraps some sysctl calls to communicate with the kpc in kernel.
+// Most functions require root privileges, or the process is "blessed".
+// -----------------------------------------------------------------------------
+
+// Cross-platform class constants.
+#define KPC_CLASS_FIXED (0)
+#define KPC_CLASS_CONFIGURABLE (1)
+#define KPC_CLASS_POWER (2)
+#define KPC_CLASS_RAWPMU (3)
+
+// Cross-platform class mask constants.
+#define KPC_CLASS_FIXED_MASK (1u << KPC_CLASS_FIXED) // 1
+#define KPC_CLASS_CONFIGURABLE_MASK (1u << KPC_CLASS_CONFIGURABLE) // 2
+#define KPC_CLASS_POWER_MASK (1u << KPC_CLASS_POWER) // 4
+#define KPC_CLASS_RAWPMU_MASK (1u << KPC_CLASS_RAWPMU) // 8
+
+// PMU version constants.
+#define KPC_PMU_ERROR (0) // Error
+#define KPC_PMU_INTEL_V3 (1) // Intel
+#define KPC_PMU_ARM_APPLE (2) // ARM64
+#define KPC_PMU_INTEL_V2 (3) // Old Intel
+#define KPC_PMU_ARM_V2 (4) // Old ARM
+
+// The maximum number of counters we could read from every class in one go.
+// ARMV7: FIXED: 1, CONFIGURABLE: 4
+// ARM32: FIXED: 2, CONFIGURABLE: 6
+// ARM64: FIXED: 2, CONFIGURABLE: CORE_NCTRS - FIXED (6 or 8)
+// x86: 32
+#define KPC_MAX_COUNTERS 32
+
+// Bits for defining what to do on an action.
+// Defined in https://github.com/apple/darwin-xnu/blob/main/osfmk/kperf/action.h
+#define KPERF_SAMPLER_TH_INFO (1U << 0)
+#define KPERF_SAMPLER_TH_SNAPSHOT (1U << 1)
+#define KPERF_SAMPLER_KSTACK (1U << 2)
+#define KPERF_SAMPLER_USTACK (1U << 3)
+#define KPERF_SAMPLER_PMC_THREAD (1U << 4)
+#define KPERF_SAMPLER_PMC_CPU (1U << 5)
+#define KPERF_SAMPLER_PMC_CONFIG (1U << 6)
+#define KPERF_SAMPLER_MEMINFO (1U << 7)
+#define KPERF_SAMPLER_TH_SCHEDULING (1U << 8)
+#define KPERF_SAMPLER_TH_DISPATCH (1U << 9)
+#define KPERF_SAMPLER_TK_SNAPSHOT (1U << 10)
+#define KPERF_SAMPLER_SYS_MEM (1U << 11)
+#define KPERF_SAMPLER_TH_INSCYC (1U << 12)
+#define KPERF_SAMPLER_TK_INFO (1U << 13)
+
+// Maximum number of kperf action ids.
+#define KPERF_ACTION_MAX (32)
+
+// Maximum number of kperf timer ids.
+#define KPERF_TIMER_MAX (8)
+
+// x86/arm config registers are 64-bit
+typedef u64 kpc_config_t;
+
+/// Print current CPU identification string to the buffer (same as snprintf),
+/// such as "cpu_7_8_10b282dc_46". This string can be used to locate the PMC
+/// database in /usr/share/kpep.
+/// @return string's length, or negative value if error occurs.
+/// @note This method does not require root privileges.
+/// @details sysctl get(hw.cputype), get(hw.cpusubtype),
+/// get(hw.cpufamily), get(machdep.cpu.model)
+static int (*kpc_cpu_string)(char *buf, usize buf_size);
+
+/// Get the version of KPC that's being run.
+/// @return See `PMU version constants` above.
+/// @details sysctl get(kpc.pmu_version)
+static u32 (*kpc_pmu_version)(void);
+
+/// Get running PMC classes.
+/// @return See `class mask constants` above,
+/// 0 if error occurs or no class is set.
+/// @details sysctl get(kpc.counting)
+static u32 (*kpc_get_counting)(void);
+
+/// Set PMC classes to enable counting.
+/// @param classes See `class mask constants` above, set 0 to shutdown counting.
+/// @return 0 for success.
+/// @details sysctl set(kpc.counting)
+static int (*kpc_set_counting)(u32 classes);
+
+/// Get running PMC classes for current thread.
+/// @return See `class mask constants` above,
+/// 0 if error occurs or no class is set.
+/// @details sysctl get(kpc.thread_counting)
+static u32 (*kpc_get_thread_counting)(void);
+
+/// Set PMC classes to enable counting for current thread.
+/// @param classes See `class mask constants` above, set 0 to shutdown counting.
+/// @return 0 for success.
+/// @details sysctl set(kpc.thread_counting)
+static int (*kpc_set_thread_counting)(u32 classes);
+
+/// Get how many config registers there are for a given mask.
+/// For example: Intel may return 1 for `KPC_CLASS_FIXED_MASK`,
+/// returns 4 for `KPC_CLASS_CONFIGURABLE_MASK`.
+/// @param classes See `class mask constants` above.
+/// @return 0 if error occurs or no class is set.
+/// @note This method does not require root privileges.
+/// @details sysctl get(kpc.config_count)
+static u32 (*kpc_get_config_count)(u32 classes);
+
+/// Get config registers.
+/// @param classes see `class mask constants` above.
+/// @param config Config buffer to receive values, should not be smaller than
+/// kpc_get_config_count(classes) * sizeof(kpc_config_t).
+/// @return 0 for success.
+/// @details sysctl get(kpc.config_count), get(kpc.config)
+static int (*kpc_get_config)(u32 classes, kpc_config_t *config);
+
+/// Set config registers.
+/// @param classes see `class mask constants` above.
+/// @param config Config buffer, should not be smaller than
+/// kpc_get_config_count(classes) * sizeof(kpc_config_t).
+/// @return 0 for success.
+/// @details sysctl get(kpc.config_count), set(kpc.config)
+static int (*kpc_set_config)(u32 classes, kpc_config_t *config);
+
+/// Get how many counters there are for a given mask.
+/// For example: Intel may return 3 for `KPC_CLASS_FIXED_MASK`,
+/// returns 4 for `KPC_CLASS_CONFIGURABLE_MASK`.
+/// @param classes See `class mask constants` above.
+/// @note This method does not require root privileges.
+/// @details sysctl get(kpc.counter_count)
+static u32 (*kpc_get_counter_count)(u32 classes);
+
+/// Get counter accumulations.
+/// If `all_cpus` is true, the buffer count should not be smaller than
+/// (cpu_count * counter_count). Otherwise, the buffer count should not be
+/// smaller than (counter_count).
+/// @see kpc_get_counter_count(), kpc_cpu_count().
+/// @param all_cpus true for all CPUs, false for current cpu.
+/// @param classes See `class mask constants` above.
+/// @param curcpu A pointer to receive current cpu id, can be NULL.
+/// @param buf Buffer to receive counter's value.
+/// @return 0 for success.
+/// @details sysctl get(hw.ncpu), get(kpc.counter_count), get(kpc.counters)
+static int (*kpc_get_cpu_counters)(bool all_cpus, u32 classes, int *curcpu,
+ u64 *buf);
+
+/// Get counter accumulations for current thread.
+/// @param tid Thread id, should be 0.
+/// @param buf_count The number of buf's elements (not bytes),
+/// should not be smaller than kpc_get_counter_count().
+/// @param buf Buffer to receive counter's value.
+/// @return 0 for success.
+/// @details sysctl get(kpc.thread_counters)
+static int (*kpc_get_thread_counters)(u32 tid, u32 buf_count, u64 *buf);
+
+/// Acquire/release the counters used by the Power Manager.
+/// @param val 1:acquire, 0:release
+/// @return 0 for success.
+/// @details sysctl set(kpc.force_all_ctrs)
+static int (*kpc_force_all_ctrs_set)(int val);
+
+/// Get the state of all_ctrs.
+/// @return 0 for success.
+/// @details sysctl get(kpc.force_all_ctrs)
+static int (*kpc_force_all_ctrs_get)(int *val_out);
+
+/// Set number of actions, should be `KPERF_ACTION_MAX`.
+/// @details sysctl set(kperf.action.count)
+static int (*kperf_action_count_set)(u32 count);
+
+/// Get number of actions.
+/// @details sysctl get(kperf.action.count)
+static int (*kperf_action_count_get)(u32 *count);
+
+/// Set what to sample when a trigger fires an action, e.g.
+/// `KPERF_SAMPLER_PMC_CPU`.
+/// @details sysctl set(kperf.action.samplers)
+static int (*kperf_action_samplers_set)(u32 actionid, u32 sample);
+
+/// Get what to sample when a trigger fires an action.
+/// @details sysctl get(kperf.action.samplers)
+static int (*kperf_action_samplers_get)(u32 actionid, u32 *sample);
+
+/// Apply a task filter to the action, -1 to disable filter.
+/// @details sysctl set(kperf.action.filter_by_task)
+static int (*kperf_action_filter_set_by_task)(u32 actionid, i32 port);
+
+/// Apply a pid filter to the action, -1 to disable filter.
+/// @details sysctl set(kperf.action.filter_by_pid)
+static int (*kperf_action_filter_set_by_pid)(u32 actionid, i32 pid);
+
+/// Set number of time triggers, should be `KPERF_TIMER_MAX`.
+/// @details sysctl set(kperf.timer.count)
+static int (*kperf_timer_count_set)(u32 count);
+
+/// Get number of time triggers.
+/// @details sysctl get(kperf.timer.count)
+static int (*kperf_timer_count_get)(u32 *count);
+
+/// Set timer number and period.
+/// @details sysctl set(kperf.timer.period)
+static int (*kperf_timer_period_set)(u32 actionid, u64 tick);
+
+/// Get timer number and period.
+/// @details sysctl get(kperf.timer.period)
+static int (*kperf_timer_period_get)(u32 actionid, u64 *tick);
+
+/// Set timer number and actionid.
+/// @details sysctl set(kperf.timer.action)
+static int (*kperf_timer_action_set)(u32 actionid, u32 timerid);
+
+/// Get timer number and actionid.
+/// @details sysctl get(kperf.timer.action)
+static int (*kperf_timer_action_get)(u32 actionid, u32 *timerid);
+
+/// Set which timer ID does PET (Profile Every Thread).
+/// @details sysctl set(kperf.timer.pet_timer)
+static int (*kperf_timer_pet_set)(u32 timerid);
+
+/// Get which timer ID does PET (Profile Every Thread).
+/// @details sysctl get(kperf.timer.pet_timer)
+static int (*kperf_timer_pet_get)(u32 *timerid);
+
+/// Enable or disable sampling.
+/// @details sysctl set(kperf.sampling)
+static int (*kperf_sample_set)(u32 enabled);
+
+/// Get whether sampling is currently enabled.
+/// @details sysctl get(kperf.sampling)
+static int (*kperf_sample_get)(u32 *enabled);
+
+/// Reset kperf: stop sampling, kdebug, timers and actions.
+/// @return 0 for success.
+static int (*kperf_reset)(void);
+
+/// Nanoseconds to CPU ticks.
+static u64 (*kperf_ns_to_ticks)(u64 ns);
+
+/// CPU ticks to nanoseconds.
+static u64 (*kperf_ticks_to_ns)(u64 ticks);
+
+/// CPU ticks frequency (mach_absolute_time).
+static u64 (*kperf_tick_frequency)(void);
+
+/// Get lightweight PET mode (not in kperf.framework).
+static int kperf_lightweight_pet_get(u32 *enabled) {
+ if (!enabled)
+ return -1;
+ usize size = 4;
+ return sysctlbyname("kperf.lightweight_pet", enabled, &size, NULL, 0);
+}
+
+/// Set lightweight PET mode (not in kperf.framework).
+static int kperf_lightweight_pet_set(u32 enabled) {
+ return sysctlbyname("kperf.lightweight_pet", NULL, NULL, &enabled, 4);
+}
+
+// -----------------------------------------------------------------------------
+// <kperfdata.framework> header (reverse engineered)
+// This framework provides some functions to access the local CPU database.
+// These functions do not require root privileges.
+// -----------------------------------------------------------------------------
+
+// KPEP CPU architecture constants.
+#define KPEP_ARCH_I386 0
+#define KPEP_ARCH_X86_64 1
+#define KPEP_ARCH_ARM 2
+#define KPEP_ARCH_ARM64 3
+
+/// KPEP event (size: 48/28 bytes on 64/32 bit OS)
+typedef struct kpep_event {
+ const char *name; ///< Unique name of a event, such as "INST_RETIRED.ANY".
+ const char *description; ///< Description for this event.
+ const char *errata; ///< Errata, currently NULL.
+ const char *alias; ///< Alias name, such as "Instructions", "Cycles".
+ const char *fallback; ///< Fallback event name for fixed counter.
+ u32 mask;
+ u8 number;
+ u8 umask;
+ u8 reserved;
+ u8 is_fixed;
+} kpep_event;
+
+/// KPEP database (size: 144/80 bytes on 64/32 bit OS)
+typedef struct kpep_db {
+ const char *name; ///< Database name, such as "haswell".
+ const char *cpu_id; ///< Plist name, such as "cpu_7_8_10b282dc".
+ const char *marketing_name; ///< Marketing name, such as "Intel Haswell".
+ void *plist_data; ///< Plist data (CFDataRef), currently NULL.
+ void *event_map; ///< All events (CFDict<CFSTR(event_name), kpep_event *>).
+ kpep_event
+ *event_arr; ///< Event struct buffer (sizeof(kpep_event) * events_count).
+ kpep_event **fixed_event_arr; ///< Fixed counter events (sizeof(kpep_event *)
+ ///< * fixed_counter_count)
+ void *alias_map; ///< All aliases (CFDict<CFSTR(event_name), kpep_event *>).
+ usize reserved_1;
+ usize reserved_2;
+ usize reserved_3;
+ usize event_count; ///< All events count.
+ usize alias_count;
+ usize fixed_counter_count;
+ usize config_counter_count;
+ usize power_counter_count;
+ u32 archtecture; ///< see `KPEP CPU archtecture constants` above.
+ u32 fixed_counter_bits;
+ u32 config_counter_bits;
+ u32 power_counter_bits;
+} kpep_db;
+
+/// KPEP config (size: 80/44 bytes on 64/32 bit OS)
+typedef struct kpep_config {
+ kpep_db *db;
+ kpep_event **ev_arr; ///< (sizeof(kpep_event *) * counter_count), init NULL
+ usize *ev_map; ///< (sizeof(usize *) * counter_count), init 0
+ usize *ev_idx; ///< (sizeof(usize *) * counter_count), init -1
+ u32 *flags; ///< (sizeof(u32 *) * counter_count), init 0
+ u64 *kpc_periods; ///< (sizeof(u64 *) * counter_count), init 0
+ usize event_count; /// kpep_config_events_count()
+ usize counter_count;
+ u32 classes; ///< See `class mask constants` above.
+ u32 config_counter;
+ u32 power_counter;
+ u32 reserved;
+} kpep_config;
+
+/// Error code for kpep_config_xxx() and kpep_db_xxx() functions.
+typedef enum {
+ KPEP_CONFIG_ERROR_NONE = 0,
+ KPEP_CONFIG_ERROR_INVALID_ARGUMENT = 1,
+ KPEP_CONFIG_ERROR_OUT_OF_MEMORY = 2,
+ KPEP_CONFIG_ERROR_IO = 3,
+ KPEP_CONFIG_ERROR_BUFFER_TOO_SMALL = 4,
+ KPEP_CONFIG_ERROR_CUR_SYSTEM_UNKNOWN = 5,
+ KPEP_CONFIG_ERROR_DB_PATH_INVALID = 6,
+ KPEP_CONFIG_ERROR_DB_NOT_FOUND = 7,
+ KPEP_CONFIG_ERROR_DB_ARCH_UNSUPPORTED = 8,
+ KPEP_CONFIG_ERROR_DB_VERSION_UNSUPPORTED = 9,
+ KPEP_CONFIG_ERROR_DB_CORRUPT = 10,
+ KPEP_CONFIG_ERROR_EVENT_NOT_FOUND = 11,
+ KPEP_CONFIG_ERROR_CONFLICTING_EVENTS = 12,
+ KPEP_CONFIG_ERROR_COUNTERS_NOT_FORCED = 13,
+ KPEP_CONFIG_ERROR_EVENT_UNAVAILABLE = 14,
+ KPEP_CONFIG_ERROR_ERRNO = 15,
+ KPEP_CONFIG_ERROR_MAX
+} kpep_config_error_code;
+
+/// Error description for kpep_config_error_code.
+static const char *kpep_config_error_names[KPEP_CONFIG_ERROR_MAX] = {
+ "none",
+ "invalid argument",
+ "out of memory",
+ "I/O",
+ "buffer too small",
+ "current system unknown",
+ "database path invalid",
+ "database not found",
+ "database architecture unsupported",
+ "database version unsupported",
+ "database corrupt",
+ "event not found",
+ "conflicting events",
+ "all counters must be forced",
+ "event unavailable",
+ "check errno"};
+
+/// Error description.
+static const char *kpep_config_error_desc(int code) {
+ if (0 <= code && code < KPEP_CONFIG_ERROR_MAX) {
+ return kpep_config_error_names[code];
+ }
+ return "unknown error";
+}
+
+/// Create a config.
+/// @param db A kpep db, see kpep_db_create()
+/// @param cfg_ptr A pointer to receive the new config.
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_config_create)(kpep_db *db, kpep_config **cfg_ptr);
+
+/// Free the config.
+static void (*kpep_config_free)(kpep_config *cfg);
+
+/// Add an event to config.
+/// @param cfg The config.
+/// @param ev_ptr A event pointer.
+/// @param flag 0: all, 1: user space only
+/// @param err Error bitmap pointer, can be NULL.
+/// If return value is `CONFLICTING_EVENTS`, this bitmap contains
+/// the conflicted event indices, e.g. "1 << 2" means index 2.
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_config_add_event)(kpep_config *cfg, kpep_event **ev_ptr,
+ u32 flag, u32 *err);
+
+/// Remove event at index.
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_config_remove_event)(kpep_config *cfg, usize idx);
+
+/// Force all counters.
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_config_force_counters)(kpep_config *cfg);
+
+/// Get events count.
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_config_events_count)(kpep_config *cfg, usize *count_ptr);
+
+/// Get all event pointers.
+/// @param buf A buffer to receive event pointers.
+/// @param buf_size The buffer's size in bytes, should not be smaller than
+/// kpep_config_events_count() * sizeof(void *).
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_config_events)(kpep_config *cfg, kpep_event **buf,
+ usize buf_size);
+
+/// Get kpc register configs.
+/// @param buf A buffer to receive kpc register configs.
+/// @param buf_size The buffer's size in bytes, should not be smaller than
+/// kpep_config_kpc_count() * sizeof(kpc_config_t).
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_config_kpc)(kpep_config *cfg, kpc_config_t *buf,
+ usize buf_size);
+
+/// Get kpc register config count.
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_config_kpc_count)(kpep_config *cfg, usize *count_ptr);
+
+/// Get kpc classes.
+/// @param classes See `class mask constants` above.
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_config_kpc_classes)(kpep_config *cfg, u32 *classes_ptr);
+
+/// Get the index mapping from event to counter.
+/// @param buf A buffer to receive indexes.
+/// @param buf_size The buffer's size in bytes, should not be smaller than
+/// kpep_config_events_count() * sizeof(kpc_config_t).
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_config_kpc_map)(kpep_config *cfg, usize *buf, usize buf_size);
+
+/// Open a kpep database file in "/usr/share/kpep/" or "/usr/local/share/kpep/".
+/// @param name File name, for example "haswell", "cpu_100000c_1_92fb37c8".
+/// Pass NULL for current CPU.
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_db_create)(const char *name, kpep_db **db_ptr);
+
+/// Free the kpep database.
+static void (*kpep_db_free)(kpep_db *db);
+
+/// Get the database's name.
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_db_name)(kpep_db *db, const char **name);
+
+/// Get the event alias count.
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_db_aliases_count)(kpep_db *db, usize *count);
+
+/// Get all alias.
+/// @param buf A buffer to receive all alias strings.
+/// @param buf_size The buffer's size in bytes,
+/// should not be smaller than kpep_db_aliases_count() * sizeof(void *).
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_db_aliases)(kpep_db *db, const char **buf, usize buf_size);
+
+/// Get counters count for given classes.
+/// @param classes 1: Fixed, 2: Configurable.
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_db_counters_count)(kpep_db *db, u8 classes, usize *count);
+
+/// Get all event count.
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_db_events_count)(kpep_db *db, usize *count);
+
+/// Get all events.
+/// @param buf A buffer to receive all event pointers.
+/// @param buf_size The buffer's size in bytes,
+/// should not be smaller than kpep_db_events_count() * sizeof(void *).
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_db_events)(kpep_db *db, kpep_event **buf, usize buf_size);
+
+/// Get one event by name.
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_db_event)(kpep_db *db, const char *name, kpep_event **ev_ptr);
+
+/// Get event's name.
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_event_name)(kpep_event *ev, const char **name_ptr);
+
+/// Get event's alias.
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_event_alias)(kpep_event *ev, const char **alias_ptr);
+
+/// Get event's description.
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_event_description)(kpep_event *ev, const char **str_ptr);
+
+// -----------------------------------------------------------------------------
+// load kperf/kperfdata dynamic library
+// -----------------------------------------------------------------------------
+
+typedef struct {
+ const char *name;
+ void **impl;
+} lib_symbol;
+
+#define lib_nelems(x) (sizeof(x) / sizeof((x)[0]))
+#define lib_symbol_def(name) \
+ { \
+#name, (void **)&name \
+ }
+
+static const lib_symbol lib_symbols_kperf[] = {
+ lib_symbol_def(kpc_pmu_version),
+ lib_symbol_def(kpc_cpu_string),
+ lib_symbol_def(kpc_set_counting),
+ lib_symbol_def(kpc_get_counting),
+ lib_symbol_def(kpc_set_thread_counting),
+ lib_symbol_def(kpc_get_thread_counting),
+ lib_symbol_def(kpc_get_config_count),
+ lib_symbol_def(kpc_get_counter_count),
+ lib_symbol_def(kpc_set_config),
+ lib_symbol_def(kpc_get_config),
+ lib_symbol_def(kpc_get_cpu_counters),
+ lib_symbol_def(kpc_get_thread_counters),
+ lib_symbol_def(kpc_force_all_ctrs_set),
+ lib_symbol_def(kpc_force_all_ctrs_get),
+ lib_symbol_def(kperf_action_count_set),
+ lib_symbol_def(kperf_action_count_get),
+ lib_symbol_def(kperf_action_samplers_set),
+ lib_symbol_def(kperf_action_samplers_get),
+ lib_symbol_def(kperf_action_filter_set_by_task),
+ lib_symbol_def(kperf_action_filter_set_by_pid),
+ lib_symbol_def(kperf_timer_count_set),
+ lib_symbol_def(kperf_timer_count_get),
+ lib_symbol_def(kperf_timer_period_set),
+ lib_symbol_def(kperf_timer_period_get),
+ lib_symbol_def(kperf_timer_action_set),
+ lib_symbol_def(kperf_timer_action_get),
+ lib_symbol_def(kperf_sample_set),
+ lib_symbol_def(kperf_sample_get),
+ lib_symbol_def(kperf_reset),
+ lib_symbol_def(kperf_timer_pet_set),
+ lib_symbol_def(kperf_timer_pet_get),
+ lib_symbol_def(kperf_ns_to_ticks),
+ lib_symbol_def(kperf_ticks_to_ns),
+ lib_symbol_def(kperf_tick_frequency),
+};
+
+static const lib_symbol lib_symbols_kperfdata[] = {
+ lib_symbol_def(kpep_config_create),
+ lib_symbol_def(kpep_config_free),
+ lib_symbol_def(kpep_config_add_event),
+ lib_symbol_def(kpep_config_remove_event),
+ lib_symbol_def(kpep_config_force_counters),
+ lib_symbol_def(kpep_config_events_count),
+ lib_symbol_def(kpep_config_events),
+ lib_symbol_def(kpep_config_kpc),
+ lib_symbol_def(kpep_config_kpc_count),
+ lib_symbol_def(kpep_config_kpc_classes),
+ lib_symbol_def(kpep_config_kpc_map),
+ lib_symbol_def(kpep_db_create),
+ lib_symbol_def(kpep_db_free),
+ lib_symbol_def(kpep_db_name),
+ lib_symbol_def(kpep_db_aliases_count),
+ lib_symbol_def(kpep_db_aliases),
+ lib_symbol_def(kpep_db_counters_count),
+ lib_symbol_def(kpep_db_events_count),
+ lib_symbol_def(kpep_db_events),
+ lib_symbol_def(kpep_db_event),
+ lib_symbol_def(kpep_event_name),
+ lib_symbol_def(kpep_event_alias),
+ lib_symbol_def(kpep_event_description),
+};
+
+#define lib_path_kperf "/System/Library/PrivateFrameworks/kperf.framework/kperf"
+#define lib_path_kperfdata \
+ "/System/Library/PrivateFrameworks/kperfdata.framework/kperfdata"
+
+static bool lib_inited = false;
+static bool lib_has_err = false;
+static char lib_err_msg[256];
+
+static void *lib_handle_kperf = NULL;
+static void *lib_handle_kperfdata = NULL;
+
+static void lib_deinit(void) {
+ lib_inited = false;
+ lib_has_err = false;
+ if (lib_handle_kperf)
+ dlclose(lib_handle_kperf);
+ if (lib_handle_kperfdata)
+ dlclose(lib_handle_kperfdata);
+ lib_handle_kperf = NULL;
+ lib_handle_kperfdata = NULL;
+ for (usize i = 0; i < lib_nelems(lib_symbols_kperf); i++) {
+ const lib_symbol *symbol = &lib_symbols_kperf[i];
+ *symbol->impl = NULL;
+ }
+ for (usize i = 0; i < lib_nelems(lib_symbols_kperfdata); i++) {
+ const lib_symbol *symbol = &lib_symbols_kperfdata[i];
+ *symbol->impl = NULL;
+ }
+}
+
+static bool lib_init(void) {
+#define return_err() \
+ do { \
+ lib_deinit(); \
+ lib_inited = true; \
+ lib_has_err = true; \
+ return false; \
+ } while (false)
+
+ if (lib_inited)
+ return !lib_has_err;
+
+ // load dynamic library
+ lib_handle_kperf = dlopen(lib_path_kperf, RTLD_LAZY);
+ if (!lib_handle_kperf) {
+ snprintf(lib_err_msg, sizeof(lib_err_msg),
+ "Failed to load kperf.framework, message: %s.", dlerror());
+ return_err();
+ }
+ lib_handle_kperfdata = dlopen(lib_path_kperfdata, RTLD_LAZY);
+ if (!lib_handle_kperfdata) {
+ snprintf(lib_err_msg, sizeof(lib_err_msg),
+ "Failed to load kperfdata.framework, message: %s.", dlerror());
+ return_err();
+ }
+
+ // load symbol address from dynamic library
+ for (usize i = 0; i < lib_nelems(lib_symbols_kperf); i++) {
+ const lib_symbol *symbol = &lib_symbols_kperf[i];
+ *symbol->impl = dlsym(lib_handle_kperf, symbol->name);
+ if (!*symbol->impl) {
+ snprintf(lib_err_msg, sizeof(lib_err_msg),
+ "Failed to load kperf function: %s.", symbol->name);
+ return_err();
+ }
+ }
+ for (usize i = 0; i < lib_nelems(lib_symbols_kperfdata); i++) {
+ const lib_symbol *symbol = &lib_symbols_kperfdata[i];
+ *symbol->impl = dlsym(lib_handle_kperfdata, symbol->name);
+ if (!*symbol->impl) {
+ snprintf(lib_err_msg, sizeof(lib_err_msg),
+ "Failed to load kperfdata function: %s.", symbol->name);
+ return_err();
+ }
+ }
+
+ lib_inited = true;
+ lib_has_err = false;
+ return true;
+
+#undef return_err
+}
+
+// -----------------------------------------------------------------------------
+// kdebug private structs
+// https://github.com/apple/darwin-xnu/blob/main/bsd/sys_private/kdebug_private.h
+// -----------------------------------------------------------------------------
+
+/*
+ * Ensure that both LP32 and LP64 variants of arm64 use the same kd_buf
+ * structure.
+ */
+#if defined(__arm64__)
+typedef uint64_t kd_buf_argtype;
+#else
+typedef uintptr_t kd_buf_argtype;
+#endif
+
+typedef struct {
+ uint64_t timestamp;
+ kd_buf_argtype arg1;
+ kd_buf_argtype arg2;
+ kd_buf_argtype arg3;
+ kd_buf_argtype arg4;
+ kd_buf_argtype arg5; /* the thread ID */
+ uint32_t debugid; /* see <sys/kdebug.h> */
+
+/*
+ * Ensure that both LP32 and LP64 variants of arm64 use the same kd_buf
+ * structure.
+ */
+#if defined(__LP64__) || defined(__arm64__)
+ uint32_t cpuid; /* cpu index, from 0 */
+ kd_buf_argtype unused;
+#endif
+} kd_buf;
+
+/* bits for the type field of kd_regtype */
+#define KDBG_CLASSTYPE 0x10000
+#define KDBG_SUBCLSTYPE 0x20000
+#define KDBG_RANGETYPE 0x40000
+#define KDBG_TYPENONE 0x80000
+#define KDBG_CKTYPES 0xF0000
+
+/* only trace at most 4 types of events, at the code granularity */
+#define KDBG_VALCHECK 0x00200000U
+
+typedef struct {
+ unsigned int type;
+ unsigned int value1;
+ unsigned int value2;
+ unsigned int value3;
+ unsigned int value4;
+} kd_regtype;
+
+typedef struct {
+ /* number of events that can fit in the buffers */
+ int nkdbufs;
+ /* set if trace is disabled */
+ int nolog;
+ /* kd_ctrl_page.flags */
+ unsigned int flags;
+ /* number of threads in thread map */
+ int nkdthreads;
+ /* the owning pid */
+ int bufid;
+} kbufinfo_t;
+
+// -----------------------------------------------------------------------------
+// kdebug utils
+// -----------------------------------------------------------------------------
+
+/// Clean up trace buffers and reset ktrace/kdebug/kperf.
+/// @return 0 on success.
+static int kdebug_reset(void) {
+ int mib[3] = {CTL_KERN, KERN_KDEBUG, KERN_KDREMOVE};
+ return sysctl(mib, 3, NULL, NULL, NULL, 0);
+}
+
+/// Disable and reinitialize the trace buffers.
+/// @return 0 on success.
+static int kdebug_reinit(void) {
+ int mib[3] = {CTL_KERN, KERN_KDEBUG, KERN_KDSETUP};
+ return sysctl(mib, 3, NULL, NULL, NULL, 0);
+}
+
+/// Set debug filter.
+static int kdebug_setreg(kd_regtype *kdr) {
+ int mib[3] = {CTL_KERN, KERN_KDEBUG, KERN_KDSETREG};
+ usize size = sizeof(kd_regtype);
+ return sysctl(mib, 3, kdr, &size, NULL, 0);
+}
+
+/// Set maximum number of trace entries (kd_buf).
+/// Only allow allocation up to half the available memory (sane_size).
+/// @return 0 on success.
+static int kdebug_trace_setbuf(int nbufs) {
+ int mib[4] = {CTL_KERN, KERN_KDEBUG, KERN_KDSETBUF, nbufs};
+ return sysctl(mib, 4, NULL, NULL, NULL, 0);
+}
+
+/// Enable or disable kdebug trace.
+/// Trace buffer must already be initialized.
+/// @return 0 on success.
+static int kdebug_trace_enable(bool enable) {
+ int mib[4] = {CTL_KERN, KERN_KDEBUG, KERN_KDENABLE, enable};
+ return sysctl(mib, 4, NULL, 0, NULL, 0);
+}
+
+/// Retrieve trace buffer information from kernel.
+/// @return 0 on success.
+static int kdebug_get_bufinfo(kbufinfo_t *info) {
+ if (!info)
+ return -1;
+ int mib[3] = {CTL_KERN, KERN_KDEBUG, KERN_KDGETBUF};
+ size_t needed = sizeof(kbufinfo_t);
+ return sysctl(mib, 3, info, &needed, NULL, 0);
+}
+
+/// Retrieve trace buffers from kernel.
+/// @param buf Memory to receive buffer data, array of `kd_buf`.
+/// @param len Length of `buf` in bytes.
+/// @param count Number of trace entries (kd_buf) obtained.
+/// @return 0 on success.
+static int kdebug_trace_read(void *buf, usize len, usize *count) {
+ if (count)
+ *count = 0;
+ if (!buf || !len)
+ return -1;
+
+ // Note: the input and output units are not the same.
+ // input: bytes
+ // output: number of kd_buf
+ int mib[3] = {CTL_KERN, KERN_KDEBUG, KERN_KDREADTR};
+ int ret = sysctl(mib, 3, buf, &len, NULL, 0);
+ if (ret != 0)
+ return ret;
+ *count = len;
+ return 0;
+}
+
+/// Block until there are new buffers filled or `timeout_ms` have passed.
+/// @param timeout_ms timeout milliseconds, 0 means wait forever.
+/// @param suc set true if new buffers filled.
+/// @return 0 on success.
+static int kdebug_wait(usize timeout_ms, bool *suc) {
+ if (timeout_ms == 0)
+ return -1;
+ int mib[3] = {CTL_KERN, KERN_KDEBUG, KERN_KDBUFWAIT};
+ usize val = timeout_ms;
+ int ret = sysctl(mib, 3, NULL, &val, NULL, 0);
+ if (suc)
+ *suc = !!val;
+ return ret;
+}
+
+// -----------------------------------------------------------------------------
+// Demo
+// -----------------------------------------------------------------------------
+
+#define EVENT_NAME_MAX 8
+typedef struct {
+ const char *alias; /// name for print
+ const char *names[EVENT_NAME_MAX]; /// name from pmc db
+} event_alias;
+
+/// Event names from /usr/share/kpep/<name>.plist
+static const event_alias profile_events[] = {
+ {"cycles",
+ {
+ "FIXED_CYCLES", // Apple A7-A15//CORE_ACTIVE_CYCLE
+ "CPU_CLK_UNHALTED.THREAD", // Intel Core 1th-10th
+ "CPU_CLK_UNHALTED.CORE", // Intel Yonah, Merom
+ }},
+ {"instructions",
+ {
+ "FIXED_INSTRUCTIONS", // Apple A7-A15
+ "INST_RETIRED.ANY" // Intel Yonah, Merom, Core 1th-10th
+ }},
+ {"branches",
+ {
+ "INST_BRANCH", // Apple A7-A15
+ "BR_INST_RETIRED.ALL_BRANCHES", // Intel Core 1th-10th
+ "INST_RETIRED.ANY", // Intel Yonah, Merom
+ }},
+ {"branch-misses",
+ {
+ "BRANCH_MISPRED_NONSPEC", // Apple A7-A15, since iOS 15, macOS 12
+ "BRANCH_MISPREDICT", // Apple A7-A14
+ "BR_MISP_RETIRED.ALL_BRANCHES", // Intel Core 2th-10th
+ "BR_INST_RETIRED.MISPRED", // Intel Yonah, Merom
+ }},
+};
+
+static kpep_event *get_event(kpep_db *db, const event_alias *alias) {
+ for (usize j = 0; j < EVENT_NAME_MAX; j++) {
+ const char *name = alias->names[j];
+ if (!name)
+ break;
+ kpep_event *ev = NULL;
+ if (kpep_db_event(db, name, &ev) == 0) {
+ return ev;
+ }
+ }
+ return NULL;
+}
+
+kpc_config_t regs[KPC_MAX_COUNTERS] = {0};
+usize counter_map[KPC_MAX_COUNTERS] = {0};
+u64 counters_0[KPC_MAX_COUNTERS] = {0};
+u64 counters_1[KPC_MAX_COUNTERS] = {0};
+const usize ev_count = sizeof(profile_events) / sizeof(profile_events[0]);
+
+
+bool setup_performance_counters() {
+ static bool init = false;
+ static bool worked = false;
+
+ if (init) {
+ return worked;
+ }
+ init = true;
+
+ // load dylib
+ if (!lib_init()) {
+ printf("Error: %s\n", lib_err_msg);
+ return (worked = false);
+ }
+
+ // check permission
+ int force_ctrs = 0;
+ if (kpc_force_all_ctrs_get(&force_ctrs)) {
+ //printf("Permission denied, xnu/kpc requires root privileges.\n");
+ return (worked = false);
+ }
+ int ret;
+ // load pmc db
+ kpep_db *db = NULL;
+ if ((ret = kpep_db_create(NULL, &db))) {
+ printf("Error: cannot load pmc database: %d.\n", ret);
+ return (worked = false);
+ }
+ printf("loaded db: %s (%s)\n", db->name, db->marketing_name);
+
+ // create a config
+ kpep_config *cfg = NULL;
+ if ((ret = kpep_config_create(db, &cfg))) {
+ printf("Failed to create kpep config: %d (%s).\n", ret,
+ kpep_config_error_desc(ret));
+ return (worked = false);
+ }
+ if ((ret = kpep_config_force_counters(cfg))) {
+ printf("Failed to force counters: %d (%s).\n", ret,
+ kpep_config_error_desc(ret));
+ return (worked = false);
+ }
+
+ // get events
+ kpep_event *ev_arr[ev_count] = {0};
+ for (usize i = 0; i < ev_count; i++) {
+ const event_alias *alias = profile_events + i;
+ ev_arr[i] = get_event(db, alias);
+ if (!ev_arr[i]) {
+ printf("Cannot find event: %s.\n", alias->alias);
+ return (worked = false);
+ }
+ }
+
+ // add event to config
+ for (usize i = 0; i < ev_count; i++) {
+ kpep_event *ev = ev_arr[i];
+ if ((ret = kpep_config_add_event(cfg, &ev, 0, NULL))) {
+ printf("Failed to add event: %d (%s).\n", ret,
+ kpep_config_error_desc(ret));
+ return (worked = false);
+ }
+ }
+
+ // prepare buffer and config
+ u32 classes = 0;
+ usize reg_count = 0;
+ if ((ret = kpep_config_kpc_classes(cfg, &classes))) {
+ printf("Failed get kpc classes: %d (%s).\n", ret,
+ kpep_config_error_desc(ret));
+ return (worked = false);
+ }
+  if ((ret = kpep_config_kpc_count(cfg, &reg_count))) {
+ printf("Failed get kpc count: %d (%s).\n", ret,
+ kpep_config_error_desc(ret));
+ return (worked = false);
+ }
+ if ((ret = kpep_config_kpc_map(cfg, counter_map, sizeof(counter_map)))) {
+ printf("Failed get kpc map: %d (%s).\n", ret, kpep_config_error_desc(ret));
+ return (worked = false);
+ }
+ if ((ret = kpep_config_kpc(cfg, regs, sizeof(regs)))) {
+ printf("Failed get kpc registers: %d (%s).\n", ret,
+ kpep_config_error_desc(ret));
+ return (worked = false);
+ }
+
+ // set config to kernel
+ if ((ret = kpc_force_all_ctrs_set(1))) {
+ printf("Failed force all ctrs: %d.\n", ret);
+ return (worked = false);
+ }
+ if ((classes & KPC_CLASS_CONFIGURABLE_MASK) && reg_count) {
+ if ((ret = kpc_set_config(classes, regs))) {
+ printf("Failed set kpc config: %d.\n", ret);
+ return (worked = false);
+ }
+ }
+
+ // start counting
+ if ((ret = kpc_set_counting(classes))) {
+ printf("Failed set counting: %d.\n", ret);
+ return (worked = false);
+ }
+ if ((ret = kpc_set_thread_counting(classes))) {
+ printf("Failed set thread counting: %d.\n", ret);
+ return (worked = false);
+ }
+
+ return (worked = true);
+}
+
+inline performance_counters get_counters() {
+ static bool warned = false;
+ int ret;
+ // get counters before
+ if ((ret = kpc_get_thread_counters(0, KPC_MAX_COUNTERS, counters_0))) {
+ if (!warned) {
+
+ printf("Failed get thread counters before: %d.\n", ret);
+ warned = true;
+ }
+ return 1;
+ }
+ /*printf("counters value:\n");
+ for (usize i = 0; i < ev_count; i++) {
+ const event_alias *alias = profile_events + i;
+ usize idx = counter_map[i];
+ u64 val = counters_1[idx] - counters_0[idx];
+ printf("%14s: %llu\n", alias->alias, val);
+ }*/
+ return performance_counters{
+ counters_0[counter_map[0]], counters_0[counter_map[2]],
+ counters_0[counter_map[3]],
+ counters_0[counter_map[1]]};
+}
+
+#endif
diff --git a/benchmarks/benchmark.cpp b/benchmarks/benchmark.cpp
new file mode 100644
index 0000000..c6b091f
--- /dev/null
+++ b/benchmarks/benchmark.cpp
@@ -0,0 +1,247 @@
+#if defined(__linux__) || (__APPLE__ && __aarch64__)
+#define USING_COUNTERS
+#endif
+#include "event_counter.h"
+#include <algorithm>
+#include "fast_float/fast_float.h"
+#include <chrono>
+#include <cfloat> // DBL_MAX
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <ctype.h>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <random>
+#include <sstream>
+#include <stdio.h>
+#include <string>
+#include <vector>
+#include <locale.h>
+
+
+template <typename CharT>
+double findmax_fastfloat64(std::vector<std::basic_string<CharT>> &s) {
+ double answer = 0;
+ double x = 0;
+ for (auto &st : s) {
+ auto [p, ec] = fast_float::from_chars(st.data(), st.data() + st.size(), x);
+ if (p == st.data()) {
+ throw std::runtime_error("bug in findmax_fastfloat");
+ }
+ answer = answer > x ? answer : x;
+ }
+ return answer;
+}
+
+template <typename CharT>
+double findmax_fastfloat32(std::vector<std::basic_string<CharT>> &s) {
+ float answer = 0;
+ float x = 0;
+ for (auto &st : s) {
+ auto [p, ec] = fast_float::from_chars(st.data(), st.data() + st.size(), x);
+ if (p == st.data()) {
+ throw std::runtime_error("bug in findmax_fastfloat");
+ }
+ answer = answer > x ? answer : x;
+ }
+ return answer;
+}
+
+event_collector collector{};
+
+#ifdef USING_COUNTERS
+template <class T, class CharT>
+std::vector<event_count> time_it_ns(std::vector<std::basic_string<CharT>> &lines,
+ T const &function, size_t repeat) {
+ std::vector<event_count> aggregate;
+ bool printed_bug = false;
+ for (size_t i = 0; i < repeat; i++) {
+ collector.start();
+ double ts = function(lines);
+ if (ts == 0 && !printed_bug) {
+ printf("bug\n");
+ printed_bug = true;
+ }
+ aggregate.push_back(collector.end());
+ }
+ return aggregate;
+}
+
+void pretty_print(double volume, size_t number_of_floats, std::string name, std::vector<event_count> events) {
+ double volumeMB = volume / (1024. * 1024.);
+ double average_ns{0};
+ double min_ns{DBL_MAX};
+ double cycles_min{DBL_MAX};
+ double instructions_min{DBL_MAX};
+ double cycles_avg{0};
+ double instructions_avg{0};
+ double branches_min{0};
+ double branches_avg{0};
+ double branch_misses_min{0};
+ double branch_misses_avg{0};
+ for(event_count e : events) {
+ double ns = e.elapsed_ns();
+ average_ns += ns;
+ min_ns = min_ns < ns ? min_ns : ns;
+
+ double cycles = e.cycles();
+ cycles_avg += cycles;
+ cycles_min = cycles_min < cycles ? cycles_min : cycles;
+
+ double instructions = e.instructions();
+ instructions_avg += instructions;
+ instructions_min = instructions_min < instructions ? instructions_min : instructions;
+
+ double branches = e.branches();
+ branches_avg += branches;
+ branches_min = branches_min < branches ? branches_min : branches;
+
+ double branch_misses = e.missed_branches();
+ branch_misses_avg += branch_misses;
+ branch_misses_min = branch_misses_min < branch_misses ? branch_misses_min : branch_misses;
+ }
+ cycles_avg /= events.size();
+ instructions_avg /= events.size();
+ average_ns /= events.size();
+ branches_avg /= events.size();
+ printf("%-40s: %8.2f MB/s (+/- %.1f %%) ", name.data(),
+ volumeMB * 1000000000 / min_ns,
+ (average_ns - min_ns) * 100.0 / average_ns);
+ printf("%8.2f Mfloat/s ",
+ number_of_floats * 1000 / min_ns);
+ if(instructions_min > 0) {
+ printf(" %8.2f i/B %8.2f i/f (+/- %.1f %%) ",
+ instructions_min / volume,
+ instructions_min / number_of_floats,
+ (instructions_avg - instructions_min) * 100.0 / instructions_avg);
+
+ printf(" %8.2f c/B %8.2f c/f (+/- %.1f %%) ",
+ cycles_min / volume,
+ cycles_min / number_of_floats,
+ (cycles_avg - cycles_min) * 100.0 / cycles_avg);
+ printf(" %8.2f i/c ",
+ instructions_min /cycles_min);
+ printf(" %8.2f b/f ",
+ branches_avg /number_of_floats);
+ printf(" %8.2f bm/f ",
+ branch_misses_avg /number_of_floats);
+ printf(" %8.2f GHz ",
+ cycles_min / min_ns);
+ }
+ printf("\n");
+
+}
+#else
+template <class T, class CharT>
+std::pair<double, double> time_it_ns(std::vector<std::basic_string<CharT>> &lines,
+ T const &function, size_t repeat) {
+ std::chrono::high_resolution_clock::time_point t1, t2;
+ double average = 0;
+ double min_value = DBL_MAX;
+ bool printed_bug = false;
+ for (size_t i = 0; i < repeat; i++) {
+ t1 = std::chrono::high_resolution_clock::now();
+ double ts = function(lines);
+ if (ts == 0 && !printed_bug) {
+ printf("bug\n");
+ printed_bug = true;
+ }
+ t2 = std::chrono::high_resolution_clock::now();
+ double dif =
+ std::chrono::duration_cast<std::chrono::nanoseconds>(t2 - t1).count();
+ average += dif;
+ min_value = min_value < dif ? min_value : dif;
+ }
+ average /= repeat;
+ return std::make_pair(min_value, average);
+}
+
+
+
+
+void pretty_print(double volume, size_t number_of_floats, std::string name, std::pair<double,double> result) {
+ double volumeMB = volume / (1024. * 1024.);
+ printf("%-40s: %8.2f MB/s (+/- %.1f %%) ", name.data(),
+ volumeMB * 1000000000 / result.first,
+ (result.second - result.first) * 100.0 / result.second);
+ printf("%8.2f Mfloat/s ",
+ number_of_floats * 1000 / result.first);
+ printf(" %8.2f ns/f \n",
+ double(result.first) /number_of_floats );
+}
+#endif
+
+
+// this is okay, all chars are ASCII
+inline std::u16string widen(std::string line) {
+ std::u16string u16line;
+ u16line.resize(line.size());
+ for (size_t i = 0; i < line.size(); ++i) {
+ u16line[i] = char16_t(line[i]);
+ }
+ return u16line;
+}
+
+std::vector<std::u16string> widen(const std::vector<std::string> &lines) {
+ std::vector<std::u16string> u16lines;
+ u16lines.reserve(lines.size());
+ for (auto const &line : lines) {
+ u16lines.push_back(widen(line));
+ }
+ return u16lines;
+}
+
+
+void process(std::vector<std::string> &lines, size_t volume) {
+ size_t repeat = 100;
+ double volumeMB = volume / (1024. * 1024.);
+ std::cout << "ASCII volume = " << volumeMB << " MB " << std::endl;
+ pretty_print(volume, lines.size(), "fastfloat (64)", time_it_ns(lines, findmax_fastfloat64<char>, repeat));
+ pretty_print(volume, lines.size(), "fastfloat (32)", time_it_ns(lines, findmax_fastfloat32<char>, repeat));
+
+ std::vector<std::u16string> lines16 = widen(lines);
+ volume = 2 * volume;
+ volumeMB = volume / (1024. * 1024.);
+ std::cout << "UTF-16 volume = " << volumeMB << " MB " << std::endl;
+ pretty_print(volume, lines.size(), "fastfloat (64)", time_it_ns(lines16, findmax_fastfloat64<char16_t>, repeat));
+ pretty_print(volume, lines.size(), "fastfloat (32)", time_it_ns(lines16, findmax_fastfloat32<char16_t>, repeat));
+
+}
+
+void fileload(std::string filename) {
+ std::ifstream inputfile(filename);
+ if (!inputfile) {
+ std::cerr << "can't open " << filename << std::endl;
+ return;
+ }
+ std::cout << "#### " << std::endl;
+ std::cout << "# reading " << filename << std::endl;
+ std::cout << "#### " << std::endl;
+ std::string line;
+ std::vector<std::string> lines;
+ lines.reserve(10000); // let us reserve plenty of memory.
+ size_t volume = 0;
+ while (getline(inputfile, line)) {
+ volume += line.size();
+ lines.push_back(line);
+ }
+ std::cout << "# read " << lines.size() << " lines " << std::endl;
+ process(lines, volume);
+}
+
+
+int main(int argc, char **argv) {
+ if(collector.has_events()) {
+ std::cout << "# Using hardware counters" << std::endl;
+ } else {
+#if defined(__linux__) || (__APPLE__ && __aarch64__)
+ std::cout << "# Hardware counters not available, try to run in privileged mode (e.g., sudo)." << std::endl;
+#endif
+ }
+ fileload(std::string(BENCHMARK_DATA_DIR) + "/canada.txt");
+ fileload(std::string(BENCHMARK_DATA_DIR) + "/mesh.txt");
+}
diff --git a/benchmarks/event_counter.h b/benchmarks/event_counter.h
new file mode 100644
index 0000000..fb6db3a
--- /dev/null
+++ b/benchmarks/event_counter.h
@@ -0,0 +1,152 @@
+#ifndef __EVENT_COUNTER_H
+#define __EVENT_COUNTER_H
+
+#include <cctype>
+#ifndef _MSC_VER
+#include <dirent.h>
+#endif
+#include <cinttypes>
+
+#include <cstring>
+
+#include <chrono>
+#include <vector>
+
+#include "linux-perf-events.h"
+#ifdef __linux__
+#include <libgen.h>
+#endif
+
+#if __APPLE__ && __aarch64__
+#include "apple_arm_events.h"
+#endif
+
+struct event_count {
+ std::chrono::duration<double> elapsed;
+ std::vector<unsigned long long> event_counts;
+ event_count() : elapsed(0), event_counts{0,0,0,0,0} {}
+ event_count(const std::chrono::duration<double> _elapsed, const std::vector<unsigned long long> _event_counts) : elapsed(_elapsed), event_counts(_event_counts) {}
+ event_count(const event_count& other): elapsed(other.elapsed), event_counts(other.event_counts) { }
+
+ // The types of counters (so we can read the getter more easily)
+ enum event_counter_types {
+ CPU_CYCLES = 0,
+ INSTRUCTIONS = 1,
+ BRANCHES = 2,
+ MISSED_BRANCHES = 3
+ };
+
+ double elapsed_sec() const { return std::chrono::duration<double>(elapsed).count(); }
+ double elapsed_ns() const { return std::chrono::duration<double, std::nano>(elapsed).count(); }
+ double cycles() const { return static_cast<double>(event_counts[CPU_CYCLES]); }
+ double instructions() const { return static_cast<double>(event_counts[INSTRUCTIONS]); }
+ double branches() const { return static_cast<double>(event_counts[BRANCHES]); }
+ double missed_branches() const { return static_cast<double>(event_counts[MISSED_BRANCHES]); }
+
+ event_count& operator=(const event_count& other) {
+ this->elapsed = other.elapsed;
+ this->event_counts = other.event_counts;
+ return *this;
+ }
+ event_count operator+(const event_count& other) const {
+ return event_count(elapsed+other.elapsed, {
+ event_counts[0]+other.event_counts[0],
+ event_counts[1]+other.event_counts[1],
+ event_counts[2]+other.event_counts[2],
+ event_counts[3]+other.event_counts[3],
+ event_counts[4]+other.event_counts[4],
+ });
+ }
+
+ void operator+=(const event_count& other) {
+ *this = *this + other;
+ }
+};
+
+struct event_aggregate {
+ bool has_events = false;
+ int iterations = 0;
+ event_count total{};
+ event_count best{};
+ event_count worst{};
+
+ event_aggregate() = default;
+
+ void operator<<(const event_count& other) {
+ if (iterations == 0 || other.elapsed < best.elapsed) {
+ best = other;
+ }
+ if (iterations == 0 || other.elapsed > worst.elapsed) {
+ worst = other;
+ }
+ iterations++;
+ total += other;
+ }
+
+ double elapsed_sec() const { return total.elapsed_sec() / iterations; }
+ double elapsed_ns() const { return total.elapsed_ns() / iterations; }
+ double cycles() const { return total.cycles() / iterations; }
+ double instructions() const { return total.instructions() / iterations; }
+ double branches() const { return total.branches() / iterations; }
+ double missed_branches() const { return total.missed_branches() / iterations; }
+};
+
+struct event_collector {
+ event_count count{};
+ std::chrono::time_point<std::chrono::steady_clock> start_clock{};
+
+#if defined(__linux__)
+ LinuxEvents<PERF_TYPE_HARDWARE> linux_events;
+ event_collector() : linux_events(std::vector<int>{
+ PERF_COUNT_HW_CPU_CYCLES,
+ PERF_COUNT_HW_INSTRUCTIONS,
+ PERF_COUNT_HW_BRANCH_INSTRUCTIONS, // Retired branch instructions
+ PERF_COUNT_HW_BRANCH_MISSES
+ }) {}
+ bool has_events() {
+ return linux_events.is_working();
+ }
+#elif __APPLE__ && __aarch64__
+ performance_counters diff;
+ event_collector() : diff(0) {
+ setup_performance_counters();
+ }
+ bool has_events() {
+ return setup_performance_counters();
+ }
+#else
+ event_collector() {}
+ bool has_events() {
+ return false;
+ }
+#endif
+
+ inline void start() {
+#if defined(__linux__)
+ linux_events.start();
+#elif __APPLE__ && __aarch64__
+ if(has_events()) { diff = get_counters(); }
+#endif
+ start_clock = std::chrono::steady_clock::now();
+ }
+ inline event_count& end() {
+ const auto end_clock = std::chrono::steady_clock::now();
+#if defined(__linux__)
+ linux_events.end(count.event_counts);
+#elif __APPLE__ && __aarch64__
+ if(has_events()) {
+ performance_counters end = get_counters();
+ diff = end - diff;
+ }
+ count.event_counts[0] = diff.cycles;
+ count.event_counts[1] = diff.instructions;
+ count.event_counts[2] = diff.branches;
+ count.event_counts[3] = diff.missed_branches;
+ count.event_counts[4] = 0;
+#endif
+ count.elapsed = end_clock - start_clock;
+ return count;
+ }
+};
+
+#endif
diff --git a/benchmarks/linux-perf-events.h b/benchmarks/linux-perf-events.h
new file mode 100644
index 0000000..73cfbaf
--- /dev/null
+++ b/benchmarks/linux-perf-events.h
@@ -0,0 +1,103 @@
+#pragma once
+#ifdef __linux__
+
+#include <asm/unistd.h> // for __NR_perf_event_open
+#include <linux/perf_event.h> // for perf event constants
+#include <sys/ioctl.h> // for ioctl
+#include <unistd.h> // for syscall
+
+#include <cerrno> // for errno
+#include <cstring> // for memset
+#include <stdexcept>
+
+#include <iostream>
+#include <vector>
+
+template <int TYPE = PERF_TYPE_HARDWARE> class LinuxEvents {
+ int fd;
+ bool working;
+ perf_event_attr attribs{};
+ size_t num_events{};
+ std::vector<uint64_t> temp_result_vec{};
+ std::vector<uint64_t> ids{};
+
+public:
+  explicit LinuxEvents(std::vector<int> config_vec) : fd(-1), working(true) {
+ memset(&attribs, 0, sizeof(attribs));
+ attribs.type = TYPE;
+ attribs.size = sizeof(attribs);
+ attribs.disabled = 1;
+ attribs.exclude_kernel = 1;
+ attribs.exclude_hv = 1;
+
+ attribs.sample_period = 0;
+ attribs.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;
+ const int pid = 0; // the current process
+ const int cpu = -1; // all CPUs
+ const unsigned long flags = 0;
+
+ int group = -1; // no group
+ num_events = config_vec.size();
+ ids.resize(config_vec.size());
+ uint32_t i = 0;
+ for (auto config : config_vec) {
+ attribs.config = config;
+ int _fd = static_cast<int>(syscall(__NR_perf_event_open, &attribs, pid, cpu, group, flags));
+ if (_fd == -1) {
+ report_error("perf_event_open");
+ }
+ ioctl(_fd, PERF_EVENT_IOC_ID, &ids[i++]);
+ if (group == -1) {
+ group = _fd;
+ fd = _fd;
+ }
+ }
+
+ temp_result_vec.resize(num_events * 2 + 1);
+ }
+
+ ~LinuxEvents() { if (fd != -1) { close(fd); } }
+
+ inline void start() {
+ if (fd != -1) {
+ if (ioctl(fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP) == -1) {
+ report_error("ioctl(PERF_EVENT_IOC_RESET)");
+ }
+
+ if (ioctl(fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) == -1) {
+ report_error("ioctl(PERF_EVENT_IOC_ENABLE)");
+ }
+ }
+ }
+
+ inline void end(std::vector<unsigned long long> &results) {
+ if (fd != -1) {
+ if (ioctl(fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP) == -1) {
+ report_error("ioctl(PERF_EVENT_IOC_DISABLE)");
+ }
+
+ if (read(fd, temp_result_vec.data(), temp_result_vec.size() * 8) == -1) {
+ report_error("read");
+ }
+ }
+ // our actual results are in slots 1,3,5, ... of this structure
+ for (uint32_t i = 1; i < temp_result_vec.size(); i += 2) {
+ results[i / 2] = temp_result_vec[i];
+ }
+ for (uint32_t i = 2; i < temp_result_vec.size(); i += 2) {
+ if(ids[i/2-1] != temp_result_vec[i]) {
+ report_error("event mismatch");
+ }
+ }
+ }
+
+ bool is_working() {
+ return working;
+ }
+
+private:
+ void report_error(const std::string &) {
+ working = false;
+ }
+};
+#endif
\ No newline at end of file