)]}'
{
  "log": [
    {
      "commit": "46394a438c056d90f8f18c3f8775febd8a3b6407",
      "tree": "c4ef4791760cecac4abba26d0a5bd52d0921a660",
      "parents": [
        "078291a7dc5858fb80ad9f5f7fad721f52b95bd1"
      ],
      "author": {
        "name": "Frank Barchard",
        "email": "fbarchard@google.com",
        "time": "Sat May 30 01:31:51 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Sat May 30 01:32:31 2026"
      },
      "message": "Simplified fix for warnings in update-microkernels.py by recursively ignoring subdirectories of ignored roots.\n\nPiperOrigin-RevId: 923698983\n"
    },
    {
      "commit": "078291a7dc5858fb80ad9f5f7fad721f52b95bd1",
      "tree": "5dd328c7c3e7931a1430f019a596bf487a08e958",
      "parents": [
        "d34f52c6c7962687092c1ad4ccc7a93693510a92"
      ],
      "author": {
        "name": "Dillon Sharlet",
        "email": "dsharlet@google.com",
        "time": "Fri May 29 23:58:03 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Fri May 29 23:58:47 2026"
      },
      "message": "Fix overzealous assert\n\nPiperOrigin-RevId: 923668444\n"
    },
    {
      "commit": "d34f52c6c7962687092c1ad4ccc7a93693510a92",
      "tree": "3cfd679823063e8f77cf06892bd3ebee0cabad8e",
      "parents": [
        "5d756cd83d5462805f5a78436271a18ecf2f7af5"
      ],
      "author": {
        "name": "Dillon Sharlet",
        "email": "dsharlet@google.com",
        "time": "Fri May 29 19:32:42 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Fri May 29 19:33:28 2026"
      },
      "message": "Update KleidiAI in XNNPACK\n\nPiperOrigin-RevId: 923547940\n"
    },
    {
      "commit": "5d756cd83d5462805f5a78436271a18ecf2f7af5",
      "tree": "48ec49ebbc944ebae553b8fce64d1fa5910cbe59",
      "parents": [
        "76228ba65ee27cb9bb39805e3acfa8ad413ed4e4"
      ],
      "author": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Fri May 29 19:08:44 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Fri May 29 19:10:36 2026"
      },
      "message": "Add approx_tanh operator support behind YNN_FLAG_FAST_MATH.\n\nPiperOrigin-RevId: 923536167\n"
    },
    {
      "commit": "76228ba65ee27cb9bb39805e3acfa8ad413ed4e4",
      "tree": "07b0e7bc8a97f2a490dab608e4dc0a95757ef5b9",
      "parents": [
        "58c0a526396203d347fbebec7983f4121d788104"
      ],
      "author": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Fri May 29 17:17:49 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Fri May 29 17:18:29 2026"
      },
      "message": "Fix NaN handling\n\nPiperOrigin-RevId: 923479160\n"
    },
    {
      "commit": "58c0a526396203d347fbebec7983f4121d788104",
      "tree": "d83137853f88bf7b4b2b824f4c8fc8a7196a9de0",
      "parents": [
        "d89bf2f383ea5005b4dee58d45079ae5c9a22ef8"
      ],
      "author": {
        "name": "Quentin Khan",
        "email": "qkhan@google.com",
        "time": "Fri May 29 15:45:24 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Fri May 29 15:46:56 2026"
      },
      "message": "Remove RTTI from the Tensor API. Refactor operation handling in the graph.\n\nRemove virtual inheritance from the operation implementations and stop using\nside-casting using `dynamic_cast` to find out if an operation is implemented by\na backend.\n\nAdd a list of \"graph::BackendExtension\" elements that can be grafted onto an\noperation. Backends implementation of every operation are now implemented by\nderiving from this new class. When lowering to a specific backend, we go\nthrough the list of extensions attached to an operation to find a matching\nbackend.\n\nBackend identification is done by calling `BackendExtension::GetTypeId()`.\n\nAs a result of the above changes, operation node implementations don\u0027t need to\nbe defined as templates anymore.\n\nPiperOrigin-RevId: 923435766\n"
    },
    {
      "commit": "d89bf2f383ea5005b4dee58d45079ae5c9a22ef8",
      "tree": "acad2c17b5203f41ab9b26f672fa4a7a00451958",
      "parents": [
        "a74b04841bd6b751833e0eab805b079b757d1a07"
      ],
      "author": {
        "name": "Dillon Sharlet",
        "email": "dsharlet@google.com",
        "time": "Fri May 29 00:33:25 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Fri May 29 00:34:07 2026"
      },
      "message": "Don\u0027t allow broadcasts as the first dimension of dot inputs\n\nThis change requires that the stride of the first dimension of dot inputs is 1 element, which forces broadcasts to be realized into memory before running the dot.\n\nBroadcasting is a weird thing to do to a dot input, but some tests do it, and it would be easiest to just handle it than find a workaround.\n\nPiperOrigin-RevId: 923066551\n"
    },
    {
      "commit": "a74b04841bd6b751833e0eab805b079b757d1a07",
      "tree": "f021cd041631b5f6875b2b7baceeca4b26ec808e",
      "parents": [
        "23ba0fbd097cafee7aa569b8daaa64f5918bdbf8"
      ],
      "author": {
        "name": "Dillon Sharlet",
        "email": "dsharlet@google.com",
        "time": "Thu May 28 22:05:46 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Thu May 28 22:06:31 2026"
      },
      "message": "Relax tolerance of sum reduce test\n\nShould address this rare flaky failure: https://github.com/google/XNNPACK/actions/runs/26599479177/job/78379097239\n\nPiperOrigin-RevId: 922996166\n"
    },
    {
      "commit": "23ba0fbd097cafee7aa569b8daaa64f5918bdbf8",
      "tree": "faaf91cbb240e6c491d44748d0e27d4dd9467d5a",
      "parents": [
        "0702200b644be75cd569ec5a750c93934f6f39db"
      ],
      "author": {
        "name": "Dillon Sharlet",
        "email": "dsharlet@google.com",
        "time": "Thu May 28 21:03:38 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Thu May 28 21:04:31 2026"
      },
      "message": "Remove `force_root` from `static_transpose` scheduling\n\nPiperOrigin-RevId: 922963746\n"
    },
    {
      "commit": "0702200b644be75cd569ec5a750c93934f6f39db",
      "tree": "9c253947b1516efd4550b5bbc0ec4c1eca7260be",
      "parents": [
        "bf8d96be1971a3a8a7b3c103d32098682ab20919"
      ],
      "author": {
        "name": "Dillon Sharlet",
        "email": "dsharlet@google.com",
        "time": "Thu May 28 20:35:42 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Thu May 28 20:36:36 2026"
      },
      "message": "Enable `YNN_FLAG_FAST_MATH` in XNNPACK compatibility layer\n\nPiperOrigin-RevId: 922948315\n"
    },
    {
      "commit": "bf8d96be1971a3a8a7b3c103d32098682ab20919",
      "tree": "87a51896b7b0c4ccfce8aa166b7d63633ae19c3e",
      "parents": [
        "1eb730046b14f68b8a51cf0f6e3c9d11964ca869"
      ],
      "author": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Thu May 28 20:02:42 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Thu May 28 20:03:34 2026"
      },
      "message": "Add YNN_FLAG_FAST_MATH and approx_erf operator support behind this flag.\n\nThis change introduces the `YNN_FLAG_FAST_MATH` flag and the corresponding `approx_erf` operator to restore performance for classic use cases that do not require high accuracy.\n\nPiperOrigin-RevId: 922930124\n"
    },
    {
      "commit": "1eb730046b14f68b8a51cf0f6e3c9d11964ca869",
      "tree": "81111a1245a982abbae70761b3fceeb2b2632256",
      "parents": [
        "9b4a49f0f0943f821bfe30ab9dfb91b10b1454f1"
      ],
      "author": {
        "name": "Marie White",
        "email": "mariewhite@google.com",
        "time": "Thu May 28 14:56:19 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Thu May 28 14:57:30 2026"
      },
      "message": "Do not create serial loops for k2, k3, ...\n\nIR before:\n```\nk2#0 \u003d loop(serial, [0, 268], 1) {\n   reduction \u003d crop_dim(reduction, 2, [k2#0, k2#0]) {\n    k1#0 \u003d loop(serial, [0, 3], 1) {\n     reduction \u003d crop_dim(reduction, 1, [k1#0, k1#0]) {\n      d0#0 \u003d loop(parallel, [buffer_min(out3, 0), buffer_max(out3, 0)], 128) {\n       closure {in0, in2, out3, v4, reduction, d0#0} in {\n        out3.d0#0 \u003d crop_dim(out3, 0, [d0#0, (d0#0 + 127)]) {\n         d1#0 \u003d loop(parallel, [buffer_min(out3.d0#0, 1), buffer_max(out3.d0#0, 1)], 32) {\n          closure {in0, in2, v4, reduction, d1#0, out3.d0#0} in {\n           out3.d1#0 \u003d crop_dim(out3.d0#0, 1, [d1#0, (d1#0 + 31)]) {\n            call(dot num_k_dims\u003d3, {in0, v4, in2}, {out3.d1#0, reduction}, {})\n```\n\nIR After:\n```\n  call(pack_b, {in1}, {v4}, {})\n  reduction \u003d allocate(automatic, 0, {\n    {[0, 0], 0, \u003c\u003e},\n    {[0, 1], 0, \u003c\u003e},\n    {[0, 266], 0, \u003c\u003e}\n  }) {\n   call(dot num_k_dims\u003d3, {in0, v4, const2}, {out3, reduction}, {})\n  }\n```\n\nPiperOrigin-RevId: 922773683\n"
    },
    {
      "commit": "9b4a49f0f0943f821bfe30ab9dfb91b10b1454f1",
      "tree": "ac8da299b337af2be513e60d8abd13b93af13b41",
      "parents": [
        "05c9b9157a078f13cd6980a067cc6c7a2d72756f"
      ],
      "author": {
        "name": "Frank Barchard",
        "email": "fbarchard@google.com",
        "time": "Thu May 28 10:55:20 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Thu May 28 10:56:01 2026"
      },
      "message": "Fix warnings in update-microkernels.py by recursively ignoring subdirectories of ignored roots.\n\nPiperOrigin-RevId: 922678462\n"
    },
    {
      "commit": "05c9b9157a078f13cd6980a067cc6c7a2d72756f",
      "tree": "0acec9c7231b5c54ca2184abeaf96be99cbee685",
      "parents": [
        "e2ab35abf6a5107fa7603184d5f93eb806281742"
      ],
      "author": {
        "name": "Frank Barchard",
        "email": "fbarchard@google.com",
        "time": "Thu May 28 09:13:48 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Thu May 28 09:18:49 2026"
      },
      "message": "Remove fxdiv usages from XNNPACK, keeping it only for pthreadpool\n\nPiperOrigin-RevId: 922638743\n"
    },
    {
      "commit": "e2ab35abf6a5107fa7603184d5f93eb806281742",
      "tree": "48ed517b9bfb177dc4b97b39c2ea7801670da40b",
      "parents": [
        "5004f85c86eb6097b2650afabc6b59aa8d8e19be",
        "393da7ddec358bcaa2a1ec1b8e50634a010db4a9"
      ],
      "author": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Thu May 28 09:14:31 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Thu May 28 09:14:31 2026"
      },
      "message": "Merge pull request #10242 from ken-unger:f16-vlog-rvv\n\nPiperOrigin-RevId: 922638609\n"
    },
    {
      "commit": "5004f85c86eb6097b2650afabc6b59aa8d8e19be",
      "tree": "caec45224d7abe30adc69c02a8451c86a8515c88",
      "parents": [
        "01d254db8ac85612afe8c6721ab7a32b1488b6ab"
      ],
      "author": {
        "name": "Dillon Sharlet",
        "email": "dsharlet@google.com",
        "time": "Thu May 28 02:07:36 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Thu May 28 02:09:42 2026"
      },
      "message": "Rewrite `transpose(static_broadcast(x))` \u003d\u003e `static_broadcast(transpose(x))`\n\nPiperOrigin-RevId: 922472494\n"
    },
    {
      "commit": "01d254db8ac85612afe8c6721ab7a32b1488b6ab",
      "tree": "a92b40b638f046161812531462abb79047fda529",
      "parents": [
        "5fc47cc7a4fd0b699c8f1da58a4eb8ff7358e58f"
      ],
      "author": {
        "name": "Dillon Sharlet",
        "email": "dsharlet@google.com",
        "time": "Thu May 28 01:46:42 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Thu May 28 01:47:19 2026"
      },
      "message": "Remove `broadcast` op implementation\n\n`broadcast` can be represented by a transpose that removes a dimension, and adds a new dimension in its place.\n\nPiperOrigin-RevId: 922464606\n"
    },
    {
      "commit": "5fc47cc7a4fd0b699c8f1da58a4eb8ff7358e58f",
      "tree": "dbe26d20282264fc4825e1a2817ad11e169cf361",
      "parents": [
        "f9f2c229991dfaf187606ca5657e7ea3d0838947"
      ],
      "author": {
        "name": "Dillon Sharlet",
        "email": "dsharlet@google.com",
        "time": "Thu May 28 01:31:22 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Thu May 28 01:32:55 2026"
      },
      "message": "Fuse sequences of transpose(transpose(x)) into one transpose(x)\n\nPiperOrigin-RevId: 922459271\n"
    },
    {
      "commit": "f9f2c229991dfaf187606ca5657e7ea3d0838947",
      "tree": "684b72fd149e13ff82f6e1fcaf7fcea7494a59af",
      "parents": [
        "f1ab4551669c5552f095dadf1a46881594ec9589"
      ],
      "author": {
        "name": "Marie White",
        "email": "mariewhite@google.com",
        "time": "Wed May 27 22:35:32 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Wed May 27 22:36:07 2026"
      },
      "message": "Do not rely on tile_k when aligning split_k\n\ntile_k is dependent on the microkernel which depends on the CPU, which breaks numerical consistency across CPUs.\n\nPiperOrigin-RevId: 922383499\n"
    },
    {
      "commit": "f1ab4551669c5552f095dadf1a46881594ec9589",
      "tree": "89b9fdf84f6116dad26afcda32a83b003f69391e",
      "parents": [
        "d1da9a531ba33bfda32b5dc558460f42c98ed734"
      ],
      "author": {
        "name": "Dillon Sharlet",
        "email": "dsharlet@google.com",
        "time": "Wed May 27 21:16:10 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Wed May 27 21:17:28 2026"
      },
      "message": "Implement `static_expand_dims` using `static_transpose`\n\nFewer ops to support is nice, and this op is easier to simplify (see reshape -\u003e transpose rewrite for example).\n\nPiperOrigin-RevId: 922341041\n"
    },
    {
      "commit": "d1da9a531ba33bfda32b5dc558460f42c98ed734",
      "tree": "a0929b4398b68668ba6eeed67b8bf951fa22aac5",
      "parents": [
        "38eb8abda8a7a56c7828b51df92329ef06c49050"
      ],
      "author": {
        "name": "Dillon Sharlet",
        "email": "dsharlet@google.com",
        "time": "Wed May 27 15:30:30 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Wed May 27 15:31:38 2026"
      },
      "message": "Add transcendental ops for every x86 architecture\n\nThe current selection of which ISAs have kernels is now pretty stale. The first step in cleaning this up is to just add all of the kernels to every ISA.\n\nPiperOrigin-RevId: 922148772\n"
    },
    {
      "commit": "38eb8abda8a7a56c7828b51df92329ef06c49050",
      "tree": "b9f1a282d79635c8c27b4351a0d945189601439c",
      "parents": [
        "ac73c5b87399a1845a43368e109d9e1d4defe5c0"
      ],
      "author": {
        "name": "Quentin Khan",
        "email": "qkhan@google.com",
        "time": "Wed May 27 15:27:17 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Wed May 27 15:28:17 2026"
      },
      "message": "Remove RTTI from the Tensor API. Rework the `Quantization` hierarchy.\n\nInstances of `dynamic_cast` should be replaced with calls to\n`Quantization::As\u003cT\u003e()` to safely downcast to a concrete quantization type.\n\n`GetTypeId()` and `IsA()` have been implemented for the existing quantization\ntypes and let us find out the concrete type of a quantization object when it\u0027s\nkept as part of the graph and are used by `Quantization::As\u003cT\u003e()` to safely\ndowncast from a buffer.\n\nPiperOrigin-RevId: 922147064\n"
    },
    {
      "commit": "ac73c5b87399a1845a43368e109d9e1d4defe5c0",
      "tree": "505b1b6bb6cdacd95f7eb3603ffb5b2cf9267e35",
      "parents": [
        "43118f4fc34456a392a06a635ac5d85ccca2f874"
      ],
      "author": {
        "name": "Quentin Khan",
        "email": "qkhan@google.com",
        "time": "Wed May 27 14:59:19 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Wed May 27 15:00:37 2026"
      },
      "message": "Remove RTTI from the Tensor API. Rework the `Buffer` hierarchy.\n\nThis change removes virtual inheritance from the `Buffer` hierarchy.\n`MutableBuffer` is removed and `LockMutable()` is merged into `Buffer`.\n\nInstances of `dynamic_cast` on buffers should now use `Buffer::As\u003cT\u003e()`. This\nis also safer as it forces users to check that the cast was performed\nsuccessfully.\n\n`GetTypeId()` and `IsA()` have been implemented for the existing buffer types\nand let us retrieve the concrete type of a buffer when it\u0027s kept as part of the\ngraph and are used by `Buffer::As\u003cT\u003e()` to safely downcast from a buffer.\n\nPiperOrigin-RevId: 922132824\n"
    },
    {
      "commit": "43118f4fc34456a392a06a635ac5d85ccca2f874",
      "tree": "d9b6a0419604ad8f2cdf182ab9cd4c3b62bc6228",
      "parents": [
        "30e1a98b4c6a32afa28641758a9f53777b2129c5"
      ],
      "author": {
        "name": "Quentin Khan",
        "email": "qkhan@google.com",
        "time": "Wed May 27 14:36:14 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Wed May 27 14:36:51 2026"
      },
      "message": "Remove RTTI from the Tensor API. Introduce `TypeId` class.\n\n`TypeId` will be used to identify the concrete type of type erased elements in\nthe Tensor API graph.\n\nThis change is part of a series of changes that aim to make the Tensor API\ncompile when RTTI is disabled.\n\nPiperOrigin-RevId: 922122531\n"
    },
    {
      "commit": "30e1a98b4c6a32afa28641758a9f53777b2129c5",
      "tree": "9801c06bfc60efb3e6a6a98a0b0db0ac2a90c562",
      "parents": [
        "f9ee799919bd14b18677dfce901285e3c3b84297"
      ],
      "author": {
        "name": "Frank Barchard",
        "email": "fbarchard@google.com",
        "time": "Wed May 27 02:58:38 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Wed May 27 02:59:31 2026"
      },
      "message": "f16-vtanh using high-accuracy rational polynomial implementation.\n\nPiperOrigin-RevId: 921833633\n"
    },
    {
      "commit": "f9ee799919bd14b18677dfce901285e3c3b84297",
      "tree": "bcd3ec5b804a4ec1266c9e4e7876dbd5ffd9bdfc",
      "parents": [
        "134e8ef6cd903f75f6ce7dcfcfddc8e5c271d82e"
      ],
      "author": {
        "name": "Frank Barchard",
        "email": "fbarchard@google.com",
        "time": "Wed May 27 02:31:06 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Wed May 27 02:31:48 2026"
      },
      "message": "Add indirection_bench to test performance of indirection init\n\nPiperOrigin-RevId: 921822572\n"
    },
    {
      "commit": "134e8ef6cd903f75f6ce7dcfcfddc8e5c271d82e",
      "tree": "78f0b9813677c69d4f2fa5d196cb7f0c0b26e740",
      "parents": [
        "dddad07f84631ef6c8fe8ecb5e879c866d3538bb"
      ],
      "author": {
        "name": "Dillon Sharlet",
        "email": "dsharlet@google.com",
        "time": "Wed May 27 01:36:15 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Wed May 27 01:37:11 2026"
      },
      "message": "Fix test timeouts on emulators\n\nThe pi_test does pretty big reductions, not suitable for use on emulators.\n\nPiperOrigin-RevId: 921803185\n"
    },
    {
      "commit": "dddad07f84631ef6c8fe8ecb5e879c866d3538bb",
      "tree": "499dd1c886c46ab6ec188c70587e5ccd92c83b1a",
      "parents": [
        "4d837f20dfff9177c579e160cad339172c0843e5"
      ],
      "author": {
        "name": "Marie White",
        "email": "mariewhite@google.com",
        "time": "Wed May 27 00:02:54 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Wed May 27 00:03:36 2026"
      },
      "message": "Split dot operation on K.\n\nCurrently split_k is the length of k so we don\u0027t expect major performance differences.\n\nSample IR for `f32[110,10240,2560]`.\n\nBefore:\n```\nnull \u003d constant_buffer([], 0, {}) {\n v3 \u003d allocate(automatic, 4, {\n   {},\n   {[0, 15], 4, \u003c\u003e},\n   {[0, 10239], 64, \u003c\u003e},\n   {[(buffer_min(out2, 0) / 16), (buffer_max(out2, 0) / 16)], \u003c\u003e, \u003c\u003e}\n }) {\n  out2.d0 \u003d loop(parallel, [buffer_min(out2, 0), buffer_max(out2, 0)], 32) {\n   closure {in0, in1, out2, v3, null, out2.d0} in {\n    v3.out2.d0 \u003d crop_dim(v3, 3, [(out2.d0 / 16), ((out2.d0 / 16) + 1)]) {\n     call(pack_b, {in1}, {v3.out2.d0}, {})\n    }\n    out2.out2.d0 \u003d crop_dim(out2, 0, [out2.d0, (out2.d0 + 31)]) {\n     call(dot num_k_dims\u003d1, {in0, v3, null}, {out2.out2.d0}, {})\n    }\n   }\n  }\n }\n}\n```\n\nAfter where k_split\u003d1024.\n```\nnull \u003d constant_buffer([], 0, {}) {\n reduction \u003d allocate(automatic, 0, {\n   {[0, 10239], 0, \u003c\u003e}\n }) {\n  k0#0 \u003d loop(serial, [0, 10239], 1024) {\n   reduction \u003d crop_dim(reduction, 0, [k0#0, (k0#0 + 1023)]) {\n    v3 \u003d allocate(automatic, 4, {\n      {},\n      {[0, 15], 4, \u003c\u003e},\n      {[k0#0, buffer_max(reduction, 0)], 64, \u003c\u003e},\n      {[(buffer_min(out2, 0) / 16), (buffer_max(out2, 0) / 16)], \u003c\u003e, \u003c\u003e}\n    }) {\n     d0#0 \u003d loop(parallel, [buffer_min(out2, 0), buffer_max(out2, 0)], 32) {\n      closure {in0, in1, out2, v3, null, reduction, d0#0} in {\n       v3.d0#0 \u003d crop_dim(v3, 3, [(d0#0 / 16), ((d0#0 / 16) + 1)]) {\n        call(pack_b, {in1}, {v3.d0#0}, {})\n       }\n       out2.d0#0 \u003d crop_dim(out2, 0, [d0#0, (d0#0 + 31)]) {\n        call(dot num_k_dims\u003d1, {in0, v3, null}, {out2.d0#0, reduction}, {})\n       }\n      }\n     }\n    }\n   }\n  }\n }\n}\n```\nPiperOrigin-RevId: 921767627\n"
    },
    {
      "commit": "4d837f20dfff9177c579e160cad339172c0843e5",
      "tree": "16ecfc903e02871f4fd37c79196fd913d05f63dc",
      "parents": [
        "acb00f5ed496bc2b39f3e7918fe0baae32e3d006"
      ],
      "author": {
        "name": "Dillon Sharlet",
        "email": "dsharlet@google.com",
        "time": "Tue May 26 23:14:14 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Tue May 26 23:14:52 2026"
      },
      "message": "Fix attempts to use AVX2 instructions on non-AVX2 targets\n\nPiperOrigin-RevId: 921744427\n"
    },
    {
      "commit": "acb00f5ed496bc2b39f3e7918fe0baae32e3d006",
      "tree": "f94af980da403919776833fb5cbd6d2f559354e9",
      "parents": [
        "e60eb9dcc68a46684454cbb4116408c63fe8e8d1"
      ],
      "author": {
        "name": "Dillon Sharlet",
        "email": "dsharlet@google.com",
        "time": "Tue May 26 21:25:41 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Tue May 26 21:28:30 2026"
      },
      "message": "Remove tile size from kernel function name\n\nThis will make it easier to compare performance when the tile size changes\n\nPiperOrigin-RevId: 921682098\n"
    },
    {
      "commit": "e60eb9dcc68a46684454cbb4116408c63fe8e8d1",
      "tree": "e095ea2c5aae49c2e4af4a6e21ea5b3acf18c50e",
      "parents": [
        "6cb1a2d35e0f148ff0d9f48ae641af81ec2e6982"
      ],
      "author": {
        "name": "Dillon Sharlet",
        "email": "dsharlet@google.com",
        "time": "Tue May 26 21:24:08 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Tue May 26 21:25:05 2026"
      },
      "message": "Add missing build of arm_neonfma benchmarks\n\nPiperOrigin-RevId: 921681198\n"
    },
    {
      "commit": "6cb1a2d35e0f148ff0d9f48ae641af81ec2e6982",
      "tree": "6e89c92e4004dd66a59579e13606ab51de68dec0",
      "parents": [
        "8ea1945e5d99cff0b214db889548bffb567606f5"
      ],
      "author": {
        "name": "Frank Barchard",
        "email": "fbarchard@google.com",
        "time": "Tue May 26 20:45:49 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Tue May 26 20:46:40 2026"
      },
      "message": "Add XNN_ENABLE_RNDNU16 build flag and conditionally use rndnu16 kernels\n\nPiperOrigin-RevId: 921659833\n"
    },
    {
      "commit": "8ea1945e5d99cff0b214db889548bffb567606f5",
      "tree": "37a383dffdab1febbb4aea445ebd5cc826cbb2e3",
      "parents": [
        "cc4daec108add3fbc2d820160627f687489fbfd4"
      ],
      "author": {
        "name": "Dillon Sharlet",
        "email": "dsharlet@google.com",
        "time": "Tue May 26 19:02:54 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Tue May 26 19:03:33 2026"
      },
      "message": "`tanh` accuracy improvements\n\n- Use `expm1(2x) / (expm1(2x) + 2)` for better accuracy (2 ULPs vs. 5)\n- Keep the old direct polynomial approximation as `approx_tanh(x)`\n- Add a small x region to `approx_tanh(x)` to avoid higher error for denormals.\n\nThis is a significant performance regression due to the first change:\n```\nname                                                             time/op        time/op      vs base\nbench_reference/tanh_float/m:1/n:4096/real_time                  42.66µ ± 10%    43.11µ ±  2%         ~ (p\u003d0.818 n\u003d6)\nbench_reference/tanh_double/m:1/n:4096/real_time                 110.1µ ± 44%    107.6µ ±  6%         ~ (p\u003d0.589 n\u003d6)\nbench/tanh_fp32_1x32_x86_avx512f_avx512bw/m:1/n:4096/real_time   1.381µ ± 30%    2.925µ ±  4%  +111.75% (p\u003d0.002 n\u003d6)\nbench/tanh_fp64_1x16_x86_avx512f_avx512bw/m:1/n:4096/real_time   7.696µ ± 13%    7.448µ ±  7%         ~ (p\u003d0.485 n\u003d6)\nbench/tanh_fp32_1x32_x86_avx/m:1/n:4096/real_time                2.879µ ±  6%    7.561µ ±  3%  +162.59% (p\u003d0.002 n\u003d6)\nbench/tanh_fp64_1x16_x86_avx/m:1/n:4096/real_time                19.26µ ± 14%    19.39µ ±  3%         ~ (p\u003d1.000 n\u003d6)\nbench/tanh_fp32_1x16_x86_sse2/m:1/n:4096/real_time               5.813µ ±  8%   10.910µ ±  3%   +87.68% (p\u003d0.002 n\u003d6)\nbench/tanh_fp64_1x8_x86_sse2/m:1/n:4096/real_time                28.26µ ±  4%    28.14µ ±  8%         ~ (p\u003d0.937 n\u003d6)\nbench/tanh_fp32_1x16_x86_sse2_fma/m:1/n:4096/real_time           86.54µ ±  2%   116.29µ ± 10%   +34.38% (p\u003d0.002 n\u003d6)\ngeomean                                                          15.04µ          20.07µ         +33.47%\n```\n\nPiperOrigin-RevId: 921603838\n"
    },
    {
      "commit": "cc4daec108add3fbc2d820160627f687489fbfd4",
      "tree": "d6f6f88366d3c5bbab06633509c55e160a1add51",
      "parents": [
        "35ba08cb5823236db61f2f326e49e88104349233"
      ],
      "author": {
        "name": "Gerardo Carranza",
        "email": "gcarranza@google.com",
        "time": "Tue May 26 17:39:47 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Tue May 26 17:40:45 2026"
      },
      "message": "Migrate LiteRT ATS unary op graph generation to use litert::tensor API.\n\nPiperOrigin-RevId: 921553787\n"
    },
    {
      "commit": "35ba08cb5823236db61f2f326e49e88104349233",
      "tree": "cc967b10eda8ee70f8cdbf36fd7b1a4669ad59e1",
      "parents": [
        "f8deca0bb9e9f2312419fab30f601c8f070763d1"
      ],
      "author": {
        "name": "Dillon Sharlet",
        "email": "dsharlet@google.com",
        "time": "Tue May 26 16:08:58 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Tue May 26 16:11:13 2026"
      },
      "message": "Add `tanh` SIMD wrappers\n\nTo avoid a significant performance regression, this change includes another change to carefully order the `min` and `max` operands (assuming that the scalar is not NaN), so we don\u0027t need special cases for `NaN`.\n\nBizarrely, the elimination of the `NaN` special case caused a significant regression for the `erf_fp32` kernel only for x86_fma3. This is just a compiler quirk (since this change strictly eliminates instructions vs. baseline in this kernel), I worked around it as much as possible by tweaking the unrolling factor for this kernel.\n\nThis is also the last usage of the polynomial arithmetic in the elementwise op compiler, so the helpers for those is removed.\n\n```\nname                                                              time/op        time/op      vs base\nbench/erf_fp32_1x32_x86_avx512f_avx512bw/m:1/n:4096/real_time     5.111µ ± 13%    4.669µ ±  3%   -8.65% (p\u003d0.002 n\u003d6)\nbench/tanh_fp32_1x32_x86_avx512f_avx512bw/m:1/n:4096/real_time    1.374µ ±  6%    1.358µ ±  2%   -1.17% (p\u003d0.026 n\u003d6)\nbench/erf_fp32_1x16_x86_fma3/m:1/n:4096/real_time                 8.789µ ±  2%\nbench/erf_fp64_1x8_x86_fma3/m:1/n:4096/real_time                  26.33µ ±  6%    24.40µ ±  3%   -7.33% (p\u003d0.002 n\u003d6)\nbench/erf_fp32_1x64_x86_avx/m:1/n:4096/real_time                  13.91µ ±  4%    13.40µ ±  1%   -3.62% (p\u003d0.002 n\u003d6)\nbench/erf_fp64_1x8_x86_avx/m:1/n:4096/real_time                   40.67µ ±  5%    39.23µ ±  4%   -3.54% (p\u003d0.009 n\u003d6)\nbench/tanh_fp32_1x32_x86_avx/m:1/n:4096/real_time                 2.736µ ±  2%    2.910µ ±  3%   +6.37% (p\u003d0.002 n\u003d6)\nbench/exp_fp32_1x16_x86_sse2/m:1/n:4096/real_time                 9.433µ ±  1%    9.862µ ±  3%   +4.55% (p\u003d0.002 n\u003d6)\nbench/tanh_fp32_1x16_x86_sse2/m:1/n:4096/real_time                5.433µ ±  4%    5.887µ ±  2%   +8.36% (p\u003d0.002 n\u003d6)\nbench/tanh_fp32_1x16_x86_sse2_fma/m:1/n:4096/real_time            66.59µ ± 19%    86.32µ ±  3%  +29.63% (p\u003d0.002 n\u003d6)\nbench/erf_fp32_1x32_x86_fma3/m:1/n:4096/real_time                 8.646µ ±  4%\ngeomean                                                           14.67µ          14.68µ         +0.14%               ¹\n```\n\nPiperOrigin-RevId: 921503300\n"
    },
    {
      "commit": "f8deca0bb9e9f2312419fab30f601c8f070763d1",
      "tree": "a04848963110dc3f5f683a0ff16620c58f15b2e4",
      "parents": [
        "f4ed055f9166daad591d82346d5ff9dc53d8c669"
      ],
      "author": {
        "name": "Dillon Sharlet",
        "email": "dsharlet@google.com",
        "time": "Tue May 26 00:31:55 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Tue May 26 00:32:50 2026"
      },
      "message": "Replace rational polynomials for exp with non-rational polynomials\n\nThe rational polynomials provided some ILP, which is now gone. To avoid a performance regression due to this, this change also adds an overload of `eval_polynomial` that uses x^2 to bring back some ILP.\n\nThis change is a bit of a tradeoff:\n- It\u0027s faster on AVX512, by a lot\n- It\u0027s a mixed bag on other ISAs, including a regression on ARM64 for fp32\n\nI think we should avoid using different algorithms on different targets to maintain numerical consistency, so for now, I think this change is the right tradeoff.\n\n```\nname                                                                cpu/op         cpu/op       vs base\nbench_reference/exp_float/m:1/n:4096/real_time                      35.25µ ± 5%    35.35µ ±  3%        ~ (p\u003d0.818 n\u003d6)\nbench_reference/exp_double/m:1/n:4096/real_time                     72.89µ ± 2%    72.47µ ±  1%        ~ (p\u003d0.240 n\u003d6)\nbench_reference/expm1_float/m:1/n:4096/real_time                    38.08µ ± 1%    38.09µ ±  5%        ~ (p\u003d1.000 n\u003d6)\nbench_reference/expm1_double/m:1/n:4096/real_time                   86.36µ ± 1%    86.05µ ±  1%   -0.35% (p\u003d0.026 n\u003d6)\nbench_reference/erf_float/m:1/n:4096/real_time                      36.07µ ± 7%    35.90µ ±  3%        ~ (p\u003d0.937 n\u003d6)\nbench_reference/erf_double/m:1/n:4096/real_time                     41.36µ ± 3%    41.47µ ±  3%        ~ (p\u003d0.937 n\u003d6)\nbench_reference/sigmoid_float/m:1/n:4096/real_time                  37.59µ ± 1%    37.63µ ±  3%        ~ (p\u003d0.485 n\u003d6)\nbench_reference/sigmoid_double/m:1/n:4096/real_time                 73.26µ ± 2%    73.16µ ±  2%        ~ (p\u003d0.937 n\u003d6)\nbench/erf_fp32_1x32_x86_avx512f_avx512bw/m:1/n:4096/real_time       5.131µ ± 4%    4.796µ ±  2%   -6.51% (p\u003d0.002 n\u003d6)\nbench/erf_fp64_1x16_x86_avx512f_avx512bw/m:1/n:4096/real_time       25.41µ ± 2%    18.74µ ±  3%  -26.22% (p\u003d0.002 n\u003d6)\nbench/exp_fp32_1x32_x86_avx512f_avx512bw/m:1/n:4096/real_time       2.280µ ± 1%    1.943µ ±  1%  -14.79% (p\u003d0.002 n\u003d6)\nbench/exp_fp64_1x16_x86_avx512f_avx512bw/m:1/n:4096/real_time       5.779µ ± 2%    4.592µ ±  1%  -20.54% (p\u003d0.002 n\u003d6)\nbench/expm1_fp32_1x32_x86_avx512f_avx512bw/m:1/n:4096/real_time     2.389µ ± 1%    2.018µ ±  1%  -15.53% (p\u003d0.002 n\u003d6)\nbench/expm1_fp64_1x16_x86_avx512f_avx512bw/m:1/n:4096/real_time     5.925µ ± 1%    4.828µ ±  1%  -18.51% (p\u003d0.002 n\u003d6)\nbench/sigmoid_fp32_1x32_x86_avx512f_avx512bw/m:1/n:4096/real_time   2.902µ ± 4%    2.397µ ±  2%  -17.39% (p\u003d0.002 n\u003d6)\nbench/sigmoid_fp64_1x16_x86_avx512f_avx512bw/m:1/n:4096/real_time   7.928µ ± 2%    5.794µ ±  3%  -26.91% (p\u003d0.002 n\u003d6)\nbench/exp_fp32_1x32_x86_avx2_fma3/m:1/n:4096/real_time              3.107µ ± 2%    3.070µ ±  2%        ~ (p\u003d0.132 n\u003d6)\nbench/exp_fp64_1x16_x86_avx2_fma3/m:1/n:4096/real_time              9.315µ ± 3%    8.371µ ±  2%  -10.13% (p\u003d0.002 n\u003d6)\nbench/expm1_fp32_1x32_x86_avx2_fma3/m:1/n:4096/real_time            3.241µ ± 4%    3.182µ ±  5%        ~ (p\u003d0.485 n\u003d6)\nbench/expm1_fp64_1x16_x86_avx2_fma3/m:1/n:4096/real_time            9.491µ ± 4%    9.278µ ±  4%   -2.25% (p\u003d0.015 n\u003d6)\nbench/erf_fp32_1x16_x86_fma3/m:1/n:4096/real_time                   8.597µ ± 3%    8.737µ ±  5%        ~ (p\u003d0.180 n\u003d6)\nbench/erf_fp64_1x8_x86_fma3/m:1/n:4096/real_time                    27.59µ ± 3%    25.83µ ± 24%        ~ (p\u003d0.065 n\u003d6)\nbench/exp_fp32_1x16_x86_avx2/m:1/n:4096/real_time                   5.488µ ± 5%    5.637µ ± 10%        ~ (p\u003d0.065 n\u003d6)\nbench/exp_fp64_1x16_x86_avx2/m:1/n:4096/real_time                   16.70µ ± 9%    15.40µ ±  4%   -7.80% (p\u003d0.002 n\u003d6)\nbench/expm1_fp32_1x16_x86_avx2/m:1/n:4096/real_time                 5.596µ ± 3%    5.655µ ±  2%        ~ (p\u003d0.240 n\u003d6)\nbench/expm1_fp64_1x16_x86_avx2/m:1/n:4096/real_time                 17.13µ ± 5%    15.98µ ±  5%   -6.72% (p\u003d0.002 n\u003d6)\nbench/sigmoid_fp32_1x16_x86_avx2/m:1/n:4096/real_time               5.623µ ± 5%    5.715µ ± 19%   +1.63% (p\u003d0.041 n\u003d6)\nbench/sigmoid_fp64_1x8_x86_avx2/m:1/n:4096/real_time                17.16µ ± 2%    15.28µ ±  3%  -10.98% (p\u003d0.002 n\u003d6)\nbench/erf_fp32_1x64_x86_avx/m:1/n:4096/real_time                    13.59µ ± 4%    14.00µ ±  4%        ~ (p\u003d0.093 n\u003d6)\nbench/erf_fp64_1x8_x86_avx/m:1/n:4096/real_time                     38.19µ ± 6%    40.27µ ±  2%   +5.46% (p\u003d0.041 n\u003d6)\nbench/erf_fp32_1x16_x86_sse2/m:1/n:4096/real_time                   21.88µ ± 2%    23.86µ ±  2%   +9.02% (p\u003d0.002 n\u003d6)\nbench/erf_fp64_1x16_x86_sse2/m:1/n:4096/real_time                   54.22µ ± 3%    54.27µ ±  1%        ~ (p\u003d0.818 n\u003d6)\nbench/exp_fp32_1x16_x86_sse2/m:1/n:4096/real_time                   7.435µ ± 2%    9.585µ ±  4%  +28.92% (p\u003d0.002 n\u003d6)\nbench/exp_fp64_1x8_x86_sse2/m:1/n:4096/real_time                    25.48µ ± 1%    25.22µ ±  3%        ~ (p\u003d0.093 n\u003d6)\nbench/expm1_fp32_1x16_x86_sse2/m:1/n:4096/real_time                 7.561µ ± 1%    9.819µ ±  3%  +29.86% (p\u003d0.002 n\u003d6)\nbench/expm1_fp64_1x8_x86_sse2/m:1/n:4096/real_time                  25.73µ ± 7%    26.00µ ±  2%        ~ (p\u003d0.485 n\u003d6)\nbench/sigmoid_fp32_1x32_x86_sse2/m:1/n:4096/real_time               9.060µ ± 4%   10.162µ ±  2%  +12.16% (p\u003d0.002 n\u003d6)\nbench/sigmoid_fp64_1x8_x86_sse2/m:1/n:4096/real_time                26.36µ ± 2%    26.12µ ±  1%        ~ (p\u003d0.310 n\u003d6)\nbench/erf_fp32_1x16_x86_sse2_fma/m:1/n:4096/real_time               271.4µ ± 1%    283.7µ ±  1%   +4.54% (p\u003d0.002 n\u003d6)\nbench/expm1_fp32_1x16_x86_sse2_fma/m:1/n:4096/real_time             92.40µ ± 2%   105.19µ ±  2%  +13.85% (p\u003d0.002 n\u003d6)\nbench/sigmoid_fp32_1x32_x86_sse2_fma/m:1/n:4096/real_time           94.12µ ± 3%   116.66µ ±  2%  +23.94% (p\u003d0.002 n\u003d6)\ngeomean                                                             16.25µ         15.90µ         -2.15%\n```\n```\nname                                                  time/op        time/op     vs base\nbench_reference/exp_float/m:1/n:4096/real_time        5.818µ ± 8%   5.888µ ± 12%        ~ (p\u003d0.485 n\u003d6)\nbench_reference/exp_double/m:1/n:4096/real_time       6.939µ ± 6%   6.881µ ± 10%        ~ (p\u003d0.818 n\u003d6)\nbench_reference/expm1_float/m:1/n:4096/real_time      18.98µ ± 2%   18.15µ ± 11%        ~ (p\u003d0.065 n\u003d6)\nbench_reference/expm1_double/m:1/n:4096/real_time     19.17µ ± 4%   19.76µ ±  5%   +3.09% (p\u003d0.026 n\u003d6)\nbench_reference/erf_float/m:1/n:4096/real_time        15.06µ ± 5%   14.97µ ±  7%        ~ (p\u003d0.818 n\u003d6)\nbench_reference/erf_double/m:1/n:4096/real_time       16.75µ ± 6%   16.77µ ±  8%        ~ (p\u003d0.818 n\u003d6)\nbench_reference/sigmoid_float/m:1/n:4096/real_time    6.159µ ± 6%   6.239µ ± 10%        ~ (p\u003d0.485 n\u003d6)\nbench_reference/sigmoid_double/m:1/n:4096/real_time   7.192µ ± 5%   7.159µ ±  8%        ~ (p\u003d0.818 n\u003d6)\nbench/erf_fp64_1x8_neon/m:1/n:4096/real_time          14.53µ ± 5%   13.31µ ±  5%   -8.44% (p\u003d0.004 n\u003d6)\nbench/exp_fp64_1x8_neon/m:1/n:4096/real_time          4.920µ ± 3%   4.210µ ±  5%  -14.43% (p\u003d0.002 n\u003d6)\nbench/expm1_fp64_1x8_neon/m:1/n:4096/real_time        5.122µ ± 3%   4.290µ ±  5%  -16.25% (p\u003d0.002 n\u003d6)\nbench/sigmoid_fp64_1x4_neon/m:1/n:4096/real_time      5.474µ ± 3%   4.973µ ±  4%   -9.16% (p\u003d0.002 n\u003d6)\nbench/erf_fp32_1x16_neon/m:1/n:4096/real_time         6.269µ ± 3%   6.470µ ±  8%        ~ (p\u003d0.065 n\u003d6)\nbench/exp_fp32_1x16_neon/m:1/n:4096/real_time         2.388µ ± 2%   2.704µ ± 13%  +13.23% (p\u003d0.041 n\u003d6)\nbench/expm1_fp32_1x16_neon/m:1/n:4096/real_time       2.454µ ± 3%   2.611µ ±  9%        ~ (p\u003d0.132 n\u003d6)\nbench/sigmoid_fp32_1x8_neon/m:1/n:4096/real_time      2.643µ ± 2%   3.473µ ± 58%  +31.39% (p\u003d0.002 n\u003d6)\ngeomean                                               6.983µ        6.968µ         -0.21%\n```\n\nPiperOrigin-RevId: 921140178\n"
    },
    {
      "commit": "f4ed055f9166daad591d82346d5ff9dc53d8c669",
      "tree": "d2dcd2e92f4c6e542123cac6bb67ddf716470048",
      "parents": [
        "41fde00824707ba4d4507fdbfc3bb70f9862b884"
      ],
      "author": {
        "name": "Dillon Sharlet",
        "email": "dsharlet@google.com",
        "time": "Mon May 25 21:47:23 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Mon May 25 21:48:11 2026"
      },
      "message": "Add `erf` SIMD math functions\n\nThese are the toughest functions I\u0027ve tried to approximate so far, and thus are the most complicated.\n\nFor float32, there are two approximations:\n- `fast_erf`, which is a single rational polynomial, accurate to 5 ULPs\n- `erf`, which is 3 piecewise (non-rational) polynomials, the first of which is a direct approximation of erf, and the next two are approximating the \"scaled\" complementary error function, accurate to 2 ULPs, and is ~3x slower.\n\nFor float64, `erf` is two rational polynomials, the second of which uses the scaled complementary error function like the float32 approximation.\n\nThis is a significant regression in performance, but a necessary improvement in correctness:\n```\nname                                                            time/op        time/op     vs base\nbench_reference/erf_float/m:1/n:4096/real_time                  35.71µ ± 16%    36.26µ ± 2%         ~ (p\u003d0.394 n\u003d6)\nbench_reference/erf_double/m:1/n:4096/real_time                 42.67µ ± 40%    42.31µ ± 4%         ~ (p\u003d0.310 n\u003d6)\nbench/erf_fp32_1x32_x86_avx512f_avx512bw/m:1/n:4096/real_time   1.626µ ± 23%    5.230µ ± 8%  +221.76% (p\u003d0.002 n\u003d6)\nbench/erf_fp32_1x16_x86_fma3/m:1/n:4096/real_time               2.429µ ±  3%    8.561µ ± 3%  +252.48% (p\u003d0.002 n\u003d6)\nbench/erf_fp32_1x64_x86_avx/m:1/n:4096/real_time                3.491µ ±  3%   13.446µ ± 3%  +285.21% (p\u003d0.002 n\u003d6)\nbench/erf_fp32_1x16_x86_sse2/m:1/n:4096/real_time               6.997µ ±  4%   22.067µ ± 1%  +215.39% (p\u003d0.002 n\u003d6)\nbench/erf_fp32_1x16_x86_sse2_fma/m:1/n:4096/real_time           83.35µ ±  3%   270.31µ ± 1%  +224.29% (p\u003d0.002 n\u003d6)\nbench/erf_fp64_1x16_x86_avx512f_avx512bw/m:1/n:4096/real_time   25.49µ ± 1%\nbench/erf_fp64_1x8_x86_fma3/m:1/n:4096/real_time                27.89µ ± 2%\nbench/erf_fp64_1x8_x86_avx/m:1/n:4096/real_time                 38.04µ ± 3%\nbench/erf_fp64_1x16_x86_sse2/m:1/n:4096/real_time               55.07µ ± 2%\ngeomean                                                         10.29µ          27.97µ       +139.34%\n```\n\nPiperOrigin-RevId: 921097089\n"
    },
    {
      "commit": "41fde00824707ba4d4507fdbfc3bb70f9862b884",
      "tree": "db3d2353f9ea2afc14b9a7b62c9adf7ba481bcb9",
      "parents": [
        "2b99af3f109a3802ae63bb3d8883eaccec341c90"
      ],
      "author": {
        "name": "Dillon Sharlet",
        "email": "dsharlet@google.com",
        "time": "Mon May 25 21:26:15 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Mon May 25 21:27:07 2026"
      },
      "message": "Improve exp approximation\n\nThe basic idea is to pull a few terms from the Taylor series of `exp(x)` out of the polynomial approximation. This enables tightening the error bounds further, `expm1` tolerances are reduced from 3 to 2 ULPs.\n\nWhile I think `exp` could be reduced from 2 to 1 ULP, it fails because of the one single (output) value that is 2 ULPs away from infinity. We produce infinity, while `std::exp` produces a finite value there. I don\u0027t think this is worth a performance cost to fix.\n\nPiperOrigin-RevId: 921092139\n"
    },
    {
      "commit": "2b99af3f109a3802ae63bb3d8883eaccec341c90",
      "tree": "6124513e0746d63a22a448f21c1e18acfa65044a",
      "parents": [
        "894ae65353c98e0256112ca6f0ae69f0d472a697"
      ],
      "author": {
        "name": "Marie White",
        "email": "mariewhite@google.com",
        "time": "Mon May 25 07:13:22 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Mon May 25 07:14:11 2026"
      },
      "message": "Add clarifying comments in call to define_transpose_a().\n\nPiperOrigin-RevId: 920809733\n"
    },
    {
      "commit": "894ae65353c98e0256112ca6f0ae69f0d472a697",
      "tree": "efc028a206c8f1501239620aa7786d76ffc7876a",
      "parents": [
        "eafd6fe46bbb93ed46c71fc56796f1fbb82bf0db"
      ],
      "author": {
        "name": "Dillon Sharlet",
        "email": "dsharlet@google.com",
        "time": "Sun May 24 23:36:56 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Sun May 24 23:38:06 2026"
      },
      "message": "Fix `floor_log2(NaN)` to be `NaN`\n\nThis is a performance regression, but seems necessary for correctness:\n```\nname                       time/op        time/op     vs base\nBM_floor_log2_f32x4_sse2   2.290n ± 2%    2.292n ± 1%        ~ (p\u003d0.937 n\u003d6)\nBM_floor_log2_f64x2_sse2   1.915n ± 2%    2.303n ± 2%  +20.32% (p\u003d0.002 n\u003d6)\nBM_log_f32x4_sse2          6.999n ± 2%    7.303n ± 2%   +4.34% (p\u003d0.002 n\u003d6)\nBM_log_f64x2_sse2          11.62n ± 3%    12.47n ± 3%   +7.35% (p\u003d0.002 n\u003d6)\nBM_log1p_f32x4_sse2        9.873n ± 3%   10.439n ± 4%   +5.73% (p\u003d0.006 n\u003d6)\nBM_log1p_f64x2_sse2        14.63n ± 3%    15.96n ± 4%   +9.10% (p\u003d0.002 n\u003d6)\ngeomean                    6.099n         6.566n        +7.65%\n```\n\nAVX512 is unaffected:\n```\nname                           time/op         time/op     vs base\nBM_floor_log2_f32x16_avx512     1.039n ±  3%    1.034n ± 4%       ~ (p\u003d1.000 n\u003d6)\nBM_floor_log2_f32x8_avx512     0.6460n ±  2%   0.6450n ± 2%       ~ (p\u003d0.907 n\u003d6)\nBM_floor_log2_f32x4_avx512     0.5720n ±  2%   0.5755n ± 2%       ~ (p\u003d0.162 n\u003d6)\nBM_floor_log2_f64x8_avx512      1.030n ±  3%    1.043n ± 2%       ~ (p\u003d0.329 n\u003d6)\nBM_floor_log2_f64x4_avx512     0.8695n ± 21%   0.8715n ± 3%       ~ (p\u003d0.937 n\u003d6)\nBM_floor_log2_f64x2_avx512     0.7570n ± 20%   0.7660n ± 2%       ~ (p\u003d0.331 n\u003d6)\nBM_log_f32x16_avx512            6.230n ±  4%    6.301n ± 2%       ~ (p\u003d0.589 n\u003d6)\nBM_log_f64x8_avx512            10.172n ±  5%    9.771n ± 5%       ~ (p\u003d0.132 n\u003d6)\nBM_log1p_f32x16_avx512          8.073n ±  3%    8.127n ± 2%       ~ (p\u003d0.180 n\u003d6)\nBM_log1p_f64x8_avx512           11.53n ±  4%    11.46n ± 2%       ~ (p\u003d0.699 n\u003d6)\ngeomean                        2.083n          2.082n       -0.01%\n```\n\nPiperOrigin-RevId: 920668251\n"
    },
    {
      "commit": "eafd6fe46bbb93ed46c71fc56796f1fbb82bf0db",
      "tree": "b2a478f03a359607ebaa471bd0f393b737e9f111",
      "parents": [
        "38631e8f9ec99555c302fe744739b2ce14808da8"
      ],
      "author": {
        "name": "Dillon Sharlet",
        "email": "dsharlet@google.com",
        "time": "Sun May 24 23:13:26 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Sun May 24 23:15:27 2026"
      },
      "message": "Add benchmarks of exp and log for avx and avx2\n\nPiperOrigin-RevId: 920663137\n"
    },
    {
      "commit": "38631e8f9ec99555c302fe744739b2ce14808da8",
      "tree": "00da42bd4a83476c3db2a26b6be8461127ec328b",
      "parents": [
        "fda3ca7b87546df6288c9036775c0fc0235e873f"
      ],
      "author": {
        "name": "Dillon Sharlet",
        "email": "dsharlet@google.com",
        "time": "Sun May 24 23:08:58 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Sun May 24 23:10:19 2026"
      },
      "message": "Refactor `exp` and `expm1` to use the same implementation\n\nThe expm1 polynomial should be better, because it passes through 0, and so it should be easier to fit numerically.\n\nIndeed, this change enables reducing the ULP tolerance of `exp` from 3 to 2.\n\nPiperOrigin-RevId: 920662177\n"
    },
    {
      "commit": "fda3ca7b87546df6288c9036775c0fc0235e873f",
      "tree": "bff9cf00228001ce0479b1ed8c2027fd14d0bd44",
      "parents": [
        "52bd8d0a6ef2b6c941973df67864daf33a4e783c"
      ],
      "author": {
        "name": "Frank Barchard",
        "email": "fbarchard@google.com",
        "time": "Sat May 23 01:36:07 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Sat May 23 01:37:01 2026"
      },
      "message": "Fix precision issue in rndnu16 requantization for scales near powers of 2.\n\nPiperOrigin-RevId: 919955609\n"
    },
    {
      "commit": "52bd8d0a6ef2b6c941973df67864daf33a4e783c",
      "tree": "28a31af436dbd3be92eac38bc3e3e9a2054786dc",
      "parents": [
        "0ccb84e7c33a5527d7ba417057b305debdb2a48f"
      ],
      "author": {
        "name": "Dillon Sharlet",
        "email": "dsharlet@google.com",
        "time": "Fri May 22 22:33:13 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Fri May 22 22:34:08 2026"
      },
      "message": "Tighten tolerances of `log` from 3 ULPs to 2\n\nAnd add tests/benchmarks for ARM\n\nPiperOrigin-RevId: 919889466\n"
    },
    {
      "commit": "0ccb84e7c33a5527d7ba417057b305debdb2a48f",
      "tree": "d825e7724c25efcc262b62874c256865e0fc6985",
      "parents": [
        "c45d6b4166d3e8ac581e6fdabb371fcd33ef065d"
      ],
      "author": {
        "name": "Dillon Sharlet",
        "email": "dsharlet@google.com",
        "time": "Fri May 22 21:27:14 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Fri May 22 21:27:54 2026"
      },
      "message": "Implement `YNN_FLAG_CONSISTENT_ARITHMETIC` for unary elementwise kernels\n\nPiperOrigin-RevId: 919861252\n"
    },
    {
      "commit": "c45d6b4166d3e8ac581e6fdabb371fcd33ef065d",
      "tree": "9307fdc68ebb5ee21a3d7745753e8ab3ca7aa79f",
      "parents": [
        "5ff101c80c734d97aef504940482350953351be2"
      ],
      "author": {
        "name": "Volodymyr Kysenko",
        "email": "vksnk@google.com",
        "time": "Fri May 22 18:33:11 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Fri May 22 18:34:02 2026"
      },
      "message": "Don\u0027t split the innermost dimension if the type of the input is sub-byte.\n\nPiperOrigin-RevId: 919777496\n"
    },
    {
      "commit": "5ff101c80c734d97aef504940482350953351be2",
      "tree": "3523a6cb58e807a0d9444ad0c5af3ae0f347be9c",
      "parents": [
        "3b5dbb202628f7687c374f6c9a2caf9c4c7adce5",
        "0f6ee412250039c51b002352cd88537b64d856aa"
      ],
      "author": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Fri May 22 18:19:10 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Fri May 22 18:19:10 2026"
      },
      "message": "Merge pull request #10298 from wangw-1991:fix_LUT_fusion\n\nPiperOrigin-RevId: 919769875\n"
    },
    {
      "commit": "3b5dbb202628f7687c374f6c9a2caf9c4c7adce5",
      "tree": "bdd7e9ad70e8695a02dbfa2e95ee05d4ebdf822e",
      "parents": [
        "008036729dd12d3b3d6985008ae258c4078a32e0"
      ],
      "author": {
        "name": "Dillon Sharlet",
        "email": "dsharlet@google.com",
        "time": "Fri May 22 18:02:30 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Fri May 22 18:03:18 2026"
      },
      "message": "Remove `fma` when not available, and add `multiply_add` which optionally uses `fma` when available.\n\nAnd add the `sse2fma` target which emulates fma (slowly).\n\nPiperOrigin-RevId: 919762437\n"
    },
    {
      "commit": "008036729dd12d3b3d6985008ae258c4078a32e0",
      "tree": "e46ee8a6ad087f1e4dd8d8a1634d3ca4e866249f",
      "parents": [
        "c4e49c65c72674932fc60977e2985d6326ad7188"
      ],
      "author": {
        "name": "Dillon Sharlet",
        "email": "dsharlet@google.com",
        "time": "Fri May 22 17:25:04 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Fri May 22 17:26:07 2026"
      },
      "message": "Combine x86 SIMD wrapper headers\n\nThis is a large change that is mostly code motion, to combine the various microarchitecture specific header files for x86 intrinsics wrappers. The problem this solves is that things don\u0027t really work properly if you include multiple of these headers at once. But, sometimes that is necessary, e.g. you want to use `x86_f16c.h` and `x86_fma3.h` at the same time. I\u0027m currently trying to make an optional fma or multiply-add intrinsic, and that runs into this problem broadly.\n\nPiperOrigin-RevId: 919744261\n"
    },
    {
      "commit": "c4e49c65c72674932fc60977e2985d6326ad7188",
      "tree": "d5c132b0ad9ab6ed794c6fc6b991702116ff97b7",
      "parents": [
        "00825fc8ecdf3c3d58a2cae3471c959e28c9b147"
      ],
      "author": {
        "name": "Dillon Sharlet",
        "email": "dsharlet@google.com",
        "time": "Fri May 22 16:44:03 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Fri May 22 16:44:50 2026"
      },
      "message": "Combine ARM SIMD wrapper headers\n\nThis is a prototype for what I\u0027d like to do to the x86 headers too, but this is an easier place to start.\n\nThe main advantage of this approach is it\u0027s easier to add new things with fallback implementations on older microarchitectures. The problem I\u0027m specifically trying to solve is how to add `multiply_add` to use fma when available, and fall back to multiply + add when not available. Doing this with the header per microarchitecture approach is really hard. Consider AVX2, AVX, and FMA: there are a lot of combinations of headers to support. And then if you are using AVX512, it should use FMA for AVX and SSE vectors too.\n\nThe main disadvantage is that there\u0027s less safety, I think it would be easy to accidentally not use a lower capability microarchitecture (i.e. get worse performance).\n\nPiperOrigin-RevId: 919723374\n"
    },
    {
      "commit": "00825fc8ecdf3c3d58a2cae3471c959e28c9b147",
      "tree": "4399116b98e7e571a3aa72325afe1b5a0969b910",
      "parents": [
        "cccad557c84c1e03c1adb595946e79f9a84fafbd"
      ],
      "author": {
        "name": "Dillon Sharlet",
        "email": "dsharlet@google.com",
        "time": "Fri May 22 16:07:20 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Fri May 22 16:08:14 2026"
      },
      "message": "Define all architecture flags transitively implied by enabled architectures.\n\nThis means we can rely on these flags instead of compiler specific features to enable or disable intrinsics, which is safer when we have a fallback path (we won\u0027t silently fall back to a lesser implementation than the user was expecting).\n\nPiperOrigin-RevId: 919707309\n"
    },
    {
      "commit": "cccad557c84c1e03c1adb595946e79f9a84fafbd",
      "tree": "d35ed483430cd87c44d9bed6dea5674b4c425071",
      "parents": [
        "549deb8dd62218ba926d0b6cd4730c408b4a6718"
      ],
      "author": {
        "name": "Dillon Sharlet",
        "email": "dsharlet@google.com",
        "time": "Thu May 21 22:56:08 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Thu May 21 22:56:50 2026"
      },
      "message": "Add math helpers to SIMD wrappers\n\nThe main benefit of this change is the ability to test the math functions in isolation from the rest of the kernel infrastructure. This led me to discover an issue with our tolerances that was otherwise difficult to find.\n\nThis change is a significant upgrade in the accuracy of `log` and `log1p` especially, and especially for fp64 types. We now have ~3 ULPs for all exp and log functions (and no more absolute tolerances).\n\nCurrently, I\u0027ve implemented `tanh` and `sigmoid` in terms of these functions. These implementations could likely be optimized and improved further, but I think this change is already big enough by itself.\n\nI intend to further remove the sophisticated approximations from the python code and move them to the C++ SIMD wrappers.\n\nCurrently it is added to `generic.inc`, but this is really not a good place for them. The SIMD wrappers need some restructuring to enable a better implementation for this. However, this is a good improvement that unblocks other work, I\u0027d like to work on this cleanup in parallel with other tasks.\n\nThere are some performance improvements and regressions in this change:\n```\nbench/exp_fp32_1x32_x86_avx512f_avx512bw/m:1/n:4096/real_time                     2.550µ ±  3%    2.309µ ±   3%   -9.46% (p\u003d0.002 n\u003d6)\nbench/exp_fp64_1x16_x86_avx512f_avx512bw/m:1/n:4096/real_time                     6.255µ ±  2%    5.905µ ±   5%   -5.59% (p\u003d0.009 n\u003d6)\nbench/expm1_fp32_1x32_x86_avx512f_avx512bw/m:1/n:4096/real_time                   2.876µ ±  2%    2.713µ ±   4%   -5.67% (p\u003d0.002 n\u003d6)\nbench/expm1_fp64_1x16_x86_avx512f_avx512bw/m:1/n:4096/real_time                   7.094µ ±  2%    6.776µ ±   2%   -4.48% (p\u003d0.002 n\u003d6)\nbench/log_fp32_1x32_x86_avx512f_avx512bw/m:1/n:4096/real_time                     1.506µ ±  2%    1.883µ ±   3%  +25.07% (p\u003d0.002 n\u003d6)\nbench/log_fp64_1x16_x86_avx512f_avx512bw/m:1/n:4096/real_time                     4.851µ ±  7%    5.473µ ±   3%  +12.83% (p\u003d0.002 n\u003d6)\nbench/floor_fp32_1x32_x86_avx512f_avx512bw/m:1/n:4096/real_time                   297.6n ±  3%    325.0n ±   4%   +9.20% (p\u003d0.002 n\u003d6)\nbench/negate_fp32_1x32_x86_avx512f_avx512bw/m:1/n:4096/real_time                  270.8n ±  9%    319.7n ±  11%  +18.06% (p\u003d0.002 n\u003d6)\nbench/sigmoid_fp32_1x32_x86_avx512f_avx512bw/m:1/n:4096/real_time                 1.837µ ±  1%    3.086µ ±   4%  +68.03% (p\u003d0.002 n\u003d6)\nbench/sigmoid_fp64_1x16_x86_avx512f_avx512bw/m:1/n:4096/real_time                 4.616µ ±  5%    8.505µ ±   1%  +84.23% (p\u003d0.002 n\u003d6)\nbench/tanh_fp64_1x16_x86_avx512f_avx512bw/m:1/n:4096/real_time                    7.683µ ±  4%    9.246µ ±   2%  +20.35% (p\u003d0.002 n\u003d6)\n```\n\nThe sigmoid regressions are significant, but I think we should prioritize correctness for now. A dedicated sigmoid op using this new mechanism would be faster and not too hard to implement.\n\nPiperOrigin-RevId: 919298994\n"
    },
    {
      "commit": "549deb8dd62218ba926d0b6cd4730c408b4a6718",
      "tree": "322ca473054e6e5f83a5b9767232decb74436a5b",
      "parents": [
        "58a65db7c2b470e967fca11c3be29689946683c9"
      ],
      "author": {
        "name": "Dillon Sharlet",
        "email": "dsharlet@google.com",
        "time": "Thu May 21 21:14:59 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Thu May 21 21:15:52 2026"
      },
      "message": "Remove unused `transpose` SIMD wrapper\n\nPiperOrigin-RevId: 919250056\n"
    },
    {
      "commit": "58a65db7c2b470e967fca11c3be29689946683c9",
      "tree": "0fd151193cf08db59a591d8b000cc30551ad15e7",
      "parents": [
        "6c9a1ab6259e7a6a8bce27dac6922ecc28d2bbe0"
      ],
      "author": {
        "name": "Volodymyr Kysenko",
        "email": "vksnk@google.com",
        "time": "Thu May 21 17:55:59 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Thu May 21 17:56:49 2026"
      },
      "message": "Mark values as external outputs in constant folding only if they are actually used in the non-constant pipeline.\n\nPiperOrigin-RevId: 919143197\n"
    },
    {
      "commit": "6c9a1ab6259e7a6a8bce27dac6922ecc28d2bbe0",
      "tree": "aa89c88e01b27f7dc4239b8e2318a5acad5dedce",
      "parents": [
        "ea77aab35cfd965afa93891af6a2fa249af98164"
      ],
      "author": {
        "name": "Dillon Sharlet",
        "email": "dsharlet@google.com",
        "time": "Thu May 21 17:20:30 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Thu May 21 17:21:12 2026"
      },
      "message": "Consolidate some SIMD wrapper headers\n\nThis is a low hanging fruit part of eliminating micro-architecture specific headers.\n\nPiperOrigin-RevId: 919122512\n"
    },
    {
      "commit": "ea77aab35cfd965afa93891af6a2fa249af98164",
      "tree": "1afee7f4392d7058c0dd8dc4d63031e38f29bc5e",
      "parents": [
        "1b849f44fb6ce9eae460586158669461044acc2d"
      ],
      "author": {
        "name": "Dillon Sharlet",
        "email": "dsharlet@google.com",
        "time": "Thu May 21 16:36:52 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Thu May 21 16:37:53 2026"
      },
      "message": "Generalize FMA emulation helper\n\nThis doesn\u0027t need to depend specifically on SSE2, it should work for any SIMD type.\n\nThis eliminates one of the microarchitecture entangled headers in the SIMD wrappers.\n\nPiperOrigin-RevId: 919098203\n"
    },
    {
      "commit": "1b849f44fb6ce9eae460586158669461044acc2d",
      "tree": "116aca0116f85a3314d83a2e03cfc1f6a44b1e12",
      "parents": [
        "d4adfcdc4c5c84eff8b8f2d68bf29c0b9d185eea"
      ],
      "author": {
        "name": "Dillon Sharlet",
        "email": "dsharlet@google.com",
        "time": "Thu May 21 16:18:06 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Thu May 21 16:18:37 2026"
      },
      "message": "Tune params for unary kernels to avoid tolerance issues\n\nIn some cases, the absolute tolerances are needed due to the choice of \"params\" that scale/offset the result. This changes the choices of params such that we can remove the absolute tolerances from the tests, by avoiding zero in the output and avoiding \"shrinking\" the output too much.\n\nPiperOrigin-RevId: 919089273\n"
    },
    {
      "commit": "0f6ee412250039c51b002352cd88537b64d856aa",
      "tree": "0f0c50b93cb94ea5f6e643c77b63af2e5fbc12b3",
      "parents": [
        "d877e1a189580fb562a66026021f3403cc48bf18"
      ],
      "author": {
        "name": "Wei Wang",
        "email": "wei4.wang@intel.com",
        "time": "Thu May 21 11:58:47 2026"
      },
      "committer": {
        "name": "Wei Wang",
        "email": "wei4.wang@intel.com",
        "time": "Thu May 21 11:58:47 2026"
      },
      "message": "Initial upload.\n"
    },
    {
      "commit": "d4adfcdc4c5c84eff8b8f2d68bf29c0b9d185eea",
      "tree": "44fff0143cf2e931a161917acdd27211e92caf76",
      "parents": [
        "f56a6c716d6c544b74e8388a68f89a49132375ab"
      ],
      "author": {
        "name": "Frank Barchard",
        "email": "fbarchard@google.com",
        "time": "Thu May 21 01:01:08 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Thu May 21 01:01:50 2026"
      },
      "message": "gemm benchmark documentation fix\n- update names of models to match files\n\nbenchmarks now require 3 int values on command line\neg f16_gemm_minmax_bench 784 512 144\n\nPiperOrigin-RevId: 918743819\n"
    },
    {
      "commit": "f56a6c716d6c544b74e8388a68f89a49132375ab",
      "tree": "abb8deec3c40aadc5f08e7bffed4f29e6fc25a00",
      "parents": [
        "29a1c7333c02fedf408ba913f959781058d26843"
      ],
      "author": {
        "name": "Dillon Sharlet",
        "email": "dsharlet@google.com",
        "time": "Wed May 20 23:36:45 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Wed May 20 23:37:37 2026"
      },
      "message": "Add numerically correct `expm1` kernels\n\n- Add a feature to `rational_approximation.py` to include 0 as one of the points that Remez optimizes.\n- Use it to generate polynomials for `exp2m1`\n- Implement `expm1` kernels. These are very similar to `exp`, but rearranging the arithmetic to avoid catastrophic cancellation when 2^x is near 1.\n\n```\n--------------------------------------------------------------------------------------------------------------------------\nBenchmark                                                                Time             CPU   Iterations UserCounters...\n--------------------------------------------------------------------------------------------------------------------------\nbench_reference/exp_float/m:1/n:4096/real_time                       36990 ns        36980 ns        18487 Bytes\u003d885.865M/s Op\u003d110.733M/s\nbench_reference/exp_double/m:1/n:4096/real_time                      75183 ns        75157 ns         9337 Bytes\u003d871.682M/s Op\u003d54.4801M/s\nbench_reference/expm1_float/m:1/n:4096/real_time                     36931 ns        36925 ns        18730 Bytes\u003d887.269M/s Op\u003d110.909M/s\nbench_reference/expm1_double/m:1/n:4096/real_time                    80463 ns        80438 ns         8501 Bytes\u003d814.486M/s Op\u003d50.9054M/s\nbench/exp_fp32_1x32_x86_avx512f_avx512bw/m:1/n:4096/real_time         2576 ns         2576 ns       268977 Bytes\u003d12.7188G/s Op\u003d1.58985G/s\nbench/exp_fp64_1x16_x86_avx512f_avx512bw/m:1/n:4096/real_time         6399 ns         6397 ns       109094 Bytes\u003d10.2419G/s Op\u003d640.117M/s\nbench/expm1_fp32_1x32_x86_avx512f_avx512bw/m:1/n:4096/real_time       2938 ns         2938 ns       238502 Bytes\u003d11.1519G/s Op\u003d1.39398G/s\nbench/expm1_fp64_1x16_x86_avx512f_avx512bw/m:1/n:4096/real_time       7284 ns         7282 ns        96595 Bytes\u003d8.99685G/s Op\u003d562.303M/s\nbench/exp_fp32_1x32_x86_avx2_fma3/m:1/n:4096/real_time                3397 ns         3397 ns       207260 Bytes\u003d9.64533G/s Op\u003d1.20567G/s\nbench/exp_fp64_1x16_x86_avx2_fma3/m:1/n:4096/real_time                8476 ns         8474 ns        81181 Bytes\u003d7.73191G/s Op\u003d483.245M/s\nbench/expm1_fp32_1x32_x86_avx2_fma3/m:1/n:4096/real_time              3761 ns         3761 ns       182875 Bytes\u003d8.71165G/s Op\u003d1.08896G/s\nbench/expm1_fp64_1x16_x86_avx2_fma3/m:1/n:4096/real_time             12944 ns        12941 ns        54861 Bytes\u003d5.06303G/s Op\u003d316.439M/s\nbench/exp_fp32_1x16_x86_avx2/m:1/n:4096/real_time                     4995 ns         4994 ns       138480 Bytes\u003d6.56011G/s Op\u003d820.014M/s\nbench/exp_fp64_1x16_x86_avx2/m:1/n:4096/real_time                    11544 ns        11541 ns        58414 Bytes\u003d5.67696G/s Op\u003d354.81M/s\nbench/expm1_fp32_1x16_x86_avx2/m:1/n:4096/real_time                   6345 ns         6343 ns       110725 Bytes\u003d5.16475G/s Op\u003d645.594M/s\nbench/expm1_fp64_1x16_x86_avx2/m:1/n:4096/real_time                  18278 ns        18273 ns        36881 Bytes\u003d3.58542G/s Op\u003d224.089M/s\nbench/exp_fp32_1x16_x86_sse2/m:1/n:4096/real_time                     7399 ns         7398 ns        95901 Bytes\u003d4.42847G/s Op\u003d553.558M/s\nbench/exp_fp64_1x8_x86_sse2/m:1/n:4096/real_time                     20106 ns        20101 ns        35700 Bytes\u003d3.25956G/s Op\u003d203.722M/s\nbench/expm1_fp32_1x16_x86_sse2/m:1/n:4096/real_time                   7980 ns         7979 ns        86063 Bytes\u003d4.1063G/s Op\u003d513.287M/s\nbench/expm1_fp64_1x8_x86_sse2/m:1/n:4096/real_time                   24816 ns        24813 ns        27562 Bytes\u003d2.64087G/s Op\u003d165.054M/s\n```\n\nPiperOrigin-RevId: 918713274\n"
    },
    {
      "commit": "29a1c7333c02fedf408ba913f959781058d26843",
      "tree": "a36ad44de144b45576f88353048b6bd454d89d91",
      "parents": [
        "b1a0a5d3725f9b75b9ca449e88e0f197fbc3a7db"
      ],
      "author": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Wed May 20 21:33:47 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Wed May 20 21:34:41 2026"
      },
      "message": "Add std::string overloads for tensor::Create.\n\nPiperOrigin-RevId: 918656793\n"
    },
    {
      "commit": "b1a0a5d3725f9b75b9ca449e88e0f197fbc3a7db",
      "tree": "0860ed7cb470be7f01d86e23b61fc69dc83fa194",
      "parents": [
        "a0dbef32efcd73eff5d13f6900d93bb58ddd91f4",
        "4ae8b8f607b2a35a91bba3ef491bda68a405fdc3"
      ],
      "author": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Wed May 20 19:27:08 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Wed May 20 19:27:08 2026"
      },
      "message": "Merge pull request #10261 from velonica0:f16\n\nPiperOrigin-RevId: 918589980\n"
    },
    {
      "commit": "a0dbef32efcd73eff5d13f6900d93bb58ddd91f4",
      "tree": "4e352a23c07a167fa68e0046ff96c3f8e7327d87",
      "parents": [
        "3f33e550b6270dbad0b9549170de4b9f107a4052"
      ],
      "author": {
        "name": "Dillon Sharlet",
        "email": "dsharlet@google.com",
        "time": "Wed May 20 19:21:53 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Wed May 20 19:22:42 2026"
      },
      "message": "Improve `exp` accuracy\n\nCurrently we effectively have an `exp2` kernel, and turn it into an `exp` kernel with an `input_multiplier`.\n\nThis strategy is a bit problematic because a little precision is lost in the `exp` -\u003e `exp2` conversion. This was hidden by the fact that we computed the reference result in the same way.\n\nThis change rewrites the `exp` kernels to compute `exp` directly.\n\nThis is a performance regression, but I think we need to prioritize accuracy for now. Most of the performance regression is due to the more accurate range reduction.\n\n```\nname                                                            time/op       time/op     vs base\nbench_reference/exp_float/m:1/n:4096/real_time                  53.98µ ± 2%   35.47µ ± 5%  -34.30% (p\u003d0.002 n\u003d6)\nbench_reference/exp_double/m:1/n:4096/real_time                 66.48µ ± 2%   72.48µ ± 2%   +9.04% (p\u003d0.002 n\u003d6)\nbench/exp_fp32_1x32_x86_avx512f_avx512bw/m:1/n:4096/real_time   1.873µ ± 3%   2.486µ ± 2%  +32.70% (p\u003d0.002 n\u003d6)\nbench/exp_fp64_1x16_x86_avx512f_avx512bw/m:1/n:4096/real_time   4.732µ ± 5%   6.318µ ± 2%  +33.51% (p\u003d0.002 n\u003d6)\nbench/exp_fp32_1x32_x86_avx2_fma3/m:1/n:4096/real_time          2.530µ ± 4%   3.216µ ± 2%  +27.10% (p\u003d0.002 n\u003d6)\nbench/exp_fp64_1x16_x86_avx2_fma3/m:1/n:4096/real_time          6.717µ ± 3%   8.113µ ± 4%  +20.79% (p\u003d0.002 n\u003d6)\nbench/exp_fp32_1x16_x86_avx2/m:1/n:4096/real_time               3.799µ ± 1%   4.839µ ± 2%  +27.39% (p\u003d0.002 n\u003d6)\nbench/exp_fp64_1x16_x86_avx2/m:1/n:4096/real_time               10.02µ ± 2%   11.17µ ± 1%  +11.49% (p\u003d0.002 n\u003d6)\nbench/exp_fp32_1x16_x86_sse2/m:1/n:4096/real_time               5.585µ ± 3%   6.913µ ± 2%  +23.78% (p\u003d0.002 n\u003d6)\nbench/exp_fp64_1x8_x86_sse2/m:1/n:4096/real_time                16.33µ ± 7%   19.19µ ± 2%  +17.53% (p\u003d0.002 n\u003d6)\ngeomean                                                         8.459µ        9.723µ       +14.94%\n```\n\nPiperOrigin-RevId: 918587354\n"
    },
    {
      "commit": "3f33e550b6270dbad0b9549170de4b9f107a4052",
      "tree": "ddb29e2b19836f6988cdc13694380c6185d88415",
      "parents": [
        "5a59a54af5e777acbb0c81a8b0d8bff8cc1d3464"
      ],
      "author": {
        "name": "Dillon Sharlet",
        "email": "dsharlet@google.com",
        "time": "Wed May 20 18:50:16 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Wed May 20 18:51:06 2026"
      },
      "message": "Add `select` and conditional operations to SIMD wrappers\n\nTo make this work across the variety of SIMD ISA conventions for this, mask types should just be `auto`. They can\u0027t be stored to memory.\n\nIf we need `select` to work on conditions with a different size type than the values, we should be able to handle that with more overloads (and some conversions), but we don\u0027t need it yet.\n\nThis removes the workarounds that avoided the need for this until now: `select_greater_than`, `copynan`, `kahan_sum`.\n\nThis was partly AI generated, here is what it had to say:\n\n✦ I have completed the implementation of boolean operators and conditional selection across the entire SIMD abstraction layer. Here is a summary of the work:\n\n  1. SIMD Core \u0026 Generic Layer\n   * Boolean Operators: Implemented \u003d\u003d, !\u003d, \u003c, \u003c\u003d, \u003e, \u003e\u003d for all vector types. These now use auto return types to accommodate varying architecture-specific mask representations (e.g., integer vectors on SSE/AVX vs. __mmaskN on AVX-512).\n   * Conditional Selection: Added a unified select(mask, true_val, false_val) intrinsic.\n   * Predicates: Implemented isnan, isinf, and isfinite for floating-point vectors.\n   * Recursive Generics: Updated generic.inc to support recursive implementation of these operators for wide vectors (e.g., f32x16 on AVX2) by defining lo()/hi()/concat() helpers for all vector and mask types.\n\n  2. Architecture-Specific Implementations\n   * x86 AVX-512: Utilized native mask registers (__mmask16, etc.) and _mm512_mask_blend_* instructions. Implemented mask-specific glue logic for recursive templates.\n   * x86 SSE/AVX: Implemented comparisons returning signed integer vectors. Added s64x2 and s64x4 specializations to ensure 64-bit comparisons (for doubles) maintain the correct element count in the resulting mask.\n   * ARM NEON: Leveraged vceqq, vcgtq, and vbslq (bitselect) for mask operations. Added 64-bit integer vector support for ARM64.\n   * WASM SIMD128: Integrated wasm_v128_bitselect and standard WASM comparison intrinsics.\n\n  3. Kernel Compiler \u0026 Intrinsics\n   * Intrinsic Support: Updated the elementwise kernel compiler (compiler.py) to support select, isnan, isinf, and isfinite.\n   * Type Deduction: Modified the compiler to emit auto for comparison results, allowing the C++ compiler to handle architecture-specific mask types.\n   * Kernel Refactoring: Migrated existing kernels (e.g., exp, sigmoid) away from the legacy copynan helper to the more idiomatic select(isnan(x), ...) pattern.\n\n  4. Verification \u0026 Cleanup\n   * Comprehensive Testing: Expanded the SIMD test suite in base/simd/test/ to verify comparisons, predicates, and selection logic across all architectures.\n   * Code Quality: Resolved header inclusion ordering issues, fixed redefinition errors between SSE2 and SSE4.1, and performed a significant cleanup of bitwise operator consistency in the AVX headers.\n\n  Status: All tasks in PLAN.md are complete, and all SIMD test targets (x86_avx512, x86_avx2, x86_avx, x86_sse2, arm_neon, wasm_simd128) are passing.\n\nI expected this to be a no-op in terms of performance/generated code, but it improves some benchmarks:\n```\nbench/sum_squared_k1_bf16_fp32_avx512/real_time     [256x256] 5.520µ ±  4%    5.483µ ±  3%        ~ (p\u003d0.937 n\u003d6)\nbench/sum_squared_kn_bf16_fp32_avx512/real_time     [256x256] 4.766µ ±  3%    5.053µ ± 11%        ~ (p\u003d0.065 n\u003d6)\nbench/sum_squared_k1_fp16_fp32_avx512/real_time     [256x256] 4.495µ ±  3%    4.571µ ±  5%        ~ (p\u003d0.310 n\u003d6)\nbench/sum_squared_kn_fp16_fp32_avx512/real_time     [256x256] 4.157µ ±  1%    4.181µ ± 17%        ~ (p\u003d0.937 n\u003d6)\nbench/sum_squared_k1_fp32_avx512/real_time          [256x256] 5.506µ ± 15%    5.286µ ±  7%        ~ (p\u003d0.310 n\u003d6)\nbench/sum_squared_kn_fp32_avx512/real_time          [256x256] 4.770µ ±  9%    4.691µ ± 17%        ~ (p\u003d0.937 n\u003d6)\nbench/sum_squared_k1_fp64_avx512/real_time          [256x256] 9.632µ ± 10%    8.636µ ±  2%  -10.34% (p\u003d0.015 n\u003d6)\nbench/sum_squared_kn_fp64_avx512/real_time          [256x256] 10.36µ ±  6%    10.21µ ±  9%        ~ (p\u003d0.699 n\u003d6)\nbench/sum_squared_k1_bf16_fp32_avx2/real_time       [256x256] 8.651µ ±  8%    7.314µ ±  4%  -15.45% (p\u003d0.002 n\u003d6)\nbench/sum_squared_kn_bf16_fp32_avx2/real_time       [256x256] 7.510µ ±  3%    6.949µ ±  3%   -7.47% (p\u003d0.002 n\u003d6)\nbench/sum_squared_k1_fp32_avx/real_time             [256x256] 7.076µ ±  3%    6.491µ ±  3%   -8.27% (p\u003d0.002 n\u003d6)\nbench/sum_squared_kn_fp32_avx/real_time             [256x256] 6.368µ ±  4%    5.859µ ±  3%   -7.99% (p\u003d0.002 n\u003d6)\nbench/sum_squared_k1_fp64_avx/real_time             [256x256] 12.28µ ±  2%    11.31µ ±  4%   -7.89% (p\u003d0.002 n\u003d6)\nbench/sum_squared_kn_fp64_avx/real_time             [256x256] 12.72µ ±  4%    11.61µ ±  9%   -8.76% (p\u003d0.009 n\u003d6)\nbench/sum_squared_k1_fp16_fp32_f16c/real_time       [256x256] 7.480µ ±  4%    6.799µ ±  2%   -9.11% (p\u003d0.002 n\u003d6)\nbench/sum_squared_kn_fp16_fp32_f16c/real_time       [256x256] 6.445µ ±  2%    6.163µ ± 12%        ~ (p\u003d0.065 n\u003d6)\nbench/sum_squared_k1_fp32_sse2/real_time            [256x256] 10.97µ ±  4%    10.31µ ±  5%   -6.05% (p\u003d0.009 n\u003d6)\nbench/sum_squared_kn_fp32_sse2/real_time            [256x256] 10.42µ ±  3%    10.21µ ±  3%        ~ (p\u003d0.065 n\u003d6)\nbench/sum_squared_k1_bf16_fp32_sse2/real_time       [256x256] 12.85µ ±  5%    12.31µ ±  3%   -4.20% (p\u003d0.004 n\u003d6)\nbench/sum_squared_kn_bf16_fp32_sse2/real_time       [256x256] 11.56µ ±  2%    11.25µ ±  4%        ~ (p\u003d0.240 n\u003d6)\nbench/sum_squared_k1_fp64/real_time                 [256x256] 24.13µ ±  3%    22.98µ ± 19%        ~ (p\u003d0.065 n\u003d6)\nbench/sum_squared_kn_fp64/real_time                 [256x256] 21.80µ ±  4%    20.33µ ± 16%        ~ (p\u003d0.065 n\u003d6)\nbench/sum_squared_k1_fp16_fp32/real_time            [256x256] 183.7µ ±  1%    181.1µ ±  2%        ~ (p\u003d0.093 n\u003d6)\nbench/sum_squared_kn_fp16_fp32/real_time            [256x256] 50.73µ ±  7%    47.96µ ±  2%   -5.45% (p\u003d0.002 n\u003d6)\ngeomean                                                       9.443µ          9.233µ         -2.22%\n```\nBut some kernels slower:\n```\nname                                                                time/op       time/op     vs base\nbench_reference/exp_float/m:1/n:4096/real_time                      53.03µ ±  1%   53.24µ ± 2%       ~ (p\u003d0.485 n\u003d6)\nbench_reference/exp_double/m:1/n:4096/real_time                     64.92µ ±  1%   65.43µ ± 2%       ~ (p\u003d0.180 n\u003d6)\nbench_reference/sigmoid_float/m:1/n:4096/real_time                  37.11µ ±  1%   37.41µ ± 2%       ~ (p\u003d0.394 n\u003d6)\nbench_reference/sigmoid_double/m:1/n:4096/real_time                 72.69µ ± 20%   72.44µ ± 1%       ~ (p\u003d0.699 n\u003d6)\nbench/exp_fp32_1x32_x86_avx512f_avx512bw/m:1/n:4096/real_time       1.844µ ±  1%   1.872µ ± 3%  +1.51% (p\u003d0.015 n\u003d6)\nbench/exp_fp64_1x16_x86_avx512f_avx512bw/m:1/n:4096/real_time       4.816µ ±  6%   4.743µ ± 2%       ~ (p\u003d0.394 n\u003d6)\nbench/sigmoid_fp32_1x32_x86_avx512f_avx512bw/m:1/n:4096/real_time   1.841µ ±  3%   1.827µ ± 1%       ~ (p\u003d0.394 n\u003d6)\nbench/sigmoid_fp64_1x16_x86_avx512f_avx512bw/m:1/n:4096/real_time   4.549µ ±  7%   4.619µ ± 3%       ~ (p\u003d0.093 n\u003d6)\nbench/exp_fp32_1x32_x86_avx2_fma3/m:1/n:4096/real_time              2.481µ ±  3%   2.575µ ± 3%  +3.77% (p\u003d0.002 n\u003d6)\nbench/exp_fp64_1x16_x86_avx2_fma3/m:1/n:4096/real_time              6.648µ ±  3%   6.834µ ± 3%  +2.80% (p\u003d0.026 n\u003d6)\nbench/exp_fp32_1x16_x86_avx2/m:1/n:4096/real_time                   3.777µ ±  2%   3.876µ ± 3%  +2.60% (p\u003d0.015 n\u003d6)\nbench/exp_fp64_1x16_x86_avx2/m:1/n:4096/real_time                   9.753µ ±  3%   9.919µ ± 3%       ~ (p\u003d0.180 n\u003d6)\nbench/sigmoid_fp32_1x16_x86_avx2/m:1/n:4096/real_time               4.010µ ±  3%   3.910µ ± 2%       ~ (p\u003d0.093 n\u003d6)\nbench/sigmoid_fp64_1x8_x86_avx2/m:1/n:4096/real_time                10.65µ ±  3%   10.66µ ± 2%       ~ (p\u003d0.589 n\u003d6)\nbench/exp_fp32_1x16_x86_sse2/m:1/n:4096/real_time                   5.544µ ±  1%   5.770µ ± 1%  +4.07% (p\u003d0.002 n\u003d6)\nbench/exp_fp64_1x8_x86_sse2/m:1/n:4096/real_time                    16.14µ ±  7%   16.53µ ± 3%       ~ (p\u003d0.310 n\u003d6)\nbench/sigmoid_fp32_1x32_x86_sse2/m:1/n:4096/real_time               5.792µ ±  4%   5.917µ ± 4%       ~ (p\u003d0.394 n\u003d6)\nbench/sigmoid_fp64_1x8_x86_sse2/m:1/n:4096/real_time                17.36µ ±  3%   17.72µ ± 2%       ~ (p\u003d0.132 n\u003d6)\ngeomean                                                             9.036µ         9.144µ       +1.19%\n```\n\nPiperOrigin-RevId: 918570419\n"
    },
    {
      "commit": "5a59a54af5e777acbb0c81a8b0d8bff8cc1d3464",
      "tree": "8b9f3bc749e9a76bc81f7b33546e255f08f560a5",
      "parents": [
        "bcc179a8b7b8c2cadefd52a48d75cb64887228d3"
      ],
      "author": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Wed May 20 00:15:12 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Wed May 20 00:16:30 2026"
      },
      "message": "Properly open source tensor api in github through copybara\n\nPiperOrigin-RevId: 918107140\n"
    },
    {
      "commit": "bcc179a8b7b8c2cadefd52a48d75cb64887228d3",
      "tree": "db88c4b261d43d2d71664361dd8105ce2c1d619b",
      "parents": [
        "0547829228c2bab8f58f468b54db0a8ec91ebc62"
      ],
      "author": {
        "name": "Dillon Sharlet",
        "email": "dsharlet@google.com",
        "time": "Wed May 20 00:01:00 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Wed May 20 00:01:56 2026"
      },
      "message": "Remove fp64 wasm support\n\nWe should never need this\n\nPiperOrigin-RevId: 918101335\n"
    },
    {
      "commit": "0547829228c2bab8f58f468b54db0a8ec91ebc62",
      "tree": "72c762fb6fbea18567f12fd0677b3133de2a2aaa",
      "parents": [
        "cc68da84bd4e062a79600b82109ea28bd1553e49"
      ],
      "author": {
        "name": "Dillon Sharlet",
        "email": "dsharlet@google.com",
        "time": "Tue May 19 23:21:38 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Tue May 19 23:22:12 2026"
      },
      "message": "Remove lo/hi as member functions of `vec\u003cT, N\u003e`\n\nI think this is necessary as a step towards attempting to implement conditions and `select`, where the mask types might be things like `__mmask8`.\n\nPiperOrigin-RevId: 918084846\n"
    },
    {
      "commit": "cc68da84bd4e062a79600b82109ea28bd1553e49",
      "tree": "cc9b3f6473c89abb9c28b63f93e0b4f288b97b1a",
      "parents": [
        "4b53a435774a5ec388621aa594b7c60c2e129b02"
      ],
      "author": {
        "name": "Dillon Sharlet",
        "email": "dsharlet@google.com",
        "time": "Tue May 19 23:14:22 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Tue May 19 23:15:11 2026"
      },
      "message": "Add sigmoid_fp64 kernels\n\nAnd rewrite the sigmoid_fp32 kernel using the same technique.\n\nIt turns out that this kernel is faster, which is a little surprising. It does have less \"overhead\" (special cases, piecewise branches, etc.) in exchange for more polynomial arithmetic.\n\nChange in performance for `sigmoid_fp32`:\n\n```\nname                                                                time/op        time/op     vs base\nbench/sigmoid_fp32_1x32_x86_avx512f_avx512bw/m:1/n:4096/real_time   1.759µ ± 3%   1.771µ ± 21%        ~ (p\u003d0.818 n\u003d6)\nbench/sigmoid_fp32_1x32_x86_avx512f_avx512bw/m:4/n:1024/real_time   1.740µ ± 3%   1.774µ ±  3%        ~ (p\u003d0.065 n\u003d6)\nbench/sigmoid_fp32_1x32_x86_avx512f_avx512bw/m:16/n:256/real_time   1.773µ ± 2%   1.783µ ±  1%        ~ (p\u003d0.699 n\u003d6)\nbench/sigmoid_fp32_1x16_x86_avx2/m:1/n:4096/real_time               4.909µ ± 1%   3.670µ ±  2%  -25.24% (p\u003d0.002 n\u003d6)\nbench/sigmoid_fp32_1x16_x86_avx2/m:4/n:1024/real_time               4.830µ ± 2%   3.706µ ±  4%  -23.28% (p\u003d0.002 n\u003d6)\nbench/sigmoid_fp32_1x16_x86_avx2/m:16/n:256/real_time               4.912µ ± 1%   3.740µ ±  2%  -23.87% (p\u003d0.002 n\u003d6)\nbench/sigmoid_fp32_1x32_x86_sse2/m:1/n:4096/real_time               6.632µ ± 3%   5.437µ ±  2%  -18.02% (p\u003d0.002 n\u003d6)\nbench/sigmoid_fp32_1x32_x86_sse2/m:4/n:1024/real_time               6.637µ ± 3%   5.524µ ±  3%  -16.77% (p\u003d0.002 n\u003d6)\nbench/sigmoid_fp32_1x32_x86_sse2/m:16/n:256/real_time               6.692µ ± 4%   5.493µ ±  1%  -17.92% (p\u003d0.002 n\u003d6)\ngeomean                                                             3.851µ        3.305µ        -14.19%\n```\n\n`sigmoid_fp64` compared to other kernels:\n```\n----------------------------------------------------------------------------------------------------------------------------\nBenchmark                                                                  Time             CPU   Iterations UserCounters...\n----------------------------------------------------------------------------------------------------------------------------\nbench_reference/sigmoid_float/m:1/n:4096/real_time                     38579 ns        38571 ns         7348 Bytes\u003d849.382M/s Op\u003d106.173M/s\nbench_reference/sigmoid_float/m:4/n:1024/real_time                     38345 ns        38338 ns         7440 Bytes\u003d854.556M/s Op\u003d106.819M/s\nbench_reference/sigmoid_float/m:16/n:256/real_time                     39192 ns        39187 ns         7190 Bytes\u003d836.095M/s Op\u003d104.512M/s\nbench_reference/sigmoid_double/m:1/n:4096/real_time                    91326 ns        91313 ns         3045 Bytes\u003d717.606M/s Op\u003d44.8504M/s\nbench_reference/sigmoid_double/m:4/n:1024/real_time                    91307 ns        91290 ns         3043 Bytes\u003d717.757M/s Op\u003d44.8598M/s\nbench_reference/sigmoid_double/m:16/n:256/real_time                    93505 ns        93486 ns         3018 Bytes\u003d700.885M/s Op\u003d43.8053M/s\nbench/sigmoid_fp32_1x32_x86_avx512f_avx512bw/m:1/n:4096/real_time       1786 ns         1786 ns       155658 Bytes\u003d18.3422G/s Op\u003d2.29277G/s\nbench/sigmoid_fp32_1x32_x86_avx512f_avx512bw/m:4/n:1024/real_time       1802 ns         1802 ns       157599 Bytes\u003d18.1847G/s Op\u003d2.27309G/s\nbench/sigmoid_fp32_1x32_x86_avx512f_avx512bw/m:16/n:256/real_time       1791 ns         1791 ns       156134 Bytes\u003d18.2963G/s Op\u003d2.28704G/s\nbench/sigmoid_fp64_1x16_x86_avx512f_avx512bw/m:1/n:4096/real_time       4475 ns         4475 ns        60425 Bytes\u003d14.6433G/s Op\u003d915.207M/s\nbench/sigmoid_fp64_1x16_x86_avx512f_avx512bw/m:4/n:1024/real_time       4822 ns         4821 ns        59593 Bytes\u003d13.5913G/s Op\u003d849.459M/s\nbench/sigmoid_fp64_1x16_x86_avx512f_avx512bw/m:16/n:256/real_time       4842 ns         4840 ns        56596 Bytes\u003d13.5363G/s Op\u003d846.016M/s\nbench/sigmoid_fp32_1x16_x86_avx2/m:1/n:4096/real_time                   3789 ns         3788 ns        69486 Bytes\u003d8.64752G/s Op\u003d1.08094G/s\nbench/sigmoid_fp32_1x16_x86_avx2/m:4/n:1024/real_time                   3892 ns         3892 ns        74142 Bytes\u003d8.41825G/s Op\u003d1.05228G/s\nbench/sigmoid_fp32_1x16_x86_avx2/m:16/n:256/real_time                   3757 ns         3756 ns        72827 Bytes\u003d8.72073G/s Op\u003d1.09009G/s\nbench/sigmoid_fp64_1x8_x86_avx2/m:1/n:4096/real_time                   10451 ns        10450 ns        26516 Bytes\u003d6.27103G/s Op\u003d391.939M/s\nbench/sigmoid_fp64_1x8_x86_avx2/m:4/n:1024/real_time                   11010 ns        11007 ns        24451 Bytes\u003d5.95261G/s Op\u003d372.038M/s\nbench/sigmoid_fp64_1x8_x86_avx2/m:16/n:256/real_time                   10475 ns        10472 ns        26374 Bytes\u003d6.2567G/s Op\u003d391.044M/s\nbench/sigmoid_fp32_1x32_x86_sse2/m:1/n:4096/real_time                   5649 ns         5648 ns        49675 Bytes\u003d5.80048G/s Op\u003d725.06M/s\nbench/sigmoid_fp32_1x32_x86_sse2/m:4/n:1024/real_time                   5646 ns         5645 ns        50916 Bytes\u003d5.80353G/s Op\u003d725.441M/s\nbench/sigmoid_fp32_1x32_x86_sse2/m:16/n:256/real_time                   5571 ns         5571 ns        48792 Bytes\u003d5.88151G/s Op\u003d735.188M/s\nbench/sigmoid_fp64_1x8_x86_sse2/m:1/n:4096/real_time                   15957 ns        15952 ns        17116 Bytes\u003d4.10712G/s Op\u003d256.695M/s\nbench/sigmoid_fp64_1x8_x86_sse2/m:4/n:1024/real_time                   15657 ns        15654 ns        17451 Bytes\u003d4.18581G/s Op\u003d261.613M/s\nbench/sigmoid_fp64_1x8_x86_sse2/m:16/n:256/real_time                   15748 ns        15744 ns        17804 Bytes\u003d4.16163G/s Op\u003d260.102M/s\n```\n\nPiperOrigin-RevId: 918081537\n"
    },
    {
      "commit": "4b53a435774a5ec388621aa594b7c60c2e129b02",
      "tree": "766ec8b1ce83eee127182e28288345d92cb56898",
      "parents": [
        "0741ac549c8f6357a8e311ffee2cee234a6da7cf"
      ],
      "author": {
        "name": "Volodymyr Kysenko",
        "email": "vksnk@google.com",
        "time": "Tue May 19 21:41:49 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Tue May 19 21:42:21 2026"
      },
      "message": "Prevent scheduling of ki/ko loops in packing.\n\nIn order to keep rest of the loops fused I added \"identity\" splits where step \u003d\u003d extent, as a side effect now the packed buffer is stored inside of the loop.\n\nPiperOrigin-RevId: 918036935\n"
    },
    {
      "commit": "0741ac549c8f6357a8e311ffee2cee234a6da7cf",
      "tree": "14b17258fa9a1501f96207d7afd54d79dd11ec33",
      "parents": [
        "ce14e188cd02a9726c00970babc087a3be6de942"
      ],
      "author": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Tue May 19 21:28:13 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Tue May 19 21:29:03 2026"
      },
      "message": "Open source Tensor API in google-ai-edge/LiteRT\n\nPiperOrigin-RevId: 918030392\n"
    },
    {
      "commit": "ce14e188cd02a9726c00970babc087a3be6de942",
      "tree": "a3a36647db738550cf888f0cf91c2d8d59f0ff93",
      "parents": [
        "d0004f80c78fed80c230045ee83ff34dc55be81a"
      ],
      "author": {
        "name": "Volodymyr Kysenko",
        "email": "vksnk@google.com",
        "time": "Tue May 19 18:25:32 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Tue May 19 18:26:41 2026"
      },
      "message": "Adjust bounds for elementwise unary kernels with sub-byte inputs.\n\nPiperOrigin-RevId: 917935792\n"
    },
    {
      "commit": "d0004f80c78fed80c230045ee83ff34dc55be81a",
      "tree": "0d15f93c0ae44f1d4eae1de180fa9d014e4dd832",
      "parents": [
        "adf9795c009f3d3190358c7df773dfa7180f694f"
      ],
      "author": {
        "name": "Pedro Gonnet",
        "email": "gonnet@google.com",
        "time": "Tue May 19 13:22:40 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Tue May 19 13:23:28 2026"
      },
      "message": "Add `xnn_datatype_qint2` for tensorwise quantized 2-bit values.\n\nPiperOrigin-RevId: 917797252\n"
    },
    {
      "commit": "adf9795c009f3d3190358c7df773dfa7180f694f",
      "tree": "ba4cdaafc4f7f76e925eb328ad5a1404cbdf3173",
      "parents": [
        "ea4434133af5537f58c2616d6d939e82966a7f53"
      ],
      "author": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Tue May 19 08:37:33 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Tue May 19 08:38:15 2026"
      },
      "message": "Split dot operation on K.\n\nCurrently split_k is the length of k so we don\u0027t expect major performance differences.\n\nSample IR for `f32[110,10240,2560]`.\n\nBefore:\n```\nnull \u003d constant_buffer([], 0, {}) {\n v3 \u003d allocate(automatic, 4, {\n   {},\n   {[0, 15], 4, \u003c\u003e},\n   {[0, 10239], 64, \u003c\u003e},\n   {[(buffer_min(out2, 0) / 16), (buffer_max(out2, 0) / 16)], \u003c\u003e, \u003c\u003e}\n }) {\n  out2.d0 \u003d loop(parallel, [buffer_min(out2, 0), buffer_max(out2, 0)], 32) {\n   closure {in0, in1, out2, v3, null, out2.d0} in {\n    v3.out2.d0 \u003d crop_dim(v3, 3, [(out2.d0 / 16), ((out2.d0 / 16) + 1)]) {\n     call(pack_b, {in1}, {v3.out2.d0}, {})\n    }\n    out2.out2.d0 \u003d crop_dim(out2, 0, [out2.d0, (out2.d0 + 31)]) {\n     call(dot num_k_dims\u003d1, {in0, v3, null}, {out2.out2.d0}, {})\n    }\n   }\n  }\n }\n}\n```\n\nAfter where k_split\u003d1024.\n```\nnull \u003d constant_buffer([], 0, {}) {\n reduction \u003d allocate(automatic, 0, {\n   {[0, 10239], 0, \u003c\u003e}\n }) {\n  k0#0 \u003d loop(serial, [0, 10239], 1024) {\n   reduction \u003d crop_dim(reduction, 0, [k0#0, (k0#0 + 1023)]) {\n    v3 \u003d allocate(automatic, 4, {\n      {},\n      {[0, 15], 4, \u003c\u003e},\n      {[k0#0, buffer_max(reduction, 0)], 64, \u003c\u003e},\n      {[(buffer_min(out2, 0) / 16), (buffer_max(out2, 0) / 16)], \u003c\u003e, \u003c\u003e}\n    }) {\n     d0#0 \u003d loop(parallel, [buffer_min(out2, 0), buffer_max(out2, 0)], 32) {\n      closure {in0, in1, out2, v3, null, reduction, d0#0} in {\n       v3.d0#0 \u003d crop_dim(v3, 3, [(d0#0 / 16), ((d0#0 / 16) + 1)]) {\n        call(pack_b, {in1}, {v3.d0#0}, {})\n       }\n       out2.d0#0 \u003d crop_dim(out2, 0, [d0#0, (d0#0 + 31)]) {\n        call(dot num_k_dims\u003d1, {in0, v3, null}, {out2.d0#0, reduction}, {})\n       }\n      }\n     }\n    }\n   }\n  }\n }\n}\n```\nPiperOrigin-RevId: 917686168\n"
    },
    {
      "commit": "ea4434133af5537f58c2616d6d939e82966a7f53",
      "tree": "fa70567e65f794cb3346d4a0880f5e7c418a43f7",
      "parents": [
        "7e3c789b2523ddc0f567019c3fa4c96fe4f6697d"
      ],
      "author": {
        "name": "Marie White",
        "email": "mariewhite@google.com",
        "time": "Tue May 19 06:14:39 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Tue May 19 06:15:36 2026"
      },
      "message": "Split dot operation on K.\n\nCurrently split_k is the length of k so we don\u0027t expect major performance differences.\n\nSample IR for `f32[110,10240,2560]`.\n\nBefore:\n```\nnull \u003d constant_buffer([], 0, {}) {\n v3 \u003d allocate(automatic, 4, {\n   {},\n   {[0, 15], 4, \u003c\u003e},\n   {[0, 10239], 64, \u003c\u003e},\n   {[(buffer_min(out2, 0) / 16), (buffer_max(out2, 0) / 16)], \u003c\u003e, \u003c\u003e}\n }) {\n  out2.d0 \u003d loop(parallel, [buffer_min(out2, 0), buffer_max(out2, 0)], 32) {\n   closure {in0, in1, out2, v3, null, out2.d0} in {\n    v3.out2.d0 \u003d crop_dim(v3, 3, [(out2.d0 / 16), ((out2.d0 / 16) + 1)]) {\n     call(pack_b, {in1}, {v3.out2.d0}, {})\n    }\n    out2.out2.d0 \u003d crop_dim(out2, 0, [out2.d0, (out2.d0 + 31)]) {\n     call(dot num_k_dims\u003d1, {in0, v3, null}, {out2.out2.d0}, {})\n    }\n   }\n  }\n }\n}\n```\n\nAfter where k_split\u003d1024.\n```\nnull \u003d constant_buffer([], 0, {}) {\n reduction \u003d allocate(automatic, 0, {\n   {[0, 10239], 0, \u003c\u003e}\n }) {\n  k0#0 \u003d loop(serial, [0, 10239], 1024) {\n   reduction \u003d crop_dim(reduction, 0, [k0#0, (k0#0 + 1023)]) {\n    v3 \u003d allocate(automatic, 4, {\n      {},\n      {[0, 15], 4, \u003c\u003e},\n      {[k0#0, buffer_max(reduction, 0)], 64, \u003c\u003e},\n      {[(buffer_min(out2, 0) / 16), (buffer_max(out2, 0) / 16)], \u003c\u003e, \u003c\u003e}\n    }) {\n     d0#0 \u003d loop(parallel, [buffer_min(out2, 0), buffer_max(out2, 0)], 32) {\n      closure {in0, in1, out2, v3, null, reduction, d0#0} in {\n       v3.d0#0 \u003d crop_dim(v3, 3, [(d0#0 / 16), ((d0#0 / 16) + 1)]) {\n        call(pack_b, {in1}, {v3.d0#0}, {})\n       }\n       out2.d0#0 \u003d crop_dim(out2, 0, [d0#0, (d0#0 + 31)]) {\n        call(dot num_k_dims\u003d1, {in0, v3, null}, {out2.d0#0, reduction}, {})\n       }\n      }\n     }\n    }\n   }\n  }\n }\n}\n```\nPiperOrigin-RevId: 917625813\n"
    },
    {
      "commit": "7e3c789b2523ddc0f567019c3fa4c96fe4f6697d",
      "tree": "ba4cdaafc4f7f76e925eb328ad5a1404cbdf3173",
      "parents": [
        "7333afbd475d2f99c1af84512fff382271758a43"
      ],
      "author": {
        "name": "Dillon Sharlet",
        "email": "dsharlet@google.com",
        "time": "Tue May 19 05:44:27 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Tue May 19 05:45:34 2026"
      },
      "message": "Add tanh_fp64 kernels\n\nThese are not direct approximations like most other transcendental ops. Instead it uses our `exp_fp64` polynomials to implement the textbook definition of `(e^x - e^-x) / (e^x + e^-x)`, with some minor refactoring and optimization to group and reuse like terms.\n\nI also tried to improve the `tanh_fp32` kernels. This new approximation is not clearly better (it has the same error as before), but we should switch to it anyways because it is reproducible (via the new cell in rational_approximation.py).\n\nBenchmarks:\n\n```\n-------------------------------------------------------------------------------------------------------------------------\nBenchmark                                                               Time             CPU   Iterations UserCounters...\n-------------------------------------------------------------------------------------------------------------------------\nbench_reference/tanh_float/m:1/n:4096/real_time                     43571 ns        43563 ns         6383 Bytes\u003d752.055M/s Op\u003d94.0068M/s\nbench_reference/tanh_float/m:4/n:1024/real_time                     43129 ns        43123 ns         6590 Bytes\u003d759.763M/s Op\u003d94.9704M/s\nbench_reference/tanh_float/m:16/n:256/real_time                     43515 ns        43506 ns         6420 Bytes\u003d753.033M/s Op\u003d94.1291M/s\nbench_reference/tanh_double/m:1/n:4096/real_time                   119486 ns       119465 ns         2175 Bytes\u003d548.482M/s Op\u003d34.2801M/s\nbench_reference/tanh_double/m:4/n:1024/real_time                   117688 ns       117657 ns         2364 Bytes\u003d556.861M/s Op\u003d34.8038M/s\nbench_reference/tanh_double/m:16/n:256/real_time                   117342 ns       117319 ns         2364 Bytes\u003d558.503M/s Op\u003d34.9064M/s\nbench/tanh_fp32_1x32_x86_avx512f_avx512bw/m:1/n:4096/real_time       1650 ns         1650 ns       153859 Bytes\u003d19.8583G/s Op\u003d2.48229G/s\nbench/tanh_fp32_1x32_x86_avx512f_avx512bw/m:4/n:1024/real_time       1619 ns         1619 ns       168542 Bytes\u003d20.2341G/s Op\u003d2.52927G/s\nbench/tanh_fp32_1x32_x86_avx512f_avx512bw/m:16/n:256/real_time       1623 ns         1622 ns       171671 Bytes\u003d20.1957G/s Op\u003d2.52446G/s\nbench/tanh_fp64_1x16_x86_avx512f_avx512bw/m:1/n:4096/real_time       7737 ns         7734 ns        37410 Bytes\u003d8.47083G/s Op\u003d529.427M/s\nbench/tanh_fp64_1x16_x86_avx512f_avx512bw/m:4/n:1024/real_time       8076 ns         8074 ns        34535 Bytes\u003d8.11451G/s Op\u003d507.157M/s\nbench/tanh_fp64_1x16_x86_avx512f_avx512bw/m:16/n:256/real_time       7981 ns         7980 ns        36502 Bytes\u003d8.21114G/s Op\u003d513.196M/s\nbench/tanh_fp32_1x32_x86_avx/m:1/n:4096/real_time                    3209 ns         3209 ns        85684 Bytes\u003d10.2099G/s Op\u003d1.27624G/s\nbench/tanh_fp32_1x32_x86_avx/m:4/n:1024/real_time                    3284 ns         3284 ns        84044 Bytes\u003d9.97765G/s Op\u003d1.24721G/s\nbench/tanh_fp32_1x32_x86_avx/m:16/n:256/real_time                    3274 ns         3273 ns        84131 Bytes\u003d10.0089G/s Op\u003d1.25111G/s\nbench/tanh_fp64_1x16_x86_avx/m:1/n:4096/real_time                   12061 ns        12059 ns        23596 Bytes\u003d5.43361G/s Op\u003d339.6M/s\nbench/tanh_fp64_1x16_x86_avx/m:4/n:1024/real_time                   12361 ns        12358 ns        22641 Bytes\u003d5.30194G/s Op\u003d331.371M/s\nbench/tanh_fp64_1x16_x86_avx/m:16/n:256/real_time                   12167 ns        12164 ns        22833 Bytes\u003d5.38646G/s Op\u003d336.654M/s\nbench/tanh_fp32_1x16_x86_sse2/m:1/n:4096/real_time                   5526 ns         5525 ns        49327 Bytes\u003d5.92943G/s Op\u003d741.179M/s\nbench/tanh_fp32_1x16_x86_sse2/m:4/n:1024/real_time                   5536 ns         5534 ns        48796 Bytes\u003d5.9192G/s Op\u003d739.9M/s\nbench/tanh_fp32_1x16_x86_sse2/m:16/n:256/real_time                   5585 ns         5583 ns        50897 Bytes\u003d5.86748G/s Op\u003d733.435M/s\nbench/tanh_fp64_1x8_x86_sse2/m:1/n:4096/real_time                   20497 ns        20495 ns        14192 Bytes\u003d3.19736G/s Op\u003d199.835M/s\nbench/tanh_fp64_1x8_x86_sse2/m:4/n:1024/real_time                   19400 ns        19397 ns        14340 Bytes\u003d3.37812G/s Op\u003d211.132M/s\nbench/tanh_fp64_1x8_x86_sse2/m:16/n:256/real_time                   20904 ns        20902 ns        14394 Bytes\u003d3.13508G/s Op\u003d195.943M/s\n```\n\nPiperOrigin-RevId: 917613592\n"
    },
    {
      "commit": "7333afbd475d2f99c1af84512fff382271758a43",
      "tree": "55417041aa29039d79e329e6149c5ecc97ebd648",
      "parents": [
        "5191fee1a7b2aaa6f9f37790321b6902bf5cbdf6"
      ],
      "author": {
        "name": "Dillon Sharlet",
        "email": "dsharlet@google.com",
        "time": "Tue May 19 03:24:51 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Tue May 19 03:25:31 2026"
      },
      "message": "Optimize `floor_log2` for fp64 for non-AVX512 targets\n\nPreviously, `floor_log2` called the scalar `floor_log2`, which was quite slow.\n\n```\nname                                                            time/op       time/op     vs base\nbench/log_fp64_1x16_x86_avx512f_avx512bw/m:1/n:4096/real_time   4.845µ ±  9%   4.776µ ± 7%        ~ (p\u003d0.240 n\u003d6)\nbench/log_fp64_1x16_x86_avx512f_avx512bw/m:4/n:1024/real_time   4.867µ ±  4%   4.841µ ± 2%        ~ (p\u003d0.240 n\u003d6)\nbench/log_fp64_1x16_x86_avx512f_avx512bw/m:16/n:256/real_time   4.982µ ± 11%   4.797µ ± 4%   -3.70% (p\u003d0.041 n\u003d6)\nbench/log_fp64_1x16_x86_avx2/m:1/n:4096/real_time               45.13µ ±  3%   13.87µ ± 4%  -69.27% (p\u003d0.002 n\u003d6)\nbench/log_fp64_1x16_x86_avx2/m:4/n:1024/real_time               45.07µ ±  9%   14.00µ ± 7%  -68.93% (p\u003d0.002 n\u003d6)\nbench/log_fp64_1x16_x86_avx2/m:16/n:256/real_time               45.20µ ±  9%   14.01µ ± 8%  -69.00% (p\u003d0.002 n\u003d6)\nbench/log_fp64_1x4_x86_sse2/m:1/n:4096/real_time                46.31µ ±  6%   25.67µ ± 6%  -44.56% (p\u003d0.002 n\u003d6)\nbench/log_fp64_1x4_x86_sse2/m:4/n:1024/real_time                46.15µ ±  2%   25.71µ ± 2%  -44.30% (p\u003d0.002 n\u003d6)\nbench/log_fp64_1x4_x86_sse2/m:16/n:256/real_time                46.13µ ±  1%   25.80µ ± 2%  -44.07% (p\u003d0.002 n\u003d6)\ngeomean                                                         21.69µ         12.00µ       -44.71%\n```\n\nMostly AI generated, here is what it had to say:\n\n✦ I have replaced the scalar floor_log2 implementations for doubles (f64x2, f64x4, etc.) with SIMD versions in the ynnpack library. This was done for ARM NEON (ARM64), x86 SSE2/AVX2, and WASM SIMD128 architectures, using the same bit-manipulation technique as the existing float (f32x4) implementations.\n\n  Key changes:\n   - base/simd/arm_neon_base.h: Implemented f64x2 floor_log2 using vshrq_n_s64 and bitwise operations.\n   - base/simd/x86_sse2.h: Implemented f64x2 floor_log2 using _mm_srai_epi32 on high 32-bit elements to emulate 64-bit arithmetic shift for the exponent bits.\n   - base/simd/x86_avx2.h: Updated f64x2 floor_log2 to use the same SIMD implementation as SSE2.\n   - base/simd/wasm_simd128.h: Implemented f64x2 floor_log2 using wasm_i64x2_shr and bitwise operations.\n\nPiperOrigin-RevId: 917567607\n"
    },
    {
      "commit": "5191fee1a7b2aaa6f9f37790321b6902bf5cbdf6",
      "tree": "2a71eb22e9b5392d8c4b0d3d2fbc9bd0f777daa2",
      "parents": [
        "7ef5fdca247dfc94e9ac8a93fac74e558565efac"
      ],
      "author": {
        "name": "Dillon Sharlet",
        "email": "dsharlet@google.com",
        "time": "Tue May 19 01:16:58 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Tue May 19 01:17:51 2026"
      },
      "message": "Change tree reduction factor from 32 to 16, and add another level\n\nThis also fixes a bug where we clocked the accumulation mechanism based on the number of input values instead of accumulators in k1 reductions. This was inconsistent with kn reductions, which always have K \u003d 1, and also was excessive accumulation for bf16 and fp16 (which widen to fp32, which gives a lot of headroom on the precision).\n\nThis should make reductions more accurate. Performance is a mixed bag, it improves in some cases and regresses in others:\n```\nname                                                          sec/op         sec/op      vs base\nbench/sum_k1_bf16_fp32_avx512/real_time             [256x256]  5.176µ ±  4%    5.395µ ± 12%   +4.23% (p\u003d0.009 n\u003d6)\nbench/sum_kn_bf16_fp32_avx512/real_time             [256x256]  4.794µ ±  4%    4.793µ ± 29%        ~ (p\u003d0.818 n\u003d6)\nbench/sum_k1_fp16_fp32_avx512/real_time             [256x256]  5.593µ ±  5%    4.610µ ±  4%  -17.58% (p\u003d0.002 n\u003d6)\nbench/sum_kn_fp16_fp32_avx512/real_time             [256x256]  4.141µ ±  3%    4.302µ ± 29%   +3.87% (p\u003d0.026 n\u003d6)\nbench/sum_k1_fp32_avx512/real_time                  [256x256]  5.257µ ±  3%    4.325µ ±  5%  -17.73% (p\u003d0.002 n\u003d6)\nbench/sum_kn_fp32_avx512/real_time                  [256x256]  4.228µ ±  4%    4.624µ ± 22%   +9.36% (p\u003d0.002 n\u003d6)\nbench/sum_k1_fp64_avx512/real_time                  [256x256]  6.964µ ±  3%    6.799µ ±  5%        ~ (p\u003d0.065 n\u003d6)\nbench/sum_kn_fp64_avx512/real_time                  [256x256]  8.662µ ± 20%    9.734µ ± 32%        ~ (p\u003d0.132 n\u003d6)\nbench/sum_k1_bf16_fp32_avx2/real_time               [256x256]  6.199µ ±  7%    6.726µ ±  2%   +8.50% (p\u003d0.009 n\u003d6)\nbench/sum_kn_bf16_fp32_avx2/real_time               [256x256]  6.279µ ±  3%    6.304µ ±  3%        ~ (p\u003d0.937 n\u003d6)\nbench/sum_k1_fp32_avx/real_time                     [256x256]  5.431µ ± 10%    4.540µ ±  7%  -16.41% (p\u003d0.002 n\u003d6)\nbench/sum_kn_fp32_avx/real_time                     [256x256]  5.343µ ±  8%    5.474µ ±  6%        ~ (p\u003d1.000 n\u003d6)\nbench/sum_k1_fp64_avx/real_time                     [256x256]  8.136µ ±  4%    8.021µ ±  3%        ~ (p\u003d0.589 n\u003d6)\nbench/sum_kn_fp64_avx/real_time                     [256x256]  10.85µ ± 14%    10.87µ ± 14%        ~ (p\u003d0.818 n\u003d6)\nbench/sum_k1_fp16_fp32_f16c/real_time               [256x256]  6.826µ ±  2%    7.002µ ±  4%   +2.57% (p\u003d0.026 n\u003d6)\nbench/sum_kn_fp16_fp32_f16c/real_time               [256x256]  5.118µ ±  2%    5.160µ ±  2%        ~ (p\u003d0.240 n\u003d6)\nbench/sum_k1_fp32_sse2/real_time                    [256x256] 10.113µ ±  2%    8.407µ ±  6%  -16.88% (p\u003d0.002 n\u003d6)\nbench/sum_kn_fp32_sse2/real_time                    [256x256]  7.663µ ± 16%    7.846µ ± 21%        ~ (p\u003d0.132 n\u003d6)\nbench/sum_k1_bf16_fp32_sse2/real_time               [256x256]  8.770µ ± 10%   11.084µ ±  3%  +26.38% (p\u003d0.002 n\u003d6)\nbench/sum_kn_bf16_fp32_sse2/real_time               [256x256]  9.860µ ±  1%    9.982µ ±  2%   +1.24% (p\u003d0.026 n\u003d6)\nbench/sum_k1_fp64/real_time                         [256x256]  15.39µ ±  2%    18.08µ ±  1%  +17.46% (p\u003d0.002 n\u003d6)\nbench/sum_kn_fp64/real_time                         [256x256]  16.77µ ±  4%    16.77µ ± 10%        ~ (p\u003d0.937 n\u003d6)\nbench/sum_k1_fp16_fp32/real_time                    [256x256]  174.5µ ±  8%    173.9µ ± 18%        ~ (p\u003d0.589 n\u003d6)\nbench/sum_kn_fp16_fp32/real_time                    [256x256]  45.37µ ±  2%    45.38µ ±  3%        ~ (p\u003d0.699 n\u003d6)\nbench/sum_squared_k1_bf16_fp32_avx512/real_time     [256x256]  5.076µ ±  4%    5.480µ ±  2%   +7.97% (p\u003d0.002 n\u003d6)\nbench/sum_squared_kn_bf16_fp32_avx512/real_time     [256x256]  4.825µ ±  3%    4.783µ ± 30%        ~ (p\u003d0.937 n\u003d6)\nbench/sum_squared_k1_fp16_fp32_avx512/real_time     [256x256]  5.660µ ± 10%    4.519µ ±  5%  -20.17% (p\u003d0.002 n\u003d6)\nbench/sum_squared_kn_fp16_fp32_avx512/real_time     [256x256]  4.114µ ±  6%    4.250µ ± 29%        ~ (p\u003d0.132 n\u003d6)\nbench/sum_squared_k1_fp32_avx512/real_time          [256x256]  6.666µ ±  3%    5.932µ ± 10%  -11.01% (p\u003d0.009 n\u003d6)\nbench/sum_squared_kn_fp32_avx512/real_time          [256x256]  4.536µ ±  4%    5.101µ ± 30%  +12.45% (p\u003d0.009 n\u003d6)\nbench/sum_squared_k1_fp64_avx512/real_time          [256x256]  9.204µ ±  5%    9.625µ ± 10%        ~ (p\u003d0.485 n\u003d6)\nbench/sum_squared_kn_fp64_avx512/real_time          [256x256]  9.599µ ± 11%   11.265µ ± 19%  +17.36% (p\u003d0.009 n\u003d6)\nbench/sum_squared_k1_bf16_fp32_avx2/real_time       [256x256]  7.023µ ±  5%    8.295µ ±  6%  +18.12% (p\u003d0.002 n\u003d6)\nbench/sum_squared_kn_bf16_fp32_avx2/real_time       [256x256]  7.110µ ±  2%    7.484µ ±  4%   +5.26% (p\u003d0.002 n\u003d6)\nbench/sum_squared_k1_fp32_avx/real_time             [256x256]  7.916µ ±  5%    7.010µ ±  4%  -11.46% (p\u003d0.002 n\u003d6)\nbench/sum_squared_kn_fp32_avx/real_time             [256x256]  5.849µ ±  5%    6.309µ ±  5%   +7.87% (p\u003d0.002 n\u003d6)\nbench/sum_squared_k1_fp64_avx/real_time             [256x256]  11.70µ ±  8%    12.27µ ±  4%   +4.90% (p\u003d0.041 n\u003d6)\nbench/sum_squared_kn_fp64_avx/real_time             [256x256]  11.86µ ±  5%    12.42µ ± 13%   +4.69% (p\u003d0.015 n\u003d6)\nbench/sum_squared_k1_fp16_fp32_f16c/real_time       [256x256]  8.402µ ±  4%    7.321µ ±  2%  -12.86% (p\u003d0.002 n\u003d6)\nbench/sum_squared_kn_fp16_fp32_f16c/real_time       [256x256]  6.290µ ±  3%    6.410µ ±  3%        ~ (p\u003d0.132 n\u003d6)\nbench/sum_squared_k1_fp32_sse2/real_time            [256x256]  11.63µ ± 10%    11.16µ ±  2%   -4.03% (p\u003d0.002 n\u003d6)\nbench/sum_squared_kn_fp32_sse2/real_time            [256x256]  9.218µ ± 12%   10.755µ ±  9%  +16.67% (p\u003d0.002 n\u003d6)\nbench/sum_squared_k1_bf16_fp32_sse2/real_time       [256x256]  10.70µ ±  2%    13.09µ ±  3%  +22.38% (p\u003d0.002 n\u003d6)\nbench/sum_squared_kn_bf16_fp32_sse2/real_time       [256x256]  11.27µ ± 16%    11.57µ ±  2%        ~ (p\u003d0.132 n\u003d6)\nbench/sum_squared_k1_fp64/real_time                 [256x256]  24.54µ ±  4%    24.17µ ± 21%        ~ (p\u003d0.485 n\u003d6)\nbench/sum_squared_kn_fp64/real_time                 [256x256]  21.43µ ±  4%    22.22µ ±  5%   +3.66% (p\u003d0.026 n\u003d6)\nbench/sum_squared_k1_fp16_fp32/real_time            [256x256]  185.7µ ± 15%    184.9µ ±  3%        ~ (p\u003d0.937 n\u003d6)\nbench/sum_squared_kn_fp16_fp32/real_time            [256x256]  48.95µ ±  2%    50.72µ ±  3%   +3.60% (p\u003d0.009 n\u003d6)\ngeomean                                                       9.426µ          9.563µ         +1.44%\n```\n\nPiperOrigin-RevId: 917527134\n"
    },
    {
      "commit": "7ef5fdca247dfc94e9ac8a93fac74e558565efac",
      "tree": "6a671dd9f0cd17a5d2a7cea8838863f443d8fb28",
      "parents": [
        "56ac34b3f45fae2eca1f32584f7f0b279be2cf1f"
      ],
      "author": {
        "name": "Dillon Sharlet",
        "email": "dsharlet@google.com",
        "time": "Mon May 18 20:05:14 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Mon May 18 20:06:19 2026"
      },
      "message": "Add `round_to_bf16`\n\n- Add kernels for `round_to_bf16(x) \u003d convert(x_type, convert(bf16, x))`\n- Instead of abandoning fusion of `convert(x_type, convert(bf16, x))` when we can\u0027t change the numerics, replace the two operations with `round_to_bf16(x)` instead.\n\nPiperOrigin-RevId: 917386285\n"
    },
    {
      "commit": "56ac34b3f45fae2eca1f32584f7f0b279be2cf1f",
      "tree": "9320610ae4e923cc0317969a9504e5ea837116fa",
      "parents": [
        "f873466b2dab32b907f912ff7e63d1e77435572b"
      ],
      "author": {
        "name": "Quentin Khan",
        "email": "qkhan@google.com",
        "time": "Mon May 18 19:38:30 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Mon May 18 19:39:22 2026"
      },
      "message": "Add a graph rewrite to fallback to fp32 when fp16 isn\u0027t supported.\n\nPiperOrigin-RevId: 917373652\n"
    },
    {
      "commit": "f873466b2dab32b907f912ff7e63d1e77435572b",
      "tree": "86537c30cfdcb0ee8cdb9120ee3ac7021d1d1061",
      "parents": [
        "3ca1b08328931a3abf0fe5dd2ef392adc3655407"
      ],
      "author": {
        "name": "Quentin Khan",
        "email": "qkhan@google.com",
        "time": "Mon May 18 19:09:29 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Mon May 18 19:11:31 2026"
      },
      "message": "Align C++ standard to C++17 in CMake builds to be equal to Bazel builds.\n\nPiperOrigin-RevId: 917360156\n"
    },
    {
      "commit": "3ca1b08328931a3abf0fe5dd2ef392adc3655407",
      "tree": "b2e5eeac06bd7cd615581b5f05912244ac144af9",
      "parents": [
        "98c8ded4369968fab823ebbef877bfcdd87beb58"
      ],
      "author": {
        "name": "Dillon Sharlet",
        "email": "dsharlet@google.com",
        "time": "Mon May 18 18:26:44 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Mon May 18 18:30:10 2026"
      },
      "message": "Relax tolerance for sum squared kernel test\n\nI saw a rare flake of this test whiel running locally\n\nPiperOrigin-RevId: 917338626\n"
    },
    {
      "commit": "98c8ded4369968fab823ebbef877bfcdd87beb58",
      "tree": "a53c57689dc1a8a4323562394168863ce1d76a41",
      "parents": [
        "f1fe9b5c42a97e20b46c0c9f0c2e1bb570a51e70"
      ],
      "author": {
        "name": "Dillon Sharlet",
        "email": "dsharlet@google.com",
        "time": "Mon May 18 18:25:14 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Mon May 18 18:26:33 2026"
      },
      "message": "Polynomial approximation improvements for `exp` and `log`\n\n- Improve rational_approximation.py to use Remez algorithm (thanks AI), improves accuracy of `exp` and `log`.\n- Add `log_fp64` kernels.\n- Reduce `exp_fp64` numerator from degree 8 to degree 6 (while improving accuracy).\n\nRefactoring:\n- Split `exp.py` into `exp.py`, `log.py`, `erf.py`.\n- Add `eval_polynomial` helper.\n\n`exp_fp64` speedup:\n```\nname                                                             time/op         time/op     vs base\nbench/exp_fp64_1x16_x86_avx512f_avx512bw/m:1/n:4096/real_time     5.347µ ± 60%   4.863µ ±  7%   -9.05% (p\u003d0.002 n\u003d6)\nbench/exp_fp64_1x16_x86_avx512f_avx512bw/m:4/n:1024/real_time     5.296µ ± 59%   4.816µ ±  2%   -9.06% (p\u003d0.002 n\u003d6)\nbench/exp_fp64_1x16_x86_avx512f_avx512bw/m:16/n:256/real_time     5.349µ ± 62%   4.751µ ±  2%  -11.18% (p\u003d0.002 n\u003d6)\nbench/exp_fp64_1x16_x86_avx2_fma3/m:1/n:4096/real_time            7.810µ ± 72%   6.786µ ±  3%  -13.11% (p\u003d0.002 n\u003d6)\nbench/exp_fp64_1x16_x86_avx2_fma3/m:4/n:1024/real_time            8.008µ ± 19%   6.758µ ±  1%  -15.61% (p\u003d0.002 n\u003d6)\nbench/exp_fp64_1x16_x86_avx2_fma3/m:16/n:256/real_time            7.899µ ± 13%   6.763µ ±  3%  -14.37% (p\u003d0.002 n\u003d6)\nbench/exp_fp64_1x16_x86_avx2/m:1/n:4096/real_time                 11.35µ ±  3%   10.04µ ±  4%  -11.56% (p\u003d0.002 n\u003d6)\nbench/exp_fp64_1x16_x86_avx2/m:4/n:1024/real_time                11.605µ ±  7%   9.930µ ±  5%  -14.44% (p\u003d0.002 n\u003d6)\nbench/exp_fp64_1x16_x86_avx2/m:16/n:256/real_time                11.575µ ± 17%   9.985µ ± 14%  -13.74% (p\u003d0.004 n\u003d6)\nbench/exp_fp64_1x8_x86_sse2/m:1/n:4096/real_time                  18.59µ ±  2%   16.57µ ± 39%        ~ (p\u003d0.065 n\u003d6)\nbench/exp_fp64_1x8_x86_sse2/m:4/n:1024/real_time                  18.60µ ±  3%   16.78µ ± 21%        ~ (p\u003d0.065 n\u003d6)\nbench/exp_fp64_1x8_x86_sse2/m:16/n:256/real_time                  18.44µ ±  3%   16.58µ ± 20%        ~ (p\u003d0.065 n\u003d6)\ngeomean                                                          9.738µ         8.576µ        -11.93%\n```\n\n`log_fp64` benchmarks:\n```\n------------------------------------------------------------------------------------------------------------------------\nBenchmark                                                              Time             CPU   Iterations UserCounters...\n------------------------------------------------------------------------------------------------------------------------\nbench_reference/log_double/m:1/n:4096/real_time                    17434 ns        17429 ns        51083 Bytes\u003d3.75916G/s Op\u003d234.948M/s\nbench_reference/log_double/m:4/n:1024/real_time                    20406 ns        20399 ns        35681 Bytes\u003d3.21163G/s Op\u003d200.727M/s\nbench_reference/log_double/m:16/n:256/real_time                    21140 ns        21135 ns        31858 Bytes\u003d3.10012G/s Op\u003d193.758M/s\nbench/log_fp64_1x16_x86_avx512f_avx512bw/m:1/n:4096/real_time       5006 ns         5004 ns       131436 Bytes\u003d13.0923G/s Op\u003d818.27M/s\nbench/log_fp64_1x16_x86_avx512f_avx512bw/m:4/n:1024/real_time       5004 ns         5003 ns       100000 Bytes\u003d13.0979G/s Op\u003d818.62M/s\nbench/log_fp64_1x16_x86_avx512f_avx512bw/m:16/n:256/real_time       4836 ns         4835 ns       134936 Bytes\u003d13.5507G/s Op\u003d846.917M/s\nbench/log_fp64_1x16_x86_avx2/m:1/n:4096/real_time                  45733 ns        45723 ns        15445 Bytes\u003d1.43303G/s Op\u003d89.5643M/s\nbench/log_fp64_1x16_x86_avx2/m:4/n:1024/real_time                  45963 ns        45953 ns        15467 Bytes\u003d1.42584G/s Op\u003d89.115M/s\nbench/log_fp64_1x16_x86_avx2/m:16/n:256/real_time                  45525 ns        45519 ns        15375 Bytes\u003d1.43957G/s Op\u003d89.9728M/s\nbench/log_fp64_1x4_x86_sse2/m:1/n:4096/real_time                   46637 ns        46629 ns        14979 Bytes\u003d1.40525G/s Op\u003d87.8279M/s\nbench/log_fp64_1x4_x86_sse2/m:4/n:1024/real_time                   49583 ns        49574 ns        14991 Bytes\u003d1.32174G/s Op\u003d82.6087M/s\nbench/log_fp64_1x4_x86_sse2/m:16/n:256/real_time                   47609 ns        47594 ns        14585 Bytes\u003d1.37655G/s Op\u003d86.0346M/s\n```\n\nPiperOrigin-RevId: 917337886\n"
    },
    {
      "commit": "f1fe9b5c42a97e20b46c0c9f0c2e1bb570a51e70",
      "tree": "da720450a9713ee7b4f879a8bc3c7caa2bd0a871",
      "parents": [
        "01db6e141adf744bb193a3a059c76c6799a37ab6"
      ],
      "author": {
        "name": "Dillon Sharlet",
        "email": "dsharlet@google.com",
        "time": "Mon May 18 16:50:16 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Mon May 18 16:54:05 2026"
      },
      "message": "Only rewrite reduce(convert(x)) if we have a kernel for that reduction type.\n\nPiperOrigin-RevId: 917286989\n"
    },
    {
      "commit": "01db6e141adf744bb193a3a059c76c6799a37ab6",
      "tree": "ae1b50afe045bb2ff6d59ea5bfe7ce2050a9abd2",
      "parents": [
        "1052f90b15785d632e2dcbb6e9661c26d98271ed"
      ],
      "author": {
        "name": "Dillon Sharlet",
        "email": "dsharlet@google.com",
        "time": "Mon May 18 16:49:55 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Mon May 18 16:50:36 2026"
      },
      "message": "Fix possible infinite recursion in convert\n\nIf we try to convert from fp32 to an unsupported type (e.g. int4), we try to convert to an fp32 intermediate and try again, which leads to infinite recursion.\n\nPiperOrigin-RevId: 917286839\n"
    },
    {
      "commit": "4ae8b8f607b2a35a91bba3ef491bda68a405fdc3",
      "tree": "2c7999de3cf7b7a322311e788624119cbed07cfd",
      "parents": [
        "49e266f77cf7930237c9c0bec5a0a1dd2c767ffd"
      ],
      "author": {
        "name": "velonica0",
        "email": "like@mail.nankai.edu.cn",
        "time": "Mon May 18 03:15:45 2026"
      },
      "committer": {
        "name": "velonica0",
        "email": "like@mail.nankai.edu.cn",
        "time": "Mon May 18 03:15:45 2026"
      },
      "message": "fix bug\n"
    },
    {
      "commit": "1052f90b15785d632e2dcbb6e9661c26d98271ed",
      "tree": "6641dc7784c047ad38950ac2aa6b3439adadb147",
      "parents": [
        "8da42ae2bd2626ae13d39c7c936076acac8b3735"
      ],
      "author": {
        "name": "Richard Townsend",
        "email": "ritownsend@google.com",
        "time": "Sat May 16 01:45:32 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Sat May 16 01:46:36 2026"
      },
      "message": "[gn] Add support for building/testing AArch32\n\nAlthough AArch32 is not really the current focus for Chrome\nperformance, as we\u0027ve got the kernels around, there\u0027s no harm in\nhaving it.\n\nAlso modifies the GN test script to support testing on x64 hosts.\n\nPiperOrigin-RevId: 916281274\n"
    },
    {
      "commit": "8da42ae2bd2626ae13d39c7c936076acac8b3735",
      "tree": "994d4df13615aebe743c24bc00c7b7fbbdf77a37",
      "parents": [
        "1c292bfc98d0bc412721c335e72f7a188e436c8c"
      ],
      "author": {
        "name": "Gerardo Carranza",
        "email": "gcarranza@google.com",
        "time": "Sat May 16 00:20:41 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Sat May 16 00:21:17 2026"
      },
      "message": "Add support for log fp16 in XNNPACK.\n\nPiperOrigin-RevId: 916257784\n"
    },
    {
      "commit": "1c292bfc98d0bc412721c335e72f7a188e436c8c",
      "tree": "ea53af08a5bfbdf27cb5e6f1242822b05c3c410d",
      "parents": [
        "c3ac56a5bb4eb939370253ad1e4607ad04c803b0"
      ],
      "author": {
        "name": "Richard Townsend",
        "email": "ritownsend@google.com",
        "time": "Fri May 15 23:05:50 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Fri May 15 23:06:42 2026"
      },
      "message": "[gn] Test building AVX512\n\nBecause we don\u0027t test it in Github Actions, AVX512 support\nhas bit-rotted in GN a bit. Should take care of a cluster of\nfailures in the latest Chrome integration attempt. [1]\n\nAlso simplify/unify things for 32-bit.\n\n[1] https://chromium-review.googlesource.com/c/chromium/src/+/7615883?tab\u003dchecks\n\nPiperOrigin-RevId: 916227360\n"
    },
    {
      "commit": "c3ac56a5bb4eb939370253ad1e4607ad04c803b0",
      "tree": "900eeacda6433e698777b7a21aeb0623bb5684cf",
      "parents": [
        "7bf9c692b18a695b8681139508746b3959f70aad"
      ],
      "author": {
        "name": "Quentin Khan",
        "email": "qkhan@google.com",
        "time": "Fri May 15 19:36:08 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Fri May 15 19:37:21 2026"
      },
      "message": "Add subgraph matcher target to `BUILD.gn`.\n\nPiperOrigin-RevId: 916133551\n"
    },
    {
      "commit": "7bf9c692b18a695b8681139508746b3959f70aad",
      "tree": "fabd56fbc5ddff906daf6da709a1b664121b6956",
      "parents": [
        "34c80155b875d102e0ffbcbd12a628477ceb610b"
      ],
      "author": {
        "name": "Frank Barchard",
        "email": "fbarchard@google.com",
        "time": "Fri May 15 17:57:10 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Fri May 15 17:58:00 2026"
      },
      "message": "Fix ambiguous std::isfinite, std::abs, and std::fpclassify calls for _Float16 in test framework by explicitly casting to float.\n\nPiperOrigin-RevId: 916085457\n"
    },
    {
      "commit": "34c80155b875d102e0ffbcbd12a628477ceb610b",
      "tree": "9cce7b1849f42d91229f31a540583f5ff4d9428d",
      "parents": [
        "ace56b6162087f1926d782d39797a00fb56f2a30"
      ],
      "author": {
        "name": "Volodymyr Kysenko",
        "email": "vksnk@google.com",
        "time": "Fri May 15 16:21:58 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Fri May 15 16:22:57 2026"
      },
      "message": "Make sure partial reduction splits match the loop step.\n\nPiperOrigin-RevId: 916039463\n"
    },
    {
      "commit": "ace56b6162087f1926d782d39797a00fb56f2a30",
      "tree": "d196206776645ed3668d1fbae7fd568f97c63920",
      "parents": [
        "49e266f77cf7930237c9c0bec5a0a1dd2c767ffd"
      ],
      "author": {
        "name": "Dillon Sharlet",
        "email": "dsharlet@google.com",
        "time": "Fri May 15 06:20:51 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Fri May 15 06:24:36 2026"
      },
      "message": "Improve `exp` kernel accuracy and correctness\n\n- Use a more accurate polynomial approximation for `exp`.\n- reference kernels accidentally used double precision intermediates due to double precision params.\n- `exp` op did not propagate NaN\n- `sign` op did not propagate NaN\n\nThis is more correct, but costs some performance:\n```\nname                                                          time/op        time/op     vs base\nbench_reference/exp_float/m:1/n:4096/real_time                74.80µ ±  1%   37.63µ ±  4%  -49.69% (p\u003d0.002 n\u003d6)\nbench_reference/exp_float/m:4/n:1024/real_time                75.09µ ±  1%   37.22µ ±  1%  -50.43% (p\u003d0.002 n\u003d6)\nbench_reference/exp_float/m:16/n:256/real_time                74.87µ ±  2%   37.40µ ±  4%  -50.05% (p\u003d0.002 n\u003d6)\nbench_reference/exp_double/m:1/n:4096/real_time               67.29µ ±  1%   67.16µ ±  9%        ~ (p\u003d0.818 n\u003d6)\nbench_reference/exp_double/m:4/n:1024/real_time               67.26µ ±  7%   66.93µ ±  2%        ~ (p\u003d0.699 n\u003d6)\nbench_reference/exp_double/m:16/n:256/real_time               67.35µ ±  2%   67.59µ ±  2%        ~ (p\u003d0.589 n\u003d6)\nbench/exp_fp32_1x32_x86_avx512f_avx512bw/m:1/n:4096/real_time 1.689µ ±  1%   1.857µ ±  2%   +9.89% (p\u003d0.002 n\u003d6)\nbench/exp_fp32_1x32_x86_avx512f_avx512bw/m:4/n:1024/real_time 1.711µ ±  4%   1.854µ ±  1%   +8.36% (p\u003d0.002 n\u003d6)\nbench/exp_fp32_1x32_x86_avx512f_avx512bw/m:16/n:256/real_time 1.711µ ±  1%   1.886µ ±  2%  +10.20% (p\u003d0.002 n\u003d6)\nbench/exp_fp64_1x16_x86_avx512f_avx512bw/m:1/n:4096/real_time 5.004µ ±  8%   5.251µ ±  5%   +4.92% (p\u003d0.041 n\u003d6)\nbench/exp_fp64_1x16_x86_avx512f_avx512bw/m:4/n:1024/real_time 5.148µ ± 11%   5.289µ ±  5%        ~ (p\u003d0.180 n\u003d6)\nbench/exp_fp64_1x16_x86_avx512f_avx512bw/m:16/n:256/real_time 5.117µ ±  4%   5.292µ ±  4%   +3.42% (p\u003d0.041 n\u003d6)\nbench/exp_fp32_1x32_x86_avx2_fma3/m:1/n:4096/real_time        2.259µ ±  3%   2.552µ ±  1%  +12.94% (p\u003d0.002 n\u003d6)\nbench/exp_fp32_1x32_x86_avx2_fma3/m:4/n:1024/real_time        2.273µ ±  2%   2.559µ ±  4%  +12.58% (p\u003d0.002 n\u003d6)\nbench/exp_fp32_1x32_x86_avx2_fma3/m:16/n:256/real_time        2.314µ ±  1%   2.600µ ±  2%  +12.34% (p\u003d0.002 n\u003d6)\nbench/exp_fp64_1x16_x86_avx2_fma3/m:1/n:4096/real_time        7.145µ ±  3%   7.941µ ±  7%  +11.15% (p\u003d0.002 n\u003d6)\nbench/exp_fp64_1x16_x86_avx2_fma3/m:4/n:1024/real_time        7.158µ ±  3%   7.803µ ±  5%   +9.02% (p\u003d0.002 n\u003d6)\nbench/exp_fp64_1x16_x86_avx2_fma3/m:16/n:256/real_time        7.199µ ±  2%   7.805µ ±  5%   +8.41% (p\u003d0.002 n\u003d6)\nbench/exp_fp32_1x32_x86_avx2/m:1/n:4096/real_time             2.948µ ±  3%\nbench/exp_fp32_1x32_x86_avx2/m:4/n:1024/real_time             2.951µ ±  1%\nbench/exp_fp32_1x32_x86_avx2/m:16/n:256/real_time             2.961µ ±  2%\nbench/exp_fp64_1x16_x86_avx2/m:1/n:4096/real_time             11.00µ ±  6%   11.43µ ±  3%        ~ (p\u003d0.180 n\u003d6)\nbench/exp_fp64_1x16_x86_avx2/m:4/n:1024/real_time             11.16µ ±  3%   11.34µ ±  2%        ~ (p\u003d0.093 n\u003d6)\nbench/exp_fp64_1x16_x86_avx2/m:16/n:256/real_time             11.24µ ±  4%   11.31µ ±  2%        ~ (p\u003d0.699 n\u003d6)\nbench/exp_fp32_1x16_x86_sse2/m:1/n:4096/real_time             4.957µ ±  1%   5.654µ ±  4%  +14.05% (p\u003d0.002 n\u003d6)\nbench/exp_fp32_1x16_x86_sse2/m:4/n:1024/real_time             4.937µ ±  1%   5.591µ ±  1%  +13.26% (p\u003d0.002 n\u003d6)\nbench/exp_fp32_1x16_x86_sse2/m:16/n:256/real_time             4.919µ ±  1%   5.632µ ±  3%  +14.49% (p\u003d0.002 n\u003d6)\nbench/exp_fp64_1x8_x86_sse2/m:1/n:4096/real_time              17.42µ ±  3%   18.26µ ±  1%   +4.79% (p\u003d0.002 n\u003d6)\nbench/exp_fp64_1x8_x86_sse2/m:4/n:1024/real_time              17.53µ ±  3%   18.15µ ±  2%   +3.57% (p\u003d0.002 n\u003d6)\nbench/exp_fp64_1x8_x86_sse2/m:16/n:256/real_time              17.51µ ±  1%   18.19µ ± 12%   +3.87% (p\u003d0.002 n\u003d6)\nbench/exp_fp32_1x16_x86_avx2/m:1/n:4096/real_time             3.900µ ± 26%\nbench/exp_fp32_1x16_x86_avx2/m:4/n:1024/real_time             3.913µ ±  4%\nbench/exp_fp32_1x16_x86_avx2/m:16/n:256/real_time             3.906µ ±  3%\ngeomean                                                       8.526µ         8.622µ         -1.85%               ¹\n¹                                                            benchmark set differs from baseline; geomeans may not be comparable\n```\n\nPiperOrigin-RevId: 915813349\n"
    },
    {
      "commit": "49e266f77cf7930237c9c0bec5a0a1dd2c767ffd",
      "tree": "1865c3cf46a10d29024cc26e51aca0301c6b3d13",
      "parents": [
        "11fb88590dfa17facb3b7a57e41c5b2b6b1cb1a6"
      ],
      "author": {
        "name": "Volodymyr Kysenko",
        "email": "vksnk@google.com",
        "time": "Fri May 15 01:48:55 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Fri May 15 01:49:29 2026"
      },
      "message": "Add optimized convert int2/int4 to int8 kernels.\n\nAt the moment, I only added AVX2 and AVX512 kernels. I think we can do better if we use AVX512-VBMI or something, but I think that\u0027s lower priority.\n\n```\n------------------------------------------------------------------------------------------------------------------------------------\nBenchmark                                                                          Time             CPU   Iterations UserCounters...\n------------------------------------------------------------------------------------------------------------------------------------\nbench_reference_convert/int2x4_int8_t/m:1/n:4096/real_time                      1333 ns         1333 ns       290663 Bytes\u003d6.14607G/s Op\u003d3.07303G/s\nbench_reference_convert/int2x4_int8_t/m:4/n:1024/real_time                      1361 ns         1361 ns       306814 Bytes\u003d6.01974G/s Op\u003d3.00987G/s\nbench_reference_convert/int2x4_int8_t/m:16/n:256/real_time                      1318 ns         1317 ns       319517 Bytes\u003d6.21558G/s Op\u003d3.10779G/s\nbench/convert_int2_to_int8_1x64_x86_avx512f_avx512bw/m:1/n:4096/real_time       63.8 ns         63.8 ns      6551296 Bytes\u003d128.357G/s Op\u003d64.1784G/s\nbench/convert_int2_to_int8_1x64_x86_avx512f_avx512bw/m:4/n:1024/real_time       65.1 ns         65.1 ns      6545915 Bytes\u003d125.786G/s Op\u003d62.8929G/s\nbench/convert_int2_to_int8_1x64_x86_avx512f_avx512bw/m:16/n:256/real_time       71.6 ns         71.6 ns      4778839 Bytes\u003d114.456G/s Op\u003d57.228G/s\nbench/convert_int2_to_int8_1x32_x86_avx2/m:1/n:4096/real_time                   95.8 ns         95.8 ns      4580022 Bytes\u003d85.5078G/s Op\u003d42.7539G/s\nbench/convert_int2_to_int8_1x32_x86_avx2/m:4/n:1024/real_time                   94.4 ns         94.4 ns      4572399 Bytes\u003d86.7469G/s Op\u003d43.3734G/s\nbench/convert_int2_to_int8_1x32_x86_avx2/m:16/n:256/real_time                   96.4 ns         96.4 ns      4480847 Bytes\u003d85.0169G/s Op\u003d42.5084G/s\nbench_reference_convert/int4x2_int8_t/m:1/n:4096/real_time                      1675 ns         1674 ns       248804 Bytes\u003d4.89207G/s Op\u003d2.44603G/s\nbench_reference_convert/int4x2_int8_t/m:4/n:1024/real_time                      1695 ns         1695 ns       249163 Bytes\u003d4.83283G/s Op\u003d2.41642G/s\nbench_reference_convert/int4x2_int8_t/m:16/n:256/real_time                      1806 ns         1806 ns       233733 Bytes\u003d4.53658G/s Op\u003d2.26829G/s\nbench/convert_int4_to_int8_1x64_x86_avx512f_avx512bw/m:1/n:4096/real_time       64.2 ns         64.2 ns      6632649 Bytes\u003d127.538G/s Op\u003d63.7688G/s\nbench/convert_int4_to_int8_1x64_x86_avx512f_avx512bw/m:4/n:1024/real_time       64.2 ns         64.2 ns      6560923 Bytes\u003d127.632G/s Op\u003d63.8158G/s\nbench/convert_int4_to_int8_1x64_x86_avx512f_avx512bw/m:16/n:256/real_time       64.8 ns         64.8 ns      6320290 Bytes\u003d126.449G/s Op\u003d63.2245G/s\nbench/convert_int4_to_int8_1x32_x86_avx2/m:1/n:4096/real_time                    116 ns          116 ns      3617014 Bytes\u003d70.7825G/s Op\u003d35.3913G/s\nbench/convert_int4_to_int8_1x32_x86_avx2/m:4/n:1024/real_time                    119 ns          119 ns      3538506 Bytes\u003d68.6638G/s Op\u003d34.3319G/s\nbench/convert_int4_to_int8_1x32_x86_avx2/m:16/n:256/real_time                    134 ns          134 ns      3077822 Bytes\u003d61.0267G/s Op\u003d30.5134G/s\n\n```\n\nPiperOrigin-RevId: 915727140\n"
    },
    {
      "commit": "11fb88590dfa17facb3b7a57e41c5b2b6b1cb1a6",
      "tree": "300e221e67fdc8f42a6fe2c9a2ea2c16b906255c",
      "parents": [
        "9ab80cd66ef6f077e87551424c07e6327a374b3b"
      ],
      "author": {
        "name": "Dillon Sharlet",
        "email": "dsharlet@google.com",
        "time": "Fri May 15 01:07:27 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Fri May 15 01:10:03 2026"
      },
      "message": "Implement round to nearest even for float -\u003e bf16 conversions\n\nAlso adds float -\u003e bf16 conversions to SIMD benchmarks\n\nThis is a performance regression, but I think it is worth the accuracy improvement.\n\n```\nname                                                                      sec/op       sec/op     vs base\nbench/convert_fp32_to_bf16_1x64_x86_avx512f_avx512bw/m:1/n:4096/real_time 188.1n ± 2%   421.3n ± 1%  +123.94% (p\u003d0.002 n\u003d6)\nbench/convert_fp32_to_bf16_1x64_x86_avx512f_avx512bw/m:4/n:1024/real_time 192.6n ± 3%   418.3n ± 1%  +117.19% (p\u003d0.002 n\u003d6)\nbench/convert_fp32_to_bf16_1x64_x86_avx512f_avx512bw/m:16/n:256/real_time 197.3n ± 3%   423.9n ± 4%  +114.86% (p\u003d0.002 n\u003d6)\nbench/convert_fp32_to_bf16_1x16_x86_avx2/m:1/n:4096/real_time             270.8n ± 3%   753.3n ± 1%  +178.14% (p\u003d0.002 n\u003d6)\nbench/convert_fp32_to_bf16_1x16_x86_avx2/m:4/n:1024/real_time             277.5n ± 1%   769.6n ± 2%  +177.32% (p\u003d0.002 n\u003d6)\nbench/convert_fp32_to_bf16_1x16_x86_avx2/m:16/n:256/real_time             310.5n ± 2%   795.2n ± 1%  +156.11% (p\u003d0.002 n\u003d6)\ngeomean                                                                   234.6n        570.4n       +143.11%\n```\n\nAs a side effect of the new test coverage in this change, I found a lot of bugs, like hitting UB in float -\u003e int conversions, that motivated the following refactoring:\n- We really shouldn\u0027t have a truncating `cast` at all for `float` -\u003e `int` conversions if we can avoid it.\n- Then I realized that some uses of cast really should be `round_float_to_int`, but some uses of this want to do `float` -\u003e `float` (`half` or `bfloat16`) conversions, so we should rename `round_float_to_int` to `cast` to allow for templates that handle both `int` and `float`.\n- Now the old cast is a duplicate symbol, we should just remove it.\n- `saturate_cast` is similarly redundant, we should just never not saturate when casting to any integer.\n- When casting floats to integers, we clamped at a value lower in magnitude than the int max/min. This was a small bug, fixed in this change.\n\nI think that we never need cast with overflow and no rounding, and is in fact a bug prone helper to offer.\n\nPiperOrigin-RevId: 915714096\n"
    },
    {
      "commit": "9ab80cd66ef6f077e87551424c07e6327a374b3b",
      "tree": "38a000a7e88f78e19cf68e9b99b061df120ea5ff",
      "parents": [
        "95ee916af734c126e7bf65d83cf0c08673a32f16"
      ],
      "author": {
        "name": "Volodymyr Kysenko",
        "email": "vksnk@google.com",
        "time": "Thu May 14 17:48:20 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Thu May 14 17:48:57 2026"
      },
      "message": "Allow adding function own loops even if some of its non-trivial loops has been already fused.\n\nThis really was just a heuristic to avoid creating too many loops I added because I was afraid of creating unnecessary overhead.\n\nPiperOrigin-RevId: 915507602\n"
    },
    {
      "commit": "95ee916af734c126e7bf65d83cf0c08673a32f16",
      "tree": "31fc7c8c180698ec416fdb0816011e53266df1db",
      "parents": [
        "d72fa85c9bd2ee33365497fa6264ff84f4b2b593"
      ],
      "author": {
        "name": "Dillon Sharlet",
        "email": "dsharlet@google.com",
        "time": "Thu May 14 17:35:22 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Thu May 14 17:35:57 2026"
      },
      "message": "Use a better unroll factor for log2_fp32_sse2\n\n```\nname                                              time/op       time/op     vs base\nbench/log_fp32_1x8_x86_sse2/m:1/n:4096/real_time  6.296µ ± 3%\nbench/log_fp32_1x8_x86_sse2/m:4/n:1024/real_time  6.478µ ± 5%\nbench/log_fp32_1x8_x86_sse2/m:16/n:256/real_time  6.452µ ± 5%\nbench/log_fp32_1x16_x86_sse2/m:1/n:4096/real_time 6.055µ ± 3%\nbench/log_fp32_1x16_x86_sse2/m:4/n:1024/real_time 6.068µ ± 2%\nbench/log_fp32_1x16_x86_sse2/m:16/n:256/real_time 6.049µ ± 2%\ngeomean                                           6.408µ        6.057µ       ? ¹ ²\n¹                                                benchmark set differs from baseline; geomeans may not be comparable\n²                                                ratios must be \u003e0 to compute geomean\n```\n\nPiperOrigin-RevId: 915500812\n"
    },
    {
      "commit": "d72fa85c9bd2ee33365497fa6264ff84f4b2b593",
      "tree": "dbb188bc987d5b187aef97f8e5456b37b9feaf71",
      "parents": [
        "4fad5b3975c43ac74426e07f30af2e6ea419aa3d"
      ],
      "author": {
        "name": "Dillon Sharlet",
        "email": "dsharlet@google.com",
        "time": "Thu May 14 16:41:12 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Thu May 14 16:42:11 2026"
      },
      "message": "Improve log_fp32 kernels\n\n- Avoid the need for the sqrt(2) multiplier by making the polynomial approximate log2 on [1, 2).\n- Use more accurate polynomial\n\nThis is both more accurate and faster on x86 (with a change to the kernel unrolling on SSE2).\n```\nname                                                          time/op        time/op     vs base\nbench/log_fp32_1x32_x86_avx512f_avx512bw/m:1/n:4096/real_time 1.620µ ± 2%   1.547µ ±  3%   -4.48% (p\u003d0.002 n\u003d6)\nbench/log_fp32_1x32_x86_avx512f_avx512bw/m:4/n:1024/real_time 1.624µ ± 2%   1.560µ ±  6%   -3.95% (p\u003d0.041 n\u003d6)\nbench/log_fp32_1x32_x86_avx512f_avx512bw/m:16/n:256/real_time 1.631µ ± 2%   1.563µ ±  2%   -4.17% (p\u003d0.002 n\u003d6)\nbench/log_fp32_1x32_x86_avx2_fma3/m:1/n:4096/real_time        3.573µ ± 3%   2.888µ ±  3%  -19.16% (p\u003d0.002 n\u003d6)\nbench/log_fp32_1x32_x86_avx2_fma3/m:4/n:1024/real_time        3.605µ ± 3%   2.918µ ± 11%  -19.06% (p\u003d0.002 n\u003d6)\nbench/log_fp32_1x32_x86_avx2_fma3/m:16/n:256/real_time        3.597µ ± 1%   2.890µ ±  5%  -19.67% (p\u003d0.002 n\u003d6)\nbench/log_fp32_1x32_x86_avx2/m:1/n:4096/real_time             3.592µ ± 5%   3.432µ ±  3%   -4.45% (p\u003d0.041 n\u003d6)\nbench/log_fp32_1x32_x86_avx2/m:4/n:1024/real_time             3.592µ ± 3%   3.492µ ±  6%        ~ (p\u003d0.132 n\u003d6)\nbench/log_fp32_1x32_x86_avx2/m:16/n:256/real_time             3.571µ ± 4%   3.471µ ±  3%   -2.80% (p\u003d0.026 n\u003d6)\nbench/log_fp32_1x8_x86_sse2/m:1/n:4096/real_time              6.682µ ± 3%\nbench/log_fp32_1x8_x86_sse2/m:4/n:1024/real_time              6.750µ ± 3%\nbench/log_fp32_1x8_x86_sse2/m:16/n:256/real_time              6.831µ ± 4%\nbench/log_fp32_1x16_x86_sse2/m:1/n:4096/real_time             6.405µ ±  4%\nbench/log_fp32_1x16_x86_sse2/m:4/n:1024/real_time             6.466µ ±  4%\nbench/log_fp32_1x16_x86_sse2/m:16/n:256/real_time             6.531µ ±  3%\n```\n\nPiperOrigin-RevId: 915474623\n"
    },
    {
      "commit": "4fad5b3975c43ac74426e07f30af2e6ea419aa3d",
      "tree": "aab7c6a6bf02b342ea7e0037556d9823848483cb",
      "parents": [
        "fe166973d8e38aeee4ca9a08bb025eca28821b9a"
      ],
      "author": {
        "name": "Dillon Sharlet",
        "email": "dsharlet@google.com",
        "time": "Thu May 14 05:37:04 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Thu May 14 05:39:28 2026"
      },
      "message": "Disable static_slice test until slinky bug is fixed\n\nPiperOrigin-RevId: 915232622\n"
    },
    {
      "commit": "393da7ddec358bcaa2a1ec1b8e50634a010db4a9",
      "tree": "46d64a4f008884d3f5e0ed024312430c3ea27fb7",
      "parents": [
        "f351319454aaf6612e4dc67394bbe75eb9601815"
      ],
      "author": {
        "name": "Ken Unger",
        "email": "ken.j.unger@gmail.com",
        "time": "Thu May 14 04:51:07 2026"
      },
      "committer": {
        "name": "Ken Unger",
        "email": "ken.j.unger@gmail.com",
        "time": "Thu May 14 04:51:07 2026"
      },
      "message": "add rvv kernel for f16-vlog\n"
    },
    {
      "commit": "fe166973d8e38aeee4ca9a08bb025eca28821b9a",
      "tree": "5a9bb65fca3395ba8f8ecfad5dfe3d5fe99ed5c6",
      "parents": [
        "53007d6933e46552850148e90fc49747516fd4cf"
      ],
      "author": {
        "name": "Dillon Sharlet",
        "email": "dsharlet@google.com",
        "time": "Thu May 14 03:45:16 2026"
      },
      "committer": {
        "name": "XNNPACK Team",
        "email": "xnnpack-github-robot@google.com",
        "time": "Thu May 14 03:46:27 2026"
      },
      "message": "Disable static_slice test until slinky bug is fixed\n\nPiperOrigin-RevId: 915199617\n"
    }
  ],
  "next": "53007d6933e46552850148e90fc49747516fd4cf"
}
