| // Copyright 2020-2025 Google LLC |
| // |
| // This source code is licensed under the BSD-style license found in the |
| // LICENSE file in the root directory of this source tree. |
| |
| #include <assert.h> |
| #include <inttypes.h> |
| #include <stddef.h> |
| #include <stdint.h> |
| #include <stdio.h> // For snprintf. |
| #include <stdlib.h> |
| #include <string.h> |
| |
| #if defined(__EMSCRIPTEN__) |
| #include <emscripten/emscripten.h> |
| #elif XNN_PLATFORM_WINDOWS |
| #include <windows.h> |
| #else |
| #include <errno.h> |
| #include <time.h> |
| #endif |
| |
| #include "include/experimental.h" |
| #include "include/xnnpack.h" |
| #include "src/xnnpack/allocation-type.h" |
| #include "src/xnnpack/allocator.h" |
| #include "src/xnnpack/cache.h" |
| #include "src/xnnpack/common.h" |
| #include "src/xnnpack/internal.h" |
| #include "src/xnnpack/log.h" |
| #include "src/xnnpack/math.h" |
| #include "src/xnnpack/memory-planner.h" |
| #include "src/xnnpack/memory.h" |
| #include "src/xnnpack/microkernel-type.h" |
| #include "src/xnnpack/node-type.h" |
| #include "src/xnnpack/operator-utils.h" |
| #include "src/xnnpack/operator.h" |
| #include "src/xnnpack/params.h" |
| #include "src/xnnpack/subgraph.h" |
| #include <pthreadpool.h> |
| |
| enum xnn_status xnn_reshape_external_value( |
| xnn_runtime_t runtime, |
| uint32_t external_id, |
| size_t num_dims, |
| const size_t* dims) { |
| if (external_id >= runtime->num_values) { |
| xnn_log_error("failed to reshape runtime: out-of-bounds ID %" PRIu32 " in external value", |
| external_id); |
| return xnn_status_invalid_parameter; |
| } |
| struct xnn_runtime_value* value = &runtime->values[external_id]; |
| if (value->allocation_type != xnn_allocation_type_external) { |
| xnn_log_error("failed to reshape runtime: Value %" PRIu32 " is not external (%d)", |
| external_id, value->allocation_type); |
| return xnn_status_invalid_parameter; |
| } |
| struct xnn_shape new_shape = {.num_dims = num_dims}; |
| for (int k = 0; k < num_dims; k++) { |
| new_shape.dim[k] = dims[k]; |
| } |
| if (!xnn_shape_match(&value->shape, &new_shape)) { |
| if (value->flags & XNN_VALUE_FLAG_SHAPE_IS_STATIC) { |
| xnn_log_error("failed to reshape runtime: Value %" PRIu32 |
| " is flagged as having a static shape", |
| external_id); |
| return xnn_status_invalid_parameter; |
| } |
| value->shape = new_shape; |
| } |
| value->size = xnn_runtime_tensor_get_size(value); |
| return xnn_status_success; |
| } |
| |
| enum xnn_status |
| xnn_get_external_value_shape(xnn_runtime_t runtime, uint32_t external_id, size_t* num_dims, size_t* dims) |
| { |
| if (external_id >= runtime->num_values) { |
| xnn_log_error("failed to get external value shape: out-of-bounds ID %" PRIu32 " in external value", external_id); |
| return xnn_status_invalid_parameter; |
| } |
| struct xnn_runtime_value* value = &runtime->values[external_id]; |
| if (value->allocation_type != xnn_allocation_type_external) { |
| xnn_log_error( |
| "failed to get external value shape: Value %" PRIu32 " is not external (%d)", external_id, |
| value->allocation_type); |
| return xnn_status_invalid_parameter; |
| } |
| if (num_dims == NULL || dims == NULL) { |
| xnn_log_error("failed to get external value shape: null pointer"); |
| return xnn_status_invalid_parameter; |
| } |
| *num_dims = value->shape.num_dims; |
| memcpy(dims, value->shape.dim, value->shape.num_dims * sizeof(size_t)); |
| return xnn_status_success; |
| } |
| |
| enum xnn_status xnn_create_workspace(xnn_workspace_t* workspace_out) |
| { |
| if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) { |
| xnn_log_error("failed to create workspace: XNNPACK is not initialized"); |
| return xnn_status_uninitialized; |
| } |
| |
| struct xnn_workspace* workspace = NULL; |
| workspace = xnn_allocate_zero_memory(sizeof(struct xnn_workspace)); |
| if (workspace == NULL) { |
| xnn_log_error("failed to allocate %zu bytes for workspace descriptor", sizeof(struct xnn_workspace)); |
| return xnn_status_out_of_memory; |
| } |
| workspace->ref_count = 1; |
| *workspace_out = workspace; |
| return xnn_status_success; |
| } |
| |
| static inline void xnn_retain_workspace(xnn_workspace_t workspace) |
| { |
| workspace->ref_count++; |
| } |
| |
| enum xnn_status xnn_release_workspace(xnn_workspace_t workspace) |
| { |
| assert(workspace->ref_count != 0); |
| if (--workspace->ref_count == 0) { |
| xnn_release_simd_memory(workspace->data); |
| xnn_release_memory(workspace); |
| } |
| return xnn_status_success; |
| } |
| |
| enum xnn_status xnn_create_weights_cache_with_size(size_t size, xnn_weights_cache_t* weights_cache_out) |
| { |
| struct xnn_weights_cache_provider* cache_provider = NULL; |
| enum xnn_status status = xnn_status_uninitialized; |
| |
| if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) { |
| xnn_log_error("failed to create weights cache: XNNPACK is not initialized"); |
| goto error; |
| } |
| |
| cache_provider = xnn_allocate_zero_memory(sizeof(struct xnn_weights_cache_provider)); |
| if (cache_provider == NULL) { |
| xnn_log_error("failed to allocate %zu bytes for weights cache provider descriptor", sizeof(struct xnn_weights_cache_provider)); |
| goto error; |
| } |
| |
| cache_provider->context = xnn_allocate_zero_memory(sizeof(struct xnn_internal_weights_cache)); |
| if (cache_provider->context == NULL) { |
| xnn_log_error("failed to allocate %zu bytes for weights cache descriptor", sizeof(struct xnn_internal_weights_cache)); |
| goto error; |
| } |
| |
| status = xnn_internal_init_weights_cache_with_size(cache_provider->context, size); |
| if (status != xnn_status_success) { |
| goto error; |
| } |
| cache_provider->look_up = (size_t(*)(void*, const struct xnn_weights_cache_look_up_key*))xnn_internal_weights_cache_look_up; |
| cache_provider->reserve_space = (void*(*)(void*, size_t))xnn_internal_reserve_space_in_weights_cache; |
| cache_provider->look_up_or_insert = (size_t (*)(void*, const struct xnn_weights_cache_look_up_key*, void*, size_t))xnn_internal_get_or_insert_weights_cache; |
| cache_provider->is_finalized = (bool (*)(void*))xnn_internal_weights_cache_is_finalized; |
| cache_provider->offset_to_addr = (void*(*)(void*, size_t))xnn_internal_weights_cache_offset_to_addr; |
| cache_provider->delete_cache = (enum xnn_status (*)(void*))xnn_internal_delete_weights_cache; |
| *weights_cache_out = cache_provider; |
| return xnn_status_success; |
| |
| error: |
| if (cache_provider != NULL) { |
| xnn_internal_release_weights_cache(cache_provider->context); |
| } |
| return status; |
| } |
| |
| enum xnn_status xnn_create_weights_cache(xnn_weights_cache_t* weights_cache_out) |
| { |
| return xnn_create_weights_cache_with_size(XNN_DEFAULT_WEIGHTS_BUFFER_SIZE, weights_cache_out); |
| } |
| |
| enum xnn_status xnn_delete_weights_cache(xnn_weights_cache_t weights_cache) |
| { |
| if XNN_LIKELY(weights_cache != NULL) { |
| enum xnn_status status = xnn_internal_release_weights_cache(weights_cache->context); |
| if (status != xnn_status_success) { |
| return status; |
| } |
| xnn_release_memory(weights_cache->context); |
| xnn_release_memory(weights_cache); |
| } |
| return xnn_status_success; |
| } |
| |
| enum xnn_status xnn_create_runtime( |
| xnn_subgraph_t subgraph, |
| xnn_runtime_t* runtime_out) |
| { |
| return xnn_create_runtime_v2(subgraph, NULL /* threadpool */, 0 /* flags */, runtime_out); |
| } |
| |
| enum xnn_status xnn_create_runtime_v2( |
| xnn_subgraph_t subgraph, |
| pthreadpool_t threadpool, |
| uint32_t flags, |
| xnn_runtime_t* runtime_out) |
| { |
| return xnn_create_runtime_v3(subgraph, /* weights_cache */ NULL, threadpool, flags, runtime_out); |
| } |
| |
| enum xnn_status xnn_create_runtime_v3( |
| xnn_subgraph_t subgraph, |
| xnn_weights_cache_t weights_cache, |
| pthreadpool_t threadpool, |
| uint32_t flags, |
| xnn_runtime_t* runtime_out) |
| { |
| xnn_workspace_t workspace; |
| enum xnn_status status = xnn_create_workspace(&workspace); |
| if (status != xnn_status_success) { |
| return status; |
| } |
| status = xnn_create_runtime_v4(subgraph, weights_cache, workspace, threadpool, flags, runtime_out); |
| // Release workspace regardless of return status of creating runtime. |
| xnn_release_workspace(workspace); |
| return status; |
| } |
| |
// Binds workspace memory to the runtime's internal tensors and operator
// scratch areas according to the plan recorded in `mem_alloc_tracker`,
// growing the (possibly shared) workspace arena when the plan needs more
// bytes than currently allocated.
//
// If growing the arena relocated it, the workspace-resident pointers of every
// other runtime sharing this workspace are rebased by the relocation delta,
// and any runtime that had already been set up is re-setup so that
// operator-internal pointers are refreshed too.
static enum xnn_status initialize_workspace_values(
    xnn_runtime_t runtime,
    struct xnn_value_allocation_tracker* mem_alloc_tracker)
{
  assert(runtime->workspace != NULL);
  size_t mem_arena_size = mem_alloc_tracker->mem_arena_size;
  if (mem_arena_size == 0) {
    // Nothing lives in the workspace; leave the arena untouched.
    return xnn_status_success;
  }
  // Sparse microkernels can read up to 2 * XNN_EXTRA_BYTES beyond array bounds.
  mem_arena_size += 2 * XNN_EXTRA_BYTES;

  // Records how much the workspace has moved by due to allocating a larger workspace.
  ptrdiff_t workspace_data_delta = 0;
  // Allocates larger workspace here if needed.
  if (runtime->workspace->size < mem_arena_size) {
    void* old_workspace_data = runtime->workspace->data;
    void* new_workspace_data = xnn_allocate_zero_simd_memory(mem_arena_size);
    if (new_workspace_data == NULL) {
      xnn_log_error("failed to allocate %zu bytes for runtime workspace", mem_arena_size);
      return xnn_status_out_of_memory;
    }
    runtime->workspace->data = new_workspace_data;
    runtime->workspace->size = mem_arena_size;
    // Keep track of how much the workspace data moved.
    if (old_workspace_data != NULL) {
      workspace_data_delta = (uintptr_t) new_workspace_data - (uintptr_t) old_workspace_data;
      xnn_release_simd_memory(old_workspace_data);
    }
    xnn_log_debug("created workspace of size %zu, old workspace %p, new workspace %p, delta %td",
                  mem_arena_size, old_workspace_data, new_workspace_data, workspace_data_delta);
  }

  assert(runtime->workspace->size >= mem_arena_size);

  // Initialize current runtime's value pointers.
  for (size_t i = 0; i < runtime->num_values; i++) {
    struct xnn_runtime_value* value = &runtime->values[i];
    if (!xnn_value_is_valid(value->type)) {
      continue;
    }

    if (value->allocation_type == xnn_allocation_type_workspace) {
      // Value is purely internal to the runtime, allocate it in the workspace.
      value->data =
        (void*) ((uintptr_t) runtime->workspace->data + mem_alloc_tracker->usage[i].alloc_offset);
      if (value->datatype == xnn_datatype_qdint8 ||
          value->datatype == xnn_datatype_qduint8) {
        // Dynamically-quantized values carry two extra per-tensor buffers
        // packed directly after the tensor data: the dynamic quantization
        // parameters, then the row sums.
        value->quantization.dynamic_params =
            (void*) ((uintptr_t) runtime->workspace->data + mem_alloc_tracker->usage[i].alloc_offset
                + xnn_tensor_get_rounded_size(value));
        value->quantization.row_sum =
            (void*) ((uintptr_t) value->quantization.dynamic_params +
                     xnn_tensor_get_rounded_dynamic_quant_param_size(value));
      }
    }
  }

  // Initialize operator workspace values. Operator scratch records follow the
  // value records in the tracker, at index num_values + op index.
  for (size_t i = 0; i < runtime->num_ops; i++) {
    const struct xnn_usage_record* usage = &mem_alloc_tracker->usage[runtime->num_values + i];
    if (usage->opdata_id == XNN_INVALID_NODE_ID) {
      continue;
    }
    struct xnn_operator_data* opdata = &runtime->opdata[usage->opdata_id];
    opdata->workspace = (void*) ((uintptr_t) runtime->workspace->data + usage->alloc_offset);
  }

  // Adjust the value pointers of all runtimes that share this workspace.
  if (workspace_data_delta != 0) {
    for (struct xnn_runtime* rt = runtime->workspace->first_user; rt != NULL; rt = rt->next_workspace_user) {
      // The current runtime already has the correct offset.
      if (rt == runtime) {
        continue;
      }
      // The memory for this runtime has not yet been planned, so it doesn't have any pointers into workspace, so does not need to
      // be updated.
      if (!rt->memory_planned) {
        continue;
      }

      // Adjust offsets of values in workspace.
      for (size_t i = 0; i < rt->num_values; i++) {
        struct xnn_runtime_value* value = &rt->values[i];
        if (value->allocation_type == xnn_allocation_type_workspace) {
          if (value->data != NULL) {
            // Data can be null as the runtime using this workspace might not have been set up.
            value->data = (void*) ((uintptr_t) value->data + workspace_data_delta);
            if (value->datatype == xnn_datatype_qdint8 ||
                value->datatype == xnn_datatype_qduint8) {
              value->quantization.dynamic_params = (void*) ((uintptr_t) value->quantization.dynamic_params
                  + workspace_data_delta);
              value->quantization.row_sum = (void*) ((uintptr_t) value->quantization.row_sum
                  + workspace_data_delta);
            }
          }
        }
      }

      // Adjust offsets of op workspaces.
      for (size_t i = 0; i < rt->num_ops; i++) {
        struct xnn_operator_data* opdata = &rt->opdata[i];
        if (opdata->operator_objects[0] == NULL) {
          // Operator was removed during optimization
          continue;
        }

        if (opdata->workspace != NULL) {
          opdata->workspace = (void*) ((uintptr_t) opdata->workspace + workspace_data_delta);
        }
      }
      // This runtime has not ever been setup yet, so it doesn't have any pointers into workspace, so does not need to
      // be updated.
      if (!rt->has_been_setup) {
        continue;
      }
      // Re-setup all the nodes to adjust input/output pointers.
      for (size_t i = 0; i < rt->num_ops; i++) {
        struct xnn_operator_data* opdata = &rt->opdata[i];
        for (size_t j = 0; j < XNN_MAX_OPERATOR_OBJECTS; j++) {
          if (opdata->operator_objects[j] == NULL) {
            // Operator was removed during optimization
            continue;
          }
          assert(opdata->setup != NULL);
          const enum xnn_status status = opdata->setup(opdata, rt->values, rt->num_values, rt->threadpool);
          if (status != xnn_status_success) {
            xnn_log_error("failed to setup runtime: error in operator #%zu", i);
            return status;
          }
        }
      }
    }
  }

  return xnn_status_success;
}
| |
| // Output can reuse input memory if both are allocated in the workspace. |
| // If input has more than 1 consumer, we can't track all the consumers and update the first_consumer, so bail out. |
| // Output memory fits in input memory. One of the inputs to a binary node could be implicitly broadcasted. |
| static bool input_memory_can_be_reused(const xnn_runtime_t runtime, size_t input_id, size_t output_id) |
| { |
| if (input_id == XNN_INVALID_VALUE_ID || output_id == XNN_INVALID_VALUE_ID) { |
| return false; |
| } |
| const struct xnn_runtime_value* input = &runtime->values[input_id]; |
| const struct xnn_runtime_value* output = &runtime->values[output_id]; |
| const bool output_memory_fits = xnn_runtime_tensor_get_size(input) == xnn_runtime_tensor_get_size(output); |
| //assert(input->num_consumers != 0); |
| return input->allocation_type == xnn_allocation_type_workspace && |
| output->allocation_type == xnn_allocation_type_workspace && |
| (input->flags & XNN_VALUE_FLAG_ONE_CONSUMER) && output_memory_fits; |
| } |
| |
| // An in-place operation reuses the input tensor's memory for its output. Examples are element-wise unary operations |
| // like activation functions. Usually, an output tensor is allocated space. For an in-place operation, we want the |
| // output tensor to share the input tensor's memory. We do this by calling xnn_mark_tensor_as_reuse, which: |
| // - sets the tensor_size of output tensor's usage record to 0 |
| // - mark this usage record as reusing another tensor's memory |
| // - remember the id of the tensor which we will reuse the alloc_offset to set onto the output tensor |
// Marks eligible in-place operations' output tensors as reusing their input
// tensor's workspace allocation, so the memory planner assigns them the same
// offset instead of allocating fresh space.
static void optimize_tensor_allocation_for_in_place_operations(
  struct xnn_value_allocation_tracker* tracker,
  const xnn_runtime_t runtime)
{
  for (uint32_t n = 0; n < runtime->num_ops; n++) {
    const struct xnn_operator_data* node = &runtime->opdata[n];
    switch (node->type) {
      case xnn_node_type_unary_elementwise:
      case xnn_node_type_binary_elementwise:
      case xnn_node_type_copy:
      case xnn_node_type_softmax:
      case xnn_node_type_static_reshape:
        // Valid operation types that can be optimized.
        break;
      default:
        // All other node types may read inputs non-elementwise; skip them.
        continue;
    }

    // Check all of the node's input to see which we can reuse.
    uint32_t input_id = XNN_INVALID_VALUE_ID;
    for (size_t i = 0; i < node->num_inputs; i++) {
      if (input_memory_can_be_reused(runtime, node->inputs[i], node->outputs[0])) {
        input_id = node->inputs[i];
        break;  // Found an input we can reuse, early exit.
      }
    }
    // Check input_id and return if invalid.
    if (input_id == XNN_INVALID_VALUE_ID) {
      continue;
    }

    // TODO(zhin): consider aliasing input to output rather than output to input.
    struct xnn_runtime_value* output = &runtime->values[node->outputs[0]];
    if (output->flags & XNN_VALUE_FLAG_ONE_CONSUMER) {
      uint32_t reuse_id = input_id;
      // If the tensor we are reusing is itself reused, find the "root tensor" to be reused.
      // Chains are formed by earlier iterations of this loop, so following
      // reuse_value_id links always terminates at a non-reusing tensor.
      while (tracker->usage[reuse_id].reuse_value_id != XNN_INVALID_VALUE_ID) {
        reuse_id = tracker->usage[reuse_id].reuse_value_id;
      }
      // We only support when output has a single consumer because we cannot easily find all consumer nodes
      // without traversing the entire graph. This will require tracking output->last_consumer in the future.
      assert(tracker->usage[reuse_id].last_node < output->first_consumer);
      xnn_log_debug("reusing tensor id #%" PRIu32 " memory for tensor id #%" PRIu32 " Node #%" PRIu32 " %s",
                    reuse_id, output->id, node->id, xnn_node_type_to_string(node->type));
      xnn_mark_tensor_as_reuse(tracker, output->id, reuse_id, output->first_consumer);
    }
  }
}
| |
// Propagates the rank through the subgraph so that each tensor's rank is
// correctly set.
// Sets output_value->shape.num_dims for each node's first output based on the
// node type and its input ranks. Only the rank is propagated here, not the
// dimensions themselves.
void propagate_rank(
    xnn_subgraph_t subgraph)
{
  for (size_t i = 0; i < subgraph->num_nodes; i++) {
    const struct xnn_node* node = subgraph->nodes + i;
    const struct xnn_value* input_value = &subgraph->values[node->inputs[0]];
    const struct xnn_value* input_value_b = NULL;
    const uint32_t flags = node->flags;
    if (node->num_inputs > 1) {
      input_value_b = &subgraph->values[node->inputs[1]];
    }
    struct xnn_value* output_value = &subgraph->values[node->outputs[0]];
    switch (node->type) {
      // 2D spatial operators always produce NHWC-style rank-4 outputs.
      case xnn_node_type_argmax_pooling_2d:
      case xnn_node_type_average_pooling_2d:
      case xnn_node_type_convolution_2d:
      case xnn_node_type_deconvolution_2d:
      case xnn_node_type_depth_to_space_2d:
      case xnn_node_type_depthwise_convolution_2d:
      case xnn_node_type_max_pooling_2d:
      case xnn_node_type_rope:
      case xnn_node_type_space_to_depth_2d:
      case xnn_node_type_static_resize_bilinear_2d:
      case xnn_node_type_unpooling_2d:
        output_value->shape.num_dims = 4;
        break;
      // Reductions drop one rank per reduction axis unless KEEP_DIMS is set.
      case xnn_node_type_global_average_pooling_2d:
      case xnn_node_type_global_sum_pooling_1d:
      case xnn_node_type_global_sum_pooling_2d:
      case xnn_node_type_static_mean:
      case xnn_node_type_static_mean_squared:
      case xnn_node_type_static_reduce_max:
      case xnn_node_type_static_reduce_min:
      case xnn_node_type_static_sum:
      case xnn_node_type_static_sum_squared:
        if (flags & XNN_FLAG_KEEP_DIMS) {
          output_value->shape.num_dims = input_value->shape.num_dims;
        } else if (input_value->shape.num_dims >=
                   node->params.reduce.num_reduction_axes) {
          output_value->shape.num_dims = input_value->shape.num_dims -
                                         node->params.reduce.num_reduction_axes;
        } else {
          // Input rank is smaller than the axis count; keep the current rank.
          xnn_log_warning("Unable to determine output rank of Node #%" PRIu32
                          " %s, assuming %zu.",
                          node->id, xnn_node_type_to_string(node->type),
                          output_value->shape.num_dims);
        }
        break;
      // Broadcasting binary ops take the larger of the two input ranks.
      case xnn_node_type_batch_matrix_multiply:
      case xnn_node_type_binary_elementwise:
        output_value->shape.num_dims =
            max(input_value->shape.num_dims, input_value_b->shape.num_dims);
        break;
      // Rank-preserving operators.
      case xnn_node_type_concatenate:
      case xnn_node_type_copy:
      case xnn_node_type_even_split:
      case xnn_node_type_unary_elementwise:
      case xnn_node_type_convert:
      case xnn_node_type_pack_lh:
      case xnn_node_type_softmax:
      case xnn_node_type_static_transpose:
      case xnn_node_type_static_constant_pad:
      case xnn_node_type_static_slice:
        output_value->shape.num_dims = input_value->shape.num_dims;
        break;
      case xnn_node_type_static_expand_dims:
        // Each listed axis adds one dimension to the input rank.
        output_value->shape.num_dims =
            input_value->shape.num_dims +
            node->params.static_reshape.new_shape.num_dims;
        break;
      case xnn_node_type_fully_connected:
      case xnn_node_type_fully_connected_sparse:
        output_value->shape.num_dims = input_value->shape.num_dims;
        break;
      case xnn_node_type_static_reshape:
      case xnn_node_type_static_broadcast:
        // Output rank comes directly from the requested target shape.
        output_value->shape.num_dims =
            node->params.static_reshape.new_shape.num_dims;
        break;
      case xnn_node_type_fuse_dims:
        // NOTE(review): output rank = input rank - (fused_dims + 1); verify
        // this matches fuse_dims semantics (collapsing k dims into one would
        // usually give input rank - k + 1).
        if (input_value->shape.num_dims >=
            node->params.static_reshape.new_shape.num_dims + 1) {
          output_value->shape.num_dims =
              input_value->shape.num_dims -
              (node->params.static_reshape.new_shape.num_dims + 1);
        } else {
          xnn_log_warning("Unable to determine output rank of Node #%" PRIu32
                          " %s, assuming %zu.",
                          node->id, xnn_node_type_to_string(node->type),
                          output_value->shape.num_dims);
        }
        break;
      case xnn_node_type_split_dims:
        // One dimension is split into new_shape.num_dims dimensions.
        if (input_value->shape.num_dims +
                node->params.static_reshape.new_shape.num_dims >=
            1) {
          output_value->shape.num_dims =
              (input_value->shape.num_dims +
               node->params.static_reshape.new_shape.num_dims) -
              1;
        } else {
          xnn_log_warning("Unable to determine output rank of Node #%" PRIu32
                          " %s, assuming %zu.",
                          node->id, xnn_node_type_to_string(node->type),
                          output_value->shape.num_dims);
        }
        break;
      default:
        XNN_UNREACHABLE;
    }
    assert(output_value->shape.num_dims <= XNN_MAX_TENSOR_DIMS);
  }
}
| |
| static enum xnn_status create_runtime_impl( |
| xnn_subgraph_t subgraph, |
| xnn_weights_cache_t weights_cache, |
| xnn_workspace_t workspace, |
| pthreadpool_t threadpool, |
| xnn_threadpool_t xnn_threadpool, |
| uint32_t flags, |
| xnn_runtime_t* runtime_out) |
| { |
| propagate_rank(subgraph); |
| struct xnn_runtime* runtime = NULL; |
| enum xnn_status status = xnn_status_uninitialized; |
| |
| if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) { |
| xnn_log_error("failed to create runtime: XNNPACK is not initialized"); |
| goto error; |
| } |
| |
| xnn_subgraph_rewrite_ssa(subgraph); |
| |
| status = xnn_subgraph_rewrite_for_row_sum(subgraph); |
| if (status != xnn_status_success) { |
| xnn_log_error("failed to rewrite subgraph for row_sum"); |
| goto error; |
| } |
| |
| const uint32_t optimization_flags = |
| XNN_FLAG_HINT_SPARSE_INFERENCE | XNN_FLAG_HINT_FP16_INFERENCE | |
| XNN_FLAG_FORCE_FP16_INFERENCE | XNN_FLAG_NO_OPERATOR_FUSION | |
| XNN_FLAG_NO_INLINED_LHS_PACKING | XNN_FLAG_SLOW_CONSISTENT_ARITHMETIC; |
| |
| status = xnn_subgraph_optimize(subgraph, flags & optimization_flags); |
| if (status != xnn_status_success) { |
| xnn_log_error("failed to optimize subgraph"); |
| goto error; |
| } |
| |
| status = xnn_status_out_of_memory; |
| |
| runtime = xnn_allocate_zero_memory(sizeof(struct xnn_runtime)); |
| if (runtime == NULL) { |
| xnn_log_error("failed to allocate %zu bytes for runtime descriptor", sizeof(struct xnn_runtime)); |
| goto error; |
| } |
| |
| runtime->flags = flags; |
| |
| runtime->opdata = xnn_allocate_zero_memory(sizeof(struct xnn_operator_data) * subgraph->num_nodes); |
| if (runtime->opdata == NULL) { |
| xnn_log_error("failed to allocate %zu bytes for opdata descriptors", |
| sizeof(struct xnn_operator_data) * (size_t) subgraph->num_nodes); |
| goto error; |
| } |
| if (flags & XNN_FLAG_BASIC_PROFILING) { |
| runtime->profiling = true; |
| } |
| |
| runtime->num_ops = subgraph->num_nodes; |
| |
| if (flags & XNN_FLAG_DONT_SPIN_WORKERS) { |
| struct xnn_node* last_valid_node = NULL; |
| for (size_t i = 0; i < subgraph->num_nodes; i++) { |
| struct xnn_node* node = subgraph->nodes + i; |
| if (node->type != xnn_node_type_invalid) { |
| last_valid_node = node; |
| } |
| } |
| if (last_valid_node != NULL) { |
| last_valid_node->flags |= XNN_FLAG_DONT_SPIN_WORKERS; |
| } |
| } |
| |
| if (flags & XNN_FLAG_SLOW_CONSISTENT_ARITHMETIC) { |
| xnn_log_warning( |
| "XNN_FLAG_SLOW_CONSISTENT_ARITHMETIC is enabled: performance will be " |
| "degraded! %d", |
| flags); |
| for (size_t i = 0; i < subgraph->num_nodes; i++) { |
| struct xnn_node* node = subgraph->nodes + i; |
| node->flags |= XNN_FLAG_SLOW_CONSISTENT_ARITHMETIC; |
| } |
| } |
| |
| if (flags & XNN_FLAG_TRANSIENT_INDIRECTION_BUFFER) { |
| for (size_t i = 0; i < subgraph->num_nodes; i++) { |
| struct xnn_node* node = subgraph->nodes + i; |
| switch (node->type) { |
| case xnn_node_type_convolution_2d: |
| case xnn_node_type_depthwise_convolution_2d: |
| case xnn_node_type_static_resize_bilinear_2d: |
| node->flags |= XNN_FLAG_TRANSIENT_INDIRECTION_BUFFER; |
| break; |
| default: |
| break; |
| } |
| } |
| } |
| if (runtime->profiling) { |
| for (size_t i = 0; i < subgraph->num_nodes; i++) { |
| runtime->opdata[i].end_ts = xnn_allocate_zero_memory(sizeof(xnn_timestamp) * XNN_MAX_OPERATOR_OBJECTS); |
| } |
| } |
| |
| runtime->values = xnn_allocate_zero_memory(sizeof(struct xnn_runtime_value) * subgraph->num_values); |
| if (runtime->values == NULL) { |
| xnn_log_error("failed to allocate %zu bytes for runtime's value descriptors", |
| sizeof(struct xnn_runtime_value) * (size_t) subgraph->num_values); |
| goto error; |
| } |
| |
| // Run a final analysis phase, no more modifications after this point. |
| xnn_subgraph_analyze_consumers_and_producers(subgraph); |
| // Make a copy of subgraph values since we can change them and runtime can outlive subgraph. |
| for (size_t i = 0; i < subgraph->num_values; i++) { |
| xnn_runtime_value_copy(runtime->values + i, subgraph->values + i); |
| // Value copy doesn't copy the id, but we want the same ID. |
| runtime->values[i].id = subgraph->values[i].id; |
| } |
| runtime->num_values = subgraph->num_values; |
| // No more optimizations should be performed on subgraph at this point, since modifications on the subgraph will not |
| // be copied to the runtime's values. |
| |
| for (size_t i = 0; i < subgraph->num_nodes; i++) { |
| const struct xnn_node* node = subgraph->nodes + i; |
| |
| // Initialize common fields we need for analysis. |
| runtime->opdata[i].type = node->type; |
| runtime->opdata[i].flags = node->flags; |
| runtime->opdata[i].id = node->id; |
| runtime->opdata[i].num_inputs = node->num_inputs; |
| runtime->opdata[i].num_outputs = node->num_outputs; |
| // Copy all inputs (not just num_inputs) to get all invalid ID (e.g. no bias). |
| for (size_t input_i = 0; input_i < node->num_inputs; input_i++) { |
| runtime->opdata[i].inputs[input_i] = node->inputs[input_i]; |
| } |
| for (size_t output_i = 0; output_i < node->num_outputs; output_i++) { |
| runtime->opdata[i].outputs[output_i] = node->outputs[output_i]; |
| } |
| |
| // Ignore fused nodes |
| if (node->type != xnn_node_type_invalid) { |
| assert(node->create != NULL); |
| status = node->create(node, runtime->values, runtime->num_values, runtime->opdata + i, weights_cache); |
| if (status != xnn_status_success) { |
| xnn_log_error("failed to create node %zu", i); |
| goto error; |
| } |
| runtime->opdata[i].setup = node->setup; |
| runtime->opdata[i].reshape = node->reshape; |
| } |
| } |
| |
| runtime->threadpool = threadpool; |
| |
| for (uint32_t i = 0; i < runtime->num_values; i++) { |
| struct xnn_runtime_value* value = &runtime->values[i]; |
| if (!xnn_value_is_valid(value->type)) { |
| continue; |
| } |
| |
| if (value->flags & XNN_VALUE_FLAG_FP16_COMPATIBLE && xnn_value_is_static(value->allocation_type)) { |
| // Value is static and has been converted to FP16 in a new buffer. |
| value->flags |= XNN_VALUE_FLAG_NEEDS_CLEANUP; |
| // Runtime takes ownership of the data from subgraph. |
| value->data = subgraph->values[i].data; |
| subgraph->values[i].data = NULL; |
| } |
| } |
| |
| // Create and/or add a workspace. |
| if (workspace == NULL) { |
| xnn_log_debug("Allocating non-shared workspace"); |
| workspace = xnn_allocate_zero_memory(sizeof(struct xnn_workspace)); |
| if (workspace == NULL) { |
| xnn_log_error("failed to allocate %zu bytes for non-shared workspace", |
| sizeof(struct xnn_workspace)); |
| goto error; |
| } |
| } |
| xnn_retain_workspace(workspace); |
| runtime->workspace = workspace; |
| runtime->next_workspace_user = runtime->workspace->first_user; |
| runtime->workspace->first_user = runtime; |
| |
| *runtime_out = runtime; |
| return xnn_status_success; |
| |
| error: |
| xnn_delete_runtime(runtime); |
| return status; |
| } |
| |
| enum xnn_status xnn_create_runtime_v4( |
| xnn_subgraph_t subgraph, |
| xnn_weights_cache_t weights_cache, |
| xnn_workspace_t workspace, |
| pthreadpool_t threadpool, |
| uint32_t flags, |
| xnn_runtime_t* runtime_out) |
| { |
| return create_runtime_impl(subgraph, weights_cache, workspace, threadpool, |
| /*xnn_threadpool=*/NULL, flags, runtime_out); |
| } |
| |
| // The xnn_threadpool consists of an `xnn_scheduler_v2` and its context. |
struct xnn_threadpool {
  // Scheduler callbacks (num_threads, schedule) supplied by the caller.
  struct xnn_scheduler_v2 scheduler;
  // Opaque context passed back to every scheduler callback; not owned.
  void* scheduler_context;
};
| |
| enum xnn_status xnn_create_threadpool_v2(struct xnn_scheduler_v2 scheduler, |
| void* scheduler_context, |
| uint32_t flags, |
| xnn_threadpool_t* threadpool_out) { |
| *threadpool_out = xnn_allocate_memory(sizeof(struct xnn_threadpool)); |
| (*threadpool_out)->scheduler = scheduler; |
| (*threadpool_out)->scheduler_context = scheduler_context; |
| return xnn_status_success; |
| } |
| |
| enum xnn_status xnn_delete_threadpool(xnn_threadpool_t threadpool) { |
| xnn_release_memory(threadpool); |
| return xnn_status_success; |
| } |
| |
// Returns the thread count reported by the threadpool's scheduler callback.
int xnn_threadpool_num_threads(xnn_threadpool_t threadpool) {
  return threadpool->scheduler.num_threads(threadpool->scheduler_context);
}
| |
// Submits `task(context)` to the threadpool's scheduler. Fire-and-forget: the
// scheduler's return (if any) is not inspected, so this always reports success.
enum xnn_status xnn_threadpool_schedule(xnn_threadpool_t threadpool,
                                        void* context,
                                        void (*task)(void* context)) {
  threadpool->scheduler.schedule(threadpool->scheduler_context, context, task);
  return xnn_status_success;
}
| |
| enum xnn_status xnn_update_runtime_with_threadpool( |
| xnn_runtime_t runtime, xnn_threadpool_t xnn_threadpool) { |
| if (xnn_threadpool) { |
| struct pthreadpool_executor executor; |
| executor.num_threads = xnn_threadpool->scheduler.num_threads; |
| executor.schedule = xnn_threadpool->scheduler.schedule; |
| pthreadpool_update_executor(runtime->threadpool, &executor, |
| xnn_threadpool->scheduler_context); |
| } |
| return xnn_status_success; |
| } |
| |
| enum xnn_status xnn_create_runtime_with_threadpool( |
| xnn_subgraph_t subgraph, xnn_weights_cache_t weights_cache, |
| xnn_threadpool_t xnn_threadpool, uint32_t flags, |
| xnn_runtime_t* runtime_out) { |
| pthreadpool_t threadpool = NULL; |
| if (xnn_threadpool) { |
| struct pthreadpool_executor executor; |
| executor.num_threads = xnn_threadpool->scheduler.num_threads; |
| executor.schedule = xnn_threadpool->scheduler.schedule; |
| threadpool = |
| pthreadpool_create_v2(&executor, xnn_threadpool->scheduler_context, 0); |
| flags |= XNN_FLAG_RUNTIME_OWNS_THREADPOOL; |
| } |
| |
| return create_runtime_impl(subgraph, weights_cache, /*workspace=*/NULL, |
| threadpool, xnn_threadpool, flags, runtime_out); |
| } |
| |
// Plans workspace memory for the runtime: records each workspace-resident
// tensor's size and each operator's scratch requirement in an allocation
// tracker, applies the in-place reuse optimization, runs the planner, and
// finally binds the planned offsets to real workspace pointers.
enum xnn_status xnn_plan_memory(
  xnn_runtime_t runtime) {
  enum xnn_status status = xnn_status_invalid_state;
  struct xnn_value_allocation_tracker mem_alloc_tracker;
  xnn_init_value_allocation_tracker(&mem_alloc_tracker, runtime);

  for (uint32_t i = 0; i < runtime->num_values; i++) {
    const struct xnn_runtime_value* value = &runtime->values[i];
    if (!xnn_value_is_valid(value->type)) {
      continue;
    }

    if (value->allocation_type == xnn_allocation_type_workspace) {
      // Value is purely internal to the runtime, and must be allocated in its workspace.
      size_t tensor_size = xnn_tensor_get_rounded_size(value);
      if (value->datatype == xnn_datatype_qdint8 || value->datatype == xnn_datatype_qduint8) {
        // Dynamically-quantized tensors also carry quantization params and
        // row sums packed after the tensor data (see initialize_workspace_values).
        tensor_size += xnn_tensor_get_rounded_dynamic_quant_param_size(value);
        tensor_size += xnn_tensor_get_rounded_row_sum_size(value);
      }
      xnn_add_value_allocation_tracker(&mem_alloc_tracker, i, tensor_size);
    }
  }

  // Operator scratch records are tracked after the value records, at index
  // num_values + opdata_id.
  for (uint32_t opdata_id = 0; opdata_id < runtime->num_ops; opdata_id++) {
    struct xnn_operator_data* opdata = &runtime->opdata[opdata_id];
    xnn_add_operator_workspace_allocation_tracker(
      &mem_alloc_tracker, runtime->num_values + opdata_id, xnn_get_rounded_size(opdata->workspace_size),
      opdata_id);
  }

  optimize_tensor_allocation_for_in_place_operations(&mem_alloc_tracker, runtime);
  xnn_plan_value_allocation_tracker(&mem_alloc_tracker);

  status = initialize_workspace_values(runtime, &mem_alloc_tracker);
  if (status != xnn_status_success) {
    xnn_log_debug("failed to initialize_workspace_values");
    goto error;
  }

  xnn_release_value_allocation_tracker(&mem_alloc_tracker);

  return xnn_status_success;

error:
  xnn_release_value_allocation_tracker(&mem_alloc_tracker);
  return status;
}
| |
| enum xnn_status xnn_reshape_runtime(xnn_runtime_t runtime) { |
| bool reallocation_required = false; |
| |
| for (uint32_t opdata_id = 0; opdata_id < runtime->num_ops; opdata_id++) { |
| struct xnn_operator_data* opdata = &runtime->opdata[opdata_id]; |
| if (opdata->operator_objects[0] == NULL) { |
| // Operator was removed during optimization |
| continue; |
| } |
| assert(opdata->reshape != NULL); |
| xnn_log_debug("reshaping operator %u (%s)", opdata_id, |
| xnn_operator_type_to_string_v2(opdata->operator_objects[0])); |
| enum xnn_status status = opdata->reshape(opdata, runtime->values, runtime->num_values, runtime->threadpool); |
| if (status == xnn_status_reallocation_required) { |
| reallocation_required = true; |
| } else if (status != xnn_status_success) { |
| xnn_log_error( |
| "Operator #%u: %s failed reshape", opdata_id, |
| xnn_operator_type_to_string_v2(opdata->operator_objects[0])); |
| return status; |
| } |
| } |
| if (reallocation_required || !runtime->memory_planned) { |
| runtime->memory_planned = true; |
| return xnn_plan_memory(runtime); |
| } |
| return xnn_status_success; |
| } |
| |
| static enum xnn_status set_external_values( |
| xnn_runtime_t runtime, |
| size_t num_external_values, |
| const struct xnn_external_value* external_values) |
| { |
| // Validate inputs without changing internal state. |
| // This ensures that runtime stays in consistent state in case validation fails midway. |
| for (size_t i = 0; i < num_external_values; i++) { |
| const struct xnn_external_value* external_value = &external_values[i]; |
| const uint32_t value_id = external_value->id; |
| if (value_id >= runtime->num_values) { |
| xnn_log_error("failed to setup runtime: out-of-bounds ID %" PRIu32 " in external value #%zu", |
| value_id, i); |
| return xnn_status_invalid_parameter; |
| } |
| |
| const struct xnn_runtime_value* value = &runtime->values[value_id]; |
| if (value->allocation_type != xnn_allocation_type_external) { |
| xnn_log_error("failed to setup runtime: Value %" PRIu32 " is not external (%d)", value_id, value->allocation_type); |
| return xnn_status_invalid_parameter; |
| } |
| } |
| |
| // Apply runtime state changes. |
| for (size_t i = 0; i < num_external_values; i++) { |
| const struct xnn_external_value* external_value = &external_values[i]; |
| const uint32_t value_id = external_value->id; |
| struct xnn_runtime_value* value = &runtime->values[value_id]; |
| value->data = external_value->data; |
| } |
| return xnn_status_success; |
| } |
| |
| static enum xnn_status setup_runtime(xnn_runtime_t runtime) |
| { |
| for (uint32_t opdata_id = 0; opdata_id < runtime->num_ops; opdata_id++) { |
| struct xnn_operator_data* opdata = &runtime->opdata[opdata_id]; |
| for (size_t j = 0; j < XNN_MAX_OPERATOR_OBJECTS; j++) { |
| if (opdata->operator_objects[j] == NULL) { |
| // Operator was removed during optimization |
| continue; |
| } |
| |
| assert(opdata->setup != NULL); |
| enum xnn_status status = opdata->setup(opdata, runtime->values, runtime->num_values, runtime->threadpool); |
| if (status != xnn_status_success) { |
| xnn_log_error("failed to setup runtime: error in setting pointers of operator #%u", opdata_id); |
| return status; |
| } |
| } |
| } |
| |
| runtime->has_been_setup = true; |
| return xnn_status_success; |
| } |
| |
| enum xnn_status xnn_setup_runtime( |
| xnn_runtime_t runtime, |
| size_t num_external_values, |
| const struct xnn_external_value* external_values) |
| { |
| enum xnn_status status = set_external_values(runtime, num_external_values, external_values); |
| if (status != xnn_status_success) { |
| return status; |
| } |
| |
| status = xnn_reshape_runtime(runtime); |
| if (status != xnn_status_success) { |
| xnn_log_error("failed to setup runtime: error in reshaping runtime"); |
| return status; |
| } |
| |
| return setup_runtime(runtime); |
| } |
| |
| enum xnn_status xnn_setup_runtime_v2( |
| xnn_runtime_t runtime, |
| size_t num_external_values, |
| const struct xnn_external_value* external_values) |
| { |
| enum xnn_status status = set_external_values(runtime, num_external_values, external_values); |
| if (status != xnn_status_success) { |
| return status; |
| } |
| |
| return setup_runtime(runtime); |
| } |
| |
// Reads a monotonic timestamp for operator profiling. The representation of
// `xnn_timestamp` differs per platform (each #ifdef branch below assigns a
// different kind of value); only xnn_get_elapsed_time interprets it. On
// failure the error is logged and the timestamp is zeroed (or, on Mach, left
// as the 0 returned by the failing call), so profiling degrades gracefully
// instead of aborting.
static xnn_timestamp xnn_read_timer() {
  xnn_timestamp timestamp;
#ifdef __MACH__
  // clock_gettime_nsec_np returns nanoseconds, or 0 on failure.
  timestamp = clock_gettime_nsec_np(CLOCK_UPTIME_RAW);
  if (timestamp == 0) {
    xnn_log_warning("clock_gettime failed: error code %d", errno);
  }
#elif __EMSCRIPTEN__
  // emscripten_get_now returns a double in milliseconds.
  timestamp = emscripten_get_now();
#elif XNN_PLATFORM_WINDOWS
  // QueryPerformanceCounter fills a LARGE_INTEGER tick count.
  BOOL res = QueryPerformanceCounter(&timestamp);
  if (!res) {
    xnn_log_error("QueryPerformanceCounter failed: error code %u", GetLastError());
    memset(&timestamp, 0, sizeof(timestamp));
  }
#else
  // POSIX: struct timespec from the monotonic clock.
  int res = clock_gettime(CLOCK_MONOTONIC, &timestamp);
  if (res != 0) {
    xnn_log_error("clock_gettime failed: error code %d", errno);
    memset(&timestamp, 0, sizeof(timestamp));
  }
#endif
  return timestamp;
}
| |
// Returns the elapsed time between two xnn_read_timer() timestamps, in
// microseconds (each branch converts from its platform's native unit).
// Assumes `end` was read after `start`.
static inline uint64_t xnn_get_elapsed_time(const xnn_timestamp* start, const xnn_timestamp* end) {
#ifdef __MACH__
  // Timestamps are nanosecond counts.
  const uint64_t kMicrosInNanos = 1000;
  return (*end - *start) / kMicrosInNanos;
#elif __EMSCRIPTEN__
  // Timestamps are doubles in milliseconds.
  const double kMillisInMicros = 1.0e3;
  return (uint64_t) ((*end - *start) * kMillisInMicros);
#elif XNN_PLATFORM_WINDOWS
  // Timestamps are QueryPerformanceCounter ticks; convert via the counter
  // frequency (ticks per second). Returns 0 if the frequency query fails.
  const uint64_t kMicrosInSec = 1000 * 1000;
  LARGE_INTEGER frequency;
  BOOL res = QueryPerformanceFrequency(&frequency);
  if (!res) {
    xnn_log_error("QueryPerformanceFrequency failed: error code %u", GetLastError());
    return 0;
  }
  return ((end->QuadPart - start->QuadPart) * kMicrosInSec) / frequency.QuadPart;
#else
  // Timestamps are struct timespec. Note: if end->tv_nsec < start->tv_nsec
  // the ns_secs subtraction wraps as unsigned, but the wraparound cancels
  // when added to the (correspondingly larger) whole-second difference.
  const uint64_t kNanosInMicro = UINT64_C(1000);
  const uint64_t kNanosInSec = UINT64_C(1000000000);
  const uint64_t secs = (end->tv_sec - start->tv_sec) * kNanosInSec;
  const uint64_t ns_secs = (end->tv_nsec - start->tv_nsec);
  return (secs + ns_secs) / kNanosInMicro;
#endif
}
| |
// Queries profiling data from a runtime created with profiling enabled.
// Follows a two-call size-query protocol: if `param_value_size` is too small,
// the required byte count is stored in `*param_value_size_ret` and
// xnn_status_out_of_memory is returned; otherwise the data is copied into
// `param_value`. Operators removed during optimization (NULL first operator
// object) are skipped in all counts and outputs.
// NOTE(review): `*param_value_size_ret` is only written on the too-small
// path, and is dereferenced without a NULL check — callers must pass a valid
// pointer and pre-query the size; confirm against the public API contract.
enum xnn_status xnn_get_runtime_profiling_info(xnn_runtime_t runtime,
                                               enum xnn_profile_info param_name,
                                               size_t param_value_size,
                                               void* param_value,
                                               size_t* param_value_size_ret)
{
  // Profiling data only exists if the runtime was created with profiling on.
  if (!runtime->profiling) {
    return xnn_status_invalid_state;
  }
  enum xnn_status status = xnn_status_success;
  size_t required_size = 0;
  const struct xnn_operator_data* opdata = runtime->opdata;
  switch (param_name) {
    case xnn_profile_info_num_operators:
      // Output: a single size_t holding the count of live operators.
      required_size = sizeof(size_t);
      if (param_value_size < required_size){
        *param_value_size_ret = required_size;
        status = xnn_status_out_of_memory;
      } else {
        size_t num_valid_ops = 0;
        for (size_t i = 0; i < runtime->num_ops; ++i) {
          if (opdata[i].operator_objects[0] != NULL) {
            num_valid_ops += 1;
          }
        }
        memcpy(param_value, &num_valid_ops, required_size);
      }
      break;
    case xnn_profile_info_operator_name:
      // Output: concatenated NUL-terminated names, one per live operator,
      // formatted either as "<op name>\0" or "<op name> <ukernel type>\0".
      // First pass: compute the total buffer size needed.
      for (size_t i = 0; i < runtime->num_ops; ++i) {
        if (opdata[i].operator_objects[0] != NULL) {
          const char* op_name =
              xnn_operator_type_to_string_v2(opdata[i].operator_objects[0]);
          size_t op_name_len = strlen(op_name) + 1;
          if (opdata[i].operator_objects[0]->ukernel.type != xnn_microkernel_type_default ) {
            // +1 accounts for the separating space before the ukernel name.
            op_name_len += strlen(xnn_microkernel_type_to_string(opdata[i].operator_objects[0]->ukernel.type)) + 1;
          }
          required_size += op_name_len;
        }
      }
      if (param_value_size < required_size) {
        *param_value_size_ret = required_size;
        status = xnn_status_out_of_memory;
      } else {
        // Second pass: write each name; snprintf's size bound makes it emit
        // exactly op_name_len bytes including the terminating NUL.
        char* name_out = (char*) param_value;
        for (size_t i = 0; i < runtime->num_ops; ++i) {
          if (opdata[i].operator_objects[0] != NULL) {
            const char* op_name =
                xnn_operator_type_to_string_v2(opdata[i].operator_objects[0]);
            size_t op_name_len = strlen(op_name) + 1;
            if (opdata[i].operator_objects[0]->ukernel.type != xnn_microkernel_type_default ) {
              const char* ukernel_type = xnn_microkernel_type_to_string(opdata[i].operator_objects[0]->ukernel.type);
              op_name_len += strlen(ukernel_type) + 1;
              snprintf(name_out, op_name_len, "%s %s", op_name, ukernel_type);
            } else {
              snprintf(name_out, op_name_len, "%s", op_name);
            }
            name_out += op_name_len;
          }
        }
      }
      break;
    case xnn_profile_info_operator_timing:
    {
      // Output: one uint64_t per live operator, giving the time spent in that
      // operator during the last xnn_invoke_runtime call.
      size_t num_valid_ops = 0;
      for (size_t i = 0; i < runtime->num_ops; ++i) {
        if (opdata[i].operator_objects[0] != NULL) {
          num_valid_ops += 1;
        }
      }
      required_size = num_valid_ops * sizeof(uint64_t);
      if (param_value_size < required_size) {
        *param_value_size_ret = required_size;
        status = xnn_status_out_of_memory;
      } else {
        // Per-operator time is the sum of deltas between consecutive
        // end-of-object timestamps, chained from the invoke start timestamp.
        xnn_timestamp previous_ts = runtime->start_ts;
        uint64_t* data = (uint64_t*) param_value;
        for (size_t i = 0; i < runtime->num_ops; ++i) {
          if (opdata[i].operator_objects[0] != NULL) {
            uint64_t op_time = 0;
            for (size_t j = 0; j < XNN_MAX_OPERATOR_OBJECTS; j++) {
              if (opdata[i].operator_objects[j] != NULL) {
                op_time += xnn_get_elapsed_time(&previous_ts, &opdata[i].end_ts[j]);
                previous_ts = opdata[i].end_ts[j];
              }
            }
            *data++ = op_time;
          }
        }
      }
      break;
    }
    default:
      status = xnn_status_invalid_parameter;
  }
  return status;
}
| |
| enum xnn_status xnn_invoke_runtime( |
| xnn_runtime_t runtime) |
| { |
| if (runtime->profiling) { |
| runtime->start_ts = xnn_read_timer(); |
| } |
| for (size_t i = 0; i < runtime->num_ops; i++) { |
| for (size_t j = 0; j < XNN_MAX_OPERATOR_OBJECTS; j++) { |
| if (runtime->opdata[i].operator_objects[j] == NULL) { |
| // Operator was removed after fusion |
| continue; |
| } |
| |
| const enum xnn_status status = xnn_run_operator_with_index(runtime->opdata[i].operator_objects[j], i, j, runtime->threadpool); |
| if (status != xnn_status_success) { |
| return status; |
| } |
| if (runtime->profiling) { |
| runtime->opdata[i].end_ts[j] = xnn_read_timer(); |
| } |
| } |
| } |
| |
| // If the `pthreadpool` is using an external `pthreadpool_executor`, release |
| // the executor threads. |
| if (runtime->flags & XNN_FLAG_DONT_SPIN_WORKERS) { |
| pthreadpool_release_executor_threads(runtime->threadpool); |
| } |
| |
| return xnn_status_success; |
| } |
| |
| enum xnn_status xnn_delete_runtime( |
| xnn_runtime_t runtime) |
| { |
| if (runtime != NULL) { |
| if (runtime->opdata != NULL) { |
| for (size_t i = 0; i < runtime->num_ops; i++) { |
| for (size_t j = 0; j < XNN_MAX_OPERATOR_OBJECTS; j++) { |
| xnn_delete_operator(runtime->opdata[i].operator_objects[j]); |
| } |
| xnn_release_memory(runtime->opdata[i].end_ts); |
| } |
| xnn_release_memory(runtime->opdata); |
| |
| if (runtime->values != NULL) { |
| // Release the buffers created during FP16 rewrite. |
| for (size_t i = 0; i < runtime->num_values; i++) { |
| struct xnn_runtime_value* value = &runtime->values[i]; |
| if (value->allocation_type == xnn_allocation_type_dynamic || |
| value->flags & XNN_VALUE_FLAG_NEEDS_CLEANUP) { |
| xnn_release_memory(value->data); |
| } |
| } |
| xnn_release_memory(runtime->values); |
| } |
| |
| if (runtime->workspace != NULL) { |
| // Remove this runtime from the list of users of the workspace. |
| assert(runtime->workspace->first_user != NULL); |
| if (runtime->workspace->first_user == runtime) { |
| runtime->workspace->first_user = runtime->next_workspace_user; |
| } else { |
| xnn_runtime_t prev = runtime->workspace->first_user; |
| xnn_runtime_t curr = prev->next_workspace_user; |
| while (curr != runtime) { |
| prev = curr; |
| curr = curr->next_workspace_user; |
| } |
| assert(curr == runtime); |
| prev->next_workspace_user = curr->next_workspace_user; |
| } |
| xnn_release_workspace(runtime->workspace); |
| } |
| } |
| |
| if (runtime->threadpool && |
| runtime->flags & XNN_FLAG_RUNTIME_OWNS_THREADPOOL) { |
| if (runtime->threadpool) { |
| pthreadpool_destroy(runtime->threadpool); |
| } |
| } |
| |
| xnn_release_memory(runtime); |
| } |
| return xnn_status_success; |
| } |