| // SPDX-License-Identifier: Apache-2.0 |
| // ---------------------------------------------------------------------------- |
| // Copyright 2011-2021 Arm Limited |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); you may not |
| // use this file except in compliance with the License. You may obtain a copy |
| // of the License at: |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT |
| // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the |
| // License for the specific language governing permissions and limitations |
| // under the License. |
| // ---------------------------------------------------------------------------- |
| |
| /** |
| * @brief Functions for finding dominant direction of a set of colors. |
| */ |
| #if !defined(ASTCENC_DECOMPRESS_ONLY) |
| |
| #include "astcenc_internal.h" |
| |
| #include <cassert> |
| |
| /* See header for documentation. */ |
| void compute_avgs_and_dirs_4_comp( |
| const partition_info& pi, |
| const image_block& blk, |
| const error_weight_block& ewb, |
| partition_metrics pm[BLOCK_MAX_PARTITIONS] |
| ) { |
| int partition_count = pi.partition_count; |
| promise(partition_count > 0); |
| |
| for (int partition = 0; partition < partition_count; partition++) |
| { |
| const uint8_t *weights = pi.texels_of_partition[partition]; |
| |
| vfloat4 error_sum = vfloat4::zero(); |
| vfloat4 base_sum = vfloat4::zero(); |
| vfloat4 rgba_min(1e38f); |
| vfloat4 rgba_max(-1e38f); |
| float partition_weight = 0.0f; |
| |
| int texel_count = pi.partition_texel_count[partition]; |
| promise(texel_count > 0); |
| |
| for (int i = 0; i < texel_count; i++) |
| { |
| int iwt = weights[i]; |
| float weight = ewb.texel_weight[iwt]; |
| vfloat4 texel_datum = blk.texel(iwt); |
| vfloat4 error_weight = ewb.error_weights[iwt]; |
| |
| if (weight > 1e-10f) |
| { |
| rgba_min = min(texel_datum, rgba_min); |
| rgba_max = max(texel_datum, rgba_max); |
| } |
| |
| partition_weight += weight; |
| base_sum += texel_datum * weight; |
| error_sum += error_weight; |
| } |
| |
| error_sum = error_sum / texel_count; |
| vfloat4 csf = normalize(sqrt(error_sum)) * 2.0f; |
| |
| vfloat4 average = base_sum * (1.0f / astc::max(partition_weight, 1e-7f)); |
| |
| pm[partition].error_weight = error_sum; |
| pm[partition].avg = average * csf; |
| pm[partition].color_scale = csf; |
| pm[partition].icolor_scale = 1.0f / max(csf, 1e-7f); |
| vfloat4 range = max(rgba_max - rgba_min, 1e-10f); |
| pm[partition].range_sq = range * range; |
| |
| vfloat4 sum_xp = vfloat4::zero(); |
| vfloat4 sum_yp = vfloat4::zero(); |
| vfloat4 sum_zp = vfloat4::zero(); |
| vfloat4 sum_wp = vfloat4::zero(); |
| |
| for (int i = 0; i < texel_count; i++) |
| { |
| int iwt = weights[i]; |
| float weight = ewb.texel_weight[iwt]; |
| vfloat4 texel_datum = blk.texel(iwt); |
| texel_datum = (texel_datum - average) * weight; |
| |
| vfloat4 zero = vfloat4::zero(); |
| |
| vmask4 tdm0 = vfloat4(texel_datum.lane<0>()) > zero; |
| sum_xp += select(zero, texel_datum, tdm0); |
| |
| vmask4 tdm1 = vfloat4(texel_datum.lane<1>()) > zero; |
| sum_yp += select(zero, texel_datum, tdm1); |
| |
| vmask4 tdm2 = vfloat4(texel_datum.lane<2>()) > zero; |
| sum_zp += select(zero, texel_datum, tdm2); |
| |
| vmask4 tdm3 = vfloat4(texel_datum.lane<3>()) > zero; |
| sum_wp += select(zero, texel_datum, tdm3); |
| } |
| |
| float prod_xp = dot_s(sum_xp, sum_xp); |
| float prod_yp = dot_s(sum_yp, sum_yp); |
| float prod_zp = dot_s(sum_zp, sum_zp); |
| float prod_wp = dot_s(sum_wp, sum_wp); |
| |
| vfloat4 best_vector = sum_xp; |
| float best_sum = prod_xp; |
| |
| if (prod_yp > best_sum) |
| { |
| best_vector = sum_yp; |
| best_sum = prod_yp; |
| } |
| |
| if (prod_zp > best_sum) |
| { |
| best_vector = sum_zp; |
| best_sum = prod_zp; |
| } |
| |
| if (prod_wp > best_sum) |
| { |
| best_vector = sum_wp; |
| } |
| |
| pm[partition].dir = best_vector; |
| } |
| } |
| |
| /* See header for documentation. */ |
| void compute_avgs_and_dirs_3_comp( |
| const partition_info& pi, |
| const image_block& blk, |
| const error_weight_block& ewb, |
| unsigned int omitted_component, |
| partition_metrics pm[BLOCK_MAX_PARTITIONS] |
| ) { |
| const float *texel_weights = ewb.texel_weight_rgb; |
| |
| const float* data_vr = blk.data_r; |
| const float* data_vg = blk.data_g; |
| const float* data_vb = blk.data_b; |
| |
| const float* error_vr = ewb.texel_weight_r; |
| const float* error_vg = ewb.texel_weight_g; |
| const float* error_vb = ewb.texel_weight_b; |
| |
| if (omitted_component == 0) |
| { |
| texel_weights = ewb.texel_weight_gba; |
| |
| data_vr = blk.data_g; |
| data_vg = blk.data_b; |
| data_vb = blk.data_a; |
| |
| error_vr = ewb.texel_weight_g; |
| error_vg = ewb.texel_weight_b; |
| error_vb = ewb.texel_weight_a; |
| } |
| else if (omitted_component == 1) |
| { |
| texel_weights = ewb.texel_weight_rba; |
| |
| data_vg = blk.data_b; |
| data_vb = blk.data_a; |
| |
| error_vg = ewb.texel_weight_b; |
| error_vb = ewb.texel_weight_a; |
| } |
| else if (omitted_component == 2) |
| { |
| texel_weights = ewb.texel_weight_rga; |
| |
| data_vb = blk.data_a; |
| |
| error_vb = ewb.texel_weight_a; |
| } |
| |
| int partition_count = pi.partition_count; |
| promise(partition_count > 0); |
| |
| for (int partition = 0; partition < partition_count; partition++) |
| { |
| const uint8_t *weights = pi.texels_of_partition[partition]; |
| |
| vfloat4 error_sum = vfloat4::zero(); |
| vfloat4 base_sum = vfloat4::zero(); |
| vfloat4 rgb_min(1e38f); |
| vfloat4 rgb_max(-1e38f); |
| float partition_weight = 0.0f; |
| |
| int texel_count = pi.partition_texel_count[partition]; |
| promise(texel_count > 0); |
| |
| for (int i = 0; i < texel_count; i++) |
| { |
| int iwt = weights[i]; |
| float weight = texel_weights[iwt]; |
| |
| vfloat4 texel_datum(data_vr[iwt], |
| data_vg[iwt], |
| data_vb[iwt], |
| 0.0f); |
| |
| vfloat4 error_weight(error_vr[iwt], |
| error_vg[iwt], |
| error_vb[iwt], |
| 0.0f); |
| |
| if (weight > 1e-10f) |
| { |
| rgb_min = min(texel_datum, rgb_min); |
| rgb_max = max(texel_datum, rgb_max); |
| } |
| |
| partition_weight += weight; |
| base_sum += texel_datum * weight; |
| error_sum += error_weight; |
| } |
| |
| error_sum = error_sum / texel_count; |
| vfloat4 csf = normalize(sqrt(error_sum)) * 1.73205080f; |
| |
| vfloat4 average = base_sum * (1.0f / astc::max(partition_weight, 1e-7f)); |
| |
| pm[partition].error_weight = error_sum; |
| pm[partition].avg = average * csf; |
| pm[partition].color_scale = csf; |
| pm[partition].icolor_scale = 1.0f / max(csf, 1e-7f); |
| vfloat4 range = max(rgb_max - rgb_min, 1e-10f); |
| pm[partition].range_sq = range * range; |
| |
| vfloat4 sum_xp = vfloat4::zero(); |
| vfloat4 sum_yp = vfloat4::zero(); |
| vfloat4 sum_zp = vfloat4::zero(); |
| |
| for (int i = 0; i < texel_count; i++) |
| { |
| int iwt = weights[i]; |
| float weight = texel_weights[iwt]; |
| vfloat4 texel_datum = vfloat3(data_vr[iwt], |
| data_vg[iwt], |
| data_vb[iwt]); |
| texel_datum = (texel_datum - average) * weight; |
| |
| vfloat4 zero = vfloat4::zero(); |
| |
| vmask4 tdm0 = vfloat4(texel_datum.lane<0>()) > zero; |
| sum_xp += select(zero, texel_datum, tdm0); |
| |
| vmask4 tdm1 = vfloat4(texel_datum.lane<1>()) > zero; |
| sum_yp += select(zero, texel_datum, tdm1); |
| |
| vmask4 tdm2 = vfloat4(texel_datum.lane<2>()) > zero; |
| sum_zp += select(zero, texel_datum, tdm2); |
| } |
| |
| float prod_xp = dot3_s(sum_xp, sum_xp); |
| float prod_yp = dot3_s(sum_yp, sum_yp); |
| float prod_zp = dot3_s(sum_zp, sum_zp); |
| |
| vfloat4 best_vector = sum_xp; |
| float best_sum = prod_xp; |
| |
| if (prod_yp > best_sum) |
| { |
| best_vector = sum_yp; |
| best_sum = prod_yp; |
| } |
| |
| if (prod_zp > best_sum) |
| { |
| best_vector = sum_zp; |
| } |
| |
| pm[partition].dir = best_vector; |
| } |
| } |
| |
| /* See header for documentation. */ |
| void compute_avgs_and_dirs_2_comp( |
| const partition_info& pt, |
| const image_block& blk, |
| const error_weight_block& ewb, |
| unsigned int component1, |
| unsigned int component2, |
| partition_metrics pm[BLOCK_MAX_PARTITIONS] |
| ) { |
| const float *texel_weights; |
| |
| const float* data_vr = nullptr; |
| const float* data_vg = nullptr; |
| |
| const float* error_vr = nullptr; |
| const float* error_vg = nullptr; |
| |
| if (component1 == 0 && component2 == 1) |
| { |
| texel_weights = ewb.texel_weight_rg; |
| |
| data_vr = blk.data_r; |
| data_vg = blk.data_g; |
| |
| error_vr = ewb.texel_weight_r; |
| error_vg = ewb.texel_weight_g; |
| } |
| else if (component1 == 0 && component2 == 2) |
| { |
| texel_weights = ewb.texel_weight_rb; |
| |
| data_vr = blk.data_r; |
| data_vg = blk.data_b; |
| |
| error_vr = ewb.texel_weight_r; |
| error_vg = ewb.texel_weight_b; |
| } |
| else // (component1 == 1 && component2 == 2) |
| { |
| assert(component1 == 1 && component2 == 2); |
| texel_weights = ewb.texel_weight_gb; |
| |
| data_vr = blk.data_g; |
| data_vg = blk.data_b; |
| |
| |
| error_vr = ewb.texel_weight_g; |
| error_vg = ewb.texel_weight_b; |
| } |
| |
| unsigned int partition_count = pt.partition_count; |
| promise(partition_count > 0); |
| |
| for (unsigned int partition = 0; partition < partition_count; partition++) |
| { |
| const uint8_t *weights = pt.texels_of_partition[partition]; |
| |
| vfloat4 error_sum = vfloat4::zero(); |
| vfloat4 base_sum = vfloat4::zero(); |
| float partition_weight = 0.0f; |
| |
| unsigned int texel_count = pt.partition_texel_count[partition]; |
| promise(texel_count > 0); |
| |
| for (unsigned int i = 0; i < texel_count; i++) |
| { |
| unsigned int iwt = weights[i]; |
| float weight = texel_weights[iwt]; |
| vfloat4 texel_datum = vfloat2(data_vr[iwt], data_vg[iwt]) * weight; |
| |
| vfloat4 error_weight = vfloat2(error_vr[iwt], error_vg[iwt]); |
| |
| partition_weight += weight; |
| base_sum += texel_datum; |
| error_sum += error_weight; |
| } |
| |
| error_sum = error_sum / texel_count; |
| vfloat4 csf = normalize(sqrt(error_sum)) * 1.41421356f; |
| vfloat4 average = base_sum * (1.0f / astc::max(partition_weight, 1e-7f)); |
| |
| |
| pm[partition].error_weight = error_sum; |
| pm[partition].avg = average * csf; |
| pm[partition].color_scale = csf; |
| pm[partition].icolor_scale = 1.0f / max(csf, 1e-7f); |
| |
| vfloat4 sum_xp = vfloat4::zero(); |
| vfloat4 sum_yp = vfloat4::zero(); |
| |
| for (unsigned int i = 0; i < texel_count; i++) |
| { |
| unsigned int iwt = weights[i]; |
| float weight = texel_weights[iwt]; |
| vfloat4 texel_datum = vfloat2(data_vr[iwt], data_vg[iwt]); |
| texel_datum = (texel_datum - average) * weight; |
| |
| vfloat4 zero = vfloat4::zero(); |
| |
| vmask4 tdm0 = vfloat4(texel_datum.lane<0>()) > zero; |
| sum_xp += select(zero, texel_datum, tdm0); |
| |
| vmask4 tdm1 = vfloat4(texel_datum.lane<1>()) > zero; |
| sum_yp += select(zero, texel_datum, tdm1); |
| } |
| |
| float prod_xp = dot_s(sum_xp, sum_xp); |
| float prod_yp = dot_s(sum_yp, sum_yp); |
| |
| vfloat4 best_vector = sum_xp; |
| float best_sum = prod_xp; |
| |
| if (prod_yp > best_sum) |
| { |
| best_vector = sum_yp; |
| } |
| |
| pm[partition].dir = best_vector; |
| } |
| } |
| |
| /* See header for documentation. */ |
| void compute_error_squared_rgba( |
| const partition_info& pi, |
| const image_block& blk, |
| const error_weight_block& ewb, |
| const processed_line4 uncor_plines[BLOCK_MAX_PARTITIONS], |
| const processed_line4 samec_plines[BLOCK_MAX_PARTITIONS], |
| float uncor_lengths[BLOCK_MAX_PARTITIONS], |
| float samec_lengths[BLOCK_MAX_PARTITIONS], |
| float& uncor_error, |
| float& samec_error |
| ) { |
| unsigned int partition_count = pi.partition_count; |
| promise(partition_count > 0); |
| |
| uncor_error = 0.0f; |
| samec_error = 0.0f; |
| |
| for (unsigned int partition = 0; partition < partition_count; partition++) |
| { |
| const uint8_t *weights = pi.texels_of_partition[partition]; |
| |
| float uncor_loparam = 1e10f; |
| float uncor_hiparam = -1e10f; |
| |
| float samec_loparam = 1e10f; |
| float samec_hiparam = -1e10f; |
| |
| processed_line4 l_uncor = uncor_plines[partition]; |
| processed_line4 l_samec = samec_plines[partition]; |
| |
| unsigned int texel_count = pi.partition_texel_count[partition]; |
| promise(texel_count > 0); |
| |
| // Vectorize some useful scalar inputs |
| vfloat l_uncor_bs0(l_uncor.bs.lane<0>()); |
| vfloat l_uncor_bs1(l_uncor.bs.lane<1>()); |
| vfloat l_uncor_bs2(l_uncor.bs.lane<2>()); |
| vfloat l_uncor_bs3(l_uncor.bs.lane<3>()); |
| |
| vfloat l_uncor_amod0(l_uncor.amod.lane<0>()); |
| vfloat l_uncor_amod1(l_uncor.amod.lane<1>()); |
| vfloat l_uncor_amod2(l_uncor.amod.lane<2>()); |
| vfloat l_uncor_amod3(l_uncor.amod.lane<3>()); |
| |
| vfloat l_uncor_bis0(l_uncor.bis.lane<0>()); |
| vfloat l_uncor_bis1(l_uncor.bis.lane<1>()); |
| vfloat l_uncor_bis2(l_uncor.bis.lane<2>()); |
| vfloat l_uncor_bis3(l_uncor.bis.lane<3>()); |
| |
| vfloat l_samec_bs0(l_samec.bs.lane<0>()); |
| vfloat l_samec_bs1(l_samec.bs.lane<1>()); |
| vfloat l_samec_bs2(l_samec.bs.lane<2>()); |
| vfloat l_samec_bs3(l_samec.bs.lane<3>()); |
| |
| assert(all(l_samec.amod == vfloat4(0.0f))); |
| |
| vfloat l_samec_bis0(l_samec.bis.lane<0>()); |
| vfloat l_samec_bis1(l_samec.bis.lane<1>()); |
| vfloat l_samec_bis2(l_samec.bis.lane<2>()); |
| vfloat l_samec_bis3(l_samec.bis.lane<3>()); |
| |
| vfloat uncor_loparamv(1e10f); |
| vfloat uncor_hiparamv(-1e10f); |
| vfloat4 uncor_errorsumv = vfloat4::zero(); |
| |
| vfloat samec_loparamv(1e10f); |
| vfloat samec_hiparamv(-1e10f); |
| vfloat4 samec_errorsumv = vfloat4::zero(); |
| |
| // This implementation over-shoots, but this is safe as we initialize the weights array |
| // to extend the last value. This means min/max are not impacted, but we need to mask |
| // out the dummy values when we compute the line weighting. |
| vint lane_ids = vint::lane_id(); |
| for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) |
| { |
| vmask mask = lane_ids < vint(texel_count); |
| vint texel_idxs(&(weights[i])); |
| |
| vfloat data_r = gatherf(blk.data_r, texel_idxs); |
| vfloat data_g = gatherf(blk.data_g, texel_idxs); |
| vfloat data_b = gatherf(blk.data_b, texel_idxs); |
| vfloat data_a = gatherf(blk.data_a, texel_idxs); |
| |
| vfloat ew_r = gatherf(ewb.texel_weight_r, texel_idxs); |
| vfloat ew_g = gatherf(ewb.texel_weight_g, texel_idxs); |
| vfloat ew_b = gatherf(ewb.texel_weight_b, texel_idxs); |
| vfloat ew_a = gatherf(ewb.texel_weight_a, texel_idxs); |
| |
| vfloat uncor_param = (data_r * l_uncor_bs0) |
| + (data_g * l_uncor_bs1) |
| + (data_b * l_uncor_bs2) |
| + (data_a * l_uncor_bs3); |
| |
| uncor_loparamv = min(uncor_param, uncor_loparamv); |
| uncor_hiparamv = max(uncor_param, uncor_hiparamv); |
| |
| vfloat uncor_dist0 = (l_uncor_amod0 - data_r) |
| + (uncor_param * l_uncor_bis0); |
| vfloat uncor_dist1 = (l_uncor_amod1 - data_g) |
| + (uncor_param * l_uncor_bis1); |
| vfloat uncor_dist2 = (l_uncor_amod2 - data_b) |
| + (uncor_param * l_uncor_bis2); |
| vfloat uncor_dist3 = (l_uncor_amod3 - data_a) |
| + (uncor_param * l_uncor_bis3); |
| |
| vfloat uncor_err = (ew_r * uncor_dist0 * uncor_dist0) |
| + (ew_g * uncor_dist1 * uncor_dist1) |
| + (ew_b * uncor_dist2 * uncor_dist2) |
| + (ew_a * uncor_dist3 * uncor_dist3); |
| |
| uncor_err = select(vfloat::zero(), uncor_err, mask); |
| haccumulate(uncor_errorsumv, uncor_err); |
| |
| // Process samechroma data |
| vfloat samec_param = (data_r * l_samec_bs0) |
| + (data_g * l_samec_bs1) |
| + (data_b * l_samec_bs2) |
| + (data_a * l_samec_bs3); |
| |
| samec_loparamv = min(samec_param, samec_loparamv); |
| samec_hiparamv = max(samec_param, samec_hiparamv); |
| |
| vfloat samec_dist0 = samec_param * l_samec_bis0 - data_r; |
| vfloat samec_dist1 = samec_param * l_samec_bis1 - data_g; |
| vfloat samec_dist2 = samec_param * l_samec_bis2 - data_b; |
| vfloat samec_dist3 = samec_param * l_samec_bis3 - data_a; |
| |
| vfloat samec_err = (ew_r * samec_dist0 * samec_dist0) |
| + (ew_g * samec_dist1 * samec_dist1) |
| + (ew_b * samec_dist2 * samec_dist2) |
| + (ew_a * samec_dist3 * samec_dist3); |
| |
| samec_err = select(vfloat::zero(), samec_err, mask); |
| haccumulate(samec_errorsumv, samec_err); |
| |
| lane_ids = lane_ids + vint(ASTCENC_SIMD_WIDTH); |
| } |
| |
| uncor_loparam = hmin_s(uncor_loparamv); |
| uncor_hiparam = hmax_s(uncor_hiparamv); |
| |
| samec_loparam = hmin_s(samec_loparamv); |
| samec_hiparam = hmax_s(samec_hiparamv); |
| |
| // Resolve the final scalar accumulator sum |
| haccumulate(uncor_error, uncor_errorsumv); |
| haccumulate(samec_error, samec_errorsumv); |
| |
| float uncor_linelen = uncor_hiparam - uncor_loparam; |
| float samec_linelen = samec_hiparam - samec_loparam; |
| |
| // Turn very small numbers and NaNs into a small number |
| uncor_lengths[partition] = astc::max(uncor_linelen, 1e-7f); |
| samec_lengths[partition] = astc::max(samec_linelen, 1e-7f); |
| } |
| } |
| |
| /* See header for documentation. */ |
| void compute_error_squared_rgb( |
| const partition_info& pi, |
| const image_block& blk, |
| const error_weight_block& ewb, |
| partition_lines3 plines[BLOCK_MAX_PARTITIONS], |
| float& uncor_error, |
| float& samec_error |
| ) { |
| unsigned int partition_count = pi.partition_count; |
| promise(partition_count > 0); |
| |
| uncor_error = 0.0f; |
| samec_error = 0.0f; |
| |
| for (unsigned int partition = 0; partition < partition_count; partition++) |
| { |
| partition_lines3& pl = plines[partition]; |
| const uint8_t *weights = pi.texels_of_partition[partition]; |
| unsigned int texel_count = pi.partition_texel_count[partition]; |
| promise(texel_count > 0); |
| |
| float uncor_loparam = 1e10f; |
| float uncor_hiparam = -1e10f; |
| |
| float samec_loparam = 1e10f; |
| float samec_hiparam = -1e10f; |
| |
| processed_line3 l_uncor = pl.uncor_pline; |
| processed_line3 l_samec = pl.samec_pline; |
| |
| // This implementation is an example vectorization of this function. |
| // It works for - the codec is a 2-4% faster than not vectorizing - but |
| // the benefit is limited by the use of gathers and register pressure |
| |
| // Vectorize some useful scalar inputs |
| vfloat l_uncor_bs0(l_uncor.bs.lane<0>()); |
| vfloat l_uncor_bs1(l_uncor.bs.lane<1>()); |
| vfloat l_uncor_bs2(l_uncor.bs.lane<2>()); |
| |
| vfloat l_uncor_amod0(l_uncor.amod.lane<0>()); |
| vfloat l_uncor_amod1(l_uncor.amod.lane<1>()); |
| vfloat l_uncor_amod2(l_uncor.amod.lane<2>()); |
| |
| vfloat l_uncor_bis0(l_uncor.bis.lane<0>()); |
| vfloat l_uncor_bis1(l_uncor.bis.lane<1>()); |
| vfloat l_uncor_bis2(l_uncor.bis.lane<2>()); |
| |
| vfloat l_samec_bs0(l_samec.bs.lane<0>()); |
| vfloat l_samec_bs1(l_samec.bs.lane<1>()); |
| vfloat l_samec_bs2(l_samec.bs.lane<2>()); |
| |
| assert(all(l_samec.amod == vfloat4(0.0f))); |
| |
| vfloat l_samec_bis0(l_samec.bis.lane<0>()); |
| vfloat l_samec_bis1(l_samec.bis.lane<1>()); |
| vfloat l_samec_bis2(l_samec.bis.lane<2>()); |
| |
| vfloat uncor_loparamv(1e10f); |
| vfloat uncor_hiparamv(-1e10f); |
| vfloat4 uncor_errorsumv = vfloat4::zero(); |
| |
| vfloat samec_loparamv(1e10f); |
| vfloat samec_hiparamv(-1e10f); |
| vfloat4 samec_errorsumv = vfloat4::zero(); |
| |
| // This implementation over-shoots, but this is safe as we initialize the weights array |
| // to extend the last value. This means min/max are not impacted, but we need to mask |
| // out the dummy values when we compute the line weighting. |
| vint lane_ids = vint::lane_id(); |
| for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) |
| { |
| vmask mask = lane_ids < vint(texel_count); |
| vint texel_idxs(&(weights[i])); |
| |
| vfloat data_r = gatherf(blk.data_r, texel_idxs); |
| vfloat data_g = gatherf(blk.data_g, texel_idxs); |
| vfloat data_b = gatherf(blk.data_b, texel_idxs); |
| |
| vfloat ew_r = gatherf(ewb.texel_weight_r, texel_idxs); |
| vfloat ew_g = gatherf(ewb.texel_weight_g, texel_idxs); |
| vfloat ew_b = gatherf(ewb.texel_weight_b, texel_idxs); |
| |
| vfloat uncor_param = (data_r * l_uncor_bs0) |
| + (data_g * l_uncor_bs1) |
| + (data_b * l_uncor_bs2); |
| |
| uncor_loparamv = min(uncor_param, uncor_loparamv); |
| uncor_hiparamv = max(uncor_param, uncor_hiparamv); |
| |
| vfloat uncor_dist0 = (l_uncor_amod0 - data_r) |
| + (uncor_param * l_uncor_bis0); |
| vfloat uncor_dist1 = (l_uncor_amod1 - data_g) |
| + (uncor_param * l_uncor_bis1); |
| vfloat uncor_dist2 = (l_uncor_amod2 - data_b) |
| + (uncor_param * l_uncor_bis2); |
| |
| vfloat uncor_err = (ew_r * uncor_dist0 * uncor_dist0) |
| + (ew_g * uncor_dist1 * uncor_dist1) |
| + (ew_b * uncor_dist2 * uncor_dist2); |
| |
| uncor_err = select(vfloat::zero(), uncor_err, mask); |
| haccumulate(uncor_errorsumv, uncor_err); |
| |
| // Process samechroma data |
| vfloat samec_param = (data_r * l_samec_bs0) |
| + (data_g * l_samec_bs1) |
| + (data_b * l_samec_bs2); |
| |
| samec_loparamv = min(samec_param, samec_loparamv); |
| samec_hiparamv = max(samec_param, samec_hiparamv); |
| |
| |
| vfloat samec_dist0 = samec_param * l_samec_bis0 - data_r; |
| vfloat samec_dist1 = samec_param * l_samec_bis1 - data_g; |
| vfloat samec_dist2 = samec_param * l_samec_bis2 - data_b; |
| |
| vfloat samec_err = (ew_r * samec_dist0 * samec_dist0) |
| + (ew_g * samec_dist1 * samec_dist1) |
| + (ew_b * samec_dist2 * samec_dist2); |
| |
| samec_err = select(vfloat::zero(), samec_err, mask); |
| haccumulate(samec_errorsumv, samec_err); |
| |
| lane_ids = lane_ids + vint(ASTCENC_SIMD_WIDTH); |
| } |
| |
| uncor_loparam = hmin_s(uncor_loparamv); |
| uncor_hiparam = hmax_s(uncor_hiparamv); |
| |
| samec_loparam = hmin_s(samec_loparamv); |
| samec_hiparam = hmax_s(samec_hiparamv); |
| |
| // Resolve the final scalar accumulator sum |
| haccumulate(uncor_error, uncor_errorsumv); |
| haccumulate(samec_error, samec_errorsumv); |
| |
| float uncor_linelen = uncor_hiparam - uncor_loparam; |
| float samec_linelen = samec_hiparam - samec_loparam; |
| |
| // Turn very small numbers and NaNs into a small number |
| pl.uncor_line_len = astc::max(uncor_linelen, 1e-7f); |
| pl.samec_line_len = astc::max(samec_linelen, 1e-7f); |
| } |
| } |
| |
| #endif |