// blob: 74d5d59fc99326b3c9e14e5282cee77737ce06ea
// Transpose a 4x4 matrix
// Author: Peter Jensen
(function () {
// Kernel configuration: the descriptor handed to the Benchmark constructor.
// It names the benchmark, wires up the init/cleanup verification hooks and
// the two kernels being compared, and carries the iteration count.
var kernelConfig = {
  kernelName: "Transpose4x4",
  kernelInit: init,
  kernelCleanup: cleanup,
  kernelSimd: simdTransposeN,
  kernelNonSimd: transposeN,
  // 100 million; presumably the harness passes this as `n` to the kernels
  // -- confirm against the harness implementation.
  kernelIterations: 100 * 1000 * 1000
};

// Hook up to the harness
var transposeBenchmark = new Benchmark(kernelConfig);
benchmarks.add(transposeBenchmark);
// Global object allocations (shared by every function in this file).
// All matrices are 4x4, stored as 16 consecutive floats, row after row.
var MATRIX_ELEMENTS = 4 * 4;
// Input matrix read by the kernels.
var src = new Float32Array(MATRIX_ELEMENTS);
// Output matrix written by the kernels.
var dst = new Float32Array(MATRIX_ELEMENTS);
// Expected transpose of src, filled in by initMatrix and used for verification.
var tsrc = new Float32Array(MATRIX_ELEMENTS);
// Lane mask (true, true, false, false) used by the select-based kernel.
var sel_ttff = SIMD.Bool32x4(true, true, false, false);
// Fills `matrix` with the values 0..15 in row-major order and writes the
// same values, transposed, into `matrixTransposed`.
//   matrix           - 16-element array-like receiving element i = i
//   matrixTransposed - 16-element array-like receiving the transpose
function initMatrix(matrix, matrixTransposed) {
  // Walk the 16 cells in row-major order; the cell index doubles as its value.
  for (var cell = 0; cell < 16; ++cell) {
    var row = cell >> 2;
    var col = cell & 3;
    matrix[cell] = cell;
    // Element (row, col) of the source sits at (col, row) in the transpose.
    matrixTransposed[col * 4 + row] = cell;
  }
}
// Debug helper: prints a 4x4 row-major matrix, one row per print() call.
// Every element is formatted with two decimals and preceded by one space.
//   matrix - 16-element array-like of numbers
function printMatrix(matrix) {
  var row = 0;
  while (row < 4) {
    var base = row * 4;
    var line = "";
    for (var col = 0; col < 4; ++col) {
      line = line + " " + matrix[base + col].toFixed(2);
    }
    print(line);
    row += 1;
  }
}
// Returns true when the first 16 elements of m1 and m2 are strictly equal
// (===) pairwise, false otherwise. Comparison stops at the first mismatch.
// Because strict equality is used, a NaN element never compares equal.
function compareEqualMatrix(m1, m2) {
  var index = 0;
  while (index < 16 && m1[index] === m2[index]) {
    index += 1;
  }
  // Reaching 16 means no mismatch was found.
  return index === 16;
}
// Kernel Initializer
// Fills src with test data (and tsrc with its expected transpose), then runs
// one pass of the scalar kernel and one pass of the SIMD kernel, checking
// each result against tsrc.
// Returns true only when both kernels produce the expected transpose; the
// SIMD kernel is not run if the scalar kernel already failed.
function init () {
  // Runs a single pass of `kernel` and reports whether dst matches tsrc.
  // dst is poisoned with NaN first: NaN never compares strictly equal, so
  // any element the kernel fails to write is guaranteed to be reported.
  // Without this, a result left in dst by a previously verified kernel could
  // make a kernel that writes nothing appear correct.
  function kernelTransposes(kernel) {
    for (var i = 0; i < 16; ++i) {
      dst[i] = NaN;
    }
    kernel(1);
    // For debugging, printMatrix(dst) can be called here to dump the result.
    return compareEqualMatrix(tsrc, dst);
  }

  initMatrix(src, tsrc);
  return kernelTransposes(transposeN) && kernelTransposes(simdTransposeN);
}
// Kernel Cleanup
// Re-runs the same verification as init() and hands back its result
// unchanged.
function cleanup () {
  var verified = init();
  return verified;
}
// SIMD version of the kernel with SIMD.Float32x4.shuffle operation
// Single-pass transpose of src into dst using two rounds of shuffles.
// In the SIMD.js API, shuffle(a, b, ...) picks lanes 0-3 from `a` and
// lanes 4-7 from `b`.
// Not referenced by kernelConfig in this file.
function simdTransposeMix() {
  var f32x4 = SIMD.Float32x4;

  // The four rows of the source matrix.
  var row0 = f32x4.load(src, 0);
  var row1 = f32x4.load(src, 4);
  var row2 = f32x4.load(src, 8);
  var row3 = f32x4.load(src, 12);

  // First two elements of each row, paired up, give output rows 0 and 1.
  var lo01 = f32x4.shuffle(row0, row1, 0, 1, 4, 5);
  var lo23 = f32x4.shuffle(row2, row3, 0, 1, 4, 5);
  var col0 = f32x4.shuffle(lo01, lo23, 0, 2, 4, 6);
  var col1 = f32x4.shuffle(lo01, lo23, 1, 3, 5, 7);

  // Last two elements of each row, paired up, give output rows 2 and 3.
  var hi01 = f32x4.shuffle(row0, row1, 2, 3, 6, 7);
  var hi23 = f32x4.shuffle(row2, row3, 2, 3, 6, 7);
  var col2 = f32x4.shuffle(hi01, hi23, 0, 2, 4, 6);
  var col3 = f32x4.shuffle(hi01, hi23, 1, 3, 5, 7);

  f32x4.store(dst, 0, col0);
  f32x4.store(dst, 4, col1);
  f32x4.store(dst, 8, col2);
  f32x4.store(dst, 12, col3);
}
// SIMD version of the kernel
// Single-pass transpose of src into dst built from select + swizzle.
// sel_ttff keeps lanes 0-1 of select's first value operand and lanes 2-3 of
// its second, so only those lanes of each swizzle matter; the remaining
// swizzle indices are fillers.
// Not referenced by kernelConfig in this file.
function simdTranspose() {
  var f32x4 = SIMD.Float32x4;

  // The four rows of the source matrix.
  var row0 = f32x4.load(src, 0);
  var row1 = f32x4.load(src, 4);
  var row2 = f32x4.load(src, 8);
  var row3 = f32x4.load(src, 12);

  // lo01 = (row0[0], row0[1], row1[0], row1[1]); lo23 likewise for rows 2/3.
  var lo01 = f32x4.select(sel_ttff, row0, f32x4.swizzle(row1, 0, 0, 0, 1));
  var lo23 = f32x4.select(sel_ttff, row2, f32x4.swizzle(row3, 0, 0, 0, 1));
  // Even lanes of the pairs form output row 0, odd lanes output row 1.
  var col0 = f32x4.select(
    sel_ttff,
    f32x4.swizzle(lo01, 0, 2, 0, 0),
    f32x4.swizzle(lo23, 0, 0, 0, 2));
  var col1 = f32x4.select(
    sel_ttff,
    f32x4.swizzle(lo01, 1, 3, 0, 0),
    f32x4.swizzle(lo23, 0, 0, 1, 3));

  // hi01 = (row0[2], row0[3], row1[2], row1[3]); hi23 likewise for rows 2/3.
  var hi01 = f32x4.select(sel_ttff, f32x4.swizzle(row0, 2, 3, 0, 0), row1);
  var hi23 = f32x4.select(sel_ttff, f32x4.swizzle(row2, 2, 3, 0, 0), row3);
  // Even lanes of the pairs form output row 2, odd lanes output row 3.
  var col2 = f32x4.select(
    sel_ttff,
    f32x4.swizzle(hi01, 0, 2, 0, 0),
    f32x4.swizzle(hi23, 0, 0, 0, 2));
  var col3 = f32x4.select(
    sel_ttff,
    f32x4.swizzle(hi01, 1, 3, 0, 0),
    f32x4.swizzle(hi23, 0, 0, 1, 3));

  f32x4.store(dst, 0, col0);
  f32x4.store(dst, 4, col1);
  f32x4.store(dst, 8, col2);
  f32x4.store(dst, 12, col3);
}
// Non SIMD version of the kernel
// Single-pass scalar transpose: element (col, row) of src is copied to
// element (row, col) of dst, with dst written in ascending index order.
// Not referenced by kernelConfig in this file.
function transpose() {
  for (var row = 0; row < 4; ++row) {
    for (var col = 0; col < 4; ++col) {
      dst[row * 4 + col] = src[col * 4 + row];
    }
  }
}
// SIMD benchmark kernel (registered as kernelSimd): transposes the 4x4
// matrix in src into dst, repeated n times. Each pass reloads src and
// rewrites all of dst, so every pass leaves the same result.
// In the SIMD.js API, shuffle(a, b, ...) picks lanes 0-3 from `a` and
// lanes 4-7 from `b`.
//   n - number of passes to run; nothing is written when n < 1
function simdTransposeN(n) {
  for (var pass = 0; pass < n; ++pass) {
    // The four rows of the source matrix.
    var row0 = SIMD.Float32x4.load(src, 0);
    var row1 = SIMD.Float32x4.load(src, 4);
    var row2 = SIMD.Float32x4.load(src, 8);
    var row3 = SIMD.Float32x4.load(src, 12);

    // First two elements of each row, paired up, give output rows 0 and 1.
    var lo01 = SIMD.Float32x4.shuffle(row0, row1, 0, 1, 4, 5);
    var lo23 = SIMD.Float32x4.shuffle(row2, row3, 0, 1, 4, 5);
    var col0 = SIMD.Float32x4.shuffle(lo01, lo23, 0, 2, 4, 6);
    var col1 = SIMD.Float32x4.shuffle(lo01, lo23, 1, 3, 5, 7);

    // Last two elements of each row, paired up, give output rows 2 and 3.
    var hi01 = SIMD.Float32x4.shuffle(row0, row1, 2, 3, 6, 7);
    var hi23 = SIMD.Float32x4.shuffle(row2, row3, 2, 3, 6, 7);
    var col2 = SIMD.Float32x4.shuffle(hi01, hi23, 0, 2, 4, 6);
    var col3 = SIMD.Float32x4.shuffle(hi01, hi23, 1, 3, 5, 7);

    SIMD.Float32x4.store(dst, 0, col0);
    SIMD.Float32x4.store(dst, 4, col1);
    SIMD.Float32x4.store(dst, 8, col2);
    SIMD.Float32x4.store(dst, 12, col3);
  }
}
// Scalar benchmark kernel (registered as kernelNonSimd): transposes the 4x4
// row-major matrix in src into dst, repeated n times. Every pass rewrites
// all 16 elements of dst from src, so each pass leaves the same result.
// The 16 stores are written out explicitly rather than looped.
//   n - number of passes to run; nothing is written when n < 1
function transposeN(n) {
for (var i = 0; i < n; ++i) {
// Output row 0 <- source column 0
dst[0] = src[0];
dst[1] = src[4];
dst[2] = src[8];
dst[3] = src[12];
// Output row 1 <- source column 1
dst[4] = src[1];
dst[5] = src[5];
dst[6] = src[9];
dst[7] = src[13];
// Output row 2 <- source column 2
dst[8] = src[2];
dst[9] = src[6];
dst[10] = src[10];
dst[11] = src[14];
// Output row 3 <- source column 3
dst[12] = src[3];
dst[13] = src[7];
dst[14] = src[11];
dst[15] = src[15];
}
}
} ());