#include "mat.h"

/*
 * One 4x4 * 4x1 product, expressed as an assembler string fragment:
 *   v4  = *src, src += 16 bytes
 *   v5  = v0 * v4[0] + v1 * v4[1] + v2 * v4[2] + v3 * v4[3]
 *   *dst = v5,  dst += 16 bytes
 * The first term is a plain multiply and the remaining three are fused
 * multiply-adds.
 *
 * A C preprocessor macro is used instead of an assembler `.macro` so that
 * nothing is defined in the assembler's global namespace; the asm block can
 * therefore be emitted more than once in a translation unit (e.g. after
 * inlining) without a redefinition error.
 */
#define MAT4X4_MUL_VEC                                  \
    "ld1   {v4.4s}, [%[src]], #16          \n\t"        \
    "fmul  v5.4s, v0.4s, v4.s[0]           \n\t"        \
    "fmla  v5.4s, v1.4s, v4.s[1]           \n\t"        \
    "fmla  v5.4s, v2.4s, v4.s[2]           \n\t"        \
    "fmla  v5.4s, v3.4s, v4.s[3]           \n\t"        \
    "st1   {v5.4s}, [%[dst]], #16          \n\t"

/*
 * Multiply a batch of 4-element float vectors by one 4x4 float matrix using
 * AArch64 NEON:  b[k] = M * a[k]  for k = 0 .. n-1.
 *
 * Parameters:
 *   b  Destination; 16 bytes are stored per vector, n vectors in total.
 *   m  Matrix, read once as 16 consecutive floats. `ld4` de-interleaves them
 *      so that register vJ holds { m[0][J], m[1][J], m[2][J], m[3][J] };
 *      the result is therefore b[k][i] = sum over j of m[i][j] * a[k][j].
 *      NOTE(review): the original source carried the remark "m must be
 *      transposed"; confirm the intended layout against the callers.
 *   a  Source; 16 bytes are loaded per vector. The address must be 16-byte
 *      aligned.
 *   n  Number of vectors. Must be positive and a multiple of 4, because the
 *      loop body is unrolled four times.
 *
 * The function returns nothing. If n <= 0, n is not a multiple of 4, or `a`
 * is not 16-byte aligned, the call is a no-op and `b` is left untouched.
 *
 * Assumes Vec4x1F32 is four packed floats (16 bytes) -- TODO confirm in mat.h.
 */
void MatVecMulF32_(Vec4x1F32* b, float m[4][4], Vec4x1F32* a, int n) {
    __asm volatile(
        /* Argument validation. `n` is an int, so only its 32-bit view (%w)
           is inspected; the upper half of the register is not defined. */
        "cmp   %w[cnt], #0                     \n\t"
        "b.le  2f                              \n\t"  /* n <= 0            */
        "tst   %w[cnt], #0x3                   \n\t"
        "b.ne  2f                              \n\t"  /* n % 4 != 0        */
        "tst   %[src], #0xf                    \n\t"
        "b.ne  2f                              \n\t"  /* a not 16B aligned */

        /* Load the matrix once; v0..v3 stay live for the whole loop. */
        "ld4   {v0.4s-v3.4s}, [%[mat]]         \n\t"

        /* Main loop: four vectors per iteration. */
        "1:                                    \n\t"
        MAT4X4_MUL_VEC
        MAT4X4_MUL_VEC
        MAT4X4_MUL_VEC
        MAT4X4_MUL_VEC
        "subs  %w[cnt], %w[cnt], #4            \n\t"
        "b.ne  1b                              \n\t"

        "2:                                    \n\t"
        /* dst, src and cnt are modified by the asm, hence read-write. */
        : [dst] "+r"(b), [src] "+r"(a), [cnt] "+r"(n)
        : [mat] "r"(m)
        /* "cc": tst/cmp/subs write the flags.
           "memory": the asm reads *a and *m and writes *b through pointers
           the compiler cannot see into. */
        : "v0", "v1", "v2", "v3", "v4", "v5", "cc", "memory");
}

#undef MAT4X4_MUL_VEC