#include "Vec128.h"

/*
 * Transpose a 4x4 matrix of 32-bit floats using AArch64 NEON.
 *
 * m_des  - destination; receives 16 floats (64 bytes).
 * m_src1 - source; 16 floats (64 bytes) are read from it.
 *
 * All four source vectors are loaded before anything is stored, so
 * m_des may be the same pointer as m_src1 (in-place transpose).
 *
 * The pointers are handed to the assembler as named input operands rather
 * than assuming they still live in x0/x1: that assumption only holds when
 * the function is called out-of-line with nothing scheduled before the asm,
 * and silently breaks under inlining. The "memory" clobber tells the
 * compiler that the asm reads and writes memory it cannot see through the
 * operand list, so surrounding accesses to the matrices are not reordered
 * or cached across it.
 */
void Mat4x4TransposeF32_(float* m_des, const float* m_src1)
{
    /* Lane comments name the source vectors a, b, c, d (v0..v3 after the load). */
    __asm volatile (
        "ld1   {v0.4s-v3.4s}, [%[src]]   \n\t"
        "trn1  v4.4s, v0.4s, v1.4s       \n\t"  /* a0 b0 a2 b2 */
        "trn2  v5.4s, v0.4s, v1.4s       \n\t"  /* a1 b1 a3 b3 */
        "trn1  v6.4s, v2.4s, v3.4s       \n\t"  /* c0 d0 c2 d2 */
        "trn2  v7.4s, v2.4s, v3.4s       \n\t"  /* c1 d1 c3 d3 */
        "trn1  v0.2d, v4.2d, v6.2d       \n\t"  /* a0 b0 c0 d0 */
        "trn1  v1.2d, v5.2d, v7.2d       \n\t"  /* a1 b1 c1 d1 */
        "trn2  v2.2d, v4.2d, v6.2d       \n\t"  /* a2 b2 c2 d2 */
        "trn2  v3.2d, v5.2d, v7.2d       \n\t"  /* a3 b3 c3 d3 */
        "st1   {v0.4s-v3.4s}, [%[dst]]   \n\t"
        : /* no register outputs; the result is written through dst */
        : [dst] "r" (m_des), [src] "r" (m_src1)
        : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
    );
}