//#include //#include #include "Vec128.h" void PackedMathF32_(Vec128 x[9], const Vec128& a, const Vec128& b) { __asm volatile("\n\ ld1 {v0.4s},[x1] // v0 = a \n\ ld1 {v1.4s},[x2] // v1 = b \n\ fadd v2.4s,v0.4s,v1.4s // v2 = a + b \n\ st1 {v2.4s},[x0],16 // save result to x[0] \n\ fsub v2.4s,v0.4s,v1.4s // v2 = a - b \n\ st1 {v2.4s},[x0],16 // save result to x[1] \n\ fmul v2.4s,v0.4s,v1.4s // v2 = a * b \n\ st1 {v2.4s},[x0],16 // save result to x[2] \n\ fdiv v2.4s,v0.4s,v1.4s // v2 = a / b \n\ st1 {v2.4s},[x0],16 // save result to x[3] \n\ fabs v2.4s,v0.4s // v2 = abs(a) \n\ st1 {v2.4s},[x0],16 // save result to x[4] \n\ fneg v2.4s,v1.4s // v2 = -b \n\ st1 {v2.4s},[x0],16 // save result to x[5] \n\ fminnm v2.4s,v0.4s,v1.4s // v2 = min(a, b) \n\ st1 {v2.4s},[x0],16 // save result to x[6] \n\ fmaxnm v2.4s,v0.4s,v1.4s // v2 = max(a, b) \n\ st1 {v2.4s},[x0],16 // save result to x[7] \n\ fsqrt v2.4s,v0.4s // v2 = sqrt(a) \n\ st1 {v2.4s},[x0],16 // save result to x[8] \n\ //ret \n\ \n" : : : "v0", "v1", "v2" ); } void PackedMathF64_(Vec128 x[9], const Vec128& a, const Vec128& b){ __asm volatile("\n\ ld1 {v0.2d},[x1] // v0 = a \n\ ld1 {v1.2d},[x2] // v1 = b \n\ fadd v2.2d,v0.2d,v1.2d // v2 = a + b \n\ st1 {v2.2d},[x0],16 // save result to x[0] \n\ fsub v2.2d,v0.2d,v1.2d // v2 = a - b \n\ st1 {v2.2d},[x0],16 // save result to x[1] \n\ fmul v2.2d,v0.2d,v1.2d // v2 = a * b \n\ st1 {v2.2d},[x0],16 // save result to x[2] \n\ fdiv v2.2d,v0.2d,v1.2d // v2 = a / b \n\ st1 {v2.2d},[x0],16 // save result to x[3] \n\ fabs v2.2d,v0.2d // v2 = abs(a) \n\ st1 {v2.2d},[x0],16 // save result to x[4] \n\ fneg v2.2d,v1.2d // v2 = -b \n\ st1 {v2.2d},[x0],16 // save result to x[5] \n\ fminnm v2.2d,v0.2d,v1.2d // v2 = min(a, b) \n\ st1 {v2.2d},[x0],16 // save result to x[6] \n\ fmaxnm v2.2d,v0.2d,v1.2d // v2 = max(a, b) \n\ st1 {v2.2d},[x0],16 // save result to x[7] \n\ fsqrt v2.2d,v0.2d // v2 = sqrt(a) \n\ st1 {v2.2d},[x0],16 // save result to x[8] \n\ //ret \n\ \n" : : : "v0", "v1", "v2" ); }