#include "vec.h"

/*
 * GNU-assembler macros shared by the cross-product kernels in this file.
 *
 * The macros are emitted into the assembly stream once, by the asm statement
 * in CrossProdAOS_.  CrossProdSOA_ uses them without re-emitting them (an
 * assembler macro may only be defined once), so it depends on CrossProdAOS_'s
 * asm appearing earlier in the same translation unit's assembly output.
 *
 * All macros use fixed registers; see the comment in front of each one.
 * FMSUB Sd, Sn, Sm, Sa computes Sd = Sa - Sn * Sm, and FMLS Vd, Vn, Vm
 * computes Vd = Vd - Vn * Vm, so every component is "first product minus
 * second product".
 */
#define ASM_MACROS \
    "\n" \
    "// c = cross(a, b), one vector, scalar registers\n" \
    "//   a: s0, s1, s2\n" \
    "//   b: s3, s4, s5\n" \
    "//   c: s19, s20, s21\n" \
    ".macro VecCp\n" \
    "    fmul    s19, s1, s5             // s19 = ay * bz\n" \
    "    fmsub   s19, s2, s4, s19        // s19 = s19 - az * by\n" \
    "    fmul    s20, s2, s3             // s20 = az * bx\n" \
    "    fmsub   s20, s0, s5, s20        // s20 = s20 - ax * bz\n" \
    "    fmul    s21, s0, s4             // s21 = ax * by\n" \
    "    fmsub   s21, s1, s3, s21        // s21 = s21 - ay * bx\n" \
    ".endm\n" \
    "\n" \
    "// c[0-3] = cross(a[0-3], b[0-3]), four vectors per call (SIMD lanes)\n" \
    "//   a: v0, v1, v2\n" \
    "//   b: v3, v4, v5\n" \
    "//   c: v19, v20, v21\n" \
    ".macro VecCp4\n" \
    "    fmul    v19.4s, v1.4s, v5.4s    // v19  = ay * bz\n" \
    "    fmls    v19.4s, v2.4s, v4.4s    // v19 -= az * by\n" \
    "    fmul    v20.4s, v2.4s, v3.4s    // v20  = az * bx\n" \
    "    fmls    v20.4s, v0.4s, v5.4s    // v20 -= ax * bz\n" \
    "    fmul    v21.4s, v0.4s, v4.4s    // v21  = ax * by\n" \
    "    fmls    v21.4s, v1.4s, v3.4s    // v21 -= ay * bx\n" \
    ".endm\n" \
    "\n" \
    "// Four cross products on array-of-structures data\n" \
    "//   a: ( [x1], [x1+4], [x1+8] ) * 4\n" \
    "//   b: ( [x2], [x2+4], [x2+8] ) * 4\n" \
    "//   c: ( [x0], [x0+4], [x0+8] ) * 4\n" \
    "// x0, x1 and x2 are each advanced by 48 bytes.\n" \
    ".macro VecCp4AOS\n" \
    "    ld3     {v0.4s, v1.4s, v2.4s}, [x1], 48     // de-interleave (ax,ay,az) into (v0,v1,v2); x1 += 48\n" \
    "    ld3     {v3.4s, v4.4s, v5.4s}, [x2], 48     // de-interleave (bx,by,bz) into (v3,v4,v5); x2 += 48\n" \
    "    VecCp4                                      // c = cross(a, b)\n" \
    "    st3     {v19.4s, v20.4s, v21.4s}, [x0], 48  // interleave (cx,cy,cz) from (v19,v20,v21); x0 += 48\n" \
    ".endm\n" \
    "\n" \
    "// Four cross products on structure-of-arrays data\n" \
    "//   ax: [x7]   ay: [x8]   az: [x9]\n" \
    "//   bx: [x10]  by: [x11]  bz: [x12]\n" \
    "//   cx: [x13]  cy: [x14]  cz: [x15]\n" \
    "// Every pointer register is advanced by 16 bytes.\n" \
    ".macro VecCp4SOA\n" \
    "    ld1     {v0.4s}, [x7], 16       // v0: ax\n" \
    "    ld1     {v1.4s}, [x8], 16       // v1: ay\n" \
    "    ld1     {v2.4s}, [x9], 16       // v2: az\n" \
    "    ld1     {v3.4s}, [x10], 16      // v3: bx\n" \
    "    ld1     {v4.4s}, [x11], 16      // v4: by\n" \
    "    ld1     {v5.4s}, [x12], 16      // v5: bz\n" \
    "    VecCp4                          // c = cross(a, b)\n" \
    "    st1     {v19.4s}, [x13], 16     // [x13]: cx\n" \
    "    st1     {v20.4s}, [x14], 16     // [x14]: cy\n" \
    "    st1     {v21.4s}, [x15], 16     // [x15]: cz\n" \
    ".endm\n" \
    "\n"

/*
 * c[i] = cross(a[i], b[i]) for i in [0, n), array-of-structures layout.
 *
 * The asm addresses each element as three consecutive 32-bit floats with a
 * 12-byte stride (NOTE(review): confirm that this matches Vector in vec.h).
 * Blocks of 16 elements are processed with NEON, four at a time; the
 * remaining n % 16 elements are processed one by one with scalar code.
 * Nothing is read or written when n <= 0.
 *
 * The asm takes its arguments directly from the AAPCS64 argument registers
 * (c = x0, a = x1, b = x2, n = w3) instead of through asm operands, so the
 * function must be entered through a real call: hence noinline.  Only w3 is
 * used for the count because the upper half of x3 is unspecified for an int
 * argument.
 */
__attribute__((noinline))
void CrossProdAOS_(Vector* c, const Vector* a, const Vector* b, int n)
{
    __asm volatile (
        ASM_MACROS
        "    cmp     w3, 0                   // if n <= 0\n"
        "    b.le    LDoneA                  //   nothing to do\n"
        "    cmp     w3, 16                  // if n < 16\n"
        "    b.lo    LSkipLoop1A             //   goto SkipLoop1A\n"
        "\n"
        "LLoop1A:                            // 16 elements per iteration\n"
        "    VecCp4AOS\n"
        "    VecCp4AOS\n"
        "    VecCp4AOS\n"
        "    VecCp4AOS\n"
        "    sub     w3, w3, 16              // n -= 16\n"
        "    cmp     w3, 16                  // if n >= 16\n"
        "    b.hs    LLoop1A                 //   goto Loop1A\n"
        "LSkipLoop1A:\n"
        "    cbz     w3, LDoneA              // no remainder: done\n"
        "LLoop2A:                            // one element per iteration\n"
        "    ldp     s0, s1, [x1], 8         // (s0, s1) = (ax, ay); x1 += 8\n"
        "    ldr     s2, [x1], 4             // s2 = az; x1 += 4\n"
        "    ldp     s3, s4, [x2], 8         // (s3, s4) = (bx, by); x2 += 8\n"
        "    ldr     s5, [x2], 4             // s5 = bz; x2 += 4\n"
        "    VecCp\n"
        "    stp     s19, s20, [x0], 8       // [x0] = (cx, cy); x0 += 8\n"
        "    str     s21, [x0], 4            // [x0] = cz; x0 += 4\n"
        "    subs    w3, w3, 1               // if --n != 0\n"
        "    b.ne    LLoop2A                 //   goto Loop2A\n"
        "LDoneA:\n"
        :
        :
        : "v0", "v1", "v2", "v3", "v4", "v5", "v19", "v20", "v21",
          "x0", "x1", "x2", "x3",
          "cc",      /* cmp / subs modify the condition flags */
          "memory"   /* loads from a and b, stores to c */
    );
}

/*
 * c[i] = cross(a[i], b[i]) for i in [0, n), structure-of-arrays layout.
 *
 * The asm reads each VectorSoA as three consecutive 64-bit pointers, to the
 * x, y and z component arrays in that order (NOTE(review): confirm against
 * the definition in vec.h).  Blocks of 16 elements are processed with NEON,
 * four at a time; the remaining n % 16 elements are processed one by one
 * with scalar code.  Nothing is read or written when n <= 0.
 *
 * Like CrossProdAOS_, the asm takes its arguments directly from the AAPCS64
 * argument registers (c = x0, a = x1, b = x2, n = w3), so the function must
 * not be inlined.  It also relies on the assembler macros from ASM_MACROS
 * having been emitted by CrossProdAOS_ earlier in this translation unit.
 */
__attribute__((noinline))
void CrossProdSOA_(VectorSoA& c, const VectorSoA& a, const VectorSoA& b, int n)
{
    __asm volatile (
        "\n"
        "    cmp     w3, 0                   // if n <= 0\n"
        "    b.le    LDoneB                  //   nothing to do\n"
        "\n"
        "    ldp     x7, x8, [x1], 16        // (x7, x8) = address of (ax, ay)\n"
        "    ldr     x9, [x1]                // x9 = address of az\n"
        "    ldp     x10, x11, [x2], 16      // (x10, x11) = address of (bx, by)\n"
        "    ldr     x12, [x2]               // x12 = address of bz\n"
        "    ldp     x13, x14, [x0], 16      // (x13, x14) = address of (cx, cy)\n"
        "    ldr     x15, [x0]               // x15 = address of cz\n"
        "\n"
        "    cmp     w3, 16                  // if n < 16\n"
        "    b.lo    LSkipLoop1B             //   goto SkipLoop1B\n"
        "\n"
        "LLoop1B:                            // 16 elements per iteration\n"
        "    VecCp4SOA\n"
        "    VecCp4SOA\n"
        "    VecCp4SOA\n"
        "    VecCp4SOA\n"
        "    sub     w3, w3, 16              // n -= 16\n"
        "    cmp     w3, 16                  // if n >= 16\n"
        "    b.hs    LLoop1B                 //   goto Loop1B\n"
        "LSkipLoop1B:\n"
        "    cbz     w3, LDoneB              // no remainder: done\n"
        "LLoop2B:                            // one element per iteration\n"
        "    ldr     s0, [x7], 4             // s0 = ax; x7 += 4\n"
        "    ldr     s1, [x8], 4             // s1 = ay; x8 += 4\n"
        "    ldr     s2, [x9], 4             // s2 = az; x9 += 4\n"
        "    ldr     s3, [x10], 4            // s3 = bx; x10 += 4\n"
        "    ldr     s4, [x11], 4            // s4 = by; x11 += 4\n"
        "    ldr     s5, [x12], 4            // s5 = bz; x12 += 4\n"
        "    VecCp\n"
        "    str     s19, [x13], 4           // [x13] = cx; x13 += 4\n"
        "    str     s20, [x14], 4           // [x14] = cy; x14 += 4\n"
        "    str     s21, [x15], 4           // [x15] = cz; x15 += 4\n"
        "    subs    w3, w3, 1               // if --n != 0\n"
        "    b.ne    LLoop2B                 //   goto Loop2B\n"
        "LDoneB:\n"
        :
        :
        : "v0", "v1", "v2", "v3", "v4", "v5", "v19", "v20", "v21",
          "x0", "x1", "x2", "x3",
          "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15",
          "cc",      /* cmp / subs modify the condition flags */
          "memory"   /* loads from a and b, stores to c */
    );
}