;-----------------------------------------------------------------------
; Cross-product kernels, AArch64 / Apple Mach-O (arm64 macOS).
; Syntax: Apple/LLVM assembler; a semicolon starts a comment.
; ABI:    AAPCS64 as used on Apple platforms. Both functions are leaf
;         functions, use no stack, and touch only caller-saved registers
;         (x0-x3, x7-x15, v0-v5, v19-v21, flags).
;
; The element count is a 32-bit int (the mangled names end in "i"), so it
; lives in w3. The upper half of x3 is unspecified by the ABI and must not
; be read; all count arithmetic below is therefore 32-bit and signed, and
; a count <= 0 performs no work.
;-----------------------------------------------------------------------
        .section        __TEXT,__text,regular,pure_instructions
        .build_version macos, 11, 0     sdk_version 11, 3

;-----------------------------------------------------------------------
; CROSS4_AOS: c[0..3] = cross(a[0..3], b[0..3]) on interleaved triples.
;   In/out: x1 = a, x2 = b, x0 = c; each pointer is advanced by 48 bytes
;           (4 elements of 3 floats).
;   Clobb:  v0-v5, v19-v21
;-----------------------------------------------------------------------
        .macro  CROSS4_AOS
        ld3.4s  { v0, v1, v2 }, [x1], #48       ; de-interleave a: v0 = ax, v1 = ay, v2 = az
        ld3.4s  { v3, v4, v5 }, [x2], #48       ; de-interleave b: v3 = bx, v4 = by, v5 = bz
        fmul.4s v19, v1, v5                     ; cx  = ay * bz
        fmls.4s v19, v2, v4                     ; cx -= az * by
        fmul.4s v20, v2, v3                     ; cy  = az * bx
        fmls.4s v20, v0, v5                     ; cy -= ax * bz
        fmul.4s v21, v0, v4                     ; cz  = ax * by
        fmls.4s v21, v1, v3                     ; cz -= ay * bx
        st3.4s  { v19, v20, v21 }, [x0], #48    ; re-interleave and store c
        .endmacro

;-----------------------------------------------------------------------
; CROSS4_SOA: same arithmetic as CROSS4_AOS on six separate float streams.
;   In/out: x7/x8/x9 = ax/ay/az, x10/x11/x12 = bx/by/bz,
;           x13/x14/x15 = cx/cy/cz; each pointer is advanced by 16 bytes.
;   Clobb:  v0-v5, v19-v21
;-----------------------------------------------------------------------
        .macro  CROSS4_SOA
        ld1.4s  { v0 }, [x7], #16               ; v0 = ax[0..3]
        ld1.4s  { v1 }, [x8], #16               ; v1 = ay[0..3]
        ld1.4s  { v2 }, [x9], #16               ; v2 = az[0..3]
        ld1.4s  { v3 }, [x10], #16              ; v3 = bx[0..3]
        ld1.4s  { v4 }, [x11], #16              ; v4 = by[0..3]
        ld1.4s  { v5 }, [x12], #16              ; v5 = bz[0..3]
        fmul.4s v19, v1, v5                     ; cx  = ay * bz
        fmls.4s v19, v2, v4                     ; cx -= az * by
        fmul.4s v20, v2, v3                     ; cy  = az * bx
        fmls.4s v20, v0, v5                     ; cy -= ax * bz
        fmul.4s v21, v0, v4                     ; cz  = ax * by
        fmls.4s v21, v1, v3                     ; cz -= ay * bx
        st1.4s  { v19 }, [x13], #16             ; cx[0..3]
        st1.4s  { v20 }, [x14], #16             ; cy[0..3]
        st1.4s  { v21 }, [x15], #16             ; cz[0..3]
        .endmacro

;-----------------------------------------------------------------------
; CrossProdAOS_(Vector *c, const Vector *a, const Vector *b, int n)
;   (signature demangled from the symbol; return type is not encoded and
;    no return value is produced here)
;
; For i in [0, n): c[i] = cross(a[i], b[i]).
; Each element is read and written as three consecutive 32-bit floats
; (12 bytes); the x/y/z naming follows the original author comments.
;
; In:    x0 = c, x1 = a, x2 = b, w3 = n
; Clobb: x0-x3, v0-v5, v19-v21, flags
; Main loop handles 16 elements per iteration (4 x 4 lanes); a scalar
; loop finishes the remaining 0..15 elements.
;-----------------------------------------------------------------------
        .globl  __Z13CrossProdAOS_P6VectorPKS_S2_i ; -- Begin function _Z13CrossProdAOS_P6VectorPKS_S2_i
        .p2align        2
__Z13CrossProdAOS_P6VectorPKS_S2_i:     ; @_Z13CrossProdAOS_P6VectorPKS_S2_i
        .cfi_startproc
; %bb.0:
        ; InlineAsm Start
        cmp     w3, #16                         ; fewer than 16 elements (or n <= 0)?
        b.lt    LSkipLoop1A                     ;   then skip the vector loop
LLoop1A:                                        ; invariant: w3 >= 16 elements remain
        CROSS4_AOS
        CROSS4_AOS
        CROSS4_AOS
        CROSS4_AOS
        sub     w3, w3, #16                     ; n -= 16
        cmp     w3, #16
        b.ge    LLoop1A                         ; another full batch of 16 available
LSkipLoop1A:
        cmp     w3, #0
        b.le    LDoneA                          ; nothing left (also rejects negative n)
LLoop2A:                                        ; invariant: w3 in [1, 15] elements remain
        ldp     s0, s1, [x1], #8                ; (s0, s1) = (ax, ay); x1 += 8
        ldr     s2, [x1], #4                    ; s2 = az;             x1 += 4
        ldp     s3, s4, [x2], #8                ; (s3, s4) = (bx, by); x2 += 8
        ldr     s5, [x2], #4                    ; s5 = bz;             x2 += 4
        fmul    s19, s1, s5                     ; cx = ay * bz
        fmsub   s19, s2, s4, s19                ; cx = cx - az * by
        fmul    s20, s2, s3                     ; cy = az * bx
        fmsub   s20, s0, s5, s20                ; cy = cy - ax * bz
        fmul    s21, s0, s4                     ; cz = ax * by
        fmsub   s21, s1, s3, s21                ; cz = cz - ay * bx
        stp     s19, s20, [x0], #8              ; store (cx, cy); x0 += 8
        str     s21, [x0], #4                   ; store cz;       x0 += 4
        subs    w3, w3, #1                      ; --n
        b.ne    LLoop2A                         ; loop while elements remain
LDoneA:
        ; InlineAsm End
        ret
        .cfi_endproc
                                        ; -- End function

;-----------------------------------------------------------------------
; CrossProdSOA_(VectorSoA &c, const VectorSoA &a, const VectorSoA &b, int n)
;   (signature demangled from the symbol; return type is not encoded and
;    no return value is produced here)
;
; For i in [0, n): (cx[i], cy[i], cz[i]) = cross(a, b) at index i.
; Each VectorSoA is read as three consecutive 8-byte pointers at offsets
; 0, 8 and 16, each pointing to a stream of 32-bit floats; the x/y/z
; naming follows the original author comments.
;
; In:    x0 = &c, x1 = &a, x2 = &b, w3 = n
; Clobb: x0-x3, x7-x15, v0-v5, v19-v21, flags
; Main loop handles 16 elements per iteration (4 x 4 lanes); a scalar
; loop finishes the remaining 0..15 elements.
;-----------------------------------------------------------------------
        .globl  __Z13CrossProdSOA_R9VectorSoARKS_S2_i ; -- Begin function _Z13CrossProdSOA_R9VectorSoARKS_S2_i
        .p2align        2
__Z13CrossProdSOA_R9VectorSoARKS_S2_i:  ; @_Z13CrossProdSOA_R9VectorSoARKS_S2_i
        .cfi_startproc
; %bb.0:
        ; InlineAsm Start
        ldp     x7, x8, [x1], #16               ; (x7, x8)   = stream pointers (ax, ay)
        ldr     x9, [x1]                        ; x9         = stream pointer az
        ldp     x10, x11, [x2], #16             ; (x10, x11) = stream pointers (bx, by)
        ldr     x12, [x2]                       ; x12        = stream pointer bz
        ldp     x13, x14, [x0], #16             ; (x13, x14) = stream pointers (cx, cy)
        ldr     x15, [x0]                       ; x15        = stream pointer cz
        cmp     w3, #16                         ; fewer than 16 elements (or n <= 0)?
        b.lt    LSkipLoop1B                     ;   then skip the vector loop
LLoop1B:                                        ; invariant: w3 >= 16 elements remain
        CROSS4_SOA
        CROSS4_SOA
        CROSS4_SOA
        CROSS4_SOA
        sub     w3, w3, #16                     ; n -= 16
        cmp     w3, #16
        b.ge    LLoop1B                         ; another full batch of 16 available
LSkipLoop1B:
        cmp     w3, #0
        b.le    LDoneB                          ; nothing left (also rejects negative n)
LLoop2B:                                        ; invariant: w3 in [1, 15] elements remain
        ldr     s0, [x7], #4                    ; s0 = ax; x7  += 4
        ldr     s1, [x8], #4                    ; s1 = ay; x8  += 4
        ldr     s2, [x9], #4                    ; s2 = az; x9  += 4
        ldr     s3, [x10], #4                   ; s3 = bx; x10 += 4
        ldr     s4, [x11], #4                   ; s4 = by; x11 += 4
        ldr     s5, [x12], #4                   ; s5 = bz; x12 += 4
        fmul    s19, s1, s5                     ; cx = ay * bz
        fmsub   s19, s2, s4, s19                ; cx = cx - az * by
        fmul    s20, s2, s3                     ; cy = az * bx
        fmsub   s20, s0, s5, s20                ; cy = cy - ax * bz
        fmul    s21, s0, s4                     ; cz = ax * by
        fmsub   s21, s1, s3, s21                ; cz = cz - ay * bx
        str     s19, [x13], #4                  ; store cx; x13 += 4
        str     s20, [x14], #4                  ; store cy; x14 += 4
        str     s21, [x15], #4                  ; store cz; x15 += 4
        subs    w3, w3, #1                      ; --n
        b.ne    LLoop2B                         ; loop while elements remain
LDoneB:
        ; InlineAsm End
        ret
        .cfi_endproc
                                        ; -- End function
.subsections_via_symbols