; ----------------------------------------------------------------------------
; Exported symbol: __Z13CalcCorrCoef_PfS_PKfS1_mf
; Demangled:       CalcCorrCoef_(float*, float*, const float*, const float*,
;                                unsigned long, float)
;
; Target / syntax: AArch64, Apple (Mach-O) assembler; ';' starts a comment.
;
; Computes the five running sums needed for a correlation coefficient over
; two float arrays and, when the denominator is usable, the coefficient:
;
;   num = n*sum_xy - sum_x*sum_y
;   den = sqrt(n*sum_xx - sum_x^2) * sqrt(n*sum_yy - sum_y^2)
;   rho = num / den
;
; In:   x0 = destination for rho (one float)
;       x1 = destination for {sum_x, sum_y, sum_xx, sum_yy, sum_xy} (5 floats)
;       x2 = x array, must be 16-byte aligned
;       x3 = y array, must be 16-byte aligned
;       x4 = n, element count, must be non-zero
;       s0 = epsilon, smallest acceptable denominator
; Out:  w0 = 1 : sums and rho stored
;       w0 = 0 : n == 0 or a misaligned array (nothing is stored), or the
;                denominator is below epsilon / NaN (sums stored, rho = 0)
; Clobbers: x1 (advanced by 16), x2, x3, x4, x5, w8, v0-v7, v16-v22, flags.
;           Only caller-saved registers are touched; no stack is used.
; ----------------------------------------------------------------------------
	.section	__TEXT,__text,regular,pure_instructions
	.build_version macos, 11, 0	sdk_version 11, 3
	.globl	__Z13CalcCorrCoef_PfS_PKfS1_mf  ; -- Begin function _Z13CalcCorrCoef_PfS_PKfS1_mf
	.p2align	2
__Z13CalcCorrCoef_PfS_PKfS1_mf:         ; @_Z13CalcCorrCoef_PfS_PKfS1_mf
	.cfi_startproc
; %bb.0:
	; InlineAsm Start

	; ---- argument validation -------------------------------------------
	cbz	x4, LInvalidArg             ; n == 0 -> fail
	tst	x2, #0xf                    ; (x & 15) != 0 -> x not 16-byte aligned
	b.ne	LInvalidArg
	tst	x3, #0xf                    ; (y & 15) != 0 -> y not 16-byte aligned
	b.ne	LInvalidArg

	mov	x5, x4                      ; x5 = original n (x4 is counted down)
	fmov	s22, s0                     ; keep epsilon safe: the block loop
	                                    ; below loads x data into v0

	; ---- accumulators (4 partial sums per register) --------------------
	eor.16b	v16, v16, v16               ; sum_x  = 0
	eor.16b	v17, v17, v17               ; sum_y  = 0
	eor.16b	v18, v18, v18               ; sum_xx = 0
	eor.16b	v19, v19, v19               ; sum_yy = 0
	eor.16b	v20, v20, v20               ; sum_xy = 0

	cmp	x4, #16
	b.lo	LSkipLoop1                  ; n < 16 -> only the scalar tail runs

	; ---- main loop: 16 elements of x and y per iteration ---------------
	; Invariant: x4 = elements remaining, x4 >= 16 on entry to each pass.
LLoop1:
	ld1.4s	{ v0, v1, v2, v3 }, [x2], #64   ; x[0:16], x2 += 64
	ld1.4s	{ v4, v5, v6, v7 }, [x3], #64   ; y[0:16], x3 += 64

	fadd.4s	v16, v16, v0                ; sum_x  += x[0:4]
	fadd.4s	v17, v17, v4                ; sum_y  += y[0:4]
	fmla.4s	v18, v0, v0                 ; sum_xx += x*x
	fmla.4s	v19, v4, v4                 ; sum_yy += y*y
	fmla.4s	v20, v0, v4                 ; sum_xy += x*y

	fadd.4s	v16, v16, v1                ; same for elements 4..7
	fadd.4s	v17, v17, v5
	fmla.4s	v18, v1, v1
	fmla.4s	v19, v5, v5
	fmla.4s	v20, v1, v5

	fadd.4s	v16, v16, v2                ; same for elements 8..11
	fadd.4s	v17, v17, v6
	fmla.4s	v18, v2, v2
	fmla.4s	v19, v6, v6
	fmla.4s	v20, v2, v6

	fadd.4s	v16, v16, v3                ; same for elements 12..15
	fadd.4s	v17, v17, v7
	fmla.4s	v18, v3, v3
	fmla.4s	v19, v7, v7
	fmla.4s	v20, v3, v7

	sub	x4, x4, #16                 ; n -= 16
	cmp	x4, #16
	b.hs	LLoop1                      ; another full block left?

	; ---- horizontal reduction: fold 4 lanes into lane 0 ----------------
	; First faddp leaves {l0+l1, l2+l3, ...}; second leaves the full total
	; in lane 0, so s16..s20 hold the scalar sums afterwards.
LSkipLoop1:
	faddp.4s	v16, v16, v16
	faddp.4s	v16, v16, v16           ; s16 = sum_x
	faddp.4s	v17, v17, v17
	faddp.4s	v17, v17, v17           ; s17 = sum_y
	faddp.4s	v18, v18, v18
	faddp.4s	v18, v18, v18           ; s18 = sum_xx
	faddp.4s	v19, v19, v19
	faddp.4s	v19, v19, v19           ; s19 = sum_yy
	faddp.4s	v20, v20, v20
	faddp.4s	v20, v20, v20           ; s20 = sum_xy

	cbz	x4, LSkipLoop2              ; no leftover elements

	; ---- scalar tail: remaining 1..15 elements -------------------------
LLoop2:
	ldr	s1, [x2], #4                ; s1 = *x++
	ldr	s2, [x3], #4                ; s2 = *y++
	fadd	s16, s16, s1                ; sum_x  += x
	fadd	s17, s17, s2                ; sum_y  += y
	fmadd	s18, s1, s1, s18            ; sum_xx += x*x
	fmadd	s19, s2, s2, s19            ; sum_yy += y*y
	fmadd	s20, s1, s2, s20            ; sum_xy += x*y
	subs	x4, x4, #1
	b.ne	LLoop2

	; ---- publish the five sums -----------------------------------------
LSkipLoop2:
	stp	s16, s17, [x1], #8          ; sums[0..1] = sum_x, sum_y;  x1 += 8
	stp	s18, s19, [x1], #8          ; sums[2..3] = sum_xx, sum_yy; x1 += 8
	str	s20, [x1]                   ; sums[4]    = sum_xy

	; ---- rho numerator --------------------------------------------------
	ucvtf	s21, x5                     ; s21 = (float)n; n is unsigned
	fmul	s1, s21, s20                ; s1 = n * sum_xy
	fmsub	s1, s16, s17, s1            ; s1 -= sum_x * sum_y

	; ---- rho denominator ------------------------------------------------
	fmul	s2, s21, s18                ; s2 = n * sum_xx
	fmsub	s2, s16, s16, s2            ; s2 -= sum_x * sum_x
	fsqrt	s2, s2
	fmul	s3, s21, s19                ; s3 = n * sum_yy
	fmsub	s3, s17, s17, s3            ; s3 -= sum_y * sum_y
	fsqrt	s3, s3
	fmul	s4, s2, s3                  ; s4 = rho denominator

	; b.lt after fcmp is taken for "less than" AND "unordered", so a NaN
	; denominator (e.g. sqrt of a slightly negative rounding residue) is
	; rejected here instead of producing rho = NaN with a success code.
	fcmp	s4, s22                     ; denominator vs. saved epsilon
	b.lt	LBadRhoDen

	fdiv	s5, s1, s4                  ; rho = num / den
	str	s5, [x0]
	mov	w8, #1                      ; return code: success
	b	LReturn

LBadRhoDen:
	eor.16b	v5, v5, v5                  ; rho = 0
	str	s5, [x0]
	                                    ; falls through: also a failure
LInvalidArg:
	mov	w8, #0                      ; return code: failure

LReturn:
	; InlineAsm End
	and	w0, w8, #0x1                ; normalise to a bool in w0
	ret
	.cfi_endproc
                                        ; -- End function
.subsections_via_symbols