; Target: AArch64, Apple Mach-O assembler syntax (';' starts a comment).
	.section	__TEXT,__text,regular,pure_instructions
	.build_version macos, 11, 0	sdk_version 11, 3
	.globl	__Z13MatVecMulF32_P9Vec4x1F32PA4_fS0_i ; -- Begin function _Z13MatVecMulF32_P9Vec4x1F32PA4_fS0_i
	.p2align	2
;-----------------------------------------------------------------------
; MatVecMulF32_(Vec4x1F32* b, float (*M)[4], Vec4x1F32* a, int n)
;
; For each of the n 4-float vectors a[k], computes b[k] = M * a[k].
;
; In:    x0 = b, destination; 16 bytes are written per vector
;        x1 = M, 16 consecutive floats (read as row-major float[4][4])
;        x2 = a, source; 16 bytes are read per vector; must be
;             16-byte aligned
;        w3 = n, number of vectors; must be > 0 and a multiple of 4
;             (n is a 32-bit int, so only w3 is examined: the upper half
;             of x3 is not guaranteed to be defined)
; Out:   w0 = 1 if the vectors were processed, 0 if an argument was
;             rejected (in which case nothing is read or written)
; Clobb: x0, x2, w3, v0-v5, NZCV
;
; Register roles inside the loop:
;        v0..v3 = columns 0..3 of M (loop invariant)
;        v4     = current input vector a[k]
;        v5     = accumulator for b[k]
;-----------------------------------------------------------------------
__Z13MatVecMulF32_P9Vec4x1F32PA4_fS0_i: ; @_Z13MatVecMulF32_P9Vec4x1F32PA4_fS0_i
	.cfi_startproc
; %bb.0:
	; InlineAsm Start

	; ---- argument validation -------------------------------------
	cmp	w3, #0			; n <= 0 ?  (signed: also rejects negative
	b.le	LInvalidArg		;   counts, which would overrun the buffers)
	tst	w3, #0x3		; n must be a multiple of 4: the loop body
	b.ne	LInvalidArg		;   is unrolled four times
	tst	x2, #0xf		; a must be 16-byte aligned
	b.ne	LInvalidArg

	; ld4 de-interleaves the 16 floats at [x1]: element k goes to lane
	; k/4 of register v(k%4), so with row-major M, vj = column j of M.
	ld4.4s	{ v0, v1, v2, v3 }, [x1]

	; ---- main loop: 4 vectors per iteration ----------------------
	; Invariant: w3 = vectors remaining, a positive multiple of 4.
LLoop1:
	; vector 0 of 4
	ld1.4s	{ v4 }, [x2], #16	; v4 = *a++
	fmul.4s	v5, v0, v4[0]		; v5  = M[:,0] * a[0]
	fmla.4s	v5, v1, v4[1]		; v5 += M[:,1] * a[1]
	fmla.4s	v5, v2, v4[2]		; v5 += M[:,2] * a[2]
	fmla.4s	v5, v3, v4[3]		; v5 += M[:,3] * a[3]
	st1.4s	{ v5 }, [x0], #16	; *b++ = v5

	; vector 1 of 4
	ld1.4s	{ v4 }, [x2], #16	; v4 = *a++
	fmul.4s	v5, v0, v4[0]		; v5  = M[:,0] * a[0]
	fmla.4s	v5, v1, v4[1]		; v5 += M[:,1] * a[1]
	fmla.4s	v5, v2, v4[2]		; v5 += M[:,2] * a[2]
	fmla.4s	v5, v3, v4[3]		; v5 += M[:,3] * a[3]
	st1.4s	{ v5 }, [x0], #16	; *b++ = v5

	; vector 2 of 4
	ld1.4s	{ v4 }, [x2], #16	; v4 = *a++
	fmul.4s	v5, v0, v4[0]		; v5  = M[:,0] * a[0]
	fmla.4s	v5, v1, v4[1]		; v5 += M[:,1] * a[1]
	fmla.4s	v5, v2, v4[2]		; v5 += M[:,2] * a[2]
	fmla.4s	v5, v3, v4[3]		; v5 += M[:,3] * a[3]
	st1.4s	{ v5 }, [x0], #16	; *b++ = v5

	; vector 3 of 4
	ld1.4s	{ v4 }, [x2], #16	; v4 = *a++
	fmul.4s	v5, v0, v4[0]		; v5  = M[:,0] * a[0]
	fmla.4s	v5, v1, v4[1]		; v5 += M[:,1] * a[1]
	fmla.4s	v5, v2, v4[2]		; v5 += M[:,2] * a[2]
	fmla.4s	v5, v3, v4[3]		; v5 += M[:,3] * a[3]
	st1.4s	{ v5 }, [x0], #16	; *b++ = v5

	subs	w3, w3, #4		; n -= 4
	b.ne	LLoop1			; loop while vectors remain

	mov	w0, #1			; return code: success
	b	LEXIT

LInvalidArg:
	; NOTE(review): this path previously also returned 1, making errors
	; indistinguishable from success. 0 is used here so that results for
	; valid inputs are unchanged -- confirm against the caller's convention.
	mov	w0, #0			; return code: error
LEXIT:
	; InlineAsm End
	ret
	.cfi_endproc
					; -- End function
.subsections_via_symbols