.section	__TEXT,__text,regular,pure_instructions
	.build_version macos, 11, 0	sdk_version 11, 3
	.globl	__Z13MatVecMulF32_P9Vec4x1F32PA4_fS0_i ; -- Begin function _Z13MatVecMulF32_P9Vec4x1F32PA4_fS0_i
	.p2align	2
__Z13MatVecMulF32_P9Vec4x1F32PA4_fS0_i: ; @_Z13MatVecMulF32_P9Vec4x1F32PA4_fS0_i
	.cfi_startproc
; %bb.0:
	; InlineAsm Start

	;-----------------------------------------------------------------------
	; MatVecMulF32_(Vec4x1F32* b, float M[][4], Vec4x1F32* a, int n)
	;
	; For each of the n input vectors computes  b[i] = M * a[i],
	; where M is a 4x4 float matrix and each vector is 4 floats (16 bytes).
	;
	; In:    x0 = b   destination vectors (written, 16 bytes per vector)
	;        x1 = M   16 consecutive floats, read once
	;        x2 = a   source vectors, must be 16-byte aligned
	;        w3 = n   vector count (32-bit int); must be > 0 and a
	;                 multiple of 4 because the loop is unrolled 4x
	; Out:   w0 = 1 on success, 0 if an argument was rejected
	; Clobb: x0, x2, w3, v0-v5, flags
	;
	; Register roles inside the loop:
	;        v0..v3 = columns 0..3 of M (loop invariant)
	;        v4     = current input vector a[i]
	;        v5     = accumulator / current output vector b[i]
	;-----------------------------------------------------------------------

	; --- argument validation ---------------------------------------------
	; n is a 32-bit int, so only w3 is meaningful: the upper half of x3 is
	; not guaranteed by the calling convention and must not be tested.
	cmp	w3, #0	; if (n <= 0)
	b.le	LInvalidArg	;   goto InvalidArg (also rejects negative n)
	tst	w3, #0x3	; if ((n & 3) != 0)
	b.ne	LInvalidArg	;   goto InvalidArg
	tst	x2, #0xf	; if ((a & 0xf) != 0), i.e. a not 16-byte aligned
	b.ne	LInvalidArg	;   goto InvalidArg

	; ld4 de-interleaves with stride 4: v0 = {M[0],M[4],M[8],M[12]}, etc.
	; For a row-major 4x4 matrix that places column j of M in vj.
	ld4.4s	{ v0, v1, v2, v3 }, [x1]	; v0..v3 = columns 0..3 of M

LLoop1:
	; Invariant: w3 = vectors remaining, > 0 and a multiple of 4.

	; vector 0 of this group
	ld1.4s	{ v4 }, [x2], #16	; v4 = *a ; a += 16 bytes
	fmul.4s	v5, v0, v4[0]	; v5  = M[:,0] * a[0]
	fmla.4s	v5, v1, v4[1]	; v5 += M[:,1] * a[1]
	fmla.4s	v5, v2, v4[2]	; v5 += M[:,2] * a[2]
	fmla.4s	v5, v3, v4[3]	; v5 += M[:,3] * a[3]
	st1.4s	{ v5 }, [x0], #16	; *b = v5 ; b += 16 bytes

	; vector 1 of this group
	ld1.4s	{ v4 }, [x2], #16	; v4 = *a ; a += 16 bytes
	fmul.4s	v5, v0, v4[0]	; v5  = M[:,0] * a[0]
	fmla.4s	v5, v1, v4[1]	; v5 += M[:,1] * a[1]
	fmla.4s	v5, v2, v4[2]	; v5 += M[:,2] * a[2]
	fmla.4s	v5, v3, v4[3]	; v5 += M[:,3] * a[3]
	st1.4s	{ v5 }, [x0], #16	; *b = v5 ; b += 16 bytes

	; vector 2 of this group
	ld1.4s	{ v4 }, [x2], #16	; v4 = *a ; a += 16 bytes
	fmul.4s	v5, v0, v4[0]	; v5  = M[:,0] * a[0]
	fmla.4s	v5, v1, v4[1]	; v5 += M[:,1] * a[1]
	fmla.4s	v5, v2, v4[2]	; v5 += M[:,2] * a[2]
	fmla.4s	v5, v3, v4[3]	; v5 += M[:,3] * a[3]
	st1.4s	{ v5 }, [x0], #16	; *b = v5 ; b += 16 bytes

	; vector 3 of this group
	ld1.4s	{ v4 }, [x2], #16	; v4 = *a ; a += 16 bytes
	fmul.4s	v5, v0, v4[0]	; v5  = M[:,0] * a[0]
	fmla.4s	v5, v1, v4[1]	; v5 += M[:,1] * a[1]
	fmla.4s	v5, v2, v4[2]	; v5 += M[:,2] * a[2]
	fmla.4s	v5, v3, v4[3]	; v5 += M[:,3] * a[3]
	st1.4s	{ v5 }, [x0], #16	; *b = v5 ; b += 16 bytes

	subs	w3, w3, #4	; n -= 4 (four vectors processed)
	b.ne	LLoop1	; if (n != 0) goto Loop1

	mov	w0, #1	; return code: success
	b	LEXIT

LInvalidArg:
	; Must differ from the success value, otherwise the caller cannot
	; tell that the arguments were rejected and that b was left unwritten.
	; NOTE(review): 0 = failure is assumed here; confirm against the
	; C++ declaration of the return type.
	mov	w0, #0	; return code: error
LEXIT:

	; InlineAsm End
	ret
	.cfi_endproc
                                        ; -- End function
.subsections_via_symbols