%include "./FMOD_static/win32/src/c32.mac"

; ==========================================================================================
; GLOBAL INITIALIZED DATA (packed SIMD constants)
; ==========================================================================================

[SEGMENT .data use32 align=32]

; Packed single-precision constants (4 lanes each). FMOD_DSP_Distortion_SIMD loads
; them with movaps, so each one must start on a 16-byte boundary.
distortion_ones:        times 4 dd 1.0              ; 1.0f in every lane
distortion_fabsmask:    times 4 dd 0x7FFFFFFF       ; every bit except bit 31 (the float sign bit)

; ==========================================================================================
; CODE
; ==========================================================================================

[SEGMENT .text use32 align=32]

; =================================================================================================================================
; void FMOD_DSP_Distortion_SIMD(float * inbuffer, float * outbuffer, unsigned int length, int inchannels, int outchannels, float k);
;
; Applies the distortion curve
;
;       out[i] = in[i] * (k + 1) * rcp(1 + k * |in[i]|)
;
; to (length * inchannels) consecutive floats, where rcp() is the SSE approximate reciprocal (rcpps / rcpss).
;
; In:    inbuffer     source floats; read with movups/movss, so no alignment is required
;        outbuffer    destination floats; written with movups/movss, so no alignment is required
;        length       multiplied by inchannels to obtain the number of floats processed
;        inchannels   see length
;        outchannels  declared for the stack layout but never read by this routine
;        k            distortion amount, broadcast to all four SIMD lanes
; Out:   nothing is returned in a register; results are stored to outbuffer
; Saved: eax, ebx, ecx, edx, esi, edi are pushed on entry and popped on exit
; Clobb: xmm0-xmm7, flags
; Note:  arguments are addressed relative to ebp; the frame itself is set up by the `proc` macro from c32.mac.
; =================================================================================================================================
proc FMOD_DSP_Distortion_SIMD

		%$inbuffer		arg 
		%$outbuffer		arg 
		%$length		arg 
		%$inchannels	arg
		%$outchannels	arg
		%$k         	arg

        push    eax
        push    ebx
        push    ecx
        push    edx
        push    esi
        push    edi

        mov     esi, [ebp+%$inbuffer]           ; esi = read cursor
        mov     edi, [ebp+%$outbuffer]          ; edi = write cursor

        ; Register roles for the rest of the routine:
        ;   xmm0 = [0x7fffffff][0x7fffffff][0x7fffffff][0x7fffffff]   sign-clearing mask (fabs)
        ;   xmm1 = [k         ][k         ][k         ][k         ]
        ;   xmm2 = [k+1       ][k+1       ][k+1       ][k+1       ]
        ;   xmm3 = denominator / reciprocal, first group of four (also used by the scalar tail)
        ;   xmm4 = numerator / result,       first group of four (also used by the scalar tail)
        ;   xmm5 = [1.0f      ][1.0f      ][1.0f      ][1.0f      ]
        ;   xmm6 = denominator / reciprocal, second group of four
        ;   xmm7 = numerator / result,       second group of four
        ;   edx  = total number of floats to process
        ;   ecx  = loop counter

        movaps  xmm0, [distortion_fabsmask]
        movss   xmm1, [ebp+%$k]
        shufps  xmm1, xmm1, 0x00                ; broadcast k from lane 0 to all lanes
        movaps  xmm2, xmm1
        movaps  xmm5, [distortion_ones]
        addps   xmm2, xmm5                      ; xmm2 = k + 1

        mov     edx, [ebp+%$length]
        imul    edx, [ebp+%$inchannels]         ; edx = length * inchannels
        mov     ecx, edx
        shr     ecx, 3                          ; ecx = number of whole 8-float groups; shr sets ZF from the result
        jz      .tail                           ; fewer than 8 floats: go straight to the scalar tail

        ; ---- Main loop: 8 floats (two 4-wide vectors) per iteration ---------------------------------
.unrolled:

        movups  xmm3, [esi]
        movaps  xmm4, xmm3                      ; keep the signed input for the numerator
        andps   xmm3, xmm0                      ; |x|
        mulps   xmm3, xmm1                      ; k * |x|
        addps   xmm3, xmm5                      ; 1 + k * |x|
        rcpps   xmm3, xmm3                      ; approximate 1 / (1 + k * |x|)
        mulps   xmm4, xmm2                      ; x * (k + 1)
;        divps   xmm4, xmm3                     ; exact form: divide by the denominator instead of rcpps + mulps
        mulps   xmm4, xmm3                      ; reciprocal multiply instead of a div.  Only 12bit accuracy but good enough here.  Much faster.
        movups  [edi], xmm4

        movups  xmm6, [esi+16]
        movaps  xmm7, xmm6                      ; keep the signed input for the numerator
        andps   xmm6, xmm0                      ; |x|
        mulps   xmm6, xmm1                      ; k * |x|
        addps   xmm6, xmm5                      ; 1 + k * |x|
        rcpps   xmm6, xmm6                      ; approximate 1 / (1 + k * |x|)
        mulps   xmm7, xmm2                      ; x * (k + 1)
;        divps   xmm7, xmm6                     ; exact form: divide by the denominator instead of rcpps + mulps
        mulps   xmm7, xmm6                      ; reciprocal multiply instead of a div (same trade-off as above)
        movups  [edi+16], xmm7

        add     edi, 32
        add     esi, 32
        dec     ecx
        jnz     near .unrolled

        ; ---- Scalar tail: the remaining (total & 7) floats, one at a time ----------------------------
.tail:

        xorps   xmm3, xmm3

        mov     ecx, edx
        and     ecx, 7                          ; ecx = leftover float count; and sets ZF from the result
        jz      .done

.scalar:

        movss   xmm3, [esi]
        movaps  xmm4, xmm3                      ; full-register copy; only lane 0 is consumed below
        andps   xmm3, xmm0                      ; |x|
        mulss   xmm3, xmm1                      ; k * |x|
        addss   xmm3, xmm5                      ; 1 + k * |x|
        rcpss   xmm3, xmm3                      ; approximate 1 / (1 + k * |x|)
        mulss   xmm4, xmm2                      ; x * (k + 1)
;        divss   xmm4, xmm3                     ; exact form: divide by the denominator instead of rcpss + mulss
        mulss   xmm4, xmm3                      ; reciprocal multiply instead of a div (same trade-off as the main loop)
        movss   [edi], xmm4

        add     edi, 4
        add     esi, 4

        dec     ecx
        jnz     near .scalar

.done:

        pop     edi
        pop     esi
        pop     edx
        pop     ecx
        pop     ebx
        pop     eax
endproc