fmodex/fmod/win32/src/fmod_dsp_distortion_asm.s

129 lines
3.8 KiB
ArmAsm
Raw Normal View History

%include "./FMOD_static/win32/src/c32.mac"
; ==========================================================================================
; GLOBAL INITIALIZED DATA (SIMD constants)
; ==========================================================================================
[SEGMENT .data use32 align=32]
; Packed constant: four single-precision lanes, each 1.0f. Read with movaps.
distortion_ones         times 4 dd 1.0
; Packed mask: every bit set except the sign bit in each 32-bit lane.
; ANDing a float vector with it yields the per-lane absolute value.
distortion_fabsmask     times 4 dd 0x7FFFFFFF
; ==========================================================================================
; CODE
; ==========================================================================================
[SEGMENT .text use32 align=32]
; =================================================================================================================================
; void FMOD_DSP_Distortion_SIMD(float * inbuffer, float * outbuffer, unsigned int length, int inchannels, int outchannels, float k);
; =================================================================================================================================
; ---------------------------------------------------------------------------
; FMOD_DSP_Distortion_SIMD
;
; Reads (length * inchannels) consecutive floats x from inbuffer and writes,
; for each one,
;
;       y ~= (k + 1) * x / (k * |x| + 1)
;
; to the same position in outbuffer. The divide is done as an approximate
; reciprocal (rcpps / rcpss) followed by a multiply, trading precision for
; speed.
;
; Arguments are read from the stack relative to ebp; the offsets are the
; %$name symbols produced by the proc/arg macros (defined in c32.mac, not
; shown here):
;       inbuffer     source float buffer
;       outbuffer    destination float buffer
;       length       multiplied by inchannels to get the float count
;       inchannels   see length
;       outchannels  not read by this routine
;       k            float, broadcast to all four lanes
;
; Sample data is accessed with unaligned moves (movups / movss), so the
; buffers themselves need no particular alignment.
;
; Registers: ecx, edx, esi and edi are saved and restored. eax and ebx are
; never written. xmm0-xmm7 are overwritten.
; ---------------------------------------------------------------------------
proc FMOD_DSP_Distortion_SIMD
%$inbuffer arg
%$outbuffer arg
%$length arg
%$inchannels arg
%$outchannels arg
%$k arg
        ; Save only the general-purpose registers that are modified below.
        push    ecx
        push    edx
        push    esi
        push    edi

        mov     esi, [ebp+%$inbuffer]           ; esi = read cursor
        mov     edi, [ebp+%$outbuffer]          ; edi = write cursor

        ; Register map for both loops:
        ;   xmm0 = [0x7fffffff] x4   sign-clearing mask
        ;   xmm1 = [k]          x4
        ;   xmm2 = [k + 1]      x4
        ;   xmm3 = scratch: k*|x| + 1, then its reciprocal (samples 0-3)
        ;   xmm4 = scratch: x, then the result             (samples 0-3)
        ;   xmm5 = [1.0f]       x4
        ;   xmm6 = same role as xmm3 for samples 4-7 (unrolled loop only)
        ;   xmm7 = same role as xmm4 for samples 4-7 (unrolled loop only)
        movaps  xmm0, [distortion_fabsmask]
        movss   xmm1, [ebp+%$k]
        shufps  xmm1, xmm1, 0x00                ; broadcast k to all lanes
        movaps  xmm5, [distortion_ones]
        movaps  xmm2, xmm1
        addps   xmm2, xmm5                      ; xmm2 = k + 1

        mov     edx, [ebp+%$length]
        imul    edx, [ebp+%$inchannels]         ; edx = total floats to process
        mov     ecx, edx
        shr     ecx, 3                          ; ecx = number of 8-float groups
        jz      distortionlooprolledstart       ; shr already set ZF on a zero result

        ; ---- main loop: 8 floats (two 4-lane vectors) per iteration ----
distortionloopunrolled:
        movups  xmm3, [esi]
        movaps  xmm4, xmm3                      ; keep signed x for the numerator
        andps   xmm3, xmm0                      ; |x|
        mulps   xmm3, xmm1                      ; k * |x|
        addps   xmm3, xmm5                      ; k * |x| + 1
        rcpps   xmm3, xmm3                      ; approximate 1 / (k * |x| + 1)
        mulps   xmm4, xmm2                      ; (k + 1) * x
        mulps   xmm4, xmm3                      ; reciprocal multiply in place of
                                                ; "divps xmm4, xmm3": roughly 12-bit
                                                ; accuracy, considered good enough
                                                ; here and much faster
        movups  [edi], xmm4

        movups  xmm6, [esi+16]
        movaps  xmm7, xmm6                      ; keep signed x for the numerator
        andps   xmm6, xmm0                      ; |x|
        mulps   xmm6, xmm1                      ; k * |x|
        addps   xmm6, xmm5                      ; k * |x| + 1
        rcpps   xmm6, xmm6                      ; approximate 1 / (k * |x| + 1)
        mulps   xmm7, xmm2                      ; (k + 1) * x
        mulps   xmm7, xmm6                      ; reciprocal multiply, as above
        movups  [edi+16], xmm7

        add     edi, 32
        add     esi, 32
        dec     ecx
        jnz     near distortionloopunrolled

        ; ---- tail loop: the remaining (total & 7) floats, one at a time ----
distortionlooprolledstart:
        mov     ecx, edx
        and     ecx, 7                          ; ecx = leftover float count
        jz      distortiondone                  ; and already set ZF on a zero result

distortionlooprolled:
        movss   xmm3, [esi]                     ; load from memory also zeroes lanes 1-3
        movss   xmm4, xmm3                      ; keep signed x for the numerator
        andps   xmm3, xmm0                      ; |x|
        mulss   xmm3, xmm1                      ; k * |x|
        addss   xmm3, xmm5                      ; k * |x| + 1
        rcpss   xmm3, xmm3                      ; approximate 1 / (k * |x| + 1)
        mulss   xmm4, xmm2                      ; (k + 1) * x
        mulss   xmm4, xmm3                      ; reciprocal multiply in place of
                                                ; "divss xmm4, xmm3"
        movss   [edi], xmm4
        add     edi, 4
        add     esi, 4
        dec     ecx
        jnz     near distortionlooprolled

distortiondone:
        pop     edi
        pop     esi
        pop     edx
        pop     ecx
endproc