Diffstat (limited to 'muse_qt4_evolution/al/dspSSE.cpp')
-rw-r--r-- | muse_qt4_evolution/al/dspSSE.cpp | 531 |
1 file changed, 0 insertions, 531 deletions
diff --git a/muse_qt4_evolution/al/dspSSE.cpp b/muse_qt4_evolution/al/dspSSE.cpp
deleted file mode 100644
index 0f3d84b9..00000000
--- a/muse_qt4_evolution/al/dspSSE.cpp
+++ /dev/null
@@ -1,531 +0,0 @@
-/*
-    Copyright (C) 2005 Paul Davis
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
-    Author: Sampo Savolainen
-
-    $Id: sse_functions.s 988 2006-10-17 20:40:39Z paul $
-*/
-
-
-#; void x86_sse_mix_buffers_with_gain (float *dst, float *src, long nframes, float gain);
-
-.globl x86_sse_mix_buffers_with_gain
-    .type x86_sse_mix_buffers_with_gain,@function
-
-x86_sse_mix_buffers_with_gain:
-#; 8(%ebp)  = float *dst    = %edi
-#; 12(%ebp) = float *src    = %esi
-#; 16(%ebp) = long  nframes = %ecx
-#; 20(%ebp) = float gain    = st(0)
-
-    pushl %ebp
-    movl %esp, %ebp
-
-    #; save the registers
-#;  pushl %eax
-    pushl %ebx
-#;  pushl %ecx
-    pushl %edi
-    pushl %esi
-
-    #; if nframes == 0, go to end
-    movl 16(%ebp), %ecx #; nframes
-    cmp $0, %ecx
-    je .MBWG_END
-
-    #; Check for alignment
-
-    movl 8(%ebp), %edi #; dst
-    movl 12(%ebp), %esi #; src
-
-    movl %edi, %eax
-    andl $12, %eax #; mask alignemnt offset
-
-    movl %esi, %ebx
-    andl $12, %ebx #; mask alignment offset
-
-    cmp %eax, %ebx
-    jne .MBWG_NONALIGN #; if not aligned, calculate manually
-
-    #; if we are aligned
-    cmp $0, %ebx
-    jz .MBWG_SSE
-
-    #; Pre-loop, we need to run 1-3 frames "manually" without
-    #; SSE instructions
-
-    movss 20(%ebp), %xmm1 #; xmm1
-
-.MBWG_PRELOOP:
-
-    movss (%esi), %xmm0
-    mulss %xmm1, %xmm0
-    addss (%edi), %xmm0
-    movss %xmm0, (%edi)
-
-    addl $4, %edi #; dst++
-    addl $4, %esi #; src++
-    decl %ecx #; nframes--
-    jz .MBWG_END
-
-#;  cmp $0, %ecx
-#;  je .MBWG_END #; if we run out of frames, go to end
-
-    addl $4, %ebx
-
-    cmp $16, %ebx #; test if we've reached 16 byte alignment
-    jne .MBWG_PRELOOP
-
-
-.MBWG_SSE:
-
-    cmp $4, %ecx #; we know it's not zero, but if it's not >=4, then
-    jnge .MBWG_NONALIGN #; we jump straight to the "normal" code
-
-    #; copy gain to fill %xmm1
-    movss 20(%ebp), %xmm1
-    shufps $0x00, %xmm1, %xmm1
-
-
-.MBWG_SSELOOP:
-
-    movaps (%esi), %xmm0 #; source => xmm0
-    mulps %xmm1, %xmm0   #; apply gain to source
-    addps (%edi), %xmm0  #; mix with destination
-    movaps %xmm0, (%edi) #; copy result to destination
-
-    addl $16, %edi #; dst+=4
-    addl $16, %esi #; src+=4
-
-    subl $4, %ecx #; nframes-=4
-    cmp $4, %ecx
-    jge .MBWG_SSELOOP
-
-    cmp $0, %ecx
-    je .MBWG_END
-
-    #; if there are remaining frames, the nonalign code will do nicely
-    #; for the rest 1-3 frames.
-
-.MBWG_NONALIGN:
-    #; not aligned!
-
-    movss 20(%ebp), %xmm1 #; gain => xmm1
-
-.MBWG_NONALIGNLOOP:
-
-    movss (%esi), %xmm0
-    mulss %xmm1, %xmm0
-    addss (%edi), %xmm0
-    movss %xmm0, (%edi)
-
-    addl $4, %edi
-    addl $4, %esi
-
-    decl %ecx
-    jnz .MBWG_NONALIGNLOOP
-
-.MBWG_END:
-
-    popl %esi
-    popl %edi
-#;  popl %ecx
-    popl %ebx
-#;  popl %eax
-
-    #; return
-    leave
-    ret
-
-.size x86_sse_mix_buffers_with_gain, .-x86_sse_mix_buffers_with_gain
-
-
-
-
-#; void x86_sse_mix_buffers_no_gain (float *dst, float *src, long nframes);
-
-.globl x86_sse_mix_buffers_no_gain
-    .type x86_sse_mix_buffers_no_gain,@function
-
-x86_sse_mix_buffers_no_gain:
-#; 8(%ebp)  = float *dst    = %edi
-#; 12(%ebp) = float *src    = %esi
-#; 16(%ebp) = long  nframes = %ecx
-
-    pushl %ebp
-    movl %esp, %ebp
-
-    #; save the registers
-#;  pushl %eax
-    pushl %ebx
-#;  pushl %ecx
-    pushl %edi
-    pushl %esi
-
-    #; the real function
-
-    #; if nframes == 0, go to end
-    movl 16(%ebp), %ecx #; nframes
-    cmp $0, %ecx
-    je .MBNG_END
-
-    #; Check for alignment
-
-    movl 8(%ebp), %edi #; dst
-    movl 12(%ebp), %esi #; src
-
-    movl %edi, %eax
-    andl $12, %eax #; mask alignemnt offset
-
-    movl %esi, %ebx
-    andl $12, %ebx #; mask alignment offset
-
-    cmp %eax, %ebx
-    jne .MBNG_NONALIGN #; if not aligned, calculate manually
-
-    cmp $0, %ebx
-    je .MBNG_SSE
-
-    #; Pre-loop, we need to run 1-3 frames "manually" without
-    #; SSE instructions
-
-.MBNG_PRELOOP:
-
-    movss (%esi), %xmm0
-    addss (%edi), %xmm0
-    movss %xmm0, (%edi)
-
-    addl $4, %edi #; dst++
-    addl $4, %esi #; src++
-    decl %ecx #; nframes--
-    jz .MBNG_END
-    addl $4, %ebx
-
-    cmp $16, %ebx #; test if we've reached 16 byte alignment
-    jne .MBNG_PRELOOP
-
-.MBNG_SSE:
-
-    cmp $4, %ecx #; if there are frames left, but less than 4
-    jnge .MBNG_NONALIGN #; we can't run SSE
-
-.MBNG_SSELOOP:
-
-    movaps (%esi), %xmm0 #; source => xmm0
-    addps (%edi), %xmm0  #; mix with destination
-    movaps %xmm0, (%edi) #; copy result to destination
-
-    addl $16, %edi #; dst+=4
-    addl $16, %esi #; src+=4
-
-    subl $4, %ecx #; nframes-=4
-    cmp $4, %ecx
-    jge .MBNG_SSELOOP
-
-    cmp $0, %ecx
-    je .MBNG_END
-
-    #; if there are remaining frames, the nonalign code will do nicely
-    #; for the rest 1-3 frames.
-
-.MBNG_NONALIGN:
-    #; not aligned!
-
-    movss (%esi), %xmm0 #; src => xmm0
-    addss (%edi), %xmm0 #; xmm0 += dst
-    movss %xmm0, (%edi) #; xmm0 => dst
-
-    addl $4, %edi
-    addl $4, %esi
-
-    decl %ecx
-    jnz .MBNG_NONALIGN
-
-.MBNG_END:
-
-    popl %esi
-    popl %edi
-#;  popl %ecx
-    popl %ebx
-#;  popl %eax
-
-    #; return
-    leave
-    ret
-
-.size x86_sse_mix_buffers_no_gain, .-x86_sse_mix_buffers_no_gain
-
-
-
-
-#; void x86_sse_apply_gain_to_buffer (float *buf, long nframes, float gain);
-
-.globl x86_sse_apply_gain_to_buffer
-    .type x86_sse_apply_gain_to_buffer,@function
-
-x86_sse_apply_gain_to_buffer:
-#; 8(%ebp)  = float *buf    = %edi
-#; 12(%ebp) = long  nframes = %ecx
-#; 16(%ebp) = float gain    = st(0)
-
-    pushl %ebp
-    movl %esp, %ebp
-
-    #; save %edi
-    pushl %edi
-
-    #; the real function
-
-    #; if nframes == 0, go to end
-    movl 12(%ebp), %ecx #; nframes
-    cmp $0, %ecx
-    je .AG_END
-
-    #; create the gain buffer in %xmm1
-    movss 16(%ebp), %xmm1
-    shufps $0x00, %xmm1, %xmm1
-
-    #; Check for alignment
-
-    movl 8(%ebp), %edi #; buf
-    movl %edi, %edx #; buf => %edx
-    andl $12, %edx #; mask bits 1 & 2, result = 0, 4, 8 or 12
-    jz .AG_SSE #; if buffer IS aligned
-
-    #; PRE-LOOP
-    #; we iterate 1-3 times, doing normal x87 float comparison
-    #; so we reach a 16 byte aligned "buf" (=%edi) value
-
-.AGLP_START:
-
-    #; Load next value from the buffer
-    movss (%edi), %xmm0
-    mulss %xmm1, %xmm0
-    movss %xmm0, (%edi)
-
-    #; increment buffer, decrement counter
-    addl $4, %edi #; buf++;
-
-    decl %ecx #; nframes--
-    jz .AG_END #; if we run out of frames, we go to the end
-
-    addl $4, %edx #; one non-aligned byte less
-    cmp $16, %edx
-    jne .AGLP_START #; if more non-aligned frames exist, we do a do-over
-
-.AG_SSE:
-
-    #; We have reached the 16 byte aligned "buf" ("edi") value
-
-    #; Figure out how many loops we should do
-    movl %ecx, %eax #; copy remaining nframes to %eax for division
-    movl $0, %edx #; 0 the edx register
-
-
-    pushl %edi
-    movl $4, %edi
-    divl %edi #; %edx = remainder == 0
-    popl %edi
-
-    #; %eax = SSE iterations
-    cmp $0, %eax
-    je .AGPOST_START
-
-
-.AGLP_SSE:
-
-    movaps (%edi), %xmm0
-    mulps %xmm1, %xmm0
-    movaps %xmm0, (%edi)
-
-    addl $16, %edi
-#;  subl $4, %ecx #; nframes-=4
-
-    decl %eax
-    jnz .AGLP_SSE
-
-    #; Next we need to post-process all remaining frames
-    #; the remaining frame count is in %ecx
-
-    #; if no remaining frames, jump to the end
-#;  cmp $0, %ecx
-    andl $3, %ecx #; nframes % 4
-    je .AG_END
-
-.AGPOST_START:
-
-    movss (%edi), %xmm0
-    mulss %xmm1, %xmm0
-    movss %xmm0, (%edi)
-
-    #; increment buffer, decrement counter
-    addl $4, %edi #; buf++;
-
-    decl %ecx #; nframes--
-    jnz .AGPOST_START #; if we run out of frames, we go to the end
-
-.AG_END:
-
-
-    popl %edi
-
-    #; return
-    leave
-    ret
-
-.size x86_sse_apply_gain_to_buffer, .-x86_sse_apply_gain_to_buffer
-#; end proc
-
-
-
-#; float x86_sse_compute_peak(float *buf, long nframes, float current);
-
-.globl x86_sse_compute_peak
-    .type x86_sse_compute_peak,@function
-
-x86_sse_compute_peak:
-#; 8(%ebp)  = float *buf    = %edi
-#; 12(%ebp) = long  nframes = %ecx
-#; 16(%ebp) = float current = st(0)
-
-    pushl %ebp
-    movl %esp, %ebp
-
-    #; save %edi
-    pushl %edi
-
-    #; the real function
-
-    #; Load "current" in xmm0
-    movss 16(%ebp), %xmm0
-
-    #; if nframes == 0, go to end
-    movl 12(%ebp), %ecx #; nframes
-    cmp $0, %ecx
-    je .CP_END
-
-    #; create the "abs" mask in %xmm2
-    pushl $2147483647
-    movss (%esp), %xmm2
-    addl $4, %esp
-    shufps $0x00, %xmm2, %xmm2
-
-    #; Check for alignment
-
-    movl 8(%ebp), %edi #; buf
-    movl %edi, %edx #; buf => %edx
-    andl $12, %edx #; mask bits 1 & 2, result = 0, 4, 8 or 12
-    jz .CP_SSE #; if buffer IS aligned
-
-    #; PRE-LOOP
-    #; we iterate 1-3 times, doing normal x87 float comparison
-    #; so we reach a 16 byte aligned "buf" (=%edi) value
-
-.LP_START:
-
-    #; Load next value from the buffer
-    movss (%edi), %xmm1
-    andps %xmm2, %xmm1
-    maxss %xmm1, %xmm0
-
-    #; increment buffer, decrement counter
-    addl $4, %edi #; buf++;
-
-    decl %ecx #; nframes--
-    jz .CP_END #; if we run out of frames, we go to the end
-
-    addl $4, %edx #; one non-aligned byte less
-    cmp $16, %edx
-    jne .LP_START #; if more non-aligned frames exist, we do a do-over
-
-.CP_SSE:
-
-    #; We have reached the 16 byte aligned "buf" ("edi") value
-
-    #; Figure out how many loops we should do
-    movl %ecx, %eax #; copy remaining nframes to %eax for division
-
-    shr $2,%eax #; unsigned divide by 4
-    jz .POST_START
-
-    #; %eax = SSE iterations
-
-    #; current maximum is at %xmm0, but we need to ..
-    shufps $0x00, %xmm0, %xmm0 #; shuffle "current" to all 4 FP's
-
-    #;prefetcht0 16(%edi)
-
-.LP_SSE:
-
-    movaps (%edi), %xmm1
-    andps %xmm2, %xmm1
-    maxps %xmm1, %xmm0
-
-    addl $16, %edi
-
-    decl %eax
-    jnz .LP_SSE
-
-    #; Calculate the maximum value contained in the 4 FP's in %xmm0
-    movaps %xmm0, %xmm1
-    shufps $0x4e, %xmm1, %xmm1 #; shuffle left & right pairs (1234 => 3412)
-    maxps %xmm1, %xmm0 #; maximums of the two pairs
-    movaps %xmm0, %xmm1
-    shufps $0xb1, %xmm1, %xmm1 #; shuffle the floats inside the two pairs (1234 => 2143)
-    maxps %xmm1, %xmm0
-
-    #; now every float in %xmm0 is the same value, current maximum value
-
-    #; Next we need to post-process all remaining frames
-    #; the remaining frame count is in %ecx
-
-    #; if no remaining frames, jump to the end
-
-    andl $3, %ecx #; nframes % 4
-    jz .CP_END
-
-.POST_START:
-
-    movss (%edi), %xmm1
-    andps %xmm2, %xmm1
-    maxss %xmm1, %xmm0
-
-    addl $4, %edi #; buf++;
-
-    decl %ecx #; nframes--;
-    jnz .POST_START
-
-.CP_END:
-
-    #; Load the value from xmm0 to the float stack for returning
-    movss %xmm0, 16(%ebp)
-    flds 16(%ebp)
-
-    popl %edi
-
-    #; return
-    leave
-    ret
-
-.size x86_sse_compute_peak, .-x86_sse_compute_peak
-#; end proc
-
-#ifdef __ELF__
-.section .note.GNU-stack,"",%progbits
-#endif
-
-