summaryrefslogtreecommitdiff
path: root/muse_qt4_evolution/al/dspSSE.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'muse_qt4_evolution/al/dspSSE.cpp')
-rw-r--r--muse_qt4_evolution/al/dspSSE.cpp531
1 files changed, 0 insertions, 531 deletions
diff --git a/muse_qt4_evolution/al/dspSSE.cpp b/muse_qt4_evolution/al/dspSSE.cpp
deleted file mode 100644
index 0f3d84b9..00000000
--- a/muse_qt4_evolution/al/dspSSE.cpp
+++ /dev/null
@@ -1,531 +0,0 @@
-/*
- Copyright (C) 2005 Paul Davis
-
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
- Author: Sampo Savolainen
-
- $Id: sse_functions.s 988 2006-10-17 20:40:39Z paul $
-*/
-
-
-#; void x86_sse_mix_buffers_with_gain (float *dst, float *src, long nframes, float gain); -- dst[i] += src[i] * gain for i in 0..nframes-1
-
-.globl x86_sse_mix_buffers_with_gain
- .type x86_sse_mix_buffers_with_gain,@function
-
-x86_sse_mix_buffers_with_gain:
-#; 8(%ebp) = float *dst = %edi (read and written in place)
-#; 12(%ebp) = float *src = %esi (only read)
-#; 16(%ebp) = long nframes = %ecx (counted down to zero)
-#; 20(%ebp) = float gain = %xmm1 (loaded with movss; st(0) is not touched)
-
- pushl %ebp
- movl %esp, %ebp
-
- #; save %ebx, %edi and %esi: they are overwritten below and restored at .MBWG_END
-#; pushl %eax -- left disabled: %eax is used as scratch below and is not restored
- pushl %ebx
-#; pushl %ecx -- left disabled: %ecx (the frame counter) is not restored either
- pushl %edi
- pushl %esi
-
- #; if nframes == 0, go to end
- movl 16(%ebp), %ecx #; nframes
- cmp $0, %ecx
- je .MBWG_END
-
- #; Check for alignment: only address bits 2-3 of dst and src are compared. NOTE(review): presumably both pointers are at least 4-byte aligned, otherwise movaps below could see an unaligned address -- confirm
-
- movl 8(%ebp), %edi #; dst
- movl 12(%ebp), %esi #; src
-
- movl %edi, %eax
- andl $12, %eax #; keep bits 2-3 of dst: 0, 4, 8 or 12
-
- movl %esi, %ebx
- andl $12, %ebx #; keep bits 2-3 of src: 0, 4, 8 or 12
-
- cmp %eax, %ebx
- jne .MBWG_NONALIGN #; offsets differ: process every frame in the scalar loop
-
- #; offsets match; if they are already zero, go straight to the SSE code
- cmp $0, %ebx
- jz .MBWG_SSE
-
- #; Pre-loop: run 1-3 frames one at a time with scalar SSE instructions
- #; (movss/mulss/addss) until %ebx, the shared offset, reaches 16
-
- movss 20(%ebp), %xmm1 #; gain => low float of xmm1
-
-.MBWG_PRELOOP:
-
- movss (%esi), %xmm0 #; xmm0 = *src
- mulss %xmm1, %xmm0 #; xmm0 *= gain
- addss (%edi), %xmm0 #; xmm0 += *dst
- movss %xmm0, (%edi) #; *dst = xmm0
-
- addl $4, %edi #; dst++
- addl $4, %esi #; src++
- decl %ecx #; nframes--
- jz .MBWG_END #; decl set ZF: no frames left
-
-#; cmp $0, %ecx -- disabled: the decl above already sets ZF,
-#; je .MBWG_END -- so the jz above leaves when we run out of frames
-
- addl $4, %ebx #; offset of the next frame within its 16-byte block
-
- cmp $16, %ebx #; test if we've reached 16 byte alignment
- jne .MBWG_PRELOOP
-
-
-.MBWG_SSE:
-
- cmp $4, %ecx #; we know it's not zero, but if it's not >=4 (signed compare), then
- jnge .MBWG_NONALIGN #; we jump straight to the scalar code
-
- #; broadcast gain into all four floats of %xmm1
- movss 20(%ebp), %xmm1
- shufps $0x00, %xmm1, %xmm1
-
-
-.MBWG_SSELOOP:
-
- movaps (%esi), %xmm0 #; source => xmm0
- mulps %xmm1, %xmm0 #; apply gain to source
- addps (%edi), %xmm0 #; mix with destination
- movaps %xmm0, (%edi) #; copy result to destination
-
- addl $16, %edi #; dst+=4
- addl $16, %esi #; src+=4
-
- subl $4, %ecx #; nframes-=4
- cmp $4, %ecx
- jge .MBWG_SSELOOP #; keep going while at least 4 frames remain
-
- cmp $0, %ecx
- je .MBWG_END
-
- #; if there are remaining frames, the nonalign code will do nicely
- #; for the remaining 1-3 frames.
-
-.MBWG_NONALIGN:
- #; scalar path, one frame per iteration: reached when the offsets differ, when fewer than 4 frames are left for SSE, or for the 1-3 frame tail
-
- movss 20(%ebp), %xmm1 #; gain => xmm1
-
-.MBWG_NONALIGNLOOP:
-
- movss (%esi), %xmm0 #; xmm0 = *src
- mulss %xmm1, %xmm0 #; xmm0 *= gain
- addss (%edi), %xmm0 #; xmm0 += *dst
- movss %xmm0, (%edi) #; *dst = xmm0
-
- addl $4, %edi #; dst++
- addl $4, %esi #; src++
-
- decl %ecx #; nframes--
- jnz .MBWG_NONALIGNLOOP
-
-.MBWG_END:
-
- popl %esi
- popl %edi
-#; popl %ecx -- disabled to match the disabled push at the top
- popl %ebx
-#; popl %eax -- disabled to match the disabled push at the top
-
- #; tear down the frame and return
- leave
- ret
-
-.size x86_sse_mix_buffers_with_gain, .-x86_sse_mix_buffers_with_gain
-
-
-
-
-#; void x86_sse_mix_buffers_no_gain (float *dst, float *src, long nframes); -- dst[i] += src[i] for i in 0..nframes-1
-
-.globl x86_sse_mix_buffers_no_gain
- .type x86_sse_mix_buffers_no_gain,@function
-
-x86_sse_mix_buffers_no_gain:
-#; 8(%ebp) = float *dst = %edi (read and written in place)
-#; 12(%ebp) = float *src = %esi (only read)
-#; 16(%ebp) = long nframes = %ecx (counted down to zero)
-
- pushl %ebp
- movl %esp, %ebp
-
- #; save %ebx, %edi and %esi: they are overwritten below and restored at .MBNG_END
-#; pushl %eax -- left disabled: %eax is used as scratch below and is not restored
- pushl %ebx
-#; pushl %ecx -- left disabled: %ecx (the frame counter) is not restored either
- pushl %edi
- pushl %esi
-
- #; the real function
-
- #; if nframes == 0, go to end
- movl 16(%ebp), %ecx #; nframes
- cmp $0, %ecx
- je .MBNG_END
-
- #; Check for alignment: only address bits 2-3 of dst and src are compared. NOTE(review): presumably both pointers are at least 4-byte aligned, otherwise movaps below could see an unaligned address -- confirm
-
- movl 8(%ebp), %edi #; dst
- movl 12(%ebp), %esi #; src
-
- movl %edi, %eax
- andl $12, %eax #; keep bits 2-3 of dst: 0, 4, 8 or 12
-
- movl %esi, %ebx
- andl $12, %ebx #; keep bits 2-3 of src: 0, 4, 8 or 12
-
- cmp %eax, %ebx
- jne .MBNG_NONALIGN #; offsets differ: process every frame in the scalar loop
-
- cmp $0, %ebx
- je .MBNG_SSE #; offsets match and are zero: no pre-loop needed
-
- #; Pre-loop: run 1-3 frames one at a time with scalar SSE instructions
- #; (movss/addss) until %ebx, the shared offset, reaches 16
-
-.MBNG_PRELOOP:
-
- movss (%esi), %xmm0 #; xmm0 = *src
- addss (%edi), %xmm0 #; xmm0 += *dst
- movss %xmm0, (%edi) #; *dst = xmm0
-
- addl $4, %edi #; dst++
- addl $4, %esi #; src++
- decl %ecx #; nframes--
- jz .MBNG_END #; decl set ZF: no frames left
- addl $4, %ebx #; offset of the next frame within its 16-byte block
-
- cmp $16, %ebx #; test if we've reached 16 byte alignment
- jne .MBNG_PRELOOP
-
-.MBNG_SSE:
-
- cmp $4, %ecx #; if there are frames left, but less than 4 (signed compare)
- jnge .MBNG_NONALIGN #; we can't run SSE
-
-.MBNG_SSELOOP:
-
- movaps (%esi), %xmm0 #; source => xmm0
- addps (%edi), %xmm0 #; mix with destination
- movaps %xmm0, (%edi) #; copy result to destination
-
- addl $16, %edi #; dst+=4
- addl $16, %esi #; src+=4
-
- subl $4, %ecx #; nframes-=4
- cmp $4, %ecx
- jge .MBNG_SSELOOP #; keep going while at least 4 frames remain
-
- cmp $0, %ecx
- je .MBNG_END
-
- #; if there are remaining frames, the nonalign code will do nicely
- #; for the remaining 1-3 frames.
-
-.MBNG_NONALIGN:
- #; scalar path, one frame per iteration: reached when the offsets differ, when fewer than 4 frames are left for SSE, or for the 1-3 frame tail
-
- movss (%esi), %xmm0 #; src => xmm0
- addss (%edi), %xmm0 #; xmm0 += dst
- movss %xmm0, (%edi) #; xmm0 => dst
-
- addl $4, %edi #; dst++
- addl $4, %esi #; src++
-
- decl %ecx #; nframes--
- jnz .MBNG_NONALIGN
-
-.MBNG_END:
-
- popl %esi
- popl %edi
-#; popl %ecx -- disabled to match the disabled push at the top
- popl %ebx
-#; popl %eax -- disabled to match the disabled push at the top
-
- #; tear down the frame and return
- leave
- ret
-
-.size x86_sse_mix_buffers_no_gain, .-x86_sse_mix_buffers_no_gain
-
-
-
-
-#; void x86_sse_apply_gain_to_buffer (float *buf, long nframes, float gain);
-#; Multiplies each of the nframes floats at buf by gain, in place.
-.globl x86_sse_apply_gain_to_buffer
- .type x86_sse_apply_gain_to_buffer,@function
-
-x86_sse_apply_gain_to_buffer:
-#; 8(%ebp) = float *buf = %edi
-#; 12(%ebp) = long nframes = %ecx
-#; 16(%ebp) = float gain = %xmm1 (copied into all four lanes)
-#; scratch: %eax = SSE iteration count, %edx = alignment offset, %xmm0 = data
- pushl %ebp
- movl %esp, %ebp
-
- #; save %edi
- pushl %edi
-
- #; the real function
-
- #; if nframes == 0, go to end
- movl 12(%ebp), %ecx #; nframes
- cmp $0, %ecx
- je .AG_END
-
- #; create the gain buffer in %xmm1
- movss 16(%ebp), %xmm1
- shufps $0x00, %xmm1, %xmm1
-
- #; Check for alignment
-
- movl 8(%ebp), %edi #; buf
- movl %edi, %edx #; buf => %edx
- andl $12, %edx #; keep bits 2 & 3, result = 0, 4, 8 or 12
- jz .AG_SSE #; if buffer IS aligned
-
- #; PRE-LOOP
- #; we iterate 1-3 times, scaling one float per pass with scalar SSE,
- #; so we reach a 16 byte aligned "buf" (=%edi) value
-
-.AGLP_START:
-
- #; Scale the next value in the buffer
- movss (%edi), %xmm0
- mulss %xmm1, %xmm0
- movss %xmm0, (%edi)
-
- #; increment buffer, decrement counter
- addl $4, %edi #; buf++;
-
- decl %ecx #; nframes--
- jz .AG_END #; if we run out of frames, we go to the end
-
- addl $4, %edx #; one non-aligned frame (4 bytes) less
- cmp $16, %edx
- jne .AGLP_START #; if more non-aligned frames exist, we do a do-over
-
-.AG_SSE:
-
- #; We have reached the 16 byte aligned "buf" ("edi") value
-
- #; Figure out how many loops we should do
- movl %ecx, %eax #; copy remaining nframes to %eax
- shrl $2, %eax #; %eax = nframes / 4 (unsigned) = SSE iterations
-
- #; A shift is used rather than divl: the divisor is a power of two,
- #; and this way neither %edx nor %edi is needed for the division.
- #; The remainder is not computed here; the tail code below takes it
- #; from %ecx with "andl $3".
-
- #; shrl sets ZF from its result, so no separate compare with 0 is needed.
- #; Fewer than 4 frames left: skip the SSE loop; %ecx (1-3) is already
- #; the tail count.
- jz .AGPOST_START
-
-
-.AGLP_SSE:
-
- movaps (%edi), %xmm0
- mulps %xmm1, %xmm0
- movaps %xmm0, (%edi)
-
- addl $16, %edi
-#; %ecx is deliberately left untouched here; its low two bits give the tail count
-
- decl %eax
- jnz .AGLP_SSE
-
- #; Next we need to post-process all remaining frames
- #; the remaining frame count is nframes % 4, taken from %ecx below
-
- #; if no remaining frames, jump to the end
- #; (andl sets ZF, so no separate compare is needed)
- andl $3, %ecx #; nframes % 4
- je .AG_END
-
-.AGPOST_START:
-
- movss (%edi), %xmm0
- mulss %xmm1, %xmm0
- movss %xmm0, (%edi)
-
- #; increment buffer, decrement counter
- addl $4, %edi #; buf++;
-
- decl %ecx #; nframes--
- jnz .AGPOST_START #; loop while tail frames remain
-
-.AG_END:
-
-
- popl %edi
-
- #; return
- leave
- ret
-
-.size x86_sse_apply_gain_to_buffer, .-x86_sse_apply_gain_to_buffer
-#; end proc
-
-
-
-#; float x86_sse_compute_peak(float *buf, long nframes, float current); -- returns the maximum of current and |buf[i]| over the nframes floats at buf
-
-.globl x86_sse_compute_peak
- .type x86_sse_compute_peak,@function
-
-x86_sse_compute_peak:
-#; 8(%ebp) = float *buf = %edi (only read)
-#; 12(%ebp) = long nframes = %ecx
-#; 16(%ebp) = float current = %xmm0; this slot is reused at .CP_END to move the result into st(0)
-
- pushl %ebp
- movl %esp, %ebp
-
- #; save %edi
- pushl %edi
-
- #; the real function
-
- #; Load "current" in xmm0 (done before the nframes test so that it is the result when nframes == 0)
- movss 16(%ebp), %xmm0
-
- #; if nframes == 0, go to end
- movl 12(%ebp), %ecx #; nframes
- cmp $0, %ecx
- je .CP_END
-
- #; create the "abs" mask in %xmm2: 2147483647 = 0x7fffffff, every bit set except the sign bit
- pushl $2147483647 #; stage the mask on the stack so movss can load it
- movss (%esp), %xmm2
- addl $4, %esp #; drop the temporary again
- shufps $0x00, %xmm2, %xmm2 #; copy the mask into all four floats
-
- #; Check for alignment
-
- movl 8(%ebp), %edi #; buf
- movl %edi, %edx #; buf => %edx
- andl $12, %edx #; keep bits 2 & 3, result = 0, 4, 8 or 12
- jz .CP_SSE #; if buffer IS aligned
-
- #; PRE-LOOP
- #; we iterate 1-3 times, doing one scalar SSE (andps/maxss) comparison per frame,
- #; so we reach a 16 byte aligned "buf" (=%edi) value
-
-.LP_START:
-
- #; Load next value from the buffer, clear its sign bit, keep the larger value in xmm0
- movss (%edi), %xmm1
- andps %xmm2, %xmm1
- maxss %xmm1, %xmm0
-
- #; increment buffer, decrement counter
- addl $4, %edi #; buf++;
-
- decl %ecx #; nframes--
- jz .CP_END #; if we run out of frames, we go to the end
-
- addl $4, %edx #; one non-aligned frame (4 bytes) less
- cmp $16, %edx
- jne .LP_START #; if more non-aligned frames exist, we do a do-over
-
-.CP_SSE:
-
- #; We have reached the 16 byte aligned "buf" ("edi") value
-
- #; Figure out how many loops we should do
- movl %ecx, %eax #; copy remaining nframes to %eax for division
-
- shr $2,%eax #; unsigned divide by 4
- jz .POST_START #; fewer than 4 frames left: scalar tail only
-
- #; %eax = SSE iterations
-
- #; current maximum is at %xmm0, but we need to ..
- shufps $0x00, %xmm0, %xmm0 #; shuffle "current" to all 4 FP's
-
- #;prefetcht0 16(%edi)
-
-.LP_SSE:
-
- movaps (%edi), %xmm1 #; four floats from buf
- andps %xmm2, %xmm1 #; clear the four sign bits
- maxps %xmm1, %xmm0 #; per-float running maximum
-
- addl $16, %edi
-
- decl %eax
- jnz .LP_SSE
-
- #; Calculate the maximum value contained in the 4 FP's in %xmm0
- movaps %xmm0, %xmm1
- shufps $0x4e, %xmm1, %xmm1 #; shuffle left & right pairs (1234 => 3412)
- maxps %xmm1, %xmm0 #; maximums of the two pairs
- movaps %xmm0, %xmm1
- shufps $0xb1, %xmm1, %xmm1 #; shuffle the floats inside the two pairs (1234 => 2143)
- maxps %xmm1, %xmm0
-
- #; now every float in %xmm0 is the same value, current maximum value
-
- #; Next we need to post-process all remaining frames
- #; the remaining frame count is the low two bits of %ecx (nframes % 4)
-
- #; if no remaining frames, jump to the end
-
- andl $3, %ecx #; nframes % 4
- jz .CP_END
-
-.POST_START:
-
- movss (%edi), %xmm1
- andps %xmm2, %xmm1
- maxss %xmm1, %xmm0
-
- addl $4, %edi #; buf++;
-
- decl %ecx #; nframes--;
- jnz .POST_START
-
-.CP_END:
-
- #; Load the value from xmm0 to the float stack for returning
- movss %xmm0, 16(%ebp) #; the "current" argument slot doubles as scratch memory
- flds 16(%ebp) #; result => st(0)
-
- popl %edi
-
- #; return
- leave
- ret
-
-.size x86_sse_compute_peak, .-x86_sse_compute_peak
-#; end proc
-
-#ifdef __ELF__
-.section .note.GNU-stack,"",%progbits
-#endif
-
-