diff options
-rw-r--r-- | muse/al/dsp.cpp | 123 | ||||
-rw-r--r-- | muse/al/dsp.h | 76 | ||||
-rw-r--r-- | muse/al/dspSSE.cpp | 531 | ||||
-rw-r--r-- | muse/al/dspXMM.cpp | 115 |
4 files changed, 845 insertions, 0 deletions
diff --git a/muse/al/dsp.cpp b/muse/al/dsp.cpp new file mode 100644 index 00000000..b840de92 --- /dev/null +++ b/muse/al/dsp.cpp @@ -0,0 +1,123 @@ +//============================================================================= +// AL +// Audio Utility Library +// $Id:$ +// +// Copyright (C) 2002-2006 by Werner Schweer and others +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License version 2. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +//============================================================================= + +#include "dsp.h" +#include "config.h" + +namespace AL { + +Dsp* dsp; + +#ifdef __i386__ + +//--------------------------------------------------------- +// DspSSE86 +//--------------------------------------------------------- + +extern "C" { +extern float x86_sse_compute_peak(float*, unsigned, float); +extern void x86_sse_apply_gain_to_buffer(float*, unsigned, float); +extern void x86_sse_mix_buffers_with_gain(float*, float*, unsigned, float); +extern void x86_sse_mix_buffers_no_gain(float*, float*, unsigned); + }; + +class DspSSE86 : public Dsp { + public: + DspSSE86() {} + virtual ~DspSSE86() {} + + virtual float peak(float* buf, unsigned n, float current) { + if ( ((intptr_t)buf % 16) != 0) + fprintf(stderr, "peak(): buffer unaligned!\n"); + return x86_sse_compute_peak(buf, n, current); + } + + virtual void applyGainToBuffer(float* buf, unsigned n, float gain) { + if ( ((intptr_t)buf % 16) != 0) + fprintf(stderr, "applyGainToBuffer(): buffer unaligned!\n"); + x86_sse_apply_gain_to_buffer(buf, n, gain); + } + + virtual void mixWithGain(float* dst, float* src, unsigned n, float gain) { + if ( ((intptr_t)dst & 15) != 0) + fprintf(stderr, "mixWithGainain(): dst unaligned!\n"); + if (((intptr_t)dst & 15) != ((intptr_t)src & 15) ) { + fprintf(stderr, "mixWithGain(): dst & src don't have the same alignment!\n"); + Dsp::mixWithGain(dst, src,n, gain); + } + else + x86_sse_mix_buffers_with_gain(dst, src, n, gain); + } + virtual void mix(float* dst, float* src, unsigned n) { + if ( ((intptr_t)dst & 15) != 0) + fprintf(stderr, "mix_buffers_no_gain(): dst unaligned!\n"); + if ( ((intptr_t)dst & 15) != ((intptr_t)src & 15) ) { + fprintf(stderr, "mix_buffers_no_gain(): dst & src don't have the same alignment!\n"); + Dsp::mix(dst, src, n); + } + else + x86_sse_mix_buffers_no_gain(dst, src, n); + } + }; +#endif + +//--------------------------------------------------------- +// initDsp +//--------------------------------------------------------- + +void initDsp() + { +#if defined(__i386__) && defined(USE_SSE) + unsigned long useSSE = 0; + +#ifdef __x86_64__ + asm ( + "pushq %%rbx\n" + "movq $1, %%rax\n" + "cpuid\n" + "movq %%rdx, %0\n" + "popq %%rbx\n" + : "=r" (useSSE) + : + : "%rax", "%rcx", "%rdx", "memory"); +#else + asm ( + "mov $1, %%eax\n" + "pushl %%ebx\n" + "cpuid\n" + "movl %%edx, %0\n" + "popl %%ebx\n" + : "=r" (useSSE) + : + : "%eax", "%ecx", "%edx", "memory"); +#endif + useSSE &= (1 << 25); // bit 25 = SSE support + if (useSSE) { + printf("Using SSE optimized routines\n"); + dsp = new DspSSE86(); + return; + } + // fall through to not hardware optimized routines +#endif + dsp = new Dsp(); + } + +} + diff --git a/muse/al/dsp.h b/muse/al/dsp.h new file mode 100644 index 00000000..d8da11dc --- /dev/null +++ b/muse/al/dsp.h @@ -0,0 +1,76 @@ +//============================================================================= +// AL +// Audio Utility Library +// $Id:$ +// +// Copyright (C) 2002-2006 by Werner Schweer and others +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License version 2. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +//============================================================================= + +#ifndef __DSP_H__ +#define __DSP_H__ + +namespace AL { + + +//--------------------------------------------------------- +// f_max +//--------------------------------------------------------- + +static inline float f_max(float x, float a) + { + x -= a; + x += fabsf(x); + x *= 0.5f; + x += a; + return x; + } + +//--------------------------------------------------------- +// Dsp +// standard version of all dsp routines without any +// hw acceleration +//--------------------------------------------------------- + +class Dsp { + public: + Dsp() {} + virtual ~Dsp() {} + + virtual float peak(float* buf, unsigned n, float current) { + for (unsigned i = 0; i < n; ++i) + current = f_max(current, fabsf(buf[i])); + return current; + } + virtual void applyGainToBuffer(float* buf, unsigned n, float gain) { + for (unsigned i = 0; i < n; ++i) + buf[i] *= gain; + } + virtual void mixWithGain(float* dst, float* src, unsigned n, float gain) { + for (unsigned i = 0; i < n; ++i) + dst[i] += src[i] * gain; + } + virtual void mix(float* dst, float* src, unsigned n) { + for (unsigned i = 0; i < n; ++i) + dst[i] += src[i]; + } + }; + +extern void initDsp(); +extern Dsp* dsp; + +} + +#endif + diff --git a/muse/al/dspSSE.cpp b/muse/al/dspSSE.cpp new file mode 100644 index 00000000..0f3d84b9 --- /dev/null +++ b/muse/al/dspSSE.cpp @@ -0,0 +1,531 @@ +/* + Copyright (C) 2005 Paul Davis + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + Author: Sampo Savolainen + + $Id: sse_functions.s 988 2006-10-17 20:40:39Z paul $ +*/ + + +#; void x86_sse_mix_buffers_with_gain (float *dst, float *src, long nframes, float gain); + +.globl x86_sse_mix_buffers_with_gain + .type x86_sse_mix_buffers_with_gain,@function + +x86_sse_mix_buffers_with_gain: +#; 8(%ebp) = float *dst = %edi +#; 12(%ebp) = float *src = %esi +#; 16(%ebp) = long nframes = %ecx +#; 20(%ebp) = float gain = st(0) + + pushl %ebp + movl %esp, %ebp + + #; save the registers +#; pushl %eax + pushl %ebx +#; pushl %ecx + pushl %edi + pushl %esi + + #; if nframes == 0, go to end + movl 16(%ebp), %ecx #; nframes + cmp $0, %ecx + je .MBWG_END + + #; Check for alignment + + movl 8(%ebp), %edi #; dst + movl 12(%ebp), %esi #; src + + movl %edi, %eax + andl $12, %eax #; mask alignemnt offset + + movl %esi, %ebx + andl $12, %ebx #; mask alignment offset + + cmp %eax, %ebx + jne .MBWG_NONALIGN #; if not aligned, calculate manually + + #; if we are aligned + cmp $0, %ebx + jz .MBWG_SSE + + #; Pre-loop, we need to run 1-3 frames "manually" without + #; SSE instructions + + movss 20(%ebp), %xmm1 #; xmm1 + +.MBWG_PRELOOP: + + movss (%esi), %xmm0 + mulss %xmm1, %xmm0 + addss (%edi), %xmm0 + movss %xmm0, (%edi) + + addl $4, %edi #; dst++ + addl $4, %esi #; src++ + decl %ecx #; nframes-- + jz .MBWG_END + +#; cmp $0, %ecx +#; je .MBWG_END #; if we run out of frames, go to end + + addl $4, %ebx + + cmp $16, %ebx #; test if we've reached 16 byte alignment + jne .MBWG_PRELOOP + + +.MBWG_SSE: + + cmp $4, %ecx #; we know it's not zero, but if it's not >=4, then + jnge .MBWG_NONALIGN #; we jump straight to the "normal" code + + #; copy gain to fill %xmm1 + movss 20(%ebp), %xmm1 + shufps $0x00, %xmm1, %xmm1 + + +.MBWG_SSELOOP: + + movaps (%esi), %xmm0 #; source => xmm0 + mulps %xmm1, %xmm0 #; apply gain to source + addps (%edi), %xmm0 #; mix with destination + movaps %xmm0, (%edi) #; copy result to destination + + addl $16, %edi #; dst+=4 + addl $16, %esi #; src+=4 + + subl $4, %ecx #; nframes-=4 + cmp $4, %ecx + jge .MBWG_SSELOOP + + cmp $0, %ecx + je .MBWG_END + + #; if there are remaining frames, the nonalign code will do nicely + #; for the rest 1-3 frames. + +.MBWG_NONALIGN: + #; not aligned! + + movss 20(%ebp), %xmm1 #; gain => xmm1 + +.MBWG_NONALIGNLOOP: + + movss (%esi), %xmm0 + mulss %xmm1, %xmm0 + addss (%edi), %xmm0 + movss %xmm0, (%edi) + + addl $4, %edi + addl $4, %esi + + decl %ecx + jnz .MBWG_NONALIGNLOOP + +.MBWG_END: + + popl %esi + popl %edi +#; popl %ecx + popl %ebx +#; popl %eax + + #; return + leave + ret + +.size x86_sse_mix_buffers_with_gain, .-x86_sse_mix_buffers_with_gain + + + + +#; void x86_sse_mix_buffers_no_gain (float *dst, float *src, long nframes); + +.globl x86_sse_mix_buffers_no_gain + .type x86_sse_mix_buffers_no_gain,@function + +x86_sse_mix_buffers_no_gain: +#; 8(%ebp) = float *dst = %edi +#; 12(%ebp) = float *src = %esi +#; 16(%ebp) = long nframes = %ecx + + pushl %ebp + movl %esp, %ebp + + #; save the registers +#; pushl %eax + pushl %ebx +#; pushl %ecx + pushl %edi + pushl %esi + + #; the real function + + #; if nframes == 0, go to end + movl 16(%ebp), %ecx #; nframes + cmp $0, %ecx + je .MBNG_END + + #; Check for alignment + + movl 8(%ebp), %edi #; dst + movl 12(%ebp), %esi #; src + + movl %edi, %eax + andl $12, %eax #; mask alignemnt offset + + movl %esi, %ebx + andl $12, %ebx #; mask alignment offset + + cmp %eax, %ebx + jne .MBNG_NONALIGN #; if not aligned, calculate manually + + cmp $0, %ebx + je .MBNG_SSE + + #; Pre-loop, we need to run 1-3 frames "manually" without + #; SSE instructions + +.MBNG_PRELOOP: + + movss (%esi), %xmm0 + addss (%edi), %xmm0 + movss %xmm0, (%edi) + + addl $4, %edi #; dst++ + addl $4, %esi #; src++ + decl %ecx #; nframes-- + jz .MBNG_END + addl $4, %ebx + + cmp $16, %ebx #; test if we've reached 16 byte alignment + jne .MBNG_PRELOOP + +.MBNG_SSE: + + cmp $4, %ecx #; if there are frames left, but less than 4 + jnge .MBNG_NONALIGN #; we can't run SSE + +.MBNG_SSELOOP: + + movaps (%esi), %xmm0 #; source => xmm0 + addps (%edi), %xmm0 #; mix with destination + movaps %xmm0, (%edi) #; copy result to destination + + addl $16, %edi #; dst+=4 + addl $16, %esi #; src+=4 + + subl $4, %ecx #; nframes-=4 + cmp $4, %ecx + jge .MBNG_SSELOOP + + cmp $0, %ecx + je .MBNG_END + + #; if there are remaining frames, the nonalign code will do nicely + #; for the rest 1-3 frames. + +.MBNG_NONALIGN: + #; not aligned! + + movss (%esi), %xmm0 #; src => xmm0 + addss (%edi), %xmm0 #; xmm0 += dst + movss %xmm0, (%edi) #; xmm0 => dst + + addl $4, %edi + addl $4, %esi + + decl %ecx + jnz .MBNG_NONALIGN + +.MBNG_END: + + popl %esi + popl %edi +#; popl %ecx + popl %ebx +#; popl %eax + + #; return + leave + ret + +.size x86_sse_mix_buffers_no_gain, .-x86_sse_mix_buffers_no_gain + + + + +#; void x86_sse_apply_gain_to_buffer (float *buf, long nframes, float gain); + +.globl x86_sse_apply_gain_to_buffer + .type x86_sse_apply_gain_to_buffer,@function + +x86_sse_apply_gain_to_buffer: +#; 8(%ebp) = float *buf = %edi +#; 12(%ebp) = long nframes = %ecx +#; 16(%ebp) = float gain = st(0) + + pushl %ebp + movl %esp, %ebp + + #; save %edi + pushl %edi + + #; the real function + + #; if nframes == 0, go to end + movl 12(%ebp), %ecx #; nframes + cmp $0, %ecx + je .AG_END + + #; create the gain buffer in %xmm1 + movss 16(%ebp), %xmm1 + shufps $0x00, %xmm1, %xmm1 + + #; Check for alignment + + movl 8(%ebp), %edi #; buf + movl %edi, %edx #; buf => %edx + andl $12, %edx #; mask bits 1 & 2, result = 0, 4, 8 or 12 + jz .AG_SSE #; if buffer IS aligned + + #; PRE-LOOP + #; we iterate 1-3 times, doing normal x87 float comparison + #; so we reach a 16 byte aligned "buf" (=%edi) value + +.AGLP_START: + + #; Load next value from the buffer + movss (%edi), %xmm0 + mulss %xmm1, %xmm0 + movss %xmm0, (%edi) + + #; increment buffer, decrement counter + addl $4, %edi #; buf++; + + decl %ecx #; nframes-- + jz .AG_END #; if we run out of frames, we go to the end + + addl $4, %edx #; one non-aligned byte less + cmp $16, %edx + jne .AGLP_START #; if more non-aligned frames exist, we do a do-over + +.AG_SSE: + + #; We have reached the 16 byte aligned "buf" ("edi") value + + #; Figure out how many loops we should do + movl %ecx, %eax #; copy remaining nframes to %eax for division + movl $0, %edx #; 0 the edx register + + + pushl %edi + movl $4, %edi + divl %edi #; %edx = remainder == 0 + popl %edi + + #; %eax = SSE iterations + cmp $0, %eax + je .AGPOST_START + + +.AGLP_SSE: + + movaps (%edi), %xmm0 + mulps %xmm1, %xmm0 + movaps %xmm0, (%edi) + + addl $16, %edi +#; subl $4, %ecx #; nframes-=4 + + decl %eax + jnz .AGLP_SSE + + #; Next we need to post-process all remaining frames + #; the remaining frame count is in %ecx + + #; if no remaining frames, jump to the end +#; cmp $0, %ecx + andl $3, %ecx #; nframes % 4 + je .AG_END + +.AGPOST_START: + + movss (%edi), %xmm0 + mulss %xmm1, %xmm0 + movss %xmm0, (%edi) + + #; increment buffer, decrement counter + addl $4, %edi #; buf++; + + decl %ecx #; nframes-- + jnz .AGPOST_START #; if we run out of frames, we go to the end + +.AG_END: + + + popl %edi + + #; return + leave + ret + +.size x86_sse_apply_gain_to_buffer, .-x86_sse_apply_gain_to_buffer +#; end proc + + + +#; float x86_sse_compute_peak(float *buf, long nframes, float current); + +.globl x86_sse_compute_peak + .type x86_sse_compute_peak,@function + +x86_sse_compute_peak: +#; 8(%ebp) = float *buf = %edi +#; 12(%ebp) = long nframes = %ecx +#; 16(%ebp) = float current = st(0) + + pushl %ebp + movl %esp, %ebp + + #; save %edi + pushl %edi + + #; the real function + + #; Load "current" in xmm0 + movss 16(%ebp), %xmm0 + + #; if nframes == 0, go to end + movl 12(%ebp), %ecx #; nframes + cmp $0, %ecx + je .CP_END + + #; create the "abs" mask in %xmm2 + pushl $2147483647 + movss (%esp), %xmm2 + addl $4, %esp + shufps $0x00, %xmm2, %xmm2 + + #; Check for alignment + + movl 8(%ebp), %edi #; buf + movl %edi, %edx #; buf => %edx + andl $12, %edx #; mask bits 1 & 2, result = 0, 4, 8 or 12 + jz .CP_SSE #; if buffer IS aligned + + #; PRE-LOOP + #; we iterate 1-3 times, doing normal x87 float comparison + #; so we reach a 16 byte aligned "buf" (=%edi) value + +.LP_START: + + #; Load next value from the buffer + movss (%edi), %xmm1 + andps %xmm2, %xmm1 + maxss %xmm1, %xmm0 + + #; increment buffer, decrement counter + addl $4, %edi #; buf++; + + decl %ecx #; nframes-- + jz .CP_END #; if we run out of frames, we go to the end + + addl $4, %edx #; one non-aligned byte less + cmp $16, %edx + jne .LP_START #; if more non-aligned frames exist, we do a do-over + +.CP_SSE: + + #; We have reached the 16 byte aligned "buf" ("edi") value + + #; Figure out how many loops we should do + movl %ecx, %eax #; copy remaining nframes to %eax for division + + shr $2,%eax #; unsigned divide by 4 + jz .POST_START + + #; %eax = SSE iterations + + #; current maximum is at %xmm0, but we need to .. + shufps $0x00, %xmm0, %xmm0 #; shuffle "current" to all 4 FP's + + #;prefetcht0 16(%edi) + +.LP_SSE: + + movaps (%edi), %xmm1 + andps %xmm2, %xmm1 + maxps %xmm1, %xmm0 + + addl $16, %edi + + decl %eax + jnz .LP_SSE + + #; Calculate the maximum value contained in the 4 FP's in %xmm0 + movaps %xmm0, %xmm1 + shufps $0x4e, %xmm1, %xmm1 #; shuffle left & right pairs (1234 => 3412) + maxps %xmm1, %xmm0 #; maximums of the two pairs + movaps %xmm0, %xmm1 + shufps $0xb1, %xmm1, %xmm1 #; shuffle the floats inside the two pairs (1234 => 2143) + maxps %xmm1, %xmm0 + + #; now every float in %xmm0 is the same value, current maximum value + + #; Next we need to post-process all remaining frames + #; the remaining frame count is in %ecx + + #; if no remaining frames, jump to the end + + andl $3, %ecx #; nframes % 4 + jz .CP_END + +.POST_START: + + movss (%edi), %xmm1 + andps %xmm2, %xmm1 + maxss %xmm1, %xmm0 + + addl $4, %edi #; buf++; + + decl %ecx #; nframes--; + jnz .POST_START + +.CP_END: + + #; Load the value from xmm0 to the float stack for returning + movss %xmm0, 16(%ebp) + flds 16(%ebp) + + popl %edi + + #; return + leave + ret + +.size x86_sse_compute_peak, .-x86_sse_compute_peak +#; end proc + +#ifdef __ELF__ +.section .note.GNU-stack,"",%progbits +#endif + + diff --git a/muse/al/dspXMM.cpp b/muse/al/dspXMM.cpp new file mode 100644 index 00000000..1943fa7d --- /dev/null +++ b/muse/al/dspXMM.cpp @@ -0,0 +1,115 @@ +/* + Copyright (C) 2007 Paul sDavis + Written by Sampo Savolainen + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + +*/ + +#include <xmmintrin.h> + +void +x86_sse_find_peaks(float *buf, unsigned nframes, float *min, float *max) +{ + __m128 current_max, current_min, work; + + // Load max and min values into all four slots of the XMM registers + current_min = _mm_set1_ps(*min); + current_max = _mm_set1_ps(*max); + + // Work input until "buf" reaches 16 byte alignment + while ( ((unsigned long)buf) % 16 != 0 && nframes > 0) { + + // Load the next float into the work buffer + work = _mm_set1_ps(*buf); + + current_min = _mm_min_ps(current_min, work); + current_max = _mm_max_ps(current_max, work); + + buf++; + nframes--; + } + + // use 64 byte prefetch for quadruple quads + while (nframes >= 16) { + __builtin_prefetch(buf+64,0,0); + + work = _mm_load_ps(buf); + current_min = _mm_min_ps(current_min, work); + current_max = _mm_max_ps(current_max, work); + buf+=4; + work = _mm_load_ps(buf); + current_min = _mm_min_ps(current_min, work); + current_max = _mm_max_ps(current_max, work); + buf+=4; + work = _mm_load_ps(buf); + current_min = _mm_min_ps(current_min, work); + current_max = _mm_max_ps(current_max, work); + buf+=4; + work = _mm_load_ps(buf); + current_min = _mm_min_ps(current_min, work); + current_max = _mm_max_ps(current_max, work); + buf+=4; + nframes-=16; + } + + // work through aligned buffers + while (nframes >= 4) { + + work = _mm_load_ps(buf); + + current_min = _mm_min_ps(current_min, work); + current_max = _mm_max_ps(current_max, work); + + buf+=4; + nframes-=4; + } + + // work through the rest < 4 samples + while ( nframes > 0) { + + // Load the next float into the work buffer + work = _mm_set1_ps(*buf); + + current_min = _mm_min_ps(current_min, work); + current_max = _mm_max_ps(current_max, work); + + buf++; + nframes--; + } + + // Find min & max value in current_max through shuffle tricks + + work = current_min; + work = _mm_shuffle_ps(work, work, _MM_SHUFFLE(2, 3, 0, 1)); + work = _mm_min_ps (work, current_min); + current_min = work; + work = _mm_shuffle_ps(work, work, _MM_SHUFFLE(1, 0, 3, 2)); + work = _mm_min_ps (work, current_min); + + _mm_store_ss(min, work); + + work = current_max; + work = _mm_shuffle_ps(work, work, _MM_SHUFFLE(2, 3, 0, 1)); + work = _mm_max_ps (work, current_max); + current_max = work; + work = _mm_shuffle_ps(work, work, _MM_SHUFFLE(1, 0, 3, 2)); + work = _mm_max_ps (work, current_max); + + _mm_store_ss(max, work); +} + + + |