diff options
authorWerner Schweer <>2007-04-17 14:17:41 +0000
committerWerner Schweer <>2007-04-17 14:17:41 +0000
commit71a18c73e704314291ec604c5ca612af91d80a22 (patch)
parent5cb6baa63ddc339a2136957966ba20734c03ada6 (diff)
add missing files
4 files changed, 845 insertions, 0 deletions
diff --git a/muse/al/dsp.cpp b/muse/al/dsp.cpp
new file mode 100644
index 00000000..b840de92
--- /dev/null
+++ b/muse/al/dsp.cpp
@@ -0,0 +1,123 @@
+// AL
+// Audio Utility Library
+// $Id:$
+// Copyright (C) 2002-2006 by Werner Schweer and others
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License version 2.
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// GNU General Public License for more details.
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#include "dsp.h"
+#include "config.h"
+namespace AL {
+// Global DSP backend. initDsp() assigns it either a DspSSE86 instance or a
+// plain Dsp instance.
+Dsp* dsp;
+#ifdef __i386__
+//---------------------------------------------------------
+//   DspSSE86
+//    Dsp backend for 32 bit x86 which forwards the work to
+//    the x86_sse_* routines declared below. Each method
+//    prints a warning on stderr when the buffer it checks
+//    is not 16 byte aligned.
+//---------------------------------------------------------
+
+extern "C" {
+extern float x86_sse_compute_peak(float*, unsigned, float);
+extern void x86_sse_apply_gain_to_buffer(float*, unsigned, float);
+extern void x86_sse_mix_buffers_with_gain(float*, float*, unsigned, float);
+extern void x86_sse_mix_buffers_no_gain(float*, float*, unsigned);
+      };
+
+class DspSSE86 : public Dsp {
+   public:
+      DspSSE86() {}
+      virtual ~DspSSE86() {}
+
+      // Returns the value computed by x86_sse_compute_peak() for the
+      // n floats at buf, starting from "current".
+      virtual float peak(float* buf, unsigned n, float current) {
+            if ( ((intptr_t)buf % 16) != 0)
+                  fprintf(stderr, "peak(): buffer unaligned!\n");
+            return x86_sse_compute_peak(buf, n, current);
+            }
+
+      // Passes buf, n and gain on to x86_sse_apply_gain_to_buffer().
+      virtual void applyGainToBuffer(float* buf, unsigned n, float gain) {
+            if ( ((intptr_t)buf % 16) != 0)
+                  fprintf(stderr, "applyGainToBuffer(): buffer unaligned!\n");
+            x86_sse_apply_gain_to_buffer(buf, n, gain);
+            }
+
+      // Mixes src into dst with gain. When dst and src do not share the
+      // same offset within a 16 byte block the generic Dsp::mixWithGain()
+      // is used instead of the SSE routine.
+      virtual void mixWithGain(float* dst, float* src, unsigned n, float gain) {
+            if ( ((intptr_t)dst & 15) != 0)
+                  fprintf(stderr, "mixWithGain(): dst unaligned!\n");
+            if (((intptr_t)dst & 15) != ((intptr_t)src & 15) ) {
+                  fprintf(stderr, "mixWithGain(): dst & src don't have the same alignment!\n");
+                  Dsp::mixWithGain(dst, src,n, gain);
+                  }
+            else
+                  x86_sse_mix_buffers_with_gain(dst, src, n, gain);
+            }
+
+      // Mixes src into dst without gain; same alignment fallback as
+      // mixWithGain(), using Dsp::mix().
+      virtual void mix(float* dst, float* src, unsigned n) {
+            if ( ((intptr_t)dst & 15) != 0)
+                  fprintf(stderr, "mix_buffers_no_gain(): dst unaligned!\n");
+            if ( ((intptr_t)dst & 15) != ((intptr_t)src & 15) ) {
+                  fprintf(stderr, "mix_buffers_no_gain(): dst & src don't have the same alignment!\n");
+                  Dsp::mix(dst, src, n);
+                  }
+            else
+                  x86_sse_mix_buffers_no_gain(dst, src, n);
+            }
+      };
+#endif // __i386__
+//---------------------------------------------------------
+//   initDsp
+//    Creates the DSP backend and stores it in "dsp".
+//    When built for i386 with USE_SSE defined, CPUID
+//    function 1 is queried and a DspSSE86 is installed if
+//    the SSE feature bit is set; in every other case the
+//    plain Dsp implementation is installed.
+//---------------------------------------------------------
+
+void initDsp()
+      {
+#if defined(__i386__) && defined(USE_SSE)
+      unsigned long useSSE = 0;
+
+      // NOTE(review): this block is only compiled when __i386__ is defined,
+      // so the __x86_64__ variant below is presumably never selected --
+      // confirm whether 64 bit builds were meant to be covered.
+      // NOTE(review): the "=r" output is not prevented from being placed in
+      // the register that is pushed/popped around cpuid; an explicit "=d"
+      // output would avoid that -- verify.
+#ifdef __x86_64__
+      asm (
+         "pushq %%rbx\n"
+         "movq $1, %%rax\n"
+         "cpuid\n"
+         "movq %%rdx, %0\n"
+         "popq %%rbx\n"
+         : "=r" (useSSE)
+         :
+         : "%rax", "%rcx", "%rdx", "memory");
+#else
+      asm (
+         "mov $1, %%eax\n"
+         "pushl %%ebx\n"
+         "cpuid\n"
+         "movl %%edx, %0\n"
+         "popl %%ebx\n"
+         : "=r" (useSSE)
+         :
+         : "%eax", "%ecx", "%edx", "memory");
+#endif
+      useSSE &= (1 << 25); // bit 25 = SSE support
+      if (useSSE) {
+            printf("Using SSE optimized routines\n");
+            dsp = new DspSSE86();
+            return;
+            }
+      // fall through to not hardware optimized routines
+#endif
+      dsp = new Dsp();
+      }
diff --git a/muse/al/dsp.h b/muse/al/dsp.h
new file mode 100644
index 00000000..d8da11dc
--- /dev/null
+++ b/muse/al/dsp.h
@@ -0,0 +1,76 @@
+// AL
+// Audio Utility Library
+// $Id:$
+// Copyright (C) 2002-2006 by Werner Schweer and others
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License version 2.
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// GNU General Public License for more details.
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#ifndef __DSP_H__
+#define __DSP_H__
+namespace AL {
+//---------------------------------------------------------
+//   f_max
+//    Maximum of x and a, computed without a comparison:
+//          ((x - a) + |x - a|) * 0.5 + a
+//---------------------------------------------------------
+
+static inline float f_max(float x, float a)
+      {
+      const float delta   = x - a;
+      const float doubled = delta + fabsf(delta);   // 0 when x <= a
+      const float halved  = doubled * 0.5f;
+      return halved + a;
+      }
+//---------------------------------------------------------
+//   Dsp
+//    Reference implementation of the dsp routines in plain
+//    C++, without any hardware acceleration.
+//---------------------------------------------------------
+
+class Dsp {
+   public:
+      Dsp() {}
+      virtual ~Dsp() {}
+
+      // Folds fabsf() of each of the n samples at buf into "current"
+      // with f_max() and returns the result.
+      virtual float peak(float* buf, unsigned n, float current) {
+            const float* const end = buf + n;
+            for (const float* p = buf; p != end; ++p)
+                  current = f_max(current, fabsf(*p));
+            return current;
+            }
+
+      // Multiplies each of the n samples at buf by gain, in place.
+      virtual void applyGainToBuffer(float* buf, unsigned n, float gain) {
+            for (float* const end = buf + n; buf != end; ++buf)
+                  *buf *= gain;
+            }
+
+      // dst[i] += src[i] * gain for the first n samples.
+      virtual void mixWithGain(float* dst, float* src, unsigned n, float gain) {
+            for (; n != 0; --n, ++dst, ++src)
+                  *dst += *src * gain;
+            }
+
+      // dst[i] += src[i] for the first n samples.
+      virtual void mix(float* dst, float* src, unsigned n) {
+            for (; n != 0; --n, ++dst, ++src)
+                  *dst += *src;
+            }
+      };
+// Creates the DSP backend and stores it in "dsp".
+extern void initDsp();
+// DSP backend set up by initDsp().
+extern Dsp* dsp;
diff --git a/muse/al/dspSSE.cpp b/muse/al/dspSSE.cpp
new file mode 100644
index 00000000..0f3d84b9
--- /dev/null
+++ b/muse/al/dspSSE.cpp
@@ -0,0 +1,531 @@
+ Copyright (C) 2005 Paul Davis
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ GNU General Public License for more details.
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ Author: Sampo Savolainen
+ $Id: sse_functions.s 988 2006-10-17 20:40:39Z paul $
+#; void x86_sse_mix_buffers_with_gain (float *dst, float *src, long nframes, float gain);
+#; Adds src[i] * gain to dst[i]. Packed SSE (movaps/mulps/addps, four floats
+#; per step) is used when dst and src have the same offset inside a 16 byte
+#; block; otherwise single floats are processed with movss/mulss/addss.
+#; NOTE(review): the labels referenced below (.MBWG_SSE, .MBWG_NONALIGN,
+#; .MBWG_END) and the loop back-edges after the "cmp $16" / "cmp $4" tests
+#; are not visible in this listing -- confirm against the original file.
+.globl x86_sse_mix_buffers_with_gain
+ .type x86_sse_mix_buffers_with_gain,@function
+#; 8(%ebp) = float *dst = %edi
+#; 12(%ebp) = float *src = %esi
+#; 16(%ebp) = long nframes = %ecx
+#; 20(%ebp) = float gain = st(0)
+ pushl %ebp
+ movl %esp, %ebp
+ #; save the registers
+#; pushl %eax
+ pushl %ebx
+#; pushl %ecx
+ pushl %edi
+ pushl %esi
+ #; if nframes == 0, go to end
+ movl 16(%ebp), %ecx #; nframes
+ cmp $0, %ecx
+ je .MBWG_END
+ #; Check for alignment
+ movl 8(%ebp), %edi #; dst
+ movl 12(%ebp), %esi #; src
+ movl %edi, %eax
+ andl $12, %eax #; mask alignment offset
+ movl %esi, %ebx
+ andl $12, %ebx #; mask alignment offset
+ cmp %eax, %ebx
+ jne .MBWG_NONALIGN #; if dst and src offsets differ, calculate manually
+ #; if we are aligned
+ cmp $0, %ebx
+ jz .MBWG_SSE
+ #; Pre-loop, we need to run 1-3 frames "manually" without
+ #; packed SSE instructions
+ movss 20(%ebp), %xmm1 #; gain => xmm1
+ movss (%esi), %xmm0
+ mulss %xmm1, %xmm0
+ addss (%edi), %xmm0
+ movss %xmm0, (%edi)
+ addl $4, %edi #; dst++
+ addl $4, %esi #; src++
+ decl %ecx #; nframes--
+ jz .MBWG_END
+#; cmp $0, %ecx
+#; je .MBWG_END #; if we run out of frames, go to end
+ addl $4, %ebx
+ cmp $16, %ebx #; test if we've reached 16 byte alignment
+ cmp $4, %ecx #; we know it's not zero, but if it's not >=4, then
+ jnge .MBWG_NONALIGN #; we jump straight to the "normal" code
+ #; copy gain to fill %xmm1
+ movss 20(%ebp), %xmm1
+ shufps $0x00, %xmm1, %xmm1
+ movaps (%esi), %xmm0 #; source => xmm0
+ mulps %xmm1, %xmm0 #; apply gain to source
+ addps (%edi), %xmm0 #; mix with destination
+ movaps %xmm0, (%edi) #; copy result to destination
+ addl $16, %edi #; dst+=4
+ addl $16, %esi #; src+=4
+ subl $4, %ecx #; nframes-=4
+ cmp $4, %ecx
+ cmp $0, %ecx
+ je .MBWG_END
+ #; if there are remaining frames, the nonalign code will do nicely
+ #; for the remaining 1-3 frames.
+ #; not aligned!
+ movss 20(%ebp), %xmm1 #; gain => xmm1
+ movss (%esi), %xmm0
+ mulss %xmm1, %xmm0
+ addss (%edi), %xmm0
+ movss %xmm0, (%edi)
+ addl $4, %edi
+ addl $4, %esi
+ decl %ecx
+ #; restore the saved registers in reverse order
+ popl %esi
+ popl %edi
+#; popl %ecx
+ popl %ebx
+#; popl %eax
+ #; return
+ leave
+ ret
+.size x86_sse_mix_buffers_with_gain, .-x86_sse_mix_buffers_with_gain
+#; void x86_sse_mix_buffers_no_gain (float *dst, float *src, long nframes);
+#; Adds src[i] to dst[i]. Packed SSE (movaps/addps, four floats per step) is
+#; used when dst and src have the same offset inside a 16 byte block;
+#; otherwise single floats are processed with movss/addss.
+#; NOTE(review): the labels referenced below (.MBNG_SSE, .MBNG_NONALIGN,
+#; .MBNG_END) and the loop back-edges after the "cmp $16" / "cmp $4" tests
+#; are not visible in this listing -- confirm against the original file.
+.globl x86_sse_mix_buffers_no_gain
+ .type x86_sse_mix_buffers_no_gain,@function
+#; 8(%ebp) = float *dst = %edi
+#; 12(%ebp) = float *src = %esi
+#; 16(%ebp) = long nframes = %ecx
+ pushl %ebp
+ movl %esp, %ebp
+ #; save the registers
+#; pushl %eax
+ pushl %ebx
+#; pushl %ecx
+ pushl %edi
+ pushl %esi
+ #; the real function
+ #; if nframes == 0, go to end
+ movl 16(%ebp), %ecx #; nframes
+ cmp $0, %ecx
+ je .MBNG_END
+ #; Check for alignment
+ movl 8(%ebp), %edi #; dst
+ movl 12(%ebp), %esi #; src
+ movl %edi, %eax
+ andl $12, %eax #; mask alignment offset
+ movl %esi, %ebx
+ andl $12, %ebx #; mask alignment offset
+ cmp %eax, %ebx
+ jne .MBNG_NONALIGN #; if dst and src offsets differ, calculate manually
+ cmp $0, %ebx
+ je .MBNG_SSE
+ #; Pre-loop, we need to run 1-3 frames "manually" without
+ #; packed SSE instructions
+ movss (%esi), %xmm0
+ addss (%edi), %xmm0
+ movss %xmm0, (%edi)
+ addl $4, %edi #; dst++
+ addl $4, %esi #; src++
+ decl %ecx #; nframes--
+ jz .MBNG_END
+ addl $4, %ebx
+ cmp $16, %ebx #; test if we've reached 16 byte alignment
+ cmp $4, %ecx #; if there are frames left, but less than 4
+ jnge .MBNG_NONALIGN #; we can't run SSE
+ movaps (%esi), %xmm0 #; source => xmm0
+ addps (%edi), %xmm0 #; mix with destination
+ movaps %xmm0, (%edi) #; copy result to destination
+ addl $16, %edi #; dst+=4
+ addl $16, %esi #; src+=4
+ subl $4, %ecx #; nframes-=4
+ cmp $4, %ecx
+ cmp $0, %ecx
+ je .MBNG_END
+ #; if there are remaining frames, the nonalign code will do nicely
+ #; for the remaining 1-3 frames.
+ #; not aligned!
+ movss (%esi), %xmm0 #; src => xmm0
+ addss (%edi), %xmm0 #; xmm0 += dst
+ movss %xmm0, (%edi) #; xmm0 => dst
+ addl $4, %edi
+ addl $4, %esi
+ decl %ecx
+ #; restore the saved registers in reverse order
+ popl %esi
+ popl %edi
+#; popl %ecx
+ popl %ebx
+#; popl %eax
+ #; return
+ leave
+ ret
+.size x86_sse_mix_buffers_no_gain, .-x86_sse_mix_buffers_no_gain
+#; void x86_sse_apply_gain_to_buffer (float *buf, long nframes, float gain);
+#; Multiplies the floats at buf by gain in place: single floats (mulss)
+#; until buf is 16 byte aligned, then four at a time (mulps), then single
+#; floats again for the nframes % 4 remainder.
+#; NOTE(review): the labels referenced below (.AG_SSE, .AGLP_START,
+#; .AGLP_SSE, .AGPOST_START, .AG_END) and the branch that should follow
+#; "cmp $0, %eax" are not visible in this listing -- confirm against the
+#; original file.
+.globl x86_sse_apply_gain_to_buffer
+ .type x86_sse_apply_gain_to_buffer,@function
+#; 8(%ebp) = float *buf = %edi
+#; 12(%ebp) = long nframes = %ecx
+#; 16(%ebp) = float gain = st(0)
+ pushl %ebp
+ movl %esp, %ebp
+ #; save %edi
+ pushl %edi
+ #; the real function
+ #; if nframes == 0, go to end
+ movl 12(%ebp), %ecx #; nframes
+ cmp $0, %ecx
+ je .AG_END
+ #; create the gain buffer in %xmm1
+ movss 16(%ebp), %xmm1
+ shufps $0x00, %xmm1, %xmm1
+ #; Check for alignment
+ movl 8(%ebp), %edi #; buf
+ movl %edi, %edx #; buf => %edx
+ andl $12, %edx #; mask with 12, result = 0, 4, 8 or 12
+ jz .AG_SSE #; if buffer IS aligned
+ #; we iterate 1-3 times, applying the gain to one float at a time
+ #; so we reach a 16 byte aligned "buf" (=%edi) value
+ #; Load next value from the buffer
+ movss (%edi), %xmm0
+ mulss %xmm1, %xmm0
+ movss %xmm0, (%edi)
+ #; increment buffer, decrement counter
+ addl $4, %edi #; buf++;
+ decl %ecx #; nframes--
+ jz .AG_END #; if we run out of frames, we go to the end
+ addl $4, %edx #; one non-aligned frame less
+ cmp $16, %edx
+ jne .AGLP_START #; if more non-aligned frames exist, we do a do-over
+ #; We have reached the 16 byte aligned "buf" ("edi") value
+ #; Figure out how many loops we should do
+ movl %ecx, %eax #; copy remaining nframes to %eax for division
+ movl $0, %edx #; 0 the edx register
+ pushl %edi
+ movl $4, %edi
+ divl %edi #; %eax = nframes / 4, %edx = remainder
+ popl %edi
+ #; %eax = SSE iterations
+ cmp $0, %eax
+ movaps (%edi), %xmm0
+ mulps %xmm1, %xmm0
+ movaps %xmm0, (%edi)
+ addl $16, %edi
+#; subl $4, %ecx #; nframes-=4
+ decl %eax
+ jnz .AGLP_SSE
+ #; Next we need to post-process all remaining frames
+ #; the remaining frame count is in %ecx
+ #; if no remaining frames, jump to the end
+#; cmp $0, %ecx
+ andl $3, %ecx #; nframes % 4
+ je .AG_END
+ movss (%edi), %xmm0
+ mulss %xmm1, %xmm0
+ movss %xmm0, (%edi)
+ #; increment buffer, decrement counter
+ addl $4, %edi #; buf++;
+ decl %ecx #; nframes--
+ jnz .AGPOST_START #; loop while frames remain
+ popl %edi
+ #; return
+ leave
+ ret
+.size x86_sse_apply_gain_to_buffer, .-x86_sse_apply_gain_to_buffer
+#; end proc
+#; float x86_sse_compute_peak(float *buf, long nframes, float current);
+#; Takes the maximum of "current" and the absolute values of the floats at
+#; buf: single floats (maxss) until buf is 16 byte aligned, four at a time
+#; (maxps) afterwards, single floats again for the nframes % 4 remainder.
+#; The result is stored to 16(%ebp) and loaded with flds for the return.
+#; NOTE(review): the labels referenced below (.CP_SSE, .LP_START, .LP_SSE,
+#; .CP_END) and the back-edge of the post-processing loop are not visible in
+#; this listing -- confirm against the original file.
+.globl x86_sse_compute_peak
+ .type x86_sse_compute_peak,@function
+#; 8(%ebp) = float *buf = %edi
+#; 12(%ebp) = long nframes = %ecx
+#; 16(%ebp) = float current = st(0)
+ pushl %ebp
+ movl %esp, %ebp
+ #; save %edi
+ pushl %edi
+ #; the real function
+ #; Load "current" in xmm0
+ movss 16(%ebp), %xmm0
+ #; if nframes == 0, go to end
+ movl 12(%ebp), %ecx #; nframes
+ cmp $0, %ecx
+ je .CP_END
+ #; create the "abs" mask in %xmm2
+ #; (2147483647 = 0x7fffffff: every bit set except the top one)
+ pushl $2147483647
+ movss (%esp), %xmm2
+ addl $4, %esp
+ shufps $0x00, %xmm2, %xmm2
+ #; Check for alignment
+ movl 8(%ebp), %edi #; buf
+ movl %edi, %edx #; buf => %edx
+ andl $12, %edx #; mask with 12, result = 0, 4, 8 or 12
+ jz .CP_SSE #; if buffer IS aligned
+ #; we iterate 1-3 times, comparing one float at a time
+ #; so we reach a 16 byte aligned "buf" (=%edi) value
+ #; Load next value from the buffer
+ movss (%edi), %xmm1
+ andps %xmm2, %xmm1
+ maxss %xmm1, %xmm0
+ #; increment buffer, decrement counter
+ addl $4, %edi #; buf++;
+ decl %ecx #; nframes--
+ jz .CP_END #; if we run out of frames, we go to the end
+ addl $4, %edx #; one non-aligned frame less
+ cmp $16, %edx
+ jne .LP_START #; if more non-aligned frames exist, we do a do-over
+ #; We have reached the 16 byte aligned "buf" ("edi") value
+ #; Figure out how many loops we should do
+ movl %ecx, %eax #; copy remaining nframes to %eax for division
+ shr $2,%eax #; unsigned divide by 4
+ #; %eax = SSE iterations
+ #; current maximum is at %xmm0, but we need to ..
+ shufps $0x00, %xmm0, %xmm0 #; shuffle "current" to all 4 FP's
+ #;prefetcht0 16(%edi)
+ movaps (%edi), %xmm1
+ andps %xmm2, %xmm1
+ maxps %xmm1, %xmm0
+ addl $16, %edi
+ decl %eax
+ jnz .LP_SSE
+ #; Calculate the maximum value contained in the 4 FP's in %xmm0
+ movaps %xmm0, %xmm1
+ shufps $0x4e, %xmm1, %xmm1 #; shuffle left & right pairs (1234 => 3412)
+ maxps %xmm1, %xmm0 #; maximums of the two pairs
+ movaps %xmm0, %xmm1
+ shufps $0xb1, %xmm1, %xmm1 #; shuffle the floats inside the two pairs (1234 => 2143)
+ maxps %xmm1, %xmm0
+ #; now every float in %xmm0 is the same value, current maximum value
+ #; Next we need to post-process all remaining frames
+ #; the remaining frame count is in %ecx
+ #; if no remaining frames, jump to the end
+ andl $3, %ecx #; nframes % 4
+ jz .CP_END
+ movss (%edi), %xmm1
+ andps %xmm2, %xmm1
+ maxss %xmm1, %xmm0
+ addl $4, %edi #; buf++;
+ decl %ecx #; nframes--;
+ #; Load the value from xmm0 to the float stack for returning
+ movss %xmm0, 16(%ebp)
+ flds 16(%ebp)
+ popl %edi
+ #; return
+ leave
+ ret
+.size x86_sse_compute_peak, .-x86_sse_compute_peak
+#; end proc
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits
diff --git a/muse/al/dspXMM.cpp b/muse/al/dspXMM.cpp
new file mode 100644
index 00000000..1943fa7d
--- /dev/null
+++ b/muse/al/dspXMM.cpp
@@ -0,0 +1,115 @@
+ Copyright (C) 2007 Paul Davis
+ Written by Sampo Savolainen
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ GNU General Public License for more details.
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#include <xmmintrin.h>
+// Scans nframes floats starting at buf and updates *min / *max: both are
+// read as the initial extrema and overwritten with the smallest / largest
+// value seen (including their initial values).
+// NOTE(review): the return type and the enclosing braces of this function
+// are not visible in this listing -- confirm against the original file.
+x86_sse_find_peaks(float *buf, unsigned nframes, float *min, float *max)
+ __m128 current_max, current_min, work;
+ // Load max and min values into all four slots of the XMM registers
+ current_min = _mm_set1_ps(*min);
+ current_max = _mm_set1_ps(*max);
+ // Work input until "buf" reaches 16 byte alignment
+ while ( ((unsigned long)buf) % 16 != 0 && nframes > 0) {
+ // Load the next float into the work buffer
+ work = _mm_set1_ps(*buf);
+ current_min = _mm_min_ps(current_min, work);
+ current_max = _mm_max_ps(current_max, work);
+ buf++;
+ nframes--;
+ }
+ // use 64 byte prefetch for quadruple quads
+ // NOTE(review): buf is a float*, so buf+64 is 64 floats ahead rather than
+ // 64 bytes -- confirm which distance was intended.
+ while (nframes >= 16) {
+ __builtin_prefetch(buf+64,0,0);
+ work = _mm_load_ps(buf);
+ current_min = _mm_min_ps(current_min, work);
+ current_max = _mm_max_ps(current_max, work);
+ buf+=4;
+ work = _mm_load_ps(buf);
+ current_min = _mm_min_ps(current_min, work);
+ current_max = _mm_max_ps(current_max, work);
+ buf+=4;
+ work = _mm_load_ps(buf);
+ current_min = _mm_min_ps(current_min, work);
+ current_max = _mm_max_ps(current_max, work);
+ buf+=4;
+ work = _mm_load_ps(buf);
+ current_min = _mm_min_ps(current_min, work);
+ current_max = _mm_max_ps(current_max, work);
+ buf+=4;
+ nframes-=16;
+ }
+ // work through aligned buffers
+ while (nframes >= 4) {
+ work = _mm_load_ps(buf);
+ current_min = _mm_min_ps(current_min, work);
+ current_max = _mm_max_ps(current_max, work);
+ buf+=4;
+ nframes-=4;
+ }
+ // work through the rest < 4 samples
+ while ( nframes > 0) {
+ // Load the next float into the work buffer
+ work = _mm_set1_ps(*buf);
+ current_min = _mm_min_ps(current_min, work);
+ current_max = _mm_max_ps(current_max, work);
+ buf++;
+ nframes--;
+ }
+ // Reduce the four lanes of current_min to one value through shuffle
+ // tricks (swap neighbours, then swap halves) and store it to *min
+ work = current_min;
+ work = _mm_shuffle_ps(work, work, _MM_SHUFFLE(2, 3, 0, 1));
+ work = _mm_min_ps (work, current_min);
+ current_min = work;
+ work = _mm_shuffle_ps(work, work, _MM_SHUFFLE(1, 0, 3, 2));
+ work = _mm_min_ps (work, current_min);
+ _mm_store_ss(min, work);
+ // Same reduction for current_max, stored to *max
+ work = current_max;
+ work = _mm_shuffle_ps(work, work, _MM_SHUFFLE(2, 3, 0, 1));
+ work = _mm_max_ps (work, current_max);
+ current_max = work;
+ work = _mm_shuffle_ps(work, work, _MM_SHUFFLE(1, 0, 3, 2));
+ work = _mm_max_ps (work, current_max);
+ _mm_store_ss(max, work);