-rw-r--r--   muse/al/dsp.cpp      123
-rw-r--r--   muse/al/dsp.h         76
-rw-r--r--   muse/al/dspSSE.cpp   531
-rw-r--r--   muse/al/dspXMM.cpp   115
4 files changed, 845 insertions, 0 deletions
diff --git a/muse/al/dsp.cpp b/muse/al/dsp.cpp new file mode 100644 index 00000000..b840de92 --- /dev/null +++ b/muse/al/dsp.cpp @@ -0,0 +1,123 @@ +//============================================================================= +//  AL +//  Audio Utility Library +//  $Id:$ +// +//  Copyright (C) 2002-2006 by Werner Schweer and others +// +//  This program is free software; you can redistribute it and/or modify +//  it under the terms of the GNU General Public License version 2. +// +//  This program is distributed in the hope that it will be useful, +//  but WITHOUT ANY WARRANTY; without even the implied warranty of +//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +//  GNU General Public License for more details. +// +//  You should have received a copy of the GNU General Public License +//  along with this program; if not, write to the Free Software +//  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +//============================================================================= + +#include "dsp.h" +#include "config.h" + +namespace AL { + +Dsp* dsp; + +#ifdef __i386__ + +//--------------------------------------------------------- +//   DspSSE86 +//--------------------------------------------------------- + +extern "C" { +extern float x86_sse_compute_peak(float*, unsigned, float); +extern void x86_sse_apply_gain_to_buffer(float*, unsigned, float); +extern void x86_sse_mix_buffers_with_gain(float*, float*, unsigned, float); +extern void x86_sse_mix_buffers_no_gain(float*, float*, unsigned); +   }; + +class DspSSE86 : public Dsp { +   public: +      DspSSE86() {} +      virtual ~DspSSE86() {} + +      virtual float peak(float* buf, unsigned n, float current) { +            if ( ((intptr_t)buf % 16) != 0) +                  fprintf(stderr, "peak(): buffer unaligned!\n"); +            return x86_sse_compute_peak(buf, n, current); +            } + +      virtual void applyGainToBuffer(float* buf, unsigned n, float gain) { +            if ( ((intptr_t)buf % 16) != 0) +                  fprintf(stderr, "applyGainToBuffer(): buffer unaligned!\n"); +            x86_sse_apply_gain_to_buffer(buf, n, gain); +            } + +      virtual void mixWithGain(float* dst, float* src, unsigned n, float gain) { +            if ( ((intptr_t)dst & 15) != 0) +                  fprintf(stderr, "mixWithGainain(): dst unaligned!\n"); +            if (((intptr_t)dst & 15) != ((intptr_t)src & 15) ) { +                  fprintf(stderr, "mixWithGain(): dst & src don't have the same alignment!\n"); +                  Dsp::mixWithGain(dst, src,n, gain); +                  } +            else +                  x86_sse_mix_buffers_with_gain(dst, src, n, gain); +            } +      virtual void mix(float* dst, float* src, unsigned n) { +            if ( ((intptr_t)dst & 15) != 0) +                  fprintf(stderr, "mix_buffers_no_gain(): dst unaligned!\n"); +            if ( ((intptr_t)dst & 15) != ((intptr_t)src & 15) ) { +                  fprintf(stderr, "mix_buffers_no_gain(): dst & src don't have the same alignment!\n"); +                  Dsp::mix(dst, src, n); +                  } +            else +                  x86_sse_mix_buffers_no_gain(dst, src, n); +            } +      }; +#endif + +//--------------------------------------------------------- +//   initDsp +//--------------------------------------------------------- + +void initDsp() +      { +#if defined(__i386__) && defined(USE_SSE) +      unsigned long useSSE = 0; + +#ifdef __x86_64__ +      asm ( +         "pushq 
%%rbx\n" +         "movq $1, %%rax\n" +         "cpuid\n" +         "movq %%rdx, %0\n" +         "popq %%rbx\n" +         : "=r" (useSSE) +         : +         : "%rax", "%rcx", "%rdx", "memory"); +#else +      asm ( +         "mov $1, %%eax\n" +         "pushl %%ebx\n" +         "cpuid\n" +         "movl %%edx, %0\n" +         "popl %%ebx\n" +         : "=r" (useSSE) +         : +         : "%eax", "%ecx", "%edx", "memory"); +#endif +      useSSE &= (1 << 25); // bit 25 = SSE support +      if (useSSE) { +            printf("Using SSE optimized routines\n"); +            dsp = new DspSSE86(); +            return; +            } +      // fall through to not hardware optimized routines +#endif +      dsp = new Dsp(); +      } + +} + diff --git a/muse/al/dsp.h b/muse/al/dsp.h new file mode 100644 index 00000000..d8da11dc --- /dev/null +++ b/muse/al/dsp.h @@ -0,0 +1,76 @@ +//============================================================================= +//  AL +//  Audio Utility Library +//  $Id:$ +// +//  Copyright (C) 2002-2006 by Werner Schweer and others +// +//  This program is free software; you can redistribute it and/or modify +//  it under the terms of the GNU General Public License version 2. +// +//  This program is distributed in the hope that it will be useful, +//  but WITHOUT ANY WARRANTY; without even the implied warranty of +//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +//  GNU General Public License for more details. +// +//  You should have received a copy of the GNU General Public License +//  along with this program; if not, write to the Free Software +//  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +//============================================================================= + +#ifndef __DSP_H__ +#define __DSP_H__ + +namespace AL { + + +//--------------------------------------------------------- +//   f_max +//--------------------------------------------------------- + +static inline float f_max(float x, float a) +      { +      x -= a; +      x += fabsf(x); +      x *= 0.5f; +      x += a; +      return x; +      } + +//--------------------------------------------------------- +//   Dsp +//    standard version of all dsp routines without any +//    hw acceleration +//--------------------------------------------------------- + +class Dsp { +   public: +      Dsp() {} +      virtual ~Dsp() {} + +      virtual float peak(float* buf, unsigned n, float current) { +            for (unsigned i = 0; i < n; ++i) +                  current = f_max(current, fabsf(buf[i])); +            return current; +            } +      virtual void applyGainToBuffer(float* buf, unsigned n, float gain) { +            for (unsigned i = 0; i < n; ++i) +                  buf[i] *= gain; +            } +      virtual void mixWithGain(float* dst, float* src, unsigned n, float gain) { +            for (unsigned i = 0; i < n; ++i) +                  dst[i] += src[i] * gain; +            } +      virtual void mix(float* dst, float* src, unsigned n) { +            for (unsigned i = 0; i < n; ++i) +                  dst[i] += src[i]; +            } +      }; + +extern void initDsp(); +extern Dsp* dsp; + +} + +#endif + diff --git a/muse/al/dspSSE.cpp b/muse/al/dspSSE.cpp new file mode 100644 index 00000000..0f3d84b9 --- /dev/null +++ b/muse/al/dspSSE.cpp @@ -0,0 +1,531 @@ +/* +    Copyright (C) 2005 Paul Davis + +    This program is free software; you can redistribute it and/or modify +    it under the terms of the GNU General Public License as published by +    the 
Free Software Foundation; either version 2 of the License, or +    (at your option) any later version. + +    This program is distributed in the hope that it will be useful, +    but WITHOUT ANY WARRANTY; without even the implied warranty of +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +    GNU General Public License for more details. + +    You should have received a copy of the GNU General Public License +    along with this program; if not, write to the Free Software +    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + +	Author: Sampo Savolainen + +    $Id: sse_functions.s 988 2006-10-17 20:40:39Z paul $ +*/ + + +#; void x86_sse_mix_buffers_with_gain (float *dst, float *src, long nframes, float gain); + +.globl x86_sse_mix_buffers_with_gain +	.type	x86_sse_mix_buffers_with_gain,@function + +x86_sse_mix_buffers_with_gain: +#; 8(%ebp)	= float	*dst 	= %edi +#; 12(%ebp) = float *src	= %esi +#; 16(%ebp) = long	nframes = %ecx +#; 20(%ebp) = float	gain    = st(0) + +	pushl %ebp +	movl %esp, %ebp + +	#; save the registers +#;	pushl %eax +	pushl %ebx +#;	pushl %ecx +	pushl %edi +	pushl %esi +	 +	#; if nframes == 0, go to end +	movl 16(%ebp), %ecx #; nframes +	cmp	$0, %ecx +	je	.MBWG_END + +	#; Check for alignment + +	movl 8(%ebp), %edi  #; dst  +	movl 12(%ebp), %esi #; src + +	movl %edi, %eax +	andl $12, %eax #; mask alignemnt offset + +	movl %esi, %ebx +	andl $12, %ebx #; mask alignment offset + +	cmp %eax, %ebx +	jne .MBWG_NONALIGN #; if not aligned, calculate manually + +	#; if we are aligned +	cmp $0, %ebx +	jz .MBWG_SSE +	 +	#; Pre-loop, we need to run 1-3 frames "manually" without +	#; SSE instructions + +	movss 20(%ebp), %xmm1 #; xmm1 + +.MBWG_PRELOOP: +	 +	movss (%esi), %xmm0 +	mulss %xmm1, %xmm0 +	addss (%edi), %xmm0 +	movss %xmm0, (%edi) + +	addl $4, %edi #; dst++ +	addl $4, %esi #; src++ +	decl %ecx 	  #; nframes-- +	jz .MBWG_END + +#;	cmp $0, %ecx +#;	je .MBWG_END #; if we run out of frames, go to end +	 +	addl $4, %ebx +	 +	cmp $16, %ebx #; test if we've reached 16 byte alignment +	jne .MBWG_PRELOOP + + +.MBWG_SSE: + +	cmp $4, %ecx #; we know it's not zero, but if it's not >=4, then +	jnge .MBWG_NONALIGN #; we jump straight to the "normal" code + +	#; copy gain to fill %xmm1 +	movss   20(%ebp), %xmm1 +    shufps  $0x00, %xmm1, %xmm1 + + +.MBWG_SSELOOP: + +	movaps	(%esi), %xmm0 #; source => xmm0 +	mulps	%xmm1,  %xmm0 #; apply gain to source +	addps	(%edi), %xmm0 #; mix with destination +	movaps  %xmm0, (%edi) #; copy result to destination +	 +	addl $16, %edi #; dst+=4 +	addl $16, %esi #; src+=4 + +	subl $4, %ecx #; nframes-=4 +	cmp $4, %ecx +	jge .MBWG_SSELOOP + +	cmp $0, %ecx +	je .MBWG_END + +	#; if there are remaining frames, the nonalign code will do nicely +	#; for the rest 1-3 frames. +	 +.MBWG_NONALIGN: +	#; not aligned! 
+ +	movss 20(%ebp), %xmm1 #; gain => xmm1 + +.MBWG_NONALIGNLOOP: + +	movss (%esi), %xmm0 +	mulss %xmm1, %xmm0 +	addss (%edi), %xmm0 +	movss %xmm0, (%edi) +	 +	addl $4, %edi +	addl $4, %esi +	 +	decl %ecx +	jnz .MBWG_NONALIGNLOOP + +.MBWG_END: + +	popl %esi +	popl %edi +#;	popl %ecx +	popl %ebx +#;	popl %eax +	 +	#; return +	leave +	ret + +.size	x86_sse_mix_buffers_with_gain, .-x86_sse_mix_buffers_with_gain + + + + +#; void x86_sse_mix_buffers_no_gain (float *dst, float *src, long nframes); + +.globl x86_sse_mix_buffers_no_gain +	.type	x86_sse_mix_buffers_no_gain,@function + +x86_sse_mix_buffers_no_gain: +#; 8(%ebp)	= float	*dst 	= %edi +#; 12(%ebp) = float *src	= %esi +#; 16(%ebp) = long	nframes = %ecx + +	pushl %ebp +	movl %esp, %ebp + +	#; save the registers +#;	pushl %eax +	pushl %ebx +#;	pushl %ecx +	pushl %edi +	pushl %esi +	 +	#; the real function + +	#; if nframes == 0, go to end +	movl 16(%ebp), %ecx #; nframes +	cmp	$0, %ecx +	je	.MBNG_END + +	#; Check for alignment + +	movl 8(%ebp), %edi  #; dst  +	movl 12(%ebp), %esi #; src + +	movl %edi, %eax +	andl $12, %eax #; mask alignemnt offset + +	movl %esi, %ebx +	andl $12, %ebx #; mask alignment offset + +	cmp %eax, %ebx +	jne .MBNG_NONALIGN #; if not aligned, calculate manually + +	cmp $0, %ebx +	je .MBNG_SSE + +	#; Pre-loop, we need to run 1-3 frames "manually" without +	#; SSE instructions + +.MBNG_PRELOOP: +		 +	movss (%esi), %xmm0 +	addss (%edi), %xmm0 +	movss %xmm0, (%edi) + +	addl $4, %edi #; dst++ +	addl $4, %esi #; src++ +	decl %ecx 	  #; nframes-- +	jz	.MBNG_END +	addl $4, %ebx +	 +	cmp $16, %ebx #; test if we've reached 16 byte alignment +	jne .MBNG_PRELOOP + +.MBNG_SSE: + +	cmp $4, %ecx #; if there are frames left, but less than 4 +	jnge .MBNG_NONALIGN #; we can't run SSE + +.MBNG_SSELOOP: + +	movaps	(%esi), %xmm0 #; source => xmm0 +	addps	(%edi), %xmm0 #; mix with destination +	movaps  %xmm0, (%edi) #; copy result to destination +	 +	addl $16, %edi #; dst+=4 +	addl $16, %esi #; src+=4 + +	subl $4, %ecx #; nframes-=4 +	cmp $4, %ecx +	jge .MBNG_SSELOOP + +	cmp $0, %ecx +	je .MBNG_END + +	#; if there are remaining frames, the nonalign code will do nicely +	#; for the rest 1-3 frames. +	 +.MBNG_NONALIGN: +	#; not aligned! 
+ +	movss (%esi), %xmm0 #; src => xmm0 +	addss (%edi), %xmm0 #; xmm0 += dst +	movss %xmm0, (%edi) #; xmm0 => dst +	 +	addl $4, %edi +	addl $4, %esi +	 +	decl %ecx +	jnz .MBNG_NONALIGN + +.MBNG_END: + +	popl %esi +	popl %edi +#;	popl %ecx +	popl %ebx +#;	popl %eax +	 +	#; return +	leave +	ret + +.size	x86_sse_mix_buffers_no_gain, .-x86_sse_mix_buffers_no_gain + + + + +#; void x86_sse_apply_gain_to_buffer (float *buf, long nframes, float gain); + +.globl x86_sse_apply_gain_to_buffer +	.type	x86_sse_apply_gain_to_buffer,@function + +x86_sse_apply_gain_to_buffer: +#; 8(%ebp)	= float	*buf 	= %edi +#; 12(%ebp) = long	nframes = %ecx +#; 16(%ebp) = float	gain    = st(0) + +	pushl %ebp +	movl %esp, %ebp + +	#; save %edi +	pushl %edi +	 +	#; the real function + +	#; if nframes == 0, go to end +	movl 12(%ebp), %ecx #; nframes +	cmp	$0, %ecx +	je	.AG_END + +	#; create the gain buffer in %xmm1 +	movss	16(%ebp), %xmm1 +	shufps	$0x00, %xmm1, %xmm1 +	 +	#; Check for alignment + +	movl 8(%ebp), %edi #; buf  +	movl %edi, %edx #; buf => %edx +	andl $12, %edx #; mask bits 1 & 2, result = 0, 4, 8 or 12 +	jz	.AG_SSE #; if buffer IS aligned + +	#; PRE-LOOP +	#; we iterate 1-3 times, doing normal x87 float comparison +	#; so we reach a 16 byte aligned "buf" (=%edi) value + +.AGLP_START: + +	#; Load next value from the buffer +	movss (%edi), %xmm0 +	mulss %xmm1, %xmm0 +	movss %xmm0, (%edi) + +	#; increment buffer, decrement counter +	addl $4, %edi #; buf++; +	 +	decl %ecx   #; nframes-- +	jz	.AG_END #; if we run out of frames, we go to the end +	 +	addl $4, %edx #; one non-aligned byte less +	cmp $16, %edx +	jne .AGLP_START #; if more non-aligned frames exist, we do a do-over + +.AG_SSE: + +	#; We have reached the 16 byte aligned "buf" ("edi") value + +	#; Figure out how many loops we should do +	movl %ecx, %eax #; copy remaining nframes to %eax for division +	movl $0, %edx   #; 0 the edx register +	 +	 +	pushl %edi +	movl $4, %edi +	divl %edi #; %edx = remainder == 0 +	popl %edi + +	#; %eax = SSE iterations +	cmp $0, %eax +	je .AGPOST_START + +	 +.AGLP_SSE: + +	movaps (%edi), %xmm0 +	mulps %xmm1, %xmm0 +	movaps %xmm0, (%edi) + +	addl $16, %edi +#;	subl $4, %ecx   #; nframes-=4 + +	decl %eax +	jnz .AGLP_SSE + +	#; Next we need to post-process all remaining frames +	#; the remaining frame count is in %ecx +	 +	#; if no remaining frames, jump to the end +#;	cmp $0, %ecx +	andl $3, %ecx #; nframes % 4 +	je .AG_END + +.AGPOST_START: + +	movss (%edi), %xmm0 +	mulss %xmm1, %xmm0 +	movss %xmm0, (%edi) + +	#; increment buffer, decrement counter +	addl $4, %edi #; buf++; +	 +	decl %ecx   #; nframes-- +	jnz	.AGPOST_START #; if we run out of frames, we go to the end +	 +.AG_END: + + +	popl %edi +	 +	#; return +	leave +	ret + +.size	x86_sse_apply_gain_to_buffer, .-x86_sse_apply_gain_to_buffer +#; end proc + + + +#; float x86_sse_compute_peak(float *buf, long nframes, float current); + +.globl x86_sse_compute_peak +	.type	x86_sse_compute_peak,@function + +x86_sse_compute_peak: +#; 8(%ebp)	= float	*buf 	= %edi +#; 12(%ebp) = long	nframes = %ecx +#; 16(%ebp) = float	current = st(0) + +	pushl %ebp +	movl %esp, %ebp + +	#; save %edi +	pushl %edi +	 +	#; the real function + +	#; Load "current" in xmm0 +	movss 16(%ebp), %xmm0 + +	#; if nframes == 0, go to end +	movl 12(%ebp), %ecx #; nframes +	cmp	$0, %ecx +	je	.CP_END + +	#; create the "abs" mask in %xmm2 +	pushl	$2147483647 +	movss	(%esp), %xmm2 +	addl    $4, %esp +	shufps	$0x00, %xmm2, %xmm2 + +	#; Check for alignment + +	movl 8(%ebp), %edi #; buf  +	movl %edi, %edx #; buf => 
%edx +	andl $12, %edx #; mask bits 1 & 2, result = 0, 4, 8 or 12 +	jz	.CP_SSE #; if buffer IS aligned + +	#; PRE-LOOP +	#; we iterate 1-3 times, doing normal x87 float comparison +	#; so we reach a 16 byte aligned "buf" (=%edi) value + +.LP_START: + +	#; Load next value from the buffer +	movss (%edi), %xmm1 +	andps %xmm2, %xmm1 +	maxss %xmm1, %xmm0 + +	#; increment buffer, decrement counter +	addl $4, %edi #; buf++; +	 +	decl %ecx   #; nframes-- +	jz	.CP_END #; if we run out of frames, we go to the end +	 +	addl $4, %edx #; one non-aligned byte less +	cmp $16, %edx +	jne .LP_START #; if more non-aligned frames exist, we do a do-over + +.CP_SSE: + +	#; We have reached the 16 byte aligned "buf" ("edi") value + +	#; Figure out how many loops we should do +	movl %ecx, %eax #; copy remaining nframes to %eax for division + +	shr $2,%eax #; unsigned divide by 4 +	jz .POST_START + +	#; %eax = SSE iterations + +	#; current maximum is at %xmm0, but we need to .. +	shufps $0x00, %xmm0, %xmm0 #; shuffle "current" to all 4 FP's + +	#;prefetcht0 16(%edi) + +.LP_SSE: + +	movaps (%edi), %xmm1 +	andps %xmm2, %xmm1 +	maxps %xmm1, %xmm0 + +	addl $16, %edi + +	decl %eax +	jnz .LP_SSE + +	#; Calculate the maximum value contained in the 4 FP's in %xmm0 +	movaps %xmm0, %xmm1 +	shufps $0x4e, %xmm1, %xmm1 #; shuffle left & right pairs (1234 => 3412) +	maxps  %xmm1, %xmm0 #; maximums of the two pairs +	movaps %xmm0, %xmm1 +	shufps $0xb1, %xmm1, %xmm1 #; shuffle the floats inside the two pairs (1234 => 2143) +	maxps  %xmm1, %xmm0  + +	#; now every float in %xmm0 is the same value, current maximum value +	 +	#; Next we need to post-process all remaining frames +	#; the remaining frame count is in %ecx +	 +	#; if no remaining frames, jump to the end + +	andl $3, %ecx #; nframes % 4 +	jz .CP_END + +.POST_START: + +	movss (%edi), %xmm1 +	andps %xmm2, %xmm1 +	maxss %xmm1, %xmm0 +	 +	addl $4, %edi 	#; buf++; +	 +	decl %ecx		#; nframes--; +	jnz .POST_START + +.CP_END: + +	#; Load the value from xmm0 to the float stack for returning +	movss %xmm0, 16(%ebp) +	flds 16(%ebp) + +	popl %edi +	 +	#; return +	leave +	ret + +.size	x86_sse_compute_peak, .-x86_sse_compute_peak +#; end proc + +#ifdef __ELF__ +.section .note.GNU-stack,"",%progbits +#endif + + diff --git a/muse/al/dspXMM.cpp b/muse/al/dspXMM.cpp new file mode 100644 index 00000000..1943fa7d --- /dev/null +++ b/muse/al/dspXMM.cpp @@ -0,0 +1,115 @@ +/* +    Copyright (C) 2007 Paul sDavis +    	Written by Sampo Savolainen + +    This program is free software; you can redistribute it and/or modify +    it under the terms of the GNU General Public License as published by +    the Free Software Foundation; either version 2 of the License, or +    (at your option) any later version. + +    This program is distributed in the hope that it will be useful, +    but WITHOUT ANY WARRANTY; without even the implied warranty of +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +    GNU General Public License for more details. + +    You should have received a copy of the GNU General Public License +    along with this program; if not, write to the Free Software +    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ +*/ + +#include <xmmintrin.h> + +void +x86_sse_find_peaks(float *buf, unsigned nframes, float *min, float *max) +{ +	__m128 current_max, current_min, work; + +	// Load max and min values into all four slots of the XMM registers +	current_min = _mm_set1_ps(*min); +	current_max = _mm_set1_ps(*max); + +	// Work input until "buf" reaches 16 byte alignment +	while ( ((unsigned long)buf) % 16 != 0 && nframes > 0) { + +		// Load the next float into the work buffer +		work = _mm_set1_ps(*buf); + +		current_min = _mm_min_ps(current_min, work); +		current_max = _mm_max_ps(current_max, work); + +		buf++; +		nframes--; +	} + +        // use 64 byte prefetch for quadruple quads +        while (nframes >= 16) { +                __builtin_prefetch(buf+64,0,0); + +                work = _mm_load_ps(buf); +                current_min = _mm_min_ps(current_min, work); +                current_max = _mm_max_ps(current_max, work); +                buf+=4; +                work = _mm_load_ps(buf); +                current_min = _mm_min_ps(current_min, work); +                current_max = _mm_max_ps(current_max, work); +                buf+=4; +                work = _mm_load_ps(buf); +                current_min = _mm_min_ps(current_min, work); +                current_max = _mm_max_ps(current_max, work); +                buf+=4; +                work = _mm_load_ps(buf); +                current_min = _mm_min_ps(current_min, work); +                current_max = _mm_max_ps(current_max, work); +                buf+=4; +                nframes-=16; +        } + +	// work through aligned buffers +	while (nframes >= 4) { + +		work = _mm_load_ps(buf); + +		current_min = _mm_min_ps(current_min, work); +		current_max = _mm_max_ps(current_max, work); + +		buf+=4; +		nframes-=4; +	} + +	// work through the rest < 4 samples +	while ( nframes > 0) { + +		// Load the next float into the work buffer +		work = _mm_set1_ps(*buf); + +		current_min = _mm_min_ps(current_min, work); +		current_max = _mm_max_ps(current_max, work); + +		buf++; +		nframes--; +	} + +	// Find min & max value in current_max through shuffle tricks + +	work = current_min; +	work = _mm_shuffle_ps(work, work, _MM_SHUFFLE(2, 3, 0, 1)); +	work = _mm_min_ps (work, current_min); +	current_min = work; +	work = _mm_shuffle_ps(work, work, _MM_SHUFFLE(1, 0, 3, 2)); +	work = _mm_min_ps (work, current_min); + +	_mm_store_ss(min, work); + +	work = current_max; +	work = _mm_shuffle_ps(work, work, _MM_SHUFFLE(2, 3, 0, 1)); +	work = _mm_max_ps (work, current_max); +	current_max = work; +	work = _mm_shuffle_ps(work, work, _MM_SHUFFLE(1, 0, 3, 2)); +	work = _mm_max_ps (work, current_max); + +	_mm_store_ss(max, work); +} + + +  | 
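The runtime dispatch in initDsp() comes down to reading CPUID leaf 1 and testing EDX bit 25. For reference, the same test can be written without inline assembly; the following is only an illustrative sketch, assuming a GCC or Clang toolchain that ships <cpuid.h> (the committed code issues CPUID by hand and does not use this helper):

#include <cpuid.h>   // GCC/Clang wrapper around the CPUID instruction

// Hypothetical helper, not part of the patch: the same test as the inline
// asm in initDsp(). CPUID leaf 1, EDX bit 25 set means SSE is available.
static bool cpuHasSSE()
{
    unsigned eax = 0, ebx = 0, ecx = 0, edx = 0;
    if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
        return false;                   // CPUID leaf 1 not supported
    return (edx & (1u << 25)) != 0;     // bit 25 = SSE
}

SSE is part of the x86-64 baseline, so the probe mainly matters for 32-bit builds; the patch nevertheless carries both a 32-bit and a 64-bit CPUID variant.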
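Taken together, the four files leave client code with a single AL::dsp pointer that refers to either the plain Dsp fallback or the SSE-backed DspSSE86. Below is a minimal, hypothetical usage sketch, not part of the patch: it assumes the AL sources above are compiled and linked in, includes <math.h> explicitly because dsp.h uses fabsf but includes no headers itself as committed, and requests 16-byte-aligned buffers via posix_memalign because the SSE routines warn on unaligned data:

#include <math.h>     // fabsf, used by f_max() in dsp.h
#include <stdio.h>
#include <stdlib.h>   // posix_memalign, free
#include "dsp.h"

int main()
{
    AL::initDsp();   // installs DspSSE86 when CPUID reports SSE, plain Dsp otherwise

    const unsigned n = 1024;
    float *dst = 0, *src = 0;
    // 16-byte alignment keeps the movaps-based SSE loops on their fast path.
    if (posix_memalign((void**)&dst, 16, n * sizeof(float)) ||
        posix_memalign((void**)&src, 16, n * sizeof(float)))
        return 1;

    for (unsigned i = 0; i < n; ++i) {
        dst[i] = 0.0f;
        src[i] = (i & 1) ? 0.5f : -0.25f;
    }

    AL::dsp->mixWithGain(dst, src, n, 0.7f);            // dst[i] += src[i] * 0.7
    printf("peak: %f\n", AL::dsp->peak(dst, n, 0.0f));  // absolute peak of dst

    free(dst);
    free(src);
    return 0;
}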
