-rw-r--r--   muse/al/dsp.cpp      123
-rw-r--r--   muse/al/dsp.h         76
-rw-r--r--   muse/al/dspSSE.cpp   531
-rw-r--r--   muse/al/dspXMM.cpp   115
4 files changed, 845 insertions, 0 deletions
diff --git a/muse/al/dsp.cpp b/muse/al/dsp.cpp new file mode 100644 index 00000000..b840de92 --- /dev/null +++ b/muse/al/dsp.cpp @@ -0,0 +1,123 @@ +//============================================================================= +//  AL +//  Audio Utility Library +//  $Id:$ +// +//  Copyright (C) 2002-2006 by Werner Schweer and others +// +//  This program is free software; you can redistribute it and/or modify +//  it under the terms of the GNU General Public License version 2. +// +//  This program is distributed in the hope that it will be useful, +//  but WITHOUT ANY WARRANTY; without even the implied warranty of +//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +//  GNU General Public License for more details. +// +//  You should have received a copy of the GNU General Public License +//  along with this program; if not, write to the Free Software +//  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +//============================================================================= + +#include "dsp.h" +#include "config.h" + +namespace AL { + +Dsp* dsp; + +#ifdef __i386__ + +//--------------------------------------------------------- +//   DspSSE86 +//--------------------------------------------------------- + +extern "C" { +extern float x86_sse_compute_peak(float*, unsigned, float); +extern void x86_sse_apply_gain_to_buffer(float*, unsigned, float); +extern void x86_sse_mix_buffers_with_gain(float*, float*, unsigned, float); +extern void x86_sse_mix_buffers_no_gain(float*, float*, unsigned); +   }; + +class DspSSE86 : public Dsp { +   public: +      DspSSE86() {} +      virtual ~DspSSE86() {} + +      virtual float peak(float* buf, unsigned n, float current) { +            if ( ((intptr_t)buf % 16) != 0) +                  fprintf(stderr, "peak(): buffer unaligned!\n"); +            return x86_sse_compute_peak(buf, n, current); +            } + +      virtual void applyGainToBuffer(float* buf, unsigned n, float gain) { +            if ( ((intptr_t)buf % 16) != 0) +                  fprintf(stderr, "applyGainToBuffer(): buffer unaligned!\n"); +            x86_sse_apply_gain_to_buffer(buf, n, gain); +            } + +      virtual void mixWithGain(float* dst, float* src, unsigned n, float gain) { +            if ( ((intptr_t)dst & 15) != 0) +                  fprintf(stderr, "mixWithGainain(): dst unaligned!\n"); +            if (((intptr_t)dst & 15) != ((intptr_t)src & 15) ) { +                  fprintf(stderr, "mixWithGain(): dst & src don't have the same alignment!\n"); +                  Dsp::mixWithGain(dst, src,n, gain); +                  } +            else +                  x86_sse_mix_buffers_with_gain(dst, src, n, gain); +            } +      virtual void mix(float* dst, float* src, unsigned n) { +            if ( ((intptr_t)dst & 15) != 0) +                  fprintf(stderr, "mix_buffers_no_gain(): dst unaligned!\n"); +            if ( ((intptr_t)dst & 15) != ((intptr_t)src & 15) ) { +                  fprintf(stderr, "mix_buffers_no_gain(): dst & src don't have the same alignment!\n"); +                  Dsp::mix(dst, src, n); +                  } +            else +                  x86_sse_mix_buffers_no_gain(dst, src, n); +            } +      }; +#endif + +//--------------------------------------------------------- +//   initDsp +//--------------------------------------------------------- + +void initDsp() +      { +#if defined(__i386__) && defined(USE_SSE) +      unsigned long useSSE = 0; + +#ifdef __x86_64__ +      asm ( +         "pushq 
%%rbx\n" +         "movq $1, %%rax\n" +         "cpuid\n" +         "movq %%rdx, %0\n" +         "popq %%rbx\n" +         : "=r" (useSSE) +         : +         : "%rax", "%rcx", "%rdx", "memory"); +#else +      asm ( +         "mov $1, %%eax\n" +         "pushl %%ebx\n" +         "cpuid\n" +         "movl %%edx, %0\n" +         "popl %%ebx\n" +         : "=r" (useSSE) +         : +         : "%eax", "%ecx", "%edx", "memory"); +#endif +      useSSE &= (1 << 25); // bit 25 = SSE support +      if (useSSE) { +            printf("Using SSE optimized routines\n"); +            dsp = new DspSSE86(); +            return; +            } +      // fall through to not hardware optimized routines +#endif +      dsp = new Dsp(); +      } + +} + diff --git a/muse/al/dsp.h b/muse/al/dsp.h new file mode 100644 index 00000000..d8da11dc --- /dev/null +++ b/muse/al/dsp.h @@ -0,0 +1,76 @@ +//============================================================================= +//  AL +//  Audio Utility Library +//  $Id:$ +// +//  Copyright (C) 2002-2006 by Werner Schweer and others +// +//  This program is free software; you can redistribute it and/or modify +//  it under the terms of the GNU General Public License version 2. +// +//  This program is distributed in the hope that it will be useful, +//  but WITHOUT ANY WARRANTY; without even the implied warranty of +//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +//  GNU General Public License for more details. +// +//  You should have received a copy of the GNU General Public License +//  along with this program; if not, write to the Free Software +//  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +//============================================================================= + +#ifndef __DSP_H__ +#define __DSP_H__ + +namespace AL { + + +//--------------------------------------------------------- +//   f_max +//--------------------------------------------------------- + +static inline float f_max(float x, float a) +      { +      x -= a; +      x += fabsf(x); +      x *= 0.5f; +      x += a; +      return x; +      } + +//--------------------------------------------------------- +//   Dsp +//    standard version of all dsp routines without any +//    hw acceleration +//--------------------------------------------------------- + +class Dsp { +   public: +      Dsp() {} +      virtual ~Dsp() {} + +      virtual float peak(float* buf, unsigned n, float current) { +            for (unsigned i = 0; i < n; ++i) +                  current = f_max(current, fabsf(buf[i])); +            return current; +            } +      virtual void applyGainToBuffer(float* buf, unsigned n, float gain) { +            for (unsigned i = 0; i < n; ++i) +                  buf[i] *= gain; +            } +      virtual void mixWithGain(float* dst, float* src, unsigned n, float gain) { +            for (unsigned i = 0; i < n; ++i) +                  dst[i] += src[i] * gain; +            } +      virtual void mix(float* dst, float* src, unsigned n) { +            for (unsigned i = 0; i < n; ++i) +                  dst[i] += src[i]; +            } +      }; + +extern void initDsp(); +extern Dsp* dsp; + +} + +#endif + diff --git a/muse/al/dspSSE.cpp b/muse/al/dspSSE.cpp new file mode 100644 index 00000000..0f3d84b9 --- /dev/null +++ b/muse/al/dspSSE.cpp @@ -0,0 +1,531 @@ +/* +    Copyright (C) 2005 Paul Davis + +    This program is free software; you can redistribute it and/or modify +    it under the terms of the GNU General Public License as published by +    the 
Free Software Foundation; either version 2 of the License, or +    (at your option) any later version. + +    This program is distributed in the hope that it will be useful, +    but WITHOUT ANY WARRANTY; without even the implied warranty of +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +    GNU General Public License for more details. + +    You should have received a copy of the GNU General Public License +    along with this program; if not, write to the Free Software +    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + +	Author: Sampo Savolainen + +    $Id: sse_functions.s 988 2006-10-17 20:40:39Z paul $ +*/ + + +#; void x86_sse_mix_buffers_with_gain (float *dst, float *src, long nframes, float gain); + +.globl x86_sse_mix_buffers_with_gain +	.type	x86_sse_mix_buffers_with_gain,@function + +x86_sse_mix_buffers_with_gain: +#; 8(%ebp)	= float	*dst 	= %edi +#; 12(%ebp) = float *src	= %esi +#; 16(%ebp) = long	nframes = %ecx +#; 20(%ebp) = float	gain    = st(0) + +	pushl %ebp +	movl %esp, %ebp + +	#; save the registers +#;	pushl %eax +	pushl %ebx +#;	pushl %ecx +	pushl %edi +	pushl %esi +	 +	#; if nframes == 0, go to end +	movl 16(%ebp), %ecx #; nframes +	cmp	$0, %ecx +	je	.MBWG_END + +	#; Check for alignment + +	movl 8(%ebp), %edi  #; dst  +	movl 12(%ebp), %esi #; src + +	movl %edi, %eax +	andl $12, %eax #; mask alignemnt offset + +	movl %esi, %ebx +	andl $12, %ebx #; mask alignment offset + +	cmp %eax, %ebx +	jne .MBWG_NONALIGN #; if not aligned, calculate manually + +	#; if we are aligned +	cmp $0, %ebx +	jz .MBWG_SSE +	 +	#; Pre-loop, we need to run 1-3 frames "manually" without +	#; SSE instructions + +	movss 20(%ebp), %xmm1 #; xmm1 + +.MBWG_PRELOOP: +	 +	movss (%esi), %xmm0 +	mulss %xmm1, %xmm0 +	addss (%edi), %xmm0 +	movss %xmm0, (%edi) + +	addl $4, %edi #; dst++ +	addl $4, %esi #; src++ +	decl %ecx 	  #; nframes-- +	jz .MBWG_END + +#;	cmp $0, %ecx +#;	je .MBWG_END #; if we run out of frames, go to end +	 +	addl $4, %ebx +	 +	cmp $16, %ebx #; test if we've reached 16 byte alignment +	jne .MBWG_PRELOOP + + +.MBWG_SSE: + +	cmp $4, %ecx #; we know it's not zero, but if it's not >=4, then +	jnge .MBWG_NONALIGN #; we jump straight to the "normal" code + +	#; copy gain to fill %xmm1 +	movss   20(%ebp), %xmm1 +    shufps  $0x00, %xmm1, %xmm1 + + +.MBWG_SSELOOP: + +	movaps	(%esi), %xmm0 #; source => xmm0 +	mulps	%xmm1,  %xmm0 #; apply gain to source +	addps	(%edi), %xmm0 #; mix with destination +	movaps  %xmm0, (%edi) #; copy result to destination +	 +	addl $16, %edi #; dst+=4 +	addl $16, %esi #; src+=4 + +	subl $4, %ecx #; nframes-=4 +	cmp $4, %ecx +	jge .MBWG_SSELOOP + +	cmp $0, %ecx +	je .MBWG_END + +	#; if there are remaining frames, the nonalign code will do nicely +	#; for the rest 1-3 frames. +	 +.MBWG_NONALIGN: +	#; not aligned! 
+ +	movss 20(%ebp), %xmm1 #; gain => xmm1 + +.MBWG_NONALIGNLOOP: + +	movss (%esi), %xmm0 +	mulss %xmm1, %xmm0 +	addss (%edi), %xmm0 +	movss %xmm0, (%edi) +	 +	addl $4, %edi +	addl $4, %esi +	 +	decl %ecx +	jnz .MBWG_NONALIGNLOOP + +.MBWG_END: + +	popl %esi +	popl %edi +#;	popl %ecx +	popl %ebx +#;	popl %eax +	 +	#; return +	leave +	ret + +.size	x86_sse_mix_buffers_with_gain, .-x86_sse_mix_buffers_with_gain + + + + +#; void x86_sse_mix_buffers_no_gain (float *dst, float *src, long nframes); + +.globl x86_sse_mix_buffers_no_gain +	.type	x86_sse_mix_buffers_no_gain,@function + +x86_sse_mix_buffers_no_gain: +#; 8(%ebp)	= float	*dst 	= %edi +#; 12(%ebp) = float *src	= %esi +#; 16(%ebp) = long	nframes = %ecx + +	pushl %ebp +	movl %esp, %ebp + +	#; save the registers +#;	pushl %eax +	pushl %ebx +#;	pushl %ecx +	pushl %edi +	pushl %esi +	 +	#; the real function + +	#; if nframes == 0, go to end +	movl 16(%ebp), %ecx #; nframes +	cmp	$0, %ecx +	je	.MBNG_END + +	#; Check for alignment + +	movl 8(%ebp), %edi  #; dst  +	movl 12(%ebp), %esi #; src + +	movl %edi, %eax +	andl $12, %eax #; mask alignemnt offset + +	movl %esi, %ebx +	andl $12, %ebx #; mask alignment offset + +	cmp %eax, %ebx +	jne .MBNG_NONALIGN #; if not aligned, calculate manually + +	cmp $0, %ebx +	je .MBNG_SSE + +	#; Pre-loop, we need to run 1-3 frames "manually" without +	#; SSE instructions + +.MBNG_PRELOOP: +		 +	movss (%esi), %xmm0 +	addss (%edi), %xmm0 +	movss %xmm0, (%edi) + +	addl $4, %edi #; dst++ +	addl $4, %esi #; src++ +	decl %ecx 	  #; nframes-- +	jz	.MBNG_END +	addl $4, %ebx +	 +	cmp $16, %ebx #; test if we've reached 16 byte alignment +	jne .MBNG_PRELOOP + +.MBNG_SSE: + +	cmp $4, %ecx #; if there are frames left, but less than 4 +	jnge .MBNG_NONALIGN #; we can't run SSE + +.MBNG_SSELOOP: + +	movaps	(%esi), %xmm0 #; source => xmm0 +	addps	(%edi), %xmm0 #; mix with destination +	movaps  %xmm0, (%edi) #; copy result to destination +	 +	addl $16, %edi #; dst+=4 +	addl $16, %esi #; src+=4 + +	subl $4, %ecx #; nframes-=4 +	cmp $4, %ecx +	jge .MBNG_SSELOOP + +	cmp $0, %ecx +	je .MBNG_END + +	#; if there are remaining frames, the nonalign code will do nicely +	#; for the rest 1-3 frames. +	 +.MBNG_NONALIGN: +	#; not aligned! 
+ +	movss (%esi), %xmm0 #; src => xmm0 +	addss (%edi), %xmm0 #; xmm0 += dst +	movss %xmm0, (%edi) #; xmm0 => dst +	 +	addl $4, %edi +	addl $4, %esi +	 +	decl %ecx +	jnz .MBNG_NONALIGN + +.MBNG_END: + +	popl %esi +	popl %edi +#;	popl %ecx +	popl %ebx +#;	popl %eax +	 +	#; return +	leave +	ret + +.size	x86_sse_mix_buffers_no_gain, .-x86_sse_mix_buffers_no_gain + + + + +#; void x86_sse_apply_gain_to_buffer (float *buf, long nframes, float gain); + +.globl x86_sse_apply_gain_to_buffer +	.type	x86_sse_apply_gain_to_buffer,@function + +x86_sse_apply_gain_to_buffer: +#; 8(%ebp)	= float	*buf 	= %edi +#; 12(%ebp) = long	nframes = %ecx +#; 16(%ebp) = float	gain    = st(0) + +	pushl %ebp +	movl %esp, %ebp + +	#; save %edi +	pushl %edi +	 +	#; the real function + +	#; if nframes == 0, go to end +	movl 12(%ebp), %ecx #; nframes +	cmp	$0, %ecx +	je	.AG_END + +	#; create the gain buffer in %xmm1 +	movss	16(%ebp), %xmm1 +	shufps	$0x00, %xmm1, %xmm1 +	 +	#; Check for alignment + +	movl 8(%ebp), %edi #; buf  +	movl %edi, %edx #; buf => %edx +	andl $12, %edx #; mask bits 1 & 2, result = 0, 4, 8 or 12 +	jz	.AG_SSE #; if buffer IS aligned + +	#; PRE-LOOP +	#; we iterate 1-3 times, doing normal x87 float comparison +	#; so we reach a 16 byte aligned "buf" (=%edi) value + +.AGLP_START: + +	#; Load next value from the buffer +	movss (%edi), %xmm0 +	mulss %xmm1, %xmm0 +	movss %xmm0, (%edi) + +	#; increment buffer, decrement counter +	addl $4, %edi #; buf++; +	 +	decl %ecx   #; nframes-- +	jz	.AG_END #; if we run out of frames, we go to the end +	 +	addl $4, %edx #; one non-aligned byte less +	cmp $16, %edx +	jne .AGLP_START #; if more non-aligned frames exist, we do a do-over + +.AG_SSE: + +	#; We have reached the 16 byte aligned "buf" ("edi") value + +	#; Figure out how many loops we should do +	movl %ecx, %eax #; copy remaining nframes to %eax for division +	movl $0, %edx   #; 0 the edx register +	 +	 +	pushl %edi +	movl $4, %edi +	divl %edi #; %edx = remainder == 0 +	popl %edi + +	#; %eax = SSE iterations +	cmp $0, %eax +	je .AGPOST_START + +	 +.AGLP_SSE: + +	movaps (%edi), %xmm0 +	mulps %xmm1, %xmm0 +	movaps %xmm0, (%edi) + +	addl $16, %edi +#;	subl $4, %ecx   #; nframes-=4 + +	decl %eax +	jnz .AGLP_SSE + +	#; Next we need to post-process all remaining frames +	#; the remaining frame count is in %ecx +	 +	#; if no remaining frames, jump to the end +#;	cmp $0, %ecx +	andl $3, %ecx #; nframes % 4 +	je .AG_END + +.AGPOST_START: + +	movss (%edi), %xmm0 +	mulss %xmm1, %xmm0 +	movss %xmm0, (%edi) + +	#; increment buffer, decrement counter +	addl $4, %edi #; buf++; +	 +	decl %ecx   #; nframes-- +	jnz	.AGPOST_START #; if we run out of frames, we go to the end +	 +.AG_END: + + +	popl %edi +	 +	#; return +	leave +	ret + +.size	x86_sse_apply_gain_to_buffer, .-x86_sse_apply_gain_to_buffer +#; end proc + + + +#; float x86_sse_compute_peak(float *buf, long nframes, float current); + +.globl x86_sse_compute_peak +	.type	x86_sse_compute_peak,@function + +x86_sse_compute_peak: +#; 8(%ebp)	= float	*buf 	= %edi +#; 12(%ebp) = long	nframes = %ecx +#; 16(%ebp) = float	current = st(0) + +	pushl %ebp +	movl %esp, %ebp + +	#; save %edi +	pushl %edi +	 +	#; the real function + +	#; Load "current" in xmm0 +	movss 16(%ebp), %xmm0 + +	#; if nframes == 0, go to end +	movl 12(%ebp), %ecx #; nframes +	cmp	$0, %ecx +	je	.CP_END + +	#; create the "abs" mask in %xmm2 +	pushl	$2147483647 +	movss	(%esp), %xmm2 +	addl    $4, %esp +	shufps	$0x00, %xmm2, %xmm2 + +	#; Check for alignment + +	movl 8(%ebp), %edi #; buf  +	movl %edi, %edx #; buf => 
%edx +	andl $12, %edx #; mask bits 1 & 2, result = 0, 4, 8 or 12 +	jz	.CP_SSE #; if buffer IS aligned + +	#; PRE-LOOP +	#; we iterate 1-3 times, doing normal x87 float comparison +	#; so we reach a 16 byte aligned "buf" (=%edi) value + +.LP_START: + +	#; Load next value from the buffer +	movss (%edi), %xmm1 +	andps %xmm2, %xmm1 +	maxss %xmm1, %xmm0 + +	#; increment buffer, decrement counter +	addl $4, %edi #; buf++; +	 +	decl %ecx   #; nframes-- +	jz	.CP_END #; if we run out of frames, we go to the end +	 +	addl $4, %edx #; one non-aligned byte less +	cmp $16, %edx +	jne .LP_START #; if more non-aligned frames exist, we do a do-over + +.CP_SSE: + +	#; We have reached the 16 byte aligned "buf" ("edi") value + +	#; Figure out how many loops we should do +	movl %ecx, %eax #; copy remaining nframes to %eax for division + +	shr $2,%eax #; unsigned divide by 4 +	jz .POST_START + +	#; %eax = SSE iterations + +	#; current maximum is at %xmm0, but we need to .. +	shufps $0x00, %xmm0, %xmm0 #; shuffle "current" to all 4 FP's + +	#;prefetcht0 16(%edi) + +.LP_SSE: + +	movaps (%edi), %xmm1 +	andps %xmm2, %xmm1 +	maxps %xmm1, %xmm0 + +	addl $16, %edi + +	decl %eax +	jnz .LP_SSE + +	#; Calculate the maximum value contained in the 4 FP's in %xmm0 +	movaps %xmm0, %xmm1 +	shufps $0x4e, %xmm1, %xmm1 #; shuffle left & right pairs (1234 => 3412) +	maxps  %xmm1, %xmm0 #; maximums of the two pairs +	movaps %xmm0, %xmm1 +	shufps $0xb1, %xmm1, %xmm1 #; shuffle the floats inside the two pairs (1234 => 2143) +	maxps  %xmm1, %xmm0  + +	#; now every float in %xmm0 is the same value, current maximum value +	 +	#; Next we need to post-process all remaining frames +	#; the remaining frame count is in %ecx +	 +	#; if no remaining frames, jump to the end + +	andl $3, %ecx #; nframes % 4 +	jz .CP_END + +.POST_START: + +	movss (%edi), %xmm1 +	andps %xmm2, %xmm1 +	maxss %xmm1, %xmm0 +	 +	addl $4, %edi 	#; buf++; +	 +	decl %ecx		#; nframes--; +	jnz .POST_START + +.CP_END: + +	#; Load the value from xmm0 to the float stack for returning +	movss %xmm0, 16(%ebp) +	flds 16(%ebp) + +	popl %edi +	 +	#; return +	leave +	ret + +.size	x86_sse_compute_peak, .-x86_sse_compute_peak +#; end proc + +#ifdef __ELF__ +.section .note.GNU-stack,"",%progbits +#endif + + diff --git a/muse/al/dspXMM.cpp b/muse/al/dspXMM.cpp new file mode 100644 index 00000000..1943fa7d --- /dev/null +++ b/muse/al/dspXMM.cpp @@ -0,0 +1,115 @@ +/* +    Copyright (C) 2007 Paul sDavis +    	Written by Sampo Savolainen + +    This program is free software; you can redistribute it and/or modify +    it under the terms of the GNU General Public License as published by +    the Free Software Foundation; either version 2 of the License, or +    (at your option) any later version. + +    This program is distributed in the hope that it will be useful, +    but WITHOUT ANY WARRANTY; without even the implied warranty of +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +    GNU General Public License for more details. + +    You should have received a copy of the GNU General Public License +    along with this program; if not, write to the Free Software +    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ +*/ + +#include <xmmintrin.h> + +void +x86_sse_find_peaks(float *buf, unsigned nframes, float *min, float *max) +{ +	__m128 current_max, current_min, work; + +	// Load max and min values into all four slots of the XMM registers +	current_min = _mm_set1_ps(*min); +	current_max = _mm_set1_ps(*max); + +	// Work input until "buf" reaches 16 byte alignment +	while ( ((unsigned long)buf) % 16 != 0 && nframes > 0) { + +		// Load the next float into the work buffer +		work = _mm_set1_ps(*buf); + +		current_min = _mm_min_ps(current_min, work); +		current_max = _mm_max_ps(current_max, work); + +		buf++; +		nframes--; +	} + +        // use 64 byte prefetch for quadruple quads +        while (nframes >= 16) { +                __builtin_prefetch(buf+64,0,0); + +                work = _mm_load_ps(buf); +                current_min = _mm_min_ps(current_min, work); +                current_max = _mm_max_ps(current_max, work); +                buf+=4; +                work = _mm_load_ps(buf); +                current_min = _mm_min_ps(current_min, work); +                current_max = _mm_max_ps(current_max, work); +                buf+=4; +                work = _mm_load_ps(buf); +                current_min = _mm_min_ps(current_min, work); +                current_max = _mm_max_ps(current_max, work); +                buf+=4; +                work = _mm_load_ps(buf); +                current_min = _mm_min_ps(current_min, work); +                current_max = _mm_max_ps(current_max, work); +                buf+=4; +                nframes-=16; +        } + +	// work through aligned buffers +	while (nframes >= 4) { + +		work = _mm_load_ps(buf); + +		current_min = _mm_min_ps(current_min, work); +		current_max = _mm_max_ps(current_max, work); + +		buf+=4; +		nframes-=4; +	} + +	// work through the rest < 4 samples +	while ( nframes > 0) { + +		// Load the next float into the work buffer +		work = _mm_set1_ps(*buf); + +		current_min = _mm_min_ps(current_min, work); +		current_max = _mm_max_ps(current_max, work); + +		buf++; +		nframes--; +	} + +	// Find min & max value in current_max through shuffle tricks + +	work = current_min; +	work = _mm_shuffle_ps(work, work, _MM_SHUFFLE(2, 3, 0, 1)); +	work = _mm_min_ps (work, current_min); +	current_min = work; +	work = _mm_shuffle_ps(work, work, _MM_SHUFFLE(1, 0, 3, 2)); +	work = _mm_min_ps (work, current_min); + +	_mm_store_ss(min, work); + +	work = current_max; +	work = _mm_shuffle_ps(work, work, _MM_SHUFFLE(2, 3, 0, 1)); +	work = _mm_max_ps (work, current_max); +	current_max = work; +	work = _mm_shuffle_ps(work, work, _MM_SHUFFLE(1, 0, 3, 2)); +	work = _mm_max_ps (work, current_max); + +	_mm_store_ss(max, work); +} + + +  | 
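The runtime dispatch in initDsp() comes down to reading CPUID leaf 1 and testing EDX bit 25. For reference, the same test can be written without inline assembly; the following is only an illustrative sketch, assuming a GCC or Clang toolchain that ships <cpuid.h> (the committed code issues CPUID by hand and does not use this helper):

#include <cpuid.h>   // GCC/Clang wrapper around the CPUID instruction

// Hypothetical helper, not part of the patch: the same test as the inline
// asm in initDsp(). CPUID leaf 1, EDX bit 25 set means SSE is available.
static bool cpuHasSSE()
{
    unsigned eax = 0, ebx = 0, ecx = 0, edx = 0;
    if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
        return false;                   // CPUID leaf 1 not supported
    return (edx & (1u << 25)) != 0;     // bit 25 = SSE
}

SSE is part of the x86-64 baseline, so the probe mainly matters for 32-bit builds; the patch nevertheless carries both a 32-bit and a 64-bit CPUID variant.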
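Taken together, the four files leave client code with a single AL::dsp pointer that refers to either the plain Dsp fallback or the SSE-backed DspSSE86. Below is a minimal, hypothetical usage sketch, not part of the patch: it assumes the AL sources above are compiled and linked in, includes <math.h> explicitly because dsp.h uses fabsf but includes no headers itself as committed, and requests 16-byte-aligned buffers via posix_memalign because the SSE routines warn on unaligned data:

#include <math.h>     // fabsf, used by f_max() in dsp.h
#include <stdio.h>
#include <stdlib.h>   // posix_memalign, free
#include "dsp.h"

int main()
{
    AL::initDsp();   // installs DspSSE86 when CPUID reports SSE, plain Dsp otherwise

    const unsigned n = 1024;
    float *dst = 0, *src = 0;
    // 16-byte alignment keeps the movaps-based SSE loops on their fast path.
    if (posix_memalign((void**)&dst, 16, n * sizeof(float)) ||
        posix_memalign((void**)&src, 16, n * sizeof(float)))
        return 1;

    for (unsigned i = 0; i < n; ++i) {
        dst[i] = 0.0f;
        src[i] = (i & 1) ? 0.5f : -0.25f;
    }

    AL::dsp->mixWithGain(dst, src, n, 0.7f);            // dst[i] += src[i] * 0.7
    printf("peak: %f\n", AL::dsp->peak(dst, n, 0.0f));  // absolute peak of dst

    free(dst);
    free(src);
    return 0;
}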
