drivers/staging/echo/fir.h

   1 /*
   2  * SpanDSP - a series of DSP components for telephony
   3  *
   4  * fir.h - General telephony FIR routines
   5  *
   6  * Written by Steve Underwood <steveu@coppice.org>
   7  *
   8  * Copyright (C) 2002 Steve Underwood
   9  *
  10  * All rights reserved.
  11  *
  12  * This program is free software; you can redistribute it and/or modify
  13  * it under the terms of the GNU General Public License version 2, as
  14  * published by the Free Software Foundation.
  15  *
  16  * This program is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19  * GNU General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU General Public License
  22  * along with this program; if not, write to the Free Software
  23  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  24  *
  25  * $Id: fir.h,v 1.8 2006/10/24 13:45:28 steveu Exp $
  26  */
  27
  28 /*! \page fir_page FIR filtering
  29 \section fir_page_sec_1 What does it do?
  30 ???.
  31
  32 \section fir_page_sec_2 How does it work?
  33 ???.
  34 */
  35
  36 #if !defined(_FIR_H_)
  37 #define _FIR_H_
  38
  39 /*
  40    Blackfin NOTES & IDEAS:
  41
  42    A simple dot product function is used to implement the filter.  This performs
  43    just one MAC/cycle which is inefficient but was easy to implement as a first
  44    pass.  The current Blackfin code also uses an unrolled form of the filter
  45    history to avoid 0 length hardware loop issues.  This is wasteful of
  46    memory.
  47
  48    Ideas for improvement:
  49
  50    1/ Rewrite filter for dual MAC inner loop.  The issue here is handling
  51    history sample offsets that are 16 bit aligned - the dual MAC needs
   52    32 bit alignment.  There are some good examples in libbfdsp.
  53
   54    2/ Use the hardware circular buffer facility to halve memory usage.
  55
  56    3/ Consider using internal memory.
  57
  58    Using less memory might also improve speed as cache misses will be
  59    reduced. A drop in MIPs and memory approaching 50% should be
  60    possible.
  61
   62    The foreground and background filters currently use a total of
  63    about 10 MIPs/ch as measured with speedtest.c on a 256 TAP echo
  64    can.
  65 */
  66
  67 #if defined(USE_MMX)  ||  defined(USE_SSE2)
  68 #include "mmx.h"
  69 #endif
  70
  71 /*!
  72     16 bit integer FIR descriptor. This defines the working state for a single
  73     instance of an FIR filter using 16 bit integer coefficients.
  74 */
  75 typedef struct {
  76         int taps;
  77         int curr_pos;
  78         const int16_t *coeffs;
  79         int16_t *history;
  80 } fir16_state_t;
  81
  82 /*!
  83     32 bit integer FIR descriptor. This defines the working state for a single
  84     instance of an FIR filter using 32 bit integer coefficients, and filtering
  85     16 bit integer data.
  86 */
  87 typedef struct {
  88         int taps;
  89         int curr_pos;
  90         const int32_t *coeffs;
  91         int16_t *history;
  92 } fir32_state_t;
  93
  94 /*!
  95     Floating point FIR descriptor. This defines the working state for a single
  96     instance of an FIR filter using floating point coefficients and data.
  97 */
  98 typedef struct {
  99         int taps;
 100         int curr_pos;
 101         const float *coeffs;
 102         float *history;
 103 } fir_float_state_t;
 104
 105 static __inline__ const int16_t *fir16_create(fir16_state_t * fir,
 106                                               const int16_t * coeffs, int taps)
 107 {
 108         fir->taps = taps;
 109         fir->curr_pos = taps - 1;
 110         fir->coeffs = coeffs;
 111 #if defined(USE_MMX)  ||  defined(USE_SSE2) || defined(__bfin__)
 112         fir->history = kcalloc(2 * taps, sizeof(int16_t), GFP_KERNEL);
 113 #else
 114         fir->history = kcalloc(taps, sizeof(int16_t), GFP_KERNEL);
 115 #endif
 116         return fir->history;
 117 }
 118
 119 static __inline__ void fir16_flush(fir16_state_t * fir)
 120 {
 121 #if defined(USE_MMX)  ||  defined(USE_SSE2) || defined(__bfin__)
 122         memset(fir->history, 0, 2 * fir->taps * sizeof(int16_t));
 123 #else
 124         memset(fir->history, 0, fir->taps * sizeof(int16_t));
 125 #endif
 126 }
 127
 128 static __inline__ void fir16_free(fir16_state_t * fir)
 129 {
 130         kfree(fir->history);
 131 }
 132
 133 #ifdef __bfin__
 134 static inline int32_t dot_asm(short *x, short *y, int len)
 135 {
 136         int dot;
 137
 138         len--;
 139
 140         __asm__("I0 = %1;\n\t"
 141                 "I1 = %2;\n\t"
 142                 "A0 = 0;\n\t"
 143                 "R0.L = W[I0++] || R1.L = W[I1++];\n\t"
 144                 "LOOP dot%= LC0 = %3;\n\t"
 145                 "LOOP_BEGIN dot%=;\n\t"
 146                 "A0 += R0.L * R1.L (IS) || R0.L = W[I0++] || R1.L = W[I1++];\n\t"
 147                 "LOOP_END dot%=;\n\t"
 148                 "A0 += R0.L*R1.L (IS);\n\t"
 149                 "R0 = A0;\n\t"
 150                 "%0 = R0;\n\t"
 151                 :"=&d"(dot)
 152                 :"a"(x), "a"(y), "a"(len)
 153                 :"I0", "I1", "A1", "A0", "R0", "R1"
 154         );
 155
 156         return dot;
 157 }
 158 #endif
 159
 160 static __inline__ int16_t fir16(fir16_state_t * fir, int16_t sample)
 161 {
 162         int32_t y;
 163 #if defined(USE_MMX)
 164         int i;
 165         mmx_t *mmx_coeffs;
 166         mmx_t *mmx_hist;
 167
 168         fir->history[fir->curr_pos] = sample;
 169         fir->history[fir->curr_pos + fir->taps] = sample;
 170
 171         mmx_coeffs = (mmx_t *) fir->coeffs;
 172         mmx_hist = (mmx_t *) & fir->history[fir->curr_pos];
 173         i = fir->taps;
 174         pxor_r2r(mm4, mm4);
 175         /* 8 samples per iteration, so the filter must be a multiple of 8 long. */
 176         while (i > 0) {
 177                 movq_m2r(mmx_coeffs[0], mm0);
 178                 movq_m2r(mmx_coeffs[1], mm2);
 179                 movq_m2r(mmx_hist[0], mm1);
 180                 movq_m2r(mmx_hist[1], mm3);
 181                 mmx_coeffs += 2;
 182                 mmx_hist += 2;
 183                 pmaddwd_r2r(mm1, mm0);
 184                 pmaddwd_r2r(mm3, mm2);
 185                 paddd_r2r(mm0, mm4);
 186                 paddd_r2r(mm2, mm4);
 187                 i -= 8;
 188         }
 189         movq_r2r(mm4, mm0);
 190         psrlq_i2r(32, mm0);
 191         paddd_r2r(mm0, mm4);
 192         movd_r2m(mm4, y);
 193         emms();
 194 #elif defined(USE_SSE2)
 195         int i;
 196         xmm_t *xmm_coeffs;
 197         xmm_t *xmm_hist;
 198
 199         fir->history[fir->curr_pos] = sample;
 200         fir->history[fir->curr_pos + fir->taps] = sample;
 201
 202         xmm_coeffs = (xmm_t *) fir->coeffs;
 203         xmm_hist = (xmm_t *) & fir->history[fir->curr_pos];
 204         i = fir->taps;
 205         pxor_r2r(xmm4, xmm4);
 206         /* 16 samples per iteration, so the filter must be a multiple of 16 long. */
 207         while (i > 0) {
 208                 movdqu_m2r(xmm_coeffs[0], xmm0);
 209                 movdqu_m2r(xmm_coeffs[1], xmm2);
 210                 movdqu_m2r(xmm_hist[0], xmm1);
 211                 movdqu_m2r(xmm_hist[1], xmm3);
 212                 xmm_coeffs += 2;
 213                 xmm_hist += 2;
 214                 pmaddwd_r2r(xmm1, xmm0);
 215                 pmaddwd_r2r(xmm3, xmm2);
 216                 paddd_r2r(xmm0, xmm4);
 217                 paddd_r2r(xmm2, xmm4);
 218                 i -= 16;
 219         }
 220         movdqa_r2r(xmm4, xmm0);
 221         psrldq_i2r(8, xmm0);
 222         paddd_r2r(xmm0, xmm4);
 223         movdqa_r2r(xmm4, xmm0);
 224         psrldq_i2r(4, xmm0);
 225         paddd_r2r(xmm0, xmm4);
 226         movd_r2m(xmm4, y);
 227 #elif defined(__bfin__)
 228         fir->history[fir->curr_pos] = sample;
 229         fir->history[fir->curr_pos + fir->taps] = sample;
 230         y = dot_asm((int16_t *) fir->coeffs, &fir->history[fir->curr_pos],
 231                     fir->taps);
 232 #else
 233         int i;
 234         int offset1;
 235         int offset2;
 236
 237         fir->history[fir->curr_pos] = sample;
 238
 239         offset2 = fir->curr_pos;
 240         offset1 = fir->taps - offset2;
 241         y = 0;
 242         for (i = fir->taps - 1; i >= offset1; i--)
 243                 y += fir->coeffs[i] * fir->history[i - offset1];
 244         for (; i >= 0; i--)
 245                 y += fir->coeffs[i] * fir->history[i + offset2];
 246 #endif
 247         if (fir->curr_pos <= 0)
 248                 fir->curr_pos = fir->taps;
 249         fir->curr_pos--;
 250         return (int16_t) (y >> 15);
 251 }
 252
 253 static __inline__ const int16_t *fir32_create(fir32_state_t * fir,
 254                                               const int32_t * coeffs, int taps)
 255 {
 256         fir->taps = taps;
 257         fir->curr_pos = taps - 1;
 258         fir->coeffs = coeffs;
 259         fir->history = kcalloc(taps, sizeof(int16_t), GFP_KERNEL);
 260         return fir->history;
 261 }
 262
 263 static __inline__ void fir32_flush(fir32_state_t * fir)
 264 {
 265         memset(fir->history, 0, fir->taps * sizeof(int16_t));
 266 }
 267
 268 static __inline__ void fir32_free(fir32_state_t * fir)
 269 {
 270         kfree(fir->history);
 271 }
 272
 273 static __inline__ int16_t fir32(fir32_state_t * fir, int16_t sample)
 274 {
 275         int i;
 276         int32_t y;
 277         int offset1;
 278         int offset2;
 279
 280         fir->history[fir->curr_pos] = sample;
 281         offset2 = fir->curr_pos;
 282         offset1 = fir->taps - offset2;
 283         y = 0;
 284         for (i = fir->taps - 1; i >= offset1; i--)
 285                 y += fir->coeffs[i] * fir->history[i - offset1];
 286         for (; i >= 0; i--)
 287                 y += fir->coeffs[i] * fir->history[i + offset2];
 288         if (fir->curr_pos <= 0)
 289                 fir->curr_pos = fir->taps;
 290         fir->curr_pos--;
 291         return (int16_t) (y >> 15);
 292 }
 293
 294 #endif
 295 /*- End of file ------------------------------------------------------------*/