/*- * Copyright (c) 2005 Colin Percival * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted providing that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Redistributions of source code must ensure that the list of * copyright notices is complete, and the lack of a copyright notice * corresponding to a copyrightable contribution or modification may * be taken as an affirmative statement that said contribution or * modification has been placed in the public domain. * 4. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Colin Percival. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifdef RCSID static const char rcsid[] = "$Id: fft.c,v 1.2 2005/07/11 11:31:39 cperciva Exp $"; #endif #include #include #include "local.h" #include "fft.h" #include "roots.h" /**T \section{Twiddle factors} */ /**T The function {\em tricl\_fft\_makelut}($LUT$, $n$) generates an FFT lookup table suitable for use in computing FFTs of length up to $2^n$. This table is in fact a series of tables of sizes $4$, $8$, $16$, \ldots $2^{n - 2}$ complex values; the table of size $2^{k - 2}$ contains the $2^k$th roots of unity in the first quadrant, in order of increasing angle. For example, {\em tricl\_fft\_makelut}($LUT$, $32$) would generate a table containing the following values: \begin{align*} 0 & + 0 i \\ 0 & + 0 i \\ 0 & + 0 i \\ 0 & + 0 i \\ cos(0) & + sin(0) i \\ cos(\pi / 8) & + sin(\pi / 8) i \\ cos(\pi / 4) & + sin(\pi / 4) i \\ cos(3 \pi / 8) & + sin(3 \pi / 8) i \\ cos(0) & + sin(0) i \\ cos(\pi / 16) & + sin(\pi / 16) i \\ cos(\pi / 8) & + sin(\pi / 8) i \\ cos(3 \pi / 16) & + sin(3 \pi / 16) i \\ cos(\pi / 4) & + sin(\pi / 4) i \\ cos(5 \pi / 16) & + sin(5 \pi / 16) i \\ cos(3 \pi / 8) & + sin(3 \pi / 8) i \\ cos(7 \pi / 16) & + sin(7 \pi / 16) i \end{align*} (Note that the first four complex values are left zeroed, in order that the table of size $N$ starts at the $N$th complex value.) The input $n$ must satisfy $0 \leq n \leq 29$, and $LUT$ must have space to store $2^n$ doubles (i.e., $2^{n + 3}$ bytes). */ void tricl_fft_makelut(double * LUT, int n) { int32_t i; assert(0 <= n && n <= 29); /**T If $n < 4$, we just need to write $2^n$ zeroes. */ if (n < 4) { for (i = 0; i < (1 << n); i++) LUT[i] = 0.0; return; } /**T Otherwise, write $8$ zeroes\ldots */ for (i = 0; i < 8; i++) LUT[i] = 0.0; /**T\ldots generate the table of size $2^{n - 2}$\ldots*/ tricl_roots_makelut(LUT + (1 << (n - 1)), n); /**T\ldots and then generate the smaller tables by copying every second value from the next larger table. */ for (i = (1 << (n - 1)) - 2; i >= 8; i -= 2) { LUT[i] = LUT[i * 2]; LUT[i + 1] = LUT[i * 2 + 1]; } } /**T \section{The FFT} We use a recursive ``exponent -1'' split-radix decimation-in-frequency FFT, based roughly on ideas from Bernstein~\cite{DJB99}. \subsection{Building blocks} We build the FFT and inverse FFT out of seven primative blocks: {\em FFT\_PM}, {\em FFT\_SRM} and {\em IFFT\_SRM}, {\em FFT\_SRM\_PI\_4} and {\em IFFT\_SRM\_PI\_4}, and {\em FFT\_SRM\_W} and {\em IFFT\_SRM\_W}. The macro {\em FFT\_PM} transforms $(a, b)$ into $(a + b, a - b)$, i.e., it performs a length-$2$ FFT (which is also a length-$2$ inverse FFT). */ #define FFT_PM(a, b) do { \ double t; \ \ t = (b)[0]; \ (b)[0] = (a)[0] - t; \ (a)[0] += t; \ t = (b)[1]; \ (b)[1] = (a)[1] - t; \ (a)[1] += t; \ } while (0) /**T The macro {\em FFT\_SRM} performs ``split-radix mixing'' of four values: It transforms $(a, b, c, d)$ into $(a + c, b + d, (a - c) + (b - d) i, (a - c) - (b - d) i)$ -- this equates to one pass of FFT on half the data and two passes of FFT minus twiddling on the other half of the data. The macro {\em IFFT\_SRM} is the inverse of {\em FFT\_SRM}. */ #define FFT_SRM(a, b, c, d) do { \ double t0r, t0i, t1r, t1i; \ \ t0r = (a)[0] - (c)[0]; \ (a)[0] += (c)[0]; \ t0i = (a)[1] - (c)[1]; \ (a)[1] += (c)[1]; \ t1r = (b)[0] - (d)[0]; \ (b)[0] += (d)[0]; \ t1i = (b)[1] - (d)[1]; \ (b)[1] += (d)[1]; \ \ (c)[0] = t0r - t1i; \ (d)[0] = t0r + t1i; \ (c)[1] = t0i + t1r; \ (d)[1] = t0i - t1r; \ } while (0) #define IFFT_SRM(a, b, c, d) do { \ double t0r, t0i, t1r, t1i, t2r, t2i; \ \ t2r = (c)[0]; \ t2i = (c)[1]; \ t0r = t2r + (d)[0]; \ t0i = t2i + (d)[1]; \ t1r = t2i - (d)[1]; \ t1i = (d)[0] - t2r; \ \ (c)[0] = (a)[0] - t0r; \ (a)[0] += t0r; \ (c)[1] = (a)[1] - t0i; \ (a)[1] += t0i; \ (d)[0] = (b)[0] - t1r; \ (b)[0] += t1r; \ (d)[1] = (b)[1] - t1i; \ (b)[1] += t1i; \ } while (0) /**T The macro {\em FFT\_SRM\_PI\_4} performs split-radix mixing and twiddling of four values: It transforms $(a, b, c, d)$ into $(a + c, b + d, ((a - c) + (b - d) i) w, ((a - c) - (b - d)i) \bar{w})$ where $w = \exp(i \pi / 4) = (1 + i) / \sqrt{2}$. This is equivalent to {\em FFT\_SRM} followed by rotating $c$ left by $\pi / 4$ and rotating $d$ right by $\pi / 4$. The macro {\em IFFT\_SRM\_PI\_4} is the inverse of {\em FFT\_SRM\_PI\_4}. */ #define SQRTHALF TRICL_ROOTS_SQRTHALF #define FFT_SRM_PI_4(a, b, c, d) do { \ double t0r, t0i, t1r, t1i, t2r, t2i; \ \ t0r = (a)[0] - (c)[0]; \ (a)[0] += (c)[0]; \ t0i = (a)[1] - (c)[1]; \ (a)[1] += (c)[1]; \ t1r = (b)[0] - (d)[0]; \ (b)[0] += (d)[0]; \ t1i = (b)[1] - (d)[1]; \ (b)[1] += (d)[1]; \ \ t2r = t0r - t1i; \ t2i = t0i + t1r; \ t0r += t1i; \ t0i -= t1r; \ \ (c)[0] = (t2r - t2i) * SQRTHALF; \ (c)[1] = (t2r + t2i) * SQRTHALF; \ (d)[0] = (t0r + t0i) * SQRTHALF; \ (d)[1] = (t0i - t0r) * SQRTHALF; \ } while (0) #define IFFT_SRM_PI_4(a, b, c, d) do { \ double t0r, t0i, t1r, t1i, t2r, t2i; \ \ t0r = ((c)[0] + (c)[1]) * SQRTHALF; \ t0i = ((c)[1] - (c)[0]) * SQRTHALF; \ t1r = ((d)[0] - (d)[1]) * SQRTHALF; \ t1i = ((d)[0] + (d)[1]) * SQRTHALF; \ \ t2r = t0i - t1i; \ t2i = t1r - t0r; \ t0r += t1r; \ t0i += t1i; \ \ (c)[0] = (a)[0] - t0r; \ (a)[0] += t0r; \ (c)[1] = (a)[1] - t0i; \ (a)[1] += t0i; \ (d)[0] = (b)[0] - t2r; \ (b)[0] += t2r; \ (d)[1] = (b)[1] - t2i; \ (b)[1] += t2i; \ } while (0) /**T The macro {\em FFT\_SRM\_W} performs split-radix mixing and twiddling of four values: It transforms $(a, b, c, d)$ into $(a + c, b + d, ((a - c) + (b - d) i) w, ((a - c) - (b - d)i) \bar{w})$ where $w$ is a root of unity. The macro {\em IFFT\_SRM\_W} is the inverse of {\em FFT\_SRM\_W}. */ #define FFT_SRM_W(a, b, c, d, w) do { \ double t0r, t0i, t1r, t1i, t2r, t2i; \ \ t0r = (a)[0] - (c)[0]; \ (a)[0] += (c)[0]; \ t0i = (a)[1] - (c)[1]; \ (a)[1] += (c)[1]; \ t1r = (b)[0] - (d)[0]; \ (b)[0] += (d)[0]; \ t1i = (b)[1] - (d)[1]; \ (b)[1] += (d)[1]; \ \ t2r = t0r - t1i; \ t2i = t0i + t1r; \ t0r += t1i; \ t0i -= t1r; \ \ (c)[0] = t2r * (w)[0] - t2i * (w)[1]; \ (c)[1] = t2i * (w)[0] + t2r * (w)[1]; \ (d)[0] = t0r * (w)[0] + t0i * (w)[1]; \ (d)[1] = t0i * (w)[0] - t0r * (w)[1]; \ } while (0) #define IFFT_SRM_W(a, b, c, d, w) do { \ double t0r, t0i, t1r, t1i, t2r, t2i; \ \ t0r = (c)[0] * (w)[0] + (c)[1] * (w)[1];\ t0i = (c)[1] * (w)[0] - (c)[0] * (w)[1];\ t1r = (d)[0] * (w)[0] - (d)[1] * (w)[1];\ t1i = (d)[1] * (w)[0] + (d)[0] * (w)[1];\ \ t2r = t0i - t1i; \ t2i = t1r - t0r; \ t0r += t1r; \ t0i += t1i; \ \ (c)[0] = (a)[0] - t0r; \ (a)[0] += t0r; \ (c)[1] = (a)[1] - t0i; \ (a)[1] += t0i; \ (d)[0] = (b)[0] - t2r; \ (b)[0] += t2r; \ (d)[1] = (b)[1] - t2i; \ (b)[1] += t2i; \ } while (0) /**T \subsection{Small FFTs} Up to length $2^4$, we provide hard-coded FFTs: */ static void fft_0(__unused double * __restrict DAT, __unused double * __restrict LUT) { /* Do nothing */ } static void fft_1(double * __restrict DAT, __unused double * __restrict LUT) { FFT_PM(DAT, DAT + 2); } static void fft_2(double * __restrict DAT, __unused double * __restrict LUT) { FFT_SRM(DAT, DAT + 2, DAT + 4, DAT + 6); FFT_PM(DAT, DAT + 2); } static void fft_3(double * __restrict DAT, __unused double * __restrict LUT) { FFT_SRM(DAT, DAT + 4, DAT + 8, DAT + 12); FFT_SRM_PI_4(DAT + 2, DAT + 6, DAT + 10, DAT + 14); FFT_PM(DAT + 8, DAT + 10); FFT_PM(DAT + 12, DAT + 14); FFT_SRM(DAT, DAT + 2, DAT + 4, DAT + 6); FFT_PM(DAT, DAT + 2); } static void fft_4(double * __restrict DAT, double * __restrict LUT) { FFT_SRM(DAT, DAT + 8, DAT + 16, DAT + 24); FFT_SRM_W(DAT + 2, DAT + 10, DAT + 18, DAT + 26, LUT + 10); FFT_SRM_PI_4(DAT + 4, DAT + 12, DAT + 20, DAT + 28); FFT_SRM_W(DAT + 6, DAT + 14, DAT + 22, DAT + 30, LUT + 14); FFT_SRM(DAT + 16, DAT + 18, DAT + 20, DAT + 22); FFT_PM(DAT + 16, DAT + 18); FFT_SRM(DAT + 24, DAT + 26, DAT + 28, DAT + 30); FFT_PM(DAT + 24, DAT + 26); FFT_SRM(DAT, DAT + 4, DAT + 8, DAT + 12); FFT_SRM_PI_4(DAT + 2, DAT + 6, DAT + 10, DAT + 14); FFT_PM(DAT + 8, DAT + 10); FFT_PM(DAT + 12, DAT + 14); FFT_SRM(DAT, DAT + 2, DAT + 4, DAT + 6); FFT_PM(DAT, DAT + 2); } /**T \subsection{Large FFTs} For lengths $2^5$ up to $2^{29}$, we define a generic split-radix FFT macro which outputs a function for computing that particular FFT length (doing some computations and then recursing into the length $2^{n - 1}$ and length $2^{n - 2}$ FFTs, as the split-radix FFT normally does). We then construct an array of pointers to the FFT functions and tricl\_fft\_fft looks up and calls the correct function -- this is around 5--10\% faster than a single recursive FFT function. This could be made slightly faster by using {\em FFT\_SRM\_PI\_4}, but the difference is less than 1\%, so it isn't worth the cost of the increased code size which would result. */ #define FFT_FUNC(n, nm1, nm2, len) \ static void fft_ ## n(double * __restrict DAT, \ double * __restrict LUT) \ { \ uint32_t i; \ \ FFT_SRM(DAT, DAT + len * 2, DAT + len * 4, DAT + len * 6); \ for (i = 2; i < 2 * len; i += 2) \ FFT_SRM_W(DAT + i, DAT + len * 2 + i, \ DAT + len * 4 + i, DAT + len * 6 + i, \ LUT + len * 2 + i); \ \ fft_ ## nm2(DAT + len * 4, LUT); \ fft_ ## nm2(DAT + len * 6, LUT); \ fft_ ## nm1(DAT, LUT); \ } FFT_FUNC(5, 4, 3, 8) FFT_FUNC(6, 5, 4, 16) FFT_FUNC(7, 6, 5, 32) FFT_FUNC(8, 7, 6, 64) FFT_FUNC(9, 8, 7, 128) FFT_FUNC(10, 9, 8, 256) FFT_FUNC(11, 10, 9, 512) FFT_FUNC(12, 11, 10, 1024) FFT_FUNC(13, 12, 11, 2048) FFT_FUNC(14, 13, 12, 4096) FFT_FUNC(15, 14, 13, 8192) FFT_FUNC(16, 15, 14, 16384) FFT_FUNC(17, 16, 15, 32768) FFT_FUNC(18, 17, 16, 65536) FFT_FUNC(19, 18, 17, 131072) FFT_FUNC(20, 19, 18, 262144) FFT_FUNC(21, 20, 19, 524288) FFT_FUNC(22, 21, 20, 1048576) FFT_FUNC(23, 22, 21, 2097152) FFT_FUNC(24, 23, 22, 4194304) FFT_FUNC(25, 24, 23, 8388608) FFT_FUNC(26, 25, 24, 16777216) FFT_FUNC(27, 26, 25, 33554432) FFT_FUNC(28, 27, 26, 67108864) FFT_FUNC(29, 28, 27, 134217728) static void (* fft_list[])(double * __restrict, double * __restrict) = { fft_0, fft_1, fft_2, fft_3, fft_4, fft_5, fft_6, fft_7, fft_8, fft_9, fft_10, fft_11, fft_12, fft_13, fft_14, fft_15, fft_16, fft_17, fft_18, fft_19, fft_20, fft_21, fft_22, fft_23, fft_24, fft_25, fft_26, fft_27, fft_28, fft_29 }; void tricl_fft_fft(double * __restrict DAT, int size, double * __restrict LUT) { assert(0 <= size && size <= (int)(sizeof(fft_list) / sizeof(fft_list[0]))); fft_list[size](DAT, LUT); } /**T \subsection{Small IFFTs} Up to length $2^4$, we provide hard-coded inverse FFTs: */ static void ifft_0(__unused double * __restrict DAT, __unused double * __restrict LUT) { /* Do nothing */ } static void ifft_1(double * __restrict DAT, __unused double * __restrict LUT) { FFT_PM(DAT, DAT + 2); } static void ifft_2(double * __restrict DAT, __unused double * __restrict LUT) { FFT_PM(DAT, DAT + 2); IFFT_SRM(DAT, DAT + 2, DAT + 4, DAT + 6); } static void ifft_3(double * __restrict DAT, __unused double * __restrict LUT) { FFT_PM(DAT, DAT + 2); IFFT_SRM(DAT, DAT + 2, DAT + 4, DAT + 6); FFT_PM(DAT + 8, DAT + 10); FFT_PM(DAT + 12, DAT + 14); IFFT_SRM(DAT, DAT + 4, DAT + 8, DAT + 12); IFFT_SRM_PI_4(DAT + 2, DAT + 6, DAT + 10, DAT + 14); } static void ifft_4(double * __restrict DAT, double * __restrict LUT) { FFT_PM(DAT, DAT + 2); IFFT_SRM(DAT, DAT + 2, DAT + 4, DAT + 6); FFT_PM(DAT + 8, DAT + 10); FFT_PM(DAT + 12, DAT + 14); IFFT_SRM(DAT, DAT + 4, DAT + 8, DAT + 12); IFFT_SRM_PI_4(DAT + 2, DAT + 6, DAT + 10, DAT + 14); FFT_PM(DAT + 16, DAT + 18); IFFT_SRM(DAT + 16, DAT + 18, DAT + 20, DAT + 22); FFT_PM(DAT + 24, DAT + 26); IFFT_SRM(DAT + 24, DAT + 26, DAT + 28, DAT + 30); IFFT_SRM(DAT, DAT + 8, DAT + 16, DAT + 24); IFFT_SRM_W(DAT + 2, DAT + 10, DAT + 18, DAT + 26, LUT + 10); IFFT_SRM_PI_4(DAT + 4, DAT + 12, DAT + 20, DAT + 28); IFFT_SRM_W(DAT + 6, DAT + 14, DAT + 22, DAT + 30, LUT + 14); } /**T \subsection{Large IFFTs} As with FFTs, we define a generic split-radix inverse FFT macro for sizes $2^5$ up to $2^{29}$. */ #define IFFT_FUNC(n, nm1, nm2, len) \ static void ifft_ ## n(double * __restrict DAT, \ double * __restrict LUT) \ { \ uint32_t i; \ \ ifft_ ## nm1(DAT, LUT); \ ifft_ ## nm2(DAT + len * 4, LUT); \ ifft_ ## nm2(DAT + len * 6, LUT); \ \ IFFT_SRM(DAT, DAT + len * 2, DAT + len * 4, DAT + len * 6); \ for (i = 2; i < 2 * len; i += 2) \ IFFT_SRM_W(DAT + i, DAT + len * 2 + i, \ DAT + len * 4 + i, DAT + len * 6 + i, \ LUT + len * 2 + i); \ } IFFT_FUNC(5, 4, 3, 8) IFFT_FUNC(6, 5, 4, 16) IFFT_FUNC(7, 6, 5, 32) IFFT_FUNC(8, 7, 6, 64) IFFT_FUNC(9, 8, 7, 128) IFFT_FUNC(10, 9, 8, 256) IFFT_FUNC(11, 10, 9, 512) IFFT_FUNC(12, 11, 10, 1024) IFFT_FUNC(13, 12, 11, 2048) IFFT_FUNC(14, 13, 12, 4096) IFFT_FUNC(15, 14, 13, 8192) IFFT_FUNC(16, 15, 14, 16384) IFFT_FUNC(17, 16, 15, 32768) IFFT_FUNC(18, 17, 16, 65536) IFFT_FUNC(19, 18, 17, 131072) IFFT_FUNC(20, 19, 18, 262144) IFFT_FUNC(21, 20, 19, 524288) IFFT_FUNC(22, 21, 20, 1048576) IFFT_FUNC(23, 22, 21, 2097152) IFFT_FUNC(24, 23, 22, 4194304) IFFT_FUNC(25, 24, 23, 8388608) IFFT_FUNC(26, 25, 24, 16777216) IFFT_FUNC(27, 26, 25, 33554432) IFFT_FUNC(28, 27, 26, 67108864) IFFT_FUNC(29, 28, 27, 134217728) static void (* ifft_list[])(double * __restrict, double * __restrict) = { ifft_0, ifft_1, ifft_2, ifft_3, ifft_4, ifft_5, ifft_6, ifft_7, ifft_8, ifft_9, ifft_10, ifft_11, ifft_12, ifft_13, ifft_14, ifft_15, ifft_16, ifft_17, ifft_18, ifft_19, ifft_20, ifft_21, ifft_22, ifft_23, ifft_24, ifft_25, ifft_26, ifft_27, ifft_28, ifft_29 }; void tricl_fft_ifft(double * __restrict DAT, int size, double * __restrict LUT) { assert(0 <= size && size <= (int)(sizeof(ifft_list) / sizeof(ifft_list[0]))); ifft_list[size](DAT, LUT); } /**T \bibliographystyle{amsplain} \begin{thebibliography}{X} \bibitem{DJB99} D.J.~Bernstein, \textit{djbfft}, {\tt http://cr.yp.to/djbfft.html} \end{thebibliography} */