(eval-when (compile load) (declaim (optimize (speed 3)(safety 0)))) (load "fftwdsimple.fasl") ;;;; overly simplified wrt alloc. but WORKS. Uses regular lisp arrays. ;;;; works for ffts of size less than ;;;; gives segmentation violation??? #+ignore (defmacro mref(A i) ;; memory reference in fftw world `(sys::memref-int ,A 0 (ash ,i 3) :double-float)) (defmacro mref(A i) `(aref ,A ,i)) ;; much of the futzing around here is to help re-use plans for fft. ;; much ot that has been tossed out.. we recompute plans on the fly. ;(defvar *fftplan-param* nil) ;choose between 0 and 64 in formulating plans. optimal=0. estimate=64 (defvar *fftbufsize* 8) (defvar *fftbuf* (make-array *fftbufsize* :element-type 'double-float :allocation :static-reclaimable )) (defun fftpoly(poly len) ;; exps input is array of exponents in increasing order ;; coefs corresponding coefficients ;; len is desired length of transform, which is at least as large as degree of poly ;; but may be rounded up to next power of 2, or some other convenient FFT length ;; This program returns an array of double-floats, stored real, imag, real, imag. ;; It computes a plan, if there is not one, for this FFT size. (let*((exps (car poly)) (coefs (cdr poly)) (r *fftbuf*) ;; this is known to be at least 2*len in size big enough (ans(make-array (+ len len) :element-type 'double-float)) ) (declare (fixnum len) (type (array double-float (*)) ans r) (optimize (speed 3)(safety 0))) ;; array is twice the length of the (real) input to make room for real & imag parts ;; copy real/imag parts of input into adjacent array locations and pre-normalize. ;; imag parts are all 0, so may be many of the real parts. ;; Fill with zeros first. (loop for i fixnum from 0 to (+ len len -1) do (setf (mref r i) 0.0d0)) ;; next, copy real numbers from input (map nil #'(lambda(e c) (declare (fixnum e)) (setf (mref r (+ e e))(float c 1.0d0))) ;; 0 2 4 real parts. exps coefs) ;; we can do the fft plan here since estimate only will be cheap and not clobber r?? (fftw_execute_dft (fftw_plan len r ans -1 64) r ans) ;; do the FFT ans)) (defun fftiR(input) ;; this is the inverse FFT, retaining only REAL part of answer ;; input is array of real, imag, real, imag, ... ;; returns array of REAL parts of answer, double-float. ;; computes a plan, if there is not one. (declare (type (array double-float (*)) input)) (let*((len (length input)) (r *fftbuf*) (q (ash len -1)) (anse (make-array (ash len -1) :element-type 'fixnum :fill-pointer 0)) (ansc (make-array (ash len -1) :fill-pointer 0)) (k 0)) (declare (fixnum len k) (type (array double-float (*)) r) (type (array fixnum (*)) anse) (optimize (speed 3)(safety 0))) ;; array is twice the length of the input to make room for real & imag parts ;; copy real/imag parts of input into adjacent array locations and pre-normalize (loop for i fixnum from 0 to (1- len) do (setf (mref r i)(aref input i))) (fftw_execute_dft (fftw_plan q r r 1 64) r r) ;; do the FFT (loop for i fixnum from 0 to (1- len) by 2 do ;; copy real parts into answer (setf k (round (mref r i) q)) (cond((= k 0) nil) (t (vector-push (ash i -1) anse) (vector-push k ansc)))) (cons anse ansc))) (defun mulfft(r s) ;; multiply polynomials r,s using floating FFT ;;compute the size Z of the answer. Z is a power of 2. ;; compute complex array r, increased to size S ;; compute complex array r, increased to size S ;; compute two FFTs ;; multiply pointwise ;; compute inverse FFT * 1/n ;; convert back to array and return answer, coefficients rounded to integers ;; the answer will be only approximate if the coefficients are too large or ;; corrupted by round-off. (let* ((lr (length (car r))) (ls (length (car s))) ;; compute deg of r + degree of s +1 (lans (+ (aref (car r) (1- lr)) (aref (car s) (1- ls)) 1)) (z (ash 1 (ceiling (log lans 2))))) ; round up to power of 2 ;;; check the size of fftbuf here.. (if (< *fftbufsize* (+ z z)) ;; enough for real and imag part (setf *fftbuf* (make-array (setf *fftbufsize* (+ z z) ) ;; real and imag parts :element-type 'double-float :allocation :static-reclaimable ))) (fftiR(prodarray (fftpoly r z) (fftpoly s z) z)))) (defun prodarray(r s len) (declare (type (array double-float (*)) r s) (fixnum len) (optimize (speed 3)(safety 0))) ;; r and s are the same length arrays ;; compute, for i=0, 2, ..., len-2 ;; ans[i]:= r[i]*s[i]-r[i+1]*s[i+1] ;; real part ;; ans[i+1]:=r[i]*s[i+1]+s[i]*r[i+1] ;; imag part (loop for i fixnum from 0 to (+ len -2) by 2 finally (return r) do (let* ((ip1 (1+ i)) (a (aref r i)) (b (aref r ip1)) (c (aref s i)) (d (aref s ip1))) (declare (fixnum ip1)(double-float a b c d)) (setf (aref r i ) (- (* a c)(* b d))) (setf (aref r ip1) (+ (* a d)(* b c)))))) ;; test data #| (defvar xv (cons (coerce (loop for i from 0 to 1999 collect i) 'vector) (coerce (loop for i from 0 to 1999 collect 1) 'vector))) ;mulfft is 35X faster than mul-dense only 8.9X on XPS Intel (defvar xf (cons (coerce (loop for i from 0 to 1999 collect i) 'vector) (coerce (loop for i from 0 to 1999 collect 1.0d0) 'vector))) (defvar yv (cons (coerce (loop for i from 0 to 5 collect i) 'vector) (coerce (loop for i from 0 to 5 collect 1) 'vector))) (defvar xs (cons (coerce (loop for i from 0 to 100 collect (* 20 i)) 'vector) (coerce (loop for i from 0 to 100 collect 1) 'vector))) ;mul-dense is 5X faster squaring (defvar xs2 (cons (coerce (loop for i from 0 to 200 collect (* 10 i)) 'vector) (coerce (loop for i from 0 to 200 collect 1) 'vector))) ; mul-dense is 1.7 times faster (defvar xs3 (cons (coerce (loop for i from 0 to 333 collect (* 6 i)) 'vector) (coerce (loop for i from 0 to 333 collect 1) 'vector))) ; mulfft is 2X faster (defvar xs4(cons (coerce (loop for i from 0 to 285 collect (* 7 i)) 'vector) (coerce (loop for i from 0 to 285 collect 1) 'vector))) ;; mulfft and mul-dense are about same ;; in other words, if 85% of the coefficients are zero, mul-dense and mulfft are similar in speed, at degree 2000 X degree 2000 (defvar z40 (cons (coerce (loop for i from 0 to 39 collect i) 'vector) (coerce (loop for i from 0 to 39 collect 1) 'vector))) ;; muldense is 4X faster than mulfft even fully dense (defvar z100 (cons (coerce (loop for i from 0 to 99 collect i) 'vector) ;; muldense is about the same speed as mulfft (coerce (loop for i from 0 to 99 collect 1) 'vector)));; muldense uses about 80X less memory.. (defvar z400 (cons (coerce (loop for i from 0 to 399 collect i) 'vector) (coerce (loop for i from 0 to 399 collect 1) 'vector))) ;; mulfft is 6X faster here ; 3.3X faster on XPS Intel Allegro ;; of course there is an issue that mul-dense is fully prepared to do exact integer bignum arithmetic, and mulfft is ;; going to occasionally suffer from roundoff, unless other measures are taken. |#