(eval-when (compile load) (declaim (optimize (speed 3)(safety 0)))) (load "fftwdsimple.fasl") ;;; Used discrete cosine transform from FFTW to multiply polynomials ;;; with real coefficients, in Chebyshev representation. ;;; It seems to work!! (defvar *fftbufsize* 8) ;grow if necessary. (defvar *fftbuf* (make-array *fftbufsize* :element-type 'double-float ;; :allocation :static-reclaimable )) (defmacro mref(A i) `(aref ,A ,i)) (defun dctpoly(poly len) ;; poly = exps . coefs where exps input is array of exponents in increasing order ;; coefs corresponding coefficients ;; len is desired length of transform, which is at least as large as degree of poly ;; but may be rounded up to next power of 2. ;; This program returns an array of double-floats. (let*((exps (car poly)) (coefs (cdr poly)) (r *fftbuf*) (ans(make-array len :element-type 'double-float)) ) (declare (fixnum len) (type (array double-float (*)) ans r) (optimize (speed 3)(safety 0))) (loop for i fixnum from 0 to (1- len) do (setf (mref r i) 0.0d0)) (map nil #'(lambda(e c) (declare (fixnum e)) (setf (mref r e)(* c 1.0d0))) ;; normalize later after mult. exps coefs) ;; compute the plan here since estimate-only will be cheap and not clobber r (fftw_execute_r2r (fftw_plan_r2r len r ans (if (evenp len) 4 8) 64) r ans) ;; do the iDCT ans)) (defun dctr(r) ;; clobbers r. does a dct on input and returns polynomial result ;; input is array of real, real, ... ;; output is a standard (but chebyshev!!) polynomial (declare (type (array double-float (*)) r) (optimize (speed 3)(safety 0))) (let*((len (length r)) (anse (make-array len :element-type 'fixnum :fill-pointer 0)) ;; might end up smaller.. (ansc (make-array len :fill-pointer 0)) (q (/ 2.0d0 (* len len))) ;; normalize for iDCT and DCT together (k 0.0d0)) (declare (fixnum len)(double-float q k) (type (array fixnum (*)) anse)) (fftw_execute_r2r (fftw_plan_r2r len r r (if (evenp len) 5 9) 64) r r) (loop for i fixnum from 0 to (1- len) do (cond((= (setf k (mref r i)) 0.0d0) nil) ; ((<= (abs k) 1.0d-12) nil) ;; optional check to make answers briefer.. (t (vector-push i anse) (vector-push (* k q) ansc)))) (cons anse ansc))) (defun muldct(r s) ;; multiply polynomials r,s using floating DCT ;;compute the size Z of the answer. Z ;; compute two inverseDCTs (dct-III) ;; multiply pointwise ;; compute DCT (dct-II) ;; convert back to pair of arrays of exp and coef and return answer. ;; The answer will be only approximate if the coefficients are too large or ;; corrupted by round-off. In general the chebyshev coeffs will NOT ;; be integers, even if the inputs were. (let* ((lr (length (car r))) (ls (length (car s))) ;; compute deg of r + degree of s +1 (lans (+ (aref (car r) (1- lr)) (aref (car s) (1- ls)) 1)) (z (ash 1 (ceiling (log lans 2)))) ; round up to power of 2 ;; (z lans) ;; smaller transform, maybe faster or not.. gets wrong answers though. ) ;;; check the size of fftbuf here.. (if (< *fftbufsize* z) ;; if too small, make it larger (setf *fftbuf* (make-array (setf *fftbufsize* z) :element-type 'double-float ;;:allocation :static-reclaimable ))) ;; this next line does the whole thing. (dctr (proddct (dctpoly r z) (dctpoly s z) z)))) (defun proddct(r s len) ;clobbers r ;; r and s are the same length arrays ;; compute r:= r*s element-wise (declare (type (array double-float (*)) r s) (optimize (speed 3)(safety 0))) (dotimes (i len r) (setf (aref r i)(* (aref r i)(aref s i))))) ;; test data #| (defvar xv (cons (coerce (loop for i from 0 to 1999 collect i) 'vector) (coerce (loop for i from 0 to 1999 collect 1) 'vector))) ;mulfft is 35X faster than mul-dense (defvar xf (cons (coerce (loop for i from 0 to 1999 collect i) 'vector) (coerce (loop for i from 0 to 1999 collect 1.0d0) 'vector))) (defvar yv (cons (coerce (loop for i from 0 to 5 collect i) 'vector) (coerce (loop for i from 0 to 5 collect 1) 'vector))) (defvar xs (cons (coerce (loop for i from 0 to 100 collect (* 20 i)) 'vector) (coerce (loop for i from 0 to 100 collect 1) 'vector))) ;mul-dense is 5X faster squaring (defvar xs2 (cons (coerce (loop for i from 0 to 200 collect (* 10 i)) 'vector) (coerce (loop for i from 0 to 200 collect 1) 'vector))) ; mul-dense is 1.7 times faster (defvar xs3 (cons (coerce (loop for i from 0 to 333 collect (* 6 i)) 'vector) (coerce (loop for i from 0 to 333 collect 1) 'vector))) ; mulfft is 2X faster (defvar xs4(cons (coerce (loop for i from 0 to 285 collect (* 7 i)) 'vector) (coerce (loop for i from 0 to 285 collect 1) 'vector))) ;; mulfft and mul-dense are about same (defvar y0 (cons #(0 1 2 3) #(23 0 0 0))) (defvar y1 (cons #(0 1 2 3) #(23 0 0 1))) (defvar y2 (cons #(0 3) #(23 1))) (defvar y3 (cons #(0 100) #(23 1))) ;;;example, x^2+1 as a chebyshev series is [3,0,1/2]. (setf rr (dct3 #(3.0d0 0.0d0 0.5d0 0.0d0 0.0d0 0.0d0 0.0d0 0.0d0))) ;;is #(1.3873009129989156d0 1.1959591968163708d0 0.9253611467432722d0 0.7340194305607273d0 0.7340194305607273d0 0.9253611467432722d0 1.1959591968163708d0 1.3873009129989156d0) ;; square this by (map 'array #'* rr rr) #(1.9246038232076246d0 1.4303184004496585d0 0.8562932519020238d0 0.5387845244406944d0 0.5387845244406944d0 0.8562932519020238d0 1.4303184004496585d0 1.9246038232076246d0) ;; dropping small terms to zero and computing dct2 ... (stz (dct2 *)) is #(3.3587572106361017d0 0 1.0606601717798214d0 0 0.08838834764831825d0 0 0 0) ;;multiply by normalization of 1.4142135623730947d0 ;;to get #(4.75d0 0.0d0 1.4999999999999998d0 0.0d0 0.12499999999999971d0 0.0d0 0.0d0 0.0d0) ;;compared to correct answer of #(4.75 0 1.5 0 0.125 0 0 0 ) ;;;since (x^2+1)^2 = x^4+2*x^2+1 is ;;;4.75, 0, 1.5, 0, 0.125 (defvar x21 (cons #(0 2) #(3 1/2))) (defvar x212 (cons #(0 2 4) #(4.75d0 1.5d0 0.125d0))) (defun stz(x) (map 'array #'(lambda(r)(if (< (abs r) 1.0d-12) 0 r)) x)) ;; another example. ;; 4*x^3+2*x^2-2*x-1/2 ;; in cheby form [1,1,1,1] ;; its square is [3.5,3.0,2.5,2.0,1.5,1.0,0.5,0.0] ;;as a polynomial it is (defvar h1 (cons #(0 1 2 3) #( 1 1 1 1))) ;;and the answer to (muldct h1 h1) is this, which is pretty close. (#(0 1 2 3 4 5 6) . #(3.500000000000001d0 3.0000000000000004d0 2.5000000000000004d0 2.0000000000000004d0 1.5000000000000002d0 1.0d0 0.5000000000000001d0)) |#