(eval-when (compile load) (declaim (optimize (speed 3)(safety 0))))
(load "fftwdsimple.fasl")
;; polynomial multiplication by FFT:  pack 2 input arrays into one array
;; for a complex fft.
;; like 4f.lisp but returns answer in lists

(defvar *fftbufsize* 8)
(defvar *fftbuf* (make-array *fftbufsize* :element-type 'double-float 
			     :allocation :static-reclaimable ))
(defvar *fftbuf2*(make-array *fftbufsize* :element-type 'double-float 
			     :allocation :static-reclaimable ))

(defun fftpoly2(poly1 poly2 len)
  ;; poly1, poly2 inputs are pairs of arrays:
  ;; of exponents in increasing order,
  ;; and of coefs corresponding to those exponents.
  ;; len is desired length of transform, which is at least as large as degree of poly1,2
  ;; but may be rounded up to next power of 2, or some other convenient FFT length
  ;; This program returns a transformed sequence suitable to
  ;; send into untangle to get the
  ;; fourier transforms of poly1 and poly2l of length len
  (let*((exps1 (car poly1))
	(coefs1 (cdr poly1))
	(exps2 (car poly2))
	(coefs2 (cdr poly2))
	(e 0)
	;(c 0.0d0)
	(r *fftbuf*) ;; this is known to be at least 2*len in size big enough 
	;; we could do this in r, for this program..
	;;(ans(make-array (+ len len) :element-type 'double-float :displaced-to r))
	;;(ans *fftbuf*)
	)
    (declare (fixnum len e ); (double-float c)
	     (type (array double-float (*)) r) 
	     (optimize (speed 3)(safety 0)))
    ;; array is twice the length of the (real) input to make room for real & imag parts
     ;; copy real/imag parts of input into adjacent array locations and pre-normalize.
    ;; Fill with zeros first.
 
    (loop for i fixnum from 0 to (+ len len -1) do (setf (aref r i) 0.0d0))
    ;; next, copy real numbers from poly1
    (loop for i fixnum from 0 to (1- (length exps1)) do
	  (setf e (aref exps1 i))  ;; assume coefs are already double-floats ...
	  (setf (aref r (+ e e))  (float (aref coefs1 i) 1.0d0)
	    ))
    (loop for i fixnum from 0 to (1- (length exps2)) do
	  (setf e (aref exps2 i))
	  (setf (aref r (+ 1 e e)) (float (aref coefs2 i) 1.0d0)
  ))
    ;; we can do the fft plan here since estimate only will be cheap and not clobber r?
    (fftw_execute_dft (fftw_plan len r r -1 64) r r)	      ;; do the FFT

    r ))

;; faster yet?

;; um, broken

(defun untangleprod2(f NN )  
  ;;; given an array *f*, of real/imag/real/imag doubles, untangle to a pair of
  ;;; fourier transforms, and multiply them pointwise
  
  ;;; according to http://www.engineeringproductivitytools.com/stuff/T0001/PT10.HTM
  ;;; with these interleaved data r,s.  fft(k,z) is the kth element of 
  ;;; the array of fft coefs. (but we use 2 spots for real/ imag
  
  ;;;  r= fft(k,r) =   (fft(k,z)+conj(fft(N-k,z))/2
  ;;;  s= fft(k,s) =-i*(fft(k,z)-conj(fft(N-k,z))/2
  
  ;;; they leave out what to do when k=0, since fft is defined from 0 to N-1.
  ;;; treat fft(N,z) == fft(0,z).
  
  ;;; so ans[q]=r[q]*s[q] 
  ;;; returned as real/imag real/image etc.

  (declare (type (array double-float (*)) f)
	   (fixnum NN)
	   (optimize (speed 3)(safety 0)))
  
  ;;  (format t "~%f=~s" f)

  (let*( 
	;;(ans2 *fftbuf*) ;; re-use buffer??
	;;(ans2 (make-array  (+ NN NN) :element-type 'double-float :initial-element 0.0d0))
	(ans2 *fftbuf2*)
	(ar 0.0d0)(ai 0.0d0)(br 0.0d0)(bi 0.0d0)(idx1 0)(idx2 0) 
	(sr 0.0d0)(si 0.0d0)(rr 0.0d0)(ri 0.0d0))
    (declare (type (array double-float (*)) ans2) (fixnum idx1 idx2)
	     (double-float ar ai br bi sr si rr ri))
    (setf (aref ans2 0) (* (aref f 0)(aref f 1)))
    (setf (aref ans2 1) 0.0d0)

    (loop for k fixnum from 1 to  (1- NN) do 
	  (setf idx1 (+ k k))
	  (setf idx2 (* (- NN k) 2))
	  (setf ar (aref f idx1)
		ai (aref f (1+ idx1)))
	  (setf br (aref f idx2)
		bi (aref f (+ idx2 1)))
	    
	  (setf rr  (+ ar br)
		ri  (- ai bi))
	  (setf sr (+ ai bi)
		si (- br ar))
	 ; (format t "r= ~s ~s, s= ~s ~s" rr ri sr si)
	  (setf (aref ans2 idx1)     (* 0.25d0 (- (* rr sr) (* ri si))))
	  (setf (aref ans2 (1+ idx1))(* 0.25d0 (+ (* rr si)(* ri sr)))))

    ;; this all accomplishes complex multiplication of relevant parts.
    ;; pieces unraveled for speed, instead of consing up complex numbers.
    ans2))

(defun mulfft3(r s)  ;; multiply polynomials r,s using floating FFT
  ;;compute the size Z of the answer. Z is a power of 2.
  ;; compute complex array r, increased to size S
  ;; compute complex array r, increased to size S
  ;; compute two FFTs
  ;; multiply pointwise
  ;; compute inverse FFT  * 1/n
  ;; convert back to array and return answer, coefficients rounded to integers.
  ;; The answer will be only approximate if the coefficients are too large or
  ;; too corrupted by round-off.
  (let* ((lr (length (car r)))
	 (ls (length (car s)))
	 ;; compute deg of r + degree of s +1
	 (lans (+ (aref (car r) (1- lr)) (aref (car s) (1- ls)) 1))
	 (z (ash 1 (ceiling (log lans 2))))) ; round up to power of 2
	 ;;; check the size of fftbuf here..
    (if (< *fftbufsize* (+ z z)) ;; enough for real and imag part 
	(setf *fftbuf* (make-array (setf *fftbufsize* (+ z z) )  ;; real and imag parts
				   :element-type 'double-float
				   :allocation :static-reclaimable
				   )
	      *fftbuf2*  ;; need another one, or could overlap ops in *fftbuf* with some fancier progs.
	      (make-array (setf *fftbufsize* (+ z z) )  ;; real and imag parts
				   :element-type 'double-float
				   :allocation :static-reclaimable
				   )))
    ;; this next line does the whole job.
    (fftiR3(untangleprod2 (fftpoly2 r s z) z))))

(defun fftiR3(input) ;; this is the inverse FFT, retaining only REAL part of answer
  ;; input is array of real, imag, real, imag, ...
  ;; returns array of REAL parts of answer, double-float.

  ;; computes a plan, if there is not one.
  (declare   (type (array double-float (*)) input))
  (let*((len (ash (length input) -1))
	(r input)
	;;(anse (make-array  len :element-type 'fixnum :fill-pointer 0))
	(anse nil)
	;;(ansc (make-array  len  :fill-pointer 0))
	(ansc nil)
	(q  len)
	(k 0))

    (declare (fixnum len k q)  
	     ;;    (type (array double-float (*)) ansc)
	     ;;    (type (array fixnum (*)) anse)
	     (optimize (speed 3)(safety 0)))
    (fftw_execute_dft (fftw_plan len r r 1 64) r r)		  ;; do the FFT
    
    (loop for i fixnum from 0 to (1- (* 2 len)) by 2 do ;; copy real parts into answer
	  (setf k (round (aref r i) q))
	;;  (format t "~% r[i]=~s k=~s " (aref r i) k)
	  (cond((= k 0) nil)
	       (t  (push (ash i -1) anse)
		   (push k ansc))))
    ;;(cons (nreverse anse)(nreverse ansc))
    (cons anse ansc)
    ))


;;maxima version of pieces of this..
#|
;; normalization is 1, 1/n

(slowFFT(x):= block([n:length(x)],
               map(lambda([k], sum(x[j]*exp((-2*%i*%pi*(j-1)*k)/n),j,1,n)),
	       makelist(i,i,0,n-1))),		       

slowFFTi(x):= block([n:length(x)],
               map(lambda([k], 1/n*sum(x[j]*exp((2*%i*%pi*(j-1)*k)/n),j,1,n)),
	       makelist(i,i,0,n-1))),		       

untangle(v):=block([n:length(v)], local(a,b,r),
 r[i]:=v[i+1],
 a[0]:realpart(r[0]),
 b[0]:imagpart(r[0]),
 a[i]:=1/2*(r[i]+conjugate(r[n-i])),
 b[i]:= -%i/2*(r[i]-conjugate(r[n-i])),

  [makelist(a[i],i,0,n-1),
   makelist(b[i],i,0,n-1)]))
 |#

 #| 
(defvar xv (cons (coerce  (loop for i from 0 to 1999 collect i) 'vector)
		 (coerce  (loop for i from 0 to 1999 collect 1) 'vector)))
(defvar xd (cons (coerce  (loop for i from 0 to 1999 collect i) 'vector)
		    (coerce  (loop for i from 0 to 1999 collect 1.0d0) 'vector)))
|#