(eval-when (compile load) (declaim (optimize (speed 3)(safety 0))))
(load "fftwdsimple.fasl")
;; based on fft7.lisp, but trying to pack 2 input arrays into one array
;; for a complex fft.

;;; mulffts2  WORKS. slow.  in file 2fft.lisp
;;; try to make it fast, now.

(defmacro mref(A i)   `(aref ,A ,i))

;; much of the futzing around here is to help re-use plans for fft.
;; much ot that has been tossed out.. we recompute plans on the fly.

(defvar *fftbufsize* 8)
(defvar *fftbuf* (make-array *fftbufsize* :element-type 'double-float 
			     :allocation :static-reclaimable
			     ))

(defun fftpoly2(poly1 poly2 len)
  ;;poly1, poly2 inputs are pairs of arrays:
  ;; of exponents in increasing order,
  ;; and of coefs corresponding to those exponents.
  ;; len is desired length of transform, which is at least as large as degree of poly1,2
  ;; but may be rounded up to next power of 2, or some other convenient FFT length
  ;; This program returns a transformed sequence suitable to
  ;; send into untangle to get the
  ;; fourier transforms of poly1 and poly2l of length len
  (let*((exps1 (car poly1))
	(coefs1 (cdr poly1))
	(exps2 (car poly2))
	(coefs2 (cdr poly2))
	(e 0)
	;(c 0.0d0)
	(r *fftbuf*) ;; this is known to be at least 2*len in size big enough 
	;; we could do this in r, for this program..
	(ans(make-array (+ len len) :element-type 'double-float :displaced-to r))		)
    (declare (fixnum len e ) (double-float c)
	     (type (array double-float (*)) ans r) 
	     (optimize (speed 3)(safety 0)))
    ;; array is twice the length of the (real) input to make room for real & imag parts
     ;; copy real/imag parts of input into adjacent array locations and pre-normalize.
    ;; Fill with zeros first.
 
    (loop for i fixnum from 0 to (+ len len -1) do (setf (mref r i) 0.0d0))
    ;; next, copy real numbers from poly1
    (loop for i fixnum from 0 to (1- (length exps1)) do
	  (setf e (aref exps1 i))
	  (setf (mref r (+ e e)) (float (mref coefs1 i) 1.0d0)))
    (loop for i fixnum from 0 to (1- (length exps2)) do
	  (setf e (aref exps2 i))
	  (setf (mref r (+ 1 e e)) (float (mref coefs2 i) 1.0d0)))
    ;; we can do the fft plan here since estimate only will be cheap and not clobber r?
    (fftw_execute_dft (fftw_plan len r r -1 64) r r)	      ;; do the FFT
    ans       ))

;; faster yet?

(defun untangleprod2(f NN )  
  ;;; given an array *f*, of complex doubles, untangle to a pair of
  ;;; fourier transforms.
  
  ;;; according to http://www.engineeringproductivitytools.com/stuff/T0001/PT10.HTM
  ;;; with these interleaved data r,s.  fft(k,z) is the kth element of 
  ;;; the array of fft coefs. (but we use 2 spots for real/ imag
  
  ;;;  fft(k,r) =   (fft(k,z)+conj(fft(N-k,z))/2
  ;;;  fft(k,s) =-i*(fft(k,z)-conj(fft(N-k,z))/2
  
  ;;; so ans[q]=r[q]*s[q] is a complex number that looks like

    (declare (type (array double-float (*)) f)
	   (optimize (speed 3)(safety 0)))

    (let*(;(ans (make-array  NN :initial-element #c(0.0d0 0.0d0)  ))
	  (ans2 (make-array  (+ NN NN) :element-type 'double-float :initial-element 0.0d0))
	  (a 0.0d0)(b 0.0d0) (r 0.0d0)(s 0.0d0)  )
      (declare (type (complex double-float) ans2))
;       (format t"~%f=~s" *f*)
      (setf r (aref f 0)
	    s (aref f 1))
      (setf (aref ans2 0) (* r s))
      (setf (aref ans2 1) 0.0d0)

      (loop for k from 1 to  (1- NN ) do 
	    (setf a			;(aref *f* k)
	      (complex (aref f (* 2 k))(aref  f (1+ (* 2 k))))) ;; ok
	      
	    (setf  b 
	      (complex (aref f (*(- NN k) 2)) (- (aref f (+(* 2 (- NN k))  1)))) ;ok
	      )
	    (setf r  (+ a b))
	    (setf s (* #.(complex 0.0d0 -1) (- a b)))
	    (setf (aref ans2 (* 2 k)) (realpart (* 0.25d0 r s)))
	    (setf (aref ans2 (1+ (* 2 k))) (imagpart (* 0.25d0 r s))))

      ans2       ))


(defun mulfft3(r s)  ;; multiply polynomials r,s using floating FFT
  ;;compute the size Z of the answer. Z is a power of 2.
  ;; compute complex array r, increased to size S
  ;; compute complex array r, increased to size S
  ;; compute two FFTs
  ;; multiply pointwise
  ;; compute inverse FFT  * 1/n
  ;; convert back to array and return answer, coefficients rounded to integers
  ;; the answer will be only approximate if the coefficients are too large or
  ;; corrupted by round-off.
  (let* ((lr (length (car r)))
	 (ls (length (car s)))
	 ;; compute deg of r + degree of s +1
	 (lans (+ (aref (car r) (1- lr)) (aref (car s) (1- ls)) 1))
	 (z (ash 1 (ceiling (log lans 2))))) ; round up to power of 2
	 ;;; check the size of fftbuf here..
    (if (< *fftbufsize* (+ z z)) ;; enough for real and imag part 
	(setf *fftbuf* (make-array (setf *fftbufsize* (+ z z) )  ;; real and imag parts
				   :element-type 'double-float
				   :allocation :static-reclaimable
				   )))
    (fftiR3(untangleprod2 (fftpoly2 r s z) z))))


(defun fftiR3(input) ;; this is the inverse FFT, retaining only REAL part of answer
  ;; input is array of real, imag, real, imag, ...
  ;; returns array of REAL parts of answer, double-float.
  ;; computes a plan, if there is not one.
  (let*(				;(len (ash (length input) -1) )
	(len (ash (length input) -1))
;					(r (flatten input))
	;;(r (make-array (length input) :element-type 'double-float :initial-contents input))
	(r input)
	(anse (make-array  len :element-type 'fixnum :fill-pointer 0))
	(ansc (make-array  len  :fill-pointer 0))
	(q  len)
	(k 0))

    (declare (fixnum len k)  
	     (type (array double-float (*)) r)
	     (type (array fixnum (*)) anse)
	     (optimize (speed 3)(safety 0)))
    (fftw_execute_dft (fftw_plan len r r 1 64) r r)		  ;; do the FFT
    
;;    (loop for i fixnum from 0 to (1- len) do (print (aref r i))) ;debug
      
    (loop for i fixnum from 0 to (1- (* 2 len)) by 2 do ;; copy real parts into answer
	  (setf k (round (mref r i) q))
	  (cond((= k 0) nil)
	       (t  (vector-push (ash i -1) anse)
		   (vector-push k ansc))))
    (cons anse ansc)))


;;maxima version


;; normalization should be such that n1*n2 = 1/n.
;; here it is 1/n^2 ??

#|
slowFFT(x):= block([n:length(x)],
               map(lambda([k], 1/n*sum(x[j]*exp((-2*%i*%pi*(j-1)*k)/n),j,1,n)),
	       makelist(i,i,0,n-1)))$		       

slowFFTi(x):= block([n:length(x)],
               map(lambda([k], 1/n*sum(x[j]*exp((2*%i*%pi*(j-1)*k)/n),j,1,n)),
	       makelist(i,i,0,n-1)))$		       


;; normalization is 1, 1/n

(slowFFT(x):= block([n:length(x)],
               map(lambda([k], sum(x[j]*exp((-2*%i*%pi*(j-1)*k)/n),j,1,n)),
	       makelist(i,i,0,n-1))),		       

slowFFTi(x):= block([n:length(x)],
               map(lambda([k], 1/n*sum(x[j]*exp((2*%i*%pi*(j-1)*k)/n),j,1,n)),
	       makelist(i,i,0,n-1))),		       
	       
a1: slowFFT([1+%i,2+3*%i,3+6*%i, 4+7*%i]),
a2: slowFFT([1,2,3,4]),
a3: slowFFT([1,3,6,7]),

untangle(v):=block([n:length(v)], local(a,b,r),
 r[i]:=v[i+1],
 a[0]:realpart(r[0]),
 b[0]:imagpart(r[0]),
 a[i]:=1/2*(r[i]+conjugate(r[n-i])),
 b[i]:= -%i/2*(r[i]-conjugate(r[n-i])),

  [makelist(a[i],i,0,n-1),
   makelist(b[i],i,0,n-1)]))

	       
 |#