(eval-when (compile load) (declaim (optimize (speed 3)(safety 0))))
(load "fftwdsimple.fasl")
;;; Used discrete cosine transform from FFTW to multiply polynomials
;;; with real coefficients, in Chebyshev representation.
;;; It seems to work!!

(defvar *fftbufsize* 8) ;grow if necessary.
(defvar *fftbuf* (make-array *fftbufsize* :element-type 'double-float 
			     ;; :allocation :static-reclaimable
			     ))
(defmacro mref(A i)  `(aref ,A ,i))

(defun dctpoly(poly len)
  ;; poly = exps . coefs where exps input is array of exponents in increasing order
  ;; coefs corresponding coefficients
  ;; len is desired length of transform, which is at least as large as degree of poly
  ;; but may be rounded up to next power of 2.
  ;; This program returns an array of double-floats. 
  (let*((exps (car poly))
	(coefs (cdr poly))
	(r *fftbuf*)
	(ans(make-array len :element-type 'double-float))		)
    (declare (fixnum len)
	     (type (array double-float (*)) ans r)
	     (optimize (speed 3)(safety 0)))
    (loop for i fixnum from 0 to (1- len) do (setf (mref r i) 0.0d0))
    (map nil 
      #'(lambda(e c)  (declare (fixnum e))
	  (setf (mref r e)(* c 1.0d0))) ;; normalize later after mult.
      exps coefs)
    ;; compute the plan here since estimate-only will be cheap and not clobber r
   (fftw_execute_r2r (fftw_plan_r2r len r ans (if (evenp len) 4 8) 64) r ans)	      ;; do the iDCT
    ans))

(defun dctr(r) ;; clobbers r. does a dct on input and returns polynomial result
  ;; input is array of real, real, ...
  ;; output is a standard (but chebyshev!!) polynomial  
  (declare (type (array double-float (*)) r) (optimize (speed 3)(safety 0)))
  (let*((len (length r)) 
	(anse (make-array len :element-type 'fixnum :fill-pointer 0)) ;; might end up smaller..
	(ansc (make-array len :fill-pointer 0))
	(q (/ 2.0d0 (* len len))) ;; normalize for iDCT and DCT together
	(k 0.0d0))
    (declare (fixnum len)(double-float q k) (type (array fixnum (*)) anse))
    (fftw_execute_r2r (fftw_plan_r2r len r r (if (evenp len) 5 9) 64) r r)
    (loop for i fixnum from 0 to (1- len)  do 
	  (cond((= (setf k  (mref r i)) 0.0d0) nil)
;	       ((<= (abs k) 1.0d-12) nil) ;; optional check to make answers briefer..
	       (t  (vector-push i anse)
		   (vector-push (* k q) ansc))))
    (cons anse ansc)))

(defun muldct(r s)  ;; multiply polynomials r,s using floating DCT
  ;;compute the size Z of the answer. Z 
  ;; compute two inverseDCTs  (dct-III)
  ;; multiply pointwise
  ;; compute  DCT    (dct-II)
  ;; convert back to pair of arrays of exp and coef and return answer.
  ;; The answer will be only approximate if the coefficients are too large or
  ;; corrupted by round-off. In general the chebyshev coeffs will NOT
  ;; be integers, even if the inputs were.
  (let* ((lr (length (car r)))
	 (ls (length (car s)))
	 ;; compute deg of r + degree of s +1
	 (lans (+ (aref (car r) (1- lr)) (aref (car s) (1- ls)) 1))
	 (z (ash 1 (ceiling (log lans 2)))) ; round up to power of 2
	 ;;  (z lans) ;; smaller transform, maybe faster or not.. gets wrong answers though.
	 )	 
	 ;;; check the size of fftbuf here..
    (if (< *fftbufsize* z) ;; if too small, make it larger
	(setf *fftbuf* (make-array (setf *fftbufsize* z) 
				   :element-type 'double-float
				   ;;:allocation :static-reclaimable
				   )))
    ;; this next line does the whole thing.
    (dctr (proddct (dctpoly r z) (dctpoly s z) z))))

(defun proddct(r s len) ;clobbers r
  ;; r and s are the same length arrays
  ;; compute r:= r*s element-wise
  (declare (type (array double-float (*)) r s) (optimize (speed 3)(safety 0)))
   (dotimes (i len r)
     (setf (aref r i)(* (aref r i)(aref s i)))))

;; test data
#|
(defvar xv (cons (coerce  (loop for i from 0 to 1999 collect i) 'vector)
		 (coerce  (loop for i from 0 to 1999 collect 1) 'vector))) ;mulfft is 35X faster than mul-dense

(defvar xf (cons (coerce  (loop for i from 0 to 1999 collect i) 'vector)
		 (coerce  (loop for i from 0 to 1999 collect 1.0d0) 'vector)))

(defvar yv (cons (coerce  (loop for i from 0 to 5 collect i) 'vector)
		 (coerce  (loop for i from 0 to 5 collect 1) 'vector)))

(defvar xs  (cons (coerce  (loop for i from 0 to 100 collect (* 20 i)) 'vector)
		  (coerce  (loop for i from 0 to 100 collect 1) 'vector))) ;mul-dense is 5X faster squaring


(defvar xs2  (cons (coerce  (loop for i from 0 to 200 collect (* 10 i)) 'vector)
		   (coerce  (loop for i from 0 to 200 collect 1) 'vector))) ; mul-dense is 1.7 times faster


(defvar xs3  (cons (coerce  (loop for i from 0 to 333 collect (* 6 i)) 'vector)
		   (coerce  (loop for i from 0 to 333 collect 1) 'vector))) ; mulfft is 2X faster

(defvar xs4(cons (coerce  (loop for i from 0 to 285 collect (* 7 i)) 'vector)
		   (coerce  (loop for i from 0 to 285 collect 1) 'vector)))   ;; mulfft and mul-dense are about same
(defvar y0 (cons #(0 1 2 3) #(23 0 0 0)))
(defvar y1 (cons #(0 1 2 3) #(23 0 0 1)))
(defvar y2 (cons #(0  3) #(23 1)))
(defvar y3 (cons #(0  100) #(23 1)))

;;;example,  x^2+1  as a chebyshev series is [3,0,1/2].

(setf rr (dct3 #(3.0d0 0.0d0 0.5d0 0.0d0 0.0d0 0.0d0 0.0d0 0.0d0)))

;;is 
#(1.3873009129989156d0 1.1959591968163708d0 0.9253611467432722d0 0.7340194305607273d0 0.7340194305607273d0
  0.9253611467432722d0 1.1959591968163708d0 1.3873009129989156d0)

;; square this by  (map 'array #'* rr rr)

#(1.9246038232076246d0 1.4303184004496585d0 0.8562932519020238d0 0.5387845244406944d0 0.5387845244406944d0
  0.8562932519020238d0 1.4303184004496585d0 1.9246038232076246d0)

;; dropping small terms to zero and computing dct2 ... (stz (dct2 *)) is

#(3.3587572106361017d0 0 1.0606601717798214d0 0 0.08838834764831825d0 0 0 0)
;;multiply by normalization of 1.4142135623730947d0
;;to get
#(4.75d0 0.0d0 1.4999999999999998d0 0.0d0 0.12499999999999971d0 0.0d0 0.0d0 0.0d0)

;;compared to correct answer of #(4.75 0 1.5 0 0.125 0 0 0 )
;;;since (x^2+1)^2 = x^4+2*x^2+1  is 
;;;4.75, 0, 1.5, 0, 0.125

(defvar x21 (cons #(0 2) #(3 1/2)))
(defvar x212 (cons #(0 2 4) #(4.75d0 1.5d0 0.125d0)))

(defun stz(x) (map 'array #'(lambda(r)(if (< (abs r) 1.0d-12) 0 r)) x))

;; another example.
;; 4*x^3+2*x^2-2*x-1/2
;; in cheby form  [1,1,1,1]
;; its square is
[3.5,3.0,2.5,2.0,1.5,1.0,0.5,0.0]

;;as a polynomial it is 
(defvar h1 (cons #(0 1 2 3) #( 1 1 1 1)))

;;and the answer to (muldct h1 h1)  is this, which is pretty close.
(#(0 1 2 3 4 5 6)
 . #(3.500000000000001d0 3.0000000000000004d0 2.5000000000000004d0
     2.0000000000000004d0 1.5000000000000002d0 1.0d0
     0.5000000000000001d0))


|#