;;; -*- Lisp -*-
(eval-when (compile load) (declaim (optimize (speed 3)(safety 0))))
;;; Multiplication POLYALGORITHM

;;; parts needed are in multbyfft.fasl (for four1, fft) 
;;; and polymultx.fasl (for karatsuba etc)
;;; and also newmul.fasl for some older stuff for testing.

(defun maxnorm(a) ;; returns max of abs vals in array a.
  (let ((m 0) (mm 0) (k 0))
        (declare (type (simple-array t (*))a) (integer mm mm k))
    (dotimes (i (length a) m)
      (setf k (aref a i))
      (if (< k mm) (setf mm k m (- k)))
      (if (> k m)  (setf m k mm (- k))))))

(defun sumnorm(a) ;; returns sum of abs vals in array a.
  (let ((sum 0)(k 0))
    (declare (type (simple-array t (*))a) (integer sum))
    (dotimes (i (length a) sum)
      (setf k (aref a i))
      (if (< k 0) (decf sum k)
	(incf sum k)))))


(defun avsp(p)				; average spacing calculation
  (/ (degspan p)
     (max 1 (length (car p))) ));; actual term count (avoid divide by 0 for empty polynomial)

(defun degspan(p)
  (- (aref (car p)(1- (length (car p)))) ;; maxdegree-mindegree+1 = max terms possible
	(aref (car p)0)
	-1.0))


;;avsp 

(defun mtest(n m n2 m2);; test some hypothetical formula for avsp
  (let* ((p (make-racg n m 1))
	 (q (make-racg n2 m2 1))
	 (ap (avsp p))
	 (aq (avsp q))
	 ;(max (max ap aq))	
	 ;(min (min ap aq))
	 ) 
    
    (format t "~% ap= ~s aq=~s ar=~s pred=~s" 
	    ap aq (avsp (mul-naive p q))
	    ;;(* 0.4 (+(sqrt ap)(sqrt aq)))
	    ;; (* 0.6 (sqrt(max ap aq))
	    ;;(* 0.6 (sqrt (/(max ap aq)(min ap aq))))
	    ;;(sqrt (* ap aq))
	    ;;(expt (* 0.5(+ (expt ap 0.25)(expt aq 0.25))) 0.25)
	    ;; (expt (* 0.5(+ (expt max 0.50)(expt min 0.25))) 0.25)
	    ;;	    (expt (* 0.5(+ (expt max 0.20) (expt min 0.1))) 0.3)
	    ;;    (expt (* 0.5(+ (expt max 0.4) min)) 0.3)
	    
	    (sparsity-estimate ap aq n n2 nil nil ) ;guess at avsp for r
		    )))

(defun datagen(termsp m termsq m2)
  (let* ((p (make-racg termsp m 1))
	 (q (make-racg termsq m2 1)))
    (analyze p q)))

(defun datagen2(termsp m termsq m2)
  (let* ((p (make-racg termsp m 1))
	 (q (make-racg termsq m2 1)))
;; square p and q.
    (analyze(mul-hash1 p p)(mul-hash1 q q))))

(defun sq(p )(mul-naive p p))

(defun analyze(p q)
    (let ((r (mul-naive p q)))
    (format t "~%{~s, ~s, ~s, ~s, ~s}"
	    (length (car p))
	    (degspan p)

	    (length (car q))
	    (degspan q)
	    
	    (/ (length (car r))
	       (degspan r)))))


(defun pprof (p)			;profile the polynomial exponent distribution
  (let((a (make-array (1+(aref (car p)(1-(length (car p))))) ;max degree
		      :element-type 'character :initial-element #\ )))
    (dotimes (i (length(car p)) a)
      (setf (aref a (aref (car p) i)) #\*))))

(defun guess (p q)(analyze p q)(format t " -- guess=~s" (sparsity-estimate2 p q)))

(defun sparsity-estimate2(p q) ;; ?? hm. try this
  (if (>(max (degspan p)(degspan q))
	(* (length (car p))(length (car q))) 5) 'sparse<0.2 'dense>0.2))

(defun degof(p)(aref (car p)(1- (length (car p)))))

;; we want to bound maxnorm of p*q.
#| from p and q, compute sumnorm, maxnorm.
The maxnorm(p*q) is <= min(sumnorm(p)*maxnorm(q),sumnorm(q)*maxnorm(p)).

|#

(defun mul-pa(p q)
  ;;poly-algorithm for multiplication of polynomials
  ;; first gather some statistics
  (let* ((np (maxnorm (cdr p)))		; max of abs val of coeffs in p
	 (nq (maxnorm (cdr q)))		; max of abs val of coeffs in q
	 (sp (sumnorm (cdr p)))		; sum of abs vals of coeffs in p
	 (sq (sumnorm (cdr q)))		; sum of abs vals of coeffs in q
	 (degp (degof p))
	 (degq (degof q))
	; (countp (length (car p)))	;number of terms in p
	; (countq (length (car q)))	;number of terms in q
	 (nr (min (* np sq)(* nq sp)))  ;; bound on maxnorm of p*q [achievable]
	 ;(maxdegr (+ degp degq))	; r's largest exponent
	 ;(mindegr (+ (aref (car p) 0)	; r's minimum exponent
	;	     (aref (car p) 0))
	#+ignore (se (sparsity-estimate degp degq ; degrees of input
				countp countq ; number of terms..
				maxdegr mindegr))
	 (nr 0)
	 (nr2 0)
	) 				;compute guess at avsp for r
    
    ;; We should test for different common coefficient domains, 
    ;; e.g. double-float,  rational, etc. and use different programs.
    
    
    ;; Here we assume that the choices are fixnums (abs val less than 2^30 or so?)
    ;; or bignums (any integer). We test for exponents being fixnums, or
    ;; exponents being bignums, unlikely unless encoding several variables packed
    ;; in exponents.
    
    ;; we could also test (first) about multivariate.
    
    ;; it is possible, and advantageous, to pack multiple coefficients into 
    ;; single words if the coefficients are small, e.g. integers mod 5.
    ;; then add and multiply require some extra steps for computing remainder.
    ;; Maybe better then is to pack the whole problem into a single bignum multiply.
    
    
    
    ;; Maybe first check for small cases that can be done naively in fixnum, dense?
    ;; The fastest program is mul-dense-fix if the inputs are less than 2/3 zeros and
    ;; the result coefficients are fixnum size, for inputs of size 10,000X5,000
    ;; double-float?
    ;;; Maybe first check for small cases that can be done naively in double-float?
    ;; Maybe then dense larger cases that can be done in fft double-float?
    ;; maybe then dense cases that can be done with karatsuba + naive  over integers?
    ;; maybe then sparse case
    ;; Can do some cases in double-float. Can we prove the answer is right even over
    ;; the integers?
    
    ;; note that we have an arbitrary precision fft in mpfr.lisp www...~fateman/generic/mpfr.lisp
    (declare (optimize (speed 3)(safety 0)))

    (format t "~% np ~s nq ~s sp ~s sq ~s nr ~s nr2 ~s nr3 ~s realnr ~s" np nq sp sq nr nr2 nr3 (maxnorm (cdr (mul-naive p q)))) ;; check answer!!
    
					;        (format t "~% np ~s nq ~s sp ~s sq ~s nr ~s nr2 ~s nr3 ~s" np nq sp sq nr nr2 nr3)
    
   #+ignore
    
   (cond ((typep maxdegr 'fixnum)
	  ;; otherwise the exponents are bignums, what-to-do?
	  ((typep nr 'fixnum) ;; We can use FIXNUM ONLY methods
	   
	   ;; BOTH exponents and coefficients are FIXNUMS
	   
	   ;; is it dense enough, small enough?
	   ;; copy (the smaller) poly into an array and do it that way.
	   
	   ;;....
	   )
	  (t ;; the exponents are fixnums, the coefs may not be fixnums
	   
	   
	   (cond((< se .2)  ;; a guess
		 (cond ((< degr 5000) ;; a guess
			(mul-dense p q))
		       (t (mul-fft p q))))

		((and (> avspp 0.99)
	     (> avspq 0.99))(mul-dense p q)) ;; dense case, just multiply densely
	      
	      ((fixnump maxdegr) ;; if max exponent in answer is a fixnum
	  (cond ((fixnump nr);; and if max coefficient in answer is fixnum
		 (cond ((< se 1.01)(mul-fftxx p q)) ;very dense, small coeffs, use fft
		       ((> se 1.20) (mul-heap5xx p q)) ;sparse version, use heap fix fix
		       (t (mul-naivexx p q)))) ;dense mostly version use naive fix fix
		
		;; max coefficient is not guaranteed to be a fixnum. Can't use simple FFT.
		(t (cond ((> se 1.20) (mul-heap5xb p q)) ;sparse version, use heap fix big
			 (t (mul-naivexb p q)))))) ; dense version use heap fix bignum
	  (t  
	   ;; max exponent in answer is a bignum! It has to be sparse or it
	   ;; won't fit in memory!
	   (mul-heap5 p q))))		;sparse version, use heap bignum bignum
	   )
	 
	 

	 
	 (t ; the max degree is NOT a fixnum.
	 ;; Try using mul-seg, but with which underlying program??	  
	  (mul-seg (segbyexp p)(segbyexp q) :mulfun '???)))
   ))

    

;; This is the place we need some formula  for estimated sparsity. I just
;; made this up.  
;; (sparsity-estimate 1.0 1.0 ...) should be 1.0
;; monotonically increasing in avspp and avspq
;; easy to generate cases by randomly generated polynomials in newmulpa file.
;;
(defun sparsity-estimate(avspp avspq termsp termsq maxdegr mindegr)
  ;; average spacing for p, q; maximum and minimum degree for answer r
   ;; some formula I made up... 
  (expt (* 0.5(+ (expt (max avspp avspq) 0.35)(expt (min avspp avspq) 0.2))) 0.3)
  
  )

	

(defun tcount(n sep) ;n is number of terms.
   (let ((p (make-racg n sep 1)))
     (length(car (mul-naive p p)))))

(defun runcount(k m);; do 10^0, 10^1 ... 10^k separation, poly of size m, e.g. 1000
  
  (dotimes (i k)
    (let* ((sep (expt 10 i))
	   (p (make-racg m sep 1))
	   (q (make-racg m sep 1))
	   (r (mul-naive p q))
	   (mindegr (* 1.0(aref (car r)0)))
	   (maxdegr (aref (car r) (1- m)))
	   (avsp (/(- maxdegr mindegr)(length (car r))))
	   (addspermult (/  (* m m 1.0)(length (car r)))))

      (format t "~% sep=~s ~30t avsp=~s ~70t adds/mult=~s" sep avsp addspermult)
      (if (<= addspermult 1.00005) (return 'cut-off)))))

(defun runcounth(k m);; do 10^0, 10^1 ... 10^k separation, poly of size m, e.g. 1000
  
  (dotimes (i k)
    (let* ((sep (expt 10 i))
	   (p (make-racg m sep 1))
	   (q (make-racg (ash m -1) sep 1)) ;; half the size
	   (r (mul-naive p q))
	   (mindegr (* 1.0(aref (car r)0)))
	   (maxdegr (aref (car r) (1- m)))
	   (avsp (/(- maxdegr mindegr)(length (car r))))
	   (addspermult (/  (* m m 0.5)(length (car r)))))

      (format t "~% sep=~s ~30t avsp=~s ~70t adds/mult=~s" sep avsp addspermult)
      (if (<= addspermult 1.00005) (return 'cut-off)))))





	 
    
  
;;;;; excerpt from newmul with Roman's version of chaining, sort of.
(eval-when (compile load) (declaim (optimize (speed 3)(safety 0))))
;;; this tries to do f:=f+a*b  instead of c:=a*b; f:=f+c. could be faster with GMP.

;;; attempt to shorten because the "insert at top" heuristic doesn't really work often enough.
  
(defmacro heap-empty-p ()
    "Returns non-NIL if & only if the heap contains no items."
    `(<= fill-pointer 1))

(defmacro lesser-child (a parent fp)
    "Return the index of the lesser child.  If there's one child,
 return its index.  If there are no children, return 
 (FILL-POINTER (HEAP-A HEAP))."
    `(let* ((left;;(+ (the fixnum ,parent)(the fixnum  ,parent))
	     (ash ,parent 1))
	    (right (1+ (the fixnum left))))
       (declare (fixnum left right ,fp ,parent )
		(optimize(speed 3)(safety 0))
		(type (simple-array t (*)) ,a))
       (cond ((>= left ,fp) ,fp)
	     ((= right ,fp) left)
	     ;;; use the line below if exponents are fixnums!!
	     ((< (the fixnum (st-exp (aref ,a left)))(the fixnum (st-exp (aref ,a right)))) left)
	     ;; ((< (st-exp (aref ,a left))(st-exp (aref ,a right))) left) ;more general
	     (t right))))

;; put together in a lexical context to make access faster to global vars
(let ((*YYEXPS* nil)
      (*YYCOEFS* nil)
      (*YYLEN* 0)
      (fill-pointer 0))
  (declare (fixnum *YYLEN* fill-pointer)
	   (type (simple-array t (*)) *YYEXPS* *YYCOEFS*))

  (defstruct (st) 
    ;; A kind of stream.  each structure contains 6 items, not 5
    ;; which by reference to arrays of XX and YY , can burp out
    ;; successive exponents and coefficients of the product.
  
    (exp 0 :type integer)
    (coef1 0 :type integer)  ;; could be doubles, or gmp numbers or ..
    (coef2 1 :type integer)
    (i 0 :type fixnum)  
    (v 0 :type integer) (e 0 :type integer)); v, e are read-only 

  (defun make-simple-array (A) (if (typep A 'simple-array) A 
				 (make-array (length A)
					     :element-type (array-element-type A)
					     :initial-contents A)))
  
  (defun mularZ4(x y)  ;; set up initial heap.
    (declare (optimize (safety 3)(speed 0)))
    (let* ((xexps (car x))
	   (xcoefs (cdr x))
	   (yexps (make-simple-array (car y)))
	   (ycoefs(make-simple-array (cdr y)))
	   (yexp0 (aref yexps 0))
	   (ycoef0 (aref ycoefs 0))
	   (ans (make-array (1+(length xexps))))
	   (ylen (length (car y)))
	   (xe 0)
	   (v 0))			;
      (declare (fixnum i) (type (simple-array t (*))
				ans xexps xcoefs yexps ycoefs))
      (setf *YYEXPS* yexps *YYCOEFS* ycoefs *YYLEN* ylen)
      (setf (aref ans 0) (make-st :exp 0 :coef1 0 :coef2 1 :i ylen :v 0 :e 0))
      (dotimes (j (length xexps) ans)
	(setf v (aref xcoefs j))
	(setf xe (aref xexps j))
	(setf (aref ans (1+ j)) 
	  (make-st :exp (+ xe yexp0) :coef1 (* v ycoef0) :i 1 :v v :e xe) ))))

  (defun percolate-down4 (aa hole xx fp)
    "Private. Move the HOLE down until it's in a location suitable for X.
Return the new index of the hole."

    (let ((xxval (st-exp xx)))
      (declare (fixnum hole fp)(integer xxval)
	       (type (simple-array t (*)) aa)(optimize (speed 3)(safety 0)))
      (do* ((child (lesser-child aa hole fp) (lesser-child aa hole fp))
	    (aaval (aref aa child)(aref aa child)))
	  ((or (>= (the fixnum child) fp) 
	       (< xxval (the fixnum(st-exp aaval))) ;faster, less general
	    ;;   (< xxval (st-exp aaval))
	       )
	   hole)
	(declare (fixnum child) (type st aaval))
	(setf (aref aa hole)  aaval  hole child))))
  
  ;; important: initial-contents of heap must have an element 0 that
  ;; is not really used. for convenience we could have it equal to the first element.
  ;; e.g.  (3 5 8)  should be (3 3 5 8)
    
  (defun heap-insert4 (a xx);; xx is new element
    "Insert a new element into the heap. Heap is changed";; rjf
    (declare (type (simple-array t (*)) a)(fixnum fill-pointer)
	     (integer newe)
	     (optimize (speed 3)(safety 0)))
    (let ((c nil)			; potential chain point
	  (fp (incf fill-pointer)) )
      (declare (type (simple-array t (*)) a)(fixnum hole fp)
	       (optimize (speed 3)(safety 0)))

      (decf fp)
      (setf (aref a  fp) xx) 

      (do ((i fp parent)
	   (parent (ash fp -1);; initialize to parent of fp
		   (ash parent -1);; and its parent, next time
		   ))
	  ((>= (st-exp xx) (st-exp(aref a parent)));; is xx>= parent? if so, exit
	   a);; inserted the item at location i.
	(declare (fixnum i parent))
	(setf c (aref a parent))
	(cond
	 ((= (st-exp xx) (st-exp c));; aha, we can insert this item by chaining here.
	  (incf (st-coef c) (* (st-coef1 xx) (st-coef2 xx)))
	  ;; Line above is one op-code in GMP, using no persistent storage outside c...
					;(format t "~%              to ~s " c)
					;	  (format t "~% heap is ~s" a)
	  (setf (aref a fill-pointer) nil)
	  (decf fill-pointer);; we don't need the extra heap spot for this item 
	  (if (setf xx  (next-term xx))	;try inserting the successor to this item then.
	      (heap-insert4  a  xx))	
	  (return a)) ;exit from do loop
	 (t  (setf (aref a i) c 
		   (aref a parent) xx) a)))))
	
  (defun heap-remove4(a) 
    ;; returns nil if the heap is empty, otherwise the removed item is returned
    (declare(type (simple-array t(*)) a)(optimize (speed 3)(safety 0)))
    (if (heap-empty-p) nil
        (let*  ((xx (aref a 1))
		(fp (decf fill-pointer)) ;faster, fp is local now
		(last-object (aref a fp)))
	  (declare (fixnum fp))
	  (setf (aref a fp) nil)  ;; optional. makes tracing neater
	  (setf (aref a (percolate-down4 a 1 last-object fp)) last-object)
	  xx)))
  
   ;; this is the multiplication program, eliminated: peeking at top
 (defun mul-heap5(x y)
    (declare (optimize(speed 3)(safety 0)))
    (let* ((hh (if (< (length (car x))(length (car y)))(mularZ4 x y)(mularZ4 y x)))
	   (alen  10)
	   ;; initial heap
	   (anse (make-array alen :adjustable t :element-type 'integer :fill-pointer 0))
	   (ansc (make-array alen :adjustable t :element-type 'integer :fill-pointer 0))
	   (preve 0) (prevc 0)
	   (top nil)
	   (newe 0)(newc1 0) (newc2 1) )
      (declare (integer preve prevc newe newc1 newc2) 
	       (type (array t(*)) anse ansc hh))
      
      (setf fill-pointer  (length hh))
      (loop while  (not (heap-empty-p))
	    do
	    (setf top (heap-remove4 hh))
	    (setf newe (st-exp top) newc1 (st-coef1 top) newc2 (st-coef2 top))
	    (cond   ((= preve newe)		; is this same exponent as previous?
		     (incf prevc (* newc1 newc2))) ; if so, add coefficient, and loop
		    (t (unless (= prevc 0) ; normalize: remove zero coeffs
			 (vector-push-extend prevc ansc)
			 (vector-push-extend preve anse))
		       (setf prevc (* newc1 newc2)) ; initialize new coefficient
		       (setf preve newe)))
	    
	    (if (setf top (next-term top));; is there a successor?
		(heap-insert4 hh top)));; if so, insert it in heap
      ;; end of loop, push out the final monomial before exit from routine.
      (vector-push-extend prevc ansc)
      (vector-push-extend preve anse)
      (cons anse ansc)))
 
 (defun next-term(rec)   	; record is a stream 5-tuple  return succesor or nil
    (declare(type (simple-array t(*)) *YYCOEFS* *YYEXPS*)
	    (optimize (speed 3)(safety 0)) 
	    (type st rec))
    (let ((i (st-i rec)))
      (cond ((>= i *YYLEN*) nil);; no more items on stream of products
	    (t (incf (the integer (st-i rec)))
	      ;;; (setf (st-coef rec)(* (st-v rec)(aref *YYCOEFS* i)))
	       
	       (setf (st-coef1 rec)(st-v rec))
	       (setf (st-coef2 rec)(aref *YYCOEFS* i)) ;; don't multiply yet, save for later
	       ;; later we can do  c:=c+a*b in one fell swoop.
	       	       
	       (setf (st-exp rec) 
		     (+ (st-e rec)(aref *YYEXPS* i)))
	       rec))))
 
 (defun next-term-peek(rec)   	; record is a stream 5-tuple  return next expon or nil
    (declare(type (simple-array t(*)) *YYCOEFS* *YYEXPS*)
	    (optimize (speed 3)(safety 0)) 
	    (type st rec))
    (let ((i (st-i rec)))
      (cond ((>= i *YYLEN*) nil);; no more items on stream of products
	    (t  (+ (st-e rec)(aref *YYEXPS* i))))))
 
 
 
 
 )






(defun makesq (n m) (let* ((p (make-racg n m 1))
			   (p2 (mul-naive p p))
			   (terms (length(car p2)))
			   (ds (degspan p2))
			   (avsp (/ ds terms)))
		      (format t "~% terms= ~s degspan=~s avsp =~s" terms ds avsp)
		      p2))


(defun dumbtestsq()
  (dotimes (i 3)
    (dotimes (j 3)
      (dotimes (k 6)
	(dotimes (l 6)
	  (if (< i j)
	      (guess (sq (make-racg (expt 10 (1+ i))   (expt 10 k) 1))
		     (sq (make-racg  (expt 10 (1+ j))  (expt 10 l) 1)))))))))



(defun PA2D (p) 
  ;;Pair of Arrays to Dense single array

  ;; L is a LIST of (coef coef coef ...), implicit exponents (0 1 2 ...)
  ;; create a pair: one array of exponents and one of coefficients.
  (let* ((e (car p))
	 (c (cdr p))
	 (n   (1+(aref e(1-(length e))))); the necessary size of array
	 (ans  (make-array n :initial-element 0) ))
    (declare (fixnum n)(type (simple-array t (*)) ans e c))
    (do ((i (1- (length e))  (1- i)))
	((< i 0) ans)
      (declare (fixnum i))
      (setf (aref ans (aref e i))(aref c i)))))





;; works

(defun mul-fftw(p q)
  (DA2PA (ptimesfft(PA2D p)(PA2D q))))

;; try to streamline multiple conversions , rewriting ptimes-fft. works
(defun mul-fft(r s)
  (let* ((er (car r))
	 (es (car s))
	 (rdeg (aref er(1- (length er))))
	 (sdeg (aref es(1- (length es))))
	 (lans (+ rdeg sdeg -1))
	 (z (ash 1 (ceiling (log lans 2)))) ;round up to power of 2
	 ;;(z (ash 1 (1+(ceiling (log lans 2))))) ;round up to power of 2
	 (rfft (four1 (pa2v r z) z))
	 (sfft (four1 (pa2v s z) z)))
 ;;   (setf *r* rfft)
    ;; someday, fix dfa2pa
;;     (dfa2pa   (four1 (prodarray rfft sfft z) z :isign -1) ) ;; wrong
     
  (DA2PA   (dfa2v (four1 (prodarray rfft sfft z) z :isign -1) z))
  ))

;; works also, but slower???
(defun mul-fftd(r s)  ;; using one step to convert
  (let* ((er (car r))
	 (es (car s))
	 (rdeg (aref er(1- (length er)))) ;; highest degree exponent of input r
	 (sdeg (aref es(1- (length es)))) ;; highest degree exponent of input s
	 (tdeg (+ rdeg sdeg)) ;; highest degree exponent of t=r*s 
	 ;; [assuming a!=0, b!=0 --> a*b !=0]
	 (z (ash 1 (ceiling (log tdeg 2)))) ;round up to power of 2, 
	 ;;(z (ash 1 (1+(ceiling (log tdeg 2))))) ;round up to power of 2, 
	 (rfft (four1 (pa2v r z) z))
	 (sfft (four1 (pa2v s z) z))
	)
    (dfa2pa   (four1 (prodarray rfft sfft (* 2 z)) z :isign -1)  (1+ tdeg)) ;; wrong?
     
    ))


(defun dfa2pa (a tdeg)			;double float array to pair of arrays
  (let* (
					;(k (coerce (ash (length a) -1) 'double-float))
	 (k (coerce (ash (length a)-2) 'double-float))
	 (lim 0)
	 (tf 0.0d0))

    (declare (type (simple-array double-float (*)) a)
	     (fixnum tdeg lim)(fixnum q)(double-float k tf))
;;    (format t "~% length a =~s, tdeg=~s" (length a)tdeg)
    (setf tdeg (1- (* 2 tdeg)))
    (do ((i 0 (+ i 2)))
	((>= i  tdeg) nil)
      (declare (fixnum i))
      (setf (aref a i) (setf tf (/ (the double-float (aref a i)) k)))
      (unless  (<= -0.5d0  tf 0.5d0)(incf lim)))
    ;; at this point we know how many elements in the result
;  (format t "~%lim=~s ~%a=~s~%length a ~a" lim a (length a))
  (let ((anse (make-array lim))
	  (ansc (make-array lim))
	  (j 0)
	  (c 0)
	  (limm1 (1- lim)))
    (declare (optimize (speed 3)(safety 0))
	     (fixnum limm1 j)(integer c) (type (simple-array t (*)) anse ansc))
      (do 
	  ((i 0 (+ i 2)))
	  ((> j limm1)  (cons anse ansc))
	(declare (fixnum i))
	(setf c (round (aref a i)))
	(unless  (= c 0)
	  (setf (aref anse j) (ash i -1));; should do this exactly lim times.
	  (setf (aref ansc j) c)
	  (incf j))
	;(format t "~%i=~s, j=~s c=~s"  i j c)
	))))


(defun mul-karat(p q)
  (DA2PA (polymultk2 (PA2D p)(PA2D q))))

(defun mul-toom(p q)
  (DA2PA (polymulttc (PA2D p)(PA2D q))))

(defun mul-toom2(p q) ;faster. maybe wrong?
   (DA2PA (polymulttc2 (PA2D p)(PA2D q))))


(defun pa2v (p size) 
  ;;Pair of Arrays to Dense double-float array , "complex"
  (let* ((e (car p))
	 (c (cdr p))
	 (lim (length (car p)))
	 (ans  (make-array  (ash size 1) :initial-element 0.0d0 :element-type 'double-float) ))
    (declare (fixnum n lim)(type (simple-array double-float (*)) ans)
	     (type (simple-array t (*)) e c))
    (do ((i 0 (+ i 1)))
	((= i lim) ans)
      (declare (fixnum i))
      (setf (aref ans  (ash (aref e i) 1))
	(coerce (aref c i) 'double-float)))))


(defun how-acc(size h)
  
  ;; how accurate is the fft answer?
 (let ((u nil)(ans nil) (test nil))
  (dotimes (i 31)

    (setf u (make-racg size h (expt 2 i)))
    ;; (setf ans (mul-karat u u))
    (setf ans (mul-naive u u))
    (setf test (pol= ans (mul-fft u u)))
    (format t"~%i=~s bits, log2 maxnorm is ~s, answer is ~s"  
	    i (log (max 1 (maxnorm (cdr ans))) 2) test)
    (unless test (return 'done)))))
    

(defun make-racgs(n d c) 
    ;; make Random Array of increasing exponents of length n with
    ;; spacing k, with 0<k<=d between elements and with Coefficients that
    ;; are random integers between 1 and c. d may be, in General, arbitrary precision.
    (let ((exps (make-array n))
	  (coefs(make-array n))
	  (v (random d)))
      (dotimes (i n (cons exps coefs))
	(setf (aref coefs i)
	      (-  (random c) (ash c -1))) ;; signs vary
	(setf (aref exps i)v)
	(setf v (+ 1 v (random d))))))


;;; use destructuring-lambda to save program text space
(defmacro with-pairs2 (a1 a2 bind1 bind2 body)
  `(let ,(append 
	  `((,(car bind1) (car ,a1)))
	  `((,(cdr bind1)(cdr ,a1)))
	  `((,(car bind2) (car ,a2)))
	  `((,(cdr bind2)(cdr ,a2))))
     ,body))

#+ignore
(defun mul-naivedf(r s)  ;; same as mul-dense but using doubles.
  ;;exponents are fixnums, coeffs fit in double-float
  ;; copy one input into double-float-array
  ;; produce answer into DENSE double-float-array.
  ;; then copy into sparse standard form.
  (if (< (length (car r))(length (car s))) ; reverse args if r is shorter
      (mul-naivedf s r)

  (with-pairs2 r s (er . cr)(es . cs)
    (let* ((cri 0.0d0)
	   (eri 0) (esj 0)
	   (ler (length er))
	   (les (length es))
	   (rdeg (aref er(1- ler))) ;; highest degree exponent of input r
	   (sdeg (aref es(1- les))) ;; highest degree exponent of input s
	   (tdeg (+ rdeg sdeg)) ;; highest degree exponent of t=r*s 
	   (sa (make-array les :element-type  'double-float))
	   (ans (make-array (1+ tdeg) :element-type 
			    'double-float :initial-element 0.0d0))	   )
      (declare (type (simple-array double-float (*)) sa ans)
	       (type (simple-array t (*)) cr cs er es)
	       (fixnum eri ler les esj)(double-float cri))
	  ;; convert 2nd argument to an array of double-floats
	  (dotimes (i les)
	    (declare (fixnum i))
	    (setf (aref sa i)(coerce (aref cs i) 'double-float)))
	  ;; do the NXM multiplies into the answer array
	  (dotimes (i ler (dfaX2pa ans))
	    (declare (fixnum i))
	    (setf cri (coerce (aref cr i) 'double-float)) ; should never be 0.
	    (setf eri(aref er i))
	      (dotimes (j (length es))
		(declare (fixnum j))
		(setf esj (+ eri(aref es j)))
		(incf (the double-float (aref ans esj))
		      (* (the double-float (aref sa j)) cri))))))))


#+ignore
(defun mul-naivedfx(r s) 
  ;;exponents are fixnums, coeffs are fixnums
  ;; produce answer into DENSE fixnum array
  ;; then copy into sparse standard form.
  (if (< (length (car r))(length (car s))); reverse args if r is shorter
      (mul-naivedfx s r)
    (with-pairs2 r s (er . cr)(es . cs)
		 (let* ((cri 0)
			(eri 0)
			(ler (length er))
			(les (length es))
			(rdeg (aref er(1- ler)));; highest degree exponent of input r
			(sdeg (aref es(1- les)));; highest degree exponent of input s
			(tdeg (+ rdeg sdeg));; highest degree exponent of t=r*s 
			(ans (make-array (1+ tdeg) :element-type 
					 'fixnum :initial-element 0))	   )
		   (declare (type (simple-array fixnum (*)) ans)
			    (type (simple-array fixnum (*)) cr cs er es)
			    (fixnum eri ler les esj cri))
		   ;; do the NXM multiplies into the answer array
		   (dotimes (i ler (DA2PA ans))
		   (declare (fixnum i))
		   (setf cri (aref cr i) eri(aref er i))
		   (dotimes (j les)
		     (declare (fixnum j))
		     (incf (the fixnum (aref ans (+ eri (aref es j))))
			   (* (the fixnum (aref cs j)) (the fixnum cri)))))))))

(defun dX (a)		
  (let ((anse nil)(ansc nil)
	 (nzcount 0)
	 (j 0))
    (declare (type (simple-array fixnum (*)) a)
	     (fixnum nzcount lim j))
    

    (dotimes (i (length a))
      (unless (= 0.0d0 (aref a i))(incf nzcount)))

    (setf anse (make-array nzcount :element-type 'fixnum))
    (setf ansc (make-array nzcount :element-type 'fixnum))

    (dotimes (i (length a) (cons anse ansc))
      (unless (= 0 (aref a i))
	(setf (aref anse j) i)
	(setf (aref ansc j) (aref a i))
	(incf j)))))



(defun dfaX2pa (a)			;double float array to pair of arrays
  (let ((anse nil)(ansc nil)
	 (nzcount 0)
	 (j 0))
    (declare (type (simple-array double-float (*)) a)
	     (fixnum nzcount lim j)(fixnum q)(double-float k tf))

    (dotimes (i (length a))
      (unless (= 0.0d0 (aref a i))(incf nzcount)))

    (setf anse (make-array nzcount :element-type 'fixnum))
    (setf ansc (make-array nzcount :element-type 'integer))

    (dotimes (i (length a) (cons anse ansc))
      (unless (= 0.0d0 (aref a i))
	(setf (aref anse j) i)
	(setf (aref ansc j)(round (aref a i)))
	(incf j)))))

;;; if we just use integers...


#+ignore
(defun mul-naivex(r s)	 ;;; about 4X slower than with double-floats.
  
  ;;exponents are fixnums, coeffs fit in double-float
  ;; copy one input into double-float-array dense!
  ;; produce answer into DENSE double-float-array.
  ;; then copy into sparse standard form.

    (let* ((er (car r))
	   (es (car s))
	   (cr (cdr r))
	   (cs (cdr s))
	   (cri 0.0d0)
	   (eri 0)
	   (rdeg (aref er(1- (length er)))) ;; highest degree exponent of input r
	   (sdeg (aref es(1- (length es)))) ;; highest degree exponent of input s
	   (tdeg (+ rdeg sdeg)) ;; highest degree exponent of t=r*s 
	   (sa (make-array (1+ sdeg) :initial-element 0)) ;; dense!
	   (ans (make-array (1+ tdeg) :initial-element 0))	   )
          (declare (type (simple-array t (*)) sa ans)
		   (type (simple-array t (*)) cr cs er es)
		   (fixnum eri)(integer cri))
	  (do ((i (1- (length er))(1- i)))
	      ((< i 0))
	    (declare (fixnum i))
	    (setf (aref sa (aref er i))(aref cs i)))
	  (dotimes (i (length er) (dX ans))
	    (declare (fixnum i))
	    (setf eri(aref er i))
	    (setf cri  (aref cr i))
	    (unless (= 0 cri)
	      (dotimes (j (length er))
		(declare (fixnum j))
		(incf (aref ans (+ eri j))(* (aref sa j) cri)))))))

#+ignore
(defun mul-naivexx(r s)
  ;; about 4X slower than with double-floats. fixnums are just
  ;; not as fast here!
    (let* ((er (car r))
	   (es (car s))
	   (cr (cdr r))
	   (cs (cdr s))
	   (cri 0)
	   (eri 0)
	   (rdeg (aref er(1- (length er)))) ;; highest degree exponent of input r
	   (sdeg (aref es(1- (length es)))) ;; highest degree exponent of input s
	   (tdeg (+ rdeg sdeg)) ;; highest degree exponent of t=r*s 
	   (sa (make-array (1+ sdeg) :initial-element 0 :element-type 'fixnum))
	   (ans (make-array (1+ tdeg) :initial-element 0 :element-type 'fixnum))	   )
          (declare (type (simple-array fixnum (*)) sa ans)
		   (type (simple-array t (*)) cr cs er es)
		   (fixnum eri)(fixnum cri))
	  (do ((i (1- (length er))(1- i)))
	      ((< i 0))
	    (declare (fixnum i))
	    (setf (aref sa (aref er i))(aref cs i)))
	  (dotimes (i (length er) (dX ans))
	    (declare (fixnum i))
	    (unless (= 0 (setf cri  (aref cr i)))
	      (setf eri(aref er i))
	      (dotimes (j (length er))
		(declare (fixnum j))
		(incf (the fixnum (aref ans (the fixnum (+ eri j))))(* (the fixnum(aref sa j))
							 (the fixnum cri))))))))


;; looking at doing modular arithmetic, CRA,Garner's Alg.
;; code from macsyma/maxima  rat3a.
;;; maybe we can do it all in floats.

(defun crecip (n modulus)
  (cond
    ;; Have to use bignum arithmetic if modulus is a bignum
    ((not (fixnump modulus) )
     (prog (a1 a2 y1 y2 q (big-n n))
	(if (minusp big-n) (incf big-n  modulus))
	(setq a1 modulus a2 big-n)
	(setq y1 0 y2 1)
	(go step3)
	step2 (setq q (truncate a1 a2))
	(psetq a1 a2 a2 (- a1 (* a2 q)))
	(psetq y1 y2 y2 (- y1 (* y2 q)))
	step3 (cond ((zerop a2) (error "Inverse of zero divisor?"))
		    ((not (= a2 1)) (go step2)))
	(return y2 )))
    ;; Here we can use fixnum arithmetic
    (t (prog ((a1 0) (a2 0) (y1 0) (y2 0) (q 0) (nn 0) (pp 0))
	  (declare (fixnum a1 a2 y1 y2 q nn pp))
	  (setq nn n pp modulus)
	  (cond ((minusp nn) (incf nn pp)))
	  (setq a1 pp a2 nn)
	  (setq y1 0 y2 1)
	  (go step3)
	  step2 (setq q (truncate a1 a2))
	  (psetq a1 a2 a2 (rem a1 a2))
	  (psetq y1 y2 y2 (- y1 (* y2 q)))
	  step3 (cond ((= a2 0) (error "Inverse of zero divisor?"))
		      ((not (= a2 1)) (go step2)))
	 (return  y2;(cmod y2)?
		  )))))

;; balanced modular representation.
;; let modulus be an odd prime. the numbers used are
;; - (m-1)/2 .... 0 .... (m-1)/2
;;e.g  for m=7, the numbers are the 7 representatives: -3 -2 -1 0 1 2 3

(defparameter *modulus* 7.0d0)
(defparameter *hmod* (* 0.5d0 (1- *modulus*)))
(defparameter *mhmod* (- *hmod*))
;;

(defun fmod(f &optional (modulus *modulus*)(hmod *hmod*)(mhmod  *mhmod*))
    (declare (double-float f modulus hmod mhmod))
  (cond ((>  (setf f (fpint (rem f modulus))) hmod) (- f modulus))
	((< f mhmod)(+ f modulus))
	(t f)))

(defun fmod+(f g &optional (modulus *modulus*)(hmod *hmod*)(mhmod  *mhmod*))
    (declare (double-float f modulus hmod mhmod))
   (cond ((>  (setf f (+ f g)) hmod) (- f modulus))
	((< f mhmod)(+ f modulus))
	(t f)))

(defun fmod*(f g)(fmod (* f g)))
  

(defun fpint(f)  ;; integer closest to the float f.
  
  (declare(double-float f))
  (if (> f 0.0d0) (fptrunc (+ 0.5d0 f) bitsarray)
     (fptrunc (+ f -0.5d0) bitsarray)))

;; 20 primes less than 2^48 = 281474976710656
(defparameter primes48 
    '(281474976710087 281474976710087 281474976710089 281474976710107 
 281474976710129 281474976710131 281474976710143 281474976710197 
 281474976710287 281474976710327 281474976710339 281474976710399 
 281474976710413 281474976710423 281474976710467 281474976710491 
 281474976710509 281474976710563 281474976710567 281474976710591))
;; their product is about 9.745314011165468d+288


(defmacro sf(a b c d)`(excl::shorts-to-double-float  ,a ,b ,c ,d))
(defmacro fs(f)`(excl::double-float-to-shorts  ,f))

(defparameter bitsarray (make-array 16 :element-type '(unsigned-byte 16)
			      :initial-contents
			      '(#b1000000000000000
				#b1100000000000000 
				#b1110000000000000
				#b1111000000000000
				#b1111100000000000
				#b1111110000000000
				#b1111111000000000
				#b1111111100000000
				#b1111111110000000
				#b1111111111000000
				#b1111111111100000
				#b1111111111110000
				#b1111111111111000
				#b1111111111111100
				#b1111111111111110
				#b1111111111111111)))


#+ignore ;; not bad, but fptrunc4 is a little faster.
(defun fptrunc3(f ba)  ;; double fp truncate to an integer closer to 0?
  (declare    (type (simple-array (unsigned-byte 16) (16)) ba)
	      (optimize (speed 3)(safety 0)))
					; (:explain :types :inlining)
  (multiple-value-bind (hwhi hwlo lwhi lwlo)
      (excl::double-float-to-shorts f)
    (declare (type (unsigned-byte 16) hwhi hwlo lwhi lwlo) )
    (let (( exp(- (ash (logand 65520 hwhi) -4)
		  (if (> hwhi 32775) 3071 1023)))) ;; pick out the exponent abs val.
      (declare (fixnum exp))
   ;;   (format t "~%exp=~s"exp)
    (cond ((>= exp 52)f)  ;; all digits are part of the integer
	  ((< exp 0) 0.0d0) ;; number is in [-1,1], truncate to 0.0d0
	  ((<= 0 exp 4) (sf (logand (aref ba (+ 11 exp)) hwhi) 0 0 0))
	  ((<= 5 exp 20)(sf hwhi (logand (aref ba (+ -5 exp)) hwlo)  0 0))
	  ((<= 21 exp 36)(sf hwhi hwlo (logand (aref ba (+ -21 exp)) lwhi) 0))
	  (t (sf hwhi hwlo lwhi (logand (aref ba (+ -37 exp)) lwlo)))))))

;; version fptrunc4
(defun fptrunc(f ba);; double fp truncate to an integer closer to 0?
  (declare    (type (simple-array (unsigned-byte 16) (16)) ba)
	      (optimize (speed 3)(safety 0)))
  (multiple-value-bind (hwhi hwlo lwhi lwlo)
      (excl::double-float-to-shorts f)
    (declare (type (unsigned-byte 16) hwhi hwlo lwhi lwlo) )
    (let (( exp(- (ash (logand 65520 hwhi) -4)
		  (if (> hwhi 32775) 3071 1023))));; pick out the exponent abs val.
      (declare (fixnum exp))
      ;;   (format t "~%exp=~s"exp)
      (cond ((>= exp 52) f);; all digits are part of the integer
	    ((< exp 0) 0.0d0);; number is in [-1,1], truncate to 0.0d0
	    (t (case exp
		 ((0 1 2 3 4)
		  (sf (logand (aref ba (+ 11 exp)) hwhi) 0 0 0))
		 ((5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20)
		  (sf hwhi (logand (aref ba (+ -5 exp)) hwlo)  0 0))
		 ((21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36)
		  (sf hwhi hwlo (logand (aref ba (+ -21 exp)) lwhi) 0))
		 (otherwise  
		  (sf hwhi hwlo lwhi (logand (aref ba (+ -37 exp)) lwlo)))))))))

  
(defun ktd(n) (- (1- (expt 2 n))(fptrunc2 (+ 0.5d0 (1-(expt 2.0d0 n))))) )
(defun ktd3(n) (- (1- (expt 2 n))(fptrunc3 (+ 0.5d0 (1-(expt 2.0d0 n))) bitsarray)) )

;;(defun test1 (n)  (dotimes (i n)(fptrunc  (random #.(expt 2.0d0 48)))))

(defun test2 (n)
  (dotimes (i n)(coerce (round  (random #.(expt 2.0d0 48))) 'double-float)))

(defun testran (n)
  (dotimes (i n) (random #.(expt 2.0d0 48))))

(defun test4 (n)
  (dotimes (i n)(fptrunc4  (random #.(expt 2.0d0 48)) bitsarray)))

(defun test3 (n)
  (dotimes (i n)(fptrunc3  (random #.(expt 2.0d0 48)) bitsarray)))


(defun foo1( x y)
  (> (logand x y) 3))

(defun foo2( x y)
  (declare (type (unsigned-byte 16) x y))
  (> (logand x y) 3))

(defun foo3( x y)
  (declare (fixnum x y))
  (> (logand x y) 3))


#+ignore
(defun abmodc(a b c)
  (let*((p (* a b))
	(q (* p (/ 1 c)))
	;;(q (/ p c))
	(r (mod (* a b) c)))
        ;;(format t "~% prod ~s quo ~s c ~s r ~s roundq ~s" p q c r (round q))
    (- p (* c (round q)))))
       
;;must load generic/qd.dll, generic/dd.fasl, maybe generic/mysys-int

(defun abmodc-xx(a b c &optional 
			 (hmod (truncate c 2)) 
			 (mhmod(-(truncate c 2))))
      (format t "~%a=~s b=~s c=~s" a b c)
					;modulus is global
 (let ((f(fpint(mod (* (fpint a)(fpint b)) (round c )))))
    (cond ((> f hmod) (- f c))
	  ((< f mhmod)(+ f c))
	  (t f))))
  
(defpackage dd)

(defmethod abmodc-dd((a double-float)(b double-float)(c real)
		      )	;;c could be fix, double, integer abs(c)<2^53
  (let* ((crecip (/ 1.0d0 c))
	 (adivc (* a crecip))
	 (prod (dd::* (dd::lisp2dd a)(dd::lisp2dd b))) ;could do faster maybe
	 (quo (fpint(* adivc b)))) ;multiply and truncate to integer
    ;; compute remainder by a*b-[quo]*c, return only the first 53 bits
    (format t "~%a=~s b=~s c=~s" a b c)
   (aref (dd::add-q
	  (dd::- prod (dd::* (dd::lisp2dd c)(dd::lisp2dd  quo))) 
	  ) 0)))





;; above could could benefit from  double X double --> dd routine.
;; instead of (dd_* (into_dd a)(into_dd b)),
;; just use   (dd_d_d_*  (the double-float a)(the double-float b)) etc

#+ignore

(defmethod abmodc-dd((a double-float)(b double-float)(c real)
		      )	;;c could be fix, double, integer abs(c)<2^53
  (let* ((crecip (/ 1.0d0 c))
	 (adivc (* a crecip))
	 (prod (dd::* (dd::lisp2dd a)(dd::lisp2dd b))) ;could do faster maybe
	 (quo (fpint(* adivc b)))) ;multiply and truncate to integer
    ;; compute remainder by a*b-[quo]*c, return only the first 53 bits
    (format t "~%a=~s b=~s c=~s" a b c)
   (aref (dd::add-q
	  (dd::- prod (dd::* (dd::lisp2dd c)
			     (dd::lisp2dd quo))) 
	  ) 0)))

;; following example is bad, 3.0d17 is not 3*10^17.
;; try  (abmodc-xx 3.0d17 4.0d17 71.0d0) is 0
;; compare to (mod (* 3 4 (expt 10 34)) 71)  is 58
;; compare to  (mod (* 3.0d0 4 (expt 10 34)) 71) is 0



(defun fabmodc
    (a b c &optional (hmod (truncate c 2)) (mhmod(-(truncate c 2))))
		      
  (let((f (abmodc-dd2 a b c)))
    (cond ((> f hmod) (- f c))
	  ((< f mhmod)(+ f c))
	  (t f))))


;;how about a simple-minded double X double -> dd multiplier?

;; hm, does dd::+ work??
(defun muldd(a b) ;;a, b double floats
  (multiple-value-bind
      (atop abot)
      (halfit a)
    
    ;;        (dd::+ (dd::lisp2dd(* atop b))(dd::lisp2dd (* abot b)))
    
    (+ (rational(* atop b))(rational (* abot b)))))
  


#|(defun tmd(a b) ;test
  (multiple-value-bind (aa bb)
      (muldd (* a 1.0d0)(* b 1.0d0))
    (list (dd::+ aa bb)
	  (dd::* (dd::lisp2dd a)(dd::lisp2dd b)))))

|#
  
(defun halfit(df)			;df is double float
  (multiple-value-bind (hwhi hwlo lwhi lwlo)
      (excl::double-float-to-shorts df)
    (declare (ignore lwlo))
    (let   (( r (excl::shorts-to-double-float hwhi hwlo (logand #b1111100000000000 lwhi) 0)))
      (;;list
     values
     ;; high part
					; (setf r (excl::shorts-to-double-float hwhi hwlo (logand #b1111110000000000 lwhi) 0))
     r  (- df r)))))

(defun halfitx(x k)			; x is an integer, split at n
  (let* ((n (- (integer-length x) k))
	 (r (ash (ash x (- n)) n)))
    (values
     r (- x r))))


(defun thalf(d) ;test
  (= d (reduce #'+ (multiple-value-list (halfit d)))))




(defun crecip (n &aux (a1 modulus) a2 (y1 0) (y2 1) q)
  (declare (special modulus))
  (setq a2 (if (< n 0) (+ n modulus) n))
  (do ()
      ((= a2 1) (cmod y2))
    (if (= a2 0)
	(error "Inverse of zero divisor -- ~D modulo ~D" n modulus))
    (setq q (truncate a1 a2))
    (psetq a1 a2 a2 (- a1 (* a2 q)))
    (psetq y1 y2 y2 (- y1 (* y2 q)))))


(defun crecipx (n &aux (a1 modulus) a2 (y1 0) (y2 1) q)
  (declare (fixnum n a1 a2 y1 y2 q modulus)(special modulus))
  (setq a2 (if (< n 0) (+ n modulus) n))
  (do ()
      ((= a2 1) (cmod y2))
    (if (= a2 0)
	(error "Inverse of zero divisor -- ~D modulo ~D" n modulus))
    (setq q (truncate a1 a2))
    (psetq a1 a2 a2 (- a1 (* a2 q)))
    (psetq y1 y2 y2 (- y1 (* y2 q)))))

(defun cmod(n)				; n is positive or zero
  (declare (special modulus))
  (cond ((<= (ash n 1) modulus) n)
	(t (- n modulus))))

(defun cmodx(n)				; n is positive or zero
  (declare(fixnum n *modulus*))
  (cond ((<= (ash n 1) *modulus*) n)
	(t (- n *modulus*))))


(defun polymod(p m)			; reduce polynomial p modulo m
  ;; um, should we toss out coefficients which are 0 mod m?
  ;; this doesn't
  (let ((modulus m))
    (declare (special modulus))
    (cons (car p)			; exponents
	  (map 'vector #'(lambda(r)(cmod (mod r m))) (cdr p)))))



(defun pnorm (p)
  (let* ((hasazero nil)
	 (e (car p))
	 (c (cdr p))
	 (nz (length e))
	 (anse  (make-array nz :adjustable t :fill-pointer 0 :element-type 'integer))
	 (ansc  (make-array nz :adjustable t :fill-pointer 0 :element-type 'integer)))
    (dotimes (i nz (if hasazero (cons 
				 (make-simple-array anse) ;copy to compress.
				 (make-simple-array ansc))
		     p))
      (cond ((= 0 (aref c i))
	     (setf hasazero t))		; if this remains nil, just return p as normal
	    
	    (t
	     (vector-push-extend (aref e i) anse) ;copy over non-zeros
	     (vector-push-extend (aref c i) ansc))))))
  

(defun make-fixnum-array (A) (if (typep A '(simple-array fixnum))
				 A
			       (make-array (length A)
					     :element-type 'fixnum
					     :initial-contents A)))

(defun make-double-array (A) (if (typep A '(simple-array double-float))
				 A
			       (let ((r(make-array (length A)
						   :element-type 'double-float)))
				 (dotimes (i(length A) r)
				   (setf (aref r i)(coerce (aref A i) 'double-float))))))


;; this next program is an attempt to beat the FFT on its home base,
;; but by using the most straight-forward algorithm taking n*m multiplies
;; using a storage scheme that is compact if the answer is dense.

(defun mul-dense-fix (x y)
  ;; fast and simple for small degree dense
  ;; exponents and coefficients are all fixnums, in the answer too.
  ;; if they are not, this answer will be wrong! 
  ;; we want y to be shorter, less copying..
  (if (< (length (car x))(length (car y))) ; reverse args if x is shorter
      (mul-dense-fix y x)

  (let* (
	 (xe (car x))			;array of exponents of x
	 (ye (car y))			;array of exponents of y
	 (xc (cdr x))			;array of coefficients of x
	  ;array of coefficients of y, put in typed array so we can iterate faster
	 (yc (make-fixnum-array (cdr y)))
	 (xei 0)(xci 0)
	 (yl (length ye))
	 (ans (make-array
	       (+ 1 (aref xe (1- (length xe)))
		  (aref ye (1- yl)))	; size of ans
	       :element-type 'fixnum
	       :initial-element 0)))
    (declare (fixnum yl xei xci)
	     (type (simple-array t (*)) xe ye xc)
	     (type (simple-array fixnum (*)) ans yc))
    (dotimes (i (length (car x)) (DA2PA ans))
      (declare (fixnum i))
      (setf xci (aref xc i))
      (unless (= xci 0)
	(setf xei (aref xe i))
	(dotimes(j yl)
	  ;; multiply terms 
	  (declare (fixnum j))
	  (incf (aref ans (the fixnum (+ xei(the fixnum (aref ye j)))))
		(the fixnum(* xci (the fixnum (aref yc j)))))))))))

(defun mul-dense (x y)
  ;;  simple for small degree dense ;; faster than mul-naive if dense
  ;; exponents and coefficients are integers. compare to mul-dense-fix
  ;; if they are not, mul-dense-fix's  answer will be wrong! 
  ;; the idea here is to copy one of the polys to a (dense) array.
  ;; we want y to be shorter, less copying..
  (if (< (length (car x))(length (car y))) ; reverse args if x is shorter
      (mul-dense y x)
  
  (let* (
	 (xe (car x))			;array of exponents of x
	 (ye (car y))			;array of exponents of y
	 (xc (cdr x))			;array of coefficients of x
	  ;array of coefficients of y, put in typed array so we can iterate faster
	 (yc (make-simple-array (cdr y)))
	 (xei 0)(xci 0)
	 (yl (length ye))
	 (ans (make-array
	       (+ 1 (aref xe (1- (length xe)))
		  (aref ye (1- yl)))	; size of ans
	       :element-type 'integer
	       :initial-element 0)))
    ;; we could do a reality check here: if the degree is computed above is huge
    ;; compared to the number of terms, maybe this is a bad idea.
    (declare (fixnum yl)
	     (type (simple-array integer (*)) xe ye xc)
	     (type (simple-array integer (*)) ans yc))
    (dotimes (i (length (car x)) (DA2PA ans))
      (declare (fixnum i))
      (setf xci (aref xc i))
	(setf xei (aref xe i))
	(dotimes(j yl)
	  ;; multiply terms 
	  (declare (fixnum j))
	  (incf (aref ans (+ xei (aref ye j)))
		(* xci (aref yc j))))))))

(defun mul-dense-float (x y)
  ;; fast and simple for small degree dense
  ;; exponents and coefficients are all fixnums, in the answer too.
  ;; if they are not, this answer will be wrong! 
  ;; we want y to be shorter, less copying..
  (if (< (length (car x))(length (car y))) ; reverse args if x is shorter
      (mul-dense-float y x)

  (let* (
	 (xe (car x))			;array of exponents of x
	 (ye (car y))			;array of exponents of y
	 (xc (cdr x))			;array of coefficients of x
	  ;array of coefficients of y, put in typed array so we can iterate faster
	 (yc (make-double-array (cdr y)))
	 (xei 0)(xci 0.0d0)
	 (yl (length ye))
	 (deg  (+ 1 (aref xe (1- (length xe)))  (aref ye (1- yl))))
	 (ans (make-array
	      deg	; size of ans
	       :element-type 'double-float   :initial-element 0.0d0)))
    (declare (fixnum yl xei deg)
	     (double-float xci)
	     (type (simple-array t (*)) xe ye xc)
	     (type (simple-array double-float (*)) ans yc))
    
    (dotimes (i (length (car x)) (dfa2pa ans deg))
      (declare (fixnum i))
      (setf xci (coerce(aref xc i) 'double-float))
	(setf xei (aref xe i))
	(dotimes(j yl)
	  ;; multiply terms 
	  (declare (fixnum j))
	  (incf (aref ans (the fixnum (+ xei(the fixnum (aref ye j)))))
		(the double-float(* xci (the double-float (aref yc j))))))))))

;; how fast is this?
;;mul-dense-float 281 ms
;;mul-dense-fix   234 ms
;;mul-dense       985 ms   multiplying u X u, 
;;mul-fft         110 ms  . answer has 19,892 terms, of degree 19,900.

;; mul-naive     1953 ms

;;consider  (setf v (make-racg 5000 10 5)) . answer has 54,225 terms, degree 54306
;; mul-naive 3100 ms. 
;; mul-dense  984 ms.
;; mul-fft    281 ms.
;; mul-dense-fix 250 ms.
;; mul-dense-float 266 ms

 

 

;;;;;;;;;;;;; yet more hacks.  suppose the exponents are bignums.

#|strategy: decompose the input.  

P= p_0 + ... + p_n*x_n  into P_0 + x^(m_1)*P_1+ x^(m_r)*P_r

where P_0, ... P_r  have exponents between 0 and k.  The m_i are determined
by looking at P, and seeing the next exponent that needs to be encoded.
k would usually be 1/2 the largest fixnum.

Encode  Q the same way.  If the segmented P and Q have s and t segments respectively,
s*t mults of sub-parts will be needed to finish the multiplication.
The final answer is reassembled into one polynomial, with arbitrary exponents.
|#

(defstruct segpoly pieces shiftvals)

(defun segbyexp(p &optional(k #.(ash most-positive-fixnum -1)))
  ;;; segment a polynomial by exponent range.
  ;;; find the next lowest exponent, say E
  ;;; encode all terms from E to E+k-1 as
  ;;; x^E* (poly-shifted by E)
  
  (let* ((n (aref (car p) (1- (length (car p))))); largest exponent
	 (pieces (make-array 2 :adjustable t :fill-pointer 0) )
	 (shiftvals (make-array 2 :adjustable t :fill-pointer 0) )); unknown number of pieces
    (cond ((< n k)			; no need to segment at all 
	   (vector-push-extend  p pieces)
	   (vector-push-extend  0 shiftvals))
	  (t 
	   (let* ((lp (length (car p)))
		  (lim 0)(low 0) (expon 0)(i 0)
		  (thepart nil))
	     ;; put all pieces with exponent less than i*n into ith cell.
	     ;; after reducing by k^i.
	     (declare (fixnum i lp))
	     (loop while (< i lp) do
		   (setf low (aref (car p) i))
		   (setf lim (+ low k)) ; next piece has stuff up to lim
		   (setf thepart nil)
		   (loop while (and (< i lp)
				    (< (setf expon (aref (car p) i)) lim))
			 do
			 (push (cons (- expon low) (aref (cdr p) i)) thepart)
			 (incf i))
		   ;; check if there really were any parts in this interval... since the next
		   ;; exponent  is too big
		   (cond (thepart	;if there are pieces in this interval, save them
			  (vector-push-extend (L2PA (nreverse thepart)) pieces)
			  (vector-push-extend low shiftvals)))))))
    ;; all done. return the structure.
    (make-segpoly :pieces pieces :shiftvals shiftvals)))


;;
#|To  multiply segmented polynomials, one could consider a karatsuba-like scheme, or just NXM.
This is just NXM.
|#


(defun mul-seg(p q  &key (mulfun #'mul-naive) )

  (let ((result (cons (vector)(vector))) ;the zero polynomial
	(pseg (segpoly-pieces p))
	(psv(segpoly-shiftvals p))
	(qseg (segpoly-pieces q))
	(qsv (segpoly-shiftvals q))
	(psvi 0))
    
    ;; if we had L  processors, we could do L of these NxM multiplies
    ;; at the same time and do an L-1 way merge. Here we just use linear merge, bad.
    (dotimes (i (length pseg) result)
      (declare (fixnum i))
      (setf psvi (aref psv i))
      (dotimes(j (length qseg))
	(declare (fixnum j))
	(setf result (add-poly result ; not necessarily fixnum exponents
			       (shiftexp
				(+ psvi (aref qsv j))
				(funcall mulfun  ;; this program can assume fixnum exponents
					 ;; should we try to do some adaptation here to pick
					 ;; the most appropriate for this segment??
					 (aref pseg i)(aref qseg j)))))))))

(defun shiftexp(n p) ;shift the exponents of polynomial p by multiplying by x^n, ie add n to exp
  (if (= n 1) p 
    (cons (map 'array #'(lambda(r)(+ n r))	(the (simple-array t (*))(car p)))
	(cdr p))))

(defun add-poly (p q)			;works.  we attempt to make efficient.
  (let* ((ep (car p))			;exponents
	 (eq (car q))
	 (cp (cdr p))			;coefs
	 (cq (cdr q))
	 (epi 0)(eqj 0)
	 (i (1-(length ep)))
	 (j (1-(length eq))) 
	 (count 0)
	 (ans nil))
    (declare (fixnum i j count)
	     (type (simple-array t (*)) ep eq cp cq)
	     (integer epi eqj))
    (loop
     (cond ((< i 0)
	    (do ((jj j (1- jj)))
		((< jj 0) (return-from add-poly (L2PA ans count)))
	      (declare (fixnum jj))
	      (push (cons (aref eq jj) (aref cq jj)) ans)
	      (incf count)))
	   ((< j 0)
	    (do ((jj i (1- jj)))
		((< jj 0) (return-from add-poly  (L2PA ans count)))
	      (declare (fixnum jj))
	      (push (cons (aref ep jj) (aref cp jj)) ans)
	      (incf count)))
	   ((> (setf epi (aref ep i))(setf eqj (aref eq j)))
	    (push (cons epi  (aref cp i)) ans)
	    (decf i))
	   ((< epi eqj)
	    (push (cons eqj  (aref cq j)) ans)
	    (decf j))
	   (t;; (= epi epj)
	    (push (cons epi  (+ (aref cp i)(aref cq j))) ans)
	    (decf i)(decf j)))
     (incf count))))
	   
	   
(defun test-seg(u v n)
  (mul-seg (segbyexp u n)(segbyexp v n) n))
 
;;;various tests for ACL compiling on intel 32-bit machine    
#|	
;; is fixnum faster than signed-byte 32?
(defun 
    f1(z)
  (declare (optimize (speed 3)(safety 0))
	   (type (simple-array (signed-byte 32) (*)) z))
  (incf (aref z 0)(aref z 1))  ;; 146 bytes!!
  
  z)

(defun 
    f12(z)
  (declare (optimize (speed 3)(safety 0))
	   (type (simple-array (signed-byte 32) (*)) z))
  (incf (the (signed-byte 32)(aref z 0))(aref z 1))  ;; only  15 bytes.
  z)

(defun 
    f2(z)
  (declare (optimize (speed 3)(safety 0))
	   (type (simple-array fixnum (*)) z))
  (incf (aref z 0)(aref z 1))  ;; 15 bytes also
  z)

	

(defun 
    f3(z)
  (declare (optimize (speed 3)(safety 0))
	   (type (simple-array (signed-byte 24) (*)) z))
  (incf (aref z 0)(aref z 1))
  z)

(defun 
    f4(z)
  (declare (optimize (speed 3)(safety 0))
	   (type (simple-array (signed-byte 31) (*)) z)) ;; also 15 bytes
  (incf (aref z 0)(aref z 1)) ;; adding is same; multiplying of fixnum needs shift of 2  bits
  z)

(defun 
    f11(z)
  (declare (optimize (speed 3)(safety 0))
	   (type (simple-array (signed-byte 32) (*)) z))
  (setf (aref z 0)(the (signed-byte 32) (* (aref z 0)(aref z 1)))) ;; 35 bytes
  z)


(defun 
    g4(z)
  (declare (optimize (speed 3)(safety 0))
	   (type (simple-array (signed-byte 31) (*)) z)) ;; winner, same as fixnum, though.
  (setf (aref z 0)(*(aref z 0)(aref z 1)))
  z)

(defun 
    g5(z)
  (declare (optimize (speed 3)(safety 0))
	   (type (simple-array fixnum (*)) z)) ;; winner, same as fixnum, though.
  (setf (aref z 0)(*(aref z 0)(aref z 1)))
  z)

(defun 
    g6(z)
  (declare (optimize (speed 3)(safety 0))
	   (type (simple-array fixnum (*)) z)) ;; winner, same as fixnum, though.
  (setf (aref z 0)(the fixnum (*(aref z 0)(aref z 1))))
  z)

(defun 
    g7(z)
  (declare (optimize (speed 3)(safety 0))
	   (type (simple-array (signed-byte 31) (*)) z)) ;; winner, same as fixnum, though.
  (setf (aref z 0)(the (signed-byte 31) (*(aref z 0)(aref z 1))))
  z)

|#

;;; multivariate

;;; inspired by packexpons.lisp

;;; glitch here.  The order of exponents in these arrays is reversed, causing problems.
;;; so I inserted 4 reverses below...
;;; this should be changed with incode changed too.

(defun mul-multivar(r s &optional (mulprog #'mul-naive)) ;; multiply r and s; mulprog is the univariate mult prog
  (multiple-value-bind (in out)(make-coders r s) ; set up the encoder and decoder
    (let((prod 
	  (funcall mulprog 
		   (cons (map 'array in (car r))(cdr r))
		   (cons (map 'array in (car s))(cdr s)))))
      (cons ;; the re-formed exponents
       (map 'array  out (car prod)) (cdr prod)))))
      
(defun make-coders(p1 p2);; returns 2 programs for encoding and decoding
  (let* ((z (nreverse(bound-expons-product p1 p2)))
	 (fields (mapcar #'integer-length z))
	 (sumfields (cons 0(sumlist fields 0)))
	 (maskfields  (mapcar #'(lambda(r)(1- (ash 1 r))) fields)))
    ;; encoder  vector to integer
    (values
     #'(lambda(v)
	 (reduce #'+ (map 'list  #'ash (reverse v)  sumfields)))
     ;; decoder  integer to vector
     #'(lambda(i)(let ((q   (make-array 5 :adjustable t :fill-pointer 0)))
		   (do ((u  fields (cdr u))
			(masks maskfields (cdr masks)))
		       ((null u) (nreverse q))
		     (vector-push-extend (boole boole-and (car masks) i) q)
		 
		     (setf i (ash i (- (car u))))) )))))

(defun max-expons-multivar(p)
  ;; p is a polynomial in the form below. Return a list of the max exponents
  ;; (#((5 4 3) (9 7 0)) .   ;;exponents
  ;;  #(30 21)  ) ;;coefs
  
  ;;  30*x^5*y^4*z^3+  21*x^9*y^7*z^0
  (let ((h (map 'array #'(lambda(z)(declare (ignore z))0) (aref (car p) 0))))
    (map nil #'(lambda(r) (setf h (map 'array #'max h r)))(car p))
    h))

(defparameter pmult
       (cons (vector (vector 5 4 3) (vector 9 7 0))
	     (vector 30 21)  )) ;;coefs

(defun bound-expons-product(p1 p2)
  ;; returns a list of the maximum exponents that will occur in p1*p2, polynomials
  (map 'list #'+ (max-expons-multivar p1)(max-expons-multivar p2)))



(defun sumlist(h n)(cond((null h) nil) ;; (sumlist '(a b c) 0) returns list (a a+b a+b+c).
			(t (let((z(+ (car h) n)))
			     (cons  z (sumlist (cdr h) z))))))



(defun mul-multivar-direct (p q)
  ;; use hash tables
  (let* ((ans  (make-hash-table :test 'equal ))
	 (ep (car p))	 (cp (cdr p))
	 (eq (car q))	 (cq (cdr q))
	 (cpi 0)         (epi 0))
    
    (dotimes (i (length ep) 
	       ;; this answer is not in the right form.
	       ;; needs to be sorted and converted to pair of arrays.
	       (l2pam
		(sort
		(loop for x being the hash-key in ans 
		    and y being the hash-value in ans unless (= y 0) collect (cons x y))
		#'lexgp :key #'car)
		(hash-table-count ans)))
      (setf epi (aref ep i) cpi (aref cp i))
      (dotimes (j (length eq))
	(incf (gethash (map 'list #'+ epi(aref eq j)) ans 0) 
	      (* cpi(aref cq j)))))))


(defun l2pam  (z &optional (n (length z)))			; list to pair of arrays, multivar
  (let ((anse (make-array n))
	(ansc (make-array n))
	(exponsize (length (caar z)))
	(j (1- n)))
    
    (declare (fixnum j n exponsize))
    (loop (if (< j 0) (return (cons anse ansc)))
      (setf (aref anse j) (make-array exponsize :initial-contents (caar z)))
      (setf (aref ansc j) (cdar z))
      (decf j)
      (pop z))))

(defun lexless(a b)
  (cond((null a) nil)
       ((< (car a)(car b)) t)
       (t (and (= (car a)(car b))(lexless (cdr a)(cdr b))))))

(defun lexgp(a b)
  (cond((null b) nil)
       ((> (car a)(car b)) t)
       (t (and (= (car a)(car b))(lexgp (cdr a)(cdr b))))))
;;;
;;what about packing multiple coefs in a dense polynomial in single words?
;; for example coefficients mod 5 could be stored in an array of (integer -2 2)
(defparameter *s* (make-array 10 :element-type '(integer 0 4) :initial-element 0))

;; ACL 8.0 still allows storage of numbers from -128 to 127 in this array.

(defun pack-s(s)
  (declare (optimize (speed 3)(safety 0))(type (simple-array (integer 0 4) 10) s))
  (dotimes (i 10)(setf (aref s i) i)))


(defun add-pack-s(r s) ;; add packed coefs
  (declare (optimize (speed 3)(safety 0))(type (simple-array (integer 0 4) 10) r s))
  (dotimes (i 10 r)
    (setf (aref r i)(mod (+ (aref r i)(aref s i)) 5))))

(defun mul-pack-s(r c) ;; multiply packed coefs by a constant.
  (declare (optimize (speed 3)(safety 0))(type (simple-array (integer 0 4) 10) r s)
	   (type (integer 0 4) c))
  (dotimes (i 10 r)
    (setf (aref r i)(mod (* c (aref r i)) 5))))