;;; -*- Lisp -*-
;;; various ways of multiplying sparse polynomials.
;;; using lists, hashtables, trees, streams, heaps, and just hacking the naive method.
;;; SAME BUT EXPONENTS are FIXNUMS


(eval-when (compile load) (declaim (optimize (speed 3)(safety 0))))
;; set up a lexically available context for handling several
;; variables which would otherwise be global, and require more
;; effort to access from compiled code.
;; 
(let ((*YYEXPS* nil)
      (*YYCOEFS* nil)
      (*YYLEN* 0)
      (fill-pointer 0))
  (declare (fixnum *YYLEN* fill-pointer)
	   (type (simple-array t  (*))  *YYCOEFS*)
	   (type (simple-array fixnum  (*)) *YYEXPS*))

  
  ;; input format: a univariate polynomial is represented by a Lisp cons
  ;; of 2 arrays.  The first array is exponents, in ascending order. The
  ;; second array is corresponding coefficients. We expect that the
  ;; usual way of representing numbers in Lisp is used, whereby a small
  ;; number (in magnitude less than, say 2^29) is represented "in place"
  ;; shifted right a few bits so it is tagged as a small number; larger
  ;; numbers, of unlimited length, are pointed to by the entry in the
  ;; same field.  Thus "short" arithmetic in Lisp is slightly
  ;; handicapped by the need to do some tag checking and shifting.
  ;;This makes the transition to arbitrary precision, as well as to
  ;; float, double-float, and even complex numbers programmatically
  ;; smoother.

  ;; If the Lisp compiler is informed (via declarations) that it has an array of small
  ;; numbers, it may dispense with the tags and generate code to use ordinary 32-bit (or
  ;; 64-bit) arithmetic.


  ;; example of the input format:  45x^10+25x^2+ 9  [note: 9 = 9x^0]
  ;; ( #(0 2 10) . (9 25 45))


  ;; Now to multiplicatio of polynomials.
  ;; First, the really obvious way, using a double-nested loop,
  ;; inserting each of the new exponent-coefficient pairs in
  ;; the right place in a linked list.

  (defun mul-naive (x y) 
    (let* ((ans nil)(e 0)(c 0)
	   (yl (length (car y)))
	   (xe (car x))			;array of exponents of x
	   (ye (car y))			;array of exponents of y
	   (xc (cdr x))			;array of coefficients of x
	   (yc (cdr y)))		;array of coefficients of y
      (dotimes (i (length (car x)) ans)
	(dotimes(j yl)
	  ;; multiply terms 
	  (setf e (+ (aref xe i)(aref ye j));add exponents
		c (* (aref xc i)(aref yc j));multiply coefficients
		ans (add-poly-term e c ans))))))

  (defun add-poly-term(e c L)		;make a list of ( ... (exp . coef)...)
    (let ((ahead (cdr L)))
      (cond ((null ahead) 
	     (if L  (setf (cdr L)(list (cons e c)))
	       (setf L (list (cons e c)))))
	    ((= e (caar ahead))
	     (incf (cdar ahead) c)	;add the new term into old coefficient
	     (cdr L))
	    ((< e (caar ahead))
	     (setf (cdr L)(cons (cons e c) ahead))
	     (cdr L))
	    (t (add-poly-term e c (cdr L)))))
    L)

;;; This is almost all we need for a working program, but we should do this:
;;; Convert list back to the standard form for polynomials used by the input.
;;; Here is a program that does it:
  (defun L2PA (L) 
    ;;List to Pair of Arrays
    ;; L is a LIST of (exp . coef) of length n.
    ;; create a pair: one array of exponents and one of coefficients.
    (let* ((n (length L))
	   (exps (make-array n))
	   (coefs (make-array n))
	   (count 0))
      (declare (fixnum count n) (type (simple-array t (*)) exps coefs))
      (dolist (i L (cons exps coefs))
	(setf (aref exps count) (car i))
	(setf (aref coefs count)(cdr i))
	(incf count))))


  ;; A nuance we will ignore for now is that this result may have some zero coefficients,
  ;; e.g. if we multiply (x+1)*(x-1). We can usually fix this cheaply.

  ;; The  new program looks like this:

  (defun mul-naive2 (x y) 
    (let* ((ans nil) (e 0)(c 0)
	   (yl (length (car y)))
	   (xe (car x))
	   (ye (car y))
	   (xc (cdr x))
	   (yc (cdr y)))
      (dotimes (i (length (car x)) (L2PA ans));; ** changed**
	(dotimes(j yl)
	  ;; multiply terms 
	  (setf e (+ (aref xe i)(aref ye j))
		c (* (aref xc i)(aref yc j))
		ans (add-poly-term e c ans))))))


  ;; This program is not nearly as fast as it could be, if we just make
  ;; a small change.  Instead of looking to insert each new term into
  ;; the answer starting from the beginning, let us keep track of where
  ;; the just-previous insertion was made, and place the new insertion
  ;; to its right, on each sweep through the second argument.  When we
  ;; start with the next term of the first argument, we start insertion
  ;; from the beginning of the answer.  We also simplify the programming
  ;; of an insertion into the answer by initializing ans to a non-empty
  ;; list.

  ;; This change results in the following program:


  (defun mul-naive-fin (x y);; naive but with a finger into the answer
    (let* ((ans (list (cons 0 0)))	; makes iteration neater
	   (finger ans)
	   (yl (length (the simple-array (car y))))
	   (xl (length (the simple-array (car x))))
	   (xe (car x))
	   (ye (car y))
	   (xc (cdr x))
	   (yc (cdr y)))
      (declare(fixnum yl xl) (type (simple-array t (*)) xe ye xc yc))
	     
      (dotimes (i xl  (L2PA (cdr ans)))
	(dotimes(j yl)
	  ;; multiply terms 
	  (setf finger (addterm-fin(+ (aref xe i)(aref ye j))
				   (* (aref xc i)(aref yc j))
				   finger)))
	(setf finger ans))))


  ;; We slightly improve the addterm program that returns an
  ;; appropriate finger into the list for insertion.
  ;; Instead of the recursive version we use an iterative format.

  (defun addterm-fin(e c L)		;iterative version with do ; 258 bytes
  
    (do ((ahead (caadr L)(caadr L)))
	((null ahead) (setf (cdr L)(list (cons e c))))
      (cond
       ((> e ahead)  (setf L (cdr L)))
       ((= e ahead)  (incf (cdadr L) c) (return (cdr L)))
       (t (setf (cdr L)(cons (cons e c) (cdr L))); e < ahead. tack on end
	  (return (cdr L))))))

  ;; OK, mul-naive-fin works.  How much faster is it than mul-naive?
  ;; On one example of a 500 X 1000 term multiplication, mul-naive takes 10.500 seconds.
  ;; The improved one takes 0.047 seconds, and is therefore 223 times faster.

  ;; Next step is to make a testing program.
  ;; First, we specify a program to create some random data.

  (defun make-racg(n d) 
    ;; make Random Array of increasing exponents of length n with
    ;; spacing k, with 0<k<=d between elements and with Coefficients that
    ;; are random integers between 1 and d. d may be, in General, arbitrary precision.
    (let ((exps (make-array n))
	  (coefs(make-array n))
	  (v (random d)))
      (dotimes (i n (cons exps coefs))
	(setf (aref coefs i)
	      (+ 1 (random d)))
	(setf (aref exps i)v)
	(setf v (+ 1 v (random d))))))

  (defvar *x* nil);; these will be global polynomials for testing
  (defvar *y* nil)

  (defun test0(n m &optional (d 5))	;try these parameters for now.
    (setf *x* (make-racg n d)) 
    (setf *y* (make-racg m d))
    (format t "~% mul-naive-fin time - just use insertion into single list")
    (time (mul-naive-fin *x* *y*))
    ;; We will add more lines here later
    )

;;; If we want to make this naive method faster, we can try
;;; restricting the exponents and coefficients to be known types
;;; rather than general numbers.  In Lisp, such general numbers could
;;; be bignums, double-floats, complex, etc. 

;;;These versions below require that the inputs be produced by
;;; make-racf or equivalent, so that the arrays will be of known
;;; type, namely fixnum, and not tagged numbers.  The payoff is about 60% speedup:
;;; the fixnum-only version, mul-naive-finf, takes 29 ms for 500 X 1000 multiply,
;;; whereas the general version, mul-naive-fin, takes 47 ms.

;;; An intermediate position which we thought would be advantageous,
;;; is to require the exponents to be fixnums but allow the
;;; coefficients to be arbitrary.  Our thinking was this: If a
;;; polynomial has huge exponents, it can be broken up into pieces
;;; like this p = poly1(x)*x^0 +poly2(x)*x^[LargeNumber]+ ... where
;;; poly1 etc are polynomials with modest exponents (say, half the
;;; size of the largest fixnum).  In our 500X1000 example, the number
;;; of exponent operations looks like this: 1.3 million ">", 0.495501
;;; million "=" 0.5 million "+".  The number of coefficient operations
;;; is 0.5 million "*" and and 495,501 "+".  Note that the number of
;;; terms in the answer is 500000-495501=4499.  There are about 2.3
;;; million exponent operations vs. 1 million coefficient operations.

;;; Converting the exponent arithmetic ONLY to fixnum and leaving the
;;; coefficients general, does not appear to improve the timing, at
;;; least in Allegro CL 8.0, so this was not pursued further.  Other
;;; prospects are appealing however. The coefficient arithmetic 
;;; could be done with (say) GMP arithmetic, an open-source fast bignum
;;; package, or MPFR, an open-source arbitrary precision float package.
;;; These type/coding issues are largely independent of the algorithm used
;;; for sparse polynomial multiplication.

;;; For the record, we show how to write the program for fixnums.

  (defun make-racf(n d);;fixnum only
    ;; make random array of increasing exponents of length n with
    ;; spacing k, with 0<k<=d between elements and with coefficients that
    ;; are random integers between 1 and d. d is a fixnum
    (declare (fixnum n d))
    (let ((exps (make-array n :element-type 'fixnum))
	  (coefs(make-array n :element-type 'fixnum))
	  (v (random d)))
      (dotimes (i n (cons exps coefs))
	(setf (aref coefs i)
	      (+ 1 (random d)))
	(setf (aref exps i)v)
	(setf v (+ 1 v (random d))))))
	  
  (defun addterm-finf(e c L)
    (declare (fixnum e c))
    (do ((ahead (caadr L)(caadr L)))
	((null ahead) (setf (cdr L)(list (cons e c))))
      (declare (fixnum ahead))
      (cond
       ((> e ahead)  (setf L (cdr L)))
       ((= e ahead)  (incf (the fixnum (cdadr L)) c) (return (cdr L)))
       (t (setf (cdr L)(cons (cons e c) (cdr L))); e < ahead. tack on end
	  (return (cdr L))))))

  (defun mul-naive-finf (x y);; naive but with a finger into the answer. fixnum only
    (let* ((ans (list (cons 0 0)))
	   (finger ans)
	   (yl (length  (car y)))
	   (xl (length  (car x)))
	   (xe (car x))
	   (ye (car y))
	   (xc (cdr x))
	   (yc (cdr y)))
      (declare(fixnum yl xl) (type (simple-array fixnum (*)) xe ye xc yc))
      (dotimes (i xl (L2PA (cdr ans)))
	(dotimes(j yl)
	  ;; multiply terms 
	  (setf finger (addterm-finf 
			(the fixnum (+ (the fixnum (aref xe i))(the fixnum(aref ye j))))
			(the fixnum (* (the fixnum (aref xc i))(the fixnum (aref yc j))))
			finger)))
	(setf finger ans))))

;;;;;;;;;;;; enough for naive methods.  Can we do better?

;;; A first hashing method for multiplication.

  (defun mul-hash0(x y)
    (let* ((ans (make-hash-table))	;set up an empty hash table
	   (yl (length (car y)))
	   (xe (car x))
	   (ye (car y))
	   (xc (cdr x))
	   (yc (cdr y)))
      (dotimes (i (length (car x)) ans)
	(dotimes(j yl)
	  ;; multiply terms and increment any existing entry
	  ;; in the hash table by the new coefficient. If there
	  ;; is not coefficient there, give it a default 0 value.
	  (incf (gethash (+ (aref xe i)(aref ye j));key= exponent
			 ans 
			 0)		;default for missing value: 0
		(* (aref xc i)(aref yc j));increment by prod of coefs.
		)))))

  ;; There are 2 problems here. One: the result is a hash table, not our
  ;; standard form.  We could live with this and have every polynomial
  ;; be stored in a hash table. Probably better is to translate from a
  ;; hash table to the standard form. We do this in mulhash0 below.

  ;; The second problem is that this program, while short, is not as
  ;; efficient in time as we can make it. not take into account various
  ;; ways of reducing hashing time.

  ;; Here is the program needed for the first fix:

  (defun sorthash(h);; utility for mulhash0, empty a hash table into an a-list
    (let ((ans nil))
      (maphash #'(lambda (i v)(push (cons i v) ans)) h)
      (sort ans #'< :key #'car)))

  ;; Here is a much more sophisticated version.  We even toss in the
  ;; nuance of eliminating zero coefficients.

  ;; Coefficients are stored in an array separate from the hash table,
  ;; so they can be allocated in an array of known type, like
  ;; double-float, or fixnum.  For now we don't specify a particular
  ;; numeric type.  This array may make memory locality better in some
  ;; situations, but it also has the beneficial side effect of
  ;; eliminating about half the hash code computation.  We use the
  ;; address of data in this array to update, rather than re-hashing on
  ;; the key. The program is longer than strictly necessary: Some of the
  ;; code is almost never used, but allows us to do some storage
  ;; optimization "by hand".  Common Lisp provides for expandable
  ;; arrays, but we can be faster by managing it by hand.

  (defparameter *zcoefs-lim* 50)
  (defparameter *zcoefs* (make-array *zcoefs-lim*))

  #+ignore 

  ;; This works, but allocates 3N list elements then copies into
  ;; 2N array elements.  another definition (much longer, using a
  ;; hand-coded quicksort variant) is provided later.
  (defun sorthash1(h ar);; utility for mul-hash1
    (let ((ans nil))
      (maphash #'(lambda (i v)
		   (setf v (aref ar v))	; get the coefficient, by indirection
		   (unless (= v 0);; normalize! remove 0 coefs
		     (push (cons i v) ans)))
	       h)
      (L2PA  (sort ans #'< :key #'car))))

  (defun mul-hash1 (x y);;   exponents may be arbitrary integers. coefficients numbers.
    (let ((ans (make-hash-table :test 'eql));; eql is good enough for bignums too
	  (xexps (car x))
	  (xcoefs (cdr x))
	  (yexps (car y))
	  (ycoefs (cdr y))
	  (xei 0)
	  (xco 0)
	  (zcoefs *zcoefs*)
	  (zlim *zcoefs-lim*)
	  (zcounter 0)
	  (loc nil)
	  (xy 0))
      (declare (type (simple-array t (*)) xexps yexps); any exponents
	       (type (simple-array t (*)) xcoefs ycoefs zcoefs); any coefs
	       (fixnum zcounter zlim)
	       (integer xei xy)
	       (optimize (speed 3)(safety 0)))
      (dotimes (i (length xexps)(sorthash1 ans zcoefs))
	(declare (fixnum i))
	(setf xei (aref xexps i))	;exponent for x
	(setf xco (aref xcoefs i))	;coefficient corresponding to that exponent
	(dotimes (j (length yexps))
	  (declare (fixnum j))
	  ;; look in the hash table at location for this exponent
	  (setf loc (gethash (setf xy (+ xei (aref yexps j))) ans))
	  (cond (loc 
		 ;; There is an entry in hash table for this
		 ;; exponent. Update corresponding array location. Note:
		 ;; hashtable is NOT updated.  this happens about n*m
		 ;; times, where n=size(x), m=size(y).
		 (incf (aref zcoefs loc);; no number consing here either??
		       (*  xco (aref ycoefs j))))
		(t
		 ;; if loc is nil, allocate a space in the answer array.
		 ;; This happens relatively infrequently, about k times, where k=size(x*y).
		 ;; Hashtable is updated.
		 (setf (gethash xy ans) zcounter)
		 ;; store the result in zcoefs array.
		 (setf (aref zcoefs zcounter) (* xco (aref ycoefs j))); no number consing??
		 (incf zcounter)
		 (cond ((= zcounter zlim) 
		     
			;;if we reach this, we need more working space.
			;; This is rarely used code. If global array is
			;; big enough, it is never used.  If global array
			;; is too small, it is made bigger.
			(let* ((newsize (ash zlim 1)); double the size
			       ;; (newarray (make-array newsize))
			       (newarray (make-array newsize :initial-element 0)))
			  (declare (fixnum newsize)
				   (type (simple-array t(*)) newarray))
			  ;;(format t "~% sizes: newarray ~s zcoefs ~s newsize ~s"
			  ;;	 (length newarray)(length zcoefs) newsize)
			  (dotimes	;(m newsize) ;wrong
			      (m zlim)
			    (declare (fixnum m))
			    (setf (aref newarray m)(aref zcoefs m)))
			  (setf zcoefs newarray  zlim newsize
				*zcoefs-lim* zlim 
				*zcoefs* zcoefs))))))))))


  ;; on the same data, 500 X 1000 -> 4450 terms
  ;;*             ACL 8.0        GCL       SBCL
  ;;*  mul-naive-finf 29 ms 
  ;;*  mul-naive-fin  47 ms       29 ms    47 ms
  ;;*  mul-hash1      94 ms    1,120 ms   110 ms
  ;;*  mul-hash0     203 ms   
  ;;*  mul9        1,562 ms    2,160 ms   687 ms
  ;;*  mulstr        282 ms       ?       203 ms

  ;; We slipped in mul-str and mul9, to be discussed later.

;;; On different data, with random spacing of up to 500 between terms,
;;; we tried 500 X 1000 -> 255,364 terms [that is, VERY sparse
;;; input.], something else is going on.  mul-str now looks good ...

  ;;*                 ACL         GCL              SBCL     

  ;;*  mul-naive-fin  2,400 ms     6,600ms        1,546 ms
  ;;*  mul-hash1      1,173 ms      ?             1,782 ms
  ;;*  mul-str          391 ms      ?               297 ms


;;; On yet different data, with random spacing of up to 5000 between
;;; terms, we tried 500 X 1000 -> 459,416 terms [that is, VERY VERY
;;; sparse input.] Also rather large output. If there are 500k terms,
;;; there are 500k exponents and 500k coefficients.  If each one
;;; occupies 4 bytes, we have 500k X 2 X 4 bytes or 2 megabytes.
;;; Given the additional space for storing numbers by indirection, and
;;; the extra space allocated for possible growth, the hash-table
;;; allocations are substantially larger. The 2 L2 caches on the Intel
;;; Pentium D CPU on which we are testing are each 1 megabyte.  The
;;; heap multiplier (mul-str6) has the significant advantage of
;;; generating each coefficient/exponent pair in the answer once, and
;;; not re-examining it.  (Actually it will revisit locations because
;;; the program uses an array that must be dynamically grown -- it
;;; does not know how many terms will ultimately appear in the answer.)

;;; As for how large this polynomial is, it would probably take 1,500
;;; pages to print out, double-sided, estimating 150 terms per
;;; side. Evaluating the polynomial at a floating-point point much
;;; outside of [-1,1] would probably fail from overflow. Finding zeros
;;; would likely be problematical, and it is unlikely that such a
;;; large polynomial would be a useful approximation to any data from
;;; measurements.  Nevertheless, there apparently is interest from the
;;; direction of pure mathematics, in such creations.

;;; we show data for 3 implementations of Common Lisp, 
;;; Allegro 8.0, GCL 2.6.8 and SBCL 1.0.13

  ;;*                   ACL       GCL          SBCL
  ;;*  mul-naive-fin  3,891 ms     9,820 ms    4,422 ms
  ;;*  mul-hash1      2,187 ms     2,970 ms    3,359 ms
  ;;*  mul-str          484 ms     *             375 ms
;;; 

;;; note 1: * GCL error message. 
;;; note 2: ? unknown. Too long to wait.
;;; note 3: Some program alterations that we
;;; found could speed up ACL, slowed down other programs especially
;;; GCL.  Use of arrays and structures in GCL seem to be slow compared
;;; to lists. 


;;; It seems that mul-str is a BIG winner for ACL and SBCL, but GCL
;;; has problems with it. 

;;; What IS this mul-str??  It uses a heap, and we discuss it below.

  ;; It is one of a bunch of more advanced alternative storage ideas
  ;; (rather than a linked list) including binary radix trees, AVL
  ;; trees, skiplists, and other methods that we programmed.

  ;; Unfortunately, in our tests, these are all slower than
  ;; mul-naive-fin for smallish problems. We know that this
  ;; mul-naive-fin algorithm was used at least as early as 1966 by
  ;; William A. Martin, in programming the polynomial arithmetic system
  ;; for Macsyma at MIT. In this system a slightly different arrangement
  ;; was used. Rather than ((e . c)(e . c) ...) for exponents and
  ;; coefficients, they were arranged in one list (e c e c ...), and the
  ;; arrangement started with the highest degree with a non-zero
  ;; coefficient.  The arrangement also allowed (recursively) for the
  ;; coefficients to themselves be polynomials. The timings show that on
  ;; large and very sparse problems the more elaborate alternatives
  ;; look better.

  ;; Since Monagan and Pearce advocate heap organizations, we describe
  ;; that next, and proceed on to mulstr.

  ;; For our demonstration here, a heap is just an array.  Since we are
  ;; using only one heap, we have one fill-pointer we manage as a
  ;; lexically bound shared variable among all the programs using the
  ;; heap. A relatively straightforward implementation puts STREAMS into
  ;; a heap. What do we mean by a stream?  We experimented with various
  ;; versions, which in Lisp can be made quite elegant by using
  ;; "continuations" -- in effect, designating as streams an object
  ;; which includes a program which when called produces further another
  ;; stream object -- including a program which can be called yet again,
  ;; ad infinitum.  Using this kind of construction we can build streams
  ;; that interleave or merge results from two streams, and cascade them
  ;; recursively into the one final stream of results.  Cute, and not
  ;; too shabby in terms of performance, but not the best. For low
  ;; storage usage and fast speed we found it adequate to use the
  ;; concept of streams, in this case, using just ordinary records of 5
  ;; integers: the simple stream continuation function just increments a
  ;; pointer and otherwise is just multiplying and adding elements in a
  ;; stream.  The simple version is below.

  (defstruct (st) 
    ;; A kind of stream.  each structure contains 5 items
    ;; which by reference to arrays of XX and YY , can burp out
    ;; successive exponents and coefficients of the product.
  
    (exp 0 :type fixnum)
    (coef 0 :type integer) 
    (i 0 :type fixnum)  
    (v 0 :type integer) (e 0 :type integer)); v, e are read-only 

  ;;**** let was here****
  
  ;; mularZ4 produces an array of "streams" of structures.
  ;; Each stream consists of 5 items:
  ;; current exponent, current coefficient, index of the next array
  ;; element of YY to use, multiplier for this stream, exponent
  ;; increment from XX for this stream.  YY is the second polynomial.
  ;; Example:
  
  ;; Let X = a[0]*x^e[0]+ a[1]*x^e[1]+ ....
  ;; Let Y = b[0]*x^f[0]+ b[1]*x^f[1]+ ....
  
  ;; the first stream will represent   a[0]* Y
  ;; the second stream will represent  a[1]* Y  ...
  
  ;; the first stream will start out as 
  ;;  { e[0]+f[0] , a[0]*b[0], a[0], 1, a[0] e[0]} = {exp,coef,i,v, e} 
  ;; the second { e[1]+f[1], a[1]*b[0], 2, a[1], a[1], e[1]}
  
  ;; updating the first stream will do two things. Assuming Y has at least 2 terms,
  ;; 1. return exp and coef
  ;; 2. update the stream to contain   { e[0]+f[1] , a[0]*b[1], 2, a[0], e[0]}. 

  ;; If a stream runs off the end of Y, exp is returned as NIL rather than a number.
    
  ;; x and y are sparse polys in arrays as produced by (make-racg n
  ;; m). return array hack for heap management with a starter entry in
  ;; ans.
  
  ;; mularZ4 sets up an array of initialized streams, one for each term in x.
  
  ;; this setup takes time linear in the length of x, and is negligible.
  ;; It also allocates essentially all the memory for the heap maintenance.
  ;; The rest of the memory needed is allocated to store the answer.
  
  (defun make-simple-array (A &optional (the-type nil)) 
    (if (typep A (list 'simple-array the-type)) A 
				 (make-array (length A)
					     :element-type (or the-type (array-element-type A))
					     :initial-contents A)))
  
  (defun mularZ4(x y) 
    (declare (optimize (safety 3)(speed 0)))
    (let* ((xexps (car x))
	   (xcoefs (cdr x))
	   (yexps (make-simple-array (car y) 'fixnum))
	   (ycoefs(make-simple-array (cdr y)))
	   (yexp0 (aref yexps 0))
	   (ycoef0 (aref ycoefs 0))
	   (ans (make-array (1+(length xexps))))
	   (ylen (length (car y)))
	   (xe 0)
	   (v 0))			;
      (declare (fixnum i) (type (simple-array t (*))
				ans xexps xcoefs yexps ycoefs))
      (setf *YYEXPS* yexps *YYCOEFS* ycoefs *YYLEN* ylen)
      (setf (aref ans 0) (make-st :exp 0 :coef 0 :i ylen :v 0 :e 0))
      (dotimes (j (length xexps) ans)
	(setf v (aref xcoefs j))
	(setf xe (aref xexps j))
	(setf (aref ans (1+ j)) 
	      (make-st :exp (+ xe yexp0) :coef (* v ycoef0) :i 1 :v v :e xe) ))))
  
  (defun update-strheap (h)
    ;;return top of heap h, which is necessarily not empty.
    ;; take successor in stream of top of heap, and insert it in heap.
    (declare(type (simple-array t(*)) h)(optimize (speed 3)(safety 0)))
    (let* ((top (aref h 1))
	   (ta (st-exp top))(tb (st-coef top))
	   (newe (updatestrfun top)))

      ;; in case newe <= lefti  and newe <= righti.  put it at the top.
      ;; avoiding all heap programs
      ;; top of heap is location 1  (ignore 0)
      ;; left branch is location 2
      ;; right branch is location 3
      (cond
					;if end of stream from this subproduct, remove top and make heap smaller!
       ((null newe) (heap-remove h))	
       ;; we've modified the top node! leave it there!
       ((and(<= newe (st-exp (aref h 2))) (<= newe  (st-exp (aref h 3)))))
       ;; it's not the top node. remove it and put the (revised) node back in.
       (t (heap-insert h (heap-remove h) fill-pointer))); the new item doesn't fit there
      (values ta tb)))
 
  (defun mulstrfix(x y);; hacked to insert at top of heap if possible.  fixnum
    (declare (optimize(speed 3)(safety 0)))
    (let* ((hh (if (< (length (car x))(length (car y)))(mularZ4 x y)(mularZ4 y x)))
	   (alen  10)			; heuristic(* (length (car x))(length (car y))))
	   ;; initial heap
	   (anse (make-array alen :adjustable t :element-type 'integer :fill-pointer 0))
	   (ansc (make-array alen :adjustable t :element-type 'integer :fill-pointer 0))
	   (preve 0)
	   (prevc 0))
      (declare (integer preve prevc) (type (array t(*)) anse ansc))
      (setf fill-pointer  (length hh))
      (loop while (not (heap-empty-p)) do
	    (multiple-value-bind
		(newe newc)
		(update-strheap hh)
	      (declare (integer newe newc))
					; take next term off heap and insert its successor
	      (cond ((= preve newe)	; is this same exponent as previous?
		     (incf prevc newc))	; if so, add coefficient.
		    (t (unless (= prevc 0); normalize: remove zero coeffs
			 (vector-push-extend prevc ansc)
			 (vector-push-extend preve anse))
		       (setf prevc newc)
		       (setf preve newe))); initialize new coefficient
	      ))
      (vector-push-extend prevc ansc)	;put in last results before exit
      (vector-push-extend preve anse)
      (cons anse ansc)))

  (defun updatestrfun(rec)		; record is a stream 5-tuple 
    (declare(type (simple-array t(*)) *YYCOEFS*)
	    (type (simple-array fixnum(*)) *YYEXPS*)
	    (optimize (speed 3)(safety 0)) 
	    (type st rec))
    (let ((i (st-i rec)))
      (cond ((>= i *YYLEN*) nil);; no more items on stream of products
	    (t (incf (st-i rec))
	       (setf (st-coef rec)
		     (* (the integer (st-v rec))(the integer (aref *YYCOEFS* i))))
	       (setf (st-exp rec) 
		     (+ (the fixnum (st-e rec))(the fixnum (aref *YYEXPS* i))); returns this val
		     )))))
  
  (defmacro heap-empty-p ()
    "Returns non-NIL if & only if the heap contains no items."
    `(<= fill-pointer 1))

  (defmacro hlessfun (a b) `(< (the fixnum(st-exp ,a))(the fixnum (st-exp ,b))))

  (defmacro lesser-child (a parent fp)
    "Return the index of the lesser child.  If there's one child,
 return its index.  If there are no children, return 
 (FILL-POINTER (HEAP-A HEAP))."
    `(let* ((left;;(+ (the fixnum ,parent)(the fixnum  ,parent))
	     (ash ,parent 1))
	    (right (1+ (the fixnum left))))
       (declare (fixnum left right ,fp ,parent )
		(optimize(speed 3)(safety 0))
		(type (simple-array t (*)) ,a))
       (cond ((>= left ,fp) ,fp)
	     ((= right ,fp) left)
	     ((hlessfun (aref ,a left) (aref ,a right)) left)
	     (t right))))

  ;; We want to get fast access to a sort-of global variable "fill-pointer"
  ;; without incurring the overhead for a full global access.  We can have
  ;; the best of both worlds by setting up a lexical environment in which
  ;; fill-pointer is bound, and defining programs in that environment.
  ;; The names of the programs (like percolate-down) are available in 
  ;; any context including the global context.

  ;; (heap management parts of this were inspired by code that is
  ;; Copyright (c) 2002, 2003 Gene Michael Stover., GPL.
  ;; The code was found ;; at http://cybertiggyr.com/lisp-heap/


  (defun percolate-down (aa hole xx fp)
    "Private. Move the HOLE down until it's in a location suitable for X.
Return the new index of the hole."

    (let ((xxval (st-exp xx)))

      (declare (fixnum hole fp)(fixnum xxval)
	       (type (simple-array t (*)) aa)(optimize (speed 3)(safety 0)))
      (do* ((child (lesser-child aa hole fp) (lesser-child aa hole fp))
	    (aaval (aref aa child)(aref aa child)))
	  ((or (>= (the fixnum child) fp) (< xxval (the fixnum(st-exp aaval))))
	   hole)
	(declare (fixnum child))
	(setf (aref aa hole)  aaval  hole child))))

  (defun percolate-up (a hole xx)
    "Private.  Moves the HOLE until it's in a location suitable for holding
X.  Does not actually bind X to the HOLE.  Returns the new
index of the HOLE.  The hole itself percolates down; it's the X
that percolates up."
    (declare (type (simple-array t (*)) a)(fixnum hole)
	     (optimize (speed 3)(safety 0)))
    (setf (aref a 0) xx)
    (do ((i hole parent)
	 (parent (ash hole -1);;(floor (/ hole 2))
		 (ash parent -1);;(floor (/ parent 2))
		 ))
	;; potential to speed up line below by declaration if a, x are fixnum,
	((not (hlessfun xx (aref a parent))) i)
      (declare (fixnum i parent))
      (setf (aref a i) (aref a parent))))

  ;; important: initial-contents must have the first element duplicated.
  ;; e.g.  (3 5 8)  should be (3 3 5 8)

  (defun heap-insert (a xx fp)
    "Insert a new element into the heap.  Returns the heap.";; rjf
    (declare (type (simple-array t (*)) a)(fixnum fill-pointer)
	     (optimize (speed 3)(safety 0)))
    ;; (format t ".")
    (setf (aref a fp) nil)
    (incf fill-pointer)			; the global one
    ;; Move the hole from the end towards the front of the
    ;; queue until it is in the right position for the new
    ;; element.
    (setf (aref a (percolate-up a fp xx)) xx))

;;; Later, a trick to try, if we believe Roman Pearce's comment:
;;; Instead of percolating down, see what is going to be inserted next,
;;; and try to insert it at the top. This saves a lot of time if it works.

  (defun heap-remove(a) 
    ;; assumes non-empty!! if empty next answer is bogus.
    ;; answer after that is an error (fill pointer can't pop)
    (declare(type (simple-array t(*)) a)(optimize (speed 3)(safety 0)))
    (let*  ((xx (aref a 1))
	    (fp (decf fill-pointer))	;faster, fp is local now
	    (last-object (aref a fp)))
      (declare (fixnum fp))
      (setf (aref a (the fixnum (percolate-down a 1 last-object fp))) last-object)
      xx))

;;; end of the heap stuff.  Note that this has been hacked to provide
;;; less generality and some consequent advantages in time or
;;; space. The original package on which this is based is far more flexible.
;;; 


  )
;; end of lex environment for *YY..*

;; (heap management parts of this were inspired by code that is
;; Copyright (c) 2002, 2003 Gene Michael Stover., GPL.
;; The code was found ;; at http://cybertiggyr.com/lisp-heap/

#+SBCL (defmacro while (test &rest body) ;SBCL doesn't have "while".
  `(do ()   ((not ,test))     ,@body))


;; Here is a multiplier similar to mulstr but more straightforward in
;; that it doesn't use a heap.  Just uses lisp linked list, as in
;; naive multiplication, for heap.  Less straightforwardly, it uses
;; for the records on the heap, lists of the form (exponent
;; coefficient . program).  The program generates the next exp/coef pair.

;;Manage streams of exponent/coeffs by keeping the linked list
;; sorted. Keep on taking the top off the list! It doesn't win
;; anywhere.

(defun mulstr12(x y);; manage "heap" as linked list!
    (declare (optimize(speed 3)(safety 0)))
    (let* ((hha (if (< (length (car x))(length (car y)))(mularZ2 x y)(mularZ2 y x)))
	   ;; initial sorted linked list
	   (anse (make-array 10 :adjustable t :fill-pointer 0))
	   (ansc (make-array 10 :adjustable t :fill-pointer 0))
	   (preve 0) (prevc 0)
	   (hhl (coerce hha  'list)))
      (declare (integer preve prevc))
      (loop while  (cdr hhl) do
	    (multiple-value-bind
		(newe newc)
		(update-ll hhl)
	      (declare (integer newe newc))
	  ;;    (format t "~% inserting e=~s c=~s with preve=~s" newe newc preve )
					; take next term off heap and insert its successor
	      (cond ((= preve newe)	; is this same exponent as previous?
		     (incf prevc newc))	; if so, add coefficient.
		    (t (unless (= prevc 0); normalize: remove zero coeffs
			 (vector-push-extend prevc ansc)
			 (vector-push-extend preve anse))
		       (setf prevc newc)
		       (setf preve newe))); initialize new coefficient
	      ))
      (vector-push-extend prevc ansc)	;put in last results before exit
      (vector-push-extend preve anse)
      (cons anse ansc)))

(defun update-ll (h)
    ;;return top of linked list, which is necessarily not empty.
    ;; take successor in stream of top of list, and insert it in the list.
    (declare(type (simple-array t(*)) h)(optimize (speed 3)(safety 0)))
    (let* ((rest (cdr h))
	   (top (car rest))		; (car h) is a dummy link to make insertion easier.
	   (ta (car top))(tb (cadr  top) ))
      (multiple-value-bind (newe newc)
	  (funcall (the compiled-function (cddr top)))
	(cond((null newe) 
	      ;; remove top; that's all
	      (setf (cdr h)(cdr rest)))
	     (t	
	      (setf (cdr h)(cdr rest))	; remove top from list
	      (setf (car top) newe (cadr top) newc) ; revise top
	      (setf h (LL-insertx rest (car top) h))
	      ))			;re-insert
	   (values ta tb))))

(defun LL-insertx(r exp L)	;make a list of ( ... (exp coef . rprog)...)
  (let ((ahead (cdr L))) ; compare key is exponent to compare in list
    (cond ((null ahead) 
	   (setf (cdr r) nil)
	   (if L  (setf (cdr L) r)
	     (setf L r))	   )
	   ((<= exp (caar ahead))
	    (setf (cdr r)ahead)
	    (setf (cdr L) r)	   	   )
	   (t (LL-insertx r exp (cdr L))))
    L))
	   
;;; another piece, to make mul-hash1 faster, especially useful when the answer is huge.
;;; we don't want to keep generating terms in answer in a list and sorting it.
;;; we can generate terms in 2 arrays and sort, but we can't use lisp's sort.
;;; at least not easily.  This version slows down GCL.

(defun sorthash1(h ar) ;; utility for mul-hash1
  (let* ((htc (hash-table-count h))
	 (anse (make-array htc  :fill-pointer 0))
	 (ansc (make-array htc  :fill-pointer 0)))
    (maphash #'(lambda (i v)
		 (setf v (aref ar v)); get the coefficient, by indirection
		       (unless (= v 0) ;; normalize! remove 0 coefs
			 (vector-push i anse)
			 (vector-push v ansc)))
	     h)
    (qs anse ansc 0 (1- (length anse))) ;; qs is a quick sort we wrote, with a twist.
    (cons anse ansc) ))

(defun qs(a b lo hi)	 
  ;; a re-implementation of quicksort, where entries in a are used as keys
  ; ;for sorting b too.
  (let ((pivotloc 0))
  (declare(type (array t(*)) a b) (fixnum lo hi pivotloc))
  (cond ((< lo hi)
	 (setf pivotloc (partition a b lo hi))
	 (qs a b lo (1- pivotloc))
	 (qs a b (1+ pivotloc) hi)
	 nil))))

(defmacro swapab(i j)
  `(let((temp& (aref a ,i)))
     (setf (aref a ,i) (aref a ,j))
     (setf (aref a ,j) temp&)
     (setf temp& (aref b ,i)) ;also update b. this is why we wrote qs
     (setf (aref b ,i) (aref b ,j))
     (setf (aref b ,j) temp&)   ))

(defun partition(a b lo hi) ;; the partition step in quicksort
    (declare(type (array t(*)) a b ) (fixnum lo hi))
  (let ((left  lo)(right hi)(pivot (aref a lo))) ;pick a[lo] for pivot
    (declare(fixnum left right))
    (loop while (< left right) do
	  (while (< pivot (aref a right))
	    (decf right))
	  (while (and (<= (aref a left) pivot)(< left hi))
	    (incf left))
	  ;; at this point  a[left]>pivot and  a[right]<=pivot.
	  ;; or left=hi
	  (if (< left right) (swapab left right)) )
    (setf (aref a lo) (aref a right))
    (setf (aref b lo) (aref b right))
    (setf (aref a right) pivot)
    (setf (aref b right) (aref b lo)) ;also update b array. This is why we wrote qs
    right))

(defun test(n m &optional (d 5))	;try these parameters for now.
  (setf *x* (make-racg n d)) 
  (setf *y* (make-racg m d))
  (format t "~% mul-hash1 time - best hash table version")
  (time (mul-hash1 *x* *y*))
  (format t "~% mulstr time - use simple streams, in a heap array, insert from top sometimes")
  (time (mulstr *x* *y*))  
   (format t "~% mul-naive-fin time - just use insertion into single list")
   (time (mul-naive-fin *x* *y*)))


;; TRIP data
#|
Fast Sparse Multivariate Polynomial
Multiplication Using Burst Tries
Micka�el Gastineau and Jacques Laskar
Paris Observatory - IMCCE - CNRS UMR8028
77, avenue Denfert Rochereau
75014 Paris
gastineau@imcce.fr, laskar@imcce.fr

V.N. Alexandrov et al. (Eds.): ICCS 2006, Part II, LNCS 3992, pp. 446�453, 2006.

Table 1. Computation time and memory consumption of s1  (s1 + 1) with s1 =
(1+x+y+z+t)25 using different memory managers. Polynomial is stored as a recursive
list. Measured time (sum of user CPU time and system CPU time) is expressed in
seconds and space used (size of the trip process in memory) in kilobytes.
Machine time (s) memory (Kb)
system rlalloc system rlalloc
Intel Xeon 3.06Ghz - Linux 32-bits - glibc 2.3.4 108 83 22360 19688
Intel Xeon 3.6Ghz - Linux 64-bits - glibc 2.3.4 87 63 44656 33920
Apple G5 2.0Ghz - Mac os X 10.4 64-bits 225 186 28748 22504

better stuff from table 2 in same article,  s2 = (1+x+y+z+t)^20

Computer algebra systems s1  (s1 +1) s2  (s2 + 1)
                         time (s) memory (Kb) time (s) memory (Kb)
PARI/GP 2.1.7               82.1 N/A            225.6 N/A
PARI/GP 2.2.11 (GMP kernel) 67.8 N/A            177.6 N/A
Singular 3.0.1              101.5 8383.5         42.8 982.9
TRIP 0.98 (recursive list) 54.6 7138.5           15.5 1370.0
TRIP 0.98 (recursive vector) 42.9 5953.7         16.5 1831.4
TRIP 0.98 (flat vector)      24.1 3609.1         13.3 721.2

|#
#| our time, 

 (time (mul-hash1 s2 s2p1))
; cpu time (non-gc) 98,661 msec (00:01:38.661) user, 62 msec system
; cpu time (gc)     3,167 msec user, 0 msec system
; cpu time (total)  101,828 msec (00:01:41.828) user, 62 msec system
; real time  102,250 msec (00:01:42.250)
; space allocation:
;  2,112 cons cells, -3,090,540,784 other bytes, 0 static bytes
(#(0 1 2 3 4 5 6 7 8 9 ...)
 . #(1 40 780 780 40 658008 658008 18643560 3838380 273438880 ...))
[4] cl-user(974): 

[4] cl-user(978): (time (mul-naive-fin s2 s2p1))
; cpu time (non-gc) 86,852 msec (00:01:26.852) user, 32 msec system
; cpu time (gc)     3,133 msec user, 0 msec system
; cpu time (total)  89,985 msec (00:01:29.985) user, 32 msec system
; real time  90,641 msec (00:01:30.641)
; space allocation:
;  271,406 cons cells, 1,196,507,336 other bytes, 16 static bytes

[4] cl-user(980): (time (mulstr s2 s2p1))
; cpu time (non-gc) 203,147 msec (00:03:23.147) user, 46 msec system
; cpu time (gc)     1,588 msec user, 0 msec system
; cpu time (total)  204,735 msec (00:03:24.735) user, 46 msec system
; real time  206,203 msec (00:03:26.203)
; space allocation:
;  1,327 cons cells, 1,197,434,912 other bytes, 0 static bytes

;; 1+x+y+z+t  -->  (setf r (cons #(0 1 100 10000 1000000)#(1 1 1 1 1)))

[8] cl-user(88):  (time (mulstr *s2* *s2p1*))
; cpu time (non-gc) 237,750 msec (00:03:57.750) user, 125 msec system
; cpu time (gc)     12,719 msec user, 31 msec system
; cpu time (total)  250,469 msec (00:04:10.469) user, 156 msec system
; real time  252,375 msec (00:04:12.375)
; space allocation:
;  1,356 cons cells, 1,197,434,952 other bytes, 16 static bytes
(#(0 1 2 3 4 5 6 7 8 9 ...)
 . #(1 40 780 9880 91390 658008 3838380 18643560 76904685 273438880

s2^20 has 135751 terms
s2 has 10626 terms.


|#


(defvar *r* nil)
(defvar *s2* nil)
(defvar *s2p1* nil)

(defun setup()
  (let ((k1 nil)(k2 nil))
    (setf *r* (cons #(0 1 100 10000 1000000)#(1 1 1 1 1))); 1+x+y+z+t
    (setf *s2* (power *r* 20))
    (setf k1 (make-array 10626 :initial-contents (car *s2*)))
    (setf k2 (make-array 10626 :initial-contents (cdr *s2*)))
    (setf (aref k2 0) 1)
    (setf *s2p1* (cons k1 k2))))
(defun power(p n)(if (eql n 1) p(mul-naive-fin p (power p (1- n)))))
(defun power2(p n)(if (eql n 1) p(mulstr p (power2 p (1- n)))))