;;; -*- Lisp -*-

#|
Multiplication of sparse polynomials P, Q, using the following strategy:
Using a bit-vector as working storage, determine the location of all
possible non-zero coefficients in the answer. Namely, insert a 1 in
the vector for the sum of each exponent pair, r+s, where r is an exponent
in P, and s is an exponent in Q. This, in effect, computes a "skeleton" for
the answer, in an attempt to reduce the time to access a sparse data structure
as we compute the product.

If this bit-vector contains T ones, then allocate two arrays of length T,
one for exponents E and one for coefficients C.  Repeat the doubly-nested loop.
Now for the sum of each exponent pair, q=r+s, compute the related coefficient
product and add it in to the accumulated result.  Finding the place i where q
appears in E. Then add to C[i] the product of the associated coefficients in P and Q.

Finding the place i is done by a binary search, but where upper/lower bounds for
the search are provided by some simple calculations in the loop.
|#

(eval-when (compile load) (declaim (optimize (speed 3)(safety 0))))

;; This method should not be used if the highest degree is
;; extravagant: We have to allocate a bit array of size equal to the
;; degree of the answer.  This technique may win on a kind of
;; middling-sparse domain.  If the degree is low and there are many
;; non-zero terms, a dense array makes a better structure.

;; If the degree is very high, a heap version should be better.

(defun mul-naive-b (x y);; arrange the data structure by computing a skeleton of answer exponents
  ;; if hideg (computed below) is huge, we may not want to do this at all.
  (let* ((xe (car x))
	 (ye (car y))
	 (yl (length (the simple-array ye)))
	 (xl (length (the simple-array xe)))
	 (yemax  (aref ye (1- yl)))
	 (hideg (+ (aref xe (1- xl))yemax ));degree of answer
	 )
    (declare(fixnum yl xl hideg yemax)
	    (type (simple-array t (*)) xe ye))
    (cond 
     ((> hideg 1000000)
      (format t  "~%mul-naive-b can't be used: too high degree: ~s" hideg)
      (mul-heap5 x y)
      ;; probably use heap
      )
     ;;  ((<  hideg (* 3000 xl yl))
     ;;   (format t  "~%mul-naive-b shouldn't be used: too low degree: ~s" hideg)
     ;; probably use dense
     ;;(mul-dense x y)     )

     (t(let((ansbit (make-array (1+ hideg) :element-type 'bit :initial-element 0))
	    (xc (cdr x))  (yc (cdr y)) (low 0)(high 0)
	    (j 0)  (xei 0)  (xci 0)  (ee 0) (ae #())  (ac #())
	    (termcount 0))
	 (declare(fixnum xei j ee termcount low high yemax)
		 (type (simple-array t (*)) xc yc ansy ans)
		 (type (simple-array bit (*)) ansbit))
	 ;; exponent "skeleton" keeping track of maximum number of
	 ;; non-zero coefs possible in termcount.

	 (dotimes (i xl)
	   (declare (fixnum i))
	   (setf xei (aref xe i))
	   (dotimes(j yl) ;; all could be done in parallel
	     (declare (fixnum j))
	     (setf ee (the fixnum(+ xei(the fixnum (aref ye j)))))
	     (unless (eq 1 (sbit ansbit ee))
	       (setf (sbit ansbit ee) 1)
	       (incf termcount))))
	   
	 ;; now we know exactly how many terms in answer, allocate storage.
	 ;; If we determine that the number of terms is nearly the same as hideg,
	 ;; we could divert this again and just use mul-dense.

	 (cond ((>(* 4 termcount) hideg) ;25% of terms are non-zero
		(format t "~% using mul-dense; ~s terms, ~s degree" termcount hideg)
		(return-from mul-naive-b (mul-dense x y))))
	 
	 (setf ae (make-array termcount))
	 (setf ac (make-array termcount :initial-element 0))
	 ;; initialize exponents in answer. j=0 to start
	 (dotimes (i (1+ hideg) ae) ;;all could be done in parallel
	   (declare(fixnum i))
	   (unless (zerop (aref ansbit i))
	     (setf (aref ae j) i)
	     (incf j)))
	 ;; finally, do the multiplications in place
	 (dotimes (i xl (cons ae ac))
	   (declare (fixnum i))
	   (setf xei (aref xe i) xci (aref xc i))
	   ;; use low and high to delimit the binary search to possible matches
	   (setf low -1)
	   (setf high  (1+ (binsearch-updateLH ae ac (+ xei yemax) 0 0  termcount)))
	   (dotimes (j yl) ;; all could be done in parallel
	     (declare (fixnum j))
	     (setf low
		   (binsearch-updateLH ae ac 
				       (the fixnum(+ xei(the fixnum (aref ye j))))
				       (* (aref yc j) xci) (1+ low) high)))))))))


(defun binsearch-updateLH (indexA valA  ind val lo hi)
  ;; Binary search and update, with low and high search parameters.
  ;; All exponents are in array indexA, all values in array valA
  ;; find j such that  ind=indexA[j]. Then add val to element valA[j]   
  ;; by construction there is such a j between lo and hi.  Also indexA is sorted.
  (declare (fixnum ind lo hi)
	   (type (simple-array t (*)) indexA valA))
  (let ((p 0)
	(midpt 0))
    (declare (fixnum p midpt))
    (loop
     (setf p (ash (- hi lo) -1))
     (cond((eq p 0);; either lo=hi or lo=hi-1
	   ;; all values are in the array, so we found it.
	   (incf (aref valA lo) val)
	   (return lo)))
     (setf p (+ lo p))
     (setf midpt (the fixnum (aref indexA p))); midpoint, about.
     (cond ((eq midpt ind)
	    (incf (aref valA p) val)
	    (return p));; the only other way out
	   ((< ind midpt)
	    (setf hi p))
	   (t (setf lo p))))))
	  
#|

 sample polynomial:  (#(2 5 6 7 9) . #(2 1 3 3 1))
 has exponents 2,5,6,7,9 and corresponding coeffs 2,1,3,3,1.
 make-racg is defined in newmul.fasl.
 
(setf q (make-racg 5000 100 5)r (make-racg 1000 100 5))
cl-user(254): (time (mul-naive-b q r))
; cpu time (non-gc) 1,171 msec user, 0 msec system
cl-user(253): (time (mul-dense q r))
; cpu time (non-gc) 235 msec user, 0 msec system
 cl-user(286): (time (mul-heap5 q r))
; cpu time (non-gc) 3,375 msec user, 0 msec system

(setf q (make-racg 5000 10000 5)r (make-racg 1000 10000 5))
cl-user(291): (time (mul-heap5 q r))
; cpu time (non-gc) 5,359 msec user, 0 msec system
cl-user(290): (time (mul-dense q r))
; cpu time (non-gc) 2,375 msec user, 0 msec system

cl-user(293):  (time (mul-rtree4 q r))
; cpu time (non-gc) 22,064 msec user, 140 msec system

cl-user(295): (time (mul-naive q r))
; cpu time (non-gc) 458,844 msec (00:07:38.844) user, 157 msec system


|#