;; Using PAPI on a windows/intel machine from Allegro Common Lisp
;; from Franz Inc.  (see www.franz.com)

;; load winpapi.dll from its standard place.

(load "c:/winnt/system32/winpapi.dll")

(defparameter papi_values (make-array 80 :element-type '(unsigned-byte 8) :initial-element 0))
;; oddly enough the only "interface" problem was that Lisp didn't have
;; a LONGLONG.  So I wrote this program to convert an array of
;; 8 bytes-long objects into a corresponding array of
;; "arbitrary precision integers" or bignums.  

(defun bytes2ll(ba) ;; bytes to long long.  actually a bignum
  ;; 8 bytes represent a long-long. convert to bignum
  ;; ba is a byte array of length 8*n
  ;; return an array of n bignums.
  ;;
  (let* ((len (truncate (length ba) 8))
	 (a (make-array len :initial-element 0))
	 (i8 0)
	 (ans 0))
    (dotimes (i len a) ;return array a
      (setf i8 (1- (* (1+ i) 8)) ans 0)
      (dotimes (j 8 (setf (aref a i) ans))
	(setf ans (+(aref ba (- i8 j))(* 256 ans)))))))
      
;; There are lots of ways of using PAPI, but I only needed these
;; three calls. More parts of the connection could be added.

(ff:def-foreign-call
    (papi_start_counters "PAPI_start_counters")
    ((flags (* :int)) (len :int)) 
  :returning :int)

(ff:def-foreign-call
    (papi_read_counters "PAPI_read_counters")
    ((counters (* :int)) (len :int)) 
  :returning :int)

(ff:def-foreign-call
    (papi_stop_counters "PAPI_stop_counters")
    ((counters (* :int)) (len :int)) 
  :returning :int)

;; all negative numbers are errors.  see papi.h for decoding:
#|
#define PAPI_OK        0  /*No error*/
#define PAPI_EINVAL   -1  /*Invalid argument*/
#define PAPI_ENOMEM   -2  /*Insufficient memory*/
#define PAPI_ESYS     -3  /*A System/C library call failed, please check errno*/
#define PAPI_ESBSTR   -4  /*Substrate returned an error, 
			    usually the result of an unimplemented feature*/
#define PAPI_ECLOST   -5  /*Access to the counters was lost or interrupted*/
#define PAPI_EBUG     -6  /*Internal error, please send mail to the developers*/
#define PAPI_ENOEVNT  -7  /*Hardware Event does not exist*/
#define PAPI_ECNFLCT  -8  /*Hardware Event exists, but cannot be counted 
                            due to counter resource limitations*/ 
#define PAPI_ENOTRUN  -9  /*No Events or EventSets are currently counting*/
#define PAPI_EISRUN  -10  /*Events or EventSets are currently counting */
#define PAPI_ENOEVST -11  /* No EventSet Available */
#define PAPI_ENOTPRESET -12 /* Not a Preset Event in argument */
#define PAPI_ENOCNTR -13 /* Hardware does not support counters */
#define PAPI_EMISC   -14 /* No clue as to what this error code means */

|#


#| There are many possible events we can look for. Here are a few we
think are important for our testing, but (depending on your machine)
there are probably more. These numbers are from papiStdEventDefs.h
In our experience the fact that these items are defined does not
mean that they are actually implemented on your particular chip.
  
#define PAPI_L1_TCM  0x80000006 /*Level 1 total cache misses*/
#define PAPI_L2_TCM  0x80000007 /*Level 2 total cache misses*/
|#
(defconstant papi_l1_dcm  #x80000000)	;data cache misses, level 1
(defconstant papi_l1_icm  #x80000001)	;instruction cache misses
(defconstant papi_l1_tcm  #x80000006)   ;total cache misses
(defconstant papi_l2_tcm  #x80000007)
(defconstant papi_offset  #x80000000)

;;Live events on my pentium 3 are supposedly these on my pentium 3.
; I don't believe them though :
#| (for actual hex numbers, add papi_offset..)
  0	l1 cache
  1 	l1 inst
  6 	l1 total miss
  7 	l2 total miss
  a 	shr request for shared cache line
  b 	cln
  c 	inv
  d 	itv
 15 	instr trans lookaside buffer miss
 17 	l1 load miss
 18 	l1 store miss
 1b 	btac miss
 29 	hardware interrupts
 2b 	conditional branches executed
 2c 	conditional branches taken
 2d 	not taken
 2e 	mispredicted
 2f 	corrected predicted
 31 	total instr executed
 32 	integer inst executed
 34 	fp executed
 37 	total branch inst executed
 38 	vector / simd inst executed
 39 	fp inst per sec
 3a 	cycles process is stalled
 3c 	total cycles
 3d 	instr/sec
 40 	l1 d cache hit
 41 	l2 d cache hit
 42 	l1 d cache access
 43 	l2 d cache access
 46 	l2 d cache read
 49 	l2 d cache write
 4b 	l1 i cache hits
 4e 	l1 i cache accesses
 4f 	l2 i cache accesses
 51 	l1 i cache reads
 52 	l2 i cache reads
 54 	l1 i cache writes
 5a 	l1 total cache accesses
 5b 	l2 total cache accesses
 5e 	l2 total cache reads
 61 	l2 total cache writes
 |#


(defparameter *last-time* 0)

(defun start-ccm(); count L1 L2 total cache misses
  (let((ar2 
	(make-array 2 :element-type '(unsigned-byte 32) 
		    :initial-contents  (vector papi_l1_tcm papi_l2_tcm)))
       )
    (setf *last-time* (get-internal-run-time))
  (papi_start_counters ar2 2)
  ))

(defun read-ccm(printp)			;
  (papi_read_counters papi_values 2)
  (let* ((ans(bytes2ll papi_values))
	 (newtime (get-internal-run-time))
	 (diff (- newtime *last-time*)) )
    (setf *last-time* newtime)
    (if printp  (format t "~% L1 cache misses = ~e, L2 cache misses=~e, runtime=~s"
			(float (aref ans 0))
			(float (aref ans 1))
			diff)
      (list 	(aref ans 0)
		(aref ans 1)
		diff))))


;;; HERE ARE SOME OTHER EXAMPLES...

(defun start-ccmX(); count L1 data  and instruction cache misses
  (let(( ar2 
	 (make-array 2 :element-type '(unsigned-byte 32) 
		     :initial-contents  (vector papi_l1_dcm papi_l1_icm))))
  (papi_start_counters ar2 2)
  ))

(defun start-ccmY(); count inst and L1 total cache access
  (let(( ar2 
	 (make-array 2 :element-type '(unsigned-byte 32) 
		     :initial-contents  (vector (+ papi_offset #x31 )(+ papi_offset #x5a )))))
    (papi_start_counters ar2 2)))



(defun read-ccmX(printp)		;
  (papi_read_counters papi_values 2)
  (let ((ans(bytes2ll papi_values)))
    (if printp  (format t "~% L1 data misses= ~e, L1 inst misses=~e, total misses ~e" (float (aref ans 0))
			(float (aref ans 1))
			(float (+(aref ans 0)(aref ans 1)))))))

(defun read-ccmY(printp)		;
  (papi_read_counters papi_values 2)
  (let ((ans(bytes2ll papi_values)))
    (if printp  (format t "~% total instr = ~e, l1 cache accesses=~e" (float (aref ans 0))
			(float (aref ans 1))))))

(defun sc()(start-ccm)) ;; shorthand
(defun rc(p)(read-ccm p))		;p is t if you want to read and PRINT results

;; typical usage
;; (sc) ;start counting. e.g. L1 and L2 cache misses

;; (progn
;;    (rc nil)				;read and reset counters
;;    (compute something)
;;    (rc t) ;; print the results
;;   )


;;;;;;;;;;;;;;;;;;;;;;that's it;;;;;;;;;;;;;;;;;;;

(defun mrl(n) ;;make random list
  (let ((ans nil))
    (declare (fixnum n))
    (dotimes (j n ans)
      (declare (fixnum j))
      (push (random #.(truncate most-positive-fixnum 4))
	    ans))))

(defun lengthm(l m)			;compute length of list l m times 
  (declare(optimize(speed 3)(safety 0)(debug 0)) (fixnum m)(list l))
  (dotimes (j m)
      (declare (fixnum j))
      (length l)))

(defun mra(n) ;;make random array
  (let ((ans (make-array n :element-type 'fixnum)))
    (declare (fixnum n))
    (dotimes (j n ans)
      (declare (fixnum j))
      (setf (aref ans j) (random #.(truncate most-positive-fixnum 4))))))

(defun lengtha(a m)
  ;;compute touch each element in array a m times
  (declare (type (simple-array t (*)) a)
	   (fixnum m)
	   (optimize(speed 3)(safety 0)(debug 0)))
	   (let ((r 0))
	     (dotimes (j m)
	       (declare (fixnum j))
	       (dotimes (k (length a))
		 (declare (fixnum k))
		 (setf r (aref a k))))))



(defmacro timecache(m)`(time (progn (rc nil),m(rc t))))
(defmacro timecacheX(m)`(time (progn (read-ccmX nil),m(read-ccmX t))))
(defmacro timecacheY(m)`(time (progn (read-ccmY nil),m(read-ccmY t))))
;; to see gc messages on allegro console..
(setf (sys:gsgc-switch :print) t)
(setf (sys:gsgc-switch :verbose) t)
(setf (sys:gsgc-switch :stats) t)
#|  Here's what we did:  make a list R of 16000 random numbers.
R is allocated off the lisp free list and is routinely
composed of sequential locations in memory.

note about cache miss data and times:  the L1/L2/total and I/D cache
miss data was taken on different runs and does not necessarily add up.

The first timed test is to  call (length R) 10000 times. 

This takes 0.701 seconds with  L1 misses= 4.01e7, L2 misses= 3.36e4
 of the L1 misses                 data    4.01e7 
.                                 inst    0.00168e7   i.e. 1.68e4
 
 The second timed test:
Take R and destructively sort it.  This randomizes the sequence in locations
in memory, since R was initialized to random values.
Repeat the timed test of calling (length R) 10000 times. 

This takes 1.151 seconds with L1 misses= 14.43e7  L2 misses= 4.17e4.
 of the L1 misses                data    14.48e7
.                                inst     0.0023e7    i.e.  2.3e4
 
 That is, after randomization the elapsed time  increased by 0.45 sec, 64%
 L1 cache misses, (almost all due to data) increased by 10.47e7, 261%
 L2 cache misses increased by 2.53e7 or 154%.

In a pentium III we have
32-byte cache lines. There will be 4 cons cells. If they
are sequentially accessed, we will get 3 cache hits then a miss. If they
are randomly accessed, each reference will give us a cache miss.
So there should be about 3 times more misses. 2.61 is measured.

Next part of the test:
Do a garbage collection; call (length R) 10000 times. 
This takes 0.711 seconds with L1 misses= 4.01e7   L2 misses= 3.85e4
of the L1 misses                 data    4.01e7
.                                inst    0.00076e7  i.e. 7.597e3

Conclusion: The lisp copying/generation scavenging garbage
collector had reorganized the memory to regenerate locality in this
test. 

Next test, compare lists to arrays.
Allocate 16000 single-word numbers in an array. This array is
actually half the size of a 16000 item list in memory, since
the array does not need space for pointers.

Touch them all in sequence one after another. (10000 times). For
this test there will be 8 sequential elements rather than 4 in a
cache line, and so the number of cache misses is predicted to be
half of the earlier number, 4.01e7.  This is very close
to the number measured at 2.01e7. 

This takes 0.891 seconds with L1 data misses= 2.007e7  
.                                inst misses= 0.0022  ie ..2.2e4

The cache behavior of the array is not affected by sorting.

Moral of the story:

Here are two ways to get less memory thrashing by two mechanisms:

(a) Find a more compact representation that is accessed sequentially
and doesn't "waste space" for pointers etc. (like a packed array).
(b) Leaving data in lists and perform a copying garbage collection. In
an ideal case the layout of the data in the copied list will mirror
the data access pattern of your program, and there will be substantial
improvement in locality.

The second of these possibilities is not so well known, but in this
simple benchmark it speeds the computation by 63%.  As they say,
your mileage may vary.




other notes.  
(lengtha a16000 10000) compiled with declarations
takes about .872 seconds
(lengtha a16000 10000) compiled without declarations
takes about 16.7 seconds, or 19.1 times slower
(lengtha a16000 10000) uncompiled
should take about 4,706 seconds.   (I only timed 10 iterations) or 5,396 times
slower.

|#


(defun t1(n);; n is length of list
  (let ((z (mrl n))
	(count (truncate 1000000000 n))) ;; that's 1.0d+9
    (start-ccm)
    (format t "~%statistics for length of list")
    (statlength z) 
    (format t "~%just allocated time")
    (timecache (lengthm z count))
					;(setf z (sort z #'>))
    (setf z (mergesort z))
    (format t "~%statistics for length of list, scrambled")
    (statlength z) 
    (format t "~%after scrambling time")
    (timecache (lengthm z count))
    (gc)
    (format t "~%statistics for length of list, after GC")
    (statlength z) 
    (format t "~%after GC time")
    (timecache (lengthm z count))))

(defun t2(n);; n is length of list
  (let ((z (mrl n))
	(count (truncate 1000000000 n))) ;; that's 1.0d+9
    (start-ccmY)
    (format t "~%just allocated time")
    (timecacheY (lengthm z count))
    (setf z (sort z #'>))
    (format t "~%after scrambling time")
    (timecacheY (lengthm z count))
    (gc)
    (format t "~%after GC time")
    (timecacheY (lengthm z count))))

(defun t3(n);; n is length of list
  (let ((z (mrl n))
	(count (truncate 1000000000 n))) ;; that's 1.0d+9
    (start-ccmX)
    (format t "~%just allocated time")
    (timecacheX (lengthm z count))
    (setf z (sort z #'>))
    (format t "~%after scrambling time")
    (timecacheX (lengthm z count))
    (gc)
    (format t "~%after GC time")
    (timecacheX (lengthm z count))))


#| data summary
  n       fresh                 scrambled          after GC
        time    L1    l2       time  L1   l2      time   L1  l2
10     5.9    1.1e6 1.5e5      5.9   1.1  1.6	  5.9	 1.1 1.5 
             L1 cache misses about 1.5:1 = data:instr. instr misses = 2.6e5
20     4.7    8.6e5 1.7e5      4.7   9.8  1.9	  4.7	 9.7 1.8 
50     3.8    7.9e5 1.5e5      3.8   7.9  1.6	  3.8	 9.8 2.4
50     3.8    8.0e5 1.6e5      3.8   7.1  1.4	  3.9	 8.1 1.6 ;; GC no help
100    3.6    6.5e5 1.3e5      3.6   9.2  2.0	  3.6	 9.3 2.0 ;; GC no help
200    3.5    6.3e5 9.6e4      3.5   7.3  11	  3.4	 6.4 9.9 ;; GC so/so
400    3.4    7.1e5 1.5e5      3.4   7.3  1.5	  3.4	 7.8 1.5 ;; GC no help
1000   4.1    8.6e5 1.1e5      3.6   7.0  0.9	  5.8	 41. 1.4 ;; GC worse?
             L1 cache misses about 2:1  = data:instr.  instr misses = 1.4e5
1000   4.0    8.9e5 1.3e5      4.6   9.5  1.4	  4.5	 11. 1.6 ;; do over
2000   6.5    9.4e6 1.8e5      6.3   11	  1.6	  3.4	 8.2 1.1 ;; vg 4 gc
             L1 cache misses about 20:1 to 33:1  = data:instr
             at n=2000, the data ceases to fit in L1 cache
2000   6.0    6.9e6 2.0e5      6.3   14   8.4	  3.5	 5.7 0.95 ;;vg 4 gc
4000   6.4    2.5e8 1.8e5      7.2   5.8  1.8	  6.5	 2.5 1.6  ;;g  4 gc
8000   4.5    2.5e8 1.5e5      6.8   8.0  1.9	  4.5	 2.5 1.3  ;;vg 4 gc
          L1 cache misses about 1000:1 to 3000:1  = data:instr instr misses 2.5e5
16000  4.5    2.5e8 2.2e5      7.3   9.1  3.1	  4.5	 2.5 2.3  ;;vvg 4 gc
20000  4.5    2.5e8 3.2e5      7.3   9.2  4.8	  4.5	 2.5 3.4 ;;vg 4 gc
30000  4.9    2.5e8 3.8e6      8.7   9.5  8.6	  9.2	 2.5 40. ;; bad??L2
30000  4.9    2.5e8 3.6e6      8.7   9.5  8.5	  13.5	 2.5 77. ;; bad??L2
32000  9.8    2.5e8 4.6e7      12.1  9.5  3.4	  9.8	 2.5 4.5 ;; g ??L2
     L1 cache misses about 700:1 to 1900:1  = data:instr instr misses 3.6e5 to 4.9e5
            at n=32768, the data ceases to fit in L2 cache
35000 23.2    2.5e8 1.5e8      22.5  9.6  1.1	  18.4	 2.5 1.2 ;;g 4 gc
40000 33.8    2.6e8 2.5e8      27.4  3.1  1.9	  31.5	 2.6 2.3 ;; bad??L2
50000 33.9    2.6e8 2.5e8      33.1  5.1  2.1	  34.0	 2.6 2.6 ;;?? l2

Program t1 will run always run the length function over 10^9 items.
For example, 10^6 times on a list of size 10^3= 10^9 accesses.  
If each access took the cache system by surprise there
would be 10^9 L1 and L2 cache misses. For 25,000 runs of 40,000 elements, we
got to about 1/4 of that rate. Note that at 32,768 8-byte cells the L2
cache capacity is reached and the number of cache misses increases by
a factor of 70, and the execution time by a factor of 7.
Program t3 counts only L1 cache misses, but separates the
data into instruction and data accesses.

The correlation between numbers from program t1 and t3 are somewhat disappointing.

|#

(defun mylength2(a n)
  (declare (list a)(fixnum n)(optimize (speed 3)(safety 0)(debug 0)))
  (cond ((null a) n)(t (mylength2 (cdr a)(1+ n)))))


(defun t4(n);; n is length of list
  (let ((z (mrl n))
	(count (truncate 1000000000 n))) ;; that's 1.0d+9
    (start-ccmX)
    (format t "~%just allocated time")
    (timecacheX (lengthm2 z count))
    (setf z (sort z #'>))
    (format t "~%after scrambling time")
    (timecacheX (lengthm2 z count))
    (gc)
    (format t "~%after GC time")
    (timecacheX (lengthm2 z count))))

(defun lengthm2(l m)			;compute length of list l m times 
  (declare(optimize(speed 3)(safety 0)(debug 0)) (fixnum m)(list l))
  (dotimes (j m)
      (declare (fixnum j))
    (mylength2 l 0)))


(defmacro timecacheW(m)`(progn (read-ccm nil),m(read-ccm t)))

(defparameter *timedata* nil)

(defun t5(n);; n is length of list
  (let ((z (mrl n))
	(count (truncate 1000000000 n))) ;; that's 10^9/n
    (setf z (sort z #'>))
    (format t "~%after scrambling time")
    (push `(,n s ,(timecacheW (lengthm2 z count))) *timedata*)
    (gc)
    (format t "~%after GC time")
    (push `(,n g ,(timecacheW (lengthm2 z count))) *timedata*)
	 ))

;;;time data from t5
'((200000 g (302214143 277477395 40849))
  (200000 s (301298288 277335463 40729))
  (200000 j (254375212 253184531 34089))
  (100000 g (254070731 252939805 34129))
  (100000 s (355716615 276144704 40468))
  (100000 j (255050908 253102448 34129))
;  (100000 g (253984042 252904546 34159)) 
;  (100000 s (453657710 320011316 47128))
					;  (100000 j (254137435 252903523 34109))
  (80000 g (254862100 253049921 34209))
  (80000 s (311975311 253843240 35811))
  (80000 j (254101522 252764418 34089))
					; ( 80000 g (254073693 252938896 34139)) 
					;  (80000 s (457181585 291849093 44063))
					;  (80000 j (254162233 252782112 34139))
  (50000 g (253339823 252228422 33919))
  (50000 s (517831098 200812961 31996))
  (50000 j (253415851 251947113 33949))
  (40000 g (254704833 252248962 34139))
  (40000 s (256857603 251438083 35551))
  (40000 j (254656325 252416353 34129))
					;  (40000 g (253826963 251950180 33889))
					;  (40000 s (348816828 164564752 24034))
					;  (40000 j (253380822 251502547 33919))
  (35000 g (252426063 124851307 19098))
  (35000 s (963043955 109639398 22432))
  (35000 j (252906818 169818240 24375)) 
  (30000 g (251560818 38514280 8903))
  (30000 s (951928394 32471367 11717))
  (30000 j (251701135 45467535 9754))
  (20000 g (251098282 331958 4396)) 
  (20000 s (922729575 603923 7401)) 
  (20000 j (251164191 311352 4426)) 
  (10000 g (251744501 203497 4397));
  (10000 s (843541842 254946 7000));
  (10000 j (251574116 101990 4396));
  (5000 g (252586666 85126 4396))
  (5000 s (681926449 112443 6280))
  (5000 j (252574524 80024 4386))
  (2000 g (23417658 68930 3375))
  (2000 s (13975002 62048 3334))
  (2000 j (23421759 64154 3395))
  (1000 g (407684 66331 3325))
  (1000 s (398824 62925 3335))
  (1000 j (383136 55981 3315))
  (500 g (383813 80679 3385))
  (500 s (326318 59157 3384))
  (500 j (306504 49966 3375))
  (200 g (335370 66803 3535))
  (200 s (330773 66684 3535))
  (200 j (328054 64491 3535))
  
					;(200 g (1051307 164993 3646))
					;(200 s (365089 86998 3525))
					;(200 j (302947 57540 3525))
  (100 g (318497 60956 3775))
  (100 s (315211 61424 3786))
  (100 j (317666 63384 3775))
  (50 g (353506 63919 4296))
  (50 s (348598 64630 4316))
  (50 j (337807 61081 4297)) 
  (20 g (489692 97322 5828))
  (20 s (447945 83095 5748))
  (20 j (506673 103447 5749))
  (10 g (631608 113339 8181))
  (10 s (654784 119475 8312))
  (10 j (728567 141596 8292)))

;;;;;;;;;;more thoughts.
#| If the lisp GC puts things in memory in the order in which I think it does,
namely, copy a cons cell X into relative location 0, then  (CDR X)into
relative location 1  (actually 1*(8 bytes)), (recursively)  and then (CAR X),
that means traversing a tree in breadth-first order would be faster than
depth first, if the tree gets big enough. |#

(eval-when (compile)(proclaim '(optimize (speed 3)(safety 0)(debug 0))))

(defun visit1(x)            ; visit car-cdr (slower?)
  (cond(x(visit1 (car x))(visit1(cdr x)))))

(defun visit2(x)            ; visit cdr-car (faster?)
  (cond(x(visit2 (cdr x))(visit2(car x)))))

(defun mt(n)                ; make-tree
  (cond((= n 0) nil)
       (t (let ((r (mt(1- n))))
	    (cons r (copy-tree r))))))

(defun mt4(n)                ; make-tree alternative
  (cond((= n 0) nil)
       (t (let ((r (mt(1- n))))
	    (list r (copy-tree r) (copy-tree r)(copy-tree r))))))

(defun mtl(n)                ; make-tree
  (cond((= n 0) nil)
       (t (let ((r (mt(1- n))))
	    (list r (copy-tree r) (copy-tree r)(copy-tree r))))))


(defun tx1(n tree)
  (time (dotimes (i (expt 2 n))(declare (fixnum i)) (visit1 tree))))

(defun tx2(n tree)
  (time (dotimes (i (expt 2 n)) (declare (fixnum i))(visit2 tree))))

;(setf t15 (mt4 16))(tx1 10 t16)(tx2 10 t16)  ;; 10+16=constant

(defun visit3(x)            ; visit cdr-car (faster?)
  (cond(x(visit2 (cdr x))(visit2(car x)))))

(defun visit4(x)(bfs (list x) x))

;; breadth first search..

(defun bfs (queue x)
     (if (null queue) nil
        (let ((path (car queue)))
          (let ((node (car path)))
            (if (null x)nil
                (bfs  
                     (append (cdr queue)
                             (new-paths path node x))
                     x))))))

(defun whereis(x &optional (rel 0))
  ;; where is x in memory (relative to rel, if present)
  (- (excl::pointer-to-address x)
     (excl::pointer-to-address rel)))

(defun avlength(z)
  (let ((sum 0)
	(len (length z))
	(r z))
    (declare (fixnum len r))
    (mapl #'(lambda(x)
	      (incf sum(abs(whereis x r)))
	      (print (abs (whereis x r)))
		 (setf r x))
	     z	 )
    (float (/ sum (1- len)))))

(defun statlength(z)
  ;; look at the lengths of links in a list z
  ;; keep track in bins.
  (let ((bins '(8 16 24 32 64 #.(expt 2 10) #.(expt 2 20)))
	(counts (list 0 0 0 0 0 0 0 0))
	(r 0)
	(len 0))
    (declare (fixnum len r))
    (mapl #'(lambda(x)
	      (statcount(abs(whereis x r)) counts bins)
	      (setf r x))
	  z )
    
    (format t "~%<=~1,8@t~{~s~1,8@t~}~%  ~%count~1,8@t~{~s~1,8@t~}~% " bins counts)
    (values)))

(defun statcount(num clist bins)
  ;; num is pos integer
  ;; clist is at least one longer than bins 
  (cond((null bins)(incf (car clist)))
       ((<= num (car bins))(incf (car clist)))
       (t (statcount num (cdr clist)(cdr bins))))
  clist)

;; like statlength but look at car and cdr links both


(defparameter bins nil)
(defparameter counts nil)
(defun sttl(z) ;stattreelength
  ;; look at the lengths of links in a list z
  ;; keep track in bins.
  (let ((bins '(8 16 24 32 64 #.(expt 2 10) #.(expt 2 20)))
	(counts (list 0 0 0 0 0 0 0 0)))
    (declare (special bins counts))
    (cond((atom z)nil)
	 (t(sttlx z)))
    (format t "~%<=~1,8@t~{~s~1,8@t~}~%  ~%count~1,8@t~{~s~1,8@t~}~% " bins counts)
    (values)))
(defun sttlx (z)
  (cond ((not (atom z))  (sttlx1 z (cdr z))  (sttlx1 z (car z)))))

(defun sttlx1(here next)
  (cond ((not (atom next))  ;; don't count pointers to nil or atm
	 (statcount(abs(whereis next here)) counts bins)
	 (sttlx next))))

    



	
;;; an in-place mergesort
;;; split at least 2 
(defun split (x)  ;; break the list x into two parts, odd and even
  (cond ((null x)(values nil nil))
	((null (cdr x))(values x nil))
	(t
	 (let((a x)
	      (b (cdr x))
	      (c (cddr x)))
	   (setf (cdr a) nil)
	   (setf (cdr b) nil)
	   (split1 c a b)))))

(defun split1(x y z)
  (cond((null x))
      ((null (cdr x))(setf (cdr x)(cdr y)) (setf (cdr y) x))
       (t
	(let((a x)
	     (b (cdr x))
	     (c (cddr x)))
	  (setf (cdr a) (cdr y))
	  (setf (cdr y)a)
	  (setf (cdr b) (cdr z))
	  (setf (cdr z)b)
	  (split1 c a b))))
       (values y z))
	
(defun mymerge(x y);; faster version of merging, destructively (cf. CL version)
  (cond ((null x) y)
	((null y) x)
	((>= (car x)(car y))
	 (setf (cdr x)(mymerge (cdr x) y))
	 x)
	(t (mymerge y x))))


(defun mergesort(r)
  (cond ((null r) nil)
	((null (cdr r)) r)
	(t(multiple-value-bind
	     (x y)
	     (split r)
	    ;;(merge 'list (mergesort x)(mergesort y) #'<)
	    (mymerge (mergesort x)(mergesort y))
	    ))))

	       
;;;;;;;;;;
;;; slow program
#|
(;; take a list of numbers and put them in order by manipulating 
;; the list cells.  Important not to provoke a GC.

(defun sortone(k)
  (declare(optimize(speed 3)(safety 0)(debug 0)))
  ;; k is a list.  take (cadr k) and put it in order in (cddr k)
  ;; destructively. Don't mess with (car k); use it for a handle.

  (cond ((null (cddr k)));; e.g. '(? 10 ) can't be sorted here.
	;; now we have '(? 10 30)  or '(? 30 10)
	((< (the fixnum (cadr k))(the fixnum (caddr k)))); '(? 10 30) ; sorted, do nothing
	(t				; '(? 30 10  z  )
	 (let ((a (cdr k)))		; a is  (30 10 z )
	   (setf (cdr k)(cddr k))	; k is (? 10 z)
	   (setf (cdr a)(cddr k))	;   a is (30 z)
	   (setf (cddr k) a))		;  k is (? 10 30 z)
	 (sortone (cdr k))))
  k)

(defun mysort(x)
  (cdr (sortall (cons 'header x))))

;; slow.  doing this wrong?..

(defun sortall (x)
  (cond ((null (cdr x)) x)
	(t(sortall (cdr x))
	  (sortone x)))) 
) |#

(defun mrb(n) ;;make random list
  (let ((ans nil))
    (dotimes (j n ans)
      (setf ans (cons ans  'x)))))