;; Using PAPI on a windows/intel machine from Allegro Common Lisp ;; from Franz Inc. (see www.franz.com) ;; load winpapi.dll from its standard place. (load "c:/winnt/system32/winpapi.dll") (defparameter papi_values (make-array 80 :element-type '(unsigned-byte 8) :initial-element 0)) ;; oddly enough the only "interface" problem was that Lisp didn't have ;; a LONGLONG. So I wrote this program to convert an array of ;; 8 bytes-long objects into a corresponding array of ;; "arbitrary precision integers" or bignums. (defun bytes2ll(ba) ;; bytes to long long. actually a bignum ;; 8 bytes represent a long-long. convert to bignum ;; ba is a byte array of length 8*n ;; return an array of n bignums. ;; (let* ((len (truncate (length ba) 8)) (a (make-array len :initial-element 0)) (i8 0) (ans 0)) (dotimes (i len a) ;return array a (setf i8 (1- (* (1+ i) 8)) ans 0) (dotimes (j 8 (setf (aref a i) ans)) (setf ans (+(aref ba (- i8 j))(* 256 ans))))))) ;; There are lots of ways of using PAPI, but I only needed these ;; three calls. More parts of the connection could be added. (ff:def-foreign-call (papi_start_counters "PAPI_start_counters") ((flags (* :int)) (len :int)) :returning :int) (ff:def-foreign-call (papi_read_counters "PAPI_read_counters") ((counters (* :int)) (len :int)) :returning :int) (ff:def-foreign-call (papi_stop_counters "PAPI_stop_counters") ((counters (* :int)) (len :int)) :returning :int) ;; all negative numbers are errors. see papi.h for decoding: #| #define PAPI_OK 0 /*No error*/ #define PAPI_EINVAL -1 /*Invalid argument*/ #define PAPI_ENOMEM -2 /*Insufficient memory*/ #define PAPI_ESYS -3 /*A System/C library call failed, please check errno*/ #define PAPI_ESBSTR -4 /*Substrate returned an error, usually the result of an unimplemented feature*/ #define PAPI_ECLOST -5 /*Access to the counters was lost or interrupted*/ #define PAPI_EBUG -6 /*Internal error, please send mail to the developers*/ #define PAPI_ENOEVNT -7 /*Hardware Event does not exist*/ #define PAPI_ECNFLCT -8 /*Hardware Event exists, but cannot be counted due to counter resource limitations*/ #define PAPI_ENOTRUN -9 /*No Events or EventSets are currently counting*/ #define PAPI_EISRUN -10 /*Events or EventSets are currently counting */ #define PAPI_ENOEVST -11 /* No EventSet Available */ #define PAPI_ENOTPRESET -12 /* Not a Preset Event in argument */ #define PAPI_ENOCNTR -13 /* Hardware does not support counters */ #define PAPI_EMISC -14 /* No clue as to what this error code means */ |# #| There are many possible events we can look for. Here are a few we think are important for our testing, but (depending on your machine) there are probably more. These numbers are from papiStdEventDefs.h In our experience the fact that these items are defined does not mean that they are actually implemented on your particular chip. #define PAPI_L1_TCM 0x80000006 /*Level 1 total cache misses*/ #define PAPI_L2_TCM 0x80000007 /*Level 2 total cache misses*/ |# (defconstant papi_l1_dcm #x80000000) ;data cache misses, level 1 (defconstant papi_l1_icm #x80000001) ;instruction cache misses (defconstant papi_l1_tcm #x80000006) ;total cache misses (defconstant papi_l2_tcm #x80000007) (defconstant papi_offset #x80000000) ;;Live events on my pentium 3 are supposedly these on my pentium 3. ; I don't believe them though : #| (for actual hex numbers, add papi_offset..) 0 l1 cache 1 l1 inst 6 l1 total miss 7 l2 total miss a shr request for shared cache line b cln c inv d itv 15 instr trans lookaside buffer miss 17 l1 load miss 18 l1 store miss 1b btac miss 29 hardware interrupts 2b conditional branches executed 2c conditional branches taken 2d not taken 2e mispredicted 2f corrected predicted 31 total instr executed 32 integer inst executed 34 fp executed 37 total branch inst executed 38 vector / simd inst executed 39 fp inst per sec 3a cycles process is stalled 3c total cycles 3d instr/sec 40 l1 d cache hit 41 l2 d cache hit 42 l1 d cache access 43 l2 d cache access 46 l2 d cache read 49 l2 d cache write 4b l1 i cache hits 4e l1 i cache accesses 4f l2 i cache accesses 51 l1 i cache reads 52 l2 i cache reads 54 l1 i cache writes 5a l1 total cache accesses 5b l2 total cache accesses 5e l2 total cache reads 61 l2 total cache writes |# (defparameter *last-time* 0) (defun start-ccm(); count L1 L2 total cache misses (let((ar2 (make-array 2 :element-type '(unsigned-byte 32) :initial-contents (vector papi_l1_tcm papi_l2_tcm))) ) (setf *last-time* (get-internal-run-time)) (papi_start_counters ar2 2) )) (defun read-ccm(printp) ; (papi_read_counters papi_values 2) (let* ((ans(bytes2ll papi_values)) (newtime (get-internal-run-time)) (diff (- newtime *last-time*)) ) (setf *last-time* newtime) (if printp (format t "~% L1 cache misses = ~e, L2 cache misses=~e, runtime=~s" (float (aref ans 0)) (float (aref ans 1)) diff) (list (aref ans 0) (aref ans 1) diff)))) ;;; HERE ARE SOME OTHER EXAMPLES... (defun start-ccmX(); count L1 data and instruction cache misses (let(( ar2 (make-array 2 :element-type '(unsigned-byte 32) :initial-contents (vector papi_l1_dcm papi_l1_icm)))) (papi_start_counters ar2 2) )) (defun start-ccmY(); count inst and L1 total cache access (let(( ar2 (make-array 2 :element-type '(unsigned-byte 32) :initial-contents (vector (+ papi_offset #x31 )(+ papi_offset #x5a ))))) (papi_start_counters ar2 2))) (defun read-ccmX(printp) ; (papi_read_counters papi_values 2) (let ((ans(bytes2ll papi_values))) (if printp (format t "~% L1 data misses= ~e, L1 inst misses=~e, total misses ~e" (float (aref ans 0)) (float (aref ans 1)) (float (+(aref ans 0)(aref ans 1))))))) (defun read-ccmY(printp) ; (papi_read_counters papi_values 2) (let ((ans(bytes2ll papi_values))) (if printp (format t "~% total instr = ~e, l1 cache accesses=~e" (float (aref ans 0)) (float (aref ans 1)))))) (defun sc()(start-ccm)) ;; shorthand (defun rc(p)(read-ccm p)) ;p is t if you want to read and PRINT results ;; typical usage ;; (sc) ;start counting. e.g. L1 and L2 cache misses ;; (progn ;; (rc nil) ;read and reset counters ;; (compute something) ;; (rc t) ;; print the results ;; ) ;;;;;;;;;;;;;;;;;;;;;;that's it;;;;;;;;;;;;;;;;;;; (defun mrl(n) ;;make random list (let ((ans nil)) (declare (fixnum n)) (dotimes (j n ans) (declare (fixnum j)) (push (random #.(truncate most-positive-fixnum 4)) ans)))) (defun lengthm(l m) ;compute length of list l m times (declare(optimize(speed 3)(safety 0)(debug 0)) (fixnum m)(list l)) (dotimes (j m) (declare (fixnum j)) (length l))) (defun mra(n) ;;make random array (let ((ans (make-array n :element-type 'fixnum))) (declare (fixnum n)) (dotimes (j n ans) (declare (fixnum j)) (setf (aref ans j) (random #.(truncate most-positive-fixnum 4)))))) (defun lengtha(a m) ;;compute touch each element in array a m times (declare (type (simple-array t (*)) a) (fixnum m) (optimize(speed 3)(safety 0)(debug 0))) (let ((r 0)) (dotimes (j m) (declare (fixnum j)) (dotimes (k (length a)) (declare (fixnum k)) (setf r (aref a k)))))) (defmacro timecache(m)`(time (progn (rc nil),m(rc t)))) (defmacro timecacheX(m)`(time (progn (read-ccmX nil),m(read-ccmX t)))) (defmacro timecacheY(m)`(time (progn (read-ccmY nil),m(read-ccmY t)))) ;; to see gc messages on allegro console.. (setf (sys:gsgc-switch :print) t) (setf (sys:gsgc-switch :verbose) t) (setf (sys:gsgc-switch :stats) t) #| Here's what we did: make a list R of 16000 random numbers. R is allocated off the lisp free list and is routinely composed of sequential locations in memory. note about cache miss data and times: the L1/L2/total and I/D cache miss data was taken on different runs and does not necessarily add up. The first timed test is to call (length R) 10000 times. This takes 0.701 seconds with L1 misses= 4.01e7, L2 misses= 3.36e4 of the L1 misses data 4.01e7 . inst 0.00168e7 i.e. 1.68e4 The second timed test: Take R and destructively sort it. This randomizes the sequence in locations in memory, since R was initialized to random values. Repeat the timed test of calling (length R) 10000 times. This takes 1.151 seconds with L1 misses= 14.43e7 L2 misses= 4.17e4. of the L1 misses data 14.48e7 . inst 0.0023e7 i.e. 2.3e4 That is, after randomization the elapsed time increased by 0.45 sec, 64% L1 cache misses, (almost all due to data) increased by 10.47e7, 261% L2 cache misses increased by 2.53e7 or 154%. In a pentium III we have 32-byte cache lines. There will be 4 cons cells. If they are sequentially accessed, we will get 3 cache hits then a miss. If they are randomly accessed, each reference will give us a cache miss. So there should be about 3 times more misses. 2.61 is measured. Next part of the test: Do a garbage collection; call (length R) 10000 times. This takes 0.711 seconds with L1 misses= 4.01e7 L2 misses= 3.85e4 of the L1 misses data 4.01e7 . inst 0.00076e7 i.e. 7.597e3 Conclusion: The lisp copying/generation scavenging garbage collector had reorganized the memory to regenerate locality in this test. Next test, compare lists to arrays. Allocate 16000 single-word numbers in an array. This array is actually half the size of a 16000 item list in memory, since the array does not need space for pointers. Touch them all in sequence one after another. (10000 times). For this test there will be 8 sequential elements rather than 4 in a cache line, and so the number of cache misses is predicted to be half of the earlier number, 4.01e7. This is very close to the number measured at 2.01e7. This takes 0.891 seconds with L1 data misses= 2.007e7 . inst misses= 0.0022 ie ..2.2e4 The cache behavior of the array is not affected by sorting. Moral of the story: Here are two ways to get less memory thrashing by two mechanisms: (a) Find a more compact representation that is accessed sequentially and doesn't "waste space" for pointers etc. (like a packed array). (b) Leaving data in lists and perform a copying garbage collection. In an ideal case the layout of the data in the copied list will mirror the data access pattern of your program, and there will be substantial improvement in locality. The second of these possibilities is not so well known, but in this simple benchmark it speeds the computation by 63%. As they say, your mileage may vary. other notes. (lengtha a16000 10000) compiled with declarations takes about .872 seconds (lengtha a16000 10000) compiled without declarations takes about 16.7 seconds, or 19.1 times slower (lengtha a16000 10000) uncompiled should take about 4,706 seconds. (I only timed 10 iterations) or 5,396 times slower. |# (defun t1(n);; n is length of list (let ((z (mrl n)) (count (truncate 1000000000 n))) ;; that's 1.0d+9 (start-ccm) (format t "~%statistics for length of list") (statlength z) (format t "~%just allocated time") (timecache (lengthm z count)) ;(setf z (sort z #'>)) (setf z (mergesort z)) (format t "~%statistics for length of list, scrambled") (statlength z) (format t "~%after scrambling time") (timecache (lengthm z count)) (gc) (format t "~%statistics for length of list, after GC") (statlength z) (format t "~%after GC time") (timecache (lengthm z count)))) (defun t2(n);; n is length of list (let ((z (mrl n)) (count (truncate 1000000000 n))) ;; that's 1.0d+9 (start-ccmY) (format t "~%just allocated time") (timecacheY (lengthm z count)) (setf z (sort z #'>)) (format t "~%after scrambling time") (timecacheY (lengthm z count)) (gc) (format t "~%after GC time") (timecacheY (lengthm z count)))) (defun t3(n);; n is length of list (let ((z (mrl n)) (count (truncate 1000000000 n))) ;; that's 1.0d+9 (start-ccmX) (format t "~%just allocated time") (timecacheX (lengthm z count)) (setf z (sort z #'>)) (format t "~%after scrambling time") (timecacheX (lengthm z count)) (gc) (format t "~%after GC time") (timecacheX (lengthm z count)))) #| data summary n fresh scrambled after GC time L1 l2 time L1 l2 time L1 l2 10 5.9 1.1e6 1.5e5 5.9 1.1 1.6 5.9 1.1 1.5 L1 cache misses about 1.5:1 = data:instr. instr misses = 2.6e5 20 4.7 8.6e5 1.7e5 4.7 9.8 1.9 4.7 9.7 1.8 50 3.8 7.9e5 1.5e5 3.8 7.9 1.6 3.8 9.8 2.4 50 3.8 8.0e5 1.6e5 3.8 7.1 1.4 3.9 8.1 1.6 ;; GC no help 100 3.6 6.5e5 1.3e5 3.6 9.2 2.0 3.6 9.3 2.0 ;; GC no help 200 3.5 6.3e5 9.6e4 3.5 7.3 11 3.4 6.4 9.9 ;; GC so/so 400 3.4 7.1e5 1.5e5 3.4 7.3 1.5 3.4 7.8 1.5 ;; GC no help 1000 4.1 8.6e5 1.1e5 3.6 7.0 0.9 5.8 41. 1.4 ;; GC worse? L1 cache misses about 2:1 = data:instr. instr misses = 1.4e5 1000 4.0 8.9e5 1.3e5 4.6 9.5 1.4 4.5 11. 1.6 ;; do over 2000 6.5 9.4e6 1.8e5 6.3 11 1.6 3.4 8.2 1.1 ;; vg 4 gc L1 cache misses about 20:1 to 33:1 = data:instr at n=2000, the data ceases to fit in L1 cache 2000 6.0 6.9e6 2.0e5 6.3 14 8.4 3.5 5.7 0.95 ;;vg 4 gc 4000 6.4 2.5e8 1.8e5 7.2 5.8 1.8 6.5 2.5 1.6 ;;g 4 gc 8000 4.5 2.5e8 1.5e5 6.8 8.0 1.9 4.5 2.5 1.3 ;;vg 4 gc L1 cache misses about 1000:1 to 3000:1 = data:instr instr misses 2.5e5 16000 4.5 2.5e8 2.2e5 7.3 9.1 3.1 4.5 2.5 2.3 ;;vvg 4 gc 20000 4.5 2.5e8 3.2e5 7.3 9.2 4.8 4.5 2.5 3.4 ;;vg 4 gc 30000 4.9 2.5e8 3.8e6 8.7 9.5 8.6 9.2 2.5 40. ;; bad??L2 30000 4.9 2.5e8 3.6e6 8.7 9.5 8.5 13.5 2.5 77. ;; bad??L2 32000 9.8 2.5e8 4.6e7 12.1 9.5 3.4 9.8 2.5 4.5 ;; g ??L2 L1 cache misses about 700:1 to 1900:1 = data:instr instr misses 3.6e5 to 4.9e5 at n=32768, the data ceases to fit in L2 cache 35000 23.2 2.5e8 1.5e8 22.5 9.6 1.1 18.4 2.5 1.2 ;;g 4 gc 40000 33.8 2.6e8 2.5e8 27.4 3.1 1.9 31.5 2.6 2.3 ;; bad??L2 50000 33.9 2.6e8 2.5e8 33.1 5.1 2.1 34.0 2.6 2.6 ;;?? l2 Program t1 will run always run the length function over 10^9 items. For example, 10^6 times on a list of size 10^3= 10^9 accesses. If each access took the cache system by surprise there would be 10^9 L1 and L2 cache misses. For 25,000 runs of 40,000 elements, we got to about 1/4 of that rate. Note that at 32,768 8-byte cells the L2 cache capacity is reached and the number of cache misses increases by a factor of 70, and the execution time by a factor of 7. Program t3 counts only L1 cache misses, but separates the data into instruction and data accesses. The correlation between numbers from program t1 and t3 are somewhat disappointing. |# (defun mylength2(a n) (declare (list a)(fixnum n)(optimize (speed 3)(safety 0)(debug 0))) (cond ((null a) n)(t (mylength2 (cdr a)(1+ n))))) (defun t4(n);; n is length of list (let ((z (mrl n)) (count (truncate 1000000000 n))) ;; that's 1.0d+9 (start-ccmX) (format t "~%just allocated time") (timecacheX (lengthm2 z count)) (setf z (sort z #'>)) (format t "~%after scrambling time") (timecacheX (lengthm2 z count)) (gc) (format t "~%after GC time") (timecacheX (lengthm2 z count)))) (defun lengthm2(l m) ;compute length of list l m times (declare(optimize(speed 3)(safety 0)(debug 0)) (fixnum m)(list l)) (dotimes (j m) (declare (fixnum j)) (mylength2 l 0))) (defmacro timecacheW(m)`(progn (read-ccm nil),m(read-ccm t))) (defparameter *timedata* nil) (defun t5(n);; n is length of list (let ((z (mrl n)) (count (truncate 1000000000 n))) ;; that's 10^9/n (setf z (sort z #'>)) (format t "~%after scrambling time") (push `(,n s ,(timecacheW (lengthm2 z count))) *timedata*) (gc) (format t "~%after GC time") (push `(,n g ,(timecacheW (lengthm2 z count))) *timedata*) )) ;;;time data from t5 '((200000 g (302214143 277477395 40849)) (200000 s (301298288 277335463 40729)) (200000 j (254375212 253184531 34089)) (100000 g (254070731 252939805 34129)) (100000 s (355716615 276144704 40468)) (100000 j (255050908 253102448 34129)) ; (100000 g (253984042 252904546 34159)) ; (100000 s (453657710 320011316 47128)) ; (100000 j (254137435 252903523 34109)) (80000 g (254862100 253049921 34209)) (80000 s (311975311 253843240 35811)) (80000 j (254101522 252764418 34089)) ; ( 80000 g (254073693 252938896 34139)) ; (80000 s (457181585 291849093 44063)) ; (80000 j (254162233 252782112 34139)) (50000 g (253339823 252228422 33919)) (50000 s (517831098 200812961 31996)) (50000 j (253415851 251947113 33949)) (40000 g (254704833 252248962 34139)) (40000 s (256857603 251438083 35551)) (40000 j (254656325 252416353 34129)) ; (40000 g (253826963 251950180 33889)) ; (40000 s (348816828 164564752 24034)) ; (40000 j (253380822 251502547 33919)) (35000 g (252426063 124851307 19098)) (35000 s (963043955 109639398 22432)) (35000 j (252906818 169818240 24375)) (30000 g (251560818 38514280 8903)) (30000 s (951928394 32471367 11717)) (30000 j (251701135 45467535 9754)) (20000 g (251098282 331958 4396)) (20000 s (922729575 603923 7401)) (20000 j (251164191 311352 4426)) (10000 g (251744501 203497 4397)); (10000 s (843541842 254946 7000)); (10000 j (251574116 101990 4396)); (5000 g (252586666 85126 4396)) (5000 s (681926449 112443 6280)) (5000 j (252574524 80024 4386)) (2000 g (23417658 68930 3375)) (2000 s (13975002 62048 3334)) (2000 j (23421759 64154 3395)) (1000 g (407684 66331 3325)) (1000 s (398824 62925 3335)) (1000 j (383136 55981 3315)) (500 g (383813 80679 3385)) (500 s (326318 59157 3384)) (500 j (306504 49966 3375)) (200 g (335370 66803 3535)) (200 s (330773 66684 3535)) (200 j (328054 64491 3535)) ;(200 g (1051307 164993 3646)) ;(200 s (365089 86998 3525)) ;(200 j (302947 57540 3525)) (100 g (318497 60956 3775)) (100 s (315211 61424 3786)) (100 j (317666 63384 3775)) (50 g (353506 63919 4296)) (50 s (348598 64630 4316)) (50 j (337807 61081 4297)) (20 g (489692 97322 5828)) (20 s (447945 83095 5748)) (20 j (506673 103447 5749)) (10 g (631608 113339 8181)) (10 s (654784 119475 8312)) (10 j (728567 141596 8292))) ;;;;;;;;;;more thoughts. #| If the lisp GC puts things in memory in the order in which I think it does, namely, copy a cons cell X into relative location 0, then (CDR X)into relative location 1 (actually 1*(8 bytes)), (recursively) and then (CAR X), that means traversing a tree in breadth-first order would be faster than depth first, if the tree gets big enough. |# (eval-when (compile)(proclaim '(optimize (speed 3)(safety 0)(debug 0)))) (defun visit1(x) ; visit car-cdr (slower?) (cond(x(visit1 (car x))(visit1(cdr x))))) (defun visit2(x) ; visit cdr-car (faster?) (cond(x(visit2 (cdr x))(visit2(car x))))) (defun mt(n) ; make-tree (cond((= n 0) nil) (t (let ((r (mt(1- n)))) (cons r (copy-tree r)))))) (defun mt4(n) ; make-tree alternative (cond((= n 0) nil) (t (let ((r (mt(1- n)))) (list r (copy-tree r) (copy-tree r)(copy-tree r)))))) (defun mtl(n) ; make-tree (cond((= n 0) nil) (t (let ((r (mt(1- n)))) (list r (copy-tree r) (copy-tree r)(copy-tree r)))))) (defun tx1(n tree) (time (dotimes (i (expt 2 n))(declare (fixnum i)) (visit1 tree)))) (defun tx2(n tree) (time (dotimes (i (expt 2 n)) (declare (fixnum i))(visit2 tree)))) ;(setf t15 (mt4 16))(tx1 10 t16)(tx2 10 t16) ;; 10+16=constant (defun visit3(x) ; visit cdr-car (faster?) (cond(x(visit2 (cdr x))(visit2(car x))))) (defun visit4(x)(bfs (list x) x)) ;; breadth first search.. (defun bfs (queue x) (if (null queue) nil (let ((path (car queue))) (let ((node (car path))) (if (null x)nil (bfs (append (cdr queue) (new-paths path node x)) x)))))) (defun whereis(x &optional (rel 0)) ;; where is x in memory (relative to rel, if present) (- (excl::pointer-to-address x) (excl::pointer-to-address rel))) (defun avlength(z) (let ((sum 0) (len (length z)) (r z)) (declare (fixnum len r)) (mapl #'(lambda(x) (incf sum(abs(whereis x r))) (print (abs (whereis x r))) (setf r x)) z ) (float (/ sum (1- len))))) (defun statlength(z) ;; look at the lengths of links in a list z ;; keep track in bins. (let ((bins '(8 16 24 32 64 #.(expt 2 10) #.(expt 2 20))) (counts (list 0 0 0 0 0 0 0 0)) (r 0) (len 0)) (declare (fixnum len r)) (mapl #'(lambda(x) (statcount(abs(whereis x r)) counts bins) (setf r x)) z ) (format t "~%<=~1,8@t~{~s~1,8@t~}~% ~%count~1,8@t~{~s~1,8@t~}~% " bins counts) (values))) (defun statcount(num clist bins) ;; num is pos integer ;; clist is at least one longer than bins (cond((null bins)(incf (car clist))) ((<= num (car bins))(incf (car clist))) (t (statcount num (cdr clist)(cdr bins)))) clist) ;; like statlength but look at car and cdr links both (defparameter bins nil) (defparameter counts nil) (defun sttl(z) ;stattreelength ;; look at the lengths of links in a list z ;; keep track in bins. (let ((bins '(8 16 24 32 64 #.(expt 2 10) #.(expt 2 20))) (counts (list 0 0 0 0 0 0 0 0))) (declare (special bins counts)) (cond((atom z)nil) (t(sttlx z))) (format t "~%<=~1,8@t~{~s~1,8@t~}~% ~%count~1,8@t~{~s~1,8@t~}~% " bins counts) (values))) (defun sttlx (z) (cond ((not (atom z)) (sttlx1 z (cdr z)) (sttlx1 z (car z))))) (defun sttlx1(here next) (cond ((not (atom next)) ;; don't count pointers to nil or atm (statcount(abs(whereis next here)) counts bins) (sttlx next)))) ;;; an in-place mergesort ;;; split at least 2 (defun split (x) ;; break the list x into two parts, odd and even (cond ((null x)(values nil nil)) ((null (cdr x))(values x nil)) (t (let((a x) (b (cdr x)) (c (cddr x))) (setf (cdr a) nil) (setf (cdr b) nil) (split1 c a b))))) (defun split1(x y z) (cond((null x)) ((null (cdr x))(setf (cdr x)(cdr y)) (setf (cdr y) x)) (t (let((a x) (b (cdr x)) (c (cddr x))) (setf (cdr a) (cdr y)) (setf (cdr y)a) (setf (cdr b) (cdr z)) (setf (cdr z)b) (split1 c a b)))) (values y z)) (defun mymerge(x y);; faster version of merging, destructively (cf. CL version) (cond ((null x) y) ((null y) x) ((>= (car x)(car y)) (setf (cdr x)(mymerge (cdr x) y)) x) (t (mymerge y x)))) (defun mergesort(r) (cond ((null r) nil) ((null (cdr r)) r) (t(multiple-value-bind (x y) (split r) ;;(merge 'list (mergesort x)(mergesort y) #'<) (mymerge (mergesort x)(mergesort y)) )))) ;;;;;;;;;; ;;; slow program #| (;; take a list of numbers and put them in order by manipulating ;; the list cells. Important not to provoke a GC. (defun sortone(k) (declare(optimize(speed 3)(safety 0)(debug 0))) ;; k is a list. take (cadr k) and put it in order in (cddr k) ;; destructively. Don't mess with (car k); use it for a handle. (cond ((null (cddr k)));; e.g. '(? 10 ) can't be sorted here. ;; now we have '(? 10 30) or '(? 30 10) ((< (the fixnum (cadr k))(the fixnum (caddr k)))); '(? 10 30) ; sorted, do nothing (t ; '(? 30 10 z ) (let ((a (cdr k))) ; a is (30 10 z ) (setf (cdr k)(cddr k)) ; k is (? 10 z) (setf (cdr a)(cddr k)) ; a is (30 z) (setf (cddr k) a)) ; k is (? 10 30 z) (sortone (cdr k)))) k) (defun mysort(x) (cdr (sortall (cons 'header x)))) ;; slow. doing this wrong?.. (defun sortall (x) (cond ((null (cdr x)) x) (t(sortall (cdr x)) (sortone x)))) ) |# (defun mrb(n) ;;make random list (let ((ans nil)) (dotimes (j n ans) (setf ans (cons ans 'x)))))