|
|
1.1 root 1: ;Last Modified: 16-APR-1992 09:06:30.46
2: .title fprims Fast Multiple Precision Primitives
3: .ident /V1.7B/
4: ;+
5: ; **-FPRIMS-Fast Multiple Precision Primitives
6: ;
7: ; Facility: PGP
8: ;
9: ; Language: Macro-32
10: ;
11: ; Functional Description:
12: ;
13: ; This module contains fast multiple precision routines for operating on arrays
14: ; of long words. Error checking is minimised at the expense of speed.
15: ;
16: ; Restrictions:
17: ;
18: ; This code is shareable but NOT reentrant as written because of static data.
19: ; A reentrant version of this module could be written but it would be slower!
20: ;
21: ; Version: 1
22: ;
23: ; Original: 00A Date: 17-Sep-1991 Author: Hugh A.J. Kennedy
24: ;
25: ; Based on FPRIMS.ASM written by Zhahai Stewart for the Intel 8086
26: ; architecture.
27: ;
28: ; Modification: 02A Date: 27-Sep-1991 Author: Hugh A.J. Kennedy.
29: ;
30: ; Add fast multiply routine, P_SMUL.
31: ; Re-organise code slightly.
32: ; Ammend/clarify copyright and license statement.
33: ; Add checking for maximum precision exceeded, display a warning message
34: ; and bomb!
35: ;
36: ; Modification: 03A Date: 16-Mar-1992 Author: Hugh A.J. Kennedy.
37: ;
38: ; Sniff for MSB in P_SMUL. In this way, avoid multiplies by leading zeroes
39: ; (not efficient).
40: ;
41: ; Modification: 05A Date: 17-Mar-1992 Author: Hugh A.J. Kennedy.
42: ;
43: ; Encode entire double precision multiply in VAX assembler.
44: ; Correct some minor problems with handling embedded zeroes.
45: ;
46: ; Modification: 06A Date: 17-Mar-1992 Author: Hugh A.J. Kennedy
47: ;
48: ; Align everything for speed. VAXen like stuff on 64-bit, or at least 32-bit
49: ; boundaries. Therefore, we align the add, subtract and rotate tables and then
50: ; we align the multiply loops. The extra NOPs used to pad these loops are of
51: ; negligable cost because they already exist in the memory buffer. When the
52: ; following instruction is aligned, it executes MUCH faster.
53: ;
54: ; Modification: 07A Date: 24-Mar-1991 Author: Hugh A.J. Kennedy.
55: ;
56: ; Implement fast compare.
57: ;-
58:
59: .sbttl Copyright Notice And License To Use
60: ;
61: ; Copyright (c) 1991-1992, All Rights Reserved by
62: ; Hugh A.J. Kennedy.
63: ;
64: ; A license to use and adapt this software without payment is hereby
65: ; granted subject to the following conditions:
66: ;
67: ; 1) It may only be copied with the inclusion of this copyright
68: ; notice in the program source with these associated conditions.
69: ;
70: ; 2) No title to or ownership of this software is hereby
71: ; transferred.
72: ;
73: ; 3) The information in this software is subject to change
74: ; without notice and should not be construed as a commitment by
75: ; Hugh Kennedy.
76: ;
77: ; 4) The author assumes no liability for any damages arising from the
78: ; use of this software, even if said damages arises from defects in
79: ; this software.
80: ;
81: ; 5) No warranty as to merchantability or fitness of purpose is
82: ; expressed or implied.
83: ;
84: ; 6) Any modifications to this source must be clearly identified as
85: ; such and added to the modification history.
86: ;
87: ; 7) These routines may not be incorporated in a commercial cryptographic
88: ; product.
89: ;
90: ; If you can not comply with these conditions, you *must* contact the author
91: ; and obtain permission other wise you are in violation of copyright.
92:
93: .sbttl Misc Macros & Definitions
94: ;
95: ; Assembly Parameters
96: ;
97: max_unit_prec = 72 ; Maximum unit precision
98: supersniffer = 1 ; Enable bit msb locator.
99: ;
100: ; The following parameter is dependent on the kind of VAX you are running on
101: ; and should be defined if the execution time of the SOBGTR loop control
102: ; instruction and the appropriate operation (ADWC or SBWC) from cache is much
103: ; less than the execution time in main memory. If you have a slow VAX you
104: ; should comment the following line out to use a vector of instructions.
105: ;
106: novector = 1 ; Use loops rather than vectors.
107:
108: .macro ascid .string
109: ;+
110: ; *-ASCID-Build An ASCII String Referenced By Descriptor
111: ;
112: ; Functional Description:
113: ;
114: ; This macro is a little like the system supplied .ASCID directive
115: ; but it uses a separate program section to store the ASCII data.
116: ;
117: ; Arguments:
118: ;
119: ; STRING String to create
120: ;-
121: .nocross
122:
123: .save_psect
124:
125: .psect puret
126:
127: $$$t0 = .
128: .ascii @.string@
129: $$$t1 = .-$$$t0
130:
131: .restore_psect
132:
133: .word $$$t1
134: .byte dsc$k_dtype_t
135: .byte dsc$k_class_s
136: .address -
137: $$$t0
138: .cross
139:
140: .endm ascid
141:
142: .sbttl Misc Data Areas
143: ;
144: ; Misc. Data Areas
145: ;
146: .psect impurd,con,lcl,noshr,exe,rd,wrt,long
147:
148: ;
149: ; This data is static and is used to hold the current precision established
150: ; by P_SETP for other calls to this library.
151: ;
152: .if not_defined novector
153:
154: addoff: ; Offset into add table.
155: .blkl 1 ; also for sub and rot.
156: .endc
157:
158: precis: ; Precision in longwords.
159: .blkl 1
160:
161: .psect pure,con,rel,shr,exe,rd,nowrt,quad
162:
163: .align quad
164:
165: .ifndef novector
166:
167: prectoobig:
168: ascid <PGP (FPRIMS) - Requested precision (!ZL) exceeds capacity (!ZL)>
169:
170: .endc
171:
172: .sbttl Start of Code
173:
174: .sbttl P_CMP Compare two very long integers
175: ;+
176: ; **-P_COMP-Compare two very long integers
177: ;
178: ; Functional Description:
179: ;
180: ; This procedure is invoked to compare two extended precision unsigned
181: ; integers.
182: ;
183: ; Calling Sequence
184: ;
185: ; short P_CMP ( r1, r2)
186: ;
187: ; Parameters:
188: ;
189: ; R1 -> Extended Precision Integer 1
190: ; R2 -> Extended Precision Integer 2
191: ;
192: ; Implicit Inputs:
193: ;
194: ; PRECIS lr*r Precision expresses in longs.
195: ;
196: ; Returns:
197: ;
198: ; -1 if r1 < r2
199: ; 0 if r1 = r2
200: ; +1 if r1 > r2
201: ;
202:
203: ;-
204:
205: .align long
206:
207: .entry p_cmp,^m<r2>
208:
209: movl 4(ap),r1 ; R1 -> Sum.
210: movl 8(ap),r2 ; R2 -> Addend.
211: movl precis,r0 ; R0 = Precision.
212: moval (r1)[r0],r1 ; Get MS longwords.
213: moval (r2)[r0],r2 ; Get MS longwords.
214: .align long 1 ; Align loop with NOPS.
215: 10$: cmpl -(r1),-(r2) ; Compare.
216: bnequ 20$ ; If ne, then exit loop.
217: sobgtr r0,10$ ; Loop until done.
218: ret ; R0 = zero so R1 = R2.
219: 20$:
220: bgtru 30$ ; If R1 > R2 then branch.
221: movw #-1,r0 ; Flag <.
222: ret
223: 30$:
224: movw #1,r0 ; Flag >.
225: ret
226:
227: .sbttl P_ADDC Add two very long integers with carry
228: ;+
229: ; **-P_ADDC-Add very long integers
230: ;
231: ; Functional Description:
232: ;
233: ; This procedure is invoked to add two very long integers with carry. Each
234: ; integer is represented as an array of longwords, least significant first.
235: ;
236: ; Calling Sequence:
237: ;
238: ; P_ADDC sum,addend,carry
239: ;
240: ; Parameters:
241: ;
242: ; sum lm*r Sum.
243: ; addend lr*r Addend.
244: ; carry lr*v Carry bit.
245: ;
246: ; Implicit Inputs:
247: ;
248: ; Addoff This is used as an offset into the various tables
249: ; of adds, subtracts and rotates to implement the
250: ; operation to the requested precsion.
251: ;
252: ; Status Returns:
253: ;
254: ; R0 Resulting carry bit.
255: ;-
256:
257: .align long
258:
259: .entry p_addc,^m<r2,r3>
260:
261: movl 4(ap),r1 ; R1 -> Sum.
262: movl 8(ap),r2 ; R2 -> Addend.
263:
264: .if defined novector
265:
266: movl precis,r3 ; R3 = Precision.
267: subl3 12(ap),#0,r0 ; Set carry bit.
268: .align quad,1 ; Align loop with NOPs
269: 10$: adwc (r2)+,(r1)+ ; Add with carry one longword.
270: .align quad,1 ; Align next instruction.
271: sobgtr r3,10$ ; Loop until done.
272:
273: .iff ; novector
274:
275: moval 10$,r3
276: addl2 addoff,r3 ; Jump into table.
277: subl3 12(ap),#0,r0 ; Set carry bit.
278: jmp (r3)
279:
280: .align quad
281:
282: 10$:
283: .rept max_unit_prec
284: $$$ = .
285: adwc (r2)+,(r1)+ ; Add with carry one longword.
286: nop
287: addsiz = .-$$$
288: .endr
289:
290: .endc ; novector
291:
292: clrl r0 ; Assume carry clear.
293: bcc 20$ ; Carry set?
294: incl r0 ; Flag carry was set.
295: 20$: ret
296:
297: .sbttl P_SUBB Subtract very long integers with borrow
298: ;+
299: ; **-P_SUBB-Subtract very long integers
300: ;
301: ; Functional Description:
302: ;
303: ; This procedure is invoked to add subtract very long integers with carry. Each
304: ; integer is represented as an array of longwords, least significant first.
305: ;
306: ; Calling Sequence:
307: ;
308: ; P_SUBB diff,sub,borrow
309: ;
310: ; Parameters:
311: ;
312: ; diff lm*r Difference
313: ; sub lr*r Subtrahend.
314: ; borrow lr*v Borrow bit.
315: ;
316: ; Implicit Inputs:
317: ;
318: ; Addoff This is used as an offset into the various tables
319: ; of adds, subtracts and rotates to implement the
320: ; operation to the requested precsion.
321: ;
322: ; Status Returns:
323: ;
324: ; R0 Resulting carry bit.
325: ;-
326:
327: .align long
328:
329: .entry p_subb,^m<r2,r3>
330:
331: movl 4(ap),r1 ; R1 -> Difference.
332: movl 8(ap),r2 ; R2 -> Minuend.
333:
334: .if defined novector
335:
336: movl precis,r3 ; R3 = No. of longs.
337: subl3 12(ap),#0,r0 ; Set borrow bit.
338: .align quad,1 ; Align loop with NOPs.
339: 10$: sbwc (r2)+,(r1)+ ; Subtract with borrow one long.
340: .align quad,1 ; Align with NOPs.
341: sobgtr r3,10$ ; Loop through.
342:
343: .iff ; novector
344:
345: moval 10$,r3
346: addl2 addoff,r3 ; Jump into table.
347: subl3 12(ap),#0,r0 ; Set borrow bit.
348: jmp (r3)
349:
350: .align quad
351: 10$:
352: .rept max_unit_prec
353: sbwc (r2)+,(r1)+ ; Subtract w/carry one longword.
354: nop
355: .endr
356:
357: .endc ; novector
358:
359: clrl r0 ; Assume carry clear.
360: bcc 20$ ; Carry set?
361: incl r0 ; Flag carry was set.
362: 20$: ret
363:
364: .sbttl P_ROTL Rotate left a very long integer with carry.
365: ;+
366: ; **-P_ROTL-Rotate left one bit very long integers
367: ;
368: ; Functional Description:
369: ;
370: ; This procedure is invoked to rotate left one bit (e.g. divide by 2) very
371: ; long integers with carry. Each integer is represented as an array of
372: ; longwords, least significant first. Note that we use the add with carry
373: ; instruction here because the VAX (unlike the dear old PDP-11) lacks a
374: ; rotate instruction that includes the carry bit.
375: ;
376: ; Calling Sequence:
377: ;
378: ; P_ROTL num,carry
379: ;
380: ; Parameters:
381: ;
382: ; num lm*r Number to be shifted
383: ; carry lr*v Carry bit.
384: ;
385: ; Implicit Inputs:
386: ;
387: ; Addoff This is used as an offset into the various tables
388: ; of adds, subtracts and rotates to implement the
389: ; operation to the requested precsion.
390: ;
391: ; Status Returns:
392: ;
393: ; R0 Resulting carry bit.
394: ;-
395:
396: .align long
397:
398: .entry p_rotl,^m<r3>
399:
400: movl 4(ap),r1 ; R1 -> Sum.
401:
402: .if defined novector
403:
404: movl precis,r3 ; R3 = No. of longwords.
405: subl3 8(ap),#0,r0 ; Set carry bit.
406: .align quad,1 ; Align loop with NOPs
407: 10$: adwc (r1),(r1)+ ; Add to itself with carry.
408: .align quad,1 ; Align with NOPs.
409: sobgtr r3,10$ ; Loop until done.
410:
411: .iff ; novector
412:
413: moval 10$,r3
414: addl2 addoff,r3 ; Jump into table.
415: subl3 8(ap),#0,r0 ; Set carry bit.
416: jmp (r3)
417:
418: .align quad
419: 10$:
420: .rept max_unit_prec
421: adwc (r1),(r1)+ ; *2+carry.
422: nop
423: .endr
424:
425: .endc ; novector
426: clrl r0 ; Assume carry clear.
427: bcc 20$ ; Carry set?
428: incl r0 ; Flag carry was set.
429: 20$: ret
430:
431: .sbttl P_DMUL Extended Multiple Precision Multiply
432: ;*
433: ; **-P_DMUL-Extended Multiple Precision Multiply
434: ;
435: ; Functional Description:
436: ;
437: ; This procedure multiplies an unsigned single precision multiplier by a
438: ; single precision multiplicand. The product register is double precision.
439: ; It is expected that the length of the single precision multiplier and
440: ; multiplicand has been previously set by a call to P_SETP. Note that the
441: ; entire length of the product register is zeroed - so it must be a full
442: ; double precision size.
443: ;
444: ; Calling Sequence:
445: ;
446: ; P_DMUL prod, multiplicand, multiplier
447: ;
448: ; Parameters:
449: ;
450: ; prod lw*r Product.
451: ; multuplicand lr*r Multiplicand
452: ; multiplier lr*r Multiplier
453: ;
454: ; Implicit Inputs:
455: ;
456: ; PRECIS lr*r Precision expresses in longs.
457: ;
458: ; Status Returns:
459: ;
460: ; None.
461: ;-
462:
463: .align long
464:
465: .entry p_dmul,^m<r2,r3,r4,r5,r6,r7,r8,r9,r10,r11>
466:
467: movl 4(ap),r8 ; R8 -> Product.
468: beql 49$ ; If eq, not specified.
469: movl precis,r10 ; R10 = Precision (longs)
470: ashl #3,r10,r2 ; R0 = No. of bytes to zero.
471: movc5 #0,#0,#0,r2,(r8) ; Zero product buffer.
472: movl 8(ap),r3 ; R3 -> Multiplicand.
473: beql 49$ ; If eq, not specified.
474: pushl r3 ; Save for posterity.
475: movl 12(ap),r11 ; R11 -> Multiplier.
476: beql 49$ ; If eq, not specified.
477: movl r10,r12 ; R12 = Multiplicand prec.
478:
479: .if defined SUPERSNIFFER
480:
481: ;
482: ; Here we calculate the effective maximum precision for the multiply by
483: ; locating the long containing the most significant bit of the multiplier
484: ; and the multiplicand.
485: ;
486: moval (r11)[r10],r0 ; Supersniffer...
487: .align quad,1 ; Align with nops
488: 45$: tstl -(r0) ; Examine next long.
489: bneq 50$ ; If ne, then we found msb.
490: sobgtr r10,45$ ; Loop until done.
491: 49$: ret ; Multiplier = 0!
492: 50$:
493: moval (r3)[r12],r0 ; Supersniffer...
494: .align quad,1 ; Align with nops
495: 55$: tstl -(r0) ; Examine next long.
496: bneq 200$ ; If ne, then we found msb.
497: sobgtr r12,55$ ; Loop until done.
498: ret ; Multiplicand = 0!
499: .iff
500:
501: brb 200$
502: 49$:
503: ret
504:
505: .endc ; SUPERSNIFFER
506:
507: ;
508: ; Multiplier Loop
509: ;
510: ; R12 = Count of multiplicand longs to process.
511: ; R11 -> Next long of multiplier.
512: ; R10 = Count of multiplier longs to process.
513: ; R8 -> Next long of product.
514: ;
515: .align quad,1 ; Align with nops
516: 200$: movl r12,r5 ; Multiplicand precision.
517: moval (r8)+,r4 ; R4 -> Next long of product.
518: movl (sp),r3 ; R3 -> 1st multiplier long.
519: movl (r4),r0 ; R0,R1 = Partial Sum.
520: movl 4(r4),r1
521: clrl r7 ; Zero look-ahead carry.
522: ;
523: ; Perform an extended multiply of two unsigned numbers. This means that
524: ; we have to compensate the hi-order product because either the multiplier
525: ; or the multiplicand may be apparently a negative number. EMUL is a signed
526: ; multiply - so we must be careful. Also, the EMUL longword addend is sign
527: ; extended before adding into the product so we have to add the hard way.
528: ;
529: ; R6 = Current Multiplicand
530: ; R2 = Multiplier
531: ; R4 -> Current quadword of partial product.
532: ; R0,R1 = Partial sum to which product is added
533: ; R7 = Lookahead carry. This gets set if we try to carry after adding
534: ; the partial product to the partial sum. This gets a little more
535: ; complicated because here we are setting the high-order long of
536: ; the next quadword to be operated on.
537: ;
538: ; Essentially the algorithm is as follows:
539: ;
540: ; 0) R0,R1 = (R4) ; Save current partial sum.
541: ; 1) R6 = Next longword of multiplicand.
542: ; 2) (R4) = R6 * R2 ; quad result compensating for negative numbers)
543: ; 3) (R4) = (R4) + R0,R1 ; add back partial sum.
544: ; 4) R7 = Carry bit.
545: ; 5) R4 = R4 + 4 ; Point to next long.
546: ; 6) R1 = 4(R4) + R7 ; Propagate carry to high order of next partial
547: ; ; sum.
548: ; 7) Loop back to step 1 until multiplicand completely processed.
549: ;
550: movl (r11)+,r2 ; R2 = Multiplier.
551: beql 999$ ; If eq, not specified.
552: blss 1500$ ; This unfolds the compensation
553: ; test out of the loop.
554: ;
555: ; This version of the multiply loop is entered when the multiplier is positive
556: ; saving three instructions per unit of precision.
557: ;
558: .align quad,1 ; Align with NOPs.
559: 500$: movl (r3)+,r6 ; R6 = Current multplicand.
560: emul r2,r6,#0,(r4) ; Multiply (64-bit result).
561: ;
562: ; Because we have removed leading zeroes, multiplication by zero is very
563: ; unlikely, 1 in 2^32 or so. It is therefore easier to perform the test after
564: ; the EMUL (looking at the zero product) that the multiplicand was zero so we
565: ; don't need any special case logic later to adjust the product pointer.
566: ;
567: beql 550$ ; If result eq, skip.
568: tstl r6 ; Was multiplicand negative?
569: bgeq 550$ ; No, skip.
570: addl2 r2,4(r4) ; Yes, compensate.
571: 550$: addl2 r0,(r4)+ ; Accumulate.
572: adwc r1,(r4)+
573: movl (r4),r1 ; R1 = Next hi-end partial sum.
574: adwc r7,r1 ; Add carry if needed.
575: clrl r7 ; Reset lookahead register.
576: adwc #0,r7 ; Save lookahead carry.
577: movl -(r4),r0 ; R0 = Next lo-end partial.
578: sobgtr r5,500$ ; More units?
579: 999$: sobgtr r10,200$ ; Nope, go get next multiplier
580: ret
581: ;
582: ; This version of the above multiply loop is entered when the multiplier is
583: ; negative - and we must compensate by adding the multiplicand to the hi-order
584: ; product. This saves a test and a conditional branch per unit of precision.
585: ;
586: .align quad,1 ; Align with NOPs.
587: 1500$:
588: movl (r3)+,r6 ; R6 = Current multplicand.
589: emul r2,r6,#0,(r4) ; Multiply (64-bit result).
590: ;
591: ; Because we have removed leading zeroes, multiplication by zero is very
592: ; unlikely, 1 in 2^32 or so. It is therefore easier to perform the test after
593: ; the EMUL (looking at the zero product) that the multiplicand was zero so we
594: ; don't need any special case logic later to adjust the product pointer.
595: ;
596: beql 1560$ ; If result eq, skip.
597: tstl r6 ; Was multiplicand negative?
598: bgeq 1550$ ; No, skip.
599: addl2 r2,4(r4) ; Yes, compensate.
600: 1550$:
601: ; As documented above, we unfolded the following to save instructions
602: ; tstl r2 ; Multiplier negative?
603: ; bgeq 1560$ ; No, skip.
604: addl2 r6,4(r4) ; Yes, compensate.
605: 1560$: addl2 r0,(r4)+ ; Accumulate.
606: adwc r1,(r4)+ ; R1 = High-end partial sum.
607: movl (r4),r1 ; R1 = Next hi-end partial sum.
608: adwc r7,r1 ; Add carry if needed.
609: clrl r7 ; Reset lookahead register.
610: adwc #0,r7 ; Save lookahead carry.
611: movl -(r4),r0 ; R0 = Next lo-end partial.
612: sobgtr r5,1500$ ; More units?
613: sobgtr r10,200$ ; Nope, go get next multiplier
614: ret
615:
616: .sbttl P_SETP Set Precison.
617: ;+
618: ; **-P_SETP-Set Precision
619: ;
620: ; Functional Description:
621: ;
622: ; This procedure is invoked to set the operating precision of the package.
623: ;
624: ; Calling Sequence:
625: ;
626: ; P_SETP nbits
627: ;
628: ; Parameters:
629: ;
630: ; nbits rw*v Number of bits in number.
631: ;
632: ; Implicit Outputs:
633: ;
634: ; Precis Set to the number of longwords required to implement
635: ; the requested precision.
636: ; Addoff This is used as an offset into the various tables
637: ; of adds, subtracts and rotates to implement the
638: ; operation to the requested precsion.
639: ;
640: ; Status Returns:
641: ;
642: ; None.
643: ;
644: ; Side Effects:
645: ;
646: ; If the maximum precision set in 32-bit units by the assembly
647: ; parameter "max_unit_prec" is exceeded, a message to that effect will
648: ; be displayed and the program will terminate with a fatal error.
649: ;-
650:
651: .entry p_setp,^m<>
652:
653: movzwl 4(ap),r1 ; R1 = No. of bits.
654: addl2 #31,r1 ; Round up to next long word.
655: ashl #-5,r1,r1 ; R1 = No. of 32 bit words.
656: movl r1,precis ; Save precision.
657:
658: .if not_defined novector
659:
660: subl3 r1,#max_unit_prec,r0 ; R0 = Number of steps reqd.
661: blss 10$ ; If > 0 then exit.
662: mull3 #addsiz,r0,addoff ; Get add table offset.
663:
664: .iftf ; novector
665:
666: ret
667:
668: .ift ; novector
669:
670: 10$: ; Table size exceeded!
671: movab -80(sp),sp ; Output buffer.
672: pushab (sp) ; Build descriptor
673: movzwl #80,-(sp)
674: clrl -(sp) ; Receive return length.
675: pushl #max_unit_prec ; Compiled max table size.
676: pushl r1 ; Requested table size.
677: pushaq 8+4(sp) ; -> Output buffer descriptor.
678: pushaw 12(sp) ; -> Returned length.
679: pushaq prectoobig ; -> FAO control string.
680: calls #5,g^sys$fao ; Format output string.
681: movl (sp)+,(sp) ; Set actual buffer size.
682: pushaq (sp) ; -> Output buffer descr.
683: calls #1,g^lib$put_output ; Output message.
684: $exit_s - ; Exit with severe error.
685: code=#4
686:
687: .endc ; novector
688:
689: .end
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.