|
|
1.1 root 1: /*
2: * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
3: *
4: * @APPLE_LICENSE_HEADER_START@
5: *
6: * The contents of this file constitute Original Code as defined in and
7: * are subject to the Apple Public Source License Version 1.1 (the
8: * "License"). You may not use this file except in compliance with the
9: * License. Please obtain a copy of the License at
10: * http://www.apple.com/publicsource and read it before using this file.
11: *
12: * This Original Code and all software distributed under the License are
13: * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14: * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15: * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16: * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17: * License for the specific language governing rights and limitations
18: * under the License.
19: *
20: * @APPLE_LICENSE_HEADER_END@
21: */
22: #define STANDALONE 0
23:
24: #if STANDALONE
25: #include "asm.h"
26: #include "assym.h"
27: #include "proc_reg.h" /* For CACHE_LINE_SIZE */
28:
29: #else
30:
31: #include <mach/ppc/asm.h>
32: #if 0
33: /* #include <assym.h> */
34: #include <ppc/proc_reg.h> /* For CACHE_LINE_SIZE */
35: #endif 0
36: #endif
37:
/*
 * xsum_assym - add the 16-bit end-around-carry (one's-complement) sum of a
 * buffer to a running sum.
 *
 * On entry:
 *   Reg 3 - Pointer to data
 *   Reg 4 - Length of data, in bytes
 *   Reg 5 - Accumulated sum value
 *   Reg 6 - Starting on odd boundary flag (relative to byte 0 of the
 *           checksummed data); zero means even, nonzero means odd
 *
 * On exit:
 *   Reg 3 - Accumulated sum plus the sum of this buffer, folded to 16 bits
 *
 * Register use inside the routine:
 *   r8      - running 32-bit sum (carries are chained through XER[CA])
 *   r5-r7   - data being loaded / scratch
 *   r9, r10 - scratch and constants
 *   r11     - nonzero if the 16-bit result must be byte-swapped at the end
 *   r12     - copy of the passed-in sum
 *   cr0 and XER[CA] are modified.
 *
 * The data is summed with adde, so between the addic that clears the carry
 * and the addze pair at Ldone no instruction that alters XER[CA] may be
 * used other than adde itself (add, addi, subf, cmpi, and., andi., slwi and
 * srwi are all safe).
 */

ENTRY(xsum_assym, TAG_NO_FRAME_USED)

        mr      r11, r6                 ; Swap flag starts out as the odd-offset flag
        li      r8, 0                   ; Running sum
        li      r10, 0x1f               ; Mask: offset within a 32-byte line
        li      r7, 1
        addic   r7, r7, 0               ; 1 + 0 cannot carry: this clears the carry bit
        mr      r12, r5                 ; Save the passed-in checksum value

/*
 * Sum bytes before cache line boundary
 */

        cmpi    cr0,0,r4,0              ; Check for length of 0
        beq     Lleftovers

        and.    r9, r3, r10
        beq     Laligned32              ; 32 byte aligned

        andi.   r9, r3, 0x3
        beq     Laligned4               ; 4 byte aligned

        andi.   r9, r3, 0x1
        beq     Laligned2               ; 2 byte aligned

/*
 * Odd address.  The first byte is taken on its own into the low byte of the
 * sum and everything after it is summed as aligned halfwords, which puts
 * every byte in the opposite half of its 16-bit word compared with a buffer
 * that starts on an even address.  That displacement combines with the
 * odd-offset flag from the caller: if exactly one of the two is odd the
 * result needs a byte swap, if both are odd they cancel out.
 */
        cmpi    cr0,0,r11,0             ; Did the caller flag an odd data offset?
        li      r11, 1                  ; No: odd address alone means swap at end
        beq     Loddstart
        li      r11, 0                  ; Yes: the two displacements cancel
Loddstart:
        lbz     r8, 0(r3)
        add     r3, r3, r7              ; r7 is 1 here
        subf.   r4, r7, r4
        beq     Ldone

Laligned2:
        cmpi    cr0,0,r4,2              ; If remaining length is less than two - go to wrap-up
        blt     Lleftovers
        andi.   r9, r3, 0x3             ; If aligned on a 4-byte boundary, go to that code
        beq     Laligned4
        lhz     r5, 0(r3)               ; Load and add a halfword to the checksum
        adde    r8, r8, r5
        slwi    r7, r7, 1               ; r7 = 2
        add     r3, r3, r7
        subf.   r4, r7, r4
        beq     Ldone

/*
 * Add longwords up to the 32 byte boundary
 */

Laligned4:
        li      r7, 4
Lloop4:
        cmpi    cr0,0,r4,4
        blt     Lleftovers
        and.    r9, r3, r10
        beq     Laligned32
        lwz     r5, 0(r3)
        adde    r8, r8, r5
        add     r3, r3, r7
        subf.   r4, r7, r4
        bne     Lloop4
        b       Ldone

/*
 * We're aligned on a 32 byte boundary now - add 8 longwords to checksum
 * until the remaining length is less than 32
 */
Laligned32:
        andis.  r6, r4, 0xffff          ; Any length bits above the low 16?
        bne     Lmainloop
        andi.   r6, r4, 0xffe0          ; At least 32 bytes left?
        beq     Lleftovers

Lmainloop:
        li      r9, 64
        li      r10, 32
        cmpi    cr0,0,r4,64
        blt     Lnopretouch
        dcbt    r3, r10                 ; Touch one cache-line ahead
Lnopretouch:
        lwz     r5, 0(r3)

/*
 * This is the main meat of the checksum.  I attempted to arrange this code
 * such that the processor would execute as many instructions as possible
 * in parallel.
 */

Lloop:
        cmpi    cr0,0,r4,96
        blt     Lnotouch
        dcbt    r3, r9                  ; Touch two cache lines ahead
Lnotouch:
        adde    r8, r8, r5
        lwz     r5, 4(r3)
        lwz     r6, 8(r3)
        lwz     r7, 12(r3)
        adde    r8, r8, r5
        lwz     r5, 16(r3)
        adde    r8, r8, r6
        lwz     r6, 20(r3)
        adde    r8, r8, r7
        lwz     r7, 24(r3)
        adde    r8, r8, r5
        lwz     r5, 28(r3)
        add     r3, r3, r10
        adde    r8, r8, r6
        adde    r8, r8, r7
        adde    r8, r8, r5
        subf    r4, r10, r4
        cmpli   cr0,0,r4,32             ; Test the whole remaining count, not just
        blt     Lleftovers              ; its low 16 bits, so 64K+ buffers work
        lwz     r5, 0(r3)
        b       Lloop

/*
 * Handle whatever bytes are left.  Every path that arrives here has fewer
 * than 32 bytes remaining.
 */

Lleftovers:
        cmpi    cr0,0,r4,0
        beq     Ldone

        cmpi    cr0,0,r4,4
        blt     Ltwoormore              ; Fewer than 4: try a halfword, then a byte

        li      r10, 4

Lfourloop:
        lwz     r5, 0(r3)
        adde    r8, r8, r5
        add     r3, r3, r10
        subf    r4, r10, r4
        andi.   r6, r4, 0xfffc          ; Another full longword left?
        bne     Lfourloop

Ltwoormore:
        andi.   r6, r4, 0xfffe          ; A full halfword left?
        beq     Loneleft
        lhz     r5, 0(r3)
        adde    r8, r8, r5
        addi    r3, r3, 2
        subi    r4, r4, 2

Loneleft:
        cmpi    cr0,0,r4,0
        beq     Ldone
        lbz     r5, 0(r3)
        slwi    r5, r5, 8               ; Trailing byte is the high byte of its halfword
        adde    r8, r8, r5

/*
 * Wrap the longword around, adding the two 16-bit portions
 * to each other along with any previous and subsequent carries.
 */
Ldone:
        addze   r8, r8                  ; Add the carry
        addze   r8, r8                  ; Add the carry again (the last add may have carried)
        srwi    r6, r8, 16              ; r6 = high order 16 bits of the sum word
        andi.   r8, r8, 0xffff          ; Zero out the high order halfword
        add     r8, r8, r6              ; Add the two halves

        srwi    r6, r8, 16              ; Do the above again in case we carried into the
        andi.   r8, r8, 0xffff          ; high order halfword with the last add.
        add     r3, r8, r6

        cmpi    cr0,0,r11,0             ; Check to see if we need to swap the bytes
        beq     Ldontswap

/*
 * The bytes were summed in the opposite halves of their 16-bit words,
 * so we need to swap the checksum bytes.
 */
        slwi    r8, r3, 8               ; shift byte 0 to byte 1
        clrlwi  r8, r8, 16              ; Clear top 16 bits
        srwi    r3, r3, 8               ; shift byte 1 to byte 0
        or      r3, r8, r3              ; or them

Ldontswap:
        addc    r3, r3, r12             ; Add in the passed-in checksum ...
        addze   r3, r3                  ; ... wrapping a carry out of the word back in
        srwi    r6, r3, 16              ; Wrap and add any carries into the top 16 bits
        andi.   r3, r3, 0xffff
        add     r3, r3, r6

        srwi    r6, r3, 16              ; Do the above again in case we carried into the
        andi.   r3, r3, 0xffff          ; high order halfword with the last add.
        add     r3, r3, r6
        blr
248:
249:
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.