|
|
1.1 root 1: /*
2: * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
3: *
4: * @APPLE_LICENSE_HEADER_START@
5: *
6: * The contents of this file constitute Original Code as defined in and
7: * are subject to the Apple Public Source License Version 1.1 (the
8: * "License"). You may not use this file except in compliance with the
9: * License. Please obtain a copy of the License at
10: * http://www.apple.com/publicsource and read it before using this file.
11: *
12: * This Original Code and all software distributed under the License are
13: * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14: * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15: * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16: * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17: * License for the specific language governing rights and limitations
18: * under the License.
19: *
20: * @APPLE_LICENSE_HEADER_END@
21: */
22: ;
23: ; Copy bytes of data around. Handles overlapped data.
24: ;
25: ; Change this to use Altivec later on, and maybe floating point.
26: ;
27: ; NOTE: This file compiles and executes on both MacOS 8.x (Codewarrior)
28: ; and MacOS X. The "#if 0"s are treated as comments by CW so the
29: ; stuff between them is included by CW and excluded on MacOS X.
30: ; Same with the "#include"s.
31: ;
32: #include <ppc/asm.h>
33: #include <ppc/proc_reg.h>
34:
35: ; Use CR5_lt to indicate non-cached
36: #define noncache 20
37: #if 0
38: noncache: equ 20
39: #endif
40: #if 0
41: br0: equ 0
42: #endif
43:
44: ;
45: ; bcopy_nc(from, to, nbytes)
46: ;
47: ; bcopy_nc operates on non-cached memory so we can not use any kind
48: ; of cache instructions.
49: ; It only sets the noncache CR bit and then joins bcopy at bcpswap.
50:
51:
52:
53: #if 0
54: IF 0
55: #endif
56: ENTRY(bcopy_nc, TAG_NO_FRAME_USED)
57: #if 0
58: ENDIF
59: export xbcopy_nc[DS]
60: tc xbcopy_nc[TC],xbcopy_nc[DS]
61: csect xbcopy_nc[DS]
62: dc.l .xbcopy_nc
63: dc.l TOC[tc0]
64: export .xbcopy_nc
65: csect xbcopy_nc[PR]
66: .xbcopy_nc:
67: #endif
68:
69: crset noncache ; Set non-cached (the full line loop tests this bit to skip dcbz)
70: b bcpswap ; Join bcopy just after its "crclr noncache"
71:
72: ;
73: ; void bcopy(from, to, nbytes)
74: ; Entry: R3 = from, R4 = to, R5 = nbytes. Marks the move as cached, then checks for nothing to do.
75:
76: #if 0
77: IF 0
78: #endif
79: ENTRY(bcopy, TAG_NO_FRAME_USED)
80: #if 0
81: ENDIF
82: export xbcopy[DS]
83: tc xbcopy[TC],xbcopy[DS]
84: csect xbcopy[DS]
85: dc.l .xbcopy
86: dc.l TOC[tc0]
87: export .xbcopy
88: csect xbcopy[PR]
89: .xbcopy:
90: #endif
91:
92: crclr noncache ; Set cached
93:
94: bcpswap: cmplw cr1,r4,r3 ; Compare "to" and "from" (bcopy_nc joins here with noncache set)
95: mr. r5,r5 ; Check if we have a 0 length (result in CR0)
96: mr r6,r3 ; Set source (copyit expects R6 = source, R4 = sink)
97: beqlr- cr1 ; Bail if "to" and "from" are the same
98: beqlr- ; Bail if length is 0
99: b copyit ; Go copy it...
100:
101: ;
102: ; When we move the memory, forward overlays must be handled. We
103: ; also can not use the cache instructions if we are from bcopy_nc.
104: ; We need to preserve R3 because it needs to be returned for memcpy.
105: ; We can be interrupted and lose control here.
106: ;
107: ; There is no stack, so in order to use floating point, we would
108: ; need to take the FP exception. Any potential gains by using FP
109: ; would be more than eaten up by this.
110: ;
111: ; Later, we should use Altivec for large moves.
112: ; memcpy entry: R3 = "to", R4 = "from", R5 = length. Falls through into copyit.
113:
114: #if 0
115: IF 0
116: #endif
117: ENTRY(memcpy, TAG_NO_FRAME_USED)
118: #if 0
119: ENDIF
120: export xmemcpy[DS]
121: tc xmemcpy[TC],xmemcpy[DS]
122: csect xmemcpy[DS]
123: dc.l .xmemcpy
124: dc.l TOC[tc0]
125: export .xmemcpy
126: csect xmemcpy[PR]
127: .xmemcpy:
128: #endif
129: cmplw cr1,r3,r4 ; "to" and "from" the same?
130: mr r6,r4 ; Set the "from" (R6 is the source pointer used by copyit)
131: mr. r5,r5 ; Length zero? (result in CR0)
132: crclr noncache ; Set cached
133: mr r4,r3 ; Set the "to" (R4 is the sink; R3 is left intact for the return)
134: beqlr- cr1 ; "to" and "from" are the same
135: beqlr- ; Length is 0
136:
137: copyit: sub r12,r4,r6 ; Get potential overlap (negative if backward move)
138: lis r8,0x7FFF ; Start up a mask
139: srawi r11,r12,31 ; Propagate the sign bit (R11 = 0 or -1)
140: dcbt br0,r6 ; Touch in the first source line
141: cntlzw r7,r5 ; Count leading zeros of the length (locates its highest set bit)
142: ori r8,r8,0xFFFF ; Make limit 0x7FFFFFFF
143: xor r9,r12,r11 ; If sink - source was negative, invert bits
144: srw r8,r8,r7 ; Get move length limitation (mask of the bits below the highest set bit of the length)
145: sub r9,r9,r11 ; If sink - source was negative, add 1 and get absolute value
146: cmplw r12,r5 ; See if we actually forward overlap (unsigned: sink - source < length)
147: cmplwi cr7,r9,32 ; See if at least a line between source and sink
148: dcbtst br0,r4 ; Touch in the first sink line
149: cmplwi cr1,r5,32 ; Are we moving more than a line? (CR1 is not tested again in the code visible here)
150: cror noncache,noncache,28 ; Set to not DCBZ output line if not enough space (bit 28 is CR7 "lt" from the distance compare)
151: blt- fwdovrlap ; This is a forward overlapping area, handle it...
152:
153: ;
154: ; R4 = sink
155: ; R5 = length
156: ; R6 = source
157: ; R8 = mask that keeps the front end alignment move smaller than the length
158:
159: ;
160: ; Here we figure out how much we have to move to get the sink onto a
161: ; cache boundary. If we can, and there are still more than 32 bytes
162: ; left to move, we can really speed things up by DCBZing the sink line.
163: ; We can not do this if noncache is set because we will take an
164: ; alignment exception.
165:
166: neg r0,r4 ; Get the number of bytes to move to align to a line boundary
167: rlwinm. r0,r0,0,27,31 ; Clean it up and test it (keep the low 5 bits, CR0 = already aligned?)
168: and r0,r0,r8 ; limit to the maximum front end move
169: mtcrf 3,r0 ; Make branch mask for partial moves (CR bits 27-31 select the 16/8/4/2/1 byte pieces)
170: sub r5,r5,r0 ; Set the length left to move
171: beq alline ; Already on a line... (uses CR0 from the rlwinm. above)
172:
173: bf 31,alhalf ; No single byte to do...
174: lbz r7,0(r6) ; Get the byte
175: addi r6,r6,1 ; Point to the next
176: stb r7,0(r4) ; Save the single
177: addi r4,r4,1 ; Bump sink
178:
179: ; Sink is halfword aligned here
180:
181: alhalf: bf 30,alword ; No halfword to do...
182: lhz r7,0(r6) ; Get the halfword
183: addi r6,r6,2 ; Point to the next
184: sth r7,0(r4) ; Save the halfword
185: addi r4,r4,2 ; Bump sink
186:
187: ; Sink is word aligned here
188:
189: alword: bf 29,aldouble ; No word to do...
190: lwz r7,0(r6) ; Get the word
191: addi r6,r6,4 ; Point to the next
192: stw r7,0(r4) ; Save the word
193: addi r4,r4,4 ; Bump sink
194:
195: ; Sink is double aligned here
196:
197: aldouble: bf 28,alquad ; No double to do...
198: lwz r7,0(r6) ; Get the first word
199: lwz r8,4(r6) ; Get the second word (R8 is free: the limit mask was consumed above)
200: addi r6,r6,8 ; Point to the next
201: stw r7,0(r4) ; Save the first word
202: stw r8,4(r4) ; Save the second word
203: addi r4,r4,8 ; Bump sink
204:
205: ; Sink is quadword aligned here
206:
207: alquad: bf 27,alline ; No quad to do...
208: lwz r7,0(r6) ; Get the first word
209: lwz r8,4(r6) ; Get the second word
210: lwz r9,8(r6) ; Get the third word
211: stw r7,0(r4) ; Save the first word
212: lwz r11,12(r6) ; Get the fourth word
213: addi r6,r6,16 ; Point to the next
214: stw r8,4(r4) ; Save the second word
215: stw r9,8(r4) ; Save the third word
216: stw r11,12(r4) ; Save the fourth word
217: addi r4,r4,16 ; Bump sink
218:
219: ; Sink is line aligned here
220:
221: alline: rlwinm. r0,r5,27,5,31 ; Get the number of full lines to move (remaining length divided by 32)
222: mtcrf 3,r5 ; Make branch mask for backend partial moves (from the low bits of the remaining length)
223: rlwinm r11,r5,0,0,26 ; Get number of bytes we are going to move (remaining length rounded down to whole lines)
224: beq- backend ; No full lines to move
225:
226: sub r5,r5,r11 ; Calculate the residual
227: li r10,96 ; Stride for touch ahead
228:
229: nxtline: subic. r0,r0,1 ; Account for the line now (CR0 is tested by the bgt+ at the loop end)
230:
231: bt- noncache,skipz ; Skip if we are not cached...
232: dcbz br0,r4 ; Blow away the whole line because we are replacing it
233: dcbt r6,r10 ; Touch ahead a bit (source + 96)
234:
235: skipz: lwz r7,0(r6) ; Get the first word
236: lwz r8,4(r6) ; Get the second word
237: lwz r9,8(r6) ; Get the third word
238: stw r7,0(r4) ; Save the first word
239: lwz r11,12(r6) ; Get the fourth word
240: stw r8,4(r4) ; Save the second word
241: lwz r7,16(r6) ; Get the fifth word
242: stw r9,8(r4) ; Save the third word
243: lwz r8,20(r6) ; Get the sixth word
244: stw r11,12(r4) ; Save the fourth word
245: lwz r9,24(r6) ; Get the seventh word
246: stw r7,16(r4) ; Save the fifth word
247: lwz r11,28(r6) ; Get the eighth word
248: addi r6,r6,32 ; Point to the next
249: stw r8,20(r4) ; Save the sixth word
250: stw r9,24(r4) ; Save the seventh word
251: stw r11,28(r4) ; Save the eighth word
252: addi r4,r4,32 ; Bump sink
253: bgt+ nxtline ; Do the next line, if any...
254:
255:
256: ; Move backend quadword
257:
258: backend: bf 27,noquad ; No quad to do...
259: lwz r7,0(r6) ; Get the first word
260: lwz r8,4(r6) ; Get the second word
261: lwz r9,8(r6) ; Get the third word
262: lwz r11,12(r6) ; Get the fourth word
263: stw r7,0(r4) ; Save the first word
264: addi r6,r6,16 ; Point to the next
265: stw r8,4(r4) ; Save the second word
266: stw r9,8(r4) ; Save the third word
267: stw r11,12(r4) ; Save the fourth word
268: addi r4,r4,16 ; Bump sink
269:
270: ; Move backend double
271:
272: noquad: bf 28,nodouble ; No double to do...
273: lwz r7,0(r6) ; Get the first word
274: lwz r8,4(r6) ; Get the second word
275: addi r6,r6,8 ; Point to the next
276: stw r7,0(r4) ; Save the first word
277: stw r8,4(r4) ; Save the second word
278: addi r4,r4,8 ; Bump sink
279:
280: ; Move backend word
281:
282: nodouble: bf 29,noword ; No word to do...
283: lwz r7,0(r6) ; Get the word
284: addi r6,r6,4 ; Point to the next
285: stw r7,0(r4) ; Save the word
286: addi r4,r4,4 ; Bump sink
287:
288: ; Move backend halfword
289:
290: noword: bf 30,nohalf ; No halfword to do...
291: lhz r7,0(r6) ; Get the halfword
292: addi r6,r6,2 ; Point to the next
293: sth r7,0(r4) ; Save the halfword
294: addi r4,r4,2 ; Bump sink
295:
296: ; Move backend byte
297:
298: nohalf: bflr 31 ; Leave cuz we are all done...
299: lbz r7,0(r6) ; Get the byte
300: stb r7,0(r4) ; Save the single
301:
302: blr ; Leave cuz we are all done...
303:
304: ;
305: ; 0123456789ABCDEF0123456789ABCDEF
306: ; 0123456789ABCDEF0123456789ABCDEF
307: ; F
308: ; DE
309: ; 9ABC
310: ; 12345678
311: ; 123456789ABCDEF0
312: ; 0
313:
314: ;
315: ; Here is where we handle a forward overlapping move. These will be slow
316: ; because we can not kill the cache of the destination until after we have
317: ; loaded/saved the source area. Also, because reading memory backwards is
318: ; slower when the cache line needs to be loaded because the critical
319: ; doubleword is loaded first, i.e., the last, then it goes back to the first,
320: ; and on in order. That means that when we are at the second to last DW we
321: ; have to wait until the whole line is in cache before we can proceed.
322: ; The copy runs from the end of both areas toward the start, using negative offsets.
323:
324: fwdovrlap: add r4,r5,r4 ; Point past the last sink byte
325: add r6,r5,r6 ; Point past the last source byte
326: and r0,r4,r8 ; Apply movement limit (R8 is the mask built at copyit)
327: li r12,-1 ; Make sure we touch in the actual line (the pointers are one past the last byte)
328: mtcrf 3,r0 ; Figure out the best way to move backwards (CR bits 27-31 select the 16/8/4/2/1 byte pieces)
329: dcbt r12,r6 ; Touch in the last line of source
330: rlwinm. r0,r0,0,27,31 ; Calculate the length to adjust to cache boundary
331: dcbtst r12,r4 ; Touch in the last line of the sink
332: beq- balline ; Already on cache line boundary
333:
334: sub r5,r5,r0 ; Precalculate move length left after alignment
335:
336: bf 31,balhalf ; No single byte to do...
337: lbz r7,-1(r6) ; Get the byte
338: subi r6,r6,1 ; Point to the next
339: stb r7,-1(r4) ; Save the single
340: subi r4,r4,1 ; Bump sink
341:
342: ; Sink is halfword aligned here
343:
344: balhalf: bf 30,balword ; No halfword to do...
345: lhz r7,-2(r6) ; Get the halfword
346: subi r6,r6,2 ; Point to the next
347: sth r7,-2(r4) ; Save the halfword
348: subi r4,r4,2 ; Bump sink
349:
350: ; Sink is word aligned here
351:
352: balword: bf 29,baldouble ; No word to do...
353: lwz r7,-4(r6) ; Get the word
354: subi r6,r6,4 ; Point to the next
355: stw r7,-4(r4) ; Save the word
356: subi r4,r4,4 ; Bump sink
357:
358: ; Sink is double aligned here
359:
360: baldouble: bf 28,balquad ; No double to do...
361: lwz r7,-8(r6) ; Get the first word
362: lwz r8,-4(r6) ; Get the second word (R8 is free: the limit mask was consumed above)
363: subi r6,r6,8 ; Point to the next
364: stw r7,-8(r4) ; Save the first word
365: stw r8,-4(r4) ; Save the second word
366: subi r4,r4,8 ; Bump sink
367:
368: ; Sink is quadword aligned here
369:
370: balquad: bf 27,balline ; No quad to do...
371: lwz r7,-16(r6) ; Get the first word
372: lwz r8,-12(r6) ; Get the second word
373: lwz r9,-8(r6) ; Get the third word
374: lwz r11,-4(r6) ; Get the fourth word
375: stw r7,-16(r4) ; Save the first word
376: subi r6,r6,16 ; Point to the next
377: stw r8,-12(r4) ; Save the second word
378: stw r9,-8(r4) ; Save the third word
379: stw r11,-4(r4) ; Save the fourth word
380: subi r4,r4,16 ; Bump sink
381:
382: ; Sink is line aligned here
383:
384: balline: rlwinm. r0,r5,27,5,31 ; Get the number of full lines to move (remaining length divided by 32)
385: mtcrf 3,r5 ; Make branch mask for backend partial moves (R5 itself is not needed after this)
386: beq- bbackend ; No full lines to move
387: #if 0
388: stwu r1,-8(r1) ; Dummy stack for MacOS
389: stw r2,4(r1) ; Save RTOC
390: #endif
391:
392:
393: ; Registers in use: R0, R1, R3, R4, R5, R6
394: ; Registers not in use: R2, R7, R8, R9, R10, R11, R12 - Ok, we can make another free for 8 of them
395:
396: bnxtline: subic. r0,r0,1 ; Account for the line now (CR0 is tested by the ble- and bgt+ below)
397:
398: lwz r7,-32(r6) ; Get the first word
399: lwz r5,-28(r6) ; Get the second word (R5 is reused as scratch; the residual is already in the CR)
400: lwz r2,-24(r6) ; Get the third word (R2 is scratch here; the CodeWarrior only lines save and restore it as RTOC)
401: lwz r12,-20(r6) ; Get the fourth word
402: lwz r11,-16(r6) ; Get the fifth word
403: lwz r10,-12(r6) ; Get the sixth word
404: lwz r9,-8(r6) ; Get the seventh word
405: lwz r8,-4(r6) ; Get the eighth word
406: subi r6,r6,32 ; Point to the next
407:
408: stw r7,-32(r4) ; Save the first word
409: ble- bnotouch ; Last time, skip touch of source...
410: dcbt br0,r6 ; Touch in next source line
411:
412: bnotouch: stw r5,-28(r4) ; Save the second word
413: stw r2,-24(r4) ; Save the third word
414: stw r12,-20(r4) ; Save the fourth word
415: stw r11,-16(r4) ; Save the fifth word
416: stw r10,-12(r4) ; Save the sixth word
417: stw r9,-8(r4) ; Save the seventh word
418: stw r8,-4(r4) ; Save the eighth word
419: subi r4,r4,32 ; Bump sink
420:
421: bgt+ bnxtline ; Do the next line, if any...
422: #if 0
423: lwz r2,4(r1) ; Restore RTOC
424: lwz r1,0(r1) ; Pop dummy stack
425: #endif
426:
427: ;
428: ; Note: We touched these lines in at the beginning
429: ;
430:
431: ; Move backend quadword
432:
433: bbackend: bf 27,bnoquad ; No quad to do...
434: lwz r7,-16(r6) ; Get the first word
435: lwz r8,-12(r6) ; Get the second word
436: lwz r9,-8(r6) ; Get the third word
437: lwz r11,-4(r6) ; Get the fourth word
438: stw r7,-16(r4) ; Save the first word
439: subi r6,r6,16 ; Point to the next
440: stw r8,-12(r4) ; Save the second word
441: stw r9,-8(r4) ; Save the third word
442: stw r11,-4(r4) ; Save the fourth word
443: subi r4,r4,16 ; Bump sink
444:
445: ; Move backend double
446:
447: bnoquad: bf 28,bnodouble ; No double to do...
448: lwz r7,-8(r6) ; Get the first word
449: lwz r8,-4(r6) ; Get the second word
450: subi r6,r6,8 ; Point to the next
451: stw r7,-8(r4) ; Save the first word
452: stw r8,-4(r4) ; Save the second word
453: subi r4,r4,8 ; Bump sink
454:
455: ; Move backend word
456:
457: bnodouble: bf 29,bnoword ; No word to do...
458: lwz r7,-4(r6) ; Get the word
459: subi r6,r6,4 ; Point to the next
460: stw r7,-4(r4) ; Save the word
461: subi r4,r4,4 ; Bump sink
462:
463: ; Move backend halfword
464:
465: bnoword: bf 30,bnohalf ; No halfword to do...
466: lhz r7,-2(r6) ; Get the halfword
467: subi r6,r6,2 ; Point to the next
468: sth r7,-2(r4) ; Save the halfword
469: subi r4,r4,2 ; Bump sink
470:
471: ; Move backend byte
472:
473: bnohalf: bflr 31 ; Leave cuz we are all done...
474: lbz r7,-1(r6) ; Get the byte
475: stb r7,-1(r4) ; Save the single
476:
477: blr ; Leave cuz we are all done...
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.