File:  [WindowsNT SDKs] / ntddk / src / video / displays / vga / i386 / nalgnblt.asm
Revision 1.1.1.1 (vendor branch): download - view: text, annotated - select for diffs
Thu Aug 9 18:31:12 2018 UTC (7 years, 9 months ago) by root
Branches: msft, MAIN
CVS tags: ntddk-nov-1993, HEAD
Microsoft Windows NT Build 511 (DDK SDK) 11-01-1993

;******************************Module*Header*******************************\
; Module Name: nalgnblt.asm
;
; driver prototypes
;
; Copyright (c) 1992 Microsoft Corporation
;**************************************************************************/

;-----------------------------------------------------------------------;
; VOID vNonAlignedSrcCopy(PDEVSURF pdsurf, RECTL * prcldst, PPOINTL * pptlsrc,
;                      INT icopydir);
; Input:
;  pdsurf - surface on which to copy
;  prcldest - pointer to destination rectangle
;  pptlsrc - pointer to source upper left corner
;  icopydir - direction in which copy must proceed to avoid overlap problems
;             and synchronize with the clip enumeration visually, according to
;             constants CD_RIGHTDOWN, CD_LEFTDOWN, CD_RIGHTUP, and CD_LEFTUP in
;             WINDDI.H
;
; Performs accelarated non-aligned SRCCOPY VGA-to-VGA blts.
;
;-----------------------------------------------------------------------;
;
; Note: The source and dest *must* be non-aligned (not have the same
; left-edge intrabyte pixel alignment. Will not work properly if they are
; in fact aligned.
;
; Note: Assumes all rectangles have positive heights and widths. Will not
; work properly if this is not the case.
;
;-----------------------------------------------------------------------;

        comment $

The overall approach of this module for each rectangle to copy is:

1) Precalculate the masks and whole byte widths, and determine which of
partial left edge, partial right edge, and whole middle bytes are required
for this copy.

2) Set up the starting pointers for each of the areas (left, whole middle,
right), the start and stop scan lines, the copying direction (left-to-right
or right-to-left, and top-to-bottom or bottom-to-top), the threading
(sequence of calls required to do the left/whole/right components in the
proper sequence), based on the passed-in copy direction, which in turn is
dictated by the nature of the overlap between the source and destination.

3) Execute a loop, based on adapter type (2 R/W windows, 1R/1W window,
1 R/W window, unbanked), that sequences through the intersection of each
bank with the source and destination rectangles in the proper direction
(top-to-bottom or bottom-to-top, based on the passed-in copy direction),
and performs the copy in each such rectangle. The threading vector is used
to call the required routines (copy left/whole/right bytes). For 1 R/W and
1R/1W adapters, there is a second threading vector that is called when the
source and the destination are both adequately (for the copy purposes)
addressable simultaneously (because they're in the same bank), so there's
no need to copy through a temp buffer. We want to avoid the temp
buffer whenever we can, because it's slower.

Note: 1 R/W and 1R/1W edges are copied through a temporary buffer. However,
each plane's bytes are not stored in the corresponding plane's temp buffer, but
rather consecutively in the plane 0 temp buffer. This is to reduce page
faulting, and also so that 1R/1W adapters only need a temp buffer large enough
to hold 4*tallest bank words (4K will do). 1 R/W adapters still copy whole
bytes through the full temp buffer, using all four planes' temp buffers, so
they require a temp buffer big enough to hold a full bank (256K will do).

Note: The VGA's rotator is used to perform all rotation in this module. The
two source bytes relevant to this operation are masked to preserve the desired
bits, then combined and fed to the VGA's rotator, which performs the rotation.
This is better than letting the 386/486 do the rotation because even with the
barrel shifter, those processors take 3 cycles per rotate, where the masking
and combining take only 2 cycles (or no cycles, for edges with 1-wide
sources). We also get to avoid 16-bit instructions like ROL AX,CL; the 16-bit
size prefix costs a cycle on a 486.

        commend $

;-----------------------------------------------------------------------;
; Set LOOP_UNROLL_SHIFT to the log2 of the number of times you want loops in
; this module unrolled. For example, LOOP_UNROLL_SHIFT of 3 yields 2**3 = 8
; times unrolling. This is the only thing you need to change to control
; unrolling. Note: does not affect loops that process in chunks, like edge
; loops.

LOOP_UNROLL_SHIFT equ 2

;-----------------------------------------------------------------------;
; Maximum # of edge bytes to process before switching to next plane. Larger
; means faster, but there's more potential for flicker, since the raster scan
; has a better chance of catching bytes that have changed in some planes but
; not all planes.

EDGE_CHUNK_SIZE equ     16

;-----------------------------------------------------------------------;
; Macro to push the current threading sequence (string of routine calls) on the
; stack, then jump to the first threading entry. The threading pointer can be
; specified, or defaults to pCurrentThread. The return address can be
; immediately after the JMP, or can be specified.

THREAD_AND_START macro THREADING,RETURN_ADDR
        local   push_base, return_address

ifb <&RETURN_ADDR&>
        push    offset return_address   ;after all the threaded routines, we
                                        ; return here
else
        push    offset &RETURN_ADDR&    ;return here
endif

ifb <&THREADING&>
        mov     eax,pCurrentThread
else
        mov     eax,&THREADING&
endif

        mov     ecx,[eax]               ;# of routines to thread (at least 1)
        lea     ecx,[ecx*2+ecx]         ;pushes below are 3 bytes each
        mov     edx,offset push_base+3
        sub     edx,ecx
        jmp     edx                     ;branch to push or jmp below

; Push the threading addresses on to the stack, so routines perform the
; threading as they return.

        push    dword ptr [eax+12]       ;3 byte instruction
        push    dword ptr [eax+8]
push_base:
        jmp     dword ptr [eax+4]        ;jump to the first threaded routine

        align   4
return_address:
        endm

;-----------------------------------------------------------------------;

                .386

ifndef  DOS_PLATFORM
        .model  small,c
else
ifdef   STD_CALL
        .model  small,c
else
        .model  small,pascal
endif;  STD_CALL
endif;  DOS_PLATFORM

        assume cs:FLAT,ds:FLAT,es:FLAT,ss:FLAT
        assume fs:nothing,gs:nothing

        .xlist
        include stdcall.inc             ;calling convention cmacros
        include i386\egavga.inc
        include i386\strucs.inc
        include i386\unroll.inc
        include i386\ropdefs.inc

        .list

;-----------------------------------------------------------------------;

        .data

; Threads for stringing together left, whole byte, and right operations
; in various orders, both using a temp buffer and not. Data format is:
;
; DWORD +0 = # of calls in thread (1, 2, or 3)
;       +4 = first call (required)
;       +8 = second call (optional)
;      +12 = third call (optional)

        align   4

; Copies not involving the temp buffer.

Thread_L        dd      1
                dd      copy_left_edge

Thread_W        dd      1
                dd      copy_whole_bytes

Thread_R        dd      1
                dd      copy_right_edge

Thread_LR       dd      2
                dd      copy_left_edge
                dd      copy_right_edge

Thread_RL       dd      2
                dd      copy_right_edge
                dd      copy_left_edge

Thread_LW       dd      2
                dd      copy_left_edge
                dd      copy_whole_bytes

Thread_WL       dd      2
                dd      copy_whole_bytes
                dd      copy_left_edge

Thread_WR       dd      2
                dd      copy_whole_bytes
                dd      copy_right_edge

Thread_RW       dd      2
                dd      copy_right_edge
                dd      copy_whole_bytes

Thread_LWR      dd      3
                dd      copy_left_edge
                dd      copy_whole_bytes
                dd      copy_right_edge

Thread_RWL      dd      3
                dd      copy_right_edge
                dd      copy_whole_bytes
                dd      copy_left_edge

; Copies involving the temp buffer.

Thread_Lb       dd      1
                dd      copy_left_edge_via_buffer

Thread_Wb       dd      1
                dd      copy_whole_bytes_via_buffer

Thread_Rb       dd      1
                dd      copy_right_edge_via_buffer

Thread_LbRb     dd      2
                dd      copy_left_edge_via_buffer
                dd      copy_right_edge_via_buffer

Thread_RbLb     dd      2
                dd      copy_right_edge_via_buffer
                dd      copy_left_edge_via_buffer

Thread_LbW      dd      2
                dd      copy_left_edge_via_buffer
                dd      copy_whole_bytes

Thread_LbWb     dd      2
                dd      copy_left_edge_via_buffer
                dd      copy_whole_bytes_via_buffer

Thread_WLb      dd      2
                dd      copy_whole_bytes
                dd      copy_left_edge_via_buffer

Thread_WbLb     dd      2
                dd      copy_whole_bytes_via_buffer
                dd      copy_left_edge_via_buffer

Thread_WRb      dd      2
                dd      copy_whole_bytes
                dd      copy_right_edge_via_buffer

Thread_WbRb     dd      2
                dd      copy_whole_bytes_via_buffer
                dd      copy_right_edge_via_buffer

Thread_RbW      dd      2
                dd      copy_right_edge_via_buffer
                dd      copy_whole_bytes

Thread_RbWb     dd      2
                dd      copy_right_edge_via_buffer
                dd      copy_whole_bytes_via_buffer

Thread_LbWRb    dd      3
                dd      copy_left_edge_via_buffer
                dd      copy_whole_bytes
                dd      copy_right_edge_via_buffer

Thread_LbWbRb   dd      3
                dd      copy_left_edge_via_buffer
                dd      copy_whole_bytes_via_buffer
                dd      copy_right_edge_via_buffer

Thread_RbWLb    dd      3
                dd      copy_right_edge_via_buffer
                dd      copy_whole_bytes
                dd      copy_left_edge_via_buffer

Thread_RbWbLb   dd      3
                dd      copy_right_edge_via_buffer
                dd      copy_whole_bytes_via_buffer
                dd      copy_left_edge_via_buffer

;-----------------------------------------------------------------------;
; Table of thread selection for various horizontal copy directions, with
; the look-up index a 4-bit field as follows:
;
; Bit 3 = 1 if left-to-right copy, 0 if right-to-left
; Bit 2 = 1 if left edge must be copied
; Bit 1 = 1 if whole bytes must be copied
; Bit 0 = 1 if right edge must be copied
;
; This is used for all cases where both the source and destination are
; simultaneously addressable for our purposes, so there's no need to go
; through the temp buffer (unbanked, 2 R/W, and sometimes for 1 R/W and 1R/1W).

MasterThreadTable label dword
                                ;right-to-left
        dd      0               ;<not used>
        dd      Thread_R        ;R->L, R
        dd      Thread_W        ;R->L, W
        dd      Thread_RW       ;R->L, RW
        dd      Thread_L        ;R->L, L
        dd      Thread_RL       ;R->L, RL
        dd      Thread_WL       ;R->L, WL
        dd      Thread_RWL      ;R->L, RWL
                                ;left-to-right
        dd      0               ;<not used>
        dd      Thread_R        ;L->R, R
        dd      Thread_W        ;L->R, W
        dd      Thread_WR       ;L->R, WR
        dd      Thread_L        ;L->R, L
        dd      Thread_LR       ;L->R, LR
        dd      Thread_LW       ;L->R, LW
        dd      Thread_LWR      ;L->R, LWR


; Table of thread selection for various adapter types and horizontal
; copy directions, with the look-up index a 6-bit field as follows:
;
; Bit 5 = adapter type high bit
; Bit 4 = adapter type low bit
; Bit 3 = 1 if left-to-right copy, 0 if right-to-left
; Bit 2 = 1 if left edge must be copied
; Bit 1 = 1 if whole bytes must be copied
; Bit 0 = 1 if right edge must be copied
;
; This is used for all cases where the source and destination are not both
; simultaneously addressable for our purposes, so we need to go through the
; temp buffer (only for 1 R/W and 1R/1W, and only sometimes).

MasterThreadTableViaBuffer label dword
                                ;unbanked (no need for buffer)
                                ;right-to-left
        dd      0               ;<not used>
        dd      Thread_R        ;R->L, R
        dd      Thread_W        ;R->L, W
        dd      Thread_RW       ;R->L, RW
        dd      Thread_L        ;R->L, L
        dd      Thread_RL       ;R->L, RL
        dd      Thread_WL       ;R->L, WL
        dd      Thread_RWL      ;R->L, RWL
                                ;left-to-right
        dd      0               ;<not used>
        dd      Thread_R        ;L->R, R
        dd      Thread_W        ;L->R, W
        dd      Thread_WR       ;L->R, WR
        dd      Thread_L        ;L->R, L
        dd      Thread_LR       ;L->R, LR
        dd      Thread_LW       ;L->R, LW
        dd      Thread_LWR      ;L->R, LWR

                                ;1 R/W banking window (everything goes through
                                ;                       buffer)
                                ;right-to-left
        dd      0               ;<not used>
        dd      Thread_Rb       ;R->L, R
        dd      Thread_Wb       ;R->L, W
        dd      Thread_RbWb     ;R->L, RW
        dd      Thread_Lb       ;R->L, L
        dd      Thread_RbLb     ;R->L, RL
        dd      Thread_WbLb     ;R->L, WL
        dd      Thread_RbWbLb   ;R->L, RWL
                                ;left-to-right
        dd      0               ;<not used>
        dd      Thread_Rb       ;L->R, R
        dd      Thread_Wb       ;L->R, W
        dd      Thread_WbRb     ;L->R, WR
        dd      Thread_Lb       ;L->R, L
        dd      Thread_LbRb     ;L->R, LR
        dd      Thread_LbWb     ;L->R, LW
        dd      Thread_LbWbRb   ;L->R, LWR

                                ;1R/1W banking window (edge go through buffer)
                                ;right-to-left
        dd      0               ;<not used>
        dd      Thread_Rb       ;R->L, R
        dd      Thread_W        ;R->L, W
        dd      Thread_RbW      ;R->L, RW
        dd      Thread_Lb       ;R->L, L
        dd      Thread_RbLb     ;R->L, RL
        dd      Thread_WLb      ;R->L, WL
        dd      Thread_RbWLb    ;R->L, RWL
                                ;left-to-right
        dd      0               ;<not used>
        dd      Thread_Rb       ;L->R, R
        dd      Thread_W        ;L->R, W
        dd      Thread_WRb      ;L->R, WR
        dd      Thread_Lb       ;L->R, L
        dd      Thread_LbRb     ;L->R, LR
        dd      Thread_LbW      ;L->R, LW
        dd      Thread_LbWRb    ;L->R, LWR

                                ;2 R/W banking window (no need for buffer)
                                ;right-to-left
        dd      0               ;<not used>
        dd      Thread_R        ;R->L, R
        dd      Thread_W        ;R->L, W
        dd      Thread_RW       ;R->L, RW
        dd      Thread_L        ;R->L, L
        dd      Thread_RL       ;R->L, RL
        dd      Thread_WL       ;R->L, WL
        dd      Thread_RWL      ;R->L, RWL
                                ;left-to-right
        dd      0               ;<not used>
        dd      Thread_R        ;L->R, R
        dd      Thread_W        ;L->R, W
        dd      Thread_WR       ;L->R, WR
        dd      Thread_L        ;L->R, L
        dd      Thread_LR       ;L->R, LR
        dd      Thread_LW       ;L->R, LW
        dd      Thread_LWR      ;L->R, LWR


; Amount to shift adapter type field left for use in MasterThreadTableViaBuffer.

ADAPTER_FIELD_SHIFT     equ     4

; Mask for setting left-to-right bit to "left-to-right true" for use in both
; MasterThread tables.

LEFT_TO_RIGHT_FIELD_SET equ     1000b


; Table of top-to-bottom loops for adapter types.

        align   4
TopToBottomLoopTable label dword
        dd      top_to_bottom_2RW       ;unbanked is same as 2RW
        dd      top_to_bottom_1RW
        dd      top_to_bottom_1R1W
        dd      top_to_bottom_2RW


; Table of bottom-to-top loops for adapter types.

        align   4
BottomToTopLoopTable label dword
        dd      bottom_to_top_2RW       ;unbanked is same as 2RW
        dd      bottom_to_top_1RW
        dd      bottom_to_top_1R1W
        dd      bottom_to_top_2RW


; Table of routines for setting up to copy in various directions.

        align   4
SetUpForCopyDirection   label   dword
        dd      left_to_right_top_to_bottom     ;CD_RIGHTDOWN
        dd      right_to_left_top_to_bottom     ;CD_LEFTDOWN
        dd      left_to_right_bottom_to_top     ;CD_RIGHTUP
        dd      right_to_left_bottom_to_top     ;CD_LEFTUP

;-----------------------------------------------------------------------;
; Left edge clip masks for intrabyte start addresses 0 through 7.
; Whole byte cases are flagged as 0ffh.

jLeftMaskTable  label   byte
        db      0ffh,07fh,03fh,01fh,00fh,007h,003h,001h

;-----------------------------------------------------------------------;
; Right edge clip masks for intrabyte end addresses (non-inclusive)
; 0 through 7. Whole byte cases are flagged as 0ffh.

jRightMaskTable label   byte
        db      0ffh,080h,0c0h,0e0h,0f0h,0f8h,0fch,0feh

;-----------------------------------------------------------------------;
; Table of width-based source-edge-to-buffer copy routines.

        align   4
copy_edge_from_screen_to_buffer label   dword
        dd      copy_screen_to_buffered_edge_1ws
        dd      copy_screen_to_buffered_edge_2ws

;-----------------------------------------------------------------------;
; Table of width-based buffer-to-dest-edge copy routines.

        align   4
copy_edge_from_buffer_to_screen label   dword
        dd      copy_buffered_edge_to_screen_1ws
        dd      copy_buffered_edge_to_screen_2ws

;-----------------------------------------------------------------------;
; Table of width-based edge copy routines (no intermediate buffer).

        align   4
copy_edge_table label   dword
        dd      copy_edge_1ws
        dd      copy_edge_2ws

;-----------------------------------------------------------------------;

        .code

_TEXT$04   SEGMENT DWORD USE32 PUBLIC 'CODE'
           ASSUME  CS:FLAT, DS:FLAT, ES:FLAT, SS:NOTHING, FS:NOTHING, GS:NOTHING

;-----------------------------------------------------------------------;

cProc   vNonAlignedSrcCopy,16,<        \
        uses    esi edi ebx,    \
        pdsurf: ptr DEVSURF,    \
        prcldest : ptr RECTL,   \
        pptlsrc : ptr POINTL,   \
        icopydir : dword

        local   culWholeBytesWidth : dword ;# of bytes to copy across each scan
        local   ulBlockHeight : dword   ;# of scans to copy per bank block
        local   ulWholeScanDelta : dword;offset from end of one whole bytes
                                        ; scan to start of next
        local   ulWholeBytesSrc : dword ;offset in bitmap of first source whole
                                        ; byte to copy from
        local   ulWholeBytesDest : dword;offset in bitmap of first source whole
                                        ; byte to copy to
        local   ulLeftEdgeSrc : dword   ;offset in bitmap of first source left
                                        ; edge byte to copy from
        local   ulLeftEdgeDest : dword  ;offset in bitmap of first dest left
                                        ; edge byte to copy to
        local   ulRightEdgeSrc : dword  ;offset in bitmap of first source right
                                        ; edge byte to copy from
        local   ulRightEdgeDest : dword ;offset in bitmap of first dest right
                                        ; edge byte to copy to
        local   ulNextScan : dword      ;width of scan, in bytes
        local   jLeftMask : dword       ;left edge clip mask
        local   jRightMask : dword      ;right edge clip mask
        local   culTempCount : dword    ;handy temporary counter
        local   pTempEntry : dword      ;temporary storage for vector into
                                        ; unrolled loop
        local   pTempPlane : dword      ;pointer to storage in temp buffer for
                                        ; edge bytes (which are stored
                                        ; consecutively, not in each plane's
                                        ; temp buffer, to reduce possible page
                                        ; faulting
        local   ppTempPlane0 : dword    ;pointer to pointer to storage in temp
                                        ; buffer for plane 0, immediately
                                        ; preceded by storage for planes 1, 2,
                                        ; and 3
        local   ppTempPlane3 : dword    ;like above, but for plane 3
        local   ulOffsetInBank : dword  ;offset relative to bank start
        local   pSrcAddr : dword        ;working pointer to first source
                                        ; byte to copy from
        local   pDestAddr : dword       ;working pointer to first dest
                                        ; byte to copy to
        local   ulCurrentJustification:dword ;justification used to map in
                                             ; banks; top for top to bottom
                                             ; copies, bottom for bottom to top
        local   ulCurrentSrcScan :dword ;scan line used to map in current
                                        ; source bank
        local   ulCurrentDestScan:dword ;scan line used to map in current dest
                                        ; bank
        local   ulLastDestScan :dword   ;scan in target rect at which we stop
                                        ; advancing through banks
        local   pCurrentThread : dword  ;pointer to data describing the
                                        ; threaded calls to be performed to
                                        ; perform the current copy
        local   pCurrentThreadViaBuffer:dword
                                        ;pointer to data describing the
                                        ; threaded calls to be performed to
                                        ; perform the current copy in the case
                                        ; where the source and destination are
                                        ; not simultaneously adequately
                                        ; accessible, so the copy has to go
                                        ; through a temp buffer (used only for
                                        ; 1 R/W and 1R/1W banking)
        local   ulAdapterType : dword   ;adapter type code, per VIDEO_BANK_TYPE
        local   ulLWRType : dword       ;whether left edge, whole bytes, and
                                        ; right edge are involved in the
                                        ; current operation;
                                        ; bit 2 = 1 if left edge involved
                                        ; bit 1 = 1 if whole bytes involved
                                        ; bit 0 = 1 if right edge involved
        local   ulLeftEdgeAdjust :dword ;used to bump the whole bytes start
                                        ; address past the left edge when the
                                        ; left edge is partial
        local   ulCombineMask : dword   ;mask for combining desired portions
                                        ; of AL and AH before ORing to make a
                                        ; single byte; used to combine before
                                        ; letting VGA rotate byte as it's
                                        ; written. Used for all cases except
                                        ; whole bytes copied left-to-right
        local   ulCombineMaskWhole : dword
                                        ;mask for combining desired portions of
                                        ; AL and AH when copying whole bytes
                                        ; (different from ulCombineMask in the
                                        ; case of whole bytes left-to-right
                                        ; copies, because then AH is the lsb
                                        ; and AL is the MSB; then, this is
                                        ; ulCombineMask with the bytes swapped.
                                        ; For right-to-left whole byte copies,
                                        ; this is the same as ulCombineMask)
        local   ulTempScanCount : dword ;temp scan line countdown variable
        local   ulWholeScanSrcDelta : dword
                                        ;offset from end of one source whole
                                        ; bytes scan line to start of next.
                                        ; Differs from ulWholeScanDelta because
                                        ; of source rotation pipeline priming
        local   ulLeftSrcWidthMinus1 : dword ;# of bytes in left src edge minus
                                             ; one (0 or 1)
        local   ulRightSrcWidthMinus1 : dword ;# of bytes in right src edge
                                             ; minus one (0 or 1)

;-----------------------------------------------------------------------;

; Set pointers to temp buffer plane pointers (used only by 1 R/W and 1R/1W
; adapters), and other rectangle-independent variables.

        mov     esi,pdsurf
        mov     eax,[esi].dsurf_pvBankBufferPlane0
        mov     pTempPlane,eax
        lea     eax,[esi].dsurf_pvBankBufferPlane0
        mov     ppTempPlane0,eax
        lea     eax,[esi].dsurf_pvBankBufferPlane3
        mov     ppTempPlane3,eax

        mov     eax,[esi].dsurf_vbtBankingType
        mov     ulAdapterType,eax

; Copy the rectangle.

        call    copy_rect

;-----------------------------------------------------------------------;
; Set the VGA registers back to their default state.
;-----------------------------------------------------------------------;

        mov     edx,VGA_BASE + GRAF_ADDR
        mov     eax,(0ffh shl 8) + GRAF_BIT_MASK
        out     dx,ax           ;enable bit mask for all bits

        mov     eax,(DR_SET shl 8) + GRAF_DATA_ROT
        out     dx,ax           ;restore default of no rotation

        mov     dl,SEQ_DATA
        mov     al,MM_ALL
        out     dx,al           ;enable writes to all planes

        cld                     ;restore default direction flag

        cRet    vNonAlignedSrcCopy ;done


;***********************************************************************;
;
; Copies the specified rectangle.
;
;***********************************************************************;

        align   4
copy_rect:

; Calculate the rotation, set up the VGA's rotator, and set the byte-combining
; masks.

        mov     edi,prcldest            ;left edge of destination
        mov     esi,pptlsrc
        mov     ah,byte ptr [edi].xLeft ;left edge of source
        sub     ah,byte ptr [esi].ptl_x
        and     ah,07h                  ;rotation = (dest - source) % 8
        mov     edx,VGA_BASE + GRAF_ADDR
        mov     al,GRAF_DATA_ROT
        out     dx,ax                   ;set the VGA's rotator for the rotation

; Set up byte-combining mask, in preparation for ORing and letting the VGA's
; rotator rotate, assuming the left-hand source byte is in AL and the
; right-hand source byte is in AH (true for all cases except left-to-right
; whole bytes).

        mov     cl,ah
        mov     eax,0000ff00h
        rol     ax,cl
        mov     ulCombineMask,eax

; Calculate source edge widths (1 or 2 bytes).

        sub     edx,edx         ;assume right source width is 1
        mov     ebx,[edi].xLeft
        mov     ecx,[edi].xRight ;dest right edge (non-inclusive)
        dec     ecx             ;make it inclusive
        sub     ecx,ebx         ;dest width = dest right - dest left
        mov     eax,[esi].ptl_x
        add     ecx,eax         ;ECX = right edge of source
        xor     eax,ecx
        and     eax,not 07h     ;do the src start and end differ in byte
                                ; address bits? (as opposed to intrabyte)
        jz      short @F        ;no, force 1-wide source

        mov     al,byte ptr [edi].xLeft
        mov     ah,byte ptr [esi].ptl_x
        and     eax,00000707h
        cmp     ah,al
        jb      short @F
        inc     edx             ;left source width is 2
@@:
        mov     ulLeftSrcWidthMinus1,edx

        sub     edx,edx         ;assume right source width is 1
        mov     eax,[edi].xRight ;dest right edge (non-inclusive)
        dec     eax             ;make it inclusive
        and     cl,07h          ;intrabyte source address
        and     al,07h          ;intrabyte dest address
        cmp     cl,al
        ja      short @F
        inc     edx             ;right source width is 2
@@:
        mov     ulRightSrcWidthMinus1,edx

; Set up masks and whole bytes count, and build left/whole/right index
; indicating which of those parts are involved in the copy.

        mov     ebx,[edi].xRight        ;right edge of fill (non-inclusive)
        mov     ecx,ebx
        and     ecx,0111b               ;intrabyte address of right edge
        mov     ah,jRightMaskTable[ecx] ;right edge mask

        mov     esi,[edi].xLeft         ;left edge of fill (inclusive)
        mov     ecx,esi
        shr     ecx,3                   ;/8 for start offset from left edge
                                        ; of scan line
        sub     ebx,esi                 ;width in pixels of fill

        and     esi,0111b               ;intrabyte address of left edge
        mov     al,jLeftMaskTable[esi]  ;left edge mask

        dec     ebx                     ;make inclusive on right
        add     ebx,esi                 ;inclusive width, starting counting at
                                        ; the beginning of the left edge byte
        shr     ebx,3                   ;width of fill in bytes touched - 1
        jnz     short more_than_1_byte  ;more than 1 byte is involved

; Only one byte will be affected. Combine first/last masks.

        and     al,ah                   ;we'll use first byte mask only
        xor     ah,ah                   ;want last byte mask to be 0 to
                                        ; indicate right edge not involved
        inc     ebx                     ;so there's one count to subtract below
                                        ; if this isn't a whole edge byte
more_than_1_byte:

; If all pixels in the left edge are altered, combine the first byte into the
; whole byte count, because we can handle solid edge bytes faster as part of
; the whole bytes. Ditto for the right edge.

        sub     ecx,ecx                 ;edge whole-status accumulator
        cmp     al,-1                   ;is left edge a whole byte or partial?
        adc     ecx,ecx                 ;ECX=1 if left edge partial, 0 if whole
        sub     ebx,ecx                 ;if left edge partial, deduct it from
                                        ; the whole bytes count
        mov     ulLeftEdgeAdjust,ecx    ;for skipping over the left edge if
                                        ; it's partial when pointing to the
                                        ; whole bytes
        and     ah,ah                   ;is right edge mask 0, meaning this
                                        ; fill is only 1 byte wide?
        jz      short save_masks        ;yes, no need to do anything
        or      ecx,40h                 ;assume there's a partial right edge
        cmp     ah,-1                   ;is right edge a whole byte or partial?
        jnz     short save_masks        ;partial
                                        ;bit 1=0 if left edge partial, 1 whole
        inc     ebx                     ;if right edge whole, include it in the
                                        ; whole bytes count
        and     ecx,not 40h             ;there's no partial right edge
save_masks:
        cmp     ebx,1                   ;do we have any whole bytes?
        cmc                             ;CF set if whole byte count > 0
        adc     ecx,ecx                 ;if any whole bytes, set whole bytes
                                        ; bit in left/whole/right accumulator
        rol     cl,1                    ;align the left/whole/right bits
        mov     ulLWRType,ecx           ;save left/whole/right status

        mov     byte ptr jLeftMask,al   ;save left and right clip masks
        mov     byte ptr jRightMask,ah
        mov     culWholeBytesWidth,ebx  ;save # of whole bytes

; Copy the rectangle in the specified direction.

        mov     eax,icopydir
        jmp     SetUpForCopyDirection[eax*4]


;***********************************************************************;
;
; The following routines set up to handle the four possible copy
; directions.
;
;***********************************************************************;


;-----------------------------------------------------------------------;
; Set-up code for left-to-right, top-to-bottom copies.
;-----------------------------------------------------------------------;

        align   4
left_to_right_top_to_bottom:

        cld                             ;we'll copy left to right

; Byte-combining mask, in preparation for ORing and letting the VGA's rotator
; rotate, assuming the left-hand source byte is in AH and the right-hand source
; byte is in AL (true only for left-to-right whole bytes).

        mov     eax,ulCombineMask
        not     eax
        mov     ulCombineMaskWhole,eax

        mov     esi,pdsurf
        mov     eax,[esi].dsurf_lNextScan
        mov     ulNextScan,eax          ;copy top to bottom
        sub     eax,culWholeBytesWidth  ;offset from end of one dest whole byte
        mov     ulWholeScanDelta,eax    ; scan to start of next
        dec     eax                     ;offset from end of one src whole byte
        mov     ulWholeScanSrcDelta,eax ; scan to start of next, accounting for
                                        ; leading byte used to prime the
                                        ; rotation pipeline

        mov     esi,ulLWRType           ;3-bit flag field for left, whole, and
                                        ; right involvement in operation
        or      esi,LEFT_TO_RIGHT_FIELD_SET   ;add left-to-right into the index
        mov     eax,MasterThreadTable[esi*4]
        mov     pCurrentThread,eax      ;threading when no buffering is needed
        mov     edx,ulAdapterType
        shl     edx,ADAPTER_FIELD_SHIFT
        or      esi,edx                 ;factor adapter type into the index
        mov     eax,MasterThreadTableViaBuffer[esi*4]
        mov     pCurrentThreadViaBuffer,eax ;threading when buffering is needed

        mov     ulCurrentJustification,JustifyTop ;copy top to bottom

        mov     esi,prcldest
        mov     eax,[esi].yBottom
        mov     ulLastDestScan,eax      ;end at bottom of dest copy rect
        mov     eax,[esi].yTop
        mov     ulCurrentDestScan,eax   ;start at top of dest copy rect
        mul     ulNextScan              ;offset in bitmap of top dest rect scan
        mov     edx,[esi].xLeft
        shr     edx,3                   ;byte X address
        add     eax,edx                 ;offset in bitmap of first dest byte
        mov     ulLeftEdgeDest,eax      ;that's where the left dest edge is
        add     eax,ulLeftEdgeAdjust    ;the whole bytes start at the next
                                        ; byte, unless the left edge is a whole
                                        ; byte and is thus part of the whole
                                        ; bytes already
        mov     ulWholeBytesDest,eax    ;where the whole dest bytes start
        add     eax,culWholeBytesWidth  ;point to the right edge
        mov     ulRightEdgeDest,eax     ;where the right dest edge starts

        mov     esi,pptlsrc
        mov     eax,[esi].ptl_y
        mov     ulCurrentSrcScan,eax    ;start at top of source copy rect
        mul     ulNextScan              ;offset in bitmap of top dest rect scan
        mov     edx,[esi].ptl_x
        shr     edx,3                   ;byte X address
        add     eax,edx                 ;offset in bitmap of first source byte
        mov     ulLeftEdgeSrc,eax       ;that's where the left src edge is
        add     eax,ulLeftSrcWidthMinus1 ;the first whole byte includes the
        dec     eax                      ; last (leftmost) left edge byte, so
        add     eax,ulLeftEdgeAdjust     ; add a byte if the left edge is 2
                                         ; wide, except when the left dest byte
                                         ; is solid so the left edge is part of
                                         ; the whole bytes
        mov     ulWholeBytesSrc,eax     ;where the src whole bytes start
        add     eax,culWholeBytesWidth  ;point to the right edge
        mov     ulRightEdgeSrc,eax      ;where the right src edge starts,
                                        ; because the whole bytes and the right
                                        ; source edge share a byte, and we
                                        ; always point to the leftmost byte in
                                        ; the right source edge

; Branch to the appropriate top-to-bottom bank enumeration loop.

        mov     eax,ulAdapterType
        jmp     TopToBottomLoopTable[eax*4]


;-----------------------------------------------------------------------;
; Set-up code for right-to-left, top-to-bottom copies.
;-----------------------------------------------------------------------;

        align   4
right_to_left_top_to_bottom:

        std                             ;we'll copy right to left

; Byte-combining mask, in preparation for ORing and letting the VGA's rotator
; rotate, assuming the left-hand source byte is in AL and the right-hand source
; byte is in AH (always true except for left-to-right whole bytes).

        mov     eax,ulCombineMask
        mov     ulCombineMaskWhole,eax

        mov     esi,pdsurf
        mov     eax,[esi].dsurf_lNextScan
        mov     ulNextScan,eax          ;copy top to bottom
        add     eax,culWholeBytesWidth  ;offset from end of one whole byte scan
        mov     ulWholeScanDelta,eax    ; to start of next, given that we're
                                        ; copying one way and going scan-to-
                                        ; scan the other way
        inc     eax                     ;offset from end of one src whole byte
        mov     ulWholeScanSrcDelta,eax ; scan to start of next, accounting for
                                        ; leading byte used to prime the
                                        ; rotation pipeline

        mov     esi,ulLWRType           ;3-bit flag field for left, whole, and
                                        ; right involvement in operation
                                        ;leave left-to-right field cleared, so
                                        ; we look up right-to-left entries
        mov     eax,MasterThreadTable[esi*4]
        mov     pCurrentThread,eax      ;threading when no buffering is needed
        mov     edx,ulAdapterType
        shl     edx,ADAPTER_FIELD_SHIFT
        or      esi,edx                 ;factor adapter type into the index
        mov     eax,MasterThreadTableViaBuffer[esi*4]
        mov     pCurrentThreadViaBuffer,eax ;threading when buffering is needed

        mov     ulCurrentJustification,JustifyTop ;copy top to bottom

        mov     esi,prcldest
        mov     eax,[esi].yBottom
        mov     ulLastDestScan,eax      ;end at bottom of dest copy rect
        mov     eax,[esi].yTop
        mov     ulCurrentDestScan,eax   ;start at top of dest copy rect
        mul     ulNextScan              ;offset in bitmap of top dest rect scan
        mov     edx,[esi].xLeft
        shr     edx,3                   ;byte X address
        add     eax,edx                 ;offset in bitmap of first dest byte
        mov     ulLeftEdgeDest,eax      ;that's where the left dest edge is
        add     eax,ulLeftEdgeAdjust    ;the whole bytes start at the next
                                        ; byte, unless the left edge is a whole
                                        ; byte and is thus part of the whole
                                        ; bytes already
        add     eax,culWholeBytesWidth  ;point to the right edge
        mov     ulRightEdgeDest,eax     ;where the right dest edge starts
        dec     eax                     ;back up to the last whole byte
        mov     ulWholeBytesDest,eax    ;where the whole dest bytes start

        mov     esi,pptlsrc
        mov     eax,[esi].ptl_y
        mov     ulCurrentSrcScan,eax    ;start at top of source copy rect
        mul     ulNextScan              ;offset in bitmap of top dest rect scan
        mov     edx,[esi].ptl_x
        shr     edx,3                   ;byte X address
        add     eax,edx                 ;offset in bitmap of first source byte
        mov     ulLeftEdgeSrc,eax       ;that's where the left src edge is
        add     eax,ulLeftSrcWidthMinus1 ;the first whole byte includes the
        dec     eax                      ; last (leftmost) left edge byte, so
        add     eax,ulLeftEdgeAdjust     ; add a byte if the left edge is 2
                                         ; wide, except when the left dest byte
                                         ; is solid so the left edge is part of
                                         ; the whole bytes
        add     eax,culWholeBytesWidth  ;point to the right edge of the whole
                                        ; src bytes, accounting for the extra
                                        ; source byte needed to prime the
                                        ; rotation pipeline
        mov     ulWholeBytesSrc,eax     ;where the src whole bytes start
        mov     ulRightEdgeSrc,eax      ;that's also where the right src edge
                                        ; starts, because the whole bytes and
                                        ; the right source edge share a byte,
                                        ; and we always point to the leftmost
                                        ; byte in the right source edge

; Branch to the appropriate top-to-bottom bank enumeration loop.

        mov     eax,ulAdapterType
        jmp     TopToBottomLoopTable[eax*4]


;-----------------------------------------------------------------------;
; Set-up code for left-to-right, bottom-to-top copies.
;-----------------------------------------------------------------------;

        align   4
left_to_right_bottom_to_top:

        cld                             ;we'll copy left to right

; Byte-combining mask, in preparation for ORing and letting the VGA's rotator
; rotate, assuming the left-hand source byte is in AH and the right-hand source
; byte is in AL (true only for left-to-right whole bytes).

        mov     eax,ulCombineMask
        not     eax
        mov     ulCombineMaskWhole,eax

        mov     edi,pdsurf
        mov     eax,[edi].dsurf_lNextScan
        neg     eax
        mov     ulNextScan,eax          ;copy bottom to top
        sub     eax,culWholeBytesWidth  ;offset from end of one whole byte scan
        mov     ulWholeScanDelta,eax    ; to start of next, given that we're
                                        ; copying one way and going scan-to-
                                        ; scan the other way
        dec     eax                     ;offset from end of one src whole byte
        mov     ulWholeScanSrcDelta,eax ; scan to start of next, accounting for
                                        ; leading byte used to prime the
                                        ; rotation pipeline

        mov     esi,ulLWRType           ;3-bit flag field for left, whole, and
                                        ; right involvement in operation
        or      esi,LEFT_TO_RIGHT_FIELD_SET   ;add left-to-right into the index
        mov     eax,MasterThreadTable[esi*4]
        mov     pCurrentThread,eax      ;threading when no buffering is needed
        mov     edx,ulAdapterType
        shl     edx,ADAPTER_FIELD_SHIFT
        or      esi,edx                 ;factor adapter type into the index
        mov     eax,MasterThreadTableViaBuffer[esi*4]
        mov     pCurrentThreadViaBuffer,eax ;threading when buffering is needed

        mov     ulCurrentJustification,JustifyBottom ;copy bottom to top

        mov     esi,prcldest
        mov     edx,[esi].yTop
        mov     ulLastDestScan,edx      ;end at top of dest copy rect
        mov     eax,[esi].yBottom
        dec     eax                     ;rectangle definition is non-inclusive,
                                        ; so advance to first scan we'll copy
        sub     edx,eax                 ;-(offset from rect top to bottom)
        push    edx                     ;remember for use with source
        mov     ulCurrentDestScan,eax   ;start at bottom of dest copy rect
        mul     [edi].dsurf_lNextScan   ;offset in bitmap of bottom dest rect
                                        ; scan (first scan to which to copy)
        mov     edx,[esi].xLeft
        shr     edx,3                   ;byte X address
        add     eax,edx                 ;offset in bitmap of first dest byte
        mov     ulLeftEdgeDest,eax      ;that's where the left dest edge is
        add     eax,ulLeftEdgeAdjust    ;the whole bytes start at the next
                                        ; byte, unless the left edge is a whole
                                        ; byte and is thus part of the whole
                                        ; bytes already
        mov     ulWholeBytesDest,eax    ;where the whole dest bytes start
        add     eax,culWholeBytesWidth  ;point to the right edge
        mov     ulRightEdgeDest,eax     ;where the right dest edge starts

        mov     esi,pptlsrc
        mov     eax,[esi].ptl_y
        pop     edx                     ;retrieve -(offset from top to bottom)
        sub     eax,edx                 ;advance to bottom of source rect
                                        ; (inclusive; this is first scan from
                                        ; which to copy)
        mov     ulCurrentSrcScan,eax    ;start at bottom of source copy rect
        mul     [edi].dsurf_lNextScan   ;offset in bitmap of bottom dest rect
                                        ; scan
        mov     edx,[esi].ptl_x
        shr     edx,3                   ;byte X address
        add     eax,edx                 ;offset in bitmap of first source byte
        mov     ulLeftEdgeSrc,eax       ;that's where the left src edge is
        add     eax,ulLeftSrcWidthMinus1 ;the first whole byte includes the
        dec     eax                      ; last (leftmost) left edge byte, so
        add     eax,ulLeftEdgeAdjust     ; add a byte if the left edge is 2
                                         ; wide, except when the left dest byte
                                         ; is solid so the left edge is part of
                                         ; the whole bytes
        mov     ulWholeBytesSrc,eax     ;where the src whole bytes start
        add     eax,culWholeBytesWidth  ;point to the right edge
        mov     ulRightEdgeSrc,eax      ;where the right src edge starts,
                                        ; because the whole bytes and the right
                                        ; source edge share a byte, and we
                                        ; always point to the leftmost byte in
                                        ; the right source edge

; Branch to the appropriate bottom-to-top bank enumeration loop.

        mov     eax,ulAdapterType
        jmp     BottomToTopLoopTable[eax*4]


;-----------------------------------------------------------------------;
; Set-up code for right-to-left, bottom-to-top copies.
;-----------------------------------------------------------------------;

        align   4
right_to_left_bottom_to_top:

        std                             ;we'll copy right to left

; Byte-combining mask, in preparation for ORing and letting the VGA's rotator
; rotate, assuming the left-hand source byte is in AL and the right-hand source
; byte is in AH (always true except for left-to-right whole bytes).

        mov     eax,ulCombineMask
        mov     ulCombineMaskWhole,eax

        mov     edi,pdsurf
        mov     eax,[edi].dsurf_lNextScan
        neg     eax
        mov     ulNextScan,eax          ;copy bottom to top
        add     eax,culWholeBytesWidth  ;offset from end of one whole byte scan
        mov     ulWholeScanDelta,eax    ; to start of next
        inc     eax                     ;offset from end of one src whole byte
        mov     ulWholeScanSrcDelta,eax ; scan to start of next, accounting for
                                        ; leading byte used to prime the
                                        ; rotation pipeline

        mov     esi,ulLWRType           ;3-bit flag field for left, whole, and
                                        ; right involvement in operation
                                        ;leave left-to-right field cleared, so
                                        ; we look up right-to-left entries
        mov     eax,MasterThreadTable[esi*4]
        mov     pCurrentThread,eax      ;threading when no buffering is needed
        mov     edx,ulAdapterType
        shl     edx,ADAPTER_FIELD_SHIFT
        or      esi,edx                 ;factor adapter type into the index
        mov     eax,MasterThreadTableViaBuffer[esi*4]
        mov     pCurrentThreadViaBuffer,eax ;threading when buffering is needed

        mov     ulCurrentJustification,JustifyBottom ;copy bottom to top

        mov     esi,prcldest
        mov     edx,[esi].yTop
        mov     ulLastDestScan,edx      ;end at top of dest copy rect
        mov     eax,[esi].yBottom
        dec     eax                     ;rectangle definition is non-inclusive,
                                        ; so advance to first scan we'll copy
        sub     edx,eax                 ;-(offset from rect top to bottom)
        push    edx                     ;remember for use with source
        mov     ulCurrentDestScan,eax   ;start at bottom of dest copy rect
        mul     [edi].dsurf_lNextScan   ;offset in bitmap of bottom dest rect
                                        ; scan (first scan to which to copy)
        mov     edx,[esi].xLeft
        shr     edx,3                   ;byte X address
        add     eax,edx
        mov     ulLeftEdgeDest,eax      ;that's where the left dest edge is
        add     eax,ulLeftEdgeAdjust    ;the whole bytes start at the next
                                        ; byte, unless the left edge is a whole
                                        ; byte and is thus part of the whole
                                        ; bytes already
        add     eax,culWholeBytesWidth  ;point to the right edge
        mov     ulRightEdgeDest,eax     ;where the right dest edge starts
        dec     eax                     ;back up to the last whole byte
        mov     ulWholeBytesDest,eax    ;where the whole dest bytes start

        mov     esi,pptlsrc
        mov     eax,[esi].ptl_y
        pop     edx                     ;retrieve -(offset from top to bottom)
        sub     eax,edx                 ;advance to bottom of source rect
                                        ; (inclusive; this is first scan from
                                        ; which to copy)
        mov     ulCurrentSrcScan,eax    ;start at bottom of source copy rect
        mul     [edi].dsurf_lNextScan   ;offset in bitmap of bottom dest rect
                                        ; scan
        mov     edx,[esi].ptl_x
        shr     edx,3                   ;byte X address
        add     eax,edx                 ;offset in bitmap of first source byte
        mov     ulLeftEdgeSrc,eax       ;that's where the left src edge is
        add     eax,ulLeftSrcWidthMinus1 ;the first whole byte includes the
        dec     eax                      ; last (leftmost) left edge byte, so
        add     eax,ulLeftEdgeAdjust     ; add a byte if the left edge is 2
                                         ; wide, except when the left dest byte
                                         ; is solid so the left edge is part of
                                         ; the whole bytes
        add     eax,culWholeBytesWidth  ;point to the right edge of the whole
                                        ; src bytes, accounting for the extra
                                        ; source byte needed to prime the
                                        ; rotation pipeline
        mov     ulWholeBytesSrc,eax     ;where the src whole bytes start
        mov     ulRightEdgeSrc,eax      ;that's also where the right src edge
                                        ; starts, because the whole bytes and
                                        ; the right source edge share a byte,
                                        ; and we always point to the leftmost
                                        ; byte in the right source edge

; Branch to the appropriate bottom-to-top bank enumeration loop.

        mov     eax,ulAdapterType
        jmp     BottomToTopLoopTable[eax*4]


;***********************************************************************;
;
; The following routines are the banking loops.
;
;***********************************************************************;


;-----------------------------------------------------------------------;
; Banking for 2 R/W and unbanked adapters, top to bottom.
;-----------------------------------------------------------------------;
        align   4
top_to_bottom_2RW:

; We're going top to bottom. Map in the source and dest, top-justified.

        mov     ebx,pdsurf
        mov     edx,ulCurrentSrcScan
        cmp     edx,[ebx].dsurf_rcl2WindowClipS.yTop ;is source top less than
                                                     ; current source bank?
        jl      short top_2RW_map_init_src_bank      ;yes, map in proper bank
        cmp     edx,[ebx].dsurf_rcl2WindowClipS.yBottom ;source top greater than
                                                        ; current source bank?
        jl      short top_2RW_init_src_bank_mapped
                                                ;no, proper bank already mapped
top_2RW_map_init_src_bank:

; Map bank containing the top source scan line into source window.
; Note: EBX, ESI, and EDI preserved, according to C calling conventions.

        ptrCall   <dword ptr [ebx].dsurf_pfnBankControl2Window>, \
                <ebx,edx,JustifyTop,MapSourceBank>

top_2RW_init_src_bank_mapped:

        mov     edx,ulCurrentDestScan
        cmp     edx,[ebx].dsurf_rcl2WindowClipD.yTop ;is dest top less than
                                                     ; current dest bank?
        jl      short top_2RW_map_init_dest_bank     ;yes, map in proper bank
        cmp     edx,[ebx].dsurf_rcl2WindowClipD.yBottom ;dest top greater than
                                                        ; current dest bank?
        jl      short top_2RW_init_dest_bank_mapped
                                                ;no, proper bank already mapped
top_2RW_map_init_dest_bank:

; Map bank containing the top dest scan line into source window.
; Note: EBX, ESI, and EDI preserved, according to C calling conventions.

        ptrCall   <dword ptr [ebx].dsurf_pfnBankControl2Window>, \
                <ebx,edx,JustifyTop,MapDestBank>

top_2RW_init_dest_bank_mapped:

; Bank-by-bank top-to-bottom copy loop.

top_2RW_bank_loop:

; Decide how far we can go before we run out of bank or rectangle to copy.

        mov     edx,ulLastDestScan
        cmp     edx,[ebx].dsurf_rcl2WindowClipD.yBottom
        jl      short @F        ;copy rectangle bottom is in this bank
        mov     edx,[ebx].dsurf_rcl2WindowClipD.yBottom ;dest extends to end
                                                        ; of bank, at least
@@:
        sub     edx,ulCurrentDestScan   ;# of scans we can and want to do in
                                        ; the dest bank
        mov     eax,[ebx].dsurf_rcl2WindowClipS.yBottom
        sub     eax,ulCurrentSrcScan    ;# of scans we can do in the src bank

        cmp     edx,eax
        jb      short @F        ;source bank isn't limiting
        mov     edx,eax         ;source bank is limiting
@@:
        mov     ulBlockHeight,edx ;# of scans we'll do in this bank

; We're ready to copy this block.

        THREAD_AND_START

; Any more scans to copy?

        mov     eax,ulCurrentDestScan
        mov     esi,ulBlockHeight
        add     eax,esi                 ;we've copied to dest up to here
        cmp     ulLastDestScan,eax      ;are we at the dest rect bottom?
        jz      short top_2RW_done      ;yes, we're done
        mov     ulCurrentDestScan,eax

; Now advance either or both banks, as needed.

        mov     ebx,pdsurf
        cmp     eax,[ebx].dsurf_rcl2WindowClipD.yBottom ;dest scan greater than
                                                        ; current dest bank?
        jl      short top_2RW_dest_bank_mapped    ;no, proper bank still mapped

; Map bank containing the current dest scan line into source window.
; Note: EBX, ESI, and EDI preserved, according to C calling conventions.

        ptrCall   <dword ptr [ebx].dsurf_pfnBankControl2Window>, \
                <ebx,eax,JustifyTop,MapDestBank>

top_2RW_dest_bank_mapped:

        add     esi,ulCurrentSrcScan    ;we've copied from source up to here
        mov     ulCurrentSrcScan,esi

        cmp     esi,[ebx].dsurf_rcl2WindowClipS.yBottom ;src scan greater than
                                                        ; current src bank?
        jl      short top_2RW_src_bank_mapped     ;no, proper bank still mapped

; Map bank containing the current source scan line into source window.
; Note: EBX, ESI, and EDI preserved, according to C calling conventions.

        ptrCall   <dword ptr [ebx].dsurf_pfnBankControl2Window>, \
                <ebx,esi,JustifyTop,MapSourceBank>

top_2RW_src_bank_mapped:

        jmp     top_2RW_bank_loop

top_2RW_done:
        PLAIN_RET


;-----------------------------------------------------------------------;
; Banking for 2 R/W and unbanked adapters, bottom to top.
;-----------------------------------------------------------------------;
        align   4
bottom_to_top_2RW:

; We're going bottom to top. Map in the source and dest, bottom-justified.

        mov     ebx,pdsurf
        mov     edx,ulCurrentSrcScan
        cmp     edx,[ebx].dsurf_rcl2WindowClipS.yTop ;is source bottom less than
                                                     ; current source bank?
        jl      short bot_2RW_map_init_src_bank      ;yes, map in proper bank
        cmp     edx,[ebx].dsurf_rcl2WindowClipS.yBottom ;source bottom greater
                                                        ; than current src bank?
        jl      short bot_2RW_init_src_bank_mapped
                                                ;no, proper bank already mapped
bot_2RW_map_init_src_bank:

; Map bank containing the bottom source scan line into source window.
; Note: EBX, ESI, and EDI preserved, according to C calling conventions.

        ptrCall   <dword ptr [ebx].dsurf_pfnBankControl2Window>, \
                <ebx,edx,JustifyBottom,MapSourceBank>

bot_2RW_init_src_bank_mapped:

        mov     edx,ulCurrentDestScan
        cmp     edx,[ebx].dsurf_rcl2WindowClipD.yTop ;is dest bottom less than
                                                     ; current dest bank?
        jl      short bot_2RW_map_init_dest_bank     ;yes, map in proper bank
        cmp     edx,[ebx].dsurf_rcl2WindowClipD.yBottom ;dest bottom greater
                                                        ; than current dst bank?
        jl      short bot_2RW_init_dest_bank_mapped
                                                ;no, proper bank already mapped
bot_2RW_map_init_dest_bank:

; Map bank containing the bottom dest scan line into source window.
; Note: EBX, ESI, and EDI preserved, according to C calling conventions.

        ptrCall   <dword ptr [ebx].dsurf_pfnBankControl2Window>, \
                <ebx,edx,JustifyBottom,MapDestBank>

bot_2RW_init_dest_bank_mapped:

; Bank-by-bank bottom-to-top copy loop.

bot_2RW_bank_loop:

; Decide how far we can go before we run out of bank or rectangle to copy.

        mov     edx,ulLastDestScan
        cmp     edx,[ebx].dsurf_rcl2WindowClipD.yTop
        jg      short @F        ;copy rectangle top is in this bank
        mov     edx,[ebx].dsurf_rcl2WindowClipD.yTop ;dest extends to end
                                                     ; of bank, at least
@@:
        neg     edx
        add     edx,ulCurrentDestScan   ;# of scans we can and want to do in
        inc     edx                     ; the dest bank

        mov     eax,ulCurrentSrcScan
        sub     eax,[ebx].dsurf_rcl2WindowClipS.yTop
        inc     eax                     ;# of scans we can do in the src bank

        cmp     edx,eax
        jb      short @F        ;source bank isn't limiting
        mov     edx,eax         ;source bank is limiting
@@:
        mov     ulBlockHeight,edx ;# of scans we'll do in this bank

; We're ready to copy this block.

        THREAD_AND_START

; Any more scans to copy?

        mov     eax,ulCurrentDestScan
        mov     esi,ulBlockHeight
        sub     eax,esi                 ;we've copied to dest up to here
        cmp     ulLastDestScan,eax      ;are we past the dest rect top?
        jg      short bot_2RW_done      ;yes, we're done
        mov     ulCurrentDestScan,eax

; Now advance either or both banks, as needed.

        mov     ebx,pdsurf
        cmp     eax,[ebx].dsurf_rcl2WindowClipD.yTop ;dest scan less than
                                                     ; current dest bank?
        jge     short bot_2RW_dest_bank_mapped    ;no, proper bank still mapped

; Map bank containing the current dest scan line into source window.
; Note: EBX, ESI, and EDI preserved, according to C calling conventions.

        ptrCall   <dword ptr [ebx].dsurf_pfnBankControl2Window>, \
                <ebx,eax,JustifyBottom,MapDestBank>

bot_2RW_dest_bank_mapped:

        mov     eax,ulCurrentSrcScan
        sub     eax,esi         ;we've copied from source up to here
        mov     ulCurrentSrcScan,eax

        cmp     eax,[ebx].dsurf_rcl2WindowClipS.yTop ;src scan less than
                                                     ; current src bank?
        jge     short bot_2RW_src_bank_mapped     ;no, proper bank still mapped

; Map bank containing the current source scan line into source window.
; Note: EBX, ESI, and EDI preserved, according to C calling conventions.

        ptrCall   <dword ptr [ebx].dsurf_pfnBankControl2Window>, \
                <ebx,eax,JustifyBottom,MapSourceBank>

bot_2RW_src_bank_mapped:

        jmp     bot_2RW_bank_loop

bot_2RW_done:
        PLAIN_RET


;-----------------------------------------------------------------------;
; Banking for 1R/1W adapters, top to bottom.
;-----------------------------------------------------------------------;
        align   4
top_to_bottom_1R1W:

; We're going top to bottom. Map in the source and dest, top-justified.

        mov     ebx,pdsurf
        mov     edx,ulCurrentSrcScan
        cmp     edx,[ebx].dsurf_rcl2WindowClipS.yTop ;is source top less than
                                                     ; current source bank?
        jl      short top_1R1W_map_init_src_bank      ;yes, map in proper bank
        cmp     edx,[ebx].dsurf_rcl2WindowClipS.yBottom ;source top greater than
                                                        ; current source bank?
        jl      short top_1R1W_init_src_bank_mapped
                                                ;no, proper bank already mapped
top_1R1W_map_init_src_bank:

; Map bank containing the top source scan line into source window.
; Note: EBX, ESI, and EDI preserved, according to C calling conventions.

        ptrCall   <dword ptr [ebx].dsurf_pfnBankControl2Window>, \
                <ebx,edx,JustifyTop,MapSourceBank>

top_1R1W_init_src_bank_mapped:

        mov     edx,ulCurrentDestScan
        cmp     edx,[ebx].dsurf_rcl2WindowClipD.yTop ;is dest top less than
                                                     ; current dest bank?
        jl      short top_1R1W_map_init_dest_bank     ;yes, map in proper bank
        cmp     edx,[ebx].dsurf_rcl2WindowClipD.yBottom ;dest top greater than
                                                        ; current dest bank?
        jl      short top_1R1W_init_dest_bank_mapped
                                                ;no, proper bank already mapped
top_1R1W_map_init_dest_bank:

; Map bank containing the top dest scan line into source window.
; Note: EBX, ESI, and EDI preserved, according to C calling conventions.

        ptrCall   <dword ptr [ebx].dsurf_pfnBankControl2Window>, \
                <ebx,edx,JustifyTop,MapDestBank>

top_1R1W_init_dest_bank_mapped:

; Bank-by-bank top-to-bottom copy loop.

top_1R1W_bank_loop:

; Decide how far we can go before we run out of bank or rectangle to copy.

        mov     edx,ulLastDestScan
        cmp     edx,[ebx].dsurf_rcl2WindowClipD.yBottom
        jl      short @F        ;copy rectangle bottom is in this bank
        mov     edx,[ebx].dsurf_rcl2WindowClipD.yBottom ;dest extends to end
                                                        ; of bank, at least
@@:
        sub     edx,ulCurrentDestScan   ;# of scans we can and want to do in
                                        ; the dest bank
        mov     eax,[ebx].dsurf_rcl2WindowClipS.yBottom
        sub     eax,ulCurrentSrcScan    ;# of scans we can do in the src bank

        cmp     edx,eax
        jb      short @F        ;source bank isn't limiting
        mov     edx,eax         ;source bank is limiting
@@:
        mov     ulBlockHeight,edx ;# of scans we'll do in this bank

; We're ready to copy this block.
; Select different threading, depending on whether the source and destination
; are currently in the same bank; we can do edges faster if they are.

        mov     eax,[ebx].dsurf_ulWindowBank
        cmp     eax,[ebx].dsurf_ulWindowBank[4]
        jz      short top_1R1W_copy_same_bank

; Source and dest are currently in different banks, must go through temp buffer.

        THREAD_AND_START pCurrentThreadViaBuffer,top_1R1W_check_more_scans

; Source and dest are currently in the same bank.

        align   4
top_1R1W_copy_same_bank:
        THREAD_AND_START

; Any more scans to copy?

top_1R1W_check_more_scans:

        mov     eax,ulCurrentDestScan
        mov     esi,ulBlockHeight
        add     eax,esi                 ;we've copied to dest up to here
        cmp     ulLastDestScan,eax      ;are we at the dest rect bottom?
        jz      short top_1R1W_done     ;yes, we're done
        mov     ulCurrentDestScan,eax

; Now advance either or both banks, as needed.

        mov     ebx,pdsurf
        cmp     eax,[ebx].dsurf_rcl2WindowClipD.yBottom ;dest scan greater than
                                                        ; current dest bank?
        jl      short top_1R1W_dest_bank_mapped   ;no, proper bank still mapped

; Map bank containing the current dest scan line into source window.
; Note: EBX, ESI, and EDI preserved, according to C calling conventions.

        ptrCall   <dword ptr [ebx].dsurf_pfnBankControl2Window>, \
                <ebx,eax,JustifyTop,MapDestBank>

top_1R1W_dest_bank_mapped:

        add     esi,ulCurrentSrcScan    ;we've copied from source up to here
        mov     ulCurrentSrcScan,esi

        cmp     esi,[ebx].dsurf_rcl2WindowClipS.yBottom ;src scan greater than
                                                        ; current src bank?
        jl      short top_1R1W_src_bank_mapped     ;no, proper bank still mapped

; Map bank containing the current source scan line into source window.
; Note: EBX, ESI, and EDI preserved, according to C calling conventions.

        ptrCall   <dword ptr [ebx].dsurf_pfnBankControl2Window>, \
                <ebx,esi,JustifyTop,MapSourceBank>

top_1R1W_src_bank_mapped:

        jmp     top_1R1W_bank_loop

top_1R1W_done:
        PLAIN_RET


;-----------------------------------------------------------------------;
; Banking for 1R/1W adapters, bottom to top.
;-----------------------------------------------------------------------;
        align   4
bottom_to_top_1R1W:

; We're going bottom to top. Map in the source and dest, bottom-justified.

        mov     ebx,pdsurf
        mov     edx,ulCurrentSrcScan
        cmp     edx,[ebx].dsurf_rcl2WindowClipS.yTop ;is source bottom less than
                                                     ; current source bank?
        jl      short bot_1R1W_map_init_src_bank      ;yes, map in proper bank
        cmp     edx,[ebx].dsurf_rcl2WindowClipS.yBottom ;source bottom greater
                                                        ; than current src bank?
        jl      short bot_1R1W_init_src_bank_mapped
                                                ;no, proper bank already mapped
bot_1R1W_map_init_src_bank:

; Map bank containing the bottom source scan line into source window.
; Note: EBX, ESI, and EDI preserved, according to C calling conventions.

        ptrCall   <dword ptr [ebx].dsurf_pfnBankControl2Window>, \
                <ebx,edx,JustifyBottom,MapSourceBank>

bot_1R1W_init_src_bank_mapped:

        mov     edx,ulCurrentDestScan
        cmp     edx,[ebx].dsurf_rcl2WindowClipD.yTop ;is dest bottom less than
                                                     ; current dest bank?
        jl      short bot_1R1W_map_init_dest_bank     ;yes, map in proper bank
        cmp     edx,[ebx].dsurf_rcl2WindowClipD.yBottom ;dest bottom greater
                                                        ; than current dst bank?
        jl      short bot_1R1W_init_dest_bank_mapped
                                                ;no, proper bank already mapped
bot_1R1W_map_init_dest_bank:

; Map bank containing the bottom dest scan line into source window.
; Note: EBX, ESI, and EDI preserved, according to C calling conventions.

        ptrCall   <dword ptr [ebx].dsurf_pfnBankControl2Window>, \
                <ebx,edx,JustifyBottom,MapDestBank>

bot_1R1W_init_dest_bank_mapped:

; Bank-by-bank bottom-to-top copy loop.

bot_1R1W_bank_loop:

; Decide how far we can go before we run out of bank or rectangle to copy.

        mov     edx,ulLastDestScan
        cmp     edx,[ebx].dsurf_rcl2WindowClipD.yTop
        jg      short @F        ;copy rectangle top is in this bank
        mov     edx,[ebx].dsurf_rcl2WindowClipD.yTop ;dest extends to end
                                                     ; of bank, at least
@@:
        neg     edx
        add     edx,ulCurrentDestScan   ;# of scans we can and want to do in
        inc     edx                     ; the dest bank

        mov     eax,ulCurrentSrcScan
        sub     eax,[ebx].dsurf_rcl2WindowClipS.yTop
        inc     eax                     ;# of scans we can do in the src bank

        cmp     edx,eax
        jb      short @F        ;source bank isn't limiting
        mov     edx,eax         ;source bank is limiting
@@:
        mov     ulBlockHeight,edx ;# of scans we'll do in this bank

; We're ready to copy this block.
; Select different threading, depending on whether the source and destination
; are currently in the same bank; we can do edges faster if they are.

        mov     al,byte ptr [ebx].dsurf_ulWindowBank
        cmp     al,byte ptr [ebx].dsurf_ulWindowBank[4]
        jz      short bot_1R1W_copy_same_bank

; Source and dest are currently in different banks, must go through temp buffer.

        THREAD_AND_START pCurrentThreadViaBuffer,bot_1R1W_check_more_scans

; Source and dest are currently in the same bank.

        align   4
bot_1R1W_copy_same_bank:
        THREAD_AND_START

; Any more scans to copy?

        align   4
bot_1R1W_check_more_scans:

        mov     eax,ulCurrentDestScan
        mov     esi,ulBlockHeight
        sub     eax,esi                 ;we've copied to dest up to here
        cmp     ulLastDestScan,eax      ;are we past the dest rect top?
        jg      short bot_1R1W_done     ;yes, we're done
        mov     ulCurrentDestScan,eax

; Now advance either or both banks, as needed.

        mov     ebx,pdsurf
        cmp     eax,[ebx].dsurf_rcl2WindowClipD.yTop ;dest scan less than
                                                     ; current dest bank?
        jge     short bot_1R1W_dest_bank_mapped   ;no, proper bank still mapped

; Map bank containing the current dest scan line into source window.
; Note: EBX, ESI, and EDI preserved, according to C calling conventions.

        ptrCall   <dword ptr [ebx].dsurf_pfnBankControl2Window>, \
                <ebx,eax,JustifyBottom,MapDestBank>

bot_1R1W_dest_bank_mapped:

        mov     eax,ulCurrentSrcScan
        sub     eax,esi         ;we've copied from source up to here
        mov     ulCurrentSrcScan,eax

        cmp     eax,[ebx].dsurf_rcl2WindowClipS.yTop ;src scan less than
                                                     ; current src bank?
        jge     short bot_1R1W_src_bank_mapped    ;no, proper bank still mapped

; Map bank containing the current source scan line into source window.
; Note: EBX, ESI, and EDI preserved, according to C calling conventions.

        ptrCall   <dword ptr [ebx].dsurf_pfnBankControl2Window>, \
                <ebx,eax,JustifyBottom,MapSourceBank>

bot_1R1W_src_bank_mapped:

        jmp     bot_1R1W_bank_loop

bot_1R1W_done:
        PLAIN_RET


;-----------------------------------------------------------------------;
; Banking for 1 R/W adapters, top to bottom.
;-----------------------------------------------------------------------;
        align   4
top_to_bottom_1RW:

; We're going top to bottom. Map in the dest, top-justified.

        mov     ebx,pdsurf
        mov     esi,ulCurrentDestScan
        cmp     esi,[ebx].dsurf_rcl1WindowClip.yTop  ;is dest top less than
                                                     ; current bank?
        jl      short top_1RW_map_init_dest_bank     ;yes, map in proper bank
        cmp     esi,[ebx].dsurf_rcl1WindowClip.yBottom ;dest top greater than
                                                        ; current bank?
        jl      short top_1RW_init_dest_bank_mapped
                                                ;no, proper bank already mapped
top_1RW_map_init_dest_bank:

; Map bank containing the top dest scan line into source window.
; Note: EBX, ESI, and EDI preserved, according to C calling conventions.

        ptrCall   <dword ptr [ebx].dsurf_pfnBankControl>,<ebx,esi,JustifyTop>

top_1RW_init_dest_bank_mapped:

; Bank-by-bank top-to-bottom copy loop.

top_1RW_bank_loop:

; Decide how far we can go before we run out of bank or rectangle to copy.

        mov     edi,ulLastDestScan
        cmp     edi,[ebx].dsurf_rcl1WindowClip.yBottom
        jl      short @F        ;copy rectangle bottom is in this bank
        mov     edi,[ebx].dsurf_rcl1WindowClip.yBottom ;dest extends to end
                                                       ; of bank, at least
@@:
        sub     edi,esi   ;# of scans we can and want to do in the dest bank

; Now make sure source is mapped in. This is the condition the copying routines
; expect, and we need to figure out how far we can go in the source.

        sub     edx,edx                 ;assume source and dest are in the same
                                        ; bank
        mov     esi,ulCurrentSrcScan
        cmp     esi,[ebx].dsurf_rcl1WindowClip.yTop ;src scan less than
                                                    ; current bank?
        jl      short top_1RW_map_src_Bank          ;yes, must map in
        cmp     esi,[ebx].dsurf_rcl1WindowClip.yBottom ;src scan greater than
                                                       ; current bank?
        jl      short top_1RW_src_bank_mapped     ;no, proper bank still mapped

top_1RW_map_src_Bank:

; Map bank containing the current source scan line into source window.
; Note: EBX, ESI, and EDI preserved, according to C calling conventions.

        ptrCall   <dword ptr [ebx].dsurf_pfnBankControl>,<ebx,esi,JustifyTop>

        mov     edx,1                   ;mark that source and dest are not in
                                        ; the same bank
top_1RW_src_bank_mapped:

        mov     eax,[ebx].dsurf_rcl1WindowClip.yBottom
        sub     eax,esi         ;# of scans we can do in the src bank

        cmp     edi,eax
        jb      short @F        ;source bank isn't limiting
        mov     edi,eax         ;source bank is limiting
@@:
        mov     ulBlockHeight,edi ;# of scans we'll do in this bank

; We're ready to copy this block.
; Select different threading, depending on whether the source and destination
; are currently in the same bank; we can do edges faster if they are.

        and     edx,edx
        jz      short top_1RW_copy_same_bank

; Source and dest are currently in different banks, must go through temp buffer.

        THREAD_AND_START pCurrentThreadViaBuffer,top_1RW_check_more_scans

; Source and dest are currently in the same bank.

        align   4
top_1RW_copy_same_bank:
        THREAD_AND_START

; Any more scans to copy?

top_1RW_check_more_scans:

        mov     esi,ulCurrentDestScan
        mov     edi,ulBlockHeight
        add     esi,edi                 ;we've copied to dest up to here
        cmp     ulLastDestScan,esi      ;are we at the dest rect bottom?
        jz      short top_1RW_done      ;yes, we're done
        mov     ulCurrentDestScan,esi

; Now make sure the dest bank is mapped in.

        mov     ebx,pdsurf
        cmp     esi,[ebx].dsurf_rcl1WindowClip.yTop ;dest scan less than
                                                    ; current bank?
        jl      short top_1RW_map_dest_bank         ;yes, map in dest bank
        cmp     esi,[ebx].dsurf_rcl1WindowClip.yBottom ;dest scan greater than
                                                        ; current bank?
        jl      short top_1RW_dest_bank_mapped   ;no, proper bank mapped

top_1RW_map_dest_bank:

; Map bank containing the current dest scan line into source window.
; Note: EBX, ESI, and EDI preserved, according to C calling conventions.

        ptrCall   <dword ptr [ebx].dsurf_pfnBankControl>,<ebx,esi,JustifyTop>

top_1RW_dest_bank_mapped:

        add     ulCurrentSrcScan,edi    ;we've copied from source up to here

        jmp     top_1RW_bank_loop

top_1RW_done:
        PLAIN_RET


;-----------------------------------------------------------------------;
; Banking for 1 R/W adapters, bottom to top.
;-----------------------------------------------------------------------;
        align   4
bottom_to_top_1RW:

; We're going bottom to top. Map in the dest, bottom-justified.

        mov     ebx,pdsurf
        mov     esi,ulCurrentDestScan
        cmp     esi,[ebx].dsurf_rcl1WindowClip.yTop  ;is dest bottom less than
                                                     ; current dest bank?
        jl      short bot_1RW_map_init_dest_bank     ;yes, map in proper bank
        cmp     esi,[ebx].dsurf_rcl1WindowClip.yBottom ;dest bottom greater
                                                       ; than current dst bank?
        jl      short bot_1RW_init_dest_bank_mapped
                                                ;no, proper bank already mapped
bot_1RW_map_init_dest_bank:

; Map bank containing the bottom dest scan line into source window.
; Note: EBX, ESI, and EDI preserved, according to C calling conventions.

        ptrCall   <dword ptr [ebx].dsurf_pfnBankControl>,<ebx,esi,JustifyBottom>

bot_1RW_init_dest_bank_mapped:

; Bank-by-bank bottom-to-top copy loop.

bot_1RW_bank_loop:

; Decide how far we can go before we run out of bank or rectangle to copy.

        mov     edi,ulLastDestScan
        cmp     edi,[ebx].dsurf_rcl1WindowClip.yTop
        jg      short @F        ;copy rectangle top is in this bank
        mov     edi,[ebx].dsurf_rcl1WindowClip.yTop ;dest extends to end
                                                    ; of bank, at least
@@:
        neg     edi
        add     edi,esi                 ;# of scans we can and want to do in
        inc     edi                     ; the dest bank

; Now make sure source is mapped in. This is the condition the copying routines
; expect, and we need to figure out how far we can go in the source.

        sub     edx,edx                 ;assume source and dest are in the same
                                        ; bank
        mov     esi,ulCurrentSrcScan
        cmp     esi,[ebx].dsurf_rcl1WindowClip.yTop ;src scan less than
                                                    ; current bank?
        jl      short bot_1RW_map_src_Bank          ;yes, must map in
        cmp     esi,[ebx].dsurf_rcl1WindowClip.yBottom ;src scan greater than
                                                       ; current bank?
        jl      short bot_1RW_src_bank_mapped     ;no, proper bank still mapped

bot_1RW_map_src_Bank:

; Map bank containing the current source scan line into source window.
; Note: EBX, ESI, and EDI preserved, according to C calling conventions.

        ptrCall   <dword ptr [ebx].dsurf_pfnBankControl>,<ebx,esi,JustifyBottom>

        mov     edx,1                   ;mark that source and dest are not in
                                        ; the same bank
bot_1RW_src_bank_mapped:

        sub     esi,[ebx].dsurf_rcl1WindowClip.yTop
        inc     esi                     ;# of scans we can do in the src bank

        cmp     edi,esi
        jb      short @F        ;source bank isn't limiting
        mov     edi,esi         ;source bank is limiting
@@:
        mov     ulBlockHeight,edi ;# of scans we'll do in this bank

; We're ready to copy this block.
; Select different threading, depending on whether the source and destination
; are currently in the same bank; we can copy much faster if they are.

        and     edx,edx
        jz      short bot_1RW_copy_same_bank

; Source and dest are currently in different banks, must go through temp buffer.

        THREAD_AND_START pCurrentThreadViaBuffer,bot_1RW_check_more_scans

; Source and dest are currently in the same bank.

        align   4
bot_1RW_copy_same_bank:
        THREAD_AND_START

; Any more scans to copy?

        align   4
bot_1RW_check_more_scans:

        mov     esi,ulCurrentDestScan
        mov     edi,ulBlockHeight
        sub     esi,edi                 ;we've copied to dest up to here
        cmp     ulLastDestScan,esi      ;are we past the dest rect top?
        jg      short bot_1RW_done      ;yes, we're done
        mov     ulCurrentDestScan,esi

; Now make sure the dest bank is mapped in.

        mov     ebx,pdsurf
        cmp     esi,[ebx].dsurf_rcl1WindowClip.yTop ;dest scan less than
                                                    ; current bank?
        jl      short bot_1RW_map_dest_bank         ;yes, map in dest bank
        cmp     esi,[ebx].dsurf_rcl1WindowClip.yBottom ;dest scan greater than
                                                        ; current bank?
        jl      short bot_1RW_dest_bank_mapped   ;no, proper bank mapped

bot_1RW_map_dest_bank:

; Map bank containing the current dest scan line into source window.
; Note: EBX, ESI, and EDI preserved, according to C calling conventions.

        ptrCall   <dword ptr [ebx].dsurf_pfnBankControl>,<ebx,esi,JustifyBottom>

bot_1RW_dest_bank_mapped:

        sub     ulCurrentSrcScan,edi    ;we've copied from source up to here

        jmp     bot_1RW_bank_loop

bot_1RW_done:
        PLAIN_RET


;***********************************************************************;
;
; The following routines are the low-level copying routines. They know
; almost nothing about banks (the routines that copy through a temp
; buffer know how to switch banks after filling the temp buffer, but
; that's it). Banking should be taken care of at a higher level.
;
;***********************************************************************;

;-----------------------------------------------------------------------;
; Copies a block of solid bytes directly from the source to the
; destination, without using a temp buffer. We can't use the latches,
; though, because this is a rotated copy. Can only be used by 2 R/W or
; 1R/1W window banking, or by unbanked modes, or by 1 R/W adapters when
; the source and dest are in the same bank. 1 R/W adapters must go
; through an intermediate local buffer when the source and the destination
; aren't in the same bank.
;
; Input:
;       Direction Flag set for desired direction of copy
;       culWholeBytesWidth = # of bytes to copy across each scan line
;       ulWholeScanDelta = distance to start of next dest scan from end of
;               current
;       ulWholeScanSrcDelta = distance to start of next source scan from end of
;               current
;       ulBlockHeight = # of scans to copy
;       ulWholeBytesSrc = start source offset in bitmap
;       ulWholeBytesDest = start dest offset in bitmap
;       ulCombineMaskWhole = masking to be applied before ORing the two source
;               bytes together, to keep only the data needed in preparation
;               for the VGA rotator doing its stuff
;
; Output:
;       Advances ulWholeBytesSrc and ulWholeBytesDest to scan after last
;               scan processed
;-----------------------------------------------------------------------;

        align   4
copy_whole_bytes:

; Calculate start source and dest addresses from bitmap start addresses and
; offsets within bitmap.

        mov     ecx,pdsurf
        mov     eax,ulWholeBytesSrc
        add     eax,[ecx].dsurf_pvBitmapStart2WindowS
        mov     pSrcAddr,eax
        mov     eax,ulWholeBytesDest
        add     eax,[ecx].dsurf_pvBitmapStart2WindowD
        mov     pDestAddr,eax

; Set the bit mask to enable all bits.

        mov     edx,VGA_BASE + GRAF_ADDR
        mov     eax,(0ffh shl 8) + GRAF_BIT_MASK
        out     dx,ax

; Leave GC Index pointing to the Read Map register.

        mov     al,GRAF_READ_MAP
        out     dx,al

; Set up to copy the whole bytes from the buffer.

        mov     eax,ulBlockHeight
        mov     ulTempScanCount,eax

        mov     ebx,culWholeBytesWidth
        SET_UP_UNROLL_VARS ebx,ecx,ebx,pfnCopyWholeRWEntry, \
                                LOOP_UNROLL_SHIFT
        mov     culTempCount,ebx ;remember # of unrolled loop iterations
        mov     pTempEntry,ecx   ;ditto for entry point

copy_whole_scan_loop:

        mov     cl,MM_C3        ;start by copying plane 3 (for Map Mask)

copy_whole_plane_loop:

; Set Map Mask to enable writes to the plane we're copying.

        mov     edx,VGA_BASE + SEQ_DATA
        mov     al,cl
        out     dx,al

; Set Read Map to enable reads from the plane we're copying.

        mov     dl,GRAF_DATA
        shr     al,1                    ;map plane into ReadMask
        cmp     al,100b                 ;set Carry if not C3 (plane 3)
        adc     al,-1                   ;sub 1 only if C3
        out     dx,al

; Select the corresponding plane from the temp buffer.

        mov     esi,pSrcAddr       ;source offset in screen
        mov     edi,pDestAddr      ;point to destination start

        lodsb                   ;prime the rotation pipeline
        mov     ah,al           ;for combining with the next byte

        mov     ebx,culTempCount
        mov     edx,ulCombineMaskWhole
        jmp     pTempEntry


;-----------------------------------------------------------------------;
; Table of unrolled copy whole bytes from buffer loop entry points.
;-----------------------------------------------------------------------;

        UNROLL_LOOP_ENTRY_TABLE pfnCopyWholeRWEntry, \
                                WHOLE_RW, LOOP_UNROLL_COUNT

;-----------------------------------------------------------------------;
; Unrolled loop for copying whole bytes from the buffer.
;-----------------------------------------------------------------------;

COPY_WHOLE_RW macro ENTRY_LABEL,ENTRY_INDEX
&ENTRY_LABEL&ENTRY_INDEX&:
        lodsb                   ;get byte to copy
        mov     ch,al           ;set aside for next time
        and     eax,edx         ;mask the bytes in preparation for combining
                                ; and rotating them
        or      al,ah           ;combine them
        stosb                   ;write the composite byte
                                ; VGA rotates during write
        mov     ah,ch           ;prepare byte for combining next time
        endm    ;-----------------------------------;

;  AH = rotation pipeline-priming byte
;  EDX = mask to preserve desired portions of AH and AL before combining
;  ESI = source address to copy from
;  EDI = target address to copy to
;  Map Mask set to enable the desired plane for write
;  Bit Mask set to enable all bits

        align   4
copy_whole_loop:
        UNROLL_LOOP COPY_WHOLE_RW,WHOLE_RW,LOOP_UNROLL_COUNT

        dec     ebx
        jnz     copy_whole_loop

; Do next plane, if any.

        shr     cl,1                    ;advance to next plane
        jnz     copy_whole_plane_loop

; Remember where we left off, for next scan.

        add     edi,ulWholeScanDelta    ;point to next dest scan
        mov     pDestAddr,edi
        add     esi,ulWholeScanSrcDelta ;point to next source scan
        mov     pSrcAddr,esi

; Count down scan lines.

        dec     ulTempScanCount
        jnz     copy_whole_scan_loop

; Remember where we left off, for next time.

        mov     ecx,pdsurf
        sub     esi,[ecx].dsurf_pvBitmapStart2WindowS
        mov     ulWholeBytesSrc,esi
        sub     edi,[ecx].dsurf_pvBitmapStart2WindowD
        mov     ulWholeBytesDest,edi

        PLAIN_RET


;-----------------------------------------------------------------------;
; Copies a block of solid bytes from the source to the destination via
; the temp buffer. This should only be used by 1 R/W adapters, and then
; only when the source and dest are in different banks.
;
; All relevant bytes are first copied from the source to a temp buffer that's
; an image of the source. Then, we copy each of the four planes for one scan
; line from the temp buffer to the screen before going on to the next scan
; line. See ALIGNBLT.ASM for comments about why this is done.
;
; Input:
;       Direction Flag set for desired direction of copy
;       culWholeBytesWidth = # of bytes to copy across each scan line
;       ulWholeScanDelta = distance to start of next scan from end of current
;       ulNextScan = width of a scan line
;       ulBlockHeight = # of scans to copy
;       ulWholeBytesSrc = start source offset in bitmap
;       ulWholeBytesDest = start dest offset in bitmap
;       ppTempPlane0 = pointer to pointer to plane 0 storage in temp buffer
;       ppTempPlane3 = pointer to pointer to plane 3 storage in temp buffer
;       ulCombineMaskWhole = masking to be applied before ORing the two source
;               bytes together, to keep only the data needed in preparation
;               for the VGA rotator doing its stuff
;       Expects the source bank to be mapped in; source bank is mapped in on
;               exit
;
; Output:
;       Advances ulWholeBytesSrc and ulWholeBytesDest to scan after last
;               scan processed
;-----------------------------------------------------------------------;

        align   4
copy_whole_bytes_via_buffer:

; Calculate start source address from bitmap start address and offset within
; bitmap.

        mov     ecx,pdsurf
        mov     eax,ulWholeBytesSrc
        add     eax,[ecx].dsurf_pvBitmapStart
        mov     pSrcAddr,eax
        sub     eax,[ecx].dsurf_pvStart
        mov     ulOffsetInBank,eax ;will come in handy because we treat the
                                   ; temp buffer as an image of the current
                                   ; bank

; First, copy all the bytes into the temporary buffer.

; Leave the GC Index pointing to the Read Map.

        mov     edx,VGA_BASE + GRAF_ADDR
        mov     al,GRAF_READ_MAP
        out     dx,al

        mov     eax,3           ;start by copying plane 3
copy_whole_to_buffer_plane_loop:
        mov     ebx,ulBlockHeight  ;# of scans to copy
        mov     esi,pSrcAddr       ;source offset in screen
        mov     edi,ppTempPlane0
        mov     edi,[edi+eax*4]    ;pointer to current plane in temp buffer
        add     edi,ulOffsetInBank ;dest for plane in temp buffer

        mov     edx,VGA_BASE + GRAF_DATA
        out     dx,al            ;set Read Map to plane we're copying from.

        push    eax             ;remember plane index
        mov     eax,ulWholeScanSrcDelta ;offset to next scan
        mov     edx,culWholeBytesWidth ;# of bytes per scan
        inc     edx             ;always one more source byte than dest byte
copy_whole_to_buffer_scan_loop:
        mov     ecx,edx         ;# of bytes per scan
        rep     movsb           ;copy the scan line to the temp buffer
        add     esi,eax         ;point to next source scan
        add     edi,eax         ;point to next dest scan

        dec     ebx              ;count down scan lines
        jnz     copy_whole_to_buffer_scan_loop

        pop     eax             ;get back plane index
        dec     eax             ;count down planes
        jns     copy_whole_to_buffer_plane_loop

; Remember where we left off, for next time.

        mov     ebx,pdsurf
        sub     esi,[ebx].dsurf_pvBitmapStart
        mov     ulWholeBytesSrc,esi


; Now copy the temp buffer to the screen.

; Map in the destination bank, so we can read/write to it and let the Bit Mask
; work.

        ptrCall   <dword ptr [ebx].dsurf_pfnBankControl>, \
                <ebx,ulCurrentDestScan,ulCurrentJustification>

; Calculate dest start address (if this is a 1 R/W adapter, we had to wait
; until now to calculate this, because the dest bank wasn't mapped earlier).

        mov     eax,ulWholeBytesDest
        add     eax,[ebx].dsurf_pvBitmapStart
        mov     pDestAddr,eax

; Set the bit mask to enable all bits.

        mov     edx,VGA_BASE + GRAF_ADDR
        mov     eax,(0ffh shl 8) + GRAF_BIT_MASK
        out     dx,ax

; Set up to copy the whole bytes from the buffer.

        mov     eax,ulBlockHeight
        mov     ulTempScanCount,eax

        mov     ebx,culWholeBytesWidth
        SET_UP_UNROLL_VARS ebx,ecx,ebx,pfnCopyWholeFromBufferEntry, \
                                LOOP_UNROLL_SHIFT
        mov     culTempCount,ebx ;remember # of unrolled loop iterations
        mov     pTempEntry,ecx   ;ditto for entry point

copy_whole_from_buffer_scan_loop:

        mov     ebx,ppTempPlane3  ;point to plane 3's temp buffer offset
        mov     cl,MM_C3        ;start by copying plane 3

copy_whole_from_buffer_plane_loop:

; Set Map Mask to enable writes to the plane we're copying.

        mov     edx,VGA_BASE + SEQ_DATA
        mov     al,cl
        out     dx,al

; Select the corresponding plane from the temp buffer.

        mov     esi,[ebx]       ;point to plane start in temp buffer
        sub     ebx,4           ;point to next temp buffer plane ptr
        push    ebx             ;preserve pointer to plane pointer

        add     esi,ulOffsetInBank ;point to current scan start in temp buffer
        mov     edi,pDestAddr      ;point to destination start

        lodsb                   ;prime the rotation pipeline
        mov     ah,al           ;for combining with the next byte

        mov     ebx,culTempCount
        mov     edx,ulCombineMaskWhole
        jmp     pTempEntry


;-----------------------------------------------------------------------;
; Table of unrolled copy whole bytes from buffer loop entry points.
;-----------------------------------------------------------------------;

        UNROLL_LOOP_ENTRY_TABLE pfnCopyWholeFromBufferEntry, \
                                WHOLE_FROM_BUFFER, LOOP_UNROLL_COUNT

;-----------------------------------------------------------------------;
; Unrolled loop for copying whole bytes from the buffer.
;-----------------------------------------------------------------------;

COPY_WHOLE_FROM_BUFFER macro ENTRY_LABEL,ENTRY_INDEX
&ENTRY_LABEL&ENTRY_INDEX&:
        lodsb                   ;get byte to copy
        mov     ch,al           ;set aside for next time
        and     eax,edx         ;mask the bytes in preparation for combining
                                ; and rotating them
        or      al,ah           ;combine them
        stosb                   ;write the composite byte
                                ; VGA rotates during write
        mov     ah,ch           ;prepare byte for combining next time
        endm    ;-----------------------------------;

;  AH = rotation pipeline-priming byte
;  EDX = mask to preserve desired portions of AH and AL before combining
;  ESI = source address to copy from
;  EDI = target address to copy to
;  Map Mask set to enable the desired plane for write
;  Bit Mask set to enable all bits

        align   4
copy_whole_from_buffer_loop:
        UNROLL_LOOP COPY_WHOLE_FROM_BUFFER,WHOLE_FROM_BUFFER,LOOP_UNROLL_COUNT

        dec     ebx
        jnz     copy_whole_from_buffer_loop

; Do next plane, if any.

        pop     ebx             ;retrieve pointer to plane pointer
        shr     cl,1            ;advance to next plane
        jnz     copy_whole_from_buffer_plane_loop

; Remember where we left off, for next scan.

        add     edi,ulWholeScanDelta    ;point to next dest scan
        mov     pDestAddr,edi
        mov     eax,ulNextScan
        add     ulOffsetInBank,eax      ;next scan's start in temp buffer,
                                        ; relative to start of plane's storage

; Count down scan lines.

        dec     ulTempScanCount
        jnz     copy_whole_from_buffer_scan_loop

; Remember where we left off, for next time.

        mov     ebx,pdsurf
        sub     edi,[ebx].dsurf_pvBitmapStart
        mov     ulWholeBytesDest,edi

; Put back the original source bank.

        ptrCall   <dword ptr [ebx].dsurf_pfnBankControl>, \
                <ebx,ulCurrentSrcScan,ulCurrentJustification>

        PLAIN_RET


;-----------------------------------------------------------------------;
; Copies a strip of left edge bytes from the source to the destination,
; assuming both the source and the destination are both readable and
; writable. Can only be used by 2 R/W window banking, or by unbanked
; modes. 1 R/W and 1R/1W adapters must go through an intermediate local
; buffer when the source and dest are in different banks. Processes up to
; EDGE_CHUNK_SIZE bytes in each plane at a pop; more bytes might cause
; flicker.
;
; Input:
;       ulNextScan = width of scan, in bytes
;       ulBlockHeight = # of scans to copy
;       ulLeftEdgeSrc = start source offset in bitmap
;       ulLeftEdgeDest = start dest offset in bitmap
;       ulLeftSrcWidthMinus1 = width of left source edge minus 1 (0 or 1)
;       jLeftMask = left edge clip mask
;
; Output:
;       Advances ulLeftEdgeSrc and ulLeftEdgeDest to scan after last
;               scan processed
;-----------------------------------------------------------------------;

        align   4
copy_left_edge:

; Calculate start source and dest addresses from bitmap start addresses and
; offsets within bitmap.

        mov     ecx,pdsurf
        mov     esi,ulLeftEdgeSrc
        add     esi,[ecx].dsurf_pvBitmapStart2WindowS
        mov     edi,ulLeftEdgeDest
        add     edi,[ecx].dsurf_pvBitmapStart2WindowD

; Copy the edge.

        mov     ah,byte ptr jLeftMask   ;clip mask for this edge
        mov     ebx,ulLeftSrcWidthMinus1
        call    copy_edge_table[ebx*4]

; Remember where we left off, for next time.

        mov     ecx,pdsurf
        sub     esi,[ecx].dsurf_pvBitmapStart2WindowS
        mov     ulLeftEdgeSrc,esi
        sub     edi,[ecx].dsurf_pvBitmapStart2WindowD
        mov     ulLeftEdgeDest,edi

        PLAIN_RET


;-----------------------------------------------------------------------;
; Copies a strip of right edge bytes from the source to the destination,
; assuming both the source and the destination are both readable and
; writable. Can only be used by 2 R/W window banking, or by unbanked
; modes. 1 R/W and 1R/1W adapters must go through an intermediate local
; buffer when the source and dest are in different banks. Processes up to
; EDGE_CHUNK_SIZE bytes in each plane at a pop; more bytes might cause
; flicker.
;
; Input:
;       ulNextScan = width of scan, in bytes
;       ulBlockHeight = # of scans to copy
;       ulRightEdgeSrc = start source offset in bitmap
;       ulRightEdgeDest = start dest offset in bitmap
;       ulRightSrcWidthMinus1 = width of right source edge minus 1 (0 or 1)
;       jRightMask = right edge clip mask
;
; Output:
;       Advances ulRightEdgeSrc and ulRightEdgeDest to scan after last
;               scan processed
;-----------------------------------------------------------------------;

        align   4
copy_right_edge:

; Calculate start source and dest addresses from bitmap start addresses and
; offsets within bitmap.

        mov     ecx,pdsurf
        mov     esi,ulRightEdgeSrc
        add     esi,[ecx].dsurf_pvBitmapStart2WindowS
        mov     edi,ulRightEdgeDest
        add     edi,[ecx].dsurf_pvBitmapStart2WindowD

; Copy the edge.

        mov     ah,byte ptr jRightMask  ;clip mask for this edge
        mov     ebx,ulRightSrcWidthMinus1
        call    copy_edge_table[ebx*4]

; Remember where we left off, for next time

        mov     ecx,pdsurf
        sub     esi,[ecx].dsurf_pvBitmapStart2WindowS
        mov     ulRightEdgeSrc,esi
        sub     edi,[ecx].dsurf_pvBitmapStart2WindowD
        mov     ulRightEdgeDest,edi

        PLAIN_RET


;-----------------------------------------------------------------------;
; Copies an edge from a 1-wide source to the destination on the screen.
; Entry:
;       AH = bit mask setting for edge
;       ESI = source address
;       EDI = destination address
;       ulBlockHeight = # of bytes to copy per plane
;       ulNextScan = scan width
;       Source readable, and destination readable and writable
; Exit:
;       ESI = next source address
;       EDI = next destination address
;
; Preserved: EBP
;-----------------------------------------------------------------------;

        align   4
copy_edge_1ws:
        mov     pSrcAddr,esi
        mov     pDestAddr,edi

; Set the clip mask for this edge.

        mov     edx,VGA_BASE + GRAF_ADDR
        mov     al,GRAF_BIT_MASK
        out     dx,ax

; Leave the GC Index pointing to the Read Map.

        mov     al,GRAF_READ_MAP
        out     dx,al

        mov     ecx,offset copy_edge_rw_1ws_full_chunk
                                ;entry point into unrolled loop to copy first
                                ; chunk, assuming it's a full chunk
        mov     ebx,ulBlockHeight

; Copy the edge in a series of chunks.

copy_edge_chunk_loop_1ws:

        sub     ebx,EDGE_CHUNK_SIZE ;scans remaining after this chunk, assuming
                                    ; a full chunk
        jge     short @F            ;do a full chunk
        add     ebx,EDGE_CHUNK_SIZE ;not a full chunk; process all remaining
                                    ; scans
        mov     ecx,pfnCopyEdgeRWEntry_1ws[-4][ebx*4]
                                ;entry point into unrolled loop to copy desired
                                ; chunk size
        sub     ebx,ebx         ;no scans after this
@@:
        push    ebx             ;remember remaining scan count

        mov     ah,MM_C3        ;start by copying plane 3
        mov     ebx,ulNextScan

copy_edge_plane_loop_1ws:

; Set Map Mask to enable writes to plane we're copying.

        mov     al,ah
        mov     dl,SEQ_DATA
        out     dx,al

; Set Read Map to same plane.

        shr     al,1                    ;map plane into ReadMask
        cmp     al,100b                 ;set Carry if not C3 (plane 3)
        adc     al,-1                   ;sub 1 only if C3
        mov     dl,GRAF_DATA
        out     dx,al

        mov     esi,pSrcAddr
        mov     edi,pDestAddr

        jmp     ecx                     ;copy the left edge


;-----------------------------------------------------------------------;
; Table of unrolled edge loop entry points. First entry point is to copy
; 1 byte, last entry point is to copy EDGE_CHUNK_SIZE bytes.
;-----------------------------------------------------------------------;

pfnCopyEdgeRWEntry_1ws label dword
INDEX = 1
        rept    EDGE_CHUNK_SIZE
        DEFINE_DD       EDGE_RW_1WS,%INDEX
INDEX = INDEX+1
        endm


;-----------------------------------------------------------------------;
; Unrolled loop for copying a strip of edge bytes, with 1-wide source and
; destination both readable and writable.
;-----------------------------------------------------------------------;

COPY_EDGE_RW_1WS macro ENTRY_LABEL,ENTRY_INDEX
&ENTRY_LABEL&ENTRY_INDEX&:
        mov     al,[esi]        ;get byte to copy
        add     esi,ebx         ;point to next source scan
        xchg    [edi],al        ;read before write so Bit Mask can operate
                                ; VGA rotates during write
        add     edi,ebx         ;point to next dest scan
        endm    ;-----------------------------------;

;  EBX = scan line width
;  ESI = source address to copy from
;  EDI = target address to copy to
;  Bit Mask set to desired clipping
;  Read Map and Map Mask set to enable the desired plane for read and write

        align   4
copy_edge_rw_1ws_full_chunk:
        UNROLL_LOOP COPY_EDGE_RW_1WS,EDGE_RW_1WS,EDGE_CHUNK_SIZE

; Do next plane within this chunk, if any.

        shr     ah,1                    ;advance to next plane
        jnz     copy_edge_plane_loop_1ws

; Remember where we left off, for the next chunk.

        mov     pSrcAddr,esi
        mov     pDestAddr,edi

; Do next chunk within this bank block, if any.

        pop     ebx                     ;retrieve remaining scan count
        and     ebx,ebx                 ;any scans left?
        jnz     copy_edge_chunk_loop_1ws ;more scans to do

        PLAIN_RET


;-----------------------------------------------------------------------;
; Copies a strip of left edge bytes from the source to the destination
; through an intermediate RAM buffer. This is the approach required by
; 1 R/W and 1R/1W adapters when the source and dest are in different banks.
; Writes up to EDGE_CHUNK_SIZE bytes in each plane at a pop; more bytes might
; cause flicker.
;
; Input:
;       ulNextScan = width of scan, in bytes
;       ulBlockHeight = # of scans to copy
;       ulLeftEdgeSrc = start source offset in bitmap
;       ulLeftEdgeDest = start dest offset in bitmap
;       jLeftMask = left edge clip mask
;       pTempPlane = pointer to temp storage buffer
;       ulCurrentSrcScan = scan used to map in source bank
;       ulCurrentDestScan = scan used to map in dest bank
;       ulCurrentJustification = justification used to map in current bank
;       ulLeftSrcWidthMinus1 = width of left source edge minus 1 (0 or 1)
;       For 1 R/W adapters, expects the source bank to be mapped in; banking
;               is the same at exit as it was at entry
;
; Output:
;       Advances ulLeftEdgeSrc and ulLeftEdgeDest to scan after last
;               scan processed
;
; Note that this should never be called for an unbanked or 2 R/W adapter,
; because the source and dest are always both addressable simultaneously then.
;-----------------------------------------------------------------------;

        align   4
copy_left_edge_via_buffer:

; First, copy all the bytes into the temporary buffer.

; Calculate start source and dest addresses from bitmap start addresses and
; offsets within bitmap.

        mov     ecx,pdsurf
        mov     esi,ulLeftEdgeSrc
        add     esi,[ecx].dsurf_pvBitmapStart2WindowS

; Copy the edge from the source to the temp buffer.

        mov     eax,ulLeftSrcWidthMinus1
        call    copy_edge_from_screen_to_buffer[eax*4]

; Remember where we left off, for next time

        mov     ebx,pdsurf
        sub     esi,[ebx].dsurf_pvBitmapStart2WindowS
        mov     ulLeftEdgeSrc,esi

; Now copy the temp buffer to the screen.

; Map in the source bank to match the destination, so we can read/write to it
; and let the Bit Mask work. Note that on a 1 R/W adapter, both banks will be
; mapped by this call, which is fine.

        ptrCall   <dword ptr [ebx].dsurf_pfnBankControl2Window>, \
                <ebx,ulCurrentDestScan,ulCurrentJustification,MapSourceBank>

; Calculate dest start address (if this is a 1 R/W adapter, we had to wait
; until now to calculate this, because the dest bank wasn't mapped earlier).

        mov     edi,ulLeftEdgeDest
        add     edi,[ebx].dsurf_pvBitmapStart2WindowD

; Do the copy.

        mov     ah,byte ptr jLeftMask           ;clip mask for this edge
        mov     ebx,ulLeftSrcWidthMinus1
        call    copy_edge_from_buffer_to_screen[ebx*4]

; Remember where we left off, for next time.

        mov     ebx,pdsurf
        sub     edi,[ebx].dsurf_pvBitmapStart2WindowD
        mov     ulLeftEdgeDest,edi

; Put back the original source bank.  Note that on a 1 R/W adapter, both banks
; will be mapped by this call, which is fine.

        ptrCall   <dword ptr [ebx].dsurf_pfnBankControl2Window>, \
                <ebx,ulCurrentSrcScan,ulCurrentJustification,MapSourceBank>

        PLAIN_RET


;-----------------------------------------------------------------------;
; Copies a strip of right edge bytes from the source to the destination
; through an intermediate RAM buffer. This is the approach required by
; 1 R/W and 1R/1W adapters when the source and dest are in different banks.
; Writes up to EDGE_CHUNK_SIZE bytes in each plane at a pop; more bytes might
; cause flicker.
;
; Input:
;       ulNextScan = width of scan, in bytes
;       ulBlockHeight = # of scans to copy
;       ulRightEdgeSrc = start source offset in bitmap
;       ulRightEdgeDest = start dest offset in bitmap
;       jRightMask = right edge clip mask
;       pTempPlane = pointer to temp storage buffer
;       ulCurrentSrcScan = scan used to map in source bank
;       ulCurrentDestScan = scan used to map in dest bank
;       ulCurrentJustification = justification used to map in current bank
;       ulRightSrcWidthMinus1 = width of right source edge minus 1 (0 or 1)
;       For 1 R/W adapters, expects the source bank to be mapped in; banking
;               is the same at exit as it was at entry
;
; Output:
;       Advances ulRightEdgeSrc and ulRightEdgeDest to scan after last
;               scan processed
;
; Note that this should never be called for an unbanked or 2 R/W adapter,
; because the source and dest are always both addressable simultaneously then.
;-----------------------------------------------------------------------;

        align   4
copy_right_edge_via_buffer:

; First, copy all the bytes into the temporary buffer.

; Calculate start source address from bitmap start addresses and
; offsets within bitmap.

        mov     ecx,pdsurf
        mov     esi,ulRightEdgeSrc
        add     esi,[ecx].dsurf_pvBitmapStart2WindowS

; Copy the edge from the source to the temp buffer.

        mov     eax,ulRightSrcWidthMinus1
        call    copy_edge_from_screen_to_buffer[eax*4]

; Remember where we left off, for next time

        mov     ebx,pdsurf
        sub     esi,[ebx].dsurf_pvBitmapStart2WindowS
        mov     ulRightEdgeSrc,esi

; Now copy the temp buffer to the screen.

; Map in the source bank to match the destination, so we can read/write to it
; and let the Bit Mask work. Note that on a 1 R/W adapter, both banks will be
; mapped by this call, which is correct.

        ptrCall   <dword ptr [ebx].dsurf_pfnBankControl2Window>, \
                <ebx,ulCurrentDestScan,ulCurrentJustification,MapSourceBank>

; Calculate dest start address (if this is a 1 R/W adapter, we had to wait
; until now to calculate this, because the dest bank wasn't mapped earlier).

        mov     edi,ulRightEdgeDest
        add     edi,[ebx].dsurf_pvBitmapStart2WindowD

; Do the copy.

        mov     ah,byte ptr jRightMask          ;clip mask for this edge
        mov     ebx,ulRightSrcWidthMinus1
        call    copy_edge_from_buffer_to_screen[ebx*4]

; Remember where we left off, for next time.

        mov     ebx,pdsurf
        sub     edi,[ebx].dsurf_pvBitmapStart2WindowD
        mov     ulRightEdgeDest,edi

; Put back the original source bank.  Note that on a 1 R/W adapter, both banks
; will be mapped by this call, which is fine.

        ptrCall   <dword ptr [ebx].dsurf_pfnBankControl2Window>, \
                <ebx,ulCurrentSrcScan,ulCurrentJustification,MapSourceBank>

        PLAIN_RET


;-----------------------------------------------------------------------;
; Copies an edge from the temp buffer (1 wide) to the screen.
; Entry:
;       AH = bit mask setting for edge
;       DH = VGA_BASE SHR 8
;       EDI = destination address
;       pTempPlane = temp buffer from which to copy
;       ulBlockHeight = # of bytes to copy per plane
;       ulNextScan = scan width
;       Source and dest banks both pointing to destination
; Exit:
;       EDI = next destination address
;
; Preserved: EBP
;-----------------------------------------------------------------------;

        align   4
copy_buffered_edge_to_screen_1ws:

        mov     pDestAddr,edi

        mov     dl,GRAF_ADDR
        mov     al,GRAF_BIT_MASK
        out     dx,ax

        mov     pTempEntry,offset copy_edge_from_buf_full_chunk_1ws
                                ;entry point into unrolled loop to copy first
                                ; chunk, assuming it's a full chunk
        mov     ecx,pTempPlane  ;temp buffer start (copy from here)
        mov     ebx,ulBlockHeight ;total # of scans to copy

; Copy the edge in a series of chunks, to avoid flicker.

copy_from_buffer_chunk_loop_1ws:

        sub     ebx,EDGE_CHUNK_SIZE ;scans remaining after this chunk, assuming
                                    ; a full chunk
        jge     short @F            ;do a full chunk
        add     ebx,EDGE_CHUNK_SIZE ;not a full chunk; process all remaining
                                    ; scans
        mov     ebx,pfnCopyEdgesFromBufferEntry_1ws[-4][ebx*4]
        mov     pTempEntry,ebx  ;entry point into unrolled loop to copy desired
                                ; chunk size
        sub     ebx,ebx         ;no scans after this
@@:
        push    ebx             ;remember remaining scan count

        mov     al,MM_C3        ;start by copying plane 3
        mov     ebx,ulNextScan

        push    ecx             ;remember current temp buffer start

        mov     dl,SEQ_DATA     ;leave DX pointing to the Sequencer Data reg

copy_from_buffer_plane_loop_1ws:

; Set Map Mask to enable writes to plane we're copying.

        out     dx,al

        mov     esi,ecx                 ;point to current plane's source byte
        add     ecx,ulBlockHeight       ;point to next plane's source byte

        mov     edi,pDestAddr

        jmp     pTempEntry              ;copy the left edge


;-----------------------------------------------------------------------;
; Table of unrolled edge copy-from-buffer loop entry points. First entry
; point is to copy 1 byte, last entry point is to copy EDGE_CHUNK_SIZE
; bytes.
;-----------------------------------------------------------------------;

pfnCopyEdgesFromBufferEntry_1ws label dword
INDEX = 1
        rept    EDGE_CHUNK_SIZE
        DEFINE_DD       EDGE_FROM_BUFFER_1WS,%INDEX
INDEX = INDEX+1
        endm


;-----------------------------------------------------------------------;
; Unrolled loop for copying a strip of edge bytes (1 wide) from the temp
; buffer.
;-----------------------------------------------------------------------;

COPY_EDGE_FROM_BUFFER_1WS macro ENTRY_LABEL,ENTRY_INDEX
&ENTRY_LABEL&ENTRY_INDEX&:
        mov     ah,[esi]        ;get byte to copy
        inc     esi             ;point to next source (temp buffer) byte
        xchg    [edi],ah        ;read before write so Bit Mask can operate
                                ; VGA rotates during write
        add     edi,ebx         ;point to next dest (screen) scan
        endm    ;-----------------------------------;

;  EBX = scan line width
;  ESI = source address to copy from (temp buffer)
;  EDI = target address to copy to (screen)
;  Bit Mask set to desired clipping
;  Map Mask set to enable the desired plane for write

        align   4
copy_edge_from_buf_full_chunk_1ws:
        UNROLL_LOOP     COPY_EDGE_FROM_BUFFER_1WS, \
                        EDGE_FROM_BUFFER_1WS,EDGE_CHUNK_SIZE

; Do next plane within this chunk, if any.

        shr     al,1                    ;advance to next plane
        jnz     copy_from_buffer_plane_loop_1ws

; Remember where we left off, for next chunk.

        mov     pDestAddr,edi
        pop     ecx                     ;get back current temp buffer start
        add     ecx,EDGE_CHUNK_SIZE     ;point to next chunk's start

; Do next chunk within this bank block, if any.

        pop     ebx                     ;retrieve remaining scan count
        and     ebx,ebx                 ;any scans left?
        jnz     copy_from_buffer_chunk_loop_1ws ;more scans to do

        PLAIN_RET


;-----------------------------------------------------------------------;
; Copies an edge from the screen (1 wide) to the temp buffer.
; Entry:
;       ESI = source address
;       pTempPlane = temp buffer from which to copy
;       ulBlockHeight = # of bytes to copy per plane
;       ulNextScan = scan width
;       Source bank pointing to source
; Exit:
;       DH = VGA_BASE SHR 8
;       ESI = next source address
;
; Preserved: EBP
;-----------------------------------------------------------------------;

        align   4
copy_screen_to_buffered_edge_1ws:

        mov     pSrcAddr,esi

; Leave the GC Index pointing to the Read Map.

        mov     edx,VGA_BASE + GRAF_ADDR
        mov     al,GRAF_READ_MAP
        out     dx,al

        mov     ebx,ulBlockHeight
        SET_UP_UNROLL_VARS ebx,ecx,ebx,pfnCopyEdgeToTempEntry_1ws, \
                                LOOP_UNROLL_SHIFT
        mov     culTempCount,ebx ;remember # of unrolled loop iterations
        mov     pTempEntry,ecx   ;ditto for entry point

        mov     ecx,ulNextScan
        mov     edi,pTempPlane  ;dest offset in temp buffer for plane 3 bytes.
                                ;The rest of the planes are stored
                                ; consecutively
        mov     al,3            ;start by copying plane 3
        mov     dl,GRAF_DATA    ;leave DX pointing to the GC Data reg
copy_edge_to_buffer_plane_loop_1ws:
        mov     esi,pSrcAddr ;source pointer

        out     dx,al            ;set Read Map to plane we're copying from.

        mov     ebx,culTempCount ;# of unrolled loop iterations
        jmp     pTempEntry       ;copy the edge bytes for this plane to the
                                 ; temp buffer

;-----------------------------------------------------------------------;
; Table of unrolled edge copy to temp buffer loop entry points.
;-----------------------------------------------------------------------;

        UNROLL_LOOP_ENTRY_TABLE pfnCopyEdgeToTempEntry_1WS, \
                                EDGE_TO_TEMP_1WS, LOOP_UNROLL_COUNT

;-----------------------------------------------------------------------;
; Unrolled loop for copying edge bytes to the temp buffer.
;-----------------------------------------------------------------------;

COPY_EDGE_TO_TEMP_1WS macro ENTRY_LABEL,ENTRY_INDEX
&ENTRY_LABEL&ENTRY_INDEX&:
        mov     ah,[esi]        ;get byte to copy
        add     esi,ecx         ;point to next source scan
        mov     [edi],ah        ;copy byte to temp buffer
        inc     edi             ;point to next temp buffer byte
        endm    ;-----------------------------------;

;  EBX = count of unrolled loop iterations
;  ECX = offset from end of one scan's fill to start of next
;  ESI = source address to copy from (screen)
;  EDI = target address to copy to (temp buffer)
;  Read Map set to enable the desired plane for read

        align   4
edge_to_buffer_loop_1ws:
        UNROLL_LOOP     COPY_EDGE_TO_TEMP_1WS,EDGE_TO_TEMP_1WS, \
                        LOOP_UNROLL_COUNT
        dec     ebx
        jnz     edge_to_buffer_loop_1ws

        dec     al              ;count down planes
        jns     copy_edge_to_buffer_plane_loop_1ws

        PLAIN_RET


;-----------------------------------------------------------------------;
; Copies an edge from a 2-wide source to the destination on the screen.
; Entry:
;       AH = bit mask setting for edge
;       ESI = source address
;       EDI = destination address
;       ulBlockHeight = # of bytes to copy per plane
;       ulNextScan = scan width
;       ulCombineMask = masking to be applied before ORing the two source
;               bytes together, to keep only the data needed in preparation
;               for the VGA rotator doing its stuff
;       Source readable, and destination readable and writable
; Exit:
;       ESI = next source address
;       EDI = next destination address
;
; Preserved: EBP
;-----------------------------------------------------------------------;

        align   4
copy_edge_2ws:
        mov     pSrcAddr,esi
        mov     pDestAddr,edi

; Set the clip mask for this edge.

        mov     edx,VGA_BASE + GRAF_ADDR
        mov     al,GRAF_BIT_MASK
        out     dx,ax

; Leave the GC Index pointing to the Read Map.

        mov     al,GRAF_READ_MAP
        out     dx,al

        mov     ebx,ulBlockHeight

        mov     ecx,offset copy_edge_rw_2ws_full_chunk
                                ;entry point into unrolled loop assuming we do
                                ; a full chunk the first time

; Copy the edge in a series of chunks.

copy_edge_chunk_loop_2ws:

        sub     ebx,EDGE_CHUNK_SIZE ;scans remaining after this chunk, assuming
                                    ; a full chunk
        jge     short @F            ;do a full chunk
        add     ebx,EDGE_CHUNK_SIZE ;not a full chunk; process all remaining
                                    ; scans
        mov     ecx,pfnCopyEdgeRWEntry_2ws[-4][ebx*4]
                                ;entry point into unrolled loop to copy desired
                                ; chunk size
        sub     ebx,ebx         ;no scans after this
@@:
        push    ebx             ;remember remaining scan count

        mov     eax,(MM_C3 SHL 8) + 3 ;start by copying plane 3
        mov     ebx,ulNextScan

copy_edge_plane_loop_2ws:

        push    eax                     ;preserve plane info

; Set Read Map to enable reads from plane we're copying from.

        mov     edx,VGA_BASE + GRAF_DATA
        out     dx,al

; Set Map Mask to enable writes to plane we're copying.

        mov     dl,SEQ_DATA
        mov     al,ah
        out     dx,al

        mov     esi,pSrcAddr
        mov     edi,pDestAddr
        mov     edx,ulCombineMask

        jmp     ecx                     ;copy the left edge


;-----------------------------------------------------------------------;
; Table of unrolled edge loop entry points. First entry point is to copy
; 1 byte, last entry point is to copy EDGE_CHUNK_SIZE bytes.
;-----------------------------------------------------------------------;

pfnCopyEdgeRWEntry_2ws label dword
INDEX = 1
        rept    EDGE_CHUNK_SIZE
        DEFINE_DD       EDGE_RW_2WS,%INDEX
INDEX = INDEX+1
        endm


;-----------------------------------------------------------------------;
; Unrolled loop for copying a strip of edge bytes, with 2-wide source and
; destination both readable and writable.
;-----------------------------------------------------------------------;

COPY_EDGE_RW_2WS macro ENTRY_LABEL,ENTRY_INDEX
&ENTRY_LABEL&ENTRY_INDEX&:
        mov     ax,[esi]        ;get word to copy
        add     esi,ebx         ;point to next source scan
        and     eax,edx         ;mask in preparation for combining bytes
        or      al,ah           ;combine the desired parts of the bytes
        xchg    [edi],al        ;read before write so Bit Mask can operate
                                ; VGA rotates during write
        add     edi,ebx         ;point to next dest scan
        endm    ;-----------------------------------;

;  EBX = scan line width
;  EDX = mask to preserve desired portions of AH and AL before combining
;  ESI = source address to copy from
;  EDI = target address to copy to
;  Bit Mask set to desired clipping
;  Read Map and Map Mask set to enable the desired plane for read and write

        align   4
copy_edge_rw_2ws_full_chunk:
        UNROLL_LOOP COPY_EDGE_RW_2WS,EDGE_RW_2WS,EDGE_CHUNK_SIZE

; Do next plane within this chunk, if any.

        pop     eax                     ;retrieve plane info

        shr     ah,1                    ;advance to next plane
        dec     eax                     ;count down planes
        jns     copy_edge_plane_loop_2ws

; Remember where we left off, for the next chunk.

        mov     pSrcAddr,esi
        mov     pDestAddr,edi

; Do next chunk within this bank block, if any.

        pop     ebx                     ;retrieve remaining scan count
        and     ebx,ebx                 ;any scans left?
        jnz     copy_edge_chunk_loop_2ws ;more scans to do

        PLAIN_RET


;-----------------------------------------------------------------------;
; Copies an edge from the temp buffer (2 wide) to the screen.
; Entry:
;       AH = bit mask setting for edge
;       EDI = destination address
;       pTempPlane = temp buffer from which to copy
;       ulBlockHeight = # of bytes to copy per plane
;       ulNextScan = scan width
;       Source and dest banks both pointing to destination
;       ulCombineMask = masking to be applied before ORing the two source
;               bytes together, to keep only the data needed in preparation
;               for the VGA rotator doing its stuff
; Exit:
;       EDI = next destination address
;
; Preserved: EBP
;-----------------------------------------------------------------------;

        align   4
copy_buffered_edge_to_screen_2ws:

        mov     pDestAddr,edi

        mov     edx,VGA_BASE + GRAF_ADDR
        mov     al,GRAF_BIT_MASK
        out     dx,ax

        mov     pTempEntry,offset copy_edge_from_buf_full_chunk_2ws
                                ;entry point into unrolled loop, assuming the
                                ; first chunk is full size
        mov     ecx,pTempPlane  ;temp buffer start (copy from here)
        mov     ebx,ulBlockHeight

; Copy the edge in a series of chunks, to avoid flicker.

copy_from_buffer_chunk_loop_2ws:

        sub     ebx,EDGE_CHUNK_SIZE ;scans remaining after this chunk, assuming
                                    ; a full chunk
        jge     short @F            ;do a full chunk
        add     ebx,EDGE_CHUNK_SIZE ;not a full chunk; process all remaining
                                    ; scans
        mov     ebx,pfnCopyEdgesFromBufferEntry_2ws[-4][ebx*4]
        mov     pTempEntry,ebx  ;entry point into unrolled loop to copy final
                                ; chunk size
        sub     ebx,ebx         ;no scans after this
@@:
        push    ebx             ;remember remaining scan count

        mov     al,MM_C3        ;start by copying plane 3
        mov     ebx,ulNextScan

        push    ecx             ;remember current temp buffer start

copy_from_buffer_plane_loop_2ws:

; Set Map Mask to enable writes to plane we're copying.

        mov     edx,VGA_BASE + SEQ_DATA
        out     dx,al

        push    eax                     ;preserve plane info

        mov     esi,ecx                 ;point to current plane's source word
        mov     eax,ulBlockHeight
        lea     ecx,[ecx+eax*2]         ;point to next plane's source word

        mov     edi,pDestAddr
        mov     edx,ulCombineMask

        jmp     pTempEntry              ;copy the left edge


;-----------------------------------------------------------------------;
; Table of unrolled edge copy-from-buffer loop entry points. First entry
; point is to copy 1 byte, last entry point is to copy EDGE_CHUNK_SIZE
; bytes.
;-----------------------------------------------------------------------;

pfnCopyEdgesFromBufferEntry_2WS label dword
INDEX = 1
        rept    EDGE_CHUNK_SIZE
        DEFINE_DD       EDGE_FROM_BUFFER_2WS,%INDEX
INDEX = INDEX+1
        endm


;-----------------------------------------------------------------------;
; Unrolled loop for copying a strip of edge bytes (1 wide) from the temp
; buffer.
;-----------------------------------------------------------------------;

COPY_EDGE_FROM_BUFFER_2WS macro ENTRY_LABEL,ENTRY_INDEX
&ENTRY_LABEL&ENTRY_INDEX&:
        mov     ax,[esi]        ;get word to copy
        add     esi,2           ;point to next source (temp buffer) word
        and     eax,edx         ;mask in preparation for combining bytes
        or      al,ah           ;combine the desired parts of the bytes
        xchg    [edi],al        ;read before write so Bit Mask can operate
                                ; VGA rotates during write
        add     edi,ebx         ;point to next dest (screen) scan
        endm    ;-----------------------------------;

;  EBX = scan line width
;  EDX = mask to preserve desired portions of AH and AL before combining
;  ESI = source address to copy from (temp buffer)
;  EDI = target address to copy to (screen)
;  Bit Mask set to desired clipping
;  Map Mask set to enable the desired plane for write

        align   4
copy_edge_from_buf_full_chunk_2ws:
        UNROLL_LOOP     COPY_EDGE_FROM_BUFFER_2WS, \
                        EDGE_FROM_BUFFER_2WS,EDGE_CHUNK_SIZE

; Do next plane within this chunk, if any.

        pop     eax                     ;retrieve plane info
        shr     al,1                    ;advance to next plane
        jnz     copy_from_buffer_plane_loop_2ws

; Remember where we left off, for next chunk.

        mov     pDestAddr,edi
        pop     ecx                     ;get back current temp buffer start
        add     ecx,EDGE_CHUNK_SIZE*2   ;point to next chunk's start word

; Do next chunk within this bank block, if any.

        pop     ebx                     ;retrieve remaining scan count
        and     ebx,ebx                 ;any scans left?
        jnz     copy_from_buffer_chunk_loop_2ws ;more scans to do

        PLAIN_RET


;-----------------------------------------------------------------------;
; Copies an edge from the screen (2 wide) to the temp buffer.
; Entry:
;       ESI = source address
;       pTempPlane = temp buffer from which to copy
;       ulBlockHeight = # of bytes to copy per plane
;       ulNextScan = scan width
;       Source bank pointing to source
; Exit:
;       ESI = next source address
;
; Preserved: EBP
;-----------------------------------------------------------------------;

        align   4
copy_screen_to_buffered_edge_2ws:

        mov     pSrcAddr,esi

; Leave the GC Index pointing to the Read Map.

        mov     edx,VGA_BASE + GRAF_ADDR
        mov     al,GRAF_READ_MAP
        out     dx,al

        mov     ebx,ulBlockHeight
        SET_UP_UNROLL_VARS ebx,ecx,ebx,pfnCopyEdgeToTempEntry_2ws, \
                                LOOP_UNROLL_SHIFT
        mov     culTempCount,ebx ;remember # of unrolled loop iterations
        mov     pTempEntry,ecx   ;ditto for entry point

        mov     ecx,ulNextScan
        mov     edi,pTempPlane  ;dest offset in temp buffer for plane 3 bytes.
                                ;The rest of the planes are stored
                                ; consecutively
        mov     eax,3           ;start by copying plane 3
copy_edge_to_buf_pl_loop_2ws:
        mov     esi,pSrcAddr    ;source pointer

        mov     edx,VGA_BASE + GRAF_DATA
        out     dx,al           ;set Read Map to plane from which we're copying

        mov     ebx,culTempCount ;# of unrolled loop iterations
        jmp     pTempEntry       ;copy the edge bytes for this plane to the
                                 ; temp buffer

;-----------------------------------------------------------------------;
; Table of unrolled edge copy to temp buffer loop entry points.
;-----------------------------------------------------------------------;

        UNROLL_LOOP_ENTRY_TABLE pfnCopyEdgeToTempEntry_2WS, \
                                EDGE_TO_TEMP_2WS, LOOP_UNROLL_COUNT

;-----------------------------------------------------------------------;
; Unrolled loop for copying edge bytes to the temp buffer.
;-----------------------------------------------------------------------;

COPY_EDGE_TO_TEMP_2WS macro ENTRY_LABEL,ENTRY_INDEX
&ENTRY_LABEL&ENTRY_INDEX&:
        mov     dx,[esi]        ;get byte to copy
        add     esi,ecx         ;point to next source scan
        mov     [edi],dx        ;copy byte to temp buffer
        add     edi,2           ;point to next temp buffer byte
        endm    ;-----------------------------------;

;  EBX = count of unrolled loop iterations
;  ECX = offset from end of one scan's fill to start of next
;  ESI = source address to copy from (screen)
;  EDI = target address to copy to (temp buffer)
;  Read Map set to enable the desired plane for read

        align   4
edge_to_buffer_loop_2ws:
        UNROLL_LOOP     COPY_EDGE_TO_TEMP_2WS,EDGE_TO_TEMP_2WS, \
                        LOOP_UNROLL_COUNT
        dec     ebx
        jnz     edge_to_buffer_loop_2ws

        dec     eax              ;count down planes
        jns     copy_edge_to_buf_pl_loop_2ws

        PLAIN_RET


;-----------------------------------------------------------------------;

endProc vNonAlignedSrcCopy

_TEXT$04   ends

        end



unix.superglobalmegacorp.com

This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.