Listing 1: Fast floating-point register copy


 _asm
 {
    // ------------------------------------------------------------------
    // Copy dwByteCount bytes from pData to pDest through the x87 stack.
    //
    // Inputs (C variables visible to this block):
    //   pData       - source address
    //   pDest       - destination address
    //   dwByteCount - number of bytes to copy. It is compared as a
    //                 signed value, so a count that is zero or has the
    //                 top bit set copies nothing.
    //
    // Registers written: al, bl, ecx, esi, edi, flags.
    // x87 stack: four values are pushed and four popped per iteration,
    //            so four free stack slots are needed on entry and the
    //            stack depth is unchanged on exit.
    //
    // Phases:
    //   1. PreWarm  - read one byte every 32 bytes of the source, one
    //                 256-byte chunk per iteration, so the data is
    //                 already in the cache when it is copied.
    //   2. LoopCopy - copy 32-byte blocks from the highest offset down
    //                 to the lowest, as four 64-bit fild/fistp pairs.
    //   3. CopyTail - copy the (dwByteCount mod 32) bytes at the start
    //                 of the buffer that phase 2 cannot reach.
    //
    // NOTE(review): blocks are copied from high to low addresses and
    // the tail last, so overlapping source/destination ranges are not
    // handled in general - confirm with callers.
    //
    // The numbers in the trailing comments are the clock annotations
    // of the original listing (0 = pairs with the previous
    // instruction); they have not been re-measured.
    // ------------------------------------------------------------------
    mov     esi, pData
    mov     ecx, dwByteCount
    mov     edi, pDest
    sub     ecx, 256                    ; ecx = bytes left after this chunk
    jl      DonePreWarm                 ; less than one full chunk: skip

    ALIGN 16
PreWarm:
    // Pre-warm the read buffer: one byte load per 32 bytes. The loaded
    // values are discarded; al and bl alternate so that consecutive
    // loads do not write the same register.
                                        ;clocks
    mov     al, [esi]                   ;1
    mov     bl, [esi+32]                ;0
    mov     al, [esi+64]                ;1
    mov     bl, [esi+96]                ;0
    mov     al, [esi+128]               ;1
    mov     bl, [esi+160]               ;0
    mov     al, [esi+192]               ;1
    mov     bl, [esi+224]               ;0

    // The nop will force the code
    // to pair better.
    add     esi, 256                    ;1
    nop                                 ;0

    // Continue while at least one more full chunk remains. This must
    // be jge, the complement of the jl entry test above: with jg the
    // last chunk was skipped whenever dwByteCount was an exact
    // multiple of 256.
    sub     ecx, 256                    ;1
    jge     PreWarm                     ;0

DonePreWarm:
    mov     ecx, dwByteCount
    mov     esi, pData                  ; esi was advanced by the pre-warm
    sub     ecx, 32                     ; ecx = offset of the last 32-byte block
    jl      CopyTail                    ; fewer than 32 bytes: tail only

    ALIGN 16
LoopCopy:
    // Load four qwords (a, b, c, d at offsets +0, +8, +16, +24) and
    // store them back out. The x87 stack after each instruction is
    // listed top-first in the comments.
    fild    qword ptr [esi+ecx]         ;1+2  a
    fild    qword ptr [esi+ecx+8]       ;1+2  b a
    fild    qword ptr [esi+ecx+16]      ;1+2  c b a
    fxch    st(2)                       ;0    a b c
    fild    qword ptr [esi+ecx+24]      ;1+2  d a b c
    fxch    st(2)                       ;0    b a d c
    fistp   qword ptr [edi+ecx+8]       ;6    a d c
    fistp   qword ptr [edi+ecx]         ;6    d c
    fistp   qword ptr [edi+ecx+24]      ;6    c
    fistp   qword ptr [edi+ecx+16]      ;6

    sub     ecx, 32                     ;1    step down to the next block
    jge     LoopCopy                    ;0    (paired)

CopyTail:
    // The loop above stops at offset (dwByteCount mod 32), so that many
    // bytes at the start of the buffer are still uncopied. esi and edi
    // still hold pData and pDest here. For a multiple of 32 the count
    // is zero and rep movsb copies nothing.
    mov     ecx, dwByteCount
    test    ecx, ecx
    jle     DoneCopy                    ; zero or signed-negative count: nothing to do
    and     ecx, 31                     ; ecx = dwByteCount mod 32
    cld                                 ; make sure movsb walks upward
    rep movsb

DoneCopy:
 }
: End of File