Listing 1: Fast floating-point register copy
_asm
{
    // Copies dwByteCount bytes from pData to pDest.  Whole 32-byte blocks
    // are moved through the x87 register stack (FILD qword / FISTP qword),
    // working from the end of the buffer towards the start; any leftover
    // leading bytes (dwByteCount mod 32) are then copied with REP MOVSB.
    //
    // Inputs (C variables of the enclosing scope):
    //   pData       - source address
    //   pDest       - destination address
    //   dwByteCount - number of bytes to copy (compared as a signed value)
    //
    // Registers written: al, bl, ecx, esi, edi, flags, and the x87 stack
    // (four values are pushed and popped again on every loop iteration).
    //
    // NOTE(review): assumes four x87 stack slots are free on entry, that
    // FILD/FISTP round-trip every 64-bit pattern unchanged under the FPU
    // control word in use, and that the buffers do not overlap - confirm
    // against the caller.
    //
    // The numbers in the ';' comments are the original author's clock
    // estimates; 0 apparently marks an instruction expected to pair with
    // the one before it.

    mov     esi, pData
    mov     ecx, dwByteCount
    mov     edi, pDest
    sub     ecx, 256                    // ecx = bytes left to touch - 256
    jl      DonePreWarm                 // fewer than 256 bytes: skip pre-warm

    ALIGN 16
PreWarm:
    // Pre-warm the read buffer: read one byte at every 32-byte step of the
    // current 256-byte block (presumably one read per cache line - TODO
    // confirm for the target CPU).  The loaded values are discarded.
                                        ;clocks
    mov     al, [esi]                   ;1
    mov     bl, [esi+32]                ;0
    mov     al, [esi+64]                ;1
    mov     bl, [esi+96]                ;0
    mov     al, [esi+128]               ;1
    mov     bl, [esi+160]               ;0
    mov     al, [esi+192]               ;1
    mov     bl, [esi+224]               ;0
    // The nop will force the code
    // to pair better.
    add     esi, 256                    ;1
    nop                                 ;0
    sub     ecx, 256                    ;1
    // jge (not jg): when exactly 256 bytes remain, that last full block
    // must still be touched.  This mirrors the jl test on loop entry.
    jge     PreWarm                     ;0

DonePreWarm:
    mov     ecx, dwByteCount
    mov     esi, pData                  // esi was advanced by the pre-warm loop
    sub     ecx, 32                     ;ecx = offset of the last 32-byte block
    jl      DoneCopy                    ;no full 32-byte block to copy

    ALIGN 16
LoopCopy:
    // Letters a..d are the four qwords of the current block; the trailing
    // list is the x87 stack after the instruction, top of stack first.
    fild    qword ptr [esi+ecx]         ;1+2  a
    fild    qword ptr [esi+ecx+8]       ;1+2  b a
    fild    qword ptr [esi+ecx+16]      ;1+2  c b a
    fxch    st(2)                       ;0    a b c
    fild    qword ptr [esi+ecx+24]      ;1+2  d a b c
    fxch    st(2)                       ;0    b a d c
    fistp   qword ptr [edi+ecx+8]       ;6    a d c
    fistp   qword ptr [edi+ecx]         ;6    d c
    fistp   qword ptr [edi+ecx+24]      ;6    c
    fistp   qword ptr [edi+ecx+16]      ;6
    sub     ecx, 32                     ;1
    jge     LoopCopy                    ;0 (paired)

DoneCopy:
    // Here ecx = (bytes not yet copied) - 32.  The uncopied bytes, if any,
    // are the first ones of the buffer, and esi/edi still hold pData/pDest
    // because the loop above addressed memory through ecx only.
    add     ecx, 32                     // ecx = leftover byte count
    jle     DoneTail                    // nothing left (or count was negative)
    cld                                 // make REP MOVSB run forwards
    rep movsb                           // copy the leading leftover bytes
DoneTail:
}
: End of File