What's the fastest way to copy/set memory on x86 CPUs with SSE (XMM registers)? It should be designed to process large blocks/files as well as relatively small ones (256 bytes - 4 KB).
I'm currently using rep stosd/movsd to draw/copy horizontal scanlines. Here's an old memory.set routine I wrote a long time ago on a 75 MHz Pentium MMX (not sure how well it holds up):
;-----------------------------------------------------------------------
; memory.set(p, v, n, x) — cdecl-like, callee pops 16 bytes (ret 16)
; In (stack): p = destination address
;             v = fill value
;             n = byte count
;             x = element width of v in bits (8, 16 or 32)
; Out:        memory at p..p+n-1 filled with v replicated to 32 bits
; Clobbers:   eax, ecx, edx, mm0, mm1, flags (edi preserved)
; Notes:      after "push edi": [esp+8]=p [esp+12]=v [esp+16]=n [esp+20]=x
;-----------------------------------------------------------------------
memory.set: ; p, v, n, x
push edi
mov edi, [esp+8] ; edi = p (destination)
mov eax, [esp+12] ; eax = v (fill value)
mov ecx, [esp+16] ; ecx = n (byte count)
or ecx, ecx ; size=0?
jnz @f
error E.SIZE
jmp .r
@@:
or edi, edi ; address=0?
jnz @f
error E.ADDRESS
jmp .r
@@:
; FIX: the old shortcut expanded v==0FFh to 0FFFFFFFFh BEFORE reading x,
; which corrupted legitimate 16-bit (00FFh) and 32-bit (000000FFh) values.
; The .x1/.x2 replication below already yields 0FFFFFFFFh for x=8,v=0FFh,
; so the shortcut is removed entirely.
or eax, eax ; v=0 replicates to 0 at any width
jz .x4
mov edx, [esp+20] ; edx = x (element width in bits)
cmp edx, 32 ; already a full 32-bit pattern?
je .x4
cmp edx, 8
je .x1
cmp edx, 16
je .x2
jmp .r ; invalid width: return silently (original behavior)
.x1: ; 00.00.00.AA -> 00.00.AA.AA
movzx edx, al
shl edx, 8
or eax, edx
.x2: ; 00.00.AA.BB -> AA.BB.AA.BB
mov edx, eax
shl edx, 16
or eax, edx
.x4:
cmp ecx, 4 ; if n<4 (0-3), store tail bytes individually
jb .1
cmp ecx, 64 ; if n<64 (4-63), use the stosd run
jb .4
.8: ; else use MMX: replicate 32-bit pattern to 64 bits
movd mm0, eax ; 00.00.00.00.VV.VV.VV.VV
movq mm1, mm0
psllq mm0, 32 ; VV.VV.VV.VV.00.00.00.00
por mm0, mm1 ; VV.VV.VV.VV.VV.VV.VV.VV
movq mm1, mm0
mov edx, 64 ; loop stride kept in a register
jmp @f
align 16
@@:
movntq [edi], mm0 ; 64 bytes/iter, non-temporal (bypass cache)
movntq [edi+8], mm1
movntq [edi+16], mm0
movntq [edi+24], mm1
movntq [edi+32], mm0
movntq [edi+40], mm1
movntq [edi+48], mm0
movntq [edi+56], mm1
add edi, edx
sub ecx, edx
cmp ecx, edx ; while >=64 bytes remain
jae @b
emms
or ecx, ecx ; remainder?
jz .r
cmp ecx, 4 ; <4 left?
jb .1
.4: ; set 4-63 bytes: execute ecx/4 stosd opcodes
push ecx
shr ecx, 2 ; ecx = dword count (1..15)
mov edx, @f
sub edx, ecx
jmp edx ; land ecx bytes before @f inside the stosd run
dd 0ABABABABh,\ ; 16 stosd (0ABh) opcode bytes, executed as code
0ABABABABh,\
0ABABABABh,\
0ABABABABh
@@:
pop ecx
and ecx, 3 ; 0-3 tail bytes left?
jz .r
.1: ; set 1-3 bytes via jump table indexed by count
jmp dword [@f+ecx*4]
@@: dd \
.r, .s1, .s2, .s3
.s1:
mov [edi], al
jmp .r
.s2:
mov [edi], ax
jmp .r
.s3:
mov [edi], ax
cmp dword [esp+20], 16 ; FIX: was [esp+16] (=n); x lives at [esp+20]
jne @f
mov [edi+2], al ; x=16: third byte repeats low pattern byte
jmp .r
@@:
shr eax, 16 ; x=8/32: third byte is pattern byte 2
mov [edi+2], al
.r:
pop edi
ret 16