flat assembler
Message board for the users of flat assembler.

Index > Main > Fastest Memory Copy/Set?

Author
Thread Post new topic Reply to topic
uart777



Joined: 17 Jan 2012
Posts: 369
uart777
What's the fastest way to copy/set memory for X86+ with XMM+? It should be designed to process large blocks/files as well as relatively small ones (256-4K).

I'm currently using rep stosd/movsd to draw/copy horizontal scanlines. Here's an old memory.set I wrote a long time ago on a Pentium MMX 75 MHz (not sure how well it works):

Code:
;-----------------------------------------------------------------------
; memory.set p, v, n, x
; Fill n bytes starting at p with the value v. x is the width of v in
; bits (8, 16 or 32); narrower values are replicated into a 32-bit
; pattern before filling.
;
; Syntax: flat assembler, 32-bit code.
; ABI:    arguments on the stack, callee removes them (ret 16).
; In:     [esp+4] = p, [esp+8] = v, [esp+12] = n, [esp+16] = x
; Out:    no defined return value.
; Clobb:  eax, ecx, edx, mm0, mm1, flags. edi is saved and restored.
; Needs:  direction flag clear (stosd must advance edi).
;         movntq/sfence are not part of the original MMX set; they need
;         SSE (or AMD's extended MMX).
; Errors: n = 0 invokes `error E.SIZE`, p = 0 invokes `error E.ADDRESS`
;         (both defined outside this routine); nothing is written.
;         A non-zero v with x other than 8/16/32 writes nothing, except
;         that v = 0FFh is still expanded to all ones for compatibility
;         with earlier behaviour.
;-----------------------------------------------------------------------
memory.set:                             ; p, v, n, x
        push    edi
        mov     edi, [esp+8]            ; edi = p (offsets are +4 because of the push)
        mov     eax, [esp+12]           ; eax = v
        mov     ecx, [esp+16]           ; ecx = n
        test    ecx, ecx                ; size = 0?
        jnz     @f
        error   E.SIZE
        jmp     .r
@@:
        test    edi, edi                ; address = 0?
        jnz     @f
        error   E.ADDRESS
        jmp     .r
@@:
        test    eax, eax                ; zero fills the same for every width
        jz      .x4

        ; Look at the width BEFORE applying any shortcut: 0FFh is only
        ; "all ones" when it is not a 16- or 32-bit pattern.
        mov     edx, [esp+20]           ; edx = x (width of v in bits)
        cmp     edx, 32
        je      .x4                     ; already a full 32-bit pattern
        cmp     edx, 16
        je      .x2
        cmp     edx, 8
        je      .x1
        cmp     eax, 0FFh               ; unrecognised width: keep the old
        jne     .r                      ; 0FFh -> all ones behaviour, else give up
        mov     eax, 0FFFFFFFFh
        jmp     .x4

.x1:                                    ; 00.00.00.AA -> 00.00.AA.AA
        movzx   eax, al                 ; drop stray upper bits of v
        mov     edx, eax
        shl     edx, 8
        or      eax, edx
.x2:                                    ; 00.00.AA.BB -> AA.BB.AA.BB
        movzx   eax, ax                 ; drop stray upper bits of v
        mov     edx, eax
        shl     edx, 16
        or      eax, edx

.x4:                                    ; eax = 32-bit fill pattern, ecx = bytes left (> 0)
        cmp     ecx, 4                  ; 1-3 bytes: individual stores
        jb      .1
        cmp     ecx, 64                 ; 4-63 bytes: stosd slide
        jb      .4

.8:                                     ; 64+ bytes: widen pattern to 64 bits
        movd    mm0, eax                ; 00.00.00.00.VV.VV.VV.VV
        movq    mm1, mm0
        psllq   mm0, 32                 ; VV.VV.VV.VV.00.00.00.00
        por     mm0, mm1                ; VV.VV.VV.VV.VV.VV.VV.VV
        movq    mm1, mm0
        mov     edx, 64                 ; bytes per iteration
        jmp     .fill64                 ; skip the alignment padding
        align 16
.fill64:
        movntq  [edi], mm0              ; 64 bytes of non-temporal stores
        movntq  [edi+8], mm1
        movntq  [edi+16], mm0
        movntq  [edi+24], mm1
        movntq  [edi+32], mm0
        movntq  [edi+40], mm1
        movntq  [edi+48], mm0
        movntq  [edi+56], mm1
        add     edi, edx
        sub     ecx, edx
        cmp     ecx, edx                ; at least one more full 64-byte chunk?
        jae     .fill64
        sfence                          ; order the weakly-ordered movntq stores
        emms                            ; leave MMX state so x87 code can run
        test    ecx, ecx                ; remainder?
        jz      .r
        cmp     ecx, 4                  ; < 4?
        jb      .1

.4:                                     ; set 4-63 bytes, ecx/4 dwords via stosd
        push    ecx                     ; keep the byte count for the 0-3 byte tail
        shr     ecx, 2                  ; ecx = number of dwords (1..15)
        mov     edx, .stos_end
        sub     edx, ecx                ; each stosd is one byte in 32-bit code, so
        jmp     edx                     ; entering ecx bytes early runs ecx stosd
        times 16 stosd
.stos_end:
        pop     ecx
        and     ecx, 3                  ; remainder?
        jz      .r

.1:                                     ; set 0-3 trailing bytes via jump table
        jmp     dword [.tail+ecx*4]
.tail:  dd      .r, .s1, .s2, .s3

.s1:
        mov     [edi], al
        jmp     .r
.s2:
        mov     [edi], ax
        jmp     .r
.s3:
        mov     [edi], ax
        shr     eax, 16                 ; third byte of the pattern is bits 16..23
        mov     [edi+2], al             ; for every width, so no width test is needed
.r:
        pop     edi
        ret     16
Post 12 Feb 2013, 02:33
View user's profile Send private message Reply with quote
revolution
When all else fails, read the source


Joined: 24 Aug 2004
Posts: 17287
Location: In your JS exploiting you and your system
revolution
See here: http://board.flatassembler.net/topic.php?t=2571

Also see the AMD manual, which gives a good tutorial on optimising copy routines for various situations.
Post 12 Feb 2013, 03:07
View user's profile Send private message Visit poster's website Reply with quote
uart777



Joined: 17 Jan 2012
Posts: 369
uart777
Thanks, saved HTML. Agner Fog's optimization manuals have good information on this too (for anyone reading): http://www.agner.org/optimize/
Post 12 Feb 2013, 04:56
View user's profile Send private message Reply with quote
Display posts from previous:
Post new topic Reply to topic

Jump to:  


< Last Thread | Next Thread >
Forum Rules:
You cannot post new topics in this forum
You cannot reply to topics in this forum
You cannot edit your posts in this forum
You cannot delete your posts in this forum
You cannot vote in polls in this forum
You cannot attach files in this forum
You can download files in this forum


Copyright © 1999-2020, Tomasz Grysztar. Also on YouTube, Twitter.

Website powered by rwasa.