flat assembler
Message board for the users of flat assembler.

Index > Main > Fastest Memory Copying Algorithms

Goto page 1, 2  Next
Author
Thread Post new topic Reply to topic
Matrix



Joined: 04 Sep 2004
Posts: 1171
Location: Overflow
Matrix
Hy,
I wrote some code; would you comment on it, and/or post some of your own?
i guess moving between memory is faster with FPU, MMX, SSE, than rep MOVSD
Code:
; move32 - copy CX bytes from DS:SI to ES:DI with string moves only.
; In:    ds:si = source, es:di = destination, cx = size in bytes
;        (pointers can be loaded with "les di,dest" / "lds si,source")
; Out:   cx = 0; si and di updated by the string instructions
; Note:  bit 0 and bit 1 of the size are peeled off first (one byte, one
;        word), the rest is moved as dwords. The movs* instructions follow
;        the direction flag, which this routine does not touch.
move32:
        shr     cx, 1                   ; CF = bit 0 of the size
        jnc     .even
        movsb                           ; odd size: move the single byte
.even:
        shr     cx, 1                   ; CF = bit 1 of the size
        jnc     .dwords
        movsw                           ; move the leftover word
.dwords:
        rep     movsd                   ; cx = size / 4 dwords remain
        ret

; move64F - copy CX bytes from DS:SI to ES:DI; the bulk goes through the FPU,
;           8 bytes per pass (fild/fistp qword).
; In:    ds:si = source, es:di = destination, cx = size in bytes
;        (pointers can be loaded with "les di,dest" / "lds si,source")
; Out:   cx = 0; si and di moved past the copied data
; Uses:  one x87 register (st0) per pass.
; Notes: * Bits 0..2 of the size are peeled off first with movsb/movsw/movsd.
;          Those follow the direction flag while the FPU loop always walks
;          upward - presumably callers enter with DF = 0; confirm.
;        * fild loads a 64-bit integer without rounding and fistp stores an
;          integer value back, so the qword is reproduced bit for bit.
move64F:
        shr     cx,1                    ; CF = bit 0 of the size
        jnc     .nb
        movsb
.nb:    shr     cx,1                    ; CF = bit 1
        jnc     .nw
        movsw
.nw:    shr     cx,1                    ; CF = bit 2
        jnc     .nd
        movsd
.nd:    jcxz    .done                   ; cx = number of qwords left
.loop:  fild    qword [ds:si]
        fistp   qword [es:di]
        add     si,8
        add     di,8
        dec     cx                      ; 16-bit counter, same register jcxz tests
                                        ; (was "dec ecx": ran on if ECX[31:16] <> 0)
        jnz     .loop                   ; (was "jnz .loop:" - stray colon)
.done:  ret

; move128FC - copy CX bytes from DS:SI to ES:DI; the main loop moves 16 bytes
;             per pass through the FPU and writes them in ascending order.
; In:    ds:si = source, es:di = destination, cx = size in bytes
;        (pointers can be loaded with "les di,dest" / "lds si,source")
; Out:   cx = 0; si and di moved past the copied data
; Uses:  two x87 registers per pass.
; Notes: * Bits 0..3 of the size are peeled off first (byte, word, dword,
;          qword). The movs* part follows the direction flag while the FPU
;          part always walks upward - presumably DF = 0 on entry; confirm.
move128FC:
        shr     cx,1                    ; CF = bit 0 of the size
        jnc     .nb
        movsb
.nb:    shr     cx,1                    ; CF = bit 1
        jnc     .nw
        movsw
.nw:    shr     cx,1                    ; CF = bit 2
        jnc     .nd
        movsd
.nd:    shr     cx,1                    ; CF = bit 3: one odd qword
        jnc     .nq
        fild    qword [ds:si]
        fistp   qword [es:di]
        add     si,8
        add     di,8
.nq:    jcxz    .done                   ; cx = number of 16-byte blocks left
.loop:  fild    qword [ds:si]
        fild    qword [ds:si+8]         ; st0 = second qword, st1 = first
        fxch                            ; st0 = first qword again
        fistp   qword [es:di]
        fistp   qword [es:di+8]
        add     si,16
        add     di,16
        dec     cx
        jnz     .loop                   ; (was "jnz .loop:" - stray colon)
.done:  ret

; move128F - copy CX bytes from DS:SI to ES:DI; the main loop moves 16 bytes
;            per pass through the FPU. No fxch is used, so each block is
;            written back in stack order (upper qword first).
; In:    ds:si = source, es:di = destination, cx = size in bytes
;        (pointers can be loaded with "les di,dest" / "lds si,source")
; Out:   cx = 0; si and di moved past the copied data
; Uses:  two x87 registers per pass.
; Notes: * Bits 0..3 of the size are peeled off first (byte, word, dword,
;          qword). The movs* part follows the direction flag while the FPU
;          part always walks upward - presumably DF = 0 on entry; confirm.
move128F:
        shr     cx,1                    ; CF = bit 0 of the size
        jnc     .nb
        movsb
.nb:    shr     cx,1                    ; CF = bit 1
        jnc     .nw
        movsw
.nw:    shr     cx,1                    ; CF = bit 2
        jnc     .nd
        movsd
.nd:    shr     cx,1                    ; CF = bit 3: one odd qword
        jnc     .nq
        fild    qword [ds:si]
        fistp   qword [es:di]
        add     si,8
        add     di,8
.nq:    jcxz    .done                   ; cx = number of 16-byte blocks left
.loop:  fild    qword [ds:si]
        fild    qword [ds:si+8]         ; st0 = second qword, st1 = first
        fistp   qword [es:di+8]         ; pop in stack order: +8 then +0
        fistp   qword [es:di]
        add     si,16
        add     di,16
        dec     cx
        jnz     .loop                   ; (was "jnz .loop:" - stray colon)
.done:  ret

; move256F - copy CX bytes from DS:SI to ES:DI; the main loop moves 32 bytes
;            per pass through the FPU (four loads, then four stores in
;            stack order, i.e. highest offset first).
; In:    ds:si = source, es:di = destination, cx = size in bytes
;        (pointers can be loaded with "les di,dest" / "lds si,source")
; Out:   cx = 0; si and di moved past the copied data
; Uses:  four x87 registers per pass - they must be free on entry.
; Notes: * Bits 0..4 of the size are peeled off first (1, 2, 4, 8 and 16
;          bytes). The movs* part follows the direction flag while the FPU
;          part always walks upward - presumably DF = 0 on entry; confirm.
move256F:
        shr     cx,1                    ; CF = bit 0 of the size
        jnc     .nb
        movsb
.nb:    shr     cx,1                    ; CF = bit 1
        jnc     .nw
        movsw
.nw:    shr     cx,1                    ; CF = bit 2
        jnc     .nd
        movsd
.nd:    shr     cx,1                    ; CF = bit 3: one odd qword
        jnc     .nq
        fild    qword [ds:si]
        fistp   qword [es:di]
        add     si,8
        add     di,8
.nq:    shr     cx,1                    ; CF = bit 4: one odd 16-byte block
        jnc     .ndq
        fild    qword [ds:si]
        fild    qword [ds:si+8]
        fistp   qword [es:di+8]
        fistp   qword [es:di]
        add     si,16
        add     di,16
.ndq:   jcxz    .done                   ; cx = number of 32-byte blocks left
.loop:  fild    qword [ds:si]
        fild    qword [ds:si+8]
        fild    qword [ds:si+16]
        fild    qword [ds:si+24]        ; st0 = last qword loaded
        fistp   qword [es:di+24]        ; store in pop order, top down
        fistp   qword [es:di+16]
        fistp   qword [es:di+8]
        fistp   qword [es:di]
        add     si,32
        add     di,32
        dec     cx
        jnz     .loop                   ; (was "jnz .loop:" - stray colon)
.done:  ret

; move512F - copy CX bytes from DS:SI to ES:DI; the main loop moves 64 bytes
;            per pass through the FPU (eight loads, then eight stores in
;            stack order, i.e. highest offset first).
; In:    ds:si = source, es:di = destination, cx = size in bytes
;        (pointers can be loaded with "les di,dest" / "lds si,source")
; Out:   cx = 0; si and di moved past the copied data
; Uses:  all eight x87 registers per pass - the FPU stack must be empty on
;        entry, otherwise the loads overflow it.
; Notes: * Bits 0..5 of the size are peeled off first (1, 2, 4, 8, 16 and 32
;          bytes). The movs* part follows the direction flag while the FPU
;          part always walks upward - presumably DF = 0 on entry; confirm.
move512F:
        shr     cx,1                    ; CF = bit 0 of the size
        jnc     .nb
        movsb
.nb:    shr     cx,1                    ; CF = bit 1
        jnc     .nw
        movsw
.nw:    shr     cx,1                    ; CF = bit 2
        jnc     .nd
        movsd
.nd:    shr     cx,1                    ; CF = bit 3: one odd qword
        jnc     .nq
        fild    qword [ds:si]
        fistp   qword [es:di]
        add     si,8
        add     di,8
.nq:    shr     cx,1                    ; CF = bit 4: one odd 16-byte block
        jnc     .ndq
        fild    qword [ds:si]
        fild    qword [ds:si+8]
        fistp   qword [es:di+8]
        fistp   qword [es:di]
        add     si,16
        add     di,16
.ndq:   shr     cx,1                    ; CF = bit 5: one odd 32-byte block
        jnc     .nqq
        fild    qword [ds:si]
        fild    qword [ds:si+8]
        fild    qword [ds:si+16]
        fild    qword [ds:si+24]
        fistp   qword [es:di+24]
        fistp   qword [es:di+16]
        fistp   qword [es:di+8]
        fistp   qword [es:di]
        add     si,32
        add     di,32
.nqq:   jcxz    .done                   ; cx = number of 64-byte blocks left
.loop:  fild    qword [ds:si]
        fild    qword [ds:si+8]
        fild    qword [ds:si+16]
        fild    qword [ds:si+24]
        fild    qword [ds:si+32]
        fild    qword [ds:si+40]
        fild    qword [ds:si+48]
        fild    qword [ds:si+56]        ; st0 = last qword loaded
        fistp   qword [es:di+56]        ; store in pop order, top down
        fistp   qword [es:di+48]
        fistp   qword [es:di+40]
        fistp   qword [es:di+32]
        fistp   qword [es:di+24]
        fistp   qword [es:di+16]
        fistp   qword [es:di+8]
        fistp   qword [es:di]
        add     si,64
        add     di,64
        dec     cx
        jnz     .loop                   ; (was "jnz .loop:" - stray colon)
.done:  ret

; move512FD - copy CX bytes from DS:SI to ES:DI; the main loop moves 64 bytes
;             per pass through the FPU. After loading, the register stack is
;             reversed with a run of fxch so the stores go out in ascending
;             address order.
; In:    ds:si = source, es:di = destination, cx = size in bytes
;        (pointers can be loaded with "les di,dest" / "lds si,source")
; Out:   cx = 0; si and di moved past the copied data
; Uses:  all eight x87 registers per pass - the FPU stack must be empty on
;        entry, otherwise the loads overflow it.
; Notes: * Bits 0..5 of the size are peeled off first (1, 2, 4, 8, 16 and 32
;          bytes). The movs* part follows the direction flag while the FPU
;          part always walks upward - presumably DF = 0 on entry; confirm.
move512FD:
        shr     cx,1                    ; CF = bit 0 of the size
        jnc     .nb
        movsb
.nb:    shr     cx,1                    ; CF = bit 1
        jnc     .nw
        movsw
.nw:    shr     cx,1                    ; CF = bit 2
        jnc     .nd
        movsd
.nd:    shr     cx,1                    ; CF = bit 3: one odd qword
        jnc     .nq
        fild    qword [ds:si]
        fistp   qword [es:di]
        add     si,8
        add     di,8
.nq:    shr     cx,1                    ; CF = bit 4: one odd 16-byte block
        jnc     .ndq
        fild    qword [ds:si]
        fild    qword [ds:si+8]
        fxch                            ; first qword back on top
        fistp   qword [es:di]
        fistp   qword [es:di+8]
        add     si,16
        add     di,16
.ndq:   shr     cx,1                    ; CF = bit 5: one odd 32-byte block
        jnc     .nqq
        fild    qword [ds:si]
        fild    qword [ds:si+8]
        fild    qword [ds:si+16]
        fild    qword [ds:si+24]        ; stack (top first): q3 q2 q1 q0
        ; Reverse the four entries. The author's alternative sequence is kept
        ; in the trailing comments; both end with q0 q1 q2 q3.
        fxch    st1 ; fxch   st3        ; q2 q3 q1 q0
        fxch    st2 ; fxch   st1        ; q1 q3 q2 q0
        fxch    st1 ; fxch   st2        ; q3 q1 q2 q0
        fxch    st3 ; fxch   st1        ; q0 q1 q2 q3
        fistp   qword [es:di]
        fistp   qword [es:di+8]
        fistp   qword [es:di+16]
        fistp   qword [es:di+24]
        add     si,32
        add     di,32
.nqq:   jcxz    .done                   ; cx = number of 64-byte blocks left
.loop:  fild    qword [ds:si]
        fild    qword [ds:si+8]
        fild    qword [ds:si+16]
        fild    qword [ds:si+24]
        fild    qword [ds:si+32]
        fild    qword [ds:si+40]
        fild    qword [ds:si+48]
        fild    qword [ds:si+56]        ; stack (top first): q7 q6 ... q0
        ; Ten exchanges that leave the stack as q0 q1 ... q7.
        fxch    st7
        fxch    st1
        fxch    st6
        fxch    st2
        fxch    st5
        fxch    st3
        fxch    st4
        fxch    st3
        fxch    st2
        fxch    st1
        fistp   qword [es:di]
        fistp   qword [es:di+8]
        fistp   qword [es:di+16]
        fistp   qword [es:di+24]
        fistp   qword [es:di+32]
        fistp   qword [es:di+40]
        fistp   qword [es:di+48]
        fistp   qword [es:di+56]
        add     si,64
        add     di,64
        dec     cx
        jnz     .loop                   ; (was "jnz .loop:" - stray colon)
.done:  ret

    


here's the latest code:

Code:
; move512FZ - copy CX bytes from DS:SI to ES:DI; the main loop moves 64 bytes
;             per pass through the FPU. Exchanges are interleaved with the
;             stores so every qword is written in ascending address order.
; In:    ds:si = source, es:di = destination, cx = size in bytes
;        (pointers can be loaded with "les di,dest" / "lds si,source")
; Out:   cx = 0; si and di moved past the copied data
; Uses:  all eight x87 registers per pass - the FPU stack must be empty on
;        entry, otherwise the loads overflow it.
; Notes: Bits 0..5 of the size are peeled off first (1, 2, 4, 8, 16 and 32
;        bytes). The movs* part follows the direction flag while the FPU
;        part always walks upward - presumably DF = 0 on entry; confirm.
move512FZ:
        shr     cx, 1                   ; CF = bit 0 of the size
        jnc     .no_byte
        movsb
.no_byte:
        shr     cx, 1                   ; CF = bit 1
        jnc     .no_word
        movsw
.no_word:
        shr     cx, 1                   ; CF = bit 2
        jnc     .no_dword
        movsd
.no_dword:
        shr     cx, 1                   ; CF = bit 3: one odd qword
        jnc     .no_qword
        fild    qword [ds:si]
        fistp   qword [es:di]
        add     si, 8
        add     di, 8
.no_qword:
        shr     cx, 1                   ; CF = bit 4: one odd 16-byte block
        jnc     .no_16
        fild    qword [ds:si]
        fild    qword [ds:si+8]
        fxch                            ; first qword back on top
        fistp   qword [es:di]
        fistp   qword [es:di+8]
        add     si, 16
        add     di, 16
.no_16:
        shr     cx, 1                   ; CF = bit 5: one odd 32-byte block
        jnc     .no_32
        fild    qword [ds:si]
        fild    qword [ds:si+8]
        fild    qword [ds:si+16]
        fild    qword [ds:si+24]        ; stack (top first): q3 q2 q1 q0
        fxch    st3                     ; q0 q2 q1 q3
        fistp   qword [es:di]           ; q2 q1 q3
        fxch    st1                     ; q1 q2 q3
        fistp   qword [es:di+8]
        fistp   qword [es:di+16]
        fistp   qword [es:di+24]
        add     si, 32
        add     di, 32
.no_32:
        jcxz    .exit                   ; cx = number of 64-byte blocks left
.block:
        fild    qword [ds:si]
        fild    qword [ds:si+8]
        fild    qword [ds:si+16]
        fild    qword [ds:si+24]
        fild    qword [ds:si+32]
        fild    qword [ds:si+40]
        fild    qword [ds:si+48]
        fild    qword [ds:si+56]        ; stack (top first): q7 q6 ... q0
        fxch    st7                     ; bring q0 to the top
        fistp   qword [es:di]
        fxch    st5                     ; bring q1 to the top
        fistp   qword [es:di+8]
        fxch    st3                     ; bring q2 to the top
        fistp   qword [es:di+16]
        fxch    st1                     ; bring q3 to the top; q4..q7 follow
        fistp   qword [es:di+24]
        fistp   qword [es:di+32]
        fistp   qword [es:di+40]
        fistp   qword [es:di+48]
        fistp   qword [es:di+56]
        add     si, 64
        add     di, 64
        dec     cx
        jnz     .block
.exit:
        ret
    


Last edited by Matrix on 10 Nov 2004, 17:28; edited 2 times in total
Post 09 Nov 2004, 21:45
View user's profile Send private message Visit poster's website Reply with quote
Chewy509



Joined: 19 Jun 2003
Posts: 297
Location: Bris-vegas, Australia
Chewy509
SSE with Cache prefetch IIRC the quickest method at the moment. (AMD's optimisation manual has the code). Also unroll your loop so that entire cache lines are read and written in single passes.

All the normal alignment rules should be taken into consideration as well...
Post 10 Nov 2004, 00:11
View user's profile Send private message Visit poster's website Reply with quote
Matrix



Joined: 04 Sep 2004
Posts: 1171
Location: Overflow
Matrix
i was just about to construct some similar codes to step 7 and 8, but i was not planning 9 Smile

Code:
    
Post 10 Nov 2004, 01:15
View user's profile Send private message Visit poster's website Reply with quote
Madis731



Joined: 25 Sep 2003
Posts: 2145
Location: Estonia
Madis731
ASM makes a difference Very Happy !!!
Post 10 Nov 2004, 07:06
View user's profile Send private message Visit poster's website Yahoo Messenger MSN Messenger Reply with quote
wildtollwut



Joined: 10 Jul 2003
Posts: 4
Location: Germany
wildtollwut
very nice work! thanks! :thumbs:
Post 10 Nov 2004, 15:38
View user's profile Send private message Reply with quote
tom tobias



Joined: 09 Sep 2003
Posts: 1320
Location: usa
tom tobias
Yes, excellent post. This is an illustration of the strength of the FlatAssembler forum. Very well done.
Post 11 Nov 2004, 00:18
View user's profile Send private message Reply with quote
Matrix



Joined: 04 Sep 2004
Posts: 1171
Location: Overflow
Matrix
thank you, Smile i'm keeping the spirit alive, i don't like bitdog's theory about that Keep It Simple Stupid, here's a piece of my programming algorithm:
Code:
; A tongue-in-cheek "programming algorithm" written so that it reads as
; English. The equ lines below define symbolic constants; the ones with
; nothing after "equ" have an empty value, so those words drop out of any
; line that uses them, while the others stand for an instruction or operand.
; NOTE(review): "fix" is also the name of a fasm directive; whether
; "fix equ" / "fix code" assemble as intended has not been verified here.
optimize equ
fix equ
code equ
verify equ
good equ
jump_if_ok equ jnz
let's equ call
is equ cmp
enough? equ ,eax

Org 'Code'
Org 256 ; wake up

mainloop: ; there's a code to be written

push [Limit] ; push the limit

let's Beat_The_Limit ; begin programming

pop eax ; get the Previous Limit

; With the constants above this line becomes: cmp [Limit] ,eax
is [Limit] good enough? ; compare with new one

jbe mainloop ; if not good enough then again

ret ; (urn to sleep)

Limit rd 1

; Called once per pass of mainloop; the only lasting effects visible here
; are the decrements/increment of [size], [speed] and [Limit].
Beat_The_Limit:

optimize code
dec code [size]
inc code [speed]

verify code
; jnz here tests the flags left by "inc [speed]" above, since the
; "verify code" line expands to nothing.
jump_if_ok .done
fix code
.done:
dec [Limit]

ret ;(urn)

size rd 1
speed rd 1
    

you can run it Smile
Post 11 Nov 2004, 03:06
View user's profile Send private message Visit poster's website Reply with quote
S.T.A.S.



Joined: 09 Jan 2004
Posts: 173
Location: Ru#27
S.T.A.S.
Here's the code I wrote while studying Mike Wall's Using Block Prefetch for Optimized Memory Performance tutor. It's easy to add another copy metods and see which is better.


Description: memcopy speed test
Download
Filename: memcopy2.Asm
Filesize: 6.69 KB
Downloaded: 678 Time(s)

Post 11 Nov 2004, 03:18
View user's profile Send private message Reply with quote
S.T.A.S.



Joined: 09 Jan 2004
Posts: 173
Location: Ru#27
S.T.A.S.
Matrix, pls, download the source again - you've got the wrong version, which I posted by mistake.
BTW, you're able to use Ctrl+C in WinXP

PS
Please, post so long code as an attachment, my browser goes mad while opening your post :\
Post 11 Nov 2004, 20:06
View user's profile Send private message Reply with quote
Matrix



Joined: 04 Sep 2004
Posts: 1171
Location: Overflow
Matrix
there's something wrong with your benchmark

try the attached code and tell me what do you think about the results

you know if rep movsb is 343MB/s@16MB & 340MB/s@1MB & 307@64K,
and optimized mmx is 292MB/s@16MB & 273MB/s@1MB & N/A@64K then i whould use rep movsb instead Very Happy

ps.: i could not copy the results here.

issues summary:
S.T.A.S. whould you correct it pls?
if i swap the lines the codes speeds will change, and its kinda weird lookin results i think its not measuring well,
then could you add the feature the text could be copied out?
(i 'm not a windows programer )


Description:
Download
Filename: memcopy3.Asm
Filesize: 6.99 KB
Downloaded: 628 Time(s)

Post 12 Nov 2004, 03:31
View user's profile Send private message Visit poster's website Reply with quote
Madis731



Joined: 25 Sep 2003
Posts: 2145
Location: Estonia
Madis731
Very Happy
memcopy works like a charm.
Hey, the previous showed me that I had a 16MHz CPU
Luckily I didn't go to kick the manager of the company
that sold it to me.
memcopy3 showed that I had a 1299MHz CPU Very Happy thats
more like it!
Post 12 Nov 2004, 13:55
View user's profile Send private message Visit poster's website Yahoo Messenger MSN Messenger Reply with quote
S.T.A.S.



Joined: 09 Jan 2004
Posts: 173
Location: Ru#27
S.T.A.S.
Matrix wrote:
if i swap the lines the codes speeds will change, and its kinda weird lookin results i think its not measuring well

Well, if you run the test twice, you'll get different result as well. Why ? Because windos is multitasking OS. There's no easy way to achieve exact results.. Moreover, results will differ in different windoses 9x & NT.

Quote:
optimized mmx is 292MB/s@16MB & 273MB/s@1MB
This is rather strange - I tested the routine on P3, PIV & Athlon/XP, and everything was OK. However, I must say my code is sloppy - loops aren't aligned, etc. I wrote this just to understand the tutorial better.

Here's my results (windos XP)
Code:
CPU AuthenticAMD @ 1661 MHz
Test results:

copy method     bytes to copy / bandwidth
           16,0 Mb         1,0 Mb          64 Kb

movsd:         723 Mb/sec      699 Mb/sec      577 Mb/sec
optimized mmx:        1927 Mb/sec     1632 Mb/sec     n/a but why???
mov w/GPRs:       737 Mb/sec      707 Mb/sec      564 Mb/sec
mmx (8*qwords):       780 Mb/sec      743 Mb/sec      574 Mb/sec
mmx/movntq:   1175 Mb/sec     1078 Mb/sec     730 Mb/sec
repmovsb:     621 Mb/sec      598 Mb/sec      498 Mb/sec

---------------------------

CPU AuthenticAMD @ 1661 MHz
Test results:

copy method        bytes to copy / bandwidth
           16,0 Mb         1,0 Mb          64 Kb

movsd:         726 Mb/sec      703 Mb/sec      584 Mb/sec
mov w/GPRs:   743 Mb/sec      713 Mb/sec      572 Mb/sec
mmx (8*qwords):       781 Mb/sec      746 Mb/sec      584 Mb/sec
mmx/movntq:   1177 Mb/sec     1020 Mb/sec     749 Mb/sec
optimized mmx:        1858 Mb/sec     1421 Mb/sec     n/a

---------------------------

CPU AuthenticAMD @ 1660 MHz
Test results:

copy method       bytes to copy / bandwidth
           16,0 Mb         1,0 Mb          64 Kb

movsd:         667 Mb/sec      639 Mb/sec      502 Mb/sec
mov w/GPRs:   673 Mb/sec      643 Mb/sec      500 Mb/sec
mmx (8*qwords):       722 Mb/sec      687 Mb/sec      530 Mb/sec
mmx/movntq:   1115 Mb/sec     947 Mb/sec      670 Mb/sec
optimized mmx:        1857 Mb/sec     1404 Mb/sec     n/a
    

Matrix wrote:
n/a but why???
'optimized mmx' method uses blocks of 128Kb to copy, not less.

In attachment there's console version, use memcopy2.Exe > filename.txt to save results to a file.


Description:
Download
Filename: memcopy2.Asm
Filesize: 7.06 KB
Downloaded: 586 Time(s)

Post 12 Nov 2004, 21:03
View user's profile Send private message Reply with quote
Matrix



Joined: 04 Sep 2004
Posts: 1171
Location: Overflow
Matrix
Thnx,
but for me it is returning erroneous results, and I still don't know why

Code:
memcopy benchmark by S.T.A.S.

CPU GenuineIntel @ 928 MHz
Test results:

copy method      bytes to copy / bandwidth
           16,0 Mb         1,0 Mb          64 Kb

movsd:         330 Mb/sec      327 Mb/sec      269 Mb/sec
mov w/GPRs:   326 Mb/sec      326 Mb/sec      268 Mb/sec
mmx (8*qwords):       335 Mb/sec      333 Mb/sec      272 Mb/sec
mmx/movntq:   348 Mb/sec      314 Mb/sec      207 Mb/sec
optimized mmx:        452 Mb/sec      541 Mb/sec      n/a



memcopy benchmark by S.T.A.S.

CPU GenuineIntel @ 928 MHz
Test results:

copy method      bytes to copy / bandwidth
           16,0 Mb         1,0 Mb          64 Kb

movsd:         330 Mb/sec      330 Mb/sec      267 Mb/sec
mov w/GPRs:   327 Mb/sec      325 Mb/sec      252 Mb/sec
mmx (8*qwords):       336 Mb/sec      331 Mb/sec      265 Mb/sec
mmx/movntq:   348 Mb/sec      314 Mb/sec      205 Mb/sec
optimized mmx:        524 Mb/sec      535 Mb/sec      n/a



memcopy benchmark by S.T.A.S.

CPU GenuineIntel @ 928 MHz
Test results:

copy method      bytes to copy / bandwidth
           16,0 Mb         1,0 Mb          64 Kb

movsd:         331 Mb/sec      325 Mb/sec      262 Mb/sec
mov w/GPRs:   330 Mb/sec      320 Mb/sec      245 Mb/sec
mmx (8*qwords):       338 Mb/sec      329 Mb/sec      255 Mb/sec
mmx/movntq:   347 Mb/sec      313 Mb/sec      205 Mb/sec
optimized mmx:        283 Mb/sec      264 Mb/sec      n/a

    


there shouldn't be that large amount of inaccuracy, it is present even if i put cli sti in the code!

i have a Intel Celeron II 600@933Mhz w 103Mhz bus(66Mhz) w Pc-133 sd rams test were made with windos 98 se
Post 12 Nov 2004, 21:52
View user's profile Send private message Visit poster's website Reply with quote
S.T.A.S.



Joined: 09 Jan 2004
Posts: 173
Location: Ru#27
S.T.A.S.
You'll get more inacuracy when run some mpeg player in the background Wink
BTW, cli & sti have no effect in windos.

Memcopy speed mostly depends on CPUs FSB frequency, so results are quite reasonable for yours 100MHz, IMHO. Just compare these numbers with mine (at 266MHz FSB).
In my tests the difference between 98SE & XP was up to 20% (I guess this is because of different scheduling logic)
Post 12 Nov 2004, 23:33
View user's profile Send private message Reply with quote
Matrix



Joined: 04 Sep 2004
Posts: 1171
Location: Overflow
Matrix
i weren't listening to mp3 while testing Smile
i dk but my system is somewhat high end, its full tuning by me Smile
of course you can't do it with any cheap hardware.
btw i just got a screenshot of my almost complete cpuID sw, i posted the code @ Main/Detecting Cpu Types

Code:
Your CPU is: Protected Mode Pentium Pro Class GenuineIntel Processor
cpuID Level: 2 - Type:Original OEM Processor
 Model:6 Stepping:8 Revision:6
Capabilities:
MMX Technology                          CMOV-Cond. Move/Cmp. Inst.
MCA-Machine Check Arch.                 PGE-PTE Global Bit
MTRR-Mem. Type Range Reg.               CXS-CMPXCHG8B Inst.
MCE-Machine Check Exception             PAE-Physical Address Extensions
MSR-RDMSR and WRMSR Support             TSC-Time Stamp Counter
PSE-Page Size Extensions                DE-Debugging Extensions
VME-Virtual-8086 Mode Enhancement       FPU-FPU on Chip

Not supported:
APIC-APIC on Chip
Processor internals: - Instruction TLB: 4K-Byte Pages, 4-way set associative, 32
 entries - Instruction TLB: 4M-Byte Pages, 4-way set associative, 4 entries - Da
ta TLB: 4K-Byte Pages, 4-way set associative, 64 entries - L1 Instruction cache:
 16K Bytes, 4-way set associative, 32 byte line size - Data TLB: 4M-Byte Pages,
4-way set associative, 8 entries - L1 Data cache: 16K Bytes, 2-way set associati
ve, 32 byte line size - Unified L2 cache: 128K Bytes, 4-way set associative, 32
byte line size<---- Matrix cpuID 007 is waiting for a key to exit ---->
    
Post 13 Nov 2004, 06:40
View user's profile Send private message Visit poster's website Reply with quote
Matrix



Joined: 04 Sep 2004
Posts: 1171
Location: Overflow
Matrix
there must be a way to get full priority on windows, i just ran a sciencemark.org program, it measured my cpu's frequency in a few secs with a 0.4 MHz accuracy, Sad but it played freezing on my system when i wanted to benchmark. maeby assumes i have 3d now?

well, S.T.A.S.,
here's AMD 's optimized memcopy, it seems very good to me, if you could get more process priority, it whould be nice to put it in your testprog

ps.: here'h how to calculate memory speeds:
its from here
Code:
The terms PC800 and PC266 are misleading as they only tell part of the story. PC800 is used by Intel and Rambus to indicate the 800 MHz peak data transfer rate for Direct Rambus memory technology. But a Rambus module is only 2 bytes wide while an SDRAM module is 8 bytes wide.

The formula for peak bandwidth is PBW = Peak_Data_Rate x Data_Bus_Width.


The peak bandwidth of PC-800 Rambus module = 800 MHz x 2 Bytes = 1.6GB/s.


The peak bandwidth for PC-100 SDRAM =100 MHz x 8 Bytes = 800MB/s, exactly half that of the PC-800 Rambus module. 
PC-266 DDR has a peak data transfer rate of 266 MHz and like SDRAM module, a DDR module has an 8 byte wide bus. Therefore, the peak bandwidth for PC-266 DDR is 266 MHz x 8 Bytes = 2.1GB/s, about 30% higher than PC-800 RDRAM. The actual system performance of a different memory technology is much more complicated than simply comparing peek bandwidth.
    


Description: its AMD optimized memcopy, you can use it for anything (not just AMD optimized, uses MMX)
Download
Filename: memcpy_amd.zip
Filesize: 7.86 KB
Downloaded: 513 Time(s)

Post 13 Nov 2004, 21:17
View user's profile Send private message Visit poster's website Reply with quote
S.T.A.S.



Joined: 09 Jan 2004
Posts: 173
Location: Ru#27
S.T.A.S.
Quote:
PC-266 DDR has a peak data transfer rate of 266 MHz and like SDRAM module, a DDR module has an 8 byte wide bus. Therefore, the peak bandwidth for PC-266 DDR is 266 MHz x 8 Bytes = 2.1GB/s
Well, 1858 Mb/sec on my CPU is ~88% of theoretical (read: unachievable) limit.
Post 13 Nov 2004, 23:21
View user's profile Send private message Reply with quote
Matrix



Joined: 04 Sep 2004
Posts: 1171
Location: Overflow
Matrix
well i'd like to see that limit on your computer achieved Smile
can't you boost the priority to realtime somehow?
that AMD code is almost 100%, but i think i could still make it a bit faster.

what do you think? what is the max your ram can do? 95% of theoretical (read: unachievable) limit?
Post 14 Nov 2004, 01:28
View user's profile Send private message Visit poster's website Reply with quote
MCD



Joined: 21 Aug 2004
Posts: 604
Location: Germany
MCD
The following piece of code is the fastest copying algorithm on a PentiumIII+ CPU Crying or Very sad (sorry older PCs).
Furthermore, it needs data alignments to 128bytes. (8 dqwords)
It works best if the destination is not frequently accessed afterwards (because movntps is a non-temporal store; otherwise replace movntps with movaps), so use it for stuff like final display/sound buffer copying (double/triple buffering) and when copying really large blocks of memory.

Code:
;Copy a block in 128-byte chunks using eight SSE registers and
;non-temporal stores. This is a fall-through fragment: execution
;continues at copy_0_bytes when done.
;esi:    ptr to source
;edi:    ptr to destination
;ecx:    bytes to copy, mod 128 = 0 (any remainder is dropped by the
;        "and" below, not copied)
;On exit: ecx = 0, esi/edi advanced by the rounded byte count,
;         xmm0-xmm7 overwritten.
;movaps/movntps fault on addresses that are not 16-byte aligned, so both
;pointers must be at least 16-byte aligned.
;NOTE(review): non-temporal stores are weakly ordered; if another agent
;reads the destination right away an sfence after the loop may be
;needed - confirm for the intended use.
and     cl,80h                  ;clear bits 0..6 of ecx: round down to a multiple of 128
jecxz   copy_0_bytes            ;nothing (left) to copy
add     esi,ecx                 ;point both registers at the end of the block...
add     edi,ecx
neg     ecx                     ;...and index it with a negative offset that counts up to 0

copy_loop:
movaps  xmm0,[esi+ecx]
movaps  xmm1,[esi+ecx+10h]
movaps  xmm2,[esi+ecx+20h]
movaps  xmm3,[esi+ecx+30h]
movaps  xmm4,[esi+ecx+40h]
movaps  xmm5,[esi+ecx+50h]
movaps  xmm6,[esi+ecx+60h]
movaps  xmm7,[esi+ecx+70h]
movntps  [edi+ecx],xmm0
movntps  [edi+ecx+10h],xmm1
movntps  [edi+ecx+20h],xmm2
movntps  [edi+ecx+30h],xmm3
movntps  [edi+ecx+40h],xmm4
movntps  [edi+ecx+50h],xmm5
movntps  [edi+ecx+60h],xmm6
movntps  [edi+ecx+70h],xmm7
add     ecx,80h                 ;next 128-byte chunk; ZF set when the offset reaches 0
jnz     copy_loop

copy_0_bytes:
    


Just a thing: Moving data with the FPU is usually MUCH slower than with integer instructions. Furthermore, many CPUs I tested don't like string and loop instructions in general for tight loops. They are all 10% to 50% slower than their integer counterparts. I don't know exactly why, but I guess this is because of the additional CISC-to-RISC microcode decomposition. Question

Another thing, data shouldn't be accessed downwards (in large blocks), this is really bad for the cache, since most caches assumes increasing data accesses. => Idea try avoiding "dec ecx/loops" for the pointer-counter.

_________________
MCD - the inevitable return of the Mad Computer Doggy

-||__/
.|+-~
.|| ||


Last edited by MCD on 29 Nov 2004, 17:10; edited 1 time in total
Post 25 Nov 2004, 19:41
View user's profile Send private message Reply with quote
MCD



Joined: 21 Aug 2004
Posts: 604
Location: Germany
MCD
If you need to find out how many CPU cycles your stuff actually took, try this (for DOS, but it should easily be converted to the Win32 API)

Code:
;This is a little template-program that calculates the time the CPU actually
;takes for executing any specified code.
;It simply reads out the 64bit Time Stamp Counter-registers at the beginning
;and after end of the code and displays the difference as a hex value to the
;console.
;Since this prog uses the RDTSC-instruction, it requires a Pentium+ CPU.
;
;Layout: DOS .COM program (org 100h). The far jumps are assembled with
;segment 0; the first four stores patch the real code segment into the last
;word of each "jmp 0:label" (the word directly in front of the label).
;Output: 16 hex digits, high dword first, then the program ends via int 20h.

org 100h

        mov     ax,cs
        mov     [cs:Calibrate_1-2],ax   ;patch segment of "jmp 0:Calibrate_1"
        mov     [cs:Calibrate_2-2],ax
        mov     [cs:Benchmark_1-2],ax
        mov     [cs:Benchmark_2-2],ax
        finit
;Probably does nothing but prevent this code from running under Windows ;)
        wbinvd                          ;Write back all cached memory
;Calibrate TSC
        xor     bl,bl                   ;bl = 0 -> dec/jnz runs the loop 256 times

TSCCalibrateLoop:
        rdtsc
        mov     esi,eax                 ;edi:esi = start time stamp
        mov     edi,edx
        jmp     0:Calibrate_1           ;Far jump to prevent parallelism
Calibrate_1:
        jmp     0:Calibrate_2
Calibrate_2:
        rdtsc
        sub     eax,esi                 ;edx:eax = elapsed clocks
        sbb     edx,edi
        dec     bl
        jnz     TSCCalibrateLoop        ;only the last pass is kept
;This is the # of CPU-clocks nothing will take, used to adjust the actual
;CPU-clock measuring later
        mov     [TSCAdj],eax
        mov     [TSCAdj+4],edx          ;high dword is 4 bytes up (was +2, which
                                        ;overwrote the upper half of the low dword)
;Actual TSC
        xor     bl,bl

TSCLoop:
        rdtsc
        mov     esi,eax
        mov     edi,edx
        jmp     0:Benchmark_1
Benchmark_1:
;----------------------------------------------------------------------------
; Put whatever stuff you want to benchmark between these two separators.
; All registers except bl,esi and edi may be modified (else change the code).
; This benchmark is very precise since:
;       - it uses a differential time-calculation method and
;       - far jumps to prevent most (any?) parallel executions and other
;         speed optimization before and after these lines and
;       - no memory is accessed (unless you add it) in the time stopping
;         process since this makes time calculation _REALLY_ unreliable and
; Because of this, it is perfect for calculating the time singles or clusters
; of instructions actually take. This can be a huge help when writing speed
; optimized code.
;----------------------------------------------------------------------------
        jmp     0:Benchmark_2
Benchmark_2:
        rdtsc
        sub     eax,esi
        sbb     edx,edi
        dec     bl
        jnz     TSCLoop

        sub     eax,[TSCAdj]            ;subtract the calibrated overhead
        sbb     edx,[TSCAdj+4]          ;(was +2, see the store above)
        xchg    eax,edx                 ;High DWord first
        call    OHexD                   ;destroys eax; edx keeps the low dword
        mov     eax,edx
        call    OHexD
        int     20h

TSCAdj       dd ?,?                     ;64-bit overhead: low dword, high dword

;OHexB - write one byte to the console as two hexadecimal digits.
;In:   al = byte to output
;Out:  ax and dx restored; bx is overwritten with the address of HeXlat
OHexB:
        push    ax
        push    dx
        mov     bx,HeXlat               ;xlatb translates al through [bx+al]
        mov     dh,al                   ;keep the byte for the second digit
        shr     al,4                    ;high nibble first
        xlatb
        mov     dl,al
        mov     ah,2                    ;int 21h function 2: output character in dl
        int     21h
        mov     al,dh
        and     al,0Fh                  ;low nibble
        xlatb
        mov     dl,al
        int     21h                     ;relies on ah still being 2 after the first call
        pop     dx
        pop     ax
        ret
;OHexD - write a dword to the console as eight hexadecimal digits,
;        most significant byte first.
;In:   eax = dword to output
;Out:  eax is destroyed; bx is overwritten by OHexB
OHexD:
        bswap   eax                     ;al = former bits 31..24, ah = bits 23..16
        call    OHexB
        mov     al,ah
        call    OHexB
        shr     eax,16                  ;al = former bits 15..8, ah = bits 7..0
        call    OHexB
        mov     al,ah
        call    OHexB
        ret

;Translation table used by xlatb in OHexB: nibble value 0..15 -> ASCII hex digit
HeXlat       db "0123456789ABCDEF"    

_________________
MCD - the inevitable return of the Mad Computer Doggy

-||__/
.|+-~
.|| ||


Last edited by MCD on 29 Nov 2004, 17:09; edited 1 time in total
Post 25 Nov 2004, 19:57
View user's profile Send private message Reply with quote
Display posts from previous:
Post new topic Reply to topic

Jump to:  
Goto page 1, 2  Next

< Last Thread | Next Thread >
Forum Rules:
You cannot post new topics in this forum
You cannot reply to topics in this forum
You cannot edit your posts in this forum
You cannot delete your posts in this forum
You cannot vote in polls in this forum
You cannot attach files in this forum
You can download files in this forum


Copyright © 1999-2019, Tomasz Grysztar.

Powered by rwasa.