flat assembler
Message board for the users of flat assembler.

Index > Windows > Speed test

Author
Thread Post new topic Reply to topic
A$M



Joined: 29 Feb 2012
Posts: 94
A$M
Hello!

I was in doubt about the speed of the FPU. So I did a little test. Here it is:
Code:
format PE CONSOLE 4.0

include 'win32a.inc'

 cinvoke system, _title
 cinvoke system, _pause

 cinvoke printf, _nl

 cinvoke printf, _test1
 mov ecx, 1000000000
 invoke GetTickCount
 mov ebx, eax
@@:
 add ecx, edx
 loop @b
 invoke GetTickCount
 sub eax, ebx
 cinvoke printf, _done, eax

 cinvoke printf, _test2
 mov ecx, 1000000000
 invoke GetTickCount
 mov ebx, eax
@@:
 sub ecx, edx
 loop @b
 invoke GetTickCount
 sub eax, ebx
 cinvoke printf, _done, eax

 cinvoke printf, _test3
 mov ecx, 1000000000
 invoke GetTickCount
 mov ebx, eax
@@:
 imul ebx
 loop @b
 invoke GetTickCount
 sub eax, ebx
 cinvoke printf, _done, eax

 cinvoke printf, _test4
 mov edx, 0
 mov ecx, 1000000000
 invoke GetTickCount
 mov ebx, eax
 mov eax, 0
@@:
 idiv ebx
 loop @b
 invoke GetTickCount
 sub eax, ebx
 cinvoke printf, _done, eax

 cinvoke printf, _nl

 cinvoke printf, _test5
 mov ecx, 1000000000
 invoke GetTickCount
 mov ebx, eax
@@:
 mov al, [_test1]
 loop @b
 invoke GetTickCount
 sub eax, ebx
 cinvoke printf, _done, eax

 cinvoke printf, _test6
 mov ecx, 1000000000
 invoke GetTickCount
 mov ebx, eax
@@:
 mov ax, word[_test1]
 loop @b
 invoke GetTickCount
 sub eax, ebx
 cinvoke printf, _done, eax

 cinvoke printf, _test7
 mov ecx, 1000000000
 invoke GetTickCount
 mov ebx, eax
@@:
 mov eax, dword[_test1]
 loop @b
 invoke GetTickCount
 sub eax, ebx
 cinvoke printf, _done, eax

 cinvoke printf, _nl

 cinvoke printf, _test8
 finit
 mov ecx, 1000000000
 invoke GetTickCount
 mov ebx, eax
 fld [_float]
 fld [_float]
@@:
 fadd st0, st1
 loop @b
 invoke GetTickCount
 sub eax, ebx
 cinvoke printf, _done, eax

 cinvoke printf, _test9
 finit
 mov ecx, 1000000000
 invoke GetTickCount
 mov ebx, eax
 fld [_double]
 fld [_double]
@@:
 fadd st0, st1
 loop @b
 invoke GetTickCount
 sub eax, ebx
 cinvoke printf, _done, eax

 cinvoke printf, _test10
 finit
 mov ecx, 1000000000
 invoke GetTickCount
 mov ebx, eax
 fld [_longd]
 fld [_longd]
@@:
 fadd st0, st1
 loop @b
 invoke GetTickCount
 sub eax, ebx
 cinvoke printf, _done, eax

 cinvoke printf, _nl

 cinvoke printf, _test11
 finit
 mov ecx, 1000000000
 invoke GetTickCount
 mov ebx, eax
 fld [_float]
 fld [_float]
@@:
 fsub st0, st1
 loop @b
 invoke GetTickCount
 sub eax, ebx
 cinvoke printf, _done, eax

 cinvoke printf, _test12
 finit
 mov ecx, 1000000000
 invoke GetTickCount
 mov ebx, eax
 fld [_double]
 fld [_double]
@@:
 fsub st0, st1
 loop @b
 invoke GetTickCount
 sub eax, ebx
 cinvoke printf, _done, eax

 cinvoke printf, _test13
 finit
 mov ecx, 1000000000
 invoke GetTickCount
 mov ebx, eax
 fld [_longd]
 fld [_longd]
@@:
 fsub st0, st1
 loop @b
 invoke GetTickCount
 sub eax, ebx
 cinvoke printf, _done, eax

 cinvoke printf, _nl

 cinvoke printf, _test14
 finit
 mov ecx, 10000000
 invoke GetTickCount
 mov ebx, eax
 fld [_float]
 fld [_float]
@@:
 fmul st0, st1
 loop @b
 invoke GetTickCount
 sub eax, ebx
 cinvoke printf, _done2, eax

 cinvoke printf, _test15
 finit
 mov ecx, 10000000
 invoke GetTickCount
 mov ebx, eax
 fld [_double]
 fld [_double]
@@:
 fmul st0, st1
 loop @b
 invoke GetTickCount
 sub eax, ebx
 cinvoke printf, _done2, eax

 cinvoke printf, _test16
 finit
 mov ecx, 10000000
 invoke GetTickCount
 mov ebx, eax
 fld [_longd]
 fld [_longd]
@@:
 fmul st0, st1
 loop @b
 invoke GetTickCount
 sub eax, ebx
 cinvoke printf, _done2, eax

 cinvoke printf, _nl

 cinvoke printf, _test17
 finit
 mov ecx, 10000000
 invoke GetTickCount
 mov ebx, eax
 fld [_float]
 fld [_float]
@@:
 fdiv st0, st1
 loop @b
 invoke GetTickCount
 sub eax, ebx
 cinvoke printf, _done2, eax

 cinvoke printf, _test18
 finit
 mov ecx, 10000000
 invoke GetTickCount
 mov ebx, eax
 fld [_double]
 fld [_double]
@@:
 fdiv st0, st1
 loop @b
 invoke GetTickCount
 sub eax, ebx
 cinvoke printf, _done2, eax

 cinvoke printf, _test19
 finit
 mov ecx, 10000000
 invoke GetTickCount
 mov ebx, eax
 fld [_longd]
 fld [_longd]
@@:
 fdiv st0, st1
 loop @b
 invoke GetTickCount
 sub eax, ebx
 cinvoke printf, _done2, eax

 cinvoke printf, _nl
 cinvoke system, _pause
 invoke ExitProcess, 0

_title  db "TITLE Speed Test",0
_pause  db "PAUSE",0

_test1  db "LONG INT ADD... ",0
_test2  db "LONG INT SUBTRACTION... ",0
_test3  db "LONG INT MULTIPLICATION... ",0
_test4  db "LONG INT DIVISION... ",0
_test5  db "MOVE BYTE/CHAR IN MEMORY... ",0
_test6  db "MOVE WORD/INT IN MEMORY... ",0
_test7  db "MOVE DOUBLE WORD/LONG INT IN MEMORY... ",0
_test8  db "FLOAT ADD... ",0
_test9  db "DOUBLE ADD... ",0
_test10 db "LONG DOUBLE ADD... ",0
_test11 db "FLOAT SUBTRACTION... ",0
_test12 db "DOUBLE SUBTRACTION... ",0
_test13 db "LONG DOUBLE SUBTRACTION... ",0
_test14 db "FLOAT MULTIPLICATION... ",0
_test15 db "DOUBLE MULTIPLICATION... ",0
_test16 db "LONG DOUBLE MULTIPLICATION... ",0
_test17 db "FLOAT DIVISION... ",0
_test18 db "DOUBLE DIVISION... ",0
_test19 db "LONG DOUBLE DIVISION... ",0
_done   db "1 billion operations made in %i milliseconds.",10,13,0
_done2  db "10 million operations made in %i milliseconds.",10,13,0
_failed db "Failed!"
_nl     db 10,13,0

_float  dd 1.2345
_double dq 1.2345
_longd  dt 1.2345

data import

 library kernel32,'KERNEL32.DLL',\
         crtdll,'CRTDLL.DLL'

 import kernel32,\
        GetTickCount,'GetTickCount',\
        ExitProcess,'ExitProcess'

 import crtdll,\
        printf,'printf',\
        system,'system'

end data
    


Before running, set the battery level to High Performance and close music, videos and games that are running to avoid noise. Run the test and preferably do not use your computer while the test is done.

The resulting values ​​are for comparison with others.
On my computer (Intel(R) Core(TM) i3-2370M CPU @ 2.40GHz | 6 GB) the results were:
Quote:
Pressione qualquer tecla para continuar. . .

LONG INT ADD... 1 billion operations made in 2094 milliseconds.
LONG INT SUBTRACTION... 1 billion operations made in 2109 milliseconds.
LONG INT MULTIPLICATION... 1 billion operations made in 2094 milliseconds.
LONG INT DIVISION... 1 billion operations made in 7969 milliseconds.

MOVE BYTE/CHAR IN MEMORY... 1 billion operations made in 2094 milliseconds.
MOVE WORD/INT IN MEMORY... 1 billion operations made in 2093 milliseconds.
MOVE DOUBLE WORD/LONG INT IN MEMORY... 1 billion operations made in 2094 millise
conds.

FLOAT ADD... 1 billion operations made in 2531 milliseconds.
DOUBLE ADD... 1 billion operations made in 2094 milliseconds.
LONG DOUBLE ADD... 1 billion operations made in 2094 milliseconds.

FLOAT SUBTRACTION... 1 billion operations made in 2094 milliseconds.
DOUBLE SUBTRACTION... 1 billion operations made in 2093 milliseconds.
LONG DOUBLE SUBTRACTION... 1 billion operations made in 2094 milliseconds.

FLOAT MULTIPLICATION... 10 million operations made in 1422 milliseconds.
DOUBLE MULTIPLICATION... 10 million operations made in 1422 milliseconds.
LONG DOUBLE MULTIPLICATION... 10 million operations made in 1422 milliseconds.

FLOAT DIVISION... 10 million operations made in 2937 milliseconds.
DOUBLE DIVISION... 10 million operations made in 2938 milliseconds.
LONG DOUBLE DIVISION... 10 million operations made in 2953 milliseconds.

Pressione qualquer tecla para continuar. . .


Most of these values ​​has a little or a lot of noise, but they can be compared with each other. Draw your own conclusions. Wink
Post 07 Nov 2013, 14:26
View user's profile Send private message Reply with quote
revolution
When all else fails, read the source


Joined: 24 Aug 2004
Posts: 17271
Location: In your JS exploiting you and your system
revolution
Your float timings for single/double/extended are all comparing the same operation. You will need to change the control word (FLDCW) to set the precision. And your memory read tests are only testing the cache timing for L1.
Post 07 Nov 2013, 16:41
View user's profile Send private message Visit poster's website Reply with quote
HaHaAnonymous



Joined: 02 Dec 2012
Posts: 1180
Location: Unknown
HaHaAnonymous
[ Post removed by author. ]


Last edited by HaHaAnonymous on 28 Feb 2015, 19:10; edited 1 time in total
Post 07 Nov 2013, 17:48
View user's profile Send private message Reply with quote
A$M



Joined: 29 Feb 2012
Posts: 94
A$M
revolution wrote:
Your float timings for single/double/extended are all comparing the same operation. You will need to change the control word (FLDCW) to set the precision. And your memory read tests are only testing the cache timing for L1.


The new code:
Code:
format PE CONSOLE 4.0

include 'win32a.inc'

 cinvoke system, _title
 cinvoke system, _pause

 cinvoke printf, _nl

 cinvoke printf, _test1
 mov ecx, 1000000000
 invoke GetTickCount
 mov ebx, eax
@@:
 add ecx, edx
 loop @b
 invoke GetTickCount
 sub eax, ebx
 cinvoke printf, _done, eax

 cinvoke printf, _test2
 mov ecx, 1000000000
 invoke GetTickCount
 mov ebx, eax
@@:
 sub ecx, edx
 loop @b
 invoke GetTickCount
 sub eax, ebx
 cinvoke printf, _done, eax

 cinvoke printf, _test3
 mov ecx, 1000000000
 invoke GetTickCount
 mov ebx, eax
@@:
 imul ebx
 loop @b
 invoke GetTickCount
 sub eax, ebx
 cinvoke printf, _done, eax

 cinvoke printf, _test4
 mov edx, 0
 mov ecx, 1000000000
 invoke GetTickCount
 mov ebx, eax
 mov eax, 0
@@:
 idiv ebx
 loop @b
 invoke GetTickCount
 sub eax, ebx
 cinvoke printf, _done, eax

 cinvoke printf, _nl

 cinvoke printf, _test5
 mov ecx, 1000000000
 invoke GetTickCount
 mov ebx, eax
@@:
 mov al, [_test1]
 loop @b
 invoke GetTickCount
 sub eax, ebx
 cinvoke printf, _done, eax

 cinvoke printf, _test6
 mov ecx, 1000000000
 invoke GetTickCount
 mov ebx, eax
@@:
 mov ax, word[_test1]
 loop @b
 invoke GetTickCount
 sub eax, ebx
 cinvoke printf, _done, eax

 cinvoke printf, _test7
 mov ecx, 1000000000
 invoke GetTickCount
 mov ebx, eax
@@:
 mov eax, dword[_test1]
 loop @b
 invoke GetTickCount
 sub eax, ebx
 cinvoke printf, _done, eax

 cinvoke printf, _nl

 cinvoke printf, _test8
 finit
 fstcw [_cword]
 xor [_cword], 1100000000b
 fldcw [_cword]
 mov ecx, 1000000000
 invoke GetTickCount
 mov ebx, eax
 fld [_float]
 fld [_float]
@@:
 fadd st0, st1
 loop @b
 invoke GetTickCount
 sub eax, ebx
 cinvoke printf, _done, eax

 cinvoke printf, _test9
 finit
 fstcw [_cword]
 xor [_cword], 100000000b
 fldcw [_cword]
 mov ecx, 1000000000
 invoke GetTickCount
 mov ebx, eax
 fld [_double]
 fld [_double]
@@:
 fadd st0, st1
 loop @b
 invoke GetTickCount
 sub eax, ebx
 cinvoke printf, _done, eax

 cinvoke printf, _test10
 finit
 mov ecx, 1000000000
 invoke GetTickCount
 mov ebx, eax
 fld [_longd]
 fld [_longd]
@@:
 fadd st0, st1
 loop @b
 invoke GetTickCount
 sub eax, ebx
 cinvoke printf, _done, eax

 cinvoke printf, _nl

 cinvoke printf, _test11
 finit
 fstcw [_cword]
 xor [_cword], 1100000000b
 fldcw [_cword]
 mov ecx, 1000000000
 invoke GetTickCount
 mov ebx, eax
 fld [_float]
 fld [_float]
@@:
 fsub st0, st1
 loop @b
 invoke GetTickCount
 sub eax, ebx
 cinvoke printf, _done, eax

 cinvoke printf, _test12
 finit
 fstcw [_cword]
 xor [_cword], 100000000b
 fldcw [_cword]
 mov ecx, 1000000000
 invoke GetTickCount
 mov ebx, eax
 fld [_double]
 fld [_double]
@@:
 fsub st0, st1
 loop @b
 invoke GetTickCount
 sub eax, ebx
 cinvoke printf, _done, eax

 cinvoke printf, _test13
 finit
 mov ecx, 1000000000
 invoke GetTickCount
 mov ebx, eax
 fld [_longd]
 fld [_longd]
@@:
 fsub st0, st1
 loop @b
 invoke GetTickCount
 sub eax, ebx
 cinvoke printf, _done, eax

 cinvoke printf, _nl

 cinvoke printf, _test14
 finit
 fstcw [_cword]
 xor [_cword], 1100000000b
 fldcw [_cword]
 mov ecx, 10000000
 invoke GetTickCount
 mov ebx, eax
 fld [_float]
 fld [_float]
@@:
 fmul st0, st1
 loop @b
 invoke GetTickCount
 sub eax, ebx
 cinvoke printf, _done2, eax

 cinvoke printf, _test15
 finit
 fstcw [_cword]
 xor [_cword], 100000000b
 fldcw [_cword]
 mov ecx, 10000000
 invoke GetTickCount
 mov ebx, eax
 fld [_double]
 fld [_double]
@@:
 fmul st0, st1
 loop @b
 invoke GetTickCount
 sub eax, ebx
 cinvoke printf, _done2, eax

 cinvoke printf, _test16
 finit
 mov ecx, 10000000
 invoke GetTickCount
 mov ebx, eax
 fld [_longd]
 fld [_longd]
@@:
 fmul st0, st1
 loop @b
 invoke GetTickCount
 sub eax, ebx
 cinvoke printf, _done2, eax

 cinvoke printf, _nl

 cinvoke printf, _test17
 finit
 fstcw [_cword]
 xor [_cword], 1100000000b
 fldcw [_cword]
 mov ecx, 10000000
 invoke GetTickCount
 mov ebx, eax
 fld [_float]
 fld [_float]
@@:
 fdiv st0, st1
 loop @b
 invoke GetTickCount
 sub eax, ebx
 cinvoke printf, _done2, eax

 cinvoke printf, _test18
 finit
 fstcw [_cword]
 xor [_cword], 100000000b
 fldcw [_cword]
 mov ecx, 10000000
 invoke GetTickCount
 mov ebx, eax
 fld [_double]
 fld [_double]
@@:
 fdiv st0, st1
 loop @b
 invoke GetTickCount
 sub eax, ebx
 cinvoke printf, _done2, eax

 cinvoke printf, _test19
 finit
 mov ecx, 10000000
 invoke GetTickCount
 mov ebx, eax
 fld [_longd]
 fld [_longd]
@@:
 fdiv st0, st1
 loop @b
 invoke GetTickCount
 sub eax, ebx
 cinvoke printf, _done2, eax

 cinvoke printf, _nl
 cinvoke system, _pause
 invoke ExitProcess, 0

_title  db "TITLE Speed Test",0
_pause  db "PAUSE",0

_test1  db "LONG INT ADD... ",0
_test2  db "LONG INT SUBTRACTION... ",0
_test3  db "LONG INT MULTIPLICATION... ",0
_test4  db "LONG INT DIVISION... ",0
_test5  db "MOVE BYTE/CHAR IN MEMORY... ",0
_test6  db "MOVE WORD/INT IN MEMORY... ",0
_test7  db "MOVE DOUBLE WORD/LONG INT IN MEMORY... ",0
_test8  db "FLOAT ADD... ",0
_test9  db "DOUBLE ADD... ",0
_test10 db "LONG DOUBLE ADD... ",0
_test11 db "FLOAT SUBTRACTION... ",0
_test12 db "DOUBLE SUBTRACTION... ",0
_test13 db "LONG DOUBLE SUBTRACTION... ",0
_test14 db "FLOAT MULTIPLICATION... ",0
_test15 db "DOUBLE MULTIPLICATION... ",0
_test16 db "LONG DOUBLE MULTIPLICATION... ",0
_test17 db "FLOAT DIVISION... ",0
_test18 db "DOUBLE DIVISION... ",0
_test19 db "LONG DOUBLE DIVISION... ",0
_done   db "1 billion operations made in %i milliseconds.",10,13,0
_done2  db "10 million operations made in %i milliseconds.",10,13,0
_failed db "Failed!"
_nl     db 10,13,0

_cword  dw ?
_float  dd 1.2345
_double dq 1.2345
_longd  dt 1.2345

data import

 library kernel32,'KERNEL32.DLL',\
         crtdll,'CRTDLL.DLL'

 import kernel32,\
        GetTickCount,'GetTickCount',\
        ExitProcess,'ExitProcess'

 import crtdll,\
        printf,'printf',\
        system,'system'

end data
    

New results, but only division really change:
Quote:
Pressione qualquer tecla para continuar. . .

LONG INT ADD... 1 billion operations made in 2094 milliseconds.
LONG INT SUBTRACTION... 1 billion operations made in 2093 milliseconds.
LONG INT MULTIPLICATION... 1 billion operations made in 2094 milliseconds.
LONG INT DIVISION... 1 billion operations made in 7953 milliseconds.

MOVE BYTE/CHAR IN MEMORY... 1 billion operations made in 2094 milliseconds.
MOVE WORD/INT IN MEMORY... 1 billion operations made in 2094 milliseconds.
MOVE DOUBLE WORD/LONG INT IN MEMORY... 1 billion operations made in 2094 millise
conds.

FLOAT ADD... 1 billion operations made in 2093 milliseconds.
DOUBLE ADD... 1 billion operations made in 2110 milliseconds.
LONG DOUBLE ADD... 1 billion operations made in 2093 milliseconds.

FLOAT SUBTRACTION... 1 billion operations made in 2094 milliseconds.
DOUBLE SUBTRACTION... 1 billion operations made in 2094 milliseconds.
LONG DOUBLE SUBTRACTION... 1 billion operations made in 2094 milliseconds.

FLOAT MULTIPLICATION... 10 million operations made in 1422 milliseconds.
DOUBLE MULTIPLICATION... 10 million operations made in 1406 milliseconds.
LONG DOUBLE MULTIPLICATION... 10 million operations made in 1422 milliseconds.

FLOAT DIVISION... 10 million operations made in 2906 milliseconds.
DOUBLE DIVISION... 10 million operations made in 2922 milliseconds.
LONG DOUBLE DIVISION... 10 million operations made in 3000 milliseconds.

Pressione qualquer tecla para continuar. . .
It's ok now?
MOVE (...) IN MEMORY tests are not important (for me).

PS.: Please, post your results. Wink
Post 07 Nov 2013, 23:52
View user's profile Send private message Reply with quote
revolution
When all else fails, read the source


Joined: 24 Aug 2004
Posts: 17271
Location: In your JS exploiting you and your system
revolution
When you divide zero (or small numbers) by something there is special circuitry that will complete the instruction quickly. You might want to ensure that the numerator is always non-zero to get a better idea of the long execution nature of division.
Post 08 Nov 2013, 00:24
View user's profile Send private message Visit poster's website Reply with quote
HaHaAnonymous



Joined: 02 Dec 2012
Posts: 1180
Location: Unknown
HaHaAnonymous
[ Post removed by author. ]


Last edited by HaHaAnonymous on 28 Feb 2015, 19:09; edited 1 time in total
Post 08 Nov 2013, 12:46
View user's profile Send private message Reply with quote
revolution
When all else fails, read the source


Joined: 24 Aug 2004
Posts: 17271
Location: In your JS exploiting you and your system
revolution
HaHaAnonymous what is your CPU?
Post 08 Nov 2013, 12:49
View user's profile Send private message Visit poster's website Reply with quote
HaHaAnonymous



Joined: 02 Dec 2012
Posts: 1180
Location: Unknown
HaHaAnonymous
[ Post removed by author. ]


Last edited by HaHaAnonymous on 28 Feb 2015, 19:09; edited 1 time in total
Post 08 Nov 2013, 13:36
View user's profile Send private message Reply with quote
revolution
When all else fails, read the source


Joined: 24 Aug 2004
Posts: 17271
Location: In your JS exploiting you and your system
revolution
The bug is here:
Code:
mov ecx, 10000000
invoke GetTickCount    
The value in ECX is unknown after the invoke.
Post 08 Nov 2013, 13:48
View user's profile Send private message Visit poster's website Reply with quote
cod3b453



Joined: 25 Aug 2004
Posts: 619
cod3b453
The FPU only differentiates between single/double/extended for load/stores; arithmetic has the same internal precision so there should be no difference.

Also it is likely that the tight loop is the dominating factor and preventing maximum potential pipeline utilisation - I believe Intel CPUs should be able to perform two non-interlocked/dependent adds in the same cycle time as one mul for example Question (I may be wrong)
Post 08 Nov 2013, 18:25
View user's profile Send private message Reply with quote
Xorpd!



Joined: 21 Dec 2006
Posts: 161
Xorpd!
Your processor is capable of something like 38.4 GFLOPS at double precision. As revolution points out above you have to stop clobbering ecx (among other things) to get any information at all out of your test.
Post 10 Nov 2013, 09:05
View user's profile Send private message Visit poster's website Reply with quote
Display posts from previous:
Post new topic Reply to topic

Jump to:  


< Last Thread | Next Thread >
Forum Rules:
You cannot post new topics in this forum
You cannot reply to topics in this forum
You cannot edit your posts in this forum
You cannot delete your posts in this forum
You cannot vote in polls in this forum
You cannot attach files in this forum
You can download files in this forum


Copyright © 1999-2020, Tomasz Grysztar.

Powered by rwasa.