flat assembler
Message board for the users of flat assembler.

Index > Main > AMD Optimization 32t/64 bit

Author
Thread Post new topic Reply to topic
r22



Joined: 27 Dec 2004
Posts: 805
r22 27 Nov 2005, 00:18
Here's a code padding macro for AMD code.
Using the macro to properly align branches is supposedly optimal performance-wise.

Code:
; AMDPad16 - fill the gap up to the next 16-byte boundary with NOPs.
; The virtual block measures how many bytes "align 16" would emit at the
; current position without emitting them; that count is left in the
; assembly-time variable "a" and selects the filler below.
; Each db line is one NOP instruction: 90h preceded by up to three 66h
; prefixes. A count of 0 emits nothing.
macro AMDPad16
{
    virtual
        align 16
        a = $-$$
    end virtual
    if a=1
       db 90h
    else if a=2
       db 66h,90h
    else if a=3
       db 66h,66h,90h
    else if a=4
       db 66h,66h,66h,90h
    else if a=5
       db 66h,66h,90h
       db 66h,90h
    else if a=6
       db 66h,66h,90h
       db 66h,66h,90h
    else if a=7
       db 66h,66h,66h,90h
       db 66h,66h,90h
    else if a=8
       db 66h,66h,66h,90h
       db 66h,66h,66h,90h
    else if a=9
       db 66h,66h,90h
       db 66h,66h,90h
       db 66h,66h,90h
    else if a=10
       db 66h,66h,66h,90h
       db 66h,66h,90h
       db 66h,66h,90h
    else if a=11
       db 66h,66h,66h,90h
       db 66h,66h,66h,90h
       db 66h,66h,90h
    else if a=12
       db 66h,66h,66h,90h
       db 66h,66h,66h,90h
       db 66h,66h,66h,90h
    else if a=13
       db 66h,66h,66h,90h
       db 66h,66h,90h
       db 66h,66h,90h
       db 66h,66h,90h
    else if a=14
       db 66h,66h,66h,90h
       db 66h,66h,66h,90h
       db 66h,66h,90h
       db 66h,66h,90h
    else if a=15
       db 66h,66h,66h,90h
       db 66h,66h,66h,90h
       db 66h,66h,66h,90h
       db 66h,66h,90h
    end if
}
    

If you know a shorter way of representing this macro it would be welcome; I just can't think of one.
Also, the AMD optimization manual only goes up to a 9-byte pad, so for 10-15 I'm just guessing about the 66h and 90h sequences.


Here's an example program that uses the macro (its a win64 program).
Code:
format PE64 console
entry start

; AMDPad16 - fill the gap up to the next 16-byte boundary with NOPs.
; The virtual block measures how many bytes "align 16" would emit at the
; current position without emitting them; that count is left in the
; assembly-time variable "a" and selects the filler below.
; Each db line is one NOP instruction: 90h preceded by up to three 66h
; prefixes. A count of 0 emits nothing.
macro AMDPad16
{
    virtual
        align 16
        a = $-$$
    end virtual
    if a=1
       db 90h
    else if a=2
       db 66h,90h
    else if a=3
       db 66h,66h,90h
    else if a=4
       db 66h,66h,66h,90h
    else if a=5
       db 66h,66h,90h
       db 66h,90h
    else if a=6
       db 66h,66h,90h
       db 66h,66h,90h
    else if a=7
       db 66h,66h,66h,90h
       db 66h,66h,90h
    else if a=8
       db 66h,66h,66h,90h
       db 66h,66h,66h,90h
    else if a=9
       db 66h,66h,90h
       db 66h,66h,90h
       db 66h,66h,90h
    else if a=10
       db 66h,66h,66h,90h
       db 66h,66h,90h
       db 66h,66h,90h
    else if a=11
       db 66h,66h,66h,90h
       db 66h,66h,66h,90h
       db 66h,66h,90h
    else if a=12
       db 66h,66h,66h,90h
       db 66h,66h,66h,90h
       db 66h,66h,66h,90h
    else if a=13
       db 66h,66h,66h,90h
       db 66h,66h,90h
       db 66h,66h,90h
       db 66h,66h,90h
    else if a=14
       db 66h,66h,66h,90h
       db 66h,66h,66h,90h
       db 66h,66h,90h
       db 66h,66h,90h
    else if a=15
       db 66h,66h,66h,90h
       db 66h,66h,66h,90h
       db 66h,66h,66h,90h
       db 66h,66h,90h
    end if
}

section '.code' code readable executable

  ; Program entry point (Microsoft x64 calling convention).
  ; Prints the process ID, raises the process and thread priority, then
  ; runs MainFunc again for as long as the user answers Yes in the
  ; message box. Never returns: it ends in ExitProcess.
  start:
        sub      rsp,8*5         ; 32 bytes of shadow space for the callees, plus 8 so
                                 ; rsp is 16-byte aligned at every call (16n+8 on entry)
        mov      rax,qword[gs:dword 30h]
        mov      eax,[rax+40h]
        mov      rdx,rax ;;;;;process ID
        mov      rcx,_ProcessID
        call     [printf]
       ; call    [GetCurrentProcess] ;returns -1
        mov     rcx,-1
        mov     rdx,100h ;realtime
        call    [SetPriorityClass]
       ; call    [GetCurrentThread];;returns -2
        mov     rcx,-2
        mov     rdx,15 ;time critical
        call    [SetThreadPriority]
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
           LOOP_CNT = 1fffffffh  ; iterations per benchmarked loop
   .again:
        mov     rcx,LOOP_CNT
        call    MainFunc
        mov     r9d,4            ; uType = 4 (Yes/No buttons)
        mov     r8,_Again        ; caption
        mov     rdx,_Again       ; text
        xor     ecx,ecx          ; no owner window
        call    [MessageBox]
        cmp     eax,6;;MB_YES    ; MessageBoxA returns a 32-bit int: test eax, not rax
        je      .again
   .End:
        xor     ecx,ecx          ; exit code 0
        call    [ExitProcess]

align 16
;;MainFunc - times two candidate code sequences and prints each elapsed
;;time in milliseconds (GetTickCount difference) through printf.
;;uses r13 r14 r15 (saved on entry, restored before returning)
;;  r15 = iteration count, r14 = countdown for the current loop,
;;  r13 = tick count taken just before the current loop
;;IN: rcx = number of looped calls for each function
;;OUT: nothing meaningful - rax is never set explicitly, it holds whatever
;;     the last printf call returned
  MainFunc:
        push    r13
        push    r14
        push    r15
        ; 3 pushes (24) + 80 + the return address (8) = 112 bytes, so rsp is
        ; 16-byte aligned here provided it was aligned at the caller's call
        sub     rsp,8*10 ;stack space for your functions to use
        mov     r15,rcx
        ;;;;;;;;;;;;;;;;;;;
        mov     r14,r15
        call    qword[GetTickCount]
        mov     r13,rax
  ; pad so that the loop entry .t1 starts on a 16-byte boundary
  AMDPad16
      .t1: ;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;PUT CALL TO FUNCTION 1 HERE
             ;example multiplying by 3
             ;method 1
        mov rax, 1234567890
        mov rcx, 1234567890
        lea rax,[rax+rax*2]
        lea rcx,[rcx+rcx*2]

;;;;;;;;;;
        sub     r14,1
        jnz     .t1
        call    qword[GetTickCount]
        sub     rax,r13           ; elapsed ticks for loop 1
        mov     rdx,rax
        mov     rcx,_fmt1
        call    qword[printf]
        ;;;;;;;;;;;;;;;;;;;
        mov     r14,r15           ; reload the countdown for loop 2
        call    qword[GetTickCount]
        mov     r13,rax
   ; pad so that the loop entry .t2 starts on a 16-byte boundary
   AMDPad16
      .t2: ;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;PUT CALL TO FUNCTION 2 HERE
             ;example multiplying by 3
             ;method 2
        mov rax,1234567890
        mov rcx,1234567890
        mov r8,rax
        mov r9,rcx
        shl rax,1
        shl rcx,1
        add rax,r8
        add rcx,r9

;;;;;;;;;;
        sub     r14,1
        jnz     .t2
        call    qword[GetTickCount]
        sub     rax,r13           ; elapsed ticks for loop 2
        mov     rdx,rax
        mov     rcx,_fmt2
        call    qword[printf]
        add     rsp,8*10
        pop     r15
        pop     r14
        pop     r13
        ret     0


; Zero-terminated printf format strings and message-box text.
section '.data' data readable writeable
  ; generic formats - not referenced by the code shown in this listing
  _fmth db ' %x ',0
  _fmtd db ' %d ',0
  _fmtf db ' %10.10f ',0
  ; result lines printed by MainFunc after each timed loop
  _fmt1 db 'Function1 Time(ms): %d',13,10,0
  _fmt2 db 'Function2 Time(ms): %d',13,10,0
  ; not referenced by the code shown in this listing
  _fmtxx db '< %.8x %.8x >',0
  ; used as both text and caption of the message box in start
  _Again db 'Run benchmark again?',0
  ; printed once by start
  _ProcessID db 'Process ID: %d',13,10,'Boosting priority level.',13,10,0
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Hand-built PE import section: one 5-dword descriptor per DLL, a
; zero-terminated table of 64-bit thunks per DLL, then the name strings.
section '.idata' import data readable writeable

  ; descriptors: three zero dwords, RVA of the DLL name, RVA of its thunk table
  dd 0,0,0,RVA kernel_name,RVA kernel_table
  dd 0,0,0,RVA user_name,RVA user_table
  dd 0,0,0,RVA msvcrt_name,RVA msvcrt_table
  dd 0,0,0,0,0                  ; all-zero descriptor ends the list

  ; thunk tables: the code calls through these labels, e.g. call [printf]
  kernel_table:
    ExitProcess dq RVA _ExitProcess
    GetTickCount dq RVA _GetTickCount
    SetPriorityClass dq RVA _SetPriorityClass
    GetCurrentProcess dq RVA _GetCurrentProcess   ; only call site is commented out
    GetCurrentThread dq RVA _GetCurrentThread     ; only call site is commented out
    SetThreadPriority dq RVA _SetThreadPriority
    dq 0
  user_table:
    MessageBox dq RVA _MessageBoxA
    dq 0
  msvcrt_table:
    printf dq RVA _printf
    dq 0

  kernel_name db 'KERNEL32.DLL',0
  user_name db 'USER32.DLL',0
  msvcrt_name db 'MSVCRT.DLL',0

  ; import-by-name entries: a 16-bit hint (0 here) followed by the name
  _ExitProcess dw 0
    db 'ExitProcess',0
  _GetCurrentProcess dw 0
    db 'GetCurrentProcess',0
  _GetCurrentThread dw 0
    db 'GetCurrentThread',0
  _GetTickCount dw 0
    db 'GetTickCount',0
  _SetPriorityClass dw 0
    db 'SetPriorityClass',0
  _SetThreadPriority dw 0
    db 'SetThreadPriority',0
  _MessageBoxA dw 0
    db 'MessageBoxA',0
  _printf dw 0
    db 'printf',0
    

Run it as is and then run it again with the macro calls COMMENTED OUT; you'll notice a 25% speedup using the macro as opposed to unaligned branches. The speedup from using the macro instead of a plain align 16 is nonexistent.

So why did AMD make NOP sequences with combos of 66h and 90h when the straight 90h padding runs just as fast ?

Tested on AMD64 x2 3800+ Win64
Post 27 Nov 2005, 00:18
View user's profile Send private message AIM Address Yahoo Messenger Reply with quote
revolution
When all else fails, read the source


Joined: 24 Aug 2004
Posts: 20300
Location: In your JS exploiting you and your system
revolution 28 Nov 2005, 03:48
Quote:
So why did AMD make NOP sequences with combos of 66h and 90h when the straight 90h padding runs just as fast ?
The reason you see no speed improvement is that your padding code is not in the main loop. The speed difference is minuscule and can't be seen with a millisecond timer.
Post 28 Nov 2005, 03:48
View user's profile Send private message Visit poster's website Reply with quote
Madis731



Joined: 25 Sep 2003
Posts: 2139
Location: Estonia
Madis731 28 Nov 2005, 10:01
Its an interesting code you got there. There's definitely an algo there, but I couldn't find it. I see the pattern there though and here's a code to help notice it:
Code:
; padnum n1,n2,... - for each argument emit one NOP of that many bytes.
; Only lengths 2, 3 and 4 are handled (66h prefixes followed by 90h);
; any other value emits nothing.
macro padnum [len]
{   forward
    if len=2
        db 66h,90h
    else if len=3
        db 66h,66h,90h
    else if len=4
        db 66h,66h,66h,90h
    end if
}
; AMDPad16 - pad to the next 16-byte boundary with NOPs.
; "a" receives the number of bytes "align 16" would emit here. The gap is
; then filled with NOPs whose lengths are listed as padnum arguments,
; which makes the grouping pattern visible. The single-byte case is
; emitted directly.
macro AMDPad16
{   virtual
        align 16
        a = $-$$
    end virtual
    if a=1
       db 90h
    else if a=2
       padnum 2
    else if a=3
       padnum 3
    else if a=4
       padnum 4
    else if a=5
       padnum 3,2
    else if a=6
       padnum 3,3
    else if a=7
       padnum 4,3
    else if a=8
       padnum 4,4
    else if a=9
       padnum 3,3,3
    else if a=10
       padnum 4,3,3
    else if a=11
       padnum 4,4,3
    else if a=12
       padnum 4,4,4
    else if a=13
       padnum 4,3,3,3
    else if a=14
       padnum 4,4,3,3
    else if a=15
       padnum 4,4,4,3
    end if
}
    

I think you can tell FASM with macros that you want the padding to happen with the fewest groups and biggest. Hmm...interesting ^o)
Post 28 Nov 2005, 10:01
View user's profile Send private message Visit poster's website Yahoo Messenger MSN Messenger Reply with quote
Madis731



Joined: 25 Sep 2003
Posts: 2139
Location: Estonia
Madis731 28 Nov 2005, 11:03
I'm optimizing it right now - I think you might want to see the middle stages:
Code:
; padnum n1,n2,... - for each argument emit one NOP of that many bytes
; (1 to 4: 90h preceded by zero to three 66h prefixes). Any other value,
; including 0, emits nothing.
macro padnum [len]
{   forward
    if len=1
        db 90h
    else if len=2
        db 66h,90h
    else if len=3
        db 66h,66h,90h
    else if len=4
        db 66h,66h,66h,90h
    end if
}
; AMDPad16 - pad to the next 16-byte boundary with NOPs.
; "a" receives the number of bytes "align 16" would emit here. The gap is
; split into 1, 2, 3 or 4 NOPs of nearly equal length, longest first, and
; each length is passed to padnum.
macro AMDPad16
{   virtual
        align 16
        a = $-$$
    end virtual

    if a<=4
       padnum a                            ; one NOP (nothing when a=0)
    end if
    if a>=5 & a<=8
       padnum (a+1)/2,a/2                  ; two NOPs
    end if
    if a>=9 & a<=12
       padnum (a+2)/3,(a+1)/3,a/3          ; three NOPs
    end if
    if a>=13 & a<=15
       padnum (a+3)/4,(a+2)/4,(a+1)/4,a/4  ; four NOPs
    end if
}
    
Post 28 Nov 2005, 11:03
View user's profile Send private message Visit poster's website Yahoo Messenger MSN Messenger Reply with quote
Madis731



Joined: 25 Sep 2003
Posts: 2139
Location: Estonia
Madis731 28 Nov 2005, 11:58
I told you it contains some algo Very Happy
Code:
; padnum n1,n2,... - for each argument emit one NOP of that many bytes:
; (n-1) 66h prefix bytes followed by the 90h opcode.
macro padnum [len]
{   forward
    times len-1 db 66h
    db 90h
}
; AMDPad16 - pad to the next 16-byte boundary with NOPs.
; "a" receives the number of bytes "align 16" would emit here.
; c = ceil(a/4) is the number of NOPs needed when none may exceed 4 bytes.
; The repeat loop then hands out lengths that differ by at most one,
; longest first (% is the 1-based repeat counter).
macro AMDPad16
{   virtual
        align 16
        a = $-$$
    end virtual
    c = (a+3)/4
    repeat c
        padnum (a+c-%)/c
    end repeat
}
    

EDIT: some comments and clearer code
Post 28 Nov 2005, 11:58
View user's profile Send private message Visit poster's website Yahoo Messenger MSN Messenger Reply with quote
LocoDelAssembly
Your code has a bug


Joined: 06 May 2005
Posts: 4624
Location: Argentina
LocoDelAssembly 28 Nov 2005, 14:25
I have a question about this, why you don't use sequences of instructions that do nothing?

Example:

Instead of
Code:
; 4-byte gap, as in the original macro: one NOP (90h) with three 66h prefixes
if a=4 
       db 66h,66h,66h,90h 
end if
    


Why not
Code:
; Proposed alternative for a 4-byte gap: two register self-moves.
; NOTE(review): only a true no-op in 32-bit code. In 64-bit code a write
; to a 32-bit register zeroes the upper half of the 64-bit register.
if a=4
       mov     eax, eax
       mov     edx, edx
end if    

?

Disassembling some executables written in high-level languages, I found they never use NOP sequences except in cases where there is no other instruction that fits in the remaining space. Usually they use LEAs in the sequence, but I can't find a way to force FASM to assemble "lea eax, [eax+0]"; FASM assembles "lea eax, [eax]", which takes much less space.

Regards,
LocoDelAssembly
Post 28 Nov 2005, 14:25
View user's profile Send private message Reply with quote
Tomasz Grysztar



Joined: 16 Jun 2003
Posts: 8351
Location: Kraków, Poland
Tomasz Grysztar 28 Nov 2005, 14:41
You can use "lea eax,[dword eax+0]" to force using the 32-bit displacement; but for the 8-bit one you have to define it as byte opcodes, as fasm always optimizes them.
Post 28 Nov 2005, 14:41
View user's profile Send private message Visit poster's website Reply with quote
LocoDelAssembly
Your code has a bug


Joined: 06 May 2005
Posts: 4624
Location: Argentina
LocoDelAssembly 28 Nov 2005, 14:49
Ups I didn't know that Embarassed

Well now my original idea:

Instead of:
Code:
    ; 6-byte gap, as in the original macro: two 3-byte NOPs (66h,66h,90h each)
    if a=6 
       db 66h,66h,90h,66h,66h,90h 
    end if 
    


Why not
Code:
    ; Proposed alternative for a 6-byte gap: a single lea that leaves eax
    ; unchanged. "dword" forces the 32-bit displacement so fasm does not
    ; shrink the instruction.
    ; NOTE(review): the 6-byte size assumes 32-bit code - confirm for 64-bit.
    if a=6 
       lea     eax, [dword eax + 0]
    end if    
?

Thanks Tomasz for both replies Wink
Post 28 Nov 2005, 14:49
View user's profile Send private message Reply with quote
Madis731



Joined: 25 Sep 2003
Posts: 2139
Location: Estonia
Madis731 28 Nov 2005, 15:05
You can always just jump to the next location. If that is the point. Then you can have strings or whatever inbetween Wink
Code:
; Skip the gap with a jump instead of executing filler: the bytes between
; the jmp and the label are never run, so they can hold any data.
jmp alignment_label
;When you are off by 10 here
db "filling 10"                 ; exactly 10 bytes of filler
alignment_label:
    
Post 28 Nov 2005, 15:05
View user's profile Send private message Visit poster's website Yahoo Messenger MSN Messenger Reply with quote
LocoDelAssembly
Your code has a bug


Joined: 06 May 2005
Posts: 4624
Location: Argentina
LocoDelAssembly 28 Nov 2005, 15:12
Yes, I see that too in disassemblies: if padding with instructions would spend more cycles than just jumping, then JMP is used; otherwise a sequence of instructions that doesn't modify anything is used.
Post 28 Nov 2005, 15:12
View user's profile Send private message Reply with quote
revolution
When all else fails, read the source


Joined: 24 Aug 2004
Posts: 20300
Location: In your JS exploiting you and your system
revolution 28 Nov 2005, 15:47
Using "lea" or "mov" etc. are not optimal ways to get the least activity inside the CPU. AMD specifically state that the 66,66,90 is the most optimal way to "do nothing", they made the processor so I guess they should know best.
Post 28 Nov 2005, 15:47
View user's profile Send private message Visit poster's website Reply with quote
LocoDelAssembly
Your code has a bug


Joined: 06 May 2005
Posts: 4624
Location: Argentina
LocoDelAssembly 28 Nov 2005, 16:37
Code:
; 32-bit Windows micro-benchmark: times a loop whose body is only
; candidate padding sequences and shows the elapsed milliseconds in a
; message box.
include 'win32ax.inc'

.code

  start:
        ; raise priority to reduce interference from other processes
        invoke  GetCurrentProcess
        invoke  SetPriorityClass, eax, REALTIME_PRIORITY_CLASS

        invoke  GetTickCount
        push    eax             ; keep the start tick count on the stack

        xor     ecx, ecx        ; counter starts at 0: dec wraps, so the loop runs 2^32 times

        align   16
  .loop:
        ; the two padding candidates under test; comment either one out to compare
        lea     eax, [dword eax + 0]
        db 66h,66h,90h,66h,66h,90h
        dec     ecx
        jnz     .loop

        invoke  GetTickCount
        pop     edx             ; Result of previous call to GetTickCount
        sub     eax, edx        ; eax = elapsed milliseconds
        cinvoke wsprintf, output, fmt, eax
        invoke  MessageBox, 0, output, title, 0

        invoke  ExitProcess,0

.data
  fmt    db "Loop took %u ms", 0
  title  db "AMD speed test", 0

  output rb 256                 ; buffer for the formatted result

.end start    


Well, I can't believe what I'm seeing: if I comment out the suggested AMD sequence the loop takes 4360 ms, if I comment out the lea instead it again takes 4360 ms, and if I comment out both the loop still takes 4360 ms. WHAT'S HAPPENING HERE?

I'm using an Athlon 3200+ (S939) with WinXP 32-bits.
[edit]Now I tried running at 1005 Mhz instead of 2010 Mhz and still happening the same but now it takes 8610 ms. Well I will try another way to take the time measurement.[/edit]
Post 28 Nov 2005, 16:37
View user's profile Send private message Reply with quote
Madis731



Joined: 25 Sep 2003
Posts: 2139
Location: Estonia
Madis731 28 Nov 2005, 17:35
Because modern computers don't count ticks but micro-operations. Intel can issue 3 of those in one clock (don't know about AMD).
When you write a jump, any half-tick will be finished and new tick started. Now with this kind of loop - it takes 1 clock no matter what. Thus commenting out doesn't help - you are NOT getting 100% out of your CPU though.
I wonder why my PIII laptop (700MHz) takes 43232ms and 12523ms respectively Wink
Code:
  ; Loop body enlarged to five copies of each padding candidate so that
  ; the dec/jnz overhead is a smaller share of each iteration.
  .loop:
        lea     eax, [dword eax + 0]
        lea     eax, [dword eax + 0]
        lea     eax, [dword eax + 0]
        lea     eax, [dword eax + 0]
        lea     eax, [dword eax + 0]
        db 66h,66h,90h,66h,66h,90h
        db 66h,66h,90h,66h,66h,90h
        db 66h,66h,90h,66h,66h,90h
        db 66h,66h,90h,66h,66h,90h
        db 66h,66h,90h,66h,66h,90h
        dec     ecx
        jnz     .loop
    

With this kind of code you will minimize the impact from jumps and you can notice some changes Wink
Post 28 Nov 2005, 17:35
View user's profile Send private message Visit poster's website Yahoo Messenger MSN Messenger Reply with quote
LocoDelAssembly
Your code has a bug


Joined: 06 May 2005
Posts: 4624
Location: Argentina
LocoDelAssembly 28 Nov 2005, 18:23
Well I found better times with this:
Code:
  ; Variant with the leas spread over different registers. Each lea leaves
  ; its register unchanged, including ecx, the loop counter.
  ; NOTE(review): presumably faster because the leas no longer all depend
  ; on eax - not established by the thread.
  .loop:
        lea     eax, [dword eax + 0]
        lea     ebx, [dword ebx + 0]
        lea     ecx, [dword ecx + 0]
        lea     edx, [dword edx + 0]
        lea     eax, [dword eax + 0]

        dec     ecx 
        jnz     .loop    

Works faster than:
Code:
  ; Variant where all five leas read and write eax, so each one takes the
  ; previous one's result as its input.
  .loop:
        lea     eax, [dword eax + 0]
        lea     eax, [dword eax + 0]
        lea     eax, [dword eax + 0]
        lea     eax, [dword eax + 0]
        lea     eax, [dword eax + 0]

        dec     ecx 
        jnz     .loop    
However if I put in the loop the bytes sequences both takes 12172 ms :S and without the bytes sequences take 8593 ms and 10734 ms respectively. Is this happening because the processor executes out of order the NOPs?

Quote:
I wonder why my PIII laptop (700MHz) takes 43232ms and 12523ms respectively
Which code takes 12523 ms?

[edit]Executing in the loop only the bytes sequences takes 10734 ms[/edit]
Post 28 Nov 2005, 18:23
View user's profile Send private message Reply with quote
Madis731



Joined: 25 Sep 2003
Posts: 2139
Location: Estonia
Madis731 29 Nov 2005, 09:01
locodelassembly wrote:
Which code takes 12523 ms?

I tryed with all enabled (bytesequences & leas) and none (only empty loop)

_________________
My updated idol Very Happy http://www.agner.org/optimize/
Post 29 Nov 2005, 09:01
View user's profile Send private message Visit poster's website Yahoo Messenger MSN Messenger Reply with quote
Display posts from previous:
Post new topic Reply to topic

Jump to:  


< Last Thread | Next Thread >
Forum Rules:
You cannot post new topics in this forum
You cannot reply to topics in this forum
You cannot edit your posts in this forum
You cannot delete your posts in this forum
You cannot vote in polls in this forum
You cannot attach files in this forum
You can download files in this forum


Copyright © 1999-2024, Tomasz Grysztar. Also on GitHub, YouTube.

Website powered by rwasa.