AMD Optimization 32t/64 bit

Index > Main > AMD Optimization 32t/64 bit

Author

Thread

r22

Joined: 27 Dec 2004
Posts: 805

r22 27 Nov 2005, 00:18

Here's a code-padding macro for AMD processors.
Using the macro to properly align branch targets is supposedly optimal performance-wise.

Code:

; AMDPad16 - pad the output up to the next 16-byte boundary with
; AMD-style NOPs (90h carrying one to three 66h operand-size prefixes).
;
; Takes no arguments. Emits 0..15 bytes at the current position; nothing
; is emitted when the position is already 16-byte aligned.
; The byte sequences are exactly those of the original table; per the
; author, the 10..15-byte rows are an extrapolation beyond what the AMD
; manual lists.
;
; The padding size is kept in a macro-local symbol so that invoking the
; macro neither overwrites nor clashes with a user symbol named `a`.
macro AMDPad16
{
    local gap
    virtual                     ; scratch addressing space: nothing is emitted here
        align 16
        gap = $-$$              ; bytes `align 16` would have inserted (0..15)
    end virtual
    if gap=1
       db 90h
    else if gap=2
       db 66h,90h
    else if gap=3
       db 66h,66h,90h
    else if gap=4
       db 66h,66h,66h,90h
    else if gap=5
       db 66h,66h,90h,66h,90h
    else if gap=6
       db 66h,66h,90h,66h,66h,90h
    else if gap=7
       db 66h,66h,66h,90h,66h,66h,90h
    else if gap=8
       db 66h,66h,66h,90h,66h,66h,66h,90h
    else if gap=9
       db 66h,66h,90h,66h,66h,90h,66h,66h,90h
    else if gap=10
       db 66h,66h,66h,90h,66h,66h,90h,66h,66h,90h
    else if gap=11
       db 66h,66h,66h,90h,66h,66h,66h,90h,66h,66h,90h
    else if gap=12
       db 66h,66h,66h,90h,66h,66h,66h,90h,66h,66h,66h,90h
    else if gap=13
       db 66h,66h,66h,90h,66h,66h,90h,66h,66h,90h,66h,66h,90h
    else if gap=14
       db 66h,66h,66h,90h,66h,66h,66h,90h,66h,66h,90h,66h,66h,90h
    else if gap=15
       db 66h,66h,66h,90h,66h,66h,66h,90h,66h,66h,66h,90h,66h,66h,90h
    end if
}

If you know a shorter way of representing this macro it would be welcome; I just can't think of one.
Also, the AMD optimization manual only goes up to a 9-byte pad, so for 10-15 bytes I'm just guessing at the 66h and 90h sequences.

Here's an example program that uses the macro (it's a Win64 program).

Code:

format PE64 console
entry start

; AMDPad16 - pad the output up to the next 16-byte boundary with
; AMD-style NOPs (90h carrying one to three 66h operand-size prefixes).
;
; Takes no arguments. Emits 0..15 bytes at the current position; nothing
; is emitted when the position is already 16-byte aligned.
; The byte sequences are exactly those of the original table.
;
; The padding size is kept in a macro-local symbol so that invoking the
; macro neither overwrites nor clashes with a user symbol named `a`.
macro AMDPad16
{
    local gap
    virtual                     ; scratch addressing space: nothing is emitted here
        align 16
        gap = $-$$              ; bytes `align 16` would have inserted (0..15)
    end virtual
    if gap=1
       db 90h
    else if gap=2
       db 66h,90h
    else if gap=3
       db 66h,66h,90h
    else if gap=4
       db 66h,66h,66h,90h
    else if gap=5
       db 66h,66h,90h,66h,90h
    else if gap=6
       db 66h,66h,90h,66h,66h,90h
    else if gap=7
       db 66h,66h,66h,90h,66h,66h,90h
    else if gap=8
       db 66h,66h,66h,90h,66h,66h,66h,90h
    else if gap=9
       db 66h,66h,90h,66h,66h,90h,66h,66h,90h
    else if gap=10
       db 66h,66h,66h,90h,66h,66h,90h,66h,66h,90h
    else if gap=11
       db 66h,66h,66h,90h,66h,66h,66h,90h,66h,66h,90h
    else if gap=12
       db 66h,66h,66h,90h,66h,66h,66h,90h,66h,66h,66h,90h
    else if gap=13
       db 66h,66h,66h,90h,66h,66h,90h,66h,66h,90h,66h,66h,90h
    else if gap=14
       db 66h,66h,66h,90h,66h,66h,66h,90h,66h,66h,90h,66h,66h,90h
    else if gap=15
       db 66h,66h,66h,90h,66h,66h,66h,90h,66h,66h,66h,90h,66h,66h,90h
    end if
}

section '.code' code readable executable

  ;-----------------------------------------------------------------------
  ; start - program entry point (Microsoft x64 calling convention).
  ; Prints the process ID, raises process and thread priority, then runs
  ; MainFunc with LOOP_CNT iterations, repeating for as long as the user
  ; answers "Yes" in the message box. Never returns: leaves via ExitProcess.
  ;-----------------------------------------------------------------------
  REALTIME_PRIORITY_CLASS       = 100h
  THREAD_PRIORITY_TIME_CRITICAL = 15
  MB_YESNO                      = 4
  IDYES                         = 6
  start:
        ; On entry rsp = 8 (mod 16). Reserve the 32-byte shadow space every
        ; callee is entitled to, plus 8 bytes so that rsp is 16-byte aligned
        ; at each call below (and therefore inside MainFunc as well).
        sub      rsp,8*5
        mov      rax,qword[gs:dword 30h]  ; TEB self-pointer
        mov      eax,[rax+40h]            ; process ID field; 32-bit load zero-extends into rax
        mov      rdx,rax ;;;;;process ID
        mov      rcx,_ProcessID
        call     [printf]
       ; call    [GetCurrentProcess] ;returns -1
        mov     rcx,-1                    ; current-process pseudo-handle
        mov     rdx,REALTIME_PRIORITY_CLASS
        call    [SetPriorityClass]
       ; call    [GetCurrentThread];;returns -2
        mov     rcx,-2                    ; current-thread pseudo-handle
        mov     rdx,THREAD_PRIORITY_TIME_CRITICAL
        call    [SetThreadPriority]
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
           LOOP_CNT = 1fffffffh           ; iterations per timed loop
   .again:
        mov     rcx,LOOP_CNT
        call    MainFunc
        mov     r9d,MB_YESNO
        mov     r8,_Again                 ; caption
        mov     rdx,_Again                ; text
        xor     ecx,ecx                   ; no owner window
        call    [MessageBox]
        cmp     eax,IDYES                 ; result is a 32-bit int: compare eax, not rax
        je      .again
   .End:
        xor     ecx,ecx                   ; exit code 0
        call    [ExitProcess]

align 16
;;-----------------------------------------------------------------------
;; MainFunc - time two candidate code sequences and print the results.
;;
;; Each sequence is run rcx times inside its own sub/jnz loop; the elapsed
;; GetTickCount difference is printed with _fmt1 / _fmt2.
;;
;;IN:  rcx = number of looped calls for each function
;;OUT: nothing meaningful (rax is left as returned by the last printf call)
;;uses r13 r14 r15 (saved on entry, restored before returning):
;;     r15 = iteration count, r14 = countdown, r13 = start tick count
;;Stack: 3 pushes (24 bytes) + 80 bytes; with rsp = 8 (mod 16) on entry this
;;     leaves rsp 16-byte aligned at every call below. The 80 bytes also
;;     provide room for the callees' shadow space.
;;-----------------------------------------------------------------------
  MainFunc:
        push    r13
        push    r14
        push    r15
        sub     rsp,8*10 ;stack space for your functions to use
        mov     r15,rcx  ; keep the iteration count across the calls
        ;;;;;;;;;;;;;;;;;;; timed loop 1
        mov     r14,r15
        call    qword[GetTickCount]
        mov     r13,rax  ; start time
  AMDPad16               ; align the loop entry .t1 to 16 bytes
      .t1: ;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;PUT CALL TO FUNCTION 1 HERE
             ;example multiplying by 3
             ;method 1
        mov rax, 1234567890
        mov rcx, 1234567890
        lea rax,[rax+rax*2]  ; rax = rax*3
        lea rcx,[rcx+rcx*2]  ; rcx = rcx*3

;;;;;;;;;;
        sub     r14,1
        jnz     .t1
        call    qword[GetTickCount]
        sub     rax,r13  ; elapsed = end - start
        mov     rdx,rax
        mov     rcx,_fmt1
        call    qword[printf]
        ;;;;;;;;;;;;;;;;;;; timed loop 2
        mov     r14,r15
        call    qword[GetTickCount]
        mov     r13,rax  ; start time
   AMDPad16              ; align the loop entry .t2 to 16 bytes
      .t2: ;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;PUT CALL TO FUNCTION 2 HERE
             ;example multiplying by 3
             ;method 2
        mov rax,1234567890
        mov rcx,1234567890
        mov r8,rax
        mov r9,rcx
        shl rax,1        ; rax = rax*2
        shl rcx,1        ; rcx = rcx*2
        add rax,r8       ; rax = original*3
        add rcx,r9       ; rcx = original*3

;;;;;;;;;;
        sub     r14,1
        jnz     .t2
        call    qword[GetTickCount]
        sub     rax,r13  ; elapsed = end - start
        mov     rdx,rax
        mov     rcx,_fmt2
        call    qword[printf]
        add     rsp,8*10 ; release the stack area reserved on entry
        pop     r15
        pop     r14
        pop     r13
        ret     0        ; plain return; no extra bytes popped


section '.data' data readable writeable
  ; Zero-terminated strings passed to printf / MessageBox.
  ; _fmth, _fmtd, _fmtf and _fmtxx are not referenced by the code shown in
  ; this listing.
  _fmth db ' %x ',0
  _fmtd db ' %d ',0
  _fmtf db ' %10.10f ',0
  ; Result lines printed by MainFunc after each timed loop.
  _fmt1 db 'Function1 Time(ms): %d',13,10,0
  _fmt2 db 'Function2 Time(ms): %d',13,10,0
  _fmtxx db '< %.8x %.8x >',0
  ; Used by start as both the text and the caption of the message box.
  _Again db 'Run benchmark again?',0
  ; Banner printed once by start; %d receives the process ID.
  _ProcessID db 'Process ID: %d',13,10,'Boosting priority level.',13,10,0
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
section '.idata' import data readable writeable

  ; Import directory: one 5-dword entry per DLL; only the DLL name RVA and
  ; the address-table RVA are filled in. An all-zero entry ends the list.
  dd 0,0,0,RVA kernel_name,RVA kernel_table
  dd 0,0,0,RVA user_name,RVA user_table
  dd 0,0,0,RVA msvcrt_name,RVA msvcrt_table
  dd 0,0,0,0,0

  ; Per-DLL address tables: one qword per imported function, each ended by
  ; a zero qword. The code calls through these slots, e.g. call [printf].
  ; GetCurrentProcess and GetCurrentThread are imported, but the calls to
  ; them are commented out in start.
  kernel_table:
    ExitProcess dq RVA _ExitProcess
    GetTickCount dq RVA _GetTickCount
    SetPriorityClass dq RVA _SetPriorityClass
    GetCurrentProcess dq RVA _GetCurrentProcess
    GetCurrentThread dq RVA _GetCurrentThread
    SetThreadPriority dq RVA _SetThreadPriority
    dq 0
  user_table:
    MessageBox dq RVA _MessageBoxA
    dq 0
  msvcrt_table:
    printf dq RVA _printf
    dq 0

  ; DLL names referenced from the import directory above.
  kernel_name db 'KERNEL32.DLL',0
  user_name db 'USER32.DLL',0
  msvcrt_name db 'MSVCRT.DLL',0

  ; Import-by-name records: a 16-bit hint (0 here) followed by the
  ; zero-terminated function name.
  _ExitProcess dw 0
    db 'ExitProcess',0
  _GetCurrentProcess dw 0
    db 'GetCurrentProcess',0
  _GetCurrentThread dw 0
    db 'GetCurrentThread',0
  _GetTickCount dw 0
    db 'GetTickCount',0
  _SetPriorityClass dw 0
    db 'SetPriorityClass',0
  _SetThreadPriority dw 0
    db 'SetThreadPriority',0
  _MessageBoxA dw 0
    db 'MessageBoxA',0
  _printf dw 0
    db 'printf',0

Run it as is, and then run it again with the macro calls COMMENTED OUT: you'll notice a 25% speedup when using the macro as opposed to unaligned branch targets. The speedup from switching from a plain align 16 to the macro is nonexistent.

So why did AMD make NOP sequences with combos of 66h and 90h when the straight 90h padding runs just as fast ?

Tested on AMD64 x2 3800+ Win64

27 Nov 2005, 00:18

revolution
When all else fails, read the source

Joined: 24 Aug 2004
Posts: 20700
Location: In your JS exploiting you and your system

revolution 28 Nov 2005, 03:48

Quote:

So why did AMD make NOP sequences with combos of 66h and 90h when the straight 90h padding runs just as fast ?

The reason you see no speed improvement is that your padding code is not in the main loop. The speed difference is minuscule and can't be seen with a millisecond timer.

28 Nov 2005, 03:48

Madis731

Joined: 25 Sep 2003
Posts: 2138
Location: Estonia

Madis731 28 Nov 2005, 10:01

It's an interesting piece of code you've got there. There's definitely an algorithm in there, but I couldn't find it. I do see the pattern, though, and here's some code to help notice it:

Code:

; padnum - emit one prefixed NOP per argument.
; Each argument is the total length of one NOP: 2, 3 or 4 bytes, i.e.
; 1..3 operand-size prefixes (66h) followed by 90h. Any other value
; emits nothing for that argument.
macro padnum [len]
{
  forward
    if len = 2
        db 66h,90h
    else if len = 3
        db 66h,66h,90h
    else if len = 4
        db 66h,66h,66h,90h
    end if
}
; AMDPad16 - pad up to the next 16-byte boundary, expressing each padding
; size as a list of NOP lengths handed to padnum (e.g. 7 bytes = 4 + 3).
; Emits nothing when already aligned. Requires the padnum macro.
; The padding size lives in a macro-local symbol so that the macro does
; not overwrite or clash with a user symbol named `a`.
macro AMDPad16
{   local gap
    virtual                     ; scratch addressing space: nothing is emitted here
        align 16
        gap = $-$$              ; bytes needed to reach the boundary (0..15)
    end virtual
    if gap=1
       db 90h                   ; plain NOP (this padnum has no 1-byte case)
    else if gap=2
       padnum 2
    else if gap=3
       padnum 3
    else if gap=4
       padnum 4
    else if gap=5
       padnum 3,2
    else if gap=6
       padnum 3,3
    else if gap=7
       padnum 4,3
    else if gap=8
       padnum 4,4
    else if gap=9
       padnum 3,3,3
    else if gap=10
       padnum 4,3,3
    else if gap=11
       padnum 4,4,3
    else if gap=12
       padnum 4,4,4
    else if gap=13
       padnum 4,3,3,3
    else if gap=14
       padnum 4,4,3,3
    else if gap=15
       padnum 4,4,4,3
    end if
}

I think you can use macros to tell FASM that you want the padding built from the fewest, largest groups possible. Hmm... interesting ^o)

28 Nov 2005, 10:01

Madis731

Joined: 25 Sep 2003
Posts: 2138
Location: Estonia

Madis731 28 Nov 2005, 11:03

I'm optimizing it right now - I think you might want to see the middle stages:

Code:

; padnum - emit one NOP per argument.
; Each argument is the total length of one NOP, 1..4 bytes: zero to three
; operand-size prefixes (66h) followed by 90h. Any other value (including
; 0) emits nothing for that argument.
macro padnum [len]
{
  forward
    if len = 1
        db 90h
    else if len = 2
        db 66h,90h
    else if len = 3
        db 66h,66h,90h
    else if len = 4
        db 66h,66h,66h,90h
    end if
}
; AMDPad16 - pad up to the next 16-byte boundary with 1..4 NOPs of as
; equal length as possible, longest first. Requires the padnum macro.
; With gap = bytes to pad and k = number of NOPs, the i-th NOP length is
; (gap + k - i) / k using integer division; e.g. gap = 7 gives 4,3.
; The padding size lives in a macro-local symbol so that the macro does
; not overwrite or clash with a user symbol named `a`.
macro AMDPad16
{   local gap
    virtual                     ; scratch addressing space: nothing is emitted here
        align 16
        gap = $-$$              ; bytes needed to reach the boundary (0..15)
    end virtual

    if gap<5                    ; 0..4: a single NOP (padnum emits nothing for 0)
       padnum gap
    else if gap<9               ; 5..8: two NOPs
       padnum (gap+1)/2,gap/2
    else if gap<13              ; 9..12: three NOPs
       padnum (gap+2)/3,(gap+1)/3,gap/3
    else if gap<16              ; 13..15: four NOPs
       padnum (gap+3)/4,(gap+2)/4,(gap+1)/4,gap/4
    end if
}

28 Nov 2005, 11:03

Madis731

Joined: 25 Sep 2003
Posts: 2138
Location: Estonia

Madis731 28 Nov 2005, 11:58

I told you it contains some algo Very Happy

Code:

; padnum - for each argument n, emit one NOP that is n bytes long:
; n-1 operand-size prefixes (66h) followed by 90h.
macro padnum [len]
{
  forward
    times len-1 db 66h
    db 90h
}
; AMDPad16 - pad up to the next 16-byte boundary with the fewest NOPs of
; at most 4 bytes each, lengths as equal as possible, longest first.
; Emits nothing when already aligned. Requires the padnum macro.
; Both work symbols are macro-local so that the macro does not overwrite
; or clash with user symbols named `a` or `c`.
macro AMDPad16
{   local gap, groups
    virtual                     ; scratch addressing space: nothing is emitted here
        align 16
        gap = $-$$              ; bytes needed to reach the boundary (0..15)
    end virtual
    groups = (gap+3) shr 2      ; number of NOPs = ceil(gap/4)
    repeat groups               ; % counts 1..groups
        padnum (gap-%+groups)/groups
    end repeat
}

EDIT: some comments and clearer code

28 Nov 2005, 11:58

LocoDelAssembly
Your code has a bug

Joined: 06 May 2005
Posts: 4623
Location: Argentina

LocoDelAssembly 28 Nov 2005, 14:25

I have a question about this: why don't you use sequences of instructions that do nothing?

Example:

Instead of

Code:

if a=4 ; 4 bytes of padding needed
       db 66h,66h,66h,90h ; one NOP (90h) carrying three 66h operand-size prefixes
end if

Why not

Code:

if a=4 ; 4 bytes of padding needed
       ; Two 2-byte register-to-itself moves.
       ; NOTE(review): only a true no-op in 32-bit code; in 64-bit mode a
       ; write to a 32-bit register clears bits 63:32 of the full register.
       mov     eax, eax
       mov     edx, edx
end if

?

Disassembling some executables written in high-level languages, I found they never use NOP sequences except where no other instruction fits in the remaining space. Usually they use LEAs in the sequence, but I couldn't find a way to force FASM to assemble "lea eax, [eax+0]"; FASM assembles "lea eax, [eax]", which takes much less space.

Regards,
LocoDelAssembly

28 Nov 2005, 14:25

Tomasz Grysztar

Joined: 16 Jun 2003
Posts: 8434
Location: Kraków, Poland

Tomasz Grysztar 28 Nov 2005, 14:41

You can use "lea eax,[dword eax+0]" to force using the 32-bit displacement; but for the 8-bit one you have to define it as byte opcodes, as fasm always optimizes them.

28 Nov 2005, 14:41

LocoDelAssembly
Your code has a bug

Joined: 06 May 2005
Posts: 4623
Location: Argentina

LocoDelAssembly 28 Nov 2005, 14:49

Ups I didn't know that Embarassed

Well now my original idea:

Instead of:

Code:

    if a=6 ; 6 bytes of padding needed
       db 66h,66h,90h,66h,66h,90h ; two 3-byte NOPs (66h,66h,90h each)
    end if

Why not

Code:

    if a=6 ; 6 bytes of padding needed
       ; `dword` forces a 32-bit displacement, so this is one 6-byte
       ; instruction in 32-bit code.
       ; NOTE(review): in 64-bit mode this form needs an address-size prefix
       ; (one more byte) and clears the upper half of rax - confirm before reuse.
       lea     eax, [dword eax + 0]
    end if

?

Thanks Tomasz for both replies Wink

28 Nov 2005, 14:49

Madis731

Joined: 25 Sep 2003
Posts: 2138
Location: Estonia

Madis731 28 Nov 2005, 15:05

You can always just jump to the next location. If that is the point. Then you can have strings or whatever inbetween Wink

Code:

; Skip over the filler instead of executing NOPs.
; NOTE(review): the jmp itself occupies bytes too, so the filler length
; has to account for it when computing the alignment.
jmp alignment_label
;When you are off by 10 here
db "filling 10"   ; 10 bytes of arbitrary data, never executed
alignment_label:

28 Nov 2005, 15:05

LocoDelAssembly
Your code has a bug

Joined: 06 May 2005
Posts: 4623
Location: Argentina

LocoDelAssembly 28 Nov 2005, 15:12

Yes, I see that too in disassemblies: if padding with instructions would cost more cycles than just jumping, then a JMP is used; otherwise a sequence of instructions that doesn't modify anything is used.

28 Nov 2005, 15:12

revolution
When all else fails, read the source

Joined: 24 Aug 2004
Posts: 20700
Location: In your JS exploiting you and your system

revolution 28 Nov 2005, 15:47

Using "lea" or "mov" etc. is not the optimal way to get the least activity inside the CPU. AMD specifically states that 66,66,90 is the optimal way to "do nothing"; they made the processor, so I guess they should know best.

28 Nov 2005, 15:47

LocoDelAssembly
Your code has a bug

Joined: 06 May 2005
Posts: 4623
Location: Argentina

LocoDelAssembly 28 Nov 2005, 16:37

Code:

include 'win32ax.inc'

; Win32 micro-benchmark: times a 2^32-iteration loop whose body is the
; padding sequence under test and shows the elapsed milliseconds in a
; message box.

.code

  start:
        invoke  GetCurrentProcess
        invoke  SetPriorityClass, eax, REALTIME_PRIORITY_CLASS

        invoke  GetTickCount
        push    eax             ; save the start time across the loop

        xor     ecx, ecx        ; counter starts at 0: dec wraps, so the loop runs 2^32 times

        align   16              ; loop entry on a 16-byte boundary
  .loop:
        lea     eax, [dword eax + 0]    ; candidate 1: 6-byte lea that leaves eax unchanged
        db 66h,66h,90h,66h,66h,90h      ; candidate 2: two 3-byte prefixed NOPs
        dec     ecx
        jnz     .loop

        invoke  GetTickCount
        pop     edx             ; Result of previous call to GetTickCount
        sub     eax, edx        ; elapsed = end - start
        cinvoke wsprintf, output, fmt, eax
        invoke  MessageBox, 0, output, title, 0

        invoke  ExitProcess,0

.data
  fmt    db "Loop took %u ms", 0
  title  db "AMD speed test", 0

  output rb 256                 ; buffer receiving the formatted message

.end start

Well, I can't believe what I'm seeing: if I comment out the suggested AMD sequence the loop takes 4360 ms, if I comment out the lea instead it again takes 4360 ms, and if I comment out both the loop still takes 4360 ms. WHAT'S HAPPENING HERE?

I'm using an Athlon 3200+ (S939) with WinXP 32-bits.
[edit]Now I tried running at 1005 MHz instead of 2010 MHz and the same thing still happens, but now it takes 8610 ms. I will try another way to take the time measurement.[/edit]

28 Nov 2005, 16:37

Madis731

Joined: 25 Sep 2003
Posts: 2138
Location: Estonia

Madis731 28 Nov 2005, 17:35

Because modern computers don't count ticks but micro-operations. Intel can issue 3 of those in one clock (don't know about AMD).
When you write a jump, any half-tick will be finished and new tick started. Now with this kind of loop - it takes 1 clock no matter what. Thus commenting out doesn't help - you are NOT getting 100% out of your CPU though.
I wonder why my PIII laptop (700MHz) takes 43232ms and 12523ms respectively Wink

Code:

  ; Loop body with five copies of each candidate, so that the dec/jnz pair
  ; is a smaller share of every iteration.
  .loop:
        lea     eax, [dword eax + 0]
        lea     eax, [dword eax + 0]
        lea     eax, [dword eax + 0]
        lea     eax, [dword eax + 0]
        lea     eax, [dword eax + 0]
        db 66h,66h,90h,66h,66h,90h
        db 66h,66h,90h,66h,66h,90h
        db 66h,66h,90h,66h,66h,90h
        db 66h,66h,90h,66h,66h,90h
        db 66h,66h,90h,66h,66h,90h
        dec     ecx
        jnz     .loop

With this kind of code you will minimize the impact from jumps and you can notice some changes Wink

28 Nov 2005, 17:35

LocoDelAssembly
Your code has a bug

Joined: 06 May 2005
Posts: 4623
Location: Argentina

LocoDelAssembly 28 Nov 2005, 18:23

Well I found better times with this:

Code:

  ; Five value-preserving 6-byte leas spread over eax, ebx, ecx and edx,
  ; so most of them do not depend on the result of the one before.
  .loop:
        lea     eax, [dword eax + 0]
        lea     ebx, [dword ebx + 0]
        lea     ecx, [dword ecx + 0]    ; ecx is also the loop counter; its value is unchanged
        lea     edx, [dword edx + 0]
        lea     eax, [dword eax + 0]

        dec     ecx 
        jnz     .loop

Works faster than:

Code:

  ; Five value-preserving 6-byte leas that all read and write eax, so each
  ; one depends on the previous (presumably why this variant measured
  ; slower than the mixed-register one).
  .loop:
        lea     eax, [dword eax + 0]
        lea     eax, [dword eax + 0]
        lea     eax, [dword eax + 0]
        lea     eax, [dword eax + 0]
        lea     eax, [dword eax + 0]

        dec     ecx 
        jnz     .loop

However, if I put the byte sequences in the loop, both take 12172 ms :S, and without the byte sequences they take 8593 ms and 10734 ms respectively. Is this happening because the processor executes the NOPs out of order?

Quote:

I wonder why my PIII laptop (700MHz) takes 43232ms and 12523ms respectively

Which code takes 12523 ms?

[edit]Executing only the byte sequences in the loop takes 10734 ms[/edit]

28 Nov 2005, 18:23

Madis731

Joined: 25 Sep 2003
Posts: 2138
Location: Estonia

Madis731 29 Nov 2005, 09:01

locodelassembly wrote:

Which code takes 12523 ms?

I tried with everything enabled (byte sequences & LEAs) and with nothing (only the empty loop).

_________________
My updated idol Very Happy

http://www.agner.org/optimize/

29 Nov 2005, 09:01

< Last Thread | Next Thread >

Forum Rules:

You cannot post new topics in this forum
You cannot reply to topics in this forum
You cannot edit your posts in this forum
You cannot delete your posts in this forum
You cannot vote in polls in this forum
You cannot attach files in this forum
You can download files in this forum