flat assembler
Message board for the users of flat assembler.

Index > Main > Permutation algo speed test

Author
Thread Post new topic Reply to topic
r22



Joined: 27 Dec 2004
Posts: 805
r22
Here's a permutation algorithm coded in FASM.
It comes with a benchmarking and verifying code.

I'm always interested in optimizations and / or alternative methods.
Especially if a recursive version (that doesn't require the shadow list of interegers) is faster or slower than my iterative version.


Code:
format PE console
entry start

include 'c:\FASM\INCLUDE\win32a.inc'

macro AMDPad16
{
    virtual
        align 16
        a = $-$$
    end virtual
    if a=1
       db 90h
    end if
    if a=2
       db 66h,90h
    end if
    if a=3
       db 66h,66h,90h
    end if
    if a=4
       db 66h,66h,66h,90h
    end if
    if a=5
       db 66h,66h,90h,66h,90h
    end if
    if a=6
       db 66h,66h,90h,66h,66h,90h
    end if
    if a=7
       db 66h,66h,66h,90h,66h,66h,90h
    end if
    if a=8
       db 66h,66h,66h,90h,66h,66h,66h,90h
    end if
    if a=9
       db 66h,66h,90h,66h,66h,90h,66h,66h,90h
    end if
    if a=10
       db 66h,66h,66h,90h,66h,66h,90h,66h,66h,90h
    end if
    if a=11
       db 66h,66h,66h,90h,66h,66h,66h,90h,66h,66h,90h
    end if
    if a=12
       db 66h,66h,66h,90h,66h,66h,66h,90h,66h,66h,66h,90h
    end if
    if a=13
       db 66h,66h,66h,90h,66h,66h,90h,66h,66h,90h,66h,66h,90h
    end if
    if a=14
       db 66h,66h,66h,90h,66h,66h,66h,90h,66h,66h,90h,66h,66h,90h
    end if
    if a=15
       db 66h,66h,66h,90h,66h,66h,66h,90h,66h,66h,66h,90h,66h,66h,90h
    end if
}

section '.data' data readable writeable
        BYTES_PER equ 4
;;;formatting
        printF db '%d ',0
        printLN db 13,10,0
        printFB db '%d ms execution speed',0
;;;Input
align 16
        PermList:
                dd 1,2,3,4,5,6,7,8,9,10,11,12
        EndList:
        PermListSize dd (EndList - PermList) / BYTES_PER
;;;part of algorithm
align 16
      load0123 dd 0,1,2,3
      add4     dd 4,4,4,4

section '.code' code readable executable

start:
        call    [GetTickCount]
        mov     ebx,eax
        call    permutify
        call    [GetTickCount]
        sub     eax,ebx
        push    eax
        push    printFB
        call    [printf]
        add     esp,8
        push    0
        push    0
        push    0
        push    0
        call    [MessageBox]
        push    0
        call    [ExitProcess]

align 8
permutify:
        push    ebx
        mov     eax,[PermListSize]
        push    esi
        add     eax,4 ;;;padding so we can use SIMD
        push    edi
        shl     eax,2 ;;;*4 size->dwords of memory
        push    PAGE_READWRITE
        push    MEM_COMMIT or MEM_RESERVE
        push    eax
        push    0
        call    [VirtualAlloc]
        mov     ebx,[PermListSize]
        lea     esi,[PermList]
        mov     edi,eax
        shr     ebx,1 ;;;/2 would use /4 but can't *16 address
        movdqa  xmm0,dqword[load0123]
        movdqa  xmm1,dqword[add4]
        xor     eax,eax
   .fill:
        movdqa  dqword[edi+eax*8],xmm0
        add     eax,2
        paddd   xmm0,xmm1
        cmp     eax,ebx
        jle     .fill
;call print ;;;use while Perming;;;comment while benchmarking
        mov     ebx,01h
   AMDPad16
   .loop:
        lea     ecx,[edi+ebx*4]
        mov     eax,ebx
        cmp     ebx,dword[PermListSize]
        jge     .end
        ror     eax,1
        sub     dword[ecx],01h
        cdq
        and     edx,[ecx]
        mov     ecx,[esi+ebx*4]
        mov     eax,[esi+edx*4]
        mov     [esi+edx*4],ecx
        mov     [esi+ebx*4],eax
;call print ;;;use while Perming;;;comment while benchmarking
        mov     ebx,01h
   AMDPad16
   .innerloop:
        cmp     dword[edi+ebx*4],00h
        jnz     .innerend
        mov     [edi+ebx*4],ebx
        add     ebx,1
        jmp     .innerloop
   .innerend:
        jmp     .loop
   .end:
        pop     edi
        pop     esi
        pop     ebx
        ret     0

print:
        push    ebx
        mov     ebx,[PermListSize]
   .prnt:
        dec     ebx
        js      .endprnt
        push    dword[PermList+ebx*BYTES_PER]
        push    printF
        call    [printf]
        add     esp,8
        jmp     .prnt
   .endprnt:
        push    printLN
        call    [printf]
        add     esp,4
        pop     ebx
        ret     0

section '.idata' import data readable writeable

  library kernel32,'KERNEL32.DLL',\
          msvcrt,'MSVCRT.DLL',\
          user32,'USER32.DLL'
      include  "c:\FASM\INCLUDE\apia\kernel32.inc"
      include  "c:\FASM\INCLUDE\apia\user32.inc"
  import msvcrt,\
         printf,'printf'

section '.reloc' fixups data discardable 
    


AMD x2 3800+ 2.0GHz 1GB ram
Quote:

3422 ms execution speed
Post 29 Oct 2006, 02:18
View user's profile Send private message AIM Address Yahoo Messenger Reply with quote
madmatt



Joined: 07 Oct 2003
Posts: 1045
Location: Michigan, USA
madmatt
I get about 3390ms (Intel Celeron 2.7ghz, 1GB ram). Is this for a school project or something?
Post 29 Oct 2006, 09:33
View user's profile Send private message Reply with quote
LocoDelAssembly
Your code has a bug


Joined: 06 May 2005
Posts: 4633
Location: Argentina
LocoDelAssembly
3453 ms execution speed (AMD Athlon64 +3200 [2.0 GHz S939]).
Post 29 Oct 2006, 15:51
View user's profile Send private message Reply with quote
r22



Joined: 27 Dec 2004
Posts: 805
r22
madmatt, it's not for any project. Just an interesting algorithm I've been meaning to code up.

I'm thinking that permutations can be done in parallel with eachother.
Using a multi-threaded approach, but despite the rather small and simplistic size to the algorithm analysis is somewhat complex.
Post 29 Oct 2006, 19:03
View user's profile Send private message AIM Address Yahoo Messenger Reply with quote
rugxulo



Joined: 09 Aug 2005
Posts: 2341
Location: Usono (aka, USA)
rugxulo
P4 2.52Ghz 512 MB RAM

Quote:

3734 ms execution speed
Post 29 Oct 2006, 19:51
View user's profile Send private message Visit poster's website Reply with quote
r22



Joined: 27 Dec 2004
Posts: 805
r22
Thanks for all the results.

Now some optimizations for the current algorithm or aa alternative algorithm to test would be interesting.
Post 29 Oct 2006, 22:19
View user's profile Send private message AIM Address Yahoo Messenger Reply with quote
Display posts from previous:
Post new topic Reply to topic

Jump to:  


< Last Thread | Next Thread >
Forum Rules:
You cannot post new topics in this forum
You cannot reply to topics in this forum
You cannot edit your posts in this forum
You cannot delete your posts in this forum
You cannot vote in polls in this forum
You cannot attach files in this forum
You can download files in this forum


Copyright © 1999-2020, Tomasz Grysztar. Also on GitHub, YouTube, Twitter.

Website powered by rwasa.