flat assembler
Message board for the users of flat assembler.

Index > Main > How to optimise this code?

Author
Thread Post new topic Reply to topic
Hydrogen



Joined: 15 Apr 2016
Posts: 6
Hydrogen 29 Apr 2016, 09:55
Still new to asm. I've been coding various algorithms to practice and learn more about fasm. I recently coded Radix sort for fun but as a complete newb to assembly programming, I realize my code is inefficient and completely un-optimized. I know there are amazing tricks and the experts here know how to write compact efficient code. Could you guys look over my code and provide some tips and suggestions for improving it? The pseudo-alg I followed is this:

Code:
for i = 0...15 do {
shift_offset = i;
for each number in the array do {
   digit = number >> shift_offset
   if(digit == 0)
      place in zero bucket
   else
      place in one bucket
 } 
  for each number in zero bucket
     copy back to the array
  for each number in one bucket
     copy back to the array
} 
    


And here's my fasm code.
Code:
        format PE Console
        entry start
        include 'win32ax.inc'

size = 13 ;number of values we are considering

section '.code' code readable executable

start:
        mov eax,0        ;prepare the outer for loop
outer:  cmp eax,15
        ja done
        xor ebx,ebx
        mov ebx,eax       ;bx is the current shift offset

        mov ecx,0       ;numbers should increase BY 2 for word
inner:  cmp ecx,(size-1)*2     ;for each number in the array do
        ja indone       ;if greater than size, then finished

        lea edx,[array + ecx]   ;edx = address of current number tested
        bt [edx],ebx     ;test bit number ebx shift offset. bt sets the CF Flag
        jc bitone        ;if CF Flag not set, place in zero bucket
        xor esi,esi      ;esi will store the index to put in zeroes array
        xor edi,edi
        lea esi,[zeroes]
        add esi,[zerocnt]
        mov di,word [edx]
        mov word [esi],di           ;edi contains number: esi contains zeroes index;
                                    ;mov into zeroes array
        add [zerocnt],2             ;increase zeroes index by 2(word)
        jmp nxtnum

bitone: xor esi,esi      ;esi will store the index to put in ones array
        xor edi,edi      ;edi = actual number
        lea esi,[ones]
        add esi,[onecnt]
        mov di,word [edx]
        mov word [esi],di           ;edi contains number: esi contains ones index;
                                    ;mov into ones array
        add [onecnt],2             ;increase ones index by 2(word)

nxtnum: add ecx,2
        jmp inner

indone:
        inc eax   ;increase the next bit to test
        xor edi,edi         ;edi = for loop counter
zerobkt:                    ;copy zeroes back into array
        cmp edi,[zerocnt]
        jae onebkt          ;check if all numbers copied

        lea edx,[zeroes+edi]    ;edx = address of zeroes item
        mov esi,[edx]           ;esi = contents of zeroes item

        lea edx,[array+edi]     ;edx = address of array
        mov word [edx],si           ;mov contents into the array
        add edi,2
        jmp zerobkt             ;move onto next number to copy

onebkt: xor edi,edi
        ;mov edi,[zerocnt]       ;begin ones bucket where zero left off
@@:     cmp edi,[onecnt]
        jae nxtbit              ;if all numbers copied, Then exit and move to next bit

        lea edx,[ones+edi]      ;edx = address of ones item
        mov esi,[edx]           ;esi = contents of ones item

        lea edx,[array+edi]     ;edx = address of array
        add edx,[zerocnt]       ;continue where zeroes left off
        mov word [edx],si           ;mov ones item into array
        add edi,2
        jmp @b              ;move onto next number to copy

nxtbit: mov [zerocnt],0
        mov [onecnt],0

        jmp outer

done:   invoke ExitProcess,0               ;outer for loop done

section '.data' data readable writeable
        ;array dw 562,307,16,449,98,555,309,713,2,84
        array dw 43,5,17,19,59,49,11,1,97,83,71,21,15
        zeroes rw size
        ones rw size
        zerocnt dd 0
        onecnt dd 0

section '.idata' import data readable writeable

  library kernel32,'KERNEL32.DLL',\
          user32,'USER32.DLL'

  include 'api\kernel32.inc'
  include 'api\user32.inc'    

Really looking forward to hearing from you guys. I definitely want to improve. My code is ugly and bloated and I feel like most of you guys here could code Radix sort in 20 lines or less.

thanks
Post 29 Apr 2016, 09:55
View user's profile Send private message Reply with quote
revolution
When all else fails, read the source


Joined: 24 Aug 2004
Posts: 20522
Location: In your JS exploiting you and your system
revolution 29 Apr 2016, 12:55
Just looking quickly at this I saw the following:
Code:
        xor esi,esi      ;esi will store the index to put in zeroes array
        xor edi,edi
        lea esi,[zeroes]
        add esi,[zerocnt]
        mov di,word [edx]
        mov word [esi],di           ;edi contains number: esi contains zeroes index;
                                    ;mov into zeroes array    
It can be simplified to:
Code:
        mov     esi,[zerocnt]
        mov     di,[edx]
        mov     [zeroes + esi],di    
Post 29 Apr 2016, 12:55
View user's profile Send private message Visit poster's website Reply with quote
Hydrogen



Joined: 15 Apr 2016
Posts: 6
Hydrogen 29 Apr 2016, 23:57
revolution wrote:
Just looking quickly at this I saw the following:
Code:
        xor esi,esi      ;esi will store the index to put in zeroes array
        xor edi,edi
        lea esi,[zeroes]
        add esi,[zerocnt]
        mov di,word [edx]
        mov word [esi],di           ;edi contains number: esi contains zeroes index;
                                    ;mov into zeroes array    
It can be simplified to:
Code:
        mov     esi,[zerocnt]
        mov     di,[edx]
        mov     [zeroes + esi],di    


Simple but beautiful! I like how efficient and compact you made it. I couldn't find a way to move through the array except through the use of "lea". But you bypassed that by reversing the instructions. I have so much to learn.
Post 29 Apr 2016, 23:57
View user's profile Send private message Reply with quote
revolution
When all else fails, read the source


Joined: 24 Aug 2004
Posts: 20522
Location: In your JS exploiting you and your system
revolution 30 Apr 2016, 16:21
It is unknown if the code will actually be more efficient, except perhaps for simple metrics like cache usage (i.e. code size). For other measures it might make no difference.
Post 30 Apr 2016, 16:21
View user's profile Send private message Visit poster's website Reply with quote
Hydrogen



Joined: 15 Apr 2016
Posts: 6
Hydrogen 01 May 2016, 05:30
thanks! Another question: when I was coding it I had a hard time with reuse of registers. For example, I would run out of registers to use since they all had values I needed. To get around this I had to declare a new memory variable to temporarily store the new values so that I would not overwrite the registers.

What's the general strategy to avoid memory location usage and instead use only registers? Obviously registers are faster so ideally I want to use nothing but registers. How do you guys maximize register use without resorting to declaring memory variables?
Post 01 May 2016, 05:30
View user's profile Send private message Reply with quote
revolution
When all else fails, read the source


Joined: 24 Aug 2004
Posts: 20522
Location: In your JS exploiting you and your system
revolution 01 May 2016, 05:36
When you need to store values in memory then the stack is usually the best place. It is almost guaranteed to be in the cache and you don't have those ugly global variables polluting the code.

As for maximizing register usage, I guess the only proper way is to not use registers for too long and/or don't use them when you don't have to.

And once again, as to whether using registers is really faster or not is unknown. While it is natural to assume that they are "faster", that assumption can fail in very many ways, modern CPUs are very complex and static prediction of run times is extremely tricky. Always test your resulting code to see if the results are really what you expect. Never simply assume.
Post 01 May 2016, 05:36
View user's profile Send private message Visit poster's website Reply with quote
system error



Joined: 01 Sep 2013
Posts: 670
system error 01 May 2016, 11:22
hydro, registers can be used in smaller parts.

The idea of xmm/ymm/zmm registers is: read from memory once, and then use the registers all the way without going back and forth the memory. I think this serves your purposes right.

And if you need some more registers, then use the FPU registers first to keep your data (you now have 7 more registers to use), which are slightly faster than conventional memory.

What else? Yeahh, there are control registers and debug registers too ^_^

And there's also segment 'registers' if you don't mind. hehehe

And did anybody tell you that there's 'ghost' registers hidden somewhere in your system? How do I know this? Well, let's just say I have friends...

good luck with these registers.
Post 01 May 2016, 11:22
View user's profile Send private message Reply with quote
El Tangas



Joined: 11 Oct 2003
Posts: 120
Location: Sunset Empire
El Tangas 28 May 2016, 14:37
Hi, I would avoid the BT mem,reg instruction, it's pretty slow. I would test the bits using the TEST instruction, against a mask in EBX. Then, ADD ebx, ebx (= shift left) to test the next bit.

Also, you don't need to "initialize" registers with zero before use, for example you do this:
Code:
        xor esi,esi      ;esi will store the index to put in zeroes array
        xor edi,edi
        lea esi,[zeroes]
        add esi,[zerocnt]
        mov di,word [edx]    


xor esi, esi is not needed at all, because you write the full register later with lea. xor edi,edi is needed because you only write half the register, however you can use movzx edi,word [edx] and this will automatically clear the top 16 bits of edi.
So, equivalent code:
Code:
        lea esi,[zeroes]
        add esi,[zerocnt]
        movzx edi,word [edx]    


Finally, instead of this:
Code:
        add [zerocnt],2             ;increase zeroes index by 2(word)
        jmp nxtnum
        (...)
nxtnum: add ecx,2
        jmp inner     


I would just replicate the code. Memory is cheap these days, so optimize for size is usually pointless, but jumps are expensive and should be avoided when possible, especially unpredictable conditional jumps.
Code:
        add [zerocnt],2             ;increase zeroes index by 2(word)
        add ecx,2
        jmp inner
        (...)
        add ecx,2
        jmp inner    


When you get to a more advanced level in x86 assembly, and if you really want to master optimization, read Agner Fog's manuals, I think they are very good:

http://www.agner.org/optimize/
Post 28 May 2016, 14:37
View user's profile Send private message Reply with quote
El Tangas



Joined: 11 Oct 2003
Posts: 120
Location: Sunset Empire
El Tangas 01 Jun 2016, 11:32
Well, just to follow up on this, since you wanted to learn some asm tricks, I wrote some pretty tricky code for you to study Cool probably not the fastest or smallest possible, but illustrates a few things:

1) Think carefully about the algorithm, can it be improved a bit? In this case, I noticed the "zeroes" bin can be the original array, there is no problem because no data can be overwritten and lost, since the zeroes array it a most as long as the original array. So one mem to mem copy operation becomes redundant and can be removed.

2) Removed the random branch that distributes data between the bins. Random branches cannot be predicted by the CPU, therefore are bad. Instead, conditional instructions (CMOV and SET) and some tricks do the job Wink there are other options also.

3) I used the REP MOVSW string copy instruction, this may or may not be the fastest way, but it surely is the shortest.

4) align data to at least DWORD addresses. Jump targets should also be aligned (I didn't do that here)

5) You should not exit programs with a RET, but I did that to show you it's possible, in case you didn't know.

So the code is not under 20 instructions, but close enought Very Happy

Code:
        format PE Console
        entry start


size = 23 ;number of values we are considering
zeroes equ array        ;zeroes bin can actualy be the starting array, there is no danger of data loss
                        ;because length of zeroes <= lenght of array

section '.code' code readable executable

start:
        mov     eax,00010001h           ;prepare selection mask
outer:
        xor     ebx,ebx                 ;initialize "zeroes" array index
        xor     edx,edx                 ;initialize "ones" array index
        lea     ecx,[ebx+edx]           ;initialize sum of "zeroes" and "ones" indexes
inner:
        movzx   ecx,word [array+2*ecx]  ;get current value
        test    ecx,eax                 ;check current bit
        lea     edi,[(ones-zeroes)/2+edx]   ;default, go to "ones" bin
        cmovz   edi,ebx                 ;if zero, go to "zeroes" bin
        mov     [zeroes+2*edi],cx       ;goes to selected bin
        mov     ecx,0                   ;one of the bin indexes must be incremented
        setz    cl                      ;is "zeroes" index to be incremented?
        add     ebx,ecx                 ;update "zeroes" index (index+1=count) LEA preserves flags
        xor     ecx,1                   ;if ecx=0 make ecx=1, and vice versa
        add     edx,ecx                 ;update "ones" index
        lea     ecx,[ebx+edx]           ;get sum of "zeroes" and "ones" indexes
        cmp     ecx,size                ;has it reached the final value?
        jne     inner                   ;not yet

next_bit:
        sub     ecx,ebx                 ;get "ones" array lenght in ecx
        jz      skip_copy               ;if zero, there is nothing to copy
        mov     esi,ones                ;eSi source of data to copy
        lea     edi,[array+2*ebx]       ;eDi destination of data to copy
        rep     movsw                   ;append "ones" array to "zeroes" array

skip_copy:
        rol     eax,1                   ;after 16 rotations, CF will be 1
        jnc     outer                   ;not finished, CF is still zero
        ret

section '.data' data readable writeable
        array dw 562,307,16,449,98,555,309,713,2,84,43,5,17,19,59,49,11,1,97,83,71,21,15
        ;zeroes rw size
        align 4         ;it's always a good idea to align data to at least dword addresses
        ones rw size

     
Post 01 Jun 2016, 11:32
View user's profile Send private message Reply with quote
l4m2



Joined: 15 Jan 2015
Posts: 674
l4m2 14 Aug 2016, 17:19
Is this an valid solution?
Code:
        mov     edx,    1
        xor     ebx,    ebx
lp:     mov     esi,    array
        mov [t],    esi
        mov     [t+4],dword     tmp
        mov     ecx,    len
lp2:    scasw
        test    edx,    eax
        setnz   bl
        mov     edi,    [t+4*ebx]
        stosw
        mov     [t+4*ebx], \
                        edi
        loop    lp2
        mov     esi,    tmp
        mov     edi,    [esp]
        mov     ecx,    [t+4]
        sub     ecx,    tmp
        rep     movsb
        add     dx,     dx
        jnz     lp
            
Post 14 Aug 2016, 17:19
View user's profile Send private message Reply with quote
Display posts from previous:
Post new topic Reply to topic

Jump to:  


< Last Thread | Next Thread >
Forum Rules:
You cannot post new topics in this forum
You cannot reply to topics in this forum
You cannot edit your posts in this forum
You cannot delete your posts in this forum
You cannot vote in polls in this forum
You cannot attach files in this forum
You can download files in this forum


Copyright © 1999-2025, Tomasz Grysztar. Also on GitHub, YouTube.

Website powered by rwasa.