flat assembler
Message board for the users of flat assembler.

Index > Macroinstructions > Help me to optimise this macro for less RAM usage

Author
Thread Post new topic Reply to topic
revolution
When all else fails, read the source


Joined: 24 Aug 2004
Posts: 20298
Location: In your JS exploiting you and your system
revolution 07 Dec 2007, 15:20
At vid's suggestion I post this here as an exercise in reducing RAM usage when expanding macros.

The macro is a simple instruction counter that returns the number of PUSHes/POPs the CPU will execute. I had originally intended to use it in part of a proc macro that uses esp as the frame pointer (to free up ebp). The idea was to track the esp offset automatically with the help of this macro. I eventually abandoned it because I could only compile very small programs before running out of memory.
Code:
irp instr,push,pop {
    local i,.i,defSize,aSize,dSize,invSize
    macro instr#Count retCount,[args] \{
   \common
    virtual at 0
            xchg ax,ax
          defSize=1 shl $     ;use16=2, use32=4, use64=4
      xchg eax,eax
        invSize=($*2)and 6  ;use16=6, use32=6, use64=0
  end virtual
 retCount=0
    .i=$
      instr args
  while .i<$
           aSize=defSize
       dSize=defSize
       load i from .i
      while (i and 0e7h)=026h | (i and 0fch)=064h         ;override or segment prefix
         if i=066h                                       ;operand size toggle
                    dSize=defSize xor 6
             else if i=067h                                  ;address size toggle
                    aSize=defSize xor invSize
               end if
              .i=.i+1
             load i from .i
          end while
           if (i and 0f0h)=040h                                ;REX
                .i=.i+1
             load i from .i
          end if
      retCount=retCount+1
       match =push,instr \\{
          if i=068h                                           ;imm16/32
           .i=.i+dSize
     else if i=06ah                                      ;imm8
               .i=.i+1
         else if i=0ffh                                      ;ModR/M
       \\}
          match =pop,instr \\{
           if i=08fh                                           ;ModR/M
       \\}
                .i=.i+1
             load i from .i
              if (i and 0c0h)=040h                            ;disp8
                  .i=.i+1
         else if (i and 0c0h)=080h                       ;disp16/32
              .i=.i+aSize
             else if aSize=4 & (i and 0c7h)=05h          ;disp32
                 .i=.i+aSize
             else if aSize=2 & (i and 0c7h)=06h          ;disp16
                 .i=.i+aSize
             end if
              if aSize=4 & (i and 0c0h)<>0c0h & (i and 7)=4     ;SIB
                    .i=.i+1
             if (i and 0c7h)=4                           ;SIB with Mod=00
                    load i from .i
                      if (i and 07h)=5                        ;SIB without base + disp32
                      .i=.i+aSize
                     end if
                  end if
          end if
          else if i=00fh                                      ;push/pop fs/gs
             .i=.i+1
         end if
      .i=.i+1
 end while
   if .i<>$
          halt ;cannot decode push/pop
    end if
    \}
}

macro testPush expCount,args {
    pushCount c,args
    if expCount<>c
     halt ;count not equal to expected count
    end if
}
macro testPop expCount,args {
    popCount c,args
    if expCount<>c
    halt ;count not equal to expected count
    end if
}

virtual at 0
    testPush 1,1
    irp Use,use16,use32 {
   Use
 testPush 16,ax bx cx dx si di bp sp eax ebx ecx edx esi edi ebp esp
 testPop 16,ax bx cx dx si di bp sp eax ebx ecx edx esi edi ebp esp
  irp Size,dword,word \{
            testPush 6,Size cs Size ds Size es Size ss Size fs Size gs
          testPop 5,Size ds Size es Size ss Size fs Size gs
           testPush 3,Size 256  Size[word 0]  Size[dword 0]
            irp SegmentReg,cs,ds,es,ss,fs,gs \\{
             irp Displacement,0,1,256 \\\{
                   irp AddrReg,bx,bp,si,di,bx+si,bx+di,bp+si,bp+di \\\\{
                  testPush 1,Size[SegmentReg:AddrReg+Displacement]
                        testPop 1,Size[SegmentReg:AddrReg+Displacement]
             \\\\}
              irp BaseReg,eax,ebx,ecx,edx,esi,edi,ebp,esp,0 \\\\{
                    irp IndexReg,eax,ebx,ecx,edx,esi,edi,ebp,0 \\\\\{
                     irp Scale,1,2,4,8 \\\\\\{
                            testPush 1,Size[SegmentReg:BaseReg+IndexReg*Scale+Displacement]
                         testPop 1,Size[SegmentReg:BaseReg+IndexReg*Scale+Displacement]
                      \\\\\\}
                      \\\\\}
                \\\\}
          \\\}
            \\}
      \}
    }
    irp Use,use64 {
  Use
 testPush 16,ax bx cx dx si di bp sp r8w r9w r10w r11w r12w r13w r14w r15w
   testPush 16,rax rbx rcx rdx rsi rdi rbp rsp r8 r9 r10 r11 r12 r13 r14 r15
   testPop 16,ax bx cx dx si di bp sp r8w r9w r10w r11w r12w r13w r14w r15w
    testPop 16,rax rbx rcx rdx rsi rdi rbp rsp r8 r9 r10 r11 r12 r13 r14 r15
    irp Size,qword,word \{
            testPush 2,Size fs Size gs
          testPop 2,Size fs Size gs
           testPush 3,Size 256  Size[dword 0]  Size[$]
         irp SegmentReg,,fs:,gs: \\{
              irp Displacement,0,1,256 \\\{
                   irp BaseReg,eax,ebx,ecx,edx,esi,edi,ebp,esp,r8d,r9d,r10d,r11d,r12d,r13d,r14d,r15d,0 \\\\{
                      irp IndexReg,eax,ebx,ecx,edx,esi,edi,ebp,0,r8d,r9d,r10d,r11d,r12d,r13d,r14d,r15d \\\\\{
                       irp Scale,1,2,4,8 \\\\\\{
                            testPush 1,Size[SegmentReg BaseReg+IndexReg*Scale+Displacement]
                             testPop 1,Size[SegmentReg BaseReg+IndexReg*Scale+Displacement]
                          \\\\\\}
                      \\\\\}
                \\\\}
              irp BaseReg,rax,rbx,rcx,rdx,rsi,rdi,rbp,rsp,r8,r9,r10,r11,r12,r13,r14,r15,0 \\\\{
                      irp IndexReg,rax,rbx,rcx,rdx,rsi,rdi,rbp,0,r8,r9,r10,r11,r12,r13,r14,r15 \\\\\{
                       irp Scale,1,2,4,8 \\\\\\{
                            testPush 1,Size[SegmentReg BaseReg+IndexReg*Scale+Displacement]
                             testPop 1,Size[SegmentReg BaseReg+IndexReg*Scale+Displacement]
                          \\\\\\}
                      \\\\\}
                \\\\}
          \\\}
            \\}
      \}
    }
end virtual    
This uses a lot of RAM when compiling. I would be delighted if someone can find a way to significantly reduce the space requirements.

The code inside the final virtual block is the test I used to verify the operation, it exercises all forms of PUSH and POP, but without lots of RAM it can't be compiled.
Post 07 Dec 2007, 15:20
View user's profile Send private message Visit poster's website Reply with quote
vid
Verbosity in development


Joined: 05 Sep 2003
Posts: 7105
Location: Slovakia
vid 07 Dec 2007, 15:30
first quick idea: write first macro separately for each of "push" and "pop", and then you can get rid of "match =push, instr" and "match =pop, instr" parts.

EDIT: change my mind, this won't matter too much, it will only save a memory before preprocessing, not after.
Post 07 Dec 2007, 15:30
View user's profile Send private message Visit poster's website AIM Address MSN Messenger ICQ Number Reply with quote
vid
Verbosity in development


Joined: 05 Sep 2003
Posts: 7105
Location: Slovakia
vid 07 Dec 2007, 15:57
I think this macro cannot be considerably optimized.

"push" instruction is extra bitch, because it allows multiple space-separated arguments. Combined with "ptr" syntax, there is no other way to parse arguments, other than parsing generated opcodes, like you did.

Even push itself can be pretty ambigous, and depends on how assembler decides to assemble it:
Code:
push -5 dword ptr ebx
push dword ptr ebx -5
...
    
Post 07 Dec 2007, 15:57
View user's profile Send private message Visit poster's website AIM Address MSN Messenger ICQ Number Reply with quote
revolution
When all else fails, read the source


Joined: 24 Aug 2004
Posts: 20298
Location: In your JS exploiting you and your system
revolution 08 Dec 2007, 08:48
vid wrote:
I think this macro cannot be considerably optimized.

"push" instruction is extra bitch, because it allows multiple space-separated arguments. Combined with "ptr" syntax, there is no other way to parse arguments, other than parsing generated opcodes, like you did.
Hmm, pity about that. The testing part is also very memory intensive with the irp's. If it was to be completely unrolled with just a long list of push/pop instruction it will use much less memory also. Of course that would be more tedious to write and debug.
vid wrote:
Even push itself can be pretty ambigous, and depends on how assembler decides to assemble it:
Code:
push -5 dword ptr ebx
push dword ptr ebx -5
...
    
The assembler uses the greedy approach so the code you post will assemble predictably. I also struck this trap some time back, see http://board.flatassembler.net/topic.php?t=3292
Post 08 Dec 2007, 08:48
View user's profile Send private message Visit poster's website Reply with quote
vid
Verbosity in development


Joined: 05 Sep 2003
Posts: 7105
Location: Slovakia
vid 08 Dec 2007, 10:03
Quote:
Hmm, pity about that. The testing part is also very memory intensive with the irp's. If it was to be completely unrolled with just a long list of push/pop instruction it will use much less memory also. Of course that would be more tedious to write and debug.

I doubt about that. IMO it will have similar requirement. FASM will unroll it to same code you'd write, and macro nesting needs just couple of bytes on stack. I'm not sure, we'd need to ask tomasz
Post 08 Dec 2007, 10:03
View user's profile Send private message Visit poster's website AIM Address MSN Messenger ICQ Number Reply with quote
revolution
When all else fails, read the source


Joined: 24 Aug 2004
Posts: 20298
Location: In your JS exploiting you and your system
revolution 08 Dec 2007, 10:51
vid wrote:
I doubt about that. IMO it will have similar requirement. FASM will unroll it to same code you'd write, and macro nesting needs just couple of bytes on stack. I'm not sure, we'd need to ask tomasz
Try it, use the -m parameter and see what you get. I expect a fully unrolled version will use less than 1/3 of the memory. I haven't the time to try it now, it would take too long to unroll the irp's, but I would be happy to learn that fasm is more efficient with memory than my prediction suggests.
Post 08 Dec 2007, 10:51
View user's profile Send private message Visit poster's website Reply with quote
vid
Verbosity in development


Joined: 05 Sep 2003
Posts: 7105
Location: Slovakia
vid 08 Dec 2007, 13:24
i tried to compile it with max available for me (250MB), and still it didn't compile. But please, try it, if you have more RAM Wink
Post 08 Dec 2007, 13:24
View user's profile Send private message Visit poster's website AIM Address MSN Messenger ICQ Number Reply with quote
LocoDelAssembly
Your code has a bug


Joined: 06 May 2005
Posts: 4624
Location: Argentina
LocoDelAssembly 08 Dec 2007, 16:15
Compiles at 768 MB, and if you remove the use detection code outside and place it only when needed you save around 64 MB. Removing "local" also reduces memory requirements (the xxxx?xxxxxxxx are a lot longer than just "i", "a", etc).

Still, note that these macros will not solve all cases, imagine this bitchy code
Code:
proc_esp foo, arg

mov  ecx, 3
.loop:
  push ecx
  dec  ecx
  jnz .loop
  
  invoke bar, arg ; How could it detect correctly arg position?

  add  esp, 3*4

  ret
endp    


The code is very unrealistic, but it is just to show that push will not be properly tracked always.

Here another more realistic
Code:
proc_esp bar, arg1, arg2, arg3, arg4
  cmp [arg1], 1
  jne .push_arg2

  push [arg3] ; Will be counted even though it has 50 % of changes to get executed only
  jmp .call_func

.push_arg2:
  push [arg2] ; Will be counted even though it has 50 % of changes to get executed only (and arg2 position will be wrongly considered as shifted)
  jmp .call_func

.call_func:
  push [arg4] ; What gonna happen here?
  call func

  ret
endp    


These problems doesn't exists with EBP-based frames and I'm particularly worried about the second example, it is not very uncommon actually.

Optimizing this is a good challenge though Smile but to address this problem I think that instead of trying to provide automated tracking we need to feed the code with programmer's info instead so when you are going to access a parameter, the macros will compute deltas based on the info provided by hand.

To make this handy the delta could be modified in both, relative and absolute fashion.
Post 08 Dec 2007, 16:15
View user's profile Send private message Reply with quote
vid
Verbosity in development


Joined: 05 Sep 2003
Posts: 7105
Location: Slovakia
vid 08 Dec 2007, 17:13
I think trying to make this automated is not best idea.

If you want to keep track of stack, you should use win64 style: one "sub rsp,XXX" at begininng, and then just moves to/from memory.

But optimizing macro is still good challenge. Changing length of names is still good idea: maybe tomasz could use "a?1" instead of "a?00000001", and save quite lot of memory if lot of macros with locals are used
Post 08 Dec 2007, 17:13
View user's profile Send private message Visit poster's website AIM Address MSN Messenger ICQ Number Reply with quote
Tomasz Grysztar



Joined: 16 Jun 2003
Posts: 8349
Location: Kraków, Poland
Tomasz Grysztar 08 Dec 2007, 17:16
vid wrote:
But optimizing macro is still good challenge. Changing length of names is still good idea: maybe tomasz could use "a?1" instead of "a?00000001", and save quite lot of memory if lot of macros with locals are used

Good idea, and very simple to do.
Post 08 Dec 2007, 17:16
View user's profile Send private message Visit poster's website Reply with quote
revolution
When all else fails, read the source


Joined: 24 Aug 2004
Posts: 20298
Location: In your JS exploiting you and your system
revolution 08 Dec 2007, 18:30
LocoDelAssembly wrote:
... but to address this problem I think that instead of trying to provide automated tracking we need to feed the code with programmer's info instead so when you are going to access a parameter, the macros will compute deltas based on the info provided by hand.

To make this handy the delta could be modified in both, relative and absolute fashion.
Thanks for your suggestion. I was indeed aware of the two cases you mentioned. Like I said it was part of the whole solution I had planned to attempt to implement. After I saw the memory usage go through the roof I backed completely off the project and forgot about it. That was about six months ago.

Pulling the 'use' detection code means creating a macro for each 'use' and assigning globally unique names for the two parameters. Not too much of an overhead, and can be done quite quickly.

I think I need to seriously change my estimate above of 1/3 memory for unrolling the irp's. I calculated there are 120987 pushes and pops combined. With each uncompressed macro call being ~1500 bytes that comes to ... well a big number. So I reevaluate to more like 90%.
Post 08 Dec 2007, 18:30
View user's profile Send private message Visit poster's website Reply with quote
bitRAKE



Joined: 21 Jul 2003
Posts: 4016
Location: vpcmpistri
bitRAKE 08 Dec 2007, 19:06
Code:
    pushad
    mov dword [esp+_ESP_],.x
    jmp MyFunk
.x: popad
.
.
.
MyFunk:
...
    jmp dword [esp+_ESP_]    
It keeps the stack aligned to 32 bytes, and all the registers are free for use. Laughing
Post 08 Dec 2007, 19:06
View user's profile Send private message Visit poster's website Reply with quote
revolution
When all else fails, read the source


Joined: 24 Aug 2004
Posts: 20298
Location: In your JS exploiting you and your system
revolution 08 Dec 2007, 22:59
bitRAKE wrote:
Code:
    pushad
    mov dword [esp+_ESP_],.x
    jmp MyFunk
.x: popad
.
.
.
MyFunk:
...
    jmp dword [esp+_ESP_]    
It keeps the stack aligned to 32 bytes, and all the registers are free for use. Laughing
Umm,no it doesn't, esp is not free, any form of stack access that doesn't get later restored will kill your code. You need a procedure unique global memory slot to save and restore esp if you want all registers free.
Code:
mov [this_procs_global_store],esp
do_whatever: nop
...
mov esp,[this_procs_global_store]
retn arg_count    
Post 08 Dec 2007, 22:59
View user's profile Send private message Visit poster's website Reply with quote
bitRAKE



Joined: 21 Jul 2003
Posts: 4016
Location: vpcmpistri
bitRAKE 08 Dec 2007, 23:40
revolution wrote:
Umm,no it doesn't, esp is not free, any form of stack access that doesn't get later restored will kill your code. You need a procedure unique global memory slot to save and restore esp if you want all registers free.
Code:
mov [this_procs_global_store],esp
do_whatever: nop
...
mov esp,[this_procs_global_store]
retn arg_count    
That is a good way to keep ESP balanced when there is chance it could be modified. Usually, it is possible to avoid an unbalanced stack. Even on dynamically recursive functions some type of unwinding of the stack can be done in catastrophic times. Good this kind of thing is rather rare.
Post 08 Dec 2007, 23:40
View user's profile Send private message Visit poster's website Reply with quote
Display posts from previous:
Post new topic Reply to topic

Jump to:  


< Last Thread | Next Thread >
Forum Rules:
You cannot post new topics in this forum
You cannot reply to topics in this forum
You cannot edit your posts in this forum
You cannot delete your posts in this forum
You cannot vote in polls in this forum
You cannot attach files in this forum
You can download files in this forum


Copyright © 1999-2024, Tomasz Grysztar. Also on GitHub, YouTube.

Website powered by rwasa.