flat assembler
Message board for the users of flat assembler.

Index > High Level Languages > what happened to fibonacci in gcc? huge code produced...

Author
Thread Post new topic Reply to topic
tthsqe



Joined: 20 May 2009
Posts: 767
tthsqe 28 Nov 2013, 12:12
c code:
Code:
int f(int n) {                 // naive doubly recursive Fibonacci
  if (n<2)                     // base case: 0, 1 and any negative n
    return n;                  // ...are returned unchanged
  else
    return f(n-1)+f(n-2);      // two recursive calls per invocation
}    


compiled by g++ 4.8.1 -O3 option:
Code:
# f(int) as emitted by g++ 4.8.1 -O3 (x86-64, Intel syntax).
# n arrives in edi; the result is returned in eax.
# The listing contains a single `call f(int)` (inside .L12). Everything around
# it is a nest of loops (.L28 outermost ... .L12 innermost) whose counters and
# partial sums live in stack slots [rsp+12]..[rsp+156] and in r12d-r15d/ebx/ebp.
f(int):
        # prologue: save the callee-saved registers used below
        push    r15
        mov     eax, edi
        push    r14
        push    r13
        push    r12
        push    rbp
        push    rbx
        sub     rsp, 168                        # spill area for the loop state
        cmp     edi, 1
        mov     DWORD PTR [rsp+108], edi        # keep n for the final sum at .L2
        jle     .L29                            # n <= 1: return n
        lea     edx, [rdi-3]
        mov     DWORD PTR [rsp+44], 0           # outermost partial sum = 0
        mov     DWORD PTR [rsp+68], edx
        # loop level 1 (closed by `jg .L28` after .L3)
.L28:
        lea     edx, [rax-1]
        cmp     edx, 1
        jle     .L30
        mov     esi, DWORD PTR [rsp+68]
        lea     edx, [rax-2]
        sub     eax, 4
        mov     DWORD PTR [rsp+92], eax
        mov     DWORD PTR [rsp+40], 0
        mov     ecx, edx
        mov     DWORD PTR [rsp+112], edx
        shr     esi
        mov     DWORD PTR [rsp+72], ecx
        lea     edx, [rsi+rsi]
        mov     DWORD PTR [rsp+120], esi
        mov     esi, eax
        sub     esi, edx
        mov     DWORD PTR [rsp+116], esi
        # loop level 2 (closed by `jne .L26` after .L4)
.L26:
        mov     edx, DWORD PTR [rsp+72]
        cmp     edx, 1
        mov     eax, edx
        jle     .L31
        sub     eax, 3
        mov     DWORD PTR [rsp+52], 0
        mov     esi, eax
        mov     DWORD PTR [rsp+96], eax
        mov     eax, DWORD PTR [rsp+92]
        shr     eax
        mov     DWORD PTR [rsp+128], eax
        add     eax, eax
        sub     esi, eax
        mov     eax, edx
        sub     eax, 1
        mov     DWORD PTR [rsp+124], esi
        mov     DWORD PTR [rsp+76], eax
        # loop level 3 (closed by `jne .L24` after .L5)
.L24:
        mov     esi, DWORD PTR [rsp+76]
        cmp     esi, 1
        mov     eax, esi
        jle     .L32
        sub     eax, 3
        mov     DWORD PTR [rsp+56], 0
        mov     edx, eax
        mov     DWORD PTR [rsp+100], eax
        mov     eax, DWORD PTR [rsp+96]
        shr     eax
        mov     DWORD PTR [rsp+136], eax
        add     eax, eax
        sub     edx, eax
        mov     eax, esi
        sub     eax, 1
        mov     DWORD PTR [rsp+132], edx
        mov     DWORD PTR [rsp+80], eax
        # loop level 4 (closed by `jne .L22` after .L6)
.L22:
        mov     edx, DWORD PTR [rsp+80]
        cmp     edx, 1
        mov     eax, edx
        jle     .L33
        sub     eax, 3
        mov     DWORD PTR [rsp+48], 0
        mov     esi, eax
        mov     DWORD PTR [rsp+104], eax
        mov     eax, DWORD PTR [rsp+100]
        shr     eax
        mov     DWORD PTR [rsp+144], eax
        add     eax, eax
        sub     esi, eax
        mov     eax, edx
        sub     eax, 1
        mov     DWORD PTR [rsp+140], esi
        mov     DWORD PTR [rsp+84], eax
        # loop level 5 (closed by `jne .L20` after .L7)
.L20:
        mov     esi, DWORD PTR [rsp+84]
        cmp     esi, 1
        mov     eax, esi
        jle     .L34
        sub     eax, 3
        mov     DWORD PTR [rsp+60], 0
        mov     edx, eax
        mov     DWORD PTR [rsp+64], eax
        mov     eax, DWORD PTR [rsp+104]
        shr     eax
        mov     DWORD PTR [rsp+152], eax
        add     eax, eax
        sub     edx, eax
        mov     eax, esi
        sub     eax, 1
        mov     DWORD PTR [rsp+148], edx
        mov     DWORD PTR [rsp+88], eax
        # loop level 6 (closed by `jne .L18` after .L8)
.L18:
        mov     edx, DWORD PTR [rsp+88]
        cmp     edx, 1
        mov     eax, edx
        jle     .L35
        sub     eax, 3
        mov     DWORD PTR [rsp+32], 0
        mov     esi, eax
        mov     DWORD PTR [rsp+20], eax
        mov     eax, DWORD PTR [rsp+64]
        shr     eax
        mov     DWORD PTR [rsp+156], eax
        add     eax, eax
        sub     esi, eax
        mov     eax, edx
        sub     eax, 1
        mov     DWORD PTR [rsp+36], esi
        mov     DWORD PTR [rsp+24], eax
        # loop level 7 (closed by `jne .L16` after .L9)
.L16:
        mov     ecx, DWORD PTR [rsp+24]
        cmp     ecx, 1
        mov     eax, ecx
        jle     .L36
        mov     eax, DWORD PTR [rsp+20]
        lea     r13d, [rcx-3]
        lea     r12d, [rcx-1]
        mov     DWORD PTR [rsp+12], 0
        mov     esi, r13d
        shr     eax
        mov     DWORD PTR [rsp+28], eax
        add     eax, eax
        sub     esi, eax
        mov     DWORD PTR [rsp+16], esi
        # loop level 8, counters in r12d/r13d (closed by `jne .L14` after .L10)
.L14:
        cmp     r12d, 1
        mov     eax, r12d
        jle     .L37
        mov     r14d, r13d
        lea     ebp, [r12-3]
        lea     r15d, [r12-1]
        shr     r14d
        xor     ebx, ebx                        # innermost partial sum = 0
        lea     eax, [r14+r14]
        sub     ebp, eax                        # ebp = value of r15d at which .L12 stops
        # loop level 9 (innermost): the only real recursive call in the listing
.L12:
        mov     edi, r15d                       # argument for this call
        sub     r15d, 2                         # the next call's argument is 2 smaller
        call    f(int)
        add     ebx, eax                        # accumulate the returned values
        cmp     r15d, ebp
        jne     .L12
        neg     r14d
        lea     eax, [r13+0+r14*2]
        # .L10 ... .L5: tail of each loop level. The inner level's result is added
        # to this level's partial sum, the counters step down by 2, and the loop
        # repeats until the counter reaches its precomputed stop value.
.L10:
        add     ebx, eax
        sub     r12d, 2
        add     DWORD PTR [rsp+12], ebx
        sub     r13d, 2
        cmp     r12d, DWORD PTR [rsp+16]
        jne     .L14
        mov     eax, DWORD PTR [rsp+28]
        mov     edx, DWORD PTR [rsp+20]
        neg     eax
        lea     eax, [rdx+rax*2]
.L9:
        add     eax, DWORD PTR [rsp+12]
        sub     DWORD PTR [rsp+24], 2
        add     DWORD PTR [rsp+32], eax
        mov     eax, DWORD PTR [rsp+36]
        sub     DWORD PTR [rsp+20], 2
        cmp     DWORD PTR [rsp+24], eax
        jne     .L16
        mov     eax, DWORD PTR [rsp+156]
        mov     edx, DWORD PTR [rsp+64]
        neg     eax
        lea     eax, [rdx+rax*2]
.L8:
        add     eax, DWORD PTR [rsp+32]
        sub     DWORD PTR [rsp+88], 2
        add     DWORD PTR [rsp+60], eax
        mov     eax, DWORD PTR [rsp+148]
        sub     DWORD PTR [rsp+64], 2
        cmp     DWORD PTR [rsp+88], eax
        jne     .L18
        mov     eax, DWORD PTR [rsp+152]
        mov     edx, DWORD PTR [rsp+104]
        neg     eax
        lea     eax, [rdx+rax*2]
.L7:
        add     eax, DWORD PTR [rsp+60]
        sub     DWORD PTR [rsp+84], 2
        add     DWORD PTR [rsp+48], eax
        mov     eax, DWORD PTR [rsp+140]
        sub     DWORD PTR [rsp+104], 2
        cmp     DWORD PTR [rsp+84], eax
        jne     .L20
        mov     eax, DWORD PTR [rsp+144]
        mov     esi, DWORD PTR [rsp+100]
        neg     eax
        lea     eax, [rsi+rax*2]
.L6:
        add     eax, DWORD PTR [rsp+48]
        sub     DWORD PTR [rsp+80], 2
        add     DWORD PTR [rsp+56], eax
        mov     eax, DWORD PTR [rsp+132]
        sub     DWORD PTR [rsp+100], 2
        cmp     DWORD PTR [rsp+80], eax
        jne     .L22
        mov     eax, DWORD PTR [rsp+136]
        mov     edx, DWORD PTR [rsp+96]
        neg     eax
        lea     eax, [rdx+rax*2]
.L5:
        add     eax, DWORD PTR [rsp+56]
        sub     DWORD PTR [rsp+76], 2
        add     DWORD PTR [rsp+52], eax
        mov     eax, DWORD PTR [rsp+124]
        sub     DWORD PTR [rsp+96], 2
        cmp     DWORD PTR [rsp+76], eax
        jne     .L24
        mov     eax, DWORD PTR [rsp+128]
        mov     esi, DWORD PTR [rsp+92]
        neg     eax
        lea     eax, [rsi+rax*2]
        jmp     .L4
        # .L37 ... .L32: base-case stubs. Each is entered when the loop variable of
        # one nesting level is <= 1. That level's partial sum is zeroed and control
        # rejoins the matching tail label with eax still holding the loop variable.
.L37:
        xor     ebx, ebx
        jmp     .L10
.L36:
        mov     DWORD PTR [rsp+12], 0
        jmp     .L9
.L35:
        mov     DWORD PTR [rsp+32], 0
        jmp     .L8
.L34:
        mov     DWORD PTR [rsp+60], 0
        jmp     .L7
.L33:
        mov     DWORD PTR [rsp+48], 0
        jmp     .L6
.L32:
        mov     DWORD PTR [rsp+56], 0
        jmp     .L5
.L30:
        sub     eax, 2
        mov     DWORD PTR [rsp+40], 0
        mov     DWORD PTR [rsp+112], eax
        # tail of the outermost loop
.L3:
        add     edx, DWORD PTR [rsp+40]
        sub     DWORD PTR [rsp+68], 2
        add     DWORD PTR [rsp+44], edx
        cmp     eax, 1
        jg      .L28
        and     DWORD PTR [rsp+108], 1          # saved n is reduced to its low bit
        jmp     .L2
        # n <= 1 on entry: partial sum = 0, so .L2 returns n unchanged
.L29:
        mov     DWORD PTR [rsp+44], 0
        # epilogue: result = [rsp+108] + [rsp+44], then release the frame and
        # restore the saved registers in reverse push order
.L2:
        mov     eax, DWORD PTR [rsp+108]
        add     eax, DWORD PTR [rsp+44]
        add     rsp, 168
        pop     rbx
        pop     rbp
        pop     r12
        pop     r13
        pop     r14
        pop     r15
        ret
        # base-case stub and tail for loop level 2 (placed after the epilogue)
.L31:
        mov     DWORD PTR [rsp+52], 0
.L4:
        add     eax, DWORD PTR [rsp+52]
        sub     DWORD PTR [rsp+72], 2
        add     DWORD PTR [rsp+40], eax
        mov     eax, DWORD PTR [rsp+116]
        sub     DWORD PTR [rsp+92], 2
        cmp     DWORD PTR [rsp+72], eax
        jne     .L26
        mov     eax, DWORD PTR [rsp+120]
        mov     edx, DWORD PTR [rsp+68]
        neg     eax
        lea     edx, [rdx+rax*2]
        mov     eax, DWORD PTR [rsp+112]
        jmp     .L3    

Can anyone confirm this with their own g++ version?
Post 28 Nov 2013, 12:12
View user's profile Send private message Reply with quote
dogman



Joined: 18 Jul 2013
Posts: 114
dogman 28 Nov 2013, 15:28
Yep! 300 lines.
Post 28 Nov 2013, 15:28
View user's profile Send private message Reply with quote
AsmGuru62



Joined: 28 Jan 2004
Posts: 1561
Location: Toronto, Canada
AsmGuru62 28 Nov 2013, 16:25
Maybe GCC inlined every call to f()?
Post 28 Nov 2013, 16:25
View user's profile Send private message Send e-mail Reply with quote
bitRAKE



Joined: 21 Jul 2003
Posts: 3892
Location: vpcmipstrm
bitRAKE 28 Nov 2013, 19:16
-Os -m32

ICC is the only one to switch to register passing.
Post 28 Nov 2013, 19:16
View user's profile Send private message Visit poster's website Reply with quote
dogman



Joined: 18 Jul 2013
Posts: 114
dogman 29 Nov 2013, 07:17
Intel makes good compilers. I'm real happy with their Fortran.

bitrake, nice site you found! I'll compile it under a few other compilers and post the results later. I have some that are not on that site.
Post 29 Nov 2013, 07:17
View user's profile Send private message Reply with quote
TmX



Joined: 02 Mar 2006
Posts: 841
Location: Jakarta, Indonesia
TmX 29 Nov 2013, 10:15
dogman wrote:
Intel makes good compilers. I'm real happy with their Fortran.


AFAIK Intel C++ is also praised.
Fortunately there are non commercial versions of their compiler.
Unfortunately, no Windows version. Wink

Quite pricey, though.
Post 29 Nov 2013, 10:15
View user's profile Send private message Reply with quote
ASM-Man



Joined: 11 Jan 2013
Posts: 64
ASM-Man 30 Nov 2013, 15:17
dogman wrote:
Intel makes good compilers. I'm real happy with their Fortran.

bitrake, nice site you found! I'll compile it under a few other compilers and post the results later. I have some that are not on that site.


And I with their C/C++. Very nice, really.

_________________
I'm not a native speaker of the english language. So, if you find any mistake what I have written, you are free to fix for me or tell me on. Smile
Post 30 Nov 2013, 15:17
View user's profile Send private message Reply with quote
dogman



Joined: 18 Jul 2013
Posts: 114
dogman 30 Nov 2013, 18:12
Here's the output from 3 other compilers.

CC (Solaris Studio 12.3)

Code:
        .file "test.c"
        .code32

        .set .simple_nop, 0x90
        .globl __1cBf6Fi_i_
        .type __1cBf6Fi_i_, @function
        .local Dlrodata.lrodata
        .local Dldata.ldata

        .ident     "iropt: Sun Compiler Common 12.3 Linux_i386 2011/11/16"
        .ident     "ir2hf: Sun Compiler Common 12.3 Linux_i386 2011/11/16"
        .ident     "ube: Sun Compiler Common 12.3 Linux_i386 2011/11/16"

        .section .text,"ax"
{                   1  }        .align 16,.simple_nop


/================================================================================
/ FUNCTION __1cBf6Fi_i_

/               %rbp - used as frame pointer.

/               arg0 "n":                4(%esp),

/ NOTE(review): reading the code below, the f(n-2) operand has been expanded
/ in place repeatedly. %ebx holds n and %esi the running sum. The tests on
/ n-2, n-4, n-6 and n-8 pick the entry point into the chain of calls; when all
/ of them are >= 2 the body computes
/     f(n-1) + f(n-3) + f(n-5) + f(n-7) + f(n-9) + f(n-10)
/ with six real calls. When a test fails, %esi already holds that small
/ value (which f returns unchanged) and the deeper calls are skipped.

__1cBf6Fi_i_:

/ BLOCK: 3, pred: 1, succ: 7 8, count: 17.3027

/ FILE test.c

/    1                !int f(int n) {

{ int[ 3]           1  }        push    %ebp            
{ int[ 1]           1  }        movl    %esp,%ebp               
{ int[ 3]           1  }        push    %ebx            
{ int[ 3]           1  }        push    %esi            

/    2                !  if (n<2)

{ int[ 4]           2  }        movl    8(%ebp),%ebx            / sym=n
{ int[ 1]           2  }        cmpl    $2,%ebx         
{                   2  }        jl      .CG2.14         
{                   5  } .CG3.15:

/ BLOCK: 8, pred: 3, succ: 10 11, count: 12.4580


/    3                !    return n;
/    4                !  else
/    5                !    return f(n-1)+f(n-2);

{ int[ 1]           5  }        leal    -2(%ebx),%esi           
{ int[ 1]           2  }        cmpl    $2,%esi         
{                   2  }        jl      .CG4.16         
{                   5  } .CG5.17:

/ BLOCK: 11, pred: 8, succ: 13 14, count: 11.2266

{ int[ 1]              }        leal    -4(%ebx),%esi           
{ int[ 1]           2  }        cmpl    $2,%esi         
{                   2  }        jl      .CG6.18         
{                   5  } .CG7.19:

/ BLOCK: 14, pred: 11, succ: 16 17, count: 10.1169

{ int[ 1]              }        leal    -6(%ebx),%esi           
{ int[ 1]           2  }        cmpl    $2,%esi         
{                   2  }        jl      .CG8.20         
{                   5  } .CG9.21:

/ BLOCK: 17, pred: 14, succ: 19 20, count: 9.11688

{ int[ 1]              }        leal    -8(%ebx),%esi           
{ int[ 1]           2  }        cmpl    $2,%esi         
{                   2  }        jl      .CGA.22         
{                   5  } .CGB.23:

/ BLOCK: 20, pred: 17, succ: 9, count: 4.55844

/ deepest level: %esi = f(n-9) + f(n-10)

{ int[ 1]           5  }        subl    $12,%esp                
{ int[ 1]              }        leal    -9(%ebx),%eax           
{ int[ 3]           5  }        push    %eax            
{                   5  }        call    __1cBf6Fi_i_            

/ BLOCK: 9, pred: 20, succ: 6, count: 4.55844

{ int[ 1]           5  }        addl    $4,%esp         
{ int[ 1]           5  }        movl    %eax,%esi               
{ int[ 1]              }        leal    -10(%ebx),%eax          
{ int[ 3]           5  }        push    %eax            
{                   5  }        call    __1cBf6Fi_i_            
{ int[ 1]           5  }        addl    $16,%esp                

/ BLOCK: 6, pred: 9, succ: 19, count: 4.55844

{ int[ 1]           5  }        addl    %eax,%esi               
{                   3  } .CGA.22:
{                   5  } .CGC.24:

/ BLOCK: 21, pred: 19, succ: 5, count: 9.11688

/ %esi += f(n-7)

{ int[ 1]           5  }        subl    $12,%esp                
{ int[ 1]              }        leal    -7(%ebx),%eax           
{ int[ 3]           5  }        push    %eax            
{                   5  }        call    __1cBf6Fi_i_            
{ int[ 1]           5  }        addl    $16,%esp                

/ BLOCK: 5, pred: 21, succ: 16, count: 9.11688

{ int[ 1]           5  }        addl    %eax,%esi               
{                   3  } .CG8.20:
{                   5  } .CGD.25:

/ BLOCK: 22, pred: 16, succ: 18, count: 10.1169

/ %esi += f(n-5)

{ int[ 1]           5  }        subl    $12,%esp                
{ int[ 1]              }        leal    -5(%ebx),%eax           
{ int[ 3]           5  }        push    %eax            
{                   5  }        call    __1cBf6Fi_i_            
{ int[ 1]           5  }        addl    $16,%esp                

/ BLOCK: 18, pred: 22, succ: 13, count: 10.1169

{ int[ 1]           5  }        addl    %eax,%esi               
{                   3  } .CG6.18:
{                   5  } .CGE.26:

/ BLOCK: 23, pred: 13, succ: 4, count: 11.2266

/ %esi += f(n-3)

{ int[ 1]           5  }        subl    $12,%esp                
{ int[ 1]              }        leal    -3(%ebx),%eax           
{ int[ 3]           5  }        push    %eax            
{                   5  }        call    __1cBf6Fi_i_            
{ int[ 1]           5  }        addl    $16,%esp                

/ BLOCK: 4, pred: 23, succ: 10, count: 11.2266

{ int[ 1]           5  }        addl    %eax,%esi               
{                   3  } .CG4.16:
{                   5  } .CGF.27:

/ BLOCK: 24, pred: 10, succ: 26, count: 12.4580

/ final call: %ebx = f(n-1) + %esi

{ int[ 1]           5  }        subl    $12,%esp                
{ int[ 1]           5  }        addl    $-1,%ebx                
{ int[ 3]           5  }        push    %ebx            
{                   5  }        call    __1cBf6Fi_i_            
{ int[ 1]           5  }        addl    $16,%esp                

/ BLOCK: 26, pred: 24, succ: 7, count: 12.4580

{ int[ 1]           5  }        leal    (%eax,%esi),%ebx                
{                   3  } .CG2.14:
{                   5  } .CG10.28:

/ BLOCK: 25, pred: 7, succ: 2, count: 4.84477

/ common exit: the result is in %ebx (n itself on the n<2 path)

{ int[ 1]           5  }        movl    %ebx,%eax               
{ int[ 3]           5  }        pop     %esi            
{ int[ 3]           5  }        pop     %ebx            
{                   5  }        leave                   
{                   5  }        ret                     
        .size __1cBf6Fi_i_, . - __1cBf6Fi_i_
.CG0:



        .section .data,"aw"
Ddata.data: / Offset 0



        .section .bss,"aw"

Bbss.bss:


        .section .bssf,"aw"


        .section .rodata,"a"
Drodata.rodata: / Offset 0



        .section .picdata,"aw"
Dpicdata.picdata: / Offset 0



        .section .lbss,"awh"
        .type Blbss.lbss, @object

Blbss.lbss:


        .section .ldata,"awh"
Dldata.ldata: / Offset 0
        .type Dldata.ldata, @object



        .section .lrodata,"ah"
Dlrodata.lrodata: / Offset 0
        .type Dlrodata.lrodata, @object
    


OpenUH (University of Houston version of Open64 compiler)

Code:
        #  /opt/openuh-3.0.29/lib/gcc-lib/x86_64-open64-linux/5.0/be::5.0

        #-----------------------------------------------------------
        # Compiling test.c (/tmp/ccI#.s4ieOz)
        #-----------------------------------------------------------

        #-----------------------------------------------------------
        # Options:
        #-----------------------------------------------------------
        #  Target:Wolfdale, ISA:ISA_1, Endian:little, Pointer Size:32
        #  -O3  (Optimization level)
        #  -g0  (Debug level)
        #  -m2  (Report advisories)
        #-----------------------------------------------------------

        .file   1       "/tmp/test.c"


        .text
        .align  2

        .section .except_table_supp, "a",@progbits

        .section .except_table, "a",@progbits
        .section .text
        .p2align 5,,

        # Program Unit: _Z1fi
        # Direct translation of the C source: no inlining or unrolling, and
        # two real recursive calls per invocation.
        # Frame layout after the 20-byte adjustment:
        #   24(%esp) = incoming n
        #    8(%esp) = spill slot for the first call's result
        #    0(%esp) = outgoing argument for the recursive calls
.globl  _Z1fi
        .type   _Z1fi, @function
_Z1fi:  # 0x0
        # .frame        %esp, 20, %esp
        # _temp_gra_spill1 = 8
        .loc    1       1       0
 #   1  int f(int n) {
.LBB1__Z1fi:
.LEH_adjustsp__Z1fi:
        addl $-20,%esp                  # [0] 
.L_0_1282:
        .loc    1       2       0
 #   2    if (n<2)
        movl 24(%esp),%edx              # [0] n
        cmpl $1,%edx                    # [3] 
        jg .Lt_0_770                    # [4] 
.LBB3__Z1fi:
        .loc    1       3       0
 #   3      return n;
        # base case (n <= 1): return n unchanged
        movl %edx,%eax                  # [0] 
        addl $20,%esp                   # [0] 
        ret                             # [0] 
        .p2align 4,,15
.Lt_0_770:
        .loc    1       5       0
 #   4    else
 #   5      return f(n-1)+f(n-2);
        # first call: f(n-1)
        movl 24(%esp),%eax              # [0] n
        addl $-1,%eax                   # [3] 
        movl %eax,0(%esp)               # [4] id:8
        call _Z1fi                      # [4] _Z1fi
.LBB5__Z1fi:
        # save f(n-1), then second call: f(n-2)
        movl %eax,8(%esp)               # [0] _temp_gra_spill1
        movl 24(%esp),%eax              # [0] n
        addl $-2,%eax                   # [3] 
        movl %eax,0(%esp)               # [4] id:9
        call _Z1fi                      # [4] _Z1fi
.LBB6__Z1fi:
        # eax = f(n-1) + f(n-2)
        movl %eax,%edx                  # [0] 
        movl 8(%esp),%eax               # [0] _temp_gra_spill1
        addl %edx,%eax                  # [3] 
        addl $20,%esp                   # [3] 
        ret                             # [3] 
.L_0_1538:
.LDWend__Z1fi:
        .size _Z1fi, .LDWend__Z1fi-_Z1fi

        .section .except_table
        .align  0
        .type   .range_table._Z1fi, @object
.range_table._Z1fi:     # 0x0
        # offset 0
        .byte   255
        # offset 1
        .byte   0
        .uleb128        .LSDATTYPEB1-.LSDATTYPED1
.LSDATTYPED1:
        # offset 6
        .byte   1
        .uleb128        .LSDACSE1-.LSDACSB1
.LSDACSB1:
        .uleb128        .L_0_1282-_Z1fi
        .uleb128        .L_0_1538-.L_0_1282
        # offset 17
        .uleb128        0
        # offset 21
        .uleb128        0
.LSDACSE1:
        # offset 25
        .sleb128        0
        # offset 29
        .sleb128        0
.LSDATTYPEB1:
        # end of initialization for .range_table._Z1fi
        .section .text
        .align  4
        .section .except_table_supp
        .align  4
        .section .except_table
        .align  4

        .section .eh_frame, "a",@progbits
.LEHCIE:
        .4byte  .LEHCIE_end - .LEHCIE_begin
.LEHCIE_begin:
        .4byte 0x0
        .byte   0x01, 0x7a, 0x50, 0x4c, 0x00, 0x01, 0x7c, 0x08
        .byte   0x06, 0x00
        .4byte  __gxx_personality_v0
        .byte   0x00, 0x0c, 0x04, 0x04, 0x88, 0x01
        .align 4
.LEHCIE_end:
        .4byte  .LFDE1_end - .LFDE1_begin
.LFDE1_begin:
        .4byte  .LFDE1_begin - .LEHCIE
        .4byte  .LBB1__Z1fi
        .4byte  .LDWend__Z1fi - .LBB1__Z1fi
        .byte   0x04
        .4byte  .range_table._Z1fi
        .byte   0x04
        .4byte  .LEH_adjustsp__Z1fi - .LBB1__Z1fi
        .byte   0x0e, 0x18
        .align 4
.LFDE1_end:

        .section .debug_line, ""
        .section        .note.GNU-stack,"",@progbits
        .ident  "#Open64 Compiler Version 5.0 : test.c compiled with : -g0 -O3 -march=wolfdale -msse2 -msse3 -mno-3dnow -mno-sse4a -mssse3 -mno-sse41 -mno-sse42 -mno-aes -mno-pclmul -mno-avx -mno-xop -mno-fma -mno-fma4 -m32"

    


Open64

Code:
        #  /opt/open64-5.0/lib/gcc-lib/x86_64-open64-linux/5.0/be::5.0

        #-----------------------------------------------------------
        # Compiling test.c (/tmp/ccI#.Bc1ETQ)
        #-----------------------------------------------------------

        #-----------------------------------------------------------
        # Options:
        #-----------------------------------------------------------
        #  Target:Wolfdale, ISA:ISA_1, Endian:little, Pointer Size:32
        #  -O3  (Optimization level)
        #  -g0  (Debug level)
        #  -m2  (Report advisories)
        #-----------------------------------------------------------

        .file   1       "test.c"


        .text
        .align  2

        .section .except_table_supp, "a",@progbits

        .section .except_table, "a",@progbits
        .section .text
        .p2align 5,,

        # Program Unit: _Z1fi
        # Direct translation of the C source: no inlining or unrolling, and
        # two real recursive calls per invocation.
        # Frame layout after the 20-byte adjustment:
        #   24(%esp) = incoming n
        #    8(%esp) = spill slot for the first call's result
        #    0(%esp) = outgoing argument for the recursive calls
.globl  _Z1fi
        .type   _Z1fi, @function
_Z1fi:  # 0x0
        # .frame        %esp, 20, %esp
        # _temp_gra_spill1 = 8
        .loc    1       1       0
 #   1  int f(int n) {
.LBB1__Z1fi:
.LEH_adjustsp__Z1fi:
        addl $-20,%esp                  # [0] 
.L_0_1282:
        .loc    1       2       0
 #   2    if (n<2)
        movl 24(%esp),%edx              # [0] n
        cmpl $1,%edx                    # [3] 
        jg .Lt_0_770                    # [4] 
.LBB3__Z1fi:
        .loc    1       3       0
 #   3      return n;
        # base case (n <= 1): return n unchanged
        movl %edx,%eax                  # [0] 
        addl $20,%esp                   # [0] 
        ret                             # [0] 
        .p2align 4,,15
.Lt_0_770:
        .loc    1       5       0
 #   4    else
 #   5      return f(n-1)+f(n-2);
        # first call: f(n-1)
        movl 24(%esp),%eax              # [0] n
        addl $-1,%eax                   # [3] 
        movl %eax,0(%esp)               # [4] id:8
        call _Z1fi                      # [4] _Z1fi
.LBB5__Z1fi:
        # save f(n-1), then second call: f(n-2)
        movl %eax,8(%esp)               # [0] _temp_gra_spill1
        movl 24(%esp),%eax              # [0] n
        addl $-2,%eax                   # [3] 
        movl %eax,0(%esp)               # [4] id:9
        call _Z1fi                      # [4] _Z1fi
.LBB6__Z1fi:
        # eax = f(n-1) + f(n-2)
        movl %eax,%edx                  # [0] 
        movl 8(%esp),%eax               # [0] _temp_gra_spill1
        addl %edx,%eax                  # [3] 
        addl $20,%esp                   # [3] 
        ret                             # [3] 
.L_0_1538:
.LDWend__Z1fi:
        .size _Z1fi, .LDWend__Z1fi-_Z1fi

        .section .except_table
        .align  0
        .type   .range_table._Z1fi, @object
.range_table._Z1fi:     # 0x0
        # offset 0
        .byte   255
        # offset 1
        .byte   0
        .uleb128        .LSDATTYPEB1-.LSDATTYPED1
.LSDATTYPED1:
        # offset 6
        .byte   1
        .uleb128        .LSDACSE1-.LSDACSB1
.LSDACSB1:
        .uleb128        .L_0_1282-_Z1fi
        .uleb128        .L_0_1538-.L_0_1282
        # offset 17
        .uleb128        0
        # offset 21
        .uleb128        0
.LSDACSE1:
        # offset 25
        .sleb128        0
        # offset 29
        .sleb128        0
.LSDATTYPEB1:
        # end of initialization for .range_table._Z1fi
        .section .text
        .align  4
        .section .except_table_supp
        .align  4
        .section .except_table
        .align  4

        .section .eh_frame, "a",@progbits
.LEHCIE:
        .4byte  .LEHCIE_end - .LEHCIE_begin
.LEHCIE_begin:
        .4byte 0x0
        .byte   0x01, 0x7a, 0x50, 0x4c, 0x00, 0x01, 0x7c, 0x08
        .byte   0x06, 0x00
        .4byte  __gxx_personality_v0
        .byte   0x00, 0x0c, 0x04, 0x04, 0x88, 0x01
        .align 4
.LEHCIE_end:
        .4byte  .LFDE1_end - .LFDE1_begin
.LFDE1_begin:
        .4byte  .LFDE1_begin - .LEHCIE
        .4byte  .LBB1__Z1fi
        .4byte  .LDWend__Z1fi - .LBB1__Z1fi
        .byte   0x04
        .4byte  .range_table._Z1fi
        .byte   0x04
        .4byte  .LEH_adjustsp__Z1fi - .LBB1__Z1fi
        .byte   0x0e, 0x18
        .align 4
.LFDE1_end:

        .section .debug_line, ""
        .section        .note.GNU-stack,"",@progbits
        .ident  "#Open64 Compiler Version 5.0 : test.c compiled with : -g0 -O3 -march=wolfdale -msse2 -msse3 -mno-3dnow -mno-sse4a -mssse3 -mno-sse41 -mno-sse42 -mno-aes -mno-pclmul -mno-avx -mno-xop -mno-fma4 -m32"

    


All three look better than gcc at -O3...

_________________
Sources? Ahahaha! We don't need no stinkin' sources!
Post 30 Nov 2013, 18:12
View user's profile Send private message Reply with quote
cod3b453



Joined: 25 Aug 2004
Posts: 618
cod3b453 30 Nov 2013, 18:24
Well the only way to settle it is to race them Laughing
Post 30 Nov 2013, 18:24
View user's profile Send private message Reply with quote
tthsqe



Joined: 20 May 2009
Posts: 767
tthsqe 06 Dec 2013, 18:52
cod3b453,
we have already had extensive speed tests (http://board.flatassembler.net/topic.php?t=10158&postdays=0&postorder=asc&start=20) for this academic example. I'm just always interested in what the latest compiler tech is doing. At least for Intel CPUs, I think the fastest human implementation was what I have below. On AMD, you might want to replace the xchg with a push and pop. Though, I wouldn't mind if you did run a speed test, to see if all of that code that gcc wrote actually does improve performance.

Code:
; f: doubly recursive Fibonacci (32-bit, flat assembler syntax).
;   In:    eax = n, compared as an unsigned value
;   Out:   eax = n when n <= 1, otherwise f(n-1) + f(n-2)
;   Saved: ebx is pushed/popped around the recursive calls
;   Note:  because the test is unsigned (jbe), a negative signed n is
;          treated as a very large value and recurses instead of returning n.
f:     ;argument passed in eax
    cmp eax,1
    jbe .1              ; n <= 1: the result is n itself, already in eax
    push ebx            ; preserve the caller's ebx
    lea ebx,[eax-2]     ; ebx = n-2, kept across the first call
    dec eax             ; eax = n-1
    call f              ; eax = f(n-1)
    xchg eax,ebx        ; eax = n-2, ebx = f(n-1)
    call f              ; eax = f(n-2)
    add eax,ebx         ; eax = f(n-2) + f(n-1)
    pop ebx
.1: ret                 ; the label needs a colon to be defined in flat assembler
Post 06 Dec 2013, 18:52
View user's profile Send private message Reply with quote
bitRAKE



Joined: 21 Jul 2003
Posts: 3892
Location: vpcmipstrm
bitRAKE 07 Dec 2013, 07:46
No compiler I know of will generate the simple loop using the XADD instruction.

http://www.asmcommunity.net/forums/topic/?id=14206

_________________
¯\(°_o)/¯ “languages are not safe - uses can be” Bjarne Stroustrup
Post 07 Dec 2013, 07:46
View user's profile Send private message Visit poster's website Reply with quote
tthsqe



Joined: 20 May 2009
Posts: 767
tthsqe 07 Dec 2013, 11:50
bitrake, could you give a precise rule-based deduction of the fact that your xadd loop correctly implements
Code:
unsigned int f(unsigned int n) {
  /* Naive doubly recursive Fibonacci: 0 and 1 map to themselves,
     every larger n is the sum of the two preceding values. */
  return (n < 2) ? n : f(n - 1) + f(n - 2);
}

I think compiler technology has not focused on such optimizations. If your bag of tricks is precise enough, you should be able to teach it to the computer.
Post 07 Dec 2013, 11:50
View user's profile Send private message Reply with quote
bitRAKE



Joined: 21 Jul 2003
Posts: 3892
Location: vpcmipstrm
bitRAKE 07 Dec 2013, 20:19
The first thing for the compiler to understand is the range of numbers involved, and the function utility (how it is used). Next is to understand the sequential nature of the recursion - caching values is beneficial. XADD is a special case of that value caching which the instruction selector should find.
Post 07 Dec 2013, 20:19
View user's profile Send private message Visit poster's website Reply with quote
Display posts from previous:
Post new topic Reply to topic

Jump to:  


< Last Thread | Next Thread >
Forum Rules:
You cannot post new topics in this forum
You cannot reply to topics in this forum
You cannot edit your posts in this forum
You cannot delete your posts in this forum
You cannot vote in polls in this forum
You cannot attach files in this forum
You can download files in this forum


Copyright © 1999-2023, Tomasz Grysztar. Also on GitHub, YouTube.

Website powered by rwasa.