EFLA (Extremely Fast Line Algorithm)

Index > Main > EFLA (Extremely Fast Line Algorithm)

Author

Thread

Madis731

Joined: 25 Sep 2003
Posts: 2138
Location: Estonia

Madis731 09 Sep 2010, 12:39

Update: proc working.
What I missed before:
1) div needed signing, hence the idiv
2) div needed cdq, I lazily used edx=0 before
3) I switched x/y by accident in the loops

Code:

proc CreateEFLine x1,y1,x2,y2,c ;101,10,11,52,0FFh
        locals
          yLonger       rd 1 ;bool
          shortLen      rd 1
          longLen       rd 1
          swap          rd 1
          decInc        rd 1
        endl

        mov     rsi,[c]
        mov     [yLonger],0     ;bool yLonger=false;
        mov     [shortLen],r9d  ;int shortLen=y2-y;
        sub     [shortLen],edx
        sbb     eax,eax
        mov     [longLen],r8d   ;int longLen=x2-x;
        sub     [longLen],ecx
        sbb     ebx,ebx

        xor     eax,[shortLen]  ;if (abs(shortLen)>abs(longLen)) {
        xor     ebx,[longLen]
        cmp     eax,ebx
        jle     .no_swap
        mov     eax,[shortLen]  ;    int swap=shortLen;
        mov     ebx,[longLen]   ;    shortLen=longLen;
        mov     [shortLen],ebx  ;    longLen=swap;
        mov     [longLen],eax

        or      [yLonger],-1    ;    yLonger=true;
      .no_swap:                 ;}
        mov     [decInc],0      ;int decInc;
        cmp     [longLen],0     ;if (longLen==0) decInc=0;
        jz      .no_div0
        mov     [y1],rdx
        mov     eax,[shortLen]  ;else decInc = (shortLen << 16) / longLen;
        shl     eax,16
        cdq
        idiv    [longLen]
        mov     [decInc],eax
        mov     rdx,[y1]
      .no_div0:
                                ;
        cmp     [yLonger],0     ;if (yLonger) {
        je      .yShorter
        cmp     [longLen],0     ;    if (longLen>0) {
        jle     .longLen_le1
        add     [longLen],edx   ;        longLen+=y;
        shl     ecx,16          ;        for (int j=0x8000+(x<<16);y<=longLen;++y) {
        add     ecx,0x8000
      .line1:
        cmp     edx,[longLen]
        jg      .exit1
        add     edx,1
        mov     eax,ecx
        shr     eax,16
        mov     ebx,edx
        call    putpixel        ;            myPixel(surface,j >> 16,y);
        add     ecx,[decInc]    ;            j+=decInc;
        jmp     .line1
      .exit1:                   ;        }
        ret                     ;        return;
      .longLen_le1:             ;    }
        add     [longLen],edx   ;    longLen+=y;
        shl     ecx,16          ;    for (int j=0x8000+(x<<16);y>=longLen;--y) {
        add     ecx,0x8000
      .line2:
        cmp     edx,[longLen]
        jl      .exit2
        sub     edx,1
        mov     eax,ecx
        shr     eax,16
        mov     ebx,edx
        call    putpixel        ;        myPixel(surface,j >> 16,y);
        sub     ecx,[decInc]    ;        j-=decInc;
        jmp     .line2
      .exit2:                   ;    }
        ret                     ;    return;
      .yShorter:                ;}
                                ;
        cmp     [longLen],0     ;if (longLen>0) {
        jle     .longLen_le2
        add     [longLen],ecx   ;    longLen+=x;
        shl     edx,16          ;    for (int j=0x8000+(y<<16);x<=longLen;++x) {
        add     edx,0x8000
      .line3:
        cmp     ecx,[longLen]
        jg      .exit3
        add     ecx,1
        mov     eax,ecx
        mov     ebx,edx
        shr     ebx,16
        call    putpixel        ;        myPixel(surface,x,j >> 16);
        add     edx,[decInc]    ;        j+=decInc;
        jmp     .line3
      .exit3:                   ;    }
        ret                     ;    return;
      .longLen_le2:             ;}
        add     [longLen],ecx   ;longLen+=x;
        shl     edx,16          ;for (int j=0x8000+(y<<16);x>=longLen;--x) {
        add     edx,0x8000
      .line4:
        cmp     ecx,[longLen]
        jl      .exit3
        sub     ecx,1
        mov     eax,ecx
        mov     ebx,edx
        shr     ebx,16
        call    putpixel        ;    myPixel(surface,x,j >> 16);
        sub     edx,[decInc]    ;    j-=decInc;
        jmp     .line4
      .exit4:                   ;}
        ret
endp

Update: putpixel routine posted on request

Code:

putpixel:
      push    rdi
      mov     edi,ebx
      imul    edi,[RES_X]
      add     edi,eax
      imul    edi,[BPP]
      add     edi,[LFB] ; LFB+(B*X+A)*BPP
      cmp     [BPP],3
      jne     @f
      mov     word[rdi],0;si
      ror     rsi,16
      mov     byte[rdi+2],0;sil
      rol     rsi,16
      jmp     .bppok
    @@:
      mov     [rdi],esi
    .bppok:
      pop     rdi
    ret

_________________
My updated idol Very Happy

http://www.agner.org/optimize/

Last edited by Madis731 on 12 Sep 2010, 07:00; edited 2 times in total

09 Sep 2010, 12:39

mindcooler

Joined: 01 Dec 2009
Posts: 423
Location: Västerås, Sweden

mindcooler 09 Sep 2010, 13:06

Line drawing is probably limited by cache performance.

09 Sep 2010, 13:06

Madis731

Joined: 25 Sep 2003
Posts: 2138
Location: Estonia

Madis731 11 Sep 2010, 19:55

Debugged and measured with http://board.flatassembler.net/topic.php?t=11609:

Code:

Lines drawn in 8 different directions:
 101,101,151,111,0FFh    
 101,101,111,151,0FF00h  
 101,101,091,151,0FF0000h
 101,101,051,111,0FFh    
 101,101,051,091,0FF00h  
 101,101,091,051,0FF0000h
 101,101,111,051,0FFh    
 101,101,151,091,0FF00h
  
 24bpp    EFLINE    | Bresenham
 Bochs - 10169      | 9584
 QEMU  - ~236000    | ~192000
 VBox  - 8000       | 7450  (Intel VT-x enabled)

 32bpp    EFLINE    | Bresenham
 Bochs - 8537       | 8768
 QEMU  - NA         | NA
 VBox  - 5000       | 6400  (Intel VT-x enabled)

One might say that it is not fair to compare them in VM-s, but its at least an indicator. If you wish I will test it on a real machine, but there you have it.
There really are opportunities for this implementation and the best part is that 32 bits per pixel is what most cards use.
Both implementations use the same putpixel call. It is possible to inline all the putpixel calls, but it doesn't gain as much performance as it loses bytes Smile

11 Sep 2010, 19:55

nop

Joined: 01 Sep 2008
Posts: 165
Location: right here left there

nop 11 Sep 2010, 23:41

Madis731 wrote:

Both implementations use the same putpixel call. It is possible to inline all the putpixel calls, but it doesn't gain as much performance as it loses bytes

why 4 identical exits? Shocked

you could save 3 bytes there Rolling Eyes

are you going to post putpixel? or at least the required entry and exit conditions?

11 Sep 2010, 23:41

bitRAKE

Joined: 21 Jul 2003
Posts: 4550
Location: vpcmpistri

bitRAKE 12 Sep 2010, 04:48

Madis731, good conversion of the C code. It needs to be rewritten with your assembly hat on - only one jump is needed prior to one of the eight loops. I'll give it a go and maybe post before I go to bed...

12 Sep 2010, 04:48

edfed

Joined: 20 Feb 2006
Posts: 4366
Location: Now

edfed 12 Sep 2010, 07:33

for 24 bpp and 32 bpp, pixels are the same, 24 bpp.

but, in 32 bpp, they are aligned on 32 bits boundary.

then, there is no need to putpixel with a test for 24 bpp.

only the 24 bpp put pixel is ok for both cases.
Very Happy

Code:

putpixel: 
      push    rdi 
      mov     edi,ebx 
      imul    edi,[RES_X] 
      add     edi,eax 
      imul    edi,[BPP] 
      add     edi,[LFB] ; LFB+(B*X+A)*BPP 
      mov     word[rdi],si 
      ror     rsi,16 
      mov     byte[rdi+2],sil 
      rol     rsi,16 
.bppok: 
      pop     rdi 
      ret

_________________
Smile

fool gitlab fasmstuff
gitlab foolstuff

12 Sep 2010, 07:33

Madis731

Joined: 25 Sep 2003
Posts: 2138
Location: Estonia

Madis731 12 Sep 2010, 10:25

32bpp putpixel is smaller and faster than 24bpp one, but two routines would waste even more. That is why I've chosen this one. Actually the best would be to compile two versions of the image, which of the first one (24bpp) would be for compatibility with extremely old videocards and some emulators (QEMU). I could just assume 32bpp everywere and coding would be much easier. In 32bpp world there is no need to imul edi,[BPP] and I could use a shortcut like lea edi,[4*rdi] or add edi,edi / add edi,edi or shl edi,2.

Code:

putpixel24:
      push    rax rdi
      mov     edi,ebx
      imul    edi,[RES_X]
      add     edi,eax
      lea     edi,[rdi*3]
      add     edi,[LFB] ; LFB+(B*X+A)*3
      mov     eax,[rdi]
      and     eax,0FF000000h
      or      eax,esi
      mov     [rdi],eax
      pop     rdi rax
    ret

putpixel32:
      push    rdi
      mov     edi,ebx
      imul    edi,[RES_X]
      add     edi,eax
      lea     edi,[rdi*4]
      add     edi,[LFB] ; LFB+(B*X+A)*4
      mov     [rdi],esi
      pop     rdi
    ret

12 Sep 2010, 10:25

bitRAKE

Joined: 21 Jul 2003
Posts: 4550
Location: vpcmpistri

bitRAKE 12 Sep 2010, 11:42

The current version of your line routine often gets stuck in an endless loop because ECX/EDX can also equal [.longlen]. Hooking it up to a PRNG is a more thorough way to test, imho. Please, test the program below. Presently it's at over 350K random lines per second.

Description:	Plotter thread test harness.	Download
Filename:	plot.thread0.asm
Filesize:	16.06 KB
Downloaded:	898 Time(s)

_________________
¯\(°_o)/¯ AI may [not] have aided with the above reply.

12 Sep 2010, 11:42

Madis731

Joined: 25 Sep 2003
Posts: 2138
Location: Estonia

Madis731 12 Sep 2010, 17:27

267k Bresenham's and 318k EFLines per second.
It was not an infinite loop - it crashed - and now I fixed the implementation to respect the C-style ++x/--y. Example:

Code:

      .line1:
        add     edx,1 ;++y means increment before condition test
        cmp     edx,[longLen]
        jg      .exit ; It can still be jg instead of jge

It works, but I think decInc shouldn't be left for the running program to decide. It is not guaranteed to be 0 when it is not initialized to 0.
I think this is dangerous:
mov eax,[.shortLen-2];else decInc = (shortLen << 16) / longLen;
.shortLen-2 dword holds the 2 high bytes of the yLonger boolean value. If it happens to be -1 then the high 16 bits of eax are trashed.

I think we can (to be safe) sacrifice the additional bytes and cycles needed for the decInc=0 and the shortLen shl 16 sequences. The performance won't matter much.

12 Sep 2010, 17:27

edfed

Joined: 20 Feb 2006
Posts: 4366
Location: Now

edfed 13 Sep 2010, 09:32

Madis731

your EFLA algo looks like a fixed point arythmetrick!
Smile

means that maybe a great optimisation can be made that will allow very few instructions, only on registers, in the loop to compute each dot.

Code:

.line1:
        cmp ebx,edx            ;edx = longlen 
        jg .exit1 
        add ebx,1 
        mov eax,ecx 
        shr eax,16 
        call putpixel        ;            myPixel(surface,j >> 16,y); 
        add ecx,edi           ;edi=decinc          j+=decInc; 
        jmp .line1 
.exit1:

and if putpixel is optimised inside the loop, the gain will be very high, but not compatible with putpixel subroutinage.... Sad

13 Sep 2010, 09:32

edfed

Joined: 20 Feb 2006
Posts: 4366
Location: Now

edfed 13 Sep 2010, 09:52

if using ecx as a loop register:
3 instructions disapears

cmp ebx,edx
jg .exit
add ebx,1

but the instruction mov ebx,ecx reapear

Code:

;pixel assume eax=X, ebx=Y

mov edx,shortlen
mov ecx,longlen
sub ecx,edx


line1:
lea ebx,[ecx+edx]; ecx = longlen - shortlen; edx =
mov eax,esi ;esi=j
shr eax,16
call putpixel
add esi,edi ; edi=incdec
loop .line1

13 Sep 2010, 09:52

bitshifter

Joined: 04 Dec 2007
Posts: 812
Location: Massachusetts, USA

bitshifter 14 Sep 2010, 00:56

Next step is to special case each of the 8 octants.
The code will grow in size but contain less screwing around
with abs() and swapping registers and crap.
You may also want to break out special case horz/vert lines also.
Just remember: generic=slow, special cased=fast.
Dont build the code around the SetPixel routine,
but instead build the SetPixel routine around (into) the code.

14 Sep 2010, 00:56

bitRAKE

Joined: 21 Jul 2003
Posts: 4550
Location: vpcmpistri

bitRAKE 14 Sep 2010, 12:47

This just seems easier to me:
(...and doesn't crash Wink

)

Code:

DrawLine:
        .x0     equ esi
        .y0     equ edi
        .x1     equ ecx
        .y1     equ edx
        .color  equ ebp

        .x_     equ edx
        .y_     equ .x_
        .slope  equ eax

        sub .x1,.x0
        sbb ebx,ebx     ; 0=right       1=left

        sub .y1,.y0
        sbb eax,eax     ; 0=up          1=down

        xor .x1,ebx
        xor .y1,eax
        sub .x1,ebx
        sub .y1,eax
        rcl ebx,1       ; merge flags

        cmp .x1,.y1
        jz .45 ; (rare) special handling
        mov eax,.x1
        cmovc .x1,.y1
        cmovc .y1,eax

        rcl ebx,1 + 4   ; 0=x         1=y
        xor eax,eax
        div .y1 ; loop counter is longest delta

        and ebx,111b shl 4
        add ebx,.tab
        jmp rbx

.45:    jrcxz .dot ; (rare) single pixel
        mov edx,ebx
        or eax,1
        or edx,1
@@:     call .plot
        add .x0,edx
        add .y0,eax
        loop @B
        retn

.dot:   call .plot
        retn

        align 16
.tab:   OCTANT 0,1,2,3,4,5,6,7

.plot:  imul ebx,.y0,CLIENT.X
        add ebx,.x0
        mov dword [gfx + rbx*4],.color
        retn

macro OCTANT [num] {
  align 16
  @@:   call .plot
  if num and 1
    if num and 2
        dec .y0
    else
        inc .y0
    end if
    if num and 4
        sub .x_,.slope
        sbb .x0,0
    else
        add .x_,.slope
        adc .x0,0
    end if
  else
    if num and 4
        dec .x0
    else
        inc .x0
    end if
    if num and 2
        sub .y_,.slope
        sbb .y0,0
    else
        add .y_,.slope
        adc .y0,0
    end if
  end if
        loop @B
        retn
}

...didn't even need the 'other' registers.
So, 32-bit implementation is straight forward.

In-lining the .plot code provided a %25 increase!

14 Sep 2010, 12:47

< Last Thread | Next Thread >

Forum Rules:

You cannot post new topics in this forum
You cannot reply to topics in this forum
You cannot edit your posts in this forum
You cannot delete your posts in this forum
You cannot vote in polls in this forum
You cannot attach files in this forum
You can download files in this forum