flat assembler
Message board for the users of flat assembler.

 Index > Main > EFLA (Extremely Fast Line Algorithm)
Author

Joined: 25 Sep 2003
Posts: 2139
Location: Estonia
Update: proc working.
What I missed before:
1) div needed signing, hence the idiv
2) div needed cdq, I lazily used edx=0 before
3) I switched x/y by accident in the loops
Code:
```proc CreateEFLine x1,y1,x2,y2,c ;101,10,11,52,0FFh
locals
yLonger       rd 1 ;bool
shortLen      rd 1
longLen       rd 1
swap          rd 1
decInc        rd 1
endl

mov     rsi,[c]
mov     [yLonger],0     ;bool yLonger=false;
mov     [shortLen],r9d  ;int shortLen=y2-y;
sub     [shortLen],edx
sbb     eax,eax
mov     [longLen],r8d   ;int longLen=x2-x;
sub     [longLen],ecx
sbb     ebx,ebx

xor     eax,[shortLen]  ;if (abs(shortLen)>abs(longLen)) {
xor     ebx,[longLen]
cmp     eax,ebx
jle     .no_swap
mov     eax,[shortLen]  ;    int swap=shortLen;
mov     ebx,[longLen]   ;    shortLen=longLen;
mov     [shortLen],ebx  ;    longLen=swap;
mov     [longLen],eax

or      [yLonger],-1    ;    yLonger=true;
.no_swap:                 ;}
mov     [decInc],0      ;int decInc;
cmp     [longLen],0     ;if (longLen==0) decInc=0;
jz      .no_div0
mov     [y1],rdx
mov     eax,[shortLen]  ;else decInc = (shortLen << 16) / longLen;
shl     eax,16
cdq
idiv    [longLen]
mov     [decInc],eax
mov     rdx,[y1]
.no_div0:
;
cmp     [yLonger],0     ;if (yLonger) {
je      .yShorter
cmp     [longLen],0     ;    if (longLen>0) {
jle     .longLen_le1
shl     ecx,16          ;        for (int j=0x8000+(x<<16);y<=longLen;++y) {
.line1:
cmp     edx,[longLen]
jg      .exit1
mov     eax,ecx
shr     eax,16
mov     ebx,edx
call    putpixel        ;            myPixel(surface,j >> 16,y);
jmp     .line1
.exit1:                   ;        }
ret                     ;        return;
.longLen_le1:             ;    }
shl     ecx,16          ;    for (int j=0x8000+(x<<16);y>=longLen;--y) {
.line2:
cmp     edx,[longLen]
jl      .exit2
sub     edx,1
mov     eax,ecx
shr     eax,16
mov     ebx,edx
call    putpixel        ;        myPixel(surface,j >> 16,y);
sub     ecx,[decInc]    ;        j-=decInc;
jmp     .line2
.exit2:                   ;    }
ret                     ;    return;
.yShorter:                ;}
;
cmp     [longLen],0     ;if (longLen>0) {
jle     .longLen_le2
shl     edx,16          ;    for (int j=0x8000+(y<<16);x<=longLen;++x) {
.line3:
cmp     ecx,[longLen]
jg      .exit3
mov     eax,ecx
mov     ebx,edx
shr     ebx,16
call    putpixel        ;        myPixel(surface,x,j >> 16);
jmp     .line3
.exit3:                   ;    }
ret                     ;    return;
.longLen_le2:             ;}
shl     edx,16          ;for (int j=0x8000+(y<<16);x>=longLen;--x) {
.line4:
cmp     ecx,[longLen]
jl      .exit3
sub     ecx,1
mov     eax,ecx
mov     ebx,edx
shr     ebx,16
call    putpixel        ;    myPixel(surface,x,j >> 16);
sub     edx,[decInc]    ;    j-=decInc;
jmp     .line4
.exit4:                   ;}
ret
endp
```

Update: putpixel routine posted on request
Code:
```putpixel:
push    rdi
mov     edi,ebx
imul    edi,[RES_X]
imul    edi,[BPP]
cmp     [BPP],3
jne     @f
mov     word[rdi],0;si
ror     rsi,16
mov     byte[rdi+2],0;sil
rol     rsi,16
jmp     .bppok
@@:
mov     [rdi],esi
.bppok:
pop     rdi
ret
```

_________________
My updated idol http://www.agner.org/optimize/

Last edited by Madis731 on 12 Sep 2010, 07:00; edited 2 times in total
09 Sep 2010, 12:39
mindcooler

Joined: 01 Dec 2009
Posts: 423
Location: Västerås, Sweden
mindcooler
Line drawing is probably limited by cache performance.
09 Sep 2010, 13:06

Joined: 25 Sep 2003
Posts: 2139
Location: Estonia
Debugged and measured with http://board.flatassembler.net/topic.php?t=11609:
Code:
```Lines drawn in 8 different directions:
101,101,151,111,0FFh
101,101,111,151,0FF00h
101,101,091,151,0FF0000h
101,101,051,111,0FFh
101,101,051,091,0FF00h
101,101,091,051,0FF0000h
101,101,111,051,0FFh
101,101,151,091,0FF00h

24bpp    EFLINE    | Bresenham
Bochs - 10169      | 9584
QEMU  - ~236000    | ~192000
VBox  - 8000       | 7450  (Intel VT-x enabled)

32bpp    EFLINE    | Bresenham
Bochs - 8537       | 8768
QEMU  - NA         | NA
VBox  - 5000       | 6400  (Intel VT-x enabled)
```

One might say that it is not fair to compare them in VM-s, but its at least an indicator. If you wish I will test it on a real machine, but there you have it.
There really are opportunities for this implementation and the best part is that 32 bits per pixel is what most cards use.
Both implementations use the same putpixel call. It is possible to inline all the putpixel calls, but it doesn't gain as much performance as it loses bytes
11 Sep 2010, 19:55
nop

Joined: 01 Sep 2008
Posts: 165
Location: right here left there
nop
Both implementations use the same putpixel call. It is possible to inline all the putpixel calls, but it doesn't gain as much performance as it loses bytes
why 4 identical exits? you could save 3 bytes there
are you going to post putpixel? or at least the required entry and exit conditions?
11 Sep 2010, 23:41
bitRAKE

Joined: 21 Jul 2003
Posts: 3285
Location: vpcmipstrm
bitRAKE
Madis731, good conversion of the C code. It needs to be rewritten with your assembly hat on - only one jump is needed prior to one of the eight loops. I'll give it a go and maybe post before I go to bed...
12 Sep 2010, 04:48
edfed

Joined: 20 Feb 2006
Posts: 4252
Location: Now
edfed
for 24 bpp and 32 bpp, pixels are the same, 24 bpp.

but, in 32 bpp, they are aligned on 32 bits boundary.

then, there is no need to putpixel with a test for 24 bpp.

only the 24 bpp put pixel is ok for both cases.

Code:
```putpixel:
push    rdi
mov     edi,ebx
imul    edi,[RES_X]
imul    edi,[BPP]
mov     word[rdi],si
ror     rsi,16
mov     byte[rdi+2],sil
rol     rsi,16
.bppok:
pop     rdi
ret
```
12 Sep 2010, 07:33

Joined: 25 Sep 2003
Posts: 2139
Location: Estonia
32bpp putpixel is smaller and faster than 24bpp one, but two routines would waste even more. That is why I've chosen this one. Actually the best would be to compile two versions of the image, which of the first one (24bpp) would be for compatibility with extremely old videocards and some emulators (QEMU). I could just assume 32bpp everywere and coding would be much easier. In 32bpp world there is no need to imul edi,[BPP] and I could use a shortcut like lea edi,[4*rdi] or add edi,edi / add edi,edi or shl edi,2.
Code:
```putpixel24:
push    rax rdi
mov     edi,ebx
imul    edi,[RES_X]
lea     edi,[rdi*3]
mov     eax,[rdi]
and     eax,0FF000000h
or      eax,esi
mov     [rdi],eax
pop     rdi rax
ret

putpixel32:
push    rdi
mov     edi,ebx
imul    edi,[RES_X]
lea     edi,[rdi*4]
mov     [rdi],esi
pop     rdi
ret
```
12 Sep 2010, 10:25
bitRAKE

Joined: 21 Jul 2003
Posts: 3285
Location: vpcmipstrm
bitRAKE
The current version of your line routine often gets stuck in an endless loop because ECX/EDX can also equal [.longlen]. Hooking it up to a PRNG is a more thorough way to test, imho. Please, test the program below. Presently it's at over 350K random lines per second.

_________________
12 Sep 2010, 11:42

Joined: 25 Sep 2003
Posts: 2139
Location: Estonia
267k Bresenham's and 318k EFLines per second.
It was not an infinite loop - it crashed - and now I fixed the implementation to respect the C-style ++x/--y. Example:
Code:
```      .line1:
add     edx,1 ;++y means increment before condition test
cmp     edx,[longLen]
jg      .exit ; It can still be jg instead of jge
```

It works, but I think decInc shouldn't be left for the running program to decide. It is not guaranteed to be 0 when it is not initialized to 0.
I think this is dangerous:
mov eax,[.shortLen-2];else decInc = (shortLen << 16) / longLen;
.shortLen-2 dword holds the 2 high bytes of the yLonger boolean value. If it happens to be -1 then the high 16 bits of eax are trashed.

I think we can (to be safe) sacrifice the additional bytes and cycles needed for the decInc=0 and the shortLen shl 16 sequences. The performance won't matter much.
12 Sep 2010, 17:27
edfed

Joined: 20 Feb 2006
Posts: 4252
Location: Now
edfed

your EFLA algo looks like a fixed point arythmetrick!

means that maybe a great optimisation can be made that will allow very few instructions, only on registers, in the loop to compute each dot.

Code:
```.line1:
cmp ebx,edx            ;edx = longlen
jg .exit1
mov eax,ecx
shr eax,16
call putpixel        ;            myPixel(surface,j >> 16,y);
jmp .line1
.exit1:
```

and if putpixel is optimised inside the loop, the gain will be very high, but not compatible with putpixel subroutinage....
13 Sep 2010, 09:32
edfed

Joined: 20 Feb 2006
Posts: 4252
Location: Now
edfed
if using ecx as a loop register:
3 instructions disapears

cmp ebx,edx
jg .exit

but the instruction mov ebx,ecx reapear

Code:
```;pixel assume eax=X, ebx=Y

mov edx,shortlen
mov ecx,longlen
sub ecx,edx

line1:
lea ebx,[ecx+edx]; ecx = longlen - shortlen; edx =
mov eax,esi ;esi=j
shr eax,16
call putpixel
loop .line1
```
13 Sep 2010, 09:52
bitshifter

Joined: 04 Dec 2007
Posts: 786
Location: Massachusetts, USA
bitshifter
Next step is to special case each of the 8 octants.
The code will grow in size but contain less screwing around
with abs() and swapping registers and crap.
You may also want to break out special case horz/vert lines also.
Just remember: generic=slow, special cased=fast.
Dont build the code around the SetPixel routine,
but instead build the SetPixel routine around (into) the code.
14 Sep 2010, 00:56
bitRAKE

Joined: 21 Jul 2003
Posts: 3285
Location: vpcmipstrm
bitRAKE
This just seems easier to me:
(...and doesn't crash )
Code:
```DrawLine:
.x0     equ esi
.y0     equ edi
.x1     equ ecx
.y1     equ edx
.color  equ ebp

.x_     equ edx
.y_     equ .x_
.slope  equ eax

sub .x1,.x0
sbb ebx,ebx     ; 0=right       1=left

sub .y1,.y0
sbb eax,eax     ; 0=up          1=down

xor .x1,ebx
xor .y1,eax
sub .x1,ebx
sub .y1,eax
rcl ebx,1       ; merge flags

cmp .x1,.y1
jz .45 ; (rare) special handling
mov eax,.x1
cmovc .x1,.y1
cmovc .y1,eax

rcl ebx,1 + 4   ; 0=x         1=y
xor eax,eax
div .y1 ; loop counter is longest delta

and ebx,111b shl 4
jmp rbx

.45:    jrcxz .dot ; (rare) single pixel
mov edx,ebx
or eax,1
or edx,1
@@:     call .plot
loop @B
retn

.dot:   call .plot
retn

align 16
.tab:   OCTANT 0,1,2,3,4,5,6,7

.plot:  imul ebx,.y0,CLIENT.X
mov dword [gfx + rbx*4],.color
retn

macro OCTANT [num] {
align 16
@@:   call .plot
if num and 1
if num and 2
dec .y0
else
inc .y0
end if
if num and 4
sub .x_,.slope
sbb .x0,0
else
end if
else
if num and 4
dec .x0
else
inc .x0
end if
if num and 2
sub .y_,.slope
sbb .y0,0
else
end if
end if
loop @B
retn
}    ```
...didn't even need the 'other' registers.
So, 32-bit implementation is straight forward.

In-lining the .plot code provided a %25 increase!
14 Sep 2010, 12:47
 Display posts from previous: All Posts1 Day7 Days2 Weeks1 Month3 Months6 Months1 Year Oldest FirstNewest First

 Jump to: Select a forum Official----------------AssemblyPeripheria General----------------MainTutorials and ExamplesDOSWindowsLinuxUnixMenuetOS Specific----------------MacroinstructionsOS ConstructionIDE DevelopmentProjects and IdeasNon-x86 architecturesHigh Level LanguagesProgramming Language DesignCompiler Internals Other----------------FeedbackHeapTest Area

Forum Rules:
 You cannot post new topics in this forumYou cannot reply to topics in this forumYou cannot edit your posts in this forumYou cannot delete your posts in this forumYou cannot vote in polls in this forumYou cannot attach files in this forumYou can download files in this forum