flat assembler
Message board for the users of flat assembler.

Index > Linux > Assembly SSE/MMX Infinite Loop

Author
Thread Post new topic Reply to topic
chris03_dev



Joined: 24 Oct 2020
Posts: 3
chris03_dev 22 Oct 2021, 05:33
Greetings. This is a bit of a newbie question, as I have just been learning floating-point assembly... SSE in particular.

I am currently using FASM as a compiler backend, and currently testing for Linux. I am currently stumped, as this code should have worked perfectly fine, but instead caused an infinite loop:

Code:
.c0_l:
; x /= y
pxor xmm0, xmm0
pxor xmm1, xmm1
movss xmm1, dword [rbp-16]
movss xmm0, dword [rbp-8]
divss xmm0, xmm1
movss [rbp-8], xmm0

; hello:5 += 1
inc [hello+5]

; hello:5 >= 127
movzx eax, byte [hello+5]
cmp eax, 127
setae al
and al, 1
movzx eax, al

cmp eax, 0
je .c1_0

; hello:5 = 32
mov [hello+5], byte 32

.c1_0:
.c1e:

mov rsi, hello
mov rdx, 13
mov rax, 1
mov rdi, 1
syscall
; x >= 50
pxor xmm0, xmm0
movss xmm0, dword [rbp-8]
ucomiss xmm0, dword [__F4]
setge al
and al, 1
movzx eax, al

cmp eax, 1
jge .c0_l
    


Subsequently, I have also inserted some system calls for string output in order to verify that the program is indeed having an infinite loop.

Just in case the problem is somewhere else within the program, I shall also provide the whole source code, down below:

Code:
format elf64 executable
segment writeable
align 128
__F0 dd 2500.000000
align 128
__F1 dd 100.000000
align 128
__F2 dd 5.000000
align 128
__F3 dd 5.000000
align 128
__F4 dd 50.000000
align 128
__F5 dd 5.000000
align 128
__F6 dd 0.000000
align 128
__F7 dd 5.000000
align 128
__F8 dd 7.000000
align 64
hello            db 5    dup ("Hello",'"'," World",10,"")
world            db "World",10
ptr3             rq 1
segment executable
align 128
include 'rv64.asm'
strlen:
push rbp
mov rbp, rsp
sub rsp, 20
; len              = rbp - 12
; float            = rbp - 20
; len = 0
mov [rbp-12], word 0

; strlen
mov rax, strlen

; strlen:5:2[1, 2]
; 1
mov eax, 1

mov rdi, rax
; 2
mov eax, 2

mov rsi, rax
mov rax, [strlen+40]
mov rax, [rax+16]
call rax

; len~:4:2[2]:2[3]{u1}:7
; 2
mov eax, 2

mov rdi, rax
mov rax, [rbp-12]
mov rax, [rax+32]
mov rax, [rax+16]
call rax
push rax
; 3
mov eax, 3

mov rdi, rax
pop rax
mov rax, [rax+8]
call rax
mov eax, dword [rbp-12]
mov rdx, rax
movzx eax, byte [rdx+7]

add rsp, 20
pop rbp
ret
entry main
main:
push rbp
mov rbp, rsp
sub rsp, 80
; x                = rbp - 8
; y                = rbp - 16
; z                = rbp - 24
; float            = rbp - 36
; ptr1             = rbp - 48
; ptr2             = rbp - 60
; ptr3             = rbp - 72
; ptr1 = ptr2
lea rcx, [rbp-60]
mov [rbp-48], rcx

; ptr2 = ptr3
lea rcx, [rbp-72]
mov [rbp-60], rcx

; ptr3 = hello + 0
mov r10, hello
add r10, 0
mov [rbp-72], r10

; x *= ptr3:1
pxor xmm0, xmm0
mov rcx, [rbp-72]
movzx ecx, byte [rcx+1]
cvtsi2ss xmm1, rcx

; ptr1 + 1
lea rax, [rbp-48]
inc rax

; ptr2~ *= 1
mov rdx, [rbp-60]
imul eax, [rdx], 1
mov [rdx], al

; x = 2500
pxor xmm0, xmm0
movss xmm0, dword [__F0]
movss [rbp-8], xmm0

; y = 100
pxor xmm0, xmm0
movss xmm0, dword [__F1]
movss [rbp-16], xmm0

; z = 5
pxor xmm0, xmm0
movss xmm0, dword [__F2]
movss [rbp-24], xmm0

; hello:1   = 'a'
mov [hello+1], byte 'a'

; ptr3:2    = 'b'
mov rdx, [rbp-72]
mov [rdx+2], byte 'b'

; ptr2~:3   = 'c'
mov rdx, [rbp-60]
mov rdx, [rdx]
mov [rdx+3], byte 'c'

; ptr1~~:4  = 'd'
mov rdx, [rbp-48]
mov rdx, [rdx]
mov rdx, [rdx]
mov [rdx+4], byte 'd'

; ptr1~~:4 -= 32
mov rdx, [rbp-48]
mov rdx, [rdx]
mov rdx, [rdx]
sub [rdx+4], byte 32

; float = x * 5
pxor xmm2, xmm2
movss xmm2, dword [rbp-8]
mulss xmm2, dword [__F3]
pxor xmm0, xmm0
movsd [rbp-36], xmm0

; strlen[1 + 5, 2, 3]{i4} >> 5
mov [rsp], rax

; 1 + 5
mov eax, 6

mov rdi, rax
; 2
mov eax, 2

mov rsi, rax
; 3
mov eax, 3

mov rdx, rax
mov rax, [rsp]


.c0_l:
; x /= y
pxor xmm0, xmm0
pxor xmm1, xmm1
movss xmm1, dword [rbp-16]
movss xmm0, dword [rbp-8]
divss xmm0, xmm1
movss [rbp-8], xmm0

; hello:5 += 1
inc [hello+5]

; hello:5 >= 127
movzx eax, byte [hello+5]
cmp eax, 127
setae al
and al, 1
movzx eax, al

cmp eax, 0
je .c1_0

; hello:5 = 32
mov [hello+5], byte 32

.c1_0:
.c1e:

mov rsi, hello
mov rdx, 13
mov rax, 1
mov rdi, 1
syscall
; x >= 50
pxor xmm0, xmm0
movss xmm0, dword [rbp-8]
ucomiss xmm0, dword [__F4]
setge al
and al, 1
movzx eax, al

cmp eax, 1
jge .c0_l

; x >= 5
pxor xmm0, xmm0
movss xmm0, dword [rbp-8]
ucomiss xmm0, dword [__F5]
setge al
and al, 1
movzx eax, al

cmp eax, 0
je .c2_0

mov rsi, hello
mov rdx, 13

jmp .c2e
.c2_0:
; x > 0
pxor xmm0, xmm0
movss xmm0, dword [rbp-8]
ucomiss xmm0, dword [__F6]
setg al
and al, 1
movzx eax, al

cmp eax, 0
je .c2_1

mov rsi, world
mov rdx, 5

jmp .c2e
.c2_1:

mov rsi, world
mov rdx, 3

.c2e:

mov rax, 1
mov rdi, 1
syscall
; x * (z + 5) * 7 * z
pxor xmm2, xmm2
movss xmm2, dword [rbp-24]
addss xmm2, [__F7]
pxor xmm0, xmm0
movss xmm0, dword [rbp-8]
mulss xmm0, dword [__F8]
pxor xmm1, xmm1
movss xmm1, dword [rbp-24]

; 0
mov eax, 0

add rsp, 80
pop rbp
mov rdi, rax
mov rax, 60
syscall

    


Thank you for your assistance.

_________________
Every rule has an exception...
Never trust deep learning algorithms to make the final decisions.
Post 22 Oct 2021, 05:33
View user's profile Send private message Reply with quote
macomics



Joined: 26 Jan 2021
Posts: 1043
Location: Russia
macomics 22 Oct 2021, 06:45
chris03_dev wrote:
Code:
cmp eax, 127
setae al
and al, 1
movzx eax, al

cmp eax, 0
je .c1_0    
Code:
setge al
and al, 1
movzx eax, al

cmp eax, 1
jge .c0_l    
I still don't understand what you want to achieve with this. Why are you not satisfied with just jXX commands instead of each of these blocks?
Code:
cmp byte [hello+5], 127
jnae .cl_0    
Code:
jae .c0_l    
about ucomiss wrote:
Performs an unordered compare of the double-precision floating-point values in the low quadwords of operand 1 (first operand) and operand 2 (second operand), and sets the ZF, PF, and CF flags in the EFLAGS register according to the result (unordered, greater than, less than, or equal). The OF, SF and AF flags in the EFLAGS register are set to 0. The unordered result is returned if either source operand is a NaN (QNaN or SNaN).
about setge wrote:
REX + 0F 9D |SETGE r/m8* | Set byte if greater or equal (SF=OF).
After this command, the flags will always be equal. If you need a comparison result, then you need to use an unsigned command.
Code:
jae .c0_l    
Post 22 Oct 2021, 06:45
View user's profile Send private message Reply with quote
chris03_dev



Joined: 24 Oct 2020
Posts: 3
chris03_dev 22 Oct 2021, 11:11
Quote:

I still don't understand what you want to achieve with this. Why are you not satisfied with just jXX commands instead of each of these blocks?


The assembly code is mostly generated by the compiler, save for the inline assembly code used for the system calls. Though, I have yet to implement support for conditional jumping within the expressions themselves.

Besides that, thank you for your clarification. Using unsigned instructions for floating point comparisons is quite an invaluable piece of information for me. I have yet to see another response like yours for this specific issue, throughout the months I have been learning assembly language on my own.

P. S. I don't usually read about EFLAGS registers as I believe it is a waste of time. I usually just analyze code from webpage tutorials, SO, and godbolt.org.

_________________
Every rule has an exception...
Never trust deep learning algorithms to make the final decisions.
Post 22 Oct 2021, 11:11
View user's profile Send private message Reply with quote
Display posts from previous:
Post new topic Reply to topic

Jump to:  


< Last Thread | Next Thread >
Forum Rules:
You cannot post new topics in this forum
You cannot reply to topics in this forum
You cannot edit your posts in this forum
You cannot delete your posts in this forum
You cannot vote in polls in this forum
You cannot attach files in this forum
You can download files in this forum


Copyright © 1999-2025, Tomasz Grysztar. Also on GitHub, YouTube.

Website powered by rwasa.