flat assembler
Message board for the users of flat assembler.

Index > Tutorials and Examples > Futex (for fast thread synchronization) example on Linux

Author
Thread Post new topic Reply to topic
Jessé



Joined: 03 May 2025
Posts: 79
Location: Brazil
Jessé 06 Feb 2026, 04:38
I've carved this example while discussing (in C, for more accuracy) with ChatGPT about futex usage on Linux:

Code:
format ELF64 executable 3

include 'fastcall_v1.inc'
include 'stdmacros.inc'
include 'stdio.inc'


; Futex operation codes, loaded into esi for the futex syscall.
; Values match FUTEX_WAIT / FUTEX_WAKE from the Linux <linux/futex.h> header.
define  FUTEX_WAIT          0
define  FUTEX_WAKE          1

; OR-ed into the operation code (see "FUTEX_WAIT or FUTEX_PRIVATE" in the code).
; Value matches FUTEX_PRIVATE_FLAG from <linux/futex.h>.
define  FUTEX_PRIVATE       128

; Syscall numbers loaded into eax before `syscall` (x86-64 Linux numbering).
define  SYSCALL_NANOSLEEP   35
define  SYSCALL_FUTEX       202


; Thread handles filled in by thrd_create() and consumed by thrd_join().
; thread3 is reserved but not referenced by the code shown here.
_bss    thread1         dq ?
        thread2         dq ?
        thread3         dq ?

        
; event:     32-bit word shared by main and the worker threads. Bit 0 is set by
;            main to signal the worker and cleared by the worker to acknowledge.
;            It is also the futex word passed to SYSCALL_FUTEX.
; ftx_loops: incremented by Thread2 each time its futex wait returns while
;            bit 0 of event is still clear.
_data   event           dd 0
        ftx_loops       dd 0

        
; Program entry point plus the anonymous main function it hands to libc.StartMain.
; main runs two timing experiments, each measuring the TSC cycles between
; "main sets bit 0 of event" and "main observes the worker cleared it":
;   T1: worker polls event between 500 us nanosleep() calls (Thread1)
;   T2: worker blocks in FUTEX_WAIT and is woken with FUTEX_WAKE (Thread2)
; NOTE(review): libc.StartMain, thrd_create, usleep, fprintf, ... are call macros
; from the included files; presumably they follow the SysV AMD64 convention and
; libc.StartMain ends up calling the function at &@f — confirm.
_code   Start entry     endbr64
                        libc.StartMain(&@f);
                        
                @@      endbr64
                        push        rbp
                        mov         rbp, rsp
                        ; rbx / r15 carry the starting TSC halves and later the result
                        ; across macro calls, so they are saved here and restored on exit.
                        push        rbx
                        push        r15
                        
                        ; Replace [stdout] with the qword it points to.
                        ; NOTE(review): presumably the imported symbol holds the address of
                        ; libc's stdout variable, so this leaves the FILE* itself — confirm.
                        mov         rax, [stdout]
                        mov         rcx, [rax]
                        mov         [stdout], rcx
                        
                        ; ---- Experiment 1: nanosleep-polling worker -----------------------
                        thrd_create(&thread1, &Thread1, NULL);
                        
                        ; Delay before signalling (presumably so the worker is already in
                        ; its polling loop when the measurement starts).
                        usleep(5000000);
                        
                        ; Start timestamp: ebx = TSC low half, r15d = TSC high half.
                        lfence
                        rdtsc
                        mov         ebx, eax
                        mov         r15d, edx
                        ; Signal the worker, then spin until it clears bit 0 again.
                        lock or     [event], 1
                @@      test        [event], 1
                        jnz         @b
                        ; End timestamp in edx:eax; merge both readings into 64-bit values
                        ; and keep the difference (end - start) in r15 across the calls below.
                        mfence
                        rdtsc
                        shl         r15, 32
                        shl         rdx, 32
                        or          rbx, r15
                        or          rax, rdx
                        sub         rax, rbx
                        mov         r15, rax
                        
                        ; Short pause before printing (presumably lets the worker finish
                        ; its own " done." output first).
                        usleep(2000);
                        fprintf([stdout], <"T1: took %lu cycles to sync.",10,0>, r15);
                        fflush([stdout]);
                        
                        thrd_join([thread1], NULL);
                        
                        nop
                        nop
                        
                        ; ---- Experiment 2: futex-waiting worker ---------------------------
                        thrd_create(&thread2, &Thread2, NULL);
                        
                        usleep(5000000);
                        
                        ; Start timestamp, same register use as in experiment 1.
                        lfence
                        rdtsc
                        mov         ebx, eax
                        mov         r15d, edx
                        ; The @@ on the next line is not referenced by any jump in this
                        ; function: the `jnz @b` below binds to the nearer @@ on the
                        ; `test` line.
                @@      lock or     [event], 1
                        ; futex(&event, FUTEX_WAKE | FUTEX_PRIVATE, 1): wake at most one
                        ; waiter. r10, r8 and r9 are simply zeroed.
                        mov         eax, SYSCALL_FUTEX
                        lea         rdi, [event]
                        mov         esi, FUTEX_WAKE or FUTEX_PRIVATE
                        mov         edx, 1
                        xor         r10, r10
                        xor         r8, r8
                        xor         r9d, r9d
                        syscall
                        ; The syscall result is not checked; completion is detected by
                        ; spinning until the worker clears bit 0.
                @@      test        [event], 1
                        jnz         @b
                        mfence
                        rdtsc
                        shl         r15, 32
                        shl         rdx, 32
                        or          rbx, r15
                        or          rax, rdx
                        sub         rax, rbx
                        mov         r15, rax
                        
                        usleep(2000);
                        fprintf([stdout], <"T2: took %lu cycles to sync.",10,0>, r15);
                        fflush([stdout]);
                        
                        thrd_join([thread2], NULL);
                        
                        nop
                        nop
                        
                        ; Restore callee-saved registers and return 0.
                        pop         r15
                        pop         rbx
                        leave
                        xor         eax, eax
                        ret

        ; NANO WAIT thread
        ; Thread start routine passed to thrd_create() for experiment 1.
        ; Polls [event]: sleeps via the nanosleep syscall, then checks whether any
        ; bit of event is set; once it is, toggles bit 0 back and prints " done.".
        ; Returns 0 in eax.
        Thread1:        endbr64
                        push        rbp
                        mov         rbp, rsp
                        
                        fputs("Thread #1 is running and will wait event...", [stdout]);
                        fflush([stdout]);
                        
                        ; Build a 16-byte request on the stack: [rsp] = 0, [rsp+8] = 500000.
                        ; NOTE(review): read as struct timespec {tv_sec, tv_nsec} this is
                        ; 500000 ns = 500 us — confirm against the kernel ABI.
                @@      push        500000
                        push        0
                        ; nanosleep(req = rsp, rem = rsp): the same buffer is used for the
                        ; request and for the remaining time, so a retry after a non-zero
                        ; return reuses whatever the kernel left in it.
                @@      mov         eax, SYSCALL_NANOSLEEP
                        lea         rdi, [rsp]
                        lea         rsi, [rsp]
                        syscall
                        test        eax, eax
                        jnz         @b
                        ; Drop the 16-byte request again.
                        add         rsp, 16
                        ; Any bit set in event means main has signalled.
                        test        [event], -1
                        ; NOTE(review): @b2 is presumably "second-previous anonymous label",
                        ; i.e. the `push 500000` line, so a fresh request is built for the
                        ; next sleep — confirm against the assembler / macro set in use.
                        jz          @b2
                        ; Acknowledge: atomically flip bit 0 (main spins until it reads 0).
                        lock xor    [event], 1
                        
                        puts(" done.");
                        
                        nop
                        nop
                        
                        leave
                        xor         eax, eax
                        ret
                        
        ; FUTEX thread
        ; Thread start routine passed to thrd_create() for experiment 2.
        ; Waits on the futex word [event] with FUTEX_WAIT, then atomically tests and
        ; clears bit 0. If the bit was not set, the wake-up is counted in ftx_loops
        ; and the wait is repeated. Returns 0 in eax.
        Thread2:        endbr64
                        push        rbp
                        mov         rbp, rsp
                        
                        fputs("Thread #2 is running and will wait event...", [stdout]);
                        fflush([stdout]);
                        
                        ; futex(&event, FUTEX_WAIT | FUTEX_PRIVATE, 0): edx = 0 is the value
                        ; the word is expected to hold; per the futex(2) contract the kernel
                        ; only blocks while [event] still equals it. r10 (timeout) is zero.
                        ; The syscall result is not inspected.
                @@      mov         eax, SYSCALL_FUTEX
                        lea         rdi, [event]
                        mov         esi, FUTEX_WAIT or FUTEX_PRIVATE
                        xor         edx, edx
                        xor         r10, r10
                        xor         r8, r8
                        xor         r9d, r9d
                        syscall
                        ; Atomically clear bit 0; CF receives its previous value.
                        lock btr    [event], 0
                        ; CF = 1: main had signalled -> done (this clear is also the
                        ; acknowledgement main is spinning on).
                        jc          @f
                        ; CF = 0: woke up without a signal; count it and wait again.
                        inc         [ftx_loops]
                        jmp         @b
                        
                @@      fprintf([stdout], <" done within %u syscall loops.",10,0>, [ftx_loops]);
                        
                        nop
                        nop
                        
                        leave
                        xor         eax, eax
                        ret
                        

    


It worked fine and also measures the time difference (based on TSC, so, it wobbles a lot) between 'Thread1' (nanosleep() mode) and 'Thread2' (futex WAIT/WAKE mode).

I've seen a clear win (every time) for the futex method while testing this. It will be the technique I adopt in a GUI application I'm writing.

Hope it helps someone someday. Macros and headers that support this example can be found here.

Regards,

_________________
jesse6
Post 06 Feb 2026, 04:38
View user's profile Send private message Visit poster's website Reply with quote
Jessé



Joined: 03 May 2025
Posts: 79
Location: Brazil
Jessé 06 Feb 2026, 04:45
I must add the fact that the 500 µs used in the nanosleep() technique is the shortest wait time I found (here) before things start to demand more CPU (below this point).
Post 06 Feb 2026, 04:45
View user's profile Send private message Visit poster's website Reply with quote
bitRAKE



Joined: 21 Jul 2003
Posts: 4364
Location: vpcmpistri
bitRAKE 07 Feb 2026, 03:11
Looks like you are creating unnecessary contention by putting data in the same cache line. Any data in the mutex cache line also becomes artificially pinned unless all the threads are on the same core. Split the data to separate cache lines and you'll see greater consistency even in low demand scenarios, imho.

If possible, each thread should only ever read or only ever write any given cache line - single producer, single consumer. These types of patterns can often be made lock-free. As you've seen, this might only have an impact in high-demand workloads -- there are just too many resources otherwise. Split the work into read-write pairs and you'll see greater performance in high-demand work.

_________________
¯\(°_o)/¯ AI may [not] have aided with the above reply.
Post 07 Feb 2026, 03:11
View user's profile Send private message Visit poster's website Reply with quote
Jessé



Joined: 03 May 2025
Posts: 79
Location: Brazil
Jessé 07 Feb 2026, 06:02
Nice theoretical tip. But...

I know people nowadays love lockless code (for unknown reasons), but, I've seen no one presenting a single measure of that "impact", like I did to myself before.
This only becomes meaningful if one has an assembly program with some millions or billions of lines of code, or a loop that runs millions of lines some thousands of times in a row, which I've never seen anyone do up to this date, either. Just like the aligned access fever (which I have also measured, instead of just speculating about), which also has meaningless results for tiny programs.

So, to answer your point, I measured the impact of it, and it is 10 cycles for each on my AMD Ryzen 7.

To me, the lock will stay - for hardware locked thread safety, and where thread creation is the real concern.

(I will probably thank you if you come up with your lockless cache splitted version of it, so I can measure your suggestion properly)

By the way, where is that "mutex" located according to you on the above code? Have you ever read before any Linux internals documentation regarding what's really happening on the above source (a tip: a hash is created)? I start to think this is a speculation, too...
Post 07 Feb 2026, 06:02
View user's profile Send private message Visit poster's website Reply with quote
bitRAKE



Joined: 21 Jul 2003
Posts: 4364
Location: vpcmpistri
bitRAKE 07 Feb 2026, 09:49
I hope you'll forgive my imitation of your coding style - the goal is to remove communication barriers and not to add to them.

The kernel hash mechanism does not negate user space false sharing; the cache-line issue is about the user space memory line containing event. My comments were about how the processor (typically) functions.

Perhaps this code is not completely orthogonal to your (future) goals:
Code:
format ELF64 executable 3

include 'fastcall_v1.inc'
include 'stdmacros.inc'
include 'stdio.inc'


; Futex operation codes (loaded into esi) and the private flag OR-ed into them.
; Values match FUTEX_WAIT / FUTEX_WAKE / FUTEX_PRIVATE_FLAG in <linux/futex.h>.
define  FUTEX_WAIT          0
define  FUTEX_WAKE          1

define  FUTEX_PRIVATE       128

; Syscall numbers loaded into eax. SYSCALL_NANOSLEEP is not used by the code
; shown in this listing.
define  SYSCALL_NANOSLEEP   35
define  SYSCALL_FUTEX       202

;/----------------------------\;
;        Build Switches        ;
;\----------------------------/;
; ITERS:            number of signal/acknowledge round trips; initial value of
;                   main_loops and divisor for the printed average.
; SPLIT_CACHELINES: selects the data layout in the _data section.
; USE_RDTSCP:       selects rdtscp vs. lfence+rdtsc inside my_tsc.
; USE_PAUSE:        selects whether spin_wait_event_zero issues `pause`.
define ITERS             5_000_000 ; try 1-5 million
define SPLIT_CACHELINES  1        ; 0 = packed (false sharing), 1 = split
define USE_RDTSCP        1         ; Ryzen supports this
define USE_PAUSE         1
; perhaps greater consistency if you have many cores:
;     taskset -c 0,1

; some helper macros

; my_tsc — read the time-stamp counter as one 64-bit value.
; Out:   rax = (edx << 32) | eax from rdtsc / rdtscp
; Clobb: rdx, flags (shl/or); with USE_RDTSCP also ecx, which rdtscp overwrites
;        (processor ID / IA32_TSC_AUX per the instruction reference). The
;        expansion never reads ecx, so core migration is not actually checked.
macro my_tsc ; serializing return of 64-bit TSC in RAX
        if USE_RDTSCP
                rdtscp ; we can also detect core migration
        else
                lfence ; complete earlier instructions
                rdtsc
        end if
        lfence ; don't allow TSC read to slip past following instructions
        shl rdx, 32
        or rax, rdx
end macro

; spin_wait_event_zero — busy-wait until the dword [event] reads as zero.
; Uses plain loads (no lock prefix). With USE_PAUSE a `pause` is executed
; between two polls; otherwise the load/test pair is repeated back to back.
; Out:   eax = 0 (the last value loaded from [event])
; Clobb: eax, flags
; The labels are declared `local`, so the macro can be expanded more than once.
macro spin_wait_event_zero
        local spin,done
spin:
        mov eax, [event] ; no lock needed
        test eax, eax
        if USE_PAUSE
                jz done
                pause
                jmp spin
        else
                jnz spin
        end if
done:
end macro


; Handle of the single worker thread (written by thrd_create, read by thrd_join).
_bss    thread1         dq ?

        
; Shared variables:
;   event      — futex word. main stores 1 (work) or 2 (exit); the worker
;                exchanges it back to 0.
;   ftx_loops  — incremented by the worker after each consumed event.
;   main_loops — remaining iterations, decremented by main.
; SPLIT_CACHELINES decides whether each one starts on its own 64-byte boundary
; or whether all three are packed next to each other.
_data

if SPLIT_CACHELINES
        align 64 ; next cache line
        event dd 0

        align 64 ; next cache line
        ftx_loops dd 0

        align 64 ; next cache line
        main_loops dq ITERS
else
        ; packed: both in same cache line -> false sharing between threads
        event           dd 0
        ftx_loops       dd 0
        main_loops      dq ITERS
end if

        
; Program entry point plus the anonymous main function it hands to libc.StartMain.
; main performs ITERS round trips with the worker thread (Thread1):
;   store 1 into event -> FUTEX_WAKE -> spin until event is 0 again,
; timing each round trip with my_tsc and keeping sum / min / max.
; Afterwards it prints the statistics, stores 2 into event to make the worker
; exit, wakes it, joins it and returns 0.
; Register roles inside the loop:
;   rbx = TSC at start of the current round trip
;   r12 = sum of all deltas, r13 = minimum delta, r14 = maximum delta
_code   Start entry     endbr64
                        libc.StartMain(&@f);
                        
                @@      endbr64
                        push        rbp
                        mov         rbp, rsp
                        ; Extra 8-byte slot: together with the five pushes below this makes
                        ; six pushes after rbp, keeping rsp a multiple of 16 at the macro
                        ; calls (assuming the usual SysV entry state rsp % 16 == 8). It is
                        ; never popped; `leave` discards it.
                        push        rbx ; to align stack?
                        ; r15 is saved and restored but not otherwise used in this function.
                        push        r15
                        push        r14
                        push        r13
                        push        r12
                        push        rbx
                        
                        ; Replace [stdout] with the qword it points to.
                        ; NOTE(review): presumably turns the imported symbol into the
                        ; FILE* itself — confirm against the include files.
                        mov         rax, [stdout]
                        mov         rcx, [rax]
                        mov         [stdout], rcx
                        
                        thrd_create(&thread1, &Thread1, NULL);

                        ; Warm up / let thread park
                        usleep(10000);
                        
                        ; Stats
                        xor         r12, r12 ; sum cycles
                        or          r13, -1  ; min cycles
                        xor         r14, r14 ; max cycles

                .loop:
                        ; rbx = start timestamp (rax then holds the old rbx and is
                        ; overwritten right away).
                        my_tsc
                        xchg        rbx, rax

                        ; Signal: event = 1 (atomic)
                        mov         eax, 1
                        xchg        [event], eax

                        ; Wake: futex_wake(&event, 1)
                        ; The syscall result is not checked.
                        mov         eax, SYSCALL_FUTEX
                        lea         rdi, [event]
                        mov         esi, FUTEX_WAKE or FUTEX_PRIVATE
                        mov         edx, 1
                        xor         r10, r10
                        xor         r8,  r8
                        xor         r9d, r9d
                        syscall

                        ; Wait until worker consumes (event back to 0)
                        spin_wait_event_zero

                        my_tsc
                        sub         rax, rbx    ; dt

                        ; Unsigned comparisons: r13 starts at all-ones, r14 at zero.
                        add         r12, rax    ; sum
                        cmp         rax, r13
                        cmovb       r13, rax    ; min
                        cmp         rax, r14
                        cmova       r14, rax    ; max

                        dec         [main_loops]
                        jnz         .loop

                        ; avg = sum / ITERS
                        mov         rax, r12
                        xor         rdx, rdx
                        mov         ecx, ITERS
                        div         rcx         ; rax = avg

                        ; Print: min/max/avg and near_event_writes
                        ; NOTE(review): avg is passed straight from rax; this relies on the
                        ; fprintf macro not overwriting rax before it loads that argument —
                        ; confirm against fastcall_v1.inc.
                        ; NOTE(review): the worker clears event before it increments
                        ; ftx_loops, so the value read here may not yet include the
                        ; increment for the final iteration.
                        fprintf([stdout], <"iters=%u min=%lu max=%lu avg=%lu nearWrites=%u",10,0>, ITERS, r13, r14, rax, [ftx_loops])
                        fflush([stdout])

                        ; Tell worker to exit: event=2 then wake
                        mov          eax, 2
                        xchg         eax, [event]

                        mov          eax, SYSCALL_FUTEX
                        lea          rdi, [event]
                        mov          esi, FUTEX_WAKE or FUTEX_PRIVATE
                        mov          edx, 1
                        xor          r10, r10
                        xor          r8,  r8
                        xor          r9d, r9d
                        syscall

                        thrd_join([thread1], NULL);
                        
                        nop
                        nop
                        
                        ; Restore in reverse push order; the first (alignment) slot is
                        ; left on the stack and removed by `leave`.
                        pop         rbx
                        pop         r12
                        pop         r13
                        pop         r14
                        pop         r15
                        leave
                        xor         eax, eax
                        ret

        ; FUTEX thread
        ; Thread start routine passed to thrd_create(). Loop:
        ;   - while [event] is 0, block in FUTEX_WAIT on it;
        ;   - once it is non-zero, atomically exchange it with 0;
        ;   - value 2 -> return 0; any other value -> increment ftx_loops and
        ;     go back to waiting.
        ; Makes no calls; only eax, ecx, edx, esi, rdi, r8-r10 and whatever the
        ; syscall instruction itself clobbers are modified.
        Thread1:        endbr64
                        push        rbp
                        mov         rbp, rsp
                .wait:
                        ; Skip the syscall entirely when an event is already pending.
                        cmp         [event], 0
                        jnz         .consume

                .sleep:
                        ; futex(&event, FUTEX_WAIT | FUTEX_PRIVATE, 0): edx = 0 is the
                        ; value the word is expected to hold; r10 (timeout) is zero.
                        mov         eax, SYSCALL_FUTEX
                        lea         rdi, [event]
                        mov         esi, FUTEX_WAIT or FUTEX_PRIVATE
                        xor         edx, edx
                        xor         r10, r10
                        xor         r8, r8
                        xor         r9d, r9d
                        syscall

                ; ignore errors for demo (EINTR/EAGAIN/spurious) and re-check

                        jmp         .wait

                .consume:
                        ; atomically grab current value and clear
                        ; (this store of 0 is what main's spin loop is waiting for)
                        xor         ecx, ecx
                        xchg        [event], ecx ; implied lock

; Not needed because there is only one thread (main) storing non-zero values.
; If it was non-zero then it is still non-zero.
;                       jrcxz       .sleep

                        ; ecx = value main stored: 2 is the exit request.
                        cmp         ecx, 2
                        jz          .exit

; Optional: write a counter "near" the futex word (false sharing when packed)
                        inc         [ftx_loops]

                        jmp         .wait

                .exit:
                        leave
                        xor         eax, eax
                        ret
    
My guess is that using all the build switches will produce lower averages versus not.
Post 07 Feb 2026, 09:49
View user's profile Send private message Visit poster's website Reply with quote
Jessé



Joined: 03 May 2025
Posts: 79
Location: Brazil
Jessé 07 Feb 2026, 16:34
Yes, that's good argumentation now (with an example back)!
I'll read it and test it carefully for sure.

Quote:
The kernel hash mechanism does not negate user space false sharing;

I guess you might be correct about this.
Perhaps I need a little more reading on how it works.

Many thanks!
Post 07 Feb 2026, 16:34
View user's profile Send private message Visit poster's website Reply with quote
Display posts from previous:
Post new topic Reply to topic

Jump to:  


< Last Thread | Next Thread >
Forum Rules:
You cannot post new topics in this forum
You cannot reply to topics in this forum
You cannot edit your posts in this forum
You cannot delete your posts in this forum
You cannot vote in polls in this forum
You cannot attach files in this forum
You can download files in this forum


Copyright © 1999-2026, Tomasz Grysztar. Also on GitHub, YouTube.

Website powered by rwasa.