flat assembler
Message board for the users of flat assembler.

Index > Main > SSE BCD numbers.

Author
Thread Post new topic Reply to topic
Roman



Joined: 21 Apr 2012
Posts: 2030
Roman 21 Nov 2025, 12:18
fasmw 1.73
i used DeepSeek and get this code
Code:
align 16
    ;bcd_data1     dw 0x1234, 0x5678, 0x9012, 0x3456, 0x7890, 0x1234, 0x5678, 0x9012
    bcd_data1:     times 8 dw 0x2229
    bcd_data2:     times 8 dw 0x1111
    bcd_output:    times 8 dw 0
    six_mask:      times 8 dw 0x0006  ; Correction value
    nine_mask:     times 8 dw 0x0009  ; Comparison value  

bcd_add_sse:
    movdqu xmm0, xword [bcd_data1]  ; Load first BCD number (4 digits per word)
    movdqu xmm1,xword  [bcd_data2]  ; Load second BCD number
    
    ; Add the BCD words
    paddw xmm0, xmm1          ; Add words
    
    ; BCD correction for 4-digit words
    movdqu xmm2, xmm0         ; Copy result
    movdqu xmm3, [nine_mask]  ; Load 9 mask
    
    ; Check each nibble for >9
    movdqu xmm4, xmm0
    psubusw xmm4, xmm3        ; Unsigned subtract with saturation
    pand xmm4, [six_mask]     ; Mask with 6 where >9
    
    ; Apply correction
    paddw xmm0, xmm4          ; Add correction
    
    movdqu [bcd_output], xmm0 ; Store result
    ret                     
    

I do call bcd_add_sse and get in bcd_output text ":3:3:3:3:3:3:3:3"
3A,33,3A,33,3A,33,3A,33,3A,33,3A,33,3A,33,3A,33
Post 21 Nov 2025, 12:18
View user's profile Send private message Reply with quote
bitRAKE



Joined: 21 Jul 2003
Posts: 4315
Location: vpcmpistri
bitRAKE 21 Nov 2025, 21:39
I'm assuming you want packed-BCD words SIMD added resulting in the same data type.

Does this produce the desired result?
Code:
; SSE2 packed-BCD add for 4-digit words (16-bit lanes).
; Inputs:
;   xmm0 = A  (8x u16 words, each word has 4 BCD digits packed per nibble)
;   xmm1 = B  (same layout)
; Output:
;   xmm0 = A + B (packed BCD), ignoring carry out of the top digit (d3).
;
; Word layout (per 16-bit lane, little-endian bytes):
;   low byte : [d1][d0]  (high nibble = d1, low nibble = d0)
;   high byte: [d3][d2]  (high nibble = d3, low nibble = d2)
;
; Clobbers xmm2–xmm7.

bcd_add4w_sse2:
    ; ---- extract low and high nibbles into bytes ----
    movdqa  xmm2, xmm0
    movdqa  xmm3, xmm1
    pand    xmm2, [mask0F]        ; alo = low nibbles (d0,d2 in even/odd bytes)
    pand    xmm3, [mask0F]        ; blo

    movdqa  xmm4, xmm0
    movdqa  xmm5, xmm1
    psrlw   xmm4, 4
    psrlw   xmm5, 4
    pand    xmm4, [mask0F]        ; ahi = high nibbles (d1,d3 in even/odd bytes)
    pand    xmm5, [mask0F]        ; bhi

    ; ---- digit0 (d0, even bytes) ----
    movdqa  xmm6, xmm2
    pand    xmm6, [evenMask]      ; d0
    movdqa  xmm7, xmm3
    pand    xmm7, [evenMask]      ; b0
    paddb   xmm6, xmm7            ; s0 = d0 + b0

    movdqa  xmm7, xmm6
    pcmpgtb xmm7, [nine]          ; carry0 mask (0xFF where s0 > 9)

    movdqa  xmm0, xmm7
    pand    xmm0, [six]           ; 6*carry0
    paddb   xmm6, xmm0            ; s0 += 6 if needed
    pand    xmm6, [mask0F]        ; digit0 normalized to 0..9 in even bytes

    pand    xmm7, [one]           ; carry0 as 0/1 in even bytes

    ; ---- digit1 (d1, even bytes) ----
    movdqa  xmm1, xmm4
    pand    xmm1, [evenMask]      ; d1
    movdqa  xmm0, xmm5
    pand    xmm0, [evenMask]      ; b1
    paddb   xmm1, xmm0
    paddb   xmm1, xmm7            ; + carry0

    movdqa  xmm0, xmm1
    pcmpgtb xmm0, [nine]          ; carry1 mask (even bytes)

    movdqa  xmm7, xmm0
    pand    xmm7, [six]
    paddb   xmm1, xmm7            ; adjust
    pand    xmm1, [mask0F]        ; digit1 (even bytes)

    pand    xmm0, [one]           ; carry1 as 0/1 in even bytes
    psllw   xmm0, 8               ; shift to odd byte of each word (carry into d2)

    ; ---- digit2 (d2, odd bytes) ----
    movdqa  xmm7, xmm2
    pand    xmm7, [oddMask]       ; d2
    pand    xmm3, [oddMask]       ; b2  (blo no longer needed after this)
    paddb   xmm7, xmm3
    paddb   xmm7, xmm0            ; + carry1

    movdqa  xmm3, xmm7
    pcmpgtb xmm3, [nine]          ; carry2 mask (odd bytes)

    movdqa  xmm0, xmm3
    pand    xmm0, [six]
    paddb   xmm7, xmm0            ; adjust
    pand    xmm7, [mask0F]        ; digit2 (odd bytes)

    pand    xmm3, [one]           ; carry2 as 0/1 in odd bytes (into d3)

    ; ---- digit3 (d3, odd bytes) ----
    movdqa  xmm0, xmm4
    pand    xmm0, [oddMask]       ; d3
    movdqa  xmm2, xmm5
    pand    xmm2, [oddMask]       ; b3
    paddb   xmm0, xmm2
    paddb   xmm0, xmm3            ; + carry2

    movdqa  xmm2, xmm0
    pcmpgtb xmm2, [nine]          ; carry3 mask (ignored beyond the word)

    movdqa  xmm3, xmm2
    pand    xmm3, [six]
    paddb   xmm0, xmm3            ; adjust
    pand    xmm0, [mask0F]        ; digit3 (odd bytes)

    ; ---- repack nibbles back into packed BCD bytes ----
    movdqa  xmm2, xmm1
    psllw   xmm2, 4               ; digit1 << 4 (even bytes only)
    por     xmm2, xmm6            ; low bytes = [d1][d0]

    movdqa  xmm3, xmm0
    psllw   xmm3, 4               ; digit3 << 4 (odd bytes only)
    por     xmm3, xmm7            ; high bytes = [d3][d2]

    por     xmm2, xmm3
    movdqa  xmm0, xmm2            ; result
    ret


section .rodata align=16
mask0F:    db 0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F
evenMask:  db 0xFF,0x00,0xFF,0x00,0xFF,0x00,0xFF,0x00,0xFF,0x00,0xFF,0x00,0xFF,0x00,0xFF,0x00
oddMask:   db 0x00,0xFF,0x00,0xFF,0x00,0xFF,0x00,0xFF,0x00,0xFF,0x00,0xFF,0x00,0xFF,0x00,0xFF
nine:      db 0x09,0x09,0x09,0x09,0x09,0x09,0x09,0x09,0x09,0x09,0x09,0x09,0x09,0x09,0x09,0x09
six:       db 0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06
one:       db 0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01    
(Untested, but looks good. Smile)

Perhaps the ADC equivalent would be a better algorithm to work on - more flexible. Yet, it seems like an unpacked core set of functions with a packed layer on top using them would be even more flexible.

_________________
¯\(°_o)/¯ AI may [not] have aided with the above reply.
Post 21 Nov 2025, 21:39
View user's profile Send private message Visit poster's website Reply with quote
Display posts from previous:
Post new topic Reply to topic

Jump to:  


< Last Thread | Next Thread >
Forum Rules:
You cannot post new topics in this forum
You cannot reply to topics in this forum
You cannot edit your posts in this forum
You cannot delete your posts in this forum
You cannot vote in polls in this forum
You cannot attach files in this forum
You can download files in this forum


Copyright © 1999-2025, Tomasz Grysztar. Also on GitHub, YouTube.

Website powered by rwasa.