flat assembler
Message board for the users of flat assembler.

Index > Windows > Optimization advice

Goto page 1, 2  Next
Author
Thread Post new topic Reply to topic
AE



Joined: 07 Apr 2022
Posts: 70
AE 25 Feb 2023, 19:40
Had to write an alternative 'PathRemoveFileSpec' function variant for the special case (a lot of long names).
If we simplify the logic, it all comes down to the analog of the 'wcsrchr' function except that we know in advance the size of the string.
The reason is that all native Windows implementations use slow word-by-word search so I decided to add the SSE2 part.
Unfortunately, no good examples were found and I had to write from scratch.
Some CRT dlls had SSE4.2 algo with std fallback but I need SSE2 specifically.
Certainly there are good examples in glibс, but AT&T syntax and linux code make it not the easiest to understand.

The code works, but in my opinion it is not written in the most optimal way (Also I may have missed something).
That's why I need a fresh look and possible advice on optimization.

Code:
format PE64 GUI 6.0
entry start
include 'win64w.inc'
section '.data' data readable writeable

    struct UNICODE_STRING
        Length             dw ?
        MaximumLength      dw ?
                           dd ?
        Buffer             dq ?
    ends
    SlashPat    dq ?,?  ; 128 bit mask for SSE2 search
    Path        du '\??\X:\Dir\SomeOtherDir\AndAnotherDir\AndAnotherDir\LoooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooongNameDir',0
    CurPath     UNICODE_STRING

section '.text' code readable executable
start:
    sub     rsp,8

    ; Create and save pattern '\' mask
    mov    eax, 005C005Ch                ; du '\\'
    movd   xmm0, eax                     ;
    pshufd xmm0, xmm0, 0                 ; fill whole xmm0 with DWORD value from xmm0
    movdqu dqword [SlashPat], xmm0       ; save xmm0 to var

    mov     [CurPath.Length], 304*2
    mov     [CurPath.Buffer], Path
    @@:
    fastcall StepBackward
    invoke  MessageBox, HWND_DESKTOP, addr Path, 0, MB_ICONERROR or MB_OK or MB_TOPMOST
    cmp     [CurPath.Length], 14
    ja      @b
    error_exit:
    invoke  ExitProcess,0


    proc StepBackward
        ; Used: CurPath.UNICODE_STRING,  SlashPat  dq ?,?  ; 128 bit mask for SSE2 search
        movzx   rcx, [CurPath.Length]       ; String lenght (in bytes)
        mov     r8,  [CurPath.Buffer]       ; String buffer
        mov     r10, r8                     ; Stop  addr
        add     r8,  rcx                    ; Start addr (for simple search)
        ;
        bt      ecx, 0                      ; DEBUG size of wcstring always N|2
        jc      error_exit                  ;       if not - something goes wrong
        bt      r8d, 0                      ; DEBUG addr must be N|2 (aligned in memory)
        jc      error_exit                  ;       if not - something goes wrong
        ;
        ; If string lenght >16 use SSE2 otherwise use std 'per-word' search
        ; cmp     rcx, 16
        ; jb      .simplesearch
        mov     rbx, rcx
        sar     ebx, 4                      ; sse2 cicles num (sz/16)
        cmp     ebx, 1                      ;
        jb      .simplesearch               ; min value 0
        ; jmp     .simplesearch             ; DEBUG disable sse2
        ;
        sub     r8, 16                      ; Start addr (for SSE2)
        ; lea     r8,  [r8+rcx-16]            ; logic was split
        lea     rdx, [SlashPat]             ; pattern (pre-generated)
        movdqu  xmm1,  [rdx]                ; load it
    .SearchChr:
        test    ebx, ebx
        je     .simplesearch_r              ; if something left check it with simple algo
        ;
        movdqu  xmm0, [r8]                  ; Read from string
        sub     r8, 16                      ; step -16
        sub     rbx, 1                      ; cicles -1
        pcmpeqw xmm0, xmm1                  ; compare
        pmovmskb eax, xmm0                  ; send 128-->16-bit
        test eax, eax                       ; check bits
        jz  .SearchChr
    .FoundMatch:
        bsr    eax, eax                     ; get last bit
        lea    rax, [r8+rax+15]             ; final addr
        jmp    .ExitSearch
    .simplesearch_r:
        add     r8, 16                       ; restore position
    .simplesearch:
        mov     rax, r8
        @@:
        cmp     rax, r10                     ; cmp cur addr vs end addr
        jb      .ExitSearch                  ;
        ;
        sub     rax, 2                       ; step
        cmp     word [rax], '\'              ; compare
        jne     @b
    .ExitSearch:                             ; done
        ;
        mov     rdx, rax
        sub     rax, [CurPath.Buffer]        ; Calc new size
        cmp     rax, 14                      ; NT path used ('\??\X:\')
        jbe     @f                           ; if not disk root
        mov     [rdx], word 0                ; add zeroterm (Special case. Not normally used in UNICODE_STRING)
        jmp     .exit
        @@:
        mov     [rdx+2], word 0              ; if disk root leave '\'
        add     rax, 2                       ; and write zero after it
        .exit:
        mov     [CurPath.Length], ax         ; save new path lenght
        ret
    endp


section '.idata' import data readable writeable
library kernel32,'KERNEL32.DLL',\
            user32,  'user32.dll'
    include 'api/KERNEL32.inc'
    include 'api/USER32.inc'    
Post 25 Feb 2023, 19:40
View user's profile Send private message Reply with quote
revolution
When all else fails, read the source


Joined: 24 Aug 2004
Posts: 20360
Location: In your JS exploiting you and your system
revolution 25 Feb 2023, 19:52
Please be clear about what you are optimising for.

Size, ease of code comprehension, speed, something else? Then make a test suite, or measurement system, to inform you about how the code is performing.

But most importantly, there is likely no point in testing isolated special case code. Test within the final working app. Use the final app to guide your optimisation efforts towards the right places.
Post 25 Feb 2023, 19:52
View user's profile Send private message Visit poster's website Reply with quote
AE



Joined: 07 Apr 2022
Posts: 70
AE 25 Feb 2023, 19:53
revolution wrote:
Please be clear about what you are optimising for

Speed.
Post 25 Feb 2023, 19:53
View user's profile Send private message Reply with quote
Furs



Joined: 04 Mar 2016
Posts: 2521
Furs 25 Feb 2023, 19:54
glibc is overly complicated. wcsrchr can be implemented with aligned loads by just starting with truncated aligned address (which is guaranteed to be on the same page as the first character in the input, so it can't cause a page fault by accessing stuff "before" the string), and getting rid of the low bits from the truncation. You don't need the "simple search" at all then.

e.g. if r8 is your buffer, start with something like:
Code:
    mov ecx, r8d
    and r8, -16  ; truncate align to 16 bytes
    and ecx, 15  ; how many bytes we truncated

    movdqa xmm0, [r8]
    pcmpeqw xmm0, xmm1
    pmovmskb eax, xmm0
    shr eax, cl  ; get rid of the low bits from the "truncation"
    test eax, eax
    jnz found    ; make sure at found you add "ecx" to the result after bsr, if you offset it from r8

    ; here do aligned 16-byte-at-a-time loop, can even go beyond the end of the string safely
    ; only thing is at the very end after calculating the total offset, cap it to the input's length
    ; this will ignore if something matches past the end of the string    
You can use the same trick with NUL terminated strings, just do two pcmpeqw, one that also checks for NUL chars, and combine them with pandn.

BTW don't use sar unless input is signed integer. Lengths aren't, use shr. Make it a habit, because for example if in my code above you used sar by habit, it could end up a disaster.
Post 25 Feb 2023, 19:54
View user's profile Send private message Reply with quote
AE



Joined: 07 Apr 2022
Posts: 70
AE 26 Feb 2023, 09:40
Furs wrote:
Lengths aren't, use shr

You are right. Thx!

Furs wrote:
e.g. if r8 is your buffer, start with something like:

As far as I understand, this trick is designed to check unaligned string tails, right?
Maybe I didn't quite get the point, but it doesn't work quite right at the last step.
For example, let the path be '\??\X:\1\2'
Code:
; Current path: '\??\X:\1\2',0 (it can be null terminated or not it doesn't matter)
; r8 = rcx = string end      ^
; 
; '\??\X:\1\2'
;          ^     after 'and r8, -16' points here
;
; 'and ecx, 15'  ecx = 4
;
;  \     2     RestMemGarbage...
; ┌──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┐
; │5C│00│32│00│XX│XX│XX│XX│XX│XX│XX│XX│XX│XX│XX│XX│ XMM0
; └──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┘
;  \     \     \     \     \     \     \     \   
; ┌──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┐
; │5C│00│5C│00│5C│00│5C│00│5C│00│5C│00│5C│00│5C│00│ XMM1
; └──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┘
;                                                
; ┌──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┐
; │FF│FF│00│00│??│??│??│??│??│??│??│??│??│??│??│??│ XMM0    after 'pcmpeqw xmm0, xmm1'
; └──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┘
;
; 0000 0000 0000 0000   0000 0000 0000 0011         EAX after 'pmovmskb eax, xmm0' (eax=3)
;
; 0000 0000 0000 0000   0000 0000 0000 0000         EAX after 'shr eax, cl' (eax=0) (shr 3, 4) <<< ?
;
; Result = no match    

And if you mean check from the beginning of the buffer, then it makes no sense, the search should be from the end to the beginning.

PS
Also I can simplify things, given that the minimum possible size of the path is 16 otherwise we never get procedure call.


Last edited by AE on 26 Feb 2023, 11:03; edited 2 times in total
Post 26 Feb 2023, 09:40
View user's profile Send private message Reply with quote
bitRAKE



Joined: 21 Jul 2003
Posts: 4046
Location: vpcmpistri
bitRAKE 26 Feb 2023, 10:20
Could process on cacheline boundaries to maximize memory throughput:
Code:
; I design from the inner-loop outward
.search:
        .UNROLL := 8

        repeat .UNROLL
                movdqa xmm#%, xmm0 ; load patterns
        end repeat
        repeat .UNROLL
                pcmpeqw xmm#%, [rsi-16*%]
        end repeat
        iterate REG, eax,edx,ecx,ebx,r8d,r9d,r10d,r11d
                if % > .UNROLL
                        break
                end if
                pmovmskb REG, xmm#%
        end iterate
        iterate REG, eax,edx,ecx,ebx,r8d,r9d,r10d,r11d
                if % > .UNROLL
                        break
                end if
                test REG, REG
                jnz .NZ#% ; have a match
        end iterate
        sub rsi, 16 * .UNROLL
        cmp rsi, rdi
        jnc .search

        ; handle string head    
... that's a lot more complex though (think about the prolog/epilog needed), and assumes file names are probably short - the inner loop rarely loops.

Working from the end of the string might be faster if you were processing an array of strings and didn't want to pollute the data cache. Usually, something is done with the string and it needs to be in cache.

Note: in your present code you need to preserve RBX
Post 26 Feb 2023, 10:20
View user's profile Send private message Visit poster's website Reply with quote
bitRAKE



Joined: 21 Jul 2003
Posts: 4046
Location: vpcmpistri
bitRAKE 26 Feb 2023, 11:52
Small code could look something like:
Code:
        virtual at RDX
                .ustr UNICODE_STRING
        end virtual

        movzx eax,[.ustr.Length]
        mov rcx,[.ustr.Buffer]
        jrcxz .empty
        xchg rax,rcx
        shr ecx,1 ; characters, USC2
        jnz .search
.empty:
        retn

.search:
        cmp word [rax+(rcx-1)*2],'\'
        loopnz .search
        jrcxz .not_found

        ; logic to filter network paths, root drives

.not_found:
        mov word [rax+rcx*2],0 ; terminate for stringz use
        movzx [.ustr.Length],cx ; update length
        retn    
... and still work backward. REPNZ SCASW is also possible with a little overhead.

_________________
¯\(°_o)/¯ “languages are not safe - uses can be” Bjarne Stroustrup
Post 26 Feb 2023, 11:52
View user's profile Send private message Visit poster's website Reply with quote
Furs



Joined: 04 Mar 2016
Posts: 2521
Furs 26 Feb 2023, 13:34
AE wrote:
And if you mean check from the beginning of the buffer, then it makes no sense, the search should be from the end to the beginning.
Yeah, I meant from the beginning, sorry, I had wcsrchr in mind, which needs that (because it's NUL terminated, so you don't know the end of the string).

You can do similar trick by starting from the end though. But pick the last character's address in the string, and then truncate/align that address, which ensures your aligned read will never page fault because it's guaranteed to be on the same page as that character, and contain the last character in it, somewhere.

Then discard what's beyond the string (or before it) using shift or ANDing or similar.
Post 26 Feb 2023, 13:34
View user's profile Send private message Reply with quote
AE



Joined: 07 Apr 2022
Posts: 70
AE 26 Feb 2023, 13:58
So far I get this

Code:
format PE64 GUI 6.0
entry start
include 'win64w.inc'
section '.data' data readable writeable

    struct UNICODE_STRING
        Length             dw ?
        MaximumLength      dw ?
                           dd ?
        Buffer             dq ?
    ends
    SlashPat    dq ?,?  ; 128 bit mask for SSE2 search
    ; Path        du '\??\X:\1\2',0
    Path        du '\??\X:\Dir\SomeOtherDir\AndAnotherDir\AndAnotherDir\LoooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooongNameDir',0
    CurPath     UNICODE_STRING

section '.text' code readable executable
start:
    sub     rsp,8

    ; Create and save pattern '\' mask
    mov    eax, 005C005Ch                ; du '\\'
    movd   xmm0, eax                     ;
    pshufd xmm0, xmm0, 0                 ; fill whole xmm0 with DWORD value from xmm0
    movdqu dqword [SlashPat], xmm0       ; save xmm0 to var

    mov     [CurPath.Length], 304*2
    mov     [CurPath.Buffer], Path
    @@:
    fastcall StepBackward
    invoke  MessageBox, HWND_DESKTOP, addr Path, 0, MB_ICONERROR or MB_OK or MB_TOPMOST
    cmp     [CurPath.Length], 14
    ja      @b
    error_exit:
    invoke  ExitProcess,0



    proc StepBackward
        ; Used: CurPath.UNICODE_STRING,  SlashPat  dq ?,?  ; 128 bit mask for SSE2 search
        movzx   rcx, [CurPath.Length]       ; String lenght (in bytes)
        mov     r8,  [CurPath.Buffer]       ; String buffer
        mov     r10, r8                     ; Stop  addr
        add     r8,  rcx                    ; Start addr
        ;
        ; cmp     rcx, 16                     ; DEBUG minimal path size when procedure is called is 16
        ; jb      error_exit                  ;       if not - something goes wrong
        ; bt      ecx, 0                      ; DEBUG size of wcstring always N|2
        ; jc      error_exit                  ;       if not - something goes wrong
        ; bt      r8d, 0                      ; DEBUG addr must be N|2 (aligned in memory)
        ; jc      error_exit                  ;       if not - something goes wrong
        ;
        lea     rdx, [SlashPat]             ; pattern (pre-generated)
        movdqu  xmm1,  [rdx]                ; load it
        ;
        mov ecx, r8d
        and r8, -16                         ; 
        and ecx, 15                         ; unaligned tail
        ;
        test     ecx, ecx                   ; if tail exist check it first with simple word-by-word search
        je       .sse_search
        ; Simple search for unaligned tail
        lea     rax, [r8+rcx]
        @@:
        cmp     rax, r8                      ; cmp cur addr vs aligned addr
        jb      .sse_search                  ; if tail have no matches we continue with sse2 search
        sub     rax, 2                       ; step
        cmp     word [rax], '\'              ; compare
        jne     @b
        jmp     .EOS                         ; get match in tail
        ;
        .sse_search:
        sub      r8, 16                      ; step -16
        cmp      r8, r10
        jb       error_exit                  ; DEBUG  if not found - something goes wrong
        ; movdqu  xmm0, [r8]
        movdqa   xmm0, [r8]                  ; read aligned
        pcmpeqw  xmm0, xmm1
        pmovmskb eax, xmm0
        test     eax, eax
        jz       .sse_search
        ; sse2 match found
        bsr    eax, eax                      ; get last bit
        lea    rax, [r8+rax]                 ; calc addr
        and    eax, -2                       ; fix 'mid word' shift
        ;
        .EOS:                                ; done
        mov     rdx, rax
        sub     rax, [CurPath.Buffer]        ; Calc new size
        cmp     rax, 14                      ; NT path used ('\??\X:\')
        jbe     @f                           ; if not disk root
        mov     [rdx], word 0                ; add zeroterm (Special case. Not normally used in UNICODE_STRING)
        jmp     .skip
        @@:
        mov     [rdx+2], word 0              ; if disk root leave '\'
        add     rax, 2                       ; and write zero after it
        .skip:
        mov     [CurPath.Length], ax         ; save new path lenght
        .exit:
        ret
    endp


section '.idata' import data readable writeable
library kernel32,'KERNEL32.DLL',\
            user32,  'user32.dll'
    include 'api/KERNEL32.inc'
    include 'api/USER32.inc'    


A little bit shorter but "tail" still checked by-word way.
Post 26 Feb 2023, 13:58
View user's profile Send private message Reply with quote
bitRAKE



Joined: 21 Jul 2003
Posts: 4046
Location: vpcmpistri
bitRAKE 26 Feb 2023, 20:40
Another way to adjust R8 to the alignment needed:
Code:
        jmp     .tail_test
.tail:
        sub     r8, 2
        cmp     r8, r10
        jc      .tail_underflow ; not found
        cmp     word [r8], '\'
        jz      .EOS
.tail_test:
        test    r8, 15
        jnz     .tail
.sse_search:    
... do short strings not aligned do unexpected things in your code? Probably not allowed nor a valid use case?

_________________
¯\(°_o)/¯ “languages are not safe - uses can be” Bjarne Stroustrup
Post 26 Feb 2023, 20:40
View user's profile Send private message Visit poster's website Reply with quote
Furs



Joined: 04 Mar 2016
Posts: 2521
Furs 27 Feb 2023, 12:35
AE wrote:
So far I get this
Why not just:
Code:
    ; assume CurPath.Length is in ebx
    lea r8, [r8 + rbx - 2]  ; not rbx*2 since Length is in BYTES
    mov edx, r8d
    and r8, -16  ; truncate align to 16 bytes
    and edx, 15  ; how many bytes we truncated
    mov ecx, 31
    sub ecx, edx

    movdqa xmm0, [r8]
    pcmpeqw xmm0, xmm1
    pmovmskb eax, xmm0
    shl eax, cl  ; get rid of the high bits beyond end of string
    test eax, eax
    jnz found_at_end

    ; here do aligned 16-byte-at-a-time loop
    ...


  found_at_end:
    ; assume CurPath.Length is in ebx
    bsr eax, eax
    sub ebx, 32+1          ; adjust for 'eax' so that highest bit in eax is -2 from end of string, and others are further negative (+1 due to wide chars' bsr getting the "high" bit of the 2-bit pairs)
    add ebx, eax           ; this gets char position in string
    js not_found           ; this means it's before the string ("negative" char pos), so ignore

    ; ebx = char pos in string in bytes    
There may be some bugs with respect to it being wide chars, I haven't thought much, but the idea is this.
Post 27 Feb 2023, 12:35
View user's profile Send private message Reply with quote
bitRAKE



Joined: 21 Jul 2003
Posts: 4046
Location: vpcmpistri
bitRAKE 28 Feb 2023, 03:56
Can the input string address be MOD 8 and only 3 characters long?
Code:
        align 16
        rb 8 ; undefined bytes
        label ustr:256
        du 'C:\'
        ustr.bytes = $ - ustr
        rb sizeof ustr - ustr.bytes

        CurPath UNICODE_STRING \
                Length: ustr.bytes,\
                MaximumLength: sizeof ustr,\
                Buffer: ustr    
... if this is possible then tail processing gets more complex because the bytes before ustr are undefined. IIRC, Windows allocates UNICODE_STRING from a pool and allows the pointer to be MOD 8. If the pointer is forced to be MOD 16 then this is not possible. Maybe it's only allowed to be MOD 8 on 32-bit windows?

Obviously, if we'd like to support MOD 2 string pointers then tail processing is complicated when the string is short. I realize this is a corner case: suggesting that the short string lacks a '\' and that the undefined bytes before the pointer contains a '\'. Very Happy

_________________
¯\(°_o)/¯ “languages are not safe - uses can be” Bjarne Stroustrup
Post 28 Feb 2023, 03:56
View user's profile Send private message Visit poster's website Reply with quote
DimonSoft



Joined: 03 Mar 2010
Posts: 1228
Location: Belarus
DimonSoft 28 Feb 2023, 07:04
I wonder how much will one lose due to the complex setup and larger code for the SSE implementation as compared to using simple string instructions based implementation. I mean, paths and strings in general are not quite large for most use cases (and filename part plus [back]slash are WAY shorter).

And then having a tiny piece of code lets one have better utilization of caches, including prefetchers, decoded ICaches, etc. REP+instruction take around two or three bytes and no branching instructions (with target addresses that need to pass through the addressing mechanisms) required. Will even 2–3 SSE instructions with all the stuff around them be able to fit into a small prefetcher?

REPs “inform” the processor about the intention of the code (instead of performing tricky stuff on registers that are too large for a single string element or, even worse, using XMMs just for style points with no tricky stuff), so, as their implementation is up to the processor and its internals, can possibly be improved in performance in the future without any changes to the code.

And then one has to unwind the tricks and quirks to get the actual result. Which also takes time and space. Looks like another side of bloatware.

I don’t ask if it’s worth the saved microseconds, I ask if the microseconds are really saved, not wasted.
Post 28 Feb 2023, 07:04
View user's profile Send private message Visit poster's website Reply with quote
Furs



Joined: 04 Mar 2016
Posts: 2521
Furs 28 Feb 2023, 14:17
bitRAKE wrote:
Can the input string address be MOD 8 and only 3 characters long?
Code:
        align 16
        rb 8 ; undefined bytes
        label ustr:256
        du 'C:\'
        ustr.bytes = $ - ustr
        rb sizeof ustr - ustr.bytes

        CurPath UNICODE_STRING \
                Length: ustr.bytes,\
                MaximumLength: sizeof ustr,\
                Buffer: ustr    
... if this is possible then tail processing gets more complex because the bytes before ustr are undefined. IIRC, Windows allocates UNICODE_STRING from a pool and allows the pointer to be MOD 8. If the pointer is forced to be MOD 16 then this is not possible. Maybe it's only allowed to be MOD 8 on 32-bit windows?

Obviously, if we'd like to support MOD 2 string pointers then tail processing is complicated when the string is short. I realize this is a corner case: suggesting that the short string lacks a '\' and that the undefined bytes before the pointer contains a '\'. Very Happy
It doesn't matter if they're aligned to less. The only requirement is that they're aligned to 2 bytes, and that's because of pcmpeqw. I'm assuming your wide strings are aligned to 2 bytes though.

When you align an address to 16 bytes, you guarantee they're on the same page as the respective address you want. If that address doesn't fault on access, it won't fault when truncated. That's because all CPU accesses (in this case 16 bytes) are smaller and a common divisor of the page size (4096 or something higher).

In my case, yes you will have undefined/garbage bytes, that's why the shift is there to remove bytes beyond end of string. And why you check for "negative" pos, in case you have some garbage bytes that match the character before the string.

This trick doesn't require much setup either, it's really super convenient for SIMD manipulations of long data. Wink


If you look at my code, it takes the address of the last character in the string. This character is guaranteed to be on valid page (else it would crash even with "simple" loop without SIMD!). If you truncate this, the resulting access is guaranteed to be on the same page, no matter what.

The only niche case that will break is if you set debugger hardware breakpoints. Since a SIMD will potentially access more memory (but the same pages, though). I don't think this is worth worrying about.

And well, the wide string has to be aligned to 2 bytes (have lowest bit set to 0).
Post 28 Feb 2023, 14:17
View user's profile Send private message Reply with quote
AE



Joined: 07 Apr 2022
Posts: 70
AE 28 Feb 2023, 18:28
DimonSoft wrote:
I don’t ask if it’s worth the saved microseconds, I ask if the microseconds are really saved, not wasted.
And that's where it's worth taking a broader view.
This is not just a special case for a narrow application, but essentially an improved 'wcsrchr' function algorithm. And this can already save tens of seconds in some cases.


Some speed tests:
Code:
Time: 400.9993000000   ; PathRemoveFileSpec
Time: 229.0009000000   ; Windows native (per-word)
Time: 52.0004000000    ; +sse2 v1
Time: 60.9997000000    ; +sse2 v2    


For some reason my first version is a little bit faster then last one Smile

Haven't tried the suggestions from the last posts yet because I've been away for a few days. And thanks to everyone who helps!
Post 28 Feb 2023, 18:28
View user's profile Send private message Reply with quote
AE



Joined: 07 Apr 2022
Posts: 70
AE 28 Feb 2023, 19:48
Furs wrote:
Why not just
Code:
Time: 411.0540000000    ; PathRemoveFileSpec (raw call so in fact it is even longer)
Time: 231.9472000000    ; Windows native (per-word)
Time: 54.9989000000     ; +sse2 v1
Time: 53.0015000000     ; +sse2 v2  
Time: 46.9982000000     ; 'Why not just' (unoptimized)
    
Well, I admit, pure sse2 variant is a little bit faster.

Need to test it in different cases for possible errors and will use it as a main.
Thx!
Post 28 Feb 2023, 19:48
View user's profile Send private message Reply with quote
Furs



Joined: 04 Mar 2016
Posts: 2521
Furs 01 Mar 2023, 14:23
Only thing you need to take care of is, again, negative positions on the "tail" of the search (i.e. after the 16-byte-at-a-time loop), treat them as "not found". It's possible that it ends up there since string isn't necessarily 16-byte aligned.
Post 01 Mar 2023, 14:23
View user's profile Send private message Reply with quote
DimonSoft



Joined: 03 Mar 2010
Posts: 1228
Location: Belarus
DimonSoft 01 Mar 2023, 20:16
AE wrote:
Some speed tests:
Code:
Time: 400.9993000000   ; PathRemoveFileSpec
Time: 229.0009000000   ; Windows native (per-word)
Time: 52.0004000000    ; +sse2 v1
Time: 60.9997000000    ; +sse2 v2    

What are the units and what is the source data used for measurements? How does it relate to real-world applications?
Post 01 Mar 2023, 20:16
View user's profile Send private message Visit poster's website Reply with quote
AE



Joined: 07 Apr 2022
Posts: 70
AE 02 Mar 2023, 06:38
Furs wrote:
Code:
    ; assume CurPath.Length is in ebx
    lea r8, [r8 + rbx - 2]  ; not rbx*2 since Length is in BYTES
   

  ;found_at_end:
    ;bsr eax, eax
    ;sub ebx, 32+1          ; adjust for 'eax' so that highest bit in eax is -2 from end of string, and others are further negative (+1 due to wide chars' bsr getting the "high" bit of the 2-bit pairs)
    ;add ebx, eax           ; this gets char position in string
    ;js not_found              


Why '-2' ?
Valid start addr will be 'add r8, rbx'.

And then 'eax' correction must be? 32?
Also maybe instead '+1' better use 'and -2'?
Post 02 Mar 2023, 06:38
View user's profile Send private message Reply with quote
Furs



Joined: 04 Mar 2016
Posts: 2521
Furs 02 Mar 2023, 14:28
AE wrote:
Furs wrote:
Code:
    ; assume CurPath.Length is in ebx
    lea r8, [r8 + rbx - 2]  ; not rbx*2 since Length is in BYTES
   

  ;found_at_end:
    ;bsr eax, eax
    ;sub ebx, 32+1          ; adjust for 'eax' so that highest bit in eax is -2 from end of string, and others are further negative (+1 due to wide chars' bsr getting the "high" bit of the 2-bit pairs)
    ;add ebx, eax           ; this gets char position in string
    ;js not_found              


Why '-2' ?
Valid start addr will be 'add r8, rbx'.

And then 'eax' correction must be? 32?
Also maybe instead '+1' better use 'and -2'?
EDIT: Nevermind, my code was correct, because it shifted to the up of the 32-bit register.

Anyway here's an explanation with ASCII art:

Code:
The underscores are "garbage bytes" or "undefined" bytes around the string that you must ignore.

This is a 16-byte block of memory containing wide chars:

_b_abc_b

so string = abc
string length = 6  (3 chars × 2 bytes each)
char to find = b

r8 = address of 'a' (start of string).
rbx = length of string (6)


So, lea r8, [r8 + rbx - 2], gets the address of c, the last char in the string. (&a + 6 - 2 = &a + 4 = c's address)
You truncate this to _b_abc_b in xmm register, then create a mask that will look like (in little endian):

__    **    __
1100001100001100
 b _ c b a _ b _

Note the underscores above the masked b positions. Those are "fake" because they're outside the string.
Only the ** above the middle 1s (in string abc) is valid.

Next you shift this to the left by 4+16 bits (16 to put it into the upper 16-bits of the 32-bit register) to discard bits beyond the string:

  **    __
0011000011000000 [16 zeros]
 c b a
Now you use bsr on these 32 bits.
Note how bsr will give you the UPPER BIT of the 2-byte char.
Hence the +1 (we subtract 32 from it, so +1 means subtract another one).
We want the byte position of the lowest bit, not highest!

However the bit position here will be given as "offset" from 32.
i.e. bit position 31 = highest bit = last char
bit position 29 = second to last char
bsr will give you 29.

But in terms of position in the string:

last char corresponds to string length - 2
second to last char corresponds to string length - 4

You need to convert this from 29 to "string length - 4"

So you must make sure 31 becomes -2 and 29 becomes -4, then add this to your string length.
31 - (32+1) = -2

Let's see it in practice:

string length - (32+1) + bsr result
6 - (32+1) + 29 = 2

which means it found 'b' at BYTE position 2 in the string.
...which is correct! our string was abc, and in terms of positions, a:0, b:2, c:4


Now what if our string was acd instead? Then mask would look like:

__          __
1100000000001100
 b _ d c a _ b _

shifted:

        __
0000000011000000 [16 zeros]
 d c a

bsr will now give us 23.

string length - (32+1) + bsr result
6 - (32+1) + 23 = -4

Result is negative, because it's "garbage/undefined" bytes before the string.
Which is actually correct position, but it must be discarded because it's before the string.
It has sign bit set, so this jumps to "not_found", which is fine.    
Post 02 Mar 2023, 14:28
View user's profile Send private message Reply with quote
Display posts from previous:
Post new topic Reply to topic

Jump to:  
Goto page 1, 2  Next

< Last Thread | Next Thread >
Forum Rules:
You cannot post new topics in this forum
You cannot reply to topics in this forum
You cannot edit your posts in this forum
You cannot delete your posts in this forum
You cannot vote in polls in this forum
You cannot attach files in this forum
You can download files in this forum


Copyright © 1999-2024, Tomasz Grysztar. Also on GitHub, YouTube.

Website powered by rwasa.