flat assembler
Message board for the users of flat assembler.

Index > Windows > 64 bit not understood sub rsp,8 !

Goto page Previous  1, 2, 3, 4, 5, 6, 7, 8, 9
Author
Thread Post new topic Reply to topic
I



Joined: 19 May 2022
Posts: 58
I 19 May 2022, 16:02
revolution wrote:
But why do I have to restore the stack to get an exception delivered, can't I just set RSP to a different memory region? In later versions of Windows this isn't possible; the system checks whether RSP points to the memory region designated as the stack. This is a recent change in Windows that was made to "fix" some security problems. Older versions of Windows do allow it.
Is this just for exception handling?


For wonky RSP and W10 this works for me, might work on some other W10 versions too, idk. Windows can give the error message in that case.
Code:
format PE64 GUI 4.0
entry start

include 'win64a.inc'

; FLASHWINFO as laid out for 64-bit Windows (total size 0x20 bytes).
; The Win32 structure has 32-bit cbSize/uCount fields; they are declared
; as dwords here with the padding spelled out, instead of being declared
; as qwords and relying on the upper halves overlapping the next field.
; Field offsets of hwnd (8) and the overall size are unchanged.
struct FLASHWINFO
    cbsize       dd 0x20                 ; UINT  cbSize: size of this structure
    _pad0        dd 0                    ; padding so hwnd is 8-byte aligned
    hwnd         dq ?                    ; HWND  hwnd: filled in at run time
    dwflags      dd 15                   ; DWORD dwFlags: 3 (FLASHW_ALL) + 12 (FLASHW_TIMERNOFG)
    uCount       dd 2                    ; UINT  uCount
    dwTimeout    dd 0                    ; DWORD dwTimeout: 0 selects the default rate
    _pad1        dd 0                    ; tail padding up to 0x20 bytes
ends

section '.text' code readable executable

  ; Program entry point (Microsoft x64 ABI; rsp is 16n+8 on entry).
  ;
  ; 1. Calls each imported __win32kstub_* entry, ORs the returned value
  ;    with 1000h and stores it; those values are later loaded into eax
  ;    right before a raw `syscall`.
  ;    NOTE(review): presumably the stub returns a service index and 1000h
  ;    selects the win32k service table - confirm for your Windows build.
  ; 2. Registers the window class and creates the main window.
  ; 3. Deliberately misaligns rsp by 3 bytes and runs the message loop via
  ;    raw syscalls, saving rsp into C1 on every iteration so TimerProc
  ;    can print it.
  ; 4. Restores rsp and exits through ExitProcess (never returns).
  start:
        ; 12 CreateWindowEx arguments need 12*8 = 96 bytes at [rsp..rsp+5Fh];
        ; the 13th qword keeps rsp 16-byte aligned. (Reserving only 5*8
        ; let the argument stores overwrite the return address and the
        ; caller's frame.)
        sub      rsp,8*13

        call     [__win32kstub_NtUserFlashWindowEx]
        or       eax,1000h
        mov      [Flasher],eax

        call     [__win32kstub_NtUserGetMessage]
        or       eax,1000h
        mov      [GetMess],eax

        call     [__win32kstub_NtUserTranslateMessage]
        or       eax,1000h
        mov      [TranMess],eax

        call     [__win32kstub_NtUserDispatchMessage]
        or       eax,1000h
        mov      [DisMess],eax

        call     [__win32kstub_NtUserGetDC]
        or       eax,1000h
        mov      [GDC],eax

        call     [__win32kstub_NtUserSetTimer]
        or       eax,1000h
        mov      [Timer],eax                     ; resolved but not used below

        xor      ecx,ecx                         ; NULL -> handle of this module
        call     [GetModuleHandle]
        mov      [wc.hInstance],rax

        xor      ecx,ecx
        mov      edx,IDI_APPLICATION
        call     [LoadIcon]
        mov      [wc.hIcon],rax
        mov      [wc.hIconSm],rax

        xor      ecx,ecx
        mov      edx,IDC_ARROW
        call     [LoadCursor]
        mov      [wc.hCursor],rax

        mov      rcx,wc
        call     [RegisterClassEx]

        ; CreateWindowEx(0,_class,_Title,style,20,20,600,50,0,0,hInstance,0)
        xor      ecx,ecx                         ; dwExStyle
        mov      rdx,_class
        mov      r8,_Title
        mov      r9d,WS_VISIBLE+WS_DLGFRAME+WS_SYSMENU
        mov      rax,[wc.hInstance]
        mov      qword[rsp+20h],20               ; X
        mov      qword[rsp+28h],20               ; Y
        mov      qword[rsp+30h],600              ; nWidth
        mov      qword[rsp+38h],50               ; nHeight
        mov      qword[rsp+40h],0                ; hWndParent
        mov      qword[rsp+48h],0                ; hMenu
        mov      qword[rsp+50h],rax              ; hInstance
        mov      qword[rsp+58h],0                ; lpParam
        call     [CreateWindowEx]
        mov      [Fw.hwnd],rax
        mov      [mainhwnd],eax                  ; low 32 bits of the handle

        mov      dword[rsp],0x11111111
        mov      dword[rsp+4],0x11111111         ; Test Number, read back by TimerProc

        sub      rsp,3                           ; RSP unaligned! (the experiment)

;        xor     rsp,rsp                         ; Causes access fault!
                                                 ; Sometimes app continues depending on call

        mov      ecx,[mainhwnd]                  ; GetDC(mainhwnd) via raw syscall
        mov      r10,rcx                         ; first argument goes in r10 for syscall
        mov      eax,[GDC]
        syscall
        mov      [hDC],eax

        ; SetTimer(NULL,0,5000,TimerProc) through the documented API.
        ; The raw NtUserSetTimer syscall that used to follow this call was
        ; removed: rcx/rdx/r8/r9 are volatile across the call and r10 was
        ; only XOR-ed with rcx, so the kernel received undefined arguments.
        xor      ecx,ecx
        xor      edx,edx
        mov      r8d,5000
        lea      r9,[TimerProc]
        call     [SetTimer]

msg_loop:
        mov      [C1],rsp                        ; Each message save RSP for output
        mov      rcx,msg                         ; GetMessage(msg,0,0,0)
        xor      edx,edx
        xor      r8,r8
        xor      r9,r9
        mov      eax,[GetMess]
        mov      r10,rcx
        syscall

        cmp      eax,1
        jb       end_loop                        ; eax = 0: leave the loop
        jne      msg_loop                        ; eax neither 0 nor 1: skip this message

        mov      rcx,msg                         ; TranslateMessage(msg)
        mov      r10,rcx
        mov      eax,[TranMess]
        syscall

        mov      rcx,msg                         ; DispatchMessage(msg)
        mov      r10,rcx
        mov      eax,[DisMess]
        syscall

        jmp      msg_loop
end_loop:

        add      rsp,3                           ; undo the deliberate misalignment
        xor      ecx,ecx                         ; exit code 0
        call     [ExitProcess]



; Timer callback registered with SetTimer in start.
; Beeps, formats the message-loop rsp (C1), the qword found at C1+3 and the
; current rsp into wsbuff, draws the text on hDC and flashes the window.
;
; Stack: 32 bytes of shadow space plus one slot for the 5th argument are
; needed at [rsp..rsp+27h]; 8*6 bytes are reserved and rsp is then rounded
; down to a 16-byte boundary before any API call. (A 36-byte frame made the
; qword store to [rsp+20h] spill over the 8 bytes above the reservation.)
; NOTE(review): relies on the proc macro's epilogue restoring rsp from rbp,
; as the original code already did (it never re-added to rsp) - confirm
; against your PROC64.INC.
proc TimerProc                                   ; Gets it's own stack
        sub      rsp,8 * 6                       ; shadow space + 5th argument + pad
        and      rsp,-16                         ; force ABI alignment for the calls below

        mov      ecx,5000                        ; frequency
        mov      edx,100                         ; duration
        call     [Beep]                          ; Time is ticking!

        mov      rcx,wsbuff                      ; wsprintf(wsbuff,ws,C1,[C1+3],rsp)
        mov      rdx,ws
        mov      r8,[C1]                         ; Rsp from msg loop
        mov      r9,[r8+3]                       ; qword at C1+3 (start stores its test number 3 bytes above the loop rsp)
        mov      [rsp+20h],rsp                   ; Rsp from here (after alignment)
        call     [wsprintf]                      ; eax = return value, passed on to TextOut

        mov      ecx,[hDC]                       ; TextOut(hDC,5,5,wsbuff,eax)
        mov      edx,5
        mov      r8,rdx
        mov      r9,wsbuff
        mov      [rsp+20h],eax
        call     [TextOut]

        mov      rcx,Fw
        call     [FlashWindowEx]

        ret
endp

; Window procedure for the 'RSP' class (Microsoft x64 ABI).
; In:  rcx = hwnd, edx = message, r8 = wParam, r9 = lParam
; Out: rax = 0 for WM_DESTROY, otherwise whatever DefWindowProc returns.
; WM_DESTROY posts the quit message; every other message is forwarded to
; DefWindowProc with the argument registers untouched.
proc WindowProc
        sub      rsp,8 * 4                       ; shadow space for the calls below
        cmp      edx,WM_DESTROY                  ; message is a 32-bit UINT: the upper
                                                 ; half of rdx is undefined, compare edx
        je       wmdestroy
  defwndproc:
        call     [DefWindowProc]                 ; rcx/rdx/r8/r9 still hold the arguments
        jmp      finish
  wmdestroy:
        xor      ecx,ecx                         ; exit code 0
        call     [PostQuitMessage]
        xor      eax,eax                         ; return 0 (writing eax zeroes rax)
  finish:
        ret
endp

  ; Read-only strings (they live in the '.text' section).
  ; _class: window class name, _Title: window caption,
  ; ws: wsprintf format used by TimerProc (three values).
  _class         db 'RSP',0
  _Title         db 'The time bomb is ticking!',0
  ws             db 'RSP %016IX - VAL %016IX : RSP2 %016IX',0

; Writable program state. The section is also flagged 'shareable'.
section '.data' data readable writeable shareable
  ; rsp saved at the top of every message-loop iteration
  C1             dq ?
  ; values loaded into eax before the raw syscalls in start
  ; (stub result OR 1000h)
  GetMess        dd ?
  TranMess       dd ?
  DisMess        dd ?
  Timer          dd ?
  ; low 32 bits of the main window handle
  mainhwnd       dd ?
  GDC            dd ?
  ; device context returned by the GetDC syscall, used by TimerProc
  hDC            dd ?
  ; ToT is not referenced by the visible code
  ToT            dd ?
  ; Flasher is stored in start but not read by the visible code
  Flasher        dd ?
  ; window class: WindowProc as the procedure, _class as the class name
  wc             WNDCLASSEX sizeof.WNDCLASSEX,0,WindowProc,0,0,NULL,NULL,NULL,COLOR_BTNFACE+1,NULL,_class,NULL
  ; message buffer handed to the GetMessage/TranslateMessage/DispatchMessage syscalls
  msg            MSG
  ; FlashWindowEx parameters; Fw.hwnd is filled in after CreateWindowEx
  Fw             FLASHWINFO
  ; output buffer for wsprintf (100h bytes)
  wsbuff         rb 100h

; Import table: three user-mode DLLs plus win32k.sys, from which the
; __win32kstub_* entries called at the top of start are imported.
section '.idata' import data readable writeable

  library gdi32,'GDI32.DLL',\
          kernel32,'KERNEL32.DLL',\
          user32,'USER32.DLL',\
          win32k,'win32k.sys'

  ; Text drawing used by TimerProc
  import  gdi32,\
          TextOut,'TextOutA'

  import  kernel32,\
          Beep,'Beep',\
          ExitProcess,'ExitProcess',\
          GetModuleHandle,'GetModuleHandleA'

  ; ANSI ('A') variants of the windowing APIs
  import  user32,\
          CreateWindowEx,'CreateWindowExA',\
          DefWindowProc,'DefWindowProcA',\
          FlashWindowEx,'FlashWindowEx',\
          LoadCursor,'LoadCursorA',\
          LoadIcon,'LoadIconA',\
          PostQuitMessage,'PostQuitMessage',\
          RegisterClassEx,'RegisterClassExA',\
          SetTimer,'SetTimer',\
          wsprintf,'wsprintfA'

  ; Stubs whose return values are turned into syscall numbers in start.
  ; __win32kstub_NtUserTextOut is imported but never called by the visible code.
  ; NOTE(review): whether win32k.sys exports all of these names depends on
  ; the Windows build - confirm before relying on this table.
  import  win32k,\
          __win32kstub_NtUserSetTimer,'__win32kstub_NtUserSetTimer',\
          __win32kstub_NtUserGetMessage,'__win32kstub_NtUserGetMessage',\
          __win32kstub_NtUserTranslateMessage,'__win32kstub_NtUserTranslateMessage',\
          __win32kstub_NtUserDispatchMessage,'__win32kstub_NtUserDispatchMessage',\
          __win32kstub_NtUserGetDC,'__win32kstub_NtUserGetDC',\
          __win32kstub_NtUserTextOut,'__win32kstub_NtUserTextOut',\
          __win32kstub_NtUserFlashWindowEx,'__win32kstub_NtUserFlashWindowEx'

    


Sorry if it's a bit rough.
Post 19 May 2022, 16:02
View user's profile Send private message Reply with quote
revolution
When all else fails, read the source


Joined: 24 Aug 2004
Posts: 20460
Location: In your JS exploiting you and your system
revolution 19 May 2022, 16:05
You can't call the system APIs with an unaligned stack. The system uses the SSE aligned data memory access instructions.

But you can use an unaligned stack within your own code.
Post 19 May 2022, 16:05
View user's profile Send private message Visit poster's website Reply with quote
macomics



Joined: 26 Jan 2021
Posts: 1043
Location: Russia
macomics 19 May 2022, 18:04
Code:
SYSCALL: Operation
IF (CS.L ≠ 1 ) or (IA32_EFER.LMA ≠ 1) or (IA32_EFER.SCE ≠ 1) THEN #UD; FI;
                ; Not in 64-Bit Mode or SYSCALL/SYSRET not enabled in IA32_EFER
RCX := RIP;     ; Will contain address of next instruction
RIP := IA32_LSTAR;
R11 := RFLAGS;
RFLAGS := RFLAGS AND NOT(IA32_FMASK);
CS.Selector := IA32_STAR[47:32] AND FFFCH       ; Operating system provides CS; RPL forced to 0
                ; Set rest of CS to a fixed value
                ; With 4-KByte granularity, implies a 4-GByte limit
CS.Base := 0;   ; Flat segment
CS.Limit := FFFFFH;
CS.Type := 11;  ; Execute/read code, accessed
CS.S := 1;
CS.DPL := 0;
CS.P := 1;
CS.L := 1;      ; Entry is to 64-bit mode
CS.D := 0;      ; Required if CS.L = 1
CS.G := 1;      ; 4-KByte granularity

IF ShadowStackEnabled(CPL) THEN IA32_PL3_SSP := LA_adjust(SSP); FI;
                ; adjust so bits 63:N get the value of bit N–1, where N is the CPU’s maximum linear-address width
                ; With shadow stacks enabled the system call is supported from Ring 3 to Ring 0
                ; OS supporting Ring 0 to Ring 0 system calls or Ring 1/2 to ring 0 system call
                ; Must preserve the contents of IA32_PL3_SSP to avoid losing ring 3 state
CPL := 0;
IF ShadowStackEnabled(CPL) THEN SSP := 0; FI;
IF EndbranchEnabled(CPL) THEN
  IA32_S_CET.TRACKER = WAIT_FOR_ENDBRANCH;
  IA32_S_CET.SUPPRESS = 0;
FI;
SS.Selector := IA32_STAR[47:32] + 8;    ; SS just above CS
                ; Set rest of SS to a fixed value
                ; With 4-KByte granularity, implies a 4-GByte limit
SS.Base := 0;   ; Flat segment
SS.Limit := FFFFFH;
SS.Type := 3;   ; Read/write data, accessed
SS.S := 1;
SS.DPL := 0;
SS.P := 1;
SS.B := 1;      ; 32-bit stack segment
SS.G := 1;      ; 4-KByte granularity    
When using this instruction, the OS itself must save the user rsp, load rsp0, and restore the rsp value before sysret. The instruction itself does not use the stack, i.e., no checks occur at all (not even for alignment).

To prevent such errors, WinAPI system libraries exist. They eventually also execute the syscall command, but before that they try to prevent errors that may occur in ring-0 when using syscall directly at the ring-3 level.
Post 19 May 2022, 18:04
View user's profile Send private message Reply with quote
I



Joined: 19 May 2022
Posts: 58
I 20 May 2022, 01:59
@macomics cool and thanks for the reply, unfortunately most of it above my pay grade Very Happy

All I can do is some basic tests and try to learn. A little reading in Intel's SDM says each privilege level has its own stack(s)? and copies data if needed; alignment on our user stack doesn't seem to matter, other than maybe for performance, as long as it points to valid data.

Someone was asking about effect with context switching so the above code should give some context switching with wonky CPL3 stack, especially with the blocking GetMessage call.

While looking at making context switches happen quickly, to see if some data below rsp is trashed, I came across an interesting syscall, NtYieldExecution. It's not one of the user types, so unfortunately it cannot be looked up with a call to win32k.sys, but the Ntdll call jumps straight to the syscall; no parameters are used.
Code:
; Fragment: call NtYieldExecution with rsp deliberately off by one byte,
; then bring the total reservation to 1 + 39 = 40 bytes (32 + 8).
section '.text' code readable executable

  start:
        sub     rsp,1                                   ; wonky stack: rsp is misaligned for the next call
        call    [NtYieldExecution]
        sub     rsp,39                                  ; 1 + 39 = 40 bytes in total: make stack nice for needy APIs?
    


If we cut out the middleman Ntdll we can use the syscall direct with rsp==0 which I think confirms our stack isn't needed at all for this particular case.
Post 20 May 2022, 01:59
View user's profile Send private message Reply with quote
revolution
When all else fails, read the source


Joined: 24 Aug 2004
Posts: 20460
Location: In your JS exploiting you and your system
revolution 20 May 2022, 02:32
That is correct, your RSP value is not used by the kernel for anything except delivering you the exception context if you asked for it.

Also, the alignment is only needed because of an MS decision to use the MOVDQA, and not MOVDQU, instructions to copy the stack from your space when doing transfers into the kernel. This is purely an MS decision, the hardware can do it, but MS decided to ignore that.

There are some APIs for which you don't need an aligned stack, because they don't cross over to the kernel. But the list of those APIs is not reliable; it can change from version to version, so it is useless in most cases.
Post 20 May 2022, 02:32
View user's profile Send private message Visit poster's website Reply with quote
Display posts from previous:
Post new topic Reply to topic

Jump to:  
Goto page Previous  1, 2, 3, 4, 5, 6, 7, 8, 9

< Last Thread | Next Thread >
Forum Rules:
You cannot post new topics in this forum
You cannot reply to topics in this forum
You cannot edit your posts in this forum
You cannot delete your posts in this forum
You cannot vote in polls in this forum
You cannot attach files in this forum
You can download files in this forum


Copyright © 1999-2025, Tomasz Grysztar. Also on GitHub, YouTube.

Website powered by rwasa.