;pe 32bit GUI
format PE GUI 4.0
entry Start
include 'c:\fasmw17316\include\Win32a.inc'
include 'fasmAPI\madmat32.txt'
; Msg text
;   Shows `text` in a message box (MessageBox with owner = 0, caption = 0, type = 0).
;   Expands to an `invoke`, so it goes through the MessageBox import.
macro Msg text {
      invoke  MessageBox,0,text,0,0
      }
; ExitMsg text
;   Shows `text` in a message box captioned 'ExitToWin', then ends the process
;   with exit code 0. Code placed after an ExitMsg is therefore never executed.
macro ExitMsg text {
      invoke  MessageBox,0,text,'ExitToWin',0
      invoke  ExitProcess,0
      }
; mIncTxt t
;   Increments the two-character ASCII decimal counter stored at `t`
;   (tens digit at [t], ones digit at [t+1]).
;   When the ones digit passes "9" it is reset to "0" and the tens digit is
;   incremented. The tens digit itself is not wrapped, so the counter is only
;   meaningful for values 00..99.
;   The macro now really operates on its argument; it used to ignore `t`
;   and always modify `zahodov`.
;   Clobbers: flags.
macro mIncTxt t { local .no_carry
                  inc   byte [t+1]              ; ones digit
                  cmp   byte [t+1],"9"
                  jbe   .no_carry
                  mov   byte [t+1],"0"          ; wrap ones digit ...
                  inc   byte [t]                ; ... and carry into the tens digit
  .no_carry:
                  }
; xmval_ abvec,xm,a,op,b
;   Loads 16 bytes from [a] into xmm<xm> and 16 bytes from [b] into xmm0
;   (both with unaligned loads), then applies the packed-single operation
;   selected by `op` ('+', '-', '*' or '/'):  xmm<xm> = [a] op [b].
;   `abvec` becomes a symbolic alias (equ) for xmm<xm>.
;   xmm0 is the scratch register for [b], so xm = 0 would overwrite [a]
;   before the operation is applied.
;   If `op` is none of the four tokens, only the two loads are emitted.
macro xmval_ abvec,xm,a,op,b{
      abvec equ xmm#xm
      movups xmm#xm,dqword [a]
      movups xmm0,dqword [b]
      if op eq '+'
        addps  xmm#xm,xmm0
      else if op eq '-'
        subps  xmm#xm,xmm0
      else if op eq '*'
        mulps  xmm#xm,xmm0
      else if op eq '/'
        divps  xmm#xm,xmm0
      end if
      }
; xmvec4 abvec,xm,a
;   Tiny expression front-end for packed-single vector code.
;   `abvec` becomes a symbolic alias (equ) for xmm<xm>.
;   `a` is matched against these forms:
;       c+e  c*e  c-e  c/e   -> load [c] into xmm<xm>, then addps/mulps/subps/divps with e
;       c:                   -> load [c] into xmm<xm> only
;   `c` is always a memory label (it is wrapped in [ ] here).
;   `e` is pasted after `dqword` as-is, so the caller writes either a bracketed
;   memory operand ([linS]) or something that expands to a register (an alias
;   created by an earlier xmvec4).
;   The load instruction is not fixed: the symbolic constant `xmalgn` must be
;   defined by the caller beforehand (the callers in this file use
;   `xmalgn equ movaps` / `xmalgn equ movups`).
;   NOTE(review): a memory `e` is used directly as the operand of the arithmetic
;   instruction, independent of xmalgn - presumably it must be 16-byte aligned; confirm.
macro xmvec4  abvec,xm,a{
      abvec equ xmm#xm
      match c+e, a \{ ;display e
      xmalgn xmm#xm,dqword [c]
      ;movups xmm0,dqword [e]
      addps  xmm#xm,dqword e      \}
      match c*e, a \{
      xmalgn xmm#xm,dqword [c]
      ;movups xmm0,dqword [e]
      mulps  xmm#xm,dqword e      \}
      match c-e, a \{
      xmalgn xmm#xm,dqword [c]
     ; movups xmm0,dqword [e]
      subps  xmm#xm,dqword e      \}
      match c/e, a \{
      xmalgn xmm#xm,dqword [c]
      ;movups xmm0,dqword [e]
      divps  xmm#xm,dqword e      \}
      match c:, a \{
      xmalgn xmm#xm,dqword [c]    \}
      }
; Dot dst,src
;   Four-lane dot product of packed singles: dst = dst.x*src.x + dst.y*src.y +
;   dst.z*src.z + dst.w*src.w, with the sum replicated into every lane of dst.
;   All four lanes take part, so 3D callers need a zero in the 4th lane of
;   at least one operand.
;   Clobbers: dst only.
macro Dot dst,src {
      mulps   dst,src         ; per-lane products
      haddps  dst,dst         ; pairwise sums
      haddps  dst,dst         ; full sum in all four lanes
      }
; lenght a
;   Euclidean length of the packed-single vector in register `a`.
;   On exit lane 0 of `a` holds sqrt(dot(a,a)); the other lanes hold dot(a,a).
macro lenght a {
      mulps   a,a             ; squares of each lane
      haddps  a,a
      haddps  a,a             ; sum of squares in every lane
      sqrtss  a,a             ; square root in lane 0 only
      }
; clamp val,tmp,hi
;   Clamps the scalar single in lane 0 of `val` to the range [0, hi].
;   val - xmm register holding the value (result is returned in it)
;   tmp - scratch xmm register, overwritten
;   hi  - upper bound, anything `mov eax,...` accepts (e.g. the literal 1.0)
;   Clobbers: eax, tmp.
macro clamp val,tmp,hi {
      xorps  tmp,tmp          ; tmp = 0.0
      maxss  val,tmp          ; val = max(val, 0)
      mov    eax,hi
      movd   tmp,eax          ; tmp = hi
      minss  val,tmp          ; val = min(val, hi)
      }
; SferLineP r,lineS,lineE,c
;   Sphere / line-segment intersection test. Shows a message box when the
;   segment touches the sphere.
;   r      - label of a dword float; it is squared here before the comparison
;   lineS  - label of the segment start a (4 floats, 4th lane should be 0)
;   lineE  - label of the segment end   b (4 floats, 4th lane should be 0)
;   c      - label of the sphere centre p (4 floats, 4th lane should be 0)
;   Method: t = clamp(dot(p-a, b-a) / dot(a-b, a-b), 0, 1);
;           q = a + (b-a)*t;  hit when dot(q-p, q-p) <= r*r.
;   Clobbers: eax, xmm0-xmm5, flags (plus whatever the Msg call clobbers).
macro SferLineP r,lineS,lineE,c {
      ; ab = a - b, and its negation b - a
      movups   xmm1,dqword [lineS]      ; xmm1 = a
      movaps   xmm2,xmm1
      movups   xmm3,dqword [lineE]
      subps    xmm2,xmm3                ; xmm2 = ab
      xorps    xmm5,xmm5
      subps    xmm5,xmm2                ; xmm5 = -ab
      ; pa = p - a
      movups   xmm4,dqword [c]
      subps    xmm4,xmm1
      ; t = dot(pa, -ab) / dot(ab, ab), clamped to [0,1]
      Dot      xmm4,xmm5
      Dot      xmm2,xmm2
      divss    xmm4,xmm2
      clamp    xmm4,xmm3,1.0
      ; replicate t from lane 0 into all four lanes
      movsldup xmm4,xmm4
      movlhps  xmm4,xmm4
      ; q = a + (b - a) * t
      movups   xmm3,dqword [lineE]
      subps    xmm3,xmm1
      mulps    xmm3,xmm4
      addps    xmm1,xmm3
      ; squared distance from q to the sphere centre
      movups   xmm4,dqword [c]
      subps    xmm1,xmm4
      Dot      xmm1,xmm1
      ; compare against r*r
      movss    xmm0,[r]
      mulss    xmm0,xmm0
      comiss   xmm1,xmm0
      ja       @f
      Msg  'sfera intersect line !'
@@:
      }
; SferLine2 r,lineS,lineE,c
;   Experimental variant of the sphere/line test; shows 'circle on line !'
;   when the computed value is <= r*r.
;   r - label of a dword float (squared here); lineS/lineE/c - labels of 4-float vectors.
;   What is actually computed (lane 0):
;       vAB = lineE - lineS
;       k   = dot(vAB, c)
;       val = sqrt( (4*k*k + dot(c,c)) / k )
;   NOTE(review): this does not look like a point-to-segment distance and the
;   macro has no active caller in this file - treat it as unfinished; confirm the
;   intended formula before using it.
;   Clobbers: xmm0-xmm3, xmm5, xmm6, flags. `.a` is declared local but never used.
macro SferLine2 r,lineS,lineE,c { local .a
     ; mov eax,0.7
     ; movd xmm6,eax
      movss xmm0,[r]
      mulss xmm0,xmm0                       ;xmm0 = r*r
      movups xmm1,dqword [lineS]
      movups xmm3,dqword [lineE]
      movups xmm2,dqword [c]                ;sphere centre coordinates
      movaps xmm5,xmm2                      ;xmm5 = copy of the centre
      subps  xmm2,xmm3                      ;c - lineE; overwritten below before it is read
      subps  xmm3,xmm1         ;vAB
      ;movaps xmm6,xmm3
      ;subps  xmm2,xmm1         ;vCA
      ;movaps xmm4,xmm2
      ;subps  xmm2,xmm3
      ;subps  xmm3,xmm1
      Dot    xmm3,xmm5         ;k = dot(vAB, centre), replicated in all lanes
      movaps xmm6,xmm3         ;xmm6 = k
      ;divss  xmm3,xmm6
      Dot   xmm3,xmm3          ;all lanes equal k, so this yields 4*k*k
 movaps xmm2,xmm5
      Dot    xmm2,xmm2         ;dot(centre, centre)
      ;divss  xmm2,xmm6
      ;Dot    xmm3,xmm3
      ;addss  xmm3,xmm2
      addss   xmm3,xmm2
      ;mulss   xmm3,xmm6
      divss    xmm3,xmm6
      sqrtss xmm3,xmm3
      comiss xmm3,xmm0         ;compared against r*r
      ja     @f
      Msg    'circle on line !'
@@:

          }
; SferLine p,lineS,lineE
;   Closest point on the segment lineS..lineE to the point p.
;   p, lineS, lineE - labels of 4-float vectors (4th lane should be 0).
;   Returns the closest point in xmm1.
;       d  = lineE - lineS,  w = p - lineS,  fR = dot(w,d) / dot(d,d)
;       fR <= 0      -> lineS
;       0 < fR < 1   -> lineS + d*fR
;       fR >= 1      -> lineE
;   Fixes compared with the previous version:
;     * the fR < 1 test was inverted (the interior case returned lineE and the
;       fR >= 1 case returned the scaled point);
;     * d was scaled with mulss, i.e. only its x lane; fR is now broadcast and
;       applied to all lanes with mulps.
;   A degenerate segment (lineS = lineE) yields a NaN ratio, which fails the
;   first `ja` and therefore returns lineS.
;   Clobbers: eax, xmm0-xmm3, flags. Defines the symbolic constants xmm_d / xmm_w.
macro SferLine p,lineS,lineE { local .a
      xmm_d equ xmm1
      xmm_w equ xmm0

      movups xmm_d,dqword [lineS]
      movaps xmm3,xmm_d
      movups xmm2,dqword [lineE]
      subps  xmm2,xmm_d       ;= d
      movaps xmm_d,xmm2
      movups xmm_w,dqword [p]
      subps  xmm_w,xmm3       ;= w
      ; xmm3 = lensq(d)
      movaps xmm3,xmm_d
      mulps  xmm3,xmm3
      haddps xmm3,xmm3
      haddps xmm3,xmm3
      ; fR = dot(w, d) / lensq(d), valid in lane 0 only
      Dot    xmm_w,xmm_d
      divss  xmm_w,xmm3
      ; if (fR <= 0) return lineS
      xorps  xmm2,xmm2
      comiss xmm_w,xmm2
      ja     @f
      movups xmm1,dqword [lineS]
      jmp    .a
@@:
      ; if (fR >= 1) return lineE
      mov    eax,1.0
      movd   xmm2,eax
      comiss xmm_w,xmm2
      jae    @f
      ; nearest point lies inside the segment: lineS + d*fR
      shufps xmm_w,xmm_w,0    ; broadcast fR to all four lanes
      mulps  xmm_d,xmm_w
      movups xmm2,dqword [lineS]
      addps  xmm_d,xmm2
      jmp    .a
@@:   movups xmm1,dqword [lineE]

.a:
      }
; SSE_AABB p,box
;   Point-in-box test; shows 'point in 3D box' when
;   boxMin < p <= boxMax holds for x, y and z.
;   p   - label of a 16-byte aligned 4-float point
;   box - label of a 16-byte aligned box: min corner at [box], max corner at [box+16]
;   The 4th (w) lane is ignored: its mask bits are cleared before each compare.
;   Previously the test only passed when the w lane FAILED both comparisons,
;   which forced callers to plant special values in the unused component.
;   Data laid out for the old behaviour gives the same result.
;   Clobbers: eax, xmm1, xmm3, flags (plus whatever the Msg call clobbers).
macro SSE_AABB p,box {
      movaps xmm1,dqword [p]
      movaps xmm3,xmm1
      cmpps xmm1,dqword [box],6      ;predicate 6 (not-less-or-equal): lane set where p > boxMin
      pmovmskb eax,xmm1              ;4 mask bits per float lane
      and   eax,0x0FFF               ;keep x,y,z - drop the w lane
      cmp   eax,0x0FFF
      jnz    @f
      cmpps xmm3,dqword [box+16],2   ;predicate 2 (less-or-equal): lane set where p <= boxMax
      pmovmskb eax,xmm3
      and   eax,0x0FFF               ;keep x,y,z - drop the w lane
      cmp   eax,0x0FFF
      jnz   @f
Msg 'point in 3D box'
@@:
      }
; SSE_AABB2 p,box,mtk
;   Point-in-box test with a caller-supplied "outside" target: jumps to `mtk`
;   unless boxMin < p <= boxMax holds for x, y and z; falls through when the
;   point is inside.
;   p   - label of a 16-byte aligned 4-float point
;   box - label of a 16-byte aligned box: min corner at [box], max corner at [box+16]
;   mtk - jump target taken when the point is outside (e.g. @f)
;   The 4th (w) lane is ignored: its mask bits are cleared before each compare.
;   Previously the test only passed when the w lane FAILED both comparisons,
;   which forced callers to plant special values in the unused component.
;   Data laid out for the old behaviour gives the same result.
;   Clobbers: eax, xmm1, xmm3, flags.
macro SSE_AABB2 p,box,mtk {
      movaps xmm1,dqword [p]
      movaps xmm3,xmm1
      cmpps xmm1,dqword [box],6      ;predicate 6 (not-less-or-equal): lane set where p > boxMin
      pmovmskb eax,xmm1              ;4 mask bits per float lane
      and   eax,0x0FFF               ;keep x,y,z - drop the w lane
      cmp   eax,0x0FFF
      jnz    mtk
      cmpps xmm3,dqword [box+16],2   ;predicate 2 (less-or-equal): lane set where p <= boxMax
      pmovmskb eax,xmm3
      and   eax,0x0FFF               ;keep x,y,z - drop the w lane
      cmp   eax,0x0FFF
      jnz   mtk
      }
DbgFlag__ = 0       ;print info 

section '.code' code readable writeable executable 
        macro push_virt {push_virt_label: virtual at ebx}  ;include macro.txt
        macro push_virt { push_virt                        ;include data.txt
                          push_virt_cs dd qvalsmm+10 dup(0) }  ;this as rd in section data
        kuda             dd 0,0
        BtnCheckReplacer dd 0
        txt              db '55 44:55,ExitProcess',0
        find             db "itPro",0
        zahodov          db "01",0
        app:
                         db "//a1",13,10
                         db "mov ?0,?2",13,10
                         db ".%%:",13,10
                         db "&&      call  ?1",13,10
                         db "&&",13,10
                         db "//      dec ?0",13,10
                         db "        jnz .%%",13,10
                         db "&&",13,10
                         db "....",13,10
                         db "//a2",13,10
                         db "mov ?0,?1",13,10
                         db ".%%:",13,10
                         db "&&",13,10
                         db "//dec ?0",13,10
                         db "        jnz .%%",13,10
                         db "....",13,10

                         db "----",13,10
                         db "a1:edx;ebx;esi;",13,10
                         db "a1:[Ops];esi;",13,10
                         db "a2:[Tom];45",13,10
                         db "----",13,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
        align 16
        aa               dd 1.0,2.0,3.5,5.25
        abuuff = 0
        align 16
        tdigs  db '50000000000     '
        tdigs0 db '           65539'
        align 16
        av32_1 db 8 dup('fron')
        av32_2 db 8 dup('1111')
        av32_10 db 8 dup('fron')
        av32_20 db 8 dup('fdon')
        align 16
        maskroll dd 4 dup(0x7f7f7f7f)
        xxmm    dd 8 dup(0)
        tspace  db 16 dup('*')
        txtFind db 'we rot played we',0
        txtA    db 'we yyy11l1111mnn',0
        txtRng  db 'nnnnzzzzzzzzzzzz',0
        txtFin2 db '!!=th0+/         ',0
        txtRng2 db '09(/==azaz',0
        txtaRng db '   nzzzzzzzzzzzz',0
        align 16
        ;hjj1  db  '0000000000000000'
        hjj2  db  1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1
        ;apply db  0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
        afft  db  '::::::::::::::::'
    hex_lut:  db  "0123456789abcdef"
    low_nibble_mask: times 16 db 0x0f
    reverse_8B: db 7,6,5,4,3,2,1,0,   15,14,13,12,11,10,9,8
    align 16
    vec_ASCII_zero: times 16 db '0'
    vec_af_add:     times 16 db 'a'-58 ;('0'+10)
    vec_9:          times 16 db 9
    align 16
    endianSwitch    db 15, 14, 13, 12,   11, 10, 9, 8,   7, 6, 5, 4,   3, 2, 1, 0
wrapper         db 0C6h, 0C6h, 0C6h, 0C6h,   0C6h, 0C6h, 0C6h, 0C6h,   0C6h, 0C6h, 0C6h, 0C6h,   0C6h, 0C6h, 0C6h, 0C6h
spaceCorrector  db 10h,  10h,  10h,  10h,    10h,  10h,  10h,  10h,    10h,  10h,  10h,  10h,    10h,  10h,  10h,  10h
wrapper10       db 10,   10,   10,   10,     10,   10,   10,   10,     10,   10,   10,   10,     10,   10,   10,   10
carryMask       db 0FFh, 0FFh, 0FFh, 0FFh,   0FFh, 0FFh, 0FFh, 0FFh,   7,    7,    7,    7,      7,    7,    7,    7
spaceDecector   db '!',  '!',  '!',  '!',    '!',  '!',  '!',  '!',    '!',  '!',  '!',  '!',    '!',  '!',  '!',  '!'
;ee.pp dq 0.1
align 32
jj7 dd 8 dup(0)
align 16
fann  db 16 dup('8')
      db 8 dup (10,100)
      db 16 dup(48)
align 16
fann3 db '1  1','2 05','1 53','27 1'
tSpaces db 16 dup(32)
ttAa    db 16 dup(1)
ttAa2   db 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 ;1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
rept 1 {
xmmpos  db 1 ;,2,3,4,5,6,7,8,9,10,11,12,13,14,15
        times 15 db %+1
        db 0,2
        times 14 db %+2
        db 0,1,3
        times 13 db %+3
        db 0,1,2,4
        times 12 db %+4
        db 0,1,2,3,5
        times 11 db %+5
        db 0,1,2,3,4,6
        times 10 db %+6
        db 0,1,2,3,4,5,7
        times 9 db %+7
        db 0,1,2,3,4,5,6,8
        times 8 db %+8
        db 0,1,2,3,4,5,6,7,9
        times 7 db %+9
        db 0,1,2,3,4,5,6,7,8,10
        times 6 db %+10
        db 0,1,2,3,4,5,6,7,8,9,11
        times 5 db %+11
        db 0,1,2,3,4,5,6,7,8,9,10,12
        times 4 db %+12
        db 0,1,2,3,4,5,6,7,8,9,10,11,13
        times 3 db %+13
        db 0,1,2,3,4,5,6,7,8,9,10,11,12,14
        times 2 db %+14
        db 0,1,2,3,4,5,6,7,8,9,10,11,12,13,15,128
        db 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,128
     }
rept 0 {
xmmpos  db 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
        db 0,2,3,4,5,6,7,8,9,10,11,12,13,14,15,15
        db 0,1,3,4,5,6,7,8,9,10,11,12,13,14,15,15  ;+
        db 0,1,2,4,5,6,7,8,9,10,11,12,13,14,15,15
        db 0,1,2,3,5,6,7,8,9,10,11,12,13,14,15,15
        db 0,1,2,3,4,6,7,8,9,10,11,12,13,14,15,15
        db 0,1,2,3,4,5,6,8,9,10,11,12,13,14,15,15
        db 0,1,2,3,4,5,6,8,9,10,11,12,13,14,15,15
        }
        rept 0 {
        db 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
        }

align 16
fann2 db '9853','2548','1163','7770' ;4 dup('9853')     ;naoborot
      db 4 dup (1,10,10,100)
      db 16 dup(48)
      txt_Swap db 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12
      iTen  dw 8 dup(1,10)
align 16
dbrr dw 40,4,8,5,20,3,7,9
align 16
acc dd 1,2,3,4
    dd 4.0,2.0,3.0,1.0
ee.d1  dd 10.0,20.0,1.0,1.0
       dd 2.0,4.0,2.0,4.0         ;bite mask (2)00,(4)01,(8)10,(16)11
; sa.callnz - target of the "conditional call" experiment in Start.
;   It is entered with a conditional jump (jz near), not with call: the caller
;   pushes the resume address (.gu) itself beforehand, so the ret below
;   continues at that address.
sa.callnz:
        Msg 'callnz!'
        ret
; sa.callnz2 - second variant of the "conditional call" experiment.
;   Entered with a conditional jump; the caller passes the resume address in
;   ebx (.gu2 in Start). It is pushed here so that the ret below continues there.
sa.callnz2:
        push ebx                ; ebx = resume address supplied by the caller
        Msg 'callnz2!'
        ret
align 16
      pp3       dd 14.0,14.0,14.0,0
      bbox3D    dd 1.0,1.0,1.0,0.0        ;all min
                dd 20.0,20.0,20.0,-1.0     ;all max

      pp1   dd 120.0,122.0,120.0,0       ;sphere center
      linS  dd 10.0,10.0,10.0,0
      linE  dd 220.0,220.0,220.0,0
      Radius2 dd 2.0;Radius*Radius
              dd 0.01
; displ s - assembly-time output: converts the argument to a string, appends a
; space and prints it with `display`. Emits no code.
macro displ s& { display `s#' '  }
; ux i - assembly-time output: walks the symbols of the (required) argument with
; irps and prints each one through displ, i.e. separated by spaces. Emits no code.
macro ux i*& { irps a, i \{ displ a \} }

;macro x i*& { match a b,i: \{ display \`a  \}}

; x i - assembly-time output: prints every symbol of the (required) argument,
; except that a ':' symbol is printed as a space. This lets a caller write
; `x mov:eax,:[ebx+5]` and see "mov eax, [ebx+5]" in the assembler output.
; Emits no code.
; NOTE(review): the test compares the stringified symbol numerically with ':';
; presumably very long symbols would overflow that comparison - confirm.
macro x i*& {  irps a, i \{
      if \`a = ':'
      display  ' '
     else
      display \`a
      end if
      ;display \`a
      ;display '{',\`a,'}',13,10
      \} }

Start:   macro clrflgs a,[b] { b equ a }
         macro doflgs [a] { match a,a \{a\} }

         mov eax,1073741824 ;5.6     2.0f = 1073741824 int
      movd  xmm1,eax
      movss xmm0,xmm1
      roundss xmm0,xmm0,9
      subss    xmm1,xmm0     ;xmm1=0.6

         clrflgs x,a1,a2,a30
         hunn equ mov:eax,1
         flag_eq_state equ 1
         if flag_eq_state = 1
         doflgs a1  hunn , a30 'fff'
         end if

         mov eax,.hu2 ;av32_1      ;pcmpistri to ecx index mask
         mov ebx,eax
         CMPXCHG ebx,edx           ;cmp eax,ebx != eax=ebx If eax=ebx then ebx=edx
         rept 4*1 n:0 {nmval#n equ allvll+n*4
                     allvll_size equ n+1
         }
         mov ebx,Status-allvll
         x mov:eax,:[ebx+5]
         display 13,10
         ux cwertyhh ;mov eax, [ebx+5]
         xmalgn equ movaps
         xmvec4  vecAB1,0,linS:
         xmalgn equ movups
         xmvec4  vecAB1,1,pp1-[linS]
         xmvec4  vecAB2,2,pp1+vecAB1
         xmalgn equ movaps
         xmvec4  vecAB3,3,pp1/[linE]
         xmvec4  vecAB4,4,pp1*vecAB2
         ;addss vecAB,vecAB
         ;SferLine2 Radius2,linS,linE,pp1
         SferLineP Radius2,linS,linE,pp1
         ExitMsg 'exit'
         mov  eax,pp1
         mov  edx,linE
         mov  ecx,linS
         call Aff1
         movups xmm1,dqword [pp1]
       ; mov     ecx, DWORD PTR __vLineS
       ; mov     edx, DWORD PTR __vLineE
       ; mov     eax, DWORD PTR __vPoint
         ;SferLine  pp1,linS,linE           ;ret xmm1
         Dot xmm1,xmm1
         sqrtss xmm1,xmm1
         mulss  xmm1,[Radius2+4]
         comiss  xmm1,[Radius2]
         ja   @f
         Msg  'sphere intersect line'
@@:

         SSE_AABB  pp3,bbox3D
         SSE_AABB2 pp3,bbox3D,@f
         Msg 'SSE_AABB2'
@@:
 ;jz  .cii           ;2 bytes
 ; "Conditional call" experiment: emulate `call` with jz.
 ; Variant 1: push the resume address by hand, then jump conditionally.
 ; If the jump is not taken, the pushed address is discarded again.
 cmp eax,eax             ;always sets ZF, so both jz below are taken
 push .gu ;$+6+5         ;resume address for the ret in sa.callnz
 jz  near sa.callnz      ;6 bytes
 add esp,4               ;only reached when the jump is not taken: drop the pushed address
.gu:

 ; Variant 2: pass the resume address in ebx; sa.callnz2 pushes it itself.
 mov ebx,.gu2
 cmp eax,eax
 jz  near sa.callnz2      ;6 bytes
.gu2:
 Msg 'bbbb'
 mov  ebx,4;_250_000_000          ;5~6 sec
.cii:
 movaps  xmm1,dqword [fann3]
 movaps  xmm2,dqword [tSpaces]
@@:
 ;movaps  xmm1,dqword [fann3]
 ;movaps  xmm2,dqword [tSpaces]              ;20 secs na 5 spaces
 pcmpistri xmm2,xmm1,0x4                     ;26 secs na 7 spaces   6 sec na 1 space
 cmp   cx,15                                 ;12 secs na 3 spaces   9 sec na 2 space
 ja    .vnn
 shl   ecx,4
 pshufb  xmm1,dqword [xmmpos+ecx]
 jmp @b
 ;cmp   cx,15*16
 ;jbe    @b
.vnn:
 ;movaps dqword [dbrr],xmm1
 ;Msg dbrr

 rept 2+2+2-6   {                   ;2+4
 movaps  xmm0,xmm1
 pcmpgtb xmm0,dqword [tSpaces]
 pmovzxbw xmm0 ,xmm0
 ;movaps dqword [dbrr],xmm0
 PHMINPOSUW xmm0,xmm0
 ;movaps dqword [dbrr],xmm0
 psrldq xmm0,2
 movd  eax,xmm0
 ;inc eax
 ;add   eax ,eax

 shl   eax,4
 pshufb  xmm1,dqword [xmmpos+eax]
 }

 ;movaps dqword [dbrr],xmm1
 dec ebx
 jnz .cii
 movaps dqword [dbrr],xmm1
 ExitMsg   dbrr
 movaps dqword [dbrr],xmm0

 mov eax,8
 movd xmm0,eax
 PSRLq   xmm1,xmm0
 movaps dqword [dbrr],xmm0
 movaps dqword [dbrr],xmm1
 ExitMsg   dbrr
 
 movaps  xmm1,dqword [fann3]
 movaps  xmm2,xmm1
rept 0 {
 Msg "bbb"
 movaps  xmm1,dqword [fann3]
 movaps  xmm5,dqword [xmmpos]
 mov  ebx,4_250_000_000          ;5~6 sec
@@:
 movaps  xmm0,xmm1
 pcmpgtb xmm0,dqword [tSpaces]
 PSRLD   xmm1,xmm0
 movaps dqword [dbrr],xmm0
 movaps dqword [dbrr],xmm1

 pand    xmm0,dqword [ttAa]
 paddb   xmm5,xmm0                 ;+
 ;pxor    xmm0,dqword [ttAa2]
 paddb   xmm0,xmm5 ;dqword [xmmpos]
 pshufb  xmm1,xmm0
 movaps dqword [dbrr],xmm0
 movaps dqword [dbrr],xmm1
 dec ebx
 jnz @b
 ExitMsg "end"
 }
 mov esi,000_000_001       ;20 000 000  8sec
 Msg "bbb"
.huu:
 mov byte [.fgw+4],0
 xor ebx,ebx
 xorps xmm0,xmm0
 xorps xmm3,xmm3
 movaps xmm1,dqword [fann3]
 jmp .ffw
@@:
 movd   xmm0,ebx
 inc    ebx
.fgw: pslldq xmm0,0
      inc byte [.fgw+4]
      paddb xmm3,xmm0
.fgw2:
 psrldq xmm1,1
.ffw:
 movd eax,xmm1
 test  al,al
 jz    @f
 cmp   al,32
 jnz    @b
 inc   ebx

 jmp   .fgw2
@@:   pshufb  xmm2,xmm3
      movaps dqword [dbrr],xmm2
 dec esi
 jnz .huu
 ;ExitMsg "end"

mov esi,1_000_000_000       ;1_000 000 000  7 sec !
 Msg "bbb"
.hu2:
 ;mov    dx,1
 mov    ebx,dbrr
 movaps xmm1,dqword [fann3]
 jmp .fff
@@:
 psrldq xmm1,1
.fff:
 movd eax,xmm1
 ;test  al,al
 ;jz    @f
 cmp   al,32
 ;xor   cx,cx
 ;cmove ebx,edx
 jz    @b
 mov    [ebx],al
 ;mov    edx,ebx
 inc    ebx
 ;inc    ebx
 ;jmp   @b
 test  al,al
 jnz    @b
@@:
   dec esi
 jnz .hu2
  ExitMsg dbrr ;"end"
;blend
 movaps xmm1,dqword [fann3]
 movaps xmm2,xmm1
 movaps xmm0,xmm1
 movaps xmm1,dqword [fann]
 ;xorps xmm0,xmm0
 pcmpgtb xmm0,dqword [tSpaces]     ;or pcmpeqb xmm1,dqword [tSpaces]
 ;pblendw xmm0,xmm2,00_00_10_11b ;xmm1
 pblendvb xmm1,xmm2 ;mask iz xmm0
 movaps dqword [dbrr],xmm1

  Msg "bbb"
   mov ebx,4_259_999_000          ;20 sec all. sse part 3 sec
@@:
 lea eax,[ebx+ebx*3]
 movaps xmm1,dqword [fann2]      ;+eax*4
 pshufb xmm1,dqword [txt_Swap]
 psubb xmm1,dqword [fann2+32]
 pmaddubsw xmm1,dqword [fann2+16]
 pmullw xmm1,dqword [iTen]
 movaps xmm2,xmm1
 ;psrlw xmm1,16
 ;psllw xmm2,1
 psrldq xmm2,2
 paddw xmm1,xmm2
 movaps dqword [dbrr],xmm1

rept 0 {
 movd eax,xmm1                  ;this slow !!! 80 secs !
 mov  edx,0xffff0000            ;faster read from mem !
 pext ecx,eax,edx
 and eax,0x0000ffff
 add eax,ecx
 }
rept 0 {
 mov ax,[dbrr+2]
 ;imul eax, 10
 add  ax,[dbrr]

 mov ax,[dbrr+2+4]
 ;imul eax, 10
 add  ax,[dbrr+4]

 mov ax,[dbrr+2+8]
 ;imul eax, 10
 add  ax,[dbrr+8]

 mov ax,[dbrr+2+12]
 ;imul eax, 10
 add  ax,[dbrr+12]
 }

 dec ebx
 jnz @b
 Msg "end"


 movaps xmm1,dqword [fann]
 ;paddb  xmm1,dqword [fann+32]
 psubb xmm1,dqword [fann+32]
 movaps dqword [dbrr],xmm1

 pmaddubsw xmm1,dqword [fann+16]
 movaps dqword [dbrr],xmm1
 xor eax,eax
 mov ax,[dbrr]

 movaps xmm1,dqword [dbrr]
 PHMINPOSUW xmm0,xmm1
 movaps dqword [dbrr],xmm0           ;ret value16 i indx
 MPSADBW xmm0,xmm1,0
 rept 1 {
   mov ecx,0x30313233
   mov ebx,0xff000000             ;bit mask
   pext eax,ecx,ebx
   mov ebx,0x00ff0000             ;bit mask
   pext eax,ecx,ebx
   mov ebx,0x0000ff00             ;bit mask
   pext eax,ecx,ebx
   mov ebx,0xff0000ff             ;bit mask
   pext eax,ecx,ebx               ;eax=0x00003033
   mov ebx,0xffff00ff             ;bit mask
 ;  pext eax,ecx,ebx               ;eax=0x00303133
   }
   ;qvalsmm = 10
   ;cvtdq2ps xmm1,dqword [acc]
   ;vcvtps2dq ymm1,yword [acc+16]
   ;vmovups yword [jj7],ymm1

   ;mov ebx,push_virt_label
   ;mov [push_virt_cs+5*4],40             ;ida pro show mov [ebx],40
   ;xorps xmm0,xmm0
   Msg "bbb"
   mov ebx,4_200_000_000         ;empty loop  4_200_000_000   ryzen 5 3500  2 secs
.hhh:
        mov eax,[ee.d1]
        sub eax,[ee.d1+8]
        cmp eax,[ee.d1+12]
        jnz @f
@@:
        mov eax,[ee.d1]
        sub eax,[ee.d1+8]
        cmp eax,[ee.d1+12]
        jnz @f

rept 0 {
   movaps xmm1,dqword [ee.d1+16]  ;loop 4_200_000_000   ryzen 5 3500  3 secs
   movaps xmm0,xmm1
   movaps xmm2,dqword [ee.d1]

     cmpleps xmm1,dqword [ee.d1];xmm2        ;mask in xmm1
   pmovmskb eax,xmm1
   cmp   eax,0x00ff
   jnz    @f
   ;Msg 'point in rect'
     }
@@:
     dec ebx
     jnz .hhh
     Msg "point in rect"
   cmpleps xmm1,xmm2        ;mask in xmm1
   pmovmskb eax,xmm1
   cmp   al,-1
   jnz    @f
   ;movaps xmm1,[ee.d1+16]
   movups xmm2,dqword [ee.d1+8]
   CMPLtPS xmm0,xmm2
   ;cmpeqps  xmm1,xmm0
   pmovmskb eax,xmm0
   ;test   al,al
   cmp    al,-1
   jz    @f
   Msg 'point in rect'
@@:

   mov eax,0x35;353535
   movd xmm0,eax
   VPBROADCASTB xmm0,xmm0
  ; movsldup xmm0,xmm0         ;64 bits pol registra
  ; MOVLHPS xmm0,xmm0
   movaps dqword [ee.d1],xmm0

   xorps xmm1,xmm1
   movaps xmm0, dqword [hex_lut]
   mov eax,0x39
   movd xmm0,eax
   pshufb xmm0,xmm1 ;dqword [endianSwitch]
   movaps dqword [ee.d1],xmm0
   ;xorps xmm0,xmm0               ;pshufd vse zatret v xmm0
    pshufd xmm0, dqword [ee.d1+16], 01_00_10_00b
    ;movaps dqword [ee.d1],xmm0     ;00001100b dast dd 2,16,2,2
                                   ;01001100b dast dd 2,16,2,4
                                   ;01001000b dast dd 2,8,2,4
movaps xmm0, dqword [ee.d1]
movaps xmm1, dqword [ee.d1+16]
    shufps xmm0, xmm1,11100111b ;11100100b
movaps dqword [ee.d1],xmm0

   mov  edx,2144540000;14967295;4242400000;3999999999  ;4294967295
        inc edx
        mov  edi,edx
        mov eax,0.1
        movd xmm0,eax
        mov eax,0x7fffffff
        movd xmm3,eax
        ;movsd xmm0,[ee.pp]
.c1:
         test edi,edi
         jz   .nnn
         cvtsi2ss xmm1,edi        ;max value 2144540000 vishe gluki idut
         ;vcvtsi2sd xmm1,xmm1,edi
         ;andps xmm1,xmm3
         mulss xmm1,xmm0
         ;andps xmm1,xmm3
         cvtss2si eax,xmm1
         mov   edx,eax
         lea eax,[eax+eax*4]
         add eax,eax
        sub       edi,eax
        mov       eax,edi
        ;sub       eax,edi
        add       al,48
        mov       edi,edx

.a:
    pinsrb xmm2,eax,0    ;edx
    ;inc byte [.a+5]
    pslldq   xmm2,1
    ;test eax, eax       ; EAX
    ;jnz cycle1
    jmp  .c1
.nnn:
    psrldq   xmm2,1
    movups dqword [xxmm],xmm2
    ExitMsg   xxmm

  movups  xmm0, dqword [tdigs0]
  pshufb  xmm0, dqword [endianSwitch]  ; switch little/big-endian
 mov ebx,0_000_000_01;0;15secs
 .uppp:
    paddb   xmm0, dqword [wrapper]         ; shift digits from ASCII to wrapping edge ('0'->0F6h .. '9'->0FFh)
    
    ; increment low qword
    mov     eax, 1             ;ok 1  2 5 10
    ;mov eax,0x00000001
    xorps   xmm1, xmm1
    ;pinsrw  xmm1, eax, 0
    movd    xmm1,eax
    paddq   xmm0, xmm1
    ;psubq    xmm0,xmm1
    
    ; carry to high qword
    xorps   xmm1, xmm1
    pcmpeqb xmm1, xmm0                       ; byte == 0 ?
    pshufb  xmm1, dqword [ carryMask]        ; mask = 0XX_XX_XX_XX_XX_XX_XX_XX_00_00_00_00_00_00_00_00h, where XX = byte7
    psubq   xmm0, xmm1                       ; sub (-1) equivalent add 1
    
    ; correct wrapped digits
    xorps   xmm1, xmm1
    pcmpeqb xmm1, xmm0                       ; byte == 0 ?
    andps   xmm1, dqword [wrapper10]
    psubb   xmm0, xmm1
    
    psubb   xmm0, dqword [wrapper]          ; shift digits back to ASCII-codes
    
    ; detect spaces affected by carry
    movaps  xmm1, dqword [spaceDecector]
    pcmpeqb xmm1, xmm0                       ; check for spaces under carry
    andps   xmm1, dqword [spaceCorrector]   ; shift such spaces (actually '!') to '1'
    paddb   xmm0, xmm1
    dec ebx
    jnz  .uppp
    movaps  xmm1,xmm0
    pshufb  xmm1, dqword [endianSwitch]
    movups  dqword [xxmm], xmm1
    mov     word [xxmm+16],0
    ;ExitMsg xxmm


  movaps xmm3,dqword [tdigs]
  xorps  xmm0,xmm0
  movaps xmm4,dqword [afft]
  mov    eax,0x1
  ;movaps xmm5,dqword [apply]
  movd   xmm5,eax
  pslldq xmm5,9
  mov    eax,0x30303030
  movd   xmm6,eax
  pslldq xmm6,4
  ;movd   xmm6,eax     ;delet old 8 bytes !
  pinsrd  xmm6,eax,0
  MOVLHPS xmm6,xmm6
  ;movups dqword [xxmm],xmm6
  ;ExitMsg xxmm

  mov    ebx,1_000_999_590;+1;-1 ;92
   ;inc    byte [tdigs+4]
.ew:   movaps xmm1,xmm3 ;dqword [tdigs]
   mov    eax,0x1
  movd   xmm5,eax
  pslldq xmm5,9

   ;pextrb eax,xmm1,4+3+2
   ;inc    al
   ;pinsrb xmm1,eax,4+3+2
   paddb xmm1,xmm5
 ;rept 6+1 {
 mov  al,2;6+1
.et:
   ;movaps xmm2,dqword [afft]
   movaps xmm2,xmm4
   PCMPGTB xmm2,xmm1
   pand    xmm1,xmm2
   pmaxub  xmm1,xmm6 ;dqword [hjj1]
   psrldq   xmm2,1
   pandn     xmm2,dqword [hjj2]
   paddb    xmm1,xmm2
  ;       }
  comiss xmm2,xmm0
   ja     .et
  dec al
  jnz .et
   movaps    xmm3,xmm1

   dec    ebx
   jnz    .ew

   ;movq   qword [xxmm], xmm1  ;dast 0009fff6  = 655350
   movups  dqword [xxmm], xmm1
   mov     word [xxmm+9+1],0
   ;ExitMsg xxmm
   mov eax,dword [xxmm+7]
   mov byte [xxmm+7],'.'
   mov dword [xxmm+8],eax
   mov     word [xxmm+9+2],0
   ExitMsg xxmm


   mov eax,0x30303030
   pinsrd xmm2,eax,0
   pinsrd xmm2,eax,1
   pinsrd xmm2,eax,2
   pinsrd xmm2,eax,3
   mov eax,0x09090909
   pinsrd xmm3,eax,0
   pinsrd xmm3,eax,1
   pinsrd xmm3,eax,2
   pinsrd xmm3,eax,3
   mov eax,0x27272727
   pinsrd xmm4,eax,0
   pinsrd xmm4,eax,1
   pinsrd xmm4,eax,2
   pinsrd xmm4,eax,3
   mov    edx,  655350    ; number

    ;; or enter here for fastcall arg passing.  Or rdi, esi for x86-64 System V.  SSE2 is baseline for x86-64
    bswap  edx
    movd   xmm0, edx

    movdqa xmm1, xmm0
    psrld  xmm1, 4          ; right shift: high nibble -> low  (with garbage shifted in)
    punpcklbw xmm1, xmm0    ; interleave high/low nibble of each byte into a pair of bytes
    pand   xmm1, [low_nibble_mask]   ; zero the high 4 bits of each byte
    ; unpacked to 8 bytes, each holding a 4-bit integer, in printing order

    movdqa  xmm0, xmm1
    pcmpgtb xmm1, xmm3 ;[vec_9]
    pand    xmm1, xmm4 ;[vec_af_add] ; digit>9 ?  'a'-('0'+10)  :  0
    
    paddb   xmm0, xmm2;[vec_ASCII_zero]
    paddb   xmm0, xmm1      ; conditional add for digits that were outside the 0..9 range, bringing them to 'a'..'f'

    movq   qword [xxmm], xmm0  ;dast 0009fff6  = 655350
    ;ExitMsg xxmm

;text num to hex
    mov    eax,655350
    movd   xmm1, eax    ; number
    mov eax,0x0f0f0f0f
    pinsrd xmm2,eax,0
    pinsrd xmm2,eax,1
    pinsrd xmm2,eax,2
    pinsrd xmm2,eax,3
    mov eax,0x33323130 ;30313233
   pinsrd xmm3,eax,0
   mov eax,0x37363534 ;34353637
   pinsrd xmm3,eax,1
   mov eax,0x62613938 ;38396162
   pinsrd xmm3,eax,2
   mov eax,0x66656463 ;63646566
   pinsrd xmm3,eax,3

    movdqa xmm0, xmm1
    psrld  xmm1, 4          ; right shift: high nibble -> low  (with garbage shifted in)
    punpcklbw xmm0, xmm1    ; interleave low/high nibbles of each byte into a pair of bytes
    pand   xmm0, xmm2 ;[low_nibble_mask]   ; zero the high 4 bits of each byte (for pshufb)
    ; unpacked to 8 bytes, each holding a 4-bit integer

    movdqa xmm1, xmm3 ;[hex_lut]
    pshufb xmm1, xmm0       ; select bytes from the LUT based on the low nibble of each byte in xmm0

    pshufb xmm1, [reverse_8B]  ; printing order is MSB-first

    movq   qword [xxmm], xmm1  ;dast 0009fff6  = 655350
    ExitMsg xxmm

        ; Convert 65535 to five ASCII decimal digits, written right-to-left into
        ; xxmm+4 .. xxmm, using multiply-by-reciprocal instead of div.
        ; 1717986919 = 0x66666667 = (2^34+6)/10: the high dword of n*0x66666667,
        ; shifted right by 2, is n/10.
        ; Bytes from xxmm+5 onwards are not written here.
        ; Clobbers: eax, edx, edi, esi, ebp, flags.
        mov esi,xxmm+4               ;esi = position of the last (least significant) digit
        mov edi,65535                ;old note: "divides = 6553 but the remainder 5 is missing"

.hh:    mov ebp,edi                     ;ebp = n, kept for the remainder
        mov       eax, 1717986919
        mul      edi                    ;imul or mul? (unsigned mul; same result while n < 2^31)
        sar       edi, 31               ;edi = 0 for non-negative n, -1 otherwise
        sar       edx, 2                ;edx = n / 10
        sub       edx, edi              ;sign correction of the signed-division recipe; a no-op for the values used here
        mov       edi,edx               ;edi = quotient = next n
        lea       edx,[edx+edx*4]
        add       edx,edx               ;edx = quotient * 10
        mov       eax,ebp
        sub       eax,edx               ;eax = n mod 10
        add       al,48                 ;to ASCII

        mov       [esi],al
        dec       esi
        cmp       esi,xxmm
        jae       .hh                   ;repeat until the byte at xxmm has been written
        Msg  xxmm



movups xmm1,dqword [tdigs]
;movups xmm1,dqword [tdigs]
xor edx,edx
.w1:
pextrb eax,xmm1,0
cmp  al,32
jbe  .we
;movzx  eax,al
sub  al,48
imul edx,10
add  edx,eax
psrldq   xmm1,1
jmp .w1
.we:    inc edx
        xorps xmm2,xmm2
rept 0 {
    mov eax,edx
    mov ebx, 10     ;   EBX 
    cycle1:
    xor edx, edx        ;     ()
    div ebx         ; (EDX:EAX)/EBX,   EDX
    add edx, 48        ;    
    }
        ; Convert an unsigned 32-bit number to ASCII decimal in xmm2 and show it.
        ; Digits are produced least-significant first; each one is inserted at
        ; byte 0 and the register is shifted left by one byte, so the final
        ; string is most-significant first. xmm2 must be zero on entry (the code
        ; just before this block clears it); the zero bytes above the digits
        ; terminate the string.
        ; Division by 10 uses the unsigned reciprocal 0xCCCCCCCD = ceil(2^35/10):
        ; (n * 0xCCCCCCCD) >> 35 == n / 10 for every 32-bit n. The previous
        ; constant 0x66666667 belongs to the signed (imul) recipe and gives a
        ; wrong quotient with mul for some n above about 2.86e9 (e.g. 4294967289).
        ; Clobbers: eax, edx, edi, ebp, xmm2, flags.
        mov  edx,4014967295;4242400000;3999999999  ;4294967295
        inc edx
        mov  edi,edx                    ;edi = n
        xor edx,edx
cycle1:
         mov ebp,edi                    ;ebp = n, kept for the remainder
         test ebp,ebp
         jz   .nnn                      ;no digits left
        mov       eax, 0xCCCCCCCD
        mul       edi                   ;edx:eax = n * ceil(2^35/10)
        shr       edx, 3                ;edx = n / 10
        mov       edi,edx               ;edi = quotient = next n
        lea       edx,[edx+edx*4]
        add       edx,edx               ;edx = quotient * 10
        mov       eax,ebp
        sub       eax,edx               ;eax = n mod 10
        add       al,48                 ;to ASCII

.a:
    pinsrb xmm2,eax,0                   ;put the digit into byte 0 ...
    pslldq   xmm2,1                     ;... and make room for the next one
    jmp  cycle1
.nnn:
    psrldq   xmm2,1                     ;undo the last, superfluous shift
    movups dqword [xxmm],xmm2
    Msg   xxmm
movups xmm1,dqword [tdigs];[txtaRng]
movups dqword [xxmm],xmm1
mov eax,10.0
movd xmm2,eax
xorps xmm4,xmm4
.11:
pextrb eax,xmm1,0
cmp  al,32
jbe  .ee
sub  al,48
;movzx  eax,al
;sub dl,48
mulss xmm4,xmm2
cvtsi2ss xmm3,eax
addss xmm4,xmm3
psrldq   xmm1,1
jmp .11
.ee:
cvtss2si edx,xmm4

psrldq   xmm1,1
movups dqword [xxmm],xmm1

VPMOVZXBW ymm0, xmm1
VPMOVZXBW ymm0,dqword [av32_1]
vmovups yword [xxmm],ymm0
invoke MessageBoxW,0,xxmm,0,0
vmovups ymm1,yword [av32_1]
vmovups ymm2,yword [av32_2]
;vpcmpistri xmm1,xmm2,0x18
;vpcmpistri ymm1,yword [av32_2],0x18  ;invalid size
;vpcmpb k1 {k2},ymm1,ymm2,1 ;avx512
;vpcmpltb k1 {k2},ymm1,ymm2
vpcmpgtb ymm0,ymm1,ymm2
vmovups yword [xxmm],ymm0

vmovups ymm1,yword [av32_10]
vmovups ymm2,yword [av32_20]
vpcmpeqb ymm0,ymm1,ymm2
vmovups yword [xxmm],ymm0
vpmovmskb eax,ymm0
;inc  eax
cmp  eax,-1
jnz  @f
Msg 'avx2 equal'
@@:

; Left-trim experiment: count the leading spaces of txtaRng and store the
; 16 bytes shifted so that the first non-space character lands at xxmm.
; The loop is self-modifying: `inc byte [.1+5]` bumps the last byte of the
; pextrb instruction at .1 (its lane-index immediate), so each pass reads
; the next byte of xmm1.
; NOTE(review): the immediate is never reset, so this loop only works the
; first time it runs.
; NOTE(review): the store address is xxmm+1-ecx; with leading spaces it starts
; BEFORE xxmm and overwrites the tail of whatever precedes it (maskroll in
; this file) - confirm this is intended.
movups xmm1,dqword [txtaRng]
xor ecx,ecx
.1:
pextrb eax,xmm1,0       ;lane index is patched at run time
inc  ecx                ;ecx = number of bytes examined so far
inc byte [.1+5]         ;advance the lane index inside the instruction above
cmp  al,32
jz   .1                 ;keep going while the byte is a space
mov  eax,xxmm+1
sub  eax,ecx            ;eax = xxmm - (number of leading spaces)
movups dqword [eax],xmm1
Msg   xxmm
;---
movups xmm1,dqword [txtaRng]
xor ecx,ecx
rept 16 n:0 {
pextrb eax,xmm1,n
inc  ecx
cmp  al,32
ja   .2 }

.2:
mov  eax,xxmm+1
sub  eax,ecx
movups dqword [eax],xmm1
Msg   xxmm

movups xmm1,dqword [txtRng]

pextrb eax,xmm1,1
;PMOVSXBW xmm1, xmm1
PMOVZXBW xmm1, xmm1
mov eax,0x00000038
PINSRB xmm1,eax,1
;pmaxub xmm1,dqword [tspace]
movaps dqword [xxmm],xmm1
;Msg   xxmm
movups xmm1,dqword [txtRng]
movups xmm2,dqword [maskroll]
;PSLLW xmm1,8
psllw   xmm1, 1
pand xmm1,xmm2

movaps dqword [xxmm],xmm1
Msg   xxmm
invoke MessageBoxW,0,xxmm+1,0,0
PACKUSWB xmm1, xmm1
movaps dqword [xxmm],xmm1
vimm = 0x18    ;=  0x10
Byts_Mask = 64;+16;32+64;64+32
movups xmm1,dqword [txtA]
movups xmm2,dqword [txtFind]
;movaps xmm2,xmm1
;pcmpistri xmm1,xmm2,vimm ;11b i 0
pcmpistri xmm1,dqword [txtFind],vimm
vimm = 0x8+Byts_Mask  ;any
movups xmm1,dqword [txtA]
movups xmm2,dqword [txtFind]
xorps   xmm0,xmm0
pcmpistrm xmm1,dqword [txtFind],vimm  ;+16 dla ret mask
;pcmpistrm xmm1,xmm2,vimm
pmovmskb eax,xmm0
;orps   xmm0,dqword [tspace]
andps  xmm0,xmm2
pmaxub xmm0,dqword [tspace]

movaps dqword [xxmm],xmm0
;range
vimm = 0x4;+32
movups xmm1,dqword [txtRng2]
pcmpistri xmm1,dqword [txtFin2],vimm
pmovmskb eax,xmm0


                       xor ebx,ebx
                       mov esi,Status
            @@:        movss xmm1,dword [aa+ebx*4]
                       cvtss2sd xmm1,xmm1
                       movsd qword [kuda],xmm1
                       cinvoke sprintf,esi,"%1.4f ;", dword [kuda], dword [kuda+4]
                       add  esi,eax
                       inc  ebx
                       cmp  ebx,4
                       jb   @b
                       Msg  Status


        mov edx,45
        cvtsi2ss xmm1,edx
        movss xmm3,xmm1
        mov edx,0.1
        movd xmm0,edx

        mov edx,10.0
        movd xmm2,edx
        mulss xmm1,xmm0
        roundss xmm1,xmm1,9
        cvtss2si edx,xmm1
        mulss xmm1,xmm2
        subss xmm3,xmm1
        cvtss2si eax,xmm3

   if abuuff = 1
        mov    esi,app
        call   Proc_Smart
   end if
        movss  xmm0, dword [aa]
        ;vminps zmm1, zmm2, zmm0 ;dword [aa] {1to16}
        ; Naive substring search: look for the 5-byte pattern at `find` inside
        ; the zero-terminated string `txt`, trying every start position in turn.
        ; `zahodov` counts the start positions that failed; it is shown at the
        ; end in both the found and the not-found case.
        ; Fixes compared with the previous version:
        ;   * after a partial match the scan restarted behind the mismatching
        ;     byte instead of at start+1, so overlapping candidates were missed;
        ;   * the end-of-text check looked at the byte after the mismatch, which
        ;     could step over the terminator and continue into the data that
        ;     follows txt.
        ; The pattern contains no zero byte, so a compare that reaches the
        ; terminator of txt stops there and never reads past it.
        ; Clobbers: ecx, edx, esi, edi, flags (plus whatever the Msg calls clobber).
        cld
        mov   edx,txt           ;edx = current candidate start inside txt
.gg:    cmp   byte [edx],0
        jz    .done             ;terminator reached: pattern not found
        mov   esi,find
        mov   edi,edx
        mov   ecx,5             ;pattern length ("itPro")
        repe  cmpsb
        jnz   .net              ;mismatch somewhere within the 5 bytes
        Msg   'found'
        jmp   .done
.net:   mIncTxt zahodov         ;one more failed start position
        inc   edx               ;retry one byte further
        jmp   .gg
.done:
        Msg     zahodov ;'ExitProcess'
        invoke  ExitProcess,0
  if abuuff = 1
        include "Blocks\Proc_Smart.txt"
  end if
; Aff1 - closest point on a 3D segment to a point (scalar SSE version).
;   In:   ecx = segment start S, edx = segment end E, eax = point P;
;         each is 3 consecutive float32 (x,y,z).
;   Out:  the closest point is written to [eax], [eax+4], [eax+8],
;         i.e. it overwrites P.
;           t = dot(P-S, E-S) / dot(E-S, E-S)
;           t <= 0 -> S;  0 < t < 1 -> S + (E-S)*t;  t >= 1 -> E
;   Clobbers: xmm0-xmm7, flags, the scratch dword tv385, and ecx on the two
;             end-point paths. Not reentrant because of tv385.
proc Aff1
        ; --- x and y components: d = E - S, partial dot(P - S, d) ---
        movss   xmm6, dword [ecx]               ; S.x
        movss   xmm7, dword [ecx+4]             ; S.y
        movss   xmm3, dword [edx]
        subss   xmm3, xmm6                      ; d.x
        movss   xmm4, dword [edx+4]
        subss   xmm4, xmm7                      ; d.y
        movss   xmm0, dword [eax]
        subss   xmm0, xmm6
        mulss   xmm0, xmm3                      ; (P.x-S.x)*d.x
        movss   xmm2, dword [eax+4]
        subss   xmm2, xmm7
        mulss   xmm2, xmm4                      ; (P.y-S.y)*d.y
        addss   xmm2, xmm0
        ; --- z component ---
        movss   xmm1, dword [ecx+8]             ; S.z
        movss   xmm5, dword [edx+8]
        subss   xmm5, xmm1                      ; d.z
        movss   dword [tv385], xmm1             ; keep S.z, xmm1 is reused below
        movss   xmm0, dword [eax+8]
        subss   xmm0, xmm1
        mulss   xmm0, xmm5                      ; (P.z-S.z)*d.z
        addss   xmm2, xmm0                      ; xmm2 = dot(P-S, d)
        ; --- xmm1 = dot(d, d) ---
        movaps  xmm1, xmm4
        mulss   xmm1, xmm4
        movaps  xmm0, xmm3
        mulss   xmm0, xmm3
        addss   xmm1, xmm0
        movaps  xmm0, xmm5
        mulss   xmm0, xmm5
        addss   xmm1, xmm0
        ; --- t = dot(P-S, d) / dot(d, d) ---
        xorps   xmm0, xmm0
        divss   xmm2, xmm1
        comiss  xmm0, xmm2
        jb      LN2@ClosestPoi                  ; 0 < t
        ; t <= 0: the result is S
        movq    xmm0, qword [ecx]
        mov     ecx, dword [ecx+8]
        movq    qword [eax], xmm0
        mov     dword [eax+8], ecx
        ret
LN2@ClosestPoi:
        movss   xmm0, dword [__real@3f800000]
        comiss  xmm0, xmm2
        jbe     LN4@ClosestPoi                  ; 1 <= t
        ; 0 < t < 1: the result is S + d*t
        mulss   xmm3, xmm2
        mulss   xmm4, xmm2
        mulss   xmm5, xmm2
        addss   xmm3, xmm6
        addss   xmm4, xmm7
        addss   xmm5, dword [tv385]
        movss   dword [eax], xmm3
        movss   dword [eax+4], xmm4
        movss   dword [eax+8], xmm5
        ret
LN4@ClosestPoi:
        ; t >= 1: the result is E
        movq    xmm0, qword [edx]
        mov     ecx, dword [edx+8]
        movq    qword [eax], xmm0
        mov     dword [eax+8], ecx
        ret
        __real@3f800000 dd 1.0                  ; constant 1.0f
        tv385 dd 0                              ; scratch slot for S.z
endp

SECTION '.data' DATA READABLE WRITEABLE
                FasmHwnd       dd  0
                ;push_virt              ;as rd
                ;end virtual
                db 48,48,48,48

SECTION '.idata' IMPORT DATA READABLE WRITEABLE

     library  kernel32, 'KERNEL32.DLL',\
              user32,   'USER32.DLL',\
              winmm,    'WINMM.DLL',\
              msvcrt,   'MSVCRT.DLL'

 include 'fasmAPI\kernel32.inc'
 include 'fasmAPI\user32.inc'
 include 'fasmAPI\winmm.inc'
 include 'fasmAPI\msvcrt.inc'

section '.bss' readable writeable
        if defined   allvll_size
        allvll          rd      allvll_size
        end if
        Status          rb      512
        Status22        rb      16*2000 
        pTxt            rd      4000             ;ptr to text
        RichTxt         rb      16*4000
