;Copyright (C) 2007 by Louis J. Ricci
;Author E-Mail: R22Lou[-AT]cox[Dot-]net
;This library is free software; you can redistribute it and/or
;modify it under the terms of the GNU Lesser General Public
;License as published by the Free Software Foundation; either
;version 2.1 of the License, or (at your option) any later version.
;This library is distributed in the hope that it will be useful,
;but WITHOUT ANY WARRANTY; without even the implied warranty of
;MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;Lesser General Public License for more details.
format PE64 DLL
entry DllEntryPoint
; AMDPad16 -- pad the output up to the next 16-byte boundary.
; The virtual block measures how many bytes `align 16` would emit at the
; current position (stored in the assembly-time variable `a`) without
; emitting anything. The padding itself is then written as a short run of
; NOPs (90h) carrying 66h prefixes, at most four bytes per NOP, instead of
; a string of single-byte NOPs. When a = 0 nothing is emitted.
macro AMDPad16
{
    virtual
        align 16
        a = $-$$
    end virtual
    if a = 1
        db 90h
    else if a = 2
        db 66h,90h
    else if a = 3
        db 66h,66h,90h
    else if a = 4
        db 66h,66h,66h,90h
    else if a = 5
        db 66h,66h,90h,66h,90h
    else if a = 6
        db 66h,66h,90h,66h,66h,90h
    else if a = 7
        db 66h,66h,66h,90h,66h,66h,90h
    else if a = 8
        db 66h,66h,66h,90h,66h,66h,66h,90h
    else if a = 9
        db 66h,66h,90h,66h,66h,90h,66h,66h,90h
    else if a = 10
        db 66h,66h,66h,90h,66h,66h,90h,66h,66h,90h
    else if a = 11
        db 66h,66h,66h,90h,66h,66h,66h,90h,66h,66h,90h
    else if a = 12
        db 66h,66h,66h,90h,66h,66h,66h,90h,66h,66h,66h,90h
    else if a = 13
        db 66h,66h,66h,90h,66h,66h,90h,66h,66h,90h,66h,66h,90h
    else if a = 14
        db 66h,66h,66h,90h,66h,66h,66h,90h,66h,66h,90h,66h,66h,90h
    else if a = 15
        db 66h,66h,66h,90h,66h,66h,66h,90h,66h,66h,66h,90h,66h,66h,90h
    end if
}
include '%fasminc%\win64a.inc'
section '.data' data readable writeable

; ---- self-test fixtures ------------------------------------------------
; Text that PROESelfTest encrypts and decrypts in place. It is followed by
; 128 zero bytes so that the 128-byte region the test processes, and the
; string shown afterwards, are always zero-terminated inside this buffer.
align 16
  StrTestBuffer db 'This is a PROE encryption test ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789',0
                dq 16 dup (0)
; Two key+checksum structures (128-byte key followed by a 16-byte checksum):
; one for the encrypt pass of the self-test and one for the decrypt pass.
align 16
  TestKeyHash1  dq 18 dup (0)
align 16
  TestKeyHash2  dq 18 dup (0)
; Captions for the self-test message box.
  StrPassed     db 'Working Properly',0
  StrFailed     db 'Not Working Properly',0

; ---- per-qword rotate count tables --------------------------------------
; Both tables are indexed by a key byte (0..255), one qword per entry:
;   RollLUT [i] = i and 63
;   RollLUTI[i] = 64 - (i and 63)
; ROLLLUT_SIZE is the size of one table in bytes (256 entries * 8), i.e. the
; distance from an entry of RollLUT to the matching entry of RollLUTI.
align 16
ROLLLUT_SIZE equ 2048
RollLUT:
        times 256 dq ((%-1) and 63)
RollLUTI:
        times 256 dq 64 - ((%-1) and 63)
section '.code' code readable executable
; DLL entry point, named by the `entry DllEntryPoint` directive at the top of
; the file. The three arguments are declared for the proc macro but are never
; read: the routine does no per-process or per-thread work and always
; reports success.
; NOTE(review): proc/endp are macros supplied by the included win64a.inc;
; the prologue/epilogue they generate is not visible in this file.
proc DllEntryPoint hinstDLL,fdwReason,lpvReserved
        mov     eax,TRUE                ; unconditional success, whatever fdwReason is
        ret
endp
AMDPad16
;;;---------------------------------------------------------------------
;;; PROEValidateParameters
;;; Checks a set of arguments against the contract of the PROE
;;; encrypt/decrypt routines without touching any memory.
;;;
;;;PARAMETERS   RCX: buffer address, 16 byte aligned
;;;             RDX: buffer length in bytes, non-zero multiple of 64
;;;             R8 : key and hash address, 16 byte aligned, 128+16 bytes in size
;;;             R9 : number of passes, greater than or equal to 1, recommended 2
;;;RETURN       EAX: -1=Fail 0=Success
;;;CLOBBERS     RAX, flags (leaf routine, no stack frame)
;;;---------------------------------------------------------------------
PROEValidateParameters:
        cmp     r9,1                    ; pass count is treated as signed: 0 and
        jl      .fail                   ; "negative" counts are both rejected
        test    rdx,rdx                 ; a zero length would make the cipher loops
        jz      .fail                   ; process one block in front of the buffer
        test    rdx,63                  ; length must be a whole number of 64-byte blocks
        jnz     .fail
        test    rcx,rcx                 ; null buffer
        jz      .fail
        test    cl,15                   ; buffer is accessed with movdqa: must be
        jnz     .fail                   ; 16-byte aligned (this also rejects -1)
        test    r8,r8                   ; null key
        jz      .fail
        test    r8b,15                  ; key/hash is accessed with movdqa as well
        jnz     .fail                   ; (this also rejects -1)
        xor     eax,eax                 ; all checks passed
        ret     0
   .fail:
        mov     eax,-1
        ret     0
AMDPad16
;;;---------------------------------------------------------------------
;;; PROESelfTest
;;; Round-trip check of the checksum variants of the cipher:
;;;   1. fill TestKeyHash1 with a generated key, clear its checksum,
;;;      encrypt the first 128 bytes of StrTestBuffer in place (9 passes);
;;;   2. fill TestKeyHash2 with the same generated key, clear its checksum,
;;;      decrypt the same 128 bytes in place (9 passes);
;;;   3. compare the two 16-byte checksums and report the outcome in a
;;;      message box whose text is the (decrypted) test buffer.
;;;
;;;PARAMETERS   none
;;;RETURN       EAX: -1=Failed 0=Success
;;;CLOBBERS     volatile registers only (RAX, RCX, RDX, R8-R11, flags) plus
;;;             whatever the called cipher routines and MessageBox clobber.
;;;STACK        8*7 bytes: 32 bytes of shadow space for the calls, the rest
;;;             keeps RSP 16-byte aligned at each call site.
;;;---------------------------------------------------------------------
PROESelfTest:
        sub     rsp,8*7
;;;build key 1: 32 dwords written from offset 124 down to 0
        lea     r8,[TestKeyHash1]
        mov     r10,0xabcdef89          ; generator seed (scratch register, volatile)
        mov     rcx,124
   .keyfill1:
        mov     dword[r8+rcx],r10d
        add     r10,r10                 ; next = 2*current + address of TestKeyHash1
        add     r10,r8
        sub     rcx,4
        jns     .keyfill1
        xor     ecx,ecx
        mov     [r8+128],rcx            ; checksum (16 bytes after the key) starts at zero
        mov     [r8+136],rcx
        lea     rcx,[StrTestBuffer]     ; RCX = buffer
        mov     rdx,128                 ; RDX = length, two 64-byte blocks
        mov     r9,9                    ; R9  = passes (R8 = key, still set)
        call    PROEEncryptBufferWithChecksum
;;;build key 2 with the same generator; R9 supplies the address of
;;;TestKeyHash1 so that both keys come out identical
        lea     r9,[TestKeyHash1]
        lea     r8,[TestKeyHash2]
        mov     r10,0xabcdef89
        mov     rcx,124
   .keyfill2:
        mov     dword[r8+rcx],r10d
        add     r10,r10
        add     r10,r9
        sub     rcx,4
        jns     .keyfill2
        xor     ecx,ecx
        mov     [r8+128],rcx
        mov     [r8+136],rcx
        lea     rcx,[StrTestBuffer]
        mov     rdx,128
        mov     r9,9
        call    PROEDecryptBufferWithChecksum
;;;the checksum produced while encrypting must equal the one produced
;;;while decrypting
        mov     rcx,[TestKeyHash1+128]
        mov     rdx,[TestKeyHash1+136]
        cmp     rcx,[TestKeyHash2+128]
        jne     .error
        cmp     rdx,[TestKeyHash2+136]
        jne     .error
        xor     ecx,ecx                 ; hWnd    = none
        lea     rdx,[StrTestBuffer]     ; text    = decrypted buffer
        lea     r8,[StrPassed]          ; caption = pass message
        mov     r9,1                    ; uType   = 1
        call    [MessageBox]
        xor     eax,eax
        add     rsp,8*7
        ret     0
   .error:
        xor     ecx,ecx                 ; hWnd    = none
        lea     rdx,[StrTestBuffer]     ; text    = buffer as left by the round trip
        lea     r8,[StrFailed]          ; caption = fail message
        mov     r9,1                    ; uType   = 1
        call    [MessageBox]
        mov     eax,-1
        add     rsp,8*7
        ret     0
;;; Byte offsets of consecutive 16-byte (dqword) lanes.
;;; DQ1..DQ8 address the eight lanes of the 128-byte key ([rdi+DQn]);
;;; DQ1..DQ4 are also used for the four lanes of a 64-byte data block
;;; ([rsi+DQn]).
DQ1     equ 0
DQ2     equ 16
DQ3     equ 32
DQ4     equ 48
DQ5     equ 64
DQ6     equ 80
DQ7     equ 96
DQ8     equ 112
;;;STRUCTURE    Key and Hash, 16 byte aligned
;;;             128 bytes of key (lanes DQ1..DQ8) followed by a 16 byte
;;;             hash/checksum at offset 128
AMDPad16
;;;---------------------------------------------------------------------
;;; PROEEncryptBufferWithChecksum
;;; Encrypts a buffer in place, 64 bytes at a time, walking from the last
;;; block of the buffer toward the first. For every block:
;;;   1. a key byte selects the rotate count for this block and advances
;;;      the key index (RAX, kept in 0..127);
;;;   2. the 16-byte checksum at key+128 is updated from the plaintext
;;;      block and the first 16 key bytes;
;;;   3. the key is stirred and a 64-byte keystream derived from it is
;;;      XORed into the block, once per pass;
;;;   4. every qword of the block is rotated right by the selected count.
;;; The block loop is unrolled twice; both halves expand the same macros.
;;;
;;;PARAMETERS   RCX: buffer address, 16byte aligned
;;;             RDX: buffer length in bytes, multiple of 64
;;;             R8 : key and checksum address, 16 byte aligned, 128+16 bytes in size
;;;             R9 : number of passes, greater than or equal to 1, recommended 2
;;;RETURN       none defined. A zero length or a zero pass count returns
;;;             immediately with buffer, key and checksum untouched.
;;;PRESERVES    RBX, RSI, RDI, R13, XMM6-XMM15 (saved in the local frame)
;;;CLOBBERS     RAX, RCX, RDX, R8, XMM0-XMM5, flags
;;;---------------------------------------------------------------------

;;;local frame layout (offsets from RSP after the prologue)
PROEENC_XMMSAVE = 0                     ; 10 * 16 bytes: XMM6..XMM15
PROEENC_GPRSAVE = 16*10                 ; 4 * 8 bytes: RBX, RSI, RDI, R13
PROEENC_FRAME   = 16*10+8*5             ; one spare slot keeps RSP 16-byte aligned

;;;Select the next block and fold it into the checksum.
;;;in:  RDI = key, RAX = key index, RSI = end of the next block, R13 = passes
;;;out: R8 = key byte (rotate table index), RBX = pass counter,
;;;     RSI = start of the block, checksum at [rdi+128] updated
macro PROEEnc_BeginBlock
{
        movzx   r8,byte[rdi+rax]        ; key byte at the current index
        mov     rbx,r13
        add     rax,r8                  ; index = (index + key byte) and 127
        sub     rsi,64
        and     eax,127
;;update hash
        movdqa  xmm0,[rsi+DQ1]
        movdqa  xmm1,[rsi+DQ2]
        movdqa  xmm2,[rsi+DQ3]
        movdqa  xmm3,[rsi+DQ4]
        movdqa  xmm8,dqword[rdi+128]    ; current checksum
        pshufd  xmm4,xmm0,00111001b
        pshufd  xmm5,xmm1,00111001b
        pshufd  xmm6,xmm2,00111001b
        pshufd  xmm7,xmm3,00111001b
        movdqa  xmm9,xmm8
        pxor    xmm7,xmm0
        pxor    xmm6,xmm1
        psllq   xmm8,3
        paddb   xmm9,dqword[rdi]        ; mix in the first 16 key bytes
        pxor    xmm5,xmm2
        pxor    xmm4,xmm3
        pxor    xmm7,xmm6
        pxor    xmm5,xmm4
        paddb   xmm8,xmm7
        paddb   xmm9,xmm5
        pxor    xmm8,xmm9
        movdqa  [rdi+128],xmm8
}

;;;One pass: stir the 128-byte key at RDI in place, derive 64 keystream
;;;bytes from it and XOR them into the block at RSI.
;;;out: XMM0..XMM3 = the block as stored; XMM4..XMM15 are scratch
macro PROEEnc_KeyRound
{
;;setup: key lanes 1..8 into xmm0..xmm7
        movdqa  xmm0, [rdi+DQ1]
        movdqa  xmm1, [rdi+DQ2]
        movdqa  xmm2, [rdi+DQ3]
        movdqa  xmm3, [rdi+DQ4]
        movdqa  xmm4, [rdi+DQ5]
        movdqa  xmm5, [rdi+DQ6]
        movdqa  xmm6, [rdi+DQ7]
        movdqa  xmm7, [rdi+DQ8]
;;copy
        movdqa  xmm8, xmm0
        movdqa  xmm9, xmm1
        movdqa  xmm10, xmm2
        movdqa  xmm11, xmm3
        movdqa  xmm12, xmm4
        movdqa  xmm13, xmm5
        movdqa  xmm14, xmm6
        movdqa  xmm15, xmm7
;;shift right logical 63 bits: each qword keeps only its highest bit
        psrlq   xmm8, 63
        psrlq   xmm9, 63
        psrlq   xmm10, 63
        psrlq   xmm11, 63
        psrlq   xmm12, 63
        psrlq   xmm13, 63
        psrlq   xmm14, 63
        psrlq   xmm15, 63
;;lanes 5..8: shift left by one, dropping the highest bit of each qword
        psllq   xmm4, 1
        psllq   xmm5, 1
        psllq   xmm6, 1
        psllq   xmm7, 1
;;lanes 1..4: add the highest bits taken from lanes 5..8
        paddq   xmm0, xmm12
        paddq   xmm1, xmm13
        paddq   xmm2, xmm14
        paddq   xmm3, xmm15
;;lanes 5..8: the highest bits of lanes 1..4 become the lowest bits
        por     xmm4, xmm8
        por     xmm5, xmm9
        por     xmm6, xmm10
        por     xmm7, xmm11
;;copy
        movdqa  xmm8, xmm0
        movdqa  xmm9, xmm1
        movdqa  xmm10, xmm2
        movdqa  xmm11, xmm3
;;lanes 1..4: rotate every qword left by the primes 7, 5, 3, 11
        psllq   xmm0, 7
        psllq   xmm1, 5
        psllq   xmm2, 3
        psllq   xmm3, 11
        psrlq   xmm8, 57                ; 64-7
        psrlq   xmm9, 59                ; 64-5
        psrlq   xmm10, 61               ; 64-3
        psrlq   xmm11, 53               ; 64-11
        por     xmm0, xmm8
        por     xmm1, xmm9
        por     xmm2, xmm10
        por     xmm3, xmm11
;;dword order switching: result dwords 0,1,2,3 = source dwords 1,2,3,0
        pshufd  xmm0, xmm0, 00111001b
        pshufd  xmm1, xmm1, 00111001b
        pshufd  xmm2, xmm2, 00111001b
        pshufd  xmm3, xmm3, 00111001b
;;combine with the old key: byte-wise add for the first four stores,
;;xor for the last four
        paddb   xmm1,[rdi+DQ1]
        paddb   xmm2,[rdi+DQ2]
        paddb   xmm3,[rdi+DQ3]
        paddb   xmm4,[rdi+DQ4]
        pxor    xmm5,[rdi+DQ5]
        pxor    xmm6,[rdi+DQ6]
        pxor    xmm7,[rdi+DQ7]
        pxor    xmm0,[rdi+DQ8]
;;store the new key with the lanes rotated by one position
        movdqa  [rdi+DQ1], xmm1
        movdqa  [rdi+DQ2], xmm2
        movdqa  [rdi+DQ3], xmm3
        movdqa  [rdi+DQ4], xmm4
        movdqa  [rdi+DQ5], xmm5
        movdqa  [rdi+DQ6], xmm6
        movdqa  [rdi+DQ7], xmm7
        movdqa  [rdi+DQ8], xmm0
;;prepare output: derive the keystream from the new key
        ;copy
        movdqa  xmm8, xmm0
        movdqa  xmm9, xmm1
        movdqa  xmm10, xmm2
        movdqa  xmm11, xmm3
        movdqa  xmm12, xmm4
        movdqa  xmm13, xmm5
        movdqa  xmm14, xmm6
        movdqa  xmm15, xmm7
        ;shift 16-bit words by primes
        psllw   xmm0, 3
        psllw   xmm1, 5
        psllw   xmm2, 7
        psllw   xmm3, 11
        ;
        psrlw   xmm8, 5
        psrlw   xmm9, 7
        psrlw   xmm10, 11
        psrlw   xmm11, 3
        ;
        psllw   xmm4, 7
        psllw   xmm5, 11
        psllw   xmm6, 3
        psllw   xmm7, 5
        ;
        psrlw   xmm12, 11
        psrlw   xmm13, 3
        psrlw   xmm14, 5
        psrlw   xmm15, 7
        ;add / xor: fold the eight lanes down to four
        pxor    xmm0, xmm8
        pxor    xmm1, xmm9
        pxor    xmm2, xmm10
        pxor    xmm3, xmm11
        paddb   xmm4, xmm12
        paddb   xmm5, xmm13
        paddb   xmm6, xmm14
        paddb   xmm7, xmm15
        pxor    xmm0, xmm4
        pxor    xmm1, xmm5
        pxor    xmm2, xmm6
        paddb   xmm3, xmm7
;;;xor the 64 keystream bytes with the block
        pxor    xmm0,dqword[rsi+DQ1]
        pxor    xmm1,dqword[rsi+DQ2]
        pxor    xmm2,dqword[rsi+DQ3]
        pxor    xmm3,dqword[rsi+DQ4]
;;;modify block
        movdqa  dqword[rsi+DQ1],xmm0
        movdqa  dqword[rsi+DQ2],xmm1
        movdqa  dqword[rsi+DQ3],xmm2
        movdqa  dqword[rsi+DQ4],xmm3
}

;;;Rotate every qword of the block right by RollLUT[R8] bits and store it.
;;;in:  XMM0..XMM3 = block (as left by the last key round), RCX = RollLUT,
;;;     R8 = table index, RSI = block address
macro PROEEnc_RotateRight
{
        movdqa  xmm4,xmm0
        movdqa  xmm5,xmm1
        movdqa  xmm6,xmm2
        movdqa  xmm7,xmm3
        movq    xmm8,[rcx+r8*8]                 ; n      (RollLUT)
        movq    xmm9,[rcx+r8*8+ROLLLUT_SIZE]    ; 64 - n (RollLUTI)
        psrlq   xmm0,xmm8
        psrlq   xmm1,xmm8
        psrlq   xmm2,xmm8
        psrlq   xmm3,xmm8
        psllq   xmm4,xmm9
        psllq   xmm5,xmm9
        psllq   xmm6,xmm9
        psllq   xmm7,xmm9
        por     xmm0,xmm4
        por     xmm1,xmm5
        por     xmm2,xmm6
        por     xmm3,xmm7
        movdqa  [rsi+DQ1],xmm0
        movdqa  [rsi+DQ2],xmm1
        movdqa  [rsi+DQ3],xmm2
        movdqa  [rsi+DQ4],xmm3
}

PROEEncryptBufferWithChecksum:
;;;reject arguments the loops below cannot handle: a zero length would
;;;process a block in front of the buffer, a zero pass count would make
;;;the dec/jnz pass loop wrap around
        test    rdx,rdx
        jz      .reject
        test    r9,r9
        jnz     .accept
   .reject:
        ret     0
   .accept:
        sub     rsp,PROEENC_FRAME
        mov     [rsp+PROEENC_GPRSAVE+8*0],rbx
        mov     [rsp+PROEENC_GPRSAVE+8*1],rsi
        mov     [rsp+PROEENC_GPRSAVE+8*2],rdi
        mov     [rsp+PROEENC_GPRSAVE+8*3],r13
;;;XMM6..XMM15 are callee-saved in the Win64 ABI and all of them are used
;;;below; movdqu keeps the save independent of the caller's stack alignment
        movdqu  [rsp+PROEENC_XMMSAVE+16*0],xmm6
        movdqu  [rsp+PROEENC_XMMSAVE+16*1],xmm7
        movdqu  [rsp+PROEENC_XMMSAVE+16*2],xmm8
        movdqu  [rsp+PROEENC_XMMSAVE+16*3],xmm9
        movdqu  [rsp+PROEENC_XMMSAVE+16*4],xmm10
        movdqu  [rsp+PROEENC_XMMSAVE+16*5],xmm11
        movdqu  [rsp+PROEENC_XMMSAVE+16*6],xmm12
        movdqu  [rsp+PROEENC_XMMSAVE+16*7],xmm13
        movdqu  [rsp+PROEENC_XMMSAVE+16*8],xmm14
        movdqu  [rsp+PROEENC_XMMSAVE+16*9],xmm15
        mov     rdi,r8                  ; RDI = key and checksum
        mov     r13,r9                  ; R13 = passes per block
;;;hash and encrypt the block of data
        lea     rsi,[rcx+rdx]           ; RSI = end of buffer, moves down 64 per block
        sub     rdx,64                  ; RDX = bytes left after the current block
        xor     eax,eax                 ; RAX = key index, starts at 0
        lea     rcx,[RollLUT]           ; RCX = rotate count tables
AMDPad16
   .encryptBlock:
        PROEEnc_BeginBlock
AMDPad16
   .encryptBlock2:
        PROEEnc_KeyRound
        dec     rbx
        jnz     .encryptBlock2
;;;buffer bit shuffling (ROLling) using values from the key
        PROEEnc_RotateRight
;;;
        sub     rdx,64
        js      .endEncryptBlock
;;;UNROLL ----------------------------
        PROEEnc_BeginBlock
AMDPad16
   .encryptBlock2_2:
        PROEEnc_KeyRound
        dec     rbx
        jnz     .encryptBlock2_2
;;;buffer bit shuffling (ROLling) using values from the key
        PROEEnc_RotateRight
;;;
        sub     rdx,64
        jns     .encryptBlock
   .endEncryptBlock:
        movdqu  xmm6,[rsp+PROEENC_XMMSAVE+16*0]
        movdqu  xmm7,[rsp+PROEENC_XMMSAVE+16*1]
        movdqu  xmm8,[rsp+PROEENC_XMMSAVE+16*2]
        movdqu  xmm9,[rsp+PROEENC_XMMSAVE+16*3]
        movdqu  xmm10,[rsp+PROEENC_XMMSAVE+16*4]
        movdqu  xmm11,[rsp+PROEENC_XMMSAVE+16*5]
        movdqu  xmm12,[rsp+PROEENC_XMMSAVE+16*6]
        movdqu  xmm13,[rsp+PROEENC_XMMSAVE+16*7]
        movdqu  xmm14,[rsp+PROEENC_XMMSAVE+16*8]
        movdqu  xmm15,[rsp+PROEENC_XMMSAVE+16*9]
        mov     rbx,[rsp+PROEENC_GPRSAVE+8*0]
        mov     rsi,[rsp+PROEENC_GPRSAVE+8*1]
        mov     rdi,[rsp+PROEENC_GPRSAVE+8*2]
        mov     r13,[rsp+PROEENC_GPRSAVE+8*3]
        add     rsp,PROEENC_FRAME
        ret     0
AMDPad16
;;;---------------------------------------------------------------------
;;; PROEDecryptBufferWithChecksum
;;; Decrypts a buffer in place, 64 bytes at a time, walking from the last
;;; block of the buffer toward the first. For every block:
;;;   1. the first 16 key bytes are set aside for the checksum, and a key
;;;      byte selects the rotate count and advances the key index (RAX,
;;;      kept in 0..127);
;;;   2. every qword of the block is rotated left by the selected count;
;;;   3. the key is stirred and a 64-byte keystream derived from it is
;;;      XORed into the block, once per pass;
;;;   4. the 16-byte checksum at key+128 is updated from the resulting
;;;      block and the key bytes set aside in step 1.
;;; The block loop is unrolled twice; both halves expand the same macros.
;;;
;;;PARAMETERS   RCX: buffer address, 16byte aligned
;;;             RDX: buffer length in bytes, multiple of 64
;;;             R8 : key and checksum address, 16 byte aligned, 128+16 bytes in size
;;;             R9 : number of passes, greater than or equal to 1, recommended 2
;;;RETURN       none defined. A zero length or a zero pass count returns
;;;             immediately with buffer, key and checksum untouched.
;;;PRESERVES    RBX, RSI, RDI, R12, XMM6-XMM15 (saved in the local frame)
;;;CLOBBERS     RAX, RCX, RDX, R8, XMM0-XMM5, flags
;;;---------------------------------------------------------------------

;;;local frame layout (offsets from RSP after the prologue)
PROEDEC_KEYSAVE = 0                     ; 16 bytes: key lane 1 as it was before the passes
PROEDEC_XMMSAVE = 16                    ; 10 * 16 bytes: XMM6..XMM15
PROEDEC_GPRSAVE = 16*11                 ; 4 * 8 bytes: RBX, RSI, RDI, R12
PROEDEC_FRAME   = 16*11+8*5             ; one spare slot keeps RSP 16-byte aligned

;;;Select the next block and undo its bit rotation.
;;;in:  RDI = key, RAX = key index, RSI = end of the next block,
;;;     R12 = passes, RCX = RollLUT
;;;out: R8 = key byte (rotate table index), RBX = pass counter,
;;;     RSI = start of the block, block stored rotated left by RollLUT[R8],
;;;     first 16 key bytes copied to the frame
macro PROEDec_BeginBlock
{
        movdqa  xmm0,dqword[rdi]        ; key lane 1 before it is stirred
        movzx   r8,byte[rdi+rax]        ; key byte at the current index
        mov     rbx,r12
        add     rax,r8                  ; index = (index + key byte) and 127
        movdqu  dqword[rsp+PROEDEC_KEYSAVE],xmm0 ; kept inside the frame, any alignment
        and     eax,127
        sub     rsi,64
;;;buffer bit shuffling (ROLling) using values from the key
        movdqa  xmm0,[rsi+DQ1]
        movdqa  xmm1,[rsi+DQ2]
        movdqa  xmm2,[rsi+DQ3]
        movdqa  xmm3,[rsi+DQ4]
        movdqa  xmm4,xmm0
        movdqa  xmm5,xmm1
        movdqa  xmm6,xmm2
        movdqa  xmm7,xmm3
        movq    xmm8,[rcx+r8*8]                 ; n      (RollLUT)
        movq    xmm9,[rcx+r8*8+ROLLLUT_SIZE]    ; 64 - n (RollLUTI)
        psllq   xmm0,xmm8
        psllq   xmm1,xmm8
        psllq   xmm2,xmm8
        psllq   xmm3,xmm8
        psrlq   xmm4,xmm9
        psrlq   xmm5,xmm9
        psrlq   xmm6,xmm9
        psrlq   xmm7,xmm9
        por     xmm0,xmm4
        por     xmm1,xmm5
        por     xmm2,xmm6
        por     xmm3,xmm7
        movdqa  [rsi+DQ1],xmm0
        movdqa  [rsi+DQ2],xmm1
        movdqa  [rsi+DQ3],xmm2
        movdqa  [rsi+DQ4],xmm3
}

;;;One pass: stir the 128-byte key at RDI in place, derive 64 keystream
;;;bytes from it and XOR them into the block at RSI.
;;;out: XMM0..XMM3 = the block as stored; XMM4..XMM15 are scratch
macro PROEDec_KeyRound
{
;;setup: key lanes 1..8 into xmm0..xmm7
        movdqa  xmm0, [rdi+DQ1]
        movdqa  xmm1, [rdi+DQ2]
        movdqa  xmm2, [rdi+DQ3]
        movdqa  xmm3, [rdi+DQ4]
        movdqa  xmm4, [rdi+DQ5]
        movdqa  xmm5, [rdi+DQ6]
        movdqa  xmm6, [rdi+DQ7]
        movdqa  xmm7, [rdi+DQ8]
;;copy
        movdqa  xmm8, xmm0
        movdqa  xmm9, xmm1
        movdqa  xmm10, xmm2
        movdqa  xmm11, xmm3
        movdqa  xmm12, xmm4
        movdqa  xmm13, xmm5
        movdqa  xmm14, xmm6
        movdqa  xmm15, xmm7
;;shift right logical 63 bits: each qword keeps only its highest bit
        psrlq   xmm8, 63
        psrlq   xmm9, 63
        psrlq   xmm10, 63
        psrlq   xmm11, 63
        psrlq   xmm12, 63
        psrlq   xmm13, 63
        psrlq   xmm14, 63
        psrlq   xmm15, 63
;;lanes 5..8: shift left by one, dropping the highest bit of each qword
        psllq   xmm4, 1
        psllq   xmm5, 1
        psllq   xmm6, 1
        psllq   xmm7, 1
;;lanes 1..4: add the highest bits taken from lanes 5..8
        paddq   xmm0, xmm12
        paddq   xmm1, xmm13
        paddq   xmm2, xmm14
        paddq   xmm3, xmm15
;;lanes 5..8: the highest bits of lanes 1..4 become the lowest bits
        por     xmm4, xmm8
        por     xmm5, xmm9
        por     xmm6, xmm10
        por     xmm7, xmm11
;;copy
        movdqa  xmm8, xmm0
        movdqa  xmm9, xmm1
        movdqa  xmm10, xmm2
        movdqa  xmm11, xmm3
;;lanes 1..4: rotate every qword left by the primes 7, 5, 3, 11
        psllq   xmm0, 7
        psllq   xmm1, 5
        psllq   xmm2, 3
        psllq   xmm3, 11
        psrlq   xmm8, 57                ; 64-7
        psrlq   xmm9, 59                ; 64-5
        psrlq   xmm10, 61               ; 64-3
        psrlq   xmm11, 53               ; 64-11
        por     xmm0, xmm8
        por     xmm1, xmm9
        por     xmm2, xmm10
        por     xmm3, xmm11
;;dword order switching: result dwords 0,1,2,3 = source dwords 1,2,3,0
        pshufd  xmm0, xmm0, 00111001b
        pshufd  xmm1, xmm1, 00111001b
        pshufd  xmm2, xmm2, 00111001b
        pshufd  xmm3, xmm3, 00111001b
;;combine with the old key: byte-wise add for the first four stores,
;;xor for the last four
        paddb   xmm1,[rdi+DQ1]
        paddb   xmm2,[rdi+DQ2]
        paddb   xmm3,[rdi+DQ3]
        paddb   xmm4,[rdi+DQ4]
        pxor    xmm5,[rdi+DQ5]
        pxor    xmm6,[rdi+DQ6]
        pxor    xmm7,[rdi+DQ7]
        pxor    xmm0,[rdi+DQ8]
;;store the new key with the lanes rotated by one position
        movdqa  [rdi+DQ1], xmm1
        movdqa  [rdi+DQ2], xmm2
        movdqa  [rdi+DQ3], xmm3
        movdqa  [rdi+DQ4], xmm4
        movdqa  [rdi+DQ5], xmm5
        movdqa  [rdi+DQ6], xmm6
        movdqa  [rdi+DQ7], xmm7
        movdqa  [rdi+DQ8], xmm0
;;prepare output: derive the keystream from the new key
        ;copy
        movdqa  xmm8, xmm0
        movdqa  xmm9, xmm1
        movdqa  xmm10, xmm2
        movdqa  xmm11, xmm3
        movdqa  xmm12, xmm4
        movdqa  xmm13, xmm5
        movdqa  xmm14, xmm6
        movdqa  xmm15, xmm7
        ;shift 16-bit words by primes
        psllw   xmm0, 3
        psllw   xmm1, 5
        psllw   xmm2, 7
        psllw   xmm3, 11
        ;
        psrlw   xmm8, 5
        psrlw   xmm9, 7
        psrlw   xmm10, 11
        psrlw   xmm11, 3
        ;
        psllw   xmm4, 7
        psllw   xmm5, 11
        psllw   xmm6, 3
        psllw   xmm7, 5
        ;
        psrlw   xmm12, 11
        psrlw   xmm13, 3
        psrlw   xmm14, 5
        psrlw   xmm15, 7
        ;add / xor: fold the eight lanes down to four
        pxor    xmm0, xmm8
        pxor    xmm1, xmm9
        pxor    xmm2, xmm10
        pxor    xmm3, xmm11
        paddb   xmm4, xmm12
        paddb   xmm5, xmm13
        paddb   xmm6, xmm14
        paddb   xmm7, xmm15
        pxor    xmm0, xmm4
        pxor    xmm1, xmm5
        pxor    xmm2, xmm6
        paddb   xmm3, xmm7
;;;xor the 64 keystream bytes with the block
        pxor    xmm0,dqword[rsi+DQ1]
        pxor    xmm1,dqword[rsi+DQ2]
        pxor    xmm2,dqword[rsi+DQ3]
        pxor    xmm3,dqword[rsi+DQ4]
;;;modify block
        movdqa  dqword[rsi+DQ1],xmm0
        movdqa  dqword[rsi+DQ2],xmm1
        movdqa  dqword[rsi+DQ3],xmm2
        movdqa  dqword[rsi+DQ4],xmm3
}

;;;Fold the block into the checksum at [rdi+128].
;;;in:  XMM0..XMM3 = block as left by the last key round,
;;;     frame slot PROEDEC_KEYSAVE = key lane 1 from before the passes
;;;XMM10 is free at this point and carries the saved key lane.
macro PROEDec_HashBlock
{
        movdqa  xmm8,dqword[rdi+128]    ; current checksum
        pshufd  xmm4,xmm0,00111001b
        pshufd  xmm5,xmm1,00111001b
        pshufd  xmm6,xmm2,00111001b
        pshufd  xmm7,xmm3,00111001b
        movdqa  xmm9,xmm8
        movdqu  xmm10,dqword[rsp+PROEDEC_KEYSAVE]
        pxor    xmm7,xmm0
        pxor    xmm6,xmm1
        psllq   xmm8,3
        paddb   xmm9,xmm10              ; mix in the saved first 16 key bytes
        pxor    xmm5,xmm2
        pxor    xmm4,xmm3
        pxor    xmm7,xmm6
        pxor    xmm5,xmm4
        paddb   xmm8,xmm7
        paddb   xmm9,xmm5
        pxor    xmm8,xmm9
        movdqa  [rdi+128],xmm8
}

PROEDecryptBufferWithChecksum:
;;;reject arguments the loops below cannot handle: a zero length would
;;;process a block in front of the buffer, a zero pass count would make
;;;the dec/jnz pass loop wrap around
        test    rdx,rdx
        jz      .reject
        test    r9,r9
        jnz     .accept
   .reject:
        ret     0
   .accept:
        sub     rsp,PROEDEC_FRAME
        mov     [rsp+PROEDEC_GPRSAVE+8*0],rbx
        mov     [rsp+PROEDEC_GPRSAVE+8*1],rsi
        mov     [rsp+PROEDEC_GPRSAVE+8*2],rdi
        mov     [rsp+PROEDEC_GPRSAVE+8*3],r12
;;;XMM6..XMM15 are callee-saved in the Win64 ABI and all of them are used
;;;below; movdqu keeps the save independent of the caller's stack alignment
        movdqu  [rsp+PROEDEC_XMMSAVE+16*0],xmm6
        movdqu  [rsp+PROEDEC_XMMSAVE+16*1],xmm7
        movdqu  [rsp+PROEDEC_XMMSAVE+16*2],xmm8
        movdqu  [rsp+PROEDEC_XMMSAVE+16*3],xmm9
        movdqu  [rsp+PROEDEC_XMMSAVE+16*4],xmm10
        movdqu  [rsp+PROEDEC_XMMSAVE+16*5],xmm11
        movdqu  [rsp+PROEDEC_XMMSAVE+16*6],xmm12
        movdqu  [rsp+PROEDEC_XMMSAVE+16*7],xmm13
        movdqu  [rsp+PROEDEC_XMMSAVE+16*8],xmm14
        movdqu  [rsp+PROEDEC_XMMSAVE+16*9],xmm15
        mov     rdi,r8                  ; RDI = key and checksum
        mov     r12,r9                  ; R12 = passes per block
;;;decrypt and hash the block of data
        lea     rsi,[rcx+rdx]           ; RSI = end of buffer, moves down 64 per block
        sub     rdx,64                  ; RDX = bytes left after the current block
        xor     eax,eax                 ; RAX = key index, starts at 0
        lea     rcx,[RollLUT]           ; RCX = rotate count tables
AMDPad16
   .decryptBlock:
        PROEDec_BeginBlock
AMDPad16
   .decryptBlock2:
        PROEDec_KeyRound
        dec     rbx
        jnz     .decryptBlock2
;;update hash
        PROEDec_HashBlock
;;;
        sub     rdx,64
        js      .endDecryptBlock
;;;UNROLL --------------------------------------
        PROEDec_BeginBlock
AMDPad16
   .decryptBlock2_2:
        PROEDec_KeyRound
        dec     rbx
        jnz     .decryptBlock2_2
;;update hash
        PROEDec_HashBlock
;;;
        sub     rdx,64
        jns     .decryptBlock
   .endDecryptBlock:
        movdqu  xmm6,[rsp+PROEDEC_XMMSAVE+16*0]
        movdqu  xmm7,[rsp+PROEDEC_XMMSAVE+16*1]
        movdqu  xmm8,[rsp+PROEDEC_XMMSAVE+16*2]
        movdqu  xmm9,[rsp+PROEDEC_XMMSAVE+16*3]
        movdqu  xmm10,[rsp+PROEDEC_XMMSAVE+16*4]
        movdqu  xmm11,[rsp+PROEDEC_XMMSAVE+16*5]
        movdqu  xmm12,[rsp+PROEDEC_XMMSAVE+16*6]
        movdqu  xmm13,[rsp+PROEDEC_XMMSAVE+16*7]
        movdqu  xmm14,[rsp+PROEDEC_XMMSAVE+16*8]
        movdqu  xmm15,[rsp+PROEDEC_XMMSAVE+16*9]
        mov     rbx,[rsp+PROEDEC_GPRSAVE+8*0]
        mov     rsi,[rsp+PROEDEC_GPRSAVE+8*1]
        mov     rdi,[rsp+PROEDEC_GPRSAVE+8*2]
        mov     r12,[rsp+PROEDEC_GPRSAVE+8*3]
        add     rsp,PROEDEC_FRAME
        ret     0
AMDPad16
;;;PROEEncryptBuffer
;;;  Encrypts a buffer in place, 64 bytes at a time, walking from the last
;;;  block towards the first. For every block the 128-byte key is evolved
;;;  and a 64-byte stream derived from it is XORed into the block, once per
;;;  pass; afterwards every qword of the block is bit-rotated by an amount
;;;  looked up in RollLUT with a key byte. The key is modified in place.
;;;ABI          Microsoft x64
;;;PARAMETERS   RCX: buffer address, 16byte aligned
;;;             RDX: buffer length in bytes, multiple of 64
;;;                  (lengths below 64 return without touching buffer or key)
;;;             R8 : key address, 16 byte aligned, 128 bytes in size
;;;             R9 : number of passes, greater than or equal to 1, recommended 2
;;;                  (0 is not checked for)
;;;RETURNS      nothing documented; RAX is left holding the key byte index
;;;PRESERVES    RBX, RSI, RDI, R13 and XMM6-XMM15 (all nonvolatile in this ABI)
;;;CLOBBERS     RAX, RCX, RDX, R8, XMM0-XMM5, flags
;;;STACK FRAME  200 bytes: [rsp+0..159]   XMM6-XMM15
;;;                        [rsp+160..191] RBX, RSI, RDI, R13
;;;                        [rsp+192..199] padding
PROEEncryptBuffer:
        sub     rsp,200
        mov     [rsp+160+8*0],rbx
        mov     [rsp+160+8*1],rsi
        mov     [rsp+160+8*2],rdi
        mov     [rsp+160+8*3],r13
        ;XMM6-XMM15 are callee-saved on Win64 and are all overwritten below.
        ;movdqu is used so no stack alignment is required of the caller.
        movdqu  dqword[rsp+16*0],xmm6
        movdqu  dqword[rsp+16*1],xmm7
        movdqu  dqword[rsp+16*2],xmm8
        movdqu  dqword[rsp+16*3],xmm9
        movdqu  dqword[rsp+16*4],xmm10
        movdqu  dqword[rsp+16*5],xmm11
        movdqu  dqword[rsp+16*6],xmm12
        movdqu  dqword[rsp+16*7],xmm13
        movdqu  dqword[rsp+16*8],xmm14
        movdqu  dqword[rsp+16*9],xmm15
        mov     rdi,r8                  ;rdi = key
        mov     r13,r9                  ;r13 = passes per block
        xor     eax,eax                 ;rax = index of the key byte to read next
;;;evolve the key and encrypt the buffer block by block, last block first
        lea     rsi,[rcx+rdx]           ;rsi = one past the end of the buffer
        sub     rdx,64                  ;rdx = bytes left after the current block
        js      .endEncryptBuffer       ;less than one whole block: nothing to do
        lea     rcx,[RollLUT]
AMDPad16
   .encryptBlock:
        movzx   r8,byte[rdi+rax]        ;r8 = key byte, selects the rotation below
        mov     rbx,r13                 ;rbx = passes left for this block
        add     rax,r8
        sub     rsi,64                  ;rsi = current block
        and     eax,127                 ;keep the index inside the 128-byte key
AMDPad16
   .encryptBlock2:
;;setup: load the eight 16-byte key words
        movdqa  xmm0, [rdi+DQ1]
        movdqa  xmm1, [rdi+DQ2]
        movdqa  xmm2, [rdi+DQ3]
        movdqa  xmm3, [rdi+DQ4]
        movdqa  xmm4, [rdi+DQ5]
        movdqa  xmm5, [rdi+DQ6]
        movdqa  xmm6, [rdi+DQ7]
        movdqa  xmm7, [rdi+DQ8]
;;copy
        movdqa  xmm8, xmm0
        movdqa  xmm9, xmm1
        movdqa  xmm10, xmm2
        movdqa  xmm11, xmm3
        movdqa  xmm12, xmm4
        movdqa  xmm13, xmm5
        movdqa  xmm14, xmm6
        movdqa  xmm15, xmm7
;;shift right logical 63bits to have the mask of highest bit
        psrlq   xmm8, 63
        psrlq   xmm9, 63
        psrlq   xmm10, 63
        psrlq   xmm11, 63
        psrlq   xmm12, 63
        psrlq   xmm13, 63
        psrlq   xmm14, 63
        psrlq   xmm15, 63
;;shift left to remove the highest bit and empty the lowest
        psllq   xmm4, 1
        psllq   xmm5, 1
        psllq   xmm6, 1
        psllq   xmm7, 1
;;add the high bits of words 5-8 to words 1-4
        paddq   xmm0, xmm12
        paddq   xmm1, xmm13
        paddq   xmm2, xmm14
        paddq   xmm3, xmm15
;;or the high bits of words 1-4 into the emptied low bits of words 5-8
        por     xmm4, xmm8
        por     xmm5, xmm9
        por     xmm6, xmm10
        por     xmm7, xmm11
;;copy
        movdqa  xmm8, xmm0
        movdqa  xmm9, xmm1
        movdqa  xmm10, xmm2
        movdqa  xmm11, xmm3
;;Bit ROLL of each qword by prime numbers 7, 5, 3, 11
        psllq   xmm0, 7
        psllq   xmm1, 5
        psllq   xmm2, 3
        psllq   xmm3, 11
        psrlq   xmm8, 57;64-7
        psrlq   xmm9, 59;64-5
        psrlq   xmm10, 61;64-3
        psrlq   xmm11, 53;64-11
        por     xmm0, xmm8
        por     xmm1, xmm9
        por     xmm2, xmm10
        por     xmm3, xmm11
;;Dword order switching (each dword moves down one position, lowest wraps to the top)
        pshufd  xmm0, xmm0, 00111001b
        pshufd  xmm1, xmm1, 00111001b
        pshufd  xmm2, xmm2, 00111001b
        pshufd  xmm3, xmm3, 00111001b
;;xor/add old key with new key
        paddb   xmm1,[rdi+DQ1]
        paddb   xmm2,[rdi+DQ2]
        paddb   xmm3,[rdi+DQ3]
        paddb   xmm4,[rdi+DQ4]
        pxor    xmm5,[rdi+DQ5]
        pxor    xmm6,[rdi+DQ6]
        pxor    xmm7,[rdi+DQ7]
        pxor    xmm0,[rdi+DQ8]
;;Modify Key with rotation of dq words (xmm0 is stored last, into DQ8)
        movdqa  [rdi+DQ1], xmm1
        movdqa  [rdi+DQ2], xmm2
        movdqa  [rdi+DQ3], xmm3
        movdqa  [rdi+DQ4], xmm4
        movdqa  [rdi+DQ5], xmm5
        movdqa  [rdi+DQ6], xmm6
        movdqa  [rdi+DQ7], xmm7
        movdqa  [rdi+DQ8], xmm0
;;prepare output: fold the eight words down to a 64-byte stream in xmm0-xmm3
        ;copy
        movdqa  xmm8, xmm0
        movdqa  xmm9, xmm1
        movdqa  xmm10, xmm2
        movdqa  xmm11, xmm3
        movdqa  xmm12, xmm4
        movdqa  xmm13, xmm5
        movdqa  xmm14, xmm6
        movdqa  xmm15, xmm7
        ;shift words by primes
        psllw   xmm0, 3
        psllw   xmm1, 5
        psllw   xmm2, 7
        psllw   xmm3, 11
        ;
        psrlw   xmm8, 5
        psrlw   xmm9, 7
        psrlw   xmm10, 11
        psrlw   xmm11, 3
        ;
        psllw   xmm4, 7
        psllw   xmm5, 11
        psllw   xmm6, 3
        psllw   xmm7, 5
        ;
        psrlw   xmm12, 11
        psrlw   xmm13, 3
        psrlw   xmm14, 5
        psrlw   xmm15, 7
        ; add / xor
        pxor    xmm0, xmm8
        pxor    xmm1, xmm9
        pxor    xmm2, xmm10
        pxor    xmm3, xmm11
        paddb   xmm4, xmm12
        paddb   xmm5, xmm13
        paddb   xmm6, xmm14
        paddb   xmm7, xmm15
        pxor    xmm0, xmm4
        pxor    xmm1, xmm5
        pxor    xmm2, xmm6
        paddb   xmm3, xmm7
;;;xor random 64bytes with block
        pxor    xmm0,dqword[rsi+DQ1]
        pxor    xmm1,dqword[rsi+DQ2]
        pxor    xmm2,dqword[rsi+DQ3]
        pxor    xmm3,dqword[rsi+DQ4]
;;;modify block
        movdqa  dqword[rsi+DQ1],xmm0
        movdqa  dqword[rsi+DQ2],xmm1
        movdqa  dqword[rsi+DQ3],xmm2
        movdqa  dqword[rsi+DQ4],xmm3
;;;next pass over the same block
        dec     rbx
        jnz     .encryptBlock2
;;;buffer bit shuffling (ROLling) using values from the key
;;;xmm0-xmm3 still hold the block that was just stored, so it is not reloaded
        movdqa  xmm4,xmm0
        movdqa  xmm5,xmm1
        movdqa  xmm6,xmm2
        movdqa  xmm7,xmm3
        movq    xmm8,[rcx+r8*8]                 ;right shift count for key byte r8
        movq    xmm9,[rcx+r8*8+ROLLLUT_SIZE]    ;left shift count from the second table
                                                ;(presumably 64 minus the first - confirm against RollLUT)
        psrlq   xmm0,xmm8
        psrlq   xmm1,xmm8
        psrlq   xmm2,xmm8
        psrlq   xmm3,xmm8
        psllq   xmm4,xmm9
        psllq   xmm5,xmm9
        psllq   xmm6,xmm9
        psllq   xmm7,xmm9
        por     xmm0,xmm4
        por     xmm1,xmm5
        por     xmm2,xmm6
        por     xmm3,xmm7
        movdqa  [rsi+DQ1],xmm0
        movdqa  [rsi+DQ2],xmm1
        movdqa  [rsi+DQ3],xmm2
        movdqa  [rsi+DQ4],xmm3
;;;step to the previous block until the start of the buffer has been done
        sub     rdx,64
        jns     .encryptBlock
   .endEncryptBuffer:
        movdqu  xmm6,dqword[rsp+16*0]
        movdqu  xmm7,dqword[rsp+16*1]
        movdqu  xmm8,dqword[rsp+16*2]
        movdqu  xmm9,dqword[rsp+16*3]
        movdqu  xmm10,dqword[rsp+16*4]
        movdqu  xmm11,dqword[rsp+16*5]
        movdqu  xmm12,dqword[rsp+16*6]
        movdqu  xmm13,dqword[rsp+16*7]
        movdqu  xmm14,dqword[rsp+16*8]
        movdqu  xmm15,dqword[rsp+16*9]
        mov     rbx,[rsp+160+8*0]
        mov     rsi,[rsp+160+8*1]
        mov     rdi,[rsp+160+8*2]
        mov     r13,[rsp+160+8*3]
        add     rsp,200
        ret     0
AMDPad16
;;;PROEDecryptBuffer
;;;  Decrypts, in place, a buffer produced by PROEEncryptBuffer with the same
;;;  starting key and pass count. Blocks of 64 bytes are processed from the
;;;  last towards the first. For every block the key-selected qword bit
;;;  rotation is undone first, then the key is evolved and the 64-byte stream
;;;  derived from it is XORed into the block, once per pass.
;;;  The key is modified in place.
;;;ABI          Microsoft x64
;;;PARAMETERS   RCX: buffer address, 16byte aligned
;;;             RDX: buffer length in bytes, multiple of 64
;;;                  (lengths below 64 return without touching buffer or key)
;;;             R8 : key address, 16 byte aligned, 128 bytes in size
;;;             R9 : number of passes, greater than or equal to 1, recommended 2
;;;                  (0 is not checked for)
;;;RETURNS      nothing documented; RAX is left holding the key byte index
;;;PRESERVES    RBX, RSI, RDI, R12 and XMM6-XMM15 (all nonvolatile in this ABI)
;;;CLOBBERS     RAX, RCX, RDX, R8, XMM0-XMM5, flags
;;;STACK FRAME  200 bytes: [rsp+0..159]   XMM6-XMM15
;;;                        [rsp+160..191] RBX, RSI, RDI, R12
;;;                        [rsp+192..199] padding
PROEDecryptBuffer:
        sub     rsp,200
        mov     [rsp+160+8*0],rbx
        mov     [rsp+160+8*1],rsi
        mov     [rsp+160+8*2],rdi
        mov     [rsp+160+8*3],r12
        ;XMM6-XMM15 are callee-saved on Win64 and are all overwritten below.
        ;movdqu is used so no stack alignment is required of the caller.
        movdqu  dqword[rsp+16*0],xmm6
        movdqu  dqword[rsp+16*1],xmm7
        movdqu  dqword[rsp+16*2],xmm8
        movdqu  dqword[rsp+16*3],xmm9
        movdqu  dqword[rsp+16*4],xmm10
        movdqu  dqword[rsp+16*5],xmm11
        movdqu  dqword[rsp+16*6],xmm12
        movdqu  dqword[rsp+16*7],xmm13
        movdqu  dqword[rsp+16*8],xmm14
        movdqu  dqword[rsp+16*9],xmm15
        mov     rdi,r8                  ;rdi = key
        mov     r12,r9                  ;r12 = passes per block
        xor     eax,eax                 ;rax = index of the key byte to read next
;;;evolve the key and decrypt the buffer block by block, last block first
        lea     rsi,[rcx+rdx]           ;rsi = one past the end of the buffer
        sub     rdx,64                  ;rdx = bytes left after the current block
        js      .endDecryptBuffer       ;less than one whole block: nothing to do
        lea     rcx,[RollLUT]
AMDPad16
   .decryptBlock:
        movzx   r8,byte[rdi+rax]        ;r8 = key byte, selects the rotation below
        mov     rbx,r12                 ;rbx = passes left for this block
        add     rax,r8
        sub     rsi,64                  ;rsi = current block
        and     eax,127                 ;keep the index inside the 128-byte key
;;;undo the buffer bit shuffling (ROLling) using values from the key:
;;;same two shift counts as the encryptor, with the shift directions swapped
        movdqa  xmm0,[rsi+DQ1]
        movdqa  xmm1,[rsi+DQ2]
        movdqa  xmm2,[rsi+DQ3]
        movdqa  xmm3,[rsi+DQ4]
        movdqa  xmm4,xmm0
        movdqa  xmm5,xmm1
        movdqa  xmm6,xmm2
        movdqa  xmm7,xmm3
        movq    xmm8,[rcx+r8*8]                 ;left shift count for key byte r8
        movq    xmm9,[rcx+r8*8+ROLLLUT_SIZE]    ;right shift count from the second table
                                                ;(presumably 64 minus the first - confirm against RollLUT)
        psllq   xmm0,xmm8
        psllq   xmm1,xmm8
        psllq   xmm2,xmm8
        psllq   xmm3,xmm8
        psrlq   xmm4,xmm9
        psrlq   xmm5,xmm9
        psrlq   xmm6,xmm9
        psrlq   xmm7,xmm9
        por     xmm0,xmm4
        por     xmm1,xmm5
        por     xmm2,xmm6
        por     xmm3,xmm7
        movdqa  [rsi+DQ1],xmm0
        movdqa  [rsi+DQ2],xmm1
        movdqa  [rsi+DQ3],xmm2
        movdqa  [rsi+DQ4],xmm3
AMDPad16
   .decryptBlock2:
;;setup: load the eight 16-byte key words
        movdqa  xmm0, [rdi+DQ1]
        movdqa  xmm1, [rdi+DQ2]
        movdqa  xmm2, [rdi+DQ3]
        movdqa  xmm3, [rdi+DQ4]
        movdqa  xmm4, [rdi+DQ5]
        movdqa  xmm5, [rdi+DQ6]
        movdqa  xmm6, [rdi+DQ7]
        movdqa  xmm7, [rdi+DQ8]
;;copy
        movdqa  xmm8, xmm0
        movdqa  xmm9, xmm1
        movdqa  xmm10, xmm2
        movdqa  xmm11, xmm3
        movdqa  xmm12, xmm4
        movdqa  xmm13, xmm5
        movdqa  xmm14, xmm6
        movdqa  xmm15, xmm7
;;shift right logical 63bits to have the mask of highest bit
        psrlq   xmm8, 63
        psrlq   xmm9, 63
        psrlq   xmm10, 63
        psrlq   xmm11, 63
        psrlq   xmm12, 63
        psrlq   xmm13, 63
        psrlq   xmm14, 63
        psrlq   xmm15, 63
;;shift left to remove the highest bit and empty the lowest
        psllq   xmm4, 1
        psllq   xmm5, 1
        psllq   xmm6, 1
        psllq   xmm7, 1
;;add the high bits of words 5-8 to words 1-4
        paddq   xmm0, xmm12
        paddq   xmm1, xmm13
        paddq   xmm2, xmm14
        paddq   xmm3, xmm15
;;or the high bits of words 1-4 into the emptied low bits of words 5-8
        por     xmm4, xmm8
        por     xmm5, xmm9
        por     xmm6, xmm10
        por     xmm7, xmm11
;;copy
        movdqa  xmm8, xmm0
        movdqa  xmm9, xmm1
        movdqa  xmm10, xmm2
        movdqa  xmm11, xmm3
;;Bit ROLL of each qword by prime numbers 7, 5, 3, 11
        psllq   xmm0, 7
        psllq   xmm1, 5
        psllq   xmm2, 3
        psllq   xmm3, 11
        psrlq   xmm8, 57;64-7
        psrlq   xmm9, 59;64-5
        psrlq   xmm10, 61;64-3
        psrlq   xmm11, 53;64-11
        por     xmm0, xmm8
        por     xmm1, xmm9
        por     xmm2, xmm10
        por     xmm3, xmm11
;;Dword order switching (each dword moves down one position, lowest wraps to the top)
        pshufd  xmm0, xmm0, 00111001b
        pshufd  xmm1, xmm1, 00111001b
        pshufd  xmm2, xmm2, 00111001b
        pshufd  xmm3, xmm3, 00111001b
;;xor/add old key with new key
        paddb   xmm1,[rdi+DQ1]
        paddb   xmm2,[rdi+DQ2]
        paddb   xmm3,[rdi+DQ3]
        paddb   xmm4,[rdi+DQ4]
        pxor    xmm5,[rdi+DQ5]
        pxor    xmm6,[rdi+DQ6]
        pxor    xmm7,[rdi+DQ7]
        pxor    xmm0,[rdi+DQ8]
;;Modify Key with rotation of dq words (xmm0 is stored last, into DQ8)
        movdqa  [rdi+DQ1], xmm1
        movdqa  [rdi+DQ2], xmm2
        movdqa  [rdi+DQ3], xmm3
        movdqa  [rdi+DQ4], xmm4
        movdqa  [rdi+DQ5], xmm5
        movdqa  [rdi+DQ6], xmm6
        movdqa  [rdi+DQ7], xmm7
        movdqa  [rdi+DQ8], xmm0
;;prepare output: fold the eight words down to a 64-byte stream in xmm0-xmm3
        ;copy
        movdqa  xmm8, xmm0
        movdqa  xmm9, xmm1
        movdqa  xmm10, xmm2
        movdqa  xmm11, xmm3
        movdqa  xmm12, xmm4
        movdqa  xmm13, xmm5
        movdqa  xmm14, xmm6
        movdqa  xmm15, xmm7
        ;shift words by primes
        psllw   xmm0, 3
        psllw   xmm1, 5
        psllw   xmm2, 7
        psllw   xmm3, 11
        ;
        psrlw   xmm8, 5
        psrlw   xmm9, 7
        psrlw   xmm10, 11
        psrlw   xmm11, 3
        ;
        psllw   xmm4, 7
        psllw   xmm5, 11
        psllw   xmm6, 3
        psllw   xmm7, 5
        ;
        psrlw   xmm12, 11
        psrlw   xmm13, 3
        psrlw   xmm14, 5
        psrlw   xmm15, 7
        ; add / xor
        pxor    xmm0, xmm8
        pxor    xmm1, xmm9
        pxor    xmm2, xmm10
        pxor    xmm3, xmm11
        paddb   xmm4, xmm12
        paddb   xmm5, xmm13
        paddb   xmm6, xmm14
        paddb   xmm7, xmm15
        pxor    xmm0, xmm4
        pxor    xmm1, xmm5
        pxor    xmm2, xmm6
        paddb   xmm3, xmm7
;;;xor random 64bytes with block
        pxor    xmm0,dqword[rsi+DQ1]
        pxor    xmm1,dqword[rsi+DQ2]
        pxor    xmm2,dqword[rsi+DQ3]
        pxor    xmm3,dqword[rsi+DQ4]
;;;modify block
        movdqa  dqword[rsi+DQ1],xmm0
        movdqa  dqword[rsi+DQ2],xmm1
        movdqa  dqword[rsi+DQ3],xmm2
        movdqa  dqword[rsi+DQ4],xmm3
;;;next pass over the same block
        dec     rbx
        jnz     .decryptBlock2
;;;step to the previous block until the start of the buffer has been done
        sub     rdx,64
        jns     .decryptBlock
   .endDecryptBuffer:
        movdqu  xmm6,dqword[rsp+16*0]
        movdqu  xmm7,dqword[rsp+16*1]
        movdqu  xmm8,dqword[rsp+16*2]
        movdqu  xmm9,dqword[rsp+16*3]
        movdqu  xmm10,dqword[rsp+16*4]
        movdqu  xmm11,dqword[rsp+16*5]
        movdqu  xmm12,dqword[rsp+16*6]
        movdqu  xmm13,dqword[rsp+16*7]
        movdqu  xmm14,dqword[rsp+16*8]
        movdqu  xmm15,dqword[rsp+16*9]
        mov     rbx,[rsp+160+8*0]
        mov     rsi,[rsp+160+8*1]
        mov     rdi,[rsp+160+8*2]
        mov     r12,[rsp+160+8*3]
        add     rsp,200
        ret     0
;;;PE import table
section '.idata' import data readable writeable
;;;API imports: the only one is MessageBoxA from USER32.DLL, referenced in this file as MessageBox
  library user32,'USER32.DLL'
        import user32,\
               MessageBox,'MessageBoxA'
;;;PE export table: the DLL name and the six routines it exports,
;;;each under an export name identical to its label
section '.edata' export data readable
  export 'PROE_Lib_Win64.DLL',\
         PROEEncryptBuffer,'PROEEncryptBuffer',\
         PROEEncryptBufferWithChecksum,'PROEEncryptBufferWithChecksum',\
         PROEDecryptBuffer,'PROEDecryptBuffer',\
         PROEDecryptBufferWithChecksum,'PROEDecryptBufferWithChecksum',\
         PROEValidateParameters,'PROEValidateParameters',\
         PROESelfTest,'PROESelfTest'
;;;relocation (fixups) section, marked discardable
section '.reloc' fixups data discardable