flat assembler
Message board for the users of flat assembler.

Index > Windows > PROE Encryption Win64

Goto page 1, 2  Next
Author
Thread Post new topic Reply to topic
r22



Joined: 27 Dec 2004
Posts: 805
r22
Here's the PE64 DLL source for my encryption algorithm.
Any ideas for improvement or errors I've missed would be appreciated.

Code:
;Copyright (C) 2007 by Louis J. Ricci
;Author E-Mail: R22Lou[-AT]cox[Dot-]net
;This library is free software; you can redistribute it and/or
;modify it under the terms of the GNU Lesser General Public
;License as published by the Free Software Foundation; either
;version 2.1 of the License, or (at your option) any later version.
;This library is distributed in the hope that it will be useful,
;but WITHOUT ANY WARRANTY; without even the implied warranty of
;MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;Lesser General Public License for more details.

format PE64 DLL
entry DllEntryPoint

; AMDPad16 - pad the output up to the next 16-byte boundary.
;
; The number of missing bytes (0..15) is measured inside a throw-away
; `virtual` block: `align 16` is applied there and `$-$$` gives how many
; bytes it had to insert.  The gap is then filled with groups made of 66h
; prefixes followed by 90h (NOP) instead of a run of single-byte NOPs.
; NOTE(review): the name suggests the grouping follows an AMD optimization
; recommendation - confirm against the guide before changing the patterns.
;
; Nothing is emitted when the current position is already aligned.
; The measured size is kept in a macro-local symbol, so invoking the macro
; defines no globally visible name; the `..` prefix additionally keeps the
; symbol from interfering with the scoping of dot-prefixed local labels.
macro AMDPad16
{
    local ..pad
    virtual
        align 16
        ..pad = $-$$
    end virtual
    if ..pad = 1
        db 90h
    else if ..pad = 2
        db 66h,90h
    else if ..pad = 3
        db 66h,66h,90h
    else if ..pad = 4
        db 66h,66h,66h,90h
    else if ..pad = 5
        db 66h,66h,90h,66h,90h
    else if ..pad = 6
        db 66h,66h,90h,66h,66h,90h
    else if ..pad = 7
        db 66h,66h,66h,90h,66h,66h,90h
    else if ..pad = 8
        db 66h,66h,66h,90h,66h,66h,66h,90h
    else if ..pad = 9
        db 66h,66h,90h,66h,66h,90h,66h,66h,90h
    else if ..pad = 10
        db 66h,66h,66h,90h,66h,66h,90h,66h,66h,90h
    else if ..pad = 11
        db 66h,66h,66h,90h,66h,66h,66h,90h,66h,66h,90h
    else if ..pad = 12
        db 66h,66h,66h,90h,66h,66h,66h,90h,66h,66h,66h,90h
    else if ..pad = 13
        db 66h,66h,66h,90h,66h,66h,90h,66h,66h,90h,66h,66h,90h
    else if ..pad = 14
        db 66h,66h,66h,90h,66h,66h,66h,90h,66h,66h,90h,66h,66h,90h
    else if ..pad = 15
        db 66h,66h,66h,90h,66h,66h,66h,90h,66h,66h,66h,90h,66h,66h,90h
    end if
}

include '%fasminc%\win64a.inc'

; Writable data: self-test buffers/strings and the bit-roll lookup tables.
section '.data' data readable writeable
align 16
  ; Plain text used by PROESelfTest; the zero qwords that follow pad the
  ; buffer so the 128 bytes handed to the cipher are all inside this object.
  StrTestBuffer db 'This is a PROE encryption test ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789',0
                dq 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
align 16
  ; Key + checksum block for the encrypt half of the self test
  ; (18 qwords = 128 key bytes followed by 16 checksum bytes).
  TestKeyHash1  dq 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
align 16
  ; Key + checksum block for the decrypt half of the self test (same layout).
  TestKeyHash2  dq 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
  ; Message box captions used by PROESelfTest.
  StrPassed     db 'Working Properly',0
  StrFailed     db 'Not Working Properly',0

align 16
; Size in bytes of ONE lookup table (256 entries * 8 bytes); it is also the
; distance from an entry of RollLUT to the matching entry of RollLUTI.
ROLLLUT_SIZE equ 2048
; RollLUT[i]  = i and 63        (shift count for one direction)
RollLUT:
Repeat 256
       dq ((%-1) And 63)
end repeat
; RollLUTI[i] = 64 - (i and 63) (complementary count for the other direction;
; the value is 64 when i and 63 = 0)
RollLUTI:
Repeat 256
       dq 64 - ((%-1) And 63)
end repeat

section '.code' code readable executable

; DLL entry point (named by the `entry` directive at the top of the file).
; The three arguments are not used; the routine always returns TRUE in eax.
proc DllEntryPoint hinstDLL,fdwReason,lpvReserved
        mov     eax,TRUE
        ret
endp


AMDPad16
;;;PROEValidateParameters - sanity-check the arguments of the PROE
;;;             encrypt/decrypt routines before calling them.
;;;PARAMETERS   RCX: buffer address, 16byte aligned
;;;             RDX: buffer length in bytes, non-zero multiple of 64
;;;             R8 : key and hash address, 16 byte aligned, 128+16 bytes in size
;;;             R9 : number of passes, greater than or equal to 1, recommended 2
;;;RETURN       EAX: -1=Fail 0=Success
;;;CLOBBERS     flags only; leaf routine, the stack is not touched
;;;
;;;Rejected:    passes < 1 (signed compare)
;;;             length zero, negative as a signed value, or not a multiple of 64
;;;             buffer/key pointer equal to 0 or -1, or not 16-byte aligned
PROEValidateParameters:
        cmp     r9,1
        jl      .fail                   ; fewer than one pass
        test    rdx,rdx
        jle     .fail                   ; empty buffer, or length with the top bit set
        test    rdx,63
        jnz     .fail                   ; not a whole number of 64-byte blocks
        test    rcx,rcx
        jz      .fail                   ; null buffer
        cmp     rcx,-1
        je      .fail                   ; -1 used as an "invalid pointer" marker
        test    cl,15
        jnz     .fail                   ; movdqa on the buffer needs 16-byte alignment
        test    r8,r8
        jz      .fail                   ; null key
        cmp     r8,-1
        je      .fail
        test    r8b,15
        jnz     .fail                   ; movdqa on the key needs 16-byte alignment
        xor     eax,eax
        ret     0
   .fail:
        mov     eax,-1
        ret     0



AMDPad16
;;;PROESelfTest - round-trip self test.
;;;             Fills TestKeyHash1 with a generated key, encrypts 128 bytes of
;;;             StrTestBuffer, fills TestKeyHash2 with the same key, decrypts
;;;             the buffer again and compares the two 16-byte checksums stored
;;;             at offset 128 of each key block.  The buffer text is then shown
;;;             in a message box whose caption tells whether the checksums
;;;             matched.
;;;PARAMETERS   none
;;;RETURN       EAX: -1=Failed 0=Success
;;;CLOBBERS     volatile registers only (the key generator uses r10, so no
;;;             callee-saved general purpose register is modified here)
PROESelfTest:
        sub     rsp,8*7                 ; 32 bytes shadow space + padding, keeps rsp 16-byte aligned
;;;generate key #1: 32 dwords written from offset 124 down to 0
        lea     r8,[TestKeyHash1]
        mov     r10,0xabcdef89          ; generator seed
        mov     ecx,124
   .keyfill1:
        mov     dword[r8+rcx],r10d
        add     r10,r10
        add     r10,r8                  ; the sequence is mixed with the address of TestKeyHash1
        sub     rcx,4
        jns     .keyfill1
        xor     ecx,ecx
        mov     [r8+128],rcx            ; checksum starts at zero
        mov     [r8+136],rcx
        lea     rcx,[StrTestBuffer]
        mov     edx,128
        mov     r9d,9                   ; passes
        call    PROEEncryptBufferWithChecksum
;;;generate key #2: same seed and same mixing address, so both keys are equal
        lea     r9,[TestKeyHash1]
        lea     r8,[TestKeyHash2]
        mov     r10,0xabcdef89
        mov     ecx,124
   .keyfill2:
        mov     dword[r8+rcx],r10d
        add     r10,r10
        add     r10,r9
        sub     rcx,4
        jns     .keyfill2
        xor     ecx,ecx
        mov     [r8+128],rcx
        mov     [r8+136],rcx
        lea     rcx,[StrTestBuffer]
        mov     edx,128
        mov     r9d,9                   ; passes
        call    PROEDecryptBufferWithChecksum
;;;both runs must have produced the same 16-byte checksum
        mov     rcx,[TestKeyHash1+128]
        mov     rdx,[TestKeyHash1+136]
        cmp     rcx,[TestKeyHash2+128]
        jne     .error
        cmp     rdx,[TestKeyHash2+136]
        jne     .error
        xor     ecx,ecx                 ; no owner window
        lea     rdx,[StrTestBuffer]     ; message text: the round-tripped buffer
        lea     r8,[StrPassed]          ; caption
        mov     r9d,1                   ; uType = 1 (MB_OKCANCEL)
        call    [MessageBox]
        xor     eax,eax
        add     rsp,8*7
        ret     0
   .error:
        xor     ecx,ecx
        lea     rdx,[StrTestBuffer]
        lea     r8,[StrFailed]
        mov     r9d,1
        call    [MessageBox]
        mov     eax,-1
        add     rsp,8*7
        ret     0

;;;Byte offsets of eight consecutive 16-byte (double quadword) slots.
;;;DQ1..DQ8 address the 128 key bytes; DQ1..DQ4 are also used to address
;;;the four slots of a 64-byte data block.
DQ1     equ 0
DQ2     equ 16
DQ3     equ 32
DQ4     equ 48
DQ5     equ 64
DQ6     equ 80
DQ7     equ 96
DQ8     equ 112
;;;STRUCTURE    Key and Hash, 16 byte aligned
;;;             128 bytes (key) + 16 bytes (hash/checksum, at offset 128)
;;;---------------------------------------------------------------------------
;;;Private building blocks of PROEEncryptBufferWithChecksum.
;;;Register roles shared by all three macros:
;;;     rsi = current 64-byte data block     rdi = key+checksum structure
;;;     rcx = address of RollLUT             r8  = key byte picked for this block
;;;---------------------------------------------------------------------------

;;;Stack frame layout (offsets from rsp after the prologue)
PROEENC_XMM_SAVE = 0                    ; xmm6..xmm15, 10 * 16 bytes
PROEENC_GPR_SAVE = 16*10                ; rbx, rsi, rdi, r13
PROEENC_FRAME    = 16*10 + 8*4 + 8      ; +8 keeps rsp 16-byte aligned

;;;Fold the (still plain) block at rsi into the 16-byte checksum at rdi+128.
;;;Leaves the block in xmm0..xmm3; uses xmm4..xmm9.
macro PROEEnc_HashUpdate
{
        movdqa  xmm0,[rsi+DQ1]
        movdqa  xmm1,[rsi+DQ2]
        movdqa  xmm2,[rsi+DQ3]
        movdqa  xmm3,[rsi+DQ4]
        movdqa  xmm8,dqword[rdi+128]    ; current checksum
        pshufd  xmm4,xmm0,00111001b     ; dword-rotated copies of the block
        pshufd  xmm5,xmm1,00111001b
        pshufd  xmm6,xmm2,00111001b
        pshufd  xmm7,xmm3,00111001b
        movdqa  xmm9,xmm8
        pxor    xmm7,xmm0
        pxor    xmm6,xmm1
        psllq   xmm8,3
        paddb   xmm9,dqword[rdi]        ; mix in the first 16 key bytes
        pxor    xmm5,xmm2
        pxor    xmm4,xmm3
        pxor    xmm7,xmm6
        pxor    xmm5,xmm4
        paddb   xmm8,xmm7
        paddb   xmm9,xmm5
        pxor    xmm8,xmm9
        movdqa  [rdi+128],xmm8          ; store the new checksum
}

;;;One pass: advance the 128-byte key at rdi in place, derive 64 bytes from
;;;the new key and XOR them into the block at rsi.
;;;Leaves the updated block in xmm0..xmm3; uses xmm0..xmm15.
macro PROEEnc_KeyRound
{
    ;;load the key
        movdqa  xmm0,[rdi+DQ1]
        movdqa  xmm1,[rdi+DQ2]
        movdqa  xmm2,[rdi+DQ3]
        movdqa  xmm3,[rdi+DQ4]
        movdqa  xmm4,[rdi+DQ5]
        movdqa  xmm5,[rdi+DQ6]
        movdqa  xmm6,[rdi+DQ7]
        movdqa  xmm7,[rdi+DQ8]
    ;;xmm8..xmm15 = top bit of every key qword, moved down to bit 0
        movdqa  xmm8,xmm0
        movdqa  xmm9,xmm1
        movdqa  xmm10,xmm2
        movdqa  xmm11,xmm3
        movdqa  xmm12,xmm4
        movdqa  xmm13,xmm5
        movdqa  xmm14,xmm6
        movdqa  xmm15,xmm7
        psrlq   xmm8,63
        psrlq   xmm9,63
        psrlq   xmm10,63
        psrlq   xmm11,63
        psrlq   xmm12,63
        psrlq   xmm13,63
        psrlq   xmm14,63
        psrlq   xmm15,63
    ;;upper key half: shift left by one, bit 0 taken from the lower half's top bits
    ;;lower key half: add the upper half's top bits
        psllq   xmm4,1
        psllq   xmm5,1
        psllq   xmm6,1
        psllq   xmm7,1
        paddq   xmm0,xmm12
        paddq   xmm1,xmm13
        paddq   xmm2,xmm14
        paddq   xmm3,xmm15
        por     xmm4,xmm8
        por     xmm5,xmm9
        por     xmm6,xmm10
        por     xmm7,xmm11
    ;;rotate the qwords of the lower half left by 7, 5, 3 and 11 bits
        movdqa  xmm8,xmm0
        movdqa  xmm9,xmm1
        movdqa  xmm10,xmm2
        movdqa  xmm11,xmm3
        psllq   xmm0,7
        psllq   xmm1,5
        psllq   xmm2,3
        psllq   xmm3,11
        psrlq   xmm8,57                 ; 64-7
        psrlq   xmm9,59                 ; 64-5
        psrlq   xmm10,61                ; 64-3
        psrlq   xmm11,53                ; 64-11
        por     xmm0,xmm8
        por     xmm1,xmm9
        por     xmm2,xmm10
        por     xmm3,xmm11
    ;;rotate the dword order inside each of the four registers
        pshufd  xmm0,xmm0,00111001b
        pshufd  xmm1,xmm1,00111001b
        pshufd  xmm2,xmm2,00111001b
        pshufd  xmm3,xmm3,00111001b
    ;;combine with the old key (byte add for four slots, xor for the others)
        paddb   xmm1,[rdi+DQ1]
        paddb   xmm2,[rdi+DQ2]
        paddb   xmm3,[rdi+DQ3]
        paddb   xmm4,[rdi+DQ4]
        pxor    xmm5,[rdi+DQ5]
        pxor    xmm6,[rdi+DQ6]
        pxor    xmm7,[rdi+DQ7]
        pxor    xmm0,[rdi+DQ8]
    ;;write the new key back, rotated by one 16-byte slot
        movdqa  [rdi+DQ1],xmm1
        movdqa  [rdi+DQ2],xmm2
        movdqa  [rdi+DQ3],xmm3
        movdqa  [rdi+DQ4],xmm4
        movdqa  [rdi+DQ5],xmm5
        movdqa  [rdi+DQ6],xmm6
        movdqa  [rdi+DQ7],xmm7
        movdqa  [rdi+DQ8],xmm0
    ;;derive the 64 output bytes from the new key
        movdqa  xmm8,xmm0
        movdqa  xmm9,xmm1
        movdqa  xmm10,xmm2
        movdqa  xmm11,xmm3
        movdqa  xmm12,xmm4
        movdqa  xmm13,xmm5
        movdqa  xmm14,xmm6
        movdqa  xmm15,xmm7
        psllw   xmm0,3
        psllw   xmm1,5
        psllw   xmm2,7
        psllw   xmm3,11
        psrlw   xmm8,5
        psrlw   xmm9,7
        psrlw   xmm10,11
        psrlw   xmm11,3
        psllw   xmm4,7
        psllw   xmm5,11
        psllw   xmm6,3
        psllw   xmm7,5
        psrlw   xmm12,11
        psrlw   xmm13,3
        psrlw   xmm14,5
        psrlw   xmm15,7
        pxor    xmm0,xmm8
        pxor    xmm1,xmm9
        pxor    xmm2,xmm10
        pxor    xmm3,xmm11
        paddb   xmm4,xmm12
        paddb   xmm5,xmm13
        paddb   xmm6,xmm14
        paddb   xmm7,xmm15
        pxor    xmm0,xmm4
        pxor    xmm1,xmm5
        pxor    xmm2,xmm6
        paddb   xmm3,xmm7
    ;;xor the derived bytes into the block and store it
        pxor    xmm0,dqword[rsi+DQ1]
        pxor    xmm1,dqword[rsi+DQ2]
        pxor    xmm2,dqword[rsi+DQ3]
        pxor    xmm3,dqword[rsi+DQ4]
        movdqa  dqword[rsi+DQ1],xmm0
        movdqa  dqword[rsi+DQ2],xmm1
        movdqa  dqword[rsi+DQ3],xmm2
        movdqa  dqword[rsi+DQ4],xmm3
}

;;;Rotate every qword of the block held in xmm0..xmm3 right by (r8 and 63)
;;;bits and store it at rsi.  The two shift counts come from RollLUT and from
;;;the table ROLLLUT_SIZE bytes after it.  Uses xmm4..xmm9.
macro PROEEnc_RollRight
{
        movdqa  xmm4,xmm0
        movdqa  xmm5,xmm1
        movdqa  xmm6,xmm2
        movdqa  xmm7,xmm3
        movq    xmm8,[rcx+r8*8]
        movq    xmm9,[rcx+r8*8+ROLLLUT_SIZE]
        psrlq   xmm0,xmm8
        psrlq   xmm1,xmm8
        psrlq   xmm2,xmm8
        psrlq   xmm3,xmm8
        psllq   xmm4,xmm9               ; a count of 64 yields zero, so a rotate by 0 is a no-op
        psllq   xmm5,xmm9
        psllq   xmm6,xmm9
        psllq   xmm7,xmm9
        por     xmm0,xmm4
        por     xmm1,xmm5
        por     xmm2,xmm6
        por     xmm3,xmm7
        movdqa  [rsi+DQ1],xmm0
        movdqa  [rsi+DQ2],xmm1
        movdqa  [rsi+DQ3],xmm2
        movdqa  [rsi+DQ4],xmm3
}

AMDPad16
;;;PROEEncryptBufferWithChecksum - encrypt a buffer in place, 64 bytes at a
;;;             time starting with the LAST block, and accumulate a 16-byte
;;;             checksum of the plain text at key+128.  The 128 key bytes are
;;;             modified in place while the buffer is processed.
;;;PARAMETERS   RCX: buffer address, 16byte aligned
;;;             RDX: buffer length in bytes, multiple of 64
;;;             R8 : key and checksum address, 16 byte aligned, 128+16 bytes in size
;;;             R9 : number of passes, greater than or equal to 1, recommended 2
;;;RETURN       no return value is defined
;;;CLOBBERS     rax, rcx, rdx, r8, xmm0-xmm5, flags.
;;;             rbx, rsi, rdi, r13 and xmm6-xmm15 are saved and restored.
;;;NOTES        Returns immediately, without touching the buffer or the key,
;;;             when the length is below 64 or the pass count is 0.
PROEEncryptBufferWithChecksum:
        cmp     rdx,64
        jb      .reject                 ; no complete block: nothing may be read or written
        test    r9,r9
        jnz     .accept                 ; zero passes would make the pass counter wrap around
   .reject:
        ret     0
   .accept:
        sub     rsp,PROEENC_FRAME
        ; movdqu: correct even if the caller did not align the stack
        movdqu  [rsp+PROEENC_XMM_SAVE+16*0],xmm6
        movdqu  [rsp+PROEENC_XMM_SAVE+16*1],xmm7
        movdqu  [rsp+PROEENC_XMM_SAVE+16*2],xmm8
        movdqu  [rsp+PROEENC_XMM_SAVE+16*3],xmm9
        movdqu  [rsp+PROEENC_XMM_SAVE+16*4],xmm10
        movdqu  [rsp+PROEENC_XMM_SAVE+16*5],xmm11
        movdqu  [rsp+PROEENC_XMM_SAVE+16*6],xmm12
        movdqu  [rsp+PROEENC_XMM_SAVE+16*7],xmm13
        movdqu  [rsp+PROEENC_XMM_SAVE+16*8],xmm14
        movdqu  [rsp+PROEENC_XMM_SAVE+16*9],xmm15
        mov     [rsp+PROEENC_GPR_SAVE+8*0],rbx
        mov     [rsp+PROEENC_GPR_SAVE+8*1],rsi
        mov     [rsp+PROEENC_GPR_SAVE+8*2],rdi
        mov     [rsp+PROEENC_GPR_SAVE+8*3],r13
        mov     rdi,r8                  ; rdi = key + checksum
        mov     r13,r9                  ; r13 = passes per block
        lea     rsi,[rcx+rdx]           ; rsi = end of buffer, blocks are taken back to front
        sub     rdx,64                  ; rdx = bytes left after the current block
        xor     eax,eax                 ; rax = running index (0..127) into the key
        lea     rcx,[RollLUT]
AMDPad16
   .encryptBlock:
        movzx   r8,byte[rdi+rax]        ; r8 = key byte selecting the rotate amount
        mov     rbx,r13                 ; rbx = pass counter
        add     rax,r8
        sub     rsi,64
        and     eax,127
        PROEEnc_HashUpdate
AMDPad16
   .encryptBlock2:
        PROEEnc_KeyRound
        dec     rbx
        jnz     .encryptBlock2
        PROEEnc_RollRight
        sub     rdx,64
        js      .endEncryptBlock
;;;second copy of the loop body (the block loop is unrolled twice)
        movzx   r8,byte[rdi+rax]
        mov     rbx,r13
        add     rax,r8
        sub     rsi,64
        and     eax,127
        PROEEnc_HashUpdate
AMDPad16
   .encryptBlock2_2:
        PROEEnc_KeyRound
        dec     rbx
        jnz     .encryptBlock2_2
        PROEEnc_RollRight
        sub     rdx,64
        jns     .encryptBlock
   .endEncryptBlock:
        movdqu  xmm6,[rsp+PROEENC_XMM_SAVE+16*0]
        movdqu  xmm7,[rsp+PROEENC_XMM_SAVE+16*1]
        movdqu  xmm8,[rsp+PROEENC_XMM_SAVE+16*2]
        movdqu  xmm9,[rsp+PROEENC_XMM_SAVE+16*3]
        movdqu  xmm10,[rsp+PROEENC_XMM_SAVE+16*4]
        movdqu  xmm11,[rsp+PROEENC_XMM_SAVE+16*5]
        movdqu  xmm12,[rsp+PROEENC_XMM_SAVE+16*6]
        movdqu  xmm13,[rsp+PROEENC_XMM_SAVE+16*7]
        movdqu  xmm14,[rsp+PROEENC_XMM_SAVE+16*8]
        movdqu  xmm15,[rsp+PROEENC_XMM_SAVE+16*9]
        mov     rbx,[rsp+PROEENC_GPR_SAVE+8*0]
        mov     rsi,[rsp+PROEENC_GPR_SAVE+8*1]
        mov     rdi,[rsp+PROEENC_GPR_SAVE+8*2]
        mov     r13,[rsp+PROEENC_GPR_SAVE+8*3]
        add     rsp,PROEENC_FRAME
        ret     0

;;;---------------------------------------------------------------------------
;;;Private building blocks of PROEDecryptBufferWithChecksum.
;;;Register roles shared by all three macros:
;;;     rsi = current 64-byte data block     rdi = key+checksum structure
;;;     rcx = address of RollLUT             r8  = key byte picked for this block
;;;---------------------------------------------------------------------------

;;;Stack frame layout (offsets from rsp after the prologue)
PROEDEC_KEY_SAVE = 0                    ; first 16 key bytes as they were before the passes
PROEDEC_XMM_SAVE = 16                   ; xmm6..xmm15, 10 * 16 bytes
PROEDEC_GPR_SAVE = 16 + 16*10           ; rbx, rsi, rdi, r12
PROEDEC_FRAME    = 16 + 16*10 + 8*4 + 8 ; +8 keeps rsp 16-byte aligned

;;;Rotate every qword of the block at rsi left by (r8 and 63) bits, in place.
;;;The two shift counts come from RollLUT and from the table ROLLLUT_SIZE
;;;bytes after it.  Uses xmm0..xmm9.
macro PROEDec_RollLeft
{
        movdqa  xmm0,[rsi+DQ1]
        movdqa  xmm1,[rsi+DQ2]
        movdqa  xmm2,[rsi+DQ3]
        movdqa  xmm3,[rsi+DQ4]
        movdqa  xmm4,xmm0
        movdqa  xmm5,xmm1
        movdqa  xmm6,xmm2
        movdqa  xmm7,xmm3
        movq    xmm8,[rcx+r8*8]
        movq    xmm9,[rcx+r8*8+ROLLLUT_SIZE]
        psllq   xmm0,xmm8
        psllq   xmm1,xmm8
        psllq   xmm2,xmm8
        psllq   xmm3,xmm8
        psrlq   xmm4,xmm9               ; a count of 64 yields zero, so a rotate by 0 is a no-op
        psrlq   xmm5,xmm9
        psrlq   xmm6,xmm9
        psrlq   xmm7,xmm9
        por     xmm0,xmm4
        por     xmm1,xmm5
        por     xmm2,xmm6
        por     xmm3,xmm7
        movdqa  [rsi+DQ1],xmm0
        movdqa  [rsi+DQ2],xmm1
        movdqa  [rsi+DQ3],xmm2
        movdqa  [rsi+DQ4],xmm3
}

;;;One pass: advance the 128-byte key at rdi in place, derive 64 bytes from
;;;the new key and XOR them into the block at rsi.
;;;Leaves the updated block in xmm0..xmm3; uses xmm0..xmm15.
macro PROEDec_KeyRound
{
    ;;load the key
        movdqa  xmm0,[rdi+DQ1]
        movdqa  xmm1,[rdi+DQ2]
        movdqa  xmm2,[rdi+DQ3]
        movdqa  xmm3,[rdi+DQ4]
        movdqa  xmm4,[rdi+DQ5]
        movdqa  xmm5,[rdi+DQ6]
        movdqa  xmm6,[rdi+DQ7]
        movdqa  xmm7,[rdi+DQ8]
    ;;xmm8..xmm15 = top bit of every key qword, moved down to bit 0
        movdqa  xmm8,xmm0
        movdqa  xmm9,xmm1
        movdqa  xmm10,xmm2
        movdqa  xmm11,xmm3
        movdqa  xmm12,xmm4
        movdqa  xmm13,xmm5
        movdqa  xmm14,xmm6
        movdqa  xmm15,xmm7
        psrlq   xmm8,63
        psrlq   xmm9,63
        psrlq   xmm10,63
        psrlq   xmm11,63
        psrlq   xmm12,63
        psrlq   xmm13,63
        psrlq   xmm14,63
        psrlq   xmm15,63
    ;;upper key half: shift left by one, bit 0 taken from the lower half's top bits
    ;;lower key half: add the upper half's top bits
        psllq   xmm4,1
        psllq   xmm5,1
        psllq   xmm6,1
        psllq   xmm7,1
        paddq   xmm0,xmm12
        paddq   xmm1,xmm13
        paddq   xmm2,xmm14
        paddq   xmm3,xmm15
        por     xmm4,xmm8
        por     xmm5,xmm9
        por     xmm6,xmm10
        por     xmm7,xmm11
    ;;rotate the qwords of the lower half left by 7, 5, 3 and 11 bits
        movdqa  xmm8,xmm0
        movdqa  xmm9,xmm1
        movdqa  xmm10,xmm2
        movdqa  xmm11,xmm3
        psllq   xmm0,7
        psllq   xmm1,5
        psllq   xmm2,3
        psllq   xmm3,11
        psrlq   xmm8,57                 ; 64-7
        psrlq   xmm9,59                 ; 64-5
        psrlq   xmm10,61                ; 64-3
        psrlq   xmm11,53                ; 64-11
        por     xmm0,xmm8
        por     xmm1,xmm9
        por     xmm2,xmm10
        por     xmm3,xmm11
    ;;rotate the dword order inside each of the four registers
        pshufd  xmm0,xmm0,00111001b
        pshufd  xmm1,xmm1,00111001b
        pshufd  xmm2,xmm2,00111001b
        pshufd  xmm3,xmm3,00111001b
    ;;combine with the old key (byte add for four slots, xor for the others)
        paddb   xmm1,[rdi+DQ1]
        paddb   xmm2,[rdi+DQ2]
        paddb   xmm3,[rdi+DQ3]
        paddb   xmm4,[rdi+DQ4]
        pxor    xmm5,[rdi+DQ5]
        pxor    xmm6,[rdi+DQ6]
        pxor    xmm7,[rdi+DQ7]
        pxor    xmm0,[rdi+DQ8]
    ;;write the new key back, rotated by one 16-byte slot
        movdqa  [rdi+DQ1],xmm1
        movdqa  [rdi+DQ2],xmm2
        movdqa  [rdi+DQ3],xmm3
        movdqa  [rdi+DQ4],xmm4
        movdqa  [rdi+DQ5],xmm5
        movdqa  [rdi+DQ6],xmm6
        movdqa  [rdi+DQ7],xmm7
        movdqa  [rdi+DQ8],xmm0
    ;;derive the 64 output bytes from the new key
        movdqa  xmm8,xmm0
        movdqa  xmm9,xmm1
        movdqa  xmm10,xmm2
        movdqa  xmm11,xmm3
        movdqa  xmm12,xmm4
        movdqa  xmm13,xmm5
        movdqa  xmm14,xmm6
        movdqa  xmm15,xmm7
        psllw   xmm0,3
        psllw   xmm1,5
        psllw   xmm2,7
        psllw   xmm3,11
        psrlw   xmm8,5
        psrlw   xmm9,7
        psrlw   xmm10,11
        psrlw   xmm11,3
        psllw   xmm4,7
        psllw   xmm5,11
        psllw   xmm6,3
        psllw   xmm7,5
        psrlw   xmm12,11
        psrlw   xmm13,3
        psrlw   xmm14,5
        psrlw   xmm15,7
        pxor    xmm0,xmm8
        pxor    xmm1,xmm9
        pxor    xmm2,xmm10
        pxor    xmm3,xmm11
        paddb   xmm4,xmm12
        paddb   xmm5,xmm13
        paddb   xmm6,xmm14
        paddb   xmm7,xmm15
        pxor    xmm0,xmm4
        pxor    xmm1,xmm5
        pxor    xmm2,xmm6
        paddb   xmm3,xmm7
    ;;xor the derived bytes into the block and store it
        pxor    xmm0,dqword[rsi+DQ1]
        pxor    xmm1,dqword[rsi+DQ2]
        pxor    xmm2,dqword[rsi+DQ3]
        pxor    xmm3,dqword[rsi+DQ4]
        movdqa  dqword[rsi+DQ1],xmm0
        movdqa  dqword[rsi+DQ2],xmm1
        movdqa  dqword[rsi+DQ3],xmm2
        movdqa  dqword[rsi+DQ4],xmm3
}

;;;Fold the block held in xmm0..xmm3 (as left by the last pass) into the
;;;16-byte checksum at rdi+128, using the key bytes saved before the passes.
;;;Uses xmm4..xmm10.
macro PROEDec_HashUpdate
{
        movdqa  xmm8,dqword[rdi+128]    ; current checksum
        pshufd  xmm4,xmm0,00111001b     ; dword-rotated copies of the block
        pshufd  xmm5,xmm1,00111001b
        pshufd  xmm6,xmm2,00111001b
        pshufd  xmm7,xmm3,00111001b
        movdqa  xmm9,xmm8
        pxor    xmm7,xmm0
        pxor    xmm6,xmm1
        psllq   xmm8,3
        movdqu  xmm10,[rsp+PROEDEC_KEY_SAVE]
        paddb   xmm9,xmm10              ; mix in the first 16 bytes of the pre-pass key
        pxor    xmm5,xmm2
        pxor    xmm4,xmm3
        pxor    xmm7,xmm6
        pxor    xmm5,xmm4
        paddb   xmm8,xmm7
        paddb   xmm9,xmm5
        pxor    xmm8,xmm9
        movdqa  [rdi+128],xmm8          ; store the new checksum
}

AMDPad16
;;;PROEDecryptBufferWithChecksum - decrypt a buffer in place, 64 bytes at a
;;;             time starting with the LAST block, and accumulate a 16-byte
;;;             checksum of the recovered data at key+128.  The 128 key bytes
;;;             are modified in place while the buffer is processed.
;;;PARAMETERS   RCX: buffer address, 16byte aligned
;;;             RDX: buffer length in bytes, multiple of 64
;;;             R8 : key and checksum address, 16 byte aligned, 128+16 bytes in size
;;;             R9 : number of passes, greater than or equal to 1, recommended 2
;;;RETURN       no return value is defined
;;;CLOBBERS     rax, rcx, rdx, r8, xmm0-xmm5, flags.
;;;             rbx, rsi, rdi, r12 and xmm6-xmm15 are saved and restored.
;;;NOTES        Returns immediately, without touching the buffer or the key,
;;;             when the length is below 64 or the pass count is 0.
;;;             The per-block copy of the first 16 key bytes lives inside this
;;;             routine's own stack frame, never below rsp.
PROEDecryptBufferWithChecksum:
        cmp     rdx,64
        jb      .reject                 ; no complete block: nothing may be read or written
        test    r9,r9
        jnz     .accept                 ; zero passes would make the pass counter wrap around
   .reject:
        ret     0
   .accept:
        sub     rsp,PROEDEC_FRAME
        ; movdqu: correct even if the caller did not align the stack
        movdqu  [rsp+PROEDEC_XMM_SAVE+16*0],xmm6
        movdqu  [rsp+PROEDEC_XMM_SAVE+16*1],xmm7
        movdqu  [rsp+PROEDEC_XMM_SAVE+16*2],xmm8
        movdqu  [rsp+PROEDEC_XMM_SAVE+16*3],xmm9
        movdqu  [rsp+PROEDEC_XMM_SAVE+16*4],xmm10
        movdqu  [rsp+PROEDEC_XMM_SAVE+16*5],xmm11
        movdqu  [rsp+PROEDEC_XMM_SAVE+16*6],xmm12
        movdqu  [rsp+PROEDEC_XMM_SAVE+16*7],xmm13
        movdqu  [rsp+PROEDEC_XMM_SAVE+16*8],xmm14
        movdqu  [rsp+PROEDEC_XMM_SAVE+16*9],xmm15
        mov     [rsp+PROEDEC_GPR_SAVE+8*0],rbx
        mov     [rsp+PROEDEC_GPR_SAVE+8*1],rsi
        mov     [rsp+PROEDEC_GPR_SAVE+8*2],rdi
        mov     [rsp+PROEDEC_GPR_SAVE+8*3],r12
        mov     rdi,r8                  ; rdi = key + checksum
        mov     r12,r9                  ; r12 = passes per block
        lea     rsi,[rcx+rdx]           ; rsi = end of buffer, blocks are taken back to front
        sub     rdx,64                  ; rdx = bytes left after the current block
        xor     eax,eax                 ; rax = running index (0..127) into the key
        lea     rcx,[RollLUT]
AMDPad16
   .decryptBlock:
        movdqa  xmm0,dqword[rdi]        ; key bytes 0..15 before the passes change them
        movzx   r8,byte[rdi+rax]        ; r8 = key byte selecting the rotate amount
        mov     rbx,r12                 ; rbx = pass counter
        add     rax,r8
        movdqu  dqword[rsp+PROEDEC_KEY_SAVE],xmm0
        and     eax,127
        sub     rsi,64
        PROEDec_RollLeft
AMDPad16
   .decryptBlock2:
        PROEDec_KeyRound
        dec     rbx
        jnz     .decryptBlock2
        PROEDec_HashUpdate
        sub     rdx,64
        js      .endDecryptBlock
;;;second copy of the loop body (the block loop is unrolled twice)
        movdqa  xmm0,dqword[rdi]
        movzx   r8,byte[rdi+rax]
        mov     rbx,r12
        add     rax,r8
        movdqu  dqword[rsp+PROEDEC_KEY_SAVE],xmm0
        and     eax,127
        sub     rsi,64
        PROEDec_RollLeft
AMDPad16
   .decryptBlock2_2:
        PROEDec_KeyRound
        dec     rbx
        jnz     .decryptBlock2_2
        PROEDec_HashUpdate
        sub     rdx,64
        jns     .decryptBlock
   .endDecryptBlock:
        movdqu  xmm6,[rsp+PROEDEC_XMM_SAVE+16*0]
        movdqu  xmm7,[rsp+PROEDEC_XMM_SAVE+16*1]
        movdqu  xmm8,[rsp+PROEDEC_XMM_SAVE+16*2]
        movdqu  xmm9,[rsp+PROEDEC_XMM_SAVE+16*3]
        movdqu  xmm10,[rsp+PROEDEC_XMM_SAVE+16*4]
        movdqu  xmm11,[rsp+PROEDEC_XMM_SAVE+16*5]
        movdqu  xmm12,[rsp+PROEDEC_XMM_SAVE+16*6]
        movdqu  xmm13,[rsp+PROEDEC_XMM_SAVE+16*7]
        movdqu  xmm14,[rsp+PROEDEC_XMM_SAVE+16*8]
        movdqu  xmm15,[rsp+PROEDEC_XMM_SAVE+16*9]
        mov     rbx,[rsp+PROEDEC_GPR_SAVE+8*0]
        mov     rsi,[rsp+PROEDEC_GPR_SAVE+8*1]
        mov     rdi,[rsp+PROEDEC_GPR_SAVE+8*2]
        mov     r12,[rsp+PROEDEC_GPR_SAVE+8*3]
        add     rsp,PROEDEC_FRAME
        ret     0


AMDPad16
;;;PROEEncryptBuffer
;;;Encrypts a buffer in place, 64 bytes at a time, starting with the last
;;;64-byte block of the buffer and working back towards the first one.
;;;For every block the key is advanced R9 times; each advance produces 64
;;;keystream bytes that are XORed into the block. After the last pass every
;;;qword of the block is bit-rolled by amounts fetched from RollLUT, indexed
;;;by a byte taken from the key. The key is modified in place.
;;;
;;;PARAMETERS   RCX: buffer address, 16byte aligned
;;;             RDX: buffer length in bytes, multiple of 64
;;;                  (a length below 64 returns without touching anything)
;;;             R8 : key address, 16 byte aligned, 128 bytes in size
;;;             R9 : number of passes, greater than or equal to 1, recommended 2
;;;                  (0 is not checked here and would wrap the pass counter)
;;;CLOBBERS     rax, rcx, rdx, r8, xmm0-xmm5, flags
;;;PRESERVES    rbx, rsi, rdi, r13 and xmm6-xmm15 (callee-saved registers
;;;             of the Windows x64 calling convention that are used below)
;;;REGISTERS    rdi = key, r13 = pass count, rbx = passes left for this block
;;;             rsi = current block, rdx = bytes still to do after this block
;;;             rax = index (0..127) of the key byte that selects the next roll
;;;             rcx = RollLUT, r8 = roll selector of the current block
;;;STACK FRAME  [rsp+16*0 .. rsp+16*9]  xmm6..xmm15
;;;             [rsp+8*20 .. rsp+8*23]  rbx, rsi, rdi, r13
;;;             [rsp+8*24]              padding, keeps rsp 16 byte aligned
PROEEncryptBuffer:
;;;refuse buffers without a single complete block: the loop below always
;;;processes one block before testing the length and would otherwise write
;;;in front of the buffer
        cmp     rdx,64
        jae     .start
        ret     0
   .start:
        sub     rsp,8*25
;;;xmm6-xmm15 are non-volatile for Win64 callers and are all overwritten below.
;;;movdqu is used so that a caller with a misaligned stack does not fault.
        movdqu  [rsp+16*0],xmm6
        movdqu  [rsp+16*1],xmm7
        movdqu  [rsp+16*2],xmm8
        movdqu  [rsp+16*3],xmm9
        movdqu  [rsp+16*4],xmm10
        movdqu  [rsp+16*5],xmm11
        movdqu  [rsp+16*6],xmm12
        movdqu  [rsp+16*7],xmm13
        movdqu  [rsp+16*8],xmm14
        movdqu  [rsp+16*9],xmm15
        mov     [rsp+8*20],rbx
        mov     [rsp+8*21],rsi
        mov     [rsp+8*22],rdi
        mov     [rsp+8*23],r13
        mov     rdi,r8
        mov     r13,r9
;;;point rsi one past the end of the buffer, blocks are taken from the back
        lea     rsi,[rcx+rdx]
        sub     rdx,64
        xor     eax,eax
        lea     rcx,[RollLUT]
AMDPad16
   .encryptBlock:
;;;pick the roll selector for this block before the key gets modified and
;;;advance the key index by that same byte, wrapping inside the 128 byte key
        movzx   r8,byte[rdi+rax]
        mov     rbx,r13
        add     rax,r8
        sub     rsi,64
        and     eax,127
AMDPad16
   .encryptBlock2:
;;setup
        movdqa  xmm0, [rdi+DQ1]
        movdqa  xmm1, [rdi+DQ2]
        movdqa  xmm2, [rdi+DQ3]
        movdqa  xmm3, [rdi+DQ4]
        movdqa  xmm4, [rdi+DQ5]
        movdqa  xmm5, [rdi+DQ6]
        movdqa  xmm6, [rdi+DQ7]
        movdqa  xmm7, [rdi+DQ8]
;;copy
        movdqa  xmm8, xmm0
        movdqa  xmm9, xmm1
        movdqa  xmm10, xmm2
        movdqa  xmm11, xmm3
        movdqa  xmm12, xmm4
        movdqa  xmm13, xmm5
        movdqa  xmm14, xmm6
        movdqa  xmm15, xmm7
;;shift right logical 63bits to have the mask of highest bit
        psrlq   xmm8, 63
        psrlq   xmm9, 63
        psrlq   xmm10, 63
        psrlq   xmm11, 63
        psrlq   xmm12, 63
        psrlq   xmm13, 63
        psrlq   xmm14, 63
        psrlq   xmm15, 63
;;shift left to remove the highest bit and empty the lowest
        psllq   xmm4, 1
        psllq   xmm5, 1
        psllq   xmm6, 1
        psllq   xmm7, 1
;;add masked bit (top bits of the upper key half go into the lower half)
        paddq   xmm0, xmm12
        paddq   xmm1, xmm13
        paddq   xmm2, xmm14
        paddq   xmm3, xmm15
;;logical or lowest bit (top bits of the lower key half go into the upper half)
        por     xmm4, xmm8
        por     xmm5, xmm9
        por     xmm6, xmm10
        por     xmm7, xmm11
;;copy
        movdqa  xmm8, xmm0
        movdqa  xmm9, xmm1
        movdqa  xmm10, xmm2
        movdqa  xmm11, xmm3
;;Bit ROLL by prime numbers 7, 5, 3, 11
        psllq   xmm0, 7
        psllq   xmm1, 5
        psllq   xmm2, 3
        psllq   xmm3, 11
        psrlq   xmm8, 57;64-7
        psrlq   xmm9, 59;64-5
        psrlq   xmm10, 61;64-3
        psrlq   xmm11, 53;64-11
        por     xmm0, xmm8
        por     xmm1, xmm9
        por     xmm2, xmm10
        por     xmm3, xmm11
;;Dword order switching (dword n of the result is dword n+1 of the source)
        pshufd  xmm0, xmm0, 00111001b
        pshufd  xmm1, xmm1, 00111001b
        pshufd  xmm2, xmm2, 00111001b
        pshufd  xmm3, xmm3, 00111001b
;;xor/add old key with new key
        paddb  xmm1,[rdi+DQ1]
        paddb  xmm2,[rdi+DQ2]
        paddb  xmm3,[rdi+DQ3]
        paddb  xmm4,[rdi+DQ4]
        pxor   xmm5,[rdi+DQ5]
        pxor   xmm6,[rdi+DQ6]
        pxor   xmm7,[rdi+DQ7]
        pxor   xmm0,[rdi+DQ8]
;;Modify Key with rotation of dq words (xmm1 goes to slot 1 ... xmm0 to slot 8)
        movdqa  [rdi+DQ1], xmm1
        movdqa  [rdi+DQ2], xmm2
        movdqa  [rdi+DQ3], xmm3
        movdqa  [rdi+DQ4], xmm4
        movdqa  [rdi+DQ5], xmm5
        movdqa  [rdi+DQ6], xmm6
        movdqa  [rdi+DQ7], xmm7
        movdqa  [rdi+DQ8], xmm0
;;prepare output: fold the 128 byte state down to 64 keystream bytes
        ;copy
        movdqa  xmm8, xmm0
        movdqa  xmm9, xmm1
        movdqa  xmm10, xmm2
        movdqa  xmm11, xmm3
        movdqa  xmm12, xmm4
        movdqa  xmm13, xmm5
        movdqa  xmm14, xmm6
        movdqa  xmm15, xmm7
        ;shift words by primes
        psllw   xmm0, 3
        psllw   xmm1, 5
        psllw   xmm2, 7
        psllw   xmm3, 11
        ;
        psrlw   xmm8, 5
        psrlw   xmm9, 7
        psrlw   xmm10, 11
        psrlw   xmm11, 3
        ;
        psllw   xmm4, 7
        psllw   xmm5, 11
        psllw   xmm6, 3
        psllw   xmm7, 5
        ;
        psrlw   xmm12, 11
        psrlw   xmm13, 3
        psrlw   xmm14, 5
        psrlw   xmm15, 7
        ; add / xor
        pxor    xmm0, xmm8
        pxor    xmm1, xmm9
        pxor    xmm2, xmm10
        pxor    xmm3, xmm11
        paddb   xmm4, xmm12
        paddb   xmm5, xmm13
        paddb   xmm6, xmm14
        paddb   xmm7, xmm15
        pxor    xmm0, xmm4
        pxor    xmm1, xmm5
        pxor    xmm2, xmm6
        paddb   xmm3, xmm7
;;;xor random 64bytes with block
        pxor    xmm0,dqword[rsi+DQ1]
        pxor    xmm1,dqword[rsi+DQ2]
        pxor    xmm2,dqword[rsi+DQ3]
        pxor    xmm3,dqword[rsi+DQ4]
;;;modify block
        movdqa  dqword[rsi+DQ1],xmm0
        movdqa  dqword[rsi+DQ2],xmm1
        movdqa  dqword[rsi+DQ3],xmm2
        movdqa  dqword[rsi+DQ4],xmm3
;;;next pass over the same block
        dec     rbx
        jnz     .encryptBlock2
;;;buffer bit shuffling (ROLling) using values from the key
;;;xmm0-xmm3 still hold the block that was just stored, so it is not reloaded.
;;;The right shift count comes from RollLUT[r8], the left shift count from the
;;;table that follows it ROLLLUT_SIZE bytes later (presumably 64 minus the
;;;first count, which makes this a rotate of every qword - verify in RollLUT).
        movdqa  xmm4,xmm0
        movdqa  xmm5,xmm1
        movdqa  xmm6,xmm2
        movdqa  xmm7,xmm3
        movq    xmm8,[rcx+r8*8]
        movq    xmm9,[rcx+r8*8+ROLLLUT_SIZE] ;;;+SizeOfLUT
        psrlq   xmm0,xmm8
        psrlq   xmm1,xmm8
        psrlq   xmm2,xmm8
        psrlq   xmm3,xmm8
        psllq   xmm4,xmm9
        psllq   xmm5,xmm9
        psllq   xmm6,xmm9
        psllq   xmm7,xmm9
        por     xmm0,xmm4
        por     xmm1,xmm5
        por     xmm2,xmm6
        por     xmm3,xmm7
        movdqa  [rsi+DQ1],xmm0
        movdqa  [rsi+DQ2],xmm1
        movdqa  [rsi+DQ3],xmm2
        movdqa  [rsi+DQ4],xmm3
;;;one block done, rdx turns negative once the first block has been processed
        sub     rdx,64
        jns     .encryptBlock
;;;restore the callee-saved registers and release the frame
        movdqu  xmm6,[rsp+16*0]
        movdqu  xmm7,[rsp+16*1]
        movdqu  xmm8,[rsp+16*2]
        movdqu  xmm9,[rsp+16*3]
        movdqu  xmm10,[rsp+16*4]
        movdqu  xmm11,[rsp+16*5]
        movdqu  xmm12,[rsp+16*6]
        movdqu  xmm13,[rsp+16*7]
        movdqu  xmm14,[rsp+16*8]
        movdqu  xmm15,[rsp+16*9]
        mov     rbx,[rsp+8*20]
        mov     rsi,[rsp+8*21]
        mov     rdi,[rsp+8*22]
        mov     r13,[rsp+8*23]
        add     rsp,8*25
        ret     0

AMDPad16
;;;PROEDecryptBuffer
;;;Decrypts, in place, a buffer produced by PROEEncryptBuffer with the same
;;;starting key and pass count. Blocks are visited in the same order as during
;;;encryption (last 64-byte block first). For every block the bit roll is
;;;undone first, then the key is advanced R9 times and each 64 byte keystream
;;;is XORed into the block. The key is modified in place.
;;;
;;;PARAMETERS   RCX: buffer address, 16byte aligned
;;;             RDX: buffer length in bytes, multiple of 64
;;;                  (a length below 64 returns without touching anything)
;;;             R8 : key address, 16 byte aligned, 128 bytes in size
;;;             R9 : number of passes, greater than or equal to 1, recommended 2
;;;                  (0 is not checked here and would wrap the pass counter)
;;;CLOBBERS     rax, rcx, rdx, r8, xmm0-xmm5, flags
;;;PRESERVES    rbx, rsi, rdi, r12 and xmm6-xmm15 (callee-saved registers
;;;             of the Windows x64 calling convention that are used below)
;;;REGISTERS    rdi = key, r12 = pass count, rbx = passes left for this block
;;;             rsi = current block, rdx = bytes still to do after this block
;;;             rax = index (0..127) of the key byte that selects the next roll
;;;             rcx = RollLUT, r8 = roll selector of the current block
;;;STACK FRAME  [rsp+16*0 .. rsp+16*9]  xmm6..xmm15
;;;             [rsp+8*20 .. rsp+8*23]  rbx, rsi, rdi, r12
;;;             [rsp+8*24]              padding, keeps rsp 16 byte aligned
PROEDecryptBuffer:
;;;refuse buffers without a single complete block: the loop below always
;;;processes one block before testing the length and would otherwise write
;;;in front of the buffer
        cmp     rdx,64
        jae     .start
        ret     0
   .start:
        sub     rsp,8*25
;;;xmm6-xmm15 are non-volatile for Win64 callers and are all overwritten below.
;;;movdqu is used so that a caller with a misaligned stack does not fault.
        movdqu  [rsp+16*0],xmm6
        movdqu  [rsp+16*1],xmm7
        movdqu  [rsp+16*2],xmm8
        movdqu  [rsp+16*3],xmm9
        movdqu  [rsp+16*4],xmm10
        movdqu  [rsp+16*5],xmm11
        movdqu  [rsp+16*6],xmm12
        movdqu  [rsp+16*7],xmm13
        movdqu  [rsp+16*8],xmm14
        movdqu  [rsp+16*9],xmm15
        mov     [rsp+8*20],rbx
        mov     [rsp+8*21],rsi
        mov     [rsp+8*22],rdi
        mov     [rsp+8*23],r12
        mov     rdi,r8
        mov     r12,r9
;;;point rsi one past the end of the buffer, blocks are taken from the back
        lea     rsi,[rcx+rdx]
        sub     rdx,64
        xor     eax,eax
        lea     rcx,[RollLUT]
AMDPad16
   .decryptBlock:
;;;pick the roll selector for this block before the key gets modified and
;;;advance the key index by that same byte, wrapping inside the 128 byte key
        movzx   r8,byte[rdi+rax]
        mov     rbx,r12
        add     rax,r8
        sub     rsi,64
        and     eax,127
;;;buffer bit shuffling (ROLling) using values from the key
;;;Same two table entries as in PROEEncryptBuffer but with the shift directions
;;;swapped, which reverses the roll applied at the end of encryption.
        movdqa  xmm0,[rsi+DQ1]
        movdqa  xmm1,[rsi+DQ2]
        movdqa  xmm2,[rsi+DQ3]
        movdqa  xmm3,[rsi+DQ4]
        movdqa  xmm4,xmm0
        movdqa  xmm5,xmm1
        movdqa  xmm6,xmm2
        movdqa  xmm7,xmm3
        movq    xmm8,[rcx+r8*8]
        movq    xmm9,[rcx+r8*8+ROLLLUT_SIZE] ;;;+SizeOfLUT
        psllq   xmm0,xmm8
        psllq   xmm1,xmm8
        psllq   xmm2,xmm8
        psllq   xmm3,xmm8
        psrlq   xmm4,xmm9
        psrlq   xmm5,xmm9
        psrlq   xmm6,xmm9
        psrlq   xmm7,xmm9
        por     xmm0,xmm4
        por     xmm1,xmm5
        por     xmm2,xmm6
        por     xmm3,xmm7
        movdqa  [rsi+DQ1],xmm0
        movdqa  [rsi+DQ2],xmm1
        movdqa  [rsi+DQ3],xmm2
        movdqa  [rsi+DQ4],xmm3
AMDPad16
   .decryptBlock2:
;;setup
        movdqa  xmm0, [rdi+DQ1]
        movdqa  xmm1, [rdi+DQ2]
        movdqa  xmm2, [rdi+DQ3]
        movdqa  xmm3, [rdi+DQ4]
        movdqa  xmm4, [rdi+DQ5]
        movdqa  xmm5, [rdi+DQ6]
        movdqa  xmm6, [rdi+DQ7]
        movdqa  xmm7, [rdi+DQ8]
;;copy
        movdqa  xmm8, xmm0
        movdqa  xmm9, xmm1
        movdqa  xmm10, xmm2
        movdqa  xmm11, xmm3
        movdqa  xmm12, xmm4
        movdqa  xmm13, xmm5
        movdqa  xmm14, xmm6
        movdqa  xmm15, xmm7
;;shift right logical 63bits to have the mask of highest bit
        psrlq   xmm8, 63
        psrlq   xmm9, 63
        psrlq   xmm10, 63
        psrlq   xmm11, 63
        psrlq   xmm12, 63
        psrlq   xmm13, 63
        psrlq   xmm14, 63
        psrlq   xmm15, 63
;;shift left to remove the highest bit and empty the lowest
        psllq   xmm4, 1
        psllq   xmm5, 1
        psllq   xmm6, 1
        psllq   xmm7, 1
;;add masked bit (top bits of the upper key half go into the lower half)
        paddq   xmm0, xmm12
        paddq   xmm1, xmm13
        paddq   xmm2, xmm14
        paddq   xmm3, xmm15
;;logical or lowest bit (top bits of the lower key half go into the upper half)
        por     xmm4, xmm8
        por     xmm5, xmm9
        por     xmm6, xmm10
        por     xmm7, xmm11
;;copy
        movdqa  xmm8, xmm0
        movdqa  xmm9, xmm1
        movdqa  xmm10, xmm2
        movdqa  xmm11, xmm3
;;Bit ROLL by prime numbers 7, 5, 3, 11
        psllq   xmm0, 7
        psllq   xmm1, 5
        psllq   xmm2, 3
        psllq   xmm3, 11
        psrlq   xmm8, 57;64-7
        psrlq   xmm9, 59;64-5
        psrlq   xmm10, 61;64-3
        psrlq   xmm11, 53;64-11
        por     xmm0, xmm8
        por     xmm1, xmm9
        por     xmm2, xmm10
        por     xmm3, xmm11
;;Dword order switching (dword n of the result is dword n+1 of the source)
        pshufd  xmm0, xmm0, 00111001b
        pshufd  xmm1, xmm1, 00111001b
        pshufd  xmm2, xmm2, 00111001b
        pshufd  xmm3, xmm3, 00111001b
;;xor/add old key with new key
        paddb  xmm1,[rdi+DQ1]
        paddb  xmm2,[rdi+DQ2]
        paddb  xmm3,[rdi+DQ3]
        paddb  xmm4,[rdi+DQ4]
        pxor   xmm5,[rdi+DQ5]
        pxor   xmm6,[rdi+DQ6]
        pxor   xmm7,[rdi+DQ7]
        pxor   xmm0,[rdi+DQ8]
;;Modify Key with rotation of dq words (xmm1 goes to slot 1 ... xmm0 to slot 8)
        movdqa  [rdi+DQ1], xmm1
        movdqa  [rdi+DQ2], xmm2
        movdqa  [rdi+DQ3], xmm3
        movdqa  [rdi+DQ4], xmm4
        movdqa  [rdi+DQ5], xmm5
        movdqa  [rdi+DQ6], xmm6
        movdqa  [rdi+DQ7], xmm7
        movdqa  [rdi+DQ8], xmm0
;;prepare output: fold the 128 byte state down to 64 keystream bytes
        ;copy
        movdqa  xmm8, xmm0
        movdqa  xmm9, xmm1
        movdqa  xmm10, xmm2
        movdqa  xmm11, xmm3
        movdqa  xmm12, xmm4
        movdqa  xmm13, xmm5
        movdqa  xmm14, xmm6
        movdqa  xmm15, xmm7
        ;shift words by primes
        psllw   xmm0, 3
        psllw   xmm1, 5
        psllw   xmm2, 7
        psllw   xmm3, 11
        ;
        psrlw   xmm8, 5
        psrlw   xmm9, 7
        psrlw   xmm10, 11
        psrlw   xmm11, 3
        ;
        psllw   xmm4, 7
        psllw   xmm5, 11
        psllw   xmm6, 3
        psllw   xmm7, 5
        ;
        psrlw   xmm12, 11
        psrlw   xmm13, 3
        psrlw   xmm14, 5
        psrlw   xmm15, 7
        ; add / xor
        pxor    xmm0, xmm8
        pxor    xmm1, xmm9
        pxor    xmm2, xmm10
        pxor    xmm3, xmm11
        paddb   xmm4, xmm12
        paddb   xmm5, xmm13
        paddb   xmm6, xmm14
        paddb   xmm7, xmm15
        pxor    xmm0, xmm4
        pxor    xmm1, xmm5
        pxor    xmm2, xmm6
        paddb   xmm3, xmm7
;;;xor random 64bytes with block
        pxor    xmm0,dqword[rsi+DQ1]
        pxor    xmm1,dqword[rsi+DQ2]
        pxor    xmm2,dqword[rsi+DQ3]
        pxor    xmm3,dqword[rsi+DQ4]
;;;modify block
        movdqa  dqword[rsi+DQ1],xmm0
        movdqa  dqword[rsi+DQ2],xmm1
        movdqa  dqword[rsi+DQ3],xmm2
        movdqa  dqword[rsi+DQ4],xmm3
;;;next pass over the same block
        dec     rbx
        jnz     .decryptBlock2
;;;one block done, rdx turns negative once the first block has been processed
        sub     rdx,64
        jns      .decryptBlock
;;;restore the callee-saved registers and release the frame
        movdqu  xmm6,[rsp+16*0]
        movdqu  xmm7,[rsp+16*1]
        movdqu  xmm8,[rsp+16*2]
        movdqu  xmm9,[rsp+16*3]
        movdqu  xmm10,[rsp+16*4]
        movdqu  xmm11,[rsp+16*5]
        movdqu  xmm12,[rsp+16*6]
        movdqu  xmm13,[rsp+16*7]
        movdqu  xmm14,[rsp+16*8]
        movdqu  xmm15,[rsp+16*9]
        mov     rbx,[rsp+8*20]
        mov     rsi,[rsp+8*21]
        mov     rdi,[rsp+8*22]
        mov     r12,[rsp+8*23]
        add     rsp,8*25
        ret     0


;;;Import table of the DLL. The only import declared here is MessageBoxA from
;;;USER32.DLL, reachable in the code under the name MessageBox.
;;;(`library` and `import` are macros defined outside this part of the file.)
section '.idata' import data readable writeable
;;;API imports
  library user32,'USER32.DLL'

        import user32,\
               MessageBox,'MessageBoxA'

;;;Export table: the module name recorded in the DLL followed by
;;;label,'exported name' pairs for the six public procedures.
section '.edata' export data readable

  export 'PROE_Lib_Win64.DLL',\
         PROEEncryptBuffer,'PROEEncryptBuffer',\
         PROEEncryptBufferWithChecksum,'PROEEncryptBufferWithChecksum',\
         PROEDecryptBuffer,'PROEDecryptBuffer',\
         PROEDecryptBufferWithChecksum,'PROEDecryptBufferWithChecksum',\
         PROEValidateParameters,'PROEValidateParameters',\
         PROESelfTest,'PROESelfTest'

;;;Relocation (fixup) data generated by the assembler so the DLL can be loaded
;;;at an address other than its preferred base.
section '.reloc' fixups data discardable 
    


Last edited by r22 on 29 Mar 2007, 20:30; edited 3 times in total
Post 07 Jan 2007, 00:23
View user's profile Send private message AIM Address Yahoo Messenger Reply with quote
LocoDelAssembly
Your code has a bug


Joined: 06 May 2005
Posts: 4633
Location: Argentina
LocoDelAssembly
r22, this algorithm is based on someone else or you invented it from scratch?

I checked https://sourceforge.net/projects/proe but I don't see any docs explaining the background of this algorithm nor any proof that it is uncrackable (though, don't make any math demonstration for me because I'll not understand anything Razz, I'm just suggesting the documentation to attract more users).

640 MB in 1.5 seconds, pretty fast Very Happy
Post 07 Jan 2007, 00:55
View user's profile Send private message Reply with quote
r22



Joined: 27 Dec 2004
Posts: 805
r22
Yeah the SF.net project page is a work in progress.

I pretty much came up with the algorithm from scratch (including the hash), although when you break the PRNG part of the algorithm down into its simplest form it's similar to a random bit stream generator using a shift register.

Its uncrackability is based on boolean logic.
128byte seed goes in ->
it gets manipulated based on the prng ->
it then gets XOR'd and ADD'd with itself to make 64bytes ->
those 64bytes are XOR'd with the data to encrypt it

The only way to crack it would be to guess the 128byte seed, so when computers are able to crunch 2^( 128*8 ) iterations the algorithm will become obsolete.

The PRNG needs more testing, I've only used ENT to prove its randomness.
Post 07 Jan 2007, 01:27
View user's profile Send private message AIM Address Yahoo Messenger Reply with quote
vid
Verbosity in development


Joined: 05 Sep 2003
Posts: 7105
Location: Slovakia
vid
r22: you may try to give your algo to some brainheads to crack it.


Last edited by vid on 13 Jan 2007, 10:17; edited 1 time in total
Post 07 Jan 2007, 07:13
View user's profile Send private message Visit poster's website AIM Address MSN Messenger ICQ Number Reply with quote
ACP



Joined: 23 Sep 2006
Posts: 204
ACP
Sorry, I didn't have time to look at your code yet, but if I understand your description correctly you basically do xor on the input stream with a 64 bytes long key. If this is the case then the encryption can be easily broken.
Post 08 Jan 2007, 00:12
View user's profile Send private message Reply with quote
r22



Joined: 27 Dec 2004
Posts: 805
r22
ACP: My description was incomplete as I didn't mention the pseudo random bit rolling process in the algorithm.

But in any case, after you've looked at the code, I'd like to hear an elaboration of how it can be easily broken. If that's still your position after you've looked at the implementation.
Post 08 Jan 2007, 00:24
View user's profile Send private message AIM Address Yahoo Messenger Reply with quote
r22



Joined: 27 Dec 2004
Posts: 805
r22
Here's an almost finalized draft of the paper that goes with the PROE algorithm. It's a short thesis that goes into the AES as well as optimization and security concerns.

It's the paper I'm using for honors credit at the college I attend (RIC www.ric.edu). Chapter 2 encompasses all the procedures that make up PROE with detailed descriptions.


Description: 3/29/07 Algorithm Documentation Updated
Download
Filename: PROEThesis_Final.zip
Filesize: 108.89 KB
Downloaded: 150 Time(s)



Last edited by r22 on 29 Mar 2007, 20:46; edited 4 times in total
Post 08 Jan 2007, 00:38
View user's profile Send private message AIM Address Yahoo Messenger Reply with quote
sylwek32



Joined: 27 Apr 2006
Posts: 339
sylwek32
too bad i haven't got a 64bit machine to test it. Very Happy
Post 08 Jan 2007, 01:49
View user's profile Send private message Reply with quote
r22



Joined: 27 Dec 2004
Posts: 805
r22
TESTING UPDATE: I've run the PRNG 100 billion times on a key and have found that the first 16 bytes of the returned 64 bytes do NOT repeat. I'm sure there are keys that will produce less secure results, but since the keys should be generated pseudo randomly those malfunctioning keys would probably be nonexistent or have a ludicrously small probability of showing up.

I could make a 32bit version but the way the algorithm is set up it'd take a pretty big hit in performance.

64bit ROL and ROR on a 32bit machine is messy, also the lack of xmmx registers.

I could probably make an incompatible 32bit version that would essentially be the 64bit version scaled in half (64byte keys instead of 128) and (32byte block size instead of 64).
Post 09 Jan 2007, 00:24
View user's profile Send private message AIM Address Yahoo Messenger Reply with quote
vid
Verbosity in development


Joined: 05 Sep 2003
Posts: 7105
Location: Slovakia
vid
r22: for PRNG use windoze CryptAPI
Post 09 Jan 2007, 00:53
View user's profile Send private message Visit poster's website AIM Address MSN Messenger ICQ Number Reply with quote
r22



Joined: 27 Dec 2004
Posts: 805
r22
Do you mean for the actual PRNG or to create the 128byte key for it?

Right now the p.o.c. dialog application I have creates the keys using a low level mouse hook and rdtsc values.

As for the PRNG algorithm itself, I've tested it pretty thoroughly and am confident its randomness is at a level suitable for cryptography (until proven otherwise Razz ).

An optimized version of 3DES, Blowfish, Twofish or AES in assembly to test the encryption speed against would be very helpful.

PROE seems to encrypt at around ~15 clock cycles / byte

I'm also thinking about switching the BitRolling portion of the algorithm from using the key values to using a LUT with a key value as an index the LUT would contain sequences of prime numbers to BitRoll by. This would take away from the greater randomness of using the truncated seed/key bytes but I still think it would be a cleaner implementation.

Also it would add a table of hex values which (we can all agree) makes fancy encryption algorithms look cooler Very Happy
Post 09 Jan 2007, 02:58
View user's profile Send private message AIM Address Yahoo Messenger Reply with quote
vid
Verbosity in development


Joined: 05 Sep 2003
Posts: 7105
Location: Slovakia
vid
Post 09 Jan 2007, 03:26
View user's profile Send private message Visit poster's website AIM Address MSN Messenger ICQ Number Reply with quote
r22



Joined: 27 Dec 2004
Posts: 805
r22
Good read thank you for the link.

Right now the PRNG passes the N-bits test, and somewhat passes the state compromise test (since you wouldn't be able to FULLY reproduce the previous values, but you can reproduce a portion of them).

By adding a XORing step to the key saving portion of the algorithm I believe I can make the PRNG invulnerable to a state compromise.

This weekend I'll implement this change and also convert the bit rolling portion of the PROE algorithm to use SSE and LUTs (should be much faster), even though the probability of cracking will improve slightly
from (8*63)*( 2^(128*8) ) to (3.988*63)*( 2^(128*8) )
Post 11 Jan 2007, 01:53
View user's profile Send private message AIM Address Yahoo Messenger Reply with quote
ACP



Joined: 23 Sep 2006
Posts: 204
ACP
r22 wrote:
ACP: My description was incomplete as I didn't mention the pseudo random bit rolling process in the algorithm.

But in any case, after you've looked at the code, I'd like to hear an elaboration of how it can be easily broken. If that's still your position after you've looked at the implementation.

Thanks for the documentation. I'll read it and let you know.
Post 12 Jan 2007, 22:15
View user's profile Send private message Reply with quote
r22



Joined: 27 Dec 2004
Posts: 805
r22
I've updated the first post with the newest iteration of the PROE algorithm (and final version until proven otherwise Razz). I've also updated the documentation post.

I'll get around to modifying the sourceforge project soon.

I think the new bit rolling implementation is pretty good.
I choose a byte value from the seed and use that as the index into a LUT which has the number of bits to roll by. Also the index into the seed is random.

As for the PRNG portion of the algorithm I went with PADDB and PXOR to modify the key based on the previous key values, this should make any state compromise of the algorithm unable to reproduce previous values.

I also retested the randomness after I made this change to make sure it didn't affect entropy, chi square distribution, correlation and monte carlo pi derivation.
Post 14 Jan 2007, 02:54
View user's profile Send private message AIM Address Yahoo Messenger Reply with quote
r22



Joined: 27 Dec 2004
Posts: 805
r22
Using the library and a small benchmark utility I've found that the PROE algorithm's 64bit implementation can encrypt at roughly

3 CPU CYCLES / BYTE (AMD x2 3800+ 2.0GHz 1GB RAM)

If anyone with a 64bit version of Windows can test this I'd appreciate it. Heres the source, exe and dll.

The only chart I really have for comparison is from
www.schneier.com/aes-letter-26apr.html
and it doesn't represent highly optimized versions of the other algorithms.

If you don't have enough RAM to allocate 640MB your results will be MUCH slower because disk paging will be involved (in this case just modify the benchmark source to use some smaller multiple of 64 in the TEST_SIZE equate at the top of the source).

EDIT: Feryno, thanks for the heads up. I should really start using the proc macro.


Description: Source, EXE, and DLL.
Download
Filename: PROELibWin64.zip
Filesize: 10 KB
Downloaded: 123 Time(s)



Last edited by r22 on 29 Mar 2007, 21:03; edited 3 times in total
Post 16 Jan 2007, 01:38
View user's profile Send private message AIM Address Yahoo Messenger Reply with quote
Feryno



Joined: 23 Mar 2005
Posts: 466
Location: Czech republic, Slovak republic
Feryno
r22 pls read private message (2007 january 15) for Vista RC2 x64 report
Post 16 Jan 2007, 08:42
View user's profile Send private message Visit poster's website ICQ Number Reply with quote
r22



Joined: 27 Dec 2004
Posts: 805
r22
Anyone have any ideas for fast I/O on win 64. Right now I'm using _SEQUENTIAL flag for the reading and the WRITE_THROUGH flag for the writing.

I was thinking about using file mapping but haven't had the time to read up on it.
-Can I allocate more space than the file actually takes up (like pad it to a length that is a multiple of 64?)
-What about large files ( > 800MB) will I have to map only a portion at a time.
-If I'm using file mapping to modify (encrypt) a file can I save it as a new file or will it just overwrite the original. Some people might want to encrypt a file (to put on a less secure back up media) but not delete the original. This could be solved by simply copying the file, but then again if its a large file... Also I'd lose the ability to do a simple file scrub on the original before deleting it.
-Has anyone benchmarked the speed of using a file mapping object VS reading a file into a buffer, modifying the buffer, then writing it back out again.
Post 19 Jan 2007, 02:28
View user's profile Send private message AIM Address Yahoo Messenger Reply with quote
f0dder



Joined: 19 Feb 2004
Posts: 3170
Location: Denmark
f0dder
Memory mapping is mostly a convenience thing - and with 64bit windows, it's convenient again (you can map entire files even if they're huge, without having to map/unmap windows of it like on win32).

Yes, you can increase file size, but this can only be done at the time of CreateFileMapping(), and the file will always be extended fully to that size, so you'll need SetEndOfFile() later on.

note: you get additional CPU load because mmap relies on the pagefault mechanism, including relatively costly ring transitions. Also, if you map huge files, you'll use additional memory for the required pagetable mappings.

r22 wrote:

If I'm using file mapping to modify (encrypt) a file can I save it as a new file or will it just overwrite the original.

Depends on how you do this Smile. If you use in-place encryption, you'll overwrite the original file (look up the copy-on-write functionality though) - but you can use non-inplace encryption with one filemap for input and another for output.

r22 wrote:

Has anyone benchmarked the speed of using a file mapping object VS reading a file into a buffer, modifying the buffer, then writing it back out again.

Yeah, informally.

You'll end up finishing in the same clock time, the mmap method takes more cpu time though.

If you do sequential file processing, probably the most efficient method is to use FILE_FLAG_NO_BUFFERING and have decent buffer sizes. In addition to that, you can do OVERLAPPED file I/O, allowing you to process one buffer while the next one is being read from the file.
Post 19 Jan 2007, 10:05
View user's profile Send private message Visit poster's website Reply with quote
r22



Joined: 27 Dec 2004
Posts: 805
r22
I updated the algorithm documentation to fix a few errors and update the make believe pseudo code I was using to a more standard format.
Post 08 Feb 2007, 05:48
View user's profile Send private message AIM Address Yahoo Messenger Reply with quote
Display posts from previous:
Post new topic Reply to topic

Jump to:  
Goto page 1, 2  Next

< Last Thread | Next Thread >
Forum Rules:
You cannot post new topics in this forum
You cannot reply to topics in this forum
You cannot edit your posts in this forum
You cannot delete your posts in this forum
You cannot vote in polls in this forum
You cannot attach files in this forum
You can download files in this forum


Copyright © 1999-2020, Tomasz Grysztar. Also on GitHub, YouTube, Twitter.

Website powered by rwasa.