; -----------------------------------------------------------------------------
; Major.Minor.Build: 0.5.7
; -----------------------------------------------------------------------------
; Copyright (c) 2008, Alex Patterson, Greenville, WI. All rights reserved.
;
; License Terms
; 
; The free distribution and use of this software is allowed (with or without
;  changes) provided that:
;
;  1. Source code distributions include the above copyright notice, this
;      list of conditions and the following disclaimer.
; 
;  2. Binary distributions include the above copyright notice, this list
;      of conditions and the following disclaimer in their documentation.
; 
;  3. The name of the copyright holder is not used to endorse products
;      built using this software without specific written permission.
; 
; Disclaimer
; 
; This software is provided 'as is' by the author, who assumes no
;  liability for any and all negative results of using this software.
; -----------------------------------------------------------------------------
; Issue Date < release date > day/month/year
;
; An AES implementation for x86 processors, written for the FASM assembler and
;  intended mainly for educational use.  This code provides the standard AES
;  block size of 128 bits (16 bytes), and currently supports 256-bit keys only,
;  due to personal use requirements.
;
;  calling interfaces:
;
;  AES_Init           ( void )   builds the lookup tables; call it once before
;                                any of the routines below
;  AES_EncryptKeys    ( byte key[32] )
;  AES_DecryptKeys    ( byte key[32] )
;  AES_EncryptBlock   ( byte in[16], byte out[16] )
;  AES_DecryptBlock   ( byte in[16], byte out[16] )
;  AES_SecureMemory   ( void )
;
; In this implementation the stdcall convention is used: the parameters are
;  pushed onto the stack in reverse order, and the callee removes them from
;  the stack when it returns.  The callee-saved registers (ebx, esi, edi & ebp)
;  are preserved across calls to these functions.
;
; The encryption key schedule is generated in forward order, according to the
;  following diagram.
;
;  lower address   [  encryption round 0   ]   first key addition
;                  [  encryption round 1   ]
;                  [  encryption round 2   ]   main n-1 round table lookups
;                  [  encryption round ..  ]
;                  [  encryption round n-1 ]
;  higher address  [  encryption round n   ]   final substitution & key addition
;
;
; The decryption key schedule is also generated in forward order, and is accessed
;  in reverse during decryption.
;
;   --  [  decryption round n   ] =                [ encryption round 0   ]
;       [  decryption round n-1 ] = InvMixColumns( [ encryption round 1   ] )
;       [  decryption round ..  ] = InvMixColumns( [ encryption round 2   ] )
;       [  decryption round 2   ] = InvMixColumns( [ encryption round ..  ] )
;       [  decryption round 1   ] = InvMixColumns( [ encryption round n-1 ] )
;   ++  [  decryption round 0   ] =                [ encryption round n   ]
;

; -----------------------------------------------------------------------------
;
; fasm output format (ELF object) and exported symbols
; -----------------------------------------------------------------------------

; Build-target switch.  While SYS_LIN is defined, the file is assembled as an
; ELF object and the routines are exported under the names listed below.
SYS_LIN=1

if defined SYS_LIN

 ; output format, plus the `struct` / `ends` macros used by the structure
 ; definitions further down
 format ELF
 include '%fasm%\macro\struct.inc'

 ; internal label -> exported symbol name
 public aes_init    as 'AES_Init'
 public aes_enckey  as 'AES_EncryptKeys'
 public aes_deckey  as 'AES_DecryptKeys'
 public aes_encuser as 'AES_EncryptBlock'
 public aes_decuser as 'AES_DecryptBlock'
 public aes_memory  as 'AES_SecureMemory'

end if

; -----------------------------------------------------------------------------
;
; algorithm constants
; -----------------------------------------------------------------------------

 ; shorthand size operators used in memory operands throughout the file
 b equ byte
 d equ dword

; table generation (galois field polynomials / memory)
;
; alog_table / log_table are 256-byte scratch tables that aes_init overlays on
; the first 512 bytes of tab_t2.  aes_init fills tab_t2 with its final contents
; only after it has finished using them.

 gf_modulus	     = 0x1b ; xored into a byte when doubling it carries out of bit 7
 gf_affine	     = 0x1f ; mask rotated bit by bit while building an s-box entry
 gf_magic	     = 0x63 ; starting value of each s-box entry; also tab_s[0]
 alog_table	     = aes_tables.tab_t2
 log_table	     = aes_tables.tab_t2+256

; stack frame
;
; Parameter offsets are measured from the return address, i.e. they are used as
; [esp+aes_stack_space+offset] once preserve_common_registers has run.

 aes_stack_space     = 16 ; stack space for preserved registers
 aes_in_blk	     = 4  ; offset to in[] parameter
 aes_out_blk	     = 8  ; offset to out[] parameter
 aes_schedule_param  = 4  ; schedule's offset to parameter

; aes constants
;
; NOTE(review): the routines below also hard-code values that follow from this
; constant (rept 13, 52 words, offsets 208 / 224 / 240), so changing it alone
; is not enough.

 aes_max_rounds      = 14 ; aes-256 rounds

; size control options

 aes_encrypt_unroll  = 0  ; 0 for small & slow looped, 1 for big & fast unrolled
 aes_decrypt_unroll  = 0  ; same choice for aes_decuser
 aes_schedule_unroll = 0  ; same choice for aes_enckey / aes_deckey

; -----------------------------------------------------------------------------
;
; structures
; -----------------------------------------------------------------------------

 ; Per-key working memory: both key schedules plus the block being processed.
 ; aes_memory overwrites this area.
 struct aes_state_
     align 16
     keys	rd 4*(aes_max_rounds+1) ; encryption schedule, 4 dwords per round key
     align 16
     keys_d	rd 4*(aes_max_rounds+1) ; decryption schedule, same layout
     align 16
     state	rb 16			; working block; rounds alternate between
		rb 16			;  this buffer and the unnamed one after it
 ends

 ; Lookup tables, all generated at run time by aes_init.
 struct aes_tables_
     tab_s	rb 256	; s-box (key schedule and final encryption round)
     tab_si	rb 256	; inverse s-box (final decryption round)
     tab_t1	rd 256	; t1..t4: encryption round tables, each the previous
     tab_t2	rd 256	;  one rotated left by 8 bits (tab_t2 doubles as the
     tab_t3	rd 256	;  alog/log scratch area during aes_init)
     tab_t4	rd 256
     tab_t5	rd 256	; t5..t8: decryption round tables, indexed by state byte
     tab_t6	rd 256
     tab_t7	rd 256
     tab_t8	rd 256
     tab_u1	rd 256	; u1..u4: tables used by aes_deckey to transform the
     tab_u2	rd 256	;  middle round keys of the decryption schedule
     tab_u3	rd 256
     tab_u4	rd 256
     tab_rc	rd 30	; round constants for the key schedule
 ends

; -----------------------------------------------------------------------------
;
; '.data' section - reserved memory for tables | schedules | state
; -----------------------------------------------------------------------------
; Writable storage.  Both instances are reserved space only: the tables are
; filled by aes_init and the state by the key-schedule and block routines.
if defined SYS_LIN

 section '.data' writeable align 16

end if

 align 16
 aes_tables aes_tables_
 align 16
 aes_state  aes_state_

; -----------------------------------------------------------------------------
;
; core schedule routine macros
; -----------------------------------------------------------------------------
 ; normalkey - ordinary schedule word: w[i] = eax ^ w[i-8].
 ; In:  eax = previous word, edi -> slot for w[i] (so [edi-32] is 8 dwords back)
 ; Out: eax = w[i], stored at [edi]; stosd advances edi by 4 (needs DF clear)
 macro normalkey     {
     xor    eax, [edi-32]
     stosd	     }

 ; firstkey rcon - first word of each 8-word group.
 ; The ror 16 plus the 24 bits of rotation inside `substitute` leave eax rotated
 ; right by 8 in total with every byte passed through the table at ebx; the
 ; result is xored with the dword at [rcon] and with w[i-8], then stored.
 ; In:  eax = previous word, ebx -> byte table for xlatb, edi -> slot for w[i]
 macro firstkey rcon {
     ror    eax, 16
     substitute
     xor    eax, [rcon]
     xor    eax, [edi-32]
     stosd	     }

 ; middlekey - fifth word of each 8-word group: every byte of eax is passed
 ; through the table at ebx with no net rotation (the trailing ror 8 completes
 ; the 24 bits rotated by `substitute`), then the word is finished by normalkey.
 macro middlekey     {
     substitute
     ror    eax, 8
     normalkey	     }

 ; substitute - replaces each of the four bytes of eax with table[byte], using
 ; xlatb on the table at ebx.  Leaves eax rotated right by 24 bits in total;
 ; callers add their own rotation to get the alignment they need.
 macro substitute    {
     xlatb
     ror    eax, 8
     xlatb
     ror    eax, 8
     xlatb
     ror    eax, 8
     xlatb	     }

 ; table_lookup_keys_d source - converts one encryption schedule word into the
 ; matching decryption schedule word: the four bytes at [source] index
 ; tab_u1..tab_u4 and the four entries are xored together.
 ; Out:   result stored at [edi]; stosd advances edi by 4 (needs DF clear)
 ; Clobb: eax, ebx, ecx, edx (each ends up zero-extended from a byte or holding
 ;        the result), flags
 macro table_lookup_keys_d source {

    movzx   eax, b[source+0]
    movzx   ebx, b[source+1]
    movzx   ecx, b[source+2]
    movzx   edx, b[source+3]
    mov     eax, d[eax*4+aes_tables.tab_u1]
    xor     eax, d[ebx*4+aes_tables.tab_u2]
    xor     eax, d[ecx*4+aes_tables.tab_u3]
    xor     eax, d[edx*4+aes_tables.tab_u4]
    stosd  }

; -----------------------------------------------------------------------------
;
; core aes routine macros
; -----------------------------------------------------------------------------

; encryption operations

 ; table_lookup_key_add source,dest,key - one full encryption round.
 ; For each of the four output columns, byte 0 is taken from the same column of
 ; [source], byte 1 from the next column, byte 2 from the one after and byte 3
 ; from the one after that (column numbers wrap modulo 4).  The bytes index
 ; tab_t1..tab_t4, the entries are xored with the round-key dword and the
 ; result is written to [dest].
 ; In:    source, dest = 16-byte blocks (must not overlap), key = round key
 ; Clobb: eax, ebx, ecx (ebx and ecx are left zero-extended from a byte), flags
 macro table_lookup_key_add source,dest,key {

  rept 4 column {

    movzx   eax, b[source+((column-1) mod 4)*4+0]
    movzx   ebx, b[source+((column+0) mod 4)*4+1]
    movzx   ecx, b[source+((column+1) mod 4)*4+2]
    mov     eax, d[eax*4+aes_tables.tab_t1]
    xor     eax, d[ebx*4+aes_tables.tab_t2]
    movzx   ebx, b[source+((column+2) mod 4)*4+3]
    xor     eax, d[ecx*4+aes_tables.tab_t3]
    xor     eax, d[ebx*4+aes_tables.tab_t4]
    xor     eax, d[key+(column-1)*4]
    mov     d[dest+(column-1)*4],eax  \}  }

 ; table_lookup_final source,dest,key - final encryption round.
 ; Uses the same byte selection as table_lookup_key_add, but each byte goes
 ; through tab_s only; the four results are packed into a dword, xored with the
 ; round-key dword and written to [dest].
 ;
 ; All four index registers are loaded with movzx, so the macro no longer
 ; depends on the caller having cleared the upper 24 bits of ebx, ecx and edx.
 ;
 ; In:    source = 16-byte state, dest = 16-byte output, key = round key
 ; Clobb: eax, ebx, ecx, edx, flags
 macro table_lookup_final source,dest,key {

  rept 4 column  {

    movzx   eax, b[source+((column+2) mod 4)*4+3]
    movzx   ebx, b[source+((column+1) mod 4)*4+2]
    movzx   ecx, b[source+((column+0) mod 4)*4+1]
    movzx   edx, b[source+((column-1) mod 4)*4+0]
    mov     ah , b[eax+aes_tables.tab_s]
    mov     al , b[ebx+aes_tables.tab_s]
    shl     eax, 16
    mov     ah , b[ecx+aes_tables.tab_s]
    mov     al , b[edx+aes_tables.tab_s]
    xor     eax, d[key+(column-1)*4]
    mov     d[dest+(column-1)*4],eax  \}  }

; decryption operations

 ; table_lookup_key_add_d source,dest,key - one full decryption round.
 ; Mirror image of table_lookup_key_add: byte 0 comes from the same column of
 ; [source], byte 1 from the previous column, byte 2 from two columns away and
 ; byte 3 from the next column (column numbers wrap modulo 4).  The bytes index
 ; tab_t5..tab_t8, the entries are xored with the round-key dword and the
 ; result is written to [dest].
 ; In:    source, dest = 16-byte blocks (must not overlap), key = round key
 ; Clobb: eax, ebx, ecx (ebx and ecx are left zero-extended from a byte), flags
 macro table_lookup_key_add_d source,dest,key {

  rept 4 column {

    movzx   eax, b[source+((column-1) mod 4)*4+0]
    movzx   ebx, b[source+((column+2) mod 4)*4+1]
    movzx   ecx, b[source+((column+1) mod 4)*4+2]
    mov     eax, d[eax*4+aes_tables.tab_t5]
    xor     eax, d[ebx*4+aes_tables.tab_t6]
    movzx   ebx, b[source+((column+0) mod 4)*4+3]
    xor     eax, d[ecx*4+aes_tables.tab_t7]
    xor     eax, d[ebx*4+aes_tables.tab_t8]
    xor     eax, d[key+(column-1)*4]
    mov     d[dest+(column-1)*4], eax	\}  }

 ; table_lookup_final_d source,dest,key - final decryption round.
 ; Uses the same byte selection as table_lookup_key_add_d, but each byte goes
 ; through tab_si only; the four results are packed into a dword, xored with
 ; the round-key dword and written to [dest].
 ;
 ; All four index registers are loaded with movzx.  The previous form wrote
 ; only bl / cl / dl and then indexed with the full registers, which read from
 ; a garbage address whenever the caller had not zeroed edx beforehand (the
 ; unrolled path of aes_decuser never did).
 ;
 ; In:    source = 16-byte state, dest = 16-byte output, key = round key
 ; Clobb: eax, ebx, ecx, edx, flags
 macro table_lookup_final_d source,dest,key {

  rept 4 column {

    movzx   eax, b[source+((column+0) mod 4)*4+3]
    movzx   ebx, b[source+((column+1) mod 4)*4+2]
    movzx   ecx, b[source+((column+2) mod 4)*4+1]
    movzx   edx, b[source+((column-1) mod 4)*4+0]
    mov     ah , b[eax+aes_tables.tab_si]
    mov     al , b[ebx+aes_tables.tab_si]
    shl     eax, 16
    mov     ah , b[ecx+aes_tables.tab_si]
    mov     al , b[edx+aes_tables.tab_si]
    xor     eax, d[key+(column-1)*4]
    mov     d[dest+(column-1)*4], eax	\} }

; -----------------------------------------------------------------------------
;
; common stack routine macros
; -----------------------------------------------------------------------------

 ; preserve_common_registers - function prologue.
 ; Reserves aes_stack_space bytes and saves ebx, ebp, edi and esi in them.
 ; Afterwards the return address is at [esp+aes_stack_space] and the first
 ; stack parameter at [esp+aes_stack_space+4].  The slot order must match
 ; restore_common_registers.
 macro preserve_common_registers {

    sub     esp, aes_stack_space
    mov     [esp+ 0], ebx
    mov     [esp+ 4], ebp
    mov     [esp+ 8], edi
    mov     [esp+12], esi    }

 ; restore_common_registers ret_val - function epilogue.
 ; Reloads the registers saved by preserve_common_registers, releases the save
 ; area and returns, removing ret_val bytes of parameters from the stack.
 ; esp must be back at the value it had right after the prologue.
 macro restore_common_registers ret_val {

    mov     ebx, [esp+ 0]
    mov     ebp, [esp+ 4]
    mov     edi, [esp+ 8]
    mov     esi, [esp+12]
    add     esp, aes_stack_space
    ret     ret_val  }

; -----------------------------------------------------------------------------
;
; '.code' section - table generation code
; -----------------------------------------------------------------------------

; everything from here to the end of the file is code
if defined SYS_LIN

section '.text' executable

end if

 aes_init:

; AES_Init ( void )
;
; Generates every table in aes_tables: the s-box and its inverse, the
; encryption tables t1..t4, the decryption tables t5..t8, the key-schedule
; tables u1..u4 and the round constants.  The other routines only read these
; tables, so this must run before any of them.
;
; Takes no parameters and returns with a plain `ret`.
; Clobbers eax, ecx, edx and flags.  ebx, esi, edi and ebp are saved and
; restored; ebp is included because the t5..t8 / u1..u4 loop uses it as an
; accumulator, and it was previously returned to the caller modified.

;    cmp     d[esp+8],1 ; dll_process_attach
;    jne .a
    push ebx esi edi ebp

; alog and log byte[256] tables
;
; Scratch tables overlaid on tab_t2.  al steps through successive powers of 3
; (al = al ^ double(al)) while ecx counts the exponent.

    mov     eax, 1
    mov     b[alog_table], al		; alog[0] = 1
    mov     b[log_table],   0
    mov     ecx, eax			; exponent, starts at 1
@@: mov     bh, al
    add     bh, bh			; bh = al*2, carry = bit 7 of al
    sbb     bl, bl			; bl = 0xff if it carried, else 0
    and     bl, gf_modulus
    xor     bh, bl			; bh = al doubled, reduced
    xor     al, bh			; al = al*3
    mov     b[ecx+alog_table], al
    mov     b[eax+ log_table], cl
    add     cl, 1
    jnc     @b				; until the exponent wraps past 255
    mov     b[eax+ log_table], cl	; cl is 0 here: the log of the last
					;  value generated is stored as 0

; substitution and inverse byte[] tables
;
; For edx = 255 down to 1: look up alog[255-log[edx]] and run it through the
; bitwise loop below to get tab_s[edx]; tab_si gets the reverse mapping.
; Entry 0 is written directly.

    mov     edx, 255
    mov     ecx, gf_magic
    mov     b[000+ aes_tables.tab_s], cl
    mov     b[ecx+aes_tables.tab_si], 0
.s :movzx   eax, b[edx+log_table]
    not     al				; 255 - log[edx]
    mov     al, b[eax+alog_table]
    mov     bl, gf_affine
    mov     cl, gf_magic
@@: shr     al, 1			; next bit of al into carry
    sbb     bh, bh
    and     bh, bl
    xor     cl, bh			; xor in the rotated mask if the bit was set
    rol     bl, 1
    test    al, al
    jnz     @b
    mov     [edx+ aes_tables.tab_s], cl
    mov     [ecx+aes_tables.tab_si], dl
    sub     edx, 1
    jnz     .s

; t5, t6, t7, t8, u1, u2, u3 & u4 dword[256] tables
;
; For each index esi, x = tab_si[esi].  ebp is built from the products of x
; with 0x0e, 0x09, 0x0d and 0x0b (low byte first), each computed as
; alog[log[x] + log[constant]]; the add/adc pair folds the carry back in.
; The dword is stored at t5[esi] and at u1[x]; t6..t8 / u2..u4 receive the
; same dword rotated left by 8, 16 and 24 bits.

    xor     esi, esi
    xor     edx, edx
    mov     cl,  b[log_table+0x0e]
    mov     ch,  b[log_table+0x09]
    mov     bl,  b[log_table+0x0d]
    mov     bh,  b[log_table+0x0b]
.t: movzx   eax, b[esi+aes_tables.tab_si]
    lea     edi, [eax*4]		; byte offset of entry x in a u table
    test    eax, eax
    movzx   ebp, al			; leaves the flags from `test` intact
    jz	    .2				; x = 0 has no log: the entry is 0
    mov     al, b[eax+ log_table]
    mov     dl, al
    add     dl, cl
    adc     dl, 0
    mov     dl, b[edx+alog_table]
    shrd    ebp,edx, 8			; shift the product in from the top
    mov     dl, al
    add     dl, ch
    adc     dl, 0
    mov     dl, b[edx+alog_table]
    shrd    ebp,edx, 8
    mov     dl, al
    add     dl, bl
    adc     dl, 0
    mov     dl, b[edx+alog_table]
    shrd    ebp,edx, 8
    add     al, bh
    adc     al, 0
    mov     al, b[eax+alog_table]
    shrd    ebp,eax, 8
.2: mov     [esi*4+aes_tables.tab_t5], ebp
    mov     [edi  +aes_tables.tab_u1], ebp
    rol     ebp, 8
    mov     [esi*4+aes_tables.tab_t6], ebp
    mov     [edi  +aes_tables.tab_u2], ebp
    rol     ebp, 8
    mov     [esi*4+aes_tables.tab_t7], ebp
    mov     [edi  +aes_tables.tab_u3], ebp
    rol     ebp, 8
    mov     [esi*4+aes_tables.tab_t8], ebp
    mov     [edi  +aes_tables.tab_u4], ebp
    add     esi, 1
    test    esi, 0xff
    jnz     .t				; 256 iterations

; t1, t2, t3 & t4 dword[256] tables
;
; With s = tab_s[edx] and s2 = s doubled, t1[edx] holds the bytes
; s2, s, s, s^s2 (low byte first); t2..t4 are rotated left by 8, 16 and 24
; bits.  Writing tab_t2 here overwrites the alog/log scratch tables, which are
; no longer needed.

    xor     edx, edx
@@: movzx   eax, b[edx+aes_tables.tab_s]
    mov     ah, al
    add     ah, ah
    sbb     bl, bl
    and     bl, gf_modulus
    xor     ah, bl			; ah = s doubled, reduced
    mov     ch, al
    xor     ch, ah			; s ^ s2
    mov     cl, al
    shl     ecx, 16
    mov     ch, al
    mov     cl, ah
    mov     [edx*4+aes_tables.tab_t1], ecx
    rol     ecx, 8
    mov     [edx*4+aes_tables.tab_t2], ecx
    rol     ecx, 8
    mov     [edx*4+aes_tables.tab_t3], ecx
    rol     ecx, 8
    mov     [edx*4+aes_tables.tab_t4], ecx
    add     dl, 1
    jnc     @b				; until dl wraps past 255

; rcon dword[30] table
;
; Starts at 1; each following entry is the previous one doubled, with the
; same reduction as above.

    xor     edx, edx
    mov     eax, 1
@@: mov     [edx*4+aes_tables.tab_rc], eax
    add     al, al
    sbb     bl, bl
    and     bl, gf_modulus
    xor     al, bl
    add     dl, 1
    cmp     dl, 30
    jb	    @b
.a: pop ebp edi esi ebx
    ret ; 12

; -----------------------------------------------------------------------------
;
; encryption key schedule - user interface
; -----------------------------------------------------------------------------

 aes_enckey:

; AES_EncryptKeys ( byte key[32] )
;
; Copies the 32-byte key to the start of aes_state.keys and expands it to the
; full 60-dword (240-byte) encryption schedule.  Reads tab_s and tab_rc, so
; aes_init must already have run.  Pops its one parameter on return (ret 4).

; preserve registers

    preserve_common_registers

; obtain input parameter, prepare registers for string instructions

    mov     esi, [esp+aes_stack_space+aes_schedule_param] ; source: caller's key
    mov     edi, aes_state.keys 			  ; destination: schedule
    mov     ebx, aes_tables.tab_s			  ; table base for xlatb

; input the user's key

    cld
    mov     ecx, 8
    rep     movsd	  ; leaves ecx = 0 and edi just past the copied key

; 'kickstart' the algorithm

    mov     eax, [edi-4]  ; eax = last key word = previous word for the loop

; perform the key generation

 if aes_schedule_unroll

; six complete groups of 8 words, then the first 4 words of a seventh:
; 8 + 6*8 + 4 = 60 words

  rept 6 rc {

    firstkey	((rc-1)*4)+aes_tables.tab_rc
    normalkey
    normalkey
    normalkey
    middlekey
    normalkey
    normalkey
    normalkey }

    firstkey	(( 6 )*4)+aes_tables.tab_rc
    normalkey
    normalkey
    normalkey

 else

; cl is the byte offset of the word being generated, so it counts in steps of
; 4 from 32 up to 240.  `mov cl` is enough because rep movsd left ecx at 0.

    mov     ebp, aes_tables.tab_rc ; next round constant
    mov     cl , 32	  ; first word after the 32-byte key
.1: mov     edx, ecx
    and     edx, 31	  ; offset within the current 32-byte group
    cmp     edx, 0	  ; first word of a group?
    jne     .2
    ror     eax, 16	  ; with substitute's 24 bits: net rotate right by 8
    substitute		  ; temp = SubWord(RotWord(temp))
    xor     eax, [ebp]	  ; temp ^= next round constant
    add     ebp, 4
    jmp     .3
.2: cmp     edx, 16	  ; fifth word of a group?
    jne     .3
    substitute		  ; temp = SubWord(temp)
    ror     eax, 8	  ; undo substitute's rotation
.3: xor     eax, [edi-32] ; temp ^= word 8 positions back
    stosd		  ; store the word, advance edi
    add     ecx, 4	  ; next word
    cmp     cl , 240	  ; stop once all 240 bytes are written
    jnz     .1

 end if

; restore registers and return

     restore_common_registers 4

; -----------------------------------------------------------------------------
;
; decryption key schedule - user interface
; -----------------------------------------------------------------------------
 aes_deckey:

; AES_DecryptKeys ( byte key[32] )
;
; Builds the encryption schedule by calling aes_enckey, then derives the
; decryption schedule in aes_state.keys_d: the first and last round keys are
; copied unchanged and the 13 round keys in between are passed through
; tab_u1..tab_u4.  Pops its one parameter on return (ret 4).

; preserve registers

    preserve_common_registers

; perform a normal encryption key scheduling
; (re-push our own parameter; aes_enckey pops it again and leaves DF clear)

    push    d[esp+aes_stack_space+aes_schedule_param]
    call    aes_enckey

; set up schedule pointers

    mov     esi, aes_state.keys
    mov     edi, aes_state.keys_d

; move first round into decryption schedule

    mov     ecx, 4
    rep     movsd

; perform the invmixcolumns operation on all but first and last round keys
; (52 words = 13 round keys of 4 words each)

 if aes_schedule_unroll

  rept 52 keys {
    table_lookup_keys_d aes_state.keys+16+(keys-1)*4 }
    mov     esi, aes_state.keys+224 ; absolute operands above left esi unmoved

 else

    mov     ebp, 52
@@: table_lookup_keys_d esi
    add     esi, 4
    dec     ebp
    jnz     @b

 end if

; finish by moving last round into decryption schedule
; (table_lookup_keys_d left ecx zero-extended from a byte, so `mov cl` sets
;  the whole count)

    mov     cl, 4
    rep     movsd

; restore registers and return

    restore_common_registers 4

; -----------------------------------------------------------------------------
;
; aes encryption - user interface
; -----------------------------------------------------------------------------
 aes_encuser:

; AES_EncryptBlock ( byte in[16], byte out[16] )
;
; Encrypts one 16-byte block with the schedule in aes_state.keys.  The block
; is first copied into aes_state.state, so in[] and out[] may be the same
; buffer.  Requires aes_init and a key-schedule call to have run.  Pops both
; parameters on return (ret 8).

; preserve registers

    preserve_common_registers

; obtain input parameter, prepare registers for instructions

    cld
    mov     esi, [esp+aes_stack_space+aes_in_blk] ; source: in[]
    mov     edi, aes_state.state		  ; destination: working block

; import user data, perform first key addition

  rept 4 column {
    lodsd
    xor     eax, d[aes_state.keys+(column-1)*4]
    stosd	}

; prepare registers for main algorithm
; (edi is now aes_state.state+16, the second half of the double buffer)

    mov     esi, aes_state.state     ; *state
    mov     ebp, aes_state.keys+16   ; *round key
    mov     edx, aes_max_rounds-1    ; rounds-1

; perform the nr-1 normal rounds
; (each round reads [esi] and writes [edi]; the buffers then swap roles)

 if aes_encrypt_unroll

  rept 13 round {
    table_lookup_key_add esi,edi,(aes_state.keys+round*16)
    xchg    edi, esi }

 else

@@: table_lookup_key_add esi,edi,ebp
    add     ebp, 16
    xchg    edi, esi
    dec     edx
    jnz     @b

 end if

; perform final round, export data to out[]
; (224 = 14*16, the offset of the last round key; esi is the buffer written
;  by the last normal round)

    mov     edi, [esp+aes_stack_space+aes_out_blk]
    table_lookup_final esi,edi,aes_state.keys+224

; restore registers and return

    restore_common_registers 8

; -----------------------------------------------------------------------------
;
; aes decryption - user interface
; -----------------------------------------------------------------------------
 aes_decuser:

; AES_DecryptBlock ( byte in[16], byte out[16] )
;
; Decrypts one 16-byte block with the schedule in aes_state.keys_d, walking it
; from the highest round key down to the lowest.  The block is first copied
; into aes_state.state, so in[] and out[] may be the same buffer.  Requires
; aes_init and AES_DecryptKeys to have run.  Pops both parameters on return
; (ret 8).

; preserve registers

    preserve_common_registers

; obtain input parameter, prepare registers for instructions
; (clear DF explicitly, as aes_encuser does: lodsd / stosd below must move
;  forward whatever state the caller left the flag in)

    cld
    mov     esi, [esp+aes_stack_space+aes_in_blk] ; source: in[]
    mov     edi, aes_state.state		  ; destination: working block

; import user data, perform first key addition
; (224 = 14*16, the offset of the last round key)

  rept 4 column {
    lodsd
    xor     eax, d[aes_state.keys_d+224+(column-1)*4]
    stosd	}

; prepare pointer to state
; (edi is now aes_state.state+16, the second half of the double buffer)

    mov     esi, aes_state.state ; *state

; perform the nr-1 normal rounds
; (each round reads [esi] and writes [edi]; the buffers then swap roles)

 if aes_decrypt_unroll

  rept 13 round {
    table_lookup_key_add_d esi,edi,(aes_state.keys_d+208-((round-1)*16))
    xchg    edi, esi }

; the final-round macro may index with edx after writing only dl; the looped
; path leaves edx at 0, so do the same here

    xor     edx, edx

 else

; prepare pointer to decryption schedule, loop counter

    mov     ebp, aes_state.keys_d+208 ; *round key, stepping downwards
    mov     edx, aes_max_rounds-1     ; rounds-1

@@: table_lookup_key_add_d esi,edi,ebp
    sub     ebp, 16
    xchg    edi, esi
    dec     edx
    jnz     @b			      ; falls through with edx = 0

 end if

; perform final round, export data to out[]

    mov     edi, [esp+aes_stack_space+aes_out_blk]
    table_lookup_final_d esi,edi,aes_state.keys_d

; restore registers and return

    restore_common_registers 8

; -----------------------------------------------------------------------------
;
; aes secure zero memory
; -----------------------------------------------------------------------------
 aes_memory:

; AES_SecureMemory ( void )
;
; Overwrites the whole of aes_state (both key schedules and the working block)
; twice: forwards with 0xF0 bytes, then backwards with 0x0F bytes.  The lookup
; tables are not touched.
;
; Clobbers eax, ecx and flags.  edi is saved and restored, and the direction
; flag is cleared again before returning.
;
; The dword count comes from the structure size.  The previous literal of 140
; dwords (560 bytes) was larger than the 512 bytes the fields of aes_state_ add
; up to, so the wipe ran past the end of aes_state.

     push    edi

; fill all bytes with 0b.11110000

     cld     ; forward
     mov     eax, 0xF0F0F0F0
     mov     ecx, sizeof.aes_state_/4
     mov     edi, aes_state.keys
     rep     stosd
     sub     edi, 4  ; back onto the last dword written

; overlap all bytes with 0b.00001111

     std     ; backward
     not     eax ; 0x0F0F0F0F
     mov     ecx, sizeof.aes_state_/4
     rep     stosd

; callers expect the direction flag clear on return

     cld
     pop     edi
     ret

