format PE GUI 4.0

; Syntax: flat assembler (FASM), 32-bit x86, Win32 stdcall imports.

LOOP_TIMES        = 20000000    ; outer-loop iterations per timed measurement
BENCH_PASSES      = 5           ; how many times the whole measurement is repeated
CPU0_AFFINITY     = 1           ; affinity mask with only bit 0 set

; ---------------------------------------------------------------------------
; Program start: raise scheduling priority and restrict the thread to the
; processor selected by CPU0_AFFINITY before any timing is taken.
; The return values of the API calls are not checked (best effort).
; ---------------------------------------------------------------------------

	call    [GetCurrentProcess]             ; eax = handle of the current process

	push    REALTIME_PRIORITY_CLASS
	push    eax
	call    [SetPriorityClass]

	; f0dder: set thread priority and affinity as well
	call	[GetCurrentThread]
	mov		ebx, eax                        ; keep the thread handle for both calls below

	push	THREAD_PRIORITY_TIME_CRITICAL
	push	ebx
	call	[SetThreadPriority]
	
	push	CPU0_AFFINITY
	push	ebx
	call	[SetThreadAffinityMask]

	mov     edi, BENCH_PASSES               ; edi = remaining passes, decremented after each full pass

align 16
start:
; ---------------------------------------------------------------------------
; One benchmark pass begins here; the pass loop jumps back to `start`.
;
; BT_TESTING: time LOOP_TIMES sweeps over `string`, calling BT.includes?
; once per byte. Time-stamp counter readings are stored in
; BT_HighStart:BT_LowStart and BT_HighEnd:BT_LowEnd.
;
; Registers inside the timed region:
;   ebp = remaining outer iterations
;   ebx = address one past the last byte of `string`
;   esi = negative byte index, counts up from -string.size to 0
; ---------------------------------------------------------------------------

BT_TESTING:
  mov     ebp, LOOP_TIMES

  xor     eax, eax
  cpuid                         ; serializing instruction issued before reading the counter (clobbers eax, ebx, ecx, edx)
  rdtsc                         ; edx:eax = time-stamp counter
  mov     [BT_LowStart], eax
  mov     [BT_HighStart], edx

.loop:
  mov     ebx, string + string.size
  mov     esi, -string.size

.innerLoop:
  ; NOTE(review): `string` itself is passed as the set pointer here; aCharSet
  ; is declared in this file but is not referenced by this loop - confirm intent.
  mov     eax, string
  movzx   edx, byte [ebx+esi]   ; edx = current byte of `string`, zero-extended
  call    BT.includes?

  add     esi, 1
  jnz     .innerLoop            ; until the index reaches 0 (end of string)

  sub     ebp, 1
  jnz     .loop

  xor     eax, eax
  cpuid                         ; serialize again before the closing counter read
  rdtsc 
  mov     [BT_LowEnd], eax
  mov     [BT_HighEnd], edx

align 16
; ---------------------------------------------------------------------------
; TEST_TESTING: same measurement as the BT loop, but calling TEST.includes?.
; Counter readings go to TEST_HighStart:TEST_LowStart and
; TEST_HighEnd:TEST_LowEnd.
;
; Registers inside the timed region:
;   ebp = remaining outer iterations
;   ebx = address one past the last byte of `string`
;   esi = negative byte index, counts up from -string.size to 0
; ---------------------------------------------------------------------------
TEST_TESTING:
  mov     ebp, LOOP_TIMES

  xor     eax, eax
  cpuid                         ; serializing instruction issued before reading the counter (clobbers eax, ebx, ecx, edx)
  rdtsc                         ; edx:eax = time-stamp counter
  mov     [TEST_LowStart], eax
  mov     [TEST_HighStart], edx

.loop:
  mov     ebx, string + string.size
  mov     esi, -string.size

.innerLoop:
  ; NOTE(review): `string` itself is passed as the set pointer here; aCharSet
  ; is declared in this file but is not referenced by this loop - confirm intent.
  mov     eax, string
  movzx   edx, byte [ebx+esi]   ; edx = current byte of `string`, zero-extended
  call    TEST.includes?

  add     esi, 1
  jnz     .innerLoop            ; until the index reaches 0 (end of string)

  sub     ebp, 1
  jnz     .loop

  xor     eax, eax
  cpuid                         ; serialize again before the closing counter read
  rdtsc 
  mov     [TEST_LowEnd], eax
  mov     [TEST_HighEnd], edx




  ; edi is the pass counter loaded before `start`. Every pass stores into the
  ; same timestamp variables, so only the last pass's readings remain.
  dec     edi
  jnz     start 

  ; -------------------------------------------------------------------------
  ; Compute the 64-bit cycle deltas of the last pass and stop at a breakpoint
  ; so they can be read from the registers in a debugger:
  ;   EBX:EAX = BT end   - BT start     (cycles for the BT variant)
  ;   EDX:ECX = TEST end - TEST start   (cycles for the TEST variant)
  ;   ESI:EDI = EBX:EAX  - EDX:ECX      (positive when BT was slower)
  ; Each SUB/SBB pair is kept adjacent so the borrow propagates into the
  ; high dword (MOV does not modify flags).
  ; -------------------------------------------------------------------------
  mov     eax, [BT_LowEnd]
  mov     ebx, [BT_HighEnd]
  sub     eax, [BT_LowStart]
  sbb     ebx, [BT_HighStart]

  mov     ecx, [TEST_LowEnd]
  mov     edx, [TEST_HighEnd]
  sub     ecx, [TEST_LowStart]
  sbb     edx, [TEST_HighStart]

  mov     edi, eax
  mov     esi, ebx
  sub     edi, ecx
  sbb     esi, edx

  ; EBX:EAX = Number of cycles for BT instruction
  ; EDX:ECX = Number of cycles for TEST instruction
  ; ESI:EDI = EBX:EAX - EDX:ECX
  int3

  ; If a debugger resumes execution after the breakpoint, leave the program
  ; instead of running on into the helper routines that follow this code.
  ; The stack is balanced at this point (every call above cleaned up its own
  ; arguments), so RET returns to the caller of the entry point.
  xor     eax, eax              ; exit code 0
  ret

align 16
; ---------------------------------------------------------------------------
; BT.includes? - bit-set membership test using the BT instruction.
; In:    EAX = pointer to the bit set (CharSet)
;        EDX = bit index to test (the callers pass a zero-extended byte)
; Out:   EAX = -1 if the bit is set, 0 if it is clear
; Clobb: EAX, flags
; ---------------------------------------------------------------------------
BT.includes?:; EAX = Pointer to CharSet | EDX = char to test
  bt      dword [eax], edx      ; CF = bit EDX of the bit string starting at [EAX]

  sbb     eax, eax              ; eax = 0 - CF  ->  0 or -1
; With the commented code takes 1 cycle more so I use SBB instead
; (note: that variant would yield 0/1 rather than 0/-1)
;  setc    al
;  movsx   eax, al
  ret

align 16
; ---------------------------------------------------------------------------
; TEST.includes? - bit-set membership test using shift + mask.
; In:    EAX = pointer to the bit set (CharSet), an array of dwords
;        EDX = bit index to test (the callers pass a zero-extended byte)
; Out:   EAX = 0 if the bit is clear, otherwise the single-bit mask
;              1 shl (index mod 32) (non-zero); ZF mirrors the result.
;        Note this differs from BT.includes?, which returns 0 / -1.
; Clobb: EAX, ECX, EDX, flags
; ---------------------------------------------------------------------------
TEST.includes?:; EAX = Pointer to CharSet | EDX = char to test
  mov     ecx, edx
  mov     edx, 1
  shl     edx, cl               ; edx = 1 shl (index mod 32); the CPU masks CL to 5 bits
  shr     ecx, 5                ; ecx = index / 32 = number of the dword holding the bit
  mov     eax, [eax+ecx*4]      ; scale the dword number to a byte offset before loading

; Original author's note: a variant that used TEST with a memory operand
; followed by SETcc/MOVSX was measured one cycle slower than this form.

  and     eax, edx              ; isolate the requested bit
  ret

align 64 ; AMD64 cache line size
; 256-bit set, one bit per possible byte value (reserved, uninitialized).
aCharSet rb 256 / 8

; Text swept by both benchmark loops, one byte per call.
string   db "flat assembler is GREAT Very Happy"

string.size = $ - string        ; length of `string` in bytes

; `string` is not a multiple of four bytes long, so realign before the
; dword timestamp variables to keep their accesses naturally aligned.
align 4

; Time-stamp counter readings (low/high dwords) taken around the BT loop.
BT_LowStart     dd ?
BT_HighStart    dd ?
BT_LowEnd       dd ?
BT_HighEnd      dd ?

; Time-stamp counter readings (low/high dwords) taken around the TEST loop.
TEST_LowStart   dd ?
TEST_HighStart  dd ?
TEST_LowEnd     dd ?
TEST_HighEnd    dd ?

include 'win32a.inc'

align 4
data import
  library kernel,'KERNEL32.DLL'

  import kernel,\
         GetCurrentProcess, 'GetCurrentProcess',\
		 GetCurrentThread,	'GetCurrentThread',\
		 SetThreadAffinityMask,	'SetThreadAffinityMask',\
		 SetThreadPriority,	'SetThreadPriority',\
         SetPriorityClass,  'SetPriorityClass'

end data
