cls= 32	;assumed cache line size in bytes; used as code alignment (align cls)

;call with optional arguments: the arguments are pushed right-to-left, so the
;first one ends up at the lowest stack address, then proc is called.
;Nothing is popped afterwards, i.e. the callee has to remove its arguments.
macro	call	proc,[arg] {
reverse
  if ~ arg eq
	push	arg
  end if
common
	call	proc
}

;define a length-prefixed string: a dword holding the byte count, followed by
;the data given as arguments
;  name.size = number of data bytes
;  name.cnt  = number of arguments given
;  name.els  = 1
struc	dstr	[arg] {
common
local	items
  .els= 1
  items= 0
	dd	.size
@@:
forward
  if ~arg eq
	db	arg
  items= items + 1
  end if
common
  .size= $ - @b
  .cnt= items
}

;Linux i386 system call numbers (loaded into eax before int 0x80)
SYS_EXIT	equ  1
SYS_WRITE	equ  4

;standard file descriptors
STDIN           equ 0
STDOUT          equ 1
STDERR          equ 2

;output format: directly executable ELF, execution begins at start
format	ELF executable
entry	start

;everything below, up to the next segment directive, is code
segment	readable executable

;write the string in .txt (a dword length followed by the text) to stdout
;out:	eax = result of the write system call
;ebx, ecx and edx are preserved, the parameter is removed from the stack
p_txt:
	push	ebx
	push	ecx
	push	edx
virtual at esp+10h	;3 saved registers + return address
	.txt	dd ?
end virtual
	mov	ecx,[.txt]
	mov	edx,[ecx]	;edx = number of bytes to write
	add	ecx,4		;ecx -> first byte of the text
	mov	ebx,STDOUT
	mov	eax,SYS_WRITE
	int	0x80
	pop	edx
	pop	ecx
	pop	ebx
	ret	4

;append the string in .src to the string in .dst
;both are length-prefixed (dword length, then the text); the length of .dst
;is increased by the length of .src
;no check is made that the buffer behind .dst is large enough
;simple byte copy, meant for small texts
;all registers are preserved, both parameters are removed from the stack
a_txt:
	push	edi
	push	esi
	push	ecx
	push	eax
virtual at esp+14h	;4 saved registers + return address
	.dst	dd ?
	.src	dd ?
end virtual
	mov	esi,[.src]
	mov	edi,[.dst]
	mov	eax,[edi]		;eax = current length of dst
	mov	ecx,[esi]		;ecx = number of bytes to append
	add	[edi],ecx		;store the new length of dst
	lea	edi,[edi+eax+4]		;edi -> first free byte behind dst's text
	add	esi,4			;esi -> first byte of src's text
	cld				;copy upwards
	rep	movsb
	pop	eax
	pop	ecx
	pop	esi
	pop	edi
	ret	8

;append the low byte of .dat as two hexadecimal digits (upper case, high
;nibble first) to the length-prefixed string in .dst
;all registers are preserved, both parameters are removed from the stack
a_hex_b:
	push	edi
	push	eax
	push	edx
virtual at esp+10h	;3 saved registers + return address
	.dst	dd ?
	.dat	dd ?
end virtual
	movzx	eax,byte [.dat]
	mov	edi,[.dst]
	mov	ah,al
	and	ah,0Fh		;ah = low nibble
	shr	al,4		;al = high nibble
	cmp	ah,9
	seta	dh		;dh = 1 if the low nibble is A..F
	cmp	al,9
	seta	dl		;dl = 1 if the high nibble is A..F
	neg	dh		;1 -> 0FFh
	neg	dl
	and	edx,0707h	;7 = 'A' - '9' - 1 for each nibble above 9, else 0
	lea	eax,[eax+edx+'00']	;convert both nibbles to ASCII at once
	mov	edx,[edi]	;edx = current length of dst
	add	dword [edi],2	;dst grows by two characters
	mov	[edi+edx+4],ax	;al (high digit) is stored first, then ah
	pop	edx
	pop	eax
	pop	edi
	ret	8

;append the dword in .dat as eight hexadecimal digits (most significant byte
;first) to the length-prefixed string in .dst
;all registers are preserved, both parameters are removed from the stack
a_hex_d:
	push	edi
	push	eax
virtual at esp+0Ch	;2 saved registers + return address
	.dst	dd ?
	.dat	dd ?
end virtual
	mov	eax,[.dat]
	mov	edi,[.dst]
  repeat 4
	rol	eax,8		;bring the next most significant byte into al
	call	a_hex_b,	edi,eax
  end repeat
	pop	eax
	pop	edi
	ret	8

; Put whatever you want to benchmark in the callback procedure.
; The callback may modify every register except esp, ebp, esi and edi
; (else change the code).
; usage:	call	TSC_bench,	loop_cnt,callback
; in:	.loop_cnt = number of rounds each timing loop runs; only the result of
;		    the last round is kept, the earlier rounds are warm-up.
;		    0 is treated as 1.
;	.callback = the procedure whose run time is measured
; out:	edx:eax   = number of CPU cycles (TSC ticks) the callback took
; ebx, ecx, ebp, esi and edi are preserved, both parameters are removed from
; the stack.
; This benchmark aims to be very precise since it uses
;	- a differential time calculation: the cost of timing an empty
;	  procedure (dummy_proc) is measured first and subtracted at the end,
;	- a serializing instruction (cpuid) to prevent parallel execution,
;	- code alignment to the cache-line size,
;	- breaking of dependency chains and
;	- looping of the time calculation, so that everything is cached when
;	  the time actually gets stored; otherwise the time calculation would
;	  be _REALLY_ unreliable.
; Because of this, it is well suited for finding out how long single
; instructions or clusters of instructions actually take. This can be a huge
; help when writing speed optimized code. :)
TSC_bench:
	push	ebp ecx ebx esi edi
	mov	ebp,0	;break dependency chain
	mov	ecx,0
	mov	ebx,0
	mov	esi,dummy_proc
	mov	edi,0
	mov	eax,0
	mov	edx,0
	push	esi edi ebp	;create the locals: .dummy_proc and a zeroed .TSC_adj
virtual	at esp		;local variables
  .TSC_adj	dd ?,?
  .dummy_proc	dd ?
end virtual
virtual at esp+24h	;parameters (8 pushed dwords + return address)
  .loop_cnt	dd ?	;how often to actually loop the calculation
  .callback	dd ?	;the procedure which time we want to measure
end virtual
;a loop count of 0 would let the counter wrap around and run 2^32 rounds,
;so clamp it to 1; the clamped value is written back because the second
;loop reloads it
	mov	ebp,[.loop_cnt]
	cmp	ebp,1	;CF = 1 only if ebp = 0
	adc	ebp,0
	mov	[.loop_cnt],ebp
;TSC calibrating part
align	cls
  .TSC_cal_loop:
	rdtsc
	mov	esi,eax
	mov	edi,edx
	mov	eax,0	;break dependency chain
	cpuid		;serializing instruction to prevent parallelism
	call	dword [.dummy_proc]
	rdtsc
	sub	eax,esi
	sbb	edx,edi
	sub	ebp,1	;prevent partial flag dependency (P4+ only)
	jnz	.TSC_cal_loop
;this is the number of CPU cycles "nothing" takes; it is used to adjust the
;actual measurement below
	mov	[.TSC_adj],eax
	mov	[.TSC_adj+4],edx
	mov	ebp,[.loop_cnt]
align	cls
;actual TSC measuring, same loop as above but calling the callback
  .TSC_loop:
	rdtsc
	mov	esi,eax
	mov	edi,edx
	mov	eax,0
	cpuid
	call	dword [.callback]
	rdtsc
	sub	eax,esi
	sbb	edx,edi
	sub	ebp,1
	jnz	.TSC_loop

	sub	eax,[.TSC_adj]	;remove the measuring overhead
	sbb	edx,[.TSC_adj+4]
	pop	ebp edi esi	;drop the locals (the values popped are not used)
	pop	edi esi ebx ecx ebp
	ret	8
;the number of CPU cycles the callback procedure took is returned in edx:eax

;empty procedure; TSC_bench calls it in its calibration loop to measure the
;overhead of the measurement itself
align	cls
dummy_proc:
	ret

;The procedure being benchmarked (passed to TSC_bench from start).
;It is generated at assembly time as a chain of equally sized "cells":
;31 initialization cells, 080h main cells and 32 finalization cells, followed
;by a ret. Each cell reads the TSC, tests one bit of its low dword and then
;jumps with jc/jnc by a whole number of cells, so the direction taken depends
;on the time stamp.
align	cls
test_proc:	;something to test
;These hold the size in bytes of one cell of the initialization, main and
;finalization part; all three have to be equal.
;The real values are not known at this point, so they are preset to 0 and
;redefined further down.
init_code_cell_size	= 0
code_cell_size		= 0
finit_code_cell_size	= 0

;The actual cell code.
;$+6 is the address right behind the 6-byte near jump it is used in. Counted
;in cells from there, the jc target is the jnc of another cell (CF is still
;set on arrival, so that jnc is not taken and execution continues with the
;cell behind it) and the jnc target is the start of a cell.
macro	init_code_cell	i {
	rdtsc
	bt	eax,i
	jc	near $+6 + (i -1) * init_code_cell_size
	jnc	near $+6 + (61-i) * init_code_cell_size
}
;i is a pseudo-random number here: bits 0..4 select the TSC bit to test,
;bits 8..13 and 16..21 (each minus 20h) give the two jump distances in cells
macro	code_cell	i {
	rdtsc
	bt	eax,i and 1Fh
	jc	near $+6 + (i shr 8  and 3Fh - 20h) * code_cell_size
	jnc	near $+6 + (i shr 16 and 3Fh - 20h) * code_cell_size
}
;both jumps of a finalization cell lead backwards
macro	finit_code_cell	i {
	rdtsc
	bt	eax,i
	jc	near $+6 - (i +    1) * finit_code_cell_size
	jnc	near $+6 - (63-(i*2)) * finit_code_cell_size
}

;define the actual cell sizes now: assemble one cell of each kind into a
;virtual block at address 0 (this emits nothing) and take the end address
virtual	at 0
	init_code_cell	0
	init_code_cell_size= $
end virtual
virtual	at 0
	code_cell	0
	code_cell_size= $
end virtual
virtual	at 0
	finit_code_cell	0
	finit_code_cell_size= $
end virtual
;check that all 3 cell sizes are equal, else the jump targets computed above
;would not hit cell boundaries and the code would produce unreliable results
;display only prints the message; the bare string on the next line is not a
;valid instruction - presumably put there on purpose to make assembly fail
if ( init_code_cell_size <> code_cell_size) | \
   (finit_code_cell_size <> code_cell_size)
  display	"ERROR: code cell sizes do not match."
  "ERROR: code cell sizes do not match."
end if

;initialization part: 31 cells, i = 0..30
  repeat 31
	init_code_cell	%-1
  end repeat
;random seed:	Uncomment the %t if you're feeling lucky and this will give you
;a different code each time you compile it
  d= 03191BCF4h	;03197BCF1h	;%t
  h= d
;main part: 080h cells, each parameterized with a new pseudo-random d
  repeat 080h
    h= h + 9E3779B97F4A7C15h		;~ MAX_INT * (sqrt(5)/2 - 1/2)
    d= ((d shl 43 + d shr 21) xor h) + d	;ad-hoc pseudo-random mixing step
	code_cell	d
  end repeat
;finalization part: 32 cells, i = 31..0
  repeat 32
	finit_code_cell	32-%
  end repeat
;NOTE(review): the ret is only reached by falling through the jnc of the last
;cell, which requires arriving there with CF set, i.e. via a jc from another
;cell. By my reading no jc generated above targets the last cell (the
;furthest target is the cell before it) - verify that this ret is reachable.
	ret

;program entry: measure test_proc once and print the result to stdout as
;two 8-digit hexadecimal numbers, "<high dword> <low dword>" plus a line feed
start:
	call	TSC_bench,	1,test_proc	;edx:eax = measured CPU cycles
	mov	dword [txt_buf],0		;start with an empty output string
	call	a_hex_d,	txt_buf,edx	;high dword first
	call	a_txt,		txt_buf,space
	call	a_hex_d,	txt_buf,eax	;eax is preserved by the helpers above
	call	a_txt,		txt_buf,ln
	call	p_txt,	txt_buf
;terminate the process with exit status 0
exit:
	emms
	xor	ebx,ebx
	mov	eax,SYS_EXIT
	int	0x80

;NOTE(review): the following 12 bytes are data placed in the executable
;segment behind the exit system call; nothing in this file refers to them.
dd	$ - $$		;number of bytes emitted since the base address of this segment
db	"79538333"	;purpose not evident from this file - TODO confirm

;read-only data
segment	readable

;constant strings in dstr format (dword length followed by the text)
space	dstr " "
ln	dstr 0Ah

;writeable data
segment	readable writeable

;output string built by start: dword length followed by the text,
;100h bytes reserved in total (uninitialized)
txt_buf	rb 100h
