
	format	pe gui

	include	'%fasminc%\WIN32A.INC'

;;===========================================================================

align 16
;;---------------------------------------------------------------------------
;;  copy_mmxopt -- MMX copy that first pulls a CACHESIZE block of the source
;;  into the cache and then streams it out with non-temporal stores.
;;  In:   EBP -> table entry; dword [EBP+4] = number of bytes.
;;        Every outer pass handles exactly CACHESIZE bytes and the routine
;;        stops only when the index hits zero, so the count has to be a
;;        non-zero multiple of CACHESIZE.
;;  Out:  nothing; leaves with jmp r (it is entered with jmp, not call).
;;  Uses: EAX, ECX, ESI, EDI, mm0-mm7, flags.  EBX is only read by test.
;;---------------------------------------------------------------------------
copy_mmxopt:
CACHELINE	equ	20h	;; cache line size assumed by the prefetch loop (Celerons etc)
CACHESIZE	equ	20000h	;; bytes handled per outer pass: 128Kb (celeron/duron)
	mov	esi, src		;; source array
	mov	edi, dst		;; destination array
	mov	ecx, [ebp+4]		;; number of bytes
	lea	esi, [esi+ecx]		;; both pointers now address the END of the data,
	lea	edi, [edi+ecx]
	neg	ecx			;; ecx = -count is the running index (counts up to 0)
.block:	add	ecx, CACHESIZE		;; step to the end of the block to be cached
	mov	eax, CACHESIZE / (CACHELINE*2)	;; touch loop is unrolled 2x
;;  walk backwards through the block, reading one dword per cache line
.touch:		test	ebx, [esi+ecx-CACHELINE]	;; load only, nothing is written
		test	ebx, [esi+ecx-CACHELINE*2]	;; same for the line before it
		sub	ecx, CACHELINE*2
		dec	eax
	jnz	.touch
;;  ecx is back at the start of the block: copy it, 64 bytes per pass
	mov	eax, CACHESIZE / 64
.move:		movq	mm0, [esi+ecx+8*0]
		movq	mm1, [esi+ecx+8*1]
		movq	mm2, [esi+ecx+8*2]
		movq	mm3, [esi+ecx+8*3]
		movq	mm4, [esi+ecx+8*4]
		movq	mm5, [esi+ecx+8*5]
		movq	mm6, [esi+ecx+8*6]
		movq	mm7, [esi+ecx+8*7]
		movntq	[edi+ecx+8*0], mm0
		movntq	[edi+ecx+8*1], mm1
		movntq	[edi+ecx+8*2], mm2
		movntq	[edi+ecx+8*3], mm3
		movntq	[edi+ecx+8*4], mm4
		movntq	[edi+ecx+8*5], mm5
		movntq	[edi+ecx+8*6], mm6
		movntq	[edi+ecx+8*7], mm7
		add	ecx, 64
		dec	eax
	jnz	.move
	test	ecx, ecx		;; index reached zero -> everything copied
	jnz	.block
	sfence				;; fence the non-temporal stores
	emms				;; leave MMX state
	jmp	r
	
	
align 16
;;---------------------------------------------------------------------------
;;  copy_mmxnt -- MMX copy, 64 bytes per pass, non-temporal (movntq) stores.
;;  In:   EBP -> table entry; dword [EBP+4] = number of bytes.
;;        The loop exits only when the index reaches exactly zero, so the
;;        count has to be a non-zero multiple of 64.
;;  Out:  nothing; leaves with jmp r (it is entered with jmp, not call).
;;  Uses: ECX, ESI, EDI, mm0-mm7, flags.
;;---------------------------------------------------------------------------
copy_mmxnt:
	mov	ESI, src		;;  source array
	mov	EDI, dst		;;  destination array
	mov	ECX, [EBP+4]		;;  number of bytes
	lea	ESI, [ESI+ECX]		;;  point both registers at the end of the data
	lea	EDI, [EDI+ECX]
	neg	ECX			;;  ECX = -count, counts up towards zero
.loop:	movq	mm0,qword [esi+ecx]	;;  load 8 qwords ...
	movq	mm1,qword [esi+ecx+8]
	movq	mm2,qword [esi+ecx+16]
	movq	mm3,qword [esi+ecx+24]
	movq	mm4,qword [esi+ecx+32]
	movq	mm5,qword [esi+ecx+40]
	movq	mm6,qword [esi+ecx+48]
	movq	mm7,qword [esi+ecx+56]
	movntq	qword [edi+ecx],mm0	;;  ... and store them with non-temporal stores
	movntq	qword [edi+ecx+8],mm1
	movntq	qword [edi+ecx+16],mm2
	movntq	qword [edi+ecx+24],mm3
	movntq	qword [edi+ecx+32],mm4
	movntq	qword [edi+ecx+40],mm5
	movntq	qword [edi+ecx+48],mm6
	movntq	qword [edi+ecx+56],mm7
	add	ECX, 64
	jnz	.loop
	sfence				;;  movntq stores are weakly ordered: fence them before
					;;  control goes back to the timing code, as copy_mmxopt does
	emms				;;  leave MMX state
	jmp	r


align 16
;;---------------------------------------------------------------------------
;;  copy_mmx -- plain MMX copy: 8 qword loads followed by 8 qword stores.
;;  In:   EBP -> table entry; dword [EBP+4] = number of bytes.
;;        The loop ends only when the index is exactly zero, so the count
;;        has to be a non-zero multiple of 64.
;;  Out:  nothing; leaves with jmp r (it is entered with jmp, not call).
;;  Uses: ECX, ESI, EDI, mm0-mm7, flags.
;;---------------------------------------------------------------------------
copy_mmx:
	mov	esi, src		;; source array
	mov	edi, dst		;; destination array
	mov	ecx, [ebp+4]		;; number of bytes
	lea	esi, [esi+ecx]		;; esi/edi -> one past the last byte
	lea	edi, [edi+ecx]
	neg	ecx			;; negative index, climbs to zero
.next64:
	movq	mm0, [esi+ecx+8*0]	;; read a 64-byte chunk
	movq	mm1, [esi+ecx+8*1]
	movq	mm2, [esi+ecx+8*2]
	movq	mm3, [esi+ecx+8*3]
	movq	mm4, [esi+ecx+8*4]
	movq	mm5, [esi+ecx+8*5]
	movq	mm6, [esi+ecx+8*6]
	movq	mm7, [esi+ecx+8*7]
	movq	[edi+ecx+8*0], mm0	;; write it with ordinary stores
	movq	[edi+ecx+8*1], mm1
	movq	[edi+ecx+8*2], mm2
	movq	[edi+ecx+8*3], mm3
	movq	[edi+ecx+8*4], mm4
	movq	[edi+ecx+8*5], mm5
	movq	[edi+ecx+8*6], mm6
	movq	[edi+ecx+8*7], mm7
	add	ecx, 64
	jnz	.next64
	emms				;; leave MMX state
	jmp	r


align 16
;;---------------------------------------------------------------------------
;;  copy_mov -- copy with general purpose registers, two dwords per pass.
;;  In:   EBP -> table entry; dword [EBP+4] = number of bytes.
;;        The loop ends only when the index is exactly zero, so the count
;;        has to be a non-zero multiple of 8.
;;  Out:  nothing; leaves with jmp r (it is entered with jmp, not call).
;;  Uses: EAX, ECX, EDX, ESI, EDI, flags.
;;---------------------------------------------------------------------------
copy_mov:
	mov	esi, src		;; source array
	mov	edi, dst		;; destination array
	mov	ecx, [ebp+4]		;; number of bytes
	lea	esi, [esi+ecx]		;; esi/edi -> one past the last byte
	lea	edi, [edi+ecx]
	neg	ecx			;; negative index, climbs to zero
.next8:
	mov	eax, [esi+ecx]		;; load two dwords ...
	mov	edx, [esi+ecx+4]
	mov	[edi+ecx], eax		;; ... then store both
	mov	[edi+ecx+4], edx
	add	ecx, 8
	jnz	.next8
	jmp	r


align 16
;;---------------------------------------------------------------------------
;;  copy_movsd -- copy with the rep movsd string instruction.
;;  In:   EBP -> table entry; dword [EBP+4] = number of bytes
;;        (shifted right by 2, so any remainder below 4 bytes is not copied).
;;  Out:  nothing; leaves with jmp r (it is entered with jmp, not call).
;;  Uses: ECX, ESI, EDI, flags.
;;  Note: the direction flag is not touched here -- assumes it is clear.
;;---------------------------------------------------------------------------
copy_movsd:
	mov	ecx, [ebp+4]		;; number of bytes ...
	mov	esi, src		;; source array
	mov	edi, dst		;; destination array
	shr	ecx, 2			;; ... converted to a dword count
	rep	movsd
	jmp	r


;;===========================================================================
;;  Program entry point.
;;   1. requests realtime process / time-critical thread priority and writes
;;      the three CPUID(0) registers into the CPU placeholder of the format string
;;   2. measures the time stamp counter rate over 5 Sleep([delay]) intervals
;;   3. runs every (routine, byte count) pair of the procs table 20 times and
;;      leaves one bandwidth dword per table entry on the stack
;;   4. pushes the frequency, formats everything with wsprintf and shows it
;;  The stack is used as the result list: nothing pushed below is cleaned up,
;;  the process simply ends with ExitProcess.
entry	$

;;  calculate CPU frequency
	invoke	GetCurrentProcess
	invoke	SetPriorityClass, EAX, REALTIME_PRIORITY_CLASS
	invoke	GetCurrentThread
	invoke	SetThreadPriority, EAX, THREAD_PRIORITY_TIME_CRITICAL
	xor	EAX, EAX	;;  CPUID function 0
	cpuid
	mov	EBP, CPU	;;  12-byte placeholder inside the format string
	mov	dword [EBP], EBX	;;  stored in the order EBX, EDX, ECX
	mov	dword [EBP+4], EDX
	mov	dword [EBP+8], ECX
	xor	EBX, EBX	;;  EBX = iteration counter
	push	EBX	;;  zero counter (64-bit accumulator at [ESP])
	push	EBX
@@:		invoke	Sleep, 0
		rdtsc			;;  time stamp before the delay
		add	[ESP], EAX	;;  low part
		adc	[ESP+4], EDX	;;  high part
		invoke	Sleep, [delay]
		rdtsc			;;  time stamp after the delay
		sub	[ESP], EAX	;;  accumulate (start - end, i.e. negative tics)
		sbb	[ESP+4], EDX
		inc	EBX
		cmp	EBX, 5		;;  iterations
	jnz	@b
	fild	qword [ESP]		;;  accumulated value
	fchs				;;  correct sign
	push	EBX			;;  iteration count as a memory operand (not popped)
	fidiv	dword [ESP]	;;  average value
	fdiv	[KHz]		;;  KHz still holds its initial 1000.0 here
	fidiv	[delay]		;; CPU frequency (printed as MHz by the format string)
	fst	[KHz]		;; store CPU frequency (fst does not pop st0)

;;  memory benchmark
	mov	EBP, procs	;;  EBP -> current (routine, byte count) pair
memtest:	xor	EBX, EBX	;;  EBX = iteration counter
		push	EBX	;;  zero counter (64-bit accumulator at [ESP])
		push	EBX
@@:	;;  trash the cache
		mov esi, dst-80000h	;; copy the 512Kb just below dst onto itself
		mov edi, esi
		mov ecx, 80000h	;; 512Kb
		rep movsb
		invoke	Sleep, 0
		rdtsc			;;  start time: subtracted from the accumulator
		sub	[ESP], EAX	;;  low part
		sbb	[ESP+4], EDX	;;  high part	
		jmp	dword [EBP]	;;  run the copy routine; it comes back with jmp r
r:		rdtsc
		rdtsc		;;  NOTE(review): rdtsc is executed twice and only the second
				;;  result is used -- the reason is not evident from the code
		add	[ESP], EAX	;;  accumulate (end - start)
		adc	[ESP+4], EDX
		inc	EBX
		cmp	EBX, 20		;;  iterations
	jnz	@b
;;  calculate bandwidth
	fild	dword [EBP+4]	;;  number of bytes
	fadd	st0, st0		;;  double amount
	fmul	[KHz]		;;  * frequency
	fild	qword [ESP]	;;  accumulated tics
	push	EBX		;;  iteration count as a memory operand
	fidiv	dword [ESP]	;;  average tics value
	fdivp	st1,st0		;;  bytes*freq/tics
	pop	EBX	;;  correct stack: drop the count and the low counter dword,
	pop	EBX	;;  the high counter dword stays as the result slot
	fistp	dword [ESP]	;;  push bandwidth (one dword per table entry remains)
;;  process table
	add	EBP, 8	;;  next proc
	cmp	dword [EBP], 0	;; check table end
	jnz	memtest
;;  cpu frequency
	push	eax	;;  make local dword (only the slot matters, not the value)
	fld	[KHz]	;; convert type
	fist	dword [ESP]	;; to int
;;  the stack now holds, from the top: frequency, then the bandwidth results
;;  in reverse table order; wsprintf reads them as its variable arguments
	invoke	wsprintf, buf, form ;;  use dwords onto the stack
;;  show all info
	xor	EBX, EBX
	invoke	MessageBox,EBX, buf,caption,EBX	;;  no owner window, style 0
	invoke	ExitProcess, EBX

;;===========================================================================
;;  Initialized data: timing parameters, benchmark table and message strings.
align	4

delay	dd 500		;; argument of Sleep and divisor in the frequency calculation
KHz	dd 1000f	;; starts as 1000.0 (used as a divisor), then overwritten
			;; with the measured CPU frequency by the entry code

;;  Benchmark table: pairs of (routine address, byte count), ended by a zero
;;  routine address.  The entry code pushes one result per pair in this order
;;  and wsprintf reads them back from the top of the stack, so the table is
;;  the REVERSE of the %lu order in form below.
;;  copy_mmxopt has no 10000h entry; form prints 'n/a' in that column.
procs:	dd	copy_mmxopt,100000h,	copy_mmxopt,1000000h
	dd	copy_mmxnt,10000h,	copy_mmxnt,100000h,	copy_mmxnt,1000000h
	dd	copy_mmx,10000h,	copy_mmx,100000h,	copy_mmx,1000000h
	dd	copy_mov,10000h,	copy_mov,100000h,	copy_mov,1000000h
	dd	copy_movsd,10000h,	copy_movsd,100000h,	copy_movsd,1000000h, 0

TAB	= 9
CR	= 0Dh
caption	db	'  memcopy benchmark by S.T.A.S.',0
;;  wsprintf format string.  CPU is a 12-byte hole in the middle of the text
;;  that the entry code fills with the EBX, EDX, ECX values returned by cpuid.
form	db	'CPU '
CPU	dd	0,0,0
	db	' @ %lu MHz',CR,'Test results:',CR, CR
	db	'copy method',TAB,'bytes to copy / bandwidth', CR
	db	TAB,TAB,'16,0 Mb',TAB,TAB,'1,0 Mb',TAB,TAB,'64 Kb', CR, CR
	db	'movsd:',TAB,TAB,'%lu Mb/sec',TAB,'%lu Mb/sec',TAB,'%lu Mb/sec', CR
	db	'mov w/GPRs:',TAB,'%lu Mb/sec',TAB,'%lu Mb/sec',TAB,'%lu Mb/sec', CR
	db	'mmx (8*qwords):',TAB,'%lu Mb/sec',TAB,'%lu Mb/sec',TAB,'%lu Mb/sec', CR
	db	'mmx/movntq:',TAB,'%lu Mb/sec',TAB,'%lu Mb/sec',TAB,'%lu Mb/sec', CR
	db	'optimized mmx:',TAB,'%lu Mb/sec',TAB,'%lu Mb/sec',TAB,'n/a', CR, 0

;;===========================================================================
data	import
	library	kernel32,'KERNEL32.DLL',\
		user32,'USER32.DLL'

	include '%fasminc%/apia/kernel32.inc'
	include '%fasminc%/apia/user32.inc'
end data

;;===========================================================================

; destination buffer handed to wsprintf: 100h dwords = 1024 bytes
buf	rd 100h

; buffers for memcopy: 16Mb each, equal to the largest byte count in procs.
; dst directly follows src, so dst-80000h (used by the cache-trash loop in
; the entry code) addresses the last 512Kb of src.
align 1000h
src	rb 1000000h	; 16Mb
dst	rb 1000000h	; 16Mb