
	format	pe

	include '%fasminc%\WIN32A.INC'

;;===========================================================================

align 16
;;---------------------------------------------------------------------------
;;  copy_mmxopt -- MMX copy with software block prefetch.
;;  The data is processed in CACHESIZE-byte blocks: each block is first
;;  touched (one read per cache line, walking backwards), then copied
;;  64 bytes at a time with non-temporal stores.
;;  In:    EBP -> current entry of the procs table, [EBP+4] = number of bytes.
;;         The outer loop stops only when the index reaches exactly zero,
;;         so the count has to be a non-zero multiple of CACHESIZE.
;;  Out:   leaves through "jmp r" (there is no ret).
;;  Uses:  EAX, ECX, ESI, EDI, mm0-mm7; EBX is only read.
;;---------------------------------------------------------------------------
copy_mmxopt:
CACHELINE	equ     20h     ;; cache line length assumed (Celeron class CPUs)
CACHESIZE	equ     20000h  ;; block length: 128Kb (Celeron/Duron)
	mov	ecx, [ebp+4]            ;; ecx = byte count
	mov	esi, src
	mov	edi, dst
	add	esi, ecx                ;; esi = end of source
	add	edi, ecx                ;; edi = end of destination
	neg	ecx                     ;; ecx = negative index, counts up to 0
.block:
	add	ecx, CACHESIZE          ;; index of the end of the current block
	mov	eax, CACHESIZE / (CACHELINE*2)  ;; touch loop handles 2 lines per pass
;;  pull the block into the cache, last line first
.touch:
		test	ebx, [esi+ecx-CACHELINE]        ;; read something in this line...
		test	ebx, [esi+ecx-CACHELINE*2]      ;; ...and in the line before it
		sub	ecx, CACHELINE*2
		dec	eax
	jnz	.touch
;;  ecx is back at the start of the block: copy it, 64 bytes per pass
	mov	eax, CACHESIZE / 64
.copy:
		movq	mm0, qword [esi+ecx]
		movq	mm1, qword [esi+ecx+8]
		movq	mm2, qword [esi+ecx+16]
		movq	mm3, qword [esi+ecx+24]
		movq	mm4, qword [esi+ecx+32]
		movq	mm5, qword [esi+ecx+40]
		movq	mm6, qword [esi+ecx+48]
		movq	mm7, qword [esi+ecx+56]
		movntq	qword [edi+ecx], mm0
		movntq	qword [edi+ecx+8], mm1
		movntq	qword [edi+ecx+16], mm2
		movntq	qword [edi+ecx+24], mm3
		movntq	qword [edi+ecx+32], mm4
		movntq	qword [edi+ecx+40], mm5
		movntq	qword [edi+ecx+48], mm6
		movntq	qword [edi+ecx+56], mm7
		add	ecx, 64
		dec	eax
	jnz	.copy
	test	ecx, ecx                ;; index == 0 means every block is done
	jnz	.block
	sfence                          ;; order the non-temporal stores
	emms                            ;; release the MMX state for the FPU code
	jmp	r

align 16
;;---------------------------------------------------------------------------
;;  copy_mmxnt -- MMX copy, 64 bytes per iteration, non-temporal stores.
;;  In:    EBP -> current entry of the procs table, [EBP+4] = number of bytes.
;;         The loop stops only when the index reaches exactly zero, so the
;;         count has to be a non-zero multiple of 64.
;;  Out:   leaves through "jmp r" (there is no ret).
;;  Uses:  ECX, ESI, EDI, mm0-mm7.
;;---------------------------------------------------------------------------
copy_mmxnt:
	mov	ESI, src ;;  source array
	mov	EDI, dst ;;  destination array
	mov	ECX, [EBP+4] ;;  number of bytes
	lea	ESI, [ESI+ECX] ;;  point both registers at the array ends...
	lea	EDI, [EDI+ECX]
	neg	ECX ;;  ...and index them with a negative offset counting up to 0
@@:	movq	mm0,qword [esi+ecx]
	movq	mm1,qword [esi+ecx+8]
	movq	mm2,qword [esi+ecx+16]
	movq	mm3,qword [esi+ecx+24]
	movq	mm4,qword [esi+ecx+32]
	movq	mm5,qword [esi+ecx+40]
	movq	mm6,qword [esi+ecx+48]
	movq	mm7,qword [esi+ecx+56]
	movntq	qword [edi+ecx],mm0
	movntq	qword [edi+ecx+8],mm1
	movntq	qword [edi+ecx+16],mm2
	movntq	qword [edi+ecx+24],mm3
	movntq	qword [edi+ecx+32],mm4
	movntq	qword [edi+ecx+40],mm5
	movntq	qword [edi+ecx+48],mm6
	movntq	qword [edi+ecx+56],mm7
	add	ECX, 64
	jnz	@b
;;  movntq stores are weakly ordered; fence them before handing back to the
;;  timing code, exactly as copy_mmxopt does, so both variants are measured
;;  on the same terms
	sfence
	emms ;;  release the MMX state for the FPU code that follows
	jmp	r

align 16
;;---------------------------------------------------------------------------
;;  copy_mmx -- plain MMX copy: eight qword loads followed by eight ordinary
;;  qword stores, 64 bytes per iteration.
;;  In:    EBP -> current entry of the procs table, [EBP+4] = number of bytes.
;;         The loop stops only when the index reaches exactly zero, so the
;;         count has to be a non-zero multiple of 64.
;;  Out:   leaves through "jmp r" (there is no ret).
;;  Uses:  ECX, ESI, EDI, mm0-mm7.
;;---------------------------------------------------------------------------
copy_mmx:
	mov	ecx, [ebp+4]            ;; ecx = byte count
	mov	esi, src
	mov	edi, dst
	add	esi, ecx                ;; esi = end of source
	add	edi, ecx                ;; edi = end of destination
	neg	ecx                     ;; ecx = negative index, counts up to 0
.chunk:
	movq	mm0, qword [esi+ecx]
	movq	mm1, qword [esi+ecx+8]
	movq	mm2, qword [esi+ecx+16]
	movq	mm3, qword [esi+ecx+24]
	movq	mm4, qword [esi+ecx+32]
	movq	mm5, qword [esi+ecx+40]
	movq	mm6, qword [esi+ecx+48]
	movq	mm7, qword [esi+ecx+56]
	movq	qword [edi+ecx], mm0
	movq	qword [edi+ecx+8], mm1
	movq	qword [edi+ecx+16], mm2
	movq	qword [edi+ecx+24], mm3
	movq	qword [edi+ecx+32], mm4
	movq	qword [edi+ecx+40], mm5
	movq	qword [edi+ecx+48], mm6
	movq	qword [edi+ecx+56], mm7
	add	ecx, 64
	jnz	.chunk
	emms                            ;; release the MMX state for the FPU code
	jmp	r

align 16
;;---------------------------------------------------------------------------
;;  copy_mov -- copy with general purpose registers, two dwords (8 bytes)
;;  per iteration.
;;  In:    EBP -> current entry of the procs table, [EBP+4] = number of bytes.
;;         The loop stops only when the index reaches exactly zero, so the
;;         count has to be a non-zero multiple of 8.
;;  Out:   leaves through "jmp r" (there is no ret).
;;  Uses:  EAX, ECX, EDX, ESI, EDI.
;;---------------------------------------------------------------------------
copy_mov:
	mov	ecx, [ebp+4]            ;; ecx = byte count
	mov	esi, src
	mov	edi, dst
	add	esi, ecx                ;; esi = end of source
	add	edi, ecx                ;; edi = end of destination
	neg	ecx                     ;; ecx = negative index, counts up to 0
.pair:
	mov	eax, dword [esi+ecx]
	mov	edx, dword [esi+ecx+4]
	mov	dword [edi+ecx], eax
	mov	dword [edi+ecx+4], edx
	add	ecx, 8
	jnz	.pair
	jmp	r

align 16
;;---------------------------------------------------------------------------
;;  copy_movsd -- copy with the string instruction "rep movsd".
;;  In:    EBP -> current entry of the procs table, [EBP+4] = number of bytes
;;         (converted to a dword count by dropping the low two bits).
;;  Out:   leaves through "jmp r" (there is no ret).
;;  Uses:  ECX, ESI, EDI.
;;---------------------------------------------------------------------------
copy_movsd:
	mov	ecx, [ebp+4]            ;; byte count...
	shr	ecx, 2                  ;; ...turned into a dword count
	mov	edi, dst
	mov	esi, src
	rep	movsd
	jmp	r

;;===========================================================================
entry   $

;;  Program entry point.
;;   1. raise process/thread priority
;;   2. read the CPUID vendor string and patch it into the format string
;;   3. estimate the CPU clock by timing Sleep(delay) with RDTSC
;;   4. run every (routine, size) pair of the procs table, leaving one
;;      bandwidth dword on the stack per pair
;;   5. format all stacked dwords with wsprintf and write the text to stdout
;;  The stack is deliberately left unbalanced: the dwords left behind by
;;  step 4 become the variadic arguments of wsprintf in step 5.

	invoke	GetCurrentProcess
	invoke	SetPriorityClass, EAX, REALTIME_PRIORITY_CLASS
	invoke	GetCurrentThread
	invoke	SetThreadPriority, EAX, THREAD_PRIORITY_TIME_CRITICAL
;;  CPUID leaf 0 returns the vendor string in EBX:EDX:ECX
	xor	EAX, EAX
	cpuid
	mov	EBP, CPU
	mov	dword [EBP], EBX
	mov	dword [EBP+4], EDX
	mov	dword [EBP+8], ECX
;;  calculate CPU frequency
	xor	EBX, EBX        ;;  EBX = iteration counter
	push	EBX     ;;  zero the 64-bit tick accumulator
	push	EBX
@@:		invoke	Sleep, 0
		rdtsc
		add     [ESP], EAX      ;;  + start time, low part
		adc     [ESP+4], EDX    ;;  high part
		invoke	Sleep, [delay]
		rdtsc
		sub     [ESP], EAX      ;;  - end time: accumulates -(elapsed ticks)
		sbb     [ESP+4], EDX
		inc	EBX
		cmp	EBX, 5	        ;;  iterations
	jnz	@b
	fild	qword [ESP]	        ;;  accumulated value (negative)
	fchs			        ;;  correct sign
	push	EBX             ;;  iteration count as an integer operand
	fidiv	dword [ESP]     ;;  average ticks per Sleep(delay)
	fdiv    [KHz]           ;;  / 1000.0
	fidiv   [delay]         ;;  / delay: ticks per (1000 * delay unit)
	fst     [KHz]	        ;;  store CPU frequency (printed as MHz); the
				;;  value also stays in st0

;;  memory benchmark
	mov	EBP, procs      ;;  EBP -> current (routine, byte count) entry
memtest:	xor	EBX, EBX        ;;  EBX = iteration counter
		push	EBX     ;;  zero the 64-bit tick accumulator
		push	EBX
@@:     ;;  trash the cache
		mov esi, dst-80000h
		mov edi, esi
		mov ecx, 80000h ;; 512Kb copied onto itself
		rep movsb
		invoke	Sleep, 0
		rdtsc
		sub     [ESP], EAX      ;;  - start time, low part
		sbb     [ESP+4], EDX    ;;  high part
		jmp	dword [EBP]     ;;  run the routine; it comes back via "jmp r"
r:		rdtsc                   ;;  single read: a second, discarded RDTSC
					;;  here would only add to the timed region
		add     [ESP], EAX      ;;  + end time: accumulates elapsed ticks
		adc     [ESP+4], EDX
		inc	EBX
		cmp	EBX, 20         ;;  iterations
	jnz	@b
;;  calculate bandwidth = 2 * bytes * frequency / average ticks
	fild	dword [EBP+4]   ;;  number of bytes
	fadd	st0, st0	        ;;  double amount (read + write)
	fmul    [KHz]	        ;;  * frequency
	fild	qword [ESP]     ;;  accumulated ticks
	push	EBX
	fidiv	dword [ESP]     ;;  average ticks value
	fdivp	st1,st0         ;;  bytes*freq/ticks
	pop	EBX     ;;  drop the iteration count...
	pop	EBX     ;;  ...and the low half of the accumulator
	fistp	dword [ESP]     ;;  the remaining dword now holds the bandwidth
;;  process table
	add	EBP, 8  ;;  next proc
	cmp	dword [EBP], 0  ;; check table end
	jnz	memtest
;;  cpu frequency
	push	eax     ;;  make a local dword (its value is overwritten below)
	fld     [KHz]   ;; convert type
	fist	dword [ESP]     ;; to int
	invoke	wsprintf, buf, caption ;;  uses the dwords already on the stack
;;  show all info
	mov	EBX, EAX        ;;  wsprintf returned the text length
	push	0               ;;  dedicated dword that receives the byte count,
	mov	ESI, ESP        ;;  so WriteFile is not pointed at its own arguments
	invoke	GetStdHandle, STD_OUTPUT_HANDLE
	invoke	WriteFile, EAX, buf, EBX, ESI, 0
	invoke	ExitProcess, 0

;;===========================================================================
align   4

;;  argument of the timed Sleep call in the frequency measurement
delay	dd 500
;;  float: starts as the divisor 1000.0 and is then overwritten with the
;;  measured CPU frequency, which the format string below labels as MHz
KHz	dd 1000f

;;  benchmark table: pairs of (routine address, number of bytes), closed by
;;  a zero dword.  Results are pushed in table order, so the first entry
;;  ends up as the LAST %lu of the format string and the last entry as the
;;  first one after the frequency.
;;  copy_mmxopt has no 10000h entry: it works in 20000h-byte blocks.
procs:	dd	copy_mmxopt,100000h,	copy_mmxopt,1000000h
	dd	copy_mmxnt,10000h,	copy_mmxnt,100000h,	copy_mmxnt,1000000h
	dd	copy_mmx,10000h,	copy_mmx,100000h,	copy_mmx,1000000h
	dd	copy_mov,10000h,	copy_mov,100000h,	copy_mov,1000000h
	dd	copy_movsd,10000h,	copy_movsd,100000h,	copy_movsd,1000000h, 0

TAB     = 9
CR	fix 0Dh,0Ah
;;  wsprintf format string.  caption has no terminator of its own, so the
;;  string runs on through form, the CPU field and the rows below, up to
;;  the single 0 at the very end.
caption db	CR,'memcopy benchmark by S.T.A.S.'
form	db	CR,CR,'CPU '
;;  12 bytes overwritten at run time with the CPUID vendor string
CPU	dd      0,0,0
	db      ' @ %lu MHz',CR,'Test results:',CR, CR
	db      'copy method',TAB,'bytes to copy / bandwidth', CR
	db	TAB,TAB,'16,0 Mb',TAB,TAB,'1,0 Mb',TAB,TAB,'64 Kb', CR, CR
;;  one row per routine, columns ordered 16 Mb, 1 Mb, 64 Kb
	db      'movsd:',TAB,TAB,'%lu Mb/sec',TAB,'%lu Mb/sec',TAB,'%lu Mb/sec', CR
	db      'mov w/GPRs:',TAB,'%lu Mb/sec',TAB,'%lu Mb/sec',TAB,'%lu Mb/sec', CR
	db      'mmx (8*qwords):',TAB,'%lu Mb/sec',TAB,'%lu Mb/sec',TAB,'%lu Mb/sec', CR
	db      'mmx/movntq:',TAB,'%lu Mb/sec',TAB,'%lu Mb/sec',TAB,'%lu Mb/sec', CR
	db      'optimized mmx:',TAB,'%lu Mb/sec',TAB,'%lu Mb/sec',TAB,'n/a', CR, 0

;;===========================================================================
data	import
	library kernel32,'KERNEL32.DLL',\
		user32,'USER32.DLL'

	include '%fasminc%/apia/kernel32.inc'
	include '%fasminc%/apia/user32.inc'
end data

;;===========================================================================

;;  output buffer filled by wsprintf: 100h dwords = 1024 bytes
buf	rd 100h

;;  buffers for memcopy (uninitialized, page aligned).
;;  dst has to stay directly behind src: the cache-trashing step of the
;;  benchmark addresses dst-80000h, i.e. the last 512Kb of src.
align 1000h
src	rb 1000000h     ; 16Mb
dst	rb 1000000h     ; 16Mb