format PE console 4.0
entry main
include 'C:\fasmw16727\INCLUDE\win32a.inc'

  ; ---------------------------------------------------------------------
  ; These two macros perform the grunt work involved in measuring the
  ; processor clock cycle count for a block of code. These macros must
  ; be used in pairs, and the block of code must be placed in between
  ; the counter_begin and counter_end macro calls. The counter_end macro
  ; returns the clock cycle count for a single pass through the block of
  ; code, corrected for the test loop overhead, in EAX.
  ;
  ; These macros execute CPUID and RDTSC, so they require a Pentium-class
  ; (586) or later processor.
  ;
  ; If your code is using MMX instructions and not executing an EMMS
  ; at the end of each MMX instruction sequence, defining the symbol
  ; _EMMS will cause the counter_end macro to insert an EMMS in front of
  ; the FPU instructions.
  ;
  ; The loopcount parameter should be set to a relatively high value to
  ; produce repeatable results.
  ;
  ; Note that setting the priority parameter to REALTIME_PRIORITY_CLASS
  ; involves some risk, as it will cause your process to preempt *all*
  ; other processes, including critical Windows processes. Setting the
  ; priority parameter to HIGH_PRIORITY_CLASS instead will significantly
  ; reduce the risk, and in most cases will produce the same cycle count.
  ; ---------------------------------------------------------------------

macro	counter_begin loopcount*, priority {
        ;; Opens a clock-cycle measurement; must be paired with counter_end.
        ;;   loopcount - number of passes through the code under test (required)
        ;;   priority  - optional priority class handed to SetPriorityClass
        ;; First times an empty loop of loopcount passes (the loop overhead),
        ;; then takes the start count and opens the test loop.
        ;; On exit four dwords are left on the stack for counter_end: the
        ;; overhead count (pushed first) and the test-loop start count. The
        ;; code under test must therefore leave ESP unchanged at the end of
        ;; each pass.
        ;; CPUID overwrites EAX, EBX, ECX and EDX; RDTSC overwrites EDX:EAX.
        LOCAL label

        mov [__counter__loop__count__], loopcount
        if ~ priority eq
          invoke GetCurrentProcess
          invoke SetPriorityClass, eax, priority
        end if
        xor   eax, eax        ;; Use same CPUID input value for each call
        cpuid                 ;; Flush pipe & wait for pending ops to finish
        rdtsc                 ;; Read Time Stamp Counter

        push  edx             ;; Preserve high-order 32 bits of start count
        push  eax             ;; Preserve low-order 32 bits of start count
        mov   [__counter__loop__counter__], loopcount
        xor   eax, eax
        cpuid                 ;; Make sure loop setup instructions finish
      ALIGN 16                ;; Optimal loop alignment for P6
      @@:                     ;; Start an empty reference loop
        sub   [__counter__loop__counter__], 1
        jnz   @B

        xor   eax, eax
        cpuid                 ;; Make sure loop instructions finish
        rdtsc                 ;; Read end count
        pop   ecx             ;; Recover low-order 32 bits of start count
        sub   eax, ecx        ;; Low-order 32 bits of overhead count in EAX
        pop   ecx             ;; Recover high-order 32 bits of start count
        sbb   edx, ecx        ;; High-order 32 bits of overhead count in EDX
        push  edx             ;; Preserve high-order 32 bits of overhead count
        push  eax             ;; Preserve low-order 32 bits of overhead count

        xor   eax, eax
        cpuid
        rdtsc
        push  edx             ;; Preserve high-order 32 bits of start count
        push  eax             ;; Preserve low-order 32 bits of start count
        mov   [__counter__loop__counter__], loopcount
        xor   eax, eax
        cpuid                 ;; Make sure loop setup instructions finish
      ALIGN 16                ;; Optimal loop alignment for P6
      label:                  ;; Start test loop
        ;; Publish the unique loop label so counter_end can branch back to it
        __counter__loop__label__ equ label
}

macro	counter_end {
        ;; Closes the test loop opened by counter_begin and computes
        ;; (test count - overhead count) / loop count.
        ;; Returns the low-order 32 bits of that quotient in EAX.
        ;; Pops the four dwords counter_begin left on the stack, always sets
        ;; the process back to NORMAL_PRIORITY_CLASS (even when counter_begin
        ;; was given no priority) and reinitialises the FPU with FINIT.
        sub   [__counter__loop__counter__], 1
        jnz   __counter__loop__label__

        xor   eax, eax
        cpuid                 ;; Make sure loop instructions finish
        rdtsc                 ;; Read end count
        pop   ecx             ;; Recover low-order 32 bits of start count
        sub   eax, ecx        ;; Low-order 32 bits of test count in EAX
        pop   ecx             ;; Recover high-order 32 bits of start count
        sbb   edx, ecx        ;; High-order 32 bits of test count in EDX
        pop   ecx             ;; Recover low-order 32 bits of overhead count
        sub   eax, ecx        ;; Low-order 32 bits of adjusted count in EAX
        pop   ecx             ;; Recover high-order 32 bits of overhead count
        sbb   edx, ecx        ;; High-order 32 bits of adjusted count in EDX

        mov   dword [__counter__qword__count__], eax
        mov   dword [__counter__qword__count__ + 4], edx

        invoke GetCurrentProcess
        invoke SetPriorityClass, eax, NORMAL_PRIORITY_CLASS

        if defined _EMMS
          EMMS
        end if

        ;; Divide the 64-bit adjusted count by the loop count on the FPU.
        ;; NOTE(review): the FILD/FISTP operand sizes come from how the two
        ;; variables are declared, which is outside this block - presumably a
        ;; qword count and a dword loop count; confirm.
	finit
	fild	[__counter__qword__count__]
	fild	[__counter__loop__count__]
	fdivp	st1,st0
	fistp	[__counter__qword__count__]

	mov	eax, dword [__counter__qword__count__]
}

  ; ---------------------------------------------------------------------
  ; These two macros perform the grunt work involved in measuring the
  ; execution time in milliseconds for a specified number of loops
  ; through a block of code. These macros must be used in pairs, and
  ; the block of code must be placed in between the timer_begin and
  ; timer_end macro calls. The timer_end macro returns the elapsed
  ; milliseconds for the entire loop in EAX.
  ;
  ; These macros utilize the high-resolution performance counter.
  ; The return value will be zero if the high-resolution performance
  ; counter is not available.
  ;
  ; If your code is using MMX instructions and not executing an EMMS
  ; at the end of each MMX instruction sequence, defining the symbol
  ; _EMMS will cause the timer_end macro to insert an EMMS in front of
  ; the FPU instructions.
  ;
  ; The loopcount parameter should be set to a relatively high value to
  ; produce repeatable results.
  ;
  ; Note that setting the priority parameter to REALTIME_PRIORITY_CLASS
  ; involves some risk, as it will cause your process to preempt *all*
  ; other processes, including critical Windows processes. Setting the
  ; priority parameter to HIGH_PRIORITY_CLASS instead will significantly
  ; reduce the risk, and in most cases will produce very nearly the same
  ; result.
  ; ---------------------------------------------------------------------
macro	timer_begin loopcount*, priority {
        ;; Opens a millisecond measurement; must be paired with timer_end.
        ;;   loopcount - number of passes through the code under test (required)
        ;;   priority  - optional priority class handed to SetPriorityClass
        ;; Times an empty loop of loopcount passes first (the loop overhead),
        ;; then takes the start count and opens the test loop.
        ;; On exit four dwords are left on the stack for timer_end: the
        ;; overhead count (pushed first) and the test-loop start count, so the
        ;; code under test must leave ESP unchanged at the end of each pass.
        ;; If QueryPerformanceFrequency reports that no high-resolution counter
        ;; exists, control goes straight to the end of timer_end with EAX = 0;
        ;; nothing is pushed and the code under test is not executed.
        LOCAL label, skipped

        invoke QueryPerformanceFrequency, __timer__pc__frequency__
        test  eax, eax
        jz    skipped         ;; No counter: bypass the measurement, EAX = 0
        if ~ priority eq
          invoke GetCurrentProcess
          invoke SetPriorityClass, eax, priority
        end if

        invoke QueryPerformanceCounter, __timer__pc__count__
        push  dword [__timer__pc__count__ + 4] ;; High-order 32 bits of start count
        push  dword [__timer__pc__count__]     ;; Low-order 32 bits of start count
        mov   [__timer__loop__counter__], loopcount
      ALIGN 16                ;; Optimal loop alignment for P6
      @@:                     ;; Start an empty reference loop
        sub   [__timer__loop__counter__], 1
        jnz   @b

        invoke QueryPerformanceCounter, __timer__pc__count__
        pop   ecx             ;; Recover low-order 32 bits of start count
        sub   dword [__timer__pc__count__], ecx
        pop   ecx             ;; Recover high-order 32 bits of start count
        sbb   dword [__timer__pc__count__ + 4], ecx

        push  dword [__timer__pc__count__ + 4] ;; High-order 32 bits of overhead count
        push  dword [__timer__pc__count__]     ;; Low-order 32 bits of overhead count
        invoke QueryPerformanceCounter, __timer__pc__count__
        push  dword [__timer__pc__count__ + 4] ;; High-order 32 bits of start count
        push  dword [__timer__pc__count__]     ;; Low-order 32 bits of start count
        mov   [__timer__loop__counter__], loopcount
      ALIGN 16                ;; Optimal loop alignment for P6
      label:                  ;; Start test loop
        ;; Publish both unique labels for the matching timer_end
        __timer__loop__label__ equ label
        __timer__skip__label__ equ skipped
}

macro	timer_end {
        ;; Closes the test loop opened by timer_begin and returns in EAX
        ;; (test count - overhead count) / frequency * 1000, i.e. the elapsed
        ;; milliseconds for the whole loop; EAX is 0 when timer_begin found no
        ;; high-resolution counter.
        ;; The loop tail is only the decrement and branch, matching the
        ;; reference loop in timer_begin, so no API call is charged to the
        ;; code under test.
        ;; Pops the four dwords timer_begin left on the stack, sets the
        ;; process back to NORMAL_PRIORITY_CLASS and reinitialises the FPU.
        sub   [__timer__loop__counter__], 1
        jnz   __timer__loop__label__

        invoke QueryPerformanceCounter, __timer__pc__count__
        pop   ecx             ;; Recover low-order 32 bits of start count
        sub   dword [__timer__pc__count__], ecx
        pop   ecx             ;; Recover high-order 32 bits of start count
        sbb   dword [__timer__pc__count__ + 4], ecx
        pop   ecx             ;; Recover low-order 32 bits of overhead count
        sub   dword [__timer__pc__count__], ecx
        pop   ecx             ;; Recover high-order 32 bits of overhead count
        sbb   dword [__timer__pc__count__ + 4], ecx

        invoke GetCurrentProcess
        invoke SetPriorityClass, eax, NORMAL_PRIORITY_CLASS

        if defined _EMMS
          EMMS
        end if

        finit
        fild  [__timer__pc__count__]
        fild  [__timer__pc__frequency__]
        fdivp st1, st0        ;; Seconds = counts / counts-per-second
        mov   [__timer__dw_count__], 1000
        fild  [__timer__dw_count__]
        fmulp st1, st0        ;; Milliseconds
        fistp [__timer__dw_count__]
        mov   eax, [__timer__dw_count__]
      __timer__skip__label__: ;; timer_begin lands here when no counter exists
}

; Assemble-time count of profile_begin expansions so far; profile_begin uses
; it to pick a 12-byte slot in inputbuffer and then increments it.
profile_number = 0

macro	dbdata [data] {
	; Emits each argument as data bytes at assembly time:
	;   quoted string      -> the characters themselves
	;   numeric expression -> its decimal ASCII digits, no padding
	;                         (intended for non-negative values)
	if data eqtype ""
		db data
	else if data eqtype 0
		local dividend, divisor
		; Find the largest power of ten not exceeding the value. Starting
		; from 1 (not 10) keeps one-digit values from gaining a leading "0".
		divisor = 1
		dividend = data
		while dividend >= 10 * divisor
			divisor = divisor * 10
		end while
		; Peel off digits from the most significant end
		while divisor > 1
			db (dividend/divisor)+"0"
			dividend = dividend mod divisor
			divisor = divisor/10
		end while
		db dividend+"0"
	end if
}

macro new_data_label {
	; Moves on to the next generated name: appends "Z" to the current value
	; of the symbolic constant data_label (_ -> _Z -> _ZZ ...) and points
	; data_size at the matching <name>.size symbol.
	match n,data_label \{
		data_label equ n\#Z
		data_size equ n\#Z\#.size
		\}
 }

data_label equ _		; initialize
; Empty symbolic constant: ends the chain of nested create_data macros that
; adddata builds (the innermost "create_data" line expands to nothing).
create_data equ

; adddata arg,...
;   Registers one text line to be emitted later and selects a fresh
;   data_label / data_size pair that the caller can reference right away.
;   Each call wraps the previous create_data in a new macro of the same name;
;   expanding the outermost create_data resets data_label, replays every
;   earlier registration in order, then emits this entry as
;       <label>: <text> , CR, LF     with <label>.size = its length in bytes.
;   NOTE(review): create_data is presumably expanded once in a data section
;   outside this chunk - confirm.
macro	adddata [arg] {
	common
	new_data_label
	local _arg		; use local variables to store instances of global variables
	_arg equ arg
	macro	create_data \{
		data_label equ _
		create_data
		new_data_label
		data_label:
			match a,_arg \\{
			dbdata a \\}	; translate _arg into actual values
			db 13,10
		.size = $-data_label
	\}
}

; profile_begin
;   Opens one profiled test. Reads the assemble-time settings count, max and
;   use_rdtsc:
;     count = 0  -> a single timed pass (rpt = 1) in which profile_end counts
;                   EAX down from max to 0, running the test for each value
;     count <> 0 -> count timed passes, each with EAX = max
;     use_rdtsc = 1 selects counter_begin, any other value timer_begin.
;   At the start of the code under test EAX = input number (a copy is on the
;   stack) and EDI = this test's 12-byte slot in inputbuffer.
macro	profile_begin {
	rpt = count
	if count = 0
		rpt = 1
	end if
	invoke	Sleep,0
	invoke	GetCurrentProcess
	invoke	FlushInstructionCache,eax,0,0
	if use_rdtsc = 1
		counter_begin rpt, HIGH_PRIORITY_CLASS
	else
		timer_begin rpt, HIGH_PRIORITY_CLASS
	end if
	mov	eax,max
  @@:	push	eax		; saved so profile_end can recover the input number
  	mov	edi,inputbuffer + profile_number * 12
  	profile_number = profile_number + 1
  	; profile_loop expands to @b at its point of use, so the code under test
  	; must not define an anonymous (@@) label of its own
  	profile_loop equ @b
}

; profile_end suffix
;   Closes the test opened by profile_begin, then prints the measurement
;   followed by the text given in suffix.
;   The result from counter_end / timer_end stays in EAX; ESI and ECX are
;   loaded with the address and length of the suffix text before calling
;   __putclocks or __putticks.
macro	profile_end suffix {
	pop	eax		; input number pushed by profile_begin
	if count = 0
		sub	eax,1
		jnc	profile_loop	; repeat for max-1 ... 0, stop on borrow
	end if
	if use_rdtsc = 1
		counter_end
	else
		timer_end
	end if
	adddata suffix
	mov	esi,data_label
	mov	ecx,data_size
	if use_rdtsc = 1
		call	__putclocks
	else
		call	__putticks
	end if
}

; print arg,...
;   Registers the arguments (strings and/or numbers) as one text line via
;   adddata and writes that line to standard output at run time.
macro	print [arg] {
	common
	adddata arg
	invoke	GetStdHandle,STD_OUTPUT_HANDLE
	invoke	WriteFile,eax,data_label,data_size,byteswritten,0
}

;------------------------------------------------------------------------
section '.text' code readable executable

; n2dTest a,b,c
;   Expands to one complete benchmark run.
;     a -> max        input number handed to every routine in EAX
;     b -> count      0 = count down from max to 0 inside one timed pass,
;                     otherwise repeat the routine b times with max as input
;     c -> use_rdtsc  1 = measure clock cycles with RDTSC (good for short tests),
;                     any other value = measure milliseconds with the
;                     performance counter
;   Prints "<b> loops", one result line per profiled routine, then a line feed.
macro n2dTest a,b,c{
max		= a
count		= b
use_rdtsc	= c
;max = 999999999	; number to use for input

;	count = 1000	; if 0 then count down from max to 0
			; else repeat routine n times, using max as input number each time
;	use_rdtsc = 1	; 1 = use rdtsc (good for short tests)
			; any other value = use milliseconds
;	print a," processed ",b," times"
	print b," loops"

;	profile_begin
;	call	IntToStr32_JOH_IA32_6_c
;	profile_end "IntToStr32_JOH_IA32_6_c"

;	profile_begin
;	call	dw2a
;	profile_end "dw2a"

;	profile_begin
;	stdcall	udwordl,edi,eax
;	profile_end "udwordl"


;	profile_begin
;	call	putdec
;	profile_end "putdec"

;	profile_begin
;	mov	esi,eax
;	call	bin2ascii
;	profile_end "bin2ascii"

;	profile_begin
;	mov	esi,eax
;	call	ulong2ascii
;	profile_end "ulong2ascii"

;	profile_begin
;	stdcall	declspec,edi,eax
;	profile_end "declspec"

;	profile_begin
;	call	revITOA
;	profile_end "revITOA"

	profile_begin
	call	emptyFunc
	profile_end "empty32bitFunc"

	; The remaining routines are 64-bit code reached through a far call to
	; code segment selector $33
	profile_begin
	call	$33:empty64bitFunc
	profile_end "empty64bitFunc"

	profile_begin
	call	$33:func2x64
	profile_end "cmps way (64bit)"

	profile_begin
	call	$33:func1x64
	profile_end "cmp way (64bit)"

	profile_begin
	call	$33:func3x64
	profile_end "Borsuc's way (64bit)"

	; CR LF CR LF after the last used inputbuffer slot
	mov	dword [inputbuffer+profile_number * 12],0A0D0A0Dh
	invoke	GetStdHandle, STD_OUTPUT_HANDLE
	; lpNumberOfBytesWritten may only be NULL when lpOverlapped is not NULL,
	; so pass the shared byteswritten variable like the other WriteFile calls
	invoke	WriteFile, eax, blah, 1, byteswritten, 0
;	invoke	WriteFile, eax, inputbuffer, 88, byteswritten, 0
}
; Program entry point: a fixed series of benchmark runs, each expanded in
; line from the n2dTest macro (input number, loop count, 1 = RDTSC / 0 = ms).
main:

;n2dTest 999999999,1,1
;n2dTest 10,1,1
;n2dTest 57129,1,1
;n2dTest 1248681091,1,1
;n2dTest 999999999,1000,1
;n2dTest 10,1000,1
;n2dTest 57129,1000,1
;n2dTest 1248681091,1000,1
;n2dTest 999999,0,0

n2dTest 1248681091,1,1
n2dTest 1248681091,2,1
n2dTest 1248681091,1,1
n2dTest 1248681091,2,1
n2dTest 1248681091,5,1
n2dTest 1248681091,1000,1
n2dTest 1248681091,100000,1
n2dTest 1248681091,1000000,1
n2dTest 1248681091,1000000,0



;quit:   invoke	ExitProcess,1
; Sleep is called with -1 (all bits set) as its timeout instead of exiting.
; NOTE(review): presumably meant to keep the console output visible; the data
; byte below would be executed if this call ever returned - confirm intent.
quit:   invoke	Sleep,-1
blah	db 10				; single line feed written by n2dTest
;------------------------------------------------------------------------
;------------------------------------------------------------------------

; __putclocks - print a cycle count followed by a label text
;   In:  EAX = unsigned value to print (only these 32 bits are used)
;        ESI = address of the label text, ECX = its length in bytes
;   The label text is copied to clocks.add, the decimal digits of EAX are
;   stored backwards ending just below clocks.begin, and everything from the
;   first digit to the end of the copied label goes to standard output in one
;   WriteFile call.
;   NOTE(review): clocks.begin / clocks.add are defined outside this chunk -
;   presumably one buffer laid out as digit area, fixed text, label area.
;   Expects the direction flag to be clear (rep movsb).
;   Entry point .go is shared with __putticks.
__putclocks:
	mov	edi,clocks.add		;EDI = where to append postfix
	rep	movsb
	mov	esi,clocks.begin
.go:	mov	ecx,10
.div:	dec	esi			;Reverse storage
	sub	edx,edx
	div	ecx			; Divide EDX:EAX, EAX:=quotient, EDX:=remainder
	add	dl,'0'			; convert DL to ascii digit
	mov	[esi],dl
	test	eax,eax
	jnz	.div
	sub	edi,esi			; EDI = byte count from first digit to end of label
	invoke	GetStdHandle, STD_OUTPUT_HANDLE
	invoke	WriteFile, eax, esi, edi, byteswritten, 0
	ret

; __putticks - same as __putclocks but builds the line in the ticks buffer
;   In:  EAX = value to print, ESI = label text, ECX = label length
;   Copies the label to ticks.add, then reuses the digit conversion and
;   output code of __putclocks, writing digits backwards from ticks.begin.
__putticks:
	mov	edi,ticks.add		;EDI = where to append postfix
	rep	movsb
	mov	esi,ticks.begin
	jmp	__putclocks.go

;Fastest for smallest numbers
; putdec - recursive unsigned 32-bit to decimal ASCII
;   In:  EAX = number, EDI = destination buffer
;   Out: EDI = address just past the last digit stored (no terminator)
;   Overwrites EAX, ECX, EDX; uses two stack dwords per digit beyond the first.
align	16
putdec:
	mov	ecx,10			; divisor, kept in ECX for the whole recursion
.split:
	cmp	eax,ecx
	jb	.emit			; a single digit is left: store it
	xor	edx,edx
	div	ecx			; EAX = quotient, EDX = lowest digit
	push	edx			; park the low digit
	call	.split			; store the higher digits first
	pop	eax			; then come back for the parked digit
.emit:
	add	al,'0'
	stosb
	ret

;Binary-to-ASCII Decimal Conversion Suppressing Leading Zeros
;Adapted from algo by Paul Dixon & Lingo
; dw2a
;   In:  EAX = unsigned 32-bit number, EDI = destination buffer
;   The number is classified by magnitude and each class has its own code
;   path; digit pairs are fetched from chartab, which is defined outside this
;   chunk (presumably the 100 two-character strings "00".."99" - confirm).
;   Division is replaced by multiplication with fixed-point reciprocals.
;   Depending on the path, EAX, EBX, ECX, EDX, ESI and EBP are overwritten and
;   EDI is moved; none of them is saved.
;   Several conditional jumps rely on flags set a few instructions earlier,
;   with only flag-preserving MOV/LEA/MOVZX in between - keep the order.
align	16
dw2a:	cmp	eax,1000000000
	mov	edx,0D1B71759h		; 2^45/10000 = fixed point 1/10000 with 45 digits
	mov	ebp,eax			; save a copy of the number
	jb	.9dig
 	mul	edx			; EDX = integer (quotient) + 45 digits after the dot
	shr	edx,13			; round down EDX quotient (6 high digits)
	mov	eax,68DB9h		; 2^32/10000+1= 1/10000 with 32 digits & rounded up
	imul	esi,edx,10000		; ESI = quotient * 10000
	mul	edx			; * 1/10000 with 32 digits (so EDX = integer = first 2 digits)
	sub	ebp,esi			; EBP = remainder (lower 4 digits)
	mov	edx,dword [chartab+edx+edx]	; look up 2 digits
	mov	esi,100			; load ready for later
	mov	[edi], edx 	; write them to answer
.dig8:	mul	esi				; get next 2 digits
	mov	edx,dword [chartab+edx+edx]	; look up 2 digits
	mov	[edi+2],edx			; write them to answer
	mul	esi				; get next 2 digits
.dig6a:	mov	edx,dword [chartab+edx+edx]	; look up 2 digits
.dig4a:	mov	[edi+4], edx			; write them to answer
.dig4:	mov	eax,28F5C29h			; 2^32\100 +1
	mul	ebp				; ebp= lower 4 digits
	imul	eax,edx,-200
	movzx  	eax,word [chartab+ebp*2+eax]
	mov	edx,dword [chartab+edx+edx]
	add	edi,10
	mov	[edi-4],edx
	mov	[edi-2],eax
	ret

;	mov	edx,dword [chartab+edx+edx]	; look up 2 digits
;	mov	[edi+6],edx			; write to answer
;;	mul	esi				; get final 2 digits
;	shr	eax,7
;	imul	eax,eax,100			;fast multiplication
;	add	eax,1000000h			;round up before rounding down
;	shr	eax,25
;	add	edi,10
;	movzx	eax,word [chartab+eax+eax]	; look them up
;	mov	[edi-2],eax			; write to answer and zero terminate string
;	ret					; all done

.2dig:	cmp	eax,10
	movzx	eax,word [chartab+eax+eax]	; look them up
	mov	[edi+6],eax
	jae	.dig2a
	mov	[edi+6],ah
.dig2a:	sbb	edi,-8			; uses the carry from "cmp eax,10" above
	ret
.4dig:	cmp	eax,1000		; 1000 to 9999
	lea	edi,[edi-6]
	jae	.dig4
	cmp	eax,100			; 0-99
	lea	edx,[eax+eax*4]		; 100-999
	jb	.2dig
	lea	edx,[eax+edx*8]
	shr	edx,12			;EDX = n * 41/4096 = quotient, accurate up to 9990
	add	edi,9
	imul	ebx,edx,-200
	or	dl,'0'
	mov	[edi-3],edx
	movzx	eax,word [chartab+ebx+ebp*2]
	mov	[edi-2],eax
	ret
.8dig:	mul	edx
	mov	edx,dword [chartab+edx+edx]	; look up 2 digits
	mov	[edi+2],edx			; write to answer
;	mul	esi				; get final 2 digits
	shr	eax,7
	imul	edx,eax,100			;fast multiplication
	add	edx,1000000h			;round up before rounding down
	shr	edx,25
	jmp	.dig6a
.6dig:	cmp	eax,100000
	mov	ecx,28F5C29h           ;((2^32)+100-1)/100}
	mov	ebx,eax                ;Dividend}
	sbb	edi,-6
	mul	ecx                     ;EDX = Dividend DIV 100}
	mov	eax,edx                ;Set Next Dividend}
	imul	edx,-200               ;-2 * (100 * Dividend DIV  100)}
	mov	esi,eax                ;Dividend}
	movzx	ebx,word [chartab+ebx*2+edx] ;Dividend MOD 100 in ASCII}
	mul	ecx                     ;EDX = Dividend DIV 100}
	imul	eax,edx,-200
	cmp	edx,10
	mov  	eax,dword [chartab+esi*2+eax] ;Dividend MOD 100 in ASCII}
	mov	edx,dword [chartab+edx+edx]
	mov	[edi-5],dh
	jb	.5dig
	mov	[edi-6],edx
.5dig:	mov	[edi-4],eax
	mov	[edi-2],ebx
	ret
.7dig:	lea	ebx,[edx+edx*4]
	lea	ebx,[edx+ebx*8]
	shr	ebx,12
	lea	edi,[edi-1]
	imul	ecx,ebx,-200
	or	bl,'0'
	mov	[edi+3],ebx
	movzx	edx,word [chartab+ecx+edx*2]
	jmp	.dig4a
.9dig:	cmp	eax,10000
	jb	.4dig
	cmp	eax,1000000
	jb	.6dig
	mul	edx
	shr	edx,13			; EDX = 5 high digits
	mov	eax,28F5C29h			; 2^32\100 +1
	imul	ebx,edx,10000		; EBX = quotient * 10000
	lea	edi,[edi-2]
	sub	ebp,ebx			; EBP = remainder (lower 4 digits)
	cmp	edx,1000
	jb	.7dig
	cmp	edx,10000
	jb	.8dig
;	mov	ebx,edx                ;Dividend}
;	mul	edx                     ;EDX = Dividend DIV 100}
;	mov	esi,edx                ;Dividend}
;	imul	edx,-200               ;-2 * (100 * Dividend DIV  100)}
;	mov	eax,28F5C29h           ;((2^32)+100-1)/100}
;	movzx	ebx,word [chartab+ebx*2+edx] ;Dividend MOD 100 in ASCII}
;	mul	esi                     ;EDX = Dividend DIV 100}
;	dec	edi
;	imul	eax,edx,-200
;	mov  	eax,dword [chartab+esi*2+eax] ;Dividend MOD 100 in ASCII}
;	or	dl,'0'
;	mov	[edi+2],eax
;	mov	[edi+4],ebx
;	mov	[edi+1],dl
;	jmp	.dig4

	mov	eax,68DB9h		; 2^32/10000+1= 1/10000 with 32 digits & rounded up
	mul	edx			; * 1/10000 with 32 digits (so EDX = integer = first 2 digits)
	mov	esi,100
	or	dl,'0'
	inc	edi
	mov	[edi+1],edx
	jmp	.dig8

; ulong2ascii - unsigned 32-bit to decimal ASCII without leading zeros
;   In:  EAX = number, EDI = destination buffer
;   Out: EDI = address of the last digit stored; a zero byte follows it
;   Overwrites EAX, EBX, ECX, EDX, ESI.
;   The leading digit comes from a multiplication by a rounded 1e-9
;   reciprocal; the rest of the number is turned into a fixed-point fraction
;   (1.0 = 2^28) and each further digit is produced by multiplying the
;   fraction by ten. EBX accumulates the digits seen so far, and the
;   "cmp ebx,1 / sbb edi,-1" pair advances EDI only once a non-zero digit
;   has been stored.
ulong2ascii:
        mov     ecx,eax         ; save original argument
        mov     esi,89705f41h   ; 1e-9*2^61 rounded
        mul     esi             ; divide by 1e9 by mult. with recip.
        add     eax,80000000h   ; round division result
        mov     esi,0abcc7712h  ; 2^28/1e8 * 2^30 rounded up
        adc     edx,0           ; EDX<31:29> = argument / 1e9
        mov     eax,ecx         ; restore original argument
        shr     edx,29          ; leading decimal digit, 0...4
        mov     ecx,8           ; produce eight more digits
        mov     ebx,edx         ; flags whether non-zero digit seen yet
        or      edx,'0'         ; convert digit to ASCII
        mov     [edi],dl        ; store out to memory
        cmp     ebx,1           ; first digit nonzero ? CY=0 : CY=1
        sbb     edi,-1          ; incr. pointer if first digit non-zero
        imul    ebx,1000000000  ; multiply quotient digit by divisor
        sub     eax,ebx         ; remainder after first digit
        mul     esi             ;  convert number < 1e9
        shld    edx,eax, 2      ;   into fraction such
        inc     edx             ;    that 1.0 = 2^28
        mov     eax,edx         ; save result
        shr     eax,28          ; next digit
        and     edx,0fffffffh   ; fraction part
        or      ebx,eax         ; any non-zero yet ?
        or      eax,'0'         ; convert digit to ASCII
.cvt_loop:
        mov     [edi],al        ; store digit out to memory
        add     edx,edx         ; 2*fraction
        cmp     ebx,1           ; any non-zero digit seen ? CY=0 : CY=1
        lea     edx,[edx*4+edx] ; 10*fraction, new digit EAX<31:28>,
                                ; new fraction EAX<27:0>
        sbb     edi,-1          ; incr. ptr if any non-zero digit seen
        mov     eax,edx         ; save result
        shr     eax,28          ; next digit = integer part
        and     edx,0fffffffh   ; fraction part
        or      ebx,eax         ; any non-zero digit yet ?
        or      eax,'0'         ; convert digit to ASCII
        dec     ecx             ; one more digit
        jnz     .cvt_loop       ; until all nine digits done
        mov     [edi],al        ; store last digit out to memory
        mov     [edi+1],ah     ; place string end marker
        ret

; bin2ascii - unsigned 32-bit to exactly ten decimal ASCII digits
;   In:  ESI = number, EDI = destination buffer
;   Out: ten digits stored at the original EDI (leading zeros included, no
;        terminator); EDI = address of the tenth digit
;   Overwrites EAX, ECX, EDX; uses one stack dword.
;   prox_dig_hi and prox_dig_lo are file-global labels.
align	16
bin2ascii:
; n_from / n_to are not referenced anywhere in the visible part of the file
n_from	equ	1234567890
n_to	equ	1234567899
magic1	equ	0a7c5ac47h
magic2	equ	068db8badh

;On entry: esi - number to be converted, edi - destination of ASCII result
;This algorithm generates decimal digits by successive multiplications of fractions by 10.
;The objective is to eliminate slow div instructions. Let's see how:
;Starting from a 32 bit binary number 0XXXXXXXXh, we know that 0XXXXXXXXh <= 4294967295d
;To keep precision, it is convenient to separate this number into high and low order
;decimal numbers, each with 5 digits, and work separately with both parts. This also
;helps optimizing for modern processors, but not in this didactic example.
;For example, 4294967295d would be separated in 42949 (high) and 67295 (low).
;This could be done with a div instruction, dividing by 100000d, but to avoid the div,
;we multiply by the reciprocal, i.e. 1/100000, in hex: 0.0000a7c5ac471b478423
;This number has to be fitted in integer registers to use the mul instruction.
;Thus: edx:eax = 0000.0000:a7c5ac47, after mul by 0XXXXXXXXh we obtain:
;edx:eax = qqqq.rrrr:rrrrrrrr (q - quotient, r - remainder, fraction form)
	mov	eax,magic1
	mul	esi

;Turns out this does not have enough precision, because many digits of the fraction
;0.0000a7c5ac471b478423 were ignored. I verified that it is enough to add just
;another precision nibble, rounding to 0.0000a7c5ac472
;multiply by 0.20000000h = divide by 8 = shr by 3
	mov	ecx,esi
	shr	ecx,3

;sum the two partial terms, obtaining the final edx:eax = qqqq.rrrr:rrrrrrrr
	add	eax,ecx
	adc	edx,0

;now we separate the quotient from the remainder:
;qqqq.rrrr:rrrrrrrr -> qqqq.0000h, 0.rrrrrrrh (have to keep track of the hexadecimal point)
	shrd	eax,edx,20		;separate remainder
	and	edx,0FFFF0000h		;mask quotient
	inc	eax			;we lose 4 significant remainder nibbles, round up
	and	eax,0FFFFFFFh		;remove quotient nibble from remainder.
	push	eax			;store remainder

;Now we can process the quotient to obtain the five high decimal digits.
;We have qqqq.0000h in edx, and we know that qqqqh <= 42949d, so if we divide by 10000d
;using the multiply by reciprocal method, we obtain q.rrrrrrrh in edx. To do this, we
;multiply by 0.000068db8badh
	mov	eax,magic2
	mul	edx
	inc	edx			;round up

;We already have the first decimal digit isolated in the high nibble of edx, so now it can
;be stored, and masked out to keep the remainder.
	mov	eax,edx
	shr	edx,28
	and	eax,0FFFFFFFh
	add	dl,'0'
	mov	[edi],dl

;The rest of the digits are extracted from the remainder by successive multiplies by 10d,
;masking the remainder for the next step.
	mov	ecx,4
prox_dig_hi:
	lea	eax,[4*eax+eax]		;multiply by 5
	inc	edi
	add	eax,eax			;multiply by 2
	mov	edx,eax
	shr	edx,28
	and	eax,0FFFFFFFh
	add	dl,'0'
	dec	ecx
	mov	[edi],dl		;mov leaves the flags from "dec ecx" intact
	jnz	prox_dig_hi

;Recover the lower digits and repeat the same process.
	pop	eax
	mov	ecx,5
prox_dig_lo:
	lea	eax,[4*eax+eax]
	inc	edi
	add	eax,eax
	mov	edx,eax
	shr	edx,28
	and	eax,0FFFFFFFh
	add	dl,'0'
	dec	ecx
	mov	[edi],dl
	jnz	prox_dig_lo
	ret

;Fastest for medium range numbers
; IntToStr32_JOH_IA32_6_c - unsigned 32-bit to zero-terminated decimal ASCII
;   In:  EAX = number, EDI = destination buffer
;   The digit count is worked out first (nine compare/SBB pairs), the
;   terminator is stored, then digits are written two at a time from the
;   right-hand end using pairs looked up in chartab (defined outside this
;   chunk). A ten-digit value has its leading digit removed by repeated
;   subtraction of 1000000000 first.
;   EDI is advanced by the STOS that writes the leading one or two digits
;   (and by one more for a ten-digit value).
;   Overwrites EAX, EBX, ECX, EDX, ESI.
align 16
IntToStr32_JOH_IA32_6_c:
	mov	esi,10                 ;Max Digits in Result
	cmp	eax,10
  	sbb	esi,0
  	cmp	eax,100
  	sbb	esi,0
  	cmp	eax,1000
  	sbb	esi,0
  	cmp	eax,10000
  	sbb	esi,0
  	cmp	eax,100000
  	sbb	esi,0
  	cmp	eax,1000000
  	sbb	esi,0
  	cmp	eax,10000000
  	sbb	esi,0
  	cmp	eax,100000000
  	sbb	esi,0
  	cmp	eax,1000000000
  	sbb	esi,0
	mov	byte [edi+esi],0 ;Add Null Terminator
	sub	esi, 2
	jbe	.2dig
	cmp	esi,8		;10 Digit Value?}
	jb	.9dig		;Not a 10 Digit Value}
	mov	ecx,'0'
@@:	sub	eax,1000000000
	inc	ecx		;INC leaves the carry from SUB intact for JNB
	jnb	@b
	dec	ecx
	add	eax,1000000000
	mov	[edi], cl	;Save Digit 10}
	mov	esi, 7          ;9 Digits Remaining}
	inc	edi		;Destination of 2nd Digit}
.9dig:	mov	ecx, 28F5C29h           ;((2^32)+100-1)/100}
.loop:	mov	ebx, eax                ;Dividend}
	mul	ecx                     ;EDX = Dividend DIV 100}
	mov	eax, edx                ;Set Next Dividend}
	imul	edx, -200               ;-2 * (100 * Dividend DIV  100)}
	mov  	edx, dword [chartab+ebx*2+edx] ;Dividend MOD 100 in ASCII}
	mov	[edi+esi], dx
	sub	esi, 2
	ja	.loop		;Loop Until 1 or 2 Digits Remaining}
.2dig:	jnz    .1dig		;flags come from the preceding "sub esi, 2"
	movzx  eax, word [chartab+eax+eax]
	stosw
	ret
.1dig:	or     al , '0'                ;Ascii Adjustment}
	stosb
	ret

;Fastest for large numbers
;Binary-to-ASCII Decimal Conversion Suppressing Leading Zeros
; algo from Paul Dixon		        ;
; udwordl - stdcall (sptr, number)
;   In:  [esp+4] = sptr, destination buffer
;        [esp+8] = number, unsigned 32-bit value
;   Out: zero-terminated decimal string at sptr, leading zeros suppressed
;        EAX = address of the terminating zero (on every exit path)
;   ESI and EDI are preserved: they are parked in the two argument slots,
;   which are no longer needed once the arguments have been loaded.
;   Overwrites ECX, EDX; pops both arguments on return.
;   The number is split into six high digits and four low digits and handled
;   as pairs; the nextN blocks skip pairs that are zero and the ZSn entry
;   points continue with the remaining pairs. Digit pairs come from chartab,
;   which is defined outside this chunk.
;   Several branches test flags set a few instructions earlier (MOV and LEA
;   leave them untouched), and each "jne/jnz" right after "sub ecx,1" tests
;   the buffer pointer, which is not expected to reach zero - keep the
;   instruction order.
align 16
udwordl: 				;unsigned DWORD to ASCII
	mov	eax,	[esp+2*4]	; eax->number
   	mov	edx,	0D1B71759h	; =2^45\10000    13 bit extra shift
	mov	[esp+2*4], edi		; save edi
	mov	edi,	eax		; save a copy of the number
	mov	ecx,	[esp+1*4]	; ecx-> sptr
 	mul	edx			; gives 6 high digits in edx
	mov 	[esp+1*4], esi		; save esi
	shr	edx,	13		; correct for multiplier offset used to give better accuracy
	mov	eax,	68DB9h		; =2^32\10000+1
	je	skiphighdigits		; if zero then don't need to process the top 6 digits
	imul	esi, edx, 10000		; get a copy of high digits ;scale up high digits
	sub	edi, 	esi		; subtract high digits from original. EDI now = lower 4 digits
	mul	edx			; get first 2 digits in edx
	mov	esi,	100		; load ready for later
	jnc	next1			; if zero, supress them by ignoring
	cmp	edx,	9		; 1 digit or 2?
	lea	ecx,	[ecx+8]		;
	ja	ZeroSupressed		; 2 digits, just continue with pairs of digits to the end
	add	edx, 	30h		;
	sub	ecx,	1		; update pointer by 1
	mov	[ecx-7],	edx	; but only write the 1 we need, supress the leading zero
	jne	ZS1  			; continue with pairs of digits to the end
align 16
ZeroSupressed:						;
	mov	edx,	dword [chartab+edx+edx]	; look up 2 digits
	mov	[ecx-8], edx 				; write them to answer
ZS1:							;
	mul	esi					; get next 2 digits
ZS1a:							;
	mov	edx,	dword [chartab+edx+edx]	; look up 2 digits
	mov	[ecx-6], edx				; write them to answer
ZS2:							;
	mul	esi					; get next 2 digits
ZS2a:							;
	mov	edx,	dword [chartab+edx+edx]	; look up 2 digits
	mov	[ecx-4], edx				; write them to answer
ZS3:							;
	mov	eax,	28F5C29h			; 2^32\100 +1
	mul	edi					; edi= lower 4 digits
ZS3a:							;
	mov	edx,	dword [chartab+edx+edx]	; look up 2 digits
	mov	[ecx-2],	edx				; write to answer
ZS4:							;
	mul	esi					; get final 2 digits
ZS4a:							;
	mov	esi,	[esp+1*4]			; restore esi
	lea	eax,	[ecx+2]			; return value: address of the terminator
	mov	edi,	[esp+2*4]			; restore edi
	movzx	edx,	word [chartab+edx+edx]	; look them up (in EDX, so EAX keeps the return value)
	mov	[ecx],	edx				; write to answer and zero terminate string
	ret	2*4					; all done
align 16							;
next1:							;
	mul	esi  					; get next 2 digits
	jnc	next2                  			; if zero, supress them by ignoring
	cmp	edx,	9                  			; 1 digit or 2?
	lea	ecx,	[ecx+6]			;
	ja	ZS1a                   			; 2 digits, just continue with pairs of digits to the end
	add	edx,	30h				;
	sub	ecx,	1		   		; update pointer by 1
	mov	[ecx-5], edx 				; but only write the 1 we need, supress the leading zero
	jnz	ZS2					; continue with pairs of digits to the end
align 16							;
next2:							;
	mul	esi					; get next 2 digits
	jnc	next3					; if zero, supress them by ignoring
	cmp	edx,	9				; 1 digit or 2?
	lea	ecx,	[ecx+4]			;
	ja	ZS2a					; 2 digits, just continue with pairs of digits to the end
	add	edx,	30h				;
	sub	ecx,	1				; update pointer by 1
	mov	[ecx-3],	edx				; but only write the 1 we need, supress the leading zero
	jnz	ZS3					; continue with pairs of digits to the end
align 16							;
next3:							;
skiphighdigits:						;
	mov	eax,	28F5C29h         		; 2^32\100 +1
	mul	edi					; edi = lower 4 digits
	mov	esi,	100				;
	jnc	next4					; if zero, supress them by ignoring
	cmp	edx,	9				; 1 digit or 2?
	lea	ecx,	[ecx+2]			;
	ja	ZS3a					; 2 digits, just continue with pairs of digits to the end
	add	edx,	30h				;
	sub	ecx,	1				; update pointer by 1
	mov	[ecx-1],	edx				; but only write the 1 we need, supress the leading zero
	jnz	ZS4					; continue with pairs of digits to the end
align 16							;
next4:							;
	mul	esi					; this is the last pair so don't supress a single zero
	cmp	edx,	9				; 1 digit or 2?
	ja	ZS4a					; 2 digits, just continue with pairs of digits to the end
	mov	esi,	[esp+1*4]			; restore esi
	add	edx,	30h				;
	mov	edi,	[esp+2*4]			; restore edi
	lea	eax,	[ecx+1]			; return value: address of the terminator
	mov	[ecx], 	edx				; zero terminate string
	ret	2*4					; all done

; declspec - stdcall (sptr, x): unsigned 32-bit to decimal ASCII, no leading zeros
;   In:  [esp+4] = sptr, destination buffer
;        [esp+8] = x, value to convert
;   Out: zero-terminated string at sptr
;   Preserves EBX and EDI; overwrites EAX, ECX, EDX; pops both arguments.
;   The leading digit comes from a multiplication by a rounded 1e-9
;   reciprocal. The remainder is turned into a fixed-point fraction and each
;   further digit is produced by multiplying the fraction by 5 while the
;   binary point moves down one bit per step (5 * 2 = 10). EBX accumulates the
;   digits seen so far; "cmp ebx,1 / sbb edi,-1" advances the pointer only
;   once a non-zero digit has been stored.
align	16
declspec:			;AMD http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/25112.PDF, pg 183
	push edi 		; Save as per calling conventions.
	push ebx 		; Save as per calling conventions.
	mov edi, [esp+12] 	; sptr
	mov eax, [esp+16] 	; x
	mov ecx, eax 		; Save original argument.
	mov edx, 89705F41h 	; 1e-9 * 2^61 rounded
	mul edx 		; Divide by 1e9 by multiplying with reciprocal.
	add eax, eax 		; Round division result.
	adc edx, 0 		; EDX[31-29] = argument / 1e9
	shr edx, 29 		; Leading decimal digit, 0...4
	mov eax, edx 		; Leading digit
	mov ebx, edx 		; Initialize digit accumulator with
				; leading digit.
	imul eax, 1000000000 	; Leading digit * 1e9
	sub ecx, eax 		; Subtract (leading digit * 1e9) from argument.
	or dl, '0' 		; Convert leading digit to ASCII.
	mov [edi], dl 		; Store leading digit.
	cmp ebx, 1 		; Any nonzero digit yet?
	sbb edi, -1 		; Yes, increment ptr. No, keep old ptr.
	mov eax, ecx 		; Get reduced argument < 1e9.
	mov edx, 0abcc7712h 	; 2^28 / 1e8 * 2^30 rounded up
	mul edx 		; Divide reduced
	shr eax, 30 		; argument < 1e9 by 1e8,
	lea edx, [eax+4*edx+1] 	; converting it into 4.28 fixed-point
	mov eax, edx 		; format such that 1.0 = 2^28.
	shr eax, 28 		; Next digit
	and edx, 0fffffffh 	; Fraction part
	or ebx, eax 		; Accumulate next digit.
	or eax, '0' 		; Convert digit to ASCII.
	mov [edi], al 		; Store digit in memory.
	lea eax, [edx*4+edx] 	; 5 * fraction, new digit EAX[31-27]
	lea edx, [edx*4+edx] 	; 5 * fraction, new fraction EDX[26-0]
	cmp ebx, 1 		; Any nonzero digit yet?
	sbb edi, -1 		; Yes, increment ptr. No, keep old ptr.
	shr eax, 27 		; Next digit
	and edx, 07ffffffh 	; Fraction part
	or ebx, eax 		; Accumulate next digit.
	or eax, '0' 		; Convert digit to ASCII.
	mov [edi], al 		; Store digit in memory.
	lea eax, [edx*4+edx] 	; 5 * fraction, new digit EAX[31-26]
	lea edx, [edx*4+edx] 	; 5 * fraction, new fraction EDX[25-0]
	cmp ebx, 1 		; Any nonzero digit yet?
	sbb edi, -1 		; Yes, increment ptr. No, keep old ptr.
	shr eax, 26 		; Next digit
	and edx, 03ffffffh 	; Fraction part
	or ebx, eax 		; Accumulate next digit.
	or eax, '0' 		; Convert digit to ASCII.
	mov [edi], al 		; Store digit in memory.
	lea eax, [edx*4+edx] 	; 5 * fraction, new digit EAX[31-25]
	lea edx, [edx*4+edx] 	; 5 * fraction, new fraction EDX[24-0]
	cmp ebx, 1 		; Any nonzero digit yet?
	sbb edi, -1 		; Yes, increment ptr. No, keep old ptr.
	shr eax, 25 		; Next digit
	and edx, 01ffffffh 	; Fraction part
	or ebx, eax 		; Accumulate next digit.
	or eax, '0' 		; Convert digit to ASCII.
	mov [edi], al 		; Store digit in memory.
	lea eax, [edx*4+edx] 	; 5 * fraction, new digit EAX[31-24]
	lea edx, [edx*4+edx] 	; 5 * fraction, new fraction EDX[23-0]
	cmp ebx, 1 		; Any nonzero digit yet?
	sbb edi, -1 		; Yes, increment ptr. No, keep old ptr.
	shr eax, 24 		; Next digit
	and edx, 00ffffffh 	; Fraction part
	or ebx, eax 		; Accumulate next digit.
	or eax, '0' 		; Convert digit to ASCII.
	mov [edi], al 		; Store digit in memory.
	lea eax, [edx*4+edx] 	; 5 * fraction, new digit EAX[31-23]
	lea edx, [edx*4+edx] 	; 5 * fraction, new fraction EDX[22-0]
	cmp ebx, 1 		; Any nonzero digit yet?
	sbb edi, -1 		; Yes, increment ptr. No, keep old ptr.
	shr eax, 23 		; Next digit
	and edx, 007fffffh 	; Fraction part
	or ebx, eax 		; Accumulate next digit.
	or eax, '0' 		; Convert digit to ASCII.
	mov [edi], al 		; Store digit out to memory.
	lea eax, [edx*4+edx] 	; 5 * fraction, new digit EAX[31-22]
	lea edx, [edx*4+edx] 	; 5 * fraction, new fraction EDX[21-0]
	cmp ebx, 1 		; Any nonzero digit yet?
	sbb edi, -1 		; Yes, increment ptr. No, keep old ptr.
	shr eax, 22 		; Next digit
	and edx, 003fffffh 	; Fraction part
	OR ebx, eax 		; Accumulate next digit.
	or eax, '0' 		; Convert digit to ASCII.
	mov [edi], al 		; Store digit in memory.
	lea eax, [edx*4+edx] 	; 5 * fraction, new digit EAX[31-21]
	lea edx, [edx*4+edx] 	; 5 * fraction, new fraction EDX[20-0]
	cmp ebx, 1 		; Any nonzero digit yet?
	sbb edi, -1 		; Yes, increment ptr. No, keep old ptr.
	shr eax, 21 		; Next digit
	and edx, 001fffffh 	; Fraction part
	or ebx, eax 		; Accumulate next digit.
	or eax, '0' 		; Convert digit to ASCII.
	mov [edi], al 		; Store digit in memory.
	lea eax, [edx*4+edx] 	; 5 * fraction, new digit EAX[31-20]
	cmp ebx, 1 		; Any nonzero digit yet?
	sbb edi, -1 		; Yes, increment ptr. No, keep old ptr.
	shr eax, 20 		; Next digit
	or eax, '0' 		; Convert digit to ASCII.
	mov [edi], ax 		; Store last digit and end marker in memory.
	pop ebx 		; Restore register as per calling convention.
	pop edi 		; Restore register as per calling convention.
	ret 8 			; Pop two DWORD arguments and return.


; revITOA - unsigned 32-bit to decimal ASCII through the stack
;   In:  EAX = number, EDI = destination buffer
;   Out: EDI = address just past the last digit stored (no terminator)
;   Overwrites EAX, EBX, ECX, EDX.
;   Digits are produced lowest first and pushed; a -1 sentinel pushed up
;   front marks the bottom of the pile. They are then popped, highest first,
;   and stored. Bit 4 is set in the sentinel and clear in every digit 0..9.
align	16
revITOA:
	push	-1			; sentinel below the digit pile
	mov	ebx,0cccccccdh		; rounded-up 0.8 * 2^32: high dword of n * this = 8 * (n / 10) + fraction bits
.split:
	mov	ecx,eax			; ECX = n
	mul	ebx
	and	edx,-8			; EDX = 8 * (n / 10)
	sub	ecx,edx			; ECX = n - 8q
	mov	eax,edx
	shr	edx,2			; EDX = 2q
	sub	ecx,edx			; ECX = n - 10q = lowest digit
	push	ecx
	shr	eax,3			; EAX = q; zero ends the loop
	jnz	.split
	pop	eax			; highest digit
.emit:
	add	al,'0'
	;;;put your print/store function here. Example only shown in next line
	stosb				;example only!
	pop	eax
	test	al,10h			; sentinel reached?
	jz	.emit
	ret

; func1x64 - "cmp way": 64-bit benchmark routine reached by a far call from
;   the 32-bit harness (call $33:func1x64); returns with a far return.
;   Loads the 8 bytes at variablememory and compares them, as whole
;   registers, first with two 8-character constants, then (after shifting
;   out the lowest byte) with three 7-character constants, then with
;   4-, 3- and 2-character immediates.
;   Every match except the last ends at INT3; only a match of the low word
;   with ":)" reaches the label correct. A complete mismatch also falls into
;   INT3. With the data visible in this chunk (":) stuff") the last compare
;   matches.
;   Overwrites RAX, RCX (low word) and RDX.
;   The labels correct and the anonymous @@ are file-level.
align	16
func1x64:;!eax;!edi;-edi;/edi;+stack									{
use64
	mov	rax,[variablememory]
	mov	rdx,[variablememory]
	cmp	rax,[const12345678]
je	@f
	cmp	rax,[constabcdefgh]
je	@f
	shl	rax,8			; drop the first character's byte lane to match the "shl (8*1)" constants
	mov	cx,dx			; keep the first two characters for the final compare
	cmp	rax,[consthellooo]
je	@f
	cmp	rax,[constwooorld]
je	@f
	cmp	rax,[constthisisa]
je	@f
;	shr	rax,8
	cmp	edx,"test"
je	@f
	shl	edx,8
	cmp	edx,"bye"	shl (8*1)
je	@f
	cmp	edx,":)="	shl (8*1)
je	@f
	cmp	cx,":)"
je	correct
@@:	int3
correct:
use32
retf

align	16
; ---------------------------------------------------------------------
; func2x64 - 64-bit test routine: match the bytes at variablememory2
; against a fixed keyword list using the CMPS string instructions.
;
; Before every test rdi is reloaded with the input address and rsi with
; the keyword address. 8-, 4- and 2-character keywords use a single
; cmpsq / cmpsd / cmpsw; 7- and 3-character keywords use `repe cmpsb`
; with the length in rcx. Only equality (ZF) is examined afterwards.
; Every match except the last one (":)") lands on int3, as does the
; no-match case; only a ":)" match skips the breakpoint.
;
; Assumes the direction flag is clear so CMPS walks forward - TODO
; confirm that the caller guarantees this.
; Ends with a far return encoded under use32 - presumably entered by a
; far call from the 32-bit part of the program; confirm against the caller.
; Clobb: rdi, rsi, rcx, flags.
; ---------------------------------------------------------------------
func2x64:;!eax;!edi;-edi;/edi;+stack									{
use64
	mov	rdi,variablememory2
	mov	rsi,const212345678
	cmpsq				; 8 characters in one compare
je	@f
	mov	rdi,variablememory2
	mov	rsi,const2abcdefgh
	cmpsq
je	@f
	mov	rdi,variablememory2
	mov	rsi,const2hellooo
	mov	rcx,7			; 7 characters, byte by byte
	repe	cmpsb
je	@f
	mov	rdi,variablememory2
	mov	rsi,const2wooorld
	mov	rcx,7
	repe	cmpsb
je	@f
	mov	rdi,variablememory2
	mov	rsi,const2thisisa
	mov	rcx,7
	repe	cmpsb
je	@f
	mov	rdi,variablememory2
	mov	rsi,const2test
	cmpsd				; 4 characters in one compare
je	@f
	mov	rdi,variablememory2
	mov	rsi,const2bye
	mov	rcx,3			; 3 characters, byte by byte
	repe	cmpsb
je	@f
	mov	rdi,variablememory2
	mov	rsi,const2smilefrown
	mov	rcx,3
	repe	cmpsb
je	@f
	mov	rdi,variablememory2
	mov	rsi,const2smile
	cmpsw				; 2 characters in one compare
je	correct2
@@:	int3				; unexpected match, or no match at all
correct2:
use32
retf

align	16
; ---------------------------------------------------------------------
; func3x64 - 64-bit test routine: table-driven keyword match.
;
; Walks the Strings table, whose entries are laid out as
;     db length / db <length characters> / dq jump target
; and which ends with a length byte of 0. For each entry the first
; `length` bytes at variablememory3 are compared with the keyword; on a
; match control jumps to the entry's target, otherwise the walk moves on
; to the next entry. Reaching the terminator falls into `wrong` (int3).
;
; After `repe cmpsb`, rdi points just past the last compared byte and rcx
; holds the count of bytes not compared, so rdi+rcx is the address of the
; entry's jump target whether or not the compare stopped early. The
; `mov` and `lea` that follow leave the flags untouched, so `jne` still
; sees the result of the compare.
;
; Assumes the direction flag is clear - TODO confirm with the caller.
; Ends with a far return encoded under use32 - presumably entered by a
; far call from the 32-bit part of the program; confirm against the caller.
; Clobb: rax, rbx, rcx, rsi, rdi, flags (rbx is NOT preserved).
; ---------------------------------------------------------------------
func3x64:
use64
mov	rbx,variablememory3	;rbx = input string, reloaded into rsi for every entry
mov	rdi,Strings		;rdi = current position in the keyword table
xor	rcx,rcx			;Clear the upper bits; only cl is loaded per entry
@@:	mov	cl,[rdi]	;Get the byte before the string representing its length
	test	cl,cl		;See if it's the end of the list
	mov	rsi,rbx		;Restore the compared string's offset (so we can compare again)
jz	@f
	inc	rdi		;Increase by 1 to the actual argument string (i.e skip the byte length prefix)
	repe	cmpsb		;Compare (this is the small way, for large strings, divide the string into chunks of 32 or 64-bits)
	mov	rax,[rdi+rcx]	;the JumpLabel (64-bit address).
	lea	rdi,[rdi+rcx+8]	;Skip the remaining chars (if any) + the JumpLabel (64-bit address).
jne	@b			;Mismatch: try the next table entry
jmp	rax			;Match: go to this entry's target
@@:
wrong:	int3			;No entry matched, or a matching entry points here
correct3:
use32
retf
; Data for func1x64. Every item is one qword so it can be compared
; against a 64-bit register in a single instruction.
align 64
variablememory	dq ":) stuff"			; the input under test
const12345678	dq "12345678"
constabcdefgh	dq "abcdefgh"
; 7-character keywords are stored shifted left by one byte so that they
; line up with the input after func1x64 executes `shl rax,8`.
consthellooo	dq "hellooo"	shl (8*1)
constwooorld	dq "wooorld"	shl (8*1)
constthisisa	dq "thisisa"	shl (8*1)
; The shorter keywords are not stored here; func1x64 compares against
; immediates instead. The definitions below are left disabled.
;consttest	dq "test"	shl (8*4)
;constbye	dq "bye"	shl (8*5)
;constsmilefrown	dq ":)="	shl (8*5)
;constsmile	dq ":)"		shl (8*6)

; Reserves 4096 dwords (16 KiB). NOTE(review): presumably padding that
; keeps this data apart from what follows - confirm the intent.
rd 4096

; Data for func2x64: the input followed by the keywords as plain byte
; strings with no padding or terminators. The lengths are not stored
; here; func2x64 hard-codes them (operand size of CMPS or the rcx count).
align 64
variablememory2		db ":) stuff"	; the input under test
const212345678		db "12345678"
const2abcdefgh		db "abcdefgh"
const2hellooo		db "hellooo"
const2wooorld		db "wooorld"
const2thisisa		db "thisisa"
const2test		db "test"
const2bye		db "bye"
const2smilefrown	db ":)="
const2smile		db ":)"

; Reserves 4096 dwords (16 KiB). NOTE(review): presumably padding that
; keeps this data apart from what follows - confirm the intent.
rd 4096

; Data for func3x64.
; Strings is a packed table of entries in the form
;     db length / db <length characters> / dq jump target
; terminated by a single zero length byte. func3x64 jumps to the target
; of the first entry whose characters equal the start of variablememory3.
; Only the last entry (":)") leads to correct3; all others lead to wrong.
align 64
variablememory3		db ":) stuff"	; the input under test
Strings:		db 8
			db "12345678"
			dq wrong
			db 8
			db "abcdefgh"
			dq wrong
			db 7
			db "hellooo"
			dq wrong
			db 7
			db "wooorld"
			dq wrong
			db 7
			db "thisisa"
			dq wrong
			db 4
			db "test"
			dq wrong
			db 3
			db "bye"
			dq wrong
			db 3
			db ":)="
			dq wrong
			db 2
			db ":)"
			dq correct3
db 0					; length 0 = end of table

align	16
; emptyFunc - returns immediately with a near return; touches no
; registers or memory. NOTE(review): presumably the baseline for
; measuring bare call overhead - confirm against the test harness.
emptyFunc:
ret

align	16
; ---------------------------------------------------------------------
; qw2Str - convert the unsigned 32-bit value in EAX to a zero-terminated
; decimal string.
; Truncated to the lower-dword path only. Original from
; http://www.masm32.com/board/index.php?topic=9857.15
;
; In:    eax = value to convert
;        edi = output buffer (needs up to 11 bytes: 10 digits + NUL)
; Out:   eax = number of digits written (terminator not counted)
;        buffer holds the digits, most significant first, then a 0 byte
; Clobb: ebx, ecx, edx, edi, ebp, flags (ebx/edi/ebp are NOT preserved;
;        edi is left pointing into the middle of the string).
;
; The digits are generated least-significant first and the buffer is
; then reversed in place.
; ---------------------------------------------------------------------
qw2Str:
	mov ebp,edi		; ebp = write cursor
	push edi		; remember the buffer start for the reversal
	mov ebx, eax		; ebx = current value x (eax is consumed by mul)
	mov edi, 0CCCCCCCDh	; ceil(2^35 / 10), reciprocal of 10

	; Lower Dword conversion loop
  @@U32Cvt:	mul edi
	shr edx, 3		; edx = x / 10
	lea ecx, [edx*4+edx]	; ecx=edx*5
;	if 1
		neg ecx		; 3 cycles faster
		lea ebx, [ebx+2*ecx+"0"]	; ebx = x - 10*(x/10) + '0' = ASCII digit
;	else
;		lea ecx, [ecx+ecx-'0']	; ecx=edx*10-48
;		sub ebx, ecx
;	endif
	mov eax, edx		; sub ebx, mov eax 2 cycles faster than inverse sequence
	mov [ebp], bl		; store the digit
	mov ebx, edx		; x = quotient for the next pass
	inc ebp			; add ebp, 1 here a bit slower and 2 bytes more
	test eax, eax
	jnz @@U32Cvt

	; Calculate output string length and reverse it
	pop edi 			; ex mov edi, [esp+3*4][4*4]
	mov [ebp], al		; eax is 0 here: writes the terminator
	mov eax, ebp
	sub eax, edi		; eax = digit count (return value)

	; reverse buffer digits (ca. 30 cycles for 20 digits)
  @@:	sub ebp, 1		; dec ebp ca. 10 cycles slower here!
	mov bl, [edi]
	mov cl, [ebp]
	mov [ebp], bl
	mov [edi], cl
	add edi, 1			; inc edi ca. 10 cycles slower here!
	cmp edi, ebp
	jb @B

	; The stack is balanced here (one push, one pop). Return explicitly:
	; without this the code ran on through the alignment padding into
	; the far return of empty64bitFunc, which pops a CS selector in
	; addition to the return address.
	ret

align	16
; empty64bitFunc - returns immediately with a far return; touches no
; registers or memory. The retf is assembled under use32, the same way
; the func?x64 routines end. NOTE(review): presumably the baseline for
; measuring the cost of the far call into 64-bit code - confirm against
; the test harness.
empty64bitFunc:
use32
retf


section '.data' data readable writeable import
        library kernel32, 'kernel32.dll'
        import kernel32, \
          ExitProcess,       'ExitProcess',       \
          Sleep,       'Sleep',       \
          GetCurrentProcess, 'GetCurrentProcess', \
          SetPriorityClass,  'SetPriorityClass', \
          FlushInstructionCache,  'FlushInstructionCache', \
          GetStdHandle,  'GetStdHandle', \
          WriteFile,  'WriteFile', \
	QueryPerformanceCounter, 'QueryPerformanceCounter', \
	QueryPerformanceFrequency, 'QueryPerformanceFrequency'

;The character look up table. Make sure it's aligned suitably.
;100 word entries, one per value 0..99: the word at chartab + 2*n holds
;the two ASCII digits of n (fasm stores the first character of a quoted
;string at the lowest address, so the tens digit comes first in memory).
;NOTE(review): the code that indexes this table is outside this part of
;the file - confirm the expected byte order there.
align 16
chartab	dw	"00","01","02","03","04","05","06","07","08","09",\
		"10","11","12","13","14","15","16","17","18","19",\
		"20","21","22","23","24","25","26","27","28","29",\
		"30","31","32","33","34","35","36","37","38","39",\
		"40","41","42","43","44","45","46","47","48","49",\
		"50","51","52","53","54","55","56","57","58","59",\
		"60","61","62","63","64","65","66","67","68","69",\
		"70","71","72","73","74","75","76","77","78","79",\
		"80","81","82","83","84","85","86","87","88","89",\
		"90","91","92","93","94","95","96","97","98","99"

create_data

; Text templates for reporting results. NOTE(review): the code that
; fills and prints them is outside this part of the file; presumably a
; number is written right-aligned into the blank field that precedes
; .begin - confirm.
; clocks.add reserves no storage of its own: it has the same address as
; ticks, so anything written at clocks.add lands in the ticks buffer.
clocks:	db "         "
.begin:	db " clocks "
.add:

ticks:	db "                         "
.begin:	db "ms "
.add:	rb 256

; 10000-byte scratch buffer (uninitialized).
inputbuffer	rb 10000

; Working storage for the cycle-counter macros (counter_begin stores its
; loopcount argument in __counter__loop__count__).
ALIGN 8             ;; Optimal alignment for QWORD
__counter__qword__count__  rq 1
__counter__loop__count__   rd 1
__counter__loop__counter__ rd 1

; Working storage for the timer macros. NOTE(review): going by the names
; these hold QueryPerformanceFrequency / QueryPerformanceCounter results
; - confirm against the macro definitions.
__timer__pc__frequency__ rq 1
__timer__pc__count__	 rq 1
__timer__loop__counter__ rd 1
__timer__dw_count__      rd 1

; NOTE(review): presumably receives WriteFile's bytes-written output -
; confirm at the call site.
byteswritten	rd 1


