format PE  console 4.0
entry start

include '%fasmpath%\include\win32a.inc'

section '.code' code readable executable
;Syntax: flat assembler (fasm), 32-bit Windows PE console program; API calls use stdcall.
;The section is deliberately NOT writeable: every variable this file stores to
;(OutHandle, nwriten, result, buff_ptr, buffer) lives in the '.idata' section.
;
;Origin of the reciprocal constants used by bin2ascii:
;a7c5.ac471b478423 hex is 42949.67296 decimal (2^32/1e5)
;68db8.bac710cb296 hex is 429496.7296 decimal (2^32/1e4)

;First and last (inclusive) number printed by the demo loop at start.
n_from	equ	1234567890
n_to	equ	1234567899
;magic1 = the 8 hex digits a7c5ac47 of 1/100000 shown above, truncated
;(the digits that follow, 1b47..., are dropped here and compensated in bin2ascii).
magic1	equ	0a7c5ac47h
;magic2 = the hex digits 68db8bac.71... of 1/10000 shown above, rounded UP to 68db8bad.
magic2	equ	068db8badh

;Bytes sent per number: the 10 digits bin2ascii stores at result plus the CR,LF that follow it.
out_line_len = 12

start:
;Program entry point.
;Prints every number from n_from to n_to (inclusive, unsigned), one per line,
;as 10 zero-padded decimal digits followed by CR,LF, on standard output.
;At least one number (n_from) is always printed, even if n_from > n_to.
;Returns to the caller of the entry point with eax = 0 (exit code 0).
	invoke	GetStdHandle, STD_OUTPUT_HANDLE
	mov	[OutHandle],eax
	mov	esi,n_from			;esi = number currently being printed
next_num:
	mov	edi,result
	call	bin2ascii			;stores the digits at result, leaves esi untouched
	mov	ecx,out_line_len
	push	esi				;buffered_write takes its source pointer in esi
	mov	esi,result
	call	buffered_write
	pop	esi
;Compare BEFORE incrementing so that n_to = 0FFFFFFFFh cannot wrap esi to 0
;and loop forever; lea advances esi without disturbing the flags set by cmp.
	cmp	esi,n_to
	lea	esi,[esi+1]
	jb	next_num			;unsigned: continue while the number just printed < n_to
	call	buffer_flush
;buffer_flush leaves the WriteFile result in eax; clear it so the value
;returned from the entry point (used as the exit code) reports success.
	xor	eax,eax
	ret

bin2ascii:
;Convert the unsigned 32-bit number in esi into exactly 10 ASCII decimal digits
;(zero padded on the left), without using any div instruction.
;On entry: esi - number to be converted, edi - destination of ASCII result (10 bytes)
;On exit:  digits stored at [edi]..[edi+9] (entry value of edi); no terminator is written.
;          edi is left pointing at the last digit stored (entry edi + 9).
;          esi is unchanged; eax, ecx, edx and the flags are clobbered.
;          One dword of stack is used temporarily (push/pop balanced).
;
;This algorithm generates decimal digits by successive multiplications of fractions by 10.
;The objective is to eliminate slow div instructions. Let's see how:
;Starting from a 32 bit binary number 0XXXXXXXXh, we know that 0XXXXXXXXh <= 4294967295d
;To keep precision, it is convenient to separate this number into high and low order
;decimal numbers, each with 5 digits, and work separately with both parts. This also
;helps optimizing for modern processors, but not in this didactic example.
;For example, 4294967295d would be separated in 42949 (high) and 67295 (low).
;This could be done with a div instruction, dividing by 100000d, but to avoid the div,
;we multiply by the reciprocal, i.e. 1/100000, in hex: 0.0000a7c5ac471b478423
;This number has to be fitted in integer registers to use the mul instruction.
;Thus: edx:eax = 0000.0000:a7c5ac47, after mul by 0XXXXXXXXh we obtain:
;edx:eax = qqqq.rrrr:rrrrrrrr (q - quotient, r - remainder, fraction form)
	mov	eax,magic1
	mul	esi

;Turns out this does not have enough precision, because many digits of the fraction
;0.0000a7c5ac471b478423 were ignored. The original author reports having verified that
;it is enough to add just one more nibble of precision, rounding to 0.0000a7c5ac472
;That extra nibble contributes number * 0.2h at the scale of eax:
;multiply by 0.20000000h = divide by 8 = shr by 3
	mov	ecx,esi
	shr	ecx,3

;sum the two partial terms, obtaining the final edx:eax = qqqq.rrrr:rrrrrrrr
;(adc propagates the carry out of the low dword into the high dword)
	add	eax,ecx
	adc	edx,0

;now we separate the quotient from the remainder: 
;qqqq.rrrr:rrrrrrrr -> qqqq.0000h, 0.rrrrrrrh (have to keep track of the hexadecimal point)
;After the shrd, eax holds one quotient nibble followed by 7 fraction nibbles.
	shrd	eax,edx,20		;separate remainder
	and	edx,0FFFF0000h		;mask quotient
	inc	eax			;we lose 4 significant remainder nibbles, round up
	and	eax,0FFFFFFFh		;remove quotient nibble from remainder.
	push	eax			;store remainder (low 5 digits) until the high 5 are done

;Now we can process the quotient to obtain the five high decimal digits.
;We have qqqq.0000h in edx, and we know that qqqqh <= 42949d, so if we divide by 10000d
;using the multiply by reciprocal method, we obtain q.rrrrrrrh in edx. To do this, we
;multiply by 0.000068db8badh
	mov	eax,magic2
	mul	edx
	inc	edx			;round up

;We already have the first decimal digit isolated in the high nibble of edx, so now it can
;be stored, and masked out to keep the remainder.
	mov	eax,edx
	shr	edx,28
	and	eax,0FFFFFFFh
	add	dl,'0'
	mov	[edi],dl

;The rest of the digits are extracted from the remainder by successive multiplies by 10d,
;masking the remainder for the next step. The remainder is a 28-bit fraction, so the
;product by 10 still fits in 32 bits and the new digit appears in the top nibble.
	mov	ecx,4			;4 more digits of the high group
prox_dig_hi:
	lea	eax,[4*eax+eax]		;multiply by 5
	inc	edi
	add	eax,eax			;multiply by 2
	mov	edx,eax
	shr	edx,28
	and	eax,0FFFFFFFh
	add	dl,'0'
	dec	ecx
	mov	[edi],dl		;mov does not alter the flags set by dec
	jnz	prox_dig_hi

;Recover the lower digits and repeat the same process.
	pop	eax
	mov	ecx,5			;all 5 digits of the low group
prox_dig_lo:
	lea	eax,[4*eax+eax]		;multiply by 5
	inc	edi
	add	eax,eax			;multiply by 2
	mov	edx,eax
	shr	edx,28
	and	eax,0FFFFFFFh
	add	dl,'0'
	dec	ecx
	mov	[edi],dl		;mov does not alter the flags set by dec
	jnz	prox_dig_lo
	ret

buffered_write:
;Append ecx bytes from [esi] to the output buffer, flushing it first when the
;new data would not fit. A chunk larger than the whole buffer is written
;straight to the output handle instead of being copied past buff_end.
;On entry: esi - source bytes, ecx - number of bytes
;On exit:  esi advanced past the source bytes, ecx = 0
;          eax, edx, edi and the flags are clobbered.
	mov	edi,[buff_ptr]			;edi = current fill level (byte offset into buffer)
	lea	eax,[edi+ecx]			;eax = fill level after appending
	cmp	eax,buff_end-buffer		;capacity derived from the buffer definition
	ja	buff_overflow
	mov	[buff_ptr],eax
	add	edi,buffer			;offset -> address
	rep	movsb
	ret
	buff_overflow:
	push	ecx				;the WriteFile call inside buffer_flush may clobber ecx
	call	buffer_flush
	pop	ecx
	cmp	ecx,buff_end-buffer
	ja	buff_direct			;does not fit even in an empty buffer
	mov	edi,buffer
	mov	[buff_ptr],ecx			;buffer now holds exactly this chunk
	rep	movsb
	ret
	buff_direct:
;Oversized chunk: the buffer is empty after the flush, so output order is kept
;by writing the chunk directly. esi is preserved by the stdcall callee.
	push	ecx
	invoke	WriteFile, [OutHandle], esi, ecx, nwriten, 0
	pop	ecx
	add	esi,ecx				;leave esi/ecx as rep movsb would have
	xor	ecx,ecx
	ret

buffer_flush:
;Send the bytes accumulated in buffer (buff_ptr of them) to the handle stored in
;OutHandle with a single WriteFile call, then mark the buffer as empty.
;Does nothing when the buffer is already empty.
;The WriteFile result is not checked. When the call is made, eax holds its
;return value and ecx/edx must be considered clobbered (stdcall scratch registers).
	cmp	dword [buff_ptr],0
	je	skip_write			;nothing pending
;stdcall: arguments pushed right to left, callee removes them
	push	0				;lpOverlapped
	push	nwriten				;lpNumberOfBytesWritten
	push	dword [buff_ptr]		;nNumberOfBytesToWrite
	push	buffer				;lpBuffer
	push	dword [OutHandle]		;hFile
	call	[WriteFile]
	mov	dword [buff_ptr],0		;buffer is empty again
	skip_write:
	ret

;Import table plus all mutable program data. The section is writeable, which is
;what allows the variables below to share it with the import directory.
section '.idata' import data readable writeable

library kernel,'KERNEL32.DLL'

;Only the two KERNEL32 functions called by this file are imported.
import kernel,\
	 GetStdHandle,'GetStdHandle',\
	 WriteFile,'WriteFile'
align 4
nwriten		dd 0		;passed to WriteFile as lpNumberOfBytesWritten; never read back by this file
OutHandle	dd 0		;standard output handle, stored by start
;Output line: the digit bytes filled in by bin2ascii, immediately followed by CR,LF.
;The layout is order-dependent: start sends 12 bytes beginning at result, so the
;CR,LF must directly follow the 10 digit bytes. The trailing '$' lies beyond those 12 bytes.
result		rb 10
		db 13,10,'$'

align 4
buff_ptr	dd 0		;number of bytes currently held in buffer (offset of the next free byte)
align 16
buffer		rb 2048		;output staging area filled by buffered_write, emptied by buffer_flush
buff_end:			;one past the last byte of buffer; must stay directly after it