; Uses all seven 16-bit general-purpose registers (AX, BX, CX, DX, SI, DI, BP).
; Benchmark driver: calls mult on the fixed operands mul1 and mul2 once per
; pass, writing the product to mulF, until the 32-bit value at `counter`
; has been decremented to zero.
; NOTE(review): org 100h plus a bare `ret` to finish looks like a DOS .COM
; program - confirm against the build settings.
        org     100h

L0:
        ; mult overwrites BX, so its pointer arguments are set up on every pass.
        mov     di, mulF                ; DI -> product buffer
        mov     si, mul2                ; SI -> second operand
        mov     bx, mul1                ; BX -> first operand
        call    mult
        ; int3                          ; disabled breakpoint after each multiply
        dec     dword [counter]         ; one pass done; ZF set when it hits zero
        jnz     L0
        ret
; Static data for the benchmark. The explicit padding pins the layout:
; `counter` occupies 11Ch..11Fh and the operands begin at 120h.
; NOTE(review): presumably done so the operands are 16-byte aligned - confirm.
                rb      11Ch-$          ; pad up to offset 11Ch
counter:        dd      10000000        ; passes remaining in the driver loop
                rb      120h-$          ; pad up to offset 120h (may be empty)
; Both operands are eight 16-bit words with every bit set.
mul1:           dw      8 dup (0FFFFh)
mul2:           dw      8 dup (0FFFFh)
; 32 bytes that mult fills with the product.
mulF:           rw      16
mult:
; Unsigned 128-bit x 128-bit -> 256-bit multiply built from 16-bit MULs.
; Operands and result are little-endian arrays of 16-bit words (word 0 at
; the lowest address). The code is fully unrolled at assembly time.
;
; Input  SI -> second operand, 8 words (read throughout the routine)
; Input  BX -> first operand, 8 words (read only by the initial copy)
; Output DI -> 32-byte buffer that receives the 16-word product.
;              DI+16..DI+31 is first used as a scratch copy of the first
;              operand and is overwritten by result words 8..15 later on.
; Destroys AX, BX, CX, DX, BP and the flags. SI and DI are only read.
; BX, CX and BP take turns as the three words of the column accumulator.
;
; Copy the first operand into the upper half of the output buffer.
; i runs from -7 to 0, so words 7 down to 0 are copied in that order and the
; last iteration leaves operand word 0 in AX for the MUL that follows.
rept 8 i:-7 {
mov AX, [BX-2*i]
mov [DI+16-2*i], AX
}
; Column 0: a single product, first word 0 (already in AX) times second word 0.
mul word [SI]
; The low half is result word 0; the high half seeds the next column.
mov [DI], AX
mov BX, DX
xor CX, CX
; Z1:Z2:Z3 name the low, middle and high words of the running column sum.
; They are preprocessor symbols, so rotating them below costs no instructions.
Z1 equ BX
Z2 equ CX
Z3 equ BP
; Columns 1..14 of the schoolbook product. Column i adds up every
; first[j] * second[i-j] whose two indices both lie in 0..7.
rept 14 i: 1 {
; The high accumulator word starts each column at zero. Column 14 is the
; last one and holds a single product, so it needs no third word.
if i < 14
xor Z3, Z3
end if
; j walks 0..i; the IF below keeps only the index pairs that exist.
rept i+1 j: 0 \{
if i-j>=0 & i-j<8 & j>=0 & j<8
; Word j of the first operand comes from the copy at DI+16.
mov AX, [DI+16+2*j]
mul word [SI+2*(i-j)]
; Add the 32-bit product DX:AX into the accumulator. The ADD/ADC chain
; must stay contiguous so the carry flag propagates between the words.
add Z1, AX
adc Z2, DX
if i < 14
adc Z3, 0
end if
end if
\}
; The low accumulator word is now final: store it as result word i.
; For i >= 8 this lands on copied operand word i-8, which is no longer
; needed because column i and all later columns only read words i-7 and up.
mov [DI+2*i], Z1
; Rotate the names: middle becomes low, high becomes middle, and the
; register just stored becomes the new high word of the next column.
Z4 equ Z1
Z1 equ Z2
Z2 equ Z3
Z3 equ Z4
}
; What remains in the low accumulator word is result word 15.
mov [DI+30], Z1
ret
; Benchmark note: this is slower than a similar design that handles 256 bits with 32-bit operations, presumably because the CPU does not optimize 16-bit execution.