Jok3r
Joined: 25 Jan 2019
Posts: 4
|
These are some math functions in FASM.
At the head of each function, a comment states where its inputs and outputs live, in this format:
(input1 type, input2 type ...) -> (output1 type, output2 type ...)
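None of these functions keep data on the stack; everything happens in registers. The only stack traffic is the return address pushed by call, because several functions call each other (and modf and pow call themselves). They may use any register during the calculation, including r12, r13 and xmm6-xmm11. If any register contains data that is important to you, save it before calling any of these functions.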
Note that these functions do not follow any calling convention or ABI, so to use them in an environment that expects one you may need to add register saving/restoring and stack handling around them.
; function list
; modf
; ldexp
; frexp
; expmulti
; exp
; tanh
; trig_reduce
; cos
; sin
; log
; max
; min
; pow
; sigmoid
; relu
; (xmm0 double) -> (xmm0 double, xmm1 double)
; modf: split x (xmm0) into an integer part, returned in xmm0, and a
; fractional part, returned in xmm1.
;   x < 0       -> modf(-x) with both results negated
;   0 <= x < 1  -> integer part 0 (x itself when x == 0), fraction x
;   x >= 1, NaN -> integer part = x with its fraction bits cleared,
;                  fraction = x - integer part
; Registers written besides the results: xmm2, xmm3, rax, rcx, rdx, r8.
; The x < 0 path calls modf itself, so a return address goes on the stack.
modf:
; keep x in xmm3 (for the bit path) and in xmm1 (future fraction)
movsd xmm3,xmm0
movsd xmm1,xmm0
movsd xmm0,[c7]
ucomisd xmm0,xmm1
; 1.0 <= x, or unordered (NaN): go clear the fraction bits
jbe .l2
xorps xmm0,xmm0
ucomisd xmm0,xmm1
; 0 > x: negative input
ja .l1
ucomisd xmm1,xmm0
; x != 0 here means 0 < x < 1: return xmm0 = 0, xmm1 = x
jne .l0
jp .l0
; x == 0: both results are x
movsd xmm0,xmm1
.l0:
ret
.l1:
; flip the sign bit (c26 = 0x8000000000000000), recurse on -x,
; then flip the sign of both results
movsd xmm0,[c26]
pxor xmm1,xmm0
movsd xmm0,xmm1
call modf
movsd xmm2,[c26]
pxor xmm0,xmm2
pxor xmm1,xmm2
ret
.l2:
; rax = biased exponent field, rdx = raw bits of x
movq rax,xmm3
mov rdx,rax
shr rax,0x34
and rax,0x7ff
; r8 = unbiased exponent; at 52 or above (unsigned) no fraction bits exist
lea r8,[rax-0x3ff]
cmp r8,0x34
jae .l4
; rcx = 0x433 - exponent field = number of fraction bits to clear
lea rcx,[rax-0x433]
neg rcx
; rax = all ones when rcx < 64, else 0 (guards the shift below)
cmp rcx,0x40
sbb rax,rax
mov r8,0x1
shl r8,cl
and r8,rax
; rax = x bits with the low rcx bits cleared
lea rax,[r8-0x1]
not rax
and rax,rdx
.l3:
; xmm0 = integer part, xmm1 = x - integer part
movq xmm0,rax
subsd xmm1,xmm0
ret
.l4:
; x has no fraction bits: the integer part is x itself
mov rax,rdx
jmp .l3
; (xmm0 double, rdi int64) -> (xmm0 double)
; ldexp: returns x * 2^n for x in xmm0 and n in rdi.
;   x == 0, x = +/-Inf, x = NaN -> x returned unchanged
;   resulting exponent < -1075  -> zero carrying the sign of x
;   resulting exponent > 1023   -> -Inf (c4) or +Inf (c5) by the sign of x
; Registers written besides xmm0: xmm1, xmm2, xmm3, rax, rcx, rdx, r8.
ldexp:
xorps xmm1,xmm1
ucomisd xmm0,xmm1
jne .l0
; x == 0 (ordered and equal): return it untouched
jnp .l12
.l0:
; al = 1 when x > c1 (largest finite double) ...
movsd xmm2,[c1]
ucomisd xmm0,xmm2
jbe .l11
mov eax,0x1
.l1:
test al,al
je .l3
.l2:
; infinities and NaN are returned as they came in
ret
.l3:
; unordered self-compare: x is NaN
ucomisd xmm0,xmm0
jne .l2
jp .l2
; xmm2 = |x| (sign bit cleared)
movq rax,xmm0
btr rax,0x3f
movq xmm2,rax
; |x| >= c2 (smallest normal double): no pre-scaling needed
movsd xmm3,[c2]
ucomisd xmm3,xmm2
jbe .l10
; subnormal x: xmm2 = x * 2^52 (c3) and start the exponent at -52
movsd xmm2,[c3]
mulsd xmm2,xmm0
mov rax,0xffffffffffffffcc
.l4:
; rax = n + pre-scaling adjustment
mov rcx,rdi
add rax,rcx
; rcx = exponent field of the (scaled) value, rdx = its raw bits
movq rcx,xmm2
mov rdx,rcx
shr rcx,0x34
and rcx,0x7ff
; r8 = unbiased exponent of the result
lea r8,[rcx+rax]
lea r8,[r8-0x3ff]
; below -1075: underflow to a signed zero
cmp r8,0xfffffffffffffbcd
jl .l9
cmp r8,0x3ff
jle .l6
; above 1023: overflow; xmm1 is still 0, so 0 > x selects -Inf
ucomisd xmm1,xmm2
jbe .l5
movsd xmm0,[c4]
ret
.l5:
movsd xmm0,[c5]
ret
.l6:
; -1022 or above: normal result, handled at .l8
cmp r8,0xfffffffffffffc02
jge .l8
; subnormal result: build it 53 exponent steps higher (0x3ff-0x3ca = 53)
; and multiply by c6 = 2^-53 at the end
lea rax,[rcx+rax]
lea rax,[rax-0x3ca]
movsd xmm0,[c6]
.l7:
; keep sign and mantissa of rdx, insert exponent field rax+0x3ff,
; then apply the final multiplier held in xmm0
mov rcx,0x800fffffffffffff
and rdx,rcx
add rax,0x3ff
shl rax,0x34
or rdx,rax
movq xmm1,rdx
mulsd xmm0,xmm1
ret
.l8:
; normal result: exponent r8, multiplier 1.0 (c7)
mov rax,r8
movsd xmm0,[c7]
jmp .l7
.l9:
; isolate the sign bit: returns +0 or -0
shr rdx,0x3f
shl rdx,0x3f
movq xmm0,rdx
ret
.l10:
; normal input: use x as is, adjustment 0
movups xmm2,xmm0
xor eax,eax
jmp .l4
.l11:
; ... or when c8 (most negative finite double) > x
movsd xmm2,[c8]
ucomisd xmm2,xmm0
seta al
jmp .l1
.l12:
ret
; (xmm0 double) -> (xmm0 double, rax int64)
; frexp: break x (xmm0) into a fraction and a power of two.
;   xmm0 <- x with its exponent field replaced by 0x3fe (sign and mantissa kept)
;   rax  <- exponent such that x = fraction * 2^rax
;   x == 0, +/-Inf or NaN -> x unchanged, rax = 0
; Registers written besides the results: xmm1, xmm2, xmm3, rcx, rdx, r8.
frexp:
movsd xmm3,xmm0
xorps xmm1,xmm1
ucomisd xmm0,xmm1
jne .l0
; x == 0: return (x, 0)
jnp .l7
.l0:
; al = 1 when x > c1 (largest finite double) ...
movsd xmm1,[c1]
ucomisd xmm0,xmm1
jbe .l6
mov eax,0x1
.l1:
test al,al
je .l3
.l2:
; infinities and NaN: return (x, 0)
xor rax,rax
ret
.l3:
; unordered self-compare: x is NaN
ucomisd xmm0,xmm0
jne .l2
jp .l2
; xmm1 = |x|
movq rax,xmm3
btr rax,0x3f
movq xmm1,rax
; |x| >= c2 (smallest normal double): no pre-scaling
movsd xmm2,[c2]
ucomisd xmm2,xmm1
jbe .l5
; subnormal x: scale by 2^52 (c3) and start the exponent at -52
movsd xmm1,[c3]
mulsd xmm0,xmm1
mov rax,0xffffffffffffffcc
.l4:
; rdx = sign and mantissa of x combined with the exponent field 0x3fe
movq rcx,xmm0
mov rdx,0x800fffffffffffff
and rdx,rcx
mov r8,0x3fe0000000000000
or rdx,r8
movq xmm0,rdx
; rax = adjustment + exponent field - 0x3fe
shr rcx,0x34
and rcx,0x7ff
lea rax,[rax+rcx*1]
lea rax,[rax-0x3fe]
ret
.l5:
; normal input: adjustment 0
xor eax,eax
jmp .l4
.l6:
; ... or when c8 (most negative finite double) > x
movsd xmm1,[c8]
ucomisd xmm1,xmm0
seta al
jmp .l1
.l7:
xor rax,rax
ret
; (xmm0 double, xmm1 double, rdi int64) -> (xmm0 double)
; expmulti: with hi = xmm0, lo = xmm1 and k = rdi, evaluates
;     r = hi - lo
;     t = r * r
;     c = r - t*(c13 + t*(c12 + t*(c11 + t*(c10 + t*c9))))
;     y = 1 - ((lo - (r*c)/(2 - c)) - hi)
; and hands y and k to ldexp, whose result is returned in xmm0.
; rdi is not modified before ldexp is entered.
expmulti:
        movapd  xmm2, xmm0              ; xmm2 = hi
        subsd   xmm0, xmm1
        movapd  xmm3, xmm0              ; xmm3 = r
        mulsd   xmm0, xmm0              ; xmm0 = t

        ; Horner evaluation in a single accumulator (xmm4)
        movsd   xmm4, [c9]
        mulsd   xmm4, xmm0
        addsd   xmm4, [c10]
        mulsd   xmm4, xmm0
        addsd   xmm4, [c11]
        mulsd   xmm4, xmm0
        addsd   xmm4, [c12]
        mulsd   xmm4, xmm0
        addsd   xmm4, [c13]
        mulsd   xmm4, xmm0              ; xmm4 = t * polynomial

        movapd  xmm5, xmm3
        subsd   xmm5, xmm4              ; xmm5 = c
        mulsd   xmm3, xmm5              ; xmm3 = r * c
        movsd   xmm4, [c14]
        subsd   xmm4, xmm5              ; xmm4 = 2 - c
        divsd   xmm3, xmm4              ; xmm3 = r*c / (2 - c)

        subsd   xmm1, xmm3              ; lo - quotient
        subsd   xmm1, xmm2              ; ... - hi
        movsd   xmm0, [c7]
        subsd   xmm0, xmm1              ; y = 1 - (...)

        jmp     ldexp                   ; tail call: ldexp returns to our caller
; (xmm0 double) -> (xmm0 double)
; exp: e raised to the power x (xmm0).
;   NaN, or x > c1 (largest finite, i.e. +Inf) -> x returned unchanged
;   x < c8 (most negative finite, i.e. -Inf)   -> 0
;   x > c15 (about 709.78)                     -> +Inf (c5)
;   x < c16 (about -745.13)                    -> 0
;   c17 < x < c18 (|x| < 2^-28)                -> 1 + x
; Otherwise k = trunc(x*c19 - 0.5) for x < 0 or trunc(x*c19 + 0.5) for x > 0,
; and the result is expmulti(x - k*c21, k*c22, k).
; Registers written here: xmm1, xmm2, rax, rdi; expmulti and ldexp write more.
exp:
ucomisd xmm0,xmm0
jne .l0
; ordered self-compare: x is a number, carry on at .l1
jnp .l1
.l0:
; NaN and +Inf leave through here with xmm0 untouched
ret
.l1:
movsd xmm1,[c1]
ucomisd xmm0,xmm1
ja .l0
; c8 > x: x is -Inf
movsd xmm1,[c8]
ucomisd xmm1,xmm0
ja .l9
; x above the overflow threshold
movsd xmm1,[c15]
ucomisd xmm0,xmm1
ja .l8
; x below the underflow threshold
movsd xmm1,[c16]
ucomisd xmm1,xmm0
ja .l7
; -2^-28 < x < 2^-28: return 1 + x
movsd xmm1,[c17]
ucomisd xmm0,xmm1
jbe .l2
movsd xmm1,[c18]
ucomisd xmm1,xmm0
ja .l6
.l2:
xorps xmm1,xmm1
ucomisd xmm1,xmm0
; x >= 0: handled at .l4
jbe .l4
; x < 0: rax = trunc(c19*x - 0.5)
movsd xmm1,[c19]
mulsd xmm1,xmm0
movsd xmm2,[c20]
subsd xmm1,xmm2
cvttsd2si rax,xmm1
.l3:
; xmm1 = k as double; xmm0 = x - k*c21; xmm1 = k*c22; rdi = k
xorps xmm1,xmm1
cvtsi2sd xmm1,rax
movsd xmm2,[c21]
mulsd xmm2,xmm1
subsd xmm0,xmm2
mulsd xmm1,[c22]
mov rdi,rax
call expmulti
ret
.l4:
ucomisd xmm0,xmm1
jbe .l5
; x > 0: rax = trunc(c19*x + 0.5)
movsd xmm1,[c19]
mulsd xmm1,xmm0
movsd xmm2,[c20]
addsd xmm2,xmm1
cvttsd2si rax,xmm2
jmp .l3
.l5:
; x == 0: k = 0
xor eax,eax
jmp .l3
.l6:
movsd xmm1,[c7]
addsd xmm1,xmm0
movsd xmm0,xmm1
ret
.l7:
xorps xmm0,xmm0
ret
.l8:
movsd xmm0,[c5]
ret
.l9:
xorps xmm0,xmm0
ret
; (xmm0 double) -> (xmm0 double)
; tanh: hyperbolic tangent of x (xmm0).
;   |x| > c23 (about 44.01) -> -1 (c24) for x < 0, otherwise 1 (c7)
;   |x| >= c25 (0.625)      -> 1 - 2/(exp(2|x|) + 1), negated for x < 0
;   x == 0                  -> x returned unchanged
;   otherwise               -> x + x*s*P(s)/Q(s) with s = x*x, where
;                              P = (tanhP[0]*s + tanhP[1])*s + tanhP[2] and
;                              Q = ((s + tanhQ[0])*s + tanhQ[1])*s + tanhQ[2]
; xmm6 holds the original x for the whole routine, including across the
; call to exp. Other registers written here: xmm1, xmm2, xmm3, rax.
tanh:
movsd xmm6,xmm0
; xmm0 = |x|
movq rax,xmm0
btr rax,0x3f
movq xmm0,rax
movsd xmm1,[c23]
ucomisd xmm0,xmm1
jbe .l1
; saturated: pick -1 or 1 from the sign of the original x
xorps xmm0,xmm0
movsd xmm1,xmm6
ucomisd xmm0,xmm1
jbe .l0
movsd xmm0,[c24]
ret
.l0:
movsd xmm0,[c7]
ret
.l1:
movsd xmm1,[c25]
ucomisd xmm0,xmm1
; |x| >= 0.625: exponential form at .l5
jae .l5
movsd xmm0,xmm6
xorps xmm1,xmm1
ucomisd xmm0,xmm1
jne .l2
; x == 0: return with xmm0 = x
jnp .l4
.l2:
; xmm0 = s = x*x, xmm2 = x, xmm1 = x*s
movups xmm1,xmm0
mulsd xmm0,xmm0
movups xmm2,xmm1
mulsd xmm1,xmm0
; xmm3 = P(s) * x*s
movsd xmm3,[tanhP]
mulsd xmm3,xmm0
addsd xmm3,[tanhP+8]
mulsd xmm3,xmm0
addsd xmm3,[tanhP+16]
mulsd xmm3,xmm1
; xmm1 = Q(s)
movsd xmm1,[tanhQ]
addsd xmm1,xmm0
mulsd xmm1,xmm0
addsd xmm1,[tanhQ+8]
mulsd xmm1,xmm0
addsd xmm1,[tanhQ+16]
; xmm2 = x + x*s*P/Q
divsd xmm3,xmm1
addsd xmm2,xmm3
.l3:
movsd xmm0,xmm2
.l4:
ret
.l5:
; xmm0 = exp(2*|x|)
addsd xmm0,xmm0
call exp
; xmm1 = 1 - 2/(exp(2|x|) + 1)
movsd xmm1,[c7]
addsd xmm0,xmm1
movsd xmm2,[c14]
divsd xmm2,xmm0
subsd xmm1,xmm2
; negate when the original x (xmm6) is below zero
xorps xmm0,xmm0
movsd xmm2,xmm6
ucomisd xmm0,xmm2
jbe .l6
movsd xmm0,[c26]
pxor xmm1,xmm0
.l6:
movups xmm2,xmm1
jmp .l3
; (xmm0 double) -> (xmm0 double, rax uint64)
; trig_reduce: argument reduction called by cos and sin for large arguments.
;   x < c36 (pi/4) -> xmm0 = x, rax = 0
;   otherwise      -> the 53-bit mantissa of x is multiplied by a window of
;                     64-bit words taken from the mPi4 table; the top 3 bits
;                     of the product go to rax, the following bits are turned
;                     into a double z, and xmm0 = z * c36. When the 3-bit
;                     value is odd it is incremented (mod 8) and 1.0 is
;                     subtracted from z first.
; Registers written: xmm0-xmm3, rax, rcx, rdx, rsi, rdi, r8-r13.
; Note that r12 and r13 are among them.
trig_reduce:
movsd xmm3,xmm0
; xmm0 = pi/4 and stays so until the final multiply at .l0
movsd xmm0,[c36]
movsd xmm1,xmm3
ucomisd xmm0,xmm1
; pi/4 > x: nothing to reduce
ja .l2
; r13 = raw bits of x, rdx = exponent field
movq rdx,xmm3
mov r13,rdx
shr rdx,0x34
and rdx,0x7ff
; rcx = exponent field - 0x3f6; rax = rcx / 64 is the first table index,
; the low 6 bits of rcx are the bit shift inside the words
lea rax,[rdx-0x3f6]
mov rcx,rax
shr rax,0x6
cmp rax,0x14
jae .l5
lea rdx,[mPi4]
; rsi = high part of window word 0
mov rsi,[rdx+rax*8]
shl rsi,cl
lea rdi,[rax+0x1]
cmp rdi,0x14
jae .l3
; rdi keeps the shift count; r8 = all ones when (shift & 63) != 0, else 0,
; so that the "shift right by 64 - shift" terms vanish for a zero shift
mov rdi,rcx
and rcx,0x3f
lea r8,[rcx-0x40]
neg r8
cmp r8,0x40
sbb r8,r8
; rsi = window word 0: (t[i] << s) | (t[i+1] >> (64 - s))
mov r9,[rdx+rax*8+0x8]
mov rcx,rdi
neg rcx
mov r10,r9
shr r9,cl
and r9,r8
or rsi,r9
; r9 = negated shift count, reused for the remaining right shifts
mov r9,rcx
mov rcx,rdi
shl r10,cl
lea r11,[rax+0x2]
cmp r11,0x14
jae .l4
; r10 = window word 1: (t[i+1] << s) | (t[i+2] >> (64 - s))
mov r11,[rdx+rax*8+0x10]
mov rcx,r9
mov r12,r11
shr r11,cl
and r11,r8
or r10,r11
mov rcx,rdi
shl r12,cl
lea rdi,[rax+0x3]
cmp rdi,0x14
jae .l3
; r13 = mantissa of x with the implicit bit (bit 52) set
mov rdi,0x800fffffffffffff
and r13,rdi
bts r13,0x34
; rax = window word 2: (t[i+2] << s) | (t[i+3] >> (64 - s))
mov rax,[rdx+rax*8+0x18]
mov rcx,r9
shr rax,cl
and rax,r8
or rax,r12
; word2 * mantissa: only the high half (rdx) is kept, in rcx.
; From here on rdx no longer holds the table address.
mul r13
mov rax,r10
mov rcx,rdx
; word1 * mantissa: rdx:rax
mul r13
mov rdi,rax
; rax = low result word = low(word1*m) + high(word2*m)
add rax,rcx
; rsi = low 64 bits of word0 * mantissa (imul overwrites the flags, so the
; same addition is repeated on rdi just to regenerate the carry for adc)
imul rsi,r13
add rdi,rcx
adc rsi,rdx
; rsi = high result word; rdx keeps a copy for the 3-bit value
mov rdx,rsi
; rsi = (high << 3) | (low >> 61): the bits following the top three
shl rsi,0x3
mov r13,rax
shr rax,0x3d
or rsi,rax
; rdi = index of the highest set bit of rsi (-1 when rsi is 0)
bsr rdi,rsi
mov r8,0xffffffffffffffff
cmove rdi,r8
; rcx = 64 - rdi is the left shift that pushes the leading 1 out;
; r8 and r9 are all-ones masks valid while the shift counts stay below 64
lea r8,[rdi-0x3f]
neg r8
lea rcx,[r8+0x1]
cmp rcx,0x40
sbb r8,r8
cmp rdi,0x40
sbb r9,r9
; rdx = top 3 bits of the high result word
shr rdx,0x3d
; r10 = exponent field for z: bit index + 0x3bf
lea r10,[rdi+0x3bf]
; rsi = (rsi << rcx) | (low word >> rdi), then reduced to a 52-bit mantissa
shl rsi,cl
and rsi,r8
mov rcx,rdi
shr r13,cl
and r9,r13
or rsi,r9
shr rsi,0xc
shl r10,0x34
or r10,rsi
; r13 = 3-bit value, rdx = its lowest bit, xmm1 = z
mov r13,rdx
and rdx,0x1
movq xmm1,r10
cmp rdx,0x1
jne .l1
; odd: rax = (value + 1) & 7 and z = z - 1.0
lea rax,[r13+0x1]
and rax,0x7
movsd xmm2,[c7]
subsd xmm1,xmm2
.l0:
; xmm0 = z * pi/4
mulsd xmm1,xmm0
movsd xmm0,xmm1
ret
.l1:
mov rax,r13
jmp .l0
.l2:
xor rax,rax
movsd xmm0,xmm1
ret
; .l3/.l4/.l5: taken when a table index would reach 0x14 (mPi4 has 20
; words). They return immediately: xmm0 still holds c36 and no reduced
; argument is produced.
; NOTE(review): for any exponent field between 0x3fe and 0x7ff the first
; index is at most 16, so these exits look unreachable once x >= pi/4 has
; been established - confirm.
.l3:
mov rax,rdi
mov ecx,0x14
ret
.l4:
mov rax,r11
mov ecx,0x14
ret
.l5:
mov ecx,0x14
ret
; (xmm0 double) -> (xmm0 double)
; cos: cosine of x (xmm0).
;   NaN, +Inf, -Inf -> NaN (c37)
; The argument is made positive and reduced to z plus a 3-bit index j in rax:
;   |x| >= c38 -> trig_reduce(|x|)
;   otherwise  -> j = trunc(|x| * c39) as an unsigned 64-bit integer, bumped
;                 to the next even value when odd, then
;                 z = ((|x| - y*c41) - y*c42) - y*c43 with y = j as double,
;                 and j &= 7.
; j > 3 subtracts 4 from j and flips the sign; a remaining j > 1 flips it
; again. j == 1 or 2 evaluates z + z*zz*S(zz) with the sinc coefficients,
; anything else evaluates 1 - 0.5*zz + zz*zz*C(zz) with the cosc
; coefficients (zz = z*z).
; xmm4 keeps the original x. Other registers written here: xmm0-xmm3, rax,
; rcx, rdx, r8, plus everything trig_reduce writes on that path.
cos:
movsd xmm4,xmm0
.l0:
ucomisd xmm0,xmm0
jne .l1
; ordered self-compare: x is a number
jnp .l2
.l1:
; NaN result for NaN and infinite input
movsd xmm0,[c37]
ret
.l2:
; al = 1 when x > c1 (largest finite double) ...
movsd xmm1,[c1]
ucomisd xmm0,xmm1
jbe .l17
mov eax,0x1
.l3:
test al,al
jne .l1
; xmm0 = |x|, rax = its raw bits
movq rax,xmm4
btr rax,0x3f
movq xmm0,rax
movsd xmm1,[c38]
ucomisd xmm0,xmm1
; large argument: use trig_reduce
jae .l16
; xmm1 = |x| * c39
movsd xmm1,[c39]
mulsd xmm1,xmm0
; unsigned conversion: values of 2^63 (c40) and above go through .l15
movsd xmm2,[c40]
ucomisd xmm2,xmm1
jbe .l15
cvttsd2si rax,xmm1
.l4:
; convert rax back to double as an unsigned value (.l14 when bit 63 is set)
test rax,rax
jl .l14
xorps xmm1,xmm1
cvtsi2sd xmm1,rax
.l5:
; odd j: rax = j + 1 and xmm2 = y + 1; even j: taken as is (.l13)
mov rcx,rax
and rax,0x1
cmp rax,0x1
jne .l13
lea rax,[rcx+0x1]
movsd xmm2,[c7]
addsd xmm2,xmm1
.l6:
and rax,0x7
; xmm0 = ((|x| - y*c41) - y*c42) - y*c43
movsd xmm1,[c41]
mulsd xmm1,xmm2
subsd xmm0,xmm1
movsd xmm1,[c42]
mulsd xmm1,xmm2
subsd xmm0,xmm1
movsd xmm1,[c43]
mulsd xmm1,xmm2
subsd xmm0,xmm1
.l7:
; cl = sign flag: set when j > 3; rax = j - 4 in that case
cmp rax,0x3
seta cl
lea rdx,[rax-0x4]
cmp rax,0x3
mov r8,rax
cmova rax,rdx
cmp rax,0x1
jbe .l8
; reduced j > 1: the sign flag becomes the opposite of (original j > 3)
cmp r8,0x3
setbe cl
cmp rax,0x1
.l8:
; xmm1 = z, xmm0 = zz. The flags of "cmp rax,0x1" survive the two SSE
; instructions below and are consumed by the jne.
movups xmm1,xmm0
mulsd xmm0,xmm0
jne .l12
.l9:
; j == 1 or 2: xmm2 = z + z*zz*S(zz)
movups xmm2,xmm1
mulsd xmm1,xmm0
movsd xmm3,[sinc]
mulsd xmm3,xmm0
addsd xmm3,[sinc+8]
mulsd xmm3,xmm0
addsd xmm3,[sinc+16]
mulsd xmm3,xmm0
addsd xmm3,[sinc+24]
mulsd xmm3,xmm0
addsd xmm3,[sinc+32]
mulsd xmm3,xmm0
addsd xmm3,[sinc+40]
mulsd xmm3,xmm1
addsd xmm2,xmm3
.l10:
; apply the sign flag by flipping the sign bit (c26)
test cl,cl
je .l11
movsd xmm0,[c26]
pxor xmm2,xmm0
.l11:
movsd xmm0,xmm2
ret
.l12:
cmp rax,0x2
je .l9
; j == 0 or 3: xmm2 = 1 - 0.5*zz + zz*zz*C(zz)
movsd xmm1,[c20]
mulsd xmm1,xmm0
movsd xmm2,[c7]
subsd xmm2,xmm1
movups xmm1,xmm0
mulsd xmm0,xmm0
movsd xmm3,[cosc]
mulsd xmm3,xmm1
addsd xmm3,[cosc+8]
mulsd xmm3,xmm1
addsd xmm3,[cosc+16]
mulsd xmm3,xmm1
addsd xmm3,[cosc+24]
mulsd xmm3,xmm1
addsd xmm3,[cosc+32]
mulsd xmm3,xmm1
addsd xmm3,[cosc+40]
mulsd xmm3,xmm0
addsd xmm2,xmm3
jmp .l10
.l13:
mov rax,rcx
movups xmm2,xmm1
jmp .l6
.l14:
; unsigned-to-double for values with bit 63 set: halve (keeping the lost
; low bit), convert, then double
mov rcx,rax
and rax,0x1
mov rdx,rcx
shr rcx,1
or rcx,rax
xorps xmm1,xmm1
cvtsi2sd xmm1,rcx
addsd xmm1,xmm1
mov rax,rdx
jmp .l5
.l15:
; double-to-unsigned for values >= 2^63: subtract 2^63, convert, set bit 63
subsd xmm1,xmm2
cvttsd2si rax,xmm1
bts rax,0x3f
jmp .l4
.l16:
; xmm0 = |x| again (rax still holds its bits); trig_reduce returns z in
; xmm0 and j in rax
movq xmm0,rax
call trig_reduce
jmp .l7
.l17:
; ... or when c8 (most negative finite double) > x
movsd xmm1,[c8]
ucomisd xmm1,xmm0
seta al
jmp .l3
; (xmm0 double) -> (xmm0 double)
; sin: sine of x (xmm0).
;   x == 0 or NaN -> x returned unchanged
;   +Inf, -Inf    -> NaN (c37)
; |x| is reduced to z plus a 3-bit index j in rax:
;   |x| >= c38 -> trig_reduce(|x|)
;   otherwise  -> j = trunc(|x| * c39) as an unsigned 64-bit integer, bumped
;                 to the next even value when odd, then
;                 z = ((|x| - y*c41) - y*c42) - y*c43 with y = j as double,
;                 and j &= 7.
; The sign flag in cl starts as (x < 0); j > 3 subtracts 4 from j and flips
; it. j == 1 or 2 evaluates 1 - 0.5*zz + zz*zz*C(zz) with the cosc
; coefficients, anything else evaluates z + z*zz*S(zz) with the sinc
; coefficients (zz = z*z).
; xmm5 keeps the original x. Other registers written here: xmm0-xmm4, rax,
; rcx, rdx, plus everything trig_reduce writes on that path.
sin:
movsd xmm5,xmm0
xorps xmm1,xmm1
ucomisd xmm0,xmm1
jne .l1
jp .l1
.l0:
; x == 0 (and NaN, from below): return x as is
ret
.l1:
ucomisd xmm0,xmm0
jne .l0
jp .l0
; al = 1 when x > c1 (largest finite double) ...
movsd xmm2,[c1]
ucomisd xmm0,xmm2
jbe .l19
mov eax,0x1
.l2:
test al,al
jne .l18
; xmm2 = |x|: flip the sign bit when 0 > x, else copy (.l17)
ucomisd xmm1,xmm0
jbe .l17
movsd xmm2,[c26]
pxor xmm2,xmm0
.l3:
movsd xmm3,[c38]
ucomisd xmm2,xmm3
; large argument: use trig_reduce
jae .l16
; xmm3 = |x| * c39
movsd xmm3,[c39]
mulsd xmm3,xmm2
; unsigned conversion: values of 2^63 (c40) and above go through .l15
movsd xmm4,[c40]
ucomisd xmm4,xmm3
jbe .l15
cvttsd2si rax,xmm3
.l4:
; convert rax back to double as an unsigned value (.l14 when bit 63 is set)
test rax,rax
jl .l14
xorps xmm3,xmm3
cvtsi2sd xmm3,rax
.l5:
; odd j: rax = j + 1 and xmm4 = y + 1; even j: taken as is (.l13)
mov rcx,rax
and rax,0x1
cmp rax,0x1
jne .l13
lea rax,[rcx+0x1]
movsd xmm4,[c7]
addsd xmm4,xmm3
.l6:
and rax,0x7
; xmm2 = ((|x| - y*c41) - y*c42) - y*c43
movsd xmm3,[c41]
mulsd xmm3,xmm4
subsd xmm2,xmm3
movsd xmm3,[c42]
mulsd xmm3,xmm4
subsd xmm2,xmm3
movsd xmm3,[c43]
mulsd xmm3,xmm4
subsd xmm2,xmm3
; flags for "0 > x" (xmm1 is still 0, xmm0 still x), read by seta at .l7
ucomisd xmm1,xmm0
.l7:
; cl = 1 when x is negative
seta cl
cmp rax,0x3
jbe .l8
; j > 3: flip the sign flag and subtract 4 from j
xor ecx,0x1
add rax,0xfffffffffffffffc
.l8:
; xmm0 = z, xmm2 = zz
movups xmm0,xmm2
mulsd xmm2,xmm2
cmp rax,0x1
jne .l12
.l9:
; j == 1 or 2: xmm1 = 1 - 0.5*zz + zz*zz*C(zz)
movsd xmm0,[c20]
mulsd xmm0,xmm2
movsd xmm1,[c7]
subsd xmm1,xmm0
movups xmm0,xmm2
mulsd xmm2,xmm2
movsd xmm3,[cosc]
mulsd xmm3,xmm0
addsd xmm3,[cosc+8]
mulsd xmm3,xmm0
addsd xmm3,[cosc+16]
mulsd xmm3,xmm0
addsd xmm3,[cosc+24]
mulsd xmm3,xmm0
addsd xmm3,[cosc+32]
mulsd xmm3,xmm0
addsd xmm3,[cosc+40]
mulsd xmm3,xmm2
addsd xmm1,xmm3
.l10:
; apply the sign flag by flipping the sign bit (c26)
test cl,cl
je .l11
movsd xmm0,[c26]
pxor xmm1,xmm0
.l11:
movsd xmm0,xmm1
ret
.l12:
cmp rax,0x2
je .l9
; j == 0 or 3: xmm1 = z + z*zz*S(zz)
movups xmm1,xmm0
mulsd xmm1,xmm2
movsd xmm3,[sinc]
mulsd xmm3,xmm2
addsd xmm3,[sinc+8]
mulsd xmm3,xmm2
addsd xmm3,[sinc+16]
mulsd xmm3,xmm2
addsd xmm3,[sinc+24]
mulsd xmm3,xmm2
addsd xmm3,[sinc+32]
mulsd xmm3,xmm2
addsd xmm3,[sinc+40]
mulsd xmm1,xmm3
addsd xmm1,xmm0
jmp .l10
.l13:
mov rax,rcx
movups xmm4,xmm3
jmp .l6
.l14:
; unsigned-to-double for values with bit 63 set: halve (keeping the lost
; low bit), convert, then double
mov rcx,rax
and rax,0x1
mov rdx,rcx
shr rcx,1
or rcx,rax
xorps xmm3,xmm3
cvtsi2sd xmm3,rcx
addsd xmm3,xmm3
mov rax,rdx
jmp .l5
.l15:
; double-to-unsigned for values >= 2^63: subtract 2^63, convert, set bit 63
subsd xmm3,xmm4
cvttsd2si rax,xmm3
bts rax,0x3f
jmp .l4
.l16:
; trig_reduce(|x|) returns z in xmm0 and j in rax; afterwards the flags for
; "0 > original x" (from xmm5) are rebuilt for the seta at .l7
movsd xmm0,xmm2
call trig_reduce
movsd xmm2,xmm0
xorps xmm0,xmm0
movsd xmm1,xmm5
ucomisd xmm0,xmm1
jmp .l7
.l17:
movups xmm2,xmm0
jmp .l3
.l18:
; infinite input
movsd xmm0,[c37]
ret
.l19:
; ... or when c8 (most negative finite double) > x
movsd xmm2,[c8]
ucomisd xmm2,xmm0
seta al
jmp .l2
; (xmm0 double) -> (xmm0 double)
; log: natural logarithm of x (xmm0).
;   x = +0 or -0                -> -Inf
;   sign bit set (x < 0, -Inf)  -> NaN (0x7ff8000000000001)
;   x = +Inf or positive NaN    -> x returned unchanged
; Otherwise x is taken apart as f1 * 2^k with f1 in [0.5, 1). When f1 is not
; above c27 (sqrt(2)/2) it is doubled and k is decremented. With
;     f = f1 - 1, s = f/(2 + f), s2 = s*s, s4 = s2*s2, hfsq = 0.5*f*f
;     R = s2*(c31 + s4*(c30 + s4*(c29 + s4*c28)))
;       + s4*(c34 + s4*(c33 + s4*c32))
; the result is k*c21 - ((hfsq - (s*(hfsq + R) + k*c22)) - f).
;
; Subnormal inputs (exponent field 0, non-zero mantissa) carry no implicit
; leading 1 bit, so OR-ing their raw mantissa into the exponent of 0.5 would
; decompose the wrong number. They are therefore multiplied by 2^52 first
; (exact, and always yields a normal number) and 52 is subtracted from k.
; This relies on the multiply seeing the subnormal operand; with the MXCSR
; denormals-are-zero bit set the product is 0 and the result is not
; meaningful.
;
; Registers written besides xmm0: xmm1-xmm6, rax, r8, r9.
log:
        movq    r8, xmm0                ; r8 = raw bits of x
        mov     rax, 0x7fffffffffffffff
        and     rax, r8
        je      .l2                     ; +0 / -0
        test    r8, r8
        js      .l1                     ; sign bit set
        mov     rax, 0x7ff0000000000000
        cmp     rax, r8
        jle     .l0                     ; +Inf or NaN

        ; r9 = amount subtracted from the exponent field to obtain k
        mov     r9d, 0x3fe
        mov     rax, 0x0010000000000000 ; smallest normal double
        cmp     r8, rax
        jae     .l3
        ; subnormal: x *= 2^52 and compensate with 52 more in r9
        mov     rax, 0x4330000000000000 ; 2^52
        movq    xmm1, rax
        mulsd   xmm0, xmm1
        movq    r8, xmm0
        mov     r9d, 0x432              ; 0x3fe + 52
.l3:
        ; xmm2 = f1: mantissa of x under the exponent of 0.5
        movq    xmm0, r8
        mov     rax, 0xfffffffffffff
        movq    xmm2, rax
        andpd   xmm2, xmm0
        movsd   xmm0, [c20]
        orpd    xmm2, xmm0
        ; xmm1 = k as double
        shr     r8, 0x34
        and     r8, 0x7ff
        sub     r8, r9
        xorps   xmm1, xmm1
        cvtsi2sd xmm1, r8
        ; xmm0 = all ones when sqrt(2)/2 is not below f1, else 0;
        ; xmm3 = 1.0 or 0.0 accordingly
        movsd   xmm0, [c27]
        cmpnltsd xmm0, xmm2
        movsd   xmm3, [c7]
        andpd   xmm3, xmm0
        subsd   xmm1, xmm3              ; k -= 1 when f1 is doubled
        movsd   xmm0, [c7]
        addsd   xmm3, xmm0              ; 2.0 or 1.0
        mulsd   xmm2, xmm3              ; f1 (possibly doubled)
        subsd   xmm2, xmm0              ; xmm2 = f
        ; xmm3 = s = f / (2 + f)
        movsd   xmm0, [c14]
        addsd   xmm0, xmm2
        movapd  xmm3, xmm2
        divsd   xmm3, xmm0
        ; xmm4 = s2, xmm5 = s4
        movapd  xmm4, xmm3
        mulsd   xmm4, xmm4
        movapd  xmm5, xmm4
        mulsd   xmm5, xmm5
        ; xmm4 = s2 * (c31 + s4*(c30 + s4*(c29 + s4*c28)))
        movsd   xmm6, [c28]
        mulsd   xmm6, xmm5
        addsd   xmm6, [c29]
        mulsd   xmm6, xmm5
        addsd   xmm6, [c30]
        mulsd   xmm6, xmm5
        addsd   xmm6, [c31]
        mulsd   xmm4, xmm6
        ; xmm5 = s4 * (c34 + s4*(c33 + s4*c32))
        movsd   xmm6, [c32]
        mulsd   xmm6, xmm5
        addsd   xmm6, [c33]
        mulsd   xmm6, xmm5
        addsd   xmm6, [c34]
        mulsd   xmm5, xmm6
        addsd   xmm4, xmm5              ; xmm4 = R
        ; xmm0 = hfsq
        movsd   xmm0, [c20]
        mulsd   xmm0, xmm2
        mulsd   xmm0, xmm2
        addsd   xmm4, xmm0              ; hfsq + R
        mulsd   xmm3, xmm4              ; s * (hfsq + R)
        movsd   xmm4, [c22]
        mulsd   xmm4, xmm1              ; k * c22
        addsd   xmm3, xmm4
        subsd   xmm0, xmm3              ; hfsq - (...)
        subsd   xmm0, xmm2              ; ... - f
        mulsd   xmm1, [c21]             ; k * c21
        subsd   xmm1, xmm0
        movsd   xmm0, xmm1
        ret
.l0:
        movq    xmm0, r8
        ret
.l1:
        mov     rax, 0x7ff8000000000001
        movq    xmm0, rax
        ret
.l2:
        mov     rax, 0xfff0000000000000
        movq    xmm0, rax
        ret
; (xmm0 double, xmm1 double) -> (xmm0 double)
; max: larger of x (xmm0) and y (xmm1), decided in this order:
;   x or y is +Inf            -> +Inf
;   x or y is NaN             -> NaN (0x7ff8000000000001)
;   x and y are both zeros    -> y when x is -0.0, otherwise x
;   anything else             -> maxsd x, y
; Registers written besides xmm0: xmm1, rax, rcx, rdx, r8, r9, r10.
max:
        movq    r8, xmm0                ; r8 = bits of x
        mov     rax, 0x7ff0000000000000 ; +Inf bit pattern
        cmp     r8, rax
        je      .from_rax
        movq    r9, xmm1                ; r9 = bits of y
        cmp     r9, rax
        je      .from_rax

        ; magnitudes above the +Inf pattern are NaNs
        mov     rdx, 0x7fffffffffffffff
        mov     r10, r8
        and     r10, rdx                ; r10 = |x| bits
        cmp     r10, rax
        jg      .nan
        mov     rcx, r9
        and     rcx, rdx                ; rcx = |y| bits
        cmp     rcx, rax
        jg      .nan

        ; both magnitudes zero: the sign decides
        or      r10, rcx
        jz      .zeros

        movq    xmm0, r8
        movq    xmm1, r9
        maxsd   xmm0, xmm1
        ret
.nan:
        mov     rax, 0x7ff8000000000001
.from_rax:
        movq    xmm0, rax
        ret
.zeros:
        mov     rax, 0x8000000000000000 ; -0.0
        cmp     r8, rax
        cmove   r8, r9                  ; x is -0.0: answer with y
        movq    xmm0, r8
        ret
; (xmm0 double, xmm1 double) -> (xmm0 double)
; min: smaller of x (xmm0) and y (xmm1), decided in this order:
;   x or y is -Inf            -> -Inf
;   x or y is NaN             -> NaN (0x7ff8000000000001)
;   x and y are both zeros    -> x when x is -0.0, otherwise y
;   anything else             -> minsd x, y
; Registers written besides xmm0: xmm1, rax, rcx, rdx, r8, r9, r10.
min:
        movq    r8, xmm0                ; r8 = bits of x
        mov     rax, 0xfff0000000000000 ; -Inf bit pattern
        cmp     r8, rax
        je      .from_rax
        movq    r9, xmm1                ; r9 = bits of y
        cmp     r9, rax
        je      .from_rax

        ; magnitudes above the +Inf pattern are NaNs
        mov     rdx, 0x7fffffffffffffff
        mov     rax, 0x7ff0000000000000
        mov     r10, r8
        and     r10, rdx                ; r10 = |x| bits
        cmp     r10, rax
        jg      .nan
        mov     rcx, r9
        and     rcx, rdx                ; rcx = |y| bits
        cmp     rcx, rax
        jg      .nan

        ; both magnitudes zero: the sign decides
        or      r10, rcx
        jz      .zeros

        movq    xmm0, r8
        movq    xmm1, r9
        minsd   xmm0, xmm1
        ret
.nan:
        mov     rax, 0x7ff8000000000001
.from_rax:
        movq    xmm0, rax
        ret
.zeros:
        mov     rax, 0x8000000000000000 ; -0.0
        cmp     r8, rax
        cmove   r9, r8                  ; x is -0.0: answer with x
        movq    xmm0, r9
        ret
; (xmm0 double, xmm1 double) -> (xmm0 double)
; pow: x (xmm0) raised to the power y (xmm1). Cases, in the order tested:
;   y == 0 or x == 1         -> 1
;   y == 1                   -> x
;   x or y is NaN            -> NaN (c37)
;   x == 0, y < 0            -> Inf with the sign of x when y is an odd
;                               integer, otherwise +Inf
;   x == 0, y > 0            -> x when y is an odd integer, otherwise 0
;   y = +/-Inf               -> 1 when x == -1; 0 when (|x| < 1) equals
;                               (y = +Inf); otherwise +Inf
;   x = -Inf                 -> pow(1/x, -y)
;   x = +Inf                 -> 0 for y < 0, +Inf for y > 0
;   y == 0.5 / y == -0.5     -> sqrt(x) / 1/sqrt(x)
; General path (.l4): |y| is split by modf into yi and yf. A non-zero yf
; with x < 0 gives NaN. yi >= 2^63 gives 1, 0 or +Inf by the same rule as
; for infinite y. Otherwise a1 = exp(yf * log(x)) (yf moved into
; [-0.5, 0.5] by shifting 1 into yi), x is split by frexp into x1 * 2^xe,
; and a square-and-multiply loop over the bits of yi accumulates a1 and the
; exponent ae; for y < 0 the pair is inverted. The result is ldexp(a1, ae).
; Values kept across calls: xmm7 = x, xmm8 = y, xmm9 = yf, xmm10 = yi,
; xmm11 = a1. The x = -Inf case calls pow itself.
pow:
.l0:
movsd xmm7,xmm0
movsd xmm8,xmm1
; xmm0 = 0, xmm1 = y
xorps xmm0,xmm0
movsd xmm1,xmm8
ucomisd xmm0,xmm1
jne .l1
; y == 0
jnp .l52
.l1:
; xmm2 = 1.0, xmm3 = x
movsd xmm2,[c7]
movsd xmm3,xmm7
ucomisd xmm2,xmm3
jne .l2
; x == 1
jnp .l51
.l2:
ucomisd xmm2,xmm1
jne .l3
; y == 1
jnp .l50
.l3:
; NaN in either operand
ucomisd xmm3,xmm3
jne .l49
jp .l49
ucomisd xmm1,xmm1
jne .l49
jp .l49
; x != 0: continue at .l35
ucomisd xmm0,xmm3
jne .l35
jp .l35
; x == 0: y < 0 at .l31, y > 0 at .l27
ucomisd xmm0,xmm1
ja .l31
ucomisd xmm1,xmm0
ja .l27
.l4:
; general path: xmm0 = yi, xmm1 = yf from modf(|y|)
movq rax,xmm8
btr rax,0x3f
movq xmm0,rax
call modf
xorps xmm2,xmm2
ucomisd xmm1,xmm2
jne .l5
; yf == 0
jnp .l26
.l5:
; yf != 0 and 0 > x: NaN
movsd xmm3,xmm7
ucomisd xmm2,xmm3
ja .l25
.l6:
; yi >= 2^63 (c40)
movsd xmm4,[c40]
ucomisd xmm0,xmm4
jae .l21
ucomisd xmm1,xmm2
jne .l7
; yf == 0: skip the exp/log step
jnp .l20
.l7:
; yf > 0.5: yf -= 1 and yi += 1 (yi ends up in xmm5)
movsd xmm4,[c20]
ucomisd xmm1,xmm4
jbe .l19
movsd xmm5,[c7]
subsd xmm1,xmm5
addsd xmm5,xmm0
.l8:
; a1 = exp(yf * log(x)); yf and yi are parked in xmm9/xmm10 over the calls
movsd xmm9,xmm1
movsd xmm10,xmm5
movsd xmm0,xmm3
call log
mulsd xmm0,xmm9
call exp
movsd xmm1,xmm10
.l9:
; here xmm1 = yi and xmm0 = a1. After frexp: xmm0 = x1, rax = xe,
; rcx = yi as integer, xmm1 = a1, rdx = ae = 0
movsd xmm10,xmm1
movsd xmm11,xmm0
movsd xmm0,xmm7
call frexp
movsd xmm1,xmm10
cvttsd2si rcx,xmm1
movsd xmm1,xmm11
xor edx,edx
jmp .l11
.l10:
; next bit of yi: i >>= 1 (arithmetic); rdx takes the updated ae from rcx
sar r8,1
mov rdx,rcx
mov rcx,r8
.l11:
; loop ends when i == 0
test rcx,rcx
je .l18
; xe outside [-4096, 4096]: stop early at .l14
cmp rax,0xfffffffffffff000
jl .l14
cmp rax,0x1000
jg .l14
; low bit of i set: a1 *= x1 and ae += xe (new ae held in rcx)
mov r8,rcx
and rcx,0x1
cmp rcx,0x1
jne .l13
mulsd xmm1,xmm0
lea rcx,[rax+rdx*1]
.l12:
; x1 *= x1, xe *= 2; when x1 drops below 0.5: x1 *= 2 and xe = 2*old - 1
mulsd xmm0,xmm0
mov rdx,rax
shl rax,1
movsd xmm2,[c20]
ucomisd xmm2,xmm0
jbe .l10
addsd xmm0,xmm0
lea rax,[rdx+rdx*1]
lea rax,[rax-0x1]
jmp .l10
.l13:
; low bit clear: ae unchanged
mov rcx,rdx
jmp .l12
.l14:
; early stop: rax = ae + xe, then the common exit with flags for 0 > y
add rax,rdx
xorps xmm0,xmm0
movsd xmm2,xmm8
ucomisd xmm0,xmm2
.l15:
; y >= 0: use a1 and the exponent as they are (.l17)
jbe .l17
; y < 0: a1 = 1/a1 and the exponent is negated
movsd xmm0,[c7]
divsd xmm0,xmm1
neg rax
.l16:
mov rdi,rax
call ldexp
ret
.l17:
movups xmm0,xmm1
jmp .l16
.l18:
; loop finished: rax = ae; mov does not disturb the flags of the ucomisd
xorps xmm0,xmm0
movsd xmm2,xmm8
ucomisd xmm0,xmm2
mov rax,rdx
jmp .l15
.l19:
; yf <= 0.5: yi is used unchanged
movups xmm5,xmm0
jmp .l8
.l20:
; integer exponent: yi -> xmm1, a1 = 1.0
movups xmm1,xmm0
movsd xmm0,[c7]
jmp .l9
.l21:
; huge yi: x == -1 gives 1
movsd xmm0,[c24]
ucomisd xmm0,xmm3
jne .l22
jnp .l24
.l22:
; al = (|x| < 1), cl = (y > 0); equal -> 0, different -> +Inf
movq rax,xmm7
btr rax,0x3f
movq xmm0,rax
movsd xmm1,[c7]
ucomisd xmm1,xmm0
seta al
movsd xmm0,xmm8
ucomisd xmm0,xmm2
seta cl
cmp al,cl
jne .l23
movsd xmm0,xmm2
ret
.l23:
movsd xmm0,[c5]
ret
.l24:
movsd xmm0,[c7]
ret
.l25:
movsd xmm0,[c37]
ret
.l26:
movsd xmm3,xmm7
jmp .l6
.l27:
; x == 0, y > 0: al = 1 when y has no fraction and its integer part is odd
movsd xmm0,xmm1
call modf
xorps xmm2,xmm2
ucomisd xmm1,xmm2
jne .l30
jp .l30
cvttsd2si rax,xmm0
bt eax,0x0
setb al
.l28:
test al,al
je .l29
; odd integer y: return x (keeps the sign of the zero)
movsd xmm0,xmm7
ret
.l29:
movsd xmm0,xmm2
ret
.l30:
xor eax,eax
jmp .l28
.l31:
; x == 0, y < 0: same odd-integer test on y
movsd xmm0,xmm1
call modf
xorps xmm2,xmm2
ucomisd xmm1,xmm2
jne .l34
jp .l34
cvttsd2si rax,xmm0
bt eax,0x0
setb al
.l32:
test al,al
je .l33
; odd integer y: infinity carrying the sign bit of x
movq rax,xmm7
shr rax,0x3f
shl rax,0x3f
mov rcx,0x7ff0000000000000
or rcx,rax
movq xmm0,rcx
ret
.l33:
movsd xmm0,[c5]
ret
.l34:
xor eax,eax
jmp .l32
.l35:
; x != 0. al = 1 when y > c1 (largest finite double) or, via .l48,
; when c8 (most negative finite double) > y
movsd xmm4,[c1]
ucomisd xmm1,xmm4
jbe .l48
mov eax,0x1
.l36:
test al,al
; y is infinite
jne .l44
; al = 1 when x is infinite (same two comparisons, second one at .l43)
ucomisd xmm3,xmm4
jbe .l43
mov eax,0x1
.l37:
test al,al
je .l40
; x is infinite; c8 > x means x = -Inf
movsd xmm4,[c8]
ucomisd xmm4,xmm3
ja .l39
; x = +Inf: y < 0 returns the 0 already in xmm0, y > 0 returns +Inf
ucomisd xmm0,xmm1
ja .l38
ucomisd xmm1,xmm0
jbe .l4
movsd xmm0,[c5]
ret
.l38:
ret
.l39:
; x = -Inf: return pow(1/x, -y)
divsd xmm2,xmm3
movsd xmm0,[c26]
pxor xmm1,xmm0
movsd xmm0,xmm2
call pow
ret
.l40:
; finite x and y: shortcuts for y == 0.5 and y == -0.5
movsd xmm4,[c20]
ucomisd xmm4,xmm1
jne .l41
jnp .l42
.l41:
movsd xmm5,[c35]
ucomisd xmm5,xmm1
jne .l4
jp .l4
; y == -0.5: 1/sqrt(x) (xmm2 still holds 1.0)
sqrtsd xmm0,xmm3
divsd xmm2,xmm0
movsd xmm0,xmm2
ret
.l42:
sqrtsd xmm0,xmm3
ret
.l43:
movsd xmm4,[c8]
ucomisd xmm4,xmm3
seta al
jmp .l37
.l44:
; infinite y: x == -1 gives 1
movsd xmm5,[c24]
ucomisd xmm5,xmm3
jne .l45
jnp .l47
.l45:
; al = (1 > |x|), cl = (y > c1, i.e. y = +Inf); equal -> return the 0 in
; xmm0, different -> +Inf
movq rax,xmm7
btr rax,0x3f
movq xmm3,rax
ucomisd xmm2,xmm3
seta al
ucomisd xmm1,xmm4
seta cl
cmp cl,al
jne .l46
ret
.l46:
movsd xmm0,[c5]
ret
.l47:
movsd xmm0,xmm2
ret
.l48:
movsd xmm5,[c8]
ucomisd xmm5,xmm1
seta al
jmp .l36
.l49:
movsd xmm0,[c37]
ret
.l50:
; y == 1: return x
movsd xmm0,xmm3
ret
.l51:
; return the 1.0 held in xmm2
movsd xmm0,xmm2
ret
.l52:
movsd xmm2,[c7]
jmp .l51
; (xmm0 double) -> (xmm0 double)
; sigmoid: 1 / (1 + exp(-x)) for x in xmm0.
; Writes xmm1 and rax here, plus everything exp writes.
sigmoid:
        mov     rax, 0x8000000000000000 ; sign-bit mask
        movq    xmm1, rax
        pxor    xmm0, xmm1              ; xmm0 = -x
        call    exp                     ; xmm0 = exp(-x)
        addsd   xmm0, [c7]              ; 1 + exp(-x)
        movsd   xmm1, [c7]
        divsd   xmm1, xmm0              ; 1 / (1 + exp(-x))
        movsd   xmm0, xmm1
        ret
; (xmm0 double) -> (xmm0 double)
; relu: x when x > 0, otherwise +0.0 (this includes NaN input, because maxsd
; hands back its second operand whenever the first is not greater).
; Writes xmm1.
;
; History: an earlier version compared with comisd and branched around an
; xorps; it was replaced by the branch-free form below after bitRAKE pointed
; out a mistake in it.
relu:
        pxor    xmm1, xmm1              ; xmm1 = +0.0
        maxsd   xmm0, xmm1
        ret
; ---------------------------------------------------------------------------
; Data used by the functions above.
; ---------------------------------------------------------------------------
; mPi4: 20 64-bit words read by trig_reduce. Word 0 is 1 and word 1 is about
; 0.2732 * 2^64, i.e. the words are successive 64-bit binary digits of
; 4/pi = 1.2732...
mPi4 dq 0x0000000000000001, 0x45f306dc9c882a53, 0xf84eafa3ea69bb81, 0xb6c52b3278872083,\
0xfca2c757bd778ac3, 0x6e48dc74849ba5c0, 0x0c925dd413a32439, 0xfc3bd63962534e7d,\
0xd1046bea5d768909, 0xd338e04d68befc82, 0x7323ac7306a673e9, 0x3908bf177bf25076,\
0x3ff12fffbc0b301f, 0xde5e2316b414da3e, 0xda6cfd9e4f96136e, 0x9e8c7ecd3cbfd45a,\
0xea4f758fd7cbe2f6, 0x7a0e73ef14a525d4, 0xd7f6bf623f1aba10, 0xac06608df8f6d757
; tanhP / tanhQ: numerator and denominator coefficients of the rational
; approximation evaluated by tanh for |x| < 0.625
tanhP dq -0.964399179425052238628, -99.2877231001918586564, -1614.68768441708447952
tanhQ dq 112.811678491632931402, 2235.48839060100448583, 4844.06305325125486048
; sinc / cosc: polynomial coefficients (raw double bit patterns, highest
; order first) used by both cos and sin
sinc dq 0x3de5d8fd1fd19ccd, 0xbe5ae5e5a9291f5d, 0x3ec71de3567d48a1,\
0xbf2a01a019bfdf03, 0x3f8111111110f7d0, 0xbfc5555555555548
cosc dq 0xbda8fa49a0861a9b, 0x3e21ee9d7b4e3f05, 0xbe927e4f7eac4bc6,\
0x3efa01a019c844f5, 0xbf56c16c16c14f91, 0x3fa555555555554b
; Scalar constants, all stored as raw double bit patterns.
; c1 = largest finite double      c2 = smallest positive normal double (2^-1022)
; c3 = 2^52                       c4 = -Inf
; c5 = +Inf                       c6 = 2^-53
; c7 = 1.0                        c8 = most negative finite double
c1 dq 0x7fefffffffffffff
c2 dq 0x0010000000000000
c3 dq 0x4330000000000000
c4 dq 0xfff0000000000000
c5 dq 0x7ff0000000000000
c6 dq 0x3ca0000000000000
c7 dq 0x3ff0000000000000
c8 dq 0xffefffffffffffff
; c9..c13: polynomial coefficients of expmulti, applied from c9 (highest
; order) down to c13 (about 1/6)
c9 dq 0x3e66376972bea4d0
c10 dq 0xbebbbd41c5d26bf1
c11 dq 0x3f11566aaf25de2c
c12 dq 0xbf66c16c16bebd93
c13 dq 0x3fc5555555555555
; c14 = 2.0
c14 dq 0x4000000000000000
; exp thresholds: c15 = about 709.78 (overflow), c16 = about -745.13
; (underflow), c17 = -2^-28, c18 = 2^-28
c15 dq 0x40862e42fefa39ef
c16 dq 0xc0874910d52d3051
c17 dq 0xbe30000000000000
c18 dq 0x3e30000000000000
; c19 = log2(e), about 1.442695; c20 = 0.5
c19 dq 0x3ff71547652b82fe
c20 dq 0x3fe0000000000000
; c21 + c22 = ln(2) in two pieces: c21 is the high part (low mantissa bits
; zero), c22 the small remainder; used by exp and log
c21 dq 0x3fe62e42fee00000
c22 dq 0x3dea39ef35793c76
; tanh: c23 = about 44.0148 (saturation threshold), c24 = -1.0, c25 = 0.625
c23 dq 0x404601e678fc457b
c24 dq 0xbff0000000000000
c25 dq 0x3fe4000000000000
; c26 = sign-bit mask (the bit pattern of -0.0)
c26 dq 0x8000000000000000
; log: c27 = sqrt(2)/2; c28..c31 and c32..c34 are the coefficients of its
; two polynomials, each listed highest order first
c27 dq 0x3fe6a09e667f3bcd
c28 dq 0x3fc2f112df3e5244
c29 dq 0x3fc7466496cb03de
c30 dq 0x3fd2492494229359
c31 dq 0x3fe5555555555593
c32 dq 0x3fc39a09d078c69f
c33 dq 0x3fcc71c51d8e78af
c34 dq 0x3fd999999997fa04
; c35 = -0.5, c36 = pi/4, c37 = the NaN value returned by these functions
c35 dq 0xbfe0000000000000
c36 dq 0x3fe921fb54442d18
c37 dq 0x7ff8000000000001
; cos/sin: c38 = 2^52 * pi/4 (arguments at or above it go to trig_reduce),
; c39 = 4/pi, c40 = 2^63 (limit for the signed double-to-integer conversion)
c38 dq 0x432921fb54442d18
c39 dq 0x3ff45f306dc9c883
c40 dq 0x43e0000000000000
; c41 + c42 + c43 = pi/4 split into high, middle and low pieces that cos and
; sin subtract one after another
c41 dq 0x3fe921fb40000000
c42 dq 0x3e64442d00000000
c43 dq 0x3ce8469898cc5170
Last edited by Jok3r on 10 Jul 2021, 14:24; edited 3 times in total
|