flat assembler
Message board for the users of flat assembler.
![]() |
Author |
|
macgub 10 Jul 2025, 06:47
Hi,
This is my implementation of sinus, cosinus and arcus tangent. Arc tg is calculated for two input values. Code: sin_cos: ; Calculating trigonometric functions ; Taylor series implementation ; in: ; xmm0 - 2 angles floats (radians); lowest half x, y ; out: ; xmm0 - sinx cosx atanx atany 4 dwords float ; sin x = x - x^3/3! + x^5/5! - ... ; cos x = 1 - x^2/2! + x^4/4! - ... ; atan x = x - x^3/3 + x^5/5 - ... pushad movaps xmm7,xmm0 pcmpeqd xmm5,xmm5 movhps xmm7,[the_one] shufps xmm7,xmm7,01001000b ; xm7 = lo -> hi x, 1.0, x, y xorps xmm3,xmm3 mov ecx,3 mulps xmm0,xmm0 ; x^2 pslld xmm5,31 movaps xmm1,xmm7 shufps xmm0,xmm0,0 ; xm7 lo - hi x, 1.0, x, y ; xm1 lo - hi x, 1.0, x, y ; xm0 lo - hi x^2, x^2, x^2, x^2 mov edx,1 @@: mulps xmm1,xmm0 ; lo - hi x^3 x^2 mov ebx,ecx mov eax,ecx dec ebx sub eax,2 push edx eax ebx ecx movups xmm2,[esp] cvtdq2ps xmm2,xmm2 ; 3, 2, 1 | 5, 4, 3 movaps xmm4,xmm2 add esp,16 shufps xmm4,xmm4,00001001b shufps xmm2,xmm2,11110100b mulps xmm4,xmm2 ; lo -> hi = 3*2, 1*2 rcpps xmm4,xmm4 mulps xmm1,xmm4 ; x^2/2! | x^4/4! movaps xmm6,xmm1 xorps xmm6,xmm3 ; -x^3/3! | x^5/5! subps xmm7,xmm6 xorps xmm3,xmm5 ;[sign_mask] add ecx,2 cmp ecx,100 jb @b movaps xmm0,xmm7 ; xmm0 - sinx cosx atanx atany 4 dwords float popad ret data section: the_one: dd 1.0, 1.0 |
|||
![]() |
|
Roman 10 Jul 2025, 07:41
macgub loop in 50 iterations. Its very slow and bad solution.
Your code speed circa almost like fpatan. https://wikimedia.org/api/rest_v1/media/math/render/svg/e4ecfb141505d3e4819878a3db1b78bd1d2a9518 if х >=0 than arctg x = acos(1/sqrt(1+x*x)) if х <0 than arctg x = -acos(1/sqrt(1+x*x)) -acos(1/sqrt(1+x*x))=3.141592-acos(1/sqrt(1+x*x)) replace 1/sqrt(1+x*x) to rsqrt(1+x*x). rsqrtss faster than sqrtss. I found avx fast acos Code: tabACos dd -0.0,1.0,0.141592654,\ ; pi-3 3.0,1.17123365,0.745630503,0.329797238,0.294789612,1.30136144,\ 0.930340230,0.0114133256,0.0189007167 @ACos: mov edx,tabACos ;xmm0=x vucomiss xmm0,[edx] ; Set cf if x<0 vorps xmm0,xmm0,[edx] ; xmm0 = -|x| vmovups xmm3,[edx+16] ; xmm3 = b2: b0 : a2 : a0 vshufps xmm1,xmm0,xmm0,0 ; xmm1 = -|x| : -|x| : -|x| : -|x| vaddss xmm0,xmm0,[edx+4] ; xmm0 = 1-|x| vfnmadd231ps xmm3,xmm1,[edx+32] ; xmm3 = b2+b3*|x|=q1 : a2+a3*|x|=p1 : b0+b1*|x|=q0 : a0+a1*|x|=p0 vmulps xmm2,xmm1,xmm1 ; xmm2 = x^2 : x^2 : x^2 : x^2 vmovhlps xmm1,xmm3,xmm3 ; xmm1 = b2+b3*|x|=q1 : a2+a3*|x|=p1 vfmadd132ps xmm1,xmm3,xmm2 ; xmm1 = q0+q1*x^2=q : p0+p1*x^2=p vsqrtss xmm0,xmm0,xmm0 ; xmm0 = (1-|x|)^0.5 vmovshdup xmm2,xmm1 ; xmm2 = q vdivss xmm1,xmm1,xmm2 ; xmm1 = p/q = f(t) jnc arccos_plus ; Goto if x>=0 vfnmadd213ss xmm0,xmm1,[edx+8] ; xmm0 = pi-3-(1+x)^0.5*p/q vaddss xmm0,xmm0,[edx+12] ; xmm0 = pi-(1+x)^0.5*p/q ret arccos_plus: ; Case of x>=0 vmulss xmm0,xmm0,xmm1 ; xmm0 = (1-x)^0.5*p/q ret ;in code I tested mov eax,0.8 movd xmm0,eax call @ACos ;I get in xmm0=0.64350. ;online internet calculator acos( 0.8 )=0.64350 |
|||
![]() |
|
Roman 10 Jul 2025, 08:32
good information.
https://mazzo.li/posts/vectorized-atan2.html https://stackoverflow.com/questions/42537957/fast-accurate-atan-arctan-approximation-algorithm Code: @ct2 dd 6.28740248E-17,\ ; a0*(a1+b0)=a0*c1 4.86816205E-17,\ ; a0*a1 2.24874633E-18,\ ; a0*(a3+b2)=a0*c3 4.02179944E-19,\ ; a0*a3 4.25772129E-17,\ ; a0 4.25772129E-17,\ ; a0 2.50182216E-17,\ ; a0*(a2+b1)=a0*c2 1.25756219E-17,\ ; a0*a2 0.0707963258,\ ;(pi-3)/2 1.5,\ ; 3/2 1.0,\ ; 1 -0.0707963258,\ ; -(pi-3)/2 -1.5,\ ; -3/2 3.95889818E15 ; Threshold of x^2 when arctg(x)=pi/2*sgn(x) arctg: vshufps xmm1,xmm0,xmm0,0 ; xmm1 = x # x : x # x mov edx, @ct2 ; edx contains the address of constants table vmulps xmm2,xmm1,xmm1 ; xmm2 = x^2 # x^2 : x^2 # x^2 vmovups xmm3,[edx+16] ; xmm3 = a0*a2 # a0*c2 : a0 # a0 vmulps xmm4,xmm2,xmm2 ; xmm4 = y^2 # y^2 : y^2 # y^2 vucomiss xmm2,[edx+40] ; Compare y=x^2 to 1 ja arctg_big2 ; Jump if |x|>1 vfmadd231ps xmm3,xmm2,[edx] ; xmm3 ~ a3*y+a2 # c3*y+c2 : a1*y+1 # c1*y+1 vmovhlps xmm1,xmm1,xmm3 ; xmm1 ~ a3*y+a2 # c3*y+c2 vfmadd231ps xmm3,xmm4,xmm1 ; xmm3 ~ a3*y^3+a2*y^2+a1*y+1 # c3*y^3+c2*y^2+c1*y+1 vmovshdup xmm2,xmm3 ; xmm2 = P; xmm3 = Q vdivss xmm2,xmm2,xmm3 ; xmm2 = P/Q vmulss xmm0,xmm0,xmm2 ; xmm0 = x*P/Q = arctg(x) ret ; Return arctg_big2: ; When |x|>1 use formula pi/2*sgn(x)-arctg(1/x) vfmadd213ps xmm3,xmm2,[edx] ; xmm3 ~ a2*y+a3 # c2*y+c3 : y+a1 # y+c1 vmovmskpd eax,xmm1 ; eax=3 if x<0, otherwise eax=0 vmovhlps xmm0,xmm0,xmm3 ; xmm0 ~ a2*y+a3 # c2*y+c3 vfmadd213ps xmm3,xmm4,xmm0 ; xmm3 ~ y^3+a1*y^2+a2*y+a3 # y^3+c1*y^2+c2*y+c3 vmovss xmm0,[edx+4*eax+32] ; xmm0 = (pi-3)/2*sgn(x) vucomiss xmm2,[edx+52] ; Compare y=x^2 to threshold value jnb arctg_end3 ; The data is already in xmm0, if |x|>=62919776 vmovshdup xmm4,xmm3 ; xmm4 = P; xmm3 = Q vmulss xmm1,xmm1,xmm3 ; xmm1 = x*Q vfmsub132ss xmm0,xmm4,xmm1 ; xmm0 = (pi-3)/2*|x|*Q-P vdivss xmm0,xmm0,xmm1 ; xmm0 = (pi-3)/2*sgn(x)-P/(x*Q) arctg_end3: ; Add to result 3/2*sgn(x) vaddss xmm0,xmm0,[edx+4*eax+36] ; xmm0 = pi/2*sgn(x)-P/(x*Q) ret ;in code I tested mov eax,11.8 movd xmm0,eax call arctg ;in xmm0 = 1.4862525 ;online calculator arctang(11.8 ) = 1.4862525 Last edited by Roman on 10 Jul 2025, 10:29; edited 6 times in total |
|||
![]() |
|
Roman 10 Jul 2025, 08:43
instead FPATAN FPU
Code: ;signbit return true if value <0. if value =>0 return false float fast_atan2_alt(float y, float x) { float ax = fabsf(x); float ay = fabsf(y); float a; if (ax > ay) { a = ay / ax; } else if (ay > ax) { a = ax / ay; } else { // ax and ay have the same value. a = ax > 0; // or just 1.0f if OK to have this function act like atan2(0,0) } float s = a * a; float r = ((-0.0447585656f * s + 0.157265460f) * s - 0.327108731f) * s * a + a; if (ay > ax) r = (float) M_PI_2 - r; if (signbit(x)) r = (float) M_PI - r; if (signbit(y)) r = -r; return r; } |
|||
![]() |
|
Roman 10 Jul 2025, 10:26
Code: align 64 avxpatan: vmovmskps ecx,xmm0 ; ecx=1 if y<=0; ecx=0 if y>=0 vmovmskps eax,xmm1 ; eax=1 if x<=0, eax=0 if x>=0 vdivss xmm0,xmm0,xmm1 ; xmm0 = y/x - prepare the argument of arctangent and ecx,1 ; Only bit 0 of ecx is needed and eax,1 ; Only bit 0 of eax is needed bts ecx,eax ; ecx=2 or 3 if x<=0, ecx=1 if x>=0 call arctgDegrees ; Calculate arctangent in degrees from y/x vaddss xmm0,xmm0,[@ct3-4+4*ecx] ; If neccessary, to add or subtract 180 ret align 16 @ct3 dd -0.0,180.0,-180.0 @ct20 dd 1.92582580E-14,\ ; a0*(a1+b0)=a0*c1 8.54345240E-13,\ ; a0*a1 (in degrees) 6.88789034E-16,\ ; a0*(a3+b2)=a0*c3 7.05811633E-15,\ ; a0*a3 (in degrees) 1.30413631E-14,\ ; a0 7.47215061E-13,\ ; a0 (in degrees) 7.66305964E-15,\ ; a0*(a2+b1)=a0*c2 2.20697728E-13,\ ; a0*a2 (in degrees) 90.0,\ 1.0,\ 2.25592738E14,\ ; Threshold of x^2 when arctgD(x)=90*sgn(x) -90.0 align 64 arctgDegrees: vshufps xmm1,xmm0,xmm0,0 ; xmm1 = x # x : x # x mov edx,@ct20 ; edx contains the address of constants table vmulps xmm2,xmm1,xmm1 ; xmm2 = x^2 # x^2 : x^2 # x^2 vmovups xmm3,[edx+16] ; xmm3 = a0*a2 # a0*c2 : a0 # a0 vmulps xmm4,xmm2,xmm2 ; xmm4 = y^2 # y^2 : y^2 # y^2 vucomiss xmm2,[edx+36] ; Compare y=x^2 to 1 ja arctg_big ; Goto if |x|>1 vfmadd231ps xmm3,xmm2,[edx] ; xmm3 ~ a3*y+a2 # c3*y+c2 : a1*y+1 # c1*y+1 vmovhlps xmm1,xmm1,xmm3 ; xmm1 ~ a3*y+a2 # c3*y+c2 vfmadd231ps xmm3,xmm4,xmm1 ; xmm3 ~ a3*y^3+a2*y^2+a1*y+1 # c3*y^3+c2*y^2+c1*y+1 vmovshdup xmm2,xmm3 ; xmm2 = P; xmm3 = Q vdivss xmm2,xmm2,xmm3 ; xmm2 = P/Q vmulss xmm0,xmm0,xmm2 ; xmm0 = x*P/Q = arctgD(x) ret arctg_big: ; When |x|>1 use formula 90*sgn(x)-arctgD(1/x) vfmadd213ps xmm3,xmm2,[edx] ; xmm3 ~ a2*y+a3 # c2*y+c3 : y+a1 # y+c1 vmovmskpd eax,xmm1 ; eax=3 if x<0, otherwise eax=0 vmovhlps xmm0,xmm0,xmm3 ; xmm0 ~ a2*y+a3 # c2*y+c3 vfmadd213ps xmm3,xmm4,xmm0 ; xmm3 ~ y^3+a1*y^2+a2*y+a3 # y^3+c1*y^2+c2*y+c3 vmovss xmm0,[edx+4*eax+32] ; xmm0 = 90*sgn(x) vcomiss xmm2,[edx+40] ; Compare y=x^2 to threshold value jnb arctg_end ; If |x|>=15019745 result already done vmovshdup xmm4,xmm3 ; xmm4 = P; xmm3 = Q vmulss xmm1,xmm1,xmm3 ; xmm1 = x*Q vdivss xmm4,xmm4,xmm1 ; xmm4 = P/(x*Q) vsubss xmm0,xmm0,xmm4 ; xmm0 = 90*sgn(x)-P/(x*Q) arctg_end: ; xmm0 = arctgD(x) ret ;in code do mov eax,-11.8 ;y movd xmm0,eax mov eax,2.8 ;x movd xmm1,eax call avxpatan ;in xmm0 angle in degrees. I get -76.6512 ;check on fpu fpatan tmp_atan2 dq -11.8,2.8 finit fld qword [tmp_atan2] fld qword [tmp_atan2+8] fpatan ;in st0 I get -1.337817 in radians ;-1.337817*(180/3.141592)=-76.6512 in degrees ;-76.6512*0.017453288889= radians -1.337815 |
|||
![]() |
|
Roman 10 Jul 2025, 11:54
avxpatan faster 15 times than fpu fpatan.
On Ryzen 3500. for 400000000 {avxfpatan(x,y);}; do 1 sec 80 ms. for 400000000 {fpu fpatan(x,y);}; do 16 seconds |
|||
![]() |
|
macgub 10 Jul 2025, 15:38
Roman wrote: macgub loop in 50 iterations. Its very slow and bad solution. I make 50 iterations to achive larger radius of convergence. When arguments are in standard -pi to pi values, 50 iters are in fact, more than much. greets |
|||
![]() |
|
macgub 10 Jul 2025, 15:54
... and thanks for examples with no iterations.
|
|||
![]() |
|
< Last Thread | Next Thread > |
Forum Rules:
|
Copyright © 1999-2025, Tomasz Grysztar. Also on GitHub, YouTube.
Website powered by rwasa.