sse or avx implementation tangent & arctangent.

Index > Main > sse or avx implementation tangent & arctangent.

Author

Thread

Roman

Joined: 21 Apr 2012
Posts: 1978

Roman 09 Jul 2025, 12:50

fasmw 1.73
FPATAN is much slower than a good software implementation, even including function-call overhead. 150-300 cycles.

How write on sse or avx implementation tangent & arctangent ?

arctan(1) = tan-1(1) = 45°

Last edited by Roman on 10 Jul 2025, 08:16; edited 1 time in total

09 Jul 2025, 12:50

macgub

Joined: 11 Jan 2006
Posts: 356
Location: Poland

macgub 10 Jul 2025, 06:47

Hi,
This is my implementation of sinus, cosinus and arcus tangent.
Arc tg is calculated for two input values.

Code:

sin_cos:
; Calculating trigonometric functions
; Taylor series implementation
;  in:
;    xmm0 - 2 angles floats (radians); lowest half  x, y
;  out:
;    xmm0 - sinx cosx atanx atany 4 dwords float
      ; sin  x = x - x^3/3! + x^5/5! -  ...
      ; cos  x = 1 - x^2/2! + x^4/4! -  ...
      ; atan x = x - x^3/3  + x^5/5  -  ...
      pushad
      movaps    xmm7,xmm0
      pcmpeqd   xmm5,xmm5
      movhps    xmm7,[the_one]
      shufps    xmm7,xmm7,01001000b ; xm7 = lo -> hi  x, 1.0, x, y
      xorps     xmm3,xmm3
      mov       ecx,3
      mulps     xmm0,xmm0           ; x^2
      pslld     xmm5,31
      movaps    xmm1,xmm7
      shufps    xmm0,xmm0,0
      ; xm7 lo - hi x,   1.0, x,   y
      ; xm1 lo - hi x,   1.0, x,   y
      ; xm0 lo - hi x^2, x^2, x^2, x^2
      mov       edx,1
     @@:
      mulps     xmm1,xmm0           ; lo - hi x^3  x^2
      mov       ebx,ecx
      mov       eax,ecx
      dec       ebx
      sub       eax,2
      push      edx eax ebx ecx
      movups    xmm2,[esp]
      cvtdq2ps  xmm2,xmm2           ; 3, 2, 1      |   5, 4, 3
      movaps    xmm4,xmm2
      add       esp,16
      shufps    xmm4,xmm4,00001001b
      shufps    xmm2,xmm2,11110100b
      mulps     xmm4,xmm2           ; lo -> hi = 3*2, 1*2
      rcpps     xmm4,xmm4
      mulps     xmm1,xmm4           ; x^2/2!      |   x^4/4!
      movaps    xmm6,xmm1
      xorps     xmm6,xmm3           ; -x^3/3!     |   x^5/5!
      subps     xmm7,xmm6
      xorps     xmm3,xmm5 ;[sign_mask]
      add       ecx,2
      cmp       ecx,100
      jb        @b
      movaps    xmm0,xmm7
 ;    xmm0 - sinx cosx atanx atany  4 dwords float
      popad
ret
data section:
the_one: 
 dd 1.0, 1.0

10 Jul 2025, 06:47

Roman

Joined: 21 Apr 2012
Posts: 1978

Roman 10 Jul 2025, 07:41

macgub loop in 50 iterations. Its very slow and bad solution.
Your code speed circa almost like fpatan.

https://wikimedia.org/api/rest_v1/media/math/render/svg/e4ecfb141505d3e4819878a3db1b78bd1d2a9518
if х >=0 than arctg x = acos(1/sqrt(1+x*x))
if х <0 than arctg x = -acos(1/sqrt(1+x*x))

-acos(1/sqrt(1+x*x))=3.141592-acos(1/sqrt(1+x*x))

replace 1/sqrt(1+x*x) to rsqrt(1+x*x).
rsqrtss faster than sqrtss.

I found avx fast acos

Code:

tabACos dd -0.0,1.0,0.141592654,\     ; pi-3
    3.0,1.17123365,0.745630503,0.329797238,0.294789612,1.30136144,\
    0.930340230,0.0114133256,0.0189007167
@ACos:
    mov edx,tabACos                     ;xmm0=x
    vucomiss xmm0,[edx]             ; Set cf if x<0
    vorps xmm0,xmm0,[edx]           ; xmm0 = -|x|
    vmovups xmm3,[edx+16]           ; xmm3 = b2: b0 : a2 : a0
    vshufps xmm1,xmm0,xmm0,0        ; xmm1 = -|x| : -|x| : -|x| : -|x|
    vaddss xmm0,xmm0,[edx+4]        ; xmm0 = 1-|x|
    vfnmadd231ps xmm3,xmm1,[edx+32] ; xmm3 = b2+b3*|x|=q1 : a2+a3*|x|=p1 : b0+b1*|x|=q0 : a0+a1*|x|=p0
    vmulps xmm2,xmm1,xmm1           ; xmm2 = x^2 : x^2 : x^2 : x^2
    vmovhlps xmm1,xmm3,xmm3         ; xmm1 = b2+b3*|x|=q1 : a2+a3*|x|=p1
    vfmadd132ps xmm1,xmm3,xmm2      ; xmm1 = q0+q1*x^2=q : p0+p1*x^2=p
    vsqrtss xmm0,xmm0,xmm0          ; xmm0 = (1-|x|)^0.5
    vmovshdup xmm2,xmm1             ; xmm2 = q
    vdivss xmm1,xmm1,xmm2           ; xmm1 = p/q = f(t)
    jnc arccos_plus                 ; Goto if x>=0
    vfnmadd213ss xmm0,xmm1,[edx+8]  ; xmm0 = pi-3-(1+x)^0.5*p/q
    vaddss xmm0,xmm0,[edx+12]       ; xmm0 = pi-(1+x)^0.5*p/q
    ret
arccos_plus:                  ; Case of x>=0
    vmulss xmm0,xmm0,xmm1           ; xmm0 = (1-x)^0.5*p/q
    ret

;in code I tested
mov eax,0.8
movd xmm0,eax
call @ACos
;I get in xmm0=0.64350. 
;online internet calculator acos( 0.8 )=0.64350

10 Jul 2025, 07:41

Roman

Joined: 21 Apr 2012
Posts: 1978

Roman 10 Jul 2025, 08:32

good information.
https://mazzo.li/posts/vectorized-atan2.html
https://stackoverflow.com/questions/42537957/fast-accurate-atan-arctan-approximation-algorithm

Code:

@ct2 dd 6.28740248E-17,\          ; a0*(a1+b0)=a0*c1
    4.86816205E-17,\             ; a0*a1
    2.24874633E-18,\            ; a0*(a3+b2)=a0*c3
    4.02179944E-19,\            ; a0*a3
    4.25772129E-17,\            ; a0
    4.25772129E-17,\            ; a0
    2.50182216E-17,\            ; a0*(a2+b1)=a0*c2
    1.25756219E-17,\            ; a0*a2
    0.0707963258,\              ;(pi-3)/2
    1.5,\                       ; 3/2
    1.0,\                       ; 1
    -0.0707963258,\             ; -(pi-3)/2
    -1.5,\                      ; -3/2
    3.95889818E15              ; Threshold of x^2 when arctg(x)=pi/2*sgn(x)

arctg:
    vshufps xmm1,xmm0,xmm0,0    ; xmm1 = x # x : x # x
    mov edx,       @ct2         ; edx contains the address of constants table
    vmulps xmm2,xmm1,xmm1       ; xmm2 = x^2 # x^2 : x^2 # x^2
    vmovups xmm3,[edx+16]       ; xmm3 = a0*a2 # a0*c2 : a0 # a0
    vmulps xmm4,xmm2,xmm2       ; xmm4 = y^2 # y^2 : y^2 # y^2
    vucomiss xmm2,[edx+40]      ; Compare y=x^2 to 1
    ja arctg_big2                ; Jump if |x|>1
    vfmadd231ps xmm3,xmm2,[edx] ; xmm3 ~ a3*y+a2 # c3*y+c2 : a1*y+1 # c1*y+1
    vmovhlps xmm1,xmm1,xmm3     ; xmm1 ~ a3*y+a2 # c3*y+c2
    vfmadd231ps xmm3,xmm4,xmm1  ; xmm3 ~ a3*y^3+a2*y^2+a1*y+1 # c3*y^3+c2*y^2+c1*y+1
    vmovshdup xmm2,xmm3         ; xmm2 = P; xmm3 = Q
    vdivss xmm2,xmm2,xmm3       ; xmm2 = P/Q
    vmulss xmm0,xmm0,xmm2       ; xmm0 = x*P/Q = arctg(x)
    ret                         ; Return
      arctg_big2:                ; When |x|>1 use formula pi/2*sgn(x)-arctg(1/x)
    vfmadd213ps xmm3,xmm2,[edx] ; xmm3 ~ a2*y+a3 # c2*y+c3 : y+a1 # y+c1
    vmovmskpd eax,xmm1          ; eax=3 if x<0, otherwise eax=0
    vmovhlps xmm0,xmm0,xmm3     ; xmm0 ~ a2*y+a3 # c2*y+c3
    vfmadd213ps xmm3,xmm4,xmm0  ; xmm3 ~ y^3+a1*y^2+a2*y+a3 # y^3+c1*y^2+c2*y+c3
    vmovss xmm0,[edx+4*eax+32]  ; xmm0 = (pi-3)/2*sgn(x)
    vucomiss xmm2,[edx+52]      ; Compare y=x^2 to threshold value
    jnb arctg_end3               ; The data is already in xmm0, if |x|>=62919776
    vmovshdup xmm4,xmm3         ; xmm4 = P; xmm3 = Q
    vmulss xmm1,xmm1,xmm3       ; xmm1 = x*Q
    vfmsub132ss xmm0,xmm4,xmm1  ; xmm0 = (pi-3)/2*|x|*Q-P
    vdivss xmm0,xmm0,xmm1       ; xmm0 = (pi-3)/2*sgn(x)-P/(x*Q)
      arctg_end3:                ; Add to result 3/2*sgn(x)
    vaddss xmm0,xmm0,[edx+4*eax+36] ; xmm0 = pi/2*sgn(x)-P/(x*Q)
    ret         

;in code I tested
mov eax,11.8
movd xmm0,eax
call arctg
;in xmm0 = 1.4862525
;online calculator arctang(11.8 ) = 1.4862525

Last edited by Roman on 10 Jul 2025, 10:29; edited 6 times in total

10 Jul 2025, 08:32

Roman

Joined: 21 Apr 2012
Posts: 1978

Roman 10 Jul 2025, 08:43

instead FPATAN FPU

Code:

;signbit return true if value <0. if value =>0 return false
float fast_atan2_alt(float y, float x) {
  float ax = fabsf(x);
  float ay = fabsf(y);
  float a;
  if (ax > ay) {
    a = ay / ax;
  } else if (ay > ax) {
    a = ax / ay;
  } else {
    // ax and ay have the same value.
    a = ax > 0; // or just 1.0f if OK to have this function act like atan2(0,0)
  }

  float s = a * a;
  float r = ((-0.0447585656f * s + 0.157265460f) * s - 0.327108731f) * s * a + a;

  if (ay > ax)
    r = (float) M_PI_2 - r;
  if (signbit(x))
    r = (float) M_PI - r;
  if (signbit(y))
    r = -r;
  return r;
}

10 Jul 2025, 08:43

Roman

Joined: 21 Apr 2012
Posts: 1978

Roman 10 Jul 2025, 10:26

Code:

align 64
avxpatan: 
        vmovmskps ecx,xmm0              ; ecx=1 if y<=0; ecx=0 if y>=0
        vmovmskps eax,xmm1              ; eax=1 if x<=0, eax=0 if x>=0
        vdivss xmm0,xmm0,xmm1           ; xmm0 = y/x - prepare the argument of arctangent
        and ecx,1                       ; Only bit 0 of ecx is needed
        and eax,1                       ; Only bit 0 of eax is needed
        bts ecx,eax                     ; ecx=2 or 3 if x<=0, ecx=1 if x>=0
        call arctgDegrees               ; Calculate arctangent in degrees from y/x
        vaddss xmm0,xmm0,[@ct3-4+4*ecx] ; If neccessary, to add or subtract 180
        ret
align 16
@ct3 dd -0.0,180.0,-180.0
@ct20 dd   1.92582580E-14,\          ; a0*(a1+b0)=a0*c1
    8.54345240E-13,\                ; a0*a1 (in degrees)
    6.88789034E-16,\                ; a0*(a3+b2)=a0*c3
    7.05811633E-15,\                ; a0*a3 (in degrees)
    1.30413631E-14,\                ; a0
    7.47215061E-13,\                ; a0 (in degrees)
    7.66305964E-15,\                ; a0*(a2+b1)=a0*c2
    2.20697728E-13,\                ; a0*a2 (in degrees)
    90.0,\                          
    1.0,\                           
    2.25592738E14,\                 ; Threshold of x^2 when arctgD(x)=90*sgn(x)
    -90.0            
align 64
arctgDegrees:    
    vshufps xmm1,xmm0,xmm0,0        ; xmm1 = x # x : x # x
    mov edx,@ct20                      ; edx contains the address of constants table
    vmulps xmm2,xmm1,xmm1           ; xmm2 = x^2 # x^2 : x^2 # x^2
    vmovups xmm3,[edx+16]           ; xmm3 = a0*a2 # a0*c2 : a0 # a0
    vmulps xmm4,xmm2,xmm2           ; xmm4 = y^2 # y^2 : y^2 # y^2
    vucomiss xmm2,[edx+36]          ; Compare y=x^2 to 1
    ja arctg_big                    ; Goto if |x|>1
    vfmadd231ps xmm3,xmm2,[edx]     ; xmm3 ~ a3*y+a2 # c3*y+c2 : a1*y+1 # c1*y+1
    vmovhlps xmm1,xmm1,xmm3         ; xmm1 ~ a3*y+a2 # c3*y+c2
    vfmadd231ps xmm3,xmm4,xmm1      ; xmm3 ~ a3*y^3+a2*y^2+a1*y+1 # c3*y^3+c2*y^2+c1*y+1
    vmovshdup xmm2,xmm3             ; xmm2 = P; xmm3 = Q
    vdivss xmm2,xmm2,xmm3           ; xmm2 = P/Q
    vmulss xmm0,xmm0,xmm2           ; xmm0 = x*P/Q = arctgD(x)
    ret                             
arctg_big:                          ; When |x|>1 use formula 90*sgn(x)-arctgD(1/x)
    vfmadd213ps xmm3,xmm2,[edx]     ; xmm3 ~ a2*y+a3 # c2*y+c3 : y+a1 # y+c1
    vmovmskpd eax,xmm1              ; eax=3 if x<0, otherwise eax=0
    vmovhlps xmm0,xmm0,xmm3         ; xmm0 ~ a2*y+a3 # c2*y+c3
    vfmadd213ps xmm3,xmm4,xmm0      ; xmm3 ~ y^3+a1*y^2+a2*y+a3 # y^3+c1*y^2+c2*y+c3
    vmovss xmm0,[edx+4*eax+32]      ; xmm0 = 90*sgn(x)
    vcomiss xmm2,[edx+40]           ; Compare y=x^2 to threshold value
    jnb arctg_end                   ; If |x|>=15019745 result already done
    vmovshdup xmm4,xmm3             ; xmm4 = P; xmm3 = Q
    vmulss xmm1,xmm1,xmm3           ; xmm1 = x*Q
    vdivss xmm4,xmm4,xmm1           ; xmm4 = P/(x*Q)
    vsubss xmm0,xmm0,xmm4           ; xmm0 = 90*sgn(x)-P/(x*Q)
arctg_end:                          ; xmm0 = arctgD(x)
    ret                            

;in code do
                mov eax,-11.8 ;y
                movd xmm0,eax
                mov eax,2.8 ;x
                movd xmm1,eax
                call avxpatan   ;in xmm0 angle in degrees. I get -76.6512
;check on fpu fpatan
tmp_atan2 dq -11.8,2.8

                finit
                fld    qword [tmp_atan2]
                fld    qword [tmp_atan2+8]
                fpatan ;in st0 I get -1.337817 in radians 
;-1.337817*(180/3.141592)=-76.6512 in degrees
;-76.6512*0.017453288889= radians -1.337815

10 Jul 2025, 10:26

Roman

Joined: 21 Apr 2012
Posts: 1978

Roman 10 Jul 2025, 11:54

avxpatan faster 15 times than fpu fpatan.
On Ryzen 3500.
for 400000000 {avxfpatan(x,y);}; do 1 sec 80 ms.
for 400000000 {fpu fpatan(x,y);}; do 16 seconds

10 Jul 2025, 11:54

macgub

Joined: 11 Jan 2006
Posts: 356
Location: Poland

macgub 10 Jul 2025, 15:38

Roman wrote:

macgub loop in 50 iterations. Its very slow and bad solution.
Your code speed circa almost like fpatan.

I make 50 iterations to achive larger radius of convergence. When arguments are in standard -pi to pi values, 50 iters are in fact, more than much.
greets

10 Jul 2025, 15:38

macgub

Joined: 11 Jan 2006
Posts: 356
Location: Poland

macgub 10 Jul 2025, 15:54

... and thanks for examples with no iterations.

10 Jul 2025, 15:54

< Last Thread | Next Thread >

Forum Rules:

You cannot post new topics in this forum
You cannot reply to topics in this forum
You cannot edit your posts in this forum
You cannot delete your posts in this forum
You cannot vote in polls in this forum
You cannot attach files in this forum
You can download files in this forum