juanchen
Joined: 21 Jun 2008
Posts: 3

The following code optimizes the ddot (dot product) operation from the BLAS library.
Without any optimization, the DDOT function is:
/*
 * DDOT(A, n, x, y, rd): plain C dot product, rd = sum of x[i] * y[i] for i in [0, n).
 *   A  - unused by this macro (kept so the argument list is unchanged).
 *   n  - element count; n <= 0 leaves rd = 0.
 *   x, y - pointers to the two double arrays.
 *   rd - lvalue that receives the result.
 * The counter is decremented in the loop condition so the loop terminates
 * after exactly n products.
 */
#define DDOT(A,n,x,y,rd) { int _n_=(n); double *_x_=(x),*_y_=(y); (rd)=0; while(_n_-- > 0) (rd)+=*(_y_++) * *(_x_++); }
With SSE2 instructions, the DDOT function is written as follows.
For n=10, I followed the instructions step by step.
I don't understand:
(1) why n is divided by 2 at first ("shrl $1,%%edx").
(2) why andl operation is executed ("andl $0x07,%%edx").
(3) why n is compared with 3 ("cmp $3,%%edx").
(4) When execution reaches the label jj0"#X",
what does "shll $4,%%edx" mean?
Thank you very much!
The full code is:
/*
 * DDOT_SSE2(X, nv, xv, yv, res): SSE2 dot product,
 *     res = sum of xv[i] * yv[i] for i in [0, nv).
 *
 *   X      - token stringized into the assembler label names so that separate
 *            expansions get distinct labels; "%=" is appended as well so a
 *            copy of the same asm statement made by the compiler stays unique.
 *   nv     - element count; nv <= 0 yields res = 0.
 *   xv, yv - pointers to the double arrays.  Both are accessed with
 *            movapd / mulpd memory operands, so both must be 16-byte aligned.
 *   res    - double lvalue that receives the result.
 *
 * Structure (each xmm register holds one pair of doubles):
 *   1. pairs = (n / 2) % 8 leftover pairs are multiplied first by a small
 *      compare/branch tree, one product per register.
 *   2. The main loop then consumes n / 16 blocks of 8 pairs (16 elements),
 *      accumulating into xmm4..xmm7.
 *   3. The accumulators are reduced to a scalar and, when n is odd, the last
 *      single element is added with scalar instructions.
 *
 * The pointers and the count are modified by the code, so they are passed as
 * read-write operands on local copies; registers are chosen by the compiler
 * and are pointer-width, which lets the same text assemble for i386 and
 * x86-64.  x[] and y[] are read behind the compiler's back, hence the
 * "memory" clobber.  Requires GCC-compatible extended asm.
 */
#define DDOT_SSE2(X,nv,xv,yv,res) do {                                          \
    const double *_ddot_px_ = (xv);          /* running pointer into x */       \
    const double *_ddot_py_ = (yv);          /* running pointer into y */       \
    __PTRDIFF_TYPE__ _ddot_len_ = (nv);      /* evaluate nv exactly once */     \
    __SIZE_TYPE__ _ddot_cnt_ =                                                  \
        (_ddot_len_ > 0) ? (__SIZE_TYPE__)_ddot_len_ : 0;                       \
    __SIZE_TYPE__ _ddot_tmp_;                /* scratch (was %edx) */           \
    __asm__ __volatile__ (                                                      \
    /* Zero all eight accumulators / product registers. */                      \
    " xorps   %%xmm7,%%xmm7 \n\t"                                               \
    " mov     %[cnt],%[tmp] \n\t"                                               \
    " movapd  %%xmm7,%%xmm0 \n\t"                                               \
    " movapd  %%xmm7,%%xmm1 \n\t"                                               \
    " movapd  %%xmm7,%%xmm2 \n\t"                                               \
    " movapd  %%xmm7,%%xmm3 \n\t"                                               \
    " movapd  %%xmm7,%%xmm4 \n\t"                                               \
    " movapd  %%xmm7,%%xmm5 \n\t"                                               \
    " movapd  %%xmm7,%%xmm6 \n\t"                                               \
    /* tmp = n / 2 = number of pairs; & 7 keeps the pairs that do not fill */   \
    /* a whole 8-pair block of the main loop (0..7).                        */  \
    " shr     $1,%[tmp] \n\t"                                                   \
    " and     $0x07,%[tmp] \n\t"                                                \
    " je      jj1"#X"_%= \n\t"                                                  \
    /* At least one leftover pair: pair 0 -> xmm6. */                           \
    " movapd  (%[px]),%%xmm6 \n\t"                                              \
    " mulpd   (%[py]),%%xmm6 \n\t"                                              \
    /* Branch tree on the leftover count: >3, ==3, or <3. */                    \
    " cmp     $3,%[tmp] \n\t"                                                   \
    " jg      jjg3"#X"_%= \n\t"                                                 \
    " jne     jjg9"#X"_%= \n\t"                                                 \
    /* Exactly 3 pairs: pairs 1 and 2 -> xmm5, xmm4. */                         \
    " movapd  0x10(%[px]),%%xmm5 \n\t"                                          \
    " mulpd   0x10(%[py]),%%xmm5 \n\t"                                          \
    " movapd  0x20(%[px]),%%xmm4 \n\t"                                          \
    " mulpd   0x20(%[py]),%%xmm4 \n\t"                                          \
    " jmp     jj0"#X"_%= \n\t"                                                  \
    /* 1 or 2 pairs: only 2 needs one more product. */                          \
    "jjg9"#X"_%=: cmp $2,%[tmp] \n\t"                                           \
    " jl      jj0"#X"_%= \n\t"                                                  \
    " movapd  0x10(%[px]),%%xmm5 \n\t"                                          \
    " mulpd   0x10(%[py]),%%xmm5 \n\t"                                          \
    " jmp     jj0"#X"_%= \n\t"                                                  \
    /* 4..7 pairs: pairs 1..3 -> xmm5, xmm4, xmm3. */                           \
    "jjg3"#X"_%=: movapd 0x30(%[px]),%%xmm3 \n\t"                               \
    " movapd  0x20(%[px]),%%xmm4 \n\t"                                          \
    " mulpd   0x30(%[py]),%%xmm3 \n\t"                                          \
    " movapd  0x10(%[px]),%%xmm5 \n\t"                                          \
    " mulpd   0x20(%[py]),%%xmm4 \n\t"                                          \
    " mulpd   0x10(%[py]),%%xmm5 \n\t"                                          \
    " cmp     $5,%[tmp] \n\t"                                                   \
    " jg      jjg5"#X"_%= \n\t"                                                 \
    " jne     jj0"#X"_%= \n\t"                                                  \
    /* Exactly 5 pairs: pair 4 -> xmm2. */                                      \
    " movapd  0x40(%[px]),%%xmm2 \n\t"                                          \
    " mulpd   0x40(%[py]),%%xmm2 \n\t"                                          \
    " jmp     jj0"#X"_%= \n\t"                                                  \
    /* 6 or 7 pairs.  xmm1/xmm0 are reloaded by the main loop, so their */      \
    /* products are folded into xmm3/xmm2 right away.                   */      \
    "jjg5"#X"_%=: movapd 0x50(%[px]),%%xmm1 \n\t"                               \
    " mulpd   0x50(%[py]),%%xmm1 \n\t"                                          \
    " movapd  0x40(%[px]),%%xmm2 \n\t"                                          \
    " mulpd   0x40(%[py]),%%xmm2 \n\t"                                          \
    " addpd   %%xmm1,%%xmm3 \n\t"                                               \
    " cmp     $6,%[tmp] \n\t"                                                   \
    " je      jj0"#X"_%= \n\t"                                                  \
    " movapd  0x60(%[px]),%%xmm0 \n\t"                                          \
    " mulpd   0x60(%[py]),%%xmm0 \n\t"                                          \
    " addpd   %%xmm0,%%xmm2 \n\t"                                               \
    /* tmp = pairs * 16 = bytes consumed above; step both pointers past them. */\
    "jj0"#X"_%=: shl $4,%[tmp] \n\t"                                            \
    " add     %[tmp],%[py] \n\t"                                                \
    " add     %[tmp],%[px] \n\t"                                                \
    /* Keep the full n in tmp for the odd-element test; cnt = n / 16 blocks. */ \
    "jj1"#X"_%=: mov %[cnt],%[tmp] \n\t"                                        \
    " shr     $4,%[cnt] \n\t"                                                   \
    " je      jip3"#X"_%= \n\t"                                                 \
    " .p2align 4 \n\t"                                                          \
    /* Main loop: 16 multiply-adds per pass.  Pairs 0 and 1 are loaded at  */   \
    /* the top of each pass (not preloaded at the bottom of the previous   */   \
    /* one), so the final pass never reads beyond the block it processes.  */   \
    /* On entry xmm2/xmm3 hold products not yet accumulated (from the      */   \
    /* leftover code or the previous pass).                                */   \
    "jip2"#X"_%=: movapd (%[px]),%%xmm0 \n\t"                                   \
    " movapd  0x10(%[px]),%%xmm1 \n\t"                                          \
    " mulpd   (%[py]),%%xmm0 \n\t"                                              \
    " addpd   %%xmm2,%%xmm6 \n\t"                                               \
    " movapd  0x20(%[px]),%%xmm2 \n\t"                                          \
    " mulpd   0x10(%[py]),%%xmm1 \n\t"                                          \
    " addpd   %%xmm3,%%xmm7 \n\t"                                               \
    " movapd  0x30(%[px]),%%xmm3 \n\t"                                          \
    " mulpd   0x20(%[py]),%%xmm2 \n\t"                                          \
    " addpd   %%xmm0,%%xmm4 \n\t"                                               \
    " movapd  0x40(%[px]),%%xmm0 \n\t"                                          \
    " mulpd   0x30(%[py]),%%xmm3 \n\t"                                          \
    " addpd   %%xmm1,%%xmm5 \n\t"                                               \
    " movapd  0x50(%[px]),%%xmm1 \n\t"                                          \
    " mulpd   0x40(%[py]),%%xmm0 \n\t"                                          \
    " addpd   %%xmm2,%%xmm6 \n\t"                                               \
    " movapd  0x60(%[px]),%%xmm2 \n\t"                                          \
    " mulpd   0x50(%[py]),%%xmm1 \n\t"                                          \
    " addpd   %%xmm3,%%xmm7 \n\t"                                               \
    " movapd  0x70(%[px]),%%xmm3 \n\t"                                          \
    " mulpd   0x60(%[py]),%%xmm2 \n\t"                                          \
    " addpd   %%xmm0,%%xmm4 \n\t"                                               \
    " mulpd   0x70(%[py]),%%xmm3 \n\t"                                          \
    /* xmm1 products go to xmm5, matching the first half of the pass. */        \
    " addpd   %%xmm1,%%xmm5 \n\t"                                               \
    " add     $0x80,%[py] \n\t"                                                 \
    " add     $0x80,%[px] \n\t"                                                 \
    " dec     %[cnt] \n\t"                                                      \
    " jne     jip2"#X"_%= \n\t"                                                 \
    /* Fold the pending products and the four accumulators into xmm4. */        \
    "jip3"#X"_%=: addpd %%xmm2,%%xmm6 \n\t"                                     \
    " addpd   %%xmm3,%%xmm7 \n\t"                                               \
    " addpd   %%xmm5,%%xmm4 \n\t"                                               \
    " addpd   %%xmm6,%%xmm7 \n\t"                                               \
    " addpd   %%xmm7,%%xmm4 \n\t"                                               \
    /* Horizontal add: low half + high half -> low half of xmm0. */             \
    " movapd  %%xmm4,%%xmm0 \n\t"                                               \
    " shufpd  $1,%%xmm4,%%xmm4 \n\t"                                            \
    " addsd   %%xmm4,%%xmm0 \n\t"                                               \
    /* Odd n: one trailing element remains at the current pointers. */          \
    " and     $1,%[tmp] \n\t"                                                   \
    " je      jip4"#X"_%= \n\t"                                                 \
    " movsd   (%[px]),%%xmm1 \n\t"                                              \
    " mulsd   (%[py]),%%xmm1 \n\t"                                              \
    " addsd   %%xmm1,%%xmm0 \n\t"                                               \
    "jip4"#X"_%=: movsd %%xmm0,%[out] \n\t"                                     \
    : [out] "=m" (res),                                                         \
      [px]  "+&r" (_ddot_px_),                                                  \
      [py]  "+&r" (_ddot_py_),                                                  \
      [cnt] "+&r" (_ddot_cnt_),                                                 \
      [tmp] "=&r" (_ddot_tmp_)                                                  \
    :                                                                           \
    : "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","cc","memory" );  \
} while (0)
