Hello! Help me to rewrite code, please!
Here, code part, that I rewrite NOW:
    
{$L Root16Viz.obj}
procedure Root16Viz(IterDat: array of Single; min, max, step: Single; size:integer; pal:array of TCol; out buffer:array of byte); register; external;
 
begin
        index := 0;
        x := 0;
        stepSSE.a:=1/((sqrt(sqrt(sqrt(sqrt(max))))-sqrt(sqrt(sqrt(sqrt(min)))))/4095.0);   Fill1SSE(stepSSE);
        minSSE.a:=sqrt(sqrt(sqrt(sqrt(min))));                       Fill1SSE(minSSE);
        While x<maxx*maxy do
         begin
            {tmpSSE.a := (IterDat[x]);
            tmpSSE.b := (IterDat[x-1]);
            tmpSSE.c := (IterDat[x-2]);
            tmpSSE.d := (IterDat[x-3]); }
            asm
              mov    eax, IterDat
              mov    edx,  [x]
              shl    edx,  2
              add    eax,  edx
              movups xmm0, [eax]
              sqrtps xmm0, xmm0
              sqrtps xmm0, xmm0
              sqrtps xmm0, xmm0
              sqrtps xmm0, xmm0
              movups xmm1, [minSSE]
              subps  xmm0, xmm1
              movups xmm1, [stepSSE]
              mulps  xmm0, xmm1
              movups [tmpSSE], xmm0
            end;
            ColorIndexSSE.a:=round(tmpSSE.a);
            ColorIndexSSE.b:=round(tmpSSE.b);
            ColorIndexSSE.c:=round(tmpSSE.c);
            ColorIndexSSE.d:=round(tmpSSE.d);
            if (ColorIndexSSE.a>4095) then ColorIndexSSE.a:=0;
            if (ColorIndexSSE.b>4095) then ColorIndexSSE.b:=0;
            if (ColorIndexSSE.c>4095) then ColorIndexSSE.c:=0;
            if (ColorIndexSSE.d>4095) then ColorIndexSSE.d:=0;
            if (IterDat[x]<=1)   then ColorIndexSSE.a:=0;
            if (IterDat[x+1]<=1) then ColorIndexSSE.b:=0;
            if (IterDat[x+2]<=1) then ColorIndexSSE.c:=0;
            if (IterDat[x+3]<=1) then ColorIndexSSE.d:=0;
            buffer[index]     :=  pal[ColorIndexSSE.a].b;
            buffer[index+1]   :=  pal[ColorIndexSSE.a].g;
            buffer[index+2]   :=  pal[ColorIndexSSE.a].r;
            buffer[index+4]   :=  pal[ColorIndexSSE.b].b;
            buffer[index+5]   :=  pal[ColorIndexSSE.b].g;
            buffer[index+6]   :=  pal[ColorIndexSSE.b].r;
            buffer[index+8]   :=  pal[ColorIndexSSE.c].b;
            buffer[index+9]   :=  pal[ColorIndexSSE.c].g;
            buffer[index+10]  :=  pal[ColorIndexSSE.c].r;
            buffer[index+12]  :=  pal[ColorIndexSSE.d].b;
            buffer[index+13]  :=  pal[ColorIndexSSE.d].g;
            buffer[index+14]  :=  pal[ColorIndexSSE.d].r;
            inc(index, 16);
            inc(x,4);
          end;
      end;
     
Here MS COFF file:
    
format MS COFF
public Root16Viz
extrn IterDat
extrn min
extrn max
extrn step
extrn size
extrn palette
extrn buffer
sz rd 1
ones dd  1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0
zeroes dd  0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
Root16Viz:
  mov edi,  [size]
  sub edi,  4
  mov [sz], edi
  mov ecx,  0
  mov edi,  0
  mov eax,  [IterDat]
  mov edx,  [buffer]
  mov ebx,  [palette]
  vmovups ymm1, [min]
  vmovups ymm2, [step]
  vshufps ymm1, ymm1, ymm1, 0   ;ðàçìíîæàåì min
  vshufps ymm2, ymm2, ymm1, 0   ;è step ïî âñåìó ðåãèñòðó
  fx:                      ;ëóï
    vmovups     ymm0, [eax]
    vsqrtps     ymm0, ymm0
    vsqrtps     ymm0, ymm0
    vsqrtps     ymm0, ymm0
    vsqrtps     ymm0, ymm0
    vsubps      ymm0, ymm1, ymm0  ;ymm0 = ymm0 - ymm1
    vmulps      ymm0, ymm2, ymm0  ;ymm0 = ymm0 * ymm2
    vcmpeqps    ymm0, ymm0, yword[ones]
                                        ; ymm1 has 1.0 mask 
    vpandn      ymm0, ymm1, ymm0                   ; zero 1.0 cells
    vpand       ymm1, ymm1, yword[zeroes]             ; create 0.0 cells
    vpor        ymm0, ymm0, ymm1
    vmovupb     ymm1, [ebx]
    cmp         ecx, sz           ;Åñëè ecx<=size-4
  jne fx                          ;òî ïèëèì öèêë!
ret 8    
 
Help me to rewrite last part with buffer!!! Please... How to load and compare params in fastest way?