converting Delphi bilinear resize function to SSE2

Hi,

I'm using Delphi XE8 update 1 on Windows 8.1 x64.

I'm trying to convert a bilinear resize function to sse2.

The function:

Code:

   procedure Bilinear32;
   var
      ix, iy: integer;
      x, y, xdif, ydif: integer;
      xp1, xp2, yp: integer;
      wy, wyi, wx: integer;
      w11, w21, w12, w22: integer;
      sbBits, sbLine1, sbLine2: PByteArray;
      smBits, smLine1, smLine2: PByteArray;
      dbLine: PByteArray;
      dmLine: ^byte;
      sbLineDif, dbLineDif: integer;
      smLineDif, dmLineDif: integer;
      w: integer;
   begin
      y := 0;
      xdif := (bmpInput.Width shl 16) div bmpOutput.Width;
      ydif := (bmpInput.Height shl 16) div bmpOutput.Height;
      sbBits := bmpInput.ScanLine[0];
      if bmpInput.Height > 1 then
         sbLineDif := NativeInt(bmpInput.ScanLine[1]) - NativeInt(sbBits)
      else
         sbLineDif := 0;
      dbLine := bmpOutput.ScanLine[0];
      if bmpOutput.Height > 1 then
         dbLineDif := NativeInt(bmpOutput.ScanLine[1]) - NativeInt(dbLine) - 4 * bmpOutput.Width
      else
         dbLineDif := 0;
      smBits := nil;
      smLineDif := 0;
      dmLine := nil;
      dmLineDif := 0;
      w := bmpInput.Width - 1;
      for iy := 0 to bmpOutput.Height - 1 do
      begin
         yp := y shr 16;
         NativeInt(sbLine1) := NativeInt(sbBits) + sbLineDif * yp;
         NativeInt(smLine1) := NativeInt(smBits) + smLineDif * yp;
         if yp < bmpInput.Height - 1 then
         begin
            NativeInt(sbLine2) := NativeInt(sbLine1) + sbLineDif;
            NativeInt(smLine2) := NativeInt(smLine1) + smLineDif;
         end
         else
            sbLine2 := sbLine1;
         x := 0;
         wy := y and $FFFF;
         wyi := (not y) and $FFFF;
         for ix := 0 to bmpOutput.Width - 1 do
         begin
            xp1 := x shr 16;
            if xp1 < w then
               xp2 := xp1 + 1
            else
               xp2 := xp1;
            wx := x and $FFFF;
            w21 := (wyi * wx) shr 16;
            w11 := wyi - w21;
            w22 := (wy * wx) shr 16;
            w12 := wy - w22;

            xp1 := xp1 * 4;
            xp2 := xp2 * 4;

            dbLine^[0] := (sbLine1[xp1] * w11 + sbLine1[xp2] * w21 + sbLine2[xp1] * w12 + sbLine2[xp2] * w22) shr 16;
            dbLine^[1] := (sbLine1[xp1 + 1] * w11 + sbLine1[xp2 + 1] * w21 + sbLine2[xp1 + 1] * w12 + sbLine2[xp2 + 1] * w22) shr 16;
            dbLine^[2] := (sbLine1[xp1 + 2] * w11 + sbLine1[xp2 + 2] * w21 + sbLine2[xp1 + 2] * w12 + sbLine2[xp2 + 2] * w22) shr 16;
            dbLine^[3] := (sbLine1[xp1 + 3] * w11 + sbLine1[xp2 + 3] * w21 + sbLine2[xp1 + 3] * w12 + sbLine2[xp2 + 3] * w22) shr 16;

            inc(NativeInt(dbLine), 4);
            inc(dmLine);
            inc(x, xdif);
         end;
         inc(NativeInt(dbLine), dbLineDif);
         inc(NativeInt(dmLine), dmLineDif);
         inc(y, ydif);
      end;
   end;

The code who takes most CPU time is this:

Code:

            dbLine^[0] := (sbLine1[xp1] * w11 + sbLine1[xp2] * w21 + sbLine2[xp1] * w12 + sbLine2[xp2] * w22) shr 16;
            dbLine^[1] := (sbLine1[xp1 + 1] * w11 + sbLine1[xp2 + 1] * w21 + sbLine2[xp1 + 1] * w12 + sbLine2[xp2 + 1] * w22) shr 16;
            dbLine^[2] := (sbLine1[xp1 + 2] * w11 + sbLine1[xp2 + 2] * w21 + sbLine2[xp1 + 2] * w12 + sbLine2[xp2 + 2] * w22) shr 16;
            dbLine^[3] := (sbLine1[xp1 + 3] * w11 + sbLine1[xp2 + 3] * w21 + sbLine2[xp1 + 3] * w12 + sbLine2[xp2 + 3] * w22) shr 16;

For now I tried to convert just the first line:

Code:

            asm
               mov       eax,[sbline1]
               mov       edx,[xp1]
               movzx     ecx,[eax+edx]
               movd      xmm0, ecx                //sbLine1[xp1]

               mov       edx,[xp2]
               movzx     ecx,[eax+edx]
               movd      xmm4, ecx                //sbLine1[xp2]

               movd      xmm2, [w11]
               movd      xmm6, [w21]

               pmuludq   xmm0, xmm2               //sbLine1[xp1] * w11
               pmuludq   xmm4, xmm6               //sbLine1[xp2] * w21

               addpd     xmm0, xmm4              //sbLine1[xp1] * w11 + sbLine1[xp2] * w21

               movd      eax, xmm0
               push      eax                     //send  sbLine1[xp1] * w11 + sbLine1[xp2] * w21  to stack

               mov       eax,[sbline2]
               movzx     ecx,[eax+edx]
               movd      xmm0, ecx                //sbLine2[xp2]

               mov       edx,[xp1]
               movzx     ecx,[eax+edx]
               movd      xmm4, ecx                //sbLine2[xp1]

               movd      xmm2, [w22]
               movd      xmm6, [w12]

               pmuludq   xmm0, xmm2               //sbLine2[xp2] * w22
               pmuludq   xmm4, xmm6               //sbLine2[xp1] * w12

               addpd     xmm0, xmm4              //sbLine2[xp2] * w22 + sbLine2[xp1] * w12

               movd      eax, xmm0

               pop       edx                     //get sbLine1[xp1] * w11 + sbLine1[xp2] * w21 from stack

               add       eax, edx                //sbLine1[xp1] * w11 + sbLine1[xp2] * w21   +   sbLine2[xp2] * w22 + sbLine2[xp1] * w12

               shr       eax,$10                 //(sbLine1[xp1] * w11 + sbLine1[xp2] * w21   +   sbLine2[xp2] * w22 + sbLine2[xp1] * w12)   shr   16
               mov       edx,[dbLine]
               mov       [edx],al
            end;

It gives the expected result but it's almost twice as slow.

What am I doing wrong?
Please help me.

Thank you in advance.