Hi,
I'm using Delphi XE8 update 1 on Windows 8.1 x64.
I'm trying to convert a bilinear resize function to sse2.
The function:
procedure Bilinear32;
var
ix, iy: integer;
x, y, xdif, ydif: integer;
xp1, xp2, yp: integer;
wy, wyi, wx: integer;
w11, w21, w12, w22: integer;
sbBits, sbLine1, sbLine2: PByteArray;
smBits, smLine1, smLine2: PByteArray;
dbLine: PByteArray;
dmLine: ^byte;
sbLineDif, dbLineDif: integer;
smLineDif, dmLineDif: integer;
w: integer;
begin
y := 0;
xdif := (bmpInput.Width shl 16) div bmpOutput.Width;
ydif := (bmpInput.Height shl 16) div bmpOutput.Height;
sbBits := bmpInput.ScanLine[0];
if bmpInput.Height > 1 then
sbLineDif := NativeInt(bmpInput.ScanLine[1]) - NativeInt(sbBits)
else
sbLineDif := 0;
dbLine := bmpOutput.ScanLine[0];
if bmpOutput.Height > 1 then
dbLineDif := NativeInt(bmpOutput.ScanLine[1]) - NativeInt(dbLine) - 4 * bmpOutput.Width
else
dbLineDif := 0;
smBits := nil;
smLineDif := 0;
dmLine := nil;
dmLineDif := 0;
w := bmpInput.Width - 1;
for iy := 0 to bmpOutput.Height - 1 do
begin
yp := y shr 16;
NativeInt(sbLine1) := NativeInt(sbBits) + sbLineDif * yp;
NativeInt(smLine1) := NativeInt(smBits) + smLineDif * yp;
if yp < bmpInput.Height - 1 then
begin
NativeInt(sbLine2) := NativeInt(sbLine1) + sbLineDif;
NativeInt(smLine2) := NativeInt(smLine1) + smLineDif;
end
else
sbLine2 := sbLine1;
x := 0;
wy := y and $FFFF;
wyi := (not y) and $FFFF;
for ix := 0 to bmpOutput.Width - 1 do
begin
xp1 := x shr 16;
if xp1 < w then
xp2 := xp1 + 1
else
xp2 := xp1;
wx := x and $FFFF;
w21 := (wyi * wx) shr 16;
w11 := wyi - w21;
w22 := (wy * wx) shr 16;
w12 := wy - w22;
xp1 := xp1 * 4;
xp2 := xp2 * 4;
dbLine^[0] := (sbLine1[xp1] * w11 + sbLine1[xp2] * w21 + sbLine2[xp1] * w12 + sbLine2[xp2] * w22) shr 16;
dbLine^[1] := (sbLine1[xp1 + 1] * w11 + sbLine1[xp2 + 1] * w21 + sbLine2[xp1 + 1] * w12 + sbLine2[xp2 + 1] * w22) shr 16;
dbLine^[2] := (sbLine1[xp1 + 2] * w11 + sbLine1[xp2 + 2] * w21 + sbLine2[xp1 + 2] * w12 + sbLine2[xp2 + 2] * w22) shr 16;
dbLine^[3] := (sbLine1[xp1 + 3] * w11 + sbLine1[xp2 + 3] * w21 + sbLine2[xp1 + 3] * w12 + sbLine2[xp2 + 3] * w22) shr 16;
inc(NativeInt(dbLine), 4);
inc(dmLine);
inc(x, xdif);
end;
inc(NativeInt(dbLine), dbLineDif);
inc(NativeInt(dmLine), dmLineDif);
inc(y, ydif);
end;
end;
The code who takes most CPU time is this:
dbLine^[0] := (sbLine1[xp1] * w11 + sbLine1[xp2] * w21 + sbLine2[xp1] * w12 + sbLine2[xp2] * w22) shr 16;
dbLine^[1] := (sbLine1[xp1 + 1] * w11 + sbLine1[xp2 + 1] * w21 + sbLine2[xp1 + 1] * w12 + sbLine2[xp2 + 1] * w22) shr 16;
dbLine^[2] := (sbLine1[xp1 + 2] * w11 + sbLine1[xp2 + 2] * w21 + sbLine2[xp1 + 2] * w12 + sbLine2[xp2 + 2] * w22) shr 16;
dbLine^[3] := (sbLine1[xp1 + 3] * w11 + sbLine1[xp2 + 3] * w21 + sbLine2[xp1 + 3] * w12 + sbLine2[xp2 + 3] * w22) shr 16;
For now I tried to convert just the first line:
asm
mov eax,[sbline1]
mov edx,[xp1]
movzx ecx,[eax+edx]
movd xmm0, ecx //sbLine1[xp1]
mov edx,[xp2]
movzx ecx,[eax+edx]
movd xmm4, ecx //sbLine1[xp2]
movd xmm2, [w11]
movd xmm6, [w21]
pmuludq xmm0, xmm2 //sbLine1[xp1] * w11
pmuludq xmm4, xmm6 //sbLine1[xp2] * w21
addpd xmm0, xmm4 //sbLine1[xp1] * w11 + sbLine1[xp2] * w21
movd eax, xmm0
push eax //send sbLine1[xp1] * w11 + sbLine1[xp2] * w21 to stack
mov eax,[sbline2]
movzx ecx,[eax+edx]
movd xmm0, ecx //sbLine2[xp2]
mov edx,[xp1]
movzx ecx,[eax+edx]
movd xmm4, ecx //sbLine2[xp1]
movd xmm2, [w22]
movd xmm6, [w12]
pmuludq xmm0, xmm2 //sbLine2[xp2] * w22
pmuludq xmm4, xmm6 //sbLine2[xp1] * w12
addpd xmm0, xmm4 //sbLine2[xp2] * w22 + sbLine2[xp1] * w12
movd eax, xmm0
pop edx //get sbLine1[xp1] * w11 + sbLine1[xp2] * w21 from stack
add eax, edx //sbLine1[xp1] * w11 + sbLine1[xp2] * w21 + sbLine2[xp2] * w22 + sbLine2[xp1] * w12
shr eax,$10 //(sbLine1[xp1] * w11 + sbLine1[xp2] * w21 + sbLine2[xp2] * w22 + sbLine2[xp1] * w12) shr 16
mov edx,[dbLine]
mov [edx],al
end;
It gives the expected result but it's almost twice as slow.
What am I doing wrong?
Please help me.
Thank you in advance.