flat assembler
Message board for the users of flat assembler.

 Index > Main > Sorting Algorithm, RadixSort Goto page 1, 2  Next
Author
r22

Joined: 27 Dec 2004
Posts: 805
r22
I'm looking for the fastest unsigned integer array sorter.
If anyone has an implementation of QuickSort to test against this RadixSort I'd be very interested in the results.

It's not fully optimized yet, but the implementation is solid and still very fast.

For those that don't know what radix sort is here's a high level representation of the algorithm. (in Java)
Code:
```///////////////////////////////
///////////////////////////////
private int[] sourceArray;
private int[] targetArray;
private int[] counts;
private int keyLength=4;
{
counts = new int[256];
sourceArray = arr;
targetArray = new int[arr.length];
int soFar,keyByte,to,temp,bitShift;
int[] tempA;
//outer loop 4 times because 4words in a 32bit int
for ( int col=keyLength-1; col>=0; col-- )
{
soFar = 0;
bitShift = radix2LUT[col];//either 0,8,16 or 24
//zeroing out the count[0to255]=0
for (int x = 0; x<256; x=x+8)
{//unrolled for loop speedup
counts[x] = counts[x+1] = counts[x+2] = counts[x+3] =
counts[x+4] = counts[x+5] = counts[x+6] = counts[x+7] = 0;
}
//counting how many of each key
//this is what avoids using queues
for ( int i=0; i<sourceArray.length; i++ )
{
counts[((sourceArray[i] >> bitShift) & 0xFF)]++;
}
//manipulate the count's into indexes into a shadow array
for ( int i=0; i<counts.length; i=i+8 )
{//unrolled for speed
temp = counts[i];counts[i] = soFar;soFar += temp;
temp = counts[i+1];counts[i+1] = soFar;soFar += temp;
temp = counts[i+2];counts[i+2] = soFar;soFar += temp;
temp = counts[i+3];counts[i+3] = soFar;soFar += temp;
temp = counts[i+4];counts[i+4] = soFar;soFar += temp;
temp = counts[i+5];counts[i+5] = soFar;soFar += temp;
temp = counts[i+6];counts[i+6] = soFar;soFar += temp;
temp = counts[i+7];counts[i+7] = soFar;soFar += temp;
}
//copy and rearrange the array
for ( int from=0; from<sourceArray.length; from++ )
{
keyByte = ((sourceArray[from] >> bitShift) & 0xFF);
to = counts[keyByte]++;
targetArray[to] = sourceArray[from];
}
//flip the array pointers around
tempA = sourceArray;
sourceArray = targetArray;
targetArray = tempA;
}
//copy the sorted array into the array passed to the method
//System.arraycopy(sourceArray,0, arr,0, sourceArray.length);
}
```

Here's the current fasm project I'm using,
the DATA section is a little messy but the code is straight forward.

I create a 10,000,000 DWORD long array
Fill it with Random 32bit integers
And have benchmarking loops setup to test the speed of RadixSort vs WHATEVER you guys come up with to help me out.

If your going to test your quick sort implementation make sure you call it first in the benchmarking loops (WHY?) because optimized QuickSort algorithms have early stop checks, so if you let the radix sort sort the array first QuickSort will see that it's sorted already and stop early. Or you could just call the FillArray function before the loop starts for both of them.

Without further babbling the code.
Code:
```format PE console 4.0
entry start
include '%fasminc%\win32a.inc'

result1 db 'Result 1: %d:',0
result2 db 'Result 2: %d:',0
buffer rb 255

align 16
RandomSeed dd 1318699, 1015727, 1235239, 412943

paramA dd 0
paramB dd 0
align 8
dVal dq 4294967295.0

fmtr db 'test: %d',0
tmp dq 0
align 16
fixer dw 0fc01h,0,0,0,0,0,0,0

fmtf db '%d . %d',0
dbgt dq 0

fmth db '%x %x %x',0
str1 dd 0,0,0,0
fhex db '%x  > %x  ',0

ttest db '%x : %x : %x',10,13,0

align 16
sseseed dq 1111111111111111h
dq 12134567778abbf0h
dq 0ababcabababdabah
dq 7654321546174353h
_fmth db ' %X ',0

RandomArray dd 0
TestArray dd 1000BBBBh, 99AABBCCh, 1000BBB1h, 100h, 250h
start:

call MakeSeed
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
push    40000000 ;;40million bytes 10million DWORDS
call    MakeArray
mov     [RandomArray], eax
push    10000000 ;;DWORD size
push    eax ;;ARRAY MEM
call    FillArray

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
call    [GetCurrentProcess] ;returns -1
push    100h
push    eax
call    [SetPriorityClass]
push    15
push    eax

push 0
push 0
push 0
push 0
call [MessageBox]
;---------------------------------------
mov esi,01h
call [GetTickCount]
mov edi,eax
jmp tst1
align 16
tst1:
push    10000000
push    dword[RandomArray]
dec esi
jnz tst1
push eax
push _fmth
call [printf]
call [GetTickCount]
sub eax,edi
push eax
push result1
call [printf]

mov esi,01h
call [GetTickCount]
mov edi,eax
jmp tst2
align 16
tst2:
push    10000000
push    dword[RandomArray]
dec esi
jnz tst2
push eax
push _fmth
call [printf]
call [GetTickCount]
sub eax,edi
push eax
push result2
call [printf]
;+++++++++++++++++++++++++++++++++++++++++++++
push 0
push buffer
push buffer
push 0
call [MessageBox]
push 0
call [ExitProcess]

;;IN size
mov ecx,[esp+8];;size DWORDs
push ebp
push ebx
push esi
push edi
mov esi,[esp+20];;array
mov ebp,ecx ;;copy sizeDWORDs
shl ecx,2;;*4 sizeBYTEs
push ecx ;;size
call MakeArray
mov edx,ebp ;;copy back dword size
sub esp,256 * 4 ;;count array
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;ITERATION 1
;;;;;;;;;;;;;;;;;;;;zero count array
xor eax,eax
mov ecx,255
.lpc1:
mov dword[esp+ecx*4],eax
mov dword[esp+ecx*4-4],eax
mov dword[esp+ecx*4-8],eax
mov dword[esp+ecx*4-12],eax
mov dword[esp+ecx*4-16],eax
sub ecx,5
jnz .lpc1
mov dword[esp],eax
;++++++++++++++++++++++++
;;;;;;;;;;;;;;;;;;;get counts
mov ecx,edx ;;dword size -1
dec ecx
.lpg1:
movzx eax,byte[esi+ecx*4] ;;lowest byte first
inc dword[esp+eax*4]
dec ecx
jns .lpg1

;++++++++++++++++++++++++++++
;;;;;change counts to indexes into shadow array
xor ecx,ecx
xor ebx,ebx
.lpm1:
mov eax, dword[esp+ecx*4]
mov dword[esp+ecx*4], ebx
cmp ecx,256
jne .lpm1
;++++++++++++++++++++++++++++++++++
;;;;;;;;;;;;;;;;;;;;;; first copy
xor ecx,ecx
.lpr1:
movzx eax, byte[esi+ecx*4] ;; first byte
mov ebx, dword[esp+eax*4]
inc dword[esp+eax*4] ;++
mov eax, dword[esi+ecx*4]
mov dword[edi+ebx*4], eax
cmp ecx,edx
jne .lpr1
;+++++++++++++++++++++++++++++++++++
;;;;;;;;;;;;;;;; flip shadow and array ptrs
push esi
push edi
pop esi
pop edi
;++++++++++++++++++++++++++++++++++++
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;ITERATION 2
;;;;;;;;;;;;;;;;;;;;zero count array
xor eax,eax
mov ecx,255
.lpc2:
mov dword[esp+ecx*4],eax
mov dword[esp+ecx*4-4],eax
mov dword[esp+ecx*4-8],eax
mov dword[esp+ecx*4-12],eax
mov dword[esp+ecx*4-16],eax
sub ecx,5
jnz .lpc2
mov dword[esp],eax
;++++++++++++++++++++++++
;;;;;;;;;;;;;;;;;;;get counts
mov ecx,edx ;;dword size -1
dec ecx
.lpg2:
movzx eax,byte[esi+ecx*4+1] ;;2nd lowest byte
inc dword[esp+eax*4]
sub ecx,1
jns .lpg2
;++++++++++++++++++++++++++++
;;;;;change counts to indexes into shadow array
xor ecx,ecx
xor ebx,ebx
.lpm2:
mov eax, dword[esp+ecx*4]
mov dword[esp+ecx*4], ebx
cmp ecx,256
jne .lpm2
;++++++++++++++++++++++++++++++++++
;;;;;;;;;;;;;;;;;;;;;; second copy
xor ecx,ecx
.lpr2:
movzx eax, byte[esi+ecx*4+1] ;; second byte
mov ebx, dword[esp+eax*4]
inc dword[esp+eax*4] ;++
mov eax, dword[esi+ecx*4]
mov dword[edi+ebx*4], eax
cmp ecx,edx
jne .lpr2
;+++++++++++++++++++++++++++++++++++
;;;;;;;;;;;;;;;; flip shadow and array ptrs
push esi
push edi
pop esi
pop edi
;++++++++++++++++++++++++++++++++++++
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;ITERATION 3
;;;;;;;;;;;;;;;;;;;;zero count array
xor eax,eax
mov ecx,255
.lpc3:
mov dword[esp+ecx*4],eax
mov dword[esp+ecx*4-4],eax
mov dword[esp+ecx*4-8],eax
mov dword[esp+ecx*4-12],eax
mov dword[esp+ecx*4-16],eax
sub ecx,5
jnz .lpc3
mov dword[esp],eax
;++++++++++++++++++++++++
;;;;;;;;;;;;;;;;;;;get counts
mov ecx,edx ;;dword size -1
dec ecx
.lpg3:
movzx eax,byte[esi+ecx*4+2] ;;third lowest byte
inc dword[esp+eax*4]
sub ecx,1
jns .lpg3
;++++++++++++++++++++++++++++
;;;;;change counts to indexes into shadow array
xor ecx,ecx
xor ebx,ebx
.lpm3:
mov eax, dword[esp+ecx*4]
mov dword[esp+ecx*4], ebx
cmp ecx,256
jne .lpm3
;++++++++++++++++++++++++++++++++++
;;;;;;;;;;;;;;;;;;;;;; third copy
xor ecx,ecx
.lpr3:
movzx eax, byte[esi+ecx*4+2] ;; third byte
mov ebx, dword[esp+eax*4]
inc dword[esp+eax*4] ;++
mov eax, dword[esi+ecx*4]
mov dword[edi+ebx*4], eax
cmp ecx,edx
jne .lpr3
;+++++++++++++++++++++++++++++++++++
;;;;;;;;;;;;;;;; flip shadow and array ptrs
push esi
push edi
pop esi
pop edi
;++++++++++++++++++++++++++++++++++++
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;ITERATION 4
;;;;;;;;;;;;;;;;;;;;zero count array
xor eax,eax
mov ecx,255
.lpc4:
mov dword[esp+ecx*4],eax
mov dword[esp+ecx*4-4],eax
mov dword[esp+ecx*4-8],eax
mov dword[esp+ecx*4-12],eax
mov dword[esp+ecx*4-16],eax
sub ecx,5
jnz .lpc4
mov dword[esp],eax
;++++++++++++++++++++++++
;;;;;;;;;;;;;;;;;;;get counts
mov ecx,edx ;;dword size -1
dec ecx
.lpg4:
movzx eax,byte[esi+ecx*4+3] ;;last byte
inc dword[esp+eax*4]
sub ecx,1
jns .lpg4
;++++++++++++++++++++++++++++
;;;;;change counts to indexes into shadow array
xor ecx,ecx
xor ebx,ebx
.lpm4:
mov eax, dword[esp+ecx*4]
mov dword[esp+ecx*4], ebx
cmp ecx,256
jne .lpm4
;++++++++++++++++++++++++++++++++++
;;;;;;;;;;;;;;;;;;;;;; fourth copy
xor ecx,ecx
.lpr4:
movzx eax, byte[esi+ecx*4+3] ;; fourth byte
mov ebx, dword[esp+eax*4]
inc dword[esp+eax*4] ;++
mov eax, dword[esi+ecx*4]
mov dword[edi+ebx*4], eax
cmp ecx,edx
jne .lpr4
;+++++++++++++++++++++++++++++++++++
;;;;;;;;;;;;;;;; flip shadow and array ptrs
push esi
push edi
pop esi
pop edi
;++++++++++++++++++++++++++++++++++++
push edi
call DeleteArray
pop edi
pop esi
pop ebx
pop ebp
ret 8

FillArray:
push ebx
push esi
mov ebx,dword[esp+16] ;; count
.lp:
dec ebx
js .end
call Random32
mov dword[esi+ebx],eax
jmp .lp
.end:
pop esi
pop ebx
ret 8

MakeArray:
mov eax,[esp+4]
push 1000h Or 2000h ;;commit | reserve
push eax ;;size
push -1 ;;current process
call [VirtualAllocEx]
test eax,eax
jnz .good
push 0
call [ExitProcess]
.good:
ret 4

DeleteArray:
push 8000h ;;release
push 0 ;;size
push -1 ;;this process
call [VirtualFreeEx]
ret 4

Random32:
push ebx
mov eax,[RandomSeed]
mov ebx,[RandomSeed+4]
mov ecx,[RandomSeed+8]
mov edx,[RandomSeed+12]
shld ebx,eax,1
ror eax,3
bswap eax
shld edx,ecx,1
bswap ecx
ror ecx,7
mov [RandomSeed],eax
mov [RandomSeed+4],ebx
mov [RandomSeed+8],ecx
mov [RandomSeed+12],edx
xor eax,ecx
pop ebx
ret 0

SetSeed:
.seed equ esp+4 ;,+8,+12,+16
movdqu xmm0,[.seed]
movntdq dqword[RandomSeed],xmm0
ret 16

MakeSeed:
rdtsc
mov edx,eax
call [GetTickCount]
mov ecx,eax
mul edx
mov [RandomSeed],eax
xor edx,ecx
mov [RandomSeed+4],edx
bswap ecx
xor eax,ecx
mov [RandomSeed+8],eax
not edx
bswap edx
mul edx
mov [RandomSeed+12],eax
ret 0

section '.idata' import data readable writeable

library kernel32,'KERNEL32.DLL',\
user32,'USER32.DLL',\
wsock32,'WS2_32.DLL',\
ntdll,'NTDLL.DLL',\
msvcrt,'msvcrt.dll'
include  "%fasminc%\apia\kernel32.inc"
include  "%fasminc%\apia\user32.inc"
include  "%fasminc%\apia\wsock32.inc"
import ntdll,\
NtAllocateVirtualMemory,'NtAllocateVirtualMemory',\
NtFreeVirtualMemory,'NtFreeVirtualMemory',\
NtWriteVirtualMemory,'NtWriteVirtualMemory',\
NtProtectVirtualMemory,'NtProtectVirtualMemory',\
NtClose,'NtClose',\
InitializeCS,'RtlInitializeCriticalSection',\
EnterCS,'RtlEnterCriticalSection',\
LeaveCS,'RtlLeaveCriticalSection',\
DeleteCS,'RtlDeleteCriticalSection'
import msvcrt,\
printf,'printf'

```
09 Apr 2006, 23:59

Joined: 07 Oct 2003
Posts: 1045
Location: Michigan, USA
I have a quicksort in my TurboAsmDx Library, VERY FAST. I can sort signed and unsigned integers (ascending and descending), also single and double floats (ascending and descending). It uses recursion, so you can see how to do that in fasm assembly too. Here is the code for unsigned integer, descending.

Code:
```proc QuickSortDUD uses ebx, array, left, right
locals
pivot dd ?
lhold dd ?
rhold dd ?
endl

mov     eax, [left]
mov     [lhold], eax

mov     eax, [right]
mov     [rhold], eax

mov     ecx, [array]
mov     edx, [left]
mov     eax, [ecx + edx*4]
mov     [pivot],eax

mov     eax, [left]
.while  eax, l, [right]

mov     ecx, [array]
mov     edx, [right]
mov     eax, [pivot]

QuickSortDUD_contloop0:
.if     [ecx + edx*4], a, eax ;ge
mov     eax, [left]
.if     eax, l, [right]
sub     [right], 1
.else
jmp     QuickSortDUD_endloop0
.endif
.else
jmp     QuickSortDUD_endloop0
.endif
mov     eax, [pivot]
mov     edx, [right]
jmp     QuickSortDUD_contloop0

QuickSortDUD_endloop0:
mov     eax, [left]
.if     eax, ne, [right]
mov     edx, [right]
mov     eax, [ecx + edx*4]
mov     edx, [left]
mov     [ecx + edx*4], eax
.endif

mov     ecx, [array]
mov     edx, [left]
mov     eax, [pivot]

QuickSortDUD_contloop1:
.if     [ecx + edx*4], b, eax ;le
mov     eax, [left]
.if     eax, l, [right]
.else
jmp     QuickSortDUD_endloop1
.endif
.else
jmp     QuickSortDUD_endloop1
.endif
mov     eax, [pivot]
mov     edx, [left]
jmp     QuickSortDUD_contloop1

QuickSortDUD_endloop1:
mov     eax, [left]
.if     eax, ne, [right]
mov     edx, [left]
mov     eax, [ecx + edx*4]
mov     edx, [right]
mov     [ecx + edx*4], eax
sub     [right], 1
.endif
mov     eax, [left]
.endw

mov     ecx, [array]
mov     edx, [left]
mov     eax, [pivot]
mov     [ecx + edx*4], eax
mov     [pivot], edx

mov     eax, [lhold]
mov     [left], eax

mov     eax, [rhold]
mov     [right], eax

mov     eax, [pivot]
.if     [left], l, eax
sub     eax, 1
stdcall QuickSortDUD, [array], [left], eax
.endif
.if     [right], g, eax
stdcall QuickSortDUD, [array], eax, [right]
.endif
return
endp    ```
10 Apr 2006, 09:39
Octavio

Joined: 21 Jun 2003
Posts: 366
Location: Spain
Octavio
i did a quick test,
my qsort takes 9.7 seconds to sort the array on a P3 930Mhz
i did the test on my OS, with windows you donÂ´t know wich task is runing.
10 Apr 2006, 13:27

Joined: 25 Sep 2003
Posts: 2139
Location: Estonia
I found a Q+I-sort (quick & insert) on the net and converted it. If I manage to make it work in your benchmark, I'll update my post.

EDIT: OK, I did it, but I see you want esi and edi preserved and you need some value in eax?! Well, at least it works somehow.
Maybe mine is 4x faster because I wrote 40M instead of 10M, but my proc wants to know the length not the number of elements...
Code:
``` 2DC59F4 Result 1: 3636:
2DC5A00 Result 2: 2964:
1 Result 3: 2884:
1 Result 4: 2924:
```

And some code in the attachment:

_________________
My updated idol http://www.agner.org/optimize/
10 Apr 2006, 16:07
shism2

Joined: 14 Sep 2005
Posts: 248
shism2
Ok so why doesnt someone put all the algos here into 1 program and test the times.
10 Apr 2006, 22:16

Joined: 07 Oct 2003
Posts: 1045
Location: Michigan, USA
Using my function above (QuickSortDUD [still as yet un-optimized, got it working and moved on to other things ]) I can sort 10,000,000 Random Integers in a little less than 1.5 seconds. A bit tough to get an accurate count with the timers that I have setup now. I did this test on a Celeron 2.7 ghz, with windowsxp sp2.

Last edited by madmatt on 10 Apr 2006, 22:18; edited 1 time in total
10 Apr 2006, 22:17
shism2

Joined: 14 Sep 2005
Posts: 248
shism2
Code:
```mov     esi, VectorToSort
mov     edi, LastElement + 4
call    MainRSort
push    0
call    [ExitProcess]

proc MainRSort
mov     ebp, 3    ; Sort byte 3
call    PreRSort
mov     ebp, 2    ; Sort byte 2
call    PreRSort
mov     ebp, 1    ; Sort byte 1
call    PreRSort
xor     ebp, ebp  ; Sort byte 0
call    PreRSort
ret
endp

proc PreRSort
push    esi
cmp     ebp, 3
jz    @@Sort3
cmp     ebp, 2
jz    @@Sort2
cmp     ebp, 1           ; Select masks and rotation values
jz    @@Sort1
@@Sort0:   mov     eax, 0FFFFFF00h
mov     ebx, 0000000FFh
mov     ecx, 2
jmp   @@Next00
@@Sort3:   xor     eax, eax
mov     ebx, 0FF000000h
mov     ecx, 0Ah
jmp   @@Next00
@@Sort2:   mov     eax, 0FF000000h
mov     ebx, 000FF0000h
mov     ecx, 12h
jmp   @@Next00
@@Sort1:   mov     eax, 0FFFF0000h
mov     ebx, 00000FF00h
mov     ecx, 1Ah

@@Loop0:    push    edi
mov     ebx, esi
mov     edx, [ebx]
xor     ecx, ecx
push    esi
and     edx, esi
xor     ecx, ecx
@@LoopCount:
mov     eax, [ebx]
and     eax, esi
cmp     eax, edx
jnz   @@Sort
cmp     ebx, edi
jnz   @@LoopCount
@@Sort:     pop     esi
cmp     ecx, 4
mov     edi, ebx
push    ebx
call    RSort
pop     ebx
mov     esi, ebx
pop     edi
cmp     esi, edi
jb    @@Loop0
pop     esi
ret
endp

proc RSort
mov     ebx, 100h*4 - 8
mov     ecx,  VectorTemp1
fldz                    ; Big optimization! We store a 0
Loop0:  fst     qword  [ecx] ; in the FPU and overwrite with QWORDs
sub     ebx, 8
jnz   Loop0
fstp    qword  [ecx]

push    esi
@@Next1:  mov     ebx, [esi]
and     ebx, edx
rol     ebx, cl
mov     eax, [VectorTemp1+ebx]
mov     [VectorTemp1+ebx], eax
cmp     esi, edi
jnz   @@Next1
pop     esi

mov     ebx,  VectorTemp1
mov     ecx, 100h
xor     edx, edx
@@Loop1:  mov     eax, [ebx]
mov     [ebx], edx
dec     ecx
jnz   @@Loop1

push    esi
@@Next2:  mov     ebx, [esi]
mov     eax, ebx
and     ebx, edx
rol     ebx, cl

mov     ecx, [VectorTemp1+ebx]
mov     [VectorTemp2+ecx], eax
mov     [VectorTemp1+ebx], ecx
cmp     esi, edi
jnz   @@Next2
pop     esi

push    esi
mov     ebx,  VectorTemp2
@@Loop2:  mov     eax, [ebx]
mov     [esi], eax
cmp     esi, edi      ; Copy the data. This part overwrites
jnz   @@Loop2         ; the unsorted vector with the sorted
pop     esi           ; one at VectorTemp2. If you don't need
ret                   ; it, you can eliminate this.
endp

VectorTemp1     dd      256 dup (0)
VectorTemp2     dd      256 dup (0)

VectorToSort    dd     100h     ; Sample array of elements to sort.
dd      99h
dd      98h
dd      87h
dd      76h
dd      65h
dd      54h
dd      43h
dd      32h
dd      21h
LastElement     dd      55676134h    ```
10 Apr 2006, 22:17

Joined: 25 Sep 2003
Posts: 2139
Location: Estonia
All algos are hard to test in all conditions. The quicksort can be as bad as some bubblesort, if you make the conditions hard for it. Although, sometimes it can quit earlier. There are very different algoritms for different memory, move, exchange, etc, costs. There's also a flash-sort that does the sorting in about O(n) time
depenging on the "ONE", the size of the memory used is about 110%-200% of the original data.

EDIT: Yeah sorry, O(n), because O(1) is a look-up table

Last edited by Madis731 on 11 Apr 2006, 21:21; edited 1 time in total
10 Apr 2006, 22:23
shism2

Joined: 14 Sep 2005
Posts: 248
shism2
flash sort????
10 Apr 2006, 22:47
r22

Joined: 27 Dec 2004
Posts: 805
r22
FlashSort and RadixSort both have O(n) time.
http://www.neubert.net/Flacodes/FLACodes.html

When I benchmarked the JAVA RadixSort vs the JAVA FlashSort there timings were identical. JAVA version are on average 4x slower than the ASM versions.

It would make testing the ASM versions easier if they followed a calling convention.
11 Apr 2006, 19:34

Joined: 25 Sep 2003
Posts: 2139
Location: Estonia
Hi, I just had a vision

I don't remember what was the stride length in modern computers and is it even constant, but you should be able to sort one stride at a time and then insert sort all other strides. Would it be faster?

i.e. stride is 32-bytes (this is just an example):
You take 8 DWORDs and sort them for free any way you want. Because they are in one cache stride, you will only get a level 1 cache penalty which is 3 clocks.

Okay, lets scale horizontally now - we have many cache strides (P4s have upto 4MB of cache nowdays) and we can read an array of strides in by accessing the first memory location of each of these. This can be done by dummy cmp [mem],0 or something. Maybe use some SSE instruction.

We can even move chunks of 32-bytes between strides, because CPU architecture is optimized this way so REP MOVSD is less than 8 clocks when ECX=8. I don't know if this can be done faster with XMM.

-----

Okay, another thought here:
What if you read in four of the XMM registers (that is 128 bits or 8 bytes). Then you apply an algorithm on xmm0..xmm3 to sort the eight counterparts. I know that SSE is meant to deal with floats, but who cares We're still using XOR EAX,EAX to "move" 0 into eax.

-----

Would any of these work or the code to make it work will be bloat?
12 Apr 2006, 07:49
r22

Joined: 27 Dec 2004
Posts: 805
r22
I don't see how that could work any faster. Because you'd probably have to break the strides up with you were insertion sorting them

Say we sort 2 at a time
100 5 60 20 6 80
(5, 100) (20, 60) (6, 55)
To get to 5, 6, 20, 55, 60 would cost a lot of time

Unless your thinking of something different.
12 Apr 2006, 18:06
shism2

Joined: 14 Sep 2005
Posts: 248
shism2
How about a greator or less sort... By subtracitng lets say 9 from 6.. you get 3 but if you subtract 6-9 you get -3 . So

If you subtract a number from that number and the outcome is positive. the First number is greater than the second. If the outcome is negative the second number is greater.

mov ecx,9
mov eax,7
sub ecx,eax
js ecxless
13 Apr 2006, 01:28
r22

Joined: 27 Dec 2004
Posts: 805
r22
CMP reg,reg does the same thing, only better, because it doesn't change teh contents of the register.

Some sort of CMOVxx might be an interesting way to optimize a sorting algorithm that uses a lot of CMP. As the branchless technique is proven to be a lot faster.
13 Apr 2006, 05:25
shism2

Joined: 14 Sep 2005
Posts: 248
shism2
Is there anyway to add many memory destinations at the same time example

1m dd 4
2m dd 5
3m dd 2

13 Apr 2006, 14:23
f0dder

Joined: 19 Feb 2004
Posts: 3170
Location: Denmark
f0dder
shism2, your code GPFs on me (somewhere in RSort I think).

Attached is my correctness + benchmark test, enjoy

Quote:

D:\src\test\sort>driver
sorting test - sorting 10485760 signed numbers, using seed 0xBEEFF00D

*** testing for correctness
testing routine libc qsort()
testing routine hutch asm qsort() failed at index 7 (953 > 479)
testing routine C trimedian qsort()
testing routine STL std::sort()
testing routine r22 FASM RADIX sort()
testing routine David Garcia C++ radix()
testing routine Madis731 Quick&Insert() failed at index 633 (2 > 1)

*** testing for SPEED
libc qsort() took 1782 ticks
hutch asm qsort() took 1016 ticks
C trimedian qsort() took 859 ticks
STL std::sort() took 1016 ticks
r22 FASM RADIX sort() took 390 ticks
David Garcia C++ radix() took 641 ticks

EDIT: forgot to state my machine specs. AMD64x2 4400+, 4x512meg ram, ASUS A8N SLI Premium motherboard (for stability and passively cooled chipset, I don't run SLI).

EDIT2: I also just realized the program says "signed", but test tests are unsigned

_________________
- carpe noctem

Last edited by f0dder on 23 Apr 2006, 11:33; edited 2 times in total
23 Apr 2006, 11:07
okasvi

Joined: 18 Aug 2005
Posts: 382
Location: Finland
okasvi
results:
Quote:
sorting test - sorting 10485760 signed numbers, using seed 0xBEEFF00D

*** testing for correctness
testing routine libc qsort()
testing routine hutch asm qsort() failed at index 7 (953 > 479)
testing routine C trimedian qsort()
testing routine STL std::sort()
testing routine r22 FASM RADIX sort()
testing routine David Garcia C++ radix()
testing routine Madis731 Quick&Insert() failed at index 633 (2 > 1)

*** testing for SPEED
libc qsort() took 2687 ticks
hutch asm qsort() took 1797 ticks
C trimedian qsort() took 1532 ticks
STL std::sort() took 1719 ticks
r22 FASM RADIX sort() took 1250 ticks
David Garcia C++ radix() took 1579 ticks

amd sempron 2800+, sucky i know...
23 Apr 2006, 11:24
UCM

Joined: 25 Feb 2005
Posts: 285
UCM
sort_test wrote:

sorting test - sorting 10485760 signed numbers, using seed
0xBEEFF00D

*** testing for correctness
testing routine libc qsort()
testing routine hutch asm qsort() failed at index 7 (953 > 479)
testing routine C trimedian qsort()
testing routine STL std::sort()
testing routine r22 FASM RADIX sort()
testing routine David Garcia C++ radix()
testing routine Madis731 Quick&Insert() failed at index 633 (2 > 1)

*** testing for SPEED
libc qsort() took 2156 ticks
hutch asm qsort() took 1157 ticks
C trimedian qsort() took 969 ticks
STL std::sort() took 1109 ticks
r22 FASM RADIX sort() took 532 ticks
David Garcia C++ radix() took 750 ticks

AMD Athlon X2 4200+
p.s. if you think this is a tad slow for my processor consider that i am running firefox, 7zip, regedt32.exe, 8 instances of explorer, 1 instance of Prime95 per core (2), PvPGN, OpenVPN, gaim, Skype, Apache, OpenOffice.org, Avast! Antivirus and 6 instances of Notepad.

_________________
This calls for... Ultra CRUNCHY Man!
Ta da!! *crunch*
23 Apr 2006, 15:39
shism2

Joined: 14 Sep 2005
Posts: 248
shism2
That's funny fodder because it doesn't produce a GPF for me
23 Apr 2006, 16:22
Vasilev Vjacheslav

Joined: 11 Aug 2004
Posts: 392
Vasilev Vjacheslav
maybe someone will be interested in this program (include source code)