flat assembler
Message board for the users of flat assembler.

 flat assembler > Examples and Tutorials > [FASM G][Project Euler] Super Pandigital Numbers
Author
bitRAKE

Joined: 21 Jul 2003
Posts: 2653
Location: dank orb

Super Pandigital Numbers
Problem 571
https://projecteuler.net/problem=571
This example demonstrates:
• in-place lexicographic generator
• base conversion

The only real optimization is to realize all 12-digit numbers cannot be tested for super pandigital quality. Therefore, only 12-pandigital numbers are presented for further testing. I was concerned that 12 digits would not be enough, but I moved forward anyhow - to prove myself over cautious.

Note the use of bit arrays within a register to ease multiple exit conditions. Also, array testing has known bound limits or blocking data in place. This prevents the need for some bounds checking (branching). Permutation algorithm is address based and easily adapted to other type arrays.

I build the 64-bit console PE manually because I can (Yes, 64-bit Windows doesn't use much of the headers). Main code can easily be moved to standard FASM template, etc. I'll post more solved problems in time...
Code:
```;@REM use windows CMD processor to build
;@SET INCLUDE=..\..\fasmg\examples\x86\include
;@..\..\fasmg\fasmg.exe %~nx0 %~n0.exe
;@GOTO:EOF

macro align? boundary
rb (boundary-1)-((\$ scale 0) + boundary - 1) mod boundary
end macro

; Because protections are page granular section alignment must be page size or larger.
;
; Sections are needed for the following configurations:
;   - Executable
;   - Readwrite data (also includes uninitialized data on the end)
; (extra padding bytes <= 2*(512 - 1))

IMAGE_BASE = 10000h
RVA? equ -IMAGE_BASE+ ; relative virtual address

SECTION_ALIGNMENT = 4096
FILE_ALIGNMENT = 512

org IMAGE_BASE

dd 'MZ'
rb 64-8
dd 64 ; e_lfanew

dd 'PE'                               ; Signature
dw 8664h                              ; PROCESSOR_AMD_X8664
dw 2                                  ; NumberOfSections
dd ?                                  ; #### ; TimeDateStamp
dd ?                                  ; #### ; PointerToSymbolTable
dd ?                                  ; #### ; NumberOfSymbols
dw Sections - Optional                ; RVA SizeOfOptionalHeader
dw 0002h                              ; Characteristics ; IMAGE_FILE_32BIT_MACHINE or IMAGE_FILE_EXECUTABLE_IMAGE

; Standard Fields:
dw 020Bh ; PE+ ; IMAGE_NT_OPTIONAL_HDR64_MAGIC
dd ?                                  ; #### ; SizeOfCode
dd ?                                  ; #### ; SizeOfInitializedData
dd ?                                  ; #### ; SizeOfUninitializedData
dd RVA Main                           ; RVA AddressOfEntryPoint
dd ?                                  ; #### ; BaseOfCode

; Windows Specific Fields:
dq IMAGE_BASE                         ; ImageBase
dd SECTION_ALIGNMENT                  ; memory align of sections
dd FILE_ALIGNMENT                     ; file align of sections
dw ?,?                                ; OperatingSystemVersion
dw ?,?                                ; ImageVersion
dw 5,2                                ; SubsystemVersion
dd ?                                  ; Win32VersionValue
dd SizeOfImage ;*SizeOfImage, must be multiple of section alignment
dd ?                                  ; #### checksum
dw 0003h                              ; IMAGE_SUBSYSTEM_WINDOWS_CUI
dw 8000h                              ; IMAGE_DLLCHARACTERISTICS_TERMINAL_SERVER_AWARE
dq 0,0                                ; stack: reserve, commit
dq 0,0                                ; heap: reserve, commit
dd ?                                  ; loader flags, obsolete
dd 2                                 ; number of directories

; Data Directories:
dd 0,0 ; Export Table                 ; RVA, Size
dd RVA import_, 0 ;end_import-import_
; Resource
; Exception
; Certificate
; Relocation
; Debug
; Globalptr
; TLS Table
; BoundImport
; IAT
; DelayImport
; COM+
; Reserved

IMAGE_SCN_CNT_CODE               = 0x00000020
IMAGE_SCN_CNT_INITIALIZED_DATA   = 0x00000040
IMAGE_SCN_CNT_UNINITIALIZED_DATA = 0x00000080
IMAGE_SCN_LNK_OTHER              = 0x00000100
IMAGE_SCN_LNK_INFO               = 0x00000200
IMAGE_SCN_LNK_REMOVE             = 0x00000800
IMAGE_SCN_LNK_COMDAT             = 0x00001000
IMAGE_SCN_GPREL                  = 0x00008000
IMAGE_SCN_MEM_PURGEABLE          = 0x00020000
IMAGE_SCN_MEM_16BIT              = 0x00020000
IMAGE_SCN_MEM_LOCKED             = 0x00040000
IMAGE_SCN_ALIGN_1BYTES           = 0x00100000
IMAGE_SCN_ALIGN_2BYTES           = 0x00200000
IMAGE_SCN_ALIGN_4BYTES           = 0x00300000
IMAGE_SCN_ALIGN_8BYTES           = 0x00400000
IMAGE_SCN_ALIGN_16BYTES          = 0x00500000
IMAGE_SCN_ALIGN_32BYTES          = 0x00600000
IMAGE_SCN_ALIGN_64BYTES          = 0x00700000
IMAGE_SCN_ALIGN_128BYTES         = 0x00800000
IMAGE_SCN_ALIGN_256BYTES         = 0x00900000
IMAGE_SCN_ALIGN_512BYTES         = 0x00A00000
IMAGE_SCN_ALIGN_1024BYTES        = 0x00B00000
IMAGE_SCN_ALIGN_2048BYTES        = 0x00C00000
IMAGE_SCN_ALIGN_4096BYTES        = 0x00D00000
IMAGE_SCN_ALIGN_8192BYTES        = 0x00E00000
IMAGE_SCN_LNK_NRELOC_OVFL        = 0x01000000
IMAGE_SCN_MEM_NOT_CACHED         = 0x04000000
IMAGE_SCN_MEM_NOT_PAGED          = 0x08000000
IMAGE_SCN_MEM_SHARED             = 0x10000000
IMAGE_SCN_MEM_EXECUTE            = 0x20000000
IMAGE_SCN_MEM_WRITE              = 0x80000000

Sections:

Section_Code:
dq '.bitRAKE'
dd Section_Code.Length                ; VirtualSize
dd RVA Section_Code.Base              ; RVA VirtualAddress
dd Section_Code.Bytes                 ; SizeOfRawData
dd Section_Code.FileOffset            ; PointerToRawData
dd 0                                  ; PointerToRelocations
dd 0                                  ; PointerToLinenumbers
dw 0                                  ; NumberOfRelocations
dw 0                                  ; NumberOfLinenumbers
dd IMAGE_SCN_MEM_EXECUTE or IMAGE_SCN_MEM_SHARED or IMAGE_SCN_ALIGN_4096BYTES

Section_Data:
dq '.DATA'
dd Section_Data.Length                ; VirtualSize
dd RVA Section_Data.Base              ; RVA VirtualAddress
dd Section_Data.Bytes                 ; SizeOfRawData
dd Section_Data.FileOffset            ; PointerToRawData
dd 0                                  ; PointerToRelocations
dd 0                                  ; PointerToLinenumbers
dw 0                                  ; NumberOfRelocations
dw 0                                  ; NumberOfLinenumbers
dd IMAGE_SCN_MEM_WRITE or IMAGE_SCN_ALIGN_4096BYTES

;###############################################################################
; Constant data goes here...

Output db "Found %I64u, Sum %I64u",13,10,0

Error_Messages dd \
.Not_Enough_Digits

.Not_Enough_Digits db "Insufficient digits for 10, 12-Super-Pandigital numbers!",13,10,0

;###############################################################################
; DLL Strings

_kernel32 db "kernel32",0
_msvcrt db "msvcrt",0

; Hint/Name Table

rb (\$ and 1) ; align 2
_ExitProcess db 0,0,"ExitProcess",0

rb (\$ and 1) ; align 2
_printf db 0,0,"printf",0

;###############################################################################
VIRTUAL at \$
ALIGN SECTION_ALIGNMENT,0
TEMP = \$
END VIRTUAL
ORG \$%
ALIGN FILE_ALIGNMENT,0
Section_Code.FileOffset:
ORG TEMP
Section_Code.Base:

include 'x64.inc'
use64

Main: ;=RIP=RDX=R9

xor r8,r8               ; clear running sum
push 10                 ; problem wants sum of smallest 10
.next:
call Permute_Digits     ; 12-pandigital by design
mov cl,0
jc .FAIL

; what is base-12 number in binary?
push 12
xor eax,eax
pop rcx
.b12: movzx edx,[Digits+rcx-1]
imul rax,rax,12
loop .b12

call Super_PanDigital_11_RAX
jnc .next

; output number in base 10, and sum thus far
push r8
enter 32,0
xchg rdx,rax
lea rcx,[Output]
call [printf]
leave
pop r8

sub dword [rsp],1
jnz .next
pop rcx ; zero
call [ExitProcess]
int3

; POSSIBLE ERRORS:
;   1) 12 digits are not sufficient to generate 10,
;      12-Super-Pandigital numbers!
.FAIL:  movzx ecx,cl
push rcx rcx
enter 32,0
mov ecx,[Error_Messages+ecx*4]
call [printf]
leave
pop rcx rcx
call [ExitProcess]
int3

align 64
; Compute the next lexicographical order of Digits:
; RAX RCX RDX are trashed
Permute_Digits:

; find suffix of increasing values

lea rcx,[Digits]
mov al,[rcx]
cmp al,[rcx-1]
jnc .sufx
cmp rcx,Digits_end

; swap pivot into suffix order

lea rdx,[Digits-1]
cmp al,[rdx]
jnc .pvt
.swap: mov ah,[rdx]
mov [rdx],al
mov [rcx],ah

; reverse suffix to create lexical minimal

sub rcx,1
lea rdx,[Digits]
.rev:  mov ah,[rcx]
mov al,[rdx]
mov [rdx],ah
mov [rcx],al
sub rcx,1
cmp rdx,rcx             ; insure one less itteration
jc .rev                 ; in the odd length suffix case
;       clc
retn
stc                     ; already at last permutation
retn

align 64
; pass 12-pandigital numbers in RAX to check
Super_PanDigital_11_RAX:
push rbx rcx rdx
push 12
pop rbx
.check:
sub ebx,1
push rax
xor ecx,ecx
bts ecx,ebx
sub ecx,1               ; bit set for each base digit
.more_digits:
xor edx,edx
div rbx                 ; extract digit
btr ecx,edx             ; tag digit
jrcxz .pandigital       ; all digits present
; NOTE: most significant zero digits do not count
test rax,rax
jnz .more_digits
;       clc                     ; failed
pop rax
jmp .end
.pandigital:
pop rax
cmp ebx,2
jnz .check
stc                     ; RAX is 12-super-pandigital
.end: pop rdx rcx rbx
retn

Section_Code.Bytes = \$ - Section_Code.Base
Section_Code.Length = \$ - Section_Code.Base

;###############################################################################
VIRTUAL at \$
ALIGN SECTION_ALIGNMENT
TEMP = \$
END VIRTUAL
ORG \$%
ALIGN FILE_ALIGNMENT,0
Section_Data.FileOffset:
ORG TEMP
Section_Data.Base:

align 64
Digits db 1,2,3,4,5,6,7,8,9,10,11,0
Digits_end:
db 0 ; force termination of suffix search
align 64

import_:
dd 0, 0, 0, RVA _kernel32, RVA kernel32_iat
dd 0, 0, 0, RVA _msvcrt, RVA msvcrt_iat
dd 0,0,0,0,0

kernel32_iat:
ExitProcess dq RVA _ExitProcess
dq 0

msvcrt_iat:
printf dq RVA _printf
dq 0
end_import:

Section_Data.Bytes = \$ - Section_Data.Base
Section_Data.Length = \$ - Section_Data.Base

;###############################################################################
VIRTUAL at \$
ALIGN SECTION_ALIGNMENT
SizeOfImage = RVA \$
END VIRTUAL

SizeOfHeaders = Section_Code.FileOffset ; same as first segment file offset
```
References:
https://www.nayuki.io/page/next-lexicographical-permutation-algorithm - my version differs in a number of ways (i.e. is superior), but this is a great write-up of the algorithm.

_________________
utf8everywhere.org
27 Oct 2016, 21:48
Tomasz Grysztar
Assembly Artist

Joined: 16 Jun 2003
Posts: 6969
Location: Kraków, Poland
Nice! Just a couple of months ago I've been talking about "nextperm" implementations with Ender, it's great to now see your take on it. We've been also discussing the "next partition of number" algorithms - another interesting and somewhat related problem.

Out of curiosity: is there any specific reason why have you used fasmg for this? Or was it just "because you can"?

I also think that you used a different "align" macro to assemble this, because you have a two-argument "align" in few places.
28 Oct 2016, 11:40
revolution
When all else fails, read the source

Joined: 24 Aug 2004
Posts: 16057
Location: 112 Ocean Avenue, Amityville
bitRAKE wrote:
References:
https://www.nayuki.io/page/next-lexicographical-permutation-algorithm - my version differs in a number of ways (i.e. is superior), but this is a great write-up of the algorithm.
Can you please explain what are your improvements? I followed your source code and I can't see where you differ from the algorithm shown in the reference.
28 Oct 2016, 13:52
bitRAKE

Joined: 21 Jul 2003
Posts: 2653
Location: dank orb
Tomasz Grysztar, Many of the Euler problems benefit from partitions:
Code:
```Partition_Next:
mov edx,ecx
.one:  cmp byte [rax+rcx-1],1
loopz .one
jnz .N
;clc; no more partitions
retn

.N:    sub edx,ecx
sub byte [rax+rcx],1
mov dh,[rax+rcx]

sub dl,[rax+rcx-1]
mov [rax+rcx],dh
ja .R
stc; partition valid
retn    ```
I'm not sure one would want to use larger than byte values for this. It would be interesting to hear how this is related for permutations.

revolution, Nayuki's algorithms use indices to access the array, and sort the array in the other direction. It the address is static then they both require the same number of registers, but for a dynamic array address my method is one less register. The ordering of the array simplifies the other code. Improvements are mostly problem specific.
28 Oct 2016, 23:23
revolution
When all else fails, read the source

Joined: 24 Aug 2004
Posts: 16057
Location: 112 Ocean Avenue, Amityville
Code:
```Base: 3
Found 11, Sum 11
Found 19, Sum 30
Found 21, Sum 51
Insufficient digits for 10, 3-Super-Pandigital numbers!

Base: 4
Found 75, Sum 75
Found 99, Sum 174
Found 114, Sum 288
Found 135, Sum 423
Found 141, Sum 564
Found 147, Sum 711
Found 156, Sum 867
Found 177, Sum 1044
Found 198, Sum 1242
Found 201, Sum 1443

Base: 5
Found 978, Sum 978
Found 1070, Sum 2048
Found 1138, Sum 3186
Found 1142, Sum 4328
Found 1294, Sum 5622
Found 1334, Sum 6956
Found 1358, Sum 8314
Found 1394, Sum 9708
Found 1490, Sum 11198
Found 1730, Sum 12928

Base: 6
Found 8350, Sum 8350
Found 8630, Sum 16980
Found 8950, Sum 25930
Found 8990, Sum 34920
Found 10510, Sum 45430
Found 10535, Sum 55965
Found 11045, Sum 67010
Found 12130, Sum 79140
Found 12710, Sum 91850
Found 13415, Sum 105265

Base: 7
Found 160773, Sum 160773
Found 203373, Sum 364146
Found 207549, Sum 571695
Found 208695, Sum 780390
Found 209019, Sum 989409
Found 209127, Sum 1198536
Found 225825, Sum 1424361
Found 246711, Sum 1671072
Found 248415, Sum 1919487
Found 251235, Sum 2170722

Base: 8
Found 2217404, Sum 2217404
Found 2340261, Sum 4557665
Found 2787302, Sum 7344967
Found 2803640, Sum 10148607
Found 2810087, Sum 12958694
Found 2873220, Sum 15831914
Found 3082618, Sum 18914532
Found 3130694, Sum 22045226
Found 3214239, Sum 25259465
Found 3265976, Sum 28525441

Base: 9
Found 45623244, Sum 45623244
Found 45642476, Sum 91265720
Found 54953972, Sum 146219692
Found 56368428, Sum 202588120
Found 62399428, Sum 264987548
Found 73653388, Sum 338640936
Found 86978244, Sum 425619180
Found 87799108, Sum 513418288
Found 89633332, Sum 603051620
Found 90665140, Sum 693716760

Base: 10
Found 1093265784, Sum 1093265784
Found 1367508924, Sum 2460774708
Found 1432598706, Sum 3893373414
Found 1624573890, Sum 5517947304
Found 1802964753, Sum 7320912057
Found 2381059764, Sum 9701971821
Found 2409758631, Sum 12111730452
Found 2578693140, Sum 14690423592
Found 2814609357, Sum 17505032949
Found 2814759360, Sum 20319792309

Base: 11
Found 37206483195, Sum 37206483195
Found 40913867295, Sum 78120350490
Found 41367089205, Sum 119487439695
Found 43285601975, Sum 162773041670
Found 43609726185, Sum 206382767855
Found 59063147285, Sum 265445915140
Found 80276318495, Sum 345722233635
Found 87469019325, Sum 433191252960
Found 91044263875, Sum 524235516835
Found 91347062385, Sum 615582579220

Base: 12
Found 1587937206284, Sum 1587937206284
Found 1674839888205, Sum 3262777094489
Found 2638904175622, Sum 5901681270111
Found 2806980157234, Sum 8708661427345
Found 2816957039424, Sum 11525618466769
Found 3325871906940, Sum 14851490373709
Found 3863090145827, Sum 18714580519536
Found 3909765781284, Sum 22624346300820
Found 3925260871994, Sum 26549607172814
Found 3960783529164, Sum 30510390701978

Base: 13
Found 109746121381518, Sum 109746121381518
Found 125340958430766, Sum 235087079812284
Found 128249807645730, Sum 363336887458014
Found 128497348608546, Sum 491834236066560
Found 131405926979178, Sum 623240163045738
Found 132714092273658, Sum 755954255319396
Found 137194286579202, Sum 893148541898598
Found 140369758399362, Sum 1033518300297960
Found 140370815262798, Sum 1173889115560758
Found 142856969308770, Sum 1316746084869528    ```
The case for 3-Super-Pandigital-numbers would make an interesting puzzle to efficiently generate the required longer test string (with duplicated values in base-3). One could naively just increment a counter and check all the bases but I'm sure there is a cleverer way to do it.
29 Oct 2016, 07:24
Tomasz Grysztar
Assembly Artist

Joined: 16 Jun 2003
Posts: 6969
Location: Kraków, Poland
bitRAKE wrote:
Tomasz Grysztar, Many of the Euler problems benefit from partitions:
Code:
```Partition_Next:
mov edx,ecx
.one:  cmp byte [rax+rcx-1],1
loopz .one
jnz .N
;clc; no more partitions
retn

.N:    sub edx,ecx
sub byte [rax+rcx],1
mov dh,[rax+rcx]

sub dl,[rax+rcx-1]
mov [rax+rcx],dh
ja .R
stc; partition valid
retn    ```
I'm not sure one would want to use larger than byte values for this. It would be interesting to hear how this is related for permutations.

When Ender casually introduced this problem (generating all partitions of a number) to me, we came up with completely different algorithms. Mine started with an idea analogous to the "next permutation" algorithm and looked like this (I implemented it now with the same interface as your sample, buffer is as RAX, RCX is the length of partition):
Code:
```Partition_First:
; this algorithm begins with a partition consisting of all 1s
xor     edx,edx
.initialize:
mov     byte [rax+rdx],1
inc     rdx
cmp     rdx,rcx
jb      .initialize
retn

Partition_Next:
sub     rcx,2
jc      .last
mov     dx,[rax+rcx]
.collect:
sub     rcx,1
jc      .increment
cmp     dl,[rax+rcx]
jne     .increment
jmp     .collect
.increment:
inc     rcx
inc     byte [rax+rcx]
dec     dh
jz      .done
inc     rcx
mov     byte [rax+rcx],1
.done:
inc     rcx
stc
retn
.last:
clc
retn    ```
You could call it a "mathematician's algorithm", because it has very simple formulation, with as little special cases as possible. But for this reason its performance is far from perfect. There is one simple optimization that can make it a lot faster:
Code:
```Partition_Next:
sub     rcx,2
jc      .last
mov     dx,[rax+rcx]
.collect:
sub     rcx,1
jc      .increment
cmp     dl,[rax+rcx]
jne     .increment
jmp     .collect
.increment:
inc     rcx
inc     byte [rax+rcx]
cmp     dl,1
je      .reuse
dec     dh
jz      .done
inc     rcx
mov     byte [rax+rcx],1
.reuse:
stc
retn
.done:
inc     rcx
stc
retn
.last:
clc
retn    ```
29 Oct 2016, 17:25
revolution
When all else fails, read the source

Joined: 24 Aug 2004
Posts: 16057
Location: 112 Ocean Avenue, Amityville
Code:
```Base: 14
Found 2697545063614180, Sum 2697545063614180
Found 3543987606226119, Sum 6241532669840299
Found 5208398137374861, Sum 11449930807215160
Found 5284604920711883, Sum 16734535727927043
Found 6318016520427391, Sum 23052552248354434
Found 6420868753099710, Sum 29473421001454144
Found 6953196730874420, Sum 36426617732328564
Found 9120013592816724, Sum 45546631325145288
Found 10192514371265148, Sum 55739145696410436
Found 10448271079357605, Sum 66187416775768041    ```
30 Oct 2016, 01:52
Tomasz Grysztar
Assembly Artist

Joined: 16 Jun 2003
Posts: 6969
Location: Kraków, Poland
Turns out the Ender's algorithm had some flaw, so he's not going to write anything about it here (at least not until he finds a way to fix it).

Another interesting thing: I called the performance of my simple implementation "far from perfect", but it actually generates all the partitions faster that the one from your sample (but in the opposite direction - mine goes in lexicographic order, while your does reverse lexicographic). This is probably because of a fewer memory accesses it makes, since their complexity appears to be the same. The version with "reuse" optimization cycles through all the partitions in half the time of your variant on my machine.

I have not exactly explained how is it related to permutations. If you are still interested, I may write a proof of correctness of this algorithm showing the parallels between it and the "next permutation" generation.
31 Oct 2016, 06:48
revolution
When all else fails, read the source

Joined: 24 Aug 2004
Posts: 16057
Location: 112 Ocean Avenue, Amityville
I am disappoint:
Code:
```Base: 15
Found 64810523515743579, Sum 64810523515743579
Found 231294725708209863, Sum 296105249223953442
Found 242888099925134167, Sum 538993349149087609
Found 318695032806437931, Sum 857688381955525540
Insufficient digits for 10, 15-Super-Pandigital numbers!    ```
The 14-SPN set only got to 10 values after testing 90+% of the base-set permutations.

Perhaps there are no base-set 16-SPNs? I'm not sure I have the patience for what might take a month or more to run it for 16-SPNs.
02 Nov 2016, 10:38
bitRAKE

Joined: 21 Jul 2003
Posts: 2653
Location: dank orb
Tomasz Grysztar wrote:
I called the performance of my simple implementation "far from perfect", but it actually generates all the partitions faster that the one from your sample (but in the opposite direction - mine goes in lexicographic order, while your does reverse lexicographic). This is probably because of a fewer memory accesses it makes, since their complexity appears to be the same. The version with "reuse" optimization cycles through all the partitions in half the time of your variant on my machine.
Yes, the reuse is almost ideal - as writes to memory are the slowest - even if the data doesn't change.

The permutations can be generated directly. This is interesting because a job can be partitioned across many cores. How to partition partitions across cores though?

_________________
utf8everywhere.org
06 Nov 2016, 01:43
bitRAKE

Joined: 21 Jul 2003
Posts: 2653
Location: dank orb
Code:
```First 10, 3-SuperPandigital numbers:
11,19,21,29,33,38,42,46,47,48    ```

_________________
utf8everywhere.org
06 Nov 2016, 05:43
Tomasz Grysztar
Assembly Artist

Joined: 16 Jun 2003
Posts: 6969
Location: Kraków, Poland
bitRAKE wrote:
The permutations can be generated directly. This is interesting because a job can be partitioned across many cores. How to partition partitions across cores though?
That's an interesting problem... I'll let you know if I come up with anything.
06 Nov 2016, 18:41
revolution
When all else fails, read the source

Joined: 24 Aug 2004
Posts: 16057
Location: 112 Ocean Avenue, Amityville
I used a factorial counter to generate the permutations directly but it was much slower than the swap algorithm presented in the first post. It would be easy to start at any arbitrary point if you use the factorial counter to generate the initial permutation and then use the reverse/swap method to continue from there.

I also modified the original algorithm to compute (rather than search) for the reversal point and that gave a small improvement in speed.
07 Nov 2016, 04:35
bitRAKE

Joined: 21 Jul 2003
Posts: 2653
Location: dank orb
The number of base-N, N-digit, N-Super Pandigital numbers:
Code:
`{1,1,3,13,22,75,130,255,223,168,159,101,11,4,..}    `
My search so far on base 16 hasn't found any. It is now clear the permutation does not need to be lexicographical - sorting can be done afterward. This opens us up to optimize where the code spends most it's time. I was also able to gain a few percent by optimizing the tail code of digit extraction.

Initially, the series is very deceptive - graph just the first eight values, lol.

_________________
utf8everywhere.org
12 Nov 2016, 17:11
revolution
When all else fails, read the source

Joined: 24 Aug 2004
Posts: 16057
Location: 112 Ocean Avenue, Amityville
bitRAKE wrote:
I was also able to gain a few percent by optimizing the tail code of digit extraction.
Oh, I see I didn't post my code I thought I had posted it above. So let's fix that. It uses inverse multiply for the digit extraction, gives about 4x speed up over div.
Code:
```START_BASE                      = 3             ;minimum valid value is 3
MAX_BASE                        = 16            ;maximum valid value is 16
SUM_OF                          = 10            ;any value you feel like > 0
UPDATE_INTERVAL                 = 4000000       ;smaller values update the progress faster
DIRECTLY_GENERATE_PERMUTATIONS  = 0             ;this appears to be slower than scanning, so disabled for now

;if values > 2^63 are expected then enable the extra work for bases 7 and 14
USE_65TH_MULTIPLIER_BIT = MAX_BASE shr 4

format pe64 console
include 'win64ax.inc'
fcall fix fastcall

.data

ticks                   db 'Ticks: %I64u',13,10,0
base                    db 'Base: %u',13,10,0
progress                db '%016I64x (%u.%04u%%)',13,0
found                   db 'Found %I64u, Sum %I64u',13,10,0
insufficient            db 'Insufficient digits for %u, %u-Super-Pandigital numbers!'
crlf                    db 13,10,0
align 8
inverse_multiply:       rq MAX_BASE

.code

Begin:
;initialise the inverse_multiply table
mov     ebx,MAX_BASE - 1
.loop_inverse_multiply:
lea     eax,[ebx-1]
bsr     ecx,eax
xor     edx,edx
bts     edx,ecx
div     rbx
if USE_65TH_MULTIPLIER_BIT
;check for (base - remainder - 1) > (2^shift)
not     edx
jz      .65th_bit_okay
bsr     edx,edx
lea     edx,[edx*2+1]
cmp     cl,dl
rcr     cl,1                    ;set the 65th multiplier bit
.65th_bit_okay:
end if
mov     [inverse_multiply+rbx*8],rax
mov     [inverse_multiply+rbx],cl
dec     ebx
cmp     ebx,2
jae     .loop_inverse_multiply
invoke  GetTickCount
mov     rsi,rax
mov     edi,START_BASE
.loop_base:
fcall   SuperPanDigital,rdi,SUM_OF
inc     edi
cmp     edi,MAX_BASE
jbe     .loop_base
invoke  GetTickCount
sub     rax,rsi
fcall   printf,ticks,rax
invoke  ExitProcess,0

proc SuperPanDigital uses rbx rsi rdi r12 r13 r14 r15,the_base,sumOf
locals
digits  rb MAX_BASE+1
endl
mov     esi,ecx                         ;rsi = base
mov     r13,rdx                         ;r13 = countdown for found numbers
mov     ebx,edx                         ;rbx = desired count of found numbers
fcall   printf,base,rsi
;initialise the digits
mov     word[digits+rsi-1],0
lea     ecx,[esi-1]
.init:
mov     [digits+rcx-1],cl
dec     ecx
jnz     .init
mov     rdi,0x0123456789abcdef
mov     rax,-16
lea     ecx,[(esi-1)*4]
rol     rax,cl
or      rdi,rax                         ;rdi = suffix counter
xor     r12,r12                         ;r12 = running sum
xor     r14,r14                         ;r14 = progress update counter
.next_permutation:
if DIRECTLY_GENERATE_PERMUTATIONS
;increment the factorial counter
inc     rdi
jz      .fail
bsf     rcx,rdi
mov     rdx,0x0123456789abcdef
xor     rax,rax
and     ecx,not 3
bts     rax,rcx
dec     rax
and     rax,rdx
or      rdi,rax
;get the next set of digits
mov     rax,rdi
sub     rax,rdx
lea     ecx,[esi*4]
ror     rax,cl
mov     r9,0xfedcba9876543210
mov     r10,rsi
.next_factorial_digit:
dec     r10
rol     rax,4
lea     ecx,[eax*4]
and     ecx,0xf*4
mov     rdx,r9
shr     rdx,cl
mov     r8,rdx
and     edx,0xf
mov     [digits+r10],dl
and     r8,-0x10
shl     r8,cl
shl     rdx,cl
xor     r9,r8
xor     r9,rdx
shr     r8,4
or      r9,r8
test    r10,r10
jnz     .next_factorial_digit
else
;find the next set of digits in lexicographical order
;get next suffix
inc     rdi
jz      .fail
bsf     rcx,rdi
mov     rdx,0x0123456789abcdef
xor     rax,rax
and     ecx,not 3
bts     rax,rcx
dec     rax
and     rax,rdx
or      rdi,rax
shr     ecx,2
mov     al,[digits+rcx]
;find the pivot
or      edx,-1
.pivot:
inc     edx
cmp     al,[digits+rdx]
jnc     .pivot
;swap pivot within suffix
.swap:
mov     ah,[digits+rdx]
mov     [digits+rdx],al
mov     [digits+rcx],ah
;reverse suffix to create lexical minimum
dec     ecx
xor     edx,edx
.reverse:
mov     ah,[digits+rcx]
mov     al,[digits+rdx]
mov     [digits+rdx],ah
mov     [digits+rcx],al
inc     edx
dec     ecx
cmp     edx,ecx
jl      .reverse
end if
inc     r14
cmp     r14,UPDATE_INTERVAL
jb      .update_done
;display the current state to assure the user the program is still running
mov     rax,rdi
mov     r8d,1
xor     r11,r11
xor     ecx,ecx
not     rax
.compute_update_position:
mov     edx,eax
and     edx,0xf                         ;rdx = next digit from factorial counter
sub     edx,ecx
neg     edx                             ;convert rdx to up-counter
inc     ecx                             ;rcx = digit position
mov     r9,r8                           ;r9 = the count of the skipped permutations at the start (used below after the loop has finished)
imul    rdx,r8                          ;rdx = this digits contribution to the percentage
imul    r8,rcx                          ;r8 = next digit weighting
add     r11,rdx                         ;r11 = converted factorial counter into binary
shr     rax,4
cmp     ecx,esi
jb      .compute_update_position
sub     r8,r9                           ;r8 = total permutations to check
sub     r11,r9                          ;r11 = number of permutation checked
mov     eax,1000000                     ;convert to 6 digits of precision
mul     r11
dec     r8
div     r8
mov     ecx,10000                       ;convert to percentage
xor     edx,edx
div     ecx
mov     r9,rdx
mov     rdx,rdi
not     rdx
fcall   printf,progress,rdx,rax,r9
xor     r14,r14                         ;restart progress count
.update_done:
;convert to binary
xor     r15,r15                         ;r15 = the current number
mov     ecx,esi
.to_binary:
movzx   edx,[digits+rcx-1]
imul    r15,rsi
dec     ecx
jnz     .to_binary
;check if the number is pandigital for all bases less than rsi
mov     r8,rsi                          ;r8 = base countdown
.check_pandigital:
dec     r8
mov     rax,r15
movsx   rcx,byte[inverse_multiply+r8]   ;cl = shift count and 65th multiplier bit
mov     r9,[inverse_multiply+r8*8]      ;r9 = inverse multiplier
if USE_65TH_MULTIPLIER_BIT
test    rcx,r15
js      .65_bit_multiply
end if
xor     r10,r10
bts     r10,r8
dec     r10                             ;r10 = bit set for each base digit
.next_digit:
mov     r11,rax
mul     r9
shr     rdx,cl
mov     rax,rdx
imul    dx,r8w
sub     r11,rdx                         ;r11b = output digit
btr     r10,r11
test    rax,rax
jnz     .next_digit
test    r10,r10
jnz     .next_permutation
cmp     r8,2
jnz     .check_pandigital
;show this number and the current sum so far in base 10
fcall   printf,found,r15,r12
xor     r14,r14                         ;restart progress count
dec     r13
jnz     .next_permutation
.done:
fcall   printf,crlf
ret
.fail:
fcall   printf,insufficient,rbx,rsi
jmp     .done
if used .65_bit_multiply
.65_bit_multiply:
mov     r11,rax
mul     r9
mov     r10,r11
shr     r10,1
sub     rax,r10
sbb     rdx,0
shr     rdx,cl
mov     rax,rdx
imul    dx,r8w
xor     r10,r10
bts     r10,r8
dec     r10                     ;r10 = bit set for each base digit
sub     r11,rdx                 ;r11b = output digit
btr     r10,r11
jmp     .next_digit
end if
endp

proc printf string,arg1,arg2,arg3
locals
buff    rb 1028
endl
mov     [arg1],rdx
mov     [arg2],r8
mov     [arg3],r9
mov     rdx,rcx                         ;rdx = string
ret
endp

proc write_stdoutput source,bytes
local   bytes_written:QWORD
mov     [source],rcx
mov     [bytes],rdx
invoke  GetStdHandle,STD_OUTPUT_HANDLE
test    rax,rax
jz      .done
mov     rax,[bytes_written]
.done:
ret
endp

.end Begin    ```
12 Nov 2016, 22:01
 Display posts from previous: All Posts1 Day7 Days2 Weeks1 Month3 Months6 Months1 Year Oldest FirstNewest First

 Jump to: Select a forum Official----------------Blog General----------------MainDOSWindowsLinuxUnixMenuetOS Specific----------------MacroinstructionsCompiler InternalsIDE DevelopmentOS ConstructionNon-x86 architecturesHigh Level LanguagesProgramming Language DesignProjects and IdeasExamples and Tutorials Other----------------FeedbackHeapTest Area

Forum Rules:
 You cannot post new topics in this forumYou cannot reply to topics in this forumYou cannot edit your posts in this forumYou cannot delete your posts in this forumYou cannot vote in polls in this forumYou cannot attach files in this forumYou can download files in this forum