flat assembler
Message board for the users of flat assembler.

Index > Tutorials and Examples > ARM GameBoy Advance on Android

Author
Thread Post new topic Reply to topic
uart777



Joined: 17 Jan 2012
Posts: 369
uart777 18 May 2013, 17:32
Minimal version of Z77 for ARM. Includes portable graphics (draw pixel, line, rectangle, scanline, gradient), text operations (copy, compare, etc) and more.

Assemble with FASMARM (outputs .GBA). Runs in popular GBA emulators: VisualBoy Advance, No$GBA and GBA Emu for Android.

Image

Download: http://sungod777.zxq.net/z77gba.zip

Thanks to Tomasz for FASM and to revolution for ARM addition. FASMARM is nice. Source is clear, written professionally and it includes documentation and examples. As a programming environment, I think FASM+ARM+Z77 together is way better than GNU/GCC ASM/C/C++ package. And I will never use Microsoft's junk bloated compilers or DevARMKit (600MB+ download!) or ARM's DS-5 package (bloatware) or Eclipse (slow, illogical, cluttered IDE, time consuming setup, downloads take forever).

You guys think drawing a 32BPP gradient in X86/Windoze is hard? Smile Try drawing a 15BPP (1.5.5.5) gradient directly to VRAM in ARM bare-metal! (GBA, RPI, etc). Spent hours struggling with "draw.fade" and finally got it. I guarantee you won't find another 15BPP draw gradient anywhere online, not to mention written in ARM assembler.

GBA programming references:

* GBAtek: http://nocash.emubase.de/gbatek.htm
* CowBite: http://www.cs.rit.edu/~tjh8300/CowBite/CowBiteSpec.htm

Source Code Preview:
Code:
; $$$$$$$$$$$$$$$$$$ Z77 4 ARM $$$$$$$$$$$$$$$$$$$
; *************** STAR^2 SOFTWARE ****************
; ???????????????????? Z.INC ?????????????????????
;    ___     __
;   / _/_ __/ /___ _________             
;  / _/ // / __/ // / __/ -_)            
; /_/ \_,_/\__/\_,_/_/  \__/_   __       
;  ___ ____ ___ ___ __ _  / /  / /__ ____
; / _ `(_-<(_-</ -_)  ' \/ _ \/ / -_) __/
; \_,_/___/___/\__/_/_/_/_.__/_/\__/_/

; output a flat binary image with the .GBA extension
format binary as 'GBA'
; NOTE(review): use32 is fasm's x86 code-size directive;
; confirm FASMARM accepts it here (code32 may be intended)
use32

; use a, b, ... - include 'use\a.inc', 'use\b.inc', ...
; one include per listed name, in the order given
macro use [f] { forward include 'use\'#`f#'.inc' }

; library modules
use cpu, language, math, system, memory, text, draw

;;;;;;;;;;;;;; EVOLUTION BEGINS NOW ;;;;;;;;;;;;;;

; fast unsigned division by 10. r0/10
; in:  r0 = dividend
; out: r0 = high 32 bits of the 64-bit product
; clobbers: r1 (constant), r2 (low 32 bits)

macro div10 {
movri r1, 1999999Ah    ; r1=((2^32)/10)+1
sub r0, r0, r0, lsr 30 ; r0=r0-(r0>>>30)
umull r2, r0, r1, r0   ; r2:r0=r1*r0, quotient in r0
}

; unsigned division by 10 with remainder
; in:  r0 = dividend
; out: r0 = quotient, r1 = remainder
; clobbers: r2 (by div10), r3 (copy of dividend)

macro divr10 {
mov r3, r0        ; dividend
div10             ; r0=r0/10 (the macro defined above)
mov r1, r0, lsl 1 ; multiply by 10:
add r1, r1, lsl 2 ; r1=(r0<<1)+((r0<<1)<<2)
sub r1, r3, r1    ; r1=r3-r1
}

; faster than div10
; multiplies by 3277 (1+3*(4+64+1024)) using shifts
; and adds, then shifts right by 15: r0*3277/32768
; in/out: r0. clobbers: r1
; NOTE(review): this is an approximation of r0/10;
; the valid input range is not stated - confirm

macro __div10 {
sub r0, r0, r0, lsr 14 ; r0=r0-(r0>>>14)
add r1, r0, r0, lsl 1  ; r1=r0+(r0<<1)
add r0, r0, r1, lsl 2  ; r0=r0+(r1<<2)
add r0, r0, r1, lsl 6  ; r0=r0+(r1<<6)
add r0, r0, r1, lsl 10 ; r0=r0+(r1<<10)
mov r0, r0, lsr 15     ; r0=(r0>>>15)
}

; divide by 255 (256-1): n=((n>>8)+n+1)>>8
; in:  r0 = n
; out: r0 = n/255
; clobbers: r1
; the formula is the usual shift approximation; it is
; exact for the product of two 8BIT values (n<=65025)

macro div255 {
add r1, r0, r0, lsr 8 ; r1=n+(n>>8)
add r1, r1, 1         ; r1=(n>>8)+n+1
mov r0, r1, lsr 8     ; r0=r1>>8
}

; memory.copy(destiny, source, count)
; in: a1=destination, a2=source, a3=byte count
; copies count/4 words, then the count%4 bytes left.
; a count of 0 copies nothing
; NOTE(review): the word loop uses ldr/str on the
; pointers as given - confirm callers pass word
; aligned addresses

function memory.copy, a, b, n
movs v2, a3, lsr 2 ; v2=n/4
beq .tail          ; no whole words
.32:               ; copy 32BIT
ldr v1, [a2], 4    ; v1=*a2++
str v1, [a1], 4    ; *a1++=v1
subs v2, v2, 1     ; v2 # times
bne .32
.tail:
ands a3, a3, 3     ; modulo 4
beq .!             ; no remainder (or n=0)
.8:                ; copy 8BIT
ldrb v1, [a2], 1   ; v1=*a2++
strb v1, [a1], 1   ; *a1++=v1
subs a3, a3, 1     ; a3 # times
bne .8
endf

; memory.set(destiny, value, count)
; in: a1=destination, a2=value, a3=byte count
; stores a2 as whole words count/4 times, then its
; low byte count%4 times, so a2 must already hold the
; fill pattern replicated across the word (see
; memory.set.b/.w). a count of 0 stores nothing

function memory.set, p, v, n
movs v2, a3, lsr 2 ; v2=n/4
beq .tail          ; no whole words
.32:               ; set 32BIT
str a2, [a1], 4    ; *a1++=a2
subs v2, v2, 1     ; v2 # times
bne .32
.tail:
ands a3, a3, 3     ; modulo 4
beq .!             ; no remainder (or n=0)
.8:                ; set 8BIT
strb a2, [a1], 1   ; *a1++=a2
subs a3, a3, 1     ; a3 # times
bne .8
endf

; memory.set.b(p, v, n) - fill with the low byte of
; v: replicate it into all 4 bytes, then memory.set

function memory.set.b, p, v, n
and a2, a2, 0FFh       ; keep byte 0
orr a2, a2, a2, lsl 8  ; bytes 0-1
orr a2, a2, a2, lsl 16 ; bytes 0-3
memory.set
endf

; memory.set.w(p, v, n) - fill with the low 16 bits
; of v: replicate them into both halves, then
; memory.set

function memory.set.w, p, v, n
mov a2, a2, lsl 16     ; low half -> high half
orr a2, a2, a2, lsr 16 ; and copy it back down
memory.set
endf

; memory.zero p, n - memory.set with a value of 0
macro memory.zero p, n
 { memory.set p, 0, n }

; $$$$$$$$$$$$$$$$$$ Z77 4 ARM $$$$$$$$$$$$$$$$$$$
; *************** STAR^2 SOFTWARE ****************
; ?????????????????? DRAW.INC ????????????????????

; fast ARM graphics rendering for any system:
; embedded, Raspberry PI, Windows Mobile, etc

; create BGR 15BPP (1.5.5.5), 0-31 each...
; in: a1=r, a2=g, a3=b. return: a1=r|(g<<5)|(b<<10)

function rgb, r, g, b
and v1, a3, 1Fh        ; b&=31
and v2, a2, 1Fh        ; g&=31
and a1, a1, 1Fh        ; r&=31
orr a1, a1, v2, lsl 5  ; c=r|(g<<5)
orr a1, a1, v1, lsl 10 ; c|=(b<<10)
endf

; alpha combination. a1/a2 = a/b. a3/n=0-31
; in:  a1=c1, a2=c2 (15BPP), a3=n
; out: a1 = per channel d+(((s-d)*(n<<3))>>8),
;      d taken from c1 and s from c2
; (s-d) can be negative, so the product is scaled
; with an arithmetic shift; a logical shift leaves
; high bits set that spill into the other channels.
; mul is written Rd, Rm, Rs with Rd<>Rm, as ARMv4
; requires

function mix, a, b, n
mov a3, a3, lsl 3  ; convert n to 0-255
mov v1, a1, lsr 10 ; db=(c1>>10)&11111b
mov v2, a2, lsr 10 ; sb=(c2>>10)&11111b
and v1, v1, 1Fh
and v2, v2, 1Fh
sub v2, v2, v1     ; (sb-db)
mul v2, a3, v2     ; (sb-db)*n
mov v2, v2, asr 8  ; (((sb-db)*n)>>8)+db
add v3, v2, v1
mov v1, a1, lsr 5  ; dg=(c1>>5)&11111b
mov v2, a2, lsr 5  ; sg=(c2>>5)&11111b
and v1, v1, 1Fh
and v2, v2, 1Fh
sub v2, v2, v1     ; (sg-dg)
mul v2, a3, v2     ; (sg-dg)*n
mov v2, v2, asr 8  ; (((sg-dg)*n)>>8)+dg
add v4, v2, v1
and v1, a1, 1Fh    ; dr=c1&11111b
and v2, a2, 1Fh    ; sr=c2&11111b
sub v2, v2, v1     ; (sr-dr)
mul v2, a3, v2     ; (sr-dr)*n
mov v2, v2, asr 8  ; (((sr-dr)*n)>>8)+dr
add a1, v2, v1     ; c=r|(g<<5)|(b<<10)
orr a1, v4, lsl 5
orr a1, v3, lsl 10
endf

; extract one 5BIT channel from two colors, scale
; both to 8.8 fixed point, subtract, then divide.
; return: a1=delta

; ((((b>>s)&m)<<8)-(((a>>s)&m)<<8))/n

function delta8, a, b, s, n
alias a=a1, b=a2, s=a3, n=a4, m=v1
mov m, 11111b
and a, m, a, lsr s ; (a>>s)&m
and b, m, b, lsr s ; (b>>s)&m
mov a, a, lsl 8    ; scale both by 256
mov b, b, lsl 8
rsb a, a, b        ; a=b-a
mov b, n           ; divisor
idiv
endf

;;;;;;;;;;;;;;;;;;;; DRAWING ;;;;;;;;;;;;;;;;;;;;;

; locate x, y[, w, h] - load a1/a2 with x/y and,
; when w is supplied, a3/a4 with w/h
macro locate x, y, w, h {
movri a1, x
movri a2, y
if ~w eq     ; w argument present?
 movri a3, w
 movri a4, h
end if
}

; clear.screen - fill VRAM with screen.color,
; two pixels per 32BIT store

function clear.screen
movri v1, VRAM          ; destination
movrmw v2, screen.color ; fill color
orr v2, v2, v2, lsl 16  ; same color in both halves
movri v3, (SCREEN.N/2)  ; stores remaining
.fill:
 str v2, [v1], 4        ; two pixels at once
 subs v3, v3, 1
bne .fill
endf

; vga.xy(x, y) - address of the pixel at x/y:
; VRAM+(y*SCREEN.PITCH)+(x*2). return: a1

function vga.xy, x, y
movri v1, VRAM         ; v1=vga
movri v2, SCREEN.PITCH
mul v2, a2, v2         ; v2=y*pitch
add v1, v1, a1, lsl 1  ; v1=vga+(x*2)
add a1, v1, v2         ; return v1+v2
endf

; color.at(x, y) - read the 16BIT pixel at x/y.
; return: a1=color
function color.at, x, y
vga.xy        ; a1=&vga[x, y]
ldrh a1, [a1] ; return vga[x, y]
endf

; draw.pixel(x, y) - store g.color at x/y
function draw.pixel, x, y
movrmw v1, g.color ; v1=current color
vga.xy             ; a1=&vga[x, y]
strh v1, [a1] ; *vga=color
endf

; draw.line.h(x, y, w) - horizontal line of w
; pixels in g.color, starting at x/y.
; a w of 0 or less draws nothing

function draw.line.h, x, y, w
cmp a3, 0          ; nothing to draw?
ble .!             ; a 0 count would wrap the loop
movrmw v1, g.color ; value=color
mov v2, a3
vga.xy
@@:
 strh v1, [a1], 2  ; *vga++=c
 subs v2, v2, 1    ; w # times
bne @b
endf

; draw.line.v(x, y, h) - vertical line of h
; pixels in g.color, starting at x/y.
; a h of 0 or less draws nothing

function draw.line.v, x, y, h
cmp a3, 0          ; nothing to draw?
ble .!             ; a 0 count would wrap the loop
movrmw v1, g.color
movri v2, SCREEN.PITCH
mov v3, a3
vga.xy
@@:
 strh v1, [a1]
 add a1, a1, v2 ; vga+pitch
 subs v3, v3, 1 ; h # times
bne @b
endf

; draw.scanline(x, y, w) - copy w 16BIT pixels
; from the buffer at v7 to the screen at x/y.
; pixels with bit 15 set are skipped.
; a w of 0 or less draws nothing

function draw.scanline, x, y, w
cmp a3, 0         ; nothing to draw?
ble .!
mov v3, a3        ; v3=pixels remaining
vga.xy
@@:
 ldrh v4, [v7], 2 ; v7 = pixels
 tst v4, 8000h    ; transparent?
 streqh v4, [a1]  ; store if opaque
 add a1, a1, 2
 subs v3, v3, 1
bne @b
endf

; draw.box(x, y, w, h) - filled rectangle in
; g.color. a w or h of 0 or less draws nothing

function draw.box, x, y, w, h
cmp a3, 0          ; w<=0?
cmpgt a4, 0        ; or h<=0?
ble .!             ; 0 counts would wrap the loops
mov v2, a3
movri v3, SCREEN.PITCH
mov v4, a4
movrmw v5, g.color
vga.xy
.y:
 mov a3, v2        ; pixels in this row
 mov v1, a1        ; v1=row start
 .x:
  strh v5, [v1], 2 ; *vga++=c
  subs a3, a3, 1   ; w # times
 bne .x
 add a1, a1, v3    ; vga+pitch
 subs v4, v4, 1    ; h # times
bne .y
endf

; draw.outline(x, y, w, h) - rectangle outline
; made of two draw.line.h and two draw.line.v.
; a w or h of 0 or less draws nothing. with h=1
; the right edge has no rows of its own and is
; skipped

function draw.outline, x, y, w, h
cmp a3, 0      ; w<=0?
cmpgt a4, 0    ; or h<=0?
ble .!
mov v1, a1     ; keep x, y, w, h across calls
mov v2, a2
mov v3, a3
mov v4, a4
draw.line.h    ; top: x, y, w (still in a1-a3)
mov a1, v1     ; left: x, y, h
mov a2, v2
mov a3, v4
draw.line.v
subs a3, v4, 1 ; right: x+w-1, y+1, h-1
ble .bottom    ; h-1=0: nothing below the top row
add a1, v1, v3
sub a1, a1, 1
add a2, v2, 1
draw.line.v
.bottom:
mov a1, v1     ; bottom: x, y+h-1, w
add a2, v2, v4
sub a2, a2, 1
mov a3, v3
draw.line.h
endf

; draw gradual vertical fade. parameters:
; a1-a4 = x, y, w, h. v5-v6 = c1, c2
; each channel is kept as an 8.8 fixed point value
; that starts at c1 and has a per-row delta (from
; delta8) added after every row
; NOTE(review): h is passed to delta8 as the
; divisor and counts the rows - h=0 looks unsafe,
; confirm callers never pass it

function draw.fade, x, y, w, h, c1, c2
alias r=v1, g=v2, b=v3, rn=v4, gn=v5, bn=v6,\
 c1=v7, c2=v8, h=r12

push a1-a4
mov c1, v5 ; save colors (v5/v6 are reused as gn/bn)
mov c2, v6
mov h, a4  ; height

delta8 c1, c2, 0, r12  ; get deltas...
mov rn, a1             ; bits 0-4
delta8 c1, c2, 5, r12
mov gn, a1             ; bits 5-9
delta8 c1, c2, 10, r12
mov bn, a1             ; bits 10-14

mov a1, c1
get.rgb          ; defined elsewhere; a1-a3 are read as r/g/b below
mov r, a1, lsl 8 ; extract and scale
mov g, a2, lsl 8 ; r/g/b components
mov b, a3, lsl 8
pop a1-a4        ; restore x/y/w/h

push a3-a4       ; save w/h
vga.xy           ; a1=&vga[x, y]
pop a3-a4        ; x/y no longer needed

.y:
 push a1-a4
 mov a1, r, lsr 8 ; r=(r>>8)
 mov a2, g, lsr 8 ; g=(g>>8)
 mov a3, b, lsr 8 ; b=(b>>8)
 rgb
 mov v7, a1       ; v7=row color (c1 no longer needed)
 pop a1-a4

 mov v8, a3        ; draw lines...
 .x:
  strh v7, [a1], 2 ; *vga++=c
  subs v8, v8, 1   ; w # times
 bne .x
 sub a1, a3, lsl 1 ; vga-(w*2)
 movri v7,\
  SCREEN.PITCH
 add a1, a1, v7    ; vga+pitch
 add r, r, rn      ; r/g/b+deltas
 add g, g, gn      ; 8.8 fixed points
 add b, b, bn
 subs h, h, 1      ; h # times
bne .y
endf

; draw vertical center fade: c1 to c2
; then c2 to c1
; a1-a4 = x, y, w, h. v5-v6 = c1, c2
; each draw.fade call covers h/2 rows
; NOTE(review): pusha/popa are defined elsewhere;
; presumably they save/restore the registers that
; draw.fade consumes - confirm

function draw.shade, x, y, w, h, c1, c2
alias y=a2, h=a4, c1=v5, c2=v6, tmp=r12
mov h, h, lsr 1 ; h/2
pusha
draw.fade       ; upper half: c1 to c2
popa
add y, y, h     ; move down
mov tmp, c1     ; exchange colors
mov c1, c2
mov c2, tmp
draw.fade       ; lower half: c2 to c1
endf      

; $$$$$$$$$$$$$$$$$$ Z77 4 ARM $$$$$$$$$$$$$$$$$$$
; *************** STAR^2 SOFTWARE ****************
; ?????????????????? TEXT.INC ????????????????????

; fast portable ARM text operations. re-written
; from scratch. more efficient than X86 version

; "fast-call" convention. parameters: a1-a4/r0-r3

; text.n t          ; get # characters (size-1)
; text.copy a, b    ; standard copy with 0 after
; text.copy.n ...   ; copy with maximum size
; text.attach a, b  ; attach b to a; "concatenate"
; text.attach.c...  ; attach character
; text.compare a, b ; compare. return <0, 0 or >0
; text.find t, c    ; search for c. return &/0
; text.find.last... ; search for c reverse
; text.begins a, b  ; begins with b?
; text.ends a, b    ; ends with b?
; text.upper t      ; convert to uppercase
; text.lower t      ; convert to lowercase
; text.reverse t    ; reverse

; i2t n, t          ; number/text conversions
; u2t n, t
; h2t n, t
; b2t n, t

; C library style names for the text functions
strlen  fix text.n
strcpy  fix text.copy
strncpy fix text.copy.n
strcat  fix text.attach
strcmp  fix text.compare
strchr  fix text.find
strrchr fix text.find.last
strrev  fix text.reverse

align 4

; digit characters indexed by a 4BIT value (see h2t)
_hex: db '0123456789ABCDEF'

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; text.n(t) - count the characters before the
; terminating 0. in: a1=t. return: a1=length

function text.n, t
mov v2, a1        ; remember the start
@@:
 ldrb v1, [a1], 1 ; v1=*a1++
 cmp v1, 0        ; stop once the 0 was read
bne @b
sub v2, a1, v2    ; bytes consumed, 0 included
sub a1, v2, 1     ; length leaves the 0 out
endf

; text.copy(a, b) - copy b to a, 0 included.
; return: a1=address after the copied 0

function text.copy, a, b
@@:
 ldrb v1, [a2], 1 ; c=*b++
 cmp v1, 0        ; last one? (strb keeps flags)
 strb v1, [a1], 1 ; *a++=c
bne @b
endf

; text.copy.n(a, b, n) - copy with maximum
; size specified
; copies at most n characters of b. if the 0 of b
; is reached it is copied and the function ends;
; otherwise a 0 is stored after the n characters,
; so a must have room for n+1 bytes.
; n=0 stores nothing

function text.copy.n, a, b, n
tst a3, a3        ; n=0?
beq .!
@@:
 ldrb v1, [a2], 1 ; v1=*a2++
 strb v1, [a1], 1 ; *a1++=v1
 tst v1, v1       ; until 0
 beq .!           ; 0 already stored: done
 subs a3, a3, 1   ; or # times
bne @b
mov a3, 0         ; n copied, no 0 seen:
strb a3, [a1]     ; terminate here
endf

; text.attach(a, b) - copy b, 0 included, over
; the terminating 0 of a.
; return: a1=address after the copied 0

function text.attach, a, b
@@:
 ldrb v1, [a1]    ; find the 0 of a
 cmp v1, 0
 addne a1, a1, 1  ; step only past characters
bne @b
@@:
 ldrb v1, [a2], 1 ; c=*b++
 cmp v1, 0        ; last one? (strb keeps flags)
 strb v1, [a1], 1 ; *a++=c
bne @b
endf

; text.attach.c(t, c) - store c over the
; terminating 0 of t and a new 0 after it.
; return: a1=address of the new 0

function text.attach.c, t, c
@@:
 ldrb v1, [a1]    ; find the 0 of t
 cmp v1, 0
 addne a1, a1, 1  ; step only past characters
bne @b
strb a2, [a1], 1  ; *t++=c
mov a3, 0
strb a3, [a1]     ; *t=0
endf

; text.compare(a, b) - lexical comparison.
; return: a1=difference of the first pair of
; bytes that differ, or 0 if both texts end equal

function text.compare, a, b
@@:
 ldrb v1, [a1], 1 ; x=*a++
 ldrb v2, [a2], 1 ; y=*b++
 cmp v1, v2
 bne @f           ; mismatch
 cmp v1, 0        ; equal: continue unless both 0
bne @b
@@:
sub a1, v1, v2    ; return x-y
endf

; text.find(t, c) - search for character.
; return: a1=address of the first c, or 0

function text.find, t, c
@@:
 ldrb v1, [a1] ; x=*t
 cmp v1, a2
 beq .!        ; match: a1 is its address
 cmp v1, 0     ; end of text?
 add a1, a1, 1 ; t++ (flags untouched)
bne @b
mov a1, 0      ; not found
endf

; text.find.last(t, c) - search for c reverse
; return: a1=address of the last c, or 0.
; scans from the last character down to and
; including the first one; an empty text returns
; 0 without reading outside of it

function text.find.last, t, c
mov v2, a1        ; save start
@@:
 ldrb v1, [a1], 1 ; advance past the 0
 tst v1, v1
bne @b
sub a1, a1, 1     ; a1=address of the 0
@@:
 cmp a1, v2       ; reached the beginning?
 movls a1, 0      ; nothing left: not found
 bls .!
 sub a1, a1, 1    ; a1--
 ldrb v1, [a1]    ; v1=*a1
 cmp v1, a2       ; found? a1 is its address
bne @b
endf

; text.begins(a, b) - a begins with b?
; return: a1=position in a where the scan
; stopped if b was consumed entirely, else 0

function text.begins, a, b
@@:
 ldrb v2, [a2] ; y=*b
 ldrb v1, [a1] ; x=*a
 tst v1, v2    ; no common bits (a 0 included)?
 beq @f
 add a2, a2, 1 ; b++
 add a1, a1, 1 ; a++
 cmp v1, v2    ; continue while x=y
beq @b
@@:
tst v2, v2     ; stopped on the 0 of b?
movne a1, 0    ; no
endf

; text.ends(a, b) - a ends with b?
; return: a1=1 if the last text.n(b) characters
; of a equal b, else 0. if b is longer than a
; the answer is 0 and nothing before a is read

function text.ends, a, b
mov v1, a1     ; save a1/a2
mov v2, a2
text.n         ; n1=text.n(a)
mov v3, a1
mov a1, v2
text.n         ; n2=text.n(b)
mov v4, a1
cmp v3, v4     ; n1<n2?
movlo a1, 0    ; b cannot fit in a
blo .!
add a1, v1, v3 ; a=a+n1-n2
sub a1, a1, v4
mov a2, v2
text.compare   ; a+n1-n2, b
cmp a1, 0
moveq a1, 1    ; if 0, return 1
movne a1, 0    ; if !0, return 0
endf

; text.upper(t) - convert 'a'-'z' to 'A'-'Z'
; in place

function text.upper, t
@@:
 ldrb v1, [a1]   ; c=*t
 sub v2, v1, 'a' ; c-'a' is 0-25 for lowercase,
 cmp v2, 'z'-'a' ; anything else is above 25
 bhi .next
 sub v1, v1, 32  ; c-32
 strb v1, [a1]   ; *t=c
 .next:
 add a1, a1, 1   ; t++
 cmp v1, 0       ; until 0
bne @b
endf

; text.lower(t) - convert 'A'-'Z' to 'a'-'z'
; in place

function text.lower, t
@@:
 ldrb v1, [a1]   ; c=*t
 sub v2, v1, 'A' ; c-'A' is 0-25 for uppercase,
 cmp v2, 'Z'-'A' ; anything else is above 25
 bhi .next
 add v1, v1, 32  ; c+32
 strb v1, [a1]   ; *t=c
 .next:
 add a1, a1, 1   ; t++
 cmp v1, 0       ; until 0
bne @b
endf

; text.reverse(t) - reverse text in place
; in: a1=t. a1/a2 walk inwards from both ends and
; exchange bytes until they meet or cross. the
; check comes before the exchange, so an empty
; text is left alone and nothing before t is
; touched

function text.reverse, t
mov v4, a1
@@:
 ldrb v1, [v4], 1  ; advance v4 past the 0
 tst v1, v1
bne @b
sub a2, v4, 2      ; a2=last character
@@:                ; exchange *a1++/*a2--
 cmp a1, a2        ; met or crossed?
 bhs .!
 ldrb v1, [a1]
 ldrb v2, [a2]
 strb v2, [a1], 1
 strb v1, [a2], -1
b @b
endf

;;;;;;;;;;;;;;;;;; CONVERSIONS ;;;;;;;;;;;;;;;;;;;

; convert.digits - store the decimal digits of
; a1, least significant first, at *v2++
; in:  a1=n (unsigned), v2=output pointer
; out: a1=0, v2=address after the last digit
; clobbers: a2, a3, a4, v1
; umull is written with the constant as Rm so that
; RdLo, RdHi and Rm are distinct registers, as
; required before ARMv6

macro convert.digits {
@@:                     ; c=(n%10)+'0', n/10
 mov a3, a1             ; dividend
 movri a2, 1999999Ah    ; ((2^32)/10)+1
 sub a1, a1, a1, lsr 30 ; a1=a1-(a1>>>30)
 umull a4, a1, a2, a1   ; a4:a1=a2*a1, a1=n/10
 mov a2, a1, lsl 1      ; multiply by 10:
 add a2, a2, lsl 2      ; a2=a2+(a2<<2)
 sub a2, a3, a2         ; remainder
 add v1, a2, '0'        ; digit
 strb v1, [v2], 1       ; *t++=c
 tst a1, a1             ; while not 0
bne @b
}

; reverse.digits - terminate the digits written so
; far and put them in reading order
; in:  v2=address after the last digit, v3=first digit
; out: a1=v2 (address of the terminating 0)
macro reverse.digits {
mov v1, 0
strb v1, [v2] ; *t=0
mov a1, v3    ; reverse from the first digit
text.reverse
mov a1, v2    ; return the end address
}

; convert unsigned 32BIT integer to text
; in:  a1=n, a2=t (output buffer)
; out: a1=address of the terminating 0

function u2t, n, t
mov v1, a1     ; (convert.digits overwrites v1)
mov v2, a2     ; v2=write pointer
mov v3, a2     ; v3=first digit, for the reverse
convert.digits ; digits, least significant first
reverse.digits ; terminate, reverse, a1=end
endf

; convert signed 32BIT integer to text: store a
; '-' and negate when n is negative, then u2t

function i2t, n, t
cmp a1, 0         ; sign bit clear?
bge @f
 mov v1, '-'
 strb v1, [a2], 1 ; *t++='-'
 rsb a1, a1, 0    ; n=0-n
@@:
u2t
endf

; convert 32BIT number to hexadecimal text.
; digits are produced from the lowest nibble up,
; then reversed. no leading zeros

function h2t, n, t
movri v5, _hex       ; digit table
mov v3, a2           ; first digit
mov v2, a2           ; write pointer
mov v1, a1           ; n
@@:
 and v4, v1, 15      ; low nibble
 ldrb v4, [v5, v4]   ; its character
 strb v4, [v2], 1    ; *t++=c
 movs v1, v1, lsr 4  ; n/16, until 0
bne @b
reverse.digits
endf

; convert 32BIT number to binary text.
; digits are produced from bit 0 up, then
; reversed. no leading zeros

function b2t, n, t
mov v3, a2          ; first digit
mov v2, a2          ; write pointer
mov v1, a1          ; n
@@:
 and v4, v1, 1      ; low bit
 orr v4, v4, '0'    ; '0' or '1'
 strb v4, [v2], 1   ; *t++=c
 movs v1, v1, lsr 1 ; n/2, until 0
bne @b
reverse.digits
endf    
Post 18 May 2013, 17:32
View user's profile Send private message Reply with quote
HaHaAnonymous



Joined: 02 Dec 2012
Posts: 1178
Location: Unknown
HaHaAnonymous 18 May 2013, 17:42
[ Post removed by author. ]


Last edited by HaHaAnonymous on 28 Feb 2015, 20:30; edited 1 time in total
Post 18 May 2013, 17:42
View user's profile Send private message Reply with quote
MHajduk



Joined: 30 Mar 2006
Posts: 6115
Location: Poland
MHajduk 18 May 2013, 17:43
Thank you uart777 for this example (somebody has always to make the first step). Wink I've never made anything for ARM but I think that I would need your example in the future when I will "translate" some of my projects to it. Smile
Post 18 May 2013, 17:43
View user's profile Send private message Visit poster's website Reply with quote
edfed



Joined: 20 Feb 2006
Posts: 4353
Location: Now
edfed 18 May 2013, 18:24
nice Smile

can this work (and how to run) on a HTC explorer?
Post 18 May 2013, 18:24
View user's profile Send private message Visit poster's website Reply with quote
TmX



Joined: 02 Mar 2006
Posts: 843
Location: Jakarta, Indonesia
TmX 19 May 2013, 02:36
Nice. Hopefully the next release of Z77 can run on Android natively

Very Happy
Post 19 May 2013, 02:36
View user's profile Send private message Reply with quote
revolution
When all else fails, read the source


Joined: 24 Aug 2004
Posts: 20430
Location: In your JS exploiting you and your system
revolution 19 May 2013, 02:50
uart777: I would suggest you use the processor and coprocessor directives to make sure that fasmarm doesn't generate opcodes not supported by the CPU.
Post 19 May 2013, 02:50
View user's profile Send private message Visit poster's website Reply with quote
uart777



Joined: 17 Jan 2012
Posts: 369
uart777 19 May 2013, 09:33
revolution: Which directive/s are needed for GBA? It has an ARM7TDMI CPU. What about Raspberry PI? ARM1176JZF-S.

MHajduk: ARM assembly is harder than Intel, but in many ways, ARM is a more powerful CPU and it can do more in one instruction.

edfed: You can try running in any GBA emulator. Thinking about ordering a HTC Tilt 2 for Windows Mobile programming if I can find one on EBay for <=$20.
Post 19 May 2013, 09:33
View user's profile Send private message Reply with quote
revolution
When all else fails, read the source


Joined: 24 Aug 2004
Posts: 20430
Location: In your JS exploiting you and your system
revolution 19 May 2013, 10:05
uart777 wrote:
revolution: Which directive/s are needed for GBA? It has a ARM7TDMI CPU.
Well strangely enough this one is mentioned in the ReadMe.txt file.
ReadMe.txt wrote:
For ARM7TDMI CPUs:
processor 0xfe
coprocessor 0x0
uart777 wrote:
What about Raspberry PI? ARM1176JZF-S.
I haven't looked into the ARM11 chip but I think it is V6 for the CPU instruction set. I don't think it supports T2, but could be wrong there so best to properly see the manual to confirm.
Post 19 May 2013, 10:05
View user's profile Send private message Visit poster's website Reply with quote
uart777



Joined: 17 Jan 2012
Posts: 369
uart777 19 May 2013, 11:34
Thanks revolution. Sorry, I overlooked that, although I did view your documentation and I was specifically looking for how to call Windows Mobile/CE functions from coredll.

Let me get this straight... To call a Windows Mobile/CE function, you send the first 4 parameters in r0-r3/a1-a4 then the remaining on the stack, right? In what order? For example, how exactly would I call CreateFileW from coredll? Example?

GBA does not support the movw/movt method (>=ARMv6T2) of constructing immediate values so I wrote my own "movri" (in CPU.INC) and it does not produce the pointless "orr 0"'s (as seen in typical examples).


Last edited by uart777 on 10 Aug 2013, 09:22; edited 2 times in total
Post 19 May 2013, 11:34
View user's profile Send private message Reply with quote
revolution
When all else fails, read the source


Joined: 24 Aug 2004
Posts: 20430
Location: In your JS exploiting you and your system
revolution 19 May 2013, 11:44
For WinCE it follows the APS calling convention. This is explained in the file PROCAPS.INC
PROCAPS.INC wrote:
;High level procedure macros for APS (ARM Procedure Standard) calling
;
;The APS call standard requires the first four parameters to be passed in registers r0-r3, any remaining
;parameters are placed on the stack and pushed in reverse order. The stack is always full descending.
;Caller always restores the stack. Any return values are in r0-r3.
;
;r4-r11 must always be preserved by procedures.
;r12 is always a scratch register and considered corruptable.
;r13 is always the stack register and is always used in full descending mode.
;r14 is always the return address and considered corruptable.
Post 19 May 2013, 11:44
View user's profile Send private message Visit poster's website Reply with quote
uart777



Joined: 17 Jan 2012
Posts: 369
uart777 19 May 2013, 12:12
Thanks, again. That's what I thought. I have followed the ARM APS standard in my code.
Code:
; standard register names. lowest-level macros
; - CPU+LANGUAGE - shall use the names r0-r12
; while high-level functions - in the library -
; use a1-a4/v1-v8 to make a clear distinction
; between parameters and "scratch registers"    
Ok, let me shove some hardcore ARM ASM in the face of these newbies Smile Grow up, pussies Wink
Code:
; create BGR 15BPP (1.5.5.5), 0-31 each...
; in: a1=r, a2=g, a3=b. return: a1=r|(g<<5)|(b<<10)

function rgb, r, g, b
and v1, a3, 1Fh        ; b&=31
and v2, a2, 1Fh        ; g&=31
and a1, a1, 1Fh        ; r&=31
orr a1, a1, v2, lsl 5  ; c=r|(g<<5)
orr a1, a1, v1, lsl 10 ; c|=(b<<10)
endf

; alpha combination. a1/a2 = a/b. a3/n=0-31
; in:  a1=c1, a2=c2 (15BPP), a3=n
; out: a1 = per channel d+(((s-d)*(n<<3))>>8),
;      d taken from c1 and s from c2
; (s-d) can be negative, so the product is scaled
; with an arithmetic shift; a logical shift leaves
; high bits set that spill into the other channels.
; mul is written Rd, Rm, Rs with Rd<>Rm, as ARMv4
; requires

function mix, a, b, n
mov a3, a3, lsl 3  ; convert n to 0-255
mov v1, a1, lsr 10 ; db=(c1>>10)&11111b
mov v2, a2, lsr 10 ; sb=(c2>>10)&11111b
and v1, v1, 1Fh
and v2, v2, 1Fh
sub v2, v2, v1     ; (sb-db)
mul v2, a3, v2     ; (sb-db)*n
mov v2, v2, asr 8  ; (((sb-db)*n)>>8)+db
add v3, v2, v1
mov v1, a1, lsr 5  ; dg=(c1>>5)&11111b
mov v2, a2, lsr 5  ; sg=(c2>>5)&11111b
and v1, v1, 1Fh
and v2, v2, 1Fh
sub v2, v2, v1     ; (sg-dg)
mul v2, a3, v2     ; (sg-dg)*n
mov v2, v2, asr 8  ; (((sg-dg)*n)>>8)+dg
add v4, v2, v1
and v1, a1, 1Fh    ; dr=c1&11111b
and v2, a2, 1Fh    ; sr=c2&11111b
sub v2, v2, v1     ; (sr-dr)
mul v2, a3, v2     ; (sr-dr)*n
mov v2, v2, asr 8  ; (((sr-dr)*n)>>8)+dr
add a1, v2, v1     ; c=r|(g<<5)|(b<<10)
orr a1, v4, lsl 5
orr a1, v3, lsl 10
endf

; extract one 5BIT channel from two colors, scale
; both to 8.8 fixed point, subtract, then divide.
; return: a1=delta

; ((((b>>s)&m)<<8)-(((a>>s)&m)<<8))/n

function delta8, a, b, s, n
alias a=a1, b=a2, s=a3, n=a4, m=v1
mov m, 11111b
and a, m, a, lsr s ; (a>>s)&m
and b, m, b, lsr s ; (b>>s)&m
mov a, a, lsl 8    ; scale both by 256
mov b, b, lsl 8
rsb a, a, b        ; a=b-a
mov b, n           ; divisor
idiv
endf    
Post 19 May 2013, 12:12
View user's profile Send private message Reply with quote
revolution
When all else fails, read the source


Joined: 24 Aug 2004
Posts: 20430
Location: In your JS exploiting you and your system
revolution 19 May 2013, 12:19
uart777 wrote:
I have followed the ARM APS standard in my code.
It is not clear to me where you are saving the registers r4-r11 (v1-v8) in your code. It appears as though your functions will corrupt these registers. Or does the function macro automatically always save and restore r4-r11?
Post 19 May 2013, 12:19
View user's profile Send private message Visit poster's website Reply with quote
MHajduk



Joined: 30 Mar 2006
Posts: 6115
Location: Poland
MHajduk 19 May 2013, 12:20
uart777 wrote:
Ok, let me shove some hardcore ARM ASM in the face of these newbies Smile Grow up, pussies Wink
The day without a curse is a lost day, oh yeah. Smile
Post 19 May 2013, 12:20
View user's profile Send private message Visit poster's website Reply with quote
uart777



Joined: 17 Jan 2012
Posts: 369
uart777 19 May 2013, 20:16
revolution: Yes, "function" saves registers r4-r12+lr: stmfd sp!, { r4-r12, lr }.


Last edited by uart777 on 10 Aug 2013, 09:23; edited 1 time in total
Post 19 May 2013, 20:16
View user's profile Send private message Reply with quote
revolution
When all else fails, read the source


Joined: 24 Aug 2004
Posts: 20430
Location: In your JS exploiting you and your system
revolution 20 May 2013, 00:36
uart777 wrote:
revolution: Yes, "function" saves registers r4-r12+lr: stmfd sp!, { r4-r12, lr }.
With these battery powered devices it might be prudent to only save the registers that are altered within each function. Battery life can suffer quite significantly if redundant operations are being done continuously. It might not be important how quickly it runs because it runs too fast for the user to notice, but the power usage will affect how often the user must charge/change the battery.
Post 20 May 2013, 00:36
View user's profile Send private message Visit poster's website Reply with quote
uart777



Joined: 17 Jan 2012
Posts: 369
uart777 20 May 2013, 06:52
revolution: I thought about this, too, but the default "function" saves all 'v' registers just to be safe. Safety is more important than optimization. Who cares how fast a program is if it doesn't work? I use ASM primarily for knowledge. Drawing requires 1,000s of times the speed compared to the call overhead, so reducing it won't make a difference. 97%+ of the time is spent drawing.

Another thing to consider is that "let" should perform ARM specific optimizations. For example, "let" should detect/match sequences involving the barrel shifter like this: "let r1<<r2, r0+r1" and replace it with: "add r0, r1, lsl r2".
Post 20 May 2013, 06:52
View user's profile Send private message Reply with quote
Display posts from previous:
Post new topic Reply to topic

Jump to:  


< Last Thread | Next Thread >
Forum Rules:
You cannot post new topics in this forum
You cannot reply to topics in this forum
You cannot edit your posts in this forum
You cannot delete your posts in this forum
You cannot vote in polls in this forum
You cannot attach files in this forum
You can download files in this forum


Copyright © 1999-2024, Tomasz Grysztar. Also on GitHub, YouTube.

Website powered by rwasa.