flat assembler
Message board for the users of flat assembler.
Index
> Tutorials and Examples > ARM GameBoy Advance on Android |
Author |
|
uart777 18 May 2013, 17:32
Minimal version of Z77 for ARM. Includes portable graphics (draw pixel, line, rectangle, scanline, gradient), text operations (copy, compare, etc) and more.
Assemble with FASMARM (outputs .GBA). Runs in popular GBA emulators: VisualBoy Advance, No$GBA and GBA Emu for Android. Download: http://sungod777.zxq.net/z77gba.zip Thanks to Tomasz for FASM and to revolution for ARM addition. FASMARM is nice. Source is clear, written professionally and it includes documentation and examples. As a programming environment, I think FASM+ARM+Z77 together is way better than GNU/GCC ASM/C/C++ package. And I will never use Microsoft's junk bloated compilers or DevARMKit (600MB+ download!) or ARM's DS-5 package (bloatware) or Eclipse (slow, illogical, cluttered IDE, time consuming setup, downloads take forever). You guys think drawing a 32BPP gradient in X86/Windoze is hard? Try drawing a 15BPP (1.5.5.5) gradient directly to VRAM in ARM bare-metal! (GBA, RPI, etc). Spent hours struggling with "draw.fade" and finally got it. I guarantee you won't find another 15BPP draw gradient anywhere online, not to mention written in ARM assembler. GBA programming references: * GBAtek: http://nocash.emubase.de/gbatek.htm * CowBite: http://www.cs.rit.edu/~tjh8300/CowBite/CowBiteSpec.htm Source Code Preview:
Code:
; $$$$$$$$$$$$$$$$$$ Z77 4 ARM $$$$$$$$$$$$$$$$$$$
; *************** STAR^2 SOFTWARE ****************
; ???????????????????? Z.INC ?????????????????????
; ___ __
; / _/_ __/ /___ _________
; / _/ // / __/ // / __/ -_)
; /_/ \_,_/\__/\_,_/_/ \__/_ __
; ___ ____ ___ ___ __ _ / / / /__ ____
; / _ `(_-<(_-</ -_) ' \/ _ \/ / -_) __/
; \_,_/___/___/\__/_/_/_/_.__/_/\__/_/

; NOTE(review): "function"/"endf", "alias", "movri", "movrmw", "idiv",
; "get.rgb", "pusha"/"popa" and the ".!" exit label are not defined in
; this listing; they presumably come from the files pulled in by "use"
; below - confirm against those files. Functions are invoked by writing
; their name; with no arguments the current a1-a4 are passed through.
; The code below relies on callees leaving v1-v8/r12 intact.

format binary as 'GBA'
use32

macro use [f] { forward include 'use\'#`f#'.inc' }

use cpu, language, math, system, memory, text, draw

;;;;;;;;;;;;;; EVOLUTION BEGINS NOW ;;;;;;;;;;;;;;

; fast unsigned division by 10. r0/10
; in: r0=n. out: r0=quotient. changes r1, r2

macro div10 {
  movri r1, 1999999Ah           ; r1=((2^32)/10)+1
  sub r0, r0, r0, lsr 30        ; r0=r0-(r0>>>30)
  umull r2, r0, r1, r0          ; r0=high word of r1*r0
}

; with remainder in r1
; in: r0=n. out: r0=quotient, r1=n-(quotient*10). changes r3
; NOTE(review): invokes "udiv10", which is not defined in this
; listing (only "div10" is) - confirm it exists elsewhere

macro divr10 {
  mov r3, r0                    ; dividend
  udiv10
  mov r1, r0, lsl 1             ; multiply by 10:
  add r1, r1, lsl 2             ; r1=(r0<<1)+((r0<<1)<<2)
  sub r1, r3, r1                ; r1=r3-r1
}

; faster than div10
; shift/add approximation; in/out: r0. changes r1
; NOTE(review): valid input range is not stated - verify before use

macro __div10 {
  sub r0, r0, r0, lsr 14        ; r0=r0-(r0>>>14)
  add r1, r0, r0, lsl 1         ; r1=r0+(r0<<1)
  add r0, r0, r1, lsl 2         ; r0=r0+(r1<<2)
  add r0, r0, r1, lsl 6         ; r0=r0+(r1<<6)
  add r0, r0, r1, lsl 10        ; r0=r0+(r1<<10)
  mov r0, r0, lsr 15            ; r0=(r0>>>15)
}

; divide by 255 (256-1): n=((n>>8)+n+1)>>8
; in: r0=n. out: r0=result. changes r1
; (previously produced n+1+(n>>16): the first
; shift was never added and the sum never shifted)

macro div255 {
  add r1, r0, r0, lsr 8         ; r1=n+(n>>8)
  add r1, r1, 1                 ; +1
  mov r0, r1, lsr 8             ; r0=r1>>8
}

; memory.copy(destiny, source, count)
; a1=destiny, a2=source, a3=count in bytes.
; copies 32BIT units first then the remaining
; bytes. count<=0 copies nothing (before, n=0
; entered the byte loop and wrapped around)
; NOTE(review): the 32BIT loop uses ldr/str on the
; addresses as given - presumably callers pass
; aligned addresses when count>=4; confirm

function memory.copy, a, b, n
  cmp a3, 0                     ; if n<=0
  ble .!                        ; nothing to do
  cmp a3, 4                     ; if n<4
  blt .8                        ; copy 8BIT
  mov v2, a3, lsr 2             ; v2=n/4
  .32:                          ; copy 32BIT
  ldr v1, [a2], 4               ; v1=*a2++
  str v1, [a1], 4               ; *a1++=v1
  subs v2, v2, 1                ; v2 # times
  bne .32
  ands a3, a3, 3                ; modulo 4
  beq .!                        ; remainder?
  .8:                           ; copy 8BIT
  ldrb v1, [a2], 1              ; v1=*a2++
  strb v1, [a1], 1              ; *a1++=v1
  subs a3, a3, 1                ; a3 # times
  bne .8
endf

; memory.set(destiny, value, count)
; a1=destiny, a2=value, a3=count in bytes.
; whole 32BIT units are stored from a2 as is; the
; remaining bytes store the low 8BITs of a2.
; count<=0 stores nothing

function memory.set, p, v, n
  cmp a3, 0                     ; if n<=0
  ble .!                        ; nothing to do
  cmp a3, 4                     ; if n<4
  blt .8                        ; set 8BIT
  mov v2, a3, lsr 2             ; v2=n/4
  .32:                          ; set 32BIT
  str a2, [a1], 4               ; *a1++=a2
  subs v2, v2, 1                ; v2 # times
  bne .32
  ands a3, a3, 3                ; modulo 4
  beq .!                        ; remainder?
  .8:                           ; set 8BIT
  strb a2, [a1], 1              ; *a1++=a2
  subs a3, a3, 1                ; a3 # times
  bne .8
endf

; memory.set.b(destiny, value, count) - duplicate
; the low byte of value to all 4 bytes, then set

function memory.set.b, p, v, n
  and a2, a2, 0FFh              ; v&FFh
  orr a2, a2, lsl 8             ; v|(v<<8)
  orr a2, a2, lsl 16            ; v|(v<<16)
  memory.set
endf

; memory.set.w(destiny, value, count) - duplicate
; the low 16BITs of value to both halves, then set

function memory.set.w, p, v, n
  movri v1, 0FFFFh
  and a2, a2, v1                ; v&FFFFh
  orr a2, a2, lsl 16            ; v|(v<<16)
  memory.set
endf

macro memory.zero p, n { memory.set p, 0, n }

; $$$$$$$$$$$$$$$$$$ Z77 4 ARM $$$$$$$$$$$$$$$$$$$
; *************** STAR^2 SOFTWARE ****************
; ?????????????????? DRAW.INC ????????????????????

; fast ARM graphics rendering for any system:
; embedded, Raspberry PI, Windows Mobile, etc

; create BGR 15BPP (1.5.5.5), 0-31 each...
; in: a1/a2/a3=r/g/b. out: a1=r|(g<<5)|(b<<10)

function rgb, r, g, b
  and a1, a1, 11111b            ; r=(r&1Fh)
  and v2, a2, 11111b            ; g=(g&1Fh)
  and v3, a3, 11111b            ; b=(b&1Fh)
  orr a1, v2, lsl 5             ; c|=(g<<5)
  orr a1, v3, lsl 10            ; c|=(b<<10)
endf

; alpha combination. a1/a2 = a/b. a3/n=0-31
; each channel: d+(((s-d)*n)>>8), d from a, s from b.
; (s-d) may be negative, so the shift must be
; arithmetic (asr); with lsr the high bits of a
; negative product ended up in the color.
; mul is written Rd, Rm, Rs with Rd<>Rm, which
; ARMv4 (ARM7TDMI) requires

function mix, a, b, n
  mov a3, a3, lsl 3             ; convert n to 0-255
  mov v1, a1, lsr 10            ; db=(c1>>10)&11111b
  mov v2, a2, lsr 10            ; sb=(c2>>10)&11111b
  and v1, v1, 1Fh
  and v2, v2, 1Fh
  sub v2, v2, v1                ; (sb-db)
  mul v2, a3, v2                ; (sb-db)*n
  mov v2, v2, asr 8             ; (((sb-db)*n)>>8)+db
  add v3, v2, v1
  mov v1, a1, lsr 5             ; dg=(c1>>5)&11111b
  mov v2, a2, lsr 5             ; sg=(c2>>5)&11111b
  and v1, v1, 1Fh
  and v2, v2, 1Fh
  sub v2, v2, v1                ; (sg-dg)
  mul v2, a3, v2                ; (sg-dg)*n
  mov v2, v2, asr 8             ; (((sg-dg)*n)>>8)+dg
  add v4, v2, v1
  and v1, a1, 1Fh               ; dr=c1&11111b
  and v2, a2, 1Fh               ; sr=c2&11111b
  sub v2, v2, v1                ; (sr-dr)
  mul v2, a3, v2                ; (sr-dr)*n
  mov v2, v2, asr 8             ; (((sr-dr)*n)>>8)+dr
  add a1, v2, v1
  orr a1, v4, lsl 5             ; c=r|(g<<5)|(b<<10)
  orr a1, v3, lsl 10
endf

; shift, mask, scale, subtract then divide.
; return: a1=delta
; (((((b>>s)&m)<<8)-(((a>>s)&m)<<8))/n)
; NOTE(review): n=0 is passed straight to idiv

function delta8, a, b, s, n
  alias a=a1, b=a2, s=a3, n=a4, m=v1
  mov m, 11111b
  mov b, b, lsr s               ; ((b>>s)&m)<<8
  and b, b, m
  lsl b, b, 8
  mov a, a, lsr s               ; ((a>>s)&m)<<8
  and a, a, m
  lsl a, a, 8
  sub a, b, a                   ; (b-a)/n
  mov b, n
  idiv
endf

;;;;;;;;;;;;;;;;;;;; DRAWING ;;;;;;;;;;;;;;;;;;;;;

; load a1/a2 with x/y and, if w is given,
; a3/a4 with w/h

macro locate x, y, w, h {
  movri a1, x
  movri a2, y
  if ~w eq
    movri a3, w
    movri a4, h
  end if
}

; fill VRAM with screen.color, 2 pixels per store

function clear.screen
  movri v1, VRAM                ; video RAM
  movrmw v2, screen.color       ; color
  orr v2, v2, lsl 16            ; duplicate
  movri v3, (SCREEN.N/2)        ; # of pixels/2
  @@:
  str v2, [v1], 4               ; (u32) *vga++=c
  subs v3, v3, 1                ; w*h/2 # times
  bne @b
endf

; &vga[(y*(screen.w*2))+(x*2)]
; in: a1=x, a2=y. out: a1=address

function vga.xy, x, y
  movri v1, SCREEN.PITCH
  mul v2, a2, v1                ; v2=y*pitch+(x*2)
  add v2, v2, a1, lsl 1
  movri v1, VRAM                ; v1=vga
  add a1, v1, v2                ; return v1+v2
endf

; return the 16BIT color at x, y

function color.at, x, y
  vga.xy
  ldrh a1, [a1]                 ; return vga[x, y]
endf

; store g.color at x, y

function draw.pixel, x, y
  movrmw v1, g.color
  vga.xy
  strh v1, [a1]                 ; *vga=color
endf

; horizontal line of w pixels from x, y.
; w<=0 draws nothing (before, w=0 wrapped around)

function draw.line.h, x, y, w
  cmp a3, 0                     ; if w<=0
  ble .!                        ; nothing to draw
  movrmw v1, g.color            ; value=color
  mov v2, a3
  vga.xy
  @@:
  strh v1, [a1], 2              ; *vga++=c
  subs v2, v2, 1                ; w # times
  bne @b
endf

; vertical line of h pixels from x, y.
; h<=0 draws nothing; draw.outline passes h-1

function draw.line.v, x, y, h
  cmp a3, 0                     ; if h<=0
  ble .!                        ; nothing to draw
  movrmw v1, g.color
  movri v2, SCREEN.PITCH
  mov v3, a3
  vga.xy
  @@:
  strh v1, [a1]
  add a1, a1, v2                ; vga+pitch
  subs v3, v3, 1                ; h # times
  bne @b
endf

; copy w pixels from v7 to x, y, skipping pixels
; that have bit 15 set. v7 = pixels, set by caller

function draw.scanline, x, y, w
  mov v3, a3
  vga.xy
  @@:
  ldrh v4, [v7], 2              ; v7 = pixels
  tst v4, 8000h                 ; transparent?
  streqh v4, [a1]               ; store if opaque
  add a1, a1, 2
  subs v3, v3, 1
  bne @b
endf

; filled rectangle in g.color
; NOTE(review): w=0 or h=0 is not checked here

function draw.box, x, y, w, h
  mov v2, a3
  movri v3, SCREEN.PITCH
  mov v4, a4
  movrmw v5, g.color
  vga.xy
  .y:
  mov a3, v2
  mov v1, a1
  .x:
  strh v5, [v1], 2              ; *vga++=c
  subs a3, a3, 1                ; w # times
  bne .x
  add a1, a1, v3                ; vga+pitch
  subs v4, v4, 1                ; h # times
  bne .y
endf

; rectangle outline: 2 horizontal + 2 vertical
; lines. (a vga.xy call whose result was
; overwritten at once has been removed)

function draw.outline, x, y, w, h
  mov v1, a1
  mov v2, a2
  mov v3, a3
  mov v4, a4
  mov a1, v1                    ; top: x, y, w
  mov a2, v2
  mov a3, v3
  draw.line.h
  mov a1, v1                    ; left: x, y, h
  mov a2, v2
  mov a3, v4
  draw.line.v
  add a1, v1, v3                ; right: x+w-1, y+1, h-1
  sub a1, a1, 1
  add a2, v2, 1
  sub a3, v4, 1
  draw.line.v
  mov a1, v1                    ; bottom: x, y+h-1, w
  add a2, v2, v4
  sub a2, a2, 1
  mov a3, v3
  draw.line.h
endf

; draw gradual vertical fade. parameters:
; a1-a4 = x, y, w, h. v5-v6 = c1, c2
; r/g/b are kept as 8.8 fixed points and stepped
; by one delta per line. statement order is
; unchanged: v5-v8 are reused under several names

function draw.fade, x, y, w, h, c1, c2
  alias r=v1, g=v2, b=v3, rn=v4, gn=v5, bn=v6,\
   c1=v7, c2=v8, h=r12
  push a1-a4
  mov c1, v5                    ; save colors
  mov c2, v6
  mov h, a4                     ; height
  delta8 c1, c2, 0, r12         ; get deltas...
  mov rn, a1
  delta8 c1, c2, 5, r12
  mov gn, a1
  delta8 c1, c2, 10, r12
  mov bn, a1
  mov a1, c1
  get.rgb
  mov r, a1, lsl 8              ; extract and scale
  mov g, a2, lsl 8              ; r/g/b components
  mov b, a3, lsl 8
  pop a1-a4                     ; restore x/y/w/h
  push a3-a4                    ; save w/h
  vga.xy
  pop a3-a4                     ; x/y no longer needed
  .y:
  push a1-a4
  mov a1, r, lsr 8              ; r=(r>>8)
  mov a2, g, lsr 8              ; g=(g>>8)
  mov a3, b, lsr 8              ; b=(b>>8)
  rgb
  mov v7, a1                    ; v7=line color
  pop a1-a4
  mov v8, a3                    ; draw lines...
  .x:
  strh v7, [a1], 2              ; *vga++=c
  subs v8, v8, 1                ; w # times
  bne .x
  sub a1, a3, lsl 1             ; vga-(w*2)
  movri v7, SCREEN.PITCH
  add a1, a1, v7                ; vga+pitch
  add r, r, rn                  ; r/g/b+deltas
  add g, g, gn                  ; 8.8 fixed points
  add b, b, bn
  subs h, h, 1                  ; h # times
  bne .y
endf

; draw vertical center fade: c1 to c2
; then c2 to c1
; each half is h/2 lines (h>>1)

function draw.shade, x, y, w, h, c1, c2
  alias y=a2, h=a4, c1=v5, c2=v6, tmp=r12
  mov h, h, lsr 1               ; h/2
  pusha
  draw.fade
  popa
  add y, y, h                   ; move down
  mov tmp, c1                   ; exchange colors
  mov c1, c2
  mov c2, tmp
  draw.fade
endf

; $$$$$$$$$$$$$$$$$$ Z77 4 ARM $$$$$$$$$$$$$$$$$$$
; *************** STAR^2 SOFTWARE ****************
; ?????????????????? TEXT.INC ????????????????????

; fast portable ARM text operations. re-written
; from scratch. more efficient than X86 version

; "fast-call" convention. parameters: a1-a4/r0-r3

; text.n t           ; get # characters (size-1)
; text.copy a, b     ; standard copy with 0 after
; text.copy.n ...    ; copy with maximum size
; text.attach a, b   ; attach b to a; "concatenate"
; text.attach.c...   ; attach character
; text.compare a, b  ; compare. return <0>
; text.find t, c     ; search for c. return &/0
; text.find.last...  ; search for c reverse
; text.begins a, b   ; begins with b?
; text.ends a, b     ; ends with b?
; text.upper t       ; convert to uppercase
; text.lower t       ; convert to lowercase
; text.reverse t     ; reverse
; i2t n, t           ; number/text conversions
; u2t n, t
; h2t n, t
; b2t n, t

strlen  fix text.n
strcpy  fix text.copy
strncpy fix text.copy.n
strcat  fix text.attach
strcmp  fix text.compare
strchr  fix text.find
strrchr fix text.find.last
strrev  fix text.reverse

align 4

_hex: db '0123456789ABCDEF'

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; text.n(t) - get text length, # characters

function text.n, t
  mov v2, a1
  @@:
  ldrb v1, [a1], 1              ; v1=*a1++
  tst v1, v1                    ; until 0
  bne @b
  sub a1, a1, v2                ; return n (a1=a1-v2-1)
  sub a1, a1, 1
endf

; text.copy(a, b) - standard copy with 0 after.
; return advanced address (a1 is after the 0)

function text.copy, a, b
  @@:
  ldrb v1, [a2], 1              ; v1=*a2++
  strb v1, [a1], 1              ; *a1++=v1
  tst v1, v1                    ; until 0
  bne @b
endf

; text.copy.n(a, b, n) - copy with maximum
; size specified. writes n characters + 0 at
; most. when the 0 of b is copied, return at
; once (before, a second 0 was stored 1 byte
; after it)

function text.copy.n, a, b, n
  tst a3, a3                    ; n=0?
  beq .!
  @@:
  ldrb v1, [a2], 1              ; v1=*a2++
  strb v1, [a1], 1              ; *a1++=v1
  tst v1, v1                    ; until 0
  beq .!                        ; 0 was copied
  subs a3, a3, 1                ; or # times
  bne @b
  mov a3, 0                     ; maximum reached:
  strb a3, [a1]                 ; *a1=0
endf

; text.attach(a, b) - attach b to a

function text.attach, a, b
  @@:
  ldrb v1, [a1], 1              ; advance a1 to end
  tst v1, v1
  bne @b
  sub a1, a1, 1                 ; -1
  @@:
  ldrb v1, [a2], 1              ; v1=*a2++
  strb v1, [a1], 1              ; *a1++=v1
  tst v1, v1                    ; until 0
  bne @b
endf

; text.attach.c(t, c) - attach c to t

function text.attach.c, t, c
  @@:
  ldrb v1, [a1], 1              ; advance a1 to end
  tst v1, v1
  bne @b
  sub a1, a1, 1                 ; -1
  strb a2, [a1], 1              ; *a1++=a2
  mov a3, 0
  strb a3, [a1]                 ; *a1=0
endf

; text.compare(a, b) - lexical comparison.
; return difference of the first characters
; that differ, or 0 if equal

function text.compare, a, b
  @@:
  ldrb v1, [a1], 1              ; v1=*a1++
  ldrb v2, [a2], 1              ; v2=*a2++
  cmp v1, v2                    ; while equal
  bne @f
  orrs v1, v1, v2               ; and both nonzero
  bne @b
  @@:
  sub a1, v1, v2                ; return v1-v2
endf

; text.find(t, c) - search for character.
; return address or 0

function text.find, t, c
  @@:
  ldrb v1, [a1]                 ; v1=*a1
  cmp v1, a2                    ; if *t=c, found
  beq .!                        ; return address
  add a1, a1, 1                 ; a1++
  tst v1, v1                    ; until 0
  bne @b
  mov a1, 0                     ; not found
endf

; text.find.last(t, c) - search for c reverse.
; return address or 0. the range is tested
; before each load, so the first character is
; examined and nothing before t is read
; (before, t[0] was skipped and "" read t[-1])

function text.find.last, t, c
  mov v2, a1                    ; save start
  @@:
  ldrb v1, [a1], 1              ; advance to end-1
  tst v1, v1
  bne @b
  sub a1, a1, 2
  .scan:
  cmp a1, v2                    ; before beginning?
  blt .none
  ldrb v1, [a1]                 ; v1=*a1
  cmp v1, a2                    ; found
  beq .!                        ; return address
  sub a1, a1, 1                 ; a1--
  b .scan
  .none:
  mov a1, 0                     ; not found
endf

; text.begins(a, b) - a begins with b?
; return advanced a (nonzero) if yes, else 0.
; tst v1, v2 leaves the loop if either is 0;
; it also leaves if both are nonzero with no
; common bits, but then they differ and the
; test of *b below returns 0

function text.begins, a, b
  @@:
  ldrb v1, [a1]                 ; load *a/*b
  ldrb v2, [a2]
  tst v1, v2                    ; if either=0
  beq @f
  add a1, a1, 1                 ; while *a++=*b++
  add a2, a2, 1
  cmp v1, v2
  beq @b
  @@:                           ; end
  tst v2, v2                    ; *b must=0
  beq .!
  mov a1, 0                     ; no
endf

; text.ends(a, b) - a ends with b?
; return 1/0. if b is longer than a, return 0
; (before, a+n1-n2 then pointed before a)

function text.ends, a, b
  mov v1, a1                    ; save a1/a2
  mov v2, a2
  text.n                        ; n1=text.n(a)
  mov v3, a1
  mov a1, v2
  text.n                        ; n2=text.n(b)
  mov v4, a1
  cmp v3, v4                    ; if n1<n2
  movlt a1, 0                   ; return 0
  blt .!
  mov a1, v1
  add a1, a1, v3                ; a=a+n1-n2
  sub a1, a1, v4
  mov a2, v2
  text.compare                  ; a+n1-n2, b
  cmp a1, 0
  moveq a1, 1                   ; if 0, return 1
  movne a1, 0                   ; if !0, return 0
endf

; text.upper(t) - convert to uppercase

function text.upper, t
  @@:
  ldrb v1, [a1]                 ; c=*t
  cmp v1, 'a'                   ; is lowercase?
  blt .a
  cmp v1, 'z'                   ; c>='a' and c<='z'?
  bgt .a
  sub v1, v1, 32                ; c-32
  strb v1, [a1]                 ; *t=c
  .a:
  add a1, a1, 1                 ; t++
  tst v1, v1                    ; until 0
  bne @b
endf

; text.lower(t) - convert to lowercase

function text.lower, t
  @@:
  ldrb v1, [a1]                 ; c=*t
  cmp v1, 'A'                   ; is uppercase?
  blt .a
  cmp v1, 'Z'                   ; c>='A' and c<='Z'?
  bgt .a
  add v1, v1, 32                ; c+32
  strb v1, [a1]                 ; *t=c
  .a:
  add a1, a1, 1                 ; t++
  tst v1, v1                    ; until 0
  bne @b
endf

; text.reverse(t) - reverse text.
; the pointers are compared before each
; exchange, so "" is left alone (before, t[0]
; was exchanged with t[-1], losing the 0)

function text.reverse, t
  mov v4, a1
  @@:
  ldrb v1, [v4], 1              ; advance v4 to end
  tst v1, v1
  bne @b
  sub a2, v4, 2                 ; a2=end-2 (last)
  .swap:
  cmp a1, a2                    ; until they meet
  bge .!
  ldrb v1, [a1]                 ; exchange *a1++/*a2--
  ldrb v2, [a2]
  strb v2, [a1], 1
  strb v1, [a2], -1
  b .swap
endf

;;;;;;;;;;;;;;;;;; CONVERSIONS ;;;;;;;;;;;;;;;;;;;

; store the decimal digits of a1 at v2, lowest
; digit first. v2 is left after the last digit.
; changes a1-a4, v1.
; umull is written with Rm=a2: ARMv4 (ARM7TDMI)
; requires RdHi, RdLo and Rm to be different

macro convert.digits {
  @@:                           ; c=(n%10)+'0', n/10
  mov a3, a1                    ; dividend
  movri a2, 1999999Ah           ; ((2^32)/10)+1
  sub a1, a1, a1, lsr 30        ; a1=a1-(a1>>>30)
  umull a4, a1, a2, a1          ; a1=high word of a2*a1
  mov a2, a1, lsl 1             ; multiply by 10:
  add a2, a2, lsl 2             ; a2=a2+(a2<<2)
  sub a2, a3, a2                ; remainder
  add v1, a2, '0'               ; digit
  strb v1, [v2], 1              ; *t++=c
  tst a1, a1                    ; while not 0
  bne @b
}

; store 0 at v2, reverse the text at v3,
; return a1=v2 (address of the 0)

macro reverse.digits {
  mov v1, 0
  strb v1, [v2]                 ; *t=0
  mov a1, v3
  text.reverse
  mov a1, v2
}

; convert unsigned 32BIT integer to text

function u2t, n, t
  mov v1, a1
  mov v2, a2                    ; v2=t, advances
  mov v3, a2                    ; v3=start
  convert.digits
  reverse.digits
endf

; convert signed 32BIT integer to text

function i2t, n, t
  tst a1, 80000000h             ; negative?
  beq @f
  mov v1, '-'
  strb v1, [a2], 1              ; *t++='-'
  neg a1, a1                    ; n=-n
  @@:
  u2t
endf

; convert 32BIT hexadecimal number to text

function h2t, n, t
  mov v1, a1
  mov v2, a2                    ; v2=t, advances
  mov v3, a2                    ; v3=start
  movri v5, _hex
  @@:                           ; *t++=*(_hex+(n&(16-1)))
  and v4, v1, 15
  ldrb v4, [v5, v4]
  strb v4, [v2], 1              ; *t++=c
  lsrs v1, v1, 4                ; n/16
  bne @b
  reverse.digits
endf

; convert 32BIT binary number to text

function b2t, n, t
  mov v1, a1
  mov v2, a2                    ; v2=t, advances
  mov v3, a2                    ; v3=start
  @@:                           ; c=(n&1)+'0'
  and v4, v1, 1
  add v4, v4, '0'
  strb v4, [v2], 1              ; *t++=c
  lsrs v1, v1, 1                ; n/2
  bne @b
  reverse.digits
endf
|
|||
18 May 2013, 17:32 |
|
HaHaAnonymous 18 May 2013, 17:42
[ Post removed by author. ]
Last edited by HaHaAnonymous on 28 Feb 2015, 20:30; edited 1 time in total |
|||
18 May 2013, 17:42 |
|
edfed 18 May 2013, 18:24
nice
can this work (and how to run) on a HTC explorer? |
|||
18 May 2013, 18:24 |
|
TmX 19 May 2013, 02:36
Nice. Hopefully the next release of Z77 can run on Android natively
|
|||
19 May 2013, 02:36 |
|
revolution 19 May 2013, 02:50
uart777: I would suggest you use the processor and coprocessor directives to make sure that fasmarm doesn't generate opcodes not supported by the CPU.
|
|||
19 May 2013, 02:50 |
|
uart777 19 May 2013, 09:33
revolution: Which directive/s are needed for GBA? It has an ARM7TDMI CPU. What about Raspberry PI? ARM1176JZF-S.
MHajduk: ARM assembly is harder than Intel, but in many ways, ARM is a more powerful CPU and it can do more in one instruction. edfed: You can try running in any GBA emulator. Thinking about ordering a HTC Tilt 2 for Windows Mobile programming if I can find one on EBay for <=$20. |
|||
19 May 2013, 09:33 |
|
revolution 19 May 2013, 10:05
uart777 wrote: revolution: Which directive/s are needed for GBA? It has a ARM7TDMI CPU. ReadMe.txt wrote: For ARM7TDMI CPUs: uart777 wrote: What about Raspberry PI? ARM1176JZF-S. |
|||
19 May 2013, 10:05 |
|
uart777 19 May 2013, 11:34
Thanks revolution. Sorry, I overlooked that, although I did view your documentation and I was specifically looking for how to call Windows Mobile/CE functions from coredll.
Let me get this straight... To call a Windows Mobile/CE function, you send the first 4 parameters in r0-r3/a1-a4 then the remaining on the stack, right? In what order? For example, how exactly would I call CreateFileW from coredll? Example? GBA does not support the movw/movt method (>=ARMv6T2) of constructing immediate values, so I wrote my own "movri" (in CPU.INC), and it does not produce the pointless "orr 0"s seen in typical examples. Last edited by uart777 on 10 Aug 2013, 09:22; edited 2 times in total |
|||
19 May 2013, 11:34 |
|
revolution 19 May 2013, 11:44
For WinCE it follows the APS calling convention. This is explained in the file PROCAPS.INC
PROCAPS.INC wrote: ;High level procedure macros for APS (ARM Procedure Standard) calling |
|||
19 May 2013, 11:44 |
|
uart777 19 May 2013, 12:12
Thanks, again. That's what I thought. I have followed the ARM APS standard in my code.
Code:
; standard register names. lowest-level macros
; - CPU+LANGUAGE - shall use the names r0-r12
; while high-level functions - in the library -
; use a1-a4/v1-v8 to make a clear distinction
; between parameters and "scratch registers"
Code:
; create BGR 15BPP (1.5.5.5), 0-31 each...
; in: a1/a2/a3=r/g/b. out: a1=r|(g<<5)|(b<<10)

function rgb, r, g, b
  and a1, a1, 11111b            ; r=(r&1Fh)
  and v2, a2, 11111b            ; g=(g&1Fh)
  and v3, a3, 11111b            ; b=(b&1Fh)
  orr a1, v2, lsl 5             ; c|=(g<<5)
  orr a1, v3, lsl 10            ; c|=(b<<10)
endf

; alpha combination. a1/a2 = a/b. a3/n=0-31
; each channel: d+(((s-d)*n)>>8), d from a, s from b.
; (s-d) may be negative, so the shift must be
; arithmetic (asr); with lsr the high bits of a
; negative product ended up in the color.
; mul is written Rd, Rm, Rs with Rd<>Rm, which
; ARMv4 (ARM7TDMI) requires

function mix, a, b, n
  mov a3, a3, lsl 3             ; convert n to 0-255
  mov v1, a1, lsr 10            ; db=(c1>>10)&11111b
  mov v2, a2, lsr 10            ; sb=(c2>>10)&11111b
  and v1, v1, 1Fh
  and v2, v2, 1Fh
  sub v2, v2, v1                ; (sb-db)
  mul v2, a3, v2                ; (sb-db)*n
  mov v2, v2, asr 8             ; (((sb-db)*n)>>8)+db
  add v3, v2, v1
  mov v1, a1, lsr 5             ; dg=(c1>>5)&11111b
  mov v2, a2, lsr 5             ; sg=(c2>>5)&11111b
  and v1, v1, 1Fh
  and v2, v2, 1Fh
  sub v2, v2, v1                ; (sg-dg)
  mul v2, a3, v2                ; (sg-dg)*n
  mov v2, v2, asr 8             ; (((sg-dg)*n)>>8)+dg
  add v4, v2, v1
  and v1, a1, 1Fh               ; dr=c1&11111b
  and v2, a2, 1Fh               ; sr=c2&11111b
  sub v2, v2, v1                ; (sr-dr)
  mul v2, a3, v2                ; (sr-dr)*n
  mov v2, v2, asr 8             ; (((sr-dr)*n)>>8)+dr
  add a1, v2, v1
  orr a1, v4, lsl 5             ; c=r|(g<<5)|(b<<10)
  orr a1, v3, lsl 10
endf

; shift, mask, scale, subtract then divide.
; return: a1=delta
; (((((b>>s)&m)<<8)-(((a>>s)&m)<<8))/n)
; NOTE(review): "idiv" is not defined in this
; excerpt; n=0 is passed straight to it

function delta8, a, b, s, n
  alias a=a1, b=a2, s=a3, n=a4, m=v1
  mov m, 11111b
  mov b, b, lsr s               ; ((b>>s)&m)<<8
  and b, b, m
  lsl b, b, 8
  mov a, a, lsr s               ; ((a>>s)&m)<<8
  and a, a, m
  lsl a, a, 8
  sub a, b, a                   ; (b-a)/n
  mov b, n
  idiv
endf
|
|||
19 May 2013, 12:12 |
|
revolution 19 May 2013, 12:19
uart777 wrote: I have followed the ARM APS standard in my code. |
|||
19 May 2013, 12:19 |
|
MHajduk 19 May 2013, 12:20
uart777 wrote: Ok, let me shove some hardcore ARM ASM in the face of these newbies Grow up, pussies |
|||
19 May 2013, 12:20 |
|
uart777 19 May 2013, 20:16
revolution: Yes, "function" saves registers r4-r12+lr: stmfd sp!, { r4-r12, lr }.
Last edited by uart777 on 10 Aug 2013, 09:23; edited 1 time in total |
|||
19 May 2013, 20:16 |
|
revolution 20 May 2013, 00:36
uart777 wrote: revolution: Yes, "function" saves registers r4-r12+lr: stmfd sp!, { r4-r12, lr }. |
|||
20 May 2013, 00:36 |
|
uart777 20 May 2013, 06:52
revolution: I thought about this, too, but the default "function" saves all 'v' registers just to be safe. Safety is more important than optimization. Who cares how fast a program is if it doesn't work? I use ASM primarily for knowledge. Drawing requires 1,000s of times the speed compared to the call overhead, so reducing it won't make a difference. 97%+ of the time is spent drawing.
Another thing to consider is that "let" should perform ARM specific optimizations. For example, "let" should detect/match sequences involving the barrel shifter like this: "let r1<<r2, r0+r1" and replace it with: "add r0, r1, lsl r2". |
|||
20 May 2013, 06:52 |
|
< Last Thread | Next Thread > |
Forum Rules:
|
Copyright © 1999-2024, Tomasz Grysztar. Also on GitHub, YouTube.
Website powered by rwasa.