I built this code to transform an image memory into input for an AI model.
The code generates the DLL. If I look at the DLL with DLLEXP, everything is fine. But
when I try to load it with LoadLibrary, it returns 0, which means it's failing.
I load a lot of libraries this way, and I don't see why it's not working.
There's something I'm missing...this is the first time I've made a DLL with fasm.
This is the code
format PE64 DLL
entry DllMain
include 'include/win64a.inc'
;--------------------------------------------------------------------
section '.code' code readable writeable
proc DllMain hinstDLL,fdwReason,lpvReserved
mov rax, TRUE
ret
endp
; Convención de llamada Microsoft x64 Fastcall
; rcx: const int32_t* input
; rdx: float* output
Int32ToFloat32RGB:
push rbp
mov rbp, rsp
sub rsp, 48 ; Reservar espacio para la "home" del registro y la alineación de 16 bytes
mov r8, rdx
add r8, 409600 * 4 ; r8 = puntero para g_channel
mov r9, rdx
add r9, 2 * 409600 * 4 ; r9 = puntero para b_channel
vmovups ymm2, [mask_ff]
vmovups ymm3, [inv_255_f32]
; Bucle principal, procesa 8 valores a la vez (409600 / 8 = 51200 iteraciones)
mov rcx, 51200
.loop:
vmovups ymm0, [rcx]
; --- Canal R ---
vpand ymm1, ymm0, ymm2
vcvtudq2ps ymm1, ymm1
vmulps ymm1, ymm1, ymm3
vmovups [rdx], ymm1
; --- Canal G ---
vpsrlw ymm4, ymm0, 8
vpand ymm1, ymm4, ymm2
vcvtudq2ps ymm1, ymm1
vmulps ymm1, ymm1, ymm3
vmovups [r8], ymm1
; --- Canal B ---
vpsrlw ymm4, ymm0, 16
vpand ymm1, ymm4, ymm2
vcvtudq2ps ymm1, ymm1
vmulps ymm1, ymm1, ymm3
vmovups [r9], ymm1
add rdx, 32
add r8, 32
add r9, 32
sub rcx, 32 ; 8 * 4 bytes
jnz .loop
add rsp, 48
pop rbp
ret
;--------------------------------------------------------------------
section '.data' data readable writeable
align 32
mask_ff: dd 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF
inv_255_f32: dd 0x3d7fbe77, 0x3d7fbe77, 0x3d7fbe77, 0x3d7fbe77, 0x3d7fbe77, 0x3d7fbe77, 0x3d7fbe77, 0x3d7fbe77
section '.edata' export data readable
export 'memavx.dll',\
Int32ToFloat32RGB, 'Int32ToFloat32RGB'
section '.reloc' fixups data readable discardable
if $=$$
dd 0,8
end if