Last active
January 1, 2022 16:59
-
-
Save iamahuman/ecd1ec772f8e5ec01ed8adabec6bd794 to your computer and use it in GitHub Desktop.
openat() tracer
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#define _GNU_SOURCE | |
#define UNW_LOCAL_ONLY | |
#include <stddef.h> | |
#include <string.h> | |
#include <stdlib.h> | |
#include <stdio.h> | |
#include <limits.h> | |
#include <unistd.h> | |
#include <errno.h> | |
#include <signal.h> | |
#include <linux/audit.h> | |
#include <linux/filter.h> | |
#include <linux/seccomp.h> | |
#include <linux/futex.h> | |
#include <sys/time.h> | |
#include <sys/prctl.h> | |
#include <sys/uio.h> | |
#include <sys/syscall.h> | |
#include <sys/mman.h> | |
#include <ucontext.h> | |
#include <libunwind.h> | |
#include <pthread.h> | |
#include <assert.h> | |
/* Number of elements of a statically-sized array (never use on a pointer). */
#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(*(arr)))
/* Recover a pointer to the enclosing `type` from a pointer to its `member`. */
#define container_of(ptr, type, member) ((type *)((unsigned char *)(ptr) - offsetof(type, member)))
/* handle_sigsys() passes the signal handler's ucontext_t directly to
 * libunwind (built with UNW_LOCAL_ONLY); that is only valid if the two
 * context types are layout-compatible, which this pins at compile time. */
_Static_assert(__builtin_types_compatible_p(unw_context_t, ucontext_t),
	       "unw_context_t is not equivalent to ucontext_t");
/* Write "<prefix><message>\n" to stderr with a single writev() call (no
 * stdio, so it stays async-signal-safe), then abort().  If the "dbg"
 * environment variable is set, break into the debugger first. */
static void abort_with_error(const char *prefix, size_t prefix_len, const char *message)
{
	struct iovec parts[3];
	parts[0].iov_base = (void *)prefix;
	parts[0].iov_len = prefix_len;
	parts[1].iov_base = (void *)message;
	parts[1].iov_len = strlen(message);
	parts[2].iov_base = (void *)"\n";
	parts[2].iov_len = 1;
	if (getenv("dbg"))
		__asm__ __volatile__("int $3" ::: "memory");
	writev(STDERR_FILENO, parts, ARRAY_SIZE(parts));
	abort();
}
#define FAIL(prefix, message) abort_with_error((prefix), strlen(prefix), (message)) | |
/* Map a nibble value 0..15 to its lowercase hexadecimal digit character. */
static char hexdigit(unsigned int value)
{
	if (value > 9)
		return (char)('a' + (value - 10));
	return (char)('0' + value);
}
/* Print one backtrace line to stderr: "\t<name>+0x<offset>\n", or just
 * "\t0x<offset>\n" when no symbol name is available.  Uses only a stack
 * buffer, hexdigit() and writev(), so it remains async-signal-safe
 * (no stdio, no heap allocation). */
static void print_backtrace_entry(const char *name, unsigned long offset)
{
	char buffer[32], *ptr, *endptr;
	unsigned long num;
	struct iovec vecs[3] = {
		{ .iov_base = (void *)"\t", .iov_len = 1 },
	};
	/* Build "[+]0x<hex>\n" backwards from the end of the buffer; 32 bytes
	 * comfortably fits '+', "0x", 16 hex digits and '\n'. */
	ptr = endptr = buffer + ARRAY_SIZE(buffer);
	*--ptr = '\n';
	num = offset;
	do *--ptr = hexdigit(num & 0xf);
	while ((num >>= 4) != 0);
	ptr -= 2;
	memcpy(ptr, "0x", 2);
	if (name) {
		/* Symbol available: emit "name" then "+0x<offset>". */
		*--ptr = '+';
		vecs[1].iov_base = (void *)name;
		vecs[1].iov_len = strlen(name);
	}
	vecs[2].iov_base = (void *)ptr;
	vecs[2].iov_len = endptr - ptr;
	writev(STDERR_FILENO, vecs, ARRAY_SIZE(vecs));
}
/* Per-thread re-entrancy flag for handle_sigsys(): non-NULL while the
 * handler is already tracing, so a nested SIGSYS skips straight to the
 * plain syscall passthrough instead of recursing into the tracer. */
static pthread_key_t nested_signal_key;
/* Hard-coded byte offsets of saved registers inside ucontext_t.uc_mcontext.
 * The hand-written assembly stubs below address the context with these
 * constants, so each one is pinned against the real glibc/x86-64 layout by
 * a _Static_assert: a layout change fails the build instead of silently
 * corrupting registers at run time. */
#define MCTX_REG_R8 40
_Static_assert(offsetof(ucontext_t, uc_mcontext.gregs[REG_R8 ]) == MCTX_REG_R8);
#define MCTX_REG_R9 48
_Static_assert(offsetof(ucontext_t, uc_mcontext.gregs[REG_R9 ]) == MCTX_REG_R9);
#define MCTX_REG_R10 56
_Static_assert(offsetof(ucontext_t, uc_mcontext.gregs[REG_R10]) == MCTX_REG_R10);
#define MCTX_REG_R11 64
_Static_assert(offsetof(ucontext_t, uc_mcontext.gregs[REG_R11]) == MCTX_REG_R11);
#define MCTX_REG_R12 72
_Static_assert(offsetof(ucontext_t, uc_mcontext.gregs[REG_R12]) == MCTX_REG_R12);
#define MCTX_REG_R13 80
_Static_assert(offsetof(ucontext_t, uc_mcontext.gregs[REG_R13]) == MCTX_REG_R13);
#define MCTX_REG_R14 88
_Static_assert(offsetof(ucontext_t, uc_mcontext.gregs[REG_R14]) == MCTX_REG_R14);
#define MCTX_REG_R15 96
_Static_assert(offsetof(ucontext_t, uc_mcontext.gregs[REG_R15]) == MCTX_REG_R15);
#define MCTX_REG_RDI 104
_Static_assert(offsetof(ucontext_t, uc_mcontext.gregs[REG_RDI]) == MCTX_REG_RDI);
#define MCTX_REG_RSI 112
_Static_assert(offsetof(ucontext_t, uc_mcontext.gregs[REG_RSI]) == MCTX_REG_RSI);
#define MCTX_REG_RBP 120
_Static_assert(offsetof(ucontext_t, uc_mcontext.gregs[REG_RBP]) == MCTX_REG_RBP);
#define MCTX_REG_RBX 128
_Static_assert(offsetof(ucontext_t, uc_mcontext.gregs[REG_RBX]) == MCTX_REG_RBX);
#define MCTX_REG_RDX 136
_Static_assert(offsetof(ucontext_t, uc_mcontext.gregs[REG_RDX]) == MCTX_REG_RDX);
#define MCTX_REG_RAX 144
_Static_assert(offsetof(ucontext_t, uc_mcontext.gregs[REG_RAX]) == MCTX_REG_RAX);
#define MCTX_REG_RCX 152
_Static_assert(offsetof(ucontext_t, uc_mcontext.gregs[REG_RCX]) == MCTX_REG_RCX);
#define MCTX_REG_RSP 160
_Static_assert(offsetof(ucontext_t, uc_mcontext.gregs[REG_RSP]) == MCTX_REG_RSP);
#define MCTX_REG_RIP 168
_Static_assert(offsetof(ucontext_t, uc_mcontext.gregs[REG_RIP]) == MCTX_REG_RIP);
#define STRINGIFY(x) #x
/* Emit DWARF CFI directives only when the toolchain itself generates CFI;
 * otherwise the .cfi_* pseudo-ops could clash with compiler-emitted ones. */
#if defined(__GCC_HAVE_DWARF2_CFI_ASM) || defined(__clang__)
#define ASM_CFI(str) str
#else
#define ASM_CFI(str)
#endif
/* Apply fn(reg, offset) over the six callee-saved GPRs, in push order (FWD)
 * or pop order (REV).  Note FOREACH6_REV pairs each register with the
 * PREVIOUS offset: after a pop the CFA offset has shrunk by one 8-byte
 * slot, and `oi` is the offset left once the final register is popped. */
#define FOREACH6_FWD(fn, oi, r0, o0, r1, o1, r2, o2, r3, o3, r4, o4, r5, o5) fn(r0, o0) fn(r1, o1) fn(r2, o2) fn(r3, o3) fn(r4, o4) fn(r5, o5)
#define FOREACH6_REV(fn, oi, r0, o0, r1, o1, r2, o2, r3, o3, r4, o4, r5, o5) fn(r5, o4) fn(r4, o3) fn(r3, o2) fn(r2, o1) fn(r1, o0) fn(r0, oi)
/* The callee-saved registers with their CFA offsets after each push
 * ("8" = only the return address on the stack, "56" after all six). */
#define APPLY_CALLEE_SAVE(foreach, fn) foreach(fn, "8", "r15", "16", "r14", "24", "r13", "32", "r12", "40", "rbp", "48", "rbx", "56")
/* Syscall argument registers, split so %rdi (holding the ucontext pointer
 * inside the stub) can be loaded last and stored first. */
#define FOR_EACH_REG_PART1(f) \
	f(MCTX_REG_R8 , "%r8" ) f(MCTX_REG_R9 , "%r9" ) f(MCTX_REG_R10, "%r10") \
	f(MCTX_REG_R12, "%r12") f(MCTX_REG_R13, "%r13") f(MCTX_REG_R14, "%r14") \
	f(MCTX_REG_R15, "%r15")
#define FOR_EACH_REG_PART2(f) f(MCTX_REG_RDI, "%rdi")
#define FOR_EACH_REG_PART3(f) \
	f(MCTX_REG_RSI, "%rsi") f(MCTX_REG_RBP, "%rbp") \
	f(MCTX_REG_RBX, "%rbx") f(MCTX_REG_RDX, "%rdx")
/* Load a register from the saved context (*%rdi) / store it back (*%rcx). */
#define UCTX_LOAD_REG(x, y) "movq " STRINGIFY(x) "(%rdi), " y "\n\t"
#define UCTX_STORE_REG(x, y) "movq " y ", " STRINGIFY(x) "(%rcx)\n\t"
#define PUSH_REG(reg, offset) "pushq %" reg "\n\t" ASM_CFI(".cfi_def_cfa_offset " offset "\n\t.cfi_offset " reg ", -" offset "\n\t")
/* Emit only the unwind annotation for a push that a previous FDE performed. */
#define PUSH_REG_EH_ONLY(reg, offset) ASM_CFI(".cfi_offset " reg ", -" offset "\n\t")
#define POP_REG(reg, offset) "popq %" reg "\n\t" ASM_CFI(".cfi_restore " reg "\n\t.cfi_def_cfa_offset " offset "\n\t")
/* Mark indirect-branch targets when built with CET (-fcf-protection). */
#if defined(__CET__) && ((__CET__) & 1)
#define BRANCH_TARGET_MARKER() "endbr64\n\t"
#else
#define BRANCH_TARGET_MARKER()
#endif
/* Stub prologue: save callee-saved registers and the context pointer, and
 * move the syscall number (second C argument) into %rax. */
#define SYSCALL_STUB_PROLOGUE() \
	ASM_CFI(".cfi_startproc\n\t") \
	BRANCH_TARGET_MARKER() \
	APPLY_CALLEE_SAVE(FOREACH6_FWD, PUSH_REG) \
	PUSH_REG("rdi", "64") \
	"movq %rsi, %rax" \
	ASM_CFI("\n\t.cfi_register rsi, rax")
/* Load every argument register from the saved context, then trap into the
 * kernel.  The `syscall` instruction is the seccomp-exempt address. */
#define SYSCALL_STUB_BODY() \
	FOR_EACH_REG_PART1(UCTX_LOAD_REG) \
	FOR_EACH_REG_PART3(UCTX_LOAD_REG) \
	FOR_EACH_REG_PART2(UCTX_LOAD_REG) \
	"syscall"
/* Stub epilogue: write the (possibly clobbered) registers and the %rax
 * result back into the context, restore callee-saved registers, return. */
#define SYSCALL_STUB_EPILOGUE() \
	"popq %rcx\n\t" \
	ASM_CFI(".cfi_register rdi, rcx\n\t") \
	ASM_CFI(".cfi_adjust_cfa_offset -8\n\t") \
	FOR_EACH_REG_PART1(UCTX_STORE_REG) \
	FOR_EACH_REG_PART2(UCTX_STORE_REG) \
	FOR_EACH_REG_PART3(UCTX_STORE_REG) \
	UCTX_STORE_REG(MCTX_REG_RAX, "%rax") \
	APPLY_CALLEE_SAVE(FOREACH6_REV, POP_REG) \
	"ret" \
	ASM_CFI("\n\t.cfi_endproc")
/* Re-issue the syscall recorded in *uc, tolerating clone()-style calls in
 * which the child resumes on a NEW stack: the child detects the stack
 * switch (current %rsp differs from the value stashed in %xmm1) and jumps
 * directly back to the trapped instruction pointer instead of unwinding a
 * stub frame it does not have.  Returns the syscall result (also written
 * into the context by the epilogue). */
__attribute__((visibility("hidden")))
extern unsigned long execute_syscall_clone(ucontext_t *uc, unsigned long orig_syscall);
/* Address of this stub's `syscall` instruction; whitelisted by the BPF
 * filter so the tracer's own re-issues are not trapped again. */
__attribute__((visibility("hidden")))
extern void seccomp_exempt_address_clone(void);
__attribute__((visibility("hidden")))
extern void syscall_return_clone(void);
__asm__(
	".pushsection \".text\", \"ax\"\n\t"
	".type execute_syscall_clone, @function\n"
	"execute_syscall_clone:\n\t"
	SYSCALL_STUB_PROLOGUE() "\n\t"
	/* Stash the trapped RIP and current RSP in xmm registers: they
	 * survive the syscall without touching the stack, which the clone
	 * child may not share. */
	UCTX_LOAD_REG(MCTX_REG_RIP, "%xmm0")
	"movq %rsp, %xmm1\n\t"
	SYSCALL_STUB_BODY() "\n\t"
	ASM_CFI(".cfi_endproc\n") /* first FDE ends here: RSP may change if using clone() */
	"seccomp_exempt_address_clone:\n\t"
	"movq %xmm1, %rcx\n\t"
	"cmpq %rcx, %rsp\n\t"
	"je syscall_return_normal\n\t"
	/* Stack changed: we are the clone child; resume at the trapped RIP. */
	"movq %xmm0, %rcx\n"
	"syscall_return_clone:\n\t"
	ASM_CFI(".cfi_startproc simple\n\t") /* disable default frame instructions */
	ASM_CFI(".cfi_def_cfa rsp, 0\n\t") /* nothing pushed on stack */
	ASM_CFI(".cfi_return_column rcx\n\t") /* pop-less return */
	"jmp *%rcx\n"
	ASM_CFI("\t.cfi_endproc\n")
	"syscall_return_normal:\n\t"
	ASM_CFI(".cfi_startproc simple\n\t") /* second FDE starts here: RSP has been verified */
	ASM_CFI(".cfi_def_cfa rsp, 64\n\t")
	ASM_CFI(".cfi_offset rip, -8\n\t")
	APPLY_CALLEE_SAVE(FOREACH6_FWD, PUSH_REG_EH_ONLY)
	PUSH_REG_EH_ONLY("rdi", "64")
	SYSCALL_STUB_EPILOGUE() "\n\t"
	".size execute_syscall_clone, .-execute_syscall_clone\n\t"
	".popsection"
);
/* Simpler re-issue stub for every syscall that cannot switch stacks: same
 * save/load/syscall/store dance as the clone variant, but control always
 * returns on the same stack, so no stack-switch detection is needed. */
__attribute__((visibility("hidden")))
extern unsigned long execute_syscall_noclone(ucontext_t *uc, unsigned long orig_syscall);
/* Address of this stub's `syscall` instruction (BPF-whitelisted). */
__attribute__((visibility("hidden")))
extern void seccomp_exempt_address_noclone(void);
__asm__(
	".pushsection \".text\", \"ax\"\n\t"
	".type execute_syscall_noclone, @function\n"
	"execute_syscall_noclone:\n\t"
	SYSCALL_STUB_PROLOGUE() "\n\t"
	SYSCALL_STUB_BODY() "\n"
	"seccomp_exempt_address_noclone:\n\t"
	SYSCALL_STUB_EPILOGUE() "\n\t"
	".size execute_syscall_noclone, .-execute_syscall_noclone\n\t"
	".popsection"
);
/* Resume execution from a saved ucontext by faking a signal-return frame:
 * point %rsp at the context and jump through the sigreturn trampoline
 * address stored just below it.  Used by the fork() child, whose request
 * was executed on the helper thread, to continue in the forker's context. */
__attribute__((visibility("hidden"), noreturn))
extern void restore_context_trampoline(void *ucontext);
__asm__(
	".pushsection \".text\", \"ax\"\n\t"
	".type restore_context_trampoline, @function\n"
	"restore_context_trampoline:\n\t"
	ASM_CFI(".cfi_startproc\n\t")
	BRANCH_TARGET_MARKER()
	"mov %rdi, %rsp\n\t" /* exploit red zone guarantee of x86-64 ABI */
	ASM_CFI(".cfi_endproc\n\t") /* switching stacks: split FDE */
	ASM_CFI(".cfi_startproc simple\n\t") /* disable default frame insns */
	ASM_CFI(".cfi_def_cfa rsp, 0\n\t") /* retaddr not above rsp */
	ASM_CFI(".cfi_offset rip, -8\n\t") /* retaddr = &restore_rt (glibc) */
	"jmpq *-8(%rsp)\n\t" /* don't use "ret" (bypass shadow stack) */
	ASM_CFI(".cfi_endproc\n\t")
	".size restore_context_trampoline, .-restore_context_trampoline\n\t"
	".popsection"
);
/* One pending request in the queue.  `next`/`next_set` are written by the
 * producer and futex-waited on by the consumer; `done` flows the other way. */
struct queue_item
{
	struct queue_item *next;  /* successor; valid only once next_set != 0 */
	unsigned int next_set;    /* futex word: producer published `next` */
	unsigned int done;        /* futex word: consumer finished this item */
};
/* Multi-producer single-consumer queue: `tail` is the atomic-exchange/CAS
 * point for producers, `head` is a permanent dummy node the consumer follows. */
struct lockfree_queue
{
	struct queue_item *tail;
	struct queue_item head;
};
/* A syscall to be executed on the helper thread, linked via queue_item. */
struct syscall_request
{
	struct queue_item queue_item;
	unsigned long nr;  /* syscall number */
	ucontext_t *uc;    /* trapped context carrying arguments and result */
};
/* An empty queue's tail points at its own dummy head node. */
#define DEFINE_LOCKFREE_QUEUE(name) struct lockfree_queue name = { &name.head }
static DEFINE_LOCKFREE_QUEUE(noseccomp_syscall_queue);
/* Helper thread, created in main_ctor() before the filter is installed. */
static pthread_t queue_thread;
/* Append `item` to the queue and block until the consumer marks it done.
 * Safe to call from a signal handler: only atomics and raw futex syscalls,
 * no locks or allocation. */
static void enqueue_and_wait(struct lockfree_queue *queue, struct queue_item *item)
{
	struct queue_item *prev;
	/* Atomically claim the tail slot, then publish the link from the old
	 * tail and wake any consumer waiting on it. */
	prev = __atomic_exchange_n(&queue->tail, item, __ATOMIC_ACQ_REL);
	prev->next = item;
	__atomic_store_n(&prev->next_set, 1, __ATOMIC_RELEASE);
	syscall(__NR_futex, &prev->next_set, FUTEX_WAKE, INT_MAX, 0, 0, 0);
	/* Sleep until the consumer has executed the request and set `done`. */
	while (!__atomic_load_n(&item->done, __ATOMIC_ACQUIRE))
		syscall(__NR_futex, &item->done, FUTEX_WAIT, 0, 0, 0, 0);
}
static void issue_syscall_request(struct lockfree_queue *queue, int nr, ucontext_t *uc) | |
{ | |
struct syscall_request req = { { NULL }, nr, uc }; | |
enqueue_and_wait(queue, &req.queue_item); | |
} | |
static int next_queue_item_internal(struct lockfree_queue *queue, struct queue_item **cursor, struct queue_item *curr, int flag, const struct timespec *timeout) | |
{ | |
while (!__atomic_load_n(&curr->next_set, __ATOMIC_ACQUIRE)) | |
syscall(__NR_futex, &curr->next_set, FUTEX_WAIT | flag, (unsigned long)timeout, 0, 0, 0); | |
*cursor = curr->next; | |
return 0; | |
} | |
static int next_queue_item(struct lockfree_queue *queue, struct queue_item **cursor, int flag, const struct timespec *timeout) | |
{ | |
if (*cursor != NULL) | |
return 0; | |
return next_queue_item_internal(queue, cursor, &queue->head, flag, timeout); | |
} | |
/* Consume the item at *cursor: detach it from the queue, advance the cursor
 * (blocking for the successor if the queue is non-empty), then mark the old
 * item done and wake its producer.  The successor MUST be fetched before
 * `done` is signalled, because the producer's stack frame that holds the
 * item is reclaimed as soon as it observes done != 0. */
static int ack_queue_item(struct lockfree_queue *queue, struct queue_item **cursor, int flag, const struct timespec *timeout)
{
	struct queue_item *curr = *cursor, *expect;
	int result = 0;
	if (curr == NULL) /* no item */
		return 0;
	/* Reset the dummy head so the next producer can link onto it. */
	queue->head.next = NULL;
	__atomic_store_n(&queue->head.next_set, 0, __ATOMIC_RELEASE);
	expect = curr;
	/* If curr is still the tail, swing the tail back to the dummy head
	 * (queue becomes empty); otherwise another producer has appended. */
	if (!__atomic_compare_exchange_n(&queue->tail, &expect, &queue->head, 0, __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE)) {
		assert(expect != &queue->head);
		/* Queue is nonempty; get next item before releasing the current one */
		result = next_queue_item_internal(queue, cursor, curr, flag, timeout);
	} else {
		/* Queue is empty */
		*cursor = NULL;
	}
	__atomic_store_n(&curr->done, 1, __ATOMIC_RELEASE);
	syscall(__NR_futex, &curr->done, FUTEX_WAKE, INT_MAX, 0, 0, 0);
	return result;
}
/* Byte-wise OR: dst[i] = src1[i] | src2[i] for `len` bytes.  Returns dst
 * (memcpy-style).  Used to merge signal masks of arbitrary kernel size. */
static void *membitor(void *dst, const void *src1, const void *src2, size_t len)
{
	unsigned char *out = dst;
	const unsigned char *x = src1;
	const unsigned char *y = src2;
	while (len--)
		*out++ = (unsigned char)(*x++ | *y++);
	return dst;
}
/* Byte-wise AND-NOT: dst[i] = src1[i] & ~src2[i] for `len` bytes — clears
 * in src1 every bit set in src2.  Returns dst.  Used for SIG_UNBLOCK. */
static void *membitandn(void *dst, const void *src1, const void *src2, size_t len)
{
	unsigned char *out = dst;
	const unsigned char *x = src1;
	const unsigned char *y = src2;
	size_t idx;
	for (idx = 0; idx != len; ++idx)
		out[idx] = (unsigned char)(x[idx] & (unsigned char)~y[idx]);
	return dst;
}
/* Emit a string literal to stderr.  Wrapped in do { } while (0) so the macro
 * expands to exactly one statement: the original ended in a stray ';', which
 * made "MESSAGE(x);" two statements and would break an unbraced if/else. */
#define MESSAGE(x) do { write(STDERR_FILENO, (x), sizeof(x) - 1); } while (0)
/* Return nonzero iff the byte ranges [a, a+an) and [b, b+bn) intersect.
 * Fixed: the second comparison was `bp + bn > an`, testing an address
 * against a *length* instead of against the address `ap`.  Because any
 * real pointer value dwarfs a small length, that condition was almost
 * always true, so a range lying entirely AFTER `b` was misreported as
 * overlapping.  Correct test: ap < bp+bn && bp < ap+an. */
static int mem_overlaps(const void *a, size_t an, const void *b, size_t bn)
{
	unsigned long ap = (unsigned long)a;
	unsigned long bp = (unsigned long)b;
	return ap + an > bp && bp + bn > ap;
}
/* Emulate (rt_)sigprocmask so the traced program can never block SIGSYS,
 * which the tracer needs deliverable at all times.  The mask the program
 * *asked for* is tracked in uc->uc_sigmask (which the kernel restores at
 * sigreturn), while the mask actually installed always has SIGSYS clear. */
static void handle_sigprocmask(unsigned int orig_syscall, ucontext_t *uc)
{
	unsigned long ret;
	unsigned long how = uc->uc_mcontext.gregs[REG_RDI];
	void *set = (void *)uc->uc_mcontext.gregs[REG_RSI];
	void *oldset = (void *)uc->uc_mcontext.gregs[REG_RDX];
	size_t sigsetsize = (size_t)uc->uc_mcontext.gregs[REG_R10];
	sigset_t newset;
	int err;
	if (orig_syscall != __NR_rt_sigprocmask) {
		sigsetsize = sizeof(unsigned long); /* old_sigset_t */
	} else if (sigsetsize > sizeof(newset)) {
		/* Larger than our local buffer: reject as the kernel would. */
		uc->uc_mcontext.gregs[REG_RAX] = -EINVAL;
		return;
	}
	/* do a test run to catch -EFAULT etc. first try (NULL oldset) */
	if (set && oldset && mem_overlaps(set, sigsetsize, oldset, sigsetsize)) {
		/* Overlapping buffers: probe `set` alone first (harmless
		 * SIG_BLOCK of the current mask) so a fault surfaces before
		 * we clobber the overlap by writing oldset. */
		uc->uc_mcontext.gregs[REG_RDI] = SIG_BLOCK;
		uc->uc_mcontext.gregs[REG_RDX] = 0;
		ret = execute_syscall_noclone(uc, orig_syscall);
		if ((long)ret < 0)
			return;
		uc->uc_mcontext.gregs[REG_RDI] = how;
		uc->uc_mcontext.gregs[REG_RDX] = (unsigned long)oldset;
	}
	err = 0;
	if (set) {
		/* Compute the mask the caller intended, relative to the mask
		 * it believes is current (uc->uc_sigmask). */
		switch ((int)how) {
		case SIG_BLOCK:
			membitor(&newset, &uc->uc_sigmask, set, sigsetsize);
			break;
		case SIG_UNBLOCK:
			membitandn(&newset, &uc->uc_sigmask, set, sigsetsize);
			break;
		case SIG_SETMASK:
			memcpy(&newset, set, sigsetsize);
			break;
		default:
			err = -EINVAL;
			break;
		}
		/* Always unblock SIGSYS */
		sigdelset(&newset, SIGSYS);
	}
	uc->uc_mcontext.gregs[REG_RAX] = err;
	/* do a test run to catch -EFAULT etc. second try (full) */
	ret = execute_syscall_noclone(uc, orig_syscall);
	if ((long)ret < 0)
		return;
	/* Report the mask the program believes it had, not the kernel's;
	 * then record the requested mask for sigreturn to install. */
	if (oldset)
		memcpy(oldset, &uc->uc_sigmask, sigsetsize);
	if (set)
		memcpy(&uc->uc_sigmask, &newset, sigsetsize);
}
static unsigned long execute_syscall(ucontext_t *uc, unsigned long orig_syscall) | |
{ | |
if ((orig_syscall == __NR_clone && uc->uc_mcontext.gregs[REG_RSI]) || | |
orig_syscall == __NR_clone3) | |
return execute_syscall_clone(uc, orig_syscall); | |
return execute_syscall_noclone(uc, orig_syscall); | |
} | |
/* SIGSYS handler for SECCOMP_RET_TRAP.  Routing: clone()-without-CLONE_VM,
 * fork(), execve(), execveat() are forwarded to the unfiltered helper
 * thread; rt_sigprocmask() is emulated so SIGSYS stays unblockable; every
 * other trapped syscall (openat() in practice) gets a libunwind backtrace
 * printed to stderr and is then re-executed via the exempt stubs.
 * Fixes vs. original: the post-loop FAIL message wrongly blamed
 * unw_init_local2 for an unw_step() failure, and its `result > 0 ?
 * -result : result` ternary was dead code (result is < 0 there). */
void handle_sigsys(int sig, siginfo_t *siginfo, void *ucontext)
{
	unw_context_t *ctx;
	unw_cursor_t cursor;
	unw_word_t offset;
	ucontext_t *uc = ucontext;
	int result;
	char funcname[1024];
	ctx = ucontext; /* valid: unw_context_t == ucontext_t (asserted above) */
	/* Nested trap while already tracing: execute without a backtrace. */
	if (pthread_getspecific(nested_signal_key))
		goto syscall_passthru;
	switch (siginfo->si_syscall)
	{
	case __NR_clone:
		if (uc->uc_mcontext.gregs[REG_RDI] & CLONE_VFORK) {
			uc->uc_mcontext.gregs[REG_RDI] &= ~CLONE_VM; /* vfork+vm not supported */
		}
		if (uc->uc_mcontext.gregs[REG_RDI] & CLONE_VM) {
			goto syscall_passthru;
		}
		/* fall through: address-space-copying clone goes to the helper thread */
	case __NR_execve:
	case __NR_execveat:
	case __NR_fork:
		/* Run on the unfiltered helper thread so the child / new image
		 * does not inherit the seccomp filter. */
		issue_syscall_request(&noseccomp_syscall_queue, siginfo->si_syscall, uc);
		return;
	case __NR_rt_sigprocmask:
		handle_sigprocmask(siginfo->si_syscall, uc);
		return;
	case __NR_openat:
	default:
		break;
	}
	/* Guard against SIGSYS raised by the tracing code below. */
	if (pthread_setspecific(nested_signal_key, (void *)1UL))
		abort();
	result = unw_init_local2(&cursor, ctx, UNW_INIT_SIGNAL_FRAME);
	if (result < 0) {
		FAIL("*** handle_sigsys: unw_init_local2 failed: ", unw_strerror(result));
		return;
	}
	{
		/* openat()'s pathname is the second argument (RSI). */
		const char *fname = (const char *)uc->uc_mcontext.gregs[REG_RSI];
		struct iovec vecs[] = {
#define VECSTR(x) { (void *)(x), sizeof(x) - 1 }
			VECSTR("openat(..., \""),
			{ (void *)fname, fname ? strlen(fname) : 0 },
			VECSTR("\", ...) called, backtrace:\n"),
#undef VECSTR
		};
		writev(STDERR_FILENO, vecs, ARRAY_SIZE(vecs));
	}
	do {
		result = unw_get_proc_name(&cursor, funcname, sizeof(funcname), &offset);
		if (result >= 0) {
			print_backtrace_entry(funcname, offset);
		} else {
			/* No symbol: fall back to the raw instruction pointer. */
			unw_get_reg(&cursor, UNW_REG_IP, &offset);
			print_backtrace_entry(NULL, offset);
		}
	} while ((result = unw_step(&cursor)) > 0);
	if (result < 0) {
		FAIL("*** handle_sigsys: unw_step failed: ", unw_strerror(result));
		return;
	}
	MESSAGE("end backtrace.\n\n");
	pthread_setspecific(nested_signal_key, NULL);
syscall_passthru:
	execute_syscall(uc, (unsigned int)siginfo->si_syscall);
}
/* Instruction indices of the BPF program assembled in init_filter().  All
 * conditional-jump displacements are computed from these enumerators, so
 * inserting or removing an instruction cannot silently skew the jumps.
 * NOTE(review): VerdictKillProcess has no instruction assigned in the
 * filter (its slot stays zero-initialized) and no jump targets it —
 * apparently a leftover; confirm before relying on it. */
enum {
	LoadArch,
	CheckArch,
	LoadNr,
	CheckX32,
	CheckSigaction,
	LoadSigcallArg0,
	CheckSigsys,
	CheckRtsigprocmask,
	CheckFork,
	CheckClone,
	CheckOpenat,
	CheckExecve,
	CheckExecveat,
	LoadIPLo_1,
	CheckIPLo_1,
	LoadIPHi_1,
	CheckIPHi_1,
	LoadIPLo_2,
	CheckIPLo_2,
	LoadIPHi_2,
	CheckIPHi_2,
	VerdictTrap,
	VerdictAllow,
	VerdictKillProcess,
	VerdictSucceed,
	FilterLength,
};
/* Build and install the seccomp BPF filter and the SIGSYS handler.
 * Verdicts: non-x86-64 arch or x32 syscall number -> allow;
 * rt_sigaction(SIGSYS, ...) -> fake success (errno 0), protecting our
 * handler from being replaced; rt_sigprocmask/fork/clone/openat/execve/
 * execveat -> SIGSYS trap, UNLESS the instruction pointer equals one of the
 * two exempt stub `syscall` addresses (patched into the program below), so
 * the tracer's own re-issues pass through. */
static void init_filter(void)
{
	static struct sock_filter filter[FilterLength] = {
		[LoadArch ] = BPF_STMT(BPF_LD | BPF_W | BPF_ABS, (offsetof(struct seccomp_data, arch))),
		[CheckArch ] = BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, AUDIT_ARCH_X86_64, 0, VerdictAllow - CheckArch - 1),
		[LoadNr ] = BPF_STMT(BPF_LD | BPF_W | BPF_ABS, (offsetof(struct seccomp_data, nr))),
		[CheckX32 ] = BPF_JUMP(BPF_JMP | BPF_JGT | BPF_K, (1 << 30) - 1, VerdictAllow - CheckX32 - 1, 0),
		[CheckSigaction ] = BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_rt_sigaction, LoadSigcallArg0 - CheckSigaction - 1, CheckRtsigprocmask - CheckSigaction - 1),
		[LoadSigcallArg0 ] = BPF_STMT(BPF_LD | BPF_W | BPF_ABS, (offsetof(struct seccomp_data, args[0]))),
		/* sigaction(SIGSYS, ...) -> no-op */
		[CheckSigsys ] = BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, SIGSYS, VerdictSucceed - CheckSigsys - 1, CheckRtsigprocmask - CheckSigsys - 1),
		[CheckRtsigprocmask] = BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_rt_sigprocmask, LoadIPLo_1 - CheckRtsigprocmask - 1, 0),
		[CheckFork ] = BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_fork, LoadIPLo_1 - CheckFork - 1, 0),
		[CheckClone ] = BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_clone, LoadIPLo_1 - CheckClone - 1, 0),
		[CheckOpenat ] = BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_openat, LoadIPLo_1 - CheckOpenat - 1, 0),
		[CheckExecve ] = BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_execve, LoadIPLo_1 - CheckExecve - 1, 0),
		[CheckExecveat ] = BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_execveat, LoadIPLo_1 - CheckExecveat - 1, VerdictAllow - CheckExecveat - 1),
		/* PC == seccomp_exempt_address -> allow */
		/* BPF words are 32-bit, so each 64-bit stub address is compared
		 * as a lo/hi pair; the k fields start at 0 and are patched at
		 * run time below once the stub addresses are known. */
		[LoadIPLo_1 ] = BPF_STMT(BPF_LD | BPF_W | BPF_ABS, (offsetof(struct seccomp_data, instruction_pointer))),
		[CheckIPLo_1 ] = BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0, 0, CheckIPLo_2 - CheckIPLo_1 - 1),
		[LoadIPHi_1 ] = BPF_STMT(BPF_LD | BPF_W | BPF_ABS, (offsetof(struct seccomp_data, instruction_pointer) + 4)),
		[CheckIPHi_1 ] = BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0, VerdictAllow - CheckIPHi_1 - 1, LoadIPLo_2 - CheckIPHi_1 - 1),
		[LoadIPLo_2 ] = BPF_STMT(BPF_LD | BPF_W | BPF_ABS, (offsetof(struct seccomp_data, instruction_pointer))),
		[CheckIPLo_2 ] = BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0, 0, VerdictTrap - CheckIPLo_2 - 1),
		[LoadIPHi_2 ] = BPF_STMT(BPF_LD | BPF_W | BPF_ABS, (offsetof(struct seccomp_data, instruction_pointer) + 4)),
		[CheckIPHi_2 ] = BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0, VerdictAllow - CheckIPHi_2 - 1, VerdictTrap - CheckIPHi_2 - 1),
		[VerdictTrap ] = BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_TRAP),
		[VerdictAllow ] = BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
		/* SECCOMP_RET_ERRNO with errno 0: the syscall "succeeds". */
		[VerdictSucceed ] = BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ERRNO),
	};
	static const struct sock_fprog prog = {
		.len = ARRAY_SIZE(filter),
		.filter = (struct sock_filter *)filter,
	};
	/* SA_NODEFER: SIGSYS stays deliverable inside its own handler. */
	static const struct sigaction sigact = { .sa_flags = SA_SIGINFO | SA_NODEFER, .sa_sigaction = handle_sigsys };
	unsigned long ip_exempts[2] = {
		(unsigned long)&seccomp_exempt_address_clone,
		(unsigned long)&seccomp_exempt_address_noclone,
	};
	/* Patch the two 64-bit exempt addresses into the four IP-compare
	 * instructions (split into low and high 32-bit words). */
	memcpy(&filter[CheckIPLo_1].k, (unsigned char *)&ip_exempts[0] + 0, sizeof(unsigned int));
	memcpy(&filter[CheckIPHi_1].k, (unsigned char *)&ip_exempts[0] + 4, sizeof(unsigned int));
	memcpy(&filter[CheckIPLo_2].k, (unsigned char *)&ip_exempts[1] + 0, sizeof(unsigned int));
	memcpy(&filter[CheckIPHi_2].k, (unsigned char *)&ip_exempts[1] + 4, sizeof(unsigned int));
	pthread_key_create(&nested_signal_key, NULL);
	if (sigaction(SIGSYS, &sigact, NULL)) {
		perror("sigaction");
		exit(EXIT_FAILURE);
	}
	/* First attempt may fail with EACCES when we lack CAP_SYS_ADMIN;
	 * retry after setting no_new_privs, which seccomp then permits. */
	if (syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER, 0, (struct sock_fprog *)&prog)) {
		prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
		if (syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER, 0, (struct sock_fprog *)&prog)) {
			perror("seccomp");
			exit(EXIT_FAILURE);
		}
	}
}
/* Helper-thread main loop: executes queued syscall requests unfiltered.
 * seccomp(SECCOMP_SET_MODE_FILTER) without the TSYNC flag applies only to
 * the installing thread, so this thread — created before init_filter() —
 * never gets the filter, and processes it fork()s/exec()s start clean. */
void *noseccomp_syscall_thread(void *arg)
{
	int result;
	struct queue_item *item = NULL;
	pid_t orig_tid = gettid();
	while (!(result = next_queue_item(&noseccomp_syscall_queue, &item, 0, NULL))) {
		struct syscall_request *req = container_of(item, struct syscall_request, queue_item);
		execute_syscall(req->uc, req->nr);
		/* After a successful fork() the CHILD continues here with a new
		 * TID: jump back into the forker's saved context instead of
		 * keeping a queue-servicing loop alive in the child. */
		if (orig_tid != gettid()) { /* fork() */
			restore_context_trampoline(req->uc);
		}
		ack_queue_item(&noseccomp_syscall_queue, &item, 0, NULL);
	}
	return 0;
}
/* Library constructor.  Order matters: the helper thread must exist before
 * init_filter() runs so it remains unfiltered (seccomp without TSYNC only
 * affects the installing thread). */
__attribute__((constructor))
static void main_ctor(void)
{
	/* Ensure access to unwind-unsafe region does not fault */
	mlock((void *)seccomp_exempt_address_clone,
	      (unsigned char *)syscall_return_clone -
	      (unsigned char *)seccomp_exempt_address_clone);
	/* fork()/execve() requests use this thread to bypass seccomp inheritance */
	pthread_create(&queue_thread, NULL, &noseccomp_syscall_thread, NULL);
	init_filter();
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Build the openat() tracer as a preloadable shared library (libmain.so).
CC = gcc
# -shared -fPIC: built as an LD_PRELOAD library, not an executable.
CFLAGS = -O2 -g -shared -fPIC -Wall $(EXTRACFLAGS)
LDFLAGS = -Wl,-z,now,-z,relro $(EXTRALDFLAGS)
# libunwind for the SIGSYS backtraces; pthread for the helper thread.
LIBS = -lunwind-x86_64 -lpthread
all: libmain.so
libmain.so: main.c
	$(CC) $(CFLAGS) -o libmain.so main.c $(LDFLAGS) $(LIBS)
clean:
	@rm -f libmain.so
# Smoke test: run a Python threading script with the tracer preloaded.
test-thread: libmain.so
	@LD_PRELOAD="./libmain.so" python thread-test.py
.PHONY: all clean test-thread
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3
# Smoke test for the preloaded tracer: starting a timer thread exercises
# clone(CLONE_VM) and the thread's eventual wakeup under the seccomp filter.
import threading
thr = threading.Timer(1, lambda: print('Timeout'))
thr.start()
print('Thread started')
thr.join()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment