// Copyright Epic Games, Inc. All Rights Reserved. #include "UbaThread.h" #include "UbaPlatform.h" #if PLATFORM_WINDOWS #include #elif PLATFORM_LINUX #include #include #include #include #define REG_SP regs.rsp #else #include #include #endif #define UBA_TRACK_THREADS 0 namespace uba { bool AlternateThreadGroupAffinity(void* nativeThreadHandle) { #if PLATFORM_WINDOWS int processorGroupCount = GetProcessorGroupCount(); if (processorGroupCount <= 1) return true; static Atomic processorGroupCounter; u16 processorGroup = u16((processorGroupCounter++) % processorGroupCount); u32 groupProcessorCount = ::GetActiveProcessorCount(processorGroup); GROUP_AFFINITY groupAffinity = {}; groupAffinity.Mask = ~0ull >> (int)(64 - groupProcessorCount); groupAffinity.Group = processorGroup; return ::SetThreadGroupAffinity(nativeThreadHandle, &groupAffinity, NULL); #else return true; #endif } bool SetThreadGroupAffinity(void* nativeThreadHandle, const GroupAffinity& affinity) { #if PLATFORM_WINDOWS if (GetProcessorGroupCount() <= 1) return true; GROUP_AFFINITY groupAffinity = {}; groupAffinity.Mask = affinity.mask; groupAffinity.Group = affinity.group; return ::SetThreadGroupAffinity(nativeThreadHandle, &groupAffinity, NULL); #else return false; #endif } Futex g_allThreadsLock; Thread* g_firstThread; Thread::Thread() { } Thread::Thread(Function&& func, const tchar* description) { Start(std::move(func), description); } Thread::~Thread() { Wait(); } void Thread::Start(Function&& f, const tchar* description) { m_func = std::move(f); #if PLATFORM_WINDOWS m_handle = CreateThread(NULL, 0, [](LPVOID p) -> DWORD { return ((Thread*)p)->m_func(); }, this, 0, NULL); UBA_ASSERT(m_handle); if (!m_handle) return; if (description) SetThreadDescription(m_handle, description); AlternateThreadGroupAffinity(m_handle); #else int err = 0; m_finished.Create(true); static_assert(sizeof(pthread_t) <= sizeof(m_handle), ""); auto& pth = *(pthread_t*)&m_handle; pthread_attr_t tattr; // initialized with default attributes err = pthread_attr_init(&tattr); // TODO: Need to figure out a better value, or decrease stack usage // without this though we get a bus error on Intel Macs #if !defined(__arm__) && !defined(__arm64__) size_t size = PTHREAD_STACK_MIN * 500; err = pthread_attr_setstacksize(&tattr, size); #endif err = pthread_create(&pth, &tattr, [](void* p) -> void* { // Ignore sigint. Uba is designed for someone else to cancel it #if !PLATFORM_WINDOWS sigset_t set; sigemptyset(&set); sigaddset(&set, SIGINT); pthread_sigmask(SIG_BLOCK, &set, NULL); #endif auto& t = *(Thread*)p; int res = t.m_func(); t.m_finished.Set(); return (void*)(uintptr_t)res; }, this); UBA_ASSERT(err == 0); (void)err; if (!description) description = "UbaUnknown"; #if PLATFORM_MACOS pthread_setname_np(description); #else pthread_setname_np(pth, description); #endif err = pthread_attr_destroy(&tattr); UBA_ASSERT(err == 0); (void)err; #endif #if UBA_TRACK_THREADS SCOPED_FUTEX(g_allThreadsLock, lock); printf("Adding THREAD %llx\n", (u64)*(pthread_t*)&m_handle); m_next = g_firstThread; if (m_next) m_next->m_prev = this; g_firstThread = this; #endif } bool Thread::Wait(u32 milliseconds, Event* wakeupEvent) { SCOPED_READ_LOCK(m_funcLock, readLock); if (!m_handle) return true; auto removeThread = [this]() { #if UBA_TRACK_THREADS SCOPED_FUTEX(g_allThreadsLock, lock); printf("REMOVING THREAD %llx\n", (u64)*(pthread_t*)&m_handle); if (m_next) m_next->m_prev = m_prev; if (m_prev) m_prev->m_next = m_next; else if (g_firstThread == this) g_firstThread = m_next; m_next = nullptr; m_prev = nullptr; #endif }; #if PLATFORM_WINDOWS // Optimization, not needed in initial implementation if (wakeupEvent) { HANDLE h[] = { m_handle, wakeupEvent->GetHandle() }; DWORD res = WaitForMultipleObjects(2, h, false, milliseconds); if (res == WAIT_OBJECT_0 + 1 || res == WAIT_TIMEOUT) return false; } else { if (WaitForSingleObject(m_handle, milliseconds) == WAIT_TIMEOUT) return false; } removeThread(); #else if (!m_finished.IsSet(milliseconds)) return false; removeThread(); int* ptr = 0; int res = pthread_join(*(pthread_t*)&m_handle, (void**)&ptr); UBA_ASSERT(res == 0); #endif readLock.Leave(); SCOPED_WRITE_LOCK(m_funcLock, lock); if (!m_handle) return true; #if PLATFORM_WINDOWS CloseHandle(m_handle); #endif m_func = {}; m_handle = nullptr; return true; } bool Thread::GetGroupAffinity(GroupAffinity& out) { #if PLATFORM_WINDOWS if (GetProcessorGroupCount() <= 1) return true; GROUP_AFFINITY aff; if (!::GetThreadGroupAffinity(m_handle, &aff)) return false; out.mask = aff.Mask; out.group = aff.Group; return true; #else return false; #endif } bool TraverseAllThreads(const TraverseThreadFunc& func, const TraverseThreadErrorFunc& errorFunc) { #if PLATFORM_WINDOWS StringBuffer<256> error; auto reportError = [&](HANDLE hThread, const tchar* call) { errorFunc(error.Clear().Appendf(TC("%s failed for thread %llu (%s)"), call, u64(hThread), LastErrorToText().data)); }; #if UBA_TRACK_THREADS DWORD currentThreadId = GetCurrentThreadId(); SCOPED_FUTEX(g_allThreadsLock, lock); for (Thread* t=g_firstThread; t; t=t->m_next) { HANDLE hThread = t->m_handle; if (currentThreadId == GetThreadId(hThread)) continue; if (SuspendThread(hThread) == -1) { reportError(hThread, TC("SuspendThread")); continue; } auto rtg = MakeGuard([&]() { ResumeThread(hThread); }); CONTEXT ctx; memset(&ctx, 0, sizeof(CONTEXT)); ctx.ContextFlags = CONTEXT_FULL; if (!GetThreadContext(hThread, &ctx)) { reportError(hThread, TC("GetThreadContext")); continue; } func(0, &ctx); } #else DWORD pid = GetCurrentProcessId(); DWORD tid = GetCurrentThreadId(); HANDLE hSnapshot = CreateToolhelp32Snapshot(TH32CS_SNAPTHREAD, 0); if (hSnapshot == INVALID_HANDLE_VALUE) return false; auto sg = MakeGuard([&]() { CloseHandle(hSnapshot); }); UnorderedSet handledCallstacks; THREADENTRY32 te32 = { sizeof(THREADENTRY32) }; if (!Thread32First(hSnapshot, &te32)) return false; do { if (te32.th32OwnerProcessID != pid || te32.th32ThreadID == tid || te32.th32ThreadID == 0) continue; HANDLE hThread = OpenThread(THREAD_GET_CONTEXT | THREAD_SET_CONTEXT | THREAD_SUSPEND_RESUME | THREAD_QUERY_INFORMATION, FALSE, te32.th32ThreadID); if (!hThread) { reportError(hThread, TC("OpenThread")); continue; } auto tg = MakeGuard([&]() { CloseHandle(hThread); }); if (SuspendThread(hThread) == -1) { reportError(hThread, TC("SuspendThread")); continue; } auto rtg = MakeGuard([&]() { ResumeThread(hThread); }); PWSTR threadDesc = nullptr; GetThreadDescription(hThread, &threadDesc); auto tdg = MakeGuard([&]() { LocalFree(threadDesc); }); CONTEXT ctx = {}; ctx.ContextFlags = CONTEXT_ALL; if (!GetThreadContext(hThread, &ctx)) { reportError(hThread, TC("GetThreadContext")); continue; } void* callstack[100]; u32 callstackCount = GetCallstack(callstack, 100, 1, &ctx); func(te32.th32ThreadID, callstack, callstackCount, threadDesc); } while (Thread32Next(hSnapshot, &te32)); #endif return true; #elif PLATFORM_LINUX // TODO: None of these approaches work. // signal path will break if thread is in certain system calls.. and ptrace does not work either :-/ static Event s_ev(false); static const TraverseThreadFunc* s_func; s_func = &func; struct sigaction sa; memset(&sa, 0, sizeof(sa)); sa.sa_flags = SA_SIGINFO; sa.sa_sigaction = [](int sig, siginfo_t* info, void* context) { void* callstack[100]; u32 callstackCount = GetCallstack(callstack, 100, 1, context); (*s_func)(syscall(SYS_gettid), callstack, callstackCount, nullptr); s_ev.Set(); }; sigaction(SIGUSR1, &sa, NULL); DIR* dir = opendir("/proc/self/task"); if (!dir) return false; pid_t currentTid = syscall(SYS_gettid); struct dirent *entry; while ((entry = readdir(dir)) != NULL) { pid_t tid = (pid_t)atoi(entry->d_name); if (tid <= 0 || tid == currentTid) continue; kill(tid, SIGUSR1); s_ev.IsSet(); } #if 0 pid_t parentPid = getpid(); char message[100] = "Original message in parent."; // We need to fork to be able to use ptrace on threads pid_t child = fork(); if (child < 0) { perror("fork"); return false; } if (child == 0) // The child { StringBuffer<128>().Appendf("/proc/%u/task", parentPid); auto g = MakeGuard([] { exit(EXIT_SUCCESS); }); DIR* dir = opendir(StringBuffer<128>().Appendf("/proc/%u/task", parentPid).data); if (!dir) return false; struct dirent *entry; while ((entry = readdir(dir)) != NULL) { pid_t tid = (pid_t)atoi(entry->d_name); if (tid <= 0) continue; if (ptrace(PTRACE_ATTACH, tid, NULL, NULL) == -1) { perror("ptrace attach"); continue; } int status; waitpid(tid, &status, 0); // Wait until the thread stops. struct user_regs_struct regs; if (ptrace(PTRACE_GETREGS, tid, NULL, ®s) == -1) { perror("ptrace getregs"); return false; } printf("Callstack for thread %d:\n", tid); printf("RIP: %llx\n", regs.rip); // Start from the current base pointer. unsigned long long fp = regs.rbp; int frame = 0; #define MAX_FRAMES 64 while (fp && frame < MAX_FRAMES) { // Read the saved frame pointer (first word) and return address (second word) unsigned long long next_fp, ret_addr; errno = 0; next_fp = ptrace(PTRACE_PEEKDATA, tid, (void *)fp, NULL); if (errno != 0) break; ret_addr = ptrace(PTRACE_PEEKDATA, tid, (void *)(fp + sizeof(unsigned long long)), NULL); if (errno != 0) break; printf(" Frame %d: ret_addr = %llx (fp = %llx)\n", frame, ret_addr, fp); fp = next_fp; frame++; } printf("\n"); if (ptrace(PTRACE_DETACH, tid, NULL, NULL) == -1) { perror("ptrace detach"); return false; } const char* new_msg = "Hello from child!"; struct iovec local[1]; local[0].iov_base = (void *)new_msg; local[0].iov_len = strlen(new_msg) + 1; // Include the null terminator // Set up the remote iovec. // The parent's view of the address is the same as the child's copy after fork. struct iovec remote[1]; remote[0].iov_base = message; remote[0].iov_len = strlen(new_msg) + 1; // Use process_vm_writev to write the new message into the parent's memory. ssize_t nwritten = process_vm_writev(getppid(), local, 1, remote, 1, 0); } closedir(dir); } else // The parent { while (message[0] == 'O') { Sleep(500); } } #endif #else task_t task; kern_return_t kr = task_for_pid(mach_task_self(), getpid(), &task); if (kr != KERN_SUCCESS) return false; thread_act_array_t threads; mach_msg_type_number_t thread_count; kr = task_threads(task, &threads, &thread_count); if (kr != KERN_SUCCESS) return false; auto tsg = MakeGuard([&] { vm_deallocate(mach_task_self(), (vm_address_t)threads, thread_count * sizeof(thread_t)); }); for (int i = 0; i < thread_count; i++) { if (threads[i] == mach_thread_self()) continue; auto thread = threads[i]; thread_suspend(thread); auto g = MakeGuard([&] { thread_resume(thread); }); kern_return_t kr; uint64_t pc = 0, fp = 0; #if defined(__x86_64__) x86_thread_state64_t state; mach_msg_type_number_t count = x86_THREAD_STATE64_COUNT; kr = thread_get_state(thread, x86_THREAD_STATE64, (thread_state_t)&state, &count); if (kr != KERN_SUCCESS) return false; pc = state.__rip; fp = state.__rbp; #elif defined(__arm64__) arm_thread_state64_t state; mach_msg_type_number_t count = ARM_THREAD_STATE64_COUNT; kr = thread_get_state(thread, ARM_THREAD_STATE64, (thread_state_t)&state, &count); if (kr != KERN_SUCCESS) return false; pc = state.__pc; fp = state.__fp; #else #error "Unsupported architecture" #endif void* callstack[100]; u32 callstackCount = 0; for (int i = 0; i < 32 && fp; i++) { uint64_t *stack = (uint64_t *)fp; uint64_t return_addr = stack[1]; fp = stack[0]; if (!fp) break; if (i > 0) callstack[callstackCount++] = (void*)return_addr; } g.Execute(); func(thread, callstack, callstackCount, nullptr); } #endif return true; } }