#include "ggml-threading.h" #include #define GGML_UNUSED(x) (void)(x) // // NUMA support // #define GGML_NUMA_MAX_NODES 8 #define GGML_NUMA_MAX_CPUS 512 struct ggml_numa_node { uint32_t cpus[GGML_NUMA_MAX_CPUS]; // hardware threads on this node uint32_t n_cpus; }; struct ggml_numa_nodes { enum ggml_numa_strategy numa_strategy; struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES]; uint32_t n_nodes; uint32_t total_cpus; // hardware threads on system uint32_t current_node; // node on which main process is execting #if defined(__gnu_linux__) cpu_set_t cpuset; // cpuset from numactl #else uint32_t cpuset; // no NUMA support outside of Linux at this time. Use a portable datatype #endif }; struct ggml_numa_nodes g_state_numa; #if defined(__gnu_linux__) static cpu_set_t ggml_get_numa_affinity(void) { cpu_set_t cpuset; pthread_t thread; thread = pthread_self(); CPU_ZERO(&cpuset); pthread_getaffinity_np(thread, sizeof(cpu_set_t), &cpuset); return cpuset; } #else static uint32_t ggml_get_numa_affinity(void) { return 0; // no NUMA support } #endif void ggml_numa_zero() { g_state_numa.n_nodes = 0; g_state_numa.total_cpus = 0; } void ggml_numa_init(enum ggml_numa_strategy numa_flag) { if (g_state_numa.n_nodes > 0) { fprintf(stderr, "ggml_numa_init: NUMA already initialized\n"); return; } #if defined(__gnu_linux__) struct stat st; char path[256]; int rv; // set numa scheme g_state_numa.numa_strategy = numa_flag; GGML_PRINT_DEBUG("numa strategy %u\n", g_state_numa.numa_strategy); g_state_numa.cpuset = ggml_get_numa_affinity(); // enumerate nodes while (g_state_numa.n_nodes < GGML_NUMA_MAX_NODES) { rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", g_state_numa.n_nodes); GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path)); if (stat(path, &st) != 0) { break; } ++g_state_numa.n_nodes; } // enumerate CPUs while (g_state_numa.total_cpus < GGML_NUMA_MAX_CPUS) { rv = snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%u", g_state_numa.total_cpus); GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path)); if (stat(path, &st) != 0) { break; } ++g_state_numa.total_cpus; } GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", g_state_numa.n_nodes, g_state_numa.total_cpus); // figure out which node we're on uint current_cpu; int getcpu_ret = 0; #if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ > 28) || defined(__COSMOPOLITAN__) getcpu_ret = getcpu(¤t_cpu, &g_state_numa.current_node); #else // old glibc doesn't have a wrapper for this call. Fall back on direct syscall # if !defined(SYS_getcpu) && defined(SYS_get_cpu) # define SYS_getcpu SYS_get_cpu // some older glibc versions use this name # endif getcpu_ret = syscall(SYS_getcpu, ¤t_cpu, &g_state_numa.current_node); #endif if (g_state_numa.n_nodes < 1 || g_state_numa.total_cpus < 1 || getcpu_ret != 0) { g_state_numa.n_nodes = 0; return; } GGML_PRINT_DEBUG("found our process on numa node %u, CPU %u\n", g_state_numa.current_node, current_cpu); for (uint32_t n = 0; n < g_state_numa.n_nodes; ++n) { struct ggml_numa_node* node = &g_state_numa.nodes[n]; GGML_PRINT_DEBUG("CPUs on node %u:", n); node->n_cpus = 0; for (uint32_t c = 0; c < g_state_numa.total_cpus; ++c) { rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u/cpu%u", n, c); GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path)); if (stat(path, &st) == 0) { node->cpus[node->n_cpus++] = c; GGML_PRINT_DEBUG(" %u", c); } } GGML_PRINT_DEBUG("\n"); } if (ggml_is_numa()) { FILE* fptr = fopen("/proc/sys/kernel/numa_balancing", "r"); if (fptr != NULL) { char buf[42]; if (fgets(buf, sizeof(buf), fptr) && strncmp(buf, "0\n", sizeof(buf)) != 0) { GGML_PRINT("WARNING: /proc/sys/kernel/numa_balancing is enabled, this has been observed to impair performance\n"); } fclose(fptr); } } #else GGML_UNUSED(numa_flag); // TODO #endif } bool ggml_is_numa(void) { return g_state_numa.n_nodes > 1; } //////////////////////////////////////////////////////////////////////////////// #if defined(_WIN32) void atomic_store(atomic_int* ptr, LONG val) { InterlockedExchange(ptr, val); } LONG atomic_load(atomic_int* ptr) { return InterlockedCompareExchange(ptr, 0, 0); } LONG atomic_fetch_add(atomic_int* ptr, LONG inc) { return InterlockedExchangeAdd(ptr, inc); } LONG atomic_fetch_sub(atomic_int* ptr, LONG dec) { return atomic_fetch_add(ptr, -(dec)); } int pthread_create(pthread_t* out, void* unused, thread_ret_t(*func)(void*), void* arg) { (void)unused; HANDLE handle = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)func, arg, 0, NULL); if (handle == NULL) { return EAGAIN; } *out = handle; return 0; } int pthread_join(pthread_t thread, void* unused) { (void)unused; int ret = (int)WaitForSingleObject(thread, INFINITE); CloseHandle(thread); return ret; } int sched_yield(void) { Sleep(0); return 0; } #endif static atomic_int g_state_barrier = 0; // barrier via spin lock for g_state void ggml_critical_section_start(void) { int processing = atomic_fetch_add(&g_state_barrier, 1); while (processing > 0) { // wait for other threads to finish atomic_fetch_sub(&g_state_barrier, 1); sched_yield(); // TODO: reconsider this processing = atomic_fetch_add(&g_state_barrier, 1); } } // TODO: make this somehow automatically executed // some sort of "sentry" mechanism void ggml_critical_section_end(void) { atomic_fetch_sub(&g_state_barrier, 1); } // Android's libc implementation "bionic" does not support setting affinity #if defined(__gnu_linux__) void set_numa_thread_affinity(int thread_n) { if (!ggml_is_numa()) { return; } int node_num; int rv; size_t setsize = CPU_ALLOC_SIZE(g_state_numa.total_cpus); switch (g_state_numa.numa_strategy) { case GGML_NUMA_STRATEGY_DISTRIBUTE: // run thread on node_num thread_n / (threads per node) node_num = thread_n % g_state_numa.n_nodes; break; case GGML_NUMA_STRATEGY_ISOLATE: // run thread on current_node node_num = g_state_numa.current_node; break; case GGML_NUMA_STRATEGY_NUMACTL: // use the cpuset that numactl gave us rv = pthread_setaffinity_np(pthread_self(), setsize, &g_state_numa.cpuset); if (rv) { fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv)); } return; default: return; } struct ggml_numa_node* node = &g_state_numa.nodes[node_num]; cpu_set_t* cpus = CPU_ALLOC(g_state_numa.total_cpus); CPU_ZERO_S(setsize, cpus); for (size_t i = 0; i < node->n_cpus; ++i) { CPU_SET_S(node->cpus[i], setsize, cpus); } rv = pthread_setaffinity_np(pthread_self(), setsize, cpus); if (rv) { fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv)); } CPU_FREE(cpus); } void clear_numa_thread_affinity(void) { if (!ggml_is_numa()) { return; } size_t setsize = CPU_ALLOC_SIZE(g_state_numa.total_cpus); cpu_set_t* cpus = CPU_ALLOC(g_state_numa.total_cpus); CPU_ZERO_S(setsize, cpus); for (unsigned i = 0; i < g_state_numa.total_cpus; ++i) { CPU_SET_S(i, setsize, cpus); } int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus); if (rv) { fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv)); } CPU_FREE(cpus); } #else // TODO: Windows etc. // (the linux implementation may also work on BSD, someone should test) void set_numa_thread_affinity(int thread_n) { GGML_UNUSED(thread_n); } void clear_numa_thread_affinity(void) {} #endif