diff --git a/src/cpu.cpp b/src/cpu.cpp index 287b95b..c544baa 100644 --- a/src/cpu.cpp +++ b/src/cpu.cpp @@ -41,11 +41,12 @@ void CpuThread::run() { Cubiomes *cubiomes = cubiomes_create(large_biomes); - while (!should_stop()) { + while (true) { GpuOutput input; { std::unique_lock lock(inputs.mutex); if (inputs.queue.empty()) { + if (should_stop()) break; lock.unlock(); std::this_thread::sleep_for(std::chrono::seconds(1)); continue; diff --git a/src/gpu.cu b/src/gpu.cu index 7d00c6b..84e201a 100644 --- a/src/gpu.cu +++ b/src/gpu.cu @@ -1861,31 +1861,39 @@ void GpuThread::run() { std::printf("\n"); std::printf("start_seed = %" PRIi64 "\n", start_seed); + uint64_t total_inputs = stage_filter_seeds.total_inputs * stage_filter_seeds.inputs_multiplier; + uint64_t total_outputs = filter_2.back().stage.total_outputs; + + // Find max input value for dynamic column width + uint64_t max_inputs = total_inputs; + uint64_t gradvecs_1_inputs = stage_filter_gradvecs_1.total_inputs * stage_filter_gradvecs_1.inputs_multiplier; + if (gradvecs_1_inputs > max_inputs) max_inputs = gradvecs_1_inputs; + int inputs_width = std::max(12, (int)std::snprintf(nullptr, 0, "%" PRIu64, max_inputs)); + int outputs_width = 12; + double kernel_total_time = 0; for (auto &stage : stage_stats) { uint64_t scaled_total_inputs = stage.total_inputs * stage.inputs_multiplier; auto [scaled_input_speed, input_speed_unit] = scale_si(scaled_total_inputs / stage.total_time); auto [scaled_output_speed, output_speed_unit] = scale_si(stage.total_outputs / stage.total_time); - std::printf("%-20s - %9.3f ms | %7.3f %% | %12" PRIu64 " -> %12" PRIu64 + std::printf("%-20s - %9.3f ms | %7.3f %% | %*" PRIu64 " -> %*" PRIu64 " | 1 in %11.3f | %7.3f %cips | %7.3f %cops\n", stage.name.c_str(), stage.total_time * 1e3, stage.total_time / host_total_time * 100.0, - scaled_total_inputs, stage.total_outputs, + inputs_width, scaled_total_inputs, outputs_width, stage.total_outputs, (double)scaled_total_inputs / stage.total_outputs, scaled_input_speed, input_speed_unit, scaled_output_speed, output_speed_unit); kernel_total_time += stage.total_time; } - uint64_t total_inputs = stage_filter_seeds.total_inputs * stage_filter_seeds.inputs_multiplier; - uint64_t total_outputs = filter_2.back().stage.total_outputs; auto [scaled_input_speed, input_speed_unit] = scale_si(total_inputs / host_total_time); auto [scaled_output_speed, output_speed_unit] = scale_si(total_outputs / host_total_time); std::printf( - "total - %9.3f ms | %7.3f %% | %12" PRIu64 - " -> %12" PRIu64 " | | %7.3f %cips | %7.3f %cops\n", - host_total_time * 1e3, kernel_total_time / host_total_time * 100.0, - total_inputs, total_outputs, scaled_input_speed, input_speed_unit, + "%-20s - %9.3f ms | %7.3f %% | %*" PRIu64 + " -> %*" PRIu64 " | | %7.3f %cips | %7.3f %cops\n", + "total", host_total_time * 1e3, kernel_total_time / host_total_time * 100.0, + inputs_width, total_inputs, outputs_width, total_outputs, scaled_input_speed, input_speed_unit, scaled_output_speed, output_speed_unit); size_t gpu_outputs_size; diff --git a/src/main.cpp b/src/main.cpp index bf6603c..87a4888 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -14,11 +14,24 @@ #include #include #include +#include #include #include #include #include #include +#include + +static std::atomic_bool running{true}; +static std::atomic_uint32_t signal_count{0}; + +static void signal_handler(int) { + uint32_t count = signal_count.fetch_add(1, std::memory_order_relaxed) + 1; + running.store(false, std::memory_order_relaxed); + if (count >= 2) { + _exit(130); + } +} #ifdef NO_GPU constexpr bool no_gpu = true; @@ -244,7 +257,10 @@ int main_inner(int argc, char **argv) { } #endif - for (size_t i = 0;; i++) { + std::signal(SIGINT, signal_handler); + std::signal(SIGTERM, signal_handler); + + for (size_t i = 0; running.load(std::memory_order_relaxed); i++) { if (threads != 0) { std::lock_guard lock(cpu_outputs.mutex); while (!cpu_outputs.queue.empty()) { @@ -264,6 +280,23 @@ int main_inner(int argc, char **argv) { std::this_thread::sleep_for(std::chrono::seconds(1)); } + size_t pending_cpu_inputs = 0; + { + std::lock_guard lock(gpu_outputs.mutex); + pending_cpu_inputs = gpu_outputs.queue.size(); + } + size_t pending_cpu_outputs = 0; + { + std::lock_guard lock(cpu_outputs.mutex); + pending_cpu_outputs = cpu_outputs.queue.size(); + } + + std::printf("\nShutting down...\n"); + if (pending_cpu_inputs > 0 || pending_cpu_outputs > 0) { + std::printf("Pending CPU input queue: %zu, pending CPU output queue: %zu\n", pending_cpu_inputs, pending_cpu_outputs); + } + std::printf("Press Ctrl+C again to force exit immediately.\n"); + #ifndef NO_GPU for (auto &thread : gpu_threads) { (*thread).stop(); @@ -279,19 +312,34 @@ int main_inner(int argc, char **argv) { (*client_thread).stop(); } if (server_thread) { - (*server_thread).stop(); + (*server_thread).shutdown(); } #endif #ifndef NO_GPU + std::printf("Waiting for GPU batches to finish...\n"); for (auto &thread : gpu_threads) { (*thread).join(); } + { + uint64_t total_seeds_checked = seed_range.pos.load(std::memory_order_relaxed) - start_seed; + std::printf("Start seed: %" PRIi64 ", Total seeds checked: %" PRIu64 "\n", start_seed, total_seeds_checked); + } #endif #ifndef NO_CPU for (auto &thread : cpu_threads) { (*thread).join(); } + { + std::lock_guard lock(cpu_outputs.mutex); + while (!cpu_outputs.queue.empty()) { + auto output = cpu_outputs.queue.front(); + cpu_outputs.queue.pop(); + std::printf("%" PRIi64 " at %" PRIi32 " %" PRIi32 " with %" PRIi32 "\n", output.seed, output.x, output.z, output.score); + std::fprintf(output_file, "%" PRIi64 " %" PRIi32 " %" PRIi32 " %" PRIi32 "\n", output.seed, output.x, output.z, output.score); + std::fflush(output_file); + } + } #endif #ifndef NO_NET if (client_thread) { diff --git a/src/server.cpp b/src/server.cpp index d32f0ca..ea8ef9f 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -128,6 +128,7 @@ void ServerThread::run() { try { asio::io_context io_context; + io_ctx.store(&io_context, std::memory_order_release); asio::ip::tcp::resolver resolver(io_context); auto endpoints = resolver.resolve(listen_address.host, listen_address.service); @@ -140,8 +141,16 @@ void ServerThread::run() { tcp_server server(io_context, endpoint, outputs); io_context.run(); + + io_ctx.store(nullptr, std::memory_order_release); } catch (std::exception &e) { std::fprintf(stderr, "Uncaught exception on server thread: %s\n", e.what()); std::abort(); } +} + +void ServerThread::shutdown() { + stop(); + auto *ctx = io_ctx.load(std::memory_order_acquire); + if (ctx) ctx->stop(); } \ No newline at end of file diff --git a/src/server.h b/src/server.h index 21b8c4a..2a0fe5e 100644 --- a/src/server.h +++ b/src/server.h @@ -2,11 +2,15 @@ #include "common.h" +namespace asio { class io_context; } + struct ServerThread: Thread { HostService &listen_address; GpuOutputs &outputs; + std::atomic io_ctx{nullptr}; ServerThread(HostService &listen_address, GpuOutputs &outputs); void run(); + void shutdown(); }; \ No newline at end of file