signal_handling.cpp
Go to the documentation of this file.
97 errmsg<<"Tried to broadcast code "<<name<<" to all processes, but the send buffer for process "<<i<<" was already in use! This is a bug in the message send logic of the code using this object, please file a bug report."<<std::endl; 106 // Ensure all processes have received this message (completes the send; must follow ISendToAll at some point) 234 myout <<"#"<<i+1<<": Signal "<<received_signals[i]<<" ("<<signal_name(received_signals[i])<<")"<<std::endl; 239 myout << "Another " << (N_signals - MAX_SIGNALS) <<" signals were caught but their values were not recorded (buffer exceeded)"<<std::endl; 286 const int max_attempts=-1; // Number of extra likelihood evaluations allowed for sync attempts before we declare failure. -1 means "unlimited" 287 const int attempts_before_ff=10; // Number of times to attempt synchronisation before entering a "fast forward" period 291 static std::chrono::time_point<std::chrono::system_clock> start(std::chrono::system_clock::now()); 297 logger() << "Beginning GAMBIT soft shutdown procedure. Control will be returned to the scanner plugin so " 298 << "that it can get its affairs in order in preparation for shutdown (it may cease iterating if " 299 << "it has that capability), and next iteration we will attempt to synchronise all processes and " 300 << "shut them down. If sync fails, we will loop up to "<<max_attempts<<" times (-1 means infinite), attempting to " 306 logger() << "Fast-forward active (loop "<<ff_loop_count<<"); no synchronisation attempted." << EOM; 322 msg << "rank "<<myrank()<<": Tried to synchronise for shutdown (attempt "<<shutdown_attempts<<") but failed. Will now fast-forward through "<<ff_loops<<" iterations in an attempt to 'unlock' possible MPI deadlocks with the scanner."; 333 logger() << "Scanner did not shut down when given the chance; we will therefore assume responsibility for terminating the scan." 
<< EOM; 335 logger() << "Attempting to synchronise for soft shutdown (attempt "<<shutdown_attempts<<")" << EOM; 351 logger() << "Failed to synchronise for soft shutdown! Attempting cleanup anyway, but cannot guarantee safety of the scan output." << EOM; 357 msg << "Soft shutdown failed, emergency shutdown performed instead! (could not synchronise all processes after "<<shutdown_attempts 358 <<" attempts, and after waiting "<<std::chrono::duration_cast<std::chrono::seconds>(time_waited).count() 360 <<" times). Data handled by external scanner codes may have been left in an inconsistent state." << std::endl; 365 logger() << "Attempt to sync for soft shutdown failed (this was attempt "<<shutdown_attempts<<" of "<<max_attempts<<" (-1 means infinite)); " 366 <<std::chrono::duration_cast<std::chrono::seconds>(time_waited).count() <<" seconds have elapsed since " 367 <<"shutdown attempts began). Will allow evaluation to continue and attempt to sync again next iteration." << EOM; 383 // If shutdown is not known to be in progress, check for MPI messages telling us to initiate shutdown 387 logger() << LogTags::core << LogTags::info << "Doing Iprobe to check for shutdown messages from other processes (with MPI tag "<<signalComm->mytag<<")" << EOM; 402 logger() << LogTags::core << LogTags::info << "Received SOFT shutdown message from process with rank " << msg_status.MPI_SOURCE << EOM; 407 logger() << LogTags::core << LogTags::info << "Received EMERGENCY shutdown message from process with rank " << msg_status.MPI_SOURCE << EOM; 412 ss << "Received UNRECOGNISED shutdown message from process with rank " << msg_status.MPI_SOURCE<<". 
Performing emergency shutdown, but please note that this indicates a ***BUG*** somewhere in the signal handling code!!!"; 423 logger() << LogTags::core << LogTags::info << "No shutdown message detected; continuing as normal" << EOM; 448 if(not shutdown_due_to_MPI_message) // Don't broadcast another shutdown message if we are shutting down due to an MPI message we received. Assume that all processes will get the first message (otherwise for 1000 process job we will end up with 1000*1000 shutdown messages clogging up the network) 488 logger() << LogTags::core << LogTags::info << "Doing Iprobe to check for shutdown signals from other processes (with MPI tag " 489 <<signalComm->mytag<<"). These will be discarded (since we are inside the 'discard_excess_shutdown_messages' routine)" << EOM; 491 int max_loops = 2*signalComm->Get_size(); // At most should be one message from every process (minus one), so we will check twice as many times as this before deciding that something has gone horribly wrong. 512 //std::cerr<<"Rank "<<myRank<<" attempting to cleanup shutdown messages from rank "<<rank<<std::endl; 518 //std::cerr<<"Rank "<<myRank<<": Messages waiting from rank "<<rank<<"? "<<signalComm->Iprobe(rank, signalComm->mytag)<<std::endl; 520 //std::cerr<<"Rank "<<myRank<<": received code "<<shutdown_name(code)<<" from rank "<<rank<<std::endl; 528 errmsg<<"ensure_no_more_shutdown_messages function has been looping on rank "<<myRank<<" for "<<loop<<" iterations (receiving messages from rank "<<rank<<" process), but there are only "<<mpiSize<<" processes in this job. There should not be anywhere near this many shutdown messages to receive, so something has gone horribly wrong. 
Please report this as a bug."; 605 logger() << LogTags::core << LogTags::info << "Broadcasting shutcode code " <<shutdown_name(shutdown_code)<< " with MPI tag "<<signalComm->mytag<< EOM; 608 logger() << LogTags::core << LogTags::info << shutdown_name(shutdown_code) <<" code broadcast to all processes" << EOM; 614 errmsg << "Tried to broadcast_shutdown_signal ("<<shutdown_name(shutdown_code)<<"), but MPI communicator is not ready! (either MPI is uninitialised or a communicator has not been set). This is a bug, please report it."; 618 } // Don't need to broadcast twice (NOTE: might need to trigger change from soft to emergency shutdown?) 622 logger() << LogTags::core << LogTags::info << "Received instruction to broadcast code " <<shutdown_name(shutdown_code) 637 // If we are using MPI, it is required that the signaldata object be initialised with a communicator object 647 errmsg << "Error retrieving global SignalData object! An MPI communicator has not been provided to this object! Please provide one via the 'set_MPI_comm' the first time that 'signaldata()' is called."; 666 // We will avoid touching streams in this "clean" shutdown mode since technically it is undefined behaviour, so no messages here. 668 signaldata().add_signal(sig); // I think this should be ok... but can delete it if there are any problems 675 void set_signal_handler(const YAML::Node& keyvalnode, const int sig, const std::string& def_mode) 691 // else if (shutdown_mode=="emergency_shutdown_longjmp"){ signal(sig, sighandler_emergency_longjmp); } 697 msg << "Invalid shutdown mode requested for signal "<<signal_name(sig)<<" ("<<sig<<")"<<" (via YAML file option '"<<signal_name(sig)<<"' in KeyValue section under 'signal_handling'). Value received was '"<<shutdown_mode<<"'. Valid shutdown modes are:" <<std::endl; 699 msg << " 'emergency_shutdown' -- Attempt to save printer/resume data and then immediately exit." 
<<std::endl; 700 msg << " 'emergency_shutdown_longjmp' -- Longjmp to outside of likelihood loop, then attempt to save printer/resume data and exit"<<std::endl; 701 msg << " 'soft_shutdown' -- Safest: attempt to synchronise processes at safe location, then save printer/resume data and exit." <<std::endl; 703 msg << "The default shutdown mode on signal "<<signal_name(sig)<<" is '"<<def_mode<<"'." <<std::endl;
std::string display_received_signals() Print to string a list of the signals received so far by this process. Definition: signal_handling.cpp:224 void set_shutdown_begun(const sig_atomic_t emergnc=0) Register that shutdown has begun. Definition: signal_handling.cpp:246 void entering_multithreaded_region() Check if shutdown is in progress and raise appropriate termination exception if so. Definition: signal_handling.cpp:550 bool all_processes_ready() Attempt to synchronise all processes, but abort if it takes too long. Definition: signal_handling.cpp:265 bool shutdown_due_to_MPI_message Definition: signal_handling.hpp:188 Logging access header for GAMBIT. Special exception used during controlled early shutdown. Definition: exceptions.hpp:346 void attempt_soft_shutdown() Perform soft shutdown if processes can be synchronised. Definition: signal_handling.cpp:284 void leaving_multithreaded_region() Exit threadsafe signal handling mode. Definition: signal_handling.cpp:559 void add_signal(int sig) Check if emergency shutdown is in progress. Definition: signal_handling.cpp:209 GAMBIT signal handling functions. EXPORT_SYMBOLS bool check_if_shutdown_begun() Check for signals that early shutdown is required. If an MPI message telling us to perform an emergenc... Definition: signal_handling.cpp:379 Definition: log_tags.hpp:35 std::string signal_name(int sig) Translate signal codes to strings. Definition: signal_handling.cpp:35 Special exception raised when emergency shutdown triggered via MPI. Definition: exceptions.hpp:364 volatile sig_atomic_t shutdownBegun Flag to warn if early shutdown is already in process. Definition: signal_handling.hpp:180 Special exception used during emergency early shutdown. Definition: exceptions.hpp:355 EXPORT_SYMBOLS SignalData & signaldata() Retrieve global instance of signal handler options struct. 
Definition: signal_handling.cpp:633 Definition: log_tags.hpp:36 const Logging::endofmessage EOM Explicit const instance of the end of message struct in Gambit namespace. Definition: logger.hpp:100 EXPORT_SYMBOLS Logging::LogMaster & logger() Function to retrieve a reference to the Gambit global log object. Definition: logger.cpp:95 int received_signals[MAX_SIGNALS] Definition: signal_handling.hpp:203 void(* void_func)() Set cleanup function to run during emergency shutdown. Definition: signal_handling.hpp:91 void set_signal_handler(const YAML::Node &keyvalnode, const int sig, const std::string &def_mode) Choose signal handler for a given signal via yaml file option. Definition: signal_handling.cpp:675 A simple C++ wrapper for the MPI C bindings. bool POSIX_signal_noticed Flag to indicate if POSIX shutdown signal has been noticed. Definition: signal_handling.hpp:185 Definition: log_tags.hpp:47 bool shutdown_begun() Check if (any kind of) shutdown is in progress. Definition: signal_handling.cpp:261 void update_looptime(double newtime) Extra functions needed in MPI mode. Definition: signal_handling.cpp:474 std::string myrank() Retrieve MPI rank as a string (for log messages etc.) Definition: signal_handling.cpp:175 static const int MAX_SIGNALS Array to record received signals (up to max_signals) Definition: signal_handling.hpp:202 bool inside_multithreaded_region() Report 'true' if inside a multithreaded region (according to our own flag) Definition: signal_handling.cpp:568 bool inside_omp_block Flag to switch signal handling behavior to multithreaded mode (i.e. Definition: signal_handling.hpp:199 volatile sig_atomic_t emergency Flag to warn if the shutdown that is in progress is an emergency shutdown (used to decide whether to ... Definition: signal_handling.hpp:183 int shutdown_attempts Number of times synchronisation for soft shutdown has been attempted. Definition: signal_handling.hpp:191