00001 /*!@file Beowulf/Beowulf.H Simple interfacing to a Beowulf cluster */ 00002 00003 // //////////////////////////////////////////////////////////////////// // 00004 // The iLab Neuromorphic Vision C++ Toolkit - Copyright (C) 2001 by the // 00005 // University of Southern California (USC) and the iLab at USC. // 00006 // See http://iLab.usc.edu for information about this project. // 00007 // //////////////////////////////////////////////////////////////////// // 00008 // Major portions of the iLab Neuromorphic Vision Toolkit are protected // 00009 // under the U.S. patent ``Computation of Intrinsic Perceptual Saliency // 00010 // in Visual Environments, and Applications'' by Christof Koch and // 00011 // Laurent Itti, California Institute of Technology, 2001 (patent // 00012 // pending; application number 09/912,225 filed July 23, 2001; see // 00013 // http://pair.uspto.gov/cgi-bin/final/home.pl for current status). // 00014 // //////////////////////////////////////////////////////////////////// // 00015 // This file is part of the iLab Neuromorphic Vision C++ Toolkit. // 00016 // // 00017 // The iLab Neuromorphic Vision C++ Toolkit is free software; you can // 00018 // redistribute it and/or modify it under the terms of the GNU General // 00019 // Public License as published by the Free Software Foundation; either // 00020 // version 2 of the License, or (at your option) any later version. // 00021 // // 00022 // The iLab Neuromorphic Vision C++ Toolkit is distributed in the hope // 00023 // that it will be useful, but WITHOUT ANY WARRANTY; without even the // 00024 // implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR // 00025 // PURPOSE. See the GNU General Public License for more details. // 00026 // // 00027 // You should have received a copy of the GNU General Public License // 00028 // along with the iLab Neuromorphic Vision C++ Toolkit; if not, write // 00029 // to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, // 00030 // Boston, MA 02111-1307 USA. // 00031 // //////////////////////////////////////////////////////////////////// // 00032 // 00033 // Primary maintainer for this file: Laurent Itti <itti@usc.edu> 00034 // $HeadURL: svn://isvn.usc.edu/software/invt/trunk/saliency/src/Beowulf/Beowulf.H $ 00035 // $Id: Beowulf.H 8160 2007-03-21 21:34:16Z rjpeters $ 00036 // 00037 00038 #ifndef BEOWULF_H_DEFINED 00039 #define BEOWULF_H_DEFINED 00040 00041 #include "Beowulf/TCPcommunicator.H" 00042 #include "Beowulf/TCPdefs.H" 00043 #include "Beowulf/TCPmessage.H" 00044 #include "Component/ModelComponent.H" 00045 #include "Component/ModelParam.H" 00046 #include "Util/Timer.H" 00047 00048 #include <deque> 00049 #include <pthread.h> 00050 #include <vector> 00051 00052 // ##### 32-bit action codes: 00053 // the following are reserved for init of the beowulf communications: 00054 #define BEO_NONE 0 00055 #define BEO_INIT -1 00056 #define BEO_INIT2 -2 00057 #define BEO_INIT3 -3 00058 00059 //! Simple interfacing to a Beowulf cluster 00060 /*! The idea of this class is to hide all of the low-level 00061 communication setup and transfer details from the user, and to 00062 provide a simple interface for passing messages between nodes on a 00063 Beowulf cluster. Each slave node should instantiate a Beowulf object 00064 and initialize it with slaveInit(). This will block the slave until 00065 it is contacted by the Beowulf master node. The master node 00066 instantiates a Beowulf object, and initializes it using 00067 masterInit(), passing along the hostnames of the slave nodes. During 00068 initialization, the Beowulf master contacts all the slaves and 00069 instructs them to fully interconnect with each other. Once 00070 initialization is complete, any node can send() and receive() 00071 TCPmessages to and from any other node. Both send() and receive() 00072 are non-blocking methods. Actual queueing and transfer of messages 00073 is done in a thread that runs in parallel with the main program 00074 thread. */ 00075 00076 class Beowulf : public ModelComponent { 00077 public: 00078 00079 // ###################################################################### 00080 /*! @name Constructors and destructors */ 00081 //@{ 00082 00083 //! Constructor 00084 /*! @param isMaster true if we are the master of the Beowulf. The 00085 master is the one that gets the list of slaves and then 00086 initializes all the slaves at start() time. */ 00087 Beowulf(OptionManager& mgr, 00088 const std::string& descrName = "Beowulf", 00089 const std::string& tagName = "Beowulf", 00090 const bool ismaster = false); 00091 00092 //! Reset and kill all connections except possibly one (keepfd) 00093 /*! Resets the Beowulf to uninitialized state. Kills all connections, 00094 except possibly one (typically towards the master) that may be 00095 specified as argument. 00096 @param keepfd the fd to keep (or -1 to keep no fd) */ 00097 void resetConnections(const int keepfd = -1); 00098 00099 //! Destructor 00100 /*! Will properly terminate all connections. */ 00101 virtual ~Beowulf(); 00102 //@} 00103 00104 // ###################################################################### 00105 /*! @name Access functions */ 00106 //@{ 00107 00108 //! get number of slave nodes 00109 int getNbSlaves() const; 00110 00111 //! get our node number (-1 is the master) 00112 int getNodeNumber() const; 00113 00114 //! Get hostname:port of node with given node number 00115 /*! This is whatever the user gave at configuration, so it could be 00116 just a short hostname, a fully-qualified hostname, or a 00117 hostname:port. If nb is -1, we return "BeoMaster" */ 00118 const char* nodeName(const int nb) const; 00119 00120 //! Request a so-far unallocated node 00121 /*! This will return the next node number that has not yet been 00122 requested. This only works if we are Beowulf master. It will 00123 generate an error message and return -2 if we have no more 00124 unallocated nodes. We need to be start()'ed for this to work. */ 00125 int requestNode(); 00126 00127 //! De-allocate a currently allocated node 00128 void releaseNode(int nodenum); 00129 00130 //@} 00131 00132 // ###################################################################### 00133 /*! @name Message passing functions */ 00134 //@{ 00135 00136 //! Send message to another node 00137 /*! This method is non-blocking (returns immediately). A copy of msg 00138 is taken, so you can destroy it immediately after send. 00139 @param node_nb is the destination node number. A value of -1 on a 00140 slave Beowulf means that msg should be sent to the Beowulf master. */ 00141 void send(const int node_nb, TCPmessage& msg); 00142 00143 //! Send message to the least-loaded of our slave nodes 00144 /*! This method is non-blocking (returns immediately). A copy of msg 00145 is taken, so you can destroy it immediately after send. Only 00146 works if we are the Beowulf master, fatal error otherwise. This 00147 implements load balancing. The ETI (estimated time to idle) fields 00148 in TCPmessage are used to determine which of our slave nodes has 00149 the shortest pending work queue (i.e., shortest ETI) and the 00150 message will be sent to that node. Thus, this functionality 00151 assumes that every slave node can process every message that you 00152 might send them (as opposed to more constrained architectures 00153 where a given node is only capable of doing a given type of 00154 processing corresponding to a given type of received message). 00155 For this load balancing to work, the slaves should try to put 00156 good-faith estimates of their time to idle (in seconds) each time 00157 they send us (the master) a message back. The master relies on 00158 those good-faith estimates to decide which node is the least 00159 loaded. This approach has severe limitations if your overall 00160 message traffic is low, as your ETI estimates at the master will 00161 not be refreshed regularly and may become grossly 00162 inaccurate. Thus, this approach is mostly intended for streaming 00163 applications, where every node will usually send several messages 00164 back to the master every 30ms or so, so that the ETI estimates 00165 collected at the master will be reasonably fresh and accurate. If 00166 several slave nodes have the lowest ETI, one will be picked at 00167 random. */ 00168 void send(TCPmessage& msg); 00169 00170 //! Receive message from a given node (or from any node) 00171 /*! Check whether a message has been received; returns false 00172 otherwise. If a message was received, the node it came from will 00173 be in node_nb, and its frame and action fields will be pre-decoded 00174 for convenience (they still are in the message itself too). This 00175 method is always non-blocking, i.e., it returns immediately and 00176 does not wait for messages to come in. 00177 @param node_nb node number to receive from, or -1 to receive from 00178 any node. If a message is received, node_nb will be updated to 00179 the node number from which the message was received, or -1 if it 00180 was received from the master. 00181 @param msg received message 00182 @param frame frame number from received message 00183 @param action action field from received message 00184 @param timeout if non-zero, max time (in ms) this call may block 00185 @param err if non-null, then (*err) will be set to non-zero if an error occurs 00186 */ 00187 bool receive(int& node_nb, TCPmessage& msg, int32& frame, 00188 int32& action, const int timeout = 0, 00189 int* err = 0); 00190 00191 //! Do we have any received messages? 00192 /*! Returns the total number of messages in the incoming queues of 00193 our various connected nodes. If node_nb == -1, only consider 00194 messages from the Beowulf master. If node_nb == -2, consider any 00195 node, otherwise only consider the specified node. */ 00196 int nbReceived(const int node_nb = -2); 00197 00198 //@} 00199 00200 protected: 00201 //! names of our slaves as a space-separated list of hostname:port 00202 /*! port is optional and we will use the default SockServ port if 00203 unspecified. This parameter is only used if we are Beowulf master 00204 (see constructor) */ 00205 OModelParam<std::string> itsSlaveNames; 00206 00207 OModelParam<bool> isMaster; //!< true if we are the master 00208 OModelParam<int> selfqlen; //!< self-message queue length 00209 OModelParam<bool> selfdroplast; //!< self-message queue drop policy 00210 OModelParam<double> initTimeout; //!< max time to wait for initialization 00211 00212 //! Intercept people changing our ModelParam 00213 /*! See ModelComponent.H; as parsing the command-line or reading a 00214 config file sets our name, we'll also here instantiate a 00215 controller of the proper type (and export its options) */ 00216 virtual void paramChanged(ModelParamBase* const param, 00217 const bool valueChanged, 00218 ParamClient::ChangeStatus* status); 00219 00220 private: 00221 nub::soft_ref<TCPcommunicator> com; // Handles all communications 00222 00223 struct NodeInfo 00224 { 00225 NodeInfo() : fd(-1), name(), ETI(-1.0f), ETIreceived(-1.0f), 00226 isAvailable(true) {} 00227 00228 int fd; // Translate node number into fd (socket) 00229 std::string name; // Hostname of the slave node 00230 float ETI; // ETIs (in seconds) as sent to us by our slaves 00231 float ETIreceived; // time at which an ETI was last received 00232 bool isAvailable; // true if this node is not currently in use 00233 }; 00234 00235 bool initialized; // True if all communications ok 00236 std::vector<NodeInfo> itsNodes; // Table of per-node info 00237 int *fd2node; // Table to translate fd into node number 00238 int master; // fd of my master if I am a slave 00239 int me; // My node number if I am a slave 00240 00241 Timer tim; // to record message arrival times 00242 std::deque<TCPmessage> selfmsg; // messages to myself 00243 pthread_mutex_t mutselfmsg; // Mutex for access to self message queue 00244 00245 // get started (after our TCPcommunicator has started) 00246 void start2(); 00247 00248 // get stopped (before our TCPcommunicator has stopped) 00249 void stop1(); 00250 00251 //! Initialize as master node, using array of slave node hostnames 00252 /*! Master node initialization will contact all nodes specified and 00253 initialize them. Once this is done, everything will be ready to 00254 send() and receive() TCPmessages. 00255 @param nb_nodes number of slave nodes 00256 @param node_names hostnames of the slave nodes (format name:port) */ 00257 void masterInit(const int nb_nodes, char **node_names); 00258 00259 //! Initialize as master node using string of slave node hostnames 00260 /*! Master node initialization will contact all nodes specified and 00261 initialize them. Once this is done, everything will be ready to 00262 send() and receive() TCPmessages. 00263 @param node_names space-separated or comma-separated names of slave 00264 nodes as name:port, or a single absolute path (starting with '/') of 00265 a text file that contains the node names, one name per line. */ 00266 void masterInit(const char *node_names); 00267 00268 //! Initialize as a slave node 00269 /*! This method will block until a Beowulf master contacts us and 00270 initializes us. Once this is done, everything will be ready to 00271 send() and receive() TCPmessages. */ 00272 void slaveInit(); 00273 00274 // In case we receive a message of type BEO_INIT while processing 00275 // stuff, receive() will call this function with the message; this 00276 // will start reinitializing us completely. Then receive will also 00277 // call slaveInit() to finish the re-initialization: 00278 void slaveReInit(TCPmessage& rmsg); 00279 }; 00280 00281 #endif 00282 00283 // ###################################################################### 00284 /* So things look consistent in everyone's emacs... */ 00285 /* Local Variables: */ 00286 /* indent-tabs-mode: nil */ 00287 /* End: */