Beowulf.H

Go to the documentation of this file.
00001 /*!@file Beowulf/Beowulf.H Simple interfacing to a Beowulf cluster */
00002 
00003 // //////////////////////////////////////////////////////////////////// //
00004 // The iLab Neuromorphic Vision C++ Toolkit - Copyright (C) 2001 by the //
00005 // University of Southern California (USC) and the iLab at USC.         //
00006 // See http://iLab.usc.edu for information about this project.          //
00007 // //////////////////////////////////////////////////////////////////// //
00008 // Major portions of the iLab Neuromorphic Vision Toolkit are protected //
00009 // under the U.S. patent ``Computation of Intrinsic Perceptual Saliency //
00010 // in Visual Environments, and Applications'' by Christof Koch and      //
00011 // Laurent Itti, California Institute of Technology, 2001 (patent       //
00012 // pending; application number 09/912,225 filed July 23, 2001; see      //
00013 // http://pair.uspto.gov/cgi-bin/final/home.pl for current status).     //
00014 // //////////////////////////////////////////////////////////////////// //
00015 // This file is part of the iLab Neuromorphic Vision C++ Toolkit.       //
00016 //                                                                      //
00017 // The iLab Neuromorphic Vision C++ Toolkit is free software; you can   //
00018 // redistribute it and/or modify it under the terms of the GNU General  //
00019 // Public License as published by the Free Software Foundation; either  //
00020 // version 2 of the License, or (at your option) any later version.     //
00021 //                                                                      //
00022 // The iLab Neuromorphic Vision C++ Toolkit is distributed in the hope  //
00023 // that it will be useful, but WITHOUT ANY WARRANTY; without even the   //
00024 // implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR      //
00025 // PURPOSE.  See the GNU General Public License for more details.       //
00026 //                                                                      //
00027 // You should have received a copy of the GNU General Public License    //
00028 // along with the iLab Neuromorphic Vision C++ Toolkit; if not, write   //
00029 // to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,   //
00030 // Boston, MA 02111-1307 USA.                                           //
00031 // //////////////////////////////////////////////////////////////////// //
00032 //
00033 // Primary maintainer for this file: Laurent Itti <itti@usc.edu>
00034 // $HeadURL: svn://isvn.usc.edu/software/invt/trunk/saliency/src/Beowulf/Beowulf.H $
00035 // $Id: Beowulf.H 8160 2007-03-21 21:34:16Z rjpeters $
00036 //
00037 
00038 #ifndef BEOWULF_H_DEFINED
00039 #define BEOWULF_H_DEFINED
00040 
00041 #include "Beowulf/TCPcommunicator.H"
00042 #include "Beowulf/TCPdefs.H"
00043 #include "Beowulf/TCPmessage.H"
00044 #include "Component/ModelComponent.H"
00045 #include "Component/ModelParam.H"
00046 #include "Util/Timer.H"
00047 
00048 #include <deque>
00049 #include <pthread.h>
00050 #include <vector>
00051 
00052 // ##### 32-bit action codes:
00053 // The following values are reserved for the initialization of the Beowulf communications:
00054 #define BEO_NONE       0
00055 #define BEO_INIT      -1
00056 #define BEO_INIT2     -2
00057 #define BEO_INIT3     -3
00058 
00059 //! Simple interfacing to a Beowulf cluster
00060 /*! The idea of this class is to hide all of the low-level
00061   communication setup and transfer details from the user, and to
00062   provide a simple interface for passing messages between nodes on a
00063   Beowulf cluster. Each slave node should instantiate a Beowulf object
00064   and initialize it with slaveInit(). This will block the slave until
00065   it is contacted by the Beowulf master node. The master node
00066   instantiates a Beowulf object, and initializes it using
00067   masterInit(), passing along the hostnames of the slave nodes. During
00068   initialization, the Beowulf master contacts all the slaves and
00069   instructs them to fully interconnect with each other.  Once
00070   initialization is complete, any node can send() and receive()
00071   TCPmessages to and from any other node. send() is non-blocking,
00072   and so is receive() unless it is given a non-zero timeout. Actual
00073   queueing and transfer of messages is done in a thread that runs in
00074   parallel with the main program thread. */
00075 
00076 class Beowulf : public ModelComponent {
00077 public:
00078 
00079   // ######################################################################
00080   /*! @name Constructors and destructors */
00081   //@{
00082 
00083   //! Constructor
00084   /*! @param ismaster true if we are the master of the Beowulf. The
00085     master is the one that gets the list of slaves and then
00086     initializes all the slaves at start() time. Defaults to false. */
00087   Beowulf(OptionManager& mgr,
00088           const std::string& descrName = "Beowulf",
00089           const std::string& tagName = "Beowulf",
00090           const bool ismaster = false);
00091 
00092   //! Reset and kill all connections except possibly one (keepfd)
00093   /*! Resets the Beowulf to the uninitialized state. Kills all connections,
00094      except possibly one (typically towards the master) that may be
00095      specified as argument.
00096      @param keepfd the fd to keep (or -1, the default, to keep no fd) */
00097   void resetConnections(const int keepfd = -1);
00098 
00099   //! Destructor
00100   /*! Will properly terminate all connections. */
00101   virtual ~Beowulf();
00102   //@}
00103 
00104   // ######################################################################
00105   /*! @name Access functions */
00106   //@{
00107 
00108   //! Get the number of slave nodes
00109   int getNbSlaves() const;
00110 
00111   //! Get our node number (-1 is the master)
00112   int getNodeNumber() const;
00113 
00114   //! Get hostname:port of node with given node number
00115   /*! This is whatever the user gave at configuration, so it could be
00116     just a short hostname, a fully-qualified hostname, or a
00117     hostname:port. If nb is -1, we return "BeoMaster". */
00118   const char* nodeName(const int nb) const;
00119 
00120   //! Request a so-far unallocated node
00121   /*! This will return the next node number that has not yet been
00122     requested. This only works if we are Beowulf master. It will
00123     generate an error message and return -2 if we have no more
00124     unallocated nodes. We need to be start()'ed for this to work. */
00125   int requestNode();
00126 
00127   //! De-allocate a currently allocated node (given by its node number)
00128   void releaseNode(int nodenum);
00129 
00130   //@}
00131 
00132   // ######################################################################
00133   /*! @name Message passing functions */
00134   //@{
00135 
00136   //! Send message to another node
00137   /*! This method is non-blocking (returns immediately). A copy of msg
00138     is taken, so you can destroy it immediately after send.
00139     @param node_nb is the destination node number. A value of -1 on a
00140     slave Beowulf means that msg should be sent to the Beowulf master.  */
00141   void send(const int node_nb, TCPmessage& msg);
00142 
00143   //! Send message to the least-loaded of our slave nodes
00144   /*! This method is non-blocking (returns immediately). A copy of msg
00145     is taken, so you can destroy it immediately after send.  Only
00146     works if we are the Beowulf master, fatal error otherwise. This
00147     implements load balancing. The ETI (estimated time to idle) fields
00148     in TCPmessage are used to determine which of our slave nodes has
00149     the shortest pending work queue (i.e., shortest ETI) and the
00150     message will be sent to that node. Thus, this functionality
00151     assumes that every slave node can process every message that you
00152     might send them (as opposed to more constrained architectures
00153     where a given node is only capable of doing a given type of
00154     processing corresponding to a given type of received message).
00155     For this load balancing to work, the slaves should try to put
00156     good-faith estimates of their time to idle (in seconds) each time
00157     they send us (the master) a message back. The master relies on
00158     those good-faith estimates to decide which node is the least
00159     loaded. This approach has severe limitations if your overall
00160     message traffic is low, as your ETI estimates at the master will
00161     not be refreshed regularly and may become grossly
00162     inaccurate. Thus, this approach is mostly intended for streaming
00163     applications, where every node will usually send several messages
00164     back to the master every 30ms or so, so that the ETI estimates
00165     collected at the master will be reasonably fresh and accurate. If
00166     several slave nodes have the lowest ETI, one will be picked at
00167     random. */
00168   void send(TCPmessage& msg);
00169 
00170   //! Receive message from a given node (or from any node)
00171   /*! Check whether a message has been received; returns true if so,
00172     false otherwise.  If a message was received, the node it came from will
00173     be in node_nb, and its frame and action fields will be pre-decoded
00174     for convenience (they still are in the message itself too). With
00175     the default timeout of 0, this method is non-blocking, i.e., it
00176     returns immediately and does not wait for messages to come in.
00177     @param node_nb node number to receive from, or -1 to receive from
00178     any node.  If a message is received, node_nb will be updated to
00179     the node number from which the message was received, or -1 if it
00180     was received from the master.
00181     @param msg received message
00182     @param frame frame number from received message
00183     @param action action field from received message
00184     @param timeout if non-zero, max time (in ms) this call may block
00185     @param err if non-null, then (*err) will be set to non-zero if an error occurs
00186   */
00187   bool receive(int& node_nb, TCPmessage& msg, int32& frame,
00188                int32& action, const int timeout = 0,
00189                int* err = 0);
00190 
00191   //! Do we have any received messages?
00192   /*! Returns the total number of messages in the incoming queues of
00193     our various connected nodes. If node_nb == -1, only consider
00194     messages from the Beowulf master (note that this differs from receive(), where -1 means any node).
00195     If node_nb == -2 (the default), consider any node, otherwise only consider the specified node. */
00196   int nbReceived(const int node_nb = -2);
00197 
00198   //@}
00199 
00200 protected:
00201   //! names of our slaves as a space-separated list of hostname:port
00202   /*! port is optional and we will use the default SockServ port if
00203     unspecified. This parameter is only used if we are Beowulf master
00204     (see constructor) */
00205   OModelParam<std::string> itsSlaveNames;
00206 
00207   OModelParam<bool> isMaster;       //!< true if we are the master
00208   OModelParam<int> selfqlen;        //!< self-message queue length
00209   OModelParam<bool> selfdroplast;   //!< self-message queue drop policy
00210   OModelParam<double> initTimeout;  //!< max time to wait for initialization
00211 
00212   //! Intercept people changing our ModelParam
00213   /*! See ModelComponent.H; as parsing the command-line or reading a
00214     config file sets our name, we'll also here instantiate a
00215     controller of the proper type (and export its options) */
00216   virtual void paramChanged(ModelParamBase* const param,
00217                             const bool valueChanged,
00218                             ParamClient::ChangeStatus* status);
00219 
00220 private:
00221   nub::soft_ref<TCPcommunicator> com; // Handles all communications
00222 
00223   // Per-node bookkeeping; one entry per node is stored in itsNodes
00224   struct NodeInfo
00225   {
00226     NodeInfo() : fd(-1), name(), ETI(-1.0f), ETIreceived(-1.0f),
00227                  isAvailable(true) {}
00228 
00229     int fd; // Translate node number into fd (socket); initially -1
00230     std::string name; // Hostname of the slave node; initially empty
00231     float ETI; // ETIs (in seconds) as sent to us by our slaves; initially -1
00232     float ETIreceived; // time at which an ETI was last received; initially -1
00233     bool isAvailable; // true if this node is not currently in use; initially true
00234   };
00235 
00236   bool initialized;    // True if all communications ok
00237   std::vector<NodeInfo> itsNodes; // Table of per-node info
00238   int *fd2node;        // Table to translate fd into node number
00239   int master;          // fd of my master if I am a slave
00240   int me;              // My node number if I am a slave
00241 
00242   Timer tim;           // to record message arrival times
00243   std::deque<TCPmessage> selfmsg; // messages to myself
00244   pthread_mutex_t mutselfmsg;     // Mutex for access to self message queue
00245 
00246   // get started (after our TCPcommunicator has started)
00247   void start2();
00248 
00249   // get stopped (before our TCPcommunicator has stopped)
00250   void stop1();
00251 
00252   //! Initialize as master node, using array of slave node hostnames
00253   /*! Master node initialization will contact all nodes specified and
00254      initialize them. Once this is done, everything will be ready to
00255      send() and receive() TCPmessages.
00256      @param nb_nodes number of slave nodes
00257      @param node_names hostnames of the slave nodes (format name:port) */
00258   void masterInit(const int nb_nodes, char **node_names);
00259 
00260   //! Initialize as master node using string of slave node hostnames
00261   /*! Master node initialization will contact all nodes specified and
00262      initialize them. Once this is done, everything will be ready to
00263      send() and receive() TCPmessages.
00264      @param node_names space-separated or comma-separated names of slave
00265      nodes as name:port, or a single absolute path (starting with '/') of
00266      a text file that contains the node names, one name per line. */
00267   void masterInit(const char *node_names);
00268 
00269   //! Initialize as a slave node
00270   /*! This method will block until a Beowulf master contacts us and
00271     initializes us. Once this is done, everything will be ready to
00272     send() and receive() TCPmessages. */
00273   void slaveInit();
00274 
00275   // In case we receive a message of type BEO_INIT while processing
00276   // stuff, receive() will call this function with the message (rmsg); this
00277   // will start reinitializing us completely. Then receive() will also
00278   // call slaveInit() to finish the re-initialization:
00279   void slaveReInit(TCPmessage& rmsg);
00280 };
00280 
00281 #endif
00282 
00283 // ######################################################################
00284 /* So things look consistent in everyone's emacs... */
00285 /* Local Variables: */
00286 /* indent-tabs-mode: nil */
00287 /* End: */
Generated on Sun May 8 08:40:20 2011 for iLab Neuromorphic Vision Toolkit by  doxygen 1.6.3