SpeechSynth.C

Go to the documentation of this file.
00001 /*!@file Devices/SpeechSynth.C Interfaces to festival speech synth */
00002 
00003 // //////////////////////////////////////////////////////////////////// //
00004 // The iLab Neuromorphic Vision C++ Toolkit - Copyright (C) 2001 by the //
00005 // University of Southern California (USC) and the iLab at USC.         //
00006 // See http://iLab.usc.edu for information about this project.          //
00007 // //////////////////////////////////////////////////////////////////// //
00008 // Major portions of the iLab Neuromorphic Vision Toolkit are protected //
00009 // under the U.S. patent ``Computation of Intrinsic Perceptual Saliency //
00010 // in Visual Environments, and Applications'' by Christof Koch and      //
00011 // Laurent Itti, California Institute of Technology, 2001 (patent       //
00012 // pending; application number 09/912,225 filed July 23, 2001; see      //
00013 // http://pair.uspto.gov/cgi-bin/final/home.pl for current status).     //
00014 // //////////////////////////////////////////////////////////////////// //
00015 // This file is part of the iLab Neuromorphic Vision C++ Toolkit.       //
00016 //                                                                      //
00017 // The iLab Neuromorphic Vision C++ Toolkit is free software; you can   //
00018 // redistribute it and/or modify it under the terms of the GNU General  //
00019 // Public License as published by the Free Software Foundation; either  //
00020 // version 2 of the License, or (at your option) any later version.     //
00021 //                                                                      //
00022 // The iLab Neuromorphic Vision C++ Toolkit is distributed in the hope  //
00023 // that it will be useful, but WITHOUT ANY WARRANTY; without even the   //
00024 // implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR      //
00025 // PURPOSE.  See the GNU General Public License for more details.       //
00026 //                                                                      //
00027 // You should have received a copy of the GNU General Public License    //
00028 // along with the iLab Neuromorphic Vision C++ Toolkit; if not, write   //
00029 // to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,   //
00030 // Boston, MA 02111-1307 USA.                                           //
00031 // //////////////////////////////////////////////////////////////////// //
00032 //
00033 // Primary maintainer for this file: Lior Elazary <elazary@usc.edu>
00034 // $HeadURL: svn://isvn.usc.edu/software/invt/trunk/saliency/src/Devices/SpeechSynth.C $
00035 // $Id: SpeechSynth.C 9760 2008-05-11 22:40:13Z rjpeters $
00036 //
00037 
00038 #include "Devices/SpeechSynth.H"
00039 
00040 #include "Component/ModelOptionDef.H"
00041 #include "Devices/DeviceOpts.H"
00042 #include "Util/Assert.H"
00043 #include "Util/JobWithSemaphore.H"
00044 #include "Util/StringUtil.H"
00045 #include "Util/WorkThreadServer.H"
00046 #include "Util/sformat.H"
00047 #include "rutz/compat_snprintf.h"
00048 #include "rutz/stdiobuf.h"
00049 
00050 #include <fcntl.h>
00051 
00052 static const ModelOptionDef OPT_FestivalServerHost =
00053   { MODOPT_ARG_STRING, "FestivalServerHost", &MOC_AUDIO, OPTEXP_CORE,
00054     "IP address of festival server to use for speech synthesis",
00055     "festival-server-host", '\0', "<ipaddr>", "127.0.0.1" };
00056 
00057 static const ModelOptionDef OPT_FestivalServerPort =
00058   { MODOPT_ARG(uint), "FestivalServerPort", &MOC_AUDIO, OPTEXP_CORE,
00059     "Port number of festival server to use for speech synthesis",
00060     "festival-server-port", '\0', "<portnum>", "1314" };
00061 
00062 static const ModelOptionDef OPT_SpeechQueueSize =
00063   { MODOPT_ARG(size_t), "SpeechQueueSize", &MOC_AUDIO, OPTEXP_CORE,
00064     "Max queue size for speech utterances; low-priority utterances "
00065     "will be dropped to avoid exceeding this size",
00066     "speech-queue-size", '\0', "<size_t>", "1" };
00067 
00068 namespace
00069 {
00070   class SpeechUtteranceJob : public JobWithSemaphore
00071   {
00072   public:
00073     SpeechUtteranceJob(std::iostream& server,
00074                        const std::string& msg,
00075                        const int priority,
00076                        const int id,
00077                        time_t* timestamp)
00078       :
00079       itsServer(server),
00080       itsMsg(msg),
00081       itsPriority(priority),
00082       itsJobType(sformat("utterance[priority=%d]", priority)),
00083       itsId(id),
00084       itsTimestamp(timestamp)
00085     {}
00086 
00087     virtual ~SpeechUtteranceJob() {}
00088 
00089     virtual void run()
00090     {
00091       LINFO("          running #%d @ priority %d: %s",
00092             itsId, itsPriority, itsMsg.c_str());
00093 
00094       const std::string msg_with_newline = itsMsg + '\n';
00095 
00096       itsServer << msg_with_newline << std::flush;
00097 
00098       std::string code;
00099       std::getline(itsServer, code);
00100 
00101       LDEBUG("festival ack = '%s'", code.c_str());
00102 
00103       if (code == "LP" || code == "WV")
00104         {
00105           std::string data, end;
00106           std::getline(itsServer, data);
00107           std::getline(itsServer, end);
00108           LDEBUG("festival data = '%s'", data.c_str());
00109           LDEBUG("festival end = '%s'", end.c_str());
00110           if (end.size() < 2 || end.substr(end.size() - 2).compare("OK") != 0)
00111             LERROR("festival return message didn't end with 'OK'");
00112         }
00113       else if (code == "ER")
00114         {
00115           LERROR("festival returned error code 'ER'");
00116         }
00117 
00118       if (itsTimestamp)
00119         time(itsTimestamp);
00120 
00121       this->markFinished();
00122     }
00123 
00124     virtual const char* jobType() const
00125     { return itsJobType.c_str(); }
00126 
00127     virtual int priority() const
00128     { return itsPriority; }
00129 
00130     std::iostream& itsServer;
00131     const std::string itsMsg;
00132     const int itsPriority;
00133     const std::string itsJobType;
00134     const int itsId;
00135     time_t* itsTimestamp;
00136   };
00137 }
00138 
00139 // ######################################################################
00140 SpeechSynth::SpeechSynth(OptionManager& mgr, const std::string& descrName,
00141                          const std::string& tagName)
00142   :
00143   ModelComponent(mgr, descrName, tagName),
00144   itsServerHost(&OPT_FestivalServerHost, this),
00145   itsServerPort(&OPT_FestivalServerPort, this),
00146   itsQueueSize(&OPT_SpeechQueueSize, this, ALLOW_ONLINE_CHANGES),
00147   itsThreadServer(),
00148   itsServerFD(-1),
00149   itsServerStream(0),
00150   itsJobCounter(0)
00151 {}
00152 
00153 // ######################################################################
00154 void SpeechSynth::start2()
00155 {
00156    festivalConnect();
00157 }
00158 
00159 // ######################################################################
00160 SpeechSynth::~SpeechSynth()
00161 {
00162    festivalClose();
00163 }
00164 
00165 // ######################################################################
00166 void SpeechSynth::festivalConnect()
00167 {
00168   itsThreadServer.reset(new WorkThreadServer("SpeechSynth", 1));
00169   itsThreadServer->setDropPolicy(WorkThreadServer::DROP_OLDEST_LOWEST_PRIORITY);
00170   itsThreadServer->setMaxQueueSize(itsQueueSize.getVal());
00171   itsThreadServer->setFlushBeforeStopping(false);
00172 
00173   /* Return an FD to a remote server */
00174   struct sockaddr_in serv_addr;
00175   struct hostent *serverhost;
00176   int fd;
00177 
00178   fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
00179 
00180   if (fd < 0)
00181   {
00182     LFATAL("can't get socket\n");
00183   }
00184   memset(&serv_addr, 0, sizeof(serv_addr));
00185   if ((int)(serv_addr.sin_addr.s_addr = inet_addr(itsServerHost.getVal().c_str())) == -1)
00186   {
00187     /* its a name rather than an ipnum */
00188     serverhost = gethostbyname(itsServerHost.getVal().c_str());
00189     if (serverhost == (struct hostent *)0)
00190     {
00191       LFATAL("festival_client: gethostbyname failed\n");
00192     }
00193     memmove(&serv_addr.sin_addr,serverhost->h_addr, serverhost->h_length);
00194   }
00195   serv_addr.sin_family = AF_INET;
00196   serv_addr.sin_port = htons(itsServerPort.getVal());
00197 
00198   if (connect(fd, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) != 0)
00199   {
00200     LINFO("Connect to server failed: Insure that festival is running in server mode\n");
00201     LINFO("Starting in no sound mode\n");
00202     itsServerFD = -1;
00203     delete itsServerStream;
00204     itsServerStream = 0;
00205   } else {
00206     itsServerFD = fd;
00207     itsServerStream = new rutz::stdiostream(itsServerFD,
00208                                             std::ios::in | std::ios::out,
00209                                             true);
00210   }
00211 }
00212 
00213 // ######################################################################
00214 void SpeechSynth::festivalClose()
00215 {
00216   itsThreadServer.reset(0);
00217 
00218   if (itsServerFD != -1)
00219     {
00220       close(itsServerFD);
00221       itsServerFD = -1;
00222     }
00223 
00224   if (itsServerStream != 0)
00225     {
00226       delete itsServerStream;
00227       itsServerStream = 0;
00228     }
00229 }
00230 
00231 // ######################################################################
00232 bool SpeechSynth::sayText(const std::string& text, int priority,
00233                           bool block)
00234 {
00235   std::string cmd = sformat("(SayText \"%s\")", text.c_str());
00236   return sendCommand(cmd, priority, block);
00237 }
00238 
00239 // ######################################################################
00240 bool SpeechSynth::sendCommand(const std::string& text, int priority,
00241                               bool block, time_t* timestamp)
00242 {
00243   const int id = ++itsJobCounter;
00244   LINFO("enqueuing #%d @ priority %d: %s", id, priority, text.c_str());
00245 
00246   if (itsServerStream != 0 && itsThreadServer.get() != 0)
00247     {
00248       rutz::shared_ptr<SpeechUtteranceJob> j
00249         (new SpeechUtteranceJob(*itsServerStream, text, priority, id,
00250                                 timestamp));
00251 
00252       itsThreadServer->enqueueJob(j);
00253 
00254       if (j->wasDropped())
00255         return false;
00256 
00257       if (block)
00258         j->wait();
00259 
00260       return true;
00261     }
00262 
00263   return false;
00264 }
00265 
00266 // ######################################################################
00267 void SpeechSynth::flushQueue()
00268 {
00269   if (itsThreadServer.get() != 0)
00270     itsThreadServer->flushQueue();
00271 }
00272 
00273 // ######################################################################
00274 bool SpeechSynth::playWavFile(const std::string& fname, int priority, bool block,
00275                               int mindelay)
00276 {
00277   static int nextid = 0;
00278 
00279   std::map<std::string, WavFileInfo>::iterator itr = itsWavFiles.find(fname);
00280 
00281   if (itr == itsWavFiles.end())
00282     {
00283       WavFileInfo info;
00284 
00285       info.fname = fname;
00286       info.token = sformat("SpeechSynthWavToken%d", nextid++);
00287       if (!this->sendCommand(sformat("(if (probe_file \"%s\") (set! %s (wave.load \"%s\")) (set! %s nil))",
00288                                      fname.c_str(),
00289                                      info.token.c_str(),
00290                                      fname.c_str(),
00291                                      info.token.c_str()),
00292                              -10, false))
00293         return false;
00294 
00295       itr = itsWavFiles.insert(std::make_pair(fname, info)).first;
00296     }
00297 
00298   const time_t now = time(NULL);
00299 
00300   WavFileInfo& info = (*itr).second;
00301 
00302   if (now - info.lasttime >= mindelay)
00303     {
00304       info.lasttime = now;
00305 
00306       return this->sendCommand(sformat("(if %s (wave.play %s))",
00307                                        info.token.c_str(),
00308                                        info.token.c_str()),
00309                                priority, block, &info.lasttime);
00310     }
00311   else
00312     LINFO("not speaking '%s' because the delay is %ld secs, less than the minimum %d secs",
00313           info.fname.c_str(), now - info.lasttime, mindelay);
00314 
00315   // else...
00316   return false;
00317 }
00318 
00319 // ######################################################################
00320 void SpeechSynth::paramChanged(ModelParamBase* const param,
00321                                const bool valueChanged,
00322                                ParamClient::ChangeStatus* status)
00323 {
00324   if (param == &itsQueueSize && valueChanged)
00325     {
00326       if (itsQueueSize.getVal() == 0)
00327         *status = ParamClient::CHANGE_REJECTED;
00328       else if (itsThreadServer.get() != 0)
00329         itsThreadServer->setMaxQueueSize(itsQueueSize.getVal());
00330     }
00331 }
00332 
00333 // ######################################################################
00334 /* So things look consistent in everyone's emacs... */
00335 /* Local Variables: */
00336 /* indent-tabs-mode: nil */
00337 /* End: */