/*!@file Devices/SpeechSynth.C Interfaces to festival speech synth */

// //////////////////////////////////////////////////////////////////// //
// The iLab Neuromorphic Vision C++ Toolkit - Copyright (C) 2001 by the //
// University of Southern California (USC) and the iLab at USC.         //
// See http://iLab.usc.edu for information about this project.          //
// //////////////////////////////////////////////////////////////////// //
// Major portions of the iLab Neuromorphic Vision Toolkit are protected //
// under the U.S. patent ``Computation of Intrinsic Perceptual Saliency //
// in Visual Environments, and Applications'' by Christof Koch and      //
// Laurent Itti, California Institute of Technology, 2001 (patent       //
// pending; application number 09/912,225 filed July 23, 2001; see      //
// http://pair.uspto.gov/cgi-bin/final/home.pl for current status).     //
// //////////////////////////////////////////////////////////////////// //
// This file is part of the iLab Neuromorphic Vision C++ Toolkit.       //
//                                                                      //
// The iLab Neuromorphic Vision C++ Toolkit is free software; you can   //
// redistribute it and/or modify it under the terms of the GNU General  //
// Public License as published by the Free Software Foundation; either  //
// version 2 of the License, or (at your option) any later version.     //
//                                                                      //
// The iLab Neuromorphic Vision C++ Toolkit is distributed in the hope  //
// that it will be useful, but WITHOUT ANY WARRANTY; without even the   //
// implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR      //
// PURPOSE.  See the GNU General Public License for more details.       //
// 00026 // // 00027 // You should have received a copy of the GNU General Public License // 00028 // along with the iLab Neuromorphic Vision C++ Toolkit; if not, write // 00029 // to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, // 00030 // Boston, MA 02111-1307 USA. // 00031 // //////////////////////////////////////////////////////////////////// // 00032 // 00033 // Primary maintainer for this file: Lior Elazary <elazary@usc.edu> 00034 // $HeadURL: svn://isvn.usc.edu/software/invt/trunk/saliency/src/Devices/SpeechSynth.C $ 00035 // $Id: SpeechSynth.C 9760 2008-05-11 22:40:13Z rjpeters $ 00036 // 00037 00038 #include "Devices/SpeechSynth.H" 00039 00040 #include "Component/ModelOptionDef.H" 00041 #include "Devices/DeviceOpts.H" 00042 #include "Util/Assert.H" 00043 #include "Util/JobWithSemaphore.H" 00044 #include "Util/StringUtil.H" 00045 #include "Util/WorkThreadServer.H" 00046 #include "Util/sformat.H" 00047 #include "rutz/compat_snprintf.h" 00048 #include "rutz/stdiobuf.h" 00049 00050 #include <fcntl.h> 00051 00052 static const ModelOptionDef OPT_FestivalServerHost = 00053 { MODOPT_ARG_STRING, "FestivalServerHost", &MOC_AUDIO, OPTEXP_CORE, 00054 "IP address of festival server to use for speech synthesis", 00055 "festival-server-host", '\0', "<ipaddr>", "127.0.0.1" }; 00056 00057 static const ModelOptionDef OPT_FestivalServerPort = 00058 { MODOPT_ARG(uint), "FestivalServerPort", &MOC_AUDIO, OPTEXP_CORE, 00059 "Port number of festival server to use for speech synthesis", 00060 "festival-server-port", '\0', "<portnum>", "1314" }; 00061 00062 static const ModelOptionDef OPT_SpeechQueueSize = 00063 { MODOPT_ARG(size_t), "SpeechQueueSize", &MOC_AUDIO, OPTEXP_CORE, 00064 "Max queue size for speech utterances; low-priority utterances " 00065 "will be dropped to avoid exceeding this size", 00066 "speech-queue-size", '\0', "<size_t>", "1" }; 00067 00068 namespace 00069 { 00070 class SpeechUtteranceJob : public JobWithSemaphore 00071 { 00072 public: 00073 
SpeechUtteranceJob(std::iostream& server, 00074 const std::string& msg, 00075 const int priority, 00076 const int id, 00077 time_t* timestamp) 00078 : 00079 itsServer(server), 00080 itsMsg(msg), 00081 itsPriority(priority), 00082 itsJobType(sformat("utterance[priority=%d]", priority)), 00083 itsId(id), 00084 itsTimestamp(timestamp) 00085 {} 00086 00087 virtual ~SpeechUtteranceJob() {} 00088 00089 virtual void run() 00090 { 00091 LINFO(" running #%d @ priority %d: %s", 00092 itsId, itsPriority, itsMsg.c_str()); 00093 00094 const std::string msg_with_newline = itsMsg + '\n'; 00095 00096 itsServer << msg_with_newline << std::flush; 00097 00098 std::string code; 00099 std::getline(itsServer, code); 00100 00101 LDEBUG("festival ack = '%s'", code.c_str()); 00102 00103 if (code == "LP" || code == "WV") 00104 { 00105 std::string data, end; 00106 std::getline(itsServer, data); 00107 std::getline(itsServer, end); 00108 LDEBUG("festival data = '%s'", data.c_str()); 00109 LDEBUG("festival end = '%s'", end.c_str()); 00110 if (end.size() < 2 || end.substr(end.size() - 2).compare("OK") != 0) 00111 LERROR("festival return message didn't end with 'OK'"); 00112 } 00113 else if (code == "ER") 00114 { 00115 LERROR("festival returned error code 'ER'"); 00116 } 00117 00118 if (itsTimestamp) 00119 time(itsTimestamp); 00120 00121 this->markFinished(); 00122 } 00123 00124 virtual const char* jobType() const 00125 { return itsJobType.c_str(); } 00126 00127 virtual int priority() const 00128 { return itsPriority; } 00129 00130 std::iostream& itsServer; 00131 const std::string itsMsg; 00132 const int itsPriority; 00133 const std::string itsJobType; 00134 const int itsId; 00135 time_t* itsTimestamp; 00136 }; 00137 } 00138 00139 // ###################################################################### 00140 SpeechSynth::SpeechSynth(OptionManager& mgr, const std::string& descrName, 00141 const std::string& tagName) 00142 : 00143 ModelComponent(mgr, descrName, tagName), 00144 
itsServerHost(&OPT_FestivalServerHost, this), 00145 itsServerPort(&OPT_FestivalServerPort, this), 00146 itsQueueSize(&OPT_SpeechQueueSize, this, ALLOW_ONLINE_CHANGES), 00147 itsThreadServer(), 00148 itsServerFD(-1), 00149 itsServerStream(0), 00150 itsJobCounter(0) 00151 {} 00152 00153 // ###################################################################### 00154 void SpeechSynth::start2() 00155 { 00156 festivalConnect(); 00157 } 00158 00159 // ###################################################################### 00160 SpeechSynth::~SpeechSynth() 00161 { 00162 festivalClose(); 00163 } 00164 00165 // ###################################################################### 00166 void SpeechSynth::festivalConnect() 00167 { 00168 itsThreadServer.reset(new WorkThreadServer("SpeechSynth", 1)); 00169 itsThreadServer->setDropPolicy(WorkThreadServer::DROP_OLDEST_LOWEST_PRIORITY); 00170 itsThreadServer->setMaxQueueSize(itsQueueSize.getVal()); 00171 itsThreadServer->setFlushBeforeStopping(false); 00172 00173 /* Return an FD to a remote server */ 00174 struct sockaddr_in serv_addr; 00175 struct hostent *serverhost; 00176 int fd; 00177 00178 fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); 00179 00180 if (fd < 0) 00181 { 00182 LFATAL("can't get socket\n"); 00183 } 00184 memset(&serv_addr, 0, sizeof(serv_addr)); 00185 if ((int)(serv_addr.sin_addr.s_addr = inet_addr(itsServerHost.getVal().c_str())) == -1) 00186 { 00187 /* its a name rather than an ipnum */ 00188 serverhost = gethostbyname(itsServerHost.getVal().c_str()); 00189 if (serverhost == (struct hostent *)0) 00190 { 00191 LFATAL("festival_client: gethostbyname failed\n"); 00192 } 00193 memmove(&serv_addr.sin_addr,serverhost->h_addr, serverhost->h_length); 00194 } 00195 serv_addr.sin_family = AF_INET; 00196 serv_addr.sin_port = htons(itsServerPort.getVal()); 00197 00198 if (connect(fd, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) != 0) 00199 { 00200 LINFO("Connect to server failed: Insure that festival is running in 
server mode\n"); 00201 LINFO("Starting in no sound mode\n"); 00202 itsServerFD = -1; 00203 delete itsServerStream; 00204 itsServerStream = 0; 00205 } else { 00206 itsServerFD = fd; 00207 itsServerStream = new rutz::stdiostream(itsServerFD, 00208 std::ios::in | std::ios::out, 00209 true); 00210 } 00211 } 00212 00213 // ###################################################################### 00214 void SpeechSynth::festivalClose() 00215 { 00216 itsThreadServer.reset(0); 00217 00218 if (itsServerFD != -1) 00219 { 00220 close(itsServerFD); 00221 itsServerFD = -1; 00222 } 00223 00224 if (itsServerStream != 0) 00225 { 00226 delete itsServerStream; 00227 itsServerStream = 0; 00228 } 00229 } 00230 00231 // ###################################################################### 00232 bool SpeechSynth::sayText(const std::string& text, int priority, 00233 bool block) 00234 { 00235 std::string cmd = sformat("(SayText \"%s\")", text.c_str()); 00236 return sendCommand(cmd, priority, block); 00237 } 00238 00239 // ###################################################################### 00240 bool SpeechSynth::sendCommand(const std::string& text, int priority, 00241 bool block, time_t* timestamp) 00242 { 00243 const int id = ++itsJobCounter; 00244 LINFO("enqueuing #%d @ priority %d: %s", id, priority, text.c_str()); 00245 00246 if (itsServerStream != 0 && itsThreadServer.get() != 0) 00247 { 00248 rutz::shared_ptr<SpeechUtteranceJob> j 00249 (new SpeechUtteranceJob(*itsServerStream, text, priority, id, 00250 timestamp)); 00251 00252 itsThreadServer->enqueueJob(j); 00253 00254 if (j->wasDropped()) 00255 return false; 00256 00257 if (block) 00258 j->wait(); 00259 00260 return true; 00261 } 00262 00263 return false; 00264 } 00265 00266 // ###################################################################### 00267 void SpeechSynth::flushQueue() 00268 { 00269 if (itsThreadServer.get() != 0) 00270 itsThreadServer->flushQueue(); 00271 } 00272 00273 // 
###################################################################### 00274 bool SpeechSynth::playWavFile(const std::string& fname, int priority, bool block, 00275 int mindelay) 00276 { 00277 static int nextid = 0; 00278 00279 std::map<std::string, WavFileInfo>::iterator itr = itsWavFiles.find(fname); 00280 00281 if (itr == itsWavFiles.end()) 00282 { 00283 WavFileInfo info; 00284 00285 info.fname = fname; 00286 info.token = sformat("SpeechSynthWavToken%d", nextid++); 00287 if (!this->sendCommand(sformat("(if (probe_file \"%s\") (set! %s (wave.load \"%s\")) (set! %s nil))", 00288 fname.c_str(), 00289 info.token.c_str(), 00290 fname.c_str(), 00291 info.token.c_str()), 00292 -10, false)) 00293 return false; 00294 00295 itr = itsWavFiles.insert(std::make_pair(fname, info)).first; 00296 } 00297 00298 const time_t now = time(NULL); 00299 00300 WavFileInfo& info = (*itr).second; 00301 00302 if (now - info.lasttime >= mindelay) 00303 { 00304 info.lasttime = now; 00305 00306 return this->sendCommand(sformat("(if %s (wave.play %s))", 00307 info.token.c_str(), 00308 info.token.c_str()), 00309 priority, block, &info.lasttime); 00310 } 00311 else 00312 LINFO("not speaking '%s' because the delay is %ld secs, less than the minimum %d secs", 00313 info.fname.c_str(), now - info.lasttime, mindelay); 00314 00315 // else... 00316 return false; 00317 } 00318 00319 // ###################################################################### 00320 void SpeechSynth::paramChanged(ModelParamBase* const param, 00321 const bool valueChanged, 00322 ParamClient::ChangeStatus* status) 00323 { 00324 if (param == &itsQueueSize && valueChanged) 00325 { 00326 if (itsQueueSize.getVal() == 0) 00327 *status = ParamClient::CHANGE_REJECTED; 00328 else if (itsThreadServer.get() != 0) 00329 itsThreadServer->setMaxQueueSize(itsQueueSize.getVal()); 00330 } 00331 } 00332 00333 // ###################################################################### 00334 /* So things look consistent in everyone's emacs... 
*/ 00335 /* Local Variables: */ 00336 /* indent-tabs-mode: nil */ 00337 /* End: */