StringUtil.C

Go to the documentation of this file.
00001 /*!@file Util/StringUtil.C */
00002 
00003 // //////////////////////////////////////////////////////////////////// //
00004 // The iLab Neuromorphic Vision C++ Toolkit - Copyright (C) 2000-2005   //
00005 // by the University of Southern California (USC) and the iLab at USC.  //
00006 // See http://iLab.usc.edu for information about this project.          //
00007 // //////////////////////////////////////////////////////////////////// //
00008 // Major portions of the iLab Neuromorphic Vision Toolkit are protected //
00009 // under the U.S. patent ``Computation of Intrinsic Perceptual Saliency //
00010 // in Visual Environments, and Applications'' by Christof Koch and      //
00011 // Laurent Itti, California Institute of Technology, 2001 (patent       //
00012 // pending; application number 09/912,225 filed July 23, 2001; see      //
00013 // http://pair.uspto.gov/cgi-bin/final/home.pl for current status).     //
00014 // //////////////////////////////////////////////////////////////////// //
00015 // This file is part of the iLab Neuromorphic Vision C++ Toolkit.       //
00016 //                                                                      //
00017 // The iLab Neuromorphic Vision C++ Toolkit is free software; you can   //
00018 // redistribute it and/or modify it under the terms of the GNU General  //
00019 // Public License as published by the Free Software Foundation; either  //
00020 // version 2 of the License, or (at your option) any later version.     //
00021 //                                                                      //
00022 // The iLab Neuromorphic Vision C++ Toolkit is distributed in the hope  //
00023 // that it will be useful, but WITHOUT ANY WARRANTY; without even the   //
00024 // implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR      //
00025 // PURPOSE.  See the GNU General Public License for more details.       //
00026 //                                                                      //
00027 // You should have received a copy of the GNU General Public License    //
00028 // along with the iLab Neuromorphic Vision C++ Toolkit; if not, write   //
00029 // to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,   //
00030 // Boston, MA 02111-1307 USA.                                           //
00031 // //////////////////////////////////////////////////////////////////// //
00032 //
00033 // Primary maintainer for this file: Rob Peters <rjpeters at usc dot edu>
00034 // $HeadURL: svn://isvn.usc.edu/software/invt/trunk/saliency/src/Util/StringUtil.C $
00035 // $Id: StringUtil.C 14610 2011-03-15 17:53:47Z rand $
00036 //
00037 
00038 #ifndef UTIL_STRINGUTIL_C_DEFINED
00039 #define UTIL_STRINGUTIL_C_DEFINED
00040 
00041 #include "Util/StringUtil.H"
00042 
00043 #include "Util/Assert.H"
00044 
00045 #include <algorithm>
00046 #include <iterator>
00047 #include <vector>
00048 
00049 // ######################################################################
00050 std::string stdLineWrap(const std::string& in, size_t linelength,
00051                         const std::string& pfx, const std::string& sfx)
00052 {
00053   std::vector<std::string> words;
00054 
00055   split(in, " \t\n", std::back_inserter(words));
00056 
00057   return lineWrap(words.begin(), words.end(), linelength, pfx, sfx);
00058 }
00059 
00060 // ######################################################################
00061 unsigned int levenshteinDistance(const std::string& s,
00062                                  const std::string& t)
00063 {
00064   const size_t m = s.length();
00065   const size_t n = t.length();
00066 
00067   std::vector<int> d((m+1) * (n+1)); // m+1 columns, n+1 rows
00068 
00069 #define D_INDEX(i, j) ((i) + ((j)*(m+1)))
00070 
00071   for (size_t i = 0; i <= m; ++i)
00072     d[D_INDEX(i,0)] = i;
00073 
00074   for (size_t j = 0; j <= n; ++j)
00075     d[D_INDEX(0,j)] = j;
00076 
00077   for (size_t i = 1; i <= m; ++i)
00078     for (size_t j = 1; j <= n; ++j)
00079       {
00080         const int cost = s[i-1] == t[j-1] ? 0 : 1;
00081 
00082         d[D_INDEX(i,j)] =
00083           std::min(std::min(d[D_INDEX(i-1,j)] + 1 /* deletion */,
00084                             d[D_INDEX(i,j-1)] + 1 /* insertion */),
00085                    d[D_INDEX(i-1,j-1)] + cost /* substitution */);
00086       }
00087 
00088   ASSERT(d[D_INDEX(m,n)] >= 0);
00089 
00090   return (unsigned int)(d[D_INDEX(m,n)]);
00091 
00092 #undef D_INDEX
00093 }
00094 
00095 // ######################################################################
00096 unsigned int damerauLevenshteinDistance(const std::string& s,
00097                                         const std::string& t)
00098 {
00099   const size_t m = s.length();
00100   const size_t n = t.length();
00101 
00102   std::vector<int> d((m+1) * (n+1)); // m+1 columns, n+1 rows
00103 
00104 #define D_INDEX(i, j) ((i) + ((j)*(m+1)))
00105 
00106   for (size_t i = 0; i <= m; ++i)
00107     d[D_INDEX(i,0)] = i;
00108 
00109   for (size_t j = 0; j <= n; ++j)
00110     d[D_INDEX(0,j)] = j;
00111 
00112   for (size_t i = 1; i <= m; ++i)
00113     for (size_t j = 1; j <= n; ++j)
00114       {
00115         const int cost = s[i-1] == t[j-1] ? 0 : 1;
00116 
00117         d[D_INDEX(i,j)] =
00118           std::min(std::min(d[D_INDEX(i-1,j)] + 1 /* deletion */,
00119                             d[D_INDEX(i,j-1)] + 1 /* insertion */),
00120                    d[D_INDEX(i-1,j-1)] + cost /* substitution */);
00121 
00122         if (i > 1 && j > 1 && s[i-1] == t[j-2] && s[i-2] == t[j-1])
00123           d[D_INDEX(i,j)] =
00124             std::min(d[D_INDEX(i,j)],
00125                      d[D_INDEX(i-2,j-2)] + cost /* transposition */);
00126       }
00127 
00128   ASSERT(d[D_INDEX(m,n)] >= 0);
00129 
00130   return (unsigned int)(d[D_INDEX(m,n)]);
00131 
00132 #undef D_INDEX
00133 }
00134 
00135 // ######################################################################
00136 std::string camelCaseToSpaces(const std::string& s,
00137                               const std::set<std::string>* acronyms)
00138 {
00139   std::string r;
00140 
00141   std::string upword;
00142 
00143   for (size_t i = 0; i < s.length(); ++i)
00144     {
00145       const bool is_last_char = (i + 1 == s.length());
00146 
00147       bool wasupper = false;
00148       if (isupper(s[i])
00149           ||
00150           // digits stick to preceding uppercase strings:
00151           (upword.length() > 0 && isdigit(s[i])))
00152         {
00153           upword += s[i];
00154           wasupper = true;
00155         }
00156 
00157       // do we need to inject our collected uppercase word into the
00158       // result?
00159       if (!wasupper || is_last_char)
00160         {
00161           if (upword.length() > 0)
00162             {
00163               // Is our full uppercase word a known acronym?
00164               if (acronyms != 0
00165                   && acronyms->find(upword) != acronyms->end())
00166                 {
00167                   if (r.length() > 0)
00168                     r += ' ';
00169 
00170                   for (size_t j = 0; j < upword.size(); ++j)
00171                     r += tolower(upword[j]);
00172 
00173                   if (!is_last_char)
00174                     r += ' ';
00175                 }
00176               // Otherwise are the first n-1 uppercase chars a known
00177               // acronym (in which case the last char would be the
00178               // first char of the next word)?
00179               else if (upword.length() >= 2
00180                        && acronyms != 0
00181                        &&
00182                        (acronyms->find(upword.substr(0, upword.length()-1))
00183                         != acronyms->end()))
00184                 {
00185                   if (r.length() > 0)
00186                     r += ' ';
00187 
00188                   for (size_t j = 0; j < upword.size() - 1; ++j)
00189                     r += tolower(upword[j]);
00190 
00191                   r += ' ';
00192                   r += tolower(upword[upword.size() - 1]);
00193                 }
00194               // Otherwise we treat each uppercase char as its own
00195               // word:
00196               else
00197                 {
00198                   for (size_t j = 0; j < upword.size(); ++j)
00199                     {
00200                       if (r.length() > 0)
00201                         r += ' ';
00202                       r += tolower(upword[j]);
00203                     }
00204                 }
00205 
00206               upword = "";
00207             }
00208         }
00209 
00210       if (!wasupper)
00211         {
00212           r += s[i];
00213         }
00214     }
00215 
00216   return r;
00217 }
00218 
00219 
00220 // ######################################################################
00221 std::string trim(std::string const& str)
00222 {
00223         size_t f = str.find_first_not_of(" ");
00224         size_t l = str.find_last_not_of(" ");
00225         return str.substr(f, l-f+1);
00226 }
00227 
00228 // ######################################################################
00229 /* So things look consistent in everyone's emacs... */
00230 /* Local Variables: */
00231 /* mode: c++ */
00232 /* indent-tabs-mode: nil */
00233 /* End: */
00234 
00235 #endif // UTIL_STRINGUTIL_C_DEFINED