00001 /*!@file Util/StringUtil.C */ 00002 00003 // //////////////////////////////////////////////////////////////////// // 00004 // The iLab Neuromorphic Vision C++ Toolkit - Copyright (C) 2000-2005 // 00005 // by the University of Southern California (USC) and the iLab at USC. // 00006 // See http://iLab.usc.edu for information about this project. // 00007 // //////////////////////////////////////////////////////////////////// // 00008 // Major portions of the iLab Neuromorphic Vision Toolkit are protected // 00009 // under the U.S. patent ``Computation of Intrinsic Perceptual Saliency // 00010 // in Visual Environments, and Applications'' by Christof Koch and // 00011 // Laurent Itti, California Institute of Technology, 2001 (patent // 00012 // pending; application number 09/912,225 filed July 23, 2001; see // 00013 // http://pair.uspto.gov/cgi-bin/final/home.pl for current status). // 00014 // //////////////////////////////////////////////////////////////////// // 00015 // This file is part of the iLab Neuromorphic Vision C++ Toolkit. // 00016 // // 00017 // The iLab Neuromorphic Vision C++ Toolkit is free software; you can // 00018 // redistribute it and/or modify it under the terms of the GNU General // 00019 // Public License as published by the Free Software Foundation; either // 00020 // version 2 of the License, or (at your option) any later version. // 00021 // // 00022 // The iLab Neuromorphic Vision C++ Toolkit is distributed in the hope // 00023 // that it will be useful, but WITHOUT ANY WARRANTY; without even the // 00024 // implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR // 00025 // PURPOSE. See the GNU General Public License for more details. // 00026 // // 00027 // You should have received a copy of the GNU General Public License // 00028 // along with the iLab Neuromorphic Vision C++ Toolkit; if not, write // 00029 // to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, // 00030 // Boston, MA 02111-1307 USA. // 00031 // //////////////////////////////////////////////////////////////////// // 00032 // 00033 // Primary maintainer for this file: Rob Peters <rjpeters at usc dot edu> 00034 // $HeadURL: svn://isvn.usc.edu/software/invt/trunk/saliency/src/Util/StringUtil.C $ 00035 // $Id: StringUtil.C 14610 2011-03-15 17:53:47Z rand $ 00036 // 00037 00038 #ifndef UTIL_STRINGUTIL_C_DEFINED 00039 #define UTIL_STRINGUTIL_C_DEFINED 00040 00041 #include "Util/StringUtil.H" 00042 00043 #include "Util/Assert.H" 00044 00045 #include <algorithm> 00046 #include <iterator> 00047 #include <vector> 00048 00049 // ###################################################################### 00050 std::string stdLineWrap(const std::string& in, size_t linelength, 00051 const std::string& pfx, const std::string& sfx) 00052 { 00053 std::vector<std::string> words; 00054 00055 split(in, " \t\n", std::back_inserter(words)); 00056 00057 return lineWrap(words.begin(), words.end(), linelength, pfx, sfx); 00058 } 00059 00060 // ###################################################################### 00061 unsigned int levenshteinDistance(const std::string& s, 00062 const std::string& t) 00063 { 00064 const size_t m = s.length(); 00065 const size_t n = t.length(); 00066 00067 std::vector<int> d((m+1) * (n+1)); // m+1 columns, n+1 rows 00068 00069 #define D_INDEX(i, j) ((i) + ((j)*(m+1))) 00070 00071 for (size_t i = 0; i <= m; ++i) 00072 d[D_INDEX(i,0)] = i; 00073 00074 for (size_t j = 0; j <= n; ++j) 00075 d[D_INDEX(0,j)] = j; 00076 00077 for (size_t i = 1; i <= m; ++i) 00078 for (size_t j = 1; j <= n; ++j) 00079 { 00080 const int cost = s[i-1] == t[j-1] ? 0 : 1; 00081 00082 d[D_INDEX(i,j)] = 00083 std::min(std::min(d[D_INDEX(i-1,j)] + 1 /* deletion */, 00084 d[D_INDEX(i,j-1)] + 1 /* insertion */), 00085 d[D_INDEX(i-1,j-1)] + cost /* substitution */); 00086 } 00087 00088 ASSERT(d[D_INDEX(m,n)] >= 0); 00089 00090 return (unsigned int)(d[D_INDEX(m,n)]); 00091 00092 #undef D_INDEX 00093 } 00094 00095 // ###################################################################### 00096 unsigned int damerauLevenshteinDistance(const std::string& s, 00097 const std::string& t) 00098 { 00099 const size_t m = s.length(); 00100 const size_t n = t.length(); 00101 00102 std::vector<int> d((m+1) * (n+1)); // m+1 columns, n+1 rows 00103 00104 #define D_INDEX(i, j) ((i) + ((j)*(m+1))) 00105 00106 for (size_t i = 0; i <= m; ++i) 00107 d[D_INDEX(i,0)] = i; 00108 00109 for (size_t j = 0; j <= n; ++j) 00110 d[D_INDEX(0,j)] = j; 00111 00112 for (size_t i = 1; i <= m; ++i) 00113 for (size_t j = 1; j <= n; ++j) 00114 { 00115 const int cost = s[i-1] == t[j-1] ? 0 : 1; 00116 00117 d[D_INDEX(i,j)] = 00118 std::min(std::min(d[D_INDEX(i-1,j)] + 1 /* deletion */, 00119 d[D_INDEX(i,j-1)] + 1 /* insertion */), 00120 d[D_INDEX(i-1,j-1)] + cost /* substitution */); 00121 00122 if (i > 1 && j > 1 && s[i-1] == t[j-2] && s[i-2] == t[j-1]) 00123 d[D_INDEX(i,j)] = 00124 std::min(d[D_INDEX(i,j)], 00125 d[D_INDEX(i-2,j-2)] + cost /* transposition */); 00126 } 00127 00128 ASSERT(d[D_INDEX(m,n)] >= 0); 00129 00130 return (unsigned int)(d[D_INDEX(m,n)]); 00131 00132 #undef D_INDEX 00133 } 00134 00135 // ###################################################################### 00136 std::string camelCaseToSpaces(const std::string& s, 00137 const std::set<std::string>* acronyms) 00138 { 00139 std::string r; 00140 00141 std::string upword; 00142 00143 for (size_t i = 0; i < s.length(); ++i) 00144 { 00145 const bool is_last_char = (i + 1 == s.length()); 00146 00147 bool wasupper = false; 00148 if (isupper(s[i]) 00149 || 00150 // digits stick to preceding uppercase strings: 00151 (upword.length() > 0 && isdigit(s[i]))) 00152 { 00153 upword += s[i]; 00154 wasupper = true; 00155 } 00156 00157 // do we need to inject our collected uppercase word into the 00158 // result? 00159 if (!wasupper || is_last_char) 00160 { 00161 if (upword.length() > 0) 00162 { 00163 // Is our full uppercase word a known acronym? 00164 if (acronyms != 0 00165 && acronyms->find(upword) != acronyms->end()) 00166 { 00167 if (r.length() > 0) 00168 r += ' '; 00169 00170 for (size_t j = 0; j < upword.size(); ++j) 00171 r += tolower(upword[j]); 00172 00173 if (!is_last_char) 00174 r += ' '; 00175 } 00176 // Otherwise are the first n-1 uppercase chars a known 00177 // acronym (in which case the last char would be the 00178 // first char of the next word)? 00179 else if (upword.length() >= 2 00180 && acronyms != 0 00181 && 00182 (acronyms->find(upword.substr(0, upword.length()-1)) 00183 != acronyms->end())) 00184 { 00185 if (r.length() > 0) 00186 r += ' '; 00187 00188 for (size_t j = 0; j < upword.size() - 1; ++j) 00189 r += tolower(upword[j]); 00190 00191 r += ' '; 00192 r += tolower(upword[upword.size() - 1]); 00193 } 00194 // Otherwise we treat each uppercase char as its own 00195 // word: 00196 else 00197 { 00198 for (size_t j = 0; j < upword.size(); ++j) 00199 { 00200 if (r.length() > 0) 00201 r += ' '; 00202 r += tolower(upword[j]); 00203 } 00204 } 00205 00206 upword = ""; 00207 } 00208 } 00209 00210 if (!wasupper) 00211 { 00212 r += s[i]; 00213 } 00214 } 00215 00216 return r; 00217 } 00218 00219 00220 // ###################################################################### 00221 std::string trim(std::string const& str) 00222 { 00223 size_t f = str.find_first_not_of(" "); 00224 size_t l = str.find_last_not_of(" "); 00225 return str.substr(f, l-f+1); 00226 } 00227 00228 // ###################################################################### 00229 /* So things look consistent in everyone's emacs... */ 00230 /* Local Variables: */ 00231 /* mode: c++ */ 00232 /* indent-tabs-mode: nil */ 00233 /* End: */ 00234 00235 #endif // UTIL_STRINGUTIL_C_DEFINED