1 /* 2 * The contents of this file are subject to the Mozilla Public License 3 * Version 1.1 (the "License"); you may not use this file except in 4 * compliance with the License. You may obtain a copy of the License at 5 * http://www.mozilla.org/MPL/ 6 * 7 * Software distributed under the License is distributed on an "AS IS" 8 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the 9 * License for the specific language governing rights and limitations 10 * under the License. 11 * 12 * The Original Code was developed for an EU.EDGE internal project and 13 * is made available according to the terms of this license. 14 * 15 * The Initial Developer of the Original Code is Istvan T. Hernadvolgyi, 16 * EU.EDGE LLC. 17 * 18 * Portions created by EU.EDGE LLC are Copyright (C) EU.EDGE LLC. 19 * All Rights Reserved. 20 * 21 * Alternatively, the contents of this file may be used under the terms 22 * of the GNU General Public License (the "GPL"), in which case the 23 * provisions of GPL are applicable instead of those above. If you wish 24 * to allow use of your version of this file only under the terms of the 25 * GPL and not to allow others to use your version of this file under the 26 * License, indicate your decision by deleting the provisions above and 27 * replace them with the notice and other provisions required by the GPL. 28 * If you do not delete the provisions above, a recipient may use your 29 * version of this file under either the License or the GPL. 30 */ 31 32 // FILE COMPARISONS BY MD5 HASH VALUE - HEADER 33 // 34 35 #if !defined(_FILEI_H_) 36 #define _FILEI_H_ 37 38 39 // decide whether to prefer hashed containers 40 // first you need gcc 3.x or higher 41 // second __NOHASH should be undefined 42 // 43 #if defined(__GNUC__) && !defined(__NOHASH) 44 #if __GNUC__ >= 3 45 #define __UA_USEHASH 46 #endif 47 #endif 48 49 // default work buffer size 50 // 51 #if !defined(__UABUFFSIZE) 52 #define __UABUFFSIZE 32768 53 #endif 54 55 #include <string> 56 57 #if defined(__UA_USEHASH) 58 #include <ext/hash_set> 59 #include <ext/hash_map> 60 #endif 61 62 #include <set> 63 #include <map> 64 65 #include <vector> 66 67 #include <iostream> 68 #include <iomanip> 69 70 /** File info. 71 * 72 * Contains the path name and the corresponding md5 hash. 73 * All calculations are performed during construction. Once 74 * constructed the object is "const"; there are only accessors. 75 * 76 * The calculation of MD5 requires a char buffer. By default 77 * an internal buffer is used (filei::_buffer which is private). 78 * For concurrent calculations, you can assign filei::_gbuff, filei::_buffc and 79 * filei::_relbuff to get a buffer, get its capacity and to release 80 * the buffer. Eg. you could set 81 * <pre> 82 * filei::_gbuff = &::malloc; 83 * filei::_relbuff = &::free; 84 * filei::_buffc = 0; 85 * </pre> 86 * If _buffc is 0, then it assumes that all requested memory is returned. 87 * The code checks for _gbuff returning 0 (and catches exceptions) so it is 88 * compatible with malloc and easy to wrap into an allocator as well. 89 */ 90 class filei { 91 92 private: 93 // a shared buffer 94 // used for internal calculations, but can be overridden 95 static char _buffer[__UABUFFSIZE]; 96 97 std::string _path; // path name 98 unsigned char _md5[16]; // md5 hash 99 size_t _h; // hash of hash :) 100 101 // calculate hash 102 void calc(bool ic, bool iw, size_t bs, size_t m) throw(const char*); 103 104 // return buffer gbuff(size_t)105 static void* gbuff(size_t) { return _buffer; } 106 107 // return buffer capacity buffc()108 static size_t buffc() { return __UABUFFSIZE; } 109 110 public: 111 112 /** Constructor. 113 * 114 * The constructor will open the file and calculate the hash. 115 * 116 * @param path file name 117 * @param ic ignore case 118 * @param iw ignore white space (in essence, remove it) 119 * @param m consider at most these many bytes for the hash (0: ALL) 120 * @param bs buffer size of internal work buffer (default 1024) 121 * @throws an error message if construction failed 122 */ 123 filei(const std::string& path, bool ic, bool iw, 124 size_t m = 0ul, size_t bs=1024ul) 125 throw(const char*); 126 127 /** Get an md5 hash char. 128 * @param i index 129 * @return md5 hash char at index 130 */ 131 unsigned char operator[](int i) const { return _md5[i]; } 132 133 /** Get the md5 hash characters. 134 * @return the md5 hash characters. 135 */ md5()136 const unsigned char* md5() const { return _md5; } 137 138 /** Get path name. 139 * @return path name 140 */ path()141 const std::string& path() const { return _path; } 142 143 /** Get hash code. 144 * This is a hash of the _md5 value and meant to be 145 * used in hashed data structures. 146 * @return hash code 147 */ h()148 const size_t h() const { return _h; } 149 150 151 /** Get file size from the file system. 152 * @param path absolute or relative path 153 * @return file size in bytes 154 * @throws an exception if status cannot be determined. 155 */ 156 static off_t fsize(const std::string& path) throw(const char*); 157 158 /** Determine whether the two files are identical. 159 * @param p1 path of one file 160 * @param p2 path of the other 161 * @param ic ignore letter case 162 * @param iw ignore white spaces 163 * @param m onlu consider these many bytes (0 all) 164 * @param bs set the internal buffer size 165 * @return whether the files corresponding to p1 and p2 are identical 166 * @throws an exception on any error 167 */ 168 static bool eq(const std::string& p1, const std::string& p2, 169 bool ic, bool iw, size_t m = 0ul, size_t bs = 1024ul) 170 throw(const char*); 171 172 /** Functor for hashed containers. 173 */ 174 struct md5hash { 175 176 /** Calculate a hash of the md5 hash. 177 * @param fi file info 178 * @return hash from md5 179 */ operatormd5hash180 size_t operator()(const filei& fi) const { return fi.h(); } 181 }; 182 183 /** Functor for sorted containers. 184 */ 185 struct md5cmp { 186 187 /** Compare the md5 hashes numerically. 188 * @param fi1 file info 1 189 * @param fi2 file info 2 190 * @return true if fi1._md5 < fi2._md5 191 */ 192 bool operator()(const filei& fi1, const filei& fi2) const; 193 }; 194 195 /** Functor for containers. 196 */ 197 struct md5eq { 198 199 /** Compare the md5 hashes numerically. 200 * @param fi1 file info 1 201 * @param fi2 file info 2 202 * @return true if fi1._md5 == fi2._md5 203 */ 204 bool operator()(const filei& fi1, const filei& fi2) const; 205 }; 206 207 // assign the three plugins below differently 208 // if you want re-entrant calculations, 209 // or different buffer allocation 210 // by default, all calculations share the same buffer and 211 // all calculations are performed at construction 212 213 /** Function that gets work buffer. 214 * The size_t argument is the requested capacity. 215 * By default, it is set to filei::gbuff, which returns 216 * a static (shared) buffer of size 32K (regardless of the 217 * requested capacity). 218 */ 219 static void* (*_gbuff)(size_t); 220 221 /** Function that tells work buffer capacity. 222 * The function should tell the capacity of the 223 * buffer returned by (*_gbuff)(size_t). 224 */ 225 static size_t (*_buffc)(); 226 227 /** Function that releases work buffer. 228 * This function will be called when a calculation 229 * has finished. The argument is the address of the work buffer 230 * returned by (*_gbuff)(size_t). 231 */ 232 static void (*_relbuff)(void*); 233 }; 234 235 236 /** Vector of file names. */ 237 typedef std::vector<std::string> fvec_t; 238 239 /** Set of file infos. 240 */ 241 typedef std::set<filei,filei::md5cmp> set_t; 242 243 /** Map from file name to file names. 244 * The key and the values together make up a set of 245 * identical files. 246 */ 247 typedef std::map<filei,fvec_t,filei::md5cmp> map_t; 248 249 250 #if defined(__UA_USEHASH) 251 252 /** Hashed set of file infos. 253 */ 254 typedef __gnu_cxx::hash_set<filei,filei::md5hash,filei::md5eq> hset_t; 255 256 /** Hashed map from file name to file names. 257 * The key and the values together make up a set of 258 * identical files. 259 */ 260 typedef __gnu_cxx::hash_map<filei,fvec_t,filei::md5hash,filei::md5eq> hmap_t; 261 #endif 262 263 /** Sets of identical files. 264 * 265 * S: must be some set<filei> 266 * M: must be some map<filei,std::vector<std::string> > 267 */ 268 template<typename S, typename M> 269 class fset { 270 private: 271 272 // "heads" are random representatives of sets of files 273 // with the same _md5 274 275 S _files; // set of file "heads": each has unique _md5 276 M _cmn; // the subsets that are identical ("head" -> path names) 277 278 bool _ic; // ignore case 279 bool _iw; // ignore whitespace 280 size_t _max; // max chars to consider 281 size_t _bs; // buffer size 282 283 typedef typename M::const_iterator it_t; // subset iterator 284 285 /** Add a file info. 286 * @param fi file info 287 */ add(const filei & fi)288 void add(const filei& fi) { 289 typename S::const_iterator i = _files.find(fi); 290 if (i != _files.end()) _cmn[*i].push_back(fi.path()); 291 else _files.insert(fi); 292 } 293 294 295 public: 296 297 /** Constructor. 298 * 299 * @param ic ignore case 300 * @param iw ignore white space 301 * @param m consider at most these many bytes for hash (0: ALL) 302 * @param bs internal buffer size (default 1024) 303 */ 304 fset(bool ic, bool iw, size_t m = 0, size_t bs = 1024): _ic(ic)305 _ic(ic), _iw(iw), _max(m), _bs(bs) { 306 } 307 308 309 /** Add a file. 310 * @param path file 311 * @throws a description if hash could not be constructed 312 */ add(const std::string & path)313 void add(const std::string& path) throw(const char*) { 314 add(filei(path,_ic,_iw,_max,_bs)); 315 } 316 317 /** Print the sets of identical files. 318 * 319 * Each set of identical files are printed on a single line. 320 * @param cmn common files 321 * @param os output stream 322 * @param s separator (default " ") 323 * @param ph print hash (if true, the first column is the hash) 324 */ 325 static void produce(const M& cmn, std::ostream& os, 326 const std::string& s = " ", bool ph = false) { 327 328 for(it_t it= cmn.begin(); it != cmn.end(); ++it) { 329 if (ph) { // print hash 330 for(int i=0; i< 16; ++i) { 331 int hi = it->first[i] >> 4 & 0x0f; 332 int lo = it->first[i] & 0x0f; 333 os << std::hex << hi << lo; 334 } 335 os << s; 336 } 337 os << it->first.path(); 338 for(int i=0; i<(int)it->second.size();++i) os << s << it->second[i]; 339 os << std::endl; 340 } 341 } 342 343 /** Identical sets of files. 344 * 345 * The result is a map from a file info to vectors of file paths. 346 * The key and its corresponding values together make up one 347 * identical set of files. The key is not repeated in the values and 348 * its designation as key is arbitrary. 349 * 350 * @return a map of the identical sets 351 */ common()352 const M& common() const { return _cmn; } 353 354 /** Calculate sets of identical files. 355 * 356 * This function can be used to implement a multi-stage 357 * calculation of identical file sets. The param cmn 358 * could be obtained by looking at only the prefix of the files 359 * and thus files in its identical sets may not be totally identical 360 * (only in prefix). However, files which are not in these sets can 361 * be safely ignored. 362 * 363 * @param res resulting common set of files (returned) 364 * @param cmn common set of files 365 * @param ic ignore case 366 * @param iw ignore white space 367 * @param m consider at most these many bytes for hash (0: ALL) 368 * @param bs internal buffer size (default 1024) 369 */ 370 static void common(M& res, const M& cmn, 371 bool ic, bool iw, size_t m=0, size_t bs=1204) { 372 for(it_t it=cmn.begin(); it != cmn.end(); ++it) { 373 fset files(ic,iw,m,bs); 374 files.add(it->first.path()); 375 for(int i=0; i<(int)it->second.size();++i) files.add(it->second[i]); 376 377 const M& locmn = files.common(); 378 for(it_t lit = locmn.begin(); lit!=locmn.end(); ++lit) { 379 res[lit->first] = lit->second; 380 } 381 } 382 } 383 }; 384 385 /** Choose preferred types for file set and result set. 386 * fsetc_t: preferred type for the map from file size to file names 387 * res_t: preferred type for the map of identical subsets 388 * fset_t: preferred type for the fset object 389 * 390 */ 391 #if defined(__UA_USEHASH) 392 typedef fset<hset_t,hmap_t> fset_t; 393 typedef hmap_t res_t; 394 typedef __gnu_cxx::hash_map<size_t,fvec_t> fsetc_t; 395 #else 396 typedef std::map<size_t,fvec_t> fsetc_t; 397 typedef map_t res_t; 398 typedef fset<set_t,map_t> fset_t; 399 #endif 400 401 #endif 402