1 /*
2  * The contents of this file are subject to the Mozilla Public License
3  * Version 1.1 (the "License"); you may not use this file except in
4  * compliance with the License. You may obtain a copy of the License at
5  * http://www.mozilla.org/MPL/
6  *
7  * Software distributed under the License is distributed on an "AS IS"
8  * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
9  * License for the specific language governing rights and limitations
10  * under the License.
11  *
12  * The Original Code was developed for an EU.EDGE internal project and
13  * is made available according to the terms of this license.
14  *
15  * The Initial Developer of the Original Code is Istvan T. Hernadvolgyi,
16  * EU.EDGE LLC.
17  *
18  * Portions created by EU.EDGE LLC are Copyright (C) EU.EDGE LLC.
19  * All Rights Reserved.
20  *
21  * Alternatively, the contents of this file may be used under the terms
22  * of the GNU General Public License (the "GPL"), in which case the
23  * provisions of GPL are applicable instead of those above.  If you wish
24  * to allow use of your version of this file only under the terms of the
25  * GPL and not to allow others to use your version of this file under the
26  * License, indicate your decision by deleting the provisions above and
27  * replace them with the notice and other provisions required by the GPL.
28  * If you do not delete the provisions above, a recipient may use your
29  * version of this file under either the License or the GPL.
30  */
31 
32 // FILE COMPARISONS BY MD5 HASH VALUE - HEADER
33 //
34 
35 #if !defined(_FILEI_H_)
36 #define _FILEI_H_
37 
38 
39 // decide whether to prefer hashed containers
40 // first you need gcc 3.x or higher
41 // second __NOHASH should be undefined
42 //
43 #if defined(__GNUC__) && !defined(__NOHASH)
44 #if __GNUC__ >= 3
45 #define __UA_USEHASH
46 #endif
47 #endif
48 
49 // default work buffer size
50 //
51 #if !defined(__UABUFFSIZE)
52 #define __UABUFFSIZE 32768
53 #endif
54 
55 #include <string>
56 
57 #if defined(__UA_USEHASH)
58 #include <ext/hash_set>
59 #include <ext/hash_map>
60 #endif
61 
62 #include <set>
63 #include <map>
64 
65 #include <vector>
66 
67 #include <iostream>
68 #include <iomanip>
69 
70 /** File info.
71  *
72  * Contains the path name and the corresponding md5 hash.
73  * All calculations are performed during construction. Once
74  * constructed the object is "const"; there are only accessors.
75  *
76  * The calculation of MD5 requires a char buffer. By default
77  * an internal buffer is used (filei::_buffer which is private).
78  * For concurrent calculations, you can assign filei::_gbuff, filei::_buffc and
79  * filei::_relbuff to get a buffer, get its capacity and to release
80  * the buffer. Eg. you could set
81  * <pre>
82  *    filei::_gbuff = &::malloc;
83  *    filei::_relbuff = &::free;
84  *    filei::_buffc = 0;
85  * </pre>
86  * If _buffc is 0, then it assumes that all requested memory is returned.
87  * The code checks for _gbuff returning 0 (and catches exceptions) so it is
88  * compatible with malloc and easy to wrap into an allocator as well.
89  */
90 class filei {
91 
92    private:
93       // a shared buffer
94       // used for internal calculations, but can be overridden
95       static char _buffer[__UABUFFSIZE];
96 
97       std::string _path; // path name
98       unsigned char _md5[16]; // md5 hash
99       size_t _h; // hash of hash :)
100 
101       // calculate hash
102       void calc(bool ic, bool iw, size_t bs, size_t m) throw(const char*);
103 
104       // return buffer
gbuff(size_t)105       static void* gbuff(size_t) { return _buffer; }
106 
107       // return buffer capacity
buffc()108       static size_t buffc() { return __UABUFFSIZE; }
109 
110    public:
111 
112       /** Constructor.
113        *
114        * The constructor will open the file and calculate the hash.
115        *
116        * @param path file name
117        * @param ic ignore case
118        * @param iw ignore white space (in essence, remove it)
119        * @param m consider at most these many bytes for the hash (0: ALL)
120        * @param bs buffer size of internal work buffer (default 1024)
121        * @throws an error message if construction failed
122        */
123       filei(const std::string& path, bool ic, bool iw,
124          size_t m = 0ul, size_t bs=1024ul)
125       throw(const char*);
126 
127       /** Get an md5 hash char.
128        * @param i index
129        * @return md5 hash char at index
130        */
131       unsigned char operator[](int i) const { return _md5[i]; }
132 
133       /** Get the md5 hash characters.
134        * @return the md5 hash characters.
135        */
md5()136       const unsigned char* md5() const { return _md5; }
137 
138       /** Get path name.
139        * @return path name
140        */
path()141       const std::string& path() const { return _path; }
142 
143       /** Get hash code.
144         * This is a hash of the _md5 value and meant to be
145         * used in hashed data structures.
146         * @return hash code
147         */
h()148       const size_t h() const { return _h; }
149 
150 
151       /** Get file size from the file system.
152         * @param path absolute or relative path
153         * @return file size in bytes
154         * @throws an exception if status cannot be determined.
155         */
156       static off_t fsize(const std::string& path) throw(const char*);
157 
158       /** Determine whether the two files are identical.
159         * @param p1 path of one file
160         * @param p2 path of the other
161         * @param ic ignore letter case
162         * @param iw ignore white spaces
163         * @param m onlu consider these many bytes (0 all)
164         * @param bs set the internal buffer size
165         * @return whether the files corresponding to p1 and p2 are identical
166         * @throws an exception on any error
167         */
168       static bool eq(const std::string& p1, const std::string& p2,
169          bool ic, bool iw, size_t m = 0ul, size_t bs = 1024ul)
170       throw(const char*);
171 
172       /** Functor for hashed containers.
173        */
174       struct md5hash {
175 
176          /** Calculate a hash of the md5 hash.
177           * @param fi file info
178           * @return hash from md5
179           */
operatormd5hash180          size_t operator()(const filei& fi) const { return fi.h(); }
181       };
182 
183       /** Functor for sorted containers.
184        */
185       struct md5cmp {
186 
187          /** Compare the md5 hashes numerically.
188            * @param fi1 file info 1
189            * @param fi2 file info 2
190            * @return true if fi1._md5 < fi2._md5
191            */
192          bool operator()(const filei& fi1, const filei& fi2) const;
193       };
194 
195       /** Functor for containers.
196         */
197       struct md5eq {
198 
199          /** Compare the md5 hashes numerically.
200            * @param fi1 file info 1
201            * @param fi2 file info 2
202            * @return true if fi1._md5 == fi2._md5
203            */
204          bool operator()(const filei& fi1, const filei& fi2) const;
205       };
206 
207       // assign the three plugins below differently
208       // if you want re-entrant calculations,
209       // or different buffer allocation
210       // by default, all calculations share the same buffer and
211       // all calculations are performed at construction
212 
213       /** Function that gets work buffer.
214        * The size_t argument is the requested capacity.
215        * By default, it is set to filei::gbuff, which returns
216        * a static (shared) buffer of size 32K (regardless of the
217        * requested capacity).
218        */
219       static void* (*_gbuff)(size_t);
220 
221       /** Function that tells work buffer capacity.
222         * The function should tell the capacity of the
223         * buffer returned by (*_gbuff)(size_t).
224         */
225       static size_t (*_buffc)();
226 
227       /** Function that releases work buffer.
228         * This function will be called when a calculation
229         * has finished. The argument is the address of the work buffer
230         * returned by (*_gbuff)(size_t).
231         */
232       static void (*_relbuff)(void*);
233 };
234 
235 
236 /** Vector of file names. */
237 typedef std::vector<std::string> fvec_t;
238 
239 /** Set of file infos.
240   */
241 typedef std::set<filei,filei::md5cmp> set_t;
242 
243 /** Map from file name to file names.
244  * The key and the values together make up a set of
245  * identical files.
246  */
247 typedef std::map<filei,fvec_t,filei::md5cmp> map_t;
248 
249 
250 #if defined(__UA_USEHASH)
251 
252 /** Hashed set of file infos.
253   */
254 typedef __gnu_cxx::hash_set<filei,filei::md5hash,filei::md5eq> hset_t;
255 
256 /** Hashed map from file name to file names.
257  * The key and the values together make up a set of
258  * identical files.
259  */
260 typedef __gnu_cxx::hash_map<filei,fvec_t,filei::md5hash,filei::md5eq> hmap_t;
261 #endif
262 
263 /** Sets of identical files.
264  *
265  * S: must be some set<filei>
266  * M: must be some map<filei,std::vector<std::string> >
267  */
268 template<typename S, typename M>
269 class fset {
270    private:
271 
272       // "heads" are random representatives of sets of files
273       // with the same _md5
274 
275       S _files; // set of file "heads": each has unique _md5
276       M _cmn;   // the subsets that are identical ("head" -> path names)
277 
278       bool _ic; // ignore case
279       bool _iw; // ignore whitespace
280       size_t _max; // max chars to consider
281       size_t _bs;  // buffer size
282 
283       typedef typename M::const_iterator it_t; // subset iterator
284 
285       /** Add a file info.
286         * @param fi file info
287         */
add(const filei & fi)288       void add(const filei& fi) {
289          typename S::const_iterator i = _files.find(fi);
290          if (i != _files.end()) _cmn[*i].push_back(fi.path());
291          else _files.insert(fi);
292       }
293 
294 
295    public:
296 
297       /** Constructor.
298        *
299        * @param ic ignore case
300        * @param iw ignore white space
301        * @param m consider at most these many bytes for hash (0: ALL)
302        * @param bs internal buffer size (default 1024)
303        */
304       fset(bool ic, bool iw, size_t m = 0, size_t bs = 1024):
_ic(ic)305          _ic(ic), _iw(iw), _max(m), _bs(bs) {
306       }
307 
308 
309       /** Add a file.
310         * @param path file
311         * @throws a description if hash could not be constructed
312         */
add(const std::string & path)313       void add(const std::string& path) throw(const char*) {
314          add(filei(path,_ic,_iw,_max,_bs));
315       }
316 
317       /** Print the sets of identical files.
318         *
319         * Each set of identical files are printed on a single line.
320         * @param cmn common files
321         * @param os output stream
322         * @param s separator (default " ")
323         * @param ph print hash (if true, the first column is the hash)
324         */
325       static void produce(const M& cmn, std::ostream& os,
326          const std::string& s = " ", bool ph = false) {
327 
328          for(it_t it= cmn.begin(); it != cmn.end(); ++it) {
329             if (ph) { // print hash
330                for(int i=0; i< 16; ++i) {
331                   int hi = it->first[i] >> 4 & 0x0f;
332                   int lo = it->first[i] & 0x0f;
333                   os << std::hex << hi << lo;
334                }
335                os << s;
336             }
337             os << it->first.path();
338             for(int i=0; i<(int)it->second.size();++i) os << s << it->second[i];
339             os << std::endl;
340          }
341       }
342 
343       /** Identical sets of files.
344         *
345         * The result is a map from a file info to vectors of file paths.
346         * The key and its corresponding values together make up one
347         * identical set of files. The key is not repeated in the values and
348         * its designation as key is arbitrary.
349         *
350         * @return a map of the identical sets
351         */
common()352       const M& common() const { return _cmn; }
353 
354       /** Calculate sets of identical files.
355        *
356        * This function can be used to implement a multi-stage
357        * calculation of identical file sets. The param cmn
358        * could be obtained by looking at only the prefix of the files
359        * and thus files in its identical sets may not be totally identical
360        * (only in prefix). However, files which are not in these sets can
361        * be safely ignored.
362        *
363        * @param res resulting common set of files (returned)
364        * @param cmn common set of files
365        * @param ic ignore case
366        * @param iw ignore white space
367        * @param m consider at most these many bytes for hash (0: ALL)
368        * @param bs internal buffer size (default 1024)
369        */
370       static void common(M& res, const M& cmn,
371          bool ic, bool iw, size_t m=0, size_t bs=1204) {
372          for(it_t it=cmn.begin(); it != cmn.end(); ++it) {
373             fset files(ic,iw,m,bs);
374             files.add(it->first.path());
375             for(int i=0; i<(int)it->second.size();++i) files.add(it->second[i]);
376 
377             const M& locmn = files.common();
378             for(it_t lit = locmn.begin(); lit!=locmn.end(); ++lit) {
379                res[lit->first] = lit->second;
380             }
381          }
382       }
383 };
384 
385 /** Choose preferred types for file set and result set.
386  * fsetc_t: preferred type for the map from file size to file names
387  * res_t:   preferred type for the map of identical subsets
388  * fset_t:  preferred type for the fset object
389  *
390  */
391 #if defined(__UA_USEHASH)
392 typedef fset<hset_t,hmap_t> fset_t;
393 typedef hmap_t res_t;
394 typedef __gnu_cxx::hash_map<size_t,fvec_t> fsetc_t;
395 #else
396 typedef std::map<size_t,fvec_t> fsetc_t;
397 typedef map_t res_t;
398 typedef fset<set_t,map_t> fset_t;
399 #endif
400 
401 #endif
402