1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /*
3  * Pan - A Newsreader for Gtk+
4  * Copyright (C) 2002-2006  Charles Kerr <charles@rebelbase.com>
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; version 2 of the License.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with this program; if not, see <http://www.gnu.org/licenses/>.
17  *
18  */
19 
20 #ifndef _Quark_h_
21 #define _Quark_h_
22 
23 #include <stdint.h>
24 #include <cassert>
25 #include <ostream>
26 #include <set>
27 #include <string>
28 #include <climits>
29 #include <string>
30 #include <vector>
31 
32 #if defined(HAVE_TR1_UNORDERED_SET)
33 # include <tr1/unordered_set>
34 #elif defined(HAVE_EXT_HASH_SET)
35 # include <ext/hash_set>
36 #else
37 # include <set>
38 #endif
39 
40 #include <pan/general/string-view.h>
41 
42 #ifndef UINT32_MAX
43 #define UINT32_MAX             (4294967295U)
44 #endif
45 
46 namespace pan
47 {
48   /**
49    * A two-way association between a string and integral type identifier.
50    *
51    * Quarks make good keys because comparision operations can be done
52    * on the integral type instead of with costly strcmps.
53    *
54    * In X11 and Gtk+ implementations, the string+integral mapping is
55    * permanent.  However pan::Quark frees its copy of the mapped string when
56    * the last corresponding pan::Quark is destroyed.  This way Quarks can be
57    * used even on large, transient sets of data (such as Message-IDs) without
58    * leaking the keys.
59    *
60    * There is, obviously, a tradeoff involved: hashing strings can be
61    * expensive, and the refcounted hashtable of strings has its own
62    * memory overhead.  So while strings that are likely to be duplicated
63    * or used as keys -- message-ids, author names, and group names
64    * spring to mind -- they're less appropriate for temporary, unique data.
65    *
66    * @ingroup general
67    */
68   class Quark
69   {
70     private:
71 
72       struct Impl {
73         uint32_t refcount;
74         uint32_t len;
75         char * str;
ImplImpl76         Impl (): refcount(0), len(0), str(0) {}
ImplImpl77         Impl (const StringView& v): refcount(0), len(v.len), str(const_cast<char*>(v.str)) {}
to_viewImpl78         StringView to_view () const { return StringView(str,len); }
79         //wtf? bool operator() (const Impl& a, const Impl& b) const { return StringView(str,len) == StringView(b.str,b.len); }
80         bool operator== (const Impl& b) const { return StringView(str,len) == StringView(b.str,b.len); }
81         bool operator< (const Impl& b) const { return StringView(str,len) < StringView(b.str,b.len); }
82       };
83 
84       struct StringViewHash
85       {
get16bitsStringViewHash86         static uint16_t get16bits( const char * in )
87         {
88           return (in[0]<<8) | in[1];
89         }
90 
91         /**
92          * Paul Hsieh's "SuperFastHash" algorithm, from
93          * http://www.azillionmonkeys.com/qed/hash.html
94          */
operatorStringViewHash95         size_t operator()(const Impl& s) const
96         {
97           const char * data (s.str);
98           int len (s.len);
99 
100           uint32_t tmp, hash=len;
101           if (len <= 0 || data == NULL) return 0;
102 
103           int rem = len & 3;
104           len >>= 2;
105 
106           /* Main loop */
107           for (;len > 0; len--) {
108               hash  += get16bits (data);
109               tmp    = (get16bits (data+2) << 11) ^ hash;
110               hash   = (hash << 16) ^ tmp;
111               data  += 2*sizeof (uint16_t);
112               hash  += hash >> 11;
113           }
114 
115           /* Handle end cases */
116           switch (rem) {
117               case 3: hash += get16bits (data);
118                       hash ^= hash << 16;
119                       hash ^= data[sizeof (uint16_t)] << 18;
120                       hash += hash >> 11;
121                       break;
122               case 2: hash += get16bits (data);
123                       hash ^= hash << 11;
124                       hash += hash >> 17;
125                       break;
126               case 1: hash += *data;
127                       hash ^= hash << 10;
128                       hash += hash >> 1;
129           }
130 
131           /* Force "avalanching" of final 127 bits */
132           hash ^= hash << 3;
133           hash += hash >> 5;
134           hash ^= hash << 4;
135           hash += hash >> 17;
136           hash ^= hash << 25;
137           hash += hash >> 6;
138 
139           return hash;
140         }
141       };
142 
143 
144 #if defined(HAVE_TR1_UNORDERED_SET)
145       typedef std::tr1::unordered_set<Impl, StringViewHash> lookup_t;
146 #elif defined(HAVE_EXT_HASH_SET)
147       typedef __gnu_cxx::hash_set<Impl, StringViewHash> lookup_t;
148 #else
149       typedef std::set<Impl> lookup_t;
150 #endif
151       static lookup_t _lookup;
152 
init(const StringView & s)153       static Impl* init (const StringView& s)
154       {
155         std::pair<lookup_t::iterator,bool> result (_lookup.insert (Impl(s)));
156         Impl * impl = const_cast<Impl*>(&*result.first);
157         if (result.second)
158         {
159          // new item -- populate the entry with a new string
160           impl->str = new char [s.len + 1];
161           memcpy (impl->str, s.str, s.len);
162           impl->str[s.len] = '\0';
163           impl->len = s.len;
164         }
165         assert (impl->refcount!=UINT32_MAX);
166         ++impl->refcount;
167         return impl;
168       }
169 
unref()170       void unref () {
171         if (impl!=0) {
172           assert (impl->refcount);
173           if (!--impl->refcount) {
174             const Impl tmp (*impl);
175             _lookup.erase (tmp);
176             impl = 0;
177             delete [] (char*)tmp.str;
178           }
179         }
180       }
181 
182     private:
183       Impl * impl;
184 
185     public:
Quark()186       Quark (): impl(0) {}
Quark(const std::string & s)187       Quark (const std::string & s): impl (init (StringView (s))) { }
Quark(const char * s)188       Quark (const char * s): impl (init (StringView (s))) { }
Quark(const StringView & p)189       Quark (const StringView &  p): impl (init (p)) { }
Quark(const Quark & q)190       Quark (const Quark& q) {
191         if (q.impl != 0) ++q.impl->refcount;
192         impl = q.impl;
193       }
194 
195       Quark& operator= (const Quark& q) {
196         if (q.impl != 0) ++q.impl->refcount;
197         unref ();
198         impl = q.impl;
199         return *this;
200       }
201 
~Quark()202       ~Quark () { clear(); }
clear()203       void clear () { unref(); impl=0; }
empty()204       bool empty() const { return impl == 0; }
205       bool operator== (const char * that) const {
206         const char * pch = c_str ();
207         if (!pch && !that) return true;
208         if (!pch || !that) return false;
209         return !strcmp(c_str(), that);
210       }
211       bool operator!= (const char * that) const { return !(*this == that); }
212       bool operator== (const StringView& that) const { return impl ? !that.strcmp(impl->str,impl->len) :  that.empty(); }
213       bool operator!= (const StringView& that) const { return impl ?  that.strcmp(impl->str,impl->len) : !that.empty(); }
214       bool operator== (const Quark& q) const { return impl == q.impl; }
215       bool operator!= (const Quark& q) const { return impl != q.impl; }
216       bool operator< (const Quark& q) const { return impl < q.impl; }
217       bool operator! () const { return empty(); }
to_view()218       const StringView to_view () const { return impl ? impl->to_view() : StringView(); }
c_str()219       const char* c_str () const { return impl ? impl->str : NULL; }
to_string()220       std::string to_string () const { return std::string (impl ? impl->str : ""); }
221       operator const char* () const { return c_str(); }
222 
223       /** Number of unique strings being mapped.
224           Included for debugging and regression tests... */
size()225       static unsigned long size () { return _lookup.size(); }
226       static void dump (std::ostream&);
227   };
228 
229   std::ostream& operator<< (std::ostream& os, const Quark& s);
230 
231   typedef std::set<Quark> quarks_t;
232 
233   typedef std::vector<Quark> quarks_v;
234 
235   /**
236    * StrictWeakOrdering which sorts Quarks alphabetically.
237    * This should be used sparingly, as it defeats Quark's main speed advantage.
238    */
239   struct AlphabeticalQuarkOrdering {
operatorAlphabeticalQuarkOrdering240     bool operator() (const Quark& a, const Quark& b) const {
241       if (a.empty() && b.empty()) return false;
242       if (a.empty()) return true;
243       if (b.empty()) return false;
244       return ::strcmp (a.c_str(), b.c_str()) < 0;
245     }
operatorAlphabeticalQuarkOrdering246     bool operator() (const Quark* a, const Quark* b) const {
247       return (*this) (*a, *b);
248     }
249   };
250 }
251 
252 #endif
253