1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 /* 3 * Pan - A Newsreader for Gtk+ 4 * Copyright (C) 2002-2006 Charles Kerr <charles@rebelbase.com> 5 * 6 * This program is free software; you can redistribute it and/or modify 7 * it under the terms of the GNU General Public License as published by 8 * the Free Software Foundation; version 2 of the License. 9 * 10 * This program is distributed in the hope that it will be useful, 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 * GNU General Public License for more details. 14 * 15 * You should have received a copy of the GNU General Public License 16 * along with this program; if not, see <http://www.gnu.org/licenses/>. 17 * 18 */ 19 20 #ifndef _Quark_h_ 21 #define _Quark_h_ 22 23 #include <stdint.h> 24 #include <cassert> 25 #include <ostream> 26 #include <set> 27 #include <string> 28 #include <climits> 29 #include <string> 30 #include <vector> 31 32 #if defined(HAVE_TR1_UNORDERED_SET) 33 # include <tr1/unordered_set> 34 #elif defined(HAVE_EXT_HASH_SET) 35 # include <ext/hash_set> 36 #else 37 # include <set> 38 #endif 39 40 #include <pan/general/string-view.h> 41 42 #ifndef UINT32_MAX 43 #define UINT32_MAX (4294967295U) 44 #endif 45 46 namespace pan 47 { 48 /** 49 * A two-way association between a string and integral type identifier. 50 * 51 * Quarks make good keys because comparision operations can be done 52 * on the integral type instead of with costly strcmps. 53 * 54 * In X11 and Gtk+ implementations, the string+integral mapping is 55 * permanent. However pan::Quark frees its copy of the mapped string when 56 * the last corresponding pan::Quark is destroyed. This way Quarks can be 57 * used even on large, transient sets of data (such as Message-IDs) without 58 * leaking the keys. 59 * 60 * There is, obviously, a tradeoff involved: hashing strings can be 61 * expensive, and the refcounted hashtable of strings has its own 62 * memory overhead. So while strings that are likely to be duplicated 63 * or used as keys -- message-ids, author names, and group names 64 * spring to mind -- they're less appropriate for temporary, unique data. 65 * 66 * @ingroup general 67 */ 68 class Quark 69 { 70 private: 71 72 struct Impl { 73 uint32_t refcount; 74 uint32_t len; 75 char * str; ImplImpl76 Impl (): refcount(0), len(0), str(0) {} ImplImpl77 Impl (const StringView& v): refcount(0), len(v.len), str(const_cast<char*>(v.str)) {} to_viewImpl78 StringView to_view () const { return StringView(str,len); } 79 //wtf? bool operator() (const Impl& a, const Impl& b) const { return StringView(str,len) == StringView(b.str,b.len); } 80 bool operator== (const Impl& b) const { return StringView(str,len) == StringView(b.str,b.len); } 81 bool operator< (const Impl& b) const { return StringView(str,len) < StringView(b.str,b.len); } 82 }; 83 84 struct StringViewHash 85 { get16bitsStringViewHash86 static uint16_t get16bits( const char * in ) 87 { 88 return (in[0]<<8) | in[1]; 89 } 90 91 /** 92 * Paul Hsieh's "SuperFastHash" algorithm, from 93 * http://www.azillionmonkeys.com/qed/hash.html 94 */ operatorStringViewHash95 size_t operator()(const Impl& s) const 96 { 97 const char * data (s.str); 98 int len (s.len); 99 100 uint32_t tmp, hash=len; 101 if (len <= 0 || data == NULL) return 0; 102 103 int rem = len & 3; 104 len >>= 2; 105 106 /* Main loop */ 107 for (;len > 0; len--) { 108 hash += get16bits (data); 109 tmp = (get16bits (data+2) << 11) ^ hash; 110 hash = (hash << 16) ^ tmp; 111 data += 2*sizeof (uint16_t); 112 hash += hash >> 11; 113 } 114 115 /* Handle end cases */ 116 switch (rem) { 117 case 3: hash += get16bits (data); 118 hash ^= hash << 16; 119 hash ^= data[sizeof (uint16_t)] << 18; 120 hash += hash >> 11; 121 break; 122 case 2: hash += get16bits (data); 123 hash ^= hash << 11; 124 hash += hash >> 17; 125 break; 126 case 1: hash += *data; 127 hash ^= hash << 10; 128 hash += hash >> 1; 129 } 130 131 /* Force "avalanching" of final 127 bits */ 132 hash ^= hash << 3; 133 hash += hash >> 5; 134 hash ^= hash << 4; 135 hash += hash >> 17; 136 hash ^= hash << 25; 137 hash += hash >> 6; 138 139 return hash; 140 } 141 }; 142 143 144 #if defined(HAVE_TR1_UNORDERED_SET) 145 typedef std::tr1::unordered_set<Impl, StringViewHash> lookup_t; 146 #elif defined(HAVE_EXT_HASH_SET) 147 typedef __gnu_cxx::hash_set<Impl, StringViewHash> lookup_t; 148 #else 149 typedef std::set<Impl> lookup_t; 150 #endif 151 static lookup_t _lookup; 152 init(const StringView & s)153 static Impl* init (const StringView& s) 154 { 155 std::pair<lookup_t::iterator,bool> result (_lookup.insert (Impl(s))); 156 Impl * impl = const_cast<Impl*>(&*result.first); 157 if (result.second) 158 { 159 // new item -- populate the entry with a new string 160 impl->str = new char [s.len + 1]; 161 memcpy (impl->str, s.str, s.len); 162 impl->str[s.len] = '\0'; 163 impl->len = s.len; 164 } 165 assert (impl->refcount!=UINT32_MAX); 166 ++impl->refcount; 167 return impl; 168 } 169 unref()170 void unref () { 171 if (impl!=0) { 172 assert (impl->refcount); 173 if (!--impl->refcount) { 174 const Impl tmp (*impl); 175 _lookup.erase (tmp); 176 impl = 0; 177 delete [] (char*)tmp.str; 178 } 179 } 180 } 181 182 private: 183 Impl * impl; 184 185 public: Quark()186 Quark (): impl(0) {} Quark(const std::string & s)187 Quark (const std::string & s): impl (init (StringView (s))) { } Quark(const char * s)188 Quark (const char * s): impl (init (StringView (s))) { } Quark(const StringView & p)189 Quark (const StringView & p): impl (init (p)) { } Quark(const Quark & q)190 Quark (const Quark& q) { 191 if (q.impl != 0) ++q.impl->refcount; 192 impl = q.impl; 193 } 194 195 Quark& operator= (const Quark& q) { 196 if (q.impl != 0) ++q.impl->refcount; 197 unref (); 198 impl = q.impl; 199 return *this; 200 } 201 ~Quark()202 ~Quark () { clear(); } clear()203 void clear () { unref(); impl=0; } empty()204 bool empty() const { return impl == 0; } 205 bool operator== (const char * that) const { 206 const char * pch = c_str (); 207 if (!pch && !that) return true; 208 if (!pch || !that) return false; 209 return !strcmp(c_str(), that); 210 } 211 bool operator!= (const char * that) const { return !(*this == that); } 212 bool operator== (const StringView& that) const { return impl ? !that.strcmp(impl->str,impl->len) : that.empty(); } 213 bool operator!= (const StringView& that) const { return impl ? that.strcmp(impl->str,impl->len) : !that.empty(); } 214 bool operator== (const Quark& q) const { return impl == q.impl; } 215 bool operator!= (const Quark& q) const { return impl != q.impl; } 216 bool operator< (const Quark& q) const { return impl < q.impl; } 217 bool operator! () const { return empty(); } to_view()218 const StringView to_view () const { return impl ? impl->to_view() : StringView(); } c_str()219 const char* c_str () const { return impl ? impl->str : NULL; } to_string()220 std::string to_string () const { return std::string (impl ? impl->str : ""); } 221 operator const char* () const { return c_str(); } 222 223 /** Number of unique strings being mapped. 224 Included for debugging and regression tests... */ size()225 static unsigned long size () { return _lookup.size(); } 226 static void dump (std::ostream&); 227 }; 228 229 std::ostream& operator<< (std::ostream& os, const Quark& s); 230 231 typedef std::set<Quark> quarks_t; 232 233 typedef std::vector<Quark> quarks_v; 234 235 /** 236 * StrictWeakOrdering which sorts Quarks alphabetically. 237 * This should be used sparingly, as it defeats Quark's main speed advantage. 238 */ 239 struct AlphabeticalQuarkOrdering { operatorAlphabeticalQuarkOrdering240 bool operator() (const Quark& a, const Quark& b) const { 241 if (a.empty() && b.empty()) return false; 242 if (a.empty()) return true; 243 if (b.empty()) return false; 244 return ::strcmp (a.c_str(), b.c_str()) < 0; 245 } operatorAlphabeticalQuarkOrdering246 bool operator() (const Quark* a, const Quark* b) const { 247 return (*this) (*a, *b); 248 } 249 }; 250 } 251 252 #endif 253