1 #ifndef ALGO_TEXT___VECTOR__HPP 2 #define ALGO_TEXT___VECTOR__HPP 3 4 /* $Id: vector.hpp 548600 2017-10-16 18:33:56Z dicuccio $ 5 * =========================================================================== 6 * 7 * PUBLIC DOMAIN NOTICE 8 * National Center for Biotechnology Information 9 * 10 * This software/database is a "United States Government Work" under the 11 * terms of the United States Copyright Act. It was written as part of 12 * the author's official duties as a United States Government employee and 13 * thus cannot be copyrighted. This software/database is freely available 14 * to the public for use. The National Library of Medicine and the U.S. 15 * Government have not placed any restriction on its use or reproduction. 16 * 17 * Although all reasonable efforts have been taken to ensure the accuracy 18 * and reliability of the software and data, the NLM and the U.S. 19 * Government do not and cannot warrant the performance or results that 20 * may be obtained by using this software or data. The NLM and the U.S. 21 * Government disclaim all warranties, express or implied, including 22 * warranties of performance, merchantability or fitness for any particular 23 * purpose. 24 * 25 * Please cite the author in any work or product based on this material. 26 * 27 * =========================================================================== 28 * 29 * Authors: Mike DiCuccio 30 * 31 * File Description: 32 * 33 */ 34 35 #include <corelib/ncbiobj.hpp> 36 37 BEGIN_NCBI_SCOPE 38 39 template <class Key, class Score> 40 class CScoreVector; 41 42 template <class Key, class Score> 43 class CRawScoreVector; 44 45 46 ///////////////////////////////////////////////////////////////////////////// 47 /// 48 /// class CRawScoreVector stores its data in a (sorted) STL vector 49 /// this gives a better memory profile and ias easier to serialize 50 /// 51 52 template <class Key, class Score> 53 class CRawScoreVector : public CObject 54 { 55 public: 56 typedef Key key_type; 57 typedef Score score_type; 58 typedef pair<Key, Score> TIdxScore; 59 typedef vector<TIdxScore> TVector; 60 typedef TIdxScore value_type; 61 typedef typename TVector::iterator iterator; 62 typedef typename TVector::const_iterator const_iterator; 63 64 CRawScoreVector(); ~CRawScoreVector()65 virtual ~CRawScoreVector() {} 66 67 CRawScoreVector(const CScoreVector<Key, Score>&); 68 CRawScoreVector& operator=(const CScoreVector<Key, Score>&); 69 70 CRawScoreVector(const CRawScoreVector<Key, Score>&); 71 CRawScoreVector& operator=(const CRawScoreVector<Key, Score>&); 72 73 virtual void Swap(CRawScoreVector<Key, Score>& other); 74 75 /// @name STL-ish functions 76 /// @{ 77 78 void clear(); 79 bool empty() const; 80 size_t size() const; 81 void reserve(size_t size); 82 iterator begin(); 83 iterator end(); 84 const_iterator begin() const; 85 const_iterator end() const; 86 iterator find(const Key& key); 87 const_iterator find(const Key& key) const; 88 void insert(const value_type& val); 89 void insert(iterator ins_before, 90 const_iterator start, 91 const_iterator stop); 92 void erase(iterator it); 93 94 /// @} 95 96 /// 97 /// setup functions 98 /// 99 key_type GetId() const; 100 void SetId(key_type uid); 101 Score Get(Key idx) const; 102 void Set(Key idx, Score weight); 103 void Add(Key idx, Score weight = Score(1)); 104 void Set(const_iterator begin, const_iterator end); 105 106 void TrimLength(float trim_pct); 107 void TrimCount (size_t max_words); 108 void TrimThresh(Score min_score); 109 110 /// force the vector to be sorted in order of descending score 111 void SortByScore(); 112 113 /// re-sort the vector by index. 114 /// This should normally never need to be done 115 void SortByIndex(); 116 117 /// 118 /// math functions 119 /// 120 float Length2() const; 121 float Length() const; 122 void Normalize(); 123 void ProbNormalize(); 124 125 CRawScoreVector<Key, Score>& operator+=(const CRawScoreVector<Key, Score>& other); 126 CRawScoreVector<Key, Score>& operator-=(const CRawScoreVector<Key, Score>& other); 127 CRawScoreVector<Key, Score>& operator*=(Score val); 128 CRawScoreVector<Key, Score>& operator/=(Score val); 129 Set()130 TVector& Set() { return m_Data; } Get() const131 const TVector& Get() const { return m_Data; } 132 133 protected: 134 /// UID for this set 135 key_type m_Uid; 136 137 /// the data for this document 138 TVector m_Data; 139 }; 140 141 142 ///////////////////////////////////////////////////////////////////////////// 143 144 145 template <class Key, class Score> 146 class CScoreVector : public CObject 147 { 148 public: 149 typedef Key key_type; 150 typedef Score score_type; 151 typedef map<Key, Score> TVector; 152 typedef typename TVector::value_type value_type; 153 typedef typename TVector::iterator iterator; 154 typedef typename TVector::const_iterator const_iterator; 155 156 CScoreVector(); ~CScoreVector()157 virtual ~CScoreVector() {} 158 CScoreVector(const CScoreVector<Key, Score>& other); 159 CScoreVector(const CRawScoreVector<Key, Score>& other); 160 CScoreVector& operator=(const CScoreVector<Key, Score>& other); 161 CScoreVector& operator=(const CRawScoreVector<Key, Score>& other); 162 163 virtual void Swap(CScoreVector<Key, Score>& other); 164 165 /// @name STL-ish functions 166 /// @{ 167 168 void clear(); 169 bool empty() const; 170 size_t size() const; 171 iterator begin(); 172 iterator end(); 173 const_iterator begin() const; 174 const_iterator end() const; 175 iterator find(const Key& key); 176 const_iterator find(const Key& key) const; 177 pair<iterator, bool> insert(const value_type& val); 178 iterator insert(iterator hint, const value_type& val); 179 void erase(iterator it); 180 void erase(const key_type& v); 181 182 template <typename OtherIterator> insert(OtherIterator it_begin,OtherIterator it_end)183 void insert(OtherIterator it_begin, OtherIterator it_end) 184 { 185 m_Data.insert(it_begin, it_end); 186 } 187 188 /// @} 189 190 /// 191 /// setup functions 192 /// 193 key_type GetId() const; 194 void SetId(key_type uid); GetSize() const195 size_t GetSize() const { return m_Data.size(); } 196 Score Get(Key idx) const; 197 void Set(Key idx, Score weight); 198 void Add(Key idx, Score weight = Score(1)); 199 200 void TrimLength(float trim_pct); 201 void TrimCount (size_t max_words); 202 void TrimThresh(Score min_score); 203 204 void SubtractMissing(const CScoreVector<Key, Score>& other); 205 void AddScores (const CScoreVector<Key, Score>& other); 206 207 /// 208 /// math functions 209 /// 210 float Length2() const; 211 float Length() const; 212 void Normalize(); 213 void ProbNormalize(); 214 215 CScoreVector<Key, Score>& operator+=(const CScoreVector<Key, Score>& other); 216 CScoreVector<Key, Score>& operator-=(const CScoreVector<Key, Score>& other); 217 CScoreVector<Key, Score>& operator*=(Score val); 218 CScoreVector<Key, Score>& operator/=(Score val); 219 Set()220 TVector& Set() { return m_Data; } Get() const221 const TVector& Get() const { return m_Data; } 222 223 protected: 224 /// UID for this set 225 key_type m_Uid; 226 227 /// the data for this document 228 TVector m_Data; 229 }; 230 231 232 233 /// @name Scoring Interface 234 /// @{ 235 236 template <class ScoreVectorA, class ScoreVectorB> 237 inline 238 float ScoreCombined(const ScoreVectorA& query, const ScoreVectorB& vec); 239 240 template <class ScoreVectorA, class ScoreVectorB> 241 inline 242 float ScoreCosine(const ScoreVectorA& query, const ScoreVectorB& vec); 243 244 template <class ScoreVectorA, class ScoreVectorB> 245 inline 246 float ScoreDice(const ScoreVectorA& query, const ScoreVectorB& vec); 247 248 template <class ScoreVectorA, class ScoreVectorB> 249 inline 250 float ScoreDistance(const ScoreVectorA& query, const ScoreVectorB& vec); 251 252 template <class ScoreVectorA, class ScoreVectorB> 253 inline 254 float ScoreDot(const ScoreVectorA& query, const ScoreVectorB& vec); 255 256 template <class ScoreVectorA, class ScoreVectorB> 257 inline 258 float ScoreJaccard(const ScoreVectorA& query, const ScoreVectorB& vec); 259 260 template <class ScoreVectorA, class ScoreVectorB> 261 inline 262 float ScoreOverlap(const ScoreVectorA& query, const ScoreVectorB& vec); 263 264 /// @} 265 266 267 268 END_NCBI_SCOPE 269 270 271 #include <algo/text/vector_impl.hpp> 272 273 #endif // ALGO_TEXT___VECTOR__HPP 274