1 #ifndef ALGO_TEXT___VECTOR__HPP
2 #define ALGO_TEXT___VECTOR__HPP
3 
4 /*  $Id: vector.hpp 548600 2017-10-16 18:33:56Z dicuccio $
5  * ===========================================================================
6  *
7  *                            PUBLIC DOMAIN NOTICE
8  *               National Center for Biotechnology Information
9  *
10  *  This software/database is a "United States Government Work" under the
11  *  terms of the United States Copyright Act.  It was written as part of
12  *  the author's official duties as a United States Government employee and
13  *  thus cannot be copyrighted.  This software/database is freely available
14  *  to the public for use. The National Library of Medicine and the U.S.
15  *  Government have not placed any restriction on its use or reproduction.
16  *
17  *  Although all reasonable efforts have been taken to ensure the accuracy
18  *  and reliability of the software and data, the NLM and the U.S.
19  *  Government do not and cannot warrant the performance or results that
20  *  may be obtained by using this software or data. The NLM and the U.S.
21  *  Government disclaim all warranties, express or implied, including
22  *  warranties of performance, merchantability or fitness for any particular
23  *  purpose.
24  *
25  *  Please cite the author in any work or product based on this material.
26  *
27  * ===========================================================================
28  *
29  * Authors:  Mike DiCuccio
30  *
31  * File Description:
32  *
33  */
34 
35 #include <corelib/ncbiobj.hpp>
36 
37 BEGIN_NCBI_SCOPE
38 
39 template <class Key, class Score>
40 class CScoreVector;
41 
42 template <class Key, class Score>
43 class CRawScoreVector;
44 
45 
46 /////////////////////////////////////////////////////////////////////////////
47 ///
48 /// class CRawScoreVector stores its data in a (sorted) STL vector
49 /// this gives a better memory profile and ias easier to serialize
50 ///
51 
52 template <class Key, class Score>
53 class CRawScoreVector : public CObject
54 {
55 public:
56     typedef Key                              key_type;
57     typedef Score                            score_type;
58     typedef pair<Key, Score>                 TIdxScore;
59     typedef vector<TIdxScore>                TVector;
60     typedef TIdxScore                        value_type;
61     typedef typename TVector::iterator       iterator;
62     typedef typename TVector::const_iterator const_iterator;
63 
64     CRawScoreVector();
~CRawScoreVector()65     virtual ~CRawScoreVector() {}
66 
67     CRawScoreVector(const CScoreVector<Key, Score>&);
68     CRawScoreVector& operator=(const CScoreVector<Key, Score>&);
69 
70     CRawScoreVector(const CRawScoreVector<Key, Score>&);
71     CRawScoreVector& operator=(const CRawScoreVector<Key, Score>&);
72 
73     virtual void Swap(CRawScoreVector<Key, Score>& other);
74 
75     /// @name STL-ish functions
76     /// @{
77 
78     void clear();
79     bool empty() const;
80     size_t size() const;
81     void reserve(size_t size);
82     iterator begin();
83     iterator end();
84     const_iterator begin() const;
85     const_iterator end() const;
86     iterator find(const Key& key);
87     const_iterator find(const Key& key) const;
88     void insert(const value_type& val);
89     void insert(iterator ins_before,
90                 const_iterator start,
91                 const_iterator stop);
92     void erase(iterator it);
93 
94     /// @}
95 
96     ///
97     /// setup functions
98     ///
99     key_type GetId() const;
100     void SetId(key_type uid);
101     Score Get(Key idx) const;
102     void Set(Key idx, Score weight);
103     void Add(Key idx, Score weight = Score(1));
104     void Set(const_iterator begin, const_iterator end);
105 
106     void TrimLength(float  trim_pct);
107     void TrimCount (size_t max_words);
108     void TrimThresh(Score  min_score);
109 
110     /// force the vector to be sorted in order of descending score
111     void SortByScore();
112 
113     /// re-sort the vector by index.
114     /// This should normally never need to be done
115     void SortByIndex();
116 
117     ///
118     /// math functions
119     ///
120     float Length2() const;
121     float Length() const;
122     void Normalize();
123     void ProbNormalize();
124 
125     CRawScoreVector<Key, Score>& operator+=(const CRawScoreVector<Key, Score>& other);
126     CRawScoreVector<Key, Score>& operator-=(const CRawScoreVector<Key, Score>& other);
127     CRawScoreVector<Key, Score>& operator*=(Score val);
128     CRawScoreVector<Key, Score>& operator/=(Score val);
129 
Set()130     TVector&       Set()       { return m_Data; }
Get() const131     const TVector& Get() const { return m_Data; }
132 
133 protected:
134     /// UID for this set
135     key_type m_Uid;
136 
137     /// the data for this document
138     TVector m_Data;
139 };
140 
141 
142 /////////////////////////////////////////////////////////////////////////////
143 
144 
145 template <class Key, class Score>
146 class CScoreVector : public CObject
147 {
148 public:
149     typedef Key                     key_type;
150     typedef Score                   score_type;
151     typedef map<Key, Score> TVector;
152     typedef typename TVector::value_type     value_type;
153     typedef typename TVector::iterator       iterator;
154     typedef typename TVector::const_iterator const_iterator;
155 
156     CScoreVector();
~CScoreVector()157     virtual ~CScoreVector() {}
158     CScoreVector(const CScoreVector<Key, Score>& other);
159     CScoreVector(const CRawScoreVector<Key, Score>& other);
160     CScoreVector& operator=(const CScoreVector<Key, Score>& other);
161     CScoreVector& operator=(const CRawScoreVector<Key, Score>& other);
162 
163     virtual void Swap(CScoreVector<Key, Score>& other);
164 
165     /// @name STL-ish functions
166     /// @{
167 
168     void clear();
169     bool empty() const;
170     size_t size() const;
171     iterator begin();
172     iterator end();
173     const_iterator begin() const;
174     const_iterator end() const;
175     iterator find(const Key& key);
176     const_iterator find(const Key& key) const;
177     pair<iterator, bool> insert(const value_type& val);
178     iterator insert(iterator hint, const value_type& val);
179     void erase(iterator it);
180     void erase(const key_type& v);
181 
182     template <typename OtherIterator>
insert(OtherIterator it_begin,OtherIterator it_end)183     void insert(OtherIterator it_begin, OtherIterator it_end)
184     {
185         m_Data.insert(it_begin, it_end);
186     }
187 
188     /// @}
189 
190     ///
191     /// setup functions
192     ///
193     key_type GetId() const;
194     void SetId(key_type uid);
GetSize() const195     size_t GetSize() const { return m_Data.size(); }
196     Score Get(Key idx) const;
197     void Set(Key idx, Score weight);
198     void Add(Key idx, Score weight = Score(1));
199 
200     void TrimLength(float  trim_pct);
201     void TrimCount (size_t max_words);
202     void TrimThresh(Score  min_score);
203 
204     void SubtractMissing(const CScoreVector<Key, Score>& other);
205     void AddScores      (const CScoreVector<Key, Score>& other);
206 
207     ///
208     /// math functions
209     ///
210     float Length2() const;
211     float Length() const;
212     void Normalize();
213     void ProbNormalize();
214 
215     CScoreVector<Key, Score>& operator+=(const CScoreVector<Key, Score>& other);
216     CScoreVector<Key, Score>& operator-=(const CScoreVector<Key, Score>& other);
217     CScoreVector<Key, Score>& operator*=(Score val);
218     CScoreVector<Key, Score>& operator/=(Score val);
219 
Set()220     TVector&       Set()       { return m_Data; }
Get() const221     const TVector& Get() const { return m_Data; }
222 
223 protected:
224     /// UID for this set
225     key_type m_Uid;
226 
227     /// the data for this document
228     TVector m_Data;
229 };
230 
231 
232 
233 /// @name Scoring Interface
234 /// @{
235 
236 template <class ScoreVectorA, class ScoreVectorB>
237 inline
238 float ScoreCombined(const ScoreVectorA& query, const ScoreVectorB& vec);
239 
240 template <class ScoreVectorA, class ScoreVectorB>
241 inline
242 float ScoreCosine(const ScoreVectorA& query, const ScoreVectorB& vec);
243 
244 template <class ScoreVectorA, class ScoreVectorB>
245 inline
246 float ScoreDice(const ScoreVectorA& query, const ScoreVectorB& vec);
247 
248 template <class ScoreVectorA, class ScoreVectorB>
249 inline
250 float ScoreDistance(const ScoreVectorA& query, const ScoreVectorB& vec);
251 
252 template <class ScoreVectorA, class ScoreVectorB>
253 inline
254 float ScoreDot(const ScoreVectorA& query, const ScoreVectorB& vec);
255 
256 template <class ScoreVectorA, class ScoreVectorB>
257 inline
258 float ScoreJaccard(const ScoreVectorA& query, const ScoreVectorB& vec);
259 
260 template <class ScoreVectorA, class ScoreVectorB>
261 inline
262 float ScoreOverlap(const ScoreVectorA& query, const ScoreVectorB& vec);
263 
264 /// @}
265 
266 
267 
268 END_NCBI_SCOPE
269 
270 
271 #include <algo/text/vector_impl.hpp>
272 
273 #endif  // ALGO_TEXT___VECTOR__HPP
274