1 /* $Id: gene_info_test.cpp 631547 2021-05-19 13:51:35Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Vahram Avagyan
27 *
28 */
29
30 #include <ncbi_pch.hpp>
31 #include <corelib/ncbiapp.hpp>
32 #include <corelib/ncbienv.hpp>
33
34 #include <objtools/blast/gene_info_reader/gene_info.hpp>
35 #include <objtools/blast/gene_info_reader/gene_info_reader.hpp>
36
37 #include <map>
38
39 #include <corelib/test_boost.hpp>
40
41 //==========================================================================//
42
43 #ifndef SKIP_DOXYGEN_PROCESSING
44
45 USING_NCBI_SCOPE;
46
47 //==========================================================================//
48
49 typedef map<int, int> TIntToIntMap;
50 typedef multimap<TGi, int> TGiToIntMultimap;
51 typedef map<int, string> TIntToStringMap;
52
53 static void
s_MakeGeneInfoFileReaders(CGeneInfoFileReader * & pReader1,CGeneInfoFileReader * & pReader2)54 s_MakeGeneInfoFileReaders(CGeneInfoFileReader*& pReader1,
55 CGeneInfoFileReader*& pReader2)
56 {
57 pReader1 = new CGeneInfoFileReader(true);
58 pReader2 = new CGeneInfoFileReader(false);
59 }
60
61 static void
s_InitTestData(IGeneInfoInput::TGeneIdList & listIds,IGeneInfoInput::TGiList & listGis,TGiToIntMultimap & mapGiToIds,IGeneInfoInput::TGeneIdToGeneInfoMap & mapIdToInfo)62 s_InitTestData(IGeneInfoInput::TGeneIdList& listIds,
63 IGeneInfoInput::TGiList& listGis,
64 TGiToIntMultimap& mapGiToIds,
65 IGeneInfoInput::TGeneIdToGeneInfoMap& mapIdToInfo)
66 {
67 int geneId;
68
69 // Initialize Gene IDs and Gene Infos
70
71 TIntToIntMap mapIdToPMIDs;
72 mapIdToPMIDs[1] = 1;
73 mapIdToPMIDs[2] = 2;
74 mapIdToPMIDs[3] = 2;
75 mapIdToPMIDs[4] = 1;
76 mapIdToPMIDs[5] = 1;
77 mapIdToPMIDs[6] = 2;
78 mapIdToPMIDs[7] = 0;
79
80 TIntToStringMap mapIdToOrgname;
81 mapIdToOrgname[1] = "unknown";
82 mapIdToOrgname[2] = "Gallus gallus";
83 mapIdToOrgname[3] = "Homo sapiens";
84 mapIdToOrgname[4] = "Homo sapiens";
85 mapIdToOrgname[5] = "Gallus gallus";
86 mapIdToOrgname[6] = "Homo sapiens";
87 mapIdToOrgname[7] = "unknown";
88
89 for (geneId = 1; geneId <= 7; geneId++)
90 {
91 listIds.push_back(geneId);
92
93 string strGeneId = NStr::IntToString(geneId);
94 mapIdToInfo[geneId] = CRef<CGeneInfo>(new CGeneInfo(
95 geneId,
96 "GeneID" + strGeneId,
97 "Description text for GeneID" + strGeneId,
98 mapIdToOrgname[geneId],
99 mapIdToPMIDs[geneId]));
100 }
101
102 // Link Gis to Gene IDs
103
104 mapGiToIds.insert(TGiToIntMultimap::value_type(GI_CONST(1), 7));
105 mapGiToIds.insert(TGiToIntMultimap::value_type(GI_CONST(2), 2));
106 mapGiToIds.insert(TGiToIntMultimap::value_type(GI_CONST(2), 3));
107 mapGiToIds.insert(TGiToIntMultimap::value_type(GI_CONST(2), 5));
108 mapGiToIds.insert(TGiToIntMultimap::value_type(GI_CONST(3), 4));
109 mapGiToIds.insert(TGiToIntMultimap::value_type(GI_CONST(4), 4));
110 mapGiToIds.insert(TGiToIntMultimap::value_type(GI_CONST(4), 6));
111 mapGiToIds.insert(TGiToIntMultimap::value_type(GI_CONST(10), 1));
112 mapGiToIds.insert(TGiToIntMultimap::value_type(GI_CONST(11), 1));
113 mapGiToIds.insert(TGiToIntMultimap::value_type(GI_CONST(11), 7));
114 mapGiToIds.insert(TGiToIntMultimap::value_type(GI_CONST(20), 1));
115 // (21, 1), (21, 7) excluded: "Genomic" Gi, multiple IDs
116 mapGiToIds.insert(TGiToIntMultimap::value_type(GI_CONST(30), 2));
117 mapGiToIds.insert(TGiToIntMultimap::value_type(GI_CONST(31), 5));
118 mapGiToIds.insert(TGiToIntMultimap::value_type(GI_CONST(31), 4));
119 mapGiToIds.insert(TGiToIntMultimap::value_type(GI_CONST(32), 3));
120 mapGiToIds.insert(TGiToIntMultimap::value_type(GI_CONST(32), 6));
121 mapGiToIds.insert(TGiToIntMultimap::value_type(GI_CONST(40), 2));
122 mapGiToIds.insert(TGiToIntMultimap::value_type(GI_CONST(41), 2));
123 mapGiToIds.insert(TGiToIntMultimap::value_type(GI_CONST(42), 2));
124 mapGiToIds.insert(TGiToIntMultimap::value_type(GI_CONST(50), 5));
125 // (60, 3), (60, 4) excluded: "Genomic" Gi, multiple IDs
126 mapGiToIds.insert(TGiToIntMultimap::value_type(GI_CONST(61), 4));
127 mapGiToIds.insert(TGiToIntMultimap::value_type(GI_CONST(62), 6));
128
129 listGis.push_back(GI_CONST(1));
130 listGis.push_back(GI_CONST(2));
131 listGis.push_back(GI_CONST(3));
132 listGis.push_back(GI_CONST(4));
133 listGis.push_back(GI_CONST(10));
134 listGis.push_back(GI_CONST(11));
135 listGis.push_back(GI_CONST(20));
136 // listGis.push_back(GI_CONST(21);;
137 listGis.push_back(GI_CONST(30));
138 listGis.push_back(GI_CONST(31));
139 listGis.push_back(GI_CONST(32));
140 listGis.push_back(GI_CONST(40));
141 listGis.push_back(GI_CONST(41));
142 listGis.push_back(GI_CONST(42));
143 listGis.push_back(GI_CONST(50));
144 // listGis.push_back(GI_CONST(60));
145 listGis.push_back(GI_CONST(61));
146 listGis.push_back(GI_CONST(62));
147 }
148
149 static void
s_InitGisWithNoGeneIds(IGeneInfoInput::TGiList & listGis)150 s_InitGisWithNoGeneIds(IGeneInfoInput::TGiList& listGis)
151 {
152 listGis.push_back(GI_CONST(100)); // no gene links
153 listGis.push_back(GI_CONST(60)); // "Genomic" Gi, multiple gene links
154 }
155
// Compare two PubMed link counts by order of magnitude only.
//
// Returns true when both values have the same number of decimal digits
// (zero counts as having no digits), false otherwise.
// NOTE(review): presumably the exact counts are allowed to differ between
// the data and the expectations -- confirm against the reader.
static bool
s_CheckPubMedLinkCount(int nLinks1, int nLinks2)
{
    // Number of times a value can be divided by 10 before it reaches zero.
    const auto countDigits = [](int value) {
        int digits = 0;
        for (; value != 0; value /= 10) {
            ++digits;
        }
        return digits;
    };

    return countDigits(nLinks1) == countDigits(nLinks2);
}
166
167 static void
s_CheckInfoEquality(CRef<CGeneInfo> info1,CRef<CGeneInfo> info2)168 s_CheckInfoEquality(CRef<CGeneInfo> info1,
169 CRef<CGeneInfo> info2)
170 {
171 BOOST_REQUIRE_EQUAL(info1->GetGeneId(),
172 info2->GetGeneId());
173 BOOST_REQUIRE_EQUAL(info1->GetSymbol(),
174 info2->GetSymbol());
175 BOOST_REQUIRE_EQUAL(info1->GetDescription(),
176 info2->GetDescription());
177 BOOST_REQUIRE_EQUAL(info1->GetOrganismName(),
178 info2->GetOrganismName());
179 BOOST_REQUIRE(s_CheckPubMedLinkCount(info1->GetNumPubMedLinks(),
180 info2->GetNumPubMedLinks()));
181 }
182
183 struct SGeneInfoListSorter {
operator ()SGeneInfoListSorter184 bool operator() (const CRef<CGeneInfo>& a,
185 const CRef<CGeneInfo>& b) const
186 {
187 return a->GetGeneId() < b->GetGeneId();
188 }
189 };
190
191 static void
s_SortInfoList(IGeneInfoInput::TGeneInfoList & infoList)192 s_SortInfoList(IGeneInfoInput::TGeneInfoList& infoList)
193 {
194 sort(infoList.begin(), infoList.end(), SGeneInfoListSorter() );
195 }
196
197 static void
s_CheckInfoListEquality(IGeneInfoInput::TGeneInfoList & infoList1,IGeneInfoInput::TGeneInfoList & infoList2)198 s_CheckInfoListEquality(IGeneInfoInput::TGeneInfoList& infoList1,
199 IGeneInfoInput::TGeneInfoList& infoList2)
200 {
201 s_SortInfoList(infoList1);
202 s_SortInfoList(infoList2);
203
204 IGeneInfoInput::TGeneInfoList::iterator it1, it2;
205 for (it1 = infoList1.begin(), it2 = infoList2.begin();
206 it1 != infoList1.end() && it2 != infoList2.end();
207 it1++, it2++)
208 {
209 s_CheckInfoEquality(*it1, *it2);
210 }
211 BOOST_REQUIRE(it1 == infoList1.end() && it2 == infoList2.end());
212 if (it1 != infoList1.end())
213 cout << endl << "Extra info 1: " << **it1 << endl;
214 if (it2 != infoList2.end())
215 cout << endl << "Extra info 2: " << **it2 << endl;
216 }
217
218 static void
s_FillExpectedInfoListForGi(TGi gi,TGiToIntMultimap & mapGiToIds,IGeneInfoInput::TGeneIdToGeneInfoMap & mapIdToInfo,IGeneInfoInput::TGeneInfoList & infoList)219 s_FillExpectedInfoListForGi(TGi gi,
220 TGiToIntMultimap& mapGiToIds,
221 IGeneInfoInput::TGeneIdToGeneInfoMap& mapIdToInfo,
222 IGeneInfoInput::TGeneInfoList& infoList)
223 {
224 // cout << endl << "Gene IDs for Gi=" << gi << ": ";
225 TGiToIntMultimap::iterator itGiToGeneId = mapGiToIds.find(gi);
226 while (itGiToGeneId != mapGiToIds.end() &&
227 itGiToGeneId->first == gi)
228 {
229 int geneId = itGiToGeneId->second;
230 // cout << geneId << " ";
231
232 CRef<CGeneInfo> info = mapIdToInfo[geneId];
233 infoList.push_back(info);
234
235 // cout << endl << *info << endl;
236
237 itGiToGeneId++;
238 }
239 // cout << endl;
240 }
241
242 template<class T> void
s_CheckIntInList(T val,list<T> & listVals)243 s_CheckIntInList(T val, list<T>& listVals)
244 {
245 BOOST_REQUIRE(find(listVals.begin(), listVals.end(), val) != listVals.end());
246 }
247
248 static void
s_CheckGiToGeneConsistency(TGi gi,TGiToIntMultimap & mapGiToIds,CGeneInfoFileReader * pReader)249 s_CheckGiToGeneConsistency(TGi gi,
250 TGiToIntMultimap& mapGiToIds,
251 CGeneInfoFileReader *pReader)
252 {
253 // see if this gi appears in the gi lists for each of its Gene IDs
254
255 TGiToIntMultimap::iterator itGiToGeneId = mapGiToIds.find(gi);
256 while (itGiToGeneId != mapGiToIds.end() &&
257 itGiToGeneId->first == gi)
258 {
259 int geneId = itGiToGeneId->second;
260 // cout << "\nGi's for GeneID=" << geneId << ": ";
261
262 IGeneInfoInput::TGiList giListRNA, giListProtein, giListGenomic;
263 bool bRNA, bProtein, bGenomic;
264 bRNA = pReader->GetRNAGisForGeneId(geneId, giListRNA);
265 bProtein = pReader->GetProteinGisForGeneId(geneId, giListProtein);
266 bGenomic = pReader->GetGenomicGisForGeneId(geneId, giListGenomic);
267 BOOST_REQUIRE(bRNA || bProtein || bGenomic);
268
269 // cout << endl << "\tRNA Gi's: ";
270 // s_OutputList(giListRNA);
271 // cout << endl << "\tProtein Gi's: ";
272 // s_OutputList(giListProtein);
273 // cout << endl << "\tGenomic Gi's: ";
274 // s_OutputList(giListGenomic);
275
276 IGeneInfoInput::TGiList giListAll;
277 copy(giListRNA.begin(), giListRNA.end(),
278 back_inserter(giListAll));
279 copy(giListProtein.begin(), giListProtein.end(),
280 back_inserter(giListAll));
281 copy(giListGenomic.begin(), giListGenomic.end(),
282 back_inserter(giListAll));
283
284 s_CheckIntInList(gi, giListAll);
285
286 itGiToGeneId++;
287 }
288 // cout << endl;
289
290 // see if this gi's Gene IDs appear in the actual Gene ID list
291 // returned by the reader
292
293 IGeneInfoInput::TGeneIdList geneIdsFromReader;
294 BOOST_REQUIRE(pReader->GetGeneIdsForGi(gi, geneIdsFromReader));
295
296 itGiToGeneId = mapGiToIds.find(gi);
297 while (itGiToGeneId != mapGiToIds.end() &&
298 itGiToGeneId->first == gi)
299 {
300 int geneId = itGiToGeneId->second;
301 s_CheckIntInList(geneId, geneIdsFromReader);
302
303 itGiToGeneId++;
304 }
305 }
306
307 //==========================================================================//
308 // Test successful Gi to Gene Info mapping
309 BOOST_AUTO_TEST_SUITE(gene_info)
310
// For every Gi in the test data, both readers must return gene info that
// matches each other and the expected records, and the Gi <-> Gene ID
// mappings must be consistent in both directions.
BOOST_AUTO_TEST_CASE(s_MainInfoReaderTest)
{
    // The readers are constructed after this variable is pointed at the
    // test data directory.
    CNcbiEnvironment env;
    env.Set(GENE_INFO_PATH_ENV_VARIABLE, "data/");

    try
    {
        IGeneInfoInput::TGeneIdList listGeneIds;
        IGeneInfoInput::TGiList listGis;
        TGiToIntMultimap mapGiToIds;
        IGeneInfoInput::TGeneIdToGeneInfoMap mapIdToInfo;

        s_InitTestData(listGeneIds, listGis,
                       mapGiToIds, mapIdToInfo);

        CGeneInfoFileReader *pReader1 = nullptr, *pReader2 = nullptr;
        BOOST_REQUIRE_NO_THROW(s_MakeGeneInfoFileReaders(pReader1, pReader2));
        // unique_ptr rather than auto_ptr: auto_ptr was removed in C++17,
        // and the rest of this file already uses unique_ptr.
        unique_ptr<CGeneInfoFileReader> fileReader1(pReader1);
        unique_ptr<CGeneInfoFileReader> fileReader2(pReader2);

        IGeneInfoInput::TGiList::iterator itGi = listGis.begin();
        for (; itGi != listGis.end(); ++itGi)
        {
            TGi gi = *itGi;

            IGeneInfoInput::TGeneInfoList infoList1, infoList2,
                                          infoListExpected;
            BOOST_REQUIRE(fileReader1->GetGeneInfoForGi(gi, infoList1));
            BOOST_REQUIRE(fileReader2->GetGeneInfoForGi(gi, infoList2));

            s_FillExpectedInfoListForGi(gi, mapGiToIds,
                                        mapIdToInfo, infoListExpected);

            // The readers must agree with each other and with the
            // expected data.
            s_CheckInfoListEquality(infoList1, infoList2);
            s_CheckInfoListEquality(infoList1, infoListExpected);

            s_CheckGiToGeneConsistency(gi, mapGiToIds,
                                       fileReader1.get());
            s_CheckGiToGeneConsistency(gi, mapGiToIds,
                                       fileReader2.get());
        }
    }
    catch (const CException& e)
    {
        // Turn any toolkit exception into a test failure with its message.
        BOOST_FAIL(e.what());
    }
}
360
361 //==========================================================================//
362 // Test Gis that are not mapped to a single Gene Id
363
// For Gis listed by s_InitGisWithNoGeneIds(), both readers must report
// failure and leave the output list empty.
BOOST_AUTO_TEST_CASE(s_GiWithNoGeneIdTest)
{
    // The readers are constructed after this variable is pointed at the
    // test data directory.
    CNcbiEnvironment env;
    env.Set(GENE_INFO_PATH_ENV_VARIABLE, "data/");

    try
    {
        IGeneInfoInput::TGiList listGis;
        s_InitGisWithNoGeneIds(listGis);

        CGeneInfoFileReader *pReader1 = nullptr, *pReader2 = nullptr;
        BOOST_REQUIRE_NO_THROW(s_MakeGeneInfoFileReaders(pReader1, pReader2));
        // unique_ptr rather than auto_ptr: auto_ptr was removed in C++17,
        // and the rest of this file already uses unique_ptr.
        unique_ptr<CGeneInfoFileReader> fileReader1(pReader1);
        unique_ptr<CGeneInfoFileReader> fileReader2(pReader2);

        IGeneInfoInput::TGiList::iterator itGi = listGis.begin();
        for (; itGi != listGis.end(); ++itGi)
        {
            TGi gi = *itGi;

            IGeneInfoInput::TGeneInfoList infoList1, infoList2;
            BOOST_REQUIRE(!fileReader1->GetGeneInfoForGi(gi, infoList1));
            BOOST_REQUIRE(!fileReader2->GetGeneInfoForGi(gi, infoList2));

            BOOST_REQUIRE(infoList1.empty());
            BOOST_REQUIRE(infoList2.empty());
        }
    }
    catch (const CException& e)
    {
        // Turn any toolkit exception into a test failure with its message.
        BOOST_FAIL(e.what());
    }
}
398
399 //==========================================================================//
400 // Test basic functionality of the Gene Info class
401
// Checks CGeneInfo on its own: a default-constructed object is not
// initialized, a fully constructed one returns the values it was given, and
// ToString() produces the expected plain-text and HTML forms.
BOOST_AUTO_TEST_CASE(s_TestGeneInfo)
{
    try
    {
        CGeneInfo info;
        BOOST_REQUIRE(!info.IsInitialized());

        int geneId = 3481;
        string strSymbol = "IGF2";
        string strDescription =
            "insulin-like growth factor 2 (somatomedin A)";
        string strOrganism = "Homo sapiens";
        int nPubMedCount = 100;

        info = CGeneInfo(geneId,
                         strSymbol,
                         strDescription,
                         strOrganism,
                         nPubMedCount);

        // BOOST_REQUIRE_EQUAL (as in s_CheckInfoEquality) reports both
        // values on failure, unlike BOOST_REQUIRE(a == b).
        BOOST_REQUIRE(info.IsInitialized());
        BOOST_REQUIRE_EQUAL(info.GetGeneId(), geneId);
        BOOST_REQUIRE_EQUAL(info.GetSymbol(), strSymbol);
        BOOST_REQUIRE_EQUAL(info.GetDescription(), strDescription);
        BOOST_REQUIRE_EQUAL(info.GetOrganismName(), strOrganism);
        BOOST_REQUIRE_EQUAL(info.GetNumPubMedLinks(), nPubMedCount);

        string strPlain, strHTML;
        BOOST_REQUIRE_NO_THROW(info.ToString(strPlain, false));
        BOOST_REQUIRE_NO_THROW(info.ToString(strHTML, true, "GENE_URL"));

        string strExpectedPlain =
            " GENE ID: 3481 IGF2"
            " | insulin-like growth factor 2 (somatomedin A)"
            "\n[Homo sapiens]"
            " (Over 100 PubMed links)";
        BOOST_REQUIRE_EQUAL(strPlain, strExpectedPlain);

        string strExpectedHTML =
            " <a href=\"GENE_URL\">GENE ID: 3481 IGF2</a>"
            " | insulin-like growth factor 2 (somatomedin A)"
            "\n[Homo sapiens]"
            " <span class=\"Gene_PubMedLinks\">"
            "(Over 100 PubMed links)</span>";
        BOOST_REQUIRE_EQUAL(strHTML, strExpectedHTML);
    }
    catch (const CException& e)
    {
        // Turn any toolkit exception into a test failure with its message.
        BOOST_FAIL(e.what());
    }
}
453
454 //==========================================================================//
455 // Test failed attempts to read Gene Info from an incorrect path
456
// Constructing either reader must throw CGeneInfoException while the path
// variable is set to "./" or "invalid_path". If the variable had a
// non-empty value on entry, that value is put back and both constructions
// must then succeed.
BOOST_AUTO_TEST_CASE(s_IncorrectPathTest)
{
    unique_ptr<CGeneInfoFileReader> pReader1, pReader2;

    CNcbiEnvironment env;
    // Copy the current setting before it is overwritten below.
    const string strDirPath = env.Get(GENE_INFO_PATH_ENV_VARIABLE);

    static const char* const kBadPaths[] = { "./", "invalid_path" };
    for (const char* badPath : kBadPaths) {
        env.Set(GENE_INFO_PATH_ENV_VARIABLE, badPath);
        BOOST_REQUIRE_THROW(pReader1.reset(new CGeneInfoFileReader(true)),
                            CGeneInfoException);
        BOOST_REQUIRE_THROW(pReader1.reset(new CGeneInfoFileReader(false)),
                            CGeneInfoException);
    }

    if (!strDirPath.empty()) {
        env.Set(GENE_INFO_PATH_ENV_VARIABLE, strDirPath);
        BOOST_REQUIRE_NO_THROW(pReader1.reset(new CGeneInfoFileReader(true)));
        BOOST_REQUIRE_NO_THROW(pReader2.reset(new CGeneInfoFileReader(false)));
    }
}
482
483 BOOST_AUTO_TEST_SUITE_END()
484 #endif /* SKIP_DOXYGEN_PROCESSING */
485
486 //==========================================================================//
487