1 /*  $Id: gene_info_test.cpp 631547 2021-05-19 13:51:35Z ivanov $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author:  Vahram Avagyan
27  *
28  */
29 
30 #include <ncbi_pch.hpp>
31 #include <corelib/ncbiapp.hpp>
32 #include <corelib/ncbienv.hpp>
33 
34 #include <objtools/blast/gene_info_reader/gene_info.hpp>
35 #include <objtools/blast/gene_info_reader/gene_info_reader.hpp>
36 
37 #include <map>
38 
39 #include <corelib/test_boost.hpp>
40 
41 //==========================================================================//
42 
43 #ifndef SKIP_DOXYGEN_PROCESSING
44 
45 USING_NCBI_SCOPE;
46 
47 //==========================================================================//
48 
49 typedef map<int, int> TIntToIntMap;
50 typedef multimap<TGi, int> TGiToIntMultimap;
51 typedef map<int, string> TIntToStringMap;
52 
53 static void
s_MakeGeneInfoFileReaders(CGeneInfoFileReader * & pReader1,CGeneInfoFileReader * & pReader2)54     s_MakeGeneInfoFileReaders(CGeneInfoFileReader*& pReader1,
55                               CGeneInfoFileReader*& pReader2)
56 {
57     pReader1 = new CGeneInfoFileReader(true);
58     pReader2 = new CGeneInfoFileReader(false);
59 }
60 
61 static void
s_InitTestData(IGeneInfoInput::TGeneIdList & listIds,IGeneInfoInput::TGiList & listGis,TGiToIntMultimap & mapGiToIds,IGeneInfoInput::TGeneIdToGeneInfoMap & mapIdToInfo)62     s_InitTestData(IGeneInfoInput::TGeneIdList& listIds,
63                    IGeneInfoInput::TGiList& listGis,
64                    TGiToIntMultimap& mapGiToIds,
65                    IGeneInfoInput::TGeneIdToGeneInfoMap& mapIdToInfo)
66 {
67     int geneId;
68 
69     // Initialize Gene IDs and Gene Infos
70 
71     TIntToIntMap mapIdToPMIDs;
72     mapIdToPMIDs[1] = 1;
73     mapIdToPMIDs[2] = 2;
74     mapIdToPMIDs[3] = 2;
75     mapIdToPMIDs[4] = 1;
76     mapIdToPMIDs[5] = 1;
77     mapIdToPMIDs[6] = 2;
78     mapIdToPMIDs[7] = 0;
79 
80     TIntToStringMap mapIdToOrgname;
81     mapIdToOrgname[1] = "unknown";
82     mapIdToOrgname[2] = "Gallus gallus";
83     mapIdToOrgname[3] = "Homo sapiens";
84     mapIdToOrgname[4] = "Homo sapiens";
85     mapIdToOrgname[5] = "Gallus gallus";
86     mapIdToOrgname[6] = "Homo sapiens";
87     mapIdToOrgname[7] = "unknown";
88 
89     for (geneId = 1; geneId <= 7; geneId++)
90     {
91         listIds.push_back(geneId);
92 
93         string strGeneId = NStr::IntToString(geneId);
94         mapIdToInfo[geneId] = CRef<CGeneInfo>(new CGeneInfo(
95             geneId,
96             "GeneID" + strGeneId,
97             "Description text for GeneID" + strGeneId,
98             mapIdToOrgname[geneId],
99             mapIdToPMIDs[geneId]));
100     }
101 
102     // Link Gis to Gene IDs
103 
104     mapGiToIds.insert(TGiToIntMultimap::value_type(GI_CONST(1), 7));
105     mapGiToIds.insert(TGiToIntMultimap::value_type(GI_CONST(2), 2));
106     mapGiToIds.insert(TGiToIntMultimap::value_type(GI_CONST(2), 3));
107     mapGiToIds.insert(TGiToIntMultimap::value_type(GI_CONST(2), 5));
108     mapGiToIds.insert(TGiToIntMultimap::value_type(GI_CONST(3), 4));
109     mapGiToIds.insert(TGiToIntMultimap::value_type(GI_CONST(4), 4));
110     mapGiToIds.insert(TGiToIntMultimap::value_type(GI_CONST(4), 6));
111     mapGiToIds.insert(TGiToIntMultimap::value_type(GI_CONST(10), 1));
112     mapGiToIds.insert(TGiToIntMultimap::value_type(GI_CONST(11), 1));
113     mapGiToIds.insert(TGiToIntMultimap::value_type(GI_CONST(11), 7));
114     mapGiToIds.insert(TGiToIntMultimap::value_type(GI_CONST(20), 1));
115 //  (21, 1), (21, 7) excluded: "Genomic" Gi, multiple IDs
116     mapGiToIds.insert(TGiToIntMultimap::value_type(GI_CONST(30), 2));
117     mapGiToIds.insert(TGiToIntMultimap::value_type(GI_CONST(31), 5));
118     mapGiToIds.insert(TGiToIntMultimap::value_type(GI_CONST(31), 4));
119     mapGiToIds.insert(TGiToIntMultimap::value_type(GI_CONST(32), 3));
120     mapGiToIds.insert(TGiToIntMultimap::value_type(GI_CONST(32), 6));
121     mapGiToIds.insert(TGiToIntMultimap::value_type(GI_CONST(40), 2));
122     mapGiToIds.insert(TGiToIntMultimap::value_type(GI_CONST(41), 2));
123     mapGiToIds.insert(TGiToIntMultimap::value_type(GI_CONST(42), 2));
124     mapGiToIds.insert(TGiToIntMultimap::value_type(GI_CONST(50), 5));
125 //  (60, 3), (60, 4) excluded: "Genomic" Gi, multiple IDs
126     mapGiToIds.insert(TGiToIntMultimap::value_type(GI_CONST(61), 4));
127     mapGiToIds.insert(TGiToIntMultimap::value_type(GI_CONST(62), 6));
128 
129     listGis.push_back(GI_CONST(1));
130     listGis.push_back(GI_CONST(2));
131     listGis.push_back(GI_CONST(3));
132     listGis.push_back(GI_CONST(4));
133     listGis.push_back(GI_CONST(10));
134     listGis.push_back(GI_CONST(11));
135     listGis.push_back(GI_CONST(20));
136 //    listGis.push_back(GI_CONST(21);;
137     listGis.push_back(GI_CONST(30));
138     listGis.push_back(GI_CONST(31));
139     listGis.push_back(GI_CONST(32));
140     listGis.push_back(GI_CONST(40));
141     listGis.push_back(GI_CONST(41));
142     listGis.push_back(GI_CONST(42));
143     listGis.push_back(GI_CONST(50));
144 //    listGis.push_back(GI_CONST(60));
145     listGis.push_back(GI_CONST(61));
146     listGis.push_back(GI_CONST(62));
147 }
148 
149 static void
s_InitGisWithNoGeneIds(IGeneInfoInput::TGiList & listGis)150     s_InitGisWithNoGeneIds(IGeneInfoInput::TGiList& listGis)
151 {
152     listGis.push_back(GI_CONST(100));         // no gene links
153     listGis.push_back(GI_CONST(60));          // "Genomic" Gi, multiple gene links
154 }
155 
// Compare two PubMed link counts by order of magnitude only.
//
// Both values are repeatedly divided by 10 until at least one of them
// reaches zero.  The counts match when both reach zero on the same step,
// i.e. when they have the same number of decimal digits (two zeros match).
static bool
s_CheckPubMedLinkCount(int nLinks1, int nLinks2)
{
    for (; nLinks1 != 0 && nLinks2 != 0; nLinks1 /= 10, nLinks2 /= 10) {
        // strip one decimal digit from each count per iteration
    }
    // At least one count is zero here, so they are equal only if both are.
    return nLinks1 == 0 && nLinks2 == 0;
}
166 
167 static void
s_CheckInfoEquality(CRef<CGeneInfo> info1,CRef<CGeneInfo> info2)168     s_CheckInfoEquality(CRef<CGeneInfo> info1,
169                         CRef<CGeneInfo> info2)
170 {
171     BOOST_REQUIRE_EQUAL(info1->GetGeneId(),
172                 info2->GetGeneId());
173     BOOST_REQUIRE_EQUAL(info1->GetSymbol(),
174                 info2->GetSymbol());
175     BOOST_REQUIRE_EQUAL(info1->GetDescription(),
176                 info2->GetDescription());
177     BOOST_REQUIRE_EQUAL(info1->GetOrganismName(),
178                 info2->GetOrganismName());
179     BOOST_REQUIRE(s_CheckPubMedLinkCount(info1->GetNumPubMedLinks(),
180                                  info2->GetNumPubMedLinks()));
181 }
182 
183 struct SGeneInfoListSorter {
operator ()SGeneInfoListSorter184     bool operator() (const CRef<CGeneInfo>& a,
185                      const CRef<CGeneInfo>& b) const
186     {
187         return a->GetGeneId() < b->GetGeneId();
188     }
189 };
190 
191 static void
s_SortInfoList(IGeneInfoInput::TGeneInfoList & infoList)192     s_SortInfoList(IGeneInfoInput::TGeneInfoList& infoList)
193 {
194     sort(infoList.begin(), infoList.end(), SGeneInfoListSorter() );
195 }
196 
197 static void
s_CheckInfoListEquality(IGeneInfoInput::TGeneInfoList & infoList1,IGeneInfoInput::TGeneInfoList & infoList2)198     s_CheckInfoListEquality(IGeneInfoInput::TGeneInfoList& infoList1,
199                             IGeneInfoInput::TGeneInfoList& infoList2)
200 {
201     s_SortInfoList(infoList1);
202     s_SortInfoList(infoList2);
203 
204     IGeneInfoInput::TGeneInfoList::iterator it1, it2;
205     for (it1 = infoList1.begin(), it2 = infoList2.begin();
206          it1 != infoList1.end() && it2 != infoList2.end();
207          it1++, it2++)
208     {
209         s_CheckInfoEquality(*it1, *it2);
210     }
211     BOOST_REQUIRE(it1 == infoList1.end() && it2 == infoList2.end());
212     if (it1 != infoList1.end())
213         cout << endl << "Extra info 1: " << **it1 << endl;
214     if (it2 != infoList2.end())
215         cout << endl << "Extra info 2: " << **it2 << endl;
216 }
217 
218 static void
s_FillExpectedInfoListForGi(TGi gi,TGiToIntMultimap & mapGiToIds,IGeneInfoInput::TGeneIdToGeneInfoMap & mapIdToInfo,IGeneInfoInput::TGeneInfoList & infoList)219     s_FillExpectedInfoListForGi(TGi gi,
220                                 TGiToIntMultimap& mapGiToIds,
221                                 IGeneInfoInput::TGeneIdToGeneInfoMap& mapIdToInfo,
222                                 IGeneInfoInput::TGeneInfoList& infoList)
223 {
224     // cout << endl << "Gene IDs for Gi=" << gi << ": ";
225     TGiToIntMultimap::iterator itGiToGeneId = mapGiToIds.find(gi);
226     while (itGiToGeneId != mapGiToIds.end() &&
227            itGiToGeneId->first == gi)
228     {
229         int geneId = itGiToGeneId->second;
230         // cout << geneId << " ";
231 
232         CRef<CGeneInfo> info = mapIdToInfo[geneId];
233         infoList.push_back(info);
234 
235         // cout << endl << *info << endl;
236 
237         itGiToGeneId++;
238     }
239     // cout << endl;
240 }
241 
242 template<class T> void
s_CheckIntInList(T val,list<T> & listVals)243     s_CheckIntInList(T val, list<T>& listVals)
244 {
245     BOOST_REQUIRE(find(listVals.begin(), listVals.end(), val) != listVals.end());
246 }
247 
248 static void
s_CheckGiToGeneConsistency(TGi gi,TGiToIntMultimap & mapGiToIds,CGeneInfoFileReader * pReader)249     s_CheckGiToGeneConsistency(TGi gi,
250                                TGiToIntMultimap& mapGiToIds,
251                                CGeneInfoFileReader *pReader)
252 {
253     // see if this gi appears in the gi lists for each of its Gene IDs
254 
255     TGiToIntMultimap::iterator itGiToGeneId = mapGiToIds.find(gi);
256     while (itGiToGeneId != mapGiToIds.end() &&
257            itGiToGeneId->first == gi)
258     {
259         int geneId = itGiToGeneId->second;
260         // cout << "\nGi's for GeneID=" << geneId << ": ";
261 
262         IGeneInfoInput::TGiList giListRNA, giListProtein, giListGenomic;
263         bool bRNA, bProtein, bGenomic;
264         bRNA     = pReader->GetRNAGisForGeneId(geneId, giListRNA);
265         bProtein = pReader->GetProteinGisForGeneId(geneId, giListProtein);
266         bGenomic = pReader->GetGenomicGisForGeneId(geneId, giListGenomic);
267         BOOST_REQUIRE(bRNA || bProtein || bGenomic);
268 
269         // cout << endl << "\tRNA Gi's: ";
270         // s_OutputList(giListRNA);
271         // cout << endl << "\tProtein Gi's: ";
272         // s_OutputList(giListProtein);
273         // cout << endl << "\tGenomic Gi's: ";
274         // s_OutputList(giListGenomic);
275 
276         IGeneInfoInput::TGiList giListAll;
277         copy(giListRNA.begin(), giListRNA.end(),
278                 back_inserter(giListAll));
279         copy(giListProtein.begin(), giListProtein.end(),
280                 back_inserter(giListAll));
281         copy(giListGenomic.begin(), giListGenomic.end(),
282                 back_inserter(giListAll));
283 
284         s_CheckIntInList(gi, giListAll);
285 
286         itGiToGeneId++;
287     }
288     // cout << endl;
289 
290     // see if this gi's Gene IDs appear in the actual Gene ID list
291     // returned by the reader
292 
293     IGeneInfoInput::TGeneIdList geneIdsFromReader;
294     BOOST_REQUIRE(pReader->GetGeneIdsForGi(gi, geneIdsFromReader));
295 
296     itGiToGeneId = mapGiToIds.find(gi);
297     while (itGiToGeneId != mapGiToIds.end() &&
298            itGiToGeneId->first == gi)
299     {
300         int geneId = itGiToGeneId->second;
301         s_CheckIntInList(geneId, geneIdsFromReader);
302 
303         itGiToGeneId++;
304     }
305 }
306 
307 //==========================================================================//
308 // Test successful Gi to Gene Info mapping
309 BOOST_AUTO_TEST_SUITE(gene_info)
310 
// For every Gi in the expected data set: both readers must return Gene
// Info, the two readers must agree with each other and with the expected
// data, and the Gi <-> Gene ID mappings must be consistent for each reader.
// The gene info data files are looked up under "data/".
BOOST_AUTO_TEST_CASE(s_MainInfoReaderTest)
{
    CNcbiEnvironment env;
    env.Set(GENE_INFO_PATH_ENV_VARIABLE, "data/");

    try
    {
        IGeneInfoInput::TGeneIdList listGeneIds;
        IGeneInfoInput::TGiList listGis;
        TGiToIntMultimap mapGiToIds;
        IGeneInfoInput::TGeneIdToGeneInfoMap mapIdToInfo;

        s_InitTestData(listGeneIds, listGis,
                       mapGiToIds, mapIdToInfo);

        CGeneInfoFileReader *pReader1 = nullptr, *pReader2 = nullptr;
        BOOST_REQUIRE_NO_THROW(s_MakeGeneInfoFileReaders(pReader1, pReader2));
        // unique_ptr rather than auto_ptr: auto_ptr is deprecated and was
        // removed in C++17.
        unique_ptr<CGeneInfoFileReader> fileReader1(pReader1);
        unique_ptr<CGeneInfoFileReader> fileReader2(pReader2);

        for (IGeneInfoInput::TGiList::iterator itGi = listGis.begin();
             itGi != listGis.end();  ++itGi)
        {
            const TGi gi = *itGi;

            IGeneInfoInput::TGeneInfoList infoList1, infoList2,
                                          infoListExpected;
            BOOST_REQUIRE(fileReader1->GetGeneInfoForGi(gi, infoList1));
            BOOST_REQUIRE(fileReader2->GetGeneInfoForGi(gi, infoList2));

            s_FillExpectedInfoListForGi(gi, mapGiToIds,
                                        mapIdToInfo, infoListExpected);

            s_CheckInfoListEquality(infoList1, infoList2);
            s_CheckInfoListEquality(infoList1, infoListExpected);

            s_CheckGiToGeneConsistency(gi, mapGiToIds,
                                       fileReader1.get());
            s_CheckGiToGeneConsistency(gi, mapGiToIds,
                                       fileReader2.get());
        }
    }
    catch (const CException& e)
    {
        // Report toolkit exceptions as a test failure with their message.
        BOOST_FAIL(e.what());
    }
}
360 
361 //==========================================================================//
362 // Test Gis that are not mapped to a single Gene Id
363 
// For every Gi supplied by s_InitGisWithNoGeneIds, both readers must
// report failure and leave the output list empty.
// The gene info data files are looked up under "data/".
BOOST_AUTO_TEST_CASE(s_GiWithNoGeneIdTest)
{
    CNcbiEnvironment env;
    env.Set(GENE_INFO_PATH_ENV_VARIABLE, "data/");

    try
    {
        IGeneInfoInput::TGiList listGis;
        s_InitGisWithNoGeneIds(listGis);

        CGeneInfoFileReader *pReader1 = nullptr, *pReader2 = nullptr;
        BOOST_REQUIRE_NO_THROW(s_MakeGeneInfoFileReaders(pReader1, pReader2));
        // unique_ptr rather than auto_ptr: auto_ptr is deprecated and was
        // removed in C++17.
        unique_ptr<CGeneInfoFileReader> fileReader1(pReader1);
        unique_ptr<CGeneInfoFileReader> fileReader2(pReader2);

        for (IGeneInfoInput::TGiList::iterator itGi = listGis.begin();
             itGi != listGis.end();  ++itGi)
        {
            const TGi gi = *itGi;

            IGeneInfoInput::TGeneInfoList infoList1, infoList2;
            BOOST_REQUIRE(!fileReader1->GetGeneInfoForGi(gi, infoList1));
            BOOST_REQUIRE(!fileReader2->GetGeneInfoForGi(gi, infoList2));

            BOOST_REQUIRE(infoList1.empty());
            BOOST_REQUIRE(infoList2.empty());
        }
    }
    catch (const CException& e)
    {
        // Report toolkit exceptions as a test failure with their message.
        BOOST_FAIL(e.what());
    }
}
398 
399 //==========================================================================//
400 // Test basic functionality of the Gene Info class
401 
// Exercise CGeneInfo on its own: a default-constructed object is
// uninitialized; after assignment from a fully specified object the
// accessors return the supplied values, and ToString produces the expected
// plain-text and HTML renderings.
BOOST_AUTO_TEST_CASE(s_TestGeneInfo)
{
    try
    {
        CGeneInfo info;
        BOOST_REQUIRE(!info.IsInitialized());

        const int geneId = 3481;
        const string strSymbol = "IGF2";
        const string strDescription =
            "insulin-like growth factor 2 (somatomedin A)";
        const string strOrganism = "Homo sapiens";
        const int nPubMedCount = 100;

        info = CGeneInfo(geneId,
                         strSymbol,
                         strDescription,
                         strOrganism,
                         nPubMedCount);

        // BOOST_REQUIRE_EQUAL (rather than BOOST_REQUIRE(a == b)) so that
        // a failure reports both the actual and the expected value.
        BOOST_REQUIRE(info.IsInitialized());
        BOOST_REQUIRE_EQUAL(info.GetGeneId(), geneId);
        BOOST_REQUIRE_EQUAL(info.GetSymbol(), strSymbol);
        BOOST_REQUIRE_EQUAL(info.GetDescription(), strDescription);
        BOOST_REQUIRE_EQUAL(info.GetOrganismName(), strOrganism);
        BOOST_REQUIRE_EQUAL(info.GetNumPubMedLinks(), nPubMedCount);

        string strPlain, strHTML;
        BOOST_REQUIRE_NO_THROW(info.ToString(strPlain, false));
        BOOST_REQUIRE_NO_THROW(info.ToString(strHTML, true, "GENE_URL"));

        const string strExpectedPlain =
                   " GENE ID: 3481 IGF2"
                   " | insulin-like growth factor 2 (somatomedin A)"
                   "\n[Homo sapiens]"
                   " (Over 100 PubMed links)";
        BOOST_REQUIRE_EQUAL(strPlain, strExpectedPlain);

        const string strExpectedHTML =
                   " <a href=\"GENE_URL\">GENE ID: 3481 IGF2</a>"
                   " | insulin-like growth factor 2 (somatomedin A)"
                   "\n[Homo sapiens]"
                   " <span class=\"Gene_PubMedLinks\">"
                     "(Over 100 PubMed links)</span>";
        BOOST_REQUIRE_EQUAL(strHTML, strExpectedHTML);
    }
    catch (const CException& e)
    {
        // Report toolkit exceptions as a test failure with their message.
        BOOST_FAIL(e.what());
    }
}
453 
454 //==========================================================================//
455 // Test failed attempts to read Gene Info from an incorrect path
456 
// Constructing either kind of reader must throw CGeneInfoException while
// the gene info path variable is set to "./" or to "invalid_path".
// If the variable held a non-empty value when the test started, that value
// is then put back and both readers must construct without throwing.
BOOST_AUTO_TEST_CASE(s_IncorrectPathTest)
{
    unique_ptr<CGeneInfoFileReader> pReader1, pReader2;

    CNcbiEnvironment env;
    // Remember the incoming setting before it is overwritten below.
    string strDirPath = env.Get(GENE_INFO_PATH_ENV_VARIABLE);

    const char* const kBadPaths[] = { "./", "invalid_path" };
    for (const char* badPath : kBadPaths) {
        env.Set(GENE_INFO_PATH_ENV_VARIABLE, badPath);
        BOOST_REQUIRE_THROW(pReader1.reset(new CGeneInfoFileReader(true)),
                            CGeneInfoException);
        BOOST_REQUIRE_THROW(pReader1.reset(new CGeneInfoFileReader(false)),
                            CGeneInfoException);
    }

    // Nothing to restore or re-check when the variable was initially empty.
    if (strDirPath.empty()) {
        return;
    }

    env.Set(GENE_INFO_PATH_ENV_VARIABLE, strDirPath);
    BOOST_REQUIRE_NO_THROW(pReader1.reset(new CGeneInfoFileReader(true)));
    BOOST_REQUIRE_NO_THROW(pReader2.reset(new CGeneInfoFileReader(false)));
}
482 
483 BOOST_AUTO_TEST_SUITE_END()
484 #endif /* SKIP_DOXYGEN_PROCESSING */
485 
486 //==========================================================================//
487