1 #include <ncbi_pch.hpp>
2 #include <objmgr/util/sequence.hpp>
3 #include <objects/general/Dbtag.hpp>
4 #include <objects/general/Object_id.hpp>
5 #include <objtools/readers/hgvs/id_resolver.hpp>
6 #include <objtools/readers/hgvs/irep_to_seqfeat_errors.hpp>
7 
8 BEGIN_NCBI_SCOPE
BEGIN_SCOPE(objects)9 BEGIN_SCOPE(objects)
10 
11 CIdResolver::CIdResolver(CScope& scope)  : m_Scope(scope),
12     m_LRGregex(new CRegexp("^(LRG_\\d+)([pt]\\d+)?$")),
13     m_CCDSregex(new CRegexp("^CCDS\\d+\\.\\d+$")),
14     m_E2Client(new CEntrez2Client())
15 {
16 }
17 
GetAccessionVersion(const string & identifier) const18 CSeq_id_Handle CIdResolver::GetAccessionVersion(const string& identifier) const
19 {
20 
21     if (NStr::IsBlank(identifier)) {
22 // LCOV_EXCL_START - This error will almost certainly be picked up by the parser first
23         NCBI_THROW(CVariationValidateException,
24                    eIDResolveError,
25                    "Empty sequence identifier string");
26 // LCOV_EXCL_STOP
27     }
28 
29     CSeq_id_Handle idh;
30 
31     if (x_TryProcessLRG(identifier, idh)) {
32         return idh;
33     }
34 /*
35     // Need to add code to handle CCDS identifier
36     // Waiting for the appropriate service to be put in place
37 */
38 
39     try {
40         auto temp_idh = CSeq_id_Handle::GetHandle(identifier);
41         idh = sequence::GetId(temp_idh,
42                               m_Scope,
43                               sequence::eGetId_ForceAcc | sequence::eGetId_ThrowOnError);
44     }
45     catch (...) {}
46 
47     if (!idh) { // Failed to resolve id
48         NCBI_THROW(CVariationValidateException,
49                    eIDResolveError,
50                    "Could not resolve sequence identifier: " + identifier);
51     }
52 
53     return idh;
54 }
55 
56 
x_TryProcessGenomicLRG(const string & identifier,CSeq_id_Handle & idh) const57 bool CIdResolver::x_TryProcessGenomicLRG(const string& identifier, CSeq_id_Handle& idh) const
58 {
59     if (NStr::IsBlank(identifier)) {
60         return false;
61     }
62 
63     if (!x_LooksLikeLRG(identifier)) {
64         return false; // LCOV_EXCL_LINE - Don't expect this to occur.
65     }
66 
67     CRef<CSeq_id> lrg_seqid;
68 
69     try {
70         lrg_seqid = Ref(new CSeq_id("gnl|LRG|" + identifier));
71     }
72 // LCOV_EXCL_START
73     catch(...) {
74         return false;
75     }
76 // LCOV_EXCL_STOP
77 
78     idh = sequence::GetId(*lrg_seqid,
79                           m_Scope,
80                           sequence::eGetId_ForceAcc | sequence::eGetId_ThrowOnError);
81 
82     return true;
83 }
84 
85 
86 
x_TryProcessLRG(const string & identifier,CSeq_id_Handle & idh) const87 bool CIdResolver::x_TryProcessLRG(const string& identifier, CSeq_id_Handle& idh) const
88 {
89     m_LRGregex->IsMatch(identifier);
90 
91     string genome_id = m_LRGregex->GetSub(identifier,1);
92     string lrg_product_id = m_LRGregex->GetSub(identifier,2);
93 
94     CSeq_id_Handle genome_idh;
95 
96     // The following routine calls CRegexp::IsMatch, which
97     // resets all results from previous GetMatch() calls.
98     // This is why we fetch the lrg_product_id above.
99     if (!x_TryProcessGenomicLRG(genome_id, genome_idh)) {
100         return false;
101     }
102 
103     if (NStr::IsBlank(lrg_product_id)) { // Nothing more to do here.
104       idh = genome_idh;                  // No need to search for products.
105       return true;
106     }
107 
108 
109     SAnnotSelector selector;
110     selector.SetResolveTSE();
111     selector.IncludeFeatType(CSeqFeatData::e_Gene);
112     selector.IncludeFeatType(CSeqFeatData::e_Rna);
113     selector.IncludeFeatType(CSeqFeatData::e_Cdregion);
114 
115     auto bsh = m_Scope.GetBioseqHandle(genome_idh);
116 
117     if (!bsh) { // Throw an exception
118 // LCOV_EXCL_START - could not find test case to trigger this exception.
119         NCBI_THROW(CVariationValidateException,
120                    eIDResolveError,
121                    "Could not find Bioseq for identifier : " + genome_id);
122 // LCOV_EXCL_STOP
123     }
124 
125 
126     for (CFeat_CI ci(bsh,selector); ci; ++ci) {
127         const auto& mapped_feat = *ci;
128         if (!mapped_feat.IsSetDbxref()) {
129             continue; // LCOV_EXCL_LINE
130         }
131         ITERATE(CSeq_feat::TDbxref, it, mapped_feat.GetDbxref()) {
132             const auto& dbtag = **it;
133             if (NStr::Equal(dbtag.GetDb(), "LRG") &&
134                 dbtag.GetTag().IsStr() &&
135                 NStr::Equal(dbtag.GetTag().GetStr(), lrg_product_id) &&
136                 mapped_feat.IsSetProduct() &&
137                 mapped_feat.GetProduct().GetId())
138             {
139                 try {
140                     idh = sequence::GetId(*mapped_feat.GetProduct().GetId(),
141                                            m_Scope,
142                                            sequence::eGetId_ForceAcc);
143                    return true;
144                 }
145 // LCOV_EXCL_START - could not find test case to trigger the exception
146                 catch (...)
147                 {
148                     break;
149                 }
150             }
151         }
152     }
153     NCBI_THROW(CVariationValidateException,
154                eIDResolveError,
155                "Could not find SeqId for : " + identifier);
156 
157     return false;
158 }
159 // LCOV_EXCL_STOP
160 
161 
162 /*
163 CSeq_id_Handle CIdResolver::x_ProcessCCDS(const string& identifier)
164 {
165     //string query_string = "srcdb_refseq[prop] AND biomol_mRNA[prop] AND dbxref_ccds[prop] AND \"CCDS:" + identifier + "\"";
166     string query_string = "srcdb_refseq[prop] AND dbxref_ccds[prop] AND \"CCDS:" + identifier + "\"";
167     //string query_string = "srcdb_refseq[prop] AND \"CCDS:" + identifier + "\"";
168 
169     query_string = "srcdb_refseq[prop] AND dbxref_ccds[prop] AND CCDS4702";
170    // query_string = "dbxref_ccds[prop] AND CCDS4702";
171 
172     vector<TGi> gi;
173     const size_t start_offs = 0;
174     const size_t count = 5;
175     m_E2Client->Query(query_string, "protein", gi, start_offs, count);
176 
177     if (gi.size() != 1) {
178         // Throw an exception
179     }
180 
181 
182     for(int i=0; i<gi.size(); ++i) {
183         auto gih = CSeq_id_Handle::GetHandle(gi[i]);
184 
185         auto idh = sequence::GetId(gih,
186                                m_Scope,
187                                sequence::eGetId_ForceAcc);
188 
189     }
190     CSeq_id_Handle idh;
191     return idh;
192 }
193 */
194 
x_LooksLikeLRG(const string & identifier) const195 bool CIdResolver::x_LooksLikeLRG(const string& identifier) const
196 {
197     return m_LRGregex->IsMatch(identifier);
198 }
199 
200 // LCOV_EXCL_START - exclude from code coverage until CCDS is supported
x_LooksLikeCCDS(const string & identifier) const201 bool CIdResolver::x_LooksLikeCCDS(const string& identifier) const
202 {
203     return  m_CCDSregex->IsMatch(identifier);
204 }
205 // LCOV_EXCL_STOP
206 
207 
208 END_SCOPE(objects)
209 END_NCBI_SCOPE
210 
211