1 #include <ncbi_pch.hpp>
2 #include <objmgr/util/sequence.hpp>
3 #include <objects/general/Dbtag.hpp>
4 #include <objects/general/Object_id.hpp>
5 #include <objtools/readers/hgvs/id_resolver.hpp>
6 #include <objtools/readers/hgvs/irep_to_seqfeat_errors.hpp>
7
8 BEGIN_NCBI_SCOPE
BEGIN_SCOPE(objects)9 BEGIN_SCOPE(objects)
10
11 CIdResolver::CIdResolver(CScope& scope) : m_Scope(scope),
12 m_LRGregex(new CRegexp("^(LRG_\\d+)([pt]\\d+)?$")),
13 m_CCDSregex(new CRegexp("^CCDS\\d+\\.\\d+$")),
14 m_E2Client(new CEntrez2Client())
15 {
16 }
17
GetAccessionVersion(const string & identifier) const18 CSeq_id_Handle CIdResolver::GetAccessionVersion(const string& identifier) const
19 {
20
21 if (NStr::IsBlank(identifier)) {
22 // LCOV_EXCL_START - This error will almost certainly be picked up by the parser first
23 NCBI_THROW(CVariationValidateException,
24 eIDResolveError,
25 "Empty sequence identifier string");
26 // LCOV_EXCL_STOP
27 }
28
29 CSeq_id_Handle idh;
30
31 if (x_TryProcessLRG(identifier, idh)) {
32 return idh;
33 }
34 /*
35 // Need to add code to handle CCDS identifier
36 // Waiting for the appropriate service to be put in place
37 */
38
39 try {
40 auto temp_idh = CSeq_id_Handle::GetHandle(identifier);
41 idh = sequence::GetId(temp_idh,
42 m_Scope,
43 sequence::eGetId_ForceAcc | sequence::eGetId_ThrowOnError);
44 }
45 catch (...) {}
46
47 if (!idh) { // Failed to resolve id
48 NCBI_THROW(CVariationValidateException,
49 eIDResolveError,
50 "Could not resolve sequence identifier: " + identifier);
51 }
52
53 return idh;
54 }
55
56
x_TryProcessGenomicLRG(const string & identifier,CSeq_id_Handle & idh) const57 bool CIdResolver::x_TryProcessGenomicLRG(const string& identifier, CSeq_id_Handle& idh) const
58 {
59 if (NStr::IsBlank(identifier)) {
60 return false;
61 }
62
63 if (!x_LooksLikeLRG(identifier)) {
64 return false; // LCOV_EXCL_LINE - Don't expect this to occur.
65 }
66
67 CRef<CSeq_id> lrg_seqid;
68
69 try {
70 lrg_seqid = Ref(new CSeq_id("gnl|LRG|" + identifier));
71 }
72 // LCOV_EXCL_START
73 catch(...) {
74 return false;
75 }
76 // LCOV_EXCL_STOP
77
78 idh = sequence::GetId(*lrg_seqid,
79 m_Scope,
80 sequence::eGetId_ForceAcc | sequence::eGetId_ThrowOnError);
81
82 return true;
83 }
84
85
86
x_TryProcessLRG(const string & identifier,CSeq_id_Handle & idh) const87 bool CIdResolver::x_TryProcessLRG(const string& identifier, CSeq_id_Handle& idh) const
88 {
89 m_LRGregex->IsMatch(identifier);
90
91 string genome_id = m_LRGregex->GetSub(identifier,1);
92 string lrg_product_id = m_LRGregex->GetSub(identifier,2);
93
94 CSeq_id_Handle genome_idh;
95
96 // The following routine calls CRegexp::IsMatch, which
97 // resets all results from previous GetMatch() calls.
98 // This is why we fetch the lrg_product_id above.
99 if (!x_TryProcessGenomicLRG(genome_id, genome_idh)) {
100 return false;
101 }
102
103 if (NStr::IsBlank(lrg_product_id)) { // Nothing more to do here.
104 idh = genome_idh; // No need to search for products.
105 return true;
106 }
107
108
109 SAnnotSelector selector;
110 selector.SetResolveTSE();
111 selector.IncludeFeatType(CSeqFeatData::e_Gene);
112 selector.IncludeFeatType(CSeqFeatData::e_Rna);
113 selector.IncludeFeatType(CSeqFeatData::e_Cdregion);
114
115 auto bsh = m_Scope.GetBioseqHandle(genome_idh);
116
117 if (!bsh) { // Throw an exception
118 // LCOV_EXCL_START - could not find test case to trigger this exception.
119 NCBI_THROW(CVariationValidateException,
120 eIDResolveError,
121 "Could not find Bioseq for identifier : " + genome_id);
122 // LCOV_EXCL_STOP
123 }
124
125
126 for (CFeat_CI ci(bsh,selector); ci; ++ci) {
127 const auto& mapped_feat = *ci;
128 if (!mapped_feat.IsSetDbxref()) {
129 continue; // LCOV_EXCL_LINE
130 }
131 ITERATE(CSeq_feat::TDbxref, it, mapped_feat.GetDbxref()) {
132 const auto& dbtag = **it;
133 if (NStr::Equal(dbtag.GetDb(), "LRG") &&
134 dbtag.GetTag().IsStr() &&
135 NStr::Equal(dbtag.GetTag().GetStr(), lrg_product_id) &&
136 mapped_feat.IsSetProduct() &&
137 mapped_feat.GetProduct().GetId())
138 {
139 try {
140 idh = sequence::GetId(*mapped_feat.GetProduct().GetId(),
141 m_Scope,
142 sequence::eGetId_ForceAcc);
143 return true;
144 }
145 // LCOV_EXCL_START - could not find test case to trigger the exception
146 catch (...)
147 {
148 break;
149 }
150 }
151 }
152 }
153 NCBI_THROW(CVariationValidateException,
154 eIDResolveError,
155 "Could not find SeqId for : " + identifier);
156
157 return false;
158 }
159 // LCOV_EXCL_STOP
160
161
162 /*
163 CSeq_id_Handle CIdResolver::x_ProcessCCDS(const string& identifier)
164 {
165 //string query_string = "srcdb_refseq[prop] AND biomol_mRNA[prop] AND dbxref_ccds[prop] AND \"CCDS:" + identifier + "\"";
166 string query_string = "srcdb_refseq[prop] AND dbxref_ccds[prop] AND \"CCDS:" + identifier + "\"";
167 //string query_string = "srcdb_refseq[prop] AND \"CCDS:" + identifier + "\"";
168
169 query_string = "srcdb_refseq[prop] AND dbxref_ccds[prop] AND CCDS4702";
170 // query_string = "dbxref_ccds[prop] AND CCDS4702";
171
172 vector<TGi> gi;
173 const size_t start_offs = 0;
174 const size_t count = 5;
175 m_E2Client->Query(query_string, "protein", gi, start_offs, count);
176
177 if (gi.size() != 1) {
178 // Throw an exception
179 }
180
181
182 for(int i=0; i<gi.size(); ++i) {
183 auto gih = CSeq_id_Handle::GetHandle(gi[i]);
184
185 auto idh = sequence::GetId(gih,
186 m_Scope,
187 sequence::eGetId_ForceAcc);
188
189 }
190 CSeq_id_Handle idh;
191 return idh;
192 }
193 */
194
x_LooksLikeLRG(const string & identifier) const195 bool CIdResolver::x_LooksLikeLRG(const string& identifier) const
196 {
197 return m_LRGregex->IsMatch(identifier);
198 }
199
200 // LCOV_EXCL_START - exclude from code coverage until CCDS is supported
x_LooksLikeCCDS(const string & identifier) const201 bool CIdResolver::x_LooksLikeCCDS(const string& identifier) const
202 {
203 return m_CCDSregex->IsMatch(identifier);
204 }
205 // LCOV_EXCL_STOP
206
207
208 END_SCOPE(objects)
209 END_NCBI_SCOPE
210
211