1 /* $Id: keywords_item.cpp 627647 2021-03-16 18:16:29Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Mati Shomrat, NCBI
27 *
28 * File Description:
29 * flat-file generator -- keywords item implementation
30 *
31 */
32 #include <ncbi_pch.hpp>
33 #include <corelib/ncbistd.hpp>
34 #include <objects/seqblock/PIR_block.hpp>
35 #include <objects/seqblock/PRF_block.hpp>
36 #include <objects/seqblock/GB_block.hpp>
37 #include <objects/seqblock/SP_block.hpp>
38 #include <objects/seqblock/EMBL_block.hpp>
39 #include <objects/seq/MolInfo.hpp>
40 #include <objects/seqfeat/BioSource.hpp>
41 #include <objects/seqfeat/SubSource.hpp>
42 #include <objects/seqfeat/Org_ref.hpp>
43 #include <objects/seqfeat/OrgName.hpp>
44 #include <objects/seqfeat/OrgMod.hpp>
45 #include <objmgr/bioseq_ci.hpp>
46 #include <objmgr/seqdesc_ci.hpp>
47 #include <util/static_set.hpp>
48 #include <algorithm>
49
50 #include <objtools/format/formatter.hpp>
51 #include <objtools/format/text_ostream.hpp>
52 #include <objtools/format/items/keywords_item.hpp>
53 #include <objtools/format/context.hpp>
54 #include <objects/valid/Comment_set.hpp>
55 #include <objects/valid/Comment_rule.hpp>
56
57 #include <objects/misc/sequence_util_macros.hpp>
58
59
60 BEGIN_NCBI_SCOPE
BEGIN_SCOPE(objects)61 BEGIN_SCOPE(objects)
62
63
64 CKeywordsItem::CKeywordsItem(CBioseqContext& ctx) :
65 CFlatItem(&ctx)
66 {
67 x_GatherInfo(ctx);
68 }
69
GetItemType(void) const70 IFlatItem::EItem CKeywordsItem::GetItemType(void) const
71 {
72 return eItem_Keywords;
73 }
74
Format(IFormatter & formatter,IFlatTextOStream & text_os) const75 void CKeywordsItem::Format
76 (IFormatter& formatter,
77 IFlatTextOStream& text_os) const
78 {
79 formatter.FormatKeywords(*this, text_os);
80 }
81
82
83 /***************************************************************************/
84 /* PRIVATE */
85 /***************************************************************************/
86
87
// Technique category of the record, used by s_CheckSpecialKeyword() to
// decide which descriptor keywords conflict with the record's technique.
enum ETechFlags {
    e_not_set = 0,  // no EST/STS/GSS technique was recognised
    eEST      = 1,  // set for CMolInfo::eTech_est
    eSTS      = 2,  // set for CMolInfo::eTech_sts
    eGSS      = 3   // set for CMolInfo::eTech_survey
};
94
95
96 // EST keywords
97 static const char* const sc_EST[] = {
98 "EST", "EST (expressed sequence tag)", "EST PROTO((expressed sequence tag)",
99 "EST(expressed sequence tag)", "TSR", "UK putts", "expressed sequence tag",
100 "partial cDNA sequence", "putatively transcribed partial sequence",
101 "transcribed sequence fragment"
102 };
103 typedef CStaticArraySet<const char*, PCase_CStr> TStaticKeywordSet;
104 DEFINE_STATIC_ARRAY_MAP(TStaticKeywordSet, sc_EST_kw, sc_EST);
105
106
107 // GSS keywords
108 static const char* const sc_GSS[] = {
109 "GSS", "trapped exon"
110 };
111 DEFINE_STATIC_ARRAY_MAP(TStaticKeywordSet, sc_GSS_kw, sc_GSS);
112
113 // STS keywords
114 static const char* const sc_STS[] = {
115 "STS", "STS (sequence tagged site)", "STS sequence",
116 "STS(sequence tagged site)", "sequence tagged site"
117 };
118 DEFINE_STATIC_ARRAY_MAP(TStaticKeywordSet, sc_STS_kw, sc_STS);
119
120
s_CheckSpecialKeyword(const string & keyword,ETechFlags tech)121 static bool s_CheckSpecialKeyword(const string& keyword, ETechFlags tech)
122 {
123 if (tech == eEST) {
124 if (sc_STS_kw.find(keyword.c_str()) != sc_STS_kw.end()) {
125 return false;
126 }
127 if (sc_GSS_kw.find(keyword.c_str()) != sc_GSS_kw.end()) {
128 return false;
129 }
130 }
131
132 if (tech == eSTS) {
133 if (sc_EST_kw.find(keyword.c_str()) != sc_EST_kw.end()) {
134 return false;
135 }
136 if (sc_GSS_kw.find(keyword.c_str()) != sc_GSS_kw.end()) {
137 return false;
138 }
139 }
140
141 if (tech == eGSS) {
142 if (sc_EST_kw.find(keyword.c_str()) != sc_EST_kw.end()) {
143 return false;
144 }
145 if (sc_STS_kw.find(keyword.c_str()) != sc_STS_kw.end()) {
146 return false;
147 }
148 }
149
150 return true;
151 }
152
153
x_GatherInfo(CBioseqContext & ctx)154 void CKeywordsItem::x_GatherInfo(CBioseqContext& ctx)
155 {
156 switch( ctx.GetRepr() ) {
157 case CSeq_inst::eRepr_map:
158 x_AddKeyword("Whole_Genome_Map");
159 break;
160 default:
161 // no action needed yet for other types
162 break;
163 }
164
165 // check if env sample or metagenome_source
166 bool is_env_sample = false;
167 bool is_metagenome_source = false;
168 CSeqdesc_CI src_desc(ctx.GetHandle(), CSeqdesc::e_Source);
169 if (src_desc) {
170 ITERATE(CBioSource::TSubtype, it, src_desc->GetSource().GetSubtype()) {
171 if (! (*it)->IsSetSubtype()) continue;
172 if ((*it)->GetSubtype() == CSubSource::eSubtype_environmental_sample) {
173 is_env_sample = true;
174 }
175 }
176 if (src_desc->GetSource().IsSetOrg()) {
177 const CBioSource::TOrg& org = src_desc->GetSource().GetOrg();
178 if ( org.IsSetOrgname()) {
179 ITERATE (COrgName::TMod, it, org.GetOrgname().GetMod()) {
180 if (! (*it)->IsSetSubtype()) continue;
181 if ((*it)->GetSubtype() == COrgMod::eSubtype_metagenome_source) {
182 is_metagenome_source = true;
183 }
184 }
185 }
186 }
187 }
188
189 // we might set this in the mol-info switch statement below
190 bool is_tsa = false;
191
192 // add keywords based on mol-info
193 ETechFlags tech = e_not_set;
194 // don't do tech-related keywords if molinfo isn't set
195 if( ctx.GetMolinfo() != NULL ) {
196 switch ( ctx.GetTech() ) {
197 case CMolInfo::eTech_est:
198 tech = eEST;
199 x_AddKeyword("EST");
200 if (is_env_sample) {
201 x_AddKeyword("ENV");
202 }
203 break;
204
205 case CMolInfo::eTech_sts:
206 tech = eSTS;
207 x_AddKeyword("STS");
208 break;
209
210 case CMolInfo::eTech_survey:
211 tech = eGSS;
212 x_AddKeyword("GSS");
213 if (is_env_sample) {
214 x_AddKeyword("ENV");
215 }
216 break;
217
218 case CMolInfo::eTech_htgs_0:
219 x_AddKeyword("HTG");
220 x_AddKeyword("HTGS_PHASE0");
221 break;
222
223 case CMolInfo::eTech_htgs_1:
224 x_AddKeyword("HTG");
225 x_AddKeyword("HTGS_PHASE1");
226 break;
227
228 case CMolInfo::eTech_htgs_2:
229 x_AddKeyword("HTG");
230 x_AddKeyword("HTGS_PHASE2");
231 break;
232
233 case CMolInfo::eTech_htgs_3:
234 x_AddKeyword("HTG");
235 break;
236
237 case CMolInfo::eTech_fli_cdna:
238 x_AddKeyword("FLI_CDNA");
239 break;
240
241 case CMolInfo::eTech_htc:
242 x_AddKeyword("HTC");
243 break;
244
245 case CMolInfo::eTech_wgs:
246 x_AddKeyword("WGS");
247 break;
248
249 case CMolInfo::eTech_tsa:
250 x_AddKeyword("TSA");
251 x_AddKeyword("Transcriptome Shotgun Assembly");
252 is_tsa = true; // remember so we don't add it twice
253 break;
254
255 case CMolInfo::eTech_targeted:
256 x_AddKeyword("TLS");
257 x_AddKeyword("Targeted Locus Study");
258 break;
259
260 case CMolInfo::eTech_unknown:
261 case CMolInfo::eTech_standard:
262 case CMolInfo::eTech_other:
263 if (is_env_sample) {
264 x_AddKeyword("ENV");
265 }
266 break;
267
268 default:
269 break;
270 }
271 }
272
273 if (is_metagenome_source) {
274 x_AddKeyword("Metagenome Assembled Genome");
275 x_AddKeyword("MAG");
276 }
277
278 // propagate TSA keyword from nuc to prot in same nuc-prot set
279 if( ! is_tsa && ctx.IsProt() && ctx.IsInNucProt() ) {
280 CBioseq_set_Handle parent_bioseq_set = ctx.GetHandle().GetParentBioseq_set();
281 if( parent_bioseq_set ) {
282 CBioseq_CI bioseq_ci( parent_bioseq_set, CSeq_inst::eMol_na );
283 if( bioseq_ci ) {
284 CBioseq_Handle nuc = *bioseq_ci;
285 if( nuc ) {
286 CSeqdesc_CI desc_ci( nuc, CSeqdesc::e_Molinfo );
287 for( ; desc_ci; ++desc_ci ) {
288 if( desc_ci->GetMolinfo().CanGetTech() &&
289 desc_ci->GetMolinfo().GetTech() == CMolInfo::eTech_tsa )
290 {
291 x_AddKeyword("TSA");
292 x_AddKeyword("Transcriptome Shotgun Assembly");
293 break;
294 }
295 }
296 }
297 }
298 }
299 }
300
301 CBioseq_Handle bsh = ctx.GetHandle();
302 for (CSeqdesc_CI di(bsh, CSeqdesc::e_User); di; ++di) {
303 const CUser_object& usr = di->GetUser();
304 if ( ! CComment_rule::IsStructuredComment (usr) ) continue;
305 string pfx = CComment_rule::GetStructuredCommentPrefix ( usr, true );
306 bool is_valid = false;
307 CConstRef<CComment_set> comment_rules = CComment_set::GetCommentRules();
308 if (comment_rules) {
309 CConstRef<CComment_rule> ruler = comment_rules->FindCommentRuleEx(pfx);
310 if (ruler) {
311 const CComment_rule& rule = *ruler;
312 CComment_rule::TErrorList errors = rule.IsValid(usr);
313 if(errors.size() == 0) {
314 is_valid = true;
315 }
316 }
317 }
318 if ( is_valid ) {
319 if ( NStr::EqualNocase (pfx, "MIGS:5.0-Data" )) {
320 x_AddKeyword("GSC:MIxS");
321 x_AddKeyword("MIGS:5.0.");
322 } else if ( NStr::EqualNocase (pfx, "MIMS:5.0-Data" )) {
323 x_AddKeyword("GSC:MIxS");
324 x_AddKeyword("MIMS:5.0.");
325 } else if ( NStr::EqualNocase (pfx, "MIMARKS:5.0-Data" )) {
326 x_AddKeyword("GSC:MIxS");
327 x_AddKeyword("MIMARKS:5.0.");
328 } else if ( NStr::EqualNocase (pfx, "MISAG:5.0-Data" )) {
329 x_AddKeyword("GSC:MIxS");
330 x_AddKeyword("MISAG:5.0.");
331 } else if ( NStr::EqualNocase (pfx, "MIMAG:5.0-Data" )) {
332 x_AddKeyword("GSC:MIxS");
333 x_AddKeyword("MIMAG:5.0.");
334 } else if ( NStr::EqualNocase (pfx, "MIUVIG:5.0-Data" )) {
335 x_AddKeyword("GSC:MIxS");
336 x_AddKeyword("MIUVIG:5.0.");
337 }
338 }
339 try {
340 list<string> keywords = CComment_set::GetKeywords(usr);
341 FOR_EACH_STRING_IN_LIST ( s_itr, keywords ) {
342 x_AddKeyword(*s_itr);
343 }
344 } catch (CException) {
345 }
346 }
347
348 CBioseqContext::TUnverified unv = ctx.GetUnverifiedType();
349 if ((unv & CBioseqContext::fUnverified_SequenceOrAnnotation) != 0) {
350 x_AddKeyword("UNVERIFIED");
351 }
352 if ((unv & CBioseqContext::fUnverified_Organism) != 0) {
353 x_AddKeyword("UNVERIFIED");
354 x_AddKeyword("UNVERIFIED_ORGANISM");
355 }
356 if ((unv & CBioseqContext::fUnverified_Misassembled) != 0) {
357 x_AddKeyword("UNVERIFIED");
358 x_AddKeyword("UNVERIFIED_MISASSEMBLY");
359 }
360 if ((unv & CBioseqContext::fUnverified_Contaminant) != 0) {
361 x_AddKeyword("UNVERIFIED");
362 x_AddKeyword("UNVERIFIED_CONTAMINANT");
363 }
364
365 if (ctx.IsEncode()) {
366 x_AddKeyword("ENCODE");
367 }
368
369 if( ctx.IsGenomeAssembly() && ! ctx.GetFinishingStatus().empty() ) {
370 x_AddKeyword( ctx.GetFinishingStatus() );
371 }
372
373 if ( ctx.IsTPA() ) {
374 // add TPA keywords
375 x_AddKeyword("Third Party Data");
376 x_AddKeyword("TPA");
377 } else if ( ctx.IsRefSeq() ) {
378 // add RefSeq keyword
379 x_AddKeyword("RefSeq");
380 }
381
382 if ( ctx.IsCrossKingdom() && ctx.IsRSUniqueProt() ) {
383 // add CrossKingdom keyword
384 x_AddKeyword("CROSS_KINGDOM");
385 }
386
387 for (CSeqdesc_CI it(ctx.GetHandle()); it; ++it) {
388 const list<string>* keywords = NULL;
389
390 switch (it->Which()) {
391
392 case CSeqdesc::e_Pir:
393 keywords = &(it->GetPir().GetKeywords());
394 break;
395
396 case CSeqdesc::e_Genbank:
397 keywords = &(it->GetGenbank().GetKeywords());
398 break;
399
400 case CSeqdesc::e_Sp:
401 keywords = &(it->GetSp().GetKeywords());
402 break;
403
404 case CSeqdesc::e_Embl:
405 keywords = &(it->GetEmbl().GetKeywords());
406 break;
407
408 case CSeqdesc::e_Prf:
409 keywords = &(it->GetPrf().GetKeywords());
410 break;
411
412 default:
413 keywords = NULL;
414 break;
415 }
416
417 if (keywords != NULL) {
418 if (!IsSetObject()) {
419 x_SetObject(*it);
420 }
421 ITERATE (list<string>, kwd, *keywords) {
422 if (s_CheckSpecialKeyword(*kwd, tech)) {
423 x_AddKeyword(*kwd);
424 }
425 }
426 }
427 }
428 }
429
430
431 // Add a keyword to the list
x_OkayToAddKeyword(const string & keyword,vector<string> keywords)432 static bool x_OkayToAddKeyword(const string& keyword, vector<string> keywords)
433 {
434 ITERATE (vector<string>, it, keywords) {
435 if (NStr::EqualNocase(keyword, *it)) {
436 return false;
437 }
438 }
439 return true;
440 }
x_AddKeyword(const string & keyword)441 void CKeywordsItem::x_AddKeyword(const string& keyword)
442 {
443 list<string> kywds;
444 NStr::Split( keyword, ";", kywds, NStr::fSplit_Tokenize );
445 FOR_EACH_STRING_IN_LIST ( k_itr, kywds ) {
446 const string& kw = *k_itr;
447 if (x_OkayToAddKeyword (kw, m_Keywords)) {
448 m_Keywords.push_back(kw);
449 }
450 }
451 }
452
453
454 END_SCOPE(objects)
455 END_NCBI_SCOPE
456