1 /*  $Id: keywords_item.cpp 627647 2021-03-16 18:16:29Z ivanov $
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author:  Mati Shomrat, NCBI
27 *
28 * File Description:
29 *   flat-file generator -- keywords item implementation
30 *
31 */
32 #include <ncbi_pch.hpp>
33 #include <corelib/ncbistd.hpp>
34 #include <objects/seqblock/PIR_block.hpp>
35 #include <objects/seqblock/PRF_block.hpp>
36 #include <objects/seqblock/GB_block.hpp>
37 #include <objects/seqblock/SP_block.hpp>
38 #include <objects/seqblock/EMBL_block.hpp>
39 #include <objects/seq/MolInfo.hpp>
40 #include <objects/seqfeat/BioSource.hpp>
41 #include <objects/seqfeat/SubSource.hpp>
42 #include <objects/seqfeat/Org_ref.hpp>
43 #include <objects/seqfeat/OrgName.hpp>
44 #include <objects/seqfeat/OrgMod.hpp>
45 #include <objmgr/bioseq_ci.hpp>
46 #include <objmgr/seqdesc_ci.hpp>
47 #include <util/static_set.hpp>
48 #include <algorithm>
49 
50 #include <objtools/format/formatter.hpp>
51 #include <objtools/format/text_ostream.hpp>
52 #include <objtools/format/items/keywords_item.hpp>
53 #include <objtools/format/context.hpp>
54 #include <objects/valid/Comment_set.hpp>
55 #include <objects/valid/Comment_rule.hpp>
56 
57 #include <objects/misc/sequence_util_macros.hpp>
58 
59 
60 BEGIN_NCBI_SCOPE
BEGIN_SCOPE(objects)61 BEGIN_SCOPE(objects)
62 
63 
64 CKeywordsItem::CKeywordsItem(CBioseqContext& ctx) :
65     CFlatItem(&ctx)
66 {
67     x_GatherInfo(ctx);
68 }
69 
GetItemType(void) const70 IFlatItem::EItem CKeywordsItem::GetItemType(void) const
71 {
72     return eItem_Keywords;
73 }
74 
Format(IFormatter & formatter,IFlatTextOStream & text_os) const75 void CKeywordsItem::Format
76 (IFormatter& formatter,
77  IFlatTextOStream& text_os) const
78 {
79     formatter.FormatKeywords(*this, text_os);
80 }
81 
82 
83 /***************************************************************************/
84 /*                                  PRIVATE                                */
85 /***************************************************************************/
86 
87 
// Sequencing-technique category used when filtering descriptor keywords
// (see s_CheckSpecialKeyword).
enum ETechFlags {
    e_not_set = 0,  // no EST/STS/GSS technique recorded
    eEST      = 1,
    eSTS      = 2,
    eGSS      = 3
};
94 
95 
96 // EST keywords
97 static const char* const sc_EST[] = {
98   "EST", "EST (expressed sequence tag)", "EST PROTO((expressed sequence tag)",
99   "EST(expressed sequence tag)", "TSR", "UK putts", "expressed sequence tag",
100   "partial cDNA sequence", "putatively transcribed partial sequence",
101   "transcribed sequence fragment"
102 };
103 typedef CStaticArraySet<const char*, PCase_CStr> TStaticKeywordSet;
104 DEFINE_STATIC_ARRAY_MAP(TStaticKeywordSet, sc_EST_kw, sc_EST);
105 
106 
107 // GSS keywords
108 static const char* const sc_GSS[] = {
109   "GSS", "trapped exon"
110 };
111 DEFINE_STATIC_ARRAY_MAP(TStaticKeywordSet, sc_GSS_kw, sc_GSS);
112 
113 // STS keywords
114 static const char* const sc_STS[] = {
115   "STS", "STS (sequence tagged site)", "STS sequence",
116   "STS(sequence tagged site)", "sequence tagged site"
117 };
118 DEFINE_STATIC_ARRAY_MAP(TStaticKeywordSet, sc_STS_kw, sc_STS);
119 
120 
s_CheckSpecialKeyword(const string & keyword,ETechFlags tech)121 static bool s_CheckSpecialKeyword(const string& keyword, ETechFlags tech)
122 {
123     if (tech == eEST) {
124         if (sc_STS_kw.find(keyword.c_str()) != sc_STS_kw.end()) {
125             return false;
126         }
127         if (sc_GSS_kw.find(keyword.c_str()) != sc_GSS_kw.end()) {
128             return false;
129         }
130     }
131 
132     if (tech == eSTS) {
133         if (sc_EST_kw.find(keyword.c_str()) != sc_EST_kw.end()) {
134             return false;
135         }
136         if (sc_GSS_kw.find(keyword.c_str()) != sc_GSS_kw.end()) {
137             return false;
138         }
139     }
140 
141     if (tech == eGSS) {
142         if (sc_EST_kw.find(keyword.c_str()) != sc_EST_kw.end()) {
143             return false;
144         }
145         if (sc_STS_kw.find(keyword.c_str()) != sc_STS_kw.end()) {
146             return false;
147         }
148     }
149 
150     return true;
151 }
152 
153 
x_GatherInfo(CBioseqContext & ctx)154 void CKeywordsItem::x_GatherInfo(CBioseqContext& ctx)
155 {
156     switch( ctx.GetRepr() ) {
157     case CSeq_inst::eRepr_map:
158         x_AddKeyword("Whole_Genome_Map");
159         break;
160     default:
161         // no action needed yet for other types
162         break;
163     }
164 
165     // check if env sample or metagenome_source
166     bool is_env_sample = false;
167     bool is_metagenome_source = false;
168     CSeqdesc_CI src_desc(ctx.GetHandle(), CSeqdesc::e_Source);
169     if (src_desc) {
170         ITERATE(CBioSource::TSubtype, it, src_desc->GetSource().GetSubtype()) {
171             if (! (*it)->IsSetSubtype()) continue;
172             if ((*it)->GetSubtype() == CSubSource::eSubtype_environmental_sample) {
173                 is_env_sample = true;
174             }
175         }
176         if (src_desc->GetSource().IsSetOrg()) {
177             const CBioSource::TOrg& org = src_desc->GetSource().GetOrg();
178             if ( org.IsSetOrgname()) {
179                 ITERATE (COrgName::TMod, it, org.GetOrgname().GetMod()) {
180                     if (! (*it)->IsSetSubtype()) continue;
181                     if ((*it)->GetSubtype() == COrgMod::eSubtype_metagenome_source) {
182                         is_metagenome_source = true;
183                     }
184                 }
185             }
186         }
187     }
188 
189     // we might set this in the mol-info switch statement below
190     bool is_tsa = false;
191 
192     // add keywords based on mol-info
193     ETechFlags tech = e_not_set;
194     // don't do tech-related keywords if molinfo isn't set
195     if( ctx.GetMolinfo() != NULL ) {
196         switch ( ctx.GetTech() ) {
197         case CMolInfo::eTech_est:
198             tech = eEST;
199             x_AddKeyword("EST");
200             if (is_env_sample) {
201                 x_AddKeyword("ENV");
202             }
203             break;
204 
205         case CMolInfo::eTech_sts:
206             tech = eSTS;
207             x_AddKeyword("STS");
208             break;
209 
210         case CMolInfo::eTech_survey:
211             tech = eGSS;
212             x_AddKeyword("GSS");
213             if (is_env_sample) {
214                 x_AddKeyword("ENV");
215             }
216             break;
217 
218         case CMolInfo::eTech_htgs_0:
219             x_AddKeyword("HTG");
220             x_AddKeyword("HTGS_PHASE0");
221             break;
222 
223         case CMolInfo::eTech_htgs_1:
224             x_AddKeyword("HTG");
225             x_AddKeyword("HTGS_PHASE1");
226             break;
227 
228         case CMolInfo::eTech_htgs_2:
229             x_AddKeyword("HTG");
230             x_AddKeyword("HTGS_PHASE2");
231             break;
232 
233         case CMolInfo::eTech_htgs_3:
234             x_AddKeyword("HTG");
235             break;
236 
237         case CMolInfo::eTech_fli_cdna:
238             x_AddKeyword("FLI_CDNA");
239             break;
240 
241         case CMolInfo::eTech_htc:
242             x_AddKeyword("HTC");
243             break;
244 
245         case CMolInfo::eTech_wgs:
246             x_AddKeyword("WGS");
247             break;
248 
249         case CMolInfo::eTech_tsa:
250             x_AddKeyword("TSA");
251             x_AddKeyword("Transcriptome Shotgun Assembly");
252             is_tsa = true; // remember so we don't add it twice
253             break;
254 
255         case CMolInfo::eTech_targeted:
256             x_AddKeyword("TLS");
257             x_AddKeyword("Targeted Locus Study");
258             break;
259 
260         case CMolInfo::eTech_unknown:
261         case CMolInfo::eTech_standard:
262         case CMolInfo::eTech_other:
263             if (is_env_sample) {
264                 x_AddKeyword("ENV");
265             }
266             break;
267 
268         default:
269             break;
270         }
271     }
272 
273     if (is_metagenome_source) {
274         x_AddKeyword("Metagenome Assembled Genome");
275         x_AddKeyword("MAG");
276     }
277 
278     // propagate TSA keyword from nuc to prot in same nuc-prot set
279     if( ! is_tsa && ctx.IsProt() && ctx.IsInNucProt() ) {
280         CBioseq_set_Handle parent_bioseq_set = ctx.GetHandle().GetParentBioseq_set();
281         if( parent_bioseq_set ) {
282             CBioseq_CI bioseq_ci( parent_bioseq_set, CSeq_inst::eMol_na );
283             if( bioseq_ci ) {
284                 CBioseq_Handle nuc = *bioseq_ci;
285                 if( nuc ) {
286                     CSeqdesc_CI desc_ci( nuc, CSeqdesc::e_Molinfo );
287                     for( ; desc_ci; ++desc_ci ) {
288                         if( desc_ci->GetMolinfo().CanGetTech() &&
289                             desc_ci->GetMolinfo().GetTech() == CMolInfo::eTech_tsa )
290                         {
291                             x_AddKeyword("TSA");
292                             x_AddKeyword("Transcriptome Shotgun Assembly");
293                             break;
294                         }
295                     }
296                 }
297             }
298         }
299     }
300 
301     CBioseq_Handle bsh = ctx.GetHandle();
302     for (CSeqdesc_CI di(bsh, CSeqdesc::e_User); di; ++di) {
303         const CUser_object& usr = di->GetUser();
304         if ( ! CComment_rule::IsStructuredComment (usr) ) continue;
305         string pfx = CComment_rule::GetStructuredCommentPrefix ( usr, true );
306         bool is_valid = false;
307         CConstRef<CComment_set> comment_rules = CComment_set::GetCommentRules();
308         if (comment_rules) {
309             CConstRef<CComment_rule> ruler = comment_rules->FindCommentRuleEx(pfx);
310             if (ruler) {
311                 const CComment_rule& rule = *ruler;
312                 CComment_rule::TErrorList errors = rule.IsValid(usr);
313                 if(errors.size() == 0) {
314                     is_valid = true;
315                 }
316             }
317         }
318         if ( is_valid ) {
319             if ( NStr::EqualNocase (pfx, "MIGS:5.0-Data" )) {
320                 x_AddKeyword("GSC:MIxS");
321                 x_AddKeyword("MIGS:5.0.");
322             } else if ( NStr::EqualNocase (pfx, "MIMS:5.0-Data" )) {
323                 x_AddKeyword("GSC:MIxS");
324                 x_AddKeyword("MIMS:5.0.");
325             } else if ( NStr::EqualNocase (pfx, "MIMARKS:5.0-Data" )) {
326                 x_AddKeyword("GSC:MIxS");
327                 x_AddKeyword("MIMARKS:5.0.");
328             } else if ( NStr::EqualNocase (pfx, "MISAG:5.0-Data" )) {
329                 x_AddKeyword("GSC:MIxS");
330                 x_AddKeyword("MISAG:5.0.");
331             } else if ( NStr::EqualNocase (pfx, "MIMAG:5.0-Data" )) {
332                 x_AddKeyword("GSC:MIxS");
333                 x_AddKeyword("MIMAG:5.0.");
334             } else if ( NStr::EqualNocase (pfx, "MIUVIG:5.0-Data" )) {
335                 x_AddKeyword("GSC:MIxS");
336                 x_AddKeyword("MIUVIG:5.0.");
337             }
338         }
339         try {
340             list<string> keywords = CComment_set::GetKeywords(usr);
341             FOR_EACH_STRING_IN_LIST ( s_itr, keywords ) {
342                 x_AddKeyword(*s_itr);
343             }
344         } catch (CException) {
345         }
346     }
347 
348     CBioseqContext::TUnverified unv = ctx.GetUnverifiedType();
349     if ((unv & CBioseqContext::fUnverified_SequenceOrAnnotation) != 0) {
350         x_AddKeyword("UNVERIFIED");
351     }
352     if ((unv & CBioseqContext::fUnverified_Organism) != 0) {
353         x_AddKeyword("UNVERIFIED");
354         x_AddKeyword("UNVERIFIED_ORGANISM");
355     }
356     if ((unv & CBioseqContext::fUnverified_Misassembled) != 0) {
357         x_AddKeyword("UNVERIFIED");
358         x_AddKeyword("UNVERIFIED_MISASSEMBLY");
359     }
360     if ((unv & CBioseqContext::fUnverified_Contaminant) != 0) {
361         x_AddKeyword("UNVERIFIED");
362         x_AddKeyword("UNVERIFIED_CONTAMINANT");
363     }
364 
365     if (ctx.IsEncode()) {
366         x_AddKeyword("ENCODE");
367     }
368 
369     if( ctx.IsGenomeAssembly() && ! ctx.GetFinishingStatus().empty() ) {
370         x_AddKeyword( ctx.GetFinishingStatus() );
371     }
372 
373     if ( ctx.IsTPA() ) {
374         // add TPA keywords
375         x_AddKeyword("Third Party Data");
376         x_AddKeyword("TPA");
377     } else if ( ctx.IsRefSeq() ) {
378         // add RefSeq keyword
379         x_AddKeyword("RefSeq");
380     }
381 
382     if ( ctx.IsCrossKingdom() && ctx.IsRSUniqueProt() ) {
383         // add CrossKingdom keyword
384         x_AddKeyword("CROSS_KINGDOM");
385     }
386 
387     for (CSeqdesc_CI it(ctx.GetHandle());  it;  ++it) {
388         const list<string>* keywords = NULL;
389 
390         switch (it->Which()) {
391 
392         case CSeqdesc::e_Pir:
393             keywords = &(it->GetPir().GetKeywords());
394             break;
395 
396         case CSeqdesc::e_Genbank:
397             keywords = &(it->GetGenbank().GetKeywords());
398             break;
399 
400         case CSeqdesc::e_Sp:
401             keywords = &(it->GetSp().GetKeywords());
402             break;
403 
404         case CSeqdesc::e_Embl:
405             keywords = &(it->GetEmbl().GetKeywords());
406             break;
407 
408         case CSeqdesc::e_Prf:
409             keywords = &(it->GetPrf().GetKeywords());
410             break;
411 
412         default:
413             keywords = NULL;
414             break;
415         }
416 
417         if (keywords != NULL) {
418             if (!IsSetObject()) {
419                 x_SetObject(*it);
420             }
421             ITERATE (list<string>, kwd, *keywords) {
422                 if (s_CheckSpecialKeyword(*kwd, tech)) {
423                     x_AddKeyword(*kwd);
424                 }
425             }
426         }
427     }
428 }
429 
430 
431 // Add a keyword to the list
x_OkayToAddKeyword(const string & keyword,vector<string> keywords)432 static bool x_OkayToAddKeyword(const string& keyword, vector<string> keywords)
433 {
434     ITERATE (vector<string>, it, keywords) {
435         if (NStr::EqualNocase(keyword, *it)) {
436             return false;
437         }
438     }
439     return true;
440 }
x_AddKeyword(const string & keyword)441 void CKeywordsItem::x_AddKeyword(const string& keyword)
442 {
443     list<string> kywds;
444     NStr::Split( keyword, ";", kywds, NStr::fSplit_Tokenize );
445     FOR_EACH_STRING_IN_LIST ( k_itr, kywds ) {
446         const string& kw = *k_itr;
447         if (x_OkayToAddKeyword (kw, m_Keywords)) {
448             m_Keywords.push_back(kw);
449         }
450     }
451 }
452 
453 
454 END_SCOPE(objects)
455 END_NCBI_SCOPE
456