1 /*  $Id: suspect_product_names.cpp 637427 2021-09-13 13:13:02Z ivanov $
2  * =========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * =========================================================================
25  *
26  * Authors: Sema Kachalo
27  *
28  */
29 
30 #include <ncbi_pch.hpp>
31 #include <corelib/ncbistd.hpp>
32 #include <corelib/ncbimisc.hpp>
33 #include <corelib/ncbi_autoinit.hpp>
34 #include "discrepancy_core.hpp"
35 #include "utils.hpp"
36 #include <objects/macro/Constraint_choice.hpp>
37 #include <objects/macro/Constraint_choice_set.hpp>
38 #include <objects/macro/Replace_func.hpp>
39 #include <objects/macro/Search_func.hpp>
40 #include <objects/macro/Simple_replace.hpp>
41 #include <objects/macro/Suspect_rule_set.hpp>
42 #include <objects/macro/Suspect_rule.hpp>
43 #include <objects/misc/sequence_util_macros.hpp>
44 #include <objects/seqfeat/Org_ref.hpp>
45 #include <objects/seqfeat/OrgMod.hpp>
46 #include <objects/seqfeat/OrgName.hpp>
47 #include <objects/seqfeat/RNA_gen.hpp>
48 #include <objects/seqfeat/RNA_qual.hpp>
49 #include <objects/seqfeat/RNA_qual_set.hpp>
50 #include <objects/seqfeat/RNA_ref.hpp>
51 #include <objects/seqfeat/SeqFeatData.hpp>
52 #include <objects/seqfeat/SubSource.hpp>
53 #include <objects/seqfeat/Trna_ext.hpp>
54 #include <objmgr/util/sequence.hpp>
55 #include <serial/objistrasn.hpp>
56 
// Everything from here down to the END_SCOPE(NDiscrepancy) / END_NCBI_SCOPE
// pair at the bottom of this file is declared inside these scopes.
57 BEGIN_NCBI_SCOPE
58 BEGIN_SCOPE(NDiscrepancy)
59 USING_SCOPE(objects);
60 
// NOTE(review): the macro is defined outside this file (presumably in
// discrepancy_core.hpp); it appears to declare this translation unit as the
// "suspect_product_names" test module -- confirm against the macro definition.
61 DISCREPANCY_MODULE(suspect_product_names);
62 
63 
/// Extract one half of a colon-separated "first:second" value.
///
/// @param str
///   Text to split at its first ':' character.
/// @param subfield
///   0 - return str unchanged;
///   1 - return the text before the first ':' (all of str if there is no ':');
///   2 - return the text after the first ':' (empty if there is no ':').
/// @return
///   The requested piece. An empty string is returned when str is empty,
///   when subfield is greater than 2, or when the requested piece is absent.
string GetTwoFieldSubfield(const string& str, unsigned subfield)
{
    if (str.empty() || subfield > 2) {
        return string();
    }
    if (subfield == 0) {
        return str;
    }

    const size_t pos = str.find(':');
    if (pos == string::npos) {
        // Without a separator the whole value counts as the first field.
        return subfield == 1 ? str : string();
    }
    if (subfield == 1) {
        return str.substr(0, pos);
    }
    // Second field: everything after the first ':'. The text itself must be
    // returned here (not the result of an emptiness test on it).
    return str.substr(pos + 1);
}
85 
86 
GetFirstGBQualMatch(const vector<CRef<CGb_qual>> & quals,const string & qual_name,unsigned subfield=0,const CString_constraint * str_cons=nullptr)87 static string GetFirstGBQualMatch (const vector <CRef <CGb_qual> >& quals, const string& qual_name, unsigned subfield = 0, const CString_constraint* str_cons = nullptr)
88 {
89     string str;
90     for (auto it : quals) {
91         if (NStr::EqualNocase(it->GetQual(), qual_name)) {
92             str = it->GetVal();
93             str = GetTwoFieldSubfield(str, subfield);
94             if ( str.empty() || (str_cons && !str_cons->Empty() && !(str_cons->Match(str))) ) {
95                 str.clear();
96             }
97             else break;
98         }
99     }
100     return str;
101 }
102 
103 
GetRNAProductString(const CSeq_feat & seq_feat)104 static string GetRNAProductString(const CSeq_feat& seq_feat)
105 {
106     const CRNA_ref& rna = seq_feat.GetData().GetRna();
107     string rna_str;
108     if (!rna.CanGetExt()) {
109         rna_str = seq_feat.GetNamedQual("product");
110     }
111     else {
112         const CRNA_ref::C_Ext& ext = rna.GetExt();
113         switch (ext.Which()) {
114             case CRNA_ref::C_Ext::e_Name:
115                     rna_str = ext.GetName();
116                     if (seq_feat.CanGetQual() && (rna_str.empty() || rna_str== "ncRNA" || rna_str== "tmRNA" || rna_str== "misc_RNA")) {
117                         rna_str = GetFirstGBQualMatch(seq_feat.GetQual(), (string)"product");
118                     }
119                     break;
120             case CRNA_ref::C_Ext::e_TRNA:
121                     GetLabel(seq_feat, &rna_str, feature::fFGL_Content);
122                     rna_str = "tRNA-" + rna_str;
123                     break;
124             case CRNA_ref::C_Ext::e_Gen:
125                     if (ext.GetGen().CanGetProduct()) {
126                         rna_str = ext.GetGen().GetProduct();
127                     }
128             default: break;
129         }
130     }
131     return rna_str;
132 }
133 
134 
GetRuleText(const CSuspect_rule & rule)135 static string GetRuleText(const CSuspect_rule& rule)
136 {
137     static const char* rule_type[] = {
138         "None",
139         "Typo",
140         "Putative Typo",
141         "Quick fix",
142         "Organelles not appropriate in prokaryote",
143         "Suspicious phrase; should this be nonfunctional?",
144         "May contain database identifier more appropriate in note; remove from product name",
145         "Remove organism from product name",
146         "Possible parsing error or incorrect formatting; remove inappropriate symbols",
147         "Implies evolutionary relationship; change to -like protein",
148         "Consider adding 'protein' to the end of the product name",
149         "Correct the name or use 'hypothetical protein'",
150         "Use American spelling",
151         "Use short product name instead of descriptive phrase",
152         "use protein instead of gene as appropriate"
153     };
154     return rule_type[rule.GetRule_type()];
155 }
156 
157 
GetRuleMatch(const CSuspect_rule & rule)158 static string GetRuleMatch(const CSuspect_rule& rule)
159 {
160     if (rule.IsSetDescription()) {
161         string desc = rule.GetDescription();
162         NStr::ReplaceInPlace(desc, "contains", "contain[s]");
163         return "[n] feature[s] " + desc;
164     }
165     if (rule.CanGetFind()) {
166         const CSearch_func& find = rule.GetFind();
167         switch (find.Which()) {
168             case CSearch_func::e_String_constraint:
169             {   string s = "[n] feature[s] ";
170                 switch (find.GetString_constraint().GetMatch_location()) {
171                     case eString_location_starts:
172                         s += "start[S] with";
173                         break;
174                     case eString_location_ends:
175                         s += "end[S] with";
176                         break;
177                     case eString_location_equals:
178                         s += "equal[S]";
179                         break;
180                     default:
181                         s += "contain[S]";
182                 }
183                 return s+ " [*(*]\'" + find.GetString_constraint().GetMatch_text()
184                     + (rule.CanGetRule_type() && (rule.GetRule_type() == eFix_type_typo || rule.GetRule_type() == eFix_type_quickfix) &&
185                         rule.CanGetReplace() && rule.GetReplace().GetReplace_func().IsSimple_replace() && rule.GetReplace().GetReplace_func().GetSimple_replace().CanGetReplace()
186                         ? "\'[*)*], Replace with [*(*]\'" + rule.GetReplace().GetReplace_func().GetSimple_replace().GetReplace() : "")
187                     + "\'[*)*]";
188             }
189             case CSearch_func::e_Contains_plural:
190                 return "[n] feature[s] May contain plural";
191             case CSearch_func::e_N_or_more_brackets_or_parentheses:
192                 return "[n] feature[s] violate[S] e_N_or_more_brackets_or_parentheses !!!";
193             case CSearch_func::e_Three_numbers:
194                 //return "[n] feature[s] contain[S] three or more numbers together, but not contain[S] \'methyltransferas\'";
195                 return "[n] feature[s] Three or more numbers together but not contain[S] \'methyltransferas\'"; // from C Toolkit
196             case CSearch_func::e_Underscore:
197                 return "[n] feature[s] contain[S] underscore";
198             case CSearch_func::e_Prefix_and_numbers:
199                 return "[n] feature[s] violate[S] e_Prefix_and_numbers !!!";
200             case CSearch_func::e_All_caps:
201                 return "[n] feature[s] [is] all capital letters";
202             case CSearch_func::e_Unbalanced_paren:
203                 return "[n] feature[s] contain[S] unbalanced brackets or parentheses";
204             case CSearch_func::e_Too_long:
205                 return "[n] feature[s] violate[S] e_Too_long !!!";
206             case CSearch_func::e_Has_term:
207                 return "[n] feature[s] violate[S] e_Has_term !!!";
208             default:
209                 break;
210         }
211     }
212     return "[n] feature[s] violate[S] some other mysterious rule!";
213 }
214 
215 ///////////////////////////////////// SUSPECT_PRODUCT_NAMES
216 
// Top-level report key shared by the SUSPECT_PRODUCT_NAMES and
// _SUSPECT_PRODUCT_NAMES tests below: both index m_Objs with this string
// before descending into per-rule sub-nodes.
217 static const string kSuspectProductNames = "[n] product_name[s] contain[S] suspect phrase[s] or character[s]";
218 
/// Tell whether a product name contains at least one alphabetic character.
///
/// @param prod_name  Product name to inspect.
/// @return true if isalpha() accepts any character of prod_name; false
///         otherwise (including for an empty string).
static bool ContainsLetters(const string& prod_name)
{
    for (const char symbol : prod_name) {
        // isalpha() requires a value representable as unsigned char (or EOF);
        // passing a negative plain char (non-ASCII byte) is undefined behaviour.
        if (isalpha(static_cast<unsigned char>(symbol))) {
            return true;
        }
    }
    return false;
}
228 
229 
230 DISCREPANCY_CASE(SUSPECT_PRODUCT_NAMES, FEAT, eDisc | eOncaller | eSubmitter | eSmart | eTSA | eFatal, "Suspect Product Name")
231 {
232     for (auto& feat : context.GetFeat()) {
233         if (feat.IsSetData() && feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_prot && feat.GetData().GetProt().IsSetName() && !feat.GetData().GetProt().GetName().empty() && !context.IsPseudo(feat)) {
234             CConstRef<CSuspect_rule_set> rules = context.GetProductRules();
235             string prot_name = *feat.GetData().GetProt().GetName().begin();
236             vector<char> Hits(rules->Get().size());
237             std::fill(Hits.begin(), Hits.end(), 0);
238             rules->Screen(prot_name, Hits.data());
239             if (!ContainsLetters(prot_name)) {
240                 const CSeq_feat* cds = sequence::GetCDSForProduct(context.CurrentBioseq(), &(context.GetScope()));    // consider different implementation
241                 CReportNode& node = m_Objs[kSuspectProductNames]["[*-1*]Product name does not contain letters"].Summ()["[n] feature[s] [does] not contain letters in product name"].Summ().Fatal();
242                 node.Add(*context.SeqFeatObjRef(cds ? *cds : feat)).Fatal();
243             }
244             else {
245                 size_t rule_num = 0;
246                 for (auto rule : rules->Get()) {
247                     if (Hits[rule_num] && rule->StringMatchesSuspectProductRule(prot_name)) {
248                         string leading_space = "[*" + NStr::NumericToString(rule_num) + "*]";
249                         size_t rule_type = rule->GetRule_type();
250                         string rule_name = "[*";
251                         if (rule_type < 10) {
252                             rule_name += " ";
253                         }
254                         rule_name += NStr::NumericToString(rule_type) + "*]" + GetRuleText(*rule);
255                         string rule_text = leading_space + GetRuleMatch(*rule);
256                         CReportNode& node = m_Objs[kSuspectProductNames][rule_name].Summ()[rule_text].Summ();
257                         const CSeq_feat* cds = sequence::GetCDSForProduct(context.CurrentBioseq(), &(context.GetScope())); // needs to optimize
258                         if (rule->CanGetReplace()) {
259                             node.Add(*context.SeqFeatObjRef(cds ? *cds : feat, CDiscrepancyContext::eFixSet, (CObject*)&*rule)).Severity(rule->IsFatal() ? CReportItem::eSeverity_error : CReportItem::eSeverity_warning);
260                         }
261                         else {
262                             node.Add(*context.SeqFeatObjRef(cds ? *cds : feat)).Severity(rule->IsFatal() ? CReportItem::eSeverity_error : CReportItem::eSeverity_warning);
263                         }
264                     }
265                     rule_num++;
266                 }
267             }
268         }
269     }
270 }
271 
272 
// Exports the collected report tree and publishes its sub-items as the
// report items of this test.
DISCREPANCY_SUMMARIZE(SUSPECT_PRODUCT_NAMES)
{
    auto exported = m_Objs.Export(*this);
    m_ReportItems = exported->GetSubitems();
}
277 
278 
/// Replace every case-insensitive occurrence of a phrase in a string.
///
/// The search phrase is stripped of surrounding spaces first; if nothing is
/// left, input is returned unchanged. Matches are located left to right and
/// text inserted by a replacement is never searched again.
///
/// @param input    Text to process.
/// @param search   Phrase to look for (compared without regard to case).
/// @param replace  Text substituted for each occurrence.
/// @return The text with all occurrences replaced.
static string ReplaceNoCase(const string& input, const string& search, const string& replace)
{
    string needle = search;
    NStr::TruncateSpacesInPlace(needle);
    if (needle.empty()) {
        return input;
    }

    string result;
    string rest = input;
    for (;;) {
        const size_t pos = NStr::FindNoCase(rest, needle);
        if (pos == NPOS) {
            break;
        }
        result += rest.substr(0, pos);
        result += replace;
        rest = rest.substr(pos + needle.length());
    }
    result += rest;
    return result;
}
293 
294 
GetProtAndRnaForCDS(const CSeq_feat & cds,CScope & scope,CSeq_feat * & prot,CSeq_feat * & mrna)295 const void GetProtAndRnaForCDS(const CSeq_feat& cds, CScope& scope, CSeq_feat*& prot, CSeq_feat*& mrna)
296 {
297     prot = 0;
298     mrna = 0;
299     CBioseq_Handle bsh = scope.GetBioseqHandle(cds.GetProduct());
300     if (!bsh) {
301         return;
302     }
303     CFeat_CI pr(bsh, CSeqFeatData::eSubtype_prot);
304     if (pr) {
305         prot = (CSeq_feat*)&pr->GetMappedFeature();
306         string name = *prot->GetData().GetProt().GetName().begin();
307         const CSeq_feat* rna = sequence::GetBestMrnaForCds(cds, scope);
308         if (rna && rna->GetData().GetRna().CanGetExt() && rna->GetData().GetRna().GetExt().GetName() == name) {
309             mrna = (CSeq_feat*)rna;
310         }
311     }
312 }
313 
314 
315 typedef std::function < CRef<CSeq_feat>() > GetFeatureFunc;
FixProductName(const CSuspect_rule * rule,CScope & scope,string & prot_name,GetFeatureFunc get_mrna,GetFeatureFunc get_cds)316 string FixProductName(const CSuspect_rule* rule, CScope& scope, string& prot_name, GetFeatureFunc get_mrna, GetFeatureFunc get_cds)
317 {
318     string newtext;
319     string orig_prot_name;
320 
321     const CReplace_rule& rr = rule->GetReplace();
322     const CReplace_func& rf = rr.GetReplace_func();
323     if (rf.IsSimple_replace()) {
324         const CSimple_replace& repl = rf.GetSimple_replace();
325         if (repl.GetWhole_string()) {
326             newtext = repl.GetReplace();
327         }
328         else {
329             const string& find = rule->GetFind().GetString_constraint().GetMatch_text();
330             const string& subst = repl.GetReplace();
331             newtext = ReplaceNoCase(prot_name, find, subst);
332         }
333     }
334     else if (rf.IsHaem_replace()) {
335         newtext = ReplaceNoCase(prot_name, "haem", "hem");
336     }
337     if (!newtext.empty() && newtext != prot_name) {
338         orig_prot_name = move(prot_name);
339         prot_name = move(newtext);
340         auto mrna = get_mrna();
341         if (mrna) {
342             mrna->SetData().SetRna().SetExt().SetName() = prot_name;
343         }
344         if (rr.GetMove_to_note()) {
345             auto cds = get_cds();
346             if (cds)
347                 AddComment(*cds, orig_prot_name);
348         }
349     }
350     return orig_prot_name;
351 }
352 
353 
// Applies the replacement of the rule stored with the report object to the
// first name of the protein that belongs to the reported feature.
// Returns a null report when the object or rule cannot be resolved, when no
// protein (or protein name) is found, when the name no longer matches the
// rule, or when the replacement leaves the name unchanged.
DISCREPANCY_AUTOFIX(SUSPECT_PRODUCT_NAMES)
{
    CRef<CAutofixReport> ret;
    const CSeq_feat* sf = dynamic_cast<const CSeq_feat*>(context.FindObject(*obj));
    const CSuspect_rule* rule = dynamic_cast<const CSuspect_rule*>(obj->GetMoreInfo().GetPointer());
    // Either cast yields nullptr if the stored object is of another type.
    if (!sf || !rule) {
        return ret;
    }
    CSeq_feat* prot = nullptr;
    CSeq_feat* mrna = nullptr;
    GetProtAndRnaForCDS(*sf, context.GetScope(), prot, mrna);
    if (!prot) {
        return ret;
    }
    // front() below requires at least one name.
    if (!prot->GetData().GetProt().IsSetName() || prot->GetData().GetProt().GetName().empty()) {
        return ret;
    }
    string& prot_name = prot->SetData().SetProt().SetName().front();
    if (!rule->StringMatchesSuspectProductRule(prot_name)) {
        return ret;
    }
    // FixProductName() returns an empty string when it changes nothing, so
    // compare against a saved copy to tell a real change from a no-op.
    const string old_prot_name = prot_name;
    FixProductName(rule, context.GetScope(),
        prot_name,
        [&mrna] { return CRef<CSeq_feat>(mrna); },
        [&sf] { return CRef<CSeq_feat>(const_cast<CSeq_feat*>(sf)); });
    if (prot_name != old_prot_name && !prot_name.empty()) {
        string s = "Changed \'" + old_prot_name + "\' to \'" + prot_name + "\' at " + obj->GetLocation();
        obj->SetFixed();
        ret.Reset(new CAutofixReport("SUSPECT_PRODUCT_NAMES", 0));
        CRef<CAutofixReport> report(new CAutofixReport(s, 1));
        vector<CRef<CAutofixReport>> reports;
        reports.push_back(report);
        ret->AddSubitems(reports);
    }
    return ret;
}
383 
384 ///////////////////////////////////// ORGANELLE_PRODUCTS
385 
386 
387 DISCREPANCY_CASE(ORGANELLE_PRODUCTS, FEAT, eOncaller, "Organelle products on non-organelle sequence: on when neither bacteria nor virus")
388 {
389     const CSeqdesc* biosrc = context.GetBiosource();
390     if (biosrc) {
391         const CBioSource& src = biosrc->GetSource();
392         CBioSource::TGenome genome = src.GetGenome();
393         if (genome == CBioSource::eGenome_mitochondrion || genome == CBioSource::eGenome_chloroplast || genome == CBioSource::eGenome_plastid || context.IsViral(&src) || context.IsBacterial(&src)) {
394             return;
395         }
396         if (src.IsSetOrg() && src.GetOrg().IsSetTaxname() && CDiscrepancyContext::IsUnculturedNonOrganelleName(src.GetOrg().GetTaxname())) {
397             return;
398         }
399     }
400     for (auto& feat : context.GetFeat()) {
401         if (feat.IsSetData() && feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_prot && feat.GetData().GetProt().IsSetName() && !feat.GetData().GetProt().GetName().empty() && !context.IsPseudo(feat)) {
402             string prot_name = *feat.GetData().GetProt().GetName().begin();
403             CConstRef<CSuspect_rule_set> rules = context.GetOrganelleProductRules();
404             vector<char> Hits(rules->Get().size());
405             std::fill(Hits.begin(), Hits.end(), 0);
406             rules->Screen(prot_name, Hits.data());
407             size_t rule_num = 0;
408             for (auto rule : rules->Get()) {
409                 if (Hits[rule_num] && rule->StringMatchesSuspectProductRule(prot_name)) {
410                     if (rule->CanGetReplace()) {
411                         m_Objs["[n] suspect product[s] not organelle"].Add(*context.SeqFeatObjRef(feat, CDiscrepancyContext::eFixSet, (CObject*)&*rule)).Severity(rule->IsFatal() ? CReportItem::eSeverity_error : CReportItem::eSeverity_warning);
412                     }
413                     else {
414                         m_Objs["[n] suspect product[s] not organelle"].Add(*context.SeqFeatObjRef(feat)).Severity(rule->IsFatal() ? CReportItem::eSeverity_error : CReportItem::eSeverity_warning);
415                     }
416                 }
417                 rule_num++;
418             }
419         }
420     }
421 }
422 
423 
// Exports the collected report tree and publishes its sub-items as the
// report items of this test.
DISCREPANCY_SUMMARIZE(ORGANELLE_PRODUCTS)
{
    auto exported = m_Objs.Export(*this);
    m_ReportItems = exported->GetSubitems();
}
428 
429 
// Applies the replacement of the rule stored with the report object to the
// first name of the protein that belongs to the reported feature.
// Returns a null report when the object or rule cannot be resolved, when no
// protein (or protein name) is found, when the name no longer matches the
// rule, or when the replacement leaves the name unchanged.
DISCREPANCY_AUTOFIX(ORGANELLE_PRODUCTS) // LCOV_EXCL_START // There are currently no autofixable rules for ORGANELLE_PRODUCTS
{
    CRef<CAutofixReport> ret;
    const CSeq_feat* sf = dynamic_cast<const CSeq_feat*>(context.FindObject(*obj));
    const CSuspect_rule* rule = dynamic_cast<const CSuspect_rule*>(obj->GetMoreInfo().GetPointer());
    // Either cast yields nullptr if the stored object is of another type.
    if (!sf || !rule) {
        return ret;
    }
    CSeq_feat* prot = nullptr;
    CSeq_feat* mrna = nullptr;
    GetProtAndRnaForCDS(*sf, context.GetScope(), prot, mrna);
    if (!prot) {
        return ret;
    }
    // front() below requires at least one name.
    if (!prot->GetData().GetProt().IsSetName() || prot->GetData().GetProt().GetName().empty()) {
        return ret;
    }
    string& prot_name = prot->SetData().SetProt().SetName().front();
    if (!rule->StringMatchesSuspectProductRule(prot_name)) {
        return ret;
    }
    // FixProductName() returns an empty string when it changes nothing, so
    // compare against a saved copy to tell a real change from a no-op.
    const string old_prot_name = prot_name;
    FixProductName(rule, context.GetScope(),
        prot_name,
        [&mrna] { return CRef<CSeq_feat>(mrna); },
        [&sf] { return CRef<CSeq_feat>(const_cast<CSeq_feat*>(sf)); });
    if (prot_name != old_prot_name && !prot_name.empty()) {
        string s = "Changed \'" + old_prot_name + "\' to \'" + prot_name + "\' at " + obj->GetLocation();
        obj->SetFixed();
        ret.Reset(new CAutofixReport("ORGANELLE_PRODUCTS", 0));
        CRef<CAutofixReport> report(new CAutofixReport(s, 1));
        vector<CRef<CAutofixReport>> reports;
        reports.push_back(report);
        ret->AddSubitems(reports);
    }
    return ret;
} // LCOV_EXCL_STOP
459 
460 
s_GetrRNAProductsSuspectRuleSet()461 static CConstRef<CSuspect_rule_set> s_GetrRNAProductsSuspectRuleSet()
462 {
463     DEFINE_STATIC_FAST_MUTEX(sx_RuleMutex);
464     CFastMutexGuard guard(sx_RuleMutex);
465 
466     static CAutoInitRef<CSuspect_rule_set> rrna_products_suspect_rule_set;
467     if( rrna_products_suspect_rule_set.IsInitialized() ) {
468         // already built
469         return ConstRef(&*rrna_products_suspect_rule_set);
470     }
471 
472     CTempString rrna_products_suspect_rule_set_asn_text =
473         "Suspect-rule-set ::= {\n"
474         "        { find string-constraint { match-text \"domain\", whole-word FALSE } },\n"
475         "        { find string-constraint { match-text \"partial\", whole-word FALSE } },\n"
476         "        { find string-constraint { match-text \"5s_rRNA\", whole-word FALSE } },\n"
477         "        { find string-constraint { match-text \"16s_rRNA\", whole-word FALSE } },\n"
478         "        { find string-constraint { match-text \"23s_rRNA\", whole-word FALSE } },\n"
479         "        {\n"
480         "            find string-constraint { match-text \"8S\", whole-word TRUE },\n"
481         "            except string-constraint { match-text \"5.8S\", whole-word TRUE } } }";
482 
483     CObjectIStreamAsn asn_istrm(rrna_products_suspect_rule_set_asn_text.data(), rrna_products_suspect_rule_set_asn_text.length());
484     asn_istrm.Read(&*rrna_products_suspect_rule_set, rrna_products_suspect_rule_set->GetThisTypeInfo());
485 
486     return ConstRef(&*rrna_products_suspect_rule_set);
487 }
488 
489 // gives a text description explaining what the string constraint matches.
490 // (e.g. "contains '5.8S' (whole word)")
491 //
492 // only bare minimum implementation and raises an exception if it the
493 // input goes beyond its capability
s_SummarizeStringConstraint(ostream & out_strm,const CString_constraint & string_constraint)494 static void s_SummarizeStringConstraint(
495     ostream & out_strm, const CString_constraint & string_constraint )
496 {
497     if( string_constraint.IsSetMatch_location() ||
498         string_constraint.IsSetCase_sensitive() ||
499         string_constraint.IsSetIgnore_space() ||
500         string_constraint.IsSetIgnore_punct() ||
501         string_constraint.IsSetIgnore_words() ||
502         string_constraint.IsSetNot_present() ||
503         string_constraint.IsSetIs_all_caps() ||
504         string_constraint.IsSetIs_all_lower() ||
505         string_constraint.IsSetIs_all_punct() ||
506         string_constraint.IsSetIgnore_weasel() ||
507         string_constraint.IsSetIs_first_cap() ||
508         string_constraint.IsSetIs_first_each_cap() )
509     {
510         NCBI_USER_THROW(
511             "s_SummarizeStringConstraint input too complex.  "
512             "Please expand the function or find/create a better one.");
513     }
514 
515     out_strm << "contains '" << string_constraint.GetMatch_text() << "'";
516     if( GET_FIELD_OR_DEFAULT(string_constraint, Whole_word, false) ) {
517         out_strm << " (whole word)";
518     }
519 }
520 
521 // Gives a text description of what the given search_func matches.
522 //
523 // only bare minimum implementation and raises an exception if it the
524 // input goes beyond its capability
s_SummarizeSearchFunc(ostream & out_strm,const CSearch_func & search_func)525 static void s_SummarizeSearchFunc(
526     ostream & out_strm, const CSearch_func & search_func)
527 {
528     if( ! search_func.IsString_constraint() ) {
529         NCBI_USER_THROW(
530             "s_SummarizeSearchFunc input too complex.  "
531             "Please expand the function or find/create a better one.");
532     }
533 
534     s_SummarizeStringConstraint(out_strm, search_func.GetString_constraint());
535 }
536 
537 // Gives a text description of a suspect rule.
538 //
539 // examples:
540 //
541 // - "contains 'partial'"
542 // - "contains '8S' (whole word) but not contains '5.8S' (whole word)"
543 //
544 // only implements the barest subset of this and will surely need more
545 // complexity and to be moved later on.
546 //
547 // Raises an exception if the input is beyond its ability to handle.
548 //
s_SummarizeSuspectRule(ostream & out_strm,const CSuspect_rule & rule)549 static void s_SummarizeSuspectRule(
550     ostream & out_strm, const CSuspect_rule & rule)
551 {
552     if( rule.IsSetFeat_constraint() ||
553         rule.IsSetRule_type() ||
554         rule.IsSetReplace() ||
555         rule.IsSetDescription() ||
556         rule.IsSetFatal() )
557     {
558         NCBI_USER_THROW(
559             "s_SummarizeSuspectRule input too complex.  "
560             "Please expand the function or find/create a better one.");
561     }
562 
563     _ASSERT(rule.IsSetFind());
564     s_SummarizeSearchFunc(out_strm, rule.GetFind());
565     if( rule.IsSetExcept() ) {
566         out_strm << " but not ";
567         s_SummarizeSearchFunc(out_strm, rule.GetExcept());
568     }
569 }
570 
571 
572 DISCREPANCY_CASE(SUSPECT_RRNA_PRODUCTS, FEAT, eDisc | eSubmitter | eSmart, "rRNA product names should not contain 'partial' or 'domain'")
573 {
574     static const string kMsg = "[n] rRNA product name[s] contain[S] suspect phrase";
575     for (auto& feat : context.GetFeat()) {
576         if (feat.IsSetData() && feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_rRNA) {
577             const string product = GetRNAProductString(feat);
578             CConstRef<CSuspect_rule_set> rules = s_GetrRNAProductsSuspectRuleSet();
579             vector<char> Hits(rules->Get().size());
580             std::fill(Hits.begin(), Hits.end(), 0);
581             rules->Screen(product, Hits.data());
582             size_t rule_num = 0;
583             for (auto rule : rules->Get()) {
584                 if (Hits[rule_num] && rule->StringMatchesSuspectProductRule(product)) {
585                     CNcbiOstrstream detailed_msg;
586                     detailed_msg << "[n] rRNA product name[s] ";
587                     s_SummarizeSuspectRule(detailed_msg, *rule);
588                     m_Objs[kMsg][(string)CNcbiOstrstreamToString(detailed_msg)].Ext().Add(*context.SeqFeatObjRef(feat));
589                 }
590                 rule_num++;
591             }
592         }
593     }
594 }
595 
596 
// Exports the collected report tree and publishes its sub-items as the
// report items of this test.
DISCREPANCY_SUMMARIZE(SUSPECT_RRNA_PRODUCTS)
{
    auto exported = m_Objs.Export(*this);
    m_ReportItems = exported->GetSubitems();
}
601 
602 
603 // _SUSPECT_PRODUCT_NAMES - used for asndisc -N option
604 
605 DISCREPANCY_CASE(_SUSPECT_PRODUCT_NAMES, STRING, 0, "Suspect Product Names for asndisc -N option")
606 {
607     CConstRef<CSuspect_rule_set> rules = context.GetProductRules();
608     vector<char> Hits(rules->Get().size());
609     std::fill(Hits.begin(), Hits.end(), 0);
610     const string& str = context.CurrentText();
611     rules->Screen(str, Hits.data());
612 
613     if (!ContainsLetters(str)) {
614         CReportNode& node = m_Objs[kSuspectProductNames]["[*-1*]Product name does not contain letters"].Summ()["[n] feature[s] [does] not contain letters in product name"].Summ().Fatal();
615         node.Add(*context.StringObjRef()).Fatal();
616     }
617     size_t rule_num = 0;
618     for (auto rule : rules->Get()) {
619         if (Hits[rule_num] && rule->StringMatchesSuspectProductRule(str)) {
620             string leading_space = "[*" + NStr::NumericToString(rule_num) + "*]";
621             size_t rule_type = rule->GetRule_type();
622             string rule_name = "[*";
623             if (rule_type < 10) {
624                 rule_name += " ";
625             }
626             rule_name += NStr::NumericToString(rule_type) + "*]" + GetRuleText(*rule);
627             string rule_text = leading_space + GetRuleMatch(*rule);
628             CReportNode& node = m_Objs[kSuspectProductNames][rule_name].Summ()[rule_text].Summ();
629             node.Add(*context.StringObjRef()).Severity(rule->IsFatal() ? CReportItem::eSeverity_error : CReportItem::eSeverity_warning);
630         }
631         rule_num++;
632     }
633 }
634 
635 
// Exports the collected report tree and publishes its sub-items as the
// report items of this test.
DISCREPANCY_SUMMARIZE(_SUSPECT_PRODUCT_NAMES)
{
    auto exported = m_Objs.Export(*this);
    m_ReportItems = exported->GetSubitems();
}
640 
641 //////////////////////////////////////////////////////////////////////////
642 
// Cached organelle product rule set. s_InitializeOrganelleProductRules()
// fills it once while holding s_OrganelleProductRulesMutex and then sets the
// flag; GetOrganelleProductRules() hands out the cached object.
643 static bool  s_OrganelleProductRulesInitialized = false;
644 DEFINE_STATIC_FAST_MUTEX(s_OrganelleProductRulesMutex);
645 static CRef<CSuspect_rule_set> s_OrganelleProductRules;
646 
// Cached suspect product rule set together with the file name it was loaded
// for (empty when the built-in data was used). s_InitializeProductRules()
// updates these while holding s_ProductRulesMutex and reloads when it is
// called with a different file name.
647 static bool  s_ProductRulesInitialized = false;
648 static string  s_ProductRulesFileName;
649 DEFINE_STATIC_FAST_MUTEX(s_ProductRulesMutex);
650 static CRef<CSuspect_rule_set> s_ProductRules;
651 
// Built-in organelle product rules. The _FSM_* macros supply the declarators
// that "organelle_products.inc" is expected to complete with initializers
// (NOTE(review): the .inc file is not visible here -- confirm), producing:
//   s_Defaultorganelleproducts         - rule text, concatenated and parsed
//                                        by s_InitializeOrganelleProductRules()
//   s_Defaultorganelleproducts_emit / _hits / _states
//                                      - data passed to SetPrecompiledData()
// The macros are undefined again so the next .inc can redefine them.
652 #define _FSM_RULES static const char* const s_Defaultorganelleproducts[]
653 #define _FSM_EMIT static bool s_Defaultorganelleproducts_emit[]
654 #define _FSM_HITS static map<size_t, vector<size_t>> s_Defaultorganelleproducts_hits
655 #define _FSM_STATES static size_t s_Defaultorganelleproducts_states[]
656 #include "organelle_products.inc"
657 #undef _FSM_EMIT
658 #undef _FSM_HITS
659 #undef _FSM_STATES
660 #undef _FSM_RULES
661 
662 
s_InitializeOrganelleProductRules(const string & name)663 static void s_InitializeOrganelleProductRules(const string& name)
664 {
665     CFastMutexGuard GUARD(s_OrganelleProductRulesMutex);
666     if (s_OrganelleProductRulesInitialized) {
667         return;
668     }
669     s_OrganelleProductRules.Reset(new CSuspect_rule_set());
670     //string file = name.empty() ? g_FindDataFile("organelle_products.prt") : name;
671 
672     if (!name.empty()) {
673         LOG_POST("Reading from " + name + " for organelle products");
674         auto_ptr<CObjectIStream> in;
675         in.reset(CObjectIStream::Open(name, eSerial_AsnText));
676         string header = in->ReadFileHeader();
677         in->Read(ObjectInfo(*s_OrganelleProductRules), CObjectIStream::eNoFileHeader);
678         s_OrganelleProductRules->SetPrecompiledData(nullptr, nullptr, nullptr);
679     }
680     if (!s_OrganelleProductRules->IsSet()) {
681         //LOG_POST("Falling back on built-in data for organelle products");
682         size_t num_lines = ArraySize(s_Defaultorganelleproducts);
683         string all_rules;
684         for (size_t i = 0; i < num_lines; i++) {
685             all_rules += s_Defaultorganelleproducts[i];
686         }
687         CNcbiIstrstream istr(all_rules);
688         istr >> MSerial_AsnText >> *s_OrganelleProductRules;
689         s_OrganelleProductRules->SetPrecompiledData(s_Defaultorganelleproducts_emit, &s_Defaultorganelleproducts_hits, s_Defaultorganelleproducts_states);
690     }
691 
692     s_OrganelleProductRulesInitialized = true;
693 }
694 
695 
// Built-in suspect product rules. The _FSM_* macros supply the declarators
// that "product_rules.inc" is expected to complete with initializers
// (NOTE(review): the .inc file is not visible here -- confirm), producing:
//   s_Defaultproductrules              - rule text, concatenated and parsed
//                                        by s_InitializeProductRules()
//   s_Defaultproductrules_emit / _hits / _states
//                                      - data passed to SetPrecompiledData()
696 #define _FSM_RULES static const char* const s_Defaultproductrules[]
697 #define _FSM_EMIT static bool s_Defaultproductrules_emit[]
698 #define _FSM_HITS static map<size_t, vector<size_t>> s_Defaultproductrules_hits
699 #define _FSM_STATES static size_t s_Defaultproductrules_states[]
700 #include "product_rules.inc"
701 #undef _FSM_EMIT
702 #undef _FSM_HITS
703 #undef _FSM_STATES
704 #undef _FSM_RULES
705 
706 
s_InitializeProductRules(const string & name)707 static void s_InitializeProductRules(const string& name)
708 {
709     CFastMutexGuard GUARD(s_ProductRulesMutex);
710     if (s_ProductRulesInitialized && name == s_ProductRulesFileName) {
711         return;
712     }
713     s_ProductRules.Reset(new CSuspect_rule_set());
714     s_ProductRulesFileName = name;
715     //string file = name.empty() ? g_FindDataFile("product_rules.prt") : name;
716 
717     if (!name.empty()) {
718         LOG_POST("Reading from " + name + " for suspect product rules");
719         auto_ptr<CObjectIStream> in;
720         in.reset(CObjectIStream::Open(name, eSerial_AsnText));
721         string header = in->ReadFileHeader();
722         in->Read(ObjectInfo(*s_ProductRules), CObjectIStream::eNoFileHeader);
723         s_ProductRules->SetPrecompiledData(nullptr, nullptr, nullptr);
724     }
725     if (!s_ProductRules->IsSet()) {
726         //LOG_POST("Falling back on built-in data for suspect product rules");
727         size_t num_lines = ArraySize(s_Defaultproductrules);
728         string all_rules;
729         for (size_t i = 0; i < num_lines; i++) {
730             all_rules += s_Defaultproductrules[i];
731         }
732         CNcbiIstrstream istr(all_rules);
733         istr >> MSerial_AsnText >> *s_ProductRules;
734         s_ProductRules->SetPrecompiledData(s_Defaultproductrules_emit, &s_Defaultproductrules_hits, s_Defaultproductrules_states);
735     }
736 
737     s_ProductRulesInitialized = true;
738 }
739 
740 
GetOrganelleProductRules(const string & name)741 CConstRef<CSuspect_rule_set> GetOrganelleProductRules(const string& name)
742 {
743     s_InitializeOrganelleProductRules(name);
744     return CConstRef<CSuspect_rule_set>(s_OrganelleProductRules.GetPointer());
745 }
746 
747 
GetProductRules(const string & name)748 CConstRef<CSuspect_rule_set> GetProductRules(const string& name)
749 {
750     s_InitializeProductRules(name);
751     return CConstRef<CSuspect_rule_set>(s_ProductRules.GetPointer());
752 }
753 
754 
// Close the scopes opened by BEGIN_SCOPE(NDiscrepancy) and BEGIN_NCBI_SCOPE
// at the top of this file.
755 END_SCOPE(NDiscrepancy)
756 END_NCBI_SCOPE
757