1 /* $Id: suspect_product_names.cpp 637427 2021-09-13 13:13:02Z ivanov $
2 * =========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * =========================================================================
25 *
26 * Authors: Sema Kachalo
27 *
28 */
29
30 #include <ncbi_pch.hpp>
31 #include <corelib/ncbistd.hpp>
32 #include <corelib/ncbimisc.hpp>
33 #include <corelib/ncbi_autoinit.hpp>
34 #include "discrepancy_core.hpp"
35 #include "utils.hpp"
36 #include <objects/macro/Constraint_choice.hpp>
37 #include <objects/macro/Constraint_choice_set.hpp>
38 #include <objects/macro/Replace_func.hpp>
39 #include <objects/macro/Search_func.hpp>
40 #include <objects/macro/Simple_replace.hpp>
41 #include <objects/macro/Suspect_rule_set.hpp>
42 #include <objects/macro/Suspect_rule.hpp>
43 #include <objects/misc/sequence_util_macros.hpp>
44 #include <objects/seqfeat/Org_ref.hpp>
45 #include <objects/seqfeat/OrgMod.hpp>
46 #include <objects/seqfeat/OrgName.hpp>
47 #include <objects/seqfeat/RNA_gen.hpp>
48 #include <objects/seqfeat/RNA_qual.hpp>
49 #include <objects/seqfeat/RNA_qual_set.hpp>
50 #include <objects/seqfeat/RNA_ref.hpp>
51 #include <objects/seqfeat/SeqFeatData.hpp>
52 #include <objects/seqfeat/SubSource.hpp>
53 #include <objects/seqfeat/Trna_ext.hpp>
54 #include <objmgr/util/sequence.hpp>
55 #include <serial/objistrasn.hpp>
56
57 BEGIN_NCBI_SCOPE
58 BEGIN_SCOPE(NDiscrepancy)
59 USING_SCOPE(objects);
60
61 DISCREPANCY_MODULE(suspect_product_names);
62
63
// Extracts one part of a "first:second" value, split at the first ':'.
//
//   subfield == 0 : the whole string
//   subfield == 1 : text before the first ':' (the whole string if there is no ':')
//   subfield == 2 : text after the first ':' (empty if there is no ':')
//   subfield  > 2 : empty string
//
// An empty input always yields an empty string.
string GetTwoFieldSubfield(const string& str, unsigned subfield)
{
    if (str.empty() || subfield > 2) {
        return string();
    }
    if (subfield == 0) {
        return str;
    }

    const size_t pos = str.find(':');
    if (pos == string::npos) {
        // No separator: the whole value counts as the first field.
        return subfield == 1 ? str : string();
    }
    if (subfield == 1) {
        return str.substr(0, pos);
    }
    // The previous code assigned the bool returned by empty() to the result,
    // yielding a one-character "\0"/"\1" string instead of the second field.
    return str.substr(pos + 1);
}
85
86
GetFirstGBQualMatch(const vector<CRef<CGb_qual>> & quals,const string & qual_name,unsigned subfield=0,const CString_constraint * str_cons=nullptr)87 static string GetFirstGBQualMatch (const vector <CRef <CGb_qual> >& quals, const string& qual_name, unsigned subfield = 0, const CString_constraint* str_cons = nullptr)
88 {
89 string str;
90 for (auto it : quals) {
91 if (NStr::EqualNocase(it->GetQual(), qual_name)) {
92 str = it->GetVal();
93 str = GetTwoFieldSubfield(str, subfield);
94 if ( str.empty() || (str_cons && !str_cons->Empty() && !(str_cons->Match(str))) ) {
95 str.clear();
96 }
97 else break;
98 }
99 }
100 return str;
101 }
102
103
GetRNAProductString(const CSeq_feat & seq_feat)104 static string GetRNAProductString(const CSeq_feat& seq_feat)
105 {
106 const CRNA_ref& rna = seq_feat.GetData().GetRna();
107 string rna_str;
108 if (!rna.CanGetExt()) {
109 rna_str = seq_feat.GetNamedQual("product");
110 }
111 else {
112 const CRNA_ref::C_Ext& ext = rna.GetExt();
113 switch (ext.Which()) {
114 case CRNA_ref::C_Ext::e_Name:
115 rna_str = ext.GetName();
116 if (seq_feat.CanGetQual() && (rna_str.empty() || rna_str== "ncRNA" || rna_str== "tmRNA" || rna_str== "misc_RNA")) {
117 rna_str = GetFirstGBQualMatch(seq_feat.GetQual(), (string)"product");
118 }
119 break;
120 case CRNA_ref::C_Ext::e_TRNA:
121 GetLabel(seq_feat, &rna_str, feature::fFGL_Content);
122 rna_str = "tRNA-" + rna_str;
123 break;
124 case CRNA_ref::C_Ext::e_Gen:
125 if (ext.GetGen().CanGetProduct()) {
126 rna_str = ext.GetGen().GetProduct();
127 }
128 default: break;
129 }
130 }
131 return rna_str;
132 }
133
134
GetRuleText(const CSuspect_rule & rule)135 static string GetRuleText(const CSuspect_rule& rule)
136 {
137 static const char* rule_type[] = {
138 "None",
139 "Typo",
140 "Putative Typo",
141 "Quick fix",
142 "Organelles not appropriate in prokaryote",
143 "Suspicious phrase; should this be nonfunctional?",
144 "May contain database identifier more appropriate in note; remove from product name",
145 "Remove organism from product name",
146 "Possible parsing error or incorrect formatting; remove inappropriate symbols",
147 "Implies evolutionary relationship; change to -like protein",
148 "Consider adding 'protein' to the end of the product name",
149 "Correct the name or use 'hypothetical protein'",
150 "Use American spelling",
151 "Use short product name instead of descriptive phrase",
152 "use protein instead of gene as appropriate"
153 };
154 return rule_type[rule.GetRule_type()];
155 }
156
157
GetRuleMatch(const CSuspect_rule & rule)158 static string GetRuleMatch(const CSuspect_rule& rule)
159 {
160 if (rule.IsSetDescription()) {
161 string desc = rule.GetDescription();
162 NStr::ReplaceInPlace(desc, "contains", "contain[s]");
163 return "[n] feature[s] " + desc;
164 }
165 if (rule.CanGetFind()) {
166 const CSearch_func& find = rule.GetFind();
167 switch (find.Which()) {
168 case CSearch_func::e_String_constraint:
169 { string s = "[n] feature[s] ";
170 switch (find.GetString_constraint().GetMatch_location()) {
171 case eString_location_starts:
172 s += "start[S] with";
173 break;
174 case eString_location_ends:
175 s += "end[S] with";
176 break;
177 case eString_location_equals:
178 s += "equal[S]";
179 break;
180 default:
181 s += "contain[S]";
182 }
183 return s+ " [*(*]\'" + find.GetString_constraint().GetMatch_text()
184 + (rule.CanGetRule_type() && (rule.GetRule_type() == eFix_type_typo || rule.GetRule_type() == eFix_type_quickfix) &&
185 rule.CanGetReplace() && rule.GetReplace().GetReplace_func().IsSimple_replace() && rule.GetReplace().GetReplace_func().GetSimple_replace().CanGetReplace()
186 ? "\'[*)*], Replace with [*(*]\'" + rule.GetReplace().GetReplace_func().GetSimple_replace().GetReplace() : "")
187 + "\'[*)*]";
188 }
189 case CSearch_func::e_Contains_plural:
190 return "[n] feature[s] May contain plural";
191 case CSearch_func::e_N_or_more_brackets_or_parentheses:
192 return "[n] feature[s] violate[S] e_N_or_more_brackets_or_parentheses !!!";
193 case CSearch_func::e_Three_numbers:
194 //return "[n] feature[s] contain[S] three or more numbers together, but not contain[S] \'methyltransferas\'";
195 return "[n] feature[s] Three or more numbers together but not contain[S] \'methyltransferas\'"; // from C Toolkit
196 case CSearch_func::e_Underscore:
197 return "[n] feature[s] contain[S] underscore";
198 case CSearch_func::e_Prefix_and_numbers:
199 return "[n] feature[s] violate[S] e_Prefix_and_numbers !!!";
200 case CSearch_func::e_All_caps:
201 return "[n] feature[s] [is] all capital letters";
202 case CSearch_func::e_Unbalanced_paren:
203 return "[n] feature[s] contain[S] unbalanced brackets or parentheses";
204 case CSearch_func::e_Too_long:
205 return "[n] feature[s] violate[S] e_Too_long !!!";
206 case CSearch_func::e_Has_term:
207 return "[n] feature[s] violate[S] e_Has_term !!!";
208 default:
209 break;
210 }
211 }
212 return "[n] feature[s] violate[S] some other mysterious rule!";
213 }
214
215 ///////////////////////////////////// SUSPECT_PRODUCT_NAMES
216
217 static const string kSuspectProductNames = "[n] product_name[s] contain[S] suspect phrase[s] or character[s]";
218
// Returns true when `prod_name` contains at least one character that
// isalpha() classifies as alphabetic.
static bool ContainsLetters(const string& prod_name)
{
    for (const char symbol : prod_name) {
        // isalpha() is only defined for EOF and values representable as
        // unsigned char; a plain (possibly negative) char from a non-ASCII
        // byte must be converted first.
        if (isalpha(static_cast<unsigned char>(symbol))) {
            return true;
        }
    }
    return false;
}
228
229
230 DISCREPANCY_CASE(SUSPECT_PRODUCT_NAMES, FEAT, eDisc | eOncaller | eSubmitter | eSmart | eTSA | eFatal, "Suspect Product Name")
231 {
232 for (auto& feat : context.GetFeat()) {
233 if (feat.IsSetData() && feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_prot && feat.GetData().GetProt().IsSetName() && !feat.GetData().GetProt().GetName().empty() && !context.IsPseudo(feat)) {
234 CConstRef<CSuspect_rule_set> rules = context.GetProductRules();
235 string prot_name = *feat.GetData().GetProt().GetName().begin();
236 vector<char> Hits(rules->Get().size());
237 std::fill(Hits.begin(), Hits.end(), 0);
238 rules->Screen(prot_name, Hits.data());
239 if (!ContainsLetters(prot_name)) {
240 const CSeq_feat* cds = sequence::GetCDSForProduct(context.CurrentBioseq(), &(context.GetScope())); // consider different implementation
241 CReportNode& node = m_Objs[kSuspectProductNames]["[*-1*]Product name does not contain letters"].Summ()["[n] feature[s] [does] not contain letters in product name"].Summ().Fatal();
242 node.Add(*context.SeqFeatObjRef(cds ? *cds : feat)).Fatal();
243 }
244 else {
245 size_t rule_num = 0;
246 for (auto rule : rules->Get()) {
247 if (Hits[rule_num] && rule->StringMatchesSuspectProductRule(prot_name)) {
248 string leading_space = "[*" + NStr::NumericToString(rule_num) + "*]";
249 size_t rule_type = rule->GetRule_type();
250 string rule_name = "[*";
251 if (rule_type < 10) {
252 rule_name += " ";
253 }
254 rule_name += NStr::NumericToString(rule_type) + "*]" + GetRuleText(*rule);
255 string rule_text = leading_space + GetRuleMatch(*rule);
256 CReportNode& node = m_Objs[kSuspectProductNames][rule_name].Summ()[rule_text].Summ();
257 const CSeq_feat* cds = sequence::GetCDSForProduct(context.CurrentBioseq(), &(context.GetScope())); // needs to optimize
258 if (rule->CanGetReplace()) {
259 node.Add(*context.SeqFeatObjRef(cds ? *cds : feat, CDiscrepancyContext::eFixSet, (CObject*)&*rule)).Severity(rule->IsFatal() ? CReportItem::eSeverity_error : CReportItem::eSeverity_warning);
260 }
261 else {
262 node.Add(*context.SeqFeatObjRef(cds ? *cds : feat)).Severity(rule->IsFatal() ? CReportItem::eSeverity_error : CReportItem::eSeverity_warning);
263 }
264 }
265 rule_num++;
266 }
267 }
268 }
269 }
270 }
271
272
// Publishes the sub-items of the exported report tree as this test's report items.
DISCREPANCY_SUMMARIZE(SUSPECT_PRODUCT_NAMES)
{
    auto exported = m_Objs.Export(*this);
    m_ReportItems = exported->GetSubitems();
}
277
278
// Replaces every case-insensitive occurrence of `search` in `input` with
// `replace` and returns the result.
// `search` is stripped of surrounding spaces first; if nothing is left the
// input is returned unchanged. Text inserted by a replacement is never
// re-scanned: the search always resumes after the matched text.
static string ReplaceNoCase(const string& input, const string& search, const string& replace)
{
    string needle = search;
    NStr::TruncateSpacesInPlace(needle);
    if (needle.empty()) {
        return input;
    }

    string result;
    string remainder = input;
    size_t hit;
    while ((hit = NStr::FindNoCase(remainder, needle)) != NPOS) {
        result.append(remainder, 0, hit);
        result += replace;
        remainder.erase(0, hit + needle.length());
    }
    result += remainder;
    return result;
}
293
294
GetProtAndRnaForCDS(const CSeq_feat & cds,CScope & scope,CSeq_feat * & prot,CSeq_feat * & mrna)295 const void GetProtAndRnaForCDS(const CSeq_feat& cds, CScope& scope, CSeq_feat*& prot, CSeq_feat*& mrna)
296 {
297 prot = 0;
298 mrna = 0;
299 CBioseq_Handle bsh = scope.GetBioseqHandle(cds.GetProduct());
300 if (!bsh) {
301 return;
302 }
303 CFeat_CI pr(bsh, CSeqFeatData::eSubtype_prot);
304 if (pr) {
305 prot = (CSeq_feat*)&pr->GetMappedFeature();
306 string name = *prot->GetData().GetProt().GetName().begin();
307 const CSeq_feat* rna = sequence::GetBestMrnaForCds(cds, scope);
308 if (rna && rna->GetData().GetRna().CanGetExt() && rna->GetData().GetRna().GetExt().GetName() == name) {
309 mrna = (CSeq_feat*)rna;
310 }
311 }
312 }
313
314
315 typedef std::function < CRef<CSeq_feat>() > GetFeatureFunc;
FixProductName(const CSuspect_rule * rule,CScope & scope,string & prot_name,GetFeatureFunc get_mrna,GetFeatureFunc get_cds)316 string FixProductName(const CSuspect_rule* rule, CScope& scope, string& prot_name, GetFeatureFunc get_mrna, GetFeatureFunc get_cds)
317 {
318 string newtext;
319 string orig_prot_name;
320
321 const CReplace_rule& rr = rule->GetReplace();
322 const CReplace_func& rf = rr.GetReplace_func();
323 if (rf.IsSimple_replace()) {
324 const CSimple_replace& repl = rf.GetSimple_replace();
325 if (repl.GetWhole_string()) {
326 newtext = repl.GetReplace();
327 }
328 else {
329 const string& find = rule->GetFind().GetString_constraint().GetMatch_text();
330 const string& subst = repl.GetReplace();
331 newtext = ReplaceNoCase(prot_name, find, subst);
332 }
333 }
334 else if (rf.IsHaem_replace()) {
335 newtext = ReplaceNoCase(prot_name, "haem", "hem");
336 }
337 if (!newtext.empty() && newtext != prot_name) {
338 orig_prot_name = move(prot_name);
339 prot_name = move(newtext);
340 auto mrna = get_mrna();
341 if (mrna) {
342 mrna->SetData().SetRna().SetExt().SetName() = prot_name;
343 }
344 if (rr.GetMove_to_note()) {
345 auto cds = get_cds();
346 if (cds)
347 AddComment(*cds, orig_prot_name);
348 }
349 }
350 return orig_prot_name;
351 }
352
353
// Applies the suspect rule attached to `obj` to the product name of the
// protein that belongs to the reported feature.
// Returns a null report when nothing was changed (object or rule missing, no
// protein found, the name no longer matches the rule, or the replacement left
// the name unchanged or empty).
DISCREPANCY_AUTOFIX(SUSPECT_PRODUCT_NAMES)
{
    CRef<CAutofixReport> ret;
    const CSeq_feat* sf = dynamic_cast<const CSeq_feat*>(context.FindObject(*obj));
    const CSuspect_rule* rule = dynamic_cast<const CSuspect_rule*>(obj->GetMoreInfo().GetPointer());
    if (!sf || !rule) {
        // Either cast may yield null; there is nothing to fix then.
        return ret;
    }

    CSeq_feat* prot = nullptr;
    CSeq_feat* mrna = nullptr;
    GetProtAndRnaForCDS(*sf, context.GetScope(), prot, mrna);
    if (!prot) {
        return ret;
    }

    auto& names = prot->SetData().SetProt().SetName();
    if (names.empty()) {
        return ret;
    }
    string& prot_name = names.front();
    if (!rule->StringMatchesSuspectProductRule(prot_name)) {
        return ret;
    }

    // Compare against a saved copy: FixProductName returns an empty string
    // when it changes nothing, which must not be reported as a change.
    const string name_before = prot_name;
    FixProductName(rule, context.GetScope(),
        prot_name,
        [&mrna] { return CRef<CSeq_feat>(mrna); },
        [&sf] { return CRef<CSeq_feat>(const_cast<CSeq_feat*>(sf)); });

    if (prot_name != name_before && !prot_name.empty()) {
        string s = "Changed \'" + name_before + "\' to \'" + prot_name + "\' at " + obj->GetLocation();
        obj->SetFixed();
        ret.Reset(new CAutofixReport("SUSPECT_PRODUCT_NAMES", 0));
        CRef<CAutofixReport> report(new CAutofixReport(s, 1));
        vector<CRef<CAutofixReport>> reports;
        reports.push_back(report);
        ret->AddSubitems(reports);
    }
    return ret;
}
383
384 ///////////////////////////////////// ORGANELLE_PRODUCTS
385
386
387 DISCREPANCY_CASE(ORGANELLE_PRODUCTS, FEAT, eOncaller, "Organelle products on non-organelle sequence: on when neither bacteria nor virus")
388 {
389 const CSeqdesc* biosrc = context.GetBiosource();
390 if (biosrc) {
391 const CBioSource& src = biosrc->GetSource();
392 CBioSource::TGenome genome = src.GetGenome();
393 if (genome == CBioSource::eGenome_mitochondrion || genome == CBioSource::eGenome_chloroplast || genome == CBioSource::eGenome_plastid || context.IsViral(&src) || context.IsBacterial(&src)) {
394 return;
395 }
396 if (src.IsSetOrg() && src.GetOrg().IsSetTaxname() && CDiscrepancyContext::IsUnculturedNonOrganelleName(src.GetOrg().GetTaxname())) {
397 return;
398 }
399 }
400 for (auto& feat : context.GetFeat()) {
401 if (feat.IsSetData() && feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_prot && feat.GetData().GetProt().IsSetName() && !feat.GetData().GetProt().GetName().empty() && !context.IsPseudo(feat)) {
402 string prot_name = *feat.GetData().GetProt().GetName().begin();
403 CConstRef<CSuspect_rule_set> rules = context.GetOrganelleProductRules();
404 vector<char> Hits(rules->Get().size());
405 std::fill(Hits.begin(), Hits.end(), 0);
406 rules->Screen(prot_name, Hits.data());
407 size_t rule_num = 0;
408 for (auto rule : rules->Get()) {
409 if (Hits[rule_num] && rule->StringMatchesSuspectProductRule(prot_name)) {
410 if (rule->CanGetReplace()) {
411 m_Objs["[n] suspect product[s] not organelle"].Add(*context.SeqFeatObjRef(feat, CDiscrepancyContext::eFixSet, (CObject*)&*rule)).Severity(rule->IsFatal() ? CReportItem::eSeverity_error : CReportItem::eSeverity_warning);
412 }
413 else {
414 m_Objs["[n] suspect product[s] not organelle"].Add(*context.SeqFeatObjRef(feat)).Severity(rule->IsFatal() ? CReportItem::eSeverity_error : CReportItem::eSeverity_warning);
415 }
416 }
417 rule_num++;
418 }
419 }
420 }
421 }
422
423
// Publishes the sub-items of the exported report tree as this test's report items.
DISCREPANCY_SUMMARIZE(ORGANELLE_PRODUCTS)
{
    auto exported = m_Objs.Export(*this);
    m_ReportItems = exported->GetSubitems();
}
428
429
// Applies the organelle rule attached to `obj` to the product name of the
// protein that belongs to the reported feature.
// Returns a null report when nothing was changed (object or rule missing, no
// protein found, the name no longer matches the rule, or the replacement left
// the name unchanged or empty).
DISCREPANCY_AUTOFIX(ORGANELLE_PRODUCTS) // LCOV_EXCL_START // There are currently no autofixable rules for ORGANELLE_PRODUCTS
{
    CRef<CAutofixReport> ret;
    const CSeq_feat* sf = dynamic_cast<const CSeq_feat*>(context.FindObject(*obj));
    const CSuspect_rule* rule = dynamic_cast<const CSuspect_rule*>(obj->GetMoreInfo().GetPointer());
    if (!sf || !rule) {
        // Either cast may yield null; there is nothing to fix then.
        return ret;
    }

    CSeq_feat* prot = nullptr;
    CSeq_feat* mrna = nullptr;
    GetProtAndRnaForCDS(*sf, context.GetScope(), prot, mrna);
    if (!prot) {
        return ret;
    }

    auto& names = prot->SetData().SetProt().SetName();
    if (names.empty()) {
        return ret;
    }
    string& prot_name = names.front();
    if (!rule->StringMatchesSuspectProductRule(prot_name)) {
        return ret;
    }

    // Compare against a saved copy: FixProductName returns an empty string
    // when it changes nothing, which must not be reported as a change.
    const string name_before = prot_name;
    FixProductName(rule, context.GetScope(),
        prot_name,
        [&mrna] { return CRef<CSeq_feat>(mrna); },
        [&sf] { return CRef<CSeq_feat>(const_cast<CSeq_feat*>(sf)); });

    if (prot_name != name_before && !prot_name.empty()) {
        string s = "Changed \'" + name_before + "\' to \'" + prot_name + "\' at " + obj->GetLocation();
        obj->SetFixed();
        ret.Reset(new CAutofixReport("ORGANELLE_PRODUCTS", 0));
        CRef<CAutofixReport> report(new CAutofixReport(s, 1));
        vector<CRef<CAutofixReport>> reports;
        reports.push_back(report);
        ret->AddSubitems(reports);
    }
    return ret;
} // LCOV_EXCL_STOP
459
460
s_GetrRNAProductsSuspectRuleSet()461 static CConstRef<CSuspect_rule_set> s_GetrRNAProductsSuspectRuleSet()
462 {
463 DEFINE_STATIC_FAST_MUTEX(sx_RuleMutex);
464 CFastMutexGuard guard(sx_RuleMutex);
465
466 static CAutoInitRef<CSuspect_rule_set> rrna_products_suspect_rule_set;
467 if( rrna_products_suspect_rule_set.IsInitialized() ) {
468 // already built
469 return ConstRef(&*rrna_products_suspect_rule_set);
470 }
471
472 CTempString rrna_products_suspect_rule_set_asn_text =
473 "Suspect-rule-set ::= {\n"
474 " { find string-constraint { match-text \"domain\", whole-word FALSE } },\n"
475 " { find string-constraint { match-text \"partial\", whole-word FALSE } },\n"
476 " { find string-constraint { match-text \"5s_rRNA\", whole-word FALSE } },\n"
477 " { find string-constraint { match-text \"16s_rRNA\", whole-word FALSE } },\n"
478 " { find string-constraint { match-text \"23s_rRNA\", whole-word FALSE } },\n"
479 " {\n"
480 " find string-constraint { match-text \"8S\", whole-word TRUE },\n"
481 " except string-constraint { match-text \"5.8S\", whole-word TRUE } } }";
482
483 CObjectIStreamAsn asn_istrm(rrna_products_suspect_rule_set_asn_text.data(), rrna_products_suspect_rule_set_asn_text.length());
484 asn_istrm.Read(&*rrna_products_suspect_rule_set, rrna_products_suspect_rule_set->GetThisTypeInfo());
485
486 return ConstRef(&*rrna_products_suspect_rule_set);
487 }
488
489 // gives a text description explaining what the string constraint matches.
490 // (e.g. "contains '5.8S' (whole word)")
491 //
492 // only bare minimum implementation and raises an exception if it the
493 // input goes beyond its capability
s_SummarizeStringConstraint(ostream & out_strm,const CString_constraint & string_constraint)494 static void s_SummarizeStringConstraint(
495 ostream & out_strm, const CString_constraint & string_constraint )
496 {
497 if( string_constraint.IsSetMatch_location() ||
498 string_constraint.IsSetCase_sensitive() ||
499 string_constraint.IsSetIgnore_space() ||
500 string_constraint.IsSetIgnore_punct() ||
501 string_constraint.IsSetIgnore_words() ||
502 string_constraint.IsSetNot_present() ||
503 string_constraint.IsSetIs_all_caps() ||
504 string_constraint.IsSetIs_all_lower() ||
505 string_constraint.IsSetIs_all_punct() ||
506 string_constraint.IsSetIgnore_weasel() ||
507 string_constraint.IsSetIs_first_cap() ||
508 string_constraint.IsSetIs_first_each_cap() )
509 {
510 NCBI_USER_THROW(
511 "s_SummarizeStringConstraint input too complex. "
512 "Please expand the function or find/create a better one.");
513 }
514
515 out_strm << "contains '" << string_constraint.GetMatch_text() << "'";
516 if( GET_FIELD_OR_DEFAULT(string_constraint, Whole_word, false) ) {
517 out_strm << " (whole word)";
518 }
519 }
520
521 // Gives a text description of what the given search_func matches.
522 //
523 // only bare minimum implementation and raises an exception if it the
524 // input goes beyond its capability
s_SummarizeSearchFunc(ostream & out_strm,const CSearch_func & search_func)525 static void s_SummarizeSearchFunc(
526 ostream & out_strm, const CSearch_func & search_func)
527 {
528 if( ! search_func.IsString_constraint() ) {
529 NCBI_USER_THROW(
530 "s_SummarizeSearchFunc input too complex. "
531 "Please expand the function or find/create a better one.");
532 }
533
534 s_SummarizeStringConstraint(out_strm, search_func.GetString_constraint());
535 }
536
537 // Gives a text description of a suspect rule.
538 //
539 // examples:
540 //
541 // - "contains 'partial'"
542 // - "contains '8S' (whole word) but not contains '5.8S' (whole word)"
543 //
544 // only implements the barest subset of this and will surely need more
545 // complexity and to be moved later on.
546 //
547 // Raises an exception if the input is beyond its ability to handle.
548 //
s_SummarizeSuspectRule(ostream & out_strm,const CSuspect_rule & rule)549 static void s_SummarizeSuspectRule(
550 ostream & out_strm, const CSuspect_rule & rule)
551 {
552 if( rule.IsSetFeat_constraint() ||
553 rule.IsSetRule_type() ||
554 rule.IsSetReplace() ||
555 rule.IsSetDescription() ||
556 rule.IsSetFatal() )
557 {
558 NCBI_USER_THROW(
559 "s_SummarizeSuspectRule input too complex. "
560 "Please expand the function or find/create a better one.");
561 }
562
563 _ASSERT(rule.IsSetFind());
564 s_SummarizeSearchFunc(out_strm, rule.GetFind());
565 if( rule.IsSetExcept() ) {
566 out_strm << " but not ";
567 s_SummarizeSearchFunc(out_strm, rule.GetExcept());
568 }
569 }
570
571
572 DISCREPANCY_CASE(SUSPECT_RRNA_PRODUCTS, FEAT, eDisc | eSubmitter | eSmart, "rRNA product names should not contain 'partial' or 'domain'")
573 {
574 static const string kMsg = "[n] rRNA product name[s] contain[S] suspect phrase";
575 for (auto& feat : context.GetFeat()) {
576 if (feat.IsSetData() && feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_rRNA) {
577 const string product = GetRNAProductString(feat);
578 CConstRef<CSuspect_rule_set> rules = s_GetrRNAProductsSuspectRuleSet();
579 vector<char> Hits(rules->Get().size());
580 std::fill(Hits.begin(), Hits.end(), 0);
581 rules->Screen(product, Hits.data());
582 size_t rule_num = 0;
583 for (auto rule : rules->Get()) {
584 if (Hits[rule_num] && rule->StringMatchesSuspectProductRule(product)) {
585 CNcbiOstrstream detailed_msg;
586 detailed_msg << "[n] rRNA product name[s] ";
587 s_SummarizeSuspectRule(detailed_msg, *rule);
588 m_Objs[kMsg][(string)CNcbiOstrstreamToString(detailed_msg)].Ext().Add(*context.SeqFeatObjRef(feat));
589 }
590 rule_num++;
591 }
592 }
593 }
594 }
595
596
// Publishes the sub-items of the exported report tree as this test's report items.
DISCREPANCY_SUMMARIZE(SUSPECT_RRNA_PRODUCTS)
{
    auto exported = m_Objs.Export(*this);
    m_ReportItems = exported->GetSubitems();
}
601
602
603 // _SUSPECT_PRODUCT_NAMES - used for asndisc -N option
604
605 DISCREPANCY_CASE(_SUSPECT_PRODUCT_NAMES, STRING, 0, "Suspect Product Names for asndisc -N option")
606 {
607 CConstRef<CSuspect_rule_set> rules = context.GetProductRules();
608 vector<char> Hits(rules->Get().size());
609 std::fill(Hits.begin(), Hits.end(), 0);
610 const string& str = context.CurrentText();
611 rules->Screen(str, Hits.data());
612
613 if (!ContainsLetters(str)) {
614 CReportNode& node = m_Objs[kSuspectProductNames]["[*-1*]Product name does not contain letters"].Summ()["[n] feature[s] [does] not contain letters in product name"].Summ().Fatal();
615 node.Add(*context.StringObjRef()).Fatal();
616 }
617 size_t rule_num = 0;
618 for (auto rule : rules->Get()) {
619 if (Hits[rule_num] && rule->StringMatchesSuspectProductRule(str)) {
620 string leading_space = "[*" + NStr::NumericToString(rule_num) + "*]";
621 size_t rule_type = rule->GetRule_type();
622 string rule_name = "[*";
623 if (rule_type < 10) {
624 rule_name += " ";
625 }
626 rule_name += NStr::NumericToString(rule_type) + "*]" + GetRuleText(*rule);
627 string rule_text = leading_space + GetRuleMatch(*rule);
628 CReportNode& node = m_Objs[kSuspectProductNames][rule_name].Summ()[rule_text].Summ();
629 node.Add(*context.StringObjRef()).Severity(rule->IsFatal() ? CReportItem::eSeverity_error : CReportItem::eSeverity_warning);
630 }
631 rule_num++;
632 }
633 }
634
635
// Publishes the sub-items of the exported report tree as this test's report items.
DISCREPANCY_SUMMARIZE(_SUSPECT_PRODUCT_NAMES)
{
    auto exported = m_Objs.Export(*this);
    m_ReportItems = exported->GetSubitems();
}
640
641 //////////////////////////////////////////////////////////////////////////
642
643 static bool s_OrganelleProductRulesInitialized = false;
644 DEFINE_STATIC_FAST_MUTEX(s_OrganelleProductRulesMutex);
645 static CRef<CSuspect_rule_set> s_OrganelleProductRules;
646
647 static bool s_ProductRulesInitialized = false;
648 static string s_ProductRulesFileName;
649 DEFINE_STATIC_FAST_MUTEX(s_ProductRulesMutex);
650 static CRef<CSuspect_rule_set> s_ProductRules;
651
652 #define _FSM_RULES static const char* const s_Defaultorganelleproducts[]
653 #define _FSM_EMIT static bool s_Defaultorganelleproducts_emit[]
654 #define _FSM_HITS static map<size_t, vector<size_t>> s_Defaultorganelleproducts_hits
655 #define _FSM_STATES static size_t s_Defaultorganelleproducts_states[]
656 #include "organelle_products.inc"
657 #undef _FSM_EMIT
658 #undef _FSM_HITS
659 #undef _FSM_STATES
660 #undef _FSM_RULES
661
662
s_InitializeOrganelleProductRules(const string & name)663 static void s_InitializeOrganelleProductRules(const string& name)
664 {
665 CFastMutexGuard GUARD(s_OrganelleProductRulesMutex);
666 if (s_OrganelleProductRulesInitialized) {
667 return;
668 }
669 s_OrganelleProductRules.Reset(new CSuspect_rule_set());
670 //string file = name.empty() ? g_FindDataFile("organelle_products.prt") : name;
671
672 if (!name.empty()) {
673 LOG_POST("Reading from " + name + " for organelle products");
674 auto_ptr<CObjectIStream> in;
675 in.reset(CObjectIStream::Open(name, eSerial_AsnText));
676 string header = in->ReadFileHeader();
677 in->Read(ObjectInfo(*s_OrganelleProductRules), CObjectIStream::eNoFileHeader);
678 s_OrganelleProductRules->SetPrecompiledData(nullptr, nullptr, nullptr);
679 }
680 if (!s_OrganelleProductRules->IsSet()) {
681 //LOG_POST("Falling back on built-in data for organelle products");
682 size_t num_lines = ArraySize(s_Defaultorganelleproducts);
683 string all_rules;
684 for (size_t i = 0; i < num_lines; i++) {
685 all_rules += s_Defaultorganelleproducts[i];
686 }
687 CNcbiIstrstream istr(all_rules);
688 istr >> MSerial_AsnText >> *s_OrganelleProductRules;
689 s_OrganelleProductRules->SetPrecompiledData(s_Defaultorganelleproducts_emit, &s_Defaultorganelleproducts_hits, s_Defaultorganelleproducts_states);
690 }
691
692 s_OrganelleProductRulesInitialized = true;
693 }
694
695
696 #define _FSM_RULES static const char* const s_Defaultproductrules[]
697 #define _FSM_EMIT static bool s_Defaultproductrules_emit[]
698 #define _FSM_HITS static map<size_t, vector<size_t>> s_Defaultproductrules_hits
699 #define _FSM_STATES static size_t s_Defaultproductrules_states[]
700 #include "product_rules.inc"
701 #undef _FSM_EMIT
702 #undef _FSM_HITS
703 #undef _FSM_STATES
704 #undef _FSM_RULES
705
706
s_InitializeProductRules(const string & name)707 static void s_InitializeProductRules(const string& name)
708 {
709 CFastMutexGuard GUARD(s_ProductRulesMutex);
710 if (s_ProductRulesInitialized && name == s_ProductRulesFileName) {
711 return;
712 }
713 s_ProductRules.Reset(new CSuspect_rule_set());
714 s_ProductRulesFileName = name;
715 //string file = name.empty() ? g_FindDataFile("product_rules.prt") : name;
716
717 if (!name.empty()) {
718 LOG_POST("Reading from " + name + " for suspect product rules");
719 auto_ptr<CObjectIStream> in;
720 in.reset(CObjectIStream::Open(name, eSerial_AsnText));
721 string header = in->ReadFileHeader();
722 in->Read(ObjectInfo(*s_ProductRules), CObjectIStream::eNoFileHeader);
723 s_ProductRules->SetPrecompiledData(nullptr, nullptr, nullptr);
724 }
725 if (!s_ProductRules->IsSet()) {
726 //LOG_POST("Falling back on built-in data for suspect product rules");
727 size_t num_lines = ArraySize(s_Defaultproductrules);
728 string all_rules;
729 for (size_t i = 0; i < num_lines; i++) {
730 all_rules += s_Defaultproductrules[i];
731 }
732 CNcbiIstrstream istr(all_rules);
733 istr >> MSerial_AsnText >> *s_ProductRules;
734 s_ProductRules->SetPrecompiledData(s_Defaultproductrules_emit, &s_Defaultproductrules_hits, s_Defaultproductrules_states);
735 }
736
737 s_ProductRulesInitialized = true;
738 }
739
740
GetOrganelleProductRules(const string & name)741 CConstRef<CSuspect_rule_set> GetOrganelleProductRules(const string& name)
742 {
743 s_InitializeOrganelleProductRules(name);
744 return CConstRef<CSuspect_rule_set>(s_OrganelleProductRules.GetPointer());
745 }
746
747
GetProductRules(const string & name)748 CConstRef<CSuspect_rule_set> GetProductRules(const string& name)
749 {
750 s_InitializeProductRules(name);
751 return CConstRef<CSuspect_rule_set>(s_ProductRules.GetPointer());
752 }
753
754
755 END_SCOPE(NDiscrepancy)
756 END_NCBI_SCOPE
757