1 /*  $Id: autodef_feature_clause.cpp 632113 2021-05-26 18:40:28Z ivanov $
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author:  Colleen Bollin
27 *
28 * File Description:
29 *   Generate unique definition lines for a set of sequences using organism
30 *   descriptions and feature clauses.
31 */
32 
33 #include <ncbi_pch.hpp>
34 #include <algorithm>
35 #include <objmgr/util/autodef.hpp>
36 #include <corelib/ncbimisc.hpp>
37 #include <objmgr/seqdesc_ci.hpp>
38 #include <objmgr/bioseq_ci.hpp>
39 #include <objmgr/feat_ci.hpp>
40 #include <objmgr/util/feature.hpp>
41 #include <objmgr/util/sequence.hpp>
42 
43 #include <objects/seq/Seq_descr.hpp>
44 #include <objects/seq/Seqdesc.hpp>
45 #include <objects/seq/Bioseq.hpp>
46 #include <objects/seqfeat/RNA_ref.hpp>
47 #include <objects/seqfeat/RNA_gen.hpp>
48 
49 #include <serial/iterator.hpp>
50 
51 BEGIN_NCBI_SCOPE
52 BEGIN_SCOPE(objects)
53 
54 using namespace sequence;
55 
CAutoDefFeatureClause(CBioseq_Handle bh,const CSeq_feat & main_feat,const CSeq_loc & mapped_loc,const CAutoDefOptions & opts)56 CAutoDefFeatureClause::CAutoDefFeatureClause(CBioseq_Handle bh, const CSeq_feat& main_feat, const CSeq_loc& mapped_loc, const CAutoDefOptions& opts)
57                               : CAutoDefFeatureClause_Base(opts),
58                                 m_pMainFeat(&main_feat),
59                                 m_BH(bh)
60 {
61     x_SetBiomol();
62     m_ClauseList.clear();
63     m_GeneName = "";
64     m_AlleleName = "";
65     m_Interval = "";
66     m_IsAltSpliced = false;
67     m_Pluralizable = false;
68     m_TypewordChosen = x_GetFeatureTypeWord(m_Typeword);
69     m_ShowTypewordFirst = x_ShowTypewordFirst(m_Typeword);
70     m_Description = "";
71     m_DescriptionChosen = false;
72     m_ProductName = "";
73     m_ProductNameChosen = false;
74 
75     CSeqFeatData::ESubtype subtype = m_pMainFeat->GetData().GetSubtype();
76 
77     m_ClauseLocation = new CSeq_loc();
78     m_ClauseLocation->Add(mapped_loc);
79 
80     if (subtype == CSeqFeatData::eSubtype_operon || IsGeneCluster()) {
81         m_SuppressSubfeatures = true;
82     }
83 
84     if (m_pMainFeat->CanGetComment() && NStr::Find(m_pMainFeat->GetComment(), "alternatively spliced") != NCBI_NS_STD::string::npos
85         && (subtype == CSeqFeatData::eSubtype_cdregion
86         || subtype == CSeqFeatData::eSubtype_exon
87         || IsNoncodingProductFeat())) {
88         m_IsAltSpliced = true;
89     }
90 }
91 
92 
~CAutoDefFeatureClause()93 CAutoDefFeatureClause::~CAutoDefFeatureClause()
94 {
95 }
96 
97 
GetMainFeatureSubtype() const98 CSeqFeatData::ESubtype CAutoDefFeatureClause::GetMainFeatureSubtype() const
99 {
100     if (IsLTR(*m_pMainFeat)) {
101         return CSeqFeatData::eSubtype_LTR;
102     }
103     return m_pMainFeat->GetData().GetSubtype();
104 }
105 
106 
IsMobileElement() const107 bool CAutoDefFeatureClause::IsMobileElement() const
108 {
109     if (m_pMainFeat->GetData().GetSubtype() != CSeqFeatData::eSubtype_mobile_element) {
110         return false;
111     } else {
112         return true;
113     }
114 }
115 
116 
IsInsertionSequence() const117 bool CAutoDefFeatureClause::IsInsertionSequence() const
118 {
119     if (m_pMainFeat->GetData().GetSubtype() != CSeqFeatData::eSubtype_repeat_region
120         || NStr::IsBlank(m_pMainFeat->GetNamedQual("insertion_seq"))) {
121         return false;
122     } else {
123         return true;
124     }
125 }
126 
127 
IsControlRegion(const CSeq_feat & feat)128 bool CAutoDefFeatureClause::IsControlRegion (const CSeq_feat& feat)
129 {
130     if (feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_misc_feature
131         && feat.CanGetComment()
132         && NStr::StartsWith(feat.GetComment(), "control region")) {
133         return true;
134     } else {
135         return false;
136     }
137 }
138 
139 
IsControlRegion() const140 bool CAutoDefFeatureClause::IsControlRegion() const
141 {
142     return IsControlRegion(*m_pMainFeat);
143 }
144 
145 
IsEndogenousVirusSourceFeature() const146 bool CAutoDefFeatureClause::IsEndogenousVirusSourceFeature () const
147 {
148     if (m_pMainFeat->GetData().GetSubtype() != CSeqFeatData::eSubtype_biosrc
149         || !m_pMainFeat->GetData().GetBiosrc().CanGetSubtype()) {
150         return false;
151     }
152     ITERATE (CBioSource::TSubtype, subSrcI, m_pMainFeat->GetData().GetBiosrc().GetSubtype()) {
153         if ((*subSrcI)->GetSubtype() == CSubSource::eSubtype_endogenous_virus_name) {
154              return true;
155         }
156     }
157     return false;
158 }
159 
160 
IsGeneCluster() const161 bool CAutoDefFeatureClause::IsGeneCluster () const
162 {
163     return IsGeneCluster (*m_pMainFeat);
164 }
165 
166 
IsGeneCluster(const CSeq_feat & feat)167 bool CAutoDefFeatureClause::IsGeneCluster (const CSeq_feat& feat)
168 {
169     if (feat.GetData().GetSubtype() != CSeqFeatData::eSubtype_misc_feature
170         || !feat.CanGetComment()) {
171         return false;
172     }
173 
174     string comment = feat.GetComment();
175     if (NStr::Find(comment, "gene cluster") != string::npos
176         || NStr::Find(comment, "gene locus") != string::npos) {
177         return true;
178     } else {
179         return false;
180     }
181 }
182 
183 
IsRecognizedFeature() const184 bool CAutoDefFeatureClause::IsRecognizedFeature() const
185 {
186     CSeqFeatData::ESubtype subtype = m_pMainFeat->GetData().GetSubtype();
187     if (subtype == CSeqFeatData::eSubtype_3UTR
188         || subtype == CSeqFeatData::eSubtype_5UTR
189         || IsLTR(*m_pMainFeat)
190         || subtype == CSeqFeatData::eSubtype_cdregion
191         || subtype == CSeqFeatData::eSubtype_gene
192         || subtype == CSeqFeatData::eSubtype_mRNA
193         || subtype == CSeqFeatData::eSubtype_operon
194         || subtype == CSeqFeatData::eSubtype_exon
195         || subtype == CSeqFeatData::eSubtype_intron
196         || subtype == CSeqFeatData::eSubtype_rRNA
197         || subtype == CSeqFeatData::eSubtype_tRNA
198         || subtype == CSeqFeatData::eSubtype_otherRNA
199         || subtype == CSeqFeatData::eSubtype_misc_RNA
200         || subtype == CSeqFeatData::eSubtype_ncRNA
201         || subtype == CSeqFeatData::eSubtype_preRNA
202         || subtype == CSeqFeatData::eSubtype_tmRNA
203         || subtype == CSeqFeatData::eSubtype_D_loop
204         || subtype == CSeqFeatData::eSubtype_regulatory
205         || subtype == CSeqFeatData::eSubtype_misc_recomb
206         || IsNoncodingProductFeat()
207         || IsMobileElement()
208         || IsInsertionSequence()
209         || IsControlRegion()
210         || IsEndogenousVirusSourceFeature()
211         || IsSatelliteClause()
212         || IsPromoter()
213         || IsGeneCluster()
214         || GetClauseType() != eDefault) {
215         return true;
216     } else {
217         return false;
218     }
219 }
220 
221 
x_SetBiomol()222 void CAutoDefFeatureClause::x_SetBiomol()
223 {
224     m_Biomol = CMolInfo::eBiomol_genomic;
225     CSeqdesc_CI desc_iter(m_BH, CSeqdesc::e_Molinfo);
226     for ( ;  desc_iter;  ++desc_iter) {
227         if (desc_iter->GetMolinfo().IsSetBiomol()) {
228             m_Biomol = desc_iter->GetMolinfo().GetBiomol();
229         }
230     }
231 }
232 
233 
IsPseudo(const CSeq_feat & f)234 bool CAutoDefFeatureClause::IsPseudo(const CSeq_feat& f)
235 {
236     bool is_pseudo = false;
237     if (f.CanGetPseudo() && f.IsSetPseudo()) {
238         is_pseudo = true;
239     } else if (f.IsSetQual()) {
240         for (auto& it : f.GetQual()) {
241             if (it->IsSetQual() && NStr::EqualNocase(it->GetQual(), "pseudogene")) {
242                 is_pseudo = true;
243                 break;
244             }
245         }
246     }
247     return is_pseudo;
248 }
249 
250 
x_IsPseudo()251 bool CAutoDefFeatureClause::x_IsPseudo()
252 {
253     return (m_GeneIsPseudo || IsPseudo(*m_pMainFeat));
254 }
255 
256 
x_TypewordFromSequence()257 void CAutoDefFeatureClause::x_TypewordFromSequence()
258 {
259     if (m_Biomol == CMolInfo::eBiomol_genomic) {
260         m_Typeword = "genomic sequence";
261     } else if (m_Biomol == CMolInfo::eBiomol_mRNA) {
262         m_Typeword = "mRNA sequence";
263     } else {
264         m_Typeword = "sequence";
265     }
266     m_TypewordChosen = true;
267 }
268 
269 
x_GetFeatureTypeWord(string & typeword)270 bool CAutoDefFeatureClause::x_GetFeatureTypeWord(string &typeword)
271 {
272     string qual, comment;
273 
274     if (IsLTR(*m_pMainFeat)) {
275         typeword = "LTR repeat region";
276         return true;
277     }
278 
279     CSeqFeatData::ESubtype subtype = m_pMainFeat->GetData().GetSubtype();
280     switch (subtype) {
281         case CSeqFeatData::eSubtype_exon:
282             typeword = "exon";
283             return true;
284             break;
285         case CSeqFeatData::eSubtype_intron:
286             typeword = "intron";
287             return true;
288             break;
289         case CSeqFeatData::eSubtype_D_loop:
290             typeword = "D-loop";
291             return true;
292             break;
293         case CSeqFeatData::eSubtype_3UTR:
294             typeword = "3' UTR";
295             return true;
296             break;
297         case CSeqFeatData::eSubtype_5UTR:
298             typeword = "5' UTR";
299             return true;
300             break;
301         case CSeqFeatData::eSubtype_operon:
302             typeword = "operon";
303             return true;
304             break;
305         case CSeqFeatData::eSubtype_repeat_region:
306             //if has insertion_seq gbqual
307             if (IsInsertionSequence()) {
308                 typeword = "insertion sequence";
309                 return true;
310             }
311             qual = m_pMainFeat->GetNamedQual("endogenous_virus");
312             if (!NStr::IsBlank(qual)) {
313                 typeword = "endogenous virus";
314                 return true;
315             }
316             if (IsMobileElement()) {
317                 typeword = "transposon";
318                 return true;
319             }
320             typeword = "repeat region";
321             return true;
322             break;
323         case CSeqFeatData::eSubtype_misc_feature:
324             if (m_pMainFeat->CanGetComment()) {
325                 comment = m_pMainFeat->GetComment();
326                 if (NStr::StartsWith(comment, "control region", NStr::eNocase)) {
327                     typeword = "control region";
328                     return true;
329                 }
330             }
331             break;
332         case CSeqFeatData::eSubtype_misc_recomb:
333             x_TypewordFromSequence();
334             return true;
335             break;
336         case CSeqFeatData::eSubtype_biosrc:
337             if (IsEndogenousVirusSourceFeature()) {
338                 typeword = "endogenous virus";
339                 return true;
340             }
341             break;
342         case CSeqFeatData::eSubtype_regulatory:
343             if (m_pMainFeat->IsSetQual()) {
344                 ITERATE(CSeq_feat::TQual, q, m_pMainFeat->GetQual()) {
345                     if ((*q)->IsSetQual() &&
346                         NStr::Equal((*q)->GetQual(), "regulatory_class") &&
347                         (*q)->IsSetVal() && !NStr::IsBlank((*q)->GetVal())) {
348                         typeword = (*q)->GetVal();
349                         return true;
350                     }
351                 }
352             }
353             break;
354         default:
355             break;
356     }
357 
358     if (m_Biomol == CMolInfo::eBiomol_genomic || m_Biomol == CMolInfo::eBiomol_cRNA) {
359         if (x_IsPseudo()) {
360             typeword = "pseudogene";
361             return true;
362         } else {
363             typeword = "gene";
364             return true;
365         }
366     } else if (subtype == CSeqFeatData::eSubtype_rRNA
367                || subtype == CSeqFeatData::eSubtype_snoRNA
368                || subtype == CSeqFeatData::eSubtype_snRNA
369 			   || subtype == CSeqFeatData::eSubtype_ncRNA) {
370         return false;
371     } else if (subtype == CSeqFeatData::eSubtype_precursor_RNA) {
372         typeword = "precursor RNA";
373         return true;
374     } else if (m_Biomol == CMolInfo::eBiomol_mRNA) {
375         if (x_IsPseudo()) {
376             typeword = "pseudogene mRNA";
377         } else {
378             typeword = "mRNA";
379         }
380         return true;
381     } else if (m_Biomol == CMolInfo::eBiomol_pre_RNA) {
382         if (x_IsPseudo()) {
383             typeword = "pseudogene precursor RNA";
384         } else {
385             typeword = "precursor RNA";
386         }
387         return true;
388     } else if (m_Biomol == CMolInfo::eBiomol_other_genetic) {
389         typeword = "gene";
390         return true;
391     }
392     typeword = "";
393     return true;
394 }
395 
396 
x_ShowTypewordFirst(string typeword)397 bool CAutoDefFeatureClause::x_ShowTypewordFirst(string typeword)
398 {
399     if (NStr::Equal(typeword, "")) {
400         return false;
401     } else if (NStr::EqualNocase(typeword, "exon")
402                || NStr::EqualNocase(typeword, "intron")
403                || NStr::EqualNocase(typeword, "transposon")
404                || NStr::EqualNocase(typeword, "insertion sequence")
405                || NStr::EqualNocase(typeword, "endogenous virus")
406                || NStr::EqualNocase(typeword, "retrotransposon")
407                || NStr::EqualNocase(typeword, "P-element")
408                || NStr::EqualNocase(typeword, "transposable element")
409                || NStr::EqualNocase(typeword, "integron")
410                || NStr::EqualNocase(typeword, "superintegron")
411                || NStr::EqualNocase(typeword, "MITE")) {
412         return true;
413     } else {
414         return false;
415     }
416 }
417 
418 
x_FindNoncodingFeatureKeywordProduct(string comment,string keyword,string & product_name) const419 bool CAutoDefFeatureClause::x_FindNoncodingFeatureKeywordProduct (string comment, string keyword, string &product_name) const
420 {
421     if (NStr::IsBlank(comment) || NStr::IsBlank(keyword)) {
422         return false;
423     }
424     string::size_type start_pos = 0;
425 
426     while (start_pos != NCBI_NS_STD::string::npos) {
427         start_pos = NStr::Find(comment, keyword, start_pos);
428         if (start_pos != NCBI_NS_STD::string::npos) {
429             string possible = comment.substr(start_pos + keyword.length());
430             NStr::TruncateSpacesInPlace(possible);
431             if (!NStr::StartsWith(possible, "GenBank Accession Number")) {
432                 product_name = possible;
433                 // truncate at first semicolon
434                 string::size_type end = NStr::Find(product_name, ";");
435                 if (end != NCBI_NS_STD::string::npos) {
436                     product_name = product_name.substr(0, end);
437                 }
438                 // remove sequence from end of product name if found
439                 if (NStr::EndsWith(product_name, " sequence")) {
440                     product_name = product_name.substr(0, product_name.length() - 9);
441                 }
442                 // add "-like" if not present
443                 if (!NStr::EndsWith(product_name, "-like")) {
444                     product_name += "-like";
445                 }
446                 return true;
447             } else {
448                 start_pos += keyword.length();
449             }
450         }
451     }
452     return false;
453 }
454 
455 
x_GetNoncodingProductFeatProduct(string & product_name) const456 bool CAutoDefFeatureClause::x_GetNoncodingProductFeatProduct (string &product_name) const
457 {
458     if (GetMainFeatureSubtype() != CSeqFeatData::eSubtype_misc_feature
459         || !m_pMainFeat->CanGetComment()) {
460         return false;
461     }
462     string comment = m_pMainFeat->GetComment();
463     string::size_type start_pos = NStr::Find(comment, "nonfunctional ");
464     if (start_pos != NCBI_NS_STD::string::npos) {
465         string::size_type sep_pos = NStr::Find (comment, " due to ", start_pos);
466         if (sep_pos != NCBI_NS_STD::string::npos) {
467             product_name = comment.substr(start_pos, sep_pos - start_pos);
468             return true;
469         }
470     }
471     if (x_FindNoncodingFeatureKeywordProduct (comment, "similar to ", product_name)) {
472         return true;
473     } else if (x_FindNoncodingFeatureKeywordProduct (comment, "contains ", product_name)) {
474         return true;
475     } else {
476         return false;
477     }
478 }
479 
IsNoncodingProductFeat() const480 bool CAutoDefFeatureClause::IsNoncodingProductFeat() const
481 {
482     string product_name;
483     return x_GetNoncodingProductFeatProduct(product_name);
484 }
485 
CAutoDefGeneClause(CBioseq_Handle bh,const CSeq_feat & main_feat,const CSeq_loc & mapped_loc,const CAutoDefOptions & opts)486 CAutoDefGeneClause::CAutoDefGeneClause(CBioseq_Handle bh, const CSeq_feat &main_feat, const CSeq_loc &mapped_loc, const CAutoDefOptions& opts)
487     : CAutoDefFeatureClause(bh, main_feat, mapped_loc, opts)
488 {
489     m_GeneName = x_GetGeneName(m_pMainFeat->GetData().GetGene(), GetSuppressLocusTag());
490     if (m_pMainFeat->GetData().GetGene().CanGetAllele()) {
491         m_AlleleName = m_pMainFeat->GetData().GetGene().GetAllele();
492         if (!NStr::StartsWith(m_AlleleName, m_GeneName, NStr::eNocase)) {
493             if (!NStr::StartsWith(m_AlleleName, "-")) {
494                 m_AlleleName = "-" + m_AlleleName;
495             }
496             m_AlleleName = m_GeneName + m_AlleleName;
497         }
498     }
499     m_GeneIsPseudo = IsPseudo(*m_pMainFeat);
500     m_HasGene = true;
501 }
502 
503 
x_IsPseudo()504 bool CAutoDefGeneClause::x_IsPseudo()
505 {
506     if (CAutoDefFeatureClause::x_IsPseudo()) {
507         return true;
508     }
509     const CGene_ref& gene = m_pMainFeat->GetData().GetGene();
510     if (gene.CanGetPseudo() && gene.IsSetPseudo()) {
511         return true;
512     }
513     return false;
514 }
515 
516 /*
517 *If the feature is a gene and has different strings in the description than
518 * in the locus or locus tag, the description will be used as the product for
519 * the gene.
520 */
x_GetProductName(string & product_name)521 bool CAutoDefGeneClause::x_GetProductName(string &product_name)
522 {
523     if (m_pMainFeat->GetData().GetGene().CanGetDesc()
524         && !NStr::Equal(m_pMainFeat->GetData().GetGene().GetDesc(),
525         m_GeneName)) {
526         product_name = m_pMainFeat->GetData().GetGene().GetDesc();
527         return true;
528     } else {
529         return false;
530     }
531 }
532 
533 
ParseString(string comment,string & gene_name,string & product_name)534 bool CAutoDefParsedtRNAClause::ParseString(string comment, string& gene_name, string& product_name)
535 {
536     product_name = "";
537     gene_name = "";
538 
539     NStr::TruncateSpacesInPlace(comment);
540     if (NStr::EndsWith (comment, " gene")) {
541         comment = comment.substr (0, comment.length() - 5);
542     } else if (NStr::EndsWith (comment, " genes")) {
543         comment = comment.substr (0, comment.length() - 6);
544     }
545 
546     string::size_type pos = NStr::Find(comment, "(");
547     if (pos == NCBI_NS_STD::string::npos) {
548         if (NStr::StartsWith (comment, "tRNA-")) {
549             product_name = comment;
550         } else {
551             /* if not tRNA, gene name is required */
552             return false;
553         }
554     } else {
555         product_name = comment.substr(0, pos);
556         comment = comment.substr (pos + 1);
557         pos = NStr::Find(comment, ")");
558         if (pos == NCBI_NS_STD::string::npos) {
559             return false;
560         }
561         gene_name = comment.substr (0, pos);
562         NStr::TruncateSpacesInPlace(gene_name);
563     }
564     NStr::TruncateSpacesInPlace(product_name);
565 
566     if (NStr::StartsWith (product_name, "tRNA-")) {
567         /* tRNA name must start with "tRNA-" and be followed by one uppercase letter and
568          * two lowercase letters.
569          */
570         if (product_name.length() < 8
571             || !isalpha(product_name.c_str()[5]) || !isupper(product_name.c_str()[5])
572             || !isalpha(product_name.c_str()[6]) || !islower(product_name.c_str()[6])
573             || !isalpha(product_name.c_str()[7]) || !islower(product_name.c_str()[7])) {
574             return false;
575         }
576 
577         /* if present, gene name must start with letters "trn",
578          * and end with one uppercase letter.
579          */
580         if (!NStr::IsBlank (gene_name)
581             && (gene_name.length() < 4
582                 || !NStr::StartsWith(gene_name, "trn" )
583                 || !isalpha(gene_name.c_str()[3])
584                 || !isupper(gene_name.c_str()[3]))) {
585             return false;
586         }
587     }
588     if (NStr::IsBlank (product_name)) {
589         return false;
590     }
591     return true;
592 }
593 
594 
s_tRNAClauseFromNote(CBioseq_Handle bh,const CSeq_feat & cf,const CSeq_loc & mapped_loc,string comment,bool is_first,bool is_last,const CAutoDefOptions & opts)595 CAutoDefParsedtRNAClause *s_tRNAClauseFromNote(CBioseq_Handle bh, const CSeq_feat& cf, const CSeq_loc& mapped_loc, string comment, bool is_first, bool is_last, const CAutoDefOptions& opts)
596 {
597     string product_name;
598     string gene_name;
599     if (!CAutoDefParsedtRNAClause::ParseString(comment, gene_name, product_name)) {
600         return NULL;
601     }
602 
603     return new CAutoDefParsedtRNAClause(bh, cf, mapped_loc, gene_name, product_name, is_first, is_last, opts);
604 }
605 
606 
x_GetGeneName(const CGene_ref & gref,bool suppress_locus_tag) const607 string CAutoDefFeatureClause::x_GetGeneName(const CGene_ref& gref, bool suppress_locus_tag) const
608 {
609     if (gref.IsSuppressed()) {
610         return "";
611     } else if (gref.CanGetLocus() && !NStr::IsBlank(gref.GetLocus())) {
612         return gref.GetLocus();
613     } else if (!suppress_locus_tag && gref.IsSetLocus_tag() && !NStr::IsBlank(gref.GetLocus_tag())) {
614         return gref.GetLocus_tag();
615     } else if (gref.IsSetDesc() && !NStr::IsBlank(gref.GetDesc())) {
616         return gref.GetDesc();
617     } else {
618         return "";
619     }
620 }
621 
622 
s_UseCommentBeforeSemicolon(const CSeq_feat & feat,string & label)623 void s_UseCommentBeforeSemicolon(const CSeq_feat& feat, string& label)
624 {
625     if (feat.IsSetComment()) {
626         label = feat.GetComment();
627         string::size_type pos = NStr::Find(label, ";");
628         if (pos != NCBI_NS_STD::string::npos) {
629             label = label.substr(0, pos);
630         }
631     }
632 }
633 
634 
635 /* Frequently the product associated with a feature is listed as part of the
636  * description of the feature in the definition line.  This function determines
637  * the name of the product associated with this specific feature.  Some
638  * features will be listed with the product of a feature that is associated
639  * with the feature being described - this function does not look at other
640  * features to determine a product name.
641  * If the feature is a misc_feat with particular keywords in the comment,
642  * the product will be determined based on the contents of the comment.
643  * If the feature is a CDS and is marked as pseudo, the product will be
644  * determined based on the contents of the comment.
645  * If the feature is a gene and has different strings in the description than
646  * in the locus or locus tag, the description will be used as the product for
647  * the gene.
648  * If none of the above conditions apply, the sequence indexing context label
649  * will be used to obtain the product name for the feature.
650  */
x_GetProductName(string & product_name)651 bool CAutoDefFeatureClause::x_GetProductName(string &product_name)
652 {
653     CSeqFeatData::ESubtype subtype = m_pMainFeat->GetData().GetSubtype();
654 
655     if (subtype == CSeqFeatData::eSubtype_misc_feature && x_GetNoncodingProductFeatProduct(product_name)) {
656         return true;
657     } else if (subtype == CSeqFeatData::eSubtype_cdregion
658                && m_pMainFeat->CanGetPseudo()
659                && m_pMainFeat->IsSetPseudo()
660                && m_pMainFeat->CanGetComment()) {
661         string comment = m_pMainFeat->GetComment();
662         if (!NStr::IsBlank(comment)) {
663             string::size_type pos = NStr::Find(comment, ";");
664             if (pos != NCBI_NS_STD::string::npos) {
665                 comment = comment.substr(0, pos);
666             }
667             product_name = comment;
668             return true;
669         }
670     } else if (subtype == CSeqFeatData::eSubtype_tmRNA) {
671         product_name = "tmRNA";
672         return true;
673     } else if (m_pMainFeat->GetData().Which() == CSeqFeatData::e_Rna) {
674         product_name = m_pMainFeat->GetData().GetRna().GetRnaProductName();
675         if (NStr::IsBlank(product_name) && m_pMainFeat->IsSetComment()) {
676             product_name = m_pMainFeat->GetComment();
677         }
678         return true;
679     } else if (subtype == CSeqFeatData::eSubtype_regulatory) {
680         return true;
681     } else if (subtype == CSeqFeatData::eSubtype_misc_recomb) {
682         if (m_pMainFeat->IsSetQual()) {
683             ITERATE(CSeq_feat::TQual, q, m_pMainFeat->GetQual()) {
684                 if ((*q)->IsSetQual() && NStr::Equal((*q)->GetQual(), "recombination_class") &&
685                     (*q)->IsSetVal() && !NStr::IsBlank((*q)->GetVal())) {
686                     product_name = (*q)->GetVal();
687                     return true;
688                 }
689             }
690         }
691         s_UseCommentBeforeSemicolon(*m_pMainFeat, product_name);
692         return true;
693     } else if (subtype == CSeqFeatData::eSubtype_exon || subtype == CSeqFeatData::eSubtype_intron) {
694         return x_GetExonDescription(product_name);
695     } else {
696         string label;
697 
698         if (subtype == CSeqFeatData::eSubtype_cdregion && m_pMainFeat->IsSetProduct() && !m_Opts.IsFeatureSuppressed(CSeqFeatData::eSubtype_mat_peptide_aa)) {
699             const CSeq_loc& product_loc = m_pMainFeat->GetProduct();
700             CBioseq_Handle prot_h = m_BH.GetScope().GetBioseqHandle(product_loc);
701             if (prot_h) {
702                 CFeat_CI prot_f(prot_h, CSeqFeatData::eSubtype_prot);
703                 if (prot_f) {
704                     feature::GetLabel(*(prot_f->GetSeq_feat()), &label, feature::fFGL_Content);
705                     if (m_pMainFeat->IsSetPartial() && m_pMainFeat->GetPartial()) {
706                         // RW-1216 suppress mat-peptide region phrase if sig-peptide also present
707                         CFeat_CI sig_pi(prot_h, CSeqFeatData::eSubtype_sig_peptide_aa);
708                         if (!sig_pi) {
709                             CFeat_CI mat_pi(prot_h, CSeqFeatData::eSubtype_mat_peptide_aa);
710                             if (mat_pi && mat_pi->GetData().GetProt().IsSetName()) {
711                                 const string&  m_name = mat_pi->GetData().GetProt().GetName().front();
712                                 ++mat_pi;
713                                 if (!mat_pi && !m_name.empty()) {
714                                     if (label.empty()) {
715                                         label = m_name;
716                                     }
717                                     else {
718                                         label += ", " + m_name + " region,";
719                                     }
720                                 }
721                             }
722                         }
723                     }
724                 }
725             }
726         }
727 
728         if (NStr::IsBlank(label)) {
729             feature::GetLabel(*m_pMainFeat, &label, feature::fFGL_Content);
730         }
731         if ((subtype == CSeqFeatData::eSubtype_cdregion && !NStr::Equal(label, "CDS"))
732             || (subtype == CSeqFeatData::eSubtype_mRNA && !NStr::Equal(label, "mRNA"))
733             || (subtype != CSeqFeatData::eSubtype_cdregion && subtype != CSeqFeatData::eSubtype_mRNA)) {
734         } else {
735             label = "";
736         }
737 
738         // remove unwanted "mRNA-" tacked onto label for mRNA features
739         if (subtype == CSeqFeatData::eSubtype_mRNA && NStr::StartsWith(label, "mRNA-")) {
740             label = label.substr(5);
741         } else if (subtype == CSeqFeatData::eSubtype_rRNA && NStr::StartsWith(label, "rRNA-")) {
742             label = label.substr(5);
743         }
744 
745         if (!NStr::IsBlank(label)) {
746             product_name = label;
747             return true;
748         } else {
749             product_name = "";
750             return false;
751         }
752     }
753     return false;
754 }
755 
756 
x_GetExonDescription(string & description)757 bool CAutoDefFeatureClause::x_GetExonDescription(string &description)
758 {
759     if (m_pMainFeat->IsSetQual()) {
760         ITERATE(CSeq_feat::TQual, it, m_pMainFeat->GetQual()) {
761             if ((*it)->IsSetQual() && (*it)->IsSetVal()
762                 && NStr::EqualNocase((*it)->GetQual(), "number")) {
763                 description = (*it)->GetVal();
764                 return true;
765             }
766         }
767     }
768     description = kEmptyStr;
769     return false;
770 }
771 
772 
x_GetDescription(string & description)773 bool CAutoDefFeatureClause::x_GetDescription(string &description)
774 {
775     CSeqFeatData::ESubtype subtype = m_pMainFeat->GetData().GetSubtype();
776 
777     description = "";
778     if (subtype == CSeqFeatData::eSubtype_exon || subtype == CSeqFeatData::eSubtype_intron) {
779         return x_GetExonDescription(description);
780     } else if (NStr::Equal(m_Typeword, "insertion sequence")) {
781         description = m_pMainFeat->GetNamedQual("insertion_seq");
782         if (NStr::Equal(description, "unnamed")
783             || NStr::IsBlank(description)) {
784             description = "";
785             return false;
786         } else {
787             return true;
788         }
789     } else if (subtype == CSeqFeatData::eSubtype_repeat_region) {
790         if (NStr::Equal(m_Typeword, "endogenous virus")) {
791             description = m_pMainFeat->GetNamedQual("endogenous_virus");
792             if (NStr::Equal(description, "unnamed")
793                 || NStr::IsBlank(description)) {
794                 description = "";
795                 return false;
796             } else {
797                 return true;
798             }
799         } else {
800             description = m_pMainFeat->GetNamedQual("rpt_family");
801             if (NStr::IsBlank(description) && m_pMainFeat->IsSetComment()) {
802                 description = m_pMainFeat->GetComment();
803                 if (IsLTR() && NStr::EndsWith(description, " LTR")) {
804                     description = description.substr(0, description.length() - 4);
805                 }
806             }
807             return true;
808         }
809     } else if (subtype == CSeqFeatData::eSubtype_biosrc
810                && NStr::Equal(m_Typeword, "endogenous virus")) {
811         if (m_pMainFeat->GetData().GetBiosrc().CanGetSubtype()) {
812             ITERATE (CBioSource::TSubtype, subSrcI, m_pMainFeat->GetData().GetBiosrc().GetSubtype()) {
813                 if ((*subSrcI)->GetSubtype() == CSubSource::eSubtype_endogenous_virus_name) {
814                     description = (*subSrcI)->GetName();
815                     if (NStr::Equal(description, "unnamed")
816                         || NStr::IsBlank(description)) {
817                         description = "";
818                     } else {
819                         return true;
820                     }
821                 }
822             }
823         }
824         return false;
825     } else if (NStr::Equal(m_Typeword, "control region")
826                || NStr::Equal(m_Typeword, "D-loop")
827                || subtype == CSeqFeatData::eSubtype_3UTR
828                || subtype == CSeqFeatData::eSubtype_5UTR) {
829         return false;
830     } else if (IsLTR(*m_pMainFeat)) {
831         if (m_pMainFeat->CanGetComment()) {
832             string comment = m_pMainFeat->GetComment();
833             if (NStr::StartsWith(comment, "LTR ")) {
834                 comment = comment.substr(4);
835             } else if (NStr::EndsWith(comment, " LTR")) {
836                 comment = comment.substr(0, comment.length() - 4);
837             }
838             description = comment;
839         }
840         if (NStr::IsBlank(description)) {
841             return false;
842         } else {
843             return true;
844         }
845     } else if (subtype == CSeqFeatData::eSubtype_operon) {
846         description = m_pMainFeat->GetNamedQual("operon");
847         return true;
848     } else {
849         if (!m_ProductNameChosen) {
850             m_ProductNameChosen = x_GetProductName(m_ProductName);
851         }
852 
853         if (!NStr::IsBlank(m_GeneName) && !NStr::IsBlank(m_ProductName)) {
854             description = m_ProductName + " (" + m_GeneName + ")";
855         } else if (!NStr::IsBlank(m_GeneName)) {
856             description = m_GeneName;
857         } else if (!NStr::IsBlank(m_ProductName)) {
858             description = m_ProductName;
859         }
860         if (NStr::IsBlank(description)) {
861             return false;
862         } else {
863             return true;
864         }
865     }
866 }
867 
868 
IsSatelliteClause() const869 bool CAutoDefFeatureClause::IsSatelliteClause() const
870 {
871     return IsSatellite(*m_pMainFeat);
872 }
873 
874 
IsSatellite(const CSeq_feat & feat)875 bool CAutoDefFeatureClause::IsSatellite(const CSeq_feat& feat)
876 {
877     if (feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_repeat_region
878 		&& !NStr::IsBlank (feat.GetNamedQual("satellite"))) {
879         return true;
880     }
881     return false;
882 }
883 
884 
IsPromoter() const885 bool CAutoDefFeatureClause::IsPromoter() const
886 {
887     return IsPromoter(*m_pMainFeat);
888 }
889 
890 
IsLTR() const891 bool CAutoDefFeatureClause::IsLTR() const
892 {
893     return IsLTR(*m_pMainFeat);
894 }
895 
896 
IsPromoter(const CSeq_feat & feat)897 bool CAutoDefFeatureClause::IsPromoter(const CSeq_feat& feat)
898 {
899     if (feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_promoter) {
900         return true;
901     } else if (feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_regulatory &&
902         NStr::Equal(feat.GetNamedQual("regulatory_class"), "promoter")) {
903         return true;
904     } else {
905         return false;
906     }
907 }
908 
909 
IsLTR(const CSeq_feat & feat)910 bool CAutoDefFeatureClause::IsLTR(const CSeq_feat& feat)
911 {
912     if (feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_LTR) {
913         return true;
914     } else if (feat.GetData().GetSubtype() != CSeqFeatData::eSubtype_repeat_region ||
915         !feat.IsSetQual()) {
916         return false;
917     }
918     ITERATE(CSeq_feat::TQual, it, feat.GetQual()) {
919         if ((*it)->IsSetQual() && (*it)->IsSetVal() &&
920             NStr::EqualNocase((*it)->GetQual(), "rpt_type") &&
921             NStr::FindNoCase((*it)->GetVal(), "long_terminal_repeat") != string::npos) {
922             return true;
923         }
924     }
925     return false;
926 }
927 
928 /* operons suppress all subfeatures except promoters (see GB-5635) */
x_GetOperonSubfeatures(string & interval)929 void CAutoDefFeatureClause::x_GetOperonSubfeatures(string &interval)
930 {
931     bool has_promoter = false;
932 
933     for (auto it : m_ClauseList) {
934         if (it->IsPromoter()) {
935             has_promoter = true;
936             break;
937         }
938     }
939     if (has_promoter) {
940         interval += ", promoter region, ";
941     }
942 }
943 
944 
/* This function calculates the "interval" for a clause in the definition
 * line.  The interval could be an empty string, it could indicate whether
 * the location of the feature is partial or complete and whether or not
 * the feature is a CDS, the interval could be a description of the
 * subfeatures of the clause, or the interval could be a combination of the
 * last two items if the feature is a CDS.
 *
 * interval         receives the interval text; it is reset to "" on entry.
 * suppress_allele  forwarded to Label() of each subclause and to
 *                  ListClauses().
 * Returns false (leaving interval empty) for unknown clauses and for the
 * feature kinds listed below that never get an interval; true otherwise.
 *
 * Side effect: null entries are removed from m_ClauseList, and for a CDS a
 * 3' UTR subclause is moved to the end of m_ClauseList.
 */
bool CAutoDefFeatureClause::x_GetGenericInterval (string &interval, bool suppress_allele)
{
    interval = "";
    if (m_IsUnknown) {
        return false;
    }

    CSeqFeatData::ESubtype subtype = GetMainFeatureSubtype();
    // alternatively spliced exons get a fixed interval
    if (subtype == CSeqFeatData::eSubtype_exon && m_IsAltSpliced) {
        interval = "alternatively spliced";
        return true;
    }

    // these kinds of clauses never get an interval
    if (IsSatelliteClause()
        || IsPromoter()
        || subtype == CSeqFeatData::eSubtype_regulatory
        || subtype == CSeqFeatData::eSubtype_exon
        || subtype == CSeqFeatData::eSubtype_intron
        || subtype == CSeqFeatData::eSubtype_5UTR
        || subtype == CSeqFeatData::eSubtype_3UTR
        || (subtype == CSeqFeatData::eSubtype_repeat_region && !NStr::Equal(m_Typeword, "endogenous virus"))
        || subtype == CSeqFeatData::eSubtype_misc_recomb
        || IsLTR()) {
        return false;
    }

    // 3' UTR subclause of a CDS, pulled out of m_ClauseList so that it can be
    // mentioned after the "partial/complete cds" text
    CRef<CAutoDefFeatureClause_Base> utr3;

    if (subtype == CSeqFeatData::eSubtype_operon) {
        // suppress subclauses except promoters
        x_GetOperonSubfeatures(interval);
    } else if (!m_SuppressSubfeatures) {
        // label subclauses
        // check to see if 3'UTR is present, and whether there are any other features
        auto it = m_ClauseList.begin();
        while (it != m_ClauseList.end()) {
            if (*it) {
                (*it)->Label(suppress_allele);
                if ((*it)->GetMainFeatureSubtype() == CSeqFeatData::eSubtype_3UTR && subtype == CSeqFeatData::eSubtype_cdregion) {
                    // remember the 3' UTR and take it out of the list; erase()
                    // yields the iterator to the next element
                    utr3 = *it;
                    it = m_ClauseList.erase(it);
                }
                else {
                    ++it;
                }
            } else {
                // drop null entries
                it = m_ClauseList.erase(it);
            }
        }

        // label any subclauses
        if (m_ClauseList.size() > 0) {
            // for a CDS the final "and" is written below, because more text
            // ("partial/complete cds") follows the subclause list
            bool suppress_final_and = false;
            if (subtype == CSeqFeatData::eSubtype_cdregion && !m_ClauseInfoOnly) {
                suppress_final_and = true;
            }

            // create subclause list for interval
            interval += ListClauses(false, suppress_final_and, suppress_allele);

            if (subtype == CSeqFeatData::eSubtype_cdregion && !m_ClauseInfoOnly) {
                if (utr3 != NULL) {
                    interval += ", ";
                } else if (m_ClauseList.size() == 1) {
                    interval += " and ";
                } else {
                    interval += ", and ";
                }
            } else {
                // the subclause list alone is the interval
                // NOTE(review): when subtype is cdregion and m_ClauseInfoOnly
                // is set, a 3' UTR extracted above is not put back into
                // m_ClauseList on this path - confirm that this is intended.
                return true;
            }
        }
    }

    if (IsPartial()) {
        interval += "partial ";
    } else {
        interval += "complete ";
    }

    if (subtype == CSeqFeatData::eSubtype_cdregion
        && (!x_IsPseudo())) {
        interval += "cds";
        if (m_IsAltSpliced) {
            interval += ", alternatively spliced";
        }
    } else {
        interval += "sequence";
        // product_name is only an out-parameter here; its value is not used
        string product_name;
        if (m_IsAltSpliced && x_GetNoncodingProductFeatProduct (product_name)) {
            interval += ", alternatively spliced";
        }
    }

    if (utr3 != NULL) {
        /* tack UTR3 on at end of clause */
        if (m_ClauseList.size() == 0) {
            interval += " and 3' UTR";
        } else {
            interval += ", and 3' UTR";
        }
        // restore the 3' UTR clause, now as the last subclause
        m_ClauseList.push_back(utr3);
    }

    return true;
}
1058 
1059 
// Fills in the pieces of this clause's label.  The typeword, product name
// and description are each computed only while their "chosen" flag is still
// false; the interval is recomputed on every call.
//
// @param suppress_allele  Forwarded to x_GetGenericInterval().
void CAutoDefFeatureClause::Label(bool suppress_allele)
{
    if (!m_TypewordChosen) {
        m_TypewordChosen = x_GetFeatureTypeWord(m_Typeword);
        m_ShowTypewordFirst = x_ShowTypewordFirst(m_Typeword);
        m_Pluralizable = true;
    }
    if (!m_ProductNameChosen) {
        m_ProductNameChosen = x_GetProductName(m_ProductName);
    }
    if (!m_DescriptionChosen) {
        m_DescriptionChosen = x_GetDescription(m_Description);
    }

    // the return value (whether an interval was produced) is not needed here
    x_GetGenericInterval (m_Interval, suppress_allele);

}
1077 
1078 
CompareLocation(const CSeq_loc & loc) const1079 sequence::ECompare CAutoDefFeatureClause::CompareLocation(const CSeq_loc& loc) const
1080 {
1081     return sequence::Compare(loc, *m_ClauseLocation, &(m_BH.GetScope()),
1082         sequence::fCompareOverlapping);
1083 }
1084 
1085 
SameStrand(const CSeq_loc & loc) const1086 bool CAutoDefFeatureClause::SameStrand(const CSeq_loc& loc) const
1087 {
1088     ENa_strand loc_strand = loc.GetStrand();
1089     ENa_strand this_strand = m_ClauseLocation->GetStrand();
1090 
1091     if ((loc_strand == eNa_strand_minus && this_strand != eNa_strand_minus)
1092         || (loc_strand != eNa_strand_minus && this_strand == eNa_strand_minus)) {
1093         return false;
1094     } else {
1095         return true;
1096     }
1097 
1098 }
1099 
IsPartial() const1100 bool CAutoDefFeatureClause::IsPartial() const
1101 {
1102     if (m_ClauseLocation->IsPartialStart(eExtreme_Biological)
1103         || m_ClauseLocation->IsPartialStop(eExtreme_Biological)) {
1104         return true;
1105     } else {
1106         return false;
1107     }
1108 }
1109 
1110 
GetLocation() const1111 CRef<CSeq_loc> CAutoDefFeatureClause::GetLocation() const
1112 {
1113     return m_ClauseLocation;
1114 }
1115 
1116 
AddToLocation(CRef<CSeq_loc> loc,bool also_set_partials)1117 void CAutoDefFeatureClause::AddToLocation(CRef<CSeq_loc> loc, bool also_set_partials)
1118 {
1119     bool partial5 = m_ClauseLocation->IsPartialStart(eExtreme_Biological);
1120     bool partial3 = m_ClauseLocation->IsPartialStop(eExtreme_Biological);
1121 
1122     if (also_set_partials) {
1123         partial5 |= loc->IsPartialStart(eExtreme_Biological);
1124     }
1125     if (also_set_partials) {
1126         partial3 |= loc->IsPartialStop(eExtreme_Biological);
1127     }
1128     m_ClauseLocation = Seq_loc_Add(*m_ClauseLocation, *loc,
1129                                    CSeq_loc::fSort | CSeq_loc::fMerge_Overlapping,
1130                                    &(m_BH.GetScope()));
1131 
1132 
1133     m_ClauseLocation->SetPartialStart(partial5, eExtreme_Biological);
1134     m_ClauseLocation->SetPartialStop(partial3, eExtreme_Biological);
1135 }
1136 
1137 
1138 // Match for identical strings or for match at the beginning followed by mat-peptide region
DoesmRNAProductNameMatch(const string & mrna_product) const1139 bool CAutoDefFeatureClause::DoesmRNAProductNameMatch(const string& mrna_product) const
1140 {
1141     if (!m_ProductNameChosen) {
1142         return false;
1143     }
1144     if (NStr::Equal(m_ProductName, mrna_product)) {
1145         return true;
1146     }
1147     if (NStr::StartsWith(m_ProductName, mrna_product) && m_ProductName[mrna_product.length()] == ',' && NStr::EndsWith(m_ProductName, " region,")) {
1148         return true;
1149     }
1150     return false;
1151 }
1152 
1153 
1154 /* This function searches this list for clauses to which this mRNA should
1155  * apply.  This is not taken care of by the GroupAllClauses function
1156  * because when an mRNA is added to a CDS, the product for the clause is
1157  * replaced and the location for the clause is expanded, rather than simply
1158  * adding the mRNA as an additional feature in the list, and because an
1159  * mRNA can apply to more than one clause, while other features should
1160  * really only belong to one clause.
1161  */
AddmRNA(CAutoDefFeatureClause_Base * mRNAClause)1162 bool CAutoDefFeatureClause::AddmRNA (CAutoDefFeatureClause_Base *mRNAClause)
1163 {
1164     bool used_mRNA = false;
1165     string clause_product, mRNA_product;
1166     bool adjust_partials = true;
1167 
1168     if (mRNAClause == NULL || ! mRNAClause->SameStrand(*m_ClauseLocation)) {
1169         return false;
1170     }
1171 
1172     CSeqFeatData::ESubtype subtype = m_pMainFeat->GetData().GetSubtype();
1173     sequence::ECompare loc_compare = mRNAClause->CompareLocation(*m_ClauseLocation);
1174     if (subtype == CSeqFeatData::eSubtype_cdregion) {
1175         adjust_partials = false;
1176     }
1177 
1178     if (subtype == CSeqFeatData::eSubtype_cdregion
1179         && DoesmRNAProductNameMatch(mRNAClause->GetProductName())
1180         && (loc_compare == sequence::eContained || loc_compare == sequence::eSame)) {
1181         m_HasmRNA = true;
1182         // when expanding "location" to include mRNA, leave partials for CDS as they were
1183         AddToLocation(mRNAClause->GetLocation(), adjust_partials);
1184         used_mRNA = true;
1185     } else if ((subtype == CSeqFeatData::eSubtype_cdregion || subtype == CSeqFeatData::eSubtype_gene)
1186                && !m_ProductNameChosen
1187                && (loc_compare == sequence::eContained
1188                    || loc_compare == sequence::eContains
1189                    || loc_compare == sequence::eSame)) {
1190         m_HasmRNA = true;
1191         AddToLocation(mRNAClause->GetLocation(), adjust_partials);
1192         used_mRNA = true;
1193         m_ProductName = mRNAClause->GetProductName();
1194         m_ProductNameChosen = true;
1195     }
1196 
1197     if (used_mRNA && mRNAClause->IsAltSpliced()) {
1198         m_IsAltSpliced = true;
1199     }
1200 
1201     return used_mRNA;
1202 }
1203 
1204 
1205 /* This function searches this list for clauses to which this gene should
1206  * apply.  This is not taken care of by the GroupAllClauses function
1207  * because genes are added to clauses as a GeneRefPtr instead of as an
1208  * additional feature in the list, and because a gene can apply to more
1209  * than one clause, while other features should really only belong to
1210  * one clause.
1211  */
AddGene(CAutoDefFeatureClause_Base * gene_clause,bool suppress_allele)1212 bool CAutoDefFeatureClause::AddGene (CAutoDefFeatureClause_Base *gene_clause, bool suppress_allele)
1213 {
1214     bool used_gene = false;
1215 
1216     if (gene_clause == NULL || gene_clause->GetMainFeatureSubtype() != CSeqFeatData::eSubtype_gene) {
1217         return false;
1218     }
1219 
1220     CSeqFeatData::ESubtype subtype = GetMainFeatureSubtype ();
1221 
1222     string noncoding_product_name;
1223 
1224     // only add gene to certain other types of clauses
1225     if (subtype != CSeqFeatData::eSubtype_cdregion
1226         && subtype != CSeqFeatData::eSubtype_mRNA
1227         && subtype != CSeqFeatData::eSubtype_rRNA
1228         && subtype != CSeqFeatData::eSubtype_tRNA
1229         && subtype != CSeqFeatData::eSubtype_misc_RNA
1230         && subtype != CSeqFeatData::eSubtype_otherRNA
1231         && subtype != CSeqFeatData::eSubtype_ncRNA
1232         && subtype != CSeqFeatData::eSubtype_precursor_RNA
1233         && subtype != CSeqFeatData::eSubtype_preRNA
1234         && subtype != CSeqFeatData::eSubtype_tmRNA
1235         && subtype != CSeqFeatData::eSubtype_intron
1236         && subtype != CSeqFeatData::eSubtype_exon
1237         && !x_GetNoncodingProductFeatProduct(noncoding_product_name)) {
1238         return false;
1239     }
1240 
1241     if (m_HasGene) {
1242         // already assigned
1243     } else {
1244         // find overlapping gene for this feature
1245         CAutoDefGeneClause *gene = dynamic_cast<CAutoDefGeneClause *>(gene_clause);
1246         bool suppress_locus_tag = gene ? gene->GetSuppressLocusTag() : false;
1247         CConstRef <CSeq_feat> gene_for_feat = sequence::GetGeneForFeature(*m_pMainFeat, m_BH.GetScope());
1248         if (gene_for_feat && NStr::Equal(x_GetGeneName(gene_for_feat->GetData().GetGene(), suppress_locus_tag), gene_clause->GetGeneName())) {
1249             used_gene = true;
1250             m_HasGene = true;
1251             m_GeneName = gene_clause->GetGeneName();
1252             m_AlleleName = gene_clause->GetAlleleName();
1253             m_GeneIsPseudo = gene_clause->GetGeneIsPseudo();
1254             m_TypewordChosen = x_GetFeatureTypeWord(m_Typeword);
1255         }
1256     }
1257 
1258     if (used_gene && ! m_ProductNameChosen) {
1259         Label(suppress_allele);
1260         if (!m_ProductNameChosen) {
1261             m_ProductNameChosen = true;
1262             m_ProductName = gene_clause->GetProductName();
1263         }
1264     }
1265     if (used_gene) {
1266         m_DescriptionChosen = false;
1267         Label(suppress_allele);
1268     }
1269 
1270     return used_gene;
1271 }
1272 
1273 
OkToGroupUnderByType(const CAutoDefFeatureClause_Base * parent_clause) const1274 bool CAutoDefFeatureClause::OkToGroupUnderByType(const CAutoDefFeatureClause_Base *parent_clause) const
1275 {
1276     bool ok_to_group = false;
1277 
1278     if (parent_clause == NULL) {
1279         return false;
1280     }
1281     CSeqFeatData::ESubtype subtype = m_pMainFeat->GetData().GetSubtype();
1282     CSeqFeatData::ESubtype parent_subtype = parent_clause->GetMainFeatureSubtype();
1283 
1284     if (parent_subtype == CSeqFeatData::eSubtype_mobile_element) {
1285         return true;
1286     }
1287 
1288     if (subtype == CSeqFeatData::eSubtype_exon || subtype == CSeqFeatData::eSubtype_intron) {
1289         if (parent_subtype == CSeqFeatData::eSubtype_cdregion
1290             || parent_subtype == CSeqFeatData::eSubtype_D_loop
1291             || parent_subtype == CSeqFeatData::eSubtype_mRNA
1292             || parent_subtype == CSeqFeatData::eSubtype_gene
1293             || parent_subtype == CSeqFeatData::eSubtype_operon
1294             || parent_clause->IsNoncodingProductFeat()
1295             || parent_clause->IsEndogenousVirusSourceFeature()
1296             || parent_clause->IsGeneCluster()) {
1297             ok_to_group = true;
1298         }
1299     } else if (IsPromoter() || subtype == CSeqFeatData::eSubtype_regulatory) {
1300         if (parent_subtype == CSeqFeatData::eSubtype_cdregion
1301             || parent_subtype == CSeqFeatData::eSubtype_mRNA
1302             || parent_subtype == CSeqFeatData::eSubtype_gene
1303             || parent_subtype == CSeqFeatData::eSubtype_operon
1304             || parent_clause->IsEndogenousVirusSourceFeature()
1305             || parent_clause->IsGeneCluster()) {
1306             ok_to_group = true;
1307         }
1308     } else if (subtype == CSeqFeatData::eSubtype_cdregion) {
1309         if (parent_subtype == CSeqFeatData::eSubtype_mRNA
1310             || parent_clause->IsInsertionSequence()
1311             || parent_clause->IsMobileElement()
1312             || parent_clause->IsEndogenousVirusSourceFeature()
1313             || parent_subtype == CSeqFeatData::eSubtype_operon
1314             || parent_clause->IsGeneCluster()) {
1315             ok_to_group = true;
1316         }
1317     } else if (IsInsertionSequence()
1318                || subtype == CSeqFeatData::eSubtype_gene
1319                || IsMobileElement()
1320                || IsNoncodingProductFeat()
1321                || subtype == CSeqFeatData::eSubtype_operon
1322                || IsGeneCluster()) {
1323         if (parent_clause->IsMobileElement()
1324             || parent_clause->IsInsertionSequence()
1325             || parent_clause->IsEndogenousVirusSourceFeature()
1326             || parent_subtype == CSeqFeatData::eSubtype_operon
1327             || parent_clause->IsGeneCluster()) {
1328             ok_to_group = true;
1329         }
1330     } else if (subtype == CSeqFeatData::eSubtype_3UTR
1331                || subtype == CSeqFeatData::eSubtype_5UTR
1332                || IsLTR(*m_pMainFeat)) {
1333         if (parent_subtype == CSeqFeatData::eSubtype_cdregion
1334             || parent_subtype == CSeqFeatData::eSubtype_mRNA
1335             || parent_subtype == CSeqFeatData::eSubtype_gene
1336             || parent_clause->IsEndogenousVirusSourceFeature()
1337             || parent_subtype == CSeqFeatData::eSubtype_operon
1338             || parent_clause->IsGeneCluster()) {
1339             ok_to_group = true;
1340         }
1341     }
1342 
1343     return ok_to_group;
1344 }
1345 
1346 
// Transposons, insertion sequences, and endogenous viruses
// take subfeatures regardless of whether the subfeature is
// on the same strand.
// Gene Clusters can optionally take subfeatures on either
// strand (gene_cluster_opp_strand is flag).
// Promoters will match up to features that are adjacent.
// Introns will match up to coding regions if the intron
// location is the space between two coding region intervals.
// Any feature on an mRNA sequence groups locationally.
// All other feature matches must be that the feature to
// go into the clause must fit inside the location of the
// other clause.
//
// Returns true when this clause may be placed under parent_clause on
// locational grounds; a NULL parent_clause yields false.
bool CAutoDefFeatureClause::OkToGroupUnderByLocation(const CAutoDefFeatureClause_Base *parent_clause, bool gene_cluster_opp_strand) const
{
    if (parent_clause == NULL) {
        return false;
    }

    if (m_HasGene && parent_clause->GetMainFeatureSubtype() == CSeqFeatData::eSubtype_gene) {
        // genes must match to be parents
        if (!NStr::Equal(m_GeneName, parent_clause->GetGeneName())) {
            return false;
        }
    }

    // on an mRNA molecule every feature groups locationally
    if (m_Biomol == CMolInfo::eBiomol_mRNA) {
        return true;
    }

    sequence::ECompare loc_compare = parent_clause->CompareLocation(*m_ClauseLocation);

    if (loc_compare == sequence::eContained || loc_compare == sequence::eSame) {
        if (parent_clause->SameStrand(*m_ClauseLocation)) {
            return true;
        } else if (parent_clause->IsMobileElement()
                   || parent_clause->IsInsertionSequence()
                   || parent_clause->IsEndogenousVirusSourceFeature()
                   || (parent_clause->IsGeneCluster() && gene_cluster_opp_strand)) {
            // these parents accept subfeatures on the opposite strand
            return true;
        }
    } else if (IsPromoter()
               && parent_clause->SameStrand(*m_ClauseLocation)) {
        // a promoter groups with a parent whose biological start is the
        // position immediately after the promoter's biological stop
        unsigned int promoter_stop = sequence::GetStop(*m_ClauseLocation, &(m_BH.GetScope()), eExtreme_Biological);
        unsigned int parent_start = sequence::GetStart(*(parent_clause->GetLocation()), &(m_BH.GetScope()), eExtreme_Biological);
        if (m_ClauseLocation->GetStrand() == eNa_strand_minus) {
            // on the minus strand the adjacent position has the lower coordinate
            if (promoter_stop == parent_start + 1) {
                return true;
            }
        } else if (promoter_stop + 1 == parent_start) {
            return true;
        }
    } else if (m_pMainFeat->GetData().GetSubtype() == CSeqFeatData::eSubtype_intron
               && parent_clause->GetMainFeatureSubtype() == CSeqFeatData::eSubtype_cdregion
               && parent_clause->SameStrand(*m_ClauseLocation)) {
        // walk the CDS intervals pairwise, looking for a gap that the intron
        // fills exactly
        CSeq_loc_CI seq_loc_it(*(parent_clause->GetLocation()));
        if (seq_loc_it) {
            int intron_start = sequence::GetStart(*m_ClauseLocation, &(m_BH.GetScope()), eExtreme_Biological);
            int intron_stop = sequence::GetStop (*m_ClauseLocation, &(m_BH.GetScope()), eExtreme_Biological);
            int prev_start = seq_loc_it.GetRange().GetFrom();
            int prev_stop = seq_loc_it.GetRange().GetTo();
            ++seq_loc_it;
            while (seq_loc_it) {
                int cds_start = seq_loc_it.GetRange().GetFrom();
                int cds_stop = seq_loc_it.GetRange().GetTo();
                // first test: gap between previous and current interval in
                // ascending order; second test: the same gap when the
                // intervals are visited in descending coordinate order
                if ((intron_start == prev_stop + 1 && intron_stop == cds_start - 1)
                    || (intron_start == cds_stop + 1 && intron_stop == prev_start - 1)) {
                    return true;
                }
                prev_start = cds_start;
                prev_stop = cds_stop;
                ++seq_loc_it;
            }
            // intron could also group with coding region if coding region is adjacent
            // (prev_start/prev_stop now describe the last interval visited)
            if (intron_start > prev_stop && intron_start - 1 == prev_stop) {
                return true;
            } else if (prev_start > intron_stop && prev_start - 1 == intron_stop) {
                return true;
            }
        }
    }

    return false;
}
1430 
1431 
FindBestParentClause(CAutoDefFeatureClause_Base * subclause,bool gene_cluster_opp_strand)1432 CAutoDefFeatureClause_Base *CAutoDefFeatureClause::FindBestParentClause(CAutoDefFeatureClause_Base * subclause, bool gene_cluster_opp_strand)
1433 {
1434     CAutoDefFeatureClause_Base *best_parent;
1435 
1436     if (subclause == NULL || subclause == this) {
1437         return NULL;
1438     }
1439 
1440 	if (!NStr::IsBlank(subclause->GetGeneName()) &&
1441 		!NStr::IsBlank(this->GetGeneName()) &&
1442 		!NStr::Equal(subclause->GetGeneName(), this->GetGeneName())) {
1443 		return NULL;
1444 	}
1445 
1446     best_parent = CAutoDefFeatureClause_Base::FindBestParentClause(subclause, gene_cluster_opp_strand);
1447 
1448     if (subclause->OkToGroupUnderByLocation(this, gene_cluster_opp_strand)
1449         && subclause->OkToGroupUnderByType(this)) {
1450         if (best_parent == NULL || best_parent->CompareLocation(*m_ClauseLocation) == sequence::eContained) {
1451             best_parent = this;
1452         }
1453     }
1454     return best_parent;
1455 }
1456 
ReverseCDSClauseLists()1457 void CAutoDefFeatureClause::ReverseCDSClauseLists()
1458 {
1459     ENa_strand this_strand = m_ClauseLocation->GetStrand();
1460     if (this_strand == eNa_strand_minus
1461         && GetMainFeatureSubtype() == CSeqFeatData::eSubtype_cdregion) {
1462         std::reverse(m_ClauseList.begin(), m_ClauseList.end());
1463     }
1464 
1465     for (unsigned int k = 0; k < m_ClauseList.size(); k++) {
1466         m_ClauseList[k]->ReverseCDSClauseLists();
1467     }
1468 }
1469 
1470 
1471 
ShouldRemoveExons() const1472 bool CAutoDefFeatureClause::ShouldRemoveExons() const
1473 {
1474     unsigned int subtype = GetMainFeatureSubtype();
1475 
1476     if (subtype == CSeqFeatData::eSubtype_mRNA) {
1477         return false;
1478     } else if (subtype == CSeqFeatData::eSubtype_cdregion) {
1479         if (IsPartial()) {
1480             // keep only if exons have numbers
1481             for (size_t k = 0; k < m_ClauseList.size(); k++) {
1482                 if (m_ClauseList[k]->IsExonWithNumber()) {
1483                     return false;
1484                 }
1485             }
1486             return true;
1487         } else {
1488             return true;
1489         }
1490     } else {
1491         return true;
1492     }
1493 }
1494 
1495 
IsExonWithNumber() const1496 bool CAutoDefFeatureClause::IsExonWithNumber() const
1497 {
1498     if (m_pMainFeat->IsSetData() &&
1499         m_pMainFeat->GetData().GetSubtype() == CSeqFeatData::eSubtype_exon &&
1500         m_pMainFeat->IsSetQual()) {
1501         ITERATE(CSeq_feat::TQual, it, m_pMainFeat->GetQual()) {
1502             if ((*it)->IsSetQual() &&
1503                 NStr::Equal((*it)->GetQual(), "number") &&
1504                 (*it)->IsSetVal() &&
1505                 !NStr::IsBlank((*it)->GetVal())) {
1506                 return true;
1507             }
1508         }
1509     }
1510     return false;
1511 }
1512 
1513 
IsBioseqPrecursorRNA() const1514 bool CAutoDefFeatureClause::IsBioseqPrecursorRNA() const
1515 {
1516     if (m_Biomol == CMolInfo::eBiomol_pre_RNA && GetMainFeatureSubtype() == CSeqFeatData::eSubtype_preRNA) {
1517         return true;
1518     } else {
1519         return false;
1520     }
1521 }
1522 
1523 
CAutoDefNcRNAClause(CBioseq_Handle bh,const CSeq_feat & main_feat,const CSeq_loc & mapped_loc,const CAutoDefOptions & opts)1524 CAutoDefNcRNAClause::CAutoDefNcRNAClause(CBioseq_Handle bh, const CSeq_feat &main_feat, const CSeq_loc &mapped_loc, const CAutoDefOptions& opts)
1525                       : CAutoDefFeatureClause(bh, main_feat, mapped_loc, opts),
1526 					   m_UseComment (m_Opts.GetUseNcRNAComment())
1527 {
1528 }
1529 
1530 
~CAutoDefNcRNAClause()1531 CAutoDefNcRNAClause::~CAutoDefNcRNAClause()
1532 {
1533 }
1534 
1535 
x_GetProductName(string & product_name)1536 bool CAutoDefNcRNAClause::x_GetProductName(string &product_name)
1537 {
1538     string ncrna_product;
1539     string ncrna_class;
1540     if (m_pMainFeat->IsSetData() && m_pMainFeat->GetData().IsRna()
1541         && m_pMainFeat->GetData().GetRna().IsSetExt()) {
1542         const CRNA_ref::TExt& ext = m_pMainFeat->GetData().GetRna().GetExt();
1543         if (ext.IsName()) {
1544             ncrna_product = ext.GetName();
1545             if (NStr::EqualNocase(ncrna_product, "ncRNA")) {
1546                 ncrna_product = "";
1547             }
1548         } else if (ext.IsGen()) {
1549             if (ext.GetGen().IsSetProduct()) {
1550                 ncrna_product = ext.GetGen().GetProduct();
1551             }
1552             if (ext.GetGen().IsSetClass()) {
1553                 ncrna_class = ext.GetGen().GetClass();
1554             }
1555         }
1556     }
1557     if (NStr::IsBlank(ncrna_product)) {
1558         ncrna_product = m_pMainFeat->GetNamedQual("product");
1559     }
1560     if (NStr::IsBlank(ncrna_class)) {
1561         ncrna_class = m_pMainFeat->GetNamedQual("ncRNA_class");
1562     }
1563     if (NStr::EqualNocase(ncrna_class, "other")) {
1564         ncrna_class = "";
1565     }
1566     NStr::ReplaceInPlace(ncrna_class, "_", " ");
1567 
1568 	string ncrna_comment;
1569     if (m_pMainFeat->IsSetComment()) {
1570         ncrna_comment = m_pMainFeat->GetComment();
1571         if (!NStr::IsBlank(ncrna_comment)) {
1572             string::size_type pos = NStr::Find(ncrna_comment, ";");
1573             if (pos != NCBI_NS_STD::string::npos) {
1574                 ncrna_comment = ncrna_comment.substr(0, pos);
1575             }
1576         }
1577     }
1578 
1579     if (!NStr::IsBlank (ncrna_product)) {
1580         product_name = ncrna_product;
1581         if (!NStr::IsBlank (ncrna_class)) {
1582             product_name += " " + ncrna_class;
1583         }
1584 	} else if (!NStr::IsBlank(ncrna_class)) {
1585         product_name = ncrna_class;
1586 	} else if (m_UseComment && !NStr::IsBlank (ncrna_comment)) {
1587 		product_name = ncrna_comment;
1588     } else {
1589         product_name = "non-coding RNA";
1590     }
1591     return true;
1592 
1593 }
1594 
1595 
// Keywords recognized inside a "mobile_element_type" qualifier value.
// The CAutoDefMobileElementClause constructor tries them in array order and
// stops at the first keyword that matches, so the order is significant.
static string mobile_element_keywords [] = {
    "insertion sequence",
    "retrotransposon",
    "non-LTR retrotransposon",
    "transposon",
    "P-element",
    "transposable element",
    "integron",
    "superintegron",
    // short repeat-family names
    "SINE", "MITE", "LINE"
};
1609 
1610 
CAutoDefMobileElementClause(CBioseq_Handle bh,const CSeq_feat & main_feat,const CSeq_loc & mapped_loc,const CAutoDefOptions & opts)1611 CAutoDefMobileElementClause::CAutoDefMobileElementClause(CBioseq_Handle bh, const CSeq_feat& main_feat, const CSeq_loc& mapped_loc, const CAutoDefOptions& opts)
1612                   : CAutoDefFeatureClause(bh, main_feat, mapped_loc, opts)
1613 {
1614     string mobile_element_name = m_pMainFeat->GetNamedQual("mobile_element_type");
1615     if (NStr::StartsWith(mobile_element_name, "other:")) {
1616         mobile_element_name = mobile_element_name.substr(6);
1617     }
1618     bool   found_keyword = false;
1619 
1620     m_Pluralizable = true;
1621 
1622     if (NStr::IsBlank(mobile_element_name)) {
1623         m_Description = "";
1624         m_ShowTypewordFirst = false;
1625         m_Typeword = "mobile element";
1626     } else {
1627         for (unsigned int k = 0; k < sizeof (mobile_element_keywords) / sizeof (string) && !found_keyword; k++) {
1628             size_t pos;
1629             if (NStr::StartsWith(mobile_element_name, mobile_element_keywords[k])) {
1630                 // keyword at the beginning
1631                 m_Typeword = mobile_element_keywords[k];
1632                 if (NStr::Equal(mobile_element_name, mobile_element_keywords[k])) {
1633                     m_ShowTypewordFirst = false;
1634                     m_Description = "";
1635                 } else {
1636                     m_ShowTypewordFirst = true;
1637                     m_Description = mobile_element_name.substr(mobile_element_keywords[k].length());
1638                     NStr::TruncateSpacesInPlace(m_Description);
1639                 }
1640                 if (mobile_element_name.c_str()[mobile_element_keywords[k].length()] == '-') {
1641                     // if keyword is hyphenated portion of name, no pluralization
1642                     m_Pluralizable = false;
1643                 }
1644                 found_keyword = true;
1645             } else if (NStr::EndsWith(mobile_element_name, mobile_element_keywords[k])) {
1646                 // keyword at the end
1647                 m_Typeword = mobile_element_keywords[k];
1648                 m_ShowTypewordFirst = false;
1649                 m_Description = mobile_element_name.substr(0, mobile_element_name.length() - mobile_element_keywords[k].length());
1650                 NStr::TruncateSpacesInPlace(m_Description);
1651                 found_keyword = true;
1652             } else if ((pos = NStr::Find(mobile_element_name, mobile_element_keywords[k])) != string::npos
1653                        && isspace(mobile_element_name.c_str()[pos])) {
1654                 // keyword in the middle
1655                 m_Typeword = "";
1656                 m_ShowTypewordFirst = false;
1657                 m_Description = mobile_element_name.substr(pos);
1658                 m_Pluralizable = false;
1659             }
1660         }
1661         if (!found_keyword) {
1662             // keyword not in description
1663             m_Typeword = "mobile element";
1664             m_Description = mobile_element_name;
1665         }
1666     }
1667     if (NStr::EqualNocase(m_Typeword, "integron")) {
1668         m_ShowTypewordFirst = false;
1669     }
1670 
1671     m_DescriptionChosen = true;
1672     m_TypewordChosen = true;
1673     m_ProductName = "";
1674     m_ProductNameChosen = true;
1675     NStr::TruncateSpacesInPlace(m_Description);
1676     if (NStr::StartsWith(m_Description, ":")) {
1677         m_Description = m_Description.substr(1);
1678         NStr::TruncateSpacesInPlace(m_Description);
1679     }
1680     if (NStr::Equal(m_Description, "unnamed")) {
1681         m_Description = "";
1682     }
1683 }
1684 
1685 
~CAutoDefMobileElementClause()1686 CAutoDefMobileElementClause::~CAutoDefMobileElementClause()
1687 {
1688 }
1689 
1690 
Label(bool suppress_allele)1691 void CAutoDefMobileElementClause::Label(bool suppress_allele)
1692 {
1693     m_DescriptionChosen = true;
1694     x_GetGenericInterval (m_Interval, suppress_allele);
1695 }
1696 
1697 
IsOptional()1698 bool CAutoDefMobileElementClause::IsOptional()
1699 {
1700     if (NStr::Equal(m_Typeword, "SINE") ||
1701         NStr::Equal(m_Typeword, "LINE") ||
1702         NStr::Equal(m_Typeword, "MITE")) {
1703         return true;
1704     } else {
1705         return false;
1706     }
1707 
1708 }
1709 
1710 
// Prefixes that the CAutoDefSatelliteClause constructor looks for at the
// start of a "satellite" qualifier value.
const char* kMinisatellite  = "minisatellite";
const char* kMicrosatellite = "microsatellite";
const char* kSatellite      = "satellite";
1714 
CAutoDefSatelliteClause(CBioseq_Handle bh,const CSeq_feat & main_feat,const CSeq_loc & mapped_loc,const CAutoDefOptions & opts)1715 CAutoDefSatelliteClause::CAutoDefSatelliteClause(CBioseq_Handle bh, const CSeq_feat& main_feat, const CSeq_loc &mapped_loc, const CAutoDefOptions& opts)
1716                   : CAutoDefFeatureClause(bh, main_feat, mapped_loc, opts)
1717 {
1718 	string comment = m_pMainFeat->GetNamedQual("satellite");
1719     string::size_type pos = NStr::Find(comment, ";");
1720     if (pos != NCBI_NS_STD::string::npos) {
1721         comment = comment.substr(0, pos);
1722     }
1723 
1724 	size_t len = 0;
1725 
1726 	if (NStr::StartsWith(comment, kMinisatellite)) {
1727 		len = strlen (kMinisatellite);
1728 	} else if (NStr::StartsWith (comment, kMicrosatellite)) {
1729 		len = strlen (kMicrosatellite);
1730 	} else if (NStr::StartsWith (comment, kSatellite)) {
1731 		len = strlen (kSatellite);
1732     } else {
1733         // use default label satellite
1734         string prefix = kSatellite;
1735         comment = prefix + " " + comment;
1736     }
1737 	if (len > 0 && NStr::Equal(comment.substr(len, 1), ":")) {
1738 	    comment = comment.substr (0, len) + " " + comment.substr (len + 1);
1739 	}
1740 
1741     m_Description = comment;
1742     m_DescriptionChosen = true;
1743     m_Typeword = "sequence";
1744     m_TypewordChosen = true;
1745 }
1746 
1747 
~CAutoDefSatelliteClause()1748 CAutoDefSatelliteClause::~CAutoDefSatelliteClause()
1749 {
1750 }
1751 
1752 
Label(bool suppress_allele)1753 void CAutoDefSatelliteClause::Label(bool suppress_allele)
1754 {
1755     m_DescriptionChosen = true;
1756     x_GetGenericInterval(m_Interval, suppress_allele);
1757 }
1758 
1759 
CAutoDefPromoterClause(CBioseq_Handle bh,const CSeq_feat & main_feat,const CSeq_loc & mapped_loc,const CAutoDefOptions & opts)1760 CAutoDefPromoterClause::CAutoDefPromoterClause(CBioseq_Handle bh, const CSeq_feat& main_feat, const CSeq_loc &mapped_loc, const CAutoDefOptions& opts)
1761                   : CAutoDefFeatureClause(bh, main_feat, mapped_loc, opts)
1762 {
1763     m_Description = "";
1764     m_DescriptionChosen = true;
1765     m_Typeword = "promoter region";
1766     m_TypewordChosen = true;
1767     m_Interval = "";
1768 }
1769 
1770 
~CAutoDefPromoterClause()1771 CAutoDefPromoterClause::~CAutoDefPromoterClause()
1772 {
1773 }
1774 
1775 
Label(bool suppress_allele)1776 void CAutoDefPromoterClause::Label(bool suppress_allele)
1777 {
1778     m_DescriptionChosen = true;
1779 }
1780 
1781 
/* This class produces the default definition line label for a misc_feature
 * that has the words "intergenic spacer" in the comment.  If the comment starts
 * with the word "contains", "contains" is ignored.  If "intergenic spacer"
 * appears first in the comment (or first after the word "contains"), the text
 * after the words "intergenic spacer" but before the first semicolon (if any)
 * appears after the words "intergenic spacer" in the definition line.  If there
 * are words after "contains" or at the beginning of the comment before the words
 * "intergenic spacer", this text will appear in the definition line before the
 * words "intergenic spacer".
 */
1792 
InitWithString(string comment,bool suppress_allele)1793 void CAutoDefIntergenicSpacerClause::InitWithString (string comment, bool suppress_allele)
1794 {
1795     m_Typeword = "intergenic spacer";
1796     m_TypewordChosen = true;
1797     m_ShowTypewordFirst = false;
1798     m_Pluralizable = false;
1799 
1800 
1801     if (NStr::StartsWith(comment, "may contain ")) {
1802         m_Description = comment.substr(12);
1803         m_DescriptionChosen = true;
1804         m_Typeword = "";
1805         m_TypewordChosen = true;
1806         m_Interval = "region";
1807     } else {
1808         if (NStr::StartsWith(comment, "contains ")) {
1809             comment = comment.substr(9);
1810         }
1811 
1812         if (NStr::StartsWith(comment, "intergenic spacer")) {
1813             comment = comment.substr(17);
1814             if (NStr::IsBlank(comment)) {
1815                 m_ShowTypewordFirst = false;
1816                 m_Description = "";
1817                 m_DescriptionChosen = true;
1818             } else {
1819                 NStr::TruncateSpacesInPlace(comment);
1820                 if (NStr::StartsWith(comment, "and ")) {
1821                     m_Description = "";
1822                     m_DescriptionChosen = true;
1823                     m_ShowTypewordFirst = false;
1824                 } else {
1825                     m_Description = comment;
1826                     m_DescriptionChosen = true;
1827                     m_ShowTypewordFirst = true;
1828                 }
1829             }
1830         } else {
1831             string::size_type pos = NStr::Find(comment, "intergenic spacer");
1832             if (pos != NCBI_NS_STD::string::npos) {
1833                 m_Description = comment.substr(0, pos);
1834                 NStr::TruncateSpacesInPlace(m_Description);
1835                 m_DescriptionChosen = true;
1836                 m_ShowTypewordFirst = false;
1837             }
1838         }
1839         x_GetGenericInterval(m_Interval, suppress_allele);
1840     }
1841 }
1842 
1843 
CAutoDefIntergenicSpacerClause(CBioseq_Handle bh,const CSeq_feat & main_feat,const CSeq_loc & mapped_loc,string comment,const CAutoDefOptions & opts)1844 CAutoDefIntergenicSpacerClause::CAutoDefIntergenicSpacerClause(CBioseq_Handle bh, const CSeq_feat& main_feat, const CSeq_loc &mapped_loc, string comment, const CAutoDefOptions& opts)
1845                   : CAutoDefFeatureClause(bh, main_feat, mapped_loc, opts)
1846 {
1847     InitWithString (comment, true);
1848 }
1849 
1850 
CAutoDefIntergenicSpacerClause(CBioseq_Handle bh,const CSeq_feat & main_feat,const CSeq_loc & mapped_loc,const CAutoDefOptions & opts)1851 CAutoDefIntergenicSpacerClause::CAutoDefIntergenicSpacerClause(CBioseq_Handle bh, const CSeq_feat& main_feat, const CSeq_loc &mapped_loc, const CAutoDefOptions& opts)
1852                   : CAutoDefFeatureClause(bh, main_feat, mapped_loc, opts)
1853 {
1854 
1855     string comment;
1856     if (m_pMainFeat->IsSetComment()) {
1857         comment = m_pMainFeat->GetComment();
1858     }
1859 
1860     /* truncate at first semicolon */
1861     string::size_type pos = NStr::Find(comment, ";");
1862     if (pos != NCBI_NS_STD::string::npos) {
1863         comment = comment.substr(0, pos);
1864     }
1865 
1866     InitWithString (comment, true);
1867 }
1868 
1869 
~CAutoDefIntergenicSpacerClause()1870 CAutoDefIntergenicSpacerClause::~CAutoDefIntergenicSpacerClause()
1871 {
1872 }
1873 
1874 
Label(bool suppress_allele)1875 void CAutoDefIntergenicSpacerClause::Label(bool suppress_allele)
1876 {
1877     m_DescriptionChosen = true;
1878 }
1879 
1880 
CAutoDefParsedIntergenicSpacerClause(CBioseq_Handle bh,const CSeq_feat & main_feat,const CSeq_loc & mapped_loc,const string & description,bool is_first,bool is_last,const CAutoDefOptions & opts)1881 CAutoDefParsedIntergenicSpacerClause::CAutoDefParsedIntergenicSpacerClause(CBioseq_Handle bh, const CSeq_feat &main_feat, const CSeq_loc &mapped_loc,
1882                                                                            const string& description, bool is_first, bool is_last, const CAutoDefOptions& opts)
1883                                                                            : CAutoDefIntergenicSpacerClause(bh, main_feat, mapped_loc, opts)
1884 {
1885     if (!NStr::IsBlank(description)) {
1886         m_Description = description;
1887         size_t pos = NStr::Find(m_Description, "intergenic spacer");
1888         if (pos != string::npos) {
1889             m_Description = m_Description.substr(0, pos);
1890             NStr::TruncateSpacesInPlace(m_Description);
1891         }
1892         m_DescriptionChosen = true;
1893     }
1894     m_Typeword = "intergenic spacer";
1895     m_TypewordChosen = true;
1896 
1897     // adjust partialness of location
1898     bool partial5 = m_ClauseLocation->IsPartialStart(eExtreme_Biological) && is_first;
1899     bool partial3 = m_ClauseLocation->IsPartialStop(eExtreme_Biological) && is_last;
1900     m_ClauseLocation->SetPartialStart(partial5, eExtreme_Biological);
1901     m_ClauseLocation->SetPartialStop(partial3, eExtreme_Biological);
1902     x_GetGenericInterval(m_Interval, true);
1903     if (NStr::EndsWith(description, " region")) {
1904         MakeRegion();
1905     }
1906 }
1907 
1908 
~CAutoDefParsedIntergenicSpacerClause()1909 CAutoDefParsedIntergenicSpacerClause::~CAutoDefParsedIntergenicSpacerClause()
1910 {
1911 }
1912 
1913 
~CAutoDefParsedtRNAClause()1914 CAutoDefParsedtRNAClause::~CAutoDefParsedtRNAClause()
1915 {
1916 }
1917 
1918 
CAutoDefParsedClause(CBioseq_Handle bh,const CSeq_feat & main_feat,const CSeq_loc & mapped_loc,bool is_first,bool is_last,const CAutoDefOptions & opts)1919 CAutoDefParsedClause::CAutoDefParsedClause(CBioseq_Handle bh, const CSeq_feat &main_feat, const CSeq_loc &mapped_loc, bool is_first, bool is_last, const CAutoDefOptions& opts)
1920                                        : CAutoDefFeatureClause (bh, main_feat, mapped_loc, opts)
1921 {
1922     // adjust partialness of location
1923     bool partial5 = m_ClauseLocation->IsPartialStart(eExtreme_Biological) && is_first;
1924     bool partial3 = m_ClauseLocation->IsPartialStop(eExtreme_Biological) && is_last;
1925     m_ClauseLocation->SetPartialStart(partial5, eExtreme_Biological);
1926     m_ClauseLocation->SetPartialStop(partial3, eExtreme_Biological);
1927 }
1928 
~CAutoDefParsedClause()1929 CAutoDefParsedClause::~CAutoDefParsedClause()
1930 {
1931 }
1932 
SetMiscRNAWord(const string & phrase)1933 void CAutoDefParsedClause::SetMiscRNAWord(const string& phrase)
1934 {
1935     ERnaMiscWord word_type = x_GetRnaMiscWordType(phrase);
1936     if (word_type == eMiscRnaWordType_InternalSpacer ||
1937         word_type == eMiscRnaWordType_ExternalSpacer ||
1938         word_type == eMiscRnaWordType_RNAIntergenicSpacer ||
1939         word_type == eMiscRnaWordType_IntergenicSpacer) {
1940         const string& item_name = x_GetRnaMiscWord(word_type);
1941         if (NStr::StartsWith(phrase, item_name)) {
1942             SetTypewordFirst(true);
1943             m_Description = phrase.substr(item_name.length());
1944         } else {
1945             SetTypewordFirst(false);
1946             m_Description = phrase.substr(0, NStr::Find(phrase, item_name));
1947         }
1948         if (NStr::EndsWith(phrase, " region") &&
1949             (!m_ShowTypewordFirst || m_Description != " region")) {
1950             SetTypeword(item_name + " region");
1951         } else {
1952             SetTypeword(item_name);
1953         }
1954     } else if (word_type == eMiscRnaWordType_RNA) {
1955         m_Description = phrase;
1956         if (NStr::EndsWith(m_Description, " gene")) {
1957             m_Description = m_Description.substr(0, m_Description.length() - 5);
1958         }
1959         SetTypeword("gene");
1960         SetTypewordFirst(false);
1961     } else if (word_type == eMiscRnaWordType_tRNA) {
1962         string gene_name;
1963         string product_name;
1964         if (CAutoDefParsedtRNAClause::ParseString(phrase, gene_name, product_name)) {
1965             m_TypewordChosen = true;
1966             m_GeneName = gene_name;
1967             if (!NStr::IsBlank(m_GeneName)) {
1968                 m_HasGene = true;
1969             }
1970             m_ProductName = product_name;
1971             m_ProductNameChosen = true;
1972             x_GetDescription(m_Description);
1973         } else {
1974             m_Description = phrase;
1975         }
1976         SetTypeword("gene");
1977         SetTypewordFirst(false);
1978     }
1979     NStr::TruncateSpacesInPlace(m_Description);
1980     m_DescriptionChosen = true;
1981 }
1982 
1983 
CAutoDefParsedtRNAClause(CBioseq_Handle bh,const CSeq_feat & main_feat,const CSeq_loc & mapped_loc,string gene_name,string product_name,bool is_first,bool is_last,const CAutoDefOptions & opts)1984 CAutoDefParsedtRNAClause::CAutoDefParsedtRNAClause(CBioseq_Handle bh, const CSeq_feat &main_feat, const CSeq_loc &mapped_loc,
1985                                                    string gene_name, string product_name,
1986                                                    bool is_first, bool is_last, const CAutoDefOptions& opts)
1987                                                    : CAutoDefParsedClause (bh, main_feat, mapped_loc, is_first, is_last, opts)
1988 {
1989     m_Typeword = "gene";
1990     m_TypewordChosen = true;
1991     m_GeneName = gene_name;
1992     if (!NStr::IsBlank (m_GeneName)) {
1993         m_HasGene = true;
1994     }
1995     m_ProductName = product_name;
1996     m_ProductNameChosen = true;
1997 }
1998 
1999 
CAutoDefGeneClusterClause(CBioseq_Handle bh,const CSeq_feat & main_feat,const CSeq_loc & mapped_loc,const CAutoDefOptions & opts)2000 CAutoDefGeneClusterClause::CAutoDefGeneClusterClause(CBioseq_Handle bh, const CSeq_feat& main_feat, const CSeq_loc &mapped_loc, const CAutoDefOptions& opts)
2001                   : CAutoDefFeatureClause(bh, main_feat, mapped_loc, opts)
2002 {
2003     m_Pluralizable = false;
2004     m_ShowTypewordFirst = false;
2005     string comment = m_pMainFeat->GetComment();
2006 
2007     string::size_type pos = NStr::Find(comment, "gene cluster");
2008     if (pos == NCBI_NS_STD::string::npos) {
2009         pos = NStr::Find(comment, "gene locus");
2010         m_Typeword = "gene locus";
2011         m_TypewordChosen = true;
2012     } else {
2013         m_Typeword = "gene cluster";
2014         m_TypewordChosen = true;
2015     }
2016 
2017     if (pos != NCBI_NS_STD::string::npos) {
2018         comment = comment.substr(0, pos);
2019     }
2020     NStr::TruncateSpacesInPlace(comment);
2021     m_Description = comment;
2022     m_DescriptionChosen = true;
2023     m_SuppressSubfeatures = true;
2024 }
2025 
2026 
~CAutoDefGeneClusterClause()2027 CAutoDefGeneClusterClause::~CAutoDefGeneClusterClause()
2028 {
2029 }
2030 
2031 
Label(bool suppress_allele)2032 void CAutoDefGeneClusterClause::Label(bool suppress_allele)
2033 {
2034     x_GetGenericInterval(m_Interval, suppress_allele);
2035     m_DescriptionChosen = true;
2036 }
2037 
2038 
CAutoDefMiscCommentClause(CBioseq_Handle bh,const CSeq_feat & main_feat,const CSeq_loc & mapped_loc,const CAutoDefOptions & opts)2039 CAutoDefMiscCommentClause::CAutoDefMiscCommentClause(CBioseq_Handle bh, const CSeq_feat &main_feat, const CSeq_loc &mapped_loc, const CAutoDefOptions& opts)
2040                   : CAutoDefFeatureClause(bh, main_feat, mapped_loc, opts)
2041 {
2042     if (m_pMainFeat->CanGetComment()) {
2043         m_Description = m_pMainFeat->GetComment();
2044         string::size_type pos = NStr::Find(m_Description, ";");
2045         if (pos != NCBI_NS_STD::string::npos) {
2046             m_Description = m_Description.substr(0, pos);
2047         }
2048         m_DescriptionChosen = true;
2049     }
2050     if (NStr::EndsWith(m_Description, " sequence")) {
2051         m_Description = m_Description.substr(0, m_Description.length() - 9);
2052         m_Typeword = "sequence";
2053         m_TypewordChosen = true;
2054     } else {
2055         x_TypewordFromSequence();
2056     }
2057     m_Interval = "";
2058 }
2059 
2060 
~CAutoDefMiscCommentClause()2061 CAutoDefMiscCommentClause::~CAutoDefMiscCommentClause()
2062 {
2063 }
2064 
2065 
Label(bool suppress_allele)2066 void CAutoDefMiscCommentClause::Label(bool suppress_allele)
2067 {
2068     m_DescriptionChosen = true;
2069 }
2070 
2071 
CAutoDefParsedRegionClause(CBioseq_Handle bh,const CSeq_feat & main_feat,const CSeq_loc & mapped_loc,string product,const CAutoDefOptions & opts)2072 CAutoDefParsedRegionClause::CAutoDefParsedRegionClause
2073 (CBioseq_Handle bh, const CSeq_feat &main_feat, const CSeq_loc &mapped_loc, string product, const CAutoDefOptions& opts)
2074 : CAutoDefFeatureClause(bh, main_feat, mapped_loc, opts)
2075 {
2076     vector<string> elements = GetMiscRNAElements(product);
2077     if (elements.empty()) {
2078         m_Description = product;
2079     } else {
2080         ITERATE(vector<string>, it, elements) {
2081             if (!NStr::IsBlank(m_Description)) {
2082                 m_Description += ", ";
2083                 if (*it == elements.back()) {
2084                     m_Description += "and ";
2085                 }
2086             }
2087             m_Description += *it;
2088             if (NStr::Find(*it, "RNA") != string::npos && !NStr::EndsWith(*it, "gene") && !NStr::EndsWith(*it, "genes")) {
2089                 m_Description += " gene";
2090             }
2091         }
2092     }
2093     m_DescriptionChosen = true;
2094 
2095     m_Typeword = "";
2096     m_TypewordChosen = true;
2097     m_Interval = "region";
2098 }
2099 
2100 
~CAutoDefParsedRegionClause()2101 CAutoDefParsedRegionClause::~CAutoDefParsedRegionClause()
2102 {
2103 }
2104 
2105 
Label(bool suppress_allele)2106 void CAutoDefParsedRegionClause::Label(bool suppress_allele)
2107 {
2108 }
2109 
CAutoDefFakePromoterClause(CBioseq_Handle bh,const CSeq_feat & main_feat,const CSeq_loc & mapped_loc,const CAutoDefOptions & opts)2110 CAutoDefFakePromoterClause::CAutoDefFakePromoterClause(CBioseq_Handle bh, const CSeq_feat &main_feat, const CSeq_loc &mapped_loc, const CAutoDefOptions& opts)
2111                    : CAutoDefFeatureClause (bh, main_feat, mapped_loc, opts)
2112 {
2113     m_Description = "";
2114     m_DescriptionChosen = true;
2115     m_Typeword = "promoter region";
2116     m_TypewordChosen = true;
2117     m_ShowTypewordFirst = false;
2118     m_Interval = "";
2119 
2120 
2121     m_ClauseLocation = new CSeq_loc();
2122     const CSeq_id* id = FindBestChoice(bh.GetBioseqCore()->GetId(), CSeq_id::BestRank);
2123     CRef <CSeq_id> new_id(new CSeq_id);
2124     new_id->Assign(*id);
2125     m_ClauseLocation->SetInt().SetId(*new_id);
2126     m_ClauseLocation->SetInt().SetFrom(0);
2127     m_ClauseLocation->SetInt().SetTo(bh.GetInst_Length() - 1);
2128 
2129 }
2130 
2131 
~CAutoDefFakePromoterClause()2132 CAutoDefFakePromoterClause::~CAutoDefFakePromoterClause()
2133 {
2134 }
2135 
2136 
Label(bool suppress_allele)2137 void CAutoDefFakePromoterClause::Label(bool suppress_allele)
2138 {
2139 }
2140 
2141 
OkToGroupUnderByLocation(const CAutoDefFeatureClause_Base * parent_clause,bool gene_cluster_opp_strand) const2142 bool CAutoDefFakePromoterClause::OkToGroupUnderByLocation(const CAutoDefFeatureClause_Base *parent_clause, bool gene_cluster_opp_strand) const
2143 {
2144     if (parent_clause == NULL) {
2145         return false;
2146     } else {
2147         return true;
2148     }
2149 }
2150 
2151 
OkToGroupUnderByType(const CAutoDefFeatureClause_Base * parent_clause) const2152 bool CAutoDefFakePromoterClause::OkToGroupUnderByType(const CAutoDefFeatureClause_Base *parent_clause) const
2153 {
2154     bool ok_to_group = false;
2155 
2156     if (parent_clause == NULL) {
2157         return false;
2158     }
2159     CSeqFeatData::ESubtype parent_subtype = parent_clause->GetMainFeatureSubtype();
2160 
2161     if (parent_subtype == CSeqFeatData::eSubtype_cdregion
2162         || parent_subtype == CSeqFeatData::eSubtype_mRNA
2163         || parent_subtype == CSeqFeatData::eSubtype_gene
2164         || parent_subtype == CSeqFeatData::eSubtype_operon
2165         || parent_clause->IsEndogenousVirusSourceFeature()
2166         || parent_clause->IsGeneCluster()) {
2167         ok_to_group = true;
2168     }
2169 
2170     return ok_to_group;
2171 }
2172 
2173 
CAutoDefPromoterAnd5UTRClause(CBioseq_Handle bh,const CSeq_feat & main_feat,const CSeq_loc & mapped_loc,const CAutoDefOptions & opts)2174 CAutoDefPromoterAnd5UTRClause::CAutoDefPromoterAnd5UTRClause(CBioseq_Handle bh, const CSeq_feat &main_feat, const CSeq_loc &mapped_loc, const CAutoDefOptions& opts)
2175     : CAutoDefFeatureClause(bh, main_feat, mapped_loc, opts)
2176 {
2177     m_Description = "promoter region and 5' UTR";
2178     m_DescriptionChosen = true;
2179     m_Typeword = "";
2180     m_TypewordChosen = true;
2181     m_ShowTypewordFirst = false;
2182     m_Interval = "genomic sequence";
2183 
2184 
2185     m_ClauseLocation = new CSeq_loc();
2186     const CSeq_id* id = FindBestChoice(bh.GetBioseqCore()->GetId(), CSeq_id::BestRank);
2187     CRef <CSeq_id> new_id(new CSeq_id);
2188     new_id->Assign(*id);
2189     m_ClauseLocation->SetInt().SetId(*new_id);
2190     m_ClauseLocation->SetInt().SetFrom(0);
2191     m_ClauseLocation->SetInt().SetTo(bh.GetInst_Length() - 1);
2192 
2193 }
2194 
2195 
IsPromoterAnd5UTR(const CSeq_feat & feat)2196 bool CAutoDefPromoterAnd5UTRClause::IsPromoterAnd5UTR(const CSeq_feat& feat)
2197 {
2198     return (feat.IsSetData() &&
2199         feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_misc_feature &&
2200         feat.IsSetComment() &&
2201         NStr::Equal(feat.GetComment(), "contains promoter and 5' UTR"));
2202 }
2203 
2204 
Label(bool suppress_allele)2205 void CAutoDefPromoterAnd5UTRClause::Label(bool suppress_allele)
2206 {
2207 
2208 }
2209 
2210 
GetClauseType() const2211 CAutoDefFeatureClause::EClauseType CAutoDefFeatureClause::GetClauseType() const
2212 {
2213     CSeqFeatData::ESubtype subtype = GetMainFeatureSubtype();
2214     if (subtype == CSeqFeatData::eSubtype_repeat_region) {
2215         if (!NStr::IsBlank(m_pMainFeat->GetNamedQual("endogenous_virus"))) {
2216             return eEndogenousVirusRepeatRegion;
2217         }
2218     }
2219     return eDefault;
2220 }
2221 
2222 
2223 // Some misc_RNA clauses have a comment that actually lists multiple
2224 // features.  These functions create a clause for each element in the
2225 // comment.
2226 
AddMiscRNAFeatures(const CBioseq_Handle & bh,const CSeq_feat & cf,const CSeq_loc & mapped_loc,const CAutoDefOptions & opts)2227 vector<CRef<CAutoDefFeatureClause > > AddMiscRNAFeatures(const CBioseq_Handle& bh, const CSeq_feat& cf, const CSeq_loc& mapped_loc, const CAutoDefOptions& opts)
2228 {
2229     vector<CRef<CAutoDefFeatureClause > > rval;
2230     string comment;
2231     string::size_type pos;
2232 
2233     if (cf.GetData().Which() == CSeqFeatData::e_Rna) {
2234         comment = cf.GetNamedQual("product");
2235         if (NStr::IsBlank(comment)
2236             && cf.IsSetData()
2237             && cf.GetData().IsRna()
2238             && cf.GetData().GetRna().IsSetExt()) {
2239             if (cf.GetData().GetRna().GetExt().IsName()) {
2240                 comment = cf.GetData().GetRna().GetExt().GetName();
2241             }
2242             else if (cf.GetData().GetRna().GetExt().IsGen()
2243                 && cf.GetData().GetRna().GetExt().GetGen().IsSetProduct()) {
2244                 comment = cf.GetData().GetRna().GetExt().GetGen().GetProduct();
2245             }
2246         }
2247     }
2248 
2249     if ((NStr::Equal(comment, "misc_RNA") || NStr::IsBlank(comment)) && cf.CanGetComment()) {
2250         comment = cf.GetComment();
2251     }
2252     if (NStr::IsBlank(comment)) {
2253         return rval;
2254     }
2255 
2256     pos = NStr::Find(comment, "spacer");
2257     if (pos == NPOS) {
2258         return rval;
2259     }
2260 
2261     bool is_region = false;
2262 
2263     NStr::TrimPrefixInPlace(comment, "contains ");
2264     if (NStr::StartsWith(comment, "may contain ")) {
2265         NStr::TrimPrefixInPlace(comment, "may contain ");
2266         is_region = true;
2267     }
2268 
2269     pos = NStr::Find(comment, ";");
2270     if (pos != string::npos) {
2271         comment = comment.substr(0, pos);
2272     }
2273 
2274     if (is_region) {
2275         rval.push_back(CRef<CAutoDefFeatureClause>(new CAutoDefParsedRegionClause(bh, cf, mapped_loc, comment, opts)));
2276     } else {
2277         vector<string> elements = CAutoDefFeatureClause::GetMiscRNAElements(comment);
2278         if (!elements.empty()) {
2279             for (auto s : elements) {
2280                 CRef<CAutoDefParsedClause> new_clause(new CAutoDefParsedClause(bh, cf, mapped_loc,
2281                     (s == elements.front()), (s == elements.back()), opts));
2282                 new_clause->SetMiscRNAWord(s);
2283                 rval.push_back(new_clause);
2284             }
2285         } else {
2286             elements = CAutoDefFeatureClause::GetTrnaIntergenicSpacerClausePhrases(comment);
2287             if (!elements.empty()) {
2288                 for (auto s : elements) {
2289                     size_t pos = NStr::Find(s, "intergenic spacer");
2290                     if (pos != string::npos) {
2291                         rval.push_back(CRef<CAutoDefFeatureClause>(new CAutoDefParsedIntergenicSpacerClause(bh,
2292                             cf,
2293                             mapped_loc,
2294                             (s),
2295                             (s == elements.front()),
2296                             (s == elements.back()), opts)));
2297                     } else {
2298                         rval.push_back(CRef<CAutoDefFeatureClause>(s_tRNAClauseFromNote(bh, cf, mapped_loc, s, (s == elements.front()), (s == elements.back()), opts)));
2299                     }
2300                 }
2301             } else {
2302                 rval.push_back(CRef<CAutoDefFeatureClause>(new CAutoDefParsedIntergenicSpacerClause(bh,
2303                     cf,
2304                     mapped_loc,
2305                     comment,
2306                     true,
2307                     true,
2308                     opts)));
2309             }
2310         }
2311     }
2312     return rval;
2313 }
2314 
2315 
AddtRNAAndOther(const CBioseq_Handle & bh,const CSeq_feat & cf,const CSeq_loc & mapped_loc,const CAutoDefOptions & opts)2316 vector<CRef<CAutoDefFeatureClause > > AddtRNAAndOther(const CBioseq_Handle& bh, const CSeq_feat& cf, const CSeq_loc& mapped_loc, const CAutoDefOptions& opts)
2317 {
2318     vector<CRef<CAutoDefFeatureClause> > rval;
2319     if (cf.GetData().GetSubtype() != CSeqFeatData::eSubtype_misc_feature ||
2320         !cf.IsSetComment()) {
2321         return rval;
2322     }
2323 
2324     vector<string> phrases = CAutoDefFeatureClause_Base::GetFeatureClausePhrases(cf.GetComment());
2325     if (phrases.size() < 2) {
2326         return rval;
2327     }
2328 
2329     bool first = true;
2330     string last = phrases.back();
2331     phrases.pop_back();
2332     ITERATE(vector<string>, it, phrases) {
2333         rval.push_back(CRef<CAutoDefFeatureClause>(CAutoDefFeatureClause_Base::ClauseFromPhrase(*it, bh, cf, mapped_loc, first, false, opts)));
2334         first = false;
2335     }
2336     rval.push_back(CRef<CAutoDefFeatureClause>(CAutoDefFeatureClause_Base::ClauseFromPhrase(last, bh, cf, mapped_loc, first, true, opts)));
2337 
2338     return rval;
2339 }
2340 
2341 
FeatureClauseFactory(CBioseq_Handle bh,const CSeq_feat & cf,const CSeq_loc & mapped_loc,const CAutoDefOptions & opts,bool is_single_misc_feat)2342 vector<CRef<CAutoDefFeatureClause > > FeatureClauseFactory(CBioseq_Handle bh, const CSeq_feat& cf, const CSeq_loc& mapped_loc, const CAutoDefOptions& opts, bool is_single_misc_feat)
2343 {
2344     vector<CRef<CAutoDefFeatureClause> > rval;
2345 
2346     auto subtype = cf.GetData().GetSubtype();
2347 
2348     if (opts.IsFeatureSuppressed(subtype)) {
2349         return rval;
2350     }
2351 
2352     if (subtype == CSeqFeatData::eSubtype_gene) {
2353         rval.push_back(CRef<CAutoDefFeatureClause>(new CAutoDefGeneClause(bh, cf, mapped_loc, opts)));
2354     } else if (subtype == CSeqFeatData::eSubtype_ncRNA) {
2355         rval.push_back(CRef<CAutoDefFeatureClause>(new CAutoDefNcRNAClause(bh, cf, mapped_loc, opts)));
2356     } else if (subtype == CSeqFeatData::eSubtype_mobile_element) {
2357         rval.push_back(CRef<CAutoDefFeatureClause>(new CAutoDefMobileElementClause(bh, cf, mapped_loc, opts)));
2358     } else if (CAutoDefFeatureClause::IsSatellite(cf)) {
2359         rval.push_back(CRef<CAutoDefFeatureClause>(new CAutoDefSatelliteClause(bh, cf, mapped_loc, opts)));
2360     } else if (subtype == CSeqFeatData::eSubtype_otherRNA
2361         || subtype == CSeqFeatData::eSubtype_misc_RNA
2362         || subtype == CSeqFeatData::eSubtype_rRNA) {
2363         auto misc_rna = AddMiscRNAFeatures(bh, cf, mapped_loc, opts);
2364         if (misc_rna.empty()) {
2365             rval.push_back(CRef<CAutoDefFeatureClause>(new CAutoDefFeatureClause(bh, cf, mapped_loc, opts)));
2366         } else {
2367             for (auto it : misc_rna) {
2368                 rval.push_back(it);
2369             }
2370         }
2371     } else if (CAutoDefFeatureClause::IsPromoter(cf)) {
2372         rval.push_back(CRef<CAutoDefFeatureClause>(new CAutoDefPromoterClause(bh, cf, mapped_loc, opts)));
2373     } else if (CAutoDefFeatureClause::IsGeneCluster(cf)) {
2374         rval.push_back(CRef<CAutoDefFeatureClause>(new CAutoDefGeneClusterClause(bh, cf, mapped_loc, opts)));
2375     } else if (CAutoDefFeatureClause::IsControlRegion(cf)) {
2376         rval.push_back(CRef<CAutoDefFeatureClause>(new CAutoDefFeatureClause(bh, cf, mapped_loc, opts)));
2377     }
2378     else if (subtype == CSeqFeatData::eSubtype_otherRNA) {
2379         auto misc_rna = AddMiscRNAFeatures(bh, cf, mapped_loc, opts);
2380         if (misc_rna.empty()) {
2381             // try to make trna clauses
2382             misc_rna = AddtRNAAndOther(bh, cf, mapped_loc, opts);
2383         }
2384         if (misc_rna.empty()) {
2385             rval.push_back(CRef<CAutoDefFeatureClause>(new CAutoDefFeatureClause(bh, cf, mapped_loc, opts)));
2386         }
2387         else {
2388             for (auto it : misc_rna) {
2389                 rval.push_back(it);
2390             }
2391         }
2392     } else if (subtype == CSeqFeatData::eSubtype_misc_feature &&
2393         is_single_misc_feat && CAutoDefPromoterAnd5UTRClause::IsPromoterAnd5UTR(cf)) {
2394         rval.push_back(CRef<CAutoDefFeatureClause>(new CAutoDefPromoterAnd5UTRClause(bh, cf, mapped_loc, opts)));
2395     } else if (subtype == CSeqFeatData::eSubtype_misc_feature) {
2396         auto misc_rna = AddMiscRNAFeatures(bh, cf, mapped_loc, opts);
2397         if (misc_rna.empty()) {
2398             // try to make trna clauses
2399             misc_rna = AddtRNAAndOther(bh, cf, mapped_loc, opts);
2400         }
2401         if (misc_rna.empty()) {
2402             // some misc-features may require more parsing
2403             CRef<CAutoDefFeatureClause> new_clause(new CAutoDefFeatureClause(bh, cf, mapped_loc, opts));
2404             if (!is_single_misc_feat &&
2405                 (opts.GetMiscFeatRule() == CAutoDefOptions::eDelete
2406                     || (opts.GetMiscFeatRule() == CAutoDefOptions::eNoncodingProductFeat && !new_clause->IsNoncodingProductFeat()))) {
2407                 // do not create a clause at all
2408                 new_clause.Reset(NULL);
2409             } else if (opts.GetMiscFeatRule() == CAutoDefOptions::eCommentFeat) {
2410                 new_clause.Reset(NULL);
2411                 if (cf.CanGetComment() && !NStr::IsBlank(cf.GetComment())) {
2412                     misc_rna.push_back(CRef<CAutoDefFeatureClause>(new CAutoDefMiscCommentClause(bh, cf, mapped_loc, opts)));
2413                 }
2414             } else {
2415                 misc_rna.push_back(new_clause);
2416             }
2417         }
2418         if (!misc_rna.empty()) {
2419             for (auto it : misc_rna) {
2420                 rval.push_back(it);
2421             }
2422         }
2423 
2424     }  else {
2425         rval.push_back(CRef<CAutoDefFeatureClause>(new CAutoDefFeatureClause(bh, cf, mapped_loc, opts)));
2426     }
2427     return rval;
2428 }
2429 
2430 
2431 END_SCOPE(objects)
2432 END_NCBI_SCOPE
2433