1 /* $Id: autodef_feature_clause.cpp 632113 2021-05-26 18:40:28Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Colleen Bollin
27 *
28 * File Description:
29 * Generate unique definition lines for a set of sequences using organism
30 * descriptions and feature clauses.
31 */
32
33 #include <ncbi_pch.hpp>
34 #include <algorithm>
35 #include <objmgr/util/autodef.hpp>
36 #include <corelib/ncbimisc.hpp>
37 #include <objmgr/seqdesc_ci.hpp>
38 #include <objmgr/bioseq_ci.hpp>
39 #include <objmgr/feat_ci.hpp>
40 #include <objmgr/util/feature.hpp>
41 #include <objmgr/util/sequence.hpp>
42
43 #include <objects/seq/Seq_descr.hpp>
44 #include <objects/seq/Seqdesc.hpp>
45 #include <objects/seq/Bioseq.hpp>
46 #include <objects/seqfeat/RNA_ref.hpp>
47 #include <objects/seqfeat/RNA_gen.hpp>
48
49 #include <serial/iterator.hpp>
50
51 BEGIN_NCBI_SCOPE
52 BEGIN_SCOPE(objects)
53
54 using namespace sequence;
55
CAutoDefFeatureClause(CBioseq_Handle bh,const CSeq_feat & main_feat,const CSeq_loc & mapped_loc,const CAutoDefOptions & opts)56 CAutoDefFeatureClause::CAutoDefFeatureClause(CBioseq_Handle bh, const CSeq_feat& main_feat, const CSeq_loc& mapped_loc, const CAutoDefOptions& opts)
57 : CAutoDefFeatureClause_Base(opts),
58 m_pMainFeat(&main_feat),
59 m_BH(bh)
60 {
61 x_SetBiomol();
62 m_ClauseList.clear();
63 m_GeneName = "";
64 m_AlleleName = "";
65 m_Interval = "";
66 m_IsAltSpliced = false;
67 m_Pluralizable = false;
68 m_TypewordChosen = x_GetFeatureTypeWord(m_Typeword);
69 m_ShowTypewordFirst = x_ShowTypewordFirst(m_Typeword);
70 m_Description = "";
71 m_DescriptionChosen = false;
72 m_ProductName = "";
73 m_ProductNameChosen = false;
74
75 CSeqFeatData::ESubtype subtype = m_pMainFeat->GetData().GetSubtype();
76
77 m_ClauseLocation = new CSeq_loc();
78 m_ClauseLocation->Add(mapped_loc);
79
80 if (subtype == CSeqFeatData::eSubtype_operon || IsGeneCluster()) {
81 m_SuppressSubfeatures = true;
82 }
83
84 if (m_pMainFeat->CanGetComment() && NStr::Find(m_pMainFeat->GetComment(), "alternatively spliced") != NCBI_NS_STD::string::npos
85 && (subtype == CSeqFeatData::eSubtype_cdregion
86 || subtype == CSeqFeatData::eSubtype_exon
87 || IsNoncodingProductFeat())) {
88 m_IsAltSpliced = true;
89 }
90 }
91
92
~CAutoDefFeatureClause()93 CAutoDefFeatureClause::~CAutoDefFeatureClause()
94 {
95 }
96
97
GetMainFeatureSubtype() const98 CSeqFeatData::ESubtype CAutoDefFeatureClause::GetMainFeatureSubtype() const
99 {
100 if (IsLTR(*m_pMainFeat)) {
101 return CSeqFeatData::eSubtype_LTR;
102 }
103 return m_pMainFeat->GetData().GetSubtype();
104 }
105
106
IsMobileElement() const107 bool CAutoDefFeatureClause::IsMobileElement() const
108 {
109 if (m_pMainFeat->GetData().GetSubtype() != CSeqFeatData::eSubtype_mobile_element) {
110 return false;
111 } else {
112 return true;
113 }
114 }
115
116
IsInsertionSequence() const117 bool CAutoDefFeatureClause::IsInsertionSequence() const
118 {
119 if (m_pMainFeat->GetData().GetSubtype() != CSeqFeatData::eSubtype_repeat_region
120 || NStr::IsBlank(m_pMainFeat->GetNamedQual("insertion_seq"))) {
121 return false;
122 } else {
123 return true;
124 }
125 }
126
127
IsControlRegion(const CSeq_feat & feat)128 bool CAutoDefFeatureClause::IsControlRegion (const CSeq_feat& feat)
129 {
130 if (feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_misc_feature
131 && feat.CanGetComment()
132 && NStr::StartsWith(feat.GetComment(), "control region")) {
133 return true;
134 } else {
135 return false;
136 }
137 }
138
139
IsControlRegion() const140 bool CAutoDefFeatureClause::IsControlRegion() const
141 {
142 return IsControlRegion(*m_pMainFeat);
143 }
144
145
IsEndogenousVirusSourceFeature() const146 bool CAutoDefFeatureClause::IsEndogenousVirusSourceFeature () const
147 {
148 if (m_pMainFeat->GetData().GetSubtype() != CSeqFeatData::eSubtype_biosrc
149 || !m_pMainFeat->GetData().GetBiosrc().CanGetSubtype()) {
150 return false;
151 }
152 ITERATE (CBioSource::TSubtype, subSrcI, m_pMainFeat->GetData().GetBiosrc().GetSubtype()) {
153 if ((*subSrcI)->GetSubtype() == CSubSource::eSubtype_endogenous_virus_name) {
154 return true;
155 }
156 }
157 return false;
158 }
159
160
IsGeneCluster() const161 bool CAutoDefFeatureClause::IsGeneCluster () const
162 {
163 return IsGeneCluster (*m_pMainFeat);
164 }
165
166
IsGeneCluster(const CSeq_feat & feat)167 bool CAutoDefFeatureClause::IsGeneCluster (const CSeq_feat& feat)
168 {
169 if (feat.GetData().GetSubtype() != CSeqFeatData::eSubtype_misc_feature
170 || !feat.CanGetComment()) {
171 return false;
172 }
173
174 string comment = feat.GetComment();
175 if (NStr::Find(comment, "gene cluster") != string::npos
176 || NStr::Find(comment, "gene locus") != string::npos) {
177 return true;
178 } else {
179 return false;
180 }
181 }
182
183
IsRecognizedFeature() const184 bool CAutoDefFeatureClause::IsRecognizedFeature() const
185 {
186 CSeqFeatData::ESubtype subtype = m_pMainFeat->GetData().GetSubtype();
187 if (subtype == CSeqFeatData::eSubtype_3UTR
188 || subtype == CSeqFeatData::eSubtype_5UTR
189 || IsLTR(*m_pMainFeat)
190 || subtype == CSeqFeatData::eSubtype_cdregion
191 || subtype == CSeqFeatData::eSubtype_gene
192 || subtype == CSeqFeatData::eSubtype_mRNA
193 || subtype == CSeqFeatData::eSubtype_operon
194 || subtype == CSeqFeatData::eSubtype_exon
195 || subtype == CSeqFeatData::eSubtype_intron
196 || subtype == CSeqFeatData::eSubtype_rRNA
197 || subtype == CSeqFeatData::eSubtype_tRNA
198 || subtype == CSeqFeatData::eSubtype_otherRNA
199 || subtype == CSeqFeatData::eSubtype_misc_RNA
200 || subtype == CSeqFeatData::eSubtype_ncRNA
201 || subtype == CSeqFeatData::eSubtype_preRNA
202 || subtype == CSeqFeatData::eSubtype_tmRNA
203 || subtype == CSeqFeatData::eSubtype_D_loop
204 || subtype == CSeqFeatData::eSubtype_regulatory
205 || subtype == CSeqFeatData::eSubtype_misc_recomb
206 || IsNoncodingProductFeat()
207 || IsMobileElement()
208 || IsInsertionSequence()
209 || IsControlRegion()
210 || IsEndogenousVirusSourceFeature()
211 || IsSatelliteClause()
212 || IsPromoter()
213 || IsGeneCluster()
214 || GetClauseType() != eDefault) {
215 return true;
216 } else {
217 return false;
218 }
219 }
220
221
x_SetBiomol()222 void CAutoDefFeatureClause::x_SetBiomol()
223 {
224 m_Biomol = CMolInfo::eBiomol_genomic;
225 CSeqdesc_CI desc_iter(m_BH, CSeqdesc::e_Molinfo);
226 for ( ; desc_iter; ++desc_iter) {
227 if (desc_iter->GetMolinfo().IsSetBiomol()) {
228 m_Biomol = desc_iter->GetMolinfo().GetBiomol();
229 }
230 }
231 }
232
233
IsPseudo(const CSeq_feat & f)234 bool CAutoDefFeatureClause::IsPseudo(const CSeq_feat& f)
235 {
236 bool is_pseudo = false;
237 if (f.CanGetPseudo() && f.IsSetPseudo()) {
238 is_pseudo = true;
239 } else if (f.IsSetQual()) {
240 for (auto& it : f.GetQual()) {
241 if (it->IsSetQual() && NStr::EqualNocase(it->GetQual(), "pseudogene")) {
242 is_pseudo = true;
243 break;
244 }
245 }
246 }
247 return is_pseudo;
248 }
249
250
x_IsPseudo()251 bool CAutoDefFeatureClause::x_IsPseudo()
252 {
253 return (m_GeneIsPseudo || IsPseudo(*m_pMainFeat));
254 }
255
256
x_TypewordFromSequence()257 void CAutoDefFeatureClause::x_TypewordFromSequence()
258 {
259 if (m_Biomol == CMolInfo::eBiomol_genomic) {
260 m_Typeword = "genomic sequence";
261 } else if (m_Biomol == CMolInfo::eBiomol_mRNA) {
262 m_Typeword = "mRNA sequence";
263 } else {
264 m_Typeword = "sequence";
265 }
266 m_TypewordChosen = true;
267 }
268
269
x_GetFeatureTypeWord(string & typeword)270 bool CAutoDefFeatureClause::x_GetFeatureTypeWord(string &typeword)
271 {
272 string qual, comment;
273
274 if (IsLTR(*m_pMainFeat)) {
275 typeword = "LTR repeat region";
276 return true;
277 }
278
279 CSeqFeatData::ESubtype subtype = m_pMainFeat->GetData().GetSubtype();
280 switch (subtype) {
281 case CSeqFeatData::eSubtype_exon:
282 typeword = "exon";
283 return true;
284 break;
285 case CSeqFeatData::eSubtype_intron:
286 typeword = "intron";
287 return true;
288 break;
289 case CSeqFeatData::eSubtype_D_loop:
290 typeword = "D-loop";
291 return true;
292 break;
293 case CSeqFeatData::eSubtype_3UTR:
294 typeword = "3' UTR";
295 return true;
296 break;
297 case CSeqFeatData::eSubtype_5UTR:
298 typeword = "5' UTR";
299 return true;
300 break;
301 case CSeqFeatData::eSubtype_operon:
302 typeword = "operon";
303 return true;
304 break;
305 case CSeqFeatData::eSubtype_repeat_region:
306 //if has insertion_seq gbqual
307 if (IsInsertionSequence()) {
308 typeword = "insertion sequence";
309 return true;
310 }
311 qual = m_pMainFeat->GetNamedQual("endogenous_virus");
312 if (!NStr::IsBlank(qual)) {
313 typeword = "endogenous virus";
314 return true;
315 }
316 if (IsMobileElement()) {
317 typeword = "transposon";
318 return true;
319 }
320 typeword = "repeat region";
321 return true;
322 break;
323 case CSeqFeatData::eSubtype_misc_feature:
324 if (m_pMainFeat->CanGetComment()) {
325 comment = m_pMainFeat->GetComment();
326 if (NStr::StartsWith(comment, "control region", NStr::eNocase)) {
327 typeword = "control region";
328 return true;
329 }
330 }
331 break;
332 case CSeqFeatData::eSubtype_misc_recomb:
333 x_TypewordFromSequence();
334 return true;
335 break;
336 case CSeqFeatData::eSubtype_biosrc:
337 if (IsEndogenousVirusSourceFeature()) {
338 typeword = "endogenous virus";
339 return true;
340 }
341 break;
342 case CSeqFeatData::eSubtype_regulatory:
343 if (m_pMainFeat->IsSetQual()) {
344 ITERATE(CSeq_feat::TQual, q, m_pMainFeat->GetQual()) {
345 if ((*q)->IsSetQual() &&
346 NStr::Equal((*q)->GetQual(), "regulatory_class") &&
347 (*q)->IsSetVal() && !NStr::IsBlank((*q)->GetVal())) {
348 typeword = (*q)->GetVal();
349 return true;
350 }
351 }
352 }
353 break;
354 default:
355 break;
356 }
357
358 if (m_Biomol == CMolInfo::eBiomol_genomic || m_Biomol == CMolInfo::eBiomol_cRNA) {
359 if (x_IsPseudo()) {
360 typeword = "pseudogene";
361 return true;
362 } else {
363 typeword = "gene";
364 return true;
365 }
366 } else if (subtype == CSeqFeatData::eSubtype_rRNA
367 || subtype == CSeqFeatData::eSubtype_snoRNA
368 || subtype == CSeqFeatData::eSubtype_snRNA
369 || subtype == CSeqFeatData::eSubtype_ncRNA) {
370 return false;
371 } else if (subtype == CSeqFeatData::eSubtype_precursor_RNA) {
372 typeword = "precursor RNA";
373 return true;
374 } else if (m_Biomol == CMolInfo::eBiomol_mRNA) {
375 if (x_IsPseudo()) {
376 typeword = "pseudogene mRNA";
377 } else {
378 typeword = "mRNA";
379 }
380 return true;
381 } else if (m_Biomol == CMolInfo::eBiomol_pre_RNA) {
382 if (x_IsPseudo()) {
383 typeword = "pseudogene precursor RNA";
384 } else {
385 typeword = "precursor RNA";
386 }
387 return true;
388 } else if (m_Biomol == CMolInfo::eBiomol_other_genetic) {
389 typeword = "gene";
390 return true;
391 }
392 typeword = "";
393 return true;
394 }
395
396
x_ShowTypewordFirst(string typeword)397 bool CAutoDefFeatureClause::x_ShowTypewordFirst(string typeword)
398 {
399 if (NStr::Equal(typeword, "")) {
400 return false;
401 } else if (NStr::EqualNocase(typeword, "exon")
402 || NStr::EqualNocase(typeword, "intron")
403 || NStr::EqualNocase(typeword, "transposon")
404 || NStr::EqualNocase(typeword, "insertion sequence")
405 || NStr::EqualNocase(typeword, "endogenous virus")
406 || NStr::EqualNocase(typeword, "retrotransposon")
407 || NStr::EqualNocase(typeword, "P-element")
408 || NStr::EqualNocase(typeword, "transposable element")
409 || NStr::EqualNocase(typeword, "integron")
410 || NStr::EqualNocase(typeword, "superintegron")
411 || NStr::EqualNocase(typeword, "MITE")) {
412 return true;
413 } else {
414 return false;
415 }
416 }
417
418
x_FindNoncodingFeatureKeywordProduct(string comment,string keyword,string & product_name) const419 bool CAutoDefFeatureClause::x_FindNoncodingFeatureKeywordProduct (string comment, string keyword, string &product_name) const
420 {
421 if (NStr::IsBlank(comment) || NStr::IsBlank(keyword)) {
422 return false;
423 }
424 string::size_type start_pos = 0;
425
426 while (start_pos != NCBI_NS_STD::string::npos) {
427 start_pos = NStr::Find(comment, keyword, start_pos);
428 if (start_pos != NCBI_NS_STD::string::npos) {
429 string possible = comment.substr(start_pos + keyword.length());
430 NStr::TruncateSpacesInPlace(possible);
431 if (!NStr::StartsWith(possible, "GenBank Accession Number")) {
432 product_name = possible;
433 // truncate at first semicolon
434 string::size_type end = NStr::Find(product_name, ";");
435 if (end != NCBI_NS_STD::string::npos) {
436 product_name = product_name.substr(0, end);
437 }
438 // remove sequence from end of product name if found
439 if (NStr::EndsWith(product_name, " sequence")) {
440 product_name = product_name.substr(0, product_name.length() - 9);
441 }
442 // add "-like" if not present
443 if (!NStr::EndsWith(product_name, "-like")) {
444 product_name += "-like";
445 }
446 return true;
447 } else {
448 start_pos += keyword.length();
449 }
450 }
451 }
452 return false;
453 }
454
455
x_GetNoncodingProductFeatProduct(string & product_name) const456 bool CAutoDefFeatureClause::x_GetNoncodingProductFeatProduct (string &product_name) const
457 {
458 if (GetMainFeatureSubtype() != CSeqFeatData::eSubtype_misc_feature
459 || !m_pMainFeat->CanGetComment()) {
460 return false;
461 }
462 string comment = m_pMainFeat->GetComment();
463 string::size_type start_pos = NStr::Find(comment, "nonfunctional ");
464 if (start_pos != NCBI_NS_STD::string::npos) {
465 string::size_type sep_pos = NStr::Find (comment, " due to ", start_pos);
466 if (sep_pos != NCBI_NS_STD::string::npos) {
467 product_name = comment.substr(start_pos, sep_pos - start_pos);
468 return true;
469 }
470 }
471 if (x_FindNoncodingFeatureKeywordProduct (comment, "similar to ", product_name)) {
472 return true;
473 } else if (x_FindNoncodingFeatureKeywordProduct (comment, "contains ", product_name)) {
474 return true;
475 } else {
476 return false;
477 }
478 }
479
IsNoncodingProductFeat() const480 bool CAutoDefFeatureClause::IsNoncodingProductFeat() const
481 {
482 string product_name;
483 return x_GetNoncodingProductFeatProduct(product_name);
484 }
485
CAutoDefGeneClause(CBioseq_Handle bh,const CSeq_feat & main_feat,const CSeq_loc & mapped_loc,const CAutoDefOptions & opts)486 CAutoDefGeneClause::CAutoDefGeneClause(CBioseq_Handle bh, const CSeq_feat &main_feat, const CSeq_loc &mapped_loc, const CAutoDefOptions& opts)
487 : CAutoDefFeatureClause(bh, main_feat, mapped_loc, opts)
488 {
489 m_GeneName = x_GetGeneName(m_pMainFeat->GetData().GetGene(), GetSuppressLocusTag());
490 if (m_pMainFeat->GetData().GetGene().CanGetAllele()) {
491 m_AlleleName = m_pMainFeat->GetData().GetGene().GetAllele();
492 if (!NStr::StartsWith(m_AlleleName, m_GeneName, NStr::eNocase)) {
493 if (!NStr::StartsWith(m_AlleleName, "-")) {
494 m_AlleleName = "-" + m_AlleleName;
495 }
496 m_AlleleName = m_GeneName + m_AlleleName;
497 }
498 }
499 m_GeneIsPseudo = IsPseudo(*m_pMainFeat);
500 m_HasGene = true;
501 }
502
503
x_IsPseudo()504 bool CAutoDefGeneClause::x_IsPseudo()
505 {
506 if (CAutoDefFeatureClause::x_IsPseudo()) {
507 return true;
508 }
509 const CGene_ref& gene = m_pMainFeat->GetData().GetGene();
510 if (gene.CanGetPseudo() && gene.IsSetPseudo()) {
511 return true;
512 }
513 return false;
514 }
515
516 /*
517 *If the feature is a gene and has different strings in the description than
518 * in the locus or locus tag, the description will be used as the product for
519 * the gene.
520 */
x_GetProductName(string & product_name)521 bool CAutoDefGeneClause::x_GetProductName(string &product_name)
522 {
523 if (m_pMainFeat->GetData().GetGene().CanGetDesc()
524 && !NStr::Equal(m_pMainFeat->GetData().GetGene().GetDesc(),
525 m_GeneName)) {
526 product_name = m_pMainFeat->GetData().GetGene().GetDesc();
527 return true;
528 } else {
529 return false;
530 }
531 }
532
533
ParseString(string comment,string & gene_name,string & product_name)534 bool CAutoDefParsedtRNAClause::ParseString(string comment, string& gene_name, string& product_name)
535 {
536 product_name = "";
537 gene_name = "";
538
539 NStr::TruncateSpacesInPlace(comment);
540 if (NStr::EndsWith (comment, " gene")) {
541 comment = comment.substr (0, comment.length() - 5);
542 } else if (NStr::EndsWith (comment, " genes")) {
543 comment = comment.substr (0, comment.length() - 6);
544 }
545
546 string::size_type pos = NStr::Find(comment, "(");
547 if (pos == NCBI_NS_STD::string::npos) {
548 if (NStr::StartsWith (comment, "tRNA-")) {
549 product_name = comment;
550 } else {
551 /* if not tRNA, gene name is required */
552 return false;
553 }
554 } else {
555 product_name = comment.substr(0, pos);
556 comment = comment.substr (pos + 1);
557 pos = NStr::Find(comment, ")");
558 if (pos == NCBI_NS_STD::string::npos) {
559 return false;
560 }
561 gene_name = comment.substr (0, pos);
562 NStr::TruncateSpacesInPlace(gene_name);
563 }
564 NStr::TruncateSpacesInPlace(product_name);
565
566 if (NStr::StartsWith (product_name, "tRNA-")) {
567 /* tRNA name must start with "tRNA-" and be followed by one uppercase letter and
568 * two lowercase letters.
569 */
570 if (product_name.length() < 8
571 || !isalpha(product_name.c_str()[5]) || !isupper(product_name.c_str()[5])
572 || !isalpha(product_name.c_str()[6]) || !islower(product_name.c_str()[6])
573 || !isalpha(product_name.c_str()[7]) || !islower(product_name.c_str()[7])) {
574 return false;
575 }
576
577 /* if present, gene name must start with letters "trn",
578 * and end with one uppercase letter.
579 */
580 if (!NStr::IsBlank (gene_name)
581 && (gene_name.length() < 4
582 || !NStr::StartsWith(gene_name, "trn" )
583 || !isalpha(gene_name.c_str()[3])
584 || !isupper(gene_name.c_str()[3]))) {
585 return false;
586 }
587 }
588 if (NStr::IsBlank (product_name)) {
589 return false;
590 }
591 return true;
592 }
593
594
s_tRNAClauseFromNote(CBioseq_Handle bh,const CSeq_feat & cf,const CSeq_loc & mapped_loc,string comment,bool is_first,bool is_last,const CAutoDefOptions & opts)595 CAutoDefParsedtRNAClause *s_tRNAClauseFromNote(CBioseq_Handle bh, const CSeq_feat& cf, const CSeq_loc& mapped_loc, string comment, bool is_first, bool is_last, const CAutoDefOptions& opts)
596 {
597 string product_name;
598 string gene_name;
599 if (!CAutoDefParsedtRNAClause::ParseString(comment, gene_name, product_name)) {
600 return NULL;
601 }
602
603 return new CAutoDefParsedtRNAClause(bh, cf, mapped_loc, gene_name, product_name, is_first, is_last, opts);
604 }
605
606
x_GetGeneName(const CGene_ref & gref,bool suppress_locus_tag) const607 string CAutoDefFeatureClause::x_GetGeneName(const CGene_ref& gref, bool suppress_locus_tag) const
608 {
609 if (gref.IsSuppressed()) {
610 return "";
611 } else if (gref.CanGetLocus() && !NStr::IsBlank(gref.GetLocus())) {
612 return gref.GetLocus();
613 } else if (!suppress_locus_tag && gref.IsSetLocus_tag() && !NStr::IsBlank(gref.GetLocus_tag())) {
614 return gref.GetLocus_tag();
615 } else if (gref.IsSetDesc() && !NStr::IsBlank(gref.GetDesc())) {
616 return gref.GetDesc();
617 } else {
618 return "";
619 }
620 }
621
622
s_UseCommentBeforeSemicolon(const CSeq_feat & feat,string & label)623 void s_UseCommentBeforeSemicolon(const CSeq_feat& feat, string& label)
624 {
625 if (feat.IsSetComment()) {
626 label = feat.GetComment();
627 string::size_type pos = NStr::Find(label, ";");
628 if (pos != NCBI_NS_STD::string::npos) {
629 label = label.substr(0, pos);
630 }
631 }
632 }
633
634
635 /* Frequently the product associated with a feature is listed as part of the
636 * description of the feature in the definition line. This function determines
637 * the name of the product associated with this specific feature. Some
638 * features will be listed with the product of a feature that is associated
639 * with the feature being described - this function does not look at other
640 * features to determine a product name.
641 * If the feature is a misc_feat with particular keywords in the comment,
642 * the product will be determined based on the contents of the comment.
643 * If the feature is a CDS and is marked as pseudo, the product will be
644 * determined based on the contents of the comment.
645 * If the feature is a gene and has different strings in the description than
646 * in the locus or locus tag, the description will be used as the product for
647 * the gene.
648 * If none of the above conditions apply, the sequence indexing context label
649 * will be used to obtain the product name for the feature.
650 */
x_GetProductName(string & product_name)651 bool CAutoDefFeatureClause::x_GetProductName(string &product_name)
652 {
653 CSeqFeatData::ESubtype subtype = m_pMainFeat->GetData().GetSubtype();
654
655 if (subtype == CSeqFeatData::eSubtype_misc_feature && x_GetNoncodingProductFeatProduct(product_name)) {
656 return true;
657 } else if (subtype == CSeqFeatData::eSubtype_cdregion
658 && m_pMainFeat->CanGetPseudo()
659 && m_pMainFeat->IsSetPseudo()
660 && m_pMainFeat->CanGetComment()) {
661 string comment = m_pMainFeat->GetComment();
662 if (!NStr::IsBlank(comment)) {
663 string::size_type pos = NStr::Find(comment, ";");
664 if (pos != NCBI_NS_STD::string::npos) {
665 comment = comment.substr(0, pos);
666 }
667 product_name = comment;
668 return true;
669 }
670 } else if (subtype == CSeqFeatData::eSubtype_tmRNA) {
671 product_name = "tmRNA";
672 return true;
673 } else if (m_pMainFeat->GetData().Which() == CSeqFeatData::e_Rna) {
674 product_name = m_pMainFeat->GetData().GetRna().GetRnaProductName();
675 if (NStr::IsBlank(product_name) && m_pMainFeat->IsSetComment()) {
676 product_name = m_pMainFeat->GetComment();
677 }
678 return true;
679 } else if (subtype == CSeqFeatData::eSubtype_regulatory) {
680 return true;
681 } else if (subtype == CSeqFeatData::eSubtype_misc_recomb) {
682 if (m_pMainFeat->IsSetQual()) {
683 ITERATE(CSeq_feat::TQual, q, m_pMainFeat->GetQual()) {
684 if ((*q)->IsSetQual() && NStr::Equal((*q)->GetQual(), "recombination_class") &&
685 (*q)->IsSetVal() && !NStr::IsBlank((*q)->GetVal())) {
686 product_name = (*q)->GetVal();
687 return true;
688 }
689 }
690 }
691 s_UseCommentBeforeSemicolon(*m_pMainFeat, product_name);
692 return true;
693 } else if (subtype == CSeqFeatData::eSubtype_exon || subtype == CSeqFeatData::eSubtype_intron) {
694 return x_GetExonDescription(product_name);
695 } else {
696 string label;
697
698 if (subtype == CSeqFeatData::eSubtype_cdregion && m_pMainFeat->IsSetProduct() && !m_Opts.IsFeatureSuppressed(CSeqFeatData::eSubtype_mat_peptide_aa)) {
699 const CSeq_loc& product_loc = m_pMainFeat->GetProduct();
700 CBioseq_Handle prot_h = m_BH.GetScope().GetBioseqHandle(product_loc);
701 if (prot_h) {
702 CFeat_CI prot_f(prot_h, CSeqFeatData::eSubtype_prot);
703 if (prot_f) {
704 feature::GetLabel(*(prot_f->GetSeq_feat()), &label, feature::fFGL_Content);
705 if (m_pMainFeat->IsSetPartial() && m_pMainFeat->GetPartial()) {
706 // RW-1216 suppress mat-peptide region phrase if sig-peptide also present
707 CFeat_CI sig_pi(prot_h, CSeqFeatData::eSubtype_sig_peptide_aa);
708 if (!sig_pi) {
709 CFeat_CI mat_pi(prot_h, CSeqFeatData::eSubtype_mat_peptide_aa);
710 if (mat_pi && mat_pi->GetData().GetProt().IsSetName()) {
711 const string& m_name = mat_pi->GetData().GetProt().GetName().front();
712 ++mat_pi;
713 if (!mat_pi && !m_name.empty()) {
714 if (label.empty()) {
715 label = m_name;
716 }
717 else {
718 label += ", " + m_name + " region,";
719 }
720 }
721 }
722 }
723 }
724 }
725 }
726 }
727
728 if (NStr::IsBlank(label)) {
729 feature::GetLabel(*m_pMainFeat, &label, feature::fFGL_Content);
730 }
731 if ((subtype == CSeqFeatData::eSubtype_cdregion && !NStr::Equal(label, "CDS"))
732 || (subtype == CSeqFeatData::eSubtype_mRNA && !NStr::Equal(label, "mRNA"))
733 || (subtype != CSeqFeatData::eSubtype_cdregion && subtype != CSeqFeatData::eSubtype_mRNA)) {
734 } else {
735 label = "";
736 }
737
738 // remove unwanted "mRNA-" tacked onto label for mRNA features
739 if (subtype == CSeqFeatData::eSubtype_mRNA && NStr::StartsWith(label, "mRNA-")) {
740 label = label.substr(5);
741 } else if (subtype == CSeqFeatData::eSubtype_rRNA && NStr::StartsWith(label, "rRNA-")) {
742 label = label.substr(5);
743 }
744
745 if (!NStr::IsBlank(label)) {
746 product_name = label;
747 return true;
748 } else {
749 product_name = "";
750 return false;
751 }
752 }
753 return false;
754 }
755
756
x_GetExonDescription(string & description)757 bool CAutoDefFeatureClause::x_GetExonDescription(string &description)
758 {
759 if (m_pMainFeat->IsSetQual()) {
760 ITERATE(CSeq_feat::TQual, it, m_pMainFeat->GetQual()) {
761 if ((*it)->IsSetQual() && (*it)->IsSetVal()
762 && NStr::EqualNocase((*it)->GetQual(), "number")) {
763 description = (*it)->GetVal();
764 return true;
765 }
766 }
767 }
768 description = kEmptyStr;
769 return false;
770 }
771
772
x_GetDescription(string & description)773 bool CAutoDefFeatureClause::x_GetDescription(string &description)
774 {
775 CSeqFeatData::ESubtype subtype = m_pMainFeat->GetData().GetSubtype();
776
777 description = "";
778 if (subtype == CSeqFeatData::eSubtype_exon || subtype == CSeqFeatData::eSubtype_intron) {
779 return x_GetExonDescription(description);
780 } else if (NStr::Equal(m_Typeword, "insertion sequence")) {
781 description = m_pMainFeat->GetNamedQual("insertion_seq");
782 if (NStr::Equal(description, "unnamed")
783 || NStr::IsBlank(description)) {
784 description = "";
785 return false;
786 } else {
787 return true;
788 }
789 } else if (subtype == CSeqFeatData::eSubtype_repeat_region) {
790 if (NStr::Equal(m_Typeword, "endogenous virus")) {
791 description = m_pMainFeat->GetNamedQual("endogenous_virus");
792 if (NStr::Equal(description, "unnamed")
793 || NStr::IsBlank(description)) {
794 description = "";
795 return false;
796 } else {
797 return true;
798 }
799 } else {
800 description = m_pMainFeat->GetNamedQual("rpt_family");
801 if (NStr::IsBlank(description) && m_pMainFeat->IsSetComment()) {
802 description = m_pMainFeat->GetComment();
803 if (IsLTR() && NStr::EndsWith(description, " LTR")) {
804 description = description.substr(0, description.length() - 4);
805 }
806 }
807 return true;
808 }
809 } else if (subtype == CSeqFeatData::eSubtype_biosrc
810 && NStr::Equal(m_Typeword, "endogenous virus")) {
811 if (m_pMainFeat->GetData().GetBiosrc().CanGetSubtype()) {
812 ITERATE (CBioSource::TSubtype, subSrcI, m_pMainFeat->GetData().GetBiosrc().GetSubtype()) {
813 if ((*subSrcI)->GetSubtype() == CSubSource::eSubtype_endogenous_virus_name) {
814 description = (*subSrcI)->GetName();
815 if (NStr::Equal(description, "unnamed")
816 || NStr::IsBlank(description)) {
817 description = "";
818 } else {
819 return true;
820 }
821 }
822 }
823 }
824 return false;
825 } else if (NStr::Equal(m_Typeword, "control region")
826 || NStr::Equal(m_Typeword, "D-loop")
827 || subtype == CSeqFeatData::eSubtype_3UTR
828 || subtype == CSeqFeatData::eSubtype_5UTR) {
829 return false;
830 } else if (IsLTR(*m_pMainFeat)) {
831 if (m_pMainFeat->CanGetComment()) {
832 string comment = m_pMainFeat->GetComment();
833 if (NStr::StartsWith(comment, "LTR ")) {
834 comment = comment.substr(4);
835 } else if (NStr::EndsWith(comment, " LTR")) {
836 comment = comment.substr(0, comment.length() - 4);
837 }
838 description = comment;
839 }
840 if (NStr::IsBlank(description)) {
841 return false;
842 } else {
843 return true;
844 }
845 } else if (subtype == CSeqFeatData::eSubtype_operon) {
846 description = m_pMainFeat->GetNamedQual("operon");
847 return true;
848 } else {
849 if (!m_ProductNameChosen) {
850 m_ProductNameChosen = x_GetProductName(m_ProductName);
851 }
852
853 if (!NStr::IsBlank(m_GeneName) && !NStr::IsBlank(m_ProductName)) {
854 description = m_ProductName + " (" + m_GeneName + ")";
855 } else if (!NStr::IsBlank(m_GeneName)) {
856 description = m_GeneName;
857 } else if (!NStr::IsBlank(m_ProductName)) {
858 description = m_ProductName;
859 }
860 if (NStr::IsBlank(description)) {
861 return false;
862 } else {
863 return true;
864 }
865 }
866 }
867
868
IsSatelliteClause() const869 bool CAutoDefFeatureClause::IsSatelliteClause() const
870 {
871 return IsSatellite(*m_pMainFeat);
872 }
873
874
IsSatellite(const CSeq_feat & feat)875 bool CAutoDefFeatureClause::IsSatellite(const CSeq_feat& feat)
876 {
877 if (feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_repeat_region
878 && !NStr::IsBlank (feat.GetNamedQual("satellite"))) {
879 return true;
880 }
881 return false;
882 }
883
884
IsPromoter() const885 bool CAutoDefFeatureClause::IsPromoter() const
886 {
887 return IsPromoter(*m_pMainFeat);
888 }
889
890
IsLTR() const891 bool CAutoDefFeatureClause::IsLTR() const
892 {
893 return IsLTR(*m_pMainFeat);
894 }
895
896
IsPromoter(const CSeq_feat & feat)897 bool CAutoDefFeatureClause::IsPromoter(const CSeq_feat& feat)
898 {
899 if (feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_promoter) {
900 return true;
901 } else if (feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_regulatory &&
902 NStr::Equal(feat.GetNamedQual("regulatory_class"), "promoter")) {
903 return true;
904 } else {
905 return false;
906 }
907 }
908
909
IsLTR(const CSeq_feat & feat)910 bool CAutoDefFeatureClause::IsLTR(const CSeq_feat& feat)
911 {
912 if (feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_LTR) {
913 return true;
914 } else if (feat.GetData().GetSubtype() != CSeqFeatData::eSubtype_repeat_region ||
915 !feat.IsSetQual()) {
916 return false;
917 }
918 ITERATE(CSeq_feat::TQual, it, feat.GetQual()) {
919 if ((*it)->IsSetQual() && (*it)->IsSetVal() &&
920 NStr::EqualNocase((*it)->GetQual(), "rpt_type") &&
921 NStr::FindNoCase((*it)->GetVal(), "long_terminal_repeat") != string::npos) {
922 return true;
923 }
924 }
925 return false;
926 }
927
928 /* operons suppress all subfeatures except promoters (see GB-5635) */
x_GetOperonSubfeatures(string & interval)929 void CAutoDefFeatureClause::x_GetOperonSubfeatures(string &interval)
930 {
931 bool has_promoter = false;
932
933 for (auto it : m_ClauseList) {
934 if (it->IsPromoter()) {
935 has_promoter = true;
936 break;
937 }
938 }
939 if (has_promoter) {
940 interval += ", promoter region, ";
941 }
942 }
943
944
/* This function calculates the "interval" for a clause in the definition
 * line.  The interval could be an empty string, it could indicate whether
 * the location of the feature is partial or complete and whether or not
 * the feature is a CDS, the interval could be a description of the
 * subfeatures of the clause, or the interval could be a combination of the
 * last two items if the feature is a CDS.
 *
 * interval         [out] always reset first; receives the computed text.
 * suppress_allele  passed through to Label() and ListClauses() for the
 *                  subclauses.
 * Returns false when no interval text is produced (interval is left empty),
 * true otherwise.
 *
 * Side effects: subclauses are (re)labelled, null entries are removed from
 * m_ClauseList, and for a CDS a 3' UTR subclause is moved to the end of
 * m_ClauseList.
 */
bool CAutoDefFeatureClause::x_GetGenericInterval (string &interval, bool suppress_allele)
{
    interval = "";
    if (m_IsUnknown) {
        return false;
    }

    CSeqFeatData::ESubtype subtype = GetMainFeatureSubtype();
    if (subtype == CSeqFeatData::eSubtype_exon && m_IsAltSpliced) {
        interval = "alternatively spliced";
        return true;
    }

    // These clause kinds never get an interval.
    if (IsSatelliteClause()
        || IsPromoter()
        || subtype == CSeqFeatData::eSubtype_regulatory
        || subtype == CSeqFeatData::eSubtype_exon
        || subtype == CSeqFeatData::eSubtype_intron
        || subtype == CSeqFeatData::eSubtype_5UTR
        || subtype == CSeqFeatData::eSubtype_3UTR
        || (subtype == CSeqFeatData::eSubtype_repeat_region && !NStr::Equal(m_Typeword, "endogenous virus"))
        || subtype == CSeqFeatData::eSubtype_misc_recomb
        || IsLTR()) {
        return false;
    }

    // Holds a 3' UTR subclause pulled out of a CDS clause so that it can be
    // mentioned after the "partial/complete cds" text.
    CRef<CAutoDefFeatureClause_Base> utr3;

    if (subtype == CSeqFeatData::eSubtype_operon) {
        // suppress subclauses except promoters
        x_GetOperonSubfeatures(interval);
    } else if (!m_SuppressSubfeatures) {
        // label subclauses
        // check to see if 3'UTR is present, and whether there are any other features
        auto it = m_ClauseList.begin();
        while (it != m_ClauseList.end()) {
            if (*it) {
                (*it)->Label(suppress_allele);
                if ((*it)->GetMainFeatureSubtype() == CSeqFeatData::eSubtype_3UTR && subtype == CSeqFeatData::eSubtype_cdregion) {
                    // if several 3' UTRs are present only the last one is kept in utr3
                    utr3 = *it;
                    it = m_ClauseList.erase(it);
                }
                else {
                    ++it;
                }
            } else {
                // drop null entries
                it = m_ClauseList.erase(it);
            }
        }

        // label any subclauses
        if (m_ClauseList.size() > 0) {
            bool suppress_final_and = false;
            if (subtype == CSeqFeatData::eSubtype_cdregion && !m_ClauseInfoOnly) {
                // the "and" is supplied below, in front of "partial/complete cds"
                suppress_final_and = true;
            }

            // create subclause list for interval
            interval += ListClauses(false, suppress_final_and, suppress_allele);

            if (subtype == CSeqFeatData::eSubtype_cdregion && !m_ClauseInfoOnly) {
                if (utr3 != NULL) {
                    // the final "and" goes in front of the 3' UTR instead
                    interval += ", ";
                } else if (m_ClauseList.size() == 1) {
                    interval += " and ";
                } else {
                    interval += ", and ";
                }
            } else {
                // NOTE(review): for a CDS with m_ClauseInfoOnly set, a 3' UTR
                // erased from m_ClauseList above is not pushed back before
                // this return -- confirm that dropping it is intended.
                return true;
            }
        }
    }

    if (IsPartial()) {
        interval += "partial ";
    } else {
        interval += "complete ";
    }

    if (subtype == CSeqFeatData::eSubtype_cdregion
        && (!x_IsPseudo())) {
        interval += "cds";
        if (m_IsAltSpliced) {
            interval += ", alternatively spliced";
        }
    } else {
        interval += "sequence";
        // product_name is only needed to call the helper; its value is unused
        string product_name;
        if (m_IsAltSpliced && x_GetNoncodingProductFeatProduct (product_name)) {
            interval += ", alternatively spliced";
        }
    }

    if (utr3 != NULL) {
        /* tack UTR3 on at end of clause */
        if (m_ClauseList.size() == 0) {
            interval += " and 3' UTR";
        } else {
            interval += ", and 3' UTR";
        }
        // put the clause back so that it stays part of this clause's list
        m_ClauseList.push_back(utr3);
    }

    return true;
}
1058
1059
/* Fills in whichever parts of the clause label have not been chosen yet
 * (typeword, product name, description) and then recomputes m_Interval.
 * The interval is recomputed on every call, even when the other parts were
 * already chosen.
 */
void CAutoDefFeatureClause::Label(bool suppress_allele)
{
    if (!m_TypewordChosen) {
        m_TypewordChosen = x_GetFeatureTypeWord(m_Typeword);
        // evaluated after the typeword has been filled in above
        m_ShowTypewordFirst = x_ShowTypewordFirst(m_Typeword);
        m_Pluralizable = true;
    }
    if (!m_ProductNameChosen) {
        m_ProductNameChosen = x_GetProductName(m_ProductName);
    }
    if (!m_DescriptionChosen) {
        m_DescriptionChosen = x_GetDescription(m_Description);
    }

    // return value (whether an interval was produced) is intentionally ignored
    x_GetGenericInterval (m_Interval, suppress_allele);

}
1077
1078
/* Compares loc (first argument to sequence::Compare) against this clause's
 * location (second argument) in the scope of m_BH, using
 * sequence::fCompareOverlapping, and returns the resulting relationship.
 */
sequence::ECompare CAutoDefFeatureClause::CompareLocation(const CSeq_loc& loc) const
{
    return sequence::Compare(loc, *m_ClauseLocation, &(m_BH.GetScope()),
                             sequence::fCompareOverlapping);
}
1084
1085
SameStrand(const CSeq_loc & loc) const1086 bool CAutoDefFeatureClause::SameStrand(const CSeq_loc& loc) const
1087 {
1088 ENa_strand loc_strand = loc.GetStrand();
1089 ENa_strand this_strand = m_ClauseLocation->GetStrand();
1090
1091 if ((loc_strand == eNa_strand_minus && this_strand != eNa_strand_minus)
1092 || (loc_strand != eNa_strand_minus && this_strand == eNa_strand_minus)) {
1093 return false;
1094 } else {
1095 return true;
1096 }
1097
1098 }
1099
IsPartial() const1100 bool CAutoDefFeatureClause::IsPartial() const
1101 {
1102 if (m_ClauseLocation->IsPartialStart(eExtreme_Biological)
1103 || m_ClauseLocation->IsPartialStop(eExtreme_Biological)) {
1104 return true;
1105 } else {
1106 return false;
1107 }
1108 }
1109
1110
/* Returns the reference to this clause's location object (not a copy of
 * the location), so changes made through it affect the clause.
 */
CRef<CSeq_loc> CAutoDefFeatureClause::GetLocation() const
{
    return m_ClauseLocation;
}
1115
1116
AddToLocation(CRef<CSeq_loc> loc,bool also_set_partials)1117 void CAutoDefFeatureClause::AddToLocation(CRef<CSeq_loc> loc, bool also_set_partials)
1118 {
1119 bool partial5 = m_ClauseLocation->IsPartialStart(eExtreme_Biological);
1120 bool partial3 = m_ClauseLocation->IsPartialStop(eExtreme_Biological);
1121
1122 if (also_set_partials) {
1123 partial5 |= loc->IsPartialStart(eExtreme_Biological);
1124 }
1125 if (also_set_partials) {
1126 partial3 |= loc->IsPartialStop(eExtreme_Biological);
1127 }
1128 m_ClauseLocation = Seq_loc_Add(*m_ClauseLocation, *loc,
1129 CSeq_loc::fSort | CSeq_loc::fMerge_Overlapping,
1130 &(m_BH.GetScope()));
1131
1132
1133 m_ClauseLocation->SetPartialStart(partial5, eExtreme_Biological);
1134 m_ClauseLocation->SetPartialStop(partial3, eExtreme_Biological);
1135 }
1136
1137
1138 // Match for identical strings or for match at the beginning followed by mat-peptide region
DoesmRNAProductNameMatch(const string & mrna_product) const1139 bool CAutoDefFeatureClause::DoesmRNAProductNameMatch(const string& mrna_product) const
1140 {
1141 if (!m_ProductNameChosen) {
1142 return false;
1143 }
1144 if (NStr::Equal(m_ProductName, mrna_product)) {
1145 return true;
1146 }
1147 if (NStr::StartsWith(m_ProductName, mrna_product) && m_ProductName[mrna_product.length()] == ',' && NStr::EndsWith(m_ProductName, " region,")) {
1148 return true;
1149 }
1150 return false;
1151 }
1152
1153
1154 /* This function searches this list for clauses to which this mRNA should
1155 * apply. This is not taken care of by the GroupAllClauses function
1156 * because when an mRNA is added to a CDS, the product for the clause is
1157 * replaced and the location for the clause is expanded, rather than simply
1158 * adding the mRNA as an additional feature in the list, and because an
1159 * mRNA can apply to more than one clause, while other features should
1160 * really only belong to one clause.
1161 */
AddmRNA(CAutoDefFeatureClause_Base * mRNAClause)1162 bool CAutoDefFeatureClause::AddmRNA (CAutoDefFeatureClause_Base *mRNAClause)
1163 {
1164 bool used_mRNA = false;
1165 string clause_product, mRNA_product;
1166 bool adjust_partials = true;
1167
1168 if (mRNAClause == NULL || ! mRNAClause->SameStrand(*m_ClauseLocation)) {
1169 return false;
1170 }
1171
1172 CSeqFeatData::ESubtype subtype = m_pMainFeat->GetData().GetSubtype();
1173 sequence::ECompare loc_compare = mRNAClause->CompareLocation(*m_ClauseLocation);
1174 if (subtype == CSeqFeatData::eSubtype_cdregion) {
1175 adjust_partials = false;
1176 }
1177
1178 if (subtype == CSeqFeatData::eSubtype_cdregion
1179 && DoesmRNAProductNameMatch(mRNAClause->GetProductName())
1180 && (loc_compare == sequence::eContained || loc_compare == sequence::eSame)) {
1181 m_HasmRNA = true;
1182 // when expanding "location" to include mRNA, leave partials for CDS as they were
1183 AddToLocation(mRNAClause->GetLocation(), adjust_partials);
1184 used_mRNA = true;
1185 } else if ((subtype == CSeqFeatData::eSubtype_cdregion || subtype == CSeqFeatData::eSubtype_gene)
1186 && !m_ProductNameChosen
1187 && (loc_compare == sequence::eContained
1188 || loc_compare == sequence::eContains
1189 || loc_compare == sequence::eSame)) {
1190 m_HasmRNA = true;
1191 AddToLocation(mRNAClause->GetLocation(), adjust_partials);
1192 used_mRNA = true;
1193 m_ProductName = mRNAClause->GetProductName();
1194 m_ProductNameChosen = true;
1195 }
1196
1197 if (used_mRNA && mRNAClause->IsAltSpliced()) {
1198 m_IsAltSpliced = true;
1199 }
1200
1201 return used_mRNA;
1202 }
1203
1204
1205 /* This function searches this list for clauses to which this gene should
1206 * apply. This is not taken care of by the GroupAllClauses function
1207 * because genes are added to clauses as a GeneRefPtr instead of as an
1208 * additional feature in the list, and because a gene can apply to more
1209 * than one clause, while other features should really only belong to
1210 * one clause.
1211 */
AddGene(CAutoDefFeatureClause_Base * gene_clause,bool suppress_allele)1212 bool CAutoDefFeatureClause::AddGene (CAutoDefFeatureClause_Base *gene_clause, bool suppress_allele)
1213 {
1214 bool used_gene = false;
1215
1216 if (gene_clause == NULL || gene_clause->GetMainFeatureSubtype() != CSeqFeatData::eSubtype_gene) {
1217 return false;
1218 }
1219
1220 CSeqFeatData::ESubtype subtype = GetMainFeatureSubtype ();
1221
1222 string noncoding_product_name;
1223
1224 // only add gene to certain other types of clauses
1225 if (subtype != CSeqFeatData::eSubtype_cdregion
1226 && subtype != CSeqFeatData::eSubtype_mRNA
1227 && subtype != CSeqFeatData::eSubtype_rRNA
1228 && subtype != CSeqFeatData::eSubtype_tRNA
1229 && subtype != CSeqFeatData::eSubtype_misc_RNA
1230 && subtype != CSeqFeatData::eSubtype_otherRNA
1231 && subtype != CSeqFeatData::eSubtype_ncRNA
1232 && subtype != CSeqFeatData::eSubtype_precursor_RNA
1233 && subtype != CSeqFeatData::eSubtype_preRNA
1234 && subtype != CSeqFeatData::eSubtype_tmRNA
1235 && subtype != CSeqFeatData::eSubtype_intron
1236 && subtype != CSeqFeatData::eSubtype_exon
1237 && !x_GetNoncodingProductFeatProduct(noncoding_product_name)) {
1238 return false;
1239 }
1240
1241 if (m_HasGene) {
1242 // already assigned
1243 } else {
1244 // find overlapping gene for this feature
1245 CAutoDefGeneClause *gene = dynamic_cast<CAutoDefGeneClause *>(gene_clause);
1246 bool suppress_locus_tag = gene ? gene->GetSuppressLocusTag() : false;
1247 CConstRef <CSeq_feat> gene_for_feat = sequence::GetGeneForFeature(*m_pMainFeat, m_BH.GetScope());
1248 if (gene_for_feat && NStr::Equal(x_GetGeneName(gene_for_feat->GetData().GetGene(), suppress_locus_tag), gene_clause->GetGeneName())) {
1249 used_gene = true;
1250 m_HasGene = true;
1251 m_GeneName = gene_clause->GetGeneName();
1252 m_AlleleName = gene_clause->GetAlleleName();
1253 m_GeneIsPseudo = gene_clause->GetGeneIsPseudo();
1254 m_TypewordChosen = x_GetFeatureTypeWord(m_Typeword);
1255 }
1256 }
1257
1258 if (used_gene && ! m_ProductNameChosen) {
1259 Label(suppress_allele);
1260 if (!m_ProductNameChosen) {
1261 m_ProductNameChosen = true;
1262 m_ProductName = gene_clause->GetProductName();
1263 }
1264 }
1265 if (used_gene) {
1266 m_DescriptionChosen = false;
1267 Label(suppress_allele);
1268 }
1269
1270 return used_gene;
1271 }
1272
1273
OkToGroupUnderByType(const CAutoDefFeatureClause_Base * parent_clause) const1274 bool CAutoDefFeatureClause::OkToGroupUnderByType(const CAutoDefFeatureClause_Base *parent_clause) const
1275 {
1276 bool ok_to_group = false;
1277
1278 if (parent_clause == NULL) {
1279 return false;
1280 }
1281 CSeqFeatData::ESubtype subtype = m_pMainFeat->GetData().GetSubtype();
1282 CSeqFeatData::ESubtype parent_subtype = parent_clause->GetMainFeatureSubtype();
1283
1284 if (parent_subtype == CSeqFeatData::eSubtype_mobile_element) {
1285 return true;
1286 }
1287
1288 if (subtype == CSeqFeatData::eSubtype_exon || subtype == CSeqFeatData::eSubtype_intron) {
1289 if (parent_subtype == CSeqFeatData::eSubtype_cdregion
1290 || parent_subtype == CSeqFeatData::eSubtype_D_loop
1291 || parent_subtype == CSeqFeatData::eSubtype_mRNA
1292 || parent_subtype == CSeqFeatData::eSubtype_gene
1293 || parent_subtype == CSeqFeatData::eSubtype_operon
1294 || parent_clause->IsNoncodingProductFeat()
1295 || parent_clause->IsEndogenousVirusSourceFeature()
1296 || parent_clause->IsGeneCluster()) {
1297 ok_to_group = true;
1298 }
1299 } else if (IsPromoter() || subtype == CSeqFeatData::eSubtype_regulatory) {
1300 if (parent_subtype == CSeqFeatData::eSubtype_cdregion
1301 || parent_subtype == CSeqFeatData::eSubtype_mRNA
1302 || parent_subtype == CSeqFeatData::eSubtype_gene
1303 || parent_subtype == CSeqFeatData::eSubtype_operon
1304 || parent_clause->IsEndogenousVirusSourceFeature()
1305 || parent_clause->IsGeneCluster()) {
1306 ok_to_group = true;
1307 }
1308 } else if (subtype == CSeqFeatData::eSubtype_cdregion) {
1309 if (parent_subtype == CSeqFeatData::eSubtype_mRNA
1310 || parent_clause->IsInsertionSequence()
1311 || parent_clause->IsMobileElement()
1312 || parent_clause->IsEndogenousVirusSourceFeature()
1313 || parent_subtype == CSeqFeatData::eSubtype_operon
1314 || parent_clause->IsGeneCluster()) {
1315 ok_to_group = true;
1316 }
1317 } else if (IsInsertionSequence()
1318 || subtype == CSeqFeatData::eSubtype_gene
1319 || IsMobileElement()
1320 || IsNoncodingProductFeat()
1321 || subtype == CSeqFeatData::eSubtype_operon
1322 || IsGeneCluster()) {
1323 if (parent_clause->IsMobileElement()
1324 || parent_clause->IsInsertionSequence()
1325 || parent_clause->IsEndogenousVirusSourceFeature()
1326 || parent_subtype == CSeqFeatData::eSubtype_operon
1327 || parent_clause->IsGeneCluster()) {
1328 ok_to_group = true;
1329 }
1330 } else if (subtype == CSeqFeatData::eSubtype_3UTR
1331 || subtype == CSeqFeatData::eSubtype_5UTR
1332 || IsLTR(*m_pMainFeat)) {
1333 if (parent_subtype == CSeqFeatData::eSubtype_cdregion
1334 || parent_subtype == CSeqFeatData::eSubtype_mRNA
1335 || parent_subtype == CSeqFeatData::eSubtype_gene
1336 || parent_clause->IsEndogenousVirusSourceFeature()
1337 || parent_subtype == CSeqFeatData::eSubtype_operon
1338 || parent_clause->IsGeneCluster()) {
1339 ok_to_group = true;
1340 }
1341 }
1342
1343 return ok_to_group;
1344 }
1345
1346
// Transposons, insertion sequences, and endogenous viruses
// take subfeatures regardless of whether the subfeature is
// on the same strand.
// Gene Clusters can optionally take subfeatures on either
// strand (gene_cluster_opp_strand is flag).
// Promoters will match up to features that are adjacent.
// Introns will match up to coding regions if the intron
// location is the space between two coding region intervals.
// Any feature on an mRNA sequence groups locationally.
// All other feature matches must be that the feature to
// go into the clause must fit inside the location of the
// other clause.
//
// Returns true when this clause may be placed under parent_clause as far as
// location is concerned; a NULL parent_clause always yields false.
bool CAutoDefFeatureClause::OkToGroupUnderByLocation(const CAutoDefFeatureClause_Base *parent_clause, bool gene_cluster_opp_strand) const
{
    if (parent_clause == NULL) {
        return false;
    }

    if (m_HasGene && parent_clause->GetMainFeatureSubtype() == CSeqFeatData::eSubtype_gene) {
        // genes must match to be parents
        if (!NStr::Equal(m_GeneName, parent_clause->GetGeneName())) {
            return false;
        }
    }

    // on an mRNA sequence every feature groups locationally
    if (m_Biomol == CMolInfo::eBiomol_mRNA) {
        return true;
    }

    sequence::ECompare loc_compare = parent_clause->CompareLocation(*m_ClauseLocation);

    if (loc_compare == sequence::eContained || loc_compare == sequence::eSame) {
        if (parent_clause->SameStrand(*m_ClauseLocation)) {
            return true;
        } else if (parent_clause->IsMobileElement()
                   || parent_clause->IsInsertionSequence()
                   || parent_clause->IsEndogenousVirusSourceFeature()
                   || (parent_clause->IsGeneCluster() && gene_cluster_opp_strand)) {
            // these parents accept subfeatures from the opposite strand too
            return true;
        }
    } else if (IsPromoter()
               && parent_clause->SameStrand(*m_ClauseLocation)) {
        // promoter must end exactly one position before the parent begins,
        // measured in the biological direction
        unsigned int promoter_stop = sequence::GetStop(*m_ClauseLocation, &(m_BH.GetScope()), eExtreme_Biological);
        unsigned int parent_start = sequence::GetStart(*(parent_clause->GetLocation()), &(m_BH.GetScope()), eExtreme_Biological);
        if (m_ClauseLocation->GetStrand() == eNa_strand_minus) {
            if (promoter_stop == parent_start + 1) {
                return true;
            }
        } else if (promoter_stop + 1 == parent_start) {
            return true;
        }
    } else if (m_pMainFeat->GetData().GetSubtype() == CSeqFeatData::eSubtype_intron
               && parent_clause->GetMainFeatureSubtype() == CSeqFeatData::eSubtype_cdregion
               && parent_clause->SameStrand(*m_ClauseLocation)) {
        // walk consecutive pairs of coding region intervals and look for a
        // gap that the intron fills exactly
        CSeq_loc_CI seq_loc_it(*(parent_clause->GetLocation()));
        if (seq_loc_it) {
            int intron_start = sequence::GetStart(*m_ClauseLocation, &(m_BH.GetScope()), eExtreme_Biological);
            int intron_stop = sequence::GetStop (*m_ClauseLocation, &(m_BH.GetScope()), eExtreme_Biological);
            int prev_start = seq_loc_it.GetRange().GetFrom();
            int prev_stop = seq_loc_it.GetRange().GetTo();
            ++seq_loc_it;
            while (seq_loc_it) {
                int cds_start = seq_loc_it.GetRange().GetFrom();
                int cds_stop = seq_loc_it.GetRange().GetTo();
                // first alternative: intervals listed in ascending order;
                // second alternative: intervals listed in descending order
                if ((intron_start == prev_stop + 1 && intron_stop == cds_start - 1)
                    || (intron_start == cds_stop + 1 && intron_stop == prev_start - 1)) {
                    return true;
                }
                prev_start = cds_start;
                prev_stop = cds_stop;
                ++seq_loc_it;
            }
            // intron could also group with coding region if coding region is adjacent
            // (prev_start/prev_stop now describe the last interval visited)
            if (intron_start > prev_stop && intron_start - 1 == prev_stop) {
                return true;
            } else if (prev_start > intron_stop && prev_start - 1 == intron_stop) {
                return true;
            }
        }
    }

    return false;
}
1430
1431
FindBestParentClause(CAutoDefFeatureClause_Base * subclause,bool gene_cluster_opp_strand)1432 CAutoDefFeatureClause_Base *CAutoDefFeatureClause::FindBestParentClause(CAutoDefFeatureClause_Base * subclause, bool gene_cluster_opp_strand)
1433 {
1434 CAutoDefFeatureClause_Base *best_parent;
1435
1436 if (subclause == NULL || subclause == this) {
1437 return NULL;
1438 }
1439
1440 if (!NStr::IsBlank(subclause->GetGeneName()) &&
1441 !NStr::IsBlank(this->GetGeneName()) &&
1442 !NStr::Equal(subclause->GetGeneName(), this->GetGeneName())) {
1443 return NULL;
1444 }
1445
1446 best_parent = CAutoDefFeatureClause_Base::FindBestParentClause(subclause, gene_cluster_opp_strand);
1447
1448 if (subclause->OkToGroupUnderByLocation(this, gene_cluster_opp_strand)
1449 && subclause->OkToGroupUnderByType(this)) {
1450 if (best_parent == NULL || best_parent->CompareLocation(*m_ClauseLocation) == sequence::eContained) {
1451 best_parent = this;
1452 }
1453 }
1454 return best_parent;
1455 }
1456
ReverseCDSClauseLists()1457 void CAutoDefFeatureClause::ReverseCDSClauseLists()
1458 {
1459 ENa_strand this_strand = m_ClauseLocation->GetStrand();
1460 if (this_strand == eNa_strand_minus
1461 && GetMainFeatureSubtype() == CSeqFeatData::eSubtype_cdregion) {
1462 std::reverse(m_ClauseList.begin(), m_ClauseList.end());
1463 }
1464
1465 for (unsigned int k = 0; k < m_ClauseList.size(); k++) {
1466 m_ClauseList[k]->ReverseCDSClauseLists();
1467 }
1468 }
1469
1470
1471
ShouldRemoveExons() const1472 bool CAutoDefFeatureClause::ShouldRemoveExons() const
1473 {
1474 unsigned int subtype = GetMainFeatureSubtype();
1475
1476 if (subtype == CSeqFeatData::eSubtype_mRNA) {
1477 return false;
1478 } else if (subtype == CSeqFeatData::eSubtype_cdregion) {
1479 if (IsPartial()) {
1480 // keep only if exons have numbers
1481 for (size_t k = 0; k < m_ClauseList.size(); k++) {
1482 if (m_ClauseList[k]->IsExonWithNumber()) {
1483 return false;
1484 }
1485 }
1486 return true;
1487 } else {
1488 return true;
1489 }
1490 } else {
1491 return true;
1492 }
1493 }
1494
1495
IsExonWithNumber() const1496 bool CAutoDefFeatureClause::IsExonWithNumber() const
1497 {
1498 if (m_pMainFeat->IsSetData() &&
1499 m_pMainFeat->GetData().GetSubtype() == CSeqFeatData::eSubtype_exon &&
1500 m_pMainFeat->IsSetQual()) {
1501 ITERATE(CSeq_feat::TQual, it, m_pMainFeat->GetQual()) {
1502 if ((*it)->IsSetQual() &&
1503 NStr::Equal((*it)->GetQual(), "number") &&
1504 (*it)->IsSetVal() &&
1505 !NStr::IsBlank((*it)->GetVal())) {
1506 return true;
1507 }
1508 }
1509 }
1510 return false;
1511 }
1512
1513
IsBioseqPrecursorRNA() const1514 bool CAutoDefFeatureClause::IsBioseqPrecursorRNA() const
1515 {
1516 if (m_Biomol == CMolInfo::eBiomol_pre_RNA && GetMainFeatureSubtype() == CSeqFeatData::eSubtype_preRNA) {
1517 return true;
1518 } else {
1519 return false;
1520 }
1521 }
1522
1523
/* Builds an ncRNA clause.  All arguments are forwarded to the
 * CAutoDefFeatureClause constructor; m_UseComment is then taken from the
 * GetUseNcRNAComment() option.
 */
CAutoDefNcRNAClause::CAutoDefNcRNAClause(CBioseq_Handle bh, const CSeq_feat &main_feat, const CSeq_loc &mapped_loc, const CAutoDefOptions& opts)
                      : CAutoDefFeatureClause(bh, main_feat, mapped_loc, opts),
                       m_UseComment (m_Opts.GetUseNcRNAComment())
{
}
1529
1530
~CAutoDefNcRNAClause()1531 CAutoDefNcRNAClause::~CAutoDefNcRNAClause()
1532 {
1533 }
1534
1535
x_GetProductName(string & product_name)1536 bool CAutoDefNcRNAClause::x_GetProductName(string &product_name)
1537 {
1538 string ncrna_product;
1539 string ncrna_class;
1540 if (m_pMainFeat->IsSetData() && m_pMainFeat->GetData().IsRna()
1541 && m_pMainFeat->GetData().GetRna().IsSetExt()) {
1542 const CRNA_ref::TExt& ext = m_pMainFeat->GetData().GetRna().GetExt();
1543 if (ext.IsName()) {
1544 ncrna_product = ext.GetName();
1545 if (NStr::EqualNocase(ncrna_product, "ncRNA")) {
1546 ncrna_product = "";
1547 }
1548 } else if (ext.IsGen()) {
1549 if (ext.GetGen().IsSetProduct()) {
1550 ncrna_product = ext.GetGen().GetProduct();
1551 }
1552 if (ext.GetGen().IsSetClass()) {
1553 ncrna_class = ext.GetGen().GetClass();
1554 }
1555 }
1556 }
1557 if (NStr::IsBlank(ncrna_product)) {
1558 ncrna_product = m_pMainFeat->GetNamedQual("product");
1559 }
1560 if (NStr::IsBlank(ncrna_class)) {
1561 ncrna_class = m_pMainFeat->GetNamedQual("ncRNA_class");
1562 }
1563 if (NStr::EqualNocase(ncrna_class, "other")) {
1564 ncrna_class = "";
1565 }
1566 NStr::ReplaceInPlace(ncrna_class, "_", " ");
1567
1568 string ncrna_comment;
1569 if (m_pMainFeat->IsSetComment()) {
1570 ncrna_comment = m_pMainFeat->GetComment();
1571 if (!NStr::IsBlank(ncrna_comment)) {
1572 string::size_type pos = NStr::Find(ncrna_comment, ";");
1573 if (pos != NCBI_NS_STD::string::npos) {
1574 ncrna_comment = ncrna_comment.substr(0, pos);
1575 }
1576 }
1577 }
1578
1579 if (!NStr::IsBlank (ncrna_product)) {
1580 product_name = ncrna_product;
1581 if (!NStr::IsBlank (ncrna_class)) {
1582 product_name += " " + ncrna_class;
1583 }
1584 } else if (!NStr::IsBlank(ncrna_class)) {
1585 product_name = ncrna_class;
1586 } else if (m_UseComment && !NStr::IsBlank (ncrna_comment)) {
1587 product_name = ncrna_comment;
1588 } else {
1589 product_name = "non-coding RNA";
1590 }
1591 return true;
1592
1593 }
1594
1595
// Keywords recognised inside a "mobile_element_type" qualifier value.
// The CAutoDefMobileElementClause constructor scans this table in order and
// stops at the first keyword found at the beginning or end of the value, so
// the order of the entries matters.
static string mobile_element_keywords [] = {
  "insertion sequence",
  "retrotransposon",
  "non-LTR retrotransposon",
  "transposon",
  "P-element",
  "transposable element",
  "integron",
  "superintegron",
  "SINE",
  "MITE",
  "LINE"
};
1609
1610
/* Builds a mobile element clause.  The "mobile_element_type" qualifier of
 * the main feature (minus a leading "other:") is split into a typeword (one
 * of mobile_element_keywords, or "mobile element" when none matches) and a
 * description (the remaining text).  Typeword, description and product name
 * are all marked as chosen, so the generic labelling code will not
 * recompute them.
 */
CAutoDefMobileElementClause::CAutoDefMobileElementClause(CBioseq_Handle bh, const CSeq_feat& main_feat, const CSeq_loc& mapped_loc, const CAutoDefOptions& opts)
                  : CAutoDefFeatureClause(bh, main_feat, mapped_loc, opts)
{
    string mobile_element_name = m_pMainFeat->GetNamedQual("mobile_element_type");
    if (NStr::StartsWith(mobile_element_name, "other:")) {
        // 6 == length of "other:"
        mobile_element_name = mobile_element_name.substr(6);
    }
    bool   found_keyword = false;

    m_Pluralizable = true;

    if (NStr::IsBlank(mobile_element_name)) {
        m_Description = "";
        m_ShowTypewordFirst = false;
        m_Typeword = "mobile element";
    } else {
        // first keyword found at the beginning or the end of the name wins
        for (unsigned int k = 0; k < sizeof (mobile_element_keywords) / sizeof (string) && !found_keyword; k++) {
            size_t pos;
            if (NStr::StartsWith(mobile_element_name, mobile_element_keywords[k])) {
                // keyword at the beginning
                m_Typeword = mobile_element_keywords[k];
                if (NStr::Equal(mobile_element_name, mobile_element_keywords[k])) {
                    m_ShowTypewordFirst = false;
                    m_Description = "";
                } else {
                    m_ShowTypewordFirst = true;
                    m_Description = mobile_element_name.substr(mobile_element_keywords[k].length());
                    NStr::TruncateSpacesInPlace(m_Description);
                }
                // c_str() makes the read at index length() safe (it yields '\0')
                if (mobile_element_name.c_str()[mobile_element_keywords[k].length()] == '-') {
                    // if keyword is hyphenated portion of name, no pluralization
                    m_Pluralizable = false;
                }
                found_keyword = true;
            } else if (NStr::EndsWith(mobile_element_name, mobile_element_keywords[k])) {
                // keyword at the end
                m_Typeword = mobile_element_keywords[k];
                m_ShowTypewordFirst = false;
                m_Description = mobile_element_name.substr(0, mobile_element_name.length() - mobile_element_keywords[k].length());
                NStr::TruncateSpacesInPlace(m_Description);
                found_keyword = true;
            } else if ((pos = NStr::Find(mobile_element_name, mobile_element_keywords[k])) != string::npos
                       && isspace(mobile_element_name.c_str()[pos])) {
                // keyword in the middle
                // NOTE(review): pos is the index of the keyword's first
                // character, and no keyword starts with whitespace, so this
                // isspace() test looks like it can never succeed; found_keyword
                // is also not set here, so the "!found_keyword" fallback below
                // would overwrite these values.  Possibly the character before
                // the match was meant -- confirm intended behaviour.
                m_Typeword = "";
                m_ShowTypewordFirst = false;
                m_Description = mobile_element_name.substr(pos);
                m_Pluralizable = false;
            }
        }
        if (!found_keyword) {
            // keyword not in description
            m_Typeword = "mobile element";
            m_Description = mobile_element_name;
        }
    }
    if (NStr::EqualNocase(m_Typeword, "integron")) {
        m_ShowTypewordFirst = false;
    }

    m_DescriptionChosen = true;
    m_TypewordChosen = true;
    m_ProductName = "";
    m_ProductNameChosen = true;
    // tidy the description: surrounding spaces, a leading ':', and the
    // placeholder "unnamed" are all dropped
    NStr::TruncateSpacesInPlace(m_Description);
    if (NStr::StartsWith(m_Description, ":")) {
        m_Description = m_Description.substr(1);
        NStr::TruncateSpacesInPlace(m_Description);
    }
    if (NStr::Equal(m_Description, "unnamed")) {
        m_Description = "";
    }
}
1684
1685
~CAutoDefMobileElementClause()1686 CAutoDefMobileElementClause::~CAutoDefMobileElementClause()
1687 {
1688 }
1689
1690
/* Labelling for mobile element clauses: typeword, description and product
 * name were fixed by the constructor, so only the interval is recomputed.
 */
void CAutoDefMobileElementClause::Label(bool suppress_allele)
{
    m_DescriptionChosen = true;
    x_GetGenericInterval (m_Interval, suppress_allele);
}
1696
1697
IsOptional()1698 bool CAutoDefMobileElementClause::IsOptional()
1699 {
1700 if (NStr::Equal(m_Typeword, "SINE") ||
1701 NStr::Equal(m_Typeword, "LINE") ||
1702 NStr::Equal(m_Typeword, "MITE")) {
1703 return true;
1704 } else {
1705 return false;
1706 }
1707
1708 }
1709
1710
// Prefixes recognised at the start of a "satellite" qualifier value by the
// CAutoDefSatelliteClause constructor.
const char *kMinisatellite = "minisatellite";
const char *kMicrosatellite = "microsatellite";
const char *kSatellite = "satellite";
1714
CAutoDefSatelliteClause(CBioseq_Handle bh,const CSeq_feat & main_feat,const CSeq_loc & mapped_loc,const CAutoDefOptions & opts)1715 CAutoDefSatelliteClause::CAutoDefSatelliteClause(CBioseq_Handle bh, const CSeq_feat& main_feat, const CSeq_loc &mapped_loc, const CAutoDefOptions& opts)
1716 : CAutoDefFeatureClause(bh, main_feat, mapped_loc, opts)
1717 {
1718 string comment = m_pMainFeat->GetNamedQual("satellite");
1719 string::size_type pos = NStr::Find(comment, ";");
1720 if (pos != NCBI_NS_STD::string::npos) {
1721 comment = comment.substr(0, pos);
1722 }
1723
1724 size_t len = 0;
1725
1726 if (NStr::StartsWith(comment, kMinisatellite)) {
1727 len = strlen (kMinisatellite);
1728 } else if (NStr::StartsWith (comment, kMicrosatellite)) {
1729 len = strlen (kMicrosatellite);
1730 } else if (NStr::StartsWith (comment, kSatellite)) {
1731 len = strlen (kSatellite);
1732 } else {
1733 // use default label satellite
1734 string prefix = kSatellite;
1735 comment = prefix + " " + comment;
1736 }
1737 if (len > 0 && NStr::Equal(comment.substr(len, 1), ":")) {
1738 comment = comment.substr (0, len) + " " + comment.substr (len + 1);
1739 }
1740
1741 m_Description = comment;
1742 m_DescriptionChosen = true;
1743 m_Typeword = "sequence";
1744 m_TypewordChosen = true;
1745 }
1746
1747
~CAutoDefSatelliteClause()1748 CAutoDefSatelliteClause::~CAutoDefSatelliteClause()
1749 {
1750 }
1751
1752
/* Labelling for satellite clauses: the description set by the constructor
 * is kept, and only the interval is recomputed.
 */
void CAutoDefSatelliteClause::Label(bool suppress_allele)
{
    m_DescriptionChosen = true;
    x_GetGenericInterval(m_Interval, suppress_allele);
}
1758
1759
CAutoDefPromoterClause(CBioseq_Handle bh,const CSeq_feat & main_feat,const CSeq_loc & mapped_loc,const CAutoDefOptions & opts)1760 CAutoDefPromoterClause::CAutoDefPromoterClause(CBioseq_Handle bh, const CSeq_feat& main_feat, const CSeq_loc &mapped_loc, const CAutoDefOptions& opts)
1761 : CAutoDefFeatureClause(bh, main_feat, mapped_loc, opts)
1762 {
1763 m_Description = "";
1764 m_DescriptionChosen = true;
1765 m_Typeword = "promoter region";
1766 m_TypewordChosen = true;
1767 m_Interval = "";
1768 }
1769
1770
~CAutoDefPromoterClause()1771 CAutoDefPromoterClause::~CAutoDefPromoterClause()
1772 {
1773 }
1774
1775
/* Labelling for promoter clauses: everything was fixed by the constructor,
 * so this only marks the description as chosen.  suppress_allele is unused.
 */
void CAutoDefPromoterClause::Label(bool suppress_allele)
{
    m_DescriptionChosen = true;
}
1780
1781
1782 /* This class produces the default definition line label for a misc_feature
1783 * that has the word "intergenic spacer" in the comment. If the comment starts
1784 * with the word "contains", "contains" is ignored. If "intergenic spacer"
1785 * appears first in the comment (or first after the word "contains", the text
1786 * after the words "intergenic spacer" but before the first semicolon (if any)
1787 * appear after the words "intergenic spacer" in the definition line. If there
1788 * are words after "contains" or at the beginning of the comment before the words
1789 * "intergenic spacer", this text will appear in the definition line before the words
1790 * "intergenic spacer".
1791 */
1792
InitWithString(string comment,bool suppress_allele)1793 void CAutoDefIntergenicSpacerClause::InitWithString (string comment, bool suppress_allele)
1794 {
1795 m_Typeword = "intergenic spacer";
1796 m_TypewordChosen = true;
1797 m_ShowTypewordFirst = false;
1798 m_Pluralizable = false;
1799
1800
1801 if (NStr::StartsWith(comment, "may contain ")) {
1802 m_Description = comment.substr(12);
1803 m_DescriptionChosen = true;
1804 m_Typeword = "";
1805 m_TypewordChosen = true;
1806 m_Interval = "region";
1807 } else {
1808 if (NStr::StartsWith(comment, "contains ")) {
1809 comment = comment.substr(9);
1810 }
1811
1812 if (NStr::StartsWith(comment, "intergenic spacer")) {
1813 comment = comment.substr(17);
1814 if (NStr::IsBlank(comment)) {
1815 m_ShowTypewordFirst = false;
1816 m_Description = "";
1817 m_DescriptionChosen = true;
1818 } else {
1819 NStr::TruncateSpacesInPlace(comment);
1820 if (NStr::StartsWith(comment, "and ")) {
1821 m_Description = "";
1822 m_DescriptionChosen = true;
1823 m_ShowTypewordFirst = false;
1824 } else {
1825 m_Description = comment;
1826 m_DescriptionChosen = true;
1827 m_ShowTypewordFirst = true;
1828 }
1829 }
1830 } else {
1831 string::size_type pos = NStr::Find(comment, "intergenic spacer");
1832 if (pos != NCBI_NS_STD::string::npos) {
1833 m_Description = comment.substr(0, pos);
1834 NStr::TruncateSpacesInPlace(m_Description);
1835 m_DescriptionChosen = true;
1836 m_ShowTypewordFirst = false;
1837 }
1838 }
1839 x_GetGenericInterval(m_Interval, suppress_allele);
1840 }
1841 }
1842
1843
/* Builds an intergenic spacer clause whose label is derived from the
 * supplied comment text rather than from the feature's own comment.
 * InitWithString() is called with suppress_allele == true.
 */
CAutoDefIntergenicSpacerClause::CAutoDefIntergenicSpacerClause(CBioseq_Handle bh, const CSeq_feat& main_feat, const CSeq_loc &mapped_loc, string comment, const CAutoDefOptions& opts)
                  : CAutoDefFeatureClause(bh, main_feat, mapped_loc, opts)
{
    InitWithString (comment, true);
}
1849
1850
CAutoDefIntergenicSpacerClause(CBioseq_Handle bh,const CSeq_feat & main_feat,const CSeq_loc & mapped_loc,const CAutoDefOptions & opts)1851 CAutoDefIntergenicSpacerClause::CAutoDefIntergenicSpacerClause(CBioseq_Handle bh, const CSeq_feat& main_feat, const CSeq_loc &mapped_loc, const CAutoDefOptions& opts)
1852 : CAutoDefFeatureClause(bh, main_feat, mapped_loc, opts)
1853 {
1854
1855 string comment;
1856 if (m_pMainFeat->IsSetComment()) {
1857 comment = m_pMainFeat->GetComment();
1858 }
1859
1860 /* truncate at first semicolon */
1861 string::size_type pos = NStr::Find(comment, ";");
1862 if (pos != NCBI_NS_STD::string::npos) {
1863 comment = comment.substr(0, pos);
1864 }
1865
1866 InitWithString (comment, true);
1867 }
1868
1869
~CAutoDefIntergenicSpacerClause()1870 CAutoDefIntergenicSpacerClause::~CAutoDefIntergenicSpacerClause()
1871 {
1872 }
1873
1874
Label(bool suppress_allele)1875 void CAutoDefIntergenicSpacerClause::Label(bool suppress_allele)
1876 {
1877 m_DescriptionChosen = true;
1878 }
1879
1880
CAutoDefParsedIntergenicSpacerClause(CBioseq_Handle bh,const CSeq_feat & main_feat,const CSeq_loc & mapped_loc,const string & description,bool is_first,bool is_last,const CAutoDefOptions & opts)1881 CAutoDefParsedIntergenicSpacerClause::CAutoDefParsedIntergenicSpacerClause(CBioseq_Handle bh, const CSeq_feat &main_feat, const CSeq_loc &mapped_loc,
1882 const string& description, bool is_first, bool is_last, const CAutoDefOptions& opts)
1883 : CAutoDefIntergenicSpacerClause(bh, main_feat, mapped_loc, opts)
1884 {
1885 if (!NStr::IsBlank(description)) {
1886 m_Description = description;
1887 size_t pos = NStr::Find(m_Description, "intergenic spacer");
1888 if (pos != string::npos) {
1889 m_Description = m_Description.substr(0, pos);
1890 NStr::TruncateSpacesInPlace(m_Description);
1891 }
1892 m_DescriptionChosen = true;
1893 }
1894 m_Typeword = "intergenic spacer";
1895 m_TypewordChosen = true;
1896
1897 // adjust partialness of location
1898 bool partial5 = m_ClauseLocation->IsPartialStart(eExtreme_Biological) && is_first;
1899 bool partial3 = m_ClauseLocation->IsPartialStop(eExtreme_Biological) && is_last;
1900 m_ClauseLocation->SetPartialStart(partial5, eExtreme_Biological);
1901 m_ClauseLocation->SetPartialStop(partial3, eExtreme_Biological);
1902 x_GetGenericInterval(m_Interval, true);
1903 if (NStr::EndsWith(description, " region")) {
1904 MakeRegion();
1905 }
1906 }
1907
1908
~CAutoDefParsedIntergenicSpacerClause()1909 CAutoDefParsedIntergenicSpacerClause::~CAutoDefParsedIntergenicSpacerClause()
1910 {
1911 }
1912
1913
~CAutoDefParsedtRNAClause()1914 CAutoDefParsedtRNAClause::~CAutoDefParsedtRNAClause()
1915 {
1916 }
1917
1918
CAutoDefParsedClause(CBioseq_Handle bh,const CSeq_feat & main_feat,const CSeq_loc & mapped_loc,bool is_first,bool is_last,const CAutoDefOptions & opts)1919 CAutoDefParsedClause::CAutoDefParsedClause(CBioseq_Handle bh, const CSeq_feat &main_feat, const CSeq_loc &mapped_loc, bool is_first, bool is_last, const CAutoDefOptions& opts)
1920 : CAutoDefFeatureClause (bh, main_feat, mapped_loc, opts)
1921 {
1922 // adjust partialness of location
1923 bool partial5 = m_ClauseLocation->IsPartialStart(eExtreme_Biological) && is_first;
1924 bool partial3 = m_ClauseLocation->IsPartialStop(eExtreme_Biological) && is_last;
1925 m_ClauseLocation->SetPartialStart(partial5, eExtreme_Biological);
1926 m_ClauseLocation->SetPartialStop(partial3, eExtreme_Biological);
1927 }
1928
~CAutoDefParsedClause()1929 CAutoDefParsedClause::~CAutoDefParsedClause()
1930 {
1931 }
1932
SetMiscRNAWord(const string & phrase)1933 void CAutoDefParsedClause::SetMiscRNAWord(const string& phrase)
1934 {
1935 ERnaMiscWord word_type = x_GetRnaMiscWordType(phrase);
1936 if (word_type == eMiscRnaWordType_InternalSpacer ||
1937 word_type == eMiscRnaWordType_ExternalSpacer ||
1938 word_type == eMiscRnaWordType_RNAIntergenicSpacer ||
1939 word_type == eMiscRnaWordType_IntergenicSpacer) {
1940 const string& item_name = x_GetRnaMiscWord(word_type);
1941 if (NStr::StartsWith(phrase, item_name)) {
1942 SetTypewordFirst(true);
1943 m_Description = phrase.substr(item_name.length());
1944 } else {
1945 SetTypewordFirst(false);
1946 m_Description = phrase.substr(0, NStr::Find(phrase, item_name));
1947 }
1948 if (NStr::EndsWith(phrase, " region") &&
1949 (!m_ShowTypewordFirst || m_Description != " region")) {
1950 SetTypeword(item_name + " region");
1951 } else {
1952 SetTypeword(item_name);
1953 }
1954 } else if (word_type == eMiscRnaWordType_RNA) {
1955 m_Description = phrase;
1956 if (NStr::EndsWith(m_Description, " gene")) {
1957 m_Description = m_Description.substr(0, m_Description.length() - 5);
1958 }
1959 SetTypeword("gene");
1960 SetTypewordFirst(false);
1961 } else if (word_type == eMiscRnaWordType_tRNA) {
1962 string gene_name;
1963 string product_name;
1964 if (CAutoDefParsedtRNAClause::ParseString(phrase, gene_name, product_name)) {
1965 m_TypewordChosen = true;
1966 m_GeneName = gene_name;
1967 if (!NStr::IsBlank(m_GeneName)) {
1968 m_HasGene = true;
1969 }
1970 m_ProductName = product_name;
1971 m_ProductNameChosen = true;
1972 x_GetDescription(m_Description);
1973 } else {
1974 m_Description = phrase;
1975 }
1976 SetTypeword("gene");
1977 SetTypewordFirst(false);
1978 }
1979 NStr::TruncateSpacesInPlace(m_Description);
1980 m_DescriptionChosen = true;
1981 }
1982
1983
CAutoDefParsedtRNAClause(CBioseq_Handle bh,const CSeq_feat & main_feat,const CSeq_loc & mapped_loc,string gene_name,string product_name,bool is_first,bool is_last,const CAutoDefOptions & opts)1984 CAutoDefParsedtRNAClause::CAutoDefParsedtRNAClause(CBioseq_Handle bh, const CSeq_feat &main_feat, const CSeq_loc &mapped_loc,
1985 string gene_name, string product_name,
1986 bool is_first, bool is_last, const CAutoDefOptions& opts)
1987 : CAutoDefParsedClause (bh, main_feat, mapped_loc, is_first, is_last, opts)
1988 {
1989 m_Typeword = "gene";
1990 m_TypewordChosen = true;
1991 m_GeneName = gene_name;
1992 if (!NStr::IsBlank (m_GeneName)) {
1993 m_HasGene = true;
1994 }
1995 m_ProductName = product_name;
1996 m_ProductNameChosen = true;
1997 }
1998
1999
CAutoDefGeneClusterClause(CBioseq_Handle bh,const CSeq_feat & main_feat,const CSeq_loc & mapped_loc,const CAutoDefOptions & opts)2000 CAutoDefGeneClusterClause::CAutoDefGeneClusterClause(CBioseq_Handle bh, const CSeq_feat& main_feat, const CSeq_loc &mapped_loc, const CAutoDefOptions& opts)
2001 : CAutoDefFeatureClause(bh, main_feat, mapped_loc, opts)
2002 {
2003 m_Pluralizable = false;
2004 m_ShowTypewordFirst = false;
2005 string comment = m_pMainFeat->GetComment();
2006
2007 string::size_type pos = NStr::Find(comment, "gene cluster");
2008 if (pos == NCBI_NS_STD::string::npos) {
2009 pos = NStr::Find(comment, "gene locus");
2010 m_Typeword = "gene locus";
2011 m_TypewordChosen = true;
2012 } else {
2013 m_Typeword = "gene cluster";
2014 m_TypewordChosen = true;
2015 }
2016
2017 if (pos != NCBI_NS_STD::string::npos) {
2018 comment = comment.substr(0, pos);
2019 }
2020 NStr::TruncateSpacesInPlace(comment);
2021 m_Description = comment;
2022 m_DescriptionChosen = true;
2023 m_SuppressSubfeatures = true;
2024 }
2025
2026
~CAutoDefGeneClusterClause()2027 CAutoDefGeneClusterClause::~CAutoDefGeneClusterClause()
2028 {
2029 }
2030
2031
Label(bool suppress_allele)2032 void CAutoDefGeneClusterClause::Label(bool suppress_allele)
2033 {
2034 x_GetGenericInterval(m_Interval, suppress_allele);
2035 m_DescriptionChosen = true;
2036 }
2037
2038
CAutoDefMiscCommentClause(CBioseq_Handle bh,const CSeq_feat & main_feat,const CSeq_loc & mapped_loc,const CAutoDefOptions & opts)2039 CAutoDefMiscCommentClause::CAutoDefMiscCommentClause(CBioseq_Handle bh, const CSeq_feat &main_feat, const CSeq_loc &mapped_loc, const CAutoDefOptions& opts)
2040 : CAutoDefFeatureClause(bh, main_feat, mapped_loc, opts)
2041 {
2042 if (m_pMainFeat->CanGetComment()) {
2043 m_Description = m_pMainFeat->GetComment();
2044 string::size_type pos = NStr::Find(m_Description, ";");
2045 if (pos != NCBI_NS_STD::string::npos) {
2046 m_Description = m_Description.substr(0, pos);
2047 }
2048 m_DescriptionChosen = true;
2049 }
2050 if (NStr::EndsWith(m_Description, " sequence")) {
2051 m_Description = m_Description.substr(0, m_Description.length() - 9);
2052 m_Typeword = "sequence";
2053 m_TypewordChosen = true;
2054 } else {
2055 x_TypewordFromSequence();
2056 }
2057 m_Interval = "";
2058 }
2059
2060
~CAutoDefMiscCommentClause()2061 CAutoDefMiscCommentClause::~CAutoDefMiscCommentClause()
2062 {
2063 }
2064
2065
Label(bool suppress_allele)2066 void CAutoDefMiscCommentClause::Label(bool suppress_allele)
2067 {
2068 m_DescriptionChosen = true;
2069 }
2070
2071
CAutoDefParsedRegionClause(CBioseq_Handle bh,const CSeq_feat & main_feat,const CSeq_loc & mapped_loc,string product,const CAutoDefOptions & opts)2072 CAutoDefParsedRegionClause::CAutoDefParsedRegionClause
2073 (CBioseq_Handle bh, const CSeq_feat &main_feat, const CSeq_loc &mapped_loc, string product, const CAutoDefOptions& opts)
2074 : CAutoDefFeatureClause(bh, main_feat, mapped_loc, opts)
2075 {
2076 vector<string> elements = GetMiscRNAElements(product);
2077 if (elements.empty()) {
2078 m_Description = product;
2079 } else {
2080 ITERATE(vector<string>, it, elements) {
2081 if (!NStr::IsBlank(m_Description)) {
2082 m_Description += ", ";
2083 if (*it == elements.back()) {
2084 m_Description += "and ";
2085 }
2086 }
2087 m_Description += *it;
2088 if (NStr::Find(*it, "RNA") != string::npos && !NStr::EndsWith(*it, "gene") && !NStr::EndsWith(*it, "genes")) {
2089 m_Description += " gene";
2090 }
2091 }
2092 }
2093 m_DescriptionChosen = true;
2094
2095 m_Typeword = "";
2096 m_TypewordChosen = true;
2097 m_Interval = "region";
2098 }
2099
2100
~CAutoDefParsedRegionClause()2101 CAutoDefParsedRegionClause::~CAutoDefParsedRegionClause()
2102 {
2103 }
2104
2105
Label(bool suppress_allele)2106 void CAutoDefParsedRegionClause::Label(bool suppress_allele)
2107 {
2108 }
2109
CAutoDefFakePromoterClause(CBioseq_Handle bh,const CSeq_feat & main_feat,const CSeq_loc & mapped_loc,const CAutoDefOptions & opts)2110 CAutoDefFakePromoterClause::CAutoDefFakePromoterClause(CBioseq_Handle bh, const CSeq_feat &main_feat, const CSeq_loc &mapped_loc, const CAutoDefOptions& opts)
2111 : CAutoDefFeatureClause (bh, main_feat, mapped_loc, opts)
2112 {
2113 m_Description = "";
2114 m_DescriptionChosen = true;
2115 m_Typeword = "promoter region";
2116 m_TypewordChosen = true;
2117 m_ShowTypewordFirst = false;
2118 m_Interval = "";
2119
2120
2121 m_ClauseLocation = new CSeq_loc();
2122 const CSeq_id* id = FindBestChoice(bh.GetBioseqCore()->GetId(), CSeq_id::BestRank);
2123 CRef <CSeq_id> new_id(new CSeq_id);
2124 new_id->Assign(*id);
2125 m_ClauseLocation->SetInt().SetId(*new_id);
2126 m_ClauseLocation->SetInt().SetFrom(0);
2127 m_ClauseLocation->SetInt().SetTo(bh.GetInst_Length() - 1);
2128
2129 }
2130
2131
~CAutoDefFakePromoterClause()2132 CAutoDefFakePromoterClause::~CAutoDefFakePromoterClause()
2133 {
2134 }
2135
2136
Label(bool suppress_allele)2137 void CAutoDefFakePromoterClause::Label(bool suppress_allele)
2138 {
2139 }
2140
2141
OkToGroupUnderByLocation(const CAutoDefFeatureClause_Base * parent_clause,bool gene_cluster_opp_strand) const2142 bool CAutoDefFakePromoterClause::OkToGroupUnderByLocation(const CAutoDefFeatureClause_Base *parent_clause, bool gene_cluster_opp_strand) const
2143 {
2144 if (parent_clause == NULL) {
2145 return false;
2146 } else {
2147 return true;
2148 }
2149 }
2150
2151
OkToGroupUnderByType(const CAutoDefFeatureClause_Base * parent_clause) const2152 bool CAutoDefFakePromoterClause::OkToGroupUnderByType(const CAutoDefFeatureClause_Base *parent_clause) const
2153 {
2154 bool ok_to_group = false;
2155
2156 if (parent_clause == NULL) {
2157 return false;
2158 }
2159 CSeqFeatData::ESubtype parent_subtype = parent_clause->GetMainFeatureSubtype();
2160
2161 if (parent_subtype == CSeqFeatData::eSubtype_cdregion
2162 || parent_subtype == CSeqFeatData::eSubtype_mRNA
2163 || parent_subtype == CSeqFeatData::eSubtype_gene
2164 || parent_subtype == CSeqFeatData::eSubtype_operon
2165 || parent_clause->IsEndogenousVirusSourceFeature()
2166 || parent_clause->IsGeneCluster()) {
2167 ok_to_group = true;
2168 }
2169
2170 return ok_to_group;
2171 }
2172
2173
CAutoDefPromoterAnd5UTRClause(CBioseq_Handle bh,const CSeq_feat & main_feat,const CSeq_loc & mapped_loc,const CAutoDefOptions & opts)2174 CAutoDefPromoterAnd5UTRClause::CAutoDefPromoterAnd5UTRClause(CBioseq_Handle bh, const CSeq_feat &main_feat, const CSeq_loc &mapped_loc, const CAutoDefOptions& opts)
2175 : CAutoDefFeatureClause(bh, main_feat, mapped_loc, opts)
2176 {
2177 m_Description = "promoter region and 5' UTR";
2178 m_DescriptionChosen = true;
2179 m_Typeword = "";
2180 m_TypewordChosen = true;
2181 m_ShowTypewordFirst = false;
2182 m_Interval = "genomic sequence";
2183
2184
2185 m_ClauseLocation = new CSeq_loc();
2186 const CSeq_id* id = FindBestChoice(bh.GetBioseqCore()->GetId(), CSeq_id::BestRank);
2187 CRef <CSeq_id> new_id(new CSeq_id);
2188 new_id->Assign(*id);
2189 m_ClauseLocation->SetInt().SetId(*new_id);
2190 m_ClauseLocation->SetInt().SetFrom(0);
2191 m_ClauseLocation->SetInt().SetTo(bh.GetInst_Length() - 1);
2192
2193 }
2194
2195
IsPromoterAnd5UTR(const CSeq_feat & feat)2196 bool CAutoDefPromoterAnd5UTRClause::IsPromoterAnd5UTR(const CSeq_feat& feat)
2197 {
2198 return (feat.IsSetData() &&
2199 feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_misc_feature &&
2200 feat.IsSetComment() &&
2201 NStr::Equal(feat.GetComment(), "contains promoter and 5' UTR"));
2202 }
2203
2204
Label(bool suppress_allele)2205 void CAutoDefPromoterAnd5UTRClause::Label(bool suppress_allele)
2206 {
2207
2208 }
2209
2210
GetClauseType() const2211 CAutoDefFeatureClause::EClauseType CAutoDefFeatureClause::GetClauseType() const
2212 {
2213 CSeqFeatData::ESubtype subtype = GetMainFeatureSubtype();
2214 if (subtype == CSeqFeatData::eSubtype_repeat_region) {
2215 if (!NStr::IsBlank(m_pMainFeat->GetNamedQual("endogenous_virus"))) {
2216 return eEndogenousVirusRepeatRegion;
2217 }
2218 }
2219 return eDefault;
2220 }
2221
2222
2223 // Some misc_RNA clauses have a comment that actually lists multiple
2224 // features. These functions create a clause for each element in the
2225 // comment.
2226
AddMiscRNAFeatures(const CBioseq_Handle & bh,const CSeq_feat & cf,const CSeq_loc & mapped_loc,const CAutoDefOptions & opts)2227 vector<CRef<CAutoDefFeatureClause > > AddMiscRNAFeatures(const CBioseq_Handle& bh, const CSeq_feat& cf, const CSeq_loc& mapped_loc, const CAutoDefOptions& opts)
2228 {
2229 vector<CRef<CAutoDefFeatureClause > > rval;
2230 string comment;
2231 string::size_type pos;
2232
2233 if (cf.GetData().Which() == CSeqFeatData::e_Rna) {
2234 comment = cf.GetNamedQual("product");
2235 if (NStr::IsBlank(comment)
2236 && cf.IsSetData()
2237 && cf.GetData().IsRna()
2238 && cf.GetData().GetRna().IsSetExt()) {
2239 if (cf.GetData().GetRna().GetExt().IsName()) {
2240 comment = cf.GetData().GetRna().GetExt().GetName();
2241 }
2242 else if (cf.GetData().GetRna().GetExt().IsGen()
2243 && cf.GetData().GetRna().GetExt().GetGen().IsSetProduct()) {
2244 comment = cf.GetData().GetRna().GetExt().GetGen().GetProduct();
2245 }
2246 }
2247 }
2248
2249 if ((NStr::Equal(comment, "misc_RNA") || NStr::IsBlank(comment)) && cf.CanGetComment()) {
2250 comment = cf.GetComment();
2251 }
2252 if (NStr::IsBlank(comment)) {
2253 return rval;
2254 }
2255
2256 pos = NStr::Find(comment, "spacer");
2257 if (pos == NPOS) {
2258 return rval;
2259 }
2260
2261 bool is_region = false;
2262
2263 NStr::TrimPrefixInPlace(comment, "contains ");
2264 if (NStr::StartsWith(comment, "may contain ")) {
2265 NStr::TrimPrefixInPlace(comment, "may contain ");
2266 is_region = true;
2267 }
2268
2269 pos = NStr::Find(comment, ";");
2270 if (pos != string::npos) {
2271 comment = comment.substr(0, pos);
2272 }
2273
2274 if (is_region) {
2275 rval.push_back(CRef<CAutoDefFeatureClause>(new CAutoDefParsedRegionClause(bh, cf, mapped_loc, comment, opts)));
2276 } else {
2277 vector<string> elements = CAutoDefFeatureClause::GetMiscRNAElements(comment);
2278 if (!elements.empty()) {
2279 for (auto s : elements) {
2280 CRef<CAutoDefParsedClause> new_clause(new CAutoDefParsedClause(bh, cf, mapped_loc,
2281 (s == elements.front()), (s == elements.back()), opts));
2282 new_clause->SetMiscRNAWord(s);
2283 rval.push_back(new_clause);
2284 }
2285 } else {
2286 elements = CAutoDefFeatureClause::GetTrnaIntergenicSpacerClausePhrases(comment);
2287 if (!elements.empty()) {
2288 for (auto s : elements) {
2289 size_t pos = NStr::Find(s, "intergenic spacer");
2290 if (pos != string::npos) {
2291 rval.push_back(CRef<CAutoDefFeatureClause>(new CAutoDefParsedIntergenicSpacerClause(bh,
2292 cf,
2293 mapped_loc,
2294 (s),
2295 (s == elements.front()),
2296 (s == elements.back()), opts)));
2297 } else {
2298 rval.push_back(CRef<CAutoDefFeatureClause>(s_tRNAClauseFromNote(bh, cf, mapped_loc, s, (s == elements.front()), (s == elements.back()), opts)));
2299 }
2300 }
2301 } else {
2302 rval.push_back(CRef<CAutoDefFeatureClause>(new CAutoDefParsedIntergenicSpacerClause(bh,
2303 cf,
2304 mapped_loc,
2305 comment,
2306 true,
2307 true,
2308 opts)));
2309 }
2310 }
2311 }
2312 return rval;
2313 }
2314
2315
AddtRNAAndOther(const CBioseq_Handle & bh,const CSeq_feat & cf,const CSeq_loc & mapped_loc,const CAutoDefOptions & opts)2316 vector<CRef<CAutoDefFeatureClause > > AddtRNAAndOther(const CBioseq_Handle& bh, const CSeq_feat& cf, const CSeq_loc& mapped_loc, const CAutoDefOptions& opts)
2317 {
2318 vector<CRef<CAutoDefFeatureClause> > rval;
2319 if (cf.GetData().GetSubtype() != CSeqFeatData::eSubtype_misc_feature ||
2320 !cf.IsSetComment()) {
2321 return rval;
2322 }
2323
2324 vector<string> phrases = CAutoDefFeatureClause_Base::GetFeatureClausePhrases(cf.GetComment());
2325 if (phrases.size() < 2) {
2326 return rval;
2327 }
2328
2329 bool first = true;
2330 string last = phrases.back();
2331 phrases.pop_back();
2332 ITERATE(vector<string>, it, phrases) {
2333 rval.push_back(CRef<CAutoDefFeatureClause>(CAutoDefFeatureClause_Base::ClauseFromPhrase(*it, bh, cf, mapped_loc, first, false, opts)));
2334 first = false;
2335 }
2336 rval.push_back(CRef<CAutoDefFeatureClause>(CAutoDefFeatureClause_Base::ClauseFromPhrase(last, bh, cf, mapped_loc, first, true, opts)));
2337
2338 return rval;
2339 }
2340
2341
FeatureClauseFactory(CBioseq_Handle bh,const CSeq_feat & cf,const CSeq_loc & mapped_loc,const CAutoDefOptions & opts,bool is_single_misc_feat)2342 vector<CRef<CAutoDefFeatureClause > > FeatureClauseFactory(CBioseq_Handle bh, const CSeq_feat& cf, const CSeq_loc& mapped_loc, const CAutoDefOptions& opts, bool is_single_misc_feat)
2343 {
2344 vector<CRef<CAutoDefFeatureClause> > rval;
2345
2346 auto subtype = cf.GetData().GetSubtype();
2347
2348 if (opts.IsFeatureSuppressed(subtype)) {
2349 return rval;
2350 }
2351
2352 if (subtype == CSeqFeatData::eSubtype_gene) {
2353 rval.push_back(CRef<CAutoDefFeatureClause>(new CAutoDefGeneClause(bh, cf, mapped_loc, opts)));
2354 } else if (subtype == CSeqFeatData::eSubtype_ncRNA) {
2355 rval.push_back(CRef<CAutoDefFeatureClause>(new CAutoDefNcRNAClause(bh, cf, mapped_loc, opts)));
2356 } else if (subtype == CSeqFeatData::eSubtype_mobile_element) {
2357 rval.push_back(CRef<CAutoDefFeatureClause>(new CAutoDefMobileElementClause(bh, cf, mapped_loc, opts)));
2358 } else if (CAutoDefFeatureClause::IsSatellite(cf)) {
2359 rval.push_back(CRef<CAutoDefFeatureClause>(new CAutoDefSatelliteClause(bh, cf, mapped_loc, opts)));
2360 } else if (subtype == CSeqFeatData::eSubtype_otherRNA
2361 || subtype == CSeqFeatData::eSubtype_misc_RNA
2362 || subtype == CSeqFeatData::eSubtype_rRNA) {
2363 auto misc_rna = AddMiscRNAFeatures(bh, cf, mapped_loc, opts);
2364 if (misc_rna.empty()) {
2365 rval.push_back(CRef<CAutoDefFeatureClause>(new CAutoDefFeatureClause(bh, cf, mapped_loc, opts)));
2366 } else {
2367 for (auto it : misc_rna) {
2368 rval.push_back(it);
2369 }
2370 }
2371 } else if (CAutoDefFeatureClause::IsPromoter(cf)) {
2372 rval.push_back(CRef<CAutoDefFeatureClause>(new CAutoDefPromoterClause(bh, cf, mapped_loc, opts)));
2373 } else if (CAutoDefFeatureClause::IsGeneCluster(cf)) {
2374 rval.push_back(CRef<CAutoDefFeatureClause>(new CAutoDefGeneClusterClause(bh, cf, mapped_loc, opts)));
2375 } else if (CAutoDefFeatureClause::IsControlRegion(cf)) {
2376 rval.push_back(CRef<CAutoDefFeatureClause>(new CAutoDefFeatureClause(bh, cf, mapped_loc, opts)));
2377 }
2378 else if (subtype == CSeqFeatData::eSubtype_otherRNA) {
2379 auto misc_rna = AddMiscRNAFeatures(bh, cf, mapped_loc, opts);
2380 if (misc_rna.empty()) {
2381 // try to make trna clauses
2382 misc_rna = AddtRNAAndOther(bh, cf, mapped_loc, opts);
2383 }
2384 if (misc_rna.empty()) {
2385 rval.push_back(CRef<CAutoDefFeatureClause>(new CAutoDefFeatureClause(bh, cf, mapped_loc, opts)));
2386 }
2387 else {
2388 for (auto it : misc_rna) {
2389 rval.push_back(it);
2390 }
2391 }
2392 } else if (subtype == CSeqFeatData::eSubtype_misc_feature &&
2393 is_single_misc_feat && CAutoDefPromoterAnd5UTRClause::IsPromoterAnd5UTR(cf)) {
2394 rval.push_back(CRef<CAutoDefFeatureClause>(new CAutoDefPromoterAnd5UTRClause(bh, cf, mapped_loc, opts)));
2395 } else if (subtype == CSeqFeatData::eSubtype_misc_feature) {
2396 auto misc_rna = AddMiscRNAFeatures(bh, cf, mapped_loc, opts);
2397 if (misc_rna.empty()) {
2398 // try to make trna clauses
2399 misc_rna = AddtRNAAndOther(bh, cf, mapped_loc, opts);
2400 }
2401 if (misc_rna.empty()) {
2402 // some misc-features may require more parsing
2403 CRef<CAutoDefFeatureClause> new_clause(new CAutoDefFeatureClause(bh, cf, mapped_loc, opts));
2404 if (!is_single_misc_feat &&
2405 (opts.GetMiscFeatRule() == CAutoDefOptions::eDelete
2406 || (opts.GetMiscFeatRule() == CAutoDefOptions::eNoncodingProductFeat && !new_clause->IsNoncodingProductFeat()))) {
2407 // do not create a clause at all
2408 new_clause.Reset(NULL);
2409 } else if (opts.GetMiscFeatRule() == CAutoDefOptions::eCommentFeat) {
2410 new_clause.Reset(NULL);
2411 if (cf.CanGetComment() && !NStr::IsBlank(cf.GetComment())) {
2412 misc_rna.push_back(CRef<CAutoDefFeatureClause>(new CAutoDefMiscCommentClause(bh, cf, mapped_loc, opts)));
2413 }
2414 } else {
2415 misc_rna.push_back(new_clause);
2416 }
2417 }
2418 if (!misc_rna.empty()) {
2419 for (auto it : misc_rna) {
2420 rval.push_back(it);
2421 }
2422 }
2423
2424 } else {
2425 rval.push_back(CRef<CAutoDefFeatureClause>(new CAutoDefFeatureClause(bh, cf, mapped_loc, opts)));
2426 }
2427 return rval;
2428 }
2429
2430
2431 END_SCOPE(objects)
2432 END_NCBI_SCOPE
2433