1 /* $Id: feature_item.cpp 637281 2021-09-09 19:27:07Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Aaron Ucko, NCBI
27 * Mati Shomrat
28 * Maintainer: Frank Ludwig
29 *
30 * File Description:
31 * new (early 2003) flat-file generator -- representation of features
32 * (mainly of interest to implementors)
33 *
34 *
35 * WHEN EDITING THE LIST OF QUALIFIERS:
36 *
37 * - there is currently a lot of parallel logic for the FTable case
38 * (CFeatureItem::x_AddFTableQuals()) and the standard case
39 * (CFeatureItem::x_Add...Quals()). Make sure to edit both cases as
40 * appropriate.
41 * ===========================================================================
42 */
43 #include <ncbi_pch.hpp>
44 #include <corelib/ncbistd.hpp>
45 #include <serial/iterator.hpp>
46 #include <serial/enumvalues.hpp>
47
48 #include <algorithm>
49 #include <sstream>
50
51 #include <objects/seq/Bioseq.hpp>
52 #include <objects/seq/Heterogen.hpp>
53 #include <objects/seq/MolInfo.hpp>
54 #include <objects/seq/seq_id_handle.hpp>
55 #include <objects/seq/Annot_descr.hpp>
56 #include <objects/seq/Annotdesc.hpp>
57 #include <objects/seq/Seq_literal.hpp>
58 #include <objects/seq/seqport_util.hpp>
59 #include <objects/seqfeat/Org_ref.hpp>
60 #include <objects/seqfeat/OrgName.hpp>
61 #include <objects/seqfeat/OrgMod.hpp>
62 #include <objects/seqfeat/PCRPrimerSet.hpp>
63 #include <objects/seqfeat/PCRPrimer.hpp>
64 #include <objects/seqfeat/PCRReaction.hpp>
65 #include <objects/seqfeat/PCRReactionSet.hpp>
66 #include <objects/seqfeat/Code_break.hpp>
67 #include <objects/seqfeat/Delta_item.hpp>
68 #include <objects/seqfeat/Gb_qual.hpp>
69 #include <objects/seqfeat/Gene_nomenclature.hpp>
70 #include <objects/seqfeat/Genetic_code.hpp>
71 #include <objects/seqfeat/Genetic_code_table.hpp>
72 #include <objects/seqfeat/Imp_feat.hpp>
73 #include <objects/seqfeat/RNA_ref.hpp>
74 #include <objects/seqfeat/RNA_gen.hpp>
75 #include <objects/seqfeat/RNA_qual_set.hpp>
76 #include <objects/seqfeat/RNA_qual.hpp>
77 #include <objects/seqfeat/Trna_ext.hpp>
78 #include <objects/seqfeat/Feat_id.hpp>
79 #include <objects/seqfeat/SeqFeatXref.hpp>
80 #include <objects/seqfeat/Variation_ref.hpp>
81 #include <objects/seqfeat/Variation_inst.hpp>
82 #include <objects/seqloc/Seq_loc.hpp>
83 #include <objects/seqloc/Seq_point.hpp>
84 #include <objects/seqloc/Seq_interval.hpp>
85 #include <objects/seqloc/Packed_seqpnt.hpp>
86 #include <objects/seqloc/Textseq_id.hpp>
87 #include <objects/general/Object_id.hpp>
88 #include <objects/misc/sequence_macros.hpp>
89
90 #include <objmgr/scope.hpp>
91 #include <objmgr/object_manager.hpp>
92 #include <objmgr/seqdesc_ci.hpp>
93 #include <objmgr/seq_vector.hpp>
94 #include <objmgr/util/sequence.hpp>
95 #include <objmgr/util/feature.hpp>
96 #include <objmgr/util/weight.hpp>
97 #include <objmgr/util/seq_loc_util.hpp>
98
99 #include <util/static_set.hpp>
100 #include <util/static_map.hpp>
101 #include <util/sequtil/sequtil.hpp>
102 #include <util/sequtil/sequtil_convert.hpp>
103
104 #include <algorithm>
105 #include <objtools/data_loaders/genbank/gbloader.hpp>
106 #include <objtools/format/formatter.hpp>
107 #include <objtools/format/items/feature_item.hpp>
108 #include <objtools/format/items/gene_finder.hpp>
109 #include <objtools/format/context.hpp>
110 #include <objtools/format/items/qualifiers.hpp>
111 #include <objmgr/util/objutil.hpp>
112 #include "inst_info_map.hpp"
113
114 // On Mac OS X 10.3, FixMath.h defines ff as a one-argument macro(!)
115 #ifdef ff
116 # undef ff
117 #endif
118
119 BEGIN_NCBI_SCOPE
120 BEGIN_SCOPE(objects)
121 USING_SCOPE(sequence);
122
123 class CGoQualLessThan
124 {
125 public:
operator ()(const CConstRef<CFlatGoQVal> & obj1,const CConstRef<CFlatGoQVal> & obj2)126 bool operator() ( const CConstRef<CFlatGoQVal> &obj1, const CConstRef<CFlatGoQVal> &obj2 )
127 {
128 const CFlatGoQVal *qval1 = obj1.GetNonNullPointer();
129 const CFlatGoQVal *qval2 = obj2.GetNonNullPointer();
130
131 // sort by text string
132 const string &str1 = qval1->GetTextString();
133 const string &str2 = qval2->GetTextString();
134
135 int textComparison = 0;
136
137 // This whole paragraph should eventually be replaced with a mere NStr::CompareNocase stored into textComparison
138 // We can't just use NStr::CompareNocase, because that compares using tolower, whereas
139 // we must compare with toupper to maintain compatibility with C.
140 SIZE_TYPE pos = 0;
141 const SIZE_TYPE min_length = min( str1.length(), str2.length() );
142 for( ; pos < min_length; ++pos ) {
143 textComparison = toupper( str1[pos] ) - toupper( str2[pos] );
144 if( textComparison != 0 ) {
145 break;
146 }
147 }
148 if( 0 == textComparison ) {
149 // if we reached the end, compare via length (shorter first)
150 textComparison = str1.length() - str2.length();
151 }
152
153 // compare by text, if possible
154 if( textComparison < 0 ) {
155 return true;
156 } else if( textComparison > 0 ) {
157 return false;
158 }
159
160 // if text is tied, then sort by pubmed id, if any
161 int pmid1 = qval1->GetPubmedId();
162 int pmid2 = qval2->GetPubmedId();
163
164 if( 0 == pmid1 ) {
165 return false;
166 } else if( 0 == pmid2 ) {
167 return true;
168 } else {
169 return pmid1 < pmid2;
170 }
171 }
172 };
173
174 // -- static functions
175
s_ValidId(const CSeq_id & id)176 static bool s_ValidId(const CSeq_id& id)
177 {
178 return id.IsGenbank() || id.IsEmbl() || id.IsDdbj() ||
179 id.IsOther() || id.IsPatent() ||
180 id.IsTpg() || id.IsTpe() || id.IsTpd() ||
181 id.IsGpipe();
182 }
183
184 static
s_StrEqualDisregardFinalPeriod(const string & s1,const string & s2,NStr::ECase use_case)185 bool s_StrEqualDisregardFinalPeriod(
186 const string &s1, const string &s2,
187 NStr::ECase use_case )
188 {
189 if( s1.empty() || s2.empty() ) {
190 return s1.empty() && s2.empty();
191 }
192
193 // set length to disregard final period, if any
194 size_t s1_len = s1.length();
195 if( s1[s1_len-1] == '.' ) {
196 --s1_len;
197 }
198 size_t s2_len = s2.length();
199 if( s2[s2_len-1] == '.' ) {
200 --s2_len;
201 }
202
203 if( s1_len != s2_len ) {
204 return false;
205 }
206
207 // NStr::Equal does not have exactly the function I want,
208 // so I have to make my own.
209 for( size_t ii = 0; ii < s1_len ; ++ii ) {
210 const char ch1 = ( use_case == NStr::eNocase ? toupper(s1[ii]) : s1[ii] );
211 const char ch2 = ( use_case == NStr::eNocase ? toupper(s2[ii]) : s2[ii] );
212 if( ch1 != ch2 ) {
213 return false;
214 }
215 }
216 return true;
217 }
218
s_CheckQuals_cdregion(const CMappedFeat & feat,const CSeq_loc & loc,CBioseqContext & ctx)219 static bool s_CheckQuals_cdregion(const CMappedFeat& feat,
220 const CSeq_loc& loc,
221 CBioseqContext& ctx)
222 {
223 if ( !ctx.Config().CheckCDSProductId() ) {
224 return true;
225 }
226
227 CScope& scope = ctx.GetScope();
228
229 // non-pseudo CDS must have /product
230 bool pseudo = feat.IsSetPseudo() && feat.GetPseudo() ;
231 if ( !pseudo ) {
232 const CGene_ref* grp = feat.GetGeneXref();
233 if ( grp == NULL ) {
234 CConstRef<CSeq_feat> gene = GetOverlappingGene(loc, scope);
235 if (gene) {
236 pseudo = gene->IsSetPseudo() && gene->GetPseudo();
237 if ( !pseudo ) {
238 grp = &(gene->GetData().GetGene());
239 }
240 }
241 }
242 if ( !pseudo && grp != NULL ) {
243 pseudo = grp->GetPseudo();
244 }
245 }
246
247 bool just_stop = false;
248 const CSeq_loc& Loc = feat.GetLocation();
249 if ( Loc.IsPartialStart(eExtreme_Biological) && !Loc.IsPartialStop(eExtreme_Biological) ) {
250 if ( GetLength(Loc, &scope) <= 5 ) {
251 just_stop = true;
252 }
253 }
254
255 if ( pseudo || just_stop ) {
256 return true;
257 }
258
259 // make sure the product has a valid accession
260 if (feat.IsSetProduct()) {
261 CConstRef<CSeq_id> id;
262 try {
263 id.Reset(&(GetId(feat.GetProduct(), &scope)));
264 } catch ( CException& ) {
265 id.Reset(NULL);
266 }
267 if (id) {
268 if ((id->IsGi() && id->GetGi() > ZERO_GI) || id->IsLocal()) {
269 CBioseq_Handle prod = scope.GetBioseqHandleFromTSE(*id, ctx.GetHandle());
270 if (prod) {
271 ITERATE (CBioseq_Handle::TId, it, prod.GetId()) {
272 if (s_ValidId(*it->GetSeqId())) {
273 CConstRef<CTextseq_id> tsip(it->GetSeqId()->GetTextseq_Id());
274 if (tsip && tsip->IsSetAccession() &&
275 IsValidAccession(tsip->GetAccession())) {
276 return true;
277 }
278 }
279 }
280 } else if (id->IsGi() && id->GetGi() > ZERO_GI) {
281 // RELEASE_MODE requires that /protein_id is an accession
282 if (ctx.Config().IsModeRelease()) {
283 try {
284 if (IsValidAccession(GetAccessionForGi(id->GetGi(), scope))) {
285 return true;
286 }
287 } catch (CException&) {
288 }
289 }
290 }
291 } else if (s_ValidId(*id)) {
292 CConstRef<CTextseq_id> tsip(id->GetTextseq_Id());
293 if (tsip && tsip->IsSetAccession() &&
294 IsValidAccession(tsip->GetAccession())) {
295 return true;
296 }
297 }
298 }
299 } else { // no product
300 if (feat.IsSetExcept() && feat.GetExcept() &&
301 feat.IsSetExcept_text() ) {
302 if (NStr::Find(feat.GetExcept_text(),
303 "rearrangement required for product") != NPOS) {
304 return true;
305 }
306 }
307 }
308
309 return false;
310 }
311
312
313
s_HasPub(const CMappedFeat & feat,CBioseqContext & ctx)314 static bool s_HasPub(const CMappedFeat& feat, CBioseqContext& ctx)
315 {
316 ITERATE(CBioseqContext::TReferences, it, ctx.GetReferences()) {
317 if ((*it)->Matches(feat.GetCit())) {
318 return true;
319 }
320 }
321
322 return false;
323 }
324
325
s_HasCompareOrCitation(const CMappedFeat & feat,CBioseqContext & ctx)326 static bool s_HasCompareOrCitation(const CMappedFeat& feat, CBioseqContext& ctx)
327 {
328 // check for /compare
329 if (!NStr::IsBlank(feat.GetNamedQual("compare"))) {
330 return true;
331 }
332
333 // check for /citation
334 if (feat.IsSetCit()) {
335 return s_HasPub(feat, ctx);
336 }
337
338 return false;
339 }
340
341
342 // conflict requires /citation or /compare
s_CheckQuals_conflict(const CMappedFeat & feat,CBioseqContext & ctx)343 static bool s_CheckQuals_conflict(const CMappedFeat& feat, CBioseqContext& ctx)
344 {
345 // RefSeq allows conflict with accession in comment instead of sfp->cit
346 if (ctx.IsRefSeq() &&
347 feat.IsSetComment() && !NStr::IsBlank(feat.GetComment())) {
348 return true;
349 }
350
351 return s_HasCompareOrCitation(feat, ctx);
352 }
353
354 // old_sequence requires /citation or /compare
s_CheckQuals_old_seq(const CMappedFeat & feat,CBioseqContext & ctx)355 static bool s_CheckQuals_old_seq(const CMappedFeat& feat, CBioseqContext& ctx)
356 {
357 return s_HasCompareOrCitation(feat, ctx);
358 }
359
360
s_CheckQuals_gene(const CMappedFeat & feat)361 static bool s_CheckQuals_gene(const CMappedFeat& feat)
362 {
363 // gene requires /gene or /locus_tag, but desc or syn can be mapped to /gene
364 const CSeqFeatData::TGene& gene = feat.GetData().GetGene();
365 if ( (gene.IsSetLocus() && !gene.GetLocus().empty()) ||
366 (gene.IsSetLocus_tag() && !gene.GetLocus_tag().empty()) ||
367 (gene.IsSetDesc() && !gene.GetDesc().empty()) ||
368 (!gene.GetSyn().empty() && !gene.GetSyn().front().empty()) ) {
369 return true;
370 }
371
372 return false;
373 }
374
375
s_CheckQuals_bind(const CMappedFeat & feat)376 static bool s_CheckQuals_bind(const CMappedFeat& feat)
377 {
378 // protein_bind or misc_binding require eFQ_bound_moiety
379 return !NStr::IsBlank(feat.GetNamedQual("bound_moiety"));
380 }
381
382
s_CheckQuals_mod_base(const CMappedFeat & feat)383 static bool s_CheckQuals_mod_base(const CMappedFeat& feat)
384 {
385 // modified_base requires eFQ_mod_base
386 return !NStr::IsBlank(feat.GetNamedQual("mod_base"));
387 }
388
389
s_CheckQuals_gap(const CMappedFeat & feat)390 static bool s_CheckQuals_gap(const CMappedFeat& feat)
391 {
392 // gap feature must have /estimated_length qual
393 return !feat.GetNamedQual("estimated_length").empty();
394 }
395
s_CheckQuals_assembly_gap(const CMappedFeat & feat)396 static bool s_CheckQuals_assembly_gap(const CMappedFeat& feat)
397 {
398 // assembly_gap feature must have /estimated_length qual
399 // and /gap_type
400 return ! feat.GetNamedQual("estimated_length").empty() &&
401 ! feat.GetNamedQual("gap_type").empty();
402 }
403
404
s_CheckQuals_ncRNA(const CMappedFeat & feat)405 static bool s_CheckQuals_ncRNA(const CMappedFeat& feat)
406 {
407 if( !NStr::IsBlank(feat.GetNamedQual("ncRNA_class")) ) {
408 return true;
409 }
410
411 // Look at this mess; if only we could use sequence_macros.hpp
412 if( feat.GetData().GetRna().IsSetExt() &&
413 feat.GetData().GetRna().GetExt().IsGen() &&
414 feat.GetData().GetRna().GetExt().GetGen().IsSetClass() &&
415 !NStr::IsBlank(feat.GetData().GetRna().GetExt().GetGen().GetClass()) )
416 {
417 return true;
418 }
419
420 return false;
421 }
422
423
s_CheckQuals_regulatory(const CMappedFeat & feat)424 static bool s_CheckQuals_regulatory(const CMappedFeat& feat)
425 {
426 // regulatory feature must have /regulatory_class qual
427 return ! feat.GetNamedQual("regulatory_class").empty();
428 }
429
430
s_CheckMandatoryQuals(const CMappedFeat & feat,const CSeq_loc & loc,CBioseqContext & ctx)431 static bool s_CheckMandatoryQuals(const CMappedFeat& feat,
432 const CSeq_loc& loc,
433 CBioseqContext& ctx)
434 {
435 switch ( feat.GetData().GetSubtype() ) {
436 case CSeqFeatData::eSubtype_cdregion:
437 {
438 return s_CheckQuals_cdregion(feat, loc, ctx);
439 }
440 case CSeqFeatData::eSubtype_conflict:
441 {
442 return s_CheckQuals_conflict(feat, ctx);
443 }
444 case CSeqFeatData::eSubtype_old_sequence:
445 {
446 return s_CheckQuals_old_seq(feat, ctx);
447 }
448 case CSeqFeatData::eSubtype_gene:
449 {
450 return s_CheckQuals_gene(feat);
451 }
452 case CSeqFeatData::eSubtype_protein_bind:
453 case CSeqFeatData::eSubtype_misc_binding:
454 {
455 return s_CheckQuals_bind(feat);
456 }
457 case CSeqFeatData::eSubtype_modified_base:
458 {
459 return s_CheckQuals_mod_base(feat);
460 }
461 case CSeqFeatData::eSubtype_gap:
462 {
463 return s_CheckQuals_gap(feat);
464 }
465 case CSeqFeatData::eSubtype_assembly_gap:
466 {
467 return s_CheckQuals_assembly_gap(feat);
468 }
469 case CSeqFeatData::eSubtype_ncRNA:
470 {
471 return s_CheckQuals_ncRNA(feat);
472 }
473 case CSeqFeatData::eSubtype_regulatory:
474 {
475 return s_CheckQuals_regulatory(feat);
476 }
477 default:
478 break;
479 }
480
481 return true;
482 }
483
s_SkipFeature(const CMappedFeat & feat,const CSeq_loc & loc,CBioseqContext & ctx)484 static bool s_SkipFeature(const CMappedFeat& feat,
485 const CSeq_loc& loc,
486 CBioseqContext& ctx)
487 {
488 CSeqFeatData::E_Choice type = feat.GetData().Which();
489 CSeqFeatData::ESubtype subtype = feat.GetData().GetSubtype();
490
491 if ( subtype == CSeqFeatData::eSubtype_pub ||
492 /* subtype == CSeqFeatData::eSubtype_non_std_residue || */
493 subtype == CSeqFeatData::eSubtype_biosrc ||
494 subtype == CSeqFeatData::eSubtype_rsite ||
495 subtype == CSeqFeatData::eSubtype_seq ) {
496 return true;
497 }
498
499 const CFlatFileConfig& cfg = ctx.Config();
500
501 // check feature customization flags
502 if ( cfg.ValidateFeatures() &&
503 (subtype == CSeqFeatData::eSubtype_bad ||
504 subtype == CSeqFeatData::eSubtype_virion) ) {
505 return true;
506 }
507
508 if ( cfg.ValidateFeatures() && type == CSeqFeatData::e_Imp ) {
509 switch ( subtype ) {
510 default:
511 break;
512 case CSeqFeatData::eSubtype_imp:
513 case CSeqFeatData::eSubtype_site_ref:
514 case CSeqFeatData::eSubtype_gene:
515 case CSeqFeatData::eSubtype_mutation:
516 case CSeqFeatData::eSubtype_allele:
517 return true;
518 }
519 }
520
521 if ( ctx.IsNuc() && subtype == CSeqFeatData::eSubtype_het ) {
522 return true;
523 }
524
525 if ( cfg.HideImpFeatures() && type == CSeqFeatData::e_Imp ) {
526 return true;
527 }
528
529 if ( cfg.HideMiscFeatures() ) {
530 if ( type == CSeqFeatData::e_Site ||
531 type == CSeqFeatData::e_Bond ||
532 type == CSeqFeatData::e_Region ||
533 type == CSeqFeatData::e_Comment ||
534 subtype == CSeqFeatData::eSubtype_misc_feature ||
535 subtype == CSeqFeatData::eSubtype_preprotein ) {
536 return true;
537 }
538 }
539
540 if ( cfg.HideExonFeatures() && subtype == CSeqFeatData::eSubtype_exon ) {
541 return true;
542 }
543
544 if ( cfg.HideIntronFeatures() && subtype == CSeqFeatData::eSubtype_intron ) {
545 return true;
546 }
547
548 if ( cfg.HideRemoteImpFeatures() && type == CSeqFeatData::e_Imp ) {
549 if ( subtype == CSeqFeatData::eSubtype_variation ||
550 subtype == CSeqFeatData::eSubtype_exon ||
551 subtype == CSeqFeatData::eSubtype_intron ||
552 subtype == CSeqFeatData::eSubtype_misc_feature ) {
553 return true;
554 }
555 }
556
557 if ( cfg.GeneRNACDSFeatures() ) {
558 if ( type != CSeqFeatData::e_Gene &&
559 type != CSeqFeatData::e_Rna &&
560 type != CSeqFeatData::e_Cdregion ) {
561 return true;
562 }
563 }
564
565 // skip genes in DDBJ format
566 if ( cfg.IsFormatDDBJ() && type == CSeqFeatData::e_Gene ) {
567 return true;
568 }
569
570 // if RELEASE mode, make sure we have all info to create mandatory quals.
571 if ( cfg.NeedRequiredQuals() ) {
572 return !s_CheckMandatoryQuals(feat, loc, ctx);
573 }
574
575 return false;
576 }
577
// Predicate identifying characters that are not allowed in an EC number.
// A character is "bad" unless it is a decimal digit, a period or a dash.
class BadECNumberChar {
public:
    // ch  the character to classify
    // Returns true when ch is neither a digit, '.', nor '-'.
    bool operator()( const char ch ) const
    {
        // isdigit() is only defined for values representable as unsigned
        // char (or EOF); convert first so that characters with the high bit
        // set are classified safely.
        const bool is_digit = isdigit(static_cast<unsigned char>(ch)) != 0;
        return !is_digit  &&  ch != '.'  &&  ch != '-';
    }
};
585
// Checks whether a string has the shape of an EC number.
//
// An accepted value consists of exactly four period-separated fields, e.g.
//    num.num.num.num
//    num.num.num.-
//    num.num.-.-
//    num.-.-.-
//    -.-.-.-
// Each field is either a run of digits or a single ambiguity marker ('-' or
// 'n'); once an ambiguous field has been seen, no digits may follow.  As a
// special case, an 'n' that opens the fourth field and is immediately
// followed by a digit (e.g. "1.2.3.n12") is not treated as ambiguous.
// Characters other than digits, '-', 'n' and '.' are not examined by this
// function.
//
// ec_number  the candidate string
// Returns true when ec_number is accepted; an empty string is rejected.
static bool s_IsLegalECNumber(const string& ec_number)
{
    if ( ec_number.empty() ) {
        return false;
    }

    bool is_ambig   = false;  // an ambiguous field has been seen
    int  numperiods = 0;      // periods seen so far
    int  numdigits  = 0;      // digits in the current field
    int  numdashes  = 0;      // ambiguity markers in the current field

    for ( string::const_iterator ec_iter = ec_number.begin();
          ec_iter != ec_number.end();  ++ec_iter ) {
        const char ch = *ec_iter;

        // isdigit() requires a value representable as unsigned char
        if ( isdigit(static_cast<unsigned char>(ch)) ) {
            numdigits++;
            if ( is_ambig ) {
                return false;
            }
        } else if ( ch == '-' ) {
            numdashes++;
            is_ambig = true;
        } else if ( ch == 'n' ) {
            string::const_iterator ec_iter_next = ec_iter;
            ++ec_iter_next;
            const bool n_prefixes_fourth_number =
                ec_iter_next != ec_number.end()  &&
                numperiods == 3  &&
                numdigits == 0  &&
                isdigit(static_cast<unsigned char>(*ec_iter_next));
            if ( !n_prefixes_fourth_number ) {
                numdashes++;
                is_ambig = true;
            }
            // otherwise: allow/ignore n in first position of fourth number
            // to not mean ambiguous, if followed by digit
        } else if ( ch == '.' ) {
            numperiods++;
            // the field just closed must be all digits or one single marker
            if ( numdigits > 0  &&  numdashes > 0 ) {
                return false;
            }
            if ( numdigits == 0  &&  numdashes == 0 ) {
                return false;
            }
            if ( numdashes > 1 ) {
                return false;
            }
            numdigits = 0;
            numdashes = 0;
        }
    }

    // validate the last field; exactly three periods are required
    if ( numperiods == 3 ) {
        if ( numdigits > 0  &&  numdashes > 0 ) {
            return false;
        }
        if ( numdigits > 0  ||  numdashes == 1 ) {
            return true;
        }
    }

    return false;
}
635
636
s_GetBondName(CSeqFeatData::TBond bond)637 static const string& s_GetBondName(CSeqFeatData::TBond bond)
638 {
639 static const string kOther = "unclassified";
640 return (bond == CSeqFeatData::eBond_other) ? kOther :
641 CSeqFeatData::ENUM_METHOD_NAME(EBond)()->FindName(bond, true);
642 }
643
s_QualVectorToNote(const CFlatFeature::TQuals & qualVector,bool noRedundancy,string & note,string & punctuation,bool & addPeriod)644 static void s_QualVectorToNote(
645 const CFlatFeature::TQuals& qualVector,
646 bool noRedundancy,
647 string& note,
648 string& punctuation,
649 bool& addPeriod)
650 {
651 // is there at least one note which is more than blank or a period?
652 bool hasSubstantiveNote = false;
653 // store this so we can chop off the extra stuff we added if there was no note of substance
654 const string::size_type original_length = note.length();
655
656 string prefix;
657 ITERATE (CFlatFeature::TQuals, it, qualVector) {
658 const string& qual = (*it)->GetValue();
659
660 prefix.erase();
661 if ( !note.empty() ) {
662 prefix = punctuation;
663 const string& next_prefix = (*it)->GetPrefix();
664 if (!NStr::EndsWith(prefix, '\n') ) {
665 prefix += next_prefix;
666 }
667 }
668
669 if( !qual.empty() && qual != "." ) {
670 hasSubstantiveNote = true;
671 }
672
673 // A qual may declare that it be shown even if redundant and override the
674 // given noRedundancy variable
675 const bool noRedundancyThisIteration =
676 ( 0 != ( (*it)->GetFlags() & CFormatQual::fFlags_showEvenIfRedund ) ? false : noRedundancy );
677 JoinString(note, prefix, qual, noRedundancyThisIteration );
678
679 addPeriod = (*it)->GetAddPeriod();
680 punctuation = (*it)->GetSuffix();
681 }
682
683 // if there was no meaningful note, we clear it
684 if( ! hasSubstantiveNote ) {
685 note.resize( original_length );
686 }
687 }
688
689
s_NoteFinalize(bool addPeriod,string & noteStr,CFlatFeature & flatFeature,ETildeStyle style=eTilde_newline)690 static void s_NoteFinalize(
691 bool addPeriod,
692 string& noteStr,
693 CFlatFeature& flatFeature,
694 ETildeStyle style = eTilde_newline ) {
695
696 if (!noteStr.empty()) {
697 if (addPeriod && !NStr::EndsWith(noteStr, ".")) {
698
699 AddPeriod(noteStr);
700 }
701 // Policy change: expand tilde on both descriptors and features
702 ExpandTildes(noteStr, style);
703 TrimSpacesAndJunkFromEnds( noteStr, true );
704
705 CRef<CFormatQual> note(new CFormatQual("note", noteStr));
706 flatFeature.SetQuals().push_back(note);
707 }
708 }
709
s_GetOverlap(const CMappedFeat & feat)710 static int s_GetOverlap(const CMappedFeat& feat )
711 {
712 if (feat) {
713 int total_length = 0;
714 ITERATE( CSeq_loc, loc_iter, feat.GetLocation() ) {
715 total_length += loc_iter.GetRange().GetLength();
716 }
717 return total_length;
718 }
719 return 0;
720 }
721
722
723 ///
724 /// The best protein feature is defined as the one that has the most overlap
725 /// with the given DNA.
726 /// If there is a tie between two protein features in overlap then the one
727 /// with the lesser processing status is declared the winner.
728 ///
s_GetBestProtFeature(const CBioseq_Handle & seq)729 static CMappedFeat s_GetBestProtFeature(const CBioseq_Handle& seq)
730 {
731 SAnnotSelector sel(CSeqFeatData::e_Prot);
732 sel.SetLimitTSE(seq.GetTSE_Handle());
733
734 CMappedFeat best;
735 CProt_ref::TProcessed best_processed = CProt_ref::eProcessed_transit_peptide;
736 int best_overlap = 0;
737
738 for (CFeat_CI it(seq, sel); it; ++it) {
739
740 if ( !best ) {
741
742 best = *it;
743 best_processed = it->GetData().GetProt().GetProcessed();
744 best_overlap = s_GetOverlap(best);
745
746 } else {
747
748 int current_overlap = s_GetOverlap(*it);
749 CProt_ref::TProcessed current_processed = it->GetData().GetProt().GetProcessed();
750
751 if ( best_overlap < current_overlap ) {
752
753 best_overlap = current_overlap;
754 best_processed = current_processed;
755 best = *it;
756
757 } else if ( (best_overlap == current_overlap) && (best_processed > current_processed) ) {
758
759 best_processed = current_processed;
760 best = *it;
761 }
762 }
763 }
764 return best;
765 }
766
767 // -- FeatureHeader
768
CFeatHeaderItem(CBioseqContext & ctx)769 CFeatHeaderItem::CFeatHeaderItem(CBioseqContext& ctx) : CFlatItem(&ctx)
770 {
771 x_GatherInfo(ctx);
772 }
773
GetItemType(void) const774 IFlatItem::EItem CFeatHeaderItem::GetItemType(void) const
775 {
776 return eItem_FeatHeader;
777 }
778
x_GatherInfo(CBioseqContext & ctx)779 void CFeatHeaderItem::x_GatherInfo(CBioseqContext& ctx)
780 {
781 if ( ctx.Config().IsFormatFTable() ) {
782 m_Id.Reset(ctx.GetPrimaryId());
783 }
784 }
785
s_CheckFuzz(const CInt_fuzz & fuzz)786 static bool s_CheckFuzz(const CInt_fuzz& fuzz)
787 {
788 return !(fuzz.IsLim() && fuzz.GetLim() == CInt_fuzz::eLim_unk);
789 }
790
s_LocIsFuzz(const CMappedFeat & feat,const CSeq_loc & loc)791 static bool s_LocIsFuzz(const CMappedFeat& feat, const CSeq_loc& loc)
792 {
793 if ( feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_imp &&
794 feat.GetData().IsImp() ) { // unmappable impfeats
795 const CSeqFeatData::TImp& imp = feat.GetData().GetImp();
796 if ( imp.IsSetLoc() ) {
797 const string& imploc = imp.GetLoc();
798 if ( imploc.find('<') != NPOS || imploc.find('>') != NPOS ) {
799 return true;
800 }
801 }
802 } else { // any regular feature test location for fuzz
803 for ( CSeq_loc_CI it(loc, CSeq_loc_CI::eEmpty_Allow); it; ++it ) {
804 const CSeq_loc& l = it.GetEmbeddingSeq_loc();
805 switch ( l.Which() ) {
806 case CSeq_loc::e_Pnt:
807 {{
808 if ( l.GetPnt().IsSetFuzz() ) {
809 if ( s_CheckFuzz(l.GetPnt().GetFuzz()) ) {
810 return true;
811 }
812 }
813 break;
814 }}
815 case CSeq_loc::e_Packed_pnt:
816 {{
817 if ( l.GetPacked_pnt().IsSetFuzz() ) {
818 if ( s_CheckFuzz(l.GetPacked_pnt().GetFuzz()) ) {
819 return true;
820 }
821 }
822 break;
823 }}
824 case CSeq_loc::e_Int:
825 {{
826 bool fuzz = false;
827 if ( l.GetInt().IsSetFuzz_from() ) {
828 fuzz = s_CheckFuzz(l.GetInt().GetFuzz_from());
829 }
830 if ( !fuzz && l.GetInt().IsSetFuzz_to() ) {
831 fuzz = s_CheckFuzz(l.GetInt().GetFuzz_to());
832 }
833 if ( fuzz ) {
834 return true;
835 }
836 break;
837 }}
838 case CSeq_loc::e_Packed_int:
839 {{
840 if ( l.GetPacked_int().IsPartialStart(eExtreme_Biological)
841 || l.GetPacked_int().IsPartialStop(eExtreme_Biological) ) {
842 return true;
843 }
844 break;
845 }}
846 case CSeq_loc::e_Null:
847 {{
848 return true;
849 }}
850 default:
851 break;
852 }
853 }
854 }
855
856 return false;
857 }
858
// Appends "<name><str>" to output, preceded by ", " when output already has
// content.  Nothing is appended when str is empty.
//
// output  in/out: the text being accumulated
// name    label placed directly in front of the value
// str     the value; an empty value is skipped altogether
static void s_AddPcrPrimersQualsAppend( string &output, const string &name, const string &str )
{
    if ( str.empty() ) {
        return;
    }

    if ( !output.empty() ) {
        output += ", ";
    }
    output += name + str;
}
868
869 // This splits a string that's comma-separated with parens at start and end
870 // (or, string might just contain a single string, so no splitting is needed,
871 // in which case the output_vec will be of size 1)
s_SplitCommaSeparatedStringInParens(vector<string> & output_vec,const string & string_to_split)872 static void s_SplitCommaSeparatedStringInParens( vector<string> &output_vec, const string &string_to_split )
873 {
874 // nothing to do since no input
875 if( string_to_split.empty() ) {
876 return;
877 }
878
879 // no splitting required
880 if( string_to_split[0] != '(' ) {
881 output_vec.push_back( string_to_split );
882 return;
883 }
884
885 // if ends with closing paren, chop that off.
886 // ( It's actually a data error if we DON'T end with a ')', but we continue anyway, since
887 // we want to do the best we can with the data we get. )
888 size_t amount_to_chop_off_end = 0;
889 if( string_to_split[string_to_split.length() - 1] == ')' ) {
890 amount_to_chop_off_end = 1;
891 }
892
893 NStr::Split( string_to_split.substr( 1, string_to_split.length() - amount_to_chop_off_end - 1), ",", output_vec, 0 );
894 }
895
896 static const char* const sc_ValidPseudoGene[] = {
897 "allelic",
898 "processed",
899 "unitary",
900 "unknown",
901 "unprocessed"
902 };
903 typedef CStaticArraySet<const char*, PNocase> TLegalPseudoGeneText;
904 DEFINE_STATIC_ARRAY_MAP(TLegalPseudoGeneText, sc_ValidPseudoGeneText, sc_ValidPseudoGene );
905
s_IsValidPseudoGene(objects::CFlatFileConfig::TMode mode,const string & text)906 static bool s_IsValidPseudoGene( objects::CFlatFileConfig::TMode mode, const string& text)
907 {
908 switch(mode)
909 {
910 case objects::CFlatFileConfig::eMode_Release:
911 case objects::CFlatFileConfig::eMode_Entrez:
912 return sc_ValidPseudoGeneText.find(text.c_str()) != sc_ValidPseudoGeneText.end();
913 default:
914 return ! text.empty();
915 }
916 }
917
918 static const char* const sc_ValidExceptionText[] = {
919 "annotated by transcript or proteomic data",
920 "rearrangement required for product",
921 "reasons given in citation",
922 "RNA editing"
923 };
924 typedef CStaticArraySet<const char*, PNocase_CStr> TLegalExceptText;
925 DEFINE_STATIC_ARRAY_MAP(TLegalExceptText, sc_LegalExceptText, sc_ValidExceptionText);
926
s_IsValidExceptionText(const string & text)927 static bool s_IsValidExceptionText(const string& text)
928 {
929 return sc_LegalExceptText.find(text.c_str()) != sc_LegalExceptText.end();
930 }
931
932
933 static const char* const sc_ValidRefSeqExceptionText[] = {
934 "adjusted for low-quality genome",
935 "alternative processing",
936 "alternative start codon",
937 "artificial frameshift",
938 "dicistronic gene",
939 "mismatches in transcription",
940 "mismatches in translation",
941 "modified codon recognition",
942 "nonconsensus splice site",
943 "transcribed product replaced",
944 "transcribed pseudogene",
945 "translated product replaced",
946 "unclassified transcription discrepancy",
947 "unclassified translation discrepancy",
948 "unextendable partial coding region"
949 };
950 typedef CStaticArraySet<const char*, PNocase> TLegalRefSeqExceptText;
951 DEFINE_STATIC_ARRAY_MAP(TLegalRefSeqExceptText, sc_LegalRefSeqExceptText, sc_ValidRefSeqExceptionText);
952
s_IsValidRefSeqExceptionText(const string & text)953 static bool s_IsValidRefSeqExceptionText(const string& text)
954 {
955 return sc_LegalRefSeqExceptText.find(text.c_str()) != sc_LegalRefSeqExceptText.end();
956 }
957
s_GetGbValue(CConstRef<CSeq_feat> feat,const string & key,string & value)958 bool s_GetGbValue( CConstRef<CSeq_feat> feat, const string& key, string& value )
959 {
960 if ( ! feat->IsSetQual() ) {
961 return false;
962 }
963 const CSeq_feat_Base::TQual & qual = feat->GetQual(); // must store reference since ITERATE macro evaluates 3rd arg multiple times
964 ITERATE( CSeq_feat::TQual, it, qual ) {
965 if (!(*it)->IsSetQual() || !(*it)->IsSetVal()) {
966 continue;
967 }
968 if ( (*it)->GetQual() != key ) {
969 continue;
970 }
971 value = (*it)->GetVal();
972 return true;
973 }
974 return false;
975 }
976
977
978 // -- FeatureItemBase
979
CFeatureItemBase(const CMappedFeat & feat,CBioseqContext & ctx,CRef<feature::CFeatTree> ftree,const CSeq_loc * loc,bool suppressAccession)980 CFeatureItemBase::CFeatureItemBase
981 (const CMappedFeat& feat,
982 CBioseqContext& ctx,
983 CRef<feature::CFeatTree> ftree,
984 const CSeq_loc* loc,
985 bool suppressAccession) :
986 CFlatItem(&ctx), m_Feat(feat), m_Feat_Tree(ftree), m_Loc(loc ? loc :
987 (feat ? &feat.GetLocation() : NULL)),
988 m_SuppressAccession(suppressAccession)
989 {
990 if (m_Feat) {
991 x_SetObject(m_Feat.GetOriginalFeature());
992
993 CSeq_feat_Handle feat = m_Feat.GetSeq_feat_Handle();
994 const CSeq_annot_Handle& ah = feat.GetAnnot();
995 CSeq_entry_Handle seh = ah.GetParentEntry();
996 if (! seh) {
997 x_SetExternal();
998 }
999 }
1000 }
1001
Format(void) const1002 CConstRef<CFlatFeature> CFeatureItemBase::Format(void) const
1003 {
1004 CRef<CFlatFeature> ff(new CFlatFeature(GetKey(),
1005 *new CFlatSeqLoc(GetLoc(), *GetContext(), CFlatSeqLoc::eType_location, false, false, this->IsSuppressAccession()),
1006 m_Feat));
1007 if ( ff ) {
1008 x_FormatQuals(*ff);
1009 }
1010 return ff;
1011 }
1012
1013
1014 // -- CFeatureItem
1015
GetKey(void) const1016 string CFeatureItem::GetKey(void) const
1017 {
1018 CBioseqContext& ctx = *GetContext();
1019
1020 CSeqFeatData::E_Choice type = m_Feat.GetData().Which();
1021 CSeqFeatData::ESubtype subtype = m_Feat.GetData().GetSubtype();
1022
1023 if (GetContext()->IsProt()) { // protein
1024 if ( IsMappedFromProt() && type == CSeqFeatData::e_Prot ) {
1025 if ( subtype == CSeqFeatData::eSubtype_preprotein ||
1026 subtype == CSeqFeatData::eSubtype_mat_peptide_aa ||
1027 subtype == CSeqFeatData::eSubtype_sig_peptide_aa ||
1028 subtype == CSeqFeatData::eSubtype_transit_peptide_aa ||
1029 subtype == CSeqFeatData::eSubtype_propeptide_aa ) {
1030 return "Precursor";
1031 }
1032 }
1033 switch ( subtype ) {
1034 case CSeqFeatData::eSubtype_region:
1035 return "Region";
1036 case CSeqFeatData::eSubtype_bond:
1037 return "Bond";
1038 case CSeqFeatData::eSubtype_site:
1039 return "Site";
1040 default:
1041 break;
1042 }
1043 } else { // nucleotide
1044 switch ( subtype ) {
1045
1046 case CSeqFeatData::eSubtype_ncRNA:
1047 return "ncRNA";
1048
1049 case CSeqFeatData::eSubtype_tmRNA:
1050 return "tmRNA";
1051
1052 case CSeqFeatData::eSubtype_preprotein:
1053 if ( !ctx.IsRefSeq() ) {
1054 return "misc_feature";
1055 }
1056 break;
1057
1058 case CSeqFeatData::eSubtype_site:
1059 case CSeqFeatData::eSubtype_bond:
1060 case CSeqFeatData::eSubtype_region:
1061 case CSeqFeatData::eSubtype_comment:
1062 return "misc_feature";
1063
1064 default:
1065 break;
1066 }
1067 }
1068
1069 // deal with unmappable impfeats
1070 if (subtype == CSeqFeatData::eSubtype_imp && type == CSeqFeatData::e_Imp) {
1071 const CSeqFeatData::TImp& imp = m_Feat.GetData().GetImp();
1072 if ( imp.IsSetKey() ) {
1073 return imp.GetKey();
1074 }
1075 }
1076
1077 if (type == CSeqFeatData::e_Imp) {
1078 switch ( subtype ) {
1079 case CSeqFeatData::eSubtype_enhancer:
1080 case CSeqFeatData::eSubtype_promoter:
1081 case CSeqFeatData::eSubtype_CAAT_signal:
1082 case CSeqFeatData::eSubtype_TATA_signal:
1083 case CSeqFeatData::eSubtype_35_signal:
1084 case CSeqFeatData::eSubtype_10_signal:
1085 case CSeqFeatData::eSubtype_GC_signal:
1086 case CSeqFeatData::eSubtype_RBS:
1087 case CSeqFeatData::eSubtype_polyA_signal:
1088 case CSeqFeatData::eSubtype_attenuator:
1089 case CSeqFeatData::eSubtype_terminator:
1090 case CSeqFeatData::eSubtype_misc_signal:
1091 return "regulatory";
1092 default:
1093 break;
1094 }
1095 }
1096
1097 return CFeatureItemBase::GetKey();
1098 }
1099
1100
1101 // constructor from CSeq_feat
CFeatureItem(const CMappedFeat & feat,CBioseqContext & ctx,CRef<feature::CFeatTree> ftree,const CSeq_loc * loc,EMapped mapped,bool suppressAccession,CConstRef<CFeatureItem> parentFeatureItem)1102 CFeatureItem::CFeatureItem
1103 (const CMappedFeat& feat,
1104 CBioseqContext& ctx,
1105 CRef<feature::CFeatTree> ftree,
1106 const CSeq_loc* loc,
1107 EMapped mapped,
1108 bool suppressAccession,
1109 CConstRef<CFeatureItem> parentFeatureItem) :
1110 CFeatureItemBase(feat, ctx, ftree, loc, suppressAccession), m_Mapped(mapped)
1111 {
1112 x_GatherInfoWithParent(ctx, parentFeatureItem);
1113 }
1114
GetItemType(void) const1115 IFlatItem::EItem CFeatureItem::GetItemType(void) const
1116 {
1117 return eItem_Feature;
1118 }
1119
x_GatherInfoWithParent(CBioseqContext & ctx,CConstRef<CFeatureItem> parentFeatureItem)1120 void CFeatureItem::x_GatherInfoWithParent(CBioseqContext& ctx, CConstRef<CFeatureItem> parentFeatureItem )
1121 {
1122 if ( s_SkipFeature(GetFeat(), GetLoc(), ctx) ) {
1123 x_SetSkip();
1124 return;
1125 }
1126 m_Type = m_Feat.GetData().GetSubtype();
1127 x_AddQuals(ctx, parentFeatureItem );
1128 }
1129
1130 // ----------------------------------------------------------------------------
x_AddQualPartial(CBioseqContext & ctx)1131 void CFeatureItem::x_AddQualPartial(
1132 CBioseqContext& ctx )
1133 //
1134 // Note: /partial has been depricated since DEC-2001. Current policy is to
1135 // suppress /partial in entrez and release modes and let it stand in gbench and
1136 // dump modes
1137 // ----------------------------------------------------------------------------
1138 {
1139 if ( !ctx.Config().HideUnclassPartial() ) {
1140 if ( !IsMappedFromCDNA() || !ctx.IsProt() ) {
1141 if ( m_Feat.IsSetPartial() && m_Feat.GetPartial() ) {
1142 if ( eSeqlocPartial_Complete == sequence::SeqLocPartialCheck( GetLoc(), &ctx.GetScope() ) &&
1143 !s_LocIsFuzz( m_Feat, GetLoc() ) )
1144 {
1145 x_AddQual( eFQ_partial, new CFlatBoolQVal( true ) );
1146 }
1147 }
1148 }
1149 }
1150 }
1151
1152 // ----------------------------------------------------------------------------
x_AddQualOperon(CBioseqContext & ctx,CSeqFeatData::ESubtype subtype)1153 void CFeatureItem::x_AddQualOperon(
1154 CBioseqContext& ctx,
1155 CSeqFeatData::ESubtype subtype )
1156 // ----------------------------------------------------------------------------
1157 {
1158 if ( subtype == CSeqFeatData::eSubtype_operon ||
1159 subtype == CSeqFeatData::eSubtype_gap ) {
1160 return;
1161 }
1162
1163 // bail if this type of object is not allowed to carry an operon
1164 if( ! x_IsSeqFeatDataFeatureLegal( CSeqFeatData::eQual_operon ) ) {
1165 return;
1166 }
1167
1168 const CGene_ref* gene_ref = m_Feat.GetGeneXref();
1169 if ( gene_ref == NULL || !gene_ref->IsSuppressed()) {
1170 const CSeq_loc& operon_loc = ( ctx.IsProt() || !IsMapped() ) ?
1171 m_Feat.GetLocation() : GetLoc();
1172 CConstRef<CSeq_feat> operon
1173 = GetOverlappingOperon( operon_loc, ctx.GetScope() );
1174 if ( operon ) {
1175 const string& operon_name = operon->GetNamedQual( "operon" );
1176 if ( !operon_name.empty() ) {
1177 x_AddQual(eFQ_operon, new CFlatStringQVal(operon_name));
1178 }
1179 }
1180 }
1181 }
1182
1183 // ----------------------------------------------------------------------------
x_AddQualsRegulatoryClass(CBioseqContext & ctx,CSeqFeatData::ESubtype subtype)1184 void CFeatureItem::x_AddQualsRegulatoryClass(
1185 CBioseqContext& ctx,
1186 CSeqFeatData::ESubtype subtype )
1187 // ----------------------------------------------------------------------------
1188 {
1189 _ASSERT( m_Feat.GetData().IsImp() );
1190
1191 switch ( subtype ) {
1192 case CSeqFeatData::eSubtype_enhancer:
1193 x_AddQual(eFQ_regulatory_class, new CFlatStringQVal("enhancer"));
1194 break;
1195 case CSeqFeatData::eSubtype_promoter:
1196 x_AddQual(eFQ_regulatory_class, new CFlatStringQVal("promoter"));
1197 break;
1198 case CSeqFeatData::eSubtype_CAAT_signal:
1199 x_AddQual(eFQ_regulatory_class, new CFlatStringQVal("CAAT_signal"));
1200 break;
1201 case CSeqFeatData::eSubtype_TATA_signal:
1202 x_AddQual(eFQ_regulatory_class, new CFlatStringQVal("TATA_box"));
1203 break;
1204 case CSeqFeatData::eSubtype_35_signal:
1205 x_AddQual(eFQ_regulatory_class, new CFlatStringQVal("minus_35_signal"));
1206 break;
1207 case CSeqFeatData::eSubtype_10_signal:
1208 x_AddQual(eFQ_regulatory_class, new CFlatStringQVal("minus_10_signal"));
1209 break;
1210 case CSeqFeatData::eSubtype_GC_signal:
1211 x_AddQual(eFQ_regulatory_class, new CFlatStringQVal("GC_signal"));
1212 break;
1213 case CSeqFeatData::eSubtype_RBS:
1214 x_AddQual(eFQ_regulatory_class, new CFlatStringQVal("ribosome_binding_site"));
1215 break;
1216 case CSeqFeatData::eSubtype_polyA_signal:
1217 x_AddQual(eFQ_regulatory_class, new CFlatStringQVal("polyA_signal_sequence"));
1218 break;
1219 case CSeqFeatData::eSubtype_attenuator:
1220 x_AddQual(eFQ_regulatory_class, new CFlatStringQVal("attenuator"));
1221 break;
1222 case CSeqFeatData::eSubtype_terminator:
1223 x_AddQual(eFQ_regulatory_class, new CFlatStringQVal("terminator"));
1224 break;
1225 case CSeqFeatData::eSubtype_misc_signal:
1226 x_AddQual(eFQ_regulatory_class, new CFlatStringQVal("other"));
1227 break;
1228 default:
1229 break;
1230 }
1231 }
1232
1233 // ----------------------------------------------------------------------------
x_AddQualPseudo(CBioseqContext & ctx,CSeqFeatData::E_Choice type,CSeqFeatData::ESubtype subtype,bool pseudo)1234 void CFeatureItem::x_AddQualPseudo(
1235 CBioseqContext& ctx,
1236 CSeqFeatData::E_Choice type,
1237 CSeqFeatData::ESubtype subtype,
1238 bool pseudo )
1239 // ----------------------------------------------------------------------------
1240 {
1241 if ( !pseudo ||
1242 subtype == CSeqFeatData::eSubtype_mobile_element ||
1243 subtype == CSeqFeatData::eSubtype_centromere ||
1244 subtype == CSeqFeatData::eSubtype_telomere )
1245 {
1246 return;
1247 }
1248
1249 if (ctx.Config().DropIllegalQuals() &&
1250 ( type == CSeqFeatData::e_Rna || type == CSeqFeatData::e_Imp ) )
1251 {
1252 switch (subtype) {
1253 case CSeqFeatData::eSubtype_allele:
1254 case CSeqFeatData::eSubtype_conflict:
1255 case CSeqFeatData::eSubtype_D_loop:
1256 case CSeqFeatData::eSubtype_iDNA:
1257 case CSeqFeatData::eSubtype_LTR:
1258 case CSeqFeatData::eSubtype_misc_binding:
1259 case CSeqFeatData::eSubtype_misc_difference:
1260 case CSeqFeatData::eSubtype_misc_recomb:
1261 case CSeqFeatData::eSubtype_misc_RNA:
1262 case CSeqFeatData::eSubtype_misc_structure:
1263 case CSeqFeatData::eSubtype_modified_base:
1264 case CSeqFeatData::eSubtype_mutation:
1265 case CSeqFeatData::eSubtype_old_sequence:
1266 case CSeqFeatData::eSubtype_polyA_site:
1267 case CSeqFeatData::eSubtype_precursor_RNA:
1268 case CSeqFeatData::eSubtype_prim_transcript:
1269 case CSeqFeatData::eSubtype_primer_bind:
1270 case CSeqFeatData::eSubtype_protein_bind:
1271 case CSeqFeatData::eSubtype_repeat_region:
1272 case CSeqFeatData::eSubtype_repeat_unit:
1273 case CSeqFeatData::eSubtype_rep_origin:
1274 case CSeqFeatData::eSubtype_satellite:
1275 case CSeqFeatData::eSubtype_stem_loop:
1276 case CSeqFeatData::eSubtype_STS:
1277 case CSeqFeatData::eSubtype_unsure:
1278 case CSeqFeatData::eSubtype_variation:
1279 case CSeqFeatData::eSubtype_3clip:
1280 case CSeqFeatData::eSubtype_3UTR:
1281 case CSeqFeatData::eSubtype_5clip:
1282 case CSeqFeatData::eSubtype_5UTR:
1283 return;
1284 default:
1285 break;
1286 }
1287 }
1288 x_AddQual( eFQ_pseudo, new CFlatBoolQVal( true ) );
1289 }
1290
1291 // ----------------------------------------------------------------------------
x_AddQualSeqfeatNote(CBioseqContext & ctx)1292 void CFeatureItem::x_AddQualSeqfeatNote(CBioseqContext &ctx)
1293 // ----------------------------------------------------------------------------
1294 {
1295 string precursor_comment;
1296 // set precursor_comment, if needed.
1297 // It's set from the feature's product's best protein's comment
1298 if( GetContext()->IsProt() && IsMappedFromProt() && m_Feat.IsSetProduct() ) {
1299 const CSeq_id* prod_id = m_Feat.GetProduct().GetId();
1300 if( prod_id != NULL ) {
1301 CBioseq_Handle prod_bioseq = GetContext()->GetScope().GetBioseqHandle(*prod_id);
1302 if( prod_bioseq ) {
1303 CMappedFeat best_prot_feat = s_GetBestProtFeature( prod_bioseq );
1304 if( best_prot_feat && best_prot_feat.IsSetComment() ) {
1305 precursor_comment = best_prot_feat.GetComment() ;
1306 }
1307 }
1308 }
1309 }
1310
1311 if (m_Feat.IsSetComment()) {
1312 string comment = m_Feat.GetComment();
1313
1314 TrimSpacesAndJunkFromEnds( comment, true );
1315 if ( ! comment.empty() && comment != "~" && comment != precursor_comment) {
1316 bool bAddPeriod = RemovePeriodFromEnd( comment, true );
1317 ConvertQuotes(comment);
1318 CRef<CFlatStringQVal> seqfeat_note( new CFlatStringQVal( comment ) );
1319 // if ( bAddPeriod && ! x_GetStringQual(eFQ_prot_desc ) ) {
1320 // careful! Period must be removed if we have a valid eFQ_prot_desc
1321 // Examples to test some cases: AB001488, M96268
1322 if ( bAddPeriod ) {
1323 seqfeat_note->SetAddPeriod();
1324 }
1325 x_AddQual( eFQ_seqfeat_note, seqfeat_note );
1326 }
1327 }
1328
1329 /// also scan the annot to see if there is a comment there, if required
1330 if( ! ctx.ShowAnnotCommentAsCOMMENT() ) {
1331 if (m_Feat.GetAnnot().Seq_annot_IsSetDesc()) {
1332 ITERATE (CSeq_annot::TDesc::Tdata, it,
1333 m_Feat.GetAnnot().Seq_annot_GetDesc().Get()) {
1334 if ((*it)->IsComment()) {
1335 const string & comment = (*it)->GetComment();
1336 // certain comments require special handling
1337 const static string ktRNAscanSE = "tRNA features were annotated by tRNAscan-SE";
1338 if( NStr::StartsWith(comment, ktRNAscanSE, NStr::eNocase) /* && ! x_HasMethodtRNAscanSE() */ )
1339 {
1340 if ( m_Feat.GetData().GetSubtype() != CSeqFeatData::eSubtype_tRNA ) {
1341 // don't propagate tRNAscan-SE comments to irrelevant features
1342 continue;
1343 }
1344 }
1345 string comm = comment;
1346 TrimSpacesAndJunkFromEnds( comm, false );
1347 RemovePeriodFromEnd( comm, true );
1348 x_AddQual(eFQ_seqfeat_note,
1349 new CFlatStringQVal(comm));
1350 }
1351 }
1352 }
1353 }
1354
1355 }
1356
1357 // ----------------------------------------------------------------------------
x_AddQualExpInv(CBioseqContext & ctx)1358 void CFeatureItem::x_AddQualExpInv(
1359 CBioseqContext& ctx )
1360 // ----------------------------------------------------------------------------
1361 {
1362 if ( ! m_Feat.IsSetExp_ev() ) {
1363 return;
1364 }
1365
1366 string value;
1367 if ( m_Feat.GetExp_ev() == CSeq_feat::eExp_ev_experimental ) {
1368 if ( ! x_GetGbValue( "experiment", value ) && ! x_GetGbValue( "inference", value ) ) {
1369 x_AddQual( eFQ_experiment, new CFlatExperimentQVal() );
1370 }
1371 }
1372 else if ( ! x_GetGbValue( "inference", value ) ) {
1373 x_AddQual(eFQ_inference, new CFlatInferenceQVal( "" ));
1374 }
1375 }
1376
1377 static
s_TransSplicingFeatureAllowed(const CSeqFeatData & data)1378 bool s_TransSplicingFeatureAllowed(
1379 const CSeqFeatData& data )
1380 {
1381 switch( data.GetSubtype() ) {
1382 case CSeqFeatData::eSubtype_gene:
1383 case CSeqFeatData::eSubtype_cdregion:
1384 case CSeqFeatData::eSubtype_mRNA:
1385 case CSeqFeatData::eSubtype_tRNA:
1386 case CSeqFeatData::eSubtype_preRNA:
1387 case CSeqFeatData::eSubtype_otherRNA:
1388 case CSeqFeatData::eSubtype_exon:
1389 case CSeqFeatData::eSubtype_intron:
1390 case CSeqFeatData::eSubtype_3clip:
1391 case CSeqFeatData::eSubtype_3UTR:
1392 case CSeqFeatData::eSubtype_5clip:
1393 case CSeqFeatData::eSubtype_5UTR:
1394 return true;
1395 default:
1396 return false;
1397 }
1398 }
1399
1400 // ----------------------------------------------------------------------------
x_AddQualExceptions(CBioseqContext & ctx)1401 void CFeatureItem::x_AddQualExceptions(
1402 CBioseqContext& ctx )
1403 //
1404 // Add any existing exception qualifiers.
1405 // Note: These include /ribosomal_slippage and /trans-splicing as special
1406 // cases. Also, some exceptions are listed as notes.
1407 // ----------------------------------------------------------------------------
1408 {
1409 const CSeqFeatData& data = m_Feat.GetData();
1410
1411 string raw_exception;
1412
1413 if ( ( m_Feat.IsSetExcept() && m_Feat.GetExcept() ) &&
1414 (m_Feat.IsSetExcept_text() && !m_Feat.GetExcept_text().empty()) ) {
1415 raw_exception = m_Feat.GetExcept_text();
1416 }
1417 if ( raw_exception == "" ) {
1418 return;
1419 }
1420
1421 const bool bIsRefseq = ctx.IsRefSeq();
1422 // const bool bIsRelaxed = ( ! cfg.DropIllegalQuals() );
1423 const bool bIsRelaxed = ((! ctx.Config().IsModeRelease()) && (! ctx.Config().IsModeEntrez()));
1424
1425 list<string> exceptions;
1426 NStr::Split( raw_exception, ",", exceptions, NStr::fSplit_Tokenize );
1427
1428 list<string> output_exceptions;
1429 list<string> output_notes;
1430 ITERATE( list<string>, it, exceptions ) {
1431 string cur = NStr::TruncateSpaces( *it );
1432 if( cur.empty() ) {
1433 continue;
1434 }
1435
1436 //
1437 // If exceptions are legal then it depends on the exception. Some are
1438 // turned into their own custom qualifiers. Others are allowed to stand
1439 // as exceptions, while others are turned into notes.
1440 //
1441 if ( s_IsValidExceptionText( cur ) ) {
1442 if( bIsRefseq || bIsRelaxed || data.IsCdregion() ) {
1443 output_exceptions.push_back( cur );
1444 } else {
1445 output_notes.push_back( cur );
1446 }
1447 continue;
1448 }
1449 if ( s_IsValidRefSeqExceptionText( cur ) ) {
1450 if( bIsRefseq || bIsRelaxed ) {
1451 output_exceptions.push_back( cur );
1452 } else {
1453 output_notes.push_back( cur );
1454 }
1455 continue;
1456 }
1457 if ( NStr::EqualNocase(cur, "ribosomal slippage") ) {
1458 if( data.IsCdregion() ) {
1459 x_AddQual( eFQ_ribosomal_slippage, new CFlatBoolQVal( true ) );
1460 } else {
1461 output_notes.push_back( cur );
1462 }
1463 continue;
1464 }
1465 if ( NStr::EqualNocase(cur, "trans-splicing") ) {
1466 if( s_TransSplicingFeatureAllowed( data ) ) {
1467 x_AddQual( eFQ_trans_splicing, new CFlatBoolQVal( true ) );
1468 } else {
1469 output_notes.push_back( cur );
1470 }
1471 continue;
1472 }
1473 if ( NStr::EqualNocase(cur, "circular RNA") ) {
1474 if( data.IsRna() ) {
1475 x_AddQual( eFQ_circular_RNA, new CFlatBoolQVal( true ) );
1476 } else {
1477 output_notes.push_back( cur );
1478 }
1479 continue;
1480 }
1481 const bool is_cds_or_mrna = ( data.IsCdregion() ||
1482 data.GetSubtype() == CSeqFeatData::eSubtype_mRNA );
1483 if( NStr::EqualNocase(cur, "artificial location") ) {
1484 if( is_cds_or_mrna ) {
1485 x_AddQual( eFQ_artificial_location, new CFlatBoolQVal( true ) );
1486 } else {
1487 output_notes.push_back( cur );
1488 }
1489 continue;
1490 }
1491 if( NStr::EqualNocase(cur, "heterogeneous population sequenced") ||
1492 NStr::EqualNocase(cur, "low-quality sequence region") )
1493 {
1494 if( is_cds_or_mrna ) {
1495 x_AddQual( eFQ_artificial_location, new CFlatStringQVal( cur ) );
1496 } else {
1497 output_notes.push_back( cur );
1498 }
1499 continue;
1500 }
1501 else {
1502 if ( bIsRelaxed ) {
1503 output_exceptions.push_back( cur );
1504 }
1505 else {
1506 output_notes.push_back( cur );
1507 }
1508 }
1509 }
1510 if ( ! output_exceptions.empty() ) {
1511 string exception = NStr::Join( output_exceptions, ", " );
1512 x_AddQual(eFQ_exception, new CFlatStringQVal( exception ) );
1513 }
1514 if ( ! output_notes.empty() ) {
1515 string note = NStr::Join( output_notes, ", " );
1516 x_AddQual(eFQ_exception_note, new CFlatStringQVal( note ) );
1517 }
1518 }
1519
1520 // ----------------------------------------------------------------------------
x_AddQualNote(CConstRef<CSeq_feat> gene_feat)1521 void CFeatureItem::x_AddQualNote(
1522 CConstRef<CSeq_feat> gene_feat )
1523 // ----------------------------------------------------------------------------
1524 {
1525 if ( ! gene_feat || ! gene_feat->IsSetComment() ) {
1526 return;
1527 }
1528 x_AddQual( eFQ_gene_note, new CFlatStringQVal(
1529 gene_feat->GetComment() ) );
1530 }
1531
1532 // ----------------------------------------------------------------------------
x_AddQualGeneXref(const CGene_ref * gene_ref,const CConstRef<CSeq_feat> & gene_feat)1533 void CFeatureItem::x_AddQualGeneXref(
1534 const CGene_ref* gene_ref,
1535 const CConstRef<CSeq_feat>& gene_feat )
1536 // ----------------------------------------------------------------------------
1537 {
1538 const CSeqFeatData& data = m_Feat.GetData();
1539 CSeqFeatData::E_Choice type = data.Which();
1540
1541 if ( type == CSeqFeatData::e_Cdregion || type == CSeqFeatData::e_Rna ) {
1542 if ( ! gene_ref && gene_feat ) {
1543 gene_ref = &gene_feat->GetData().GetGene();
1544 if ( gene_ref != NULL && gene_ref->IsSetDb() ) {
1545 x_AddQual(
1546 eFQ_gene_xref, new CFlatXrefQVal( gene_ref->GetDb() ) );
1547 } else if ( gene_feat->IsSetDbxref() ) {
1548 x_AddQual(
1549 eFQ_gene_xref, new CFlatXrefQVal( gene_feat->GetDbxref() ) );
1550 }
1551 }
1552 }
1553 }
1554
1555 // ----------------------------------------------------------------------------
x_AddQualOldLocusTag(const CBioseqContext & ctx,CConstRef<CSeq_feat> gene_feat)1556 void CFeatureItem::x_AddQualOldLocusTag(
1557 const CBioseqContext& ctx,
1558 CConstRef<CSeq_feat> gene_feat )
1559 //
1560 // For non-gene features, add /old_locus_tag, if one exists somewhere.
1561 // ----------------------------------------------------------------------------
1562 {
1563 if ( ! gene_feat ) {
1564 return;
1565 }
1566
1567 if ( ctx.IsProt() ) {
1568 // skip if GenPept format and not gene or CDS
1569 const CSeqFeatData& data = m_Feat.GetData();
1570 CSeqFeatData::ESubtype subtype = data.GetSubtype();
1571 if (subtype != CSeqFeatData::eSubtype_gene && subtype != CSeqFeatData::eSubtype_cdregion) {
1572 return;
1573 }
1574 }
1575
1576 const CSeq_feat::TQual& quals = gene_feat->GetQual();
1577 for ( size_t iPos = 0; iPos < quals.size(); ++iPos ) {
1578 CRef< CGb_qual > qual = quals[ iPos ];
1579 if ( ! qual->IsSetQual() || ! qual->IsSetVal() ) {
1580 continue;
1581 }
1582 if ( qual->GetQual() == "old_locus_tag" ) {
1583 x_AddQual(eFQ_old_locus_tag,
1584 new CFlatStringQVal( qual->GetVal(), CFormatQual::eTrim_WhitespaceOnly ) );
1585 }
1586 }
1587 }
1588
1589 // ----------------------------------------------------------------------------
x_GetPseudo(const CGene_ref * gene_ref,const CSeq_feat * gene_feat) const1590 bool CFeatureItem::x_GetPseudo(
1591 const CGene_ref* gene_ref,
1592 const CSeq_feat* gene_feat ) const
1593 // ----------------------------------------------------------------------------
1594 {
1595 const CSeqFeatData& data = m_Feat.GetData();
1596 CSeqFeatData::E_Choice type = data.Which();
1597 CSeqFeatData::ESubtype subtype = data.GetSubtype();
1598
1599 bool pseudo = m_Feat.IsSetPseudo() ? m_Feat.GetPseudo() : false;
1600 if ( type != CSeqFeatData::e_Gene &&
1601 subtype != CSeqFeatData::eSubtype_operon &&
1602 subtype != CSeqFeatData::eSubtype_gap )
1603 {
1604 if ( gene_feat && gene_feat->IsSetPseudo() && gene_feat->GetPseudo() ) {
1605 return true;
1606 const CGene_ref* altref = &gene_feat->GetData().GetGene();
1607 if ( altref && altref->IsSetPseudo() && altref->GetPseudo() ) {
1608 return true;
1609 }
1610 }
1611 if ( gene_ref && gene_ref->IsSetPseudo() && gene_ref->GetPseudo() ) {
1612 return true;
1613 }
1614 }
1615 if ( type == CSeqFeatData::e_Gene ) {
1616 if ( data.GetGene().IsSetPseudo() && data.GetGene().GetPseudo() ) {
1617 return true;
1618 }
1619 }
1620 if ( type == CSeqFeatData::e_Rna ) {
1621 if ( data.GetRna().IsSetPseudo() && data.GetRna().GetPseudo() ) {
1622 return true;
1623 }
1624 }
1625 return pseudo;
1626 }
1627
x_AddQualsIdx(CBioseqContext & ctx,CConstRef<CFeatureItem> parentFeatureItem)1628 void CFeatureItem::x_AddQualsIdx(
1629 CBioseqContext& ctx,
1630 CConstRef<CFeatureItem> parentFeatureItem )
1631 {
1632 CRef<CSeqEntryIndex> idx = ctx.GetSeqEntryIndex();
1633 if (! idx) return;
1634 CBioseq_Handle hdl = ctx.GetHandle();
1635 CRef<CBioseqIndex> bsx = idx->GetBioseqIndex (hdl);
1636 if (! bsx) return;
1637
1638 const CSeqFeatData& data = m_Feat.GetData();
1639 CSeqFeatData::E_Choice type = data.Which();
1640 CSeqFeatData::ESubtype subtype = data.GetSubtype();
1641
1642 bool is_not_genbank = false;
1643 {{
1644 ITERATE( CBioseq::TId, id_iter, ctx.GetBioseqIds() ) {
1645 const CSeq_id& id = **id_iter;
1646
1647 switch ( id.Which() ) {
1648 case CSeq_id_Base::e_Embl:
1649 case CSeq_id_Base::e_Ddbj:
1650 case CSeq_id_Base::e_Tpe:
1651 case CSeq_id_Base::e_Tpd:
1652 is_not_genbank = true;
1653 break;
1654 default:
1655 // do nothing
1656 break;
1657 }
1658 }
1659 }}
1660
1661 const CGene_ref* gene_ref = 0;
1662 CConstRef<CSeq_feat> gene_feat;
1663 const CGene_ref* feat_gene_xref = 0;
1664 feat_gene_xref = m_Feat.GetGeneXref();
1665 if (feat_gene_xref == 0 && parentFeatureItem) {
1666 feat_gene_xref = parentFeatureItem->GetFeat().GetGeneXref();
1667 }
1668 bool suppressed = false;
1669
1670 const bool gene_forbidden_if_genbank =
1671 ( subtype == CSeqFeatData::eSubtype_mobile_element ||
1672 subtype == CSeqFeatData::eSubtype_centromere ||
1673 subtype == CSeqFeatData::eSubtype_telomere );
1674
1675 if ( type == CSeqFeatData::e_Gene ) {
1676 } else if (subtype != CSeqFeatData::eSubtype_operon &&
1677 subtype != CSeqFeatData::eSubtype_gap &&
1678 (is_not_genbank || ! gene_forbidden_if_genbank)) {
1679 if (feat_gene_xref) {
1680 if (feat_gene_xref->IsSuppressed()) {
1681 suppressed = true;
1682 }
1683 }
1684
1685 if (feat_gene_xref && ! suppressed) {
1686 // RW-943
1687 // gene_ref = feat_gene_xref;
1688 CRef<CFeatureIndex> ft = bsx->GetFeatIndex (m_Feat);
1689 if (! ft) {
1690 if (parentFeatureItem) {
1691 // RW-985 fix for RW-943 dropping xrefs on sig_peptide and mat_peptide
1692 ft = bsx->GetFeatIndex (parentFeatureItem->GetFeat());
1693 } else {
1694 // SF-3276 BAM94483 coded_by CDS was not getting xref'd gene
1695 ft = bsx->GetFeatureForProduct();
1696 }
1697 }
1698 if (ft) {
1699 CRef<CFeatureIndex> fsx = ft->GetBestGene();
1700 if (fsx) {
1701 const CMappedFeat mf = fsx->GetMappedFeat();
1702 if (mf) {
1703 const CGene_ref* gr = 0;
1704 CConstRef<CSeq_feat> gf;
1705 gf = &(mf.GetMappedFeature());
1706 gr = &(mf.GetData().GetGene());
1707 if (gr) {
1708 if (feat_gene_xref->IsSetLocus_tag() && gr->IsSetLocus_tag()) {
1709 if (feat_gene_xref->GetLocus_tag() == gr->GetLocus_tag()) {
1710 gene_feat = &(mf.GetMappedFeature());
1711 gene_ref = &(mf.GetData().GetGene());
1712 } else {
1713 // RW-985
1714 gene_ref = feat_gene_xref;
1715 }
1716 } else if (feat_gene_xref->IsSetLocus() && gr->IsSetLocus()) {
1717 if (feat_gene_xref->GetLocus() == gr->GetLocus()) {
1718 gene_feat = &(mf.GetMappedFeature());
1719 gene_ref = &(mf.GetData().GetGene());
1720 } else {
1721 // RW-985
1722 gene_ref = feat_gene_xref;
1723 }
1724 } else {
1725 // SF-3822 - map locus in xref to desc in gene
1726 gene_ref = feat_gene_xref;
1727 }
1728 }
1729 }
1730 } else {
1731 // RW-943
1732 gene_ref = feat_gene_xref;
1733 }
1734 }
1735 } else if ((! feat_gene_xref || ! suppressed) &&
1736 subtype != CSeqFeatData::eSubtype_primer_bind) {
1737 CRef<CFeatureIndex> ft;
1738 bool is_mapped = false;
1739 if (parentFeatureItem) {
1740 ft = bsx->GetFeatIndex (parentFeatureItem->GetFeat());
1741 if (ft) {
1742 if (subtype == CSeqFeatData::eSubtype_preprotein ||
1743 subtype == CSeqFeatData::eSubtype_mat_peptide_aa ||
1744 subtype == CSeqFeatData::eSubtype_sig_peptide_aa ||
1745 subtype == CSeqFeatData::eSubtype_transit_peptide_aa ||
1746 subtype == CSeqFeatData::eSubtype_propeptide_aa) {
1747 try {
1748 if ( m_Feat.IsSetXref() ) {
1749 feat_gene_xref = m_Feat.GetGeneXref();
1750 if ( feat_gene_xref ) {
1751 gene_ref = feat_gene_xref;
1752 is_mapped = true;
1753 }
1754 }
1755 if (! is_mapped) {
1756 CRef<CFeatureIndex> fsx = ft->GetBestGene();
1757 if (fsx) {
1758 const CMappedFeat mf = fsx->GetMappedFeat();
1759 if (mf) {
1760 gene_feat = &(mf.GetMappedFeature());
1761 gene_ref = &(mf.GetData().GetGene());
1762 is_mapped = true;
1763 }
1764 }
1765 }
1766 if (! is_mapped) {
1767 // e.g., check sig_peptide for gene overlapping parent CDS
1768 CSeq_feat_Handle parent_feat_handle;
1769 parent_feat_handle = parentFeatureItem->GetFeat();
1770 CGeneFinder::GetAssociatedGeneInfo( m_Feat, ctx, m_Loc, m_GeneRef, gene_ref,
1771 gene_feat, parent_feat_handle );
1772 is_mapped = true;
1773 }
1774 } catch (CException&) {}
1775 }
1776 }
1777 } else {
1778 ft = bsx->GetFeatIndex (m_Feat);
1779 if (! ft) {
1780 ft = bsx->GetFeatureForProduct();
1781 }
1782 }
1783 if (ft && (! is_mapped)) {
1784 CRef<CFeatureIndex> fsx = ft->GetBestGene();
1785 if (fsx) {
1786 const CMappedFeat mf = fsx->GetMappedFeat();
1787 if (mf) {
1788 gene_feat = &(mf.GetMappedFeature());
1789 gene_ref = &(mf.GetData().GetGene());
1790 }
1791 } else if (feat_gene_xref) {
1792 // last resort, e.g., MH013512 after first nuc-prot set
1793 gene_ref = feat_gene_xref;
1794 }
1795 }
1796 }
1797 }
1798
1799 bool pseudo = x_GetPseudo(gene_ref, gene_feat );
1800
1801 //
1802 // Collect qualifiers that are specific to a single or just a few feature
1803 // types:
1804 //
1805 switch ( type ) {
1806 case CSeqFeatData::e_Cdregion:
1807 x_AddQualsCdregionIdx(m_Feat, ctx, pseudo);
1808 break;
1809 case CSeqFeatData::e_Rna:
1810 x_AddQualsRna(m_Feat, ctx, pseudo);
1811 break;
1812 case CSeqFeatData::e_Prot:
1813 x_AddQualsProt(ctx, pseudo);
1814 break;
1815 case CSeqFeatData::e_Region:
1816 x_AddQualsRegion( ctx );
1817 break;
1818 case CSeqFeatData::e_Site:
1819 x_AddQualsSite( ctx );
1820 break;
1821 case CSeqFeatData::e_Bond:
1822 x_AddQualsBond( ctx );
1823 break;
1824 case CSeqFeatData::e_Psec_str:
1825 x_AddQualsPsecStr( ctx );
1826 break;
1827 case CSeqFeatData::e_Non_std_residue:
1828 x_AddQualsNonStd( ctx );
1829 break;
1830 case CSeqFeatData::e_Het:
1831 x_AddQualsHet( ctx );
1832 break;
1833 case CSeqFeatData::e_Variation:
1834 x_AddQualsVariation( ctx );
1835 break;
1836 default:
1837 break;
1838 }
1839
1840 //
1841 // Collect qualifiers that are common to most feature types:
1842 //
1843 x_AddQualPartial( ctx );
1844 x_AddQualDbXref( ctx );
1845 x_AddQualExt();
1846 x_AddQualExpInv( ctx );
1847 x_AddQualCitation();
1848 x_AddQualExceptions( ctx );
1849 x_AddQualNote( gene_feat );
1850 x_AddQualOldLocusTag( ctx, gene_feat );
1851 x_AddQualDb( gene_ref );
1852 x_AddQualGeneXref( gene_ref, gene_feat );
1853 if (bsx->HasOperon()) {
1854 x_AddQualOperon( ctx, subtype );
1855 }
1856 x_AddQualsGene( ctx, gene_ref, gene_feat, gene_ref ? false : gene_feat.NotEmpty() );
1857
1858 x_AddQualPseudo( ctx, type, subtype, pseudo );
1859 x_AddQualsGb( ctx );
1860
1861 // dynamic mapping of old features to regulatory with regulatory_class qualifier
1862 if ( type == CSeqFeatData::e_Imp ) {
1863 x_AddQualsRegulatoryClass ( ctx, subtype );
1864 }
1865
1866 x_AddQualSeqfeatNote(ctx);
1867
1868 // cleanup (drop illegal quals, duplicate information etc.)
1869 x_CleanQuals( gene_ref );
1870
1871
1872 }
1873
1874 // ----------------------------------------------------------------------------
x_AddQuals(CBioseqContext & ctx,CConstRef<CFeatureItem> parentFeatureItem)1875 void CFeatureItem::x_AddQuals(
1876 CBioseqContext& ctx,
1877 CConstRef<CFeatureItem> parentFeatureItem )
1878 //
1879 // Add the various qualifiers to this feature. Top level function.
1880 // ----------------------------------------------------------------------------
1881 {
1882 // /**fl**/
1883 // leaving this here since it's so useful for debugging purposes.
1884 //21822,22172
1885 /* if(
1886 (GetLoc().GetStart(eExtreme_Biological) == 21821 &&
1887 GetLoc().GetStop(eExtreme_Biological) == 22171) ||
1888 (GetLoc().GetStop(eExtreme_Biological) == 21821 &&
1889 GetLoc().GetStart(eExtreme_Biological) == 22171)
1890 ) {
1891 cerr << ""; // a do-nothing statement in case we forget to comment it out
1892 } */
1893 // /**fl**/
1894
1895 if ( ctx.Config().IsFormatFTable() ) {
1896 x_AddFTableQuals( ctx );
1897 return;
1898 }
1899
1900 if ( ctx.UsingSeqEntryIndex() ) {
1901 x_AddQualsIdx(ctx, parentFeatureItem);
1902 return;
1903 }
1904
1905 // SQD-4444 : pass annot selector from the context structure
1906 m_Feat_Tree->AddGenesForFeat(m_Feat, ctx.GetAnnotSelector());
1907
1908 //
1909 // Collect/Compute data that will be shared between several qualifier
1910 // collectors:
1911 //
1912 const CSeqFeatData& data = m_Feat.GetData();
1913 CSeqFeatData::E_Choice type = data.Which();
1914 CSeqFeatData::ESubtype subtype = data.GetSubtype();
1915 // /**fl**/>>
1916 // if ( subtype == CSeqFeatData::eSubtype_sig_peptide_aa ||
1917 // subtype == CSeqFeatData::eSubtype_sig_peptide )
1918 // {
1919 // cerr << "Break" << endl;
1920 // }
1921 // <</**fl**/
1922
1923 // check if this is some kind of Genbank record (some of the logic may be a little different in that case)
1924 bool is_not_genbank = false;
1925 {{
1926 ITERATE( CBioseq::TId, id_iter, ctx.GetBioseqIds() ) {
1927 const CSeq_id& id = **id_iter;
1928
1929 switch ( id.Which() ) {
1930 case CSeq_id_Base::e_Embl:
1931 case CSeq_id_Base::e_Ddbj:
1932 case CSeq_id_Base::e_Tpe:
1933 case CSeq_id_Base::e_Tpd:
1934 is_not_genbank = true;
1935 break;
1936 default:
1937 // do nothing
1938 break;
1939 }
1940 }
1941 }}
1942
1943
1944 const CGene_ref* gene_ref = 0;
1945 CConstRef<CSeq_feat> gene_feat;
1946 const CGene_ref* feat_gene_xref = m_Feat.GetGeneXref();
1947 bool suppressed = false;
1948
1949 const bool gene_forbidden_if_genbank =
1950 ( subtype == CSeqFeatData::eSubtype_mobile_element ||
1951 subtype == CSeqFeatData::eSubtype_centromere ||
1952 subtype == CSeqFeatData::eSubtype_telomere );
1953
1954 if ( type == CSeqFeatData::e_Gene ) {
1955 } else if (subtype != CSeqFeatData::eSubtype_operon &&
1956 subtype != CSeqFeatData::eSubtype_gap &&
1957 (is_not_genbank || ! gene_forbidden_if_genbank)) {
1958 if (feat_gene_xref) {
1959 if (feat_gene_xref->IsSuppressed()) {
1960 suppressed = true;
1961 }
1962 }
1963 if (feat_gene_xref && ! suppressed &&
1964 ! CGeneFinder::ResolveGeneXref(feat_gene_xref, ctx.GetTopLevelEntry())) {
1965 gene_ref = feat_gene_xref;
1966 } else if ((! feat_gene_xref || ! suppressed) &&
1967 subtype != CSeqFeatData::eSubtype_primer_bind) {
1968
1969 bool is_mapped = false;
1970 try {
1971 CMappedFeat mapped_gene = ctx.GetFeatTree().GetBestGene(m_Feat);
1972 if (mapped_gene) {
1973 gene_feat = mapped_gene.GetOriginalSeq_feat();
1974 gene_ref = &gene_feat->GetData().GetGene();
1975 is_mapped = true;
1976 }
1977 } catch (CException&) {}
1978 if (! is_mapped) {
1979 try {
1980 CMappedFeat mapped_gene = m_Feat_Tree->GetBestGene(m_Feat);
1981 if (mapped_gene) {
1982 gene_feat = mapped_gene.GetOriginalSeq_feat();
1983 gene_ref = &gene_feat->GetData().GetGene();
1984 is_mapped = true;
1985 }
1986 } catch (CException&) {}
1987 }
1988 if (! is_mapped) {
1989 try {
1990 // e.g., check sig_peptide for gene overlapping parent CDS
1991 CSeq_feat_Handle parent_feat_handle;
1992 if( parentFeatureItem ) {
1993 parent_feat_handle = parentFeatureItem->GetFeat();
1994 CGeneFinder::GetAssociatedGeneInfo( m_Feat, ctx, m_Loc, m_GeneRef, gene_ref,
1995 gene_feat, parent_feat_handle );
1996 }
1997 } catch (CException&) {}
1998 }
1999 }
2000 }
2001
2002 bool pseudo = x_GetPseudo(gene_ref, gene_feat );
2003
2004 //
2005 // Collect qualifiers that are specific to a single or just a few feature
2006 // types:
2007 //
2008 switch ( type ) {
2009 case CSeqFeatData::e_Cdregion:
2010 x_AddQualsCdregion(m_Feat, ctx, pseudo);
2011 break;
2012 case CSeqFeatData::e_Rna:
2013 x_AddQualsRna(m_Feat, ctx, pseudo);
2014 break;
2015 case CSeqFeatData::e_Prot:
2016 x_AddQualsProt(ctx, pseudo);
2017 break;
2018 case CSeqFeatData::e_Region:
2019 x_AddQualsRegion( ctx );
2020 break;
2021 case CSeqFeatData::e_Site:
2022 x_AddQualsSite( ctx );
2023 break;
2024 case CSeqFeatData::e_Bond:
2025 x_AddQualsBond( ctx );
2026 break;
2027 case CSeqFeatData::e_Psec_str:
2028 x_AddQualsPsecStr( ctx );
2029 break;
2030 case CSeqFeatData::e_Non_std_residue:
2031 x_AddQualsNonStd( ctx );
2032 break;
2033 case CSeqFeatData::e_Het:
2034 x_AddQualsHet( ctx );
2035 break;
2036 case CSeqFeatData::e_Variation:
2037 x_AddQualsVariation( ctx );
2038 break;
2039 default:
2040 break;
2041 }
2042
2043 //
2044 // Collect qualifiers that are common to most feature types:
2045 //
2046 x_AddQualPartial( ctx );
2047 x_AddQualDbXref( ctx );
2048 x_AddQualExt();
2049 x_AddQualExpInv( ctx );
2050 x_AddQualCitation();
2051 x_AddQualExceptions( ctx );
2052 x_AddQualNote( gene_feat );
2053 x_AddQualOldLocusTag( ctx, gene_feat );
2054 x_AddQualDb( gene_ref );
2055 x_AddQualGeneXref( gene_ref, gene_feat );
2056 x_AddQualOperon( ctx, subtype );
2057 x_AddQualsGene( ctx, gene_ref, gene_feat, gene_ref ? false : gene_feat.NotEmpty() );
2058
2059 x_AddQualPseudo( ctx, type, subtype, pseudo );
2060 x_AddQualsGb( ctx );
2061
2062 // dynamic mapping of old features to regulatory with regulatory_class qualifier
2063 if ( type == CSeqFeatData::e_Imp ) {
2064 x_AddQualsRegulatoryClass ( ctx, subtype );
2065 }
2066
2067 x_AddQualSeqfeatNote(ctx);
2068
2069 // cleanup (drop illegal quals, duplicate information etc.)
2070 x_CleanQuals( gene_ref );
2071 }
2072
2073
2074 static const string s_TrnaList[] = {
2075 "tRNA-Gap",
2076 "tRNA-Ala",
2077 "tRNA-Asx",
2078 "tRNA-Cys",
2079 "tRNA-Asp",
2080 "tRNA-Glu",
2081 "tRNA-Phe",
2082 "tRNA-Gly",
2083 "tRNA-His",
2084 "tRNA-Ile",
2085 "tRNA-Xle",
2086 "tRNA-Lys",
2087 "tRNA-Leu",
2088 "tRNA-Met",
2089 "tRNA-Asn",
2090 "tRNA-Pyl",
2091 "tRNA-Pro",
2092 "tRNA-Gln",
2093 "tRNA-Arg",
2094 "tRNA-Ser",
2095 "tRNA-Thr",
2096 "tRNA-Sec",
2097 "tRNA-Val",
2098 "tRNA-Trp",
2099 "tRNA-OTHER",
2100 "tRNA-Tyr",
2101 "tRNA-Glx",
2102 "tRNA-TERM"
2103 };
2104
2105
s_AaName(int aa)2106 static const string& s_AaName(int aa)
2107 {
2108 int idx = 255;
2109
2110 if (aa != '*') {
2111 idx = aa - 64;
2112 } else {
2113 idx = 27;
2114 }
2115 if ( idx > 0 && idx < ArraySize(s_TrnaList) ) {
2116 return s_TrnaList [idx];
2117 }
2118 return kEmptyStr;
2119 }
2120
2121
s_ToIupacaa(int aa)2122 static int s_ToIupacaa(int aa)
2123 {
2124 vector<char> n(1, static_cast<char>(aa));
2125 vector<char> i;
2126 CSeqConvert::Convert(n, CSeqUtil::e_Ncbieaa, 0, 1, i, CSeqUtil::e_Iupacaa);
2127 return i.front();
2128 }
2129
2130 // ----------------------------------------------------------------------------
x_AddQualsRna(const CMappedFeat & feat,CBioseqContext & ctx,bool pseudo)2131 void CFeatureItem::x_AddQualsRna(
2132 const CMappedFeat& feat,
2133 CBioseqContext& ctx,
2134 bool pseudo )
2135 // ----------------------------------------------------------------------------
2136 {
2137
2138 CSeqFeatData::ESubtype subtype = m_Feat.GetData().GetSubtype();
2139 const CRNA_ref& rna = feat.GetData().GetRna();
2140 const CFlatFileConfig& cfg = ctx.Config();
2141 CScope& scope = ctx.GetScope();
2142
2143 ///
2144 /// always output transcript_id
2145 ///
2146 {{
2147 EFeatureQualifier slot =
2148 (ctx.IsRefSeq() || cfg.IsModeDump() || cfg.IsModeGBench()) ?
2149 eFQ_transcript_id : eFQ_transcript_id_note;
2150 try {
2151 if (feat.IsSetProduct()) {
2152 CConstRef<CSeq_id> sip(feat.GetProduct().GetId());
2153 if (sip) {
2154 CBioseq_Handle prod =
2155 scope.GetBioseqHandleFromTSE(*sip, ctx.GetHandle());
2156 if ( prod ) {
2157 x_AddProductIdQuals(prod, slot);
2158 } else {
2159 string acc;
2160 sip->GetLabel(&acc, CSeq_id::eBoth);
2161 CSeq_id_Handle idh = CSeq_id_Handle::GetHandle(*sip);
2162 CSeq_id_Handle besth = sequence::GetId(idh, scope, sequence::eGetId_Best);
2163 if (besth) {
2164 acc.clear();
2165 besth.GetSeqId()->GetLabel(&acc, CSeq_id::eContent);
2166 }
2167 if( acc.empty() && ! cfg.DropIllegalQuals() ) {
2168 //sure of that? doesn't look right---
2169 x_AddQual(slot, new CFlatStringQVal(
2170 NStr::NumericToString(sip->GetGi()) ) );
2171 }
2172 if (!acc.empty()) {
2173 if ( !cfg.DropIllegalQuals() || IsValidAccession(acc)) {
2174 CRef<CSeq_id> acc_id(new CSeq_id(acc));
2175 x_AddQual(slot, new CFlatSeqIdQVal(*acc_id));
2176 }
2177 /*
2178 if (! (cfg.HideGI() || cfg.IsPolicyFtp())) {
2179 x_AddQual(eFQ_db_xref, new CFlatSeqIdQVal(*sip, true));
2180 }
2181 */
2182 }
2183 }
2184 }
2185 }
2186 }
2187 catch (CObjmgrUtilException&) {
2188 }
2189 }}
2190
2191 CRNA_ref::TType rna_type = rna.IsSetType() ?
2192 rna.GetType() : CRNA_ref::eType_unknown;
2193 switch ( rna_type ) {
2194 case CRNA_ref::eType_tRNA:
2195 {
2196 if ( !pseudo && ( cfg.ShowTranscript() || cfg.IsFormatGBSeq() || cfg.IsFormatINSDSeq() ) ) {
2197 CSeqVector vec(feat.GetLocation(), scope);
2198 vec.SetCoding(CBioseq_Handle::eCoding_Iupac);
2199 string transcription;
2200 vec.GetSeqData(0, vec.size(), transcription);
2201 x_AddQual(eFQ_transcription, new CFlatStringQVal(transcription));
2202 }
2203 if (rna.IsSetExt()) {
2204 const CRNA_ref::C_Ext& ext = rna.GetExt();
2205 switch (ext.Which()) {
2206 case CRNA_ref::C_Ext::e_Name:
2207 {
2208 // amino acid could not be parsed into structured form
2209 if (!cfg.DropIllegalQuals()) {
2210 x_AddQual(eFQ_product,
2211 new CFlatStringQVal(ext.GetName()));
2212 } else {
2213 x_AddQual(eFQ_product,
2214 new CFlatStringQVal("tRNA-OTHER"));
2215 }
2216 break;
2217 }
2218 case CRNA_ref::C_Ext::e_TRNA:
2219 {
2220 const CTrna_ext& trna = ext.GetTRNA();
2221 int aa = 0;
2222 if ( trna.IsSetAa() && trna.GetAa().IsNcbieaa() ) {
2223 aa = trna.GetAa().GetNcbieaa();
2224 }
2225 if ( cfg.IupacaaOnly() ) {
2226 aa = s_ToIupacaa(aa);
2227 }
2228 const string& aa_str = s_AaName(aa);
2229 string amino_acid_str = aa_str;
2230
2231 if ( !aa_str.empty() ) {
2232 const string& ac_str = aa_str;
2233 if (NStr::CompareNocase (ac_str, "tRNA-Met") == 0) {
2234 for (auto& gbqual : m_Feat.GetQual()) {
2235 if (!gbqual->IsSetQual() || !gbqual->IsSetVal()) continue;
2236 if (NStr::CompareNocase( gbqual->GetQual(), "product") != 0) continue;
2237 if (NStr::CompareNocase (gbqual->GetVal (), "tRNA-fMet") == 0) {
2238 amino_acid_str = "tRNA-fMet";
2239 }
2240 if (NStr::CompareNocase (gbqual->GetVal (), "tRNA-iMet") == 0) {
2241 amino_acid_str = "tRNA-iMet";
2242 }
2243 }
2244 } else if (NStr::CompareNocase (ac_str, "tRNA-Ile") == 0) {
2245 for (auto& gbqual : m_Feat.GetQual()) {
2246 if (!gbqual->IsSetQual() || !gbqual->IsSetVal()) continue;
2247 if (NStr::CompareNocase( gbqual->GetQual(), "product") != 0) continue;
2248 if (NStr::CompareNocase (gbqual->GetVal (), "tRNA-Ile2") == 0) {
2249 amino_acid_str = "tRNA-Ile2";
2250 }
2251 }
2252 }
2253 x_AddQual(eFQ_product, new CFlatStringQVal(amino_acid_str));
2254 if ( trna.IsSetAnticodon() && !ac_str.empty() ) {
2255 x_AddQual(eFQ_anticodon,
2256 new CFlatAnticodonQVal(trna.GetAnticodon(),
2257 ac_str.substr(5, NPOS)));
2258 }
2259 }
2260 if ( trna.IsSetCodon() ) {
2261 const string& comment =
2262 m_Feat.IsSetComment() ? m_Feat.GetComment() : kEmptyStr;
2263 x_AddQual(eFQ_trna_codons, new CFlatTrnaCodonsQVal(trna, comment));
2264 }
2265 //x_AddQual(eFQ_exception_note, new CFlatStringQVal("tRNA features were annotated by tRNAscan-SE."));
2266 break;
2267 }
2268 default:
2269 break;
2270 } // end of internal switch
2271 }
2272 break;
2273 }
2274 case CRNA_ref::eType_mRNA:
2275 case CRNA_ref::eType_rRNA:
2276 {
2277 if ( !pseudo && ( cfg.ShowTranscript() || cfg.IsFormatGBSeq() || cfg.IsFormatINSDSeq() ) ) {
2278 CSeqVector vec(feat.GetLocation(), scope);
2279 vec.SetCoding(CBioseq_Handle::eCoding_Iupac);
2280 string transcription;
2281 vec.GetSeqData(0, vec.size(), transcription);
2282 x_AddQual(eFQ_transcription, new CFlatStringQVal(transcription));
2283 }
2284 // intentional fall through
2285 }
2286 default:
2287 switch ( subtype ) {
2288
2289 case CSeqFeatData::eSubtype_ncRNA: {
2290 if ( ! rna.IsSetExt() ) {
2291 break;
2292 }
2293 const CRNA_ref_Base::TExt& ext = rna.GetExt();
2294 if ( ! ext.IsGen() ) {
2295 break;
2296 }
2297 break;
2298 }
2299 case CSeqFeatData::eSubtype_tmRNA: {
2300 if ( ! rna.IsSetExt() ) {
2301 break;
2302 }
2303 const CRNA_ref_Base::TExt& ext = rna.GetExt();
2304 if ( ext.IsGen() && ext.GetGen().IsSetQuals() ) {
2305
2306 const list< CRef< CRNA_qual > >& quals = ext.GetGen().GetQuals().Get();
2307 list< CRef< CRNA_qual > >::const_iterator it = quals.begin();
2308 for ( ; it != quals.end(); ++it ) {
2309 if ( (*it)->IsSetQual() && (*it)->IsSetVal() ) {
2310 if ( (*it)->GetQual() == "tag_peptide" ) {
2311 x_AddQual( eFQ_tag_peptide,
2312 new CFlatStringQVal(
2313 (*it)->GetVal(), CFormatQual::eUnquoted ) );
2314 break;
2315 }
2316 }
2317 }
2318 }
2319 break;
2320 }
2321 case CSeqFeatData::eSubtype_misc_RNA:
2322 case CSeqFeatData::eSubtype_otherRNA: {
2323 if ( ! rna.IsSetExt() ) {
2324 break;
2325 }
2326 const CRNA_ref_Base::TExt& ext = rna.GetExt();
2327 if ( ext.IsName() ) {
2328 string strName = ext.GetName();
2329 if ( strName != "misc_RNA" ) {
2330 x_AddQual( eFQ_product, new CFlatStringQVal( strName ) );
2331 }
2332 }
2333 break;
2334 }
2335 default:
2336 if ( rna.IsSetExt() && rna.GetExt().IsName() ) {
2337 x_AddQual( eFQ_product, new CFlatStringQVal( rna.GetExt().GetName() ) );
2338 }
2339 break;
2340 }
2341 } // end of switch
2342
2343 // some things to extract from RNA-gen
2344 if( rna.IsSetExt() && rna.GetExt().IsGen() ) {
2345 const CRNA_gen &gen = rna.GetExt().GetGen();
2346 if ( gen.IsSetClass() ) {
2347 if (gen.IsLegalClass()) {
2348 x_AddQual( eFQ_ncRNA_class,
2349 new CFlatStringQVal( gen.GetClass() ) );
2350 } else {
2351 x_AddQual( eFQ_ncRNA_class,
2352 new CFlatStringQVal( "other" ));
2353 x_AddQual( eFQ_seqfeat_note,
2354 new CFlatStringQVal( gen.GetClass() ) );
2355 }
2356 }
2357
2358 if ( gen.IsSetProduct() && ! x_HasQual(eFQ_product) ) {
2359 x_AddQual( eFQ_product,
2360 new CFlatStringQVal( gen.GetProduct() ) );
2361 }
2362 }
2363 }
2364
2365 // ----------------------------------------------------------------------------
x_AddQualTranslation(CBioseq_Handle & bsh,CBioseqContext & ctx,bool pseudo)2366 void CFeatureItem::x_AddQualTranslation(
2367 CBioseq_Handle& bsh,
2368 CBioseqContext& ctx,
2369 bool pseudo )
2370 // ----------------------------------------------------------------------------
2371 {
2372 const CFlatFileConfig& cfg = ctx.Config();
2373 CScope& scope = ctx.GetScope();
2374
2375 if ( pseudo || cfg.NeverTranslateCDS() ) {
2376 return;
2377 }
2378
2379 string translation;
2380 if ( cfg.AlwaysTranslateCDS() || (cfg.TranslateIfNoProduct() && !bsh) ) {
2381 CSeqTranslator::Translate(m_Feat.GetOriginalFeature(), scope,
2382 translation, false /* don't include stops */);
2383 }
2384 else if ( bsh ) {
2385 CSeqVector seqv = bsh.GetSeqVector();
2386 /*
2387 CSeq_data::E_Choice coding = cfg.IupacaaOnly() ?
2388 CSeq_data::e_Iupacaa : CSeq_data::e_Ncbieaa;
2389 */
2390 CSeq_data::E_Choice coding = CSeq_data::e_Ncbieaa;
2391 seqv.SetCoding( coding );
2392
2393 try {
2394 // an exception can occur here if the specified length doesn't match the actual length.
2395 // Although I don't know of any released .asn files with this problem, it can occur
2396 // in submissions.
2397 seqv.GetSeqData( 0, seqv.size(), translation );
2398 } catch( const CException & ) {
2399 // we're unable to do the translation
2400 translation.clear();
2401 }
2402 }
2403
2404 if (!NStr::IsBlank(translation)) {
2405 x_AddQual(eFQ_translation, new CFlatStringQVal( translation ) );
2406 }
2407 }
2408
2409 // ----------------------------------------------------------------------------
x_AddQualTranslationTable(const CCdregion & cdr,CBioseqContext & ctx)2410 void CFeatureItem::x_AddQualTranslationTable(
2411 const CCdregion& cdr,
2412 CBioseqContext& ctx )
2413 // ----------------------------------------------------------------------------
2414 {
2415 if ( ! cdr.IsSetCode() ) {
2416 return;
2417 }
2418 int gcode = cdr.GetCode().GetId();
2419 if ( gcode == 255 ) {
2420 return;
2421 }
2422 if ( ctx.Config().IsFormatGBSeq() || ctx.Config().IsFormatINSDSeq() || gcode > 1 ) {
2423 x_AddQual(eFQ_transl_table, new CFlatIntQVal(gcode));
2424 }
2425 }
2426
2427 // ----------------------------------------------------------------------------
x_AddQualCodonStart(const CCdregion & cdr,CBioseqContext & ctx)2428 void CFeatureItem::x_AddQualCodonStart(
2429 const CCdregion& cdr,
2430 CBioseqContext& ctx )
2431 // ----------------------------------------------------------------------------
2432 {
2433 CCdregion::TFrame frame = cdr.GetFrame();
2434 if (frame == CCdregion::eFrame_not_set)
2435 frame = CCdregion::eFrame_one;
2436
2437 // codon_start qualifier is always shown for nucleotides and for proteins mapped
2438 // from cDNA, otherwise only when the frame is not 1.
2439 if ( !ctx.IsProt() || !IsMappedFromCDNA() || frame != CCdregion::eFrame_one ) {
2440 x_AddQual( eFQ_codon_start, new CFlatIntQVal( frame ) );
2441 }
2442 }
2443
2444 // ----------------------------------------------------------------------------
x_AddQualCodonStartIdx(const CCdregion & cdr,CBioseqContext & ctx,const int inset)2445 void CFeatureItem::x_AddQualCodonStartIdx(
2446 const CCdregion& cdr,
2447 CBioseqContext& ctx,
2448 const int inset )
2449 // ----------------------------------------------------------------------------
2450 {
2451 CCdregion::TFrame frame = cdr.GetFrame();
2452 if (frame == CCdregion::eFrame_not_set) {
2453 frame = CCdregion::eFrame_one;
2454 }
2455
2456 if (inset == 1) {
2457 if (frame == CCdregion::eFrame_one) {
2458 frame = CCdregion::eFrame_three;
2459 } else if (frame == CCdregion::eFrame_two) {
2460 frame = CCdregion::eFrame_one;
2461 } else if (frame == CCdregion::eFrame_three) {
2462 frame = CCdregion::eFrame_two;
2463 }
2464 } else if (inset == 2) {
2465 if (frame == CCdregion::eFrame_one) {
2466 frame = CCdregion::eFrame_two;
2467 } else if (frame == CCdregion::eFrame_two) {
2468 frame = CCdregion::eFrame_three;
2469 } else if (frame == CCdregion::eFrame_three) {
2470 frame = CCdregion::eFrame_one;
2471 }
2472 }
2473
2474 // codon_start qualifier is always shown for nucleotides and for proteins mapped
2475 // from cDNA, otherwise only when the frame is not 1.
2476 if ( !ctx.IsProt() || !IsMappedFromCDNA() || frame != CCdregion::eFrame_one ) {
2477 x_AddQual( eFQ_codon_start, new CFlatIntQVal( frame ) );
2478 }
2479 }
2480
2481 // ----------------------------------------------------------------------------
x_AddQualTranslationException(const CCdregion & cdr,CBioseqContext & ctx)2482 void CFeatureItem::x_AddQualTranslationException(
2483 const CCdregion& cdr,
2484 CBioseqContext& ctx )
2485 // ----------------------------------------------------------------------------
2486 {
2487 if ( !ctx.IsProt() || !IsMappedFromCDNA() ) {
2488 if ( cdr.IsSetCode_break() ) {
2489 x_AddQual( eFQ_transl_except,
2490 new CFlatCodeBreakQVal( cdr.GetCode_break() ) );
2491 }
2492
2493 }
2494 }
2495
2496 // ----------------------------------------------------------------------------
x_AddQualTranslationExceptionIdx(const CCdregion & cdr,CBioseqContext & ctx,string & tr_ex)2497 void CFeatureItem::x_AddQualTranslationExceptionIdx(
2498 const CCdregion& cdr,
2499 CBioseqContext& ctx,
2500 string& tr_ex )
2501 // ----------------------------------------------------------------------------
2502 {
2503 if ( !ctx.IsProt() || !IsMappedFromCDNA() ) {
2504 if ( cdr.IsSetCode_break() ) {
2505 x_AddQual( eFQ_transl_except,
2506 new CFlatCodeBreakQVal( cdr.GetCode_break() ) );
2507 } else if ( tr_ex.length() > 0 ) {
2508 x_AddQual(eFQ_seqfeat_note, new CFlatStringQVal("unprocessed translation exception: " + tr_ex));
2509 }
2510 }
2511 }
2512
2513 // ----------------------------------------------------------------------------
x_AddQualProteinConflict(const CCdregion & cdr,CBioseqContext & ctx)2514 void CFeatureItem::x_AddQualProteinConflict(
2515 const CCdregion& cdr,
2516 CBioseqContext& ctx )
2517 // ----------------------------------------------------------------------------
2518 {
2519 static const string conflict_msg =
2520 "Protein sequence is in conflict with the conceptual translation";
2521
2522 const bool conflict_set = (cdr.IsSetConflict() && cdr.GetConflict());
2523
2524 if (conflict_set)
2525 {
2526 if (!ctx.IsProt() || !IsMappedFromCDNA()) {
2527 bool has_prot = false;
2528 if (m_Feat.IsSetProduct() && m_Feat.GetProduct().GetId() != 0) {
2529 has_prot = (sequence::GetLength(m_Feat.GetProduct(), &ctx.GetScope()) > 0);
2530 }
2531 if (has_prot) {
2532 x_AddQual(eFQ_prot_conflict, new CFlatStringQVal(conflict_msg));
2533 }
2534 }
2535 }
2536 }
2537
2538 // ----------------------------------------------------------------------------
x_AddQualCodedBy(CBioseqContext & ctx)2539 void CFeatureItem::x_AddQualCodedBy(
2540 CBioseqContext& ctx )
2541 // ----------------------------------------------------------------------------
2542 {
2543 //if ( ctx.IsProt() && IsMappedFromCDNA() ) {
2544 if ( ctx.IsProt() ) {
2545 x_AddQual( eFQ_coded_by, new CFlatSeqLocQVal( m_Feat.GetLocation() ) );
2546 }
2547 }
2548
2549 // ----------------------------------------------------------------------------
x_AddQualProtComment(const CBioseq_Handle & protHandle)2550 void CFeatureItem::x_AddQualProtComment(
2551 const CBioseq_Handle& protHandle )
2552 // ----------------------------------------------------------------------------
2553 {
2554 if ( ! protHandle ) {
2555 return;
2556 }
2557 CSeqdesc_CI comm( protHandle, CSeqdesc::e_Comment, 1 );
2558 if ( comm && !comm->GetComment().empty() ) {
2559 string comment = comm->GetComment();
2560
2561 TrimSpacesAndJunkFromEnds( comment, true );
2562 /* const bool bAddPeriod = */ RemovePeriodFromEnd( comment, true );
2563 CFlatStringQVal *commentQVal = new CFlatStringQVal( comment );
2564 /* if( bAddPeriod ) {
2565 commentQVal->SetAddPeriod();
2566 } */
2567 x_AddQual( eFQ_prot_comment, commentQVal );
2568 }
2569 }
2570
2571 // ----------------------------------------------------------------------------
x_AddQualProtMethod(const CBioseq_Handle & protHandle)2572 void CFeatureItem::x_AddQualProtMethod(
2573 const CBioseq_Handle& protHandle )
2574 // ----------------------------------------------------------------------------
2575 {
2576 if ( ! protHandle ) {
2577 return;
2578 }
2579 CSeqdesc_CI mi( protHandle, CSeqdesc::e_Molinfo );
2580 if ( mi ) {
2581 CMolInfo::TTech prot_tech = mi->GetMolinfo().GetTech();
2582 if ( prot_tech > CMolInfo::eTech_standard &&
2583 prot_tech != CMolInfo::eTech_concept_trans &&
2584 prot_tech != CMolInfo::eTech_concept_trans_a ) {
2585 if ( !GetTechString( prot_tech ).empty() ) {
2586 x_AddQual( eFQ_prot_method, new CFlatStringQVal(
2587 "Method: " + GetTechString( prot_tech) ) );
2588 }
2589 }
2590 }
2591 }
2592
2593 // ----------------------------------------------------------------------------
x_GetAssociatedProtInfoIdx(CBioseqContext & ctx,CBioseq_Handle & protHandle,const CProt_ref * & protRef,CMappedFeat & protFeat,CConstRef<CSeq_id> & protId)2594 void CFeatureItem::x_GetAssociatedProtInfoIdx(
2595 CBioseqContext& ctx,
2596 CBioseq_Handle& protHandle,
2597 const CProt_ref*& protRef,
2598 CMappedFeat& protFeat,
2599 CConstRef<CSeq_id>& protId )
2600 // ----------------------------------------------------------------------------
2601 {
2602 const CFlatFileConfig& cfg = ctx.Config();
2603 CScope& scope = ctx.GetScope();
2604
2605 protId.Reset( m_Feat.GetProduct().GetId() );
2606 if ( protId ) {
2607 if ( !cfg.AlwaysTranslateCDS() ) {
2608 CScope::EGetBioseqFlag get_flag = CScope::eGetBioseq_Loaded;
2609 if ( cfg.ShowFarTranslations() || ctx.IsGED() || ctx.IsRefSeq() || cfg.IsPolicyFtp() ) {
2610 get_flag = CScope::eGetBioseq_All;
2611 }
2612 protHandle = scope.GetBioseqHandle(*protId, get_flag);
2613 }
2614 }
2615
2616 CRef<CSeqEntryIndex> idx = ctx.GetSeqEntryIndex();
2617 if (! idx) return;
2618 CBioseq_Handle hdl = ctx.GetHandle();
2619 CRef<CBioseqIndex> bsx = idx->GetBioseqIndex (hdl);
2620 if (! bsx) return;
2621
2622
2623 protRef = 0;
2624 if ( protHandle ) {
2625 CRef<CSeqEntryIndex> idx = ctx.GetSeqEntryIndex();
2626 if (! idx) return;
2627 CRef<CBioseqIndex> bsx = idx->GetBioseqIndex (protHandle);
2628 if (bsx) {
2629 CRef<CFeatureIndex> pfx = bsx->GetBestProteinFeature();
2630 if (pfx) {
2631 protFeat = pfx->GetMappedFeat();
2632 if ( protFeat ) {
2633 protRef = &( protFeat.GetData().GetProt() );
2634 }
2635 }
2636 } else {
2637 x_GetAssociatedProtInfo(ctx, protHandle, protRef, protFeat, protId);
2638 }
2639 }
2640 }
2641
2642 // ----------------------------------------------------------------------------
x_GetAssociatedProtInfo(CBioseqContext & ctx,CBioseq_Handle & protHandle,const CProt_ref * & protRef,CMappedFeat & protFeat,CConstRef<CSeq_id> & protId)2643 void CFeatureItem::x_GetAssociatedProtInfo(
2644 CBioseqContext& ctx,
2645 CBioseq_Handle& protHandle,
2646 const CProt_ref*& protRef,
2647 CMappedFeat& protFeat,
2648 CConstRef<CSeq_id>& protId )
2649 // ----------------------------------------------------------------------------
2650 {
2651 const CFlatFileConfig& cfg = ctx.Config();
2652 CScope& scope = ctx.GetScope();
2653
2654 protId.Reset( m_Feat.GetProduct().GetId() );
2655 if ( protId ) {
2656 if ( !cfg.AlwaysTranslateCDS() ) {
2657 CScope::EGetBioseqFlag get_flag = CScope::eGetBioseq_Loaded;
2658 if ( cfg.ShowFarTranslations() || ctx.IsGED() || ctx.IsRefSeq() || cfg.IsPolicyFtp() ) {
2659 get_flag = CScope::eGetBioseq_All;
2660 }
2661 protHandle = scope.GetBioseqHandle(*protId, get_flag);
2662 }
2663 }
2664
2665 protRef = 0;
2666 if ( protHandle ) {
2667 protFeat = s_GetBestProtFeature( protHandle );
2668 if ( protFeat ) {
2669 protRef = &( protFeat.GetData().GetProt() );
2670 }
2671 }
2672 }
2673
2674 // ----------------------------------------------------------------------------
x_AddQualProtNote(const CProt_ref * protRef,const CMappedFeat & protFeat)2675 void CFeatureItem::x_AddQualProtNote(
2676 const CProt_ref* protRef,
2677 const CMappedFeat& protFeat )
2678 // ----------------------------------------------------------------------------
2679 {
2680 if ( ! protRef ) {
2681 return;
2682 }
2683 if ( protFeat.IsSetComment() ) {
2684 if ( protRef->GetProcessed() == CProt_ref::eProcessed_not_set ||
2685 protRef->GetProcessed() == CProt_ref::eProcessed_preprotein ) {
2686 string prot_note = protFeat.GetComment();
2687 TrimSpacesAndJunkFromEnds( prot_note, true );
2688 RemovePeriodFromEnd( prot_note, true );
2689 x_AddQual( eFQ_prot_note, new CFlatStringQVal( prot_note ) );
2690 }
2691 }
2692 }
2693
2694
2695 // ----------------------------------------------------------------------------
x_AddQualProteinId(CBioseqContext & ctx,const CBioseq_Handle & protHandle,CConstRef<CSeq_id> protId)2696 void CFeatureItem::x_AddQualProteinId(
2697 CBioseqContext& ctx,
2698 const CBioseq_Handle& protHandle,
2699 CConstRef<CSeq_id> protId )
2700 // ----------------------------------------------------------------------------
2701 {
2702 if ( protHandle ) {
2703 CConstRef<CBioseq> pBioseq( protHandle.GetCompleteBioseq() );
2704
2705 // extract the *one* usable general seq-id (if there is one)
2706 // (the loop sets pTheOneGeneralSeqId, or leaves it NULL
2707 // if there is zero or more than one usable general seqids)
2708 CConstRef<CSeq_id> pTheOneUsableGeneralSeqId;
2709 FOR_EACH_SEQID_ON_BIOSEQ(seqid_ci, *pBioseq) {
2710 const CSeq_id & seqid = **seqid_ci;
2711 if( ! seqid.IsGeneral() ) {
2712 // not just general, so ignore all of them
2713 pTheOneUsableGeneralSeqId.Reset();
2714 break;
2715 }
2716
2717 const CDbtag & db_tag = seqid.GetGeneral();
2718
2719 // db types to ignore
2720 static const char* const sc_IgnoredDbs[] = {
2721 "BankIt",
2722 "NCBIFILE",
2723 "PID",
2724 "SMART",
2725 "TMSMART",
2726 };
2727 typedef CStaticArraySet<const char*, PNocase> TIgnoredDbSet;
2728 DEFINE_STATIC_ARRAY_MAP(TIgnoredDbSet, sc_IgnoredDbSet, sc_IgnoredDbs );
2729
2730 // get db and tag
2731 const string & sDb = GET_STRING_FLD_OR_BLANK(db_tag, Db);
2732 string sTag;
2733 if( FIELD_IS_SET(db_tag, Tag) ) {
2734 stringstream sTagStrm;
2735 db_tag.GetTag().AsString(sTagStrm);
2736 // swap faster than assignment
2737 sTagStrm.str().swap(sTag);
2738 }
2739
2740 if( ! sDb.empty() && ! sTag.empty() &&
2741 sc_IgnoredDbSet.find(sDb.c_str()) == sc_IgnoredDbSet.end() )
2742 {
2743 if( pTheOneUsableGeneralSeqId ) {
2744 // more than one, so ignore all of them
2745 pTheOneUsableGeneralSeqId.Reset();
2746 break;
2747 } else {
2748 pTheOneUsableGeneralSeqId = *seqid_ci;
2749 }
2750 }
2751 }
2752
2753 CSeq_id::E_Choice eLastRegularChoice = CSeq_id::e_not_set;
2754 FOR_EACH_SEQID_ON_BIOSEQ(seqid_ci, *pBioseq) {
2755 const CSeq_id & seqid = **seqid_ci;
2756
2757 switch( seqid.Which() ) {
2758 case CSeq_id::e_Genbank: case CSeq_id::e_Embl: case CSeq_id::e_Ddbj:
2759 case CSeq_id::e_Other:
2760 case CSeq_id::e_Tpg: case CSeq_id::e_Tpe: case CSeq_id::e_Tpd:
2761 case CSeq_id::e_Gpipe:
2762 x_AddQual( eFQ_protein_id, new CFlatSeqIdQVal( seqid ) );
2763 eLastRegularChoice = seqid.Which();
2764 break;
2765
2766 case CSeq_id::e_Gi:
2767 if( seqid.GetGi() > ZERO_GI ) {
2768 const CFlatFileConfig& cfg = GetContext()->Config();
2769 if (! (cfg.HideGI() || cfg.IsPolicyFtp())) {
2770 if ( eLastRegularChoice == CSeq_id::e_not_set ) {
2771 // use as protein_id if it's the first usable one
2772 x_AddQual( eFQ_protein_id, new CFlatSeqIdQVal( seqid ) );
2773 }
2774 x_AddQual( eFQ_db_xref, new CFlatSeqIdQVal( seqid, true ) );
2775 }
2776 }
2777 break;
2778
2779 case CSeq_id::e_General:
2780 // show it if it's the *one* usable general seqid. otherwise, ignore
2781 if( *seqid_ci == pTheOneUsableGeneralSeqId ) {
2782 x_AddQual( eFQ_protein_id, new CFlatSeqIdQVal( seqid ) );
2783 }
2784 break;
2785
2786 default:
2787 // ignore other types
2788 break;
2789 }
2790 }
2791 } else if( protId ) {
2792
2793 TGi gi = ZERO_GI;
2794 string prot_acc;
2795
2796 // get gi and prot_acc
2797 if ( protId->IsGi() ) {
2798 gi = protId->GetGi();
2799 if( gi > ZERO_GI ) {
2800 try {
2801 prot_acc = GetAccessionForGi( gi, ctx.GetScope() );
2802 } catch ( CException& ) {}
2803 }
2804 } else {
2805
2806 // swap is faster than assignment
2807 // protId->GetSeqIdString(true).swap( prot_acc );
2808 prot_acc = protId->GetSeqIdString(true);
2809
2810 // find prot_acc and gi
2811 //const CTextseq_id* pTextSeq_id = protId->GetTextseq_Id();
2812 //if( pTextSeq_id ) {
2813 // stringstream protAccStrm;
2814 // pTextSeq_id->AsFastaString(protAccStrm);
2815 // // swap is faster than assignment
2816 // protAccStrm.str().swap( prot_acc );
2817
2818 //}
2819 try {
2820 gi = ctx.GetScope().GetGi( CSeq_id_Handle::GetHandle(*protId) );
2821 } catch(CException &) {
2822 // could not get gi
2823 }
2824 }
2825
2826 if( ! prot_acc.empty() ) {
2827 if ( ! ctx.Config().DropIllegalQuals() || IsValidAccession( prot_acc ) ) {
2828 try {
2829 CRef<CSeq_id> acc_id( new CSeq_id( prot_acc ) );
2830 x_AddQual( eFQ_protein_id, new CFlatSeqIdQVal( *acc_id ) );
2831 } catch( CException & ) {
2832 x_AddQual( eFQ_protein_id, new CFlatStringQVal(prot_acc) );
2833 }
2834 }
2835 }
2836
2837 if( gi > ZERO_GI ) {
2838 CConstRef<CSeq_id> pGiSeqId(
2839 protId->IsGi() ?
2840 protId.GetPointer() :
2841 new CSeq_id(CSeq_id::e_Gi, gi) );
2842 x_AddQual( eFQ_db_xref, new CFlatSeqIdQVal( *pGiSeqId, true ) );
2843 }
2844 }
2845 }
2846
2847 // ----------------------------------------------------------------------------
x_AddQualCdsProduct(CBioseqContext & ctx,const CProt_ref * protRef)2848 void CFeatureItem::x_AddQualCdsProduct(
2849 CBioseqContext& ctx,
2850 const CProt_ref* protRef )
2851 // ----------------------------------------------------------------------------
2852 {
2853 if ( !protRef ) {
2854 return;
2855 }
2856
2857 const CFlatFileConfig& cfg = ctx.Config();
2858 const CProt_ref::TName& names = protRef->GetName();
2859 if ( !names.empty() ) {
2860 if ( ! cfg.IsModeDump() ) {
2861 x_AddQual( eFQ_cds_product,
2862 new CFlatStringQVal( names.front() ) );
2863 if ( names.size() > 1 ) {
2864 x_AddQual( eFQ_prot_names,
2865 new CFlatProductNamesQVal( names, m_Gene ) );
2866 }
2867
2868 } else {
2869 ITERATE(CProt_ref::TName, it, names) {
2870 x_AddQual( eFQ_cds_product, new CFlatStringQVal(*it) );
2871 }
2872 }
2873 }
2874 }
2875
2876 // ----------------------------------------------------------------------------
x_AddQualProtDesc(const CProt_ref * protRef)2877 void CFeatureItem::x_AddQualProtDesc(
2878 const CProt_ref* protRef )
2879 // ----------------------------------------------------------------------------
2880 {
2881 if ( !protRef || !protRef->IsSetDesc() ) {
2882 return;
2883 }
2884
2885 string desc = protRef->GetDesc();
2886 TrimSpacesAndJunkFromEnds( desc, true );
2887 bool add_period = RemovePeriodFromEnd( desc, true );
2888 CRef<CFlatStringQVal> prot_desc( new CFlatStringQVal( desc ) );
2889 if ( add_period ) {
2890 prot_desc->SetAddPeriod();
2891 }
2892 x_AddQual( eFQ_prot_desc, prot_desc );
2893 }
2894
2895 // ----------------------------------------------------------------------------
x_AddQualProtActivity(const CProt_ref * protRef)2896 void CFeatureItem::x_AddQualProtActivity(
2897 const CProt_ref* protRef )
2898 // ----------------------------------------------------------------------------
2899 {
2900 if ( !protRef || protRef->GetActivity().empty() ) {
2901 return;
2902 }
2903 ITERATE (CProt_ref::TActivity, it, protRef->GetActivity()) {
2904 x_AddQual(eFQ_prot_activity, new CFlatStringQVal(*it));
2905 }
2906 }
2907
2908 // ----------------------------------------------------------------------------
x_AddQualProtEcNumber(CBioseqContext & ctx,const CProt_ref * protRef)2909 void CFeatureItem::x_AddQualProtEcNumber(
2910 CBioseqContext& ctx,
2911 const CProt_ref* protRef )
2912 // ----------------------------------------------------------------------------
2913 {
2914 if ( !protRef || !protRef->IsSetEc() || protRef->GetEc().empty() ) {
2915 return;
2916 }
2917
2918 const CFlatFileConfig& cfg = ctx.Config();
2919 ITERATE(CProt_ref::TEc, ec, protRef->GetEc()) {
2920 if ( !cfg.DropIllegalQuals() || s_IsLegalECNumber( *ec ) ) {
2921 x_AddQual( eFQ_prot_EC_number, new CFlatStringQVal( *ec ) );
2922 }
2923 }
2924 }
2925
2926 // ----------------------------------------------------------------------------
x_AddQualsCdregionIdx(const CMappedFeat & cds,CBioseqContext & ctx,bool pseudo)2927 void CFeatureItem::x_AddQualsCdregionIdx(
2928 const CMappedFeat& cds,
2929 CBioseqContext& ctx,
2930 bool pseudo)
2931 // ----------------------------------------------------------------------------
2932 {
2933 CRef<CSeqEntryIndex> idx = ctx.GetSeqEntryIndex();
2934 if (! idx) return;
2935 CBioseq_Handle hdl = ctx.GetHandle();
2936 CRef<CBioseqIndex> bsx = idx->GetBioseqIndex (hdl);
2937 if (! bsx) return;
2938
2939 const CCdregion& cdr = cds.GetData().GetCdregion();
2940
2941 // const CSeq_loc& cdsloc = cds.GetLocation();
2942 const CSeq_loc& orgloc = cds.GetOriginalFeature().GetLocation();
2943 const CSeq_loc& bsploc = ctx.GetLocation();
2944
2945 // cerr << "CDS " << MSerial_AsnText << cdsloc;
2946 // cerr << "ORG " << MSerial_AsnText << orgloc;
2947 // cerr << "BSP " << MSerial_AsnText << bsploc;
2948
2949 int inset = 0;
2950 if ( ! ctx.GetLocation().IsWhole()) {
2951 if (bsploc.IsInt()) {
2952 const CSeq_interval& bspint = bsploc.GetInt();
2953 if ( orgloc.IsSetStrand() && orgloc.GetStrand() == eNa_strand_minus ) {
2954 CBioseq_Handle& hdl = ctx.GetHandle();
2955 if (hdl) {
2956 int pos = bspint.GetTo();
2957 // cerr << "PS " << pos << endl;
2958 const CSeq_id* bid = bsploc.GetId();
2959 ENa_strand strand = eNa_strand_minus;
2960 CSeq_id& cid = const_cast<CSeq_id&>(*bid);
2961 CConstRef<CSeq_loc> newloc(new CSeq_loc(cid, pos, pos, strand));
2962 // cerr << "NEW " << MSerial_AsnText << newloc;
2963 inset = sequence::LocationOffset(orgloc, *newloc, eOffset_FromStart, &ctx.GetScope());
2964 // cerr << "IS " << inset << endl;
2965 }
2966 } else {
2967 int pos = bspint.GetFrom();
2968 // cerr << "PS " << pos << endl;
2969 const CSeq_id* bid = bsploc.GetId();
2970 ENa_strand strand = eNa_strand_plus;
2971 CSeq_id& cid = const_cast<CSeq_id&>(*bid);
2972 CConstRef<CSeq_loc> newloc(new CSeq_loc(cid, pos, pos, strand));
2973 // cerr << "NEW " << MSerial_AsnText << newloc;
2974 inset = sequence::LocationOffset(orgloc, *newloc, eOffset_FromStart, &ctx.GetScope());
2975 // cerr << "IS " << inset << endl;
2976 }
2977 }
2978 }
2979 if (inset < 0) {
2980 inset = 0;
2981 }
2982 inset = (inset % 3);
2983
2984 const CProt_ref* protRef = 0;
2985 CMappedFeat protFeat;
2986 CConstRef<CSeq_id> prot_id;
2987
2988 string tr_ex;
2989 for (auto& gbqual : cds.GetQual()) {
2990 if (!gbqual->IsSetQual() || !gbqual->IsSetVal()) continue;
2991 if (NStr::CompareNocase( gbqual->GetQual(), "transl_except") != 0) continue;
2992 tr_ex = gbqual->GetVal ();
2993 break;
2994 }
2995 TQI it = m_Quals.begin();
2996 while ( it != m_Quals.end() ) {
2997 if ( it->first == eFQ_transl_except ) {
2998 it = m_Quals.Erase(it);
2999 } else {
3000 ++it;
3001 }
3002 }
3003
3004 x_AddQualTranslationTable( cdr, ctx );
3005 x_AddQualCodonStartIdx( cdr, ctx, inset );
3006 x_AddQualTranslationExceptionIdx( cdr, ctx, tr_ex );
3007 x_AddQualProteinConflict( cdr, ctx );
3008 x_AddQualCodedBy( ctx );
3009 if ( ctx.IsProt() && IsMappedFromCDNA() ) {
3010 return;
3011 }
3012
3013 // protein qualifiers
3014 if (m_Feat.IsSetProduct()) {
3015 CBioseq_Handle prot =
3016 ctx.GetScope().GetBioseqHandle(m_Feat.GetProductId());
3017 x_GetAssociatedProtInfoIdx( ctx, prot, protRef, protFeat, prot_id );
3018 x_AddQualProtComment( prot );
3019 x_AddQualProtMethod( prot );
3020 x_AddQualProtNote( protRef, protFeat );
3021 x_AddQualProteinId( ctx, prot, prot_id );
3022 x_AddQualTranslation( prot, ctx, pseudo );
3023 }
3024
3025 // add qualifiers where associated xref overrides the ref:
3026 const CProt_ref* protXRef = m_Feat.GetProtXref();
3027 if ( ! protXRef ) {
3028 protXRef = protRef;
3029 }
3030 x_AddQualCdsProduct( ctx, protXRef );
3031 x_AddQualProtDesc( protXRef );
3032 x_AddQualProtActivity( protXRef );
3033 x_AddQualProtEcNumber( ctx, protXRef );
3034 }
3035
3036 // ----------------------------------------------------------------------------
x_AddQualsCdregion(const CMappedFeat & cds,CBioseqContext & ctx,bool pseudo)3037 void CFeatureItem::x_AddQualsCdregion(
3038 const CMappedFeat& cds,
3039 CBioseqContext& ctx,
3040 bool pseudo)
3041 // ----------------------------------------------------------------------------
3042 {
3043 const CCdregion& cdr = cds.GetData().GetCdregion();
3044
3045 const CProt_ref* protRef = 0;
3046 CMappedFeat protFeat;
3047 CConstRef<CSeq_id> prot_id;
3048
3049 x_AddQualTranslationTable( cdr, ctx );
3050 x_AddQualCodonStart( cdr, ctx );
3051 x_AddQualTranslationException( cdr, ctx );
3052 x_AddQualProteinConflict( cdr, ctx );
3053 x_AddQualCodedBy( ctx );
3054 if ( ctx.IsProt() && IsMappedFromCDNA() ) {
3055 return;
3056 }
3057
3058 // protein qualifiers
3059 if (m_Feat.IsSetProduct()) {
3060 CBioseq_Handle prot =
3061 ctx.GetScope().GetBioseqHandle(m_Feat.GetProductId());
3062 x_GetAssociatedProtInfo( ctx, prot, protRef, protFeat, prot_id );
3063 x_AddQualProtComment( prot );
3064 x_AddQualProtMethod( prot );
3065 x_AddQualProtNote( protRef, protFeat );
3066 x_AddQualProteinId( ctx, prot, prot_id );
3067 x_AddQualTranslation( prot, ctx, pseudo );
3068 }
3069
3070 // add qualifiers where associated xref overrides the ref:
3071 const CProt_ref* protXRef = m_Feat.GetProtXref();
3072 if ( ! protXRef ) {
3073 protXRef = protRef;
3074 }
3075 x_AddQualCdsProduct( ctx, protXRef );
3076 x_AddQualProtDesc( protXRef );
3077 x_AddQualProtActivity( protXRef );
3078 x_AddQualProtEcNumber( ctx, protXRef );
3079 }
3080
s_ScoreSeqIdHandle(const CSeq_id_Handle & idh)3081 static int s_ScoreSeqIdHandle(const CSeq_id_Handle& idh)
3082 {
3083 CConstRef<CSeq_id> id = idh.GetSeqId();
3084 CRef<CSeq_id> id_non_const
3085 (const_cast<CSeq_id*>(id.GetPointer()));
3086 return CSeq_id::Score(id_non_const);
3087 }
3088
3089
s_FindBestIdChoice(const CBioseq_Handle::TId & ids)3090 CSeq_id_Handle s_FindBestIdChoice(const CBioseq_Handle::TId& ids)
3091 {
3092 //
3093 // Objective:
3094 // Find the best choice among a given subset of id types. I.e. if a certain
3095 // id scores well but is not of a type we approve of, we still reject it.
3096 //
3097 CBestChoiceTracker< CSeq_id_Handle, int (*)(const CSeq_id_Handle&) >
3098 tracker(s_ScoreSeqIdHandle);
3099
3100 ITERATE( CBioseq_Handle::TId, it, ids ) {
3101 switch( (*it).Which() ) {
3102 case CSeq_id::e_Genbank:
3103 case CSeq_id::e_Embl:
3104 case CSeq_id::e_Ddbj:
3105 case CSeq_id::e_Gi:
3106 case CSeq_id::e_Other:
3107 case CSeq_id::e_General:
3108 case CSeq_id::e_Tpg:
3109 case CSeq_id::e_Tpe:
3110 case CSeq_id::e_Tpd:
3111 case CSeq_id::e_Gpipe:
3112 tracker(*it);
3113 break;
3114 default:
3115 break;
3116 }
3117 }
3118 return tracker.GetBestChoice();
3119 }
3120
3121 // ---------------------------------------------------------------------------
x_AddProductIdQuals(CBioseq_Handle & prod,EFeatureQualifier slot)3122 void CFeatureItem::x_AddProductIdQuals(
3123 CBioseq_Handle& prod,
3124 EFeatureQualifier slot)
3125 // ---------------------------------------------------------------------------
3126 {
3127 //
3128 // Objective (according to the C toolkit):
3129 // We need one (and only one) /xxx_id tag. If there are multiple ids
3130 //
3131
3132 if (!prod) {
3133 return;
3134 }
3135 const CBioseq_Handle::TId& ids = prod.GetId();
3136 if (ids.empty()) {
3137 return;
3138 }
3139
3140 CSeq_id_Handle best = s_FindBestIdChoice(ids);
3141 if (!best) {
3142 return;
3143 }
3144 x_AddQual(slot, new CFlatSeqIdQVal(*best.GetSeqId()));
3145
3146 if( m_Feat.GetData().IsCdregion() || ! GetContext()->IsProt() ) {
3147 const CFlatFileConfig& cfg = GetContext()->Config();
3148 ITERATE( CBioseq_Handle::TId, id_iter, ids ) {
3149 if( id_iter->IsGi() ) {
3150 if (! (cfg.HideGI() || cfg.IsPolicyFtp())) {
3151 x_AddQual( eFQ_db_xref,
3152 new CFlatStringQVal("GI:" + NStr::NumericToString(id_iter->GetGi()) ));
3153 }
3154 }
3155 }
3156 }
3157 }
3158
3159 // ----------------------------------------------------------------------------
x_AddQualsRegion(CBioseqContext & ctx)3160 void CFeatureItem::x_AddQualsRegion(
3161 CBioseqContext& ctx )
3162 // ----------------------------------------------------------------------------
3163 {
3164 _ASSERT( m_Feat.GetData().IsRegion() );
3165
3166 //cerr << MSerial_AsnText << m_Feat.GetOriginalFeature();
3167
3168 const CSeqFeatData& data = m_Feat.GetData();
3169 const string ®ion = data.GetRegion();
3170 if ( region.empty() ) {
3171 return;
3172 }
3173
3174 if ( ctx.IsProt() &&
3175 data.GetSubtype() == CSeqFeatData::eSubtype_region )
3176 {
3177 x_AddQual(eFQ_region_name, new CFlatStringQVal(region));
3178 } else {
3179 x_AddQual(eFQ_region, new CFlatStringQVal("Region: " + region));
3180 }
3181
3182 /// parse CDD data from the user object
3183 list< CConstRef<CUser_object> > objs;
3184 if (m_Feat.IsSetExt()) {
3185 objs.push_back(CConstRef<CUser_object>(&m_Feat.GetExt()));
3186 }
3187 if (m_Feat.IsSetExts()) {
3188 copy(m_Feat.GetExts().begin(), m_Feat.GetExts().end(),
3189 back_inserter(objs));
3190 }
3191
3192 ITERATE (list< CConstRef<CUser_object> >, it, objs) {
3193 const CUser_object& obj = **it;
3194 bool found = false;
3195 if (obj.IsSetType() &&
3196 obj.GetType().IsStr() &&
3197 obj.GetType().GetStr() == "cddScoreData") {
3198 CConstRef<CUser_field> f = obj.GetFieldRef("definition");
3199 if (f) {
3200 CUser_field_Base::C_Data::TStr definition_str = f->GetData().GetStr();
3201 RemovePeriodFromEnd(definition_str, true);
3202 if( ! s_StrEqualDisregardFinalPeriod(definition_str, region, NStr::eNocase) ) {
3203 x_AddQual(eFQ_region,
3204 new CFlatStringQVal(definition_str));
3205 found = true;
3206 }
3207 break;
3208
3209 /**
3210 if (ctx.IsProt()) {
3211 if (f->GetData().GetStr() != region || added_raw) {
3212 x_AddQual(eFQ_region,
3213 new CFlatStringQVal(f->GetData().GetStr()));
3214 }
3215 } else {
3216 x_AddQual(eFQ_region,
3217 new CFlatStringQVal(f->GetData().GetStr()));
3218 }
3219
3220 found = true;
3221 break;
3222 **/
3223
3224 /**
3225 if (ctx.IsProt() && region == f->GetData().GetStr()) {
3226 /// skip
3227 } else {
3228 x_AddQual(eFQ_region,
3229 new CFlatStringQVal(f->GetData().GetStr()));
3230 found = true;
3231 break;
3232 }
3233 **/
3234 }
3235 }
3236
3237 if (found) {
3238 break;
3239 }
3240 }
3241 }
3242
3243
3244 // ----------------------------------------------------------------------------
x_AddQualsBond(CBioseqContext & ctx)3245 void CFeatureItem::x_AddQualsBond(
3246 CBioseqContext& ctx )
3247 // ----------------------------------------------------------------------------
3248 {
3249 _ASSERT( m_Feat.GetData().IsBond() );
3250
3251 const CSeqFeatData& data = m_Feat.GetData();
3252 const string& bond = s_GetBondName( data.GetBond() );
3253 if ( NStr::IsBlank( bond ) ) {
3254 return;
3255 }
3256
3257 if ( ctx.IsGenbankFormat() && ctx.IsProt() ) {
3258 x_AddQual( eFQ_bond_type, new CFlatStringQVal( bond ) );
3259 } else {
3260 x_AddQual( eFQ_bond, new CFlatBondQVal( bond ) );
3261 }
3262 }
3263
3264 // ----------------------------------------------------------------------------
x_AddQualsPsecStr(CBioseqContext & ctx)3265 void CFeatureItem::x_AddQualsPsecStr(
3266 CBioseqContext& ctx )
3267 // ----------------------------------------------------------------------------
3268 {
3269 _ASSERT( m_Feat.GetData().IsPsec_str() );
3270
3271 const CSeqFeatData& data = m_Feat.GetData();
3272
3273 CSeqFeatData_Base::TPsec_str sec_str_type = data.GetPsec_str();
3274
3275 string sec_str_as_str = CSeqFeatData_Base::GetTypeInfo_enum_EPsec_str()->FindName( sec_str_type, true );
3276 x_AddQual( eFQ_sec_str_type, new CFlatStringQVal( sec_str_as_str ) );
3277 }
3278
3279 // ----------------------------------------------------------------------------
x_AddQualsNonStd(CBioseqContext & ctx)3280 void CFeatureItem::x_AddQualsNonStd(
3281 CBioseqContext& ctx )
3282 // ----------------------------------------------------------------------------
3283 {
3284 _ASSERT( m_Feat.GetData().IsNon_std_residue() );
3285
3286 const CSeqFeatData& data = m_Feat.GetData();
3287
3288 CSeqFeatData_Base::TNon_std_residue n_s_res = data.GetNon_std_residue();
3289
3290 x_AddQual( eFQ_non_std_residue, new CFlatStringQVal( n_s_res ) );
3291 }
3292
3293 // ----------------------------------------------------------------------------
x_AddQualsHet(CBioseqContext & ctx)3294 void CFeatureItem::x_AddQualsHet(
3295 CBioseqContext& ctx )
3296 // ----------------------------------------------------------------------------
3297 {
3298 _ASSERT( m_Feat.GetData().IsHet() );
3299
3300 const CSeqFeatData& data = m_Feat.GetData();
3301
3302 CSeqFeatData_Base::THet het = data.GetHet();
3303
3304 x_AddQual( eFQ_heterogen, new CFlatStringQVal( het.Get() ) );
3305 }
3306
3307 // ----------------------------------------------------------------------------
x_AddQualsVariation(CBioseqContext & ctx)3308 void CFeatureItem::x_AddQualsVariation(
3309 CBioseqContext& ctx )
3310 // ----------------------------------------------------------------------------
3311 {
3312 _ASSERT( m_Feat.GetData().IsVariation() );
3313
3314 const CSeqFeatData& data = m_Feat.GetData();
3315 const CSeqFeatData_Base::TVariation& variation = data.GetVariation();
3316
3317 // Make the /db_xref qual
3318 if( variation.CanGetId() ) {
3319 const CVariation_ref_Base::TId& dbt = variation.GetId();
3320 // the id tag is quite specific (e.g. db must be "dbSNP", etc.) or it won't print
3321 if ( dbt.IsSetDb() && !dbt.GetDb().empty() &&
3322 dbt.IsSetTag() && dbt.GetTag().IsStr() ) {
3323 const string &oid_str = dbt.GetTag().GetStr();
3324 if( dbt.GetDb() == "dbSNP" && NStr::StartsWith(oid_str, "rs" ) ) {
3325 x_AddQual(eFQ_db_xref, new CFlatStringQVal( dbt.GetDb() + ":" + oid_str.substr( 2 ) ) );
3326 }
3327 }
3328 }
3329
3330 // Make the /replace quals:
3331 if( variation.CanGetData() && variation.GetData().IsInstance() &&
3332 variation.GetData().GetInstance().CanGetDelta() ) {
3333 const CVariation_inst_Base::TDelta& delta = variation.GetData().GetInstance().GetDelta();
3334 ITERATE( CVariation_inst_Base::TDelta, delta_iter, delta ) {
3335 if( *delta_iter && (*delta_iter)->CanGetSeq() ) {
3336 const CDelta_item_Base::TSeq& seq = (*delta_iter)->GetSeq();
3337 if( seq.IsLiteral() && seq.GetLiteral().CanGetSeq_data() ) {
3338 const CDelta_item_Base::C_Seq::TLiteral& seq_literal = seq.GetLiteral();
3339 const CSeq_literal_Base::TSeq_data& seq_data = seq_literal.GetSeq_data();
3340
3341 // convert the data to the standard a,c,g,t
3342 CSeq_data iupacna_seq_data;
3343 CSeqportUtil::Convert( seq_data,
3344 &iupacna_seq_data,
3345 CSeq_data::e_Iupacna );
3346 string nucleotides = iupacna_seq_data.GetIupacna().Get();
3347
3348 // if the specified length and the length of the data conflict,
3349 // use the smaller
3350 const string::size_type max_len_allowed = seq_literal.GetLength();
3351 if( nucleotides.size() > max_len_allowed ) {
3352 nucleotides.resize( max_len_allowed );
3353 }
3354
3355 NStr::ToLower( nucleotides );
3356
3357 if (!NStr::IsBlank(nucleotides)) {
3358 x_AddQual(eFQ_replace, new CFlatStringQVal(nucleotides));
3359 }
3360 }
3361 }
3362 }
3363 }
3364 }
3365
s_GetSiteName(CSeqFeatData::TSite site)3366 static const string& s_GetSiteName(CSeqFeatData::TSite site)
3367 {
3368 static const string kOther = "other";
3369 static const string kDnaBinding = "DNA binding";
3370 static const string kInhibit = "inhibition";
3371
3372 switch (site) {
3373 case CSeqFeatData::eSite_other:
3374 return kOther;
3375 case CSeqFeatData::eSite_dna_binding:
3376 return kDnaBinding;
3377 case CSeqFeatData::eSite_inhibit:
3378 return kInhibit;
3379
3380 default:
3381 return CSeqFeatData::ENUM_METHOD_NAME(ESite)()->FindName(site, true);
3382 }
3383 }
3384
3385 // ----------------------------------------------------------------------------
x_AddQualsSite(CBioseqContext & ctx)3386 void CFeatureItem::x_AddQualsSite(
3387 CBioseqContext& ctx )
3388 // ----------------------------------------------------------------------------
3389 {
3390 _ASSERT( m_Feat.GetData().IsSite() );
3391
3392 const CSeqFeatData& data = m_Feat.GetData();
3393 CSeqFeatData::TSite site = data.GetSite();
3394 const string& site_name = s_GetSiteName( site );
3395
3396 // ID-4627 : site_type qualifier is needed for GBSeq/INSDSeq XMl too
3397 if ( (ctx.Config().IsFormatGenbank() ||
3398 ctx.Config().IsFormatGBSeq() ||
3399 ctx.Config().IsFormatINSDSeq()) && ctx.IsProt() ) {
3400 x_AddQual(eFQ_site_type, new CFlatSiteQVal( site_name ) );
3401 } else {
3402 if ( !m_Feat.IsSetComment() ||
3403 ( NStr::Find( m_Feat.GetComment(), site_name ) == NPOS ) ) {
3404 x_AddQual( eFQ_site, new CFlatSiteQVal( site_name ) );
3405 }
3406 }
3407 }
3408
3409 // ----------------------------------------------------------------------------
x_AddQualsExt(const CUser_field & field,const CSeq_feat::TExt & ext)3410 void CFeatureItem::x_AddQualsExt(
3411 const CUser_field& field, const CSeq_feat::TExt& ext )
3412 // ----------------------------------------------------------------------------
3413 {
3414 if ( field.IsSetLabel() && field.GetLabel().IsStr() ) {
3415 const string& oid = field.GetLabel().GetStr();
3416 if ( oid == "ModelEvidence" ) {
3417 FOR_EACH_GBQUAL_ON_SEQFEAT (gbq_itr, m_Feat) {
3418 const CGb_qual& gbq = **gbq_itr;
3419 if (gbq.IsSetQual()) {
3420 if (NStr::Equal (gbq.GetQual(), "experiment")) return;
3421 }
3422 }
3423 x_AddQual(eFQ_modelev, new CFlatModelEvQVal(ext));
3424 } else if ( oid == "Process" || oid == "Component" || oid == "Function" ) {
3425 x_AddGoQuals(field);
3426 }
3427 }
3428 }
3429
3430 // ----------------------------------------------------------------------------
x_AddQualsExt(const CSeq_feat::TExt & ext)3431 void CFeatureItem::x_AddQualsExt(
3432 const CSeq_feat::TExt& ext )
3433 // ----------------------------------------------------------------------------
3434 {
3435 ITERATE (CUser_object::TData, it, ext.GetData()) {
3436 const CUser_field& field = **it;
3437 if ( !field.IsSetData() ) {
3438 continue;
3439 }
3440 if ( field.GetData().IsObject() ) {
3441 const CUser_object& obj = field.GetData().GetObject();
3442 x_AddQualsExt(obj);
3443 } else if ( field.GetData().IsObjects() ) {
3444 ITERATE (CUser_field::C_Data::TObjects, o, field.GetData().GetObjects()) {
3445 x_AddQualsExt(**o);
3446 }
3447 } else if ( field.GetData().IsFields() ) {
3448 ITERATE (CUser_field::C_Data::TFields, o, field.GetData().GetFields()) {
3449 // x_AddGoQuals(**o);
3450 x_AddQualsExt(**o, ext);
3451 }
3452 }
3453 }
3454 if ( ext.IsSetType() && ext.GetType().IsStr() ) {
3455 const string& oid = ext.GetType().GetStr();
3456 if ( oid == "ModelEvidence" ) {
3457 FOR_EACH_GBQUAL_ON_SEQFEAT (gbq_itr, m_Feat) {
3458 const CGb_qual& gbq = **gbq_itr;
3459 if (gbq.IsSetQual()) {
3460 if (NStr::Equal (gbq.GetQual(), "experiment")) return;
3461 }
3462 }
3463 x_AddQual(eFQ_modelev, new CFlatModelEvQVal(ext));
3464 } else if ( oid == "GeneOntology" ) {
3465 x_AddGoQuals(ext);
3466 }
3467 }
3468 }
3469
3470 // ----------------------------------------------------------------------------
x_AddQualDbXref(CBioseqContext & ctx)3471 void CFeatureItem::x_AddQualDbXref(
3472 CBioseqContext& ctx )
3473 // ----------------------------------------------------------------------------
3474 {
3475 if ( m_Feat.IsSetProduct() &&
3476 ( !m_Feat.GetData().IsCdregion() && ctx.IsProt() && ! IsMappedFromProt() ) ) {
3477 CBioseq_Handle prod =
3478 ctx.GetScope().GetBioseqHandle( m_Feat.GetProductId() );
3479 if ( prod ) {
3480 const CBioseq_Handle::TId& ids = prod.GetId();
3481 if ( ! ids.empty() ) {
3482 ITERATE (CBioseq_Handle::TId, it, ids) {
3483 if ( it->Which() != CSeq_id::e_Gi ) {
3484 continue;
3485 }
3486 CConstRef<CSeq_id> id = it->GetSeqId();
3487 if (!id->IsGeneral()) {
3488 x_AddQual(eFQ_db_xref, new CFlatSeqIdQVal(*id, id->IsGi()));
3489 }
3490 }
3491 }
3492 }
3493 }
3494 if ( ! m_Feat.IsSetDbxref() ) {
3495 return ;
3496 }
3497 x_AddQual( eFQ_db_xref, new CFlatXrefQVal( m_Feat.GetDbxref(), &m_Quals ) );
3498 }
3499
3500 // ----------------------------------------------------------------------------
x_AddGoQuals(const CUser_field & field)3501 void CFeatureItem::x_AddGoQuals(
3502 const CUser_field& field )
3503 // ----------------------------------------------------------------------------
3504 {
3505 if ( field.IsSetLabel() && field.GetLabel().IsStr() ) {
3506 const string& label = field.GetLabel().GetStr();
3507 EFeatureQualifier slot = eFQ_none;
3508 if ( label == "Process" ) {
3509 slot = eFQ_go_process;
3510 } else if ( label == "Component" ) {
3511 slot = eFQ_go_component;
3512 } else if ( label == "Function" ) {
3513 slot = eFQ_go_function;
3514 }
3515 if ( slot == eFQ_none ) {
3516 return;
3517 }
3518
3519 ITERATE (CUser_field::TData::TFields, it, field.GetData().GetFields()) {
3520 if ( (*it)->GetData().IsFields() ) {
3521 CRef<CFlatGoQVal> go_val( new CFlatGoQVal(**it) );
3522
3523 bool okay_to_add = true;
3524
3525 // check for dups
3526 CFeatureItem::TQCI iter = x_GetQual(slot);
3527 for ( ; iter != m_Quals.end() && iter->first == slot; ++iter) {
3528 const CFlatGoQVal & qual = dynamic_cast<const CFlatGoQVal &>( *iter->second );
3529 if( qual.Equals(*go_val) )
3530 {
3531 okay_to_add = false;
3532 break;
3533 }
3534 }
3535
3536 if( okay_to_add ) {
3537 x_AddQual(slot, go_val);
3538 }
3539 }
3540 }
3541 }
3542 }
3543
3544 // ----------------------------------------------------------------------------
x_AddGoQuals(const CUser_object & uo)3545 void CFeatureItem::x_AddGoQuals(
3546 const CUser_object& uo )
3547 // ----------------------------------------------------------------------------
3548 {
3549 ITERATE (CUser_object::TData, uf_it, uo.GetData()) {
3550 const CUser_field& field = **uf_it;
3551 if ( field.IsSetLabel() && field.GetLabel().IsStr() ) {
3552 const string& label = field.GetLabel().GetStr();
3553 EFeatureQualifier slot = eFQ_none;
3554 if ( label == "Process" ) {
3555 slot = eFQ_go_process;
3556 } else if ( label == "Component" ) {
3557 slot = eFQ_go_component;
3558 } else if ( label == "Function" ) {
3559 slot = eFQ_go_function;
3560 }
3561 if ( slot == eFQ_none ) {
3562 continue;
3563 }
3564
3565 ITERATE (CUser_field::TData::TFields, it, field.GetData().GetFields()) {
3566 if ( (*it)->GetData().IsFields() ) {
3567 CRef<CFlatGoQVal> go_val( new CFlatGoQVal(**it) );
3568
3569 bool okay_to_add = true;
3570
3571 // check for dups
3572 CFeatureItem::TQCI iter = x_GetQual(slot);
3573 for ( ; iter != m_Quals.end() && iter->first == slot; ++iter) {
3574 const CFlatGoQVal & qual = dynamic_cast<const CFlatGoQVal &>( *iter->second );
3575 if( qual.Equals(*go_val) )
3576 {
3577 okay_to_add = false;
3578 break;
3579 }
3580 }
3581
3582 if( okay_to_add ) {
3583 x_AddQual(slot, go_val);
3584 }
3585 }
3586 }
3587 }
3588 }
3589 }
3590
3591 // ----------------------------------------------------------------------------
x_AddQualsGene(const CBioseqContext & ctx,const CGene_ref * gene_ref,CConstRef<CSeq_feat> & gene_feat,bool from_overlap)3592 void CFeatureItem::x_AddQualsGene(
3593 const CBioseqContext& ctx,
3594 const CGene_ref* gene_ref,
3595 CConstRef<CSeq_feat>& gene_feat,
3596 bool from_overlap )
3597 // ----------------------------------------------------------------------------
3598 {
3599 const CSeqFeatData& data = m_Feat.GetData();
3600 CSeqFeatData::ESubtype subtype = data.GetSubtype();
3601
3602 if ( m_Feat.GetData().Which() == CSeqFeatData::e_Gene ) {
3603 gene_ref = &( m_Feat.GetData().GetGene() );
3604 }
3605 if ( ! gene_ref && gene_feat ) {
3606 gene_ref = & gene_feat->GetData().GetGene();
3607 }
3608
3609 if ( ! gene_ref || gene_ref->IsSuppressed() ) {
3610 return;
3611 }
3612
3613 const bool is_gene = (subtype == CSeqFeatData::eSubtype_gene);
3614
3615 const bool okay_to_propage = (subtype != CSeqFeatData::eSubtype_mobile_element &&
3616 subtype != CSeqFeatData::eSubtype_centromere &&
3617 subtype != CSeqFeatData::eSubtype_telomere);
3618
3619 const string* locus = (gene_ref->IsSetLocus() && !NStr::IsBlank(gene_ref->GetLocus())) ?
3620 &gene_ref->GetLocus() : NULL;
3621 const string* desc = (gene_ref->IsSetDesc() && !NStr::IsBlank(gene_ref->GetDesc())) ?
3622 &gene_ref->GetDesc() : NULL;
3623 const TGeneSyn* syn = (gene_ref->IsSetSyn() && !gene_ref->GetSyn().empty()) ?
3624 &gene_ref->GetSyn() : NULL;
3625 const string* locus_tag =
3626 (gene_ref->IsSetLocus_tag() && !NStr::IsBlank(gene_ref->GetLocus_tag())) ?
3627 &gene_ref->GetLocus_tag() : 0;
3628
3629 if ( ctx.IsProt() ) {
3630 // skip if GenPept format and not gene or CDS
3631 if (subtype != CSeqFeatData::eSubtype_gene && subtype != CSeqFeatData::eSubtype_cdregion) {
3632 return;
3633 }
3634 }
3635
3636 // gene:
3637 if ( !from_overlap || okay_to_propage ) {
3638 if ( locus != 0 ) {
3639 m_Gene = *locus;
3640 }
3641 else if ( ( desc != 0 ) && okay_to_propage ) {
3642 m_Gene = *desc;
3643 }
3644 else if (syn != NULL) {
3645 CGene_ref::TSyn syns = *syn;
3646 m_Gene = syns.front();
3647 }
3648 if( !m_Gene.empty() ) {
3649 // we suppress the /gene qual when there's no locus but there is a locus tag (imitates C toolkit)
3650 if ( NULL != locus || NULL == locus_tag ) {
3651 x_AddQual(eFQ_gene, new CFlatGeneQVal(m_Gene));
3652 }
3653 }
3654 }
3655
3656 // locus tag:
3657 if ( gene_ref || okay_to_propage ) {
3658 if (locus != NULL) {
3659 if (locus_tag != NULL) {
3660 x_AddQual(eFQ_locus_tag, new CFlatStringQVal(*locus_tag, CFormatQual::eTrim_WhitespaceOnly));
3661 }
3662 }
3663 else if (locus_tag != NULL) {
3664 x_AddQual(eFQ_locus_tag, new CFlatStringQVal(*locus_tag, CFormatQual::eTrim_WhitespaceOnly));
3665 }
3666 }
3667
3668 // gene desc:
3669 if ( gene_ref || okay_to_propage ) {
3670 if (locus != NULL) {
3671 if (is_gene && desc != NULL) {
3672 string desc_cleaned = *desc;
3673 RemovePeriodFromEnd( desc_cleaned, true );
3674 x_AddQual(eFQ_gene_desc, new CFlatStringQVal(desc_cleaned));
3675 }
3676 }
3677 else if (locus_tag != NULL) {
3678 if (is_gene && desc != NULL) {
3679 x_AddQual(eFQ_gene_desc, new CFlatStringQVal(*desc));
3680 }
3681 }
3682 }
3683
3684 // gene syn:
3685 if ( gene_ref || okay_to_propage ) {
3686 if (locus != NULL) {
3687 if (syn != NULL) {
3688 x_AddQual(eFQ_gene_syn, new CFlatGeneSynonymsQVal(*syn));
3689 }
3690 } else if (locus_tag != NULL) {
3691 if (syn != NULL) {
3692 x_AddQual(eFQ_gene_syn, new CFlatGeneSynonymsQVal(*syn));
3693 }
3694 } else if (desc != NULL) {
3695 if (syn != NULL) {
3696 x_AddQual(eFQ_gene_syn, new CFlatGeneSynonymsQVal(*syn));
3697 }
3698 } else if (syn != NULL) {
3699 CGene_ref::TSyn syns = *syn;
3700 syns.pop_front();
3701 // ... and the rest as synonyms
3702 if (syn != NULL) {
3703 x_AddQual(eFQ_gene_syn, new CFlatGeneSynonymsQVal(syns));
3704 }
3705 }
3706 }
3707
3708 // gene nomenclature
3709 if( gene_ref->IsSetFormal_name() && subtype == CSeqFeatData::eSubtype_gene ) {
3710 x_AddQual( eFQ_nomenclature, new CFlatNomenclatureQVal(gene_ref->GetFormal_name()) );
3711 }
3712
3713 // gene allele:
3714 {{
3715 // these bool vars just break up the if-statement to make it easier to understand
3716 const bool is_type_where_allele_from_gene_forbidden = (subtype == CSeqFeatData::eSubtype_variation);
3717 const bool is_type_where_allele_from_gene_forbidden_except_with_embl_or_ddbj =
3718 ( subtype == CSeqFeatData::eSubtype_mobile_element ||
3719 subtype == CSeqFeatData::eSubtype_centromere ||
3720 subtype == CSeqFeatData::eSubtype_telomere );
3721 const bool is_embl_or_ddbj = ( GetContext()->IsEMBL() || GetContext()->IsDDBJ() );
3722 if ( ! is_type_where_allele_from_gene_forbidden &&
3723 ( is_embl_or_ddbj || ! is_type_where_allele_from_gene_forbidden_except_with_embl_or_ddbj ) )
3724 {
3725 if (gene_ref->IsSetAllele() && !NStr::IsBlank(gene_ref->GetAllele())) {
3726 x_AddQual(eFQ_gene_allele, new CFlatStringQVal(gene_ref->GetAllele(),
3727 CFormatQual::eTrim_WhitespaceOnly));
3728 }
3729 }
3730 }}
3731
3732 // gene xref:
3733 if (gene_ref->IsSetDb()) {
3734 x_AddQual(eFQ_gene_xref, new CFlatXrefQVal(gene_ref->GetDb()));
3735 }
3736
3737 // gene db-xref:
3738 switch (m_Feat.GetData().Which()) {
3739 case CSeqFeatData::e_Rna:
3740 case CSeqFeatData::e_Cdregion:
3741 if (gene_feat && gene_feat->IsSetDbxref()) {
3742 CSeq_feat::TDbxref xrefs = gene_feat->GetDbxref();
3743 if (m_Feat.IsSetDbxref()) {
3744 ITERATE (CSeq_feat::TDbxref, it, m_Feat.GetDbxref()) {
3745 for (CSeq_feat::TDbxref::iterator i = xrefs.begin();
3746 i != xrefs.end(); ++i) {
3747 if ((*i)->Equals(**it)) {
3748 xrefs.erase(i);
3749 break;
3750 }
3751 }
3752 }
3753 }
3754 if (xrefs.size()) {
3755 x_AddQual(eFQ_db_xref, new CFlatXrefQVal(xrefs));
3756 }
3757 }
3758 break;
3759
3760 default:
3761 break;
3762 }
3763
3764 // gene map:
3765 if (!from_overlap && gene_ref->IsSetMaploc() && subtype == CSeqFeatData::eSubtype_gene) {
3766 x_AddQual(eFQ_gene_map, new CFlatStringQVal(gene_ref->GetMaploc()));
3767 }
3768
3769 // gene pseudogene qual:
3770
3771 // inherit pseudogene, if possible
3772 if( gene_feat && ! x_HasQual(eFQ_pseudogene) ) {
3773 const string & strPseudoGene = gene_feat->GetNamedQual("pseudogene");
3774 x_AddQual(eFQ_pseudogene, new CFlatStringQVal(strPseudoGene) );
3775 }
3776 }
3777
3778 // ----------------------------------------------------------------------------
x_AddQualsProt(CBioseqContext & ctx,bool pseudo)3779 void CFeatureItem::x_AddQualsProt(
3780 CBioseqContext& ctx,
3781 bool pseudo)
3782 // ----------------------------------------------------------------------------
3783 {
3784 _ASSERT( m_Feat.GetData().IsProt() );
3785
3786 const CSeqFeatData& data = m_Feat.GetData();
3787 const CProt_ref& pref = data.GetProt();
3788 CProt_ref::TProcessed processed = pref.GetProcessed();
3789
3790 //cerr << MSerial_AsnText << m_Feat.GetOriginalFeature();
3791
3792 if ( ctx.IsNuc() || (ctx.IsProt() && !IsMappedFromProt()) ) {
3793 if ( pref.IsSetName() && !pref.GetName().empty() ) {
3794 const CProt_ref::TName& names = pref.GetName();
3795 x_AddQual(eFQ_product, new CFlatStringQVal(names.front()));
3796 if (names.size() > 1) {
3797 x_AddQual(eFQ_prot_names, new CFlatProductNamesQVal(names, m_Gene));
3798 }
3799 }
3800 if ( pref.IsSetDesc() && !pref.GetDesc().empty() ) {
3801 if ( !ctx.IsProt() ) {
3802 string desc = pref.GetDesc();
3803 TrimSpacesAndJunkFromEnds(desc, true);
3804 bool add_period = RemovePeriodFromEnd(desc, true);
3805 CRef<CFlatStringQVal> prot_desc(new CFlatStringQVal(desc));
3806 if (add_period) {
3807 prot_desc->SetAddPeriod();
3808 }
3809 x_AddQual(eFQ_prot_desc, prot_desc);
3810 // had_prot_desc = true;
3811 } else {
3812 x_AddQual(eFQ_prot_name, new CFlatStringQVal(pref.GetDesc()));
3813 }
3814 }
3815 if ( pref.IsSetActivity() && !pref.GetActivity().empty() ) {
3816 ITERATE (CProt_ref::TActivity, it, pref.GetActivity()) {
3817 if (!NStr::IsBlank(*it)) {
3818 x_AddQual(eFQ_prot_activity, new CFlatStringQVal(*it));
3819 }
3820 }
3821 }
3822 if (pref.IsSetEc() && !pref.GetEc().empty()) {
3823 ITERATE(CProt_ref::TEc, ec, pref.GetEc()) {
3824 if ( !ctx.Config().DropIllegalQuals() || s_IsLegalECNumber(*ec)) {
3825 x_AddQual(eFQ_prot_EC_number, new CFlatStringQVal(*ec));
3826 }
3827 }
3828 }
3829 if ( m_Feat.IsSetProduct() ) {
3830 CBioseq_Handle prot =
3831 ctx.GetScope().GetBioseqHandle( m_Feat.GetProductId() );
3832 if ( prot ) {
3833 x_AddProductIdQuals(prot, eFQ_protein_id);
3834 } else {
3835 try {
3836 const CSeq_id& prod_id =
3837 GetId( m_Feat.GetProduct(), &ctx.GetScope());
3838 if ( ctx.IsRefSeq() || !ctx.Config().ForGBRelease() ) {
3839 x_AddQual(eFQ_protein_id, new CFlatSeqIdQVal(prod_id));
3840 }
3841 } catch (CObjmgrUtilException&) {}
3842 }
3843 }
3844 } else { // protein feature on subpeptide bioseq
3845 x_AddQual(eFQ_derived_from, new CFlatSeqLocQVal(m_Feat.GetLocation()));
3846 }
3847 if ( !pseudo && ( ctx.Config().ShowPeptides() || ctx.Config().IsFormatGBSeq() || ctx.Config().IsFormatINSDSeq() ) ) {
3848 if ( processed == CProt_ref::eProcessed_mature ||
3849 processed == CProt_ref::eProcessed_signal_peptide ||
3850 processed == CProt_ref::eProcessed_transit_peptide ||
3851 processed == CProt_ref::eProcessed_propeptide ) {
3852 CSeqVector pep(m_Feat.GetLocation(), ctx.GetScope());
3853 pep.SetCoding(CSeq_data::e_Ncbieaa);
3854 string peptide;
3855 pep.GetSeqData(pep.begin(), pep.end(), peptide);
3856 if (!NStr::IsBlank(peptide)) {
3857 x_AddQual(eFQ_peptide, new CFlatStringQVal(peptide));
3858 }
3859 }
3860 }
3861
3862 ///
3863 /// report molecular weights
3864 ///
3865 if (ctx.IsProt() && ( ctx.IsRefSeq() || ctx.Config().IsFormatGBSeq() || ctx.Config().IsFormatINSDSeq() ) && ! IsMappedFromProt() &&
3866 ! ( m_Feat.IsSetPartial() && m_Feat.GetPartial() ) &&
3867 ! ( m_Feat.GetLocation().IsPartialStart(eExtreme_Biological) ||
3868 m_Feat.GetLocation().IsPartialStop(eExtreme_Biological)) &&
3869 ! pseudo )
3870 {
3871 double wt = 0;
3872 bool has_mat_peptide = false;
3873 bool has_propeptide = false;
3874 bool has_signal_peptide = false;
3875
3876 CConstRef<CSeq_loc> loc(&m_Feat.GetLocation());
3877
3878 const bool is_pept_whole_loc = loc->IsWhole() ||
3879 ( loc->GetStart(eExtreme_Biological) == 0 &&
3880 loc->GetStop(eExtreme_Biological) == (ctx.GetHandle().GetBioseqLength() - 1) );
3881
3882 if (processed == CProt_ref::eProcessed_not_set ||
3883 processed == CProt_ref::eProcessed_preprotein )
3884 {
3885 SAnnotSelector sel = ctx.SetAnnotSelector();
3886 sel.SetFeatType(CSeqFeatData::e_Prot);
3887 for (CFeat_CI feat_it(ctx.GetHandle(), sel); feat_it; ++feat_it) {
3888 bool copy_loc = false;
3889 switch (feat_it->GetData().GetProt().GetProcessed()) {
3890 case CProt_ref::eProcessed_signal_peptide:
3891 case CProt_ref::eProcessed_transit_peptide:
3892 {{
3893 has_signal_peptide = true;
3894 if ( (feat_it->GetLocation().GetTotalRange().GetFrom() ==
3895 m_Feat.GetLocation().GetTotalRange().GetFrom()) &&
3896 ! feat_it->GetLocation().Equals( m_Feat.GetLocation() ) ) {
3897 loc = loc->Subtract(feat_it->GetLocation(),
3898 CSeq_loc::fSortAndMerge_All,
3899 NULL, NULL);
3900 }
3901 }}
3902 break;
3903
3904 case CProt_ref::eProcessed_mature:
3905 has_mat_peptide = true;
3906 break;
3907
3908 case CProt_ref::eProcessed_propeptide:
3909 has_propeptide = true;
3910 break;
3911
3912 default:
3913 break;
3914 }
3915
3916 if (copy_loc) {
3917 /// we need to adjust our location to the end of the signal
3918 /// peptide
3919 CRef<CSeq_loc> l(new CSeq_loc);
3920 loc = l;
3921 l->Assign(m_Feat.GetLocation());
3922 l->SetInt().SetTo
3923 (feat_it->GetLocation().GetTotalRange().GetTo());
3924 }
3925 }
3926 }
3927
3928 /**
3929 CMolInfo::TCompleteness comp = CMolInfo::eCompleteness_partial;
3930 {{
3931 CConstRef<CMolInfo> molinfo
3932 (sequence::GetMolInfo(ctx.GetHandle()));
3933 if (molinfo) {
3934 comp = molinfo->GetCompleteness();
3935 }
3936 }}
3937 **/
3938
3939 if ( !(loc->IsPartialStart(eExtreme_Biological) || loc->IsPartialStop(eExtreme_Biological)) ) {
3940
3941 bool proteinIsAtLeastMature;
3942 switch( pref.GetProcessed() ) {
3943 case CProt_ref::eProcessed_not_set:
3944 case CProt_ref::eProcessed_preprotein:
3945 proteinIsAtLeastMature = false;
3946 break;
3947 default:
3948 proteinIsAtLeastMature = true;
3949 break;
3950 }
3951
3952 if ( (!has_mat_peptide || !has_signal_peptide || !has_propeptide) || (proteinIsAtLeastMature) || (!is_pept_whole_loc) ) {
3953 try {
3954 const TGetProteinWeight flags = 0;
3955 wt = GetProteinWeight(m_Feat.GetOriginalFeature(),
3956 ctx.GetScope(), loc, flags);
3957 }
3958 catch (CException&) {
3959 }
3960 }
3961 }
3962
3963 /// note: we report the weight rounded to the nearest int
3964 if (wt) {
3965 x_AddQual(eFQ_calculated_mol_wt,
3966 new CFlatIntQVal((int(wt + 0.5))));
3967 }
3968 }
3969
3970 // cleanup
3971 if ( processed == CProt_ref::eProcessed_signal_peptide ||
3972 processed == CProt_ref::eProcessed_transit_peptide ) {
3973 if ( !ctx.IsRefSeq() ) {
3974 // Only RefSeq allows product on signal or transit peptide
3975 x_RemoveQuals(eFQ_product);
3976 }
3977 }
3978 if ( processed == CProt_ref::eProcessed_preprotein &&
3979 !ctx.IsRefSeq() && !ctx.IsProt() &&
3980 data.GetSubtype() == CSeqFeatData::eSubtype_preprotein ) {
3981 const CFlatStringQVal* product = x_GetStringQual(eFQ_product);
3982 if (product != NULL) {
3983 x_AddQual(eFQ_encodes, new CFlatStringQVal("encodes " + product->GetValue()));
3984 x_RemoveQuals(eFQ_product);
3985 }
3986 }
3987 }
3988
3989
s_ParseParentQual(const CGb_qual & gbqual,list<string> & vals)3990 static void s_ParseParentQual(const CGb_qual& gbqual, list<string>& vals)
3991 {
3992 vals.clear();
3993
3994 if (!gbqual.IsSetVal() || NStr::IsBlank(gbqual.GetVal())) {
3995 return;
3996 }
3997
3998 const string& val = gbqual.GetVal();
3999
4000 if (val.length() > 1 && NStr::StartsWith(val, '(') &&
4001 NStr::EndsWith(val, ')') && val.find(',') != NPOS) {
4002 NStr::Split(val, "(,)", vals, NStr::fSplit_Tokenize);
4003 } else {
4004 vals.push_back(val);
4005 }
4006
4007 list<string>::iterator it = vals.begin();
4008 while (it != vals.end()) {
4009 if (NStr::IsBlank(*it)) {
4010 it = vals.erase(it);
4011 } else {
4012 ConvertQuotes(*it);
4013 ExpandTildes(*it, eTilde_space);
4014 ++it;
4015 }
4016 }
4017 }
4018
4019
4020 struct SLegalImport {
4021 const char* m_Name;
4022 EFeatureQualifier m_Value;
4023
operator stringSLegalImport4024 operator string(void) const { return m_Name; }
4025 };
4026
4027
s_IsValidDirection(const string & direction)4028 static bool s_IsValidDirection(const string& direction) {
4029 return NStr::EqualNocase(direction, "LEFT") ||
4030 NStr::EqualNocase(direction, "RIGHT") ||
4031 NStr::EqualNocase(direction, "BOTH");
4032 }
4033
4034
s_IsValidnConsSplice(const string & cons_splice)4035 static bool s_IsValidnConsSplice(const string& cons_splice) {
4036 return NStr::EqualNocase(cons_splice, "(5'site:YES, 3'site:YES)") ||
4037 NStr::EqualNocase(cons_splice, "(5'site:YES, 3'site:NO)") ||
4038 NStr::EqualNocase(cons_splice, "(5'site:YES, 3'site:ABSENT)") ||
4039 NStr::EqualNocase(cons_splice, "(5'site:NO, 3'site:YES)") ||
4040 NStr::EqualNocase(cons_splice, "(5'site:NO, 3'site:NO)") ||
4041 NStr::EqualNocase(cons_splice, "(5'site:NO, 3'site:ABSENT)") ||
4042 NStr::EqualNocase(cons_splice, "(5'site:ABSENT, 3'site:YES)") ||
4043 NStr::EqualNocase(cons_splice, "(5'site:ABSENT, 3'site:NO)") ||
4044 NStr::EqualNocase(cons_splice, "(5'site:ABSENT, 3'site:ABSENT)");
4045 }
4046
4047 // currently just converts PMIDs into links
4048 static void
s_HTMLizeExperimentQual(string & out_new_val,const string & val)4049 s_HTMLizeExperimentQual( string &out_new_val, const string &val)
4050 {
4051 static const string kPmid("PMID:");
4052
4053 // just to make sure
4054 out_new_val.clear();
4055
4056 // str_pos should generally be considered as holding the first position
4057 // in val that we have not yet processed and copied to out_new_val.
4058 SIZE_TYPE str_pos = 0;
4059 while( str_pos < val.length() ) {
4060
4061 // find next "PMID:" to process
4062 const SIZE_TYPE pmid_label_pos = val.find( "PMID:", str_pos );
4063 if( pmid_label_pos == NPOS ) {
4064 // no more PMIDs left.
4065 // copy the rest of the string and let's leave
4066 copy( val.begin() + str_pos, val.end(), back_inserter(out_new_val) );
4067 return;
4068 }
4069
4070 // copy val up to just after "PMID:"
4071 const SIZE_TYPE first_pmid_pos = pmid_label_pos + kPmid.length();
4072 copy( val.begin() + str_pos, val.begin() + first_pmid_pos, back_inserter(out_new_val) );
4073 str_pos = first_pmid_pos;
4074
4075 // push pmids (with links) onto the output
4076 // we consider the pmids to be numbers separated by one or more spaces and/or commas.
4077 bool first_num = true;
4078 while( str_pos < val.length() ) {
4079 // skip spaces and commas before pmid
4080 const SIZE_TYPE next_pmid_pos = val.find_first_not_of(" ,", str_pos);
4081 if( next_pmid_pos == NPOS || ! isdigit(val[next_pmid_pos]) ) {
4082 break;
4083 }
4084
4085 // find end of pmid
4086 SIZE_TYPE end_of_pmid_pos = val.find_first_not_of("0123456789", next_pmid_pos );
4087 if( NPOS == end_of_pmid_pos ) {
4088 end_of_pmid_pos = val.length();
4089 }
4090
4091 // extract the actual pmid
4092 string pmid = val.substr(next_pmid_pos, end_of_pmid_pos - next_pmid_pos );
4093
4094 // write pmid with link
4095 if( ! first_num ) {
4096 out_new_val += ',';
4097 }
4098 out_new_val += "<a href=\"";
4099 out_new_val += strLinkBasePubmed;
4100 out_new_val += pmid;
4101 out_new_val += "\">";
4102 out_new_val += pmid;
4103 out_new_val += "</a>";
4104 str_pos = end_of_pmid_pos;
4105
4106 first_num = false;
4107 }
4108 }
4109 }
4110
4111 // ----------------------------------------------------------------------------
x_ImportQuals(CBioseqContext & ctx)4112 void CFeatureItem::x_ImportQuals(
4113 CBioseqContext& ctx )
4114 // ----------------------------------------------------------------------------
4115 {
4116 _ASSERT(m_Feat.IsSetQual());
4117
4118 typedef SStaticPair<const char*, EFeatureQualifier> TLegalImport;
4119 static const TLegalImport kLegalImports[] = {
4120 // Must be in case-insensitive alphabetical order!
4121 #define DO_IMPORT(x) { #x, eFQ_##x }
4122 DO_IMPORT(allele),
4123 DO_IMPORT(bound_moiety),
4124 DO_IMPORT(circular_RNA),
4125 DO_IMPORT(clone),
4126 DO_IMPORT(codon),
4127 DO_IMPORT(compare),
4128 DO_IMPORT(cons_splice),
4129 DO_IMPORT(cyt_map),
4130 DO_IMPORT(direction),
4131 DO_IMPORT(EC_number),
4132 DO_IMPORT(estimated_length),
4133 DO_IMPORT(evidence),
4134 DO_IMPORT(experiment),
4135 DO_IMPORT(frequency),
4136 DO_IMPORT(function),
4137 DO_IMPORT(gap_type),
4138 DO_IMPORT(gen_map),
4139 DO_IMPORT(inference),
4140 DO_IMPORT(insertion_seq),
4141 DO_IMPORT(label),
4142 DO_IMPORT(linkage_evidence),
4143 DO_IMPORT(map),
4144 DO_IMPORT(mobile_element),
4145 DO_IMPORT(mobile_element_type),
4146 DO_IMPORT(mod_base),
4147 DO_IMPORT(ncRNA_class),
4148 DO_IMPORT(number),
4149 DO_IMPORT(old_locus_tag),
4150 DO_IMPORT(operon),
4151 DO_IMPORT(organism),
4152 DO_IMPORT(PCR_conditions),
4153 DO_IMPORT(phenotype),
4154 DO_IMPORT(product),
4155 DO_IMPORT(pseudogene),
4156 DO_IMPORT(rad_map),
4157 DO_IMPORT(recombination_class),
4158 DO_IMPORT(regulatory_class),
4159 DO_IMPORT(replace),
4160 DO_IMPORT(ribosomal_slippage),
4161 DO_IMPORT(rpt_family),
4162 DO_IMPORT(rpt_type),
4163 DO_IMPORT(rpt_unit),
4164 DO_IMPORT(rpt_unit_range),
4165 DO_IMPORT(rpt_unit_seq),
4166 DO_IMPORT(satellite),
4167 DO_IMPORT(standard_name),
4168 DO_IMPORT(tag_peptide),
4169 DO_IMPORT(trans_splicing),
4170 DO_IMPORT(transposon),
4171 DO_IMPORT(UniProtKB_evidence),
4172 DO_IMPORT(usedin)
4173 #undef DO_IMPORT
4174 };
4175 typedef const CStaticPairArrayMap<const char*, EFeatureQualifier, PNocase_CStr> TLegalImportMap;
4176 DEFINE_STATIC_ARRAY_MAP(TLegalImportMap, kLegalImportMap, kLegalImports);
4177
4178 bool check_qual_syntax = ctx.Config().CheckQualSyntax();
4179
4180 const bool old_locus_tag_added_elsewhere = x_HasQual(eFQ_old_locus_tag);
4181
4182 bool first_pseudogene = true;
4183
4184 vector<string> replace_quals;
4185 const CSeq_feat_Base::TQual & qual = m_Feat.GetQual(); // must store reference since ITERATE macro evaluates 3rd arg multiple times
4186 ITERATE( CSeq_feat::TQual, it, qual ) {
4187 if (!(*it)->IsSetQual() || !(*it)->IsSetVal()) {
4188 continue;
4189 }
4190 const string& val = (*it)->GetVal();
4191
4192 const char* name = (*it)->GetQual().c_str();
4193 const TLegalImportMap::const_iterator li = kLegalImportMap.find(name);
4194 EFeatureQualifier slot = eFQ_illegal_qual;
4195 if ( li != kLegalImportMap.end() ) {
4196 slot = li->second;
4197 } else if (check_qual_syntax) {
4198 continue;
4199 }
4200
4201 // only certain slot types may have an empty value (e.g. M96433)
4202 switch(slot) {
4203 case eFQ_replace:
4204 case eFQ_pseudogene:
4205 // empty value allowed for these slot types, so we don't check
4206 break;
4207 default:
4208 // empty value forbidden for other slot types
4209 if( val.empty() ) {
4210 continue;
4211 }
4212 break;
4213 }
4214
4215 switch (slot) {
4216 case eFQ_allele:
4217 // if /allele inherited from gene, suppress allele gbqual on feature
4218 if (x_HasQual(eFQ_gene_allele)) {
4219 continue;
4220 } else {
4221 x_AddQual(slot, new CFlatStringQVal(val,
4222 CFormatQual::eTrim_WhitespaceOnly));
4223 }
4224 break;
4225 case eFQ_codon:
4226 if ((*it)->IsSetVal() && !NStr::IsBlank(val)) {
4227 x_AddQual(slot, new CFlatStringQVal(val, CFormatQual::eUnquoted));
4228 }
4229 break;
4230 case eFQ_cons_splice:
4231 if ((*it)->IsSetVal()) {
4232 if (!check_qual_syntax || s_IsValidnConsSplice(val)) {
4233 x_AddQual(slot, new CFlatStringQVal(val));
4234 }
4235 }
4236 break;
4237 case eFQ_direction:
4238 if ((*it)->IsSetVal()) {
4239 if (!check_qual_syntax || s_IsValidDirection(val)) {
4240 x_AddQual(slot, new CFlatNumberQVal(val));
4241 }
4242 }
4243 break;
4244 case eFQ_estimated_length:
4245 case eFQ_mod_base:
4246 case eFQ_number:
4247 if ((*it)->IsSetVal() && !NStr::IsBlank(val)) {
4248 x_AddQual(slot, new CFlatNumberQVal(val));
4249 }
4250 break;
4251 case eFQ_rpt_type:
4252 x_AddRptTypeQual(val, check_qual_syntax);
4253 break;
4254 case eFQ_rpt_unit:
4255 if ((*it)->IsSetVal()) {
4256 x_AddRptUnitQual(val);
4257 }
4258 break;
4259 case eFQ_usedin:
4260 {{
4261 list<string> vals;
4262 s_ParseParentQual(**it, vals);
4263 ITERATE (list<string>, i, vals) {
4264 x_AddQual(slot, new CFlatStringQVal(*i, CFormatQual::eQuoted));
4265 }
4266 break;
4267 }}
4268 case eFQ_old_locus_tag:
4269 {{
4270 if( ! old_locus_tag_added_elsewhere ) {
4271 list<string> vals;
4272 s_ParseParentQual(**it, vals);
4273 ITERATE (list<string>, i, vals) {
4274 x_AddQual(slot, new CFlatStringQVal(*i, CFormatQual::eQuoted, CFormatQual::eTrim_WhitespaceOnly));
4275 }
4276 }
4277 break;
4278 }}
4279 case eFQ_rpt_family:
4280 if ((*it)->IsSetVal() && !NStr::IsBlank(val)) {
4281 x_AddQual(slot, new CFlatStringQVal(val));
4282 }
4283 break;
4284 case eFQ_label:
4285 x_AddQual(slot, new CFlatLabelQVal(val));
4286 break;
4287 case eFQ_EC_number:
4288 if ((*it)->IsSetVal() &&
4289 ( ! ctx.Config().DropIllegalQuals() || s_IsLegalECNumber(val) ) ) {
4290 x_AddQual(slot, new CFlatStringQVal(val));
4291 }
4292 break;
4293 case eFQ_illegal_qual:
4294 if ( ctx.UsingSeqEntryIndex() && NStr::CompareNocase (name, "transl_except") == 0 ) {
4295 break;
4296 }
4297 x_AddQual(slot, new CFlatIllegalQVal(**it));
4298 break;
4299 case eFQ_product:
4300 if (!x_HasQual(eFQ_product)) {
4301 x_AddQual(slot, new CFlatStringQVal(val));
4302 } else {
4303 const CFlatStringQVal* gene = x_GetStringQual(eFQ_gene);
4304 const string& gene_val =
4305 gene != NULL ? gene->GetValue() : kEmptyStr;
4306 const CFlatStringQVal* product = x_GetStringQual(eFQ_product);
4307 const string& product_val =
4308 product != NULL ? product->GetValue() : kEmptyStr;
4309 if (val != gene_val && val != product_val) {
4310
4311 if ( ! ctx.Config().CodonRecognizedToNote() ||
4312 ! x_HasQual(eFQ_trna_codons) ||
4313 NStr::Find(val, "RNA") == NPOS )
4314 {
4315 x_AddQual(eFQ_xtra_prod_quals, new CFlatStringQVal(val));
4316 }
4317 }
4318 }
4319 break;
4320 case eFQ_compare:
4321 {{
4322 list<string> vals;
4323 s_ParseParentQual(**it, vals);
4324 ITERATE (list<string>, i, vals) {
4325 if (!ctx.Config().CheckQualSyntax() ||
4326 IsValidAccession(*i, eValidateAccDotVer)) {
4327 x_AddQual(slot, new CFlatStringQVal(*i, CFormatQual::eUnquoted));
4328 }
4329 }
4330 }}
4331 break;
4332 case eFQ_evidence:
4333 {{
4334 if ( val == "EXPERIMENTAL" ) {
4335 x_AddQual(eFQ_experiment, new CFlatExperimentQVal());
4336 } else if ( val == "NOT_EXPERIMENTAL" ) {
4337 x_AddQual(eFQ_inference, new CFlatInferenceQVal());
4338 }
4339 }}
4340 break;
4341
4342 case eFQ_rpt_unit_range:
4343 x_AddQual(slot, new CFlatStringQVal(val, CFormatQual::eUnquoted));
4344 break;
4345
4346 case eFQ_replace:
4347 {{
4348 string s(val);
4349 if (string::npos == s.find_first_not_of("ACGTUacgtu")) {
4350 NStr::ToLower(s);
4351 NStr::ReplaceInPlace(s, "u", "t");
4352 }
4353 replace_quals.push_back(s);
4354 }}
4355 break;
4356
4357 case eFQ_operon:
4358 {{
4359 if( ! x_HasQual(eFQ_operon) ) {
4360 x_AddQual(slot, new CFlatStringQVal(val));
4361 }
4362 }}
4363 break;
4364
4365 case eFQ_experiment:
4366 {{
4367 if( ctx.Config().DoHTML() && ! CommentHasSuspiciousHtml(val) ) {
4368 string new_val;
4369 s_HTMLizeExperimentQual(new_val, val);
4370 x_AddQual(slot, new CFlatStringQVal(new_val));
4371 } else {
4372 x_AddQual(slot, new CFlatStringQVal(val));
4373 }
4374 }}
4375 break;
4376
4377 case eFQ_clone:
4378 x_AddQual(slot, new CFlatStringQVal(val, CFormatQual::eTrim_WhitespaceOnly));
4379 break;
4380
4381 case eFQ_pseudogene:
4382
4383 // our pseudogene(s) override(s) any that existed before
4384 if( first_pseudogene ) {
4385 first_pseudogene = false;
4386 x_RemoveQuals(eFQ_pseudogene);
4387 }
4388 x_AddQual(slot, new CFlatStringQVal(val));
4389
4390 break;
4391
4392 case eFQ_regulatory_class:
4393 x_AddRegulatoryClassQual(val, check_qual_syntax);
4394 break;
4395
4396 case eFQ_recombination_class:
4397 x_AddRecombinationClassQual(val, check_qual_syntax);
4398 break;
4399
4400 default:
4401 x_AddQual(slot, new CFlatStringQVal(val));
4402 break;
4403 }
4404 }
4405
4406 if (replace_quals.size()) {
4407 std::sort(replace_quals.begin(), replace_quals.end());
4408 ITERATE (vector<string>, it, replace_quals) {
4409 x_AddQual(eFQ_replace, new CFlatStringQVal(*it));
4410 }
4411 }
4412
4413 // some "map-related" qual adjustments
4414 if( ctx.Config().HideSpecificGeneMaps() && ! x_HasQual(eFQ_map) ) {
4415 if( x_HasQual(eFQ_cyt_map) ) {
4416 x_AddQual(eFQ_map, x_GetQual(eFQ_cyt_map)->second );
4417 } else if( x_HasQual(eFQ_gen_map) ) {
4418 x_AddQual(eFQ_map, x_GetQual(eFQ_gen_map)->second );
4419 } else if( x_HasQual(eFQ_rad_map) ) {
4420 x_AddQual(eFQ_map, x_GetQual(eFQ_rad_map)->second );
4421 }
4422 x_RemoveQuals(eFQ_cyt_map);
4423 x_RemoveQuals(eFQ_gen_map);
4424 x_RemoveQuals(eFQ_rad_map);
4425 }
4426 }
4427
4428 // ----------------------------------------------------------------------------
x_AddRptUnitQual(const string & rpt_unit)4429 void CFeatureItem::x_AddRptUnitQual(
4430 const string& rpt_unit )
4431 // ----------------------------------------------------------------------------
4432 {
4433 if (rpt_unit.empty()) {
4434 return;
4435 }
4436
4437 vector<string> units;
4438
4439 if (NStr::StartsWith(rpt_unit, '(') && NStr::EndsWith(rpt_unit, ')') &&
4440 NStr::Find(rpt_unit, "(", 1) == NPOS) {
4441 string tmp = rpt_unit.substr(1, rpt_unit.length() - 2);
4442 NStr::Split(tmp, ",", units, 0);
4443 } else {
4444 units.push_back(rpt_unit);
4445 }
4446
4447 NON_CONST_ITERATE (vector<string>, it, units) {
4448 if (!it->empty()) {
4449 NStr::TruncateSpacesInPlace(*it);
4450 x_AddQual(eFQ_rpt_unit, new CFlatStringQVal(*it));
4451 }
4452 }
4453 }
4454
4455
4456 // ----------------------------------------------------------------------------
x_AddRptTypeQual(const string & rpt_type,bool check_qual_syntax)4457 void CFeatureItem::x_AddRptTypeQual(
4458 const string& rpt_type,
4459 bool check_qual_syntax )
4460 // ----------------------------------------------------------------------------
4461 {
4462 if (rpt_type.empty()) {
4463 return;
4464 }
4465
4466 string value( rpt_type );
4467 NStr::TruncateSpacesInPlace( value );
4468
4469 vector<string> pieces;
4470 s_SplitCommaSeparatedStringInParens( pieces, value );
4471
4472 ITERATE( vector<string>, it, pieces ) {
4473 if ( ! check_qual_syntax || CGb_qual::IsValidRptTypeValue( *it ) ) {
4474 x_AddQual( eFQ_rpt_type, new CFlatStringQVal( *it, CFormatQual::eUnquoted ) );
4475 }
4476 }
4477 }
4478
4479
s_IsValidRegulatoryClass(const string & type)4480 static bool s_IsValidRegulatoryClass(const string& type)
4481 {
4482 vector<string> valid_types = CSeqFeatData::GetRegulatoryClassList();
4483
4484 FOR_EACH_STRING_IN_VECTOR (itr, valid_types) {
4485 string str = *itr;
4486 if (NStr::Equal (str, type)) return true;
4487 }
4488
4489 return false;
4490 }
4491
s_IsValidRecombinationClass(const string & type)4492 static bool s_IsValidRecombinationClass(const string& type)
4493 {
4494 vector<string> valid_types = CSeqFeatData::GetRecombinationClassList();
4495
4496 FOR_EACH_STRING_IN_VECTOR (itr, valid_types) {
4497 string str = *itr;
4498 if (NStr::Equal (str, type)) return true;
4499 }
4500
4501 return false;
4502 }
4503
4504 // ----------------------------------------------------------------------------
x_AddRecombinationClassQual(const string & recombination_class,bool check_qual_syntax)4505 void CFeatureItem::x_AddRecombinationClassQual(
4506 const string& recombination_class,
4507 bool check_qual_syntax
4508 )
4509 // ----------------------------------------------------------------------------
4510 {
4511 if (recombination_class.empty()) {
4512 return;
4513 }
4514
4515 string recomb_class = recombination_class;
4516
4517 if (NStr::StartsWith(recomb_class, "other:")) {
4518 NStr::TrimPrefixInPlace(recomb_class, "other:");
4519 NStr::TruncateSpacesInPlace(recomb_class);
4520 }
4521 if ( s_IsValidRecombinationClass( recomb_class ) ) {
4522 x_AddQual( eFQ_recombination_class, new CFlatStringQVal(recomb_class));
4523 } else {
4524 x_AddQual( eFQ_recombination_class, new CFlatStringQVal("other"));
4525 x_AddQual( eFQ_seqfeat_note, new CFlatStringQVal(recomb_class));
4526 }
4527 }
4528
4529
4530 // ----------------------------------------------------------------------------
x_AddRegulatoryClassQual(const string & regulatory_class,bool check_qual_syntax)4531 void CFeatureItem::x_AddRegulatoryClassQual(
4532 const string& regulatory_class,
4533 bool check_qual_syntax
4534 )
4535 // ----------------------------------------------------------------------------
4536 {
4537 if (regulatory_class.empty()) {
4538 return;
4539 }
4540
4541 string reg_class = regulatory_class;
4542
4543 if (NStr::StartsWith(reg_class, "other:")) {
4544 NStr::TrimPrefixInPlace(reg_class, "other:");
4545 NStr::TruncateSpacesInPlace(reg_class);
4546 }
4547 if ( s_IsValidRegulatoryClass( reg_class ) ) {
4548 x_AddQual( eFQ_regulatory_class, new CFlatStringQVal(reg_class));
4549 } else if (NStr::CompareNocase(reg_class, "other") == 0 &&
4550 m_Feat.IsSetComment() && !m_Feat.GetComment().empty()) {
4551 x_AddQual( eFQ_regulatory_class, new CFlatStringQVal("other"));
4552 } else {
4553 x_AddQual( eFQ_regulatory_class, new CFlatStringQVal("other"));
4554 x_AddQual( eFQ_seqfeat_note, new CFlatStringQVal(reg_class));
4555 }
4556 }
4557
4558
x_FormatQuals(CFlatFeature & ff) const4559 void CFeatureItem::x_FormatQuals(CFlatFeature& ff) const
4560 {
4561 const CFlatFileConfig& cfg = GetContext()->Config();
4562
4563 if ( cfg.IsFormatFTable() ) {
4564 ff.SetQuals() = m_FTableQuals;
4565 return;
4566 }
4567
4568 ff.SetQuals().reserve(m_Quals.Size());
4569 CFlatFeature::TQuals& qvec = ff.SetQuals();
4570
4571 #define DO_QUAL(x) x_FormatQual(eFQ_##x, #x, qvec)
4572 DO_QUAL(ncRNA_class);
4573 DO_QUAL(regulatory_class);
4574 DO_QUAL(recombination_class);
4575
4576 DO_QUAL(partial);
4577 DO_QUAL(gene);
4578
4579 DO_QUAL(locus_tag);
4580 DO_QUAL(old_locus_tag);
4581
4582 x_FormatQual(eFQ_gene_syn_refseq, "synonym", qvec);
4583 DO_QUAL(gene_syn);
4584
4585 x_FormatQual(eFQ_gene_allele, "allele", qvec);
4586
4587 DO_QUAL(operon);
4588
4589 DO_QUAL(product);
4590
4591 x_FormatQual(eFQ_prot_EC_number, "EC_number", qvec);
4592 x_FormatQual(eFQ_prot_activity, "function", qvec);
4593
4594 DO_QUAL(standard_name);
4595 DO_QUAL(coded_by);
4596 DO_QUAL(derived_from);
4597
4598 x_FormatQual(eFQ_prot_name, "name", qvec);
4599 DO_QUAL(region_name);
4600 DO_QUAL(bond_type);
4601 DO_QUAL(site_type);
4602 DO_QUAL(sec_str_type);
4603 DO_QUAL(heterogen);
4604 DO_QUAL(non_std_residue);
4605
4606 DO_QUAL(tag_peptide);
4607
4608 DO_QUAL(evidence);
4609 DO_QUAL(experiment);
4610 DO_QUAL(inference);
4611 DO_QUAL(exception);
4612 DO_QUAL(ribosomal_slippage);
4613 DO_QUAL(trans_splicing);
4614 DO_QUAL(circular_RNA);
4615 DO_QUAL(artificial_location);
4616
4617 if ( !cfg.GoQualsToNote() ) {
4618 if( cfg.GoQualsEachMerge() ) {
4619 // combine all quals of a given type onto the same qual
4620 x_FormatGOQualCombined(eFQ_go_component, "GO_component", qvec);
4621 x_FormatGOQualCombined(eFQ_go_function, "GO_function", qvec);
4622 x_FormatGOQualCombined(eFQ_go_process, "GO_process", qvec);
4623 } else {
4624 x_FormatQual(eFQ_go_component, "GO_component", qvec);
4625 x_FormatQual(eFQ_go_function, "GO_function", qvec);
4626 x_FormatQual(eFQ_go_process, "GO_process", qvec);
4627 }
4628 }
4629
4630 DO_QUAL(nomenclature);
4631
4632 x_FormatNoteQuals(ff);
4633 DO_QUAL(citation);
4634
4635 DO_QUAL(number);
4636
4637 DO_QUAL(pseudo);
4638 DO_QUAL(pseudogene);
4639 DO_QUAL(selenocysteine);
4640 DO_QUAL(pyrrolysine);
4641
4642 DO_QUAL(codon_start);
4643
4644 DO_QUAL(anticodon);
4645 if ( ! cfg.CodonRecognizedToNote() ) {
4646 DO_QUAL(trna_codons);
4647 }
4648 DO_QUAL(bound_moiety);
4649 DO_QUAL(clone);
4650 DO_QUAL(compare);
4651 // DO_QUAL(cons_splice);
4652 DO_QUAL(direction);
4653 DO_QUAL(function);
4654 DO_QUAL(frequency);
4655 DO_QUAL(EC_number);
4656 x_FormatQual(eFQ_gene_map, "map", qvec);
4657 // In certain modes, cyt_map, gen_map, and rad_map are
4658 // moved to eFQ_gene_map by x_ImportQuals:
4659 DO_QUAL(cyt_map);
4660 DO_QUAL(gen_map);
4661 DO_QUAL(rad_map);
4662 DO_QUAL(estimated_length);
4663 DO_QUAL(gap_type);
4664 DO_QUAL(linkage_evidence);
4665 DO_QUAL(allele);
4666 DO_QUAL(map);
4667 DO_QUAL(mod_base);
4668 DO_QUAL(PCR_conditions);
4669 DO_QUAL(phenotype);
4670 DO_QUAL(rpt_family);
4671 DO_QUAL(rpt_type);
4672 DO_QUAL(rpt_unit);
4673 DO_QUAL(rpt_unit_range);
4674 DO_QUAL(rpt_unit_seq);
4675 DO_QUAL(satellite);
4676 DO_QUAL(mobile_element);
4677 DO_QUAL(mobile_element_type);
4678 DO_QUAL(usedin);
4679
4680 // extra imports, actually...
4681 x_FormatQual(eFQ_illegal_qual, "illegal", qvec);
4682
4683 DO_QUAL(replace);
4684
4685 DO_QUAL(transl_except);
4686 DO_QUAL(transl_table);
4687 DO_QUAL(codon);
4688 DO_QUAL(organism);
4689 DO_QUAL(label);
4690 x_FormatQual(eFQ_cds_product, "product", qvec);
4691 DO_QUAL(UniProtKB_evidence);
4692 DO_QUAL(protein_id);
4693 DO_QUAL(transcript_id);
4694 DO_QUAL(db_xref);
4695 x_FormatQual(eFQ_gene_xref, "db_xref", qvec);
4696 DO_QUAL(mol_wt);
4697 DO_QUAL(calculated_mol_wt);
4698 DO_QUAL(translation);
4699 DO_QUAL(transcription);
4700 DO_QUAL(peptide);
4701
4702 #undef DO_QUAL
4703 }
4704
4705 /*
4706 // check if str2 is a sub string of str1
4707 static bool s_IsRedundant(const string& str1, const string& str2)
4708 {
4709 size_t pos = NPOS;
4710 bool whole = false;
4711 for (pos = NStr::Find(str1, str2); pos != NPOS && !whole; pos += str2.length()) {
4712 whole = IsWholeWord(str1, pos);
4713 }
4714 return (pos != NPOS && whole);
4715 }
4716
4717
4718 // Remove redundant elements that occur twice or as part of other elements.
4719 static void s_PruneNoteQuals(CFlatFeature::TQuals& qvec)
4720 {
4721 if (qvec.empty()) {
4722 return;
4723 }
4724 CFlatFeature::TQuals::iterator it1 = qvec.begin();
4725 while (it1 != qvec.end()) {
4726 CFlatFeature::TQuals::iterator it2 = it1 + 1;
4727 const string& val1 = (*it1)->GetValue();
4728 while (it2 != qvec.end()) {
4729 const string& val2 = (*it2)->GetValue();
4730 if (s_IsRedundant(val1, val2)) {
4731 it2 = qvec.erase(it2);
4732 } else if (s_IsRedundant(val2, val1)) {
4733 break;
4734 } else {
4735 ++it2;
4736 }
4737 }
4738 if (it2 != qvec.end()) {
4739 it1 = qvec.erase(it1);
4740 } else {
4741 ++it1;
4742 }
4743 }
4744 }
4745 */
4746
x_FormatNoteQuals(CFlatFeature & ff) const4747 void CFeatureItem::x_FormatNoteQuals(CFlatFeature& ff) const
4748 {
4749 const CFlatFileConfig& cfg = GetContext()->Config();
4750 CFlatFeature::TQuals qvec;
4751
4752 #define DO_NOTE(x) x_FormatNoteQual(eFQ_##x, GetStringOfFeatQual(eFQ_##x), qvec)
4753 #define DO_NOTE_PREPEND_NEWLINE(x) x_FormatNoteQual(eFQ_##x, GetStringOfFeatQual(eFQ_##x), qvec, IFlatQVal::fPrependNewline )
4754 DO_NOTE(transcript_id_note);
4755 DO_NOTE(gene_desc);
4756
4757 if ( cfg.CodonRecognizedToNote() ) {
4758 DO_NOTE(trna_codons);
4759 }
4760 DO_NOTE(encodes);
4761 DO_NOTE(prot_desc);
4762 DO_NOTE(prot_note);
4763 DO_NOTE(prot_comment);
4764 DO_NOTE(prot_method);
4765 DO_NOTE(maploc);
4766 DO_NOTE(prot_conflict);
4767 DO_NOTE(prot_missing);
4768 DO_NOTE(seqfeat_note);
4769 DO_NOTE(region);
4770 // DO_NOTE(selenocysteine_note);
4771 DO_NOTE(prot_names);
4772 DO_NOTE(bond);
4773 DO_NOTE(site);
4774 // DO_NOTE(rrna_its);
4775 DO_NOTE(xtra_prod_quals);
4776 // DO_NOTE(inference_bad);
4777 DO_NOTE(modelev);
4778 // DO_NOTE(cdd_definition);
4779 // DO_NOTE(tag_peptide);
4780 DO_NOTE_PREPEND_NEWLINE(exception_note);
4781
4782 string notestr;
4783 string suffix;
4784 // bool add_period = false;
4785 bool add_period = true/*fl*/;
4786
4787 s_QualVectorToNote(qvec, true, notestr, suffix, add_period);
4788
4789 if (GetContext()->Config().GoQualsToNote()) {
4790 qvec.clear();
4791 DO_NOTE(go_component);
4792 DO_NOTE(go_function);
4793 DO_NOTE(go_process);
4794 s_QualVectorToNote(qvec, false, notestr, suffix, add_period);
4795 }
4796 s_NoteFinalize(add_period, notestr, ff, eTilde_tilde);
4797
4798 #undef DO_NOTE
4799 #undef DO_NOTE_PREPEND_NEWLINE
4800 }
4801
x_FormatQual(EFeatureQualifier slot,const char * name,CFlatFeature::TQuals & qvec,IFlatQVal::TFlags flags) const4802 void CFeatureItem::x_FormatQual
4803 (EFeatureQualifier slot,
4804 const char* name,
4805 CFlatFeature::TQuals& qvec,
4806 IFlatQVal::TFlags flags) const
4807 {
4808 TQCI it = m_Quals.LowerBound(slot);
4809 TQCI end = m_Quals.end();
4810 while (it != end && it->first == slot) {
4811 it->second->Format(qvec, name, *GetContext(), flags);
4812 ++it;
4813 }
4814 }
4815
4816
x_FormatNoteQual(EFeatureQualifier slot,const CTempString & name,CFlatFeature::TQuals & qvec,IFlatQVal::TFlags flags) const4817 void CFeatureItem::x_FormatNoteQual
4818 (EFeatureQualifier slot,
4819 const CTempString & name,
4820 CFlatFeature::TQuals& qvec,
4821 IFlatQVal::TFlags flags) const
4822 {
4823 flags |= IFlatQVal::fIsNote;
4824
4825 TQCI it = m_Quals.LowerBound(slot);
4826 TQCI end = m_Quals.end();
4827 while (it != end && it->first == slot) {
4828 it->second->Format(qvec, name, *GetContext(), flags);
4829 ++it;
4830 }
4831 }
4832
4833 // This produces one qual out of all the GO quals of the given slot, with their
4834 // values concatenated.
x_FormatGOQualCombined(EFeatureQualifier slot,const CTempString & name,CFlatFeature::TQuals & qvec,TQualFlags flags) const4835 void CFeatureItem::x_FormatGOQualCombined
4836 (EFeatureQualifier slot,
4837 const CTempString & name,
4838 CFlatFeature::TQuals& qvec,
4839 TQualFlags flags) const
4840 {
4841 // copy all the given quals with that name since we need to sort them
4842 vector<CConstRef<CFlatGoQVal> > goQuals;
4843
4844 TQCI it = m_Quals.LowerBound(slot);
4845 TQCI end = m_Quals.end();
4846 while (it != end && it->first == slot) {
4847 goQuals.push_back( CConstRef<CFlatGoQVal>( dynamic_cast<const CFlatGoQVal*>( it->second.GetNonNullPointer() ) ) );
4848 ++it;
4849 }
4850
4851 if( goQuals.empty() ) {
4852 return;
4853 }
4854
4855 stable_sort( goQuals.begin(), goQuals.end(), CGoQualLessThan() );
4856
4857 CFlatFeature::TQuals temp_qvec;
4858
4859 string combined;
4860
4861
4862 string::size_type this_part_beginning_text_string_pos = 0;
4863
4864 // now concatenate their values into the variable "combined"
4865 const string *pLastQualTextString = NULL;
4866 ITERATE( vector<CConstRef<CFlatGoQVal> >, iter, goQuals ) {
4867
4868 // Use thisQualTextString to tell when we have consecutive quals with the
4869 // same text string.
4870 const string *pThisQualTextString = &(*iter)->GetTextString();
4871 if( NULL == pThisQualTextString ) {
4872 continue;
4873 }
4874
4875 (*iter)->Format(temp_qvec, name, *GetContext(), flags);
4876
4877 if( pLastQualTextString == NULL || ! NStr::EqualNocase( *pLastQualTextString, *pThisQualTextString ) ) {
4878 // normal case: each CFlatGoQVal has its own part
4879 if( ! combined.empty() ) {
4880 combined += "; ";
4881 this_part_beginning_text_string_pos = combined.length() - 1;
4882 }
4883 combined += temp_qvec.back()->GetValue();
4884 } else {
4885 // consecutive CFlatGoQVal with the same text string: merge
4886 // (chop off the part up to and including the text string )
4887 const string & new_value = temp_qvec.back()->GetValue();
4888
4889 // let text_string_pos point to the part *after* the text string
4890 SIZE_TYPE post_text_string_pos = NStr::FindNoCase( new_value, *pLastQualTextString );
4891 _ASSERT( post_text_string_pos != NPOS );
4892 post_text_string_pos += pLastQualTextString->length();
4893
4894 // append the new part after the text string, but only
4895 // if it's not a duplicate
4896 string str_to_append = new_value.substr( post_text_string_pos,
4897 (pLastQualTextString->length() - post_text_string_pos) );
4898 if( NStr::Find(combined, str_to_append, this_part_beginning_text_string_pos) == NPOS ) {
4899 combined.append( str_to_append );
4900 }
4901 }
4902
4903 pLastQualTextString = pThisQualTextString;
4904 }
4905 pLastQualTextString = NULL; // just to make sure we don't accidentally use it
4906
4907 // add the final merged CFormatQual
4908 if( ! combined.empty() ) {
4909 const string prefix = " ";
4910 const string suffix = ";";
4911 TFlatQual res(new CFormatQual(name, combined, prefix, suffix, CFormatQual::eQuoted ));
4912 qvec.push_back(res);
4913 }
4914 }
4915
x_GetStringQual(EFeatureQualifier slot) const4916 const CFlatStringQVal* CFeatureItem::x_GetStringQual(EFeatureQualifier slot) const
4917 {
4918 const IFlatQVal* qual = 0;
4919 if ( x_HasQual(slot) ) {
4920 qual = m_Quals.Find(slot)->second;
4921 }
4922 return dynamic_cast<const CFlatStringQVal*>(qual);
4923 }
4924
4925
x_GetStringListQual(EFeatureQualifier slot) const4926 CFlatStringListQVal* CFeatureItem::x_GetStringListQual(EFeatureQualifier slot) const
4927 {
4928 IFlatQVal* qual = 0;
4929 if (x_HasQual(slot)) {
4930 qual = const_cast<IFlatQVal*>(&*m_Quals.Find(slot)->second);
4931 }
4932 return dynamic_cast<CFlatStringListQVal*>(qual);
4933 }
4934
x_GetFlatProductNamesQual(EFeatureQualifier slot) const4935 CFlatProductNamesQVal * CFeatureItem::x_GetFlatProductNamesQual(EFeatureQualifier slot) const
4936 {
4937 IFlatQVal* qual = 0;
4938 if (x_HasQual(slot)) {
4939 qual = const_cast<IFlatQVal*>(&*m_Quals.Find(slot)->second);
4940 }
4941 return dynamic_cast<CFlatProductNamesQVal*>(qual);
4942 }
4943
// Maps each valid mobile_element_type prefix (the text in front of an
// optional ':') to whether it must have more info after the prefix.
// Only "other" currently requires the extra info.
//
// NOTE(review): the keys are listed in case-sensitive byte order (upper-case
// prefixes first).  Presumably the static array map below requires its input
// to be sorted according to its comparator (PCase_CStr) -- confirm before
// inserting a new entry.
typedef SStaticPair<const char *, bool> TMobileElemTypeKey;
static const TMobileElemTypeKey mobile_element_key_to_suffix_required [] = {
    { "LINE", false },
    { "MITE", false },
    { "SINE", false },
    { "insertion sequence", false },
    { "integron", false },
    { "non-LTR retrotransposon", false },
    { "other", true },
    { "retrotransposon", false },
    { "transposon", false }
};

// Lookup structure over the table above, keyed by C string.
typedef CStaticPairArrayMap <const char*, bool, PCase_CStr> TMobileElemTypeMap;
DEFINE_STATIC_ARRAY_MAP(TMobileElemTypeMap, sm_MobileElemTypeKeys, mobile_element_key_to_suffix_required);
4961
4962 // returns whether or not it's valid
s_ValidateMobileElementType(const string & mobile_element_type_value)4963 bool s_ValidateMobileElementType( const string & mobile_element_type_value )
4964 {
4965 if( mobile_element_type_value.empty() ) {
4966 return false;
4967 }
4968
4969 // if there's a colon, we ignore the part after the colon for testing purposes
4970 string::size_type colon_pos = mobile_element_type_value.find( ':' );
4971
4972 const string value_before_colon = ( string::npos == colon_pos
4973 ? mobile_element_type_value
4974 : mobile_element_type_value.substr( 0, colon_pos ) );
4975
4976 TMobileElemTypeMap::const_iterator prefix_info =
4977 sm_MobileElemTypeKeys.find( value_before_colon.c_str() );
4978 if( prefix_info == sm_MobileElemTypeKeys.end() ) {
4979 return false; // prefix not found
4980 }
4981
4982 // check if info required after prefix (colon plus info, actually)
4983 if( prefix_info->second ) {
4984 if( string::npos == colon_pos ) {
4985 return false; // no additional info supplied, even though required
4986 }
4987 }
4988
4989 // all tests passed
4990 return true;
4991 }
4992
4993 class CInStringPred
4994 {
4995 public:
CInStringPred(const string & comparisonString)4996 explicit CInStringPred( const string &comparisonString )
4997 : m_ComparisonString( comparisonString )
4998 { }
4999
operator ()(const string & arg)5000 bool operator()( const string &arg ) {
5001 return NStr::Find( m_ComparisonString, arg ) != NPOS;
5002 }
5003 private:
5004 const string &m_ComparisonString;
5005 };
5006
x_CleanQuals(const CGene_ref * gene_ref)5007 void CFeatureItem::x_CleanQuals(
5008 const CGene_ref* gene_ref )
5009 {
5010 const TGeneSyn* gene_syn =
5011 (gene_ref && gene_ref->IsSetSyn() && !gene_ref->GetSyn().empty() )
5012 ?
5013 &gene_ref->GetSyn()
5014 :
5015 0;
5016 const CBioseqContext& ctx = *GetContext();
5017
5018 if (ctx.Config().DropIllegalQuals()) {
5019 x_DropIllegalQuals();
5020 }
5021
5022 CFlatProductNamesQVal * prot_names = x_GetFlatProductNamesQual(eFQ_prot_names);
5023 const CFlatStringQVal* gene = x_GetStringQual(eFQ_gene);
5024 const CFlatStringQVal* prot_desc = x_GetStringQual(eFQ_prot_desc);
5025 const CFlatStringQVal* standard_name = x_GetStringQual(eFQ_standard_name);
5026 const CFlatStringQVal* seqfeat_note = x_GetStringQual(eFQ_seqfeat_note);
5027
5028 if (gene != NULL) {
5029 const string& gene_name = gene->GetValue();
5030
5031 // /gene same as feature.comment will suppress /note
5032 if (m_Feat.IsSetComment()) {
5033 if (NStr::Equal(gene_name, m_Feat.GetComment())) {
5034 x_RemoveQuals(eFQ_seqfeat_note);
5035 seqfeat_note = NULL;
5036 }
5037 }
5038
5039 // remove protein description that equals the gene name, case sensitive
5040 if (prot_desc != NULL) {
5041 if (s_StrEqualDisregardFinalPeriod(gene_name, prot_desc->GetValue(), NStr::eCase)) {
5042 x_RemoveQuals(eFQ_prot_desc);
5043 prot_desc = NULL;
5044 }
5045 }
5046
5047 // remove prot name if equals gene
5048 if (prot_names != NULL) {
5049
5050 CProt_ref::TName::iterator remove_start = prot_names->SetValue().begin();
5051 ++remove_start; // The "++" is because the first one shouldn't be erased since it's used for the product
5052 CProt_ref::TName::iterator new_end =
5053 remove( remove_start, prot_names->SetValue().end(), gene_name );
5054 prot_names->SetValue().erase( new_end, prot_names->SetValue().end() );
5055
5056 if (prot_names->GetValue().empty()) {
5057 x_RemoveQuals(eFQ_prot_names);
5058 prot_names = NULL;
5059 }
5060 }
5061 }
5062
5063 if (prot_desc != NULL) {
5064 const string& pdesc = prot_desc->GetValue();
5065
5066 // remove prot name if in prot_desc
5067 if (prot_names != NULL) {
5068 CProt_ref::TName::iterator remove_start = prot_names->SetValue().begin();
5069 ++remove_start; // The "++" is because the first one shouldn't be erased since it's used for the product
5070 CProt_ref::TName::iterator new_end =
5071 remove_if( remove_start, prot_names->SetValue().end(),
5072 CInStringPred(pdesc) );
5073 prot_names->SetValue().erase( new_end, prot_names->SetValue().end() );
5074
5075 if (prot_names->GetValue().empty()) {
5076 x_RemoveQuals(eFQ_prot_names);
5077 prot_names = NULL;
5078 }
5079 }
5080 // remove protein description that equals the cds product, case sensitive
5081 const CFlatStringQVal* cds_prod = x_GetStringQual(eFQ_cds_product);
5082 if (cds_prod != NULL) {
5083 if (NStr::Equal(pdesc, cds_prod->GetValue())) {
5084 x_RemoveQuals(eFQ_prot_desc);
5085 prot_desc = NULL;
5086 }
5087 }
5088
5089 // remove protein description that equals the standard name
5090 if (prot_desc != NULL && standard_name != NULL) {
5091 // We use s_StrEqualDisregardFinalPeriod rather than plain NStr::EqualNoCase
5092 // because of, e.g., CU638784
5093 if (s_StrEqualDisregardFinalPeriod(pdesc, standard_name->GetValue(), NStr::eNocase )) {
5094 x_RemoveQuals(eFQ_prot_desc);
5095 prot_desc = NULL;
5096 }
5097 }
5098
5099 // remove protein description that equals a gene synonym
5100 // NC_001823 leave in prot_desc if no cds_product
5101 if (prot_desc != NULL && gene_syn != NULL && cds_prod != NULL) {
5102 ITERATE (TGeneSyn, it, *gene_syn) {
5103 if (!NStr::IsBlank(*it) && pdesc == *it) {
5104 x_RemoveQuals(eFQ_prot_desc);
5105 prot_desc = NULL;
5106 break;
5107 }
5108 }
5109 }
5110 }
5111
5112 // check if need to remove seqfeat_note
5113 // (This generally occurs when it's equal to (or, sometimes, contained in) another qual
5114 if (m_Feat.IsSetComment()) {
5115 const string &feat_comment = m_Feat.GetComment();
5116 const CFlatStringQVal* product = x_GetStringQual(eFQ_product);
5117 const CFlatStringQVal* cds_product = x_GetStringQual(eFQ_cds_product);
5118
5119 if (product != NULL) {
5120 if (NStr::EqualNocase(product->GetValue(), feat_comment)) {
5121 x_RemoveQuals(eFQ_seqfeat_note);
5122 seqfeat_note = NULL;
5123 }
5124 }
5125 if (cds_product != NULL && seqfeat_note != NULL) {
5126 if ( s_StrEqualDisregardFinalPeriod(cds_product->GetValue(), seqfeat_note->GetValue(), NStr::eCase ) ) {
5127 x_RemoveQuals(eFQ_seqfeat_note);
5128 seqfeat_note = NULL;
5129 }
5130 }
5131 // suppress selenocysteine note if already in comment
5132 // if (NStr::Find(feat_comment, "selenocysteine") != NPOS) {
5133 // x_RemoveQuals(eFQ_selenocysteine_note);
5134 // }
5135
5136 // /EC_number same as feat.comment will suppress /note
5137 if( seqfeat_note != NULL ) {
5138 for (TQCI it = x_GetQual(eFQ_EC_number); it != m_Quals.end() && it->first == eFQ_EC_number; ++it) {
5139 const CFlatStringQVal* ec = dynamic_cast<const CFlatStringQVal*>(it->second.GetPointerOrNull());
5140 if (ec != NULL) {
5141 if (NStr::EqualNocase(seqfeat_note->GetValue(), ec->GetValue())) {
5142 x_RemoveQuals(eFQ_seqfeat_note);
5143 seqfeat_note = NULL;
5144 break;
5145 }
5146 }
5147 }
5148 }
5149
5150 // this sort of note provides no additional info (we already know this is a tRNA by other places)
5151 if( feat_comment == "tRNA-" ) {
5152 x_RemoveQuals(eFQ_seqfeat_note);
5153 seqfeat_note = NULL;
5154 }
5155 }
5156
5157 const CFlatStringQVal* note = x_GetStringQual(eFQ_seqfeat_note);
5158 if (note != NULL && standard_name != NULL) {
5159 if (NStr::Equal(note->GetValue(), standard_name->GetValue())) {
5160 x_RemoveQuals(eFQ_seqfeat_note);
5161 note = NULL;
5162 }
5163 }
5164 if ( ! ctx.IsProt() && note != NULL && gene_syn != NULL) {
5165 ITERATE (TGeneSyn, it, *gene_syn) {
5166 if (NStr::EqualNocase(note->GetValue(), *it)) {
5167 x_RemoveQuals(eFQ_seqfeat_note);
5168 note = NULL;
5169 break;
5170 }
5171 }
5172 }
5173 if( note != NULL && prot_desc != NULL ) { // e.g. L07143, U28372
5174 if( NStr::Find(prot_desc->GetValue(), note->GetValue()) != NPOS ) {
5175 x_RemoveQuals(eFQ_seqfeat_note);
5176 note = NULL;
5177 }
5178 }
5179
5180 // if there is a prot_desc, then we don't add a period to seqfeat_note
5181 // (Obviously, this part must come after the part that cleans up
5182 // the prot_descs, otherwise we may think we have a prot_desc, when the
5183 // prot_desc is actually to be removed )
5184 if( note != NULL && x_GetStringQual(eFQ_prot_desc ) ) {
5185 const_cast<CFlatStringQVal*>(note)->SetAddPeriod( false );
5186 }
5187
5188 // hide invalid mobile_element_quals
5189 if( ctx.Config().IsModeRelease() || ctx.Config().IsModeEntrez() ) {
5190
5191 const CFlatStringQVal *mobile_element_type = x_GetStringQual( eFQ_mobile_element_type );
5192 if( NULL != mobile_element_type && ! s_ValidateMobileElementType(mobile_element_type->GetValue()) ) {
5193 x_RemoveQuals( eFQ_mobile_element_type );
5194 }
5195
5196 }
5197
5198 // remove invalid pseudogenes:
5199 {
5200 TQI pseudogene_iter = m_Quals.Find(eFQ_pseudogene);
5201 while( pseudogene_iter != m_Quals.end() &&
5202 pseudogene_iter->first == eFQ_pseudogene )
5203 {
5204 const CFlatStringQVal & qual = dynamic_cast<const CFlatStringQVal &>( *pseudogene_iter->second );
5205 if( s_IsValidPseudoGene(GetContext()->Config().GetMode(), qual.GetValue() ) ) {
5206 // keep valid pseudogene
5207 ++pseudogene_iter;
5208 } else {
5209 // erase invalid pseudogene
5210 TQI pseudogene_iter_to_erase = pseudogene_iter;
5211 ++pseudogene_iter;
5212
5213 m_Quals.Erase(pseudogene_iter_to_erase);
5214 }
5215 }
5216 }
5217
5218 // /pseudogene qual suppresses /pseudo qual if /pseudogene fits certain patterns
5219 if( // ( GetContext()->Config().IsModeRelease() || GetContext()->Config().IsModeEntrez() ) &&
5220 x_HasQual(eFQ_pseudo) && x_HasQual(eFQ_pseudogene) )
5221 {
5222 const CFlatStringQVal* qval = x_GetStringQual(eFQ_pseudogene);
5223 // in this part, always use release-mode validation logic, regardless of actual mode
5224 if( qval && s_IsValidPseudoGene( CFlatFileConfig::eMode_Release, qval->GetValue() ) ) {
5225 x_RemoveQuals(eFQ_pseudo);
5226 }
5227 }
5228 }
5229
5230
// Maps each flat-file qualifier slot (EFeatureQualifier) to the
// CSeqFeatData qualifier that stands for it when qualifier legality is
// checked (see s_GbToSeqFeatQual() and CFeatureItem::x_DropIllegalQuals()).
// Several slots share one target (many map to eQual_note), and some slots
// map to eQual_bad.
//
// NOTE(review): the entries appear to follow the declaration order of
// EFeatureQualifier; presumably the static array map below requires sorted
// keys -- confirm before inserting or reordering entries.
typedef SStaticPair<EFeatureQualifier, CSeqFeatData::EQualifier> TQualPair;
static const TQualPair sc_GbToFeatQualMap[] = {
    { eFQ_none, CSeqFeatData::eQual_bad },
    { eFQ_allele, CSeqFeatData::eQual_allele },
    { eFQ_anticodon, CSeqFeatData::eQual_anticodon },
    { eFQ_artificial_location, CSeqFeatData::eQual_artificial_location },
    { eFQ_bond, CSeqFeatData::eQual_note },
    { eFQ_bond_type, CSeqFeatData::eQual_bond_type },
    { eFQ_bound_moiety, CSeqFeatData::eQual_bound_moiety },
    { eFQ_calculated_mol_wt, CSeqFeatData::eQual_calculated_mol_wt },
    { eFQ_cds_product, CSeqFeatData::eQual_product },
    { eFQ_circular_RNA, CSeqFeatData::eQual_circular_RNA },
    { eFQ_citation, CSeqFeatData::eQual_citation },
    { eFQ_clone, CSeqFeatData::eQual_clone },
    { eFQ_coded_by, CSeqFeatData::eQual_coded_by },
    { eFQ_codon, CSeqFeatData::eQual_codon },
    { eFQ_codon_start, CSeqFeatData::eQual_codon_start },
    { eFQ_compare, CSeqFeatData::eQual_compare },
    { eFQ_cons_splice, CSeqFeatData::eQual_cons_splice },
    { eFQ_cyt_map, CSeqFeatData::eQual_map },
    { eFQ_db_xref, CSeqFeatData::eQual_db_xref },
    { eFQ_derived_from, CSeqFeatData::eQual_derived_from },
    { eFQ_direction, CSeqFeatData::eQual_direction },
    { eFQ_EC_number, CSeqFeatData::eQual_EC_number },
    { eFQ_encodes, CSeqFeatData::eQual_note },
    { eFQ_estimated_length, CSeqFeatData::eQual_estimated_length },
    { eFQ_experiment, CSeqFeatData::eQual_experiment },
    { eFQ_exception, CSeqFeatData::eQual_exception },
    { eFQ_exception_note, CSeqFeatData::eQual_note },
    { eFQ_figure, CSeqFeatData::eQual_note },
    { eFQ_frequency, CSeqFeatData::eQual_frequency },
    { eFQ_function, CSeqFeatData::eQual_function },
    { eFQ_gap_type, CSeqFeatData::eQual_gap_type },
    { eFQ_gene, CSeqFeatData::eQual_gene },
    { eFQ_gene_desc, CSeqFeatData::eQual_note },
    { eFQ_gene_allele, CSeqFeatData::eQual_allele },
    { eFQ_gene_map, CSeqFeatData::eQual_map },
    { eFQ_gene_syn, CSeqFeatData::eQual_note },
    { eFQ_gene_syn_refseq, CSeqFeatData::eQual_note },
    { eFQ_gene_note, CSeqFeatData::eQual_note },
    { eFQ_gene_xref, CSeqFeatData::eQual_db_xref },
    { eFQ_go_component, CSeqFeatData::eQual_note },
    { eFQ_go_function, CSeqFeatData::eQual_note },
    { eFQ_go_process, CSeqFeatData::eQual_note },
    { eFQ_heterogen, CSeqFeatData::eQual_heterogen },
    { eFQ_illegal_qual, CSeqFeatData::eQual_bad },
    { eFQ_inference, CSeqFeatData::eQual_inference },
    { eFQ_label, CSeqFeatData::eQual_label },
    { eFQ_linkage_evidence, CSeqFeatData::eQual_linkage_evidence },
    { eFQ_locus_tag, CSeqFeatData::eQual_locus_tag },
    { eFQ_map, CSeqFeatData::eQual_map },
    { eFQ_maploc, CSeqFeatData::eQual_note },
    { eFQ_mobile_element, CSeqFeatData::eQual_mobile_element },
    { eFQ_mobile_element_type, CSeqFeatData::eQual_mobile_element_type },
    { eFQ_mod_base, CSeqFeatData::eQual_mod_base },
    { eFQ_modelev, CSeqFeatData::eQual_note },
    { eFQ_mol_wt, CSeqFeatData::eQual_calculated_mol_wt },
    { eFQ_ncRNA_class, CSeqFeatData::eQual_ncRNA_class },
    { eFQ_nomenclature, CSeqFeatData::eQual_nomenclature },
    { eFQ_non_std_residue, CSeqFeatData::eQual_non_std_residue },
    { eFQ_number, CSeqFeatData::eQual_number },
    { eFQ_old_locus_tag, CSeqFeatData::eQual_old_locus_tag },
    { eFQ_operon, CSeqFeatData::eQual_operon },
    { eFQ_organism, CSeqFeatData::eQual_organism },
    { eFQ_partial, CSeqFeatData::eQual_partial },
    { eFQ_PCR_conditions, CSeqFeatData::eQual_PCR_conditions },
    { eFQ_peptide, CSeqFeatData::eQual_bad },
    { eFQ_phenotype, CSeqFeatData::eQual_phenotype },
    { eFQ_product, CSeqFeatData::eQual_product },
    { eFQ_product_quals, CSeqFeatData::eQual_product },
    { eFQ_prot_activity, CSeqFeatData::eQual_function },
    { eFQ_prot_comment, CSeqFeatData::eQual_note },
    { eFQ_prot_EC_number, CSeqFeatData::eQual_EC_number },
    { eFQ_prot_note, CSeqFeatData::eQual_note },
    { eFQ_prot_method, CSeqFeatData::eQual_note },
    { eFQ_prot_conflict, CSeqFeatData::eQual_note },
    { eFQ_prot_desc, CSeqFeatData::eQual_note },
    { eFQ_prot_missing, CSeqFeatData::eQual_note },
    { eFQ_prot_name, CSeqFeatData::eQual_name },
    { eFQ_prot_names, CSeqFeatData::eQual_note },
    { eFQ_protein_id, CSeqFeatData::eQual_protein_id },
    { eFQ_pseudo, CSeqFeatData::eQual_pseudo },
    { eFQ_pseudogene, CSeqFeatData::eQual_pseudogene },
    { eFQ_region, CSeqFeatData::eQual_note },
    { eFQ_region_name, CSeqFeatData::eQual_region_name },
    { eFQ_recombination_class, CSeqFeatData::eQual_recombination_class },
    { eFQ_regulatory_class, CSeqFeatData::eQual_regulatory_class },
    { eFQ_replace, CSeqFeatData::eQual_replace },
    { eFQ_ribosomal_slippage, CSeqFeatData::eQual_ribosomal_slippage },
    { eFQ_rpt_family, CSeqFeatData::eQual_rpt_family },
    { eFQ_rpt_type, CSeqFeatData::eQual_rpt_type },
    { eFQ_rpt_unit, CSeqFeatData::eQual_rpt_unit },
    { eFQ_rpt_unit_range, CSeqFeatData::eQual_rpt_unit_range },
    { eFQ_rpt_unit_seq, CSeqFeatData::eQual_rpt_unit_seq },
    { eFQ_rrna_its, CSeqFeatData::eQual_note },
    { eFQ_satellite, CSeqFeatData::eQual_satellite },
    { eFQ_sec_str_type, CSeqFeatData::eQual_sec_str_type },
    // selenocysteine slots are currently disabled:
    // { eFQ_selenocysteine, CSeqFeatData::eQual_note },
    // { eFQ_selenocysteine_note, CSeqFeatData::eQual_note },
    { eFQ_seqfeat_note, CSeqFeatData::eQual_note },
    { eFQ_site, CSeqFeatData::eQual_note },
    { eFQ_site_type, CSeqFeatData::eQual_site_type },
    { eFQ_standard_name, CSeqFeatData::eQual_standard_name },
    { eFQ_tag_peptide, CSeqFeatData::eQual_tag_peptide },
    { eFQ_trans_splicing, CSeqFeatData::eQual_trans_splicing },
    { eFQ_transcription, CSeqFeatData::eQual_bad },
    { eFQ_transcript_id, CSeqFeatData::eQual_note },
    { eFQ_transcript_id_note, CSeqFeatData::eQual_note },
    { eFQ_transl_except, CSeqFeatData::eQual_transl_except },
    { eFQ_transl_table, CSeqFeatData::eQual_transl_table },
    { eFQ_translation, CSeqFeatData::eQual_translation },
    { eFQ_trna_aa, CSeqFeatData::eQual_bad },
    { eFQ_trna_codons, CSeqFeatData::eQual_note },
    { eFQ_UniProtKB_evidence, CSeqFeatData::eQual_UniProtKB_evidence },
    { eFQ_usedin, CSeqFeatData::eQual_usedin },
    { eFQ_xtra_prod_quals, CSeqFeatData::eQual_note }
};
// Lookup structure over the table above.
typedef CStaticPairArrayMap<EFeatureQualifier, CSeqFeatData::EQualifier> TQualMap;
DEFINE_STATIC_ARRAY_MAP(TQualMap, sc_QualMap, sc_GbToFeatQualMap);
5350
s_GbToSeqFeatQual(EFeatureQualifier qual)5351 static CSeqFeatData::EQualifier s_GbToSeqFeatQual(EFeatureQualifier qual)
5352 {
5353 TQualMap::const_iterator it = sc_QualMap.find(qual);
5354 if ( it != sc_QualMap.end() ) {
5355 return it->second;
5356 }
5357 return CSeqFeatData::eQual_bad;
5358 }
5359
5360
x_DropIllegalQuals(void) const5361 void CFeatureItem::x_DropIllegalQuals(void) const
5362 {
5363 const CSeqFeatData& data = m_Feat.GetData();
5364
5365 TQI it = m_Quals.begin();
5366 while ( it != m_Quals.end() ) {
5367 CSeqFeatData::EQualifier qual = s_GbToSeqFeatQual(it->first);
5368 if ( !data.IsLegalQualifier(qual) ) {
5369 it = m_Quals.Erase(it);
5370 } else {
5371 ++it;
5372 }
5373 }
5374 }
5375
x_IsSeqFeatDataFeatureLegal(CSeqFeatData::EQualifier qual)5376 bool CFeatureItem::x_IsSeqFeatDataFeatureLegal( CSeqFeatData::EQualifier qual )
5377 {
5378 const CSeqFeatData& data = m_Feat.GetData();
5379 return data.IsLegalQualifier(qual);
5380 }
5381
5382 // ----------------------------------------------------------------------------
x_AddFTableQuals(CBioseqContext & ctx)5383 void CFeatureItem::x_AddFTableQuals(
5384 CBioseqContext& ctx )
5385 // ----------------------------------------------------------------------------
5386 {
5387 bool pseudo = m_Feat.IsSetPseudo() && m_Feat.GetPseudo();
5388
5389 const CSeqFeatData& data = m_Feat.GetData();
5390
5391 switch ( m_Feat.GetData().Which() ) {
5392 case CSeqFeatData::e_Gene:
5393 pseudo |= x_AddFTableGeneQuals(data.GetGene());
5394 break;
5395 case CSeqFeatData::e_Rna:
5396 x_AddFTableRnaQuals(m_Feat, ctx);
5397 break;
5398 case CSeqFeatData::e_Cdregion:
5399 x_AddFTableCdregionQuals(m_Feat, ctx);
5400 break;
5401 case CSeqFeatData::e_Prot:
5402 x_AddFTableProtQuals(m_Feat);
5403 break;
5404 case CSeqFeatData::e_Region:
5405 x_AddFTableRegionQuals(data.GetRegion());
5406 break;
5407 case CSeqFeatData::e_Bond:
5408 x_AddFTableBondQuals(data.GetBond());
5409 break;
5410 case CSeqFeatData::e_Site:
5411 x_AddFTableSiteQuals(data.GetSite());
5412 break;
5413 case CSeqFeatData::e_Psec_str:
5414 x_AddFTablePsecStrQuals(data.GetPsec_str());
5415 break;
5416 case CSeqFeatData::e_Non_std_residue:
5417 x_AddFTableNonStdQuals(data.GetNon_std_residue());
5418 break;
5419 case CSeqFeatData::e_Het:
5420 x_AddFTablePsecStrQuals(data.GetHet());
5421 break;
5422 case CSeqFeatData::e_Biosrc:
5423 x_AddFTableBiosrcQuals(data.GetBiosrc());
5424 break;
5425 default:
5426 break;
5427 }
5428 if ( pseudo ) {
5429 x_AddFTableQual("pseudo");
5430 }
5431 const CGene_ref* grp = m_Feat.GetGeneXref();
5432 if ( grp != 0 ) {
5433 string gene_label;
5434 if (grp->IsSuppressed()) {
5435 gene_label = "-";
5436 } else {
5437 grp->GetLabel(&gene_label);
5438 }
5439 x_AddFTableQual("gene", gene_label);
5440 }
5441 if ( m_Feat.IsSetComment() && !m_Feat.GetComment().empty() ) {
5442 x_AddFTableQual("note", m_Feat.GetComment());
5443 }
5444 if ( m_Feat.IsSetExp_ev() ) {
5445 string ev;
5446 switch ( m_Feat.GetExp_ev() ) {
5447 case CSeq_feat::eExp_ev_experimental:
5448 ev = "experimental";
5449 break;
5450 case CSeq_feat::eExp_ev_not_experimental:
5451 ev = "not_experimental";
5452 break;
5453 }
5454 x_AddFTableQual("evidence", ev);
5455 }
5456 if ( m_Feat.IsSetExcept_text() && !m_Feat.GetExcept_text().empty() ) {
5457 string exception_text = m_Feat.GetExcept_text();
5458 if ( exception_text == "ribosomal slippage" ) {
5459 x_AddFTableQual("ribosomal_slippage");
5460 }
5461 else if ( exception_text == "trans-splicing" ) {
5462 x_AddFTableQual("trans_splicing");
5463 }
5464 else if ( exception_text == "circular RNA" ) {
5465 x_AddFTableQual("circular_RNA");
5466 }
5467 x_AddFTableQual("exception", m_Feat.GetExcept_text());
5468 } else if ( m_Feat.IsSetExcept() && m_Feat.GetExcept() ) {
5469 x_AddFTableQual("exception");
5470 }
5471 const CSeq_feat_Base::TQual & qual = m_Feat.GetQual(); // must store reference since ITERATE macro evaluates 3rd arg multiple times
5472 const bool hide_ids = GetContext()->Config().HideProteinID();
5473 ITERATE( CSeq_feat::TQual, it, qual ) {
5474 const CGb_qual& qual = **it;
5475 const string& key = qual.IsSetQual() ? qual.GetQual() : kEmptyStr;
5476 const string& val = qual.IsSetVal() ? qual.GetVal() : kEmptyStr;
5477 if ( !key.empty() && !val.empty() ) {
5478 if (hide_ids &&
5479 (key == "protein_id" ||
5480 key == "orig_protein_id" ||
5481 key == "transcript_id" ||
5482 key == "orig_transcript_id"))
5483 {
5484 continue;
5485 }
5486 x_AddFTableQual(key, val);
5487 }
5488 }
5489 if ( m_Feat.IsSetExt() ) {
5490 x_AddFTableExtQuals(m_Feat.GetExt());
5491 }
5492 if ( data.IsGene() ) {
5493 x_AddFTableDbxref(data.GetGene().GetDb());
5494 } else if ( data.IsProt() ) {
5495 x_AddFTableDbxref(data.GetProt().GetDb());
5496 }
5497 x_AddFTableDbxref(m_Feat.GetDbxref());
5498 }
5499
5500 // ----------------------------------------------------------------------------
x_AddFTableExtQuals(const CSeq_feat::TExt & ext)5501 void CFeatureItem::x_AddFTableExtQuals(
5502 const CSeq_feat::TExt& ext )
5503 // ----------------------------------------------------------------------------
5504 {
5505 ITERATE (CUser_object::TData, it, ext.GetData()) {
5506 const CUser_field& field = **it;
5507 if ( !field.IsSetData() ) {
5508 continue;
5509 }
5510 if ( field.GetData().IsObject() ) {
5511 const CUser_object& obj = field.GetData().GetObject();
5512 x_AddQualsExt(obj);
5513 return;
5514 } else if ( field.GetData().IsObjects() ) {
5515 ITERATE (CUser_field::C_Data::TObjects, o, field.GetData().GetObjects()) {
5516 x_AddQualsExt(**o);
5517 }
5518 return;
5519 }
5520 }
5521 if ( ext.IsSetType() && ext.GetType().IsStr() ) {
5522 const string& oid = ext.GetType().GetStr();
5523 if ( oid == "GeneOntology" ) {
5524 ITERATE (CUser_object::TData, uf_it, ext.GetData()) {
5525 const CUser_field& field = **uf_it;
5526 if ( field.IsSetLabel() && field.GetLabel().IsStr() ) {
5527 const string& label = field.GetLabel().GetStr();
5528 string name;
5529 if ( label == "Process" ) {
5530 name = "GO_process";
5531 } else if ( label == "Component" ) {
5532 name = "GO_component";
5533 } else if ( label == "Function" ) {
5534 name = "GO_function";
5535 }
5536 if ( name.empty() ) {
5537 continue;
5538 }
5539
5540 ITERATE (CUser_field::TData::TFields, it, field.GetData().GetFields()) {
5541 if ( (*it)->GetData().IsFields() ) {
5542 CFlatGoQVal(**it).Format(m_FTableQuals, name, *GetContext(), 0);;
5543 }
5544 }
5545 }
5546 }
5547 }
5548 }
5549 }
5550
5551 // ----------------------------------------------------------------------------
x_AddFTableDbxref(const CSeq_feat::TDbxref & dbxref)5552 void CFeatureItem::x_AddFTableDbxref(
5553 const CSeq_feat::TDbxref& dbxref )
5554 // ----------------------------------------------------------------------------
5555 {
5556 ITERATE (CSeq_feat::TDbxref, it, dbxref) {
5557 const CDbtag& dbt = **it;
5558 if ( dbt.IsSetDb() && !dbt.GetDb().empty() &&
5559 dbt.IsSetTag() ) {
5560 const CObject_id& oid = dbt.GetTag();
5561 switch ( oid.Which() ) {
5562 case CObject_id::e_Str:
5563 if ( !oid.GetStr().empty() ) {
5564 x_AddFTableQual("db_xref", dbt.GetDb() + ":" + oid.GetStr());
5565 }
5566 break;
5567 case CObject_id::e_Id:
5568 x_AddFTableQual("db_xref", dbt.GetDb() + ":" + NStr::IntToString(oid.GetId()));
5569 break;
5570 default:
5571 break;
5572 }
5573 }
5574 }
5575 }
5576
5577 // ----------------------------------------------------------------------------
x_AddFTableGeneQuals(const CGene_ref & gene)5578 bool CFeatureItem::x_AddFTableGeneQuals(
5579 const CGene_ref& gene )
5580 // ----------------------------------------------------------------------------
5581 {
5582 if ( gene.IsSetLocus() && !gene.GetLocus().empty() ) {
5583 x_AddFTableQual("gene", gene.GetLocus(), CFormatQual::eTrim_WhitespaceOnly);
5584 }
5585 if ( gene.IsSetAllele() && !gene.GetAllele().empty() ) {
5586 x_AddFTableQual("allele", gene.GetAllele());
5587 }
5588 ITERATE (CGene_ref::TSyn, it, gene.GetSyn()) {
5589 x_AddFTableQual("gene_syn", *it, CFormatQual::eTrim_WhitespaceOnly);
5590 }
5591 if ( gene.IsSetDesc() && !gene.GetDesc().empty() ) {
5592 x_AddFTableQual("gene_desc", gene.GetDesc());
5593 }
5594 if ( gene.IsSetMaploc() && !gene.GetMaploc().empty() ) {
5595 x_AddFTableQual("map", gene.GetMaploc());
5596 }
5597 if ( gene.IsSetLocus_tag() && !gene.GetLocus_tag().empty() ) {
5598 x_AddFTableQual("locus_tag", gene.GetLocus_tag(), CFormatQual::eTrim_WhitespaceOnly);
5599 }
5600
5601 return (gene.IsSetPseudo() && gene.GetPseudo());
5602 }
5603
5604
x_AddFTableAnticodon(const CTrna_ext & trna_ext,CBioseqContext & ctx)5605 void CFeatureItem::x_AddFTableAnticodon(
5606 const CTrna_ext& trna_ext,
5607 CBioseqContext& ctx)
5608 {
5609
5610
5611 if (!trna_ext.IsSetAnticodon()) {
5612 return;
5613 }
5614
5615 const auto& loc = trna_ext.GetAnticodon();
5616 string pos = CFlatSeqLoc(loc, ctx).GetString();
5617
5618 string aa;
5619 switch(trna_ext.GetAa().Which()) {
5620 case CTrna_ext::C_Aa::e_Iupacaa:
5621 aa = GetAAName(trna_ext.GetAa().GetIupacaa(), true);
5622 break;
5623 case CTrna_ext::C_Aa::e_Ncbieaa:
5624 aa = GetAAName(trna_ext.GetAa().GetNcbieaa(), true);
5625 break;
5626 case CTrna_ext::C_Aa::e_Ncbi8aa:
5627 aa = GetAAName(trna_ext.GetAa().GetNcbi8aa(), false);
5628 break;
5629 case CTrna_ext::C_Aa::e_Ncbistdaa:
5630 aa = GetAAName(trna_ext.GetAa().GetNcbistdaa(), false);
5631 break;
5632 default:
5633 break;
5634 }
5635
5636 string seq("---");
5637 try {
5638 CSeqVector seq_vec(loc, ctx.GetScope(), CBioseq_Handle::eCoding_Iupac);
5639 seq_vec.GetSeqData(0, 3, seq);
5640 NStr::ToLower(seq);
5641 }
5642 catch(...)
5643 {}
5644
5645
5646 x_AddFTableQual("anticodon", "(pos:" + pos + ",aa:" + aa + ",seq:" + seq + ")");
5647
5648 }
5649
5650 // ----------------------------------------------------------------------------
x_AddFTableRnaQuals(const CMappedFeat & feat,CBioseqContext & ctx)5651 void CFeatureItem::x_AddFTableRnaQuals(
5652 const CMappedFeat& feat,
5653 CBioseqContext& ctx )
5654 // ----------------------------------------------------------------------------
5655 {
5656 string label;
5657
5658 if ( !feat.GetData().IsRna() ) {
5659 return;
5660 }
5661 const CFlatFileConfig& cfg = GetContext()->Config();
5662 const CSeqFeatData::TRna& rna = feat.GetData().GetRna();
5663 if (rna.IsSetExt()) {
5664 const CRNA_ref::TExt& ext = rna.GetExt();
5665 if (ext.IsName()) {
5666 if (!ext.GetName().empty()) {
5667 x_AddFTableQual("product", ext.GetName());
5668 }
5669 } else if (ext.IsTRNA()) {
5670 feature::GetLabel(feat.GetOriginalFeature(), &label,
5671 feature::fFGL_Content, &ctx.GetScope());
5672 x_AddFTableQual("product", label);
5673 // check for anticodon
5674 x_AddFTableAnticodon(ext.GetTRNA(), ctx);
5675 }
5676 else if ( ext.IsGen() ) {
5677 const CRNA_gen& gen = ext.GetGen();
5678 if ( gen.IsSetClass() ) {
5679 if ( gen.IsLegalClass()) {
5680 x_AddFTableQual("ncRNA_class", gen.GetClass());
5681 }
5682 else {
5683 x_AddFTableQual("ncRNA_class", "other");
5684 x_AddFTableQual("note", gen.GetClass());
5685 }
5686 }
5687
5688 if ( gen.IsSetProduct() ) {
5689 x_AddFTableQual("product", gen.GetProduct());
5690 }
5691 }
5692 }
5693
5694 if ( feat.IsSetProduct() && !cfg.HideProteinID()) {
5695 CBioseq_Handle prod =
5696 ctx.GetScope().GetBioseqHandle(m_Feat.GetProductId());
5697 if ( prod ) {
5698 string id_str = x_SeqIdWriteForTable(*(prod.GetBioseqCore()), ctx.Config().SuppressLocalId(), !(ctx.Config().HideGI() || ctx.Config().IsPolicyFtp()));
5699 if (!NStr::IsBlank(id_str)) {
5700 x_AddFTableQual("transcript_id", id_str);
5701 }
5702 }
5703 }
5704 }
5705
5706
5707 // originally SeqIdWriteForTable in the C Toolkit
5708 // specific Seq-ids are included in the value, in a specific order
x_SeqIdWriteForTable(const CBioseq & seq,bool suppress_local,bool giOK)5709 string CFeatureItem::x_SeqIdWriteForTable(const CBioseq& seq, bool suppress_local, bool giOK)
5710
5711 {
5712 if (!seq.IsSetId()) {
5713 return kEmptyStr;
5714 }
5715 const CSeq_id* accn = NULL;
5716 const CSeq_id* local = NULL;
5717 const CSeq_id* general = NULL;
5718 const CSeq_id* gi = NULL;
5719
5720 ITERATE(CBioseq::TId, it, seq.GetId()) {
5721 switch ((*it)->Which()) {
5722 case CSeq_id::e_Local:
5723 local = it->GetPointer();
5724 break;
5725 case CSeq_id::e_Genbank:
5726 case CSeq_id::e_Embl:
5727 case CSeq_id::e_Pir:
5728 case CSeq_id::e_Swissprot:
5729 case CSeq_id::e_Ddbj:
5730 case CSeq_id::e_Prf:
5731 case CSeq_id::e_Tpg:
5732 case CSeq_id::e_Tpe:
5733 case CSeq_id::e_Tpd:
5734 case CSeq_id::e_Other:
5735 case CSeq_id::e_Gpipe:
5736 accn = it->GetPointer();
5737 break;
5738 case CSeq_id::e_General:
5739 if (!(*it)->GetGeneral().IsSkippable()) {
5740 general = it->GetPointer();
5741 }
5742 break;
5743 case CSeq_id::e_Gi:
5744 gi = it->GetPointer();
5745 break;
5746 default:
5747 break;
5748 }
5749 }
5750
5751 string label;
5752
5753 if (accn != NULL) {
5754 label = accn->AsFastaString();
5755 }
5756
5757 if (general != NULL) {
5758 if (!label.empty()) {
5759 label += "|";
5760 }
5761 label += general->AsFastaString();
5762 }
5763
5764 if (local != NULL && (!suppress_local) && label.empty()) {
5765 label = local->AsFastaString();
5766 }
5767
5768 if (gi != NULL && giOK && label.empty()) {
5769 label = gi->AsFastaString();
5770 }
5771
5772 return label;
5773 }
5774
5775
5776 // ----------------------------------------------------------------------------
x_AddFTableCdregionQuals(const CMappedFeat & feat,CBioseqContext & ctx)5777 void CFeatureItem::x_AddFTableCdregionQuals(
5778 const CMappedFeat& feat,
5779 CBioseqContext& ctx )
5780 // ----------------------------------------------------------------------------
5781 {
5782 CBioseq_Handle prod;
5783 const CFlatFileConfig& cfg = GetContext()->Config();
5784 if ( feat.IsSetProduct() ) {
5785 prod = ctx.GetScope().GetBioseqHandle(feat.GetProductId());
5786 }
5787
5788 const CProt_ref* prot_xref = feat.GetProtXref();
5789 if (prot_xref) {
5790 x_AddFTableProtQuals(*prot_xref);
5791 }
5792 else
5793 if ( prod ) {
5794 CMappedFeat prot_ref = s_GetBestProtFeature(prod);
5795 if ( prot_ref ) {
5796 /// FIXME: we take the first; we want the longest
5797 x_AddFTableProtQuals(prot_ref);
5798 }
5799 }
5800 const CCdregion& cdr = feat.GetData().GetCdregion();
5801 if ( cdr.IsSetFrame() && cdr.GetFrame() > CCdregion::eFrame_one ) {
5802 x_AddFTableQual("codon_start", NStr::IntToString(cdr.GetFrame()));
5803 }
5804 ITERATE (CCdregion::TCode_break, it, cdr.GetCode_break()) {
5805 string pos = CFlatSeqLoc((*it)->GetLoc(), ctx).GetString();
5806 string aa = "OTHER";
5807 switch ((*it)->GetAa().Which()) {
5808 case CCode_break::C_Aa::e_Ncbieaa:
5809 aa = GetAAName((*it)->GetAa().GetNcbieaa(), true);
5810 break;
5811 case CCode_break::C_Aa::e_Ncbi8aa:
5812 aa = GetAAName((*it)->GetAa().GetNcbi8aa(), false);
5813 break;
5814 case CCode_break::C_Aa::e_Ncbistdaa:
5815 aa = GetAAName((*it)->GetAa().GetNcbistdaa(), false);
5816 break;
5817 default:
5818 break;
5819 }
5820 x_AddFTableQual("transl_except", "(pos:" + pos + ",aa:" + aa + ")");
5821 }
5822
5823 if (cdr.IsSetCode()) {
5824 int gcode = cdr.GetCode().GetId();
5825 if (gcode > 1 && gcode != 255) {
5826 x_AddFTableQual("transl_table", NStr::NumericToString(gcode));
5827 }
5828 }
5829
5830 if (prod && !cfg.HideProteinID()) {
5831 string id_str = x_SeqIdWriteForTable(*(prod.GetBioseqCore()), ctx.Config().SuppressLocalId(), !(ctx.Config().HideGI() || ctx.Config().IsPolicyFtp()));
5832 if (!NStr::IsBlank(id_str)) {
5833 x_AddFTableQual("protein_id", id_str);
5834 }
5835 }
5836 }
5837
5838 // ----------------------------------------------------------------------------
x_AddFTableProtQuals(const CMappedFeat & prot)5839 void CFeatureItem::x_AddFTableProtQuals(
5840 const CMappedFeat& prot )
5841 // ----------------------------------------------------------------------------
5842 {
5843 if ( !prot.GetData().IsProt() ) {
5844 return;
5845 }
5846 x_AddFTableProtQuals(prot.GetData().GetProt());
5847
5848 if ( prot.IsSetComment() && !prot.GetComment().empty() ) {
5849 x_AddFTableQual("prot_note", prot.GetComment());
5850 }
5851 }
5852
5853 // ----------------------------------------------------------------------------
x_AddFTableProtQuals(const CProt_ref & prot_ref)5854 void CFeatureItem::x_AddFTableProtQuals(
5855 const CProt_ref& prot_ref)
5856 // ----------------------------------------------------------------------------
5857 {
5858 ITERATE (CProt_ref::TName, it, prot_ref.GetName()) {
5859 if ( !it->empty() ) {
5860 x_AddFTableQual("product", *it);
5861 }
5862 }
5863 if ( prot_ref.IsSetDesc() && !prot_ref.GetDesc().empty() ) {
5864 x_AddFTableQual("prot_desc", prot_ref.GetDesc());
5865 }
5866 ITERATE (CProt_ref::TActivity, it, prot_ref.GetActivity()) {
5867 if ( !it->empty() ) {
5868 x_AddFTableQual("function", *it);
5869 }
5870 }
5871 ITERATE (CProt_ref::TEc, it, prot_ref.GetEc()) {
5872 if ( !it->empty() ) {
5873 x_AddFTableQual("EC_number", *it);
5874 }
5875 }
5876 }
5877
5878 // ----------------------------------------------------------------------------
x_AddFTableRegionQuals(const CSeqFeatData::TRegion & region)5879 void CFeatureItem::x_AddFTableRegionQuals(
5880 const CSeqFeatData::TRegion& region )
5881 // ----------------------------------------------------------------------------
5882 {
5883 if ( !region.empty() ) {
5884 x_AddFTableQual("region", region);
5885 }
5886 }
5887
5888 // ----------------------------------------------------------------------------
x_AddFTableBondQuals(const CSeqFeatData::TBond & bond)5889 void CFeatureItem::x_AddFTableBondQuals(
5890 const CSeqFeatData::TBond& bond )
5891 // ----------------------------------------------------------------------------
5892 {
5893 x_AddFTableQual("bond_type", s_GetBondName(bond));
5894 }
5895
5896 // ----------------------------------------------------------------------------
x_AddFTableSiteQuals(const CSeqFeatData::TSite & site)5897 void CFeatureItem::x_AddFTableSiteQuals(
5898 const CSeqFeatData::TSite& site)
5899 // ----------------------------------------------------------------------------
5900 {
5901 x_AddFTableQual("site_type", s_GetSiteName(site));
5902 }
5903
5904 // ----------------------------------------------------------------------------
x_AddFTablePsecStrQuals(const CSeqFeatData::TPsec_str & psec_str)5905 void CFeatureItem::x_AddFTablePsecStrQuals(
5906 const CSeqFeatData::TPsec_str& psec_str )
5907 // ----------------------------------------------------------------------------
5908 {
5909 const string& psec = CSeqFeatData::ENUM_METHOD_NAME(EPsec_str)()->FindName(
5910 psec_str, true );
5911 x_AddFTableQual("sec_str_type", psec);
5912 }
5913
5914 // ----------------------------------------------------------------------------
x_AddFTablePsecStrQuals(const CSeqFeatData::THet & het)5915 void CFeatureItem::x_AddFTablePsecStrQuals(
5916 const CSeqFeatData::THet& het)
5917 // ----------------------------------------------------------------------------
5918 {
5919 if ( !het.Get().empty() ) {
5920 x_AddFTableQual("heterogen", het.Get());
5921 }
5922 }
5923
5924 // ----------------------------------------------------------------------------
x_AddFTableNonStdQuals(const CSeqFeatData::TNon_std_residue & res)5925 void CFeatureItem::x_AddFTableNonStdQuals(
5926 const CSeqFeatData::TNon_std_residue& res )
5927 // ----------------------------------------------------------------------------
5928 {
5929 if ( !res.empty() ) {
5930 x_AddFTableQual("non_std_residue", res);
5931 }
5932 }
5933
5934
s_GetSubtypeString(const COrgMod::TSubtype & subtype)5935 static const string s_GetSubtypeString(const COrgMod::TSubtype& subtype)
5936 {
5937 switch ( subtype ) {
5938 case COrgMod::eSubtype_strain: return "strain";
5939 case COrgMod::eSubtype_substrain: return "substrain";
5940 case COrgMod::eSubtype_type: return "type";
5941 case COrgMod::eSubtype_subtype: return "subtype";
5942 case COrgMod::eSubtype_variety: return "variety";
5943 case COrgMod::eSubtype_serotype: return "serotype";
5944 case COrgMod::eSubtype_serogroup: return "serogroup";
5945 case COrgMod::eSubtype_serovar: return "serovar";
5946 case COrgMod::eSubtype_cultivar: return "cultivar";
5947 case COrgMod::eSubtype_pathovar: return "pathovar";
5948 case COrgMod::eSubtype_chemovar: return "chemovar";
5949 case COrgMod::eSubtype_biovar: return "biovar";
5950 case COrgMod::eSubtype_biotype: return "biotype";
5951 case COrgMod::eSubtype_group: return "group";
5952 case COrgMod::eSubtype_subgroup: return "subgroup";
5953 case COrgMod::eSubtype_isolate: return "isolate";
5954 case COrgMod::eSubtype_common: return "common";
5955 case COrgMod::eSubtype_acronym: return "acronym";
5956 case COrgMod::eSubtype_dosage: return "dosage";
5957 case COrgMod::eSubtype_nat_host: return "nat_host";
5958 case COrgMod::eSubtype_sub_species: return "sub_species";
5959 case COrgMod::eSubtype_specimen_voucher: return "specimen_voucher";
5960 case COrgMod::eSubtype_authority: return "authority";
5961 case COrgMod::eSubtype_forma: return "forma";
5962 case COrgMod::eSubtype_forma_specialis: return "dosage";
5963 case COrgMod::eSubtype_ecotype: return "ecotype";
5964 case COrgMod::eSubtype_synonym: return "synonym";
5965 case COrgMod::eSubtype_anamorph: return "anamorph";
5966 case COrgMod::eSubtype_teleomorph: return "teleomorph";
5967 case COrgMod::eSubtype_breed: return "breed";
5968 case COrgMod::eSubtype_gb_acronym: return "gb_acronym";
5969 case COrgMod::eSubtype_gb_anamorph: return "gb_anamorph";
5970 case COrgMod::eSubtype_gb_synonym: return "gb_synonym";
5971 case COrgMod::eSubtype_old_lineage: return "old_lineage";
5972 case COrgMod::eSubtype_old_name: return "old_name";
5973 case COrgMod::eSubtype_culture_collection: return "culture_collection";
5974 case COrgMod::eSubtype_bio_material: return "bio_material";
5975 case COrgMod::eSubtype_metagenome_source: return "metagenome_source";
5976 case COrgMod::eSubtype_type_material: return "type_material";
5977 case COrgMod::eSubtype_other: return "note";
5978 default: return kEmptyStr;
5979 }
5980 return kEmptyStr;
5981 }
5982
5983
s_GetSubsourceString(const CSubSource::TSubtype & subtype)5984 static const string s_GetSubsourceString(const CSubSource::TSubtype& subtype)
5985 {
5986 switch ( subtype ) {
5987 case CSubSource::eSubtype_chromosome: return "chromosome";
5988 case CSubSource::eSubtype_map: return "map";
5989 case CSubSource::eSubtype_clone: return "clone";
5990 case CSubSource::eSubtype_subclone: return "subclone";
5991 case CSubSource::eSubtype_haplogroup: return "haplogroup";
5992 case CSubSource::eSubtype_haplotype: return "haplotype";
5993 case CSubSource::eSubtype_genotype: return "genotype";
5994 case CSubSource::eSubtype_sex: return "sex";
5995 case CSubSource::eSubtype_cell_line: return "cell_line";
5996 case CSubSource::eSubtype_cell_type: return "cell_type";
5997 case CSubSource::eSubtype_tissue_type: return "tissue_type";
5998 case CSubSource::eSubtype_clone_lib: return "clone_lib";
5999 case CSubSource::eSubtype_dev_stage: return "dev_stage";
6000 case CSubSource::eSubtype_frequency: return "frequency";
6001 case CSubSource::eSubtype_germline: return "germline";
6002 case CSubSource::eSubtype_rearranged: return "rearranged";
6003 case CSubSource::eSubtype_lab_host: return "lab_host";
6004 case CSubSource::eSubtype_pop_variant: return "pop_variant";
6005 case CSubSource::eSubtype_tissue_lib: return "tissue_lib";
6006 case CSubSource::eSubtype_plasmid_name: return "plasmid_name";
6007 case CSubSource::eSubtype_transposon_name: return "transposon_name";
6008 case CSubSource::eSubtype_insertion_seq_name: return "insertion_seq_name";
6009 case CSubSource::eSubtype_plastid_name: return "plastid_name";
6010 case CSubSource::eSubtype_country: return "country";
6011 case CSubSource::eSubtype_segment: return "segment";
6012 case CSubSource::eSubtype_endogenous_virus_name: return "endogenous_virus_name";
6013 case CSubSource::eSubtype_transgenic: return "transgenic";
6014 case CSubSource::eSubtype_environmental_sample: return "environmental_sample";
6015 case CSubSource::eSubtype_isolation_source: return "isolation_source";
6016 case CSubSource::eSubtype_other: return "note";
6017 default: return kEmptyStr;
6018 }
6019 return kEmptyStr;
6020 }
6021
6022 // ----------------------------------------------------------------------------
x_AddFTableBiosrcQuals(const CBioSource & src)6023 void CFeatureItem::x_AddFTableBiosrcQuals(
6024 const CBioSource& src )
6025 // ----------------------------------------------------------------------------
6026 {
6027 if ( src.IsSetOrg() ) {
6028 const CBioSource::TOrg& org = src.GetOrg();
6029
6030 if ( org.IsSetTaxname() && !org.GetTaxname().empty() ) {
6031 x_AddFTableQual("organism", org.GetTaxname());
6032 }
6033
6034 if ( org.IsSetOrgname() ) {
6035 ITERATE (COrgName::TMod, it, org.GetOrgname().GetMod()) {
6036 if ( (*it)->IsSetSubtype() ) {
6037 string str = s_GetSubtypeString((*it)->GetSubtype());
6038 if ( str.empty() ) {
6039 continue;
6040 }
6041 if ( (*it)->IsSetSubname() && !(*it)->GetSubname().empty() ) {
6042 str += (*it)->GetSubname();
6043 }
6044 x_AddFTableQual(str);
6045 }
6046 }
6047 }
6048 }
6049
6050 ITERATE (CBioSource::TSubtype, it, src.GetSubtype()) {
6051 if ( (*it)->IsSetSubtype() ) {
6052 string str = s_GetSubsourceString((*it)->GetSubtype());
6053 if ( str.empty() ) {
6054 continue;
6055 }
6056 if ( (*it)->IsSetName() ) {
6057 str += (*it)->GetName();
6058 }
6059 x_AddFTableQual(str);
6060 }
6061 }
6062 }
6063
6064
6065 /////////////////////////////////////////////////////////////////////////////
6066 // Source Feature
6067 /////////////////////////////////////////////////////////////////////////////
6068
CSourceFeatureItem(const CMappedFeat & feat,CBioseqContext & ctx,CRef<feature::CFeatTree> ftree,const CSeq_loc * loc)6069 CSourceFeatureItem::CSourceFeatureItem
6070 (const CMappedFeat& feat,
6071 CBioseqContext& ctx,
6072 CRef<feature::CFeatTree> ftree,
6073 const CSeq_loc* loc)
6074 : CFeatureItemBase(feat, ctx, ftree, loc ? loc : &feat.GetLocation()),
6075 m_WasDesc(false), m_IsFocus(false), m_IsSynthetic(false)
6076 {
6077 x_GatherInfo(ctx);
6078 }
6079
6080
GetItemType(void) const6081 IFlatItem::EItem CSourceFeatureItem::GetItemType(void) const
6082 {
6083 return eItem_SourceFeat;
6084 }
6085
x_GatherInfo(CBioseqContext & ctx)6086 void CSourceFeatureItem::x_GatherInfo(CBioseqContext& ctx)
6087 {
6088 const CBioSource& bsrc = GetSource();
6089 if (!bsrc.IsSetOrg()) {
6090 m_Feat = CMappedFeat();
6091 x_SetSkip();
6092 return;
6093 }
6094
6095 m_IsFocus = bsrc.IsSetIs_focus();
6096 if (bsrc.GetOrigin() == CBioSource::eOrigin_synthetic) {
6097 m_IsSynthetic = true;
6098 }
6099 if (!m_IsSynthetic && bsrc.GetOrg().IsSetOrgname()) {
6100 m_IsSynthetic = bsrc.GetOrg().GetOrgname().IsSetDiv() &&
6101 NStr::EqualNocase(bsrc.GetOrg().GetOrgname().GetDiv(), "SYN");
6102 }
6103 if (!m_IsSynthetic && bsrc.IsSetOrg() && bsrc.GetOrg().IsSetTaxname()) {
6104 if (NStr::EqualNocase(bsrc.GetOrg().GetTaxname(), "synthetic construct")) {
6105 m_IsSynthetic = true;
6106 }
6107 }
6108 x_AddQuals(ctx);
6109 }
6110
6111
x_AddQuals(CBioseqContext & ctx)6112 void CSourceFeatureItem::x_AddQuals(CBioseqContext& ctx)
6113 {
6114 const CSeqFeatData& data = m_Feat.GetData();
6115 _ASSERT(data.IsOrg() || data.IsBiosrc());
6116 // add various generic qualifiers...
6117 x_AddQual(eSQ_mol_type,
6118 new CFlatMolTypeQVal(ctx.GetBiomol(), ctx.GetMol()));
6119 x_AddQual(eSQ_submitter_seqid,
6120 new CFlatSubmitterSeqidQVal(ctx.GetTech()));
6121 if (m_Feat.IsSetComment()) {
6122 x_AddQual(eSQ_seqfeat_note, new CFlatStringQVal(m_Feat.GetComment()));
6123 }
6124 if (m_Feat.IsSetTitle()) {
6125 x_AddQual(eSQ_label, new CFlatLabelQVal(m_Feat.GetTitle()));
6126 }
6127 if (m_Feat.IsSetCit()) {
6128 x_AddQual(eSQ_citation, new CFlatPubSetQVal(m_Feat.GetCit()));
6129 }
6130 if (m_Feat.IsSetDbxref()) {
6131 x_AddQual(eSQ_org_xref, new CFlatXrefQVal(m_Feat.GetDbxref()));
6132 }
6133
6134 // add qualifiers from biosource fields
6135 x_AddQuals(data.GetBiosrc(), ctx);
6136 }
6137
6138
s_OrgModToSlot(const COrgMod & om)6139 static ESourceQualifier s_OrgModToSlot(const COrgMod& om)
6140 {
6141 return GetSourceQualOfOrgMod( static_cast<COrgMod::ESubtype>(om.GetSubtype()) );
6142 }
6143
s_GetSpecimenVoucherText(CBioseqContext & ctx,const string & strRawName)6144 static string s_GetSpecimenVoucherText(
6145 CBioseqContext& ctx,
6146 const string& strRawName )
6147 {
6148 if ( ! ctx.Config().DoHTML() ) {
6149 return strRawName;
6150 }
6151
6152 // doesn't COrgMod already have the code for this?
6153 string inst;
6154 string coll;
6155 string id;
6156 {
6157 if( ! COrgMod::ParseStructuredVoucher(strRawName, inst, coll, id) || NStr::IsBlank(inst)) {
6158 return strRawName;
6159 }
6160 if( ! coll.empty() ) {
6161 inst += ':' + coll;
6162 }
6163 }
6164
6165 CInstInfoMap::TVoucherInfoRef voucher_info_ref = CInstInfoMap::GetInstitutionVoucherInfo( inst );
6166 if( voucher_info_ref ) {
6167 CNcbiOstrstream text;
6168
6169 string inst_full_name = COrgMod::GetInstitutionFullName( inst );
6170 if (inst_full_name.empty()) {
6171 inst_full_name = voucher_info_ref->m_InstFullName;
6172 }
6173 text << "<acronym title=\""
6174 << NStr::Replace(inst_full_name, "\"", """)
6175 << "\" class=\"voucher\">"
6176 << inst << "</acronym>"
6177 << ":"
6178 << "<a href=\"" << *voucher_info_ref->m_Links;
6179
6180 if( voucher_info_ref->m_PrependInstitute) {
6181 text << inst;
6182 }
6183 if( voucher_info_ref->m_PrependCollection) {
6184 text << coll;
6185 }
6186 if( voucher_info_ref->m_Prefix != NULL ) {
6187 text << *voucher_info_ref->m_Prefix;
6188 }
6189 if( voucher_info_ref->m_Trim != NULL ) {
6190 const string& trim = *voucher_info_ref->m_Trim;
6191 if (NStr::StartsWith(id, trim)) {
6192 NStr::TrimPrefixInPlace(id, trim);
6193 NStr::TruncateSpacesInPlace(id);
6194 }
6195 }
6196 if( voucher_info_ref->m_PadTo > 0 && voucher_info_ref->m_PadWith != NULL) {
6197 int len_id = id.length();
6198 int len_pad = voucher_info_ref->m_PadWith->length();
6199 while (len_id < voucher_info_ref->m_PadTo) {
6200 text << *voucher_info_ref->m_PadWith;
6201 len_id += len_pad;
6202 }
6203 }
6204 text << id;
6205 if( voucher_info_ref->m_Suffix ) {
6206 text << *voucher_info_ref->m_Suffix;
6207 }
6208 text << "\">" << id << "</a>";
6209 return CNcbiOstrstreamToString(text);
6210 } else {
6211 // fall back on at least getting institution name
6212 const string &inst_full_name = COrgMod::GetInstitutionFullName( inst );
6213 if( ! inst_full_name.empty() ) {
6214 CNcbiOstrstream text;
6215
6216 text << "<acronym title=\"" << NStr::Replace(inst_full_name, "\"", """) << "\" class=\"voucher\">"
6217 << inst << "</acronym>"
6218 << ":" << id;
6219
6220 return CNcbiOstrstreamToString(text);
6221 } else {
6222 // if all else fails, return the string we were initially given
6223 return strRawName;
6224 }
6225 }
6226 }
6227
6228
x_AddQuals(const COrg_ref & org,CBioseqContext & ctx) const6229 void CSourceFeatureItem::x_AddQuals(const COrg_ref& org, CBioseqContext& ctx) const
6230 {
6231 CTempString taxname;
6232 CTempString common;
6233 if ( org.IsSetTaxname() ) {
6234 taxname = org.GetTaxname();
6235 }
6236 if ( taxname.empty() && ctx.Config().NeedOrganismQual() ) {
6237 taxname = "unknown";
6238 if ( org.IsSetCommon() ) {
6239 common = org.GetCommon();
6240 }
6241 }
6242 if ( !taxname.empty() ) {
6243 x_AddQual(eSQ_organism, new CFlatStringQVal(taxname));
6244 }
6245 if ( !common.empty() ) {
6246 x_AddQual(eSQ_common_name, new CFlatStringQVal(common));
6247 }
6248 if ( org.IsSetOrgname() ) {
6249 set<CTempString> ecotypesSeen; // holds the ones we've seen so don't show them again
6250 ecotypesSeen.insert(kEmptyStr); // empty string is always considered seen so we hide it
6251 ITERATE (COrgName::TMod, it, org.GetOrgname().GetMod()) {
6252
6253 const COrgMod& mod = **it;
6254 const string & sSubname = (
6255 mod.CanGetSubname() ? mod.GetSubname() : kEmptyStr );
6256
6257 ESourceQualifier slot = s_OrgModToSlot(**it);
6258 switch( slot ) {
6259 case eSQ_ecotype:
6260 if( ecotypesSeen.find(sSubname) != ecotypesSeen.end() ) {
6261 break; // already seen
6262 }
6263 ecotypesSeen.insert( sSubname );
6264 x_AddQual(slot, new CFlatOrgModQVal(mod));
6265 break;
6266 case eSQ_none:
6267 break;
6268 default:
6269 {
6270 const COrgMod::TSubtype stype = mod.GetSubtype();
6271 if( COrgMod::HoldsInstitutionCode(stype) ) {
6272 CRef<COrgMod> new_mod( new COrgMod(stype,
6273 ( sSubname.empty() ? kEmptyStr : s_GetSpecimenVoucherText(ctx, sSubname) ) ));
6274 x_AddQual(slot, new CFlatOrgModQVal(*new_mod));
6275 } else if (stype == COrgMod::eSubtype_type_material && (! COrgMod::IsINSDCValidTypeMaterial(sSubname))) {
6276 CRef<COrgMod> new_mod( new COrgMod(COrgMod::eSubtype_other,
6277 ( sSubname.empty() ? kEmptyStr : "type_material: " + sSubname ) ));
6278 x_AddQual(eSQ_orgmod_note, new CFlatOrgModQVal(*new_mod));
6279 } else {
6280 x_AddQual(slot, new CFlatOrgModQVal(**it));
6281 }
6282 }
6283 break;
6284 }
6285 }
6286 }
6287 if (!WasDesc() && org.IsSetMod()) {
6288 x_AddQual(eSQ_unstructured, new CFlatStringListQVal(org.GetMod()));
6289 }
6290 if ( org.IsSetDb() ) {
6291 x_AddQual(eSQ_db_xref, new CFlatXrefQVal(org.GetDb()));
6292 }
6293 }
6294
x_AddPcrPrimersQuals(const CBioSource & src,CBioseqContext & ctx) const6295 void CSourceFeatureItem::x_AddPcrPrimersQuals(const CBioSource& src, CBioseqContext& ctx) const
6296 {
6297 if( ! src.IsSetPcr_primers() ) {
6298 return;
6299 }
6300
6301 const CBioSource_Base::TPcr_primers & primers = src.GetPcr_primers();
6302 if( primers.CanGet() ) {
6303 ITERATE( CBioSource_Base::TPcr_primers::Tdata, it, primers.Get() ) {
6304 string primer_value;
6305
6306 bool has_fwd_seq = false;
6307 bool has_rev_seq = false;
6308
6309 if( (*it)->IsSetForward() ) {
6310 const CPCRReaction_Base::TForward &forward = (*it)->GetForward();
6311 if( forward.CanGet() ) {
6312 ITERATE( CPCRReaction_Base::TForward::Tdata, it2, forward.Get() ) {
6313 const string &fwd_name = ( (*it2)->CanGetName() ? (*it2)->GetName().Get() : kEmptyStr );
6314 if( ! fwd_name.empty() ) {
6315 s_AddPcrPrimersQualsAppend( primer_value, "fwd_name: ", fwd_name);
6316 }
6317 const string &fwd_seq = ( (*it2)->CanGetSeq() ? (*it2)->GetSeq().Get() : kEmptyStr );
6318 // NStr::ToLower( fwd_seq );
6319 if( ! fwd_seq.empty() ) {
6320 s_AddPcrPrimersQualsAppend( primer_value, "fwd_seq: ", fwd_seq);
6321 has_fwd_seq = true;
6322 }
6323 }
6324 }
6325 }
6326 if( (*it)->IsSetReverse() ) {
6327 const CPCRReaction_Base::TReverse &reverse = (*it)->GetReverse();
6328 if( reverse.CanGet() ) {
6329 ITERATE( CPCRReaction_Base::TReverse::Tdata, it2, reverse.Get() ) {
6330 const string &rev_name = ((*it2)->CanGetName() ? (*it2)->GetName().Get() : kEmptyStr );
6331 if( ! rev_name.empty() ) {
6332 s_AddPcrPrimersQualsAppend( primer_value, "rev_name: ", rev_name);
6333 }
6334 const string &rev_seq = ( (*it2)->CanGetSeq() ? (*it2)->GetSeq().Get() : kEmptyStr );
6335 // NStr::ToLower( rev_seq ); // do we need this?
6336 if( ! rev_seq.empty() ) {
6337 s_AddPcrPrimersQualsAppend( primer_value, "rev_seq: ", rev_seq);
6338 has_rev_seq = true;
6339 }
6340 }
6341 }
6342 }
6343
6344 if( ! primer_value.empty() ) {
6345 const bool is_in_note = ( ! has_fwd_seq || ! has_rev_seq );
6346 if( is_in_note ) {
6347 primer_value = "PCR_primers=" + primer_value;
6348 }
6349 const ESourceQualifier srcQual = ( is_in_note ? eSQ_pcr_primer_note : eSQ_PCR_primers );
6350 x_AddQual( srcQual, new CFlatStringQVal( primer_value ) );
6351 }
6352 }
6353 }
6354 }
6355
s_SubSourceToSlot(const CSubSource & ss)6356 static ESourceQualifier s_SubSourceToSlot(const CSubSource& ss)
6357 {
6358 return GetSourceQualOfSubSource( static_cast<CSubSource::ESubtype>(ss.GetSubtype()) );
6359 }
6360
x_AddQuals(const CBioSource & src,CBioseqContext & ctx) const6361 void CSourceFeatureItem::x_AddQuals(const CBioSource& src, CBioseqContext& ctx) const
6362 {
6363 // add qualifiers from Org_ref field
6364 if ( src.IsSetOrg() ) {
6365 x_AddQuals(src.GetOrg(), ctx);
6366 }
6367 x_AddQual(eSQ_focus, new CFlatBoolQVal(src.IsSetIs_focus()));
6368
6369
6370 bool insertion_seq_name = false,
6371 plasmid_name = false,
6372 transposon_name = false;
6373
6374 ITERATE (CBioSource::TSubtype, it, src.GetSubtype()) {
6375 ESourceQualifier slot = s_SubSourceToSlot(**it);
6376
6377 switch( slot ) {
6378
6379 case eSQ_insertion_seq_name:
6380 insertion_seq_name = true;
6381 x_AddQual(slot, new CFlatSubSourceQVal(**it));
6382 break;
6383
6384 case eSQ_plasmid_name:
6385 plasmid_name = true;
6386 x_AddQual(slot, new CFlatSubSourceQVal(**it));
6387 break;
6388
6389 case eSQ_transposon_name:
6390 transposon_name = true;
6391 x_AddQual(slot, new CFlatSubSourceQVal(**it));
6392 break;
6393
6394 case eSQ_metagenomic:
6395 x_AddQual( eSQ_metagenomic, new CFlatStringQVal( "metagenomic") );
6396 break;
6397
6398 default:
6399 if (slot != eSQ_none) {
6400 x_AddQual(slot, new CFlatSubSourceQVal(**it));
6401 }
6402 break;
6403 }
6404 }
6405
6406 // Gets direct "pcr-primers" tag from file and adds the quals from that
6407 x_AddPcrPrimersQuals(src, ctx);
6408
6409 // some qualifiers are flags in genome and names in subsource,
6410 // print once with name
6411 CBioSource::TGenome genome = src.GetGenome();
6412 CRef<CFlatOrganelleQVal> organelle(new CFlatOrganelleQVal(genome));
6413 if ( (insertion_seq_name && genome == CBioSource::eGenome_insertion_seq) ||
6414 (plasmid_name && genome == CBioSource::eGenome_plasmid) ||
6415 (transposon_name && genome == CBioSource::eGenome_transposon) ) {
6416 organelle.Reset();
6417 }
6418 if ( organelle ) {
6419 x_AddQual(eSQ_organelle, organelle);
6420 }
6421
6422 if ( !WasDesc() && m_Feat.IsSetComment() ) {
6423 x_AddQual(eSQ_seqfeat_note, new CFlatStringQVal(m_Feat.GetComment()));
6424 }
6425 }
6426
6427
x_FormatQuals(CFlatFeature & ff) const6428 void CSourceFeatureItem::x_FormatQuals(CFlatFeature& ff) const
6429 {
6430 ff.SetQuals().reserve(m_Quals.Size());
6431 CFlatFeature::TQuals& qvec = ff.SetQuals();
6432
6433 #define DO_QUAL(x) x_FormatQual(eSQ_##x, GetStringOfSourceQual(eSQ_##x), qvec)
6434 DO_QUAL(organism);
6435
6436 DO_QUAL(organelle);
6437
6438 DO_QUAL(mol_type);
6439
6440 DO_QUAL(submitter_seqid);
6441
6442 DO_QUAL(strain);
6443 DO_QUAL(substrain);
6444 DO_QUAL(variety);
6445 DO_QUAL(serotype);
6446 DO_QUAL(serovar);
6447 DO_QUAL(cultivar);
6448 DO_QUAL(isolate);
6449 DO_QUAL(isolation_source);
6450 DO_QUAL(spec_or_nat_host);
6451 DO_QUAL(sub_species);
6452
6453 DO_QUAL(specimen_voucher);
6454 DO_QUAL(culture_collection);
6455 DO_QUAL(bio_material);
6456
6457 DO_QUAL(type_material);
6458
6459 DO_QUAL(db_xref);
6460 DO_QUAL(org_xref);
6461
6462 DO_QUAL(chromosome);
6463
6464 DO_QUAL(segment);
6465
6466 DO_QUAL(map);
6467 DO_QUAL(clone);
6468 DO_QUAL(subclone);
6469 DO_QUAL(haplotype);
6470 DO_QUAL(haplogroup);
6471 DO_QUAL(sex);
6472 DO_QUAL(mating_type);
6473 DO_QUAL(cell_line);
6474 DO_QUAL(cell_type);
6475 DO_QUAL(tissue_type);
6476 DO_QUAL(clone_lib);
6477 DO_QUAL(dev_stage);
6478 DO_QUAL(ecotype);
6479
6480 if( ! GetContext()->Config().FrequencyToNote() ) {
6481 DO_QUAL(frequency);
6482 }
6483
6484 DO_QUAL(germline);
6485 DO_QUAL(rearranged);
6486 DO_QUAL(transgenic);
6487 DO_QUAL(environmental_sample);
6488
6489 DO_QUAL(lab_host);
6490 DO_QUAL(pop_variant);
6491 DO_QUAL(tissue_lib);
6492
6493 DO_QUAL(plasmid_name);
6494 DO_QUAL(mobile_element);
6495 DO_QUAL(transposon_name);
6496 DO_QUAL(insertion_seq_name);
6497
6498 DO_QUAL(country);
6499
6500 DO_QUAL(focus);
6501
6502 DO_QUAL(lat_lon);
6503 DO_QUAL(altitude);
6504 DO_QUAL(collection_date);
6505 DO_QUAL(collected_by);
6506 DO_QUAL(identified_by);
6507 DO_QUAL(PCR_primers);
6508 DO_QUAL(metagenome_source);
6509
6510 if ( !GetContext()->Config().SrcQualsToNote() ) {
6511 // some note qualifiers appear as regular quals in GBench or Dump mode
6512 x_FormatGBNoteQuals(ff);
6513 }
6514
6515 DO_QUAL(sequenced_mol);
6516 DO_QUAL(label);
6517 DO_QUAL(usedin);
6518 // DO_QUAL(citation);
6519 #undef DO_QUAL
6520
6521 // Format the rest of the note quals (ones that weren't formatted above)
6522 // as a single note qualifier
6523 x_FormatNoteQuals(ff);
6524 }
6525
6526
x_FormatGBNoteQuals(CFlatFeature & ff) const6527 void CSourceFeatureItem::x_FormatGBNoteQuals(CFlatFeature& ff) const
6528 {
6529 _ASSERT(!GetContext()->Config().SrcQualsToNote());
6530 CFlatFeature::TQuals& qvec = ff.SetQuals();
6531
6532 #define DO_QUAL(x) x_FormatQual(eSQ_##x, GetStringOfSourceQual(eSQ_##x), qvec)
6533 DO_QUAL(metagenomic);
6534 DO_QUAL(linkage_group);
6535
6536 DO_QUAL(type);
6537 DO_QUAL(subtype);
6538 DO_QUAL(serogroup);
6539 DO_QUAL(pathovar);
6540 DO_QUAL(chemovar);
6541 DO_QUAL(biovar);
6542 DO_QUAL(biotype);
6543 DO_QUAL(group);
6544 DO_QUAL(subgroup);
6545 DO_QUAL(common);
6546 DO_QUAL(acronym);
6547 DO_QUAL(dosage);
6548
6549 DO_QUAL(authority);
6550 DO_QUAL(forma);
6551 DO_QUAL(forma_specialis);
6552 DO_QUAL(synonym);
6553 DO_QUAL(anamorph);
6554 DO_QUAL(teleomorph);
6555 DO_QUAL(breed);
6556 if( GetContext()->Config().FrequencyToNote() ) {
6557 DO_QUAL(frequency);
6558 }
6559
6560 // DO_QUAL(metagenome_source),
6561 // DO_QUAL(collection_date);
6562 // DO_QUAL(collected_by);
6563 // DO_QUAL(identified_by);
6564 // DO_QUAL(pcr_primer);
6565 DO_QUAL(genotype);
6566 DO_QUAL(plastid_name);
6567
6568 DO_QUAL(endogenous_virus_name);
6569
6570 DO_QUAL(zero_orgmod);
6571 DO_QUAL(one_orgmod);
6572 DO_QUAL(zero_subsrc);
6573 #undef DO_QUAL
6574 }
6575
6576
6577 /*
6578 static bool s_IsExactAndNonExactMatchOnNoteQuals(CFlatFeature::TQuals& qvec, const string& str)
6579 {
6580 if (qvec.empty()) {
6581 return false;
6582 }
6583
6584 int has_exact = 0;
6585 int non_exact = 0;
6586
6587 CFlatFeature::TQuals::iterator it = qvec.begin();
6588 while (it != qvec.end()) {
6589 const string& val = (*it)->GetValue();
6590 if (NStr::Find(val, str) != NPOS) {
6591 if (NStr::Equal(val, str)) {
6592 has_exact++;
6593 } else {
6594 non_exact++;
6595 }
6596 }
6597 ++it;
6598 }
6599
6600 if (has_exact == 1 && non_exact > 0) return true;
6601 return false;
6602 }
6603 */
6604
6605
6606
x_FormatNoteQuals(CFlatFeature & ff) const6607 void CSourceFeatureItem::x_FormatNoteQuals(CFlatFeature& ff) const
6608 {
6609 CFlatFeature::TQuals qvec;
6610 bool add_period = false;
6611
6612 #define DO_NOTE(x) x_FormatNoteQual(eSQ_##x, #x, qvec)
6613 if (m_WasDesc) {
6614 x_FormatNoteQual(eSQ_seqfeat_note, "note", qvec);
6615 DO_NOTE(orgmod_note);
6616 DO_NOTE(subsource_note);
6617 } else {
6618 DO_NOTE(unstructured);
6619 }
6620
6621 if ( GetContext()->Config().SrcQualsToNote() ) {
6622 DO_NOTE(metagenomic);
6623 DO_NOTE(linkage_group);
6624 DO_NOTE(type);
6625 DO_NOTE(subtype);
6626 DO_NOTE(serogroup);
6627 DO_NOTE(pathovar);
6628 DO_NOTE(chemovar);
6629 DO_NOTE(biovar);
6630 DO_NOTE(biotype);
6631 DO_NOTE(group);
6632 DO_NOTE(subgroup);
6633 DO_NOTE(common);
6634 DO_NOTE(acronym);
6635 DO_NOTE(dosage);
6636
6637 DO_NOTE(authority);
6638 DO_NOTE(forma);
6639 DO_NOTE(forma_specialis);
6640 DO_NOTE(synonym);
6641 DO_NOTE(anamorph);
6642 DO_NOTE(teleomorph);
6643 DO_NOTE(breed);
6644 if( GetContext()->Config().FrequencyToNote() ) {
6645 DO_NOTE(frequency);
6646 }
6647
6648 /*
6649 if (s_IsExactAndNonExactMatchOnNoteQuals(qvec, "metagenomic")) {
6650 x_FormatNoteQual(eSQ_metagenome_source, "metagenomic; derived from metagenome", qvec);
6651 } else {
6652 x_FormatNoteQual(eSQ_metagenome_source, "derived from metagenome", qvec);
6653 }
6654 */
6655
6656 DO_NOTE(genotype);
6657 x_FormatNoteQual(eSQ_plastid_name, "plastid", qvec);
6658
6659 x_FormatNoteQual(eSQ_endogenous_virus_name, "endogenous_virus", qvec);
6660 }
6661 DO_NOTE(pcr_primer_note);
6662
6663 if (!m_WasDesc) {
6664 x_FormatNoteQual(eSQ_seqfeat_note, "note", qvec);
6665 DO_NOTE(orgmod_note);
6666 DO_NOTE(subsource_note);
6667 }
6668
6669 x_FormatNoteQual(eSQ_common_name, "common", qvec);
6670
6671 if ( GetContext()->Config().SrcQualsToNote() ) {
6672 x_FormatNoteQual(eSQ_zero_orgmod, "?", qvec);
6673 x_FormatNoteQual(eSQ_one_orgmod, "?", qvec);
6674 x_FormatNoteQual(eSQ_zero_subsrc, "?", qvec);
6675 }
6676 #undef DO_NOTE
6677
6678 string notestr;
6679 string suffix;
6680
6681 if ( GetSource().IsSetGenome() &&
6682 GetSource().GetGenome() == CBioSource::eGenome_extrachrom ) {
6683 static const string kEOL = "\n";
6684 notestr += "extrachromosomal";
6685 suffix = kEOL;
6686 }
6687
6688 s_QualVectorToNote(qvec, true, notestr, suffix, add_period);
6689 s_NoteFinalize(add_period, notestr, ff, eTilde_note);
6690 }
6691
6692
CSourceFeatureItem(const CBioSource & src,TRange range,CBioseqContext & ctx,CRef<feature::CFeatTree> ftree)6693 CSourceFeatureItem::CSourceFeatureItem
6694 (const CBioSource& src,
6695 TRange range,
6696 CBioseqContext& ctx,
6697 CRef<feature::CFeatTree> ftree)
6698 : CFeatureItemBase(CMappedFeat(), ctx, ftree),
6699 m_WasDesc(true), m_IsFocus(false), m_IsSynthetic(false)
6700 {
6701 if (!src.IsSetOrg()) {
6702 m_Feat = CMappedFeat();
6703 x_SetSkip();
6704 return;
6705 }
6706 x_SetObject(src);
6707
6708 /// We build a fake BioSource feature - even for a source descriptor
6709 CRef<CSeq_feat> feat(new CSeq_feat);
6710 feat->SetData().SetBiosrc(const_cast<CBioSource&>(src));
6711 if ( range.IsWhole() ) {
6712 feat->SetLocation().SetWhole(*ctx.GetPrimaryId());
6713 } else {
6714 CSeq_interval& ival = feat->SetLocation().SetInt();
6715 ival.SetFrom(range.GetFrom());
6716 ival.SetTo(range.GetTo());
6717 ival.SetId(*ctx.GetPrimaryId());
6718 }
6719
6720 CRef<CSeq_annot> an(new CSeq_annot);
6721 an->SetData().SetFtable().push_back(feat);
6722
6723 CRef<CScope> local_scope(new CScope(*CObjectManager::GetInstance()));
6724 CSeq_annot_Handle sah = local_scope->AddSeq_annot(*an);
6725 m_Feat = *(CFeat_CI(sah));
6726 m_Loc = &m_Feat.GetLocation();
6727 x_SetObject(m_Feat.GetOriginalFeature());
6728
6729 x_GatherInfo(ctx);
6730 }
6731
6732
x_FormatQual(ESourceQualifier slot,const CTempString & name,CFlatFeature::TQuals & qvec,IFlatQVal::TFlags flags) const6733 void CSourceFeatureItem::x_FormatQual
6734 (ESourceQualifier slot,
6735 const CTempString& name,
6736 CFlatFeature::TQuals& qvec,
6737 IFlatQVal::TFlags flags) const
6738 {
6739 TQCI it = m_Quals.LowerBound(slot);
6740 TQCI end = m_Quals.end();
6741 while (it != end && it->first == slot) {
6742 const IFlatQVal* qual = it->second;
6743 qual->Format(qvec, name, *GetContext(),
6744 flags | IFlatQVal::fIsSource);
6745 ++it;
6746 }
6747 }
6748
6749
Subtract(const CSourceFeatureItem & other,CScope & scope)6750 void CSourceFeatureItem::Subtract(const CSourceFeatureItem& other, CScope &scope)
6751 {
6752 m_Loc = Seq_loc_Subtract(GetLoc(), other.GetLoc(), CSeq_loc::fStrand_Ignore, &scope);
6753 }
6754
6755
SetLoc(const CSeq_loc & loc)6756 void CSourceFeatureItem::SetLoc(const CSeq_loc& loc)
6757 {
6758 m_Loc.Reset(&loc);
6759 }
6760
6761
6762 // ----------------------------------------------------------------------------
x_GetGbValue(const string & key,string & value) const6763 bool CFeatureItem::x_GetGbValue(
6764 const string& key,
6765 string& value ) const
6766 // ----------------------------------------------------------------------------
6767 {
6768 CSeq_feat::TQual gbQuals = m_Feat.GetQual();
6769 for ( CSeq_feat::TQual::iterator it = gbQuals.begin();
6770 it != gbQuals.end(); ++it )
6771 {
6772 //
6773 // Idea:
6774 // If a gbqual specifying the inference exists then bail out and let
6775 // gbqual processing take care of this qualifier. If no such gbqual is
6776 // present then add a default inference qualifier.
6777 //
6778 if (!(*it)->IsSetQual() || !(*it)->IsSetVal()) {
6779 continue;
6780 }
6781 if ( (*it)->GetQual() == key ) {
6782 value = (*it)->GetVal();
6783 return true;
6784 }
6785 }
6786 return false;
6787 }
6788
x_HasMethodtRNAscanSE(void) const6789 bool CFeatureItem::x_HasMethodtRNAscanSE(void) const
6790 {
6791 // try to make this fast, since it could be checked by every feature.
6792
6793 // try to do cheap checks first
6794
6795 if( ! m_Feat.IsSetExt() ) {
6796 return false;
6797 }
6798 const CUser_object & ext = m_Feat.GetExt();
6799 if( ! ext.IsSetType() || ! ext.IsSetData() ) {
6800 return false;
6801 }
6802 const CUser_object_Base::TType & ext_type = ext.GetType();
6803 if( ! ext_type.IsStr() || ext_type.GetStr() != "CombinedFeatureUserObjects" ) {
6804 return false;
6805 }
6806 const CUser_object::TData & ext_data = ext.GetData();
6807 ITERATE( CUser_object::TData, field_iter, ext_data ) {
6808 const CUser_field & field = **field_iter;
6809 if( ! field.IsSetLabel() || ! field.IsSetData() ) {
6810 continue;
6811 }
6812 const CUser_field::TLabel & field_label = field.GetLabel();
6813 const CUser_field::TData & field_data = field.GetData();
6814 if( ! field_label.IsStr() || ! field_data.IsObject() ||
6815 field_label.GetStr() != "ModelEvidence" )
6816 {
6817 continue;
6818 }
6819 const CUser_object & evidence_object = field_data.GetObject();
6820 if( ! evidence_object.IsSetData() ||
6821 ! evidence_object.IsSetType() ||
6822 ! evidence_object.GetType().IsStr() ||
6823 evidence_object.GetType().GetStr() != "ModelEvidence" )
6824 {
6825 continue;
6826 }
6827 const CUser_object::TData & evidence_data = evidence_object.GetData();
6828 ITERATE( CUser_object::TData, evidence_iter, evidence_data ) {
6829 const CUser_field & evidence_field = **evidence_iter;
6830 if( ! evidence_field.IsSetLabel() ||
6831 ! evidence_field.GetLabel().IsStr() ||
6832 evidence_field.GetLabel().GetStr() != "Method" ||
6833 ! evidence_field.IsSetData() ||
6834 ! evidence_field.GetData().IsStr() ||
6835 evidence_field.GetData().GetStr() != "tRNAscan-SE" )
6836 {
6837 continue;
6838 }
6839 // we found proof of method tRNAscan-SE, so we return true
6840 return true;
6841 }
6842 }
6843
6844 // didn't find any proof of method tRNAscan-SE
6845 return false;
6846 }
6847
6848 END_SCOPE(objects)
6849 END_NCBI_SCOPE
6850
6851