1 /* nucprot.c
2 *
3 * ===========================================================================
4 *
5 * PUBLIC DOMAIN NOTICE
6 * National Center for Biotechnology Information
7 *
8 * This software/database is a "United States Government Work" under the
9 * terms of the United States Copyright Act. It was written as part of
10 * the author's official duties as a United States Government employee and
11 * thus cannot be copyrighted. This software/database is freely available
12 * to the public for use. The National Library of Medicine and the U.S.
13 * Government have not placed any restriction on its use or reproduction.
14 *
15 * Although all reasonable efforts have been taken to ensure the accuracy
16 * and reliability of the software and data, the NLM and the U.S.
17 * Government do not and cannot warrant the performance or results that
18 * may be obtained by using this software or data. The NLM and the U.S.
19 * Government disclaim all warranties, express or implied, including
20 * warranties of performance, merchantability or fitness for any particular
21 * purpose.
22 *
23 * Please cite the author in any work or product based on this material.
24 *
25 * ===========================================================================
26 *
27 * File Name: nucprot.c
28 *
29 * Author: Karl Sirotkin, Hsiu-Chuan Chen
30 *
31 * File Description:
32 * -----------------
33 *
34 * Take a Seq-entry or elements of a Bioseq-set, do orglookup,
35 * protein translation lookup, then make a nucleic acid protein
36 * sequence.
37 *
38 * Get genetic code from either from Taxonomy database or from
39 * guess rules (if the organism is different in the segment set or
40 * Taxserver is not available)
41 *
42 * orglookup includes
43 * - lookup taxname, common name
44 * - get lineage and division
45 * - get genetic codes
46 *
47 * Protein translation lookup includes
48 * - lookup internal and end stop codon
49 * - compare two sequences, one from CdRegion, one from
50 * translation qualifier
51 *
52 * Take our translation when the only diff is start codon.
53 *
54 * This program only assign 3 different level of Bioseqset:
55 * class = nucprot, assign level = 1
56 * class = segset, assign level = 2
57 * class = parts, assign levle = 3
58 *
59 */
60 #include <ncbi_pch.hpp>
61
62 #include "ftacpp.hpp"
63
64 #include <objmgr/scope.hpp>
65 #include <objects/seq/Seq_descr.hpp>
66 #include <objects/seq/Seq_annot.hpp>
67 #include <objects/seqfeat/Imp_feat.hpp>
68 #include <objects/seqfeat/Code_break.hpp>
69 #include <objmgr/util/seq_loc_util.hpp>
70 #include <objects/seqfeat/SeqFeatXref.hpp>
71 #include <objects/seqfeat/Cdregion.hpp>
72 #include <objects/seqfeat/Genetic_code.hpp>
73 #include <objmgr/util/sequence.hpp>
74 #include <objects/seq/Seq_data.hpp>
75 #include <objects/general/Object_id.hpp>
76 #include <objects/general/Dbtag.hpp>
77 #include <objects/seqfeat/Org_ref.hpp>
78 #include <objtools/cleanup/cleanup.hpp>
79 #include <objects/general/User_object.hpp>
80 #include <objects/seqblock/EMBL_block.hpp>
81 #include <objects/seq/GIBB_method.hpp>
82 #include <objtools/edit/cds_fix.hpp>
83
84
85 #include "index.h"
86
87 #include <objtools/flatfile/flatfile_parser.hpp>
88 #include <objtools/flatfile/flatdefn.h>
89
90 #include "ftaerr.hpp"
91 #include "asci_blk.h"
92 #include "add.h"
93 #include "utilfeat.h"
94 #include "nucprot.h"
95 #include "fta_src.h"
96 #include "utilfun.h"
97 #include "indx_blk.h"
98 #include "xgbparint.h"
99
100 #ifdef THIS_FILE
101 # undef THIS_FILE
102 #endif
103 #define THIS_FILE "nucprot.cpp"
104
105 BEGIN_NCBI_SCOPE
106 USING_SCOPE(objects);
107
108 typedef list<CRef<objects::CCode_break> > TCodeBreakList;
109
110 const char *GBExceptionQualVals[] = {
111 "RNA editing",
112 "reasons given in citation",
113 "rearrangement required for product",
114 "annotated by transcript or proteomic data",
115 NULL
116 };
117
118 const char *RSExceptionQualVals[] = {
119 "RNA editing",
120 "reasons given in citation",
121 "ribosomal slippage",
122 "trans-splicing",
123 "alternative processing",
124 "artificial frameshift",
125 "nonconsensus splice site",
126 "rearrangement required for product",
127 "modified codon recognition",
128 "alternative start codon",
129 "unclassified transcription discrepancy",
130 "unclassified translation discrepancy",
131 "mismatches in transcription",
132 "mismatches in translation",
133 NULL
134 };
135
136 /**********************************************************
137 *
138 * bool FindTheQual(qlist, qual):
139 *
140 * Finds qual in the "qlist" return TRUE.
141 * Otherwise, return FALSE.
142 *
143 **********************************************************/
FindTheQual(const objects::CSeq_feat & feat,const Char * qual_to_find)144 static bool FindTheQual(const objects::CSeq_feat& feat, const Char *qual_to_find)
145 {
146 ITERATE(TQualVector, qual, feat.GetQual())
147 {
148 if ((*qual)->IsSetQual() && (*qual)->GetQual() == qual_to_find)
149 return true;
150 }
151
152 return(false);
153 }
154
155 /**********************************************************
156 *
157 * static char* CpTheQualValueNext(qlist, retq, qual):
158 *
159 * Return qual's value if found the "qual" in the
160 * "qlist", and retq points to next available searching
161 * list; Otherwise, return NULL value and retq points
162 * to NULL.
163 *
164 **********************************************************/
CpTheQualValueNext(TQualVector::iterator & cur_qual,const TQualVector::iterator & end_qual,const char * qual)165 static char* CpTheQualValueNext(TQualVector::iterator& cur_qual, const TQualVector::iterator& end_qual,
166 const char *qual)
167 {
168 std::string qvalue;
169
170 for (; cur_qual != end_qual; ++cur_qual) {
171 if (!(*cur_qual)->IsSetQual() || (*cur_qual)->GetQual() != qual || !(*cur_qual)->IsSetVal())
172 continue;
173
174 qvalue = NStr::Sanitize((*cur_qual)->GetVal());
175
176 ++cur_qual;
177 break;
178 }
179
180 char* ret = NULL;
181 if (!qvalue.empty())
182 ret = StringSave(qvalue.c_str());
183
184 return ret;
185 }
186
187 /**********************************************************/
fta_get_genetic_code(ParserPtr pp)188 static Int4 fta_get_genetic_code(ParserPtr pp)
189 {
190 IndexblkPtr ibp;
191 ProtBlkPtr pbp;
192 Int4 gcode;
193
194 if (pp->taxserver != 1)
195 return(0);
196
197 ibp = pp->entrylist[pp->curindx];
198 if (ibp->gc_genomic < 1 && ibp->gc_mito < 1)
199 return(0);
200
201 pbp = pp->pbp;
202 gcode = ibp->gc_genomic;
203 if (pbp->genome == 4 || pbp->genome == 5)
204 gcode = ibp->gc_mito;
205 pp->no_code = false;
206
207 return(gcode);
208 }
209
210 /**********************************************************/
GuessGeneticCode(ParserPtr pp,const objects::CSeq_descr & descrs)211 static void GuessGeneticCode(ParserPtr pp, const objects::CSeq_descr& descrs)
212 {
213 ProtBlkPtr pbp;
214 Int4 gcode = 0;
215
216 pbp = pp->pbp;
217
218 ITERATE(TSeqdescList, descr, descrs.Get())
219 {
220 if (!(*descr)->IsModif())
221 continue;
222
223 ITERATE(objects::CSeqdesc::TModif, modif, (*descr)->GetModif())
224 {
225 if (*modif == objects::eGIBB_mod_mitochondrial ||
226 *modif == objects::eGIBB_mod_kinetoplast) {
227 pbp->genome = 5; /* mitochondrion */
228 break;
229 }
230 }
231 break;
232 }
233
234 ITERATE(TSeqdescList, descr, descrs.Get())
235 {
236 if (!(*descr)->IsSource())
237 continue;
238
239 pbp->genome = (*descr)->GetSource().IsSetGenome() ? (*descr)->GetSource().GetGenome() : 0;
240 break;
241 }
242
243 gcode = fta_get_genetic_code(pp);
244 if (gcode <= 0)
245 return;
246
247 pbp->orig_gcode = gcode;
248 pbp->gcode.SetId(gcode);
249 }
250
251 /**********************************************************/
GetGcode(TEntryList & seq_entries,ParserPtr pp)252 static void GetGcode(TEntryList& seq_entries, ParserPtr pp)
253 {
254 if (pp != NULL && pp->pbp != NULL && !pp->pbp->gcode.IsId()) {
255 ITERATE(TEntryList, entry, seq_entries)
256 {
257 GuessGeneticCode(pp, GetDescrPointer(*(*entry)));
258
259 if (pp->pbp->gcode.IsId())
260 break;
261 }
262 }
263 }
264
265 /**********************************************************/
ProtBlkFree(ProtBlkPtr pbp)266 static void ProtBlkFree(ProtBlkPtr pbp)
267 {
268 pbp->gcode.Reset();
269 pbp->feats.clear();
270
271 pbp->entries.clear();
272 InfoBioseqFree(pbp->ibp);
273 }
274
275 /**********************************************************/
ProtBlkInit(ProtBlkPtr pbp)276 static void ProtBlkInit(ProtBlkPtr pbp)
277 {
278 pbp->biosep = nullptr;
279
280 pbp->gcode.Reset();
281 pbp->segset = false;
282 pbp->genome = 0;
283
284 InfoBioseqPtr ibp = pbp->ibp;
285 if (ibp) {
286 ibp->ids.clear();
287 ibp->locus = NULL;
288 ibp->acnum = NULL;
289 }
290 }
291
292 /**********************************************************/
AssignBioseqSetLevel(TEntryList & seq_entries)293 static void AssignBioseqSetLevel(TEntryList& seq_entries)
294 {
295
296 NON_CONST_ITERATE(TEntryList, entry, seq_entries)
297 {
298 for (CTypeIterator<objects::CBioseq_set> bio_set(Begin(*(*entry))); bio_set; ++bio_set) {
299 switch (bio_set->GetClass()) {
300 case objects::CBioseq_set::eClass_nuc_prot:
301 bio_set->SetLevel(1);
302 break;
303 case objects::CBioseq_set::eClass_segset:
304 bio_set->SetLevel(2);
305 break;
306 case objects::CBioseq_set::eClass_parts:
307 bio_set->SetLevel(3);
308 break;
309 default:
310 ErrPostEx(SEV_INFO, ERR_BIOSEQSETCLASS_NewClass,
311 "BioseqSeq class %d not handled", (int)bio_set->GetClass());
312 }
313 }
314 }
315 }
316
317 /**********************************************************
318 *
319 * static bool check_short_CDS(pp, sfp, err_msg):
320 *
321 * If CDS location contains one of the sequence ends
322 * return TRUE, e.g. it's short do not create protein
323 * bioseq, copy prot-ref to Xref.
324 *
325 **********************************************************/
check_short_CDS(ParserPtr pp,const objects::CSeq_feat & feat,bool err_msg)326 static bool check_short_CDS(ParserPtr pp, const objects::CSeq_feat& feat, bool err_msg)
327 {
328 const objects::CSeq_interval& interval = feat.GetLocation().GetInt();
329 if (interval.GetFrom() == 0 || interval.GetTo() == (TSeqPos)(pp->entrylist[pp->curindx]->bases) - 1)
330 return true;
331
332 if (err_msg) {
333 char* p = location_to_string(feat.GetLocation());
334 ErrPostEx(SEV_WARNING, ERR_CDREGION_ShortProtein,
335 "Short CDS (< 6 aa) located in the middle of the sequence: %s",
336 (p == NULL) ? "" : p);
337 MemFree(p);
338 }
339 return false;
340 }
341
342 /**********************************************************/
GetProtRefSeqId(objects::CBioseq::TId & ids,InfoBioseqPtr ibp,int * num,ParserPtr pp,CScope & scope,objects::CSeq_feat & cds)343 static void GetProtRefSeqId(objects::CBioseq::TId& ids, InfoBioseqPtr ibp, int* num,
344 ParserPtr pp, CScope& scope, objects::CSeq_feat& cds)
345 {
346 const char *r;
347
348 char* protacc;
349 char* p;
350 char* q;
351 ErrSev sev;
352 Uint1 cho;
353 Uint1 ncho;
354 Char str[100];
355
356 if (pp->mode == Parser::EMode::Relaxed) {
357 protacc = CpTheQualValue(cds.SetQual(), "protein_id");
358 if (protacc == NULL || *protacc == '\0') {
359 if (protacc)
360 MemFree(protacc);
361 int protein_id_counter=0;
362 string idLabel;
363 auto pProteinId =
364 edit::GetNewProtId(scope.GetBioseqHandle(cds.GetLocation()), protein_id_counter, idLabel, false);
365 cds.SetProduct().SetWhole().Assign(*pProteinId);
366 ids.push_back(pProteinId);
367 return;
368 }
369 CSeq_id::ParseIDs(ids, protacc);
370 MemFree(protacc);
371 return;
372 }
373
374 if(pp->source == Parser::ESource::USPTO)
375 {
376 protacc = CpTheQualValue(cds.SetQual(), "protein_id");
377 CRef<objects::CSeq_id> pat_seq_id(new objects::CSeq_id);
378 CRef<objects::CPatent_seq_id> pat_id = MakeUsptoPatSeqId(protacc);
379 pat_seq_id->SetPatent(*pat_id);
380 ids.push_back(pat_seq_id);
381 return;
382 }
383
384 const objects::CTextseq_id* text_id = nullptr;
385 ITERATE(TSeqIdList, id, ibp->ids)
386 {
387 if (!(*id)->IsPatent()) {
388 text_id = (*id)->GetTextseq_Id();
389 break;
390 }
391 }
392
393 if (text_id == nullptr)
394 return;
395
396 if (pp->accver == false || (pp->source != Parser::ESource::EMBL &&
397 pp->source != Parser::ESource::NCBI && pp->source != Parser::ESource::DDBJ)) {
398 ++(*num);
399 sprintf(str, "%d", (int)*num);
400
401 CRef<objects::CSeq_id> seq_id(new objects::CSeq_id);
402 std::string& obj_id_str = seq_id->SetLocal().SetStr();
403 obj_id_str = text_id->GetAccession();
404 obj_id_str += "_";
405 obj_id_str += str;
406 ids.push_back(seq_id);
407 return;
408 }
409
410 protacc = CpTheQualValue(cds.SetQual(), "protein_id");
411 if (protacc == NULL || *protacc == '\0') {
412 p = location_to_string(cds.GetLocation());
413 ErrPostEx(SEV_FATAL, ERR_CDREGION_MissingProteinId,
414 "/protein_id qualifier is missing for CDS feature: \"%s\".",
415 (p == NULL) ? "" : p);
416 if (p != NULL)
417 MemFree(p);
418 if (protacc != NULL)
419 MemFree(protacc);
420
421 return;
422 }
423
424 if (pp->mode == Parser::EMode::HTGSCON) {
425 MemFree(protacc);
426 ++(*num);
427 sprintf(str, "%d", (int)*num);
428
429 CRef<objects::CSeq_id> seq_id(new objects::CSeq_id);
430 std::string& obj_id_str = seq_id->SetLocal().SetStr();
431 obj_id_str = text_id->GetAccession();
432 obj_id_str += "_";
433 obj_id_str += str;
434 ids.push_back(seq_id);
435 return;
436 }
437
438 p = StringChr(protacc, '.');
439 if (p == NULL || *(p + 1) == '\0') {
440 p = location_to_string(cds.GetLocation());
441 ErrPostEx(SEV_FATAL, ERR_CDREGION_MissingProteinVersion,
442 "/protein_id qualifier has missing version for CDS feature: \"%s\".",
443 (p == NULL) ? "" : p);
444 if (p != NULL)
445 MemFree(p);
446 MemFree(protacc);
447 return;
448 }
449
450 for (q = p + 1; *q >= '0' && *q <= '9';)
451 q++;
452 if (*q != '\0') {
453 p = location_to_string(cds.GetLocation());
454 ErrPostEx(SEV_FATAL, ERR_CDREGION_IncorrectProteinVersion,
455 "/protein_id qualifier \"%s\" has incorrect version for CDS feature: \"%s\".",
456 protacc, (p == NULL) ? "" : p);
457 if (p != NULL)
458 MemFree(p);
459 MemFree(protacc);
460 return;
461 }
462 q = p + 1;
463 *p = '\0';
464 cho = GetProtAccOwner(protacc);
465 if (cho == 0) {
466 p = location_to_string(cds.GetLocation());
467 ErrPostEx(SEV_FATAL, ERR_CDREGION_IncorrectProteinAccession,
468 "/protein_id qualifier has incorrect accession \"%s\" for CDS feature: \"%s\".",
469 protacc, (p == NULL) ? "" : p);
470 if (p != NULL)
471 MemFree(p);
472 MemFree(protacc);
473 return;
474 }
475
476 r = NULL;
477 ncho = cho;
478 if (pp->source == Parser::ESource::EMBL && cho != objects::CSeq_id::e_Embl && cho != objects::CSeq_id::e_Tpe)
479 r = "EMBL";
480 else if (pp->source == Parser::ESource::DDBJ && cho != objects::CSeq_id::e_Ddbj &&
481 cho != objects::CSeq_id::e_Tpd)
482 r = "DDBJ";
483 else if (pp->source == Parser::ESource::NCBI && cho != objects::CSeq_id::e_Genbank &&
484 cho != objects::CSeq_id::e_Tpg)
485 r = "NCBI";
486 else {
487 ncho = GetNucAccOwner(text_id->GetAccession().c_str(), pp->entrylist[pp->curindx]->is_tpa);
488 if (ncho == objects::CSeq_id::e_Tpe && cho == objects::CSeq_id::e_Embl)
489 cho = ncho;
490 }
491
492 if (r != NULL || ncho != cho) {
493 p = location_to_string(cds.GetLocation());
494 if (pp->ign_prot_src == false)
495 sev = SEV_FATAL;
496 else
497 sev = SEV_WARNING;
498 if (r != NULL)
499 ErrPostEx(sev, ERR_CDREGION_IncorrectProteinAccession,
500 "/protein_id qualifier has incorrect accession prefix \"%s\" for source %s for CDS feature: \"%s\".",
501 protacc, r, (p == NULL) ? "" : p);
502 else
503 ErrPostEx(sev, ERR_CDREGION_IncorrectProteinAccession,
504 "Found mismatching TPA and non-TPA nucleotides's and protein's accessions in one nuc-prot set. Nuc = \"%s\", prot = \"%s\".",
505 text_id->GetAccession().c_str(), protacc);
506 if (p != NULL)
507 MemFree(p);
508 if (pp->ign_prot_src == false) {
509 MemFree(protacc);
510 return;
511 }
512 }
513
514 CRef<objects::CSeq_id> seq_id(new objects::CSeq_id);
515
516 CRef<objects::CTextseq_id> new_text_id(new objects::CTextseq_id);
517 new_text_id->SetAccession(protacc);
518 new_text_id->SetVersion(atoi(q));
519 SetTextId(cho, *seq_id, *new_text_id);
520
521 ids.push_back(seq_id);
522
523 if ((pp->source != Parser::ESource::DDBJ && pp->source != Parser::ESource::EMBL) ||
524 pp->entrylist[pp->curindx]->is_wgs == false ||
525 StringLen(ibp->acnum) == 8) {
526 return;
527 }
528
529 seq_id.Reset(new objects::CSeq_id);
530 seq_id->SetGeneral().SetTag().SetStr(protacc);
531
532 std::string& db = seq_id->SetGeneral().SetDb();
533 if (pp->entrylist[pp->curindx]->is_tsa != false)
534 db = "TSA:";
535 else if (pp->entrylist[pp->curindx]->is_tls != false)
536 db = "TLS:";
537 else
538 db = "WGS:";
539
540 db.append(ibp->acnum, ibp->acnum + 4);
541 ids.push_back(seq_id);
542 }
543
544 /**********************************************************/
stripStr(char * base,char * str)545 static char* stripStr(char* base, char* str)
546 {
547 char* bptr;
548 char* eptr;
549
550 if (base == NULL || str == NULL)
551 return(NULL);
552 bptr = StringStr(base, str);
553 if (bptr != NULL) {
554 eptr = bptr + StringLen(str);
555 fta_StringCpy(bptr, eptr);
556 }
557
558 return(base);
559 }
560
561 /**********************************************************/
StripCDSComment(objects::CSeq_feat & feat)562 static void StripCDSComment(objects::CSeq_feat& feat)
563 {
564 static const char *strA[] = {
565 "Author-given protein sequence is in conflict with the conceptual translation.",
566 "direct peptide sequencing.",
567 "Method: conceptual translation with partial peptide sequencing.",
568 "Method: sequenced peptide, ordered by overlap.",
569 "Method: sequenced peptide, ordered by homology.",
570 "Method: conceptual translation supplied by author.",
571 NULL
572 };
573 const char **b;
574 char* pchComment;
575 char* comment;
576
577 if (!feat.IsSetComment())
578 return;
579
580 comment = StringSave(feat.GetComment().c_str());
581 pchComment = tata_save(comment);
582 if (comment != NULL && comment[0] != 0)
583 feat.SetComment(comment);
584 else
585 feat.ResetComment();
586
587 MemFree(comment);
588
589 for (b = strA; *b != NULL; b++) {
590 pchComment = stripStr(pchComment, (char*)*b);
591 }
592 comment = tata_save(pchComment);
593 if (comment != NULL && *comment != '\0')
594 ShrinkSpaces(comment);
595 MemFree(pchComment);
596
597 if (comment != NULL && comment[0] != 0)
598 feat.SetComment(comment);
599 else
600 feat.ResetComment();
601
602 MemFree(comment);
603 }
604
605 /**********************************************************
606 *
607 * static SeqAnnotPtr GetProtRefAnnot(ibp, sfp, seqid,
608 * length):
609 *
610 * "product" qualifier ==> prp->name.
611 * "note" or "gene" or "standard_name" or
612 * "label" ==> prp->desc.
613 * "EC_number" qualifier ==> a ValNodePtr in
614 * ProtRefPtr, prp->ec.
615 * "function" qualifier ==> a ValNodePtr in
616 * ProtRefPtr, prp->activity.
617 *
618 **********************************************************/
GetProtRefAnnot(InfoBioseqPtr ibp,objects::CSeq_feat & feat,objects::CBioseq & bioseq)619 static void GetProtRefAnnot(InfoBioseqPtr ibp, objects::CSeq_feat& feat,
620 objects::CBioseq& bioseq)
621 {
622 char* qval;
623 char* prid;
624 bool partial5;
625 bool partial3;
626
627 std::set<string> names;
628
629 for (;;) {
630 qval = GetTheQualValue(feat.SetQual(), "product");
631 if (qval == NULL)
632 break;
633
634 std::string qval_str = qval;
635 MemFree(qval);
636 qval = NULL;
637
638 if (qval_str[0] == '\'')
639 qval_str = qval_str.substr(1, qval_str.size() - 2);
640
641 names.insert(qval_str);
642 }
643
644 if (names.empty()) {
645 qval = CpTheQualValue(feat.GetQual(), "gene");
646 if (qval == NULL)
647 qval = CpTheQualValue(feat.GetQual(), "standard_name");
648 if (qval == NULL)
649 qval = CpTheQualValue(feat.GetQual(), "label");
650 }
651
652 CRef<objects::CProt_ref> prot_ref(new objects::CProt_ref);
653
654 if (names.empty() && qval == NULL) {
655 prid = CpTheQualValue(feat.GetQual(), "protein_id");
656 qval = location_to_string(feat.GetLocation());
657 if (prid != NULL) {
658 ErrPostEx(SEV_WARNING, ERR_PROTREF_NoNameForProtein,
659 "No product, gene, or standard_name qualifier found for protein \"%s\" on CDS:%s",
660 prid, (qval == NULL) ? "" : qval);
661 MemFree(prid);
662 }
663 else
664 ErrPostEx(SEV_WARNING, ERR_PROTREF_NoNameForProtein,
665 "No product, gene, or standard_name qualifier found on CDS:%s",
666 (qval == NULL) ? "" : qval);
667 MemFree(qval);
668 prot_ref->SetDesc("unnamed protein product");
669 }
670 else {
671 if (!names.empty()) {
672 prot_ref->SetName().clear();
673 std::copy(names.begin(), names.end(), std::back_inserter(prot_ref->SetName()));
674 names.clear();
675 }
676 else
677 prot_ref->SetDesc(qval);
678 }
679
680 while ((qval = GetTheQualValue(feat.SetQual(), "EC_number")) != NULL) {
681 prot_ref->SetEc().push_back(qval);
682 }
683
684 while ((qval = GetTheQualValue(feat.SetQual(), "function")) != NULL) {
685 prot_ref->SetActivity().push_back(qval);
686 }
687
688 if (feat.GetQual().empty())
689 feat.ResetQual();
690
691 CRef<objects::CSeq_feat> feat_prot(new objects::CSeq_feat);
692 feat_prot->SetData().SetProt(*prot_ref);
693 feat_prot->SetLocation(*fta_get_seqloc_int_whole(*bioseq.SetId().front(), bioseq.GetLength()));
694
695 if (feat.IsSetPartial())
696 feat_prot->SetPartial(feat.GetPartial());
697
698 partial5 = feat.GetLocation().IsPartialStart(objects::eExtreme_Biological);
699 partial3 = feat.GetLocation().IsPartialStop(objects::eExtreme_Biological);
700
701 if (partial5 || partial3) {
702 objects::CSeq_interval& interval = feat_prot->SetLocation().SetInt();
703
704 if (partial5) {
705 interval.SetFuzz_from().SetLim(objects::CInt_fuzz::eLim_lt);
706 }
707
708 if (partial3) {
709 interval.SetFuzz_to().SetLim(objects::CInt_fuzz::eLim_gt);
710 }
711 }
712
713 CRef<objects::CSeq_annot> annot(new objects::CSeq_annot);
714 annot->SetData().SetFtable().push_back(feat_prot);
715
716 bioseq.SetAnnot().push_back(annot);
717 }
718
719 /**********************************************************/
GetProtRefDescr(objects::CSeq_feat & feat,Uint1 method,const objects::CBioseq & bioseq,TSeqdescList & descrs)720 static void GetProtRefDescr(objects::CSeq_feat& feat, Uint1 method, const objects::CBioseq& bioseq, TSeqdescList& descrs)
721 {
722 char* p;
723 char* q;
724 bool partial5;
725 bool partial3;
726 Int4 diff_lowest;
727 Int4 diff_current;
728 Int4 cdslen;
729 Int4 orglen;
730 Uint1 strand;
731
732 strand = feat.GetLocation().GetStrand();
733
734 std::string organism;
735
736 ITERATE(TSeqdescList, desc, bioseq.GetDescr().Get())
737 {
738 if (!(*desc)->IsSource())
739 continue;
740
741 const objects::CBioSource& source = (*desc)->GetSource();
742 if (source.IsSetOrg() && source.GetOrg().IsSetTaxname()) {
743 organism = source.GetOrg().GetTaxname();
744 break;
745 }
746 }
747
748 if (!fta_if_special_org(organism.c_str())) {
749 diff_lowest = -1;
750 cdslen = objects::sequence::GetLength(feat.GetLocation(), &GetScope());
751
752 ITERATE(objects::CBioseq::TAnnot, annot, bioseq.GetAnnot())
753 {
754 if (!(*annot)->IsFtable())
755 continue;
756
757 bool found = false;
758 ITERATE(TSeqFeatList, cur_feat, (*annot)->GetData().GetFtable())
759 {
760 if (!(*cur_feat)->IsSetData() || !(*cur_feat)->GetData().IsBiosrc())
761 continue;
762
763 orglen = objects::sequence::GetLength((*cur_feat)->GetLocation(), &GetScope());
764
765 const objects::CBioSource& source = (*cur_feat)->GetData().GetBiosrc();
766 if (!source.IsSetOrg() || !source.GetOrg().IsSetTaxname() ||
767 strand != (*cur_feat)->GetLocation().GetStrand())
768 continue;
769
770 objects::sequence::ECompare cmp_res = objects::sequence::Compare(feat.GetLocation(), (*cur_feat)->GetLocation(), nullptr, objects::sequence::fCompareOverlapping);
771 if (cmp_res == objects::sequence::eNoOverlap)
772 continue;
773
774 if (cmp_res == objects::sequence::eSame) {
775 organism = source.GetOrg().GetTaxname();
776 break;
777 }
778
779 if (cmp_res == objects::sequence::eContained) {
780 diff_current = orglen - cdslen;
781 if (diff_lowest == -1 || diff_current < diff_lowest) {
782 diff_lowest = diff_current;
783 organism = source.GetOrg().GetTaxname();
784 }
785 }
786 else if (cmp_res == objects::sequence::eOverlap && diff_lowest < 0)
787 organism = source.GetOrg().GetTaxname();
788 }
789
790 if (found)
791 break;
792 }
793 }
794
795 CRef<objects::CMolInfo> mol_info(new objects::CMolInfo);
796 mol_info->SetBiomol(objects::CMolInfo::eBiomol_peptide); /* peptide */
797
798 partial5 = feat.GetLocation().IsPartialStart(objects::eExtreme_Biological);
799 partial3 = feat.GetLocation().IsPartialStop(objects::eExtreme_Biological);
800
801 if (partial5 && partial3)
802 mol_info->SetCompleteness(objects::CMolInfo::eCompleteness_no_ends);
803 else if (partial5)
804 mol_info->SetCompleteness(objects::CMolInfo::eCompleteness_no_left);
805 else if (partial3)
806 mol_info->SetCompleteness(objects::CMolInfo::eCompleteness_no_right);
807 else if (feat.IsSetPartial() && feat.GetPartial())
808 mol_info->SetCompleteness(objects::CMolInfo::eCompleteness_partial);
809
810 if (method == objects::eGIBB_method_concept_trans_a)
811 mol_info->SetTech(objects::CMolInfo::eTech_concept_trans_a);
812 else if (method == objects::eGIBB_method_concept_trans)
813 mol_info->SetTech(objects::CMolInfo::eTech_concept_trans);
814
815 CRef<objects::CSeqdesc> descr(new objects::CSeqdesc);
816 descr->SetMolinfo(*mol_info);
817 descrs.push_back(descr);
818
819 p = NULL;
820 ITERATE(TQualVector, qual, feat.GetQual())
821 {
822 if ((*qual)->GetQual() != "product")
823 continue;
824
825 const std::string& val_str = (*qual)->GetVal();
826 if (p == NULL) {
827 p = StringSave((*qual)->GetVal().c_str());
828 continue;
829 }
830
831 q = (char*)MemNew(StringLen(p) + val_str.size() + 3);
832 StringCpy(q, p);
833 StringCat(q, "; ");
834 StringCat(q, val_str.c_str());
835 MemFree(p);
836 p = q;
837 }
838
839 if (p == NULL)
840 p = CpTheQualValue(feat.GetQual(), "gene");
841
842 if (p == NULL)
843 p = CpTheQualValue(feat.GetQual(), "label");
844
845 if (p == NULL)
846 p = CpTheQualValue(feat.GetQual(), "standard_name");
847
848 if (p == NULL)
849 p = StringSave("unnamed protein product");
850 else {
851 for (q = p; *q != '\0';)
852 q++;
853 if (q > p) {
854 for (q--; *q == ' ' || *q == ','; q--)
855 if (q == p)
856 break;
857 if (*q != ' ' && *q != ',')
858 q++;
859 *q = '\0';
860 }
861 }
862
863 if (StringLen(p) < 511 && !organism.empty() && StringStr(p, organism.c_str()) == NULL) {
864 q = (char*)MemNew(StringLen(p) + organism.size() + 4);
865 StringCpy(q, p);
866 StringCat(q, " [");
867 StringCat(q, organism.c_str());
868 StringCat(q, "]");
869 MemFree(p);
870 p = q;
871 }
872
873 if (StringLen(p) > 511) {
874 p[510] = '>';
875 p[511] = '\0';
876 q = StringSave(p);
877 MemFree(p);
878 }
879 else
880 q = p;
881
882 descr.Reset(new objects::CSeqdesc);
883 descr->SetTitle(q);
884 descrs.push_back(descr);
885 }
886
887 /**********************************************************
888 *
889 * Function:
890 * static SeqIdPtr QualsToSeqID(pSeqFeat, source)
891 *
892 * Purpose:
893 * Searches all /db_xref qualifiers make from them
894 * corresponding SeqId and removing found qualifiers
895 * from given SeqFeature.
896 *
897 * Tatiana: /db_xref qualifiers were already processed
898 * sfp->dbxref in loadfeat.c
899 *
900 * Parameters:
901 * pSeqFeat - pointer to SeqFeat structure which have
902 * to be processed.
903 *
904 * Return:
905 * Pointer to resultant SeqId chain if successful,
906 * otherwise NULL.
907 *
908 * Note:
909 * Returned SeqId must be freed by caller.
910 *
911 **********************************************************/
QualsToSeqID(objects::CSeq_feat & feat,Parser::ESource source,TSeqIdList & ids)912 static void QualsToSeqID(objects::CSeq_feat& feat, Parser::ESource source, TSeqIdList& ids)
913 {
914 char* p;
915
916 if (!feat.IsSetQual())
917 return;
918
919 for (TQualVector::iterator qual = feat.SetQual().begin(); qual != feat.SetQual().end();) {
920 if ((*qual)->GetQual() != "db_xref") {
921 ++qual;
922 continue;
923 }
924
925 CRef<objects::CSeq_id> seq_id;
926 p = StringIStr((*qual)->GetVal().c_str(), "pid:");
927 if (p != NULL)
928 seq_id = StrToSeqId(p + 4, true);
929
930 if (seq_id.Empty()) {
931 ErrPostEx(SEV_ERROR, ERR_CDREGION_InvalidDb_xref,
932 "Invalid data format /db_xref = \"%s\", ignore it",
933 (*qual)->GetVal().c_str());
934 }
935 else {
936 if (p[4] == 'g' && (source == Parser::ESource::DDBJ || source == Parser::ESource::EMBL))
937 seq_id.Reset();
938 else
939 ids.push_back(seq_id);
940 }
941
942 qual = feat.SetQual().erase(qual);
943 }
944
945 if (feat.GetQual().empty())
946 feat.ResetQual();
947 }
948
949 /**********************************************************
950 *
951 * Function:
952 * static SeqIdPtr ValidateQualSeqId(pSeqID)
953 *
954 * Purpose:
955 * Validates consistency of SeqId list obtained from
956 * /db_xref in following maner. The number of SeqId
957 * must be not more then 3. Each SeqId must refer to
958 * a different GenBank. If two or more SeqId's refer
959 * to the same GenBank only first is taken in account
960 * and other ones are abandoned.
961 * During validating the function drop corresponding
962 * error messages if something wrong occured.
963 *
964 * Parameters:
965 * pSeqFeat - pointer to SeqFeat structure which have
966 * to be processed;
967 *
968 * Return:
969 * Pointer to resultant SeqId chain. If pointer is
970 * NULL it means that there is no good SeqId.
971 *
972 **********************************************************/
ValidateQualSeqId(TSeqIdList & ids)973 static void ValidateQualSeqId(TSeqIdList& ids)
974 {
975 bool abGenBanks[3] = { false, false, false };
976 Int2 num;
977 Char ch;
978
979 if (ids.empty())
980 return;
981
982 ch = '\0';
983 for (TSeqIdList::iterator id = ids.begin(); id != ids.end();) {
984 num = -1;
985 const Char* dbtag_str = (*id)->IsGeneral() ? (*id)->GetGeneral().GetTag().GetStr().c_str() : "";
986 if ((*id)->IsGi()) {
987 num = 1;
988 ch = 'g';
989 }
990 else if (*dbtag_str == 'e') {
991 num = 0;
992 ch = 'e';
993 }
994 else if (*dbtag_str == 'd') {
995 num = 2;
996 ch = 'd';
997 }
998 if (num == -1)
999 continue;
1000
1001 if (abGenBanks[num]) {
1002 /* PID of this type already exist, ignore it
1003 */
1004 ErrPostEx(SEV_WARNING, ERR_CDREGION_Multiple_PID,
1005 "/db_xref=\"pid:%c%i\" refer the same data base",
1006 ch, (*id)->GetGeneral().GetTag().GetId());
1007
1008 id = ids.erase(id);
1009 }
1010 else {
1011 abGenBanks[num] = true;
1012 ++id;
1013 }
1014 }
1015 }
1016
1017 /**********************************************************/
DbxrefToSeqID(objects::CSeq_feat & feat,Parser::ESource source,TSeqIdList & ids)1018 static void DbxrefToSeqID(objects::CSeq_feat& feat, Parser::ESource source, TSeqIdList& ids)
1019 {
1020 if (!feat.IsSetDbxref())
1021 return;
1022
1023 for (objects::CSeq_feat::TDbxref::iterator xref = feat.SetDbxref().begin(); xref != feat.SetDbxref().end();) {
1024 if (!(*xref)->IsSetTag() || !(*xref)->IsSetDb()) {
1025 ++xref;
1026 continue;
1027 }
1028
1029 CRef<objects::CSeq_id> id;
1030
1031 if ((*xref)->GetDb() == "PID") {
1032 const Char* tag_str = (*xref)->GetTag().GetStr().c_str();
1033 switch (tag_str[0]) {
1034 case 'g':
1035 if (source != Parser::ESource::DDBJ && source != Parser::ESource::EMBL) {
1036 id.Reset(new objects::CSeq_id);
1037 id->SetGi(GI_FROM(long long, strtoll(tag_str + 1, NULL, 10)));
1038 }
1039 break;
1040
1041 case 'd':
1042 case 'e':
1043 id.Reset(new objects::CSeq_id);
1044 id->SetGeneral(*(*xref));
1045 break;
1046
1047 default:
1048 break;
1049 }
1050
1051 xref = feat.SetDbxref().erase(xref);
1052 }
1053 else
1054 ++xref;
1055
1056 if (id.NotEmpty())
1057 ids.push_back(id);
1058 }
1059
1060 if (feat.GetDbxref().empty())
1061 feat.ResetDbxref();
1062 }
1063
1064 /**********************************************************
1065 *
1066 * Function:
1067 * static void ProcessForDbxref(pBioseq, pSeqFeat,
1068 * source)
1069 *
1070 * Purpose:
1071 * Finds all qualifiers corresponding to /db_xref,
1072 * makes from them SeqId and remove them from further
1073 * processing. Also the function looks for PID which
1074 * can be found in /note (pSeqFeat->comment) and
1075 * removes PID record from /note.
1076 *
1077 * Parameters:
1078 * pBioseq - pointer or a Bioseq structure which will
1079 * hold resultant list of SeqId;
1080 * pSeqFeat - pointer to SeqFeat structure which have
1081 * to processed.
1082 * source - source of sequence.
1083 *
1084 * Return:
1085 * void
1086 *
1087 **********************************************************/
ProcessForDbxref(objects::CBioseq & bioseq,objects::CSeq_feat & feat,Parser::ESource source)1088 static void ProcessForDbxref(objects::CBioseq& bioseq, objects::CSeq_feat& feat,
1089 Parser::ESource source)
1090 {
1091 TSeqIdList ids;
1092 QualsToSeqID(feat, source, ids);
1093 if (!ids.empty()) {
1094 ValidateQualSeqId(ids);
1095 if (!ids.empty()) {
1096 bioseq.SetId().splice(bioseq.SetId().end(), ids);
1097 return;
1098 }
1099 }
1100
1101 DbxrefToSeqID(feat, source, ids);
1102 if (feat.IsSetComment() && feat.GetComment().empty())
1103 feat.ResetComment();
1104
1105 if (!ids.empty())
1106 bioseq.SetId().splice(bioseq.SetId().end(), ids);
1107 }
1108
1109 /**********************************************************/
BldProtRefSeqEntry(ProtBlkPtr pbp,objects::CSeq_feat & feat,std::string & seq_data,Uint1 method,ParserPtr pp,const objects::CBioseq & bioseq,objects::CBioseq::TId & ids)1110 static CRef<objects::CBioseq> BldProtRefSeqEntry(ProtBlkPtr pbp, objects::CSeq_feat& feat,
1111 std::string& seq_data, Uint1 method,
1112 ParserPtr pp, const objects::CBioseq& bioseq,
1113 objects::CBioseq::TId& ids)
1114 {
1115 CRef<objects::CBioseq> new_bioseq;
1116
1117 if (ids.empty())
1118 return new_bioseq;
1119
1120 new_bioseq.Reset(new objects::CBioseq);
1121
1122 new_bioseq->SetId().swap(ids);
1123
1124 ProcessForDbxref(*new_bioseq, feat, pp->source);
1125
1126 new_bioseq->SetInst().SetLength(static_cast<TSeqPos>(seq_data.size()));
1127
1128 GetProtRefDescr(feat, method, bioseq, new_bioseq->SetDescr());
1129 GetProtRefAnnot(pbp->ibp, feat, *new_bioseq);
1130
1131 new_bioseq->SetInst().SetRepr(objects::CSeq_inst::eRepr_raw);
1132 new_bioseq->SetInst().SetMol(objects::CSeq_inst::eMol_aa);
1133
1134 /* Seq_code always ncbieaa 08.08.96
1135 */
1136 CRef<objects::CSeq_data> data(new objects::CSeq_data(seq_data, objects::CSeq_data::e_Ncbieaa));
1137 new_bioseq->SetInst().SetSeq_data(*data);
1138
1139 return new_bioseq;
1140 }
1141
1142 /**********************************************************/
AddProtRefSeqEntry(ProtBlkPtr pbp,objects::CBioseq & bioseq)1143 static void AddProtRefSeqEntry(ProtBlkPtr pbp, objects::CBioseq& bioseq)
1144 {
1145 CRef<objects::CSeq_entry> entry(new objects::CSeq_entry);
1146 entry->SetSeq(bioseq);
1147 pbp->entries.push_back(entry);
1148 }
1149
1150 /**********************************************************/
SimpleValuePos(char * qval)1151 static char* SimpleValuePos(char* qval)
1152 {
1153 char* bptr;
1154 char* eptr;
1155
1156 bptr = StringStr(qval, "(pos:");
1157 if (bptr == NULL)
1158 return(NULL);
1159
1160 bptr += 5;
1161 while (*bptr == ' ')
1162 bptr++;
1163 for (eptr = bptr; *eptr != ',' && *eptr != '\0';)
1164 eptr++;
1165
1166 return StringSave(std::string(bptr, eptr).c_str());
1167 }
1168
1169 /**********************************************************
1170 *
1171 * static CodeBreakPtr GetCdRegionCB(ibp, sfp, accver):
1172 *
1173 * If protein translation (InternalStopCodon) O.K.,
1174 * then this qualifier will be deleted ==> different from
1175 * Karl's parser.
1176 * For transl_except of type OTHER, use the ncibeaa
1177 * code 'X' for Code-break.aa.
1178 * CodeBreakPtr->aa.choice = 1 ==> for ncbieaa;
1179 * CodeBreakPtr->aa.value.intvalue = (Int4)'X' ==>
1180 * for unknown.
1181 *
1182 **********************************************************/
GetCdRegionCB(InfoBioseqPtr ibp,objects::CSeq_feat & feat,TCodeBreakList & code_breaks,unsigned char * dif,bool accver)1183 static void GetCdRegionCB(InfoBioseqPtr ibp, objects::CSeq_feat& feat,
1184 TCodeBreakList& code_breaks, unsigned char* dif, bool accver)
1185 {
1186 Int4 feat_start = -1;
1187 Int4 feat_stop = -1;
1188
1189 if (feat.IsSetLocation()) {
1190 feat_start = feat.GetLocation().GetStart(objects::eExtreme_Positional);
1191 feat_stop = feat.GetLocation().GetStop(objects::eExtreme_Positional);
1192 }
1193
1194 Uint1 res = 2;
1195
1196 if (feat.IsSetQual()) {
1197 TQualVector::iterator cur_qual = feat.SetQual().begin(),
1198 end_qual = feat.SetQual().end();
1199
1200 char* qval = NULL;
1201 while ((qval = CpTheQualValueNext(cur_qual, end_qual, "transl_except")) != NULL) {
1202 CRef<objects::CCode_break> code_break(new objects::CCode_break);
1203
1204 int ncbieaa_val = GetQualValueAa(qval, false);
1205
1206 code_break->SetAa().SetNcbieaa(ncbieaa_val);
1207
1208 char* pos = SimpleValuePos(qval);
1209
1210 int num_errs = 0;
1211 bool locmap = false,
1212 sitesmap = false;
1213
1214 CRef<objects::CSeq_loc> location = xgbparseint_ver(pos, locmap, sitesmap, num_errs, ibp->ids, accver);
1215 if (location.NotEmpty())
1216 code_break->SetLoc(*location);
1217
1218 Int4 start = code_break->IsSetLoc() ? code_break->GetLoc().GetStart(objects::eExtreme_Positional) : -1;
1219 Int4 stop = code_break->IsSetLoc() ? code_break->GetLoc().GetStop(objects::eExtreme_Positional) : -1;
1220
1221 Uint1 i = (start > stop) ? 3 : (stop - start);
1222
1223 if (i != 2)
1224 res = i;
1225
1226 bool itis = false;
1227 if (ncbieaa_val == 42 &&
1228 (start == feat_start ||
1229 stop == feat_stop))
1230 itis = true;
1231
1232 bool pos_range = false;
1233 if (i == 2)
1234 pos_range = false;
1235 else if (i > 2)
1236 pos_range = true;
1237 else
1238 pos_range = !itis;
1239
1240 if (num_errs > 0 || pos_range) {
1241 ErrPostEx(SEV_WARNING, ERR_FEATURE_LocationParsing,
1242 "transl_except range is wrong, %s, drop the transl_except",
1243 pos);
1244 MemFree(pos);
1245 MemFree(qval);
1246 break;
1247 }
1248
1249 if (code_break->IsSetLoc()) {
1250 if (feat.GetLocation().IsSetStrand())
1251 code_break->SetLoc().SetStrand(feat.GetLocation().GetStrand());
1252
1253 objects::sequence::ECompare cmp_res = objects::sequence::Compare(feat.GetLocation(), code_break->GetLoc(), nullptr, objects::sequence::fCompareOverlapping);
1254 if (cmp_res != objects::sequence::eContains) {
1255 ErrPostEx(SEV_WARNING, ERR_FEATURE_LocationParsing,
1256 "/transl_except not in CDS: %s", qval);
1257 }
1258 }
1259
1260 MemFree(pos);
1261 MemFree(qval);
1262
1263 if (code_break.NotEmpty())
1264 code_breaks.push_back(code_break);
1265 }
1266 }
1267
1268 *dif = 2 - res;
1269 }
1270
1271 /**********************************************************/
CkEndStop(const objects::CSeq_feat & feat,Uint1 dif)1272 static void CkEndStop(const objects::CSeq_feat& feat, Uint1 dif)
1273 {
1274 Int4 len;
1275 Int4 r;
1276 Int4 frm;
1277 char* p;
1278
1279 len = objects::sequence::GetLength(feat.GetLocation(), &GetScope());
1280 const objects::CCdregion& cdregion = feat.GetData().GetCdregion();
1281
1282 if (!cdregion.IsSetFrame() || cdregion.GetFrame() == 0)
1283 frm = 0;
1284 else
1285 frm = cdregion.GetFrame() - 1;
1286
1287 r = (len - frm + (Int4)dif) % 3;
1288 if (r != 0 && (!feat.IsSetExcept() || feat.GetExcept() == false)) {
1289 p = location_to_string(feat.GetLocation());
1290 ErrPostEx(SEV_WARNING, ERR_CDREGION_UnevenLocation,
1291 "CDS: %s. Length is not divisable by 3, the remain is: %d",
1292 (p == NULL) ? "" : p, r);
1293 MemFree(p);
1294 }
1295 }
1296
1297 /**********************************************************/
check_end_internal(size_t protlen,const objects::CSeq_feat & feat,Uint1 dif)1298 static void check_end_internal(size_t protlen, const objects::CSeq_feat& feat, Uint1 dif)
1299 {
1300 Int4 frm;
1301
1302 const objects::CCdregion& cdregion = feat.GetData().GetCdregion();
1303
1304 if (!cdregion.IsSetFrame() || cdregion.GetFrame() == 0)
1305 frm = 0;
1306 else
1307 frm = cdregion.GetFrame() - 1;
1308
1309 size_t len = objects::sequence::GetLength(feat.GetLocation(), &GetScope()) - frm + dif;
1310
1311 if (protlen * 3 != len && (!feat.IsSetExcept() || feat.GetExcept() == false)) {
1312 char* p = location_to_string(feat.GetLocation());
1313 ErrPostEx(SEV_WARNING, ERR_CDREGION_LocationLength,
1314 "Length of the CDS: %s (%ld) disagree with the length calculated from the protein: %ld",
1315 (p == NULL) ? "" : p, len, protlen * 3);
1316 MemFree(p);
1317 }
1318 }
1319
1320 /**********************************************************
1321 *
1322 * static void ErrByteStorePtr(ibp, sfp, bsp):
1323 *
1324 * For debugging, put to error logfile, it needs
1325 * to delete for "buildcds.c program.
1326 *
1327 **********************************************************/
ErrByteStorePtr(InfoBioseqPtr ibp,const objects::CSeq_feat & feat,const std::string & prot)1328 static void ErrByteStorePtr(InfoBioseqPtr ibp, const objects::CSeq_feat& feat,
1329 const std::string& prot)
1330 {
1331 char* str;
1332 char* qval;
1333
1334 qval = CpTheQualValue(feat.GetQual(), "translation");
1335 if (qval == NULL)
1336 qval = StringSave("no translation qualifier");
1337
1338 if (!feat.IsSetExcept() || feat.GetExcept() == false) {
1339 str = location_to_string(feat.GetLocation());
1340 ErrPostEx(SEV_WARNING, ERR_CDREGION_TranslationDiff,
1341 "Location: %s, translation: %s",
1342 (str == NULL) ? "" : str, qval);
1343 MemFree(str);
1344 }
1345
1346 MemFree(qval);
1347
1348 ErrLogPrintStr(prot.c_str());
1349 ErrLogPrintStr((char *) "\n");
1350 }
1351
1352 /**********************************************************
1353 *
1354 * static ByteStorePtr CkProteinTransl(pp, ibp, bsp, sfp,
1355 * qval, intercodon,
1356 * gcode, method):
1357 *
1358 * If bsp != translation's value, then take
1359 * translation's value and print out warning message.
1360 * If the only diff is start codon and bsp has "M"
1361 * take bsp.
1362 * If intercodon = TRUE, then no comparison.
1363 *
1364 **********************************************************/
CkProteinTransl(ParserPtr pp,InfoBioseqPtr ibp,std::string & prot,objects::CSeq_feat & feat,char * qval,bool intercodon,char * gcode,unsigned char * method)1365 static void CkProteinTransl(ParserPtr pp, InfoBioseqPtr ibp,
1366 std::string& prot, objects::CSeq_feat& feat,
1367 char* qval, bool intercodon,
1368 char* gcode, unsigned char* method)
1369 {
1370 char* ptr;
1371 Char msg2[1100];
1372 Char aastr[100];
1373 char* msg;
1374 Int2 residue;
1375 Int4 num = 0;
1376 size_t aa;
1377 bool msgout = false;
1378 bool first = false;
1379 Int4 difflen;
1380
1381 objects::CCdregion& cdregion = feat.SetData().SetCdregion();
1382 size_t len = StringLen(qval);
1383 msg2[0] = '\0';
1384
1385 msg = location_to_string(feat.GetLocation());
1386 if (msg == NULL)
1387 msg = StringSave("");
1388
1389 size_t blen = prot.size();
1390
1391 if (len != blen && (!feat.IsSetExcept() || feat.GetExcept() == false)) {
1392 ErrPostEx(SEV_ERROR, ERR_CDREGION_ProteinLenDiff,
1393 "Lengths of conceptual translation and translation qualifier differ : %d : %d : %s",
1394 blen, len, msg);
1395 }
1396
1397 difflen = 0;
1398 if (!intercodon) {
1399 if (len != blen)
1400 difflen = 1;
1401 if (len > blen)
1402 difflen = 2;
1403
1404 size_t residue_idx = 0;
1405 for (ptr = qval, num = 0, aa = 1; residue_idx < blen; ++aa, ++residue_idx) {
1406 residue = prot[residue_idx];
1407
1408 if (aa > len) {
1409 if (residue == '*')
1410 continue;
1411 difflen = 2;
1412 break;
1413 }
1414
1415 if (residue == *ptr) {
1416 ptr++;
1417 continue;
1418 }
1419
1420 if (aa == 1 && residue == 'M')
1421 first = true;
1422
1423 msgout = true;
1424 num++;
1425 if (num == 1)
1426 StringCpy(msg2, "at AA # ");
1427
1428 if (num < 11 && StringLen(msg2) < 1000) {
1429 sprintf(aastr, "%d(%c,%c), ",
1430 (int)aa, (char)residue, (char)*ptr);
1431 StringCat(msg2, aastr);
1432 }
1433 else if (num == 11 && StringLen(msg2) < 1000)
1434 StringCat(msg2, ", additional details suppressed, ");
1435 ptr++;
1436 }
1437
1438 if (num > 0) {
1439 cdregion.SetConflict(true);
1440 sprintf(aastr, "using genetic code %s, total %d difference%s",
1441 gcode, (int)num, (num > 1) ? "s" : "");
1442 StringCat(msg2, aastr);
1443 if (!feat.IsSetExcept() || feat.GetExcept() == false) {
1444 ErrPostEx(SEV_WARNING, ERR_CDREGION_TranslationDiff,
1445 "%s:%s", msg2, msg);
1446 }
1447 }
1448
1449 if (!msgout) {
1450 cdregion.ResetConflict();
1451 ErrPostEx(SEV_INFO, ERR_CDREGION_TranslationsAgree,
1452 "Parser-generated conceptual translation agrees with input translation %s",
1453 msg);
1454 }
1455
1456 if (difflen == 2) {
1457 cdregion.SetConflict(true);
1458 msgout = true;
1459 }
1460 } /* intercodon */
1461 else {
1462 msgout = true;
1463 if (!feat.IsSetExcept() || feat.GetExcept() == false) {
1464 ErrPostEx(SEV_WARNING, ERR_CDREGION_NoTranslationCompare,
1465 "Conceptual translation has internal stop codons, no comparison: %s",
1466 msg);
1467 }
1468 }
1469
1470 if (msgout) {
1471 if (pp->debug) {
1472 ErrByteStorePtr(ibp, feat, prot);
1473 }
1474 }
1475
1476 if (pp->accver == false) {
1477 if ((num == 1 && first) ||
1478 (pp->transl && !prot.empty() && cdregion.IsSetConflict() && cdregion.GetConflict())) {
1479 cdregion.ResetConflict();
1480 ErrPostEx(SEV_WARNING, ERR_CDREGION_TranslationOverride,
1481 "Input translation is replaced with conceptual translation: %s",
1482 msg);
1483 *method = objects::eGIBB_method_concept_trans;
1484 MemFree(msg);
1485 }
1486 }
1487
1488 if ((cdregion.IsSetConflict() && cdregion.GetConflict() == true) || difflen != 0)
1489 *method = objects::eGIBB_method_concept_trans_a;
1490
1491 if (msgout && pp->transl == false && (!feat.IsSetExcept() || feat.GetExcept() == false)) {
1492 ErrPostEx(SEV_WARNING, ERR_CDREGION_SuppliedProteinUsed,
1493 "In spite of previously reported problems, the supplied protein translation will be used : %s",
1494 msg);
1495 }
1496
1497 prot = qval;
1498 MemFree(msg);
1499 }
1500
1501 /**********************************************************
1502 *
1503 * static bool check_translation(bsp, qval):
1504 *
1505 * If bsp != translation's value return FALSE.
1506 *
1507 **********************************************************/
check_translation(std::string & prot,char * qval)1508 static bool check_translation(std::string& prot, char* qval)
1509 {
1510 size_t len = 0;
1511 size_t blen = prot.size();
1512
1513 for (len = StringLen(qval); len != 0; len--) {
1514 if (qval[len - 1] != 'X') /* remove terminal X */
1515 break;
1516 }
1517
1518 size_t cur_pos = blen;
1519 for (; cur_pos != 0; --cur_pos) {
1520 if (prot[cur_pos - 1] != 'X')
1521 break;
1522 }
1523
1524 if (cur_pos != len)
1525 return false;
1526
1527 cur_pos = 0;
1528 for (; *qval != '\0' && cur_pos < blen; qval++, ++cur_pos) {
1529 if (prot[cur_pos] != *qval)
1530 return false;
1531 }
1532
1533 return true;
1534 }
1535
1536 // Workaround for translation function.
1537 // This is needed because feat->location usually does not have a 'version' in its ID,
1538 // but feat->cdregion->code_break->location has.
1539 // In this case both locations is treated as they are from different sequences, but they definitely
1540 // belong to the same one.
1541 // Therefore, this function changes all IDs of the codebreaks to feat->location->id before translation,
1542 // and return all this stuff back after translation.
1543 // TODO: it probably should be organized in another way
Translate(objects::CSeq_feat & feat,std::string & prot)1544 static bool Translate(objects::CSeq_feat& feat, std::string& prot)
1545 {
1546 std::list<CRef<objects::CSeq_id>> orig_ids;
1547 const objects::CSeq_id* feat_loc_id = nullptr;
1548 if (feat.IsSetLocation())
1549 feat_loc_id = feat.GetLocation().GetId();
1550
1551 bool change = feat_loc_id && feat.GetData().GetCdregion().IsSetCode_break();
1552 if (change) {
1553
1554 NON_CONST_ITERATE(objects::CCdregion::TCode_break, code_break, feat.SetData().SetCdregion().SetCode_break())
1555 {
1556 orig_ids.push_back(CRef<objects::CSeq_id>(new objects::CSeq_id));
1557 orig_ids.back()->Assign(*(*code_break)->GetLoc().GetId());
1558 (*code_break)->SetLoc().SetId(*feat_loc_id);
1559 }
1560 }
1561
1562 bool ret = true;
1563 try {
1564 objects::CSeqTranslator::Translate(feat, GetScope(), prot);
1565 }
1566 catch (objects::CSeqMapException& e) {
1567 ErrPostEx(SEV_REJECT, 0, 0, "%s", e.GetMsg().c_str());
1568 ret = false;
1569 }
1570
1571 if (change) {
1572
1573 std::list<CRef<objects::CSeq_id>>::iterator it = orig_ids.begin();
1574 NON_CONST_ITERATE(objects::CCdregion::TCode_break, code_break, feat.SetData().SetCdregion().SetCode_break())
1575 {
1576 (*code_break)->SetLoc().SetId(**it);
1577 ++it;
1578 }
1579 }
1580
1581 return ret;
1582 }
1583
1584 /**********************************************************
1585 *
1586 * static Int2 EndAdded(sfp, gene):
1587 *
1588 * From EndStopCodonBaseAdded:
1589 * Return 0 if no bases added, no end stop codon found
1590 * after added one or two bases, or can not add
1591 * (substract) any bases at end because it is really
1592 * a partial.
1593 * Return -1 if something bad.
1594 * Return +1 if it found end stop codon after bases
1595 * added.
1596 *
1597 **********************************************************/
EndAdded(objects::CSeq_feat & feat,GeneRefFeats & gene_refs)1598 static Int2 EndAdded(objects::CSeq_feat& feat, GeneRefFeats& gene_refs)
1599 {
1600 Int4 pos;
1601 Int4 pos1;
1602 Int4 pos2;
1603 Int4 len;
1604 Int4 len2;
1605
1606 Uint4 remainder;
1607
1608 Uint4 oldfrom;
1609 Uint4 oldto;
1610
1611 size_t i;
1612
1613 char* transl;
1614 char* name;
1615
1616 objects::CCdregion& cdregion = feat.SetData().SetCdregion();
1617 len = objects::sequence::GetLength(feat.GetLocation(), &GetScope());
1618 len2 = len;
1619
1620 int frame = cdregion.IsSetFrame() ? cdregion.GetFrame() : 0;
1621 if (frame > 1 && frame < 4)
1622 len -= frame - 1;
1623
1624 remainder = 3 - len % 3;
1625 if (remainder == 0)
1626 return(0);
1627
1628 if (cdregion.IsSetCode_break()) {
1629 bool ret_condition = false;
1630 ITERATE(objects::CCdregion::TCode_break, code_break, cdregion.GetCode_break())
1631 {
1632 pos1 = INT4_MAX;
1633 pos2 = -10;
1634
1635 for (objects::CSeq_loc_CI loc((*code_break)->GetLoc()); loc; ++loc) {
1636 pos = objects::sequence::LocationOffset(*loc.GetRangeAsSeq_loc(), feat.GetLocation(), objects::sequence::eOffset_FromStart);
1637 if (pos < pos1)
1638 pos1 = pos;
1639 pos = objects::sequence::LocationOffset(*loc.GetRangeAsSeq_loc(), feat.GetLocation(), objects::sequence::eOffset_FromEnd);
1640 if (pos > pos2)
1641 pos2 = pos;
1642 }
1643 pos = pos2 - pos1;
1644 if (pos == 2 || (pos >= 0 && pos <= 1 && pos2 == len2 - 1)) {
1645 ret_condition = true;
1646 break;
1647 }
1648 }
1649
1650 if (ret_condition)
1651 return(0);
1652 }
1653
1654 objects::CSeq_interval* last_interval = nullptr;
1655 for (CTypeIterator<objects::CSeq_interval> loc(Begin(feat.SetLocation())); loc; ++loc) {
1656 last_interval = &(*loc);
1657 }
1658
1659 if (last_interval == nullptr)
1660 return(0);
1661
1662 const objects::CBioseq_Handle& bioseq_h = GetScope().GetBioseqHandle(last_interval->GetId());
1663 if (bioseq_h.GetState() != objects::CBioseq_Handle::fState_none)
1664 return(0);
1665
1666 oldfrom = last_interval->GetFrom();
1667 oldto = last_interval->GetTo();
1668
1669 if (last_interval->IsSetStrand() && last_interval->GetStrand() == objects::eNa_strand_minus) {
1670 if (last_interval->GetFrom() < remainder || last_interval->IsSetFuzz_from())
1671 return(0);
1672 last_interval->SetFrom(oldfrom - remainder);
1673 }
1674 else {
1675 if (last_interval->GetTo() >= bioseq_h.GetBioseqLength() - remainder || last_interval->IsSetFuzz_to())
1676 return(0);
1677 last_interval->SetTo(oldto + remainder);
1678 }
1679
1680 std::string newprot;
1681 Translate(feat, newprot);
1682
1683 if (newprot.empty()) {
1684 last_interval->SetFrom(oldfrom);
1685 last_interval->SetTo(oldto);
1686 return(0);
1687 }
1688
1689 size_t protlen = newprot.size();
1690 if (protlen != (size_t) (len + remainder) / 3) {
1691 last_interval->SetFrom(oldfrom);
1692 last_interval->SetTo(oldto);
1693 return(0);
1694 }
1695
1696 if (newprot[protlen - 1] != '*') {
1697 last_interval->SetFrom(oldfrom);
1698 last_interval->SetTo(oldto);
1699 return(0);
1700 }
1701
1702 transl = CpTheQualValue(feat.GetQual(), "translation");
1703 if (transl == NULL) {
1704 last_interval->SetFrom(oldfrom);
1705 last_interval->SetTo(oldto);
1706 return(0);
1707 }
1708
1709 protlen--;
1710 newprot = newprot.substr(0, newprot.size() - 1);
1711
1712 for (i = 0; i < protlen; i++) {
1713 if (transl[i] != newprot[i])
1714 break;
1715 }
1716
1717 MemFree(transl);
1718 if (i < protlen) {
1719 last_interval->SetFrom(oldfrom);
1720 last_interval->SetTo(oldto);
1721 return(0);
1722 }
1723
1724 if (!gene_refs.valid)
1725 return(remainder);
1726
1727 name = CpTheQualValue(feat.GetQual(), "gene");
1728 if (name == NULL)
1729 return(remainder);
1730
1731 for (TSeqFeatList::iterator gene = gene_refs.first; gene != gene_refs.last; ++gene) {
1732 if (!(*gene)->IsSetData() || !(*gene)->GetData().IsGene())
1733 continue;
1734
1735 int cur_strand = (*gene)->GetLocation().IsSetStrand() ? (*gene)->GetLocation().GetStrand() : 0,
1736 last_strand = last_interval->IsSetStrand() ? last_interval->GetStrand() : 0;
1737 if (cur_strand != last_strand)
1738 continue;
1739
1740 const objects::CGene_ref& cur_gene_ref = (*gene)->GetData().GetGene();
1741 if (StringICmp(cur_gene_ref.GetLocus().c_str(), name) != 0) {
1742 if (!cur_gene_ref.IsSetSyn())
1743 continue;
1744
1745 bool found = false;
1746 ITERATE(objects::CGene_ref::TSyn, syn, cur_gene_ref.GetSyn())
1747 {
1748 if (StringICmp(name, syn->c_str()) == 0) {
1749 found = true;
1750 break;
1751 }
1752 }
1753
1754 if (!found)
1755 continue;
1756 }
1757
1758 for (CTypeIterator<objects::CSeq_interval> loc(Begin((*gene)->SetLocation())); loc; ++loc) {
1759 if (!loc->GetId().Match(last_interval->GetId()))
1760 continue;
1761
1762 int cur_strand = loc->IsSetStrand() ? loc->GetStrand() : 0;
1763 if (cur_strand == objects::eNa_strand_minus && loc->GetFrom() == oldfrom)
1764 loc->SetFrom(last_interval->GetFrom());
1765 else if (cur_strand != objects::eNa_strand_minus && loc->GetTo() == oldto)
1766 loc->SetTo(last_interval->GetTo());
1767 }
1768 }
1769
1770 MemFree(name);
1771 return(remainder);
1772 }
1773
1774 /**********************************************************/
fta_check_codon_quals(objects::CSeq_feat & feat)1775 static void fta_check_codon_quals(objects::CSeq_feat& feat)
1776 {
1777 char* p;
1778
1779 if (!feat.IsSetQual())
1780 return;
1781
1782 for (TQualVector::iterator qual = feat.SetQual().begin(); qual != feat.SetQual().end();) {
1783 if (!(*qual)->IsSetQual() || !(*qual)->IsSetVal() ||
1784 (*qual)->GetQual() != "codon") {
1785 ++qual;
1786 continue;
1787 }
1788
1789 p = location_to_string(feat.GetLocation());
1790 ErrPostEx(SEV_ERROR, ERR_CDREGION_CodonQualifierUsed,
1791 "Encountered /codon qualifier for \"CDS\" feature at \"%s\". Code-breaks (/transl_except) should be used instead.",
1792 p);
1793 if (p != NULL)
1794 MemFree(p);
1795
1796 qual = feat.SetQual().erase(qual);
1797 }
1798
1799 if (feat.GetQual().empty())
1800 feat.ResetQual();
1801 }
1802
1803 /**********************************************************
1804 *
1805 * static ByteStorePtr InternalStopCodon(pp, ibp, sfp,
1806 * method, gene):
1807 *
1808 * Return NULL if there is no protein sequence, or
1809 * there is no translation qualifier and protein sequence
1810 * has internal stop codon; otherwise, return a protein
1811 * sequence.
1812 * In the embl format, it may not have "translation"
1813 * qualifier, take protein sequence (without
1814 * end_stop_codon) instead.
1815 *
1816 **********************************************************/
InternalStopCodon(ParserPtr pp,InfoBioseqPtr ibp,objects::CSeq_feat & feat,unsigned char * method,Uint1 dif,GeneRefFeats & gene_refs,std::string & seq_data)1817 static void InternalStopCodon(ParserPtr pp, InfoBioseqPtr ibp,
1818 objects::CSeq_feat& feat, unsigned char* method,
1819 Uint1 dif, GeneRefFeats& gene_refs, std::string& seq_data)
1820 {
1821 char* qval;
1822 char* msg;
1823 bool intercodon = false;
1824 bool again = true;
1825
1826 ErrSev sev;
1827
1828 Uint1 m = 0;
1829 Char gcode_str[10];
1830 Char stopmsg[550];
1831 Char aastr[100];
1832 size_t protlen;
1833 Int4 aa;
1834 Int4 num = 0;
1835 Int2 residue;
1836 Int2 r;
1837
1838 objects::CCdregion& cdregion = feat.SetData().SetCdregion();
1839
1840 if (!cdregion.IsSetCode())
1841 return;
1842
1843 objects::CGenetic_code& gen_code = cdregion.SetCode();
1844 objects::CGenetic_code::C_E* cur_code = nullptr;
1845
1846 NON_CONST_ITERATE(objects::CGenetic_code::Tdata, gcode, gen_code.Set())
1847 {
1848 if ((*gcode)->IsId()) {
1849 cur_code = (*gcode);
1850 break;
1851 }
1852 }
1853
1854 if (cur_code == nullptr)
1855 return;
1856
1857 if (pp->no_code) {
1858 int orig_code_id = cur_code->GetId();
1859 for (int cur_code_id = cur_code->GetId(); again && cur_code_id < 14; ++cur_code_id) {
1860 cur_code->SetId(cur_code_id);
1861 intercodon = false;
1862
1863 std::string prot;
1864 if (!Translate(feat, prot))
1865 pp->entrylist[pp->curindx]->drop = 1;
1866
1867 if (prot.empty())
1868 continue;
1869
1870 protlen = prot.size();
1871 residue = prot[protlen - 1];
1872
1873 if (residue == '*')
1874 prot = prot.substr(0, prot.size() - 1);
1875
1876 intercodon = prot.find('*') != std::string::npos;
1877
1878 if (!intercodon) {
1879 qval = CpTheQualValue(feat.GetQual(), "translation");
1880 if (qval != NULL) /* compare protein sequence */
1881 {
1882 if (check_translation(prot, qval)) {
1883 sev = (pp->taxserver == 0) ? SEV_INFO : SEV_WARNING;
1884 ErrPostEx(sev, ERR_CDREGION_GeneticCodeAssumed,
1885 "No genetic code from TaxArch, trying to guess, code %d assumed",
1886 cur_code_id);
1887 again = false;
1888 }
1889 MemFree(qval);
1890 }
1891 else
1892 break;
1893 }
1894 }
1895
1896 if (again) {
1897 sev = (pp->taxserver == 0) ? SEV_INFO : SEV_WARNING;
1898 ErrPostEx(sev, ERR_CDREGION_GeneticCodeAssumed,
1899 "Can't guess genetic code, code %d assumed", orig_code_id);
1900 cur_code->SetId(orig_code_id);
1901 }
1902 }
1903
1904 std::string prot;
1905 if (!Translate(feat, prot))
1906 pp->entrylist[pp->curindx]->drop = 1;
1907
1908 if (cur_code != NULL)
1909 sprintf(gcode_str, "%d", (int)cur_code->GetId());
1910 else
1911 StringCpy(gcode_str, "unknown");
1912
1913 qval = CpTheQualValue(feat.GetQual(), "translation");
1914 intercodon = false;
1915
1916 msg = location_to_string(feat.GetLocation());
1917 if (msg == NULL)
1918 msg = StringSave("");
1919
1920 if (!prot.empty()) {
1921 protlen = prot.size();
1922 residue = prot[protlen - 1];
1923 if ((!feat.IsSetPartial() || feat.GetPartial() == false) && !SeqLocHaveFuzz(feat.GetLocation())) {
1924 CkEndStop(feat, dif);
1925 }
1926
1927 if (residue != '*') {
1928 r = EndAdded(feat, gene_refs);
1929 if (r > 0 && (!feat.IsSetExcept() || feat.GetExcept() == false)) {
1930 ErrPostEx(SEV_WARNING, ERR_CDREGION_TerminalStopCodonMissing,
1931 "CDS: %s |found end stop codon after %d bases added",
1932 msg, r);
1933 }
1934
1935 if ((!feat.IsSetPartial() || feat.GetPartial() == false) && !SeqLocHaveFuzz(feat.GetLocation())) {
1936 /* if there is no partial qualifier and location
1937 * doesn't have "fuzz" then output message
1938 */
1939 if (!feat.IsSetExcept() || feat.GetExcept() == false) {
1940 ErrPostEx(SEV_ERROR, ERR_CDREGION_TerminalStopCodonMissing,
1941 "No end stop codon found for CDS: %s", msg);
1942 }
1943 }
1944 }
1945 else /* remove termination codon from protein */
1946 {
1947 check_end_internal(protlen, feat, dif);
1948 prot = prot.substr(0, prot.size() - 1);
1949 }
1950
1951 /* check internal stop codon */
1952 size_t residue_idx = 0;
1953 protlen = prot.size();
1954 for (stopmsg[0] = '\0', aa = 1; residue_idx < protlen; ++residue_idx) {
1955 residue = prot[residue_idx];
1956 if (aa == 1 && residue == '-') {
1957 /* if unrecognized start of translation,
1958 * a ncbigap character is inserted
1959 */
1960 if (!feat.IsSetExcept() || feat.GetExcept() == false) {
1961 ErrPostEx(SEV_WARNING, ERR_CDREGION_IllegalStart,
1962 "unrecognized initiation codon from CDS: %s",
1963 msg);
1964 }
1965 if (qval == NULL) /* no /translation */
1966 {
1967 MemFree(msg);
1968 return;
1969 }
1970 }
1971
1972 if (residue == '*') /* only report 10 internal stop codons */
1973 {
1974 intercodon = true;
1975 ++num;
1976
1977 if (num < 11 && StringLen(stopmsg) < 500) {
1978 sprintf(aastr, "%d ", (int)aa);
1979 StringCat(stopmsg, aastr);
1980 }
1981 else if (num == 11 && StringLen(stopmsg) < 500)
1982 StringCat(stopmsg, ", only report 10 positions");
1983 }
1984
1985 aa++;
1986 }
1987
1988 if (intercodon) {
1989 if (!feat.IsSetExcept() || feat.GetExcept() == false) {
1990 ErrPostEx(SEV_ERROR, ERR_CDREGION_InternalStopCodonFound,
1991 "Found %d internal stop codon, at AA # %s, on feature key, CDS, frame # %d, genetic code %s:%s",
1992 (int)num, stopmsg, cdregion.GetFrame(), gcode_str, msg);
1993 }
1994
1995 if (pp->debug) {
1996 ErrByteStorePtr(ibp, feat, prot);
1997 }
1998 }
1999 }
2000 else if (!feat.IsSetExcept() || feat.GetExcept() == false) {
2001 ErrPostEx(SEV_WARNING, ERR_CDREGION_NoProteinSeq,
2002 "No protein sequence found:%s", msg);
2003 }
2004
2005 if (qval != NULL) /* compare protein sequence */
2006 {
2007 CkProteinTransl(pp, ibp, prot, feat, qval, intercodon, gcode_str, &m);
2008 *method = m;
2009 MemFree(qval);
2010 MemFree(msg);
2011 seq_data.swap(prot);
2012 return;
2013 }
2014
2015 if (!prot.empty() && !intercodon) {
2016 if (prot.size() > 6 || !check_short_CDS(pp, feat, false)) {
2017 ErrPostEx(SEV_INFO, ERR_CDREGION_TranslationAdded,
2018 "input CDS lacks a translation: %s", msg);
2019 }
2020 *method = objects::eGIBB_method_concept_trans;
2021 MemFree(msg);
2022 seq_data.swap(prot);
2023 return;
2024 }
2025
2026 /* no translation qual and internal stop codon
2027 */
2028 if (intercodon) {
2029 cdregion.SetStops(num);
2030 if (!feat.IsSetExcept() || feat.GetExcept() == false) {
2031 ErrPostEx(SEV_WARNING, ERR_CDREGION_NoProteinSeq,
2032 "internal stop codons, and no translation qualifier CDS:%s",
2033 msg);
2034 }
2035 }
2036
2037 MemFree(msg);
2038 }
2039
2040 /**********************************************************/
check_gen_code(char * qval,ProtBlkPtr pbp,Uint1 taxserver)2041 static void check_gen_code(char* qval, ProtBlkPtr pbp, Uint1 taxserver)
2042 {
2043 ErrSev sev;
2044 Uint1 gcpvalue;
2045 Uint1 genome;
2046 Uint1 value;
2047
2048 if (pbp == NULL || !pbp->gcode.IsId())
2049 return;
2050
2051 gcpvalue = pbp->gcode.GetId();
2052 value = (Uint1)atoi(qval);
2053 genome = pbp->genome;
2054
2055 if (value == gcpvalue)
2056 return;
2057
2058 if (value == 7 || value == 8) {
2059 ErrPostEx(SEV_WARNING, ERR_CDREGION_InvalidGcodeTable,
2060 "genetic code table is obsolete /transl_table = %d", value);
2061 pbp->gcode.SetId(pbp->orig_gcode);
2062 return;
2063 }
2064
2065 if (value != 11 || (genome != 2 && genome != 3 && genome != 6 &&
2066 genome != 12 && genome != 16 && genome != 17 &&
2067 genome != 18 && genome != 22)) {
2068 sev = (taxserver == 0) ? SEV_INFO : SEV_ERROR;
2069 ErrPostEx(sev, ERR_CDREGION_GeneticCodeDiff,
2070 "Genetic code from Taxonomy server: %d, from /transl_table: %d",
2071 gcpvalue, value);
2072 }
2073
2074 pbp->gcode.SetId(value);
2075 }
2076
2077 /**********************************************************/
CpGeneticCodePtr(objects::CGenetic_code & code,const objects::CGenetic_code::C_E & gcode)2078 static bool CpGeneticCodePtr(objects::CGenetic_code& code, const objects::CGenetic_code::C_E& gcode)
2079 {
2080 if (!gcode.IsId())
2081 return false;
2082
2083 CRef<objects::CGenetic_code::C_E> ce(new objects::CGenetic_code::C_E);
2084 ce->SetId(gcode.GetId());
2085 code.Set().push_back(ce);
2086
2087 return true;
2088 }
2089
2090 /**********************************************************/
IfOnlyStopCodon(const objects::CBioseq & bioseq,const objects::CSeq_feat & feat,bool transl)2091 static Int4 IfOnlyStopCodon(const objects::CBioseq& bioseq, const objects::CSeq_feat& feat, bool transl)
2092 {
2093 char* p;
2094 Uint1 strand;
2095 Int4 len;
2096 Int4 i;
2097
2098 if (!feat.IsSetLocation() || transl)
2099 return(0);
2100
2101 const objects::CSeq_loc& loc = feat.GetLocation();
2102 TSeqPos start = loc.GetStart(objects::eExtreme_Positional),
2103 stop = loc.GetStop(objects::eExtreme_Positional) + 1;
2104
2105 if (start == kInvalidSeqPos || stop == kInvalidSeqPos)
2106 return(0);
2107
2108 len = stop - start;
2109 if (len < 1 || len > 5)
2110 return(0);
2111
2112 strand = loc.IsSetStrand() ? loc.GetStrand() : 0;
2113
2114 p = location_to_string(loc);
2115 if ((strand == 2 && stop == bioseq.GetLength()) || (strand != 2 && start == 0)) {
2116 ErrPostEx(SEV_INFO, ERR_CDREGION_StopCodonOnly,
2117 "Assuming coding region at \"%s\" annotates the stop codon of an upstream or downstream coding region.",
2118 (p == NULL) ? "???" : p);
2119 i = 1;
2120 }
2121 else {
2122 ErrPostEx(SEV_REJECT, ERR_CDREGION_StopCodonBadInterval,
2123 "Coding region at \"%s\" appears to annotate a stop codon, but its location does not include a sequence endpoint.",
2124 (p == NULL) ? "???" : p);
2125 i = -1;
2126 }
2127
2128 if (p != NULL)
2129 MemFree(p);
2130
2131 return(i);
2132 }
2133
2134 /**********************************************************/
fta_concat_except_text(objects::CSeq_feat & feat,const Char * text)2135 static void fta_concat_except_text(objects::CSeq_feat& feat, const Char* text)
2136 {
2137 if (text == NULL)
2138 return;
2139
2140 if (feat.IsSetExcept_text()) {
2141 feat.SetExcept_text() += ", ";
2142 feat.SetExcept_text() += text;
2143 }
2144 else
2145 feat.SetExcept_text() = text;
2146 }
2147
2148 /**********************************************************/
fta_check_exception(objects::CSeq_feat & feat,Parser::ESource source)2149 static bool fta_check_exception(objects::CSeq_feat& feat, Parser::ESource source)
2150 {
2151 const char **b;
2152 ErrSev sev;
2153 char* p;
2154
2155 if (!feat.IsSetQual())
2156 return true;
2157
2158 bool stopped = false;
2159 for (TQualVector::iterator qual = feat.SetQual().begin(); qual != feat.SetQual().end();) {
2160 if (!(*qual)->IsSetQual()) {
2161 ++qual;
2162 continue;
2163 }
2164
2165 if ((*qual)->GetQual() == "ribosomal_slippage")
2166 p = (char*) "ribosomal slippage";
2167 else if ((*qual)->GetQual() == "trans_splicing")
2168 p = (char*) "trans-splicing";
2169 else
2170 p = NULL;
2171
2172 if (p != NULL) {
2173 feat.SetExcept(true);
2174
2175 qual = feat.SetQual().erase(qual);
2176 fta_concat_except_text(feat, p);
2177 continue;
2178 }
2179
2180 if ((*qual)->GetQual() != "exception" || !(*qual)->IsSetVal()) {
2181 ++qual;
2182 continue;
2183 }
2184
2185 if (source == Parser::ESource::Refseq)
2186 b = RSExceptionQualVals;
2187 else
2188 b = GBExceptionQualVals;
2189
2190 const Char* cur_val = (*qual)->GetVal().c_str();
2191 for (; *b != NULL; b++) {
2192 if (StringICmp(*b, cur_val) == 0)
2193 break;
2194 }
2195
2196 if (*b == NULL) {
2197 sev = (source == Parser::ESource::Refseq) ? SEV_ERROR : SEV_REJECT;
2198
2199 p = location_to_string(feat.GetLocation());
2200 ErrPostEx(sev, ERR_QUALIFIER_InvalidException,
2201 "/exception value \"%s\" on feature \"CDS\" at location \"%s\" is invalid.",
2202 cur_val, (p == NULL) ? "Unknown" : p);
2203 if (p != NULL)
2204 MemFree(p);
2205 if (source != Parser::ESource::Refseq) {
2206 stopped = true;
2207 break;
2208 }
2209 }
2210 else {
2211 feat.SetExcept(true);
2212 fta_concat_except_text(feat, cur_val);
2213 }
2214
2215 qual = feat.SetQual().erase(qual);
2216 }
2217
2218 if (feat.GetQual().empty())
2219 feat.ResetQual();
2220
2221 if (stopped)
2222 return false;
2223
2224 return true;
2225 }
2226
2227 /**********************************************************
2228 *
2229 * static Int2 CkCdRegion(pp, sfp, bsq, num, gene):
2230 *
2231 * Routine returns 0, and
2232 * sfp->data.choice = SEQFEAT_IMP :
2233 * - if ambiguous frame number and no "codon_start"
2234 * qualifier;
2235 * - if there is a "pseudo" qualifier;
2236 * - if the range of the "transl_except" qualifier is
2237 * wrong;
2238 * Otherwise return 1, and
2239 * sfp->data.choice = SEQFEAT_CDREGION
2240 *
2241 **********************************************************/
CkCdRegion(ParserPtr pp,CScope & scope,objects::CSeq_feat & cds,objects::CBioseq & bioseq,int * num,GeneRefFeats & gene_refs)2242 static Int2 CkCdRegion(ParserPtr pp, CScope& scope, objects::CSeq_feat& cds, objects::CBioseq& bioseq,
2243 int* num, GeneRefFeats& gene_refs)
2244 {
2245 ProtBlkPtr pbp;
2246 const char *r;
2247
2248 char* qval = NULL;
2249 char* p;
2250 bool is_pseudo;
2251 bool is_stop;
2252 bool is_transl;
2253
2254 ErrSev sev;
2255
2256 Uint1 method = 0;
2257 Uint1 dif;
2258 Uint1 codon_start;
2259 Int2 frame;
2260 Int2 i;
2261
2262 pbp = pp->pbp;
2263 if (pp->buf != NULL)
2264 MemFree(pp->buf);
2265 pp->buf = NULL;
2266
2267 TCodeBreakList code_breaks;
2268 GetCdRegionCB(pbp->ibp, cds, code_breaks, &dif, pp->accver);
2269
2270 is_pseudo = cds.IsSetPseudo() ? cds.GetPseudo() : false;
2271 is_transl = FindTheQual(cds, "translation");
2272
2273 objects::CCode_break* first_code_break = nullptr;
2274 if (!code_breaks.empty())
2275 first_code_break = *code_breaks.begin();
2276
2277 if (first_code_break != nullptr && first_code_break->GetAa().GetNcbieaa() == 42)
2278 is_stop = true;
2279 else if (is_pseudo)
2280 is_stop = false;
2281 else {
2282 i = IfOnlyStopCodon(bioseq, cds, is_transl);
2283 if (i < 0)
2284 return(-1);
2285 is_stop = (i != 0);
2286 }
2287
2288 if (!is_transl) {
2289 bool found = false;
2290 ITERATE(TQualVector, qual, cds.GetQual())
2291 {
2292 if (!(*qual)->IsSetQual() || !(*qual)->IsSetVal())
2293 continue;
2294
2295 if ((*qual)->GetQual() == "product" ||
2296 (*qual)->GetQual() == "function" ||
2297 (*qual)->GetQual() == "EC_number") {
2298 found = true;
2299 break;
2300 }
2301 }
2302
2303 if (found) {
2304 CRef<objects::CSeqFeatXref> xfer(new objects::CSeqFeatXref);
2305 objects::CProt_ref& prot_ref = xfer->SetData().SetProt();
2306 ITERATE(TQualVector, qual, cds.GetQual())
2307 {
2308 if (!(*qual)->IsSetQual() || !(*qual)->IsSetVal())
2309 continue;
2310
2311 if ((*qual)->GetQual() == "product")
2312 prot_ref.SetName().push_back((*qual)->GetVal());
2313 else if ((*qual)->GetQual() == "EC_number")
2314 prot_ref.SetEc().push_back((*qual)->GetVal());
2315 else if ((*qual)->GetQual() == "function")
2316 prot_ref.SetActivity().push_back((*qual)->GetVal());
2317 }
2318
2319 DeleteQual(cds.SetQual(), "product");
2320 DeleteQual(cds.SetQual(), "EC_number");
2321 DeleteQual(cds.SetQual(), "function");
2322
2323 if (cds.GetQual().empty())
2324 cds.ResetQual();
2325
2326 cds.SetXref().push_back(xfer);
2327 }
2328 }
2329
2330 if (pp->accver && is_transl && is_pseudo) {
2331 p = location_to_string(cds.GetLocation());
2332 ErrPostEx(SEV_ERROR, ERR_CDREGION_PseudoWithTranslation,
2333 "Coding region flagged as /pseudo has a /translation qualifier : \"%s\".",
2334 (p == NULL) ? "" : p);
2335 MemFree(p);
2336 return(-1);
2337 }
2338
2339 if (pp->mode != Parser::EMode::Relaxed &&
2340 pp->accver && is_transl == false && FindTheQual(cds, "protein_id")) {
2341 p = location_to_string(cds.GetLocation());
2342 ErrPostEx(SEV_ERROR, ERR_CDREGION_UnexpectedProteinId,
2343 "CDS without /translation should not have a /protein_id qualifier. CDS = \"%s\".",
2344 (p == NULL) ? "" : p);
2345 MemFree(p);
2346 return(-1);
2347 }
2348
2349 if (pp->mode != Parser::EMode::Relaxed &&
2350 is_transl == false && is_pseudo == false && is_stop == false) {
2351 p = location_to_string(cds.GetLocation());
2352 if (pp->accver == false) {
2353 r = "Feature and protein bioseq";
2354 i = -2;
2355 }
2356 else {
2357 r = "Record";
2358 i = -1;
2359 }
2360 sev = (i == -1) ? SEV_REJECT : SEV_ERROR;
2361 ErrPostEx(sev, ERR_CDREGION_MissingTranslation,
2362 "Missing /translation qualifier for CDS \"%s\". %s rejected.",
2363 (p == NULL) ? "" : p, r);
2364 MemFree(p);
2365 return(i);
2366 }
2367
2368 /* check exception qualifier
2369 */
2370 if (!fta_check_exception(cds, pp->source))
2371 return(-1);
2372
2373 CRef<objects::CImp_feat> imp_feat(new objects::CImp_feat);
2374 if (cds.IsSetData() && cds.GetData().IsImp())
2375 imp_feat->Assign(cds.GetData().GetImp());
2376
2377 codon_start = 1;
2378 qval = GetTheQualValue(cds.SetQual(), "codon_start");
2379
2380 if (qval == NULL) {
2381 if (pp->source == Parser::ESource::EMBL)
2382 frame = 1;
2383 else {
2384 frame = 0;
2385 objects::CCdregion::EFrame loc_frame = objects::CCdregion::eFrame_not_set;
2386 if (objects::CCleanup::SetFrameFromLoc(loc_frame, cds.GetLocation(), scope))
2387 frame = loc_frame;
2388
2389 if (frame == 0 && is_pseudo == false) {
2390 p = location_to_string(cds.GetLocation());
2391 sev = (pp->source == Parser::ESource::DDBJ) ? SEV_INFO : SEV_ERROR;
2392 ErrPostEx(sev, ERR_CDREGION_MissingCodonStart,
2393 "CDS feature \"%s\" is lacking /codon_start qualifier; assuming frame = 1.",
2394 (p == NULL) ? "" : p);
2395 MemFree(p);
2396 frame = 1;
2397 }
2398 }
2399 }
2400 else {
2401 frame = (Uint1)atoi(qval);
2402 MemFree(qval);
2403 }
2404
2405 CRef<objects::CCdregion> cdregion(new objects::CCdregion);
2406
2407 if (frame > 0)
2408 cdregion->SetFrame(static_cast<objects::CCdregion::EFrame>(frame));
2409
2410 qval = GetTheQualValue(cds.SetQual(), "transl_table");
2411
2412 if (qval != NULL) {
2413 check_gen_code(qval, pbp, pp->taxserver);
2414 pp->no_code = false;
2415 MemFree(qval);
2416 }
2417 else if (pbp != NULL && pbp->gcode.IsId())
2418 pbp->gcode.SetId(pbp->orig_gcode);
2419
2420 if (!code_breaks.empty())
2421 cdregion->SetCode_break().swap(code_breaks);
2422
2423 if (!CpGeneticCodePtr(cdregion->SetCode(), pbp->gcode))
2424 cdregion->ResetCode();
2425
2426 cds.SetData().SetCdregion(*cdregion);
2427
2428 if (cds.GetQual().empty())
2429 cds.ResetQual();
2430
2431 if (!is_transl) {
2432 imp_feat.Reset();
2433 return(0);
2434 }
2435
2436 objects::CBioseq::TId ids;
2437 GetProtRefSeqId(ids, pbp->ibp, num, pp, scope, cds);
2438
2439 if (!ids.empty())
2440 fta_check_codon_quals(cds);
2441
2442 std::string sequence_data;
2443 InternalStopCodon(pp, pbp->ibp, cds, &method, dif, gene_refs, sequence_data);
2444
2445 if (cds.GetQual().empty())
2446 cds.ResetQual();
2447
2448 if (cdregion->IsSetConflict() && cdregion->GetConflict() && codon_start == 0) {
2449 p = location_to_string(cds.GetLocation());
2450 ErrPostEx(SEV_ERROR, ERR_CDREGION_TooBad,
2451 "Input translation does not agree with parser generated one, cdregion \"%s\" is lacking /codon_start, frame not set, - so sequence will be rejected.",
2452 (p == NULL) ? "" : p);
2453 MemFree(p);
2454 return(-1);
2455 }
2456
2457 if (!sequence_data.empty()) {
2458 imp_feat.Reset();
2459 CRef<objects::CBioseq> new_bioseq = BldProtRefSeqEntry(pbp, cds, sequence_data, method, pp, bioseq, ids);
2460
2461 if (new_bioseq.Empty()) {
2462 return(-1);
2463 }
2464
2465 scope.AddBioseq(*new_bioseq);
2466
2467 /* remove qualifiers which were processed before
2468 */
2469 DeleteQual(cds.SetQual(), "codon_start");
2470 DeleteQual(cds.SetQual(), "transl_except");
2471 DeleteQual(cds.SetQual(), "translation");
2472 DeleteQual(cds.SetQual(), "protein_id");
2473
2474 if (cds.GetQual().empty())
2475 cds.ResetQual();
2476
2477 if (sequence_data.size() < 6 && pp->accver == false && check_short_CDS(pp, cds, true)) {
2478 /* make xref from prot-ref for short CDS only
2479 */
2480 if (new_bioseq->IsSetAnnot()) {
2481 NON_CONST_ITERATE(objects::CBioseq::TAnnot, annot, new_bioseq->SetAnnot())
2482 {
2483 if (!(*annot)->IsFtable())
2484 continue;
2485
2486 ITERATE(TSeqFeatList, cur_feat, (*annot)->GetData().GetFtable())
2487 {
2488 if (!(*cur_feat)->IsSetData() || !(*cur_feat)->GetData().IsProt())
2489 continue;
2490
2491 CRef<objects::CSeqFeatXref> new_xref(new objects::CSeqFeatXref);
2492 new_xref->SetData().SetProt().Assign((*cur_feat)->GetData().GetProt());
2493
2494 cds.SetXref().push_back(new_xref);
2495 }
2496 }
2497 }
2498 return(0);
2499 }
2500
2501 objects::CSeq_id& first_id = *(*new_bioseq->SetId().begin());
2502 cds.SetProduct().SetWhole(first_id);
2503
2504 AddProtRefSeqEntry(pbp, *new_bioseq);
2505
2506 return(1);
2507 }
2508
2509 /* no protein sequence, or there is no translation qualifier
2510 * and protein sequence has internal stop codon
2511 */
2512
2513 cds.SetExcept(false);
2514 if (cds.IsSetExcept_text())
2515 cds.ResetExcept_text();
2516
2517 cds.ResetData();
2518 if (imp_feat.NotEmpty())
2519 cds.SetData().SetImp(*imp_feat);
2520
2521 if (!is_pseudo) {
2522 p = location_to_string(cds.GetLocation());
2523 ErrPostEx(SEV_ERROR, ERR_CDREGION_ConvertToImpFeat,
2524 "non-pseudo CDS with data problems is converted to ImpFeat%s",
2525 (p == NULL) ? "" : p);
2526 MemFree(p);
2527 }
2528 return(0);
2529 }
2530
2531 /**********************************************************
2532 *
2533 * static SeqFeatPtr SrchCdRegion(pp, bsp, sfp, gene):
2534 *
2535 * Return a link list of SeqFeatPtr of type CDREGION.
2536 *
2537 **********************************************************/
SrchCdRegion(ParserPtr pp,CScope & scope,objects::CBioseq & bioseq,objects::CSeq_annot & annot,GeneRefFeats & gene_refs)2538 static void SrchCdRegion(ParserPtr pp, CScope& scope, objects::CBioseq& bioseq, objects::CSeq_annot& annot,
2539 GeneRefFeats& gene_refs)
2540 {
2541 char* p;
2542 Int4 num = 0;
2543 Int2 i;
2544
2545 if (!annot.IsSetData() || !annot.GetData().IsFtable())
2546 return;
2547
2548 for (objects::CSeq_annot::C_Data::TFtable::iterator feat = annot.SetData().SetFtable().begin();
2549 feat != annot.SetData().SetFtable().end();) {
2550 if (!(*feat)->IsSetData() || !(*feat)->GetData().IsImp()) {
2551 ++feat;
2552 continue;
2553 }
2554
2555 const objects::CImp_feat& imp_feat = (*feat)->GetData().GetImp();
2556 if (!imp_feat.IsSetKey() || imp_feat.GetKey() != "CDS") {
2557 ++feat;
2558 continue;
2559 }
2560
2561 /* remove asn2ff_generated comments
2562 */
2563 StripCDSComment(*(*feat));
2564
2565 const objects::CSeq_loc& loc = (*feat)->GetLocation();
2566 if (loc.IsEmpty() || loc.IsEquiv() || loc.IsBond()) {
2567 p = location_to_string(loc);
2568 ErrPostEx(SEV_REJECT, ERR_CDREGION_BadLocForTranslation,
2569 "Coding region feature has a location that cannot be processed: \"%s\".",
2570 (p == NULL) ? "" : p);
2571 if (p != NULL)
2572 MemFree(p);
2573 pp->entrylist[pp->curindx]->drop = 1;
2574 break;
2575 }
2576
2577 i = CkCdRegion(pp, scope, *(*feat), bioseq, &num, gene_refs);
2578
2579 if (i == -2) {
2580 feat = annot.SetData().SetFtable().erase(feat);
2581 continue;
2582 }
2583
2584 if (i == -1) {
2585 pp->entrylist[pp->curindx]->drop = 1;
2586 break;
2587 }
2588
2589 if (i != 1) {
2590 ++feat;
2591 continue;
2592 }
2593
2594 /* prepare cdregion to link list, for nuc-prot level
2595 */
2596 pp->pbp->feats.push_back(*feat);
2597
2598 feat = annot.SetData().SetFtable().erase(feat);
2599 }
2600 }
2601
2602 /**********************************************************/
FindCd(TEntryList & seq_entries,CScope & scope,ParserPtr pp,GeneRefFeats & gene_refs)2603 static void FindCd(TEntryList& seq_entries, CScope& scope, ParserPtr pp, GeneRefFeats& gene_refs)
2604 {
2605 ProtBlkPtr pbp = pp->pbp;
2606
2607 NON_CONST_ITERATE(TEntryList, entry, seq_entries)
2608 {
2609 for (CTypeIterator<objects::CBioseq_set> bio_set(Begin(*(*entry))); bio_set; ++bio_set) {
2610 pbp->segset = true;
2611 pbp->biosep = *entry;
2612 break;
2613 }
2614
2615 if (pbp->segset)
2616 break;
2617 }
2618
2619 NON_CONST_ITERATE(TEntryList, entry, seq_entries)
2620 {
2621 for (CTypeIterator<objects::CBioseq> bioseq(Begin(*(*entry))); bioseq; ++bioseq) {
2622 const objects::CSeq_id& first_id = *(*bioseq->GetId().begin());
2623 if (IsSegBioseq(first_id))
2624 continue;
2625
2626 InfoBioseqFree(pbp->ibp);
2627 if(pp->source != Parser::ESource::USPTO)
2628 CpSeqId(pbp->ibp, first_id);
2629
2630 if (bioseq->IsSetAnnot()) {
2631 for (objects::CBioseq::TAnnot::iterator annot = bioseq->SetAnnot().begin(); annot != bioseq->SetAnnot().end();) {
2632 if (!(*annot)->IsFtable()) {
2633 ++annot;
2634 continue;
2635 }
2636
2637 SrchCdRegion(pp, scope, *bioseq, *(*annot), gene_refs);
2638 if (!(*annot)->GetData().GetFtable().empty()) {
2639 ++annot;
2640 continue;
2641 }
2642
2643 annot = bioseq->SetAnnot().erase(annot);
2644 }
2645
2646 if (bioseq->GetAnnot().empty())
2647 bioseq->ResetAnnot();
2648 }
2649
2650 if (!pbp->segset) {
2651 pbp->biosep = *entry;
2652 }
2653 }
2654 }
2655 }
2656
2657 /**********************************************************/
check_GIBB(TSeqdescList & descrs)2658 static bool check_GIBB(TSeqdescList& descrs)
2659 {
2660 if (descrs.empty())
2661 return false;
2662
2663 const objects::CSeqdesc* descr_modif = nullptr;
2664 ITERATE(TSeqdescList, descr, descrs)
2665 {
2666 if ((*descr)->IsModif()) {
2667 descr_modif = *descr;
2668 break;
2669 }
2670 }
2671 if (descr_modif == nullptr)
2672 return true;
2673
2674 if (!descr_modif->GetModif().empty()) {
2675 objects::EGIBB_mod gmod = *descr_modif->GetModif().begin();
2676 if (gmod == objects::eGIBB_mod_dna || gmod == objects::eGIBB_mod_rna ||
2677 gmod == objects::eGIBB_mod_est)
2678 return false;
2679 }
2680 return true;
2681 }
2682
2683 /**********************************************************/
ValNodeExtractUserObject(TSeqdescList & descrs_from,TSeqdescList & descrs_to,const Char * tag)2684 static void ValNodeExtractUserObject(TSeqdescList& descrs_from, TSeqdescList& descrs_to, const Char* tag)
2685 {
2686 for (TSeqdescList::iterator descr = descrs_from.begin(); descr != descrs_from.end();) {
2687 if ((*descr)->IsUser() && (*descr)->GetUser().IsSetData() &&
2688 (*descr)->GetUser().IsSetType() && (*descr)->GetUser().GetType().IsStr() &&
2689 (*descr)->GetUser().GetType().GetStr() == tag) {
2690 descrs_to.push_back(*descr);
2691 descr = descrs_from.erase(descr);
2692 break;
2693 }
2694 else
2695 ++descr;
2696 }
2697 }
2698
2699 /**********************************************************/
ExtractDescrs(TSeqdescList & descrs_from,TSeqdescList & descrs_to,objects::CSeqdesc::E_Choice choice)2700 void ExtractDescrs(TSeqdescList& descrs_from, TSeqdescList& descrs_to, objects::CSeqdesc::E_Choice choice)
2701 {
2702 for (TSeqdescList::iterator descr = descrs_from.begin(); descr != descrs_from.end();) {
2703 if ((*descr)->Which() == choice) {
2704 descrs_to.push_back(*descr);
2705 descr = descrs_from.erase(descr);
2706 }
2707 else
2708 ++descr;
2709 }
2710 }
2711
2712 /**********************************************************/
GetBioseqSetDescr(ProtBlkPtr pbp,TSeqdescList & descrs)2713 static void GetBioseqSetDescr(ProtBlkPtr pbp, TSeqdescList& descrs)
2714 {
2715 TSeqdescList* descrs_from = nullptr;
2716 if (pbp->segset) {
2717 if (!pbp->biosep->GetSet().GetDescr().Get().empty())
2718 descrs_from = &pbp->biosep->SetSet().SetDescr().Set();
2719 }
2720 else {
2721 if (!pbp->biosep->GetSeq().GetDescr().Get().empty())
2722 descrs_from = &pbp->biosep->SetSeq().SetDescr().Set();
2723 }
2724
2725 if (descrs_from == nullptr)
2726 return;
2727
2728 ExtractDescrs(*descrs_from, descrs, objects::CSeqdesc::e_Org);
2729
2730 if (check_GIBB(*descrs_from)) {
2731 ExtractDescrs(*descrs_from, descrs, objects::CSeqdesc::e_Modif);
2732 }
2733
2734 ExtractDescrs(*descrs_from, descrs, objects::CSeqdesc::e_Comment);
2735 ExtractDescrs(*descrs_from, descrs, objects::CSeqdesc::e_Pub);
2736 ExtractDescrs(*descrs_from, descrs, objects::CSeqdesc::e_Update_date);
2737
2738 ValNodeExtractUserObject(*descrs_from, descrs, "GenomeProjectsDB");
2739 ValNodeExtractUserObject(*descrs_from, descrs, "DBLink");
2740 ValNodeExtractUserObject(*descrs_from, descrs, "FeatureFetchPolicy");
2741 }
2742
2743 /**********************************************************/
BuildProtBioseqSet(ProtBlkPtr pbp,TEntryList & entries)2744 static void BuildProtBioseqSet(ProtBlkPtr pbp, TEntryList& entries)
2745 {
2746 CRef<objects::CSeq_entry> entry(new objects::CSeq_entry);
2747 objects::CBioseq_set& seq_set = entry->SetSet();
2748 seq_set.SetClass(objects::CBioseq_set::eClass_nuc_prot);
2749
2750 /* add descr if nuc-prot
2751 */
2752 GetBioseqSetDescr(pbp, seq_set.SetDescr().Set()); /* get from ASN.1 tree */
2753 if (seq_set.GetDescr().Get().empty())
2754 seq_set.ResetDescr();
2755
2756 seq_set.SetSeq_set().splice(seq_set.SetSeq_set().end(), entries);
2757
2758 CRef<objects::CSeq_annot> annot(new objects::CSeq_annot);
2759
2760 if (!pbp->feats.empty())
2761 annot->SetData().SetFtable().swap(pbp->feats);
2762
2763 seq_set.SetAnnot().push_back(annot);
2764
2765 entries.push_back(entry);
2766 }
2767
2768 /**********************************************************/
ProcNucProt(ParserPtr pp,TEntryList & seq_entries,GeneRefFeats & gene_refs)2769 void ProcNucProt(ParserPtr pp, TEntryList& seq_entries, GeneRefFeats& gene_refs)
2770 {
2771 ProtBlkPtr pbp;
2772 ErrSev sev;
2773 Int4 gcode = 0;
2774
2775 pbp = pp->pbp;
2776 ProtBlkInit(pbp);
2777
2778 GetGcode(seq_entries, pp);
2779
2780 if (!pbp->gcode.IsId()) {
2781 gcode = (pbp->genome == 4 || pbp->genome == 5) ? 2 : 1;
2782 pp->no_code = true;
2783 sev = (pp->taxserver == 0) ? SEV_INFO : SEV_WARNING;
2784 ErrPostEx(sev, ERR_CDREGION_GeneticCodeAssumed,
2785 "No %sgenetic code from TaxArch, code %d assumed",
2786 (gcode == 2) ? "mitochondrial " : "", gcode);
2787 pbp->gcode.SetId(gcode);
2788 pbp->orig_gcode = gcode;
2789 }
2790
2791 FindCd(seq_entries, GetScope(), pp, gene_refs);
2792
2793 if (pp->entrylist[pp->curindx]->drop == 1) {
2794 ProtBlkFree(pbp);
2795 seq_entries.clear();
2796 return;
2797 }
2798
2799 if (!pbp->entries.empty()) {
2800 seq_entries.splice(seq_entries.end(), pbp->entries);
2801
2802 BuildProtBioseqSet(pbp, seq_entries);
2803 AssignBioseqSetLevel(seq_entries);
2804 }
2805
2806 ProtBlkFree(pbp);
2807 }
2808
2809 /**********************************************************/
GetDateFromDescrs(const TSeqdescList & descrs,objects::CSeqdesc::E_Choice what)2810 static const objects::CDate* GetDateFromDescrs(const TSeqdescList& descrs, objects::CSeqdesc::E_Choice what)
2811 {
2812 const objects::CDate* set_date = nullptr;
2813 ITERATE(TSeqdescList, descr, descrs)
2814 {
2815 if ((*descr)->Which() == what) {
2816 if (what == objects::CSeqdesc::e_Create_date)
2817 set_date = &(*descr)->GetCreate_date();
2818 else if (what == objects::CSeqdesc::e_Update_date)
2819 set_date = &(*descr)->GetUpdate_date();
2820
2821 if (set_date)
2822 break;
2823 }
2824 }
2825
2826 return set_date;
2827 }
2828
2829 /**********************************************************/
FixDupDates(objects::CBioseq_set & bio_set,objects::CSeqdesc::E_Choice what)2830 static void FixDupDates(objects::CBioseq_set& bio_set, objects::CSeqdesc::E_Choice what)
2831 {
2832 if (!bio_set.IsSetSeq_set() || !bio_set.IsSetDescr())
2833 return;
2834
2835 NON_CONST_ITERATE(TEntryList, entry, bio_set.SetSeq_set())
2836 {
2837 for (CTypeIterator<objects::CBioseq> bioseq(Begin(*(*entry))); bioseq; ++bioseq) {
2838 if (!bioseq->IsSetInst() || !bioseq->GetInst().IsSetMol() || !bioseq->GetInst().IsNa() || !bioseq->IsSetDescr())
2839 continue;
2840
2841 const objects::CDate* set_date = GetDateFromDescrs(bio_set.GetDescr().Get(), what);
2842
2843 TSeqdescList& cur_descrs = bioseq->SetDescr().Set();
2844 TSeqdescList::iterator cur_descr = cur_descrs.begin();
2845
2846 for (; cur_descr != cur_descrs.end(); ++cur_descr) {
2847 if ((*cur_descr)->Which() == what)
2848 break;
2849 }
2850
2851 if (cur_descr == cur_descrs.end())
2852 continue;
2853
2854 const objects::CDate* seq_date = nullptr;
2855 if (what == objects::CSeqdesc::e_Create_date)
2856 seq_date = &(*cur_descr)->GetCreate_date();
2857 else if (what == objects::CSeqdesc::e_Update_date)
2858 seq_date = &(*cur_descr)->GetUpdate_date();
2859
2860 if (seq_date == nullptr)
2861 continue;
2862
2863 if (set_date && seq_date->Compare(*set_date) == objects::CDate::eCompare_same)
2864 cur_descrs.erase(cur_descr);
2865
2866 if (set_date == nullptr) {
2867 bio_set.SetDescr().Set().push_back(*cur_descr);
2868 cur_descrs.erase(cur_descr);
2869 }
2870 }
2871 }
2872 }
2873
2874 /**********************************************************/
FixCreateDates(objects::CBioseq_set & bio_set)2875 static void FixCreateDates(objects::CBioseq_set& bio_set)
2876 {
2877 FixDupDates(bio_set, objects::CSeqdesc::e_Create_date);
2878 }
2879
2880 /**********************************************************/
FixUpdateDates(objects::CBioseq_set & bio_set)2881 static void FixUpdateDates(objects::CBioseq_set& bio_set)
2882 {
2883 FixDupDates(bio_set, objects::CSeqdesc::e_Update_date);
2884 }
2885
2886 /**********************************************************/
FixEmblUpdateDates(objects::CBioseq_set & bio_set)2887 static void FixEmblUpdateDates(objects::CBioseq_set& bio_set)
2888 {
2889 if (!bio_set.IsSetSeq_set() || !bio_set.IsSetDescr())
2890 return;
2891
2892 NON_CONST_ITERATE(TEntryList, entry, bio_set.SetSeq_set())
2893 {
2894 for (CTypeIterator<objects::CBioseq> bioseq(Begin(*(*entry))); bioseq; ++bioseq) {
2895 if (!bioseq->IsSetInst() || !bioseq->GetInst().IsSetMol() || !bioseq->GetInst().IsNa() || !bioseq->IsSetDescr())
2896 continue;
2897
2898 const objects::CDate* set_date = GetDateFromDescrs(bio_set.GetDescr().Get(), objects::CSeqdesc::e_Update_date);
2899
2900 const objects::CEMBL_block* embl_block = nullptr;
2901 ITERATE(TSeqdescList, descr, bioseq->GetDescr().Get())
2902 {
2903 if ((*descr)->IsEmbl()) {
2904 embl_block = &(*descr)->GetEmbl();
2905 break;
2906 }
2907 }
2908
2909 const objects::CDate* seq_date = nullptr;
2910 if (embl_block != nullptr && embl_block->IsSetUpdate_date())
2911 seq_date = &embl_block->GetUpdate_date();
2912
2913 if (seq_date == nullptr)
2914 continue;
2915
2916 if (set_date && seq_date->Compare(*set_date) == objects::CDate::eCompare_same)
2917 continue;
2918
2919 if (set_date == nullptr) {
2920 CRef<objects::CSeqdesc> new_descr(new objects::CSeqdesc);
2921 new_descr->SetUpdate_date().Assign(*seq_date);
2922 bio_set.SetDescr().Set().push_back(new_descr);
2923 }
2924 }
2925 }
2926 }
2927
2928 /**********************************************************/
CheckDupDates(TEntryList & seq_entries)2929 void CheckDupDates(TEntryList& seq_entries)
2930 {
2931 NON_CONST_ITERATE(TEntryList, entry, seq_entries)
2932 {
2933 for (CTypeIterator<objects::CBioseq_set> bio_set(Begin(*(*entry))); bio_set; ++bio_set) {
2934 if (bio_set->IsSetClass() && bio_set->GetClass() == objects::CBioseq_set::eClass_nuc_prot) {
2935 FixCreateDates(*bio_set);
2936 FixUpdateDates(*bio_set);
2937 FixEmblUpdateDates(*bio_set);
2938 }
2939 }
2940 }
2941 }
2942
2943 END_NCBI_SCOPE
2944