1 /* nucprot.c
2 *
3 * ===========================================================================
4 *
5 *                            PUBLIC DOMAIN NOTICE
6 *               National Center for Biotechnology Information
7 *
8 *  This software/database is a "United States Government Work" under the
9 *  terms of the United States Copyright Act.  It was written as part of
10 *  the author's official duties as a United States Government employee and
11 *  thus cannot be copyrighted.  This software/database is freely available
12 *  to the public for use. The National Library of Medicine and the U.S.
13 *  Government have not placed any restriction on its use or reproduction.
14 *
15 *  Although all reasonable efforts have been taken to ensure the accuracy
16 *  and reliability of the software and data, the NLM and the U.S.
17 *  Government do not and cannot warrant the performance or results that
18 *  may be obtained by using this software or data. The NLM and the U.S.
19 *  Government disclaim all warranties, express or implied, including
20 *  warranties of performance, merchantability or fitness for any particular
21 *  purpose.
22 *
23 *  Please cite the author in any work or product based on this material.
24 *
25 * ===========================================================================
26 *
27 * File Name:  nucprot.c
28 *
29 * Author: Karl Sirotkin, Hsiu-Chuan Chen
30 *
31 * File Description:
32 * -----------------
33 *
34 *      Take a Seq-entry or elements of a Bioseq-set, do orglookup,
35 * protein translation lookup, then make a nucleic acid protein
36 * sequence.
37 *
38 *      Get genetic code from either from Taxonomy database or from
39 * guess rules (if the organism is different in the segment set or
40 * Taxserver is not available)
41 *
42 *      orglookup includes
43 *      - lookup taxname, common name
44 *      - get lineage and division
45 *      - get genetic codes
46 *
47 *      Protein translation lookup includes
48 *      - lookup internal and end stop codon
49 *      - compare two sequences, one from CdRegion, one from
50 *        translation qualifier
51 *
52 *      Take our translation when the only diff is start codon.
53 *
54 *      This program only assign 3 different level of Bioseqset:
55 *         class = nucprot, assign level = 1
56 *         class = segset,  assign level = 2
57 *         class = parts,   assign levle = 3
58 *
59 */
60 #include <ncbi_pch.hpp>
61 
62 #include "ftacpp.hpp"
63 
64 #include <objmgr/scope.hpp>
65 #include <objects/seq/Seq_descr.hpp>
66 #include <objects/seq/Seq_annot.hpp>
67 #include <objects/seqfeat/Imp_feat.hpp>
68 #include <objects/seqfeat/Code_break.hpp>
69 #include <objmgr/util/seq_loc_util.hpp>
70 #include <objects/seqfeat/SeqFeatXref.hpp>
71 #include <objects/seqfeat/Cdregion.hpp>
72 #include <objects/seqfeat/Genetic_code.hpp>
73 #include <objmgr/util/sequence.hpp>
74 #include <objects/seq/Seq_data.hpp>
75 #include <objects/general/Object_id.hpp>
76 #include <objects/general/Dbtag.hpp>
77 #include <objects/seqfeat/Org_ref.hpp>
78 #include <objtools/cleanup/cleanup.hpp>
79 #include <objects/general/User_object.hpp>
80 #include <objects/seqblock/EMBL_block.hpp>
81 #include <objects/seq/GIBB_method.hpp>
82 #include <objtools/edit/cds_fix.hpp>
83 
84 
85 #include "index.h"
86 
87 #include <objtools/flatfile/flatfile_parser.hpp>
88 #include <objtools/flatfile/flatdefn.h>
89 
90 #include "ftaerr.hpp"
91 #include "asci_blk.h"
92 #include "add.h"
93 #include "utilfeat.h"
94 #include "nucprot.h"
95 #include "fta_src.h"
96 #include "utilfun.h"
97 #include "indx_blk.h"
98 #include "xgbparint.h"
99 
100 #ifdef THIS_FILE
101 #    undef THIS_FILE
102 #endif
103 #define THIS_FILE "nucprot.cpp"
104 
105 BEGIN_NCBI_SCOPE
106 USING_SCOPE(objects);
107 
108 typedef list<CRef<objects::CCode_break> > TCodeBreakList;
109 
110 const char *GBExceptionQualVals[] = {
111     "RNA editing",
112     "reasons given in citation",
113     "rearrangement required for product",
114     "annotated by transcript or proteomic data",
115     NULL
116 };
117 
118 const char *RSExceptionQualVals[] = {
119     "RNA editing",
120     "reasons given in citation",
121     "ribosomal slippage",
122     "trans-splicing",
123     "alternative processing",
124     "artificial frameshift",
125     "nonconsensus splice site",
126     "rearrangement required for product",
127     "modified codon recognition",
128     "alternative start codon",
129     "unclassified transcription discrepancy",
130     "unclassified translation discrepancy",
131     "mismatches in transcription",
132     "mismatches in translation",
133     NULL
134 };
135 
136 /**********************************************************
137 *
138 *   bool FindTheQual(qlist, qual):
139 *
140 *      Finds qual in the "qlist" return TRUE.
141 *   Otherwise, return FALSE.
142 *
143 **********************************************************/
FindTheQual(const objects::CSeq_feat & feat,const Char * qual_to_find)144 static bool FindTheQual(const objects::CSeq_feat& feat, const Char *qual_to_find)
145 {
146     ITERATE(TQualVector, qual, feat.GetQual())
147     {
148         if ((*qual)->IsSetQual() && (*qual)->GetQual() == qual_to_find)
149             return true;
150     }
151 
152     return(false);
153 }
154 
155 /**********************************************************
156 *
157 *   static char* CpTheQualValueNext(qlist, retq, qual):
158 *
159 *      Return qual's value if found the "qual" in the
160 *   "qlist", and retq points to next available searching
161 *   list; Otherwise, return NULL value and retq points
162 *   to NULL.
163 *
164 **********************************************************/
CpTheQualValueNext(TQualVector::iterator & cur_qual,const TQualVector::iterator & end_qual,const char * qual)165 static char* CpTheQualValueNext(TQualVector::iterator& cur_qual, const TQualVector::iterator& end_qual,
166                                   const char *qual)
167 {
168     std::string qvalue;
169 
170     for (; cur_qual != end_qual; ++cur_qual) {
171         if (!(*cur_qual)->IsSetQual() || (*cur_qual)->GetQual() != qual || !(*cur_qual)->IsSetVal())
172             continue;
173 
174         qvalue = NStr::Sanitize((*cur_qual)->GetVal());
175 
176         ++cur_qual;
177         break;
178     }
179 
180     char* ret = NULL;
181     if (!qvalue.empty())
182         ret = StringSave(qvalue.c_str());
183 
184     return ret;
185 }
186 
187 /**********************************************************/
fta_get_genetic_code(ParserPtr pp)188 static Int4 fta_get_genetic_code(ParserPtr pp)
189 {
190     IndexblkPtr ibp;
191     ProtBlkPtr  pbp;
192     Int4        gcode;
193 
194     if (pp->taxserver != 1)
195         return(0);
196 
197     ibp = pp->entrylist[pp->curindx];
198     if (ibp->gc_genomic < 1 && ibp->gc_mito < 1)
199         return(0);
200 
201     pbp = pp->pbp;
202     gcode = ibp->gc_genomic;
203     if (pbp->genome == 4 || pbp->genome == 5)
204         gcode = ibp->gc_mito;
205     pp->no_code = false;
206 
207     return(gcode);
208 }
209 
210 /**********************************************************/
GuessGeneticCode(ParserPtr pp,const objects::CSeq_descr & descrs)211 static void GuessGeneticCode(ParserPtr pp, const objects::CSeq_descr& descrs)
212 {
213     ProtBlkPtr   pbp;
214     Int4         gcode = 0;
215 
216     pbp = pp->pbp;
217 
218     ITERATE(TSeqdescList, descr, descrs.Get())
219     {
220         if (!(*descr)->IsModif())
221             continue;
222 
223         ITERATE(objects::CSeqdesc::TModif, modif, (*descr)->GetModif())
224         {
225             if (*modif == objects::eGIBB_mod_mitochondrial ||
226                 *modif == objects::eGIBB_mod_kinetoplast) {
227                 pbp->genome = 5;        /* mitochondrion */
228                 break;
229             }
230         }
231         break;
232     }
233 
234     ITERATE(TSeqdescList, descr, descrs.Get())
235     {
236         if (!(*descr)->IsSource())
237             continue;
238 
239         pbp->genome = (*descr)->GetSource().IsSetGenome() ? (*descr)->GetSource().GetGenome() : 0;
240         break;
241     }
242 
243     gcode = fta_get_genetic_code(pp);
244     if (gcode <= 0)
245         return;
246 
247     pbp->orig_gcode = gcode;
248     pbp->gcode.SetId(gcode);
249 }
250 
251 /**********************************************************/
GetGcode(TEntryList & seq_entries,ParserPtr pp)252 static void GetGcode(TEntryList& seq_entries, ParserPtr pp)
253 {
254     if (pp != NULL && pp->pbp != NULL && !pp->pbp->gcode.IsId()) {
255         ITERATE(TEntryList, entry, seq_entries)
256         {
257             GuessGeneticCode(pp, GetDescrPointer(*(*entry)));
258 
259             if (pp->pbp->gcode.IsId())
260                 break;
261         }
262     }
263 }
264 
265 /**********************************************************/
ProtBlkFree(ProtBlkPtr pbp)266 static void ProtBlkFree(ProtBlkPtr pbp)
267 {
268     pbp->gcode.Reset();
269     pbp->feats.clear();
270 
271     pbp->entries.clear();
272     InfoBioseqFree(pbp->ibp);
273 }
274 
275 /**********************************************************/
ProtBlkInit(ProtBlkPtr pbp)276 static void ProtBlkInit(ProtBlkPtr pbp)
277 {
278     pbp->biosep = nullptr;
279 
280     pbp->gcode.Reset();
281     pbp->segset = false;
282     pbp->genome = 0;
283 
284     InfoBioseqPtr ibp = pbp->ibp;
285     if (ibp) {
286         ibp->ids.clear();
287         ibp->locus = NULL;
288         ibp->acnum = NULL;
289     }
290 }
291 
292 /**********************************************************/
AssignBioseqSetLevel(TEntryList & seq_entries)293 static void AssignBioseqSetLevel(TEntryList& seq_entries)
294 {
295 
296     NON_CONST_ITERATE(TEntryList, entry, seq_entries)
297     {
298         for (CTypeIterator<objects::CBioseq_set> bio_set(Begin(*(*entry))); bio_set; ++bio_set) {
299             switch (bio_set->GetClass()) {
300             case objects::CBioseq_set::eClass_nuc_prot:
301                 bio_set->SetLevel(1);
302                 break;
303             case objects::CBioseq_set::eClass_segset:
304                 bio_set->SetLevel(2);
305                 break;
306             case objects::CBioseq_set::eClass_parts:
307                 bio_set->SetLevel(3);
308                 break;
309             default:
310                 ErrPostEx(SEV_INFO, ERR_BIOSEQSETCLASS_NewClass,
311                           "BioseqSeq class %d not handled", (int)bio_set->GetClass());
312             }
313         }
314     }
315 }
316 
317 /**********************************************************
318 *
319 *   static bool check_short_CDS(pp, sfp, err_msg):
320 *
321 *      If CDS location contains one of the sequence ends
322 *   return TRUE, e.g. it's short do not create protein
323 *   bioseq, copy prot-ref to Xref.
324 *
325 **********************************************************/
check_short_CDS(ParserPtr pp,const objects::CSeq_feat & feat,bool err_msg)326 static bool check_short_CDS(ParserPtr pp, const objects::CSeq_feat& feat, bool err_msg)
327 {
328     const objects::CSeq_interval& interval = feat.GetLocation().GetInt();
329     if (interval.GetFrom() == 0 || interval.GetTo() == (TSeqPos)(pp->entrylist[pp->curindx]->bases) - 1)
330         return true;
331 
332     if (err_msg) {
333         char* p = location_to_string(feat.GetLocation());
334         ErrPostEx(SEV_WARNING, ERR_CDREGION_ShortProtein,
335                   "Short CDS (< 6 aa) located in the middle of the sequence: %s",
336                   (p == NULL) ? "" : p);
337         MemFree(p);
338     }
339     return false;
340 }
341 
342 /**********************************************************/
GetProtRefSeqId(objects::CBioseq::TId & ids,InfoBioseqPtr ibp,int * num,ParserPtr pp,CScope & scope,objects::CSeq_feat & cds)343 static void GetProtRefSeqId(objects::CBioseq::TId& ids, InfoBioseqPtr ibp, int* num,
344                             ParserPtr pp, CScope& scope, objects::CSeq_feat& cds)
345 {
346     const char   *r;
347 
348     char*      protacc;
349     char*      p;
350     char*      q;
351     ErrSev       sev;
352     Uint1        cho;
353     Uint1        ncho;
354     Char         str[100];
355 
356     if (pp->mode == Parser::EMode::Relaxed) {
357         protacc = CpTheQualValue(cds.SetQual(), "protein_id");
358         if (protacc == NULL || *protacc == '\0') {
359             if (protacc)
360                 MemFree(protacc);
361             int protein_id_counter=0;
362             string idLabel;
363             auto pProteinId =
364                 edit::GetNewProtId(scope.GetBioseqHandle(cds.GetLocation()), protein_id_counter, idLabel, false);
365             cds.SetProduct().SetWhole().Assign(*pProteinId);
366             ids.push_back(pProteinId);
367             return;
368         }
369         CSeq_id::ParseIDs(ids, protacc);
370         MemFree(protacc);
371         return;
372     }
373 
374     if(pp->source == Parser::ESource::USPTO)
375     {
376         protacc = CpTheQualValue(cds.SetQual(), "protein_id");
377         CRef<objects::CSeq_id> pat_seq_id(new objects::CSeq_id);
378         CRef<objects::CPatent_seq_id> pat_id = MakeUsptoPatSeqId(protacc);
379         pat_seq_id->SetPatent(*pat_id);
380         ids.push_back(pat_seq_id);
381         return;
382     }
383 
384     const objects::CTextseq_id* text_id = nullptr;
385     ITERATE(TSeqIdList, id, ibp->ids)
386     {
387         if (!(*id)->IsPatent()) {
388             text_id = (*id)->GetTextseq_Id();
389             break;
390         }
391     }
392 
393     if (text_id == nullptr)
394         return;
395 
396     if (pp->accver == false || (pp->source != Parser::ESource::EMBL &&
397         pp->source != Parser::ESource::NCBI && pp->source != Parser::ESource::DDBJ)) {
398         ++(*num);
399         sprintf(str, "%d", (int)*num);
400 
401         CRef<objects::CSeq_id> seq_id(new objects::CSeq_id);
402         std::string& obj_id_str = seq_id->SetLocal().SetStr();
403         obj_id_str = text_id->GetAccession();
404         obj_id_str += "_";
405         obj_id_str += str;
406         ids.push_back(seq_id);
407         return;
408     }
409 
410     protacc = CpTheQualValue(cds.SetQual(), "protein_id");
411     if (protacc == NULL || *protacc == '\0') {
412         p = location_to_string(cds.GetLocation());
413         ErrPostEx(SEV_FATAL, ERR_CDREGION_MissingProteinId,
414                   "/protein_id qualifier is missing for CDS feature: \"%s\".",
415                   (p == NULL) ? "" : p);
416         if (p != NULL)
417             MemFree(p);
418         if (protacc != NULL)
419             MemFree(protacc);
420 
421         return;
422     }
423 
424     if (pp->mode == Parser::EMode::HTGSCON) {
425         MemFree(protacc);
426         ++(*num);
427         sprintf(str, "%d", (int)*num);
428 
429         CRef<objects::CSeq_id> seq_id(new objects::CSeq_id);
430         std::string& obj_id_str = seq_id->SetLocal().SetStr();
431         obj_id_str = text_id->GetAccession();
432         obj_id_str += "_";
433         obj_id_str += str;
434         ids.push_back(seq_id);
435         return;
436     }
437 
438     p = StringChr(protacc, '.');
439     if (p == NULL || *(p + 1) == '\0') {
440         p = location_to_string(cds.GetLocation());
441         ErrPostEx(SEV_FATAL, ERR_CDREGION_MissingProteinVersion,
442                   "/protein_id qualifier has missing version for CDS feature: \"%s\".",
443                   (p == NULL) ? "" : p);
444         if (p != NULL)
445             MemFree(p);
446         MemFree(protacc);
447         return;
448     }
449 
450     for (q = p + 1; *q >= '0' && *q <= '9';)
451         q++;
452     if (*q != '\0') {
453         p = location_to_string(cds.GetLocation());
454         ErrPostEx(SEV_FATAL, ERR_CDREGION_IncorrectProteinVersion,
455                   "/protein_id qualifier \"%s\" has incorrect version for CDS feature: \"%s\".",
456                   protacc, (p == NULL) ? "" : p);
457         if (p != NULL)
458             MemFree(p);
459         MemFree(protacc);
460         return;
461     }
462     q = p + 1;
463     *p = '\0';
464     cho = GetProtAccOwner(protacc);
465     if (cho == 0) {
466         p = location_to_string(cds.GetLocation());
467         ErrPostEx(SEV_FATAL, ERR_CDREGION_IncorrectProteinAccession,
468                   "/protein_id qualifier has incorrect accession \"%s\" for CDS feature: \"%s\".",
469                   protacc, (p == NULL) ? "" : p);
470         if (p != NULL)
471             MemFree(p);
472         MemFree(protacc);
473         return;
474     }
475 
476     r = NULL;
477     ncho = cho;
478     if (pp->source == Parser::ESource::EMBL && cho != objects::CSeq_id::e_Embl && cho != objects::CSeq_id::e_Tpe)
479         r = "EMBL";
480     else if (pp->source == Parser::ESource::DDBJ && cho != objects::CSeq_id::e_Ddbj &&
481              cho != objects::CSeq_id::e_Tpd)
482              r = "DDBJ";
483     else if (pp->source == Parser::ESource::NCBI && cho != objects::CSeq_id::e_Genbank &&
484              cho != objects::CSeq_id::e_Tpg)
485              r = "NCBI";
486     else {
487         ncho = GetNucAccOwner(text_id->GetAccession().c_str(), pp->entrylist[pp->curindx]->is_tpa);
488         if (ncho == objects::CSeq_id::e_Tpe && cho == objects::CSeq_id::e_Embl)
489             cho = ncho;
490     }
491 
492     if (r != NULL || ncho != cho) {
493         p = location_to_string(cds.GetLocation());
494         if (pp->ign_prot_src == false)
495             sev = SEV_FATAL;
496         else
497             sev = SEV_WARNING;
498         if (r != NULL)
499             ErrPostEx(sev, ERR_CDREGION_IncorrectProteinAccession,
500             "/protein_id qualifier has incorrect accession prefix \"%s\" for source %s for CDS feature: \"%s\".",
501             protacc, r, (p == NULL) ? "" : p);
502         else
503             ErrPostEx(sev, ERR_CDREGION_IncorrectProteinAccession,
504             "Found mismatching TPA and non-TPA nucleotides's and protein's accessions in one nuc-prot set. Nuc = \"%s\", prot = \"%s\".",
505             text_id->GetAccession().c_str(), protacc);
506         if (p != NULL)
507             MemFree(p);
508         if (pp->ign_prot_src == false) {
509             MemFree(protacc);
510             return;
511         }
512     }
513 
514     CRef<objects::CSeq_id> seq_id(new objects::CSeq_id);
515 
516     CRef<objects::CTextseq_id> new_text_id(new objects::CTextseq_id);
517     new_text_id->SetAccession(protacc);
518     new_text_id->SetVersion(atoi(q));
519     SetTextId(cho, *seq_id, *new_text_id);
520 
521     ids.push_back(seq_id);
522 
523     if ((pp->source != Parser::ESource::DDBJ && pp->source != Parser::ESource::EMBL) ||
524         pp->entrylist[pp->curindx]->is_wgs == false ||
525         StringLen(ibp->acnum) == 8) {
526         return;
527     }
528 
529     seq_id.Reset(new objects::CSeq_id);
530     seq_id->SetGeneral().SetTag().SetStr(protacc);
531 
532     std::string& db = seq_id->SetGeneral().SetDb();
533     if (pp->entrylist[pp->curindx]->is_tsa != false)
534         db = "TSA:";
535     else if (pp->entrylist[pp->curindx]->is_tls != false)
536         db = "TLS:";
537     else
538         db = "WGS:";
539 
540     db.append(ibp->acnum, ibp->acnum + 4);
541     ids.push_back(seq_id);
542 }
543 
544 /**********************************************************/
stripStr(char * base,char * str)545 static char* stripStr(char* base, char* str)
546 {
547     char* bptr;
548     char* eptr;
549 
550     if (base == NULL || str == NULL)
551         return(NULL);
552     bptr = StringStr(base, str);
553     if (bptr != NULL) {
554         eptr = bptr + StringLen(str);
555         fta_StringCpy(bptr, eptr);
556     }
557 
558     return(base);
559 }
560 
561 /**********************************************************/
StripCDSComment(objects::CSeq_feat & feat)562 static void StripCDSComment(objects::CSeq_feat& feat)
563 {
564     static const char *strA[] = {
565         "Author-given protein sequence is in conflict with the conceptual translation.",
566         "direct peptide sequencing.",
567         "Method: conceptual translation with partial peptide sequencing.",
568         "Method: sequenced peptide, ordered by overlap.",
569         "Method: sequenced peptide, ordered by homology.",
570         "Method: conceptual translation supplied by author.",
571         NULL
572     };
573     const char        **b;
574     char*           pchComment;
575     char*           comment;
576 
577     if (!feat.IsSetComment())
578         return;
579 
580     comment = StringSave(feat.GetComment().c_str());
581     pchComment = tata_save(comment);
582     if (comment != NULL && comment[0] != 0)
583         feat.SetComment(comment);
584     else
585         feat.ResetComment();
586 
587     MemFree(comment);
588 
589     for (b = strA; *b != NULL; b++) {
590         pchComment = stripStr(pchComment, (char*)*b);
591     }
592     comment = tata_save(pchComment);
593     if (comment != NULL && *comment != '\0')
594         ShrinkSpaces(comment);
595     MemFree(pchComment);
596 
597     if (comment != NULL && comment[0] != 0)
598         feat.SetComment(comment);
599     else
600         feat.ResetComment();
601 
602     MemFree(comment);
603 }
604 
605 /**********************************************************
606 *
607 *   static SeqAnnotPtr GetProtRefAnnot(ibp, sfp, seqid,
608 *                                      length):
609 *
610 *      "product" qualifier ==> prp->name.
611 *      "note" or "gene" or "standard_name" or
612 *   "label" ==> prp->desc.
613 *      "EC_number" qualifier ==> a ValNodePtr in
614 *   ProtRefPtr, prp->ec.
615 *      "function" qualifier ==> a ValNodePtr in
616 *   ProtRefPtr, prp->activity.
617 *
618 **********************************************************/
GetProtRefAnnot(InfoBioseqPtr ibp,objects::CSeq_feat & feat,objects::CBioseq & bioseq)619 static void GetProtRefAnnot(InfoBioseqPtr ibp, objects::CSeq_feat& feat,
620                             objects::CBioseq& bioseq)
621 {
622     char*     qval;
623     char*     prid;
624     bool     partial5;
625     bool     partial3;
626 
627     std::set<string> names;
628 
629     for (;;) {
630         qval = GetTheQualValue(feat.SetQual(), "product");
631         if (qval == NULL)
632             break;
633 
634         std::string qval_str = qval;
635         MemFree(qval);
636         qval = NULL;
637 
638         if (qval_str[0] == '\'')
639             qval_str = qval_str.substr(1, qval_str.size() - 2);
640 
641         names.insert(qval_str);
642     }
643 
644     if (names.empty()) {
645         qval = CpTheQualValue(feat.GetQual(), "gene");
646         if (qval == NULL)
647             qval = CpTheQualValue(feat.GetQual(), "standard_name");
648         if (qval == NULL)
649             qval = CpTheQualValue(feat.GetQual(), "label");
650     }
651 
652     CRef<objects::CProt_ref> prot_ref(new objects::CProt_ref);
653 
654     if (names.empty() && qval == NULL) {
655         prid = CpTheQualValue(feat.GetQual(), "protein_id");
656         qval = location_to_string(feat.GetLocation());
657         if (prid != NULL) {
658             ErrPostEx(SEV_WARNING, ERR_PROTREF_NoNameForProtein,
659                       "No product, gene, or standard_name qualifier found for protein \"%s\" on CDS:%s",
660                       prid, (qval == NULL) ? "" : qval);
661             MemFree(prid);
662         }
663         else
664             ErrPostEx(SEV_WARNING, ERR_PROTREF_NoNameForProtein,
665             "No product, gene, or standard_name qualifier found on CDS:%s",
666             (qval == NULL) ? "" : qval);
667         MemFree(qval);
668         prot_ref->SetDesc("unnamed protein product");
669     }
670     else {
671         if (!names.empty()) {
672             prot_ref->SetName().clear();
673             std::copy(names.begin(), names.end(), std::back_inserter(prot_ref->SetName()));
674             names.clear();
675         }
676         else
677             prot_ref->SetDesc(qval);
678     }
679 
680     while ((qval = GetTheQualValue(feat.SetQual(), "EC_number")) != NULL) {
681         prot_ref->SetEc().push_back(qval);
682     }
683 
684     while ((qval = GetTheQualValue(feat.SetQual(), "function")) != NULL) {
685         prot_ref->SetActivity().push_back(qval);
686     }
687 
688     if (feat.GetQual().empty())
689         feat.ResetQual();
690 
691     CRef<objects::CSeq_feat> feat_prot(new objects::CSeq_feat);
692     feat_prot->SetData().SetProt(*prot_ref);
693     feat_prot->SetLocation(*fta_get_seqloc_int_whole(*bioseq.SetId().front(), bioseq.GetLength()));
694 
695     if (feat.IsSetPartial())
696         feat_prot->SetPartial(feat.GetPartial());
697 
698     partial5 = feat.GetLocation().IsPartialStart(objects::eExtreme_Biological);
699     partial3 = feat.GetLocation().IsPartialStop(objects::eExtreme_Biological);
700 
701     if (partial5 || partial3) {
702         objects::CSeq_interval& interval = feat_prot->SetLocation().SetInt();
703 
704         if (partial5) {
705             interval.SetFuzz_from().SetLim(objects::CInt_fuzz::eLim_lt);
706         }
707 
708         if (partial3) {
709             interval.SetFuzz_to().SetLim(objects::CInt_fuzz::eLim_gt);
710         }
711     }
712 
713     CRef<objects::CSeq_annot> annot(new objects::CSeq_annot);
714     annot->SetData().SetFtable().push_back(feat_prot);
715 
716     bioseq.SetAnnot().push_back(annot);
717 }
718 
719 /**********************************************************/
GetProtRefDescr(objects::CSeq_feat & feat,Uint1 method,const objects::CBioseq & bioseq,TSeqdescList & descrs)720 static void GetProtRefDescr(objects::CSeq_feat& feat, Uint1 method, const objects::CBioseq& bioseq, TSeqdescList& descrs)
721 {
722     char*      p;
723     char*      q;
724     bool      partial5;
725     bool      partial3;
726     Int4         diff_lowest;
727     Int4         diff_current;
728     Int4         cdslen;
729     Int4         orglen;
730     Uint1        strand;
731 
732     strand = feat.GetLocation().GetStrand();
733 
734     std::string organism;
735 
736     ITERATE(TSeqdescList, desc, bioseq.GetDescr().Get())
737     {
738         if (!(*desc)->IsSource())
739             continue;
740 
741         const objects::CBioSource& source = (*desc)->GetSource();
742         if (source.IsSetOrg() && source.GetOrg().IsSetTaxname()) {
743             organism = source.GetOrg().GetTaxname();
744             break;
745         }
746     }
747 
748     if (!fta_if_special_org(organism.c_str())) {
749         diff_lowest = -1;
750         cdslen = objects::sequence::GetLength(feat.GetLocation(), &GetScope());
751 
752         ITERATE(objects::CBioseq::TAnnot, annot, bioseq.GetAnnot())
753         {
754             if (!(*annot)->IsFtable())
755                 continue;
756 
757             bool found = false;
758             ITERATE(TSeqFeatList, cur_feat, (*annot)->GetData().GetFtable())
759             {
760                 if (!(*cur_feat)->IsSetData() || !(*cur_feat)->GetData().IsBiosrc())
761                     continue;
762 
763                 orglen = objects::sequence::GetLength((*cur_feat)->GetLocation(), &GetScope());
764 
765                 const objects::CBioSource& source = (*cur_feat)->GetData().GetBiosrc();
766                 if (!source.IsSetOrg() || !source.GetOrg().IsSetTaxname() ||
767                     strand != (*cur_feat)->GetLocation().GetStrand())
768                     continue;
769 
770                 objects::sequence::ECompare cmp_res = objects::sequence::Compare(feat.GetLocation(), (*cur_feat)->GetLocation(), nullptr, objects::sequence::fCompareOverlapping);
771                 if (cmp_res == objects::sequence::eNoOverlap)
772                     continue;
773 
774                 if (cmp_res == objects::sequence::eSame) {
775                     organism = source.GetOrg().GetTaxname();
776                     break;
777                 }
778 
779                 if (cmp_res == objects::sequence::eContained) {
780                     diff_current = orglen - cdslen;
781                     if (diff_lowest == -1 || diff_current < diff_lowest) {
782                         diff_lowest = diff_current;
783                         organism = source.GetOrg().GetTaxname();
784                     }
785                 }
786                 else if (cmp_res == objects::sequence::eOverlap && diff_lowest < 0)
787                     organism = source.GetOrg().GetTaxname();
788             }
789 
790             if (found)
791                 break;
792         }
793     }
794 
795     CRef<objects::CMolInfo> mol_info(new objects::CMolInfo);
796     mol_info->SetBiomol(objects::CMolInfo::eBiomol_peptide);             /* peptide */
797 
798     partial5 = feat.GetLocation().IsPartialStart(objects::eExtreme_Biological);
799     partial3 = feat.GetLocation().IsPartialStop(objects::eExtreme_Biological);
800 
801     if (partial5 && partial3)
802         mol_info->SetCompleteness(objects::CMolInfo::eCompleteness_no_ends);
803     else if (partial5)
804         mol_info->SetCompleteness(objects::CMolInfo::eCompleteness_no_left);
805     else if (partial3)
806         mol_info->SetCompleteness(objects::CMolInfo::eCompleteness_no_right);
807     else if (feat.IsSetPartial() && feat.GetPartial())
808         mol_info->SetCompleteness(objects::CMolInfo::eCompleteness_partial);
809 
810     if (method == objects::eGIBB_method_concept_trans_a)
811         mol_info->SetTech(objects::CMolInfo::eTech_concept_trans_a);
812     else if (method == objects::eGIBB_method_concept_trans)
813         mol_info->SetTech(objects::CMolInfo::eTech_concept_trans);
814 
815     CRef<objects::CSeqdesc> descr(new objects::CSeqdesc);
816     descr->SetMolinfo(*mol_info);
817     descrs.push_back(descr);
818 
819     p = NULL;
820     ITERATE(TQualVector, qual, feat.GetQual())
821     {
822         if ((*qual)->GetQual() != "product")
823             continue;
824 
825         const std::string& val_str = (*qual)->GetVal();
826         if (p == NULL) {
827             p = StringSave((*qual)->GetVal().c_str());
828             continue;
829         }
830 
831         q = (char*)MemNew(StringLen(p) + val_str.size() + 3);
832         StringCpy(q, p);
833         StringCat(q, "; ");
834         StringCat(q, val_str.c_str());
835         MemFree(p);
836         p = q;
837     }
838 
839     if (p == NULL)
840         p = CpTheQualValue(feat.GetQual(), "gene");
841 
842     if (p == NULL)
843         p = CpTheQualValue(feat.GetQual(), "label");
844 
845     if (p == NULL)
846         p = CpTheQualValue(feat.GetQual(), "standard_name");
847 
848     if (p == NULL)
849         p = StringSave("unnamed protein product");
850     else {
851         for (q = p; *q != '\0';)
852             q++;
853         if (q > p) {
854             for (q--; *q == ' ' || *q == ','; q--)
855                 if (q == p)
856                     break;
857             if (*q != ' ' && *q != ',')
858                 q++;
859             *q = '\0';
860         }
861     }
862 
863     if (StringLen(p) < 511 && !organism.empty() && StringStr(p, organism.c_str()) == NULL) {
864         q = (char*)MemNew(StringLen(p) + organism.size() + 4);
865         StringCpy(q, p);
866         StringCat(q, " [");
867         StringCat(q, organism.c_str());
868         StringCat(q, "]");
869         MemFree(p);
870         p = q;
871     }
872 
873     if (StringLen(p) > 511) {
874         p[510] = '>';
875         p[511] = '\0';
876         q = StringSave(p);
877         MemFree(p);
878     }
879     else
880         q = p;
881 
882     descr.Reset(new objects::CSeqdesc);
883     descr->SetTitle(q);
884     descrs.push_back(descr);
885 }
886 
887 /**********************************************************
888 *
889 *   Function:
890 *      static SeqIdPtr QualsToSeqID(pSeqFeat, source)
891 *
892 *   Purpose:
893 *      Searches all /db_xref qualifiers make from them
894 *      corresponding SeqId and removing found qualifiers
895 *      from given SeqFeature.
896 *
897 *      Tatiana: /db_xref qualifiers were already processed
898 *               sfp->dbxref in loadfeat.c
899 *
900 *   Parameters:
901 *      pSeqFeat - pointer to SeqFeat structure which have
902 *                 to be processed.
903 *
904 *   Return:
905 *      Pointer to resultant SeqId chain if successful,
906 *      otherwise NULL.
907 *
908 *   Note:
909 *      Returned SeqId must be freed by caller.
910 *
911 **********************************************************/
QualsToSeqID(objects::CSeq_feat & feat,Parser::ESource source,TSeqIdList & ids)912 static void QualsToSeqID(objects::CSeq_feat& feat, Parser::ESource source, TSeqIdList& ids)
913 {
914     char*   p;
915 
916     if (!feat.IsSetQual())
917         return;
918 
919     for (TQualVector::iterator qual = feat.SetQual().begin(); qual != feat.SetQual().end();) {
920         if ((*qual)->GetQual() != "db_xref") {
921             ++qual;
922             continue;
923         }
924 
925         CRef<objects::CSeq_id> seq_id;
926         p = StringIStr((*qual)->GetVal().c_str(), "pid:");
927         if (p != NULL)
928             seq_id = StrToSeqId(p + 4, true);
929 
930         if (seq_id.Empty()) {
931             ErrPostEx(SEV_ERROR, ERR_CDREGION_InvalidDb_xref,
932                       "Invalid data format /db_xref = \"%s\", ignore it",
933                       (*qual)->GetVal().c_str());
934         }
935         else {
936             if (p[4] == 'g' && (source == Parser::ESource::DDBJ || source == Parser::ESource::EMBL))
937                 seq_id.Reset();
938             else
939                 ids.push_back(seq_id);
940         }
941 
942         qual = feat.SetQual().erase(qual);
943     }
944 
945     if (feat.GetQual().empty())
946         feat.ResetQual();
947 }
948 
949 /**********************************************************
950 *
951 *   Function:
952 *      static SeqIdPtr ValidateQualSeqId(pSeqID)
953 *
954 *   Purpose:
955 *      Validates consistency of SeqId list obtained from
956 *      /db_xref in following maner. The number of SeqId
957 *      must be not more then 3. Each SeqId must refer to
958 *      a different GenBank. If two or more SeqId's refer
959 *      to the same GenBank only first is taken in account
960 *      and other ones are abandoned.
961 *      During validating the function drop corresponding
962 *      error messages if something wrong occured.
963 *
964 *   Parameters:
965 *      pSeqFeat - pointer to SeqFeat structure which have
966 *                 to be processed;
967 *
968 *   Return:
969 *      Pointer to resultant SeqId chain. If pointer is
970 *      NULL it means that there is no good SeqId.
971 *
972 **********************************************************/
ValidateQualSeqId(TSeqIdList & ids)973 static void ValidateQualSeqId(TSeqIdList& ids)
974 {
975     bool     abGenBanks[3] = { false, false, false };
976     Int2     num;
977     Char     ch;
978 
979     if (ids.empty())
980         return;
981 
982     ch = '\0';
983     for (TSeqIdList::iterator id = ids.begin(); id != ids.end();) {
984         num = -1;
985         const Char* dbtag_str = (*id)->IsGeneral() ? (*id)->GetGeneral().GetTag().GetStr().c_str() : "";
986         if ((*id)->IsGi()) {
987             num = 1;
988             ch = 'g';
989         }
990         else if (*dbtag_str == 'e') {
991             num = 0;
992             ch = 'e';
993         }
994         else if (*dbtag_str == 'd') {
995             num = 2;
996             ch = 'd';
997         }
998         if (num == -1)
999             continue;
1000 
1001         if (abGenBanks[num]) {
1002             /* PID of this type already exist, ignore it
1003             */
1004             ErrPostEx(SEV_WARNING, ERR_CDREGION_Multiple_PID,
1005                       "/db_xref=\"pid:%c%i\" refer the same data base",
1006                       ch, (*id)->GetGeneral().GetTag().GetId());
1007 
1008             id = ids.erase(id);
1009         }
1010         else {
1011             abGenBanks[num] = true;
1012             ++id;
1013         }
1014     }
1015 }
1016 
1017 /**********************************************************/
DbxrefToSeqID(objects::CSeq_feat & feat,Parser::ESource source,TSeqIdList & ids)1018 static void DbxrefToSeqID(objects::CSeq_feat& feat, Parser::ESource source, TSeqIdList& ids)
1019 {
1020     if (!feat.IsSetDbxref())
1021         return;
1022 
1023     for (objects::CSeq_feat::TDbxref::iterator xref = feat.SetDbxref().begin(); xref != feat.SetDbxref().end();) {
1024         if (!(*xref)->IsSetTag() || !(*xref)->IsSetDb()) {
1025             ++xref;
1026             continue;
1027         }
1028 
1029         CRef<objects::CSeq_id> id;
1030 
1031         if ((*xref)->GetDb() == "PID") {
1032             const Char* tag_str = (*xref)->GetTag().GetStr().c_str();
1033             switch (tag_str[0]) {
1034             case 'g':
1035                 if (source != Parser::ESource::DDBJ && source != Parser::ESource::EMBL) {
1036                     id.Reset(new objects::CSeq_id);
1037                     id->SetGi(GI_FROM(long long, strtoll(tag_str + 1, NULL, 10)));
1038                 }
1039                 break;
1040 
1041             case 'd':
1042             case 'e':
1043                 id.Reset(new objects::CSeq_id);
1044                 id->SetGeneral(*(*xref));
1045                 break;
1046 
1047             default:
1048                 break;
1049             }
1050 
1051             xref = feat.SetDbxref().erase(xref);
1052         }
1053         else
1054             ++xref;
1055 
1056         if (id.NotEmpty())
1057             ids.push_back(id);
1058     }
1059 
1060     if (feat.GetDbxref().empty())
1061         feat.ResetDbxref();
1062 }
1063 
1064 /**********************************************************
1065 *
1066 *   Function:
1067 *      static void ProcessForDbxref(pBioseq, pSeqFeat,
1068 *                                   source)
1069 *
1070 *   Purpose:
1071 *      Finds all qualifiers corresponding to /db_xref,
1072 *      makes from them SeqId and remove them from further
1073 *      processing. Also the function looks for PID which
1074 *      can be found in /note (pSeqFeat->comment) and
1075 *      removes PID record from /note.
1076 *
1077 *   Parameters:
1078 *      pBioseq  - pointer or a Bioseq structure which will
1079 *                 hold resultant list of SeqId;
1080 *      pSeqFeat - pointer to SeqFeat structure which have
1081 *                 to processed.
1082 *      source   - source of sequence.
1083 *
1084 *   Return:
1085 *      void
1086 *
1087 **********************************************************/
ProcessForDbxref(objects::CBioseq & bioseq,objects::CSeq_feat & feat,Parser::ESource source)1088 static void ProcessForDbxref(objects::CBioseq& bioseq, objects::CSeq_feat& feat,
1089                              Parser::ESource source)
1090 {
1091     TSeqIdList ids;
1092     QualsToSeqID(feat, source, ids);
1093     if (!ids.empty()) {
1094         ValidateQualSeqId(ids);
1095         if (!ids.empty()) {
1096             bioseq.SetId().splice(bioseq.SetId().end(), ids);
1097             return;
1098         }
1099     }
1100 
1101     DbxrefToSeqID(feat, source, ids);
1102     if (feat.IsSetComment() && feat.GetComment().empty())
1103         feat.ResetComment();
1104 
1105     if (!ids.empty())
1106         bioseq.SetId().splice(bioseq.SetId().end(), ids);
1107 }
1108 
1109 /**********************************************************/
BldProtRefSeqEntry(ProtBlkPtr pbp,objects::CSeq_feat & feat,std::string & seq_data,Uint1 method,ParserPtr pp,const objects::CBioseq & bioseq,objects::CBioseq::TId & ids)1110 static CRef<objects::CBioseq> BldProtRefSeqEntry(ProtBlkPtr pbp, objects::CSeq_feat& feat,
1111                                                              std::string& seq_data, Uint1 method,
1112                                                              ParserPtr pp, const objects::CBioseq& bioseq,
1113                                                              objects::CBioseq::TId& ids)
1114 {
1115     CRef<objects::CBioseq> new_bioseq;
1116 
1117     if (ids.empty())
1118         return new_bioseq;
1119 
1120     new_bioseq.Reset(new objects::CBioseq);
1121 
1122     new_bioseq->SetId().swap(ids);
1123 
1124     ProcessForDbxref(*new_bioseq, feat, pp->source);
1125 
1126     new_bioseq->SetInst().SetLength(static_cast<TSeqPos>(seq_data.size()));
1127 
1128     GetProtRefDescr(feat, method, bioseq, new_bioseq->SetDescr());
1129     GetProtRefAnnot(pbp->ibp, feat, *new_bioseq);
1130 
1131     new_bioseq->SetInst().SetRepr(objects::CSeq_inst::eRepr_raw);
1132     new_bioseq->SetInst().SetMol(objects::CSeq_inst::eMol_aa);
1133 
1134     /* Seq_code always ncbieaa  08.08.96
1135     */
1136     CRef<objects::CSeq_data> data(new objects::CSeq_data(seq_data, objects::CSeq_data::e_Ncbieaa));
1137     new_bioseq->SetInst().SetSeq_data(*data);
1138 
1139     return new_bioseq;
1140 }
1141 
1142 /**********************************************************/
AddProtRefSeqEntry(ProtBlkPtr pbp,objects::CBioseq & bioseq)1143 static void AddProtRefSeqEntry(ProtBlkPtr pbp, objects::CBioseq& bioseq)
1144 {
1145     CRef<objects::CSeq_entry> entry(new objects::CSeq_entry);
1146     entry->SetSeq(bioseq);
1147     pbp->entries.push_back(entry);
1148 }
1149 
1150 /**********************************************************/
SimpleValuePos(char * qval)1151 static char* SimpleValuePos(char* qval)
1152 {
1153     char* bptr;
1154     char* eptr;
1155 
1156     bptr = StringStr(qval, "(pos:");
1157     if (bptr == NULL)
1158         return(NULL);
1159 
1160     bptr += 5;
1161     while (*bptr == ' ')
1162         bptr++;
1163     for (eptr = bptr; *eptr != ',' && *eptr != '\0';)
1164         eptr++;
1165 
1166     return StringSave(std::string(bptr, eptr).c_str());
1167 }
1168 
1169 /**********************************************************
1170 *
1171 *   static CodeBreakPtr GetCdRegionCB(ibp, sfp, accver):
1172 *
1173 *      If protein translation (InternalStopCodon) O.K.,
1174 *   then this qualifier will be deleted ==> different from
1175 *   Karl's parser.
1176 *      For transl_except of type OTHER, use the ncibeaa
1177 *   code 'X' for Code-break.aa.
1178 *      CodeBreakPtr->aa.choice = 1  ==>  for ncbieaa;
1179 *      CodeBreakPtr->aa.value.intvalue = (Int4)'X'  ==>
1180 *   for unknown.
1181 *
1182 **********************************************************/
GetCdRegionCB(InfoBioseqPtr ibp,objects::CSeq_feat & feat,TCodeBreakList & code_breaks,unsigned char * dif,bool accver)1183 static void GetCdRegionCB(InfoBioseqPtr ibp, objects::CSeq_feat& feat,
1184                           TCodeBreakList& code_breaks, unsigned char* dif, bool accver)
1185 {
1186     Int4 feat_start = -1;
1187     Int4 feat_stop = -1;
1188 
1189     if (feat.IsSetLocation()) {
1190         feat_start = feat.GetLocation().GetStart(objects::eExtreme_Positional);
1191         feat_stop = feat.GetLocation().GetStop(objects::eExtreme_Positional);
1192     }
1193 
1194     Uint1 res = 2;
1195 
1196     if (feat.IsSetQual()) {
1197         TQualVector::iterator cur_qual = feat.SetQual().begin(),
1198             end_qual = feat.SetQual().end();
1199 
1200         char* qval = NULL;
1201         while ((qval = CpTheQualValueNext(cur_qual, end_qual, "transl_except")) != NULL) {
1202             CRef<objects::CCode_break> code_break(new objects::CCode_break);
1203 
1204             int ncbieaa_val = GetQualValueAa(qval, false);
1205 
1206             code_break->SetAa().SetNcbieaa(ncbieaa_val);
1207 
1208             char* pos = SimpleValuePos(qval);
1209 
1210             int num_errs = 0;
1211             bool locmap = false,
1212                 sitesmap = false;
1213 
1214             CRef<objects::CSeq_loc> location = xgbparseint_ver(pos, locmap, sitesmap, num_errs, ibp->ids, accver);
1215             if (location.NotEmpty())
1216                 code_break->SetLoc(*location);
1217 
1218             Int4 start = code_break->IsSetLoc() ? code_break->GetLoc().GetStart(objects::eExtreme_Positional) : -1;
1219             Int4 stop = code_break->IsSetLoc() ? code_break->GetLoc().GetStop(objects::eExtreme_Positional) : -1;
1220 
1221             Uint1 i = (start > stop) ? 3 : (stop - start);
1222 
1223             if (i != 2)
1224                 res = i;
1225 
1226             bool itis = false;
1227             if (ncbieaa_val == 42 &&
1228                 (start == feat_start ||
1229                 stop == feat_stop))
1230                 itis = true;
1231 
1232             bool pos_range = false;
1233             if (i == 2)
1234                 pos_range = false;
1235             else if (i > 2)
1236                 pos_range = true;
1237             else
1238                 pos_range = !itis;
1239 
1240             if (num_errs > 0 || pos_range) {
1241                 ErrPostEx(SEV_WARNING, ERR_FEATURE_LocationParsing,
1242                           "transl_except range is wrong, %s, drop the transl_except",
1243                           pos);
1244                 MemFree(pos);
1245                 MemFree(qval);
1246                 break;
1247             }
1248 
1249             if (code_break->IsSetLoc()) {
1250                 if (feat.GetLocation().IsSetStrand())
1251                     code_break->SetLoc().SetStrand(feat.GetLocation().GetStrand());
1252 
1253                 objects::sequence::ECompare cmp_res = objects::sequence::Compare(feat.GetLocation(), code_break->GetLoc(), nullptr, objects::sequence::fCompareOverlapping);
1254                 if (cmp_res != objects::sequence::eContains) {
1255                     ErrPostEx(SEV_WARNING, ERR_FEATURE_LocationParsing,
1256                               "/transl_except not in CDS: %s", qval);
1257                 }
1258             }
1259 
1260             MemFree(pos);
1261             MemFree(qval);
1262 
1263             if (code_break.NotEmpty())
1264                 code_breaks.push_back(code_break);
1265         }
1266     }
1267 
1268     *dif = 2 - res;
1269 }
1270 
1271 /**********************************************************/
CkEndStop(const objects::CSeq_feat & feat,Uint1 dif)1272 static void CkEndStop(const objects::CSeq_feat& feat, Uint1 dif)
1273 {
1274     Int4        len;
1275     Int4        r;
1276     Int4        frm;
1277     char*     p;
1278 
1279     len = objects::sequence::GetLength(feat.GetLocation(), &GetScope());
1280     const objects::CCdregion& cdregion = feat.GetData().GetCdregion();
1281 
1282     if (!cdregion.IsSetFrame() || cdregion.GetFrame() == 0)
1283         frm = 0;
1284     else
1285         frm = cdregion.GetFrame() - 1;
1286 
1287     r = (len - frm + (Int4)dif) % 3;
1288     if (r != 0 && (!feat.IsSetExcept() || feat.GetExcept() == false)) {
1289         p = location_to_string(feat.GetLocation());
1290         ErrPostEx(SEV_WARNING, ERR_CDREGION_UnevenLocation,
1291                   "CDS: %s. Length is not divisable by 3, the remain is: %d",
1292                   (p == NULL) ? "" : p, r);
1293         MemFree(p);
1294     }
1295 }
1296 
1297 /**********************************************************/
check_end_internal(size_t protlen,const objects::CSeq_feat & feat,Uint1 dif)1298 static void check_end_internal(size_t protlen, const objects::CSeq_feat& feat, Uint1 dif)
1299 {
1300     Int4        frm;
1301 
1302     const objects::CCdregion& cdregion = feat.GetData().GetCdregion();
1303 
1304     if (!cdregion.IsSetFrame() || cdregion.GetFrame() == 0)
1305         frm = 0;
1306     else
1307         frm = cdregion.GetFrame() - 1;
1308 
1309     size_t len = objects::sequence::GetLength(feat.GetLocation(), &GetScope()) - frm + dif;
1310 
1311     if (protlen * 3 != len && (!feat.IsSetExcept() || feat.GetExcept() == false)) {
1312         char* p = location_to_string(feat.GetLocation());
1313         ErrPostEx(SEV_WARNING, ERR_CDREGION_LocationLength,
1314                   "Length of the CDS: %s (%ld) disagree with the length calculated from the protein: %ld",
1315                   (p == NULL) ? "" : p, len, protlen * 3);
1316         MemFree(p);
1317     }
1318 }
1319 
1320 /**********************************************************
1321 *
1322 *   static void ErrByteStorePtr(ibp, sfp, bsp):
1323 *
1324 *      For debugging, put to error logfile, it needs
1325 *   to delete for "buildcds.c program.
1326 *
1327 **********************************************************/
ErrByteStorePtr(InfoBioseqPtr ibp,const objects::CSeq_feat & feat,const std::string & prot)1328 static void ErrByteStorePtr(InfoBioseqPtr ibp, const objects::CSeq_feat& feat,
1329                             const std::string& prot)
1330 {
1331     char* str;
1332     char* qval;
1333 
1334     qval = CpTheQualValue(feat.GetQual(), "translation");
1335     if (qval == NULL)
1336         qval = StringSave("no translation qualifier");
1337 
1338     if (!feat.IsSetExcept() || feat.GetExcept() == false) {
1339         str = location_to_string(feat.GetLocation());
1340         ErrPostEx(SEV_WARNING, ERR_CDREGION_TranslationDiff,
1341                   "Location: %s, translation: %s",
1342                   (str == NULL) ? "" : str, qval);
1343         MemFree(str);
1344     }
1345 
1346     MemFree(qval);
1347 
1348     ErrLogPrintStr(prot.c_str());
1349     ErrLogPrintStr((char *) "\n");
1350 }
1351 
1352 /**********************************************************
1353 *
1354 *   static ByteStorePtr CkProteinTransl(pp, ibp, bsp, sfp,
1355 *                                       qval, intercodon,
1356 *                                       gcode, method):
1357 *
1358 *      If bsp != translation's value, then take
1359 *   translation's value and print out warning message.
1360 *      If the only diff is start codon and bsp has "M"
1361 *   take bsp.
1362 *      If intercodon = TRUE, then no comparison.
1363 *
1364 **********************************************************/
CkProteinTransl(ParserPtr pp,InfoBioseqPtr ibp,std::string & prot,objects::CSeq_feat & feat,char * qval,bool intercodon,char * gcode,unsigned char * method)1365 static void CkProteinTransl(ParserPtr pp, InfoBioseqPtr ibp,
1366                             std::string& prot, objects::CSeq_feat& feat,
1367                             char* qval, bool intercodon,
1368                             char* gcode, unsigned char* method)
1369 {
1370     char*      ptr;
1371     Char         msg2[1100];
1372     Char         aastr[100];
1373     char*      msg;
1374     Int2         residue;
1375     Int4         num = 0;
1376     size_t       aa;
1377     bool         msgout = false;
1378     bool         first = false;
1379     Int4         difflen;
1380 
1381     objects::CCdregion& cdregion = feat.SetData().SetCdregion();
1382     size_t len = StringLen(qval);
1383     msg2[0] = '\0';
1384 
1385     msg = location_to_string(feat.GetLocation());
1386     if (msg == NULL)
1387         msg = StringSave("");
1388 
1389     size_t blen = prot.size();
1390 
1391     if (len != blen && (!feat.IsSetExcept() || feat.GetExcept() == false)) {
1392         ErrPostEx(SEV_ERROR, ERR_CDREGION_ProteinLenDiff,
1393                   "Lengths of conceptual translation and translation qualifier differ : %d : %d : %s",
1394                   blen, len, msg);
1395     }
1396 
1397     difflen = 0;
1398     if (!intercodon) {
1399         if (len != blen)
1400             difflen = 1;
1401         if (len > blen)
1402             difflen = 2;
1403 
1404         size_t residue_idx = 0;
1405         for (ptr = qval, num = 0, aa = 1; residue_idx < blen; ++aa, ++residue_idx) {
1406             residue = prot[residue_idx];
1407 
1408             if (aa > len) {
1409                 if (residue == '*')
1410                     continue;
1411                 difflen = 2;
1412                 break;
1413             }
1414 
1415             if (residue == *ptr) {
1416                 ptr++;
1417                 continue;
1418             }
1419 
1420             if (aa == 1 && residue == 'M')
1421                 first = true;
1422 
1423             msgout = true;
1424             num++;
1425             if (num == 1)
1426                 StringCpy(msg2, "at AA # ");
1427 
1428             if (num < 11 && StringLen(msg2) < 1000) {
1429                 sprintf(aastr, "%d(%c,%c), ",
1430                         (int)aa, (char)residue, (char)*ptr);
1431                 StringCat(msg2, aastr);
1432             }
1433             else if (num == 11 && StringLen(msg2) < 1000)
1434                 StringCat(msg2, ", additional details suppressed, ");
1435             ptr++;
1436         }
1437 
1438         if (num > 0) {
1439             cdregion.SetConflict(true);
1440             sprintf(aastr, "using genetic code %s, total %d difference%s",
1441                     gcode, (int)num, (num > 1) ? "s" : "");
1442             StringCat(msg2, aastr);
1443             if (!feat.IsSetExcept() || feat.GetExcept() == false) {
1444                 ErrPostEx(SEV_WARNING, ERR_CDREGION_TranslationDiff,
1445                           "%s:%s", msg2, msg);
1446             }
1447         }
1448 
1449         if (!msgout) {
1450             cdregion.ResetConflict();
1451             ErrPostEx(SEV_INFO, ERR_CDREGION_TranslationsAgree,
1452                       "Parser-generated conceptual translation agrees with input translation %s",
1453                       msg);
1454         }
1455 
1456         if (difflen == 2) {
1457             cdregion.SetConflict(true);
1458             msgout = true;
1459         }
1460     } /* intercodon */
1461     else {
1462         msgout = true;
1463         if (!feat.IsSetExcept() || feat.GetExcept() == false) {
1464             ErrPostEx(SEV_WARNING, ERR_CDREGION_NoTranslationCompare,
1465                       "Conceptual translation has internal stop codons, no comparison: %s",
1466                       msg);
1467         }
1468     }
1469 
1470     if (msgout) {
1471         if (pp->debug) {
1472             ErrByteStorePtr(ibp, feat, prot);
1473         }
1474     }
1475 
1476     if (pp->accver == false) {
1477         if ((num == 1 && first) ||
1478             (pp->transl && !prot.empty() && cdregion.IsSetConflict() && cdregion.GetConflict())) {
1479             cdregion.ResetConflict();
1480             ErrPostEx(SEV_WARNING, ERR_CDREGION_TranslationOverride,
1481                       "Input translation is replaced with conceptual translation: %s",
1482                       msg);
1483             *method = objects::eGIBB_method_concept_trans;
1484             MemFree(msg);
1485         }
1486     }
1487 
1488     if ((cdregion.IsSetConflict() && cdregion.GetConflict() == true) || difflen != 0)
1489         *method = objects::eGIBB_method_concept_trans_a;
1490 
1491     if (msgout && pp->transl == false && (!feat.IsSetExcept() || feat.GetExcept() == false)) {
1492         ErrPostEx(SEV_WARNING, ERR_CDREGION_SuppliedProteinUsed,
1493                   "In spite of previously reported problems, the supplied protein translation will be used : %s",
1494                   msg);
1495     }
1496 
1497     prot = qval;
1498     MemFree(msg);
1499 }
1500 
1501 /**********************************************************
1502 *
1503 *   static bool check_translation(bsp, qval):
1504 *
1505 *      If bsp != translation's value return FALSE.
1506 *
1507 **********************************************************/
check_translation(std::string & prot,char * qval)1508 static bool check_translation(std::string& prot, char* qval)
1509 {
1510     size_t len = 0;
1511     size_t blen = prot.size();
1512 
1513     for (len = StringLen(qval); len != 0; len--) {
1514         if (qval[len - 1] != 'X')          /* remove terminal X */
1515             break;
1516     }
1517 
1518     size_t cur_pos = blen;
1519     for (; cur_pos != 0; --cur_pos) {
1520         if (prot[cur_pos - 1] != 'X')
1521             break;
1522     }
1523 
1524     if (cur_pos != len)
1525         return false;
1526 
1527     cur_pos = 0;
1528     for (; *qval != '\0' && cur_pos < blen; qval++, ++cur_pos) {
1529         if (prot[cur_pos] != *qval)
1530             return false;
1531     }
1532 
1533     return true;
1534 }
1535 
1536 // Workaround for translation function.
1537 // This is needed because feat->location usually does not have a 'version' in its ID,
1538 // but feat->cdregion->code_break->location has.
1539 // In this case both locations is treated as they are from different sequences, but they definitely
1540 // belong to the same one.
1541 // Therefore, this function changes all IDs of the codebreaks to feat->location->id before translation,
1542 // and return all this stuff back after translation.
1543 // TODO: it probably should be organized in another way
Translate(objects::CSeq_feat & feat,std::string & prot)1544 static bool Translate(objects::CSeq_feat& feat, std::string& prot)
1545 {
1546     std::list<CRef<objects::CSeq_id>> orig_ids;
1547     const objects::CSeq_id* feat_loc_id = nullptr;
1548     if (feat.IsSetLocation())
1549         feat_loc_id = feat.GetLocation().GetId();
1550 
1551     bool change = feat_loc_id && feat.GetData().GetCdregion().IsSetCode_break();
1552     if (change) {
1553 
1554         NON_CONST_ITERATE(objects::CCdregion::TCode_break, code_break, feat.SetData().SetCdregion().SetCode_break())
1555         {
1556             orig_ids.push_back(CRef<objects::CSeq_id>(new objects::CSeq_id));
1557             orig_ids.back()->Assign(*(*code_break)->GetLoc().GetId());
1558             (*code_break)->SetLoc().SetId(*feat_loc_id);
1559         }
1560     }
1561 
1562     bool ret = true;
1563     try {
1564         objects::CSeqTranslator::Translate(feat, GetScope(), prot);
1565     }
1566     catch (objects::CSeqMapException& e) {
1567         ErrPostEx(SEV_REJECT, 0, 0, "%s", e.GetMsg().c_str());
1568         ret = false;
1569     }
1570 
1571     if (change) {
1572 
1573         std::list<CRef<objects::CSeq_id>>::iterator it = orig_ids.begin();
1574         NON_CONST_ITERATE(objects::CCdregion::TCode_break, code_break, feat.SetData().SetCdregion().SetCode_break())
1575         {
1576             (*code_break)->SetLoc().SetId(**it);
1577             ++it;
1578         }
1579     }
1580 
1581     return ret;
1582 }
1583 
1584 /**********************************************************
1585 *
1586 *   static Int2 EndAdded(sfp, gene):
1587 *
1588 *      From EndStopCodonBaseAdded:
1589 *      Return 0 if no bases added, no end stop codon found
1590 *   after added one or two bases, or can not add
1591 *   (substract) any bases at end because it is really
1592 *   a partial.
1593 *      Return -1 if something bad.
1594 *      Return +1 if it found end stop codon after bases
1595 *   added.
1596 *
1597 **********************************************************/
EndAdded(objects::CSeq_feat & feat,GeneRefFeats & gene_refs)1598 static Int2 EndAdded(objects::CSeq_feat& feat, GeneRefFeats& gene_refs)
1599 {
1600     Int4         pos;
1601     Int4         pos1;
1602     Int4         pos2;
1603     Int4         len;
1604     Int4         len2;
1605 
1606     Uint4        remainder;
1607 
1608     Uint4        oldfrom;
1609     Uint4        oldto;
1610 
1611     size_t       i;
1612 
1613     char*      transl;
1614     char*      name;
1615 
1616     objects::CCdregion& cdregion = feat.SetData().SetCdregion();
1617     len = objects::sequence::GetLength(feat.GetLocation(), &GetScope());
1618     len2 = len;
1619 
1620     int frame = cdregion.IsSetFrame() ? cdregion.GetFrame() : 0;
1621     if (frame > 1 && frame < 4)
1622         len -= frame - 1;
1623 
1624     remainder = 3 - len % 3;
1625     if (remainder == 0)
1626         return(0);
1627 
1628     if (cdregion.IsSetCode_break()) {
1629         bool ret_condition = false;
1630         ITERATE(objects::CCdregion::TCode_break, code_break, cdregion.GetCode_break())
1631         {
1632             pos1 = INT4_MAX;
1633             pos2 = -10;
1634 
1635             for (objects::CSeq_loc_CI loc((*code_break)->GetLoc()); loc; ++loc) {
1636                 pos = objects::sequence::LocationOffset(*loc.GetRangeAsSeq_loc(), feat.GetLocation(), objects::sequence::eOffset_FromStart);
1637                 if (pos < pos1)
1638                     pos1 = pos;
1639                 pos = objects::sequence::LocationOffset(*loc.GetRangeAsSeq_loc(), feat.GetLocation(), objects::sequence::eOffset_FromEnd);
1640                 if (pos > pos2)
1641                     pos2 = pos;
1642             }
1643             pos = pos2 - pos1;
1644             if (pos == 2 || (pos >= 0 && pos <= 1 && pos2 == len2 - 1)) {
1645                 ret_condition = true;
1646                 break;
1647             }
1648         }
1649 
1650         if (ret_condition)
1651             return(0);
1652     }
1653 
1654     objects::CSeq_interval* last_interval = nullptr;
1655     for (CTypeIterator<objects::CSeq_interval> loc(Begin(feat.SetLocation())); loc; ++loc) {
1656         last_interval = &(*loc);
1657     }
1658 
1659     if (last_interval == nullptr)
1660         return(0);
1661 
1662     const objects::CBioseq_Handle& bioseq_h = GetScope().GetBioseqHandle(last_interval->GetId());
1663     if (bioseq_h.GetState() != objects::CBioseq_Handle::fState_none)
1664         return(0);
1665 
1666     oldfrom = last_interval->GetFrom();
1667     oldto = last_interval->GetTo();
1668 
1669     if (last_interval->IsSetStrand() && last_interval->GetStrand() == objects::eNa_strand_minus) {
1670         if (last_interval->GetFrom() < remainder || last_interval->IsSetFuzz_from())
1671             return(0);
1672         last_interval->SetFrom(oldfrom - remainder);
1673     }
1674     else {
1675         if (last_interval->GetTo() >= bioseq_h.GetBioseqLength() - remainder || last_interval->IsSetFuzz_to())
1676             return(0);
1677         last_interval->SetTo(oldto + remainder);
1678     }
1679 
1680     std::string newprot;
1681     Translate(feat, newprot);
1682 
1683     if (newprot.empty()) {
1684         last_interval->SetFrom(oldfrom);
1685         last_interval->SetTo(oldto);
1686         return(0);
1687     }
1688 
1689     size_t protlen = newprot.size();
1690     if (protlen != (size_t) (len + remainder) / 3) {
1691         last_interval->SetFrom(oldfrom);
1692         last_interval->SetTo(oldto);
1693         return(0);
1694     }
1695 
1696     if (newprot[protlen - 1] != '*') {
1697         last_interval->SetFrom(oldfrom);
1698         last_interval->SetTo(oldto);
1699         return(0);
1700     }
1701 
1702     transl = CpTheQualValue(feat.GetQual(), "translation");
1703     if (transl == NULL) {
1704         last_interval->SetFrom(oldfrom);
1705         last_interval->SetTo(oldto);
1706         return(0);
1707     }
1708 
1709     protlen--;
1710     newprot = newprot.substr(0, newprot.size() - 1);
1711 
1712     for (i = 0; i < protlen; i++) {
1713         if (transl[i] != newprot[i])
1714             break;
1715     }
1716 
1717     MemFree(transl);
1718     if (i < protlen) {
1719         last_interval->SetFrom(oldfrom);
1720         last_interval->SetTo(oldto);
1721         return(0);
1722     }
1723 
1724     if (!gene_refs.valid)
1725         return(remainder);
1726 
1727     name = CpTheQualValue(feat.GetQual(), "gene");
1728     if (name == NULL)
1729         return(remainder);
1730 
1731     for (TSeqFeatList::iterator gene = gene_refs.first; gene != gene_refs.last; ++gene) {
1732         if (!(*gene)->IsSetData() || !(*gene)->GetData().IsGene())
1733             continue;
1734 
1735         int cur_strand = (*gene)->GetLocation().IsSetStrand() ? (*gene)->GetLocation().GetStrand() : 0,
1736             last_strand = last_interval->IsSetStrand() ? last_interval->GetStrand() : 0;
1737         if (cur_strand != last_strand)
1738             continue;
1739 
1740         const objects::CGene_ref& cur_gene_ref = (*gene)->GetData().GetGene();
1741         if (StringICmp(cur_gene_ref.GetLocus().c_str(), name) != 0) {
1742             if (!cur_gene_ref.IsSetSyn())
1743                 continue;
1744 
1745             bool found = false;
1746             ITERATE(objects::CGene_ref::TSyn, syn, cur_gene_ref.GetSyn())
1747             {
1748                 if (StringICmp(name, syn->c_str()) == 0) {
1749                     found = true;
1750                     break;
1751                 }
1752             }
1753 
1754             if (!found)
1755                 continue;
1756         }
1757 
1758         for (CTypeIterator<objects::CSeq_interval> loc(Begin((*gene)->SetLocation())); loc; ++loc) {
1759             if (!loc->GetId().Match(last_interval->GetId()))
1760                 continue;
1761 
1762             int cur_strand = loc->IsSetStrand() ? loc->GetStrand() : 0;
1763             if (cur_strand == objects::eNa_strand_minus && loc->GetFrom() == oldfrom)
1764                 loc->SetFrom(last_interval->GetFrom());
1765             else if (cur_strand != objects::eNa_strand_minus && loc->GetTo() == oldto)
1766                 loc->SetTo(last_interval->GetTo());
1767         }
1768     }
1769 
1770     MemFree(name);
1771     return(remainder);
1772 }
1773 
1774 /**********************************************************/
fta_check_codon_quals(objects::CSeq_feat & feat)1775 static void fta_check_codon_quals(objects::CSeq_feat& feat)
1776 {
1777     char*   p;
1778 
1779     if (!feat.IsSetQual())
1780         return;
1781 
1782     for (TQualVector::iterator qual = feat.SetQual().begin(); qual != feat.SetQual().end();) {
1783         if (!(*qual)->IsSetQual() || !(*qual)->IsSetVal() ||
1784             (*qual)->GetQual() != "codon") {
1785             ++qual;
1786             continue;
1787         }
1788 
1789         p = location_to_string(feat.GetLocation());
1790         ErrPostEx(SEV_ERROR, ERR_CDREGION_CodonQualifierUsed,
1791                   "Encountered /codon qualifier for \"CDS\" feature at \"%s\". Code-breaks (/transl_except) should be used instead.",
1792                   p);
1793         if (p != NULL)
1794             MemFree(p);
1795 
1796         qual = feat.SetQual().erase(qual);
1797     }
1798 
1799     if (feat.GetQual().empty())
1800         feat.ResetQual();
1801 }
1802 
1803 /**********************************************************
1804 *
1805 *   static ByteStorePtr InternalStopCodon(pp, ibp, sfp,
1806 *                                         method, gene):
1807 *
1808 *      Return NULL if there is no protein sequence, or
1809 *   there is no translation qualifier and protein sequence
1810 *   has internal stop codon; otherwise, return a protein
1811 *   sequence.
1812 *      In the embl format, it may not have "translation"
1813 *   qualifier, take protein sequence (without
1814 *   end_stop_codon) instead.
1815 *
1816 **********************************************************/
InternalStopCodon(ParserPtr pp,InfoBioseqPtr ibp,objects::CSeq_feat & feat,unsigned char * method,Uint1 dif,GeneRefFeats & gene_refs,std::string & seq_data)1817 static void InternalStopCodon(ParserPtr pp, InfoBioseqPtr ibp,
1818                               objects::CSeq_feat& feat, unsigned char* method,
1819                               Uint1 dif, GeneRefFeats& gene_refs, std::string& seq_data)
1820 {
1821     char*      qval;
1822     char*      msg;
1823     bool         intercodon = false;
1824     bool         again = true;
1825 
1826     ErrSev       sev;
1827 
1828     Uint1        m = 0;
1829     Char         gcode_str[10];
1830     Char         stopmsg[550];
1831     Char         aastr[100];
1832     size_t       protlen;
1833     Int4         aa;
1834     Int4         num = 0;
1835     Int2         residue;
1836     Int2         r;
1837 
1838     objects::CCdregion& cdregion = feat.SetData().SetCdregion();
1839 
1840     if (!cdregion.IsSetCode())
1841         return;
1842 
1843     objects::CGenetic_code& gen_code = cdregion.SetCode();
1844     objects::CGenetic_code::C_E* cur_code = nullptr;
1845 
1846     NON_CONST_ITERATE(objects::CGenetic_code::Tdata, gcode, gen_code.Set())
1847     {
1848         if ((*gcode)->IsId()) {
1849             cur_code = (*gcode);
1850             break;
1851         }
1852     }
1853 
1854     if (cur_code == nullptr)
1855         return;
1856 
1857     if (pp->no_code) {
1858         int orig_code_id = cur_code->GetId();
1859         for (int cur_code_id = cur_code->GetId(); again && cur_code_id < 14; ++cur_code_id) {
1860             cur_code->SetId(cur_code_id);
1861             intercodon = false;
1862 
1863             std::string prot;
1864             if (!Translate(feat, prot))
1865                 pp->entrylist[pp->curindx]->drop = 1;
1866 
1867             if (prot.empty())
1868                 continue;
1869 
1870             protlen = prot.size();
1871             residue = prot[protlen - 1];
1872 
1873             if (residue == '*')
1874                 prot = prot.substr(0, prot.size() - 1);
1875 
1876             intercodon = prot.find('*') != std::string::npos;
1877 
1878             if (!intercodon) {
1879                 qval = CpTheQualValue(feat.GetQual(), "translation");
1880                 if (qval != NULL)        /* compare protein sequence */
1881                 {
1882                     if (check_translation(prot, qval)) {
1883                         sev = (pp->taxserver == 0) ? SEV_INFO : SEV_WARNING;
1884                         ErrPostEx(sev, ERR_CDREGION_GeneticCodeAssumed,
1885                                   "No genetic code from TaxArch, trying to guess, code %d assumed",
1886                                   cur_code_id);
1887                         again = false;
1888                     }
1889                     MemFree(qval);
1890                 }
1891                 else
1892                     break;
1893             }
1894         }
1895 
1896         if (again) {
1897             sev = (pp->taxserver == 0) ? SEV_INFO : SEV_WARNING;
1898             ErrPostEx(sev, ERR_CDREGION_GeneticCodeAssumed,
1899                       "Can't guess genetic code, code %d assumed", orig_code_id);
1900             cur_code->SetId(orig_code_id);
1901         }
1902     }
1903 
1904     std::string prot;
1905     if (!Translate(feat, prot))
1906         pp->entrylist[pp->curindx]->drop = 1;
1907 
1908     if (cur_code != NULL)
1909         sprintf(gcode_str, "%d", (int)cur_code->GetId());
1910     else
1911         StringCpy(gcode_str, "unknown");
1912 
1913     qval = CpTheQualValue(feat.GetQual(), "translation");
1914     intercodon = false;
1915 
1916     msg = location_to_string(feat.GetLocation());
1917     if (msg == NULL)
1918         msg = StringSave("");
1919 
1920     if (!prot.empty()) {
1921         protlen = prot.size();
1922         residue = prot[protlen - 1];
1923         if ((!feat.IsSetPartial() || feat.GetPartial() == false) && !SeqLocHaveFuzz(feat.GetLocation())) {
1924             CkEndStop(feat, dif);
1925         }
1926 
1927         if (residue != '*') {
1928             r = EndAdded(feat, gene_refs);
1929             if (r > 0 && (!feat.IsSetExcept() || feat.GetExcept() == false)) {
1930                 ErrPostEx(SEV_WARNING, ERR_CDREGION_TerminalStopCodonMissing,
1931                           "CDS: %s |found end stop codon after %d bases added",
1932                           msg, r);
1933             }
1934 
1935             if ((!feat.IsSetPartial() || feat.GetPartial() == false) && !SeqLocHaveFuzz(feat.GetLocation())) {
1936                 /* if there is no partial qualifier and location
1937                 * doesn't have "fuzz" then output message
1938                 */
1939                 if (!feat.IsSetExcept() || feat.GetExcept() == false) {
1940                     ErrPostEx(SEV_ERROR, ERR_CDREGION_TerminalStopCodonMissing,
1941                               "No end stop codon found for CDS: %s", msg);
1942                 }
1943             }
1944         }
1945         else                    /* remove termination codon from protein */
1946         {
1947             check_end_internal(protlen, feat, dif);
1948             prot = prot.substr(0, prot.size() - 1);
1949         }
1950 
1951         /* check internal stop codon */
1952         size_t residue_idx = 0;
1953         protlen = prot.size();
1954         for (stopmsg[0] = '\0', aa = 1; residue_idx < protlen; ++residue_idx) {
1955             residue = prot[residue_idx];
1956             if (aa == 1 && residue == '-') {
1957                 /* if unrecognized start of translation,
1958                 * a ncbigap character is inserted
1959                 */
1960                 if (!feat.IsSetExcept() || feat.GetExcept() == false) {
1961                     ErrPostEx(SEV_WARNING, ERR_CDREGION_IllegalStart,
1962                               "unrecognized initiation codon from CDS: %s",
1963                               msg);
1964                 }
1965                 if (qval == NULL)        /* no /translation */
1966                 {
1967                     MemFree(msg);
1968                     return;
1969                 }
1970             }
1971 
1972             if (residue == '*')  /* only report 10 internal stop codons */
1973             {
1974                 intercodon = true;
1975                 ++num;
1976 
1977                 if (num < 11 && StringLen(stopmsg) < 500) {
1978                     sprintf(aastr, "%d ", (int)aa);
1979                     StringCat(stopmsg, aastr);
1980                 }
1981                 else if (num == 11 && StringLen(stopmsg) < 500)
1982                     StringCat(stopmsg, ", only report 10 positions");
1983             }
1984 
1985             aa++;
1986         }
1987 
1988         if (intercodon) {
1989             if (!feat.IsSetExcept() || feat.GetExcept() == false) {
1990                 ErrPostEx(SEV_ERROR, ERR_CDREGION_InternalStopCodonFound,
1991                           "Found %d internal stop codon, at AA # %s, on feature key, CDS, frame # %d, genetic code %s:%s",
1992                           (int)num, stopmsg, cdregion.GetFrame(), gcode_str, msg);
1993             }
1994 
1995             if (pp->debug) {
1996                 ErrByteStorePtr(ibp, feat, prot);
1997             }
1998         }
1999     }
2000     else if (!feat.IsSetExcept() || feat.GetExcept() == false) {
2001         ErrPostEx(SEV_WARNING, ERR_CDREGION_NoProteinSeq,
2002                   "No protein sequence found:%s", msg);
2003     }
2004 
2005     if (qval != NULL)                    /* compare protein sequence */
2006     {
2007         CkProteinTransl(pp, ibp, prot, feat, qval, intercodon, gcode_str, &m);
2008         *method = m;
2009         MemFree(qval);
2010         MemFree(msg);
2011         seq_data.swap(prot);
2012         return;
2013     }
2014 
2015     if (!prot.empty() && !intercodon) {
2016         if (prot.size() > 6 || !check_short_CDS(pp, feat, false)) {
2017             ErrPostEx(SEV_INFO, ERR_CDREGION_TranslationAdded,
2018                       "input CDS lacks a translation: %s", msg);
2019         }
2020         *method = objects::eGIBB_method_concept_trans;
2021         MemFree(msg);
2022         seq_data.swap(prot);
2023         return;
2024     }
2025 
2026     /* no translation qual and internal stop codon
2027     */
2028     if (intercodon) {
2029         cdregion.SetStops(num);
2030         if (!feat.IsSetExcept() || feat.GetExcept() == false) {
2031             ErrPostEx(SEV_WARNING, ERR_CDREGION_NoProteinSeq,
2032                       "internal stop codons, and no translation qualifier CDS:%s",
2033                       msg);
2034         }
2035     }
2036 
2037     MemFree(msg);
2038 }
2039 
2040 /**********************************************************/
check_gen_code(char * qval,ProtBlkPtr pbp,Uint1 taxserver)2041 static void check_gen_code(char* qval, ProtBlkPtr pbp, Uint1 taxserver)
2042 {
2043     ErrSev     sev;
2044     Uint1      gcpvalue;
2045     Uint1      genome;
2046     Uint1      value;
2047 
2048     if (pbp == NULL || !pbp->gcode.IsId())
2049         return;
2050 
2051     gcpvalue = pbp->gcode.GetId();
2052     value = (Uint1)atoi(qval);
2053     genome = pbp->genome;
2054 
2055     if (value == gcpvalue)
2056         return;
2057 
2058     if (value == 7 || value == 8) {
2059         ErrPostEx(SEV_WARNING, ERR_CDREGION_InvalidGcodeTable,
2060                   "genetic code table is obsolete /transl_table = %d", value);
2061         pbp->gcode.SetId(pbp->orig_gcode);
2062         return;
2063     }
2064 
2065     if (value != 11 || (genome != 2 && genome != 3 && genome != 6 &&
2066         genome != 12 && genome != 16 && genome != 17 &&
2067         genome != 18 && genome != 22)) {
2068         sev = (taxserver == 0) ? SEV_INFO : SEV_ERROR;
2069         ErrPostEx(sev, ERR_CDREGION_GeneticCodeDiff,
2070                   "Genetic code from Taxonomy server: %d, from /transl_table: %d",
2071                   gcpvalue, value);
2072     }
2073 
2074     pbp->gcode.SetId(value);
2075 }
2076 
2077 /**********************************************************/
CpGeneticCodePtr(objects::CGenetic_code & code,const objects::CGenetic_code::C_E & gcode)2078 static bool CpGeneticCodePtr(objects::CGenetic_code& code, const objects::CGenetic_code::C_E& gcode)
2079 {
2080     if (!gcode.IsId())
2081         return false;
2082 
2083     CRef<objects::CGenetic_code::C_E> ce(new objects::CGenetic_code::C_E);
2084     ce->SetId(gcode.GetId());
2085     code.Set().push_back(ce);
2086 
2087     return true;
2088 }
2089 
2090 /**********************************************************/
IfOnlyStopCodon(const objects::CBioseq & bioseq,const objects::CSeq_feat & feat,bool transl)2091 static Int4 IfOnlyStopCodon(const objects::CBioseq& bioseq, const objects::CSeq_feat& feat, bool transl)
2092 {
2093     char* p;
2094     Uint1   strand;
2095     Int4    len;
2096     Int4    i;
2097 
2098     if (!feat.IsSetLocation() || transl)
2099         return(0);
2100 
2101     const objects::CSeq_loc& loc = feat.GetLocation();
2102     TSeqPos start = loc.GetStart(objects::eExtreme_Positional),
2103         stop = loc.GetStop(objects::eExtreme_Positional) + 1;
2104 
2105     if (start == kInvalidSeqPos || stop == kInvalidSeqPos)
2106         return(0);
2107 
2108     len = stop - start;
2109     if (len < 1 || len > 5)
2110         return(0);
2111 
2112     strand = loc.IsSetStrand() ? loc.GetStrand() : 0;
2113 
2114     p = location_to_string(loc);
2115     if ((strand == 2 && stop == bioseq.GetLength()) || (strand != 2 && start == 0)) {
2116         ErrPostEx(SEV_INFO, ERR_CDREGION_StopCodonOnly,
2117                   "Assuming coding region at \"%s\" annotates the stop codon of an upstream or downstream coding region.",
2118                   (p == NULL) ? "???" : p);
2119         i = 1;
2120     }
2121     else {
2122         ErrPostEx(SEV_REJECT, ERR_CDREGION_StopCodonBadInterval,
2123                   "Coding region at \"%s\" appears to annotate a stop codon, but its location does not include a sequence endpoint.",
2124                   (p == NULL) ? "???" : p);
2125         i = -1;
2126     }
2127 
2128     if (p != NULL)
2129         MemFree(p);
2130 
2131     return(i);
2132 }
2133 
2134 /**********************************************************/
fta_concat_except_text(objects::CSeq_feat & feat,const Char * text)2135 static void fta_concat_except_text(objects::CSeq_feat& feat, const Char* text)
2136 {
2137     if (text == NULL)
2138         return;
2139 
2140     if (feat.IsSetExcept_text()) {
2141         feat.SetExcept_text() += ", ";
2142         feat.SetExcept_text() += text;
2143     }
2144     else
2145         feat.SetExcept_text() = text;
2146 }
2147 
2148 /**********************************************************/
fta_check_exception(objects::CSeq_feat & feat,Parser::ESource source)2149 static bool fta_check_exception(objects::CSeq_feat& feat, Parser::ESource source)
2150 {
2151     const char **b;
2152     ErrSev     sev;
2153     char*    p;
2154 
2155     if (!feat.IsSetQual())
2156         return true;
2157 
2158     bool stopped = false;
2159     for (TQualVector::iterator qual = feat.SetQual().begin(); qual != feat.SetQual().end();) {
2160         if (!(*qual)->IsSetQual()) {
2161             ++qual;
2162             continue;
2163         }
2164 
2165         if ((*qual)->GetQual() == "ribosomal_slippage")
2166             p = (char*) "ribosomal slippage";
2167         else if ((*qual)->GetQual() == "trans_splicing")
2168             p = (char*) "trans-splicing";
2169         else
2170             p = NULL;
2171 
2172         if (p != NULL) {
2173             feat.SetExcept(true);
2174 
2175             qual = feat.SetQual().erase(qual);
2176             fta_concat_except_text(feat, p);
2177             continue;
2178         }
2179 
2180         if ((*qual)->GetQual() != "exception" || !(*qual)->IsSetVal()) {
2181             ++qual;
2182             continue;
2183         }
2184 
2185         if (source == Parser::ESource::Refseq)
2186             b = RSExceptionQualVals;
2187         else
2188             b = GBExceptionQualVals;
2189 
2190         const Char* cur_val = (*qual)->GetVal().c_str();
2191         for (; *b != NULL; b++) {
2192             if (StringICmp(*b, cur_val) == 0)
2193                 break;
2194         }
2195 
2196         if (*b == NULL) {
2197             sev = (source == Parser::ESource::Refseq) ? SEV_ERROR : SEV_REJECT;
2198 
2199             p = location_to_string(feat.GetLocation());
2200             ErrPostEx(sev, ERR_QUALIFIER_InvalidException,
2201                       "/exception value \"%s\" on feature \"CDS\" at location \"%s\" is invalid.",
2202                       cur_val, (p == NULL) ? "Unknown" : p);
2203             if (p != NULL)
2204                 MemFree(p);
2205             if (source != Parser::ESource::Refseq) {
2206                 stopped = true;
2207                 break;
2208             }
2209         }
2210         else {
2211             feat.SetExcept(true);
2212             fta_concat_except_text(feat, cur_val);
2213         }
2214 
2215         qual = feat.SetQual().erase(qual);
2216     }
2217 
2218     if (feat.GetQual().empty())
2219         feat.ResetQual();
2220 
2221     if (stopped)
2222         return false;
2223 
2224     return true;
2225 }
2226 
2227 /**********************************************************
2228 *
2229 *   static Int2 CkCdRegion(pp, sfp, bsq, num, gene):
2230 *
2231 *      Routine returns 0, and
2232 *   sfp->data.choice = SEQFEAT_IMP :
2233 *   - if ambiguous frame number and no "codon_start"
2234 *     qualifier;
2235 *   - if there is a "pseudo" qualifier;
2236 *   - if the range of the "transl_except" qualifier is
2237 *     wrong;
2238 *      Otherwise return 1, and
2239 *   sfp->data.choice = SEQFEAT_CDREGION
2240 *
2241 **********************************************************/
CkCdRegion(ParserPtr pp,CScope & scope,objects::CSeq_feat & cds,objects::CBioseq & bioseq,int * num,GeneRefFeats & gene_refs)2242 static Int2 CkCdRegion(ParserPtr pp, CScope& scope, objects::CSeq_feat& cds, objects::CBioseq& bioseq,
2243                        int* num, GeneRefFeats& gene_refs)
2244 {
2245     ProtBlkPtr     pbp;
2246     const char     *r;
2247 
2248     char*        qval = NULL;
2249     char*        p;
2250     bool        is_pseudo;
2251     bool        is_stop;
2252     bool        is_transl;
2253 
2254     ErrSev         sev;
2255 
2256     Uint1          method = 0;
2257     Uint1          dif;
2258     Uint1          codon_start;
2259     Int2           frame;
2260     Int2           i;
2261 
2262     pbp = pp->pbp;
2263     if (pp->buf != NULL)
2264         MemFree(pp->buf);
2265     pp->buf = NULL;
2266 
2267     TCodeBreakList code_breaks;
2268     GetCdRegionCB(pbp->ibp, cds, code_breaks, &dif, pp->accver);
2269 
2270     is_pseudo = cds.IsSetPseudo() ? cds.GetPseudo() : false;
2271     is_transl = FindTheQual(cds, "translation");
2272 
2273     objects::CCode_break* first_code_break = nullptr;
2274     if (!code_breaks.empty())
2275         first_code_break = *code_breaks.begin();
2276 
2277     if (first_code_break != nullptr && first_code_break->GetAa().GetNcbieaa() == 42)
2278         is_stop = true;
2279     else if (is_pseudo)
2280         is_stop = false;
2281     else {
2282         i = IfOnlyStopCodon(bioseq, cds, is_transl);
2283         if (i < 0)
2284             return(-1);
2285         is_stop = (i != 0);
2286     }
2287 
2288     if (!is_transl) {
2289         bool found = false;
2290         ITERATE(TQualVector, qual, cds.GetQual())
2291         {
2292             if (!(*qual)->IsSetQual() || !(*qual)->IsSetVal())
2293                 continue;
2294 
2295             if ((*qual)->GetQual() == "product" ||
2296                 (*qual)->GetQual() == "function" ||
2297                 (*qual)->GetQual() == "EC_number") {
2298                 found = true;
2299                 break;
2300             }
2301         }
2302 
2303         if (found) {
2304             CRef<objects::CSeqFeatXref> xfer(new objects::CSeqFeatXref);
2305             objects::CProt_ref& prot_ref = xfer->SetData().SetProt();
2306             ITERATE(TQualVector, qual, cds.GetQual())
2307             {
2308                 if (!(*qual)->IsSetQual() || !(*qual)->IsSetVal())
2309                     continue;
2310 
2311                 if ((*qual)->GetQual() == "product")
2312                     prot_ref.SetName().push_back((*qual)->GetVal());
2313                 else if ((*qual)->GetQual() == "EC_number")
2314                     prot_ref.SetEc().push_back((*qual)->GetVal());
2315                 else if ((*qual)->GetQual() == "function")
2316                     prot_ref.SetActivity().push_back((*qual)->GetVal());
2317             }
2318 
2319             DeleteQual(cds.SetQual(), "product");
2320             DeleteQual(cds.SetQual(), "EC_number");
2321             DeleteQual(cds.SetQual(), "function");
2322 
2323             if (cds.GetQual().empty())
2324                 cds.ResetQual();
2325 
2326             cds.SetXref().push_back(xfer);
2327         }
2328     }
2329 
2330     if (pp->accver && is_transl && is_pseudo) {
2331         p = location_to_string(cds.GetLocation());
2332         ErrPostEx(SEV_ERROR, ERR_CDREGION_PseudoWithTranslation,
2333                   "Coding region flagged as /pseudo has a /translation qualifier : \"%s\".",
2334                   (p == NULL) ? "" : p);
2335         MemFree(p);
2336         return(-1);
2337     }
2338 
2339     if (pp->mode != Parser::EMode::Relaxed &&
2340         pp->accver && is_transl == false && FindTheQual(cds, "protein_id")) {
2341         p = location_to_string(cds.GetLocation());
2342         ErrPostEx(SEV_ERROR, ERR_CDREGION_UnexpectedProteinId,
2343                   "CDS without /translation should not have a /protein_id qualifier. CDS = \"%s\".",
2344                   (p == NULL) ? "" : p);
2345         MemFree(p);
2346         return(-1);
2347     }
2348 
2349     if (pp->mode != Parser::EMode::Relaxed &&
2350         is_transl == false && is_pseudo == false && is_stop == false) {
2351         p = location_to_string(cds.GetLocation());
2352         if (pp->accver == false) {
2353             r = "Feature and protein bioseq";
2354             i = -2;
2355         }
2356         else {
2357             r = "Record";
2358             i = -1;
2359         }
2360         sev = (i == -1) ? SEV_REJECT : SEV_ERROR;
2361         ErrPostEx(sev, ERR_CDREGION_MissingTranslation,
2362                   "Missing /translation qualifier for CDS \"%s\". %s rejected.",
2363                   (p == NULL) ? "" : p, r);
2364         MemFree(p);
2365         return(i);
2366     }
2367 
2368     /* check exception qualifier
2369     */
2370     if (!fta_check_exception(cds, pp->source))
2371         return(-1);
2372 
2373     CRef<objects::CImp_feat> imp_feat(new objects::CImp_feat);
2374     if (cds.IsSetData() && cds.GetData().IsImp())
2375         imp_feat->Assign(cds.GetData().GetImp());
2376 
2377     codon_start = 1;
2378     qval = GetTheQualValue(cds.SetQual(), "codon_start");
2379 
2380     if (qval == NULL) {
2381         if (pp->source == Parser::ESource::EMBL)
2382             frame = 1;
2383         else {
2384             frame = 0;
2385             objects::CCdregion::EFrame loc_frame = objects::CCdregion::eFrame_not_set;
2386             if (objects::CCleanup::SetFrameFromLoc(loc_frame, cds.GetLocation(), scope))
2387                 frame = loc_frame;
2388 
2389             if (frame == 0 && is_pseudo == false) {
2390                 p = location_to_string(cds.GetLocation());
2391                 sev = (pp->source == Parser::ESource::DDBJ) ? SEV_INFO : SEV_ERROR;
2392                 ErrPostEx(sev, ERR_CDREGION_MissingCodonStart,
2393                           "CDS feature \"%s\" is lacking /codon_start qualifier; assuming frame = 1.",
2394                           (p == NULL) ? "" : p);
2395                 MemFree(p);
2396                 frame = 1;
2397             }
2398         }
2399     }
2400     else {
2401         frame = (Uint1)atoi(qval);
2402         MemFree(qval);
2403     }
2404 
2405     CRef<objects::CCdregion> cdregion(new objects::CCdregion);
2406 
2407     if (frame > 0)
2408         cdregion->SetFrame(static_cast<objects::CCdregion::EFrame>(frame));
2409 
2410     qval = GetTheQualValue(cds.SetQual(), "transl_table");
2411 
2412     if (qval != NULL) {
2413         check_gen_code(qval, pbp, pp->taxserver);
2414         pp->no_code = false;
2415         MemFree(qval);
2416     }
2417     else if (pbp != NULL && pbp->gcode.IsId())
2418         pbp->gcode.SetId(pbp->orig_gcode);
2419 
2420     if (!code_breaks.empty())
2421         cdregion->SetCode_break().swap(code_breaks);
2422 
2423     if (!CpGeneticCodePtr(cdregion->SetCode(), pbp->gcode))
2424         cdregion->ResetCode();
2425 
2426     cds.SetData().SetCdregion(*cdregion);
2427 
2428     if (cds.GetQual().empty())
2429         cds.ResetQual();
2430 
2431     if (!is_transl) {
2432         imp_feat.Reset();
2433         return(0);
2434     }
2435 
2436     objects::CBioseq::TId ids;
2437     GetProtRefSeqId(ids, pbp->ibp, num, pp, scope, cds);
2438 
2439     if (!ids.empty())
2440         fta_check_codon_quals(cds);
2441 
2442     std::string sequence_data;
2443     InternalStopCodon(pp, pbp->ibp, cds, &method, dif, gene_refs, sequence_data);
2444 
2445     if (cds.GetQual().empty())
2446         cds.ResetQual();
2447 
2448     if (cdregion->IsSetConflict() && cdregion->GetConflict() && codon_start == 0) {
2449         p = location_to_string(cds.GetLocation());
2450         ErrPostEx(SEV_ERROR, ERR_CDREGION_TooBad,
2451                   "Input translation does not agree with parser generated one, cdregion \"%s\" is lacking /codon_start, frame not set, - so sequence will be rejected.",
2452                   (p == NULL) ? "" : p);
2453         MemFree(p);
2454         return(-1);
2455     }
2456 
2457     if (!sequence_data.empty()) {
2458         imp_feat.Reset();
2459         CRef<objects::CBioseq> new_bioseq = BldProtRefSeqEntry(pbp, cds, sequence_data, method, pp, bioseq, ids);
2460 
2461         if (new_bioseq.Empty()) {
2462             return(-1);
2463         }
2464 
2465         scope.AddBioseq(*new_bioseq);
2466 
2467         /* remove qualifiers which were processed before
2468         */
2469         DeleteQual(cds.SetQual(), "codon_start");
2470         DeleteQual(cds.SetQual(), "transl_except");
2471         DeleteQual(cds.SetQual(), "translation");
2472         DeleteQual(cds.SetQual(), "protein_id");
2473 
2474         if (cds.GetQual().empty())
2475             cds.ResetQual();
2476 
2477         if (sequence_data.size() < 6 && pp->accver == false && check_short_CDS(pp, cds, true)) {
2478             /* make xref from prot-ref for short CDS only
2479             */
2480             if (new_bioseq->IsSetAnnot()) {
2481                 NON_CONST_ITERATE(objects::CBioseq::TAnnot, annot, new_bioseq->SetAnnot())
2482                 {
2483                     if (!(*annot)->IsFtable())
2484                         continue;
2485 
2486                     ITERATE(TSeqFeatList, cur_feat, (*annot)->GetData().GetFtable())
2487                     {
2488                         if (!(*cur_feat)->IsSetData() || !(*cur_feat)->GetData().IsProt())
2489                             continue;
2490 
2491                         CRef<objects::CSeqFeatXref> new_xref(new objects::CSeqFeatXref);
2492                         new_xref->SetData().SetProt().Assign((*cur_feat)->GetData().GetProt());
2493 
2494                         cds.SetXref().push_back(new_xref);
2495                     }
2496                 }
2497             }
2498             return(0);
2499         }
2500 
2501         objects::CSeq_id& first_id = *(*new_bioseq->SetId().begin());
2502         cds.SetProduct().SetWhole(first_id);
2503 
2504         AddProtRefSeqEntry(pbp, *new_bioseq);
2505 
2506         return(1);
2507     }
2508 
2509     /* no protein sequence, or there is no translation qualifier
2510     * and protein sequence has internal stop codon
2511     */
2512 
2513     cds.SetExcept(false);
2514     if (cds.IsSetExcept_text())
2515         cds.ResetExcept_text();
2516 
2517     cds.ResetData();
2518     if (imp_feat.NotEmpty())
2519         cds.SetData().SetImp(*imp_feat);
2520 
2521     if (!is_pseudo) {
2522         p = location_to_string(cds.GetLocation());
2523         ErrPostEx(SEV_ERROR, ERR_CDREGION_ConvertToImpFeat,
2524                   "non-pseudo CDS with data problems is converted to ImpFeat%s",
2525                   (p == NULL) ? "" : p);
2526         MemFree(p);
2527     }
2528     return(0);
2529 }
2530 
2531 /**********************************************************
2532 *
2533 *   static SeqFeatPtr SrchCdRegion(pp, bsp, sfp, gene):
2534 *
2535 *      Return a link list of SeqFeatPtr of type CDREGION.
2536 *
2537 **********************************************************/
SrchCdRegion(ParserPtr pp,CScope & scope,objects::CBioseq & bioseq,objects::CSeq_annot & annot,GeneRefFeats & gene_refs)2538 static void SrchCdRegion(ParserPtr pp, CScope& scope, objects::CBioseq& bioseq, objects::CSeq_annot& annot,
2539                          GeneRefFeats& gene_refs)
2540 {
2541     char*    p;
2542     Int4       num = 0;
2543     Int2       i;
2544 
2545     if (!annot.IsSetData() || !annot.GetData().IsFtable())
2546         return;
2547 
2548     for (objects::CSeq_annot::C_Data::TFtable::iterator feat = annot.SetData().SetFtable().begin();
2549          feat != annot.SetData().SetFtable().end();) {
2550         if (!(*feat)->IsSetData() || !(*feat)->GetData().IsImp()) {
2551             ++feat;
2552             continue;
2553         }
2554 
2555         const objects::CImp_feat& imp_feat = (*feat)->GetData().GetImp();
2556         if (!imp_feat.IsSetKey() || imp_feat.GetKey() != "CDS") {
2557             ++feat;
2558             continue;
2559         }
2560 
2561         /* remove asn2ff_generated comments
2562         */
2563         StripCDSComment(*(*feat));
2564 
2565         const objects::CSeq_loc& loc = (*feat)->GetLocation();
2566         if (loc.IsEmpty() || loc.IsEquiv() || loc.IsBond()) {
2567             p = location_to_string(loc);
2568             ErrPostEx(SEV_REJECT, ERR_CDREGION_BadLocForTranslation,
2569                       "Coding region feature has a location that cannot be processed: \"%s\".",
2570                       (p == NULL) ? "" : p);
2571             if (p != NULL)
2572                 MemFree(p);
2573             pp->entrylist[pp->curindx]->drop = 1;
2574             break;
2575         }
2576 
2577         i = CkCdRegion(pp, scope, *(*feat), bioseq, &num, gene_refs);
2578 
2579         if (i == -2) {
2580             feat = annot.SetData().SetFtable().erase(feat);
2581             continue;
2582         }
2583 
2584         if (i == -1) {
2585             pp->entrylist[pp->curindx]->drop = 1;
2586             break;
2587         }
2588 
2589         if (i != 1) {
2590             ++feat;
2591             continue;
2592         }
2593 
2594         /* prepare cdregion to link list, for nuc-prot level
2595         */
2596         pp->pbp->feats.push_back(*feat);
2597 
2598         feat = annot.SetData().SetFtable().erase(feat);
2599     }
2600 }
2601 
2602 /**********************************************************/
FindCd(TEntryList & seq_entries,CScope & scope,ParserPtr pp,GeneRefFeats & gene_refs)2603 static void FindCd(TEntryList& seq_entries, CScope& scope, ParserPtr pp, GeneRefFeats& gene_refs)
2604 {
2605     ProtBlkPtr    pbp = pp->pbp;
2606 
2607     NON_CONST_ITERATE(TEntryList, entry, seq_entries)
2608     {
2609         for (CTypeIterator<objects::CBioseq_set> bio_set(Begin(*(*entry))); bio_set; ++bio_set) {
2610             pbp->segset = true;
2611             pbp->biosep = *entry;
2612             break;
2613         }
2614 
2615         if (pbp->segset)
2616             break;
2617     }
2618 
2619     NON_CONST_ITERATE(TEntryList, entry, seq_entries)
2620     {
2621         for (CTypeIterator<objects::CBioseq> bioseq(Begin(*(*entry))); bioseq; ++bioseq) {
2622             const objects::CSeq_id& first_id = *(*bioseq->GetId().begin());
2623             if (IsSegBioseq(first_id))
2624                 continue;
2625 
2626             InfoBioseqFree(pbp->ibp);
2627             if(pp->source != Parser::ESource::USPTO)
2628                 CpSeqId(pbp->ibp, first_id);
2629 
2630             if (bioseq->IsSetAnnot()) {
2631                 for (objects::CBioseq::TAnnot::iterator annot = bioseq->SetAnnot().begin(); annot != bioseq->SetAnnot().end();) {
2632                     if (!(*annot)->IsFtable()) {
2633                         ++annot;
2634                         continue;
2635                     }
2636 
2637                     SrchCdRegion(pp, scope, *bioseq, *(*annot), gene_refs);
2638                     if (!(*annot)->GetData().GetFtable().empty()) {
2639                         ++annot;
2640                         continue;
2641                     }
2642 
2643                     annot = bioseq->SetAnnot().erase(annot);
2644                 }
2645 
2646                 if (bioseq->GetAnnot().empty())
2647                     bioseq->ResetAnnot();
2648             }
2649 
2650             if (!pbp->segset) {
2651                 pbp->biosep = *entry;
2652             }
2653         }
2654     }
2655 }
2656 
2657 /**********************************************************/
check_GIBB(TSeqdescList & descrs)2658 static bool check_GIBB(TSeqdescList& descrs)
2659 {
2660     if (descrs.empty())
2661         return false;
2662 
2663     const objects::CSeqdesc* descr_modif = nullptr;
2664     ITERATE(TSeqdescList, descr, descrs)
2665     {
2666         if ((*descr)->IsModif()) {
2667             descr_modif = *descr;
2668             break;
2669         }
2670     }
2671     if (descr_modif == nullptr)
2672         return true;
2673 
2674     if (!descr_modif->GetModif().empty()) {
2675         objects::EGIBB_mod gmod = *descr_modif->GetModif().begin();
2676         if (gmod == objects::eGIBB_mod_dna || gmod == objects::eGIBB_mod_rna ||
2677             gmod == objects::eGIBB_mod_est)
2678             return false;
2679     }
2680     return true;
2681 }
2682 
2683 /**********************************************************/
ValNodeExtractUserObject(TSeqdescList & descrs_from,TSeqdescList & descrs_to,const Char * tag)2684 static void ValNodeExtractUserObject(TSeqdescList& descrs_from, TSeqdescList& descrs_to, const Char* tag)
2685 {
2686     for (TSeqdescList::iterator descr = descrs_from.begin(); descr != descrs_from.end();) {
2687         if ((*descr)->IsUser() && (*descr)->GetUser().IsSetData() &&
2688             (*descr)->GetUser().IsSetType() && (*descr)->GetUser().GetType().IsStr() &&
2689             (*descr)->GetUser().GetType().GetStr() == tag) {
2690             descrs_to.push_back(*descr);
2691             descr = descrs_from.erase(descr);
2692             break;
2693         }
2694         else
2695             ++descr;
2696     }
2697 }
2698 
2699 /**********************************************************/
ExtractDescrs(TSeqdescList & descrs_from,TSeqdescList & descrs_to,objects::CSeqdesc::E_Choice choice)2700 void ExtractDescrs(TSeqdescList& descrs_from, TSeqdescList& descrs_to, objects::CSeqdesc::E_Choice choice)
2701 {
2702     for (TSeqdescList::iterator descr = descrs_from.begin(); descr != descrs_from.end();) {
2703         if ((*descr)->Which() == choice) {
2704             descrs_to.push_back(*descr);
2705             descr = descrs_from.erase(descr);
2706         }
2707         else
2708             ++descr;
2709     }
2710 }
2711 
2712 /**********************************************************/
GetBioseqSetDescr(ProtBlkPtr pbp,TSeqdescList & descrs)2713 static void GetBioseqSetDescr(ProtBlkPtr pbp, TSeqdescList& descrs)
2714 {
2715     TSeqdescList* descrs_from = nullptr;
2716     if (pbp->segset) {
2717         if (!pbp->biosep->GetSet().GetDescr().Get().empty())
2718             descrs_from = &pbp->biosep->SetSet().SetDescr().Set();
2719     }
2720     else {
2721         if (!pbp->biosep->GetSeq().GetDescr().Get().empty())
2722             descrs_from = &pbp->biosep->SetSeq().SetDescr().Set();
2723     }
2724 
2725     if (descrs_from == nullptr)
2726         return;
2727 
2728     ExtractDescrs(*descrs_from, descrs, objects::CSeqdesc::e_Org);
2729 
2730     if (check_GIBB(*descrs_from)) {
2731         ExtractDescrs(*descrs_from, descrs, objects::CSeqdesc::e_Modif);
2732     }
2733 
2734     ExtractDescrs(*descrs_from, descrs, objects::CSeqdesc::e_Comment);
2735     ExtractDescrs(*descrs_from, descrs, objects::CSeqdesc::e_Pub);
2736     ExtractDescrs(*descrs_from, descrs, objects::CSeqdesc::e_Update_date);
2737 
2738     ValNodeExtractUserObject(*descrs_from, descrs, "GenomeProjectsDB");
2739     ValNodeExtractUserObject(*descrs_from, descrs, "DBLink");
2740     ValNodeExtractUserObject(*descrs_from, descrs, "FeatureFetchPolicy");
2741 }
2742 
2743 /**********************************************************/
BuildProtBioseqSet(ProtBlkPtr pbp,TEntryList & entries)2744 static void BuildProtBioseqSet(ProtBlkPtr pbp, TEntryList& entries)
2745 {
2746     CRef<objects::CSeq_entry> entry(new objects::CSeq_entry);
2747     objects::CBioseq_set& seq_set = entry->SetSet();
2748     seq_set.SetClass(objects::CBioseq_set::eClass_nuc_prot);
2749 
2750     /* add descr if nuc-prot
2751     */
2752     GetBioseqSetDescr(pbp, seq_set.SetDescr().Set());       /* get from ASN.1 tree */
2753     if (seq_set.GetDescr().Get().empty())
2754         seq_set.ResetDescr();
2755 
2756     seq_set.SetSeq_set().splice(seq_set.SetSeq_set().end(), entries);
2757 
2758     CRef<objects::CSeq_annot> annot(new objects::CSeq_annot);
2759 
2760     if (!pbp->feats.empty())
2761         annot->SetData().SetFtable().swap(pbp->feats);
2762 
2763     seq_set.SetAnnot().push_back(annot);
2764 
2765     entries.push_back(entry);
2766 }
2767 
2768 /**********************************************************/
ProcNucProt(ParserPtr pp,TEntryList & seq_entries,GeneRefFeats & gene_refs)2769 void ProcNucProt(ParserPtr pp, TEntryList& seq_entries, GeneRefFeats& gene_refs)
2770 {
2771     ProtBlkPtr    pbp;
2772     ErrSev        sev;
2773     Int4          gcode = 0;
2774 
2775     pbp = pp->pbp;
2776     ProtBlkInit(pbp);
2777 
2778     GetGcode(seq_entries, pp);
2779 
2780     if (!pbp->gcode.IsId()) {
2781         gcode = (pbp->genome == 4 || pbp->genome == 5) ? 2 : 1;
2782         pp->no_code = true;
2783         sev = (pp->taxserver == 0) ? SEV_INFO : SEV_WARNING;
2784         ErrPostEx(sev, ERR_CDREGION_GeneticCodeAssumed,
2785                   "No %sgenetic code from TaxArch, code %d assumed",
2786                   (gcode == 2) ? "mitochondrial " : "", gcode);
2787         pbp->gcode.SetId(gcode);
2788         pbp->orig_gcode = gcode;
2789     }
2790 
2791     FindCd(seq_entries, GetScope(), pp, gene_refs);
2792 
2793     if (pp->entrylist[pp->curindx]->drop == 1) {
2794         ProtBlkFree(pbp);
2795         seq_entries.clear();
2796         return;
2797     }
2798 
2799     if (!pbp->entries.empty()) {
2800         seq_entries.splice(seq_entries.end(), pbp->entries);
2801 
2802         BuildProtBioseqSet(pbp, seq_entries);
2803         AssignBioseqSetLevel(seq_entries);
2804     }
2805 
2806     ProtBlkFree(pbp);
2807 }
2808 
2809 /**********************************************************/
GetDateFromDescrs(const TSeqdescList & descrs,objects::CSeqdesc::E_Choice what)2810 static const objects::CDate* GetDateFromDescrs(const TSeqdescList& descrs, objects::CSeqdesc::E_Choice what)
2811 {
2812     const objects::CDate* set_date = nullptr;
2813     ITERATE(TSeqdescList, descr, descrs)
2814     {
2815         if ((*descr)->Which() == what) {
2816             if (what == objects::CSeqdesc::e_Create_date)
2817                 set_date = &(*descr)->GetCreate_date();
2818             else if (what == objects::CSeqdesc::e_Update_date)
2819                 set_date = &(*descr)->GetUpdate_date();
2820 
2821             if (set_date)
2822                 break;
2823         }
2824     }
2825 
2826     return set_date;
2827 }
2828 
2829 /**********************************************************/
FixDupDates(objects::CBioseq_set & bio_set,objects::CSeqdesc::E_Choice what)2830 static void FixDupDates(objects::CBioseq_set& bio_set, objects::CSeqdesc::E_Choice what)
2831 {
2832     if (!bio_set.IsSetSeq_set() || !bio_set.IsSetDescr())
2833         return;
2834 
2835     NON_CONST_ITERATE(TEntryList, entry, bio_set.SetSeq_set())
2836     {
2837         for (CTypeIterator<objects::CBioseq> bioseq(Begin(*(*entry))); bioseq; ++bioseq) {
2838             if (!bioseq->IsSetInst() || !bioseq->GetInst().IsSetMol() || !bioseq->GetInst().IsNa() || !bioseq->IsSetDescr())
2839                 continue;
2840 
2841             const objects::CDate* set_date = GetDateFromDescrs(bio_set.GetDescr().Get(), what);
2842 
2843             TSeqdescList& cur_descrs = bioseq->SetDescr().Set();
2844             TSeqdescList::iterator cur_descr = cur_descrs.begin();
2845 
2846             for (; cur_descr != cur_descrs.end(); ++cur_descr) {
2847                 if ((*cur_descr)->Which() == what)
2848                     break;
2849             }
2850 
2851             if (cur_descr == cur_descrs.end())
2852                 continue;
2853 
2854             const objects::CDate* seq_date = nullptr;
2855             if (what == objects::CSeqdesc::e_Create_date)
2856                 seq_date = &(*cur_descr)->GetCreate_date();
2857             else if (what == objects::CSeqdesc::e_Update_date)
2858                 seq_date = &(*cur_descr)->GetUpdate_date();
2859 
2860             if (seq_date == nullptr)
2861                 continue;
2862 
2863             if (set_date && seq_date->Compare(*set_date) == objects::CDate::eCompare_same)
2864                 cur_descrs.erase(cur_descr);
2865 
2866             if (set_date == nullptr) {
2867                 bio_set.SetDescr().Set().push_back(*cur_descr);
2868                 cur_descrs.erase(cur_descr);
2869             }
2870         }
2871     }
2872 }
2873 
2874 /**********************************************************/
FixCreateDates(objects::CBioseq_set & bio_set)2875 static void FixCreateDates(objects::CBioseq_set& bio_set)
2876 {
2877     FixDupDates(bio_set, objects::CSeqdesc::e_Create_date);
2878 }
2879 
2880 /**********************************************************/
FixUpdateDates(objects::CBioseq_set & bio_set)2881 static void FixUpdateDates(objects::CBioseq_set& bio_set)
2882 {
2883     FixDupDates(bio_set, objects::CSeqdesc::e_Update_date);
2884 }
2885 
2886 /**********************************************************/
FixEmblUpdateDates(objects::CBioseq_set & bio_set)2887 static void FixEmblUpdateDates(objects::CBioseq_set& bio_set)
2888 {
2889     if (!bio_set.IsSetSeq_set() || !bio_set.IsSetDescr())
2890         return;
2891 
2892     NON_CONST_ITERATE(TEntryList, entry, bio_set.SetSeq_set())
2893     {
2894         for (CTypeIterator<objects::CBioseq> bioseq(Begin(*(*entry))); bioseq; ++bioseq) {
2895             if (!bioseq->IsSetInst() || !bioseq->GetInst().IsSetMol() || !bioseq->GetInst().IsNa() || !bioseq->IsSetDescr())
2896                 continue;
2897 
2898             const objects::CDate* set_date = GetDateFromDescrs(bio_set.GetDescr().Get(), objects::CSeqdesc::e_Update_date);
2899 
2900             const objects::CEMBL_block* embl_block = nullptr;
2901             ITERATE(TSeqdescList, descr, bioseq->GetDescr().Get())
2902             {
2903                 if ((*descr)->IsEmbl()) {
2904                     embl_block = &(*descr)->GetEmbl();
2905                     break;
2906                 }
2907             }
2908 
2909             const objects::CDate* seq_date = nullptr;
2910             if (embl_block != nullptr && embl_block->IsSetUpdate_date())
2911                 seq_date = &embl_block->GetUpdate_date();
2912 
2913             if (seq_date == nullptr)
2914                 continue;
2915 
2916             if (set_date && seq_date->Compare(*set_date) == objects::CDate::eCompare_same)
2917                 continue;
2918 
2919             if (set_date == nullptr) {
2920                 CRef<objects::CSeqdesc> new_descr(new objects::CSeqdesc);
2921                 new_descr->SetUpdate_date().Assign(*seq_date);
2922                 bio_set.SetDescr().Set().push_back(new_descr);
2923             }
2924         }
2925     }
2926 }
2927 
2928 /**********************************************************/
CheckDupDates(TEntryList & seq_entries)2929 void CheckDupDates(TEntryList& seq_entries)
2930 {
2931     NON_CONST_ITERATE(TEntryList, entry, seq_entries)
2932     {
2933         for (CTypeIterator<objects::CBioseq_set> bio_set(Begin(*(*entry))); bio_set; ++bio_set) {
2934             if (bio_set->IsSetClass() && bio_set->GetClass() == objects::CBioseq_set::eClass_nuc_prot) {
2935                 FixCreateDates(*bio_set);
2936                 FixUpdateDates(*bio_set);
2937                 FixEmblUpdateDates(*bio_set);
2938             }
2939         }
2940     }
2941 }
2942 
2943 END_NCBI_SCOPE
2944