1 /* add.c
2  *
3  * ===========================================================================
4  *
5  *                            PUBLIC DOMAIN NOTICE
6  *               National Center for Biotechnology Information
7  *
8  *  This software/database is a "United States Government Work" under the
9  *  terms of the United States Copyright Act.  It was written as part of
10  *  the author's official duties as a United States Government employee and
11  *  thus cannot be copyrighted.  This software/database is freely available
12  *  to the public for use. The National Library of Medicine and the U.S.
13  *  Government have not placed any restriction on its use or reproduction.
14  *
15  *  Although all reasonable efforts have been taken to ensure the accuracy
16  *  and reliability of the software and data, the NLM and the U.S.
17  *  Government do not and cannot warrant the performance or results that
18  *  may be obtained by using this software or data. The NLM and the U.S.
19  *  Government disclaim all warranties, express or implied, including
20  *  warranties of performance, merchantability or fitness for any particular
21  *  purpose.
22  *
23  *  Please cite the author in any work or product based on this material.
24  *
25  * ===========================================================================
26  *
27  * File Name:  add.c
28  *
29  * Author: Karl Sirotkin, Hsiu-Chuan Chen
30  *
31  * File Description:
32  * -----------------
33  *      Additional parser functions.
34  *
35  */
36 #include <ncbi_pch.hpp>
37 
38 #include "ftacpp.hpp"
39 #include <objects/seq/Seq_gap.hpp>
40 #include <objects/general/User_object.hpp>
41 #include <objects/general/User_field.hpp>
42 #include <objects/general/Object_id.hpp>
43 #include <objects/seq/Seq_descr.hpp>
44 #include <objects/seqloc/Seq_interval.hpp>
45 #include <objects/seq/MolInfo.hpp>
46 #include <objects/seq/Seq_inst.hpp>
47 #include <objects/seq/Seq_ext.hpp>
48 #include <objects/seq/Seq_hist.hpp>
49 #include <objects/seq/Seq_hist_rec.hpp>
50 #include <objects/seqalign/Seq_align.hpp>
51 #include <objects/seqalign/Dense_seg.hpp>
52 #include <objects/general/Dbtag.hpp>
53 #include <objects/seqalign/Seq_align_set.hpp>
54 #include <objects/seq/Seq_annot.hpp>
55 #include <objects/seqfeat/Imp_feat.hpp>
56 #include <objects/seq/seqport_util.hpp>
57 #include <objects/seq/Delta_ext.hpp>
58 #include <objects/seq/Delta_seq.hpp>
59 #include <objects/seq/Seq_literal.hpp>
60 #include <objects/seqloc/Seq_point.hpp>
61 #include <objects/seqloc/Seq_loc_equiv.hpp>
62 #include <objects/seqset/Bioseq_set.hpp>
63 #include <objects/seq/seq_id_handle.hpp>
64 
65 #include "index.h"
66 #include "genbank.h"                    /* for ParFlat_FEATURES */
67 #include "embl.h"                       /* for ParFlat_FH */
68 
69 #include <objtools/flatfile/flatdefn.h>
70 #include "ftanet.h"
71 
72 #include "ftaerr.hpp"
73 #include "indx_blk.h"
74 #include "asci_blk.h"
75 #include "utilfun.h"
76 
77 #ifdef THIS_FILE
78 #    undef THIS_FILE
79 #endif
80 #define THIS_FILE "add.cpp"
81 
/* Minimum length of a run of N's that SeqToDelta() converts into a gap literal. */
82 #define HTG_GAP   100
/* Runs of N's at least this long, but shorter than HTG_GAP, are remembered by
 * SeqToDelta() for its "possible short gap" warning. */
83 #define SHORT_GAP 20
84 
85 BEGIN_NCBI_SCOPE
86 USING_SCOPE(objects);
87 
88 typedef struct _seq_loc_ids {
89     objects::CSeq_loc* badslp;
90     const Char*   wgsacc;
91     const Char*   wgscont;
92     const Char*   wgsscaf;
93     Int4      genbank;
94     Int4      embl;
95     Int4      pir;
96     Int4      swissprot;
97     Int4      other;
98     Int4      ddbj;
99     Int4      prf;
100     Int4      tpg;
101     Int4      tpe;
102     Int4      tpd;
103     Int4      total;
104 } SeqLocIds, *SeqLocIdsPtr;
105 
106 typedef struct _fta_tpa_block {
107     Int4                       from1;
108     Int4                       to1;
109     char*                    accession;
110     Int4                       version;
111     Int4                       from2;
112     Int4                       to2;
113     Uint1                      strand;
114     Uint1                      sicho;   /* SeqId choice */
115     struct _fta_tpa_block* next;
116 } FTATpaBlock, *FTATpaBlockPtr;
117 
118 typedef struct _fta_tpa_span {
119     Int4                      from;
120     Int4                      to;
121     struct _fta_tpa_span* next;
122 } FTATpaSpan, *FTATpaSpanPtr;
123 
124 /**********************************************************/
fta_tpa_block_free(FTATpaBlockPtr ftbp)125 static void fta_tpa_block_free(FTATpaBlockPtr ftbp)
126 {
127     FTATpaBlockPtr next;
128 
129     for(; ftbp != NULL; ftbp = next)
130     {
131         next = ftbp->next;
132         if(ftbp->accession != NULL)
133             MemFree(ftbp->accession);
134         MemFree(ftbp);
135     }
136 }
137 
138 /**********************************************************
139  *
140  *   char* tata_save(str):
141  *
142  *      Deletes spaces from the begining and the end and
143  *   returns Nlm_StringSave.
144  *
145  **********************************************************/
tata_save(char * str)146 char* tata_save(char* str)
147 {
148     char* s;
149     char* ss;
150 
151     if(str == NULL)
152         return(NULL);
153 
154     while(isspace((int) *str) != 0 || *str == ',')
155         str++;
156     for(s = str; *s != '\0'; s++)
157     {
158         if(*s != '\n')
159             continue;
160 
161         for(ss = s + 1; isspace((int) *ss) != 0;)
162             ss++;
163         *s = ' ';
164         fta_StringCpy(s + 1, ss);
165     }
166     s = str + StringLen(str) - 1;
167     while(s >= str && (*s == ' ' || *s == ';' || *s == ',' || *s == '\"' ||
168                        *s == '\t'))
169         *s-- = '\0';
170 
171     if(*str == '\0')
172         return(NULL);
173 
174     return(StringSave(str));
175 }
176 
177 /**********************************************************/
no_date(Parser::EFormat format,const TSeqdescList & descrs)178 bool no_date(Parser::EFormat format, const TSeqdescList& descrs)
179 {
180     bool no_create = true;
181     bool no_update = true;
182 
183     ITERATE(TSeqdescList, desc, descrs)
184     {
185         if ((*desc)->IsCreate_date())
186             no_create = false;
187         else if ((*desc)->IsUpdate_date())
188             no_update = false;
189 
190         if (no_create == false && no_update == false)
191             break;
192     }
193 
194     if(format == Parser::EFormat::GenBank)
195         return(no_update);
196 
197     return(no_create || no_update);
198 }
199 
200 /**********************************************************
201  *
202  *   bool no_reference(bsp):
203  *
204  *      Search for at least one reference in bioseq->desr
205  *   or in bioseq->annot.
206  *      If no reference return TRUE.
207  *
208  **********************************************************/
no_reference(const objects::CBioseq & bioseq)209 bool no_reference(const objects::CBioseq& bioseq)
210 {
211     ITERATE(TSeqdescList, desc, bioseq.GetDescr().Get())
212     {
213         if ((*desc)->IsPub())
214             return false;
215     }
216 
217     ITERATE(objects::CBioseq::TAnnot, annot, bioseq.GetAnnot())
218     {
219         if (!(*annot)->IsFtable())
220             continue;
221 
222         ITERATE(objects::CSeq_annot::C_Data::TFtable, feat, (*annot)->GetData().GetFtable())
223         {
224             if ((*feat)->IsSetData() && (*feat)->GetData().IsPub())
225                 return false;
226         }
227 
228         ITERATE(objects::CSeq_annot::C_Data::TFtable, feat, (*annot)->GetData().GetFtable())
229         {
230             if (!(*feat)->IsSetData() || !(*feat)->GetData().IsImp())
231                 continue;
232 
233             const objects::CImp_feat& imp = (*feat)->GetData().GetImp();
234             if (imp.GetKey() == "Site-ref")
235             {
236                 ErrPostStr(SEV_ERROR, ERR_REFERENCE_Illegalreference,
237                            "The entry has only 'sites' references");
238                 return false;
239             }
240         }
241     }
242 
243     return true;
244 }
245 
246 /**********************************************************
247  *
248  *   bool check_cds(entry, format):
249  *
250  *      Returns TRUE if CDS is in the entry.
251  *
252  **********************************************************/
check_cds(DataBlkPtr entry,Parser::EFormat format)253 bool check_cds(DataBlkPtr entry, Parser::EFormat format)
254 {
255     DataBlkPtr temp;
256     DataBlkPtr dbp;
257     const char *str;
258     char*    p;
259     Char       ch;
260     Int2       type;
261 
262     if(format == Parser::EFormat::EMBL)
263     {
264         type = ParFlat_FH;
265         str = "\nFT   CDS  ";
266     }
267     else if(format == Parser::EFormat::GenBank)
268     {
269         type = ParFlat_FEATURES;
270         str = "\n     CDS  ";
271     }
272     else
273         return false;
274 
275     for(temp = TrackNodeType(entry, type); temp != NULL; temp = temp->next)
276     {
277         if(temp->type != type)
278             continue;
279 
280         size_t len = 0;
281         for(dbp = (DataBlkPtr) temp->data; dbp != NULL; dbp = dbp->next)
282             len += dbp->len;
283         if(len == 0)
284             continue;
285 
286         dbp = (DataBlkPtr) temp->data;
287         ch = dbp->offset[len];
288         dbp->offset[len] = '\0';
289         p = StringStr(dbp->offset, str);
290         dbp->offset[len] = ch;
291 
292         if(p != NULL)
293             break;
294     }
295 
296     if(temp == NULL)
297         return false;
298     return true;
299 }
300 
301 /**********************************************************/
err_install(IndexblkPtr ibp,bool accver)302 void err_install(IndexblkPtr ibp, bool accver)
303 {
304     Char temp[200];
305 
306     FtaInstallPrefix(PREFIX_LOCUS, ibp->locusname, NULL);
307     if(accver && ibp->vernum > 0)
308         sprintf(temp, "%s.%d", ibp->acnum, ibp->vernum);
309     else
310         StringCpy(temp, ibp->acnum);
311     if(*temp == '\0')
312         StringCpy(temp, ibp->locusname);
313     FtaInstallPrefix(PREFIX_ACCESSION, temp, NULL);
314 }
315 
316 /**********************************************************/
CreateSeqGap(objects::CSeq_literal & seq_lit,GapFeatsPtr gfp)317 static void CreateSeqGap(objects::CSeq_literal& seq_lit, GapFeatsPtr gfp)
318 {
319     if (gfp == NULL)
320         return;
321 
322     objects::CSeq_gap& sgap = seq_lit.SetSeq_data().SetGap();
323     sgap.SetType(gfp->asn_gap_type);
324 
325     if (!gfp->asn_linkage_evidence.empty())
326         sgap.SetLinkage_evidence().swap(gfp->asn_linkage_evidence);
327 
328     if (StringCmp(gfp->gap_type, "unknown") == 0 ||
329         StringCmp(gfp->gap_type, "within scaffold") == 0 ||
330         StringCmp(gfp->gap_type, "repeat within scaffold") == 0)
331         sgap.SetLinkage(1);
332     else
333         sgap.SetLinkage(0);
334 }
335 
336 /**********************************************************/
AssemblyGapsToDelta(objects::CBioseq & bioseq,GapFeatsPtr gfp,unsigned char * drop)337 void AssemblyGapsToDelta(objects::CBioseq& bioseq, GapFeatsPtr gfp, unsigned char* drop)
338 {
339     if (!bioseq.GetInst().IsSetExt() || !bioseq.GetInst().GetExt().IsDelta() ||
340        gfp == NULL)
341         return;
342 
343     objects::CDelta_ext::Tdata& deltas = bioseq.SetInst().SetExt().SetDelta();
344     objects::CDelta_ext::Tdata::iterator delta = deltas.begin();
345     for (; delta != deltas.end(); ++delta)
346     {
347         if (gfp == NULL)
348             break;
349 
350         if (!(*delta)->IsLiteral())            /* not Seq-lit */
351             continue;
352 
353         objects::CSeq_literal& literal = (*delta)->SetLiteral();
354         if (literal.GetLength() != static_cast<Uint4>(gfp->to - gfp->from + 1))
355         {
356             ErrPostEx(SEV_REJECT, ERR_FORMAT_ContigVersusAssemblyGapMissmatch,
357                       "The lengths of the CONTIG/CO line gaps disagrees with the lengths of assembly_gap features. First assembly_gap with a mismatch is at \"%d..%d\".",
358                       gfp->from, gfp->to);
359             *drop = 1;
360             break;
361         }
362 
363         CreateSeqGap(literal, gfp);
364 
365         gfp = gfp->next;
366     }
367 
368     if (*drop != 0 || (delta == deltas.end() && gfp == NULL))
369         return;
370 
371     if (delta == deltas.end() && gfp != NULL)
372     {
373         ErrPostEx(SEV_REJECT, ERR_FORMAT_ContigVersusAssemblyGapMissmatch,
374                   "The number of the assembly_gap features exceeds the number of CONTIG/CO line gaps. First extra assembly_gap is at \"%d..%d\".",
375                   gfp->from, gfp->to);
376         *drop = 1;
377     }
378     else if (delta != deltas.end() && gfp == NULL)
379     {
380         for (; delta != deltas.end(); ++delta)
381         {
382             if ((*delta)->IsLiteral())            /* Seq-lit */
383                 break;
384         }
385 
386         if (delta == deltas.end())
387             return;
388 
389         ErrPostEx(SEV_REJECT, ERR_FORMAT_ContigVersusAssemblyGapMissmatch,
390                   "The number of the CONTIG/CO line gaps exceeds the number of assembly_gap features.");
391         *drop = 1;
392     }
393 }
394 
395 /**********************************************************/
GapsToDelta(objects::CBioseq & bioseq,GapFeatsPtr gfp,unsigned char * drop)396 void GapsToDelta(objects::CBioseq& bioseq, GapFeatsPtr gfp, unsigned char* drop)
397 {
398     GapFeatsPtr  tgfp;
399 
400     const Char*  p;
401     Int4         prevto;
402     Int4         nextfrom;
403     Int4         i;
404 
405     if (gfp == NULL || !bioseq.GetInst().IsSetSeq_data())
406         return;
407 
408     const std::string& sequence = bioseq.GetInst().GetSeq_data().GetIupacna();
409     p = sequence.c_str();
410 
411     if (sequence.empty() || sequence.size() != bioseq.GetLength())
412         return;
413 
414     for(prevto = 0, tgfp = gfp; tgfp != NULL; tgfp = tgfp->next)
415     {
416         if(tgfp->next != NULL)
417         {
418             p = sequence.c_str() + tgfp->to;
419             for(i = tgfp->to + 1; i < tgfp->next->from; p++, i++)
420                 if(*p != 'N')
421                     break;
422             if(i == tgfp->next->from && tgfp->next->from > tgfp->to + 1)
423             {
424                 ErrPostEx(SEV_ERROR, ERR_FEATURE_AllNsBetweenGaps,
425                           "A run of all-N sequence exists between the gap features located at \"%d..%d\" and \"%d..%d\".",
426                           tgfp->from, tgfp->to, tgfp->next->from,
427                           tgfp->next->to);
428                 tgfp->rightNs = true;
429                 tgfp->next->leftNs = true;
430             }
431             nextfrom = tgfp->next->from;
432         }
433         else
434             nextfrom = bioseq.GetLength() + 1;
435 
436         if(tgfp->leftNs == false && tgfp->from - prevto > 10)
437         {
438             for (p = sequence.c_str() + tgfp->from - 11, i = 0; i < 10; p++, i++)
439                 if(*p != 'N')
440                     break;
441             if(i == 10)
442             {
443                 ErrPostEx(SEV_WARNING, ERR_FEATURE_NsAbutGap,
444                           "A run of N's greater or equal than 10 abuts the gap feature at \"%d..%d\" : possible problem with the boundaries of the gap.",
445                           tgfp->from, tgfp->to);
446             }
447         }
448 
449         if(tgfp->rightNs == false && nextfrom - tgfp->to > 10)
450         {
451             for (p = sequence.c_str() + tgfp->to, i = 0; i < 10; p++, i++)
452                 if(*p != 'N')
453                     break;
454             if(i == 10)
455             {
456                 ErrPostEx(SEV_WARNING, ERR_FEATURE_NsAbutGap,
457                           "A run of N's greater or equal than 10 abuts the gap feature at \"%d..%d\" : possible problem with the boundaries of the gap.",
458                           tgfp->from, tgfp->to);
459             }
460         }
461 
462         for (i = tgfp->from - 1, p = sequence.c_str() + i; i < tgfp->to; p++, i++)
463             if(*p != 'N')
464                 break;
465         if(i < tgfp->to)
466         {
467             ErrPostEx(SEV_REJECT, ERR_FEATURE_InvalidGapSequence,
468                       "The sequence data associated with the gap feature at \"%d..%d\" contains basepairs other than N.",
469                       tgfp->from, tgfp->to);
470             *drop = 1;
471         }
472 
473         prevto = tgfp->to;
474     }
475 
476     if (*drop != 0)
477         return;
478 
479     objects::CDelta_ext::Tdata deltas;
480 
481     for (prevto = 0, tgfp = gfp;; tgfp = tgfp->next)
482     {
483         Int4 len = 0;
484 
485         CRef<objects::CDelta_seq> delta(new objects::CDelta_seq);
486         if (tgfp->from - prevto - 1 > 0)
487         {
488             len = tgfp->from - prevto - 1;
489             delta->SetLiteral().SetLength(len);
490             delta->SetLiteral().SetSeq_data().SetIupacna().Set() = sequence.substr(prevto, len);
491 
492             deltas.push_back(delta);
493 
494             delta.Reset(new objects::CDelta_seq);
495         }
496 
497         len = tgfp->to - tgfp->from + 1;
498         delta->SetLiteral().SetLength(len);
499         if(tgfp->estimated_length == -100)
500         {
501             delta->SetLiteral().SetFuzz().SetLim();
502         }
503         else if(tgfp->estimated_length != len)
504         {
505             delta->SetLiteral().SetFuzz().SetRange().SetMin(tgfp->estimated_length);
506             delta->SetLiteral().SetFuzz().SetRange().SetMax(len);
507         }
508 
509         if (tgfp->assembly_gap)
510             CreateSeqGap(delta->SetLiteral(), tgfp);
511 
512         deltas.push_back(delta);
513 
514         prevto = tgfp->to;
515 
516         if(tgfp->next == NULL)
517         {
518             if (bioseq.GetLength() - prevto > 0)
519             {
520                 delta.Reset(new objects::CDelta_seq);
521 
522                 len = bioseq.GetLength() - prevto;
523                 delta->SetLiteral().SetLength(len);
524                 delta->SetLiteral().SetSeq_data().SetIupacna().Set() = sequence.substr(prevto, len);
525 
526                 deltas.push_back(delta);
527             }
528             break;
529         }
530     }
531 
532     if (!deltas.empty())
533     {
534         bioseq.SetInst().SetExt().SetDelta().Set().swap(deltas);
535         bioseq.SetInst().SetRepr(objects::CSeq_inst::eRepr_delta);
536         bioseq.SetInst().ResetSeq_data();
537     }
538 }
539 
540 /**********************************************************/
SeqToDelta(objects::CBioseq & bioseq,Int2 tech)541 void SeqToDelta(objects::CBioseq& bioseq, Int2 tech)
542 {
543     char*  p;
544     char*  q;
545     char*  r;
546 
547     Int4         i;
548     Int4         j;
549     Int4         gotcha;
550 
551     if (!bioseq.GetInst().IsSetSeq_data())
552         return;
553 
554     const std::string& sequence = bioseq.GetInst().GetSeq_data().GetIupacna();
555     if (sequence.empty() || sequence.size() != bioseq.GetLength())
556         return;
557 
558     vector<Char> buf(sequence.begin(), sequence.end());
559     buf.push_back(0);
560     p = &buf[0];
561     gotcha = 0;
562 
563     objects::CDelta_ext::Tdata deltas;
564 
565     for (q = p; *p != '\0';)
566     {
567         if(*p != 'N')
568         {
569             p++;
570             continue;
571         }
572 
573         for(r = p, p++, i = 1; *p == 'N'; i++)
574             p++;
575         if(i < HTG_GAP)
576         {
577             if(i >= SHORT_GAP && gotcha == 0)
578                 gotcha = 1;
579             continue;
580         }
581 
582         CRef<objects::CDelta_seq> delta(new objects::CDelta_seq);
583         gotcha = 2;
584 
585         if(r != q)
586         {
587             *r = '\0';
588             j = (Int4) (r - q);
589 
590             delta->SetLiteral().SetLength(j);
591             delta->SetLiteral().SetSeq_data().SetIupacna().Set(std::string(q, r));
592 
593             deltas.push_back(delta);
594 
595             delta.Reset(new objects::CDelta_seq);
596 
597             *r = 'N';
598         }
599 
600         delta->SetLiteral().SetLength(i);
601         if (i == 100)
602         {
603             delta->SetLiteral().SetFuzz().SetLim();
604         }
605 
606         deltas.push_back(delta);
607         q = p;
608     }
609 
610     if(p > q)
611     {
612         j = (Int4) (p - q);
613 
614         CRef<objects::CDelta_seq> delta(new objects::CDelta_seq);
615         delta->SetLiteral().SetLength(j);
616         delta->SetLiteral().SetSeq_data().SetIupacna().Set(std::string(q, p));
617 
618         deltas.push_back(delta);
619     }
620 
621     if (deltas.size() > 1)
622     {
623         bioseq.SetInst().SetExt().SetDelta().Set().swap(deltas);
624         bioseq.SetInst().SetRepr(objects::CSeq_inst::eRepr_delta);
625         bioseq.SetInst().ResetSeq_data();
626     }
627 
628     if (bioseq.GetInst().GetRepr() != objects::CSeq_inst::eRepr_delta && tech == 1)
629     {
630         ErrPostEx(SEV_WARNING, ERR_SEQUENCE_HTGWithoutGaps,
631                   "This Phase 1 HTG sequence has no runs of 100 "
632                   "or more N's to indicate gaps between component contigs. "
633                   "This could be an error, or perhaps sequencing is finished "
634                   "and this record should not be Phase 1.");
635     }
636 
637     if (bioseq.GetInst().GetRepr() == objects::CSeq_inst::eRepr_delta)
638     {
639         if(tech == 4)                   /* Phase 0 */
640             ErrPostEx(SEV_WARNING, ERR_SEQUENCE_HTGPhaseZeroHasGap,
641                       "A Phase 0 HTG record usually consists of several reads "
642                       "for one contig, and hence gaps are not expected. But "
643                       "this record does have one (ore more) gaps, hence it "
644                       "may require review.");
645         if(gotcha == 1)
646             ErrPostEx(SEV_WARNING, ERR_SEQUENCE_HTGPossibleShortGap,
647                       "This sequence has one or more runs "
648                       "of at least 20 N's. They could indicate gaps, "
649                       "but have not been treated that way because "
650                       "they are below the minimum of 100 N's.");
651     }
652 }
653 
654 /**********************************************************/
fta_ranges_to_hist(const objects::CGB_block::TExtra_accessions & extra_accs)655 static bool fta_ranges_to_hist(const objects::CGB_block::TExtra_accessions& extra_accs)
656 {
657     std::string ppacc1;
658     std::string ppacc2;
659     char*     master;
660     char*     range;
661     char*     acc1;
662     char*     acc2;
663     char*     p;
664     char*     q;
665     Char        ch1;
666     Char        ch2;
667     Int4        i;
668 
669     if(extra_accs.empty())
670         return false;
671 
672     if(extra_accs.size() != 2)
673         return true;
674 
675     objects::CGB_block::TExtra_accessions::const_iterator it = extra_accs.begin();
676 
677     ppacc1 = *it;
678     ++it;
679     ppacc2 = *it;
680     acc1 = (char*) ppacc1.c_str();
681     acc2 = (char*) ppacc2.c_str();
682 
683 
684     if(acc1 == NULL && acc2 == NULL)
685         return false;
686     if(acc1 == NULL || acc2 == NULL)
687         return true;
688 
689     p = StringChr(acc1, '-');
690     q = StringChr(acc2, '-');
691 
692     if((p == NULL && q == NULL) || (p != NULL && q != NULL))
693         return true;
694 
695     if(p == NULL)
696     {
697         master = acc1;
698         range = acc2;
699         *q = '\0';
700     }
701     else
702     {
703         master = acc2;
704         range = acc1;
705         *p = '\0';
706     }
707 
708     if(fta_if_wgs_acc(master) != 0 || fta_if_wgs_acc(range) != 1)
709     {
710         if(p != NULL)
711             *p = '-';
712         if(q != NULL)
713             *q = '-';
714         return true;
715     }
716 
717     if(p != NULL)
718         *p = '-';
719     if(q != NULL)
720         *q = '-';
721 
722     for(p = master; *p != '\0' && (*p < '0' || *p > '9');)
723         p++;
724     if(*p != '\0')
725         p++;
726     if(*p != '\0')
727         p++;
728     ch1 = *p;
729     *p = '\0';
730 
731     for(q = range; *q != '\0' && (*q < '0' || *q > '9');)
732         q++;
733     if(*q != '\0')
734         q++;
735     if(*q != '\0')
736         q++;
737     ch2 = *q;
738     *q = '\0';
739 
740     i = StringCmp(master, range);
741     *p = ch1;
742     *q = ch2;
743 
744     if(i == 0)
745         return false;
746     return true;
747 }
748 
749 
s_IsConOrScaffold(CBioseq_Handle bsh)750 static bool s_IsConOrScaffold(CBioseq_Handle bsh)
751 {
752     if (bsh &&
753         bsh.IsSetInst_Repr() &&
754         bsh.GetInst_Repr() == CSeq_inst::eRepr_delta &&
755         bsh.IsSetInst_Ext()) {
756         const auto& ext = bsh.GetInst_Ext();
757         if (ext.IsDelta() &&
758             ext.GetDelta().IsSet()) {
759             const auto& delta = ext.GetDelta().Get();
760             return any_of(begin(delta),
761                           end(delta),
762                           [](CRef<CDelta_seq> pDeltaSeq) { return (pDeltaSeq && pDeltaSeq->IsLoc()); });
763         }
764     }
765     return false;
766 }
767 
s_IsAccession(const CSeq_id & id)768 static bool s_IsAccession(const CSeq_id& id) {
769     const auto idType = id.Which();
770     switch (idType) {
771     case CSeq_id::e_Local:
772     case CSeq_id::e_General:
773     case CSeq_id::e_Gi:
774     case CSeq_id::e_Named_annot_track:
775         return false;
776     default:
777         return true;
778     }
779 }
780 
781 
g_DoesNotReferencePrimary(const CDelta_ext & delta_ext,const CSeq_id & primary,CScope & scope)782 bool g_DoesNotReferencePrimary(const CDelta_ext& delta_ext, const CSeq_id& primary, CScope& scope)
783 {
784     const auto primaryType = primary.Which();
785     string primaryString = primary.GetSeqIdString();
786     const bool primaryIsAccession = s_IsAccession(primary);
787     const bool primaryIsGi = primaryIsAccession ?
788                              false :
789                              (primaryType == CSeq_id::e_Gi);
790 
791     unique_ptr<string> pPrimaryAccessionString;
792 
793     for (const auto& pDeltaSeq : delta_ext.Get()) {
794         if (pDeltaSeq && pDeltaSeq->IsLoc()) {
795             auto pId = pDeltaSeq->GetLoc().GetId();
796             const auto& deltaIdType = pId->Which();
797             if (deltaIdType == primaryType) {
798                 if (pId->GetSeqIdString() == primaryString) {
799                     return false;
800                 }
801             }
802             else {
803                 if (primaryIsAccession && deltaIdType == CSeq_id::e_Gi) {
804                     auto deltaHandle = CSeq_id_Handle::GetHandle(pId->GetGi());
805                     auto deltaAccessionHandle = scope.GetAccVer(deltaHandle);
806                     if (!deltaAccessionHandle) {
807                         return false;
808                     }
809 
810                     if (deltaAccessionHandle.GetSeqId()->GetSeqIdString() ==
811                         primaryString) {
812                         return false;
813                     }
814                 }
815                 else
816                 if (primaryIsGi && s_IsAccession(*pId)) {
817                     if (!pPrimaryAccessionString) {
818                         auto primaryGiHandle = CSeq_id_Handle::GetHandle(primary.GetGi());
819                         auto primaryAccessionHandle = scope.GetAccVer(primaryGiHandle);
820                         if (!primaryAccessionHandle) {
821                             return false;
822                         }
823                         pPrimaryAccessionString =
824                             make_unique<string>(primaryAccessionHandle.GetSeqId()->GetSeqIdString());
825                     }
826 
827                     if (*pPrimaryAccessionString == pId->GetSeqIdString()) {
828                         return false;
829                     }
830                 }
831             }
832         }
833     }
834     return true;
835 }
836 
837 
sGetPrefixLength(const CTempString & accession)838 static int sGetPrefixLength(const CTempString& accession)
839 {
840     auto it = find_if(begin(accession),
841                       end(accession),
842                       [](char c) { return !(isalpha(c) || c == '_'); });
843 
844     _ASSERT(it != accession.end());
845     return distance(accession.begin(), it);
846 }
847 
848 
849 /**********************************************************/
fta_add_hist(ParserPtr pp,objects::CBioseq & bioseq,objects::CGB_block::TExtra_accessions & extra_accs,Parser::ESource source,Int4 acctype,bool pricon,char * acc)850 void fta_add_hist(ParserPtr pp, objects::CBioseq& bioseq, objects::CGB_block::TExtra_accessions& extra_accs, Parser::ESource source,
851                   Int4 acctype, bool pricon, char* acc)
852 {
853     IndexblkPtr  ibp;
854 
855     Int4         pri_acc;
856     Int4         sec_acc;
857 
858     if(pp->accver == false || pp->histacc == false ||
859        pp->source != source || pp->entrez_fetch == 0)
860         return;
861 
862     if (!fta_ranges_to_hist(extra_accs))
863         return;
864 
865     objects::CGB_block::TExtra_accessions hist;
866     UnwrapAccessionRange(extra_accs, hist);
867     if (hist.empty())
868         return;
869 
870     ibp = pp->entrylist[pp->curindx];
871 
872     pri_acc = fta_if_wgs_acc(acc);
873 
874     CTempString primaryAccession(acc);
875     int prefixLength=0;
876 
877     list<CRef<CSeq_id>> replaces;
878 
879     for (const auto& accessionString : hist) {
880         if (accessionString.empty())
881             continue;
882 
883         const auto idChoice = GetNucAccOwner(accessionString.c_str(), ibp->is_tpa);
884         if (idChoice == CSeq_id::e_not_set) {
885             continue;
886         }
887         sec_acc = fta_if_wgs_acc(accessionString.c_str());
888         if(sec_acc == 0) { // Project WGS accession
889             continue;
890         }
891 
892         if (sec_acc == 1) // Contig WGS accession
893         {
894             if (pri_acc == 0 || pri_acc == 2) { // A project WGS accession or
895                 continue;                       // a scaffold WGS accession
896             }
897 
898             if (pri_acc == 1) { // Contig WGS accession
899                 if (!prefixLength) {
900                     prefixLength = sGetPrefixLength(primaryAccession);
901                 }
902 
903                 if ( (accessionString.length() <= prefixLength ||
904                      !NStr::EqualNocase(accessionString, 0, prefixLength, primaryAccession.substr(0,prefixLength)) ||
905                      !isdigit(accessionString[prefixLength])) &&
906                     !pp->allow_uwsec ) {
907                     continue;
908                 }
909             }
910         }
911 
912         CRef<CSeq_id> id(new CSeq_id(idChoice, accessionString));
913         auto secondaryBsh = GetScope().GetBioseqHandle(*id);
914         bool IsConOrScaffold=false;
915         try {
916             IsConOrScaffold = s_IsConOrScaffold(secondaryBsh);
917         }
918         catch (...) {
919             ErrPostEx(SEV_ERROR, ERR_ACCESSION_CannotGetDivForSecondary,
920                 "Failed to determine division code for secondary accession \"%s\". Entry dropped.",
921                 accessionString.c_str());
922             continue;
923         }
924 
925         if (!IsConOrScaffold && pricon && idChoice == acctype) {
926             continue;
927         }
928 
929         if (IsConOrScaffold && !pricon) {
930             CRef<CSeq_id> pPrimary(new CSeq_id(primaryAccession));
931             if (g_DoesNotReferencePrimary(secondaryBsh.GetInst_Ext().GetDelta(),
932                         *pPrimary,
933                         GetScope())) {
934                 replaces.push_back(id);
935             }
936             continue;
937         }
938 
939         replaces.push_back(id);
940     }
941 
942 
943     if (!replaces.empty()) {
944         auto& hist_replaces_ids = bioseq.SetInst().SetHist().SetReplaces().SetIds();
945         hist_replaces_ids.splice(hist_replaces_ids.end(), replaces);
946     }
947 }
948 
949 /**********************************************************/
fta_strings_same(const char * s1,const char * s2)950 bool fta_strings_same(const char* s1, const char* s2)
951 {
952     if(s1 == NULL && s2 == NULL)
953         return true;
954     if(s1 == NULL || s2 == NULL || StringCmp(s1, s2) != 0)
955         return false;
956     return true;
957 }
958 
959 /**********************************************************/
fta_check_htg_kwds(TKeywordList & kwds,IndexblkPtr ibp,objects::CMolInfo & mol_info)960 bool fta_check_htg_kwds(TKeywordList& kwds, IndexblkPtr ibp, objects::CMolInfo& mol_info)
961 {
962     bool deldiv = false;
963 
964     for (TKeywordList::iterator key = kwds.begin(); key != kwds.end();)
965     {
966         bool delnode = false;
967         bool errpost = false;
968         if(*key == "HTGS_PHASE0")
969         {
970             if(ibp->htg != 0 && ibp->htg != 5)
971             {
972                 delnode = true;
973                 if(ibp->htg == 1 || ibp->htg == 2 || ibp->htg == 3)
974                     errpost = true;
975             }
976             else
977             {
978                 ibp->htg = 4;
979                 mol_info.SetTech(objects::CMolInfo::eTech_htgs_0);
980             }
981             deldiv = true;
982         }
983         else if (*key == "HTGS_PHASE1")
984         {
985             if(ibp->htg != 0 && ibp->htg != 5)
986             {
987                 delnode = true;
988                 if(ibp->htg == 2 || ibp->htg == 3 || ibp->htg == 4)
989                     errpost = true;
990             }
991             else
992             {
993                 ibp->htg = 1;
994                 mol_info.SetTech(objects::CMolInfo::eTech_htgs_1);
995             }
996             deldiv = true;
997         }
998         else if (*key == "HTGS_PHASE2")
999         {
1000             if(ibp->htg != 0 && ibp->htg != 5)
1001             {
1002                 delnode = true;
1003                 if(ibp->htg == 1 || ibp->htg == 3 || ibp->htg == 4)
1004                     errpost = true;
1005             }
1006             else
1007             {
1008                 ibp->htg = 2;
1009                 mol_info.SetTech(objects::CMolInfo::eTech_htgs_2);
1010             }
1011             deldiv = true;
1012         }
1013         else if (*key == "HTGS_PHASE3")
1014         {
1015             if(ibp->htg != 0 && ibp->htg != 5)
1016             {
1017                 delnode = true;
1018                 if(ibp->htg == 1 || ibp->htg == 2 || ibp->htg == 4)
1019                     errpost = true;
1020             }
1021             else
1022             {
1023                 ibp->htg = 3;
1024                 mol_info.SetTech(objects::CMolInfo::eTech_htgs_3);
1025             }
1026             deldiv = true;
1027         }
1028         else if (*key == "HTG")
1029         {
1030             if(ibp->htg == 0)
1031             {
1032                 ibp->htg = 5;
1033                 mol_info.SetTech(objects::CMolInfo::eTech_htgs_3);
1034             }
1035             deldiv = true;
1036         }
1037 
1038         if(errpost)
1039         {
1040             ErrPostEx(SEV_ERROR, ERR_KEYWORD_MultipleHTGPhases,
1041                       "This entry has multiple HTG-related keywords, for differing HTG phases. Ignoring all but the first.");
1042         }
1043 
1044         if (delnode)
1045             key = kwds.erase(key);
1046         else
1047             ++key;
1048     }
1049     if(ibp->htg == 5)
1050         ibp->htg = 3;
1051 
1052     return deldiv;
1053 }
1054 
1055 /**********************************************************/
fta_check_tpa_tsa_coverage(FTATpaBlockPtr ftbp,Int4 length,bool tpa)1056 static void fta_check_tpa_tsa_coverage(FTATpaBlockPtr ftbp, Int4 length, bool tpa)
1057 {
1058     FTATpaBlockPtr tftbp;
1059     FTATpaSpanPtr  ftsp;
1060     FTATpaSpanPtr  tftsp;
1061     Int4           i1;
1062     Int4           i2;
1063     Int4           j;
1064 
1065     if(ftbp == NULL || length < 1)
1066         return;
1067 
1068     ftsp = (FTATpaSpanPtr) MemNew(sizeof(FTATpaSpan));
1069     ftsp->from = ftbp->from1;
1070     ftsp->to = ftbp->to1;
1071     ftsp->next = NULL;
1072     tftsp = ftsp;
1073     for(tftbp = ftbp; tftbp != NULL; tftbp = tftbp->next)
1074     {
1075         i1 = tftbp->to1 - tftbp->from1;
1076         i2 = tftbp->to2 - tftbp->from2;
1077         j = (i2 > i1) ? (i2 - i1) : (i1 - i2);
1078         i1++;
1079 
1080         if(i1 < 3000 && j * 10 > i1)
1081         {
1082             if(tpa)
1083                 ErrPostEx(SEV_ERROR, ERR_TPA_SpanLengthDiff,
1084                 "Span \"%d..%d\" of this TPA record differs from the span \"%d..%d\" of the contributing primary sequence or trace record by more than 10 percent.",
1085                 tftbp->from1, tftbp->to1, tftbp->from2, tftbp->to2);
1086             else
1087                 ErrPostEx(SEV_ERROR, ERR_TSA_SpanLengthDiff,
1088                           "Span \"%d..%d\" of this TSA record differs from the span \"%d..%d\" of the contributing primary sequence or trace record by more than 10 percent.",
1089                           tftbp->from1, tftbp->to1, tftbp->from2, tftbp->to2);
1090         }
1091 
1092         if(i1 >= 3000 && j > 300)
1093         {
1094             if (tpa)
1095                 ErrPostEx(SEV_ERROR, ERR_TPA_SpanDiffOver300bp,
1096                 "Span \"%d..%d\" of this TPA record differs from span \"%d..%d\" of the contributing primary sequence or trace record by more than 300 basepairs.",
1097                 tftbp->from1, tftbp->to1, tftbp->from2, tftbp->to2);
1098             else
1099                 ErrPostEx(SEV_ERROR, ERR_TSA_SpanDiffOver300bp,
1100                           "Span \"%d..%d\" of this TSA record differs from span \"%d..%d\" of the contributing primary sequence or trace record by more than 300 basepairs.",
1101                           tftbp->from1, tftbp->to1, tftbp->from2, tftbp->to2);
1102         }
1103 
1104         if(tftbp->from1 <= tftsp->to + 1)
1105         {
1106             if(tftbp->to1 > tftsp->to)
1107                 tftsp->to = tftbp->to1;
1108             continue;
1109         }
1110 
1111         tftsp->next = (FTATpaSpanPtr) MemNew(sizeof(FTATpaSpan));
1112         tftsp = tftsp->next;
1113         tftsp->from = tftbp->from1;
1114         tftsp->to = tftbp->to1;
1115         tftsp->next = NULL;
1116     }
1117 
1118     if(ftsp->from - 1 > 50)
1119     {
1120         if(tpa)
1121             ErrPostEx(SEV_ERROR, ERR_TPA_IncompleteCoverage,
1122             "This TPA record contains a sequence region \"1..%d\" greater than 50 basepairs long that is not accounted for by a contributing primary sequence or trace record.",
1123             ftsp->from - 1);
1124         else
1125             ErrPostEx(SEV_ERROR, ERR_TSA_IncompleteCoverage,
1126                       "This TSA record contains a sequence region \"1..%d\" greater than 50 basepairs long that is not accounted for by a contributing primary sequence or trace record.",
1127                       ftsp->from - 1);
1128     }
1129 
1130     for(; ftsp != NULL; ftsp = tftsp)
1131     {
1132         tftsp = ftsp->next;
1133         if(tftsp != NULL && tftsp->from - ftsp->to - 1 > 50)
1134         {
1135             if(tpa)
1136                 ErrPostEx(SEV_ERROR, ERR_TPA_IncompleteCoverage,
1137                 "This TPA record contains a sequence region \"%d..%d\" greater than 50 basepairs long that is not accounted for by a contributing primary sequence or trace record.",
1138                 ftsp->to + 1, tftsp->from - 1);
1139             else
1140                 ErrPostEx(SEV_ERROR, ERR_TSA_IncompleteCoverage,
1141                           "This TSA record contains a sequence region \"%d..%d\" greater than 50 basepairs long that is not accounted for by a contributing primary sequence or trace record.",
1142                           ftsp->to + 1, tftsp->from - 1);
1143         }
1144         else if(tftsp == NULL && length - ftsp->to > 50)
1145         {
1146             if(tpa)
1147                 ErrPostEx(SEV_ERROR, ERR_TPA_IncompleteCoverage,
1148                 "This TPA record contains a sequence region \"%d..%d\" greater than 50 basepairs long that is not accounted for by a contributing primary sequence or trace record.",
1149                 ftsp->to + 1, length);
1150             else
1151                 ErrPostEx(SEV_ERROR, ERR_TSA_IncompleteCoverage,
1152                           "This TSA record contains a sequence region \"%d..%d\" greater than 50 basepairs long that is not accounted for by a contributing primary sequence or trace record.",
1153                           ftsp->to + 1, length);
1154         }
1155 
1156         MemFree(ftsp);
1157     }
1158 }
1159 
1160 /**********************************************************/
fta_number_is_huge(const Char * s)1161 bool fta_number_is_huge(const Char* s)
1162 {
1163     size_t i = StringLen(s);
1164     if(i > 10)
1165         return true;
1166     else if(i < 10)
1167         return false;
1168 
1169     if(*s > '2')
1170         return true;
1171     else if(*s < '2')
1172         return false;
1173 
1174     if(*++s > '1')
1175         return true;
1176     else if(*s < '1')
1177         return false;
1178 
1179     if(*++s > '4')
1180         return true;
1181     else if(*s < '4')
1182         return false;
1183 
1184     if(*++s > '7')
1185         return true;
1186     else if(*s < '7')
1187         return false;
1188 
1189     if(*++s > '4')
1190         return true;
1191     else if(*s < '4')
1192         return false;
1193 
1194     if(*++s > '8')
1195         return true;
1196     else if(*s < '8')
1197         return false;
1198 
1199     if(*++s > '3')
1200         return true;
1201     else if(*s < '3')
1202         return false;
1203 
1204     if(*++s > '6')
1205         return true;
1206     else if(*s < '6')
1207         return false;
1208 
1209     if(*++s > '4')
1210         return true;
1211     else if(*s < '4')
1212         return false;
1213 
1214     if(*++s > '7')
1215         return true;
1216     return false;
1217 }
1218 
1219 /**********************************************************/
fta_parse_tpa_tsa_block(objects::CBioseq & bioseq,char * offset,char * acnum,Int2 vernum,size_t len,Int2 col_data,bool tpa)1220 bool fta_parse_tpa_tsa_block(objects::CBioseq& bioseq, char* offset, char* acnum,
1221                              Int2 vernum, size_t len, Int2 col_data, bool tpa)
1222 {
1223     FTATpaBlockPtr ftbp;
1224     FTATpaBlockPtr tftbp;
1225     FTATpaBlockPtr ft;
1226 
1227     char*        buf;
1228     char*        p;
1229     char*        q;
1230     char*        r;
1231     char*        t;
1232     char*        bad_accession;
1233     bool        bad_line;
1234     bool        bad_interval;
1235     Char           ch;
1236     Int4           from1;
1237     Int4           to1;
1238     Int4           len1;
1239     Int4           len2;
1240     Uint1          choice;
1241 
1242     if (offset == NULL || acnum == NULL || len < 2)
1243         return false;
1244 
1245     choice = GetNucAccOwner(acnum, tpa);
1246 
1247     if(col_data == 0)                   /* HACK: XML format */
1248     {
1249         for(p = offset; *p != '\0'; p++)
1250             if(*p == '~')
1251                 *p = '\n';
1252         p = StringChr(offset, '\n');
1253         if(p == NULL)
1254             return false;
1255         buf = (char*) MemNew(StringLen(p) + 1);
1256         StringCpy(buf, p + 1);
1257         StringCat(buf, "\n");
1258     }
1259     else
1260     {
1261         ch = offset[len];
1262         offset[len] = '\0';
1263         p = StringChr(offset, '\n');
1264         if(p == NULL)
1265         {
1266             offset[len] = ch;
1267             return false;
1268         }
1269         buf = StringSave(p + 1);
1270         offset[len] = ch;
1271     }
1272 
1273     ftbp = (FTATpaBlockPtr) MemNew(sizeof(FTATpaBlock));
1274 
1275     bad_line = false;
1276     bad_interval = false;
1277     bad_accession = NULL;
1278     p = buf;
1279     for(q = StringChr(p, '\n'); q != NULL; p = q + 1, q = StringChr(p, '\n'))
1280     {
1281         *q = '\0';
1282         if((Int2) StringLen(p) < col_data)
1283             break;
1284         for(p += col_data; *p == ' ';)
1285             p++;
1286         for(r = p; *p >= '0' && *p <= '9';)
1287             p++;
1288         if(*p != '-')
1289         {
1290             bad_interval = true;
1291             break;
1292         }
1293 
1294         *p++ = '\0';
1295         from1 = atoi(r);
1296 
1297         for(r = p; *p >= '0' && *p <= '9';)
1298             p++;
1299         if(*p != ' ' && *p != '\n' && *p != '\0')
1300         {
1301             bad_interval = true;
1302             break;
1303         }
1304         if(*p != '\0')
1305             *p++ = '\0';
1306         to1 = atoi(r);
1307 
1308         if(from1 >= to1)
1309         {
1310             bad_interval = true;
1311             break;
1312         }
1313 
1314         for(ft = ftbp; ft->next != NULL; ft = ft->next)
1315             if((ft->next->from1 > from1) ||
1316                (ft->next->from1 == from1 && ft->next->to1 > to1))
1317                 break;
1318         tftbp = (FTATpaBlockPtr) MemNew(sizeof(FTATpaBlock));
1319         tftbp->next = ft->next;
1320         ft->next = tftbp;
1321 
1322         tftbp->from1 = from1;
1323         tftbp->to1 = to1;
1324 
1325         while(*p == ' ')
1326             p++;
1327         for(r = p; *p != '\0' && *p != ' ' && *p != '\n';)
1328             p++;
1329         if(*p != '\0')
1330             *p++ = '\0';
1331         tftbp->accession = StringSave(r);
1332         r = StringChr(tftbp->accession, '.');
1333         if(r != NULL)
1334         {
1335             *r++ = '\0';
1336             for(t = r; *t >= '0' && *t <= '9';)
1337                 t++;
1338             if(*t != '\0')
1339             {
1340                 *--r = '.';
1341                 bad_accession = tftbp->accession;
1342                 break;
1343             }
1344             tftbp->version = atoi(r);
1345         }
1346 
1347         if(StringNICmp(tftbp->accession, "ti", 2) == 0)
1348         {
1349             for(r = tftbp->accession + 2; *r == '0';)
1350                 r++;
1351             if(*r == '\0')
1352             {
1353                 bad_accession = tftbp->accession;
1354                 break;
1355             }
1356             while(*r >= '0' && *r <= '9')
1357                 r++;
1358             if(*r != '\0')
1359             {
1360                 bad_accession = tftbp->accession;
1361                 break;
1362             }
1363         }
1364         else
1365         {
1366             tftbp->sicho = GetNucAccOwner(tftbp->accession, false);
1367             if ((tftbp->sicho != objects::CSeq_id::e_Genbank && tftbp->sicho != objects::CSeq_id::e_Embl &&
1368                 tftbp->sicho != objects::CSeq_id::e_Ddbj &&
1369                 (tftbp->sicho != objects::CSeq_id::e_Tpg || tpa == false)))
1370             {
1371                 bad_accession = tftbp->accession;
1372                 break;
1373             }
1374         }
1375 
1376         while(*p == ' ')
1377             p++;
1378 
1379         if(StringNICmp(p, "not_available", 13) == 0)
1380         {
1381             p += 13;
1382             tftbp->from2 = 1;
1383             tftbp->to2 = 1;
1384         }
1385         else
1386         {
1387             for(r = p; *p >= '0' && *p <= '9';)
1388                 p++;
1389             if(*p != '-')
1390             {
1391                 bad_interval = true;
1392                 break;
1393             }
1394             *p++ = '\0';
1395             tftbp->from2 = atoi(r);
1396 
1397             for(r = p; *p >= '0' && *p <= '9';)
1398                 p++;
1399             if(*p != ' ' && *p != '\n' && *p != '\0')
1400             {
1401                 bad_interval = true;
1402                 break;
1403             }
1404             if(*p != '\0')
1405                 *p++ = '\0';
1406             tftbp->to2 = atoi(r);
1407 
1408             if(tftbp->from2 >= tftbp->to2)
1409             {
1410                 bad_interval = true;
1411                 break;
1412             }
1413         }
1414 
1415         while(*p == ' ')
1416             p++;
1417         if(*p == 'c')
1418         {
1419             tftbp->strand = 2;
1420             for(p++; *p == ' ';)
1421                 p++;
1422         }
1423         else
1424             tftbp->strand = 1;
1425         if(*p != '\0')
1426         {
1427             bad_line = true;
1428             break;
1429         }
1430     }
1431 
1432     MemFree(buf);
1433     if (bad_line || bad_interval || bad_accession != NULL)
1434     {
1435         if(bad_interval)
1436         {
1437             if(tpa)
1438                 ErrPostEx(SEV_REJECT, ERR_TPA_InvalidPrimarySpan,
1439                 "Intervals from primary records on which a TPA record is based must be of form X-Y, where X is less than Y and both X and Y are integers. Entry dropped.");
1440             else
1441                 ErrPostEx(SEV_REJECT, ERR_TSA_InvalidPrimarySpan,
1442                           "Intervals from primary records on which a TSA record is based must be of form X-Y, where X is less than Y and both X and Y are integers. Entry dropped.");
1443         }
1444         else if(bad_accession != NULL)
1445         {
1446             if(tpa)
1447                 ErrPostEx(SEV_REJECT, ERR_TPA_InvalidPrimarySeqId,
1448                 "\"%s\" is not a GenBank/EMBL/DDBJ/Trace sequence identifier. Entry dropped.",
1449                 bad_accession);
1450             else
1451                 ErrPostEx(SEV_REJECT, ERR_TSA_InvalidPrimarySeqId,
1452                           "\"%s\" is not a GenBank/EMBL/DDBJ/Trace sequence identifier. Entry dropped.",
1453                           bad_accession);
1454         }
1455         else
1456         {
1457             if(tpa)
1458                 ErrPostEx(SEV_REJECT, ERR_TPA_InvalidPrimaryBlock,
1459                 "Supplied PRIMARY block for TPA record is incorrect. Cannot parse. Entry dropped.");
1460             else
1461                 ErrPostEx(SEV_REJECT, ERR_TSA_InvalidPrimaryBlock,
1462                           "Supplied PRIMARY block for TSA record is incorrect. Cannot parse. Entry dropped.");
1463         }
1464 
1465         if(ftbp != NULL)
1466             fta_tpa_block_free(ftbp);
1467         return false;
1468     }
1469 
1470     tftbp = ftbp->next;
1471     ftbp->next = NULL;
1472     MemFree(ftbp);
1473     ftbp = tftbp;
1474 
1475     fta_check_tpa_tsa_coverage(ftbp, bioseq.GetLength(), tpa);
1476 
1477     objects::CSeq_hist::TAssembly& assembly = bioseq.SetInst().SetHist().SetAssembly();
1478     if (!assembly.empty())
1479         assembly.clear();
1480 
1481     CRef<objects::CSeq_align> root_align(new objects::CSeq_align);
1482 
1483     root_align->SetType(objects::CSeq_align::eType_not_set);
1484     objects::CSeq_align_set& align_set = root_align->SetSegs().SetDisc();
1485 
1486     for(; tftbp != NULL; tftbp = tftbp->next)
1487     {
1488         len1 = tftbp->to1 - tftbp->from1 + 1;
1489         len2 = tftbp->to2 - tftbp->from2 + 1;
1490 
1491         CRef<objects::CSeq_align> align(new objects::CSeq_align);
1492         align->SetType(objects::CSeq_align::eType_partial);
1493         align->SetDim(2);
1494 
1495         objects::CSeq_align::C_Segs::TDenseg& seg = align->SetSegs().SetDenseg();
1496 
1497         seg.SetDim(2);
1498         seg.SetNumseg((len1 == len2) ? 1 : 2);
1499 
1500         seg.SetStarts().push_back(tftbp->from1 - 1);
1501         seg.SetStarts().push_back(tftbp->from2 - 1);
1502 
1503         if (len1 != len2)
1504         {
1505             if (len1 < len2)
1506             {
1507                 seg.SetStarts().push_back(-1);
1508                 seg.SetStarts().push_back(tftbp->from2 - 1 + len1);
1509             }
1510             else
1511             {
1512                 seg.SetStarts().push_back(tftbp->from1 - 1 + len2);
1513                 seg.SetStarts().push_back(-1);
1514             }
1515         }
1516 
1517         if (len1 == len2)
1518             seg.SetLens().push_back(len1);
1519         else if(len1 < len2)
1520         {
1521             seg.SetLens().push_back(len1);
1522             seg.SetLens().push_back(len2 - len1);
1523         }
1524         else
1525         {
1526             seg.SetLens().push_back(len2);
1527             seg.SetLens().push_back(len1 - len2);
1528         }
1529 
1530         seg.SetStrands().push_back(objects::eNa_strand_plus);
1531         seg.SetStrands().push_back(static_cast<objects::ENa_strand>(tftbp->strand));
1532 
1533         if (len1 != len2)
1534         {
1535             seg.SetStrands().push_back(objects::eNa_strand_plus);
1536             seg.SetStrands().push_back(static_cast<objects::ENa_strand>(tftbp->strand));
1537         }
1538 
1539         CRef<objects::CTextseq_id> text_id(new objects::CTextseq_id);
1540         text_id->SetAccession(acnum);
1541 
1542         if(vernum > 0)
1543             text_id->SetVersion(vernum);
1544 
1545         CRef<objects::CSeq_id> id(new objects::CSeq_id),
1546                                            aux_id;
1547         SetTextId(choice, *id, *text_id);
1548         seg.SetIds().push_back(id);
1549 
1550         if(StringNICmp(tftbp->accession, "ti", 2) == 0)
1551         {
1552             CRef<objects::CSeq_id> gen_id(new objects::CSeq_id);
1553             objects::CDbtag& tag = gen_id->SetGeneral();
1554 
1555             for(r = tftbp->accession + 2; *r == '0';)
1556                 r++;
1557             if(fta_number_is_huge(r) == false)
1558                 tag.SetTag().SetId(atoi(r));
1559             else
1560                 tag.SetTag().SetStr(r);
1561 
1562             tag.SetDb("ti");
1563             seg.SetIds().push_back(gen_id);
1564         }
1565         else
1566         {
1567             CRef<objects::CTextseq_id> otext_id(new objects::CTextseq_id);
1568             otext_id->SetAccession(tftbp->accession);
1569 
1570             if (tftbp->version > 0)
1571                 otext_id->SetVersion(tftbp->version);
1572 
1573             aux_id.Reset(new objects::CSeq_id);
1574             SetTextId(tftbp->sicho, *aux_id, *otext_id);
1575         }
1576 
1577         if (aux_id.NotEmpty())
1578             seg.SetIds().push_back(aux_id);
1579 
1580         align_set.Set().push_back(align);
1581     }
1582 
1583     assembly.push_back(root_align);
1584 
1585     if(ftbp != NULL)
1586         fta_tpa_block_free(ftbp);
1587     return true;
1588 }
1589 
1590 /**********************************************************/
StringRStr(char * where,const char * what)1591 char* StringRStr(char* where, const char *what)
1592 {
1593     if(where == NULL || what == NULL || *where == '\0' || *what == '\0')
1594         return(NULL);
1595 
1596     size_t i = StringLen(what);
1597     char* res = nullptr;
1598     for(char* p = where; *p != '\0'; p++)
1599         if(StringNCmp(p, what, i) == 0)
1600             res = p;
1601 
1602     return(res);
1603 }
1604 
1605 /**********************************************************/
fta_get_seqloc_int_whole(objects::CSeq_id & seq_id,size_t len)1606 CRef<objects::CSeq_loc> fta_get_seqloc_int_whole(objects::CSeq_id& seq_id, size_t len)
1607 {
1608     CRef<objects::CSeq_loc> ret;
1609 
1610     if (len < 1)
1611         return ret;
1612 
1613     ret.Reset(new objects::CSeq_loc);
1614     objects::CSeq_interval& interval = ret->SetInt();
1615 
1616     interval.SetFrom(0);
1617     interval.SetTo(static_cast<TSeqPos>(len) - 1);
1618     interval.SetId(seq_id);
1619 
1620     return ret;
1621 }
1622 
1623 /**********************************************************/
fta_validate_assembly(char * name)1624 static void fta_validate_assembly(char* name)
1625 {
1626     bool bad_format = false;
1627 
1628     char* p = name;
1629     if(p == NULL || *p == '\0' || StringLen(p) < 7)
1630         bad_format = true;
1631     else if(p[0] != 'G' || p[1] != 'C' || (p[2] != 'F' && p[2] != 'A') ||
1632             p[3] != '_' || p[4] < '0' || p[4] > '9')
1633         bad_format = true;
1634     else
1635     {
1636         for(p += 5; *p != '\0'; p++)
1637             if(*p < '0' || *p > '9')
1638                 break;
1639         if(*p != '.' || p[1] < '0' || p[1] > '9')
1640             bad_format = true;
1641         else
1642         {
1643             for(p++; *p != '\0'; p++)
1644                 if(*p < '0' || *p > '9')
1645                     break;
1646             if(*p != '\0')
1647                 bad_format = true;
1648         }
1649     }
1650 
1651     if(bad_format)
1652         ErrPostEx(SEV_WARNING, ERR_DBLINK_InvalidIdentifier,
1653                   "\"%s\" is not a validly formatted identifier for the Assembly resource.",
1654                   name);
1655 }
1656 
1657 /**********************************************************/
fta_validate_bioproject(char * name,Parser::ESource source)1658 static bool fta_validate_bioproject(char* name, Parser::ESource source)
1659 {
1660     char* p;
1661     bool bad_format = false;
1662 
1663     if(StringLen(name) < 6)
1664         bad_format = true;
1665     else if(name[0] != 'P' || name[1] != 'R' || name[2] != 'J' ||
1666             (name[3] != 'E' && name[3] != 'N' && name[3] != 'D') ||
1667             name[4] < 'A' || name[4] > 'Z' || name[5] < '0' || name[5] > '9')
1668         bad_format = true;
1669     else
1670     {
1671         for(p = name + 6; *p != '\0'; p++)
1672             if(*p < '0' || *p > '9')
1673                 break;
1674         if(*p != '\0')
1675             bad_format = true;
1676     }
1677 
1678     if(bad_format)
1679     {
1680         ErrPostEx(SEV_REJECT, ERR_FORMAT_InvalidBioProjectAcc,
1681                   "BioProject accession number is not validly formatted: \"%s\". Entry dropped.",
1682                   name);
1683         return false;
1684     }
1685 
1686     if((source == Parser::ESource::NCBI && name[3] != 'N') ||
1687        (source == Parser::ESource::DDBJ && name[3] != 'D' &&
1688         (name[3] != 'N' || name[4] != 'A')) ||
1689        (source == Parser::ESource::EMBL && name[3] != 'E' &&
1690         (name[3] != 'N' || name[4] != 'A')))
1691         ErrPostEx(SEV_WARNING, ERR_FORMAT_WrongBioProjectPrefix,
1692                   "BioProject accession number does not agree with this record's database of origin: \"%s\".",
1693                   name);
1694 
1695     return true;
1696 }
1697 
1698 /**********************************************************/
fta_tokenize_project(char * str,Parser::ESource source,bool newstyle)1699 static ValNodePtr fta_tokenize_project(char* str, Parser::ESource source, bool newstyle)
1700 {
1701     ValNodePtr vnp;
1702     ValNodePtr tvnp;
1703     char*    p;
1704     char*    q;
1705     char*    r;
1706     bool    bad;
1707     Char       ch;
1708 
1709     if(str == NULL || *str == '\0')
1710     {
1711         ErrPostEx(SEV_REJECT, ERR_FORMAT_InvalidBioProjectAcc,
1712                   "Empty PROJECT/PR line type supplied. Entry dropped.");
1713         return(NULL);
1714     }
1715 
1716     for(p = str; *p != '\0'; p++)
1717         if(*p == ';' || *p == ',' || *p == '\t')
1718             *p = ' ';
1719 
1720     for(p = str; *p == ' ';)
1721         p++;
1722     if(*p == '\0')
1723     {
1724         ErrPostEx(SEV_REJECT, ERR_FORMAT_InvalidBioProjectAcc,
1725                   "Empty PROJECT/PR line type supplied. Entry dropped.");
1726         return(NULL);
1727     }
1728 
1729     vnp = ValNodeNew(NULL);
1730     vnp->data.ptrvalue = NULL;
1731     vnp->next = NULL;
1732     tvnp = vnp;
1733 
1734     for(bad = false, p = str; *p != '\0';)
1735     {
1736         while(*p == ' ')
1737             p++;
1738 
1739         if(*p == '\0')
1740             break;
1741 
1742         for(q = p; *p != ' ' && *p != '\0';)
1743             p++;
1744 
1745         ch = *p;
1746         *p = '\0';
1747         if(!newstyle)
1748         {
1749             for(r = q; *r >= '0' && *r <= '9';)
1750                 r++;
1751             if(*r != '\0')
1752             {
1753                 ErrPostEx(SEV_REJECT, ERR_FORMAT_InvalidBioProjectAcc,
1754                           "BioProject accession number is not validly formatted: \"%s\". Entry dropped.",
1755                           q);
1756                 bad = true;
1757             }
1758         }
1759         else if(fta_validate_bioproject(q, source) == false)
1760             bad = true;
1761 
1762         if(bad)
1763         {
1764             *p = ch;
1765             break;
1766         }
1767 
1768         tvnp->next = ValNodeNew(NULL);
1769         tvnp = tvnp->next;
1770         tvnp->next = NULL;
1771         tvnp->data.ptrvalue = StringSave(q);
1772 
1773         *p = ch;
1774     }
1775 
1776     tvnp = vnp->next;
1777     MemFree(vnp);
1778 
1779     if(tvnp == NULL)
1780         return(NULL);
1781 
1782     if(!bad)
1783         return(tvnp);
1784 
1785     ValNodeFreeData(tvnp);
1786     return(NULL);
1787 }
1788 
1789 /**********************************************************/
fta_get_project_user_object(TSeqdescList & descrs,char * offset,Parser::EFormat format,unsigned char * drop,Parser::ESource source)1790 void fta_get_project_user_object(TSeqdescList& descrs, char* offset,
1791                                  Parser::EFormat format, unsigned char* drop,
1792                                  Parser::ESource source)
1793 {
1794     ValNodePtr    vnp;
1795     ValNodePtr    tvnp;
1796 
1797     const Char    *name;
1798 
1799     char*       str;
1800     char*       p;
1801     Char          ch;
1802     Int4          i;
1803 
1804     if(offset == NULL)
1805         return;
1806 
1807     bool newstyle = false;
1808     if(format == Parser::EFormat::GenBank)
1809     {
1810         i = ParFlat_COL_DATA;
1811         name = "GenomeProject:";
1812         ch = '\n';
1813     }
1814     else
1815     {
1816         i = ParFlat_COL_DATA_EMBL;
1817         name = "Project:";
1818         ch = ';';
1819     }
1820 
1821     size_t len = StringLen(name);
1822     str = StringSave(offset + i);
1823     p = StringChr(str, ch);
1824     if(p != NULL)
1825         *p = '\0';
1826 
1827     if(StringNCmp(str, name, len) != 0)
1828     {
1829         if(format == Parser::EFormat::GenBank)
1830         {
1831             ErrPostEx(SEV_REJECT, ERR_FORMAT_InvalidBioProjectAcc,
1832                       "PROJECT line is missing \"GenomeProject:\" tag. Entry dropped.",
1833                       str);
1834             MemFree(str);
1835             *drop = 1;
1836             return;
1837         }
1838         newstyle = true;
1839         len = 0;
1840     }
1841     else if(format == Parser::EFormat::EMBL && str[len] == 'P')
1842         newstyle = true;
1843 
1844     vnp = fta_tokenize_project(str + len, source, newstyle);
1845     if(vnp == NULL)
1846     {
1847         *drop = 1;
1848         MemFree(str);
1849         return;
1850     }
1851 
1852     objects::CUser_object* user_obj_ptr;
1853     bool got = false;
1854 
1855     NON_CONST_ITERATE(TSeqdescList, descr, descrs)
1856     {
1857         if (!(*descr)->IsUser() || !(*descr)->GetUser().IsSetData())
1858             continue;
1859 
1860         user_obj_ptr = &((*descr)->SetUser());
1861 
1862         objects::CObject_id* obj_id = nullptr;
1863         if (user_obj_ptr->IsSetType())
1864             obj_id = &(user_obj_ptr->SetType());
1865 
1866         if (obj_id != NULL && obj_id->IsStr() && obj_id->GetStr() == "DBLink")
1867         {
1868             got = true;
1869             break;
1870         }
1871     }
1872 
1873     CRef<objects::CUser_object> user_obj;
1874     if (newstyle)
1875     {
1876         for(i = 0, tvnp = vnp; tvnp != NULL; tvnp = tvnp->next)
1877             i++;
1878 
1879         if (!got)
1880         {
1881             user_obj.Reset(new objects::CUser_object);
1882             user_obj_ptr = user_obj.GetNCPointer();
1883 
1884             objects::CObject_id& id = user_obj_ptr->SetType();
1885             id.SetStr("DBLink");
1886         }
1887 
1888         CRef<objects::CUser_field> user_field(new objects::CUser_field);
1889         user_field->SetLabel().SetStr("BioProject");
1890         user_field->SetNum(i);
1891 
1892         for(tvnp = vnp, i = 0; tvnp != NULL; tvnp = tvnp->next)
1893             user_field->SetData().SetStrs().push_back((char*)tvnp->data.ptrvalue);
1894 
1895         user_obj_ptr->SetData().push_back(user_field);
1896     }
1897     else
1898     {
1899         got = false;
1900 
1901         user_obj.Reset(new objects::CUser_object);
1902         user_obj_ptr = user_obj.GetNCPointer();
1903 
1904         objects::CObject_id& id = user_obj_ptr->SetType();
1905         id.SetStr("GenomeProjectsDB");
1906 
1907         for(tvnp = vnp; tvnp != NULL; tvnp = tvnp->next)
1908         {
1909 
1910             CRef<objects::CUser_field> user_field(new objects::CUser_field);
1911             user_field->SetLabel().SetStr("ProjectID");
1912             user_field->SetData().SetInt(atoi((char*)tvnp->data.ptrvalue));
1913             user_obj_ptr->SetData().push_back(user_field);
1914 
1915 
1916             user_field.Reset(new objects::CUser_field);
1917             user_field->SetLabel().SetStr("ParentID");
1918             user_field->SetData().SetInt(0);
1919             user_obj_ptr->SetData().push_back(user_field);
1920         }
1921     }
1922 
1923     if (!got)
1924     {
1925         CRef<objects::CSeqdesc> descr(new objects::CSeqdesc);
1926         descr->SetUser(*user_obj_ptr);
1927         descrs.push_back(descr);
1928     }
1929 
1930     MemFree(str);
1931     ValNodeFree(vnp);
1932 }
1933 
1934 /**********************************************************/
fta_if_valid_sra(const Char * id,bool dblink)1935 bool fta_if_valid_sra(const Char* id, bool dblink)
1936 {
1937     const Char* p = id;
1938 
1939     if(p != NULL && StringLen(p) > 3 &&
1940        (p[0] == 'E' || p[0] == 'S' || p[0] == 'D') && p[1] == 'R' &&
1941        (p[2] == 'A' || p[2] == 'P' || p[2] == 'R' || p[2] == 'S' ||
1942         p[2] == 'X' || p[2] == 'Z'))
1943     {
1944         for(p += 3; *p >= '0' && *p <= '9';)
1945             p++;
1946         if(*p == '\0')
1947             return true;
1948     }
1949 
1950     if(dblink)
1951         ErrPostEx(SEV_REJECT, ERR_FORMAT_IncorrectDBLINK,
1952                   "Incorrectly formatted DBLINK Sequence Read Archive value: \"%s\". Entry dropped.",
1953                   id);
1954 
1955     return false;
1956 }
1957 
1958 /**********************************************************/
fta_if_valid_biosample(const Char * id,bool dblink)1959 bool fta_if_valid_biosample(const Char* id, bool dblink)
1960 {
1961     const Char* p = id;
1962 
1963     if(p != NULL && StringLen(p) > 5 && p[0] == 'S' && p[1] == 'A' &&
1964        p[2] == 'M' && (p[3] == 'N' || p[3] == 'E' || p[3] == 'D'))
1965     {
1966         if(p[4] == 'A' || p[4] == 'G')
1967             p += 5;
1968         else
1969             p += 4;
1970         while(*p >= '0' && *p <= '9')
1971             p++;
1972         if(*p == '\0')
1973             return true;
1974     }
1975 
1976     if(dblink)
1977         ErrPostEx(SEV_REJECT, ERR_FORMAT_IncorrectDBLINK,
1978                   "Incorrectly formatted DBLINK BioSample value: \"%s\". Entry dropped.",
1979                   id);
1980 
1981     return false;
1982 }
1983 
1984 /**********************************************************/
fta_tokenize_dblink(char * str,Parser::ESource source)1985 static ValNodePtr fta_tokenize_dblink(char* str, Parser::ESource source)
1986 {
1987     ValNodePtr vnp;
1988     ValNodePtr tvnp;
1989     ValNodePtr uvnp;
1990     ValNodePtr tagvnp;
1991 
1992     bool    got_nl;
1993     bool    bad;
1994     bool    sra;
1995     bool    assembly;
1996     bool    biosample;
1997     bool    bioproject;
1998 
1999     char*    p;
2000     char*    q;
2001     char*    r = NULL;
2002     char*    t;
2003     char*    u;
2004     Char       ch;
2005 
2006     if(str == NULL || *str == '\0')
2007     {
2008         ErrPostEx(SEV_REJECT, ERR_FORMAT_IncorrectDBLINK,
2009                   "Empty DBLINK line type supplied. Entry dropped.");
2010         return(NULL);
2011     }
2012 
2013     for(p = str; *p != '\0'; p++)
2014         if(*p == ';' || *p == '\t')
2015             *p = ' ';
2016 
2017     vnp = ValNodeNew(NULL);
2018     vnp->data.ptrvalue = NULL;
2019     tvnp = vnp;
2020     bad = false;
2021     got_nl = true;
2022     sra = false;
2023     assembly = false;
2024     biosample = false;
2025     bioproject = false;
2026     tagvnp = NULL;
2027     for(p = str; *p != '\0'; got_nl = false)
2028     {
2029         while(*p == ' ' || *p == '\n' || *p == ':' || *p == ',')
2030         {
2031             if(*p == '\n')
2032                got_nl = true;
2033             p++;
2034         }
2035 
2036         if(got_nl)
2037         {
2038             t = StringChr(p, ':');
2039             if(t != NULL)
2040             {
2041                 r = StringChr(p, '\n');
2042                 u = StringChr(p, ',');
2043 
2044                 if((u == NULL || u > t) && (r == NULL || r > t))
2045                 {
2046                     ch = *++t;
2047                     *t = '\0';
2048 
2049                     if(StringCmp(p, "Project:") != 0 &&
2050                        StringCmp(p, "Assembly:") != 0 &&
2051                        StringCmp(p, "BioSample:") != 0 &&
2052                        StringCmp(p, "BioProject:") != 0 &&
2053                        StringCmp(p, "Sequence Read Archive:") != 0 &&
2054                        StringCmp(p, "Trace Assembly Archive:") != 0)
2055                     {
2056                         ErrPostEx(SEV_REJECT, ERR_FORMAT_IncorrectDBLINK,
2057                                   "Invalid DBLINK tag encountered: \"%s\". Entry dropped.", p);
2058                         bad = true;
2059                         break;
2060                     }
2061 
2062                     bioproject = (StringCmp(p, "BioProject:") == 0);
2063                     sra = (StringCmp(p, "Sequence Read Archive:") == 0);
2064                     biosample = (StringCmp(p, "BioSample:") == 0);
2065                     assembly = (StringCmp(p, "Assembly:") == 0);
2066 
2067                     if(tvnp->data.ptrvalue != NULL &&
2068                        StringChr((char*) tvnp->data.ptrvalue, ':') != NULL)
2069                     {
2070                         ErrPostEx(SEV_REJECT, ERR_FORMAT_IncorrectDBLINK,
2071                                   "Found DBLINK tag with no value: \"%s\". Entry dropped.",
2072                                   tvnp->data.ptrvalue);
2073                         bad = true;
2074                         break;
2075                     }
2076 
2077                     for(uvnp = vnp->next; uvnp != NULL; uvnp = uvnp->next)
2078                         if(StringCmp((char*) uvnp->data.ptrvalue, p) == 0)
2079                     {
2080                         ErrPostEx(SEV_REJECT, ERR_FORMAT_IncorrectDBLINK,
2081                                   "Multiple DBLINK tags found: \"%s\". Entry dropped.",
2082                                   p);
2083                         bad = true;
2084                         break;
2085                     }
2086                     if(bad)
2087                         break;
2088 
2089                     tvnp->next = ValNodeNew(NULL);
2090                     tvnp = tvnp->next;
2091                     tvnp->next = NULL;
2092                     tvnp->data.ptrvalue = StringSave(p);
2093                     tagvnp = tvnp;
2094                     *t = ch;
2095                     p = t;
2096                     continue;
2097                 }
2098             }
2099         }
2100 
2101         q = p;
2102         while(*p != ',' && *p != '\n' && *p != ':' && *p != '\0')
2103             p++;
2104         if(*p == ':')
2105         {
2106             while(*p != '\0' && *p != '\n')
2107                 p++;
2108             ch = *p;
2109             *p = '\0';
2110             while(*r != '\n' && r > str)
2111                 r--;
2112             while(*r == ' ' || *r == '\n')
2113                 r++;
2114             ErrPostEx(SEV_REJECT, ERR_FORMAT_IncorrectDBLINK,
2115                       "Too many delimiters/fields for DBLINK line: \"%s\". Entry dropped.",
2116                       r);
2117             *p = ch;
2118             bad = true;
2119             break;
2120         }
2121 
2122         if(q == p)
2123             continue;
2124 
2125         ch = *p;
2126         *p = '\0';
2127 
2128         if(tagvnp != NULL && tagvnp->data.ptrvalue != NULL)
2129         {
2130             for(uvnp = tagvnp->next; uvnp != NULL; uvnp = uvnp->next)
2131             {
2132                 if(uvnp->data.ptrvalue == NULL ||
2133                    StringCmp((char*) uvnp->data.ptrvalue, q) != 0)
2134                     continue;
2135 
2136                 ErrPostEx(SEV_WARNING, ERR_DBLINK_DuplicateIdentifierRemoved,
2137                           "Duplicate identifier \"%s\" from \"%s\" link removed.",
2138                           q, (char*) tagvnp->data.ptrvalue);
2139                 break;
2140             }
2141 
2142             if(uvnp != NULL)
2143             {
2144                 *p = ch;
2145                 continue;
2146             }
2147         }
2148 
2149         if((bioproject &&
2150             fta_validate_bioproject(q, source) == false) ||
2151            (biosample && fta_if_valid_biosample(q, true) == false) ||
2152            (sra && fta_if_valid_sra(q, true) == false))
2153         {
2154             *p = ch;
2155             bad = true;
2156         }
2157 
2158         if(assembly)
2159            fta_validate_assembly(q);
2160 
2161         tvnp->next = ValNodeNew(NULL);
2162         tvnp = tvnp->next;
2163         tvnp->next = NULL;
2164         tvnp->data.ptrvalue = StringSave(q);
2165         *p = ch;
2166     }
2167 
2168     if(!bad && tvnp->data.ptrvalue != NULL &&
2169        StringChr((char*) tvnp->data.ptrvalue, ':') != NULL)
2170     {
2171         ErrPostEx(SEV_REJECT, ERR_FORMAT_IncorrectDBLINK,
2172                   "Found DBLINK tag with no value: \"%s\". Entry dropped.",
2173                   tvnp->data.ptrvalue);
2174         bad = true;
2175     }
2176 
2177     tvnp = vnp->next;
2178     MemFree(vnp);
2179 
2180     if(tvnp == NULL)
2181         return(NULL);
2182 
2183     if(!bad)
2184         return(tvnp);
2185 
2186     ValNodeFreeData(tvnp);
2187     return(NULL);
2188 }
2189 
2190 /**********************************************************/
fta_get_dblink_user_object(TSeqdescList & descrs,char * offset,size_t len,Parser::ESource source,unsigned char * drop,CRef<objects::CUser_object> & dbuop)2191 void fta_get_dblink_user_object(TSeqdescList& descrs, char* offset,
2192                                 size_t len, Parser::ESource source, unsigned char* drop,
2193                                 CRef<objects::CUser_object>& dbuop)
2194 {
2195     ValNodePtr    vnp;
2196     ValNodePtr    tvnp;
2197     ValNodePtr    uvnp;
2198 
2199     char*       str;
2200     Int4          i;
2201 
2202     if(offset == NULL)
2203         return;
2204 
2205     str = StringSave(offset + ParFlat_COL_DATA);
2206     str[len-ParFlat_COL_DATA] = '\0';
2207     vnp = fta_tokenize_dblink(str, source);
2208     MemFree(str);
2209 
2210     if(vnp == NULL)
2211     {
2212         *drop = 1;
2213         return;
2214     }
2215 
2216     CRef<objects::CUser_object> user_obj;
2217     CRef<objects::CUser_field> user_field;
2218 
2219     for (tvnp = vnp; tvnp != NULL; tvnp = tvnp->next)
2220     {
2221         if(StringChr((char*) tvnp->data.ptrvalue, ':') != NULL)
2222         {
2223             if (user_obj.NotEmpty())
2224                 break;
2225 
2226             if(StringCmp((char*) tvnp->data.ptrvalue, "Project:") == 0)
2227             {
2228                 user_obj.Reset(new objects::CUser_object);
2229                 objects::CObject_id& id = user_obj->SetType();
2230 
2231                 id.SetStr("GenomeProjectsDB");
2232             }
2233             continue;
2234         }
2235 
2236         if (user_obj.Empty())
2237             continue;
2238 
2239         str = (char*) tvnp->data.ptrvalue;
2240         if(str == NULL || *str == '\0')
2241             continue;
2242 
2243         if(*str != '0')
2244             while(*str >= '0' && *str <= '9')
2245                 str++;
2246         if(*str != '\0')
2247         {
2248             ErrPostEx(SEV_ERROR, ERR_FORMAT_IncorrectDBLINK,
2249                       "Skipping invalid \"Project:\" value on the DBLINK line: \"%s\".",
2250                       tvnp->data.ptrvalue);
2251             continue;
2252         }
2253 
2254         user_field.Reset(new objects::CUser_field);
2255 
2256         user_field->SetLabel().SetStr("ProjectID");
2257         user_field->SetData().SetInt(atoi((char*)tvnp->data.ptrvalue));
2258         user_obj->SetData().push_back(user_field);
2259 
2260         user_field.Reset(new objects::CUser_field);
2261         user_field->SetLabel().SetStr("ParentID");
2262         user_field->SetData().SetInt(0);
2263 
2264         user_obj->SetData().push_back(user_field);
2265     }
2266 
2267     if (user_obj.NotEmpty() && !user_obj->IsSetData())
2268     {
2269         user_obj.Reset();
2270     }
2271 
2272     if (user_obj.NotEmpty())
2273     {
2274         CRef<objects::CSeqdesc> descr(new objects::CSeqdesc);
2275         descr->SetUser(*user_obj);
2276         descrs.push_back(descr);
2277     }
2278 
2279     user_obj.Reset();
2280     user_field.Reset();
2281 
2282     bool inpr = false;
2283     for (tvnp = vnp; tvnp != NULL; tvnp = tvnp->next)
2284     {
2285         if(StringChr((char*) tvnp->data.ptrvalue, ':') != NULL)
2286         {
2287             if(StringCmp((char*) tvnp->data.ptrvalue, "Project:") == 0)
2288             {
2289                 inpr = true;
2290                 continue;
2291             }
2292 
2293             inpr = false;
2294 
2295             if (user_obj.Empty())
2296             {
2297                 user_obj.Reset(new objects::CUser_object);
2298                 user_obj->SetType().SetStr("DBLink");
2299             }
2300 
2301             for(i = 0, uvnp = tvnp->next; uvnp != NULL; uvnp = uvnp->next, i++)
2302                 if(StringChr((char*) uvnp->data.ptrvalue, ':') != NULL)
2303                     break;
2304 
2305             user_field.Reset(new objects::CUser_field);
2306 
2307             std::string lstr((char*)tvnp->data.ptrvalue);
2308             lstr = lstr.substr(0, lstr.size() - 1);
2309             user_field->SetLabel().SetStr(lstr);
2310             user_field->SetNum(i);
2311             user_field->SetData().SetStrs();
2312 
2313             user_obj->SetData().push_back(user_field);
2314 
2315             i = 0;
2316         }
2317         else if (!inpr && user_obj.NotEmpty())
2318         {
2319             user_field->SetData().SetStrs().push_back((char*)tvnp->data.ptrvalue);
2320         }
2321     }
2322 
2323     ValNodeFreeData(vnp);
2324 
2325     if (user_obj.NotEmpty())
2326     {
2327         CRef<objects::CSeqdesc> descr(new objects::CSeqdesc);
2328         descr->SetUser(*user_obj);
2329         descrs.push_back(descr);
2330 
2331         dbuop = user_obj;
2332     }
2333 }
2334 
2335 /**********************************************************/
fta_check_con_for_wgs(objects::CBioseq & bioseq)2336 Uint1 fta_check_con_for_wgs(objects::CBioseq& bioseq)
2337 {
2338     if (bioseq.GetInst().GetRepr() != objects::CSeq_inst::eRepr_delta || !bioseq.GetInst().IsSetExt() || !bioseq.GetInst().GetExt().IsDelta())
2339         return objects::CMolInfo::eTech_unknown;
2340 
2341     bool good = false;
2342     bool finished = true;
2343 
2344     ITERATE(objects::CDelta_ext::Tdata, delta, bioseq.GetInst().GetExt().GetDelta().Get())
2345     {
2346         if (!(*delta)->IsLoc())
2347             continue;
2348 
2349         const objects::CSeq_loc& locs = (*delta)->GetLoc();
2350         objects::CSeq_loc_CI ci(locs);
2351 
2352         for (; ci; ++ci)
2353         {
2354             const objects::CSeq_id* id = nullptr;
2355 
2356             CConstRef<objects::CSeq_loc> loc = ci.GetRangeAsSeq_loc();
2357             if (loc->IsEmpty() || loc->IsWhole() || loc->IsInt() || loc->IsPnt() || loc->IsPacked_pnt())
2358                 id = &ci.GetSeq_id();
2359             else
2360                 continue;
2361 
2362             if (id == nullptr)
2363                 break;
2364 
2365             if (!id->IsGenbank() && !id->IsEmbl() &&
2366                !id->IsOther() && !id->IsDdbj() &&
2367                !id->IsTpg() && !id->IsTpe() && !id->IsTpd())
2368                 break;
2369 
2370             const objects::CTextseq_id* text_id = id->GetTextseq_Id();
2371             if (text_id == nullptr || !text_id->IsSetAccession() ||
2372                text_id->GetAccession().empty() ||
2373                fta_if_wgs_acc(text_id->GetAccession().c_str()) != 1)
2374                 break;
2375             good = true;
2376         }
2377 
2378         if (ci)
2379         {
2380             finished = false;
2381             break;
2382         }
2383     }
2384 
2385     if (good && finished)
2386         return objects::CMolInfo::eTech_wgs;
2387 
2388     return objects::CMolInfo::eTech_unknown;
2389 }
2390 
2391 /**********************************************************/
fta_fix_seq_id(objects::CSeq_loc & loc,objects::CSeq_id & id,IndexblkPtr ibp,char * location,char * name,SeqLocIdsPtr slip,bool iscon,Parser::ESource source)2392 static void fta_fix_seq_id(objects::CSeq_loc& loc, objects::CSeq_id& id, IndexblkPtr ibp,
2393                            char* location, char* name, SeqLocIdsPtr slip,
2394                            bool iscon, Parser::ESource source)
2395 {
2396     Uint1        accowner;
2397     Int4         i;
2398     Char         ch;
2399 
2400     if (ibp == NULL)
2401         return;
2402 
2403     if (id.IsLocal()) {
2404         return;
2405     }
2406 
2407     if(name == NULL && id.IsGeneral())
2408     {
2409         const objects::CDbtag& tag = id.GetGeneral();
2410         if (tag.GetDb() == "SeqLit" || tag.GetDb() == "UnkSeqLit")
2411             return;
2412     }
2413 
2414     if (!id.IsGenbank() && !id.IsEmbl() && !id.IsPir() &&
2415         !id.IsSwissprot() && !id.IsOther() && !id.IsDdbj() && !id.IsPrf() &&
2416         !id.IsTpg() && !id.IsTpe() && !id.IsTpd())
2417     {
2418         if(StringLen(location) > 50)
2419         {
2420             ch = location[50];
2421             location[50] = '\0';
2422         }
2423         else
2424             ch = '\0';
2425 
2426         if(name == NULL)
2427             ErrPostEx(SEV_REJECT, ERR_LOCATION_SeqIdProblem,
2428                       "Empty or unsupported Seq-id found in CONTIG/CO line at location: \"%s\". Entry skipped.",
2429                       location);
2430         else
2431             ErrPostEx(SEV_REJECT, ERR_LOCATION_SeqIdProblem,
2432                       "Empty or unsupported Seq-id found in feature \"%s\" at location \"%s\". Entry skipped.",
2433                       name, location);
2434         if(ch != '\0')
2435             location[50] = ch;
2436         ibp->drop = 1;
2437         return;
2438     }
2439 
2440     const objects::CTextseq_id* text_id = id.GetTextseq_Id();
2441     if (text_id == NULL || !text_id->IsSetAccession())
2442     {
2443         if(StringLen(location) > 50)
2444         {
2445             ch = location[50];
2446             location[50] = '\0';
2447         }
2448         else
2449             ch = '\0';
2450         if(name == NULL)
2451             ErrPostEx(SEV_REJECT, ERR_LOCATION_SeqIdProblem,
2452                       "Empty Seq-id found in CONTIG/CO line at location: \"%s\". Entry skipped.",
2453                       location);
2454         else
2455             ErrPostEx(SEV_REJECT, ERR_LOCATION_SeqIdProblem,
2456                       "Empty Seq-id found in feature \"%s\" at location \"%s\". Entry skipped.",
2457                       name, location);
2458         if(ch != '\0')
2459             location[50] = ch;
2460         ibp->drop = 1;
2461         return;
2462     }
2463 
2464     const Char* accession = text_id->GetAccession().c_str();
2465     if(iscon)
2466     {
2467         i = IsNewAccessFormat(accession);
2468         if(i == 3)
2469         {
2470             if(slip->wgscont == NULL)
2471                 slip->wgscont = accession;
2472             else if(slip->wgsacc == NULL &&
2473                     StringNCmp(slip->wgscont, accession, 4) != 0)
2474                     slip->wgsacc = accession;
2475         }
2476         else if(i == 7)
2477         {
2478             if(slip->wgsscaf == NULL)
2479                 slip->wgsscaf = accession;
2480             else if(slip->wgsacc == NULL &&
2481                     StringNCmp(slip->wgsscaf, accession, 4) != 0)
2482                     slip->wgsacc = accession;
2483         }
2484     }
2485 
2486     accowner = GetNucAccOwner(accession, ibp->is_tpa);
2487     if(accowner == 0)
2488         accowner = GetProtAccOwner(accession);
2489 
2490     if (accowner != 0)
2491     {
2492         if (accowner != id.Which())
2493         {
2494             CRef<objects::CTextseq_id> new_text_id(new objects::CTextseq_id);
2495             new_text_id->Assign(*text_id);
2496             SetTextId(accowner, id, *new_text_id);
2497         }
2498     }
2499 
2500     else if(source == Parser::ESource::Flybase)
2501     {
2502         std::string acc(accession);
2503         id.SetGeneral().SetDb("FlyBase");
2504         id.SetGeneral().SetTag().SetStr(acc);
2505     }
2506     else if(source == Parser::ESource::USPTO)
2507     {
2508         CRef<objects::CPatent_seq_id> pat_id = MakeUsptoPatSeqId((char *) accession);
2509         id.SetPatent(*pat_id);
2510     }
2511     else
2512     {
2513         if(StringLen(location) > 50)
2514         {
2515             ch = location[50];
2516             location[50] = '\0';
2517         }
2518         else
2519             ch = '\0';
2520         if(name == NULL)
2521             ErrPostEx(SEV_REJECT, ERR_LOCATION_SeqIdProblem,
2522                       "Invalid accession found in CONTIG/CO line at location: \"%s\". Entry skipped.",
2523                       location);
2524         else
2525             ErrPostEx(SEV_REJECT, ERR_LOCATION_SeqIdProblem,
2526                       "Invalid accession found in feature \"%s\" at location \"%s\". Entry skipped.",
2527                       name, location);
2528         if(ch != '\0')
2529             location[50] = ch;
2530         ibp->drop = 1;
2531         return;
2532     }
2533 
2534     slip->total++;
2535 
2536     if (id.IsGenbank())
2537     {
2538         if(source != Parser::ESource::NCBI && source != Parser::ESource::All &&
2539            source != Parser::ESource::LANL && slip->badslp == nullptr)
2540             slip->badslp = &loc;
2541         slip->genbank = 1;
2542     }
2543     else if(id.IsEmbl())
2544     {
2545         if(source != Parser::ESource::EMBL && source != Parser::ESource::All &&
2546            slip->badslp == nullptr)
2547            slip->badslp = &loc;
2548         slip->embl = 1;
2549     }
2550     else if(id.IsPir())
2551     {
2552         if(source != Parser::ESource::PIR && source != Parser::ESource::All &&
2553            slip->badslp == nullptr)
2554            slip->badslp = &loc;
2555         slip->pir = 1;
2556     }
2557     else if(id.IsSwissprot())
2558     {
2559         if(source != Parser::ESource::SPROT && source != Parser::ESource::All &&
2560            slip->badslp == nullptr)
2561            slip->badslp = &loc;
2562         slip->swissprot = 1;
2563     }
2564     else if(id.IsOther())
2565     {
2566         if(source != Parser::ESource::Refseq && source != Parser::ESource::All &&
2567            slip->badslp == nullptr)
2568            slip->badslp = &loc;
2569         slip->other = 1;
2570     }
2571     else if(id.IsDdbj())
2572     {
2573         if(source != Parser::ESource::DDBJ && source != Parser::ESource::All &&
2574            slip->badslp == nullptr)
2575            slip->badslp = &loc;
2576         slip->ddbj = 1;
2577     }
2578     else if(id.IsPrf())
2579     {
2580         if(source != Parser::ESource::PRF && source != Parser::ESource::All &&
2581            slip->badslp == nullptr)
2582            slip->badslp = &loc;
2583         slip->prf = 1;
2584     }
2585     else if(id.IsTpg())
2586     {
2587         if(source != Parser::ESource::NCBI && source != Parser::ESource::All &&
2588            source != Parser::ESource::LANL && slip->badslp == nullptr)
2589            slip->badslp = &loc;
2590         slip->tpg = 1;
2591     }
2592     else if (id.IsTpe())
2593     {
2594         if(source != Parser::ESource::EMBL && source != Parser::ESource::All &&
2595            slip->badslp == nullptr)
2596            slip->badslp = &loc;
2597         slip->tpe = 1;
2598     }
2599     else if (id.IsTpd())
2600     {
2601         if(source != Parser::ESource::DDBJ && source != Parser::ESource::All &&
2602            slip->badslp == nullptr)
2603            slip->badslp = &loc;
2604         slip->tpd = 1;
2605     }
2606 }
2607 
2608 /**********************************************************/
fta_do_fix_seq_loc_id(TSeqLocList & locs,IndexblkPtr ibp,char * location,char * name,SeqLocIdsPtr slip,bool iscon,Parser::ESource source)2609 static void fta_do_fix_seq_loc_id(TSeqLocList& locs, IndexblkPtr ibp,
2610                                   char* location, char* name,
2611                                   SeqLocIdsPtr slip, bool iscon, Parser::ESource source)
2612 {
2613     NON_CONST_ITERATE(TSeqLocList, loc, locs)
2614     {
2615         if ((*loc)->IsEmpty())
2616         {
2617             fta_fix_seq_id(*(*loc), (*loc)->SetEmpty(), ibp,
2618                            location, name, slip, iscon, source);
2619         }
2620         else if ((*loc)->IsWhole())
2621         {
2622             fta_fix_seq_id(*(*loc), (*loc)->SetWhole(), ibp,
2623                            location, name, slip, iscon, source);
2624         }
2625         else if ((*loc)->IsInt())
2626         {
2627             fta_fix_seq_id(*(*loc), (*loc)->SetInt().SetId(), ibp, location, name, slip, iscon, source);
2628         }
2629         else if ((*loc)->IsPnt())
2630         {
2631             fta_fix_seq_id(*(*loc), (*loc)->SetPnt().SetId(), ibp, location, name, slip, iscon, source);
2632             if (iscon && !(*loc)->GetPnt().IsSetFuzz())
2633             {
2634                 int point = (*loc)->GetPnt().GetPoint();
2635                 CRef<objects::CSeq_interval> interval(new objects::CSeq_interval);
2636                 interval->SetFrom(point);
2637                 interval->SetTo(point);
2638 
2639                 if ((*loc)->GetPnt().IsSetStrand())
2640                     interval->SetStrand((*loc)->GetPnt().GetStrand());
2641 
2642                 interval->SetId((*loc)->SetPnt().SetId());
2643                 (*loc)->SetInt(*interval);
2644             }
2645         }
2646         else if ((*loc)->IsPacked_int())
2647         {
2648             NON_CONST_ITERATE(objects::CPacked_seqint::Tdata, interval, (*loc)->SetPacked_int().Set())
2649             {
2650                 fta_fix_seq_id(*(*loc), (*interval)->SetId(), ibp, location, name, slip, iscon, source);
2651             }
2652         }
2653         else if ((*loc)->IsPacked_pnt())
2654         {
2655             fta_fix_seq_id(*(*loc), (*loc)->SetPacked_pnt().SetId(), ibp, location, name, slip, iscon, source);
2656         }
2657         else if ((*loc)->IsMix())
2658         {
2659             fta_do_fix_seq_loc_id((*loc)->SetMix().Set(), ibp, location, name, slip, iscon, source);
2660         }
2661         else if ((*loc)->IsEquiv())
2662         {
2663             fta_do_fix_seq_loc_id((*loc)->SetEquiv().Set(), ibp,
2664                                   location, name, slip, iscon, source);
2665         }
2666     }
2667 }
2668 
2669 /**********************************************************/
fta_fix_seq_loc_id(TSeqLocList & locs,ParserPtr pp,char * location,char * name,bool iscon)2670 Int4 fta_fix_seq_loc_id(TSeqLocList& locs, ParserPtr pp, char* location,
2671                         char* name, bool iscon)
2672 {
2673     SeqLocIds   sli;
2674     const Char  *p = NULL;
2675     ErrSev      sev;
2676     IndexblkPtr ibp;
2677     Char        ch;
2678     Int4        tpa;
2679     Int4        non_tpa;
2680     Int4        i = 0;
2681 
2682     ibp = pp->entrylist[pp->curindx];
2683 
2684     MemSet(&sli, 0, sizeof(SeqLocIds));
2685     fta_do_fix_seq_loc_id(locs, ibp, location, name, &sli, iscon, pp->source);
2686 
2687     tpa = sli.tpg + sli.tpe + sli.tpd;
2688     non_tpa = sli.genbank + sli.embl + sli.pir + sli.swissprot + sli.other +
2689               sli.ddbj + sli.prf;
2690 
2691     if(iscon && sli.wgsacc == NULL && sli.wgscont != NULL &&
2692        sli.wgsscaf != NULL && StringNCmp(sli.wgscont, sli.wgsscaf, 4) != 0)
2693         sli.wgsacc = sli.wgsscaf;
2694 
2695     ch = '\0';
2696     if((tpa > 0 && non_tpa > 0) || tpa > 1 || non_tpa > 1 ||
2697        (iscon && sli.wgscont != NULL && sli.wgsscaf != NULL))
2698     {
2699         if(StringLen(location) > 50)
2700         {
2701             ch = location[50];
2702             location[50] = '\0';
2703         }
2704     }
2705 
2706     if(tpa > 0 && non_tpa > 0)
2707     {
2708         if(name == NULL)
2709             ErrPostEx(SEV_REJECT, ERR_LOCATION_TpaAndNonTpa,
2710                       "The CONTIG/CO line with location \"%s\" refers to intervals on both primary and third-party sequence records. Entry skipped.",
2711                       location);
2712         else
2713             ErrPostEx(SEV_REJECT, ERR_LOCATION_TpaAndNonTpa,
2714                       "The \"%s\" feature at \"%s\" refers to intervals on both primary and third-party sequence records. Entry skipped.",
2715                       name, location);
2716         ibp->drop = 1;
2717     }
2718 
2719     if(tpa > 1 || non_tpa > 1)
2720     {
2721         if (!pp->allow_crossdb_featloc)
2722         {
2723             sev = SEV_REJECT;
2724             p = (char*) "Entry skipped.";
2725             ibp->drop = 1;
2726         }
2727         else
2728         {
2729             sev = SEV_WARNING;
2730             p = (char*) "";
2731         }
2732         if(name == NULL)
2733         {
2734             std::string label;
2735             if (sli.badslp != nullptr)
2736                 sli.badslp->GetLabel(&label);
2737 
2738             ErrPostEx(sev, ERR_LOCATION_CrossDatabaseFeatLoc,
2739                       "The CONTIG/CO line refers to intervals on records from two or more INSDC databases. This is not allowed without review and approval : \"%s\".%s",
2740                       label.empty() ? location : label.c_str(), p);
2741         }
2742         else
2743             ErrPostEx(sev, ERR_LOCATION_CrossDatabaseFeatLoc,
2744                       "The \"%s\" feature at \"%s\" refers to intervals on records from two or more INSDC databases. This is not allowed without review and approval.%s",
2745                       name, location, p);
2746     }
2747 
2748     if(iscon)
2749     {
2750         if(sli.wgscont != NULL && sli.wgsscaf != NULL)
2751             ErrPostEx(SEV_ERROR, ERR_LOCATION_ContigAndScaffold,
2752                       "The CONTIG/CO line with location \"%s\" refers to intervals on both WGS contig and WGS scaffold records.",
2753                       location);
2754 
2755         if(sli.wgsacc != NULL)
2756         {
2757             if(sli.wgscont != NULL &&
2758                StringNCmp(sli.wgscont, sli.wgsacc, 4) != 0)
2759                 p = sli.wgscont;
2760             else if(sli.wgsscaf != NULL &&
2761                     StringNCmp(sli.wgsscaf, sli.wgsacc, 4) != 0)
2762                 p = sli.wgsscaf;
2763 
2764             if(p != NULL)
2765             {
2766                 Char msga[5],
2767                      msgb[5];
2768 
2769                 StringNCpy(msga, sli.wgsacc, 4);
2770                 StringNCpy(msgb, p, 4);
2771                 msga[4] = msgb[4] = 0;
2772 
2773                 ErrPostEx(SEV_WARNING, ERR_SEQUENCE_MultipleWGSProjects,
2774                           "This CON/scaffold record is assembled from the contigs of multiple WGS projects. First pair of WGS project codes is \"%s\" and \"%s\".",
2775                           msgb, msga);
2776             }
2777         }
2778 
2779         i = IsNewAccessFormat(ibp->acnum);
2780         if(i == 3 || i == 7)
2781         {
2782             p = NULL;
2783             if(sli.wgscont != NULL &&
2784                StringNCmp(sli.wgscont, ibp->acnum, 4) != 0)
2785                 p = sli.wgscont;
2786             else if(sli.wgsscaf != NULL &&
2787                     StringNCmp(sli.wgsscaf, ibp->acnum, 4) != 0)
2788                 p = sli.wgsscaf;
2789             else if(sli.wgsacc != NULL &&
2790                     StringNCmp(sli.wgsacc, ibp->acnum, 4) != 0)
2791                 p = sli.wgsscaf;
2792 
2793             if(p != NULL)
2794             {
2795                 Char msg[5];
2796                 StringNCpy(msg, p, 4);
2797                 msg[4] = 0;
2798 
2799                 ErrPostEx(SEV_WARNING, ERR_ACCESSION_WGSPrefixMismatch,
2800                           "This WGS CON/scaffold record is assembled from the contigs of different WGS projects. First differing WGS project code is \"%s\".",
2801                           msg);
2802             }
2803         }
2804     }
2805 
2806     if(ch != '\0')
2807         location[50] = ch;
2808 
2809     if(sli.wgscont != NULL)
2810         sli.wgscont = NULL;
2811     if(sli.wgsscaf != NULL)
2812         sli.wgsscaf = NULL;
2813     if(sli.wgsacc != NULL)
2814         sli.wgsacc = NULL;
2815 
2816     return(sli.total);
2817 }
2818 
2819 /**********************************************************/
fta_vnp_structured_comment(char * buf)2820 static ValNodePtr fta_vnp_structured_comment(char* buf)
2821 {
2822     ValNodePtr res;
2823     ValNodePtr vnp;
2824     char*    start;
2825     char*    p;
2826     char*    q;
2827     char*    r;
2828     bool       bad;
2829 
2830     if(buf == NULL || *buf == '\0')
2831         return(NULL);
2832 
2833     for(p = buf; *p != '\0'; p++)
2834     {
2835         if(*p != '~')
2836             continue;
2837 
2838         for(p++; *p == ' ' || *p == '~'; p++)
2839             *p = ' ';
2840         p--;
2841     }
2842 
2843     bad = false;
2844     res = ValNodeNew(NULL);
2845     vnp = res;
2846     for(start = buf, q = start;;)
2847     {
2848         p = StringStr(start, "::");
2849         if(p == NULL)
2850         {
2851             if(start == buf)
2852                 bad = true;
2853             break;
2854         }
2855 
2856         q = StringStr(p + 2, "::");
2857         if(q == NULL)
2858         {
2859             vnp->next = ValNodeNew(NULL);
2860             vnp = vnp->next;
2861             vnp->data.ptrvalue = StringSave(start);
2862             for(r = (char*) vnp->data.ptrvalue; *r != '\0'; r++)
2863                 if(*r == '~')
2864                     *r = ' ';
2865             ShrinkSpaces((char*) vnp->data.ptrvalue);
2866             break;
2867         }
2868 
2869         *q = '\0';
2870         r = StringRChr(p + 2, '~');
2871         *q = ':';
2872         if(r == NULL)
2873         {
2874             bad = true;
2875             break;
2876         }
2877 
2878         *r = '\0';
2879         vnp->next = ValNodeNew(NULL);
2880         vnp = vnp->next;
2881         vnp->data.ptrvalue = StringSave(start);
2882         *r = '~';
2883         for(p = (char*) vnp->data.ptrvalue; *p != '\0'; p++)
2884             if(*p == '~')
2885                 *p = ' ';
2886         ShrinkSpaces((char*) vnp->data.ptrvalue);
2887 
2888         start = r;
2889     }
2890 
2891     vnp = res->next;
2892     res->next = NULL;
2893     ValNodeFree(res);
2894 
2895     if(!bad)
2896         return(vnp);
2897 
2898     ValNodeFreeData(vnp);
2899     return(NULL);
2900 }
2901 
2902 /**********************************************************/
fta_build_structured_comment(char * tag,char * buf)2903 static CRef<objects::CUser_object> fta_build_structured_comment(char* tag, char* buf)
2904 {
2905     ValNodePtr    vnp;
2906     ValNodePtr    tvnp;
2907 
2908     char*       p;
2909     char*       q;
2910 
2911     CRef<objects::CUser_object> obj;
2912 
2913     if (tag == NULL || *tag == '\0' || buf == NULL || *buf == '\0')
2914         return obj;
2915 
2916     vnp = fta_vnp_structured_comment(buf);
2917     if(vnp == NULL)
2918         return obj;
2919 
2920     obj.Reset((new objects::CUser_object));
2921 
2922     objects::CObject_id& id = obj->SetType();
2923     id.SetStr("StructuredComment");
2924 
2925     CRef<objects::CUser_field> field(new objects::CUser_field);
2926     field->SetLabel().SetStr("StructuredCommentPrefix");
2927 
2928     field->SetData().SetStr() = tag;
2929     field->SetData().SetStr() += "-START##";
2930 
2931     obj->SetData().push_back(field);
2932 
2933     for(tvnp = vnp; tvnp != NULL; tvnp = tvnp->next)
2934     {
2935         p = (char*) tvnp->data.ptrvalue;
2936         if(p == NULL || *p == '\0')
2937             continue;
2938 
2939         q = StringStr(p, "::");
2940         if(q == NULL)
2941             continue;
2942 
2943         if(q > p && *(q - 1) == ' ')
2944             q--;
2945 
2946         for(*q++ = '\0'; *q == ' ' || *q == ':';)
2947             q++;
2948 
2949         if(*p == '\0' || *q == '\0')
2950             continue;
2951 
2952         field.Reset(new objects::CUser_field);
2953         field->SetLabel().SetStr(p);
2954         field->SetData().SetStr(q);
2955 
2956         obj->SetData().push_back(field);
2957     }
2958 
2959     if (obj->GetData().size() < 2)
2960     {
2961         obj.Reset();
2962         return obj;
2963     }
2964 
2965     field.Reset(new objects::CUser_field);
2966     field->SetLabel().SetStr("StructuredCommentSuffix");
2967     field->SetData().SetStr() = tag;
2968     field->SetData().SetStr() += "-END##";
2969 
2970     obj->SetData().push_back(field);
2971 
2972     ValNodeFreeData(vnp);
2973 
2974     return obj;
2975 }
2976 
2977 /**********************************************************/
fta_parse_structured_comment(char * str,bool & bad,TUserObjVector & objs)2978 void fta_parse_structured_comment(char* str, bool& bad, TUserObjVector& objs)
2979 {
2980     ValNodePtr    tagvnp;
2981     ValNodePtr    vnp;
2982 
2983     char*       start;
2984     char*       tag = NULL;
2985     char*       buf;
2986     char*       p;
2987     char*       q;
2988     char*       r;
2989 
2990     if(str == NULL || *str == '\0')
2991         return;
2992 
2993     tagvnp = NULL;
2994     for(p = str;;)
2995     {
2996         p = StringStr(p, "-START##");
2997         if(p == NULL)
2998             break;
2999         for(q = p;; q--)
3000             if(*q == '~' || (*q == '#' && q > str && *--q == '#') || q == str)
3001                 break;
3002         if(q[0] != '#' || q[1] != '#')
3003         {
3004             p += 8;
3005             continue;
3006         }
3007 
3008         start = q;
3009 
3010         *p = '\0';
3011         tag = StringSave(q);
3012         *p = '-';
3013 
3014         for(q = p;;)
3015         {
3016             q = StringStr(q, tag);
3017             if(q == NULL)
3018             {
3019                 bad = true;
3020                 break;
3021             }
3022             size_t i = StringLen(tag);
3023             if(StringNCmp(q + i, "-END##", 6) != 0)
3024             {
3025                 q += (i + 6);
3026                 continue;
3027             }
3028             r = StringStr(p + 8, "-START##");
3029             if(r != NULL && r < q)
3030             {
3031                 bad = true;
3032                 break;
3033             }
3034             break;
3035         }
3036 
3037         if (bad)
3038             break;
3039 
3040         if(tagvnp == NULL)
3041         {
3042             tagvnp = ValNodeNew(NULL);
3043             tagvnp->data.ptrvalue = StringSave(tag);
3044             tagvnp->next = NULL;
3045         }
3046         else
3047         {
3048             for(vnp = tagvnp; vnp != NULL; vnp = vnp->next)
3049             {
3050                 r = (char*) vnp->data.ptrvalue;
3051                 if(StringCmp(r + 2, tag + 2) == 0)
3052                 {
3053                     if(*r != ' ')
3054                     {
3055                         ErrPostEx(SEV_ERROR, ERR_COMMENT_SameStructuredCommentTags,
3056                                   "More than one structured comment with the same tag \"%s\" found.",
3057                                   tag + 2);
3058                         *r = ' ';
3059                     }
3060                     break;
3061                 }
3062                 if(vnp->next == NULL)
3063                 {
3064                     vnp->next = ValNodeNew(NULL);
3065                     vnp->next->data.ptrvalue = StringSave(tag);
3066                     vnp->next->next = NULL;
3067                     break;
3068                 }
3069             }
3070         }
3071 
3072         if(StringCmp(tag, "##Metadata") == 0)
3073         {
3074             MemFree(tag);
3075             p += 8;
3076             continue;
3077         }
3078 
3079         *q = '\0';
3080         if(StringStr(p + 8, "::") == NULL)
3081         {
3082             ErrPostEx(SEV_ERROR, ERR_COMMENT_StructuredCommentLacksDelim,
3083                       "The structured comment in this record lacks the expected double-colon '::' delimiter between fields and values.");
3084             MemFree(tag);
3085             p += 8;
3086             *q = '#';
3087             continue;
3088         }
3089 
3090         buf = StringSave(p + 8);
3091         *q = '#';
3092 
3093         CRef<objects::CUser_object> cur = fta_build_structured_comment(tag, buf);
3094         MemFree(buf);
3095 
3096         if (cur.Empty())
3097         {
3098             bad = true;
3099             break;
3100         }
3101 
3102         objs.push_back(cur);
3103 
3104         fta_StringCpy(start, q + StringLen(tag) + 6);
3105         MemFree(tag);
3106         p = start;
3107     }
3108 
3109     if(bad)
3110     {
3111         ErrPostEx(SEV_REJECT, ERR_COMMENT_InvalidStructuredComment,
3112                   "Incorrectly formatted structured comment with tag \"%s\" encountered. Entry dropped.",
3113                   tag + 2);
3114         MemFree(tag);
3115     }
3116 
3117     if(tagvnp != NULL)
3118         ValNodeFreeData(tagvnp);
3119 }
3120 
3121 /**********************************************************/
GetQSFromFile(FILE * fd,IndexblkPtr ibp)3122 char* GetQSFromFile(FILE* fd, IndexblkPtr ibp)
3123 {
3124     char* ret;
3125     Char    buf[1024];
3126 
3127     if(fd == NULL || ibp->qslength < 1)
3128         return(NULL);
3129 
3130     ret = (char*) MemNew(ibp->qslength + 10);
3131     ret[0] = '\0';
3132     fseek(fd, static_cast<long>(ibp->qsoffset), 0);
3133     while(fgets(buf, 1023, fd) != NULL)
3134     {
3135         if(buf[0] == '>' && ret[0] != '\0')
3136             break;
3137         StringCat(ret, buf);
3138     }
3139     return(ret);
3140 }
3141 
3142 /**********************************************************/
fta_remove_cleanup_user_object(objects::CSeq_entry & seq_entry)3143 void fta_remove_cleanup_user_object(objects::CSeq_entry& seq_entry)
3144 {
3145     TSeqdescList* descrs = nullptr;
3146     if (seq_entry.IsSeq())
3147     {
3148         if (seq_entry.GetSeq().IsSetDescr())
3149             descrs = &seq_entry.SetSeq().SetDescr().Set();
3150     }
3151     else if (seq_entry.IsSet())
3152     {
3153         if (seq_entry.GetSet().IsSetDescr())
3154             descrs = &seq_entry.SetSet().SetDescr().Set();
3155     }
3156 
3157     if (descrs == nullptr)
3158         return;
3159 
3160     for (TSeqdescList::iterator descr = descrs->begin(); descr != descrs->end(); )
3161     {
3162         if (!(*descr)->IsUser())
3163         {
3164             ++descr;
3165             continue;
3166         }
3167 
3168         const objects::CUser_object& user_obj = (*descr)->GetUser();
3169         if (!user_obj.IsSetType() || !user_obj.GetType().IsStr() ||
3170             user_obj.GetType().GetStr() != "NcbiCleanup")
3171         {
3172             ++descr;
3173             continue;
3174         }
3175 
3176         descr = descrs->erase(descr);
3177         break;
3178     }
3179 }
3180 
3181 /**********************************************************/
fta_tsa_tls_comment_dblink_check(const objects::CBioseq & bioseq,bool is_tsa)3182 void fta_tsa_tls_comment_dblink_check(const objects::CBioseq& bioseq,
3183                                       bool is_tsa)
3184 {
3185     bool got_comment = false;
3186     bool got_dblink = false;
3187 
3188     ITERATE(TSeqdescList, descr, bioseq.GetDescr().Get())
3189     {
3190         if (!(*descr)->IsUser())
3191             continue;
3192 
3193         const objects::CUser_object& user_obj = (*descr)->GetUser();
3194         if (!user_obj.IsSetType() || !user_obj.GetType().IsStr())
3195             continue;
3196 
3197         const std::string& user_type_str = user_obj.GetType().GetStr();
3198 
3199         if (user_type_str == "StructuredComment")
3200             got_comment = true;
3201         else if (user_type_str == "GenomeProjectsDB")
3202             got_dblink = true;
3203         else if (user_type_str == "DBLink")
3204         {
3205             ITERATE(objects::CUser_object::TData, field, user_obj.GetData())
3206             {
3207                 if (!(*field)->IsSetLabel() || !(*field)->GetLabel().IsStr() ||
3208                     (*field)->GetLabel().GetStr() != "BioProject")
3209                     continue;
3210                 got_dblink = true;
3211                 break;
3212             }
3213         }
3214     }
3215 
3216     if(!is_tsa)
3217     {
3218         if(!got_comment)
3219             ErrPostEx(SEV_WARNING, ERR_ENTRY_TLSLacksStructuredComment,
3220                       "This TLS record lacks an expected structured comment.");
3221         if(!got_dblink)
3222             ErrPostEx(SEV_WARNING, ERR_ENTRY_TLSLacksBioProjectLink,
3223                       "This TLS record lacks an expected BioProject or Project link.");
3224     }
3225     else
3226     {
3227         if(!got_comment)
3228             ErrPostEx(SEV_WARNING, ERR_ENTRY_TSALacksStructuredComment,
3229                       "This TSA record lacks an expected structured comment.");
3230         if(!got_dblink)
3231             ErrPostEx(SEV_WARNING, ERR_ENTRY_TSALacksBioProjectLink,
3232                       "This TSA record lacks an expected BioProject or Project link.");
3233     }
3234 }
3235 
3236 /**********************************************************/
fta_set_molinfo_completeness(objects::CBioseq & bioseq,IndexblkPtr ibp)3237 void fta_set_molinfo_completeness(objects::CBioseq& bioseq, IndexblkPtr ibp)
3238 {
3239     if (bioseq.GetInst().GetTopology() != 2 || (ibp != NULL && ibp->gaps != NULL))
3240         return;
3241 
3242     objects::CMolInfo* mol_info = nullptr;
3243     NON_CONST_ITERATE(TSeqdescList, descr, bioseq.SetDescr().Set())
3244     {
3245         if ((*descr)->IsMolinfo())
3246         {
3247             mol_info = &(*descr)->SetMolinfo();
3248             break;
3249         }
3250     }
3251 
3252     if (mol_info != nullptr)
3253     {
3254         mol_info->SetCompleteness(1);
3255     }
3256     else
3257     {
3258         CRef<objects::CSeqdesc> descr(new objects::CSeqdesc);
3259         objects::CMolInfo& mol = descr->SetMolinfo();
3260         mol.SetCompleteness(1);
3261 
3262         bioseq.SetDescr().Set().push_back(descr);
3263     }
3264 }
3265 
3266 /**********************************************************/
fta_create_far_fetch_policy_user_object(objects::CBioseq & bsp,Int4 num)3267 void fta_create_far_fetch_policy_user_object(objects::CBioseq& bsp, Int4 num)
3268 {
3269     if (num < 1000)
3270         return;
3271 
3272     ErrPostEx(SEV_INFO, ERR_SEQUENCE_HasManyComponents,
3273               "An OnlyNearFeatures FeatureFetchPolicy User-object has been added to this record because it is constructed from %d components, which exceeds the threshold of 999 for User-object creation.",
3274               num);
3275 
3276     CRef<objects::CSeqdesc> descr(new objects::CSeqdesc);
3277     descr->SetUser().SetType().SetStr("FeatureFetchPolicy");
3278 
3279     CRef<objects::CUser_field> field(new objects::CUser_field);
3280 
3281     field->SetLabel().SetStr("Policy");
3282     field->SetData().SetStr("OnlyNearFeatures");
3283 
3284     descr->SetUser().SetData().push_back(field);
3285 
3286     bsp.SetDescr().Set().push_back(descr);
3287 }
3288 
3289 /**********************************************************/
StripECO(char * str)3290 void StripECO(char* str)
3291 {
3292     char* p;
3293     char* q;
3294 
3295     if(str == NULL || *str == '\0')
3296         return;
3297 
3298     p = StringStr(str, "{ECO:");
3299     if(p == NULL)
3300         return;
3301 
3302     for(;;)
3303     {
3304         q = StringChr(p + 1, '}');
3305         if(q == NULL)
3306             break;
3307         if(p > str && *(p - 1) == ' ')
3308             p--;
3309         if(p > str)
3310             if((*(p - 1) == '.' && q[1] == '.') ||
3311                (*(p - 1) == ';' && q[1] == ';'))
3312                 p--;
3313         fta_StringCpy(p, q + 1);
3314         p = StringStr(p, "{ECO:");
3315         if(p == NULL)
3316             break;
3317     }
3318 }
3319 
3320 /**********************************************************/
fta_dblink_has_sra(const CRef<objects::CUser_object> & uop)3321 bool fta_dblink_has_sra(const CRef<objects::CUser_object>& uop)
3322 {
3323     if (uop.Empty() || !uop->IsSetData() || !uop->IsSetType() ||
3324         !uop->GetType().IsStr() || uop->GetType().GetStr() != "DBLink")
3325         return false;
3326 
3327     bool got = false;
3328 
3329     ITERATE(objects::CUser_object::TData, field, uop->GetData())
3330     {
3331         if (!(*field)->IsSetData() || !(*field)->GetData().IsStrs() || !(*field)->IsSetNum() || (*field)->GetNum() < 1 ||
3332             !(*field)->IsSetLabel() || !(*field)->GetLabel().IsStr() || (*field)->GetLabel().GetStr() != "Sequence Read Archive")
3333             continue;
3334 
3335         ITERATE(objects::CUser_field::C_Data::TStrs, str, (*field)->GetData().GetStrs())
3336         {
3337             if (str->size() > 2 &&
3338                 ((*str)[0] == 'D' || (*str)[0] == 'E' || (*str)[0] == 'S') && (*str)[1] == 'R' &&
3339                 ((*str)[2] == 'R' || (*str)[2] == 'X' || (*str)[2] == 'Z'))
3340             {
3341                 got = true;
3342                 break;
3343             }
3344         }
3345         if(got)
3346             break;
3347     }
3348     return(got);
3349 }
3350 
3351 END_NCBI_SCOPE
3352