1 /* add.c
2 *
3 * ===========================================================================
4 *
5 * PUBLIC DOMAIN NOTICE
6 * National Center for Biotechnology Information
7 *
8 * This software/database is a "United States Government Work" under the
9 * terms of the United States Copyright Act. It was written as part of
10 * the author's official duties as a United States Government employee and
11 * thus cannot be copyrighted. This software/database is freely available
12 * to the public for use. The National Library of Medicine and the U.S.
13 * Government have not placed any restriction on its use or reproduction.
14 *
15 * Although all reasonable efforts have been taken to ensure the accuracy
16 * and reliability of the software and data, the NLM and the U.S.
17 * Government do not and cannot warrant the performance or results that
18 * may be obtained by using this software or data. The NLM and the U.S.
19 * Government disclaim all warranties, express or implied, including
20 * warranties of performance, merchantability or fitness for any particular
21 * purpose.
22 *
23 * Please cite the author in any work or product based on this material.
24 *
25 * ===========================================================================
26 *
27 * File Name: add.c
28 *
29 * Author: Karl Sirotkin, Hsiu-Chuan Chen
30 *
31 * File Description:
32 * -----------------
33 * Additional parser functions.
34 *
35 */
36 #include <ncbi_pch.hpp>
37
38 #include "ftacpp.hpp"
39 #include <objects/seq/Seq_gap.hpp>
40 #include <objects/general/User_object.hpp>
41 #include <objects/general/User_field.hpp>
42 #include <objects/general/Object_id.hpp>
43 #include <objects/seq/Seq_descr.hpp>
44 #include <objects/seqloc/Seq_interval.hpp>
45 #include <objects/seq/MolInfo.hpp>
46 #include <objects/seq/Seq_inst.hpp>
47 #include <objects/seq/Seq_ext.hpp>
48 #include <objects/seq/Seq_hist.hpp>
49 #include <objects/seq/Seq_hist_rec.hpp>
50 #include <objects/seqalign/Seq_align.hpp>
51 #include <objects/seqalign/Dense_seg.hpp>
52 #include <objects/general/Dbtag.hpp>
53 #include <objects/seqalign/Seq_align_set.hpp>
54 #include <objects/seq/Seq_annot.hpp>
55 #include <objects/seqfeat/Imp_feat.hpp>
56 #include <objects/seq/seqport_util.hpp>
57 #include <objects/seq/Delta_ext.hpp>
58 #include <objects/seq/Delta_seq.hpp>
59 #include <objects/seq/Seq_literal.hpp>
60 #include <objects/seqloc/Seq_point.hpp>
61 #include <objects/seqloc/Seq_loc_equiv.hpp>
62 #include <objects/seqset/Bioseq_set.hpp>
63 #include <objects/seq/seq_id_handle.hpp>
64
65 #include "index.h"
66 #include "genbank.h" /* for ParFlat_FEATURES */
67 #include "embl.h" /* for ParFlat_FH */
68
69 #include <objtools/flatfile/flatdefn.h>
70 #include "ftanet.h"
71
72 #include "ftaerr.hpp"
73 #include "indx_blk.h"
74 #include "asci_blk.h"
75 #include "utilfun.h"
76
77 #ifdef THIS_FILE
78 # undef THIS_FILE
79 #endif
80 #define THIS_FILE "add.cpp"
81
82 #define HTG_GAP 100
83 #define SHORT_GAP 20
84
85 BEGIN_NCBI_SCOPE
86 USING_SCOPE(objects);
87
88 typedef struct _seq_loc_ids {
89 objects::CSeq_loc* badslp;
90 const Char* wgsacc;
91 const Char* wgscont;
92 const Char* wgsscaf;
93 Int4 genbank;
94 Int4 embl;
95 Int4 pir;
96 Int4 swissprot;
97 Int4 other;
98 Int4 ddbj;
99 Int4 prf;
100 Int4 tpg;
101 Int4 tpe;
102 Int4 tpd;
103 Int4 total;
104 } SeqLocIds, *SeqLocIdsPtr;
105
106 typedef struct _fta_tpa_block {
107 Int4 from1;
108 Int4 to1;
109 char* accession;
110 Int4 version;
111 Int4 from2;
112 Int4 to2;
113 Uint1 strand;
114 Uint1 sicho; /* SeqId choice */
115 struct _fta_tpa_block* next;
116 } FTATpaBlock, *FTATpaBlockPtr;
117
118 typedef struct _fta_tpa_span {
119 Int4 from;
120 Int4 to;
121 struct _fta_tpa_span* next;
122 } FTATpaSpan, *FTATpaSpanPtr;
123
124 /**********************************************************/
fta_tpa_block_free(FTATpaBlockPtr ftbp)125 static void fta_tpa_block_free(FTATpaBlockPtr ftbp)
126 {
127 FTATpaBlockPtr next;
128
129 for(; ftbp != NULL; ftbp = next)
130 {
131 next = ftbp->next;
132 if(ftbp->accession != NULL)
133 MemFree(ftbp->accession);
134 MemFree(ftbp);
135 }
136 }
137
138 /**********************************************************
139 *
140 * char* tata_save(str):
141 *
 *      Deletes spaces from the beginning and the end and
143 * returns Nlm_StringSave.
144 *
145 **********************************************************/
tata_save(char * str)146 char* tata_save(char* str)
147 {
148 char* s;
149 char* ss;
150
151 if(str == NULL)
152 return(NULL);
153
154 while(isspace((int) *str) != 0 || *str == ',')
155 str++;
156 for(s = str; *s != '\0'; s++)
157 {
158 if(*s != '\n')
159 continue;
160
161 for(ss = s + 1; isspace((int) *ss) != 0;)
162 ss++;
163 *s = ' ';
164 fta_StringCpy(s + 1, ss);
165 }
166 s = str + StringLen(str) - 1;
167 while(s >= str && (*s == ' ' || *s == ';' || *s == ',' || *s == '\"' ||
168 *s == '\t'))
169 *s-- = '\0';
170
171 if(*str == '\0')
172 return(NULL);
173
174 return(StringSave(str));
175 }
176
177 /**********************************************************/
no_date(Parser::EFormat format,const TSeqdescList & descrs)178 bool no_date(Parser::EFormat format, const TSeqdescList& descrs)
179 {
180 bool no_create = true;
181 bool no_update = true;
182
183 ITERATE(TSeqdescList, desc, descrs)
184 {
185 if ((*desc)->IsCreate_date())
186 no_create = false;
187 else if ((*desc)->IsUpdate_date())
188 no_update = false;
189
190 if (no_create == false && no_update == false)
191 break;
192 }
193
194 if(format == Parser::EFormat::GenBank)
195 return(no_update);
196
197 return(no_create || no_update);
198 }
199
200 /**********************************************************
201 *
202 * bool no_reference(bsp):
203 *
 *      Search for at least one reference in bioseq->descr
205 * or in bioseq->annot.
206 * If no reference return TRUE.
207 *
208 **********************************************************/
no_reference(const objects::CBioseq & bioseq)209 bool no_reference(const objects::CBioseq& bioseq)
210 {
211 ITERATE(TSeqdescList, desc, bioseq.GetDescr().Get())
212 {
213 if ((*desc)->IsPub())
214 return false;
215 }
216
217 ITERATE(objects::CBioseq::TAnnot, annot, bioseq.GetAnnot())
218 {
219 if (!(*annot)->IsFtable())
220 continue;
221
222 ITERATE(objects::CSeq_annot::C_Data::TFtable, feat, (*annot)->GetData().GetFtable())
223 {
224 if ((*feat)->IsSetData() && (*feat)->GetData().IsPub())
225 return false;
226 }
227
228 ITERATE(objects::CSeq_annot::C_Data::TFtable, feat, (*annot)->GetData().GetFtable())
229 {
230 if (!(*feat)->IsSetData() || !(*feat)->GetData().IsImp())
231 continue;
232
233 const objects::CImp_feat& imp = (*feat)->GetData().GetImp();
234 if (imp.GetKey() == "Site-ref")
235 {
236 ErrPostStr(SEV_ERROR, ERR_REFERENCE_Illegalreference,
237 "The entry has only 'sites' references");
238 return false;
239 }
240 }
241 }
242
243 return true;
244 }
245
246 /**********************************************************
247 *
248 * bool check_cds(entry, format):
249 *
250 * Returns TRUE if CDS is in the entry.
251 *
252 **********************************************************/
check_cds(DataBlkPtr entry,Parser::EFormat format)253 bool check_cds(DataBlkPtr entry, Parser::EFormat format)
254 {
255 DataBlkPtr temp;
256 DataBlkPtr dbp;
257 const char *str;
258 char* p;
259 Char ch;
260 Int2 type;
261
262 if(format == Parser::EFormat::EMBL)
263 {
264 type = ParFlat_FH;
265 str = "\nFT CDS ";
266 }
267 else if(format == Parser::EFormat::GenBank)
268 {
269 type = ParFlat_FEATURES;
270 str = "\n CDS ";
271 }
272 else
273 return false;
274
275 for(temp = TrackNodeType(entry, type); temp != NULL; temp = temp->next)
276 {
277 if(temp->type != type)
278 continue;
279
280 size_t len = 0;
281 for(dbp = (DataBlkPtr) temp->data; dbp != NULL; dbp = dbp->next)
282 len += dbp->len;
283 if(len == 0)
284 continue;
285
286 dbp = (DataBlkPtr) temp->data;
287 ch = dbp->offset[len];
288 dbp->offset[len] = '\0';
289 p = StringStr(dbp->offset, str);
290 dbp->offset[len] = ch;
291
292 if(p != NULL)
293 break;
294 }
295
296 if(temp == NULL)
297 return false;
298 return true;
299 }
300
301 /**********************************************************/
err_install(IndexblkPtr ibp,bool accver)302 void err_install(IndexblkPtr ibp, bool accver)
303 {
304 Char temp[200];
305
306 FtaInstallPrefix(PREFIX_LOCUS, ibp->locusname, NULL);
307 if(accver && ibp->vernum > 0)
308 sprintf(temp, "%s.%d", ibp->acnum, ibp->vernum);
309 else
310 StringCpy(temp, ibp->acnum);
311 if(*temp == '\0')
312 StringCpy(temp, ibp->locusname);
313 FtaInstallPrefix(PREFIX_ACCESSION, temp, NULL);
314 }
315
316 /**********************************************************/
CreateSeqGap(objects::CSeq_literal & seq_lit,GapFeatsPtr gfp)317 static void CreateSeqGap(objects::CSeq_literal& seq_lit, GapFeatsPtr gfp)
318 {
319 if (gfp == NULL)
320 return;
321
322 objects::CSeq_gap& sgap = seq_lit.SetSeq_data().SetGap();
323 sgap.SetType(gfp->asn_gap_type);
324
325 if (!gfp->asn_linkage_evidence.empty())
326 sgap.SetLinkage_evidence().swap(gfp->asn_linkage_evidence);
327
328 if (StringCmp(gfp->gap_type, "unknown") == 0 ||
329 StringCmp(gfp->gap_type, "within scaffold") == 0 ||
330 StringCmp(gfp->gap_type, "repeat within scaffold") == 0)
331 sgap.SetLinkage(1);
332 else
333 sgap.SetLinkage(0);
334 }
335
336 /**********************************************************/
AssemblyGapsToDelta(objects::CBioseq & bioseq,GapFeatsPtr gfp,unsigned char * drop)337 void AssemblyGapsToDelta(objects::CBioseq& bioseq, GapFeatsPtr gfp, unsigned char* drop)
338 {
339 if (!bioseq.GetInst().IsSetExt() || !bioseq.GetInst().GetExt().IsDelta() ||
340 gfp == NULL)
341 return;
342
343 objects::CDelta_ext::Tdata& deltas = bioseq.SetInst().SetExt().SetDelta();
344 objects::CDelta_ext::Tdata::iterator delta = deltas.begin();
345 for (; delta != deltas.end(); ++delta)
346 {
347 if (gfp == NULL)
348 break;
349
350 if (!(*delta)->IsLiteral()) /* not Seq-lit */
351 continue;
352
353 objects::CSeq_literal& literal = (*delta)->SetLiteral();
354 if (literal.GetLength() != static_cast<Uint4>(gfp->to - gfp->from + 1))
355 {
356 ErrPostEx(SEV_REJECT, ERR_FORMAT_ContigVersusAssemblyGapMissmatch,
357 "The lengths of the CONTIG/CO line gaps disagrees with the lengths of assembly_gap features. First assembly_gap with a mismatch is at \"%d..%d\".",
358 gfp->from, gfp->to);
359 *drop = 1;
360 break;
361 }
362
363 CreateSeqGap(literal, gfp);
364
365 gfp = gfp->next;
366 }
367
368 if (*drop != 0 || (delta == deltas.end() && gfp == NULL))
369 return;
370
371 if (delta == deltas.end() && gfp != NULL)
372 {
373 ErrPostEx(SEV_REJECT, ERR_FORMAT_ContigVersusAssemblyGapMissmatch,
374 "The number of the assembly_gap features exceeds the number of CONTIG/CO line gaps. First extra assembly_gap is at \"%d..%d\".",
375 gfp->from, gfp->to);
376 *drop = 1;
377 }
378 else if (delta != deltas.end() && gfp == NULL)
379 {
380 for (; delta != deltas.end(); ++delta)
381 {
382 if ((*delta)->IsLiteral()) /* Seq-lit */
383 break;
384 }
385
386 if (delta == deltas.end())
387 return;
388
389 ErrPostEx(SEV_REJECT, ERR_FORMAT_ContigVersusAssemblyGapMissmatch,
390 "The number of the CONTIG/CO line gaps exceeds the number of assembly_gap features.");
391 *drop = 1;
392 }
393 }
394
395 /**********************************************************/
GapsToDelta(objects::CBioseq & bioseq,GapFeatsPtr gfp,unsigned char * drop)396 void GapsToDelta(objects::CBioseq& bioseq, GapFeatsPtr gfp, unsigned char* drop)
397 {
398 GapFeatsPtr tgfp;
399
400 const Char* p;
401 Int4 prevto;
402 Int4 nextfrom;
403 Int4 i;
404
405 if (gfp == NULL || !bioseq.GetInst().IsSetSeq_data())
406 return;
407
408 const std::string& sequence = bioseq.GetInst().GetSeq_data().GetIupacna();
409 p = sequence.c_str();
410
411 if (sequence.empty() || sequence.size() != bioseq.GetLength())
412 return;
413
414 for(prevto = 0, tgfp = gfp; tgfp != NULL; tgfp = tgfp->next)
415 {
416 if(tgfp->next != NULL)
417 {
418 p = sequence.c_str() + tgfp->to;
419 for(i = tgfp->to + 1; i < tgfp->next->from; p++, i++)
420 if(*p != 'N')
421 break;
422 if(i == tgfp->next->from && tgfp->next->from > tgfp->to + 1)
423 {
424 ErrPostEx(SEV_ERROR, ERR_FEATURE_AllNsBetweenGaps,
425 "A run of all-N sequence exists between the gap features located at \"%d..%d\" and \"%d..%d\".",
426 tgfp->from, tgfp->to, tgfp->next->from,
427 tgfp->next->to);
428 tgfp->rightNs = true;
429 tgfp->next->leftNs = true;
430 }
431 nextfrom = tgfp->next->from;
432 }
433 else
434 nextfrom = bioseq.GetLength() + 1;
435
436 if(tgfp->leftNs == false && tgfp->from - prevto > 10)
437 {
438 for (p = sequence.c_str() + tgfp->from - 11, i = 0; i < 10; p++, i++)
439 if(*p != 'N')
440 break;
441 if(i == 10)
442 {
443 ErrPostEx(SEV_WARNING, ERR_FEATURE_NsAbutGap,
444 "A run of N's greater or equal than 10 abuts the gap feature at \"%d..%d\" : possible problem with the boundaries of the gap.",
445 tgfp->from, tgfp->to);
446 }
447 }
448
449 if(tgfp->rightNs == false && nextfrom - tgfp->to > 10)
450 {
451 for (p = sequence.c_str() + tgfp->to, i = 0; i < 10; p++, i++)
452 if(*p != 'N')
453 break;
454 if(i == 10)
455 {
456 ErrPostEx(SEV_WARNING, ERR_FEATURE_NsAbutGap,
457 "A run of N's greater or equal than 10 abuts the gap feature at \"%d..%d\" : possible problem with the boundaries of the gap.",
458 tgfp->from, tgfp->to);
459 }
460 }
461
462 for (i = tgfp->from - 1, p = sequence.c_str() + i; i < tgfp->to; p++, i++)
463 if(*p != 'N')
464 break;
465 if(i < tgfp->to)
466 {
467 ErrPostEx(SEV_REJECT, ERR_FEATURE_InvalidGapSequence,
468 "The sequence data associated with the gap feature at \"%d..%d\" contains basepairs other than N.",
469 tgfp->from, tgfp->to);
470 *drop = 1;
471 }
472
473 prevto = tgfp->to;
474 }
475
476 if (*drop != 0)
477 return;
478
479 objects::CDelta_ext::Tdata deltas;
480
481 for (prevto = 0, tgfp = gfp;; tgfp = tgfp->next)
482 {
483 Int4 len = 0;
484
485 CRef<objects::CDelta_seq> delta(new objects::CDelta_seq);
486 if (tgfp->from - prevto - 1 > 0)
487 {
488 len = tgfp->from - prevto - 1;
489 delta->SetLiteral().SetLength(len);
490 delta->SetLiteral().SetSeq_data().SetIupacna().Set() = sequence.substr(prevto, len);
491
492 deltas.push_back(delta);
493
494 delta.Reset(new objects::CDelta_seq);
495 }
496
497 len = tgfp->to - tgfp->from + 1;
498 delta->SetLiteral().SetLength(len);
499 if(tgfp->estimated_length == -100)
500 {
501 delta->SetLiteral().SetFuzz().SetLim();
502 }
503 else if(tgfp->estimated_length != len)
504 {
505 delta->SetLiteral().SetFuzz().SetRange().SetMin(tgfp->estimated_length);
506 delta->SetLiteral().SetFuzz().SetRange().SetMax(len);
507 }
508
509 if (tgfp->assembly_gap)
510 CreateSeqGap(delta->SetLiteral(), tgfp);
511
512 deltas.push_back(delta);
513
514 prevto = tgfp->to;
515
516 if(tgfp->next == NULL)
517 {
518 if (bioseq.GetLength() - prevto > 0)
519 {
520 delta.Reset(new objects::CDelta_seq);
521
522 len = bioseq.GetLength() - prevto;
523 delta->SetLiteral().SetLength(len);
524 delta->SetLiteral().SetSeq_data().SetIupacna().Set() = sequence.substr(prevto, len);
525
526 deltas.push_back(delta);
527 }
528 break;
529 }
530 }
531
532 if (!deltas.empty())
533 {
534 bioseq.SetInst().SetExt().SetDelta().Set().swap(deltas);
535 bioseq.SetInst().SetRepr(objects::CSeq_inst::eRepr_delta);
536 bioseq.SetInst().ResetSeq_data();
537 }
538 }
539
540 /**********************************************************/
SeqToDelta(objects::CBioseq & bioseq,Int2 tech)541 void SeqToDelta(objects::CBioseq& bioseq, Int2 tech)
542 {
543 char* p;
544 char* q;
545 char* r;
546
547 Int4 i;
548 Int4 j;
549 Int4 gotcha;
550
551 if (!bioseq.GetInst().IsSetSeq_data())
552 return;
553
554 const std::string& sequence = bioseq.GetInst().GetSeq_data().GetIupacna();
555 if (sequence.empty() || sequence.size() != bioseq.GetLength())
556 return;
557
558 vector<Char> buf(sequence.begin(), sequence.end());
559 buf.push_back(0);
560 p = &buf[0];
561 gotcha = 0;
562
563 objects::CDelta_ext::Tdata deltas;
564
565 for (q = p; *p != '\0';)
566 {
567 if(*p != 'N')
568 {
569 p++;
570 continue;
571 }
572
573 for(r = p, p++, i = 1; *p == 'N'; i++)
574 p++;
575 if(i < HTG_GAP)
576 {
577 if(i >= SHORT_GAP && gotcha == 0)
578 gotcha = 1;
579 continue;
580 }
581
582 CRef<objects::CDelta_seq> delta(new objects::CDelta_seq);
583 gotcha = 2;
584
585 if(r != q)
586 {
587 *r = '\0';
588 j = (Int4) (r - q);
589
590 delta->SetLiteral().SetLength(j);
591 delta->SetLiteral().SetSeq_data().SetIupacna().Set(std::string(q, r));
592
593 deltas.push_back(delta);
594
595 delta.Reset(new objects::CDelta_seq);
596
597 *r = 'N';
598 }
599
600 delta->SetLiteral().SetLength(i);
601 if (i == 100)
602 {
603 delta->SetLiteral().SetFuzz().SetLim();
604 }
605
606 deltas.push_back(delta);
607 q = p;
608 }
609
610 if(p > q)
611 {
612 j = (Int4) (p - q);
613
614 CRef<objects::CDelta_seq> delta(new objects::CDelta_seq);
615 delta->SetLiteral().SetLength(j);
616 delta->SetLiteral().SetSeq_data().SetIupacna().Set(std::string(q, p));
617
618 deltas.push_back(delta);
619 }
620
621 if (deltas.size() > 1)
622 {
623 bioseq.SetInst().SetExt().SetDelta().Set().swap(deltas);
624 bioseq.SetInst().SetRepr(objects::CSeq_inst::eRepr_delta);
625 bioseq.SetInst().ResetSeq_data();
626 }
627
628 if (bioseq.GetInst().GetRepr() != objects::CSeq_inst::eRepr_delta && tech == 1)
629 {
630 ErrPostEx(SEV_WARNING, ERR_SEQUENCE_HTGWithoutGaps,
631 "This Phase 1 HTG sequence has no runs of 100 "
632 "or more N's to indicate gaps between component contigs. "
633 "This could be an error, or perhaps sequencing is finished "
634 "and this record should not be Phase 1.");
635 }
636
637 if (bioseq.GetInst().GetRepr() == objects::CSeq_inst::eRepr_delta)
638 {
639 if(tech == 4) /* Phase 0 */
640 ErrPostEx(SEV_WARNING, ERR_SEQUENCE_HTGPhaseZeroHasGap,
641 "A Phase 0 HTG record usually consists of several reads "
642 "for one contig, and hence gaps are not expected. But "
643 "this record does have one (ore more) gaps, hence it "
644 "may require review.");
645 if(gotcha == 1)
646 ErrPostEx(SEV_WARNING, ERR_SEQUENCE_HTGPossibleShortGap,
647 "This sequence has one or more runs "
648 "of at least 20 N's. They could indicate gaps, "
649 "but have not been treated that way because "
650 "they are below the minimum of 100 N's.");
651 }
652 }
653
654 /**********************************************************/
fta_ranges_to_hist(const objects::CGB_block::TExtra_accessions & extra_accs)655 static bool fta_ranges_to_hist(const objects::CGB_block::TExtra_accessions& extra_accs)
656 {
657 std::string ppacc1;
658 std::string ppacc2;
659 char* master;
660 char* range;
661 char* acc1;
662 char* acc2;
663 char* p;
664 char* q;
665 Char ch1;
666 Char ch2;
667 Int4 i;
668
669 if(extra_accs.empty())
670 return false;
671
672 if(extra_accs.size() != 2)
673 return true;
674
675 objects::CGB_block::TExtra_accessions::const_iterator it = extra_accs.begin();
676
677 ppacc1 = *it;
678 ++it;
679 ppacc2 = *it;
680 acc1 = (char*) ppacc1.c_str();
681 acc2 = (char*) ppacc2.c_str();
682
683
684 if(acc1 == NULL && acc2 == NULL)
685 return false;
686 if(acc1 == NULL || acc2 == NULL)
687 return true;
688
689 p = StringChr(acc1, '-');
690 q = StringChr(acc2, '-');
691
692 if((p == NULL && q == NULL) || (p != NULL && q != NULL))
693 return true;
694
695 if(p == NULL)
696 {
697 master = acc1;
698 range = acc2;
699 *q = '\0';
700 }
701 else
702 {
703 master = acc2;
704 range = acc1;
705 *p = '\0';
706 }
707
708 if(fta_if_wgs_acc(master) != 0 || fta_if_wgs_acc(range) != 1)
709 {
710 if(p != NULL)
711 *p = '-';
712 if(q != NULL)
713 *q = '-';
714 return true;
715 }
716
717 if(p != NULL)
718 *p = '-';
719 if(q != NULL)
720 *q = '-';
721
722 for(p = master; *p != '\0' && (*p < '0' || *p > '9');)
723 p++;
724 if(*p != '\0')
725 p++;
726 if(*p != '\0')
727 p++;
728 ch1 = *p;
729 *p = '\0';
730
731 for(q = range; *q != '\0' && (*q < '0' || *q > '9');)
732 q++;
733 if(*q != '\0')
734 q++;
735 if(*q != '\0')
736 q++;
737 ch2 = *q;
738 *q = '\0';
739
740 i = StringCmp(master, range);
741 *p = ch1;
742 *q = ch2;
743
744 if(i == 0)
745 return false;
746 return true;
747 }
748
749
s_IsConOrScaffold(CBioseq_Handle bsh)750 static bool s_IsConOrScaffold(CBioseq_Handle bsh)
751 {
752 if (bsh &&
753 bsh.IsSetInst_Repr() &&
754 bsh.GetInst_Repr() == CSeq_inst::eRepr_delta &&
755 bsh.IsSetInst_Ext()) {
756 const auto& ext = bsh.GetInst_Ext();
757 if (ext.IsDelta() &&
758 ext.GetDelta().IsSet()) {
759 const auto& delta = ext.GetDelta().Get();
760 return any_of(begin(delta),
761 end(delta),
762 [](CRef<CDelta_seq> pDeltaSeq) { return (pDeltaSeq && pDeltaSeq->IsLoc()); });
763 }
764 }
765 return false;
766 }
767
s_IsAccession(const CSeq_id & id)768 static bool s_IsAccession(const CSeq_id& id) {
769 const auto idType = id.Which();
770 switch (idType) {
771 case CSeq_id::e_Local:
772 case CSeq_id::e_General:
773 case CSeq_id::e_Gi:
774 case CSeq_id::e_Named_annot_track:
775 return false;
776 default:
777 return true;
778 }
779 }
780
781
g_DoesNotReferencePrimary(const CDelta_ext & delta_ext,const CSeq_id & primary,CScope & scope)782 bool g_DoesNotReferencePrimary(const CDelta_ext& delta_ext, const CSeq_id& primary, CScope& scope)
783 {
784 const auto primaryType = primary.Which();
785 string primaryString = primary.GetSeqIdString();
786 const bool primaryIsAccession = s_IsAccession(primary);
787 const bool primaryIsGi = primaryIsAccession ?
788 false :
789 (primaryType == CSeq_id::e_Gi);
790
791 unique_ptr<string> pPrimaryAccessionString;
792
793 for (const auto& pDeltaSeq : delta_ext.Get()) {
794 if (pDeltaSeq && pDeltaSeq->IsLoc()) {
795 auto pId = pDeltaSeq->GetLoc().GetId();
796 const auto& deltaIdType = pId->Which();
797 if (deltaIdType == primaryType) {
798 if (pId->GetSeqIdString() == primaryString) {
799 return false;
800 }
801 }
802 else {
803 if (primaryIsAccession && deltaIdType == CSeq_id::e_Gi) {
804 auto deltaHandle = CSeq_id_Handle::GetHandle(pId->GetGi());
805 auto deltaAccessionHandle = scope.GetAccVer(deltaHandle);
806 if (!deltaAccessionHandle) {
807 return false;
808 }
809
810 if (deltaAccessionHandle.GetSeqId()->GetSeqIdString() ==
811 primaryString) {
812 return false;
813 }
814 }
815 else
816 if (primaryIsGi && s_IsAccession(*pId)) {
817 if (!pPrimaryAccessionString) {
818 auto primaryGiHandle = CSeq_id_Handle::GetHandle(primary.GetGi());
819 auto primaryAccessionHandle = scope.GetAccVer(primaryGiHandle);
820 if (!primaryAccessionHandle) {
821 return false;
822 }
823 pPrimaryAccessionString =
824 make_unique<string>(primaryAccessionHandle.GetSeqId()->GetSeqIdString());
825 }
826
827 if (*pPrimaryAccessionString == pId->GetSeqIdString()) {
828 return false;
829 }
830 }
831 }
832 }
833 }
834 return true;
835 }
836
837
sGetPrefixLength(const CTempString & accession)838 static int sGetPrefixLength(const CTempString& accession)
839 {
840 auto it = find_if(begin(accession),
841 end(accession),
842 [](char c) { return !(isalpha(c) || c == '_'); });
843
844 _ASSERT(it != accession.end());
845 return distance(accession.begin(), it);
846 }
847
848
849 /**********************************************************/
fta_add_hist(ParserPtr pp,objects::CBioseq & bioseq,objects::CGB_block::TExtra_accessions & extra_accs,Parser::ESource source,Int4 acctype,bool pricon,char * acc)850 void fta_add_hist(ParserPtr pp, objects::CBioseq& bioseq, objects::CGB_block::TExtra_accessions& extra_accs, Parser::ESource source,
851 Int4 acctype, bool pricon, char* acc)
852 {
853 IndexblkPtr ibp;
854
855 Int4 pri_acc;
856 Int4 sec_acc;
857
858 if(pp->accver == false || pp->histacc == false ||
859 pp->source != source || pp->entrez_fetch == 0)
860 return;
861
862 if (!fta_ranges_to_hist(extra_accs))
863 return;
864
865 objects::CGB_block::TExtra_accessions hist;
866 UnwrapAccessionRange(extra_accs, hist);
867 if (hist.empty())
868 return;
869
870 ibp = pp->entrylist[pp->curindx];
871
872 pri_acc = fta_if_wgs_acc(acc);
873
874 CTempString primaryAccession(acc);
875 int prefixLength=0;
876
877 list<CRef<CSeq_id>> replaces;
878
879 for (const auto& accessionString : hist) {
880 if (accessionString.empty())
881 continue;
882
883 const auto idChoice = GetNucAccOwner(accessionString.c_str(), ibp->is_tpa);
884 if (idChoice == CSeq_id::e_not_set) {
885 continue;
886 }
887 sec_acc = fta_if_wgs_acc(accessionString.c_str());
888 if(sec_acc == 0) { // Project WGS accession
889 continue;
890 }
891
892 if (sec_acc == 1) // Contig WGS accession
893 {
894 if (pri_acc == 0 || pri_acc == 2) { // A project WGS accession or
895 continue; // a scaffold WGS accession
896 }
897
898 if (pri_acc == 1) { // Contig WGS accession
899 if (!prefixLength) {
900 prefixLength = sGetPrefixLength(primaryAccession);
901 }
902
903 if ( (accessionString.length() <= prefixLength ||
904 !NStr::EqualNocase(accessionString, 0, prefixLength, primaryAccession.substr(0,prefixLength)) ||
905 !isdigit(accessionString[prefixLength])) &&
906 !pp->allow_uwsec ) {
907 continue;
908 }
909 }
910 }
911
912 CRef<CSeq_id> id(new CSeq_id(idChoice, accessionString));
913 auto secondaryBsh = GetScope().GetBioseqHandle(*id);
914 bool IsConOrScaffold=false;
915 try {
916 IsConOrScaffold = s_IsConOrScaffold(secondaryBsh);
917 }
918 catch (...) {
919 ErrPostEx(SEV_ERROR, ERR_ACCESSION_CannotGetDivForSecondary,
920 "Failed to determine division code for secondary accession \"%s\". Entry dropped.",
921 accessionString.c_str());
922 continue;
923 }
924
925 if (!IsConOrScaffold && pricon && idChoice == acctype) {
926 continue;
927 }
928
929 if (IsConOrScaffold && !pricon) {
930 CRef<CSeq_id> pPrimary(new CSeq_id(primaryAccession));
931 if (g_DoesNotReferencePrimary(secondaryBsh.GetInst_Ext().GetDelta(),
932 *pPrimary,
933 GetScope())) {
934 replaces.push_back(id);
935 }
936 continue;
937 }
938
939 replaces.push_back(id);
940 }
941
942
943 if (!replaces.empty()) {
944 auto& hist_replaces_ids = bioseq.SetInst().SetHist().SetReplaces().SetIds();
945 hist_replaces_ids.splice(hist_replaces_ids.end(), replaces);
946 }
947 }
948
949 /**********************************************************/
fta_strings_same(const char * s1,const char * s2)950 bool fta_strings_same(const char* s1, const char* s2)
951 {
952 if(s1 == NULL && s2 == NULL)
953 return true;
954 if(s1 == NULL || s2 == NULL || StringCmp(s1, s2) != 0)
955 return false;
956 return true;
957 }
958
959 /**********************************************************/
fta_check_htg_kwds(TKeywordList & kwds,IndexblkPtr ibp,objects::CMolInfo & mol_info)960 bool fta_check_htg_kwds(TKeywordList& kwds, IndexblkPtr ibp, objects::CMolInfo& mol_info)
961 {
962 bool deldiv = false;
963
964 for (TKeywordList::iterator key = kwds.begin(); key != kwds.end();)
965 {
966 bool delnode = false;
967 bool errpost = false;
968 if(*key == "HTGS_PHASE0")
969 {
970 if(ibp->htg != 0 && ibp->htg != 5)
971 {
972 delnode = true;
973 if(ibp->htg == 1 || ibp->htg == 2 || ibp->htg == 3)
974 errpost = true;
975 }
976 else
977 {
978 ibp->htg = 4;
979 mol_info.SetTech(objects::CMolInfo::eTech_htgs_0);
980 }
981 deldiv = true;
982 }
983 else if (*key == "HTGS_PHASE1")
984 {
985 if(ibp->htg != 0 && ibp->htg != 5)
986 {
987 delnode = true;
988 if(ibp->htg == 2 || ibp->htg == 3 || ibp->htg == 4)
989 errpost = true;
990 }
991 else
992 {
993 ibp->htg = 1;
994 mol_info.SetTech(objects::CMolInfo::eTech_htgs_1);
995 }
996 deldiv = true;
997 }
998 else if (*key == "HTGS_PHASE2")
999 {
1000 if(ibp->htg != 0 && ibp->htg != 5)
1001 {
1002 delnode = true;
1003 if(ibp->htg == 1 || ibp->htg == 3 || ibp->htg == 4)
1004 errpost = true;
1005 }
1006 else
1007 {
1008 ibp->htg = 2;
1009 mol_info.SetTech(objects::CMolInfo::eTech_htgs_2);
1010 }
1011 deldiv = true;
1012 }
1013 else if (*key == "HTGS_PHASE3")
1014 {
1015 if(ibp->htg != 0 && ibp->htg != 5)
1016 {
1017 delnode = true;
1018 if(ibp->htg == 1 || ibp->htg == 2 || ibp->htg == 4)
1019 errpost = true;
1020 }
1021 else
1022 {
1023 ibp->htg = 3;
1024 mol_info.SetTech(objects::CMolInfo::eTech_htgs_3);
1025 }
1026 deldiv = true;
1027 }
1028 else if (*key == "HTG")
1029 {
1030 if(ibp->htg == 0)
1031 {
1032 ibp->htg = 5;
1033 mol_info.SetTech(objects::CMolInfo::eTech_htgs_3);
1034 }
1035 deldiv = true;
1036 }
1037
1038 if(errpost)
1039 {
1040 ErrPostEx(SEV_ERROR, ERR_KEYWORD_MultipleHTGPhases,
1041 "This entry has multiple HTG-related keywords, for differing HTG phases. Ignoring all but the first.");
1042 }
1043
1044 if (delnode)
1045 key = kwds.erase(key);
1046 else
1047 ++key;
1048 }
1049 if(ibp->htg == 5)
1050 ibp->htg = 3;
1051
1052 return deldiv;
1053 }
1054
1055 /**********************************************************/
fta_check_tpa_tsa_coverage(FTATpaBlockPtr ftbp,Int4 length,bool tpa)1056 static void fta_check_tpa_tsa_coverage(FTATpaBlockPtr ftbp, Int4 length, bool tpa)
1057 {
1058 FTATpaBlockPtr tftbp;
1059 FTATpaSpanPtr ftsp;
1060 FTATpaSpanPtr tftsp;
1061 Int4 i1;
1062 Int4 i2;
1063 Int4 j;
1064
1065 if(ftbp == NULL || length < 1)
1066 return;
1067
1068 ftsp = (FTATpaSpanPtr) MemNew(sizeof(FTATpaSpan));
1069 ftsp->from = ftbp->from1;
1070 ftsp->to = ftbp->to1;
1071 ftsp->next = NULL;
1072 tftsp = ftsp;
1073 for(tftbp = ftbp; tftbp != NULL; tftbp = tftbp->next)
1074 {
1075 i1 = tftbp->to1 - tftbp->from1;
1076 i2 = tftbp->to2 - tftbp->from2;
1077 j = (i2 > i1) ? (i2 - i1) : (i1 - i2);
1078 i1++;
1079
1080 if(i1 < 3000 && j * 10 > i1)
1081 {
1082 if(tpa)
1083 ErrPostEx(SEV_ERROR, ERR_TPA_SpanLengthDiff,
1084 "Span \"%d..%d\" of this TPA record differs from the span \"%d..%d\" of the contributing primary sequence or trace record by more than 10 percent.",
1085 tftbp->from1, tftbp->to1, tftbp->from2, tftbp->to2);
1086 else
1087 ErrPostEx(SEV_ERROR, ERR_TSA_SpanLengthDiff,
1088 "Span \"%d..%d\" of this TSA record differs from the span \"%d..%d\" of the contributing primary sequence or trace record by more than 10 percent.",
1089 tftbp->from1, tftbp->to1, tftbp->from2, tftbp->to2);
1090 }
1091
1092 if(i1 >= 3000 && j > 300)
1093 {
1094 if (tpa)
1095 ErrPostEx(SEV_ERROR, ERR_TPA_SpanDiffOver300bp,
1096 "Span \"%d..%d\" of this TPA record differs from span \"%d..%d\" of the contributing primary sequence or trace record by more than 300 basepairs.",
1097 tftbp->from1, tftbp->to1, tftbp->from2, tftbp->to2);
1098 else
1099 ErrPostEx(SEV_ERROR, ERR_TSA_SpanDiffOver300bp,
1100 "Span \"%d..%d\" of this TSA record differs from span \"%d..%d\" of the contributing primary sequence or trace record by more than 300 basepairs.",
1101 tftbp->from1, tftbp->to1, tftbp->from2, tftbp->to2);
1102 }
1103
1104 if(tftbp->from1 <= tftsp->to + 1)
1105 {
1106 if(tftbp->to1 > tftsp->to)
1107 tftsp->to = tftbp->to1;
1108 continue;
1109 }
1110
1111 tftsp->next = (FTATpaSpanPtr) MemNew(sizeof(FTATpaSpan));
1112 tftsp = tftsp->next;
1113 tftsp->from = tftbp->from1;
1114 tftsp->to = tftbp->to1;
1115 tftsp->next = NULL;
1116 }
1117
1118 if(ftsp->from - 1 > 50)
1119 {
1120 if(tpa)
1121 ErrPostEx(SEV_ERROR, ERR_TPA_IncompleteCoverage,
1122 "This TPA record contains a sequence region \"1..%d\" greater than 50 basepairs long that is not accounted for by a contributing primary sequence or trace record.",
1123 ftsp->from - 1);
1124 else
1125 ErrPostEx(SEV_ERROR, ERR_TSA_IncompleteCoverage,
1126 "This TSA record contains a sequence region \"1..%d\" greater than 50 basepairs long that is not accounted for by a contributing primary sequence or trace record.",
1127 ftsp->from - 1);
1128 }
1129
1130 for(; ftsp != NULL; ftsp = tftsp)
1131 {
1132 tftsp = ftsp->next;
1133 if(tftsp != NULL && tftsp->from - ftsp->to - 1 > 50)
1134 {
1135 if(tpa)
1136 ErrPostEx(SEV_ERROR, ERR_TPA_IncompleteCoverage,
1137 "This TPA record contains a sequence region \"%d..%d\" greater than 50 basepairs long that is not accounted for by a contributing primary sequence or trace record.",
1138 ftsp->to + 1, tftsp->from - 1);
1139 else
1140 ErrPostEx(SEV_ERROR, ERR_TSA_IncompleteCoverage,
1141 "This TSA record contains a sequence region \"%d..%d\" greater than 50 basepairs long that is not accounted for by a contributing primary sequence or trace record.",
1142 ftsp->to + 1, tftsp->from - 1);
1143 }
1144 else if(tftsp == NULL && length - ftsp->to > 50)
1145 {
1146 if(tpa)
1147 ErrPostEx(SEV_ERROR, ERR_TPA_IncompleteCoverage,
1148 "This TPA record contains a sequence region \"%d..%d\" greater than 50 basepairs long that is not accounted for by a contributing primary sequence or trace record.",
1149 ftsp->to + 1, length);
1150 else
1151 ErrPostEx(SEV_ERROR, ERR_TSA_IncompleteCoverage,
1152 "This TSA record contains a sequence region \"%d..%d\" greater than 50 basepairs long that is not accounted for by a contributing primary sequence or trace record.",
1153 ftsp->to + 1, length);
1154 }
1155
1156 MemFree(ftsp);
1157 }
1158 }
1159
1160 /**********************************************************/
fta_number_is_huge(const Char * s)1161 bool fta_number_is_huge(const Char* s)
1162 {
1163 size_t i = StringLen(s);
1164 if(i > 10)
1165 return true;
1166 else if(i < 10)
1167 return false;
1168
1169 if(*s > '2')
1170 return true;
1171 else if(*s < '2')
1172 return false;
1173
1174 if(*++s > '1')
1175 return true;
1176 else if(*s < '1')
1177 return false;
1178
1179 if(*++s > '4')
1180 return true;
1181 else if(*s < '4')
1182 return false;
1183
1184 if(*++s > '7')
1185 return true;
1186 else if(*s < '7')
1187 return false;
1188
1189 if(*++s > '4')
1190 return true;
1191 else if(*s < '4')
1192 return false;
1193
1194 if(*++s > '8')
1195 return true;
1196 else if(*s < '8')
1197 return false;
1198
1199 if(*++s > '3')
1200 return true;
1201 else if(*s < '3')
1202 return false;
1203
1204 if(*++s > '6')
1205 return true;
1206 else if(*s < '6')
1207 return false;
1208
1209 if(*++s > '4')
1210 return true;
1211 else if(*s < '4')
1212 return false;
1213
1214 if(*++s > '7')
1215 return true;
1216 return false;
1217 }
1218
1219 /**********************************************************/
fta_parse_tpa_tsa_block(objects::CBioseq & bioseq,char * offset,char * acnum,Int2 vernum,size_t len,Int2 col_data,bool tpa)1220 bool fta_parse_tpa_tsa_block(objects::CBioseq& bioseq, char* offset, char* acnum,
1221 Int2 vernum, size_t len, Int2 col_data, bool tpa)
1222 {
1223 FTATpaBlockPtr ftbp;
1224 FTATpaBlockPtr tftbp;
1225 FTATpaBlockPtr ft;
1226
1227 char* buf;
1228 char* p;
1229 char* q;
1230 char* r;
1231 char* t;
1232 char* bad_accession;
1233 bool bad_line;
1234 bool bad_interval;
1235 Char ch;
1236 Int4 from1;
1237 Int4 to1;
1238 Int4 len1;
1239 Int4 len2;
1240 Uint1 choice;
1241
1242 if (offset == NULL || acnum == NULL || len < 2)
1243 return false;
1244
1245 choice = GetNucAccOwner(acnum, tpa);
1246
1247 if(col_data == 0) /* HACK: XML format */
1248 {
1249 for(p = offset; *p != '\0'; p++)
1250 if(*p == '~')
1251 *p = '\n';
1252 p = StringChr(offset, '\n');
1253 if(p == NULL)
1254 return false;
1255 buf = (char*) MemNew(StringLen(p) + 1);
1256 StringCpy(buf, p + 1);
1257 StringCat(buf, "\n");
1258 }
1259 else
1260 {
1261 ch = offset[len];
1262 offset[len] = '\0';
1263 p = StringChr(offset, '\n');
1264 if(p == NULL)
1265 {
1266 offset[len] = ch;
1267 return false;
1268 }
1269 buf = StringSave(p + 1);
1270 offset[len] = ch;
1271 }
1272
1273 ftbp = (FTATpaBlockPtr) MemNew(sizeof(FTATpaBlock));
1274
1275 bad_line = false;
1276 bad_interval = false;
1277 bad_accession = NULL;
1278 p = buf;
1279 for(q = StringChr(p, '\n'); q != NULL; p = q + 1, q = StringChr(p, '\n'))
1280 {
1281 *q = '\0';
1282 if((Int2) StringLen(p) < col_data)
1283 break;
1284 for(p += col_data; *p == ' ';)
1285 p++;
1286 for(r = p; *p >= '0' && *p <= '9';)
1287 p++;
1288 if(*p != '-')
1289 {
1290 bad_interval = true;
1291 break;
1292 }
1293
1294 *p++ = '\0';
1295 from1 = atoi(r);
1296
1297 for(r = p; *p >= '0' && *p <= '9';)
1298 p++;
1299 if(*p != ' ' && *p != '\n' && *p != '\0')
1300 {
1301 bad_interval = true;
1302 break;
1303 }
1304 if(*p != '\0')
1305 *p++ = '\0';
1306 to1 = atoi(r);
1307
1308 if(from1 >= to1)
1309 {
1310 bad_interval = true;
1311 break;
1312 }
1313
1314 for(ft = ftbp; ft->next != NULL; ft = ft->next)
1315 if((ft->next->from1 > from1) ||
1316 (ft->next->from1 == from1 && ft->next->to1 > to1))
1317 break;
1318 tftbp = (FTATpaBlockPtr) MemNew(sizeof(FTATpaBlock));
1319 tftbp->next = ft->next;
1320 ft->next = tftbp;
1321
1322 tftbp->from1 = from1;
1323 tftbp->to1 = to1;
1324
1325 while(*p == ' ')
1326 p++;
1327 for(r = p; *p != '\0' && *p != ' ' && *p != '\n';)
1328 p++;
1329 if(*p != '\0')
1330 *p++ = '\0';
1331 tftbp->accession = StringSave(r);
1332 r = StringChr(tftbp->accession, '.');
1333 if(r != NULL)
1334 {
1335 *r++ = '\0';
1336 for(t = r; *t >= '0' && *t <= '9';)
1337 t++;
1338 if(*t != '\0')
1339 {
1340 *--r = '.';
1341 bad_accession = tftbp->accession;
1342 break;
1343 }
1344 tftbp->version = atoi(r);
1345 }
1346
1347 if(StringNICmp(tftbp->accession, "ti", 2) == 0)
1348 {
1349 for(r = tftbp->accession + 2; *r == '0';)
1350 r++;
1351 if(*r == '\0')
1352 {
1353 bad_accession = tftbp->accession;
1354 break;
1355 }
1356 while(*r >= '0' && *r <= '9')
1357 r++;
1358 if(*r != '\0')
1359 {
1360 bad_accession = tftbp->accession;
1361 break;
1362 }
1363 }
1364 else
1365 {
1366 tftbp->sicho = GetNucAccOwner(tftbp->accession, false);
1367 if ((tftbp->sicho != objects::CSeq_id::e_Genbank && tftbp->sicho != objects::CSeq_id::e_Embl &&
1368 tftbp->sicho != objects::CSeq_id::e_Ddbj &&
1369 (tftbp->sicho != objects::CSeq_id::e_Tpg || tpa == false)))
1370 {
1371 bad_accession = tftbp->accession;
1372 break;
1373 }
1374 }
1375
1376 while(*p == ' ')
1377 p++;
1378
1379 if(StringNICmp(p, "not_available", 13) == 0)
1380 {
1381 p += 13;
1382 tftbp->from2 = 1;
1383 tftbp->to2 = 1;
1384 }
1385 else
1386 {
1387 for(r = p; *p >= '0' && *p <= '9';)
1388 p++;
1389 if(*p != '-')
1390 {
1391 bad_interval = true;
1392 break;
1393 }
1394 *p++ = '\0';
1395 tftbp->from2 = atoi(r);
1396
1397 for(r = p; *p >= '0' && *p <= '9';)
1398 p++;
1399 if(*p != ' ' && *p != '\n' && *p != '\0')
1400 {
1401 bad_interval = true;
1402 break;
1403 }
1404 if(*p != '\0')
1405 *p++ = '\0';
1406 tftbp->to2 = atoi(r);
1407
1408 if(tftbp->from2 >= tftbp->to2)
1409 {
1410 bad_interval = true;
1411 break;
1412 }
1413 }
1414
1415 while(*p == ' ')
1416 p++;
1417 if(*p == 'c')
1418 {
1419 tftbp->strand = 2;
1420 for(p++; *p == ' ';)
1421 p++;
1422 }
1423 else
1424 tftbp->strand = 1;
1425 if(*p != '\0')
1426 {
1427 bad_line = true;
1428 break;
1429 }
1430 }
1431
1432 MemFree(buf);
1433 if (bad_line || bad_interval || bad_accession != NULL)
1434 {
1435 if(bad_interval)
1436 {
1437 if(tpa)
1438 ErrPostEx(SEV_REJECT, ERR_TPA_InvalidPrimarySpan,
1439 "Intervals from primary records on which a TPA record is based must be of form X-Y, where X is less than Y and both X and Y are integers. Entry dropped.");
1440 else
1441 ErrPostEx(SEV_REJECT, ERR_TSA_InvalidPrimarySpan,
1442 "Intervals from primary records on which a TSA record is based must be of form X-Y, where X is less than Y and both X and Y are integers. Entry dropped.");
1443 }
1444 else if(bad_accession != NULL)
1445 {
1446 if(tpa)
1447 ErrPostEx(SEV_REJECT, ERR_TPA_InvalidPrimarySeqId,
1448 "\"%s\" is not a GenBank/EMBL/DDBJ/Trace sequence identifier. Entry dropped.",
1449 bad_accession);
1450 else
1451 ErrPostEx(SEV_REJECT, ERR_TSA_InvalidPrimarySeqId,
1452 "\"%s\" is not a GenBank/EMBL/DDBJ/Trace sequence identifier. Entry dropped.",
1453 bad_accession);
1454 }
1455 else
1456 {
1457 if(tpa)
1458 ErrPostEx(SEV_REJECT, ERR_TPA_InvalidPrimaryBlock,
1459 "Supplied PRIMARY block for TPA record is incorrect. Cannot parse. Entry dropped.");
1460 else
1461 ErrPostEx(SEV_REJECT, ERR_TSA_InvalidPrimaryBlock,
1462 "Supplied PRIMARY block for TSA record is incorrect. Cannot parse. Entry dropped.");
1463 }
1464
1465 if(ftbp != NULL)
1466 fta_tpa_block_free(ftbp);
1467 return false;
1468 }
1469
1470 tftbp = ftbp->next;
1471 ftbp->next = NULL;
1472 MemFree(ftbp);
1473 ftbp = tftbp;
1474
1475 fta_check_tpa_tsa_coverage(ftbp, bioseq.GetLength(), tpa);
1476
1477 objects::CSeq_hist::TAssembly& assembly = bioseq.SetInst().SetHist().SetAssembly();
1478 if (!assembly.empty())
1479 assembly.clear();
1480
1481 CRef<objects::CSeq_align> root_align(new objects::CSeq_align);
1482
1483 root_align->SetType(objects::CSeq_align::eType_not_set);
1484 objects::CSeq_align_set& align_set = root_align->SetSegs().SetDisc();
1485
1486 for(; tftbp != NULL; tftbp = tftbp->next)
1487 {
1488 len1 = tftbp->to1 - tftbp->from1 + 1;
1489 len2 = tftbp->to2 - tftbp->from2 + 1;
1490
1491 CRef<objects::CSeq_align> align(new objects::CSeq_align);
1492 align->SetType(objects::CSeq_align::eType_partial);
1493 align->SetDim(2);
1494
1495 objects::CSeq_align::C_Segs::TDenseg& seg = align->SetSegs().SetDenseg();
1496
1497 seg.SetDim(2);
1498 seg.SetNumseg((len1 == len2) ? 1 : 2);
1499
1500 seg.SetStarts().push_back(tftbp->from1 - 1);
1501 seg.SetStarts().push_back(tftbp->from2 - 1);
1502
1503 if (len1 != len2)
1504 {
1505 if (len1 < len2)
1506 {
1507 seg.SetStarts().push_back(-1);
1508 seg.SetStarts().push_back(tftbp->from2 - 1 + len1);
1509 }
1510 else
1511 {
1512 seg.SetStarts().push_back(tftbp->from1 - 1 + len2);
1513 seg.SetStarts().push_back(-1);
1514 }
1515 }
1516
1517 if (len1 == len2)
1518 seg.SetLens().push_back(len1);
1519 else if(len1 < len2)
1520 {
1521 seg.SetLens().push_back(len1);
1522 seg.SetLens().push_back(len2 - len1);
1523 }
1524 else
1525 {
1526 seg.SetLens().push_back(len2);
1527 seg.SetLens().push_back(len1 - len2);
1528 }
1529
1530 seg.SetStrands().push_back(objects::eNa_strand_plus);
1531 seg.SetStrands().push_back(static_cast<objects::ENa_strand>(tftbp->strand));
1532
1533 if (len1 != len2)
1534 {
1535 seg.SetStrands().push_back(objects::eNa_strand_plus);
1536 seg.SetStrands().push_back(static_cast<objects::ENa_strand>(tftbp->strand));
1537 }
1538
1539 CRef<objects::CTextseq_id> text_id(new objects::CTextseq_id);
1540 text_id->SetAccession(acnum);
1541
1542 if(vernum > 0)
1543 text_id->SetVersion(vernum);
1544
1545 CRef<objects::CSeq_id> id(new objects::CSeq_id),
1546 aux_id;
1547 SetTextId(choice, *id, *text_id);
1548 seg.SetIds().push_back(id);
1549
1550 if(StringNICmp(tftbp->accession, "ti", 2) == 0)
1551 {
1552 CRef<objects::CSeq_id> gen_id(new objects::CSeq_id);
1553 objects::CDbtag& tag = gen_id->SetGeneral();
1554
1555 for(r = tftbp->accession + 2; *r == '0';)
1556 r++;
1557 if(fta_number_is_huge(r) == false)
1558 tag.SetTag().SetId(atoi(r));
1559 else
1560 tag.SetTag().SetStr(r);
1561
1562 tag.SetDb("ti");
1563 seg.SetIds().push_back(gen_id);
1564 }
1565 else
1566 {
1567 CRef<objects::CTextseq_id> otext_id(new objects::CTextseq_id);
1568 otext_id->SetAccession(tftbp->accession);
1569
1570 if (tftbp->version > 0)
1571 otext_id->SetVersion(tftbp->version);
1572
1573 aux_id.Reset(new objects::CSeq_id);
1574 SetTextId(tftbp->sicho, *aux_id, *otext_id);
1575 }
1576
1577 if (aux_id.NotEmpty())
1578 seg.SetIds().push_back(aux_id);
1579
1580 align_set.Set().push_back(align);
1581 }
1582
1583 assembly.push_back(root_align);
1584
1585 if(ftbp != NULL)
1586 fta_tpa_block_free(ftbp);
1587 return true;
1588 }
1589
1590 /**********************************************************/
StringRStr(char * where,const char * what)1591 char* StringRStr(char* where, const char *what)
1592 {
1593 if(where == NULL || what == NULL || *where == '\0' || *what == '\0')
1594 return(NULL);
1595
1596 size_t i = StringLen(what);
1597 char* res = nullptr;
1598 for(char* p = where; *p != '\0'; p++)
1599 if(StringNCmp(p, what, i) == 0)
1600 res = p;
1601
1602 return(res);
1603 }
1604
1605 /**********************************************************/
fta_get_seqloc_int_whole(objects::CSeq_id & seq_id,size_t len)1606 CRef<objects::CSeq_loc> fta_get_seqloc_int_whole(objects::CSeq_id& seq_id, size_t len)
1607 {
1608 CRef<objects::CSeq_loc> ret;
1609
1610 if (len < 1)
1611 return ret;
1612
1613 ret.Reset(new objects::CSeq_loc);
1614 objects::CSeq_interval& interval = ret->SetInt();
1615
1616 interval.SetFrom(0);
1617 interval.SetTo(static_cast<TSeqPos>(len) - 1);
1618 interval.SetId(seq_id);
1619
1620 return ret;
1621 }
1622
1623 /**********************************************************/
fta_validate_assembly(char * name)1624 static void fta_validate_assembly(char* name)
1625 {
1626 bool bad_format = false;
1627
1628 char* p = name;
1629 if(p == NULL || *p == '\0' || StringLen(p) < 7)
1630 bad_format = true;
1631 else if(p[0] != 'G' || p[1] != 'C' || (p[2] != 'F' && p[2] != 'A') ||
1632 p[3] != '_' || p[4] < '0' || p[4] > '9')
1633 bad_format = true;
1634 else
1635 {
1636 for(p += 5; *p != '\0'; p++)
1637 if(*p < '0' || *p > '9')
1638 break;
1639 if(*p != '.' || p[1] < '0' || p[1] > '9')
1640 bad_format = true;
1641 else
1642 {
1643 for(p++; *p != '\0'; p++)
1644 if(*p < '0' || *p > '9')
1645 break;
1646 if(*p != '\0')
1647 bad_format = true;
1648 }
1649 }
1650
1651 if(bad_format)
1652 ErrPostEx(SEV_WARNING, ERR_DBLINK_InvalidIdentifier,
1653 "\"%s\" is not a validly formatted identifier for the Assembly resource.",
1654 name);
1655 }
1656
1657 /**********************************************************/
fta_validate_bioproject(char * name,Parser::ESource source)1658 static bool fta_validate_bioproject(char* name, Parser::ESource source)
1659 {
1660 char* p;
1661 bool bad_format = false;
1662
1663 if(StringLen(name) < 6)
1664 bad_format = true;
1665 else if(name[0] != 'P' || name[1] != 'R' || name[2] != 'J' ||
1666 (name[3] != 'E' && name[3] != 'N' && name[3] != 'D') ||
1667 name[4] < 'A' || name[4] > 'Z' || name[5] < '0' || name[5] > '9')
1668 bad_format = true;
1669 else
1670 {
1671 for(p = name + 6; *p != '\0'; p++)
1672 if(*p < '0' || *p > '9')
1673 break;
1674 if(*p != '\0')
1675 bad_format = true;
1676 }
1677
1678 if(bad_format)
1679 {
1680 ErrPostEx(SEV_REJECT, ERR_FORMAT_InvalidBioProjectAcc,
1681 "BioProject accession number is not validly formatted: \"%s\". Entry dropped.",
1682 name);
1683 return false;
1684 }
1685
1686 if((source == Parser::ESource::NCBI && name[3] != 'N') ||
1687 (source == Parser::ESource::DDBJ && name[3] != 'D' &&
1688 (name[3] != 'N' || name[4] != 'A')) ||
1689 (source == Parser::ESource::EMBL && name[3] != 'E' &&
1690 (name[3] != 'N' || name[4] != 'A')))
1691 ErrPostEx(SEV_WARNING, ERR_FORMAT_WrongBioProjectPrefix,
1692 "BioProject accession number does not agree with this record's database of origin: \"%s\".",
1693 name);
1694
1695 return true;
1696 }
1697
1698 /**********************************************************/
fta_tokenize_project(char * str,Parser::ESource source,bool newstyle)1699 static ValNodePtr fta_tokenize_project(char* str, Parser::ESource source, bool newstyle)
1700 {
1701 ValNodePtr vnp;
1702 ValNodePtr tvnp;
1703 char* p;
1704 char* q;
1705 char* r;
1706 bool bad;
1707 Char ch;
1708
1709 if(str == NULL || *str == '\0')
1710 {
1711 ErrPostEx(SEV_REJECT, ERR_FORMAT_InvalidBioProjectAcc,
1712 "Empty PROJECT/PR line type supplied. Entry dropped.");
1713 return(NULL);
1714 }
1715
1716 for(p = str; *p != '\0'; p++)
1717 if(*p == ';' || *p == ',' || *p == '\t')
1718 *p = ' ';
1719
1720 for(p = str; *p == ' ';)
1721 p++;
1722 if(*p == '\0')
1723 {
1724 ErrPostEx(SEV_REJECT, ERR_FORMAT_InvalidBioProjectAcc,
1725 "Empty PROJECT/PR line type supplied. Entry dropped.");
1726 return(NULL);
1727 }
1728
1729 vnp = ValNodeNew(NULL);
1730 vnp->data.ptrvalue = NULL;
1731 vnp->next = NULL;
1732 tvnp = vnp;
1733
1734 for(bad = false, p = str; *p != '\0';)
1735 {
1736 while(*p == ' ')
1737 p++;
1738
1739 if(*p == '\0')
1740 break;
1741
1742 for(q = p; *p != ' ' && *p != '\0';)
1743 p++;
1744
1745 ch = *p;
1746 *p = '\0';
1747 if(!newstyle)
1748 {
1749 for(r = q; *r >= '0' && *r <= '9';)
1750 r++;
1751 if(*r != '\0')
1752 {
1753 ErrPostEx(SEV_REJECT, ERR_FORMAT_InvalidBioProjectAcc,
1754 "BioProject accession number is not validly formatted: \"%s\". Entry dropped.",
1755 q);
1756 bad = true;
1757 }
1758 }
1759 else if(fta_validate_bioproject(q, source) == false)
1760 bad = true;
1761
1762 if(bad)
1763 {
1764 *p = ch;
1765 break;
1766 }
1767
1768 tvnp->next = ValNodeNew(NULL);
1769 tvnp = tvnp->next;
1770 tvnp->next = NULL;
1771 tvnp->data.ptrvalue = StringSave(q);
1772
1773 *p = ch;
1774 }
1775
1776 tvnp = vnp->next;
1777 MemFree(vnp);
1778
1779 if(tvnp == NULL)
1780 return(NULL);
1781
1782 if(!bad)
1783 return(tvnp);
1784
1785 ValNodeFreeData(tvnp);
1786 return(NULL);
1787 }
1788
1789 /**********************************************************/
fta_get_project_user_object(TSeqdescList & descrs,char * offset,Parser::EFormat format,unsigned char * drop,Parser::ESource source)1790 void fta_get_project_user_object(TSeqdescList& descrs, char* offset,
1791 Parser::EFormat format, unsigned char* drop,
1792 Parser::ESource source)
1793 {
1794 ValNodePtr vnp;
1795 ValNodePtr tvnp;
1796
1797 const Char *name;
1798
1799 char* str;
1800 char* p;
1801 Char ch;
1802 Int4 i;
1803
1804 if(offset == NULL)
1805 return;
1806
1807 bool newstyle = false;
1808 if(format == Parser::EFormat::GenBank)
1809 {
1810 i = ParFlat_COL_DATA;
1811 name = "GenomeProject:";
1812 ch = '\n';
1813 }
1814 else
1815 {
1816 i = ParFlat_COL_DATA_EMBL;
1817 name = "Project:";
1818 ch = ';';
1819 }
1820
1821 size_t len = StringLen(name);
1822 str = StringSave(offset + i);
1823 p = StringChr(str, ch);
1824 if(p != NULL)
1825 *p = '\0';
1826
1827 if(StringNCmp(str, name, len) != 0)
1828 {
1829 if(format == Parser::EFormat::GenBank)
1830 {
1831 ErrPostEx(SEV_REJECT, ERR_FORMAT_InvalidBioProjectAcc,
1832 "PROJECT line is missing \"GenomeProject:\" tag. Entry dropped.",
1833 str);
1834 MemFree(str);
1835 *drop = 1;
1836 return;
1837 }
1838 newstyle = true;
1839 len = 0;
1840 }
1841 else if(format == Parser::EFormat::EMBL && str[len] == 'P')
1842 newstyle = true;
1843
1844 vnp = fta_tokenize_project(str + len, source, newstyle);
1845 if(vnp == NULL)
1846 {
1847 *drop = 1;
1848 MemFree(str);
1849 return;
1850 }
1851
1852 objects::CUser_object* user_obj_ptr;
1853 bool got = false;
1854
1855 NON_CONST_ITERATE(TSeqdescList, descr, descrs)
1856 {
1857 if (!(*descr)->IsUser() || !(*descr)->GetUser().IsSetData())
1858 continue;
1859
1860 user_obj_ptr = &((*descr)->SetUser());
1861
1862 objects::CObject_id* obj_id = nullptr;
1863 if (user_obj_ptr->IsSetType())
1864 obj_id = &(user_obj_ptr->SetType());
1865
1866 if (obj_id != NULL && obj_id->IsStr() && obj_id->GetStr() == "DBLink")
1867 {
1868 got = true;
1869 break;
1870 }
1871 }
1872
1873 CRef<objects::CUser_object> user_obj;
1874 if (newstyle)
1875 {
1876 for(i = 0, tvnp = vnp; tvnp != NULL; tvnp = tvnp->next)
1877 i++;
1878
1879 if (!got)
1880 {
1881 user_obj.Reset(new objects::CUser_object);
1882 user_obj_ptr = user_obj.GetNCPointer();
1883
1884 objects::CObject_id& id = user_obj_ptr->SetType();
1885 id.SetStr("DBLink");
1886 }
1887
1888 CRef<objects::CUser_field> user_field(new objects::CUser_field);
1889 user_field->SetLabel().SetStr("BioProject");
1890 user_field->SetNum(i);
1891
1892 for(tvnp = vnp, i = 0; tvnp != NULL; tvnp = tvnp->next)
1893 user_field->SetData().SetStrs().push_back((char*)tvnp->data.ptrvalue);
1894
1895 user_obj_ptr->SetData().push_back(user_field);
1896 }
1897 else
1898 {
1899 got = false;
1900
1901 user_obj.Reset(new objects::CUser_object);
1902 user_obj_ptr = user_obj.GetNCPointer();
1903
1904 objects::CObject_id& id = user_obj_ptr->SetType();
1905 id.SetStr("GenomeProjectsDB");
1906
1907 for(tvnp = vnp; tvnp != NULL; tvnp = tvnp->next)
1908 {
1909
1910 CRef<objects::CUser_field> user_field(new objects::CUser_field);
1911 user_field->SetLabel().SetStr("ProjectID");
1912 user_field->SetData().SetInt(atoi((char*)tvnp->data.ptrvalue));
1913 user_obj_ptr->SetData().push_back(user_field);
1914
1915
1916 user_field.Reset(new objects::CUser_field);
1917 user_field->SetLabel().SetStr("ParentID");
1918 user_field->SetData().SetInt(0);
1919 user_obj_ptr->SetData().push_back(user_field);
1920 }
1921 }
1922
1923 if (!got)
1924 {
1925 CRef<objects::CSeqdesc> descr(new objects::CSeqdesc);
1926 descr->SetUser(*user_obj_ptr);
1927 descrs.push_back(descr);
1928 }
1929
1930 MemFree(str);
1931 ValNodeFree(vnp);
1932 }
1933
1934 /**********************************************************/
fta_if_valid_sra(const Char * id,bool dblink)1935 bool fta_if_valid_sra(const Char* id, bool dblink)
1936 {
1937 const Char* p = id;
1938
1939 if(p != NULL && StringLen(p) > 3 &&
1940 (p[0] == 'E' || p[0] == 'S' || p[0] == 'D') && p[1] == 'R' &&
1941 (p[2] == 'A' || p[2] == 'P' || p[2] == 'R' || p[2] == 'S' ||
1942 p[2] == 'X' || p[2] == 'Z'))
1943 {
1944 for(p += 3; *p >= '0' && *p <= '9';)
1945 p++;
1946 if(*p == '\0')
1947 return true;
1948 }
1949
1950 if(dblink)
1951 ErrPostEx(SEV_REJECT, ERR_FORMAT_IncorrectDBLINK,
1952 "Incorrectly formatted DBLINK Sequence Read Archive value: \"%s\". Entry dropped.",
1953 id);
1954
1955 return false;
1956 }
1957
1958 /**********************************************************/
fta_if_valid_biosample(const Char * id,bool dblink)1959 bool fta_if_valid_biosample(const Char* id, bool dblink)
1960 {
1961 const Char* p = id;
1962
1963 if(p != NULL && StringLen(p) > 5 && p[0] == 'S' && p[1] == 'A' &&
1964 p[2] == 'M' && (p[3] == 'N' || p[3] == 'E' || p[3] == 'D'))
1965 {
1966 if(p[4] == 'A' || p[4] == 'G')
1967 p += 5;
1968 else
1969 p += 4;
1970 while(*p >= '0' && *p <= '9')
1971 p++;
1972 if(*p == '\0')
1973 return true;
1974 }
1975
1976 if(dblink)
1977 ErrPostEx(SEV_REJECT, ERR_FORMAT_IncorrectDBLINK,
1978 "Incorrectly formatted DBLINK BioSample value: \"%s\". Entry dropped.",
1979 id);
1980
1981 return false;
1982 }
1983
1984 /**********************************************************/
fta_tokenize_dblink(char * str,Parser::ESource source)1985 static ValNodePtr fta_tokenize_dblink(char* str, Parser::ESource source)
1986 {
1987 ValNodePtr vnp;
1988 ValNodePtr tvnp;
1989 ValNodePtr uvnp;
1990 ValNodePtr tagvnp;
1991
1992 bool got_nl;
1993 bool bad;
1994 bool sra;
1995 bool assembly;
1996 bool biosample;
1997 bool bioproject;
1998
1999 char* p;
2000 char* q;
2001 char* r = NULL;
2002 char* t;
2003 char* u;
2004 Char ch;
2005
2006 if(str == NULL || *str == '\0')
2007 {
2008 ErrPostEx(SEV_REJECT, ERR_FORMAT_IncorrectDBLINK,
2009 "Empty DBLINK line type supplied. Entry dropped.");
2010 return(NULL);
2011 }
2012
2013 for(p = str; *p != '\0'; p++)
2014 if(*p == ';' || *p == '\t')
2015 *p = ' ';
2016
2017 vnp = ValNodeNew(NULL);
2018 vnp->data.ptrvalue = NULL;
2019 tvnp = vnp;
2020 bad = false;
2021 got_nl = true;
2022 sra = false;
2023 assembly = false;
2024 biosample = false;
2025 bioproject = false;
2026 tagvnp = NULL;
2027 for(p = str; *p != '\0'; got_nl = false)
2028 {
2029 while(*p == ' ' || *p == '\n' || *p == ':' || *p == ',')
2030 {
2031 if(*p == '\n')
2032 got_nl = true;
2033 p++;
2034 }
2035
2036 if(got_nl)
2037 {
2038 t = StringChr(p, ':');
2039 if(t != NULL)
2040 {
2041 r = StringChr(p, '\n');
2042 u = StringChr(p, ',');
2043
2044 if((u == NULL || u > t) && (r == NULL || r > t))
2045 {
2046 ch = *++t;
2047 *t = '\0';
2048
2049 if(StringCmp(p, "Project:") != 0 &&
2050 StringCmp(p, "Assembly:") != 0 &&
2051 StringCmp(p, "BioSample:") != 0 &&
2052 StringCmp(p, "BioProject:") != 0 &&
2053 StringCmp(p, "Sequence Read Archive:") != 0 &&
2054 StringCmp(p, "Trace Assembly Archive:") != 0)
2055 {
2056 ErrPostEx(SEV_REJECT, ERR_FORMAT_IncorrectDBLINK,
2057 "Invalid DBLINK tag encountered: \"%s\". Entry dropped.", p);
2058 bad = true;
2059 break;
2060 }
2061
2062 bioproject = (StringCmp(p, "BioProject:") == 0);
2063 sra = (StringCmp(p, "Sequence Read Archive:") == 0);
2064 biosample = (StringCmp(p, "BioSample:") == 0);
2065 assembly = (StringCmp(p, "Assembly:") == 0);
2066
2067 if(tvnp->data.ptrvalue != NULL &&
2068 StringChr((char*) tvnp->data.ptrvalue, ':') != NULL)
2069 {
2070 ErrPostEx(SEV_REJECT, ERR_FORMAT_IncorrectDBLINK,
2071 "Found DBLINK tag with no value: \"%s\". Entry dropped.",
2072 tvnp->data.ptrvalue);
2073 bad = true;
2074 break;
2075 }
2076
2077 for(uvnp = vnp->next; uvnp != NULL; uvnp = uvnp->next)
2078 if(StringCmp((char*) uvnp->data.ptrvalue, p) == 0)
2079 {
2080 ErrPostEx(SEV_REJECT, ERR_FORMAT_IncorrectDBLINK,
2081 "Multiple DBLINK tags found: \"%s\". Entry dropped.",
2082 p);
2083 bad = true;
2084 break;
2085 }
2086 if(bad)
2087 break;
2088
2089 tvnp->next = ValNodeNew(NULL);
2090 tvnp = tvnp->next;
2091 tvnp->next = NULL;
2092 tvnp->data.ptrvalue = StringSave(p);
2093 tagvnp = tvnp;
2094 *t = ch;
2095 p = t;
2096 continue;
2097 }
2098 }
2099 }
2100
2101 q = p;
2102 while(*p != ',' && *p != '\n' && *p != ':' && *p != '\0')
2103 p++;
2104 if(*p == ':')
2105 {
2106 while(*p != '\0' && *p != '\n')
2107 p++;
2108 ch = *p;
2109 *p = '\0';
2110 while(*r != '\n' && r > str)
2111 r--;
2112 while(*r == ' ' || *r == '\n')
2113 r++;
2114 ErrPostEx(SEV_REJECT, ERR_FORMAT_IncorrectDBLINK,
2115 "Too many delimiters/fields for DBLINK line: \"%s\". Entry dropped.",
2116 r);
2117 *p = ch;
2118 bad = true;
2119 break;
2120 }
2121
2122 if(q == p)
2123 continue;
2124
2125 ch = *p;
2126 *p = '\0';
2127
2128 if(tagvnp != NULL && tagvnp->data.ptrvalue != NULL)
2129 {
2130 for(uvnp = tagvnp->next; uvnp != NULL; uvnp = uvnp->next)
2131 {
2132 if(uvnp->data.ptrvalue == NULL ||
2133 StringCmp((char*) uvnp->data.ptrvalue, q) != 0)
2134 continue;
2135
2136 ErrPostEx(SEV_WARNING, ERR_DBLINK_DuplicateIdentifierRemoved,
2137 "Duplicate identifier \"%s\" from \"%s\" link removed.",
2138 q, (char*) tagvnp->data.ptrvalue);
2139 break;
2140 }
2141
2142 if(uvnp != NULL)
2143 {
2144 *p = ch;
2145 continue;
2146 }
2147 }
2148
2149 if((bioproject &&
2150 fta_validate_bioproject(q, source) == false) ||
2151 (biosample && fta_if_valid_biosample(q, true) == false) ||
2152 (sra && fta_if_valid_sra(q, true) == false))
2153 {
2154 *p = ch;
2155 bad = true;
2156 }
2157
2158 if(assembly)
2159 fta_validate_assembly(q);
2160
2161 tvnp->next = ValNodeNew(NULL);
2162 tvnp = tvnp->next;
2163 tvnp->next = NULL;
2164 tvnp->data.ptrvalue = StringSave(q);
2165 *p = ch;
2166 }
2167
2168 if(!bad && tvnp->data.ptrvalue != NULL &&
2169 StringChr((char*) tvnp->data.ptrvalue, ':') != NULL)
2170 {
2171 ErrPostEx(SEV_REJECT, ERR_FORMAT_IncorrectDBLINK,
2172 "Found DBLINK tag with no value: \"%s\". Entry dropped.",
2173 tvnp->data.ptrvalue);
2174 bad = true;
2175 }
2176
2177 tvnp = vnp->next;
2178 MemFree(vnp);
2179
2180 if(tvnp == NULL)
2181 return(NULL);
2182
2183 if(!bad)
2184 return(tvnp);
2185
2186 ValNodeFreeData(tvnp);
2187 return(NULL);
2188 }
2189
2190 /**********************************************************/
fta_get_dblink_user_object(TSeqdescList & descrs,char * offset,size_t len,Parser::ESource source,unsigned char * drop,CRef<objects::CUser_object> & dbuop)2191 void fta_get_dblink_user_object(TSeqdescList& descrs, char* offset,
2192 size_t len, Parser::ESource source, unsigned char* drop,
2193 CRef<objects::CUser_object>& dbuop)
2194 {
2195 ValNodePtr vnp;
2196 ValNodePtr tvnp;
2197 ValNodePtr uvnp;
2198
2199 char* str;
2200 Int4 i;
2201
2202 if(offset == NULL)
2203 return;
2204
2205 str = StringSave(offset + ParFlat_COL_DATA);
2206 str[len-ParFlat_COL_DATA] = '\0';
2207 vnp = fta_tokenize_dblink(str, source);
2208 MemFree(str);
2209
2210 if(vnp == NULL)
2211 {
2212 *drop = 1;
2213 return;
2214 }
2215
2216 CRef<objects::CUser_object> user_obj;
2217 CRef<objects::CUser_field> user_field;
2218
2219 for (tvnp = vnp; tvnp != NULL; tvnp = tvnp->next)
2220 {
2221 if(StringChr((char*) tvnp->data.ptrvalue, ':') != NULL)
2222 {
2223 if (user_obj.NotEmpty())
2224 break;
2225
2226 if(StringCmp((char*) tvnp->data.ptrvalue, "Project:") == 0)
2227 {
2228 user_obj.Reset(new objects::CUser_object);
2229 objects::CObject_id& id = user_obj->SetType();
2230
2231 id.SetStr("GenomeProjectsDB");
2232 }
2233 continue;
2234 }
2235
2236 if (user_obj.Empty())
2237 continue;
2238
2239 str = (char*) tvnp->data.ptrvalue;
2240 if(str == NULL || *str == '\0')
2241 continue;
2242
2243 if(*str != '0')
2244 while(*str >= '0' && *str <= '9')
2245 str++;
2246 if(*str != '\0')
2247 {
2248 ErrPostEx(SEV_ERROR, ERR_FORMAT_IncorrectDBLINK,
2249 "Skipping invalid \"Project:\" value on the DBLINK line: \"%s\".",
2250 tvnp->data.ptrvalue);
2251 continue;
2252 }
2253
2254 user_field.Reset(new objects::CUser_field);
2255
2256 user_field->SetLabel().SetStr("ProjectID");
2257 user_field->SetData().SetInt(atoi((char*)tvnp->data.ptrvalue));
2258 user_obj->SetData().push_back(user_field);
2259
2260 user_field.Reset(new objects::CUser_field);
2261 user_field->SetLabel().SetStr("ParentID");
2262 user_field->SetData().SetInt(0);
2263
2264 user_obj->SetData().push_back(user_field);
2265 }
2266
2267 if (user_obj.NotEmpty() && !user_obj->IsSetData())
2268 {
2269 user_obj.Reset();
2270 }
2271
2272 if (user_obj.NotEmpty())
2273 {
2274 CRef<objects::CSeqdesc> descr(new objects::CSeqdesc);
2275 descr->SetUser(*user_obj);
2276 descrs.push_back(descr);
2277 }
2278
2279 user_obj.Reset();
2280 user_field.Reset();
2281
2282 bool inpr = false;
2283 for (tvnp = vnp; tvnp != NULL; tvnp = tvnp->next)
2284 {
2285 if(StringChr((char*) tvnp->data.ptrvalue, ':') != NULL)
2286 {
2287 if(StringCmp((char*) tvnp->data.ptrvalue, "Project:") == 0)
2288 {
2289 inpr = true;
2290 continue;
2291 }
2292
2293 inpr = false;
2294
2295 if (user_obj.Empty())
2296 {
2297 user_obj.Reset(new objects::CUser_object);
2298 user_obj->SetType().SetStr("DBLink");
2299 }
2300
2301 for(i = 0, uvnp = tvnp->next; uvnp != NULL; uvnp = uvnp->next, i++)
2302 if(StringChr((char*) uvnp->data.ptrvalue, ':') != NULL)
2303 break;
2304
2305 user_field.Reset(new objects::CUser_field);
2306
2307 std::string lstr((char*)tvnp->data.ptrvalue);
2308 lstr = lstr.substr(0, lstr.size() - 1);
2309 user_field->SetLabel().SetStr(lstr);
2310 user_field->SetNum(i);
2311 user_field->SetData().SetStrs();
2312
2313 user_obj->SetData().push_back(user_field);
2314
2315 i = 0;
2316 }
2317 else if (!inpr && user_obj.NotEmpty())
2318 {
2319 user_field->SetData().SetStrs().push_back((char*)tvnp->data.ptrvalue);
2320 }
2321 }
2322
2323 ValNodeFreeData(vnp);
2324
2325 if (user_obj.NotEmpty())
2326 {
2327 CRef<objects::CSeqdesc> descr(new objects::CSeqdesc);
2328 descr->SetUser(*user_obj);
2329 descrs.push_back(descr);
2330
2331 dbuop = user_obj;
2332 }
2333 }
2334
2335 /**********************************************************/
fta_check_con_for_wgs(objects::CBioseq & bioseq)2336 Uint1 fta_check_con_for_wgs(objects::CBioseq& bioseq)
2337 {
2338 if (bioseq.GetInst().GetRepr() != objects::CSeq_inst::eRepr_delta || !bioseq.GetInst().IsSetExt() || !bioseq.GetInst().GetExt().IsDelta())
2339 return objects::CMolInfo::eTech_unknown;
2340
2341 bool good = false;
2342 bool finished = true;
2343
2344 ITERATE(objects::CDelta_ext::Tdata, delta, bioseq.GetInst().GetExt().GetDelta().Get())
2345 {
2346 if (!(*delta)->IsLoc())
2347 continue;
2348
2349 const objects::CSeq_loc& locs = (*delta)->GetLoc();
2350 objects::CSeq_loc_CI ci(locs);
2351
2352 for (; ci; ++ci)
2353 {
2354 const objects::CSeq_id* id = nullptr;
2355
2356 CConstRef<objects::CSeq_loc> loc = ci.GetRangeAsSeq_loc();
2357 if (loc->IsEmpty() || loc->IsWhole() || loc->IsInt() || loc->IsPnt() || loc->IsPacked_pnt())
2358 id = &ci.GetSeq_id();
2359 else
2360 continue;
2361
2362 if (id == nullptr)
2363 break;
2364
2365 if (!id->IsGenbank() && !id->IsEmbl() &&
2366 !id->IsOther() && !id->IsDdbj() &&
2367 !id->IsTpg() && !id->IsTpe() && !id->IsTpd())
2368 break;
2369
2370 const objects::CTextseq_id* text_id = id->GetTextseq_Id();
2371 if (text_id == nullptr || !text_id->IsSetAccession() ||
2372 text_id->GetAccession().empty() ||
2373 fta_if_wgs_acc(text_id->GetAccession().c_str()) != 1)
2374 break;
2375 good = true;
2376 }
2377
2378 if (ci)
2379 {
2380 finished = false;
2381 break;
2382 }
2383 }
2384
2385 if (good && finished)
2386 return objects::CMolInfo::eTech_wgs;
2387
2388 return objects::CMolInfo::eTech_unknown;
2389 }
2390
2391 /**********************************************************/
fta_fix_seq_id(objects::CSeq_loc & loc,objects::CSeq_id & id,IndexblkPtr ibp,char * location,char * name,SeqLocIdsPtr slip,bool iscon,Parser::ESource source)2392 static void fta_fix_seq_id(objects::CSeq_loc& loc, objects::CSeq_id& id, IndexblkPtr ibp,
2393 char* location, char* name, SeqLocIdsPtr slip,
2394 bool iscon, Parser::ESource source)
2395 {
2396 Uint1 accowner;
2397 Int4 i;
2398 Char ch;
2399
2400 if (ibp == NULL)
2401 return;
2402
2403 if (id.IsLocal()) {
2404 return;
2405 }
2406
2407 if(name == NULL && id.IsGeneral())
2408 {
2409 const objects::CDbtag& tag = id.GetGeneral();
2410 if (tag.GetDb() == "SeqLit" || tag.GetDb() == "UnkSeqLit")
2411 return;
2412 }
2413
2414 if (!id.IsGenbank() && !id.IsEmbl() && !id.IsPir() &&
2415 !id.IsSwissprot() && !id.IsOther() && !id.IsDdbj() && !id.IsPrf() &&
2416 !id.IsTpg() && !id.IsTpe() && !id.IsTpd())
2417 {
2418 if(StringLen(location) > 50)
2419 {
2420 ch = location[50];
2421 location[50] = '\0';
2422 }
2423 else
2424 ch = '\0';
2425
2426 if(name == NULL)
2427 ErrPostEx(SEV_REJECT, ERR_LOCATION_SeqIdProblem,
2428 "Empty or unsupported Seq-id found in CONTIG/CO line at location: \"%s\". Entry skipped.",
2429 location);
2430 else
2431 ErrPostEx(SEV_REJECT, ERR_LOCATION_SeqIdProblem,
2432 "Empty or unsupported Seq-id found in feature \"%s\" at location \"%s\". Entry skipped.",
2433 name, location);
2434 if(ch != '\0')
2435 location[50] = ch;
2436 ibp->drop = 1;
2437 return;
2438 }
2439
2440 const objects::CTextseq_id* text_id = id.GetTextseq_Id();
2441 if (text_id == NULL || !text_id->IsSetAccession())
2442 {
2443 if(StringLen(location) > 50)
2444 {
2445 ch = location[50];
2446 location[50] = '\0';
2447 }
2448 else
2449 ch = '\0';
2450 if(name == NULL)
2451 ErrPostEx(SEV_REJECT, ERR_LOCATION_SeqIdProblem,
2452 "Empty Seq-id found in CONTIG/CO line at location: \"%s\". Entry skipped.",
2453 location);
2454 else
2455 ErrPostEx(SEV_REJECT, ERR_LOCATION_SeqIdProblem,
2456 "Empty Seq-id found in feature \"%s\" at location \"%s\". Entry skipped.",
2457 name, location);
2458 if(ch != '\0')
2459 location[50] = ch;
2460 ibp->drop = 1;
2461 return;
2462 }
2463
2464 const Char* accession = text_id->GetAccession().c_str();
2465 if(iscon)
2466 {
2467 i = IsNewAccessFormat(accession);
2468 if(i == 3)
2469 {
2470 if(slip->wgscont == NULL)
2471 slip->wgscont = accession;
2472 else if(slip->wgsacc == NULL &&
2473 StringNCmp(slip->wgscont, accession, 4) != 0)
2474 slip->wgsacc = accession;
2475 }
2476 else if(i == 7)
2477 {
2478 if(slip->wgsscaf == NULL)
2479 slip->wgsscaf = accession;
2480 else if(slip->wgsacc == NULL &&
2481 StringNCmp(slip->wgsscaf, accession, 4) != 0)
2482 slip->wgsacc = accession;
2483 }
2484 }
2485
2486 accowner = GetNucAccOwner(accession, ibp->is_tpa);
2487 if(accowner == 0)
2488 accowner = GetProtAccOwner(accession);
2489
2490 if (accowner != 0)
2491 {
2492 if (accowner != id.Which())
2493 {
2494 CRef<objects::CTextseq_id> new_text_id(new objects::CTextseq_id);
2495 new_text_id->Assign(*text_id);
2496 SetTextId(accowner, id, *new_text_id);
2497 }
2498 }
2499
2500 else if(source == Parser::ESource::Flybase)
2501 {
2502 std::string acc(accession);
2503 id.SetGeneral().SetDb("FlyBase");
2504 id.SetGeneral().SetTag().SetStr(acc);
2505 }
2506 else if(source == Parser::ESource::USPTO)
2507 {
2508 CRef<objects::CPatent_seq_id> pat_id = MakeUsptoPatSeqId((char *) accession);
2509 id.SetPatent(*pat_id);
2510 }
2511 else
2512 {
2513 if(StringLen(location) > 50)
2514 {
2515 ch = location[50];
2516 location[50] = '\0';
2517 }
2518 else
2519 ch = '\0';
2520 if(name == NULL)
2521 ErrPostEx(SEV_REJECT, ERR_LOCATION_SeqIdProblem,
2522 "Invalid accession found in CONTIG/CO line at location: \"%s\". Entry skipped.",
2523 location);
2524 else
2525 ErrPostEx(SEV_REJECT, ERR_LOCATION_SeqIdProblem,
2526 "Invalid accession found in feature \"%s\" at location \"%s\". Entry skipped.",
2527 name, location);
2528 if(ch != '\0')
2529 location[50] = ch;
2530 ibp->drop = 1;
2531 return;
2532 }
2533
2534 slip->total++;
2535
2536 if (id.IsGenbank())
2537 {
2538 if(source != Parser::ESource::NCBI && source != Parser::ESource::All &&
2539 source != Parser::ESource::LANL && slip->badslp == nullptr)
2540 slip->badslp = &loc;
2541 slip->genbank = 1;
2542 }
2543 else if(id.IsEmbl())
2544 {
2545 if(source != Parser::ESource::EMBL && source != Parser::ESource::All &&
2546 slip->badslp == nullptr)
2547 slip->badslp = &loc;
2548 slip->embl = 1;
2549 }
2550 else if(id.IsPir())
2551 {
2552 if(source != Parser::ESource::PIR && source != Parser::ESource::All &&
2553 slip->badslp == nullptr)
2554 slip->badslp = &loc;
2555 slip->pir = 1;
2556 }
2557 else if(id.IsSwissprot())
2558 {
2559 if(source != Parser::ESource::SPROT && source != Parser::ESource::All &&
2560 slip->badslp == nullptr)
2561 slip->badslp = &loc;
2562 slip->swissprot = 1;
2563 }
2564 else if(id.IsOther())
2565 {
2566 if(source != Parser::ESource::Refseq && source != Parser::ESource::All &&
2567 slip->badslp == nullptr)
2568 slip->badslp = &loc;
2569 slip->other = 1;
2570 }
2571 else if(id.IsDdbj())
2572 {
2573 if(source != Parser::ESource::DDBJ && source != Parser::ESource::All &&
2574 slip->badslp == nullptr)
2575 slip->badslp = &loc;
2576 slip->ddbj = 1;
2577 }
2578 else if(id.IsPrf())
2579 {
2580 if(source != Parser::ESource::PRF && source != Parser::ESource::All &&
2581 slip->badslp == nullptr)
2582 slip->badslp = &loc;
2583 slip->prf = 1;
2584 }
2585 else if(id.IsTpg())
2586 {
2587 if(source != Parser::ESource::NCBI && source != Parser::ESource::All &&
2588 source != Parser::ESource::LANL && slip->badslp == nullptr)
2589 slip->badslp = &loc;
2590 slip->tpg = 1;
2591 }
2592 else if (id.IsTpe())
2593 {
2594 if(source != Parser::ESource::EMBL && source != Parser::ESource::All &&
2595 slip->badslp == nullptr)
2596 slip->badslp = &loc;
2597 slip->tpe = 1;
2598 }
2599 else if (id.IsTpd())
2600 {
2601 if(source != Parser::ESource::DDBJ && source != Parser::ESource::All &&
2602 slip->badslp == nullptr)
2603 slip->badslp = &loc;
2604 slip->tpd = 1;
2605 }
2606 }
2607
2608 /**********************************************************/
fta_do_fix_seq_loc_id(TSeqLocList & locs,IndexblkPtr ibp,char * location,char * name,SeqLocIdsPtr slip,bool iscon,Parser::ESource source)2609 static void fta_do_fix_seq_loc_id(TSeqLocList& locs, IndexblkPtr ibp,
2610 char* location, char* name,
2611 SeqLocIdsPtr slip, bool iscon, Parser::ESource source)
2612 {
2613 NON_CONST_ITERATE(TSeqLocList, loc, locs)
2614 {
2615 if ((*loc)->IsEmpty())
2616 {
2617 fta_fix_seq_id(*(*loc), (*loc)->SetEmpty(), ibp,
2618 location, name, slip, iscon, source);
2619 }
2620 else if ((*loc)->IsWhole())
2621 {
2622 fta_fix_seq_id(*(*loc), (*loc)->SetWhole(), ibp,
2623 location, name, slip, iscon, source);
2624 }
2625 else if ((*loc)->IsInt())
2626 {
2627 fta_fix_seq_id(*(*loc), (*loc)->SetInt().SetId(), ibp, location, name, slip, iscon, source);
2628 }
2629 else if ((*loc)->IsPnt())
2630 {
2631 fta_fix_seq_id(*(*loc), (*loc)->SetPnt().SetId(), ibp, location, name, slip, iscon, source);
2632 if (iscon && !(*loc)->GetPnt().IsSetFuzz())
2633 {
2634 int point = (*loc)->GetPnt().GetPoint();
2635 CRef<objects::CSeq_interval> interval(new objects::CSeq_interval);
2636 interval->SetFrom(point);
2637 interval->SetTo(point);
2638
2639 if ((*loc)->GetPnt().IsSetStrand())
2640 interval->SetStrand((*loc)->GetPnt().GetStrand());
2641
2642 interval->SetId((*loc)->SetPnt().SetId());
2643 (*loc)->SetInt(*interval);
2644 }
2645 }
2646 else if ((*loc)->IsPacked_int())
2647 {
2648 NON_CONST_ITERATE(objects::CPacked_seqint::Tdata, interval, (*loc)->SetPacked_int().Set())
2649 {
2650 fta_fix_seq_id(*(*loc), (*interval)->SetId(), ibp, location, name, slip, iscon, source);
2651 }
2652 }
2653 else if ((*loc)->IsPacked_pnt())
2654 {
2655 fta_fix_seq_id(*(*loc), (*loc)->SetPacked_pnt().SetId(), ibp, location, name, slip, iscon, source);
2656 }
2657 else if ((*loc)->IsMix())
2658 {
2659 fta_do_fix_seq_loc_id((*loc)->SetMix().Set(), ibp, location, name, slip, iscon, source);
2660 }
2661 else if ((*loc)->IsEquiv())
2662 {
2663 fta_do_fix_seq_loc_id((*loc)->SetEquiv().Set(), ibp,
2664 location, name, slip, iscon, source);
2665 }
2666 }
2667 }
2668
2669 /**********************************************************/
fta_fix_seq_loc_id(TSeqLocList & locs,ParserPtr pp,char * location,char * name,bool iscon)2670 Int4 fta_fix_seq_loc_id(TSeqLocList& locs, ParserPtr pp, char* location,
2671 char* name, bool iscon)
2672 {
2673 SeqLocIds sli;
2674 const Char *p = NULL;
2675 ErrSev sev;
2676 IndexblkPtr ibp;
2677 Char ch;
2678 Int4 tpa;
2679 Int4 non_tpa;
2680 Int4 i = 0;
2681
2682 ibp = pp->entrylist[pp->curindx];
2683
2684 MemSet(&sli, 0, sizeof(SeqLocIds));
2685 fta_do_fix_seq_loc_id(locs, ibp, location, name, &sli, iscon, pp->source);
2686
2687 tpa = sli.tpg + sli.tpe + sli.tpd;
2688 non_tpa = sli.genbank + sli.embl + sli.pir + sli.swissprot + sli.other +
2689 sli.ddbj + sli.prf;
2690
2691 if(iscon && sli.wgsacc == NULL && sli.wgscont != NULL &&
2692 sli.wgsscaf != NULL && StringNCmp(sli.wgscont, sli.wgsscaf, 4) != 0)
2693 sli.wgsacc = sli.wgsscaf;
2694
2695 ch = '\0';
2696 if((tpa > 0 && non_tpa > 0) || tpa > 1 || non_tpa > 1 ||
2697 (iscon && sli.wgscont != NULL && sli.wgsscaf != NULL))
2698 {
2699 if(StringLen(location) > 50)
2700 {
2701 ch = location[50];
2702 location[50] = '\0';
2703 }
2704 }
2705
2706 if(tpa > 0 && non_tpa > 0)
2707 {
2708 if(name == NULL)
2709 ErrPostEx(SEV_REJECT, ERR_LOCATION_TpaAndNonTpa,
2710 "The CONTIG/CO line with location \"%s\" refers to intervals on both primary and third-party sequence records. Entry skipped.",
2711 location);
2712 else
2713 ErrPostEx(SEV_REJECT, ERR_LOCATION_TpaAndNonTpa,
2714 "The \"%s\" feature at \"%s\" refers to intervals on both primary and third-party sequence records. Entry skipped.",
2715 name, location);
2716 ibp->drop = 1;
2717 }
2718
2719 if(tpa > 1 || non_tpa > 1)
2720 {
2721 if (!pp->allow_crossdb_featloc)
2722 {
2723 sev = SEV_REJECT;
2724 p = (char*) "Entry skipped.";
2725 ibp->drop = 1;
2726 }
2727 else
2728 {
2729 sev = SEV_WARNING;
2730 p = (char*) "";
2731 }
2732 if(name == NULL)
2733 {
2734 std::string label;
2735 if (sli.badslp != nullptr)
2736 sli.badslp->GetLabel(&label);
2737
2738 ErrPostEx(sev, ERR_LOCATION_CrossDatabaseFeatLoc,
2739 "The CONTIG/CO line refers to intervals on records from two or more INSDC databases. This is not allowed without review and approval : \"%s\".%s",
2740 label.empty() ? location : label.c_str(), p);
2741 }
2742 else
2743 ErrPostEx(sev, ERR_LOCATION_CrossDatabaseFeatLoc,
2744 "The \"%s\" feature at \"%s\" refers to intervals on records from two or more INSDC databases. This is not allowed without review and approval.%s",
2745 name, location, p);
2746 }
2747
2748 if(iscon)
2749 {
2750 if(sli.wgscont != NULL && sli.wgsscaf != NULL)
2751 ErrPostEx(SEV_ERROR, ERR_LOCATION_ContigAndScaffold,
2752 "The CONTIG/CO line with location \"%s\" refers to intervals on both WGS contig and WGS scaffold records.",
2753 location);
2754
2755 if(sli.wgsacc != NULL)
2756 {
2757 if(sli.wgscont != NULL &&
2758 StringNCmp(sli.wgscont, sli.wgsacc, 4) != 0)
2759 p = sli.wgscont;
2760 else if(sli.wgsscaf != NULL &&
2761 StringNCmp(sli.wgsscaf, sli.wgsacc, 4) != 0)
2762 p = sli.wgsscaf;
2763
2764 if(p != NULL)
2765 {
2766 Char msga[5],
2767 msgb[5];
2768
2769 StringNCpy(msga, sli.wgsacc, 4);
2770 StringNCpy(msgb, p, 4);
2771 msga[4] = msgb[4] = 0;
2772
2773 ErrPostEx(SEV_WARNING, ERR_SEQUENCE_MultipleWGSProjects,
2774 "This CON/scaffold record is assembled from the contigs of multiple WGS projects. First pair of WGS project codes is \"%s\" and \"%s\".",
2775 msgb, msga);
2776 }
2777 }
2778
2779 i = IsNewAccessFormat(ibp->acnum);
2780 if(i == 3 || i == 7)
2781 {
2782 p = NULL;
2783 if(sli.wgscont != NULL &&
2784 StringNCmp(sli.wgscont, ibp->acnum, 4) != 0)
2785 p = sli.wgscont;
2786 else if(sli.wgsscaf != NULL &&
2787 StringNCmp(sli.wgsscaf, ibp->acnum, 4) != 0)
2788 p = sli.wgsscaf;
2789 else if(sli.wgsacc != NULL &&
2790 StringNCmp(sli.wgsacc, ibp->acnum, 4) != 0)
2791 p = sli.wgsscaf;
2792
2793 if(p != NULL)
2794 {
2795 Char msg[5];
2796 StringNCpy(msg, p, 4);
2797 msg[4] = 0;
2798
2799 ErrPostEx(SEV_WARNING, ERR_ACCESSION_WGSPrefixMismatch,
2800 "This WGS CON/scaffold record is assembled from the contigs of different WGS projects. First differing WGS project code is \"%s\".",
2801 msg);
2802 }
2803 }
2804 }
2805
2806 if(ch != '\0')
2807 location[50] = ch;
2808
2809 if(sli.wgscont != NULL)
2810 sli.wgscont = NULL;
2811 if(sli.wgsscaf != NULL)
2812 sli.wgsscaf = NULL;
2813 if(sli.wgsacc != NULL)
2814 sli.wgsacc = NULL;
2815
2816 return(sli.total);
2817 }
2818
2819 /**********************************************************/
fta_vnp_structured_comment(char * buf)2820 static ValNodePtr fta_vnp_structured_comment(char* buf)
2821 {
2822 ValNodePtr res;
2823 ValNodePtr vnp;
2824 char* start;
2825 char* p;
2826 char* q;
2827 char* r;
2828 bool bad;
2829
2830 if(buf == NULL || *buf == '\0')
2831 return(NULL);
2832
2833 for(p = buf; *p != '\0'; p++)
2834 {
2835 if(*p != '~')
2836 continue;
2837
2838 for(p++; *p == ' ' || *p == '~'; p++)
2839 *p = ' ';
2840 p--;
2841 }
2842
2843 bad = false;
2844 res = ValNodeNew(NULL);
2845 vnp = res;
2846 for(start = buf, q = start;;)
2847 {
2848 p = StringStr(start, "::");
2849 if(p == NULL)
2850 {
2851 if(start == buf)
2852 bad = true;
2853 break;
2854 }
2855
2856 q = StringStr(p + 2, "::");
2857 if(q == NULL)
2858 {
2859 vnp->next = ValNodeNew(NULL);
2860 vnp = vnp->next;
2861 vnp->data.ptrvalue = StringSave(start);
2862 for(r = (char*) vnp->data.ptrvalue; *r != '\0'; r++)
2863 if(*r == '~')
2864 *r = ' ';
2865 ShrinkSpaces((char*) vnp->data.ptrvalue);
2866 break;
2867 }
2868
2869 *q = '\0';
2870 r = StringRChr(p + 2, '~');
2871 *q = ':';
2872 if(r == NULL)
2873 {
2874 bad = true;
2875 break;
2876 }
2877
2878 *r = '\0';
2879 vnp->next = ValNodeNew(NULL);
2880 vnp = vnp->next;
2881 vnp->data.ptrvalue = StringSave(start);
2882 *r = '~';
2883 for(p = (char*) vnp->data.ptrvalue; *p != '\0'; p++)
2884 if(*p == '~')
2885 *p = ' ';
2886 ShrinkSpaces((char*) vnp->data.ptrvalue);
2887
2888 start = r;
2889 }
2890
2891 vnp = res->next;
2892 res->next = NULL;
2893 ValNodeFree(res);
2894
2895 if(!bad)
2896 return(vnp);
2897
2898 ValNodeFreeData(vnp);
2899 return(NULL);
2900 }
2901
2902 /**********************************************************/
fta_build_structured_comment(char * tag,char * buf)2903 static CRef<objects::CUser_object> fta_build_structured_comment(char* tag, char* buf)
2904 {
2905 ValNodePtr vnp;
2906 ValNodePtr tvnp;
2907
2908 char* p;
2909 char* q;
2910
2911 CRef<objects::CUser_object> obj;
2912
2913 if (tag == NULL || *tag == '\0' || buf == NULL || *buf == '\0')
2914 return obj;
2915
2916 vnp = fta_vnp_structured_comment(buf);
2917 if(vnp == NULL)
2918 return obj;
2919
2920 obj.Reset((new objects::CUser_object));
2921
2922 objects::CObject_id& id = obj->SetType();
2923 id.SetStr("StructuredComment");
2924
2925 CRef<objects::CUser_field> field(new objects::CUser_field);
2926 field->SetLabel().SetStr("StructuredCommentPrefix");
2927
2928 field->SetData().SetStr() = tag;
2929 field->SetData().SetStr() += "-START##";
2930
2931 obj->SetData().push_back(field);
2932
2933 for(tvnp = vnp; tvnp != NULL; tvnp = tvnp->next)
2934 {
2935 p = (char*) tvnp->data.ptrvalue;
2936 if(p == NULL || *p == '\0')
2937 continue;
2938
2939 q = StringStr(p, "::");
2940 if(q == NULL)
2941 continue;
2942
2943 if(q > p && *(q - 1) == ' ')
2944 q--;
2945
2946 for(*q++ = '\0'; *q == ' ' || *q == ':';)
2947 q++;
2948
2949 if(*p == '\0' || *q == '\0')
2950 continue;
2951
2952 field.Reset(new objects::CUser_field);
2953 field->SetLabel().SetStr(p);
2954 field->SetData().SetStr(q);
2955
2956 obj->SetData().push_back(field);
2957 }
2958
2959 if (obj->GetData().size() < 2)
2960 {
2961 obj.Reset();
2962 return obj;
2963 }
2964
2965 field.Reset(new objects::CUser_field);
2966 field->SetLabel().SetStr("StructuredCommentSuffix");
2967 field->SetData().SetStr() = tag;
2968 field->SetData().SetStr() += "-END##";
2969
2970 obj->SetData().push_back(field);
2971
2972 ValNodeFreeData(vnp);
2973
2974 return obj;
2975 }
2976
2977 /**********************************************************/
fta_parse_structured_comment(char * str,bool & bad,TUserObjVector & objs)2978 void fta_parse_structured_comment(char* str, bool& bad, TUserObjVector& objs)
2979 {
2980 ValNodePtr tagvnp;
2981 ValNodePtr vnp;
2982
2983 char* start;
2984 char* tag = NULL;
2985 char* buf;
2986 char* p;
2987 char* q;
2988 char* r;
2989
2990 if(str == NULL || *str == '\0')
2991 return;
2992
2993 tagvnp = NULL;
2994 for(p = str;;)
2995 {
2996 p = StringStr(p, "-START##");
2997 if(p == NULL)
2998 break;
2999 for(q = p;; q--)
3000 if(*q == '~' || (*q == '#' && q > str && *--q == '#') || q == str)
3001 break;
3002 if(q[0] != '#' || q[1] != '#')
3003 {
3004 p += 8;
3005 continue;
3006 }
3007
3008 start = q;
3009
3010 *p = '\0';
3011 tag = StringSave(q);
3012 *p = '-';
3013
3014 for(q = p;;)
3015 {
3016 q = StringStr(q, tag);
3017 if(q == NULL)
3018 {
3019 bad = true;
3020 break;
3021 }
3022 size_t i = StringLen(tag);
3023 if(StringNCmp(q + i, "-END##", 6) != 0)
3024 {
3025 q += (i + 6);
3026 continue;
3027 }
3028 r = StringStr(p + 8, "-START##");
3029 if(r != NULL && r < q)
3030 {
3031 bad = true;
3032 break;
3033 }
3034 break;
3035 }
3036
3037 if (bad)
3038 break;
3039
3040 if(tagvnp == NULL)
3041 {
3042 tagvnp = ValNodeNew(NULL);
3043 tagvnp->data.ptrvalue = StringSave(tag);
3044 tagvnp->next = NULL;
3045 }
3046 else
3047 {
3048 for(vnp = tagvnp; vnp != NULL; vnp = vnp->next)
3049 {
3050 r = (char*) vnp->data.ptrvalue;
3051 if(StringCmp(r + 2, tag + 2) == 0)
3052 {
3053 if(*r != ' ')
3054 {
3055 ErrPostEx(SEV_ERROR, ERR_COMMENT_SameStructuredCommentTags,
3056 "More than one structured comment with the same tag \"%s\" found.",
3057 tag + 2);
3058 *r = ' ';
3059 }
3060 break;
3061 }
3062 if(vnp->next == NULL)
3063 {
3064 vnp->next = ValNodeNew(NULL);
3065 vnp->next->data.ptrvalue = StringSave(tag);
3066 vnp->next->next = NULL;
3067 break;
3068 }
3069 }
3070 }
3071
3072 if(StringCmp(tag, "##Metadata") == 0)
3073 {
3074 MemFree(tag);
3075 p += 8;
3076 continue;
3077 }
3078
3079 *q = '\0';
3080 if(StringStr(p + 8, "::") == NULL)
3081 {
3082 ErrPostEx(SEV_ERROR, ERR_COMMENT_StructuredCommentLacksDelim,
3083 "The structured comment in this record lacks the expected double-colon '::' delimiter between fields and values.");
3084 MemFree(tag);
3085 p += 8;
3086 *q = '#';
3087 continue;
3088 }
3089
3090 buf = StringSave(p + 8);
3091 *q = '#';
3092
3093 CRef<objects::CUser_object> cur = fta_build_structured_comment(tag, buf);
3094 MemFree(buf);
3095
3096 if (cur.Empty())
3097 {
3098 bad = true;
3099 break;
3100 }
3101
3102 objs.push_back(cur);
3103
3104 fta_StringCpy(start, q + StringLen(tag) + 6);
3105 MemFree(tag);
3106 p = start;
3107 }
3108
3109 if(bad)
3110 {
3111 ErrPostEx(SEV_REJECT, ERR_COMMENT_InvalidStructuredComment,
3112 "Incorrectly formatted structured comment with tag \"%s\" encountered. Entry dropped.",
3113 tag + 2);
3114 MemFree(tag);
3115 }
3116
3117 if(tagvnp != NULL)
3118 ValNodeFreeData(tagvnp);
3119 }
3120
3121 /**********************************************************/
GetQSFromFile(FILE * fd,IndexblkPtr ibp)3122 char* GetQSFromFile(FILE* fd, IndexblkPtr ibp)
3123 {
3124 char* ret;
3125 Char buf[1024];
3126
3127 if(fd == NULL || ibp->qslength < 1)
3128 return(NULL);
3129
3130 ret = (char*) MemNew(ibp->qslength + 10);
3131 ret[0] = '\0';
3132 fseek(fd, static_cast<long>(ibp->qsoffset), 0);
3133 while(fgets(buf, 1023, fd) != NULL)
3134 {
3135 if(buf[0] == '>' && ret[0] != '\0')
3136 break;
3137 StringCat(ret, buf);
3138 }
3139 return(ret);
3140 }
3141
3142 /**********************************************************/
fta_remove_cleanup_user_object(objects::CSeq_entry & seq_entry)3143 void fta_remove_cleanup_user_object(objects::CSeq_entry& seq_entry)
3144 {
3145 TSeqdescList* descrs = nullptr;
3146 if (seq_entry.IsSeq())
3147 {
3148 if (seq_entry.GetSeq().IsSetDescr())
3149 descrs = &seq_entry.SetSeq().SetDescr().Set();
3150 }
3151 else if (seq_entry.IsSet())
3152 {
3153 if (seq_entry.GetSet().IsSetDescr())
3154 descrs = &seq_entry.SetSet().SetDescr().Set();
3155 }
3156
3157 if (descrs == nullptr)
3158 return;
3159
3160 for (TSeqdescList::iterator descr = descrs->begin(); descr != descrs->end(); )
3161 {
3162 if (!(*descr)->IsUser())
3163 {
3164 ++descr;
3165 continue;
3166 }
3167
3168 const objects::CUser_object& user_obj = (*descr)->GetUser();
3169 if (!user_obj.IsSetType() || !user_obj.GetType().IsStr() ||
3170 user_obj.GetType().GetStr() != "NcbiCleanup")
3171 {
3172 ++descr;
3173 continue;
3174 }
3175
3176 descr = descrs->erase(descr);
3177 break;
3178 }
3179 }
3180
3181 /**********************************************************/
fta_tsa_tls_comment_dblink_check(const objects::CBioseq & bioseq,bool is_tsa)3182 void fta_tsa_tls_comment_dblink_check(const objects::CBioseq& bioseq,
3183 bool is_tsa)
3184 {
3185 bool got_comment = false;
3186 bool got_dblink = false;
3187
3188 ITERATE(TSeqdescList, descr, bioseq.GetDescr().Get())
3189 {
3190 if (!(*descr)->IsUser())
3191 continue;
3192
3193 const objects::CUser_object& user_obj = (*descr)->GetUser();
3194 if (!user_obj.IsSetType() || !user_obj.GetType().IsStr())
3195 continue;
3196
3197 const std::string& user_type_str = user_obj.GetType().GetStr();
3198
3199 if (user_type_str == "StructuredComment")
3200 got_comment = true;
3201 else if (user_type_str == "GenomeProjectsDB")
3202 got_dblink = true;
3203 else if (user_type_str == "DBLink")
3204 {
3205 ITERATE(objects::CUser_object::TData, field, user_obj.GetData())
3206 {
3207 if (!(*field)->IsSetLabel() || !(*field)->GetLabel().IsStr() ||
3208 (*field)->GetLabel().GetStr() != "BioProject")
3209 continue;
3210 got_dblink = true;
3211 break;
3212 }
3213 }
3214 }
3215
3216 if(!is_tsa)
3217 {
3218 if(!got_comment)
3219 ErrPostEx(SEV_WARNING, ERR_ENTRY_TLSLacksStructuredComment,
3220 "This TLS record lacks an expected structured comment.");
3221 if(!got_dblink)
3222 ErrPostEx(SEV_WARNING, ERR_ENTRY_TLSLacksBioProjectLink,
3223 "This TLS record lacks an expected BioProject or Project link.");
3224 }
3225 else
3226 {
3227 if(!got_comment)
3228 ErrPostEx(SEV_WARNING, ERR_ENTRY_TSALacksStructuredComment,
3229 "This TSA record lacks an expected structured comment.");
3230 if(!got_dblink)
3231 ErrPostEx(SEV_WARNING, ERR_ENTRY_TSALacksBioProjectLink,
3232 "This TSA record lacks an expected BioProject or Project link.");
3233 }
3234 }
3235
3236 /**********************************************************/
fta_set_molinfo_completeness(objects::CBioseq & bioseq,IndexblkPtr ibp)3237 void fta_set_molinfo_completeness(objects::CBioseq& bioseq, IndexblkPtr ibp)
3238 {
3239 if (bioseq.GetInst().GetTopology() != 2 || (ibp != NULL && ibp->gaps != NULL))
3240 return;
3241
3242 objects::CMolInfo* mol_info = nullptr;
3243 NON_CONST_ITERATE(TSeqdescList, descr, bioseq.SetDescr().Set())
3244 {
3245 if ((*descr)->IsMolinfo())
3246 {
3247 mol_info = &(*descr)->SetMolinfo();
3248 break;
3249 }
3250 }
3251
3252 if (mol_info != nullptr)
3253 {
3254 mol_info->SetCompleteness(1);
3255 }
3256 else
3257 {
3258 CRef<objects::CSeqdesc> descr(new objects::CSeqdesc);
3259 objects::CMolInfo& mol = descr->SetMolinfo();
3260 mol.SetCompleteness(1);
3261
3262 bioseq.SetDescr().Set().push_back(descr);
3263 }
3264 }
3265
3266 /**********************************************************/
fta_create_far_fetch_policy_user_object(objects::CBioseq & bsp,Int4 num)3267 void fta_create_far_fetch_policy_user_object(objects::CBioseq& bsp, Int4 num)
3268 {
3269 if (num < 1000)
3270 return;
3271
3272 ErrPostEx(SEV_INFO, ERR_SEQUENCE_HasManyComponents,
3273 "An OnlyNearFeatures FeatureFetchPolicy User-object has been added to this record because it is constructed from %d components, which exceeds the threshold of 999 for User-object creation.",
3274 num);
3275
3276 CRef<objects::CSeqdesc> descr(new objects::CSeqdesc);
3277 descr->SetUser().SetType().SetStr("FeatureFetchPolicy");
3278
3279 CRef<objects::CUser_field> field(new objects::CUser_field);
3280
3281 field->SetLabel().SetStr("Policy");
3282 field->SetData().SetStr("OnlyNearFeatures");
3283
3284 descr->SetUser().SetData().push_back(field);
3285
3286 bsp.SetDescr().Set().push_back(descr);
3287 }
3288
3289 /**********************************************************/
StripECO(char * str)3290 void StripECO(char* str)
3291 {
3292 char* p;
3293 char* q;
3294
3295 if(str == NULL || *str == '\0')
3296 return;
3297
3298 p = StringStr(str, "{ECO:");
3299 if(p == NULL)
3300 return;
3301
3302 for(;;)
3303 {
3304 q = StringChr(p + 1, '}');
3305 if(q == NULL)
3306 break;
3307 if(p > str && *(p - 1) == ' ')
3308 p--;
3309 if(p > str)
3310 if((*(p - 1) == '.' && q[1] == '.') ||
3311 (*(p - 1) == ';' && q[1] == ';'))
3312 p--;
3313 fta_StringCpy(p, q + 1);
3314 p = StringStr(p, "{ECO:");
3315 if(p == NULL)
3316 break;
3317 }
3318 }
3319
3320 /**********************************************************/
fta_dblink_has_sra(const CRef<objects::CUser_object> & uop)3321 bool fta_dblink_has_sra(const CRef<objects::CUser_object>& uop)
3322 {
3323 if (uop.Empty() || !uop->IsSetData() || !uop->IsSetType() ||
3324 !uop->GetType().IsStr() || uop->GetType().GetStr() != "DBLink")
3325 return false;
3326
3327 bool got = false;
3328
3329 ITERATE(objects::CUser_object::TData, field, uop->GetData())
3330 {
3331 if (!(*field)->IsSetData() || !(*field)->GetData().IsStrs() || !(*field)->IsSetNum() || (*field)->GetNum() < 1 ||
3332 !(*field)->IsSetLabel() || !(*field)->GetLabel().IsStr() || (*field)->GetLabel().GetStr() != "Sequence Read Archive")
3333 continue;
3334
3335 ITERATE(objects::CUser_field::C_Data::TStrs, str, (*field)->GetData().GetStrs())
3336 {
3337 if (str->size() > 2 &&
3338 ((*str)[0] == 'D' || (*str)[0] == 'E' || (*str)[0] == 'S') && (*str)[1] == 'R' &&
3339 ((*str)[2] == 'R' || (*str)[2] == 'X' || (*str)[2] == 'Z'))
3340 {
3341 got = true;
3342 break;
3343 }
3344 }
3345 if(got)
3346 break;
3347 }
3348 return(got);
3349 }
3350
3351 END_NCBI_SCOPE
3352