1 /* reblobber.c
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information (NCBI)
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government do not place any restriction on its use or reproduction.
13 * We would, however, appreciate having the NCBI and the author cited in
14 * any work or product based on this material
15 *
16 * Although all reasonable efforts have been taken to ensure the accuracy
17 * and reliability of the software and data, the NLM and the U.S.
18 * Government do not and cannot warrant the performance or results that
19 * may be obtained by using this software or data. The NLM and the U.S.
20 * Government disclaim all warranties, express or implied, including
21 * warranties of performance, merchantability or fitness for any particular
22 * purpose.
23 *
24 * ===========================================================================
25 *
26 * File Name: reblobber.c
27 *
28 * Author: Colleen Bollin
29 *
30 * Version Creation Date: Dec. 2, 2011
31 *
32 * $Revision: 1.7 $
33 *
34 * File Description:
35 *
36 * Modifications:
37 * --------------------------------------------------------------------------
38 * Date Name Description of modification
39 * ------- ---------- -----------------------------------------------------
40 *
41 *
42 * ==========================================================================
43 */
44
45 #include <ncbi.h>
46 #include <objall.h>
47 #include <objsset.h>
48 #include <objsub.h>
49 #include <objfdef.h>
50 #include <sequtil.h>
51 #include <salutil.h>
52 #include <edutil.h>
53 #include <seqport.h>
54 #include <gather.h>
55 #include <sqnutils.h>
56 #include <subutil.h>
57 #include <toasn3.h>
58 #include <valid.h>
59 #include <asn2gnbk.h>
60 #include <explore.h>
61 #include <tofasta.h>
62 #include <simple.h>
63 #include <suggslp.h>
64 #include <toporg.h>
65 #include <aliparse.h>
66 #include <util/creaders/alnread.h>
67 #include <pmfapi.h>
68 #include <tax3api.h>
69 #ifdef INTERNAL_NCBI_TBL2ASN
70 #include <accpubseq.h>
71 #endif
72 #define NLM_GENERATED_CODE_PROTO
73 #include <objmacro.h>
74 #include <macroapi.h>
75
76 #define REBLOBBER_APP_VER "1.0"
77
78 CharPtr REBLOBBER_APPLICATION = REBLOBBER_APP_VER;
79
80 typedef struct cleanupargs {
81 Boolean collection_dates;
82 Boolean collection_dates_month_first;
83 Boolean add_notes_to_overlapping_cds_without_abc;
84 Boolean extend_partial_features_to_gaps_or_ends;
85 Boolean add_exception_to_nonextendable_partials;
86 Boolean add_exception_to_short_introns;
87 FILE * cleanup_log;
88 } CleanupArgsData, PNTR CleanupArgsPtr;
89
90 typedef struct tblargs {
91 Boolean raw2delt;
92 Int2 r2dmin;
93 Boolean r2dunk100;
94 Boolean fastaset;
95 Int2 whichclass;
96 Boolean deltaset;
97 Boolean alignset;
98 Boolean gapped;
99 Boolean phrapace;
100 Boolean ftable;
101 Boolean genprodset;
102 Boolean delaygenprodset;
103 Boolean linkbyoverlap;
104 Boolean linkbyproduct;
105 Boolean implicitgaps;
106 Boolean forcelocalid;
107 Boolean gpstonps;
108 Boolean gnltonote;
109 Boolean removeunnecxref;
110 Boolean dotaxlookup;
111 Boolean dopublookup;
112 CharPtr accn;
113 CharPtr center;
114 Int4 project_version;
115 CharPtr organism;
116 CharPtr srcquals;
117 CharPtr comment;
118 CharPtr commentFile;
119 CharPtr tableFile;
120 Boolean findorf;
121 Boolean runonorf;
122 Boolean altstart;
123 Boolean conflict;
124 Boolean validate;
125 Boolean relaxed;
126 Boolean validate_barcode;
127 Boolean flatfile;
128 Boolean genereport;
129 Boolean seqidfromfile;
130 Boolean smartfeats;
131 Boolean refSeqTitles;
132 Boolean smarttitle;
133 Boolean logtoterminal;
134 CharPtr aln_beginning_gap;
135 CharPtr aln_end_gap;
136 CharPtr aln_middle_gap;
137 CharPtr aln_missing;
138 CharPtr aln_match;
139 Boolean aln_is_protein;
140 Boolean save_bioseq_set;
141 Boolean auto_def;
142 Boolean apply_cmt_to_all;
143 Boolean adjust_mrna_for_cds_stop_codon;
144 Boolean seq_fetch_failure;
145
146 GlobalDiscrepReportPtr global_report;
147
148 CleanupArgsData cleanup_args;
149 } TblArgs, PNTR TblArgsPtr;
150
151
RetryRead(CharPtr filename)152 static void RetryRead (CharPtr filename)
153 {
154 AsnIoPtr aip;
155 SeqEntryPtr orig_sep;
156 FILE *fp;
157
158 fp = FileOpen (filename, "r");
159 if (fp == NULL) {
160 Message (MSG_ERROR, "Unable to open %s", filename);
161 }
162 FileClose (fp);
163
164 /* for debugging purposes, try to read it again */
165 aip = AsnIoOpen (filename, "r");
166 orig_sep = SeqEntryAsnRead (aip, NULL);
167 AsnIoClose (aip);
168 }
169
170
171 /* source information for several common organisms sequenced by genome centers */
172
173
174 typedef struct gcmdata {
175 SeqFeatPtr gene;
176 SeqFeatPtr feat;
177 CharPtr label;
178 } GmcData, PNTR GmcDataPtr;
179
SortByGenePtr(VoidPtr vp1,VoidPtr vp2)180 static int LIBCALLBACK SortByGenePtr (
181 VoidPtr vp1,
182 VoidPtr vp2
183 )
184
185 {
186 GmcDataPtr gdp1, gdp2;
187
188 if (vp1 == NULL || vp2 == NULL) return 0;
189 gdp1 = (GmcDataPtr) vp1;
190 gdp2 = (GmcDataPtr) vp2;
191 if (gdp1 == NULL || gdp2 == NULL) return 0;
192
193 if (gdp1->gene > gdp2->gene) return -1;
194 if (gdp1->gene < gdp2->gene) return 1;
195
196 if (gdp1->feat > gdp2->feat) return -1;
197 if (gdp1->feat < gdp2->feat) return 1;
198
199 return 0;
200 }
201
202
203
EnhanceOneCDS(SeqFeatPtr sfp,Boolean alt_splice)204 static void EnhanceOneCDS (
205 SeqFeatPtr sfp,
206 Boolean alt_splice
207 )
208
209 {
210 DbtagPtr dbt;
211 GBQualPtr gbq;
212 Char id [64];
213 SeqIdPtr ids, sip;
214 size_t len;
215 CharPtr name, nwstr, ptr, str;
216 ObjectIdPtr oip;
217 ProtRefPtr prp;
218 Char tmp [256];
219 ValNodePtr vnp;
220 SeqFeatXrefPtr xref;
221
222 if (sfp == NULL || sfp->data.choice != SEQFEAT_CDREGION) return;
223
224 name = NULL;
225 vnp = NULL;
226 prp = NULL;
227
228 for (xref = sfp->xref; xref != NULL; xref = xref->next) {
229 if (xref->data.choice == SEQFEAT_PROT) {
230 prp = (ProtRefPtr) xref->data.value.ptrvalue;
231 }
232 }
233
234 id [0] = '\0';
235 for (gbq = sfp->qual; gbq != NULL; gbq = gbq->next) {
236 if (StringICmp (gbq->qual, "protein_id") == 0) {
237 StringNCpy_0 (id, gbq->val, sizeof (id));
238 }
239 }
240 if (StringDoesHaveText (id) && StringChr (id, '|') != NULL) {
241 str = NULL;
242 ids = SeqIdParse (id);
243 for (sip = ids; sip != NULL; sip = sip->next) {
244 if (sip->choice != SEQID_GENERAL) continue;
245 dbt = (DbtagPtr) sip->data.ptrvalue;
246 if (dbt == NULL) continue;
247 if (IsSkippableDbtag (dbt)) continue;
248 oip = dbt->tag;
249 if (oip == NULL) continue;
250 str = oip->str;
251 }
252
253 if (StringDoesHaveText (str)) {
254 if (prp != NULL && prp->name != NULL) {
255 vnp = prp->name;
256 name = (CharPtr) vnp->data.ptrvalue;
257 }
258 if (StringDoesHaveText (name) && vnp != NULL) {
259 if (alt_splice) {
260 ptr = StringChr (str, '-');
261 if (ptr != NULL && StringLen (ptr) == 3) {
262 ptr++;
263 ptr++;
264 sprintf (tmp, "%s, isoform %s", str, ptr);
265 len = StringLen (name) + StringLen (", ") + StringLen (tmp);
266 nwstr = (CharPtr) MemNew (sizeof (Char) * (len + 2));
267 if (nwstr != NULL) {
268 StringCpy (nwstr, name);
269 /*
270 StringCat (nwstr, ", ");
271 */
272 StringCat (nwstr, " ");
273 StringCat (nwstr, tmp);
274 vnp->data.ptrvalue = (Pointer) nwstr;
275 MemFree (name);
276 }
277 } else {
278 AddQualifierToFeature (sfp, "product", str);
279 }
280 } else {
281 len = StringLen (name) + StringLen (", ") + StringLen (str);
282 nwstr = (CharPtr) MemNew (sizeof (Char) * (len + 2));
283 if (nwstr != NULL) {
284 StringCpy (nwstr, name);
285 /*
286 StringCat (nwstr, ", ");
287 */
288 StringCat (nwstr, " ");
289 StringCat (nwstr, str);
290 vnp->data.ptrvalue = (Pointer) nwstr;
291 MemFree (name);
292 }
293 }
294 } else {
295 if (alt_splice) {
296 ptr = StringChr (str, '-');
297 if (ptr != NULL && StringLen (ptr) == 3) {
298 ptr++;
299 ptr++;
300 sprintf (tmp, "%s, isoform %s", str, ptr);
301 AddQualifierToFeature (sfp, "product", tmp);
302 } else {
303 AddQualifierToFeature (sfp, "product", str);
304 }
305 } else {
306 AddQualifierToFeature (sfp, "product", str);
307 }
308 }
309 }
310
311 SeqIdSetFree (ids);
312 }
313 }
314
EnhanceOneRna(SeqFeatPtr sfp,Boolean alt_splice)315 static void EnhanceOneRna (
316 SeqFeatPtr sfp,
317 Boolean alt_splice
318 )
319
320 {
321 DbtagPtr dbt;
322 GBQualPtr gbq, nm_gbq;
323 Char id [64];
324 SeqIdPtr ids, sip;
325 size_t len;
326 CharPtr name, nwstr, ptr, str;
327 ObjectIdPtr oip;
328 RnaRefPtr rrp;
329 Char tmp [256];
330
331 if (sfp == NULL || sfp->data.choice != SEQFEAT_RNA) return;
332
333 name = NULL;
334 nm_gbq = NULL;
335
336 rrp = (RnaRefPtr) sfp->data.value.ptrvalue;
337 if (rrp != NULL && rrp->ext.choice == 1) {
338 switch (rrp->type) {
339 case 1 : /* precurrsor_RNA */
340 case 2 : /* mRNA */
341 case 4 : /* rRNA */
342 name = rrp->ext.value.ptrvalue;
343 break;
344 case 255 : /* misc_RNA, ncRNA, tmRNA */
345 for (gbq = sfp->qual; gbq != NULL; gbq = gbq->next) {
346 if (StringICmp (gbq->qual, "product") == 0) {
347 nm_gbq = gbq;
348 name = gbq->val;
349 }
350 }
351 break;
352 case 3: /* tRNA */
353 return;
354 default :
355 break;
356 }
357 }
358
359 id [0] = '\0';
360 for (gbq = sfp->qual; gbq != NULL; gbq = gbq->next) {
361 if (StringICmp (gbq->qual, "transcript_id") == 0) {
362 StringNCpy_0 (id, gbq->val, sizeof (id));
363 }
364 }
365 if (StringDoesHaveText (id) && StringChr (id, '|') != NULL) {
366 str = NULL;
367 ids = SeqIdParse (id);
368 for (sip = ids; sip != NULL; sip = sip->next) {
369 if (sip->choice != SEQID_GENERAL) continue;
370 dbt = (DbtagPtr) sip->data.ptrvalue;
371 if (dbt == NULL) continue;
372 if (IsSkippableDbtag(dbt)) continue;
373 oip = dbt->tag;
374 if (oip == NULL) continue;
375 str = oip->str;
376 }
377
378 if (StringDoesHaveText (str)) {
379 if (StringDoesHaveText (name) && StringCmp (str, name) != 0) {
380 if (alt_splice) {
381 ptr = StringChr (str, '-');
382 if (ptr != NULL && StringLen (ptr) == 3) {
383 ptr++;
384 ptr++;
385 sprintf (tmp, "%s, transcript variant %s", str, ptr);
386 len = StringLen (name) + StringLen (", ") + StringLen (tmp);
387 nwstr = (CharPtr) MemNew (sizeof (Char) * (len + 2));
388 if (nwstr != NULL) {
389 StringCpy (nwstr, name);
390 /*
391 StringCat (nwstr, ", ");
392 */
393 StringCat (nwstr, " ");
394 StringCat (nwstr, tmp);
395 if (nm_gbq != NULL) {
396 nm_gbq->val = (Pointer) nwstr;
397 } else if (rrp != NULL) {
398 rrp->ext.value.ptrvalue = (Pointer) nwstr;
399 }
400 MemFree (name);
401 }
402 } else {
403 AddQualifierToFeature (sfp, "product", str);
404 }
405 } else {
406 len = StringLen (name) + StringLen (", ") + StringLen (str);
407 nwstr = (CharPtr) MemNew (sizeof (Char) * (len + 2));
408 if (nwstr != NULL) {
409 StringCpy (nwstr, name);
410 /*
411 StringCat (nwstr, ", ");
412 */
413 StringCat (nwstr, " ");
414 StringCat (nwstr, str);
415 if (nm_gbq != NULL) {
416 nm_gbq->val = (Pointer) nwstr;
417 } else if (rrp != NULL) {
418 rrp->ext.value.ptrvalue = (Pointer) nwstr;
419 }
420 MemFree (name);
421 }
422 }
423 } else {
424 if (alt_splice) {
425 ptr = StringChr (str, '-');
426 if (ptr != NULL && StringLen (ptr) == 3) {
427 ptr++;
428 ptr++;
429 sprintf (tmp, "%s, transcript variant %s", str, ptr);
430 AddQualifierToFeature (sfp, "product", tmp);
431 } else {
432 AddQualifierToFeature (sfp, "product", str);
433 }
434 } else {
435 AddQualifierToFeature (sfp, "product", str);
436 }
437 }
438 }
439
440 SeqIdSetFree (ids);
441 }
442 }
443
EnhanceFeatureAnnotation(SeqFeatPtr features,BioseqPtr bsp)444 static void EnhanceFeatureAnnotation (
445 SeqFeatPtr features,
446 BioseqPtr bsp
447 )
448
449 {
450 GmcDataPtr gdp, head;
451 GeneRefPtr grp;
452 Int2 i, j, k, numgene, numcds, numrna;
453 SeqFeatPtr sfp;
454
455 if (features == NULL || bsp == NULL) return;
456
457 numgene = 0;
458 numcds = 0;
459 numrna = 0;
460
461 for (sfp = features; sfp != NULL; sfp = sfp->next) {
462 switch (sfp->data.choice) {
463 case SEQFEAT_GENE :
464 numgene++;
465 break;
466 case SEQFEAT_CDREGION :
467 numcds++;
468 break;
469 case SEQFEAT_RNA :
470 numrna++;
471 break;
472 default :
473 break;
474 }
475 }
476
477 if (numgene == 0) return;
478
479 if (numcds > 0) {
480 head = (GmcDataPtr) MemNew (sizeof (GmcData) * (size_t) (numcds + 1));
481 if (head != NULL) {
482 gdp = head;
483 for (sfp = features; sfp != NULL; sfp = sfp->next) {
484 if (sfp->idx.subtype == FEATDEF_CDS) {
485 gdp->feat = sfp;
486 grp = SeqMgrGetGeneXref (sfp);
487 if (grp == NULL || (! SeqMgrGeneIsSuppressed (grp))) {
488 gdp->gene = SeqMgrGetOverlappingGene (sfp->location, NULL);
489 }
490 gdp++;
491 }
492 }
493 HeapSort (head, (size_t) numcds, sizeof (GmcData), SortByGenePtr);
494 for (i = 0; i < numcds; i += j) {
495 sfp = head [i].gene;
496 for (j = 1; i + j < numcds && sfp == head [i + j].gene; j++) continue;
497 if (j == 1) {
498 /* no alt splicing */
499 EnhanceOneCDS (head [i].feat, FALSE);
500 } else {
501 /* is alt splicing */
502 for (k = 0; k < j; k++) {
503 EnhanceOneCDS (head [i + k].feat, TRUE);
504 }
505 }
506 }
507 }
508 MemFree (head);
509 }
510
511 if (numrna > 0) {
512 head = (GmcDataPtr) MemNew (sizeof (GmcData) * (size_t) (numrna + 1));
513 if (head != NULL) {
514 gdp = head;
515 for (sfp = features; sfp != NULL; sfp = sfp->next) {
516 if (sfp->data.choice == SEQFEAT_RNA) {
517 gdp->feat = sfp;
518 grp = SeqMgrGetGeneXref (sfp);
519 if (grp == NULL || (! SeqMgrGeneIsSuppressed (grp))) {
520 gdp->gene = SeqMgrGetOverlappingGene (sfp->location, NULL);
521 }
522 gdp++;
523 }
524 }
525 HeapSort (head, (size_t) numrna, sizeof (GmcData), SortByGenePtr);
526 for (i = 0; i < numrna; i += j) {
527 sfp = head [i].gene;
528 for (j = 1; i + j < numrna && sfp == head [i + j].gene; j++) continue;
529 if (j == 1) {
530 /* no alt splicing */
531 EnhanceOneRna (head [i].feat, FALSE);
532 } else {
533 /* is alt splicing */
534 for (k = 0; k < j; k++) {
535 EnhanceOneRna (head [i + k].feat, TRUE);
536 }
537 }
538 }
539 }
540 MemFree (head);
541 }
542 }
543
AttachSeqAnnotEntity(Uint2 entityID,SeqAnnotPtr sap,TblArgsPtr tbl)544 static BioseqPtr AttachSeqAnnotEntity (
545 Uint2 entityID,
546 SeqAnnotPtr sap,
547 TblArgsPtr tbl
548 )
549
550 {
551 SeqAnnotPtr anp;
552 BioseqPtr bsp;
553 Char buf [80];
554 Int2 genCode;
555 SeqEntryPtr oldscope;
556 SeqEntryPtr sep;
557 SeqFeatPtr sfp = NULL;
558 SeqIdPtr sip;
559 SeqLocPtr slp;
560
561 if (sap == NULL || tbl == NULL) return NULL;
562
563 bsp = GetBioseqReferencedByAnnot (sap, entityID);
564 if (bsp == NULL) {
565 oldscope = SeqEntrySetScope (NULL);
566 if (oldscope != NULL) {
567 bsp = GetBioseqReferencedByAnnot (sap, entityID);
568 SeqEntrySetScope (oldscope);
569 }
570 }
571 if (bsp != NULL) {
572 sep = SeqMgrGetSeqEntryForData (bsp);
573 entityID = ObjMgrGetEntityIDForChoice (sep);
574 if (sap->type == 1) {
575 sfp = (SeqFeatPtr) sap->data;
576 genCode = GetGenCodeForBsp (bsp);
577 SetEmptyGeneticCodes (sap, genCode);
578 }
579 if (bsp->annot == NULL) {
580 bsp->annot = sap;
581 } else {
582 anp = bsp->annot;
583 while (anp->next != NULL) {
584 anp = anp->next;
585 }
586 anp->next = sap;
587 }
588 if (sfp != NULL) {
589 if (tbl->smartfeats) {
590
591 /* indexing needed to find mRNA and CDS within each gene */
592
593 SeqMgrIndexFeatures (entityID, NULL);
594
595 EnhanceFeatureAnnotation (sfp, bsp);
596 }
597
598 PromoteXrefsExEx (sfp, bsp, entityID, TRUE, FALSE, tbl->genprodset, tbl->forcelocalid, &(tbl->seq_fetch_failure));
599 sep = GetTopSeqEntryForEntityID (entityID);
600 }
601 } else {
602 buf [0] = '\0';
603 if (sap->type == 1) {
604 sfp = (SeqFeatPtr) sap->data;
605 if (sfp != NULL && sfp->location != NULL) {
606 slp = SeqLocFindNext (sfp->location, NULL);
607 if (slp != NULL) {
608 sip = SeqLocId (slp);
609 if (sip != NULL) {
610 SeqIdWrite (sip, buf, PRINTID_FASTA_LONG, sizeof (buf) - 1);
611 }
612 }
613 }
614 }
615 // Message (MSG_POSTERR, "Feature table identifiers %s do not match record", buf);
616 }
617 sep = GetTopSeqEntryForEntityID (entityID);
618 return bsp;
619 }
620
621
622
623
ProcessOneAnnot(SeqAnnotPtr sap,Uint2 entityID,TblArgsPtr tbl)624 static void ProcessOneAnnot (
625 SeqAnnotPtr sap,
626 Uint2 entityID,
627 TblArgsPtr tbl
628 )
629
630 {
631 BioseqPtr bsp;
632 GBQualPtr gbq;
633 Int2 genCode;
634 SeqFeatPtr sfp;
635 SeqEntryPtr sep;
636
637 if (sap == NULL || tbl == NULL) return;
638
639 if (tbl->delaygenprodset) {
640 if (sap->type == 1) {
641 for (sfp = (SeqFeatPtr) sap->data; sfp != NULL; sfp = sfp->next) {
642 for (gbq = sfp->qual; gbq != NULL; gbq = gbq->next) {
643 if (StringICmp (gbq->qual, "protein_id") == 0 && sfp->data.choice == SEQFEAT_RNA) {
644 gbq->qual = MemFree (gbq->qual);
645 gbq->qual = StringSave ("orig_protein_id");
646 }
647 if (StringICmp (gbq->qual, "transcript_id") == 0) {
648 gbq->qual = MemFree (gbq->qual);
649 gbq->qual = StringSave ("orig_transcript_id");
650 }
651 }
652 }
653 }
654 }
655
656 bsp = AttachSeqAnnotEntity (entityID, sap, tbl);
657 if (bsp == NULL) return;
658
659 sep = GetTopSeqEntryForEntityID (entityID);
660
661 /* correct all idx parent pointers */
662
663 AssignIDsInEntity (entityID, 0, NULL);
664
665 genCode = GetGenCodeForBsp (bsp);
666
667 /* coercion of SeqIds to accession moved to ProcessOneRecord->MakeAccessionID */
668
669 /* for parsed in features or best ORF, promote CDS products to protein bioseq */
670
671 for (sap = bsp->annot; sap != NULL; sap = sap->next) {
672 if (sap->type == 1) {
673 SetEmptyGeneticCodes (sap, genCode);
674 sfp = (SeqFeatPtr) sap->data;
675 PromoteXrefsExEx (sfp, bsp, entityID, TRUE, FALSE, tbl->genprodset, tbl->forcelocalid, &(tbl->seq_fetch_failure));
676 }
677 }
678 sep = GetTopSeqEntryForEntityID (entityID);
679 move_cds_ex (sep, FALSE);
680 }
681
682
683
684
685
686 typedef struct raw2deltdata {
687 Uint2 entityID;
688 SubmitBlockPtr sbp;
689 BioSourcePtr src;
690 TblArgsPtr tbl;
691 MolInfoPtr template_molinfo;
692 } Raw2DeltData, PNTR Raw2DeltPtr;
693
694
695
696
697 typedef struct resqseqgph {
698 Int2 index;
699 SeqGraphPtr sgp;
700 } ResqSeqgph, PNTR ResqSeqgphPtr;
701
702
703
704
705
706 typedef struct reqcontig {
707 Int2 index;
708 Char str [41];
709 } ResqContig, PNTR ResqContigPtr;
710
711 #define MAX_FIELDS 8
712
713
714
715
716
DoSequenceLengthsMatch(TAlignmentFilePtr afp)717 static Boolean DoSequenceLengthsMatch (
718 TAlignmentFilePtr afp
719 )
720
721 {
722 int seq_index;
723 Int4 seq_len;
724
725 if (afp == NULL || afp->sequences == NULL || afp->num_sequences == 0) {
726 return TRUE;
727 }
728 seq_len = StringLen (afp->sequences[0]);
729 for (seq_index = 1; seq_index < afp->num_sequences; seq_index++) {
730 if (StringLen (afp->sequences[seq_index]) != seq_len) {
731 return FALSE;
732 }
733 }
734 return TRUE;
735 }
736
737
738
739 #ifdef INTERNAL_NCBI_ASNDISC
740 const PerformDiscrepancyTest taxlookup = CheckTaxNamesAgainstTaxDatabase;
741 #else
742 const PerformDiscrepancyTest taxlookup = NULL;
743 #endif
744
745
746
747
748 typedef struct reblobseq {
749 SeqIdPtr sip;
750 CharPtr orig_file;
751 CharPtr feature_file;
752 CharPtr src_file;
753 CharPtr quality_file;
754 BioseqPtr bsp;
755 } ReblobSeqData, PNTR ReblobSeqPtr;
756
757
ReblobSeqNew(ValNodePtr token_list)758 static ReblobSeqPtr ReblobSeqNew (ValNodePtr token_list)
759 {
760 ReblobSeqPtr rs;
761 Char id_buf[PATH_MAX];
762
763 rs = (ReblobSeqPtr) MemNew (sizeof (ReblobSeqData));
764 if (token_list != NULL) {
765 sprintf (id_buf, "gb|%s", (CharPtr) token_list->data.ptrvalue);
766 rs->sip = MakeSeqID (id_buf);
767 if (token_list->next != NULL) {
768 rs->orig_file = token_list->next->data.ptrvalue;
769 token_list->next->data.ptrvalue = NULL;
770 if (token_list->next->next != NULL) {
771 rs->feature_file = token_list->next->next->data.ptrvalue;
772 token_list->next->next->data.ptrvalue = NULL;
773 if (token_list->next->next->next != NULL) {
774 rs->src_file = token_list->next->next->next->data.ptrvalue;
775 token_list->next->next->next->data.ptrvalue = NULL;
776 if (token_list->next->next->next->next != NULL) {
777 rs->quality_file = token_list->next->next->next->next->data.ptrvalue;
778 token_list->next->next->next->next->data.ptrvalue = NULL;
779 }
780 }
781 }
782 }
783 }
784 return rs;
785 }
786
787
ReblobSeqFree(ReblobSeqPtr rs)788 static ReblobSeqPtr ReblobSeqFree (ReblobSeqPtr rs)
789 {
790 if (rs != NULL) {
791 rs->sip = SeqIdFree (rs->sip);
792 rs->orig_file = MemFree (rs->orig_file);
793 rs->feature_file = MemFree (rs->feature_file);
794 rs->src_file = MemFree (rs->src_file);
795 rs->quality_file = MemFree (rs->quality_file);
796 rs = MemFree (rs);
797 }
798 return rs;
799 }
800
801
ReblobSeqListFree(ValNodePtr vnp)802 static ValNodePtr ReblobSeqListFree (ValNodePtr vnp)
803 {
804 ValNodePtr vnp_next;
805
806 while (vnp != NULL) {
807 vnp_next = vnp->next;
808 vnp->next = NULL;
809 vnp->data.ptrvalue = ReblobSeqFree (vnp->data.ptrvalue);
810 vnp = ValNodeFree (vnp);
811 vnp = vnp_next;
812 }
813 return vnp;
814 }
815
816
817 typedef struct reblobgroup {
818 CharPtr filename;
819 ValNodePtr seq_list;
820 } ReblobGroupData, PNTR ReblobGroupPtr;
821
822
ReblobGroupNew(CharPtr filename,ValNodePtr seq_list)823 static ReblobGroupPtr ReblobGroupNew (CharPtr filename, ValNodePtr seq_list)
824 {
825 ReblobGroupPtr rg;
826
827 rg = (ReblobGroupPtr) MemNew (sizeof (ReblobGroupData));
828 rg->filename = StringSave (filename);
829 rg->seq_list = seq_list;
830 return rg;
831 }
832
833
ReblobGroupFree(ReblobGroupPtr rg)834 static ReblobGroupPtr ReblobGroupFree (ReblobGroupPtr rg)
835 {
836 if (rg != NULL) {
837 rg->filename = MemFree (rg->filename);
838 rg->seq_list = ReblobSeqListFree (rg->seq_list);
839 rg = MemFree (rg);
840 }
841 return rg;
842 }
843
844
ReblobGroupListFree(ValNodePtr vnp)845 static ValNodePtr ReblobGroupListFree (ValNodePtr vnp)
846 {
847 ValNodePtr vnp_next;
848
849 while (vnp != NULL) {
850 vnp_next = vnp->next;
851 vnp->next = NULL;
852 vnp->data.ptrvalue = ReblobGroupFree (vnp->data.ptrvalue);
853 vnp = ValNodeFree (vnp);
854 vnp = vnp_next;
855 }
856 return vnp;
857 }
858
859
860 const char *nucleotide_alphabet = "ABCDGHKMRSTUVWYabcdghkmrstuvwy";
861 const char *protein_alphabet = "ABCDEFGHIKLMPQRSTUVWXYZabcdefghiklmpqrstuvwxyz";
862
GetDefaultSequenceInfo(void)863 extern TSequenceInfoPtr GetDefaultSequenceInfo (void)
864 {
865 TSequenceInfoPtr sequence_info = SequenceInfoNew();
866
867 sequence_info->missing = MemFree (sequence_info->missing);
868 sequence_info->missing = StringSave ("?Nn");
869
870 sequence_info->beginning_gap = MemFree (sequence_info->beginning_gap);
871 sequence_info->beginning_gap = StringSave ("-.Nn?");
872
873 sequence_info->middle_gap = MemFree (sequence_info->middle_gap);
874 sequence_info->middle_gap = StringSave ("-.");
875
876 sequence_info->end_gap = MemFree (sequence_info->end_gap);
877 sequence_info->end_gap = StringSave ("-.Nn?");
878
879 sequence_info->match = MemFree (sequence_info->match);
880 sequence_info->match = StringSave (":");
881
882 sequence_info->alphabet = nucleotide_alphabet;
883
884 return sequence_info;
885 }
886
887 static void MakeFeatureTableFileIdsGenbank (SeqAnnotPtr sap);
888
AddOneFeatureTable(CharPtr filename,SeqEntryPtr sep,Uint2 entityID)889 static Boolean AddOneFeatureTable (CharPtr filename, SeqEntryPtr sep, Uint2 entityID)
890 {
891 FILE *fp;
892 Pointer dataptr;
893 Uint2 datatype;
894 Boolean failure = FALSE;
895 Int4 linenum = 0;
896 SeqAnnotPtr sap;
897 TblArgs tbl_args;
898 SeqEntryPtr orig_scope;
899
900
901 if (StringHasNoText (filename)) {
902 return FALSE;
903 }
904 fp = FileOpen (filename, "r");
905 if (fp == NULL) {
906 Message (MSG_ERROR, "Unable to open feature table file %s\n", filename);
907 return FALSE;
908 }
909 orig_scope = SeqEntrySetScope (sep);
910
911 MemSet (&tbl_args, 0, sizeof (TblArgs));
912
913 while ((! failure) && (dataptr = ReadFeatureTableFile (fp, &datatype, NULL, &linenum, &failure, TRUE)) != NULL) {
914 if (datatype == OBJ_SEQANNOT) {
915 sap = (SeqAnnotPtr) dataptr;
916 MakeFeatureTableFileIdsGenbank (sap);
917 ProcessOneAnnot (sap, entityID, &tbl_args);
918
919 } else {
920 ObjMgrFree (datatype, dataptr);
921 }
922 }
923 FileClose (fp);
924
925 SeqEntrySetScope (orig_scope);
926
927 if (failure) {
928 Message (MSG_POSTERR, "Bad feature table at line %ld of file %s", (long) linenum, filename);
929 return FALSE;
930 } else {
931 return TRUE;
932 }
933 }
934
935
AddFeaturesToGroup(ReblobGroupPtr rg,SeqEntryPtr sep,Uint2 entityID,CharPtr file_filter)936 static Boolean AddFeaturesToGroup (ReblobGroupPtr rg, SeqEntryPtr sep, Uint2 entityID, CharPtr file_filter)
937 {
938 ValNodePtr vnp;
939 ReblobSeqPtr rs;
940 ValNodeBlock file_list;
941 Boolean rval = TRUE;
942
943 if (rg == NULL) {
944 return FALSE;
945 }
946
947 InitValNodeBlock (&file_list, NULL);
948
949 for (vnp = rg->seq_list; vnp != NULL; vnp = vnp->next) {
950 rs = vnp->data.ptrvalue;
951 if (rs != NULL && !StringHasNoText (rs->feature_file)
952 && (StringHasNoText (file_filter) || StringCmp (file_filter, rs->orig_file) == 0)) {
953 ValNodeAddPointerToEnd (&file_list, 0, rs->feature_file);
954 }
955 }
956
957 file_list.head = ValNodeSort (file_list.head, SortVnpByString);
958 ValNodeUnique (&file_list.head, SortVnpByString, ValNodeFree);
959
960 for (vnp = file_list.head; vnp != NULL; vnp = vnp->next) {
961 AddOneFeatureTable (vnp->data.ptrvalue, sep, entityID);
962 }
963 file_list.head = ValNodeFree (file_list.head);
964 return rval;
965 }
966
967
968 static ValNodePtr
AutomatchQualsFromTableHeader(ValNodePtr header,Boolean force_first_accession,Boolean allow_no_match)969 AutomatchQualsFromTableHeader
970 (ValNodePtr header,
971 Boolean force_first_accession,
972 Boolean allow_no_match)
973 {
974 ValNodePtr val;
975 ValNodeBlock columns;
976 TabColumnConfigPtr t;
977
978 InitValNodeBlock (&columns, NULL);
979
980 for (val = header;
981 val != NULL;
982 val = val->next) {
983 t = TabColumnConfigNew ();
984 if (force_first_accession && columns.head == NULL) {
985 t->match_type = MatchTypeNew();
986 t->match_type->choice = eTableMatchNucID;
987 t->match_type->match_location = String_location_equals;
988 } else {
989 t->field = FieldTypeFromString (val->data.ptrvalue);
990 if (t->field == NULL) {
991 Message (MSG_POSTERR, "Unrecognized source qualifier: %s", val->data.ptrvalue);
992 t = TabColumnConfigFree (t);
993 if (!allow_no_match) {
994 columns.head = TabColumnConfigListFree (columns.head);
995 return NULL;
996 }
997 } else {
998 t->existing_text = ExistingTextOption_replace_old;
999 }
1000 }
1001 ValNodeAddPointerToEnd (&columns, 0, t);
1002 }
1003 return columns.head;
1004 }
1005
1006
PostErrorList(ValNodePtr vnp,CharPtr filename)1007 static void PostErrorList (ValNodePtr vnp, CharPtr filename)
1008 {
1009 while (vnp != NULL) {
1010 Message (MSG_POSTERR, "%s in %s", (CharPtr) vnp->data.ptrvalue, filename);
1011 vnp = vnp->next;
1012 }
1013 }
1014
1015
SpecialFixSrcMods(ValNodePtr list)1016 static void SpecialFixSrcMods (ValNodePtr list)
1017 {
1018 while (list != NULL) {
1019 if (StringICmp (list->data.ptrvalue, "note") == 0) {
1020 list->data.ptrvalue = MemFree (list->data.ptrvalue);
1021 list->data.ptrvalue = StringSave ("note-subsrc");
1022 }
1023 list = list->next;
1024 }
1025 }
1026
1027
AddOneSrcTable(CharPtr filename,SeqEntryPtr sep,Uint2 entityID)1028 static Boolean AddOneSrcTable (CharPtr filename, SeqEntryPtr sep, Uint2 entityID)
1029 {
1030 ValNodePtr err_list;
1031 ValNodePtr columns, obj_table;
1032 ValNodePtr table;
1033 FILE *fp;
1034
1035 if (StringHasNoText (filename)) {
1036 return FALSE;
1037 }
1038 fp = FileOpen (filename, "r");
1039 if (fp == NULL) {
1040 Message (MSG_POSTERR, "Unable to open source file %s", filename);
1041 return FALSE;
1042 }
1043 table = ReadTabTableFromFile (fp);
1044 FileClose (fp);
1045 err_list = AutoReplaceSpecialCharactersInTabTable (table);
1046 PostErrorList(err_list, filename);
1047 err_list = ValNodeFreeData(err_list);
1048
1049 if (table == NULL || table->next == NULL || table->data.ptrvalue == NULL) {
1050 Message (MSG_POSTERR, "Unable to read table from file %s", filename);
1051 table = FreeTabTable (table);
1052 return FALSE;
1053 }
1054 SpecialFixSrcMods (table->data.ptrvalue);
1055 columns = AutomatchQualsFromTableHeader(table->data.ptrvalue, TRUE, FALSE);
1056 if (columns == NULL) {
1057 Message (MSG_POSTERR, "Unable to read table from file %s", filename);
1058 table = FreeTabTable (table);
1059 return FALSE;
1060 }
1061
1062 err_list = ValidateTabTableValues (table->next, columns);
1063 if (err_list != NULL) {
1064 PostErrorList (err_list, filename);
1065 err_list = ValNodeFreeData (err_list);
1066 table = FreeTabTable (table);
1067 return FALSE;
1068 }
1069
1070 obj_table = GetObjectTableForTabTable (sep, table, columns, &err_list);
1071
1072 err_list = CheckObjTableForRowsThatApplyToTheSameDestination (obj_table);
1073 if (err_list != NULL) {
1074 PostErrorList (err_list, filename);
1075 Message (MSG_POSTERR, "For one or more columns in %s, two or more rows in the table apply to the same object. Cannot apply table.", filename);
1076 err_list = ValNodeFreeData (err_list);
1077 obj_table = FreeObjectTableForTabTable (obj_table);
1078 DeleteMarkedObjects (entityID, 0, NULL);
1079 return FALSE;
1080 }
1081
1082 err_list = ApplyTableValuesToObjectTable (sep, table, columns, obj_table);
1083 /* PostErrorList (err_list, filename); */
1084
1085 err_list = ValNodeFreeData (err_list);
1086 obj_table = FreeObjectTableForTabTable (obj_table);
1087 DeleteMarkedObjects (entityID, 0, NULL);
1088 table = FreeTabTable(table);
1089 return TRUE;
1090
1091 }
1092
1093
ApplySrcTableToGroup(ReblobGroupPtr rg,SeqEntryPtr sep,Uint2 entityID,CharPtr file_filter)1094 static Boolean ApplySrcTableToGroup (ReblobGroupPtr rg, SeqEntryPtr sep, Uint2 entityID, CharPtr file_filter)
1095 {
1096 ValNodePtr vnp;
1097 ReblobSeqPtr rs;
1098 ValNodeBlock file_list;
1099 Boolean rval = TRUE;
1100
1101 if (rg == NULL) {
1102 return FALSE;
1103 }
1104
1105 InitValNodeBlock (&file_list, NULL);
1106
1107 for (vnp = rg->seq_list; vnp != NULL; vnp = vnp->next) {
1108 rs = vnp->data.ptrvalue;
1109 if (rs != NULL && !StringHasNoText (rs->src_file)
1110 && (StringHasNoText (file_filter) || StringCmp (file_filter, rs->orig_file) == 0)) {
1111 ValNodeAddPointerToEnd (&file_list, 0, rs->src_file);
1112 }
1113 }
1114
1115 file_list.head = ValNodeSort (file_list.head, SortVnpByString);
1116 ValNodeUnique (&file_list.head, SortVnpByString, ValNodeFree);
1117
1118 for (vnp = file_list.head; vnp != NULL; vnp = vnp->next) {
1119 rval &= AddOneSrcTable (vnp->data.ptrvalue, sep, entityID);
1120 }
1121 file_list.head = ValNodeFree (file_list.head);
1122 return rval;
1123
1124 }
1125
1126
NewGraphSeqAnnot(CharPtr name,SeqGraphPtr sgp)1127 static SeqAnnotPtr NewGraphSeqAnnot (
1128 CharPtr name,
1129 SeqGraphPtr sgp
1130 )
1131
1132 {
1133 SeqAnnotPtr sap = NULL;
1134
1135 if (sgp == NULL) return NULL;
1136 sap = SeqAnnotNew ();
1137 if (sap == NULL) return NULL;
1138
1139 if (StringDoesHaveText (name)) {
1140 SeqDescrAddPointer (&(sap->desc), Annot_descr_name, StringSave (name));
1141 }
1142 sap->type = 3;
1143 sap->data = (Pointer) sgp;
1144
1145 return sap;
1146 }
1147
1148
MakeSeqGraphFileId(CharPtr str)1149 static SeqIdPtr MakeSeqGraphFileId (CharPtr str)
1150 {
1151 SeqIdPtr sip_new;
1152 TextSeqIdPtr tsip;
1153 CharPtr cp;
1154
1155 tsip = TextSeqIdNew ();
1156 tsip->accession = StringSave (str);
1157 if ((cp = StringChr(tsip->accession, '.')) != NULL) {
1158 tsip->version = atoi (cp + 1);
1159 *cp = 0;
1160 }
1161 sip_new = ValNodeNew (NULL);
1162 sip_new->choice = SEQID_GENBANK;
1163 sip_new->data.ptrvalue = tsip;
1164
1165 return sip_new;
1166 }
1167
1168
ApplyOneQualityFile(CharPtr filename,SeqEntryPtr sep,Uint2 entityID)1169 static Boolean ApplyOneQualityFile (CharPtr filename, SeqEntryPtr sep, Uint2 entityID)
1170 {
1171 FILE *fp;
1172 FileCache fc;
1173 Boolean goOn = TRUE;
1174 Char buf [1024];
1175 Boolean nonewline;
1176 CharPtr str, ptr;
1177 SeqIdPtr sip;
1178 BioseqPtr bsp;
1179 SeqGraphPtr sgp, lastsgp;
1180 SeqAnnotPtr sap;
1181
1182 fp = FileOpen (filename, "r");
1183 if (fp == NULL) {
1184 return FALSE;
1185 }
1186
1187 FileCacheSetup (&fc, fp);
1188
1189 goOn = TRUE;
1190 while (goOn) {
1191 str = FileCacheReadLine (&fc, buf, sizeof (buf), &nonewline);
1192 if (str == NULL) {
1193 goOn = FALSE;
1194 } else if (StringDoesHaveText (str)) {
1195 if (str [0] == '>') {
1196 ptr = StringChr (str, ' ');
1197 if (ptr == NULL) {
1198 ptr = StringChr (str, '\t');
1199 }
1200 if (ptr != NULL) {
1201 *ptr = '\0';
1202 }
1203 sip = MakeSeqGraphFileId (str + 1);
1204 bsp = BioseqFind (sip);
1205 if (bsp != NULL) {
1206 sgp = ReadPhrapQualityFC (&fc, bsp);
1207 if (sgp != NULL) {
1208 for (sap = bsp->annot; sap != NULL; sap = sap->next) {
1209 if (sap->type == 3) {
1210 for (lastsgp = sap->data; lastsgp->next != NULL; lastsgp = lastsgp->next) {
1211 continue;
1212 }
1213 lastsgp->next = sgp;
1214 break;
1215 }
1216 }
1217 if (sap == NULL) {
1218 if (bsp->annot != NULL) {
1219 for (sap = bsp->annot; sap->next != NULL; sap = sap->next) {
1220 continue;
1221 }
1222 sap->next = NewGraphSeqAnnot ("Phrap Graph", sgp);
1223 } else {
1224 bsp->annot = NewGraphSeqAnnot ("Phrap Graph", sgp);
1225 }
1226 }
1227 }
1228 }
1229 SeqIdFree (sip);
1230 }
1231 }
1232 }
1233 FileClose (fp);
1234 return TRUE;
1235 }
1236
1237
ApplyQualityScoresToGroup(ReblobGroupPtr rg,SeqEntryPtr sep,Uint2 entityID,CharPtr file_filter)1238 static Boolean ApplyQualityScoresToGroup (ReblobGroupPtr rg, SeqEntryPtr sep, Uint2 entityID, CharPtr file_filter)
1239 {
1240 ValNodePtr vnp;
1241 ReblobSeqPtr rs;
1242 ValNodeBlock file_list;
1243 Boolean rval = TRUE;
1244
1245 if (rg == NULL) {
1246 return FALSE;
1247 }
1248
1249 InitValNodeBlock (&file_list, NULL);
1250
1251 for (vnp = rg->seq_list; vnp != NULL; vnp = vnp->next) {
1252 rs = vnp->data.ptrvalue;
1253 if (rs != NULL && !StringHasNoText (rs->quality_file)
1254 && (StringHasNoText (file_filter) || StringCmp (file_filter, rs->orig_file) == 0)) {
1255 ValNodeAddPointerToEnd (&file_list, 0, rs->quality_file);
1256 }
1257 }
1258
1259 file_list.head = ValNodeSort (file_list.head, SortVnpByString);
1260 ValNodeUnique (&file_list.head, SortVnpByString, ValNodeFree);
1261
1262 for (vnp = file_list.head; vnp != NULL; vnp = vnp->next) {
1263 rval &= ApplyOneQualityFile (vnp->data.ptrvalue, sep, entityID);
1264 }
1265 file_list.head = ValNodeFree (file_list.head);
1266 return rval;
1267 }
1268
1269
CopyDescriptors(BioseqPtr orig_bsp,BioseqPtr new_bsp)1270 static void CopyDescriptors (BioseqPtr orig_bsp, BioseqPtr new_bsp)
1271 {
1272 SeqMgrDescContext context;
1273 SeqDescPtr sdp;
1274
1275 /* only copy title descriptors if they are on the bioseq itself */
1276 for (sdp = SeqMgrGetNextDescriptor (orig_bsp, NULL, 0, &context);
1277 sdp != NULL;
1278 sdp = SeqMgrGetNextDescriptor (orig_bsp, sdp, 0, &context)) {
1279 if (sdp->choice != Seq_descr_title) {
1280 ValNodeLink (&(new_bsp->descr),
1281 AsnIoMemCopy ((Pointer) sdp,
1282 (AsnReadFunc) SeqDescAsnRead,
1283 (AsnWriteFunc) SeqDescAsnWrite));
1284 }
1285 }
1286
1287 /* copy titles on bioseq */
1288 for (sdp = orig_bsp->descr; sdp != NULL; sdp = sdp->next) {
1289 if (sdp->choice == Seq_descr_title) {
1290 ValNodeLink (&(new_bsp->descr),
1291 AsnIoMemCopy ((Pointer) sdp,
1292 (AsnReadFunc) SeqDescAsnRead,
1293 (AsnWriteFunc) SeqDescAsnWrite));
1294 }
1295 }
1296
1297 }
1298
1299
1300 static void
CopyFeaturesAndProteins(Uint2 entityID,BioseqPtr orig_bsp,BioseqPtr new_bsp,SeqEntryPtr orig_scope,SeqEntryPtr new_scope)1301 CopyFeaturesAndProteins
1302 (Uint2 entityID,
1303 BioseqPtr orig_bsp,
1304 BioseqPtr new_bsp,
1305 SeqEntryPtr orig_scope,
1306 SeqEntryPtr new_scope)
1307 {
1308 SeqMgrFeatContext context;
1309 SeqFeatPtr sfp, sfp_cpy;
1310 BioseqPtr prod_bsp, prod_cpy;
1311 SeqEntryPtr sep, sep_prod;
1312
1313 sep = GetBestTopParentForData (entityID, new_bsp);
1314 SeqEntrySetScope (orig_scope);
1315 for (sfp = SeqMgrGetNextFeature (orig_bsp, NULL, 0, 0, &context);
1316 sfp != NULL;
1317 sfp = SeqMgrGetNextFeature (orig_bsp, sfp, 0, 0, &context)) {
1318 if (sfp->product != NULL && (prod_bsp = BioseqFindFromSeqLoc (sfp->product)) != NULL) {
1319 prod_cpy = AsnIoMemCopy (prod_bsp, (AsnReadFunc) BioseqAsnRead, (AsnWriteFunc) BioseqAsnWrite);
1320 sep_prod = SeqEntryNew ();
1321 sep_prod->choice = 1;
1322 sep_prod->data.ptrvalue = prod_cpy;
1323 AddSeqEntryToSeqEntry (sep, sep_prod, TRUE);
1324 sep = GetBestTopParentForData (entityID, new_bsp);
1325 }
1326 sfp_cpy = AsnIoMemCopy (sfp, (AsnReadFunc) SeqFeatAsnRead, (AsnWriteFunc) SeqFeatAsnWrite);
1327 CreateNewFeature (SeqMgrGetSeqEntryForData (new_bsp), NULL, sfp->data.choice, sfp_cpy);
1328 }
1329
1330 }
1331
1332
CopyExtraIds(BioseqPtr new_bsp,BioseqPtr orig_bsp)1333 static void CopyExtraIds (BioseqPtr new_bsp, BioseqPtr orig_bsp)
1334 {
1335 SeqIdPtr sip_new, sip_orig, sip_last;
1336 Boolean found;
1337 TextSeqIdPtr orig_acc, new_acc;
1338
1339 for (sip_orig = orig_bsp->id; sip_orig != NULL; sip_orig = sip_orig->next) {
1340 found = FALSE;
1341 sip_last = NULL;
1342 for (sip_new = new_bsp->id; sip_new != NULL && !found; sip_new = sip_new->next) {
1343 if (SeqIdComp (sip_new, sip_orig) == SIC_YES) {
1344 found = TRUE;
1345 }
1346 sip_last = sip_new;
1347 }
1348 if (!found) {
1349 sip_new = AsnIoMemCopy (sip_orig, (AsnReadFunc) SeqIdAsnRead, (AsnWriteFunc) SeqIdAsnWrite);
1350 if (sip_last == NULL) {
1351 new_bsp->id = sip_new;
1352 } else {
1353 sip_last->next = sip_new;
1354 }
1355 } else if (sip_orig->choice == SEQID_GENBANK && sip_last->choice == SEQID_GENBANK) {
1356 orig_acc = (TextSeqIdPtr) sip_orig->data.ptrvalue;
1357 new_acc = (TextSeqIdPtr) sip_last->data.ptrvalue;
1358 if (orig_acc != NULL && new_acc != NULL) {
1359 new_acc->version = orig_acc->version;
1360 }
1361 }
1362 }
1363 }
1364
1365
CopySetType(SeqEntryPtr new_sep,SeqEntryPtr orig_sep)1366 static Boolean CopySetType (SeqEntryPtr new_sep, SeqEntryPtr orig_sep)
1367 {
1368 BioseqSetPtr orig_bssp, new_bssp;
1369
1370 if (orig_sep != NULL && IS_Bioseq_set (orig_sep) && (orig_bssp = orig_sep->data.ptrvalue) != NULL
1371 && new_sep != NULL && IS_Bioseq_set (new_sep) && (new_bssp = new_sep->data.ptrvalue) != NULL
1372 && orig_bssp->_class != BioseqseqSet_class_nuc_prot) {
1373 new_bssp->_class = orig_bssp->_class;
1374 return TRUE;
1375 } else {
1376 return FALSE;
1377 }
1378
1379 }
1380
1381
CopySetTitle(SeqEntryPtr new_sep,SeqEntryPtr orig_sep)1382 static Boolean CopySetTitle (SeqEntryPtr new_sep, SeqEntryPtr orig_sep)
1383 {
1384 BioseqSetPtr orig_bssp, new_bssp;
1385 SeqDescPtr sdp;
1386 Boolean rval = FALSE;
1387
1388 if (orig_sep != NULL && IS_Bioseq_set (orig_sep) && (orig_bssp = orig_sep->data.ptrvalue) != NULL
1389 && new_sep != NULL && IS_Bioseq_set (new_sep) && (new_bssp = new_sep->data.ptrvalue) != NULL) {
1390 for (sdp = orig_bssp->descr; sdp != NULL && sdp->choice != Seq_descr_title; sdp = sdp->next) {
1391 }
1392 if (sdp != NULL) {
1393 ValNodeLink (&(new_bssp->descr),
1394 AsnIoMemCopy ((Pointer) sdp,
1395 (AsnReadFunc) SeqDescAsnRead,
1396 (AsnWriteFunc) SeqDescAsnWrite));
1397 rval = TRUE;
1398 }
1399 }
1400 return rval;
1401 }
1402
1403
CopyQualityGraphs(BioseqPtr bsp_new,BioseqPtr bsp_orig)1404 static Boolean CopyQualityGraphs (BioseqPtr bsp_new, BioseqPtr bsp_orig)
1405 {
1406 SeqAnnotPtr sap, sap_last, sap_new;
1407
1408 if (bsp_new == NULL || bsp_orig == NULL || CompareSequences (bsp_new, bsp_orig, 0) != 0) {
1409 return FALSE;
1410 }
1411
1412 sap_last = bsp_new->annot;
1413 while (sap_last != NULL && sap_last->next != NULL) {
1414 sap_last = sap_last->next;
1415 }
1416
1417 for (sap = bsp_orig->annot; sap != NULL; sap = sap->next) {
1418 if (sap->type == 3) {
1419 sap_new = (SeqAnnotPtr) AsnIoMemCopy (sap, (AsnReadFunc) SeqAnnotAsnRead, (AsnWriteFunc) SeqAnnotAsnWrite);
1420 if (sap_last == NULL) {
1421 bsp_new->annot = sap_new;
1422 } else {
1423 sap_last->next = sap_new;
1424 }
1425 sap_last = sap_new;
1426 }
1427 }
1428 return TRUE;
1429 }
1430
1431
1432 /* NOTES:
1433 * if the orig is in a nuc-prot set, need to add all the proteins from the original nuc-prot-set to
1434 * the new file.
1435 * also, should look for descriptors on the nuc-prot set and copy those as well.
1436 */
CopyDataFromOriginals(ReblobGroupPtr rg,SeqEntryPtr sep,Uint2 entityID)1437 static Boolean CopyDataFromOriginals (ReblobGroupPtr rg, SeqEntryPtr sep, Uint2 entityID)
1438 {
1439 ValNodePtr vnp, vnp_r;
1440 ReblobSeqPtr rs;
1441 ValNodeBlock file_list;
1442 Boolean rval = TRUE;
1443 SeqEntryPtr orig_sep;
1444 AsnIoPtr aip;
1445 BioseqPtr orig_bsp;
1446 CharPtr filename;
1447 Uint2 orig_entityID;
1448 Char buf[PATH_MAX];
1449
1450 if (rg == NULL) {
1451 return FALSE;
1452 }
1453
1454 InitValNodeBlock (&file_list, NULL);
1455
1456 SeqEntrySetScope (sep);
1457 SeqMgrIndexFeatures (entityID, NULL);
1458 for (vnp = rg->seq_list; vnp != NULL; vnp = vnp->next) {
1459 rs = vnp->data.ptrvalue;
1460 if (rs != NULL && !StringHasNoText (rs->orig_file)) {
1461 ValNodeAddPointerToEnd (&file_list, 0, rs->orig_file);
1462 rs->bsp = BioseqFind (rs->sip);
1463 }
1464 }
1465
1466 file_list.head = ValNodeSort (file_list.head, SortVnpByString);
1467 ValNodeUnique (&file_list.head, SortVnpByString, ValNodeFree);
1468
1469 for (vnp = file_list.head; vnp != NULL; vnp = vnp->next) {
1470 filename = vnp->data.ptrvalue;
1471 aip = AsnIoOpen (filename, "r");
1472 orig_sep = SeqEntryAsnRead (aip, NULL);
1473 AsnIoClose (aip);
1474 if (orig_sep == NULL) {
1475 Message (MSG_ERROR, "Unable to read Seq-entry from %s", filename);
1476 continue;
1477 }
1478 SeqEntrySetScope (orig_sep);
1479 orig_entityID = ObjMgrGetEntityIDForChoice (orig_sep);
1480 AssignIDsInEntity (orig_entityID, 0, NULL);
1481 SeqMgrIndexFeatures (orig_entityID, NULL);
1482 CopySetType (sep, orig_sep);
1483 CopySetTitle (sep, orig_sep);
1484 for (vnp_r = rg->seq_list; vnp_r != NULL; vnp_r = vnp_r->next) {
1485 rs = (ReblobSeqPtr) vnp_r->data.ptrvalue;
1486 if (StringCmp (rs->orig_file, filename) == 0) {
1487 /* find original Bioseq */
1488 orig_bsp = BioseqFind (rs->sip);
1489 if (orig_bsp != NULL && rs->bsp != NULL && orig_bsp != rs->bsp) {
1490 /* copy over descriptors */
1491 CopyDescriptors (orig_bsp, rs->bsp);
1492
1493 /* copy over features if not retrieving from feature file */
1494 if (StringHasNoText (rs->feature_file)) {
1495 CopyFeaturesAndProteins (entityID, orig_bsp, rs->bsp, orig_sep, sep);
1496 }
1497 /* additional Seq-ids */
1498 CopyExtraIds (rs->bsp, orig_bsp);
1499
1500 /* copy mol */
1501 rs->bsp->mol = orig_bsp->mol;
1502
1503 #if 0
1504 /* do not copy quality graphs */
1505 /* quality graphs */
1506 CopyQualityGraphs (rs->bsp, orig_bsp);
1507 #endif
1508
1509 /* anything else? */
1510
1511 } else {
1512 SeqIdWrite (rs->sip, buf, PRINTID_FASTA_LONG, sizeof (buf) - 1);
1513 Message (MSG_POSTERR, "Unable to find original sequence for %s", buf);
1514 }
1515 }
1516 }
1517 SeqEntrySetScope (NULL);
1518 SeqEntryFree (orig_sep);
1519 }
1520 SeqEntrySetScope (sep);
1521 return rval;
1522 }
1523
1524
MarkFeaturesForDeletion(SeqFeatPtr sfp,Pointer data)1525 static void MarkFeaturesForDeletion (SeqFeatPtr sfp, Pointer data)
1526 {
1527 if (sfp != NULL) {
1528 sfp->idx.deleteme = TRUE;
1529 }
1530 }
1531
1532
StripFeaturesFromSelectedSequences(ReblobGroupPtr rg,Uint2 orig_entityID,CharPtr filename)1533 static Boolean StripFeaturesFromSelectedSequences (ReblobGroupPtr rg, Uint2 orig_entityID, CharPtr filename)
1534 {
1535 ValNodePtr vnp_r;
1536 ReblobSeqPtr rs;
1537 BioseqPtr orig_bsp;
1538 SeqEntryPtr seq_sep;
1539 Boolean rval = TRUE;
1540 Char buf[PATH_MAX];
1541
1542 if (rg == NULL) {
1543 return FALSE;
1544 }
1545
1546 for (vnp_r = rg->seq_list; vnp_r != NULL; vnp_r = vnp_r->next) {
1547 rs = (ReblobSeqPtr) vnp_r->data.ptrvalue;
1548 if (StringCmp (rs->orig_file, filename) == 0) {
1549 /* find original Bioseq */
1550 orig_bsp = BioseqFind (rs->sip);
1551 seq_sep = GetBestTopParentForData (orig_entityID, orig_bsp);
1552 if (seq_sep == NULL) {
1553 SeqIdWrite (rs->sip, buf, PRINTID_FASTA_LONG, sizeof (buf) - 1);
1554 Message (MSG_POSTERR, "Unable to find original sequence for %s", buf);
1555 rval = FALSE;
1556 } else {
1557 /* strip features if retrieving from feature file */
1558 if (!StringHasNoText (rs->feature_file)) {
1559 VisitFeaturesInSep (seq_sep, NULL, MarkFeaturesForDeletion);
1560 }
1561 }
1562 }
1563 }
1564 DeleteMarkedObjects (orig_entityID, 0, NULL);
1565 return rval;
1566 }
1567
1568
MakeAlignmentFileIdsGenbank(TAlignmentFilePtr afp)1569 static void MakeAlignmentFileIdsGenbank (TAlignmentFilePtr afp)
1570 {
1571 int i;
1572 CharPtr fmt = "gb|%s";
1573 CharPtr tmp;
1574
1575
1576 if (afp == NULL) {
1577 return;
1578 }
1579 for (i = 0; i < afp->num_sequences; i++) {
1580 if (StringNICmp(afp->ids[i], "gb|", 3) != 0) {
1581 tmp = (CharPtr) MemNew (sizeof (Char) * (StringLen (afp->ids[i]) + StringLen (fmt)));
1582 sprintf (tmp, fmt, afp->ids[i]);
1583 afp->ids[i] = MemFree (afp->ids[i]);
1584 afp->ids[i] = tmp;
1585 }
1586 }
1587 }
1588
1589
MakeFeatureTableFileIdsGenbank(SeqAnnotPtr sap)1590 static void MakeFeatureTableFileIdsGenbank (SeqAnnotPtr sap)
1591 {
1592 SeqFeatPtr sfp;
1593 SeqIdPtr sip_orig, sip_new;
1594 ObjectIdPtr oip;
1595 TextSeqIdPtr tsip;
1596 Boolean found = FALSE;
1597 CharPtr cp;
1598
1599 if (sap == NULL || sap->type != 1) {
1600 return;
1601 }
1602 for (sfp = sap->data; sfp != NULL; sfp = sfp->next) {
1603 sip_orig = SeqLocId (sfp->location);
1604 if (sip_orig != NULL && sip_orig->choice == 1) {
1605 oip = (ObjectIdPtr) sip_orig->data.ptrvalue;
1606 if (oip != NULL && oip->str != NULL) {
1607 found = TRUE;
1608 break;
1609 }
1610 }
1611 }
1612 if (found) {
1613 tsip = TextSeqIdNew ();
1614 tsip->accession = StringSave (oip->str);
1615 if ((cp = StringChr(tsip->accession, '.')) != NULL) {
1616 tsip->version = atoi (cp + 1);
1617 *cp = 0;
1618 }
1619 sip_new = ValNodeNew (NULL);
1620 sip_new->choice = SEQID_GENBANK;
1621 sip_new->data.ptrvalue = tsip;
1622
1623 sip_orig = SeqIdDup (sip_orig);
1624 for (sfp = sap->data; sfp != NULL; sfp = sfp->next) {
1625 ReplaceSeqIdWithSeqIdInFeat (sip_orig, sip_new, sfp);
1626 }
1627 sip_orig = SeqIdFree (sip_orig);
1628 sip_new = SeqIdFree (sip_new);
1629 }
1630 }
1631
1632
RemoveAlignmentTitlesAndOrganisms(TAlignmentFilePtr afp)1633 static void RemoveAlignmentTitlesAndOrganisms (TAlignmentFilePtr afp)
1634 {
1635 int i;
1636
1637 if (afp == NULL || afp->deflines == NULL || afp->num_deflines == 0) {
1638 return;
1639 }
1640 for (i = 0; i < afp->num_deflines; i++) {
1641 free (afp->deflines[i]);
1642 }
1643 free (afp->deflines);
1644 afp->deflines = NULL;
1645 afp->num_deflines = 0;
1646 for (i = 0; i < afp->num_organisms; i++) {
1647 free (afp->organisms [i]);
1648 }
1649 free (afp->organisms);
1650 afp->organisms = NULL;
1651 afp->num_organisms = 0;
1652 }
1653
1654
EndsWithAsn(CharPtr filename)1655 static Boolean EndsWithAsn (CharPtr filename)
1656 {
1657 Int4 len;
1658
1659 if (StringHasNoText (filename)) {
1660 return FALSE;
1661 }
1662 len = StringLen (filename);
1663 if (StringCmp (filename + len - 4, ".asn") == 0) {
1664 return TRUE;
1665 } else {
1666 return FALSE;
1667 }
1668 }
1669
1670
RemoveSingleItemAlignments(SeqAnnotPtr sap,Pointer data)1671 static void RemoveSingleItemAlignments (SeqAnnotPtr sap, Pointer data)
1672 {
1673 SeqAlignPtr salp;
1674
1675 if (sap != NULL && sap->type == 2 && (salp = sap->data) != NULL && salp->dim == 1) {
1676 sap->idx.deleteme = TRUE;
1677 }
1678 }
1679
1680
ProcessGroup(ReblobGroupPtr rg)1681 static SeqEntryPtr ProcessGroup (ReblobGroupPtr rg)
1682 {
1683 TErrorInfoPtr error_list = NULL;
1684 ReadBufferData rbd;
1685 TAlignmentFilePtr afp;
1686 SeqEntryPtr sep = NULL;
1687 TSequenceInfoPtr sequence_info;
1688 Uint2 entityID;
1689 AsnIoPtr aip;
1690
1691 if (rg == NULL) {
1692 return NULL;
1693 }
1694
1695 if (EndsWithAsn(rg->filename)) {
1696 /* not an alignment, no reblobbing, just do features and source table */
1697 aip = AsnIoOpen (rg->filename, "r");
1698 sep = SeqEntryAsnRead (aip, NULL);
1699 AsnIoClose (aip);
1700 if (sep == NULL) {
1701 Message (MSG_ERROR, "Unable to read Seq-entry from file %s\n", rg->filename);
1702 }
1703 SeqEntrySetScope (sep);
1704 entityID = ObjMgrGetEntityIDForChoice (sep);
1705 AssignIDsInEntity (entityID, 0, NULL);
1706 SeqMgrIndexFeatures (entityID, NULL);
1707 StripFeaturesFromSelectedSequences (rg, entityID, rg->filename);
1708 AddFeaturesToGroup (rg, sep, entityID, rg->filename);
1709 sep = GetTopSeqEntryForEntityID (entityID);
1710 ApplySrcTableToGroup (rg, sep, entityID, rg->filename);
1711 } else {
1712 rbd.fp = FileOpen (rg->filename, "r");
1713 if (rbd.fp == NULL) {
1714 Message (MSG_ERROR, "Unable to open alignment file %s", rg->filename);
1715 return NULL;
1716 }
1717 rbd.current_data = NULL;
1718 sequence_info = GetDefaultSequenceInfo();
1719 afp = ReadAlignmentFile ( AbstractReadFunction,
1720 (Pointer) &rbd,
1721 AbstractReportError,
1722 (Pointer) &error_list,
1723 sequence_info);
1724 FileClose (rbd.fp);
1725 ErrorInfoFree (error_list);
1726 if (afp == NULL) {
1727 Message (MSG_ERROR, "Unable to read alignment from %s", rg->filename);
1728 } else if (! DoSequenceLengthsMatch (afp)) {
1729 Message (MSG_ERROR, "Your alignment in %s is incorrectly formatted. Sequence plus gaps should be the same length for all sequences", rg->filename);
1730 } else {
1731 MakeAlignmentFileIdsGenbank (afp);
1732 RemoveAlignmentTitlesAndOrganisms (afp);
1733 sep = MakeSequinDataFromAlignment (afp, Seq_mol_na);
1734 entityID = ObjMgrGetEntityIDForChoice (sep);
1735 /* now - copy extras from original, add in new feature annotation */
1736 CopyDataFromOriginals (rg, sep, entityID);
1737 AddFeaturesToGroup (rg, sep, entityID, NULL);
1738 sep = GetTopSeqEntryForEntityID (entityID);
1739 ApplySrcTableToGroup (rg, sep, entityID, NULL);
1740 /* add quality scores */
1741 ApplyQualityScoresToGroup (rg, sep, entityID, NULL);
1742 /* collect pubs on set level */
1743 MovePopPhyMutPubs(sep);
1744 VisitAnnotsInSep (sep, NULL, RemoveSingleItemAlignments);
1745 DeleteMarkedObjects (entityID, 0, NULL);
1746 }
1747
1748 SequenceInfoFree (sequence_info);
1749 AlignmentFileFree (afp);
1750 }
1751
1752 return sep;
1753 }
1754
1755
NotTitle(SeqDescPtr sdp,Pointer extradata)1756 static Boolean NotTitle (SeqDescPtr sdp, Pointer extradata)
1757 {
1758
1759 if (sdp == NULL
1760 || sdp->choice == Seq_descr_title) {
1761 return FALSE;
1762 } else {
1763 return TRUE;
1764 }
1765 }
1766
1767
GetCDS(BioseqPtr bsp)1768 static SeqFeatPtr GetCDS (BioseqPtr bsp)
1769 {
1770 SeqFeatPtr sfp = NULL, tmp_sfp;
1771 SeqAnnotPtr sap;
1772 BioseqSetPtr bssp;
1773
1774 if (bsp == NULL) {
1775 return NULL;
1776 }
1777 for (sap = bsp->annot; sap != NULL; sap = sap->next) {
1778 if (sap->type == 1) {
1779 tmp_sfp = sap->data;
1780 while (tmp_sfp != NULL) {
1781 if (tmp_sfp->data.choice == SEQFEAT_CDREGION) {
1782 if (sfp == NULL) {
1783 sfp = tmp_sfp;
1784 } else {
1785 /* found second coding region */
1786 return NULL;
1787 }
1788 }
1789 tmp_sfp = tmp_sfp->next;
1790 }
1791 }
1792 }
1793 if (bsp->idx.parenttype == OBJ_BIOSEQSET && (bssp = (BioseqSetPtr) bsp->idx.parentptr) != NULL) {
1794 for (sap = bssp->annot; sap != NULL; sap = sap->next) {
1795 if (sap->type == 1) {
1796 tmp_sfp = sap->data;
1797 while (tmp_sfp != NULL) {
1798 if (tmp_sfp->data.choice == SEQFEAT_CDREGION) {
1799 if (sfp == NULL) {
1800 sfp = tmp_sfp;
1801 } else {
1802 /* found second coding region */
1803 return NULL;
1804 }
1805 }
1806 tmp_sfp = tmp_sfp->next;
1807 }
1808 }
1809 }
1810 }
1811 return sfp;
1812 }
1813
1814
CopyProteinId(SeqEntryPtr orig_sep,SeqEntryPtr new_sep,BioseqPtr prot_bsp)1815 static Boolean CopyProteinId (SeqEntryPtr orig_sep, SeqEntryPtr new_sep, BioseqPtr prot_bsp)
1816 {
1817 SeqEntryPtr orig_scope;
1818 SeqFeatPtr orig_cds, new_cds;
1819 BioseqPtr orig_nuc, new_nuc, new_prot;
1820 Boolean rval = FALSE;
1821 SeqIdPtr sip, sip_tmp;
1822 TextSeqIdPtr tsip;
1823
1824 orig_scope = SeqEntrySetScope (orig_sep);
1825 orig_cds = SeqMgrGetCDSgivenProduct (prot_bsp, NULL);
1826 if (orig_cds != NULL) {
1827 orig_nuc = BioseqFindFromSeqLoc (orig_cds->location);
1828 SeqEntrySetScope (new_sep);
1829 new_nuc = NULL;
1830 for (sip = orig_nuc->id; sip != NULL && new_nuc == NULL; sip = sip->next) {
1831 new_nuc = BioseqFind (sip);
1832 if (new_nuc == NULL && sip->choice == SEQID_GENBANK) {
1833 sip_tmp = SeqIdDup (sip);
1834 tsip = (TextSeqIdPtr) sip_tmp->data.ptrvalue;
1835 tsip->version = 0;
1836 new_nuc = BioseqFind (sip_tmp);
1837 sip_tmp = SeqIdFree (sip_tmp);
1838 }
1839 }
1840 if (new_nuc != NULL) {
1841 new_cds = GetCDS (new_nuc);
1842 if (new_cds != NULL) {
1843 new_prot = BioseqFindFromSeqLoc (new_cds->product);
1844 if (new_prot != NULL) {
1845 sip = SeqIdFindBest (prot_bsp->id, SEQID_GENBANK);
1846 if (sip != NULL) {
1847 sip = SeqIdDup (sip);
1848 BioseqReplaceID (new_prot, sip);
1849 sip = SeqIdFree (sip);
1850 rval = TRUE;
1851 }
1852 }
1853 }
1854 }
1855 }
1856
1857 SeqEntrySetScope (orig_scope);
1858 return rval;
1859 }
1860
1861
ProcessSingletons(ReblobGroupPtr rg,SeqEntryPtr top_sep)1862 static Boolean ProcessSingletons (ReblobGroupPtr rg, SeqEntryPtr top_sep)
1863 {
1864 ValNodePtr vnp, vnp_r;
1865 ReblobSeqPtr rs;
1866 ValNodeBlock file_list;
1867 Boolean rval = TRUE;
1868 SeqEntryPtr orig_sep, seq_sep;
1869 AsnIoPtr aip;
1870 BioseqPtr orig_bsp;
1871 CharPtr filename;
1872 Uint2 orig_entityID;
1873
1874 if (rg == NULL) {
1875 return FALSE;
1876 }
1877
1878 InitValNodeBlock (&file_list, NULL);
1879
1880 SeqEntrySetScope (top_sep);
1881
1882 for (vnp = rg->seq_list; vnp != NULL; vnp = vnp->next) {
1883 rs = vnp->data.ptrvalue;
1884 if (rs != NULL && !StringHasNoText (rs->orig_file)) {
1885 ValNodeAddPointerToEnd (&file_list, 0, rs->orig_file);
1886 rs->bsp = BioseqFind (rs->sip);
1887 }
1888 }
1889
1890 file_list.head = ValNodeSort (file_list.head, SortVnpByString);
1891 ValNodeUnique (&file_list.head, SortVnpByString, ValNodeFree);
1892
1893 for (vnp = file_list.head; vnp != NULL; vnp = vnp->next) {
1894 filename = vnp->data.ptrvalue;
1895 aip = AsnIoOpen (filename, "r");
1896 orig_sep = SeqEntryAsnRead (aip, NULL);
1897 AsnIoClose (aip);
1898 if (orig_sep == NULL) {
1899 Message (MSG_ERROR, "Unable to read from file %s", filename);
1900 continue;
1901 }
1902 SeqEntrySetScope (orig_sep);
1903 orig_entityID = ObjMgrGetEntityIDForChoice (orig_sep);
1904 AssignIDsInEntity (orig_entityID, 0, NULL);
1905 SeqMgrIndexFeatures (orig_entityID, NULL);
1906 /* push descriptors so they will be attached to the singleton sequences */
1907 if (IS_Bioseq_set(orig_sep)) {
1908 PropagateSomeDescriptors(orig_sep, NotTitle, NULL);
1909 }
1910
1911 rval &= StripFeaturesFromSelectedSequences (rg, orig_entityID, filename);
1912
1913 /* add features */
1914 rval &= AddFeaturesToGroup (rg, orig_sep, orig_entityID, filename);
1915
1916 /* add source table information */
1917 rval &= ApplySrcTableToGroup (rg, orig_sep, orig_entityID, filename);
1918
1919 /* add quality scores */
1920 rval &= ApplyQualityScoresToGroup (rg, orig_sep, orig_entityID, filename);
1921
1922 /* now make copies and add to top_sep */
1923 for (vnp_r = rg->seq_list; vnp_r != NULL; vnp_r = vnp_r->next) {
1924 rs = (ReblobSeqPtr) vnp_r->data.ptrvalue;
1925 if (StringCmp (rs->orig_file, filename) == 0) {
1926 /* find original Bioseq */
1927 orig_bsp = BioseqFind (rs->sip);
1928 /* don't add if protein, will have been added with nucleotide already */
1929 if (orig_bsp != NULL) {
1930 if (ISA_aa (orig_bsp->mol)) {
1931 /* if there's a replacement protein for this one, copy the accession */
1932 CopyProteinId (orig_sep, top_sep, orig_bsp);
1933 } else {
1934 seq_sep = GetBestTopParentForData (orig_entityID, orig_bsp);
1935 if (seq_sep != NULL) {
1936 seq_sep = AsnIoMemCopy (seq_sep, (AsnReadFunc) SeqEntryAsnRead, (AsnWriteFunc) SeqEntryAsnWrite);
1937 PromoteAllToWorstID (seq_sep);
1938 AddSeqEntryToSeqEntry (top_sep, seq_sep, TRUE);
1939 }
1940 }
1941 }
1942 }
1943 }
1944
1945 SeqEntrySetScope (NULL);
1946 SeqEntryFree (orig_sep);
1947 }
1948 return rval;
1949 }
1950
1951
ProcessGroupList(ValNodePtr group_list)1952 static SeqEntryPtr ProcessGroupList (ValNodePtr group_list)
1953 {
1954 ValNodePtr vnp;
1955 SeqEntryPtr top_sep, sep;
1956 BioseqSetPtr bssp;
1957 ReblobGroupPtr rg;
1958
1959 bssp = BioseqSetNew ();
1960 bssp->_class = BioseqseqSet_class_genbank;
1961 top_sep = SeqEntryNew();
1962 top_sep->choice = 2;
1963 top_sep->data.ptrvalue = bssp;
1964
1965 for (vnp = group_list; vnp != NULL; vnp = vnp->next) {
1966 rg = (ReblobGroupPtr) vnp->data.ptrvalue;
1967 if (StringHasNoText (rg->filename)) {
1968 /* process as singletons */
1969 ProcessSingletons (rg, top_sep);
1970 } else {
1971 sep = ProcessGroup (vnp->data.ptrvalue);
1972 if (sep != NULL) {
1973 AddSeqEntryToSeqEntry (top_sep, sep, TRUE);
1974 }
1975 }
1976 }
1977
1978 return top_sep;
1979 }
1980
1981
ReadMapFile(CharPtr filename,CharPtr result_dir)1982 static Int4 ReadMapFile (CharPtr filename, CharPtr result_dir)
1983 {
1984 FILE *fp;
1985 ReadBufferData rbd;
1986 CharPtr line;
1987 Boolean collecting = FALSE;
1988 CharPtr grp_file = NULL;
1989 ValNodeBlock group_list;
1990 ValNodeBlock seq_list;
1991 ValNodePtr tokens;
1992 SeqEntryPtr sep;
1993 AsnIoPtr aip;
1994 Char output_file[PATH_MAX];
1995 Int4 cluster_number = 1;
1996
1997 fp = FileOpen (filename, "r");
1998 if (fp == NULL) {
1999 Message (MSG_FATAL, "Unable to open map file %s", filename);
2000 return 1;
2001 }
2002
2003 rbd.fp = fp;
2004 rbd.current_data = NULL;
2005
2006 line = AbstractReadFunction (&rbd);
2007 while (line != NULL) {
2008 if (StringNICmp (line, "START", 5) == 0) {
2009 /* start collecting group */
2010 collecting = TRUE;
2011 InitValNodeBlock (&group_list, NULL);
2012 InitValNodeBlock (&seq_list, NULL);
2013 grp_file = MemFree (grp_file);
2014 } else if (StringNICmp (line, "STOP", 4) == 0) {
2015 /* finish last group */
2016 ValNodeAddPointerToEnd (&group_list, 0, ReblobGroupNew (grp_file, seq_list.head));
2017
2018 /* process group list */
2019 sep = ProcessGroupList (group_list.head);
2020 if (sep == NULL) {
2021 Message (MSG_ERROR, "Unable to process cluster %d", cluster_number);
2022 } else {
2023 sprintf (output_file, "%s/cluster_%d.sqn", result_dir, cluster_number);
2024 aip = AsnIoOpen (output_file, "w");
2025 SeqEntryAsnWrite (sep, aip, NULL);
2026 AsnIoClose (aip);
2027 sep = SeqEntryFree (sep);
2028 }
2029
2030 /* cleanup */
2031 cluster_number++;
2032 group_list.head = ReblobGroupListFree (group_list.head);
2033 collecting = FALSE;
2034 grp_file = MemFree (grp_file);
2035 } else if (collecting) {
2036 tokens = ReadOneColumnList (line);
2037 if (tokens != NULL) {
2038 if (grp_file == NULL) {
2039 grp_file = StringSave (tokens->data.ptrvalue);
2040 } else if (StringCmp (grp_file, tokens->data.ptrvalue) != 0) {
2041 ValNodeAddPointerToEnd (&group_list, 0, ReblobGroupNew (grp_file, seq_list.head));
2042 InitValNodeBlock (&seq_list, NULL);
2043 grp_file = MemFree (grp_file);
2044 grp_file = StringSave (tokens->data.ptrvalue);
2045 }
2046 if (tokens->next != NULL) {
2047 ValNodeAddPointerToEnd (&seq_list, 0, ReblobSeqNew (tokens->next));
2048 }
2049 }
2050 tokens = ValNodeFreeData (tokens);
2051 }
2052 line = AbstractReadFunction (&rbd);
2053 }
2054
2055 FileClose(fp);
2056 return 0;
2057 }
2058
2059
2060 /* Args structure contains command-line arguments */
2061
2062 typedef enum {
2063 m_argMapFile = 0,
2064 r_argOutputPath,
2065 } Arguments;
2066
2067
2068 Args myargs [] = {
2069 {"Input Map File", NULL, NULL, NULL,
2070 TRUE, 'm', ARG_STRING, 0.0, 0, NULL},
2071 {"Path for Results", NULL, NULL, NULL,
2072 TRUE, 'r', ARG_STRING, 0.0, 0, NULL},
2073 };
2074
Main(void)2075 Int2 Main (void)
2076
2077 {
2078 Char app [64];
2079 CharPtr base;
2080 CharPtr results;
2081
2082 /* standard setup */
2083
2084 ErrSetFatalLevel (SEV_MAX);
2085 ErrSetMessageLevel (SEV_MAX);
2086 ErrClearOptFlags (EO_SHOW_USERSTR);
2087 UseLocalAsnloadDataAndErrMsg ();
2088 ErrPathReset ();
2089
2090 /* finish resolving internal connections in ASN.1 parse tables */
2091
2092 if (! AllObjLoad ()) {
2093 Message (MSG_FATAL, "AllObjLoad failed");
2094 return 1;
2095 }
2096 if (! SubmitAsnLoad ()) {
2097 Message (MSG_FATAL, "SubmitAsnLoad failed");
2098 return 1;
2099 }
2100 if (! FeatDefSetLoad ()) {
2101 Message (MSG_FATAL, "FeatDefSetLoad failed");
2102 return 1;
2103 }
2104 if (! SeqCodeSetLoad ()) {
2105 Message (MSG_FATAL, "SeqCodeSetLoad failed");
2106 return 1;
2107 }
2108 if (! GeneticCodeTableLoad ()) {
2109 Message (MSG_FATAL, "GeneticCodeTableLoad failed");
2110 return 1;
2111 }
2112
2113
2114 /* process command line arguments */
2115
2116 sprintf (app, "reblobber %s", REBLOBBER_APPLICATION);
2117 if (! GetArgs (app, sizeof (myargs) / sizeof (Args), myargs)) {
2118 return 0;
2119 }
2120
2121 SetAppProperty ("NcbiTbl2Asn", (void *) 1024);
2122
2123 results = (CharPtr) myargs [r_argOutputPath].strvalue;
2124 base = (CharPtr) myargs [m_argMapFile].strvalue;
2125 if (StringHasNoText (results) || StringHasNoText (base)) {
2126 Message (MSG_FATAL, "Must supply map file and results directory.");
2127 return 1;
2128 }
2129
2130 return ReadMapFile (base, results);
2131
2132 }
2133
2134