1 /*   reblobber.c
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *            National Center for Biotechnology Information (NCBI)
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government do not place any restriction on its use or reproduction.
13 *  We would, however, appreciate having the NCBI and the author cited in
14 *  any work or product based on this material
15 *
16 *  Although all reasonable efforts have been taken to ensure the accuracy
17 *  and reliability of the software and data, the NLM and the U.S.
18 *  Government do not and cannot warrant the performance or results that
19 *  may be obtained by using this software or data. The NLM and the U.S.
20 *  Government disclaim all warranties, express or implied, including
21 *  warranties of performance, merchantability or fitness for any particular
22 *  purpose.
23 *
24 * ===========================================================================
25 *
26 * File Name:  reblobber.c
27 *
28 * Author:  Colleen Bollin
29 *
30 * Version Creation Date:   Dec. 2, 2011
31 *
32 * $Revision: 1.7 $
33 *
34 * File Description:
35 *
36 * Modifications:
37 * --------------------------------------------------------------------------
38 * Date     Name        Description of modification
39 * -------  ----------  -----------------------------------------------------
40 *
41 *
42 * ==========================================================================
43 */
44 
45 #include <ncbi.h>
46 #include <objall.h>
47 #include <objsset.h>
48 #include <objsub.h>
49 #include <objfdef.h>
50 #include <sequtil.h>
51 #include <salutil.h>
52 #include <edutil.h>
53 #include <seqport.h>
54 #include <gather.h>
55 #include <sqnutils.h>
56 #include <subutil.h>
57 #include <toasn3.h>
58 #include <valid.h>
59 #include <asn2gnbk.h>
60 #include <explore.h>
61 #include <tofasta.h>
62 #include <simple.h>
63 #include <suggslp.h>
64 #include <toporg.h>
65 #include <aliparse.h>
66 #include <util/creaders/alnread.h>
67 #include <pmfapi.h>
68 #include <tax3api.h>
69 #ifdef INTERNAL_NCBI_TBL2ASN
70 #include <accpubseq.h>
71 #endif
72 #define NLM_GENERATED_CODE_PROTO
73 #include <objmacro.h>
74 #include <macroapi.h>
75 
76 #define REBLOBBER_APP_VER "1.0"
77 
78 CharPtr REBLOBBER_APPLICATION = REBLOBBER_APP_VER;
79 
80 typedef struct cleanupargs {
81   Boolean collection_dates;
82   Boolean collection_dates_month_first;
83   Boolean add_notes_to_overlapping_cds_without_abc;
84   Boolean extend_partial_features_to_gaps_or_ends;
85   Boolean add_exception_to_nonextendable_partials;
86   Boolean add_exception_to_short_introns;
87   FILE *  cleanup_log;
88 } CleanupArgsData, PNTR CleanupArgsPtr;
89 
90 typedef struct tblargs {
91   Boolean     raw2delt;
92   Int2        r2dmin;
93   Boolean     r2dunk100;
94   Boolean     fastaset;
95   Int2        whichclass;
96   Boolean     deltaset;
97   Boolean     alignset;
98   Boolean     gapped;
99   Boolean     phrapace;
100   Boolean     ftable;
101   Boolean     genprodset;
102   Boolean     delaygenprodset;
103   Boolean     linkbyoverlap;
104   Boolean     linkbyproduct;
105   Boolean     implicitgaps;
106   Boolean     forcelocalid;
107   Boolean     gpstonps;
108   Boolean     gnltonote;
109   Boolean     removeunnecxref;
110   Boolean     dotaxlookup;
111   Boolean     dopublookup;
112   CharPtr     accn;
113   CharPtr     center;
114   Int4        project_version;
115   CharPtr     organism;
116   CharPtr     srcquals;
117   CharPtr     comment;
118   CharPtr     commentFile;
119   CharPtr     tableFile;
120   Boolean     findorf;
121   Boolean     runonorf;
122   Boolean     altstart;
123   Boolean     conflict;
124   Boolean     validate;
125   Boolean     relaxed;
126   Boolean     validate_barcode;
127   Boolean     flatfile;
128   Boolean     genereport;
129   Boolean     seqidfromfile;
130   Boolean     smartfeats;
131   Boolean     refSeqTitles;
132   Boolean     smarttitle;
133   Boolean     logtoterminal;
134   CharPtr     aln_beginning_gap;
135   CharPtr     aln_end_gap;
136   CharPtr     aln_middle_gap;
137   CharPtr     aln_missing;
138   CharPtr     aln_match;
139   Boolean     aln_is_protein;
140   Boolean     save_bioseq_set;
141   Boolean     auto_def;
142   Boolean     apply_cmt_to_all;
143   Boolean     adjust_mrna_for_cds_stop_codon;
144   Boolean     seq_fetch_failure;
145 
146   GlobalDiscrepReportPtr global_report;
147 
148   CleanupArgsData cleanup_args;
149 } TblArgs, PNTR TblArgsPtr;
150 
151 
RetryRead(CharPtr filename)152 static void RetryRead (CharPtr filename)
153 {
154   AsnIoPtr aip;
155   SeqEntryPtr orig_sep;
156   FILE *fp;
157 
158   fp = FileOpen (filename, "r");
159   if (fp == NULL) {
160     Message (MSG_ERROR, "Unable to open %s", filename);
161   }
162   FileClose (fp);
163 
164   /* for debugging purposes, try to read it again */
165   aip = AsnIoOpen (filename, "r");
166   orig_sep = SeqEntryAsnRead (aip, NULL);
167   AsnIoClose (aip);
168 }
169 
170 
171 /* source information for several common organisms sequenced by genome centers */
172 
173 
174 typedef struct gcmdata {
175   SeqFeatPtr  gene;
176   SeqFeatPtr  feat;
177   CharPtr     label;
178 } GmcData, PNTR GmcDataPtr;
179 
SortByGenePtr(VoidPtr vp1,VoidPtr vp2)180 static int LIBCALLBACK SortByGenePtr (
181   VoidPtr vp1,
182   VoidPtr vp2
183 )
184 
185 {
186   GmcDataPtr gdp1, gdp2;
187 
188   if (vp1 == NULL || vp2 == NULL) return 0;
189   gdp1 = (GmcDataPtr) vp1;
190   gdp2 = (GmcDataPtr) vp2;
191   if (gdp1 == NULL || gdp2 == NULL) return 0;
192 
193   if (gdp1->gene > gdp2->gene) return -1;
194   if (gdp1->gene < gdp2->gene) return 1;
195 
196   if (gdp1->feat > gdp2->feat) return -1;
197   if (gdp1->feat < gdp2->feat) return 1;
198 
199   return 0;
200 }
201 
202 
203 
EnhanceOneCDS(SeqFeatPtr sfp,Boolean alt_splice)204 static void EnhanceOneCDS (
205   SeqFeatPtr sfp,
206   Boolean alt_splice
207 )
208 
209 {
210   DbtagPtr        dbt;
211   GBQualPtr       gbq;
212   Char            id [64];
213   SeqIdPtr        ids, sip;
214   size_t          len;
215   CharPtr         name, nwstr, ptr, str;
216   ObjectIdPtr     oip;
217   ProtRefPtr      prp;
218   Char            tmp [256];
219   ValNodePtr      vnp;
220   SeqFeatXrefPtr  xref;
221 
222   if (sfp == NULL || sfp->data.choice != SEQFEAT_CDREGION) return;
223 
224   name = NULL;
225   vnp = NULL;
226   prp = NULL;
227 
228   for (xref = sfp->xref; xref != NULL; xref = xref->next) {
229     if (xref->data.choice == SEQFEAT_PROT) {
230       prp = (ProtRefPtr) xref->data.value.ptrvalue;
231     }
232   }
233 
234   id [0] = '\0';
235   for (gbq = sfp->qual; gbq != NULL; gbq = gbq->next) {
236     if (StringICmp (gbq->qual, "protein_id") == 0) {
237       StringNCpy_0 (id, gbq->val, sizeof (id));
238     }
239   }
240   if (StringDoesHaveText (id) && StringChr (id, '|') != NULL) {
241     str = NULL;
242     ids = SeqIdParse (id);
243     for (sip = ids; sip != NULL; sip = sip->next) {
244       if (sip->choice != SEQID_GENERAL) continue;
245       dbt = (DbtagPtr) sip->data.ptrvalue;
246       if (dbt == NULL) continue;
247       if (IsSkippableDbtag (dbt)) continue;
248       oip = dbt->tag;
249       if (oip == NULL) continue;
250       str = oip->str;
251     }
252 
253     if (StringDoesHaveText (str)) {
254       if (prp != NULL && prp->name != NULL) {
255         vnp = prp->name;
256         name = (CharPtr) vnp->data.ptrvalue;
257       }
258       if (StringDoesHaveText (name) && vnp != NULL) {
259         if (alt_splice) {
260           ptr = StringChr (str, '-');
261           if (ptr != NULL && StringLen (ptr) == 3) {
262             ptr++;
263             ptr++;
264             sprintf (tmp, "%s, isoform %s", str, ptr);
265             len = StringLen (name) + StringLen (", ") + StringLen (tmp);
266             nwstr = (CharPtr) MemNew (sizeof (Char) * (len + 2));
267             if (nwstr != NULL) {
268               StringCpy (nwstr, name);
269               /*
270               StringCat (nwstr, ", ");
271               */
272               StringCat (nwstr, " ");
273               StringCat (nwstr, tmp);
274               vnp->data.ptrvalue = (Pointer) nwstr;
275               MemFree (name);
276             }
277           } else {
278             AddQualifierToFeature (sfp, "product", str);
279           }
280         } else {
281           len = StringLen (name) + StringLen (", ") + StringLen (str);
282           nwstr = (CharPtr) MemNew (sizeof (Char) * (len + 2));
283           if (nwstr != NULL) {
284             StringCpy (nwstr, name);
285             /*
286             StringCat (nwstr, ", ");
287             */
288             StringCat (nwstr, " ");
289             StringCat (nwstr, str);
290             vnp->data.ptrvalue = (Pointer) nwstr;
291             MemFree (name);
292           }
293         }
294       } else {
295         if (alt_splice) {
296           ptr = StringChr (str, '-');
297           if (ptr != NULL && StringLen (ptr) == 3) {
298             ptr++;
299             ptr++;
300             sprintf (tmp, "%s, isoform %s", str, ptr);
301             AddQualifierToFeature (sfp, "product", tmp);
302           } else {
303             AddQualifierToFeature (sfp, "product", str);
304           }
305         } else {
306           AddQualifierToFeature (sfp, "product", str);
307         }
308       }
309     }
310 
311     SeqIdSetFree (ids);
312   }
313 }
314 
EnhanceOneRna(SeqFeatPtr sfp,Boolean alt_splice)315 static void EnhanceOneRna (
316   SeqFeatPtr sfp,
317   Boolean alt_splice
318 )
319 
320 {
321   DbtagPtr     dbt;
322   GBQualPtr    gbq, nm_gbq;
323   Char         id [64];
324   SeqIdPtr     ids, sip;
325   size_t       len;
326   CharPtr      name, nwstr, ptr, str;
327   ObjectIdPtr  oip;
328   RnaRefPtr    rrp;
329   Char         tmp [256];
330 
331   if (sfp == NULL || sfp->data.choice != SEQFEAT_RNA) return;
332 
333   name = NULL;
334   nm_gbq = NULL;
335 
336   rrp = (RnaRefPtr) sfp->data.value.ptrvalue;
337   if (rrp != NULL && rrp->ext.choice == 1) {
338     switch (rrp->type) {
339       case 1 :  /* precurrsor_RNA */
340       case 2 :  /* mRNA */
341       case 4 :  /* rRNA */
342         name = rrp->ext.value.ptrvalue;
343         break;
344       case 255 :  /* misc_RNA, ncRNA, tmRNA */
345         for (gbq = sfp->qual; gbq != NULL; gbq = gbq->next) {
346           if (StringICmp (gbq->qual, "product") == 0) {
347             nm_gbq = gbq;
348             name = gbq->val;
349           }
350         }
351         break;
352       case 3:  /* tRNA */
353         return;
354       default :
355         break;
356     }
357   }
358 
359   id [0] = '\0';
360   for (gbq = sfp->qual; gbq != NULL; gbq = gbq->next) {
361     if (StringICmp (gbq->qual, "transcript_id") == 0) {
362       StringNCpy_0 (id, gbq->val, sizeof (id));
363     }
364   }
365   if (StringDoesHaveText (id) && StringChr (id, '|') != NULL) {
366     str = NULL;
367     ids = SeqIdParse (id);
368     for (sip = ids; sip != NULL; sip = sip->next) {
369       if (sip->choice != SEQID_GENERAL) continue;
370       dbt = (DbtagPtr) sip->data.ptrvalue;
371       if (dbt == NULL) continue;
372       if (IsSkippableDbtag(dbt)) continue;
373       oip = dbt->tag;
374       if (oip == NULL) continue;
375       str = oip->str;
376     }
377 
378     if (StringDoesHaveText (str)) {
379       if (StringDoesHaveText (name) && StringCmp (str, name) != 0) {
380         if (alt_splice) {
381           ptr = StringChr (str, '-');
382           if (ptr != NULL && StringLen (ptr) == 3) {
383             ptr++;
384             ptr++;
385             sprintf (tmp, "%s, transcript variant %s", str, ptr);
386             len = StringLen (name) + StringLen (", ") + StringLen (tmp);
387             nwstr = (CharPtr) MemNew (sizeof (Char) * (len + 2));
388             if (nwstr != NULL) {
389               StringCpy (nwstr, name);
390               /*
391               StringCat (nwstr, ", ");
392               */
393               StringCat (nwstr, " ");
394               StringCat (nwstr, tmp);
395               if (nm_gbq != NULL) {
396                 nm_gbq->val = (Pointer) nwstr;
397               } else if (rrp != NULL) {
398                 rrp->ext.value.ptrvalue = (Pointer) nwstr;
399               }
400               MemFree (name);
401             }
402           } else {
403             AddQualifierToFeature (sfp, "product", str);
404           }
405         } else {
406           len = StringLen (name) + StringLen (", ") + StringLen (str);
407           nwstr = (CharPtr) MemNew (sizeof (Char) * (len + 2));
408           if (nwstr != NULL) {
409             StringCpy (nwstr, name);
410             /*
411             StringCat (nwstr, ", ");
412             */
413             StringCat (nwstr, " ");
414             StringCat (nwstr, str);
415             if (nm_gbq != NULL) {
416               nm_gbq->val = (Pointer) nwstr;
417             } else if (rrp != NULL) {
418               rrp->ext.value.ptrvalue = (Pointer) nwstr;
419             }
420             MemFree (name);
421           }
422         }
423       } else {
424         if (alt_splice) {
425           ptr = StringChr (str, '-');
426           if (ptr != NULL && StringLen (ptr) == 3) {
427             ptr++;
428             ptr++;
429             sprintf (tmp, "%s, transcript variant %s", str, ptr);
430             AddQualifierToFeature (sfp, "product", tmp);
431           } else {
432             AddQualifierToFeature (sfp, "product", str);
433           }
434         } else {
435           AddQualifierToFeature (sfp, "product", str);
436         }
437       }
438     }
439 
440     SeqIdSetFree (ids);
441   }
442 }
443 
EnhanceFeatureAnnotation(SeqFeatPtr features,BioseqPtr bsp)444 static void EnhanceFeatureAnnotation (
445   SeqFeatPtr features,
446   BioseqPtr bsp
447 )
448 
449 {
450   GmcDataPtr  gdp, head;
451   GeneRefPtr  grp;
452   Int2        i, j, k, numgene, numcds, numrna;
453   SeqFeatPtr  sfp;
454 
455   if (features == NULL || bsp == NULL) return;
456 
457   numgene = 0;
458   numcds = 0;
459   numrna = 0;
460 
461   for (sfp = features; sfp != NULL; sfp = sfp->next) {
462     switch (sfp->data.choice) {
463       case SEQFEAT_GENE :
464         numgene++;
465         break;
466       case SEQFEAT_CDREGION :
467         numcds++;
468         break;
469       case SEQFEAT_RNA :
470         numrna++;
471         break;
472       default :
473         break;
474     }
475   }
476 
477   if (numgene == 0) return;
478 
479   if (numcds > 0) {
480     head = (GmcDataPtr) MemNew (sizeof (GmcData) * (size_t) (numcds + 1));
481     if (head != NULL) {
482       gdp = head;
483       for (sfp = features; sfp != NULL; sfp = sfp->next) {
484         if (sfp->idx.subtype == FEATDEF_CDS) {
485           gdp->feat = sfp;
486           grp = SeqMgrGetGeneXref (sfp);
487           if (grp == NULL || (! SeqMgrGeneIsSuppressed (grp))) {
488             gdp->gene = SeqMgrGetOverlappingGene (sfp->location, NULL);
489           }
490           gdp++;
491         }
492       }
493       HeapSort (head, (size_t) numcds, sizeof (GmcData), SortByGenePtr);
494       for (i = 0; i < numcds; i += j) {
495         sfp = head [i].gene;
496         for (j = 1; i + j < numcds && sfp == head [i + j].gene; j++) continue;
497         if (j == 1) {
498           /* no alt splicing */
499           EnhanceOneCDS (head [i].feat, FALSE);
500         } else {
501           /* is alt splicing */
502           for (k = 0; k < j; k++) {
503             EnhanceOneCDS (head [i + k].feat, TRUE);
504           }
505         }
506       }
507     }
508     MemFree (head);
509   }
510 
511   if (numrna > 0) {
512     head = (GmcDataPtr) MemNew (sizeof (GmcData) * (size_t) (numrna + 1));
513     if (head != NULL) {
514       gdp = head;
515       for (sfp = features; sfp != NULL; sfp = sfp->next) {
516         if (sfp->data.choice == SEQFEAT_RNA) {
517           gdp->feat = sfp;
518           grp = SeqMgrGetGeneXref (sfp);
519           if (grp == NULL || (! SeqMgrGeneIsSuppressed (grp))) {
520             gdp->gene = SeqMgrGetOverlappingGene (sfp->location, NULL);
521           }
522           gdp++;
523         }
524       }
525       HeapSort (head, (size_t) numrna, sizeof (GmcData), SortByGenePtr);
526       for (i = 0; i < numrna; i += j) {
527         sfp = head [i].gene;
528         for (j = 1; i + j < numrna && sfp == head [i + j].gene; j++) continue;
529         if (j == 1) {
530           /* no alt splicing */
531           EnhanceOneRna (head [i].feat, FALSE);
532         } else {
533           /* is alt splicing */
534           for (k = 0; k < j; k++) {
535             EnhanceOneRna (head [i + k].feat, TRUE);
536           }
537         }
538       }
539     }
540     MemFree (head);
541   }
542 }
543 
AttachSeqAnnotEntity(Uint2 entityID,SeqAnnotPtr sap,TblArgsPtr tbl)544 static BioseqPtr AttachSeqAnnotEntity (
545   Uint2 entityID,
546   SeqAnnotPtr sap,
547   TblArgsPtr tbl
548 )
549 
550 {
551   SeqAnnotPtr  anp;
552   BioseqPtr    bsp;
553   Char         buf [80];
554   Int2         genCode;
555   SeqEntryPtr  oldscope;
556   SeqEntryPtr  sep;
557   SeqFeatPtr   sfp = NULL;
558   SeqIdPtr     sip;
559   SeqLocPtr    slp;
560 
561   if (sap == NULL || tbl == NULL) return NULL;
562 
563   bsp = GetBioseqReferencedByAnnot (sap, entityID);
564   if (bsp == NULL) {
565     oldscope = SeqEntrySetScope (NULL);
566     if (oldscope != NULL) {
567       bsp = GetBioseqReferencedByAnnot (sap, entityID);
568       SeqEntrySetScope (oldscope);
569     }
570   }
571   if (bsp != NULL) {
572     sep = SeqMgrGetSeqEntryForData (bsp);
573     entityID = ObjMgrGetEntityIDForChoice (sep);
574     if (sap->type == 1) {
575       sfp = (SeqFeatPtr) sap->data;
576       genCode = GetGenCodeForBsp (bsp);
577       SetEmptyGeneticCodes (sap, genCode);
578     }
579     if (bsp->annot == NULL) {
580       bsp->annot = sap;
581     } else {
582       anp = bsp->annot;
583       while (anp->next != NULL) {
584         anp = anp->next;
585       }
586       anp->next = sap;
587     }
588     if (sfp != NULL) {
589       if (tbl->smartfeats) {
590 
591         /* indexing needed to find mRNA and CDS within each gene */
592 
593         SeqMgrIndexFeatures (entityID, NULL);
594 
595         EnhanceFeatureAnnotation (sfp, bsp);
596       }
597 
598       PromoteXrefsExEx (sfp, bsp, entityID, TRUE, FALSE, tbl->genprodset, tbl->forcelocalid, &(tbl->seq_fetch_failure));
599       sep = GetTopSeqEntryForEntityID (entityID);
600     }
601   } else {
602     buf [0] = '\0';
603     if (sap->type == 1) {
604       sfp = (SeqFeatPtr) sap->data;
605       if (sfp != NULL && sfp->location != NULL) {
606         slp = SeqLocFindNext (sfp->location, NULL);
607         if (slp != NULL) {
608           sip = SeqLocId (slp);
609           if (sip != NULL) {
610             SeqIdWrite (sip, buf, PRINTID_FASTA_LONG, sizeof (buf) - 1);
611           }
612         }
613       }
614     }
615 //    Message (MSG_POSTERR, "Feature table identifiers %s do not match record", buf);
616   }
617   sep = GetTopSeqEntryForEntityID (entityID);
618   return bsp;
619 }
620 
621 
622 
623 
ProcessOneAnnot(SeqAnnotPtr sap,Uint2 entityID,TblArgsPtr tbl)624 static void ProcessOneAnnot (
625   SeqAnnotPtr sap,
626   Uint2 entityID,
627   TblArgsPtr tbl
628 )
629 
630 {
631   BioseqPtr   bsp;
632   GBQualPtr   gbq;
633   Int2        genCode;
634   SeqFeatPtr  sfp;
635   SeqEntryPtr sep;
636 
637   if (sap == NULL || tbl == NULL) return;
638 
639   if (tbl->delaygenprodset) {
640     if (sap->type == 1) {
641       for (sfp = (SeqFeatPtr) sap->data; sfp != NULL; sfp = sfp->next) {
642         for (gbq = sfp->qual; gbq != NULL; gbq = gbq->next) {
643           if (StringICmp (gbq->qual, "protein_id") == 0 && sfp->data.choice == SEQFEAT_RNA) {
644             gbq->qual = MemFree (gbq->qual);
645             gbq->qual = StringSave ("orig_protein_id");
646           }
647           if (StringICmp (gbq->qual, "transcript_id") == 0) {
648             gbq->qual = MemFree (gbq->qual);
649             gbq->qual = StringSave ("orig_transcript_id");
650           }
651         }
652       }
653     }
654   }
655 
656   bsp = AttachSeqAnnotEntity (entityID, sap, tbl);
657   if (bsp == NULL) return;
658 
659   sep = GetTopSeqEntryForEntityID (entityID);
660 
661   /* correct all idx parent pointers */
662 
663   AssignIDsInEntity (entityID, 0, NULL);
664 
665   genCode = GetGenCodeForBsp (bsp);
666 
667   /* coercion of SeqIds to accession moved to ProcessOneRecord->MakeAccessionID */
668 
669   /* for parsed in features or best ORF, promote CDS products to protein bioseq */
670 
671   for (sap = bsp->annot; sap != NULL; sap = sap->next) {
672     if (sap->type == 1) {
673       SetEmptyGeneticCodes (sap, genCode);
674       sfp = (SeqFeatPtr) sap->data;
675       PromoteXrefsExEx (sfp, bsp, entityID, TRUE, FALSE, tbl->genprodset, tbl->forcelocalid, &(tbl->seq_fetch_failure));
676     }
677   }
678   sep = GetTopSeqEntryForEntityID (entityID);
679   move_cds_ex (sep, FALSE);
680 }
681 
682 
683 
684 
685 
686 typedef struct raw2deltdata {
687   Uint2           entityID;
688   SubmitBlockPtr  sbp;
689   BioSourcePtr    src;
690   TblArgsPtr      tbl;
691   MolInfoPtr      template_molinfo;
692 } Raw2DeltData, PNTR Raw2DeltPtr;
693 
694 
695 
696 
697 typedef struct resqseqgph {
698   Int2         index;
699   SeqGraphPtr  sgp;
700 } ResqSeqgph, PNTR ResqSeqgphPtr;
701 
702 
703 
704 
705 
706 typedef struct reqcontig {
707   Int2  index;
708   Char  str [41];
709 } ResqContig, PNTR ResqContigPtr;
710 
711 #define MAX_FIELDS  8
712 
713 
714 
715 
716 
DoSequenceLengthsMatch(TAlignmentFilePtr afp)717 static Boolean DoSequenceLengthsMatch (
718   TAlignmentFilePtr afp
719 )
720 
721 {
722   int    seq_index;
723   Int4   seq_len;
724 
725   if (afp == NULL || afp->sequences == NULL || afp->num_sequences == 0) {
726     return TRUE;
727   }
728   seq_len = StringLen (afp->sequences[0]);
729   for (seq_index = 1; seq_index < afp->num_sequences; seq_index++) {
730     if (StringLen (afp->sequences[seq_index]) != seq_len) {
731       return FALSE;
732     }
733   }
734   return TRUE;
735 }
736 
737 
738 
739 #ifdef INTERNAL_NCBI_ASNDISC
740 const PerformDiscrepancyTest taxlookup = CheckTaxNamesAgainstTaxDatabase;
741 #else
742 const PerformDiscrepancyTest taxlookup = NULL;
743 #endif
744 
745 
746 
747 
748 typedef struct reblobseq {
749   SeqIdPtr  sip;
750   CharPtr   orig_file;
751   CharPtr   feature_file;
752   CharPtr   src_file;
753   CharPtr   quality_file;
754   BioseqPtr bsp;
755 } ReblobSeqData, PNTR ReblobSeqPtr;
756 
757 
ReblobSeqNew(ValNodePtr token_list)758 static ReblobSeqPtr ReblobSeqNew (ValNodePtr token_list)
759 {
760   ReblobSeqPtr rs;
761   Char         id_buf[PATH_MAX];
762 
763   rs = (ReblobSeqPtr) MemNew (sizeof (ReblobSeqData));
764   if (token_list != NULL) {
765     sprintf (id_buf, "gb|%s", (CharPtr) token_list->data.ptrvalue);
766     rs->sip = MakeSeqID (id_buf);
767     if (token_list->next != NULL) {
768       rs->orig_file = token_list->next->data.ptrvalue;
769       token_list->next->data.ptrvalue = NULL;
770       if (token_list->next->next != NULL) {
771         rs->feature_file = token_list->next->next->data.ptrvalue;
772         token_list->next->next->data.ptrvalue = NULL;
773         if (token_list->next->next->next != NULL) {
774           rs->src_file = token_list->next->next->next->data.ptrvalue;
775           token_list->next->next->next->data.ptrvalue = NULL;
776           if (token_list->next->next->next->next != NULL) {
777             rs->quality_file = token_list->next->next->next->next->data.ptrvalue;
778             token_list->next->next->next->next->data.ptrvalue = NULL;
779           }
780         }
781       }
782     }
783   }
784   return rs;
785 }
786 
787 
ReblobSeqFree(ReblobSeqPtr rs)788 static ReblobSeqPtr ReblobSeqFree (ReblobSeqPtr rs)
789 {
790   if (rs != NULL) {
791     rs->sip = SeqIdFree (rs->sip);
792     rs->orig_file = MemFree (rs->orig_file);
793     rs->feature_file = MemFree (rs->feature_file);
794     rs->src_file = MemFree (rs->src_file);
795     rs->quality_file = MemFree (rs->quality_file);
796     rs = MemFree (rs);
797   }
798   return rs;
799 }
800 
801 
ReblobSeqListFree(ValNodePtr vnp)802 static ValNodePtr ReblobSeqListFree (ValNodePtr vnp)
803 {
804   ValNodePtr vnp_next;
805 
806   while (vnp != NULL) {
807     vnp_next = vnp->next;
808     vnp->next = NULL;
809     vnp->data.ptrvalue = ReblobSeqFree (vnp->data.ptrvalue);
810     vnp = ValNodeFree (vnp);
811     vnp = vnp_next;
812   }
813   return vnp;
814 }
815 
816 
817 typedef struct reblobgroup {
818   CharPtr filename;
819   ValNodePtr seq_list;
820 } ReblobGroupData, PNTR ReblobGroupPtr;
821 
822 
ReblobGroupNew(CharPtr filename,ValNodePtr seq_list)823 static ReblobGroupPtr ReblobGroupNew (CharPtr filename, ValNodePtr seq_list)
824 {
825   ReblobGroupPtr rg;
826 
827   rg = (ReblobGroupPtr) MemNew (sizeof (ReblobGroupData));
828   rg->filename = StringSave (filename);
829   rg->seq_list = seq_list;
830   return rg;
831 }
832 
833 
ReblobGroupFree(ReblobGroupPtr rg)834 static ReblobGroupPtr ReblobGroupFree (ReblobGroupPtr rg)
835 {
836   if (rg != NULL) {
837     rg->filename = MemFree (rg->filename);
838     rg->seq_list = ReblobSeqListFree (rg->seq_list);
839     rg = MemFree (rg);
840   }
841   return rg;
842 }
843 
844 
ReblobGroupListFree(ValNodePtr vnp)845 static ValNodePtr ReblobGroupListFree (ValNodePtr vnp)
846 {
847   ValNodePtr vnp_next;
848 
849   while (vnp != NULL) {
850     vnp_next = vnp->next;
851     vnp->next = NULL;
852     vnp->data.ptrvalue = ReblobGroupFree (vnp->data.ptrvalue);
853     vnp = ValNodeFree (vnp);
854     vnp = vnp_next;
855   }
856   return vnp;
857 }
858 
859 
860 const char *nucleotide_alphabet = "ABCDGHKMRSTUVWYabcdghkmrstuvwy";
861 const char *protein_alphabet = "ABCDEFGHIKLMPQRSTUVWXYZabcdefghiklmpqrstuvwxyz";
862 
GetDefaultSequenceInfo(void)863 extern TSequenceInfoPtr GetDefaultSequenceInfo (void)
864 {
865   TSequenceInfoPtr sequence_info = SequenceInfoNew();
866 
867   sequence_info->missing = MemFree (sequence_info->missing);
868   sequence_info->missing = StringSave ("?Nn");
869 
870   sequence_info->beginning_gap = MemFree (sequence_info->beginning_gap);
871   sequence_info->beginning_gap = StringSave ("-.Nn?");
872 
873   sequence_info->middle_gap = MemFree (sequence_info->middle_gap);
874   sequence_info->middle_gap = StringSave ("-.");
875 
876   sequence_info->end_gap = MemFree (sequence_info->end_gap);
877   sequence_info->end_gap = StringSave ("-.Nn?");
878 
879   sequence_info->match = MemFree (sequence_info->match);
880   sequence_info->match = StringSave (":");
881 
882   sequence_info->alphabet = nucleotide_alphabet;
883 
884   return sequence_info;
885 }
886 
887 static void MakeFeatureTableFileIdsGenbank (SeqAnnotPtr sap);
888 
AddOneFeatureTable(CharPtr filename,SeqEntryPtr sep,Uint2 entityID)889 static Boolean AddOneFeatureTable (CharPtr filename, SeqEntryPtr sep, Uint2 entityID)
890 {
891   FILE *fp;
892   Pointer      dataptr;
893   Uint2        datatype;
894   Boolean      failure = FALSE;
895   Int4         linenum = 0;
896   SeqAnnotPtr  sap;
897   TblArgs      tbl_args;
898   SeqEntryPtr  orig_scope;
899 
900 
901   if (StringHasNoText (filename)) {
902     return FALSE;
903   }
904   fp = FileOpen (filename, "r");
905   if (fp == NULL) {
906     Message (MSG_ERROR, "Unable to open feature table file %s\n", filename);
907     return FALSE;
908   }
909   orig_scope = SeqEntrySetScope (sep);
910 
911   MemSet (&tbl_args, 0, sizeof (TblArgs));
912 
913   while ((! failure) && (dataptr = ReadFeatureTableFile (fp, &datatype, NULL, &linenum, &failure, TRUE)) != NULL) {
914     if (datatype == OBJ_SEQANNOT) {
915       sap = (SeqAnnotPtr) dataptr;
916       MakeFeatureTableFileIdsGenbank (sap);
917       ProcessOneAnnot (sap, entityID, &tbl_args);
918 
919     } else {
920       ObjMgrFree (datatype, dataptr);
921     }
922   }
923   FileClose (fp);
924 
925   SeqEntrySetScope (orig_scope);
926 
927   if (failure) {
928     Message (MSG_POSTERR, "Bad feature table at line %ld of file %s", (long) linenum, filename);
929     return FALSE;
930   } else {
931     return TRUE;
932   }
933 }
934 
935 
AddFeaturesToGroup(ReblobGroupPtr rg,SeqEntryPtr sep,Uint2 entityID,CharPtr file_filter)936 static Boolean AddFeaturesToGroup (ReblobGroupPtr rg, SeqEntryPtr sep, Uint2 entityID, CharPtr file_filter)
937 {
938   ValNodePtr vnp;
939   ReblobSeqPtr rs;
940   ValNodeBlock file_list;
941   Boolean rval = TRUE;
942 
943   if (rg == NULL) {
944     return FALSE;
945   }
946 
947   InitValNodeBlock (&file_list, NULL);
948 
949   for (vnp = rg->seq_list; vnp != NULL; vnp = vnp->next) {
950     rs = vnp->data.ptrvalue;
951     if (rs != NULL && !StringHasNoText (rs->feature_file)
952         && (StringHasNoText (file_filter) || StringCmp (file_filter, rs->orig_file) == 0)) {
953       ValNodeAddPointerToEnd (&file_list, 0, rs->feature_file);
954     }
955   }
956 
957   file_list.head = ValNodeSort (file_list.head, SortVnpByString);
958   ValNodeUnique (&file_list.head, SortVnpByString, ValNodeFree);
959 
960   for (vnp = file_list.head; vnp != NULL; vnp = vnp->next) {
961     AddOneFeatureTable (vnp->data.ptrvalue, sep, entityID);
962   }
963   file_list.head = ValNodeFree (file_list.head);
964   return rval;
965 }
966 
967 
968 static ValNodePtr
AutomatchQualsFromTableHeader(ValNodePtr header,Boolean force_first_accession,Boolean allow_no_match)969 AutomatchQualsFromTableHeader
970 (ValNodePtr header,
971  Boolean force_first_accession,
972  Boolean allow_no_match)
973 {
974   ValNodePtr val;
975   ValNodeBlock columns;
976   TabColumnConfigPtr t;
977 
978   InitValNodeBlock (&columns, NULL);
979 
980   for (val = header;
981        val != NULL;
982        val = val->next) {
983     t = TabColumnConfigNew ();
984     if (force_first_accession && columns.head == NULL) {
985       t->match_type = MatchTypeNew();
986       t->match_type->choice = eTableMatchNucID;
987       t->match_type->match_location = String_location_equals;
988     } else {
989       t->field = FieldTypeFromString (val->data.ptrvalue);
990       if (t->field == NULL) {
991         Message (MSG_POSTERR, "Unrecognized source qualifier: %s", val->data.ptrvalue);
992         t = TabColumnConfigFree (t);
993         if (!allow_no_match) {
994           columns.head = TabColumnConfigListFree (columns.head);
995           return NULL;
996         }
997       } else {
998         t->existing_text = ExistingTextOption_replace_old;
999       }
1000     }
1001     ValNodeAddPointerToEnd (&columns, 0, t);
1002   }
1003   return columns.head;
1004 }
1005 
1006 
PostErrorList(ValNodePtr vnp,CharPtr filename)1007 static void PostErrorList (ValNodePtr vnp, CharPtr filename)
1008 {
1009   while (vnp != NULL) {
1010     Message (MSG_POSTERR, "%s in %s", (CharPtr) vnp->data.ptrvalue, filename);
1011     vnp = vnp->next;
1012   }
1013 }
1014 
1015 
SpecialFixSrcMods(ValNodePtr list)1016 static void SpecialFixSrcMods (ValNodePtr list)
1017 {
1018   while (list != NULL) {
1019     if (StringICmp (list->data.ptrvalue, "note") == 0) {
1020       list->data.ptrvalue = MemFree (list->data.ptrvalue);
1021       list->data.ptrvalue = StringSave ("note-subsrc");
1022     }
1023     list = list->next;
1024   }
1025 }
1026 
1027 
AddOneSrcTable(CharPtr filename,SeqEntryPtr sep,Uint2 entityID)1028 static Boolean AddOneSrcTable (CharPtr filename, SeqEntryPtr sep, Uint2 entityID)
1029 {
1030   ValNodePtr err_list;
1031   ValNodePtr columns, obj_table;
1032   ValNodePtr table;
1033   FILE *fp;
1034 
1035   if (StringHasNoText (filename)) {
1036     return FALSE;
1037   }
1038   fp = FileOpen (filename, "r");
1039   if (fp == NULL) {
1040     Message (MSG_POSTERR, "Unable to open source file %s", filename);
1041     return FALSE;
1042   }
1043   table = ReadTabTableFromFile (fp);
1044   FileClose (fp);
1045   err_list = AutoReplaceSpecialCharactersInTabTable (table);
1046   PostErrorList(err_list, filename);
1047   err_list = ValNodeFreeData(err_list);
1048 
1049   if (table == NULL || table->next == NULL || table->data.ptrvalue == NULL) {
1050     Message (MSG_POSTERR, "Unable to read table from file %s", filename);
1051     table = FreeTabTable (table);
1052     return FALSE;
1053   }
1054   SpecialFixSrcMods (table->data.ptrvalue);
1055   columns = AutomatchQualsFromTableHeader(table->data.ptrvalue, TRUE, FALSE);
1056   if (columns == NULL) {
1057     Message (MSG_POSTERR, "Unable to read table from file %s", filename);
1058     table = FreeTabTable (table);
1059     return FALSE;
1060   }
1061 
1062   err_list = ValidateTabTableValues (table->next, columns);
1063   if (err_list != NULL) {
1064     PostErrorList (err_list, filename);
1065     err_list = ValNodeFreeData (err_list);
1066     table = FreeTabTable (table);
1067     return FALSE;
1068   }
1069 
1070   obj_table = GetObjectTableForTabTable (sep, table, columns, &err_list);
1071 
1072   err_list = CheckObjTableForRowsThatApplyToTheSameDestination (obj_table);
1073   if (err_list != NULL) {
1074     PostErrorList (err_list, filename);
1075     Message (MSG_POSTERR, "For one or more columns in %s, two or more rows in the table apply to the same object.  Cannot apply table.", filename);
1076     err_list = ValNodeFreeData (err_list);
1077     obj_table = FreeObjectTableForTabTable (obj_table);
1078     DeleteMarkedObjects (entityID, 0, NULL);
1079     return FALSE;
1080   }
1081 
1082   err_list = ApplyTableValuesToObjectTable (sep, table, columns, obj_table);
1083   /* PostErrorList (err_list, filename); */
1084 
1085   err_list = ValNodeFreeData (err_list);
1086   obj_table = FreeObjectTableForTabTable (obj_table);
1087   DeleteMarkedObjects (entityID, 0, NULL);
1088   table = FreeTabTable(table);
1089   return TRUE;
1090 
1091 }
1092 
1093 
ApplySrcTableToGroup(ReblobGroupPtr rg,SeqEntryPtr sep,Uint2 entityID,CharPtr file_filter)1094 static Boolean ApplySrcTableToGroup (ReblobGroupPtr rg, SeqEntryPtr sep, Uint2 entityID, CharPtr file_filter)
1095 {
1096   ValNodePtr vnp;
1097   ReblobSeqPtr rs;
1098   ValNodeBlock file_list;
1099   Boolean rval = TRUE;
1100 
1101   if (rg == NULL) {
1102     return FALSE;
1103   }
1104 
1105   InitValNodeBlock (&file_list, NULL);
1106 
1107   for (vnp = rg->seq_list; vnp != NULL; vnp = vnp->next) {
1108     rs = vnp->data.ptrvalue;
1109     if (rs != NULL && !StringHasNoText (rs->src_file)
1110         && (StringHasNoText (file_filter) || StringCmp (file_filter, rs->orig_file) == 0)) {
1111       ValNodeAddPointerToEnd (&file_list, 0, rs->src_file);
1112     }
1113   }
1114 
1115   file_list.head = ValNodeSort (file_list.head, SortVnpByString);
1116   ValNodeUnique (&file_list.head, SortVnpByString, ValNodeFree);
1117 
1118   for (vnp = file_list.head; vnp != NULL; vnp = vnp->next) {
1119     rval &= AddOneSrcTable (vnp->data.ptrvalue, sep, entityID);
1120   }
1121   file_list.head = ValNodeFree (file_list.head);
1122   return rval;
1123 
1124 }
1125 
1126 
NewGraphSeqAnnot(CharPtr name,SeqGraphPtr sgp)1127 static SeqAnnotPtr NewGraphSeqAnnot (
1128   CharPtr name,
1129   SeqGraphPtr sgp
1130 )
1131 
1132 {
1133   SeqAnnotPtr  sap = NULL;
1134 
1135   if (sgp == NULL) return NULL;
1136   sap = SeqAnnotNew ();
1137   if (sap == NULL) return NULL;
1138 
1139   if (StringDoesHaveText (name)) {
1140     SeqDescrAddPointer (&(sap->desc), Annot_descr_name, StringSave (name));
1141   }
1142   sap->type = 3;
1143   sap->data = (Pointer) sgp;
1144 
1145   return sap;
1146 }
1147 
1148 
MakeSeqGraphFileId(CharPtr str)1149 static SeqIdPtr MakeSeqGraphFileId (CharPtr str)
1150 {
1151   SeqIdPtr     sip_new;
1152   TextSeqIdPtr tsip;
1153   CharPtr      cp;
1154 
1155   tsip = TextSeqIdNew ();
1156   tsip->accession = StringSave (str);
1157   if ((cp = StringChr(tsip->accession, '.')) != NULL) {
1158     tsip->version = atoi (cp + 1);
1159     *cp = 0;
1160   }
1161   sip_new = ValNodeNew (NULL);
1162   sip_new->choice = SEQID_GENBANK;
1163   sip_new->data.ptrvalue = tsip;
1164 
1165   return sip_new;
1166 }
1167 
1168 
ApplyOneQualityFile(CharPtr filename,SeqEntryPtr sep,Uint2 entityID)1169 static Boolean ApplyOneQualityFile (CharPtr filename, SeqEntryPtr sep, Uint2 entityID)
1170 {
1171   FILE              *fp;
1172   FileCache          fc;
1173   Boolean            goOn = TRUE;
1174   Char               buf [1024];
1175   Boolean            nonewline;
1176   CharPtr            str, ptr;
1177   SeqIdPtr           sip;
1178   BioseqPtr          bsp;
1179   SeqGraphPtr        sgp, lastsgp;
1180   SeqAnnotPtr        sap;
1181 
1182   fp = FileOpen (filename, "r");
1183   if (fp == NULL) {
1184     return FALSE;
1185   }
1186 
1187   FileCacheSetup (&fc, fp);
1188 
1189   goOn = TRUE;
1190   while (goOn) {
1191     str = FileCacheReadLine (&fc, buf, sizeof (buf), &nonewline);
1192     if (str == NULL) {
1193       goOn = FALSE;
1194     } else if (StringDoesHaveText (str)) {
1195       if (str [0] == '>') {
1196         ptr = StringChr (str, ' ');
1197         if (ptr == NULL) {
1198           ptr = StringChr (str, '\t');
1199         }
1200         if (ptr != NULL) {
1201           *ptr = '\0';
1202         }
1203         sip = MakeSeqGraphFileId (str + 1);
1204         bsp = BioseqFind (sip);
1205         if (bsp != NULL) {
1206           sgp = ReadPhrapQualityFC (&fc, bsp);
1207           if (sgp != NULL) {
1208             for (sap = bsp->annot; sap != NULL; sap = sap->next) {
1209               if (sap->type == 3) {
1210                 for (lastsgp = sap->data; lastsgp->next != NULL; lastsgp = lastsgp->next) {
1211                   continue;
1212                 }
1213                 lastsgp->next = sgp;
1214                 break;
1215               }
1216             }
1217             if (sap == NULL) {
1218               if (bsp->annot != NULL) {
1219                 for (sap = bsp->annot; sap->next != NULL; sap = sap->next) {
1220                   continue;
1221                 }
1222                 sap->next = NewGraphSeqAnnot ("Phrap Graph", sgp);
1223               } else {
1224                 bsp->annot = NewGraphSeqAnnot ("Phrap Graph", sgp);
1225               }
1226             }
1227           }
1228         }
1229         SeqIdFree (sip);
1230       }
1231     }
1232   }
1233   FileClose (fp);
1234   return TRUE;
1235 }
1236 
1237 
ApplyQualityScoresToGroup(ReblobGroupPtr rg,SeqEntryPtr sep,Uint2 entityID,CharPtr file_filter)1238 static Boolean ApplyQualityScoresToGroup (ReblobGroupPtr rg, SeqEntryPtr sep, Uint2 entityID, CharPtr file_filter)
1239 {
1240   ValNodePtr vnp;
1241   ReblobSeqPtr rs;
1242   ValNodeBlock file_list;
1243   Boolean rval = TRUE;
1244 
1245   if (rg == NULL) {
1246     return FALSE;
1247   }
1248 
1249   InitValNodeBlock (&file_list, NULL);
1250 
1251   for (vnp = rg->seq_list; vnp != NULL; vnp = vnp->next) {
1252     rs = vnp->data.ptrvalue;
1253     if (rs != NULL && !StringHasNoText (rs->quality_file)
1254         && (StringHasNoText (file_filter) || StringCmp (file_filter, rs->orig_file) == 0)) {
1255       ValNodeAddPointerToEnd (&file_list, 0, rs->quality_file);
1256     }
1257   }
1258 
1259   file_list.head = ValNodeSort (file_list.head, SortVnpByString);
1260   ValNodeUnique (&file_list.head, SortVnpByString, ValNodeFree);
1261 
1262   for (vnp = file_list.head; vnp != NULL; vnp = vnp->next) {
1263     rval &= ApplyOneQualityFile (vnp->data.ptrvalue, sep, entityID);
1264   }
1265   file_list.head = ValNodeFree (file_list.head);
1266   return rval;
1267 }
1268 
1269 
CopyDescriptors(BioseqPtr orig_bsp,BioseqPtr new_bsp)1270 static void CopyDescriptors (BioseqPtr orig_bsp, BioseqPtr new_bsp)
1271 {
1272   SeqMgrDescContext context;
1273   SeqDescPtr sdp;
1274 
1275   /* only copy title descriptors if they are on the bioseq itself */
1276   for (sdp = SeqMgrGetNextDescriptor (orig_bsp, NULL, 0, &context);
1277        sdp != NULL;
1278        sdp = SeqMgrGetNextDescriptor (orig_bsp, sdp, 0, &context)) {
1279     if (sdp->choice != Seq_descr_title) {
1280       ValNodeLink (&(new_bsp->descr),
1281                    AsnIoMemCopy ((Pointer) sdp,
1282                                  (AsnReadFunc) SeqDescAsnRead,
1283                                  (AsnWriteFunc) SeqDescAsnWrite));
1284     }
1285   }
1286 
1287   /* copy titles on bioseq */
1288   for (sdp = orig_bsp->descr; sdp != NULL; sdp = sdp->next) {
1289     if (sdp->choice == Seq_descr_title) {
1290       ValNodeLink (&(new_bsp->descr),
1291                    AsnIoMemCopy ((Pointer) sdp,
1292                                  (AsnReadFunc) SeqDescAsnRead,
1293                                  (AsnWriteFunc) SeqDescAsnWrite));
1294     }
1295   }
1296 
1297 }
1298 
1299 
1300 static void
CopyFeaturesAndProteins(Uint2 entityID,BioseqPtr orig_bsp,BioseqPtr new_bsp,SeqEntryPtr orig_scope,SeqEntryPtr new_scope)1301 CopyFeaturesAndProteins
1302 (Uint2       entityID,
1303  BioseqPtr   orig_bsp,
1304  BioseqPtr   new_bsp,
1305  SeqEntryPtr orig_scope,
1306  SeqEntryPtr new_scope)
1307 {
1308   SeqMgrFeatContext context;
1309   SeqFeatPtr sfp, sfp_cpy;
1310   BioseqPtr prod_bsp, prod_cpy;
1311   SeqEntryPtr sep, sep_prod;
1312 
1313   sep = GetBestTopParentForData (entityID, new_bsp);
1314   SeqEntrySetScope (orig_scope);
1315   for (sfp = SeqMgrGetNextFeature (orig_bsp, NULL, 0, 0, &context);
1316        sfp != NULL;
1317        sfp = SeqMgrGetNextFeature (orig_bsp, sfp, 0, 0, &context)) {
1318     if (sfp->product != NULL && (prod_bsp = BioseqFindFromSeqLoc (sfp->product)) != NULL) {
1319       prod_cpy = AsnIoMemCopy (prod_bsp, (AsnReadFunc) BioseqAsnRead, (AsnWriteFunc) BioseqAsnWrite);
1320       sep_prod = SeqEntryNew ();
1321       sep_prod->choice = 1;
1322       sep_prod->data.ptrvalue = prod_cpy;
1323       AddSeqEntryToSeqEntry (sep, sep_prod, TRUE);
1324       sep = GetBestTopParentForData (entityID, new_bsp);
1325     }
1326     sfp_cpy = AsnIoMemCopy (sfp, (AsnReadFunc) SeqFeatAsnRead, (AsnWriteFunc) SeqFeatAsnWrite);
1327     CreateNewFeature (SeqMgrGetSeqEntryForData (new_bsp), NULL, sfp->data.choice, sfp_cpy);
1328   }
1329 
1330 }
1331 
1332 
CopyExtraIds(BioseqPtr new_bsp,BioseqPtr orig_bsp)1333 static void CopyExtraIds (BioseqPtr new_bsp, BioseqPtr orig_bsp)
1334 {
1335   SeqIdPtr sip_new, sip_orig, sip_last;
1336   Boolean  found;
1337   TextSeqIdPtr orig_acc, new_acc;
1338 
1339   for (sip_orig = orig_bsp->id; sip_orig != NULL; sip_orig = sip_orig->next) {
1340     found = FALSE;
1341     sip_last = NULL;
1342     for (sip_new = new_bsp->id; sip_new != NULL && !found; sip_new = sip_new->next) {
1343       if (SeqIdComp (sip_new, sip_orig) == SIC_YES) {
1344         found = TRUE;
1345       }
1346       sip_last = sip_new;
1347     }
1348     if (!found) {
1349       sip_new = AsnIoMemCopy (sip_orig, (AsnReadFunc) SeqIdAsnRead, (AsnWriteFunc) SeqIdAsnWrite);
1350       if (sip_last == NULL) {
1351         new_bsp->id = sip_new;
1352       } else {
1353         sip_last->next = sip_new;
1354       }
1355     } else if (sip_orig->choice == SEQID_GENBANK && sip_last->choice == SEQID_GENBANK) {
1356       orig_acc = (TextSeqIdPtr) sip_orig->data.ptrvalue;
1357       new_acc = (TextSeqIdPtr) sip_last->data.ptrvalue;
1358       if (orig_acc != NULL && new_acc != NULL) {
1359         new_acc->version = orig_acc->version;
1360       }
1361     }
1362   }
1363 }
1364 
1365 
CopySetType(SeqEntryPtr new_sep,SeqEntryPtr orig_sep)1366 static Boolean CopySetType (SeqEntryPtr new_sep, SeqEntryPtr orig_sep)
1367 {
1368   BioseqSetPtr orig_bssp, new_bssp;
1369 
1370   if (orig_sep != NULL && IS_Bioseq_set (orig_sep) && (orig_bssp = orig_sep->data.ptrvalue) != NULL
1371      && new_sep != NULL && IS_Bioseq_set (new_sep) && (new_bssp = new_sep->data.ptrvalue) != NULL
1372      && orig_bssp->_class != BioseqseqSet_class_nuc_prot) {
1373     new_bssp->_class = orig_bssp->_class;
1374     return TRUE;
1375   } else {
1376     return FALSE;
1377   }
1378 
1379 }
1380 
1381 
CopySetTitle(SeqEntryPtr new_sep,SeqEntryPtr orig_sep)1382 static Boolean CopySetTitle (SeqEntryPtr new_sep, SeqEntryPtr orig_sep)
1383 {
1384   BioseqSetPtr orig_bssp, new_bssp;
1385   SeqDescPtr sdp;
1386   Boolean rval = FALSE;
1387 
1388   if (orig_sep != NULL && IS_Bioseq_set (orig_sep) && (orig_bssp = orig_sep->data.ptrvalue) != NULL
1389      && new_sep != NULL && IS_Bioseq_set (new_sep) && (new_bssp = new_sep->data.ptrvalue) != NULL) {
1390     for (sdp = orig_bssp->descr; sdp != NULL && sdp->choice != Seq_descr_title; sdp = sdp->next) {
1391     }
1392     if (sdp != NULL) {
1393       ValNodeLink (&(new_bssp->descr),
1394                    AsnIoMemCopy ((Pointer) sdp,
1395                                  (AsnReadFunc) SeqDescAsnRead,
1396                                  (AsnWriteFunc) SeqDescAsnWrite));
1397       rval = TRUE;
1398     }
1399   }
1400   return rval;
1401 }
1402 
1403 
CopyQualityGraphs(BioseqPtr bsp_new,BioseqPtr bsp_orig)1404 static Boolean CopyQualityGraphs (BioseqPtr bsp_new, BioseqPtr bsp_orig)
1405 {
1406   SeqAnnotPtr sap, sap_last, sap_new;
1407 
1408   if (bsp_new == NULL || bsp_orig == NULL || CompareSequences (bsp_new, bsp_orig, 0) != 0) {
1409     return FALSE;
1410   }
1411 
1412   sap_last = bsp_new->annot;
1413   while (sap_last != NULL && sap_last->next != NULL) {
1414     sap_last = sap_last->next;
1415   }
1416 
1417   for (sap = bsp_orig->annot; sap != NULL; sap = sap->next) {
1418     if (sap->type == 3) {
1419       sap_new = (SeqAnnotPtr) AsnIoMemCopy (sap, (AsnReadFunc) SeqAnnotAsnRead, (AsnWriteFunc) SeqAnnotAsnWrite);
1420       if (sap_last == NULL) {
1421         bsp_new->annot = sap_new;
1422       } else {
1423         sap_last->next = sap_new;
1424       }
1425       sap_last = sap_new;
1426     }
1427   }
1428   return TRUE;
1429 }
1430 
1431 
1432 /* NOTES:
1433  * if the orig is in a nuc-prot set, need to add all the proteins from the original nuc-prot-set to
1434  * the new file.
1435  * also, should look for descriptors on the nuc-prot set and copy those as well.
1436  */
CopyDataFromOriginals(ReblobGroupPtr rg,SeqEntryPtr sep,Uint2 entityID)1437 static Boolean CopyDataFromOriginals (ReblobGroupPtr rg, SeqEntryPtr sep, Uint2 entityID)
1438 {
1439   ValNodePtr vnp, vnp_r;
1440   ReblobSeqPtr rs;
1441   ValNodeBlock file_list;
1442   Boolean rval = TRUE;
1443   SeqEntryPtr orig_sep;
1444   AsnIoPtr       aip;
1445   BioseqPtr      orig_bsp;
1446   CharPtr        filename;
1447   Uint2          orig_entityID;
1448   Char           buf[PATH_MAX];
1449 
1450   if (rg == NULL) {
1451     return FALSE;
1452   }
1453 
1454   InitValNodeBlock (&file_list, NULL);
1455 
1456   SeqEntrySetScope (sep);
1457   SeqMgrIndexFeatures (entityID, NULL);
1458   for (vnp = rg->seq_list; vnp != NULL; vnp = vnp->next) {
1459     rs = vnp->data.ptrvalue;
1460     if (rs != NULL && !StringHasNoText (rs->orig_file)) {
1461       ValNodeAddPointerToEnd (&file_list, 0, rs->orig_file);
1462       rs->bsp = BioseqFind (rs->sip);
1463     }
1464   }
1465 
1466   file_list.head = ValNodeSort (file_list.head, SortVnpByString);
1467   ValNodeUnique (&file_list.head, SortVnpByString, ValNodeFree);
1468 
1469   for (vnp = file_list.head; vnp != NULL; vnp = vnp->next) {
1470     filename = vnp->data.ptrvalue;
1471     aip = AsnIoOpen (filename, "r");
1472     orig_sep = SeqEntryAsnRead (aip, NULL);
1473     AsnIoClose (aip);
1474     if (orig_sep == NULL) {
1475       Message (MSG_ERROR, "Unable to read Seq-entry from %s", filename);
1476       continue;
1477     }
1478     SeqEntrySetScope (orig_sep);
1479     orig_entityID = ObjMgrGetEntityIDForChoice (orig_sep);
1480     AssignIDsInEntity (orig_entityID, 0, NULL);
1481     SeqMgrIndexFeatures (orig_entityID, NULL);
1482     CopySetType (sep, orig_sep);
1483     CopySetTitle (sep, orig_sep);
1484     for (vnp_r = rg->seq_list; vnp_r != NULL; vnp_r = vnp_r->next) {
1485       rs = (ReblobSeqPtr) vnp_r->data.ptrvalue;
1486       if (StringCmp (rs->orig_file, filename) == 0) {
1487         /* find original Bioseq */
1488         orig_bsp = BioseqFind (rs->sip);
1489         if (orig_bsp != NULL && rs->bsp != NULL && orig_bsp != rs->bsp) {
1490           /* copy over descriptors */
1491           CopyDescriptors (orig_bsp, rs->bsp);
1492 
1493           /* copy over features if not retrieving from feature file */
1494           if (StringHasNoText (rs->feature_file)) {
1495             CopyFeaturesAndProteins (entityID, orig_bsp, rs->bsp, orig_sep, sep);
1496           }
1497           /* additional Seq-ids */
1498           CopyExtraIds (rs->bsp, orig_bsp);
1499 
1500           /* copy mol */
1501           rs->bsp->mol = orig_bsp->mol;
1502 
1503 #if 0
1504           /* do not copy quality graphs */
1505           /* quality graphs */
1506           CopyQualityGraphs (rs->bsp, orig_bsp);
1507 #endif
1508 
1509           /* anything else? */
1510 
1511         } else {
1512           SeqIdWrite (rs->sip, buf, PRINTID_FASTA_LONG, sizeof (buf) - 1);
1513           Message (MSG_POSTERR, "Unable to find original sequence for %s", buf);
1514         }
1515       }
1516     }
1517     SeqEntrySetScope (NULL);
1518     SeqEntryFree (orig_sep);
1519   }
1520   SeqEntrySetScope (sep);
1521   return rval;
1522 }
1523 
1524 
MarkFeaturesForDeletion(SeqFeatPtr sfp,Pointer data)1525 static void MarkFeaturesForDeletion (SeqFeatPtr sfp, Pointer data)
1526 {
1527   if (sfp != NULL) {
1528     sfp->idx.deleteme = TRUE;
1529   }
1530 }
1531 
1532 
StripFeaturesFromSelectedSequences(ReblobGroupPtr rg,Uint2 orig_entityID,CharPtr filename)1533 static Boolean StripFeaturesFromSelectedSequences (ReblobGroupPtr rg, Uint2 orig_entityID, CharPtr filename)
1534 {
1535   ValNodePtr   vnp_r;
1536   ReblobSeqPtr rs;
1537   BioseqPtr    orig_bsp;
1538   SeqEntryPtr  seq_sep;
1539   Boolean      rval = TRUE;
1540   Char         buf[PATH_MAX];
1541 
1542   if (rg == NULL) {
1543     return FALSE;
1544   }
1545 
1546   for (vnp_r = rg->seq_list; vnp_r != NULL; vnp_r = vnp_r->next) {
1547     rs = (ReblobSeqPtr) vnp_r->data.ptrvalue;
1548     if (StringCmp (rs->orig_file, filename) == 0) {
1549       /* find original Bioseq */
1550       orig_bsp = BioseqFind (rs->sip);
1551       seq_sep = GetBestTopParentForData (orig_entityID, orig_bsp);
1552       if (seq_sep == NULL) {
1553         SeqIdWrite (rs->sip, buf, PRINTID_FASTA_LONG, sizeof (buf) - 1);
1554         Message (MSG_POSTERR, "Unable to find original sequence for %s", buf);
1555         rval = FALSE;
1556       } else {
1557         /* strip features if retrieving from feature file */
1558         if (!StringHasNoText (rs->feature_file)) {
1559           VisitFeaturesInSep (seq_sep, NULL, MarkFeaturesForDeletion);
1560         }
1561       }
1562     }
1563   }
1564   DeleteMarkedObjects (orig_entityID, 0, NULL);
1565   return rval;
1566 }
1567 
1568 
MakeAlignmentFileIdsGenbank(TAlignmentFilePtr afp)1569 static void MakeAlignmentFileIdsGenbank (TAlignmentFilePtr afp)
1570 {
1571   int i;
1572   CharPtr fmt = "gb|%s";
1573   CharPtr tmp;
1574 
1575 
1576   if (afp == NULL) {
1577     return;
1578   }
1579   for (i = 0; i < afp->num_sequences; i++) {
1580     if (StringNICmp(afp->ids[i], "gb|", 3) != 0) {
1581       tmp = (CharPtr) MemNew (sizeof (Char) * (StringLen (afp->ids[i]) + StringLen (fmt)));
1582       sprintf (tmp, fmt, afp->ids[i]);
1583       afp->ids[i] = MemFree (afp->ids[i]);
1584       afp->ids[i] = tmp;
1585     }
1586   }
1587 }
1588 
1589 
MakeFeatureTableFileIdsGenbank(SeqAnnotPtr sap)1590 static void MakeFeatureTableFileIdsGenbank (SeqAnnotPtr sap)
1591 {
1592   SeqFeatPtr   sfp;
1593   SeqIdPtr     sip_orig, sip_new;
1594   ObjectIdPtr  oip;
1595   TextSeqIdPtr tsip;
1596   Boolean      found = FALSE;
1597   CharPtr      cp;
1598 
1599   if (sap == NULL || sap->type != 1) {
1600     return;
1601   }
1602   for (sfp = sap->data; sfp != NULL; sfp = sfp->next) {
1603     sip_orig = SeqLocId (sfp->location);
1604     if (sip_orig != NULL && sip_orig->choice == 1) {
1605       oip = (ObjectIdPtr) sip_orig->data.ptrvalue;
1606       if (oip != NULL && oip->str != NULL) {
1607         found = TRUE;
1608         break;
1609       }
1610     }
1611   }
1612   if (found) {
1613     tsip = TextSeqIdNew ();
1614     tsip->accession = StringSave (oip->str);
1615     if ((cp = StringChr(tsip->accession, '.')) != NULL) {
1616       tsip->version = atoi (cp + 1);
1617       *cp = 0;
1618     }
1619     sip_new = ValNodeNew (NULL);
1620     sip_new->choice = SEQID_GENBANK;
1621     sip_new->data.ptrvalue = tsip;
1622 
1623     sip_orig = SeqIdDup (sip_orig);
1624     for (sfp = sap->data; sfp != NULL; sfp = sfp->next) {
1625       ReplaceSeqIdWithSeqIdInFeat (sip_orig, sip_new, sfp);
1626     }
1627     sip_orig = SeqIdFree (sip_orig);
1628     sip_new = SeqIdFree (sip_new);
1629   }
1630 }
1631 
1632 
RemoveAlignmentTitlesAndOrganisms(TAlignmentFilePtr afp)1633 static void RemoveAlignmentTitlesAndOrganisms (TAlignmentFilePtr afp)
1634 {
1635   int i;
1636 
1637   if (afp == NULL || afp->deflines == NULL || afp->num_deflines == 0) {
1638     return;
1639   }
1640   for (i = 0; i < afp->num_deflines; i++) {
1641     free (afp->deflines[i]);
1642   }
1643   free (afp->deflines);
1644   afp->deflines = NULL;
1645   afp->num_deflines = 0;
1646   for (i = 0; i < afp->num_organisms; i++) {
1647     free (afp->organisms [i]);
1648   }
1649   free (afp->organisms);
1650   afp->organisms = NULL;
1651   afp->num_organisms = 0;
1652 }
1653 
1654 
EndsWithAsn(CharPtr filename)1655 static Boolean EndsWithAsn (CharPtr filename)
1656 {
1657   Int4 len;
1658 
1659   if (StringHasNoText (filename)) {
1660     return FALSE;
1661   }
1662   len = StringLen (filename);
1663   if (StringCmp (filename + len - 4, ".asn") == 0) {
1664     return TRUE;
1665   } else {
1666     return FALSE;
1667   }
1668 }
1669 
1670 
RemoveSingleItemAlignments(SeqAnnotPtr sap,Pointer data)1671 static void RemoveSingleItemAlignments (SeqAnnotPtr sap, Pointer data)
1672 {
1673   SeqAlignPtr salp;
1674 
1675   if (sap != NULL && sap->type == 2 && (salp = sap->data) != NULL && salp->dim == 1) {
1676     sap->idx.deleteme = TRUE;
1677   }
1678 }
1679 
1680 
ProcessGroup(ReblobGroupPtr rg)1681 static SeqEntryPtr ProcessGroup (ReblobGroupPtr rg)
1682 {
1683   TErrorInfoPtr     error_list = NULL;
1684   ReadBufferData    rbd;
1685   TAlignmentFilePtr afp;
1686   SeqEntryPtr       sep = NULL;
1687   TSequenceInfoPtr  sequence_info;
1688   Uint2             entityID;
1689   AsnIoPtr          aip;
1690 
1691   if (rg == NULL) {
1692     return NULL;
1693   }
1694 
1695   if (EndsWithAsn(rg->filename)) {
1696     /* not an alignment, no reblobbing, just do features and source table */
1697     aip = AsnIoOpen (rg->filename, "r");
1698     sep = SeqEntryAsnRead (aip, NULL);
1699     AsnIoClose (aip);
1700     if (sep == NULL) {
1701       Message (MSG_ERROR, "Unable to read Seq-entry from file %s\n", rg->filename);
1702     }
1703     SeqEntrySetScope (sep);
1704     entityID = ObjMgrGetEntityIDForChoice (sep);
1705     AssignIDsInEntity (entityID, 0, NULL);
1706     SeqMgrIndexFeatures (entityID, NULL);
1707     StripFeaturesFromSelectedSequences (rg, entityID, rg->filename);
1708     AddFeaturesToGroup (rg, sep, entityID, rg->filename);
1709     sep = GetTopSeqEntryForEntityID (entityID);
1710     ApplySrcTableToGroup (rg, sep, entityID, rg->filename);
1711   } else {
1712     rbd.fp = FileOpen (rg->filename, "r");
1713     if (rbd.fp == NULL) {
1714       Message (MSG_ERROR, "Unable to open alignment file %s", rg->filename);
1715       return NULL;
1716     }
1717     rbd.current_data = NULL;
1718     sequence_info = GetDefaultSequenceInfo();
1719     afp = ReadAlignmentFile ( AbstractReadFunction,
1720                               (Pointer) &rbd,
1721                               AbstractReportError,
1722                               (Pointer) &error_list,
1723                               sequence_info);
1724     FileClose (rbd.fp);
1725     ErrorInfoFree (error_list);
1726     if (afp == NULL) {
1727       Message (MSG_ERROR, "Unable to read alignment from %s", rg->filename);
1728     } else if (! DoSequenceLengthsMatch (afp)) {
1729       Message (MSG_ERROR, "Your alignment in %s is incorrectly formatted.  Sequence plus gaps should be the same length for all sequences", rg->filename);
1730     } else {
1731       MakeAlignmentFileIdsGenbank (afp);
1732       RemoveAlignmentTitlesAndOrganisms (afp);
1733       sep = MakeSequinDataFromAlignment (afp, Seq_mol_na);
1734       entityID = ObjMgrGetEntityIDForChoice (sep);
1735       /* now - copy extras from original, add in new feature annotation */
1736       CopyDataFromOriginals (rg, sep, entityID);
1737       AddFeaturesToGroup (rg, sep, entityID, NULL);
1738       sep = GetTopSeqEntryForEntityID (entityID);
1739       ApplySrcTableToGroup (rg, sep, entityID, NULL);
1740       /* add quality scores */
1741       ApplyQualityScoresToGroup (rg, sep, entityID, NULL);
1742       /* collect pubs on set level */
1743       MovePopPhyMutPubs(sep);
1744       VisitAnnotsInSep (sep, NULL, RemoveSingleItemAlignments);
1745       DeleteMarkedObjects (entityID, 0, NULL);
1746     }
1747 
1748     SequenceInfoFree (sequence_info);
1749     AlignmentFileFree (afp);
1750   }
1751 
1752   return sep;
1753 }
1754 
1755 
NotTitle(SeqDescPtr sdp,Pointer extradata)1756 static Boolean NotTitle (SeqDescPtr sdp, Pointer extradata)
1757 {
1758 
1759   if (sdp == NULL
1760       || sdp->choice == Seq_descr_title) {
1761     return FALSE;
1762   } else {
1763     return TRUE;
1764   }
1765 }
1766 
1767 
GetCDS(BioseqPtr bsp)1768 static SeqFeatPtr GetCDS (BioseqPtr bsp)
1769 {
1770   SeqFeatPtr sfp = NULL, tmp_sfp;
1771   SeqAnnotPtr sap;
1772   BioseqSetPtr bssp;
1773 
1774   if (bsp == NULL) {
1775     return NULL;
1776   }
1777   for (sap = bsp->annot; sap != NULL; sap = sap->next) {
1778     if (sap->type == 1) {
1779       tmp_sfp = sap->data;
1780       while (tmp_sfp != NULL) {
1781         if (tmp_sfp->data.choice == SEQFEAT_CDREGION) {
1782           if (sfp == NULL) {
1783             sfp = tmp_sfp;
1784           } else {
1785             /* found second coding region */
1786             return NULL;
1787           }
1788         }
1789         tmp_sfp = tmp_sfp->next;
1790       }
1791     }
1792   }
1793   if (bsp->idx.parenttype == OBJ_BIOSEQSET && (bssp = (BioseqSetPtr) bsp->idx.parentptr) != NULL) {
1794     for (sap = bssp->annot; sap != NULL; sap = sap->next) {
1795       if (sap->type == 1) {
1796         tmp_sfp = sap->data;
1797         while (tmp_sfp != NULL) {
1798           if (tmp_sfp->data.choice == SEQFEAT_CDREGION) {
1799             if (sfp == NULL) {
1800               sfp = tmp_sfp;
1801             } else {
1802               /* found second coding region */
1803               return NULL;
1804             }
1805           }
1806           tmp_sfp = tmp_sfp->next;
1807         }
1808       }
1809     }
1810   }
1811   return sfp;
1812 }
1813 
1814 
CopyProteinId(SeqEntryPtr orig_sep,SeqEntryPtr new_sep,BioseqPtr prot_bsp)1815 static Boolean CopyProteinId (SeqEntryPtr orig_sep, SeqEntryPtr new_sep, BioseqPtr prot_bsp)
1816 {
1817   SeqEntryPtr orig_scope;
1818   SeqFeatPtr  orig_cds, new_cds;
1819   BioseqPtr   orig_nuc, new_nuc, new_prot;
1820   Boolean           rval = FALSE;
1821   SeqIdPtr          sip, sip_tmp;
1822   TextSeqIdPtr      tsip;
1823 
1824   orig_scope = SeqEntrySetScope (orig_sep);
1825   orig_cds = SeqMgrGetCDSgivenProduct (prot_bsp, NULL);
1826   if (orig_cds != NULL) {
1827     orig_nuc = BioseqFindFromSeqLoc (orig_cds->location);
1828     SeqEntrySetScope (new_sep);
1829     new_nuc = NULL;
1830     for (sip = orig_nuc->id; sip != NULL && new_nuc == NULL; sip = sip->next) {
1831       new_nuc = BioseqFind (sip);
1832       if (new_nuc == NULL && sip->choice == SEQID_GENBANK) {
1833         sip_tmp = SeqIdDup (sip);
1834         tsip = (TextSeqIdPtr) sip_tmp->data.ptrvalue;
1835         tsip->version = 0;
1836         new_nuc = BioseqFind (sip_tmp);
1837         sip_tmp = SeqIdFree (sip_tmp);
1838       }
1839     }
1840     if (new_nuc != NULL) {
1841       new_cds = GetCDS (new_nuc);
1842       if (new_cds != NULL) {
1843         new_prot = BioseqFindFromSeqLoc (new_cds->product);
1844         if (new_prot != NULL) {
1845           sip = SeqIdFindBest (prot_bsp->id, SEQID_GENBANK);
1846           if (sip != NULL) {
1847             sip = SeqIdDup (sip);
1848             BioseqReplaceID (new_prot, sip);
1849             sip = SeqIdFree (sip);
1850             rval = TRUE;
1851           }
1852         }
1853       }
1854     }
1855   }
1856 
1857   SeqEntrySetScope (orig_scope);
1858   return rval;
1859 }
1860 
1861 
ProcessSingletons(ReblobGroupPtr rg,SeqEntryPtr top_sep)1862 static Boolean ProcessSingletons (ReblobGroupPtr rg, SeqEntryPtr top_sep)
1863 {
1864   ValNodePtr     vnp, vnp_r;
1865   ReblobSeqPtr   rs;
1866   ValNodeBlock   file_list;
1867   Boolean        rval = TRUE;
1868   SeqEntryPtr    orig_sep, seq_sep;
1869   AsnIoPtr       aip;
1870   BioseqPtr      orig_bsp;
1871   CharPtr        filename;
1872   Uint2          orig_entityID;
1873 
1874   if (rg == NULL) {
1875     return FALSE;
1876   }
1877 
1878   InitValNodeBlock (&file_list, NULL);
1879 
1880   SeqEntrySetScope (top_sep);
1881 
1882   for (vnp = rg->seq_list; vnp != NULL; vnp = vnp->next) {
1883     rs = vnp->data.ptrvalue;
1884     if (rs != NULL && !StringHasNoText (rs->orig_file)) {
1885       ValNodeAddPointerToEnd (&file_list, 0, rs->orig_file);
1886       rs->bsp = BioseqFind (rs->sip);
1887     }
1888   }
1889 
1890   file_list.head = ValNodeSort (file_list.head, SortVnpByString);
1891   ValNodeUnique (&file_list.head, SortVnpByString, ValNodeFree);
1892 
1893   for (vnp = file_list.head; vnp != NULL; vnp = vnp->next) {
1894     filename = vnp->data.ptrvalue;
1895     aip = AsnIoOpen (filename, "r");
1896     orig_sep = SeqEntryAsnRead (aip, NULL);
1897     AsnIoClose (aip);
1898     if (orig_sep == NULL) {
1899       Message (MSG_ERROR, "Unable to read from file %s", filename);
1900       continue;
1901     }
1902     SeqEntrySetScope (orig_sep);
1903     orig_entityID = ObjMgrGetEntityIDForChoice (orig_sep);
1904     AssignIDsInEntity (orig_entityID, 0, NULL);
1905     SeqMgrIndexFeatures (orig_entityID, NULL);
1906     /* push descriptors so they will be attached to the singleton sequences */
1907     if (IS_Bioseq_set(orig_sep)) {
1908       PropagateSomeDescriptors(orig_sep, NotTitle, NULL);
1909     }
1910 
1911     rval &= StripFeaturesFromSelectedSequences (rg, orig_entityID, filename);
1912 
1913     /* add features */
1914     rval &= AddFeaturesToGroup (rg, orig_sep, orig_entityID, filename);
1915 
1916     /* add source table information */
1917     rval &= ApplySrcTableToGroup (rg, orig_sep, orig_entityID, filename);
1918 
1919     /* add quality scores */
1920     rval &= ApplyQualityScoresToGroup (rg, orig_sep, orig_entityID, filename);
1921 
1922     /* now make copies and add to top_sep */
1923     for (vnp_r = rg->seq_list; vnp_r != NULL; vnp_r = vnp_r->next) {
1924       rs = (ReblobSeqPtr) vnp_r->data.ptrvalue;
1925       if (StringCmp (rs->orig_file, filename) == 0) {
1926         /* find original Bioseq */
1927         orig_bsp = BioseqFind (rs->sip);
1928         /* don't add if protein, will have been added with nucleotide already */
1929         if (orig_bsp != NULL) {
1930           if (ISA_aa (orig_bsp->mol)) {
1931             /* if there's a replacement protein for this one, copy the accession */
1932             CopyProteinId (orig_sep, top_sep, orig_bsp);
1933           } else {
1934             seq_sep = GetBestTopParentForData (orig_entityID, orig_bsp);
1935             if (seq_sep != NULL) {
1936               seq_sep = AsnIoMemCopy (seq_sep, (AsnReadFunc) SeqEntryAsnRead, (AsnWriteFunc) SeqEntryAsnWrite);
1937               PromoteAllToWorstID (seq_sep);
1938               AddSeqEntryToSeqEntry (top_sep, seq_sep, TRUE);
1939             }
1940           }
1941         }
1942       }
1943     }
1944 
1945     SeqEntrySetScope (NULL);
1946     SeqEntryFree (orig_sep);
1947   }
1948   return rval;
1949 }
1950 
1951 
ProcessGroupList(ValNodePtr group_list)1952 static SeqEntryPtr ProcessGroupList (ValNodePtr group_list)
1953 {
1954   ValNodePtr vnp;
1955   SeqEntryPtr top_sep, sep;
1956   BioseqSetPtr bssp;
1957   ReblobGroupPtr rg;
1958 
1959   bssp = BioseqSetNew ();
1960   bssp->_class = BioseqseqSet_class_genbank;
1961   top_sep = SeqEntryNew();
1962   top_sep->choice = 2;
1963   top_sep->data.ptrvalue = bssp;
1964 
1965   for (vnp = group_list; vnp != NULL; vnp = vnp->next) {
1966     rg = (ReblobGroupPtr) vnp->data.ptrvalue;
1967     if (StringHasNoText (rg->filename)) {
1968       /* process as singletons */
1969       ProcessSingletons (rg, top_sep);
1970     } else {
1971       sep = ProcessGroup (vnp->data.ptrvalue);
1972       if (sep != NULL) {
1973         AddSeqEntryToSeqEntry (top_sep, sep, TRUE);
1974       }
1975     }
1976   }
1977 
1978   return top_sep;
1979 }
1980 
1981 
ReadMapFile(CharPtr filename,CharPtr result_dir)1982 static Int4 ReadMapFile (CharPtr filename, CharPtr result_dir)
1983 {
1984   FILE *fp;
1985   ReadBufferData rbd;
1986   CharPtr        line;
1987   Boolean        collecting = FALSE;
1988   CharPtr        grp_file = NULL;
1989   ValNodeBlock   group_list;
1990   ValNodeBlock   seq_list;
1991   ValNodePtr     tokens;
1992   SeqEntryPtr    sep;
1993   AsnIoPtr       aip;
1994   Char           output_file[PATH_MAX];
1995   Int4           cluster_number = 1;
1996 
1997   fp = FileOpen (filename, "r");
1998   if (fp == NULL) {
1999     Message (MSG_FATAL, "Unable to open map file %s", filename);
2000     return 1;
2001   }
2002 
2003   rbd.fp = fp;
2004   rbd.current_data = NULL;
2005 
2006   line = AbstractReadFunction (&rbd);
2007   while (line != NULL) {
2008     if (StringNICmp (line, "START", 5) == 0) {
2009       /* start collecting group */
2010       collecting = TRUE;
2011       InitValNodeBlock (&group_list, NULL);
2012       InitValNodeBlock (&seq_list, NULL);
2013       grp_file = MemFree (grp_file);
2014     } else if (StringNICmp (line, "STOP", 4) == 0) {
2015       /* finish last group */
2016       ValNodeAddPointerToEnd (&group_list, 0, ReblobGroupNew (grp_file, seq_list.head));
2017 
2018       /* process group list */
2019       sep = ProcessGroupList (group_list.head);
2020       if (sep == NULL) {
2021         Message (MSG_ERROR, "Unable to process cluster %d", cluster_number);
2022       } else {
2023         sprintf (output_file, "%s/cluster_%d.sqn", result_dir, cluster_number);
2024         aip = AsnIoOpen (output_file, "w");
2025         SeqEntryAsnWrite (sep, aip, NULL);
2026         AsnIoClose (aip);
2027         sep = SeqEntryFree (sep);
2028       }
2029 
2030       /* cleanup */
2031       cluster_number++;
2032       group_list.head = ReblobGroupListFree (group_list.head);
2033       collecting = FALSE;
2034       grp_file = MemFree (grp_file);
2035     } else if (collecting) {
2036       tokens = ReadOneColumnList (line);
2037       if (tokens != NULL) {
2038         if (grp_file == NULL) {
2039           grp_file = StringSave (tokens->data.ptrvalue);
2040         } else if (StringCmp (grp_file, tokens->data.ptrvalue) != 0) {
2041           ValNodeAddPointerToEnd (&group_list, 0, ReblobGroupNew (grp_file, seq_list.head));
2042           InitValNodeBlock (&seq_list, NULL);
2043           grp_file = MemFree (grp_file);
2044           grp_file = StringSave (tokens->data.ptrvalue);
2045         }
2046         if (tokens->next != NULL) {
2047           ValNodeAddPointerToEnd (&seq_list, 0, ReblobSeqNew (tokens->next));
2048         }
2049       }
2050       tokens = ValNodeFreeData (tokens);
2051     }
2052     line = AbstractReadFunction (&rbd);
2053   }
2054 
2055   FileClose(fp);
2056   return 0;
2057 }
2058 
2059 
2060 /* Args structure contains command-line arguments */
2061 
2062 typedef enum {
2063   m_argMapFile = 0,
2064   r_argOutputPath,
2065 } Arguments;
2066 
2067 
2068 Args myargs [] = {
2069   {"Input Map File", NULL, NULL, NULL,
2070     TRUE, 'm', ARG_STRING, 0.0, 0, NULL},
2071   {"Path for Results", NULL, NULL, NULL,
2072     TRUE, 'r', ARG_STRING, 0.0, 0, NULL},
2073 };
2074 
Main(void)2075 Int2 Main (void)
2076 
2077 {
2078   Char            app [64];
2079   CharPtr         base;
2080   CharPtr         results;
2081 
2082   /* standard setup */
2083 
2084   ErrSetFatalLevel (SEV_MAX);
2085   ErrSetMessageLevel (SEV_MAX);
2086   ErrClearOptFlags (EO_SHOW_USERSTR);
2087   UseLocalAsnloadDataAndErrMsg ();
2088   ErrPathReset ();
2089 
2090   /* finish resolving internal connections in ASN.1 parse tables */
2091 
2092   if (! AllObjLoad ()) {
2093     Message (MSG_FATAL, "AllObjLoad failed");
2094     return 1;
2095   }
2096   if (! SubmitAsnLoad ()) {
2097     Message (MSG_FATAL, "SubmitAsnLoad failed");
2098     return 1;
2099   }
2100   if (! FeatDefSetLoad ()) {
2101     Message (MSG_FATAL, "FeatDefSetLoad failed");
2102     return 1;
2103   }
2104   if (! SeqCodeSetLoad ()) {
2105     Message (MSG_FATAL, "SeqCodeSetLoad failed");
2106     return 1;
2107   }
2108   if (! GeneticCodeTableLoad ()) {
2109     Message (MSG_FATAL, "GeneticCodeTableLoad failed");
2110     return 1;
2111   }
2112 
2113 
2114   /* process command line arguments */
2115 
2116   sprintf (app, "reblobber %s", REBLOBBER_APPLICATION);
2117   if (! GetArgs (app, sizeof (myargs) / sizeof (Args), myargs)) {
2118     return 0;
2119   }
2120 
2121   SetAppProperty ("NcbiTbl2Asn", (void *) 1024);
2122 
2123   results = (CharPtr) myargs [r_argOutputPath].strvalue;
2124   base = (CharPtr) myargs [m_argMapFile].strvalue;
2125   if (StringHasNoText (results) || StringHasNoText (base)) {
2126     Message (MSG_FATAL, "Must supply map file and results directory.");
2127     return 1;
2128   }
2129 
2130   return ReadMapFile (base, results);
2131 
2132 }
2133 
2134